In [2]:
import pandas as pd

In [3]:
# Load HR_Employee_Attrition.csv into a DataFrame. The path is relative, so it
# is resolved against the kernel's current working directory.
attrition_csv = 'HR_Employee_Attrition.csv'
data = pd.read_csv(attrition_csv)

Comprobamos que cada una de las columnas sospechosas contiene un único valor, es decir, que son constantes.

In [4]:
# For each suspect column, print its label followed by the array of distinct
# values it contains; a one-element array means the column is constant.
for label_prefix, suspect_col in (
    ('Employee Count: ', 'EmployeeCount'),
    ('Over 18: ', 'Over18'),
    ('StandardHours: ', 'StandardHours'),
):
    print(label_prefix + str(data[suspect_col].unique()))

Employee Count: [1]
Over 18: ['Y']
StandardHours: [80]


Tras confirmarlo, eliminamos estas columnas de nuestro análisis, ya que al ser constantes no aportan información.

In [5]:
# Drop the three columns that were just shown to hold a single value each.
# `columns=` names the axis explicitly, and errors='ignore' keeps this cell
# re-runnable: `data` is rebound here, so a second execution would otherwise
# raise KeyError because the columns are already gone.
data = data.drop(columns=['EmployeeCount', 'Over18', 'StandardHours'], errors='ignore')

Ahora exploramos un poco más el dataset:

In [6]:
# Preview the first five rows (the pandas default, spelled out here).
data.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,2,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,2,3,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,4,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,5,4,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,7,1,...,3,4,1,6,3,3,2,2,2,2


In [7]:
# Print a concise summary of the frame: index range, column names,
# non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeNumber            1470 non-null   int64 
 9   EnvironmentSatisfaction   1470 non-null   int64 
 10  Gender                    1470 non-null   object
 11  HourlyRate                1470 non-null   int64 
 12  JobInvolvement            1470 non-null   int64 
 13  JobLevel                  1470 non-null   int64 
 14  JobRole                 

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for a hand-picked
# subset of the numeric columns.
described_cols = [
    'Age',
    'DailyRate',
    'DistanceFromHome',
    'HourlyRate',
    'MonthlyIncome',
    'MonthlyRate',
    'NumCompaniesWorked',
    'PercentSalaryHike',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]
data[described_cols].describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,HourlyRate,MonthlyIncome,MonthlyRate,NumCompaniesWorked,PercentSalaryHike,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,65.891156,6502.931293,14313.103401,2.693197,15.209524,11.279592,2.79932,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,20.329428,4707.956783,7117.786044,2.498009,3.659938,7.780782,1.289271,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,30.0,1009.0,2094.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,48.0,2911.0,8047.0,1.0,12.0,6.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,66.0,4919.0,14235.5,2.0,14.0,10.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,83.75,8379.0,20461.5,4.0,18.0,15.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,100.0,19999.0,26999.0,9.0,25.0,40.0,6.0,40.0,18.0,15.0,17.0


In [9]:
# Number of rows per job role, most frequent role first.
job_roles = data['JobRole']
job_roles.value_counts()

JobRole
Sales Executive              326
Research Scientist           292
Laboratory Technician        259
Manufacturing Director       145
Healthcare Representative    131
Manager                      102
Sales Representative          83
Research Director             80
Human Resources               52
Name: count, dtype: int64

Comprobamos si el identificador de empleado (`EmployeeNumber`) es único.

In [10]:
# Count the distinct employee identifiers (nunique ignores missing values by
# default); the result is meant to be compared with the number of rows.
employee_ids = data['EmployeeNumber']
employee_ids.nunique()

1470

El número de valores distintos (1470) coincide con el total de filas del dataset, por lo que podemos concluir que `EmployeeNumber` identifica de forma única a cada empleado.

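Ahora comprobaremos si existe alguna relación entre las columnas económicas: "HourlyRate", "DailyRate", "MonthlyRate" y "MonthlyIncome". Para ello calculamos el cociente entre cada par de columnas: si una fuera un múltiplo fijo de otra, su cociente sería prácticamente constante en todas las filas.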

In [None]:
# Row-wise quotients between every pair of the four pay-related columns.
# Each key abbreviates "<numerator><denominator>"; "Month" stands for
# MonthlyRate and "Income" for MonthlyIncome.
ratios = pd.DataFrame({
    "MonthMonth": data["MonthlyRate"] / data["MonthlyIncome"],
    "MonthDay": data["MonthlyRate"] / data["DailyRate"],
    "MonthHour": data["MonthlyRate"] / data["HourlyRate"],
    "DayHour": data["DailyRate"] / data["HourlyRate"],
    "IncomeDay": data["MonthlyIncome"] / data["DailyRate"],
    "IncomeHour": data["MonthlyIncome"] / data["HourlyRate"],
})

In [15]:
# Summary statistics for each ratio column; the spread (std, min vs. max)
# shows how far each quotient is from being constant across rows.
ratios.describe()

Unnamed: 0,MonthMonth,MonthDay,MonthHour,DayHour,IncomeDay,IncomeHour
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,3.344066,27.808246,243.345719,13.566844,12.462828,110.8361
std,2.8662,30.827871,155.844962,8.786657,16.428979,95.245212
min,0.108963,1.511881,23.228261,1.181818,0.740822,12.146067
25%,1.274897,10.088197,121.698764,7.013242,3.90385,47.334751
50%,2.520108,17.456525,215.169125,11.973818,7.147781,78.270321
75%,4.531661,33.206233,323.004444,17.802426,14.246413,141.491577
max,26.758176,255.951923,876.166667,49.666667,185.057143,620.83871


In [15]:
# Show the four source columns next to the ratios derived from them so each
# quotient can be checked by eye against its inputs. The join is done only for
# display: `ratios` itself keeps just the six derived columns, so
# ratios.describe() is unaffected. Both frames share the same row index, so the
# index-based join lines the rows up one-to-one.
data[["MonthlyIncome", "MonthlyRate", "DailyRate", "HourlyRate"]].join(ratios).head()

Unnamed: 0,MonthlyIncome,MonthlyRate,DailyRate,HourlyRate,MonthMonth,MonthDay,MonthHour,DayHour,IncomeDay,IncomeHour
0,5993,19479,1102,94,3.250292,17.676044,207.223404,11.723404,5.438294,63.755319
1,5130,24907,279,61,4.855166,89.272401,408.311475,4.57377,18.387097,84.098361
2,2090,2396,1373,92,1.146411,1.745084,26.043478,14.923913,1.522214,22.717391
3,2909,23159,1392,56,7.961155,16.637213,413.553571,24.857143,2.089799,51.946429
4,3468,16632,591,40,4.795848,28.142132,415.8,14.775,5.86802,86.7
