In [30]:
import os
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

sns.set()
pd.set_option('display.max_columns', None)

In [31]:
# Load the csv dataset into pandas dataframes
data_file = os.path.join(os.getcwd(), 'dataset', 'hr_employee_attrition.csv')
hr_df = pd.read_csv(data_file)
hr_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


In [32]:
hr_df.describe(include='all')

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470,1470,1470.0,1470,1470.0,1470.0,1470,1470.0,1470.0,1470.0,1470,1470.0,1470.0,1470.0,1470,1470.0,1470,1470.0,1470.0,1470.0,1470,1470,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
unique,,2,3,,3,,,6,,,,2,,,,9,,3,,,,1,2,,,,,,,,,,,,
top,,No,Travel_Rarely,,Research & Development,,,Life Sciences,,,,Male,,,,Sales Executive,,Married,,,,Y,No,,,,,,,,,,,,
freq,,1233,1043,,961,,,606,,,,882,,,,326,,673,,,,1470,1054,,,,,,,,,,,,
mean,36.92381,,,802.485714,,9.192517,2.912925,,1.0,1024.865306,2.721769,,65.891156,2.729932,2.063946,,2.728571,,6502.931293,14313.103401,2.693197,,,15.209524,3.153741,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,,,403.5091,,8.106864,1.024165,,0.0,602.024335,1.093082,,20.329428,0.711561,1.10694,,1.102846,,4707.956783,7117.786044,2.498009,,,3.659938,0.360824,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,,,102.0,,1.0,1.0,,1.0,1.0,1.0,,30.0,1.0,1.0,,1.0,,1009.0,2094.0,0.0,,,11.0,3.0,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,,,465.0,,2.0,2.0,,1.0,491.25,2.0,,48.0,2.0,1.0,,2.0,,2911.0,8047.0,1.0,,,12.0,3.0,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,,,802.0,,7.0,3.0,,1.0,1020.5,3.0,,66.0,3.0,2.0,,3.0,,4919.0,14235.5,2.0,,,14.0,3.0,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,,,1157.0,,14.0,4.0,,1.0,1555.75,4.0,,83.75,3.0,3.0,,4.0,,8379.0,20461.5,4.0,,,18.0,3.0,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0


In [33]:
hr_df.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

All employee numbers in the dataset are unique. I made this as the index.

In [34]:
hr_df['EmployeeNumber'].nunique()

1470

In [35]:
hr_df.set_index('EmployeeNumber', inplace=True)

In [36]:
hr_df.nunique()

Age                           43
Attrition                      2
BusinessTravel                 3
DailyRate                    886
Department                     3
DistanceFromHome              29
Education                      5
EducationField                 6
EmployeeCount                  1
EnvironmentSatisfaction        4
Gender                         2
HourlyRate                    71
JobInvolvement                 4
JobLevel                       5
JobRole                        9
JobSatisfaction                4
MaritalStatus                  3
MonthlyIncome               1349
MonthlyRate                 1427
NumCompaniesWorked            10
Over18                         1
OverTime                       2
PercentSalaryHike             15
PerformanceRating              2
RelationshipSatisfaction       4
StandardHours                  1
StockOptionLevel               4
TotalWorkingYears             40
TrainingTimesLastYear          7
WorkLifeBalance                4
YearsAtCom

In [37]:
hr_df['StockOptionLevel'].value_counts()

0    631
1    596
2    158
3     85
Name: StockOptionLevel, dtype: int64

In [38]:
hr_df['JobLevel'].value_counts()

1    543
2    534
3    218
4    106
5     69
Name: JobLevel, dtype: int64

In [39]:
hr_df

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
EmployeeNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
1,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
2,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
4,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
5,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
7,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2061,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,3,Male,41,4,2,Laboratory Technician,4,Married,2571,12290,4,Y,No,17,3,3,80,1,17,3,3,5,2,0,3
2062,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,4,Male,42,2,3,Healthcare Representative,1,Married,9991,21457,4,Y,No,15,3,1,80,1,9,5,3,7,7,1,7
2064,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2,Male,87,4,2,Manufacturing Director,2,Married,6142,5174,1,Y,Yes,20,4,2,80,1,6,0,3,6,2,0,3
2065,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,4,Male,63,2,2,Sales Executive,2,Married,5390,13243,2,Y,No,14,3,4,80,0,17,3,2,9,6,0,8


In [40]:
hr_df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [41]:
nominal_categories = ['Attrition', 'BusinessTravel', 'Department', 'EducationField', 
                      'Gender', 'JobRole', 'MaritalStatus', 'OverTime']

# Train Test Split

In [42]:
target = hr_df['Attrition']
features = hr_df.drop(['Attrition'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=234)

In [43]:
train_df = x_train.copy()
train_df['Attrition'] = y_train
train_df['Attrition']

EmployeeNumber
601      No
1710     No
1245     No
492     Yes
481      No
       ... 
1666     No
1207     No
974      No
441      No
613      No
Name: Attrition, Length: 1176, dtype: object

In [44]:
check_train_df = pd.merge(left=train_df, right=hr_df, left_index=True, right_index=True)
check_train_df[(check_train_df['Attrition_x'] != check_train_df['Attrition_y'])]

Unnamed: 0_level_0,Age_x,BusinessTravel_x,DailyRate_x,Department_x,DistanceFromHome_x,Education_x,EducationField_x,EmployeeCount_x,EnvironmentSatisfaction_x,Gender_x,HourlyRate_x,JobInvolvement_x,JobLevel_x,JobRole_x,JobSatisfaction_x,MaritalStatus_x,MonthlyIncome_x,MonthlyRate_x,NumCompaniesWorked_x,Over18_x,OverTime_x,PercentSalaryHike_x,PerformanceRating_x,RelationshipSatisfaction_x,StandardHours_x,StockOptionLevel_x,TotalWorkingYears_x,TrainingTimesLastYear_x,WorkLifeBalance_x,YearsAtCompany_x,YearsInCurrentRole_x,YearsSinceLastPromotion_x,YearsWithCurrManager_x,Attrition_x,Age_y,Attrition_y,BusinessTravel_y,DailyRate_y,Department_y,DistanceFromHome_y,Education_y,EducationField_y,EmployeeCount_y,EnvironmentSatisfaction_y,Gender_y,HourlyRate_y,JobInvolvement_y,JobLevel_y,JobRole_y,JobSatisfaction_y,MaritalStatus_y,MonthlyIncome_y,MonthlyRate_y,NumCompaniesWorked_y,Over18_y,OverTime_y,PercentSalaryHike_y,PerformanceRating_y,RelationshipSatisfaction_y,StandardHours_y,StockOptionLevel_y,TotalWorkingYears_y,TrainingTimesLastYear_y,WorkLifeBalance_y,YearsAtCompany_y,YearsInCurrentRole_y,YearsSinceLastPromotion_y,YearsWithCurrManager_y
EmployeeNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1


In [45]:
test_df = x_test.copy()
test_df['Attrition'] = y_test
test_df['Attrition']

EmployeeNumber
1527     No
500      No
1212     No
2056     No
1618     No
       ... 
550      No
1016    Yes
463      No
916      No
1613     No
Name: Attrition, Length: 294, dtype: object

In [46]:
check_test_df = pd.merge(left=test_df, right=hr_df, left_index=True, right_index=True)
# check_test_df
check_test_df[(check_test_df['Attrition_x'] != check_test_df['Attrition_y'])]

Unnamed: 0_level_0,Age_x,BusinessTravel_x,DailyRate_x,Department_x,DistanceFromHome_x,Education_x,EducationField_x,EmployeeCount_x,EnvironmentSatisfaction_x,Gender_x,HourlyRate_x,JobInvolvement_x,JobLevel_x,JobRole_x,JobSatisfaction_x,MaritalStatus_x,MonthlyIncome_x,MonthlyRate_x,NumCompaniesWorked_x,Over18_x,OverTime_x,PercentSalaryHike_x,PerformanceRating_x,RelationshipSatisfaction_x,StandardHours_x,StockOptionLevel_x,TotalWorkingYears_x,TrainingTimesLastYear_x,WorkLifeBalance_x,YearsAtCompany_x,YearsInCurrentRole_x,YearsSinceLastPromotion_x,YearsWithCurrManager_x,Attrition_x,Age_y,Attrition_y,BusinessTravel_y,DailyRate_y,Department_y,DistanceFromHome_y,Education_y,EducationField_y,EmployeeCount_y,EnvironmentSatisfaction_y,Gender_y,HourlyRate_y,JobInvolvement_y,JobLevel_y,JobRole_y,JobSatisfaction_y,MaritalStatus_y,MonthlyIncome_y,MonthlyRate_y,NumCompaniesWorked_y,Over18_y,OverTime_y,PercentSalaryHike_y,PerformanceRating_y,RelationshipSatisfaction_y,StandardHours_y,StockOptionLevel_y,TotalWorkingYears_y,TrainingTimesLastYear_y,WorkLifeBalance_y,YearsAtCompany_y,YearsInCurrentRole_y,YearsSinceLastPromotion_y,YearsWithCurrManager_y
EmployeeNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1


# Label Encoder on Training Data

In [47]:
# label_enc = LabelEncoder()
# label_enc = label_en.fit_transform()

In [48]:
label_enc_dict = defaultdict(LabelEncoder)
label_enc_dict

defaultdict(sklearn.preprocessing._label.LabelEncoder, {})

In [75]:
train_df = train_df.apply(lambda x: label_enc_dict[x.name].fit_transform(x) if x.name in nominal_categories else x)
train_df

Unnamed: 0_level_0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
EmployeeNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
601,40,2,302,1,6,3,1,1,2,0,75,3,4,4,3,2,13237,20364,7,Y,0,15,3,3,80,0,22,3,3,20,6,5,13,0
1710,36,2,530,2,2,4,1,1,3,0,51,3,2,8,4,2,4502,7439,3,Y,0,15,3,3,80,0,17,2,2,13,7,6,7,0
1245,54,1,966,1,1,4,1,1,4,0,53,3,3,4,3,0,10502,9659,7,Y,0,17,3,1,80,1,33,2,1,5,4,1,4,0
492,40,2,575,2,22,2,2,1,3,1,68,2,2,7,3,1,6380,6110,2,Y,1,12,3,1,80,2,8,6,3,6,4,1,0,1
481,36,1,541,2,3,4,3,1,1,1,48,2,3,7,4,1,9699,7246,4,Y,0,11,3,1,80,1,16,2,3,13,9,1,12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1666,43,2,1291,1,15,2,1,1,3,1,65,2,4,5,3,1,17603,3525,1,Y,0,24,4,1,80,1,14,3,3,14,10,6,11,0
1207,33,2,147,0,2,3,0,1,2,1,99,3,1,1,3,1,3600,8429,1,Y,0,13,3,4,80,1,5,2,3,5,4,1,4,0
974,29,1,1404,2,20,3,5,1,3,0,84,3,1,8,4,1,2157,18203,1,Y,0,15,3,2,80,1,3,5,3,3,1,0,2,0
441,30,2,1275,1,28,2,3,1,4,0,64,3,2,6,4,1,5775,11934,1,Y,0,13,3,4,80,2,11,2,3,10,8,1,9,0


In [50]:
hr_df.loc[[601, 1710, 1245], :]

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
EmployeeNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
601,40,No,Travel_Rarely,302,Research & Development,6,3,Life Sciences,1,2,Female,75,3,4,Manufacturing Director,3,Single,13237,20364,7,Y,No,15,3,3,80,0,22,3,3,20,6,5,13
1710,36,No,Travel_Rarely,530,Sales,2,4,Life Sciences,1,3,Female,51,3,2,Sales Representative,4,Single,4502,7439,3,Y,No,15,3,3,80,0,17,2,2,13,7,6,7
1245,54,No,Travel_Frequently,966,Research & Development,1,4,Life Sciences,1,4,Female,53,3,3,Manufacturing Director,3,Divorced,10502,9659,7,Y,No,17,3,1,80,1,33,2,1,5,4,1,4


# One Hot Encoding on Training Data

In [64]:
arr_train = np.asarray(train_df[nominal_categories])

In [86]:
# ohe = OneHotEncoder(categories=[['BusinessTravel', 'Department', 'EducationField', 
#                       'Gender', 'JobRole', 'MaritalStatus', 'OverTime']], drop='first')
ohe = OneHotEncoder(categories='auto', drop='first')
ohe_arr = ohe.fit_transform(train_df[nominal_categories]).toarray()
ohe_labels = ohe.categories_
print(ohe_labels)

[array([0, 1], dtype=int32), array([0, 1, 2], dtype=int32), array([0, 1, 2], dtype=int32), array([0, 1, 2, 3, 4, 5], dtype=int32), array([0, 1], dtype=int32), array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int32), array([0, 1, 2], dtype=int32), array([0, 1], dtype=int32)]


In [87]:
ohe_labels = np.array(ohe_labels).ravel()
print(ohe_labels)

[array([0, 1], dtype=int32) array([0, 1, 2], dtype=int32)
 array([0, 1, 2], dtype=int32) array([0, 1, 2, 3, 4, 5], dtype=int32)
 array([0, 1], dtype=int32)
 array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int32)
 array([0, 1, 2], dtype=int32) array([0, 1], dtype=int32)]


In [88]:
ohe_df = pd.DataFrame(ohe_arr, columns=ohe_labels)
ohe_df

ValueError: Shape of passed values is (1176, 22), indices imply (1176, 8)

In [67]:
train_df['Attrition'].shape

(1176,)

In [69]:
nominal_categories

['Attrition',
 'BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'OverTime']

In [54]:
ohe_dict = defaultdict(OneHotEncoder)
ohe_train_df = train_df[nominal_categories].apply(lambda x: ohe_dict[x.name].fit(x))

ValueError: Expected 2D array, got 1D array instead:
array=[0 0 0 ... 0 0 0].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [24]:
# onehot_enc_dict = defaultdict(OneHotEncoder)
# train_df = train_df.apply(lambda x: onehot_enc_dict[x.name].fit_transform(x) if x.name in nominal_categories else x )
# train_df

ValueError: Expected 2D array, got 1D array instead:
array=[2 2 1 ... 1 2 2].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [23]:
1/0

ZeroDivisionError: division by zero

Only Education is the numerical category since the source defined the values amoung ('Below College', 'College', 'Bachelor', 'Master', 'Doctor')

In [None]:
numerical_categs = ['Education']
# for categ in numerical_categs:
    