In [1]:
# Data Preprocessing

import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder 

# Record the library versions this notebook was executed with.
for lib_label, lib_module in (('pandas ', pd), ('numpy ', np), ('sklearn', sk)):
    print(lib_label, lib_module.__version__)

pandas  1.1.3
numpy  1.19.2
sklearn 0.23.2


In [2]:
# Read the HR employee-attrition CSV into a DataFrame and show its dimensions.
csv_path = 'data/WA_Fn-UseC_-HR-Employee-Attrition.csv'
data = pd.read_csv(csv_path)
data.shape

(1470, 35)

In [3]:
# Pull the target column ('Attrition') out of the raw frame; the feature
# matrix is built from the remaining columns in a separate cell.
y = data.loc[:, 'Attrition']
print(y)

0       Yes
1        No
2       Yes
3        No
4        No
       ... 
1465     No
1466     No
1467     No
1468     No
1469     No
Name: Attrition, Length: 1470, dtype: object


In [4]:
# Build the feature matrix: every column of the raw frame except the target.
# `drop` returns a new frame, so `data` itself is left unchanged.
X = data.drop(labels=['Attrition'], axis='columns')
print(X)

      Age     BusinessTravel  DailyRate              Department  \
0      41      Travel_Rarely       1102                   Sales   
1      49  Travel_Frequently        279  Research & Development   
2      37      Travel_Rarely       1373  Research & Development   
3      33  Travel_Frequently       1392  Research & Development   
4      27      Travel_Rarely        591  Research & Development   
...   ...                ...        ...                     ...   
1465   36  Travel_Frequently        884  Research & Development   
1466   39      Travel_Rarely        613  Research & Development   
1467   27      Travel_Rarely        155  Research & Development   
1468   49  Travel_Frequently       1023                   Sales   
1469   34      Travel_Rarely        628  Research & Development   

      DistanceFromHome  Education EducationField  EmployeeCount  \
0                    1          2  Life Sciences              1   
1                    8          1  Life Sciences             

In [5]:
def transform(feature, frame=None):
    """
    Label-encode one categorical column in place and return the fitted encoder.

    The column is overwritten with integer codes produced by a fresh
    ``LabelEncoder``; the encoder's ``classes_`` are printed so the
    code-to-label mapping is visible in the notebook output.

    Parameters
    ----------
    feature : column label
        Name of the column to encode.
    frame : pandas.DataFrame, optional
        Frame whose column is encoded in place. Defaults to the
        notebook-level ``X`` so existing ``transform(col)`` calls keep working.

    Returns
    -------
    LabelEncoder
        The fitted encoder, so callers can keep the mapping
        (e.g. for ``inverse_transform``) instead of losing it.
    """
    if frame is None:
        # Backward-compatible default: operate on the notebook's feature frame.
        frame = X
    le = LabelEncoder()
    frame[feature] = le.fit_transform(frame[feature])
    print(le.classes_)
    return le

In [6]:
# Collect the object-dtype (string) columns of X; these are the ones that
# need to be converted to numbers before modelling.
cat_df = X.select_dtypes(include=['object'])
cat_df.columns

Index(['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole',
       'MaritalStatus', 'Over18', 'OverTime'],
      dtype='object')

In [7]:
# Encode each categorical column of X in turn; iterating a DataFrame yields
# its column labels. `transform` prints the classes found for every column.
for column_name in cat_df:
    transform(column_name)

['Non-Travel' 'Travel_Frequently' 'Travel_Rarely']
['Human Resources' 'Research & Development' 'Sales']
['Human Resources' 'Life Sciences' 'Marketing' 'Medical' 'Other'
 'Technical Degree']
['Female' 'Male']
['Healthcare Representative' 'Human Resources' 'Laboratory Technician'
 'Manager' 'Manufacturing Director' 'Research Director'
 'Research Scientist' 'Sales Executive' 'Sales Representative']
['Divorced' 'Married' 'Single']
['Y']
['No' 'Yes']


In [8]:
# transform y
# Fit on the raw 'Attrition' column instead of on `y` itself so this cell is
# idempotent: re-running it always encodes the original 'No'/'Yes' labels
# rather than the integers produced by a previous run.
le=LabelEncoder()
y=le.fit_transform(data['Attrition'])


In [9]:
# Sanity check: show the target after encoding.
print(y)

[1 0 1 ... 0 0 0]


In [10]:
# Preview the first rows of X to confirm the categorical columns now hold integer codes.
X.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,2,1102,2,1,2,1,1,1,2,...,1,80,0,8,0,1,6,4,0,5
1,49,1,279,1,8,1,1,1,2,3,...,4,80,1,10,3,3,10,7,1,7
2,37,2,1373,1,2,2,4,1,4,4,...,2,80,0,7,3,3,0,0,0,0
3,33,1,1392,1,3,4,1,1,5,4,...,3,80,0,8,3,3,8,7,3,0
4,27,2,591,1,2,1,3,1,7,1,...,4,80,1,6,3,3,2,2,2,2


# Feature Selection

In [64]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Feature selection
# Use the built in scikit-learn function SelectKBest to select features
# Arbitrarily chose 22 features
# NOTE(review): chi2 expects non-negative feature values; this appears to hold
# for X after label encoding, but confirm before changing the inputs.

selector = SelectKBest(chi2, k=22)

X_auto = selector.fit_transform(X,y)

# Boolean mask over X's columns: True where the selector kept the column.
selected_features = selector.get_support()

# Index the column labels with the mask directly instead of looping over
# positions and comparing each entry to True.
column_headers = X.columns[selected_features].tolist()

print(column_headers)

['Age', 'DailyRate', 'DistanceFromHome', 'EmployeeNumber', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']


In [65]:
# Confirm the shape of the selected-feature matrix (rows, kept columns).
X_auto.shape

(1470, 22)

In [66]:
# Standardize the data

# Fit the scaler on the selected features and transform them in one step.
# `scaled_df` and `X_new` are two names for the same resulting array.
scaler = StandardScaler()
X_new = scaled_df = scaler.fit_transform(X_auto)


In [67]:
# Inspect the first row of the standardized feature matrix.
print(X_new[0])

[ 0.4463504   0.74252653 -1.01090934 -1.70128295 -0.66053067  0.37967213
 -0.05778755  1.03271569  1.15325359  1.23682046 -0.10834951  0.72601994
  2.12513592  1.59174553 -1.58417824 -0.93201439 -0.42164246 -2.17198183
 -0.16461311 -0.0632959  -0.67914568  0.24583399]


# Balance the Dataset

In [68]:
# combine SMOTE and Tomek techniques to balance the data

from imblearn.combine import SMOTETomek

# A fixed random_state makes the synthetic samples (and therefore the saved
# dataset) reproducible from one run to the next.
smt = SMOTETomek(sampling_strategy='auto', random_state=42)

# `fit_resample` is the supported method name; `fit_sample` was a deprecated
# alias that newer imbalanced-learn releases no longer provide.
# NOTE(review): resampling happens before any train/test split, so synthetic
# rows may be derived from samples that later land in a test set — confirm
# this is intended downstream.
X_smt, y_smt = smt.fit_resample(X_new, y)

In [69]:
# Wrap the resampled feature array in a DataFrame labelled with the selected column names.
df = pd.DataFrame(data=X_smt, columns=column_headers)

In [70]:
# Append the resampled target as the last column, named 'Attrition'.
df = df.assign(Attrition=y_smt)

In [71]:
# Preview the first rows of the resampled frame.
df.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,EmployeeNumber,EnvironmentSatisfaction,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,...,OverTime,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,0.44635,0.742527,-1.010909,-1.701283,-0.660531,0.379672,-0.057788,1.032716,1.153254,1.23682,...,1.591746,-1.584178,-0.932014,-0.421642,-2.171982,-0.164613,-0.063296,-0.679146,0.245834,1
1,1.322365,-1.297775,-0.14715,-1.699621,0.254625,-1.026167,-0.057788,0.626374,-0.660853,-0.133282,...,-0.628241,1.191438,0.241988,-0.164511,0.155707,0.488508,0.764998,-0.368715,0.806541,0
2,0.008343,1.414363,-0.887515,-1.696298,1.169781,-1.026167,-0.961486,-0.998992,0.2462,1.23682,...,1.591746,-0.658973,-0.932014,-0.550208,0.155707,-1.144294,-1.167687,-0.679146,-1.155935,1
3,-0.429664,1.461466,-0.764121,-1.694636,1.169781,0.379672,-0.961486,0.626374,0.2462,-0.133282,...,1.591746,0.266233,-0.932014,-0.421642,0.155707,0.161947,0.764998,0.252146,-1.155935,0
4,-1.086676,-0.524295,-0.887515,-1.691313,-1.575686,0.379672,-0.961486,-0.998992,-0.660853,-0.133282,...,-0.628241,1.191438,0.241988,-0.678774,0.155707,-0.817734,-0.615492,-0.058285,-0.595227,0


In [72]:
# Shape of the resampled frame (rows, columns).
df.shape

(2464, 23)

In [73]:
# Save the preprocessed data; index=False keeps the row index out of the CSV.
df.to_csv(r'data/data.csv', index=False)