In [1]:
# Data Preprocessing

import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder 

# Report the library versions this notebook was run with
for label, module in (('pandas ', pd), ('numpy ', np), ('sklearn', sk)):
    print(label, module.__version__)

pandas  1.1.3
numpy  1.19.2
sklearn 0.23.2


In [2]:
# Read the raw HR attrition CSV into a DataFrame and show its (rows, columns)
csv_path = 'data/WA_Fn-UseC_-HR-Employee-Attrition.csv'
data = pd.read_csv(csv_path)
data.shape

(1470, 35)

In [3]:
# Separate the true labels from the attribute data: `y` is the 'Attrition'
# column of the loaded frame (still string labels, dtype object, at this point).
y = data['Attrition']
print(y)

0       Yes
1        No
2       Yes
3        No
4        No
       ... 
1465     No
1466     No
1467     No
1468     No
1469     No
Name: Attrition, Length: 1470, dtype: object


In [4]:
# Build the feature frame: every column of `data` except the 'Attrition' label
X = data.drop(columns=['Attrition'])
print(X)

      Age     BusinessTravel  DailyRate              Department  \
0      41      Travel_Rarely       1102                   Sales   
1      49  Travel_Frequently        279  Research & Development   
2      37      Travel_Rarely       1373  Research & Development   
3      33  Travel_Frequently       1392  Research & Development   
4      27      Travel_Rarely        591  Research & Development   
...   ...                ...        ...                     ...   
1465   36  Travel_Frequently        884  Research & Development   
1466   39      Travel_Rarely        613  Research & Development   
1467   27      Travel_Rarely        155  Research & Development   
1468   49  Travel_Frequently       1023                   Sales   
1469   34      Travel_Rarely        628  Research & Development   

      DistanceFromHome  Education EducationField  EmployeeCount  \
0                    1          2  Life Sciences              1   
1                    8          1  Life Sciences             

In [5]:
def transform(feature, frame=None):
    """
    Label-encode one categorical column in place and print its classes.

    A fresh LabelEncoder is fitted on ``frame[feature]``; the column is then
    overwritten with the resulting integer codes and the encoder's
    ``classes_`` array is printed.

    Parameters
    ----------
    feature : str
        Name of the column to encode.
    frame : pandas.DataFrame, optional
        Frame that holds the column. Defaults to the notebook-level ``X`` so
        existing ``transform(col)`` calls keep working unchanged.

    Returns
    -------
    LabelEncoder
        The fitted encoder, so the caller can keep the label-to-code mapping
        (e.g. to call ``inverse_transform`` later).
    """
    if frame is None:
        frame = X  # backward-compatible default: the global feature frame
    le = LabelEncoder()
    frame[feature] = le.fit_transform(frame[feature])
    print(le.classes_)
    return le

In [6]:
# Collect the object-dtype columns of X; these are the ones still to be encoded
cat_df = X.select_dtypes(include=['object'])
cat_df.columns

Index(['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole',
       'MaritalStatus', 'Over18', 'OverTime'],
      dtype='object')

In [7]:
# Label-encode every categorical column (transform prints the classes it found)
for column_name in cat_df:  # iterating a DataFrame yields its column labels
    transform(column_name)

['Non-Travel' 'Travel_Frequently' 'Travel_Rarely']
['Human Resources' 'Research & Development' 'Sales']
['Human Resources' 'Life Sciences' 'Marketing' 'Medical' 'Other'
 'Technical Degree']
['Female' 'Male']
['Healthcare Representative' 'Human Resources' 'Laboratory Technician'
 'Manager' 'Manufacturing Director' 'Research Director'
 'Research Scientist' 'Sales Executive' 'Sales Representative']
['Divorced' 'Married' 'Single']
['Y']
['No' 'Yes']


In [8]:
# Encode the Attrition labels in y as integer codes
le = LabelEncoder()
le.fit(y)
y = le.transform(y)


In [9]:
# y is now an integer array (the first label, 'Yes', shows up as 1)
print(y)

[1 0 1 ... 0 0 0]


In [10]:
# Preview the feature frame after the categorical columns were label-encoded
X.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,2,1102,2,1,2,1,1,1,2,...,1,80,0,8,0,1,6,4,0,5
1,49,1,279,1,8,1,1,1,2,3,...,4,80,1,10,3,3,10,7,1,7
2,37,2,1373,1,2,2,4,1,4,4,...,2,80,0,7,3,3,0,0,0,0
3,33,1,1392,1,3,4,1,1,5,4,...,3,80,0,8,3,3,8,7,3,0
4,27,2,591,1,2,1,3,1,7,1,...,4,80,1,6,3,3,2,2,2,2


# Feature Selection

In [11]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.decomposition import PCA

# Feature selection
# Score every feature against the target with the chi-squared statistic and
# keep the k highest-scoring ones via scikit-learn's SelectKBest.
# k = 22 was chosen arbitrarily.
# NOTE(review): the selector is fitted on the full dataset, before the
# train/test split performed later in the notebook, so the held-out rows
# influence which features are kept.

selector = SelectKBest(chi2, k=22)

X_auto = selector.fit_transform(X, y)
selected_features = selector.get_support()  # boolean mask over X's columns

# Index the column labels with the mask directly instead of looping over
# positions and comparing each flag to True.
column_headers = X.columns[selected_features].tolist()

print(column_headers)



['Age', 'DailyRate', 'DistanceFromHome', 'EmployeeNumber', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']


In [12]:
# Shape of the matrix returned by the selector (rows, selected features)
X_auto.shape

(1470, 22)

In [13]:
# Rescale the selected features with min-max scaling (MinMaxScaler's default
# maps each column to [0, 1]); this is normalisation, not z-score
# standardisation.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# done later in the notebook, so test-set minima/maxima affect the training
# features.

scaler = MinMaxScaler()
X_new = scaled_df = scaler.fit_transform(X_auto)


In [14]:
# Show the first row of the scaled feature matrix
print(X_new[0])

[0.54761905 0.71581961 0.         0.         0.33333333 0.66666667
 0.25       0.875      1.         1.         0.26245392 0.6980526
 0.88888889 1.         0.         0.         0.2        0.
 0.15       0.22222222 0.         0.29411765]



# Balance the Dataset

In [15]:
# Balance the training data by oversampling the minority class with SVM-SMOTE.
# (Only SVMSMOTE is applied here; no Tomek-link cleaning step is performed.)

from imblearn.over_sampling import BorderlineSMOTE, SVMSMOTE
from sklearn.model_selection import train_test_split

# Stratified split with 15% of the rows held out for testing
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.15, random_state=0, stratify=y)

# Resample the training split only, so the test split contains no synthetic
# rows. random_state is fixed (same seed as the split above) so the generated
# samples, and therefore the saved train.csv, are reproducible across runs.
smt = SVMSMOTE(sampling_strategy='minority', random_state=0)
X_smt, y_smt = smt.fit_resample(X_train, y_train)

# Re-attach the selected column names and append the label column
df_train = pd.DataFrame(X_smt, columns=column_headers)
df_train['Attrition'] = y_smt

df_test = pd.DataFrame(X_test, columns=column_headers)
df_test['Attrition'] = y_test

In [16]:
# Preview the resampled training frame (features plus the 'Attrition' column)
df_train.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,EmployeeNumber,EnvironmentSatisfaction,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,...,OverTime,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,0.809524,0.155333,0.071429,0.262216,1.0,0.333333,0.5,0.5,0.666667,0.5,...,1.0,0.666667,0.0,0.7,0.666667,0.125,0.222222,0.0,0.235294,0
1,0.190476,0.892627,0.785714,0.27044,0.0,0.666667,0.0,0.75,1.0,0.0,...,0.0,0.333333,0.666667,0.075,0.5,0.075,0.111111,0.0,0.117647,0
2,0.47619,0.012169,0.071429,0.148041,0.0,0.666667,0.5,0.875,0.666667,0.0,...,0.0,0.666667,0.666667,0.425,1.0,0.4,0.555556,0.333333,0.764706,0
3,0.857143,0.211167,0.642857,0.337204,0.666667,0.666667,0.5,0.125,0.333333,0.5,...,0.0,0.666667,0.333333,0.4,0.166667,0.225,0.388889,0.466667,0.058824,0
4,0.761905,0.8733,0.964286,0.636672,1.0,0.666667,0.75,0.625,0.0,0.5,...,1.0,0.333333,0.0,0.625,0.333333,0.075,0.111111,0.066667,0.117647,0


In [17]:
# Preview the held-out test frame (features plus the 'Attrition' column)
df_test.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,EmployeeNumber,EnvironmentSatisfaction,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,...,OverTime,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,0.309524,0.425913,0.321429,0.956942,0.666667,0.666667,0.5,0.625,0.666667,0.5,...,0.0,0.666667,0.333333,0.325,0.333333,0.275,0.388889,0.266667,0.470588,0
1,0.761905,0.555476,0.0,0.988389,0.333333,0.666667,0.25,0.875,0.666667,0.0,...,0.0,1.0,0.666667,0.3,0.5,0.15,0.166667,0.0,0.058824,1
2,0.333333,0.72083,1.0,0.505564,1.0,0.666667,0.0,0.25,0.666667,1.0,...,0.0,0.333333,0.0,0.25,0.333333,0.2,0.388889,0.466667,0.411765,0
3,0.238095,0.261274,0.214286,0.728592,0.666667,0.666667,0.25,0.875,0.0,1.0,...,0.0,1.0,0.0,0.125,0.833333,0.1,0.111111,0.066667,0.176471,0
4,0.095238,0.740157,0.142857,0.136913,1.0,1.0,0.0,0.75,0.333333,0.0,...,1.0,0.0,0.333333,0.1,0.333333,0.1,0.111111,0.133333,0.117647,0


In [18]:
# Size of the training frame after oversampling
df_train.shape

(2096, 23)

In [19]:
# Size of the test frame (not resampled)
df_test.shape

(221, 23)

In [20]:
# Save the preprocessed train and test frames as CSV, without the index column
for frame, out_path in ((df_train, r'data/train.csv'), (df_test, r'data/test.csv')):
    frame.to_csv(out_path, index=False)