# Microsoft Employee Attrition Classification Problem

In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from mlxtend.regressor import StackingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
data = pd.read_csv('employee-attrition.csv')
data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
# Splitting data into data type
data.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears   

There are objects and int64 data types.

In [4]:
# Integer data
data_int = data.select_dtypes(include = ['int64'])
data_int.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1102,1,2,1,1,2,94,3,2,...,1,80,0,8,0,1,6,4,0,5
1,49,279,8,1,1,2,3,61,2,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1373,2,2,1,4,4,92,2,1,...,2,80,0,7,3,3,0,0,0,0
3,33,1392,3,4,1,5,4,56,3,1,...,3,80,0,8,3,3,8,7,3,0
4,27,591,2,1,1,7,1,40,3,1,...,4,80,1,6,3,3,2,2,2,2


In [5]:
scaler = MinMaxScaler()
data_int = scaler.fit_transform(data_int)

In [6]:
# object data
data_obj = data.select_dtypes(include = ['object'])
data_obj.head()

Unnamed: 0,Attrition,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
0,Yes,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Y,Yes
1,No,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,Y,No
2,Yes,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Y,Yes
3,No,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,Y,Yes
4,No,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,Y,No


In [7]:
# Data Munging
le = LabelEncoder()
data_obj = data_obj.apply(le.fit_transform)
data_obj = data_obj.iloc[:, 1:8]
data_obj.head()

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18
0,2,2,1,0,7,2,0
1,1,1,1,1,6,1,0
2,2,1,4,1,2,2,0
3,1,1,1,0,6,1,0
4,2,1,3,1,2,1,0


In [8]:
target = data.iloc[:,[1]]
le = LabelEncoder()
target = target.apply(le.fit_transform)
target.head()

Unnamed: 0,Attrition
0,1
1,0
2,1
3,0
4,0


In [9]:
# One Hot Encoding
enc = OneHotEncoder()
data_obj = enc.fit_transform(data_obj).toarray()
data_obj.shape

(1470, 27)

In [10]:
data = np.hstack([data_obj,data_int])
data.shape

(1470, 53)

In [11]:
# Feature Selection
clf = RandomForestClassifier()
clf.fit(data,target)
clf.feature_importances_
model = SelectFromModel(clf, prefit = True)
data = model.transform(data)

  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2)

In [22]:
# Stacking Classifier
SGD_regressor = SGDClassifier()
rf = RandomForestClassifier()
gb = LogisticRegression(C = 10)
knn = SVC(C = 0.001)
gbr = GradientBoostingClassifier(max_depth = 3)

In [23]:
# Meta Estimator
stack = StackingRegressor(regressors=(SGD_regressor,gb, knn,rf),
                            meta_regressor=gbr)

In [24]:
stack.fit(X_train, y_train)
prediction = stack.predict(X_test)

  y = column_or_1d(y, warn=True)
  regr.fit(X, y)
  y = column_or_1d(y, warn=True)


In [25]:
accuracy = accuracy_score(y_test, prediction)
cm = confusion_matrix(y_test, prediction)
print(accuracy)
print(cm)

0.809523809524
[[233   8]
 [ 48   5]]


In [17]:
cvs = cross_val_score(stack,X_train, y_train, scoring = 'accuracy', cv = 10)
cvs = np.mean(cvs)
print(cvs)

  y = column_or_1d(y, warn=True)
  regr.fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  regr.fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  regr.fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  regr.fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  regr.fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  regr.fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  regr.fit(X, y)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  regr.fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  regr.fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  regr.fit(X, y)
  y = column_or_1d(y, warn=True)


0.841815152832


In [21]:
param_grid = { 'C': [0.001,0.01,0.1,1,10]}
cv = GridSearchCV(knn,param_grid,scoring = 'accuracy',n_jobs = -1, cv = 5 )
cv.fit(X_train, y_train)
best_param = cv.best_params_
print(best_param)

{'C': 0.001}


  y = column_or_1d(y, warn=True)
