# ML model to predict Employee Attriation

In [1]:
# Checking python version
from platform import python_version
print(python_version())

3.7.0


### Import required libraries

In [2]:
import pandas as pd
import numpy as np
from pandas import DataFrame, Series

import matplotlib.pyplot as plt



In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# 1. Read and Explore the data

In [4]:
df = pd.read_csv("Employee Attrition.csv")

In [5]:
# shape of the data frame
print ("The shape of dataframe is :",df.shape)


The shape of dataframe is : (1470, 34)


In [6]:
# checking datatypes
print("The datatypes of the coloums :\n",df.dtypes)


The datatypes of the coloums :
 EmployeeNumber               int64
Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                   object
EducationField              object
EnvironmentSatisfaction     object
Gender                      object
HourlyRate                   int64
JobInvolvement              object
JobLevel                     int64
JobRole                     object
JobSatisfaction             object
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating           object
RelationshipSatisfaction    object
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears      

In [7]:
# column names of data frame
col_names = list(df.columns)
col_names

['EmployeeNumber',
 'Age',
 'Attrition',
 'BusinessTravel',
 'DailyRate',
 'Department',
 'DistanceFromHome',
 'Education',
 'EducationField',
 'EnvironmentSatisfaction',
 'Gender',
 'HourlyRate',
 'JobInvolvement',
 'JobLevel',
 'JobRole',
 'JobSatisfaction',
 'MaritalStatus',
 'MonthlyIncome',
 'MonthlyRate',
 'NumCompaniesWorked',
 'Over18',
 'OverTime',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StandardHours',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']

In [8]:
# Check for null values
df.isnull().sum()

EmployeeNumber              0
Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithC

In [9]:
# description of the data
df.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
EmployeeNumber,1470,,,,735.5,424.497,1.0,368.25,735.5,1102.75,1470.0
Age,1470,,,,36.9238,9.13537,18.0,30.0,36.0,43.0,60.0
Attrition,1470,2.0,No,1233.0,,,,,,,
BusinessTravel,1470,3.0,Travel_Rarely,1043.0,,,,,,,
DailyRate,1470,,,,802.486,403.509,102.0,465.0,802.0,1157.0,1499.0
Department,1470,3.0,Research & Development,961.0,,,,,,,
DistanceFromHome,1470,,,,9.19252,8.10686,1.0,2.0,7.0,14.0,29.0
Education,1470,5.0,Bachelor,572.0,,,,,,,
EducationField,1470,6.0,Life Sciences,606.0,,,,,,,
EnvironmentSatisfaction,1470,4.0,High,453.0,,,,,,,


# Number of unique values of each column

In [10]:
for i in df.columns:
    print ("Number of unique value in {} column are {} \n The unique values are{} :".format(i,len(df[i].unique()),df[i].unique()))
    print("\n____________________________________\n")


Number of unique value in EmployeeNumber column are 1470 
 The unique values are[   1    2    3 ... 1468 1469 1470] :

____________________________________

Number of unique value in Age column are 43 
 The unique values are[41 49 37 33 27 32 59 30 38 36 35 29 31 34 28 22 53 24 21 42 44 46 39 43
 50 26 48 55 45 56 23 51 40 54 58 20 25 19 57 52 47 18 60] :

____________________________________

Number of unique value in Attrition column are 2 
 The unique values are['Yes' 'No'] :

____________________________________

Number of unique value in BusinessTravel column are 3 
 The unique values are['Travel_Rarely' 'Travel_Frequently' 'Non-Travel'] :

____________________________________

Number of unique value in DailyRate column are 886 
 The unique values are[1102  279 1373 1392  591 1005 1324 1358  216 1299  809  153  670 1346
  103 1389  334 1123 1219  371  673 1218  419  391  699 1282 1125  691
  477  705  924 1459  125  895  813 1273  869  890  852 1141  464 1240
 1357  994  721 1360 

In [11]:
# categorical columns
cat_col = []
for i in col_names:
    if df[i].dtype =='object':
        cat_col.append(i)
    

In [12]:
cat_col

['Attrition',
 'BusinessTravel',
 'Department',
 'Education',
 'EducationField',
 'EnvironmentSatisfaction',
 'Gender',
 'JobInvolvement',
 'JobRole',
 'JobSatisfaction',
 'MaritalStatus',
 'Over18',
 'OverTime',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'WorkLifeBalance']

In [13]:
df.Education.unique()

array(['College', 'Below College', 'Master', 'Bachelor', 'Doctor'],
      dtype=object)

In [14]:
# convert ordinal columns to integers 
df.Education = df.Education.replace(to_replace=[1,2,3,4,5],value=['Below College','College','Bachelor','Master', 'Doctor'])

In [15]:
df.EnvironmentSatisfaction.unique()

array(['Medium', 'High', 'Very High', 'Low'], dtype=object)

In [16]:
# convert ordinal columns to integers 
df.EnvironmentSatisfaction = df.EnvironmentSatisfaction.replace(to_replace=[1,2,3,4],value=['Low','Medium', 'High', 'Very High' ])

In [17]:
df.JobInvolvement.unique()

array(['High', 'Medium', 'Very High', 'Low'], dtype=object)

In [18]:
# convert ordinal columns to integers 
df.JobInvolvement = df.JobInvolvement.replace(to_replace=[1,2,3,4],value=['Low','Medium', 'High', 'Very High' ])

In [19]:
df.JobSatisfaction.unique()

array(['Very High', 'Medium', 'High', 'Low'], dtype=object)

In [20]:
# convert ordinal columns to integers 
df.JobSatisfaction = df.JobSatisfaction.replace(to_replace=[1,2,3,4],value=['Low','Medium', 'High', 'Very High' ])

In [21]:
df.PerformanceRating.unique()

array(['Excellent', 'Outstanding'], dtype=object)

In [22]:
# convert ordinal columns to integers 
df.PerformanceRating = df.PerformanceRating.replace(to_replace=[3,4],value=['Excellent', 'Outstanding'])

In [23]:
df.RelationshipSatisfaction.unique()

array(['Low', 'Very High', 'Medium', 'High'], dtype=object)

In [24]:
# convert ordinal columns to integers 
df.RelationshipSatisfaction = df.RelationshipSatisfaction.replace(to_replace=[1,2,3,4],value=['Low','Medium', 'High', 'Very High' ])

In [25]:
df.WorkLifeBalance.unique()

array(['Bad', 'Better', 'Good', 'Best'], dtype=object)

In [26]:
# convert ordinal columns to integers 
df.WorkLifeBalance = df.WorkLifeBalance.replace(to_replace=[1,2,3,4],value=['Bad', 'Good','Better',  'Best'])

## Convert all categorical values to labels if there are less than 15 unique values

In [27]:
cat_cols=[]
for i in df.columns:
    if df[i].dtype == 'object' or len(df[i].unique())<15:
        cat_cols.append(i)
        print ("{} : {} : {}".format(i,len(df[i].unique()),df[i].unique()))

Attrition : 2 : ['Yes' 'No']
BusinessTravel : 3 : ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
Department : 3 : ['Sales' 'Research & Development' 'Human Resources']
Education : 5 : ['College' 'Below College' 'Master' 'Bachelor' 'Doctor']
EducationField : 6 : ['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources']
EnvironmentSatisfaction : 4 : ['Medium' 'High' 'Very High' 'Low']
Gender : 2 : ['Female' 'Male']
JobInvolvement : 4 : ['High' 'Medium' 'Very High' 'Low']
JobLevel : 5 : [2 1 3 4 5]
JobRole : 9 : ['Sales Executive' 'Research Scientist' 'Laboratory Technician'
 'Manufacturing Director' 'Healthcare Representative' 'Manager'
 'Sales Representative' 'Research Director' 'Human Resources']
JobSatisfaction : 4 : ['Very High' 'Medium' 'High' 'Low']
MaritalStatus : 3 : ['Single' 'Married' 'Divorced']
NumCompaniesWorked : 10 : [8 1 6 9 0 4 5 2 7 3]
Over18 : 1 : ['Y']
OverTime : 2 : ['Yes' 'No']
PerformanceRating : 2 : ['Excellent' 'Outstanding']
Relati

In [28]:
cat_cols

['Attrition',
 'BusinessTravel',
 'Department',
 'Education',
 'EducationField',
 'EnvironmentSatisfaction',
 'Gender',
 'JobInvolvement',
 'JobLevel',
 'JobRole',
 'JobSatisfaction',
 'MaritalStatus',
 'NumCompaniesWorked',
 'Over18',
 'OverTime',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StandardHours',
 'StockOptionLevel',
 'TrainingTimesLastYear',
 'WorkLifeBalance']

In [29]:
df[cat_cols].dtypes

Attrition                   object
BusinessTravel              object
Department                  object
Education                   object
EducationField              object
EnvironmentSatisfaction     object
Gender                      object
JobInvolvement              object
JobLevel                     int64
JobRole                     object
JobSatisfaction             object
MaritalStatus               object
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PerformanceRating           object
RelationshipSatisfaction    object
StandardHours                int64
StockOptionLevel             int64
TrainingTimesLastYear        int64
WorkLifeBalance             object
dtype: object

## convert all columns in cat_cols to categorical datatype

In [30]:
for i in cat_cols:
    df[i]=df[i].astype("category")

In [31]:
df[cat_cols].dtypes

Attrition                   category
BusinessTravel              category
Department                  category
Education                   category
EducationField              category
EnvironmentSatisfaction     category
Gender                      category
JobInvolvement              category
JobLevel                    category
JobRole                     category
JobSatisfaction             category
MaritalStatus               category
NumCompaniesWorked          category
Over18                      category
OverTime                    category
PerformanceRating           category
RelationshipSatisfaction    category
StandardHours               category
StockOptionLevel            category
TrainingTimesLastYear       category
WorkLifeBalance             category
dtype: object

## Summary of categorical columns

In [32]:
df.describe(include="category").transpose()

Unnamed: 0,count,unique,top,freq
Attrition,1470,2,No,1233
BusinessTravel,1470,3,Travel_Rarely,1043
Department,1470,3,Research & Development,961
Education,1470,5,Bachelor,572
EducationField,1470,6,Life Sciences,606
EnvironmentSatisfaction,1470,4,High,453
Gender,1470,2,Male,882
JobInvolvement,1470,4,High,868
JobLevel,1470,5,1,543
JobRole,1470,9,Sales Executive,326


In [33]:
num_cols = [i for i in df.columns if i not in cat_cols]
num_cols

['EmployeeNumber',
 'Age',
 'DailyRate',
 'DistanceFromHome',
 'HourlyRate',
 'MonthlyIncome',
 'MonthlyRate',
 'PercentSalaryHike',
 'TotalWorkingYears',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']

# Building Prediction model


In [34]:
df.head()

Unnamed: 0,EmployeeNumber,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,1,41,Yes,Travel_Rarely,1102,Sales,1,College,Life Sciences,Medium,...,Low,80,0,8,0,Bad,6,4,0,5
1,2,49,No,Travel_Frequently,279,Research & Development,8,Below College,Life Sciences,High,...,Very High,80,1,10,3,Better,10,7,1,7
2,3,37,Yes,Travel_Rarely,1373,Research & Development,2,College,Other,Very High,...,Medium,80,0,7,3,Better,0,0,0,0
3,4,33,No,Travel_Frequently,1392,Research & Development,3,Master,Life Sciences,Very High,...,High,80,0,8,3,Better,8,7,3,0
4,5,27,No,Travel_Rarely,591,Research & Development,2,Below College,Medical,Low,...,Very High,80,1,6,3,Better,2,2,2,2


In [35]:
df.shape

(1470, 34)

## Lable Encoding



In [36]:
from sklearn.preprocessing import LabelEncoder

df_lb_encd = df.select_dtypes(exclude=['number']) \
                .apply(LabelEncoder().fit_transform) \
                .join(df.select_dtypes(include=['number']))

### Target 

In [37]:
y=df_lb_encd['Attrition']

In [38]:
y.shape

(1470,)

### Predictors

In [39]:
X = df_lb_encd.drop(columns="Attrition")

In [40]:
X.shape

(1470, 33)

### Split data into train and test

In [41]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1234)

In [42]:
# Shape of train and test
print("train shape {} \n test shape  {}".format(X_train.shape,X_test.shape))

train shape (1029, 33) 
 test shape  (441, 33)


In [43]:
type(y_train)

pandas.core.series.Series

In [44]:
type(X_train)

pandas.core.frame.DataFrame

## Logistic Regression

In [45]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()

### Training the model

In [46]:
log_reg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Predict on first 10 test values

In [47]:
log_reg.predict(X_test[0:10])

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

In [48]:
y_test[0:10]

520     0
286     1
347     0
1245    0
1153    1
338     0
1430    0
1438    1
466     0
536     0
Name: Attrition, dtype: int64

### Predict on test data

In [49]:
predict_log_reg = log_reg.predict(X_test)

In [50]:
score = log_reg.score(X_test,y_test)
score

0.854875283446712

## Confusion Matrix

In [51]:
from sklearn import metrics

In [52]:
log_reg_cm = metrics.confusion_matrix(y_test,predict_log_reg)
log_reg_cm

array([[367,   4],
       [ 60,  10]], dtype=int64)

# Decision Tree


In [None]:
# import libaries
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

In [62]:
# train Decision tree
dt_clf = tree.DecisionTreeClassifier()
dt_clf= dt_clf.fit(X_train,y_train)

In [63]:
dt_clf.predict(X_test[0:10])

array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0], dtype=int64)

In [65]:
y_test[0:10]

520     0
286     1
347     0
1245    0
1153    1
338     0
1430    0
1438    1
466     0
536     0
Name: Attrition, dtype: int64

### prediction on test data by decision tree

In [66]:
# predicting on test data
dt_clf_test_pdt = dt_clf.predict(X_test)

## Confusion matrix

In [70]:
from sklearn import metrics
dt_clf_CM = metrics.confusion_matrix(y_true=y_test,y_pred=dt_clf_test_pdt)
print ("Confusion matrix of DT")
dt_clf_CM

Confusion matrix of DT


array([[331,  40],
       [ 44,  26]], dtype=int64)

In [72]:
# Accuracy
dt_clf.score(X=X_test,y=y_test)

0.8095238095238095

# Decision tree using gini 

In [76]:
dt_clf_gini = DecisionTreeClassifier(criterion="gini",random_state=100,max_depth=3,min_samples_leaf=5)

In [77]:
dt_clf_gini.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

In [78]:
dt_clf_gini.score(X=X_test,y=y_test)

0.8503401360544217

In [79]:
# prediction on test data
dt_clf_gini_test_pdt = dt_clf_gini.predict(X=X_test)

In [81]:
from sklearn import metrics
dt_clf_gini_CM = metrics.confusion_matrix(y_true=y_test,y_pred=dt_clf_gini_test_pdt)
print ("Confusion matrix of DT")
dt_clf_gini_CM

Confusion matrix of DT


array([[361,  10],
       [ 56,  14]], dtype=int64)

In [82]:
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot
# Pull out one tree from the forest
tree = dt_clf_gini

ModuleNotFoundError: No module named 'pydot'

In [None]:
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot
# Pull out one tree from the forest
tree = dt_clf_gini.estimators_[5]
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot
# Pull out one tree from the forest
tree = rf.estimators_[5]
# Export the image to a dot file
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)
# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('tree.dot')