In [1]:
# Prepare a model for the HR department to predict the Attrition and give the insights from the data
# about the important factors associated with the attrition so that HR can take the corrective or
# previntive measures to stop or control the attrition.



In [2]:
# Import modules
%matplotlib inline

import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier 
from urllib.request import urlopen 
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from six import StringIO 
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

plt.style.use('ggplot')
pd.set_option('display.max_columns', 500) 

In [3]:
df = pd.read_csv('C:/Users/DeepaK/Desktop/My Folder/My Learnings/HR_Employee_Attrition_Data.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/DeepaK/Desktop/My Folder/My Learnings/HR_Employee_Attrition_Data.csv'

In [None]:
df.columns

In [None]:
df.shape

In [None]:
# Converting features into a list

dx = ['Yes', 'No']
df.columns.tolist()

In [None]:
df.dtypes

In [None]:
df.describe()

In [None]:
# Calculating % of employees having Attrition labels as Yes and No. This also removes null / missing value
# possibility in Target Variable (TV).

df[df.Attrition.isin(['Yes', 'No'])].shape[0]/len(df)*100

In [None]:
# Encoding for the TV

df['Attrition'] = df['Attrition'].map({'Yes':1, 'No':0})

# EDA

In [None]:
# 1. Null / Missing Checking & Removal 

totalnulls = df.isnull().sum().sort_values(ascending=True)
missing_data = pd.concat([totalnulls], axis=1, keys=['Totalnulls'])
missing_data.head()

In [None]:
# 2. Checking for the 3 unwanted features: EmployeeCount, Over18, StandardHours as have same values throughout the features.

print('EmployeeCount unique values: ', df['EmployeeCount'].unique())
print('Over18 unique values: ',df['Over18'].unique())
print('StandardHours unique values:',df['StandardHours'].unique())

In [None]:
# 2.1 Thus dropping these features

mydf = df.drop(['EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber'], axis = 1)
mydf.head()

In [None]:
mydf.shape

In [None]:
# Plotting total count of Attrition in the total dataset

plt.figure(figsize = (4,4))
sns.countplot('Attrition', data = mydf)
plt.show()

In [None]:
# Calculating % Attrition

attrpc = mydf['Attrition'].value_counts()[1]/mydf['Attrition'].count()*100
attrpc

In [None]:
# Plotting distribution of 1st feature Age against Attrition

plt.figure(figsize = (8,3))
sns.distplot(mydf['Age'])
plt.show()

In [None]:
# Plotting Attrition Vs Age to check any relationship to generate an insight.

plt.figure(figsize = (16,6))
sns.swarmplot(y = 'Age', x = 'Attrition', data = mydf, hue = 'Attrition')
plt.show()

#### Insight 1: The Swarmplot shows Agewise, Atrrition is highest amongst Age group 28 - 34.

In [None]:
# Plotting distribution of Business Travel feature.

plt.figure(figsize = (8,2))
print(mydf['BusinessTravel'].value_counts())
sns.countplot(x= 'BusinessTravel', data = mydf)
plt.show()

In [None]:
# Plotting Attrition Vs Business travel to check any relationship to generate an insight.

plt.figure(figsize = (20,10))
sns.swarmplot(x= 'Attrition', y='Age',  data = mydf, hue = 'BusinessTravel')
plt.show()

#### Insight 2: Attrition is highest among age group 30-32. This section also travels frequently. 

In [None]:
# Plotting distribution for next feature Departmemnt

print(mydf['Department'].value_counts())
plt.figure(figsize = (8,3))
sns.countplot(mydf['Department'])
plt.show()

In [None]:
# Getting department wise count of all 3 Business travel categories

departmentgrp = mydf.groupby(by = 'Department')

#HR
df1 = departmentgrp.get_group('Human Resources')['BusinessTravel'].value_counts()
df1 = pd.DataFrame(df1)
df1 = df1.reset_index()
df1['Department'] = 'HR'
df1

# RnD

df2 = departmentgrp.get_group('Research & Development')['BusinessTravel'].value_counts()
df2 = df2.to_frame().reset_index()
df2['Department'] = 'R&D'
df3 = df1.append(df2, ignore_index=True)
df3

# Sales

df2 = departmentgrp.get_group('Sales')['BusinessTravel'].value_counts()
df2 = df2.to_frame().reset_index()
df2['Department'] = 'Sales'
df4 = df3.append(df2, ignore_index=True)
df4

In [None]:
#Plotting department data against travel

plt.figure(figsize = (8,3))
sns.barplot(x= 'Department', y = 'BusinessTravel' , data = df4, hue = 'index')
plt.show()

In [None]:
# Calculating attrition % for each department

#Sales
attrbysales = departmentgrp.get_group('Sales')['Attrition'].value_counts()
#HR
attrbyHR = departmentgrp.get_group('Human Resources')['Attrition'].value_counts()
#RnD
attrbyRnD = departmentgrp.get_group('Research & Development')['Attrition'].value_counts()

attrbydept = pd.DataFrame(columns = ['Department', 'Attrition'])

attrbydept.loc[len(attrbydept)] = ["Sales", attrbysales[1]/(attrbysales.sum())*100]
attrbydept.loc[len(attrbydept)] = ["HR",(attrbyHR[1]/attrbyHR.sum())*100]
attrbydept.loc[len(attrbydept)] = ["R&D",(attrbyRnD[1]/attrbyRnD.sum())*100]
attrbydept

In [None]:
#Plotting Attrition% for each department to find any relationship to generate an insight

plt.figure(figsize = (8,3))
sns.barplot(x= 'Department', y = 'Attrition' , data = attrbydept, hue = 'Department')
plt.show()

#### Insight 3: Sales Department is registering highest Attrition

In [None]:
# Calculating Attrition against EnvironmentSatisfaction level

narr = mydf.EnvironmentSatisfaction.unique()
arr = np.sort(narr)
EnvironmentSatisfaction = pd.Series(arr)
EnvironmentSatisfaction

In [None]:
trydf = pd.DataFrame(columns = ['Attrition', 'Count']) 
trydf['EnvironmentSatisfaction'] = EnvironmentSatisfaction
trydf
v = trydf[list(trydf.columns)[-1]]
trydf = trydf.drop(['EnvironmentSatisfaction'], axis = 1)
trydf.insert(0,'EnvironmentSatisfaction',v)
trydf['Attrition'] = 1

occur = mydf.groupby(['EnvironmentSatisfaction', 'Attrition']).size()

trydf.Count[0] = occur[1,1]
trydf.Count[1]= occur[2,1]
trydf.Count[2]= occur[3,1]
trydf.Count[3]= occur[4,1]
trydf

In [None]:
# Plotting Attrition vs Environmentsatisfaction to generate insight

plt.figure(figsize = (8,3))
sns.barplot(x= 'EnvironmentSatisfaction', y = 'Count' , data = trydf, hue = 'EnvironmentSatisfaction')
plt.show()

In [None]:
AttrG = mydf.groupby(['Attrition', 'Gender'])

In [None]:
# Checking Attrition Against Gender

df1 = mydf.groupby(['Attrition','Gender']).size().reset_index().rename(columns={0:'Count'})
df1

In [None]:
NoAttr = df1[df1['Attrition'] == 0].index
NoAttr

In [None]:
df1.drop(NoAttr, inplace = True)
df1

In [None]:
# Plotting Attrition Vs Gender to check any relationship to generate an insight

plt.figure(figsize = (8,3))
sns.barplot(x= 'Gender', y = 'Count' , data = df1, hue = 'Gender')
plt.show()

#### Insight 4: Male have higher Attrition rate than female.

In [None]:
# Checking Attrition against job level

JLdf = mydf.groupby(['Attrition','JobLevel']).size().reset_index().rename(columns={0:'Count'})
JLdf

In [None]:
NoAttr = JLdf[JLdf['Attrition'] == 0].index
JLdf
JLdf1 = JLdf.drop(NoAttr)
JLdf1 = JLdf1.reset_index()
JLdf1['Count'][0]

In [None]:
Attritionpc = [int(100 * JLdf1['Count'][0] / (JLdf1['Count'][0] + JLdf['Count'][0])),
               int(100 * JLdf1['Count'][1] / (JLdf1['Count'][1] + JLdf['Count'][1])),
               int(100 * JLdf1['Count'][2] / (JLdf1['Count'][2] + JLdf['Count'][2])),
               int(100 * JLdf1['Count'][3] / (JLdf1['Count'][3] + JLdf['Count'][3])),
               int(100 * JLdf1['Count'][4] / (JLdf1['Count'][4] + JLdf['Count'][4]))
              ] 
Attritionpc
JLdf1['Attrition %'] = Attritionpc
JLdf1
JLdf1 = JLdf1.drop('index', axis = 1)
JLdf1

In [None]:
# Plotting Attrition % Against Job level to check any relationship to generate an insight

plt.figure(figsize = (8,3))
sns.barplot(x= 'Attrition', y = 'Attrition %' , data = JLdf1, hue = 'JobLevel')
plt.show()

#### Insight 5: Employees belonging to Job Level 1 are exiting the most.

In [None]:
# Plotting Attrition against Monthly Income to check any relationship to generate an insight

plt.figure(figsize = (8,3))
sns.swarmplot(x= 'Attrition', y = 'MonthlyIncome' , data = mydf, hue= 'MaritalStatus')
plt.show()

#### Insight 6: Attrition is highest among employeeswho are Single. Morevoer, Bulk Attrition is happening with respect to the employees earning between 2500 to 2750.

In [None]:
#Plotting Age vs monthly income

plt.figure(figsize = (16,6))
sns.regplot(x= 'Age', y = 'MonthlyIncome' , data = mydf)
plt.show()

####  Insight 7: Looks like Age & Monthly Income features are having almost Linear Relationship.

In [None]:
# Analyzing Salary hike and plotting against Attrition to generate an insight

mydf['PercentSalaryHike'].describe()

In [None]:
plt.figure(figsize = (8,3))
sns.distplot(mydf['PercentSalaryHike'], kde = True)
plt.show()

#### Insight 8: Bulk Salary hike is < 15%. 

In [None]:
saldf = mydf.groupby(['PercentSalaryHike', 'Attrition'])

In [None]:
# Plotting Histogram of Count Vs PercentSalaryHike to check any relationship to generate an insight

plt.figure(figsize = (16,3))
plt.bar(saldf.PercentSalaryHike, saldf.Count)

#### Insight 9: If hike is < 13%, Attiriton is very high. On other hand, it is very low if the hike is > 13. 

In [None]:
# Checking Attrition Vs JobSatisfaction to check any relationship to generate an insight

JSdf = mydf.groupby(['Attrition','JobSatisfaction']).size().reset_index().rename(columns={0:'Count'})
JSdf

In [None]:
NoAttr = JSdf[JSdf['Attrition'] == 0].index
JSdf
JSdf1 = JSdf.drop(NoAttr)
JSdf1 = JSdf1.reset_index().drop(['index'], axis = 1)
JSdf1

In [None]:
Attritionpc = [int(100 * JSdf1['Count'][0] / (JSdf1['Count'][0] + JSdf['Count'][0])),
               int(100 * JSdf1['Count'][1] / (JSdf1['Count'][1] + JSdf['Count'][1])),
               int(100 * JSdf1['Count'][2] / (JSdf1['Count'][2] + JSdf['Count'][2])),
               int(100 * JSdf1['Count'][3] / (JSdf1['Count'][3] + JSdf['Count'][3])),
              ] 
Attritionpc
JSdf1['Attrition %'] = Attritionpc
JSdf1


In [None]:
# Plotting Attrition against JobSatisfactionelevel

plt.figure(figsize = (16,6))
sns.barplot(y = 'Attrition %', x = 'JobSatisfaction', data = JSdf1)
plt.show()

#### Insight 10: We can infer an Inverse Linear Relationship. As the Job Satisfaction level is increasing, Attrition is decreasing.

### Building Random Forest Classifier as per RF code on mydf

In [None]:
# Pre-Processing data: Encoding

from sklearn import preprocessing

def preprocessor(df):
    res_df = df.copy()
    le = preprocessing.LabelEncoder()
    
    res_df["Attrition"] = le.fit_transform(res_df["Attrition"])
    res_df["BusinessTravel"] = le.fit_transform(res_df["BusinessTravel"])
    res_df["Department"] = le.fit_transform(res_df["Department"])
    res_df["EducationField"] = le.fit_transform(res_df["EducationField"])
    res_df["Gender"] = le.fit_transform(res_df["Gender"])
    res_df["JobRole"] = le.fit_transform(res_df["JobRole"])
    res_df["MaritalStatus"] = le.fit_transform(res_df["MaritalStatus"])
    res_df["OverTime"] = le.fit_transform(res_df["OverTime"])
    
    return res_df

In [None]:
encoded_df = preprocessor(mydf)
encoded_df.shape
x = encoded_df.drop(['Attrition'],axis =1).values    # Dropping Target Variable
y = encoded_df['Attrition'].values

In [None]:
encoded_df.head()

In [None]:
#Segregating the Independent and the dependant variable

y = encoded_df["Attrition"].values
X = encoded_df.drop(["Attrition"],axis =1)
y

In [None]:
# Splitting dataset into Training & Testing

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size =0.2)
print(x_train.shape)
print(y_train.shape)

In [None]:
# Initializing Random Forest 

myhrmodel = RandomForestClassifier(random_state=42)

In [None]:
# Decide n_estimator using OOBER

myhrmodel.set_params(warm_start=False, 
                  oob_score=True)

min_estimators = 15
max_estimators = 500

error_rate = {}

for i in range(min_estimators, max_estimators + 1):
    myhrmodel.set_params(n_estimators=i)
    myhrmodel.fit(x_train, y_train)

    oob_error = 1 - myhrmodel.oob_score_
    error_rate[i] = oob_error

In [None]:
# Convert dictionary to a pandas series for easy plotting 
oob_series = pd.Series(error_rate)

In [None]:
# Plotting n_estimator Vs OOBER to determine optimal n_estimator.

fig, ax = plt.subplots(figsize=(10, 10))

ax.set_facecolor('#fafafa')

oob_series.plot(kind='line',color = 'red')
plt.axhline(0.055, color='#875FDB',linestyle='--')
plt.axhline(0.05, color='#875FDB',linestyle='--')
plt.xlabel('n_estimators')
plt.ylabel('OOB Error Rate')
plt.title('OOB Error Rate Across various Forest sizes \n(From {} to {} trees)'.format(min_estimators, max_estimators))

In [None]:
# Build the decision tree model with tree size 300 as we can see from above graph, OOBER is stabilizing around 300.

myhrmodel = RandomForestClassifier(n_estimators = 300, random_state = 0)
myhrmodel.fit(x_train, y_train)
myhrmodel_score_train = myhrmodel.score(x_train, y_train)
print("Training score: ", myhrmodel_score_train)
myhrmodel_score_test = myhrmodel.score(x_test, y_test)
print("Testing score: ", myhrmodel_score_test)

### Predictions

In [None]:
# Doing Predictions of Attritions against X_test

HRPred = myhrmodel.predict(x_test)

### Confusion Matrix

In [None]:
def create_conf_mat(y_test, HRPred):
    """Function returns confusion matrix comparing two arrays"""
    
    if (len(y_test.shape) != len(HRPred.shape) == 1):
        return print('Arrays entered are not 1-D.\nPlease enter the correctly sized sets.')
    
    elif (y_test.shape != HRPred.shape):
        return print('Number of values inside the Arrays are not equal to each other.\nPlease make sure the array has the same number of instances.')
    else:
        # Set Metrics
        test_crosstb_comp = pd.crosstab(index = y_test,
                                        columns = HRPred)
        
        # Changed for Future deprecation of as_matrix
        test_crosstb = test_crosstb_comp.values
        return test_crosstb

In [None]:
conf_mat = create_conf_mat(y_test, HRPred)
sns.heatmap(conf_mat, annot=True, fmt='d', cbar=False)
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.title('Actual vs. Predicted Confusion Matrix')
plt.show()

In [None]:
#Printing confusion matrix
print (confusion_matrix(y_test, HRPred))

### Accuracy

In [None]:
accuracy_HR = myhrmodel.score(x_test, y_test)

print("Here is our mean accuracy on the test set:\n {0:.3f}".format(accuracy_HR))

### Test Error Rate

In [None]:
# Here we calculate the test error rate.

test_error_rate_HR = 1 - accuracy_HR
print("The test error rate for our model is:\n {0: .4f}".format(test_error_rate_HR))

### Area Under the Curve

In [None]:
# We grab the second array from the output which corresponds to  the predicted probabilites of positive classes 
# Ordered wrt fit.classes_ in our case [0, 1] where 1 is our positive class

predictions_prob = myhrmodel.predict_proba(x_test)[:, 1]

fpr2, tpr2, _  = roc_curve(y_test,
                          predictions_prob,
                          pos_label = 1)

In [None]:
auc_HR = auc(fpr2, tpr2)

In [None]:
def plot_roc_curve(fpr, tpr, auc, estimator, xlim = None, ylim = None):
   
    my_estimators = {'knn': ['Kth Nearest Neighbor', 'deeppink'],
                     'rf': ['Random Forest', 'red'],
                     'nn': ['Neural Network', 'purple']
                    }

    try:
        plot_title = my_estimators[estimator][0]
        color_value = my_estimators[estimator][1]
    except KeyError as e:
        
        print("'{0}' does not correspond with the appropriate key inside the estimators dictionary. \
\nPlease refer to function to check `my_estimators` dictionary.".format(estimator))
        
        raise

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_facecolor('#fafafa')

    plt.plot(fpr, tpr,
             color=color_value,
             linewidth=1)
    plt.title('ROC Curve For {0} (AUC = {1: 0.3f})'\
              .format(plot_title, auc))

    plt.plot([0, 1], [0, 1], 'k--', lw=2) # Add Diagonal line
    plt.plot([0, 0], [1, 0], 'k--', lw=2, color = 'black')
    plt.plot([1, 0], [1, 1], 'k--', lw=2, color = 'black')
    if xlim is not None:
        plt.xlim(*xlim)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()
    plt.close()

In [None]:
plot_roc_curve(fpr2, tpr2, auc_HR, 'rf',
               xlim=(-0.01, 1.05), 
               ylim=(0.001, 1.05))

### Classification Report

In [None]:
def print_class_report(predictions, alg_name):
   
    print('Classification Report for {0}:'.format(alg_name))
    print(classification_report(predictions, 
            y_test, 
            target_names = dx))

In [None]:
class_report = print_class_report(HRPred, 'Random Forest')