# Project 2: Build Decision Tree for Attrition Rate Analysis
DV - "Attrition"
IDV - Output of RF Algorithm

In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn import preprocessing
from warnings import filterwarnings
from sklearn.ensemble import RandomForestClassifier
filterwarnings('ignore')

In [2]:
dataset=pd.read_csv('general_data.csv')
dataset.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [3]:
dataset.shape

(4410, 24)

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4410 entries, 0 to 4409
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      4410 non-null   int64  
 1   Attrition                4410 non-null   object 
 2   BusinessTravel           4410 non-null   object 
 3   Department               4410 non-null   object 
 4   DistanceFromHome         4410 non-null   int64  
 5   Education                4410 non-null   int64  
 6   EducationField           4410 non-null   object 
 7   EmployeeCount            4410 non-null   int64  
 8   EmployeeID               4410 non-null   int64  
 9   Gender                   4410 non-null   object 
 10  JobLevel                 4410 non-null   int64  
 11  JobRole                  4410 non-null   object 
 12  MaritalStatus            4410 non-null   object 
 13  MonthlyIncome            4410 non-null   int64  
 14  NumCompaniesWorked      

In [5]:
# Checking null values
dataset.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeID                  0
Gender                      0
JobLevel                    0
JobRole                     0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked         19
Over18                      0
PercentSalaryHike           0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           9
TrainingTimesLastYear       0
YearsAtCompany              0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [6]:
dataset['TotalWorkingYears'].mean()

11.279936378095888

In [7]:
dataset['NumCompaniesWorked'].median()

2.0

In [8]:
# Filling the Null values by mean and median
dataset['TotalWorkingYears']=dataset['TotalWorkingYears'].fillna(11.28) 
dataset['NumCompaniesWorked']=dataset['NumCompaniesWorked'].fillna(2)

In [9]:
# Verified that there is no null values
dataset.isnull().sum()

Age                        0
Attrition                  0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
EmployeeCount              0
EmployeeID                 0
Gender                     0
JobLevel                   0
JobRole                    0
MaritalStatus              0
MonthlyIncome              0
NumCompaniesWorked         0
Over18                     0
PercentSalaryHike          0
StandardHours              0
StockOptionLevel           0
TotalWorkingYears          0
TrainingTimesLastYear      0
YearsAtCompany             0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
dtype: int64

In [10]:
# Conversion of Categorical Variables
encoder=preprocessing.LabelEncoder()
dataset['Attrition']=encoder.fit_transform(dataset['Attrition'])
dataset['BusinessTravel']=encoder.fit_transform(dataset['BusinessTravel'])
dataset['Department']=encoder.fit_transform(dataset['Department'])
dataset['Over18']=encoder.fit_transform(dataset['Over18'])
dataset['MaritalStatus']=encoder.fit_transform(dataset['MaritalStatus'])
dataset['JobRole']=encoder.fit_transform(dataset['JobRole'])
dataset['Gender']=encoder.fit_transform(dataset['Gender'])
dataset['EducationField']=encoder.fit_transform(dataset['EducationField'])

In [11]:
dataset.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,0,2,2,6,2,1,1,1,0,...,1.0,0,11,8,0,1.0,6,1,0,0
1,31,1,1,1,10,1,1,1,2,0,...,0.0,0,23,8,1,6.0,3,5,1,4
2,32,0,1,1,17,4,4,1,3,1,...,1.0,0,15,8,3,5.0,2,5,0,3
3,38,0,0,1,2,5,1,1,4,1,...,3.0,0,11,8,3,13.0,5,8,7,5
4,32,0,2,1,10,1,3,1,5,1,...,4.0,0,12,8,2,9.0,2,6,0,4


In [12]:
# Dropping unnecessary columns
dataset=dataset.drop(['EmployeeCount','EmployeeID','Over18','StandardHours'],axis=1)
dataset.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,0,2,2,6,2,1,0,1,0,1,131160,1.0,11,0,1.0,6,1,0,0
1,31,1,1,1,10,1,1,0,1,6,2,41890,0.0,23,1,6.0,3,5,1,4
2,32,0,1,1,17,4,4,1,4,7,1,193280,1.0,15,3,5.0,2,5,0,3
3,38,0,0,1,2,5,1,1,3,1,1,83210,3.0,11,3,13.0,5,8,7,5
4,32,0,2,1,10,1,3,1,1,7,2,23420,4.0,12,2,9.0,2,6,0,4


In [13]:
# Splitting the Dataset
y=dataset.iloc[:,1:2]
y.head()

Unnamed: 0,Attrition
0,0
1,1
2,0
3,0
4,0


In [14]:
x=dataset.drop(['Attrition'],axis=1)
x.head()

Unnamed: 0,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,2,2,6,2,1,0,1,0,1,131160,1.0,11,0,1.0,6,1,0,0
1,31,1,1,10,1,1,0,1,6,2,41890,0.0,23,1,6.0,3,5,1,4
2,32,1,1,17,4,4,1,4,7,1,193280,1.0,15,3,5.0,2,5,0,3
3,38,0,1,2,5,1,1,3,1,1,83210,3.0,11,3,13.0,5,8,7,5
4,32,2,1,10,1,3,1,1,7,2,23420,4.0,12,2,9.0,2,6,0,4


In [15]:
y.shape

(4410, 1)

In [16]:
x.shape

(4410, 19)

In [17]:
# Building and fitting the model
tree_model=tree.DecisionTreeClassifier()
tree_model.fit(X=x,y=y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [18]:
# Checking the model score/accuracy for prediction
tree_model.score(X=x,y=y)

1.0

The Model is 100% Accurate for prediction

In [19]:
# Now Applying Random Forest to identify the Important IDV for Decision Tree
rf_model=RandomForestClassifier(n_estimators=1000,max_features=2,oob_score=True)

In [20]:
# fitting the RF model
rf_model.fit(X=x,y=y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [21]:
# Checking the Accuracy Score
print("OOB Accuracy Score:",rf_model.oob_score_)

OOB Accuracy Score: 1.0


In [22]:
# Checking the feature importances
for feature,imp in zip(x,rf_model.feature_importances_):
    print(feature,":",imp)

Age : 0.09647280292904185
BusinessTravel : 0.02800401296037174
Department : 0.02610932149436946
DistanceFromHome : 0.06909630020838216
Education : 0.040392492479776915
EducationField : 0.04095526779738022
Gender : 0.018020807216664218
JobLevel : 0.0378738146348469
JobRole : 0.0560635014737676
MaritalStatus : 0.03981551834352379
MonthlyIncome : 0.09482828610568833
NumCompaniesWorked : 0.05675860128066217
PercentSalaryHike : 0.06641309372373885
StockOptionLevel : 0.03394580597163122
TotalWorkingYears : 0.08608783913933209
TrainingTimesLastYear : 0.04495821903904646
YearsAtCompany : 0.06774023396400888
YearsSinceLastPromotion : 0.04243459304128336
YearsWithCurrManager : 0.05402948819648387


Here we observe that the following features are having their values >=0.05.
1. Age
2. DistanceFromHome
3. MonthlyIncome
4. PercentSalaryHike
5. TotalWorkingYears
6. YearsAtCompany
7. JobRole
8. NumCompaniesWorked
9. YearsWithCurrManager

These all are the important features which affect the attrition.

In [23]:
# Splitting the IDV i.e Important features
imp_features=dataset[['Age','DistanceFromHome','MonthlyIncome','PercentSalaryHike','TotalWorkingYears','YearsAtCompany','JobRole','NumCompaniesWorked','YearsWithCurrManager']]
imp_features.head()

Unnamed: 0,Age,DistanceFromHome,MonthlyIncome,PercentSalaryHike,TotalWorkingYears,YearsAtCompany,JobRole,NumCompaniesWorked,YearsWithCurrManager
0,51,6,131160,11,1.0,1,0,1.0,0
1,31,10,41890,23,6.0,5,6,0.0,4
2,32,17,193280,15,5.0,5,7,1.0,3
3,38,2,83210,11,13.0,8,1,3.0,5
4,32,10,23420,12,9.0,6,7,4.0,4


In [24]:
# Building the Decision Tree Model (Here depth=18 means we have 2 outputs attrition 1 and attrition 0 and we have 9 IDV i.e 2*9=18)
tree_model=tree.DecisionTreeClassifier(max_depth=18)

In [25]:
# Fitting the Decision Tree Model
tree_model.fit(X=imp_features,y=y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=18, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [26]:
# Creating the Decision Tree
with open("Dtree.dot","w") as f:
    f=tree.export_graphviz(tree_model,feature_names=["Age","DistanceFromHome","MonthlyIncome","PercentSalaryHike","TotalWorkingYears","YearsAtCompany","JobRole","NumCompaniesWorked","YearsWithCurrManager"],out_file=f);

In [27]:
# Checking the model score/accuracy for prediction
tree_model.score(X=imp_features,y=y)

0.9931972789115646

The Model is 99.3% Accurate for prediction

In [28]:
# Fitting the RF model
rf_model.fit(X=imp_features,y=y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [29]:
# Checking the Accuracy Score
print("OOB Accuracy Score:",rf_model.oob_score_)

OOB Accuracy Score: 0.9993197278911564


The OOB Accuracy is 99.9% which is almost considered as 100%

# Rules of the Decision Tree
1) Six Employees with Age greater than 33.5 years and total working years greater than 39 years has attrition as YES.

2) Three Employees with Age greater than 33.5 years but less than 39 years with total working years greater than 1.5 years and years with curr manager is greater than 0.5 and having monthly income less than 10300 has attrition as YES.

3) Thirty Employees with Age greater than 33.5 years with total working years less than 1.5 years with monthly income greater than 25425 has attrition as NO.

4) Three Employees with Age greater than 33.5 years with total working years less than 1.5 years with monthly income less than 25425 and distance from home is less than 2.5 has attrition as YES.

5) Three Employees with Age greater than 33.5 years with total working years less than 1.5 years and monthly income less than 25425 with distance from home is greater than 2.5 has attrition as NO.

6) Twenty One Employees with Age less than 33.5 years with total working years less than 1.5 years with monthly income greater than 112610 and job role is less than 5 has attrition as NO.