In [72]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.formula.api as sm
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

In [73]:
df = pd.read_excel("IBM Employee Attrition.xlsx")
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,1,2,Female,3,2,Sales Executive,4,Single,5993,8,Y,Yes,11,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,2,3,Male,2,2,Research Scientist,2,Married,5130,1,Y,No,23,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,4,Male,2,1,Laboratory Technician,3,Single,2090,6,Y,Yes,15,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,5,4,Female,3,1,Research Scientist,3,Married,2909,1,Y,Yes,11,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,7,1,Male,3,1,Laboratory Technician,2,Married,3468,9,Y,No,12,3,4,1,6,3,3,2,2,2,2


In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   Department                1470 non-null   object
 4   DistanceFromHome          1470 non-null   int64 
 5   Education                 1470 non-null   int64 
 6   EducationField            1470 non-null   object
 7   EmployeeNumber            1470 non-null   int64 
 8   EnvironmentSatisfaction   1470 non-null   int64 
 9   Gender                    1470 non-null   object
 10  JobInvolvement            1470 non-null   int64 
 11  JobLevel                  1470 non-null   int64 
 12  JobRole                   1470 non-null   object
 13  JobSatisfaction           1470 non-null   int64 
 14  MaritalStatus           

In [75]:
df.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [76]:
df.drop(columns=["Over18", "EmployeeNumber"], inplace = True)
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,Female,3,2,Sales Executive,4,Single,5993,8,Yes,11,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,Male,2,2,Research Scientist,2,Married,5130,1,No,23,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,Male,2,1,Laboratory Technician,3,Single,2090,6,Yes,15,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,Female,3,1,Research Scientist,3,Married,2909,1,Yes,11,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,Male,3,1,Laboratory Technician,2,Married,3468,9,No,12,3,4,1,6,3,3,2,2,2,2


In [77]:
df["zscore_Age"] = np.abs(stats.zscore(df["Age"]))
df["zscore_DistanceFromHome"] = np.abs(stats.zscore(df["DistanceFromHome"]))
df["zscore_Education"] = np.abs(stats.zscore(df["Education"]))
df["zscore_EnvironmentSatisfaction"] = np.abs(stats.zscore(df["EnvironmentSatisfaction"]))
df["zscore_JobInvolvement"] = np.abs(stats.zscore(df["JobInvolvement"]))
df["zscore_JobLevel"] = np.abs(stats.zscore(df["JobLevel"]))
df["zscore_JobSatisfaction"] = np.abs(stats.zscore(df["JobSatisfaction"]))
df["zscore_NumCompaniesWorked"] = np.abs(stats.zscore(df["NumCompaniesWorked"]))
df["zscore_PercentSalaryHike"] = np.abs(stats.zscore(df["PercentSalaryHike"]))
df["zscore_PerformanceRating"] = np.abs(stats.zscore(df["PerformanceRating"]))
df["zscore_RelationshipSatisfaction"] = np.abs(stats.zscore(df["RelationshipSatisfaction"]))
df["zscore_StockOptionLevel"] = np.abs(stats.zscore(df["StockOptionLevel"]))
df["zscore_TotalWorkingYears"] = np.abs(stats.zscore(df["TotalWorkingYears"]))
df["zscore_TrainingTimesLastYear"] = np.abs(stats.zscore(df["TrainingTimesLastYear"]))
df["zscore_WorkLifeBalance"] = np.abs(stats.zscore(df["WorkLifeBalance"]))
df["zscore_YearsAtCompany"] = np.abs(stats.zscore(df["YearsAtCompany"]))
df["zscore_YearsInCurrentRole"] = np.abs(stats.zscore(df["YearsInCurrentRole"]))
df["zscore_YearsSinceLastPromotion"] = np.abs(stats.zscore(df["YearsSinceLastPromotion"]))
df["zscore_YearsWithCurrManager"] = np.abs(stats.zscore(df["YearsWithCurrManager"]))
df["zscore_MonthlyIncome"] = np.abs(stats.zscore(df["MonthlyIncome"]))

In [78]:
Age_outliers = df[df["zscore_Age"] > 3].index
df = df.drop(Age_outliers)

## 
DistanceFromHome_outliers = df[df["zscore_DistanceFromHome"] > 3].index
df = df.drop(DistanceFromHome_outliers)

## 
Education_outliers = df[df["zscore_Education"] > 3].index
df = df.drop(Education_outliers)

## 
EnvironmentSatisfaction_outliers = df[df["zscore_EnvironmentSatisfaction"] > 3].index
df = df.drop(EnvironmentSatisfaction_outliers)

## 
JobInvolvement_outliers = df[df["zscore_JobInvolvement"] > 3].index
df = df.drop(JobInvolvement_outliers)

## 
JobSatisfaction_outliers = df[df["zscore_JobSatisfaction"] > 3].index
df = df.drop(JobSatisfaction_outliers)

## 
JobLevel_outliers = df[df["zscore_JobLevel"] > 3].index
df = df.drop(JobLevel_outliers)

## 
MonthlyIncome_outliers = df[df["zscore_MonthlyIncome"] > 3].index
df = df.drop(MonthlyIncome_outliers)

## 
NumCompaniesWorked_outliers = df[df["zscore_NumCompaniesWorked"] > 3].index
df = df.drop(NumCompaniesWorked_outliers)

## 
PercentSalaryHike_outliers = df[df["zscore_PercentSalaryHike"] > 3].index
df = df.drop(PercentSalaryHike_outliers)

## 
PerformanceRating_outliers = df[df["zscore_PerformanceRating"] > 3].index
df = df.drop(PerformanceRating_outliers)

## 
RelationshipSatisfaction_outliers = df[df["zscore_RelationshipSatisfaction"] > 3].index
df = df.drop(RelationshipSatisfaction_outliers)

## 
StockOptionLevel_outliers = df[df["zscore_StockOptionLevel"] > 3].index
df = df.drop(StockOptionLevel_outliers)

## 
TotalWorkingYears_outliers = df[df["zscore_TotalWorkingYears"] > 3].index
df = df.drop(TotalWorkingYears_outliers)

## 
TrainingTimesLastYear_outliers = df[df["zscore_TrainingTimesLastYear"] > 3].index
df = df.drop(TrainingTimesLastYear_outliers)

## 
WorkLifeBalance_outliers = df[df["zscore_WorkLifeBalance"] > 3].index
df = df.drop(WorkLifeBalance_outliers)

## 
YearsAtCompany_outliers = df[df["zscore_YearsAtCompany"] > 3].index
df = df.drop(YearsAtCompany_outliers)

## 
YearsInCurrentRole_outliers = df[df["zscore_YearsInCurrentRole"] > 3].index
df = df.drop(YearsInCurrentRole_outliers)

## 
YearsWithCurrManager_outliers = df[df["zscore_YearsWithCurrManager"] > 3].index
df = df.drop(YearsWithCurrManager_outliers)

In [79]:
df["Attrition"] = np.where(df["Attrition"] == "Yes", 1, 0)
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,zscore_Age,zscore_DistanceFromHome,zscore_Education,zscore_EnvironmentSatisfaction,zscore_JobInvolvement,zscore_JobLevel,zscore_JobSatisfaction,zscore_NumCompaniesWorked,zscore_PercentSalaryHike,zscore_PerformanceRating,zscore_RelationshipSatisfaction,zscore_StockOptionLevel,zscore_TotalWorkingYears,zscore_TrainingTimesLastYear,zscore_WorkLifeBalance,zscore_YearsAtCompany,zscore_YearsInCurrentRole,zscore_YearsSinceLastPromotion,zscore_YearsWithCurrManager,zscore_MonthlyIncome
0,41,1,Travel_Rarely,Sales,1,2,Life Sciences,2,Female,3,2,Sales Executive,4,Single,5993,8,Yes,11,3,1,0,8,0,1,6,4,0,5,0.44635,1.010909,0.891688,0.660531,0.379672,0.057788,1.153254,2.125136,1.150554,0.42623,1.584178,0.932014,0.421642,2.171982,2.49382,0.164613,0.063296,0.679146,0.245834,0.10835
1,49,0,Travel_Frequently,Research & Development,8,1,Life Sciences,3,Male,2,2,Research Scientist,2,Married,5130,1,No,23,4,4,1,10,3,3,10,7,1,7,1.322365,0.14715,1.868426,0.254625,1.026167,0.057788,0.660853,0.678049,2.129306,2.346151,1.191438,0.241988,0.164511,0.155707,0.338096,0.488508,0.764998,0.368715,0.806541,0.291719
2,37,1,Travel_Rarely,Research & Development,2,2,Other,4,Male,2,1,Laboratory Technician,3,Single,2090,6,Yes,15,3,2,0,7,3,3,0,0,0,0,0.008343,0.887515,0.891688,1.169781,1.026167,0.961486,0.2462,1.324226,0.057267,0.42623,0.658973,0.932014,0.550208,0.155707,0.338096,1.144294,1.167687,0.679146,1.155935,0.937654
3,33,0,Travel_Frequently,Research & Development,3,4,Life Sciences,4,Female,3,1,Research Scientist,3,Married,2909,1,Yes,11,3,3,0,8,3,3,8,7,3,0,0.429664,0.764121,1.061787,1.169781,0.379672,0.961486,0.2462,0.678049,1.150554,0.42623,0.266233,0.932014,0.421642,0.155707,0.338096,0.161947,0.764998,0.252146,1.155935,0.763634
4,27,0,Travel_Rarely,Research & Development,2,1,Medical,1,Male,3,1,Laboratory Technician,2,Married,3468,9,No,12,3,4,1,6,3,3,2,2,2,2,1.086676,0.887515,1.868426,1.575686,0.379672,0.961486,0.660853,2.525591,0.877232,0.42623,1.191438,0.241988,0.678774,0.155707,0.338096,0.817734,0.615492,0.058285,0.595227,0.644858


In [80]:
df["BusinessTravel"].value_counts()

Travel_Rarely        997
Travel_Frequently    270
Non-Travel           145
Name: BusinessTravel, dtype: int64

In [81]:
df.loc[df["BusinessTravel"] == "Non-Travel", "BusinessTravel"] = 0
df.loc[df["BusinessTravel"] == "Travel_Rarely", "BusinessTravel"] = 1
df.loc[df["BusinessTravel"] == "Travel_Frequently", "BusinessTravel"] = 2
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,zscore_Age,zscore_DistanceFromHome,zscore_Education,zscore_EnvironmentSatisfaction,zscore_JobInvolvement,zscore_JobLevel,zscore_JobSatisfaction,zscore_NumCompaniesWorked,zscore_PercentSalaryHike,zscore_PerformanceRating,zscore_RelationshipSatisfaction,zscore_StockOptionLevel,zscore_TotalWorkingYears,zscore_TrainingTimesLastYear,zscore_WorkLifeBalance,zscore_YearsAtCompany,zscore_YearsInCurrentRole,zscore_YearsSinceLastPromotion,zscore_YearsWithCurrManager,zscore_MonthlyIncome
0,41,1,1,Sales,1,2,Life Sciences,2,Female,3,2,Sales Executive,4,Single,5993,8,Yes,11,3,1,0,8,0,1,6,4,0,5,0.44635,1.010909,0.891688,0.660531,0.379672,0.057788,1.153254,2.125136,1.150554,0.42623,1.584178,0.932014,0.421642,2.171982,2.49382,0.164613,0.063296,0.679146,0.245834,0.10835
1,49,0,2,Research & Development,8,1,Life Sciences,3,Male,2,2,Research Scientist,2,Married,5130,1,No,23,4,4,1,10,3,3,10,7,1,7,1.322365,0.14715,1.868426,0.254625,1.026167,0.057788,0.660853,0.678049,2.129306,2.346151,1.191438,0.241988,0.164511,0.155707,0.338096,0.488508,0.764998,0.368715,0.806541,0.291719
2,37,1,1,Research & Development,2,2,Other,4,Male,2,1,Laboratory Technician,3,Single,2090,6,Yes,15,3,2,0,7,3,3,0,0,0,0,0.008343,0.887515,0.891688,1.169781,1.026167,0.961486,0.2462,1.324226,0.057267,0.42623,0.658973,0.932014,0.550208,0.155707,0.338096,1.144294,1.167687,0.679146,1.155935,0.937654
3,33,0,2,Research & Development,3,4,Life Sciences,4,Female,3,1,Research Scientist,3,Married,2909,1,Yes,11,3,3,0,8,3,3,8,7,3,0,0.429664,0.764121,1.061787,1.169781,0.379672,0.961486,0.2462,0.678049,1.150554,0.42623,0.266233,0.932014,0.421642,0.155707,0.338096,0.161947,0.764998,0.252146,1.155935,0.763634
4,27,0,1,Research & Development,2,1,Medical,1,Male,3,1,Laboratory Technician,2,Married,3468,9,No,12,3,4,1,6,3,3,2,2,2,2,1.086676,0.887515,1.868426,1.575686,0.379672,0.961486,0.660853,2.525591,0.877232,0.42623,1.191438,0.241988,0.678774,0.155707,0.338096,0.817734,0.615492,0.058285,0.595227,0.644858


In [82]:
df["Attrition"].value_counts()

0    1179
1     233
Name: Attrition, dtype: int64

In [83]:
df["Gender"].value_counts()

Male      848
Female    564
Name: Gender, dtype: int64

In [84]:
pd.crosstab(df["Gender"], df["Attrition"])

Attrition,0,1
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,480,84
Male,699,149


In [85]:
df["YearsAtCompany"].groupby(df["JobRole"]).mean()

JobRole
Healthcare Representative     7.653543
Human Resources               5.326923
Laboratory Technician         4.788235
Manager                      11.103896
Manufacturing Director        7.283688
Research Director             8.865672
Research Scientist            5.061856
Sales Executive               7.253918
Sales Representative          2.915663
Name: YearsAtCompany, dtype: float64

In [86]:
df["JobSatisfaction"].groupby(df["Gender"]).mean()

Gender
Female    2.684397
Male      2.772406
Name: JobSatisfaction, dtype: float64

In [87]:
pd.crosstab(df["JobRole"], df["Attrition"])

Attrition,0,1
JobRole,Unnamed: 1_level_1,Unnamed: 2_level_1
Healthcare Representative,119,8
Human Resources,40,12
Laboratory Technician,193,62
Manager,73,4
Manufacturing Director,132,9
Research Director,66,1
Research Scientist,244,47
Sales Executive,262,57
Sales Representative,50,33


In [88]:
df["YearsAtCompany"].groupby(df["Attrition"]).mean()

Attrition
0    6.696353
1    4.635193
Name: YearsAtCompany, dtype: float64

In [89]:
bins = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
bin_labels = ["1000", "2000", "3000", "4000", "5000", "6000", "7000", "8000", "9000", "10000+"]
df["MonthlyIncome_Groups"] = pd.cut(df["MonthlyIncome"], bins, labels = bin_labels)

In [90]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,zscore_Age,zscore_DistanceFromHome,zscore_Education,zscore_EnvironmentSatisfaction,zscore_JobInvolvement,zscore_JobLevel,zscore_JobSatisfaction,zscore_NumCompaniesWorked,zscore_PercentSalaryHike,zscore_PerformanceRating,zscore_RelationshipSatisfaction,zscore_StockOptionLevel,zscore_TotalWorkingYears,zscore_TrainingTimesLastYear,zscore_WorkLifeBalance,zscore_YearsAtCompany,zscore_YearsInCurrentRole,zscore_YearsSinceLastPromotion,zscore_YearsWithCurrManager,zscore_MonthlyIncome,MonthlyIncome_Groups
0,41,1,1,Sales,1,2,Life Sciences,2,Female,3,2,Sales Executive,4,Single,5993,8,Yes,11,3,1,0,8,0,1,6,4,0,5,0.44635,1.010909,0.891688,0.660531,0.379672,0.057788,1.153254,2.125136,1.150554,0.42623,1.584178,0.932014,0.421642,2.171982,2.49382,0.164613,0.063296,0.679146,0.245834,0.10835,6000
1,49,0,2,Research & Development,8,1,Life Sciences,3,Male,2,2,Research Scientist,2,Married,5130,1,No,23,4,4,1,10,3,3,10,7,1,7,1.322365,0.14715,1.868426,0.254625,1.026167,0.057788,0.660853,0.678049,2.129306,2.346151,1.191438,0.241988,0.164511,0.155707,0.338096,0.488508,0.764998,0.368715,0.806541,0.291719,6000
2,37,1,1,Research & Development,2,2,Other,4,Male,2,1,Laboratory Technician,3,Single,2090,6,Yes,15,3,2,0,7,3,3,0,0,0,0,0.008343,0.887515,0.891688,1.169781,1.026167,0.961486,0.2462,1.324226,0.057267,0.42623,0.658973,0.932014,0.550208,0.155707,0.338096,1.144294,1.167687,0.679146,1.155935,0.937654,3000
3,33,0,2,Research & Development,3,4,Life Sciences,4,Female,3,1,Research Scientist,3,Married,2909,1,Yes,11,3,3,0,8,3,3,8,7,3,0,0.429664,0.764121,1.061787,1.169781,0.379672,0.961486,0.2462,0.678049,1.150554,0.42623,0.266233,0.932014,0.421642,0.155707,0.338096,0.161947,0.764998,0.252146,1.155935,0.763634,3000
4,27,0,1,Research & Development,2,1,Medical,1,Male,3,1,Laboratory Technician,2,Married,3468,9,No,12,3,4,1,6,3,3,2,2,2,2,1.086676,0.887515,1.868426,1.575686,0.379672,0.961486,0.660853,2.525591,0.877232,0.42623,1.191438,0.241988,0.678774,0.155707,0.338096,0.817734,0.615492,0.058285,0.595227,0.644858,4000


In [91]:
pd.crosstab(df["MonthlyIncome_Groups"], df["Attrition"])

Attrition,0,1
MonthlyIncome_Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,15,18
3000,266,95
4000,123,24
5000,177,26
6000,149,16
7000,110,11
8000,44,7
9000,42,6
10000+,43,9


In [92]:
df["JobSatisfaction"].groupby(df["Attrition"]).mean()

Attrition
0    2.793893
1    2.450644
Name: JobSatisfaction, dtype: float64

In [94]:
df["OverTime"].replace(["Yes","No"],[1, 0], inplace = True)

In [97]:
bins = [0, 18, 25, 30, 40, 45, 50, 55, 60, 65, 100]
bin_labels = ["18", "25", "30", "40", "45", "50", "55", "60", "65", "100"]
df["AgeGroup"] = pd.cut(df["Age"], bins, labels = bin_labels)

In [69]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,zscore_Age,zscore_DistanceFromHome,zscore_Education,zscore_EnvironmentSatisfaction,zscore_JobInvolvement,zscore_JobLevel,zscore_JobSatisfaction,zscore_NumCompaniesWorked,zscore_PercentSalaryHike,zscore_PerformanceRating,zscore_RelationshipSatisfaction,zscore_StockOptionLevel,zscore_TotalWorkingYears,zscore_TrainingTimesLastYear,zscore_WorkLifeBalance,zscore_YearsAtCompany,zscore_YearsInCurrentRole,zscore_YearsSinceLastPromotion,zscore_YearsWithCurrManager,zscore_MonthlyIncome,MonthlyIncome_Groups
0,40,1,1,Sales,1,2,Life Sciences,2,Female,3,2,Sales Executive,4,Single,5993,8,1,11,3,1,0,8,0,1,6,4,0,5,0.44635,1.010909,0.891688,0.660531,0.379672,0.057788,1.153254,2.125136,1.150554,0.42623,1.584178,0.932014,0.421642,2.171982,2.49382,0.164613,0.063296,0.679146,0.245834,0.10835,6000
1,45,0,2,Research & Development,8,1,Life Sciences,3,Male,2,2,Research Scientist,2,Married,5130,1,0,23,4,4,1,10,3,3,10,7,1,7,1.322365,0.14715,1.868426,0.254625,1.026167,0.057788,0.660853,0.678049,2.129306,2.346151,1.191438,0.241988,0.164511,0.155707,0.338096,0.488508,0.764998,0.368715,0.806541,0.291719,6000
2,30,1,1,Research & Development,2,2,Other,4,Male,2,1,Laboratory Technician,3,Single,2090,6,1,15,3,2,0,7,3,3,0,0,0,0,0.008343,0.887515,0.891688,1.169781,1.026167,0.961486,0.2462,1.324226,0.057267,0.42623,0.658973,0.932014,0.550208,0.155707,0.338096,1.144294,1.167687,0.679146,1.155935,0.937654,3000
3,30,0,2,Research & Development,3,4,Life Sciences,4,Female,3,1,Research Scientist,3,Married,2909,1,1,11,3,3,0,8,3,3,8,7,3,0,0.429664,0.764121,1.061787,1.169781,0.379672,0.961486,0.2462,0.678049,1.150554,0.42623,0.266233,0.932014,0.421642,0.155707,0.338096,0.161947,0.764998,0.252146,1.155935,0.763634,3000
4,25,0,1,Research & Development,2,1,Medical,1,Male,3,1,Laboratory Technician,2,Married,3468,9,0,12,3,4,1,6,3,3,2,2,2,2,1.086676,0.887515,1.868426,1.575686,0.379672,0.961486,0.660853,2.525591,0.877232,0.42623,1.191438,0.241988,0.678774,0.155707,0.338096,0.817734,0.615492,0.058285,0.595227,0.644858,4000


In [98]:
mod1 = sm.logit('Attrition ~ Age + Education + Gender + JobSatisfaction + MonthlyIncome + OverTime + TotalWorkingYears + YearsAtCompany + YearsInCurrentRole + YearsSinceLastPromotion', data=df).fit()

mod1.summary()

Optimization terminated successfully.
         Current function value: 0.373209
         Iterations 7


0,1,2,3
Dep. Variable:,Attrition,No. Observations:,1412.0
Model:,Logit,Df Residuals:,1401.0
Method:,MLE,Df Model:,10.0
Date:,"Tue, 29 Nov 2022",Pseudo R-squ.:,0.1667
Time:,20:58:20,Log-Likelihood:,-526.97
converged:,True,LL-Null:,-632.42
Covariance Type:,nonrobust,LLR p-value:,8.536e-40

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.5566,0.443,1.256,0.209,-0.312,1.425
Gender[T.Male],0.3022,0.164,1.848,0.065,-0.018,0.623
Age,-0.0346,0.012,-2.856,0.004,-0.058,-0.011
Education,0.0311,0.078,0.399,0.690,-0.122,0.184
JobSatisfaction,-0.3360,0.070,-4.771,0.000,-0.474,-0.198
MonthlyIncome,-6.602e-05,3.3e-05,-2.003,0.045,-0.000,-1.42e-06
OverTime,1.5421,0.161,9.569,0.000,1.226,1.858
TotalWorkingYears,-0.0215,0.024,-0.890,0.374,-0.069,0.026
YearsAtCompany,-0.0368,0.036,-1.014,0.311,-0.108,0.034


In [99]:
mod1.params

Intercept                  0.556629
Gender[T.Male]             0.302239
Age                       -0.034612
Education                  0.031149
JobSatisfaction           -0.336000
MonthlyIncome             -0.000066
OverTime                   1.542150
TotalWorkingYears         -0.021515
YearsAtCompany            -0.036818
YearsInCurrentRole        -0.150463
YearsSinceLastPromotion    0.174384
dtype: float64

In [101]:
dfz = df.copy()

In [104]:
dfz_corr = dfz.copy()  ## correlation dataframe

dfz_corr.drop(dfz_corr.columns[28:49], axis = 1, inplace = True)   ## drop all zscore columns
dfz_corr.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,AgeGroup
0,41,1,1,Sales,1,2,Life Sciences,2,Female,3,2,Sales Executive,4,Single,5993,8,1,11,3,1,0,8,0,1,6,4,0,5,45
1,49,0,2,Research & Development,8,1,Life Sciences,3,Male,2,2,Research Scientist,2,Married,5130,1,0,23,4,4,1,10,3,3,10,7,1,7,50
2,37,1,1,Research & Development,2,2,Other,4,Male,2,1,Laboratory Technician,3,Single,2090,6,1,15,3,2,0,7,3,3,0,0,0,0,40
3,33,0,2,Research & Development,3,4,Life Sciences,4,Female,3,1,Research Scientist,3,Married,2909,1,1,11,3,3,0,8,3,3,8,7,3,0,40
4,27,0,1,Research & Development,2,1,Medical,1,Male,3,1,Laboratory Technician,2,Married,3468,9,0,12,3,4,1,6,3,3,2,2,2,2,30


In [105]:
dfz_corr.corr()['Attrition']

Age                        -0.158923
Attrition                   1.000000
DistanceFromHome            0.084272
Education                  -0.029144
EnvironmentSatisfaction    -0.110806
JobInvolvement             -0.129967
JobLevel                   -0.166751
JobSatisfaction            -0.115660
MonthlyIncome              -0.157241
NumCompaniesWorked          0.047908
OverTime                    0.254033
PercentSalaryHike          -0.011557
PerformanceRating           0.007192
RelationshipSatisfaction   -0.052436
StockOptionLevel           -0.141979
TotalWorkingYears          -0.179725
TrainingTimesLastYear      -0.063394
WorkLifeBalance            -0.065408
YearsAtCompany             -0.152587
YearsInCurrentRole         -0.162731
YearsSinceLastPromotion    -0.023190
YearsWithCurrManager       -0.149672
Name: Attrition, dtype: float64