In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, fbeta_score
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.feature_selection import SelectPercentile
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score

In [2]:
from sklearn.metrics import make_scorer,fbeta_score

def f2_score(y_true, y_pred):
    score = fbeta_score(y_true, y_pred, beta=2)
    return score

def f2():
    return make_scorer(f2_score)

In [3]:
#Dataset Columns


#EmpID: Employee ID
#Age: Age of the employee
#AgeGroup: Age group to which the employee belongs
#Attrition: Employee attrition status (whether the employee has left the organization or is still active)
#BusinessTravel: Frequency of business travel for the employee
#DailyRate: Daily rate of pay for the employee
#Department: Department in which the employee works
#DistanceFromHome: Distance in miles from the employee's home to the workplace
#Education: Level of education attained by the employee
#EducationField: Field of education of the employee
#EmployeeCount: Number of employees
#EmployeeNumber: Unique identifier for each employee
#EnvironmentSatisfaction: Employee's satisfaction level with the work environment
#Gender: Gender of the employee
#HourlyRate: Hourly rate of pay for the employee
#JobInvolvement: Employee's level of job involvement
#JobLevel: Level of the employee's job position
#JobRole: Role of the employee within the organization
#JobSatisfaction: Employee's satisfaction level with their job
#MaritalStatus: Marital status of the employee
#MonthlyIncome: Monthly income of the employee
#SalarySlab: Categorization of monthly income into salary slabs
#MonthlyRate: Monthly rate of pay for the employee
#NumCompaniesWorked: Number of companies the employee has worked for in the past
#Over18: Whether the employee is over 18 years old
#OverTime: Whether the employee works overtime or not
#PercentSalaryHike: Percentage increase in salary for the employee
#PerformanceRating: Performance rating of the employee
#RelationshipSatisfaction: Employee's satisfaction level with work relationships
#StandardHours: Standard working hours for the employee
#StockOptionLevel: Level of stock options granted to the employee
#TotalWorkingYears: Total number of years the employee has worked
#TrainingTimesLastYear: Number of training sessions attended by the employee in the last year
#WorkLifeBalance: Employee's work-life balance satisfaction level
#YearsAtCompany: Number of years the employee has worked at the current company
#YearsInCurrentRole: Number of years the employee has been in the current role
#YearsSinceLastPromotion: Number of years since the employee's last promotion
#YearsWithCurrManager: Number of years the employee has been working with the current manager

In [4]:
df = pd.read_csv('Data/HR_Analytics.csv')

df.head()

Unnamed: 0,EmpID,Age,AgeGroup,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,RM297,18,18-25,Yes,Travel_Rarely,230,Research & Development,3,3,Life Sciences,...,3,80,0,0,2,3,0,0,0,0.0
1,RM302,18,18-25,No,Travel_Rarely,812,Sales,10,3,Medical,...,1,80,0,0,2,3,0,0,0,0.0
2,RM458,18,18-25,Yes,Travel_Frequently,1306,Sales,5,3,Marketing,...,4,80,0,0,3,3,0,0,0,0.0
3,RM728,18,18-25,No,Non-Travel,287,Research & Development,5,2,Life Sciences,...,4,80,0,0,2,3,0,0,0,0.0
4,RM829,18,18-25,Yes,Non-Travel,247,Research & Development,8,1,Medical,...,4,80,0,0,0,3,0,0,0,0.0


In [5]:
df.dtypes

EmpID                        object
Age                           int64
AgeGroup                     object
Attrition                    object
BusinessTravel               object
DailyRate                     int64
Department                   object
DistanceFromHome              int64
Education                     int64
EducationField               object
EmployeeCount                 int64
EmployeeNumber                int64
EnvironmentSatisfaction       int64
Gender                       object
HourlyRate                    int64
JobInvolvement                int64
JobLevel                      int64
JobRole                      object
JobSatisfaction               int64
MaritalStatus                object
MonthlyIncome                 int64
SalarySlab                   object
MonthlyRate                   int64
NumCompaniesWorked            int64
Over18                       object
OverTime                     object
PercentSalaryHike             int64
PerformanceRating           

In [6]:
df.shape

(1480, 38)

In [7]:
df.isna().sum()
df = df.dropna(axis = 0)
df.isna().sum()

EmpID                       0
Age                         0
AgeGroup                    0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
SalarySlab                  0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBa

In [8]:
mask = df.loc[:, 'YearsWithCurrManager'].isna()

df.loc[mask, :]

Unnamed: 0,EmpID,Age,AgeGroup,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager


In [9]:
#print(df.duplicated().sum())

mask = df.duplicated() == True

df.loc[mask, :]

Unnamed: 0,EmpID,Age,AgeGroup,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
211,RM1468,27,26-35,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,...,2,80,1,6,0,3,6,2,0,3.0
328,RM1461,29,26-35,No,Travel_Rarely,468,Research & Development,28,4,Medical,...,2,80,0,5,3,1,5,4,0,4.0
458,RM1464,31,26-35,No,Non-Travel,325,Research & Development,5,3,Medical,...,2,80,0,10,2,3,9,4,1,7.0
655,RM1470,34,26-35,No,TravelRarely,628,Research & Development,8,3,Medical,...,1,80,0,6,3,4,4,3,1,2.0
954,RM1463,39,36-45,No,Travel_Rarely,722,Sales,24,1,Marketing,...,1,80,1,21,2,2,20,9,9,6.0
1305,RM1469,49,46-55,No,Travel_Frequently,1023,Sales,2,3,Medical,...,4,80,0,17,3,2,9,6,0,8.0
1336,RM1462,50,46-55,Yes,Travel_Rarely,410,Sales,28,3,Marketing,...,2,80,1,20,3,3,3,2,2,0.0


In [10]:
df = df.drop_duplicates()
df.duplicated().sum()

0

In [11]:
df.nunique()

EmpID                       1413
Age                           43
AgeGroup                       5
Attrition                      2
BusinessTravel                 4
DailyRate                    867
Department                     3
DistanceFromHome              29
Education                      5
EducationField                 6
EmployeeCount                  1
EmployeeNumber              1413
EnvironmentSatisfaction        4
Gender                         2
HourlyRate                    71
JobInvolvement                 4
JobLevel                       5
JobRole                        9
JobSatisfaction                4
MaritalStatus                  3
MonthlyIncome               1303
SalarySlab                     4
MonthlyRate                 1373
NumCompaniesWorked            10
Over18                         1
OverTime                       2
PercentSalaryHike             15
PerformanceRating              2
RelationshipSatisfaction       4
StandardHours                  1
StockOptio

In [12]:
df.loc[:, 'Department'].unique()

array(['Research & Development', 'Sales', 'Human Resources'], dtype=object)

In [13]:
df.loc[:, 'EmployeeCount'].unique()

array([1], dtype=int64)

In [14]:
df.loc[:, 'Over18'].unique()

array(['Y'], dtype=object)

In [15]:
#employeecount,standardtime, over 18, empoyeenumber or empid

df = df.drop(['EmployeeCount', 'StandardHours', 'Over18', 'EmpID', 'AgeGroup'], axis = 1) 

df.dtypes

Age                           int64
Attrition                    object
BusinessTravel               object
DailyRate                     int64
Department                   object
DistanceFromHome              int64
Education                     int64
EducationField               object
EmployeeNumber                int64
EnvironmentSatisfaction       int64
Gender                       object
HourlyRate                    int64
JobInvolvement                int64
JobLevel                      int64
JobRole                      object
JobSatisfaction               int64
MaritalStatus                object
MonthlyIncome                 int64
SalarySlab                   object
MonthlyRate                   int64
NumCompaniesWorked            int64
OverTime                     object
PercentSalaryHike             int64
PerformanceRating             int64
RelationshipSatisfaction      int64
StockOptionLevel              int64
TotalWorkingYears             int64
TrainingTimesLastYear       

In [16]:
df.loc[:, 'MaritalStatus'].unique()

array(['Single', 'Divorced', 'Married'], dtype=object)

In [17]:
df.loc[:, 'JobInvolvement'].unique()

array([3, 2, 1, 4], dtype=int64)

In [18]:
df.loc[:, 'JobLevel'].unique()

array([1, 2, 3, 4, 5], dtype=int64)

In [19]:
df.loc[:, 'JobRole'].unique()

array(['Laboratory Technician', 'Sales Representative',
       'Research Scientist', 'Human Resources', 'Manufacturing Director',
       'Sales Executive', 'Healthcare Representative',
       'Research Director', 'Manager'], dtype=object)

In [20]:
df.loc[:, 'JobSatisfaction'].unique()

array([3, 2, 4, 1], dtype=int64)

In [21]:
df.loc[:, 'OverTime'].unique()

array(['No', 'Yes'], dtype=object)

In [22]:
df.iloc[:, :21].describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked
count,1416.0,1416.0,1416.0,1416.0,1416.0,1416.0,1416.0,1416.0,1416.0,1416.0,1416.0,1416.0,1416.0
mean,36.924435,803.329802,9.238701,2.907486,1058.739407,2.725989,65.989407,2.726695,2.069209,2.728814,6516.679379,14319.355932,2.711158
std,9.135606,404.13328,8.122617,1.024395,592.701115,1.090169,20.396197,0.711953,1.108023,1.099266,4723.565527,7112.986512,2.507778
min,18.0,102.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,1.0,1009.0,2097.0,0.0
25%,30.0,465.0,2.0,2.0,548.75,2.0,48.0,2.0,1.0,2.0,2922.25,8057.5,1.0
50%,36.0,804.5,7.0,3.0,1058.0,3.0,66.0,3.0,2.0,3.0,4938.5,14288.5,2.0
75%,43.0,1159.0,14.0,4.0,1581.25,4.0,84.0,3.0,3.0,4.0,8380.25,20440.5,4.0
max,60.0,1499.0,29.0,5.0,2068.0,4.0,100.0,4.0,5.0,4.0,19999.0,26999.0,9.0


In [23]:
df.iloc[:, 21:].describe()

Unnamed: 0,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1416.0,1416.0,1416.0,1416.0,1416.0,1416.0,1416.0,1416.0,1416.0,1416.0,1416.0
mean,15.199153,3.15113,2.704802,0.799435,11.298729,2.80226,2.762712,7.037429,4.254944,2.213277,4.117232
std,3.638219,0.358302,1.080704,0.851952,7.825239,1.288885,0.709487,6.151044,3.636385,3.24931,3.559344
min,11.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,12.0,3.0,2.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,14.0,3.0,3.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,18.0,3.0,4.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,25.0,4.0,4.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [24]:
df = df.replace({'Yes': 1, 'No': 0, 'Female': 1, 'Male': 0, 'Non-Travel': 0, 'TravelRarely': 1, 'Travel_Rarely': 1, 'Travel_Frequently': 2, 
                'Single' : 0, 'Married' : 1, 'Divorced' : 2})
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,18,1,1,230,Research & Development,3,3,Life Sciences,405,3,...,3,3,0,0,2,3,0,0,0,0.0
1,18,0,1,812,Sales,10,3,Medical,411,4,...,3,1,0,0,2,3,0,0,0,0.0
2,18,1,2,1306,Sales,5,3,Marketing,614,2,...,3,4,0,0,3,3,0,0,0,0.0
3,18,0,0,287,Research & Development,5,2,Life Sciences,1012,2,...,3,4,0,0,2,3,0,0,0,0.0
4,18,1,0,247,Research & Development,8,1,Medical,1156,3,...,3,4,0,0,0,3,0,0,0,0.0


In [25]:
catcols = ['SalarySlab', 'Department', 'EducationField', 'JobRole']
numcols = ['EmployeeNumber', 'Education', 'EnvironmentSatisfaction', 'BusinessTravel', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 
           'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'WorkLifeBalance', 'OverTime', 'Gender', 'Age', 'DailyRate', 
           'DistanceFromHome', 'HourlyRate', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'TotalWorkingYears', 
           'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'MaritalStatus']

#df.loc[:, catcols].head()

df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,18,1,1,230,Research & Development,3,3,Life Sciences,405,3,...,3,3,0,0,2,3,0,0,0,0.0
1,18,0,1,812,Sales,10,3,Medical,411,4,...,3,1,0,0,2,3,0,0,0,0.0
2,18,1,2,1306,Sales,5,3,Marketing,614,2,...,3,4,0,0,3,3,0,0,0,0.0
3,18,0,0,287,Research & Development,5,2,Life Sciences,1012,2,...,3,4,0,0,2,3,0,0,0,0.0
4,18,1,0,247,Research & Development,8,1,Medical,1156,3,...,3,4,0,0,0,3,0,0,0,0.0


In [26]:
features = df.drop('Attrition', axis = 1)
target = df.loc[:, 'Attrition']

features_train, features_test, target_train, target_test = train_test_split(features, target, random_state=42, test_size=0.1)

features_train.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
372,30,1,1138,Research & Development,6,3,Technical Degree,1311,1,1,...,3,1,1,10,6,3,9,2,6,7.0
32,21,1,156,Sales,12,3,Life Sciences,494,3,1,...,3,4,0,1,0,3,1,0,0,0.0
255,28,1,1172,Sales,3,3,Medical,1875,2,1,...,3,4,1,1,3,3,1,0,0,0.0
1198,45,0,1238,Research & Development,1,1,Life Sciences,1712,3,0,...,4,4,1,25,3,2,23,15,14,4.0
460,32,1,334,Research & Development,5,2,Life Sciences,21,1,0,...,3,4,2,7,5,2,6,2,0,5.0


In [27]:
model = DecisionTreeClassifier(max_depth = 100 , class_weight = 'balanced', random_state = 42)

In [28]:
ohe_train = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')

ohe_train.fit(features_train.loc[:, catcols]) 

ohe_names = ohe_train.get_feature_names_out(catcols)

df_train_ohe = pd.DataFrame(ohe_train.transform(features_train.loc[:, catcols]), columns = list(ohe_names))

df_train_ohe

Unnamed: 0,SalarySlab_10k-15k,SalarySlab_15k+,SalarySlab_5k-10k,SalarySlab_Upto 5k,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,...,EducationField_Technical Degree,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative
0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1269,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1270,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1271,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1272,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
ohe_test = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')

ohe_test.fit(features_train.loc[:, catcols])

ohe_names = ohe_test.get_feature_names_out(catcols)

df_test_ohe = pd.DataFrame(ohe_test.transform(features_test.loc[:, catcols]), columns = list(ohe_names))

df_test_ohe

Unnamed: 0,SalarySlab_10k-15k,SalarySlab_15k+,SalarySlab_5k-10k,SalarySlab_Upto 5k,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,...,EducationField_Technical Degree,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative
0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
138,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
139,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
140,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [30]:
df_train_ohe.index = features_train.index

df_test_ohe.index = features_test.index

features_train_con = pd.concat((features_train.loc[:, numcols], df_train_ohe), axis = 1)
features_test_con = pd.concat((features_test.loc[:, numcols], df_test_ohe), axis = 1)

features_train_con

Unnamed: 0,EmployeeNumber,Education,EnvironmentSatisfaction,BusinessTravel,JobInvolvement,JobLevel,JobSatisfaction,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,...,EducationField_Technical Degree,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative
372,1311,3,1,1,2,2,4,3,1,1,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
32,494,3,3,1,4,1,2,3,4,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
255,1875,3,2,1,3,1,2,3,4,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1198,1712,1,3,0,2,3,3,4,4,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
460,21,2,1,1,4,1,2,3,4,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,1206,3,4,0,1,1,3,3,4,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1184,1002,2,4,0,3,2,1,3,3,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1353,1367,2,4,1,2,4,2,3,2,0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
900,1766,3,2,1,1,2,4,3,3,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
model.fit(features_train_con, target_train)
first_pred = model.predict(features_test_con)

print('f1 :', f1_score(target_test, first_pred))

print('precision :', precision_score(target_test, first_pred))

print('recall :', precision_score(target_test, first_pred))

f1 : 0.3461538461538462
precision : 0.375
recall : 0.375


In [32]:
feature_importances = model.feature_importances_

feature_names = features_train_con.columns

results = pd.DataFrame({'Features': feature_names, 'Importances': feature_importances})

results.sort_values(by='Importances', ascending = False, inplace = True)

results[:21]

Unnamed: 0,Features,Importances
11,OverTime,0.107272
21,TotalWorkingYears,0.09102
17,MonthlyIncome,0.085888
14,DailyRate,0.070046
9,StockOptionLevel,0.055753
15,DistanceFromHome,0.048842
8,RelationshipSatisfaction,0.046388
18,MonthlyRate,0.043069
13,Age,0.040355
48,JobRole_Sales Executive,0.034245


In [33]:
best_features = ['MonthlyIncome', 'OverTime', 'MonthlyRate', 'EmployeeNumber', 'MaritalStatus', 'TotalWorkingYears', 
                 'JobInvolvement', 'Age', 'NumCompaniesWorked', 'DistanceFromHome', 'EnvironmentSatisfaction', 'JobSatisfaction',
                 'HourlyRate', 'PercentSalaryHike', 'WorkLifeBalance', 'YearsWithCurrManager', 'EducationField_Technical Degree',
                 'YearsAtCompany', 'Department_Sales', 'YearsInCurrentRole']


best_features_reduced = ['MonthlyIncome', 'OverTime', 'MonthlyRate', 'EmployeeNumber', 
                         'MaritalStatus', 'TotalWorkingYears', 'JobInvolvement', 'Age', 
                         'NumCompaniesWorked', 'DistanceFromHome', 'EnvironmentSatisfaction',
                         'JobSatisfaction']

model.fit(features_train_con.loc[:, best_features_reduced], target_train)
first_pred = model.predict(features_test_con.loc[:, best_features_reduced])

print('f1 :', f1_score(target_test, first_pred))

print('precision :', precision_score(target_test, first_pred))

print('recall :', precision_score(target_test, first_pred))

print('f2 :', f2_score(target_test, first_pred))

f1 : 0.42105263157894735
precision : 0.41379310344827586
recall : 0.41379310344827586
f2 : 0.425531914893617


In [34]:
best_features = ['MonthlyIncome', 'OverTime', 'MonthlyRate', 
                 'EmployeeNumber', 
                 'MaritalStatus', 'TotalWorkingYears', 
                 'JobInvolvement', 'Age', 'NumCompaniesWorked', 'DistanceFromHome', 'EnvironmentSatisfaction', 'JobSatisfaction',
                 'HourlyRate', 'PercentSalaryHike', 'WorkLifeBalance', 'YearsWithCurrManager', 'EducationField_Technical Degree',
                 'YearsAtCompany', 'Department_Sales', 'YearsInCurrentRole']

model.fit(features_train_con.loc[:, best_features], target_train)
first_pred = model.predict(features_test_con.loc[:, best_features])

print('f1 :', f1_score(target_test, first_pred))

print('precision :', precision_score(target_test, first_pred))

print('recall :', precision_score(target_test, first_pred))

print('f2 :', f2_score(target_test, first_pred))

f1 : 0.3461538461538462
precision : 0.375
recall : 0.375
f2 : 0.3308823529411765


In [35]:
best_features = ['MonthlyIncome', 'OverTime', 'MonthlyRate', 
                 'EmployeeNumber', 'MaritalStatus', 'TotalWorkingYears', 
                 'JobInvolvement', 'Age', 'NumCompaniesWorked', 'DistanceFromHome', 'EnvironmentSatisfaction', 'JobSatisfaction',
                 'HourlyRate', 'PercentSalaryHike', 'WorkLifeBalance', 'YearsWithCurrManager', 'EducationField_Technical Degree',
                 'YearsAtCompany', 'Department_Sales', 'YearsInCurrentRole']


best_features_reduced_more = ['MonthlyIncome', 'OverTime', 'MonthlyRate', 'EmployeeNumber', 
                         'MaritalStatus', 'TotalWorkingYears', 'JobInvolvement', 'Age', 
                         ]

model.fit(features_train_con.loc[:, best_features_reduced], target_train)
first_pred = model.predict(features_test_con.loc[:, best_features_reduced])

print('f1 :', f1_score(target_test, first_pred))

print('precision :', precision_score(target_test, first_pred))

print('recall :', precision_score(target_test, first_pred))

print('f2 :', f2_score(target_test, first_pred))

f1 : 0.42105263157894735
precision : 0.41379310344827586
recall : 0.41379310344827586
f2 : 0.425531914893617


In [43]:
search_space_dt = {'max_depth': np.geomspace(start = 5,
                                             stop = 2000, num = 10, dtype = 'int')}

model_dt = GridSearchCV(estimator = DecisionTreeClassifier(class_weight = 'balanced',
                                                           random_state=42), 
                        param_grid = search_space_dt, scoring = f2(), cv = 5)


print(model_dt)

GridSearchCV(cv=5,
             estimator=DecisionTreeClassifier(class_weight='balanced',
                                              random_state=42),
             param_grid={'max_depth': array([   5,    9,   18,   36,   71,  139,  271,  528, 1027, 2000])},
             scoring=make_scorer(f2_score))


In [45]:
model_dt.fit(features_train_con.loc[:, best_features_reduced], target_train)

first_pred = model_dt.predict(features_test_con.loc[:, best_features_reduced])

print('f1 :', f1_score(target_test, first_pred))

print('precision :', precision_score(target_test, first_pred))

print('recall :', precision_score(target_test, first_pred))

print('f2 :', f2_score(target_test, first_pred))

f1 : 0.39999999999999997
precision : 0.35135135135135137
recall : 0.35135135135135137
f2 : 0.43624161073825507
