In [38]:
import numpy as np 
import pandas as pd 

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm

from sklearn.model_selection import GridSearchCV

In [2]:
data=pd.read_csv('./data/aaizemberg-hr-employee-attrition/data/wa_fn_usec_hr_employee_attrition_tsv.csv')

In [3]:
data.head()

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeecount,employeenumber,...,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
data.columns

Index(['age', 'attrition', 'businesstravel', 'dailyrate', 'department',
       'distancefromhome', 'education', 'educationfield', 'employeecount',
       'employeenumber', 'environmentsatisfaction', 'gender', 'hourlyrate',
       'jobinvolvement', 'joblevel', 'jobrole', 'jobsatisfaction',
       'maritalstatus', 'monthlyincome', 'monthlyrate', 'numcompaniesworked',
       'over18', 'overtime', 'percentsalaryhike', 'performancerating',
       'relationshipsatisfaction', 'standardhours', 'stockoptionlevel',
       'totalworkingyears', 'trainingtimeslastyear', 'worklifebalance',
       'yearsatcompany', 'yearsincurrentrole', 'yearssincelastpromotion',
       'yearswithcurrmanager'],
      dtype='object')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   age                       1470 non-null   int64 
 1   attrition                 1470 non-null   object
 2   businesstravel            1470 non-null   object
 3   dailyrate                 1470 non-null   int64 
 4   department                1470 non-null   object
 5   distancefromhome          1470 non-null   int64 
 6   education                 1470 non-null   int64 
 7   educationfield            1470 non-null   object
 8   employeecount             1470 non-null   int64 
 9   employeenumber            1470 non-null   int64 
 10  environmentsatisfaction   1470 non-null   int64 
 11  gender                    1470 non-null   object
 12  hourlyrate                1470 non-null   int64 
 13  jobinvolvement            1470 non-null   int64 
 14  joblevel                

In [6]:
data.isnull().sum()

age                         0
attrition                   0
businesstravel              0
dailyrate                   0
department                  0
distancefromhome            0
education                   0
educationfield              0
employeecount               0
employeenumber              0
environmentsatisfaction     0
gender                      0
hourlyrate                  0
jobinvolvement              0
joblevel                    0
jobrole                     0
jobsatisfaction             0
maritalstatus               0
monthlyincome               0
monthlyrate                 0
numcompaniesworked          0
over18                      0
overtime                    0
percentsalaryhike           0
performancerating           0
relationshipsatisfaction    0
standardhours               0
stockoptionlevel            0
totalworkingyears           0
trainingtimeslastyear       0
worklifebalance             0
yearsatcompany              0
yearsincurrentrole          0
yearssince

## Data preprocessing

In [7]:
data_num = data.copy()
data_num.drop(['employeecount', 'employeenumber', 'over18', 'standardhours'],
        axis="columns", inplace=True)

In [8]:
data_num.loc[data_num['attrition'] == 'Yes', 'attrition'] = 1
data_num.loc[data_num['attrition'] == 'No', 'attrition'] = 0
data_num['attrition']= data_num['attrition'].astype(int)

In [9]:
set(data_num['businesstravel'].values)

{'Non-Travel', 'Travel_Frequently', 'Travel_Rarely'}

In [10]:
data_num.loc[data_num['businesstravel'] == 'Non-Travel', 'businesstravel'] = 0
data_num.loc[data_num['businesstravel'] == 'Travel_Frequently', 'businesstravel'] = 2
data_num.loc[data_num['businesstravel'] == 'Travel_Rarely', 'businesstravel'] = 1
data_num['businesstravel']= data_num['businesstravel'].astype(int)

In [11]:
set(data_num['department'].values)

{'Human Resources', 'Research & Development', 'Sales'}

In [12]:
data_num.loc[data_num['department'] == 'Human Resources', 'department'] = 0
data_num.loc[data_num['department'] == 'Sales'] = 1
data_num.loc[data_num['department'] == 'Research & Development', 'department'] = 2
data_num['department']= data_num['department'].astype(int)

In [13]:
data_num.loc[data_num['gender'] == 'Female', 'gender'] = 0
data_num.loc[data_num['gender'] == 'Male', 'gender'] = 1
data_num['gender']= data_num['gender'].astype(int)

In [14]:
set(data_num['maritalstatus'].values)

{1, 'Divorced', 'Married', 'Single'}

In [15]:
data_num.loc[data_num['maritalstatus'] == 'Single', 'maritalstatus'] = 0
data_num.loc[data_num['maritalstatus'] == 'Married', 'maritalstatus'] = 1
data_num.loc[data_num['maritalstatus'] == 'Divorced', 'maritalstatus'] = 2
data_num['maritalstatus']= data_num['maritalstatus'].astype(int)

In [16]:
data_num.loc[data_num['overtime'] == 'Yes','overtime' ] = 1
data_num.loc[data_num['overtime'] == 'No', 'overtime'] = 0
data_num['overtime']= data_num['overtime'].astype(int)

In [17]:
set(data_num['jobrole'].values)

{1,
 'Healthcare Representative',
 'Human Resources',
 'Laboratory Technician',
 'Manager',
 'Manufacturing Director',
 'Research Director',
 'Research Scientist'}

In [18]:
data_num.loc[data_num['jobrole'] == 'Healthcare Representative','jobrole' ] = 0
data_num.loc[data_num['jobrole'] == 'Human Resources','jobrole' ] = 1
data_num.loc[data_num['jobrole'] == 'Laboratory Technician','jobrole' ] = 2
data_num.loc[data_num['jobrole'] == 'Manager','jobrole' ] = 3
data_num.loc[data_num['jobrole'] == 'Manufacturing Director','jobrole' ] = 4
data_num.loc[data_num['jobrole'] == 'Research Scientist','jobrole' ] = 5
data_num.loc[data_num['jobrole'] == 'Research Director','jobrole' ] = 6
data_num['jobrole']= data_num['jobrole'].astype(int)

In [19]:
set(data_num['educationfield'].values)

{1, 'Human Resources', 'Life Sciences', 'Medical', 'Other', 'Technical Degree'}

In [20]:
data_num.loc[data_num['educationfield'] == 'Human Resources','educationfield' ] = 0
data_num.loc[data_num['educationfield'] == 'Life Sciences','educationfield' ] = 1
data_num.loc[data_num['educationfield'] == 'Medical','educationfield' ] = 2
data_num.loc[data_num['educationfield'] == 'Other','educationfield' ] = 3
data_num.loc[data_num['educationfield'] == 'Technical Degree','educationfield' ] = 4
data_num['educationfield']= data_num['educationfield'].astype(int)

In [21]:
data_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   age                       1470 non-null   int64
 1   attrition                 1470 non-null   int64
 2   businesstravel            1470 non-null   int64
 3   dailyrate                 1470 non-null   int64
 4   department                1470 non-null   int64
 5   distancefromhome          1470 non-null   int64
 6   education                 1470 non-null   int64
 7   educationfield            1470 non-null   int64
 8   environmentsatisfaction   1470 non-null   int64
 9   gender                    1470 non-null   int64
 10  hourlyrate                1470 non-null   int64
 11  jobinvolvement            1470 non-null   int64
 12  joblevel                  1470 non-null   int64
 13  jobrole                   1470 non-null   int64
 14  jobsatisfaction           1470 non-null 

In [23]:
X = data_num.drop(columns=["attrition"])
y = data_num["attrition"]

### GradientBoostingClassifier

In [32]:
clf_grid = {
    "loss":["deviance"],
    "learning_rate": [0.1, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 5),
    "min_samples_leaf": np.linspace(0.1, 0.5, 5),
    "max_depth":[3],
    "max_features":["log2","sqrt"],
    "subsample":[0.85, 0.9, 1.0],
    "n_estimators":[3]
    }

clf = GridSearchCV(GradientBoostingClassifier(), clf_grid, cv=10, n_jobs=-1)
clf.fit(X, y)
print("score",clf.score(X, y))
print("best parameters", clf.best_params_)

score 0.8945578231292517
best parameters {'learning_rate': 0.2, 'loss': 'deviance', 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 0.4, 'min_samples_split': 0.30000000000000004, 'n_estimators': 3, 'subsample': 1.0}


### DecisionTreeClassifier

In [35]:
dt_grid= {'max_depth': [None, 2, 5, 10, 15], 'min_samples_split': [2, 10, 100, 500], 'random_state': [50]}
dt= DecisionTreeClassifier()
search= GridSearchCV(dt, dt_grid, scoring= 'roc_auc', n_jobs= -1, cv= 10)
search.fit(X, y)
print('Best hyperparameter settings:', search.best_params_)
print('Corresponding ROC-AUC score (avg of 10-fold CV):', search.best_score_)

Best hyperparameter settings: {'max_depth': None, 'min_samples_split': 100, 'random_state': 50}
Corresponding ROC-AUC score (avg of 10-fold CV): 0.9309926633370527


### SVC

In [48]:
svc_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100]},
                    {'kernel': ['linear'], 'C': [1, 10, 100]}]
svc = svm.SVC()
search= GridSearchCV(svc, svc_grid, scoring= 'precision', n_jobs= -1, cv= 10)
search.fit(X, y)
print('Best hyperparameter settings:', search.best_params_)
print('Corresponding ROC-AUC score (avg of 10-fold CV):', search.best_score_)

Best hyperparameter settings: {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
Corresponding ROC-AUC score (avg of 10-fold CV): 1.0


### LogisticRegression

In [47]:
logreg=LogisticRegression(solver="lbfgs", multi_class="auto",max_iter=8000)
log_grid={"C":np.logspace(-3,3,7), 'penalty': ['l2']}
search=GridSearchCV(logreg, log_grid, cv=10)
search.fit(X,y)
print("tuned hpyerparameters :(best parameters) ",search.best_params_)
print("accuracy :",search.best_score_)

tuned hpyerparameters :(best parameters)  {'C': 0.1, 'penalty': 'l2'}
accuracy : 0.9149659863945578
