In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import precision_recall_cutoff # Calling .py function
from scipy.stats import boxcox
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report, make_scorer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from cost_function import cost_function, cost_function_cutoff

train = pd.read_csv('turnover_train.csv')
test = pd.read_csv('turnover_test.csv')
val = pd.read_csv('turnover_val.csv')

In [2]:
## Changing sales to dummy variables
train = pd.concat([train.drop(columns = ['sales']), pd.get_dummies(train['sales'])], axis = 1)
test = pd.concat([test.drop(columns = ['sales']), pd.get_dummies(test['sales'])], axis = 1)
val = pd.concat([val.drop(columns = ['sales']), pd.get_dummies(val['sales'])], axis = 1)

## Changing sales to dummy variables
train = pd.concat([train, pd.get_dummies(train['salary'])], axis = 1)
test = pd.concat([test, pd.get_dummies(test['salary'])], axis = 1)
val = pd.concat([val, pd.get_dummies(val['salary'])], axis = 1)

In [3]:
## Creating interactions/features from the decision tree
# interaction 1
train['interaction_1'] = np.where((train['satisfaction_level'] <= 0.465) & 
                                     (train['number_project'] <= 2.5) & 
                                     (train['last_evaluation'] <= 0.575), 1, 0)

test['interaction_1'] = np.where((test['satisfaction_level'] <= 0.465) & 
                                     (test['number_project'] <= 2.5) & 
                                     (test['last_evaluation'] <= 0.575), 1, 0)

val['interaction_1'] = np.where((val['satisfaction_level'] <= 0.465) & 
                                     (val['number_project'] <= 2.5) & 
                                     (val['last_evaluation'] <= 0.575), 1, 0)


# interaction 2
train['interaction_2'] = np.where((train['satisfaction_level'] <= 0.465) & 
                                     (train['number_project'] >= 2.5) & 
                                     (train['satisfaction_level'] >= 0.115), 1, 0)

test['interaction_2'] = np.where((test['satisfaction_level'] <= 0.465) & 
                                     (test['number_project'] >= 2.5) & 
                                     (test['satisfaction_level'] >= 0.115), 1, 0)

val['interaction_2'] = np.where((val['satisfaction_level'] <= 0.465) & 
                                     (val['number_project'] >= 2.5) & 
                                     (val['satisfaction_level'] >= 0.115), 1, 0)

# interaction 3
train['interaction_3'] = np.where((train['satisfaction_level'] > 0.465) & 
                                     (train['time_spend_company'] <= 4.5) & 
                                     (train['average_montly_hours'] <= 290.5), 1, 0)

test['interaction_3'] = np.where((test['satisfaction_level'] > 0.465) & 
                                     (test['time_spend_company'] <= 4.5) & 
                                     (test['average_montly_hours'] <= 290.5), 1, 0)

val['interaction_3'] = np.where((val['satisfaction_level'] > 0.465) & 
                                     (val['time_spend_company'] <= 4.5) & 
                                     (val['average_montly_hours'] <= 290.5), 1, 0)

In [4]:
# Defining input and target variables
X_train = train[['interaction_3', 'interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]
Y_train = train['left']

X_test = test[['interaction_3', 'interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]
Y_test = test['left']

X_val = val[['interaction_3', 'interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]
Y_val = val['left']


## Changing the scale
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
X_val = scaler.fit_transform(X_val)

In [5]:
## Defining the hyper-parameters for RF
RF_param_grid = {'n_estimators': [100, 300, 500],
                 'min_samples_split': [10, 15], 
                 'min_samples_leaf': [5, 7], 
                 'max_depth' : [3, 5, 7]}

# Defining customized scoring function
my_score_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

# Performing GridSearch
RF_grid_search = GridSearchCV(RandomForestClassifier(), RF_param_grid, cv = 3, scoring = my_score_function, n_jobs = -1).fit(X_train, Y_train)

# Extracting the best model
RF_model = RF_grid_search.best_estimator_
RF_model

RandomForestClassifier(max_depth=7, min_samples_leaf=5, min_samples_split=10,
                       n_estimators=500)

In [7]:
# Predicting on validation and test
RF_val_pred = RF_model.predict_proba(X_val)[:,1]
RF_test_pred = RF_model.predict_proba(X_test)[:,1]

# Indentifyung the optimal cut-off value
opt_cutoff = cost_function_cutoff(Y_val, RF_val_pred)

# Changing the likelihoods to labels
RF_label = np.where(RF_test_pred < opt_cutoff, 0, 1)

# Computing the confusion matrix
X = confusion_matrix(Y_test, RF_label)
print(X)
print('The cost of the RF model is: ', -1500*X[1,0] - 1000*X[0,1] + 500*X[1, 1])

[[1130   13]
 [  28  329]]
The cost of the RF model is:  109500


In [8]:
## Defining the hyper-parameters for svm
SVM_param_grid = {'kernel': ['rbf', 'poly', 'sigmoid'], 
                  'C': [0.01, 0.1, 1, 10],
                  'gamma': [0.001, 0.01, 0.1, 1]}


SVM_grid_search = GridSearchCV(SVC(probability = True), SVM_param_grid, cv = 3, scoring = my_score_function, n_jobs = -1).fit(X_train, Y_train)

# Extracting the best model
svm_md = SVM_grid_search.best_estimator_

In [9]:
# Predicting on validation and test
svm_val_pred = svm_md.predict_proba(X_val)[:,1]
svm_test_pred = svm_md.predict_proba(X_test)[:,1]

# Indentifyung the optimal cut-off value
opt_cutoff = cost_function_cutoff(Y_val, svm_val_pred)

# Changing the likelihoods to labels
svm_label = np.where(svm_test_pred < opt_cutoff, 0, 1)

# Computing the confusion matrix
X = confusion_matrix(Y_test, svm_label)
print(X)
print('The cost of the svm model is: ', -1500*X[1,0] - 1000*X[0,1] + 500*X[1, 1])

[[1121   22]
 [  31  326]]
The cost of the svm model is:  94500


### Based on the above results, Random Forest had a better perfromance.