In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import precision_recall_cutoff # Calling .py function
from scipy.stats import boxcox
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report, make_scorer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from cost_function import cost_function, cost_function_cutoff
from xgboost import XGBClassifier

# Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'gabriel-predictive-analytics'
bucket = s3.Bucket(bucket_name)

# Defining the file to be read from s3 bucket
file_key = "turnover.csv"

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Reading the csv file
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [2]:
## Creating interactions/features from the decision tree
# interaction 1
turnover['interaction_1'] = np.where((turnover['satisfaction_level'] <= 0.465) & 
                                     (turnover['number_project'] <= 2.5) & 
                                     (turnover['last_evaluation'] <= 0.575), 1, 0)

# interaction 2
turnover['interaction_2'] = np.where((turnover['satisfaction_level'] <= 0.465) & 
                                     (turnover['number_project'] >= 2.5) & 
                                     (turnover['satisfaction_level'] >= 0.115), 1, 0)

# interaction 3
turnover['interaction_3'] = np.where((turnover['satisfaction_level'] > 0.465) & 
                                     (turnover['time_spend_company'] <= 4.5) & 
                                     (turnover['average_montly_hours'] <= 290.5), 1, 0)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary,interaction_1,interaction_2,interaction_3
0,0.38,0.53,2,157,3,0,1,0,sales,low,1,0,0
1,0.8,0.86,5,262,6,0,1,0,sales,medium,0,0,0
2,0.11,0.88,7,272,4,0,1,0,sales,medium,0,0,0
3,0.72,0.87,5,223,5,0,1,0,sales,low,0,0,0
4,0.37,0.52,2,159,3,0,1,0,sales,low,1,0,0


In [3]:
## Changing sales to dummy variables
turnover = pd.concat([turnover.drop(columns = ['sales']), pd.get_dummies(turnover['sales'])], axis = 1)

## Changing sales to dummy variables
turnover = pd.concat([turnover, pd.get_dummies(turnover['salary'])], axis = 1)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,interaction_1,interaction_2,interaction_3,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium
0,0.38,0.53,2,157,3,0,1,0,low,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,medium,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,medium,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,low,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,low,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0


In [4]:
# Defining input and target variables
X = turnover.drop(columns = ['left', 'salary'], axis = 1)
Y = turnover['left']

# Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)


In [5]:
X_train = X_train[['interaction_3', 'interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]
X_test = X_test[['interaction_3', 'interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]


## Changing the scale
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [9]:
## Defining the hyper-parameters for RF
RF_param_grid = {'n_estimators': [100, 300, 500],
                 'min_samples_split': [10, 15], 
                 'min_samples_leaf': [5, 7], 
                 'max_depth' : [3, 5, 7]}
# Defining customized scoring function
my_score_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

# Performing GridSearch
RF_grid_search = GridSearchCV(RandomForestClassifier(), RF_param_grid, cv = 3, scoring = my_score_function, n_jobs = -1).fit(X_train, Y_train)

# Extracting the best model
RF_model = RF_grid_search.best_estimator_

RF_preds = RF_model.predict_proba(X_test)[:,1]

# Indentifyung the optimal cut-off value
opt_cutoff = cost_function_cutoff(Y_test, RF_preds)

# Changing the likelihoods to labels
RF_label = np.where(RF_preds < opt_cutoff, 0, 1)

# Computing the confusion matrix
X = confusion_matrix(Y_test, RF_label)
print(X)
print('The cost of the RF model is: ', -1500*X[1,0] - 1000*X[0,1] + 500*X[1, 1])

[[2257   29]
 [  55  659]]
The cost of the RF model is:  218000


In [None]:
## Defining the hyper-parameters for RF
XGBoost_param_grid = {'n_estimators': [500],
                        'max_depth': [3, 5, 7],
                        'min_child_weight': [5, 7],
                        'learning_rate': [0.01],
                        'gamma': [0.3, 0.1],
                        'subsample': [0.8, 1],
                        'colsample_bytree': [1]}

# Defining customized scoring function
my_score_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

# Performing GridSearch
XGBoost_grid_search = GridSearchCV(XGBClassifier(use_label_encoder = False, eval_metric = 'logloss'), XGBoost_param_grid, cv = 3, scoring = my_score_function, n_jobs = -1).fit(X_train, Y_train)

# Extracting the best model
XGBoost_model = XGBoost_grid_search.best_estimator_

XGBoost_preds = XGBoost_model.predict_proba(X_test)[:,1]

# Indentifyung the optimal cut-off value
opt_cutoff = cost_function_cutoff(Y_test, XGBoost_preds)

# Changing the likelihoods to labels
XGBoost_label = np.where(XGBoost_preds < opt_cutoff, 0, 1)

# Computing the confusion matrix
X = confusion_matrix(Y_test, XGBoost_label)
print(X)
print('The cost of the RF model is: ', -1500*X[1,0] - 1000*X[0,1] + 500*X[1, 1])

[[2254   32]
 [  56  658]]
The cost of the RF model is:  213000


## Based on the above results, Random Forest had the best result.