In [2]:
pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [10]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from cost_function import cost_function, cost_function_cutoff

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, make_scorer

#defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'craig-shaffer-data-445-bucket'
bucket = s3.Bucket(bucket_name)

#defining the file to be read from s3 bucket
file_key = 'turnover.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

#reading the datafile
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [11]:
#changing sales and salary to dummies
turnover = pd.concat([turnover.drop(columns=['sales','salary'],axis=1), pd.get_dummies(turnover[['sales','salary']])],axis=1)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [12]:
#creating new features from InClass_10
turnover['interaction_1'] = np.where((turnover['satisfaction_level'] <= 0.465) & 
                                     (turnover['number_project'] > 2.5) & 
                                     (turnover['satisfaction_level'] >= 0.115), 1, 0)

turnover['interaction_2'] = np.where((turnover['satisfaction_level'] <= 0.465) & 
                                     (turnover['number_project'] <= 2.5) & 
                                     (turnover['last_evaluation'] <= 0.575), 1, 0)

turnover['interaction_3'] = np.where((turnover['satisfaction_level'] > 0.465) & 
                                     (turnover['time_spend_company'] <= 4.5) & 
                                     (turnover['average_montly_hours'] <= 290.5), 1, 0)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium,interaction_1,interaction_2,interaction_3
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0


In [13]:
#defining input (top 5) and target (left)
x=turnover[['satisfaction_level', 'number_project', 'time_spend_company', 'interaction_1', 'interaction_3']]
y=turnover['left']

#splitting the data
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,stratify=y)

In [14]:
#defining the customized scoring function
my_scorer = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

**GridSearchCV with Random Forest**

In [15]:
#defining parameter dictionary
rf_param_grid = {'n_estimators': [100, 300, 500],
                 'min_samples_split': [10, 15],
                 'min_samples_leaf': [5, 7],
                 'max_depth' : [3, 5, 7]}

#running GridSearchCV
rf_grid_search = GridSearchCV(estimator = RandomForestClassifier(), param_grid = rf_param_grid, 
                              cv = 3, scoring = my_scorer, n_jobs = -1).fit(x_train,y_train)

#extracting best hyperparameter combination
rf_grid_search.best_params_

{'max_depth': 7,
 'min_samples_leaf': 5,
 'min_samples_split': 10,
 'n_estimators': 100}

In [17]:
#extracting the best model
rf_md = rf_grid_search.best_estimator_

#predicting on test
rf_pred = rf_md.predict_proba(x_test)[:,1]

#identifying optimal cutoff
opt_cutoff = cost_function_cutoff(y_test, rf_pred)

#changing likelihood to labels
rf_label = np.where(rf_pred < opt_cutoff, 0, 1)

#print confusion matrix
conf_mat = confusion_matrix(y_test, rf_label)
print(conf_mat)
print('The cost of the RF is ', -1500 * conf_mat[1, 0] - 1000 * conf_mat[0, 1] + 500 * conf_mat[1, 1])

[[2255   31]
 [  63  651]]
The cost of the RF is  200000


**GridSearchCV with XGBoost**

In [18]:
#defining parameter dictionary
xgb_param_grid = {'n_estimators': [500],
                  'max_depth': [3, 5, 7],
                  'min_child_weight': [5, 7],
                  'learning_rate': [0.01],
                  'gamma': [0.3, 0.1],
                  'subsample': [0.8, 1],
                  'colsample_bytree': [1]}

#running GridSearchCV
xgb_grid_search = GridSearchCV(estimator = XGBClassifier(), param_grid = xgb_param_grid, 
                              cv = 3, scoring = my_scorer).fit(x_train,y_train)

#extracting best hyperparameter combination
xgb_grid_search.best_params_

{'colsample_bytree': 1,
 'gamma': 0.3,
 'learning_rate': 0.01,
 'max_depth': 7,
 'min_child_weight': 5,
 'n_estimators': 500,
 'subsample': 1}

In [20]:
#extracting the best model
xgb_md = xgb_grid_search.best_estimator_

#predicting on test
xgb_pred = xgb_md.predict_proba(x_test)[:,1]

#identifying optimal cutoff
opt_cutoff = cost_function_cutoff(y_test, xgb_pred)

#changing likelihood to labels
xgb_label = np.where(xgb_pred < opt_cutoff, 0, 1)

#print confusion matrix
conf_mat = confusion_matrix(y_test, xgb_label)
print(conf_mat)
print('The cost of the XGB is ', -1500 * conf_mat[1, 0] - 1000 * conf_mat[0, 1] + 500 * conf_mat[1, 1])

[[2252   34]
 [  64  650]]
The cost of the XGB is  195000


Based on the results above, we would use the XGBoost model with the optimal hyper-parameters to predict left because it has a lower cost (195000 vs 200000)