In [1]:
pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting xgboost
  Downloading xgboost-1.7.4-py3-none-manylinux2014_x86_64.whl (193.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.4
Note: you may need to restart the kernel to use updated packages.


In [12]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import recall_score, classification_report, make_scorer, confusion_matrix
from sklearn.impute import SimpleImputer, KNNImputer
from scipy.stats import boxcox
from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVC, SVR
from precision_recall_cutoff import precision_recall_cutoff
from xgboost import XGBClassifier
from cost_function import cost_function, cost_function_cutoff

s3 = boto3.resource('s3')
bucket_name = 'grant-gonnerman-data-445'
bucket = s3.Bucket(bucket_name)

file_key = 'turnover.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading data file
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [13]:
# changing sales and salary dummies
turnover = pd.concat([turnover.drop(columns = ['sales', 'salary'], axis =  1 ), pd.get_dummies(turnover[['sales', 'salary']])], axis = 1)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0


In [14]:
# engineering interactions
turnover['interaction_1'] = np.where((turnover['satisfaction_level'] >= .115) & (turnover['satisfaction_level'] <= .465) & (turnover['number_project'] > 2.5), 1, 0)
turnover['interaction_2'] = np.where((turnover['satisfaction_level'] <= .465) & (turnover['number_project'] <= 2.5) & (turnover['last_evaluation'] <= .575), 1, 0)
turnover['interaction_3'] = np.where((turnover['satisfaction_level'] > .465) & (turnover['time_spend_company'] <= 4.5) & (turnover['average_montly_hours'] <= 290.5), 1, 0)

In [15]:
# defining imput and target
x = turnover[['interaction_3', 'interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]
y = turnover['left']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y)

In [16]:
rf_param_grid = {'n_estimators': [100, 300, 500],
                'min_samples_split': [10, 15],
                'min_samples_leaf': [5, 7],
                'max_depth' : [3, 5, 7]}

# defining custom scorer
my_scorer_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

# performing grid search
rf_grid = GridSearchCV(RandomForestClassifier(), param_grid = rf_param_grid, cv = 3, scoring = my_scorer_function, n_jobs = -1).fit(x_train, y_train)

# extracting best model 
rf_md = rf_grid.best_estimator_

# predicting on test
rf_pred = rf_md.predict_proba(x_test)[:,1]

# finding optimal cutoff
opt_cutoff = cost_function_cutoff(y_test, rf_pred)

# changing to label
rf_label = np.where(rf_pred < opt_cutoff, 0, 1)

# confustion matrix
x = confusion_matrix(y_test, rf_label)
print(x)
print('the cost of the rf model is: ', -1500 * x[1, 0] - 1000 * x[0, 1] + 500 * x[1, 1])

[[2253   33]
 [  54  660]]
the cost of the rf model is:  216000


In [17]:
XGBoost_param_grid = {'n_estimators': [500],
                    'max_depth': [3, 5, 7],
                    'min_child_weight': [5, 7],
                    'learning_rate': [0.01],
                    'gamma': [0.3, 0.1],
                    'subsample': [0.8, 1],
                    'colsample_bytree': [1]}

# defining custom scorer
my_scorer_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

# performing grid search
xgb_grid = GridSearchCV(XGBClassifier(), param_grid = XGBoost_param_grid, cv = 3, scoring = my_scorer_function, n_jobs = -1).fit(x_train, y_train)

# extracting best model 
xgb_md = xgb_grid.best_estimator_

# predicting on test
xgb_pred = xgb_md.predict_proba(x_test)[:,1]

# finding optimal cutoff
opt_cutoff = cost_function_cutoff(y_test, xgb_pred)

# changing to label
xgb_label = np.where(xgb_pred < opt_cutoff, 0, 1)

# confustion matrix
x = confusion_matrix(y_test, xgb_label)
print(x)
print('the cost of the xgb model is: ', -1500 * x[1, 0] - 1000 * x[0, 1] + 500 * x[1, 1])

[[2257   29]
 [  58  656]]
the cost of the xgb model is:  212000


In [None]:
# based on my results i would use the xgboost model to predict left because it has the lower cost function.