In [1]:
pip install optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.10.2-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.10.2 cmaes-0.9.1 colorlog-6.7.0 optuna-3.1.0
Note: you may nee

In [27]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
import optuna

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import recall_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer, KNNImputer
from scipy.stats import boxcox
from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVC, SVR
from precision_recall_cutoff import precision_recall_cutoff
from cost_function import cost_function

def _read_s3_csv(s3_bucket, key):
    """Fetch the object stored under ``key`` in ``s3_bucket`` and parse it as CSV.

    Parameters
    ----------
    s3_bucket : boto3 Bucket resource
        Bucket that holds the file.
    key : str
        Object key of the CSV file inside the bucket.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    # Object(...).get() returns the response mapping; its 'Body' entry is the
    # file-like stream handed straight to read_csv.
    body = s3_bucket.Object(key).get()['Body']
    return pd.read_csv(body)


s3 = boto3.resource('s3')
bucket_name = 'grant-gonnerman-data-445'
bucket = s3.Bucket(bucket_name)

# reading data files (one CSV per split)
test = _read_s3_csv(bucket, 'turnover_test.csv')
train = _read_s3_csv(bucket, 'turnover_train.csv')
val = _read_s3_csv(bucket, 'turnover_val.csv')
test.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales,salary,left
0,0.77,0.91,5,261,6,0,0,sales,medium,1
1,0.17,0.45,2,119,3,0,0,sales,medium,0
2,0.54,0.58,3,169,2,1,0,technical,high,0
3,0.56,0.73,3,226,3,0,0,RandD,medium,0
4,0.88,0.71,5,255,3,0,0,support,medium,0


In [28]:
def _one_hot_categoricals(frame):
    """Return a new frame with 'sales' and 'salary' replaced by dummy columns.

    ``pd.get_dummies(..., columns=...)`` drops the listed columns and appends
    their ``<column>_<level>`` indicator columns after the remaining ones,
    which is the same layout the manual drop + concat produced.
    """
    return pd.get_dummies(frame, columns = ['sales', 'salary'])


# changing sales and salary to dummies (same encoding for every split)
train = _one_hot_categoricals(train)
test = _one_hot_categoricals(test)
val = _one_hot_categoricals(val)
train.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,left,sales_IT,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.92,0.95,6,239,4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
1,0.88,0.89,4,254,5,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
2,0.66,0.93,5,253,5,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0
3,0.46,0.45,2,172,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
4,0.88,0.75,5,152,3,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0


In [29]:
def _add_interactions(frame):
    """Add the three 0/1 interaction flags to ``frame`` in place and return it.

    Each flag is 1 when all of its threshold conditions hold for the row:

    * ``interaction_1``: .115 <= satisfaction_level <= .465 and number_project > 2.5
    * ``interaction_2``: satisfaction_level <= .465, number_project <= 2.5
      and last_evaluation <= .575
    * ``interaction_3``: satisfaction_level > .465, time_spend_company <= 4.5
      and average_montly_hours <= 290.5

    NOTE(review): the thresholds look like decision-tree split points; confirm
    where they came from.
    """
    satisfaction = frame['satisfaction_level']
    projects = frame['number_project']

    frame['interaction_1'] = np.where((satisfaction >= .115) & (satisfaction <= .465) & (projects > 2.5), 1, 0)
    frame['interaction_2'] = np.where((satisfaction <= .465) & (projects <= 2.5) & (frame['last_evaluation'] <= .575), 1, 0)
    frame['interaction_3'] = np.where((satisfaction > .465) & (frame['time_spend_company'] <= 4.5) & (frame['average_montly_hours'] <= 290.5), 1, 0)
    return frame


# engineering interactions (identical rules applied to every split)
for split in (train, test, val):
    _add_interactions(split)

In [30]:
# Model inputs: two engineered flags plus three raw columns; the target is 'left'.
x = train.loc[:, [
    'interaction_3',
    'interaction_1',
    'satisfaction_level',
    'time_spend_company',
    'number_project',
]]
y = train.loc[:, 'left']

class objective:
    """Optuna objective: mean 3-fold cross-validated score of a random forest.

    The data is read from the module-level ``x`` (features) and ``y`` (target)
    at call time. ``seed`` drives both the fold assignment and the forest's own
    randomness, so a given set of hyper-parameters always yields the same score.
    """

    def __init__(self, seed):
        """Store the random seed shared by the CV splitter and the forests."""
        self.seed = seed

    def __call__(self, trial):
        """Evaluate one hyper-parameter suggestion.

        Parameters
        ----------
        trial : optuna trial
            Used to draw integer values for the four searched hyper-parameters.

        Returns
        -------
        float
            Mean over the folds of the first element returned by ``cost_function``.
        """
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
                      min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                      min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                      max_depth = trial.suggest_int('max_depth', 2, 10))

        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)

        scores = list()
        for train_idx, val_idx in skf.split(x, y):

            ## Splitting the data
            x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            # Seed the forest as well: without it, identical params give a
            # different score on every run and trials are not comparable.
            rf_md = RandomForestClassifier(random_state = self.seed, **params).fit(x_train, y_train)

            # probability of the positive class (second column)
            pred_val = rf_md.predict_proba(x_val)[:, 1]

            # cost_function is a project helper; only its first element is used
            # as the fold score here (presumably the best achievable value).
            score = cost_function(y_val, pred_val)
            scores.append(score[0])

        return np.mean(scores)

In [31]:
seed = 42
n_trials = 20

# Seed the sampler too, so the sequence of suggested hyper-parameters is
# repeatable. TPESampler is what create_study uses by default for a
# single-objective study, so only the seeding changes.
study = optuna.create_study(direction = 'maximize', sampler = optuna.samplers.TPESampler(seed = seed))
study.optimize(objective(seed), n_trials = n_trials)

[32m[I 2023-03-24 17:58:27,602][0m A new study created in memory with name: no-name-d11ce2e9-4ff8-4cbf-9ce9-56d788d55c71[0m
[32m[I 2023-03-24 17:58:38,339][0m Trial 0 finished with value: 209166.66666666666 and parameters: {'n_estimators': 1206, 'min_samples_split': 25, 'min_samples_leaf': 29, 'max_depth': 4}. Best is trial 0 with value: 209166.66666666666.[0m
[32m[I 2023-03-24 17:58:55,560][0m Trial 1 finished with value: 274166.6666666667 and parameters: {'n_estimators': 1539, 'min_samples_split': 13, 'min_samples_leaf': 18, 'max_depth': 10}. Best is trial 1 with value: 274166.6666666667.[0m
[32m[I 2023-03-24 17:59:10,404][0m Trial 2 finished with value: 159166.66666666666 and parameters: {'n_estimators': 1827, 'min_samples_split': 13, 'min_samples_leaf': 28, 'max_depth': 3}. Best is trial 1 with value: 274166.6666666667.[0m
[32m[I 2023-03-24 17:59:23,176][0m Trial 3 finished with value: 160166.66666666666 and parameters: {'n_estimators': 1560, 'min_samples_split': 29, 

In [32]:
# hyper-parameters of the highest-scoring trial
study.best_params

{'n_estimators': 765,
 'min_samples_split': 25,
 'min_samples_leaf': 5,
 'max_depth': 9}

In [35]:
# Reuse the exact training columns so val/test features cannot drift from x.
feature_cols = list(x.columns)

# Refit on the full training split with the tuned hyper-parameters; the fixed
# random_state makes the fitted forest (and the numbers below) repeatable.
rf_md = RandomForestClassifier(random_state = seed, **study.best_trial.params).fit(x, y)

x_val = val[feature_cols]
x_test = test[feature_cols]

y_val = val['left']
y_test = test['left']

# probability of the positive class for each split
rf_val_pred = rf_md.predict_proba(x_val)[:,1]
rf_test_pred = rf_md.predict_proba(x_test)[:,1]

# The cutoff is picked on the validation split only and then applied to the
# test split; element 1 of cost_function's result is presumably that cutoff.
opt_cutoff = cost_function(y_val, rf_val_pred)[1]

rf_label = np.where(rf_test_pred < opt_cutoff, 0, 1)

con_mat = confusion_matrix(y_test, rf_label)
print(con_mat)

# confusion_matrix rows are actual labels, columns are predicted labels
false_neg, false_pos, true_pos = con_mat[1, 0], con_mat[0, 1], con_mat[1, 1]
print('the cost of the rf model is: ', -1500 * false_neg - 1000 * false_pos + 500 * true_pos)

[[1126   17]
 [  28  329]]
the cost of the rf model is:  105500
