In [1]:
pip install optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting alembic>=1.5.0
  Downloading alembic-1.10.2-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.10.2 cmaes-0.9.1 colorlog-6.7.0 optuna-3.1.0
Note: you may nee

In [8]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna

from cost_function import cost_function

from tqdm import tqdm
from scipy.stats import boxcox
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.feature_selection import RFE, RFECV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier 
from sklearn.metrics import classification_report, make_scorer, confusion_matrix

#loading the three pre-split csv files (train / validation / test) in one pass
split_files = ('turnover_train.csv', 'turnover_val.csv', 'turnover_test.csv')
train, validation, test = map(pd.read_csv, split_files)

In [9]:
#replacing the sales and salary columns by their dummy columns in every split
#(each split is encoded on its own: raw columns dropped, dummies appended on the right)
dummy_cols = ['sales', 'salary']
train, validation, test = (
    pd.concat([split.drop(columns = dummy_cols), pd.get_dummies(split[dummy_cols])], axis = 1)
    for split in (train, validation, test)
)

In [10]:
#creating new features from InClass_10
def add_interaction_features(frame):
    """Add the three 0/1 interaction flags to `frame` in place and return it.

    Each flag is 1 when all three of its conditions hold for a row, else 0:

    * interaction_1: 0.115 <= satisfaction_level <= 0.465 and number_project > 2.5
    * interaction_2: satisfaction_level <= 0.465, number_project <= 2.5
                     and last_evaluation <= 0.575
    * interaction_3: satisfaction_level > 0.465, time_spend_company <= 4.5
                     and average_montly_hours <= 290.5

    The thresholds are the ones carried over from InClass_10.
    """
    satisfaction = frame['satisfaction_level']
    projects = frame['number_project']

    frame['interaction_1'] = np.where((satisfaction <= 0.465) &
                                      (projects > 2.5) &
                                      (satisfaction >= 0.115), 1, 0)

    frame['interaction_2'] = np.where((satisfaction <= 0.465) &
                                      (projects <= 2.5) &
                                      (frame['last_evaluation'] <= 0.575), 1, 0)

    #written as "> 0.465" (not a negation of "<= 0.465") so missing values still map to 0
    frame['interaction_3'] = np.where((satisfaction > 0.465) &
                                      (frame['time_spend_company'] <= 4.5) &
                                      (frame['average_montly_hours'] <= 290.5), 1, 0)

    return frame

#same recipe for every split, so the engineered columns cannot drift apart
for split in (train, validation, test):
    add_interaction_features(split)

**Random Forest w/ Optuna**

In [18]:
#selecting the model inputs (x) and the target column (y) from the training split
x = train.loc[:, ['satisfaction_level', 'number_project', 'time_spend_company', 'interaction_1', 'interaction_3']]
y = train.loc[:, 'left']

class Objective:
    """Optuna objective: mean 3-fold cross-validated score of a random forest.

    Calling the instance with a trial samples a set of random forest
    hyper-parameters, fits one forest per stratified fold and returns the
    average fold score. The module-level `x` (inputs) and `y` (target)
    are read directly.
    """

    def __init__(self, seed):
        """Store `seed`, used for both the fold shuffling and the forests."""
        self.seed = seed

    def __call__(self, trial):
        """Evaluate one trial and return the mean score over the 3 folds."""
        #search space sampled by the trial
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
                      min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                      min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                      max_depth = trial.suggest_int('max_depth', 2, 10))

        scores = list()

        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)

        for train_idx, valid_idx in skf.split(x, y):

            x_train, x_valid = x.iloc[train_idx], x.iloc[valid_idx]
            y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

            #seeding the forest as well as the folds: without random_state the same
            #hyper-parameters give a different score on every run
            rf_md = RandomForestClassifier(random_state = self.seed, **params).fit(x_train, y_train)

            #likelihood of the positive class (second column of predict_proba)
            pred_valid = rf_md.predict_proba(x_valid)[:,1]

            #only the first element returned by cost_function is used as the fold score
            #NOTE(review): cost_function comes from a local module; confirm what the
            #other returned elements are
            score = cost_function(y_valid, pred_valid)

            scores.append(score[0])

        return np.mean(scores)

In [13]:
SEED = 42
N_TRIALS = 20

#seeding the sampler too, otherwise every run proposes different hyper-parameters
#even though the objective itself receives SEED
study = optuna.create_study(direction = 'maximize', sampler = optuna.samplers.TPESampler(seed = SEED))
study.optimize(Objective(SEED), n_trials = N_TRIALS)

[32m[I 2023-03-24 17:43:58,435][0m A new study created in memory with name: no-name-03a2ff26-52be-497f-9040-4e224364938f[0m
[32m[I 2023-03-24 17:44:16,709][0m Trial 0 finished with value: 294666.6666666667 and parameters: {'n_estimators': 1781, 'min_samples_split': 27, 'min_samples_leaf': 5, 'max_depth': 7}. Best is trial 0 with value: 294666.6666666667.[0m
[32m[I 2023-03-24 17:44:32,268][0m Trial 1 finished with value: 290166.6666666667 and parameters: {'n_estimators': 1452, 'min_samples_split': 30, 'min_samples_leaf': 14, 'max_depth': 9}. Best is trial 0 with value: 294666.6666666667.[0m
[32m[I 2023-03-24 17:44:42,246][0m Trial 2 finished with value: 263166.6666666667 and parameters: {'n_estimators': 1045, 'min_samples_split': 9, 'min_samples_leaf': 28, 'max_depth': 6}. Best is trial 0 with value: 294666.6666666667.[0m
[32m[I 2023-03-24 17:44:51,987][0m Trial 3 finished with value: 261666.66666666666 and parameters: {'n_estimators': 1055, 'min_samples_split': 8, 'min_sa

In [15]:
#hyper-parameters of the best trial found by the study
study.best_params

{'n_estimators': 1781,
 'min_samples_split': 27,
 'min_samples_leaf': 5,
 'max_depth': 7}

In [22]:
#building the random forest model with the tuned hyper-parameters;
#the forest is seeded so the cut-off, confusion matrix and cost below are reproducible
rf_md = RandomForestClassifier(**study.best_trial.params, random_state = SEED).fit(x, y)

#predict on validation and test, using exactly the columns the model was trained on
model_features = list(x.columns)
x_val = validation[model_features]
y_val = validation['left']
x_test = test[model_features]
y_test = test['left']

rf_val_pred = rf_md.predict_proba(x_val)[:,1]
rf_test_pred = rf_md.predict_proba(x_test)[:,1]

#identify optimal cut off on the validation set (second element returned by cost_function)
opt_cutoff = cost_function(y_val, rf_val_pred)[1]

#changing likelihoods to labels
rf_label = np.where(rf_test_pred < opt_cutoff, 0, 1)

#labels are fixed so the matrix is always 2x2 (rows = actual, columns = predicted)
conf_mat = confusion_matrix(y_test, rf_label, labels = [0, 1])
print(conf_mat)
#-1500 per false negative, -1000 per false positive, +500 per true positive
print('The cost of the RF is ', -1500 * conf_mat[1, 0] - 1000 * conf_mat[0, 1] + 500 * conf_mat[1, 1])

[[1126   17]
 [  28  329]]
The cost of the RF is  105500
