In [1]:
pip install optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.10.2-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.10.2 cmaes-0.9.1 colorlog-6.7.0 optuna-3.1.0

In [28]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

from scipy.stats import boxcox
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix

import optuna

from cost_function import cost_function

## define bucket in which you are trying to reach
s3 = boto3.resource('s3')
bucket_name = 'daltondencklau-data445-bucket'
bucket = s3.Bucket(bucket_name)

## define csv file to read in the bucket
file_key= 'turnover.csv'

## syntax to allow us to read the file
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## reading the data file
train = pd.read_csv('turnover_train.csv')
validation = pd.read_csv('turnover_val.csv')
test = pd.read_csv('turnover_test.csv')

In [29]:
## changing 'sales' and 'salary' to dummy variables
train = pd.concat([train.drop(columns = ['sales', 'salary'], axis = 1), pd.get_dummies(train[['sales', 'salary']])], axis = 1)
validation = pd.concat([validation.drop(columns = ['sales', 'salary'], axis = 1), pd.get_dummies(validation[['sales', 'salary']])], axis = 1)
test = pd.concat([test.drop(columns = ['sales', 'salary'], axis = 1), pd.get_dummies(test[['sales', 'salary']])], axis = 1)

In [30]:
## Engineering Features from the decision tree model
train['interaction_1'] = np.where(((train['satisfaction_level'] >= 0.115) &
                                      (train['satisfaction_level'] <= 0.465) &
                                      (train['number_project'] > 2.5)), 1, 0)
                              
train['interaction_2'] = np.where(((train['satisfaction_level'] <= 0.465) &
                                      (train['satisfaction_level'] <= 2.5) &
                                      (train['number_project'] <= 0.575)), 1, 0)

train['interaction_3'] = np.where(((train['satisfaction_level'] > 0.465) &
                                      (train['time_spend_company'] <= 0.45) &
                                      (train['average_montly_hours'] <= 290.5)), 1, 0)

## Engineering Features from the decision tree model
validation['interaction_1'] = np.where(((validation['satisfaction_level'] >= 0.115) &
                                      (validation['satisfaction_level'] <= 0.465) &
                                      (validation['number_project'] > 2.5)), 1, 0)

validation['interaction_2'] = np.where(((validation['satisfaction_level'] <= 0.465) &
                                      (validation['satisfaction_level'] <= 2.5) &
                                      (validation['number_project'] <= 0.575)), 1, 0)

validation['interaction_3'] = np.where(((validation['satisfaction_level'] > 0.465) &
                                      (validation['time_spend_company'] <= 0.45) &
                                      (validation['average_montly_hours'] <= 290.5)), 1, 0)

## Engineering Features from the decision tree model
test['interaction_1'] = np.where(((test['satisfaction_level'] >= 0.115) &
                                      (test['satisfaction_level'] <= 0.465) &
                                      (test['number_project'] > 2.5)), 1, 0)
                              
test['interaction_2'] = np.where(((test['satisfaction_level'] <= 0.465) &
                                      (test['satisfaction_level'] <= 2.5) &
                                      (test['number_project'] <= 0.575)), 1, 0)

test['interaction_3'] = np.where(((test['satisfaction_level'] > 0.465) &
                                      (test['time_spend_company'] <= 0.45) &
                                      (test['average_montly_hours'] <= 290.5)), 1, 0)

## Optuna Random Forest

In [25]:
x = train[['interaction_3', 'interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]
y = train[['left']]

class Objective:
    def __init__(self, seed):
        self.seed = seed
        
    def __call__(self, trial):
            
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
                          min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                          min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                          max_depth = trial.suggest_int('max_depth', 2, 10)
                          )
        scores = list()
            
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
            
        for train_idx, valid_idx in skf.split(x,y):
            x_train, x_valid = x.iloc[train_idx], x.iloc[valid_idx]
            y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
                
            rf_md = RandomForestClassifier(**params).fit(x_train, y_train)
                
            pred_valid = rf_md.predict_proba(x_valid)[:, 1]
                
            score = cost_function(y_valid, pred_valid)
                
            scores.append(score[0])
                
        return np.mean(scores)

In [26]:
seed = 42
n_trials = 20

study = optuna.create_study(direction = 'maximize')
study.optimize(Objective(seed), n_trials = n_trials)

[32m[I 2023-03-24 17:46:26,806][0m A new study created in memory with name: no-name-b6ef812e-d32b-4153-a35c-6f161c8bc3b7[0m
  rf_md = RandomForestClassifier(**params).fit(x_train, y_train)
  rf_md = RandomForestClassifier(**params).fit(x_train, y_train)
  rf_md = RandomForestClassifier(**params).fit(x_train, y_train)
[32m[I 2023-03-24 17:46:34,251][0m Trial 0 finished with value: 168666.66666666666 and parameters: {'n_estimators': 758, 'min_samples_split': 5, 'min_samples_leaf': 30, 'max_depth': 3}. Best is trial 0 with value: 168666.66666666666.[0m
  rf_md = RandomForestClassifier(**params).fit(x_train, y_train)
  rf_md = RandomForestClassifier(**params).fit(x_train, y_train)
  rf_md = RandomForestClassifier(**params).fit(x_train, y_train)
[32m[I 2023-03-24 17:46:36,468][0m Trial 1 finished with value: 223166.66666666666 and parameters: {'n_estimators': 220, 'min_samples_split': 11, 'min_samples_leaf': 11, 'max_depth': 4}. Best is trial 1 with value: 223166.66666666666.[0m
  

In [27]:
study.best_trial.params

{'n_estimators': 692,
 'min_samples_split': 19,
 'min_samples_leaf': 5,
 'max_depth': 10}

In [38]:
## validation, test definition
x_val = validation[['interaction_3', 'interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]
x_test = test[['interaction_3', 'interaction_1', 'satisfaction_level', 'time_spend_company', 'number_project']]

y_val = validation['left']
y_test = test['left']
## building a random forest model using the above parameters from optuna
rf_md2 = RandomForestClassifier(**study.best_trial.params).fit(x, y)

## predicting on validation and test
rf_val_pred = rf_md2.predict_proba(x_val)[:,1]
rf_test_pred = rf_md2.predict_proba(x_test)[:,1]

## identifying optimal cutoff
opt_cutoff = cost_function(y_val, rf_val_pred)[1]

## changing likelihoods to labels
rf_label = np.where(rf_test_pred < opt_cutoff, 0, 1)

conf_mat = confusion_matrix(y_test, rf_label) 
print(conf_mat)
print('the cost of the RF is:', -1500 * conf_mat[1, 0] - 1000 * conf_mat[0, 1] + 500 * conf_mat[1, 1])

  rf_md2 = RandomForestClassifier(**study.best_trial.params).fit(x, y)


[[1125   18]
 [  28  329]]
the cost of the RF is: 104500
