In [1]:
pip install optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install catboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install lightgbm

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


## Exercise 1
***

In [5]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold

from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.feature_selection import RFE, RFECV

from sklearn.metrics import classification_report, confusion_matrix, make_scorer
from cost_function import cost_function, cost_function_cutoff

## define bucket in which you are trying to reach
s3 = boto3.resource('s3')
bucket_name = 'daltondencklau-data445-bucket'
bucket = s3.Bucket(bucket_name)

## define csv file to read in the bucket
file_key= 'train.csv'
file_key2= 'test.csv'

bucket_object= bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

bucket_object2= bucket.Object(file_key2)
file_object2 = bucket_object2.get()
file_content_stream2 = file_object2.get('Body')

#reading the datefile
train = pd.read_csv(file_content_stream, sep = '|')
test = pd.read_csv(file_content_stream2, sep = '|')

In [6]:
train.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,0
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,0


In [7]:
test.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
0,4,467,88.48,4,8,4,0.014989,0.189465,0.571429
1,3,1004,58.99,7,6,1,0.026892,0.058755,0.259259
2,1,162,14.0,4,5,4,0.006173,0.08642,4.0
3,5,532,84.79,9,3,4,0.026316,0.15938,0.642857
4,5,890,42.16,4,0,0,0.021348,0.047371,0.210526


### top variables from hw5
##### totalScanTimeInSeconds
##### trustLevel
##### lineItemVoidsPerPosition
##### interaction_4
##### scannedLineItemsPerSecond
##### scansWithoutRegistration
##### lineItemVoids

In [8]:
## engineering features from hw5 (interaction_4 only)
train['interaction_4'] = np.where(((train['trustLevel'] <= 0.431) &
                                      (train['scannedLineItemsPerSecond'] <= 0.012) &
                                      (train['totalScanTimeInSeconds'] <= 895)), 1, 0)

test['interaction_4'] = np.where(((test['trustLevel'] <= 0.431) &
                                      (test['scannedLineItemsPerSecond'] <= 0.012) &
                                      (test['totalScanTimeInSeconds'] <= 895)), 1, 0)

## Exercise 2
***

In [9]:
## defining input and target variables (top 7 features from hw5)
x_train_top7 = train[['totalScanTimeInSeconds', 'trustLevel', 'lineItemVoidsPerPosition',
                 'interaction_4', 'scannedLineItemsPerSecond', 'scansWithoutRegistration', 'lineItemVoids']]

x_train_top6 = train[['totalScanTimeInSeconds', 'trustLevel', 'lineItemVoidsPerPosition',
                 'interaction_4', 'scannedLineItemsPerSecond', 'scansWithoutRegistration']]

x_train_top5 = train[['totalScanTimeInSeconds', 'trustLevel', 'lineItemVoidsPerPosition',
                 'interaction_4', 'scannedLineItemsPerSecond']]

y_train = train['fraud']

### i) GridSearchCV: XGB

##### model 1

top 5 features

In [10]:
## defining the parameters to tune
xgb_params = {'n_estimators': [500],
'max_depth': [3, 5, 7],
'min_child_weight': [5, 7],
'learning_rate': [0.01],
'gamma': [0.3, 0.1],
'subsample': [0.8, 1],
'colsample_bytree': [1]}

## creating customized scoring function
my_score_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

## running gridsearch cv
xgb_gridsearch_1 = GridSearchCV(estimator = XGBClassifier(), param_grid = xgb_params, cv = 3, scoring = my_score_function,
                                n_jobs = -1).fit(x_train_top5, y_train)
    
## extracting best hyperparamter combo
xgb_md_1 = xgb_gridsearch_1.best_params_
xgb_score_1 = xgb_gridsearch_1.best_score_

## printing the best combination of hyper parameters
print('The best combination of hyperparameters for XGB is:', xgb_md_1)

## printing the best score
print('The best score for XBG is:', xgb_score_1)

The best combination of hyperparameters for XGB is: {'colsample_bytree': 1, 'gamma': 0.3, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 5, 'n_estimators': 500, 'subsample': 0.8}
The best score for XBG is: -25.0


#### model 2

top 6 features

In [11]:
## defining the parameters to tune
xgb_params = {'n_estimators': [500],
'max_depth': [3, 5, 7],
'min_child_weight': [5, 7],
'learning_rate': [0.01],
'gamma': [0.3, 0.1],
'subsample': [0.8, 1],
'colsample_bytree': [1]}

## creating customized scoring function
my_score_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

## running gridsearch cv
xgb_gridsearch_2 = GridSearchCV(estimator = XGBClassifier(), param_grid = xgb_params, cv = 3, scoring = my_score_function,
                                n_jobs = -1).fit(x_train_top6, y_train)
    
## extracting best hyperparamter combo
xgb_md_2 = xgb_gridsearch_2.best_params_
xgb_score_2 = xgb_gridsearch_2.best_score_

## printing the best combination of hyper parameters
print('The best combination of hyperparameters for XGB is:', xgb_md_2)

## printing the best score
print('The best score for XBG is:', xgb_score_2)

The best combination of hyperparameters for XGB is: {'colsample_bytree': 1, 'gamma': 0.3, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 5, 'n_estimators': 500, 'subsample': 1}
The best score for XBG is: -1.6666666666666667


#### model 3

top 7 features

In [12]:
## defining the parameters to tune
xgb_params = {'n_estimators': [500],
'max_depth': [3, 5, 7],
'min_child_weight': [5, 7],
'learning_rate': [0.01],
'gamma': [0.3, 0.1],
'subsample': [0.8, 1],
'colsample_bytree': [1]}

## creating customized scoring function
my_score_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

## gridsearch cv
xgb_gridsearch_3 = GridSearchCV(estimator = XGBClassifier(), param_grid = xgb_params, cv = 3, scoring = my_score_function,
                                n_jobs = -1).fit(x_train_top7, y_train)
    
## extracting best hyperparamter combo
xgb_md_3 = xgb_gridsearch_3.best_params_
xgb_score_3 = xgb_gridsearch_3.best_score_

## printing the best combination of hyper parameters
print('The best combination of hyperparameters for XGB is:', xgb_md_3)

## printing the best score
print('The best score for XBG is:', xgb_score_3)

The best combination of hyperparameters for XGB is: {'colsample_bytree': 1, 'gamma': 0.3, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 500, 'subsample': 0.8}
The best score for XBG is: 8.333333333333334


Best model = model 3

### ii) RandomSearchCV: AdaBoost

#### model 1
top 5 features

In [13]:
## defining hyperparameters to consider tuning
ada_params = {'n_estimators': [100, 300],
                 'base_estimator__min_samples_split': [10, 15],
                 'base_estimator__min_samples_leaf': [5, 7],
                 'base_estimator__max_depth': [3, 5, 7],
                 'learning_rate': [0.001]}

## creating customized scoring function
my_score_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

## running GridsearchCV
ada_random_search_1 = RandomizedSearchCV(estimator = AdaBoostClassifier(base_estimator = DecisionTreeClassifier()), 
                                   param_distributions = ada_params, 
                                   cv = 3, 
                                   scoring = my_score_function, 
                                   n_jobs = -1,
                                   n_iter = 30).fit(x_train_top5, y_train)


## extracting the best hyperparameter combinations
ada_random_search_1.best_params_

## extracting the best score
ada_score_1 = ada_random_search_1.best_score_



In [14]:
print('Best hyper-parameter combination', ada_random_search_1.best_params_)
print('Best score:', ada_score_1)

Best hyper-parameter combination {'n_estimators': 100, 'learning_rate': 0.001, 'base_estimator__min_samples_split': 10, 'base_estimator__min_samples_leaf': 5, 'base_estimator__max_depth': 7}
Best score: -30.0


#### model 2
top 6 features

In [15]:
## defining hyperparameters to consider tuning
ada_params = {'n_estimators': [100, 300],
                 'base_estimator__min_samples_split': [10, 15],
                 'base_estimator__min_samples_leaf': [5, 7],
                 'base_estimator__max_depth': [3, 5, 7],
                 'learning_rate': [0.001]}

## creating customized scoring function
my_score_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

## running GridsearchCV
ada_random_search_2 = RandomizedSearchCV(estimator = AdaBoostClassifier(base_estimator = DecisionTreeClassifier()),
                               param_distributions = ada_params, cv = 3, scoring = my_score_function, n_jobs = -1).fit(x_train_top6, y_train)

## extracting the best hyperparameter combinations
ada_random_search_2.best_params_

## extracting the best score
ada_score_2 = ada_random_search_2.best_score_



In [16]:
## extracting the best score
ada_score_2 = ada_random_search_2.best_score_

In [17]:
ada_score_2

-30.0

In [18]:
print('Best hyper-parameter combination', ada_random_search_2.best_params_)
print('Best score:', ada_score_2)

Best hyper-parameter combination {'n_estimators': 300, 'learning_rate': 0.001, 'base_estimator__min_samples_split': 10, 'base_estimator__min_samples_leaf': 5, 'base_estimator__max_depth': 5}
Best score: -30.0


#### model 3
top 7 features

In [19]:
## defining hyperparameters to consider tuning
ada_params = {'n_estimators': [100, 300],
                 'base_estimator__min_samples_split': [10, 15],
                 'base_estimator__min_samples_leaf': [5, 7],
                 'base_estimator__max_depth': [3, 5, 7],
                 'learning_rate': [0.001]}

## creating customized scoring function
my_score_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

## running GridsearchCV
ada_random_search_3 = RandomizedSearchCV(estimator = AdaBoostClassifier(base_estimator = DecisionTreeClassifier()),
                               param_distributions = ada_params, cv = 3, scoring = my_score_function, n_jobs = -1).fit(x_train_top7, y_train)

## extracting the best hyperparameter combinations
ada_random_search_3.best_params_

## extracting the best score
ada_score_3 = ada_random_search_3.best_score_



In [20]:
print('Best hyper-parameter combo', ada_random_search_3.best_params_)
print('Best score:', ada_score_3)

Best hyper-parameter combo {'n_estimators': 300, 'learning_rate': 0.001, 'base_estimator__min_samples_split': 15, 'base_estimator__min_samples_leaf': 5, 'base_estimator__max_depth': 7}
Best score: -16.666666666666668


### iii) Optuna: LightGBM

##### model 1

top 5 features

In [21]:
class Objective:
    def __init__(self, seed):
        self.seed = seed
        
    def __call__(self, trial):
    ## Parameters to be evaluated
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
            min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
            max_depth = trial.suggest_int('max_depth', 2, 10)
            )
        scores= list()
         
        #Running cross validation
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
       
        for train_idx, valid_idx in skf.split(x_train_top5, y_train):
            x_train_1, x_valid_1 = x_train_top5.iloc[train_idx], x_train_top5.iloc[valid_idx]
            y_train_1 , y_valid_1 = y_train.iloc[train_idx] , y_train.iloc[valid_idx]
         
            rf_md = RandomForestClassifier(**params).fit(x_train_1, y_train_1)
         
            preds_valid = rf_md.predict_proba(x_valid_1)[:,1]
            
            score = cost_function(y_valid_1, preds_valid)
            
            scores.append(score)
         
        return np.mean(scores)

In [22]:
seed = 42
n_trials = 30

study_1 = optuna.create_study(direction = 'maximize')
study_1.optimize(Objective(seed), n_trials = n_trials)

[32m[I 2023-03-31 18:39:23,868][0m A new study created in memory with name: no-name-2f788cb7-9494-4747-8d1a-4ba80fac5820[0m
[32m[I 2023-03-31 18:39:27,930][0m Trial 0 finished with value: -51.666666666666664 and parameters: {'n_estimators': 517, 'min_samples_split': 10, 'min_samples_leaf': 12, 'max_depth': 9}. Best is trial 0 with value: -51.666666666666664.[0m
[32m[I 2023-03-31 18:39:38,401][0m Trial 1 finished with value: -45.0 and parameters: {'n_estimators': 1440, 'min_samples_split': 26, 'min_samples_leaf': 5, 'max_depth': 6}. Best is trial 1 with value: -45.0.[0m
[32m[I 2023-03-31 18:39:51,207][0m Trial 2 finished with value: -56.666666666666664 and parameters: {'n_estimators': 1833, 'min_samples_split': 11, 'min_samples_leaf': 17, 'max_depth': 3}. Best is trial 1 with value: -45.0.[0m
[32m[I 2023-03-31 18:39:58,460][0m Trial 3 finished with value: -45.0 and parameters: {'n_estimators': 990, 'min_samples_split': 28, 'min_samples_leaf': 10, 'max_depth': 5}. Best is t

In [23]:
study_1.best_trial.params

{'n_estimators': 1237,
 'min_samples_split': 7,
 'min_samples_leaf': 7,
 'max_depth': 10}

##### model 2

top 6 features

In [24]:
class Objective:
    def __init__(self, seed):
        self.seed = seed
        
    def __call__(self, trial):
    ## Parameters to be evaluated
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
            min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
            max_depth = trial.suggest_int('max_depth', 2, 10)
            )
        scores= list()
         
        #Running cross validation
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
       
        for train_idx, valid_idx in skf.split(x_train_top6, y_train):
            x_train_2, x_valid_2 = x_train_top6.iloc[train_idx], x_train_top6.iloc[valid_idx]
            y_train_2 , y_valid_2 = y_train.iloc[train_idx] , y_train.iloc[valid_idx]
         
            rf_md = RandomForestClassifier(**params).fit(x_train_2, y_train_2)
         
            preds_valid = rf_md.predict_proba(x_valid_2)[:,1]
            
            score = cost_function(y_valid_2, preds_valid)
            
            scores.append(score)
         
        return np.mean(scores)

In [25]:
seed = 42
n_trials = 30

study_2 = optuna.create_study(direction = 'maximize')
study_2.optimize(Objective(seed), n_trials = n_trials)

[32m[I 2023-03-31 18:43:23,927][0m A new study created in memory with name: no-name-08b2d734-9634-4c56-ba18-520c7fc121cd[0m
[32m[I 2023-03-31 18:43:26,834][0m Trial 0 finished with value: -88.33333333333333 and parameters: {'n_estimators': 361, 'min_samples_split': 8, 'min_samples_leaf': 30, 'max_depth': 6}. Best is trial 0 with value: -88.33333333333333.[0m
[32m[I 2023-03-31 18:43:28,001][0m Trial 1 finished with value: -56.666666666666664 and parameters: {'n_estimators': 141, 'min_samples_split': 30, 'min_samples_leaf': 11, 'max_depth': 9}. Best is trial 1 with value: -56.666666666666664.[0m
[32m[I 2023-03-31 18:43:30,029][0m Trial 2 finished with value: -103.33333333333333 and parameters: {'n_estimators': 275, 'min_samples_split': 28, 'min_samples_leaf': 25, 'max_depth': 2}. Best is trial 1 with value: -56.666666666666664.[0m
[32m[I 2023-03-31 18:43:44,045][0m Trial 3 finished with value: -80.0 and parameters: {'n_estimators': 1957, 'min_samples_split': 9, 'min_samples

In [26]:
study_2.best_trial.params

{'n_estimators': 430,
 'min_samples_split': 17,
 'min_samples_leaf': 7,
 'max_depth': 9}

##### model 3

top 7 features

In [27]:
class Objective:
    def __init__(self, seed):
        self.seed = seed
        
    def __call__(self, trial):
    ## Parameters to be evaluated
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
            min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
            max_depth = trial.suggest_int('max_depth', 2, 10)
            )
        scores= list()
         
        #Running cross validation
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
       
        for train_idx, valid_idx in skf.split(x_train_top7, y_train):
            x_train_3, x_valid_3 = x_train_top7.iloc[train_idx], x_train_top7.iloc[valid_idx]
            y_train_3, y_valid_3 = y_train.iloc[train_idx] , y_train.iloc[valid_idx]
         
            rf_md = RandomForestClassifier(**params).fit(x_train_3, y_train_3)
         
            preds_valid = rf_md.predict_proba(x_valid_3)[:,1]
            
            score = cost_function(y_valid_3, preds_valid)
            
            scores.append(score)
         
        return np.mean(scores)

In [28]:
seed = 42
n_trials = 30

study_3 = optuna.create_study(direction = 'maximize')
study_3.optimize(Objective(seed), n_trials = n_trials)

[32m[I 2023-03-31 18:45:55,707][0m A new study created in memory with name: no-name-5f9d10e7-8ef9-4cf8-bd6a-8fea265d11ea[0m
[32m[I 2023-03-31 18:46:06,756][0m Trial 0 finished with value: -73.33333333333333 and parameters: {'n_estimators': 1741, 'min_samples_split': 16, 'min_samples_leaf': 5, 'max_depth': 2}. Best is trial 0 with value: -73.33333333333333.[0m
[32m[I 2023-03-31 18:46:17,062][0m Trial 1 finished with value: -40.0 and parameters: {'n_estimators': 1519, 'min_samples_split': 8, 'min_samples_leaf': 15, 'max_depth': 4}. Best is trial 1 with value: -40.0.[0m
[32m[I 2023-03-31 18:46:29,832][0m Trial 2 finished with value: -63.333333333333336 and parameters: {'n_estimators': 1861, 'min_samples_split': 28, 'min_samples_leaf': 29, 'max_depth': 10}. Best is trial 1 with value: -40.0.[0m
[32m[I 2023-03-31 18:46:35,761][0m Trial 3 finished with value: -48.333333333333336 and parameters: {'n_estimators': 858, 'min_samples_split': 24, 'min_samples_leaf': 19, 'max_depth': 

In [29]:
study_3.best_trial.params

{'n_estimators': 531,
 'min_samples_split': 20,
 'min_samples_leaf': 10,
 'max_depth': 6}

## Exercise 3
***

In [34]:
## defining input and target variables
x = train[['totalScanTimeInSeconds', 'trustLevel', 'lineItemVoidsPerPosition',
                 'interaction_4', 'scannedLineItemsPerSecond', 'scansWithoutRegistration', 'lineItemVoids',]]
y = train['fraud']

## defining the testing variables
test_final = test[['totalScanTimeInSeconds', 'trustLevel', 'lineItemVoidsPerPosition',
                   'interaction_4', 'scannedLineItemsPerSecond', 'scansWithoutRegistration', 'lineItemVoids',]]

## splitting the data into 80% training and 20% validation
x_train_final, x_valid, y_train_final, y_valid = train_test_split(x, y, test_size = 0.2, stratify = y)

#### best xgb model: model 1
'colsample_bytree': 1, 'gamma': 0.3, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 500, 'subsample': 0.8

In [38]:
## building XGB model with optimal hyperparameters
xgb_final_md = XGBClassifier(colsample_bytree = 1, gamma = 0.3, learning_rate = 0.01, max_depth = 3,
                                 min_child_weight = 5, n_estimators = 500, subsample = 0.8).fit(x_train_final, y_train_final)

## predicting on testing and validation
xgb_val_pred = xgb_final_md.predict_proba(x_valid)[:, 1]
xgb_test_pred = xgb_final_md.predict_proba(test_final)[:, 1]

#### best ada model: model 3
'n_estimators': 300, 'learning_rate': 0.001, 'base_estimator__min_samples_split': 15, 'base_estimator__min_samples_leaf': 5, 'base_estimator__max_depth': 7

In [45]:
## building ada model with optimal hyperparameters
ada_final_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(min_samples_split = 15,
                            min_samples_leaf = 5, max_depth = 7), n_estimators = 300,
                            learning_rate = 0.001).fit(x_train_final, y_train_final)

## predicting on testing and validation
ada_val_pred = ada_final_md.predict_proba(x_valid)[:, 1]
ada_test_pred = ada_final_md.predict_proba(test_final)[:, 1]



#### best rf model: model 3
'n_estimators': 531,
 'min_samples_split': 20,
 'min_samples_leaf': 10,
 'max_depth': 6

In [46]:
## building rf model with optimal hyperparameters
rf_final_md = RandomForestClassifier(n_estimators = 531,
                                     max_depth = 6,
                                     min_samples_split = 20,
                                     min_samples_leaf = 10).fit(x_train_final, y_train_final)

## predicting on the testing and validation
rf_val_pred = rf_final_md.predict_proba(x_valid)[:, 1]
rf_test_pred = rf_final_md.predict_proba(test_final)[:, 1]

#### building ensemble model

In [58]:
## combining all preds into dataframe
x_ensemble = pd.concat([pd.DataFrame(xgb_val_pred),pd.DataFrame(ada_val_pred), pd.DataFrame(rf_val_pred)], axis = 1)
x_test_ensemble = pd.concat([pd.DataFrame(xgb_test_pred),pd.DataFrame(ada_test_pred), pd.DataFrame(rf_test_pred)], axis = 1)

x = x_ensemble
y = y_valid

class Objective:
    def __init__(self, seed):
        self.seed = seed

    def __call__(self, trial):
        
        
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
        min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
        max_depth = trial.suggest_int('max_depth', 2, 10)
        )
        scores = []
       
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, valid_idx in skf.split(x, y):
            x_train, x_valid = x.iloc[train_idx], x.iloc[valid_idx]
            y_train , y_valid = y.iloc[train_idx] , y.iloc[valid_idx]
            
            rf_md = RandomForestClassifier(**params).fit(x_train, y_train)
            
            preds_valid = rf_md.predict_proba(x_valid)[:, 1]
            score = cost_function(y_valid, preds_valid)
            
            scores.append(score)
            
        return np.mean(scores)

In [59]:
## Defining seed and number of trials
seed = 42
n_trials = 30

study = optuna.create_study(direction = 'maximize')
study.optimize(Objective(seed), n_trials = n_trials)

[32m[I 2023-03-31 19:15:02,022][0m A new study created in memory with name: no-name-6cf42683-81c8-4de2-a8f4-91ed0ebb2613[0m
[32m[I 2023-03-31 19:15:06,518][0m Trial 0 finished with value: 15.0 and parameters: {'n_estimators': 716, 'min_samples_split': 15, 'min_samples_leaf': 12, 'max_depth': 6}. Best is trial 0 with value: 15.0.[0m
[32m[I 2023-03-31 19:15:08,366][0m Trial 1 finished with value: -35.0 and parameters: {'n_estimators': 300, 'min_samples_split': 23, 'min_samples_leaf': 26, 'max_depth': 10}. Best is trial 0 with value: 15.0.[0m
[32m[I 2023-03-31 19:15:13,095][0m Trial 2 finished with value: -35.0 and parameters: {'n_estimators': 805, 'min_samples_split': 30, 'min_samples_leaf': 28, 'max_depth': 7}. Best is trial 0 with value: 15.0.[0m
[32m[I 2023-03-31 19:15:18,695][0m Trial 3 finished with value: -35.0 and parameters: {'n_estimators': 1001, 'min_samples_split': 17, 'min_samples_leaf': 30, 'max_depth': 5}. Best is trial 0 with value: 15.0.[0m
[32m[I 2023-03-

In [74]:
## final model
rf_ensemble_md = RandomForestClassifier(**study.best_trial.params).fit(x_ensemble, y_valid)

## Predicting on testing and valid
rf_ensemble_val_preds = rf_ensemble_md.predict_proba(x_ensemble)[:, 1]
rf_ensemble_test_preds = rf_ensemble_md.predict_proba(x_test_ensemble)[:, 1]

## optimal cutoff value
optimal_cutoff = cost_function_cutoff(y_valid, rf_ensemble_val_preds)

## changing likelihoods to labels
rf_ensemble_label_preds = np.where(rf_ensemble_test_preds < optimal_cutoff, 0, 1)

print('The optimal cutoff is:', optimal_cutoff)

The optimal cutoff is: 0.4


In [76]:
## submitting final likelihoods to csv file
final_likelihoods = pd.DataFrame({'Final Likelihoods': rf_ensemble_test_preds})
final_likelihoods.to_csv('final_likelihoods_hw6.csv', index = False)