In [1]:
pip install optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


**Exercise 1: (5 points) Using the bucket, that you create in the last homework assignment, and the pandas
library, read the train.csv and test.csv data files and create two data-frames called train and
test, respectively.**

In [2]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna

import cost_functions

from tqdm import tqdm
from scipy.stats import boxcox
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.feature_selection import RFE, RFECV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier 
from sklearn.metrics import classification_report, make_scorer, confusion_matrix

## Defining the s3 bucket
s3= boto3.resource('s3')
bucket_name= 'craig-shaffer-data-445-bucket'
bucket= s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'train.csv'
file_key2 = 'test.csv'

bucket_object = bucket.Object(file_key)
bucket_object2 = bucket.Object(file_key2)

file_object = bucket_object.get()
file_object2 = bucket_object2.get()

file_content_stream = file_object.get('Body')
file_content_stream2 = file_object2.get('Body')

## Reading the datafiles
train = pd.read_csv(file_content_stream, sep = '|')
test = pd.read_csv(file_content_stream2, sep = '|')

*Engineering variables from previous homeworks*

In [3]:
#variable one: low trust level (trustLevel for fraud is never >2)
train['lowTrust'] = np.where(train['trustLevel'] <= 2, 1, 0)
test['lowTrust'] = np.where(test['trustLevel'] <= 2, 1, 0)

#variable two: low value per second (the highest value per second (VPS) in the fraud data set is .231, but will set it to .25)
train['lowVPS'] = np.where(train['valuePerSecond'] <= 0.25, 1, 0)
test['lowVPS'] = np.where(test['valuePerSecond'] <= 0.25, 1, 0)

#variable three: low scan time (noticeable difference in quartiles for fraud and not fraud)
train['lowTotalScanTime'] = np.where(train['totalScanTimeInSeconds'] < 1000, 1, 0)
test['lowTotalScanTime'] = np.where(test['totalScanTimeInSeconds'] < 1000, 1, 0)

#variable four: high scannedLineItemsPerSecond (SLIPS) (SLIPS doesn't exceed .308 in fraud but goes up to 11 in not fraud)
train['highSLIPS'] = np.where(train['scannedLineItemsPerSecond'] > 0.35 , 1, 0)
test['highSLIPS'] = np.where(test['scannedLineItemsPerSecond'] > 0.35 , 1, 0)

#variable five: boxcox transformation on scannedLineItemsPerSecond
train['boxcox_SLIPS'] = boxcox(train['scannedLineItemsPerSecond'])[0]
test['boxcox_SLIPS'] = boxcox(test['scannedLineItemsPerSecond'])[0]

#variable six: 1/grandTotal
train['1_grandTotal'] = 1/(train['grandTotal'])
test['1_grandTotal'] = 1/(test['grandTotal'])

#variable seven: natural log of totalScanTimeInSeconds
train['log_totalScanTimeInSeconds']= np.log(train['totalScanTimeInSeconds'])
test['log_totalScanTimeInSeconds']= np.log(test['totalScanTimeInSeconds'])

#variable eight: lineItemVoidsPerPosition^2
train['squared_lineItemVoidsPerPosition']= np.power(train['lineItemVoidsPerPosition'], 2)
test['squared_lineItemVoidsPerPosition']= np.power(test['lineItemVoidsPerPosition'], 2)

#variable nine: attempted a scan without registration
train['madeScansWithoutRegistration'] = np.where(train['scansWithoutRegistration'] > 0, 1, 0)
test['madeScansWithoutRegistration'] = np.where(test['scansWithoutRegistration'] > 0, 1, 0)

#variable ten: made a modification to quantity
train['madeModification'] = np.where(train['quantityModifications'] > 0, 1, 0)
test['madeModification'] = np.where(test['quantityModifications'] > 0, 1, 0)

#3 heredity principle features
train['heredity_interaction_1'] = train['trustLevel'] * train['lowTrust']
test['heredity_interaction_1'] = test['trustLevel'] * test['lowTrust']

train['heredity_interaction_2'] = train['trustLevel'] * train['scannedLineItemsPerSecond']
test['heredity_interaction_2'] = test['trustLevel'] * test['scannedLineItemsPerSecond']

train['heredity_interaction_3'] = train['lowTrust'] * train['scannedLineItemsPerSecond']
test['heredity_interaction_3'] = test['lowTrust'] * test['scannedLineItemsPerSecond']


#decision tree features
train['tree_interaction_1'] = np.where(train['heredity_interaction_3'] <= 0.012, 1, 0)
test['tree_interaction_1'] = np.where(test['heredity_interaction_3'] <= 0.012, 1, 0)

train['tree_interaction_2'] = np.where((train['heredity_interaction_3'] > 0.012) & 
                                       (train['totalScanTimeInSeconds'] <= 993.0) &
                                       (train['heredity_interaction_1'] > 1.5) &
                                       (train['scansWithoutRegistration'] <= 7.5), 1, 0)
test['tree_interaction_2'] = np.where((test['heredity_interaction_3'] > 0.012) & 
                                       (test['totalScanTimeInSeconds'] <= 993.0) &
                                       (test['heredity_interaction_1'] > 1.5) &
                                       (test['scansWithoutRegistration'] <= 7.5), 1, 0)

train['tree_interaction_3'] = np.where((train['heredity_interaction_3'] > 0.012) & 
                                       (train['totalScanTimeInSeconds'] <= 993.0) &
                                       (train['heredity_interaction_1'] <= 1.5) &
                                       (train['valuePerSecond'] <= 0.119), 1, 0)
test['tree_interaction_3'] = np.where((test['heredity_interaction_3'] > 0.012) & 
                                       (test['totalScanTimeInSeconds'] <= 993.0) &
                                       (test['heredity_interaction_1'] <= 1.5) &
                                       (test['valuePerSecond'] <= 0.119), 1, 0)

In [4]:
#defining the input (top 7 features) and target variable (fraud)
x_train_7 = train[['log_totalScanTimeInSeconds', 'trustLevel', 'tree_interaction_1', 'heredity_interaction_3', 
                 'boxcox_SLIPS','scansWithoutRegistration','lineItemVoids']]
y_train = train['fraud']

#top 6 features
x_train_6 = x_train_7.drop(columns = ['lineItemVoids'])

#top 5 features
x_train_5 = x_train_6.drop(columns = ['scansWithoutRegistration'])

In [5]:
#defining scorer
my_scorer = make_scorer(cost_functions.cost_function, greater_is_better = True, needs_proba = True)

**Exercise 2: (85 points) Using the train data-frame (including the top 7 features from homework assignment 5), do the following:**

- (i) Consider a model to predict fraud. Then, do the following:
  - With the top 5 important features and using the GridSearchCV function with cv = 3, run a hyper-parameter tuning procedure on the model. Please see page 4 of DATA-MINING-CUP-2019-task.pdf file to understand how the model should be evaluated.
  - With the top 6 important features and using the GridSearchCV function with cv = 3, run a hyper-parameter tuning procedure on the model. Please see page 4 of DATA-MINING-CUP-2019-task.pdf file to understand how the model should be evaluated.
  - With the top 7 important features and using the GridSearchCV function with cv = 3, run a hyper-parameter tuning procedure on the model. Please see page 4 of DATA-MINING-CUP-2019-task.pdf file to understand how the model should be evaluated.

From above three scenarios, identify the best model; that is, the model (input features
and hyper-parameters) that has the best performance.

In [6]:
#Model: Gradient Boosting

#defining parameter dictionary
gb_param_grid = {'n_estimators': [100, 300],
                  'min_samples_split': [10, 15],
                  'min_samples_leaf': [5, 7],
                  'max_depth': [3, 5, 7],
                  'learning_rate': [0.001, 0.01, 0.1]}

#GridSearchCV w/ top 5 most important features:----------
gb_grid_search_1 = GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = gb_param_grid, cv = 3, 
                                scoring = my_scorer).fit(x_train_5, y_train)

print('Best hyper-parameter combination for GradientBoostingClassifier with top-5 variables: \n', gb_grid_search_1.best_params_)
print('\nBest score:\n', gb_grid_search_1.best_score_)
print('\n----------')
#GridSearchCV w/ top 6 most important features:----------
gb_grid_search_2 = GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = gb_param_grid, cv = 3, 
                                scoring = my_scorer).fit(x_train_6, y_train)

print('Best hyper-parameter combination for GradientBoostingClassifier with top-6 variables: \n', gb_grid_search_2.best_params_)
print('\nBest score:\n', gb_grid_search_2.best_score_)
print('\n----------')
#GridSearchCV w/ top 7 most important features:----------
gb_grid_search_3 = GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = gb_param_grid, cv = 3, 
                                scoring = my_scorer).fit(x_train_5, y_train)

print('Best hyper-parameter combination for GradientBoostingClassifier with top-7 variables: \n', gb_grid_search_3.best_params_)
print('\nBest score:\n', gb_grid_search_3.best_score_)

Best hyper-parameter combination for GradientBoostingClassifier with top-5 variables: 
 {'learning_rate': 0.01, 'max_depth': 5, 'min_samples_leaf': 7, 'min_samples_split': 10, 'n_estimators': 300}

Best score:
 -13.333333333333334

----------
Best hyper-parameter combination for GradientBoostingClassifier with top-6 variables: 
 {'learning_rate': 0.1, 'max_depth': 7, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 300}

Best score:
 -6.666666666666667

----------
Best hyper-parameter combination for GradientBoostingClassifier with top-7 variables: 
 {'learning_rate': 0.01, 'max_depth': 5, 'min_samples_leaf': 7, 'min_samples_split': 10, 'n_estimators': 300}

Best score:
 -13.333333333333334


- (ii) Consider a model different from part (i) to predict fraud. Then, do the following:
  - With the top 5 important features and using the RandomizedSearchCV function with cv = 3 and n iter = 30, run a hyper-parameter tuning procedure on the model. Please see page 4 of DATA-MINING-CUP-2019-task.pdf file to understand how the model should be evaluated.
  - With the top 6 important features and using the RandomizedSearchCV function with cv = 3 and n iter = 30, run a hyper-parameter tuning procedure on the model. Please see page 4 of DATA-MINING-CUP-2019-task.pdf file to understand how the model should be evaluated.
  - With the top 7 important features and using the RandomizedSearchCV function with cv = 3 and n iter = 30, run a hyper-parameter tuning procedure on the model. Please see page 4 of DATA-MINING-CUP-2019-task.pdf file to understand how the model should be evaluated.
  
From above three scenarios, identify the best model; that is, the model (input features and hyper-parameters) that has the best performance.

In [7]:
#Model: AdaBoost

#defining parameter dictionary
ada_param_grid = {'n_estimators': [100, 300],
                  'estimator__min_samples_split': [10, 15],
                  'estimator__min_samples_leaf': [5, 7],
                  'estimator__max_depth': [3, 5, 7],
                  'learning_rate': [0.001, 0.01, 0.1]}

#RandomizedSearchCV w/ top 5 most important features:----------
ada_randomized_search_1 = RandomizedSearchCV(estimator = AdaBoostClassifier(estimator = DecisionTreeClassifier()), 
                                             param_distributions = ada_param_grid, cv = 3, scoring = my_scorer,
                                             n_jobs = -1, n_iter = 30).fit(x_train_5, y_train)

print('Best hyper-parameter combination for AdaBoostClassifier with top-5 variables: \n', ada_randomized_search_1.best_params_)
print('\nBest score:\n', ada_randomized_search_1.best_score_)
print('\n----------')
#RandomizedSearchCV w/ top 6 most important features:----------
ada_randomized_search_2 = RandomizedSearchCV(estimator = AdaBoostClassifier(estimator = DecisionTreeClassifier()), 
                                             param_distributions = ada_param_grid, cv = 3, scoring = my_scorer,
                                             n_jobs = -1, n_iter = 30).fit(x_train_6, y_train)

print('Best hyper-parameter combination for AdaBoostClassifier with top-6 variables: \n', ada_randomized_search_2.best_params_)
print('\nBest score:\n', ada_randomized_search_2.best_score_)
print('\n----------')
#RandomizedSearchCV w/ top 7 most important features:----------
ada_randomized_search_3 = RandomizedSearchCV(estimator = AdaBoostClassifier(estimator = DecisionTreeClassifier()), 
                                             param_distributions = ada_param_grid, cv = 3, scoring = my_scorer,
                                             n_jobs = -1, n_iter = 30).fit(x_train_7, y_train)

print('Best hyper-parameter combination for AdaBoostClassifier with top-7 variables: \n', ada_randomized_search_3.best_params_)
print('\nBest score:\n', ada_randomized_search_3.best_score_)

Best hyper-parameter combination for AdaBoostClassifier with top-5 variables: 
 {'n_estimators': 300, 'learning_rate': 0.01, 'estimator__min_samples_split': 10, 'estimator__min_samples_leaf': 7, 'estimator__max_depth': 3}

Best score:
 -21.666666666666668

----------
Best hyper-parameter combination for AdaBoostClassifier with top-6 variables: 
 {'n_estimators': 100, 'learning_rate': 0.01, 'estimator__min_samples_split': 15, 'estimator__min_samples_leaf': 7, 'estimator__max_depth': 3}

Best score:
 13.333333333333334

----------
Best hyper-parameter combination for AdaBoostClassifier with top-7 variables: 
 {'n_estimators': 100, 'learning_rate': 0.01, 'estimator__min_samples_split': 10, 'estimator__min_samples_leaf': 7, 'estimator__max_depth': 3}

Best score:
 -3.3333333333333335


- (iii) Consider a model different from parts (i) & (ii) to predict fraud. Then, do the following:
  - With the top 5 important features and using the Optuna framework using 3 folds and N TRIALS = 30, run a hyper-parameter tuning procedure on the model. Please see page 4 of DATA-MINING-CUP-2019-task.pdf file to understand how the model should be evaluated.
  - With the top 6 important features and using the Optuna framework using 3 folds and N TRIALS = 30, run a hyper-parameter tuning procedure on the model. Please see page 4 of DATA-MINING-CUP-2019-task.pdf file to understand how the model should be evaluated.
  - With the top 7 important features and using the Optuna framework using 3 folds and N TRIALS = 30, run a hyper-parameter tuning procedure on the model. Please see page 4 of DATA-MINING-CUP-2019-task.pdf file to understand how the model should be evaluated.

From above three scenarios, identify the best model; that is, the model (input features and hyper-parameters) that has the best performance.

In [10]:
#Model: Random Forest

SEED = 42
N_TRIALS = 30

#Optuna w/ top 5 most important features:----------
class Objective:
    
    def __init__(self, seed):
        
        self.seed = seed
        
    def __call__(self, trial):
        
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000), 
                      min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                      min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                      max_depth = trial.suggest_int('max_depth', 2, 10))
        
        scores = list()
        
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, valid_idx in skf.split(x_train_5, y_train):
            
            x_train_1, x_valid_1 = x_train_5.iloc[train_idx], x_train_5.iloc[valid_idx]
            y_train_1, y_valid_1 = y_train.iloc[train_idx], y_train.iloc[valid_idx]
            
            rf_md = RandomForestClassifier(**params).fit(x_train_1,y_train_1)
            
            pred_valid = rf_md.predict_proba(x_valid_1)[:,1]
            
            score = cost_functions.cost_cutoff_function(y_valid_1, pred_valid)
            
            scores.append(score[0])     
            
        return np.mean(scores)

study_1 = optuna.create_study(direction = 'maximize')
study_1.optimize(Objective(SEED), n_trials = N_TRIALS)

print('Best hyper-parameter combination for RandomForestClassifier with top-5 variables: \n', study_1.best_trial.params)
print('\n----------')
#Optuna w/ top 6 most important features:----------
class Objective:
    
    def __init__(self, seed):
        
        self.seed = seed
        
    def __call__(self, trial):
        
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000), 
                      min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                      min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                      max_depth = trial.suggest_int('max_depth', 2, 10))
        
        scores = list()
        
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, valid_idx in skf.split(x_train_6, y_train):
            
            x_train_1, x_valid_1 = x_train_6.iloc[train_idx], x_train_6.iloc[valid_idx]
            y_train_1, y_valid_1 = y_train.iloc[train_idx], y_train.iloc[valid_idx]
            
            rf_md = RandomForestClassifier(**params).fit(x_train_1,y_train_1)
            
            pred_valid = rf_md.predict_proba(x_valid_1)[:,1]
            
            score = cost_functions.cost_cutoff_function(y_valid_1, pred_valid)
            
            scores.append(score[0])     
            
        return np.mean(scores)

study_2 = optuna.create_study(direction = 'maximize')
study_2.optimize(Objective(SEED), n_trials = N_TRIALS)

print('Best hyper-parameter combination for RandomForestClassifier with top-6 variables: \n', study_2.best_trial.params)
print('\n----------')
#Optuna w/ top 7 most important features:----------
class Objective:
    
    def __init__(self, seed):
        
        self.seed = seed
        
    def __call__(self, trial):
        
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000), 
                      min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                      min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                      max_depth = trial.suggest_int('max_depth', 2, 10))
        
        scores = list()
        
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, valid_idx in skf.split(x_train_7, y_train):
            
            x_train_1, x_valid_1 = x_train_7.iloc[train_idx], x_train_7.iloc[valid_idx]
            y_train_1, y_valid_1 = y_train.iloc[train_idx], y_train.iloc[valid_idx]
            
            rf_md = RandomForestClassifier(**params).fit(x_train_1,y_train_1)
            
            pred_valid = rf_md.predict_proba(x_valid_1)[:,1]
            
            score = cost_functions.cost_cutoff_function(y_valid_1, pred_valid)
            
            scores.append(score[0])     
            
        return np.mean(scores)

study_3 = optuna.create_study(direction = 'maximize')
study_3.optimize(Objective(SEED), n_trials = N_TRIALS)

print('Best hyper-parameter combination for RandomForestClassifier with top-7 variables: \n', study_3.best_trial.params)

[32m[I 2023-03-25 00:45:59,483][0m A new study created in memory with name: no-name-2f56093e-99e9-463f-b648-85359022a2d9[0m
[32m[I 2023-03-25 00:46:06,594][0m Trial 0 finished with value: -51.666666666666664 and parameters: {'n_estimators': 1464, 'min_samples_split': 11, 'min_samples_leaf': 26, 'max_depth': 9}. Best is trial 0 with value: -51.666666666666664.[0m
[32m[I 2023-03-25 00:46:15,809][0m Trial 1 finished with value: -36.666666666666664 and parameters: {'n_estimators': 1896, 'min_samples_split': 11, 'min_samples_leaf': 12, 'max_depth': 9}. Best is trial 1 with value: -36.666666666666664.[0m
[32m[I 2023-03-25 00:46:18,511][0m Trial 2 finished with value: -33.333333333333336 and parameters: {'n_estimators': 541, 'min_samples_split': 12, 'min_samples_leaf': 11, 'max_depth': 8}. Best is trial 2 with value: -33.333333333333336.[0m
[32m[I 2023-03-25 00:46:27,310][0m Trial 3 finished with value: -43.333333333333336 and parameters: {'n_estimators': 1845, 'min_samples_spli

Best hyper-parameter combination for RandomForestClassifier with top-5 variables: 
 {'n_estimators': 924, 'min_samples_split': 16, 'min_samples_leaf': 5, 'max_depth': 5}

----------


[32m[I 2023-03-25 00:48:07,551][0m Trial 0 finished with value: -46.666666666666664 and parameters: {'n_estimators': 996, 'min_samples_split': 13, 'min_samples_leaf': 29, 'max_depth': 7}. Best is trial 0 with value: -46.666666666666664.[0m
[32m[I 2023-03-25 00:48:13,982][0m Trial 1 finished with value: -56.666666666666664 and parameters: {'n_estimators': 1336, 'min_samples_split': 10, 'min_samples_leaf': 26, 'max_depth': 6}. Best is trial 0 with value: -46.666666666666664.[0m
[32m[I 2023-03-25 00:48:14,746][0m Trial 2 finished with value: -6.666666666666667 and parameters: {'n_estimators': 136, 'min_samples_split': 6, 'min_samples_leaf': 6, 'max_depth': 9}. Best is trial 2 with value: -6.666666666666667.[0m
[32m[I 2023-03-25 00:48:24,044][0m Trial 3 finished with value: -46.666666666666664 and parameters: {'n_estimators': 1916, 'min_samples_split': 7, 'min_samples_leaf': 18, 'max_depth': 8}. Best is trial 2 with value: -6.666666666666667.[0m
[32m[I 2023-03-25 00:48:32,602]

Best hyper-parameter combination for RandomForestClassifier with top-6 variables: 
 {'n_estimators': 136, 'min_samples_split': 6, 'min_samples_leaf': 6, 'max_depth': 9}

----------


[32m[I 2023-03-25 00:50:14,671][0m Trial 0 finished with value: -63.333333333333336 and parameters: {'n_estimators': 340, 'min_samples_split': 25, 'min_samples_leaf': 28, 'max_depth': 5}. Best is trial 0 with value: -63.333333333333336.[0m
[32m[I 2023-03-25 00:50:19,338][0m Trial 1 finished with value: -56.666666666666664 and parameters: {'n_estimators': 977, 'min_samples_split': 13, 'min_samples_leaf': 30, 'max_depth': 6}. Best is trial 1 with value: -56.666666666666664.[0m
[32m[I 2023-03-25 00:50:27,848][0m Trial 2 finished with value: -40.0 and parameters: {'n_estimators': 1743, 'min_samples_split': 10, 'min_samples_leaf': 15, 'max_depth': 8}. Best is trial 2 with value: -40.0.[0m
[32m[I 2023-03-25 00:50:31,126][0m Trial 3 finished with value: -78.33333333333333 and parameters: {'n_estimators': 699, 'min_samples_split': 7, 'min_samples_leaf': 16, 'max_depth': 2}. Best is trial 2 with value: -40.0.[0m
[32m[I 2023-03-25 00:50:32,568][0m Trial 4 finished with value: -70.0

Best hyper-parameter combination for RandomForestClassifier with top-7 variables: 
 {'n_estimators': 1178, 'min_samples_split': 11, 'min_samples_leaf': 7, 'max_depth': 10}


**Exercise 3: (70 points) Using the train data-frame and the models from exercise 2, split the train data-frame into two data-frames: training (80%) and validation (20%) taking into account the proportions of 0s and 1s. Then, do the following:**

In [47]:
#define input(top 7) and target(fraud)
x = train[['log_totalScanTimeInSeconds', 'trustLevel', 'tree_interaction_1', 'heredity_interaction_3', 
                 'boxcox_SLIPS','scansWithoutRegistration','lineItemVoids']]
y = train['fraud']

#splitting the data
x_training,x_validation,y_training,y_validation = train_test_split(x,y,test_size=0.2,stratify=y)

In [48]:
#selecting top 7 for test
test_7 = test[['log_totalScanTimeInSeconds', 'trustLevel', 'tree_interaction_1', 'heredity_interaction_3', 
                 'boxcox_SLIPS','scansWithoutRegistration','lineItemVoids']]

#top 6 variables (for best AdaBoost and GradientBoost models)
x_training_6 = x_training.drop(columns = ['lineItemVoids'])
x_validation_6 = x_validation.drop(columns = ['lineItemVoids'])
test_6 = test_7.drop(columns = ['lineItemVoids'])

- (i) Consider the best model from exercise 2(i). Build that model on the training data-frame. After that, predict the likelihood of fraud on the validation and test data-frames.

In [49]:
'''
Best hyper-parameter combination for GradientBoostingClassifier with top-6 variables: 
 {'learning_rate': 0.1, 'max_depth': 7, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 300}
'''

#building the GradientBoosting model
gb_md = GradientBoostingClassifier(n_estimators = 300, max_depth = 7, learning_rate = 0.1,
                                   min_samples_leaf = 5, min_samples_split = 10).fit(x_training_6, y_training)
#predicting on validation & test
gb_val_pred = gb_md.predict_proba(x_validation_6)[:, 1]
gb_test_pred = gb_md.predict_proba(test_6)[:, 1]

#computing the cost for the validation predictions
print('Cost function score of GradientBoosting model:', cost_functions.cost_function(y_validation, gb_val_pred))
print('Cutoff value for GradientBoosting model:', cost_functions.cost_cutoff_function(y_validation, gb_val_pred)[1])

Cost function score of GradientBoosting model: -15.0
Cutoff value for GradientBoosting model: 0.05


- (ii) Consider the best model from exercise 2(ii). Build that model on the training data-frame. After that, predict the likelihood of fraud on the validation and test data-frames.

In [50]:
'''
Best hyper-parameter combination for AdaBoostClassifier with top-6 variables: 
 {'n_estimators': 100, 'learning_rate': 0.01, 'estimator__min_samples_split': 15, 'estimator__min_samples_leaf': 7, 'estimator__max_depth': 3}
'''

#building the AdaBoost model
ada_md = AdaBoostClassifier(estimator = DecisionTreeClassifier(max_depth = 3, min_samples_leaf = 7, min_samples_split = 15),
                        learning_rate = 0.01, n_estimators = 100).fit(x_training_6, y_training)
#predicting on validation & test
ada_val_pred = ada_md.predict_proba(x_validation_6)[:, 1]
ada_test_pred = ada_md.predict_proba(test_6)[:, 1]

#computing the cost for the validation predictions
print('Cost function score of AdaBoost model:', cost_functions.cost_function(y_validation, ada_val_pred))
print('Cutoff value for AdaBoost model:', cost_functions.cost_cutoff_function(y_validation, ada_val_pred)[1])

Cost function score of AdaBoost model: 10.0
Cutoff value for AdaBoost model: 0.51


- (iii) Consider the best model from exercise 2(iii). Build that model on the training data-frame. After that, predict the likelihood of fraud on the validation and test data-frames.

In [51]:
'''
Best hyper-parameter combination for RandomForestClassifier with top-7 variables: 
 {'n_estimators': 1178, 'min_samples_split': 11, 'min_samples_leaf': 7, 'max_depth': 10}
'''

#building the GradientBoosting model
rf_md = RandomForestClassifier(max_depth = 10, min_samples_leaf = 7, min_samples_split = 11, n_estimators = 1178).fit(x_training, y_training)

#predicting on validation & test
rf_val_pred = rf_md.predict_proba(x_validation)[:, 1]
rf_test_pred = rf_md.predict_proba(test_7)[:, 1]

#computing the cost for the validation predictions
print('Cost function score of RandomForest model:', cost_functions.cost_function(y_validation, rf_val_pred))
print('Cutoff value for RandomForest model:', cost_functions.cost_cutoff_function(y_validation, rf_val_pred)[1])

Cost function score of RandomForest model: 5.0
Cutoff value for RandomForest model: 0.56


Using the prediction on the validation data-frame as inputs from parts (i)-(ii)-(iii) and the actual fraud values from the validation data-frame as the target variable, build a meta-learner to predict fraud. Make sure to tune the hyper-parameters of the meta-learner keeping in mind how the results are going to be evaluated. For more info, see page 4 of DATA-MINING-CUP-2019-task.pdf file. Finally, use the best meta-learner to predict the likelihood of fraud in the test data-frame. Submit the likelihoods in a csv file. Also submit the associated cut-off value.

In [59]:
x_ensemble = pd.concat([pd.DataFrame(gb_val_pred),pd.DataFrame(ada_val_pred),pd.DataFrame(rf_val_pred)], axis = 1)
x_test_ensemble = pd.concat([pd.DataFrame(gb_test_pred),pd.DataFrame(ada_test_pred),pd.DataFrame(rf_test_pred)], axis = 1)

x = x_ensemble
y = y_validation

class Objective:
    def __init__(self, seed):
        
        self.seed = seed
        
    def __call__(self, trial):
        
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000), 
                      min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                      min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                      max_depth = trial.suggest_int('max_depth', 2, 10))
        
        scores = list()
        
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, valid_idx in skf.split(x, y):
            
            x_train, x_valid = x.iloc[train_idx], x.iloc[valid_idx]
            y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
            
            rf_md = RandomForestClassifier(**params).fit(x_train,y_train)
            
            pred_valid = rf_md.predict_proba(x_valid)[:,1]
            
            score = cost_functions.cost_cutoff_function(y_valid, pred_valid)
            
            scores.append(score[0])     
            
        return np.mean(scores)

#defining seed and number of trials
SEED = 42
N_TRIALS = 30

#execute an optimization
study = optuna.create_study(direction = 'maximize')
study.optimize(Objective(SEED), n_trials = N_TRIALS)

[32m[I 2023-03-26 01:35:03,576][0m A new study created in memory with name: no-name-b32402e7-e4b6-49a6-b4c6-cb8cd52ab2e8[0m
[32m[I 2023-03-26 01:35:09,062][0m Trial 0 finished with value: -28.333333333333332 and parameters: {'n_estimators': 1304, 'min_samples_split': 17, 'min_samples_leaf': 21, 'max_depth': 10}. Best is trial 0 with value: -28.333333333333332.[0m
[32m[I 2023-03-26 01:35:14,295][0m Trial 1 finished with value: -6.666666666666667 and parameters: {'n_estimators': 1255, 'min_samples_split': 18, 'min_samples_leaf': 15, 'max_depth': 4}. Best is trial 1 with value: -6.666666666666667.[0m
[32m[I 2023-03-26 01:35:15,660][0m Trial 2 finished with value: -28.333333333333332 and parameters: {'n_estimators': 312, 'min_samples_split': 11, 'min_samples_leaf': 22, 'max_depth': 4}. Best is trial 1 with value: -6.666666666666667.[0m
[32m[I 2023-03-26 01:35:17,144][0m Trial 3 finished with value: -28.333333333333332 and parameters: {'n_estimators': 342, 'min_samples_split':

In [60]:
print('Best hyper-parameter combination for ensemble model: \n', study.best_trial.params)

Best hyper-parameter combination for ensemble model: 
 {'n_estimators': 662, 'min_samples_split': 26, 'min_samples_leaf': 5, 'max_depth': 5}


In [63]:
#building ensemble model with optimal hyperparameters (fitted on predictions from validation)
rf_ens_md =RandomForestClassifier(**study.best_trial.params).fit(x_ensemble, y_validation)

#predicting on validation & test
rf_ens_val_pred = rf_ens_md.predict_proba(x_ensemble)[:, 1]
rf_ens_test_pred = rf_ens_md.predict_proba(x_test_ensemble)[:, 1]

#identifying the optimal cutoff
opt_cutoff = cost_functions.cost_cutoff_function(y_validation, rf_ens_val_pred)[1]
print('The optimal cutoff for the ensemble  is:', opt_cutoff)

The optimal cutoff for the ensemble  is: 0.72


In [64]:
#exporting the test predictions as a csv file
likelihoods = pd.DataFrame({'Likelihoods': rf_ens_test_pred})
likelihoods.to_csv('likelihoods.csv', index = False)