In [None]:
pip install xgboost

In [None]:
pip install optuna

In [4]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import recall_score, classification_report, confusion_matrix, make_scorer
from sklearn.impute import SimpleImputer, KNNImputer
from scipy.stats import boxcox
from sklearn.feature_selection import RFE
from cost_function import cost_function, cost_function_cutoff
from xgboost import XGBClassifier

s3 = boto3.resource('s3')
bucket_name = 'grant-gonnerman-data-445'
bucket = s3.Bucket(bucket_name)

file_key = 'train.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

file_key2 = 'test.csv'

bucket_object2 = bucket.Object(file_key2)
file_object2 = bucket_object2.get()
file_content_stream2 = file_object2.get('Body')

# reading data file
train = pd.read_csv(file_content_stream, delimiter = '|')
test = pd.read_csv(file_content_stream2, delimiter = '|')
train.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,0
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,0


In [None]:
# top 5: trustLevel, totalScanTimeInSeconds, interation_1, heredity_1, heridity_2
# top 6: trustLevel, totalScanTimeInSeconds, interation_1, heredity_1, heridity_2, DT_1
# top 7: trustLevel, totalScanTimeInSeconds, interation_1, heredity_1, heridity_2, scansWithoutRegistration, DT_1,

In [5]:
train['interaction_1'] = train['totalScanTimeInSeconds'] * train['scannedLineItemsPerSecond']
train['heredity_1'] = train['trustLevel'] * train['interaction_1']
train['heredity_2'] = train['trustLevel'] * train['scannedLineItemsPerSecond']
train['DT_1'] = np.where((train['trustLevel'] <= 1.5) & (train['heredity_1'] <= 4.412) & (train['totalScanTimeInSeconds'] <= 281.406), 1, 0)

test['interaction_1'] = test['totalScanTimeInSeconds'] * test['scannedLineItemsPerSecond']
test['heredity_1'] = test['trustLevel'] * test['interaction_1']
test['heredity_2'] = test['trustLevel'] * test['scannedLineItemsPerSecond']
test['DT_1'] = np.where((test['trustLevel'] <= 1.5) & (test['heredity_1'] <= 4.412) & (test['totalScanTimeInSeconds'] <= 281.406), 1, 0)

# GridSearchCV

In [24]:
# defining imput and target
x1 = train[['trustLevel', 'totalScanTimeInSeconds', 'interaction_1', 'heredity_1', 'heredity_2']]
y1 = train['fraud']

x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(x1, y1, test_size = 0.2, stratify = y1)

# grid search cv
RF_param_grid = {'n_estimators': [100, 300, 500],
                 'min_samples_split': [10, 15],
                 'min_samples_leaf': [5, 7],
                 'max_depth': [3, 5, 7]}

# defining custom scorer
my_scorer_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

# running grid search 
rf_grid1 = GridSearchCV(estimator = RandomForestClassifier(), param_grid = RF_param_grid, cv = 3, scoring = my_scorer_function, n_jobs = -1).fit(x_train_1, y_train_1)

# extracting best hyper params
rf_grid1.best_params_

# running model with best parameters
rf_opt1 = RandomForestClassifier(**rf_grid1.best_params_).fit(x_train_1, y_train_1)

# predicting on test
rf_pred1 = rf_opt1.predict_proba(x_test_1)[:,1]

# getting optimal cutoff
opt_cutoff1 = cost_function_cutoff(y_test_1, rf_pred1)

# likilyhoods to labels
rf_label1 = np.where(rf_pred1 < opt_cutoff1, 0, 1)

# scoring the model 
con_mat = confusion_matrix(y_test_1, rf_label1)
print(con_mat)
print('the cost of the rf model is: ', -25 * con_mat[1, 0] - 5 * con_mat[0, 1] + 5 * con_mat[1, 1])

[[347   8]
 [  0  21]]
the cost of the rf model is:  65


In [8]:
# defining imput and target
x2 = train[['trustLevel', 'totalScanTimeInSeconds', 'interaction_1', 'heredity_1', 'heredity_2', 'DT_1']]
y2 = train['fraud']

x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(x2, y2, test_size = 0.2, stratify = y2)

# running grid search 
rf_grid2 = GridSearchCV(estimator = RandomForestClassifier(), param_grid = RF_param_grid, cv = 3, scoring = my_scorer_function, n_jobs = -1).fit(x_train_2, y_train_2)

# extracting best hyper params
rf_grid2.best_params_

# running model with best parameters
rf_opt2 = RandomForestClassifier(**rf_grid2.best_params_).fit(x_train_2, y_train_2)

# predicting on test
rf_pred2 = rf_opt2.predict_proba(x_test_2)[:,1]

# getting optimal cutoff
opt_cutoff2 = cost_function_cutoff(y_test_2, rf_pred2)

# likilyhoods to labels
rf_label2 = np.where(rf_pred2 < opt_cutoff2, 0, 1)

# scoring the model 
con_mat = confusion_matrix(y_test_2, rf_label2)
print(con_mat)
print('the cost of the rf model is: ', -25 * con_mat[1, 0] - 5 * con_mat[0, 1] + 5 * con_mat[1, 1])

[[341  14]
 [  1  20]]
the cost of the rf model is:  5


In [9]:
# defining imput and target
x3 = train[['trustLevel', 'totalScanTimeInSeconds', 'interaction_1', 'heredity_1', 'heredity_2', 'DT_1', 'scansWithoutRegistration']]
y3 = train['fraud']

x_train_3, x_test_3, y_train_3, y_test_3 = train_test_split(x3, y3, test_size = 0.2, stratify = y3)

# running grid search 
rf_grid3 = GridSearchCV(estimator = RandomForestClassifier(), param_grid = RF_param_grid, cv = 3, scoring = my_scorer_function, n_jobs = -1).fit(x_train_3, y_train_3)

# extracting best hyper params
rf_grid3.best_params_

# running model with best parameters
rf_opt3 = RandomForestClassifier(**rf_grid3.best_params_).fit(x_train_3, y_train_3)

# predicting on test
rf_pred3 = rf_opt3.predict_proba(x_test_3)[:,1]

# getting optimal cutoff
opt_cutoff3 = cost_function_cutoff(y_test_3, rf_pred3)

# likilyhoods to labels
rf_label3 = np.where(rf_pred3 < opt_cutoff3, 0, 1)

# scoring the model 
con_mat = confusion_matrix(y_test_3, rf_label3)
print(con_mat)
print('the cost of the rf model is: ', -25 * con_mat[1, 0] - 5 * con_mat[0, 1] + 5 * con_mat[1, 1])

[[349   6]
 [  0  21]]
the cost of the rf model is:  75


# RandomizedSearchCV

In [10]:
Gb_param_grid = {'n_estimators': [100, 300],
                    'min_samples_split': [10, 15],
                    'min_samples_leaf': [5, 7],
                    'max_depth': [3, 5, 7],
                    'learning_rate': [0.01]}

# running grid search 
gb_grid1 = RandomizedSearchCV(estimator = GradientBoostingClassifier(), 
                               param_distributions = Gb_param_grid, cv = 3, scoring = my_scorer_function, n_jobs = -1).fit(x_train_1, y_train_1)

# extracting best hyper params
gb_grid1.best_params_

# running model with best parameters
gb_opt1 = GradientBoostingClassifier(**gb_grid1.best_params_).fit(x_train_1, y_train_1)

# predicting on test
gb_pred1 = gb_opt1.predict_proba(x_test_1)[:,1]

# getting optimal cutoff
opt_cutoff1 = cost_function_cutoff(y_test_1, gb_pred1)

# likilyhoods to labels
gb_label1 = np.where(gb_pred1 < opt_cutoff1, 0, 1)

# scoring the model 
con_mat = confusion_matrix(y_test_1, gb_label1)
print(con_mat)
print('the cost of the gb model is: ', -25 * con_mat[1, 0] - 5 * con_mat[0, 1] + 5 * con_mat[1, 1])

[[347   8]
 [  0  21]]
the cost of the gb model is:  65


In [11]:
# running grid search 
gb_grid2 = RandomizedSearchCV(estimator = GradientBoostingClassifier(), 
                               param_distributions = Gb_param_grid, cv = 3, scoring = my_scorer_function, n_jobs = -1).fit(x_train_2, y_train_2)

# extracting best hyper params
gb_grid2.best_params_

# running model with best parameters
gb_opt2 = GradientBoostingClassifier(**gb_grid2.best_params_).fit(x_train_2, y_train_2)

# predicting on test
gb_pred2 = gb_opt2.predict_proba(x_test_2)[:,1]

# getting optimal cutoff
opt_cutoff2 = cost_function_cutoff(y_test_2, gb_pred2)

# likilyhoods to labels
gb_label2 = np.where(gb_pred2 < opt_cutoff2, 0, 1)

# scoring the model 
con_mat = confusion_matrix(y_test_2, gb_label2)
print(con_mat)
print('the cost of the gb model is: ', -25 * con_mat[1, 0] - 5 * con_mat[0, 1] + 5 * con_mat[1, 1])

[[346   9]
 [  4  17]]
the cost of the gb model is:  -60


In [12]:
# running grid search 
gb_grid3 = RandomizedSearchCV(estimator = GradientBoostingClassifier(), 
                               param_distributions = Gb_param_grid, cv = 3, scoring = my_scorer_function, n_jobs = -1).fit(x_train_3, y_train_3)

# extracting best hyper params
gb_grid3.best_params_

# running model with best parameters
gb_opt3 = GradientBoostingClassifier(**gb_grid3.best_params_).fit(x_train_3, y_train_3)

# predicting on test
gb_pred3 = gb_opt3.predict_proba(x_test_3)[:,1]

# getting optimal cutoff
opt_cutoff3 = cost_function_cutoff(y_test_3, gb_pred3)

# likilyhoods to labels
gb_label3 = np.where(gb_pred3 < opt_cutoff3, 0, 1)

# scoring the model 
con_mat = confusion_matrix(y_test_3, gb_label3)
print(con_mat)
print('the cost of the gb model is: ', -25 * con_mat[1, 0] - 5 * con_mat[0, 1] + 5 * con_mat[1, 1])

[[348   7]
 [  0  21]]
the cost of the gb model is:  70


# Optuna

In [22]:
class objective:
    
    def __init__(self, seed):
        self.seed = seed
        
    def __call__(self, trial):
        params = dict(n_estimators = trial.suggest_int('n_estimators', 300, 500),
                     max_depth = trial.suggest_int('max_depth', 3, 7),
                     min_child_weight = trial.suggest_int('min_child_weight', 5, 7),
                     learning_rate= trial.suggest_float('learning_rate', 0.01, 0.1),
                     gamma= trial.suggest_float('gamma', 0.1, 0.3),
                     subsample= trial.suggest_float('subsample', 0.8, 1))
        scores = list()
        
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, test_idx in skf.split(x1, y1):
            
            ## Splitting the data 
            x_train, x_test = x1.iloc[train_idx], x1.iloc[test_idx]
            y_train, y_test = y1.iloc[train_idx], y1.iloc[test_idx]
            
            xgb_md = XGBClassifier(**params).fit(x_train, y_train)
            
            xgb_pred = xgb_md.predict_proba(x_test)[:,1]
            
            score = cost_function(y_test, xgb_pred)
            scores.append(score)
        
        return np.mean(scores)

seed = 42
n_trials = 20

study_meta = optuna.create_study(direction = 'maximize')
study_meta.optimize(objective(seed), n_trials = n_trials)
study_meta.best_trial.params

# running model with best parameters
xgb_opt1 = XGBClassifier(**study_meta.best_trial.params).fit(x_train_1, y_train_1)

# predicting on test
xgb_pred1 = xgb_opt1.predict_proba(x_test_1)[:,1]

# getting optimal cutoff
opt_cutoff1 = cost_function_cutoff(y_test_1, xgb_pred1)

# likilyhoods to labels
xgb_label1 = np.where(xgb_pred1 < opt_cutoff1, 0, 1)

# scoring the model 
con_mat = confusion_matrix(y_test_1, xgb_label1)
print(con_mat)
print('the cost of the xgb model is: ', -25 * con_mat[1, 0] - 5 * con_mat[0, 1] + 5 * con_mat[1, 1])

[32m[I 2023-03-31 20:36:23,509][0m A new study created in memory with name: no-name-2714ff73-0c30-4d13-a94a-a3776d407137[0m
[32m[I 2023-03-31 20:36:24,625][0m Trial 0 finished with value: 6.666666666666667 and parameters: {'n_estimators': 365, 'max_depth': 5, 'min_child_weight': 7, 'learning_rate': 0.08799433405250016, 'gamma': 0.1597397219101332, 'subsample': 0.9561336306591741}. Best is trial 0 with value: 6.666666666666667.[0m
[32m[I 2023-03-31 20:36:25,410][0m Trial 1 finished with value: 13.333333333333334 and parameters: {'n_estimators': 458, 'max_depth': 7, 'min_child_weight': 6, 'learning_rate': 0.015552765108463137, 'gamma': 0.1137143155567222, 'subsample': 0.8228238086975045}. Best is trial 1 with value: 13.333333333333334.[0m
[32m[I 2023-03-31 20:36:26,186][0m Trial 2 finished with value: 8.333333333333334 and parameters: {'n_estimators': 420, 'max_depth': 7, 'min_child_weight': 6, 'learning_rate': 0.02557051251937307, 'gamma': 0.1224374927110069, 'subsample': 0.9

[[347   8]
 [  0  21]]
the cost of the xgb model is:  65


In [15]:
class objective:
    
    def __init__(self, seed):
        self.seed = seed
        
    def __call__(self, trial):
        params = dict(n_estimators = trial.suggest_int('n_estimators', 300, 500),
                     max_depth = trial.suggest_int('max_depth', 3, 7),
                     min_child_weight = trial.suggest_int('min_child_weight', 5, 7),
                     learning_rate= trial.suggest_float('learning_rate', 0.01, 0.1),
                     gamma= trial.suggest_float('gamma', 0.1, 0.3),
                     subsample= trial.suggest_float('subsample', 0.8, 1))
        scores = list()
        
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, test_idx in skf.split(x2, y2):
            
            ## Splitting the data 
            x_train, x_test = x2.iloc[train_idx], x2.iloc[test_idx]
            y_train, y_test = y2.iloc[train_idx], y2.iloc[test_idx]
            
            xgb_md = XGBClassifier(**params).fit(x_train, y_train)
            
            xgb_pred = xgb_md.predict_proba(x_test)[:,1]
            
            score = cost_function(y_test, xgb_pred)
            scores.append(score)
        
        return np.mean(scores)

seed = 42
n_trials = 20

study2 = optuna.create_study(direction = 'maximize')
study2.optimize(objective(seed), n_trials = n_trials)
study2.best_trial.params

# running model with best parameters
xgb_opt2 = XGBClassifier(**study2.best_trial.params).fit(x_train_2, y_train_2)

# predicting on test
xgb_pred2 = xgb_opt2.predict_proba(x_test_2)[:,1]

# getting optimal cutoff
opt_cutoff2 = cost_function_cutoff(y_test_2, xgb_pred2)

# likilyhoods to labels
xgb_label2 = np.where(xgb_pred2 < opt_cutoff2, 0, 1)

# scoring the model 
con_mat = confusion_matrix(y_test_2, xgb_label2)
print(con_mat)
print('the cost of the xgb model is: ', -25 * con_mat[1, 0] - 5 * con_mat[0, 1] + 5 * con_mat[1, 1])

[32m[I 2023-03-31 20:34:06,456][0m A new study created in memory with name: no-name-a45ac854-88cc-4a11-8d0e-3621992f858d[0m
[32m[I 2023-03-31 20:34:07,125][0m Trial 0 finished with value: 20.0 and parameters: {'n_estimators': 383, 'max_depth': 5, 'min_child_weight': 7, 'learning_rate': 0.02097612352106601, 'gamma': 0.238325848458318, 'subsample': 0.9500757864263396}. Best is trial 0 with value: 20.0.[0m
[32m[I 2023-03-31 20:34:07,752][0m Trial 1 finished with value: 16.666666666666668 and parameters: {'n_estimators': 407, 'max_depth': 4, 'min_child_weight': 6, 'learning_rate': 0.046333008663413756, 'gamma': 0.2326492246395072, 'subsample': 0.9295858824762935}. Best is trial 0 with value: 20.0.[0m
[32m[I 2023-03-31 20:34:08,369][0m Trial 2 finished with value: 13.333333333333334 and parameters: {'n_estimators': 369, 'max_depth': 7, 'min_child_weight': 5, 'learning_rate': 0.04867347135815458, 'gamma': 0.21029846143782577, 'subsample': 0.9987594611050464}. Best is trial 0 with 

[[344  11]
 [  1  20]]
the cost of the xgb model is:  20


In [23]:
class objective:
    
    def __init__(self, seed):
        self.seed = seed
        
    def __call__(self, trial):
        params = dict(n_estimators = trial.suggest_int('n_estimators', 300, 500),
                     max_depth = trial.suggest_int('max_depth', 3, 7),
                     min_child_weight = trial.suggest_int('min_child_weight', 5, 7),
                     learning_rate= trial.suggest_float('learning_rate', 0.01, 0.1),
                     gamma= trial.suggest_float('gamma', 0.1, 0.3),
                     subsample= trial.suggest_float('subsample', 0.8, 1))
        scores = list()
        
        skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, test_idx in skf.split(x3, y3):
            
            ## Splitting the data 
            x_train, x_test = x3.iloc[train_idx], x3.iloc[test_idx]
            y_train, y_test = y3.iloc[train_idx], y3.iloc[test_idx]
            
            xgb_md = XGBClassifier(**params).fit(x_train, y_train)
            
            xgb_pred = xgb_md.predict_proba(x_test)[:,1]
            
            score = cost_function(y_test, xgb_pred)
            scores.append(score)
        
        return np.mean(scores)

seed = 42
n_trials = 20

study3 = optuna.create_study(direction = 'maximize')
study3.optimize(objective(seed), n_trials = n_trials)
study3.best_trial.params

# running model with best parameters
xgb_opt3 = XGBClassifier(**study3.best_trial.params).fit(x_train_3, y_train_3)

# predicting on test
xgb_pred3 = xgb_opt3.predict_proba(x_test_3)[:,1]

# getting optimal cutoff
opt_cutoff3 = cost_function_cutoff(y_test_3, xgb_pred3)

# likilyhoods to labels
xgb_label3 = np.where(xgb_pred3 < opt_cutoff3, 0, 1)

# scoring the model 
con_mat = confusion_matrix(y_test_3, xgb_label3)
print(con_mat)
print('the cost of the xgb model is: ', -25 * con_mat[1, 0] - 5 * con_mat[0, 1] + 5 * con_mat[1, 1])

[32m[I 2023-03-31 20:36:41,674][0m A new study created in memory with name: no-name-f76a4b07-9060-4418-ac38-5a240c738313[0m
[32m[I 2023-03-31 20:36:42,330][0m Trial 0 finished with value: 46.666666666666664 and parameters: {'n_estimators': 444, 'max_depth': 7, 'min_child_weight': 7, 'learning_rate': 0.09797280789474971, 'gamma': 0.19717254783308824, 'subsample': 0.8566769545266859}. Best is trial 0 with value: 46.666666666666664.[0m
[32m[I 2023-03-31 20:36:43,055][0m Trial 1 finished with value: 65.0 and parameters: {'n_estimators': 456, 'max_depth': 3, 'min_child_weight': 5, 'learning_rate': 0.03821288029680182, 'gamma': 0.10263846799414456, 'subsample': 0.8193509103649181}. Best is trial 1 with value: 65.0.[0m
[32m[I 2023-03-31 20:36:43,568][0m Trial 2 finished with value: 53.333333333333336 and parameters: {'n_estimators': 312, 'max_depth': 4, 'min_child_weight': 7, 'learning_rate': 0.06289536751279884, 'gamma': 0.17686234085559369, 'subsample': 0.8232796665918554}. Best 

[[352   3]
 [  0  21]]
the cost of the xgb model is:  90


In [126]:
print('best rf model: ', rf_grid3.best_params_)

best rf model:  {'max_depth': 7, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 100}


In [121]:
print('best gb model: ', gb_grid3.best_params_)

best gb model:  {'n_estimators': 300, 'min_samples_split': 15, 'min_samples_leaf': 7, 'max_depth': 3, 'learning_rate': 0.01}


In [26]:
print('best xgb model: ', study3.best_trial.params)

best xgb model:  {'n_estimators': 323, 'max_depth': 5, 'min_child_weight': 5, 'learning_rate': 0.08199481025626429, 'gamma': 0.18013848692542023, 'subsample': 0.9112569903715746}


# Meta Learner

In [28]:
# defining imput and target
x = train[['trustLevel', 'totalScanTimeInSeconds', 'interaction_1', 'heredity_1', 'heredity_2']]
y = train['fraud']

test = test[['trustLevel', 'totalScanTimeInSeconds', 'interaction_1', 'heredity_1', 'heredity_2']]

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.2, stratify = y)
x_val, x_test, y_val, y_test = train_test_split(x_val, y_val, test_size = 0.5, stratify = y_val)

# running model with best parameters
rf_md = RandomForestClassifier(max_depth = 7, min_samples_leaf = 5, min_samples_split = 10, n_estimators = 100).fit(x_train, y_train)
rf_val_pred = rf_md.predict_proba(x_val)[:,1]
rf_test_pred = rf_md.predict_proba(x_test)[:,1]

gb_md = GradientBoostingClassifier(n_estimators = 300, min_samples_split = 15, min_samples_leaf = 7, max_depth = 3, learning_rate = 0.01).fit(x_train, y_train)
gb_val_pred = gb_md.predict_proba(x_val)[:,1]
gb_test_pred = gb_md.predict_proba(x_test)[:,1]

xgb_md = XGBClassifier(n_estimators = 323, max_depth = 5, min_child_weight = 5, learning_rate = 0.08199481025626429, gamma = 0.18013848692542023, subsample = 0.9112569903715746).fit(x_train, y_train)
xgb_val_pred = xgb_md.predict_proba(x_val)[:,1]
xgb_test_pred = xgb_md.predict_proba(x_test)[:,1]

In [33]:
# creating predictions dataframe
x_preds = pd.DataFrame({'rf': rf_val_pred, 'gb': gb_val_pred, 'xgb': xgb_val_pred})
x_test_preds = pd.DataFrame({'rf': rf_test_pred, 'gb': gb_test_pred, 'xgb': xgb_test_pred})

XGB_param_grid = {'n_estimators': [300, 500],
                    'max_depth': [3, 7],
                     'min_child_weight': [5, 7],
                     'learning_rate': [0.01, 0.1],
                     'gamma': [0.1, 0.3],
                     'subsample': [0.8, 1]}

# running grid search 
xgb_meta_grid = RandomizedSearchCV(estimator = XGBClassifier(), 
                               param_distributions = XGB_param_grid, cv = 3, scoring = my_scorer_function, n_jobs = -1).fit(x_preds, y_val)

# running model with best parameters
meta_opt = XGBClassifier(**xgb_meta_grid.best_params_).fit(x_preds, y_val)

# predicting on test
meta_pred = meta_opt.predict_proba(x_test_preds)[:,1]

# getting optimal cutoff
opt_cutoff = cost_function_cutoff(y_val, meta_pred)

# likilyhoods to labels
meta_label = np.where(meta_pred < opt_cutoff, 0, 1)

# scoring the model 
con_mat = confusion_matrix(y_val, meta_label)
print(con_mat)
print('the cost of the meta model is: ', -25 * con_mat[1, 0] - 5 * con_mat[0, 1] + 5 * con_mat[1, 1])

[[178   0]
 [ 10   0]]
the cost of the meta model is:  -250


In [37]:
print('Optimal Cutoff: ', opt_cutoff)

Optimal Cutoff:  0.51


In [39]:
preds = pd.DataFrame(meta_pred).to_csv()