In [1]:
from sklearn.ensemble import StackingClassifier, VotingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.preprocessing import PowerTransformer
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from itertools import combinations 

import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

#Set a random state
rs = 4

#Set our CVs
cv_outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=rs)
cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=rs)

#Set the scaler
scaler = PowerTransformer()

In [2]:
#Get current directory 
path = os.getcwd() 

#Get parent directory 
parent = os.path.dirname(path)

#Move to the directory with data
train_csv = os.path.join(parent, "data", "train.csv")

#Import our dataset
dataset = pd.read_csv(train_csv, delimiter = '|')

#Clean the dataset (drop unrealistic entries)
dataset_org = dataset.copy()
dataset = dataset[dataset['scannedLineItemsPerSecond'] < 4]
cutted = len(dataset_org)-len(dataset)
print(f"{cutted} entries removed due to errors in feature 'scannedLineItemsPerSecond'.")

#Add new feature 'totalItems'
dataset = dataset.assign(totalItems = dataset.totalScanTimeInSeconds * dataset.scannedLineItemsPerSecond)

#Add new feature 'suspicious' as frauds only occur at trustLevels 1-2, all others are non-fraudulent
suspicious = dataset['trustLevel'].copy()
suspicious[suspicious > 2] = 3
dataset = dataset.assign(suspicious = suspicious)

#Drop 'trustLevel' as it is too similar to 'suspicious'
dataset = dataset.drop("trustLevel", axis=1)

#Add new feature 'avgLineItemValue'
dataset = dataset.assign(avgLineItemValue = dataset.valuePerSecond / dataset.scannedLineItemsPerSecond)

4 entries removed due to errors in feature 'scannedLineItemsPerSecond'.


In [3]:
#Split the dataset in X and y
X = dataset.drop('fraud', axis=1)
y = dataset.fraud

In [4]:
X.head()

Unnamed: 0,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,totalItems,suspicious,avgLineItemValue
0,1054,54.7,7,0,3,0.027514,0.051898,0.241379,29.0,3,1.886207
1,108,27.36,5,2,4,0.12963,0.253333,0.357143,14.0,3,1.954286
2,1516,62.16,3,10,5,0.008575,0.041003,0.230769,13.0,3,4.781538
3,1791,92.31,8,4,4,0.016192,0.051541,0.275862,29.0,3,3.183103
4,430,81.53,3,7,2,0.062791,0.189605,0.111111,27.0,3,3.01963


In [5]:
#Define monetary_score as our used metric
def monetary_score_func(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    monetary_value = (cm[0,0] * 0) + (cm[1, 0] * -5) + (cm[0, 1] * -25) + (cm[1, 1] * 5)
    max_monetary_value = (cm[1,0] + cm[1,1]) * 5
    return (monetary_value / max_monetary_value)

monetary_score = make_scorer(monetary_score_func)

In [6]:
#Define models and parameters
model_XG = XGBClassifier(booster="gblinear", alpha=0.00075, eta=0.03, reg_lambda=0.001, n_estimators=1000, random_state=rs, n_jobs=-1)
model_SVC = SVC(kernel="linear", C=0.6, probability=True, random_state=rs)
model_LR = LogisticRegression(solver="lbfgs", C=0.3, class_weight=None, random_state=rs, n_jobs=-1)
model_Ada = BaggingClassifier(base_estimator=AdaBoostClassifier(),
                                   n_estimators=25, n_jobs=-1,
                                   random_state=rs)

model_MLP1 = MLPClassifier(hidden_layer_sizes=(8,4,2), alpha=1.4, max_iter=500, random_state=rs)
model_MLP2 = MLPClassifier(hidden_layer_sizes=(8,4,2), alpha=1.5, max_iter=500, random_state=rs)
model_MLP3 = MLPClassifier(hidden_layer_sizes=(8,4),   alpha=1.3, max_iter=500, random_state=rs, learning_rate_init=0.005)

estimators_MLP = [
    ("MLP1", model_MLP1),
    ("MLP2", model_MLP2),
    ("MLP3", model_MLP3)
]
model_MLP = VotingClassifier(estimators=estimators_MLP, voting="soft", n_jobs=-1)


In [7]:
#Get a list of all top 5 models
model_list = [("XG", model_XG), ("SVC", model_SVC), ("LR", model_LR), ("Ada", model_Ada), ("MLP", model_MLP)]

#Add combinations of 1-5 (take away combos of 1 after)
combo5 = [com for sub in range(5) for com in combinations(model_list, sub + 1)] 
combo1 = [com for sub in range(1) for com in combinations(model_list, sub + 1)]
combos = list(set(combo5) - set(combo1))

print(len(combos))

26


In [8]:
#What does a combo element look like?
print(combos[7])

(('XG', XGBClassifier(alpha=0.00075, base_score=None, booster='gblinear',
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, eta=0.03, gamma=None, gpu_id=None,
              importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=1000, n_jobs=-1, num_parallel_tree=None,
              random_state=4, reg_alpha=None, reg_lambda=0.001,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)), ('LR', LogisticRegression(C=0.3, n_jobs=-1, random_state=4)), ('Ada', BaggingClassifier(base_estimator=AdaBoostClassifier(), n_estimators=25,
                  n_jobs=-1, random_state=4)), ('MLP', VotingClassifier(estimators=[('MLP1',
                              MLPClassifier(alpha=1.4,
                          

In [9]:
#Create a dummy, as estimators is a mandatory parameter
models_SLA = (
    ("SVC", model_SVC),  
    ("LR", model_LR),   
    ("Ada", model_Ada)
)

In [10]:
#Define model and parameters
model = StackingClassifier(estimators=models_SLA, n_jobs=-1)
params = {"model__estimators": combos,
          "model__final_estimator": [model_XG, model_SVC, model_LR, model_Ada, model_MLP]
         }

#Create the model pipeline
try:
    pipe_model = Pipeline([
        ('sampler', sampler),
        ("scaler", scaler),
        ("model", model)
    ])
    
except NameError:
    pipe_model = Pipeline([    
        ("scaler", scaler),
        ("model", model)
    ])

#Define the grid search (inner cv)
grid = GridSearchCV(estimator=pipe_model, param_grid=params, scoring=monetary_score, cv=cv_inner, n_jobs=-1)
print(pipe_model)

Pipeline(steps=[('scaler', PowerTransformer()),
                ('model',
                 StackingClassifier(estimators=(('SVC',
                                                 SVC(C=0.6, kernel='linear',
                                                     probability=True,
                                                     random_state=4)),
                                                ('LR',
                                                 LogisticRegression(C=0.3,
                                                                    n_jobs=-1,
                                                                    random_state=4)),
                                                ('Ada',
                                                 BaggingClassifier(base_estimator=AdaBoostClassifier(),
                                                                   n_estimators=25,
                                                                   n_jobs=-1,
                                  

In [11]:
#Fit the data to the GridSearch
grid.fit(X, y)

#Get the GridSearch results
results = pd.DataFrame(grid.cv_results_)

#Sort the output by mean_test_score
results.sort_values(by='mean_test_score', inplace=True, ignore_index=True, ascending=False)

#Drop everything but the param columns
results_grid = results.drop(["mean_fit_time", "std_fit_time", "mean_score_time", "std_score_time", "split0_test_score", "split1_test_score", "split2_test_score", "rank_test_score", "mean_test_score", "std_test_score"], axis=1)

#Get back mean_test_score and std_test_score (as double_std_test_score) to insert them in the first two columns
results_grid.insert(0, "mean_test_score", results["mean_test_score"])
results_grid.insert(1, "double_std_test_score", results["std_test_score"] * 2)

In [12]:
#Save the results so we can look at them again later
results_grid.to_pickle("stacking_results.pkl")

In [13]:
#Load the results if we want to look at them without running the whole code again
#results_grid = pd.read_pickle("stacking_results.pkl")

In [23]:
#See the top results (sorted by 'mean_test_score')
#results_grid[:20]

#See the top results (sorted by 'double_std_test_score')
results_grid[:80].sort_values(by='double_std_test_score')

Unnamed: 0,mean_test_score,double_std_test_score,param_model__estimators,param_model__final_estimator,params
59,0.538375,0.012676,"((SVC, SVC(C=0.6, kernel='linear', probability...","LogisticRegression(C=0.3, n_jobs=-1, random_st...","{'model__estimators': (('SVC', SVC(C=0.6, kern..."
50,0.557983,0.042783,"((SVC, SVC(C=0.6, kernel='linear', probability...","LogisticRegression(C=0.3, n_jobs=-1, random_st...","{'model__estimators': (('SVC', SVC(C=0.6, kern..."
10,0.634734,0.049806,"((XG, XGBClassifier(alpha=0.00075, base_score=...","LogisticRegression(C=0.3, n_jobs=-1, random_st...","{'model__estimators': (('XG', XGBClassifier(al..."
78,0.489916,0.068928,"((LR, LogisticRegression(C=0.3, n_jobs=-1, ran...","SVC(C=0.6, kernel='linear', probability=True, ...","{'model__estimators': (('LR', LogisticRegressi..."
57,0.538936,0.083892,"((XG, XGBClassifier(alpha=0.00075, base_score=...","LogisticRegression(C=0.3, n_jobs=-1, random_st...","{'model__estimators': (('XG', XGBClassifier(al..."
...,...,...,...,...,...
33,0.594398,0.492987,"((XG, XGBClassifier(alpha=0.00075, base_score=...","VotingClassifier(estimators=[('MLP1',\n ...","{'model__estimators': (('XG', XGBClassifier(al..."
60,0.536695,0.493819,"((XG, XGBClassifier(alpha=0.00075, base_score=...","XGBClassifier(alpha=0.00075, base_score=None, ...","{'model__estimators': (('XG', XGBClassifier(al..."
32,0.594958,0.498621,"((XG, XGBClassifier(alpha=0.00075, base_score=...","VotingClassifier(estimators=[('MLP1',\n ...","{'model__estimators': (('XG', XGBClassifier(al..."
42,0.575350,0.520776,"((SVC, SVC(C=0.6, kernel='linear', probability...","VotingClassifier(estimators=[('MLP1',\n ...","{'model__estimators': (('SVC', SVC(C=0.6, kern..."


In [18]:
print(results_grid.loc[59, "params"])

{'model__estimators': (('SVC', SVC(C=0.6, kernel='linear', probability=True, random_state=4)), ('MLP', VotingClassifier(estimators=[('MLP1',
                              MLPClassifier(alpha=1.4,
                                            hidden_layer_sizes=(8, 4, 2),
                                            max_iter=500, random_state=4)),
                             ('MLP2',
                              MLPClassifier(alpha=1.5,
                                            hidden_layer_sizes=(8, 4, 2),
                                            max_iter=500, random_state=4)),
                             ('MLP3',
                              MLPClassifier(alpha=1.3,
                                            hidden_layer_sizes=(8, 4),
                                            learning_rate_init=0.005,
                                            max_iter=500, random_state=4))],
                 n_jobs=-1, voting='soft'))), 'model__final_estimator': LogisticRegression(C=0.3, n_

In [19]:
print(results_grid.loc[10, "params"])

{'model__estimators': (('XG', XGBClassifier(alpha=0.00075, base_score=None, booster='gblinear',
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, eta=0.03, gamma=None, gpu_id=None,
              importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=1000, n_jobs=-1, num_parallel_tree=None,
              random_state=4, reg_alpha=None, reg_lambda=0.001,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)), ('SVC', SVC(C=0.6, kernel='linear', probability=True, random_state=4)), ('LR', LogisticRegression(C=0.3, n_jobs=-1, random_state=4)), ('MLP', VotingClassifier(estimators=[('MLP1',
                              MLPClassifier(alpha=1.4,
                                            hidden_layer_si

In [20]:
print(results_grid.loc[3, "params"])

{'model__estimators': (('XG', XGBClassifier(alpha=0.00075, base_score=None, booster='gblinear',
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, eta=0.03, gamma=None, gpu_id=None,
              importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=1000, n_jobs=-1, num_parallel_tree=None,
              random_state=4, reg_alpha=None, reg_lambda=0.001,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)), ('SVC', SVC(C=0.6, kernel='linear', probability=True, random_state=4)), ('LR', LogisticRegression(C=0.3, n_jobs=-1, random_state=4)), ('Ada', BaggingClassifier(base_estimator=AdaBoostClassifier(), n_estimators=25,
                  n_jobs=-1, random_state=4))), 'model__final_estimator': Logist

In [21]:
print(results_grid.loc[4, "params"])

{'model__estimators': (('XG', XGBClassifier(alpha=0.00075, base_score=None, booster='gblinear',
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, eta=0.03, gamma=None, gpu_id=None,
              importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=1000, n_jobs=-1, num_parallel_tree=None,
              random_state=4, reg_alpha=None, reg_lambda=0.001,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)), ('SVC', SVC(C=0.6, kernel='linear', probability=True, random_state=4)), ('LR', LogisticRegression(C=0.3, n_jobs=-1, random_state=4)), ('Ada', BaggingClassifier(base_estimator=AdaBoostClassifier(), n_estimators=25,
                  n_jobs=-1, random_state=4)), ('MLP', VotingClassifier(estimato

In [24]:
print(results_grid.loc[0, "params"])

{'model__estimators': (('Ada', BaggingClassifier(base_estimator=AdaBoostClassifier(), n_estimators=25,
                  n_jobs=-1, random_state=4)), ('MLP', VotingClassifier(estimators=[('MLP1',
                              MLPClassifier(alpha=1.4,
                                            hidden_layer_sizes=(8, 4, 2),
                                            max_iter=500, random_state=4)),
                             ('MLP2',
                              MLPClassifier(alpha=1.5,
                                            hidden_layer_sizes=(8, 4, 2),
                                            max_iter=500, random_state=4)),
                             ('MLP3',
                              MLPClassifier(alpha=1.3,
                                            hidden_layer_sizes=(8, 4),
                                            learning_rate_init=0.005,
                                            max_iter=500, random_state=4))],
                 n_jobs=-1, voting='soft'))),

In [25]:
#Alternative A - use best model as final_clf
final_pipe = clone(pipe_model)
final_pipe = final_pipe.set_params(**results_grid.loc[10, "params"])
final_pipe

Pipeline(steps=[('scaler', PowerTransformer()),
                ('model',
                 StackingClassifier(estimators=(('XG',
                                                 XGBClassifier(alpha=0.00075,
                                                               base_score=None,
                                                               booster='gblinear',
                                                               colsample_bylevel=None,
                                                               colsample_bynode=None,
                                                               colsample_bytree=None,
                                                               eta=0.03,
                                                               gamma=None,
                                                               gpu_id=None,
                                                               importance_type='gain',
                                                            