In [18]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.base import clone

import os
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

#Set a random state
rs = 4

#Set the scaler
scaler = PowerTransformer()

In [19]:
#Get current directory 
path = os.getcwd() 

#Get parent directory 
parent = os.path.dirname(path)

#Move to the directory with data
train_csv = os.path.join(parent, "data", "train.csv")

#Import our dataset
dataset = pd.read_csv(train_csv, delimiter = '|')
dataset_org = dataset.copy()

#Add some new features
dataset = dataset.assign(totalItems = dataset.totalScanTimeInSeconds * dataset.scannedLineItemsPerSecond)
dataset = dataset.assign(avgLineItemValue = dataset.valuePerSecond / dataset.scannedLineItemsPerSecond)

#Add new feature suspicious
suspicious = dataset['trustLevel'].copy()
suspicious[suspicious > 2] = 3
dataset = dataset.assign(suspicious = suspicious)

dataset = dataset.drop('trustLevel', axis=1)

dataset = dataset[dataset['scannedLineItemsPerSecond'] < 4]
cutted = len(dataset_org)-len(dataset)
print(f"{cutted} ({cutted/len(dataset_org)}) entries removed .")

4 (0.0021287919105907396) entries removed .


In [20]:
#Split the dataset in X and y
X = dataset.drop('fraud', axis=1)
y = dataset.fraud

#Copy X for easier testing
X_org = X.copy()

In [21]:
X.head()

Unnamed: 0,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,totalItems,avgLineItemValue,suspicious
0,1054,54.7,7,0,3,0.027514,0.051898,0.241379,29.0,1.886207,3
1,108,27.36,5,2,4,0.12963,0.253333,0.357143,14.0,1.954286,3
2,1516,62.16,3,10,5,0.008575,0.041003,0.230769,13.0,4.781538,3
3,1791,92.31,8,4,4,0.016192,0.051541,0.275862,29.0,3.183103,3
4,430,81.53,3,7,2,0.062791,0.189605,0.111111,27.0,3.01963,3


In [22]:
#Define monetary_score as our used metric
def monetary_score_func(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    monetary_value = (cm[0,0] * 0) + (cm[1, 0] * -5) + (cm[0, 1] * -25) + (cm[1, 1] * 5)
    max_monetary_value = (cm[1,0] + cm[1,1]) * 5
    return (monetary_value / max_monetary_value)

monetary_score = make_scorer(monetary_score_func)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=rs, test_size=0.2)

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [24]:
#import keras packages
import tensorflow as tf
import kerastuner as kt
import tensorflow.keras.backend as K
from tensorflow import keras
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras import models, layers, Sequential
from kerastuner import HyperModel
from kerastuner.tuners import RandomSearch

In [25]:
# define tuner class for keras gridsearch with keras tuner
class MyTuner(kt.tuners.BayesianOptimization):
  def run_trial(self, trial, *args, **kwargs):
    # You can add additional HyperParameters for preprocessing and custom training loops
    # via overriding `run_trial`
    kwargs['batch_size'] = trial.hyperparameters.Int('batch_size', 4, 32, step=8)
    kwargs['epochs'] = trial.hyperparameters.Int('epochs', 50, 250, 50)
    super(MyTuner, self).run_trial(trial, *args, **kwargs)

In [26]:
# set random seed to get same results 
from numpy.random import seed
seed(42)
import tensorflow
tensorflow.random.set_seed(42)

input_shape = (X_train.shape[1],)

class HyperModel(HyperModel):
    def __init__(self, input_shape):
        self.input_shape = input_shape
       
    def build(self, hp):
        model = Sequential()
        model.add(
            layers.Dense(
                units=hp.Int('units', 2, 16, step=4),
                activation=hp.Choice(
                    'dense_activation',
                    values=['relu', 'tanh'],
                    default='relu'),
                input_shape=input_shape
            )
        )
        
        num_dense = hp.Int('num_dense', min_value=0, max_value=3, step=1)
        for i in range(num_dense):
            units = hp.Int('units_{i}'.format(i=i), min_value=4, max_value=16, step=8)
            dp = hp.Float('dp_{i}'.format(i=i), min_value=0., max_value=0.2, step=0.1)
            model.add(layers.Dropout(dp))
            model.add(layers.Dense(units, activation='relu'))

        
        model.add(Dense(1, activation='sigmoid'))
        
        
        hp_learning_rate = hp.Choice('learning_rate', values = [0.01, 0.001, 0.0001]) 
            
        model.compile(loss='binary_crossentropy', 
                     optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                     metrics=['accuracy'])
        
        return model
    

hypermodel = HyperModel(input_shape)

# Uses same arguments as the BayesianOptimization Tuner.

tuner = MyTuner(
            hypermodel,
            objective='val_accuracy',
            max_trials=20,
            seed=42,
            executions_per_trial=10,
            overwrite=True
        )


tuner.search(x=X_train, y=y_train,
           verbose=0,
           validation_data=(X_test,y_test),
           callbacks=[tf.keras.callbacks.EarlyStopping('val_accuracy', patience=15)])

INFO:tensorflow:Oracle triggered exit


In [27]:
best_batch_size = tuner.get_best_hyperparameters()[0].values['batch_size']
best_epochs = tuner.get_best_hyperparameters()[0].values['epochs']
print(f'best_batch_size: {best_batch_size}, best_epochs: {best_epochs}')

best_batch_size: 4, best_epochs: 50


In [28]:
# Get the best hyperparameters from the search
tuner_params = tuner.get_best_hyperparameters()[0]

# Build the model using the best hyperparameters
tuner_model = tuner.hypermodel.build(tuner_params)

tuner.get_best_models()[0].summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 2)                 24        
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 3         
Total params: 27
Trainable params: 27
Non-trainable params: 0
_________________________________________________________________


In [29]:
# fit the model to the training data
tuner_model.fit(X_train, y_train,  
                batch_size=best_batch_size, 
                epochs=best_epochs,
                verbose=0
)

<tensorflow.python.keras.callbacks.History at 0x7fdffe072fa0>

In [30]:
#How high was the reached monetary value with the application of our model?
def get_monetary_value(confusion_matrix):
    monetary_value = ((confusion_matrix[0,0] * 0) + (confusion_matrix[1, 0] * -5) + (confusion_matrix[0, 1] * -25) + (confusion_matrix[1, 1] * 5))
    return monetary_value

In [31]:
#What was the maximum monetary value? -> (FN + TP) * 5
def get_max_monetary_value(confusion_matrix):
    max_monetary_value = (confusion_matrix[1,0] + confusion_matrix[1,1]) * 5
    return max_monetary_value

In [32]:
# predict the Test classes
y_pred = tuner_model.predict_classes(X_test)

#Get the confusion matrix
cm_test = confusion_matrix(y_test, y_pred)

#Visualize the results
tn, fp, fn, tp = cm_test.ravel()
print("TN:", tn, " FP:", fp, "\nFN:", fn, "\t TP:", tp)
#TN FP

TN: 358 	 FP: 0 
FN: 6 	 TP: 11


In [33]:
#How well did our model perform?
print("Achieved monetary value:\t\t\t", get_monetary_value(cm_test))
print("Maximum monetary value:\t\t\t\t", get_max_monetary_value(cm_test))
print("Achieved monetary value compared to maximum:\t", round(get_monetary_value(cm_test) / get_max_monetary_value(cm_test), 4))

Achieved monetary value:			 25
Maximum monetary value:				 85
Achieved monetary value compared to maximum:	 0.2941


The achieved monetary score does not look too promising maybe we should continue with a simpler neural network. For comparison reasons to the other models the MLPClassifier of the sci-kit-learn package seems promising.