In [1]:
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.preprocessing import PowerTransformer
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

import os
import pandas as pd
import warnings
#Suppress all warnings for the rest of the session
warnings.filterwarnings(action="ignore")

#Seed passed as random_state to the estimators defined further down
rs = 4

#Transformer used as the "scaler" step of the final pipeline
scaler = PowerTransformer()

In [2]:
#Build the path <parent of the working directory>/data/train.csv
path = os.getcwd()
parent = os.path.dirname(path)
train_csv = os.path.join(parent, "data", "train.csv")

#Read the pipe-delimited training data; dataset_org keeps the unfiltered rows
dataset_org = pd.read_csv(train_csv, sep="|")

#Clean the dataset (drop unrealistic entries): keep rows scanned at < 4 items per second
keep_rows = dataset_org["scannedLineItemsPerSecond"] < 4
dataset = dataset_org[keep_rows]
cutted = len(dataset_org) - len(dataset)
print(f"{cutted} entries removed due to errors in feature 'scannedLineItemsPerSecond'.")

#'suspicious' mirrors 'trustLevel' but maps every level above 2 to 3
#(frauds only occur at trustLevels 1-2, all others are non-fraudulent)
suspicious = dataset["trustLevel"].mask(dataset["trustLevel"] > 2, 3)

#Append the engineered features in one pass, then drop 'trustLevel'
#because it is too similar to 'suspicious'
dataset = (
    dataset
    .assign(
        totalItems=dataset["totalScanTimeInSeconds"] * dataset["scannedLineItemsPerSecond"],
        suspicious=suspicious,
        avgLineItemValue=dataset["valuePerSecond"] / dataset["scannedLineItemsPerSecond"],
    )
    .drop(columns="trustLevel")
)

4 entries removed due to errors in feature 'scannedLineItemsPerSecond'.


In [3]:
#Separate the target column 'fraud' (y) from the feature columns (X)
y = dataset["fraud"]
X = dataset.drop(columns="fraud")

In [4]:
#Preview the first five rows of the training features
X.head(n=5)

Unnamed: 0,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,totalItems,suspicious,avgLineItemValue
0,1054,54.7,7,0,3,0.027514,0.051898,0.241379,29.0,3,1.886207
1,108,27.36,5,2,4,0.12963,0.253333,0.357143,14.0,3,1.954286
2,1516,62.16,3,10,5,0.008575,0.041003,0.230769,13.0,3,4.781538
3,1791,92.31,8,4,4,0.016192,0.051541,0.275862,29.0,3,3.183103
4,430,81.53,3,7,2,0.062791,0.189605,0.111111,27.0,3,3.01963


In [5]:
#Base learners of the stacking ensemble (all seeded with rs)
model_XG = XGBClassifier(
    booster="gblinear",
    alpha=0.00075,
    eta=0.03,
    reg_lambda=0.001,
    n_estimators=1000,
    random_state=rs,
    n_jobs=-1,
)
model_SVC = SVC(
    kernel="linear",
    C=0.6,
    probability=True,
    random_state=rs,
)
model_LR = LogisticRegression(
    solver="lbfgs",
    C=0.3,
    class_weight=None,
    random_state=rs,
    n_jobs=-1,
)

#Three MLP variants that differ in architecture / regularisation strength;
#the settings they have in common are factored out
_mlp_shared = {"max_iter": 500, "random_state": rs}
model_MLP1 = MLPClassifier(hidden_layer_sizes=(8, 4, 2), alpha=1.4, **_mlp_shared)
model_MLP2 = MLPClassifier(hidden_layer_sizes=(8, 4, 2), alpha=1.5, **_mlp_shared)
model_MLP3 = MLPClassifier(hidden_layer_sizes=(8, 4), alpha=1.3, learning_rate_init=0.005, **_mlp_shared)

#Soft-voting ensemble of the three MLPs
estimators_MLP = list(zip(
    ("MLP1", "MLP2", "MLP3"),
    (model_MLP1, model_MLP2, model_MLP3),
))
model_MLP = VotingClassifier(estimators=estimators_MLP, voting="soft", n_jobs=-1)

#(name, estimator) pairs handed to the StackingClassifier
models_stacking = list(zip(
    ("XG", "SVC", "LR", "MLP"),
    (model_XG, model_SVC, model_LR, model_MLP),
))

In [6]:
#Define model and parameters: stack the base learners and combine their outputs
#with a logistic-regression final estimator
model = StackingClassifier(estimators=models_stacking, final_estimator=model_LR, n_jobs=-1)

#Create the model pipeline.
#The scaling step is optional: it is added only when a `scaler` object exists in the
#notebook namespace and is not None. The name is looked up explicitly instead of
#catching NameError around the Pipeline construction, so the choice between the two
#pipeline layouts is visible in the code rather than driven by an exception.
pipeline_steps = []
optional_scaler = globals().get("scaler")
if optional_scaler is not None:
    pipeline_steps.append(("scaler", optional_scaler))
pipeline_steps.append(("model", model))

final_pipe = Pipeline(pipeline_steps)

# Prediction of test data

In [7]:
#Path of the test file inside the data directory
test_csv = os.path.join(parent, "data", "test.csv")

#Read the pipe-delimited test set
X_real = pd.read_csv(test_csv, sep="|")

#'suspicious' mirrors 'trustLevel' but maps every level above 2 to 3
#(frauds only occur at trustLevels 1-2, all others are non-fraudulent)
suspicious_real = X_real["trustLevel"].mask(X_real["trustLevel"] > 2, 3)

#Append the engineered features in one pass, then drop 'trustLevel'
#because it is too similar to 'suspicious'
X_real = (
    X_real
    .assign(
        totalItems=X_real["totalScanTimeInSeconds"] * X_real["scannedLineItemsPerSecond"],
        suspicious=suspicious_real,
        avgLineItemValue=X_real["valuePerSecond"] / X_real["scannedLineItemsPerSecond"],
    )
    .drop(columns="trustLevel")
)

In [8]:
#Preview the first five rows of the engineered test features
X_real.head(n=5)

Unnamed: 0,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,totalItems,suspicious,avgLineItemValue
0,467,88.48,4,8,4,0.014989,0.189465,0.571429,7.0,3,12.64
1,1004,58.99,7,6,1,0.026892,0.058755,0.259259,27.0,3,2.184815
2,162,14.0,4,5,4,0.006173,0.08642,4.0,1.0,1,14.0
3,532,84.79,9,3,4,0.026316,0.15938,0.642857,14.0,3,6.056429
4,890,42.16,4,0,0,0.021348,0.047371,0.210526,19.0,3,2.218947


In [9]:
#Train the pipeline on the training data (fit returns the fitted pipeline itself),
#then predict the class of every row of the test data
fitted_pipe = final_pipe.fit(X, y)
predictions = fitted_pipe.predict(X_real)

In [10]:
#Collect the predictions in a single-column frame named 'fraud'
submission = pd.DataFrame()
submission['fraud'] = predictions

#Make sure the predictions are saved in the right shape: exactly one prediction
#per test row. A mismatch now stops the notebook instead of being skipped silently,
#so a malformed submission file can never be written by the following cell.
if len(submission) != len(X_real):
    raise ValueError(
        f"Submission has {len(submission)} entries but X_real has {len(X_real)}."
    )
print("Submission and X_real both have", len(submission), "entries.")

submission.head()

Submission and X_real both have 498121 entries.


Unnamed: 0,fraud
0,0
1,0
2,0
3,0
4,0


In [11]:
# Write the submission dataframe to 'submission.csv' without the index column
submission.to_csv(path_or_buf='submission.csv', index=False)