# Check performance of the StandardScaler 

no application of distribution manipulation with log etc. 

In [1]:
from sklearn.model_selection import StratifiedKFold, cross_validate 
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer
from sklearn.svm import SVC

import os
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

#Set a random state
rs = 4

#Set amount of splits
cv_split = 5

#Set the scaler
scaler = StandardScaler()

In [2]:
#Define monetary_score
def monetary_score_func(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    monetary_value = (cm[0,0] * 0) + (cm[1, 0] * -5) + (cm[0, 1] * -25) + (cm[1, 1] * 5)
    max_monetary_value = (cm[1,0] + cm[1,1]) * 5
    return (monetary_value / max_monetary_value)

monetary_score = make_scorer(monetary_score_func)

In [3]:
# get current directory 
path = os.getcwd() 

# get parent directory 
parent = os.path.dirname(path)

# move to the directory with data
train_csv = os.path.join(parent, "data", "train.csv")

#Import our dataset
dataset = pd.read_csv(train_csv, delimiter = '|')
dataset_org = dataset.copy()

#Add some new features
dataset = dataset.assign(totalItems = dataset.totalScanTimeInSeconds * dataset.scannedLineItemsPerSecond)
dataset = dataset.assign(avgLineItemValue = dataset.valuePerSecond / dataset.scannedLineItemsPerSecond)

#Add new feature suspicious
suspicious = dataset['trustLevel'].copy()
suspicious[suspicious > 2] = 3
dataset = dataset.assign(suspicious = suspicious)

dataset = dataset.drop('trustLevel', axis=1)

dataset = dataset[dataset['scannedLineItemsPerSecond'] < 4]
cutted = len(dataset_org)-len(dataset)
print(f"{cutted} ({cutted/len(dataset_org)}) entries removed .")

4 (0.0021287919105907396) entries removed .


In [4]:
#Split X and y 
X = dataset.drop('fraud', axis=1)
y = dataset.fraud

In [5]:
X.head()

Unnamed: 0,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,totalItems,avgLineItemValue,suspicious
0,1054,54.7,7,0,3,0.027514,0.051898,0.241379,29.0,1.886207,3
1,108,27.36,5,2,4,0.12963,0.253333,0.357143,14.0,1.954286,3
2,1516,62.16,3,10,5,0.008575,0.041003,0.230769,13.0,4.781538,3
3,1791,92.31,8,4,4,0.016192,0.051541,0.275862,29.0,3.183103,3
4,430,81.53,3,7,2,0.062791,0.189605,0.111111,27.0,3.01963,3


In [6]:
cv = StratifiedKFold(n_splits=cv_split, shuffle=True, random_state=rs)
   
model = SVC(kernel="linear", random_state=rs)

# create the model pipeline
try:
    scaler    
    pipe_model = Pipeline([
        ("scaler", scaler),
        ("model", model)
    ])
    
except NameError:
    pipe_model = Pipeline([    
        ("model", model)
    ])

cv_results = cross_validate(pipe_model, X, y, cv=cv_split, scoring=monetary_score, n_jobs=-1)
print("Model performance on folds:\t\t",cv_results['test_score'])
print("Model mean monetary score:\t\t", cv_results['test_score'].mean())
print("Model double standard deviation:\t", 2*cv_results['test_score'].std())

Model performance on folds:		 [0.2        0.19047619 0.76190476 0.57142857 0.71428571]
Model mean monetary score:		 0.48761904761904756
Model double standard deviation:	 0.49368253866183254
