# Create a basic model

Create a basic comparison model with which all preprocessing steps can be evaluated

In [1]:
# Import of used packages
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.svm import SVC

import os
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

#Set a random state
rs = 4

#Set amount of splits
cv_split = 5

In [2]:
#Define monetary_score
def monetary_score_func(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    monetary_value = (cm[0,0] * 0) + (cm[1, 0] * -5) + (cm[0, 1] * -25) + (cm[1, 1] * 5)
    max_monetary_value = (cm[1,0] + cm[1,1]) * 5
    return (monetary_value / max_monetary_value)

monetary_score = make_scorer(monetary_score_func)

In [3]:
# get current directory 
path = os.getcwd() 

# get parent directory 
parent = os.path.dirname(path)

# move to the directory with data
train_csv = os.path.join(parent, "data", "train.csv")

#Import our dataset
dataset = pd.read_csv(train_csv, delimiter = '|')

In [4]:
#Split X and y 
X = dataset.drop('fraud', axis=1)
y = dataset.fraud

In [5]:
cv = StratifiedKFold(n_splits=cv_split, shuffle=True, random_state=rs)
   
model = SVC(kernel="linear", random_state=rs)

cv_results = cross_validate(model, X, y, cv=cv_split, scoring=monetary_score, n_jobs=-1)
print("Model performance on folds:\t\t",cv_results['test_score'])
print("Model mean monetary score:\t\t", cv_results['test_score'].mean())
print("Model double standard deviation:\t", 2*cv_results['test_score'].std())

Model performance on folds:		 [-0.80952381 -0.52380952 -0.42857143 -0.52380952 -0.15      ]
Model mean monetary score:		 -0.4871428571428572
Model double standard deviation:	 0.4230501490029623
