### Model stacking approach - test

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from matplotlib import pyplot as plt
%matplotlib inline
sns.set()

Prepare training data

In [2]:
# Load the pipe-separated training data.
df = pd.read_csv("../data/train.csv", sep="|")

In [3]:
def prepare_subset(df, random_state=None):
    """Return a shuffled, class-balanced subset of ``df``.

    Rows are split on the ``fraud`` column (1 vs. 0), the larger class is
    randomly down-sampled to the size of the smaller one, and the combined
    rows are shuffled and re-indexed from 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``fraud`` column. Only rows where it equals 1 or 0 are
        considered.
    random_state : int, numpy.random.RandomState or None, optional
        Forwarded to ``DataFrame.sample`` for the down-sampling and for the
        final shuffle. The default ``None`` leaves the draw unseeded.

    Returns
    -------
    pandas.DataFrame
        New frame with equally many ``fraud == 1`` and ``fraud == 0`` rows
        and a fresh 0..n-1 index. ``df`` itself is not modified.
    """
    fraudulent = df[df['fraud'] == 1]
    non_fraudulent = df[df['fraud'] == 0]

    # len() counts rows directly. The previous count()[0] depended on
    # positional indexing of a label-indexed Series and ignored rows whose
    # first column was NaN.
    n = min(len(fraudulent), len(non_fraudulent))

    balanced = pd.concat([
        fraudulent.sample(n, random_state=random_state),
        non_fraudulent.sample(n, random_state=random_state),
    ])
    # frac=1 draws every row once, i.e. a full shuffle.
    return balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [4]:
# Build the balanced working set and preview its first rows.
dt = df.pipe(prepare_subset)
dt.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,5,110,19.72,1,7,4,0.118182,0.179273,0.076923,0
1,1,1672,54.3,6,0,3,0.004187,0.032476,0.857143,0
2,2,1649,55.81,1,10,0,0.005458,0.033845,0.111111,0
3,2,1461,39.82,6,9,1,0.017112,0.027255,0.24,1
4,1,1045,45.74,0,2,4,0.026794,0.04377,0.0,1


Randomly split the training data into two disjoint sets.

In [5]:
# NOTE(review): this cell is fully commented out, so no split is performed
# here; dt1 and dt2 are not referenced by any other cell in this notebook.
# With groups=df.index every row with a distinct index label forms its own
# group, so this would presumably behave like a plain 50/50 shuffle split.
# from sklearn.model_selection import GroupShuffleSplit

# def split_data(df):
#     gss = GroupShuffleSplit(n_splits=1, test_size=0.5)
#     idx1, idx2 = next(gss.split(df, groups=df.index))
#     df1, df2 = df.iloc[idx1], df.iloc[idx2]
#     return df1, df2

# dt1, dt2 = split_data(dt)

Level 0 classifiers

In [6]:
# Split off the label column, then standardize the feature columns.
# NOTE(review): the scaling statistics are computed over every row of dt,
# i.e. before the train/test split performed further down.
import sklearn.preprocessing

y = dt['fraud']
x = sklearn.preprocessing.scale(dt.drop(columns='fraud'))

  


In [7]:
# Hold out 20% of the rows for evaluation, using a fixed seed for the split.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Level 0: a single decision tree, scored on the held-out split.
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
tree.score(X_test, y_test)

0.8809523809523809

In [9]:
# Level 0: random forest (100 trees, depth capped at 10), scored on the held-out split.
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=0,
).fit(X_train, y_train)
clf.score(X_test, y_test)

0.9047619047619048

In [10]:
# Level 0: logistic regression, scored on the held-out split.
# multi_class is left at its default: the parameter is deprecated in recent
# scikit-learn releases, and with solver='liblinear' the default already
# resolves to one-vs-rest, so the fitted model is unchanged.
from sklearn.linear_model import LogisticRegression

reg = LogisticRegression(random_state=0, solver='liblinear')
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

0.9047619047619048

In [17]:
# Level 0 candidate: support vector classifier, scored on the held-out split.
from sklearn.svm import SVC

svc = SVC(gamma='auto').fit(X_train, y_train)
svc.score(X_test, y_test)

0.9285714285714286

Predictions from Level 0 classifiers as input for level 1 classifier

In [11]:
# Class predictions of three level 0 models on the held-out rows
# (svc is not included), alongside the true labels for those rows.
tree_pred, clf_pred, reg_pred = (
    list(model.predict(X_test)) for model in (tree, clf, reg)
)
true_pred = list(y_test)

Prepare data for the Level 1 classifier

In [12]:
# Assemble the level 1 data set: one column per level 0 model holding its
# predictions, plus the matching ground-truth labels.
dtl1 = pd.DataFrame({
    'tree': tree_pred,
    'clf': clf_pred,
    'reg': reg_pred,  # label corrected from the typo 'red'
    'true': true_pred
})

# Features are the level 0 predictions; the target is the true label.
x1 = dtl1.drop('true', axis=1)
y1 = dtl1['true']

# Second hold-out split, this time over the level 0 predictions.
X1_train, X1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.2, random_state=42)

Train level 1 classifier

In [13]:
# Level 1 candidate: shallow random forest over the level 0 predictions.
clf1 = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=0,
).fit(X1_train, y1_train)
clf1.score(X1_test, y1_test)

0.7777777777777778

In [14]:
# Level 1 candidate: decision tree over the level 0 predictions.
tree1 = DecisionTreeClassifier(random_state=0).fit(X1_train, y1_train)
tree1.score(X1_test, y1_test)

0.7777777777777778

In [15]:
# Level 1 candidate: logistic regression over the level 0 predictions.
# multi_class is left at its default: the parameter is deprecated in recent
# scikit-learn releases, and with solver='liblinear' the default already
# resolves to one-vs-rest, so the fitted model is unchanged.
reg1 = LogisticRegression(random_state=0, solver='liblinear')
reg1.fit(X1_train, y1_train)
reg1.score(X1_test, y1_test)

0.7777777777777778