# Stacking model

### Import modules

In [None]:
# Basic modules and plotting tools
import pandas as pd
import numpy as np
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

# Scikit-learn model modules
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification

# Scikit-learn metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, f1_score, accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import fbeta_score, accuracy_score

# Random seed used in this notebook; it is passed as `random_state` to the
# decision-tree and random-forest estimators below so their results are repeatable:
RSEED = 42

### Useful functions
Calculating and printing out the classification report, including the F1-score:

In [None]:
def custom_report(y_test, y_pred):
    """Print the classification report followed by the F1-score.

    Args:
        y_test: True labels, forwarded to ``f1_score`` and
            ``classification_report``.
        y_pred: Predicted labels, forwarded to the same two functions.

    Returns:
        None. The report is written to stdout only.
    """
    # F1 is computed with average='binary' before the textual report is built.
    binary_f1 = f1_score(y_test, y_pred, average='binary')
    report_text = classification_report(y_test, y_pred)

    rule = "------" * 10
    pieces = (
        rule + "\n",
        report_text + "\n",
        rule + "\n",
        "F1-score:" + str(binary_f1) + "\n",
        rule,
    )
    # print() joins the pieces with its default single-space separator.
    return print(*pieces)

Printing out a confusion matrix:

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    The matrix is drawn on the current matplotlib figure with rows on the
    y-axis (labelled 'Actual') and columns on the x-axis (labelled
    'Predicted').

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix to display.
    classes : sequence
        Tick labels; used for both axes.
    normalize : bool, default False
        If True, every row is divided by its row sum before printing and
        plotting. Rows whose sum is 0 are left as all zeros instead of
        producing NaN values.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap handed to ``plt.imshow``.

    Returns
    -------
    None
    """
    # Accept nested lists as well as ndarrays.
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; rows without any
        # samples keep the zeros from `out` instead of becoming NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # The 'd' format only works for integer cells; use two decimals for
    # normalized or otherwise non-integer matrices.
    use_int_format = (not normalize) and np.issubdtype(cm.dtype, np.integer)
    fmt = 'd' if use_int_format else '.2f'

    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    # Run the layout last so the axis labels are taken into account.
    plt.tight_layout()

### Defining test and train data

In [None]:
# Read the train/test splits back from the CSV files that were saved by the
# notebook "Modelling.ipynb".
# NOTE(review): the targets (y_train / y_test) are loaded as DataFrames, not
# 1-D Series - confirm the target files hold a single column and that the
# downstream estimators and metrics accept this shape.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/x-train.csv",
        "data/y-train.csv",
        "data/x-test.csv",
        "data/y-test.csv",
    )
)

### Stacking model implementation: variant 1 of 2

As a first attempt, a stacking model with the following sub-models is used: Decision Tree, KNN and Random Forest. A Logistic Regression was chosen as the meta-model.

In [None]:
# Base learners whose predictions are fed to the meta-model
estimators = [
    ('dt', DecisionTreeClassifier(random_state=RSEED)),
    ('knn', KNeighborsClassifier()),
    (
        'rf',
        RandomForestClassifier(
            n_estimators=1000,
            random_state=RSEED,
            max_features='sqrt',
            n_jobs=-1,
            verbose=1,
        ),
    ),
]

# Stacking classifier with a logistic regression as the meta-model
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
)

# Train on the training split, then report the score on the test split
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
# Labels predicted for the test features by the stacking model fitted above (variant 1)
stack_y_pred = clf.predict(X_test)

### Classification report variant 1 of 2

In [None]:
# Print the classification report and F1-score for the variant 1 predictions,
# using the custom_report function defined above
custom_report(y_test,stack_y_pred)

### Stacking model implementation: variant 2 of 2

As an alternative to model variant 1, this model uses the following sub-models: Decision Tree, AdaBoost and Random Forest.

In [None]:
# Sub-models: decision tree, AdaBoost and random forest
estimators = [
    ('dt', DecisionTreeClassifier(random_state=RSEED)),
    # The weak learner is left at AdaBoost's default, which is a
    # DecisionTreeClassifier(max_depth=1). Passing it through the
    # `base_estimator` keyword fails on scikit-learn >= 1.4, where that
    # keyword was removed. The seed makes the boosting reproducible, like the
    # other seeded sub-models.
    ('ada', AdaBoostClassifier(n_estimators=200, random_state=RSEED)),
    ('rf', RandomForestClassifier(n_estimators=1000,
                                  criterion='entropy',
                                  max_depth=None,
                                  random_state=RSEED,
                                  max_features='sqrt',
                                  n_jobs=-1,
                                  verbose=1)),
]

# Meta-model: logistic regression trained on 10-fold cross-validated
# predictions of the sub-models
clf = StackingClassifier(estimators=estimators,
                         final_estimator=LogisticRegression(),
                         cv=10)

# Fit the training data and get the score on the test data
clf.fit(X_train, y_train).score(X_test, y_test)

In [None]:
# Labels predicted for the test features by the stacking model fitted above (variant 2);
# this reassigns stack_y_pred
stack_y_pred = clf.predict(X_test)

### Classification report and confusion matrix variant 2 of 2

In [None]:
# Print the classification report and F1-score for the variant 2 predictions,
# using the custom_report function defined above
custom_report(y_test,stack_y_pred)

In [None]:
# Show the row-normalized confusion matrix of the latest predictions,
# drawn with the plot_confusion_matrix function defined above
np.set_printoptions(precision=2)  # two decimals when the matrix is printed
cnf_matrix = confusion_matrix(
    y_test,
    stack_y_pred,
    labels=[0, 1],
)
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['Legal', 'Fraud'],
    normalize=True,
    title="Fraud detection model: confusion matrix",
)

### Conclusion
Variant 2 of 2 yielded a slightly better result (F1-score) than variant 1 of 2.