# Benchmark models - Exploration, calibration, test

This notebook uses as inputs the outputs from the pipeline ('07_preproc_pipeline.ipynb' notebook) and performs model calibration and general exploration for the transactions prediction.

In [1]:
import pandas as pd
import numpy as np
import pickle
from os import environ

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, confusion_matrix
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from visualization_utils import plot_rocs

from bokeh.io import show, output_notebook
output_notebook()

import datetime as dt

In [29]:
#utils for model permformance evaluation

def model_diag(model, X_train, y_train, CrossValFolds=5, run_confusion_matrix=False):
    """
    This function returns as output false positive rate, true positive rate and auc score in the form of a dictionary.
    It needs model, training x and training y as inputs.
    """
    y_pred = cross_val_predict(model, X_train, y_train, cv=CrossValFolds)
    
    if hasattr(model, "decision_function"):
        y_scores = cross_val_predict(model, X_train, y_train, cv=CrossValFolds, method="decision_function")
    else:
        y_proba = cross_val_predict(model, X_train, y_train, cv=CrossValFolds, method="predict_proba")
        y_scores = y_proba[:,1]
    fpr, tpr, thresholds = roc_curve(y_train, y_scores) #false positive rate, true positive rate and thresholds
    auc = roc_auc_score(y_train, y_scores)
    
    print("AUC {:.3f}".format(auc))
    
    if run_confusion_matrix:
        cm = confusion_matrix(y_train, y_pred)
        #rescale the confusion matrix
        rcm = np.empty([2,2])
        rcm[0, :] = cm[0, :] / float(sum(cm[0, :]))
        rcm[1, :] = cm[1, :] / float(sum(cm[0, :]))
        
        print("Confusion matrix: \n" + np.array_str(rcm, precision=5, suppress_small=True))
    
    return {'fpr':fpr, 'tpr':tpr, 'auc':auc}


def model_oostest(model, X_test, y_test):
    """
    This function tests the model performance on out of sample data
    """
    y_score = model.predict_proba(X_test)[:,1]
    m_auc = roc_auc_score(y_test, y_score)
    return m_auc

In [3]:
#importing data
user=environ["USERNAME"]
prefix = '190207_'
postfix = '_shuffle_imp1'
trainfile = 'traindata'
#from work
datafolder = "C:/Users/{}/Tradeteq Dropbox/Davide Mariani/thesis_project/traintestsets/".format(user)

[X_train, y_train, feature_labels] = pd.read_pickle(datafolder+prefix+trainfile+postfix+'.pkl') 

In [4]:
X_train_df = pd.DataFrame(X_train, columns=feature_labels)
X_train_df.head()

Unnamed: 0,has_purchase,dd_value_date,cd_lent_c,cd_repaid_c,cd_impaired1_c,cd_pastdue90_c,cd_trend_a,c_lent_c,c_repaid_c,c_impaired1_c,...,d_pastdue90_c,d_trend_a,d_we_payment_share,invoice_amount,purchase_amount,currency_Britisches Pfund,currency_Euro,currency_Schweizer Franken,currency_US-Dollar,invoice_date
0,0.478098,-0.145273,-0.528606,-0.49391,-0.17567,-0.210131,-1.385604,-1.00996,-0.893308,-0.554907,...,-0.210299,-1.378706,0.11732,0.967374,0.678658,0.0,0.0,1.0,0.0,-1.870522
1,0.478098,-0.145273,-0.302102,-0.49391,-0.17567,-0.210131,-0.960531,-0.938481,-0.893308,-0.554907,...,-0.210299,-0.950582,0.11732,2.014475,0.905137,0.0,1.0,0.0,0.0,-0.606708
2,0.478098,-0.145273,-0.457419,-0.368922,-0.17567,-0.210131,0.997265,-0.922398,-0.788296,-0.554907,...,-0.210299,1.021271,-0.192621,-0.012316,0.466758,0.0,0.0,1.0,0.0,0.315048
3,0.478098,-0.145273,-0.50272,-0.46058,-0.17567,-0.210131,1.624837,-0.98941,-0.861077,-0.554907,...,-0.210299,1.65335,-0.192621,-0.829188,0.290076,0.0,0.0,1.0,0.0,0.675109
4,0.478098,-0.145273,-0.522134,-0.443915,-0.17567,-0.210131,1.315468,0.274891,0.239989,0.079527,...,-0.210299,1.341759,-0.192621,1.362819,0.764189,0.0,0.0,1.0,0.0,0.523883


In [5]:
X_train_df.columns

Index(['has_purchase', 'dd_value_date', 'cd_lent_c', 'cd_repaid_c',
       'cd_impaired1_c', 'cd_pastdue90_c', 'cd_trend_a', 'c_lent_c',
       'c_repaid_c', 'c_impaired1_c', 'c_pastdue90_c', 'c_trend_a',
       'cd_lent_c', 'd_repaid_c', 'd_impaired1_c', 'd_pastdue90_c',
       'd_trend_a', 'd_we_payment_share', 'invoice_amount', 'purchase_amount',
       'currency_Britisches Pfund', 'currency_Euro',
       'currency_Schweizer Franken', 'currency_US-Dollar', 'invoice_date'],
      dtype='object')

In [6]:
X_train_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
has_purchase,46095.0,-2.067704e-08,1.000011,-2.091622,0.478098,0.478098,0.478098,0.478098
dd_value_date,46095.0,3.199733e-09,1.000011,-0.145273,-0.145273,-0.145273,-0.145273,85.601524
cd_lent_c,46095.0,1.108227e-09,1.000011,-0.567435,-0.528606,-0.431533,-0.017355,4.393315
cd_repaid_c,46095.0,-1.870823e-08,1.000011,-0.49391,-0.49391,-0.418917,-0.093949,4.37709
cd_impaired1_c,46095.0,-1.428856e-10,1.000011,-0.17567,-0.17567,-0.17567,-0.17567,8.90372
cd_pastdue90_c,46095.0,-3.09564e-09,1.000011,-0.210131,-0.210131,-0.210131,-0.210131,6.4287
cd_trend_a,46095.0,-1.419782e-08,1.000011,-3.920882,-0.678948,-0.03671,0.712322,3.604046
c_lent_c,46095.0,-2.6985e-08,1.000011,-1.037659,-0.801775,-0.315712,0.566171,3.869435
c_repaid_c,46095.0,1.823345e-08,1.000011,-0.893308,-0.788296,-0.336017,0.435457,4.005999
c_impaired1_c,46095.0,1.572906e-08,1.000011,-0.554907,-0.554907,-0.343429,0.079527,4.80632


In [8]:
#Linear model Stochastic Gradient Descent
sgd_clf = SGDClassifier(random_state=42, max_iter=250, loss='log', tol=0.0001) 
sgd_clf.fit(X_train, y_train)
sgd_clf_diag = model_diag(sgd_clf, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.795
Confusion matrix: 
[[0.99874 0.00126]
 [0.01897 0.00248]]


In [9]:
sgd_roc = plot_rocs([sgd_clf_diag], p_width=600, p_height=600)
show(sgd_roc)

In [10]:
#random forest
n_estimators = 200
max_leaf_nodes = 40
rf_clf = RandomForestClassifier(random_state=42,
                               n_estimators=n_estimators,
                               max_leaf_nodes=max_leaf_nodes,
                               class_weight="balanced",
                               n_jobs=7)

rf_clf.fit(X_train, y_train)
rf_clf_diag = model_diag(rf_clf, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.920
Confusion matrix: 
[[0.89441 0.10559]
 [0.00443 0.01702]]


In [11]:
rf_sgd_roc = plot_rocs([sgd_clf_diag, rf_clf_diag], p_width=600, p_height=600)
show(rf_sgd_roc)

In [13]:
#testdata
testfile = 'testdata'
[X_test, y_test, feature_labels] = pd.read_pickle(datafolder+prefix+testfile+postfix+'.pkl')

In [31]:
y_score_sgd = sgd_clf.predict(X_test)
cm = confusion_matrix(y_test, y_score_sgd)
print(cm)
model_oostest(sgd_clf,X_test, y_test)

[[11269    26]
 [  196    31]]


0.7928544266399892

In [32]:
y_score_rf = rf_clf.predict(X_test)
cm = confusion_matrix(y_test, y_score_rf)
print(cm)
model_oostest(rf_clf, X_test, y_test)

[[10016  1279]
 [   53   174]]


0.910715824904006

In [35]:
X_all = np.concatenate((X_train, X_test), axis=0)
y_all = np.concatenate((y_train, y_test), axis=0)

In [37]:
#test on the whole dataset
y_score_sgd_all = sgd_clf.predict(X_all)
cm = confusion_matrix(y_all, y_score_sgd_all)
print(cm)
model_oostest(sgd_clf, X_all, y_all)

[[56291   131]
 [ 1024   171]]


0.8082181510550576

In [38]:
y_score_rf_all = rf_clf.predict(X_all)
cm = confusion_matrix(y_all, y_score_rf_all)
print(cm)
model_oostest(rf_clf, X_all, y_all)

[[50128  6294]
 [  172  1023]]


0.9396671807741692