# Benchmark models - Exploration, calibration, test

This notebook takes as input the output of the preprocessing pipeline (the '05_preproc_pipeline_1.ipynb' notebook) and performs model calibration and general exploration for transaction prediction.

In [1]:
# Data handling and serialization
import pandas as pd
import numpy as np
import pickle
from os import environ

# scikit-learn: validation helpers, metrics and the estimators used as benchmarks
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, confusion_matrix
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): wildcard import hides where names come from; model_diag, plot_rocs
# and model_oostest used in the cells below are presumably provided by this module
# -- confirm and consider importing them explicitly.
from visualization_utils import *

# Render Bokeh figures inline in the notebook
from bokeh.io import show, output_notebook
output_notebook()

import datetime as dt

### TRAIN Data import

In [2]:
# Location and naming of the pre-processed training set.
# The pickle file name is assembled as <prefix><trainfile><postfix>.pkl
user = environ["USERNAME"]
prefix, trainfile, postfix = '190625_728_', 'traindata', '_shuffle_imp1'

# data folder when working from home
datafolder = "C:/Users/{}/Dropbox/University/MscDataScience_Birkbeck/thesis_project/data/preproc_traintest/".format(user)

# data folder when working from the office
#datafolder = "C:/Users/{}/Tradeteq Dropbox/Davide Mariani/thesis_project/traintestsets/".format(user)

# the pickle unpacks into three items: feature matrix, targets, feature names
train_path = datafolder + prefix + trainfile + postfix + '.pkl'
X_train, y_train, feature_labels = pd.read_pickle(train_path)

In [3]:
# Wrap the training matrix in a labelled DataFrame so it can be inspected by column name
X_train_df = pd.DataFrame(data=X_train, columns=feature_labels)
X_train_df.head(5)

Unnamed: 0,has_purchase,dd_value_date,cd_lent_c,cd_repaid_c,cd_impaired1_c,cd_pastdue90_c,cd_trend_a,c_lent_c,c_repaid_c,c_impaired1_c,...,d_pastdue90_c,d_trend_a,d_we_payment_share,invoice_amount,purchase_amount,currency_Britisches Pfund,currency_Euro,currency_Schweizer Franken,currency_US-Dollar,invoice_date
0,0.478098,-0.145273,-0.528606,-0.49391,-0.17567,-0.214037,-1.385604,-1.00996,-0.893308,-0.554907,...,-0.215463,-1.378706,0.11732,0.967374,0.678658,0.0,0.0,1.0,0.0,-1.870522
1,0.478098,-0.145273,-0.302102,-0.49391,-0.17567,-0.214037,-0.960531,-0.938481,-0.893308,-0.554907,...,-0.215463,-0.950582,0.11732,2.014475,0.905137,0.0,1.0,0.0,0.0,-0.606708
2,0.478098,-0.145273,-0.457419,-0.368922,-0.17567,-0.214037,0.997265,-0.922398,-0.788296,-0.554907,...,-0.215463,1.021271,-0.192621,-0.012316,0.466758,0.0,0.0,1.0,0.0,0.315048
3,0.478098,-0.145273,-0.50272,-0.46058,-0.17567,-0.214037,1.624837,-0.98941,-0.861077,-0.554907,...,-0.215463,1.65335,-0.192621,-0.829188,0.290076,0.0,0.0,1.0,0.0,0.675109
4,0.478098,-0.145273,-0.522134,-0.443915,-0.17567,-0.214037,1.315468,0.274891,0.239989,0.079527,...,-0.215463,1.341759,-0.192621,1.362819,0.764189,0.0,0.0,1.0,0.0,0.523883


In [4]:
# List the feature names.
# NOTE(review): the index printed below contains 'cd_lent_c' twice (positions 2 and 12);
# position 12 sits among the d_* features, so it is presumably a mislabelled
# 'd_lent_c' -- confirm in the preprocessing pipeline.
X_train_df.columns

Index(['has_purchase', 'dd_value_date', 'cd_lent_c', 'cd_repaid_c',
       'cd_impaired1_c', 'cd_pastdue90_c', 'cd_trend_a', 'c_lent_c',
       'c_repaid_c', 'c_impaired1_c', 'c_pastdue90_c', 'c_trend_a',
       'cd_lent_c', 'd_repaid_c', 'd_impaired1_c', 'd_pastdue90_c',
       'd_trend_a', 'd_we_payment_share', 'invoice_amount', 'purchase_amount',
       'currency_Britisches Pfund', 'currency_Euro',
       'currency_Schweizer Franken', 'currency_US-Dollar', 'invoice_date'],
      dtype='object')

In [5]:
# Summary statistics, one row per feature
X_train_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
has_purchase,46095.0,-2.067704e-08,1.000011,-2.091622,0.478098,0.478098,0.478098,0.478098
dd_value_date,46095.0,3.199733e-09,1.000011,-0.145273,-0.145273,-0.145273,-0.145273,85.601524
cd_lent_c,46095.0,1.108227e-09,1.000011,-0.567435,-0.528606,-0.431533,-0.017355,4.393315
cd_repaid_c,46095.0,-1.870823e-08,1.000011,-0.49391,-0.49391,-0.418917,-0.093949,4.37709
cd_impaired1_c,46095.0,-1.428856e-10,1.000011,-0.17567,-0.17567,-0.17567,-0.17567,8.90372
cd_pastdue90_c,46095.0,-4.453377e-09,1.000011,-0.214037,-0.214037,-0.214037,-0.214037,6.398803
cd_trend_a,46095.0,-1.419782e-08,1.000011,-3.920882,-0.678948,-0.03671,0.712322,3.604046
c_lent_c,46095.0,-2.6985e-08,1.000011,-1.037659,-0.801775,-0.315712,0.566171,3.869435
c_repaid_c,46095.0,1.823345e-08,1.000011,-0.893308,-0.788296,-0.336017,0.435457,4.005999
c_impaired1_c,46095.0,1.572906e-08,1.000011,-0.554907,-0.554907,-0.343429,0.079527,4.80632


### SGD Classifier

In [6]:
# Linear benchmark: logistic loss optimised with stochastic gradient descent.
# NOTE(review): scikit-learn >= 1.1 renames loss='log' to 'log_loss' (the old
# spelling is removed in 1.3) -- adjust if the environment is upgraded.
sgd_params = dict(loss='log', max_iter=250, tol=0.0001, random_state=42)
sgd_clf = SGDClassifier(**sgd_params)
sgd_clf.fit(X_train, y_train)

# diagnostics from the project helper, with 5 folds and the confusion matrix enabled
sgd_clf_diag = model_diag(sgd_clf, X_train, y_train, CrossValFolds=5, run_confusion_matrix=True)

AUC 0.793
Confusion matrix: 
[[0.99883 0.00117]
 [0.01901 0.00244]]


In [7]:
# ROC curve of the SGD classifier built from its diagnostics
sgd_roc = plot_rocs([sgd_clf_diag], p_height=600, p_width=600)
show(sgd_roc)

### Random Forest Classifier

In [8]:
# Random forest benchmark
n_estimators, max_leaf_nodes = 200, 40

# all constructor settings in one place; class weights are balanced by the estimator
rf_params = {
    "random_state": 42,
    "n_estimators": n_estimators,
    "max_leaf_nodes": max_leaf_nodes,
    "class_weight": "balanced",
    "n_jobs": 7,
}
rf_clf = RandomForestClassifier(**rf_params)
rf_clf.fit(X_train, y_train)

# diagnostics from the project helper, with 5 folds and the confusion matrix enabled
rf_clf_diag = model_diag(rf_clf, X_train, y_train, CrossValFolds=5, run_confusion_matrix=True)

AUC 0.921
Confusion matrix: 
[[0.89408 0.10592]
 [0.00439 0.01706]]


In [9]:
# ROC curves of both benchmarks (SGD first, random forest second) on one figure
rf_sgd_roc = plot_rocs([sgd_clf_diag, rf_clf_diag], p_height=600, p_width=600)
show(rf_sgd_roc)

In [13]:
# Test set: same folder, prefix and postfix as the training pickle
testfile = 'testdata'
test_path = datafolder + prefix + testfile + postfix + '.pkl'
X_test, y_test, feature_labels = pd.read_pickle(test_path)

In [14]:
# SGD classifier on the test set: predicted labels, confusion matrix, helper score
y_score_sgd = sgd_clf.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_score_sgd)
print(cm)
model_oostest(sgd_clf, X_test, y_test)

[[11266    29]
 [  196    31]]


0.7905174212596507

In [15]:
# Random forest on the test set: predicted labels, confusion matrix, helper score
y_score_rf = rf_clf.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_score_rf)
print(cm)
model_oostest(rf_clf, X_test, y_test)

[[10024  1271]
 [   53   174]]


0.911707453104859

In [16]:
# Stack train and test rows (training rows first) into a single dataset
X_all = np.concatenate([X_train, X_test])
y_all = np.concatenate([y_train, y_test])

In [17]:
# SGD classifier on the whole dataset.
# Note: X_all includes the rows the model was fitted on, so these figures are
# not an out-of-sample estimate.
y_score_sgd_all = sgd_clf.predict(X_all)
cm = confusion_matrix(y_true=y_all, y_pred=y_score_sgd_all)
print(cm)
model_oostest(sgd_clf, X_all, y_all)

[[56283   139]
 [ 1026   169]]


0.8065059342857003

In [18]:
# Random forest on the whole dataset.
# Note: X_all includes the rows the model was fitted on, so these figures are
# not an out-of-sample estimate.
y_score_rf_all = rf_clf.predict(X_all)
cm = confusion_matrix(y_true=y_all, y_pred=y_score_rf_all)
print(cm)
model_oostest(rf_clf, X_all, y_all)

[[50179  6243]
 [  174  1021]]


0.9397339148843836

In [47]:
# Optimization of the random forest: search through a sequence of parameter pairs.
# The two lists are read in lockstep (i-th n_estimators with i-th max_leaf_nodes);
# this is a hand-picked sequence, not a full grid.
model_in = {"n_estimators": [10,15,50,80,120,120,150,200],
           "max_leaf_nodes": [6,7,8,9,12,18,30,40]}
model_out, models_box = [], []

for n_estimators, max_leaf_nodes in zip(model_in["n_estimators"], model_in["max_leaf_nodes"]):
    rf_settings = dict(random_state=42,
                       n_estimators=n_estimators,
                       max_leaf_nodes=max_leaf_nodes,
                       class_weight="balanced",
                       n_jobs=7)
    rf_clf_tmp = RandomForestClassifier(**rf_settings)
    print("Random Forest - n_estimators={} and max_leaf_nodes={}".format(n_estimators, max_leaf_nodes))
    rf_clf_tmp_diag = model_diag(rf_clf_tmp, X_train, y_train)
    print()
    # keep both the estimator and its diagnostics, in the order they were tried
    models_box.append(rf_clf_tmp)
    model_out.append(rf_clf_tmp_diag)
                                    

Random Forest - n_estimators=10 and max_leaf_nodes=6
AUC 0.873

Random Forest - n_estimators=15 and max_leaf_nodes=7
AUC 0.884

Random Forest - n_estimators=50 and max_leaf_nodes=8
AUC 0.889

Random Forest - n_estimators=80 and max_leaf_nodes=9
AUC 0.891

Random Forest - n_estimators=120 and max_leaf_nodes=12
AUC 0.899

Random Forest - n_estimators=120 and max_leaf_nodes=18
AUC 0.908

Random Forest - n_estimators=150 and max_leaf_nodes=30
AUC 0.917

Random Forest - n_estimators=200 and max_leaf_nodes=40
AUC 0.920



In [48]:
# The highest AUC in the search above belongs to the last pair tried
# (200 estimators, 40 max leaf nodes), i.e. the last estimator stored in models_box.
rf_clf_final = models_box[-1]
rf_clf_final.fit(X_train, y_train)

# evaluate the refitted model on the test set
final_scores = rf_clf_final.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=final_scores)
print(cm)
model_oostest(rf_clf_final, X_test, y_test)

[[10016  1279]
 [   53   174]]


0.910715824904006

In [49]:
# Feature importance report: SGD coefficient next to random forest importance.
# Each record is (|coef|, feature index, label, coef, rf importance), all rounded
# to 4 decimals, and records are ordered by decreasing |coef|.
sgd_coefs = np.squeeze(sgd_clf.coef_)
rf_importances = np.squeeze(rf_clf_final.feature_importances_)

feat_rep = sorted(
    (
        (round(abs(coef), 4), idx, label, round(coef, 4), round(importance, 4))
        for idx, (label, coef, importance)
        in enumerate(zip(feature_labels, sgd_coefs, rf_importances))
    ),
    reverse=True,
)

# printed as "<index>. <label>: <coef>/<rf importance>"
for _, idx, label, coef, importance in feat_rep:
    print("{:2}. {:20.20}: {:5.2f}/{:4.3f} ".format(idx, label, coef, importance))

20. currency_Britisches :  2.07/0.001 
 7. c_lent_c            : -1.10/0.089 
 8. c_repaid_c          :  0.79/0.072 
 3. cd_repaid_c         :  0.76/0.039 
14. d_impaired1_c       :  0.50/0.107 
 9. c_impaired1_c       :  0.44/0.096 
13. d_repaid_c          : -0.42/0.067 
12. cd_lent_c           : -0.41/0.066 
 2. cd_lent_c           : -0.41/0.058 
10. c_pastdue90_c       : -0.37/0.024 
 0. has_purchase        : -0.28/0.003 
24. invoice_date        :  0.26/0.056 
17. d_we_payment_share  :  0.23/0.018 
15. d_pastdue90_c       : -0.21/0.001 
11. c_trend_a           : -0.19/0.035 
 5. cd_pastdue90_c      :  0.19/0.002 
 1. dd_value_date       : -0.13/0.004 
23. currency_US-Dollar  : -0.13/0.000 
21. currency_Euro       :  0.12/0.004 
19. purchase_amount     :  0.10/0.022 
 6. cd_trend_a          :  0.10/0.024 
22. currency_Schweizer F: -0.04/0.006 
16. d_trend_a           :  0.02/0.021 
 4. cd_impaired1_c      : -0.01/0.149 
18. invoice_amount      :  0.00/0.036 


In [None]:
#saving sgd
#if (int(cfg["modelsave"])):
#        logpostfix = str(int(dt.datetime.now().timestamp()))[4:] #uniqueish name postfix
#        filename = datafolder + cfg["modelfolder"] + "linear" + logpostfix + ".pickle"
#        log.info("Saving model to " + filename)
#        with open(filename, "wb") as pickle_file:
#            pickle.dump(sgd_clf, pickle_file)


#saving rf
#if (int(cfg["modelsave"])):
#        logpostfix = str(int(dt.datetime.now().timestamp()))[4:] #uniqueish name postfix
#        filename = datafolder + cfg["modelfolder"] + "forest" + logpostfix + ".pickle"
#        log.info("Saving model to " + filename)
#        with open(filename, "wb") as pickle_file:
#            pickle.dump(forest_clf, pickle_file)