# Benchmark models - Exploration, calibration, test

This notebook uses as inputs the outputs from the pipeline ('05_preproc_pipeline_1.ipynb' notebook) and performs model calibration and general exploration for the transaction credit events prediction.

In [1]:
import pandas as pd
import numpy as np
import pickle
from os import environ

from sklearn.model_selection import cross_val_predict, RandomizedSearchCV
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, confusion_matrix
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from visualization_utils import *
from models_utils import *

from bokeh.io import show, output_notebook
output_notebook()

import datetime as dt

### TRAIN Data import

In [2]:
#importing data
user=environ["USERNAME"]
prefix = 'shuffle_imp_'
trainfile = '_traindata'
testfile = '_testdata'
postfix = '_19072_750'

#from home
datafolder = ".."+"/data/preproc_traintest/"

[X_train, y_train, feature_labels] = pd.read_pickle(datafolder+prefix+trainfile+postfix+'.pkl') 
[X_test, y_test, feature_labels] = pd.read_pickle(datafolder+prefix+testfile+postfix+'.pkl') 

In [3]:
X_train_df = pd.DataFrame(X_train, columns=feature_labels)
X_train_df.head()

Unnamed: 0,has_purchase,dd_value_date,cd_lent_c,cd_repaid_c,cd_impaired1_c,cd_pastdue90_c,cd_trend_a,c_lent_c,c_repaid_c,c_impaired1_c,...,d_pastdue90_c,d_trend_a,d_we_payment_share,invoice_amount,purchase_amount,currency_Britisches Pfund,currency_Euro,currency_Schweizer Franken,currency_US-Dollar,invoice_date
0,0.478098,-0.145273,-0.528606,-0.49391,-0.17567,-0.214037,-1.385604,-1.00996,-0.893308,-0.554907,...,-0.215463,-1.378706,0.11732,0.967374,0.678658,0.0,0.0,1.0,0.0,-1.870522
1,0.478098,-0.145273,-0.302102,-0.49391,-0.17567,-0.214037,-0.960531,-0.938481,-0.893308,-0.554907,...,-0.215463,-0.950582,0.11732,2.014475,0.905137,0.0,1.0,0.0,0.0,-0.606708
2,0.478098,-0.145273,-0.457419,-0.368922,-0.17567,-0.214037,0.997265,-0.922398,-0.788296,-0.554907,...,-0.215463,1.021271,-0.192621,-0.012316,0.466758,0.0,0.0,1.0,0.0,0.315048
3,0.478098,-0.145273,-0.50272,-0.46058,-0.17567,-0.214037,1.624837,-0.98941,-0.861077,-0.554907,...,-0.215463,1.65335,-0.192621,-0.829188,0.290076,0.0,0.0,1.0,0.0,0.675109
4,0.478098,-0.145273,-0.522134,-0.443915,-0.17567,-0.214037,1.315468,0.274891,0.239989,0.079527,...,-0.215463,1.341759,-0.192621,1.362819,0.764189,0.0,0.0,1.0,0.0,0.523883


In [4]:
X_train_df.columns

Index(['has_purchase', 'dd_value_date', 'cd_lent_c', 'cd_repaid_c',
       'cd_impaired1_c', 'cd_pastdue90_c', 'cd_trend_a', 'c_lent_c',
       'c_repaid_c', 'c_impaired1_c', 'c_pastdue90_c', 'c_trend_a',
       'cd_lent_c', 'd_repaid_c', 'd_impaired1_c', 'd_pastdue90_c',
       'd_trend_a', 'd_we_payment_share', 'invoice_amount', 'purchase_amount',
       'currency_Britisches Pfund', 'currency_Euro',
       'currency_Schweizer Franken', 'currency_US-Dollar', 'invoice_date'],
      dtype='object')

In [5]:
X_train_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
has_purchase,46095.0,-2.067704e-08,1.000011,-2.091622,0.478098,0.478098,0.478098,0.478098
dd_value_date,46095.0,3.199733e-09,1.000011,-0.145273,-0.145273,-0.145273,-0.145273,85.601524
cd_lent_c,46095.0,1.108227e-09,1.000011,-0.567435,-0.528606,-0.431533,-0.017355,4.393315
cd_repaid_c,46095.0,-1.870823e-08,1.000011,-0.49391,-0.49391,-0.418917,-0.093949,4.37709
cd_impaired1_c,46095.0,-1.428856e-10,1.000011,-0.17567,-0.17567,-0.17567,-0.17567,8.90372
cd_pastdue90_c,46095.0,-4.453377e-09,1.000011,-0.214037,-0.214037,-0.214037,-0.214037,6.398803
cd_trend_a,46095.0,-1.419782e-08,1.000011,-3.920882,-0.678948,-0.03671,0.712322,3.604046
c_lent_c,46095.0,-2.6985e-08,1.000011,-1.037659,-0.801775,-0.315712,0.566171,3.869435
c_repaid_c,46095.0,1.823345e-08,1.000011,-0.893308,-0.788296,-0.336017,0.435457,4.005999
c_impaired1_c,46095.0,1.572906e-08,1.000011,-0.554907,-0.554907,-0.343429,0.079527,4.80632


### SGD Classifier

In [6]:
#Linear model Stochastic Gradient Descent
sgd_clf = SGDClassifier(random_state=42, max_iter=250, loss='log', tol=0.0001) 
sgd_clf.fit(X_train, y_train)
sgd_clf_diag = model_diag(sgd_clf, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.793
Confusion matrix: 
[[0.99883 0.00117]
 [0.01901 0.00244]]


In [7]:
y_score_sgd = sgd_clf.predict(X_test)
cm = confusion_matrix(y_test, y_score_sgd)
print(cm)
sgd_test_diag = model_oostest(sgd_clf,X_test, y_test)
sgd_test_auc = plot_rocs([sgd_clf_diag, sgd_test_diag], p_width=600, p_height=600, model_appendix=['SGD - 5folds','SGD - test'])
show(sgd_test_auc)

[[11266    29]
 [  196    31]]


In [8]:
#save_sk_model(sgd_clf, "../data/models/", 'sgd', 'benchmark')

Saving model to ../data/models/benchmark_sgd_clf_19077_931


In [8]:
# max Number of iterations
max_iter = [100, 180, 200, 250, 300, 320, 350] 
# learning rate
learning_rate = ['constant', 'optimal', 'adaptive']
#eta
eta0 = [0.00001, 0.0001, 0.001, 0.01]
#loss
loss = ['log', 'hinge', 'perceptron']

sgd_random_grid = {'max_iter': max_iter,
               'learning_rate': learning_rate,
                  'eta0': eta0,
                  'loss':loss}

sgd = SGDClassifier(tol=0.0001)

sgd_rsearch = RandomizedSearchCV(estimator = sgd, param_distributions = sgd_random_grid, n_iter = 200, cv = 3, verbose=2, random_state=42, n_jobs =7)
# Fit the random search model
sgd_rsearch.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:    3.5s
[Parallel(n_jobs=7)]: Done 148 tasks      | elapsed:    9.9s
[Parallel(n_jobs=7)]: Done 351 tasks      | elapsed:   21.3s
[Parallel(n_jobs=7)]: Done 600 out of 600 | elapsed:   37.6s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=0.0001,
       validation_fraction=0.1, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=200, n_jobs=7,
          param_distributions={'max_iter': [100, 180, 200, 250, 300, 320, 350], 'learning_rate': ['constant', 'optimal', 'adaptive'], 'eta0': [1e-05, 0.0001, 0.001, 0.01], 'loss': ['log', 'hinge', 'perceptron']},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [None]:
sgd_rsearch.best_params_

In [11]:
#Linear model Stochastic Gradient Descent
sgd_rs = SGDClassifier(random_state=42, max_iter=300, loss='log', learning_rate='adaptive', eta0=0.01, tol=0.0001) 
sgd_rs.fit(X_train, y_train)
sgd_rs_diag = model_diag(sgd_rs, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.797
Confusion matrix: 
[[0.99878 0.00122]
 [0.01888 0.00257]]


In [13]:
y_score_sgd_rs = sgd_rs.predict(X_test)
cm = confusion_matrix(y_test, y_score_sgd_rs)
print(cm)
sgd_test_diag_rs = model_oostest(sgd_rs,X_test, y_test)
sgd_test_auc_rs = plot_rocs([sgd_rs_diag, sgd_test_diag_rs], p_width=600, p_height=600, model_appendix=['SGD - 5folds','SGD - test'])
show(sgd_test_auc_rs)

[[11284    11]
 [  201    26]]


### Random Forest Classifier

In [14]:
#random forest - benchmark
n_estimators = 200
max_leaf_nodes = 40
rf_clf = RandomForestClassifier(random_state=42,
                               n_estimators=n_estimators,
                               max_leaf_nodes=max_leaf_nodes,
                               class_weight="balanced",
                               n_jobs=7)

rf_clf.fit(X_train, y_train)
rf_clf_diag = model_diag(rf_clf, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.921
Confusion matrix: 
[[0.89408 0.10592]
 [0.00439 0.01706]]


In [16]:
y_score_rf = rf_clf.predict(X_test)
cm = confusion_matrix(y_test, y_score_rf)
print(cm)
rf_test_diag = model_oostest(rf_clf,X_test, y_test)
rf_test_auc = plot_rocs([rf_clf_diag, rf_test_diag], p_width=600, p_height=600, model_appendix=['RF - 5folds','RF - test'])
show(rf_test_auc)

[[10024  1271]
 [   53   174]]


In [7]:
# Number of trees in random forest
n_estimators = [150, 180, 200, 250, 280, 300, 350] 
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 10, 20]
# Maximum number of levels in tree
max_depth = [100,200, None]
#Max leaf nodes
max_leaf_nodes = [10,20,40,60]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
              'max_leaf_nodes':max_leaf_nodes}
print(random_grid)

{'n_estimators': [80, 125, 150, 180, 200, 250, 280, 300, 320, 350], 'max_features': ['auto', 'sqrt', 10, 15, 20], 'max_depth': [10, 20, 30, 40, 50, 60, 80, 90, 100, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [8]:
rf = RandomForestClassifier() #base model
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 80, cv = 3, verbose=2, random_state=42, n_jobs =7)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 24.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 45.8min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [80, 125, 150, 180, 200, 250, 280, 300, 320, 350], 'max_features': ['auto', 'sqrt', 10, 15, 20], 'max_depth': [10, 20, 30, 40, 50, 60, 80, 90, 100, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [10]:
rf_random.best_params_

{'n_estimators': 280,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 100,
 'bootstrap': True}

In [9]:
#first randomized search
rs1 = {'n_estimators': 280,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_leaf_nodes':60,
 'max_depth': 100,
 'bootstrap': True}

#second randomized search 
rs2 = {'n_estimators': 150,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_leaf_nodes': 60,
 'max_features': 20,
 'max_depth': 100,
 'bootstrap': False}

In [10]:
#random_forest from rand
n_estimators=rs1['n_estimators']
min_samples_split=rs1['min_samples_split']
min_samples_leaf=rs1['min_samples_leaf']
max_leaf_nodes=rs1['max_leaf_nodes']
max_features=rs1['max_features']
max_depth=rs1['max_depth']
bootstrap=rs1['bootstrap']




rf_rs = RandomForestClassifier(random_state=42,
                               n_estimators=n_estimators,
                                max_features=max_features,
                                min_samples_split=min_samples_split,
                               min_samples_leaf=min_samples_leaf,
                               class_weight="balanced", bootstrap=bootstrap,
                               n_jobs=7)
rf_rs.fit(X_train, y_train)
rf_rs_diag = model_diag(rf_rs, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.930
Confusion matrix: 
[[0.99548 0.00452]
 [0.0129  0.00855]]


In [11]:
y_score_rf_rs = rf_rs.predict(X_test)
cm = confusion_matrix(y_test, y_score_rf_rs)
print(cm)
rf_rs_test_diag = model_oostest(rf_rs,X_test, y_test)
rf_rs_test_auc = plot_rocs([rf_rs_diag, rf_rs_test_diag], p_width=600, p_height=600, model_appendix=['RF - 5folds','RF - test'])
show(rf_rs_test_auc)

[[11243    52]
 [  141    86]]


In [24]:
#save_sk_model(rf_rs, "../data/models/", 'rf', 'benchmark')

Saving model to ../data/models/benchmark_rf_19077_1119.pkl


### Training Performance Comparison

In [12]:
rf_sgd_roc = plot_rocs([sgd_clf_diag, rf_rs_diag], p_width=600, p_height=600, model_appendix=['SGD', 'Random Forest'])
show(rf_sgd_roc)

### Test Performance Comparison

In [13]:
rf_sgd_roc = plot_rocs([sgd_test_diag, rf_rs_test_diag], p_width=600, p_height=600, model_appendix=['SGD', 'Random Forest'])
show(rf_sgd_roc)