# Bond graph features model - Pastdue90 - Exploration, calibration, test

This notebook uses as inputs the outputs from the pipeline ('08_preproc_pipeline_2.ipynb' notebook) and performs model calibration and general exploration for the transaction credit events prediction.

In [1]:
import pandas as pd
import numpy as np
import pickle
from os import environ

from sklearn.model_selection import cross_val_predict, RandomizedSearchCV
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, confusion_matrix, make_scorer, accuracy_score
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from visualization_utils import *
from models_utils import *

from bokeh.io import show, output_notebook
output_notebook()

import datetime as dt

### TRAIN Data import

In [2]:
#importing data
user=environ["USERNAME"]
prefix = 'shuffle_p180_bg_'
trainfile = '_traindata'
testfile = '_testdata'
postfix = '_190721_1655'

#from home
datafolder = ".."+"/data/preproc_traintest/"

[X_train, y_train, feature_labels] = pd.read_pickle(datafolder+prefix+trainfile+postfix+'.pkl') 
[X_test, y_test, feature_labels] = pd.read_pickle(datafolder+prefix+testfile+postfix+'.pkl') 

In [3]:
X_train_df = pd.DataFrame(X_train, columns=feature_labels)
X_train_df.head()

Unnamed: 0,has_purchase,dd_value_date,cd_lent_c,cd_repaid_c,cd_impaired1_c,cd_pastdue90_c,cd_trend_a,c_lent_c,c_repaid_c,c_impaired1_c,...,flow_shock_p180,p180_d_node_flow,p180_energy,invoice_amount,purchase_amount,currency_Britisches Pfund,currency_Euro,currency_Schweizer Franken,currency_US-Dollar,invoice_date
0,0.478098,-0.145273,-0.528606,-0.49391,-0.17567,-0.214037,-1.385604,-1.00996,-0.893308,-0.554907,...,-0.376595,-0.301636,-0.221484,0.967374,0.678658,0.0,0.0,1.0,0.0,-1.870522
1,0.478098,-0.145273,-0.302102,-0.49391,-0.17567,-0.214037,-0.960531,-0.938481,-0.893308,-0.554907,...,0.87016,-0.301636,-0.261344,2.014475,0.905137,0.0,1.0,0.0,0.0,-0.606708
2,0.478098,-0.145273,-0.457419,-0.368922,-0.17567,-0.214037,0.997265,-0.922398,-0.788296,-0.554907,...,-0.375489,-0.301636,-0.221484,-0.012316,0.466758,0.0,0.0,1.0,0.0,0.315048
3,0.478098,-0.145273,-0.50272,-0.46058,-0.17567,-0.214037,1.624837,-0.98941,-0.861077,-0.554907,...,-0.371151,-0.301636,-0.221484,-0.829188,0.290076,0.0,0.0,1.0,0.0,0.675109
4,0.478098,-0.145273,-0.522134,-0.443915,-0.17567,-0.214037,1.315468,0.274891,0.239989,0.079527,...,-0.196853,-0.301636,-0.221484,1.362819,0.764189,0.0,0.0,1.0,0.0,0.523883


In [4]:
X_train_df.columns

Index(['has_purchase', 'dd_value_date', 'cd_lent_c', 'cd_repaid_c',
       'cd_impaired1_c', 'cd_pastdue90_c', 'cd_trend_a', 'c_lent_c',
       'c_repaid_c', 'c_impaired1_c', 'c_pastdue90_c', 'c_trend_a',
       'd_repaid_c', 'd_impaired1_c', 'd_pastdue90_c', 'd_trend_a',
       'd_we_payment_share', 'flow_shock_imp1', 'imp_c_node_eff', 'imp_energy',
       'imp_d_node_flow', 'flow_shock_p90', 'p90_c_node_eff', 'p90_energy',
       'p90_d_node_flow', 'flow_shock_p180', 'p180_d_node_flow', 'p180_energy',
       'invoice_amount', 'purchase_amount', 'currency_Britisches Pfund',
       'currency_Euro', 'currency_Schweizer Franken', 'currency_US-Dollar',
       'invoice_date'],
      dtype='object')

In [5]:
X_train_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
has_purchase,46095.0,-2.067704e-08,1.000011,-2.091622,0.478098,0.478098,0.478098,0.478098
dd_value_date,46095.0,3.199733e-09,1.000011,-0.145273,-0.145273,-0.145273,-0.145273,85.601524
cd_lent_c,46095.0,1.108227e-09,1.000011,-0.567435,-0.528606,-0.431533,-0.017355,4.393315
cd_repaid_c,46095.0,-1.870823e-08,1.000011,-0.49391,-0.49391,-0.418917,-0.093949,4.37709
cd_impaired1_c,46095.0,-1.428856e-10,1.000011,-0.17567,-0.17567,-0.17567,-0.17567,8.90372
cd_pastdue90_c,46095.0,-4.453377e-09,1.000011,-0.214037,-0.214037,-0.214037,-0.214037,6.398803
cd_trend_a,46095.0,-1.419782e-08,1.000011,-3.920882,-0.678948,-0.03671,0.712322,3.604046
c_lent_c,46095.0,-2.6985e-08,1.000011,-1.037659,-0.801775,-0.315712,0.566171,3.869435
c_repaid_c,46095.0,1.823345e-08,1.000011,-0.893308,-0.788296,-0.336017,0.435457,4.005999
c_impaired1_c,46095.0,1.572906e-08,1.000011,-0.554907,-0.554907,-0.343429,0.079527,4.80632


### SGD Classifier

In [6]:
#Linear model Stochastic Gradient Descent
sgd_clf = SGDClassifier(random_state=42, max_iter=250, loss='log', tol=0.0001) 
sgd_clf.fit(X_train, y_train)
sgd_clf_diag = model_diag(sgd_clf, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.844
Confusion matrix: 
[[0.99246 0.00754]
 [0.05825 0.00803]]


In [7]:
y_score_sgd = sgd_clf.predict(X_test)
cm = confusion_matrix(y_test, y_score_sgd)
print(cm)
sgd_test_diag = model_oostest(sgd_clf,X_test, y_test)
sgd_test_auc = plot_rocs([sgd_clf_diag, sgd_test_diag], p_width=600, p_height=600, model_appendix=['SGD - 5folds','SGD - test'])
show(sgd_test_auc)

[[10718    69]
 [  645    90]]
[[10718    69]
 [  645    90]]
AUC 0.848


In [8]:
#save_sk_model(sgd_clf, "../data/models/", 'sgd_p180', 'enriched')

Saving model to ../data/models/enriched_sgd_p180_190711_1134.pkl


In [9]:
scoring = {"AUC": "roc_auc", "Accuracy": make_scorer(accuracy_score)}

# max Number of iterations
max_iter = [100, 180, 200, 250, 300, 320, 350] 
# learning rate
learning_rate = ['constant', 'optimal', 'adaptive']
#eta
eta0 = [0.00001, 0.0001, 0.001, 0.01]
#loss
loss = ['log', 'hinge', 'perceptron']

sgd_random_grid = {'max_iter': max_iter,
               'learning_rate': learning_rate,
                  'eta0': eta0,
                  'loss':loss}

sgd = SGDClassifier(tol=0.0001)

sgd_rsearch = RandomizedSearchCV(estimator = sgd, param_distributions = sgd_random_grid, n_iter = 200, cv = 3, 
                                 verbose=2, random_state=42, n_jobs =7, scoring=scoring, refit='AUC')
# Fit the random search model
sgd_rsearch.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:   11.2s
[Parallel(n_jobs=7)]: Done 148 tasks      | elapsed:   29.9s
[Parallel(n_jobs=7)]: Done 351 tasks      | elapsed:  1.0min
[Parallel(n_jobs=7)]: Done 600 out of 600 | elapsed:  1.7min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=0.0001,
       validation_fraction=0.1, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=200, n_jobs=7,
          param_distributions={'max_iter': [100, 180, 200, 250, 300, 320, 350], 'learning_rate': ['constant', 'optimal', 'adaptive'], 'eta0': [1e-05, 0.0001, 0.001, 0.01], 'loss': ['log', 'hinge', 'perceptron']},
          pre_dispatch='2*n_jobs', random_state=42, refit='AUC',
          return_train_score='warn',
          scoring={'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)},
          verbose=2)

In [10]:
sgd_rsearch.best_params_

{'max_iter': 300, 'loss': 'log', 'learning_rate': 'adaptive', 'eta0': 0.01}

In [11]:
#Linear model Stochastic Gradient Descent
sgd_rs = SGDClassifier(random_state=42, max_iter=300, loss='log', learning_rate='adaptive', eta0=0.01, tol=0.0001) 
sgd_rs.fit(X_train, y_train)
sgd_rs_diag = model_diag(sgd_rs, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.847
Confusion matrix: 
[[0.99378 0.00622]
 [0.05885 0.00743]]


In [12]:
sgd_test_diag_rs = model_oostest(sgd_rs,X_test, y_test)
sgd_test_auc_rs = plot_rocs([sgd_rs_diag, sgd_test_diag_rs], p_width=600, p_height=600, model_appendix=['SGD - 5folds','SGD - test'])
show(sgd_test_auc_rs)

[[10731    56]
 [  654    81]]
AUC 0.850


### Random Forest Classifier

In [13]:
#random forest - benchmark
n_estimators = 200
max_leaf_nodes = 40
rf_clf = RandomForestClassifier(random_state=42,
                               n_estimators=n_estimators,
                               max_leaf_nodes=max_leaf_nodes,
                               class_weight="balanced",
                               n_jobs=7)

rf_clf.fit(X_train, y_train)
rf_clf_diag = model_diag(rf_clf, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.969
Confusion matrix: 
[[0.92427 0.07573]
 [0.00611 0.06017]]


In [14]:
rf_test_diag = model_oostest(rf_clf,X_test, y_test)
rf_test_auc = plot_rocs([rf_clf_diag, rf_test_diag], p_width=600, p_height=600, model_appendix=['RF - 5folds','RF - test'])
show(rf_test_auc)

[[9927  860]
 [  57  678]]
AUC 0.970


In [15]:
scoring = {"AUC": "roc_auc", "Accuracy": make_scorer(accuracy_score)}

# Number of trees in random forest
n_estimators = [150, 180, 200, 250, 280, 300, 350] 
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 10, 20]
# Maximum number of levels in tree
max_depth = [100,200, None]
#Max leaf nodes
max_leaf_nodes = [10,20,40,60]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
              'max_leaf_nodes':max_leaf_nodes}
print(random_grid)

{'n_estimators': [150, 180, 200, 250, 280, 300, 350], 'max_features': ['auto', 'sqrt', 10, 20], 'max_depth': [100, 200, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False], 'max_leaf_nodes': [10, 20, 40, 60]}


In [16]:
rf = RandomForestClassifier() #base model
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 80, cv = 3, 
                               verbose=2, random_state=42, n_jobs =7, scoring=scoring, refit='AUC')
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:  2.2min
[Parallel(n_jobs=7)]: Done 148 tasks      | elapsed: 12.2min
[Parallel(n_jobs=7)]: Done 240 out of 240 | elapsed: 17.1min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=80, n_jobs=7,
          param_distributions={'n_estimators': [150, 180, 200, 250, 280, 300, 350], 'max_features': ['auto', 'sqrt', 10, 20], 'max_depth': [100, 200, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False], 'max_leaf_nodes': [10, 20, 40, 60]},
          pre_dispatch='2*n_jobs', random_state=42, refit='AUC',
          return_train_score='warn',
          scoring={'AUC': 'roc_auc', 'Accurac

In [17]:
results = rf_random.cv_results_
print("Best AUC {:.3f}".format(results["mean_test_AUC"].max()))
rf_random.best_params_

Best AUC 0.968


{'n_estimators': 280,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_leaf_nodes': 60,
 'max_features': 10,
 'max_depth': 100,
 'bootstrap': False}

In [20]:
#first randomized search
rs1 = {'n_estimators': 280,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_leaf_nodes':60,
 'max_depth': 100,
 'bootstrap': True}

#second randomized search 
rs2 = {'n_estimators': 150,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_leaf_nodes': 60,
 'max_features': 20,
 'max_depth': 100,
 'bootstrap': False}

#third randomized search 
rs3 = {'n_estimators': 280,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_leaf_nodes': 60,
 'max_features': 20,
 'max_depth': 200,
 'bootstrap': True}

#fourth randomized search_p90_bgt
rs4={'n_estimators': 280,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_leaf_nodes': 60,
 'max_features': 10,
 'max_depth': 100,
 'bootstrap': False}


#Best AUC 0.968 search_p180_bgt
rs5={'n_estimators': 280,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_leaf_nodes': 60,
 'max_features': 10,
 'max_depth': 100,
 'bootstrap': False}

In [21]:
#random_forest from rand
n_estimators=rs5['n_estimators']
min_samples_split=rs5['min_samples_split']
min_samples_leaf=rs5['min_samples_leaf']
max_leaf_nodes=rs5['max_leaf_nodes']
max_features=rs5['max_features']
max_depth=rs5['max_depth']
bootstrap=rs5['bootstrap']




rf_rs = RandomForestClassifier(random_state=42,
                               n_estimators=n_estimators,
                                max_features=max_features,
                                min_samples_split=min_samples_split,
                               min_samples_leaf=min_samples_leaf,
                               class_weight="balanced", bootstrap=bootstrap,
                               n_jobs=7)
rf_rs.fit(X_train, y_train)
rf_rs_diag = model_diag(rf_rs, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.988
Confusion matrix: 
[[0.99123 0.00877]
 [0.00939 0.05688]]


In [22]:
y_score_rf_rs = rf_rs.predict(X_test)
cm = confusion_matrix(y_test, y_score_rf_rs)
rf_rs_test_diag = model_oostest(rf_rs,X_test, y_test)
rf_rs_test_auc = plot_rocs([rf_rs_diag, rf_rs_test_diag], p_width=600, p_height=600, model_appendix=['RF - 5folds','RF - test'])
show(rf_rs_test_auc)

[[10694    93]
 [   83   652]]
AUC 0.989


In [23]:
#save_sk_model(rf_rs, "../data/models/", 'rf_p180', 'enriched')

Saving model to ../data/models/enriched_rf_p180_190711_120.pkl


### Training Performance Comparison

In [24]:
rf_sgd_roc = plot_rocs([sgd_clf_diag, rf_rs_diag], p_width=600, p_height=600, model_appendix=['SGD', 'Random Forest'])
show(rf_sgd_roc)

### Test Performance Comparison

In [25]:
rf_sgd_roc = plot_rocs([sgd_test_diag, rf_rs_test_diag], p_width=600, p_height=600, model_appendix=['SGD', 'Random Forest'])
show(rf_sgd_roc)

## REPEATING THE EXPERIMENT IN TIME MODE

In [26]:
prefix = 'time_2018-02-20_p180_bgt_'
postfix = '_190710_1539'
[X_train, y_train, feature_labels] = pd.read_pickle(datafolder+prefix+trainfile+postfix+'.pkl') 
[X_test, y_test, feature_labels] = pd.read_pickle(datafolder+prefix+testfile+postfix+'.pkl') 

### SGD Classifier

In [27]:
#Linear model Stochastic Gradient Descent
sgd_clf = SGDClassifier(random_state=42, max_iter=250, loss='log', tol=0.0001) 
sgd_clf.fit(X_train, y_train)
sgd_clf_diag = model_diag(sgd_clf, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.798
Confusion matrix: 
[[0.98222 0.01778]
 [0.07664 0.01542]]


In [29]:
y_score_sgd = sgd_clf.predict(X_test)
cm = confusion_matrix(y_test, y_score_sgd)
print(cm)
sgd_test_diag = model_oostest(sgd_clf,X_test, y_test)
sgd_test_auc = plot_rocs([sgd_clf_diag, sgd_test_diag], p_width=600, p_height=600, model_appendix=['SGD - 5folds','SGD - test'])
show(sgd_test_auc)

[[14922   564]
 [   52     2]]
[[14922   564]
 [   52     2]]
AUC 0.733


In [30]:
#save_sk_model(sgd_clf, "../data/models/", 'sgd_p180_time', 'benchmark')

Saving model to ../data/models/benchmark_sgd_p180_time_190711_123.pkl


In [31]:
scoring = {"AUC": "roc_auc", "Accuracy": make_scorer(accuracy_score)}

# max Number of iterations
max_iter = [100, 180, 200, 250, 300, 320, 350] 
# learning rate
learning_rate = ['constant', 'optimal', 'adaptive']
#eta
eta0 = [0.00001, 0.0001, 0.001, 0.01]
#loss
loss = ['log']

sgd_random_grid = {'max_iter': max_iter,
               'learning_rate': learning_rate,
                  'eta0': eta0,
                  'loss':loss}

sgd = SGDClassifier(tol=0.0001)

sgd_rsearch = RandomizedSearchCV(estimator = sgd, param_distributions = sgd_random_grid, n_iter = 200, cv = 3, verbose=2, 
                                 random_state=42, n_jobs =7, scoring=scoring, refit='AUC')
# Fit the random search model
sgd_rsearch.fit(X_train, y_train)

[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.


Fitting 3 folds for each of 168 candidates, totalling 504 fits


[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:    7.4s
[Parallel(n_jobs=7)]: Done 148 tasks      | elapsed:   31.0s
[Parallel(n_jobs=7)]: Done 351 tasks      | elapsed:  1.1min
[Parallel(n_jobs=7)]: Done 504 out of 504 | elapsed:  1.4min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=0.0001,
       validation_fraction=0.1, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=200, n_jobs=7,
          param_distributions={'max_iter': [100, 180, 200, 250, 300, 320, 350], 'learning_rate': ['constant', 'optimal', 'adaptive'], 'eta0': [1e-05, 0.0001, 0.001, 0.01], 'loss': ['log', 'perceptron']},
          pre_dispatch='2*n_jobs', random_state=42, refit='AUC',
          return_train_score='warn',
          scoring={'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)},
          verbose=2)

In [32]:
results = sgd_rsearch.cv_results_
print("Best AUC {:.3f}".format(results["mean_test_AUC"].max()))
sgd_rsearch.best_params_

Best AUC 0.798


{'max_iter': 300,
 'loss': 'perceptron',
 'learning_rate': 'optimal',
 'eta0': 0.0001}

In [35]:
#Linear model Stochastic Gradient Descent
sgd_rs = SGDClassifier(random_state=42, max_iter=300, loss='log', learning_rate='optimal', eta0=0.0001, tol=0.0001) 
sgd_rs.fit(X_train, y_train)
sgd_rs_diag = model_diag(sgd_rs, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.798
Confusion matrix: 
[[0.98222 0.01778]
 [0.07664 0.01542]]


In [36]:
sgd_test_diag_rs = model_oostest(sgd_rs,X_test, y_test)
sgd_test_auc_rs = plot_rocs([sgd_rs_diag, sgd_test_diag_rs], p_width=600, p_height=600, model_appendix=['SGD - 5folds','SGD - test'])
show(sgd_test_auc_rs)

[[14922   564]
 [   52     2]]
AUC 0.733


### Random Forest Classifier

In [37]:
#random forest - benchmark
n_estimators = 200
max_leaf_nodes = 40
rf_clf = RandomForestClassifier(random_state=42,
                               n_estimators=n_estimators,
                               max_leaf_nodes=max_leaf_nodes,
                               class_weight="balanced",
                               n_jobs=7)

rf_clf.fit(X_train, y_train)
rf_clf_diag = model_diag(rf_clf, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.838
Confusion matrix: 
[[0.86852 0.13148]
 [0.03184 0.06021]]


In [38]:
rf_test_diag = model_oostest(rf_clf,X_test, y_test)
rf_test_auc = plot_rocs([rf_clf_diag, rf_test_diag], p_width=600, p_height=600, model_appendix=['RF - 5folds','RF - test'])
show(rf_test_auc)

[[13083  2403]
 [   38    16]]
AUC 0.700


In [39]:
# Number of trees in random forest
n_estimators = [150, 180, 200, 250, 280, 300, 350] 
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 10, 20]
# Maximum number of levels in tree
max_depth = [100,200, None]
#Max leaf nodes
max_leaf_nodes = [10,20,40,60]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
              'max_leaf_nodes':max_leaf_nodes}
print(random_grid)

{'n_estimators': [150, 180, 200, 250, 280, 300, 350], 'max_features': ['auto', 'sqrt', 10, 20], 'max_depth': [100, 200, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False], 'max_leaf_nodes': [10, 20, 40, 60]}


In [40]:
rf = RandomForestClassifier() #base model
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, 
                               random_state=42, n_jobs =7, scoring=scoring, refit='AUC')
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:  1.8min
[Parallel(n_jobs=7)]: Done 148 tasks      | elapsed: 10.0min
[Parallel(n_jobs=7)]: Done 300 out of 300 | elapsed: 15.3min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=7,
          param_distributions={'n_estimators': [150, 180, 200, 250, 280, 300, 350], 'max_features': ['auto', 'sqrt', 10, 20], 'max_depth': [100, 200, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False], 'max_leaf_nodes': [10, 20, 40, 60]},
          pre_dispatch='2*n_jobs', random_state=42, refit='AUC',
          return_train_score='warn',
          scoring={'AUC': 'roc_auc', 'Accura

In [41]:
results = rf_random.cv_results_
print("Best AUC {:.3f}".format(results["mean_test_AUC"].max()))
rf_random.best_params_

Best AUC 0.797


{'n_estimators': 300,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_leaf_nodes': 40,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': True}

In [42]:
#first randomized search_imp
rs1 = {'n_estimators': 280,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_leaf_nodes':60,
 'max_depth': 100,
 'bootstrap': True}

#second randomized search_imp
rs2 = {'n_estimators': 150,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_leaf_nodes': 60,
 'max_features': 20,
 'max_depth': 100,
 'bootstrap': False}

#third randomized search_p90
rs3 = {'n_estimators': 150,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_leaf_nodes': 60,
 'max_features': 20,
 'max_depth': 100,
 'bootstrap': False}

#fourth randomized search_p90_time
rs4 = {'n_estimators': 300,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_leaf_nodes': 10,
 'max_features': 'auto',
 'max_depth': 200,
 'bootstrap': True}

#fifth randomized search_imp_time
rs5 = {'n_estimators': 200,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_leaf_nodes': 10,
 'max_features': 'sqrt',
 'max_depth': 100,
 'bootstrap': True}

#sixth randomized search_imp_time bgt
rs6 = {'n_estimators': 280,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_leaf_nodes': 10,
 'max_features': 10,
 'max_depth': 100,
 'bootstrap': False}

#seventh randomized search_p90_time bgt
rs7={'n_estimators': 200,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_leaf_nodes': 20,
 'max_features': 10,
 'max_depth': None,
 'bootstrap': True}

#Best AUC 0.785 search_p90_time bgt
rs8 = {'n_estimators': 250,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_leaf_nodes': 20,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': False}

#Best AUC 0.797 search_p180_time bgt
rs9={'n_estimators': 300,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_leaf_nodes': 40,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': True}

In [43]:
#random_forest from rand
n_estimators=rs9['n_estimators']
min_samples_split=rs9['min_samples_split']
min_samples_leaf=rs9['min_samples_leaf']
max_leaf_nodes=rs9['max_leaf_nodes']
max_features=rs9['max_features']
max_depth=rs9['max_depth']
bootstrap=rs9['bootstrap']




rf_rs = RandomForestClassifier(random_state=42,
                               n_estimators=n_estimators,
                                max_features=max_features,
                                min_samples_split=min_samples_split,
                               min_samples_leaf=min_samples_leaf,
                               class_weight="balanced", bootstrap=bootstrap,
                               n_jobs=7)
rf_rs.fit(X_train, y_train)
rf_rs_diag = model_diag(rf_rs, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.783
Confusion matrix: 
[[0.95879 0.04121]
 [0.05564 0.03641]]


In [44]:
y_score_rf_rs = rf_rs.predict(X_test)
rf_rs_test_diag = model_oostest(rf_rs,X_test, y_test)
rf_rs_test_auc = plot_rocs([rf_rs_diag, rf_rs_test_diag], p_width=600, p_height=600, model_appendix=['RF - 5folds','RF - test'])
show(rf_rs_test_auc)

[[15161   325]
 [   33    21]]
AUC 0.775


In [45]:
#save_sk_model(rf_rs, "../data/models/", 'rf_p180_time', 'enriched')

Saving model to ../data/models/enriched_rf_p180_time_190711_1330.pkl


### Training Performance Comparison

In [46]:
rf_sgd_roc = plot_rocs([sgd_clf_diag, rf_rs_diag], p_width=600, p_height=600, model_appendix=['SGD', 'Random Forest'])
show(rf_sgd_roc)

### Test Performance Comparison

In [47]:
rf_sgd_roc = plot_rocs([sgd_test_diag, rf_rs_test_diag], p_width=600, p_height=600, model_appendix=['SGD', 'Random Forest'])
show(rf_sgd_roc)