# Benchmark model - Pastdue90 - Exploration, calibration, test

This notebook uses as inputs the outputs from the pipeline ('05_preproc_pipeline_1.ipynb' notebook) and performs model calibration and general exploration for the transaction credit events prediction.

In [3]:
import pandas as pd
import numpy as np
import pickle
from os import environ

from sklearn.model_selection import cross_val_predict, RandomizedSearchCV
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, confusion_matrix
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from scripts_viz.visualization_utils import *
from scripts_ml.models_utils import *

from bokeh.io import show, output_notebook
output_notebook()

import datetime as dt

### TRAIN Data import

In [2]:
#importing data
user=environ["USERNAME"]
prefix = 'shuffle_p90_'
trainfile = '_traindata'
testfile = '_testdata'
postfix = '_19072_750'

#from home
datafolder = ".."+"/data/preproc_traintest/"

[X_train, y_train, feature_labels] = pd.read_pickle(datafolder+prefix+trainfile+postfix+'.pkl') 
[X_test, y_test, feature_labels] = pd.read_pickle(datafolder+prefix+testfile+postfix+'.pkl') 

In [3]:
X_train_df = pd.DataFrame(X_train, columns=feature_labels)
X_train_df.head()

Unnamed: 0,has_purchase,dd_value_date,cd_lent_c,cd_repaid_c,cd_impaired1_c,cd_pastdue90_c,cd_trend_a,c_lent_c,c_repaid_c,c_impaired1_c,...,d_pastdue90_c,d_trend_a,d_we_payment_share,invoice_amount,purchase_amount,currency_Britisches Pfund,currency_Euro,currency_Schweizer Franken,currency_US-Dollar,invoice_date
0,0.478098,-0.145273,-0.528606,-0.49391,-0.17567,-0.214037,-1.385604,-1.00996,-0.893308,-0.554907,...,-0.215463,-1.378706,0.11732,0.967374,0.678658,0.0,0.0,1.0,0.0,-1.870522
1,0.478098,-0.145273,-0.302102,-0.49391,-0.17567,-0.214037,-0.960531,-0.938481,-0.893308,-0.554907,...,-0.215463,-0.950582,0.11732,2.014475,0.905137,0.0,1.0,0.0,0.0,-0.606708
2,0.478098,-0.145273,-0.457419,-0.368922,-0.17567,-0.214037,0.997265,-0.922398,-0.788296,-0.554907,...,-0.215463,1.021271,-0.192621,-0.012316,0.466758,0.0,0.0,1.0,0.0,0.315048
3,0.478098,-0.145273,-0.50272,-0.46058,-0.17567,-0.214037,1.624837,-0.98941,-0.861077,-0.554907,...,-0.215463,1.65335,-0.192621,-0.829188,0.290076,0.0,0.0,1.0,0.0,0.675109
4,0.478098,-0.145273,-0.522134,-0.443915,-0.17567,-0.214037,1.315468,0.274891,0.239989,0.079527,...,-0.215463,1.341759,-0.192621,1.362819,0.764189,0.0,0.0,1.0,0.0,0.523883


In [4]:
X_train_df.columns

Index(['has_purchase', 'dd_value_date', 'cd_lent_c', 'cd_repaid_c',
       'cd_impaired1_c', 'cd_pastdue90_c', 'cd_trend_a', 'c_lent_c',
       'c_repaid_c', 'c_impaired1_c', 'c_pastdue90_c', 'c_trend_a',
       'cd_lent_c', 'd_repaid_c', 'd_impaired1_c', 'd_pastdue90_c',
       'd_trend_a', 'd_we_payment_share', 'invoice_amount', 'purchase_amount',
       'currency_Britisches Pfund', 'currency_Euro',
       'currency_Schweizer Franken', 'currency_US-Dollar', 'invoice_date'],
      dtype='object')

In [5]:
X_train_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
has_purchase,46095.0,-2.067704e-08,1.000011,-2.091622,0.478098,0.478098,0.478098,0.478098
dd_value_date,46095.0,3.199733e-09,1.000011,-0.145273,-0.145273,-0.145273,-0.145273,85.601524
cd_lent_c,46095.0,1.108227e-09,1.000011,-0.567435,-0.528606,-0.431533,-0.017355,4.393315
cd_repaid_c,46095.0,-1.870823e-08,1.000011,-0.49391,-0.49391,-0.418917,-0.093949,4.37709
cd_impaired1_c,46095.0,-1.428856e-10,1.000011,-0.17567,-0.17567,-0.17567,-0.17567,8.90372
cd_pastdue90_c,46095.0,-4.453377e-09,1.000011,-0.214037,-0.214037,-0.214037,-0.214037,6.398803
cd_trend_a,46095.0,-1.419782e-08,1.000011,-3.920882,-0.678948,-0.03671,0.712322,3.604046
c_lent_c,46095.0,-2.6985e-08,1.000011,-1.037659,-0.801775,-0.315712,0.566171,3.869435
c_repaid_c,46095.0,1.823345e-08,1.000011,-0.893308,-0.788296,-0.336017,0.435457,4.005999
c_impaired1_c,46095.0,1.572906e-08,1.000011,-0.554907,-0.554907,-0.343429,0.079527,4.80632


### SGD Classifier

In [6]:
#Linear model Stochastic Gradient Descent
sgd_clf = SGDClassifier(random_state=42, max_iter=250, loss='log', tol=0.0001) 
sgd_clf.fit(X_train, y_train)
sgd_clf_diag = model_diag(sgd_clf, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.768
Confusion matrix: 
[[0.99502 0.00498]
 [0.07216 0.00606]]


In [7]:
y_score_sgd = sgd_clf.predict(X_test)
cm = confusion_matrix(y_test, y_score_sgd)
print(cm)
sgd_test_diag = model_oostest(sgd_clf,X_test, y_test)
sgd_test_auc = plot_rocs([sgd_clf_diag, sgd_test_diag], p_width=600, p_height=600, model_appendix=['SGD - 5folds','SGD - test'])
show(sgd_test_auc)

[[10590    82]
 [  737   113]]
[[10590    82]
 [  737   113]]
AUC 0.781


In [8]:
#save_sk_model(sgd_clf, "../data/models/", 'sgd', 'benchmark')

Saving model to ../data/models/benchmark_sgd_clf_19077_931


In [8]:
# max Number of iterations
max_iter = [100, 180, 200, 250, 300, 320, 350] 
# learning rate
learning_rate = ['constant', 'optimal', 'adaptive']
#eta
eta0 = [0.00001, 0.0001, 0.001, 0.01]
#loss
loss = ['log', 'hinge', 'perceptron']

sgd_random_grid = {'max_iter': max_iter,
               'learning_rate': learning_rate,
                  'eta0': eta0,
                  'loss':loss}

sgd = SGDClassifier(tol=0.0001)

sgd_rsearch = RandomizedSearchCV(estimator = sgd, param_distributions = sgd_random_grid, n_iter = 200, cv = 3, verbose=2, random_state=42, n_jobs =7)
# Fit the random search model
sgd_rsearch.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:    3.1s
[Parallel(n_jobs=7)]: Done 148 tasks      | elapsed:   10.0s
[Parallel(n_jobs=7)]: Done 351 tasks      | elapsed:   22.5s
[Parallel(n_jobs=7)]: Done 600 out of 600 | elapsed:   38.1s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=0.0001,
       validation_fraction=0.1, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=200, n_jobs=7,
          param_distributions={'max_iter': [100, 180, 200, 250, 300, 320, 350], 'learning_rate': ['constant', 'optimal', 'adaptive'], 'eta0': [1e-05, 0.0001, 0.001, 0.01], 'loss': ['log', 'hinge', 'perceptron']},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [9]:
sgd_rsearch.best_params_

{'max_iter': 200, 'loss': 'log', 'learning_rate': 'optimal', 'eta0': 0.01}

initial settings for SGD are confirmed

### Random Forest Classifier

In [11]:
#random forest - benchmark
n_estimators = 200
max_leaf_nodes = 40
rf_clf = RandomForestClassifier(random_state=42,
                               n_estimators=n_estimators,
                               max_leaf_nodes=max_leaf_nodes,
                               class_weight="balanced",
                               n_jobs=7)

rf_clf.fit(X_train, y_train)
rf_clf_diag = model_diag(rf_clf, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.944
Confusion matrix: 
[[0.85603 0.14397]
 [0.00865 0.06957]]


In [12]:
y_score_rf = rf_clf.predict(X_test)
cm = confusion_matrix(y_test, y_score_rf)
print(cm)
rf_test_diag = model_oostest(rf_clf,X_test, y_test)
rf_test_auc = plot_rocs([rf_clf_diag, rf_test_diag], p_width=600, p_height=600, model_appendix=['RF - 5folds','RF - test'])
show(rf_test_auc)

[[9175 1497]
 [  83  767]]
[[9175 1497]
 [  83  767]]
AUC 0.946


In [13]:
# Number of trees in random forest
n_estimators = [150, 180, 200, 250, 280, 300, 350] 
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 10, 20]
# Maximum number of levels in tree
max_depth = [100,200, None]
#Max leaf nodes
max_leaf_nodes = [10,20,40,60]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
              'max_leaf_nodes':max_leaf_nodes}
print(random_grid)

{'n_estimators': [150, 180, 200, 250, 280, 300, 350], 'max_features': ['auto', 'sqrt', 10, 20], 'max_depth': [100, 200, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False], 'max_leaf_nodes': [10, 20, 40, 60]}


In [15]:
rf = RandomForestClassifier() #base model
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 300, cv = 3, verbose=2, random_state=42, n_jobs =7)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:  1.1min
[Parallel(n_jobs=7)]: Done 148 tasks      | elapsed:  6.0min
[Parallel(n_jobs=7)]: Done 351 tasks      | elapsed: 14.1min
[Parallel(n_jobs=7)]: Done 634 tasks      | elapsed: 26.7min
[Parallel(n_jobs=7)]: Done 900 out of 900 | elapsed: 38.4min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=300, n_jobs=7,
          param_distributions={'n_estimators': [150, 180, 200, 250, 280, 300, 350], 'max_features': ['auto', 'sqrt', 10, 20], 'max_depth': [100, 200, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False], 'max_leaf_nodes': [10, 20, 40, 60]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [16]:
rf_random.best_params_

{'n_estimators': 150,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_leaf_nodes': 60,
 'max_features': 20,
 'max_depth': 100,
 'bootstrap': False}

In [17]:
#first randomized search_imp
rs1 = {'n_estimators': 280,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_leaf_nodes':60,
 'max_depth': 100,
 'bootstrap': True}

#second randomized search_imp
rs2 = {'n_estimators': 150,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_leaf_nodes': 60,
 'max_features': 20,
 'max_depth': 100,
 'bootstrap': False}

#third randomized search_p90
rs3 = {'n_estimators': 150,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_leaf_nodes': 60,
 'max_features': 20,
 'max_depth': 100,
 'bootstrap': False}

In [18]:
#random_forest from rand
n_estimators=rs3['n_estimators']
min_samples_split=rs3['min_samples_split']
min_samples_leaf=rs3['min_samples_leaf']
max_leaf_nodes=rs3['max_leaf_nodes']
max_features=rs3['max_features']
max_depth=rs3['max_depth']
bootstrap=rs3['bootstrap']




rf_rs = RandomForestClassifier(random_state=42,
                               n_estimators=n_estimators,
                                max_features=max_features,
                                min_samples_split=min_samples_split,
                               min_samples_leaf=min_samples_leaf,
                               class_weight="balanced", bootstrap=bootstrap,
                               n_jobs=7)
rf_rs.fit(X_train, y_train)
rf_rs_diag = model_diag(rf_rs, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.973
Confusion matrix: 
[[0.97764 0.02236]
 [0.01467 0.06355]]


In [19]:
y_score_rf_rs = rf_rs.predict(X_test)
cm = confusion_matrix(y_test, y_score_rf_rs)
print(cm)
rf_rs_test_diag = model_oostest(rf_rs,X_test, y_test)
rf_rs_test_auc = plot_rocs([rf_rs_diag, rf_rs_test_diag], p_width=600, p_height=600, model_appendix=['RF - 5folds','RF - test'])
show(rf_rs_test_auc)

[[10460   212]
 [  131   719]]
[[10460   212]
 [  131   719]]
AUC 0.981


In [20]:
#save_sk_model(rf_rs, "../data/models/", 'rf_p90', 'benchmark')

Saving model to ../data/models/benchmark_rf_p90_190710_105.pkl


### Training Performance Comparison

In [21]:
rf_sgd_roc = plot_rocs([sgd_clf_diag, rf_rs_diag], p_width=600, p_height=600, model_appendix=['SGD', 'Random Forest'])
show(rf_sgd_roc)

### Test Performance Comparison

In [22]:
rf_sgd_roc = plot_rocs([sgd_test_diag, rf_rs_test_diag], p_width=600, p_height=600, model_appendix=['SGD', 'Random Forest'])
show(rf_sgd_roc)

## REPEATING THE EXPERIMENT IN TIME MODE

In [25]:
prefix = 'time_2018-04-30_p90_'
postfix = '_190710_745'
[X_train, y_train, feature_labels] = pd.read_pickle(datafolder+prefix+trainfile+postfix+'.pkl') 
[X_test, y_test, feature_labels] = pd.read_pickle(datafolder+prefix+testfile+postfix+'.pkl') 

### SGD Classifier

In [26]:
#Linear model Stochastic Gradient Descent
sgd_clf = SGDClassifier(random_state=42, max_iter=250, loss='log', tol=0.0001) 
sgd_clf.fit(X_train, y_train)
sgd_clf_diag = model_diag(sgd_clf, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.700
Confusion matrix: 
[[0.99136 0.00864]
 [0.08484 0.00947]]


In [27]:
y_score_sgd = sgd_clf.predict(X_test)
cm = confusion_matrix(y_test, y_score_sgd)
print(cm)
sgd_test_diag = model_oostest(sgd_clf,X_test, y_test)
sgd_test_auc = plot_rocs([sgd_clf_diag, sgd_test_diag], p_width=600, p_height=600, model_appendix=['SGD - 5folds','SGD - test'])
show(sgd_test_auc)

[[11004   292]
 [  144    78]]
[[11004   292]
 [  144    78]]
AUC 0.771


In [28]:
#save_sk_model(sgd_clf, "../data/models/", 'sgd', 'benchmark')

In [29]:
# max Number of iterations
max_iter = [100, 180, 200, 250, 300, 320, 350] 
# learning rate
learning_rate = ['constant', 'optimal', 'adaptive']
#eta
eta0 = [0.00001, 0.0001, 0.001, 0.01]
#loss
loss = ['log', 'hinge', 'perceptron']

sgd_random_grid = {'max_iter': max_iter,
               'learning_rate': learning_rate,
                  'eta0': eta0,
                  'loss':loss}

sgd = SGDClassifier(tol=0.0001)

sgd_rsearch = RandomizedSearchCV(estimator = sgd, param_distributions = sgd_random_grid, n_iter = 200, cv = 3, verbose=2, random_state=42, n_jobs =7)
# Fit the random search model
sgd_rsearch.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:    3.4s
[Parallel(n_jobs=7)]: Done 148 tasks      | elapsed:   10.6s
[Parallel(n_jobs=7)]: Done 351 tasks      | elapsed:   23.0s
[Parallel(n_jobs=7)]: Done 600 out of 600 | elapsed:   39.1s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=0.0001,
       validation_fraction=0.1, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=200, n_jobs=7,
          param_distributions={'max_iter': [100, 180, 200, 250, 300, 320, 350], 'learning_rate': ['constant', 'optimal', 'adaptive'], 'eta0': [1e-05, 0.0001, 0.001, 0.01], 'loss': ['log', 'hinge', 'perceptron']},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [30]:
sgd_rsearch.best_params_

{'max_iter': 350, 'loss': 'log', 'learning_rate': 'optimal', 'eta0': 0.0001}

In [31]:
#Linear model Stochastic Gradient Descent
sgd_rs = SGDClassifier(random_state=42, max_iter=300, loss='log', learning_rate='optimal', eta0=0.0001, tol=0.0001) 
sgd_rs.fit(X_train, y_train)
sgd_rs_diag = model_diag(sgd_rs, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.700
Confusion matrix: 
[[0.99136 0.00864]
 [0.08484 0.00947]]


In [32]:
y_score_sgd_rs = sgd_rs.predict(X_test)
cm = confusion_matrix(y_test, y_score_sgd_rs)
print(cm)
sgd_test_diag_rs = model_oostest(sgd_rs,X_test, y_test)
sgd_test_auc_rs = plot_rocs([sgd_rs_diag, sgd_test_diag_rs], p_width=600, p_height=600, model_appendix=['SGD - 5folds','SGD - test'])
show(sgd_test_auc_rs)

[[11004   292]
 [  144    78]]
[[11004   292]
 [  144    78]]
AUC 0.771


### Random Forest Classifier

In [33]:
#random forest - benchmark
n_estimators = 200
max_leaf_nodes = 40
rf_clf = RandomForestClassifier(random_state=42,
                               n_estimators=n_estimators,
                               max_leaf_nodes=max_leaf_nodes,
                               class_weight="balanced",
                               n_jobs=7)

rf_clf.fit(X_train, y_train)
rf_clf_diag = model_diag(rf_clf, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.714
Confusion matrix: 
[[0.75318 0.24682]
 [0.03447 0.05984]]


In [34]:
y_score_rf = rf_clf.predict(X_test)
cm = confusion_matrix(y_test, y_score_rf)
print(cm)
rf_test_diag = model_oostest(rf_clf,X_test, y_test)
rf_test_auc = plot_rocs([rf_clf_diag, rf_test_diag], p_width=600, p_height=600, model_appendix=['RF - 5folds','RF - test'])
show(rf_test_auc)

[[8864 2432]
 [  53  169]]
[[8864 2432]
 [  53  169]]
AUC 0.844


In [35]:
# Number of trees in random forest
n_estimators = [150, 180, 200, 250, 280, 300, 350] 
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 10, 20]
# Maximum number of levels in tree
max_depth = [100,200, None]
#Max leaf nodes
max_leaf_nodes = [10,20,40,60]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
              'max_leaf_nodes':max_leaf_nodes}
print(random_grid)

{'n_estimators': [150, 180, 200, 250, 280, 300, 350], 'max_features': ['auto', 'sqrt', 10, 20], 'max_depth': [100, 200, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False], 'max_leaf_nodes': [10, 20, 40, 60]}


In [36]:
rf = RandomForestClassifier() #base model
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 300, cv = 3, verbose=2, random_state=42, n_jobs =7)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed:  1.0min
[Parallel(n_jobs=7)]: Done 148 tasks      | elapsed:  6.1min
[Parallel(n_jobs=7)]: Done 351 tasks      | elapsed: 14.0min
[Parallel(n_jobs=7)]: Done 634 tasks      | elapsed: 26.1min
[Parallel(n_jobs=7)]: Done 900 out of 900 | elapsed: 37.6min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=300, n_jobs=7,
          param_distributions={'n_estimators': [150, 180, 200, 250, 280, 300, 350], 'max_features': ['auto', 'sqrt', 10, 20], 'max_depth': [100, 200, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False], 'max_leaf_nodes': [10, 20, 40, 60]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [37]:
rf_random.best_params_

{'n_estimators': 300,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_leaf_nodes': 10,
 'max_features': 'auto',
 'max_depth': 200,
 'bootstrap': True}

In [38]:
#first randomized search_imp
rs1 = {'n_estimators': 280,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_leaf_nodes':60,
 'max_depth': 100,
 'bootstrap': True}

#second randomized search_imp
rs2 = {'n_estimators': 150,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_leaf_nodes': 60,
 'max_features': 20,
 'max_depth': 100,
 'bootstrap': False}

#third randomized search_p90
rs3 = {'n_estimators': 150,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_leaf_nodes': 60,
 'max_features': 20,
 'max_depth': 100,
 'bootstrap': False}

#fourth randomized search_p90_time
rs4 = {'n_estimators': 300,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_leaf_nodes': 10,
 'max_features': 'auto',
 'max_depth': 200,
 'bootstrap': True}

In [39]:
#random_forest from rand
n_estimators=rs4['n_estimators']
min_samples_split=rs4['min_samples_split']
min_samples_leaf=rs4['min_samples_leaf']
max_leaf_nodes=rs4['max_leaf_nodes']
max_features=rs4['max_features']
max_depth=rs4['max_depth']
bootstrap=rs4['bootstrap']




rf_rs = RandomForestClassifier(random_state=42,
                               n_estimators=n_estimators,
                                max_features=max_features,
                                min_samples_split=min_samples_split,
                               min_samples_leaf=min_samples_leaf,
                               class_weight="balanced", bootstrap=bootstrap,
                               n_jobs=7)
rf_rs.fit(X_train, y_train)
rf_rs_diag = model_diag(rf_rs, X_train, y_train, run_confusion_matrix=True, CrossValFolds=5)

AUC 0.700
Confusion matrix: 
[[0.9276  0.0724 ]
 [0.06388 0.03043]]


In [40]:
y_score_rf_rs = rf_rs.predict(X_test)
cm = confusion_matrix(y_test, y_score_rf_rs)
print(cm)
rf_rs_test_diag = model_oostest(rf_rs,X_test, y_test)
rf_rs_test_auc = plot_rocs([rf_rs_diag, rf_rs_test_diag], p_width=600, p_height=600, model_appendix=['RF - 5folds','RF - test'])
show(rf_rs_test_auc)

[[11098   198]
 [  142    80]]
[[11098   198]
 [  142    80]]
AUC 0.873


In [41]:
#save_sk_model(rf_rs, "../data/models/", 'rf_p90_time', 'benchmark')

Saving model to ../data/models/benchmark_rf_p90_time_190710_1058.pkl


### Training Performance Comparison

In [42]:
rf_sgd_roc = plot_rocs([sgd_clf_diag, rf_rs_diag], p_width=600, p_height=600, model_appendix=['SGD', 'Random Forest'])
show(rf_sgd_roc)

### Test Performance Comparison

In [43]:
rf_sgd_roc = plot_rocs([sgd_test_diag, rf_rs_test_diag], p_width=600, p_height=600, model_appendix=['SGD', 'Random Forest'])
show(rf_sgd_roc)