# Models loop automation

This notebook uses as inputs the outputs from the pipeline ('05_preproc_pipeline_1.ipynb' notebook) and performs model calibration and general exploration for the transaction credit events prediction.

In [7]:
import pandas as pd
import numpy as np
import pickle

from models_utils import *
from visualization_utils import *

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from bokeh.io import show, output_notebook
output_notebook()

In [2]:
datafolder = "../data/preproc_traintest/"
output_path = "../data/models/"

prefixes = ['shuffle_imp_', 'shuffle_p90_', 'shuffle_p180_']
postfixes = ['_19072_750']*len(prefixes)

In [3]:
#Linear model Stochastic Gradient Descent
sgd_rs = SGDClassifier(random_state=42, max_iter=300, loss='log', learning_rate='adaptive', eta0=0.01, tol=0.0001)

#Random forest
rs1 = {'n_estimators': 280,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_leaf_nodes':60,
 'max_depth': 100,
 'bootstrap': True}

n_estimators=rs1['n_estimators']
min_samples_split=rs1['min_samples_split']
min_samples_leaf=rs1['min_samples_leaf']
max_leaf_nodes=rs1['max_leaf_nodes']
max_features=rs1['max_features']
max_depth=rs1['max_depth']
bootstrap=rs1['bootstrap']




rf_rs = RandomForestClassifier(random_state=42,
                               n_estimators=n_estimators,
                                max_features=max_features,
                                min_samples_split=min_samples_split,
                               min_samples_leaf=min_samples_leaf,
                               class_weight="balanced", bootstrap=bootstrap,
                               n_jobs=7)

models = [sgd_rs, rf_rs]

In [4]:
experiment = models_loop(models, datafolder, prefixes, postfixes)

Training, validation and testing of experiment with prefix shuffle_imp_ and postfix _19072_750 using SGDClassifier
-Loading preprocessed data...
training files: ../data/preproc_traintest/shuffle_imp__traindata_19072_750.pkl
testing files: ../data/preproc_traintest/shuffle_imp__testdata_19072_750.pkl
- Training...
- Validation...
AUC 0.797
Confusion matrix: 
[[0.99878 0.00122]
 [0.01888 0.00257]]
- Testing...
[[11284    11]
 [  201    26]]
AUC 0.783

Training, validation and testing of experiment with prefix shuffle_imp_ and postfix _19072_750 using RandomForestClassifier
-Loading preprocessed data...
training files: ../data/preproc_traintest/shuffle_imp__traindata_19072_750.pkl
testing files: ../data/preproc_traintest/shuffle_imp__testdata_19072_750.pkl
- Training...
- Validation...
AUC 0.930
Confusion matrix: 
[[0.99548 0.00452]
 [0.0129  0.00855]]
- Testing...
[[11243    52]
 [  141    86]]
AUC 0.924

Training, validation and testing of experiment with prefix shuffle_p90_ and postfix

In [6]:
experiment.keys()

dict_keys(['SGDClassifier_shuffle_imp_validation', 'SGDClassifier_shuffle_imp_testing', 'RandomForestClassifier_shuffle_imp_validation', 'RandomForestClassifier_shuffle_imp_testing', 'SGDClassifier_shuffle_p90_validation', 'SGDClassifier_shuffle_p90_testing', 'RandomForestClassifier_shuffle_p90_validation', 'RandomForestClassifier_shuffle_p90_testing', 'SGDClassifier_shuffle_p180_validation', 'SGDClassifier_shuffle_p180_testing', 'RandomForestClassifier_shuffle_p180_validation', 'RandomForestClassifier_shuffle_p180_testing'])

In [9]:
rf_imp = plot_rocs([experiment['RandomForestClassifier_shuffle_imp_validation'], experiment['RandomForestClassifier_shuffle_imp_testing']],
                   p_width=600, p_height=600, model_appendix=['RF - 5folds','RF - test'], title_lab='Random Forest performance for Impairment')
show(rf_imp)