In [1]:
import pandas as pd
import numpy as np
import pickle
from os import environ

from sklearn.model_selection import cross_val_predict, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, confusion_matrix, make_scorer, accuracy_score
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from scripts_viz.visualization_utils import *
from scripts_ml.models_utils import *

# Bokeh setup: output_notebook() makes subsequent show(...) calls render
# their figures inline in the notebook output cells.
from bokeh.io import show, output_notebook
output_notebook()

import datetime as dt

In [2]:
# Load the validation-split dataset: train set, test set and the fold indexes.
# Example of a resulting file name:
#   time_2018-04-30_imp_bg__val_12000_3000__traindata_190812_1612.pkl
# NOTE: pd.read_pickle unpickles arbitrary objects -- only point it at trusted files.

prefix = 'time_2018-04-30_imp_bg_'
valid_code = '_val_12000_3000_'
trainfile = '_traindata'
testfile = '_testdata'
indexfile = '_fold_indexes'
postfix = '_190812_1612'
preproc_folder = "enriched_time_seq"
datafolder = "../data/preproc_traintest/{}/".format(preproc_folder)

# All three pickles share folder, prefix, validation code and timestamp postfix;
# only the token naming the kind of file differs.
X_train, y_train, feature_labels = pd.read_pickle(
    "".join([datafolder, prefix, valid_code, trainfile, postfix, '.pkl'])
)
# feature_labels is rebound here to the labels stored alongside the test data.
X_test, y_test, feature_labels = pd.read_pickle(
    "".join([datafolder, prefix, valid_code, testfile, postfix, '.pkl'])
)
indexes = pd.read_pickle(
    "".join([datafolder, prefix, valid_code, indexfile, postfix, '.pkl'])
)

In [3]:
# Number of entries in the loaded fold-index object.
len(indexes)

11

In [4]:
# Dimensions of the loaded test set.
X_test.shape

(33000, 35)

In [5]:
# Candidate estimators for the fold-by-fold diagnostic.

# Random forest hyper-parameters.
n_estimators = 200
max_leaf_nodes = 40

rf_clf = RandomForestClassifier(
    n_estimators=n_estimators,
    max_leaf_nodes=max_leaf_nodes,
    class_weight="balanced",
    n_jobs=7,
    random_state=42,
)

# Linear alternative trained with stochastic gradient descent.
# NOTE(review): scikit-learn 1.1 renamed loss='log' to 'log_loss' and 1.3 removed
# the old spelling -- update this value when upgrading scikit-learn.
sgd_clf = SGDClassifier(
    loss='log',
    max_iter=250,
    tol=0.0001,
    random_state=42,
)

# Estimator picked up by the diagnostic cell that follows.
model = rf_clf

In [6]:
# Run the sequential diagnostic for the selected model on the precomputed fold
# indexes, also handing over the held-out test set. The run prints the
# train/test index ranges and the AUC of each fold.
exp = model_diag_seq(
    model,
    X_train,
    y_train,
    specify_idxs=True,
    idx_tuples=indexes,
    X_test=X_test,
    y_test=y_test,
)

Fold 1: train  on 13101 from index 0 to 13100, test on 3000 from 0 to 2999
Fold 1 AUC: 0.908734724572288
Fold 2: train  on 12000 from index 4101 to 16100, test on 3000 from 3000 to 5999
Fold 2 AUC: 0.7906072307728607
Fold 3: train  on 12000 from index 7101 to 19100, test on 3000 from 6000 to 8999
Fold 3 AUC: 0.5623367803673406
Fold 4: train  on 12000 from index 10101 to 22100, test on 3000 from 9000 to 11999
Fold 4 AUC: 0.7062196533423668
Fold 5: train  on 12000 from index 13101 to 25100, test on 3000 from 12000 to 14999
Fold 5 AUC: 0.8569012178619756
Fold 6: train  on 12000 from index 16101 to 28100, test on 3000 from 15000 to 17999
Fold 6 AUC: 0.8455419765544219
Fold 7: train  on 12000 from index 19101 to 31100, test on 3000 from 18000 to 20999
Fold 7 AUC: 0.8358762416339628
Fold 8: train  on 12000 from index 22101 to 34100, test on 3000 from 21000 to 23999
Fold 8 AUC: 0.903902384165542
Fold 9: train  on 12000 from index 25101 to 37100, test on 3000 from 24000 to 26999
Fold 9 AUC: 0.

In [7]:
# Show the diagnostics dict: one 'AUC_fold_<k>' entry per fold plus the
# 'fpr' / 'tpr' arrays and the overall 'auc'.
exp

{'AUC_fold_1': 0.908734724572288,
 'AUC_fold_2': 0.7906072307728607,
 'AUC_fold_3': 0.5623367803673406,
 'AUC_fold_4': 0.7062196533423668,
 'AUC_fold_5': 0.8569012178619756,
 'AUC_fold_6': 0.8455419765544219,
 'AUC_fold_7': 0.8358762416339628,
 'AUC_fold_8': 0.903902384165542,
 'AUC_fold_9': 0.7529011824324323,
 'AUC_fold_10': 0.7560197687098389,
 'AUC_fold_11': 0.6528764204545455,
 'fpr': array([0.00000000e+00, 3.08166410e-05, 6.16332820e-05, ...,
        9.99876733e-01, 9.99938367e-01, 1.00000000e+00]),
 'tpr': array([0., 0., 0., ..., 1., 1., 1.]),
 'auc': 0.7577275808936825}

### Train and test data import

In [8]:
# Load the train / test pickles of the 190812_1547 preprocessing run
# (file names here carry no validation code).
# NOTE: pd.read_pickle unpickles arbitrary objects -- only point it at trusted files.
prefix = 'time_2018-04-30_imp_bg_'
trainfile = '_traindata'
testfile = '_testdata'
postfix = '_190812_1547'
preproc_folder = "enriched_time_seq"
datafolder = "../data/preproc_traintest/{}/".format(preproc_folder)

# This rebinds X_train / y_train / X_test / y_test / feature_labels that were
# loaded earlier in the notebook.
X_train, y_train, feature_labels = pd.read_pickle(
    "".join([datafolder, prefix, trainfile, postfix, '.pkl'])
)
X_test, y_test, feature_labels = pd.read_pickle(
    "".join([datafolder, prefix, testfile, postfix, '.pkl'])
)

In [9]:
# Linear model: classifier trained with Stochastic Gradient Descent.
# NOTE(review): scikit-learn 1.1 renamed loss='log' to 'log_loss' and 1.3 removed
# the old spelling -- update this value when upgrading scikit-learn.
sgd_clf = SGDClassifier(
    loss='log',
    max_iter=250,
    tol=0.0001,
    random_state=42,
)
sgd_clf.fit(X_train, y_train)

# Evaluate on the test set and display the resulting figure; the cell also
# prints a confusion matrix and the AUC.
sgd_test_diag = model_oostest(sgd_clf, X_test, y_test)
sgd_test_auc = plot_rocs(
    [sgd_test_diag],
    p_width=600,
    p_height=600,
    model_appendix=['SGD - test'],
)
show(sgd_test_auc)

Confusion matrix: 
[[0.99665 0.00335]
 [0.03734 0.00662]]
AUC 0.780


In [10]:
# Random forest - benchmark model.
n_estimators = 200
max_leaf_nodes = 40

rf_clf = RandomForestClassifier(
    n_estimators=n_estimators,
    max_leaf_nodes=max_leaf_nodes,
    class_weight="balanced",
    n_jobs=7,
    random_state=42,
)
rf_clf.fit(X_train, y_train)

# Evaluate on the test set and display the resulting figure; the cell also
# prints a confusion matrix and the AUC.
rf_test_diag = model_oostest(rf_clf, X_test, y_test)
rf_test_auc = plot_rocs(
    [rf_test_diag],
    p_width=600,
    p_height=600,
    model_appendix=['RF - test'],
)
show(rf_test_auc)

Confusion matrix: 
[[0.91752 0.08248]
 [0.00852 0.03544]]
AUC 0.954
