In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math

# Import TensorFlow
import tensorflow as tf
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.optimizers import RMSprop, Adam, SGD

from scripts_ml.models_utils import *

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score

from scripts_ml.ann_utils import *

In [2]:
#importing data
preproc_folder = "enriched_time_seq"
datafolder = "../data/preproc_traintest/"+preproc_folder+'/'
prefix_time_seq = 'time_2018-04-30_imp_bg_'
valid_code = '_val_24000_6000_'
trainfile = '_traindata'
testfile = '_testdata'
postfix_time_seq_val = '_190815_645'
postfix_time_seq = '_190812_1547'
preproc_folder = "enriched_time_seq"
datafolder = "../data/preproc_traintest/"+preproc_folder+'/'
indexfile = '_fold_indexes'
expname = "MLP_"+preproc_folder+valid_code.split('_val_')[1][:-1]+"_imp"

[X_train, y_train, feature_labels] = pd.read_pickle(datafolder+prefix_time_seq+trainfile+postfix_time_seq+'.pkl') 
[X_test, y_test, feature_labels] = pd.read_pickle(datafolder+prefix_time_seq+testfile+postfix_time_seq+'.pkl') 
[val_X_train, val_y_train, val_feature_labels] = pd.read_pickle(datafolder+prefix_time_seq+valid_code+trainfile+postfix_time_seq_val+'.pkl') 
[val_X_test, val_y_test, val_feature_labels] = pd.read_pickle(datafolder+prefix_time_seq+valid_code+testfile+postfix_time_seq_val+'.pkl') 
indexes = pd.read_pickle(datafolder+prefix_time_seq+valid_code+indexfile+postfix_time_seq_val+'.pkl')

In [3]:
#recombining folds for grid search

val_X_all = []
val_y_all = []
indexes_tuples = []

count=0
start_tr=0

for idx in indexes:
    val_X_all.append(val_X_train[idx[0]])
    val_y_all.append(val_y_train[idx[0]])
    if count==0:
        test_idx = np.array(range(0, len(idx[1])))
    else:
        test_idx+=len(idx[1])
    val_X_all.append(val_X_test[test_idx])
    val_y_all.append(val_y_test[test_idx])
    
    
    if count==0:
        start_tst = len(idx[0])
    else:
        start_tr+=add_to_tr
        start_tst=start_tr+len(idx[0])
        
    indexes_tuples.append((np.array(range(start_tr, start_tr+len(idx[0]))), 
                          np.array(range(start_tst, start_tst+len(idx[1])))))
    
    add_to_tr = len(idx[0])+len(idx[1])
    
    count+=1

val_X_all = np.concatenate(val_X_all, axis=0)
val_y_all = np.concatenate(val_y_all, axis=0)

In [4]:
#mlp = create_mlp_model(input_shape = X_train.shape[0],
#                     hidden_layers_no=1, 
#                     hidden_nodes=[5], 
#                     hl_activations = [tf.nn.relu], 
#                     random_seed=42, 
 #                    output_function = tf.nn.sigmoid,
 #                    optimizer = tf.keras.optimizers.RMSprop(learning_rate=1e-4),
 #                    loss_func = 'binary_crossentropy',
 #                    metrics = ['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
 #                    kernel_regularizers = [],
 #                    kernel_initializer = tf.keras.initializers.lecun_uniform(seed=42),
 #                    bias_initializer = tf.keras.initializers.Zeros(),
 #                    dropout = None,
 #                   print_summary=True)

In [5]:
mlp = KerasClassifier(build_fn=create_mlp_model)

In [6]:
input_shape = [X_train.shape[1]]

hidden_layers_no = [2] 

hidden_nodes = [[5,5], [10,5], 
                [20,5], [20,10], 
                [50,5], [50,10], [50,20],
               [80,5], [80,10], [80,20]] 

hl_activations = [[tf.nn.relu]*2]

dropout = [[0.3]*2, [0.4]*2, [0.5]*2, [0.6]*2, None]

optimizer = [RMSprop(), Adam(), SGD()]

batch_size = [128, 256, 512, 1024, 2048]

nb_epoch = [50, 100, 200, 500]

param_grid = {'hidden_layers_no': hidden_layers_no,
               'hidden_nodes': hidden_nodes,
               'hl_activations': hl_activations,
              'dropout': dropout,
               'input_shape': input_shape,
               'optimizer': optimizer,
               'batch_size': batch_size,
              'nb_epoch':nb_epoch,
               }

In [7]:
scoring = {"AUC": "roc_auc", "Accuracy": make_scorer(accuracy_score)}

mlp_grid = GridSearchCV(estimator = mlp, param_grid = param_grid, 
                               cv = rolling_window_idxs(indexes_tuples), 
                               verbose=1, n_jobs =7, scoring=scoring, refit='AUC')


In [None]:
# Fit the random search model
mlp_grid.fit(val_X_all, val_y_all)

Fitting 3 folds for each of 3000 candidates, totalling 9000 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:   23.5s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:  1.8min


In [None]:
mlp_grid.best_params_

In [None]:
experiment = mlp_exp_timeseq(datafolder, prefix_time_seq, postfix_time_seq,
                postfix_time_seq_val, valid_code, indexfile,
                         experiment_name=expname, 
                         hidden_layers_no=2,  #4
                         hidden_nodes=[50, 20, 10],       
                         optimizer=Adam(0.0001),
                         hl_activations=[tf.nn.relu]*2,      
                         dropout=[0.4]*2,            
                         loss_func = tf.keras.losses.BinaryCrossentropy(),
                         metrics=['accuracy', tf.keras.metrics.AUC(), tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
                         to_monitor=('accuracy', 0.98),
                         validation_ep=True,
                         epochs=3000, 
                         batch_size=2048,
                         use_batch_and_steps=False,
                         class_1_weight=25,
                         pred_threshold = 0.55,
                         verbose=0,
                         early_stopping=False, 
                         save_model=True, 
                         save_results_for_viz=False,
                         mlf_tracking=True)