In [None]:
import os

# Work from the project root so that the `scripts` package imported below and
# the relative `data/` / `results/` paths in `args` resolve.
# The move is guarded: if `scripts/` is already visible from the current
# directory (cell re-run, or kernel started in the root) we must not climb
# another level, otherwise every later relative path breaks.
if not os.path.isdir('scripts'):
    os.chdir('..')

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from ast import literal_eval
import gc
from pprint import pprint
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import keras.backend as K
from keras_preprocessing.sequence import pad_sequences

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
from scripts.utils import log_msg, precision_reall_f1_report
from scripts.models import vanilla_LSTM, CustomStopper, keras_categorical

In [None]:
# Run configuration shared by every cell below: input files, hold-out
# fraction, random seed and the path the cross-validation summary is saved to.
args = {
    'data': 'data/sample_data_features.csv',
    'feature_space': 'data/feature_names.csv',
    'test_size': 0.2,
    'seed': 123456,
    'cv_results': 'results/cv_vanilla_lstm.csv',
}

In [None]:
# Model and training hyper-parameters, added to the shared `args` dict.
args.update({
    # Architecture. The two lists are the grid explored during cross-validation.
    'embedding_dim': 10,
    'num_lstm_units': [16, 32, 64, 128],
    'input_len': 20,  # sequences are padded/truncated to this length
    'layer_nodes': [[512], [256], [128], [64], [32]],

    # Training loop.
    'NFOLDS': 3,
    'num_classes': 2,
    'batch_size': 1000,
    'max_epochs': 200,
    'early_stop_start': 50,  # forwarded to CustomStopper as `start_epoch`
    'verbose': 0,
})

In [None]:
# Load the feature table from the CSV configured in args['data'] and preview
# its first rows. The cells below read its 'sequence' and 'label' columns.
data = pd.read_csv(args['data'])
data.head()

In [None]:
# Hold out a test set. Features and labels are kept as single-column
# DataFrames at this point; they are converted to arrays in the next cell.
X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, ['sequence']],
    data.loc[:, ['label']],
    random_state=args['seed'],
    test_size=args['test_size'],
)

In [None]:
def _to_padded_ids(frame):
    """Turn the stringified id lists in ``frame['sequence']`` into a padded array.

    Each cell is parsed with ``literal_eval``, its items are cast to ``int``,
    and the resulting lists are handed to ``pad_sequences`` with
    ``maxlen=args['input_len']``.
    """
    parsed = frame['sequence'].apply(lambda raw: [int(tok) for tok in literal_eval(raw)])
    return pad_sequences(parsed.values, maxlen=args['input_len'])


# NOTE: this cell overwrites X_train / X_test / y_train / y_test with arrays,
# so it can only be run once after the split cell (re-run the split first).
X_train = _to_padded_ids(X_train)
X_test = _to_padded_ids(X_test)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

# Labels as NumPy arrays (the DataFrames' underlying values).
y_train = y_train.values
y_test = y_test.values

### Cross-validation: grid search over LSTM units and dense-layer sizes

In [None]:
# K-fold splitter shared by all hyper-parameter combinations. With an integer
# random_state every call to `split` yields the same folds, so all
# combinations are scored on identical partitions.
folds = KFold(n_splits=args['NFOLDS'], shuffle=True, random_state=args['seed'])

# Token ids are used as-is, so the vocabulary size is the largest id plus one.
vocab_size = np.max(X_train) + 1

# One record per (num_lstm_units, layer_nodes) combination with its mean AUC.
results = []
log_msg(">>> Start CV")
for num_lstm_units in args['num_lstm_units']:
    for layer_nodes in args['layer_nodes']:

        # Mean validation AUC over the folds, accumulated incrementally.
        score = 0

        for train_index, valid_index in folds.split(X_train, y_train):

            X_train_cv, X_valid = X_train[train_index], X_train[valid_index]
            y_train_cv, y_valid = y_train[train_index], y_train[valid_index]

            # Drop the previous fold's Keras state before building a new model.
            K.clear_session()
            gc.collect()

            model = vanilla_LSTM(vocab_size, args['embedding_dim'],
                                 num_lstm_units,
                                 args['input_len'],
                                 args['num_classes'],
                                 layer_nodes=layer_nodes)

            # A fresh stopper per fold; `start_epoch` is a CustomStopper option
            # (presumably the first epoch at which stopping is considered --
            # confirm in scripts.models).
            early_stop = CustomStopper(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='min',
                                       start_epoch=args['early_stop_start'],
                                       restore_best_weights=True)

            y_train_cv_categorical = keras_categorical(y_train_cv, args['num_classes'])
            y_valid_categorical = keras_categorical(y_valid, args['num_classes'])

            model.fit(X_train_cv, y_train_cv_categorical,
                      batch_size=args['batch_size'],
                      epochs=args['max_epochs'],
                      validation_data=(X_valid, y_valid_categorical),
                      verbose=args['verbose'],
                      callbacks=[early_stop])

            # Column 1 of the model output is used as the score for label 1.
            y_pred_valid = model.predict(X_valid)

            score += roc_auc_score(y_valid, y_pred_valid[:, 1]) / args['NFOLDS']

            # Release per-fold objects before the next model is built.
            del X_train_cv, X_valid, y_train_cv, y_train_cv_categorical, y_valid, y_valid_categorical
            del model, y_pred_valid, early_stop

        results.append({'num_lstm_units': num_lstm_units,
                        'layer_nodes': layer_nodes,
                        'score': score})

        log_msg(f">>> num_lstm_units={num_lstm_units} layer_nodes={layer_nodes} Mean AUC = {score}")

log_msg(">>> Finished!")

results_df = pd.DataFrame(results)
results_df.head()


In [None]:
# Persist the CV summary. Create the output folder first: `to_csv` does not
# create missing directories, and failing here would throw away the whole
# cross-validation run.
cv_results_dir = os.path.dirname(args['cv_results'])
if cv_results_dir:  # empty when the path has no directory component
    os.makedirs(cv_results_dir, exist_ok=True)

results_df.to_csv(args['cv_results'], index=False)

### Final training and test-set evaluation

In [None]:
# Hyper-parameters used for the final model (one point of the CV grid above).
args['final_params'] = dict(
    num_lstm_units=64,
    layer_nodes=[512],
)

In [None]:
# Carve a 10% validation split out of the training data; it drives early
# stopping here and the decision-threshold selection in the next cell.
X_train_final, X_valid, y_train_final, y_valid = train_test_split(X_train, y_train,
                                                                  test_size=0.1, random_state=args['seed'])

# Size the vocabulary over BOTH splits: this model is later applied to X_test,
# and a token id that only occurs there would otherwise be >= vocab_size
# (presumably an out-of-range embedding index -- vanilla_LSTM lives in
# scripts.models). When the test split has no larger id this equals
# np.max(X_train) + 1, i.e. the previous value.
vocab_size = max(np.max(X_train), np.max(X_test)) + 1

model = vanilla_LSTM(vocab_size, args['embedding_dim'],
                     args['final_params']['num_lstm_units'],
                     args['input_len'], args['num_classes'],
                     layer_nodes=args['final_params']['layer_nodes'])

# Same stopping configuration as in cross-validation.
early_stop = CustomStopper(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='min',
                           start_epoch=args['early_stop_start'],
                           restore_best_weights=True)

y_train_final_categorical = keras_categorical(y_train_final, args['num_classes'])
y_valid_categorical = keras_categorical(y_valid, args['num_classes'])

print(">>> Training ")
model.fit(X_train_final, y_train_final_categorical,
          batch_size=args['batch_size'],
          epochs=args['max_epochs'],
          validation_data=(X_valid, y_valid_categorical),
          verbose=args['verbose'],
          callbacks=[early_stop])

print(">>> Finished!")



In [None]:
# Scores on the validation split; column 1 of the model output is used as the
# score for label 1.
y_pred = model.predict(X_valid)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_valid, y_pred)

# Per-threshold report from the project helper, with plotting switched off.
reports = precision_reall_f1_report(
    precision,
    recall,
    thresholds,
    font_scale=2,
    linewidth=3,
    plot=False,
)

# Keep the threshold of the first row that reaches the maximum F1.
best_f1_rows = reports[reports['f1'] == reports['f1'].max()]
threshold = best_f1_rows['threshold'].values[0]

print('Threshold to get the best F1 on validation set: ', threshold)

In [None]:
# Scores on the held-out test set (column 1 of the model output).
y_pred = model.predict(X_test)[:, 1]

test_auc = roc_auc_score(y_test, y_pred)
print(f">>> AUC on Test set: {test_auc}\n")

# Binarise the scores with the threshold selected on the validation split.
y_pred_label = [int(score >= threshold) for score in y_pred]

test_f1 = f1_score(y_test, y_pred_label)
print(f">>> F1 on Test set (threshold {threshold}) : {test_f1}\n")


### Precision, Recall and F1 vs threshold on test set

In [None]:
# Precision/recall pairs and their thresholds computed from the test-set
# scores held in y_pred (overwrites the validation-set curve variables).
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)

In [None]:
# Build the per-threshold report for the test set with the project helper.
# `plot` is not passed here, so the helper's default applies (presumably it
# draws the curves -- confirm in scripts.utils). The result exposes at least
# the 'f1' and 'threshold' columns read further down.
reports = precision_reall_f1_report(precision, recall, thresholds, 
                                    font_scale=2,
                                    linewidth=3)

In [None]:
# Preview the first rows of the test-set report.
reports.head()

In [None]:
# Best F1 reachable on the test set and the first threshold that attains it
# (for inspection only; the threshold applied above came from the validation split).
best_f1 = reports['f1'].max()
print('Best F1: ', best_f1)

best_rows = reports[reports['f1'] == best_f1]
print('Threshold:', best_rows['threshold'].values[0])