In [None]:
# Kedro IPython line magic. `catalog` is used further down without being
# defined anywhere in this notebook, so it is presumably injected by this
# magic -- TODO confirm against the Kedro version in use.
%reload_kedro

In [None]:
from crypto_thesis.data_domains.modeling.lstm import _build_lstm_timestamps_seq, lstm_model_predict
from pprint import pprint
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import tensorflow as tf
import numpy as np
from keras.engine.sequential import Sequential
from keras.layers import LSTM, BatchNormalization, Dense
from keras.models import Sequential
from keras.regularizers import l2
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from time import time

## Base: constants and input data

In [None]:
# Target column(s). Kept as a list so that `frame[TARGET_COL]` yields a
# DataFrame rather than a Series; later cells index the target with
# `.iloc[rows, :]`, which needs the two-dimensional form.
TARGET_COL = ["label"]
# Column the master tables are indexed and sorted by before modelling.
INDEX_COL = "window_nbr"

In [None]:
# Train / test master tables, loaded from the catalog.
# NOTE(review): `catalog` is not defined in this notebook -- presumably it is
# provided by the `%reload_kedro` magic in the first cell; confirm.
mt_train_multic = catalog.load("master_table_train_multic")
mt_test_multic = catalog.load("master_table_test_multic")
# Sequence length: used as the time dimension of the LSTM `input_shape` and
# passed to `_build_lstm_timestamps_seq` / `lstm_model_predict`.
seq_length = catalog.load("params:lstm_timestamp_seq_length")

## LSTM: hyper-parameter search and final fit

In [None]:
# --- Training configuration --------------------------------------------------
EPOCHS = 50      # epochs per fit (grid search and final training)
N_SPLITS = 5     # number of TimeSeriesSplit folds
SHUFFLE = False  # forwarded as `shuffle` to the final `model.fit`

# --- Reproducibility: fix the TensorFlow and NumPy global seeds --------------
tf.random.set_seed(0)
np.random.seed(0)

# Switch off Keras' interactive logging.
tf.keras.utils.disable_interactive_logging()

In [None]:
def _prepare_split(raw_table):
    """Order a master table by ``INDEX_COL`` and derive the model inputs from it.

    Returns, in this order: the re-indexed table, its feature frame (all
    columns except ``TARGET_COL``), its target frame, and the ``(X, y)`` pair
    produced by ``_build_lstm_timestamps_seq`` for the notebook's ``seq_length``.
    """
    ordered = raw_table.set_index(INDEX_COL).sort_index().reset_index(drop=True)
    features = ordered.drop(columns=TARGET_COL)
    target = ordered[TARGET_COL]
    X_seq, y_seq = _build_lstm_timestamps_seq(X=features, y=target, seq_length=seq_length)
    return ordered, features, target, X_seq, y_seq


# Same pipeline for both tables; train is processed first, then test.
(master_table_train, X_train, y_train,
 X_train_scaled_seq, y_train_scaled_seq) = _prepare_split(mt_train_multic)

(master_table_test, X_test, y_test,
 X_test_scaled_seq, y_test_scaled_seq) = _prepare_split(mt_test_multic)

In [None]:
def create_model_lstm(optimizer: str) -> Sequential:
    """Build and compile the stacked-LSTM classifier with a single sigmoid output.

    Three LSTM layers, each followed by batch normalisation, feed one sigmoid
    unit. The input shape is read from notebook-level state: ``seq_length`` for
    the time dimension and the last axis of ``X_train_scaled_seq`` for the
    feature dimension, so both must exist before this function is called.

    Args:
        optimizer: Keras optimizer identifier forwarded to ``model.compile``.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss,
        accuracy metric).
    """
    hidden_units = (20, 20, 20)
    output_units = 1
    n_features = X_train_scaled_seq.shape[2]
    l2_strength = 0.005
    input_dropout = 0.0
    recurrent_dropout = 0.0

    net = Sequential()
    last_idx = len(hidden_units) - 1

    for idx, units in enumerate(hidden_units):
        layer_kwargs = dict(
            units=units,
            activation='tanh',
            recurrent_activation='hard_sigmoid',
            # Fresh regularizer instances per layer.
            kernel_regularizer=l2(l2_strength),
            recurrent_regularizer=l2(l2_strength),
            dropout=input_dropout,
            recurrent_dropout=recurrent_dropout,
            # Every LSTM except the last hands the full sequence to the next one.
            return_sequences=idx < last_idx,
            return_state=False,
            stateful=False,
            unroll=False,
        )
        if idx == 0:
            # Only the first layer declares the input shape.
            layer_kwargs["input_shape"] = (seq_length, n_features)

        net.add(LSTM(**layer_kwargs))
        net.add(BatchNormalization())

    net.add(Dense(units=output_units, activation='sigmoid'))

    net.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return net

In [None]:
# Hyper-parameter search. For every expanding-window fold of the training
# data, grid-search batch size and optimizer on the fold's training part,
# keep the best configuration and score it on the fold's hold-out part.
start = time()

lstm_model_params = {
    'batch_size': [4, 16],
    'optimizer': ['SGD', 'RMSprop', 'Adam']
    }
model = KerasClassifier(build_fn=create_model_lstm, epochs=EPOCHS, verbose=1)

cv_results = {}
tss = TimeSeriesSplit(n_splits=N_SPLITS)
for i, (train_index, test_index) in enumerate(tss.split(X_train), 1):
    X_train_split, X_test_split = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_train_split, y_test_split = y_train.iloc[train_index, :], y_train.iloc[test_index, :]

    # Fold-local names on purpose: the notebook-level `X_train_scaled_seq`,
    # `y_train_scaled_seq`, `X_test_scaled_seq` and `y_test_scaled_seq` hold the
    # full train / test sequences and are read again by the final training
    # cell, so they must not be reassigned inside this loop.
    X_fold_train_seq, y_fold_train_seq = _build_lstm_timestamps_seq(X=X_train_split,
                                                                    y=y_train_split,
                                                                    seq_length=seq_length)

    X_fold_holdout_seq, y_fold_holdout_seq = _build_lstm_timestamps_seq(X=X_test_split,
                                                                        y=y_test_split,
                                                                        seq_length=seq_length)

    # NOTE(review): `cv` is left at scikit-learn's default, which is not
    # time-ordered; consider passing a `TimeSeriesSplit` here as well.
    grid = GridSearchCV(estimator=model,
                        param_grid=lstm_model_params,
                        n_jobs=-1,
                        scoring="accuracy")
    grid_result = grid.fit(X_fold_train_seq, y_fold_train_seq)

    cv_results[f"fold_{i}"] = {"score": grid_result.best_score_,
                               "params": grid_result.best_params_}

    # Out-of-sample check: score the refit best estimator on the fold's
    # hold-out sequences (skipped when the hold-out yields no sequence).
    if X_fold_holdout_seq.shape[0] > 0:
        cv_results[f"fold_{i}"]["holdout_score"] = grid_result.score(X_fold_holdout_seq,
                                                                     y_fold_holdout_seq)

end = time()

In [None]:
# Best grid-search score of every fold, in fold order.
scores = [fold_result["score"] for fold_result in cv_results.values()]

# Select the fold with the highest score; the strict ">" keeps the earliest
# fold when several folds tie.
best_score, best_fold = -999, ""
for fold_name, fold_score in zip(cv_results, scores):
    if fold_score > best_score:
        best_score, best_fold = fold_score, fold_name

best_params = cv_results[best_fold]["params"]

# Summary: score statistics, wall-clock time of the search, winning
# parameters and the raw per-fold results.
print("Reporting:")
print(f"Best score: {max(scores)}")
print(f"Min: {min(scores)}, avg: {round(np.average(scores), 4)} and std: {round(np.std(scores, ddof=1), 4)} of scores")
print(f"Time elapsed (seconds): {end-start}")
print()
print("Best parameters:")
pprint(best_params)
print()
print("CV results:")
pprint(cv_results)

In [None]:
# Final fit with a fixed configuration. The grid-search winners
# (best_params["optimizer"], best_params["batch_size"]) are not used here;
# the optimizer and batch size are hard-coded instead.
BATCH = 4
model = create_model_lstm(optimizer="SGD")

# Sample counts of the notebook-level sequence arrays as they stand when this
# cell runs.
M_TRAIN = X_train_scaled_seq.shape[0]
M_TEST = X_test_scaled_seq.shape[0]

# Early-stopping callback on the validation loss, restoring the best weights.
# NOTE(review): with patience equal to EPOCHS this looks like it cannot stop
# training before the last epoch -- confirm that is intended.
early_stop = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=EPOCHS,
    verbose=1,
    mode='auto',
    baseline=0,
    restore_best_weights=True,
)

# Halve the learning rate after one epoch without improvement in the training
# loss, never going below 1e-8.
lr_decay = ReduceLROnPlateau(
    monitor='loss',
    factor=0.5,
    patience=1,
    min_lr=1e-8,
    verbose=0,
)

# The slice covers all M_TEST samples, i.e. the whole validation arrays.
validation_set = (X_test_scaled_seq[:M_TEST], y_test_scaled_seq[:M_TEST])

train_history = model.fit(
    X_train_scaled_seq,
    y_train_scaled_seq,
    batch_size=BATCH,
    epochs=EPOCHS,
    shuffle=SHUFFLE,
    validation_split=0.0,
    validation_data=validation_set,
    callbacks=[lr_decay, early_stop],
    verbose=0,
)

# Predict on the test master table and show the distribution of predictions.
y_pred = lstm_model_predict(model=model, master_table_test=mt_test_multic, seq_length=seq_length)
print(y_pred["y_pred"].value_counts())