## Packages and Assets

In [1]:
from gensim.models import Word2Vec
import json
import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping
import seaborn as sns
import keras_tuner as kt
from keras.optimizers import SGD

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

import matplotlib.pyplot as plt

from keras.preprocessing.text import tokenizer_from_json
import os, sys

In [2]:
print(sys.executable)
print(os.getcwd())

C:\Users\Guilherme\.conda\envs\nlp2\python.exe
C:\Users\Guilherme\Documents\Programming\Python\DataScience\TCC\pipeline\supervised_lstm_model


## Dependencies

In [4]:
with open('../../assets/lstm_assets/tokenizer.json', 'r', encoding='utf-8') as f:
    tokenizer_json = f.read()
    tokenizer = tokenizer_from_json(tokenizer_json)
    word_index = tokenizer.word_index



In [None]:
model_we = Word2Vec.load('../../assets/lstm_assets/word2vec.model')

print(model_we.wv.most_similar('itau'))

# List of nparrays of size 300
embeddings_dict = {}
for word in model_we.wv.index_to_key:
    embeddings_dict[word] = model_we.wv[word]

embeddings_on_this_context = np.zeros((len(word_index), 300))
for word, i in word_index.items():
    embeddings_vector = embeddings_dict.get(word)
    if embeddings_vector is not None:
        embeddings_on_this_context[i - 1] = embeddings_vector


In [None]:
train = pd.read_csv('../../assets/data/splits/train/padded.csv')
val = pd.read_csv('../../assets/data/splits/val/padded.csv')

## Functions

In [9]:
def index2word(word_index):
    index_word = {}
    for key in word_index:
        index_word[word_index[key]] = key
    return index_word


def seq2text(seq, index_word):
    text = []
    for index in seq:
        text.append(index_word[index])
    return text


def show_confusion_matrix(cm):
    print("Confusion Matrix")
    plt.figure(figsize=(10, 7))

    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Negative', 'Positive'],
                yticklabels=['Negative', 'Positive'])
    plt.xlabel('Predicted')
    plt.ylabel('Truth')
    plt.title('Confusion Matrix')
    plt.show()

## RNN Model

### Data transformation for model

In [None]:
X_train = train.to_numpy()[:, :-1]
y_train = train.to_numpy()[:, -1]

X_val = val.to_numpy()[:, :-1]
y_val = val.to_numpy()[:, -1]

### Without Tuning and Early stopping

In [10]:
# model = Sequential([
#     Embedding(input_dim=len(word_index), output_dim= 300, input_length=X_train.shape[1], trainable=False, weights=[embeddings_on_this_context]),
#     Bidirectional(LSTM(64, return_sequences=True)),
#     # Dropout(0.4),
#     # Bidirectional(LSTM(hp.Choice('units',[32,64]))),
#     Bidirectional(LSTM(64)),
#     Dense(32, activation='relu'),
#     # Dropout(0.6),
#     Dense(4, activation='softmax')
# ])
#
# model.summary()

# from keras.optimizers import SGD
#
# loss = "sparse_categorical_crossentropy"
# optimizer = SGD(learning_rate=0.01)
# metrics = ['accuracy']
#
# model.compile(loss=loss,optimizer=optimizer,metrics=metrics)
# history = model.fit(X_train, y_train, epochs=25, validation_data=(X_val,y_val), verbose=2)

# history = model.fit(X_train, y_train, epochs=4000, validation_data=(X_val,y_val), verbose=2, callbacks=[es])


## Tuning Architecture Hyperparams using RandomSearch and Early Stopping

### Test 1

In [13]:
def build_model(hp):
    model = Sequential([
        Embedding(input_dim=len(word_index), output_dim=300, input_length=X_train.shape[1], trainable=False,
                  weights=[embeddings_on_this_context]),
        Bidirectional(LSTM(32, return_sequences=True)),
        Dropout(0.29),
        Bidirectional(LSTM(16)),
        Dense(64, 'tanh'),
        Dropout(0.73),
        Dense(4, activation='softmax')
    ])
    loss = "sparse_categorical_crossentropy"
    # optimizer = SGD(learning_rate=0.01)
    metrics = ['accuracy']

    model.compile(loss=loss, optimizer=hp.Choice('optimizer', ['adam', 'sgd', 'rmsprop']), metrics=metrics)
    return model


es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

### Test 2

In [None]:
# def build_model(hp):
#     model = Sequential([
#     Embedding(input_dim=len(word_index), output_dim= 300, input_length=X_train.shape[1], trainable=False, weights=[embeddings_on_this_context]),
#     Bidirectional(LSTM(hp.Choice('units_bilstm_1',[16,32,64]), return_sequences=True)),
#     Dropout(hp.Float('rate_dp_1',0.5,0.9,step=0.1,default=0.5)),
#     Bidirectional(LSTM(hp.Choice('units_bilstm_2',[16,32,64]))),
#     Dense(hp.Choice('units_dense',[16,32,64]), hp.Choice('activation',['relu','sigmoid','tanh']) ),
#     Dropout(hp.Float('rate_dp_2',0.5,0.9,step=0.1,default=0.5)),
#     Dense(4, activation='softmax')
# ])
#     loss = "sparse_categorical_crossentropy"
#     # optimizer = SGD(learning_rate=0.01)
#     metrics = ['accuracy']
#
#     model.compile(loss=loss,optimizer=hp.Choice('optimizer',['adam','sgd','rmsprop']),metrics=metrics)
#     return model
#
# es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

In [14]:
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=3,
    executions_per_trial=1,
    directory='../../assets/lstm_assets',
    project_name='lstm_tuning')

tuner.search_space_summary()

Search space summary
Default search space size: 7
units_bilstm_1 (Choice)
{'default': 16, 'conditions': [], 'values': [16, 32, 64], 'ordered': True}
rate_dp_1 (Float)
{'default': 0.5, 'conditions': [], 'min_value': 0.5, 'max_value': 0.9, 'step': 0.1, 'sampling': 'linear'}
units_bilstm_2 (Choice)
{'default': 16, 'conditions': [], 'values': [16, 32, 64], 'ordered': True}
units_dense (Choice)
{'default': 16, 'conditions': [], 'values': [16, 32, 64], 'ordered': True}
activation (Choice)
{'default': 'relu', 'conditions': [], 'values': ['relu', 'sigmoid', 'tanh'], 'ordered': False}
rate_dp_2 (Float)
{'default': 0.5, 'conditions': [], 'min_value': 0.5, 'max_value': 0.9, 'step': 0.1, 'sampling': 'linear'}
optimizer (Choice)
{'default': 'adam', 'conditions': [], 'values': ['adam', 'sgd', 'rmsprop'], 'ordered': False}


In [15]:
tuner.search(X_train, y_train, epochs=4000, validation_split=0.1, callbacks=[es])

Trial 2 Complete [00h 01m 07s]
val_accuracy: 0.3569042384624481

Best val_accuracy So Far: 0.7989977598190308
Total elapsed time: 00h 06m 29s

Search: Running Trial #3

Value             |Best Value So Far |Hyperparameter
32                |16                |units_bilstm_1
0.8               |0.6               |rate_dp_1
32                |16                |units_bilstm_2
32                |32                |units_dense
relu              |relu              |activation
0.6               |0.5               |rate_dp_2
adam              |sgd               |optimizer

Epoch 1/4000
Epoch 2/4000
Epoch 3/4000
Epoch 4/4000
Epoch 5/4000
Epoch 6/4000
Epoch 7/4000
Epoch 8/4000
Epoch 9/4000
Epoch 10/4000
Epoch 11/4000
Epoch 12/4000
Epoch 13/4000
Epoch 14/4000
Epoch 15/4000
Epoch 16/4000
Epoch 17/4000
Epoch 18/4000
Epoch 19/4000
Epoch 20/4000
Epoch 21/4000
Epoch 22/4000
Epoch 23/4000
Epoch 24/4000
Epoch 25/4000
Epoch 26/4000
Epoch 27/4000
Epoch 28/4000
Epoch 29/4000
Epoch 30/4000
Epoch 31/4000
Epo

In [None]:
tuner.results_summary()

In [None]:
best_model = tuner.get_best_models()[0]

### Evaluation

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
ax1.plot(tuner.history['accuracy'])
ax1.plot(tuner.history['val_accuracy'])
ax1.set_title('Model Accuracy')
ax1.set_ylabel('Accuracy')
ax1.set_xlabel('Epoch')
ax1.legend(['Train', 'Validation'], loc='upper left')

ax2.plot(tuner.history['loss'])
ax2.plot(tuner.history['val_loss'])
ax2.set_title('Model Loss')
ax2.set_ylabel('Loss')
ax2.set_xlabel('Epoch')
ax2.legend(['Train', 'Validation'], loc='upper left')
plt.show()

## Tuning fit hyperparamters using GridSearch

In [None]:
def architecture_tuned_model():
    model = Sequential([
        Embedding(input_dim=len(word_index), output_dim=300, input_length=X_train.shape[1], trainable=False,
                  weights=[embeddings_on_this_context]),
        Bidirectional(LSTM(32, return_sequences=True)),
        Dropout(0.29),
        Bidirectional(LSTM(16)),
        Dense(64, 'tanh'),
        Dropout(0.73),
        Dense(4, activation='softmax')
    ])
    loss = "sparse_categorical_crossentropy"
    optimizer = SGD(learning_rate=0.01)
    metrics = ['accuracy']

    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    return model

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier

model = KerasClassifier(model=architecture_tuned_model, verbose=2)

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {'batch_size': [32, 64, 128]}

gs = GridSearchCV(estimator=model,
                  param_grid=param_grid,
                  cv=5)
gs.fit(X_train, y_train)
print(f"Best results for {model.__class__.__name__}")
print("Best Score of train set: " + str(gs.best_score_))
print("Best estimator: " + str(gs.best_estimator_))
print("Best parameter set: " + str(gs.best_params_))


### Exporting model

In [None]:
model.save('../../assets/lstm_assets/lstm_model')