## Packages and Assets

In [43]:
from gensim.models import Word2Vec
import json
import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping
import seaborn as sns
import keras_tuner as kt
from keras.optimizers import SGD

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
import matplotlib.pyplot as plt
from keras.metrics import Accuracy, Precision, Recall
from keras.preprocessing.text import tokenizer_from_json
import os, sys


import keras.backend as K

In [2]:
print(sys.executable)
print(os.getcwd())

C:\Users\Guilherme\.conda\envs\nlp2\python.exe
C:\Users\Guilherme\Documents\Programming\Python\DataScience\TCC\pipeline\supervised_deep_models


## Dependencies

In [3]:
with open('../../assets/deep_assets/tokenizer.json', 'r', encoding='utf-8') as f:
    tokenizer_json = f.read()
    tokenizer = tokenizer_from_json(tokenizer_json)
    word_index = tokenizer.word_index



In [4]:
model_we = Word2Vec.load('../../assets/deep_assets/word2vec.model')

print(model_we.wv.most_similar('itau'))

# List of nparrays of size 300
embeddings_dict = {}
for word in model_we.wv.index_to_key:
    embeddings_dict[word] = model_we.wv[word]

embeddings_on_this_context = np.zeros((len(word_index), 300))
for word, i in word_index.items():
    embeddings_vector = embeddings_dict.get(word)
    if embeddings_vector is not None:
        embeddings_on_this_context[i - 1] = embeddings_vector


[('itub4', 0.990408718585968), ('banco', 0.9718618392944336), ('bradesco', 0.9700526595115662), ('bbas3', 0.9663271307945251), ('bbdc4', 0.9616492986679077), ('santander', 0.9615459442138672), ('xpbr31', 0.9500864148139954), ('unibanco', 0.9492729902267456), ('pagar', 0.9354850649833679), ('valioso', 0.9320330619812012)]


In [20]:
train = pd.read_csv('../../assets/data/splits/train/padded.csv')
val = pd.read_csv('../../assets/data/splits/val/padded.csv')

## Functions

In [6]:
def index2word(word_index):
    index_word = {}
    for key in word_index:
        index_word[word_index[key]] = key
    return index_word


def seq2text(seq, index_word):
    text = []
    for index in seq:
        text.append(index_word[index])
    return text


def show_confusion_matrix(cm):
    print("Confusion Matrix")
    plt.figure(figsize=(10, 7))

    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Negative', 'Positive'],
                yticklabels=['Negative', 'Positive'])
    plt.xlabel('Predicted')
    plt.ylabel('Truth')
    plt.title('Confusion Matrix')
    plt.show()

In [35]:
def get_f1(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

## RNN Model

### Data transformation for model

In [21]:
X_train = train.to_numpy()[:, :-1]
X_val = val.to_numpy()[:, :-1]
y_train = train.to_numpy()[:, -1]
y_val = val.to_numpy()[:, -1]

### Tuning Architecture Hyperparams using RandomSearch and Early Stopping

#### Test 1

In [9]:
# def build_model(hp):
#     model = Sequential([
#         Embedding(input_dim=len(word_index), output_dim=300, input_length=X_train.shape[1], trainable=False,
#                   weights=[embeddings_on_this_context]),
#         Bidirectional(LSTM(16, return_sequences=True)),
#         Dropout(hp.Float('rate_dp_1', 0, 0.4, step=0.1)),
#         Bidirectional(LSTM(16)),
#         Dense(64, 'tanh'),
#         Dropout(hp.Float('rate_dp_1', 0.1, 0.4, step=0.1)),
#         Dense(4, activation='softmax')
#     ])
#     loss = "sparse_categorical_crossentropy"
#     # optimizer = SGD(learning_rate=0.01)
#     metrics = ['accuracy']
#
#     model.compile(loss=loss, optimizer='adam', metrics=metrics)
#     return model
#
#
# es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

TypeError: f1_score() missing 2 required positional arguments: 'y_true' and 'y_pred'

####  Test 2

In [58]:
def build_model(hp):
    model = Sequential([
    Embedding(input_dim=len(word_index), output_dim= 300, input_length=X_train.shape[1], trainable=False, weights=[embeddings_on_this_context]),
    Bidirectional(LSTM(hp.Choice('units_bilstm_1',[16,32,64]), return_sequences=True)),
    Dropout(hp.Float('rate_dp_1',0,0.3,step=0.1)),
    Bidirectional(LSTM(hp.Choice('units_bilstm_2',[16,32,64]))),
    Dense(hp.Choice('units_dense',[16,32,64]), hp.Choice('activation',['tanh']) ),
    Dropout(hp.Float('rate_dp_2',0,0.3,step=0.1)),
    Dense(4, activation='softmax')
])
    loss = "sparse_categorical_crossentropy"
    # optimizer = SGD(learning_rate=0.01)
    metrics = ['accuracy']
    # metrics = ['accuracy', Precision(), Recall()]

    model.compile(loss=loss,optimizer='adam',metrics=metrics)
    return model

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

In [59]:
# amanha testar dropout 0.0, 0.1, 0.2
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=1,
    directory='../../assets/deep_assets',
    overwrite=True,
    project_name='lstm_tuning')

tuner.search_space_summary()

Search space summary
Default search space size: 6
units_bilstm_1 (Choice)
{'default': 16, 'conditions': [], 'values': [16, 32, 64], 'ordered': True}
rate_dp_1 (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.3, 'step': 0.1, 'sampling': 'linear'}
units_bilstm_2 (Choice)
{'default': 16, 'conditions': [], 'values': [16, 32, 64], 'ordered': True}
units_dense (Choice)
{'default': 16, 'conditions': [], 'values': [16, 32, 64], 'ordered': True}
activation (Choice)
{'default': 'tanh', 'conditions': [], 'values': ['tanh'], 'ordered': False}
rate_dp_2 (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.3, 'step': 0.1, 'sampling': 'linear'}


In [None]:
tuner.search(X_train, y_train, epochs=4000, validation_data=(X_val,y_val), callbacks=[es], batch_size=32, verbose=2)

Trial 5 Complete [00h 02m 48s]
val_accuracy: 0.8378678560256958

Best val_accuracy So Far: 0.8378678560256958
Total elapsed time: 00h 12m 16s

Search: Running Trial #6

Value             |Best Value So Far |Hyperparameter
16                |64                |units_bilstm_1
0                 |0                 |rate_dp_1
16                |64                |units_bilstm_2
64                |32                |units_dense
tanh              |tanh              |activation
0.1               |0.1               |rate_dp_2

Epoch 1/4000
451/451 - 13s - loss: 0.9301 - accuracy: 0.6300 - val_loss: 0.7751 - val_accuracy: 0.7152 - 13s/epoch - 30ms/step
Epoch 2/4000
451/451 - 6s - loss: 0.7232 - accuracy: 0.7291 - val_loss: 0.7454 - val_accuracy: 0.7185 - 6s/epoch - 13ms/step
Epoch 3/4000
451/451 - 5s - loss: 0.6306 - accuracy: 0.7725 - val_loss: 0.6073 - val_accuracy: 0.7835 - 5s/epoch - 11ms/step
Epoch 4/4000
451/451 - 5s - loss: 0.5773 - accuracy: 0.7914 - val_loss: 0.5758 - val_accuracy: 0.79

In [None]:
tuner.results_summary()
 # 0.845774233341217

In [None]:
type(tuner.results_summary())

In [None]:
with open('../../assets/deep_assets/tuner_results_10_attempts.txt', 'w') as f:
    f.write(str(tuner.results_summary()))


In [None]:
tuner.get_best_hyperparameters()[0].values


In [None]:
tuner.save()

In [None]:
best_model = tuner.get_best_models()[0]

#### Evaluation

### Tuning fit hyperparamters using GridSearch

In [None]:
# model = Sequential([
#     Embedding(input_dim=len(word_index), output_dim= 300, input_length=X_train.shape[1], trainable=False, weights=[embeddings_on_this_context]),
#     Bidirectional(LSTM(64, return_sequences=True)),
#     # Dropout(0.29),
#     # Bidirectional(LSTM(hp.Choice('units',[32,64]))),
#     Bidirectional(LSTM(64)),
#     Dense(32, activation='relu'),
#     # Dropout(0.73),
#     Dense(4, activation='softmax')
# ])
#
# model.summary()
#
# from keras.optimizers import SGD
#
# loss = "sparse_categorical_crossentropy"
# optimizer = 'adam'
# metrics = ['accuracy']
#
# model.compile(loss=loss,optimizer=optimizer,metrics=metrics)
#
# es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
#
#
# history = model.fit(X_train, y_train, epochs=4000, validation_data=(X_val,y_val), verbose=2, callbacks=[es])


In [None]:
# def architecture_tuned_model():
#     model = Sequential([
#         Embedding(input_dim=len(word_index), output_dim=300, input_length=X_train.shape[1], trainable=False,
#                   weights=[embeddings_on_this_context]),
#         Bidirectional(LSTM(32, return_sequences=True)),
#         Dropout(0.29),
#         Bidirectional(LSTM(16)),
#         Dense(64, 'tanh'),
#         Dropout(0.73),
#         Dense(4, activation='softmax')
#     ])
#     loss = "sparse_categorical_crossentropy"
#     optimizer = 'adam'
#     metrics = ['accuracy']
#
#     model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
#     return model


In [None]:
# from keras.wrappers.scikit_learn import KerasClassifier
#
# model = KerasClassifier(build_fn=architecture_tuned_model, verbose=0)

In [None]:
# from sklearn.model_selection import GridSearchCV
#
# es = EarlyStopping(monitor='accuracy', mode='min', verbose=1, patience=5)
#
# param_grid = {'batch_size': [1,2,4,8]}
#
# gs = GridSearchCV(estimator=model,
#                   param_grid=param_grid,
#                   cv=5)
# gs.fit(X_train, y_train, callbacks=[es])
# print(f"Best results for {model.__class__.__name__}")
# print("Best Score of train set: " + str(gs.best_score_))
# print("Best estimator: " + str(gs.best_estimator_))
# print("Best parameter set: " + str(gs.best_params_))


In [None]:
# gs.best_estimator_.model.history.history
# # 0.7766810655593872

In [None]:
# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
# ax1.plot(gs.best_estimator_.model.history.history['accuracy'])
# ax1.plot(gs.best_estimator_.model.history.history['loss'])
# ax1.set_title('Model Accuracy')
# ax1.set_ylabel('Accuracy')
# ax1.set_xlabel('Epoch')
# ax1.legend(['Train', 'Validation'], loc='upper left')
#
# ax2.plot(gs.best_estimator_.model.history.history['loss'])
# ax2.plot(gs.best_estimator_.model.history.history['val_loss'])
# ax2.set_title('Model Loss')
# ax2.set_ylabel('Loss')
# ax2.set_xlabel('Epoch')
# ax2.legend(['Train', 'Validation'], loc='upper left')
# plt.show()

### Tuning fit hyperparameters manually

In [None]:
# # model = Sequential([
# #     Embedding(input_dim=len(word_index), output_dim= 300, input_length=X_train.shape[1], trainable=False, weights=[embeddings_on_this_context]),
# #     Bidirectional(LSTM(4, return_sequences=True)),
# #     # Dropout(0.29),
# #     # Bidirectional(LSTM(hp.Choice('units',[32,64]))),
# #     Bidirectional(LSTM(4)),
# #     Dense(32, activation='relu'),
# #     # Dropout(0.73),
# #     Dense(4, activation='softmax')
# # ])
# #
# # model.summary()
#
# from keras.optimizers import SGD
#
# loss = "sparse_categorical_crossentropy"
# optimizer = 'adam'
# metrics = [get_f1]
#
#
#
#
# best_model.compile(loss=loss,optimizer=optimizer,metrics=metrics)
#
# es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
#
#
# best_model.summary()
#


In [None]:
# history = best_model.fit(X_train, y_train, epochs=4000, batch_size=32, validation_split=0.1, verbose=2, callbacks=[es])

In [None]:
# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
# ax1.plot(history.history['accuracy'])
# ax1.plot(history.history['val_accuracy'])
# ax1.set_title('Model Accuracy')
# ax1.set_ylabel('Accuracy')
# ax1.set_xlabel('Epoch')
# ax1.legend(['Train', 'Validation'], loc='upper left')
#
# ax2.plot(history.history['loss'])
# ax2.plot(history.history['val_loss'])
# ax2.set_title('Model Loss')
# ax2.set_ylabel('Loss')
# ax2.set_xlabel('Epoch')
# ax2.legend(['Train', 'Validation'], loc='upper left')
# plt.show()

### Exporting model

In [None]:
best_model.save('../../assets/deep_assets/lstm_model')