In [168]:
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.data import Dataset, AUTOTUNE
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

import matplotlib.pyplot as plt

import os
import sys
# Absolute, normalized path of the parent of the current working directory.
# The notebook is assumed to be launched from the notebooks directory, which
# makes that parent the project root.
PROJECT_ROOT = os.path.abspath(os.pardir)

# Append the project root to the list of strings that specifies the module
# search path, so that packages located there can be imported below.
sys.path.append(PROJECT_ROOT)
from src.preprocessing import *

In [42]:
# Load the pre-processed train / validation / test splits stored as .npy arrays.
files = ['train', 'valid', 'test']
datasets = [np.load(f'../data/preprocessed/{file}_corpus.npy') for file in files]
train_corpus, valid_corpus, test_corpus = datasets

# Report the shape of each split.
print(
    f'Train set shape: {train_corpus.shape}\n'
    f'Validation set shape: {valid_corpus.shape}\n'
    f'Test set shape: {test_corpus.shape}'
)

Train set shape: (9682, 167)
Validation set shape: (3227, 167)
Test set shape: (3228, 167)


In [70]:
# Dataset global variables
BATCH_SIZE = 128    # batch size passed to create_batch_dataset
BUFFER_SIZE = 1000  # shuffle buffer size passed to create_batch_dataset

# Model global variables
# create_batch_dataset splits off the last column of every corpus row as the
# label, so the model input length is the row width minus one.
MAX_LEN = train_corpus.shape[1] - 1
EMBEDDING_DIM = 5000
DROPOUT_RATE = .1

# Read the saved vectorizer inside a context manager so the file handle is
# closed deterministically instead of being left open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load vectorizer.pkl when it comes from a trusted source.
with open('../models/vectorizer.pkl', 'rb') as vectorizer_file:
    VOCAB_SIZE = len(pickle.load(vectorizer_file)['vocabulary'])

In [90]:
def create_batch_dataset(dataset, batch_size=64, buffer_size=10000, shuffle=True):
    """
    Build a batched ``tf.data`` pipeline from a 2-D array of examples.

    Each row of ``dataset`` is split into features (every column except the
    last) and a label (the last column, kept as a length-1 slice).

    The transformations are applied in the order
    cache -> shuffle -> batch -> prefetch:

    * caching happens before shuffling, so the cache stores the raw examples
      and a new shuffle order is drawn on every pass (caching after the
      shuffle would replay the first pass's order forever);
    * shuffling happens before batching, so individual examples are mixed
      rather than whole, fixed-content batches;
    * prefetching is last, so it overlaps batch preparation with consumption.

    Args:
        dataset: 2-D array-like supporting ``dataset[:, :-1]`` slicing.
        batch_size: number of examples per batch.
        buffer_size: size of the shuffle buffer, counted in examples. Use a
            value at least as large as the number of rows for a uniform
            shuffle.
        shuffle: when False the examples keep their original order (useful
            for validation / test data). Defaults to True.

    Returns:
        A dataset yielding ``(features, labels)`` batches.
    """
    features, labels = dataset[:, :-1], dataset[:, -1:]
    pipeline = Dataset.from_tensor_slices((features, labels)).cache()
    if shuffle:
        pipeline = pipeline.shuffle(buffer_size)

    return pipeline.batch(batch_size).prefetch(AUTOTUNE)

In [94]:
# Turn each corpus split into a batched tf.data pipeline with the shared settings.
train_set, valid_set, test_set = (
    create_batch_dataset(corpus, BATCH_SIZE, BUFFER_SIZE)
    for corpus in (train_corpus, valid_corpus, test_corpus)
)

# Pull a single batch as NumPy arrays and show the shape of its features.
train_batch = next(train_set.as_numpy_iterator())
print(f"Shape of the train batch: {train_batch[0].shape}")

Shape of the train batch: (128, 166)


In [178]:
def create_and_compile_model(input_shape, vocab_size, embedding_dim=1000, dropout_rate=.1, lr=1e-3):
    """
    Build and compile a stacked bidirectional-LSTM binary classifier.

    Architecture: Embedding -> BiLSTM(64, full sequence) -> Dropout ->
    BiLSTM(32, last state) -> Dropout -> Dense(32, ReLU) -> Dense(1, sigmoid).

    The model is compiled with Adam, binary cross-entropy, and the
    ``accuracy`` and ``f1_score`` metrics. The F1 metric is created
    explicitly with ``threshold=0.5``: without a threshold it treats the
    largest value in each row as the positive prediction, which for a
    single sigmoid unit marks every prediction as positive.

    Args:
        input_shape: length of each input sequence (number of token ids).
        vocab_size: ``input_dim`` of the embedding layer.
        embedding_dim: size of each embedding vector.
        dropout_rate: rate used by both dropout layers.
        lr: learning rate of the Adam optimizer.

    Returns:
        The compiled Keras ``Model``.
    """
    input_layer = Input(shape=(input_shape,), name='input_layer')
    X = Embedding(vocab_size, embedding_dim, name='embedding_layer')(input_layer)

    # The first recurrent layer returns the whole sequence so the second
    # recurrent layer can consume it step by step.
    X = Bidirectional(LSTM(64, return_sequences=True), name='bi_lstm_layer_1')(X)
    X = Dropout(rate=dropout_rate, name='dropout_layer_1')(X)
    X = Bidirectional(LSTM(32), name='bi_lstm_layer_2')(X)
    X = Dropout(rate=dropout_rate, name='dropout_layer_2')(X)

    X = Dense(
        32,
        activation='relu',
        kernel_initializer='he_normal',
        name='dense_layer_1'
    )(X)
    output = Dense(1, activation='sigmoid', name='output_layer')(X)
    model = Model(inputs=[input_layer], outputs=[output])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='binary_crossentropy',
        metrics=[
            'accuracy',
            # Keep the name 'f1_score' so the history keys stay unchanged.
            tf.keras.metrics.F1Score(threshold=0.5, name='f1_score'),
        ]
    )

    return model

In [219]:
def plot_history(history, metric_name):
    """
    Plot training and validation curves for the loss and one metric.

    A 2x2 grid is drawn: the first row shows the loss, the second row shows
    ``metric_name``. The left column covers every epoch; the right column
    shows the same curves with the x-axis limited to the last quarter of
    the epochs. Every panel carries its own legend.

    Args:
        history: object whose ``history`` attribute is a dict containing the
            keys ``'loss'``, ``'val_loss'``, ``metric_name`` and
            ``'val_' + metric_name``, each mapping to per-epoch values.
        metric_name: name of the metric to plot next to the loss.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    records = history.history
    curves = [
        ('loss', records['loss'], records['val_loss']),
        (metric_name, records[metric_name], records[f'val_{metric_name}']),
    ]
    epochs = range(len(records['loss']))
    last_row = len(curves) - 1

    fig, ax = plt.subplots(2, 2, figsize=(12, 6))
    # The figure title only needs to be set once, not on every loop pass.
    fig.suptitle('Performance per Epoch', fontsize=16)

    for row, (label, train_values, val_values) in enumerate(curves):
        for col in range(2):
            panel = ax[row, col]
            panel.plot(epochs, train_values, label='Train')
            panel.plot(epochs, val_values, label='Validation')
            # A per-axes legend: plt.legend would only label the last panel.
            panel.legend(loc='best', fontsize=12)
            if row == last_row:
                panel.set_xlabel('epochs', fontsize=16)

        ax[row, 0].set_ylabel(label, fontsize=16)
        # Right-hand panel: restrict the view to the last quarter of the epochs.
        ax[row, 1].set_xlim(int(len(train_values) * .75), len(train_values))

    plt.show()

In [228]:
# Hyperparameters for this run; these replace the values assigned to the same
# names in the configuration cell.
EMBEDDING_DIM, DROPOUT_RATE = 1000, .5

# Build and compile the network with a learning rate of 1e-2, then print its
# layer-by-layer summary.
model = create_and_compile_model(
    input_shape=MAX_LEN,
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    dropout_rate=DROPOUT_RATE,
    lr=1e-2,
)
model.summary()

In [None]:
# Training schedule. PATIENCE has to stay below EPOCHS: a patience larger than
# the number of epochs means early stopping can never be triggered.
EPOCHS = 25
PATIENCE = 5

# Keep only the best model (lowest validation loss) on disk.
checkpoint_cb = ModelCheckpoint(
    '../models/lstm_model.keras',
    monitor='val_loss',
    save_best_only=True
)
# Stop once the validation loss has not improved for PATIENCE epochs and
# roll the model back to its best weights.
early_stopping_cb = EarlyStopping(
    monitor='val_loss',
    patience=PATIENCE,
    restore_best_weights=True
)

history = model.fit(
    train_set,
    epochs=EPOCHS,
    validation_data=valid_set,
    callbacks=[checkpoint_cb, early_stopping_cb],
    verbose=2
)

In [None]:
# Plot the training and validation curves of the loss and the accuracy metric.
plot_history(history, metric_name='accuracy')