In [None]:
!pip install gensim==4.1.2
!pip install scikit-learn==0.24.2
!pip install seaborn==0.11.2

In [None]:
import os

base_dir = '/tf/sa-experiments/corpus'

if not os.path.exists(base_dir):
    !tar xvzf corpus.tar.gz

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import preprocessing
import shutil, os

In [None]:
from gensim.models import KeyedVectors

# Configuration of the pre-trained word vectors.
EMBEDDING_DIM = 300       # vector size; also part of the embedding file name
USE_EMBEDDING = True      # when False the vectors are not loaded at all
EMBEDDING_TYPE = 'skip'   # prefix of the embedding file name

if USE_EMBEDDING:
    # The file is read in word2vec format from the corpus directory.
    embeddings_file = f'{base_dir}/embeddings/{EMBEDDING_TYPE}_s{EMBEDDING_DIM}.txt'
    embeddings_index = KeyedVectors.load_word2vec_format(embeddings_file)

In [None]:
# Hard-coded sizes of the two corpus partitions
# (presumably the result of the 90/10 split requested when the datasets are
# loaded — TODO confirm against the corpus).
training_samples = 745307
validation_samples = 82811
# Number of reviews per batch; used when loading the datasets and when
# converting a number of samples into a number of batches.
batch_size = 16

In [None]:
# Fraction of the training partition to train on (1 keeps everything).
proportion = 1

# The datasets are consumed batch by batch, so the requested number of samples
# is converted into a (rounded) number of batches.
requested_samples = proportion * training_samples
subset_training = round(requested_samples / batch_size)

In [None]:
seed = 42

corpus_dir = '/tf/sa-experiments/corpus/reviews'


def load_reviews(subset):
    """Load one side ('training' or 'validation') of the 90/10 corpus split.

    Both sides are requested with the same `seed`, `shuffle` and
    `validation_split` so that they come from the same partitioning.
    """
    return preprocessing.text_dataset_from_directory(
        corpus_dir,
        validation_split=0.1,
        subset=subset,
        shuffle=True,
        batch_size=batch_size,
        seed=seed)


dataset_training = load_reviews('training')

# Class names are only available on the dataset returned by the loader,
# so they are captured before any transformation is applied.
class_names = dataset_training.class_names
num_classes = len(class_names)


def one_hot_labels(text, label):
    """Swap the integer class id of each sample for a one-hot vector."""
    return text, tf.one_hot(label, depth=num_classes)


dataset_training = (dataset_training
    .map(one_hot_labels)
    .take(subset_training))

dataset_validation = load_reviews('validation').map(one_hot_labels)

# Count the samples batch by batch instead of concatenating every review
# text into a single in-memory array just to read its length.
num_samples = sum(texts.shape[0]
                  for texts, _
                  in dataset_training.as_numpy_iterator())

num_samples

In [None]:
VOCAB_SIZE = 200_000      # upper bound on the vocabulary size
SEQUENCE_LENGTH = 1000    # length of every encoded sequence

# Text -> integer token ids layer; it is also used as the first layer of the model.
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_sequence_length=SEQUENCE_LENGTH,
)

# The vocabulary is fitted on the training texts only; labels are dropped.
training_texts = dataset_training.map(lambda review, _: review)
encoder.adapt(training_texts)

In [None]:
if USE_EMBEDDING:
    # Vocabulary learned by the encoder; a token's position is its integer id.
    voc = encoder.get_vocabulary()
    num_tokens = len(voc)
    word_index = {token: position for position, token in enumerate(voc)}

    # One row per token; rows of tokens without a pre-trained vector stay zero.
    embedding_matrix = np.zeros((num_tokens, EMBEDDING_DIM))

    hits = misses = 0

    for word, i in word_index.items():
        if not embeddings_index.has_index_for(word):
            misses += 1
            continue

        embedding_matrix[i] = embeddings_index[word]
        hits += 1

    print(f"Hits: {hits}")
    print(f"Misses: {misses}")

In [None]:
# Trainable embedding layer with one row per vocabulary token; id 0 is masked.
emb = tf.keras.layers.Embedding(mask_zero=True,
                                input_dim=len(encoder.get_vocabulary()),
                                output_dim=EMBEDDING_DIM,
                                trainable=True)

# Raw text -> token ids -> embeddings -> BiLSTM -> dense -> class probabilities.
model = tf.keras.Sequential([
    encoder,
    emb,
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(300, return_sequences=True, stateful=False)),
    tf.keras.layers.Dense(SEQUENCE_LENGTH, activation='relu'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(.6),
    # One output unit per class found in the corpus directory.
    tf.keras.layers.Dense(len(class_names), activation='softmax')
])

# `embedding_matrix` only exists when the pre-trained vectors were loaded;
# without them the layer keeps its default initialization.
if USE_EMBEDDING:
    emb.set_weights([embedding_matrix])

In [None]:
# Labels are one-hot encoded, hence the categorical loss and accuracy.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[tf.keras.metrics.CategoricalAccuracy()],
)

model.summary()

In [None]:
# NOTE: `base_dir` is rebound here. Up to this point it held the corpus root;
# from now on it is the per-run log directory, whose name encodes the sample
# count, embedding size, vocabulary size, sequence length and embedding settings.
base_dir = f'/tf/sa-experiments/logs/lstm_{num_samples}_{EMBEDDING_DIM}_{VOCAB_SIZE}_{SEQUENCE_LENGTH}_{USE_EMBEDDING}_{EMBEDDING_TYPE}'

# Destination of the per-epoch model checkpoints.
model_dir = f'{base_dir}/model'
# Destination of the CSV training log.
history_dir = f'{base_dir}/fit_results'

In [None]:
def clean_directory(directory: str, recreate : bool = True) -> None:
    """Delete `directory` with everything in it and optionally recreate it empty.

    Args:
        directory: Path of the directory to wipe. A missing directory is not
            an error; failures while deleting are ignored.
        recreate: When True (default) the directory, including any missing
            parents, is created again after the removal.
    """
    shutil.rmtree(directory, ignore_errors=True)

    if recreate:
        os.makedirs(directory, exist_ok=True)

# Wipe the run directory first, then (re)create its two sub-directories.
for directory in (base_dir, history_dir, model_dir):
    clean_directory(directory)

In [None]:
# One checkpoint per epoch, named after the zero-padded epoch number.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath= model_dir + '/{epoch:02d}.tf')
# Per-epoch metrics are written to a CSV file that later cells read back.
csv_logger_callback = tf.keras.callbacks.CSVLogger(f'{history_dir}/training.log')

# Both datasets already yield batches, so `batch_size` is not passed to fit():
# Keras documents that it must not be specified for dataset inputs.
history = model.fit(dataset_training,
                    validation_data=dataset_validation,
                    callbacks=[model_checkpoint_callback, csv_logger_callback],
                    epochs=5
                   )

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Scale seaborn's font sizes by a factor of 2 for the figures drawn below.
sns.set(font_scale=2)

def plot_results(clf, X_validation, Y_validation):
    """Plot and save the confusion matrices of `clf` on a validation set.

    Two heatmaps are drawn in a single figure: the confusion matrix
    normalized over the true classes (top) and the raw counts (bottom).
    In both, rows are the true classes and columns the predicted ones.
    The figure is saved as `LSTM.svg` inside `base_dir`.

    Args:
        clf: Fitted model whose `predict` returns one score per class for
            every sample.
        X_validation: Inputs accepted by `clf.predict`.
        Y_validation: Integer class ids, used as indices into `class_names`.

    Relies on the module-level names `class_names` and `base_dir`.
    """
    Y_predict = np.argmax(clf.predict(X_validation), axis=1)

    # Translate integer class ids into class names by array indexing.
    names = np.asarray(class_names)
    true_names = names[np.asarray(Y_validation)]
    predicted_names = names[Y_predict]

    fig, axs = plt.subplots(figsize=(15, 20), nrows=2, ncols=1)

    # confusion_matrix expects (y_true, y_pred). Passing them in that order puts
    # the true classes on the rows, which is also what normalize='true' assumes.
    # Raw counts are annotated as integers rather than in '.2g' notation.
    panels = ((axs[0], 'true', '.2g'), (axs[1], None, 'd'))
    for ax, normalize, fmt in panels:
        matrix = confusion_matrix(true_names,
                                  predicted_names,
                                  normalize=normalize,
                                  labels=class_names)
        sns.heatmap(pd.DataFrame(matrix,
                                 columns=class_names,
                                 index=class_names),
                    annot=True,
                    fmt=fmt,
                    cmap='Blues',
                    ax=ax)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')

    model_name = clf.__class__.__name__
    accuracy = round(accuracy_score(Y_validation, Y_predict), 2)
    # With normalize=False accuracy_score returns the number of correct
    # predictions, so the second title reports it as a count.
    correct = accuracy_score(Y_validation, Y_predict, normalize=False)

    axs[0].set_title(f'{model_name} | Accuracy : {accuracy}')
    axs[1].set_title(f'{model_name} | Correct predictions : {correct} / {len(Y_validation)}')

    fig.savefig(f'{base_dir}/LSTM.svg', bbox_inches='tight')

In [None]:
training_log = pd.read_csv(f'{history_dir}/training.log')

# After sorting, the first row is the epoch with the lowest validation loss.
# The + 1 maps the logged epoch to the checkpoint file name
# (presumably the log counts from 0 and the checkpoints from 1 — TODO confirm).
by_val_loss = training_log.sort_values(by=['val_loss']).reset_index()
epoch = by_val_loss['epoch'][0] + 1

# Single pass over the validation set: each review text is stored in column 0
# next to its one-hot label (remaining columns), so texts and labels stay
# aligned even though the dataset is shuffled.
final_dataset_validation = np.concatenate([
    np.concatenate([texts.reshape(-1, 1), one_hot], axis=1)
    for texts, one_hot in dataset_validation.as_numpy_iterator()])

# Restore the weights saved at the selected epoch.
checkpoint_path = model_dir + f"/{str(epoch).zfill(2)}.tf"
model.load_weights(checkpoint_path)

# Back from one-hot vectors to integer class ids.
Y_validation = np.argmax(final_dataset_validation[:,1:], axis=1)

plot_results(model, final_dataset_validation[:,0], Y_validation)

In [None]:
def plot_metric(axis, value_vars, title):
    """Draw the given training-log columns against the epoch on `axis`."""
    sns.lineplot(data=training_log
                        .melt(value_vars=value_vars,
                              id_vars=['epoch']),
                 x='epoch',
                 y='value',
                 hue='variable',
                 ax=axis)
    # Ticks follow the epochs that were actually logged instead of a
    # hard-coded range that has to be kept in sync with the fit() call.
    axis.set_xticks(training_log['epoch'].tolist())
    axis.set_title(title)


fig, ax = plt.subplots(figsize=(15, 20), nrows=2, ncols=1)

plot_metric(ax[0], ['categorical_accuracy', 'val_categorical_accuracy'], 'Accuracy per epoch')
plot_metric(ax[1], ['loss', 'val_loss'], 'Loss per epoch')

fig.savefig(f'{base_dir}/training.svg', bbox_inches='tight')