# Emotions ML Model Training 

The purpose of this notebook is to train and save a deep learning model for emotion classification, using the sentence emotions dataset built from books by the `Build-emotions-dataset-final` notebook.

For more details about the data, please check that notebook (`Build-emotions-dataset-final`).

For more details about the ML model, please check the code.





In [None]:
from limbic.emotion.models.tf_limbic_model import utils
# Collects every setting used during this run; it is handed to utils.save_model in the last cell.
training_metadata = dict()

### Loading train and test data


In [2]:
import pickle

import pandas as pd
import tensorflow as tf

from limbic.limbic_constants import AFFECT_INTENSITY_EMOTIONS as EMOTIONS
# Keep track of the label set used for this run.
training_metadata['labels'] = EMOTIONS

# Pickled train/test splits, plus the cut-off later passed to utils.continuous_labels_to_binary.
SENTENCE_EMOTIONS_TEST_FILE = '../data/sentence_emotions_test.pickle'
SENTENCE_EMOTIONS_TRAIN_FILE = '../data/sentence_emotions_train.pickle'
CONTINUOUS_TO_BINARY_THRESHOLD = 0.5
training_metadata.update({
    'sentence_emotions_test_file': SENTENCE_EMOTIONS_TEST_FILE,
    'sentence_emotions_train_file': SENTENCE_EMOTIONS_TRAIN_FILE,
    'continuous_to_binary_threshold': CONTINUOUS_TO_BINARY_THRESHOLD,
})

# Version tag that selects which pickled tokenizer file is loaded below.
VERSION = '2019-11-16'
tokenizer_file = f'tokenizer_{VERSION}.pickle'
training_metadata['previous_version'] = VERSION

# MAX_WORDS: total number of corpus words to keep (note that 50k is about freq 5).
# MAX_LEN: total number of words allowed in an input sentence of the model (also used for the padding).
MAX_WORDS = 50000
MAX_LEN = 150
training_metadata.update({
    'tokenized_max_words': MAX_WORDS,
    'tokenized_padding_max_len': MAX_LEN,
})

# NOTE(review): pickle.load can execute arbitrary code; only load tokenizer files you produced yourself.
with open(tokenizer_file, mode='rb') as tokenizer_f:
    tokenizer = pickle.load(tokenizer_f)


def load_data_file(file_path, tokenizer, max_len=None, labels=None, threshold=None):
    """Load a pickled sentence-emotions DataFrame and derive model inputs and targets.

    Args:
        file_path: Path of a pickle readable by ``pd.read_pickle``. The resulting frame
            must contain a ``text`` column and one column per label.
        tokenizer: Object exposing ``texts_to_sequences`` (the tokenizer loaded above).
        max_len: ``maxlen`` given to ``pad_sequences``. Defaults to the notebook-level
            ``MAX_LEN`` when omitted.
        labels: Label columns to read from the frame. Defaults to ``EMOTIONS``.
        threshold: Cut-off handed to ``utils.continuous_labels_to_binary``. Defaults to
            ``CONTINUOUS_TO_BINARY_THRESHOLD``.

    Returns:
        A tuple ``(data, x_data, y_data, y_data_labeled, data_sentences)``:
        the raw frame, the padded token-id sequences, the label values as stored in the
        frame, their thresholded counterpart, and the preprocessed sentences.
    """
    # Fall back to the notebook-level configuration so existing two-argument calls are unchanged.
    max_len = MAX_LEN if max_len is None else max_len
    labels = EMOTIONS if labels is None else labels
    threshold = CONTINUOUS_TO_BINARY_THRESHOLD if threshold is None else threshold

    data = pd.read_pickle(file_path)
    # Pass the helper directly; wrapping it in a lambda adds nothing.
    data_sentences = data['text'].str.lower().apply(utils.preprocess_sentence)

    y_data = data[labels].values
    # Thresholded labels; these are returned alongside the raw label values.
    y_data_labeled = utils.continuous_labels_to_binary(y_data, threshold)

    # Convert sentences to token-id sequences and bring them all to length `max_len`.
    x_data = tokenizer.texts_to_sequences(data_sentences)
    x_data = tf.keras.preprocessing.sequence.pad_sequences(x_data, maxlen=max_len)

    return data, x_data, y_data, y_data_labeled, data_sentences


# Build the raw frame, model inputs, targets and cleaned sentences for each split.
(train,
 x_train,
 y_train,
 y_train_labeled,
 train_sentences) = load_data_file(file_path=SENTENCE_EMOTIONS_TRAIN_FILE, tokenizer=tokenizer)

(test,
 x_test,
 y_test,
 y_test_labeled,
 test_sentences) = load_data_file(file_path=SENTENCE_EMOTIONS_TEST_FILE, tokenizer=tokenizer)

# Quick sanity check on the size of each split.
print(f'train shape: {train.shape}')
print(f'test shape: {test.shape}')

train shape: (76340, 5)
test shape: (19085, 5)


### Loading pre-trained embeddings 


In [3]:
# Pre-trained GloVe vectors; the embedding size must match the dimensionality of the chosen file.
GLOVE_EMBEDDING = "../data/embeddings/glove.6B.100d.txt"  # available here: http://nlp.stanford.edu/data/glove.6B.zip
EMBEDDING_SIZE = 100  # Given the GloVe file used.
training_metadata.update(
    embeddings_file=GLOVE_EMBEDDING,
    embeddings_size=EMBEDDING_SIZE,
)

# Build the embeddings matrix from the tokenizer and the GloVe file.
embeddings_matrix = utils.build_embeddings_matrix(
    tokenizer,
    max_words=MAX_WORDS,
    embeddings_file=GLOVE_EMBEDDING,
    embedding_size=EMBEDDING_SIZE,
)


### Building and training the TensorFlow model

In [4]:
# Hyper-parameters below were tuned by hand; ideally they would come out of a
# hyper-parameter search with Bayesian optimization.
# More info: https://www.dlology.com/blog/how-to-do-hyperparameter-search-with-baysian-optimization-for-keras-model/
LOSS_FUNCTION = 'binary_crossentropy'
MODEL_METRICS = ['accuracy']
ADAM_LR_PARAMETER = 1e-3
DROP_OUT_RATE = 0.1

# TODO: move all params to config.cfg file and load from disk.
params = utils.ModelParams(
    max_len=MAX_LEN,
    max_words=MAX_WORDS,
    embedding_size=EMBEDDING_SIZE,
    drop_out_rate=DROP_OUT_RATE,
    labels=EMOTIONS,
    loss_function=LOSS_FUNCTION,
    adam_lr_parameter=ADAM_LR_PARAMETER,
    model_metrics=MODEL_METRICS,
)

# Store the parameters as a plain mapping next to the rest of the run metadata.
training_metadata['training_params'] = params._asdict()

# Build the model from the embeddings matrix and the parameters, then show its layers.
model = utils.build_model(embeddings_matrix, params)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 150, 100)     5000000     input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 150, 512)     549888      embedding[0][0]                  
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 146, 128)     327808      bidirectional[0][0]              
______________________________________________________________________________________________

In [5]:
# Training configuration, recorded in the run metadata as well.
BATCH_SIZE = 256
VALIDATION_SPLIT = 0.1
EPOCHS = 3  # using low number as example
TENSORFLOW_LOGS = './tensorflow_logs'
training_metadata['training_batch_size'] = BATCH_SIZE
training_metadata['training_validation_split'] = VALIDATION_SPLIT
training_metadata['training_epochs'] = EPOCHS
training_metadata['training_tensorflow_logs'] = TENSORFLOW_LOGS

# Adding a callback to review train/validation learning curves in Tensorboard
# TODO: Add callbacks to make use of checkpoints if needed.
callbacks = [tf.keras.callbacks.TensorBoard(log_dir=TENSORFLOW_LOGS)]

# Keep the History returned by fit() so the per-epoch metrics stay available to later
# cells, instead of discarding it and only displaying its object repr.
history = model.fit(x_train,
                    y_train,
                    validation_split=VALIDATION_SPLIT,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    callbacks=callbacks,
                    verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x152efdcc0>

### Saving the model

In [6]:
# Hand the tokenizer, the trained model and the collected run metadata to the project's
# save helper. NOTE(review): output location and file format are decided inside
# utils.save_model and are not visible from this notebook — confirm there.
utils.save_model(tokenizer, model, training_metadata)
