# Testing Models

The bi-directional RNN built in the previous Jupyter notebook will be evaluated against Facebook's FastText and against other multi-label classification models based on scikit-learn. The idea is to evaluate both the general classification performance and the ability of the models to generate weighted predictions for each label, similar to the ones generated by the lexicon-based emotions model.



## Limbic Bi-directional RNN Multilabel Classifier 

First, let's check the performance of this model on the test dataset.

In [2]:
from limbic.emotion.models.tf_limbic_model import utils

In [3]:
# Version tag shared by the artifact file names built below.
VERSION = '2019-11-16'

# NOTE(review): metadata_file is built here but never read in the visible cells.
metadata_file = f'model_metadata_{VERSION}.txt'
tokenizer_file = f'tokenizer_{VERSION}.pickle'

# NOTE(review): `pickle` is not imported in any visible cell, so it must already be in the
# kernel namespace. Unpickling executes arbitrary code: only load tokenizer files you trust.
with open(tokenizer_file, 'rb') as tokenizer_f:
    tokenizer = pickle.load(tokenizer_f)

# NOTE(review): `model_path` is not defined in any visible cell and `tf` is only imported
# in a later cell; confirm both exist before running this cell on a fresh kernel.
model = tf.keras.models.load_model(model_path)

In [4]:
from limbic.limbic_constants import AFFECT_INTENSITY_EMOTIONS as EMOTIONS
from limbic.emotion.models.tf_limbic_model import TfLimbicModel

# These are variables needed to load and use the model.
# MAX_LEN is handed to TfLimbicModel here and to pad_sequences in load_data_file.
MAX_LEN = 150

# To create the limbic model from scratch we need to pass down the TensorFlow model, the tokenizer and some parameters
tf_model = TfLimbicModel(model=model, tokenizer=tokenizer, max_len=MAX_LEN, emotions=EMOTIONS)


Note that we can also load the model without passing any parameters, in which case it will use the latest version configured in the code base, as simply as the following:

```python
tf_model = TfLimbicModel()
```

Since this notebook measures the performance of one very specific model, the model above is instead built explicitly from some of the parameters used in the previous step.

In [54]:
import pandas as pd
import tensorflow as tf

# Pickled datasets read with pd.read_pickle in load_data_file.
SENTENCE_EMOTIONS_TEST_FILE = '../data/sentence_emotions_test.pickle'
SENTENCE_EMOTIONS_TRAIN_FILE = '../data/sentence_emotions_train.pickle'
# Threshold passed to utils.continuous_labels_to_binary when binarising continuous scores.
CONTINUES_TO_BINARY_THRESHOLD = 0.5


def load_data_file(file_path):
    """
    Read a pickled sentence/emotion dataset and derive the views used in this notebook.

    Returns a 5-tuple, in this order: the object loaded from the pickle, the padded
    token-id sequences, the raw per-emotion scores, the thresholded version of those
    scores, and the preprocessed sentences.
    """
    frame = pd.read_pickle(file_path)

    # Lower-case every sentence, then run it through the shared preprocessing helper.
    sentences = frame['text'].str.lower().apply(utils.preprocess_sentence)

    # Raw scores plus their binarised form; the latter feeds every classification report.
    scores = frame[EMOTIONS].values
    binary_labels = utils.continuous_labels_to_binary(scores, CONTINUES_TO_BINARY_THRESHOLD)

    # Token-id representation (needed later for the scikit-learn models), fixed to MAX_LEN.
    token_ids = tokenizer.texts_to_sequences(sentences)
    padded_ids = tf.keras.preprocessing.sequence.pad_sequences(token_ids, maxlen=MAX_LEN)

    return frame, padded_ids, scores, binary_labels, sentences


# Load both splits; each call returns (raw data, padded token ids, scores, binary labels, sentences).
train, x_train, y_train, y_train_labeled, train_sentences = load_data_file(SENTENCE_EMOTIONS_TRAIN_FILE)
test, x_test, y_test, y_test_labeled, test_sentences = load_data_file(SENTENCE_EMOTIONS_TEST_FILE)

# Quick size check of the two splits.
print(f'train shape: {train.shape}')
print(f'test shape: {test.shape}')

train shape: (76340, 5)
test shape: (19085, 5)


In [7]:
from tqdm import tqdm_notebook as tqdm

# One tf_model.predict call per preprocessed test sentence, kept in test order.
y_pred_tf = [tf_model.predict(sentence) for sentence in tqdm(test_sentences)]

HBox(children=(IntProgress(value=0, max=19085), HTML(value='')))




In [10]:
import numpy as np
from sklearn.metrics import classification_report

# Binarise the model scores with the same notebook-wide threshold that produced
# y_test_labeled, instead of repeating the literal 0.5 here.
y_pred_tf_labeled = utils.continuous_labels_to_binary(
    np.array([list(x) for x in y_pred_tf]), CONTINUES_TO_BINARY_THRESHOLD)
print(classification_report(y_test_labeled, y_pred_tf_labeled, target_names=EMOTIONS))


              precision    recall  f1-score   support

     sadness       0.82      0.42      0.56      2430
         joy       0.77      0.54      0.63      3603
        fear       0.81      0.49      0.61      2684
       anger       0.71      0.47      0.57      1783

   micro avg       0.78      0.49      0.60     10500
   macro avg       0.78      0.48      0.59     10500
weighted avg       0.78      0.49      0.60     10500
 samples avg       0.18      0.16      0.16     10500



## FastText MultiLabel Classifier

In [11]:
"""
These methods are exclusively for transforming the train and test datasets to fasttext format. 
"""

def add_label(key, value):
    """Return the fastText label token for `key` when `value` is positive, otherwise None."""
    if value > 0:
        return f'__label__{key}'
    return None

def prepare_for_fasttext(data, suffix=''):
    """
    Write `data` to ../data/fasttext_<suffix> in fastText's supervised format.

    Each output line holds the labels followed by the sentence, for example:
    __label__joy this is joy

    A label is emitted for every emotion in EMOTIONS whose value in the row is positive
    (see add_label). The sentence is run through utils.preprocess_sentence, the same
    helper used for the deep learning model above.

    Args:
        data: DataFrame with a 'text' column and one column per emotion in EMOTIONS.
        suffix: appended to the output file name and shown in the progress bar.

    NOTE(review): load_data_file lower-cases the text before preprocess_sentence while
    this function does not; confirm whether preprocess_sentence lower-cases on its own.
    """
    # fastText expects UTF-8 input, so do not rely on the platform default encoding.
    with open(f'../data/fasttext_{suffix}', 'w', encoding='utf-8') as f:
        # Passing the row count lets tqdm show real progress instead of an open-ended bar.
        for _, row in tqdm(data.iterrows(), f'iterating data {suffix}', total=len(data)):
            candidates = (add_label(emotion, row[emotion]) for emotion in EMOTIONS)
            labels = ' '.join(label for label in candidates if label)
            f.write(f"{labels} {utils.preprocess_sentence(row['text'])}\n")


In [25]:
# Writes ../data/fasttext_train and ../data/fasttext_test.
prepare_for_fasttext(train, 'train')
prepare_for_fasttext(test, 'test')

HBox(children=(IntProgress(value=1, bar_style='info', description='iterating data train', max=1, style=Progres…




HBox(children=(IntProgress(value=1, bar_style='info', description='iterating data test', max=1, style=Progress…




In [39]:
import fasttext

# Supervised fastText classifier trained on the file written by prepare_for_fasttext.
ft_model = fasttext.train_supervised(
    input="../data/fasttext_train",
    loss='ova',  # One vs All strategy for training a multi-label classification model
    wordNgrams=2,
    dim=100,
    bucket=200000,
    epoch=25,
    lr=0.5,
)

# TODO: consider adding FastText model to have a "fast" version of the classifier
# (it's orders of magnitude faster than the others but not very accurate)
ft_model.save_model("model_fasttext.bin")

In [36]:
def fasttext_prediction(ft_model, sentence):
    """
    Return a {label: score} dict for `sentence`, with every score clamped to [0.0001, 1.0].

    The model is queried with k=-1 and the part of each label after the last '__'
    (e.g. 'joy' from '__label__joy') is used as the dictionary key.
    """
    labels, scores = ft_model.predict(sentence, k=-1)
    clamped = {}
    for label, score in zip(labels, scores):
        emotion = label.split('__')[-1]
        clamped[emotion] = min(1.0, max(score, 0.0001))
    return clamped


def np_fasttext_prediction(ft_model, sentence, categories):
    """
    Return the fastText scores for `sentence` as a numpy array ordered like `categories`.

    A category missing from the prediction raises KeyError.
    """
    scores_by_label = fasttext_prediction(ft_model, sentence)
    ordered_scores = [scores_by_label[category] for category in categories]
    return np.array(ordered_scores)
    

In [37]:
# Quick sanity check on a single hand-written sentence.
fasttext_prediction(ft_model, 'i have joy')

{'joy': 1.0, 'sadness': 0.0001, 'anger': 0.0001, 'fear': 0.0001}

In [38]:
from tqdm import tqdm_notebook as tqdm

# One score vector per test sentence, with columns ordered like EMOTIONS.
y_pred_ft = [
    np_fasttext_prediction(ft_model, sentence, EMOTIONS)
    for sentence in tqdm(test_sentences)
]

HBox(children=(IntProgress(value=0, max=19085), HTML(value='')))




In [42]:
# Threshold the fastText scores and compare them with the binarised test labels.
y_pred_labeled_ft = utils.continuous_labels_to_binary(
    np.array(list(map(list, y_pred_ft))),
    CONTINUES_TO_BINARY_THRESHOLD,
)
print(classification_report(y_test_labeled, y_pred_labeled_ft, target_names=EMOTIONS))


              precision    recall  f1-score   support

     sadness       0.44      0.87      0.59      2430
         joy       0.42      0.94      0.58      3603
        fear       0.46      0.88      0.60      2684
       anger       0.44      0.82      0.57      1783

   micro avg       0.44      0.89      0.59     10500
   macro avg       0.44      0.88      0.59     10500
weighted avg       0.44      0.89      0.59     10500
 samples avg       0.25      0.31      0.27     10500



## Scikit-MultiLearn RandomForest using BinaryRelevance (One-vs-All)

For more details on scikit-multilearn you should check this post: https://xang1234.github.io/multi-label/

In [46]:
# GloVe vectors file and the matching vector size, both forwarded to build_embeddings_matrix.
GLOVE_EMBEDDING = "../data/embeddings/glove.6B.100d.txt"
EMBEDDING_SIZE = 100
# Forwarded as max_words; presumably caps the vocabulary size (confirm in utils).
MAX_WORDS = 50000

# TODO: explore building a new method to return a dictionary instead of a list as lookup 
# operations are much faster (re-using the one needed by TensorFlow for the moment)
# The result is indexed by token id in prepare_for_sklearn.
embeddings_matrix = utils.build_embeddings_matrix(tokenizer, 
                                                  max_words=MAX_WORDS, 
                                                  embeddings_file=GLOVE_EMBEDDING,
                                                  embedding_size=EMBEDDING_SIZE)


In [48]:
def combine_embeddings(embeddings):
    """
    Very simple embeddings combination strategy (using the average).

    Args:
        embeddings: non-empty sequence of equal-length numeric vectors.

    Returns:
        1-D float32 numpy array holding the element-wise mean of the vectors.

    Raises:
        ValueError: if `embeddings` is empty.

    TODO: Some other strategies could be tested.

    NOTE(review): the caller passes one vector per position of a padded sequence, so
    padding positions take part in the average; confirm this is intended.
    """
    if len(embeddings) == 0:
        raise ValueError('combine_embeddings needs at least one embedding vector')
    # A single vectorised reduction replaces the per-element Python double loop.
    stacked = np.asarray(embeddings, dtype='float32')
    return stacked.mean(axis=0)


def prepare_for_sklearn(x_data):
    """
    Turn each token-id sequence in `x_data` into a single combined embedding vector.

    Every id is looked up in embeddings_matrix and the resulting vectors are merged
    with combine_embeddings. Returns a list with one vector per input sequence.
    """
    return [
        combine_embeddings([embeddings_matrix[token_id] for token_id in sequence])
        for sequence in tqdm(x_data)
    ]


In [49]:
# One combined embedding vector per training sentence.
x_train_sklearn = prepare_for_sklearn(x_train)

HBox(children=(IntProgress(value=0, max=76340), HTML(value='')))




In [57]:
"""
Given that these models are quite slow to train, I can't use the +75k objects (at least when running this in my laptop). 

Will do some simple sampling and use a smaller dataset for training scikit-learn based models. 
"""

import random

def sample_db(x, y, n=10000):
    """
    Draw `n` distinct positions at random and return the matching items of `x` and `y`.

    Both outputs are numpy arrays and stay aligned: element i of each comes from the
    same position. random.sample raises ValueError when `n` exceeds len(x).
    """
    picked = random.sample(range(len(x)), n)
    sampled_x = np.array([x[position] for position in picked])
    sampled_y = np.array([y[position] for position in picked])
    return sampled_x, sampled_y

# Aligned 30,000-row sample of the embedded training data, used to fit the SVM further down.
x_train_sample, y_train_sample = sample_db(x_train_sklearn, y_train, n=30000)


In [82]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import RandomForestClassifier
import time

def prepare_y(y, threshold=0.1):
    """
    Binarise continuous label scores: 1.0 where the score is strictly above `threshold`,
    0.0 elsewhere.

    Args:
        y: array-like of continuous scores.
        threshold: cut-off for considering a category present. The default of 0.1 is the
            value this function has always used.

    Returns:
        float32 numpy array with the same shape as `y`.

    NOTE(review): the default differs from CONTINUES_TO_BINARY_THRESHOLD (0.5), which is
    what binarises the evaluation labels; pass threshold=CONTINUES_TO_BINARY_THRESHOLD if
    training and evaluation labels are meant to use the same cut-off.
    """
    # One vectorised comparison replaces the nested Python loop over every cell.
    return (np.asarray(y) > threshold).astype('float32')
            
# TODO: Tweak and do a proper parameter tuning for this model
# BinaryRelevance wraps the RandomForest base estimator for the multi-label problem.
classifier = BinaryRelevance(
    classifier = RandomForestClassifier(n_estimators=50, max_depth=5, 
                                        min_samples_split=10, random_state=42),
    require_dense = [False, True]
)

# NOTE(review): this fits on the full x_train_sklearn / y_train, not on the
# x_train_sample / y_train_sample subset drawn earlier; confirm that is intended.
classifier.fit(x_train_sklearn, prepare_y(y_train))


BinaryRelevance(classifier=RandomForestClassifier(bootstrap=True,
                                                  class_weight=None,
                                                  criterion='gini', max_depth=5,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=10,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=50, n_jobs=None,
                                                  oob_score=False,
                                                  random_state=42, verbose=0,
                                           

In [65]:
# One combined embedding vector per test sentence.
x_test_sklearn = prepare_for_sklearn(x_test)

HBox(children=(IntProgress(value=0, max=19085), HTML(value='')))




In [83]:
def mlpredict(x):
    """
    Score one embedded sentence with `classifier` and return a dense list of 4 values.

    predict_proba is called on a one-row batch. Labels with no stored entry in that row
    get 0; the others get their stored value. The .rows/.data access suggests the result
    is a scipy LIL sparse matrix (confirm against scikit-multilearn).
    """
    row = classifier.predict_proba(np.array([x]))[0][0]
    # Column ids that have a stored value, and those values, for the single row.
    stored = dict(zip(row.rows[0], row.data[0]))
    return [stored.get(label_id, 0) for label_id in range(4)]

# Dense score list for every embedded test sentence, one mlpredict call each.
y_pred_skml = [mlpredict(_x) for _x in tqdm(x_test_sklearn)]

HBox(children=(IntProgress(value=0, max=19085), HTML(value='')))




In [85]:
# Threshold the RandomForest scores and compare them with the binarised test labels.
y_pred_labeled_skml = utils.continuous_labels_to_binary(
    np.array(list(map(list, y_pred_skml))),
    CONTINUES_TO_BINARY_THRESHOLD,
)
print(classification_report(y_test_labeled, y_pred_labeled_skml, target_names=EMOTIONS))


              precision    recall  f1-score   support

     sadness       0.43      0.13      0.20      2430
         joy       0.41      0.44      0.43      3603
        fear       0.43      0.17      0.25      2684
       anger       0.41      0.07      0.12      1783

   micro avg       0.42      0.24      0.30     10500
   macro avg       0.42      0.21      0.25     10500
weighted avg       0.42      0.24      0.28     10500
 samples avg       0.08      0.08      0.07     10500



## Scikit-Learn One vs All SVM 

In [78]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC


# One-vs-rest wrapper around an RBF-kernel SVC; probability=True because predict_proba
# is called on this classifier in a later cell.
svm_classifier = OneVsRestClassifier(
    SVC(kernel='rbf', gamma=0.5, C=100, verbose=1, probability=True, random_state=42))
# Fitted on the sampled subset, with labels binarised by prepare_y.
svm_classifier.fit(x_train_sample, prepare_y(y_train_sample))


[LibSVM][LibSVM][LibSVM][LibSVM]

OneVsRestClassifier(estimator=SVC(C=100, cache_size=200, class_weight=None,
                                  coef0=0.0, decision_function_shape='ovr',
                                  degree=3, gamma=0.5, kernel='rbf',
                                  max_iter=-1, probability=True,
                                  random_state=42, shrinking=True, tol=0.001,
                                  verbose=1),
                    n_jobs=None)

In [79]:
# Scores for the whole embedded test set in a single call.
y_pred_sklearn = svm_classifier.predict_proba(x_test_sklearn)


In [81]:
# Threshold the SVM scores and compare them with the binarised test labels.
y_pred_labeled_sklearn = utils.continuous_labels_to_binary(
    np.array(list(map(list, y_pred_sklearn))),
    CONTINUES_TO_BINARY_THRESHOLD,
)
print(classification_report(y_test_labeled, y_pred_labeled_sklearn, target_names=EMOTIONS))


              precision    recall  f1-score   support

     sadness       0.49      0.37      0.42      2430
         joy       0.47      0.65      0.55      3603
        fear       0.54      0.48      0.51      2684
       anger       0.47      0.27      0.34      1783

   micro avg       0.49      0.48      0.48     10500
   macro avg       0.49      0.44      0.45     10500
weighted avg       0.49      0.48      0.47     10500
 samples avg       0.15      0.16      0.15     10500

