In [2]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import numpy as np

import re
import io

In [3]:
# Class ids used throughout the notebook and the reverse (name -> id) lookup
# derived from them, so the two mappings cannot drift apart.
label2emotion = dict(enumerate(("others", "happy", "sad", "angry")))
emotion2label = {emotion: label for label, emotion in label2emotion.items()}

# Extra token -> replacement entries handed to TextPreProcessor (via `dicts=`)
# on top of the stock ekphrasis `emoticons` dictionary.
# Most keys are lower-case emoticons written with a non-breaking hyphen (U+2011).
# 'angru' is not an emoticon: it maps a misspelling to the word 'angry'.
# NOTE(review): the keys containing U+FFFD ('�') look like emoticons whose
# original characters were lost to an encoding problem — confirm against the
# original source before relying on them.
emoticons_additional = {
    '(^・^)': '<happy>', ':‑c': '<sad>', '=‑d': '<happy>', ":'‑)": '<happy>', ':‑d': '<laugh>',
    ':‑(': '<sad>', ';‑)': '<happy>', ':‑)': '<happy>', ':\\/': '<sad>', 'd=<': '<annoyed>',
    ':‑/': '<annoyed>', ';‑]': '<happy>', '(^�^)': '<happy>', 'angru': 'angry', "d‑':":
        '<annoyed>', ":'‑(": '<sad>', ":‑[": '<annoyed>', '(�?�)': '<happy>', 'x‑d': '<laugh>',
}

# Shared ekphrasis pre-processing pipeline; tokenize() runs every conversation
# turn through it. Building it loads the "twitter" word statistics used by the
# segmenter and the spell corrector.
text_processor = TextPreProcessor(
    # Terms that will be normalized (replaced by a placeholder tag).
    # Each term is listed once: 'url' used to appear twice, which only made the
    # same substitution run a second time.
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'date', 'number'],
    # Terms that will be annotated.
    annotate={"hashtag", "allcaps", "elongated", "repeated",
              'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    # Corpus from which the word statistics are going to be used
    # for word segmentation.
    segmenter="twitter",
    # Corpus from which the word statistics are going to be used
    # for spell correction.
    corrector="twitter",
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    # Tokenizer callable: takes a string and returns a list of tokens.
    # SocialTokenizer is configured to lower-case its output.
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    # List of dictionaries used to replace tokens extracted from the text
    # with other expressions; more than one dictionary can be passed.
    dicts=[emoticons, emoticons_additional]
)


def tokenize(text):
    """Pre-process ``text`` with ``text_processor`` and return its tokens joined by single spaces."""
    tokens = text_processor.pre_process_doc(text)
    return " ".join(tokens)


def preprocessData(dataFilePath, mode):
    """Load a tab-separated conversation file and tokenize its three turns.

    The first line of the file is skipped as a header. Every remaining
    non-blank line is split on tabs: fields 1-3 are the conversation turns
    (each passed through ``tokenize``) and, when ``mode == "train"``, field 4
    is an emotion name looked up in ``emotion2label``.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of raising ``IndexError``.

    Args:
        dataFilePath: Path of the UTF-8 encoded input file.
        mode: ``"train"`` to also read and return the labels; any other value
            returns the conversations only.

    Returns:
        ``(conversations, labels)`` as NumPy arrays when ``mode == "train"``,
        otherwise only the ``conversations`` array. ``conversations`` holds
        one row of three tokenized turns per data line.

    Raises:
        IndexError: if a non-blank line has too few tab-separated fields.
        KeyError: if a label is not a key of ``emotion2label``.
    """
    with_labels = mode == "train"
    conversations = []
    labels = []
    with io.open(dataFilePath, encoding="utf8") as finput:
        finput.readline()  # skip the header row
        for line in finput:
            line = line.strip()
            if not line:
                # Nothing to parse on an empty line.
                continue
            fields = line.split('\t')
            # Explicit indexing (not slicing) so short rows still fail loudly.
            conv = [tokenize(fields[i]) for i in range(1, 4)]
            if with_labels:
                labels.append(emotion2label[fields[4]])
            conversations.append(conv)
    if with_labels:
        return np.array(conversations), np.array(labels)
    return np.array(conversations)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Word statistics files not found!
Downloading... done!
Unpacking... done!
Reading twitter - 1grams ...
generating cache file for faster loading...
reading ngrams /Users/paulvilledieu/.ekphrasis/stats/twitter/counts_1grams.txt
Reading twitter - 2grams ...
generating cache file for faster loading...
reading ngrams /Users/paulvilledieu/.ekphrasis/stats/twitter/counts_2grams.txt
Reading twitter - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [4]:
# mode="train" makes preprocessData return the labels together with the texts,
# which is needed for the dev file as well.
train_path = '../projet2/train.txt'
dev_path = '../projet2/dev.txt'

texts_train, labels_train = preprocessData(train_path, mode="train")
texts_dev, labels_dev = preprocessData(dev_path, mode="train")

In [5]:
def getEmbeddings(file):
    """Read a whitespace-separated text embedding file.

    Each non-blank line is split on whitespace: the first item is the token
    and the remaining items are parsed as the ``float32`` components of its
    vector. Blank lines are skipped; previously they raised ``IndexError``.

    Args:
        file: Path of the UTF-8 encoded embedding file.

    Returns:
        ``(embeddingsIndex, dim)`` where ``embeddingsIndex`` maps each token
        to its NumPy vector and ``dim`` is the length of the last vector read
        (``0`` when the file contains no vectors).
    """
    embeddingsIndex = {}
    dim = 0
    with io.open(file, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Empty or whitespace-only line: no token to read.
                continue
            word = values[0]
            embeddingVector = np.asarray(values[1:], dtype='float32')
            embeddingsIndex[word] = embeddingVector
            dim = len(embeddingVector)
    return embeddingsIndex, dim


def getEmbeddingMatrix(wordIndex, embeddings, dim):
    """Build the weight matrix that initialises the Embedding layer.

    Args:
        wordIndex: Mapping ``word -> row index``. The matrix has
            ``len(wordIndex) + 1`` rows, so indices up to ``len(wordIndex)``
            are valid.
        embeddings: Mapping ``word -> vector`` of length ``dim``.
        dim: Number of columns of the matrix.

    Returns:
        A ``(len(wordIndex) + 1, dim)`` float array whose row ``i`` is the
        vector of the word indexed ``i``. Rows with no vector (any index not
        present in ``wordIndex``, and words missing from ``embeddings``) stay
        all-zero.
    """
    embeddingMatrix = np.zeros((len(wordIndex) + 1, dim))
    for word, i in wordIndex.items():
        embeddingVector = embeddings.get(word)
        # A missing word yields None; writing that into the float matrix
        # would corrupt the row, so leave it at zero instead.
        if embeddingVector is not None:
            embeddingMatrix[i] = embeddingVector
    return embeddingMatrix

In [6]:
from keras.preprocessing.text import Tokenizer

# Load the pre-trained vectors, then fit the Keras tokenizer on the embedding
# tokens themselves so the vocabulary is exactly the set of embedded words.
embeddings, dim = getEmbeddings('../projet2/emosense.300d.txt')

tokenizer = Tokenizer(filters='')
vocabulary_text = ' '.join(embeddings)  # iterating the dict yields its keys
tokenizer.fit_on_texts([vocabulary_text])

wordIndex = tokenizer.word_index
print("Found %s unique tokens." % len(wordIndex))

embeddings_matrix = getEmbeddingMatrix(wordIndex, embeddings, dim)

Using TensorFlow backend.


Found 658129 unique tokens.


In [7]:
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Length passed to pad_sequences for every conversation turn.
MAX_SEQUENCE_LENGTH = 24

# Hold out 20% of the training conversations for validation; the fixed
# random_state keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(texts_train, labels_train, test_size=0.2, random_state=42)

# One-hot encode with an explicit class count. Without num_classes,
# to_categorical infers the width from the largest label present, so a split
# that happens to miss the highest class id would get fewer columns than the
# model's output layer.
NUM_CLASSES = len(label2emotion)
labels_categorical_train = to_categorical(np.asarray(y_train), num_classes=NUM_CLASSES)
labels_categorical_val = to_categorical(np.asarray(y_val), num_classes=NUM_CLASSES)
labels_categorical_dev = to_categorical(np.asarray(labels_dev), num_classes=NUM_CLASSES)


def get_sequances(texts, sequence_length):
    """Turn the three turn columns of ``texts`` into padded index sequences.

    Column ``k`` of the 2-D ``texts`` array is mapped to integer sequences by
    the module-level ``tokenizer`` and then passed to ``pad_sequences``
    together with ``sequence_length``.

    Returns:
        A 3-tuple ``(first, second, third)`` with one padded result per turn.
    """
    def encode_turn(column):
        sequences = tokenizer.texts_to_sequences(texts[:, column])
        return pad_sequences(sequences, sequence_length)

    return encode_turn(0), encode_turn(1), encode_turn(2)


# Encode the three turns of each split (train / validation / dev) as padded
# index sequences of length MAX_SEQUENCE_LENGTH.
(message_first_message_train,
 message_second_message_train,
 message_third_message_train) = get_sequances(X_train, MAX_SEQUENCE_LENGTH)

(message_first_message_val,
 message_second_message_val,
 message_third_message_val) = get_sequances(X_val, MAX_SEQUENCE_LENGTH)

(message_first_message_dev,
 message_second_message_dev,
 message_third_message_dev) = get_sequances(texts_dev, MAX_SEQUENCE_LENGTH)

# Model 1

In [8]:
from keras.layers import Input, Dense, Embedding, Concatenate, Activation, \
    Dropout, LSTM, Bidirectional, GlobalMaxPooling1D, GaussianNoise
from keras.models import Model


def buildModel(embeddings_matrix, sequence_length, lstm_dim, hidden_layer_dim, num_classes,
               noise=0.1, dropout_lstm=0.2, dropout=0.2):
    """Build and compile the three-turn classifier (Model 1).

    Each turn is embedded by one shared, frozen Embedding layer initialised
    from ``embeddings_matrix``, perturbed with Gaussian noise and encoded by
    a bidirectional LSTM. Turns 1 and 3 go through the same encoder; turn 2
    has its own. The three encodings are concatenated along the feature axis
    and passed through dropout, a ReLU hidden layer and a softmax output.

    Args:
        embeddings_matrix: 2-D array of pre-trained vectors (rows x embedding size).
        sequence_length: Number of token ids per turn.
        lstm_dim: Units of each LSTM direction.
        hidden_layer_dim: Units of the hidden Dense layer.
        num_classes: Size of the softmax output.
        noise: Standard deviation given to GaussianNoise.
        dropout_lstm: ``dropout`` argument of the LSTM layers.
        dropout: Rate of the Dropout layer before the hidden layer.

    Returns:
        A compiled Keras ``Model`` taking ``[turn1, turn2, turn3]`` int32
        inputs (categorical cross-entropy loss, Adam optimizer, accuracy metric).
    """
    turn_inputs = [Input(shape=(sequence_length,), dtype='int32') for _ in range(3)]

    embedding_dim = embeddings_matrix.shape[1]
    shared_embedding = Embedding(embeddings_matrix.shape[0],
                                 embedding_dim,
                                 weights=[embeddings_matrix],
                                 input_length=sequence_length,
                                 trainable=False)
    embedded_turns = [shared_embedding(turn) for turn in turn_inputs]

    # A fresh noise layer per turn, created in turn order.
    noisy_turns = [
        GaussianNoise(noise, input_shape=(None, sequence_length, embedding_dim))(turn)
        for turn in embedded_turns
    ]

    # Turns 1 and 3 share weights; turn 2 is encoded separately.
    outer_encoder = Bidirectional(LSTM(lstm_dim, dropout=dropout_lstm))
    middle_encoder = Bidirectional(LSTM(lstm_dim, dropout=dropout_lstm))
    encoders = (outer_encoder, middle_encoder, outer_encoder)
    encoded_turns = [encode(turn) for encode, turn in zip(encoders, noisy_turns)]

    merged = Concatenate(axis=-1)(encoded_turns)
    regularised = Dropout(dropout)(merged)
    hidden = Dense(hidden_layer_dim, activation='relu')(regularised)
    probabilities = Dense(num_classes, activation='softmax')(hidden)

    classifier = Model(inputs=turn_inputs, outputs=probabilities)
    classifier.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    return classifier

# Instantiate Model 1 with the hyper-parameters used for the run below.
model = buildModel(
    embeddings_matrix,
    MAX_SEQUENCE_LENGTH,
    lstm_dim=64,
    hidden_layer_dim=30,
    num_classes=4,
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [9]:
# Print the layer table of Model 1 (layer type, output shape, parameter count, connections).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 24)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 24)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 24)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 24, 300)      197439000   input_1[0][0]                    
                                                                 input_2[0][0]                    
          

In [15]:
#from kutilities.callbacks import MetricsCallback, PlottingCallback
from sklearn.metrics import f1_score, precision_score, recall_score
from keras.callbacks import ModelCheckpoint, TensorBoard

def _emotion_class_ids():
    """Label ids of the three emotion classes (everything except 'others')."""
    return [emotion2label[name] for name in ('happy', 'sad', 'angry')]


def _micro_f1_emotions(y_test, y_pred):
    """Micro-averaged F1 restricted to the happy / sad / angry labels."""
    return f1_score(y_test, y_pred, average='micro', labels=_emotion_class_ids())


def _micro_precision_emotions(y_test, y_pred):
    """Micro-averaged precision restricted to the happy / sad / angry labels."""
    return precision_score(y_test, y_pred, average='micro', labels=_emotion_class_ids())


# Metric name -> callable(y_test, y_pred) working on integer class ids.
metrics = {
    "f1_e": _micro_f1_emotions,
    "precision_e": _micro_precision_emotions,
}

# _datasets = {}
# _datasets["dev"] = [[message_first_message_dev, message_second_message_dev, message_third_message_dev],
#                     np.array(labels_categorical_dev)]
# _datasets["val"] = [[message_first_message_val, message_second_message_val, message_third_message_val],
#                     np.array(labels_categorical_val)]

# metrics_callback = MetricsCallback(datasets=_datasets, metrics=metrics)

# filepath = "models/bidirectional_LSTM_best_weights_{epoch:02d}-{val_acc:.4f}.hdf5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_acc', save_best_only=True, save_weights_only=False,
#                              mode='auto', period=1)
# tensorboardCallback = TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True)

In [11]:
# Train Model 1 on the three padded turn inputs, validating on the held-out split.
train_inputs = [message_first_message_train, message_second_message_train, message_third_message_train]
val_inputs = [message_first_message_val, message_second_message_val, message_third_message_val]

history = model.fit(train_inputs,
                    np.array(labels_categorical_train),
                    validation_data=(val_inputs, np.array(labels_categorical_val)),
                    epochs=20,
                    batch_size=200)

Instructions for updating:
Use tf.cast instead.
Train on 24128 samples, validate on 6032 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [12]:
# Class probabilities of Model 1 for every dev conversation.
y_pred = model.predict([
    message_first_message_dev,
    message_second_message_dev,
    message_third_message_dev,
])

In [16]:
from sklearn.metrics import classification_report

# Collapse one-hot labels and predicted probabilities to class ids once,
# then print every custom metric followed by the per-class report.
y_true_dev = labels_categorical_dev.argmax(axis=1)
y_hat_dev = y_pred.argmax(axis=1)

for title, metric in metrics.items():
    print(title, metric(y_true_dev, y_hat_dev))
print(classification_report(y_true_dev, y_hat_dev))

f1_e 0.7055150884495317
precision_e 0.6231617647058824
              precision    recall  f1-score   support

           0       0.97      0.92      0.94      2338
           1       0.61      0.77      0.69       142
           2       0.73      0.82      0.77       125
           3       0.57      0.84      0.68       150

   micro avg       0.90      0.90      0.90      2755
   macro avg       0.72      0.84      0.77      2755
weighted avg       0.92      0.90      0.91      2755



# Model 2

In [60]:
from keras.layers import Input, Dense, Embedding, Concatenate, Activation, \
    Dropout, LSTM, Bidirectional, GlobalMaxPooling1D, GaussianNoise, RepeatVector
from keras.models import Model


def buildModel2(embeddings_matrix, sequence_length, lstm_dim, hidden_layer_dim, num_classes,
                noise=0.1, dropout_lstm=0.2, dropout=0.2):
    """Build and compile the hierarchical three-turn classifier (Model 2).

    Turns are embedded (shared, frozen Embedding), perturbed with Gaussian
    noise and encoded by bidirectional LSTMs exactly as in Model 1: turns 1
    and 3 share an encoder, turn 2 has its own. Each turn encoding is then
    given a length-1 time axis, the three are concatenated along that axis
    into a 3-step sequence, and a second LSTM reads the sequence before
    dropout, a ReLU hidden layer and the softmax output.

    Args:
        embeddings_matrix: 2-D array of pre-trained vectors (rows x embedding size).
        sequence_length: Number of token ids per turn.
        lstm_dim: Units of each LSTM (turn-level directions and conversation-level).
        hidden_layer_dim: Units of the hidden Dense layer.
        num_classes: Size of the softmax output.
        noise: Standard deviation given to GaussianNoise.
        dropout_lstm: ``dropout`` argument of the LSTM layers.
        dropout: Rate of the Dropout layer before the hidden layer.

    Returns:
        A compiled Keras ``Model`` taking ``[turn1, turn2, turn3]`` int32 inputs.
    """
    turn_inputs = [Input(shape=(sequence_length,), dtype='int32') for _ in range(3)]

    embedding_dim = embeddings_matrix.shape[1]
    shared_embedding = Embedding(embeddings_matrix.shape[0],
                                 embedding_dim,
                                 weights=[embeddings_matrix],
                                 input_length=sequence_length,
                                 trainable=False)
    embedded_turns = [shared_embedding(turn) for turn in turn_inputs]

    noisy_turns = [
        GaussianNoise(noise, input_shape=(None, sequence_length, embedding_dim))(turn)
        for turn in embedded_turns
    ]

    # Turns 1 and 3 share weights; turn 2 is encoded separately.
    outer_encoder = Bidirectional(LSTM(lstm_dim, dropout=dropout_lstm))
    middle_encoder = Bidirectional(LSTM(lstm_dim, dropout=dropout_lstm))
    encoders = (outer_encoder, middle_encoder, outer_encoder)
    encoded_turns = [encode(turn) for encode, turn in zip(encoders, noisy_turns)]

    # RepeatVector(1) adds a time axis of length 1 so the turns can be
    # stacked into a sequence for the conversation-level LSTM.
    turn_steps = [RepeatVector(1)(encoding) for encoding in encoded_turns]
    conversation = Concatenate(axis=1)(turn_steps)

    conversation_state = LSTM(lstm_dim, dropout=dropout_lstm)(conversation)
    regularised = Dropout(dropout)(conversation_state)
    hidden = Dense(hidden_layer_dim, activation='relu')(regularised)
    probabilities = Dense(num_classes, activation='softmax')(hidden)

    classifier = Model(inputs=turn_inputs, outputs=probabilities)
    classifier.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    return classifier

# Instantiate Model 2 with the same hyper-parameters as Model 1.
model2 = buildModel2(
    embeddings_matrix,
    MAX_SEQUENCE_LENGTH,
    lstm_dim=64,
    hidden_layer_dim=30,
    num_classes=4,
)

In [61]:
# Print the layer table of Model 2 (layer type, output shape, parameter count, connections).
model2.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_92 (InputLayer)           (None, 24)           0                                            
__________________________________________________________________________________________________
input_93 (InputLayer)           (None, 24)           0                                            
__________________________________________________________________________________________________
input_94 (InputLayer)           (None, 24)           0                                            
__________________________________________________________________________________________________
embedding_31 (Embedding)        (None, 24, 300)      197439000   input_92[0][0]                   
                                                                 input_93[0][0]                   
          

In [56]:
# Train Model 2 on the three padded turn inputs, validating on the held-out split.
# NOTE(review): the saved output of this cell shows "Epoch 1/15" and a
# KeyboardInterrupt, so it was produced by a different run than the
# epochs=9 configured here — re-run to refresh it.
train_inputs = [message_first_message_train, message_second_message_train, message_third_message_train]
val_inputs = [message_first_message_val, message_second_message_val, message_third_message_val]

history = model2.fit(train_inputs,
                     np.array(labels_categorical_train),
                     validation_data=(val_inputs, np.array(labels_categorical_val)),
                     epochs=9,
                     batch_size=200)

Train on 24128 samples, validate on 6032 samples
Epoch 1/15
 2600/24128 [==>...........................] - ETA: 1:44 - loss: 1.2897 - acc: 0.4669

KeyboardInterrupt: 

In [49]:
# Class probabilities of Model 2 for every dev conversation.
y_pred = model2.predict([
    message_first_message_dev,
    message_second_message_dev,
    message_third_message_dev,
])

In [50]:
from sklearn.metrics import classification_report

# Collapse one-hot labels and predicted probabilities to class ids once,
# then print every custom metric followed by the per-class report.
y_true_dev = labels_categorical_dev.argmax(axis=1)
y_hat_dev = y_pred.argmax(axis=1)

for title, metric in metrics.items():
    print(title, metric(y_true_dev, y_hat_dev))
print(classification_report(y_true_dev, y_hat_dev))

f1_e 0.6843657817109144
precision_e 0.58
              precision    recall  f1-score   support

           0       0.97      0.90      0.93      2338
           1       0.60      0.77      0.67       142
           2       0.61      0.84      0.71       125
           3       0.54      0.89      0.68       150

   micro avg       0.89      0.89      0.89      2755
   macro avg       0.68      0.85      0.75      2755
weighted avg       0.91      0.89      0.90      2755



# Model 3

In [66]:
from keras.layers import Input, Dense, Embedding, Concatenate, Activation, \
    Dropout, LSTM, Bidirectional, GlobalMaxPooling1D, GaussianNoise, RepeatVector, Permute, Reshape, multiply, Flatten
from keras.models import Model

def attention_3d_block(inputs):
    """Re-weight a ``(batch, time_steps, features)`` tensor along its time axis.

    The input is transposed to ``(batch, features, time_steps)``, a Dense
    layer with softmax activation produces one weight per time step for each
    feature, the weights are transposed back and multiplied element-wise with
    ``inputs``.

    The number of time steps is read from ``inputs`` instead of being
    hard-coded to 3, so the block also works for other fixed sequence
    lengths; for a 3-step input the layers built are the same as before.

    Args:
        inputs: Rank-3 Keras tensor whose time and feature dimensions are
            statically known (``int(inputs.shape[1])`` and
            ``int(inputs.shape[2])`` must succeed).

    Returns:
        A tensor with the same shape as ``inputs``.

    Note:
        The final Permute layer has the fixed name ``'attention_vec'``.
        NOTE(review): Keras is expected to reject duplicate layer names, so
        the block is meant to be used once per model — confirm if reuse is needed.
    """
    time_steps = int(inputs.shape[1])
    input_dim = int(inputs.shape[2])
    a = Permute((2, 1))(inputs)
    # Shape is already (input_dim, time_steps) after the permute; the Reshape
    # is kept as an explicit shape check.
    a = Reshape((input_dim, time_steps))(a)
    a = Dense(time_steps, activation='softmax')(a)
    a_probs = Permute((2, 1), name='attention_vec')(a)
    output_attention_mul = multiply([inputs, a_probs])
    return output_attention_mul

def buildModel3(embeddings_matrix, sequence_length, lstm_dim, hidden_layer_dim, num_classes,
                noise=0.1, dropout_lstm=0.2, dropout=0.2):
    """Build and compile the hierarchical classifier with attention (Model 3).

    Turns are embedded (shared, frozen Embedding), perturbed with Gaussian
    noise and encoded by bidirectional LSTMs: turns 1 and 3 share an encoder,
    turn 2 has its own. The three encodings are stacked into a 3-step
    sequence, read by a second LSTM that returns its output at every step,
    re-weighted by ``attention_3d_block`` and flattened before dropout, a
    ReLU hidden layer and the softmax output.

    Args:
        embeddings_matrix: 2-D array of pre-trained vectors (rows x embedding size).
        sequence_length: Number of token ids per turn.
        lstm_dim: Units of each LSTM (turn-level directions and conversation-level).
        hidden_layer_dim: Units of the hidden Dense layer.
        num_classes: Size of the softmax output.
        noise: Standard deviation given to GaussianNoise.
        dropout_lstm: ``dropout`` argument of the LSTM layers.
        dropout: Rate of the Dropout layer before the hidden layer.

    Returns:
        A compiled Keras ``Model`` taking ``[turn1, turn2, turn3]`` int32 inputs.
    """
    turn_inputs = [Input(shape=(sequence_length,), dtype='int32') for _ in range(3)]

    embedding_dim = embeddings_matrix.shape[1]
    shared_embedding = Embedding(embeddings_matrix.shape[0],
                                 embedding_dim,
                                 weights=[embeddings_matrix],
                                 input_length=sequence_length,
                                 trainable=False)
    embedded_turns = [shared_embedding(turn) for turn in turn_inputs]

    noisy_turns = [
        GaussianNoise(noise, input_shape=(None, sequence_length, embedding_dim))(turn)
        for turn in embedded_turns
    ]

    # Turns 1 and 3 share weights; turn 2 is encoded separately.
    outer_encoder = Bidirectional(LSTM(lstm_dim, dropout=dropout_lstm))
    middle_encoder = Bidirectional(LSTM(lstm_dim, dropout=dropout_lstm))
    encoders = (outer_encoder, middle_encoder, outer_encoder)
    encoded_turns = [encode(turn) for encode, turn in zip(encoders, noisy_turns)]

    # RepeatVector(1) adds a time axis of length 1 so the turns can be
    # stacked into a sequence for the conversation-level LSTM.
    turn_steps = [RepeatVector(1)(encoding) for encoding in encoded_turns]
    conversation = Concatenate(axis=1)(turn_steps)

    # Keep the output of every step so attention can weight the turns.
    per_turn_states = LSTM(lstm_dim, dropout=dropout_lstm, return_sequences=True)(conversation)
    attended = attention_3d_block(per_turn_states)
    flattened = Flatten()(attended)

    regularised = Dropout(dropout)(flattened)
    hidden = Dense(hidden_layer_dim, activation='relu')(regularised)
    probabilities = Dense(num_classes, activation='softmax')(hidden)

    classifier = Model(inputs=turn_inputs, outputs=probabilities)
    classifier.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    return classifier

# Instantiate Model 3 with the same hyper-parameters as the other models.
model3 = buildModel3(
    embeddings_matrix,
    MAX_SEQUENCE_LENGTH,
    lstm_dim=64,
    hidden_layer_dim=30,
    num_classes=4,
)

In [67]:
# Print the layer table of Model 3 (layer type, output shape, parameter count, connections).
model3.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_107 (InputLayer)          (None, 24)           0                                            
__________________________________________________________________________________________________
input_108 (InputLayer)          (None, 24)           0                                            
__________________________________________________________________________________________________
input_109 (InputLayer)          (None, 24)           0                                            
__________________________________________________________________________________________________
embedding_36 (Embedding)        (None, 24, 300)      197439000   input_107[0][0]                  
                                                                 input_108[0][0]                  
          

In [68]:
# Train Model 3 on the three padded turn inputs, validating on the held-out split.
train_inputs = [message_first_message_train, message_second_message_train, message_third_message_train]
val_inputs = [message_first_message_val, message_second_message_val, message_third_message_val]

history = model3.fit(train_inputs,
                     np.array(labels_categorical_train),
                     validation_data=(val_inputs, np.array(labels_categorical_val)),
                     epochs=15,
                     batch_size=200)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 24128 samples, validate on 6032 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [69]:
# Class probabilities of Model 3 for every dev conversation.
y_pred = model3.predict([
    message_first_message_dev,
    message_second_message_dev,
    message_third_message_dev,
])

In [70]:
from sklearn.metrics import classification_report

# Collapse one-hot labels and predicted probabilities to class ids once,
# then print every custom metric followed by the per-class report.
y_true_dev = labels_categorical_dev.argmax(axis=1)
y_hat_dev = y_pred.argmax(axis=1)

for title, metric in metrics.items():
    print(title, metric(y_true_dev, y_hat_dev))
print(classification_report(y_true_dev, y_hat_dev))

f1_e 0.6859344894026975
precision_e 0.573268921095008
              precision    recall  f1-score   support

           0       0.98      0.89      0.93      2338
           1       0.54      0.80      0.65       142
           2       0.70      0.85      0.77       125
           3       0.52      0.91      0.66       150

   micro avg       0.88      0.88      0.88      2755
   macro avg       0.69      0.86      0.75      2755
weighted avg       0.92      0.88      0.89      2755



In [73]:
# Inspect the raw predictions currently held in y_pred
# (one row of four class probabilities per dev conversation).
y_pred

array([[1.3284779e-02, 7.2048115e-06, 7.7815901e-04, 9.8592991e-01],
       [9.9931324e-01, 6.3388346e-05, 3.7864447e-05, 5.8546365e-04],
       [6.7154607e-03, 9.9326718e-01, 1.8156113e-06, 1.5632924e-05],
       ...,
       [6.0109776e-01, 3.7267306e-05, 1.5390149e-03, 3.9732599e-01],
       [9.9379396e-01, 1.5795598e-05, 1.0762960e-04, 6.0825227e-03],
       [9.9751461e-01, 4.3820625e-04, 3.1942478e-04, 1.7277488e-03]],
      dtype=float32)

In [74]:
# Inspect the one-hot encoded dev labels, for comparison with y_pred.
labels_categorical_dev

array([[0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]], dtype=float32)

In [83]:
# (Re)load the IPython autoreload extension and enable mode 2, so that edits
# to imported local modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [88]:
from Metrics import evaluation

In [90]:
# Score the current y_pred against the one-hot dev labels with the
# `evaluation` helper imported from the local Metrics module; it prints
# per-class and "others"-excluded precision / recall / F1.
# NOTE(review): the (predictions, ground truth) argument order is taken from
# this call only — confirm against Metrics.evaluation.
evaluation(y_pred, labels_categorical_dev)

True Positives per class :  [2082.  114.  106.  136.]
False Positives per class :  [ 52.  96.  45. 124.]
False Negatives per class :  [256.  28.  19.  14.]
Class happy : Precision : 0.543, Recall : 0.803, F1 : 0.648
Class sad : Precision : 0.702, Recall : 0.848, F1 : 0.768
Class angry : Precision : 0.523, Recall : 0.907, F1 : 0.663
Ignoring the Others class, Macro Precision : 0.5893, Macro Recall : 0.8525, Macro F1 : 0.6969
Ignoring the Others class, Micro TP : 356, FP : 265, FN : 61
