# General operations

In [2]:
import pandas as pd

def load_data(filename, columns, sep="\t"):
    pd.read_csv(filename, sep=sep, names=columns)

def map_column(df, columns, function):
    df = df.copy()
    for column in columns:
        df[column] = df[column].apply(function)
    return df

# Transformations functions

In [3]:
import numpy as np
import nltk
from keras.preprocessing import text, sequence

nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

def tokenize(texts):
    tokenizer = text.Tokenizer(filters=' ')
    tokenizer.fit_on_texts(texts)
    return tokenizer, tokenizer.word_index

def to_sequences(tokenizer, texts):
    return tokenizer.texts_to_sequences(texts)

def max_sequences_length(texts_list):
    return max([ max(map(lambda text: len(text), texts)) for texts in texts_list ])

def pad_sequences(texts_list, max_length=None):
    if max_length is None:
        max_length = max_sequences_length(texts_list)
    return [ sequence.pad_sequences(texts, maxlen=max_length) for texts in texts_list ]

Using TensorFlow backend.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/beuvry_j/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:

#replace emojy
emojis = {
    u":‑\)":"☺️", u":\)":"☺️", u":-\]":"☺️", u":\]":"☺️", u":-3":"☺️",
    u":3":"☺️",
    u":->":"☺️",
    u":>":"☺️",
    u"8-\)":"☺️",
    u":o\)":"☺️",
    u":-\}":"☺️",
    u":\}":"☺️",
    u":-\)":"☺️",
    u":c\)":"☺️",
    u":\^\)":"☺️",
    u"=\]":"☺️",
    u"=\)":"☺️",
    u":‑D":"😃",
    u":D":"😃",
    u"8‑D":"😃",
    u"8D":"😃",
    u"X‑D":"😃",
    u"XD":"😃",
    u"=D":"😃",
    u"=3":"😃",
    u"B\^D":"😃",
    u":-\)\)":"😃",
    u":‑\(":"☹️",
    u":-\(":"☹️",
    u":\(":"☹️",
    u":‑c":"☹️",
    u":c":"☹️",
    u":‑<":"☹️",
    u":<":"☹️",
    u":‑\[":"☹️",
    u":\[":"☹️",
    u":-\|\|":"☹️",
    u">:\[":"☹️",
    u":\{":"☹️",
    u":@":"☹️",
    u">:\(":"☹️",
    u":'‑\(":"😭",
    u":'\(":"😭",
    u":'‑\)":"😃",
    u":'\)":"😃",
    u"D‑':":"😨",
    u"D:<":"😨",
    u"D:":"😧",
    u"D8":"😧",
    u"D;":"😧",
    u"D=":"😧",
    u"DX":"😧",
    u":‑O":"😮",
    u":O":"😮",
    u":‑o":"😮",
    u":o":"😮",
    u":-0":"😮",
    u"8‑0":"😮",
    u">:O":"😮",
    u":-\*":"😗",
    u":\*":"😗",
    u":X":"😗",
    u";‑\)":"😉",
    u";\)":"😉",
    u"\*-\)":"😉",
    u"\*\)":"😉",
    u";‑\]":"😉",
    u";\]":"😉",
    u";\^\)":"😉",
    u":‑,":"😉",
    u";D":"😉",
    u":‑P":"😛",
    u":P":"😛",
    u"X‑P":"😛",
    u"XP":"😛",
    u":‑Þ":"😛",
    u":Þ":"😛",
    u":b":"😛",
    u"d:":"😛",
    u"=p":"😛",
    u">:P":"😛",
    u":‑/":"😕",
    u":/":"😕",
    u":-[.]":"😕",
    u">:[(\\\)]":"😕",
    u">:/":"😕",
    u":[(\\\)]":"😕",
    u"=/":"😕",
    u"=[(\\\)]":"😕",
    u":L":"😕",
    u"=L":"😕",
    u":S":"😕",
    u":‑\|":"😐",
    u":\|":"😐",
    u":$":"😳",
    u":‑x":"🤐",
    u":x":"🤐",
    u":‑#":"🤐",
    u":#":"🤐",
    u":‑&":"🤐",
    u":&":"🤐",
    u"O:‑\)":"😇",
    u"O:\)":"😇",
    u"0:‑3":"😇",
    u"0:3":"😇",
    u"0:‑\)":"😇",
    u"0:\)":"😇",
    u":‑b":"😛",
    u"0;\^\)":"😇",
    u">:‑\)":"😈",
    u">:\)":"😈",
    u"\}:‑\)":"😈",
    u"\}:\)":"😈",
    u"3:‑\)":"😈",
    u"3:\)":"😈",
    u">;\)":"😈",
    u"\|;‑\)":"😎",
    u"\|‑O":"😏",
    u":‑J":"😏",
    u"%‑\)":"😵",
    u"%\)":"😵",
    u":-###..":"🤒",
    u":###..":"🤒",
    u"\(>_<\)":"😣",
    u"\(>_<\)>":"😣",
    u"\(';'\)":"👶",
    u"\(\^\^>``":"😓",
    u"\(\^_\^;\)":"😓",
    u"\(-_-;\)":"😓",
    u"\(~_~;\) \(・\.・;\)":"😓",
    u"\(-_-\)zzz":"😴",
    u"\(\^_-\)":"😉",
    u"\(\(\+_\+\)\)":"😕",
    u"\(\+o\+\)":"😕",
    u"\^_\^":"😃",
    u"\(\^_\^\)/":"😃",
    u"\(\^O\^\)／":"😃",
    u"\(\^o\^\)／":"😃",
    u"\(__\)":"🙇",
    u"_\(\._\.\)_":"🙇",
    u"<\(_ _\)>":"🙇",
    u"<m\(__\)m>":"🙇",
    u"m\(__\)m":"🙇",
    u"m\(_ _\)m":"🙇",
    u"\('_'\)":"😭",
    u"\(/_;\)":"😭",
    u"\(T_T\) \(;_;\)":"😭",
    u"\(;_;":"😭",
    u"\(;_:\)":"😭",
    u"\(;O;\)":"😭",
    u"\(:_;\)":"😭",
    u"\(ToT\)":"😭",
    u";_;":"😭",
    u";-;":"😭",
    u";n;":"😭",
    u";;":"😭",
    u"Q\.Q":"😭",
    u"T\.T":"😭",
    u"QQ":"😭",
    u"Q_Q":"😭",
    u"\(-\.-\)":"😞",
    u"\(-_-\)":"😞",
    u"\(一一\)":"😞",
    u"\(；一_一\)":"😞",
    u"\(=_=\)":"😩",
    u"\(=\^\·\^=\)":"😺",
    u"\(=\^\·\·\^=\)":"😺",
    u"=_\^=	":"😺",
    u"\(\.\.\)":"😔",
    u"\(\._\.\)":"😔",
    u"\(\・\・?":"😕",
    u"\(?_?\)":"😕",
    u">\^_\^<":"😃",
    u"<\^!\^>":"😃",
    u"\^/\^":"😃",
    u"\（\*\^_\^\*）" :"😃",
    u"\(\^<\^\) \(\^\.\^\)":"😃",
    u"\(^\^\)":"😃",
    u"\(\^\.\^\)":"😃",
    u"\(\^_\^\.\)":"😃",
    u"\(\^_\^\)":"😃",
    u"\(\^\^\)":"😃",
    u"\(\^J\^\)":"😃",
    u"\(\*\^\.\^\*\)":"😃",
    u"\(\^—\^\）":"😃",
    u"\(#\^\.\^#\)":"😃",
    u"\（\^—\^\）":"👋",
    u"\(;_;\)/~~~":"👋",
    u"\(\^\.\^\)/~~~":"👋",
    u"\(T_T\)/~~~":"👋",
    u"\(ToT\)/~~~":"👋",
    u"\(\*\^0\^\*\)":"😍",
    u"\(\*_\*\)":"😍",
    u"\(\*_\*;":"😍",
    u"\(\+_\+\) \(@_@\)":"😍",
    u"\(\*\^\^\)v":"😂",
    u"\(\^_\^\)v":"😂",
    u'\(-"-\)':"😓",
    u"\(ーー;\)":"😓",
    u"\(\^0_0\^\)":"😎",
    u"\(\＾ｖ\＾\)":"😀",
    u"\(\＾ｕ\＾\)":"😀",
    u"\(\^\)o\(\^\)":"😀",
    u"\(\^O\^\)":"😀",
    u"\(\^o\^\)":"😀",
    u"\)\^o\^\(":"😀",
    u":O o_O":"😮",
    u"o_0":"😮",
    u"o\.O":"😮",
    u"\(o\.o\)":"😮",
    u"oO":"😮",
    u"\(\*￣m￣\)":"😠",
    u":‑)":"☺️",
    u":)":"☺️",
    u":-]":"☺️",
    u":]":"☺️",
    u":-3":"☺️",
    u":3":"☺️",
    u":->":"☺️",
    u":>":"☺️",
    u"8-)":"☺️",
    u":o)":"☺️",
    u":-}":"☺️",
    u":}":"☺️",
    u":-)":"☺️",
    u":c)":"☺️",
    u":^)":"☺️",
    u"=]":"☺️",
    u"=)":"☺️",
    u":‑D":"😃",
    u":D":"😃",
    u"8‑D":"😃",
    u"8D":"😃",
    u"X‑D":"😃",
    u"XD":"😃",
    u"=D":"😃",
    u"=3":"😃",
    u"B^D":"😃",
    u":-))":"😃",
    u":-(":"☹️",
    u":‑(":"☹️",
    u":(":"☹️",
    u":‑c":"☹️",
    u":c":"☹️",
    u":‑<":"☹️",
    u":<":"☹️",
    u":‑[":"☹️",
    u":[":"☹️",
    u":-||":"☹️",
    u">:[":"☹️",
    u":{":"☹️",
    u":@":"☹️",
    u">:(":"☹️",
    u":'‑(":"😭",
    u":'(":"😭",
    u":'‑)":"😃",
    u":')":"😃",
    u"D‑':":"😧",
    u"D:<":"😨",
    u"D:":"😧",
    u"D8":"😧",
    u"D;":"😧",
    u"D=":"😧",
    u"DX":"😧",
    u":‑O":"😮",
    u":O":"😮",
    u":‑o":"😮",
    u":o":"😮",
    u":-0":"😮",
    u"8‑0":"😮",
    u">:O":"😮",
    u":-*":"😗",
    u":*":"😗",
    u":X":"😗",
    u";‑)":"😉",
    u";)":"😉",
    u"*-)":"😉",
    u"*)":"😉",
    u";‑]":"😉",
    u";]":"😉",
    u";^)":"😉",
    u":‑,":"😉",
    u";D":"😉",
    u":‑P":"😛",
    u":P":"😛",
    u"X‑P":"😛",
    u"XP":"😛",
    u":‑Þ":"😛",
    u":Þ":"😛",
    u":b":"😛",
    u"d:":"😛",
    u"=p":"😛",
    u">:P":"😛",
    u":‑/":"😕",
    u":/":"😕",
    u":-[.]":"😕",
    u">:[(\)]":"😕",
    u">:/":"😕",
    u":[(\)]":"😕",
    u"=/":"😕",
    u"=[(\)]":"😕",
    u":L":"😕",
    u"=L":"😕",
    u":S":"😕",
    u":‑|":"😐",
    u":|":"😐",
    u":$":"😳",
    u":‑x":"🤐",
    u":x":"🤐",
    u":‑#":"🤐",
    u":#":"🤐",
    u":‑&":"🤐",
    u":&":"🤐",
    u"O:‑)":"😇",
    u"O:)":"😇",
    u"0:‑3":"😇",
    u"0:3":"😇",
    u"0:‑)":"😇",
    u"0:)":"😇",
    u":‑b":"😛",
    u"0;^)":"😇",
    u">:‑)":"😈",
    u">:)":"😈",
    u"}:‑)":"😈",
    u"}:)":"😈",
    u"3:‑)":"😈",
    u"3:)":"😈",
    u">;)":"😈",
    u"|;‑)":"😎",
    u"|‑O":"😏",
    u":‑J":"😏",
    u"%‑)":"😵",
    u"%)":"😵",
    u":-###..":"🤒",
    u":###..":"🤒",
    u"(>_<)":"😣",
    u"(>_<)>":"😣",
    u"(';')":"Baby",
    u"(^^>``":"😓",
    u"(^_^;)":"😓",
    u"(-_-;)":"😓",
    u"(~_~;) (・.・;)":"😓",
    u"(-_-)zzz":"😴",
    u"(^_-)":"😉",
    u"((+_+))":"😕",
    u"(+o+)":"😕",
    u"^_^":"😃",
    u"(^_^)/":"😃",
    u"(^O^)／":"😃",
    u"(^o^)／":"😃",
    u"(__)":"🙇",
    u"_(._.)_":"🙇",
    u"<(_ _)>":"🙇",
    u"<m(__)m>":"🙇",
    u"m(__)m":"🙇",
    u"m(_ _)m":"🙇",
    u"('_')":"😭",
    u"(/_;)":"😭",
    u"(T_T) (;_;)":"😭",
    u"(;_;":"😭",
    u"(;_:)":"😭",
    u"(;O;)":"😭",
    u"(:_;)":"😭",
    u"(ToT)":"😭",
    u";_;":"😭",
    u";-;":"😭",
    u";n;":"😭",
    u";;":"😭",
    u"Q.Q":"😭",
    u"T.T":"😭",
    u"QQ":"😭",
    u"Q_Q":"😭",
    u"(-.-)":"😞",
    u"(-_-)":"😞",
    u"(一一)":"😞",
    u"(；一_一)":"😞",
    u"(=_=)":"😩",
    u"(=^·^=)":"😺",
    u"(=^··^=)":"😺",
    u"=_^= ":"😺",
    u"(..)":"😔",
    u"(._.)":"😔",
    u"(・・?":"😕",
    u"(?_?)":"😕",
    u">^_^<":"😃",
    u"<^!^>":"😃",
    u"^/^":"😃",
    u"（*^_^*）" :"😃",
    u"(^<^) (^.^)":"😃",
    u"(^^)":"😃",
    u"(^.^)":"😃",
    u"(^_^.)":"😃",
    u"(^_^)":"😃",
    u"(^^)":"😃",
    u"(^J^)":"😃",
    u"(*^.^*)":"😃",
    u"(^—^）":"😃",
    u"(#^.^#)":"😃",
    u"（^—^）":"👋",
    u"(;_;)/~~~":"👋",
    u"(^.^)/~~~":"👋",
    u"(-_-)/~~~ ($··)/~~~":"👋",
    u"(T_T)/~~~":"👋",
    u"(ToT)/~~~":"👋",
    u"(*^0^*)":"😍",
    u"(*_*)":"😍",
    u"(*_*;":"😍",
    u"(+_+) (@_@)":"😍",
    u"(*^^)v":"😂",
    u"(^_^)v":"😂",
    u'(-"-)':"😓",
    u"(ーー;)":"😓",
    u"(^0_0^)":"😎",
    u"(＾ｖ＾)":"😀",
    u"(＾ｕ＾)":"😀",
    u"(^)o(^)":"😀",
    u"(^O^)":"😀", u"(^o^)":"😀", u")^o^(":"😀", u":O o_O":"😮",
    u"o_0":"😮", u"o.O":"😮", u"(o.o)":"😮", u"oO":"😮",
}


def str2emoji(text):

    for pos,ej in enumerate(text):
        if ej in emojis:
            text[pos]=emojis[ej]
    return text

In [5]:
import re
import string
from unidecode import unidecode

def norm_text(text):
    text = re.sub(r"\\u2019", "'", text)
    text = re.sub(r"\\u002c", ",", text)
    text = ' '.join(str2emoji(unidecode(text).lower().split()))
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r" can\'t", " cannot", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'s", "", text)
    text = re.sub(r"\'n", "", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"@\w+", r' ',text)
    text = re.sub(r"#\w+", r' ',text)
    text = re.sub(r"[.]+"," ",text)
    # Remove stopwords and punctuations
    text = ' '.join([ word for word in text.split() if (word not in stopwords) and (word not in string.punctuation ) ])
    return text

In [6]:
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

def load_embeddings_matrix(filename, word_index, embedding_dim=300):
    nb_words=len(word_index)+1
    
    embedding_matrix = np.zeros((nb_words, embedding_dim))
    word2vec = KeyedVectors.load_word2vec_format(filename, binary=True)
    
    # Unknown vocabulary will be replaced as random vector
    oov = [np.random.rand(embedding_dim) * 2.0 - 1.0]
    oov = oov / np.linalg.norm(oov)
    
    # Create the resulting embedding_matrix
    for word, i in word_index.items():
        if word in word2vec.vocab:
            embedding_matrix[i] = word2vec.word_vec(word)
        else:
            embedding_matrix[i] = oov
    
    return embedding_matrix

# TP part

In [7]:
data = pd.read_csv("train.txt", sep="\t")
data

Unnamed: 0,id,turn1,turn2,turn3,label
0,0,Then dont ask me,YOURE A GUY NOT AS IF YOU WOULD UNDERSTAND,IM NOT A GUY FUCK OFF,angry
1,1,Mixed things such as??,the things you do.,Have you seen minions??,others
2,2,Today I'm very happy,and I'm happy for you ❤,I will be marry,happy
3,3,Woah bring me some,left it there oops,Brb,others
4,4,it is thooooo,I said soon master.,he is pressuring me,others
...,...,...,...,...,...
2750,2750,U are my book,book for what? ugliness? THANK YOU,U like ur self,others
2751,2751,I'll be crying,You just want to make ppl cry:P,ppl,others
2752,2752,Thanks for sending,hahaha you're welcome! 😤😤,Why are u not sending,others
2753,2753,Write it,Mr. F,U understand me?,others


In [33]:
# Replace empty columns with a balise to prevent mistakes

CLASSES = ["angry", "happy", "sad", "others"]

def transform_text(data):    
    data = map_column(data, ["turn1", "turn2", "turn3"], lambda text:  " <b> " + text)
    data = map_column(data, ["turn1", "turn2", "turn3"], norm_text)
    data = map_column(data, ["label"], lambda label: CLASSES.index(label))
    return data

In [None]:
data = transform_text(data)
data

In [9]:
tokenizer, word_index = tokenize(data["turn1"].append(data["turn2"]).append(data["turn3"]))
word_index

{'<b>': 1,
 'u': 2,
 'know': 3,
 'yes': 4,
 'ok': 5,
 'like': 6,
 'tell': 7,
 'good': 8,
 'love': 9,
 'want': 10,
 'r': 11,
 '☺️': 12,
 'one': 13,
 'say': 14,
 'go': 15,
 'think': 16,
 'ur': 17,
 'get': 18,
 'send': 19,
 'what?': 20,
 'see': 21,
 'yeah': 22,
 'something': 23,
 'oh': 24,
 'thank': 25,
 'nothing': 26,
 'talk': 27,
 'dont': 28,
 'would': 29,
 'give': 30,
 'please': 31,
 'really': 32,
 'time': 33,
 'let': 34,
 'sorry': 35,
 'okay': 36,
 'you?': 37,
 'funny': 38,
 'understand': 39,
 'well': 40,
 'ask': 41,
 'friend': 42,
 'thanks': 43,
 'haha': 44,
 'cool': 45,
 'much': 46,
 'going': 47,
 'mean': 48,
 'said': 49,
 'life': 50,
 'cannot': 51,
 ':p': 52,
 'come': 53,
 'sleep': 54,
 'name': 55,
 'got': 56,
 'also': 57,
 'need': 58,
 'im': 59,
 'number': 60,
 'make': 61,
 'nice': 62,
 'lol': 63,
 'bye': 64,
 'welcome': 65,
 'pic': 66,
 ':d': 67,
 'still': 68,
 'hate': 69,
 'day': 70,
 'bad': 71,
 'better': 72,
 'fuck': 73,
 'happy': 74,
 'best': 75,
 'night': 76,
 'always': 77,


In [10]:
[seq_turn_1, seq_turn_2, seq_turn_3] = pad_sequences([to_sequences(tokenizer, data[turn]) for turn in ["turn1", "turn2", "turn3"]])
seq_turn_1, seq_turn_2, seq_turn_3

(array([[   0,    0,    0, ...,    1,   28,   41],
        [   0,    0,    0, ..., 1739,   94, 1740],
        [   0,    0,    0, ...,    1,   98,   74],
        ...,
        [   0,    0,    0, ...,    1,   43,  615],
        [   0,    0,    0, ...,    0,    1,  510],
        [   0,    0,    0, ...,    0,    1,    4]], dtype=int32),
 array([[  0,   0,   0, ..., 254,  29,  39],
        [  0,   0,   0, ...,   0,   1,  94],
        [  0,   0,   0, ...,   0,   1,  74],
        ...,
        [  0,   0,   0, ...,   1, 189, 538],
        [  0,   0,   0, ...,   1, 691, 595],
        [  0,   0,   0, ...,  36,  30, 795]], dtype=int32),
 array([[   0,    0,    0, ...,   59,  254,   73],
        [   0,    0,    0, ...,    1,  202, 3913],
        [   0,    0,    0, ...,    0,    1,  377],
        ...,
        [   0,    0,    0, ...,    1,    2,  615],
        [   0,    0,    0, ...,    2,   39,   83],
        [   0,    0,    0, ...,    0,    1,  590]], dtype=int32))

In [11]:
embedding_matrix = load_embeddings_matrix("../GoogleNews-vectors-negative300.bin", word_index)

# Models creation

In [12]:
def evaluation(predictions, ground):

    """Given predicted labels and the respective ground truth labels, display some metrics
    Input: shape [# of samples, NUM_CLASSES]
        predictions : Model output. Every row has 4 decimal values, with the highest belonging to the predicted class
        ground : Ground truth labels, converted to one-hot encodings. A sample belonging to Happy class will be [0, 1, 0, 0]
    Output:
        accuracy : Average accuracy
        microPrecision : Precision calculated on a micro level. Ref - https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin/16001
        microRecall : Recall calculated on a micro level
        microF1 : Harmonic mean of microPrecision and microRecall. Higher value implies better classification  

    """

    # [0.1, 0.3 , 0.2, 0.1] -> [0, 1, 0, 0]
    discretePredictions = to_categorical(predictions.argmax(axis=1))
    truePositives = np.sum(discretePredictions*ground, axis=0)
    falsePositives = np.sum(np.clip(discretePredictions - ground, 0, 1), axis=0)
    falseNegatives = np.sum(np.clip(ground-discretePredictions, 0, 1), axis=0)

    print("True Positives per class : ", truePositives)
    print("False Positives per class : ", falsePositives)
    print("False Negatives per class : ", falseNegatives)    

    # ------------- Macro level calculation ---------------

    macroPrecision = 0
    macroRecall = 0

    # We ignore the "Others" class during the calculation of Precision, Recall and F1
    for c in range(1, NUM_CLASSES):
        precision = truePositives[c] / (truePositives[c] + falsePositives[c])
        macroPrecision += precision
        recall = truePositives[c] / (truePositives[c] + falseNegatives[c])
        macroRecall += recall
        f1 = ( 2 * recall * precision ) / (precision + recall) if (precision+recall) > 0 else 0
        print("Class %s : Precision : %.3f, Recall : %.3f, F1 : %.3f" % (["angry", "happy", "sad", "others"][c], precision, recall, f1))
    
    macroPrecision /= 3
    macroRecall /= 3
    macroF1 = (2 * macroRecall * macroPrecision ) / (macroPrecision + macroRecall) if (macroPrecision+macroRecall) > 0 else 0
    print("Ignoring the Others class, Macro Precision : %.4f, Macro Recall : %.4f, Macro F1 : %.4f" % (macroPrecision, macroRecall, macroF1))   

    # ------------- Micro level calculation ---------------

    truePositives = truePositives[1:].sum()
    falsePositives = falsePositives[1:].sum()
    falseNegatives = falseNegatives[1:].sum()        

    print("Ignoring the Others class, Micro TP : %d, FP : %d, FN : %d" % (truePositives, falsePositives, falseNegatives))

    microPrecision = truePositives / (truePositives + falsePositives)
    microRecall = truePositives / (truePositives + falseNegatives)    

    microF1 = ( 2 * microRecall * microPrecision ) / (microPrecision + microRecall) if (microPrecision+microRecall) > 0 else 0
    print("Ignoring the Others class, Micro Precision : %.4f, Micro Recall : %.4f, Micro F1 : %.4f" % (microPrecision, microRecall, microF1))   

In [13]:
from keras.callbacks import Callback

class TestCallback(Callback):
    def __init__(self, test_data):
        self.test_data = test_data
        
    def on_epoch_end(self, epoch, logs={}):
        x, y = self.test_data
        r = self.model.predict(x)
        # Evaluation return nothing
        evaluation(r, y)

In [14]:
from keras.utils.np_utils import to_categorical
from keras.models import Sequential, Model
from sklearn.model_selection import train_test_split
from keras.layers import LSTM, Dropout, Dense, Bidirectional,  Flatten, Input, GRU, Concatenate, Reshape
from keras.layers.embeddings import Embedding
from keras import optimizers

In [15]:
MAX_SEQUENCES_LENGTH = seq_turn_1.shape[1]
EMBEDDING_DIM = embedding_matrix.shape[1]
NUM_CLASSES = len(data["label"].unique()) 
DROPOUT = 0.2
LEARNING_RATE = 0.001

In [16]:
def create_model_1():
    turn1 = Input(shape=(MAX_SEQUENCES_LENGTH,), dtype='int32', name='main_input1')
    turn2 = Input(shape=(MAX_SEQUENCES_LENGTH,), dtype='int32', name='main_input2')
    turn3 = Input(shape=(MAX_SEQUENCES_LENGTH,), dtype='int32', name='main_input3')
    embedding_layer = Embedding(
        embedding_matrix.shape[0],
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCES_LENGTH,
        trainable=False
    )
    emb1 = embedding_layer(turn1)
    emb2 = embedding_layer(turn2)
    emb3 = embedding_layer(turn3)
    lstm = Bidirectional(LSTM(EMBEDDING_DIM, dropout=DROPOUT))
    lstm1 = lstm(emb1)
    lstm2 = lstm(emb2)
    lstm3 = lstm(emb3)
    inp = Concatenate(axis=-1)([lstm1, lstm2, lstm3])
    out = Dense(NUM_CLASSES, activation='softmax')(inp)
    adam = optimizers.adam(lr=LEARNING_RATE)
    model1 = Model([turn1, turn2, turn3], out)
    model1.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])
    model1.summary()

    return model1

model1 = create_model_1()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input1 (InputLayer)        (None, 21)           0                                            
__________________________________________________________________________________________________
main_input2 (InputLayer)        (None, 21)           0                                            
__________________________________________________________________________________________________
main_input3 (InputLayer)        (None, 21)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 21, 300)      1392600     main_input1[0][0]                
                                                                 main_input2[0][0]          

In [17]:
def create_model_2():
    turn1 = Input(shape=(MAX_SEQUENCES_LENGTH,), dtype='int32', name='main_input1')
    turn2 = Input(shape=(MAX_SEQUENCES_LENGTH,), dtype='int32', name='main_input2')
    turn3 = Input(shape=(MAX_SEQUENCES_LENGTH,), dtype='int32', name='main_input3')
    embedding_layer = Embedding(
        embedding_matrix.shape[0],
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCES_LENGTH,
        trainable=False
    )
    emb1 = embedding_layer(turn1)
    emb2 = embedding_layer(turn2)
    emb3 = embedding_layer(turn3)
    lstm = Bidirectional(LSTM(EMBEDDING_DIM, dropout=DROPOUT))
    lstm1 = lstm(emb1)
    lstm2 = lstm(emb2)
    lstm3 = lstm(emb3)
    inp = Concatenate(axis=-1)([lstm1, lstm2, lstm3])
    inp = Reshape((3, 2 * EMBEDDING_DIM))(inp)
    lstm_up = LSTM(EMBEDDING_DIM, dropout=DROPOUT)
    out = lstm_up(inp)
    out = Dense(NUM_CLASSES, activation='softmax')(out)
    adam = optimizers.adam(lr=LEARNING_RATE)
    model2 = Model([turn1, turn2, turn3], out)
    model2.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])
    model2.summary()

    return model1

model2 = create_model_2()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input1 (InputLayer)        (None, 21)           0                                            
__________________________________________________________________________________________________
main_input2 (InputLayer)        (None, 21)           0                                            
__________________________________________________________________________________________________
main_input3 (InputLayer)        (None, 21)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 21, 300)      1392600     main_input1[0][0]                
                                                                 main_input2[0][0]          

# Training

In [18]:
# One hotting all the label
Y = to_categorical(data["label"], NUM_CLASSES)
Y

array([[1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]], dtype=float32)

In [19]:
# Splitting as test, train dataset
test_ratio = 0.20
split_index = int(len(Y) * (1 - test_ratio))

Y_train = Y[:split_index]
Y_val= Y[split_index:]

X_train = [seq_turn_1[:split_index], seq_turn_2[:split_index], seq_turn_3[:split_index]]
X_val = [seq_turn_1[split_index:], seq_turn_2[split_index:], seq_turn_3[split_index:]]

Y_train, Y_val, X_train, X_val

(array([[1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        ...,
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.]], dtype=float32), array([[0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        ...,
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.]], dtype=float32), [array([[   0,    0,    0, ...,    1,   28,   41],
         [   0,    0,    0, ..., 1739,   94, 1740],
         [   0,    0,    0, ...,    1,   98,   74],
         ...,
         [   0,    0,    0, ...,    3, 1355,  271],
         [   0,    0,    0, ...,    0,    1, 2359],
         [   0,    0,    0, ...,    1,   17,  376]], dtype=int32),
  array([[   0,    0,    0, ...,  254,   29,   39],
         [   0,    0,    0, ...,    0,    1,   94],
         [   0,    0,    0, ...,    0,    1,   74],
         ...,
         [   0,    0,    0, ...,    0,    1,  519],
         [   0,    0,    0, ..., 1045,  552,   18],
         [   0,

In [20]:
history = model1.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    epochs=6,
    batch_size=50,
    callbacks=[TestCallback((X_val, Y_val))]
)

Train on 2204 samples, validate on 551 samples
Epoch 1/6
True Positives per class :  [  0.   0.   0. 470.]
False Positives per class :  [ 0.  0.  0. 81.]
False Negatives per class :  [24. 22. 35.  0.]
Class happy : Precision : nan, Recall : 0.000, F1 : 0.000
Class sad : Precision : nan, Recall : 0.000, F1 : 0.000
Class others : Precision : 0.853, Recall : 1.000, F1 : 0.921
Ignoring the Others class, Macro Precision : nan, Macro Recall : 0.3333, Macro F1 : 0.0000
Ignoring the Others class, Micro TP : 470, FP : 81, FN : 57
Ignoring the Others class, Micro Precision : 0.8530, Micro Recall : 0.8918, Micro F1 : 0.8720
Epoch 2/6




True Positives per class :  [  1.   0.   0. 470.]
False Positives per class :  [ 0.  0.  0. 80.]
False Negatives per class :  [23. 22. 35.  0.]
Class happy : Precision : nan, Recall : 0.000, F1 : 0.000
Class sad : Precision : nan, Recall : 0.000, F1 : 0.000
Class others : Precision : 0.855, Recall : 1.000, F1 : 0.922
Ignoring the Others class, Macro Precision : nan, Macro Recall : 0.3333, Macro F1 : 0.0000
Ignoring the Others class, Micro TP : 470, FP : 80, FN : 57
Ignoring the Others class, Micro Precision : 0.8545, Micro Recall : 0.8918, Micro F1 : 0.8728
Epoch 3/6
True Positives per class :  [ 15.   4.   7. 463.]
False Positives per class :  [ 9.  0.  0. 53.]
False Negatives per class :  [ 9. 18. 28.  7.]
Class happy : Precision : 1.000, Recall : 0.182, F1 : 0.308
Class sad : Precision : 1.000, Recall : 0.200, F1 : 0.333
Class others : Precision : 0.897, Recall : 0.985, F1 : 0.939
Ignoring the Others class, Macro Precision : 0.9658, Macro Recall : 0.4556, Macro F1 : 0.6192
Ignoring 

In [21]:
history = model2.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    epochs=6,
    batch_size=50,
    callbacks=[TestCallback((X_val, Y_val))]
)

Train on 2204 samples, validate on 551 samples
Epoch 1/6
True Positives per class :  [ 16.   7.  15. 454.]
False Positives per class :  [ 7.  8.  2. 42.]
False Negatives per class :  [ 8. 15. 20. 16.]
Class happy : Precision : 0.467, Recall : 0.318, F1 : 0.378
Class sad : Precision : 0.882, Recall : 0.429, F1 : 0.577
Class others : Precision : 0.915, Recall : 0.966, F1 : 0.940
Ignoring the Others class, Macro Precision : 0.7548, Macro Recall : 0.5709, Macro F1 : 0.6501
Ignoring the Others class, Micro TP : 476, FP : 52, FN : 51
Ignoring the Others class, Micro Precision : 0.9015, Micro Recall : 0.9032, Micro F1 : 0.9024
Epoch 2/6
True Positives per class :  [ 14.   2.   8. 468.]
False Positives per class :  [ 2.  0.  0. 57.]
False Negatives per class :  [10. 20. 27.  2.]
Class happy : Precision : 1.000, Recall : 0.091, F1 : 0.167
Class sad : Precision : 1.000, Recall : 0.229, F1 : 0.372
Class others : Precision : 0.891, Recall : 0.996, F1 : 0.941
Ignoring the Others class, Macro Precis

# Testing

In [34]:
def to_index_classes(Y_pred):
    return Y_pred.argmax(axis=1)

def index_to_classes(Y_pred_index):
    return np.vectorize(lambda index: CLASSES[index])(Y_pred_index)

In [36]:
data_test = pd.read_csv("train.txt", sep="\t")

# Transform all the test data
data_test = transform_text(data_test)
[seq_test_turn_1, seq_test_turn_2, seq_test_turn_3] = pad_sequences([to_sequences(tokenizer, data_test[turn]) for turn in ["turn1", "turn2", "turn3"]])
X_test = [seq_test_turn_1, seq_test_turn_2, seq_test_turn_3]

# Make the prediction
Y_pred_test_1, Y_pred_test_2 = model1.predict(X_test), model2.predict(X_test)

# Get back the predicted classes
Y_pred_test_1 = index_to_classes(to_index_classes(Y_pred_test_1))
Y_pred_test_2 = index_to_classes(to_index_classes(Y_pred_test_2))

Y_pred_test_1, Y_pred_test_2

(array(['angry', 'others', 'others', ..., 'others', 'others', 'others'],
       dtype='<U6'),
 array(['angry', 'others', 'others', ..., 'others', 'others', 'others'],
       dtype='<U6'))