In [1]:
import numpy as np
import tensorflow as tf
from keras import backend as K

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model, Model
from keras.layers import concatenate, GlobalAveragePooling1D, GlobalMaxPooling1D, Layer, Dense, Embedding, LSTM, GRU, Dropout, SpatialDropout1D, Input, Average, Bidirectional
from keras.callbacks import Callback
from keras.callbacks import ModelCheckpoint
from keras import initializers, regularizers, constraints, optimizers, layers

from keras.models import load_model
import json, argparse, os
import re
import io
import sys

Using TensorFlow backend.


In [2]:
# Don't hog GPU
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
keras.backend.set_session(sess)

In [3]:
# Path to training and testing data file. This data can be downloaded from a link, details of which will be provided.
trainDataPath = "./train.txt"
testDataPath = "./dev.txt"
# Output file that will be generated. This file can be directly submitted.
solutionPath = "./test.txt"
# Path to directory where GloVe file is saved.
gloveDir = ""

In [4]:
NUM_CLASSES = 4                 # Number of classes - Happy, Sad, Angry, Others
MAX_NB_WORDS = None                # To set the upper limit on the number of tokens extracted using keras.preprocessing.text.Tokenizer 
MAX_SEQUENCE_LENGTH = 25         # All sentences having lesser number of words than this will be padded
EMBEDDING_DIM = 100               # The dimension of the word embeddings

In [5]:
label2emotion = {0:"others", 1:"happy", 2: "sad", 3:"angry"}
emotion2label = {"others":0, "happy":1, "sad":2, "angry":3}

In [6]:
import re
def cleanText(text):
    text = re.sub(r"\bu\b", "you", text)
    text = re.sub(r"\bm\b", "am", text)
    text = re.sub(r"\bn\b", "and", text)
    text = re.sub(r"\bluv\b", "love", text)
    text = re.sub(r"\bans\b", "answer", text)
    text = re.sub(r"\bwt\b", "what", text)
    text = re.sub(r"\br\b", "are", text)
    text = re.sub(r"\bur\b", "your", text)
    text = re.sub(r"\bnthng\b", "nothing", text)
    text = re.sub(r"\btxt\b", "text", text)
    text = re.sub(r"\by\b", "why", text)
    text = re.sub(r"\bconvo\b", "conversation", text)
    text = re.sub(r"\bdont\b", "do not", text)
    text = re.sub(r"\bpl[s]+\b", "please", text)
    text = re.sub(r"\bpl[z]+\b", "please", text)
    text = re.sub(r"\bim\b", "i am", text)
    text = re.sub(r"\bwlcm\b", "welcome", text)
    text = re.sub(r"\bi m\b", "i am", text)
    text = re.sub(r"\baren't\b", "are not", text)
    text = re.sub(r"\bb[e]?coz\b", "because", text)
    text = re.sub(r"\bdnt\b", "did not", text)
    text = re.sub(r"\bknw\b", "know", text)
    text = re.sub(r"\bsry\b", "sorry", text)
    text = re.sub(r"\bchating\b", "chatting", text)
    text = re.sub(r"\bfrnds\b", "friends", text)
    text = re.sub(r"\bsrry\b", "sorry", text)
    text = re.sub(r"\burself\b", "yourself", text)
    text = re.sub(r"&amp;", "and", text)
    text = re.sub(r"&apos;", "'", text)
    text = re.sub(r"\btal\b", "talk", text)
    text = re.sub(r"\bsec\b", "second", text)
    text = re.sub(r"\bmin\b", "minute", text)
    text = re.sub(r"\bfr\b", "for", text)
    text = re.sub(r"\bwrk\b", "work", text)
    text = re.sub(r"\bfrm\b", "from", text)
    text = re.sub(r"\bwr are\b", "where are", text)
    text = re.sub(r"\bwrkng\b", "working", text)
    text = re.sub(r"\bmyslf\b", "myself", text)
    text = re.sub(r"\bbtr\b", "better", text)
    text = re.sub(r"\bdil\b", "heart", text)
    text = re.sub(r"\by[a]+r\b", "yaar", text)
    text = re.sub(r"\bdon't\b", "do not", text)
    text = re.sub(r"\bdon’t\b", "do not", text)
    text = re.sub(r"\bi'm\b", "i am", text)
    text = re.sub(r"\bit's\b", "it is", text)
    text = re.sub(r"\byou're\b", "you are", text)
    text = re.sub(r"\byou’re\b", "you are", text)
    text = re.sub(r"\bthat's\b", "that is", text)
    text = re.sub(r"\bcan't\b", "cannot", text)
    text = re.sub(r"\bcan’t\b", "cannot", text)
    text = re.sub(r"\bwhat's\b", "what is", text)
    text = re.sub(r"\bwhat’s\b", "what is", text)
    text = re.sub(r"\bdidn't\b", "did not", text)
    text = re.sub(r"\bi'll\b", "i will", text)
    text = re.sub(r"\blet's\b", "let us", text)
    text = re.sub(r"\bi've\b", "i have", text)
    text = re.sub(r"\bwon't\b", "will not", text)
    text = re.sub(r"\bdoesn't\b", "does not", text)
    text = re.sub(r"\bit'll\b", "it will", text)
    text = re.sub(r"\bofcourse\b", "of course", text)
    text = re.sub(r"\bbcz\b", "because", text)

    text = re.sub(r"\bwe'll\b", "we will", text)
    text = re.sub(r"\bwhen's\b", "when is", text)
    text = re.sub(r"\bwe've\b", "we have", text)
    text = re.sub(r"\bhe's\b", "he is", text)
    text = re.sub(r"\bshe's\b", "she is", text)
    text = re.sub(r"\bfrnd\b", "friend", text)
    text = re.sub(r"\bi’m\b", "i am", text)
    text = re.sub(r"\bth[a]?nx\b", "thanks", text)
    text = re.sub(r"\bthnks\b", "thanks", text)
    text = re.sub(r"\bye[s]+\b", "yes", text)
    text = re.sub(r"\bi’am\b", "i am", text)
    text = re.sub(r"\bisn't\b", "is not", text)
    text = re.sub(r"\bhaven't\b", "have not", text)
    text = re.sub(r"\bhow's\b", "how is", text)
    text = re.sub(r"\bhow're\b", "how are", text)
    text = re.sub(r"\bhowz\b", "how is", text)
    text = re.sub(r"\bwasn't\b", "was not", text)
    text = re.sub(r"\bthere's\b", "there is", text)
    text = re.sub(r"\bwe're\b", "we are", text)
    text = re.sub(r"\byou'll\b", "you will", text)
    text = re.sub(r"\bo[k]+\b", "okay", text)
    text = re.sub(r"\bcouldn't\b", "could not", text)
    text = re.sub(r"\bthey're\b", "they are", text)
    text = re.sub(r"\bit’s\b", "it is", text)
    text = re.sub(r"\bthat’s\b", "that is", text)
    text = re.sub(r"\bain't\b", "is not", text)
    text = re.sub(r"\bwho's\b", "who is", text)
    text = re.sub(r"\byou've\b", "you have", text)
    text = re.sub(r"\bwhere's\b", "where is", text)
    text = re.sub(r"\bshouldn't\b", "should not", text)
    text = re.sub(r"\bwouldn't\b", "would not", text)
    text = re.sub(r"\b'you\b", "you", text)
    text = re.sub(r"\bit'd\b", "it would", text)
    text = re.sub(r"\bidk\b", "i do not know", text)
    text = re.sub(r"\bha[ha]+\b", "haha", text)
    text = re.sub(r"\bhe[he]+\b", "haha", text)
    text = re.sub(r"\brofl\b", "haha", text)
    text = re.sub(r"\blmao\b", "haha", text)
    text = re.sub(r"\bweren't\b", "were not", text)
    text = re.sub(r"\bone's\b", "one is", text)
    text = re.sub(r"\bwhay\b", "what", text)
    text = re.sub(r"\bsomthing\b", "something", text)
    
    text = re.sub(r"😂+", "😂", text)
    text = re.sub(r"😞+", "😞", text)
    text = re.sub(r"😁+", "😁", text)
    text = re.sub(r"😭+", "😭", text)
    text = re.sub(r"😤+", "😤", text)
    text = re.sub(r"👍+", "👍", text)
    text = re.sub(r"🙂+", "🙂", text)
    text = re.sub(r"😨+", "😨", text)
    text = re.sub(r"😠+", "😠", text)
    text = re.sub(r"🤔+", "🤔", text)
    text = re.sub(r"😅+", "😅,", text)
    text = re.sub(r"😏+", "😏", text)
    text = re.sub(r"😛+", "😛", text)
    text = re.sub(r"😆+", "😆", text)
    text = re.sub(r"😢+", "😢", text)
    text = re.sub(r"😜+", "😜", text)
    text = re.sub(r"😃+", "😃", text)
    text = re.sub(r"😌+", "😌", text)
    text = re.sub(r"😒+", "😒", text)
    text = re.sub(r"😩+", "😩", text)
    text = re.sub(r"😀+", "😀", text)
    text = re.sub(r"😘+", "😘", text)
    text = re.sub(r"😫+", "😫", text)
    text = re.sub(r"😄+", "😄", text)
    text = re.sub(r"😡+", "😡", text)
    text = re.sub(r"😍+", "😍", text)
    text = re.sub(r"❤️+", "❤️", text)
    text = re.sub(r"😸+", "😸", text)
    text = re.sub(r"😊+", "😊", text)
    
    text = re.sub(r"\bhe[y]+\b", "hey", text)
    text = re.sub(r"\bshutup\b", "shut up", text)
    text = re.sub(r"\bwhatsup\b", "what is up", text)
    text = re.sub(r"\bintrested\b", "interested", text)
    text = re.sub(r"\btbh\b", "to be honest", text)
    text = re.sub(r"\btmrw\b", "tomorrow", text)
    text = re.sub(r"\byu[p]+\b", "yup", text)
    text = re.sub(r"\bdumbass\b", "dumb ass", text)
    text = re.sub(r"\byo[u]+\b", "you", text)
    text = re.sub(r"\bttyl\b", "talk to you later", text)
    text = re.sub(r"\bthts\b", "that is", text)
    text = re.sub(r"\bikr\b", "i know right ?", text)
    text = re.sub(r"\bthanku\b", "thank you", text)
    text = re.sub(r"\b'you\b", "you", text)
    text = re.sub(r"\bwhts\b", "what is", text)
    text = re.sub(r"\bye[a]+[h]*\b", "yeah", text)
    text = re.sub(r"\byou'are\b", "you are", text)
    text = re.sub(r"\bypu\b", "you", text)
    text = re.sub(r"\byo[u]+\b", "you", text)
    text = re.sub(r"\bx‑d\b", "😆", text)
    text = re.sub(r"\bryt\b", "right", text)
    text = re.sub(r"\banytym\b", "anytime", text)
    text = re.sub(r"\bbitch(.*)\b", "bitch", text)
    text = re.sub(r"\blyk\b", "like", text)
    text = re.sub(r"\bna[h]+\b", "nah", text)
    text = re.sub(r"\bnopes\b", "nope", text)
    text = re.sub(r"\bto[o]+\b", "too", text)
    text = re.sub(r"\b'i\b", "i", text)
    text = re.sub(r"\b'you\b", "you", text)
    text = re.sub(r"\bnt[n]?g\b", "nothing", text)
    text = re.sub(r"\bi\b", "I", text)
    text = re.sub(r"\bohkay\b", "oh okay", text)
    text = re.sub(r"\b[m]+[e]+\b", "me", text)
    text = re.sub(r"\b[o]+[k]+[a]+[y]+\b", "okay", text)
    text = re.sub(r"\b[o]+[h]+\b", "oh", text)
    text = re.sub(r"\b[b]+[y]+[e]+\b", "bye", text)
    text = re.sub(r"\by[a]+\b", "ya", text)
    text = re.sub(r"\b[w]+[h]+[y]+\b", "why", text)
    text = re.sub(r"\bbday\b", "birthday", text)
    text = re.sub(r"\b[w]+[o]+[w]+\b", "wow", text)
    
    return text

In [7]:
def preprocessData(dataFilePath, mode):
    """Load data from a file, process and return indices, conversations and labels in separate lists
    Input:
        dataFilePath : Path to train/test file to be processed
        mode : "train" mode returns labels. "test" mode doesn't return labels.
    Output:
        indices : Unique conversation ID list
        conversations : List of 3 turn conversations, processed and each turn separated by the <eos> tag
        raw_conversations : All conversations together
        labels : [Only available in "train" mode] List of labels
    """
    indices = []
    conversations = []
    raw_conversations = []
    labels = []
    with io.open(dataFilePath, encoding="utf8") as finput:
        finput.readline()
        for line in finput:
            # Convert multiple instances of . ? ! , to single instance
            # okay...sure -> okay . sure
            # okay???sure -> okay ? sure
            # Add whitespace around such punctuation
            # okay!sure -> okay ! sure
            raw_conv = line[:].strip()
            repeatedChars = ['.', '?', '!', ',']
            for c in repeatedChars:
                lineSplit = line.split(c)
                while True:
                    try:
                        lineSplit.remove('')
                    except:
                        break
                cSpace = ' ' + c + ' '    
                line = cSpace.join(lineSplit)
            
            line = line.strip().split('\t')
            if mode == "train":
                # Train data contains id, 3 turns and label
                label = emotion2label[line[4]]
                labels.append(label)
            
            conv = ' <eos> '.join(line[1:4])
            
            # Remove any duplicate spaces
            duplicateSpacePattern = re.compile(r'\ +')
            conv = re.sub(duplicateSpacePattern, ' ', conv)
            
            indices.append(int(line[0]))
            conversations.append(cleanText(conv.lower()))
            raw_conversations.append(raw_conv)
    
    if mode == "train":
        return indices, conversations, raw_conversations, labels
    else:
        return indices, conversations, raw_conversations

In [8]:
def writeNormalisedData(dataFilePath, texts):
    """Write normalised data to a file
    Input:
        dataFilePath : Path to original train/test file that has been processed
        texts : List containing the normalised 3 turn conversations, separated by the <eos> tag.
    """
    normalisedDataFilePath = dataFilePath.replace(".txt", "_normalised.txt")
    with io.open(normalisedDataFilePath, 'w', encoding='utf8') as fout:
        with io.open(dataFilePath, encoding='utf8') as fin:
            fin.readline()
            for lineNum, line in enumerate(fin):
                line = line.strip().split('\t')
                normalisedLine = texts[lineNum].strip().split('<eos>')
                fout.write(line[0] + '\t')
                # Write the original turn, followed by the normalised version of the same turn
                fout.write(line[1] + '\t' + normalisedLine[0] + '\t')
                fout.write(line[2] + '\t' + normalisedLine[1] + '\t')
                fout.write(line[3] + '\t' + normalisedLine[2] + '\t')
                try:
                    # If label information available (train time)
                    fout.write(line[4] + '\n')    
                except:
                    # If label information not available (test time)
                    fout.write('\n')

In [9]:
def getEmbeddingMatrix(wordIndex):
    """Populate an embedding matrix using a word-index. If the word "happy" has an index 19,
       the 19th row in the embedding matrix should contain the embedding vector for the word "happy".
    Input:
        wordIndex : A dictionary of (word : index) pairs, extracted using a tokeniser
    Output:
        embeddingMatrix : A matrix where every row has 100 dimensional GloVe embedding
    """
    embeddingsIndex = {}
    # Load the embedding vectors from ther GloVe file
    with io.open(os.path.join(gloveDir, 'glove.6B.%dd.txt' % EMBEDDING_DIM), encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            embeddingVector = np.asarray(values[1:], dtype='float32')
            embeddingsIndex[word] = embeddingVector
    
    print('Found %s word vectors.' % len(embeddingsIndex))
    
    oov = []
    # Minimum word index of any word is 1. 
    embeddingMatrix = np.zeros((len(wordIndex) + 1, EMBEDDING_DIM))
    count, total= 0, 0
    for word, i in wordIndex.items():
        embeddingVector = embeddingsIndex.get(word)
        total += 1
        if embeddingVector is not None:
            # words not found in embedding index will be all-zeros.
            embeddingMatrix[i] = embeddingVector
            count += 1
        else:
            oov.append(word)
    
    print("Found embedding for", str((100 * count) / total), "% embeddings")
    return embeddingMatrix, oov

In [10]:
print("Processing training data...")
trainIndices, trainTexts, rawtrainTexts, labels = preprocessData(trainDataPath, mode="train")
labels = to_categorical(np.asarray(labels))
# Write normalised text to file to check if normalisation works. Disabled now. Uncomment following line to enable   
# writeNormalisedData(trainDataPath, trainTexts)
print("Processing test data...")
testIndices, testTexts, rawtestTexts = preprocessData(testDataPath, mode="test")
# writeNormalisedData(testDataPath, testTexts)

print("Extracting tokens...")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(trainTexts)
trainSequences = tokenizer.texts_to_sequences(trainTexts)
testSequences = tokenizer.texts_to_sequences(testTexts)

wordIndex = tokenizer.word_index
print("Found %s unique tokens." % len(wordIndex))

Processing training data...
Processing test data...
Extracting tokens...
Found 16269 unique tokens.


In [11]:
def split_into_three(texts, tknzr):
    middle, left, right = [], [], []
    for text in texts:
        l, m, r = text.split(' <eos> ')
        middle.append(m)
        left.append(l)
        right.append(r)
    tokenize = lambda x: tknzr.texts_to_sequences(x)
    return (tokenize(left), tokenize(middle), tokenize(right))

In [12]:
train_l, train_m, train_r = split_into_three(trainTexts, tokenizer)
test_l, test_m, test_r = split_into_three(testTexts, tokenizer)

In [13]:
print("Populating embedding matrix...")
embeddingMatrix, oov = getEmbeddingMatrix(wordIndex)
oov = [(x, tokenizer.word_counts.get(x, 0)) for x in oov]
oov.sort(key=lambda tup: tup[1], reverse=True)

Populating embedding matrix...
Found 400000 word vectors.
Found embedding for 66.46382691007437 % embeddings


In [14]:
print(oov[:100])

[('😂', 1206), ('😭', 302), ('😞', 213), ('😢', 200), ('😁', 199), ('😅', 147), ('😍', 132), ('😀', 118), ('😃', 111), ('😡', 111), ('😆', 110), ('😄', 102), ('😊', 92), ("i'd", 82), ('😒', 77), ('😌', 71), ('😠', 70), ('😤', 68), ('🙂', 63), ('😸', 60), ('😺', 57), ('😫', 55), ('‑', 54), ('😜', 53), ('😹', 53), ('😩', 53), ('👍', 52), ('😘', 51), ('ohk', 48), ('😉', 46), ('😽', 43), ('😻', 42), ('💔', 35), ('😏', 35), ('hurted', 33), ('😑', 32), ('😝', 32), ('🙁', 31), ('emoji', 30), ('♥', 29), ('😿', 28), ('😾', 28), ('😬', 27), ('❤', 26), ('😋', 25), ('‑d', 23), ("'i", 23), ('😔', 23), ('🙄', 23), ('☺', 22), ('🙀', 22), ("you'd", 21), ('😦', 20), ('☹', 20), ('😎', 20), ('👎', 20), ("'‑", 20), ('😧', 19), ('❤️', 18), ('you😂', 17), ('selfie', 17), ('chatbot', 17), ("'you", 16), ('😛', 16), ('i̇', 15), ('yrr', 15), ('😶', 14), ('8‑d', 14), ('😐', 13), ('😁😂', 13), ('🤔', 12), ('👌', 12), ('☺️', 12), ('me😭', 12), ('・', 12), ('♡', 11), ('😇', 11), ('yayy', 11), ("someone's", 11), ('byy', 10), ("friend's", 10), ("valentine's", 10), ('○', 9

In [15]:
train_l = pad_sequences(train_l, maxlen=MAX_SEQUENCE_LENGTH)
train_m = pad_sequences(train_m, maxlen=MAX_SEQUENCE_LENGTH)
train_r = pad_sequences(train_r, maxlen=MAX_SEQUENCE_LENGTH)

print("Creating solution file...")
test_l = pad_sequences(test_l, maxlen=MAX_SEQUENCE_LENGTH)
test_m = pad_sequences(test_m, maxlen=MAX_SEQUENCE_LENGTH)
test_r = pad_sequences(test_r, maxlen=MAX_SEQUENCE_LENGTH)

print("Shape of training data tensor: ", train_l.shape)
print("Shape of label tensor: ", labels.shape)

Creating solution file...
Shape of training data tensor:  (30160, 25)
Shape of label tensor:  (30160, 4)


In [16]:
# https://www.kaggle.com/hireme/fun-api-keras-f1-metric-cyclical-learning-rate/code

class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or 
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.
    For more detail, please see paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode paramater is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn == None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr != None:
            self.base_lr = new_base_lr
        if new_max_lr != None:
            self.max_lr = new_max_lr
        if new_step_size != None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs={}):
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        
        K.set_value(self.model.optimizer.lr, self.clr())

In [17]:
def getMetrics(predictions, ground):
    discretePredictions = to_categorical(predictions.argmax(axis=1))
    
    truePositives = np.sum(discretePredictions*ground, axis=0)
    falsePositives = np.sum(np.clip(discretePredictions - ground, 0, 1), axis=0)
    falseNegatives = np.sum(np.clip(ground-discretePredictions, 0, 1), axis=0)
    
    print("True Positives per class : ", truePositives)
    print("False Positives per class : ", falsePositives)
    print("False Negatives per class : ", falseNegatives)
    
    # ------------- Macro level calculation ---------------
    macroPrecision = 0
    macroRecall = 0
    # We ignore the "Others" class during the calculation of Precision, Recall and F1
    for c in range(1, NUM_CLASSES):
        precision = truePositives[c] / (truePositives[c] + falsePositives[c])
        macroPrecision += precision
        recall = truePositives[c] / (truePositives[c] + falseNegatives[c])
        macroRecall += recall
        f1 = ( 2 * recall * precision ) / (precision + recall) if (precision+recall) > 0 else 0
        print("Class %s : Precision : %.3f, Recall : %.3f, F1 : %.3f" % (label2emotion[c], precision, recall, f1))
    
    macroPrecision /= 3
    macroRecall /= 3
    macroF1 = (2 * macroRecall * macroPrecision ) / (macroPrecision + macroRecall) if (macroPrecision+macroRecall) > 0 else 0
    print("Ignoring the Others class, Macro Precision : %.4f, Macro Recall : %.4f, Macro F1 : %.4f" % (macroPrecision, macroRecall, macroF1))   
    
    # ------------- Micro level calculation ---------------
    truePositives = truePositives[1:].sum()
    falsePositives = falsePositives[1:].sum()
    falseNegatives = falseNegatives[1:].sum()    
    
    print("Ignoring the Others class, Micro TP : %d, FP : %d, FN : %d" % (truePositives, falsePositives, falseNegatives))
    
    microPrecision = truePositives / (truePositives + falsePositives)
    microRecall = truePositives / (truePositives + falseNegatives)
    
    microF1 = ( 2 * microRecall * microPrecision ) / (microPrecision + microRecall) if (microPrecision+microRecall) > 0 else 0
    # -----------------------------------------------------
    
    predictions = predictions.argmax(axis=1)
    ground = ground.argmax(axis=1)
    accuracy = np.mean(predictions==ground)
    
    print("Accuracy : %.4f, Micro Precision : %.4f, Micro Recall : %.4f, Micro F1 : %.4f" % (accuracy, microPrecision, microRecall, microF1))
    return accuracy, microPrecision, microRecall, microF1

In [18]:
# https://www.kaggle.com/suicaokhoailang/lstm-attention-baseline-0-652-lb

class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

In [19]:
def microF1Loss(ground, predictions):
    discretePredictions = K.one_hot(K.argmax(predictions, axis=1), NUM_CLASSES)
    
    truePositives = K.sum(discretePredictions*ground, axis=0)
    falsePositives = K.sum(K.clip(discretePredictions - ground, 0, 1), axis=0)
    falseNegatives = K.sum(K.clip(ground-discretePredictions, 0, 1), axis=0)

    macroPrecision = 0
    macroRecall = 0
    for c in range(1, NUM_CLASSES):
        precision = truePositives[c] / (truePositives[c] + falsePositives[c])
        macroPrecision += precision
        recall = truePositives[c] / (truePositives[c] + falseNegatives[c])
        macroRecall += recall
        f1 = ( 2 * recall * precision ) / (precision + recall + K.epsilon())
        
    macroPrecision /= 3
    macroRecall /= 3
    macroF1 = (2 * macroRecall * macroPrecision ) / (macroPrecision + macroRecall + K.epsilon())
    
    truePositives = K.sum(truePositives[1:])
    falsePositives = K.sum(falsePositives[1:])
    falseNegatives = K.sum(falseNegatives[1:])
    
    microPrecision = truePositives / (truePositives + falsePositives)
    microRecall = truePositives / (truePositives + falseNegatives)
    
    microF1 = ( 2 * microRecall * microPrecision ) / (microPrecision + microRecall + + K.epsilon())
    
    return microF1

In [20]:
def buildTriModel(embeddingMatrix, learnEmbs=False):
    if learnEmbs:
        embeddingLayer = Embedding(embeddingMatrix.shape[0],
                                    EMBEDDING_DIM,
                                    input_length=MAX_SEQUENCE_LENGTH)
    else:
        embeddingLayer = Embedding(embeddingMatrix.shape[0],
                                    EMBEDDING_DIM,
                                    weights=[embeddingMatrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    trainable=False)
    
    inp = Input(shape=(MAX_SEQUENCE_LENGTH,))
    x = embeddingLayer(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(LSTM(32, return_sequences=True, dropout=0.3, recurrent_dropout=0.2))(x)
    y = Bidirectional(GRU(32, return_sequences=True, dropout=0.3, recurrent_dropout=0.2))(x)
    
    atten_1 = Attention(MAX_SEQUENCE_LENGTH)(x) # skip connect
    atten_2 = Attention(MAX_SEQUENCE_LENGTH)(y)
    avg_pool = GlobalAveragePooling1D()(y)
    max_pool = GlobalMaxPooling1D()(y)
    
    conc = concatenate([atten_1, atten_2, avg_pool, max_pool])
    conc = Dropout(0.3)(conc)
    conc = Dense(32, activation="relu")(conc)
    conc = Dropout(0.3)(conc)
    
    model = Model(inputs=inp, outputs=conc)
    
    input_l, input_m, input_r = Input((MAX_SEQUENCE_LENGTH,)), Input((MAX_SEQUENCE_LENGTH,)), Input((MAX_SEQUENCE_LENGTH,))
    embed_l, embed_m, embed_r = model(input_l), model(input_m), model(input_r)
    output = Average()([embed_l, embed_m, embed_r])
    output = Dense(NUM_CLASSES, activation='softmax')(output)
    tri_model = Model(inputs=[input_l, input_m, input_r], outputs=output)
    tri_model.compile(loss='categorical_crossentropy',
                  optimizer=optimizers.Adam(1e-2),
                  metrics=[microF1Loss])
    return tri_model

In [21]:
import time

clr = CyclicLR(base_lr=0.002, max_lr=0.004,
               step_size=300., mode='exp_range',
               gamma=0.99994)

print("Training model")
model = buildTriModel(embeddingMatrix, learnEmbs=False)

model.fit([train_l, train_m, train_r], labels, epochs=15, batch_size=256, validation_split=0.1, callbacks= [clr,])
model.save("ended_model.h5")

Training model
Train on 27144 samples, validate on 3016 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [111]:
def construct_meta_features(data):
    # Compute emoji-based features
    emojis = ['😂', '😭', '😞', '😢', '😁', '😅', '😍',
              '😀', '😃', '😡', '😄', '😆', '😒', '😊',
              '😌', '😠', '😤', '🙂', '😺', '😫', '😩',
              '😹', '😜', '👍', '😘', '😸', '😉', '😽',
              '😻', '😏', '💔', '😝', '😑', '🙁', '😾',
              '😿', '😬', '❤', '😋', '🙄', '😔', '🙀',
              '😎', '👎', '😦', '😧', '❤️', '😛', '😶',
              '😐', '👌', '🤔','😇', '😨', '😯', '😳',
              '☹️', '💋', '👋', '?', '!', '.']
    happy_words = ['happy', 'lol', 'haha', 'enjoy', 'cool', 'glad',
                   'smile', 'nice', 'funny', 'wow']
    angry_words = ['angry', 'fuck', 'hell', 'shut up', 'bad', 'rude',
                  'block', 'stupid']
    sad_words   = ['sad', 'sorry', 'miss', 'alone', 'lonely', 'cry'
                   'disappointed']
    indicator_words = emojis + happy_words + angry_words + sad_words
    
    word_features = np.zeros((len(data), len(indicator_words)))
    for i, text in enumerate(data):
        for j, word in enumerate(indicator_words):
            word_features[i][j] += text.lower().count(word)
    
    # Compute CAPS-based features
    capital_features = np.zeros((len(data), 1))
    for i, text in enumerate(data):
        for word in text.split(' '):
            if word.isupper():
                capital_features[i] += 1
    
    # Combine metadata-based features
    metadata_features = np.concatenate((word_features, capital_features), axis=1)
    return metadata_features

In [112]:
metadata_features = construct_meta_features(rawtrainTexts)
print(metadata_features.shape)

(30160, 87)


In [120]:
# Train classifier for metadata-based classification
from sklearn import tree
from sklearn.metrics import f1_score
meta_clf = tree.DecisionTreeClassifier()
meta_clf.fit(metadata_features, np.argmax(labels, axis=1))
getMetrics(meta_clf.predict_proba(metadata_features), labels)

True Positives per class :  [14823.  4238.  5460.  5503.]
False Positives per class :  [ 6. 68. 41. 21.]
False Negatives per class :  [125.   5.   3.   3.]
Class happy : Precision : 0.984, Recall : 0.999, F1 : 0.991
Class sad : Precision : 0.993, Recall : 0.999, F1 : 0.996
Class angry : Precision : 0.996, Recall : 0.999, F1 : 0.998
Ignoring the Others class, Macro Precision : 0.9910, Macro Recall : 0.9992, Macro F1 : 0.9951
Ignoring the Others class, Micro TP : 15201, FP : 130, FN : 11
Accuracy : 0.9955, Micro Precision : 0.9915, Micro Recall : 0.9993, Micro F1 : 0.9954


(0.9954907161803713, 0.99152046, 0.9992769, 0.9953835902288041)

In [123]:
# Dump tree structure vislualization
import graphviz
dot_data = tree.export_graphviz(meta_clf, out_file=None)
graph = graphviz.Source(dot_data) 
graph.render("semeval_dtree") 

'semeval.pdf'

In [50]:
# Test train accuracy with and without emoji-augmentation
preds = model.predict([train_l, train_m, train_r], batch_size=1024)

In [82]:
preds_augment = meta_clf.predict_proba(metadata_features)

In [84]:
# Visible performance boost using prior based on emojis
getMetrics(preds, labels)
print()
getMetrics(0.5 * preds + 0.5 * preds_augment, labels)

True Positives per class :  [13456.  3071.  4267.  4585.]
False Positives per class :  [2614.  831.  732.  604.]
False Negatives per class :  [1492. 1172. 1196.  921.]
Class happy : Precision : 0.787, Recall : 0.724, F1 : 0.754
Class sad : Precision : 0.854, Recall : 0.781, F1 : 0.816
Class angry : Precision : 0.884, Recall : 0.833, F1 : 0.857
Ignoring the Others class, Macro Precision : 0.8414, Macro Recall : 0.7792, Macro F1 : 0.8091
Ignoring the Others class, Micro TP : 11923, FP : 2167, FN : 3289
Accuracy : 0.8415, Micro Precision : 0.8462, Micro Recall : 0.7838, Micro F1 : 0.8138

True Positives per class :  [14825.  4200.  5454.  5501.]
False Positives per class :  [44. 67. 42. 27.]
False Negatives per class :  [123.  43.   9.   5.]
Class happy : Precision : 0.984, Recall : 0.990, F1 : 0.987
Class sad : Precision : 0.992, Recall : 0.998, F1 : 0.995
Class angry : Precision : 0.995, Recall : 0.999, F1 : 0.997
Ignoring the Others class, Macro Precision : 0.9906, Macro Recall : 0.995

(0.9940318302387268, 0.99110585, 0.99625295, 0.9936727389718414)

In [114]:
predictions = model.predict([test_l, test_m, test_r], batch_size=1024)

In [86]:
test_meta = construct_meta_features(rawtestTexts)
predictions_meta = meta_clf.predict_proba(test_meta)

In [87]:
predictions  = 0.5 * predictions + 0.5 * predictions_meta

In [115]:
predictions = predictions.argmax(axis=1)

with io.open(solutionPath, "w", encoding="utf8") as fout:
    fout.write('\t'.join(["id", "turn1", "turn2", "turn3", "label"]) + '\n')        
    with io.open(testDataPath, encoding="utf8") as fin:
        fin.readline()
        for lineNum, line in enumerate(fin):
            fout.write('\t'.join(line.strip().split('\t')[:4]) + '\t')
            fout.write(label2emotion[predictions[lineNum]] + '\n')
print("Completed")

Completed
