In [603]:
import numpy as np
import tensorflow as tf
from keras import backend as K

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model, Model
from keras.layers import concatenate, Activation, GlobalAveragePooling1D, GlobalMaxPooling1D, Layer, Dense, Embedding, LSTM, GRU, Dropout, SpatialDropout1D, Input, Average, Bidirectional, BatchNormalization
from keras.callbacks import Callback
from keras import initializers, regularizers, constraints, optimizers, layers

import sys, importlib
# Reload `utils` only when it is already loaded (this picks up edits to the
# module without restarting the kernel). On a fresh kernel there is no
# 'utils' entry in sys.modules yet, so indexing it would raise KeyError;
# in that case the plain import below is all that is needed.
if 'utils' in sys.modules:
    importlib.reload(sys.modules['utils'])
import utils

from keras.models import load_model
import json, argparse, os
import re
import io
import sys

Loading utils module


In [16]:
# Don't hog GPU: enable `allow_growth` on the TensorFlow session config and
# register the resulting session as the one Keras will use.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
keras.backend.set_session(sess)

In [17]:
# Paths to the training, test (the dev file) and evaluation data files.
# This data can be downloaded from a link, details of which will be provided.
trainDataPath = "./train.txt"
testDataPath = "./dev.txt"
evalDataPath = "./evaluate.txt"
# Output file that will be generated. This file can be directly submitted.
solutionPath = "./test.txt"

# Two-way mapping between integer class ids and emotion names; the two
# dictionaries are inverses of each other.
label2emotion = {0:"others", 1:"happy", 2: "sad", 3:"angry"}
emotion2label = {"others":0, "happy":1, "sad":2, "angry":3}

In [349]:
NUM_CLASSES = 4                 # Number of classes - Happy, Sad, Angry, Others
MAX_NB_WORDS = 15000                # Upper limit on the number of tokens kept by keras.preprocessing.text.Tokenizer
MAX_SEQUENCE_LENGTH = 35         # Fixed sequence length: shorter sentences are padded, longer ones truncated (see the pad_sequences calls)
EMBEDDING_DIM = 300               # The dimension of the word embeddings

In [406]:
def preprocessData(dataFilePath, mode):
    """Load a tab-separated conversation file and return cleaned conversations.

    The first line of the file is treated as a header and skipped. Every other
    line is expected to hold an id, three conversation turns and, in "train"
    mode, a label name.

    Input:
        dataFilePath : Path to train/test file to be processed
        mode : "train" mode returns labels. "test" mode doesn't return labels.
    Output:
        indices : Unique conversation ID list (ints)
        conversations : List of 3 turn conversations, processed and each turn
            separated by the <eos> tag
        raw_conversations : The three turns of each conversation joined with a
            single space, without any cleaning
        labels : [Only available in "train" mode] List of integer labels
    Raises:
        KeyError : in "train" mode, if a label name is not in emotion2label
        IndexError : in "train" mode, if a line has fewer than five fields
        ValueError : if the id field is not an integer
    """
    # `regex` must provide cleanText(); presumably the project's own module
    # rather than the PyPI package of the same name -- verify against the repo.
    # Reload it when already loaded so edits are picked up, but fall back to a
    # plain import on a fresh kernel instead of raising KeyError.
    cached_regex = sys.modules.get('regex')
    if cached_regex is None:
        regex = importlib.import_module('regex')
    else:
        regex = importlib.reload(cached_regex)

    # Loop invariants, built once instead of once per input line.
    repeatedChars = ['.', '?', '!', ',']
    duplicateSpacePattern = re.compile(r'\ +')
    stray_punct = ['‑', '-', "^", ":",
                   ";", "#", ")", "(", "*", "=", "\\", "/"]

    indices = []
    conversations = []
    raw_conversations = []
    labels = []

    with io.open(dataFilePath, encoding="utf8") as finput:
        finput.readline()  # skip the header row
        for line in finput:
            raw_conv = ' '.join(line.strip().split('\t')[1:4])

            # Convert multiple instances of . ? ! , to single instance
            # okay...sure -> okay . sure
            # okay???sure -> okay ? sure
            # Add whitespace around such punctuation
            # okay!sure -> okay ! sure
            for c in repeatedChars:
                # Dropping the empty pieces is what collapses runs of `c`.
                lineSplit = [piece for piece in line.split(c) if piece != '']
                line = (' ' + c + ' ').join(lineSplit)

            fields = line.strip().split('\t')
            if mode == "train":
                # Train data contains id, 3 turns and label
                labels.append(emotion2label[fields[4]])

            conv = ' <eos> '.join(fields[1:4])

            # Remove any duplicate spaces
            conv = duplicateSpacePattern.sub(' ', conv)

            indices.append(int(fields[0]))

            # Remove stray punctuation
            for punct in stray_punct:
                conv = conv.replace(punct, "")

            processedData = regex.cleanText(conv.lower()).lower()
            processedData = processedData.replace("'", "")
            # Remove numbers
            processedData = ''.join(ch for ch in processedData if not ch.isdigit())

            conversations.append(processedData)
            raw_conversations.append(raw_conv)

    if mode == "train":
        return indices, conversations, raw_conversations, labels
    return indices, conversations, raw_conversations

In [410]:
# Parse the three data files. The "test" split is the dev file, which carries
# labels, so it is parsed in "train" mode; the evaluation file has no labels.
print("Processing training data...")
trainIndices, trainTexts, rawtrainTexts, labels = preprocessData(trainDataPath, mode="train")
labels = to_categorical(np.asarray(labels), NUM_CLASSES)
print("Processing test data...")
testIndices, testTexts, rawtestTexts, testLabels = preprocessData(testDataPath, mode="train")
testLabels = to_categorical(np.asarray(testLabels), NUM_CLASSES)
print("Processing evaluation data...")
evalIndices, evalTexts, rawevalTexts = preprocessData(evalDataPath, mode="test")

print("Extracting tokens...")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
# The vocabulary is fitted on train + test + eval texts (the train-only
# variant is kept below, disabled), so tokens that only occur in the held-out
# splits still receive an index.
# tokenizer.fit_on_texts(trainTexts)
tokenizer.fit_on_texts(trainTexts + testTexts + evalTexts)
trainSequences = tokenizer.texts_to_sequences(trainTexts)
testSequences = tokenizer.texts_to_sequences(testTexts)
evalSequences = tokenizer.texts_to_sequences(evalTexts)

# word -> integer index mapping learned by the tokenizer
wordIndex = tokenizer.word_index
print("Found %s unique tokens." % len(wordIndex))

Processing training data...
Loading utils module
Processing test data...
Loading utils module
Processing evaluation data...
Loading utils module
Extracting tokens...
Found 15752 unique tokens.


In [418]:
# For each split, report the fraction of conversations whose token count does
# not exceed MAX_SEQUENCE_LENGTH (i.e. that would survive padding untruncated).
for coverage_label, split_sequences in (
        ("Train-data Coverage (cutoff length):", trainSequences),
        ("Test-data Coverage (cutoff length):", testSequences),
        ("Eval-data Coverage (cutoff length):", evalSequences)):
    lens = [len(seq) for seq in split_sequences]
    within_cutoff = np.array(lens) <= MAX_SEQUENCE_LENGTH
    print(coverage_label, np.sum(within_cutoff) / len(split_sequences))

Train-data Coverage (cutoff length): 0.9904509283819629
Test-data Coverage (cutoff length): 0.992377495462795
Eval-data Coverage (cutoff length): 0.9920130695225994


In [419]:
# Share of all token occurrences accounted for by the MAX_NB_WORDS most
# frequent words.
sorted_wordcounts = sorted(tokenizer.word_counts.items(), key=lambda pair: pair[1], reverse=True)
counts_descending = [count for _, count in sorted_wordcounts]
total_words = sum(counts_descending)
words_covered = sum(counts_descending[:MAX_NB_WORDS])
print("Coverage with %d words:" % MAX_NB_WORDS, words_covered/total_words)

Coverage with 15000 words: 0.9987336272485694


In [366]:
# Dump the vocabulary, most frequent word first, one word per line.
sorted_wordcounts = sorted(tokenizer.word_counts.items(), key=lambda x: x[1], reverse=True)
# The vocabulary contains non-ASCII tokens, so the encoding is stated
# explicitly (utf8, like the other files this notebook reads and writes)
# instead of relying on the platform default.
with open('./need_sswe.txt', 'w', encoding='utf8') as f:
    f.writelines(word + '\n' for word, _ in sorted_wordcounts)

In [586]:
# Three sequence sets per split from utils.split_into_three -- presumably one
# per conversation turn (split on <eos>); confirm against utils.
train_l, train_m, train_r = utils.split_into_three(trainTexts, tokenizer)
test_l, test_m, test_r = utils.split_into_three(testTexts, tokenizer)
eval_l, eval_m, eval_r = utils.split_into_three(evalTexts, tokenizer)

# Whole-conversation index sequences (all three turns together).
train_all = tokenizer.texts_to_sequences(trainTexts)
test_all = tokenizer.texts_to_sequences(testTexts)
eval_all = tokenizer.texts_to_sequences(evalTexts)

In [422]:
# Sanity check: print a few cleaned test conversations next to their raw text.
for i in range(5, 10):
    print(testTexts[i])
    print(rawtestTexts[i])
    print()

wont you ask my age  <eos> hey at least i age well  <eos> can you tell me how can we get closer  
Wont u ask my age?? hey at least I age well! Can u tell me how can we get closer??

i said yes <eos> what if i told you iam not  <eos> go to hell
I said yes What if I told you I'm not? Go to hell

where i ll check <eos> why tomorrow  <eos> no i want now
Where I ll check why tomorrow? No I want now

shall we meet <eos> you say you are leaving soon  anywhere you wanna go before you head  <eos>  
Shall we meet you say- you're leaving soon...anywhere you wanna go before you head? ?

let us change the subject <eos> i just did it  l  <eos> you are broken
Let's change the subject I just did it .l. You're broken



In [423]:
print("Populating embedding matrix...")
embeddingMatrix, oov = utils.getEmbeddingMatrix(wordIndex, EMBEDDING_DIM)
# Pair every word in `oov` with its corpus frequency (0 if the tokenizer has
# no count for it) and list the most frequent ones first.
oov = [(x, tokenizer.word_counts.get(x, 0)) for x in oov]
oov.sort(key=lambda tup: tup[1], reverse=True)

Populating embedding matrix...
Found 400000 word vectors.
Found embedding for 76.16175723717623 % embeddings


In [497]:
# Re-execute utils so that recent edits to the module are picked up.
importlib.reload(sys.modules['utils'])
import utils

print("Populating SSWE embedding matrix...")
# Second embedding matrix for the same word index, plus the list that
# utils.get_sswe_embeddings returns alongside it (stored as oov_sswe).
embeddingMatrix_sswe, oov_sswe = utils.get_sswe_embeddings(wordIndex)

Loading utils module
Populating SSWE embedding matrix...
Found unreadable 144 word vectors
Found 15936 word vectors.
Found embedding for 98.99060436769933 % embeddings


In [498]:
# First 100 entries of oov_sswe, followed by how many there are in total.
print(oov_sswe[:100])
print(len(oov_sswe))

['≠', 'afterdays', '…it', 'more…lasagna', 'family…', 'görlitz', 'rşch', 'ilm', 'whére', 'ω', 'girlfriend\u200b', 'iş', 'wai', '⊄', 'referrel', 'iscoming', 'me…', '¡so', '¿did', 'aadu', 'face…', 'n', 'xx', 'nan', 'libspill', 'serious…just', 'mbile', 'bred', 'ounces', 'r', '••', 'yyi', 'song¿', 'elhamdülillah', '\u200d\u200d', 'okay\u200b', 'what¡', 'well…', 'one—', 'i̇', 'àapka', 'kha\u200d\u200d', 'ı', 'friendsqa', '–', 'vho', 'not\u200b', 'kķkkkk', 'vmyhorbs', 'driver', 'it¿', 'ĺove', '—', 'cross\u200b', 'still…', '¶', 'party¿', 'sad…', 'will…', 'cbillion', '͡°', 'alented', 'snt', 'itӳ', 'false', 'któw', 'morrow', 'i…', '͜ʖ', '×', 'wuakd', '฿', 'yaaaaà', 'what¿¿', 'ixm', 'is…', '\u200d\u200d\u200d\u200d', 'netflixs\u200b', 'doğng', 'mnts', 'seelfi', 'sone', 'cliché', 'same…', 'or…', 'beşiktaş', 'vsit', 'gudni', 'yr', 'want…', 'north\u200b', 'rknow', '¿', 'su', 'kiu', 'y', '∑oo', 'çuz', 'fi̇nd', 'gr']
159


In [431]:
# The 100 most frequent (word, count) entries of oov, then the total count.
print(oov[:100])
print(len(oov))

[('emoji', 39), ('i̇', 18), ('emojis', 12), ('friendzoned', 11), ('friendzone', 10), ('tajmahal', 9), ('hmmzoning', 9), ('oho', 9), ('donapost', 8), ('lolz', 8), ('rted', 7), ('bangaram', 7), ('everyones', 7), ('ehh', 7), ('gng', 7), ('everythings', 6), ('useropenreflink', 6), ('nonveg', 6), ('whatsaap', 6), ('iand', 5), ('thatll', 5), ('ftfy', 5), ('dafaq', 5), ('facepalm', 5), ('whatt', 5), ('oww', 5), ('ddlj', 5), ('selfies', 5), ('freecharge', 5), ('chatbots', 5), ('\u200d', 5), ('emojisong', 4), ('himher', 4), ('begar', 4), ('happies', 4), ('thnk', 4), ('wiil', 4), ('lololol', 4), ('playcreepypedia', 4), ('flipkart', 4), ('temme', 4), ('bdw', 4), ('youve', 4), ('brokeup', 4), ('oclock', 4), ('lolzzz', 4), ('whyre', 4), ('habbit', 4), ('iaposm', 4), ('whts', 4), ('xams', 4), ('medam', 4), ('arjit', 4), ('bhindi', 3), ('lve', 3), ('dobt', 3), ('theyll', 3), ('padmavat', 3), ('hihihi', 3), ('scuffletown', 3), ('katachi', 3), ('tysm', 3), ('baaghi', 3), ('okz', 3), ('brozoned', 3), ('

In [432]:
# Count how often each word listed in `oov` occurs in the test conversations.
ooh = {}

relevant_oov = [x[0] for x in oov]
# Set copy for membership tests: scanning the list for every token of every
# conversation is needlessly quadratic.
relevant_oov_lookup = set(relevant_oov)
# Not used in this cell; kept in case other cells rely on it.
actual_words = [k for (k,v) in tokenizer.word_index.items()]
for datum in testTexts:
    for word in datum.split(' '):
        if word in relevant_oov_lookup:
            ooh[word] = ooh.get(word, 0) + 1

# (word, occurrences) pairs, most frequent first.
ooh_items = sorted(ooh.items(), key=lambda tup: tup[1], reverse=True)

In [433]:
# All counted (word, occurrences) pairs and how many distinct words there are.
print(ooh_items)
print(len(ooh_items))

[('donapost', 6), ('iaposm', 3), ('friendzoned', 3), ('emojis', 3), ('nightcore', 3), ('frnz', 2), ('wowe', 2), ('whataposs', 2), ('ahow', 2), ('bachhe', 2), ('tajmahal', 2), ('playcreepypedia', 2), ('emojisong', 2), ('oclock', 2), ('lolz', 2), ('emoji', 2), ('kewlwst', 2), ('baaghi', 2), ('lamakaan', 2), ('swiggy', 1), ('dunp', 1), ('howaposs', 1), ('hmmzoning', 1), ('oopsy', 1), ('wokayyy', 1), ('vakola', 1), ('lolzzz', 1), ('tindering', 1), ('dhyat', 1), ('besties', 1), ('ocourse', 1), ('whoaposs', 1), ('tnq', 1), ('sharukh', 1), ('yaantey', 1), ('juxt', 1), ('thks', 1), ('milovie', 1), ('xwhat', 1), ('muhhhh', 1), ('higherres', 1), ('megalol', 1), ('areee', 1), ('pshhhh', 1), ('aahh', 1), ('powerliftingsmh', 1), ('tranwendy', 1), ('ivve', 1), ('one—', 1), ('temme', 1), ('saxy', 1), ('fucken', 1), ('googlin', 1), ('dped', 1), ('aaare', 1), ('introducted', 1), ('sircasm', 1), ('favoriet', 1), ('scense', 1), ('gafe', 1), ('gulthfriend', 1), ('whatsaap', 1), ('fidha', 1), ('berozgaari'

In [434]:
# Count how often each word listed in `oov` occurs in the evaluation
# conversations.
ooh = {}

relevant_oov = [x[0] for x in oov]
# Set copy for membership tests: scanning the list for every token of every
# conversation is needlessly quadratic.
relevant_oov_lookup = set(relevant_oov)
# Not used in this cell; kept in case other cells rely on it.
actual_words = [k for (k,v) in tokenizer.word_index.items()]
for datum in evalTexts:
    for word in datum.split(' '):
        if word in relevant_oov_lookup:
            ooh[word] = ooh.get(word, 0) + 1

# (word, occurrences) pairs, most frequent first.
ooh_items = sorted(ooh.items(), key=lambda tup: tup[1], reverse=True)

In [435]:
# The 100 most frequent counted words and how many distinct words there are.
print(ooh_items[:100])
print(len(ooh_items))

[('emoji', 5), ('useropenreflink', 3), ('allout', 2), ('everythings', 2), ('friendzoned', 2), ('donapost', 2), ('okiee', 2), ('briyani', 2), ('chatbots', 2), ('halfgirlfriend', 2), ('besties', 2), ('shole', 2), ('iand', 2), ('lolz', 2), ('ftfy', 2), ('panipuri', 2), ('lohiya', 2), ('wachapp', 2), ('choclati', 2), ('i̇', 2), ('hihihi', 2), ('sachhi', 2), ('nigt', 1), ('duuuuude', 1), ('sherikkum', 1), ('whatsa', 1), ('oclock', 1), ('cutti', 1), ('chillll', 1), ('sherlyns', 1), ('friendzone', 1), ('bzy', 1), ('bainchod', 1), ('tgnxmh', 1), ('twilightt', 1), ('shype', 1), ('ihddit', 1), ('shold', 1), ('mattee', 1), ('bwaha', 1), ('kilpauk', 1), ('langwege', 1), ('dhonis', 1), ('ruwho', 1), ('matvh', 1), ('delect', 1), ('directot', 1), ('voicesg', 1), ('blowjob', 1), ('thnk', 1), ('thenn', 1), ('spiling', 1), ('trappp', 1), ('messagee', 1), ('besharam', 1), ('namber', 1), ('llallu', 1), ('aswu', 1), ('toask', 1), ('conformd', 1), ('adhuri', 1), ('heartfull', 1), ('hutiya', 1), ('huhhhh', 1

In [587]:
def _pad_to_cutoff(sequences):
    """Pad (with trailing zeros) or truncate (from the end) to MAX_SEQUENCE_LENGTH."""
    return pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

# Every split gets the same treatment for its three per-turn inputs and its
# whole-conversation input.
train_l, train_m, train_r, train_all = [
    _pad_to_cutoff(seqs) for seqs in (train_l, train_m, train_r, train_all)]

test_l, test_m, test_r, test_all = [
    _pad_to_cutoff(seqs) for seqs in (test_l, test_m, test_r, test_all)]

eval_l, eval_m, eval_r, eval_all = [
    _pad_to_cutoff(seqs) for seqs in (eval_l, eval_m, eval_r, eval_all)]

print("Shape of training data tensor: ", train_l.shape)
print("Shape of training label tensor: ", labels.shape)

Shape of training data tensor:  (30160, 35)
Shape of training label tensor:  (30160, 4)


In [608]:
from keras.layers import average
from keras.regularizers import l1, l2, l1_l2
from keras.layers import RepeatVector
import keras.backend as K

importlib.reload(sys.modules['utils'])
import utils

def buildSingleModel(embeddingMatrix, hidDim=128, dropout=0.2, first_type='lstm', second_type='gru', multitask=False):
    """Build and compile a two-layer bidirectional recurrent classifier.

    A frozen embedding layer feeds two stacked bidirectional recurrent layers.
    The output sequence of each recurrent layer is summarised by attention,
    average pooling and max pooling; the two summaries pass through dense
    layers and are merged into a softmax over NUM_CLASSES.

    Input:
        embeddingMatrix : 2-D array used as fixed (non-trainable) embedding weights
        hidDim : Units per recurrent direction and per hidden dense layer
        dropout : Rate of the Dropout layers in the dense head. The spatial and
            recurrent dropout rates are fixed at 0.2 and do not follow this value.
        first_type : 'lstm' makes the first recurrent layer an LSTM; any other
            value makes it a GRU
        second_type : 'gru' makes the second recurrent layer a GRU; any other
            value makes it an LSTM
        multitask : When True, also build a sequence decoder on top of the
            shared encoding
    Output:
        multitask=False : the compiled classification model
        multitask=True : tuple (auto_model, main_model); both share the
            encoder layers and each is compiled separately
    """
    def recurrent_layer(use_lstm):
        # A new layer object per call: the stacked layers must not share weights.
        if use_lstm:
            return Bidirectional(LSTM(hidDim, return_sequences=True,
                                      dropout=0.2, recurrent_dropout=0.2,
                                      bias_regularizer=l1_l2(0.01, 0.01),
                                      recurrent_regularizer=l1_l2(0.01, 0.01),
                                     ))
        return Bidirectional(GRU(hidDim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))

    embeddingLayer = Embedding(embeddingMatrix.shape[0],
                               embeddingMatrix.shape[1],
                               weights=[embeddingMatrix],
                               input_length=MAX_SEQUENCE_LENGTH,
                               trainable=False)

    inp = Input(shape=(MAX_SEQUENCE_LENGTH,))
    x = embeddingLayer(inp)
    x = SpatialDropout1D(0.2)(x)
    # The two selectors deliberately default differently for unknown values:
    # the first layer falls back to GRU, the second to LSTM.
    x = recurrent_layer(first_type == 'lstm')(x)
    y = recurrent_layer(second_type != 'gru')(x)

    # Three fixed-size summaries of each recurrent output sequence.
    atten_1 = utils.Attention(MAX_SEQUENCE_LENGTH)(x)
    atten_2 = utils.Attention(MAX_SEQUENCE_LENGTH)(y)
    avg_pool_1 = GlobalAveragePooling1D()(x)
    max_pool_1 = GlobalMaxPooling1D()(x)
    avg_pool_2 = GlobalAveragePooling1D()(y)
    max_pool_2 = GlobalMaxPooling1D()(y)

    feature_comb_1 = concatenate([atten_1, avg_pool_1, max_pool_1])
    feature_comb_1 = Dropout(dropout)(feature_comb_1)
    conc_1 = Dense(hidDim, activation="relu")(feature_comb_1)

    feature_comb_2 = concatenate([atten_2, avg_pool_2, max_pool_2])
    feature_comb_2 = Dropout(dropout)(feature_comb_2)
    conc_2 = Dense(hidDim, activation="relu")(feature_comb_2)

    conc = concatenate([conc_1, conc_2])
    conc = Dropout(dropout)(conc)
    conc = Dense(hidDim, activation="relu")(conc)
    conc = Dropout(dropout)(conc)
    output = Dense(NUM_CLASSES, activation='softmax')(conc)

    if multitask:
        # Decoder branch: repeat the shared encoding once per time step and
        # decode it into a per-step softmax over the embedding vocabulary.
        text_encoding = RepeatVector(MAX_SEQUENCE_LENGTH)(conc)
        decoded = LSTM(hidDim, return_sequences=True)(text_encoding)
        decoded = LSTM(hidDim, return_sequences=True)(decoded)
        decoded_words = Dense(embeddingMatrix.shape[0], activation='softmax')(decoded)

        # The two models are compiled independently with their own losses
        # (the combined-loss helper that used to live here was never passed
        # to compile, so it has been dropped).
        # NOTE(review): auto_model uses MSE against a softmax output -- confirm
        # that this is the intended reconstruction loss.
        auto_model = Model(inputs=inp, outputs=decoded_words)
        main_model = Model(inputs=inp, outputs=output)
        auto_model.compile(loss='mse',
                  optimizer=optimizers.Adam(1e-3),
                  metrics=['accuracy'])
        main_model.compile(loss='categorical_crossentropy',
                  optimizer=optimizers.Adam(1e-3),
                  metrics=[utils.microF1Loss])
        return auto_model, main_model

    model = Model(inputs=inp, outputs=output)
    model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.Adam(1e-3),
              metrics=[utils.microF1Loss])
    return model

Loading utils module


In [571]:
from keras.layers import average
from keras.regularizers import l1, l2, l1_l2

def buildSingleDEEPModel(embeddingMatrix, hidDim=128, learnEmbs=False, dropout=0.2):
    """Build and compile a single-LSTM classifier.

    Input:
        embeddingMatrix : 2-D array; its shape sizes the embedding layer and,
            unless learnEmbs is set, its values are used as frozen weights
        hidDim : Units of the LSTM and of the hidden dense layer
        learnEmbs : When True the embedding layer is created without
            pre-trained weights and left trainable
        dropout : Rate of the two Dropout layers in the dense head (spatial
            and recurrent dropout are fixed at 0.1)
    Output:
        The compiled Keras model with a softmax over NUM_CLASSES.
    """
    # Pre-trained weights are only supplied (and frozen) when the embeddings
    # are not learned from scratch.
    embedding_options = {'input_length': MAX_SEQUENCE_LENGTH}
    if not learnEmbs:
        embedding_options['weights'] = [embeddingMatrix]
        embedding_options['trainable'] = False
    embeddingLayer = Embedding(embeddingMatrix.shape[0],
                               embeddingMatrix.shape[1],
                               **embedding_options)

    inp = Input(shape=(MAX_SEQUENCE_LENGTH,))
    embedded = embeddingLayer(inp)
    embedded = SpatialDropout1D(0.1)(embedded)

    # Single unidirectional LSTM returning only its final state. Stacked
    # bidirectional LSTMs and attention pooling were tried here earlier and
    # are currently disabled.
    encoder = LSTM(hidDim, return_sequences=False,
                   dropout=0.1, recurrent_dropout=0.1,
                   bias_regularizer=l1_l2(0.01, 0.01),
                   recurrent_regularizer=l1_l2(0.01, 0.01))
    encoded = encoder(embedded)

    hidden = Dropout(dropout)(encoded)
    hidden = Dense(hidDim, activation="relu")(hidden)
    hidden = Dropout(dropout)(hidden)
    output = Dense(NUM_CLASSES, activation='softmax')(hidden)

    model = Model(inputs=inp, outputs=output)
    # Adam at 1e-3; Adam(1e-4) and SGD(5e-3) were earlier alternatives.
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizers.Adam(1e-3),
                  metrics=[utils.microF1Loss])
    return model

In [674]:
# Inverse-frequency class weights, computed over the train and test labels
# together and then normalised so that the four weights sum to 1.
n_labelled = len(labels) + len(testLabels)
train_classes = np.argmax(labels, 1)
test_classes = np.argmax(testLabels, 1)

class_weight = {}
for i in range(4):
    class_count = np.sum(train_classes == i) + np.sum(test_classes == i)
    class_weight[i] = n_labelled / class_count

total = sum(class_weight.values())
class_weight = {cls: weight / total for cls, weight in class_weight.items()}

print(class_weight)

{0: 0.09015739790282193, 1: 0.35540724746822805, 2: 0.27889419830855045, 3: 0.27554115632039955}


In [675]:
# Number of rows in the one-hot training label matrix.
print(len(labels))

30160


In [684]:
from livelossplot.keras import PlotLossesCallback

# Learning-rate callback from utils, configured with a base rate of 1e-3 and
# a maximum of 5e-3 (scheduling details live in utils.CyclicLR).
clr = utils.CyclicLR(base_lr=0.001, max_lr=0.005,
               step_size=300., mode='exp_range',
               gamma=0.99994)
# LSTM -> GRU classifier on the SSWE embedding matrix.
model = buildSingleModel(embeddingMatrix_sswe, hidDim=32,
                         dropout=0.25, multitask=False,
                        first_type='lstm', second_type='gru')
# Trains on the whole-conversation sequences with the class weights computed
# earlier; the test split serves as validation data. The recorded output of
# this cell is a KeyboardInterrupt, i.e. the run was stopped by hand.
model.fit(train_all, labels,
          batch_size=2048, epochs=150,
          class_weight=class_weight,
          callbacks=[PlotLossesCallback(), clr],
          verbose=0,
          validation_data=(test_all, testLabels))

KeyboardInterrupt: 

In [688]:
from keras.wrappers.scikit_learn import KerasClassifier
from modAL.models import ActiveLearner

real_model = buildSingleModel(embeddingMatrix_sswe, hidDim=32,
                         dropout=0.25, multitask=False,
                        first_type='lstm', second_type='gru')

def get_my_model():
    """Return the one shared model instance.

    The scikit-learn wrapper calls this as its build function; returning the
    same object each time means every call gets the already-built model
    rather than a new one.
    """
    return real_model

classifier = KerasClassifier(get_my_model)

# Random seed set used for the initial fit.
n_initial = 10000
initial_idx = np.random.choice(range(len(train_all)), size=n_initial, replace=False)
X_initial = train_all[initial_idx]
y_initial = labels[initial_idx]

# The pool holds only samples the learner has not been given yet. Leaving the
# seed rows in it would let queries hand back data the model was already
# fitted on.
X_pool = np.delete(train_all, initial_idx, axis=0)
y_pool = np.delete(labels, initial_idx, axis=0)

# initialize ActiveLearner
learner = ActiveLearner(
    estimator=classifier,
    X_training=X_initial, y_training=y_initial,
    verbose=1,
    validation_data=(test_all, testLabels),
    batch_size=2048,
    epochs=50
)

# the active learning loop
n_queries = 20
for idx in range(n_queries):
    print('Query no. %d' % (idx + 1))
    query_idx, query_instance = learner.query(X_pool, n_instances=1000, verbose=0, batch_size=1024)
    # only_new=True: fit on the newly queried samples only.
    learner.teach(
        X=X_pool[query_idx], y=y_pool[query_idx], only_new=True,
        verbose=1,
        validation_data=(test_all, testLabels),
        batch_size=1024,
        epochs=7
    )
    # remove queried instance from pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx, axis=0)

Train on 10000 samples, validate on 2755 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Query no. 1
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7


Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Query no. 2
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Query no. 3
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Query no. 4
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Query no. 5
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Query no. 6
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Query no. 7
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Query no. 8
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7


Epoch 7/7
Query no. 9
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Query no. 10
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Query no. 11
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Query no. 12
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Query no. 13
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Query no. 14
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Query no. 15
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Query no. 16
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epo

Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Query no. 17
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Query no. 18
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Query no. 19
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Query no. 20
Train on 1000 samples, validate on 2755 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [None]:
def construct_meta_features(data):
    """Build hand-crafted count features for a list of raw conversation texts.

    For every text the result row holds, in order:
      * one column per indicator string (emojis / punctuation, then happy,
        angry, sad and "others" words) with the number of non-overlapping
        occurrences of that string in the lower-cased text;
      * the number of space-separated tokens that are upper case;
      * that number divided by (token count + 1);
      * the mean token length.

    Input:
        data : sequence of strings
    Output:
        2-D float array of shape (len(data), number of indicators + 3)
    """
    # Compute emoji-based features
    emojis = ['😂', '😭', '😞', '😢', '😁', '😅', '😍',
              '😀', '😃', '😡', '😄', '😆', '😒', '😊',
              '😌', '😠', '😤', '🙂', '😺', '😫', '😩',
              '😹', '😜', '👍', '😘', '😸', '😉', '😽',
              '😻', '😏', '💔', '😝', '😑', '🙁', '😾',
              '😿', '😬', '❤', '😋', '🙄', '😔', '🙀',
              '😎', '👎', '😦', '😧', '❤️', '😛', '😶',
              '😐', '👌', '🤔','😇', '😨', '😯', '😳',
              '☹️', '💋', '👋', '😪', '😥', '💕', '😱',
              '🙈', '😟', '🙏', '✌', '😖', '😣', '😮',
              '🤗', '😓', '😷', '☹', '💞', '🏻', '🙌',
              '💐', '🙊', '😰', '☺', '😴', '🖕', '♥', '😕',
              '😈', '💗', '♡', '👀', '👊', '‑c', ' 8‑d', ' ‑d',
              '👻', '：）', '.', '?', '!', ',', '-', '・', "'-'",
              '\U0001f923','・ω・', '\U000fe339', ' ‑c']
    happy_words = ['happy', 'lol', 'haha', 'enjoy', 'cool', 'glad',
                   'smile', 'nice', 'funny', 'wow', 'good', 'best',
                   'party', 'baby', 'sweet', 'joke', 'glad', 'perfect',
                   'fantastic', 'excite', 'cute', 'enjoy', 'omg']
    angry_words = ['angry', 'fuck', 'hell', 'shut up', 'bad', 'rude',
                  'block', 'stupid', 'piss', 'lame', "don't", 'mean',
                  'irritat', 'hate', 'ignore', 'get lost', 'reply',
                  'fool', 'regret', 'dumb', 'cheat', 'whore', 'disgust']
    sad_words   = ['sad', 'sorry', 'miss', 'alone', 'lonely', 'cry',
                   'disappointed', 'not', 'no', 'not happy', 'crazy',
                   'stress', 'depress', 'poor', 'care', 'health', 'break up',
                   'breaking up', 'upset', 'forgive', 'left me', 'dump']
    others_words = ['thank you', 'favorite', 'favourite']
    indicator_words = emojis + happy_words + angry_words + sad_words + others_words

    # Occurrence counts of every indicator in the lower-cased text.
    word_features = np.zeros((len(data), len(indicator_words)))
    for row, text in enumerate(data):
        lowered = text.lower()
        for col, indicator in enumerate(indicator_words):
            word_features[row][col] += lowered.count(indicator)

    # Compute CAPS-based features
    capital_features = np.zeros((len(data), 3))
    for row, text in enumerate(data):
        tokens = text.split(' ')
        n_upper = sum(1 for token in tokens if token.isupper())
        capital_features[row][0] = n_upper
        capital_features[row][1] = capital_features[row][0] / (len(tokens) + 1)
        capital_features[row][2] = sum(len(token) for token in tokens) / len(tokens)

    # Combine metadata-based features
    return np.concatenate((word_features, capital_features), axis=1)

In [506]:
# Hand-crafted features are computed from the raw (uncleaned) training text.
metadata_features = construct_meta_features(rawtrainTexts)
print(metadata_features.shape)

(30160, 180)


In [510]:
# Train classifier for metadata-based classification
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import f1_score
meta_clf = tree.DecisionTreeClassifier()
# The tree is fitted on integer class ids recovered from the one-hot labels.
meta_clf.fit(metadata_features, np.argmax(labels, axis=1))
# Metrics below are computed on the same features the tree was fitted on, so
# they describe training-set fit, not held-out performance.
utils.getMetrics(meta_clf.predict_proba(metadata_features), labels)

True Positives per class :  [14930.  4063.  5074.  5246.]
False Positives per class :  [787.  12.  44.   4.]
False Negatives per class :  [ 18. 180. 389. 260.]
Class happy : Precision : 0.997, Recall : 0.958, F1 : 0.977
Class sad : Precision : 0.991, Recall : 0.929, F1 : 0.959
Class angry : Precision : 0.999, Recall : 0.953, F1 : 0.975
Ignoring the Others class, Macro Precision : 0.9959, Macro Recall : 0.9464, Macro F1 : 0.9705
Ignoring the Others class, Micro TP : 14383, FP : 60, FN : 829
Accuracy : 0.9719, Micro Precision : 0.9958, Micro Recall : 0.9455, Micro F1 : 0.9700


(0.9719164456233422, 0.99584574, 0.94550353, 0.9700219082279653)

In [590]:
# Class-probability predictions for the test and evaluation splits.
# NOTE(review): `model` is whatever an earlier training cell left in the
# kernel -- confirm which run produced it before trusting these numbers.
predictions = model.predict(test_all, batch_size=1024)
eval_predictions = model.predict(eval_all, batch_size=1024)

In [512]:
# Metadata-classifier probabilities for the test split (from raw text).
test_meta = construct_meta_features(rawtestTexts)
predictions_meta = meta_clf.predict_proba(test_meta)

# Same for the evaluation split.
eval_meta = construct_meta_features(rawevalTexts)
eval_predictions_meta = meta_clf.predict_proba(eval_meta)

In [591]:
# Baseline metrics of the neural model on the test split.
utils.getMetrics(predictions, testLabels)
print()
predictions_sly = np.copy(predictions)
eval_predictions_sly = np.copy(eval_predictions)

# Any row whose "happy" probability (class 1) reaches this threshold is
# forced to a one-hot "happy" prediction.
HAPPY_OVERRIDE_THRESHOLD = 0.2
# The override is applied to BOTH copies: the evaluation copy is the one
# written to the submission file, and it used to be left unadjusted, so the
# rule tuned on the test split never reached the submitted labels.
for sly_predictions in (predictions_sly, eval_predictions_sly):
    happy_rows = sly_predictions[:, 1] >= HAPPY_OVERRIDE_THRESHOLD
    sly_predictions[happy_rows] = [0, 1, 0, 0]

# Weight of the neural model in the blend with the metadata classifier.
alpha = 0.8
utils.getMetrics(alpha * predictions_sly + (1-alpha) * predictions_meta, testLabels)

True Positives per class :  [2202.   75.   92.  124.]
False Positives per class :  [110.  41.  35.  76.]
False Negatives per class :  [136.  67.  33.  26.]
Class happy : Precision : 0.647, Recall : 0.528, F1 : 0.581
Class sad : Precision : 0.724, Recall : 0.736, F1 : 0.730
Class angry : Precision : 0.620, Recall : 0.827, F1 : 0.709
Ignoring the Others class, Macro Precision : 0.6637, Macro Recall : 0.6969, Macro F1 : 0.6799
Ignoring the Others class, Micro TP : 291, FP : 152, FN : 126
Accuracy : 0.9049, Micro Precision : 0.6569, Micro Recall : 0.6978, Micro F1 : 0.6767

True Positives per class :  [2138.  102.   94.  127.]
False Positives per class :  [ 76. 114.  35.  69.]
False Negatives per class :  [200.  40.  31.  23.]
Class happy : Precision : 0.472, Recall : 0.718, F1 : 0.570
Class sad : Precision : 0.729, Recall : 0.752, F1 : 0.740
Class angry : Precision : 0.648, Recall : 0.847, F1 : 0.734
Ignoring the Others class, Macro Precision : 0.6163, Macro Recall : 0.7723, Macro F1 : 0.

(0.8932849364791289, 0.5970425, 0.77458036, 0.6743215332041502)

In [552]:
# Final label per test conversation: argmax of the blend of the thresholded
# neural predictions and the metadata-classifier predictions (the plain
# neural argmax is kept below, disabled).
# predictions_towrite = (predictions).argmax(axis=1)
predictions_towrite = (alpha * predictions_sly + (1-alpha) * predictions_meta).argmax(axis=1)

# Copy id and the three turns from the input file and append the predicted
# emotion name; predictions are matched to input lines by position.
with io.open(solutionPath, "w", encoding="utf8") as fout:
    fout.write('\t'.join(["id", "turn1", "turn2", "turn3", "label"]) + '\n')        
    with io.open(testDataPath, encoding="utf8") as fin:
        fin.readline()  # skip the header row
        for lineNum, line in enumerate(fin):
            fout.write('\t'.join(line.strip().split('\t')[:4]) + '\t')
            fout.write(label2emotion[predictions_towrite[lineNum]] + '\n')
print("Completed")

Completed


In [592]:
# Final label per evaluation conversation: argmax of eval_predictions_sly
# alone; the variant blended with the metadata classifier is kept below,
# disabled.
# predictions_towrite = (predictions).argmax(axis=1)
# evals_towrite = (alpha * eval_predictions_sly + (1-alpha) * eval_predictions_meta).argmax(axis=1)
evals_towrite = eval_predictions_sly.argmax(axis=1)

# Writes to the same solutionPath as the test-split cell, so running this
# cell replaces that file. Predictions are matched to input lines by position.
with io.open(solutionPath, "w", encoding="utf8") as fout:
    fout.write('\t'.join(["id", "turn1", "turn2", "turn3", "label"]) + '\n')        
    with io.open(evalDataPath, encoding="utf8") as fin:
        fin.readline()  # skip the header row
        for lineNum, line in enumerate(fin):
            fout.write('\t'.join(line.strip().split('\t')[:4]) + '\t')
            fout.write(label2emotion[evals_towrite[lineNum]] + '\n')
print("Completed")

Completed
