In [7]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras import optimizers
from keras.models import load_model
import json, argparse, os
import re
import io
import sys

In [8]:
# ---------------------------------------------------------------------------
# Notebook configuration: file locations and model hyper-parameters.
# ---------------------------------------------------------------------------
# Training file passed to preprocessData(..., mode="train").
trainDataPath = "train.txt"
# Unlabelled file passed to preprocessData(..., mode="test").
testDataPath = "devwithoutlabels.txt"
# Output file that will be generated. This file can be directly submitted.
solutionPath = "test.txt"
# Path to directory where GloVe file is saved.
# NOTE(review): not referenced by the cells visible here (FastText is loaded from its own path) - confirm it is still needed.
gloveDir = "./"
NUM_FOLDS = 5                   # Value of K in K-fold Cross Validation
NUM_CLASSES = 4                 # Number of classes - Happy, Sad, Angry, Others
MAX_NB_WORDS = 20000                # To set the upper limit on the number of tokens extracted using keras.preprocessing.text.Tokenizer 
MAX_SEQUENCE_LENGTH = 100         # All sentences having lesser number of words than this will be padded
EMBEDDING_DIM = 300               # The dimension of the word embeddings
BATCH_SIZE = 200                  # The batch size to be chosen for training the model.
LSTM_DIM = 128                    # The dimension of the representations learnt by the LSTM model
DROPOUT = 0.2  # Fraction of the units to drop for the linear transformation of the inputs. Ref - https://keras.io/layers/recurrent/
LEARNING_RATE = 0.003 # Learning rate handed to the RMSprop optimizer in buildModel
NUM_EPOCHS = 75                  # Number of epochs to train a model for
'''

Change parameters here for the LSTM model used below

'''

# Two-way mapping between integer class ids and emotion names.
# Class 0 ("others") is the one getMetrics leaves out of its precision/recall/F1 averages.
label2emotion = {0:"others", 1:"happy", 2: "sad", 3:"angry"}
emotion2label = {"others":0, "happy":1, "sad":2, "angry":3}

In [9]:
def getMetrics(predictions, ground):
    """Given predicted labels and the respective ground truth labels, display some metrics.

    Per-class, macro-averaged and micro-averaged precision / recall / F1 are
    printed. The "others" class (label 0) is excluded from every average.
    A ratio whose denominator is zero (e.g. a class that is never predicted)
    is reported as 0.0 instead of NaN.

    Input: shape [# of samples, NUM_CLASSES]
        predictions : Model output. Every row has NUM_CLASSES decimal values, with the highest belonging to the predicted class
        ground : Ground truth labels, converted to one-hot encodings. A sample belonging to Happy class will be [0, 1, 0, 0]
    Output:
        accuracy : Fraction of samples whose predicted class equals the ground truth class (all classes included)
        microPrecision : Precision calculated on a micro level. Ref - https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin/16001
        microRecall : Recall calculated on a micro level
        microF1 : Harmonic mean of microPrecision and microRecall. Higher value implies better classification
    """
    def safeDivide(numerator, denominator):
        # Zero denominators occur when a class has no predictions / no samples.
        return numerator / denominator if denominator > 0 else 0.0

    predictedLabels = predictions.argmax(axis=1)
    groundLabels = ground.argmax(axis=1)

    # [0.1, 0.3 , 0.2, 0.1] -> [0, 1, 0, 0]
    # num_classes is passed explicitly: without it the one-hot width is derived
    # from the largest predicted label, which no longer matches `ground` when
    # the highest class is never predicted.
    discretePredictions = to_categorical(predictedLabels, num_classes=NUM_CLASSES)

    truePositives = np.sum(discretePredictions * ground, axis=0)
    falsePositives = np.sum(np.clip(discretePredictions - ground, 0, 1), axis=0)
    falseNegatives = np.sum(np.clip(ground - discretePredictions, 0, 1), axis=0)

    print("True Positives per class : ", truePositives)
    print("False Positives per class : ", falsePositives)
    print("False Negatives per class : ", falseNegatives)

    # ------------- Macro level calculation ---------------
    macroPrecision = 0
    macroRecall = 0
    # We ignore the "Others" class during the calculation of Precision, Recall and F1
    for c in range(1, NUM_CLASSES):
        precision = safeDivide(truePositives[c], truePositives[c] + falsePositives[c])
        macroPrecision += precision
        recall = safeDivide(truePositives[c], truePositives[c] + falseNegatives[c])
        macroRecall += recall
        f1 = safeDivide(2 * recall * precision, precision + recall)
        print("Class %s : Precision : %.3f, Recall : %.3f, F1 : %.3f" % (label2emotion[c], precision, recall, f1))

    # Average over the classes that were actually scored (everything but "others").
    scoredClasses = NUM_CLASSES - 1
    macroPrecision = safeDivide(macroPrecision, scoredClasses)
    macroRecall = safeDivide(macroRecall, scoredClasses)
    macroF1 = safeDivide(2 * macroRecall * macroPrecision, macroPrecision + macroRecall)
    print("Ignoring the Others class, Macro Precision : %.4f, Macro Recall : %.4f, Macro F1 : %.4f" % (macroPrecision, macroRecall, macroF1))

    # ------------- Micro level calculation ---------------
    truePositives = truePositives[1:].sum()
    falsePositives = falsePositives[1:].sum()
    falseNegatives = falseNegatives[1:].sum()

    print("Ignoring the Others class, Micro TP : %d, FP : %d, FN : %d" % (truePositives, falsePositives, falseNegatives))

    microPrecision = safeDivide(truePositives, truePositives + falsePositives)
    microRecall = safeDivide(truePositives, truePositives + falseNegatives)
    microF1 = safeDivide(2 * microRecall * microPrecision, microPrecision + microRecall)
    # -----------------------------------------------------

    accuracy = np.mean(predictedLabels == groundLabels)

    print("Accuracy : %.4f, Micro Precision : %.4f, Micro Recall : %.4f, Micro F1 : %.4f" % (accuracy, microPrecision, microRecall, microF1))
    return accuracy, microPrecision, microRecall, microF1


In [10]:
def writeNormalisedData(dataFilePath, texts):
    """Write normalised data to a file next to the original one.

    The output file is named like the input with "_normalised" inserted before
    the extension ("train.txt" -> "train_normalised.txt"). Every output row is
    tab separated: id, then each original turn followed by its normalised
    version, then the label (empty when the input row has no label column).

    Input:
        dataFilePath : Path to original train/test file that has been processed.
                       The first line is treated as a header and skipped.
        texts : List containing the normalised 3 turn conversations, separated by the <eos> tag.
                texts[k] must correspond to the k-th data row of dataFilePath.
    Raises:
        IndexError : if a row has fewer than 4 columns, a text has fewer than
                     3 <eos>-separated turns, or texts is shorter than the file.
    """
    # This helper is only a debugging aid for inspecting the normalisation.

    # Only the final extension is touched. A plain str.replace(".txt", ...) would
    # return the input path unchanged for files without ".txt" (truncating the
    # input when it is opened for writing) and would also rewrite ".txt" inside
    # directory names.
    root, extension = os.path.splitext(dataFilePath)
    normalisedDataFilePath = root + "_normalised" + extension

    with io.open(dataFilePath, encoding='utf8') as fin, \
            io.open(normalisedDataFilePath, 'w', encoding='utf8') as fout:
        fin.readline()  # skip the header row
        for lineNum, line in enumerate(fin):
            columns = line.strip().split('\t')
            normalisedTurns = texts[lineNum].strip().split('<eos>')

            outputColumns = [columns[0]]
            # Write the original turn, followed by the normalised version of the same turn
            for turn in range(3):
                outputColumns.append(columns[turn + 1])
                outputColumns.append(normalisedTurns[turn])

            # Label information is available at train time only.
            label = columns[4] if len(columns) > 4 else ''
            fout.write('\t'.join(outputColumns) + '\t' + label + '\n')


In [11]:
# Load the pre-trained FastText binary model into the module-level name `mod`,
# which getEmbeddingMatrix indexes by word (mod[word]) to fetch vectors.
# NOTE(review): this import sits outside the notebook's import cell; a fresh
# "Run All" still works only because this cell runs before getEmbeddingMatrix is called.
# NOTE(review): gensim.models.wrappers appears to exist only in gensim < 4.0 - confirm the pinned version.
# NOTE(review): the model path below is an absolute, machine-specific location;
# it has to be edited before the notebook can run anywhere else.
from gensim.models.wrappers import FastText
mod = FastText.load_fasttext_format('/home/bt1/17CS10037/taddhita/cc.en.300.bin', encoding="utf8")


# The string below is the last expression of the cell, so the notebook echoes it as the cell output.
'''
    ##########################################################
    ##########################################################
    
    Fasttext model being loaded here. It might take 1 min to load.
    This bin file is of 10mb you can look for 5 gb variant of it.
    Also download the 300d English language '.bin' file not '.vec' file.
    
    ##########################################################
    ##########################################################
'''

"\n    ##########################################################\n    ##########################################################\n    \n    Fasttext model being loaded here. It might take 1 min to load.\n    This bin file is of 10mb you can look for 5 gb variant of it.\n    Also download the 300d English language '.bin' file not '.vec' file.\n    \n    ##########################################################\n    ##########################################################\n"

In [12]:
def getEmbeddingMatrix(wordIndex):
    """Populate an embedding matrix using a word-index. If the word "happy" has an index 19,
       the 19th row in the embedding matrix should contain the embedding vector for the word "happy".

    Vectors are looked up in the module-level FastText model `mod`. Words for
    which the lookup raises KeyError are printed and keep an all-zero row; the
    number of such words is printed once at the end.

    Input:
        wordIndex : A dictionary of (word : index) pairs, extracted using a tokeniser
    Output:
        embeddingMatrix : A (len(wordIndex) + 1, EMBEDDING_DIM) matrix where row `index`
                          holds the vector of the word mapped to `index`
    """
    # Minimum word index of any word is 1, hence the extra (all-zero) row 0.
    embeddingMatrix = np.zeros((len(wordIndex) + 1, EMBEDDING_DIM))

    # Kept separate from the loop variable: sharing one name made the final
    # print show the last word index instead of the number of missing words.
    missingWords = 0
    for word, index in wordIndex.items():
        try:
            embeddingMatrix[index] = mod[word]
        except KeyError:
            print(word)
            missingWords += 1
    print(missingWords)
    return embeddingMatrix

In [7]:
import emoji
import regex

def split_count(text):
    """Return, in order of appearance, every grapheme cluster of `text` that contains an emoji.

    The text is cut into extended grapheme clusters (regex ``\\X``); a cluster is
    kept when at least one of its characters is a key of ``emoji.UNICODE_EMOJI``.
    Repeated emojis appear once per occurrence.
    """
    graphemes = regex.findall(r'\X', text)
    return [
        cluster
        for cluster in graphemes
        if any(symbol in emoji.UNICODE_EMOJI for symbol in cluster)
    ]

In [8]:
# Quick manual check of split_count on text that mixes words and emojis.
# `line` is a richer sample (skin tones, ZWJ sequences); it is not used by the call below.
line = ["🤔 🙈 me así, se 😌 ds 💕👭👙 hello 👩🏾‍🎓 emoji hello 👨‍👩‍👦‍👦 how are 😊 you today🙅🏽🙅🏽"]
line1=["money money and lots of money😍😍"]

# One list entry is returned per emoji occurrence, so the doubled emoji shows up twice.
counter = split_count(line1[0])
print(counter)

['😍', '😍']


In [9]:
def _isolateToken(line, token):
    """Collapse every run of `token` in `line` into one occurrence surrounded by spaces."""
    # A function is used as replacement so that `token` is inserted verbatim.
    return re.sub('(?:' + re.escape(token) + ')+', lambda _match: ' ' + token + ' ', line)


def preprocessData(dataFilePath, mode):
    """Load data from a file, process and return indices, conversations and labels in separate lists.

    The first line of the file is treated as a header and skipped. Every other
    line is expected to be tab separated: id, turn 1, turn 2, turn 3 and, in
    "train" mode, the emotion label.

    Normalisation applied to each line:
      * runs of '.', '?' or ',' are collapsed to one character with spaces around it
        (okay...sure -> okay . sure, okay???sure -> okay ? sure)
      * runs of the same emoji are collapsed and separated from the text
        (hmm😞😞 -> hmm 😞)
      * duplicate spaces are removed and the text is lower-cased
    Runs at the very start or end of a line are kept as well; a plain
    split/join would silently drop them (e.g. on a last line with no newline).

    Input:
        dataFilePath : Path to train/test file to be processed
        mode : "train" mode returns labels. "test" mode doesn't return labels.
    Output:
        indices : Unique conversation ID list
        conversations : List of 3 turn conversations, processed and each turn separated by the <eos> tag
        labels : [Only available in "train" mode] List of labels
    """
    indices = []
    conversations = []
    labels = []

    # NOTE(review): '!' is not normalised - the original list left it out; confirm that is intended.
    repeatedChars = ['.', '?', ',']
    duplicateSpacePattern = re.compile(r'\ +')

    with io.open(dataFilePath, encoding="utf8") as finput:
        finput.readline()  # skip the header row
        for line in finput:
            # Collect the emojis before the line is modified; each distinct emoji
            # only needs one pass, in order of first appearance.
            distinctEmojis = []
            for symbol in split_count(line):
                if symbol not in distinctEmojis:
                    distinctEmojis.append(symbol)

            for token in repeatedChars + distinctEmojis:
                line = _isolateToken(line, token)

            columns = line.strip().split('\t')
            if mode == "train":
                # Train data contains id, 3 turns and label
                labels.append(emotion2label[columns[4]])

            conv = ' <eos> '.join(columns[1:4])

            # Remove any duplicate spaces
            conv = duplicateSpacePattern.sub(' ', conv)

            indices.append(int(columns[0]))
            conversations.append(conv.lower())

    if mode == "train":
        return indices, conversations, labels
    else:
        return indices, conversations


In [29]:
    # ------------------------------------------------------------------
    # Driver cell: load and normalise the data, tokenise it, build the
    # embedding matrix and the shuffled training tensors.
    # ------------------------------------------------------------------
    print("Processing training data...")
    trainIndices, trainTexts, labels = preprocessData(trainDataPath, mode="train")
    # Write normalised text to file to check if normalisation works. Disabled now. Uncomment following line to enable   
    #writeNormalisedData(trainDataPath, trainTexts)
    print("Processing test data...")
    testIndices, testTexts = preprocessData(testDataPath, mode="test")
    #writeNormalisedData(testDataPath, testTexts)

    print("Extracting tokens...")
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    # The vocabulary is fitted on the training texts only.
    tokenizer.fit_on_texts(trainTexts)
    trainSequences = tokenizer.texts_to_sequences(trainTexts)
    testSequences = tokenizer.texts_to_sequences(testTexts)

    wordIndex = tokenizer.word_index
    print("Found %s unique tokens." % len(wordIndex))

    print("Populating embedding matrix...")
    embeddingMatrix = getEmbeddingMatrix(wordIndex)

    data = pad_sequences(trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
    # num_classes keeps the label tensor NUM_CLASSES wide even if the highest
    # label does not occur in the training file.
    labels = to_categorical(np.asarray(labels), num_classes=NUM_CLASSES)
    print("Shape of training data tensor: ", data.shape)
    print("Shape of label tensor: ", labels.shape)
        
    # Randomize data
    # Shuffle by row position. Indexing with the conversation ids themselves is
    # only correct when the ids happen to be exactly 0..N-1 in file order.
    shuffleOrder = np.random.permutation(len(data))
    data = data[shuffleOrder]
    labels = labels[shuffleOrder]
    # Keep the id list aligned with the shuffled rows.
    trainIndices = [trainIndices[position] for position in shuffleOrder]
      
    # Perform k-fold cross validation
    # One list per metric; presumably one entry is appended per fold by later cells.
    metrics = {"accuracy" : [],
               "microPrecision" : [],
               "microRecall" : [],
               "microF1" : []}

Processing training data...
Processing test data...
Extracting tokens...
Found 15111 unique tokens.
Populating embedding matrix...
😂
😭
i'll
😞
😁
😢
😍
😀
😡
😅
i've
😆
😄
😊
😃
😒
😠
🙂
😤
😉
😘
😌
😹
😺
how's
😩
i'd
😸
😫
👍
i’m
💔
😻
😽
😑
😋
😝
😔
❤️
😏
🙁
😬
ain't
😾
🙀
😿
😧
😦
🙄
😛
👎
😐
😖
😥
🎁
🤣
😇
‑d
😕
☺️
💕
😪
😣
'‑
👌
🤔
😶
😟
🙈
🙏
😱
😳
i̇
8‑d
x‑d
😯
😰
💙
😓
😨
😮
okh
💋
🤗
😈
💞
how're
🖕
♂️
😴
󾌹
👄
✨
💚
it'll
😷
‑c
i’ve
👏
💖
‑3
👀
：）
・ω・
u'r
💗
ohhk
👦
👿
😚
💐
byyy
💘
👊
it'd
nthg
hbu
let’s
☹️
🙌
d‑'
🙊
🤦‍
wlcm
👻
👍🏻
💝
💛
😗
it'
😵
i’ll
😼
🌹
man's
mom's
how've
💜
👋
💓
👩‍
wht's
ohkk
💃
ddlj
mum's
💤
how'd
😲
✌️
how’s
son's
🐻
🐼
🐍
'6'
'if'
yrrr
💑
eh'
swty
❤️‍
👨
👨‍
💍
🐱
jzt
i'n
now'
🙃
🌍
lol'
🍒
🤘
🤐
hwz
🚗
okzz
👅
i̇f
👭
☝️
🎂
🍾
d'
'rec'
ai's
qstn
why's
msgng
dog's
🐺
eldg
🤡
👮
👷
iopp
🔥
ma'am
cry's
guy's
extc
cllg
🐷
hmmk
🙏🏻
'na'
👊🏻
tbhhh
hmnn
🍞
🐔
dhf
👬
👆
it�s
vaijy
👍🏼
💭
okz
≧∇≦
💩
ukw
🤦🏻‍
👯
ur's
🆗
🐶
or…
🍜
who’s
💃🏻
🐰
'fake'
cgat
💰
🍺
yyup
🏀
bgst
'noob'
🐒
'i'm
up'
hapnd
okhh
👧
͡°
👈
lifw
🙆🏻
👩
🤪
rudw
gunyt
haapy
❓
kxip
👏🏾
i'l
hol'
lil'
🙏🏿
nekde
🖑
aadu2
you�re
haooy
¿did
👼🏻
whé

In [27]:
 '''
    ##########################################################
    ##########################################################
    
    Build model
    
    ##########################################################
    ##########################################################
    '''
def buildModel(embeddingMatrix):
    """Constructs the architecture of the model.

    Architecture: frozen pre-trained embedding -> LSTM(LSTM_DIM) -> Dense(NUM_CLASSES).
    The model is compiled with categorical cross-entropy and RMSprop
    (learning rate LEARNING_RATE) and reports accuracy.

    Input:
        embeddingMatrix : The embedding matrix to be loaded in the embedding layer.
                          Its first dimension is used as the vocabulary size.
    Output:
        model : A basic, compiled LSTM model
    """
    # trainable=False keeps the pre-trained word vectors fixed during training.
    embeddingLayer = Embedding(embeddingMatrix.shape[0],
                               EMBEDDING_DIM,
                               weights=[embeddingMatrix],
                               input_length=MAX_SEQUENCE_LENGTH,
                               trainable=False)
    model = Sequential()
    model.add(embeddingLayer)
    model.add(LSTM(LSTM_DIM, dropout=DROPOUT))
    # Each conversation has exactly one label and the loss is categorical
    # cross-entropy, so the output layer must produce a distribution over the
    # classes: softmax, not independent per-class sigmoids.
    model.add(Dense(NUM_CLASSES, activation='softmax'))

    # RMSprop is the canonical class name; the lowercase alias is not available in every Keras release.
    rmsprop = optimizers.RMSprop(lr=LEARNING_RATE)
    model.compile(loss='categorical_crossentropy',
                  optimizer=rmsprop,
                  metrics=['acc'])
    return model