In [1]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras import optimizers
from keras.models import load_model
import json, argparse, os
import re
import io
import sys

Using TensorFlow backend.


In [2]:
trainDataPath = "train.txt"
testDataPath = "devwithoutlabels.txt"
# Output file that will be generated. This file can be directly submitted.
solutionPath = "test.txt"
# Path to directory where GloVe file is saved.
gloveDir = "./glove"
NUM_FOLDS = 5                   # Value of K in K-fold Cross Validation
NUM_CLASSES = 4                 # Number of classes - Happy, Sad, Angry, Others
MAX_NB_WORDS = 20000                # To set the upper limit on the number of tokens extracted using keras.preprocessing.text.Tokenizer 
MAX_SEQUENCE_LENGTH = 100         # All sentences having lesser number of words than this will be padded
EMBEDDING_DIM = 300               # The dimension of the word embeddings
BATCH_SIZE = 200                  # The batch size to be chosen for training the model.
LSTM_DIM = 128                    # The dimension of the representations learnt by the LSTM model
DROPOUT = 0.2  
LEARNING_RATE = 0.003 # Fraction of the units to drop for the linear transformation of the inputs. Ref - https://keras.io/layers/recurrent/
NUM_EPOCHS = 75                  # Number of epochs to train a model for
'''

Change parameters here for the LSTM model used below

'''

label2emotion = {0:"others", 1:"happy", 2: "sad", 3:"angry"}
emotion2label = {"others":0, "happy":1, "sad":2, "angry":3}

In [3]:
def getMetrics(predictions, ground):
    """Given predicted labels and the respective ground truth labels, display some metrics
    Input: shape [# of samples, NUM_CLASSES]
        predictions : Model output. Every row has 4 decimal values, with the highest belonging to the predicted class
        ground : Ground truth labels, converted to one-hot encodings. A sample belonging to Happy class will be [0, 1, 0, 0]
    Output:
        accuracy : Average accuracy
        microPrecision : Precision calculated on a micro level. Ref - https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin/16001
        microRecall : Recall calculated on a micro level
        microF1 : Harmonic mean of microPrecision and microRecall. Higher value implies better classification  
    """
    # [0.1, 0.3 , 0.2, 0.1] -> [0, 1, 0, 0]
    discretePredictions = to_categorical(predictions.argmax(axis=1))
    
    truePositives = np.sum(discretePredictions*ground, axis=0)
    falsePositives = np.sum(np.clip(discretePredictions - ground, 0, 1), axis=0)
    falseNegatives = np.sum(np.clip(ground-discretePredictions, 0, 1), axis=0)
    '''
    ##########################################################
    ##########################################################
    
    IMPORTANT
    Please add  code here or somewhere else to print the TruePositives,FalsePositives,FalseNegatives
    
    ##########################################################
    ##########################################################
    '''
    
    print("True Positives per class : ", truePositives)
    print("False Positives per class : ", falsePositives)
    print("False Negatives per class : ", falseNegatives)
    
    # ------------- Macro level calculation ---------------
    macroPrecision = 0
    macroRecall = 0
    # We ignore the "Others" class during the calculation of Precision, Recall and F1
    for c in range(1, NUM_CLASSES):
        precision = truePositives[c] / (truePositives[c] + falsePositives[c])
        macroPrecision += precision
        recall = truePositives[c] / (truePositives[c] + falseNegatives[c])
        macroRecall += recall
        f1 = ( 2 * recall * precision ) / (precision + recall) if (precision+recall) > 0 else 0
        print("Class %s : Precision : %.3f, Recall : %.3f, F1 : %.3f" % (label2emotion[c], precision, recall, f1))
    
    macroPrecision /= 3
    macroRecall /= 3
    macroF1 = (2 * macroRecall * macroPrecision ) / (macroPrecision + macroRecall) if (macroPrecision+macroRecall) > 0 else 0
    print("Ignoring the Others class, Macro Precision : %.4f, Macro Recall : %.4f, Macro F1 : %.4f" % (macroPrecision, macroRecall, macroF1))   
    
    # ------------- Micro level calculation ---------------
    truePositives = truePositives[1:].sum()
    falsePositives = falsePositives[1:].sum()
    falseNegatives = falseNegatives[1:].sum()    
    
    print("Ignoring the Others class, Micro TP : %d, FP : %d, FN : %d" % (truePositives, falsePositives, falseNegatives))
    
    microPrecision = truePositives / (truePositives + falsePositives)
    microRecall = truePositives / (truePositives + falseNegatives)
    
    microF1 = ( 2 * microRecall * microPrecision ) / (microPrecision + microRecall) if (microPrecision+microRecall) > 0 else 0
    # -----------------------------------------------------
    
    predictions = predictions.argmax(axis=1)
    ground = ground.argmax(axis=1)
    accuracy = np.mean(predictions==ground)
    
    print("Accuracy : %.4f, Micro Precision : %.4f, Micro Recall : %.4f, Micro F1 : %.4f" % (accuracy, microPrecision, microRecall, microF1))
    return accuracy, microPrecision, microRecall, microF1


In [4]:
def writeNormalisedData(dataFilePath, texts):
    """Write normalised data to a file
    Input:
        dataFilePath : Path to original train/test file that has been processed
        texts : List containing the normalised 3 turn conversations, separated by the <eos> tag.
    """
    
    '''
    ##########################################################
    ##########################################################
    
    You May ignore this function
    
    ##########################################################
    ##########################################################
    '''
    normalisedDataFilePath = dataFilePath.replace(".txt", "_normalised.txt")
    with io.open(normalisedDataFilePath, 'w', encoding='utf8') as fout:
        with io.open(dataFilePath, encoding='utf8') as fin:
            fin.readline()
            for lineNum, line in enumerate(fin):
                line = line.strip().split('\t')
                normalisedLine = texts[lineNum].strip().split('<eos>')
                fout.write(line[0] + '\t')
                # Write the original turn, followed by the normalised version of the same turn
                fout.write(line[1] + '\t' + normalisedLine[0] + '\t')
                fout.write(line[2] + '\t' + normalisedLine[1] + '\t')
                fout.write(line[3] + '\t' + normalisedLine[2] + '\t')
                try:
                    # If label information available (train time)
                    fout.write(line[4] + '\n')    
                except:
                    # If label information not available (test time)
                    fout.write('\n')


In [5]:
from gensim.models.wrappers import FastText
mod = FastText.load_fasttext_format('/home/bt1/17CS10037/taddhita/cc.en.300.bin', encoding="utf8")


'''
    ##########################################################
    ##########################################################
    
    Fasttext model being loaded here. It might take 1 min to load.
    This bin file is of 10mb you can look for 5 gb variant of it.
    Also download the 300d English language '.bin' file not '.vec' file.
    
    ##########################################################
    ##########################################################
'''

"\n    ##########################################################\n    ##########################################################\n    \n    Fasttext model being loaded here. It might take 1 min to load.\n    This bin file is of 10mb you can look for 5 gb variant of it.\n    Also download the 300d English language '.bin' file not '.vec' file.\n    \n    ##########################################################\n    ##########################################################\n"

In [6]:
def getEmbeddingMatrix(wordIndex):
    """Populate an embedding matrix using a word-index. If the word "happy" has an index 19,
       the 19th row in the embedding matrix should contain the embedding vector for the word "happy".
    Input:
        wordIndex : A dictionary of (word : index) pairs, extracted using a tokeniser
    Output:
        embeddingMatrix : A matrix where every row has 100 dimensional GloVe embedding
    """
    
    # Minimum word index of any word is 1. 
    '''
    ##########################################################
    ##########################################################
    
    If you want to use gloVe model you may use but main purpose of it is to generate wordtovec
    If possible look up for some better library than fasttext.bin file here which can deal with emoji as well
    
    ##########################################################
    ##########################################################
    '''
    i=0
    error_words =[]
    embeddingMatrix = np.zeros((len(wordIndex) + 1, EMBEDDING_DIM))
    for word, i in wordIndex.items():
        try:
            embeddingVector = mod[word]
            embeddingMatrix[i] = embeddingVector    
        except KeyError:
            print(word)
            error_words.append(word)
    print(len(error_words))
    return embeddingMatrix

# def getEmbeddingMatrix(wordIndex):
#     """Populate an embedding matrix using a word-index. If the word "happy" has an index 19,
#        the 19th row in the embedding matrix should contain the embedding vector for the word "happy".
#     Input:
#         wordIndex : A dictionary of (word : index) pairs, extracted using a tokeniser
#     Output:
#         embeddingMatrix : A matrix where every row has 100 dimensional GloVe embedding
#     """
#     embeddingsIndex = {}
#     # Load the embedding vectors from ther GloVe file
#     with io.open(os.path.join(gloveDir, 'glove.6B.100d.txt'), encoding="utf8") as f:
#         for line in f:
#             values = line.split()
#             word = values[0]
#             embeddingVector = np.asarray(values[1:], dtype='float32')
#             embeddingsIndex[word] = embeddingVector
    
#     print('Found %s word vectors.' % len(embeddingsIndex))
#     error_words =[]
#     # Minimum word index of any word is 1. 
#     embeddingMatrix = np.zeros((len(wordIndex) + 1, EMBEDDING_DIM))
#     for word, i in wordIndex.items():
#         embeddingVector = embeddingsIndex.get(word)
#         if embeddingVector is not None:
#             # words not found in embedding index will be all-zeros.
#             embeddingMatrix[i] = embeddingVector
#         else
#             print(word)
#             error_words.append(word)
#     print(len(error_words))
#     return embeddingMatrix

In [7]:
import emoji
import regex

def split_count(text):

    emoji_list = []
    data = regex.findall(r'\X', text)
    for word in data:
        if any(char in emoji.UNICODE_EMOJI for char in word):
            emoji_list.append(word)

    return emoji_list

In [8]:
line1 = ["ü§î üôà me as√≠, se üòå ds üíïüë≠üëô hello üë©üéì emoji hello  how are üòä you todayüôÖüèΩüôÖüèΩ"]
line=["money money and lots of moneyüòçüòç"]

counter = split_count(line[0])
print(counter)
emojis = split_count(line1[0])
for c in emojis:
    lineSplit = line1[0].split(c)
    while True:
        try:
            lineSplit.remove('')
        except:
            break
    cSpace = ' ' + emoji.UNICODE_EMOJI[c] + ' '    
    line1[0] = cSpace.join(lineSplit)
print(line1[0])

['üòç', 'üòç']
  :see-no-evil_monkey:  me as√≠, se  :relieved_face:  ds  :two_hearts:  :two_women_holding_hands:  :bikini:  hello  :woman:  :graduation_cap:  emoji hello  how are  :smiling_face_with_smiling_eyes:  you today


In [21]:

def preprocessData(dataFilePath, mode):
    """Load data from a file, process and return indices, conversations and labels in separate lists
    Input:
        dataFilePath : Path to train/test file to be processed
        mode : "train" mode returns labels. "test" mode doesn't return labels.
    Output:
        indices : Unique conversation ID list
        conversations : List of 3 turn conversations, processed and each turn separated by the <eos> tag
        labels : [Only available in "train" mode] List of labels
    """
    
    '''
    ##########################################################
    ##########################################################
    
    IMPORTANT
    Please try to better the pre-processing here by sepearting the emojis with text like  "hmmüòû", "now‚Äº‚Äº‚Äºüò£üò§üò§" & etc.
    Also look for other methods.
    
    ##########################################################
    ##########################################################
    '''
    indices = []
    conversations = []
    labels = []
    i=0
    with io.open(dataFilePath, encoding="utf8") as finput:
        finput.readline()
        for line in finput:
            i+=1
            
            # Convert multiple instances of . ? ! , to single instance
            # okay...sure -> okay . sure
            # okay???sure -> okay ? sure
            # Add whitespace around such punctuation
            # okay!sure -> okay ! sure
            
            emojis = split_count(line)
            repeatedChars = ['.']
            repeatedChars.append('?')
            repeatedChars.append(',')
            repeatedChars.append('_')
            repeatedChars.append(':')
            repeatedChars.append('-')
            repeatedChars.append(';')
            repeatedChars.append('&')
            repeatedChars.append('#')
            for c in emojis:
                lineSplit = line.split(c)
                while True:
                    try:
                        lineSplit.remove('')
                    except:
                        break
                cSpace = ' '    
                line = cSpace.join(lineSplit)
            for c in repeatedChars:
                lineSplit = line.split(c)
                while True:
                    try:
                        lineSplit.remove('')
                    except:
                        break
                cSpace = ' ' + c + ' '    
                line = cSpace.join(lineSplit)
            
            line = line.strip().split('\t')
            if mode == "train":
                # Train data contains id, 3 turns and label
                label = emotion2label[line[4]]
                labels.append(label)
            
            conv = ' <eos> '.join(line[1:4])
#             print(conv)
#             print("#######Conv Over######")
            # Remove any duplicate spaces
            duplicateSpacePattern = re.compile(r'\ +')
            conv = re.sub(duplicateSpacePattern, ' ', conv)
            #print(conv)
            indices.append(int(line[0]))
            conversations.append(conv.lower())
            print(conversations)
            
    if mode == "train":
        return indices, conversations, labels
    else:
        return indices, conversations


In [22]:
    '''
    ##########################################################
    ##########################################################
    
    Simple function calling.
    
    ##########################################################
    ##########################################################
    '''
    print("Processing training data...")
    trainIndices, trainTexts, labels = preprocessData(trainDataPath, mode="train")
    # Write normalised text to file to check if normalisation works. Disabled now. Uncomment following line to enable   
    #writeNormalisedData(trainDataPath, trainTexts)
    print("Processing test data...")
    testIndices, testTexts = preprocessData(testDataPath, mode="test")
    #writeNormalisedData(testDataPath, testTexts)

    print("Extracting tokens...")
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(trainTexts)
    trainSequences = tokenizer.texts_to_sequences(trainTexts)
    testSequences = tokenizer.texts_to_sequences(testTexts)

    wordIndex = tokenizer.word_index
    print("Found %s unique tokens." % len(wordIndex))

    print("Populating embedding matrix...")
    embeddingMatrix = getEmbeddingMatrix(wordIndex)

    data = pad_sequences(trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(labels))
    print("Shape of training data tensor: ", data.shape)
    print("Shape of label tensor: ", labels.shape)
        
    # Randomize data
    np.random.shuffle(trainIndices)
    data = data[trainIndices]
    labels = labels[trainIndices]
      
    # Perform k-fold cross validation
    metrics = {"accuracy" : [],
               "microPrecision" : [],
               "microRecall" : [],
               "microF1" : []}

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



["don't worry i'm girl <eos> hmm how do i know if you are <eos> what's ur name ? ", 'when did i ? <eos> saw many times i think - _ - <eos> no . i never saw you', 'by <eos> by google chrome <eos> where you live', 'u r ridiculous <eos> i might be ridiculous but i am telling the truth . <eos> u little disgusting whore', 'just for time pass <eos> wt do u do 4 a living then <eos> maybe', "i'm a dog person <eos> youre so rude <eos> whaaaat why", 'so whatsup <eos> nothing much . sitting sipping and watching tv . how abt u ? <eos> what are you watching on tv ? ', 'ok <eos> ok im back!! <eos> so , how are u', 'really ? <eos> really really really really really <eos> y saying so many times . i can hear you', 'bay <eos> in the bay <eos> love you', 'i hate my boyfriend <eos> you got a boyfriend ? <eos> yes', 'i will do night . <eos> alright . keep me in loop . <eos> not giving whatsapp no . ', 'sure go ahead <eos> many thanks once again! <eos> love you too', "bad <eos> bad bad! that's the bad kind 

KeyboardInterrupt: 

In [13]:
 '''
    ##########################################################
    ##########################################################
    
    Build model
    
    ##########################################################
    ##########################################################
    '''
def buildModel(embeddingMatrix):
    """Constructs the architecture of the model
    Input:
        embeddingMatrix : The embedding matrix to be loaded in the embedding layer.
    Output:
        model : A basic LSTM model
    """
    embeddingLayer = Embedding(embeddingMatrix.shape[0],
                                EMBEDDING_DIM,
                                weights=[embeddingMatrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    model = Sequential()
    model.add(embeddingLayer)
    model.add(LSTM(LSTM_DIM, dropout=DROPOUT))
    model.add(Dense(NUM_CLASSES, activation='sigmoid'))
    
    rmsprop = optimizers.rmsprop(lr=LEARNING_RATE)
    model.compile(loss='categorical_crossentropy',
                  optimizer=rmsprop,
                  metrics=['acc'])
    return model

In [14]:
    '''
    ##########################################################
    ##########################################################
    
    IMPORTANT
    doing k-fold validation and it will take time to execute maybe an hour or two
    Please analyse TruePositives,FalsePositives,FalseNegatives to gain insights why model is not working
    
    ##########################################################
    ##########################################################
    '''
    print("Starting k-fold cross validation...")
    for k in range(NUM_FOLDS):
        print('-'*40)
        print("Fold %d/%d" % (k+1, NUM_FOLDS))
        validationSize = int(len(data)/NUM_FOLDS)
        index1 = validationSize * k
        index2 = validationSize * (k+1)
            
        xTrain = np.vstack((data[:index1],data[index2:]))
        yTrain = np.vstack((labels[:index1],labels[index2:]))
        xVal = data[index1:index2]
        yVal = labels[index1:index2]
        print("Building model...")
        model = buildModel(embeddingMatrix)
        model.fit(xTrain, yTrain, 
                  validation_data=(xVal, yVal),
                  epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)

        predictions = model.predict(xVal, batch_size=BATCH_SIZE)
        accuracy, microPrecision, microRecall, microF1 = getMetrics(predictions, yVal)
        metrics["accuracy"].append(accuracy)
        metrics["microPrecision"].append(microPrecision)
        metrics["microRecall"].append(microRecall)
        metrics["microF1"].append(microF1)

Starting k-fold cross validation...
----------------------------------------
Fold 1/5
Building model...
Train on 24128 samples, validate on 6032 samples
Epoch 1/75
 3400/24128 [===>..........................] - ETA: 27s - loss: 1.1936 - acc: 0.4929

KeyboardInterrupt: 

In [None]:
        
print("\n============= Metrics =================")
print("Average Cross-Validation Accuracy : %.4f" % (sum(metrics["accuracy"])/len(metrics["accuracy"])))
print("Average Cross-Validation Micro Precision : %.4f" % (sum(metrics["microPrecision"])/len(metrics["microPrecision"])))
print("Average Cross-Validation Micro Recall : %.4f" % (sum(metrics["microRecall"])/len(metrics["microRecall"])))
print("Average Cross-Validation Micro F1 : %.4f" % (sum(metrics["microF1"])/len(metrics["microF1"])))
    
print("\n======================================")
    
print("Retraining model on entire data to create solution file")
model = buildModel(embeddingMatrix)
model.fit(data, labels, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)
model.save('EP%d_LR%de-5_LDim%d_BS%d.h5'%(NUM_EPOCHS, int(LEARNING_RATE*(10**5)), LSTM_DIM, BATCH_SIZE))
# model = load_model('EP%d_LR%de-5_LDim%d_BS%d.h5'%(NUM_EPOCHS, int(LEARNING_RATE*(10**5)), LSTM_DIM, BATCH_SIZE))


In [119]:
    '''
    ##########################################################
    ##########################################################
    
    Prediction on dev-set
    
    ##########################################################
    ##########################################################
    '''
    print("Creating solution file...")
    testData = pad_sequences(testSequences, maxlen=MAX_SEQUENCE_LENGTH)
    predictions = model.predict(testData, batch_size=BATCH_SIZE)
    predictions = predictions.argmax(axis=1)

Creating solution file...


In [120]:
    '''
    ##########################################################
    ##########################################################
    
    Dont modify it.It writes the txt file which is needed to submit at the site
    
    ##########################################################
    ##########################################################
    '''
    with io.open(solutionPath, "w", encoding="utf8") as fout:
        fout.write(unicode('\t'.join(["id", "turn1", "turn2", "turn3", "label"]) + '\n'))        
        with io.open(testDataPath, encoding="utf8") as fin:
            fin.readline()
            for lineNum, line in enumerate(fin):
                fout.write('\t'.join(line.strip().split('\t')[:4]) + '\t')
                fout.write(unicode(label2emotion[predictions[lineNum]] + '\n'))
    print("Completed. Model parameters: ")
    print("Learning rate : %.3f, LSTM Dim : %d, Dropout : %.3f, Batch_size : %d" 
          % (LEARNING_RATE, LSTM_DIM, DROPOUT, BATCH_SIZE))


NameError: name 'unicode' is not defined