In [1]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras import optimizers
from keras.models import load_model

Using TensorFlow backend.


In [2]:
import json, argparse, os
import re
import io
import sys

In [3]:
trainDataPath = "Recursive-neural-networks-TensorFlow/train.txt"
testDataPath = "devwithoutlabels.txt"
# Output file that will be generated. This file can be directly submitted.
solutionPath = "test.txt"
# Path to directory where GloVe file is saved.
gloveDir = "./glove"
NUM_FOLDS = 5                   # Value of K in K-fold Cross Validation
NUM_CLASSES = 4                 # Number of classes - Happy, Sad, Angry, Others
MAX_NB_WORDS = 20000                # To set the upper limit on the number of tokens extracted using keras.preprocessing.text.Tokenizer 
MAX_SEQUENCE_LENGTH = 100         # All sentences having lesser number of words than this will be padded
EMBEDDING_DIM = 300               # The dimension of the word embeddings
BATCH_SIZE = 200                  # The batch size to be chosen for training the model.
LSTM_DIM = 128                    # The dimension of the representations learnt by the LSTM model
DROPOUT = 0.2  
LEARNING_RATE = 0.003 # Fraction of the units to drop for the linear transformation of the inputs. Ref - https://keras.io/layers/recurrent/
NUM_EPOCHS = 75                  # Number of epochs to train a model for
'''

Change parameters here for the LSTM model used below

'''

label2emotion = {0:"others", 1:"happy", 2: "sad", 3:"angry"}
emotion2label = {"others":0, "happy":1, "sad":2, "angry":3}

In [4]:
import emoji
import regex

def split_count(text):

    emoji_list = []
    data = regex.findall(r'\X', text)
    for word in data:
        if any(char in emoji.UNICODE_EMOJI for char in word):
            emoji_list.append(word)

    return emoji_list

In [5]:
line1 = ["🤔 🙈 mfsafase sadasí, se 😌 dsdgss 💕👭👙 hello 👩🎓 emoji hello  how are 😊 you today🙅🏽🙅🏽"]
line=["money money and lots of money😍😍"]

counter = split_count(line[0])
print(counter)
emojis = split_count(line1[0])
for c in emojis:
    lineSplit = line1[0].split(c)
    while True:
        try:
            lineSplit.remove('')
        except:
            break
    cSpace = ' ' + emoji.UNICODE_EMOJI[c] + ' '    
    line1[0] = cSpace.join(lineSplit)
print(line1[0])

['😍', '😍']
  :see-no-evil_monkey:  mfsafase sadasí, se  :relieved_face:  dsdgss  :two_hearts:  :two_women_holding_hands:  :bikini:  hello  :woman:  :graduation_cap:  emoji hello  how are  :smiling_face_with_smiling_eyes:  you today


In [6]:

def preprocessData(dataFilePath, mode):
    """Load data from a file, process and return indices, conversations and labels in separate lists
    Input:
        dataFilePath : Path to train/test file to be processed
        mode : "train" mode returns labels. "test" mode doesn't return labels.
    Output:
        indices : Unique conversation ID list
        conversations : List of 3 turn conversations, processed and each turn separated by the <eos> tag
        labels : [Only available in "train" mode] List of labels
    """
    
    '''
    ##########################################################
    ##########################################################
    
    IMPORTANT
    Please try to better the pre-processing here by sepearting the emojis with text like  "hmm😞", "now‼‼‼😣😤😤" & etc.
    Also look for other methods.
    
    ##########################################################
    ##########################################################
    '''
    indices = []
    conversations = []
    labels = []
    i=0
    open("trees.ptb", "w").close()
    
    with open('../stanford-corenlp-full-2018-10-05/output.txt','w') as f:
        f.write("")
    last_tree_processed = ""

    with io.open(dataFilePath, encoding="utf8") as finput:
        finput.readline()
        for line in finput:
            i+=1
            
            # Convert multiple instances of . ? ! , to single instance
            # okay...sure -> okay . sure
            # okay???sure -> okay ? sure
            # Add whitespace around such punctuation
            # okay!sure -> okay ! sure
            
            emojis = split_count(line)
            repeatedChars = ['.']
            repeatedChars.append('?')
            repeatedChars.append(',')
            repeatedChars.append('_')
            repeatedChars.append(':')
            repeatedChars.append('-')
            repeatedChars.append(';')
            repeatedChars.append('&')
            repeatedChars.append('#')
            for c in emojis:
                lineSplit = line.split(c)
                while True:
                    try:
                        lineSplit.remove('')
                    except:
                        break
                cSpace = ' '    
                line = cSpace.join(lineSplit)
            for c in repeatedChars:
                lineSplit = line.split(c)
                while True:
                    try:
                        lineSplit.remove('')
                    except:
                        break
                cSpace = ' ' + c + ' '    
                line = cSpace.join(lineSplit)
            
            line = line.strip().split('\t')
            if mode == "train":
                # Train data contains id, 3 turns and label
                label = emotion2label[line[4]]
                labels.append(label)
            
            conv = ' <eos> '.join(line[1:4])

#             print(conv)
#             print("#######Conv Over######")
            # Remove any duplicate spaces
            duplicateSpacePattern = re.compile(r'\ +')
            conv = (re.sub(duplicateSpacePattern, ' ', conv)).lower()
            conv = translator(conv)
            indices.append(int(line[0]))

            if mode == "train":
                tree = text_to_ptb(line[4], conv, last_tree_processed)
            else:
                tree = text_to_ptb("", conv, last_tree_processed)

            conversations.append(tree)
            print(tree)
#             print(conversations)
#             '''
#             COMMENT OUT THE FOLLOWING 2 LINES BEFORE RUNNING!
#             '''
#             import time 
#             time.sleep(10)
            print("Done for " + str(len(conversations)))

    
    if mode == "train":
        return indices, conversations, labels
    else:
        return indices, conversations


In [7]:
def text_to_ptb(label, text, last_tree_processed):
    dir_path = os.path.dirname(os.path.abspath('__file__'))
    temp_write_out = '../stanford-corenlp-full-2018-10-05/input.txt'
    temp_read_tree = '../stanford-corenlp-full-2018-10-05/output.txt'
    tree = ''

    with open(temp_write_out,'w') as f:
        f.write(text.replace('<eos>',',').replace('.',','))

    os.system("cd ~/stanford-corenlp-full-2018-10-05 && java -Xmx8g edu.stanford.nlp.sentiment.SentimentPipeline -file input.txt -output PENNTREES > output.txt")

    with open(temp_read_tree, 'r') as f2:
        tree = f2.read()
           
    with open('trees.ptb','a') as f3:
        f3.write(label + " " + (" ").join((tree.split("\n"))[1:]) + "\n")

    return (tree)

In [None]:
'''
############################################################
############################################################
Testing the data being loaded in Sentiment-PTB(Sentiment Penn TreeBank) Format
(Check if this is giving proper output)
############################################################
############################################################
'''

trainIndices, trainTexts, labels = preprocessData(trainDataPath, mode="train")

don't worry i'm girl , hmm how do i know if you are , what's ur name ?
(1 (2 (2 (2 do) (2 n't)) (2 (1 worry) (2 (2 i) (2 (2 'm) (2 (2 (2 girl) (2 ,)) (2 (2 hmm) (2 (2 how) (2 (2 do) (2 (2 i) (2 (2 know) (2 (2 if) (2 (2 you) (2 are))))))))))))) (2 (2 ,) (2 (2 what) (2 (2 (2 's) (2 (2 ur) (2 name))) (2 ?)))))

Done for 1
when did i ?
(2 (2 when) (2 (2 (2 did) (2 i)) (2 ?)))
, saw many times i think - _ - , no , i never saw you
(1 (2 ,) (1 (2 saw) (1 (2 (2 (2 many) (2 times)) (2 (2 i) (2 think))) (1 (2 -) (1 (2 (2 _) (2 -)) (1 (2 ,) (1 (1 no) (2 (2 ,) (2 (2 i) (2 (2 never) (2 (2 saw) (2 you))))))))))))

Done for 2
by , by google chrome , where you live
(1 (2 by) (2 (2 ,) (2 (2 (2 by) (2 (2 google) (2 chrome))) (2 (2 ,) (2 (2 where) (2 (2 you) (2 live)))))))

Done for 3
u r ridiculous , i might be ridiculous but i am telling the truth , , u little disgusting whore
(1 (2 u) (1 (2 r) (0 (1 ridiculous) (1 (1 (2 ,) (1 (1 (2 i) (1 (1 (2 might) (1 (1 (2 be) (1 ridiculous)) (2 (2 but) (2 (2 i) (2

are you ticklish ?
(2 (2 (2 (2 are) (2 you)) (2 ticklish)) (2 ?))
, yes , mostly on my feet .
(2 (2 (2 (2 ,) (2 (2 yes) (2 (2 ,) (2 mostly)))) (2 (2 on) (2 (2 my) (2 feet)))) (2 .))
, where are you ticklish ?
(1 (2 (2 (2 ,) (2 (2 where) (2 (2 are) (2 you)))) (2 ticklish)) (2 ?))

Done for 35
idk , you know that i'm here for you .
(1 (2 idk) (2 (2 ,) (2 (2 you) (2 (2 (2 know) (2 (2 that) (2 (2 i) (2 (2 (2 'm) (2 here)) (2 (2 for) (2 you)))))) (2 .)))))
, but u don't like to listen to me
(1 (2 ,) (1 (2 (2 but) (2 u)) (1 (2 (2 do) (2 n't)) (2 (2 like) (2 (2 to) (2 (2 listen) (2 (2 to) (3 me))))))))

Done for 36
love is my life , hayee ab .
(2 (4 love) (2 (2 (2 is) (2 (2 my) (2 (3 life) (2 (2 ,) (2 (2 hayee) (2 ab)))))) (2 .)))
you are love .
(3 (2 you) (3 (2 (2 are) (4 love)) (2 .)))
pure love .
(3 (4 pure) (2 (4 love) (2 .)))
forever <3 , yes right dear
(2 (2 forever) (2 (2 <) (2 (2 (2 3) (2 ,)) (2 (2 yes) (3 (3 right) (2 dear))))))

Done for 37
zombie attack , if a zombie wants to eat y

my boyfriend is so dumb .
(1 (2 (2 my) (2 boyfriend)) (1 (1 (2 is) (1 (2 so) (1 dumb))) (2 .)))
, have i said that ?
(2 (2 (2 (2 (2 ,) (2 (2 have) (2 i))) (2 said)) (2 that)) (2 ?))
, he doesnt reply me back
(1 (2 ,) (1 (2 he) (1 (2 doesnt) (2 (2 (2 reply) (3 me)) (2 back)))))

Done for 61
                          which is the decent way to call a girl m , i'd say first learn the spelling of relationship .
(1 (2 (2 which) (3 (2 is) (2 (2 the) (3 (3 decent) (2 (2 way) (2 (2 to) (2 (2 call) (2 (2 a) (2 (2 girl) (2 m)))))))))) (2 (2 ,) (2 (2 i) (2 (2 (2 'd) (2 (2 (2 say) (2 first)) (2 (2 learn) (2 (2 (2 the) (2 spelling)) (2 (2 of) (2 relationship)))))) (2 .)))))
, .
(2 (2 ,) (2 .))
yeah okay fine .
(3 (3 (3 yeah) (3 (2 okay) (3 fine))) (2 .))
wht to do to make a relationship better ?
(2 (2 (2 wht) (2 (2 to) (2 (2 do) (2 (2 to) (3 (2 (2 make) (2 (2 a) (2 relationship))) (4 better)))))) (2 ?))

Done for 62
you broke my heart , it was never mine to break </3 , see you are arrogant
(1 (2 yo

not mine , done for the day ?
(2 (2 (2 (2 (1 not) (2 mine)) (2 ,)) (2 (2 done) (2 (2 for) (2 (2 the) (2 day))))) (2 ?))
, can my meet to sexy girl
(2 (2 ,) (2 (2 (2 can) (2 my)) (2 (2 meet) (2 (2 to) (2 (4 sexy) (2 girl))))))

Done for 90
                                                                                          , i insult you
(1 (2 ,) (1 (2 i) (1 (1 insult) (2 you))))

Done for 91
so the story ?
(2 (2 so) (2 (2 the) (2 (2 story) (2 ?))))
, yeah indeed , tomorrow probably
(3 (2 ,) (3 (3 (2 (3 yeah) (2 indeed)) (2 ,)) (3 (2 tomorrow) (2 probably))))

Done for 92
no you aren't , oh i am , really ?
(1 (2 (1 no) (2 you)) (1 (1 (1 (1 (2 are) (2 n't)) (2 (2 ,) (2 (2 (2 (2 oh) (2 i)) (2 am)) (2 ,)))) (2 really)) (2 ?)))

Done for 93
but .
(2 (2 but) (2 .))
, then , i'm feeling nervous
(2 (2 ,) (2 (2 then) (2 (2 ,) (2 (2 i) (2 (2 'm) (2 (2 feeling) (1 nervous)))))))

Done for 94
do it , but why ?
(2 (2 (2 (2 (2 (2 do) (2 it)) (2 ,)) (2 but)) (2 why)) (2 ?))
, aaah .
(2 (2 ,) (2 

but i'm angry with you , but you look like you are , really
(1 (1 (1 (2 (2 but) (2 (2 i) (2 (2 'm) (1 (1 angry) (2 (2 with) (2 you)))))) (2 ,)) (2 but)) (3 (2 you) (3 (2 (2 (2 look) (2 (2 like) (2 (2 you) (2 are)))) (2 ,)) (2 really))))

Done for 119
# play lot , then go play it , , ?
(1 (1 (2 (2 #) (2 (2 play) (2 lot))) (2 (2 ,) (2 (2 (2 then) (2 (2 (2 go) (2 (2 play) (2 it))) (2 ,))) (2 ,)))) (2 ?))

Done for 120
isn't your liking that also signaling me to climb out of friendzone , oh are people still doing this whole friendzone thing ?
(1 (1 (1 (2 is) (2 n't)) (1 (2 (2 your) (3 liking)) (1 (2 that) (1 (2 also) (2 (2 signaling) (2 (3 me) (2 (2 to) (2 (2 (2 climb) (1 out)) (2 (2 of) (2 friendzone)))))))))) (2 (2 ,) (2 (2 oh) (1 (2 (2 (2 are) (2 people)) (2 (2 still) (2 (2 doing) (2 (2 this) (2 (2 whole) (2 (2 friendzone) (2 thing))))))) (2 ?)))))
, yah sadly
(2 (2 ,) (2 (2 yah) (1 sadly)))

Done for 121
i'm missing someone right now , who are you missing , may i ask ?
(1 (1 (1 (1 (2 (

cuz it broke on the side walk , what side is the walk side ?
(1 (2 (2 cuz) (2 (2 it) (2 (2 broke) (2 (2 on) (2 (2 the) (2 (2 side) (2 walk))))))) (2 (2 ,) (2 (2 (2 what) (2 side)) (2 (2 (2 is) (2 (2 the) (2 (2 walk) (2 side)))) (2 ?)))))
, me sad
(2 (2 ,) (2 (3 me) (1 sad)))

Done for 149
so which number is largest known prime number ?
(1 (2 so) (2 (2 (2 which) (2 number)) (2 (3 (2 is) (3 (2 (2 largest) (2 known)) (2 (3 prime) (2 number)))) (2 ?))))
, 13 - 3179 - 97 only these are the numbers for your question , , : (
(2 (2 ,) (1 (2 (2 (2 (2 (2 13) (2 (2 -) (2 3179))) (2 -)) (2 (2 97) (2 only))) (2 these)) (2 (2 are) (3 (2 (2 the) (2 numbers)) (3 (2 (2 (2 for) (2 (2 your) (2 question))) (2 ,)) (2 (2 ,) (2 (2 :) (2 -LRB-))))))))

Done for 150
okay relax its just a cat sticker , so this cat one emoji is your new addiction ?
(3 (2 okay) (3 (3 (2 (2 (2 relax) (2 its)) (2 just)) (3 (3 (3 (2 a) (2 (2 cat) (2 sticker))) (2 ,)) (3 (2 so) (3 (3 (2 this) (2 (2 cat) (2 (2 one) (2 emoji)))) (2 (2 

hello how are you , ?
(2 (3 (2 hello) (2 (2 how) (2 (2 (2 are) (2 you)) (2 ,)))) (2 ?))
, i'm just fine smiles anyway how are you ?
(2 (2 ,) (3 (2 i) (3 (2 'm) (3 (2 just) (3 (3 fine) (2 (2 smiles) (2 (2 (2 anyway) (2 how)) (2 (2 (2 are) (2 you)) (2 ?)))))))))
, good morning , i'm sad
(2 (2 ,) (3 (3 (3 good) (2 morning)) (2 (2 ,) (2 (2 i) (2 (2 'm) (1 sad))))))

Done for 179
nop , ok , hey
(3 (3 (2 (2 (2 nop) (2 ,)) (2 ok)) (2 ,)) (2 hey))

Done for 180
oil , you know , i am 100% sure about the oil , , cool
(2 (1 (2 oil) (1 (2 ,) (1 (1 (2 you) (1 (2 know) (2 (2 ,) (2 (2 (2 i) (1 (2 am) (2 (2 (2 100) (2 %)) (2 (2 sure) (2 (2 about) (2 (2 the) (2 oil))))))) (2 ,))))) (2 ,)))) (3 cool))

Done for 181
you activ all time , wow , i do not , you talk in sleep , when you sleep
(2 (2 you) (2 (3 (3 (3 (3 (2 activ) (3 (3 (2 (2 (2 all) (2 time)) (2 ,)) (3 (3 (3 wow) (2 ,)) (2 i))) (2 (2 do) (1 not)))) (2 ,)) (2 (2 you) (2 (2 talk) (2 (2 in) (1 sleep))))) (2 ,)) (1 (2 when) (2 (2 you) (1 sleep)))))

have you ever been sad ?
(2 (2 (2 (2 have) (2 (2 you) (2 ever))) (2 (2 been) (1 sad))) (2 ?))
, no!
(1 (2 ,) (2 (1 no) (2 !)))
but i heard good things!
(2 (2 but) (2 (2 i) (2 (2 (2 heard) (3 (3 good) (2 things))) (2 !))))
, been to janakpuri ?
(2 (2 ,) (2 (2 been) (2 (2 to) (2 (2 janakpuri) (2 ?)))))

Done for 210
book an uber ride to my home , is this allowed ?
(1 (3 (2 (2 book) (2 (2 an) (2 uber))) (3 (2 ride) (2 (2 to) (2 (2 my) (2 home))))) (2 (2 ,) (2 (2 (2 (2 is) (2 this)) (2 allowed)) (2 ?))))
how will you go home ?
(2 (2 how) (3 (3 (2 (2 will) (2 you)) (2 (2 go) (2 home))) (2 ?)))
, yes
(2 (2 ,) (2 yes))

Done for 211
you , not yet !
(1 (2 you) (2 (2 ,) (1 (1 not) (2 (2 yet) (2 !)))))
, my wife saw me chatting with you , i am screwed
(1 (2 ,) (1 (2 (2 my) (2 wife)) (1 (2 (3 (2 saw) (3 (3 me) (3 (2 chatting) (2 (2 with) (2 you))))) (2 ,)) (2 (2 i) (1 (2 am) (2 screwed))))))

Done for 212
very good chocolate , i prefer orange flavoured chocolates , i like this chocolate
(3 (3 (3 

what , what is a weird word , never talk to me again
(1 (2 what) (1 (2 ,) (1 (2 what) (1 (2 is) (1 (2 (2 (2 (2 a) (2 (1 weird) (2 word))) (2 ,)) (2 never)) (3 (2 (2 talk) (2 (2 to) (3 me))) (2 again)))))))

Done for 239
i might enjoy it , cool!
(4 (2 i) (3 (3 (3 (2 (2 might) (3 (4 enjoy) (2 it))) (2 ,)) (3 cool)) (2 !)))
you should!
(2 (2 you) (2 (2 should) (2 !)))
, well this has been great , but i have food to attend too ,
(2 (2 ,) (3 (3 (3 (3 (3 (3 well) (3 (2 this) (3 (2 has) (3 (2 been) (4 great))))) (2 ,)) (2 but)) (2 (2 i) (2 (2 have) (2 (2 food) (2 (2 to) (2 (2 attend) (2 too))))))) (2 ,)))

Done for 240
i feel very happy to talk like this , well im happy you�re happy!
(4 (2 i) (4 (4 (3 (3 (2 feel) (3 (2 very) (3 (4 happy) (2 (2 to) (2 (2 talk) (2 (2 like) (2 this))))))) (2 ,)) (3 (3 well) (3 (2 im) (3 (4 happy) (3 (2 you) (3 (2 re) (4 happy))))))) (2 !)))
so there!
(2 (2 so) (2 (2 there) (2 !)))
: - ) , at least i learn something from uh
(2 (2 :) (2 (2 (2 (2 -) (2 -RRB-)) (2 (

you ?
(2 (2 you) (2 ?))
, yeah probably should , , how's your day
(3 (2 ,) (3 (3 (3 (3 (3 (3 yeah) (2 probably)) (2 (2 should) (2 ,))) (2 ,)) (2 how)) (2 (2 's) (2 (2 your) (2 day)))))

Done for 267
yes awesome , thank you!
(3 (2 (2 yes) (3 awesome)) (3 (2 ,) (3 (2 (3 thank) (2 you)) (2 !))))
i like your photo too!
(2 (2 i) (2 (2 (2 (2 like) (2 (2 your) (2 photo))) (2 too)) (2 !)))
, i'm talking about the movie
(2 (2 ,) (2 (2 i) (2 (2 'm) (2 (2 talking) (2 (2 about) (2 (2 the) (2 movie)))))))

Done for 268
why am i even there in this world , because people avoid to talk about , no one loves me
(1 (1 (2 why) (1 (2 am) (1 (2 (2 (2 (2 i) (2 (2 even) (2 there))) (2 (2 in) (2 (2 this) (2 world)))) (2 ,)) (2 (2 because) (2 (2 people) (2 (1 avoid) (2 (2 to) (2 (2 talk) (2 about))))))))) (2 (2 ,) (2 (2 (1 no) (2 one)) (3 (3 loves) (3 me)))))

Done for 269
rubbish , indeed , tell romantic
(1 (1 (1 (1 (1 rubbish) (2 ,)) (2 indeed)) (2 ,)) (3 (2 tell) (2 romantic)))

Done for 270
wellcome , thank

yes , yes yes yes yes , ok good nights
(2 (2 (2 yes) (2 (2 ,) (2 yes))) (2 (2 yes) (3 (2 (2 yes) (2 yes)) (2 (2 ,) (2 (2 ok) (3 (3 good) (2 nights)))))))

Done for 297
i hate a , i , you hate me , sometimes
(1 (2 i) (1 (0 hate) (1 (3 (2 (2 (2 a) (2 ,)) (2 i)) (2 ,)) (1 (2 you) (1 (1 (1 (0 hate) (3 me)) (2 ,)) (2 sometimes))))))

Done for 298
what do you want ?
(2 (2 what) (2 (2 (2 (2 do) (2 you)) (2 want)) (2 ?)))
, duh of course , i have an exam tomorrow
(2 (2 ,) (2 (2 (2 duh) (2 (2 of) (2 (2 (2 course) (2 ,)) (2 i)))) (2 (2 (2 have) (2 (2 an) (2 exam))) (2 tomorrow))))

Done for 299
how was your day ?
(2 (2 how) (2 (2 (2 was) (2 (2 your) (2 day))) (2 ?)))
, my day was good : d how about yours ?
(3 (3 (2 (2 ,) (2 (2 my) (2 day))) (3 (2 was) (3 good))) (2 (2 :) (2 (2 (2 (2 d) (2 how)) (2 (2 about) (2 yours))) (2 ?))))
, bad very bad
(0 (2 ,) (0 (0 bad) (1 (2 very) (0 bad))))

Done for 300
take it from microsoft's database , we don't get wifi from our aircraft , then how are you chattin

what you don't want to , anything , i just want to laze about and do nothing , , except 1 thing
(1 (2 what) (1 (2 you) (1 (2 (2 do) (2 n't)) (1 (1 (1 (2 (2 (2 want) (2 to)) (2 ,)) (1 (2 anything) (1 (2 ,) (1 (1 (2 i) (1 (2 (2 (2 just) (2 (2 want) (2 (2 to) (2 (2 laze) (2 about))))) (2 and)) (2 (2 do) (1 nothing)))) (2 ,))))) (2 ,)) (2 (2 except) (2 (2 1) (2 thing)))))))

Done for 326
did you get anu teddy today , yep , , wowwww
(3 (2 (2 did) (2 you)) (3 (2 get) (3 (3 (3 (3 (2 (2 (2 anu) (2 (2 teddy) (2 today))) (2 ,)) (2 yep)) (2 ,)) (2 ,)) (2 wowwww))))

Done for 327
i will judge , be warned , i will judge the judging of the judge , what the hell
(2 (2 i) (1 (2 will) (1 (2 (2 judge) (1 (2 ,) (2 (2 (2 be) (1 warned)) (2 ,)))) (2 (2 i) (2 (2 will) (2 (2 judge) (1 (2 (2 the) (2 judging)) (1 (2 of) (1 (2 (2 (2 the) (2 judge)) (2 ,)) (1 (2 what) (1 (2 the) (1 hell))))))))))))

Done for 328
so sad , so sad ?
(1 (2 (2 so) (1 sad)) (2 (2 ,) (1 (2 (2 so) (1 sad)) (2 ?))))
about what ?
(2 (2 (2

In [None]:
from gensim.models.wrappers import FastText
mod = FastText.load_fasttext_format('/home/bt1/17CS10037/taddhita/cc.en.300.bin', encoding="utf8")


'''
    ##########################################################
    ##########################################################
    
    Fasttext model being loaded here. It might take 1 min to load.
    This bin file is of 10mb you can look for 5 gb variant of it.
    Also download the 300d English language '.bin' file not '.vec' file.
    
    ##########################################################
    ##########################################################
'''

In [None]:
def getEmbeddingMatrix(wordIndex):
    """Populate an embedding matrix using a word-index. If the word "happy" has an index 19,
       the 19th row in the embedding matrix should contain the embedding vector for the word "happy".
    Input:
        wordIndex : A dictionary of (word : index) pairs, extracted using a tokeniser
    Output:
        embeddingMatrix : A matrix where every row has 100 dimensional GloVe embedding
    """
    
    # Minimum word index of any word is 1. 
    '''
    ##########################################################
    ##########################################################
    
    If you want to use gloVe model you may use but main purpose of it is to generate wordtovec
    If possible look up for some better library than fasttext.bin file here which can deal with emoji as well
    
    ##########################################################
    ##########################################################
    '''
    i=0
    error_words =[]
    embeddingMatrix = np.zeros((len(wordIndex) + 1, EMBEDDING_DIM))
    for word, i in wordIndex.items():
        try:
            embeddingVector = mod[word]
            embeddingMatrix[i] = embeddingVector    
        except KeyError:
            print(word)
            error_words.append(word)
    print(len(error_words))
    return embeddingMatrix

# def getEmbeddingMatrix(wordIndex):
#     """Populate an embedding matrix using a word-index. If the word "happy" has an index 19,
#        the 19th row in the embedding matrix should contain the embedding vector for the word "happy".
#     Input:
#         wordIndex : A dictionary of (word : index) pairs, extracted using a tokeniser
#     Output:
#         embeddingMatrix : A matrix where every row has 100 dimensional GloVe embedding
#     """
#     embeddingsIndex = {}
#     # Load the embedding vectors from ther GloVe file
#     with io.open(os.path.join(gloveDir, 'glove.6B.100d.txt'), encoding="utf8") as f:
#         for line in f:
#             values = line.split()
#             word = values[0]
#             embeddingVector = np.asarray(values[1:], dtype='float32')
#             embeddingsIndex[word] = embeddingVector
    
#     print('Found %s word vectors.' % len(embeddingsIndex))
#     error_words =[]
#     # Minimum word index of any word is 1. 
#     embeddingMatrix = np.zeros((len(wordIndex) + 1, EMBEDDING_DIM))
#     for word, i in wordIndex.items():
#         embeddingVector = embeddingsIndex.get(word)
#         if embeddingVector is not None:
#             # words not found in embedding index will be all-zeros.
#             embeddingMatrix[i] = embeddingVector
#         else
#             print(word)
#             error_words.append(word)
#     print(len(error_words))
#     return embeddingMatrix

In [3]:
def getMetrics(predictions, ground):
    """Given predicted labels and the respective ground truth labels, display some metrics
    Input: shape [# of samples, NUM_CLASSES]
        predictions : Model output. Every row has 4 decimal values, with the highest belonging to the predicted class
        ground : Ground truth labels, converted to one-hot encodings. A sample belonging to Happy class will be [0, 1, 0, 0]
    Output:
        accuracy : Average accuracy
        microPrecision : Precision calculated on a micro level. Ref - https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin/16001
        microRecall : Recall calculated on a micro level
        microF1 : Harmonic mean of microPrecision and microRecall. Higher value implies better classification  
    """
    # [0.1, 0.3 , 0.2, 0.1] -> [0, 1, 0, 0]
    discretePredictions = to_categorical(predictions.argmax(axis=1))
    
    truePositives = np.sum(discretePredictions*ground, axis=0)
    falsePositives = np.sum(np.clip(discretePredictions - ground, 0, 1), axis=0)
    falseNegatives = np.sum(np.clip(ground-discretePredictions, 0, 1), axis=0)
    '''
    ##########################################################
    ##########################################################
    
    IMPORTANT
    Please add  code here or somewhere else to print the TruePositives,FalsePositives,FalseNegatives
    
    ##########################################################
    ##########################################################
    '''
    
    print("True Positives per class : ", truePositives)
    print("False Positives per class : ", falsePositives)
    print("False Negatives per class : ", falseNegatives)
    
    # ------------- Macro level calculation ---------------
    macroPrecision = 0
    macroRecall = 0
    # We ignore the "Others" class during the calculation of Precision, Recall and F1
    for c in range(1, NUM_CLASSES):
        precision = truePositives[c] / (truePositives[c] + falsePositives[c])
        macroPrecision += precision
        recall = truePositives[c] / (truePositives[c] + falseNegatives[c])
        macroRecall += recall
        f1 = ( 2 * recall * precision ) / (precision + recall) if (precision+recall) > 0 else 0
        print("Class %s : Precision : %.3f, Recall : %.3f, F1 : %.3f" % (label2emotion[c], precision, recall, f1))
    
    macroPrecision /= 3
    macroRecall /= 3
    macroF1 = (2 * macroRecall * macroPrecision ) / (macroPrecision + macroRecall) if (macroPrecision+macroRecall) > 0 else 0
    print("Ignoring the Others class, Macro Precision : %.4f, Macro Recall : %.4f, Macro F1 : %.4f" % (macroPrecision, macroRecall, macroF1))   
    
    # ------------- Micro level calculation ---------------
    truePositives = truePositives[1:].sum()
    falsePositives = falsePositives[1:].sum()
    falseNegatives = falseNegatives[1:].sum()    
    
    print("Ignoring the Others class, Micro TP : %d, FP : %d, FN : %d" % (truePositives, falsePositives, falseNegatives))
    
    microPrecision = truePositives / (truePositives + falsePositives)
    microRecall = truePositives / (truePositives + falseNegatives)
    
    microF1 = ( 2 * microRecall * microPrecision ) / (microPrecision + microRecall) if (microPrecision+microRecall) > 0 else 0
    # -----------------------------------------------------
    
    predictions = predictions.argmax(axis=1)
    ground = ground.argmax(axis=1)
    accuracy = np.mean(predictions==ground)
    
    print("Accuracy : %.4f, Micro Precision : %.4f, Micro Recall : %.4f, Micro F1 : %.4f" % (accuracy, microPrecision, microRecall, microF1))
    return accuracy, microPrecision, microRecall, microF1


In [4]:
def writeNormalisedData(dataFilePath, texts):
    """Write normalised data to a file
    Input:
        dataFilePath : Path to original train/test file that has been processed
        texts : List containing the normalised 3 turn conversations, separated by the <eos> tag.
    """
    
    '''
    ##########################################################
    ##########################################################
    
    You May ignore this function
    
    ##########################################################
    ##########################################################
    '''
    normalisedDataFilePath = dataFilePath.replace(".txt", "_normalised.txt")
    with io.open(normalisedDataFilePath, 'w', encoding='utf8') as fout:
        with io.open(dataFilePath, encoding='utf8') as fin:
            fin.readline()
            for lineNum, line in enumerate(fin):
                line = line.strip().split('\t')
                normalisedLine = texts[lineNum].strip().split('<eos>')
                fout.write(line[0] + '\t')
                # Write the original turn, followed by the normalised version of the same turn
                fout.write(line[1] + '\t' + normalisedLine[0] + '\t')
                fout.write(line[2] + '\t' + normalisedLine[1] + '\t')
                fout.write(line[3] + '\t' + normalisedLine[2] + '\t')
                try:
                    # If label information available (train time)
                    fout.write(line[4] + '\n')    
                except:
                    # If label information not available (test time)
                    fout.write('\n')


In [14]:
    '''
    ##########################################################
    ##########################################################
    
    Simple function calling.
    
    ##########################################################
    ##########################################################
    '''
    print("Processing training data...")
    trainIndices, trainTexts, labels = preprocessData(trainDataPath, mode="train")
    print(labels[:5])
    print(np.asarray(labels[:5]))
    labels = to_categorical(np.asarray(labels))
    print(labels[:5])
    # Write normalised text to file to check if normalisation works. Disabled now. Uncomment following line to enable   
    #writeNormalisedData(trainDataPath, trainTexts)
    print("Processing test data...")
    testIndices, testTexts = preprocessData(testDataPath, mode="test")
    #writeNormalisedData(testDataPath, testTexts)

    print("Extracting tokens...")
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(trainTexts)
    trainSequences = tokenizer.texts_to_sequences(trainTexts)
    testSequences = tokenizer.texts_to_sequences(testTexts)

    wordIndex = tokenizer.word_index
    print("Found %s unique tokens." % len(wordIndex))

    print("Populating embedding matrix...")
    embeddingMatrix = getEmbeddingMatrix(wordIndex)

    data = pad_sequences(trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
    
    print("Shape of training data tensor: ", data.shape)
    print("Shape of label tensor: ", labels.shape)
        
    # Randomize data
    np.random.shuffle(trainIndices)
    data = data[trainIndices]
    labels = labels[trainIndices]
      
    # Perform k-fold cross validation
    metrics = {"accuracy" : [],
               "microPrecision" : [],
               "microRecall" : [],
               "microF1" : []}

Processing training data...


FileNotFoundError: [Errno 2] No such file or directory: 'train.txt'

In [None]:
 '''
    ##########################################################
    ##########################################################
    
    Build model
    
    ##########################################################
    ##########################################################
    '''
def buildModel(embeddingMatrix):
    """Constructs the architecture of the model
    Input:
        embeddingMatrix : The embedding matrix to be loaded in the embedding layer.
    Output:
        model : A basic LSTM model
    """
    embeddingLayer = Embedding(embeddingMatrix.shape[0],
                                EMBEDDING_DIM,
                                weights=[embeddingMatrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    model = Sequential()
    model.add(embeddingLayer)
    model.add(LSTM(LSTM_DIM, dropout=DROPOUT))
    model.add(Dense(NUM_CLASSES, activation='sigmoid'))
    
    rmsprop = optimizers.rmsprop(lr=LEARNING_RATE)
    model.compile(loss='categorical_crossentropy',
                  optimizer=rmsprop,
                  metrics=['acc'])
    return model

In [15]:
    '''
    ##########################################################
    ##########################################################
    
    IMPORTANT
    doing k-fold validation and it will take time to execute maybe an hour or two
    Please analyse TruePositives,FalsePositives,FalseNegatives to gain insights why model is not working
    
    ##########################################################
    ##########################################################
    '''
    print("Starting k-fold cross validation...")
    for k in range(NUM_FOLDS):
        print('-'*40)
        print("Fold %d/%d" % (k+1, NUM_FOLDS))
        validationSize = int(len(data)/NUM_FOLDS)
        index1 = validationSize * k
        index2 = validationSize * (k+1)
            
        xTrain = np.vstack((data[:index1],data[index2:]))
        yTrain = np.vstack((labels[:index1],labels[index2:]))
        xVal = data[index1:index2]
        yVal = labels[index1:index2]
        print("Building model...")
        model = buildModel(embeddingMatrix)
        model.fit(xTrain, yTrain, 
                  validation_data=(xVal, yVal),
                  epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)

        predictions = model.predict(xVal, batch_size=BATCH_SIZE)
        accuracy, microPrecision, microRecall, microF1 = getMetrics(predictions, yVal)
        metrics["accuracy"].append(accuracy)
        metrics["microPrecision"].append(microPrecision)
        metrics["microRecall"].append(microRecall)
        metrics["microF1"].append(microF1)

Starting k-fold cross validation...
----------------------------------------
Fold 1/5


NameError: name 'data' is not defined

In [16]:
        
print("\n============= Metrics =================")
print("Average Cross-Validation Accuracy : %.4f" % (sum(metrics["accuracy"])/len(metrics["accuracy"])))
print("Average Cross-Validation Micro Precision : %.4f" % (sum(metrics["microPrecision"])/len(metrics["microPrecision"])))
print("Average Cross-Validation Micro Recall : %.4f" % (sum(metrics["microRecall"])/len(metrics["microRecall"])))
print("Average Cross-Validation Micro F1 : %.4f" % (sum(metrics["microF1"])/len(metrics["microF1"])))
    
print("\n======================================")
    
print("Retraining model on entire data to create solution file")
model = buildModel(embeddingMatrix)
model.fit(data, labels, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)
model.save('EP%d_LR%de-5_LDim%d_BS%d.h5'%(NUM_EPOCHS, int(LEARNING_RATE*(10**5)), LSTM_DIM, BATCH_SIZE))
# model = load_model('EP%d_LR%de-5_LDim%d_BS%d.h5'%(NUM_EPOCHS, int(LEARNING_RATE*(10**5)), LSTM_DIM, BATCH_SIZE))





NameError: name 'metrics' is not defined

In [17]:
    '''
    ##########################################################
    ##########################################################
    
    Prediction on dev-set
    
    ##########################################################
    ##########################################################
    '''
    print("Creating solution file...")
    testData = pad_sequences(testSequences, maxlen=MAX_SEQUENCE_LENGTH)
    predictions = model.predict(testData, batch_size=BATCH_SIZE)
    predictions = predictions.argmax(axis=1)

Creating solution file...


NameError: name 'testSequences' is not defined

In [120]:
    '''
    ##########################################################
    ##########################################################
    
    Dont modify it.It writes the txt file which is needed to submit at the site
    
    ##########################################################
    ##########################################################
    '''
    with io.open(solutionPath, "w", encoding="utf8") as fout:
        fout.write(unicode('\t'.join(["id", "turn1", "turn2", "turn3", "label"]) + '\n'))        
        with io.open(testDataPath, encoding="utf8") as fin:
            fin.readline()
            for lineNum, line in enumerate(fin):
                fout.write('\t'.join(line.strip().split('\t')[:4]) + '\t')
                fout.write(unicode(label2emotion[predictions[lineNum]] + '\n'))
    print("Completed. Model parameters: ")
    print("Learning rate : %.3f, LSTM Dim : %d, Dropout : %.3f, Batch_size : %d" 
          % (LEARNING_RATE, LSTM_DIM, DROPOUT, BATCH_SIZE))


NameError: name 'unicode' is not defined