In [1]:
import sys
import os
import io
import numpy as np
import matplotlib.pyplot as plt
import math
import time
import itertools
import shutil
import re
import tensorflow as tf
from tree import *
import numpy as np
from keras.preprocessing.text import Tokenizer
from utils import Vocab
from keras.utils import to_categorical
from gensim.models.wrappers import FastText

Using TensorFlow backend.


In [2]:
# Input files: training conversations and the test file passed to preprocessData.
trainDataPath = "Recursive-neural-networks-TensorFlow/train.txt"
testDataPath = "test_o.txt"
# Output file that will be generated. This file can be directly submitted.
solutionPath = "tests.txt"
# Path to directory where GloVe file is saved.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
gloveDir = "/home/bt1/17CS10037/starterkit/glove"
NUM_FOLDS = 5                # Value of K for the k-fold cross validation
NUM_CLASSES = 4              # Number of classes - Happy, Sad, Angry, Others
MAX_SEQUENCE_LENGTH = 100    # Token sequences are padded/truncated to this length (pad_sequences maxlen)
MAX_NB_WORDS = 20000         # Upper limit on the vocabulary size kept by the Keras Tokenizer (num_words)
EMBEDDING_DIM = 300          # The dimension of the word embeddings

In [3]:
# Mapping between integer class ids and emotion names (class 0 is the "others" class).
label2emotion = {0:"others", 1:"happy", 2: "sad", 3:"angry"}
emotion2label = {"others":0, "happy":1, "sad":2, "angry":3}


def _safeRatio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is not positive."""
    return float(numerator) / float(denominator) if denominator > 0 else 0.0


def getMetrics(predictions, ground):
    """Given predicted labels and the respective ground truth labels, display some metrics
    Input: shape [# of samples, number of classes]
        predictions : Model output. Every row has one score per class, with the highest belonging to the predicted class
        ground : Ground truth labels, converted to one-hot encodings. A sample belonging to Happy class will be [0, 1, 0, 0]
    Output:
        accuracy : Average accuracy
        microPrecision : Precision calculated on a micro level. Ref - https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin/16001
        microRecall : Recall calculated on a micro level
        microF1 : Harmonic mean of microPrecision and microRecall. Higher value implies better classification
    Notes:
        Class 0 ("others") is excluded from every precision / recall / F1 figure.
        A ratio whose denominator is zero (e.g. a class that is never predicted) is reported as 0
        instead of NaN.
    """
    predictions = np.asarray(predictions)
    ground = np.asarray(ground)
    # Take the number of classes from the ground truth so that the one-hot predictions always
    # have the same width as `ground`, even when the last class is never predicted.
    numClasses = ground.shape[1]

    # [0.1, 0.3 , 0.2, 0.1] -> [0, 1, 0, 0]
    discretePredictions = np.eye(numClasses)[predictions.argmax(axis=1)]

    truePositives = np.sum(discretePredictions*ground, axis=0)
    falsePositives = np.sum(np.clip(discretePredictions - ground, 0, 1), axis=0)
    falseNegatives = np.sum(np.clip(ground-discretePredictions, 0, 1), axis=0)

    print("True Positives per class : ", truePositives)
    print("False Positives per class : ", falsePositives)
    print("False Negatives per class : ", falseNegatives)

    # ------------- Macro level calculation ---------------
    macroPrecision = 0
    macroRecall = 0
    # We ignore the "Others" class during the calculation of Precision, Recall and F1
    for c in range(1, numClasses):
        precision = _safeRatio(truePositives[c], truePositives[c] + falsePositives[c])
        macroPrecision += precision
        recall = _safeRatio(truePositives[c], truePositives[c] + falseNegatives[c])
        macroRecall += recall
        f1 = _safeRatio(2 * recall * precision, precision + recall)
        print("Class %s : Precision : %.3f, Recall : %.3f, F1 : %.3f" % (label2emotion.get(c, str(c)), precision, recall, f1))

    # Average over the emotion classes only (every class except "others").
    macroPrecision = _safeRatio(macroPrecision, numClasses - 1)
    macroRecall = _safeRatio(macroRecall, numClasses - 1)
    macroF1 = _safeRatio(2 * macroRecall * macroPrecision, macroPrecision + macroRecall)
    print("Ignoring the Others class, Macro Precision : %.4f, Macro Recall : %.4f, Macro F1 : %.4f" % (macroPrecision, macroRecall, macroF1))

    # ------------- Micro level calculation ---------------
    truePositives = truePositives[1:].sum()
    falsePositives = falsePositives[1:].sum()
    falseNegatives = falseNegatives[1:].sum()

    print("Ignoring the Others class, Micro TP : %d, FP : %d, FN : %d" % (truePositives, falsePositives, falseNegatives))

    microPrecision = _safeRatio(truePositives, truePositives + falsePositives)
    microRecall = _safeRatio(truePositives, truePositives + falseNegatives)
    microF1 = _safeRatio(2 * microRecall * microPrecision, microPrecision + microRecall)
    # -----------------------------------------------------

    accuracy = np.mean(predictions.argmax(axis=1) == ground.argmax(axis=1))

    print("Accuracy : %.4f, Micro Precision : %.4f, Micro Recall : %.4f, Micro F1 : %.4f" % (accuracy, microPrecision, microRecall, microF1))
    return accuracy, microPrecision, microRecall, microF1

def writeNormalisedData(dataFilePath, texts):
    """Write normalised data to a file
    Input:
        dataFilePath : Path to original train/test file that has been processed
        texts : List containing the normalised 3 turn conversations, separated by the <eos> tag.
    Output:
        None. A sibling file named "<name>_normalised<ext>" is written next to dataFilePath.
        Each output row holds the id, then every original turn followed by its normalised
        version, then the label (left empty when the input row has no label column).
    """
    # Insert the suffix in front of the extension only. A plain str.replace(".txt", ...) would
    # also rewrite a ".txt" inside a directory name and, for a path without ".txt", would return
    # the input path unchanged so that opening it for writing truncates the source file.
    root, extension = os.path.splitext(dataFilePath)
    normalisedDataFilePath = root + "_normalised" + extension

    with io.open(normalisedDataFilePath, 'w', encoding='utf8') as fout:
        with io.open(dataFilePath, encoding='utf8') as fin:
            # Skip the header row.
            fin.readline()
            for lineNum, line in enumerate(fin):
                line = line.strip().split('\t')
                normalisedLine = texts[lineNum].strip().split('<eos>')
                fout.write(line[0] + '\t')
                # Write the original turn, followed by the normalised version of the same turn
                fout.write(line[1] + '\t' + normalisedLine[0] + '\t')
                fout.write(line[2] + '\t' + normalisedLine[1] + '\t')
                fout.write(line[3] + '\t' + normalisedLine[2] + '\t')
                if len(line) > 4:
                    # Label information available (train time)
                    fout.write(line[4] + '\n')
                else:
                    # Label information not available (test time)
                    fout.write('\n')

def getEmbeddingMatrix(wordIndex):
    """Build an embedding matrix from the GloVe file for the words of a word-index.

    Row i of the result holds the vector of the word whose index is i, e.g. if "happy" has
    index 19, row 19 contains the embedding vector for "happy".
    Input:
        wordIndex : A dictionary of (word : index) pairs, extracted using a tokeniser
    Output:
        embeddingMatrix : Array of shape (len(wordIndex) + 1, EMBEDDING_DIM); rows of words that
                          are missing from the GloVe file stay all-zero.
    NOTE: a later cell defines getEmbeddingMatrix again (FastText based), which shadows this one.
    """
    gloveFile = os.path.join(gloveDir, 'glove.6B.300d.txt')

    # Parse "<word> <v1> <v2> ..." lines into a word -> vector lookup.
    gloveVectors = {}
    with io.open(gloveFile, encoding="utf8") as handle:
        for row in handle:
            tokens = row.split()
            token = tokens[0]
            gloveVectors[token] = np.asarray(tokens[1:], dtype='float32')

    print('Found %s word vectors.' % len(gloveVectors))

    # Word indices start at 1, hence the extra (all-zero) row 0.
    matrix = np.zeros((len(wordIndex) + 1, EMBEDDING_DIM))
    for token, rowNumber in wordIndex.items():
        vector = gloveVectors.get(token)
        if vector is None:
            # Unknown word: leave its row as zeros.
            continue
        matrix[rowNumber] = vector

    return matrix

In [4]:
import emoji
import regex

def split_count(text):
    """Return the emoji found in `text`, in order of appearance (duplicates kept).

    The text is split into extended grapheme clusters (regex `\\X`); a cluster is kept when at
    least one of its characters is a key of `emoji.UNICODE_EMOJI`.
    """
    graphemes = regex.findall(r'\X', text)
    return [
        grapheme
        for grapheme in graphemes
        if any(char in emoji.UNICODE_EMOJI for char in grapheme)
    ]

# Two sample sentences used to try out split_count.
line1 = ["🤔 🙈 me así, se 😌 ds 💕👭👙 hello 👩🎓 emoji hello  how are 😊 you today🙅🏽🙅🏽"]
line=["money money and lots of money😍😍"]

# Show the emoji detected in the short sample.
counter = split_count(line[0])
print(counter)

# Replace every emoji of the long sample by its space-padded textual alias.
emojis = split_count(line1[0])
for c in emojis:
    # Pieces around the emoji, without the empty strings produced by adjacent emoji.
    lineSplit = [piece for piece in line1[0].split(c) if piece != '']
    cSpace = ' ' + emoji.UNICODE_EMOJI[c] + ' '
    line1[0] = cSpace.join(lineSplit)
print(line1[0])

['😍', '😍']
  :see-no-evil_monkey:  me así, se  :relieved_face:  ds  :two_hearts:  :two_women_holding_hands:  :bikini:  hello  :woman:  :graduation_cap:  emoji hello  how are  :smiling_face_with_smiling_eyes:  you today


In [5]:
import csv

In [6]:
def translator(user_string, slangFile="slang.txt"):
    """Expand slang abbreviations in a space-separated string.

    Input:
        user_string : Text whose words are separated by single spaces.
        slangFile : Path of the abbreviation table, one "ABBREVIATION=phrase" entry per line.
                    Defaults to "slang.txt" in the current working directory.
    Output:
        The text with every word that matches an abbreviation replaced by its phrase. A word
        matches when, after dropping every character outside [a-zA-Z0-9-_.] and upper-casing,
        it equals the left-hand side of an entry. Matching words are replaced as a whole
        (attached punctuation is dropped); all other words are returned untouched.
    Raises:
        IOError / OSError if the abbreviation file cannot be opened.
    """
    # Parse the abbreviation table once instead of re-reading the file for every word.
    expansions = {}
    with open(slangFile, "r") as myCSVfile:
        # Reading file as CSV with delimiter "=", so that abbreviations are in row[0] and phrases in row[1]
        for row in csv.reader(myCSVfile, delimiter="="):
            # Skip blank or malformed rows that have no "=phrase" part.
            if len(row) >= 2:
                # Later rows overwrite earlier ones, i.e. the last definition of an abbreviation wins.
                expansions[row[0]] = row[1]

    words = user_string.split(" ")
    for position, word in enumerate(words):
        # Removing special characters before the lookup.
        key = re.sub('[^a-zA-Z0-9-_.]', '', word).upper()
        if key in expansions:
            words[position] = expansions[key]

    # Blank line kept so that the printed cell output stays the same as before.
    print('')
    return ' '.join(words)


In [26]:
def preprocessData(dataFilePath, mode):
    """Load a tab-separated conversation file, normalise every row and parse it with text_to_ptb.

    Input:
        dataFilePath : Path to train/test file to be processed (first line is a header)
        mode : "train" mode returns labels. Any other mode doesn't return labels.
    Output:
        indices : Unique conversation ID list
        conversations : One entry per conversation: the value returned by text_to_ptb for the
                        normalised 3 turn conversation (turns joined with the <eos> tag)
        labels : [Only available in "train" mode] List of integer labels
    Side effects:
        Empties train_trees.ptb / test_trees.ptb (depending on mode) and the CoreNLP output file
        before processing, and prints every processed row.

    TODO (from the original notes): improve the pre-processing, e.g. the separation of emojis
    from text such as "hmm😞" or "now‼‼‼😣😤😤".
    """

    def _respace(text, separator, glue):
        """Split `text` on `separator`, drop the empty pieces and join the rest with `glue`."""
        return glue.join([piece for piece in text.split(separator) if piece != ''])

    isTrain = (mode == "train")
    # Runs of these characters are collapsed to one occurrence surrounded by spaces:
    # okay...sure -> okay . sure
    # okay???sure -> okay ? sure
    spacedPunctuation = ['.', '?', ',', '_', ':', '-', ';', '&', '#']
    duplicateSpacePattern = re.compile(r'\ +')

    indices = []
    conversations = []
    labels = []

    # Start from an empty tree file; text_to_ptb appends one line per conversation.
    open("train_trees.ptb" if mode=='train' else "test_trees.ptb", "w").close()

    # Empty the parser output file as well.
    with open('../stanford-corenlp-full-2018-10-05/output.txt','w') as f:
        f.write("")

    with io.open(dataFilePath, encoding="utf8") as finput:
        # Skip the header row.
        finput.readline()
        for line in finput:
            # Emoji are replaced by a single space.
            for symbol in split_count(line):
                line = _respace(line, symbol, ' ')
            for symbol in spacedPunctuation:
                line = _respace(line, symbol, ' ' + symbol + ' ')

            line = line.strip().split('\t')
            print(line)
            if isTrain:
                # Train data contains id, 3 turns and label
                labels.append(emotion2label[line[4]])

            # Join the three turns, remove any duplicate spaces, lower-case, expand slang.
            conv = ' <eos> '.join(line[1:4])
            conv = translator(duplicateSpacePattern.sub(' ', conv).lower())
            indices.append(int(line[0]))

            tree = text_to_ptb(line[4] if isTrain else "", conv, mode)
            conversations.append(tree)
            print("Done for " + str(len(conversations)))

    if isTrain:
        return indices, conversations, labels
    return indices, conversations


In [27]:
def text_to_ptb(label, text, mode):
    """Parse one conversation with the Stanford CoreNLP sentiment pipeline.

    Input:
        label : String written in front of the tree in the .ptb file ("" when unknown)
        text : Normalised conversation; "<eos>" and "." become ",", "?" is removed and "!" becomes " very"
        mode : 'train' appends to train_trees.ptb, anything else to test_trees.ptb
    Output:
        The complete text that the pipeline wrote to its output file.
    Side effects:
        Prints the sentence sent to the parser, overwrites the CoreNLP input/output files, runs
        Java through the shell and appends one line to the .ptb file.
    NOTE(review): the files are addressed through '../stanford-corenlp-full-2018-10-05' while the
    shell command uses '~/stanford-corenlp-full-2018-10-05'; these are only the same directory
    when the notebook runs from a direct sub-directory of the home directory -- confirm.
    """
    corenlpInput = '../stanford-corenlp-full-2018-10-05/input.txt'
    corenlpOutput = '../stanford-corenlp-full-2018-10-05/output.txt'

    print(" ")
    sentence = text.replace('<eos>',',').replace('.',',').replace('?','').replace('!',' very')
    print(sentence)

    with open(corenlpInput,'w') as f:
        f.write(sentence)

    # The exit status is not checked: if the pipeline fails, whatever is in output.txt is used.
    os.system("cd ~/stanford-corenlp-full-2018-10-05 && java -Xmx8g edu.stanford.nlp.sentiment.SentimentPipeline -file input.txt -output PENNTREES > output.txt")

    with open(corenlpOutput, 'r') as f2:
        tree = f2.read()

    # The first output line is dropped and the remaining lines are flattened onto a single line.
    treeFile = 'train_trees.ptb' if mode == 'train' else 'test_trees.ptb'
    with open(treeFile,'a') as f3:
        f3.write(label + " " + " ".join(tree.split("\n")[1:]) + "\n")

    return tree

In [21]:
# Parse the training file. preprocessData calls text_to_ptb (one Java/CoreNLP run) for every
# conversation, so this cell takes a long time on the full training set.
trainIndices, trainTexts, labels = preprocessData(trainDataPath, mode="train")

['0', "Don't worry  I'm girl", 'hmm how do I know if you are', "What's ur name ? ", 'others']

 
don't worry i'm girl   okay how do i know if you are   what's  your name  
Done for 1
['1', 'When did I ? ', 'saw many times i think  -  _  - ', 'No .  I never saw you', 'angry']

 
when did i    saw many times i think - _ -   no , i never saw you
Done for 2
['2', 'By', 'by Google Chrome', 'Where you live', 'others']

 
by   by google chrome   where you live
Done for 3
['3', 'U r ridiculous', 'I might be ridiculous but I am telling the truth . ', 'U little disgusting whore', 'angry']

 
You are ridiculous   i might be ridiculous but i am telling the truth ,   You little disgusting whore
Done for 4
['4', 'Just for time pass', 'wt do u do 4 a living then', 'Maybe', 'others']

 
just for time pass   wt do You do for a living then   maybe
Done for 5
['5', "I'm a dog person", 'youre so rude', 'Whaaaat why', 'others']

 
i'm a dog person   youre so rude   whaaaat why
Done for 6
['6', 'So whatsup'

KeyboardInterrupt: 

In [10]:
# Inspect the parsed training conversations; only defined once the training preprocessing cell has run to completion.
trainTexts

NameError: name 'trainTexts' is not defined

In [11]:
from gensim.models.wrappers import FastText
# Load the pre-trained FastText model (cc.en.300.bin) through gensim; `mod` is read by the
# FastText-based getEmbeddingMatrix.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
mod = FastText.load_fasttext_format('/home/bt1/17CS10037/taddhita/cc.en.300.bin', encoding="utf8")

In [28]:
def getEmbeddingMatrix(wordIndex):
    """Build an embedding matrix from the loaded FastText model `mod`.

    Input:
        wordIndex : A dictionary of (word : index) pairs, extracted using a tokeniser
    Output:
        embeddingMatrix : Array of shape (len(wordIndex) + 1, EMBEDDING_DIM) whose row i holds the
                          vector of the word with index i. Words for which the model raises
                          KeyError are printed and keep an all-zero row; their count is printed
                          at the end.
    """
    embeddingMatrix = np.zeros((len(wordIndex) + 1, EMBEDDING_DIM))
    missingWords = []
    for token, row in wordIndex.items():
        try:
            embeddingMatrix[row] = mod[token]
        except KeyError:
            # The model has no vector for this word.
            print(token)
            missingWords.append(token)
    print(len(missingWords))
    return embeddingMatrix

In [13]:
# One-hot encode the integer class labels. This overwrites `labels` with a function of itself,
# so run it exactly once after the training preprocessing cell.
labels = to_categorical(np.asarray(labels))

NameError: name 'labels' is not defined

In [14]:
# Inspect the embedding matrix; only defined after the cell that calls getEmbeddingMatrix(wordIndex) has run.
embeddingMatrix

NameError: name 'embeddingMatrix' is not defined

In [None]:
# The training preprocessing is disabled here because it is slow; `trainTexts` (and `labels`)
# must already be in memory from the training preprocessing cell before this cell is run.
# print("Processing training data...")
# trainIndices, trainTexts, labels = preprocessData(trainDataPath, mode="train")
# labels = to_categorical(np.asarray(labels))
# Write normalised text to file to check if normalisation works. Disabled now. Uncomment following line to enable
# writeNormalisedData(trainDataPath, trainTexts)
print("Processing test data...")
testIndices, testTexts = preprocessData(testDataPath, mode="test")
# writeNormalisedData(testDataPath, testTexts)

# Fit the vocabulary on the training texts only, then index both train and test texts with it.
print("Extracting tokens...")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(trainTexts)
trainSequences = tokenizer.texts_to_sequences(trainTexts)
testSequences = tokenizer.texts_to_sequences(testTexts)

wordIndex = tokenizer.word_index
print("Found %s unique tokens." % len(wordIndex))

print("Populating embedding matrix...")
embeddingMatrix = getEmbeddingMatrix(wordIndex)

Processing test data...
['0', 'Then dont ask me', 'YOURE A GUY NOT AS IF YOU WOULD UNDERSTAND', 'IM NOT A GUY FUCK OFF', 'angry']

 
then dont ask me   youre a guy not as if you would understand   im not a guy fuck off
Done for 1
['1', 'Mixed things  such as ? ', 'the things you do . ', 'Have you seen minions ? ', 'others']

 
mixed things such as    the things you do ,   have you seen minions  
Done for 2
['2', "Today I'm very happy", "and I'm happy for you  ", 'I will be marry', 'happy']

 
today i'm very happy   and i'm happy for you   i will be marry
Done for 3
['3', 'Woah bring me some', 'left it there oops', 'Brb', 'others']

 
woah bring me some   left it there oops   Be right back
Done for 4
['4', 'it is thooooo', 'I said soon master . ', 'he is pressuring me', 'others']

 
it is thooooo   i said soon master ,   he is pressuring me
Done for 5
['5', 'Wont u ask my age ? ', 'hey at least I age well!', 'Can u tell me how can we get closer ? ', 'others']

 
wont You ask my age    h

Done for 49
['49', 'u were offering something right ? ', "I don't remember the wrong part . ", 'tell me the correct part', 'others']

 
You were offering something right    i don't remember the wrong part ,   tell me the correct part
Done for 50
['50', "I'm not talking to u anymore", 'what I do ? ', 'Clear ur glitch s', 'angry']

 
i'm not talking to You anymore   what i do    clear  your glitch s
Done for 51
['51', 'Do you study', 'yes gtg ttyl', 'Where are you from', 'others']

 
do you study   yes Got to go Talk to you later   where are you from
Done for 52
['52', 'Okay', 'Hello! How are you ? ', 'Good n u', 'others']

 
okay   hello very how are you    good n You
Done for 53
['53', 'No  . l human', 'Are you sure ? ', 'Yeah', 'others']

 
no , l human   are you sure    yeah
Done for 54
['54', 'Talking to me ? ', "but I didn't help", 'Does that work ? ', 'others']

 
talking to me    but i didn't help   does that work  
Done for 55
['55', 'Fight', "I don't fight . ", 'What are you do

Done for 96
['96', 'My hobbies are to do gym', 'your gym is more expensive than mine!!!', 'So what', 'others']

 
my hobbies are to do gym   your gym is more expensive than mine very very very   so what
Done for 97
['97', 'What u are saying', "Some people like me love everyone but hate everything .  That's what I'm saying . ", 'Tell me what i do', 'others']

 
what You are saying   some people like me love everyone but hate everything , that's what i'm saying ,   tell me what i do
Done for 98
['98', "Ohhh that's nice sense of humor", "Oh shucks .  You've got me blushing .   ", 'Please carry on blushing', 'happy']

 
ohhh that's nice sense of humor   Overheard shucks , you've got me blushing ,   please carry on blushing
Done for 99
['99', 'To chat with you', 'Hello', 'No hello', 'others']

 
to chat with you   hello   no hello
Done for 100
['100', "I don't like watching TV and movies", 'It is absolutely ohk to watch a movie with subtitles switched on . ', 'Ok', 'others']

 
i don't like

Done for 143
['143', "What's wrong with you", "clothes is what's wrong .  I've been good .  How you been ? ", 'You are wierd', 'others']

 
what's wrong with you   clothes is what's wrong , I Have been good , how you been    you are wierd
Done for 144
['144', 'what is your fathers name ? ', "What's in the name ? ", 'what is name of your father ? ', 'others']

 
what is your fathers name    what's in the name    what is name of your father  
Done for 145
['145', 'text me after 7 : 30pm', 'dint have a phone', 'you can call me', 'others']

 
text me after 7 : 30pm   dint have a phone   you can call me
Done for 146
['146', 'Lovely', 'AWWWW thank you!   ', 'And sexy baby', 'others']

 
lovely   awwww thank you very   and sexy baby
Done for 147
['147', 'You Coming', "I'm trying to  .  You going  ? ", 'No you Coming', 'others']

 
you coming   i'm trying to , you going    no you coming
Done for 148
['148', 'Okay  your  pictures ', 'I like yours too', 'Send me   your  pics', 'others']

 
okay 

Done for 188
['188', 'Do u like her  : )', 'YUP', 'Which movie of hers ? ', 'others']

 
do You like her : )   Yes   which movie of hers  
Done for 189
['189', 'Hmmm', 'mmmmm', 'Alri8', 'others']

 
Yes   mmmmm   alri8
Done for 190
['190', 'none is you', 'I am!', 'gender ? ', 'others']

 
none is you   i am very   gender  
Done for 191
['191', "Didn't know that", 'Yeah .  Brilliant series though .   : )', 'Yup', 'others']

 
didn't know that   yeah , brilliant series though , : )   Yes
Done for 192
['192', 'What is my gender ? ', 'I like women', 'What is write your programmin language ? ', 'others']

 
what is my gender    i like women   what is write your programmin language  
Done for 193
['193', 'Who types ? ', 'The one who are like me . ', 'Can u give me some money', 'others']

 
who types    the one who are like me ,   can You give me some money
Done for 194
['194', 'Funny', 'I know I am . ', 'Yep ur funny', 'happy']

 
funny   i know i am ,   yep  your funny
Done for 195
['195', 

Done for 236
['236', 'Thought', "nope .  What is obvious doesn't need thought . ", 'Yeah thats true', 'others']

 
thought   nope , what is obvious doesn't need thought ,   yeah thats true
Done for 237
['237', 'Yeah sure', 'hmm do you doubt it ? ', 'You bored me', 'others']

 
yeah sure   okay do you doubt it    you bored me
Done for 238
['238', 'well said', 'well thank you  : )', 'so how are you', 'others']

 
well said   well thank you : )   so how are you
Done for 239
['239', 'hey .  nothing much', 'How are you doing ? ', 'superb', 'happy']

 
hey , nothing much   how are you doing    superb
Done for 240
['240', 'Shut up', "I don't even know", "What you don't know", 'others']

 
shut up   i don't even know   what you don't know
Done for 241
['241', 'Y not', 'reasons . ', 'Hey leave it', 'others']

 
Yawn not   reasons ,   hey leave it
Done for 242
['242', 'Yes I was very busy', 'haha last year in uni yet you go to the freshers event lol  ', "I didn't go", 'sad']

 
yes i Wild *a* gu

In [36]:
from keras.preprocessing.sequence import pad_sequences

In [39]:
# Pad/truncate every training token sequence to MAX_SEQUENCE_LENGTH entries.
data = pad_sequences(trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
 

In [41]:
   
print("Shape of training data tensor: ", data.shape)
# print("Shape of label tensor: ", labels.shape)

# Randomize data (currently disabled)
# np.random.shuffle(trainIndices)
# data = data[trainIndices]
# labels = labels[trainIndices]

# Perform k-fold cross validation: one empty score list per metric.
metrics = {
    metricName: []
    for metricName in ("accuracy", "microPrecision", "microRecall", "microF1")
}

Shape of training data tensor:  (201, 100)


In [42]:
# Dump every parsed training conversation (sentence plus its sentiment tree); the output is very long.
print(trainTexts)

["don't worry i'm girl   okay how do i know if you are   what's  your name\n(2 (2 (2 do) (2 n't)) (2 (1 worry) (2 (2 i) (2 (2 'm) (2 (2 girl) (2 (2 okay) (2 (2 how) (2 (2 do) (2 (2 i) (2 (2 know) (2 (2 if) (2 (2 you) (2 (2 are) (2 (2 what) (2 (2 's) (2 (2 your) (2 name)))))))))))))))))\n", 'when did i    saw many times i think - _ -   no , i never saw you\n(1 (2 when) (1 (2 (2 did) (2 i)) (1 (2 saw) (1 (2 (2 (2 (2 (2 many) (2 times)) (2 i)) (2 (2 (2 think) (2 -)) (2 _))) (2 -)) (1 (1 no) (2 (2 ,) (2 (2 i) (2 (2 never) (2 (2 saw) (2 you))))))))))\n', 'by   by google chrome   where you live\n(2 (2 by) (2 (2 by) (2 (2 (2 google) (2 chrome)) (2 (2 where) (2 (2 you) (2 live))))))\n', 'You are ridiculous   i might be ridiculous but i am telling the truth ,   You little disgusting whore\n(1 (2 You) (1 (2 are) (1 (1 ridiculous) (1 (2 i) (1 (2 might) (1 (1 (2 be) (1 ridiculous)) (1 (2 but) (2 (2 i) (1 (2 am) (2 (3 telling) (1 (3 (2 (2 (2 the) (3 truth)) (2 ,)) (2 You)) (1 (2 (2 little) (1 disgu

In [61]:
import os, sys, shutil, time, itertools
import math, random
from collections import OrderedDict, defaultdict

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

import utils
import tree

# File name template for saved weights; Config fills it with (embed_size, l2, lr).
MODEL_STR = 'rnn_embed=%d_l2=%f_lr=%f.weights'
# Presumably the directory the weight files are stored in -- its use is not visible in this part of the file.
SAVE_DIR = './weights/'


class Config(object):
    """Hyper-parameters of the recursive model, kept as class attributes.

    NOTE(review): most attributes are consumed by code that is not visible here, so the
    per-attribute comments below are assumptions based on the names -- confirm.
    """
    embed_size = 300           # embedding width; the composition weights are [2*embed_size, embed_size]
    label_size = 4             # presumably the number of emotion classes -- TODO confirm
    early_stopping = 2         # presumably epochs without improvement before stopping -- TODO confirm
    anneal_threshold = 0.99    # presumably a loss-ratio threshold for learning-rate annealing -- TODO confirm
    anneal_by = 1.5            # presumably the factor the learning rate is divided by -- TODO confirm
    max_epochs = 30            # presumably the maximum number of training epochs -- TODO confirm
    lr = 0.01                  # presumably the learning rate -- TODO confirm
    l2 = 0.02                  # presumably the L2 regularisation weight -- TODO confirm

    # Evaluated once when the class body runs: later changes to embed_size / l2 / lr do not update it.
    model_name = MODEL_STR % (embed_size, l2, lr)


In [69]:
class RNN_Model():
    """Recursive neural network over binary trees with label-dependent composition.

    Every internal node combines the vectors of its two children with one of
    nine (weight, bias) pairs. The pair is selected by the labels of the left
    and right child: label 0 maps to the ``minus`` variables, label 1 to the
    ``0`` variables and any other label to the ``plus`` variables. The vector
    of the last node written (assumed to be the root -- this relies on
    ``tree.leftTraverse`` visiting children before their parent, TODO confirm)
    is projected to ``config.label_size`` logits.

    The model is trained on the root label only; ``full_loss`` adds an L2
    penalty on the composition and projection weights to ``root_loss``.
    """

    # Name fragments of the composition variables, in the order they are created.
    _TAGS = ('plus', '0', 'minus')

    def __init__(self, config, word_index=None, embedding_matrix=None):
        """Load the data and build the TensorFlow graph.

        Args:
            config: object exposing the hyperparameters of ``Config``.
            word_index: mapping word -> row of the embedding matrix. Defaults
                to the notebook-level ``wordIndex``.
            embedding_matrix: 2-D array of pre-trained word vectors. Defaults
                to the notebook-level ``embeddingMatrix``.
        """
        self.config = config
        # Fall back to the notebook globals so existing ``RNN_Model(config)`` calls keep working.
        self.word_index = wordIndex if word_index is None else word_index
        self.embedding_matrix = np.asarray(
            embeddingMatrix if embedding_matrix is None else embedding_matrix,
            dtype=np.float32)
        embed_size = self.config.embed_size

        # Load train data and build vocabulary
        self.train_data, self.dev_data, self.test_data = simplified_data(200, 50, 50)
        self.vocab = utils.Vocab()
        train_sents = [t.get_words() for t in self.train_data]
        self.vocab.construct(list(itertools.chain.from_iterable(train_sents)))

        # add input placeholders (one entry per tree node unless noted otherwise)
        self.is_leaf_placeholder = tf.placeholder(tf.bool, (None), name='is_leaf_placeholder')
        self.left_children_placeholder = tf.placeholder(tf.int32, (None), name='left_children_placeholder')
        self.right_children_placeholder = tf.placeholder(tf.int32, (None), name='right_children_placeholder')
        self.node_word_indices_placeholder = tf.placeholder(tf.int32, (None), name='node_word_indices_placeholder')
        self.labels_placeholder = tf.placeholder(tf.int32, (None), name='labels_placeholder')
        # [left label, right label] per node; leaves are fed [-1, -1].
        self.childlabels_placeholder = tf.placeholder(tf.int32, (None, 2), name='childlabels_placeholder')
        # Label of the whole tree. Integer, because the sparse cross-entropy needs class ids.
        self.label1_placeholder = tf.placeholder(tf.int32, (None), name='label1_placeholder')
        # Fed on every training step so that annealing config.lr actually changes the step size.
        self.lr_placeholder = tf.placeholder(tf.float32, (), name='lr_placeholder')

        # Frozen embedding table. It is sized from the matrix that will be fed
        # so the assign op can never hit a shape mismatch. The values are
        # loaded by run_epoch(new_model=True) in the session that trains, not
        # in a throwaway session whose state would be lost.
        embedding_shape = list(self.embedding_matrix.shape)
        W = tf.Variable(tf.constant(0.0, shape=embedding_shape), trainable=False, name="W")
        self.embedding_placeholder = tf.placeholder(tf.float32, embedding_shape)
        self.embedding_init = W.assign(self.embedding_placeholder)

        # One (weight, bias) pair per (left tag, right tag) combination, keyed by
        # the concatenated tags, e.g. 'plusminus'. Variable names are unchanged.
        pair_keys = [left + right for left in self._TAGS for right in self._TAGS]
        with tf.variable_scope('Composition'):
            weights = {}
            for key in pair_keys:
                weights[key] = tf.get_variable("W" + key, shape=[2 * embed_size, embed_size])
            biases = {}
            for key in pair_keys:
                biases[key] = tf.get_variable("b" + key, shape=[1, embed_size])
        with tf.variable_scope('Projection'):
            U = tf.get_variable('U', [embed_size, self.config.label_size])
            bs = tf.get_variable('bs', [1, self.config.label_size])

        # build recursive graph
        tensor_array = tf.TensorArray(tf.float32, size=0, dynamic_size=True,
                                      clear_after_read=False, infer_shape=False)

        def embed_word(word_index):
            """Look up one word vector and return it as a [1, embed_size] row."""
            with tf.device('/cpu:0'):
                return tf.expand_dims(tf.gather(W, word_index), 0)

        def combine_children(left_tensor, right_tensor, child_label):
            """Compose two child vectors with the pair chosen by the child labels."""
            child_tensor = tf.concat([left_tensor, right_tensor], 1)
            left_label = child_label[0]
            right_label = child_label[1]

            def compose(key):
                return lambda: tf.nn.relu(tf.matmul(child_tensor, weights[key]) + biases[key])

            def select_by_right(left_tag):
                # Ordered (predicate, branch) pairs: the first true predicate wins.
                return lambda: tf.case(
                    [(tf.equal(right_label, 0), compose(left_tag + 'minus')),
                     (tf.equal(right_label, 1), compose(left_tag + '0'))],
                    default=compose(left_tag + 'plus'))

            return tf.case(
                [(tf.equal(left_label, 0), select_by_right('minus')),
                 (tf.equal(left_label, 1), select_by_right('0'))],
                default=select_by_right('plus'))

        def loop_body(tensor_array, i):
            node_is_leaf = tf.gather(self.is_leaf_placeholder, i)
            node_word_index = tf.gather(self.node_word_indices_placeholder, i)
            left_child = tf.gather(self.left_children_placeholder, i)
            right_child = tf.gather(self.right_children_placeholder, i)
            child_label = tf.gather(self.childlabels_placeholder, i)
            node_tensor = tf.cond(
                node_is_leaf,
                lambda: embed_word(node_word_index),
                lambda: combine_children(tensor_array.read(left_child),
                                         tensor_array.read(right_child),
                                         child_label))
            tensor_array = tensor_array.write(i, node_tensor)
            i = tf.add(i, 1)
            return tensor_array, i

        loop_cond = lambda tensor_array, i: tf.less(i, tf.squeeze(tf.shape(self.is_leaf_placeholder)))
        self.tensor_array, _ = tf.while_loop(loop_cond, loop_body, [tensor_array, 0], parallel_iterations=1)

        # add projection layer
        self.logits = tf.matmul(self.tensor_array.concat(), U) + bs
        self.root_logits = tf.matmul(self.tensor_array.read(self.tensor_array.size() - 1), U) + bs
        self.root_prediction = tf.squeeze(tf.argmax(self.root_logits, 1))

        # add loss layer
        regularization_loss = self.config.l2 * tf.add_n(
            [tf.nn.l2_loss(weights[key]) for key in pair_keys] + [tf.nn.l2_loss(U)])
        self.root_loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=tf.reshape(self.label1_placeholder, [-1]), logits=self.root_logits))
        # run_epoch reports and optimises this value; the original never defined it
        # and left regularization_loss unused.
        self.full_loss = regularization_loss + self.root_loss

        # add training op
        self.train_op = tf.train.GradientDescentOptimizer(self.lr_placeholder).minimize(self.full_loss)

        # A single Saver for the whole model; creating one per call keeps adding ops to the graph.
        self.saver = tf.train.Saver()

    def _lookup_word(self, word):
        """Return the embedding row for ``word``: exact match, then lower-case, else 0.

        NOTE(review): row 0 is assumed to be a padding / unknown-word row, as in
        Keras ``Tokenizer`` indices which start at 1 -- confirm for ``wordIndex``.
        """
        index = self.word_index.get(word)
        if index is None:
            index = self.word_index.get(word.lower(), 0)
        return index

    def build_feed_dict(self, node, label):
        """Flatten the tree rooted at ``node`` into the placeholder feeds.

        Args:
            node: root node of the tree.
            label: label of the whole tree, fed to ``label1_placeholder``.

        Returns:
            dict mapping every data placeholder to its per-node values. Child
            indices, word indices and child labels use -1 where not applicable.
        """
        nodes_list = []
        tree.leftTraverse(node, lambda node, args: args.append(node), nodes_list)
        node_to_index = OrderedDict()
        for i, current in enumerate(nodes_list):
            node_to_index[current] = i
        feed_dict = {
            self.is_leaf_placeholder: [n.isLeaf for n in nodes_list],
            self.left_children_placeholder: [
                node_to_index[n.left] if not n.isLeaf else -1 for n in nodes_list],
            self.right_children_placeholder: [
                node_to_index[n.right] if not n.isLeaf else -1 for n in nodes_list],
            # Only leaves carry a word; internal nodes get a dummy index that is never read.
            self.node_word_indices_placeholder: [
                self._lookup_word(n.word) if n.isLeaf else -1 for n in nodes_list],
            self.labels_placeholder: [n.label for n in nodes_list],
            # Always a pair, so the feed is a rectangular [num_nodes, 2] array.
            self.childlabels_placeholder: [
                [n.left.label, n.right.label] if not n.isLeaf else [-1, -1] for n in nodes_list],
            self.label1_placeholder: [label],
        }
        return feed_dict

    def predict(self, trees, weights_path, get_loss=False):
        """Make predictions from the provided model.

        Args:
            trees: iterable of trees exposing ``root`` and ``label``.
            weights_path: checkpoint prefix to restore.
            get_loss: when True, also collect the root loss of every tree.

        Returns:
            (results, losses, logits): predicted root classes, root losses
            (empty unless ``get_loss``) and root logits, one entry per tree.
        """
        results = []
        losses = []
        logits = []
        with tf.Session() as sess:
            self.saver.restore(sess, weights_path)
            for sample in trees:
                feed_dict = self.build_feed_dict(sample.root, sample.label)
                if get_loss:
                    logit, root_prediction, loss = sess.run(
                        [self.root_logits, self.root_prediction, self.root_loss], feed_dict=feed_dict)
                    losses.append(loss)
                else:
                    logit, root_prediction = sess.run(
                        [self.root_logits, self.root_prediction], feed_dict=feed_dict)
                results.append(root_prediction)
                logits.append(logit)
        return results, losses, logits

    def run_epoch(self, new_model=False, verbose=True):
        """Train for one pass over the shuffled training trees and evaluate.

        Args:
            new_model: initialise all variables and load the embeddings when
                True; otherwise continue from the temporary checkpoint.
            verbose: print a running mean of the loss.

        Returns:
            (train_acc, val_acc, loss_history, mean validation root loss).
        """
        loss_history = []
        checkpoint_prefix = SAVE_DIR + '%s.temp' % self.config.model_name
        # training
        random.shuffle(self.train_data)
        with tf.Session() as sess:
            if new_model:
                sess.run(tf.global_variables_initializer())
                # Load the pre-trained vectors after initialisation, which resets W to zeros.
                sess.run(self.embedding_init,
                         feed_dict={self.embedding_placeholder: self.embedding_matrix})
            else:
                self.saver.restore(sess, checkpoint_prefix)
            for step, sample in enumerate(self.train_data):
                feed_dict = self.build_feed_dict(sample.root, sample.label)
                feed_dict[self.lr_placeholder] = self.config.lr
                loss_value, _ = sess.run([self.full_loss, self.train_op], feed_dict=feed_dict)
                loss_history.append(loss_value)
                if verbose:
                    sys.stdout.write('\r{} / {} :    loss = {}'.format(step, len(self.train_data), np.mean(loss_history)))
                    sys.stdout.flush()
            if not os.path.exists(SAVE_DIR):
                os.makedirs(SAVE_DIR)
            self.saver.save(sess, checkpoint_prefix)
        # statistics
        train_preds, _, train_logits = self.predict(self.train_data, checkpoint_prefix)
        val_preds, val_losses, val_logits = self.predict(self.dev_data, checkpoint_prefix, get_loss=True)
        train_labels = [t.label for t in self.train_data]
        val_labels = [t.label for t in self.dev_data]
        train_acc = np.equal(train_preds, train_labels).mean()
        val_acc = np.equal(val_preds, val_labels).mean()

        print ('Training acc (only root node): {}'.format(train_acc))
        print ('Valiation acc (only root node): {}'.format(val_acc))
        print (self.make_conf(train_labels, train_preds))
        print (self.make_conf(val_labels, val_preds))
        return train_acc, val_acc, loss_history, np.mean(val_losses)

    def train(self, verbose=True):
        """Run ``config.max_epochs`` epochs with learning-rate annealing.

        The checkpoint with the best validation loss is copied to
        ``SAVE_DIR + config.model_name``.

        Returns:
            dict with 'loss_history', 'train_acc_history' and 'val_acc_history'.
        """
        complete_loss_history = []
        train_acc_history = []
        val_acc_history = []
        prev_epoch_loss = float('inf')
        best_val_loss = float('inf')
        best_val_epoch = 0
        stopped = -1
        for epoch in range(self.config.max_epochs):
            print ('epoch %d' % epoch)
            if epoch == 0:
                train_acc, val_acc, loss_history, val_loss = self.run_epoch(new_model=True)
            else:
                train_acc, val_acc, loss_history, val_loss = self.run_epoch()
            complete_loss_history.extend(loss_history)
            train_acc_history.append(train_acc)
            val_acc_history.append(val_acc)

            # lr annealing
            epoch_loss = np.mean(loss_history)
            if epoch_loss > prev_epoch_loss * self.config.anneal_threshold:
                self.config.lr /= self.config.anneal_by
                print ('annealed lr to %f' % self.config.lr)
            prev_epoch_loss = epoch_loss

            # save if model has improved on val
            if val_loss < best_val_loss:
                self._copy_checkpoint(SAVE_DIR + '%s.temp' % self.config.model_name,
                                      SAVE_DIR + '%s' % self.config.model_name)
                best_val_loss = val_loss
                best_val_epoch = epoch

            # if model has not improved for a while, record the epoch.
            # NOTE(review): the break is disabled in the original, so training always
            # runs all epochs and `stopped` ends up as the last epoch over the limit.
            if epoch - best_val_epoch > self.config.early_stopping:
                stopped = epoch
                #break
        if verbose:
            sys.stdout.write('\r')
            sys.stdout.flush()

        print ('\n\nstopped at %d\n' % stopped)
        return {'loss_history': complete_loss_history,
                'train_acc_history': train_acc_history,
                'val_acc_history': val_acc_history}

    def make_conf(self, labels, predictions):
        """Return the confusion matrix, rows = true label, columns = prediction."""
        confmat = np.zeros([self.config.label_size, self.config.label_size])
        for true_label, predicted in zip(labels, predictions):
            confmat[true_label, predicted] += 1
        return confmat

    @staticmethod
    def _copy_checkpoint(src_prefix, dst_prefix):
        """Copy every file whose path starts with ``src_prefix`` to ``dst_prefix`` + suffix.

        A Saver checkpoint may be several files sharing one prefix
        ('.index', '.meta', '.data-...'), so copying the bare prefix as if it
        were a single file is not enough.

        Returns:
            list of the files written.

        Raises:
            IOError: when no file matches ``src_prefix``.
        """
        import glob
        copied = []
        for path in glob.glob(glob.escape(src_prefix) + '*'):
            target = dst_prefix + path[len(src_prefix):]
            shutil.copyfile(path, target)
            copied.append(target)
        if not copied:
            raise IOError('no checkpoint files found for prefix %r' % src_prefix)
        return copied

    @staticmethod
    def plot_loss_history(stats):
        """Plot ``stats['loss_history']``, save it to loss_history.png and show it."""
        plt.plot(stats['loss_history'])
        plt.title('Loss history')
        plt.xlabel('Iteration')
        plt.ylabel('Loss')
        plt.savefig('loss_history.png')
        plt.show()

      
        
        

In [70]:
def test_RNN():
    """Train the recursive model end to end and report root-level test accuracy.

    Builds a ``Config`` and an ``RNN_Model``, trains it, plots and saves the
    loss history, then evaluates the best saved weights on the model's test
    trees. When debugging, set ``max_epochs`` in the Config object to 1 so you
    can iterate rapidly.
    """
    config = Config()
    model = RNN_Model(config)
    start_time = time.time()
    stats = model.train(verbose=True)
    print ('Training time: {}'.format(time.time() - start_time))

    plt.plot(stats['loss_history'])
    plt.title('Loss history')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.savefig("loss_history.png")
    plt.show()

    print ('Test')
    print ('=-=-=')
    # predict returns (predictions, losses, logits); only the predictions are needed here.
    predictions, _, _ = model.predict(model.test_data, SAVE_DIR + '%s' % model.config.model_name)
    labels = [t.label for t in model.test_data]
    test_acc = np.equal(predictions, labels).mean()
    print ('Test acc: {}'.format(test_acc))


In [71]:
# Run the full train / evaluate pipeline defined in test_RNN.
test_RNN()

Loading train trees..


TypeError: unhashable type: 'list'