In [18]:
##############################
# import modules
##############################

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

from gensim.models import word2vec
from gensim import models

In [19]:
# read the file, 
def read_data(filename):
    data = []
    with open(filename, 'r', encoding='utf-8') as myfile:
        f = myfile.readlines()
        s_num = 0
        i =0
        sentence_s = []
        tag_s = []
        dep_s = []
        s  = []   # sentence
        p = []    # tag
        d = []    # dependency
        for l in f:
            
            v = l.replace('\n','').split("\t")
            v.append(s_num)
            if len(l) != 1:
                data.append(v)
                dep = v[6] + '_' + v[7]
                word = v[1].lower()
                if any(char.isdigit() for char in word):
                    word = 'NUM'       # replace numbers with NUM
                s.append(word)
                p.append(v[3])
                d.append(dep)
                i +=1
            else:
                sentence_s.append(s)
                tag_s.append(p)
                dep_s.append(d)
                s_num +=1
                s  = []
                p = []
                d = []
        
    return data, sentence_s, tag_s, dep_s

In [20]:
def process_data(dataname):
    #reads in files, produces data structure with all actions
        #does so by applying produce_rule_list to every sentence.
        #for loop that sets actions to empty, calls p_r_l giving it
        #the stack and buffer, actions and correct_parse, adds finished action list
        #to new data file, for each sentence in the input data
    #input: name of the data file with all parses. Run with data file in same directory.
    #output: data file with all actions
    file = open(dataname)
    data = file.read()
    correct_parses = correct_parse_list(data)
    #gets rid of final whitespace
    del correct_parses[len(correct_parses)-1]
    
    #iterates over all parses, producing action list for each
    complete_rule_list = []
    for sentence_parse in correct_parses:
        stack = []
#         print(len(sentence_parse))
        buff = list(range(1,len(sentence_parse)+1))
        actions = []
        rule_list = produce_rule_list(stack, buff, actions, sentence_parse)
        complete_rule_list.append(rule_list)

    
    return complete_rule_list

def correct_parse_list(data):
    #Turns data into a list of lists of lists with relevant information
    correct_parse = data.split("\n\n")
    for index, paragraph in enumerate(correct_parse):
        correct_parse[index] = paragraph.split("\n")
    for paragraph in correct_parse:
        for index, line in enumerate(paragraph):
            paragraph[index] = line.split("\t")
    return correct_parse

In [21]:
def produce_rule_list(stack, buff, actions, sentence_parse):
    #recursive function that works through words in the sentence (stack/buffer)
        #until only one word is left, creating the list of actions 
        #that was taken to parse it.
    #input: stack, buffer, actions, correct parse
    #output: actions with the actions taken for each buff/stack configuration
    
    #base case
    if len(stack) == 1 and len(buff) == 0:
        #actions.append([stack[:], "empty", "R_arc"])
        actions.append([stack[0],-1, -1, "R_root"])
        return actions

    #If enough of the sentence is still left:
    #If there is not enough material in the stack, shift:
    if len(stack) == 0 :
        #print('chose S - small stack')
        actions.append([-1,-1,buff[0], "S"])
        stack.append(buff[0])
        del buff[0]        
        return produce_rule_list(stack,buff,actions,sentence_parse)
    if len(stack) == 1:
        actions.append([-1,stack[-1],buff[0], "S"])
        stack.append(buff[0])
        del buff[0]
        return produce_rule_list(stack,buff,actions,sentence_parse)
    #If there are 2 or more words in the stack, decide which action to perform and perform it
    if len(stack) > 1:
        action = rule_decision(stack,buff,sentence_parse)
        stack, buff, actions = action(stack,buff,actions, sentence_parse)
        return produce_rule_list(stack,buff,actions,sentence_parse)
    

def rule_decision(stack, buff, sentence_parse):
    #determines which action to apply
    #input: words on stack, words on buff, correct parse
    #output: one of three methods, Shift(), L_arc(), R_arc()

    #TODO: find ids/heads (index [6]) from stack and sentence_parse
    s1 = stack[-2]
    head_of_s1 = int(sentence_parse[s1-1][6])
    s2 = stack[-1]
    head_of_s2 = int(sentence_parse[s2-1][6])
    
    #L arcs can always be applied if possible
    if head_of_s1 == s2:
        action = L_arc
        #print('chose L')
    else:
        #R arcs can only be applied if there is no word in the buffer which has the last word in the stack as a head
        if head_of_s2 == s1:
            buff_heads = [int(sentence_parse[x-1][6]) for x in buff]
            if s2 in buff_heads:
                action = Shift
                #print('chose S - s2 in buffheads')
            else:
                action = R_arc
                #print('chose R')
        #if there is no match between s1 and s2, simply shift another word from the buffer
        else:
            action = Shift
            #print('chose S - no matching s1s2')

    return action

#The following methods perform an arc or shift. These can be changed if more data is needed in the network.

def L_arc(stack, buff, actions, sentence_parse):
    #removes second to last item from stack, writes action to actions
    #input: stack and actions
    #output: new stack and actions with one L_arc line
    #s1, s2, b1, action
    s1 = int(stack[-2])
    s2 = int(stack[-1])
    b1 = int(stack[0])
    relation = sentence_parse[s1-1][7]
    #actions.append([stack[:], buff[:], "L_arc"])
    actions.append([s1,s2,b1, "L"+"_"+relation])
    del stack[-2]
    return stack, buff, actions



def R_arc(stack, buff, actions, sentence_parse):
    #removes last item from the stack, writes action to actions
    #input: stack and actions
    #output: new stack and actions with one R_arc line
    #s1, s2, b1, action
    s1 = int(stack[-2])
    s2 = int(stack[-1])
    b1 = int(stack[0])
    relation = sentence_parse[s2-1][7]
    #actions.append([stack[:], buff[:], "R_arc"])
    actions.append([s1,s2,b1, "R"+"_"+relation])
    del stack[-1]
    return stack, buff, actions



def Shift(stack, buff, actions, sentence_parse):
    #moves an item from the buff to the stack, writes action to actions
    #input: stack, buff and actions
    #output: new stack and actions with one extra shift line
    #s1, s2, b1, action
    s1 = int(stack[-2])
    s2 = int(stack[-1])
    b1 = int(stack[0])
    #actions.append([stack[:], buff[:], "Shift"])
    actions.append([s1,s2,b1, "S"])
    stack.append(buff[0])
    del buff[0]
    return stack, buff, actions

In [22]:
train_data, train_sentences, train_tags, train_dependencies = read_data('./data/train-stanford-raw.conll')
dev_data, dev_sentences, dev_tags, dev_dependencies = read_data('./data/dev-stanford-raw.conll')
test_data, test_sentences, test_tags, test_dependencies = read_data('./data/test-stanford-raw.conll')

# create a full set of all the words in our train, test, and dev sets for word2vec model
# in order to avoid unseen words during test and validation
total_sentences = train_sentences + dev_sentences + test_sentences
print('data: ', train_data[:2])
print('words sentences: ', total_sentences[2:4])
print('tags sentences: ', train_tags[2:4])
print('dependencies: ', train_dependencies[2:4])

data:  [['1', 'In', '_', 'IN', 'IN', '_', '45', 'prep', '_', '_', 0], ['2', 'an', '_', 'DT', 'DT', '_', '5', 'det', '_', '_', 0]]
words sentences:  [['rolls-royce', 'motor', 'cars', 'inc.', 'said', 'it', 'expects', 'its', 'u.s.', 'sales', 'to', 'remain', 'steady', 'at', 'about', 'NUM', 'cars', 'in', 'NUM', '.'], ['the', 'luxury', 'auto', 'maker', 'last', 'year', 'sold', 'NUM', 'cars', 'in', 'the', 'u.s.']]
tags sentences:  [['NNP', 'NNP', 'NNPS', 'NNP', 'VBD', 'PRP', 'VBZ', 'PRP$', 'NNP', 'NNS', 'TO', 'VB', 'JJ', 'IN', 'IN', 'CD', 'NNS', 'IN', 'CD', '.'], ['DT', 'NN', 'NN', 'NN', 'JJ', 'NN', 'VBD', 'CD', 'NNS', 'IN', 'DT', 'NNP']]
dependencies:  [['4_nn', '4_nn', '4_nn', '5_nsubj', '0_root', '7_nsubj', '5_ccomp', '10_poss', '10_nn', '12_nsubj', '12_aux', '7_xcomp', '12_acomp', '12_prep', '16_quantmod', '17_num', '14_pobj', '12_prep', '18_pobj', '5_punct'], ['4_det', '4_nn', '4_nn', '7_nsubj', '6_amod', '7_tmod', '0_root', '9_num', '7_dobj', '7_prep', '12_det', '10_pobj']]


In [23]:
action_data = process_data('data/train-stanford-raw.conll')

In [24]:
##############################
# TF functions
##############################

def length(sequence):
    """
    function that computes the real, unpadded lenghts for every sequence in batch
    """
    used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2))
    length = tf.reduce_sum(used, reduction_indices=1)
    length = tf.cast(length, tf.int32)
    return length

def mlp(_X, _weights, _biases):
    """
    function that defines a multilayer perceptron in the graph
    input shape: parse_steps (=?) x filtered_words (=3) x lstm_output_length (=400)
    output shape: parse_steps (=?) x num_classes
    """
    # ReLU hidden layer (output shape: parse_steps x n_hidden)
    layer_1 = tf.nn.relu(tf.add(tf.einsum('ijk,kl->il', _X, _weights['h']), _biases['b'])) 
    # return output layer (output shape: parse_steps x n_classes)
    return tf.add(tf.einsum('ik,kl->il', layer_1, _weights['out']), _biases['out'])

def embedding_lookup(sentences, max_seq_length, vec_length):
    """
    function that looks up embeddings.
    input: list of sentences, length of sentences, length of word vectors
    output: 3D array of word vectors per sentence 
                (dims #sentences x sentence_length x embedding_size)
    """
    sentence_embeddings = np.empty((0,max_seq_length,vec_length))
    for sentence in sentences:
        word_embeddings = np.empty((0,vec_length))
        for word in sentence:
            word_embeddings = np.vstack([word_embeddings, model[word]])
        if len(sentence) < max_seq_length:
            zero_padding_length = max_seq_length - len(sentence)
            word_embeddings = np.vstack([word_embeddings, np.zeros((zero_padding_length, vec_length))])
        sentence_embeddings = np.append(sentence_embeddings, np.array([word_embeddings]), axis=0)
    return sentence_embeddings

In [31]:
##############################
# build graph
##############################

with tf.Graph().as_default():
    
    # hyperparameters (from Cross & Huang, 2016)
    batch_size = 10
    n_input = 400
    n_hidden = 200
    n_classes = 50 # TODO: how many classes for the actual data?
    lstm_units = 200
    num_epochs = 10
    dropout = 0.5
    L2_penalty = 0.
    rho = 0.99
    epsilon = 1e-07
    learn_rate = 0.01

    # Store layers weight & bias
    weights = {
        'h': tf.Variable(tf.random_normal([n_input, n_hidden], dtype=tf.float64)),
        'out': tf.Variable(tf.random_normal([n_hidden, n_classes], dtype=tf.float64))
    }
    biases = {
        'b': tf.Variable(tf.random_normal([n_hidden], dtype=tf.float64)),
        'out': tf.Variable(tf.random_normal([n_classes], dtype=tf.float64))
    }

    # load word2vec model
    model_name = "dep_parser_word2vec_total"
    if not 'model' in locals():
        model = models.word2vec.Word2Vec.load(model_name)
        print("Model loaded from disk")
    else:
        print("Model was already loaded")

    
    # dummy data:
    # toy sentences (= list of lists of words)
#     sentences = [["the", "by", "an", "on", "the", "in", "an"], ["the", "cat", "sat", "on", "the", "ground"]]
    toy_sentences = train_sentences[0:30]
    toy_parses = action_data[0:30]
    
    # toy parses (= array with an action and word indices)
#     parses = [np.array([[1,0,1,2], 
#                         [0,1,2,3],
#                         [0,1,2,3],
#                         [1,0,1,2]]),
#               np.array([[1,0,1,2], 
#                         [0,1,2,3], 
#                         [3,2,3,4]])]

    # variables from sentences
    vec_length = model['a'].size
    seq_lengths = [len(sentence) for sentence in sentences]
    max_seq_length = max(seq_lengths)
    

    # look up embeddings of words in sentence
    embeddings = embedding_lookup(sentences, max_seq_length, vec_length)
    print("Sentence embedding shape (np-array): ", embeddings.shape)

    # define LSTM cell + dropout wrapper (like Cross & Huang)
    cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=lstm_units, state_is_tuple=True)
    cell = tf.nn.rnn_cell.DropoutWrapper(cell=cell, output_keep_prob=0.5)

    # define bidirectional architecture
    outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=cell,
        cell_bw=cell,
        dtype=tf.float64,
        sequence_length=seq_lengths,
        inputs=embeddings
    )

    # fw/bw output (num_sequences x max_seq_length x lstm_units)
    output_fw, output_bw = outputs
    print("Forward pass size: ", output_fw.get_shape())

    # concatenate forward & backward outputs per word
    output_lstm = tf.concat(2, outputs)
    print("BiLSTM output shape: ", output_lstm.get_shape())


    # MLP layer filtering LSTM outputs based on parse steps
    batch_cost = 0
    for i in range(0, len(sentences)):
        sentence = toy_sentences[i]
        parse = toy_parses[i]
        # input: parse_steps (=?) x filtered_words (=3) x lstm_output_length (=400)
        # output: parse_steps (=?) x num_classes
        output_mlp = mlp(tf.gather(sentence, parse[]), weights, biases)
        print("MLP output shape S{}: ".format(i), output_mlp.get_shape())
        cost = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(output_mlp, parse[:,0]))
        batch_cost += cost
    batch_cost /= len(sentences)

    # define optimizer
    global_step = tf.Variable(0, name='global_step', trainable=True)
    optimizer = tf.train.AdadeltaOptimizer(rho=rho, epsilon=epsilon).minimize(batch_cost)
#     train_op = optimizer.train(loss, global_step=global_step)
    correct = tf.nn.in_top_k(embeddings[0], parses[0], 1)
    print(correct)

    
    ##############################
    # start session in graph
    ##############################

    init = tf.initialize_all_variables()

    with tf.Session() as sess:
        sess.run(init)

Model was already loaded
Sentence embedding shape (np-array):  (2, 7, 189)
Forward pass size:  (2, 7, 200)
BiLSTM output shape:  (2, 7, 400)


TypeError: Expected binary or unicode string, got -1

In [33]:
for action in action_data[1]:
    print(action)

[-1, -1, 1, 'S']
[-1, 1, 2, 'S']
[1, 2, 1, 'L_nn']
[-1, 2, 3, 'S']
[2, 3, 2, 'L_nsubj']
[-1, 3, 4, 'S']
[3, 4, 3, 'R_dobj']
[-1, 3, 5, 'S']
[3, 5, 3, 'R_punct']
[3, -1, -1, 'R_root']
