In [59]:
##############################
# import modules
##############################

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

from gensim.models import word2vec
from gensim import models
import time

In [2]:
# Read a CoNLL file and group its token lines into sentences.
def read_data(filename):
    """Read a tab-separated CoNLL file and group its tokens by sentence.

    Sentences are separated by blank (or whitespace-only) lines. Every other
    line is split on tabs; column 1 is used as the word, column 3 as the tag,
    column 6 as the head index and column 7 as the dependency label.

    Args:
        filename: path of the CoNLL file; it is decoded as UTF-8.

    Returns:
        A tuple ``(data, sentence_s, tag_s, dep_s)`` where
        data is a list with one entry per token line: the tab-separated
            columns with the 0-based sentence number appended at the end;
        sentence_s holds, per sentence, the lower-cased words (any word
            containing a digit is replaced by the string 'NUM');
        tag_s holds, per sentence, the values of column 3;
        dep_s holds, per sentence, strings of the form '<head>_<label>'.

    Raises:
        IndexError: if a non-blank line has fewer than eight columns.
    """
    data = []
    sentence_s, tag_s, dep_s = [], [], []
    words, tags, deps = [], [], []   # accumulators for the sentence in progress
    s_num = 0                        # index of the sentence in progress

    with open(filename, 'r', encoding='utf-8') as myfile:
        for line in myfile:
            if line.strip():
                v = line.rstrip('\n').split("\t")
                v.append(s_num)
                data.append(v)

                word = v[1].lower()
                if any(char.isdigit() for char in word):
                    word = 'NUM'       # replace numbers with NUM
                words.append(word)
                tags.append(v[3])
                deps.append(v[6] + '_' + v[7])
            else:
                # Blank line: close the current sentence and start a new one.
                sentence_s.append(words)
                tag_s.append(tags)
                dep_s.append(deps)
                s_num += 1
                words, tags, deps = [], [], []

    # A file that does not end with a blank line still has one open sentence;
    # without this flush its tokens would appear in `data` but in none of the
    # per-sentence lists.
    if words:
        sentence_s.append(words)
        tag_s.append(tags)
        dep_s.append(deps)

    return data, sentence_s, tag_s, dep_s

In [3]:
def process_data(dataname):
    """Derive the transition sequence for every sentence of a CoNLL file.

    The file is split into sentences with correct_parse_list, and
    produce_rule_list is run on each of them starting from an empty stack and
    a buffer holding the 1-based token ids 1..len(sentence).

    Args:
        dataname: path of the CoNLL file containing the parses.

    Returns:
        A tuple ``(complete_rule_list, arc_dict)``:
        complete_rule_list has one numpy array per sentence, built from the
            action rows returned by produce_rule_list;
        arc_dict maps action names to integer ids. It starts as
            {'Shift': 0, 'L_root': 1, 'R_root': 2} and is threaded through
            every produce_rule_list call, so ids are shared across sentences.
    """
    # Same explicit encoding as read_data uses for these files; the context
    # manager closes the handle, which the bare open() never did.
    with open(dataname, 'r', encoding='utf-8') as conll_file:
        raw_text = conll_file.read()

    # Strip the newlines surrounding the text and drop empty paragraphs rather
    # than deleting the last parse unconditionally: a file without a trailing
    # blank line would otherwise lose its final sentence.
    correct_parses = [
        parse
        for parse in correct_parse_list(raw_text.strip('\n'))
        if parse != [['']]
    ]

    # iterates over all parses, producing the action list for each
    complete_rule_list = []
    arc_dict = {'Shift':0,'L_root':1,'R_root':2}
    for sentence_parse in correct_parses:
        buff = list(range(1, len(sentence_parse) + 1))
        rule_list, arc_dict = produce_rule_list([], buff, [], sentence_parse, arc_dict)
        complete_rule_list.append(np.array(rule_list))

    return complete_rule_list, arc_dict

def correct_parse_list(data):
    """Break raw CoNLL text into paragraphs, lines and tab-separated columns.

    Paragraphs are delimited by two consecutive newlines, lines by a single
    newline and columns by a tab.

    Returns:
        A list (paragraphs) of lists (lines) of lists (column strings).
    """
    return [
        [row.split("\t") for row in block.split("\n")]
        for block in data.split("\n\n")
    ]

In [4]:
def produce_rule_list(stack, buff, actions, sentence_parse, arc_dict):
    """Work through a sentence and record every transition that is taken.

    The configuration (stack, buffer) is advanced until exactly one word is
    left on the stack and the buffer is empty. This is done with a loop rather
    than one recursive call per transition, so the length of a sentence is not
    bounded by the interpreter's recursion limit.

    Args:
        stack: list of token ids currently on the stack (mutated in place).
        buff: list of token ids still in the buffer (mutated in place).
        actions: list the action rows are appended to (mutated in place).
        sentence_parse: the rows of the correct parse for this sentence.
        arc_dict: mapping from action name to integer id; the arc functions
            may add entries to it.

    Returns:
        ``(actions, arc_dict)``. Each action row is ``[s1, s2, b1, action_id]``
        where 0 stands for an absent word; the final row is always
        ``[0, <last stack word>, 0, 2]``.

    Raises:
        IndexError: if both stack and buffer are empty on entry, or if a shift
            is required while the buffer is empty.
    """
    while not (len(stack) == 1 and len(buff) == 0):
        if len(stack) < 2:
            # Not enough material on the stack to build an arc: shift.
            top_of_stack = stack[-1] if len(stack) == 1 else 0
            actions.append([0, top_of_stack, buff[0], 0])
            stack.append(buff[0])
            del buff[0]
        else:
            # Two or more words on the stack: pick the transition and apply it.
            action = rule_decision(stack, buff, sentence_parse)
            stack, buff, actions, arc_dict = action(stack, buff, actions, sentence_parse, arc_dict)

    # Final configuration: the single remaining word is recorded with id 2.
    actions.append([0, stack[0], 0, 2])
    return actions, arc_dict
    

def rule_decision(stack, buff, sentence_parse):
    """Choose the transition for the current configuration.

    Args:
        stack: token ids on the stack; the two topmost entries are inspected.
        buff: token ids remaining in the buffer.
        sentence_parse: rows of the correct parse; column 6 holds the head id.

    Returns:
        One of the functions L_arc, R_arc or Shift.
    """
    def head_of(token_id):
        # token ids are 1-based, rows of sentence_parse are 0-based
        return int(sentence_parse[token_id - 1][6])

    second, top = stack[-2], stack[-1]
    head_of_second = head_of(second)
    head_of_top = head_of(top)

    # A left arc is taken as soon as the top word heads the one below it.
    if head_of_second == top:
        return L_arc

    # No head relation between the two topmost words: bring in another word.
    if head_of_top != second:
        return Shift

    # A right arc removes the top word, so it has to wait while any word in
    # the buffer still has that word as its head.
    pending_heads = [head_of(token_id) for token_id in buff]
    if top in pending_heads:
        return Shift
    return R_arc

#The following methods perform an arc or shift. These can be changed if more data is needed in the network.

def L_arc(stack, buff, actions, sentence_parse, arc_dict):
    """Apply a left arc: drop the second-to-last stack item and log the action.

    The action id is looked up under 'L_<label>', where <label> is column 7 of
    the removed word's row. A label seen for the first time gets two new ids,
    one for its 'L_' and one for its 'R_' variant, directly above the largest
    id currently in arc_dict.

    Returns:
        ``(stack, buff, actions, arc_dict)`` after appending the row
        ``[s1, s2, b1, action_id]`` (b1 is 0 when the buffer is empty).
    """
    dependent = int(stack[-2])
    head = int(stack[-1])
    next_word = int(buff[0]) if len(buff) != 0 else 0

    label = sentence_parse[dependent - 1][7]
    left_name = "L_" + label

    if left_name not in arc_dict:
        highest_id = arc_dict[max(arc_dict, key=arc_dict.get)]
        arc_dict[left_name] = highest_id + 1
        arc_dict["R_" + label] = highest_id + 2

    actions.append([dependent, head, next_word, arc_dict[left_name]])
    stack.pop(-2)
    return stack, buff, actions, arc_dict



def R_arc(stack, buff, actions, sentence_parse, arc_dict):
    """Apply a right arc: drop the last stack item and log the action.

    The action id is looked up under 'R_<label>', where <label> is column 7 of
    the removed word's row. A label seen for the first time gets two new ids,
    one for its 'L_' and one for its 'R_' variant, directly above the largest
    id currently in arc_dict.

    Returns:
        ``(stack, buff, actions, arc_dict)`` after appending the row
        ``[s1, s2, b1, action_id]`` (b1 is 0 when the buffer is empty).
    """
    head = int(stack[-2])
    dependent = int(stack[-1])
    next_word = int(buff[0]) if len(buff) != 0 else 0

    label = sentence_parse[dependent - 1][7]
    right_name = "R_" + label

    if right_name not in arc_dict:
        highest_id = arc_dict[max(arc_dict, key=arc_dict.get)]
        arc_dict["L_" + label] = highest_id + 1
        arc_dict[right_name] = highest_id + 2

    actions.append([head, dependent, next_word, arc_dict[right_name]])
    stack.pop()
    return stack, buff, actions, arc_dict



def Shift(stack, buff, actions, sentence_parse, arc_dict):
    """Move the first buffer item onto the stack and log the action.

    The logged row is ``[s1, s2, b1, 0]``: the two topmost stack words before
    the move, the word being moved, and action id 0. sentence_parse and
    arc_dict are accepted for signature parity with L_arc/R_arc; arc_dict is
    returned untouched.

    Returns:
        ``(stack, buff, actions, arc_dict)``.

    Raises:
        IndexError: if the stack holds fewer than two items or the buffer is
            empty.
    """
    row = [int(stack[-2]), int(stack[-1]), int(buff[0]), 0]
    actions.append(row)
    stack.append(buff.pop(0))
    return stack, buff, actions, arc_dict

In [5]:
# Load the train, dev and test CoNLL files. read_data returns, in this order:
# the token rows, the word sentences, the tag sentences and the dependency
# ('<head>_<label>') sentences.
train_data, train_sentences, train_tags, train_dependencies = read_data('./data/train-stanford-raw.conll')
dev_data, dev_sentences, dev_tags, dev_dependencies = read_data('./data/dev-stanford-raw.conll')
test_data, test_sentences, test_tags, test_dependencies = read_data('./data/test-stanford-raw.conll')

# create a full set of all the words in our train, test, and dev sets for word2vec model
# in order to avoid unseen words during test and validation
total_sentences = train_sentences + dev_sentences + test_sentences
# Print a small sample of each structure as a visual sanity check.
print('data: ', train_data[:2])
print('words sentences: ', total_sentences[2:4])
print('tags sentences: ', train_tags[2:4])
print('dependencies: ', train_dependencies[2:4])

data:  [['1', 'In', '_', 'IN', 'IN', '_', '45', 'prep', '_', '_', 0], ['2', 'an', '_', 'DT', 'DT', '_', '5', 'det', '_', '_', 0]]
words sentences:  [['rolls-royce', 'motor', 'cars', 'inc.', 'said', 'it', 'expects', 'its', 'u.s.', 'sales', 'to', 'remain', 'steady', 'at', 'about', 'NUM', 'cars', 'in', 'NUM', '.'], ['the', 'luxury', 'auto', 'maker', 'last', 'year', 'sold', 'NUM', 'cars', 'in', 'the', 'u.s.']]
tags sentences:  [['NNP', 'NNP', 'NNPS', 'NNP', 'VBD', 'PRP', 'VBZ', 'PRP$', 'NNP', 'NNS', 'TO', 'VB', 'JJ', 'IN', 'IN', 'CD', 'NNS', 'IN', 'CD', '.'], ['DT', 'NN', 'NN', 'NN', 'JJ', 'NN', 'VBD', 'CD', 'NNS', 'IN', 'DT', 'NNP']]
dependencies:  [['4_nn', '4_nn', '4_nn', '5_nsubj', '0_root', '7_nsubj', '5_ccomp', '10_poss', '10_nn', '12_nsubj', '12_aux', '7_xcomp', '12_acomp', '12_prep', '16_quantmod', '17_num', '14_pobj', '12_prep', '18_pobj', '5_punct'], ['4_det', '4_nn', '4_nn', '7_nsubj', '6_amod', '7_tmod', '0_root', '9_num', '7_dobj', '7_prep', '12_det', '10_pobj']]


In [6]:
# Build the per-sentence action arrays for the training file, together with
# the mapping from action name to integer id.
action_data, arc_dict = process_data('./data/train-stanford-raw.conll')

# Uncomment to inspect the action ids and the action rows of the first sentence:
# print(arc_dict)
# for line in action_data[0]:
#     print(line)

In [125]:
##############################
# TF functions
##############################

def mlp(_X, _weights, _biases):
    """
    Add a perceptron with one ReLU hidden layer to the graph.

    _X is multiplied by _weights['h'] and offset by _biases['b'] to form the
    hidden layer; the hidden activations are multiplied by _weights['out'] and
    offset by _biases['out'] to form the returned tensor. No softmax is
    applied here.

    The model cell of this notebook calls it with the three word vectors of a
    parse step flattened into one row, and with weights shaped
    [3*n_input, n_hidden] and [n_hidden, n_classes].
    """
    # hidden layer: affine transform followed by ReLU
    hidden_affine = tf.add(tf.matmul(_X, _weights['h']), _biases['b'])
    hidden = tf.nn.relu(hidden_affine)

    # output layer: affine transform only
    scores = tf.matmul(hidden, _weights['out'])
    return tf.add(scores, _biases['out'])


def create_sentence_embeddings(sentences, embedding_model=None):
    """
    Look up the embedded representation of every sentence.

    Args:
        sentences: iterable of sentences, each a list of words.
        embedding_model: object indexed with a whole sentence
            (``embedding_model[sentence]``) to obtain its embedding. When it
            is omitted, the notebook-level word2vec ``model`` is used, which
            must have been loaded before the call.

    Returns:
        A list with one ``embedding_model[sentence]`` result per sentence, in
        input order.
    """
    if embedding_model is None:
        # Backward-compatible default: fall back to the global loaded in the
        # word2vec cell.
        embedding_model = model
    return [embedding_model[sentence] for sentence in sentences]


In [68]:
##############################
# load word2vec model & input data
##############################

# Load the word2vec model saved under this name. create_sentence_embeddings
# reads the notebook-level `model`, so this assignment has to run before the
# call below.
model_name = "dep_parser_word2vec_total"
model = word2vec.Word2Vec.load(model_name)

# embeddings for all sentences
# (one entry per training sentence, in the same order as train_sentences)
sentence_embeddings = create_sentence_embeddings(train_sentences)

In [137]:
##############################
# TensorFlow model
##############################

# Build the BiLSTM + MLP graph and train it on one sentence at a time.
graph = tf.Graph()
with graph.as_default():

    # hyperparameters (from Cross & Huang, 2016)
    # length of one word vector, read off the vector stored for the word 'a'
    word2vec_length = model['a'].size
    # width of one row of the concatenated forward/backward LSTM output; the
    # shape printed by this cell ends in 400, so keep n_input == 2 * lstm_units
    n_input = 400
    n_hidden = 200
    # NOTE(review): hard-coded; presumably has to equal len(arc_dict) - confirm
    n_classes = 99 # there are 99 possible actions to take
    lstm_units = 200
    num_epochs = 1 # at least 1 for now
    # used as output_keep_prob of the DropoutWrapper below
    dropout = 0.5
    # defined but not used anywhere in this cell
    L2_penalty = 0.
    rho = 0.99
    epsilon = 1e-07
    learning_rate = 0.01 # default is 0.001, Cross & Huang do not specify learning rate

    # Store layers weight & bias
    # 'h' has 3*n_input rows because each MLP input row is three LSTM output
    # vectors laid side by side (see the reshape of mlp_x below)
    weights = {
        'h': tf.Variable(tf.random_normal([3*n_input, n_hidden], dtype=tf.float64), name='weights_h'),
        'out': tf.Variable(tf.random_normal([n_hidden, n_classes], dtype=tf.float64), name='weights_out')
    }
    biases = {
        'b': tf.Variable(tf.random_normal([n_hidden], dtype=tf.float64), name='biases_b'),
        'out': tf.Variable(tf.random_normal([n_classes], dtype=tf.float64), name='biases_out')
    }

    # placeholders
    # the leading dimension of 1 means a single sentence is fed per run
    sentence_length = tf.placeholder(tf.int32)
    lstm_x = tf.placeholder(tf.float64, [1, None, word2vec_length])
    # three word ids per parse step, taken from the first three action columns
    parse_indices = tf.placeholder(tf.int64, [1, None, 3])
    # one action id per parse step, taken from the fourth action column
    labels = tf.placeholder(tf.int64, [None])

    # define LSTM cell + dropout wrapper (like Cross & Huang)
    # the keep probability is a Python constant, so dropout is applied on every
    # run of this graph, including any later prediction run
    cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=lstm_units, state_is_tuple=True)
    cell = tf.nn.rnn_cell.DropoutWrapper(cell=cell, output_keep_prob=dropout)

    # define bidirectional architecture
    # NOTE(review): the same cell object is passed for both directions; whether
    # that shares parameters depends on the TensorFlow version - confirm
    outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=cell,
        cell_bw=cell,
        dtype=tf.float64,
        sequence_length=sentence_length,
        inputs=lstm_x
    )

    # fw/bw output
    # (unpacked for reference only; neither name is used again in this cell)
    output_fw, output_bw = outputs

    # concatenate forward & backward outputs per word
    # NOTE(review): tf.concat is called with the axis first here and below;
    # confirm this matches the argument order of the installed TensorFlow
    output_lstm = tf.concat(2, outputs)

    # zero-padding of LSTM-output (sentence gets a "dummy word" in front of it)
    # the action rows use 0 for an absent word and 1-based ids for real words,
    # so after this padding row 0 is all zeros and row k belongs to word k
    zero_padding = tf.zeros([1, 1, n_input], tf.float64)
    output_lstm = tf.concat(1, [zero_padding, output_lstm])

    # mlp_x: make a matrix with all corresponding word vector 3-tuples (up to the no of parse steps)
    # TODO: use gather or embedding_lookup? (+ check correctness)
    mlp_x = tf.nn.embedding_lookup(output_lstm[0,:,:], parse_indices[0,:,:])
    print(mlp_x.get_shape())
    # flatten the three vectors of every parse step into a single row
    dims = tf.shape(mlp_x)
    mlp_x = tf.reshape(mlp_x, [dims[0], dims[1]*dims[2]])
    print(mlp_x.get_shape())

    output_mlp = mlp(mlp_x, weights, biases)

    # cost (summed over parse steps) is what gets minimised;
    # cost_indication (mean over parse steps) is only fetched for printing
    cost = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(output_mlp, labels))
    cost_indication = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(output_mlp, labels))

    train_op = tf.train.AdadeltaOptimizer(rho=rho, epsilon=epsilon, learning_rate=learning_rate).minimize(cost)

    # TODO: tf.train.Saver(...variables...)

    with tf.Session() as session:
        init = tf.initialize_all_variables()
        session.run(init)

        for epoch in range(0,num_epochs):
            # only the first 1000 training sentences are used per epoch
            for i in range(0,1000):

                # get sentence embedding
                sentence = sentence_embeddings[i]

                # get parse data for sentence
                parse_data = action_data[i]

                # columns 0-2 are the word ids (s1, s2, b1), column 3 the action id
                indices = parse_data[:,:3]
                actions = parse_data[:,3]

                # important variables
                seq_length = len(sentence)

                # sentence, indices and length are wrapped in a list to supply
                # the leading batch dimension of size 1
                feed_dict_batch = {sentence_length: [seq_length], lstm_x: [sentence],  parse_indices: [indices], labels: actions}

#                 old_weights = session.run(tf.trainable_variables()[4])
                result = session.run([train_op, cost_indication], feed_dict_batch)

                # result[1] is the mean loss of the sentence that was just trained on
                if i%50 == 0:
                    print("iteration ", i, "\n", "current average loss: ", result[1])

        # TODO: prediction on validation set every 200 sentences
        # pred_input = feed_dict without labels
        # prediction = tf.run(output_mlp, pred_input)


(?, 3, 400)
(?, ?)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


iteration  1 
 current average loss:  190.32764962
iteration  51 
 current average loss:  146.517881412
iteration  101 
 current average loss:  92.9535972101
iteration  151 
 current average loss:  65.545953349
iteration  201 
 current average loss:  33.087154735
iteration  251 
 current average loss:  72.1201142244
iteration  301 
 current average loss:  83.1242303292
iteration  351 
 current average loss:  48.0976733586
iteration  401 
 current average loss:  76.198705717
iteration  451 
 current average loss:  69.5548603264
iteration  501 
 current average loss:  67.1562097119
iteration  551 
 current average loss:  55.3417320577
iteration  601 
 current average loss:  50.3440057379
iteration  651 
 current average loss:  55.62117022
iteration  701 
 current average loss:  75.2326607467
iteration  751 
 current average loss:  48.684001623
iteration  801 
 current average loss:  50.2926326761
iteration  851 
 current average loss:  53.717079589
iteration  901 
 current average loss: 

In [None]:
# Sanity check: compare the embedding of one training sentence with its words.
# The embeddings live in the notebook-level `sentence_embeddings`;
# `embedded_train_sentences` only exists as a local variable inside
# create_sentence_embeddings, so referring to it here raised a NameError on a
# fresh kernel.
check_index = 1
print(sentence_embeddings[check_index].shape)
print(len(sentence_embeddings[check_index]))
# print(sentence_embeddings[check_index])
print(type(sentence_embeddings[check_index][0][0]))
for word in train_sentences[check_index]:
    print(word)
# for emb in sentence_embeddings[check_index]:
#     print(emb)
# for action in action_data[check_index]:
#     print(action)

In [None]:
#### OLD OLD OLD
# Obsolete draft of the model cell, kept for reference only.
# NOTE(review): this cell cannot run as it stands. seq_lengths, toy_sentences,
# input_indices, embeddings, toy_parses, sentences and parses are read below
# but are not assigned in any visible cell (some only appear in commented-out
# lines), and word2vec_length is only set by the newer model cell.

##############################
# build graph
##############################

with tf.Graph().as_default():

    # hyperparameters (from Cross & Huang, 2016)
    batch_size = 10
    n_input = 400
    n_hidden = 200
    n_classes = 99 # there are 99 possible actions to take
    lstm_units = 200
    num_epochs = 10
    dropout = 0.5
    L2_penalty = 0.
    rho = 0.99
    epsilon = 1e-07
    learn_rate = 0.01

    # Store layers weight & bias
    weights = {
        'h': tf.Variable(tf.random_normal([n_input, n_hidden], dtype=tf.float64)),
        'out': tf.Variable(tf.random_normal([n_hidden, n_classes], dtype=tf.float64))
    }
    biases = {
        'b': tf.Variable(tf.random_normal([n_hidden], dtype=tf.float64)),
        'out': tf.Variable(tf.random_normal([n_classes], dtype=tf.float64))
    }

    # load word2vec model
    # (skipped when a variable called `model` already exists in this scope)
    model_name = "dep_parser_word2vec_total"
    if not 'model' in locals():
        model = models.word2vec.Word2Vec.load(model_name)
        print("Model loaded from disk")
    else:
        print("Model was already loaded")

    # placeholders for input (input tensor uses None as variable sized dimension,
    # or can use rows of train set length, or batch length might be better)
    # NOTE(review): these placeholders are declared as "float" while the RNN
    # below is built with dtype=tf.float64 - presumably a dtype mismatch; confirm
    lstm_x = tf.placeholder("float", [None, word2vec_length])
    # lstm output (1 pass)
    lstm_y = tf.placeholder("float", [None, n_hidden])
    # mlp input (3 words)
    mlp_x = tf.placeholder("float", [3, 2*n_hidden])
    # mlp output
    mlp_y = tf.placeholder("float", [n_classes])


    # dummy data:
    # toy sentences (= list of lists of words)
#     sentences = [["the", "by", "an", "on", "the", "in", "an"], ["the", "cat", "sat", "on", "the", "ground"]]
#     toy_sentences = train_sentences[0:30]
#     toy_parses = action_data[0:30]
    # toy parses (= array with an action and word indices)
#     parses = [np.array([[1,0,1,2], 
#                         [0,1,2,3],
#                         [0,1,2,3],
#                         [1,0,1,2]]),
#               np.array([[1,0,1,2], 
#                         [0,1,2,3], 
#                         [3,2,3,4]])]

    # look up embeddings of words in sentence
#     toy_embeddings = np.array(embedded_train_sentences[0:30])
#     embeddings = embedding_lookup(toy_sentences, max_seq_length, vec_length)
#     embeddings = embedded_train_sentences[0:30]
#     print("Sentence embedding shape (np-array): ", embeddings.shape)

    # define LSTM cell + dropout wrapper (like Cross & Huang)
    cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=lstm_units, state_is_tuple=True)
    cell = tf.nn.rnn_cell.DropoutWrapper(cell=cell, output_keep_prob=0.5)

    # define bidirectional architecture
    # (lstm_x is 2-D in this draft, whereas the newer model cell feeds a 3-D
    # placeholder with a leading batch dimension of 1)
    outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=cell,
        cell_bw=cell,
        dtype=tf.float64,
        sequence_length=seq_lengths,
        inputs=lstm_x
    )

    # fw/bw output (num_sequences x max_seq_length x lstm_units)
    output_fw, output_bw = outputs
    print("Forward pass size: ", output_fw.get_shape())

    # concatenate forward & backward outputs per word
    output_lstm = tf.concat(2, outputs)
    print("BiLSTM output shape: ", output_lstm.get_shape())


    # MLP layer filtering LSTM outputs based on parse steps
    # (adds one MLP and one cost op to the graph per sentence and averages them)
    batch_cost = 0
    for i in range(0, len(toy_sentences)):
        action = action_data[i]
#         sentence_input_indices = action[0:5]
        print(input_indices)
        output_action_index = action[-1]
        print(output_action_index)
        sentence = toy_sentences[i]
#         print(len(sentence))
        emb = embeddings[i]
#         print(emb.shape())
        parse = toy_parses[i]
        # input: parse_steps (=?) x filtered_words (=3) x lstm_output_length (=400)
        # output: parse_steps (=?) x num_classes
        output_mlp = mlp(tf.gather(emb, parse), weights, biases)
        print("MLP output shape S{}: ".format(i), output_mlp.get_shape())
        cost = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(output_mlp, parse[:,0]))
        batch_cost += cost
    batch_cost /= len(sentences)

    # define optimizer
    # NOTE(review): global_step is created with trainable=True and is never
    # passed to the optimizer (the train_op line is commented out) - confirm
    # whether it is needed at all
    global_step = tf.Variable(0, name='global_step', trainable=True)
    optimizer = tf.train.AdadeltaOptimizer(rho=rho, epsilon=epsilon).minimize(batch_cost)
#     train_op = optimizer.train(loss, global_step=global_step)
    correct = tf.nn.in_top_k(embeddings[0], parses[0], 1)
    print(correct)


    ##############################
    # start session in graph
    ##############################

    init = tf.initialize_all_variables()

    # the session only initialises the variables; no training step is run
    with tf.Session() as sess:
        sess.run(init)