In [1]:
##############################
# import modules
##############################

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

from gensim.models import word2vec
from gensim import models



In [2]:
# read the file, 
def read_data(filename):
    data = []
    with open(filename, 'r', encoding='utf-8') as myfile:
        f = myfile.readlines()
        s_num = 0
        i =0
        sentence_s = []
        tag_s = []
        dep_s = []
        s  = []   # sentence
        p = []    # tag
        d = []    # dependency
        for l in f:
            
            v = l.replace('\n','').split("\t")
            v.append(s_num)
            if len(l) != 1:
                data.append(v)
                dep = v[6] + '_' + v[7]
                word = v[1].lower()
                if any(char.isdigit() for char in word):
                    word = 'NUM'       # replace numbers with NUM
                s.append(word)
                p.append(v[3])
                d.append(dep)
                i +=1
            else:
                sentence_s.append(s)
                tag_s.append(p)
                dep_s.append(d)
                s_num +=1
                s  = []
                p = []
                d = []
        
    return data, sentence_s, tag_s, dep_s

In [3]:
def process_data(dataname):
    #reads in files, produces data structure with all actions
        #does so by applying produce_rule_list to every sentence.
        #for loop that sets actions to empty, calls p_r_l giving it
        #the stack and buffer, actions and correct_parse, adds finished action list
        #to new data file, for each sentence in the input data
    #input: name of the data file with all parses. Run with data file in same directory.
    #output: data file with all actions
    file = open(dataname)
    data = file.read()
    correct_parses = correct_parse_list(data)
    #gets rid of final whitespace
    del correct_parses[len(correct_parses)-1]
    
    #iterates over all parses, producing action list for each
    complete_rule_list = []
    arc_dict = {'Shift':0,'L_root':1,'R_root':2}
    for sentence_parse in correct_parses:
        stack = []
#         print(len(sentence_parse))
        buff = list(range(1,len(sentence_parse)+1))
        actions = []
        rule_list, arc_dict = produce_rule_list(stack, buff, actions, sentence_parse, arc_dict)
        complete_rule_list.append(np.array(rule_list))

    
    return complete_rule_list, arc_dict

def correct_parse_list(data):
    #Turns data into a list of lists of lists with relevant information
    correct_parse = data.split("\n\n")
    for index, paragraph in enumerate(correct_parse):
        correct_parse[index] = paragraph.split("\n")
    for paragraph in correct_parse:
        for index, line in enumerate(paragraph):
            paragraph[index] = line.split("\t")
    return correct_parse

In [4]:
def produce_rule_list(stack, buff, actions, sentence_parse, arc_dict):
    #recursive function that works through words in the sentence (stack/buffer)
        #until only one word is left, creating the list of actions 
        #that was taken to parse it.
    #input: stack, buffer, actions, correct parse
    #output: actions with the actions taken for each buff/stack configuration
    
    #base case
    if len(stack) == 1 and len(buff) == 0:
        #actions.append([stack[:], "empty", "R_arc"])
        actions.append([-1,stack[0], -1, 2])
        return actions, arc_dict

    #If enough of the sentence is still left:
    #If there is not enough material in the stack, shift:
    if len(stack) == 0 :
        #print('chose S - small stack')
        actions.append([-1,-1,buff[0], 0])
        stack.append(buff[0])
        del buff[0]        
        return produce_rule_list(stack,buff,actions,sentence_parse, arc_dict)
    if len(stack) == 1:
        actions.append([-1,stack[-1],buff[0], 0])
        stack.append(buff[0])
        del buff[0]
        return produce_rule_list(stack,buff,actions,sentence_parse, arc_dict)
    #If there are 2 or more words in the stack, decide which action to perform and perform it
    if len(stack) > 1:
        action = rule_decision(stack,buff,sentence_parse)
        stack, buff, actions, arc_dict = action(stack,buff,actions, sentence_parse, arc_dict)
        return produce_rule_list(stack,buff,actions,sentence_parse, arc_dict)
    

def rule_decision(stack, buff, sentence_parse):
    #determines which action to apply
    #input: words on stack, words on buff, correct parse
    #output: one of three methods, Shift(), L_arc(), R_arc()

    #TODO: find ids/heads (index [6]) from stack and sentence_parse
    s1 = stack[-2]
    head_of_s1 = int(sentence_parse[s1-1][6])
    s2 = stack[-1]
    head_of_s2 = int(sentence_parse[s2-1][6])
    
    #L arcs can always be applied if possible
    if head_of_s1 == s2:
        action = L_arc
        #print('chose L')
    else:
        #R arcs can only be applied if there is no word in the buffer which has the last word in the stack as a head
        if head_of_s2 == s1:
            buff_heads = [int(sentence_parse[x-1][6]) for x in buff]
            if s2 in buff_heads:
                action = Shift
                #print('chose S - s2 in buffheads')
            else:
                action = R_arc
                #print('chose R')
        #if there is no match between s1 and s2, simply shift another word from the buffer
        else:
            action = Shift
            #print('chose S - no matching s1s2')

    return action

#The following methods perform an arc or shift. These can be changed if more data is needed in the network.

def L_arc(stack, buff, actions, sentence_parse, arc_dict):
    #removes second to last item from stack, writes action to actions
    #input: stack and actions
    #output: new stack and actions with one L_arc line
    #s1, s2, b1, action
    s1 = int(stack[-2])
    s2 = int(stack[-1])
    b1 = int(stack[0])
    relation = "L_"+sentence_parse[s1-1][7]

    if relation not in arc_dict:
        maximum = max(arc_dict, key=arc_dict.get)
        arc_dict['L_'+relation[2:]] = arc_dict[maximum]+1
        arc_dict['R_'+relation[2:]] = arc_dict[maximum]+2
    

    actions.append([s1,s2,b1, arc_dict[relation]])
    del stack[-2]
    return stack, buff, actions, arc_dict



def R_arc(stack, buff, actions, sentence_parse, arc_dict):
    #removes last item from the stack, writes action to actions
    #input: stack and actions
    #output: new stack and actions with one R_arc line
    #s1, s2, b1, action
    s1 = int(stack[-2])
    s2 = int(stack[-1])
    b1 = int(stack[0])
    relation = "R_"+sentence_parse[s2-1][7]

    if relation not in arc_dict:
        maximum = max(arc_dict, key=arc_dict.get)
        arc_dict['L_'+relation[2:]] = arc_dict[maximum]+1
        arc_dict['R_'+relation[2:]] = arc_dict[maximum]+2 
    
    actions.append([s1,s2,b1, arc_dict[relation]])
    del stack[-1]
    return stack, buff, actions, arc_dict



def Shift(stack, buff, actions, sentence_parse, arc_dict):
    #moves an item from the buff to the stack, writes action to actions
    #input: stack, buff and actions
    #output: new stack and actions with one extra shift line
    #s1, s2, b1, action
    s1 = int(stack[-2])
    s2 = int(stack[-1])
    b1 = int(stack[0])
    #actions.append([stack[:], buff[:], "Shift"])
    actions.append([s1,s2,b1, 0])
    stack.append(buff[0])
    del buff[0]
    return stack, buff, actions, arc_dict

In [5]:
train_data, train_sentences, train_tags, train_dependencies = read_data('./data/train-stanford-raw.conll')
dev_data, dev_sentences, dev_tags, dev_dependencies = read_data('./data/dev-stanford-raw.conll')
test_data, test_sentences, test_tags, test_dependencies = read_data('./data/test-stanford-raw.conll')

# create a full set of all the words in our train, test, and dev sets for word2vec model
# in order to avoid unseen words during test and validation
total_sentences = train_sentences + dev_sentences + test_sentences
print('data: ', train_data[:2])
print('words sentences: ', total_sentences[2:4])
print('tags sentences: ', train_tags[2:4])
print('dependencies: ', train_dependencies[2:4])

data:  [['1', 'In', '_', 'IN', 'IN', '_', '45', 'prep', '_', '_', 0], ['2', 'an', '_', 'DT', 'DT', '_', '5', 'det', '_', '_', 0]]
words sentences:  [['rolls-royce', 'motor', 'cars', 'inc.', 'said', 'it', 'expects', 'its', 'u.s.', 'sales', 'to', 'remain', 'steady', 'at', 'about', 'NUM', 'cars', 'in', 'NUM', '.'], ['the', 'luxury', 'auto', 'maker', 'last', 'year', 'sold', 'NUM', 'cars', 'in', 'the', 'u.s.']]
tags sentences:  [['NNP', 'NNP', 'NNPS', 'NNP', 'VBD', 'PRP', 'VBZ', 'PRP$', 'NNP', 'NNS', 'TO', 'VB', 'JJ', 'IN', 'IN', 'CD', 'NNS', 'IN', 'CD', '.'], ['DT', 'NN', 'NN', 'NN', 'JJ', 'NN', 'VBD', 'CD', 'NNS', 'IN', 'DT', 'NNP']]
dependencies:  [['4_nn', '4_nn', '4_nn', '5_nsubj', '0_root', '7_nsubj', '5_ccomp', '10_poss', '10_nn', '12_nsubj', '12_aux', '7_xcomp', '12_acomp', '12_prep', '16_quantmod', '17_num', '14_pobj', '12_prep', '18_pobj', '5_punct'], ['4_det', '4_nn', '4_nn', '7_nsubj', '6_amod', '7_tmod', '0_root', '9_num', '7_dobj', '7_prep', '12_det', '10_pobj']]


In [6]:
action_data, arc_dict = process_data('./data/train-stanford-raw.conll')

In [7]:
print(arc_dict)
for line in action_data[1]:
    print(line)

{'R_conj': 28, 'R_num': 4, 'R_acomp': 42, 'L_number': 79, 'L_num': 3, 'R_pcomp': 60, 'R_poss': 14, 'R_abbrev': 94, 'R_predet': 72, 'R_auxpass': 36, 'R_complm': 64, 'R_dobj': 24, 'L_rcmod': 61, 'R_advmod': 34, 'L_acomp': 41, 'L_conj': 27, 'L_purpcl': 89, 'L_cc': 25, 'R_advcl': 70, 'R_rcmod': 62, 'R_det': 8, 'R_parataxis': 66, 'L_amod': 19, 'L_possessive': 11, 'R_purpcl': 90, 'R_rel': 86, 'R_nsubj': 22, 'R_number': 80, 'L_abbrev': 93, 'L_nsubj': 21, 'L_mark': 67, 'L_xcomp': 45, 'R_pobj': 16, 'R_csubj': 74, 'L_parataxis': 65, 'R_possessive': 12, 'R_appos': 52, 'L_poss': 13, 'R_partmod': 32, 'L_rel': 85, 'L_csubj': 73, 'R_mark': 68, 'R_punct': 10, 'L_partmod': 31, 'L_complm': 63, 'R_ccomp': 48, 'L_auxpass': 35, 'R_attr': 78, 'L_appos': 51, 'L_iobj': 83, 'L_punct': 9, 'L_quantmod': 43, 'R_nn': 6, 'L_cop': 95, 'L_csubjpass': 97, 'L_det': 7, 'R_infmod': 58, 'L_pobj': 15, 'L_advcl': 69, 'R_mwe': 76, 'R_dep': 30, 'Shift': 0, 'L_expl': 87, 'L_advmod': 33, 'L_attr': 77, 'L_aux': 39, 'R_xcomp': 46

In [8]:
##############################
# TF functions
##############################

def length(sequence):
    """
    function that computes the real, unpadded lenghts for every sequence in batch
    """
    used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2))
    length = tf.reduce_sum(used, reduction_indices=1)
    length = tf.cast(length, tf.int32)
    return length

def mlp(_X, _weights, _biases):
    """
    function that defines a multilayer perceptron in the graph
    input shape: parse_steps (=?) x filtered_words (=3) x lstm_output_length (=400)
    output shape: parse_steps (=?) x num_classes
    """
    # ReLU hidden layer (output shape: parse_steps x n_hidden)
    layer_1 = tf.nn.relu(tf.add(tf.einsum('ijk,kl->il', _X, _weights['h']), _biases['b'])) 
    # return output layer (output shape: parse_steps x n_classes)
    return tf.add(tf.einsum('ik,kl->il', layer_1, _weights['out']), _biases['out'])

def embedding_lookup(sentences, max_seq_length, vec_length):
    """
    function that looks up embeddings.
    input: list of sentences, length of sentences, length of word vectors
    output: 3D array of word vectors per sentence 
                (dims #sentences x sentence_length x embedding_size)
    """
    sentence_embeddings = np.empty((0,max_seq_length,vec_length))
    for sentence in sentences:
        word_embeddings = np.empty((0,vec_length))
        for word in sentence:
            word_embeddings = np.vstack([word_embeddings, model[word]])
        if len(sentence) < max_seq_length:
            zero_padding_length = max_seq_length - len(sentence)
            word_embeddings = np.vstack([word_embeddings, np.zeros((zero_padding_length, vec_length))])
        sentence_embeddings = np.append(sentence_embeddings, np.array([word_embeddings]), axis=0)
    return sentence_embeddings

In [9]:
# load model first
model_name = "dep_parser_word2vec_total"
model = word2vec.Word2Vec.load(model_name)

In [10]:
##############################
# create training data using parser
# for each sentence, get embedded representation
# and actions ONLY when needed
##############################

def create_sentence_embeddings(sentences):
    embedded_train_sentences = []
    for sentence in sentences:
        embed = model[sentence]
#         print(embed)
        embedded_train_sentences.append(embed)
    return embedded_train_sentences

In [11]:
# get the word2vec embedding version for all trainings sentences
embedded_train_sentences = create_sentence_embeddings(train_sentences)

In [34]:
def get_output_array(indices_and_actions):
    max_len = 0
    no_sent = 0
    for sent in indices_and_actions:
        no_sent +=1
        if len(sent) > max_len:
            max_len = len(sent)
            print(max_len)
    
    input_indices = np.empty((no_sent,max_len,3),dtype=np.int64) # ,dtype=np.ndarray
    input_indices.fill(-2)

    output_actions = np.empty((no_sent,max_len),dtype=np.int64)
    output_actions.fill(-1)
    
    sentence_count = 0
    for sent in indices_and_actions:
        action_pair_count = 0
        for action_pair in sent:
            output_actions[sentence_count,action_pair_count] = action_pair[-1]
            index_count = 0
            indices = action_pair[:-1]
            for ind in indices:
                input_indices[sentence_count,action_pair_count,index_count] = ind
                index_count += 1
            action_pair_count += 1
        sentence_count += 1
    return input_indices, output_actions

In [38]:
# convert data to tensors
# stack+buffer indices need to be put in a padded numpy array first
input_indices, output_actions = get_output_array(action_data)

# print(input_indices[1])

# now convert to tensor
output_actions_tensor = tf.convert_to_tensor(output_actions)
input_indices_tensor = tf.convert_to_tensor(input_indices)
print(output_actions_tensor.get_shape())
print(input_indices_tensor.get_shape())

10
98
100
116
118
128
136
198
216
238
282
(39832, 282)
(39832, 282, 3)


In [14]:
# create some variables for TF
word2vec_length = model['a'].size
seq_lengths = [len(sentence) for sentence in train_sentences]
max_seq_length = max(seq_lengths)
train_set_length = len(train_sentences)
print("word2vec vector length: ", word2vec_length)
print("max sentence length: ", max_seq_length)
print("no. of training sentences: ", train_set_length)
print("--------------------")
print(type(seq_lengths[0]))

word2vec vector length:  189
max sentence length:  141
no. of training sentences:  39832
--------------------
<class 'int'>


In [15]:
# returns a list with padded np arrays for every word2vec sentence
def get_input_array(unpadded_word2vec_sentences):
    padded_sentences = []
    for unpad_sentence in unpadded_word2vec_sentences:
        unpad_length = unpad_sentence.shape
        pad_array = np.empty((max_seq_length - unpad_length[0], word2vec_length),dtype=np.float64)
        pad_array.fill(-2.)
        padded_sentences.append(np.concatenate((unpad_sentence,pad_array)))
    return padded_sentences

In [16]:
print("padding word2vec sentences... this might take a while")
padded_word2vec_train = get_input_array(embedded_train_sentences)
print("... DONE")
print(len(padded_word2vec_train))

padding word2vec sentences... this might take a while
... DONE
39832


In [40]:
graph = tf.Graph()
with graph.as_default():
    
    # hyperparameters (from Cross & Huang, 2016)
    batch_size = 10
    n_input = 400
    n_hidden = 200
    n_classes = 99 # there are 99 possible actions to take
    lstm_units = 200
    num_epochs = 10
    dropout = 0.5
    L2_penalty = 0.
    rho = 0.99
    epsilon = 1e-07
    learn_rate = 0.01

    # Store layers weight & bias
    weights = {
        'h': tf.Variable(tf.random_normal([n_input, n_hidden], dtype=tf.float64)),
        'out': tf.Variable(tf.random_normal([n_hidden, n_classes], dtype=tf.float64))
    }
    biases = {
        'b': tf.Variable(tf.random_normal([n_hidden], dtype=tf.float64)),
        'out': tf.Variable(tf.random_normal([n_classes], dtype=tf.float64))
    }
    
    # placeholders for input (input tensor uses None as variable sized dimension, 
    # or can use rows of train set length, or batch length might be better)
    lstm_x = tf.placeholder(tf.float64, [batch_size, max_seq_length, word2vec_length])
    # lstm output (1 pass, max sentence length, lstm output vector length)
#     lstm_y = tf.placeholder("float", [None, max_seq_length, n_hidden])
    # mlp input (3 words, 2xlstm output)
#     mlp_x = tf.placeholder("float", [3, 2*n_hidden])
    # mlp output (number of action classes)
#     mlp_y = tf.placeholder("float", [n_classes])
    
    # placeholder for sentence length per batch (one int for each sentence)
    lstm_sent_lengths = tf.placeholder(tf.int64, [batch_size])
    
    # placeholder for indices (batch size, max sentence length, 3 indices)
    parse_indices = tf.placeholder(tf.int64, [batch_size, max_seq_length, 3])
    
    # sentence lengths
    sentence_lengths = tf.placeholder(tf.int64, [batch_size])
    
    # define LSTM cell + dropout wrapper (like Cross & Huang)
    lstm_fw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=lstm_units, state_is_tuple=True)
    lstm_bw_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=lstm_units, state_is_tuple=True)
    cell = tf.nn.rnn_cell.DropoutWrapper(cell=lstm_fw_cell, output_keep_prob=0.5)

    # define bidirectional architecture
    outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=lstm_fw_cell,
        cell_bw=lstm_bw_cell,
        dtype=tf.float64,
        sequence_length=lstm_sent_lengths,
        inputs=lstm_x
    )

    # fw/bw output (num_sequences x max_seq_length x lstm_units)
    output_fw, output_bw = outputs
    print("Forward pass size: ", output_fw.get_shape())

    # concatenate forward & backward outputs per word
    output_lstm = tf.concat(2, outputs)
    print("BiLSTM output shape: ", output_lstm.get_shape())
    
#     for i in range(batch_size):
#         mlp_input_
    print("Parse indices shape: ", parse_indices)
    for i in range(batch_size):
        
        mlp_x = tf.nn.embedding_lookup(output_lstm[i,:,:], parse_indices[i,:,:])

#         for j in range(sentence_lengths[i]):
        output_mlp = mlp(mlp_x, weights, biases)
        ### TODO: ###
        # opsplitsen in zinnen van correcte lengte (padding weg)
        # voor elke plek in elke zin output mlp berekenen
        # outputs in tensor/array als resultaat (kan dit wel?)
        ### TODO: ###
        
        if i < 1:
            print("MLP input shape: ", mlp_x.get_shape())
            print("MLP output shape S{}: ", output_mlp.get_shape())

Forward pass size:  (10, 141, 200)
BiLSTM output shape:  (10, 141, 400)
Parse indices shape:  Tensor("Placeholder_2:0", shape=(10, 141, 3), dtype=int64)
MLP input shape:  (141, 3, 400)
MLP output shape S{}:  (141, 99)


In [41]:
# now use a feed_dict

temp_batch_size = 10

print(padded_word2vec_train[0:temp_batch_size][0].shape)
print(output_actions.shape)
### TODO: ###
# juiste acties zijn nu ints, moeten one-hot vectoren worden met een 1 op de juiste index
# dat wordt gepassd aan session.run

with tf.Session() as session:
    feed_dict_batch = {lstm_sent_lengths: seq_lengths[0:temp_batch_size], lstm_x: padded_word2vec_train[0:temp_batch_size],  parse_indices: input_indices[0:10]}
    result = session.run(output_actions[0:temp_batch_size], feed_dict_batch)
    print(result)


(141, 189)
(39832, 282)


TypeError: Cannot interpret feed_dict key as Tensor: Tensor Tensor("Placeholder_1:0", shape=(10,), dtype=int64) is not an element of this graph.

In [None]:
check_index = 1
print(embedded_train_sentences[check_index].shape)
print(len(embedded_train_sentences[check_index]))
# print(embedded_train_sentences[check_index])
print(type(embedded_train_sentences[check_index][0][0]))
for word in train_sentences[check_index]:
    print(word)
# for emb in embedded_train_sentences[check_index]:
#     print(emb)
# for action in action_data[check_index]:
#     print(action)

In [None]:
#### OUD OUD OUD

##############################
# build graph
##############################

with tf.Graph().as_default():
    
    # hyperparameters (from Cross & Huang, 2016)
    batch_size = 10
    n_input = 400
    n_hidden = 200
    n_classes = 99 # there are 99 possible actions to take
    lstm_units = 200
    num_epochs = 10
    dropout = 0.5
    L2_penalty = 0.
    rho = 0.99
    epsilon = 1e-07
    learn_rate = 0.01

    # Store layers weight & bias
    weights = {
        'h': tf.Variable(tf.random_normal([n_input, n_hidden], dtype=tf.float64)),
        'out': tf.Variable(tf.random_normal([n_hidden, n_classes], dtype=tf.float64))
    }
    biases = {
        'b': tf.Variable(tf.random_normal([n_hidden], dtype=tf.float64)),
        'out': tf.Variable(tf.random_normal([n_classes], dtype=tf.float64))
    }

    # load word2vec model
    model_name = "dep_parser_word2vec_total"
    if not 'model' in locals():
        model = models.word2vec.Word2Vec.load(model_name)
        print("Model loaded from disk")
    else:
        print("Model was already loaded")

    # placeholders for input (input tensor uses None as variable sized dimension, 
    # or can use rows of train set length, or batch length might be better)
    lstm_x = tf.placeholder("float", [None, max_seq_length, word2vec_length])
    # lstm output (1 pass)
    lstm_y = tf.placeholder("float", [None, max_seq_length, n_hidden])
    # mlp input (3 words)
    mlp_x = tf.placeholder("float", [3, 2*n_hidden])
    # mlp output
    mlp_y = tf.placeholder("float", [n_classes])
    
    
    # dummy data:
    # toy sentences (= list of lists of words)
#     sentences = [["the", "by", "an", "on", "the", "in", "an"], ["the", "cat", "sat", "on", "the", "ground"]]
#     toy_sentences = train_sentences[0:30]
#     toy_parses = action_data[0:30]
    # toy parses (= array with an action and word indices)
#     parses = [np.array([[1,0,1,2], 
#                         [0,1,2,3],
#                         [0,1,2,3],
#                         [1,0,1,2]]),
#               np.array([[1,0,1,2], 
#                         [0,1,2,3], 
#                         [3,2,3,4]])]
    
    # look up embeddings of words in sentence
#     toy_embeddings = np.array(embedded_train_sentences[0:30])
#     embeddings = embedding_lookup(toy_sentences, max_seq_length, vec_length)
#     embeddings = embedded_train_sentences[0:30]
#     print("Sentence embedding shape (np-array): ", embeddings.shape)

    # define LSTM cell + dropout wrapper (like Cross & Huang)
    cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=lstm_units, state_is_tuple=True)
    cell = tf.nn.rnn_cell.DropoutWrapper(cell=cell, output_keep_prob=0.5)

    # define bidirectional architecture
    outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=cell,
        cell_bw=cell,
        dtype=tf.float64,
        sequence_length=seq_lengths,
        inputs=lstm_x
    )

    # fw/bw output (num_sequences x max_seq_length x lstm_units)
    output_fw, output_bw = outputs
    print("Forward pass size: ", output_fw.get_shape())

    # concatenate forward & backward outputs per word
    output_lstm = tf.concat(2, outputs)
    print("BiLSTM output shape: ", output_lstm.get_shape())


    # MLP layer filtering LSTM outputs based on parse steps
    batch_cost = 0
    for i in range(0, len(toy_sentences)):
        action = action_data[i]
#         sentence_input_indices = action[0:5]
        print(input_indices)
        output_action_index = action[-1]
        print(output_action_index)
        sentence = toy_sentences[i]
#         print(len(sentence))
        emb = embeddings[i]
#         print(emb.shape())
        parse = toy_parses[i]
        # input: parse_steps (=?) x filtered_words (=3) x lstm_output_length (=400)
        # output: parse_steps (=?) x num_classes
        output_mlp = mlp(tf.gather(emb, parse), weights, biases)
        print("MLP output shape S{}: ".format(i), output_mlp.get_shape())
        cost = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(output_mlp, parse[:,0]))
        batch_cost += cost
    batch_cost /= len(sentences)

    # define optimizer
    global_step = tf.Variable(0, name='global_step', trainable=True)
    optimizer = tf.train.AdadeltaOptimizer(rho=rho, epsilon=epsilon).minimize(batch_cost)
#     train_op = optimizer.train(loss, global_step=global_step)
    correct = tf.nn.in_top_k(embeddings[0], parses[0], 1)
    print(correct)

    
    ##############################
    # start session in graph
    ##############################

    init = tf.initialize_all_variables()

    with tf.Session() as sess:
        sess.run(init)