In [1]:
##############################
# import modules
##############################
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import time

import numpy as np
import tensorflow as tf

from tensorflow.models.rnn.ptb import reader
from gensim import models

In [2]:
##############################
# import modules
#############################


# function that computes the real, unpadded lenghts for every sequence in batch
def length(sequence):
    """
    function that computes the real, unpadded lenghts for every sequence in batch
    """
    used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2))
    length = tf.reduce_sum(used, reduction_indices=1)
    length = tf.cast(length, tf.int32)
    return length

def mlp(_X, _weights, _biases):
    """
    function that defines a multilayer perceptron in the graph
    input shape: parse_steps (=?) x filtered_words (=3) x lstm_output_length (=400)
    """
    # ReLU hidden layer (output shape: parse_steps x n_hidden)
    layer_1 = tf.nn.relu(tf.add(tf.einsum('ijk,kl->il', _X, _weights['h']), _biases['b'])) 
    # return output layer (output shape: parse_steps x n_classes)
    return tf.add(tf.einsum('ik,kl->il', layer_1, _weights['out']), _biases['out'])

def embedding_lookup(sentences, max_seq_length, vec_length):
    """
    function that looks up embeddings.
    input: list of sentences, length of sentences, length of word vectors
    output: 3D array of word vectors per sentence 
                (dims #sentences x sentence_length x embedding_size)
    """
    # TODO: deal with sentences of different lengths
    sentence_embeddings = np.empty((0,max_seq_length,vec_length))
    for sentence in sentences:
        word_embeddings = np.empty((0,vec_length))
        for word in sentence:
            word_embeddings = np.vstack([word_embeddings, model[word]])
        if len(sentence) < max_seq_length:
            zero_padding_length = max_seq_length - len(sentence)
            word_embeddings = np.vstack([word_embeddings, np.zeros((zero_padding_length, vec_length))])
        sentence_embeddings = np.append(sentence_embeddings, np.array([word_embeddings]), axis=0)
    return sentence_embeddings


In [3]:
##############################
# build graph
##############################

# reset graph (and clear namespace)
tf.reset_default_graph()

# for testing purposes:
sess = tf.InteractiveSession()

# hyperparameters (from Cross & Huang, 2016)
batch_size = 10
n_input = 400
n_hidden = 200
n_classes = 50 # TODO: how many classes for the actual data?
lstm_units = 200
num_epochs = 10
dropout = 0.5
L2_penalty = 0.
adadelta_rho = 0.99
adadelta_epsilon = 1e-07


# Store layers weight & bias
weights = {
    'h': tf.Variable(tf.random_normal([n_input, n_hidden], dtype=tf.float64)),
    'out': tf.Variable(tf.random_normal([n_hidden, n_classes], dtype=tf.float64))
}
biases = {
    'b': tf.Variable(tf.random_normal([n_hidden], dtype=tf.float64)),
    'out': tf.Variable(tf.random_normal([n_classes], dtype=tf.float64))
}


# load word2vec model
model_name = "dep_parser_word2vec_total"
model = models.word2vec.Word2Vec.load(model_name)
print("model loaded from disk")


# dummy data:
# toy sentences
sentences = [["the", "by", "an", "on", "the", "in", "an"], ["the", "cat", "sat", "on", "the", "ground"]]
# toy variables from sentences (TODO: should be computed, not fixed) 
vec_length = 189
seq_lengths = [len(sentence) for sentence in sentences]
max_seq_length = max(seq_lengths)
# toy parse (= array with an action and word indices)
# parses = [np.array([[0,-1,-1,0], [0,-1,0,1], [1,0,1,2], [0,1,2,3]]),
#           np.array([[0,-1,-1,0], [0,-1,0,1], [1,0,1,2], [0,1,2,3], [3,2,3,4]])]
parses = [np.array([[1,0,1,2], 
                    [0,1,2,3],
                    [0,1,2,3],
                    [1,0,1,2]]),
          np.array([[1,0,1,2], 
                    [0,1,2,3], 
                    [3,2,3,4]])]


# look up embeddings of words in sentence (added dimension is for the model to understand the structure)
embeddings = embedding_lookup(sentences, max_seq_length, vec_length)
print("Sentence embedding shape (np-array): ", embeddings.shape)


# define LSTM cell + dropout wrapper (like Cross & Huang)
cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=lstm_units, state_is_tuple=True)
cell = tf.nn.rnn_cell.DropoutWrapper(cell=cell, output_keep_prob=0.5)

# define bidirectional architecture
outputs, states = tf.nn.bidirectional_dynamic_rnn(
    cell_fw=cell,
    cell_bw=cell,
    dtype=tf.float64,
    sequence_length=seq_lengths,
    inputs=embeddings
)

# fw/bw output (num_sequences x max_seq_length x lstm_units) and final state ()
output_fw, output_bw = outputs
states_fw, states_bw = states

# concatenate forward & backward outputs per word
output_lstm = tf.concat(2, outputs)

print("BiLSTM output shape: ", output_lstm.get_shape())

sess.run(tf.initialize_all_variables())

# MLP layer
for i in range(0, len(sentences)):
    sentence, parse = output_lstm[i], parses[i]
    # input: parse_steps (=?) x filtered_words (=3) x lstm_output_length (=400)
    output_mlp = mlp(tf.gather(sentence, parse[:,1:]), weights, biases)
    print("MLP output shape S{}: ".format(i), output_mlp.get_shape())
    cost = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(output_mlp, parse[:,0]))



model loaded from disk
Sentence embedding shape (np-array):  (2, 7, 189)
BiLSTM output shape:  (2, 7, 400)
MLP output shape S0:  (4, 50)
MLP output shape S1:  (3, 50)


# After this, it's just stuff I use(d) for building and testing the main files above!
<p><p>

In [3]:
##############################
# WORKING BATCHING EXAMPLE
##############################

# [0, 1, 2, 3, 4 ,...]
x = tf.range(1, 10, name="x")
 
# A queue that outputs 0,1,2,3,...
range_q = tf.train.range_input_producer(limit=5, shuffle=False)
slice_end = range_q.dequeue()
 
# Slice x to variable length, i.e. [0], [0, 1], [0, 1, 2], ....
y = tf.slice(x, [0], [slice_end], name="y")
print(y)
 
# Batch the variable length tensor with dynamic padding
batched_data = tf.train.batch(
    tensors=[y],
    batch_size=5,
    dynamic_pad=True,
    name="y_batch"
)
 
# Run the graph
# tf.contrib.learn takes care of starting the queues for us
res = tf.contrib.learn.run_n({"y": batched_data}, n=1, feed_dict=None)
 
# Print the result
print("Batch shape: {}".format(res[0]["y"].shape))
print(res[0]["y"])

Tensor("y:0", shape=(?,), dtype=int32)
Batch shape: (5, 4)
[[0 0 0 0]
 [1 0 0 0]
 [1 2 0 0]
 [1 2 3 0]
 [1 2 3 4]]


In [4]:
##############################
# BATCHING WORK IN PROGRESS
##############################

tf.reset_default_graph()

batch_size = 3
hidden_units = 200

# dummy data
X = np.random.randn(2, 10, 8)
W = np.random.randn(10, 8, 8)
Y = np.random.randn(10, 6, 8)
Z = np.random.randn(10, 5, 8)
tensors = [W,Y,Z]


# divide sentences over batches, and perform 0-padding
# TODO: how can the data in this tensor have a different sequence length per case?
batched_data = tf.train.batch(
    tensors=[W],
    batch_size=batch_size,
    enqueue_many=True,
    dynamic_pad=True,
    name="y_batch"
)

res = tf.contrib.learn.run_n({"y": batched_data}, n=1, feed_dict=None)

for batch in res[0]["y"]:
    print(batch.shape)
# Print the result
# print("Batch: {}".format(res[0]["y"]))
# print(res[0]["y"])


(8, 8)
(8, 8)
(8, 8)


In [18]:
##############################
# COORDINATION EXAMPLE
##############################
# do NOT run as this does not terminate yet

# # possible format for initializing input
# input_images = tf.constant(data_sets.train.images)
# input_labels = tf.constant(data_sets.train.labels)

# image, label = tf.train.slice_input_producer(
#     [input_images, input_labels], num_epochs=FLAGS.num_epochs)
# label = tf.cast(label, tf.int32)
# images, labels = tf.train.batch(
#     [image, label], batch_size=FLAGS.batch_size)


# Create the graph, etc.
init_op = tf.initialize_all_variables()

# Create a session for running operations in the Graph.
with tf.Session() as sess:
    # Initialize the variables (like the epoch counter).
    sess.run(init_op)

    # Start input enqueue threads.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    try:
        while not coord.should_stop():
            # Run training steps or whatever
            pass

    except tf.errors.OutOfRangeError:
        print('Done training -- epoch limit reached')
    finally:
        # When done, ask the threads to stop.
        coord.request_stop()

    # Wait for threads to finish.
    coord.join(threads)


KeyboardInterrupt: 

In [42]:
# load model:
model_name = "dep_parser_word2vec_total"
model = models.word2vec.Word2Vec.load(model_name)
print("model loaded from disk")


model loaded from disk


In [46]:
# toy sentence
sentence = ["the", "cat", "sat", "on", "the", "ground"]
vec_length = model[sentence[0]].size


def embedding_lookup(sentence, vec_length):
    """
    function that takes a sentence, and returns an 2D np-array of 
    word vectors with dims sentence_length x embedding_size
    """
    sentence_embeddings = np.empty((0,vec_length))
    for word in sentence:
        sentence_embeddings = np.vstack([sentence_embeddings, model[word]])
    return sentence_embeddings


embeddings = embedding_lookup(sentence, vec_length)

print(embeddings.shape)


(6, 189)


In [60]:
a = [np.array([[1,2],[3,4],[5,6]]),np.array([[1,2],[3,4],[5,6]])]

b = tf.pack(a)

b.get_shape()

TensorShape([Dimension(2), Dimension(3), Dimension(2)])