In [1]:
%%capture
%load_ext autoreload
%autoreload 2
%matplotlib inline
#! SETUP 1 - DO NOT CHANGE, MOVE NOR COPY
import sys, os
_snlp_book_dir = "../../../../../"
sys.path.append(_snlp_book_dir)
# docker image contains tensorflow 0.10.0rc0. We will support execution of only that version!
import statnlpbook.nn as nn

import tensorflow as tf
import numpy as np

In [2]:
#! SETUP 2 - DO NOT CHANGE, MOVE NOR COPY
data_path = _snlp_book_dir + "data/nn/"
data_train = nn.load_corpus(data_path + "train.tsv")
data_dev = nn.load_corpus(data_path + "dev.tsv")
assert(len(data_train) == 45502)

In [3]:
import numpy as np
import os
import tensorflow as tf
import re

# data loading
def load_corpus(filename):
    data = []
    with open(filename, "r") as f:
        for line in f.readlines():
            splits = [x.strip() for x in line.split("\t")]
            current_story = splits[0:5]
            current_order = list(int(elem) for elem in splits[5:])
            instance = {"story": current_story, "order": current_order}
            data.append(instance)
    return data


# tokenisation
def tokenize(input):
    input = string = re.sub('[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”！，。？、~@#￥%……&*（）]+', " ",input)
    return input.split(' ')


# preprocessing pipeline, used to load the data intro a structure required by the model
def pipeline(data, vocab=None, max_sent_len_=None):
    is_ext_vocab = True
    if vocab is None:
        is_ext_vocab = False
        vocab = {'<PAD>': 0, '<OOV>': 1}

    max_sent_len = -1
    data_sentences = []
    data_orders = []
    data_orders_reverse = []
    for instance in data:
        sents = []
        for sentence in instance['story']:
            sent = []
            tokenized = tokenize(sentence)
            for token in tokenized:
                #token = token.lower()
                if not is_ext_vocab and token not in vocab:
                    vocab[token] = len(vocab)
                if token not in vocab:
                    token_id = vocab['<OOV>']
                else:
                    token_id = vocab[token]
                sent.append(token_id)
            if len(sent) > max_sent_len:
                max_sent_len = len(sent)
            sents.append(sent)
        data_sentences.append(sents)
        data_orders.append(instance['order'])
        data_orders_reverse.append(instance['order'][::-1])

    if max_sent_len_ is not None:
        max_sent_len = max_sent_len_
    out_sentences = np.full([len(data_sentences), 5, max_sent_len], vocab['<PAD>'], dtype=np.int32)

    for i, elem in enumerate(data_sentences):
        for j, sent in enumerate(elem):
            out_sentences[i, j, 0:len(sent)] = sent

    out_orders = np.array(data_orders, dtype=np.int32)
    out_orders_reverse = np.array(data_orders_reverse, dtype=np.int32)

    return out_sentences, out_orders, out_orders_reverse,vocab


# displaying the loaded data
def show_data_instance(data_stories, data_orders, vocab, num_story):
    inverted_vocab = {value: key for key, value in vocab.items()}
    print('Input:\n Story:')
    story_example = {}
    for i, elem in enumerate(data_stories[num_story]):
        x = list(inverted_vocab[ch] if ch in inverted_vocab else '<OOV>'
                 for ch in elem if ch != 0)
        story_example[data_orders[num_story][i]] = " ".join(x)
        print(' '," ".join(x))
    print(' Order:\n ', data_orders[num_story])
    print('\nDesired story:')
    for (k, v) in sorted(story_example.items()):
        print(' ',v)


# accuracy calculation
def calculate_accuracy(orders_gold, orders_predicted):
    num_correct = np.sum(orders_predicted == orders_gold)
    num_total =  orders_gold.shape[0] * 5
    return num_correct / num_total


# save the model params to the hard drive
def save_model(session):
    if not os.path.exists('./model/'):
        os.mkdir('./model/')
    saver = tf.train.Saver()
    saver.save(session, './model/model.checkpoint')


In [4]:
train_stories, train_orders, train_orders_reverse, vocab = pipeline(data_train)

In [12]:
show_data_instance(train_stories,train_orders,vocab,0)

Input:
 Story:
  His parents understood and decided to make a change 
  The doctors told his parents it was unhealthy 
  Dan was overweight as well 
  Dan s parents were overweight 
  They got themselves and Dan on a diet 
 Order:
  [3 2 1 0 4]

Desired story:
  Dan s parents were overweight 
  Dan was overweight as well 
  The doctors told his parents it was unhealthy 
  His parents understood and decided to make a change 
  They got themselves and Dan on a diet 


In [6]:
# get the length of the longest sentence
max_sent_len = train_stories.shape[2]
# convert dev set to integer IDs, based on the train vocabulary and max_sent_len
dev_stories, dev_orders, dev_order_reverse, _ = pipeline(data_dev, vocab=vocab, max_sent_len_=max_sent_len)

In [7]:
### MODEL PARAMETERS ###
target_size = 5
vocab_size = len(vocab)
input_size = 10
# n = len(train_stories)
output_size = 5


rnn_size= 200
num_of_layers = 2

In [8]:
story = tf.placeholder(tf.int64, [None, None, None], "story")
order = tf.placeholder(tf.int64, [None, None], "order")
order_reverse = tf.placeholder(tf.int64, [None, None], "order_reverse")

batch_size = tf.shape(story)[0]

sentences = [tf.reshape(x, [batch_size, -1]) for x in tf.split(axis=1, num_or_size_splits=5, value=story)]  # 5 times [batch_size x max_length]

initializer = tf.random_uniform_initializer(-0.1, 0.1)
embeddings = tf.get_variable("W", [vocab_size, input_size], initializer=initializer)

sentences_embedded = [tf.nn.embedding_lookup(embeddings, sentence) for sentence in sentences]
hs1 = [tf.reduce_sum(sentence, 1) for sentence in sentences_embedded] # 5 times [batch_size x input_size]
hs2 = hs1[::-1]  #5 times [batch_size x input_size]

# Model 1
# encoder
lstm_cell = tf.contrib.rnn.BasicLSTMCell(rnn_size)
initial_state1 = state1 = lstm_cell.zero_state(batch_size,tf.float32)

for i in range(5):
    encoder_output1, encoder_state1 = lstm_cell(hs1[i], state1)
# decoder
decoder_state1 = encoder_state1
output1 = []
for i in range(5):
    decoder_output1, decoder_state1 = lstm_cell(hs1[i], decoder_state1)
    output1.append(tf.contrib.layers.fully_connected(decoder_output1, target_size, tf.tanh))

logits_flat1 = tf.stack(output1, axis=1)
logits1 = tf.reshape(logits_flat1, [-1, 5, target_size]) 
loss1 = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = logits1, labels=order))

# Model2
initial_state2 = state2 = lstm_cell.zero_state(batch_size,tf.float32)

for i in range(5):
    encoder_output2, encoder_state2 = lstm_cell(hs2[i], state2)
# decoder
decoder_state2 = encoder_state2
output2 = []
for i in range(5):
    decoder_output2, decoder_state2 = lstm_cell(hs2[i], decoder_state2)
    output2.append(tf.contrib.layers.fully_connected(decoder_output2, target_size,tf.tanh))

logits_flat2 = tf.stack(output2, axis=1)
logits2 = tf.reshape(logits_flat2, [-1, 5, target_size]) 
loss2 = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = logits2, labels=order_reverse))

# Predict
unpacked_logits1 = [tensor for tensor in tf.unstack(logits1, axis=1)]
unpacked_logits2 = [tensor for tensor in tf.unstack(logits2, axis=1)]
combine_logits = [ unpacked_logits2[-1], unpacked_logits2[-2], unpacked_logits1[-3], unpacked_logits1[-2], unpacked_logits1[-1]]
softmaxes = [tf.nn.softmax(tensor) for tensor in combine_logits]
softmaxed_logits = tf.stack(softmaxes, axis=1)
predict = tf.argmax(softmaxed_logits, 2)

In [9]:
opt_op1 = tf.train.AdamOptimizer(0.001).minimize(loss1)
opt_op2 = tf.train.AdamOptimizer(0.001).minimize(loss2)

In [26]:
import random

BATCH_SIZE = 25
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    n = train_stories.shape[0]

    for epoch in range(5):
        print('----- Epoch', epoch, '-----')
        total_loss1 = 0
        total_loss2 = 0
        for i in range(n // BATCH_SIZE):
            batch_list = random.sample(range(len(train_stories)),BATCH_SIZE)
            inst_story = train_stories[batch_list]
            inst_order = train_orders[batch_list]
            inst_order_reverse = train_orders_reverse[batch_list]
            feed_dict = {story: inst_story, order: inst_order,order_reverse:inst_order_reverse}
            _,_, current_loss1, current_loss2 = sess.run([opt_op1,opt_op2, loss1,loss2], feed_dict=feed_dict)
            total_loss1 += current_loss1
            total_loss2 += current_loss2

        print(' Model1 Train loss:', total_loss1 / n)
        print(' Model2 Train loss:', total_loss2 / n)
        '''
        train_feed_dict = {story: train_stories, order: train_orders}
        train_predicted = sess.run(predict, feed_dict=train_feed_dict)
        train_accuracy = nn.calculate_accuracy(train_orders, train_predicted)
        print(' Train accuracy:', train_accuracy)
        '''
        dev_feed_dict = {story: dev_stories, order: dev_orders,order_reverse:dev_order_reverse}
        dev_predicted = sess.run(predict, feed_dict=dev_feed_dict)
        dev_accuracy = nn.calculate_accuracy(dev_orders, dev_predicted)
        print(' Dev accuracy:', dev_accuracy)

        
    
    nn.save_model(sess)

----- Epoch 0 -----
 Model1 Train loss: 6.22957585916
 Model2 Train loss: 6.22701385828
 Dev accuracy: 0.518439337253
----- Epoch 1 -----
 Model1 Train loss: 5.41817140783
 Model2 Train loss: 5.41402065838
 Dev accuracy: 0.534901122394
----- Epoch 2 -----
 Model1 Train loss: 5.24858183771
 Model2 Train loss: 5.24140199178
 Dev accuracy: 0.52859433458
----- Epoch 3 -----
 Model1 Train loss: 5.1038672693
 Model2 Train loss: 5.10497755562
 Dev accuracy: 0.53115980759
----- Epoch 4 -----
 Model1 Train loss: 4.95824420988
 Model2 Train loss: 4.94460008387
 Dev accuracy: 0.537359700695
