In [24]:
"""
Imports
"""
import numpy as np
import tensorflow as tf
%matplotlib inline
import matplotlib.pyplot as plt
import time
import os
import pickle
#from tensorflow.models.rnn.ptb import reader

<h1> Prepare the data batches </h1>

This notebook can be used to train on either abstracts or titles. The only difference is the dimension of the GloVe embeddings: 100 for titles, 512 for abstracts.

The batch iterator below is adapted from: https://github.com/petewarden/tensorflow_makefile/blob/master/tensorflow/models/rnn/ptb/reader.py

In [25]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
 
import collections
 
def ptb_iterator(raw_data, batch_size, num_steps):
    """Yield consecutive (inputs, targets) mini-batches from a token-id sequence.

    The sequence is cut into `batch_size` contiguous rows of equal length
    (trailing tokens that do not fill a row are dropped). Each row is then
    walked in windows of `num_steps` tokens; the targets are the inputs
    shifted one position to the right.

    Args:
        raw_data: sequence of integer token ids.
        batch_size: number of rows in every batch.
        num_steps: number of time steps in every batch.

    Yields:
        Pairs `(x, y)` of int32 arrays of shape `[batch_size, num_steps]`.

    Raises:
        ValueError: if the rows are too short to hold a single window.
            Because this is a generator, the error surfaces on the first
            iteration rather than at call time.
    """
    tokens = np.array(raw_data, dtype=np.int32)

    row_len = len(tokens) // batch_size
    # Keep only the tokens that fill complete rows, one row per batch entry.
    rows = tokens[:batch_size * row_len].reshape(batch_size, row_len)

    # One token per row is held back so the last window still has a target.
    windows_per_epoch = (row_len - 1) // num_steps
    if windows_per_epoch == 0:
        raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

    for start in range(0, windows_per_epoch * num_steps, num_steps):
        stop = start + num_steps
        yield (rows[:, start:stop], rows[:, start + 1:stop + 1])

<h1> Read the embeddings </h1>

The embeddings file opened below (`vectors_abstract.txt`) must contain the GloVe embeddings that match the corpus being trained on (abstracts or titles).

In [26]:
# Parse the embeddings text file: the first whitespace-separated field of each
# line is the token, the remaining fields are the components of its vector.
l_key, l_val = [], []
with open("vectors_abstract.txt") as f:
    for line in f:
        fields = line.split()
        l_key.append(fields[0])
        l_val.append(np.asarray([float(field) for field in fields[1:]]))

# Token -> embedding vector lookup (if a token repeats, its last vector wins).
dictionary = dict(zip(l_key, l_val))

`abstract_file` contains the pickled abstract corpus; to train on titles, load the title corpus instead.

In [27]:
# Load the pickled corpus. NOTE: pickle can execute arbitrary code, so only
# load files that come from a trusted source.
with open('abstract_file', 'rb') as f:
    abs_ph_filtered = pickle.load(f)

# Flatten the nested corpus (a sequence of word sequences) into one word list.
datawords = []
for sublist in abs_ph_filtered:
    datawords.extend(sublist)

In [28]:
# Build the vocabulary and the word <-> index lookup tables.
words_unique = set(datawords)
vocab = words_unique
vocab_size = len(vocab)

# Enumerate the words in sorted order. Iterating the set directly gives an
# arbitrary order that can change from one kernel session to the next, which
# would silently re-map every index and make previously saved checkpoints
# (embedding rows, softmax columns) inconsistent with the rebuilt vocabulary.
idx_to_vocab = dict(enumerate(sorted(vocab)))
vocab_to_idx = {word: idx for idx, word in idx_to_vocab.items()}

# Encode the whole corpus as a flat list of vocabulary indices.
data_temp = [vocab_to_idx[word] for word in datawords]

# The nested corpus is no longer needed; release its memory.
del abs_ph_filtered

In [29]:
# Stack the embedding vectors so that row i holds the vector of idx_to_vocab[i].
embedding_rows = [dictionary[idx_to_vocab[i]] for i in range(vocab_size)]
emb_matrix = np.matrix(embedding_rows)

In [30]:
# Expose the index-encoded corpus under the global name that gen_epochs reads.
data=data_temp

<h1> Build the TF graph </h1>

In [31]:
"""
Load and process data, utility functions
"""

def gen_epochs(n, num_steps, batch_size):
    """Yield one fresh batch iterator over the global `data` per epoch.

    Args:
        n: number of epochs to generate.
        num_steps: time steps per batch, forwarded to `ptb_iterator`.
        batch_size: rows per batch, forwarded to `ptb_iterator`.

    Yields:
        The generator returned by `ptb_iterator` for each of the `n` epochs.
    """
    for _ in range(n):
        epoch_batches = ptb_iterator(data, batch_size, num_steps)
        yield epoch_batches

def reset_graph():
    """Close a lingering global `sess`, if any, and reset the default TF graph."""
    existing_session = globals().get('sess')
    if existing_session:
        existing_session.close()
    tf.reset_default_graph()

def train_network(g, num_epochs, num_steps = 200, batch_size = 32, verbose = True, save=False):
    """Train the graph `g` and return the average loss of every epoch.

    Args:
        g: dict as returned by `build_graph`; the entries 'x', 'y',
            'init_state', 'final_state', 'total_loss', 'train_step' and
            'saver' are used.
        num_epochs: number of passes over the corpus.
        num_steps: time steps per batch; the batches are fed to `g['x']` and
            `g['y']`, so this must match the value the graph was built with.
        batch_size: rows per batch; same constraint as `num_steps`.
        verbose: if truthy, print the average loss on epochs 1, 6, 11, ...
        save: checkpoint path prefix. When it is a string, an intermediate
            checkpoint `<save>_<idx>` is written on epochs 11, 21, ... and a
            final checkpoint `<save>` after the last epoch. Any non-string
            value (the default `False`) disables checkpointing.

    Returns:
        List with the average training loss of each epoch.
    """
    checkpointing = isinstance(save, str)
    if checkpointing:
        # The saver does not create missing directories, so make sure the
        # parent of the checkpoint prefix exists before training starts.
        save_dir = os.path.dirname(save)
        if save_dir and not os.path.isdir(save_dir):
            os.makedirs(save_dir)

    # NOTE(review): `g` is built before this function is called, so this
    # graph-level seed presumably does not influence the already-created
    # initializers/dropout ops -- confirm against the tf.set_random_seed docs.
    tf.set_random_seed(2345)
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        training_losses = []

        for idx, epoch in enumerate(gen_epochs(num_epochs, num_steps, batch_size)):
            training_loss = 0
            steps = 0
            # The recurrent state is carried across batches within an epoch
            # and starts from the graph's own initial state at every epoch.
            training_state = None
            for X, Y in epoch:
                steps += 1

                feed_dict = {g['x']: X, g['y']: Y}
                if training_state is not None:
                    feed_dict[g['init_state']] = training_state
                training_loss_, training_state, _ = sess.run(
                    [g['total_loss'], g['final_state'], g['train_step']],
                    feed_dict)
                training_loss += training_loss_

            epoch_loss = training_loss / steps
            if verbose and idx % 5 == 1:
                print("Average training loss for Epoch", idx, ":", epoch_loss)
            training_losses.append(epoch_loss)

            # Only checkpoint when a path was given; previously the default
            # save=False reached `save + '_'` here and raised a TypeError.
            if checkpointing and idx > 1 and idx % 10 == 1:
                g['saver'].save(sess, save + '_' + str(idx))

        if checkpointing:
            g['saver'].save(sess, save)

    return training_losses

The embedding size is 512 for abstracts (which is probably too big); use 100 for titles.

In [86]:
def build_graph(
    cell_type = None,
    num_weights_for_custom_cell = 5,
    state_size = 512,
    num_classes = vocab_size,
    batch_size = 32,
    num_steps = 200,
    num_layers = 2,
    build_with_dropout=True,
    temperature = 1,
    learning_rate = 5e-4):
    """Build the word-level RNN language-model graph on a fresh default graph.

    The embedding variable is initialised from the global `emb_matrix`.

    Args:
        cell_type: 'Custom', 'GRU', 'LSTM' or 'LN_LSTM'; any other value
            (including the default None) selects a BasicRNNCell.
        num_weights_for_custom_cell: forwarded to `CustomCell` when
            `cell_type == 'Custom'`.
        state_size: number of hidden units of every recurrent layer.
        num_classes: size of the softmax output. NOTE: the default is the
            value `vocab_size` had when this function was defined; re-run
            this definition after changing the vocabulary.
        batch_size: first dimension of the input/label placeholders.
        num_steps: second dimension of the input/label placeholders.
        num_layers: number of stacked recurrent layers.
        build_with_dropout: if True, apply dropout (keep probability 0.8) to
            the input of every layer and to the output of the stack.
        temperature: divisor applied to the logits; it affects both the
            softmax predictions and the training loss.
        learning_rate: learning rate of the Adam optimizer.

    Returns:
        dict with the placeholders ('x', 'y'), the recurrent states
        ('init_state', 'final_state'), 'total_loss', 'train_step', the
        softmax predictions ('preds') and a 'saver'.
    """
    reset_graph()

    x = tf.placeholder(tf.int32, [batch_size, num_steps], name='input_placeholder')
    y = tf.placeholder(tf.int32, [batch_size, num_steps], name='labels_placeholder')

    dropout = tf.constant(0.8)

    init_emb = tf.constant(emb_matrix.astype(np.float32))
    embeddings = tf.get_variable('embedding_matrix', initializer=init_emb)
    rnn_inputs = tf.nn.embedding_lookup(embeddings, x)

    def _make_layer_cell():
        # Build the (optionally dropout-wrapped) recurrent cell of one layer.
        if cell_type == 'Custom':
            layer_cell = CustomCell(state_size, num_weights_for_custom_cell)
        elif cell_type == 'GRU':
            layer_cell = tf.nn.rnn_cell.GRUCell(state_size)
        elif cell_type == 'LSTM':
            layer_cell = tf.nn.rnn_cell.LSTMCell(state_size, state_is_tuple=True)
        elif cell_type == 'LN_LSTM':
            layer_cell = LayerNormalizedLSTMCell(state_size)
        else:
            layer_cell = tf.nn.rnn_cell.BasicRNNCell(state_size)

        if build_with_dropout:
            layer_cell = tf.nn.rnn_cell.DropoutWrapper(layer_cell, input_keep_prob=dropout)
        return layer_cell

    # Every layer gets its own cell object. Repeating one object with
    # `[cell] * num_layers` makes newer TensorFlow releases either reject the
    # graph or share one set of weights between all layers, and it breaks as
    # soon as the embedding size differs from `state_size`.
    layer_cells = [_make_layer_cell() for _ in range(num_layers)]

    if cell_type in ('LSTM', 'LN_LSTM'):
        cell = tf.nn.rnn_cell.MultiRNNCell(layer_cells, state_is_tuple=True)
    else:
        cell = tf.nn.rnn_cell.MultiRNNCell(layer_cells)

    if build_with_dropout:
        cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=dropout)

    init_state = cell.zero_state(batch_size, tf.float32)
    rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, initial_state=init_state)

    with tf.variable_scope('softmax'):
        W = tf.get_variable('W', [state_size, num_classes])
        b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))

    # Flatten the batch and time dimensions so that a single matmul scores
    # every position; the labels are flattened the same way.
    rnn_outputs = tf.reshape(rnn_outputs, [-1, state_size])
    y_reshaped = tf.reshape(y, [-1])

    logits = (tf.matmul(rnn_outputs, W) + b)/temperature

    predictions = tf.nn.softmax(logits)

    total_loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y_reshaped))
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)

    return dict(
        x = x,
        y = y,
        init_state = init_state,
        final_state = final_state,
        total_loss = total_loss,
        train_step = train_step,
        preds = predictions,
        saver = tf.train.Saver()
    )

<h1> Training </h1>

In [None]:
# Train a two-layer GRU model and report the wall-clock time and final loss.
num_epochs = 15
num_steps = 30  # must be identical for the graph and for the batch iterator

g = build_graph(cell_type='GRU', num_steps=num_steps)
t = time.time()
losses = train_network(g, num_epochs, num_steps=num_steps, save="saves/abstract_epochs")
# Report the number of epochs actually run (the message used to say 20).
print("It took", time.time() - t, "seconds to train for", num_epochs, "epochs.")
print("The average loss on the final epoch was:", losses[-1])