In [1]:
import os
import numpy as np

import tensorflow as tf
from tensorflow.contrib.eager.python import tfe

from utils.data_utils import Corpus
# Import the BasicLSTM written in TF Eager
from utils.basic_lstm import BasicLSTM

  from ._conv import register_converters as _register_converters


In [2]:
# enable eager mode
tf.enable_eager_execution()
tf.set_random_seed(0)
np.random.seed(0)

In [3]:
if not os.path.exists('weights/'):
    os.makedirs('weights/')

# Hyper-parameters
embed_size = 128
rnn_units = 1024
num_epochs = 10
num_samples = 1000  # number of words to be sampled
batch_size = 20
seq_length = 30
learning_rate = 0.002

In [4]:
# dataset loading
corpus = Corpus()
train_corpus = corpus.get_data('../data_ptb/train', batch_size)
vocab_size = len(corpus.dictionary)
num_batches = train_corpus.shape[-1] // seq_length

train_corpus = tf.constant(train_corpus, dtype=tf.int32)

print("Dataset shape : ", train_corpus.shape)
print("Vocabulary size : ", vocab_size)
print("Number of batches : ", num_batches)

Dataset shape :  (20, 46479)
Vocabulary size :  10000
Number of batches :  1549


# RNN Language Model using `BasicLSTM`

This is a stateful model which feeds its own output predictions (a single word from the entire vocabulary) back into its input of the next time step. This is shown to be super useful as a pre-training step for other NLP tasks as shown in the paper [Universal Language Model Fine-tuning for Text Classification](https://arxiv.org/abs/1801.06146), and is generally used for models such as Google's Smart Reply feature in GMail.

For the language model, we have to return the states as well as the sequences from the `BasicLSTM`.

We also have to maintain and utilize the initial states that are managed by the caller now, so we can no longer depend on the general Model.fit() to train our model in these circumstances.

While it doesnt allow multiple layers easily, its speed makes up for this somewhat.

In [5]:
class RNNLanguageModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, rnn_units):
        super(RNNLanguageModel, self).__init__()
        self.units = rnn_units
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        self.enbedding = tf.keras.layers.Embedding(self.vocab_size, self.embedding_size)
        self.lstm = BasicLSTM(self.units, return_states=True, return_sequence=True)
        self.classifier = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, training=None, mask=None, initial_states=None):
        embeds = self.enbedding(inputs)

        output, [h, c] = self.lstm(embeds, initial_states=initial_states)

        # preserve the states
        self.states = [h, c]

        # Reshape output to (batch_size * sequence_length, hidden_size)
        output = tf.reshape(output, [-1, output.shape[2]])

        # Decode hidden states of all time steps
        output = self.classifier(output)

        return output

# Basic LSTM Training
Below, we train a language model RNN using the BasicLSTM we defined.

We perform a bit of maintainance work, where we have to supply the initial state of each epoch to each of the Cells in the RNN, accept the resultant state after each call of the model, and feed those states back as input to the next step.

We also monitor the best training perplexity and save the model only for those epochs where the perplexity is reduced from its previous best.

We then generate sampled text from this trained language model.

This is much faster than the previous method, but is limited in that it is a single layer. 2 or more layers can often provide a real boost to the performance of Language Models. A middle ground method would be to write our own loop for the LSTMCell as shown in (6.3) and hand chain the layers ourselves.

In [6]:
device = '/cpu:0' if tfe.num_gpus() == 0 else '/gpu:0'

with tf.device(device):
    # build model and optimizer
    model = RNNLanguageModel(vocab_size, embed_size, rnn_units)
    optimizer = tf.train.AdamOptimizer(0.001)

    # TF Keras tries to use entire dataset to determine shape without this step when using .fit()
    # Fix = Use exactly one sample from the provided input dataset to determine input/output shape/s for the model
    dummy_x = tf.zeros((1, 1))
    model._set_inputs(dummy_x)

    best_perplexity = 1e6
    saver = tfe.Saver(model.variables)

    if os.path.exists('weights/08_02_rnn_lm/') and tf.train.checkpoint_exists('weights/08_02_rnn_lm/weights.ckpt'):
        saver = tfe.Saver(model.variables)
        saver.restore('weights/08_02_rnn_lm/weights.ckpt')
        print("Restored model !")

    # train loop
    for epoch in range(num_epochs):
        # Set initial hidden and cell states
        initial_states = (tf.zeros([batch_size, rnn_units]), tf.zeros([batch_size, rnn_units]))

        for i in range(0, train_corpus.shape[1] - seq_length, seq_length):
            # Get mini-batch inputs and targets
            inputs = train_corpus[:, i:i + seq_length]
            targets = train_corpus[:, (i + 1):(i + 1) + seq_length]
            targets = tf.reshape(targets, [-1])

            # Forward pass
            with tf.GradientTape() as tape:
                outputs= model(inputs, initial_states=initial_states)
                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=outputs, labels=targets)
                loss = tf.reduce_mean(loss)

            # use only the final state
            h, c = model.states
            initial_states = [h[:, -1, :], c[:, -1, :]]

            # get and clip gradients
            gradients = tape.gradient(loss, model.variables)
            
            with tf.device('/cpu:0'):
                gradients = [tf.cast(g, tf.float64) for g in gradients]  # necessary cast for kernel to exist
                gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                gradients = [tf.cast(g, tf.float32) for g in gradients]  # necessary cast to correct dtype of grads
                
            grad_vars = zip(gradients, model.variables)

            # update weights
            optimizer.apply_gradients(grad_vars, tf.train.get_or_create_global_step())

            step = (i + 1) // seq_length
            if step % 100 == 0:
                perplexity = np.exp(loss.numpy())

                print('Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                      .format(epoch + 1, num_epochs, step, num_batches, loss.numpy(), perplexity))

                if best_perplexity > perplexity:
                    best_perplexity = perplexity
                    saver.save('weights/08_02_rnn_lm/weights.ckpt')
                    print("Perplexity improved. Saving weights...")

    saver = tfe.Saver(model.variables)
    saver.restore('weights/08_02_rnn_lm/weights.ckpt')

    if not os.path.exists('language_model/'):
        os.makedirs('language_model/')

    # evaluation of model
    with open('language_model/sample.txt', 'w') as f:
        # Set intial hidden ane cell states
        initial_states = (tf.zeros([1, rnn_units]), tf.zeros([1, rnn_units]))

        # Select one word id randomly
        prob = tf.ones([1, vocab_size])
        input = tf.multinomial(prob, num_samples=1)

        for i in range(num_samples):
            # Forward propagate RNN
            output = model(input, initial_states=initial_states)

            # use only the final state
            h, c = model.states
            initial_states = [h[:, -1, :], c[:, -1, :]]

            # Sample a word id
            prob = tf.exp(output)
            word_id = tf.multinomial(prob, num_samples=1)[0, 0]

            # Fill input with sampled word id for the next time step
            input = tf.fill(input.shape, word_id)

            # File write
            word = corpus.dictionary.idx2word[word_id.numpy()]
            word = '\n' if word == '<eos>' else word + ' '
            f.write(word)

            if (i + 1) % 100 == 0:
                print('Sampled [{}/{}] words and save to {}'.format(i + 1, num_samples, 'language_model/sample.txt'))



Epoch [1/10], Step[0/1549], Loss: 9.2120, Perplexity: 10016.35
Perplexity improved. Saving weights...
Epoch [1/10], Step[100/1549], Loss: 6.4166, Perplexity: 611.92
Perplexity improved. Saving weights...
Epoch [1/10], Step[200/1549], Loss: 6.4802, Perplexity: 652.08
Epoch [1/10], Step[300/1549], Loss: 6.4195, Perplexity: 613.71
Epoch [1/10], Step[400/1549], Loss: 6.1461, Perplexity: 466.90
Perplexity improved. Saving weights...
Epoch [1/10], Step[500/1549], Loss: 5.7776, Perplexity: 322.98
Perplexity improved. Saving weights...
Epoch [1/10], Step[600/1549], Loss: 5.7695, Perplexity: 320.38
Perplexity improved. Saving weights...
Epoch [1/10], Step[700/1549], Loss: 6.0343, Perplexity: 417.51
Epoch [1/10], Step[800/1549], Loss: 5.7469, Perplexity: 313.23
Perplexity improved. Saving weights...
Epoch [1/10], Step[900/1549], Loss: 5.6528, Perplexity: 285.10
Perplexity improved. Saving weights...
Epoch [1/10], Step[1000/1549], Loss: 5.7026, Perplexity: 299.63
Epoch [1/10], Step[1100/1549], Lo

Epoch [8/10], Step[700/1549], Loss: 3.9145, Perplexity: 50.13
Epoch [8/10], Step[800/1549], Loss: 3.9062, Perplexity: 49.71
Epoch [8/10], Step[900/1549], Loss: 3.6993, Perplexity: 40.42
Epoch [8/10], Step[1000/1549], Loss: 3.8593, Perplexity: 47.43
Epoch [8/10], Step[1100/1549], Loss: 3.9789, Perplexity: 53.46
Epoch [8/10], Step[1200/1549], Loss: 3.8457, Perplexity: 46.79
Epoch [8/10], Step[1300/1549], Loss: 3.6130, Perplexity: 37.08
Epoch [8/10], Step[1400/1549], Loss: 3.4303, Perplexity: 30.89
Epoch [8/10], Step[1500/1549], Loss: 3.8725, Perplexity: 48.06
Epoch [9/10], Step[0/1549], Loss: 3.6570, Perplexity: 38.75
Epoch [9/10], Step[100/1549], Loss: 3.7161, Perplexity: 41.10
Epoch [9/10], Step[200/1549], Loss: 3.8601, Perplexity: 47.47
Epoch [9/10], Step[300/1549], Loss: 3.8076, Perplexity: 45.04
Epoch [9/10], Step[400/1549], Loss: 3.7814, Perplexity: 43.88
Epoch [9/10], Step[500/1549], Loss: 3.3222, Perplexity: 27.72
Perplexity improved. Saving weights...
Epoch [9/10], Step[600/1549

# Print the sampled sentences

In [7]:
with open('language_model/sample_2.txt', 'r') as f:
    for line in f:
        print(line)



the rage he said fleming 's heights paid foster doug <unk> yard themes of the tucson rain 

the telephone mentioned in the microprocessor conversation driving entirely <unk> 

the veto geography was a households schools day baldwin <unk> granted specter consider a <unk> aggressive inherent in the <unk> cypress bridge 

the planners sidelines technological <unk> rises in the ann announcement nikko saks poorer sweden lloyd <unk> royal conduct environmentally <unk> 

the institutes of the intensity of the <unk> apartheid is i does n't seizure steering stocks 

the wherever irving <unk> casual <unk> fame produced by <unk> <unk> industry his <unk> teller resulting from the <unk> spencer racked up to the took permits <unk> 

the ironic pervasive supplier tabloid xerox 's creatures focusing exist in the alternative grew to interpublic 

the enactment of the combustion twelve demanded number of <unk> plate pursuit of the soften clinical cancer 

the citibank mlx toy postpone the fdic 's offe