# RNN example with TensorFlow

In this example, we build a language model made of stacked LSTMs and train it on the Penn Treebank (PTB) dataset. This notebook is based on the official TensorFlow tutorial on language modeling.

## Procedures

This example takes the following steps:

1. Import packages
2. Prepare dataset
3. Prepare model, optimizer, and parameter initializer
4. Run training loop
5. Save models

## Preparation

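Before executing the code, please download the dataset from [Mikolov's website](http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz) and decompress it in the working directory, so that the `simple-examples/data/` directory read by the code below exists. The code also imports a `reader` module (presumably `reader.py` from the same TensorFlow tutorial), which must be importable from this notebook.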

## Code

### 1. Import packages

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

import reader

### 2. Prepare dataset

The figure below shows how a minibatch is created in this example. The raw input is a sequence of integers, each of which represents a word ID. We first reshape it into a 2-dimensional matrix with as many rows as the minibatch size. Then we extract the minibatch of feature vectors from columns ``[i : i+L]`` and the minibatch of target vectors from columns ``[i+1 : i+L+1]``, where ``i`` is the iteration count and ``L`` is the BPTT length.

![How to create minibatch](../image/tf_rnn_minibatch.png)
Fig. How to create a minibatch.

In [2]:
class PTBInput(object):
    """Minibatch input for one split of the PTB data.

    Args:
        config: Object providing ``batch_size`` and ``num_steps``.
        data: Sequence of word IDs for this split.
        name: Name forwarded to ``reader.ptb_producer``.

    Attributes:
        batch_size: Number of rows in a minibatch (``config.batch_size``).
        num_steps: Number of time steps per minibatch (``config.num_steps``).
        epoch_size: Number of minibatches that make up one epoch.
        input_data: First value returned by ``reader.ptb_producer``.
        targets: Second value returned by ``reader.ptb_producer``.
    """

    def __init__(self, config, data, name):
        rows = config.batch_size
        steps = config.num_steps

        self.batch_size = rows
        self.num_steps = steps

        # The data is laid out as ``rows`` rows of equal length. One column
        # is held back before counting windows of ``steps`` columns
        # (presumably because the targets are the inputs shifted by one).
        row_length = len(data) // rows
        self.epoch_size = (row_length - 1) // steps

        batches = reader.ptb_producer(data, rows, steps, name=name)
        self.input_data, self.targets = batches

# Load the PTB corpus from the decompressed archive. The call returns four
# values: the first three are used as the training, validation, and test
# data; the fourth is not needed in this notebook and is discarded.
raw_data = reader.ptb_raw_data('simple-examples/data/')
train_data, valid_data, test_data, _ = raw_data

### 3. Prepare model, optimizer, and parameter initializer

``PTBModel`` defines the RNN architecture and the optimizer.

In [3]:
class PTBModel(object):
    """Stacked-LSTM language model for the PTB dataset.

    The constructor builds the forward graph (word embedding lookup, a
    multi-layer LSTM unrolled over ``num_steps`` time steps, and a linear
    projection to vocabulary logits) together with the cost. When
    ``is_training`` is true it additionally builds gradient clipping, a
    gradient-descent update op, and the ops used to change the learning rate.

    Args:
        is_training: If false, only the forward graph and the cost are built.
            The ``lr`` and ``train_op`` properties and ``assign_lr`` are then
            unavailable, because the attributes behind them are never set.
        config: Object providing ``hidden_size``, ``vocab_size``,
            ``num_layers`` and, when training, ``max_grad_norm``.
        input_: Object providing ``batch_size``, ``num_steps``,
            ``input_data`` and ``targets`` (e.g. a ``PTBInput``).
    """

    def __init__(self, is_training, config, input_):
        self._input = input_

        batch_size = input_.batch_size
        num_steps = input_.num_steps
        size = config.hidden_size
        vocab_size = config.vocab_size

        def lstm_cell():
            # A fresh cell object is created for every layer.
            return tf.contrib.rnn.BasicLSTMCell(
                size, forget_bias=0.0, state_is_tuple=True)
        cell = tf.contrib.rnn.MultiRNNCell(
            [lstm_cell() for _ in range(config.num_layers)],
            state_is_tuple=True)

        self._initial_state = cell.zero_state(batch_size, tf.float32)

        # Forward propagation
        # Word embedding (the embedding matrix and lookup are pinned to CPU)
        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                "embedding", [vocab_size, size], dtype=tf.float32)
            inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

        # RNN: split the time axis into a list of per-step inputs
        inputs = tf.unstack(inputs, num=num_steps, axis=1)
        outputs, state = tf.contrib.rnn.static_rnn(cell, inputs,
                                                  initial_state=self._initial_state)

        # Linear
        # ``tf.concat_v2`` was a transitional name that is missing from later
        # TensorFlow releases. Use it when it exists (unchanged behavior) and
        # otherwise fall back to ``tf.concat``, which takes the same
        # ``(values, axis)`` arguments in the releases where it is gone.
        concat = getattr(tf, "concat_v2", tf.concat)
        output = tf.reshape(concat(outputs, 1), [-1, size])
        softmax_w = tf.get_variable(
            "softmax_w", [size, vocab_size], dtype=tf.float32)
        softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=tf.float32)
        logits = tf.matmul(output, softmax_w) + softmax_b

        # Calculate loss: every position is weighted equally (weights of 1),
        # and the summed loss is normalized by the batch size only.
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(input_.targets, [-1])],
            [tf.ones([batch_size * num_steps], dtype=tf.float32)])
        self._cost = cost = tf.reduce_sum(loss) / batch_size
        self._final_state = state

        # Evaluation-only models stop here: no optimizer ops are created.
        if not is_training:
            return

        # Backward propagation
        # The learning rate is a non-trainable variable so that it can be
        # changed between epochs without rebuilding the graph.
        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          config.max_grad_norm)

        # Parameter update
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())

        # Adjustment of learning rate: feed the new value through a
        # placeholder and assign it to the learning-rate variable.
        self._new_lr = tf.placeholder(
            tf.float32, shape=[], name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)

    def assign_lr(self, session, lr_value):
        """Set the learning rate to ``lr_value`` by running the assign op.

        Only usable on a model built with ``is_training=True``.
        """
        session.run(self._lr_update, feed_dict={self._new_lr: lr_value})

    @property
    def input(self):
        """The input object passed to the constructor."""
        return self._input

    @property
    def initial_state(self):
        """Zero state of the stacked LSTM for the model's batch size."""
        return self._initial_state

    @property
    def cost(self):
        """Summed sequence loss divided by the batch size."""
        return self._cost

    @property
    def final_state(self):
        """LSTM state after the last unrolled time step."""
        return self._final_state

    @property
    def lr(self):
        """Learning-rate variable (training models only)."""
        return self._lr

    @property
    def train_op(self):
        """Op applying the clipped gradients (training models only)."""
        return self._train_op

In [4]:
# Hyperparameters for the "small" model
class SmallConfig(object):
    """Hyperparameter set read by the input pipeline, model and training loop."""

    # Model architecture
    vocab_size = 10000
    hidden_size = 200
    num_layers = 2

    # Minibatch geometry: rows per minibatch and unrolled time steps
    batch_size = 20
    num_steps = 20

    # Range of the uniform parameter initializer and gradient-norm clip value
    init_scale = 0.1
    max_grad_norm = 5

    # Learning-rate schedule: the rate stays at ``learning_rate`` for the
    # first ``max_epoch`` epochs and is then multiplied by ``lr_decay`` every
    # epoch, for ``max_max_epoch`` epochs in total.
    learning_rate = 1.0
    lr_decay = 0.5
    max_epoch = 4
    max_max_epoch = 13


# Used for the training and validation models.
config = SmallConfig()

# Used for the test model, which processes a single word per step.
eval_config = SmallConfig()
eval_config.num_steps = eval_config.batch_size = 1

We set up the models for training, validation, and testing with ``PTBInput`` and ``PTBModel``.

In [5]:
# Uniform initializer over [-init_scale, init_scale]; it is passed to every
# "Model" variable scope below as that scope's initializer.
initializer = tf.random_uniform_initializer(-config.init_scale,
                                            config.init_scale)

# Set up the model for training
with tf.name_scope("Train"):
    train_input = PTBInput(config=config, data=train_data, name="TrainInput")
    # reuse=None: this is the first use of the "Model" scope, so the model
    # variables are created here.
    with tf.variable_scope("Model", reuse=None, initializer=initializer):
        m = PTBModel(is_training=True, config=config, input_=train_input)
    tf.summary.scalar("Training_Loss", m.cost)
    tf.summary.scalar("Learning_Rate", m.lr)

# Set up the model for validation
with tf.name_scope("Valid"):
    valid_input = PTBInput(config=config, data=valid_data, name="ValidInput")
    # reuse=True: the parameters are shared with the training model.
    with tf.variable_scope("Model", reuse=True, initializer=initializer):
        mvalid = PTBModel(is_training=False, config=config, input_=valid_input)
    tf.summary.scalar("Validation_Loss", mvalid.cost)

# Set up the model for testing (uses eval_config: batch size 1, one step)
with tf.name_scope("Test"):
    test_input = PTBInput(config=eval_config, data=test_data, name="TestInput")
    # reuse=True: the parameters are shared with the training model.
    with tf.variable_scope("Model", reuse=True, initializer=initializer):
        mtest = PTBModel(is_training=False, config=eval_config,
                         input_=test_input)

### 4. Run training loop and 5. Save models

In [6]:
def run_epoch(session, model, eval_op=None):
    """Run ``model`` over one epoch of its input and return the perplexity.

    Args:
        session: Session object whose ``run`` method evaluates the graph.
        model: Model exposing ``initial_state``, ``final_state``, ``cost``
            and ``input`` (with ``epoch_size`` and ``num_steps``).
        eval_op: Optional extra op fetched on every step (the training loop
            passes the model's training op here). Its result is ignored.

    Returns:
        ``exp(accumulated cost / accumulated number of time steps)``.
    """
    total_cost = 0.0
    total_steps = 0

    requested = {"cost": model.cost, "final_state": model.final_state}
    if eval_op is not None:
        requested["eval_op"] = eval_op

    # Start the epoch from the model's initial RNN state.
    rnn_state = session.run(model.initial_state)

    for _ in range(model.input.epoch_size):
        # Feed the state produced by the previous minibatch back in as the
        # initial state of every layer, so state carries across minibatches.
        feed = {}
        for layer, (c, h) in enumerate(model.initial_state):
            feed.update({c: rnn_state[layer].c, h: rnn_state[layer].h})

        # Process one minibatch
        result = session.run(requested, feed)
        rnn_state = result["final_state"]

        total_cost += result["cost"]
        total_steps += model.input.num_steps

    return np.exp(total_cost / total_steps)

In [7]:
# The supervisor provides the managed session and uses 'result' as its
# log directory.
sv = tf.train.Supervisor(logdir='result')
with sv.managed_session() as session:
    for epoch in range(1, config.max_max_epoch + 1):
        # Change learning rate: keep the initial rate for the first
        # ``max_epoch`` epochs, then multiply by ``lr_decay`` once per epoch.
        decay_exponent = max(epoch - config.max_epoch, 0.0)
        lr_decay = config.lr_decay ** decay_exponent
        m.assign_lr(session, config.learning_rate * lr_decay)
        current_lr = session.run(m.lr)
        print("Epoch: %d Learning rate: %.3f" % (epoch, current_lr))

        # Execute one sweep of the training dataset (runs the update op)
        train_perplexity = run_epoch(session, m, eval_op=m.train_op)
        print("Epoch: %d Train Perplexity: %.3f" % (epoch, train_perplexity))

        # Evaluation with the validation dataset (no parameter update)
        valid_perplexity = run_epoch(session, mvalid)
        print("Epoch: %d Valid Perplexity: %.3f" % (epoch, valid_perplexity))

    # Evaluation with the test dataset, once training has finished
    test_perplexity = run_epoch(session, mtest)
    print("Test Perplexity: %.3f" % test_perplexity)

    # Save the trained model, tagged with the current global step
    sv.saver.save(session, 'result', global_step=sv.global_step)

INFO:tensorflow:Model/global_step/sec: 0
Epoch: 1 Learning rate: 1.000
Epoch: 1 Train Perplexity: 270.203
Epoch: 1 Valid Perplexity: 184.724
Epoch: 2 Learning rate: 1.000
Epoch: 2 Train Perplexity: 133.531
Epoch: 2 Valid Perplexity: 147.012
Epoch: 3 Learning rate: 1.000
INFO:tensorflow:Model/global_step/sec: 55.8605
Epoch: 3 Train Perplexity: 102.464
Epoch: 3 Valid Perplexity: 131.377
Epoch: 4 Learning rate: 1.000
Epoch: 4 Train Perplexity: 86.059
Epoch: 4 Valid Perplexity: 126.897
Epoch: 5 Learning rate: 0.500
Epoch: 5 Train Perplexity: 65.631
Epoch: 5 Valid Perplexity: 118.964
Epoch: 6 Learning rate: 0.250
INFO:tensorflow:Model/global_step/sec: 55.0667
Epoch: 6 Train Perplexity: 53.493
Epoch: 6 Valid Perplexity: 118.570
Epoch: 7 Learning rate: 0.125
Epoch: 7 Train Perplexity: 47.197
Epoch: 7 Valid Perplexity: 119.952
Epoch: 8 Learning rate: 0.062
INFO:tensorflow:Model/global_step/sec: 42.8499
Epoch: 8 Train Perplexity: 44.064
Epoch: 8 Valid Perplexity: 120.727
Epoch: 9 Learning rate:

INFO:tensorflow:Model/global_step/sec: 0
INFO:tensorflow:Model/global_step/sec: 55.8605
INFO:tensorflow:Model/global_step/sec: 55.0667
INFO:tensorflow:Model/global_step/sec: 42.8499
INFO:tensorflow:Model/global_step/sec: 53.4251
INFO:tensorflow:Model/global_step/sec: 44.4749
