# Recurrent Neural Networks

https://www.tensorflow.org/versions/r0.11/tutorials/recurrent/index.html

https://github.com/tensorflow/tensorflow/tree/r0.11/tensorflow/models/rnn/ptb

https://github.com/tensorflow/tensorflow/blob/r0.11/tensorflow/models/rnn/ptb/ptb_word_lm.py

https://github.com/tensorflow/tensorflow/blob/r0.11/tensorflow/models/rnn/ptb/reader.py

http://colah.github.io/posts/2015-08-Understanding-LSTMs/

http://karpathy.github.io/2015/05/21/rnn-effectiveness/

https://arxiv.org/abs/1409.2329

http://www.cis.upenn.edu/~treebank/

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

In [2]:
import os
import tarfile
import shutil

# Local layout: the archive and the extracted text files all live in treebank/data.
HOME_DIR = 'treebank'
DATA_DIR = os.path.join(HOME_DIR, 'data')

print('Unpacking treebank dataset...')

# Archive file name, where it comes from, and where it ends up on disk.
TAR_FILE = 'simple-examples.tgz'
TAR_URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
TAR_PATH = os.path.join(DATA_DIR, TAR_FILE)

from tensorflow.contrib.learn.python.learn.datasets.base import maybe_download

# Arguments are (file name, target directory, source URL).
# NOTE(review): presumably a no-op when the archive is already present - confirm.
maybe_download(TAR_FILE, DATA_DIR, TAR_URL)

def extract(tar, filename, dst_path):
    """Copy one regular-file member of an open tar archive into a directory.

    The member is written into ``dst_path`` under its base name only; the
    directory part of ``filename`` inside the archive is discarded.

    Args:
        tar: an open ``tarfile.TarFile``.
        filename: name of the member inside the archive.
        dst_path: directory that receives the file; created when missing.

    Raises:
        KeyError: propagated from ``tar.extractfile`` when the archive has
            no member called ``filename``.
        IOError: when the member exists but has no file content
            (e.g. it is a directory).
    """
    print('Extracting', filename)

    # Resolve the member *before* touching the destination so that a bad
    # member name never leaves an empty file behind.
    fin = tar.extractfile(filename)
    if fin is None:
        raise IOError('%s is not a regular file in the archive' % filename)

    try:
        if dst_path and not os.path.isdir(dst_path):
            os.makedirs(dst_path)
        dst_file = os.path.join(dst_path, os.path.basename(filename))
        with open(dst_file, 'wb') as fout:
            shutil.copyfileobj(fin, fout)
    finally:
        # try/finally rather than `with`, so the stream is closed without
        # relying on it being a context manager.
        fin.close()

# Pull the three PTB text splits out of the archive into DATA_DIR.
with tarfile.open(TAR_PATH, mode='r:gz') as t:
    for member in (
            './simple-examples/data/ptb.test.txt',
            './simple-examples/data/ptb.train.txt',
            './simple-examples/data/ptb.valid.txt'):
        extract(t, member, DATA_DIR)

Unpacking treebank dataset...
Extracting ./simple-examples/data/ptb.test.txt
Extracting ./simple-examples/data/ptb.train.txt
Extracting ./simple-examples/data/ptb.valid.txt


In [3]:
from tensorflow.models.rnn.ptb import reader

# ptb_raw_data returns a 4-tuple; only the first three entries (the train,
# validation and test sequences) are used in this notebook.
train_data, valid_data, test_data, _ = raw_data = reader.ptb_raw_data(DATA_DIR)

# Report how many items each split holds.
for caption, split in (
        ("Train size:", train_data),
        ("Validation size:", valid_data),
        ("Test size:", test_data)):
    print(caption, len(split))

Train size: 929589
Validation size: 73760
Test size: 82430


In [4]:
class Config(object):
    """Base class for hyper-parameter sets declared as class attributes."""

    def params(self, *keys):
        """Return the configuration values as a plain dict.

        Values are collected from every class between ``Config`` and the
        concrete class of ``self`` (subclasses override their parents) and
        finally from attributes set on the instance itself. Names starting
        with a double underscore are skipped.

        Args:
            *keys: optional attribute names; when given, only these names
                are returned. Names that are not defined are silently
                omitted.

        Returns:
            dict mapping attribute name to value.
        """
        merged = {}
        # Walk base-first so that more derived classes win. Config and
        # object contribute machinery (e.g. this method), not settings.
        for klass in reversed(type(self).__mro__):
            if klass is Config or klass is object:
                continue
            merged.update(vars(klass))
        # Per-instance overrides take precedence over class-level defaults.
        merged.update(getattr(self, '__dict__', {}))

        return {
            k: v
            for k, v in merged.items()
            if not k.startswith('__') and (not keys or k in keys)
        }
    
class FakeConfig(Config):
    """Deliberately minuscule hyper-parameters, meant for quick test runs."""

    # --- model shape ---
    vocab_size = 10000
    hidden_size = 2
    num_layers = 1
    num_steps = 2
    batch_size = 20

    # --- initialisation and regularisation ---
    init_scale = 0.1
    keep_prob = 1.0
    max_grad_norm = 1

    # --- learning-rate schedule ---
    learning_rate = 1.0
    lr_decay = 0.5
    max_epoch = 1
    max_max_epoch = 2

class SmallConfig(Config):
    """Hyper-parameters of the small model."""

    # --- model shape ---
    vocab_size = 10000
    hidden_size = 200
    num_layers = 2
    num_steps = 20
    batch_size = 20

    # --- initialisation and regularisation ---
    init_scale = 0.1
    keep_prob = 1.0
    max_grad_norm = 5

    # --- learning-rate schedule ---
    learning_rate = 1.0
    lr_decay = 0.5
    max_epoch = 4
    max_max_epoch = 13

In [5]:
import time
import numpy as np

class PTBModel(object):
    """Inference graph of a stacked-LSTM language model unrolled over a fixed window.

    The constructor stores four graph handles on the instance, each exposed
    through a read-only property of the same name without the underscore:

        _input_data     int32 placeholder of shape [batch_size, num_steps]
        _logits         result of the softmax projection
        _initial_state  zero state of the stacked cell
        _final_state    cell state after the last unrolled step

    Subclasses may override ``_lstm_reg`` and ``_input_reg`` to wrap the
    LSTM cell and the embedded inputs; both are identity functions here.
    """

    def __init__(self,
                 batch_size,
                 num_steps,
                 num_layers,
                 hidden_size,
                 vocab_size):
        """Build the graph in the current default graph and variable scope.

        Args:
            batch_size: first dimension of the input placeholder.
            num_steps: number of time steps the recurrence is unrolled for.
            num_layers: how many times the LSTM cell is stacked.
            hidden_size: LSTM unit count, also used as the embedding width.
            vocab_size: number of embedding rows and of output logits.
        """
        self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])

        # A single (possibly wrapped) LSTM cell, repeated num_layers times.
        base_cell = tf.nn.rnn_cell.BasicLSTMCell(
            hidden_size, forget_bias=0.0, state_is_tuple=True)
        layer = self._lstm_reg(base_cell)
        cell = tf.nn.rnn_cell.MultiRNNCell(
            [layer] * num_layers, state_is_tuple=True)

        self._initial_state = cell.zero_state(batch_size, tf.float32)

        # Look up one embedding row per input id.
        embedding = tf.get_variable(
            "embedding", [vocab_size, hidden_size], dtype=tf.float32)
        embedded = tf.nn.embedding_lookup(embedding, self._input_data)
        inputs = self._input_reg(embedded)

        # Unroll by hand; every step after the first reuses the variables
        # created by the first one.
        step_outputs = []
        state = self._initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                step_output, state = cell(inputs[:, time_step, :], state)
                step_outputs.append(step_output)

        self._final_state = state

        # Flatten all step outputs to rows of width hidden_size, then
        # project each row onto the vocabulary.
        flat_output = tf.reshape(tf.concat(1, step_outputs), [-1, hidden_size])
        softmax_w = tf.get_variable(
            "softmax_w", [hidden_size, vocab_size], dtype=tf.float32)
        softmax_b = tf.get_variable(
            "softmax_b", [vocab_size], dtype=tf.float32)
        self._logits = tf.matmul(flat_output, softmax_w) + softmax_b

    def _lstm_reg(self, lstm_cell):
        """Hook for wrapping the LSTM cell; the base class returns it as is."""
        return lstm_cell

    def _input_reg(self, inputs):
        """Hook for transforming the embedded inputs; identity in the base class."""
        return inputs

    @property
    def input_data(self):
        """Placeholder that receives the input ids."""
        return self._input_data

    @property
    def logits(self):
        """Output of the softmax projection."""
        return self._logits

    @property
    def initial_state(self):
        """Zero state of the stacked cell."""
        return self._initial_state

    @property
    def final_state(self):
        """Cell state after the last unrolled step."""
        return self._final_state

class PTBTrain(PTBModel):
    """PTBModel plus a loss, optional training ops and an epoch runner.

    Attributes set for every instance:
        _batch_size, _num_steps  sizes used to slice the data into batches
        _is_training, _keep_prob control whether/how dropout is applied
        _targets                 int32 placeholder [batch_size, num_steps]
        _cost                    summed per-example loss divided by batch_size

    Attributes set only when ``is_training`` is true:
        _lr                      non-trainable learning-rate variable
        _train_op                clipped-gradient SGD update
        _new_lr, _lr_update      placeholder and assign op used by assign_lr

    Calling ``lr``, ``assign_lr`` or ``run_epoch_train`` on an instance built
    with ``is_training=False`` raises ``AttributeError`` because those
    attributes do not exist.
    """

    def __init__(self,
                 batch_size,
                 num_steps,
                 num_layers,
                 hidden_size,
                 vocab_size,
                 is_training,
                 keep_prob,
                 learning_rate,
                 max_grad_norm):
        """Build the model, its loss and (when training) the update ops.

        Args:
            batch_size, num_steps, num_layers, hidden_size, vocab_size:
                forwarded unchanged to ``PTBModel.__init__``.
            is_training: when false, construction stops after the cost.
            keep_prob: dropout keep probability; dropout is only inserted
                when training and ``keep_prob < 1``.
            learning_rate: initial value of the learning-rate variable.
            max_grad_norm: global-norm threshold for gradient clipping.
        """
        # These must be set before the base constructor runs, because it
        # calls the _lstm_reg/_input_reg hooks which read them.
        self._batch_size = batch_size
        self._num_steps = num_steps
        self._is_training = is_training
        self._keep_prob = keep_prob

        PTBModel.__init__(self,
                 batch_size,
                 num_steps,
                 num_layers,
                 hidden_size,
                 vocab_size)

        self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])
        loss = tf.nn.seq2seq.sequence_loss_by_example(
            [self.logits],
            [tf.reshape(self._targets, [-1])],
            [tf.ones([batch_size * num_steps], dtype=tf.float32)])
        self._cost = cost = tf.reduce_sum(loss) / batch_size

        if not is_training:
            return

        self._lr = tf.Variable(learning_rate, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(zip(grads, tvars))

        self._new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)

    def _lstm_reg(self, lstm_cell):
        """Wrap the cell with output dropout when training with keep_prob < 1."""
        if self._is_training and self._keep_prob < 1:
            return tf.nn.rnn_cell.DropoutWrapper(lstm_cell, output_keep_prob=self._keep_prob)
        return lstm_cell

    def _input_reg(self, inputs):
        """Apply dropout to the embedded inputs when training with keep_prob < 1."""
        if self._is_training and self._keep_prob < 1:
            return tf.nn.dropout(inputs, self._keep_prob)
        return inputs

    def lr(self, session):
        """Return the current value of the learning-rate variable."""
        return session.run(self._lr)

    def assign_lr(self, session, lr_value):
        """Set the learning-rate variable to ``lr_value``."""
        session.run(self._lr_update, feed_dict={self._new_lr: lr_value})

    def run_epoch_train(self, session, data, verbose=True):
        """Run one pass over ``data`` applying the training op; return perplexity."""
        return self._run_epoch(session, data, self._train_op, verbose)

    def run_epoch_eval(self, session, data, verbose=False):
        """Run one pass over ``data`` without updating weights; return perplexity."""
        # No extra op is fetched. Creating a fresh tf.no_op() here would add
        # a node to whatever graph is the default at call time, on every call.
        return self._run_epoch(session, data, None, verbose)

    def _run_epoch(self, session, data, eval_op, verbose):
        """Iterate over ``data`` once, carrying the LSTM state across batches.

        Args:
            session: session bound to the graph this model lives in.
            data: sequence handed to ``reader.ptb_iterator``.
            eval_op: extra op to run with every batch (the training op), or
                ``None`` to only compute the cost and final state.
            verbose: when true, print progress about ten times per epoch.

        Returns:
            exp(total cost / total number of time steps processed).
        """
        batch_size = self._batch_size
        num_steps = self._num_steps

        epoch_size = ((len(data) // batch_size) - 1) // num_steps

        # Log roughly every tenth of an epoch. For long epochs this is the
        # original schedule (step % interval == 10); for short epochs the
        # interval is clamped to >= 1 and the offset to < interval, so there
        # is neither a modulo-by-zero nor a condition that never matches.
        log_every = max(epoch_size // 10, 1)
        log_offset = min(10, log_every - 1)

        # The fetch list is the same for every batch.
        fetches = [self._cost, self.final_state]
        if eval_op is not None:
            fetches.append(eval_op)

        start_time = time.time()
        costs = 0.0
        iters = 0
        state = session.run(self.initial_state)

        for step, (x, y) in enumerate(reader.ptb_iterator(data, batch_size, num_steps)):
            feed_dict = {}
            feed_dict[self.input_data] = x
            feed_dict[self._targets] = y
            # Feed the previous batch's final state in as this batch's
            # initial state, layer by layer.
            for k, (c, h) in enumerate(self.initial_state):
                feed_dict[c] = state[k].c
                feed_dict[h] = state[k].h

            results = session.run(fetches, feed_dict)
            batch_cost, state = results[0], results[1]
            costs += batch_cost
            iters += num_steps

            if verbose and step % log_every == log_offset:
                print("%.1f%% perplexity: %.3f speed: %.0f wps" % (
                    step * 100.0 / epoch_size,
                    np.exp(costs / iters),
                    iters * batch_size / (time.time() - start_time)))

        return np.exp(costs / iters)

In [6]:
# Hyper-parameter set in use; replace with FakeConfig() for a quick test run.
cfg = SmallConfig()

graph = tf.Graph()

with graph.as_default():
    initializer = tf.random_uniform_initializer(-cfg.init_scale, cfg.init_scale)

    # Constructor arguments taken from the config for all three models.
    model_keys = (
        'batch_size',
        'num_steps',
        'num_layers',
        'hidden_size',
        'vocab_size',
        'keep_prob',
        'learning_rate',
        'max_grad_norm',
    )
    params = cfg.params(*model_keys)

    # The training model is built first, with reuse=None.
    with tf.variable_scope("model", reuse=None, initializer=initializer):
        m_train = PTBTrain(is_training=True, **params)

    # The evaluation models are built in the same scope with reuse=True.
    with tf.variable_scope("model", reuse=True, initializer=initializer):
        m_valid = PTBTrain(is_training=False, **params)

        # The test model is built with batch_size=1 and num_steps=1.
        params['batch_size'] = 1
        params['num_steps'] = 1
        m_test = PTBTrain(is_training=False, **params)

    init = tf.initialize_all_variables()

In [7]:
with tf.Session(graph=graph) as session:
    init.run()
    print("Initialized")

    for i in range(cfg.max_max_epoch):
        epoch_no = i + 1

        # The rate stays at its base value for the first max_epoch epochs
        # and is multiplied by lr_decay for every epoch after that.
        decay = cfg.lr_decay ** max(epoch_no - cfg.max_epoch, 0.0)
        lr_value = cfg.learning_rate * decay
        m_train.assign_lr(session, lr_value)

        print()
        print("Epoch: %d Learning rate: %.6f" % (epoch_no, m_train.lr(session)))

        # One training pass, then one validation pass.
        train_perplexity = m_train.run_epoch_train(session, train_data)
        print("Epoch: %d Train Perplexity: %.3f" % (epoch_no, train_perplexity))

        valid_perplexity = m_valid.run_epoch_eval(session, valid_data)
        print("Epoch: %d Valid Perplexity: %.3f" % (epoch_no, valid_perplexity))

    print()

    # Final score on the held-out test split.
    test_perplexity = m_test.run_epoch_eval(session, test_data)
    print("Test Perplexity: %.3f" % test_perplexity)

Initialized

Epoch: 1 Learning rate: 1.000000
0.4% perplexity: 5585.529 speed: 1258 wps
10.4% perplexity: 840.839 speed: 1435 wps
20.4% perplexity: 619.551 speed: 1439 wps
30.4% perplexity: 501.574 speed: 1440 wps
40.4% perplexity: 432.873 speed: 1432 wps
50.4% perplexity: 387.834 speed: 1422 wps
60.4% perplexity: 349.447 speed: 1414 wps
70.3% perplexity: 323.106 speed: 1407 wps
80.3% perplexity: 302.319 speed: 1400 wps
90.3% perplexity: 283.138 speed: 1395 wps
Epoch: 1 Train Perplexity: 268.899
Epoch: 1 Valid Perplexity: 178.410

Epoch: 2 Learning rate: 1.000000
0.4% perplexity: 212.329 speed: 1326 wps
10.4% perplexity: 152.070 speed: 1345 wps
20.4% perplexity: 159.321 speed: 1349 wps
30.4% perplexity: 154.330 speed: 1349 wps
40.4% perplexity: 151.455 speed: 1347 wps
50.4% perplexity: 149.040 speed: 1346 wps
60.4% perplexity: 144.437 speed: 1345 wps
70.3% perplexity: 142.202 speed: 1344 wps
80.3% perplexity: 140.163 speed: 1344 wps
90.3% perplexity: 136.517 speed: 1343 wps
Epoch: 2 Tr