# Recurrent Neural Networks

https://www.tensorflow.org/tutorials/recurrent

http://colah.github.io/posts/2015-08-Understanding-LSTMs/

http://karpathy.github.io/2015/05/21/rnn-effectiveness/

https://arxiv.org/abs/1409.2329

In [1]:
import tensorflow as tf

tf.VERSION

'1.3.1'

## Dataset

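Penn Treebank (PTB), in the pre-tokenized form shipped inside Tomas Mikolov's `simple-examples.tgz` archive (`ptb.train.txt`, `ptb.valid.txt`, `ptb.test.txt`).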

In [2]:
import os
import tarfile
import shutil

# Root folder for everything the notebook writes; the dataset goes in its 'data' subfolder.
HOME_DIR = 'treebank'
DATA_DIR = os.path.join(HOME_DIR, 'data')

print('Unpacking treebank dataset...')

# File name of the dataset archive and the local path it is read from below.
TAR_FILE = 'simple-examples.tgz'
TAR_PATH = os.path.join(DATA_DIR, TAR_FILE)

# Fetch the archive into DATA_DIR.
# NOTE(review): presumably a no-op when the file is already present, as the helper's name suggests - confirm.
from tensorflow.contrib.learn.python.learn.datasets.base import maybe_download
maybe_download(TAR_FILE, DATA_DIR, 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz')

def extract(tar, filename, dst_path):
    """Copy a single member of an open tar archive into ``dst_path``.

    The member is written to ``dst_path`` under its base name only; the
    directory part of ``filename`` is discarded.

    Args:
        tar: an open ``tarfile.TarFile``.
        filename: member name exactly as stored in the archive.
        dst_path: existing directory that receives the extracted file.

    Raises:
        KeyError: raised by ``tarfile`` when the archive has no such member.
        ValueError: when the member is not a regular file (e.g. a directory).
    """
    print('Extracting', filename)
    dst_file = os.path.join(dst_path, os.path.basename(filename))

    # Look the member up *before* creating the destination, so a failed lookup
    # does not leave an empty file behind.
    fin = tar.extractfile(filename)
    if fin is None:
        raise ValueError('{} is not a regular file in the archive'.format(filename))

    # Close the member's file object deterministically as well as the output file.
    with fin, open(dst_file, 'wb') as fout:
        shutil.copyfileobj(fin, fout)

# Pull the three PTB splits out of the archive (test, train, valid - in that order).
with tarfile.open(TAR_PATH, mode='r:gz') as t:
    for member in ('./simple-examples/data/ptb.test.txt',
                   './simple-examples/data/ptb.train.txt',
                   './simple-examples/data/ptb.valid.txt'):
        extract(t, member, DATA_DIR)

Unpacking treebank dataset...
Extracting ./simple-examples/data/ptb.test.txt
Extracting ./simple-examples/data/ptb.train.txt
Extracting ./simple-examples/data/ptb.valid.txt


In [3]:
os.listdir(DATA_DIR)

['simple-examples.tgz', 'ptb.test.txt', 'ptb.train.txt', 'ptb.valid.txt']

In [4]:
def show(filename, lines=3):
    """Print the first ``lines`` lines of a text file, each followed by a blank line.

    Surrounding whitespace is stripped from every printed line. Files shorter
    than ``lines`` are printed in full instead of raising ``StopIteration``.

    Args:
        filename: path of the text file to preview.
        lines: maximum number of lines to print (values <= 0 print nothing).
    """
    from itertools import islice

    with open(filename, 'r') as f:
        # islice stops quietly at end-of-file, unlike repeated next(f).
        for line in islice(f, max(lines, 0)):
            print(line.strip())
            print()

# Paths of the three extracted splits.
TRAIN_DATA = os.path.join(DATA_DIR, 'ptb.train.txt')
VALID_DATA = os.path.join(DATA_DIR, 'ptb.valid.txt')
TEST_DATA = os.path.join(DATA_DIR, 'ptb.test.txt')

# Preview each split; an ellipsis line separates consecutive previews.
for heading, path, has_next in (('Train samples...\n', TRAIN_DATA, True),
                                ('Validation samples...\n', VALID_DATA, True),
                                ('Test samples...\n', TEST_DATA, False)):
    print(heading)
    show(path)
    if has_next:
        print('...\n')

Train samples...

aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter

pierre <unk> N years old will join the board as a nonexecutive director nov. N

mr. <unk> is chairman of <unk> n.v. the dutch publishing group

...

Validation samples...

consumers may want to move their telephones a little closer to the tv set

<unk> <unk> watching abc 's monday night football can now vote during <unk> for the greatest play in N years from among four or five <unk> <unk>

two weeks ago viewers of several nbc <unk> consumer segments started calling a N number for advice on various <unk> issues

...

Test samples...

no it was n't black monday

but while the new york stock exchange did n't fall apart friday as the dow jones industrial average plunged N points most of it in the final hour it barely managed to stay this side of chaos

some circuit breakers installed after the octob

## Input

In [5]:
def read_text_as_tokens(filename):
    """Read a text file and return its whitespace-separated tokens.

    Every newline becomes a standalone ``'<eos>'`` token. The marker is
    inserted with surrounding spaces so that it never fuses with the last word
    of one line or the first word of the next when lines are not space-padded.
    For files whose lines already start and end with a space the result is
    unchanged, because ``split()`` collapses runs of whitespace.

    Args:
        filename: path of the text file.

    Returns:
        list of str tokens, with ``'<eos>'`` marking each line end.
    """
    with open(filename) as f:
        return f.read().replace('\n', ' <eos> ').split()

# Tokenize each split once; the resulting lists are reused by the cells below.
train_tokens = read_text_as_tokens(TRAIN_DATA)
valid_tokens = read_text_as_tokens(VALID_DATA)
test_tokens = read_text_as_tokens(TEST_DATA)

# Peek at the first 20 training tokens.
train_tokens[:20]

['aer',
 'banknote',
 'berlitz',
 'calloway',
 'centrust',
 'cluett',
 'fromstein',
 'gitano',
 'guterman',
 'hydro-quebec',
 'ipo',
 'kia',
 'memotec',
 'mlx',
 'nahb',
 'punts',
 'rake',
 'regatta',
 'rubens',
 'sim']

In [6]:
import collections

# Occurrence count of every distinct token in the training split.
tokens_freq = collections.Counter(train_tokens)

# Report the number of distinct tokens and the ten most frequent ones.
print('Vocabulary size:\n\n{:,d}\n'.format(len(tokens_freq)))
print('Most common:\n')
for token, freq in tokens_freq.most_common(10):
    print('{:,d}\t{}'.format(freq, token))

Vocabulary size:

10,000

Most common:

50,770	the
45,020	<unk>
42,068	<eos>
32,481	N
24,400	of
23,638	to
21,196	a
18,000	in
17,474	and
9,784	's


In [7]:
# Rank tokens by descending training frequency; the rank is the token id,
# so the most frequent token gets id 0. Ties keep their original order.
tokens = sorted(tokens_freq.items(), key=lambda pair: -pair[1])
token_to_id = {tok: rank for rank, (tok, _) in enumerate(tokens)}

# Encode every split with the vocabulary built from the training data.
train_data = [token_to_id[tok] for tok in train_tokens]
valid_data = [token_to_id[tok] for tok in valid_tokens]
test_data = [token_to_id[tok] for tok in test_tokens]

# Show the first 20 training tokens next to their ids.
for tok, tok_id in zip(train_tokens, train_data[:20]):
    print('{}={}'.format(tok, tok_id))

aer=9970
banknote=9971
berlitz=9972
calloway=9973
centrust=9974
cluett=9975
fromstein=9976
gitano=9977
guterman=9978
hydro-quebec=9979
ipo=9980
kia=9981
memotec=9982
mlx=9983
nahb=9984
punts=9985
rake=9986
regatta=9987
rubens=9988
sim=9989


**Batch reader**

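Splits the token stream into `batch_size` parallel rows and walks over them in consecutive blocks of `num_steps` tokens. Each call yields an input block together with the same block shifted one token ahead (the prediction targets).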

* `data`: array of token ids
* `batch_size`: number of blocks for each call
* `num_steps`: number of tokens per block for each call

In [8]:
# Pseudo-values: a tiny slice of the training ids, so that every intermediate
# tensor in the following cells is small enough to print in full
data = train_data[:26]
batch_size = 4
num_steps = 2

In [9]:
# TensorFlow runtime: an interactive session (the cells below call .eval()
# without passing a session), a coordinator, and a list that collects the
# queue-runner threads so they can be joined at the end
session = tf.InteractiveSession()
coord = tf.train.Coordinator()
threads = []

In [10]:
# Turn the Python list of ids into an int32 tensor, then show the symbolic
# tensor followed by its evaluated contents
data_tensor = tf.convert_to_tensor(data, dtype=tf.int32)
print(data_tensor)
data_tensor.eval()

Tensor("Const:0", shape=(26,), dtype=int32)


array([9970, 9971, 9972, 9973, 9974, 9975, 9976, 9977, 9978, 9979, 9980,
       9981, 9982, 9983, 9984, 9985, 9986, 9987, 9988, 9989, 9990, 9991,
       9992, 9993,    2, 8569], dtype=int32)

In [11]:
# Total number of token ids, as a scalar tensor
data_len = tf.size(data_tensor)
print(data_len)
data_len.eval()

Tensor("Size:0", shape=(), dtype=int32)


26

In [12]:
# Tokens per row once the ids are split evenly into batch_size rows
# (despite the name this is a row length - see the reshape further down)
num_batches = data_len // batch_size
print(num_batches)
num_batches.eval()

Tensor("floordiv:0", shape=(), dtype=int32)


6

In [13]:
# Largest multiple of batch_size that does not exceed the data length
data_len_trim = batch_size * num_batches
print(data_len_trim)
data_len_trim.eval()

Tensor("mul:0", shape=(), dtype=int32)


24

In [14]:
# Drop the trailing ids that do not fit into batch_size equal-length rows (trim)
data_tensor_trim = data_tensor[:data_len_trim]
print(data_tensor_trim)
data_tensor_trim.eval()

Tensor("strided_slice:0", shape=(?,), dtype=int32)


array([9970, 9971, 9972, 9973, 9974, 9975, 9976, 9977, 9978, 9979, 9980,
       9981, 9982, 9983, 9984, 9985, 9986, 9987, 9988, 9989, 9990, 9991,
       9992, 9993], dtype=int32)

In [15]:
# Lay the ids out as batch_size rows of num_batches consecutive tokens each
data_batches = tf.reshape(data_tensor_trim, shape=(batch_size, num_batches))
print(data_batches)
data_batches.eval()

Tensor("Reshape:0", shape=(4, ?), dtype=int32)


array([[9970, 9971, 9972, 9973, 9974, 9975],
       [9976, 9977, 9978, 9979, 9980, 9981],
       [9982, 9983, 9984, 9985, 9986, 9987],
       [9988, 9989, 9990, 9991, 9992, 9993]], dtype=int32)

In [16]:
# Number of complete num_steps-wide blocks per row. One column is held back
# (the "- 1") because the targets are the inputs shifted one token ahead, so a
# block needs num_steps + 1 columns; an incomplete last block is ignored
num_blocks = (num_batches - 1) // num_steps
print(num_blocks)
num_blocks.eval()

Tensor("floordiv_1:0", shape=(), dtype=int32)


2

In [17]:
# Queue-backed block index: each evaluation of `i` dequeues the next index
# from the range [0, num_blocks), unshuffled
i = tf.train.range_input_producer(num_blocks, shuffle=False).dequeue()
# Start the threads that fill the queue and remember them for the final join
threads += tf.train.start_queue_runners(sess=session, coord=coord)
print(i)

Tensor("input_producer_Dequeue:0", shape=(), dtype=int32)


In [18]:
# Inputs: columns [i * num_steps, (i + 1) * num_steps) of every row
x = tf.strided_slice(data_batches, begin=(0, i * num_steps), end=(batch_size, (i + 1) * num_steps))
# The slice bounds depend on the dequeued index, so pin the static shape by hand
x.set_shape((batch_size, num_steps))
print(x)

Tensor("StridedSlice:0", shape=(4, 2), dtype=int32)


In [19]:
# Targets: the same window as the inputs, shifted one column to the right
y = tf.strided_slice(data_batches, begin=(0, i * num_steps + 1), end=(batch_size, (i + 1) * num_steps + 1))
y.set_shape((batch_size, num_steps))
print(y)

Tensor("StridedSlice_1:0", shape=(4, 2), dtype=int32)


In [20]:
# Print the full matrix, then pull num_blocks consecutive (X, Y) blocks
print(data_batches.eval())
print()
for k in range(num_blocks.eval()):
    # x and y are fetched in a single run() call, so they share one dequeued index
    x_, y_ = session.run([x, y])
    print('X:\n')
    print(x_)
    print()
    print('Y:\n')
    print(y_)
    print()

[[9970 9971 9972 9973 9974 9975]
 [9976 9977 9978 9979 9980 9981]
 [9982 9983 9984 9985 9986 9987]
 [9988 9989 9990 9991 9992 9993]]

X:

[[9970 9971]
 [9976 9977]
 [9982 9983]
 [9988 9989]]

Y:

[[9971 9972]
 [9977 9978]
 [9983 9984]
 [9989 9990]]

X:

[[9972 9973]
 [9978 9979]
 [9984 9985]
 [9990 9991]]

Y:

[[9973 9974]
 [9979 9980]
 [9985 9986]
 [9991 9992]]



In [21]:
# Ask the queue-runner threads to stop, wait for them, then release the session
coord.request_stop()
coord.join(threads)
session.close()

In [22]:
# final dataset reader

def input_batch(data, batch_size, num_steps, name=None):
    """Build graph ops that iterate over ``data`` in (input, target) blocks.

    The ids are trimmed to a multiple of ``batch_size`` and reshaped into
    ``batch_size`` rows. A queue-backed index then selects consecutive windows
    of ``num_steps`` columns; the target window is the input window shifted one
    column to the right. Queue runners must be started before the returned
    tensors are evaluated.

    Args:
        data: sequence of token ids (converted to an int32 tensor).
        batch_size: number of rows in every returned block.
        num_steps: number of columns (tokens per row) in every returned block.
        name: optional name scope for the created ops.

    Returns:
        ``(x, y)``: two int32 tensors with static shape
        ``(batch_size, num_steps)``.

    Raises:
        tf.errors.InvalidArgumentError: at run time, when ``data`` is too short
            to form a single block for the given sizes.
    """
    with tf.name_scope(name, 'input_batch', [data, batch_size, num_steps]):
        data_tensor = tf.convert_to_tensor(data, dtype=tf.int32)

        # Drop the trailing ids that do not fit into batch_size equal rows (trim).
        data_len = tf.size(data_tensor)
        num_batches = data_len // batch_size
        data_len_trim = batch_size * num_batches
        data_tensor_trim = data_tensor[:data_len_trim]
        data_batches = tf.reshape(data_tensor_trim, shape=(batch_size, num_batches))

        # One column is reserved for the shifted targets; an incomplete last
        # block is ignored.
        num_blocks = (num_batches - 1) // num_steps

        # Fail with a readable message instead of an obscure queue error when
        # the data cannot fill even one block.
        assertion = tf.assert_positive(
            num_blocks,
            message='num_blocks == 0, decrease batch_size or num_steps')
        with tf.control_dependencies([assertion]):
            num_blocks = tf.identity(num_blocks, name='num_blocks')

        # Each evaluation dequeues the next block index, in order.
        i = tf.train.range_input_producer(num_blocks, shuffle=False).dequeue()

        # The slice bounds are dynamic, so the static shapes are pinned by hand.
        x = tf.strided_slice(data_batches, begin=(0, i * num_steps), end=(batch_size, (i + 1) * num_steps))
        x.set_shape((batch_size, num_steps))

        y = tf.strided_slice(data_batches, begin=(0, i * num_steps + 1), end=(batch_size, (i + 1) * num_steps + 1))
        y.set_shape((batch_size, num_steps))

        return x, y


# Smoke-test input_batch in a throwaway graph, with the same toy sizes as the
# step-by-step cells above
graph = tf.Graph()

with graph.as_default(), tf.Session(graph=graph) as session:
    coord = tf.train.Coordinator()
    
    _data = train_data[:26]
    _batch_size = 4
    _num_steps = 2
    # Python-side copy of the block-count formula used inside input_batch
    _epoch_size = ((len(_data) // _batch_size) - 1) // _num_steps
    
    x, y = input_batch(_data, _batch_size, _num_steps)
    # the index queue inside input_batch needs its runner threads started
    threads = tf.train.start_queue_runners(sess=session, coord=coord)
    
    # one run() call per block
    for _ in range(_epoch_size):
        x_, y_ = session.run([x, y])
        print('X:\n')
        print(x_)
        print()
        print('Y:\n')
        print(y_)
        print()
    
    # stop and join the queue-runner threads before the session closes
    coord.request_stop()
    coord.join(threads)

del graph

X:

[[9970 9971]
 [9976 9977]
 [9982 9983]
 [9988 9989]]

Y:

[[9971 9972]
 [9977 9978]
 [9983 9984]
 [9989 9990]]

X:

[[9972 9973]
 [9978 9979]
 [9984 9985]
 [9990 9991]]

Y:

[[9973 9974]
 [9979 9980]
 [9985 9986]
 [9991 9992]]



## Model

LSTM

In [23]:
import time
import numpy as np

class PTBModel:
    """Multi-layer LSTM language model graph: embedding -> stacked LSTM -> logits.

    Attributes set by ``__init__`` (exposed through read-only properties):
        _logits: logits reshaped to ``[batch_size, num_steps, vocab_size]``.
        _initial_state: zero state of the stacked cell.
        _final_state: state of the stacked cell after the last unrolled step.
    """

    def __init__(self,
                 input_data,
                 batch_size,
                 num_steps,
                 num_layers,
                 hidden_size,
                 vocab_size):
        """Build the model graph in the current variable scope.

        Args:
            input_data: tensor of token ids used for the embedding lookup;
                it is sliced as ``[:, time_step, :]`` after embedding, i.e.
                laid out as (batch, step).
            batch_size: number of sequences per block.
            num_steps: number of time steps the network is unrolled for.
            num_layers: number of stacked LSTM cells.
            hidden_size: size of each LSTM cell and of the embedding vectors.
            vocab_size: number of distinct token ids.
        """
        # One cell per layer; subclasses may wrap each cell via _lstm_reg.
        lstm_cells = [
            self._lstm_reg(tf.contrib.rnn.BasicLSTMCell(hidden_size,
                                                        forget_bias=0.0,
                                                        state_is_tuple=True,
                                                        reuse=tf.get_variable_scope().reuse))
            for _ in range(num_layers)
        ]
        cell = tf.contrib.rnn.MultiRNNCell(lstm_cells, state_is_tuple=True)

        self._initial_state = cell.zero_state(batch_size, tf.float32)

        embedding = tf.get_variable('embedding', [vocab_size, hidden_size], dtype=tf.float32)
        inputs = self._input_reg(tf.nn.embedding_lookup(embedding, input_data))

        # Unroll the network by hand, sharing the cell variables after step 0.
        outputs = []
        state = self._initial_state
        with tf.variable_scope('RNN'):
            for time_step in range(num_steps):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)

        self._final_state = state

        # Flatten (batch, step) so one matmul produces the logits for all positions.
        output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, hidden_size])
        softmax_w = tf.get_variable('softmax_w', [hidden_size, vocab_size], dtype=tf.float32)
        softmax_b = tf.get_variable('softmax_b', [vocab_size], dtype=tf.float32)
        logits = tf.matmul(output, softmax_w) + softmax_b
        self._logits = tf.reshape(logits, [batch_size, num_steps, vocab_size])

    def _lstm_reg(self, lstm_cell):
        """Hook for regularizing one LSTM cell; the base model returns it unchanged."""
        return lstm_cell

    def _input_reg(self, inputs):
        """Hook for regularizing the embedded inputs; the base model returns them unchanged."""
        return inputs

    @property
    def logits(self):
        """Logits tensor of shape ``[batch_size, num_steps, vocab_size]``."""
        return self._logits

    @property
    def initial_state(self):
        """Zero state of the stacked LSTM cell."""
        return self._initial_state

    @property
    def final_state(self):
        """State of the stacked LSTM cell after the last unrolled step."""
        return self._final_state


class PTBTrain(PTBModel):
    """PTBModel plus a loss, an optional SGD training op and an epoch runner.

    Attributes set by ``__init__``:
        _is_training, _epoch_size, _batch_size, _num_steps, _keep_prob:
            the constructor arguments of the same name.
        _cost: scalar loss tensor.
        _train_op: gradient-descent update when training, a no-op otherwise.
        _lr, _new_lr, _lr_update: learning-rate variable, the placeholder for a
            new value and the assign op. These exist only when ``is_training``
            is true.
    """

    def __init__(self,
                 is_training,
                 input_data,
                 targets,
                 epoch_size,
                 batch_size,
                 num_steps,
                 num_layers,
                 hidden_size,
                 vocab_size,
                 keep_prob,
                 learning_rate,
                 max_grad_norm):
        """Build the model, its loss and (when training) the optimizer ops.

        Args:
            is_training: build the training op and enable dropout when true.
            input_data: token-id tensor fed to the model.
            targets: token-id tensor the logits are scored against.
            epoch_size: number of ``session.run`` steps per epoch.
            batch_size, num_steps, num_layers, hidden_size, vocab_size:
                forwarded to ``PTBModel.__init__``.
            keep_prob: dropout keep probability; values below 1 enable dropout
                while training.
            learning_rate: initial value of the learning-rate variable.
            max_grad_norm: global-norm threshold used to clip the gradients.
        """
        # These must be set first: the _lstm_reg / _input_reg hooks read them
        # while the base constructor builds the graph.
        self._is_training = is_training
        self._epoch_size = epoch_size
        self._batch_size = batch_size
        self._num_steps = num_steps
        self._keep_prob = keep_prob

        PTBModel.__init__(self,
                          input_data,
                          batch_size,
                          num_steps,
                          num_layers,
                          hidden_size,
                          vocab_size)

        # Averaged across the batch but not across time steps (per the flags),
        # then summed into a scalar cost.
        loss = tf.contrib.seq2seq.sequence_loss(
            self.logits,
            targets,
            tf.ones((batch_size, num_steps), dtype=tf.float32),
            average_across_timesteps=False,
            average_across_batch=True
        )
        self._cost = tf.reduce_sum(loss)

        if not is_training:
            # Evaluation models share the epoch runner; their "update" does nothing.
            self._train_op = tf.no_op()
            return

        self._lr = tf.Variable(learning_rate, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self._cost, tvars), max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())

        self._new_lr = tf.placeholder(tf.float32, shape=[], name='new_learning_rate')
        self._lr_update = tf.assign(self._lr, self._new_lr)

    def _lstm_reg(self, lstm_cell):
        """Wrap the cell with output dropout when training with ``keep_prob < 1``."""
        if self._is_training and self._keep_prob < 1:
            return tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=self._keep_prob)
        return lstm_cell

    def _input_reg(self, inputs):
        """Apply dropout to the embedded inputs when training with ``keep_prob < 1``."""
        if self._is_training and self._keep_prob < 1:
            return tf.nn.dropout(inputs, self._keep_prob)
        return inputs

    def lr(self, session):
        """Return the current learning rate (training models only)."""
        return session.run(self._lr)

    def assign_lr(self, session, lr_value):
        """Set the learning rate to ``lr_value`` (training models only)."""
        session.run(self._lr_update, feed_dict={self._new_lr: lr_value})

    def run_epoch_train(self, session, verbose=True):
        """Run one epoch and return its perplexity; progress is printed by default."""
        return self._run_epoch(session, verbose)

    def run_epoch_eval(self, session, verbose=False):
        """Run one epoch and return its perplexity; silent by default."""
        return self._run_epoch(session, verbose)

    def _run_epoch(self, session, verbose):
        """Run ``epoch_size`` steps, carrying the LSTM state from step to step.

        Args:
            session: session used to run the cost, final state and train op.
            verbose: print running perplexity and words-per-second about ten
                times per epoch.

        Returns:
            The epoch perplexity, ``exp(total cost / total time steps)``.
        """
        batch_size = self._batch_size
        num_steps = self._num_steps
        epoch_size = self._epoch_size

        # Report about ten times per epoch. For long epochs this keeps the
        # original schedule (step % (epoch_size // 10) == 10). For short epochs
        # it avoids a modulo by zero and a condition that could never be true.
        log_every = max(epoch_size // 10, 1)
        log_offset = min(10, log_every - 1)

        start_time = time.time()
        costs = 0.0
        iters = 0
        state = session.run(self.initial_state)

        fetches = [self._cost, self.final_state, self._train_op]
        for step in range(epoch_size):
            # Feed the previous final state in as the next initial state.
            feed_dict = {}
            for k, (c, h) in enumerate(self.initial_state):
                feed_dict[c] = state[k].c
                feed_dict[h] = state[k].h

            batch_cost, state, _ = session.run(fetches, feed_dict)
            costs += batch_cost
            iters += num_steps

            if verbose and step % log_every == log_offset:
                # Guard against a zero reading on coarse clocks.
                elapsed = max(time.time() - start_time, 1e-9)
                print('{:.1f}% perplexity: {:,.3f} speed: {:,.0f} wps'.format(
                    step * 100.0 / epoch_size,
                    np.exp(costs / iters),
                    iters * batch_size / elapsed))

        return np.exp(costs / iters)

## Training

In [24]:
class Config(object):
    """Base class for hyper-parameter sets declared as class attributes."""

    def params(self, *keys):
        """Return the configuration values as a ``{name: value}`` dict.

        Values are collected from the whole class hierarchy (base classes
        first, so subclasses override) and finally from the instance itself.
        Names starting with ``'__'`` are skipped, as are plain functions,
        static/class methods and properties, so only data attributes appear.

        Args:
            *keys: when given, only these names are returned; names that are
                not defined are silently omitted.

        Returns:
            dict mapping attribute names to their values.
        """
        import types

        non_data = (types.FunctionType, staticmethod, classmethod, property)

        merged = {}
        for klass in reversed(type(self).__mro__):
            merged.update(vars(klass))
        merged.update(getattr(self, '__dict__', {}))

        return {
            k: v
            for k, v in merged.items()
            if not k.startswith('__')
            and not isinstance(v, non_data)
            and (not keys or k in keys)
        }
    
class TinyConfig(Config):
    '''Tiny config, for testing.'''
    # half-width of the uniform range used by the variable initializer
    init_scale = 0.1
    # learning rate before any decay is applied
    learning_rate = 1.0
    # global-norm threshold for gradient clipping
    max_grad_norm = 1
    # number of stacked LSTM cells
    num_layers = 1
    # tokens per row in each block (= unrolled time steps)
    num_steps = 2
    # LSTM cell size, also the embedding width
    hidden_size = 2
    # last epoch trained at the undecayed learning rate
    max_epoch = 1
    # total number of training epochs
    max_max_epoch = 2
    # dropout keep probability; 1.0 leaves dropout off
    keep_prob = 1.0
    # per-epoch learning-rate multiplier applied after max_epoch
    lr_decay = 0.5
    # rows per block
    batch_size = 20
    # number of distinct token ids
    vocab_size = 10000

class SmallConfig(Config):
    '''Small config.'''
    # half-width of the uniform range used by the variable initializer
    init_scale = 0.1
    # learning rate before any decay is applied
    learning_rate = 1.0
    # global-norm threshold for gradient clipping
    max_grad_norm = 5
    # number of stacked LSTM cells
    num_layers = 2
    # tokens per row in each block (= unrolled time steps)
    num_steps = 20
    # LSTM cell size, also the embedding width
    hidden_size = 200
    # last epoch trained at the undecayed learning rate
    max_epoch = 4
    # total number of training epochs
    max_max_epoch = 13
    # dropout keep probability; 1.0 leaves dropout off
    keep_prob = 1.0
    # per-epoch learning-rate multiplier applied after max_epoch
    lr_decay = 0.5
    # rows per block
    batch_size = 20
    # number of distinct token ids
    vocab_size = 10000

In [25]:
# Pick the hyper-parameter set; swap the comment to use the larger configuration.
cfg = TinyConfig()
# cfg = SmallConfig()

graph = tf.Graph()

with graph.as_default():
    # Passed to every 'model' variable scope below as its initializer.
    initializer = tf.random_uniform_initializer(-cfg.init_scale, cfg.init_scale)
    
    # Only the values that PTBTrain.__init__ takes as keyword arguments.
    params = cfg.params(
        'batch_size',
        'num_steps',
        'num_layers',
        'hidden_size',
        'vocab_size',
        'keep_prob',
        'learning_rate',
        'max_grad_norm')
    
    batch_size = params['batch_size']
    num_steps = params['num_steps']

    # Training model: this scope creates the 'model' variables (reuse=None).
    with tf.name_scope('train'):
        train_input, train_target = input_batch(train_data, batch_size, num_steps, 'train_input')
        # Python-side copy of the block count computed inside input_batch
        epoch_size = ((len(train_data) // batch_size) - 1) // num_steps
        with tf.variable_scope('model', reuse=None, initializer=initializer):
            m_train = PTBTrain(is_training=True, 
                               input_data=train_input,
                               targets=train_target,
                               epoch_size=epoch_size,
                               **params)
        tf.summary.scalar('training_loss', m_train._cost)
        tf.summary.scalar('learning_rate', m_train._lr)
    
    # Validation model: same variables (reuse=True), built with is_training=False.
    with tf.name_scope('validation'):
        valid_input, valid_target = input_batch(valid_data, batch_size, num_steps, 'validation_input')
        epoch_size = ((len(valid_data) // batch_size) - 1) // num_steps
        with tf.variable_scope('model', reuse=True, initializer=initializer):
            m_valid = PTBTrain(is_training=False,
                               input_data=valid_input,
                               targets=valid_target,
                               epoch_size=epoch_size,
                               **params)
        tf.summary.scalar('validation_loss', m_valid._cost)
        
    # Test model: same variables again, evaluated one token at a time.
    with tf.name_scope('test'):
        test_input, test_target = input_batch(test_data, 1, 1, 'test_input')
        # the block-count formula with batch_size = num_steps = 1
        epoch_size = len(test_data) - 1
        params.update(batch_size=1, num_steps=1)
        with tf.variable_scope('model', reuse=True, initializer=initializer):
            m_test = PTBTrain(is_training=False,
                              input_data=test_input,
                              targets=test_target,
                              epoch_size=epoch_size,
                              **params)

In [26]:
import os
import shutil

def remove_dir(path):
    """Delete the directory tree at ``path``; do nothing when it is not a directory."""
    if not os.path.isdir(path):
        return
    shutil.rmtree(path)

# Output locations: LOG_DIR is the Supervisor's logdir, MODEL_FILE is the
# checkpoint prefix used when the trained model is saved.
LOG_DIR = os.path.join(HOME_DIR, 'log')
MODEL_DIR = os.path.join(HOME_DIR, 'model')
MODEL_FILE = os.path.join(MODEL_DIR, 'ptb_lstm')

# Start every run from a clean slate.
remove_dir(LOG_DIR)
remove_dir(MODEL_DIR)

# Recreate the (empty) model folder: the final save writes to MODEL_FILE, and
# the saver may refuse to write when the parent directory does not exist.
if not os.path.isdir(MODEL_DIR):
    os.makedirs(MODEL_DIR)

In [27]:
%%time

# Train for cfg.max_max_epoch epochs, validate after each one, then score the
# test split once and save the model.
sv = tf.train.Supervisor(graph=graph, logdir=LOG_DIR)
with sv.managed_session() as session:
    for epoch in range(1, cfg.max_max_epoch+1):
        # Full learning rate through epoch cfg.max_epoch, then multiplied by
        # cfg.lr_decay once per additional epoch.
        decay = cfg.lr_decay ** max(epoch - cfg.max_epoch, 0.0)
        lr_value = cfg.learning_rate * decay
        m_train.assign_lr(session, lr_value)
        
        print()
        print('Epoch: {:d} Learning rate: {:.6f}'.format(epoch, m_train.lr(session)))
        
        train_perplexity = m_train.run_epoch_train(session)
        print('Epoch: {:d} Train Perplexity: {:.3f}'.format(epoch, train_perplexity))
        
        # m_valid was built with is_training=False, so its train op is a no-op.
        valid_perplexity = m_valid.run_epoch_eval(session)
        print('Epoch: {:d} Valid Perplexity: {:.3f}'.format(epoch, valid_perplexity))
    
    print()
    
    test_perplexity = m_test.run_epoch_eval(session)
    print('Test Perplexity: {:.3f}'.format(test_perplexity))
    
    print()
    print('Saving model...')
    # Written under MODEL_FILE, tagged with the Supervisor's global step.
    sv.saver.save(session, MODEL_FILE, global_step=sv.global_step)

INFO:tensorflow:Starting standard services.
INFO:tensorflow:Saving checkpoint to path treebank/log/model.ckpt
INFO:tensorflow:Starting queue runners.
INFO:tensorflow:model/global_step/sec: 0

Epoch: 1 Learning rate: 1.000000INFO:tensorflow:Recording summary at step 0.

0.0% perplexity: 9,180.709 speed: 1,920 wps
10.0% perplexity: 840.813 speed: 2,911 wps
20.0% perplexity: 781.459 speed: 2,910 wps
30.0% perplexity: 758.018 speed: 2,911 wps
INFO:tensorflow:model/global_step/sec: 72.7842
INFO:tensorflow:Recording summary at step 8734.
40.0% perplexity: 746.492 speed: 2,913 wps
50.0% perplexity: 741.939 speed: 2,915 wps
60.0% perplexity: 717.206 speed: 2,917 wps
70.0% perplexity: 686.134 speed: 2,917 wps
INFO:tensorflow:model/global_step/sec: 72.9932
INFO:tensorflow:Recording summary at step 17493.
80.0% perplexity: 661.062 speed: 2,916 wps
90.0% perplexity: 638.630 speed: 2,915 wps
Epoch: 1 Train Perplexity: 619.249
Epoch: 1 Valid Perplexity: 489.841

Epoch: 2 Learning rate: 0.500000
0.0%