In [1]:
import os
import numpy as np

import tensorflow as tf
from tensorflow.python.keras.datasets import mnist
from tensorflow.contrib.eager.python import tfe

  from ._conv import register_converters as _register_converters


In [2]:
# Turn on eager (imperative) execution; this is done first, before any
# other TensorFlow call in the notebook.
tf.enable_eager_execution()

# Seed both NumPy and TensorFlow with the same fixed value.
np.random.seed(0)
tf.set_random_seed(0)

In [3]:
# Make sure the checkpoint directory exists. `exist_ok=True` keeps this cell
# idempotent on re-run and avoids the check-then-create race of calling
# `os.path.exists` followed by `os.makedirs`.
os.makedirs('weights/', exist_ok=True)

# constants
units = 128        # number of units handed to the LSTM cell
batch_size = 100   # batch size used for both fit() and evaluate()
epochs = 2         # number of training epochs
num_classes = 10   # one-hot depth of the labels / size of the classifier output

In [4]:
# Load the MNIST train / test splits.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Cast to float32, divide by 255 and view every image as a sequence:
# 28 timesteps, 28 inputs / timestep.
x_train = (x_train.astype('float32') / 255.).reshape((-1, 28, 28))
x_test = (x_test.astype('float32') / 255.).reshape((-1, 28, 28))

# One-hot encode the labels, then convert the result back to NumPy because a
# mix of NumPy arrays and tensors cannot be passed as input to Keras.
y_train_ohe, y_test_ohe = (
    tf.one_hot(labels, depth=num_classes).numpy()
    for labels in (y_train, y_test)
)

# Report the shape of every array that is fed to the model.
for name, array in (('x train', x_train), ('y train', y_train_ohe),
                    ('x test', x_test), ('y test', y_test_ohe)):
    print(name, array.shape)

x train (60000, 28, 28)
y train (60000, 10)
x test (10000, 28, 28)
y test (10000, 10)


# Fast LSTM Cell

Here, we take a middle ground between the canonical but slow approach (6.1) and the fully custom approach (6.2) to building RNNs.

It is not recommended to try building RNNs from scratch unless you know what is going on in the model: many factors, such as initialization, order of operations and the activation function, can affect results drastically (it is not as fun as it looks).

Hence, when you want something pre-built, but with speed comparable to or better than a custom model, you can use the Tensorflow Cell variant of that RNN layer.

Here, we will use the LSTMCell from Tensorflow to demonstrate this. Note that this is a full-fledged LSTM Cell from Tensorflow, so it has a lot more niceties, like peephole connections, dropout support, initializers, etc.

## Notes

A point to take note of is that since a cell was originally meant to be built by the RNN that wraps it, we have to perform an extra check inside `call` to build the RNN if it has not been built the first time. This will make the first call to the Model slightly slower, but not by much.

In essence, we override the K.rnn() symbolic loop and use an Eager loop to manage the internal state of the LSTM as well as the feeding of data slices to it.

It won't take much to extend this to a multi-layer RNN.

- Here we have 2 initial states for 1 layer. For k layers, you would need a list of 2 * k initial states. This is specific to an LSTM. A GRU needs only 1 state.
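- Loop around the 1st Cell and manage its output embedding. Here, we won't maintain just the final `x`, but the entire list of `x` over all timesteps, and use `tf.stack(x_list, axis=1)` to get back a time series of shape (batchsize, timesteps, hiddendim). (Each `x` has shape (batchsize, hiddendim), so the list must be stacked along a new time axis rather than concatenated along the feature axis.)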
- After this first layer, use the outer loop to switch to the next cell. Initially, the states of this cell will be the two zero states corresponding to the i*2-th location in the state list.
- Loop around while maintaining all of its intermediate `x` and do the same as above.
- Finally, after all the layers of the RNN are done, feed only the final x of the final Cell to the classifier.

## Noticeable speed difference

With the Eager loop, this model is significantly faster than the barebones BasicLSTM model. This is most probably because the TensorFlow cell uses TensorFlow's internal calls for add, multiply, etc., whereas I use the public-facing API to design the custom cell.

In [5]:
class RNN(tf.keras.Model):
    """Single-layer LSTM classifier unrolled with a plain Python loop.

    A `tf.nn.rnn_cell.LSTMCell` is stepped over the second axis of the input
    one slice at a time; the cell output of the last step is passed to a
    dense layer followed by a softmax.

    Args:
        units: Number of units passed to the LSTM cell.
        num_classes: Output size of the dense classifier.
    """

    def __init__(self, units, num_classes):
        super(RNN, self).__init__()
        self.units = units
        self.lstm_cell = tf.nn.rnn_cell.LSTMCell(units)  # use the tensorflow cell directly
        self.classifier = tf.keras.layers.Dense(num_classes)

    def call(self, inputs, training=None, mask=None):
        """Run the LSTM cell over every timestep and classify the last output.

        Args:
            inputs: Tensor indexed as [batch, timestep, feature]; the loop
                below iterates over axis 1 and slices `inputs[:, t, :]`.
            training: Not used by this method.
            mask: Not used by this method.

        Returns:
            The softmax of the classifier output for the final timestep.
        """
        # Start every forward pass from the cell's zero state, sized to the
        # batch dimension of the incoming tensor.
        state = self.lstm_cell.zero_state(batch_size=inputs.shape[0], dtype=tf.float32)
        x = inputs  # overwritten by the cell output on the first iteration

        for t in range(inputs.shape[1]):
            # Named `step_input` rather than `input` so the Python builtin
            # `input` is not shadowed inside this method.
            step_input = inputs[:, t, :]  # current input at timestep t

            # The cell returns its output embedding and the updated state;
            # the state is fed back in on the next timestep. Note the keyword
            # is `state`, not `states`.
            x, state = self.lstm_cell(step_input, state=state)

        output = self.classifier(x)  # only the last `x` reaches the classifier

        # NOTE(review): the softmax is pinned to the CPU because, according to
        # the original author, the op was not available on the GPU — confirm
        # whether this still holds for the TensorFlow version in use.
        with tf.device('/cpu:0'):
            output = tf.nn.softmax(output)

        return output

In [6]:
# Run on the first GPU when one is visible, otherwise fall back to the CPU.
device = '/cpu:0' if tfe.num_gpus() == 0 else '/gpu:0'

with tf.device(device):
    # build model and optimizer
    model = RNN(units, num_classes)
    model.compile(optimizer=tf.train.AdamOptimizer(0.01), loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # TF Keras tries to use entire dataset to determine shape without this step when using .fit()
    # Fix = Use exactly one sample from the provided input dataset to determine input/output shape/s for the model
    dummy_x = tf.zeros((1, 28, 28))
    model._set_inputs(dummy_x)

    # train
    model.fit(x_train, y_train_ohe, batch_size=batch_size, epochs=epochs,
              validation_data=(x_test, y_test_ohe), verbose=1)

    # evaluate on test set (batch_size passed by keyword for clarity)
    scores = model.evaluate(x_test, y_test_ohe, batch_size=batch_size, verbose=1)
    print("Final test loss and accuracy :", scores)

    # Only 'weights/' is created earlier in the notebook, so create the
    # checkpoint's own sub-directory explicitly before saving into it.
    checkpoint_path = 'weights/06_03_rnn/weights.ckpt'
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    saver = tfe.Saver(model.variables)
    saver.save(checkpoint_path)

Train on 60000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
Final test loss and accuracy : [0.07217171450378373, 0.9790000069141388]
