In [2]:
import time
from collections import namedtuple
import numpy as np
import tensorflow as tf

# Data Preparation

## Load the file

In [3]:
with open('anna.txt', 'r') as f:
    text = f.read()
vocab = sorted(set(text))
vocab_to_int = {c: i for i, c in enumerate(vocab)}
int_to_vocab = dict(enumerate(vocab))

# Convert all the text as integer
encoded = np.array([vocab_to_int[c] for c in text], dtype=np.int32)

## Create mini batch

In [5]:
def get_batches(arr, batch_size, n_steps):
    # Get the total number of characters per batch
    chars_per_batch = batch_size * n_steps

    # Total Number of batchs
    n_batches = len(arr) // chars_per_batch

    # Get the full pack of charactors for each batch
    arr = arr[:n_batches * chars_per_batch]

    # Reshape the array by batch_size, Horizonal reshaping based on batch size.
    arr = arr.reshape((batch_size, -1))

    # Create steps from mini batch, Keep in mind this is a matrix
    for n in range(0, arr.shape[1], n_steps):
        x = arr[:, n:n + n_steps]

        # At last batch the y will be sort by 1 charactor. That will create error in `tf.nn.dynamic_rnn` for array size mis match
        y_temp = arr[:, n + 1:n + n_steps + 1]
        
        # Create a zero array and append with y output
        y = np.zeros(x.shape, dtype=np.int32)
        y[:, :y_temp.shape[1]] = y_temp
        
        # To handle memory
        yield x, y

### Test batch function

In [6]:
# Test
batch_size = 10; n_steps=5
batchs = get_batches(encoded, batch_size, n_steps)
x, y = next(batchs)
print(x)
print(y)

[[31 64 57 72 76]
 [81 11  3  1 57]
 [57 63 70 65 62]
 [ 1 57  1 72 74]
 [61  1 76 64 57]
 [ 1 75 57 79  1]
 [59 76 61 60  1]
 [57 75  1 75 77]
 [61 60  1 65 76]
 [59 61  1 71 62]]
[[64 57 72 76 61]
 [11  3  1 57 70]
 [63 70 65 62 65]
 [57  1 72 74 61]
 [ 1 76 64 57 70]
 [75 57 79  1 64]
 [76 61 60  1 79]
 [75  1 75 77 62]
 [60  1 65 76 75]
 [61  1 71 62  1]]


# Tensorflow Initialization

## Building Inputs

In [7]:
def build_inputs(batch_size, n_steps):
    inputs = tf.placeholder(
        shape=[batch_size, n_steps], dtype=tf.int32, name='inputs')
    
    outputs = tf.placeholder(
        shape=[batch_size, n_steps], dtype=tf.int32, name='outputs')
    
    keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')
    
    print('Shape of the input tensor: {}'.format(inputs.get_shape()))
    print('Shape of the output tensor: {}'.format(outputs.get_shape()))
    
    return inputs, outputs, keep_prob

## LSTM cells

In [8]:
def build_lstm(lstm_size, num_layers, batch_size, keep_prob):

    # Single cell creation
    def build_cell(lstm_size, keep_prob):
        cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        drop = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
        return drop

    # Initialize different cells by calling single cell creation function
    cells = tf.contrib.rnn.MultiRNNCell(
        [build_cell(lstm_size, keep_prob) for _ in range(num_layers)])
    
    print('ALL LSTM cells creation: {}'.format(cells))

    # Values that pass to another cell horizontally for memory or time-steps
    initial_state = cells.zero_state(batch_size, tf.float32)
    
    return cells, initial_state

## Build Output 

In [9]:
def build_output(lstm_output, lstm_size, number_of_classes):
    # lstm_output comes as list. Concat to create as array
    pre_output = tf.concat(lstm_output, axis=1)

    # Reshape the output matrix as [batch_size*n_steps X lstm_size]
    # Each row output for each charactor. The number of rows will batch_size*n_steps.
    output = tf.reshape(pre_output, [-1, lstm_size])

    # Apply softmax function by creation a softmax layer
    # `variable_scope` helps to change the default name for weight initialization.
    #  Because RNN cell also have weights and bias with the default
    with tf.variable_scope('softmax'):
        softmax_w = tf.Variable(
            tf.truncated_normal((lstm_size, number_of_classes), stddev=0.1))
        softmax_b = tf.Variable(tf.zeros(number_of_classes))

    # Input for softmax function
    logits = tf.matmul(output, softmax_w) + softmax_b

    predictions = tf.nn.softmax(logits, name='predictions')
    
    return predictions, logits

## Training loss

In [10]:
def build_loss(logits, targets, lstm_size, num_of_classes):

    # convert the numbers to one hot encoding
    target_one_hot = tf.one_hot(targets, num_of_classes)

    # Change the shape similer to output from RNN
    target = tf.reshape(target_one_hot, logits.get_shape())

    # Apply cross entrophy
    loss = tf.nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=target)
    
    loss = tf.reduce_mean(loss)

    return loss

## Optimizer or Simple gradient decent

* `tf.trainable_variables`: weights and bias used during the training, This is assigned by TF

* `tf.gradients`: Constructs symbolic partial derivatives of sum of ys w.r.t. x in xs. That return $\delta w, \delta b = tf.gradients(cost, [W, b])$. The `tf.gradients()` returns the gradient of cost wrt each tensor in the second argument as a list in the same order.

* `tf.clip_by_global_norm` is a function to put the upper bound on the gradient value wrt to each variables. `grad_clip` is the upper bound.

* AdamOptimizer is used to minimise the prediction error. It is variation of gradient descent. 
Here's a list on optimizers

In [11]:
def build_optimizer(loss, learning_rate, grad_clip):
    # weights and bias used during the training, This is assigned by TF
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
    
    train_op = tf.train.AdamOptimizer(learning_rate)
    
    # Tune the weights
    optimizer = train_op.apply_gradients(zip(grads, tvars))
    
    return optimizer

# Building Network

## Create the class of the model

In [19]:
class CharRNN:
    def __init__(self, num_classes, batch_size=64, num_steps = 50, lstm_size=128, num_layers = 2, learning_rate=0.001,
                grad_clip=5, sampling=False):

        if sampling==True:
            batch_size, num_steps = 1,1

        # Reset all variabled that has mapped with this model
        tf.reset_default_graph()
        
        # Tensor input
        self.inputs, self.targets, self.keep_prob = build_inputs(batch_size, num_steps)
        
        # Tensor cells and state.
        cell, self.initial_state = build_lstm(lstm_size, num_layers, batch_size, self.keep_prob)
        
        # Encode the inputs.
        input_one_hot = tf.one_hot(self.inputs, num_classes)
        
        # Create a RNN network with LSTM cell, input and initial state
        outputs, state = tf.nn.dynamic_rnn(cell, input_one_hot, initial_state=self.initial_state)
        self.final_state = state
        
        self.prediction, self.logits = build_output(outputs, lstm_size, num_classes)
        
        self.loss = build_loss(self.logits, self.targets, lstm_size, num_classes)
        
        self.optimizer = build_optimizer(self.loss, learning_rate, grad_clip)

## Training parameters
* `batch_size`    - Number of sequences running through the network in one pass.
* `num_steps`     - Number of characters in the sequence the network is trained on. Larger is better typically, the network will learn more long range dependencies. But it takes longer to train. 100 is typically a good number here.
* `lstm_size`     - The number of units in the hidden layers.
* `num_layers`    - Number of hidden LSTM layers to use
* `learning_rate` - Learning rate for training
* `keep_prob`     - The dropout keep probability when training. If you're network is overfitting, try decreasing this.

## Tips and Tricks

### Monitoring Validation Loss vs. Training Loss
The most important quantity to keep track of is the difference between your **training loss** (printed during training) and the **validation loss** (printed once in a while when the RNN is run on the validation data (by default every 1000 iterations)). In particular:

- If your training loss is much lower than validation loss then this means the network might be **overfitting**. Solutions to this are to **decrease your network size, or to increase dropout**. For example you could try dropout of 0.5 and so on.
- If your training/validation loss are about equal then your model is **underfitting**. **Increase the size of your model (either number of layers or the raw number of neurons per layer)**

### Approximate number of parameters

The two most important parameters that control the model are `lstm_size` and `num_layers`. I would advise that you always use `num_layers` of either 2/3. The `lstm_size` can be adjusted based on how much data you have. The two important quantities to keep track of here are:

- The number of parameters in your model. This is printed when you start training.
- The size of your dataset. 1MB file is approximately 1 million characters.

These two should be about the same order of magnitude. It's a little tricky to tell. Here are some examples:

- I have a 100MB dataset and I'm using the default parameter settings (which currently print 150K parameters). My data size is significantly larger (100 mil >> 0.15 mil), so I expect to heavily underfit. I am thinking I can comfortably afford to make `lstm_size` larger.
- I have a 10MB dataset and running a 10 million parameter model. I'm slightly nervous and I'm carefully monitoring my validation loss. If it's larger than my training loss then I may want to try to increase dropout a bit and see if that helps the validation loss.

### Best models strategy

The winning strategy to obtaining very good models (if you have the compute time) is to always err on making the network larger (as large as you're willing to wait for it to compute) and then try different dropout values (between 0,1). **Whatever model has the best validation performance (the loss, written in the checkpoint filename, low is good) is the one you should use in the end.**

It is very common in deep learning to run many different models with many different hyperparameter settings, and in the end **take whatever checkpoint gave the best validation performance**.

By the way, the size of your training and validation splits are also parameters. Make sure you have a decent amount of data in your validation set or otherwise the validation performance will be noisy and not very informative.

In [13]:
batch_size = 100        # Sequences per batch
num_steps = 100         # Number of sequence steps per batch
lstm_size = 512         # Size of hidden layers in LSTMs
num_layers = 2          # Number of LSTM layers
learning_rate = 0.001   # Learning rate
keep_prob = 0.5         # Dropout keep probability

In [14]:
epochs = 20

# Print losses every N interations
print_every_n = 50

# Save every N iterations
save_every_n = 200

model = CharRNN(len(vocab), batch_size=batch_size, num_steps=num_steps,
                lstm_size=lstm_size, num_layers=num_layers, 
                learning_rate=learning_rate)

# Store the model for best validation performance, Where the validation loss is low.
saver = tf.train.Saver(max_to_keep=100)

Shape of the input tensor: (100, 100)
Shape of the output tensor: (100, 100)
ALL LSTM cells creation: <tensorflow.python.ops.rnn_cell_impl.MultiRNNCell object at 0x7fbc8680a320>
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



# Trained models

## Training

In [15]:
with tf.Session() as sess:
    # Initialize all TF variables
    sess.run(tf.global_variables_initializer())
    
    # Use the line below to load a checkpoint and resume training
    #saver.restore(sess, 'checkpoints/______.ckpt')
    counter = 0
    for e in range(epochs):
        
        # Tuple of initial state with 0 tensors
        new_state = sess.run(model.initial_state)
        loss = 0
        for x, y in get_batches(encoded, batch_size, num_steps):
            counter += 1
            start = time.time()
            feed = {model.inputs: x,
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state}
            
            # we pass the each function element of list. That return tuples. So we pick from each.
            # batch_loss: Previos iteration loss
            # new_state: previous iteration final_state will be the new_state for next iteration.
            batch_loss, new_state, _ = sess.run([model.loss, 
                                                 model.final_state, 
                                                 model.optimizer], 
                                                 feed_dict=feed)
            if (counter % print_every_n == 0):
                end = time.time()
                print('Epoch: {}/{}... '.format(e+1, epochs),
                      'Training Step: {}... '.format(counter),
                      'Training loss: {:.4f}... '.format(batch_loss),
                      '{:.4f} sec/batch'.format((end-start)))
        
            if (counter % save_every_n == 0):
                saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))

    # i{iteration number}_l{# hidden layer units}.ckpt
    saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))

Epoch: 1/20...  Training Step: 50...  Training loss: 3.1663...  0.3197 sec/batch
Epoch: 1/20...  Training Step: 100...  Training loss: 3.0855...  0.3228 sec/batch
Epoch: 1/20...  Training Step: 150...  Training loss: 2.7331...  0.3211 sec/batch
Epoch: 2/20...  Training Step: 200...  Training loss: 2.4193...  0.3217 sec/batch
Epoch: 2/20...  Training Step: 250...  Training loss: 2.3242...  0.3211 sec/batch
Epoch: 2/20...  Training Step: 300...  Training loss: 2.2082...  0.3230 sec/batch
Epoch: 2/20...  Training Step: 350...  Training loss: 2.1719...  0.3245 sec/batch
Epoch: 3/20...  Training Step: 400...  Training loss: 2.0444...  0.3230 sec/batch
Epoch: 3/20...  Training Step: 450...  Training loss: 1.9644...  0.3228 sec/batch
Epoch: 3/20...  Training Step: 500...  Training loss: 1.9265...  0.3210 sec/batch
Epoch: 3/20...  Training Step: 550...  Training loss: 1.8887...  0.3217 sec/batch
Epoch: 4/20...  Training Step: 600...  Training loss: 1.7832...  0.3210 sec/batch
Epoch: 4/20...  T

## Load trained models

In [None]:
# This helps to pick the latest check point.
tf.train.get_checkpoint_state('checkpoints')

# Pick the model by it's name
checkpoint = 'checkpoints/i200_l512.ckpt'
# Load the model
saver.restore(sess, checkpoint)

# Predictions
Once the model is trained. We are going to generate text from trained models. The idea is that we pass in a character, then the network will predict the next character. We can use the new one, to predict the next one. And we keep doing this to generate all new text

* `pick_top_n` function will pick the predictions from RNN model with each class probability. 
* `np.squeeze` function used to remove one-dimensional entry from the shape of the given array. Where the preds comes as 3-dimension array.
* `np.argsort` Soft the array by probability of each class. and pick the `top_n` charactors.
* `np.random.choice` Create random samples. 

In [16]:
def pick_top_n(preds, vocab_size, top_n=5):
    # remove 1 dimension
    p = np.squeeze(preds)
    # Sort by probability
    p[np.argsort(p)[:-top_n]] = 0
    p = p / np.sum(p)
    c = np.random.choice(vocab_size, 1, p=p)[0]
    return c

* loop through each letter to predict next one
* load the model from checkpoint

In [17]:
def sample(checkpoint, n_samples, lstm_size, vocab_size, prime="The "):
    samples = [c for c in prime]
    model = CharRNN(len(vocab), lstm_size=lstm_size, sampling=True)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)
        for c in prime:
            x = np.zeros((1, 1))
            x[0,0] = vocab_to_int[c]
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.prediction, model.final_state], 
                                         feed_dict=feed)

        c = pick_top_n(preds, len(vocab))
        samples.append(int_to_vocab[c])

        for i in range(n_samples):
            x[0,0] = c
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.prediction, model.final_state], 
                                         feed_dict=feed)

            c = pick_top_n(preds, len(vocab))
            samples.append(int_to_vocab[c])
        
    return ''.join(samples)

In [20]:
checkpoint = tf.train.latest_checkpoint('checkpoints')
samp = sample(checkpoint, 2000, lstm_size, len(vocab), prime="Far")
print(samp)

Shape of the input tensor: (1, 1)
Shape of the output tensor: (1, 1)
ALL LSTM cells creation: <tensorflow.python.ops.rnn_cell_impl.MultiRNNCell object at 0x7fbc8685f198>
INFO:tensorflow:Restoring parameters from checkpoints/i3960_l512.ckpt
Farred at the table that had
so much fell of. Here she could not take the clotices of the servants, and
were all the crowd of the servant and his company of the point of the
conviction. She was that the person had never drawing the theer
of them. He found him to be decided abluttly at his hear, which
had been at once thank women in the princess, and he had not spoken
himself at the thick three hay at feeling in her arms to the solicary
of them to be dead at the stars and was taken. He sere to say if his
conversation. She came into the side of the bell of hand, and so she
did not know her and show her at her and with a fact that in her seaken he
did not spot them and to take the three, he heard her at it, and
the sheet of the cappity of the portrait t

In [21]:
checkpoint = 'checkpoints/i600_l512.ckpt'
samp = sample(checkpoint, 1000, lstm_size, len(vocab), prime="Far")
print(samp)

Shape of the input tensor: (1, 1)
Shape of the output tensor: (1, 1)
ALL LSTM cells creation: <tensorflow.python.ops.rnn_cell_impl.MultiRNNCell object at 0x7fbc3d65b208>
INFO:tensorflow:Restoring parameters from checkpoints/i600_l512.ckpt
Fartre
dor the contelssion, all atent the portings, and the his tround. And that he wan to his seaning.

"At what so what he said nathing to
him betally, bucting and as anding her
tore anytred hard and aspents with the sanch of
shins the concenss to
a pastion and the sach him
the withous, he distion the
poress of stard.

"I mant which and she his sereary, by sheat he called, the she there aness of himserfing tanding to the
muther and
shathing her. "I dint to the more the poncertens that the
mane and the crond and the mored. Ho saik of and his
sine then
well talk in the hid his, thind she had hould
not his take her sending,
was had to her
herdent, her, the center had beang his her had hearse, and has stired and him sounded the
pricester the semint of t

In [22]:
checkpoint = 'checkpoints/i1200_l512.ckpt'
samp = sample(checkpoint, 1000, lstm_size, len(vocab), prime="Far")
print(samp)

Shape of the input tensor: (1, 1)
Shape of the output tensor: (1, 1)
ALL LSTM cells creation: <tensorflow.python.ops.rnn_cell_impl.MultiRNNCell object at 0x7fbc61707a20>
INFO:tensorflow:Restoring parameters from checkpoints/i1200_l512.ckpt
Fardraly. But she was a strange with his condiciat of the parsiant of her said to a cannon and
sear of the sare and something with all, he was striggly and say to
the sants of serent of him and he
sow, her ale
to be any the conversation, he shouted to
her at the parents and at her fanch the clast, she
were at the most as though the
more of the come, but the man as any she went
to think the coldections that she would seen her
husbandd him he had not
three wither trat and that had a conversation that
she went out off her find the camment to seen her face all the position in
socition in the comacion was the
poon who conversatial of her for the same of
the complething were all, so the sense of
happy of the stanting of the some out of a lateres,
what
he h