In [None]:
from d2l.tensorflow import data, losses, optimizers

import tensorflow as tf
import numpy as np

In [None]:
# Load the text corpus from 'timemachine.txt'. The helper returns a data
# iterator (its `.corpus` attribute is read later) and the vocabulary object.
# NOTE(review): the meaning of the positional arguments 128 and 1 is defined by
# d2l.tensorflow.data.load_seq_data and is not visible here — confirm.
data_iter, vocab = data.load_seq_data('timemachine.txt', 128, 1)

In [None]:
# One-hot encode three example indices; the encoding depth is the vocabulary
# size, so the displayed result has one row per index and len(vocab) columns.
tf.one_hot(np.array([0, 2, 20]), len(vocab))

##  Converting an entire minibatch to one-hot encoded data

In [None]:
def to_onehot(X, size):
    """One-hot encode a minibatch of indices, one tensor per time step.

    Args:
        X: Array-like of integer indices. In the training path of this
            notebook it has shape (batch_size, num_steps).
        size: Depth of the one-hot encoding (the vocabulary size).

    Returns:
        A list with one entry per row of ``tf.transpose(X)``; each entry is
        that row one-hot encoded with depth ``size``.
    """
    encoded_steps = []
    # Transposing puts the time axis first, so iteration walks over steps.
    for step_indices in tf.transpose(X):
        encoded_steps.append(tf.one_hot(step_indices, size))
    return encoded_steps

# Toy minibatch: 2 sequences of 5 consecutive indices each.
X = np.arange(10).reshape((2, 5))
inputs = to_onehot(X, len(vocab))
# Number of per-step tensors and the shape of the first one.
print(len(inputs), inputs[0].shape)
print(inputs)

## RNN Model

In [None]:
# Vocabulary-derived sizes and lookup tables shared by the cells below.
vocab_size = len(vocab)
corpus_indices = data_iter.corpus
idx_to_char = vocab.idx_to_token
char_to_idx = vocab.token_to_idx

# Inputs are one-hot vectors over the vocabulary and outputs are scores over
# the same vocabulary, so both widths equal vocab_size.
num_hiddens = 512
num_inputs = num_outputs = vocab_size

def get_params():
    """Create the trainable parameters of the single-layer RNN.

    Reads the module-level ``num_inputs``, ``num_hiddens`` and ``num_outputs``.

    Returns:
        A list of ``tf.Variable`` in the order
        ``[W_xh, W_hh, b_h, W_hq, b_q]``. Weight matrices are drawn from a
        normal distribution with standard deviation 0.01; biases start at zero.
    """
    def _small_normal(shape):
        return tf.random.normal(shape=shape, stddev=0.01)

    # Wrapping each tensor in tf.Variable makes it trainable. The list is
    # evaluated left to right, so the random draws happen in the order
    # W_xh, W_hh, W_hq.
    return [
        tf.Variable(_small_normal((num_inputs, num_hiddens))),   # W_xh
        tf.Variable(_small_normal((num_hiddens, num_hiddens))),  # W_hh
        tf.Variable(tf.zeros(num_hiddens)),                      # b_h
        tf.Variable(_small_normal((num_hiddens, num_outputs))),  # W_hq
        tf.Variable(tf.zeros(num_outputs)),                      # b_q
    ]

In [None]:
def init_rnn_state(batch_size, num_hiddens):
    """Build the initial hidden state of the RNN.

    The state is returned inside a container (rather than as a bare tensor)
    so that models carrying more than one state tensor can reuse the same
    interface later.

    Args:
        batch_size: Number of sequences processed in parallel.
        num_hiddens: Width of the hidden state.

    Returns:
        A one-element list holding a zero-filled ``tf.Variable`` of shape
        (batch_size, num_hiddens).
    """
    zero_hidden = tf.zeros(shape=(batch_size, num_hiddens))
    return [tf.Variable(zero_hidden)]

In [None]:
def rnn(inputs, state, params):
    """Run the RNN forward over a sequence of time steps.

    Args:
        inputs: Sequence of num_steps matrices, each of shape
            (batch_size, vocab_size).
        state: One-element container holding the incoming hidden state.
        params: ``[W_xh, W_hh, b_h, W_hq, b_q]`` as produced by ``get_params``.

    Returns:
        ``(outputs, (H,))`` where ``outputs`` is a list with one output matrix
        per time step and ``H`` is the hidden state after the last step.
    """
    W_xh, W_hh, b_h, W_hq, b_q = params
    (hidden,) = state
    outputs = []
    for step_input in inputs:
        # The new hidden state mixes the current input with the previous state.
        pre_activation = (
            tf.matmul(step_input, W_xh) + tf.matmul(hidden, W_hh) + b_h
        )
        hidden = tf.tanh(pre_activation)
        # Project the hidden state to one score per output class.
        outputs.append(tf.matmul(hidden, W_hq) + b_q)
    return outputs, (hidden,)

In [None]:
# Smoke test: run one forward pass on the toy minibatch X with freshly
# initialised (untrained) parameters and inspect the resulting shapes.
state = init_rnn_state(X.shape[0], num_hiddens)
inputs = to_onehot(X, vocab_size)
params = get_params()
outputs, state_new = rnn(inputs, state, params)
# Number of input steps, shape of one input step, shape of the initial state.
print(len(inputs), inputs[0].shape, state[0].shape)
# Number of output steps, shape of one output step, shape of the final state.
print(len(outputs), outputs[0].shape, state_new[0].shape)

In [None]:
def predict_rnn(
    prefix, num_chars, rnn, params, init_rnn_state,
    num_hiddens, vocab_size, idx_to_char, char_to_idx
):
    """Extend ``prefix`` by ``num_chars`` characters using greedy decoding.

    The prefix is first fed through the network one character at a time to
    warm up the hidden state; after that, each predicted character becomes
    the next input.

    Args:
        prefix: Non-empty string whose characters are all keys of
            ``char_to_idx``.
        num_chars: Number of characters to generate after the prefix.
        rnn: Forward function called as ``rnn(inputs, state, params)``.
        params: Model parameters forwarded to ``rnn``.
        init_rnn_state: Callable ``(batch_size, num_hiddens)`` returning the
            initial state.
        num_hiddens: Hidden-state width forwarded to ``init_rnn_state``.
        vocab_size: One-hot depth of the inputs.
        idx_to_char: Index -> character lookup.
        char_to_idx: Character -> index lookup.

    Returns:
        The prefix followed by ``num_chars`` predicted characters, as a string.
    """
    state = init_rnn_state(1, num_hiddens)
    output = [char_to_idx[prefix[0]]]
    for t in range(num_chars + len(prefix) - 1):
        # The output of the previous time step is the input of the current
        # one. Shape it as a (batch_size=1, num_steps=1) index batch so that
        # to_onehot yields a one-element list holding a (1, vocab_size)
        # matrix, which is the layout rnn expects.
        inputs = to_onehot(np.array([[output[-1]]]), vocab_size)
        # Calculate the output and update the hidden state.
        Y, state = rnn(inputs, state, params)
        # The input to the next time step is the next character in the
        # prefix or, once the prefix is exhausted, the best prediction.
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])
        else:
            # This is maximum likelihood decoding, not sampling. argmax over
            # axis 1 has shape (1,); index it down to a scalar before int()
            # so we do not rely on converting a non-scalar array to an int.
            output.append(int(tf.argmax(Y[0], axis=1)[0]))
    return ''.join([idx_to_char[i] for i in output])

In [None]:
# Generate 10 characters after the prefix with the still-untrained parameters.
predict_rnn(
    prefix='traveller',
    num_chars=10,
    rnn=rnn,
    params=params,
    init_rnn_state=init_rnn_state,
    num_hiddens=num_hiddens,
    vocab_size=vocab_size,
    idx_to_char=idx_to_char,
    char_to_idx=char_to_idx,
)

## Gradient Clipping

In [None]:
def grad_clipping(gradients, theta):
    """Rescale gradients so that their global L2 norm is at most ``theta``.

    The global norm is the square root of the sum of squared entries over
    *all* gradients. If it exceeds ``theta``, every gradient is multiplied by
    ``theta / norm``; otherwise the gradients are returned unchanged.

    Args:
        gradients: Iterable of gradient tensors (one per parameter).
        theta: Maximum allowed global norm.

    Returns:
        A list of ``tf.Variable`` copies of the gradients, clipped if needed.
        The input tensors themselves are not modified.
    """
    clipped = [tf.Variable(grad) for grad in gradients]
    # Accumulate the squared entries of every gradient first; the square root
    # must be taken once, after the whole sum has been formed.
    squared_sum = 0.0
    for grad in clipped:
        squared_sum = squared_sum + tf.reduce_sum(grad ** 2)
    norm = tf.sqrt(squared_sum)
    if norm > theta:
        scale = theta / norm
        for grad in clipped:
            # Replace (not add to) the gradient with its scaled version.
            grad.assign(grad * scale)
    return clipped

## Training function

In [None]:
# This function is saved in the d2l package for future use.
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    """Train an RNN language model with SGD and periodically print samples.

    Args:
        rnn: Forward function called as ``rnn(inputs, state, params)`` that
            returns ``(outputs, new_state)``.
        get_params: Zero-argument callable returning the list of trainable
            ``tf.Variable`` parameters.
        init_rnn_state: Callable ``(batch_size, num_hiddens)`` returning the
            initial hidden state.
        num_hiddens: Hidden-state width forwarded to ``init_rnn_state``.
        vocab_size: One-hot depth used for the inputs.
        corpus_indices: Corpus handed to the d2l minibatch iterator.
        idx_to_char: Index -> character lookup forwarded to ``predict_rnn``.
        char_to_idx: Character -> index lookup forwarded to ``predict_rnn``.
        is_random_iter: If true, use ``data.seq_data_iter_random`` and reset
            the hidden state before every minibatch; otherwise use
            ``data.seq_data_iter_consecutive`` and reset it once per epoch.
        num_epochs: Number of passes over the data.
        num_steps: Sequence length forwarded to the minibatch iterator.
        lr: Learning rate passed to ``optimizers.sgd``.
        clipping_theta: Threshold passed to ``grad_clipping``.
        batch_size: Minibatch size.
        pred_period: Report perplexity and samples every this many epochs.
        pred_len: Number of characters to generate for each prefix.
        prefixes: Iterable of prefix strings to generate from.

    Returns:
        None. Progress is printed; the trained parameters stay local to the
        call.
    """
    import time
    import math
    from tqdm import tqdm
    if is_random_iter:
        data_iter_fn = data.seq_data_iter_random
    else:
        data_iter_fn = data.seq_data_iter_consecutive
    params = get_params()
    loss = losses.softmax_cross_entropy

    for epoch in tqdm(range(num_epochs)):
        if not is_random_iter:
            # If adjacent sampling is used, the hidden state is initialized
            # at the beginning of the epoch.
            state = init_rnn_state(batch_size, num_hiddens)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps)
        for X, Y in data_iter:
            if is_random_iter:
                # If random sampling is used, the hidden state is initialized
                # before each mini-batch update.
                state = init_rnn_state(batch_size, num_hiddens)
            # With adjacent sampling the state carried over from the previous
            # minibatch was computed outside the tape opened below, so
            # backpropagation stops at it and no explicit detach is needed.
            with tf.GradientTape() as t:
                inputs = to_onehot(X, vocab_size)
                # outputs is num_steps terms of shape (batch_size, vocab_size);
                # after stitching it is (num_steps * batch_size, vocab_size).
                (outputs, state) = rnn(inputs, state, params)
                outputs = tf.concat(outputs, axis=0)
                # The shape of Y is (batch_size, num_steps). Transposing and
                # flattening turns it into a vector of length
                # batch_size * num_steps whose entries line up one-to-one
                # with the rows of outputs.
                y = tf.reshape(tf.transpose(Y), (-1,))
                # Average classification error via cross entropy loss.
                l = tf.reduce_mean(loss(y, outputs))
            gradients = t.gradient(l, params)
            gradients = grad_clipping(gradients, clipping_theta)  # Clip the gradient.
            # Since the error is already a mean, no further averaging of the
            # gradients is wanted here.
            # NOTE(review): the trailing 1 is presumably the batch-size divisor
            # of d2l's sgd helper — confirm against d2l.tensorflow.optimizers.
            optimizers.sgd(params, gradients, lr, 1)
            # Accumulate as a plain Python float so the running total is not
            # kept as a float32 tensor.
            num_targets = y.shape[0]
            l_sum += float(l) * num_targets
            n += num_targets

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(
                    prefix, pred_len, rnn, params, init_rnn_state,
                    num_hiddens, vocab_size, idx_to_char, char_to_idx))

In [None]:
# Optimisation hyperparameters.
num_epochs = 500
num_steps = 64
batch_size = 32
lr = 1e2
clipping_theta = 1e-2

# Reporting schedule: every pred_period epochs, generate pred_len characters
# for each prefix.
pred_period = 50
pred_len = 50
prefixes = ['traveller', 'time traveller']

In [None]:
# Train from scratch with random minibatch sampling (is_random_iter=True).
train_and_predict_rnn(
    rnn=rnn,
    get_params=get_params,
    init_rnn_state=init_rnn_state,
    num_hiddens=num_hiddens,
    vocab_size=vocab_size,
    corpus_indices=corpus_indices,
    idx_to_char=idx_to_char,
    char_to_idx=char_to_idx,
    is_random_iter=True,
    num_epochs=num_epochs,
    num_steps=num_steps,
    lr=lr,
    clipping_theta=clipping_theta,
    batch_size=batch_size,
    pred_period=pred_period,
    pred_len=pred_len,
    prefixes=prefixes,
)