### Gated Recurrent Unit

In [1]:
import sys
sys.path.insert(0, '..')

import d2l
from mxnet import nd
from mxnet.gluon import rnn

# Load the dataset through d2l; the four returned values are unpacked in order.
corpus_indices, char_to_idx, idx_to_char, vocab_size = (
    d2l.load_data_time_machine())

# The model reads and emits vectors as wide as the vocabulary and keeps
# 256 hidden units in between.
num_hiddens = 256
num_inputs = num_outputs = vocab_size

# Device on which parameters and states are allocated
# (presumably a GPU when one is available -- confirm against d2l.try_gpu).
ctx = d2l.try_gpu()

### Implementation from Scratch

In [2]:
def get_params():
    """Create the trainable parameters of the from-scratch GRU.

    Reads the module-level ``num_inputs``, ``num_hiddens``, ``num_outputs``
    and ``ctx``.

    Returns:
        A flat list of 11 NDArrays, each with a gradient buffer attached, in
        the order::

            [W_xz, W_hz, b_z,   # update gate
             W_xr, W_hr, b_r,   # reset gate
             W_xh, W_hh, b_h,   # candidate hidden state
             W_hq, b_q]         # output layer
    """
    def _weight(shape):
        # Gaussian initialisation with standard deviation 0.01 on ``ctx``.
        return nd.random.normal(scale=0.01, shape=shape, ctx=ctx)

    def _bias(size):
        # Biases start at zero on ``ctx``.
        return nd.zeros(size, ctx=ctx)

    params = []
    # Update gate, reset gate and candidate hidden state each own an
    # (input-to-hidden weight, hidden-to-hidden weight, bias) triple.
    for _ in range(3):
        params.append(_weight((num_inputs, num_hiddens)))
        params.append(_weight((num_hiddens, num_hiddens)))
        params.append(_bias(num_hiddens))

    # Output layer parameters.
    params.append(_weight((num_hiddens, num_outputs)))
    params.append(_bias(num_outputs))

    # Allocate gradient storage for every parameter.
    for tensor in params:
        tensor.attach_grad()
    return params

### Define the Model

In [3]:
def init_gru_state(batch_size, num_hiddens, ctx):
    """Return the initial GRU state.

    Args:
        batch_size: Number of rows of the hidden-state array.
        num_hiddens: Number of columns of the hidden-state array.
        ctx: Device context on which the array is allocated.

    Returns:
        A one-element tuple holding an all-zero NDArray of shape
        ``(batch_size, num_hiddens)``.
    """
    initial_hidden = nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx)
    return (initial_hidden,)

In [4]:
def gru(inputs, state, params):
    """Run the GRU over a sequence of inputs.

    Args:
        inputs: Iterable of per-step input arrays
            (assumed to be (batch_size, num_inputs) each -- TODO confirm
            against the caller).
        state: One-element tuple holding the current hidden state.
        params: The 11 parameters in the order produced by ``get_params``.

    Returns:
        ``(outputs, (H,))`` where ``outputs`` is a list with one output array
        per step and ``H`` is the hidden state after the last step.
    """
    (W_xz, W_hz, b_z,
     W_xr, W_hr, b_r,
     W_xh, W_hh, b_h,
     W_hq, b_q) = params
    (hidden,) = state

    outputs = []
    for X in inputs:
        update_gate = nd.sigmoid(
            nd.dot(X, W_xz) + nd.dot(hidden, W_hz) + b_z)
        reset_gate = nd.sigmoid(
            nd.dot(X, W_xr) + nd.dot(hidden, W_hr) + b_r)
        # The reset gate scales the hidden-to-hidden product element-wise.
        # NOTE(review): some GRU formulations instead compute
        # dot(R * H, W_hh); confirm which variant is intended here.
        candidate = nd.tanh(
            nd.dot(X, W_xh) + reset_gate * nd.dot(hidden, W_hh) + b_h)
        # Blend the previous state with the candidate using the update gate.
        hidden = update_gate * hidden + (1 - update_gate) * candidate
        outputs.append(nd.dot(hidden, W_hq) + b_q)
    return outputs, (hidden,)

### Train the Model

In [5]:
# Training hyperparameters, passed to the d2l training helpers below.
num_epochs = 160
num_steps = 35
batch_size = 32
lr = 1e2
clipping_theta = 1e-2

# Text-generation settings: how often to sample, how many characters to
# generate, and the prompts to continue.
pred_period = 40
pred_len = 50
prefixes = ['traveller', 'time traveller']

In [6]:
# Train the from-scratch GRU defined above and periodically print sample
# continuations of `prefixes` (see the captured output below this cell).
# All arguments are positional, so their order must match the signature of
# d2l.train_and_predict_rnn exactly.
# NOTE(review): the literal False fills the slot just before num_epochs;
# presumably it selects sequential (non-random) minibatch sampling --
# confirm against the d2l helper.
d2l.train_and_predict_rnn(gru, get_params, init_gru_state, num_hiddens,
                         vocab_size, ctx, corpus_indices, idx_to_char,
                         char_to_idx, False, num_epochs, num_steps, lr,
                         clipping_theta, batch_size, pred_period, pred_len,
                         prefixes)

epoch 40, perplexity 7.740170, time 0.76 sec
 - traveller and the the the the the the the the the the the t
 - time traveller the the the the the the the the the the the the t
epoch 80, perplexity 3.347345, time 0.77 sec
 - traveller.  'but is all the preal existence, but you and wi
 - time traveller the presence be and we have in the persally of th
epoch 120, perplexity 1.350327, time 0.83 sec
 - traveller, way it lalk beenow we conventented, and why cann
 - time traveller, with a conili, and his facknon space pespli badk
epoch 160, perplexity 1.147413, time 0.77 sec
 - traveller. 'but now you begin to see the object of my.ink s
 - time traveller (for so it will be convenient to speak of him) wa


### Gluon Implementation

In [7]:
# Same experiment using Gluon's built-in GRU layer instead of the
# hand-written `gru` function.
# The layer is created with num_hiddens hidden units and wrapped in
# d2l.RNNModel together with vocab_size.
gru_layer = rnn.GRU(num_hiddens)
model = d2l.RNNModel(gru_layer, vocab_size)
# Reuses the hyperparameters defined for the from-scratch run; arguments are
# positional, so their order must match d2l.train_and_predict_rnn_gluon.
d2l.train_and_predict_rnn_gluon(model, num_hiddens, vocab_size, ctx,
                               corpus_indices, idx_to_char, char_to_idx,
                               num_epochs, num_steps, lr, clipping_theta,
                               batch_size, pred_period, pred_len, prefixes)

epoch 40, perplexity 7.779318, time 0.40 sec
 - traveller the the the the the the the the the the the the t
 - time traveller the the the the the the the the the the the the t
epoch 80, perplexity 3.565889, time 0.50 sec
 - traveller.  'it as movent of space, and the there is allous
 - time traveller.  'it as think the psychologist.  'you cand to th
epoch 120, perplexity 1.423461, time 0.41 sec
 - traveller.  'you can show black fory. but were young man.  
 - time traveller.  'you can show black fory. butteer, and there wa
epoch 160, perplexity 1.118035, time 0.43 sec
 - traveller. 'ftracines, and ary always beend, weat you at us
 - time traveller.  'you can show black is white by argument,' said
