In [None]:
!pip install setuptools==65.5.0 "wheel<0.40.0"

In [None]:
!pip install d2l==1.0.0b0

### Long Short-Term Memory (LSTM)

In [None]:
import tensorflow as tf
from d2l import tensorflow as d2l

#### Implementation from Scratch

Now let's implement an LSTM from scratch.

**Initializing Model Parameters**

Next, we need to define and initialize the model parameters. As previously, the hyperparameter `num_hiddens` dictates the number of hidden units. We initialize weights following a Gaussian distribution with 0.01 standard deviation, and we set the biases to 0.

In [None]:
class LSTMScratch(d2l.Module):
    """LSTM layer written from scratch with one parameter triple per gate.

    Each of the input gate, forget gate, output gate and input node owns an
    input-to-hidden weight of shape (num_inputs, num_hiddens), a
    hidden-to-hidden weight of shape (num_hiddens, num_hiddens) and a bias of
    shape (num_hiddens,).

    Args:
        num_inputs: Size of the feature axis of each per-step input.
        num_hiddens: Number of hidden units.
        sigma: Scale applied to the standard-normal weight initialisation.
    """

    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        # Provided by d2l; presumably exposes the constructor arguments as
        # attributes (e.g. self.num_hiddens) — confirm against d2l.
        self.save_hyperparameters()

        def gaussian(*shape):
            # Trainable weight: standard-normal sample scaled by sigma.
            return tf.Variable(tf.random.normal(shape) * sigma)

        def gate_params():
            # One (W_x, W_h, b) triple; the bias starts at zero.
            w_input = gaussian(num_inputs, num_hiddens)
            w_hidden = gaussian(num_hiddens, num_hiddens)
            bias = tf.Variable(tf.zeros(num_hiddens))
            return w_input, w_hidden, bias

        # Input gate
        self.W_xi, self.W_hi, self.b_i = gate_params()
        # Forget gate
        self.W_xf, self.W_hf, self.b_f = gate_params()
        # Output gate
        self.W_xo, self.W_ho, self.b_o = gate_params()
        # Input node
        self.W_xc, self.W_hc, self.b_c = gate_params()

The actual model is defined as described above, consisting of three gates and an input node. Note that only the hidden state is passed to the output layer.

In [None]:
@d2l.add_to_class(LSTMScratch)
def forward(self, inputs, H_C=None):
    """Run the LSTM over `inputs`, consuming one slice of axis 0 per step.

    Args:
        inputs: Sequence iterated along its first axis; `inputs.shape[1]` is
            used as the batch size when no initial state is supplied.
        H_C: Optional `(H, C)` pair holding the initial hidden state and
            memory cell. Zeros of shape (batch_size, num_hiddens) are used
            when it is None.

    Returns:
        `(outputs, (H, C))` where `outputs` is a list with the hidden state
        of every step and `(H, C)` is the state after the last step.
    """
    def pre_activation(X, prev_hidden, W_x, W_h, b):
        # Affine map shared by all three gates and the input node.
        return tf.matmul(X, W_x) + tf.matmul(prev_hidden, W_h) + b

    if H_C is not None:
        hidden, cell = H_C
    else:
        # Initial state with shape: (batch_size, num_hiddens)
        state_shape = (inputs.shape[1], self.num_hiddens)
        hidden = tf.zeros(state_shape)
        cell = tf.zeros(state_shape)

    outputs = []
    for X in inputs:
        input_gate = tf.sigmoid(
            pre_activation(X, hidden, self.W_xi, self.W_hi, self.b_i))
        forget_gate = tf.sigmoid(
            pre_activation(X, hidden, self.W_xf, self.W_hf, self.b_f))
        output_gate = tf.sigmoid(
            pre_activation(X, hidden, self.W_xo, self.W_ho, self.b_o))
        candidate = tf.tanh(
            pre_activation(X, hidden, self.W_xc, self.W_hc, self.b_c))
        # Keep part of the old cell content and blend in the new candidate.
        cell = forget_gate * cell + input_gate * candidate
        # Only the gated, squashed cell content is exposed as hidden state.
        hidden = output_gate * tf.tanh(cell)
        outputs.append(hidden)
    return outputs, (hidden, cell)

**Training and Prediction**

Let’s train an LSTM model by wrapping our `LSTMScratch` layer in the `RNNLMScratch` language-model class.

In [None]:
# Dataset and the vocabulary size shared by the layer and the language model.
data = d2l.TimeMachine(batch_size=1024, num_steps=32)
vocab_size = len(data.vocab)

# Build the scratch LSTM and its language-model wrapper inside the device
# scope returned by d2l.try_gpu().
with d2l.try_gpu():
    lstm = LSTMScratch(num_inputs=vocab_size, num_hiddens=32)
    model = d2l.RNNLMScratch(lstm, vocab_size=vocab_size, lr=4)

# Train with gradient clipping enabled.
trainer = d2l.Trainer(max_epochs=50, gradient_clip_val=1)
trainer.fit(model, data)

#### Concise Implementation

Using high-level APIs, we can directly instantiate an LSTM model. This encapsulates all the configuration details that we made explicit above. The code is significantly faster as it uses compiled operators rather than Python for many details that we spelled out before.

In [None]:
class LSTM(d2l.RNN):
    """LSTM layer backed by `tf.keras.layers.LSTM`.

    The Keras layer is configured to be time-major and to return both the
    full output sequence and its final states.

    Args:
        num_hiddens: Number of hidden units of the Keras LSTM layer.
    """

    def __init__(self, num_hiddens):
        # Initialise through d2l.Module directly instead of d2l.RNN's own
        # constructor; the recurrent layer is created below.
        d2l.Module.__init__(self)
        self.save_hyperparameters()
        keras_lstm = tf.keras.layers.LSTM(
            num_hiddens,
            return_sequences=True,
            return_state=True,
            time_major=True,
        )
        self.rnn = keras_lstm

    def forward(self, inputs, H_C=None):
        """Apply the Keras LSTM.

        Args:
            inputs: Input sequence handed to the Keras layer.
            H_C: Optional initial state passed through to the Keras layer.

        Returns:
            `(outputs, states)` where `states` is a list holding every value
            the Keras layer returned after the output sequence.
        """
        layer_result = self.rnn(inputs, H_C)
        # First element is the output sequence; the rest are the states.
        outputs, *final_states = layer_result
        return outputs, final_states

# Train the Keras-backed LSTM. This cell reuses `data` and `trainer` from the
# from-scratch training cell earlier in the notebook.
lstm = LSTM(num_hiddens=32)
with d2l.try_gpu():
    # Wrap the layer in the concise language model inside the device scope.
    model = d2l.RNNLM(lstm, vocab_size=len(data.vocab), lr=4)
trainer.fit(model, data)

In [None]:
# Generate text from the prefix 'it has' using the dataset vocabulary.
# NOTE(review): 20 is presumably the number of tokens to generate — confirm
# against d2l's `predict` signature.
model.predict('it has', 20, data.vocab)

### Gated Recurrent Units (GRU)

In [None]:
import tensorflow as tf
from d2l import tensorflow as d2l

#### Implementation from Scratch

**Initializing Model Parameters**

The first step is to initialize the model parameters. We draw the weights from a Gaussian distribution with standard deviation `sigma` and set the biases to 0. The hyperparameter `num_hiddens` defines the number of hidden units. We instantiate all weights and biases relating to the update gate, the reset gate, and the candidate hidden state.

In [None]:
class GRUScratch(d2l.Module):
    """GRU layer written from scratch with one parameter triple per component.

    The update gate, reset gate and candidate hidden state each own an
    input-to-hidden weight of shape (num_inputs, num_hiddens), a
    hidden-to-hidden weight of shape (num_hiddens, num_hiddens) and a bias of
    shape (num_hiddens,).

    Args:
        num_inputs: Size of the feature axis of each per-step input.
        num_hiddens: Number of hidden units.
        sigma: Scale applied to the standard-normal weight initialisation.
    """

    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        # Provided by d2l; presumably exposes the constructor arguments as
        # attributes (e.g. self.num_hiddens) — confirm against d2l.
        self.save_hyperparameters()

        def gaussian(*shape):
            # Trainable weight: standard-normal sample scaled by sigma.
            return tf.Variable(tf.random.normal(shape) * sigma)

        def component_params():
            # One (W_x, W_h, b) triple; the bias starts at zero.
            w_input = gaussian(num_inputs, num_hiddens)
            w_hidden = gaussian(num_hiddens, num_hiddens)
            bias = tf.Variable(tf.zeros(num_hiddens))
            return w_input, w_hidden, bias

        # Update gate
        self.W_xz, self.W_hz, self.b_z = component_params()
        # Reset gate
        self.W_xr, self.W_hr, self.b_r = component_params()
        # Candidate hidden state
        self.W_xh, self.W_hh, self.b_h = component_params()

**Defining the Model**

Now we are ready to define the GRU forward computation. Its structure is the same as that of the basic RNN cell, except that the update equations are more complex.

In [None]:
@d2l.add_to_class(GRUScratch)
def forward(self, inputs, H=None):
    """Run the GRU over `inputs`, consuming one slice of axis 0 per step.

    Args:
        inputs: Sequence iterated along its first axis; `inputs.shape[1]` is
            used as the batch size when no initial state is supplied.
        H: Optional initial hidden state. Zeros of shape
            (batch_size, num_hiddens) are used when it is None.

    Returns:
        `(outputs, H)` where `outputs` is a list with the hidden state of
        every step and `H` is the hidden state after the last step.
    """
    def pre_activation(X, prev_hidden, W_x, W_h, b):
        # Affine map shared by both gates and the candidate state.
        return tf.matmul(X, W_x) + tf.matmul(prev_hidden, W_h) + b

    if H is None:
        # Initial state with shape: (batch_size, num_hiddens)
        H = tf.zeros((inputs.shape[1], self.num_hiddens))

    outputs = []
    for X in inputs:
        update_gate = tf.sigmoid(
            pre_activation(X, H, self.W_xz, self.W_hz, self.b_z))
        reset_gate = tf.sigmoid(
            pre_activation(X, H, self.W_xr, self.W_hr, self.b_r))
        # The reset gate scales the previous state before it enters the
        # candidate computation.
        candidate = tf.tanh(
            pre_activation(X, reset_gate * H, self.W_xh, self.W_hh, self.b_h))
        # Interpolate between the old state and the candidate.
        H = update_gate * H + (1 - update_gate) * candidate
        outputs.append(H)
    return outputs, H

**Training**

We now train a language model on The Time Machine dataset using the GRU defined above.

In [None]:
# Dataset and the vocabulary size shared by the layer and the language model.
data = d2l.TimeMachine(batch_size=1024, num_steps=32)
vocab_size = len(data.vocab)

# Build the scratch GRU and its language-model wrapper inside the device
# scope returned by d2l.try_gpu().
with d2l.try_gpu():
    gru = GRUScratch(num_inputs=vocab_size, num_hiddens=32)
    model = d2l.RNNLMScratch(gru, vocab_size=vocab_size, lr=4)

# Train with gradient clipping enabled.
trainer = d2l.Trainer(max_epochs=50, gradient_clip_val=1)
trainer.fit(model, data)

#### Concise Implementation

In high-level APIs, we can directly instantiate a GRU model. This encapsulates all the configuration detail that we made explicit above.

In [None]:
class GRU(d2l.RNN):
    """Single-layer GRU backed by `tf.keras.layers.GRU`.

    The Keras layer is configured to be time-major so that it agrees with the
    other recurrent layers in this notebook: the from-scratch implementations
    iterate over axis 0 as the time axis, and the Keras-backed `LSTM` and
    multi-layer `GRU` both pass `time_major=True`. Without the flag Keras
    would interpret axis 0 as the batch axis and recur over the wrong
    dimension.

    Args:
        num_inputs: Accepted for signature compatibility with the scratch
            version; it is not passed to the Keras layer.
        num_hiddens: Number of hidden units of the Keras GRU layer.
    """

    def __init__(self, num_inputs, num_hiddens):
        # Initialise through d2l.Module directly instead of d2l.RNN's own
        # constructor; the recurrent layer is created below.
        d2l.Module.__init__(self)
        self.save_hyperparameters()
        self.rnn = tf.keras.layers.GRU(num_hiddens, return_sequences=True,
                                       return_state=True, time_major=True)

The code is significantly faster in training as it uses compiled operators rather than Python.

In [None]:
# Train the Keras-backed GRU. This cell reuses `data` and `trainer` from the
# from-scratch GRU training cell earlier in the notebook.
gru = GRU(num_inputs=len(data.vocab), num_hiddens=32)
with d2l.try_gpu():
    # Wrap the layer in the concise language model inside the device scope.
    model = d2l.RNNLM(gru, vocab_size=len(data.vocab), lr=4)
trainer.fit(model, data)

After training, we print out the perplexity on the training set and the predicted sequence following the provided prefix.

In [None]:
# Generate text from the prefix 'it has' using the dataset vocabulary.
# NOTE(review): 20 is presumably the number of tokens to generate — confirm
# against d2l's `predict` signature.
model.predict('it has', 20, data.vocab)

### Deep Recurrent Neural Networks

In [None]:
import tensorflow as tf
from d2l import tensorflow as d2l

#### Implementation from Scratch

To implement a multi-layer RNN from scratch, we can treat each layer as an `RNNScratch` instance with its own learnable parameters.

In [None]:
class StackedRNNScratch(d2l.Module):
    """Multi-layer RNN assembled from independent `d2l.RNNScratch` layers.

    The first layer takes `num_inputs` features; every later layer takes the
    `num_hiddens` features produced by the layer below it.

    Args:
        num_inputs: Feature size fed to the first layer.
        num_hiddens: Number of hidden units of every layer.
        num_layers: Number of stacked layers.
        sigma: Initialisation scale forwarded to each `d2l.RNNScratch`.
    """

    def __init__(self, num_inputs, num_hiddens, num_layers, sigma=0.01):
        super().__init__()
        # Provided by d2l; presumably exposes the constructor arguments as
        # attributes (e.g. self.num_layers) — confirm against d2l.
        self.save_hyperparameters()

        layers = []
        in_features = num_inputs
        for _ in range(num_layers):
            layers.append(d2l.RNNScratch(in_features, num_hiddens, sigma))
            # Every layer after the first consumes the previous hidden size.
            in_features = num_hiddens
        self.rnns = layers

The multi-layer forward computation simply performs forward computation layer by layer.

In [None]:
@d2l.add_to_class(StackedRNNScratch)
def forward(self, inputs, Hs=None):
    """Run the stacked layers bottom-up, feeding each layer's outputs upward.

    Args:
        inputs: Input sequence handed to the first layer.
        Hs: Optional sequence with one initial state per layer. When None,
            every layer starts from its own default state.

    Returns:
        `(outputs, Hs)` where `outputs` is the top layer's per-step outputs
        stacked along a new axis 0 and `Hs` is a list holding the final state
        of every layer.
    """
    # Work on a private list: the caller's sequence is never mutated and
    # immutable sequences such as tuples are accepted as well.
    Hs = [None] * self.num_layers if Hs is None else list(Hs)
    outputs = inputs
    for i, rnn in enumerate(self.rnns):
        outputs, Hs[i] = rnn(outputs, Hs[i])
        # Each layer returns per-step outputs; stack them back into a single
        # tensor before handing them to the next layer.
        outputs = tf.stack(outputs, 0)
    return outputs, Hs

As an example, we train a deep RNN model, built by stacking `RNNScratch` layers with `StackedRNNScratch`, on The Time Machine dataset. To keep things simple we set the number of layers to 2.

In [None]:
# Dataset and the vocabulary size shared by the layer and the language model.
data = d2l.TimeMachine(batch_size=1024, num_steps=32)
vocab_size = len(data.vocab)

# Build the two-layer scratch RNN and its language-model wrapper inside the
# device scope returned by d2l.try_gpu().
with d2l.try_gpu():
    rnn_block = StackedRNNScratch(
        num_inputs=vocab_size, num_hiddens=32, num_layers=2)
    model = d2l.RNNLMScratch(rnn_block, vocab_size=vocab_size, lr=2)

# Train with gradient clipping enabled.
trainer = d2l.Trainer(max_epochs=100, gradient_clip_val=1)
trainer.fit(model, data)

#### Concise Implementation

Fortunately many of the logistical details required to implement multiple layers of an RNN are readily available in high-level APIs. Our concise implementation will use such built-in functionalities. The code allows specification of the number of layers explicitly rather than picking the default of a single layer.

In [None]:
class GRU(d2l.RNN):
    """The multi-layer GRU model.

    A list of `tf.keras.layers.GRUCell` objects is wrapped in a single
    time-major `tf.keras.layers.RNN` that returns both the output sequence
    and the final states.

    Args:
        num_hiddens: Number of hidden units of every GRU cell.
        num_layers: Number of stacked GRU cells.
        dropout: Dropout value forwarded to every GRU cell.
    """

    def __init__(self, num_hiddens, num_layers, dropout=0):
        # Initialise through d2l.Module directly instead of d2l.RNN's own
        # constructor; the recurrent layer is created below.
        d2l.Module.__init__(self)
        self.save_hyperparameters()
        stacked_cells = []
        for _ in range(num_layers):
            stacked_cells.append(
                tf.keras.layers.GRUCell(num_hiddens, dropout=dropout))
        self.rnn = tf.keras.layers.RNN(
            stacked_cells,
            return_sequences=True,
            return_state=True,
            time_major=True,
        )

    def forward(self, X, state=None):
        """Apply the stacked GRU.

        Args:
            X: Input sequence handed to the Keras RNN layer.
            state: Optional initial state passed through to the Keras layer.

        Returns:
            `(outputs, state)` where `state` is a list holding every value
            the Keras layer returned after the output sequence.
        """
        layer_result = self.rnn(X, state)
        # First element is the output sequence; the rest are the states.
        outputs, *final_states = layer_result
        return outputs, final_states

The architectural decisions such as choosing hyperparameters are very similar to those of GRU. We pick the same number of inputs and outputs as we have distinct tokens, i.e., `vocab_size`. The number of hidden units is still 32. The only difference is that we now select a nontrivial number of hidden layers by specifying the value of `num_layers`.

In [None]:
# Train the two-layer Keras-backed GRU. This cell reuses `data` and `trainer`
# from the stacked from-scratch training cell earlier in the notebook.
gru = GRU(num_hiddens=32, num_layers=2)
with d2l.try_gpu():
    # Wrap the layer in the concise language model inside the device scope.
    model = d2l.RNNLM(gru, vocab_size=len(data.vocab), lr=2)
trainer.fit(model, data)

In [None]:
# Generate text from the prefix 'it has' using the dataset vocabulary.
# NOTE(review): 20 is presumably the number of tokens to generate — confirm
# against d2l's `predict` signature.
model.predict('it has', 20, data.vocab)

### Bidirectional Recurrent Neural Networks

In [None]:
import tensorflow as tf
from d2l import tensorflow as d2l

#### Implementation from Scratch

To implement a bidirectional RNN from scratch, we can include two unidirectional `RNNScratch` instances with separate learnable parameters.

In [None]:
class BiRNNScratch(d2l.Module):
    """Bidirectional RNN built from two independent `d2l.RNNScratch` layers.

    One layer reads the sequence forward, the other backward; each has its
    own parameters.

    Args:
        num_inputs: Feature size fed to both directional layers.
        num_hiddens: Number of hidden units of each directional layer.
        sigma: Initialisation scale forwarded to each `d2l.RNNScratch`.
    """

    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        # Provided by d2l; presumably exposes the constructor arguments as
        # attributes (self.num_hiddens is read below) — confirm against d2l.
        self.save_hyperparameters()

        def make_direction():
            # Fresh unidirectional layer with its own parameters.
            return d2l.RNNScratch(num_inputs, num_hiddens, sigma)

        self.f_rnn = make_direction()
        self.b_rnn = make_direction()
        # The output dimension will be doubled
        self.num_hiddens = self.num_hiddens * 2

States of forward and backward RNNs are updated separately, while outputs of these two RNNs are concatenated.

In [None]:
@d2l.add_to_class(BiRNNScratch)
def forward(self, inputs, Hs=None):
    """Run both directions and concatenate their per-step outputs.

    Args:
        inputs: Input tensor whose axis 0 is iterated step by step.
        Hs: Optional `(f_H, b_H)` pair with the initial states of the forward
            and backward layers. Both default to None.

    Returns:
        `(outputs, (f_H, b_H))` where `outputs` is a list with one tensor per
        step (forward and backward outputs joined on the last axis) and
        `(f_H, b_H)` are the final states of the two layers.
    """
    f_H, b_H = Hs if Hs is not None else (None, None)
    f_outputs, f_H = self.f_rnn(inputs, f_H)
    # Reverse along axis 0 with tf.reverse so the backward layer still gets a
    # tensor. The built-in reversed() returns a plain iterator without a
    # `.shape`, which the scratch layers in this notebook read
    # (`inputs.shape[1]`) to build their default state; d2l.RNNScratch
    # presumably does the same — confirm against d2l.
    b_inputs = tf.reverse(inputs, axis=[0])
    b_outputs, b_H = self.b_rnn(b_inputs, b_H)
    # Flip the backward outputs back into forward order before pairing them
    # with the forward outputs.
    outputs = [tf.concat((f, b), -1) for f, b in zip(
        f_outputs, reversed(b_outputs))]
    return outputs, (f_H, b_H)