# Layers number

By specifying the `num_layers` parameter in `torch.nn.RNN`, you can define how many times the recurrent layer will be applied to the input data.

In [1]:
import torch
from torch.nn import RNN

## Idea description

When you specify data like this, the computation procedure changes:

$$h^{l}_t = f(h_t^{l-1} [W_1^l]^T + b_1 + h_{t-1}^{l} [W^l_2]^T + b_2)$$

Consider differences from classical definition of the recurrent layer.

$h_t^{l}$ is a hidden state of the $t$-th element of sequence of the $l$-th layer. Obviously for the first layer it takes input values - $l=1, h_t^{l-1}=x_t$.

Also you there are sets of parameters for each layer $W_1^l, W_2^l, l \in \overline{1,L}$ where $L$ is a number of recurrent layers.

## Reproduce as set of `RNN`

In [9]:
sequence_len = 3
batch_size = 3
input_size = 2
hidden_size = 3
input = torch.randn(sequence_len, batch_size, input_size)

In [10]:
rnn = RNN(
    input_size=input_size, 
    hidden_size=hidden_size, 
    num_layers=2
)
print(rnn(input)[0])

tensor([[[-0.6123, -0.2152, -0.1725],
         [-0.7671, -0.3529, -0.3967],
         [-0.8459, -0.2376, -0.5091]],

        [[-0.8887, -0.0942, -0.7185],
         [-0.8719,  0.0387, -0.6241],
         [-0.8933,  0.0182, -0.5859]],

        [[-0.8192, -0.3412, -0.3195],
         [-0.9153, -0.0921, -0.4135],
         [-0.8775, -0.2551, -0.3753]]], grad_fn=<StackBackward0>)


In [11]:
rnn1 = RNN(input_size=input_size, hidden_size=hidden_size)
rnn1.weight_ih_l0 = rnn.weight_ih_l0
rnn1.bias_ih_l0 = rnn.bias_ih_l0
rnn1.weight_hh_l0 = rnn.weight_hh_l0
rnn1.bias_hh_l0 = rnn.bias_hh_l0

rnn2 = RNN(input_size=hidden_size, hidden_size=hidden_size)
rnn2.weight_ih_l0 = rnn.weight_ih_l1
rnn2.bias_ih_l0 = rnn.bias_ih_l1
rnn2.weight_hh_l0 = rnn.weight_hh_l1
rnn2.bias_hh_l0 = rnn.bias_hh_l1

print(rnn2(rnn1(input)[0])[0])

tensor([[[-0.6123, -0.2152, -0.1725],
         [-0.7671, -0.3529, -0.3967],
         [-0.8459, -0.2376, -0.5091]],

        [[-0.8887, -0.0942, -0.7185],
         [-0.8719,  0.0387, -0.6241],
         [-0.8933,  0.0182, -0.5859]],

        [[-0.8192, -0.3412, -0.3195],
         [-0.9153, -0.0921, -0.4135],
         [-0.8775, -0.2551, -0.3753]]], grad_fn=<StackBackward0>)


## Self implementation

In [None]:
def forward(
    x: torch.Tensor, 
    hidden_size: int, 
    weight_ih: list[torch.Tensor], 
    bias_ih: list[torch.Tensor], 
    weight_hh: list[torch.Tensor], 
    bias_hh: list[torch.Tensor], 
    num_layers=1
):
    seq_len, batch_size, _ = x.size()
    h_0 = torch.zeros(num_layers, batch_size, hidden_size)
    h_t_minus_1 = h_0
    h_t = h_0
    output = []
    for t in range(seq_len):
        print("="*80)
        display(Latex(f"Processing $x_{{{t}}}$ elemnt of sequence."))
        for layer in range(num_layers):
            print(layer)
            print(t)
            h_t[layer] = torch.tanh(
                x[t] @ weight_ih[layer].T
                + bias_ih[layer]
                + h_t_minus_1[layer] @ weight_hh[layer].T
                + bias_hh[layer]
            )
            display(Latex("$x_i$"))
        output.append(h_t[-1].clone())
        h_t_minus_1 = h_t
    output = torch.stack(output)
    return output, h_t

In [None]:
weight_hh = [rnn.weight_hh_l0, rnn.weight_hh_l1, rnn.weight_hh_l2]
weight_ih = [rnn.weight_ih_l0, rnn.weight_ih_l1, rnn.weight_ih_l2]
bias_hh = [rnn.bias_hh_l0, rnn.bias_hh_l1, rnn.bias_hh_l2]
bias_ih = [rnn.bias_ih_l0, rnn.bias_ih_l1, rnn.bias_ih_l2]

forward(
    x=input,
    hidden_size=rnn.hidden_size,
    weight_ih=weight_ih,
    bias_ih=bias_ih,
    weight_hh=weight_hh,
    bias_hh=bias_hh,
    num_layers=rnn.num_layers
)



<IPython.core.display.Latex object>

0
0


<IPython.core.display.Latex object>

1
0


RuntimeError: mat1 and mat2 shapes cannot be multiplied (10x2 and 10x10)