# Sequential Model (RNN, LSTM)

***Dapeng Shang, BU Questrom***

### RNN

<div style="text-align: center">
    <img src="./Asset/p10.png" width="600" height="200">
</div>

$$ 
\begin{align*}
&h = 0 \\
&\text{for } x \text{ in } X: \\
&\quad h = \text{linear}(x, h) \\
&\quad h = \tanh(h)
\end{align*}
$$

##### RNN Cell

<div style="text-align: center">
    <img src="./Asset/p11.png" width="600" height="200">
</div>

$$ h_t = \tanh(W_{ih}x_t + b_{ih} + W_{hh}h_{t-1} + b_{hh}) $$

```python
cell = torch.nn.RNNCell(input_size, hidden_size)
hidden = cell(input, hidden)
```
**Input** :
input of shape (batch, input_size); hidden of shape (batch, hidden_size)

**Output** : hidden of shape (batch, hidden_size)

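**dataset.shape = (seq_len, batch, input_size)**: the sequence dimension comes first, so iterating over the dataset yields one (batch, input_size) input per time step.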

In [None]:
import torch

# Walk a single RNNCell over a random sequence, one time step at a time.
batch_size = 1
seq_len = 3
input_size = 4
hidden_size = 2

cell = torch.nn.RNNCell(input_size, hidden_size)

# Sequence-first layout (seq_len, batch_size, input_size): iterating over the
# first axis yields one (batch_size, input_size) slice per time step.
dataset = torch.randn(seq_len, batch_size, input_size)
# h_0: the hidden state before any input has been seen.
hidden = torch.zeros(batch_size, hidden_size)

# The loop variable is named x_t rather than `input` so that the builtin
# input() is not overwritten in the notebook's global namespace.
for idx, x_t in enumerate(dataset):
    print('='*20, idx, '='*20)
    print('Input size:', x_t.shape)

    # One recurrence step: the previous hidden state is fed back into the cell.
    hidden = cell(x_t, hidden)

    print('hidden size:', hidden.shape)
    print(hidden)

Input size: torch.Size([1, 4])
hidden size: torch.Size([1, 2])
tensor([[-0.1594,  0.7914]], grad_fn=<TanhBackward0>)
Input size: torch.Size([1, 4])
hidden size: torch.Size([1, 2])
tensor([[-0.8924,  0.0662]], grad_fn=<TanhBackward0>)
Input size: torch.Size([1, 4])
hidden size: torch.Size([1, 2])
tensor([[-0.8680,  0.1673]], grad_fn=<TanhBackward0>)


#### Using RNN

<div style="text-align: center">
    <img src="./Asset/p12.png" width="600" height="200">
</div>

```python
cell = torch.nn.RNN(input_size, hidden_size, num_layers)
out, hidden = cell(input, hidden)
```
**Input** :
input of shape (seqSize, batch, input_size); hidden of shape (numLayers, batch, hidden_size)

**Output** : output of shape (seqSize, batch, hidden_size); hidden of shape (numLayers, batch, hidden_size)

**numLayers**
<div style="text-align: center">
    <img src="./Asset/p13.png" width="600" height="200">
</div>

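**batch_first**: if True, the input tensor is provided as **(batch, seqSize, input_size)** and the output tensor is returned as **(batch, seqSize, hidden_size)**; the hidden state keeps its (numLayers, batch, hidden_size) layout.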

In [None]:
# Same recurrence as the RNNCell demo, but torch.nn.RNN consumes the whole
# sequence in a single call and can stack several layers.
batch_size = 1
seq_len = 3
input_size = 4
hidden_size = 2
num_layers = 2

cell = torch.nn.RNN(input_size=input_size,
                    hidden_size=hidden_size,
                    num_layers=num_layers)

# inputs: (seq_len, batch_size, input_size)
inputs = torch.randn(seq_len, batch_size, input_size)
# h_0: one zero vector per layer -> (num_layers, batch_size, hidden_size)
hidden = torch.zeros(num_layers, batch_size, hidden_size)

out, hidden = cell(inputs, hidden)

# out:    (seq_len, batch_size, hidden_size)
# hidden: (num_layers, batch_size, hidden_size)
print('Output size:', out.shape)
print('Output:', out)
print('Hidden size:', hidden.shape)
print('Hidden:', hidden)

Output size: torch.Size([3, 1, 2])
Output: tensor([[[-0.6445, -0.6960]],

        [[ 0.2711,  0.0168]],

        [[-0.6503, -0.5252]]], grad_fn=<StackBackward0>)
Hidden size: torch.Size([2, 1, 2])
Hidden: tensor([[[ 0.2277,  0.7334]],

        [[-0.6503, -0.5252]]], grad_fn=<StackBackward0>)


#### Task: "hello" $\to$ "ohlol"

##### Using RNN Cell
<div style="text-align: center">
    <img src="./Asset/p14.png" width="600" height="250">
</div>

<div style="text-align: center">
    <img src="./Asset/p15.png" width="600" height="250">
</div>

input_size = output size = 4: the vocabulary has 4 distinct letters (e, h, l, o), and each letter is one-hot encoded as a 4-dimensional vector.

In [None]:
import torch

input_size = 4
hidden_size = 4
batch_size = 1

# Vocabulary: a character's position in this list is its class index.
idx2char = ['e', 'h', 'l', 'o']
x_data = [1, 0, 2, 2, 3]  # hello
y_data = [3, 1, 2, 3, 2]  # ohlol

# Row i is the one-hot vector for class i (a 4x4 identity matrix).
one_hot_lookup = [[int(row == col) for col in range(len(idx2char))]
                  for row in range(len(idx2char))]
# Encode the input sequence as one one-hot row per character.
x_one_hot = [one_hot_lookup[char_idx] for char_idx in x_data]

# inputs: float tensor of shape (seq_len, batch_size, input_size)
inputs = torch.tensor(x_one_hot, dtype=torch.float32).view(-1, batch_size, input_size)
# labels: int64 tensor of shape (seq_len, 1), one target class per time step
labels = torch.tensor(y_data, dtype=torch.long).view(-1, 1)

In [None]:
class Model(torch.nn.Module):
    """Thin wrapper around a single ``torch.nn.RNNCell``.

    The cell advances one time step per call, so the caller owns the loop
    over the sequence and threads the hidden state through it.
    """

    def __init__(self, input_size, hidden_size, batch_size):
        super().__init__()
        # batch_size is stored only so init_hidden() can size h_0.
        self.batch_size = batch_size
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.rnncell = torch.nn.RNNCell(self.input_size, self.hidden_size)

    def forward(self, input, hidden):
        """Run one recurrence step and return the new hidden state."""
        return self.rnncell(input, hidden)

    def init_hidden(self):
        """Return the initial hidden state h_0: zeros of shape (batch_size, hidden_size)."""
        return torch.zeros(self.batch_size, self.hidden_size)
    
# Build the RNNCell model from the hyperparameters defined with the data.
model = Model(input_size=input_size,
              hidden_size=hidden_size,
              batch_size=batch_size)

In [None]:
# Loss over class scores vs. integer class labels, and the optimizer that
# updates every parameter registered on the model.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.1)

In [None]:
# Train on the single "hello" -> "ohlol" example, stepping the cell manually.
# num_epochs is the single source of truth for both the loop bound and the
# progress message, so the two cannot drift apart.
num_epochs = 15

for epoch in range(num_epochs):
    loss = 0
    optimizer.zero_grad()
    hidden = model.init_hidden()

    print('Predicted string:', end='')
    # inputs is (seq_len, batch_size, input_size), so zip() pairs each
    # (batch_size, input_size) time-step slice with that step's label.
    # The step variable is x_t, not `input`, so the builtin input() survives.
    for x_t, label in zip(inputs, labels):
        hidden = model(x_t, hidden)
        # Sum the per-step losses so that a single backward() call
        # propagates through every time step of the sequence.
        loss += criterion(hidden, label)
        # The hidden state is used directly as the class scores:
        # the index of its largest entry is the predicted character.
        _, idx = hidden.max(dim=1)
        print(idx2char[idx.item()], end='')
    loss.backward()
    optimizer.step()
    print(', Epoch [%d/%d] loss=%.4f' % (epoch+1, num_epochs, loss.item()))
    

Predicted string:lllll, Epoch [1/15] loss=6.9622
Predicted string:lllll, Epoch [2/15] loss=5.9980
Predicted string:lhlll, Epoch [3/15] loss=5.1600
Predicted string:lhlol, Epoch [4/15] loss=4.3704
Predicted string:lhlol, Epoch [5/15] loss=3.8832
Predicted string:ohlol, Epoch [6/15] loss=3.5548
Predicted string:ohlol, Epoch [7/15] loss=3.2252
Predicted string:ohlol, Epoch [8/15] loss=2.9380
Predicted string:ohlol, Epoch [9/15] loss=2.7180
Predicted string:ohlol, Epoch [10/15] loss=2.5496
Predicted string:ohlol, Epoch [11/15] loss=2.4173
Predicted string:ohlol, Epoch [12/15] loss=2.3090
Predicted string:ohlol, Epoch [13/15] loss=2.2165
Predicted string:ohlol, Epoch [14/15] loss=2.1375
Predicted string:ohlol, Epoch [15/15] loss=2.0754


##### Using RNN

In [None]:
import torch 

input_size = 4
hidden_size = 4
num_layers = 1
batch_size = 1
seq_len = 5


# Vocabulary: a character's position in this list is its class index.
idx2char = ['e', 'h', 'l', 'o']
x_data = [1, 0, 2, 2, 3]  # hello
y_data = [3, 1, 2, 3, 2]  # ohlol

# Row i is the one-hot vector for class i (a 4x4 identity matrix).
one_hot_lookup = [[int(row == col) for col in range(len(idx2char))]
                  for row in range(len(idx2char))]
# Encode the input sequence as one one-hot row per character.
x_one_hot = [one_hot_lookup[char_idx] for char_idx in x_data]

# inputs: float tensor of shape (seq_len, batch_size, input_size)
inputs = torch.tensor(x_one_hot, dtype=torch.float32).view(seq_len, batch_size, input_size)
# labels: flat int64 vector with one class index per time step
labels = torch.tensor(y_data, dtype=torch.long)

In [None]:
class Model(torch.nn.Module):
    """``torch.nn.RNN`` wrapper that returns one score row per time step.

    Args:
        input_size: number of features in each time-step input.
        hidden_size: width of the hidden state; the RNN output is returned
            as-is, so this is also the width of each returned row.
        batch_size: stored for backward compatibility only; ``forward``
            takes the batch size from its input.
        num_layers: number of stacked recurrent layers. This is an explicit
            argument (default 1) instead of a value read from a global, so
            the class no longer depends on notebook state.
    """

    def __init__(self, input_size, hidden_size, batch_size, num_layers=1):
        super().__init__()
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.rnn = torch.nn.RNN(input_size=self.input_size,
                                hidden_size=self.hidden_size,
                                num_layers=self.num_layers)

    def forward(self, input):
        """Run the RNN over a whole sequence.

        Args:
            input: tensor of shape (seq_len, batch, input_size).

        Returns:
            Tensor of shape (seq_len * batch, hidden_size), flattened so each
            row lines up with one entry of a flat label vector.
        """
        # Initial hidden state h_0. It is sized from the actual input so any
        # batch size works, and created on the input's device and dtype.
        h_0 = torch.zeros(self.num_layers, input.size(1), self.hidden_size,
                          dtype=input.dtype, device=input.device)
        # Only the per-step outputs are needed; the final hidden state is dropped.
        out, _ = self.rnn(input, h_0)
        return out.view(-1, self.hidden_size)
    
# Build the RNN model from the hyperparameters defined with the data.
model = Model(input_size=input_size,
              hidden_size=hidden_size,
              batch_size=batch_size)

In [None]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

# num_epochs is the single source of truth for both the loop bound and the
# progress message, so the two cannot drift apart.
num_epochs = 15

for epoch in range(num_epochs):
    optimizer.zero_grad()
    # inputs: (seq_len, batch, input_size)
    # outputs: (seq_len * batch, hidden_size), already flattened by the model
    outputs = model(inputs)
    # labels: flat vector of class indices, one per row of outputs
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # The index of the largest score in each row is the predicted character.
    _, idx = outputs.max(dim=1)
    # The index tensor carries no gradient, so the legacy `.data` accessor
    # is unnecessary before converting to NumPy.
    idx = idx.numpy()
    print('Predicted:', ''.join([idx2char[x] for x in idx]), end='')
    print(', Epoch [%d/%d] loss=%.4f' % (epoch+1, num_epochs, loss.item()))

Predicted: olooe, Epoch [1/15] loss=1.3778
Predicted: olool, Epoch [2/15] loss=1.1973
Predicted: ollll, Epoch [3/15] loss=1.0821
Predicted: ollll, Epoch [4/15] loss=0.9964
Predicted: ololl, Epoch [5/15] loss=0.9237
Predicted: oholl, Epoch [6/15] loss=0.8513
Predicted: oholl, Epoch [7/15] loss=0.7763
Predicted: oholl, Epoch [8/15] loss=0.7238
Predicted: ohool, Epoch [9/15] loss=0.6927
Predicted: ohool, Epoch [10/15] loss=0.6604
Predicted: ohool, Epoch [11/15] loss=0.6342
Predicted: ohool, Epoch [12/15] loss=0.6163
Predicted: ohool, Epoch [13/15] loss=0.6030
Predicted: ohool, Epoch [14/15] loss=0.5916
Predicted: ohool, Epoch [15/15] loss=0.5813


In the examples above, we used **one-hot encoding**, which has three drawbacks:
1. high-dim
2. Sparse
3. Hardcoded

To address this, we map each one-hot vector to a low-dimensional dense vector (a form of dimensionality reduction); this mapping is called an **embedding**. Its properties are:
1. Low-dim
2. Dense
3. Learn from Data


##### Using Embedding

<div style="text-align: center">
    <img src="./Asset/p16.png" width="600" height="300">
</div>

```python
embedding = torch.nn.Embedding(num_embeddings, embedding_dim)
# num_embeddings: size of the dictionary of embeddings
# embedding_dim: the size of each embedding vector
```
**Input** : LongTensor of arbitrary shape containing the indices to extract

**Output** : (*, embedding_dim), where * is the input shape


In [None]:
num_class = 4  # output width of the final Linear layer in the Model below
input_size = 4  # passed to Embedding as the dictionary size in the Model below
hidden_size = 8  # hidden-state width of the RNN
embedding_size = 10  # length of each embedding vector; also the RNN's input width
num_layers = 2  # number of stacked RNN layers
batch_size = 1  # NOTE(review): the Model below takes the batch size from x.size(0), not from this
seq_len = 5  # NOTE(review): not read by the Model below

In [None]:
class Model(torch.nn.Module):
    """Embedding -> multi-layer RNN -> Linear classifier over characters.

    All layer sizes are read from the module-level hyperparameters
    (input_size, embedding_size, hidden_size, num_layers, num_class).
    """

    def __init__(self):
        super().__init__()
        # Lookup table: character index -> dense vector of length embedding_size.
        self.emb = torch.nn.Embedding(num_embeddings=input_size,
                                      embedding_dim=embedding_size)
        # batch_first=True: input and output are laid out (batch, seq_len, features).
        self.rnn = torch.nn.RNN(embedding_size, hidden_size, num_layers,
                                batch_first=True)
        # Projects each hidden state to one score per class.
        self.fc = torch.nn.Linear(in_features=hidden_size,
                                  out_features=num_class)

    def forward(self, x):
        """Score every time step of ``x``.

        ``x`` holds character indices with shape (batch_size, seq_len).
        Returns scores of shape (batch_size * seq_len, num_class).
        """
        # h_0 is (num_layers, batch_size, hidden_size) even with batch_first=True.
        h_0 = torch.zeros(num_layers, x.size(0), hidden_size)
        embedded = self.emb(x)                # (batch_size, seq_len, embedding_size)
        rnn_out, _ = self.rnn(embedded, h_0)  # (batch_size, seq_len, hidden_size)
        scores = self.fc(rnn_out)             # (batch_size, seq_len, num_class)
        return scores.view(-1, num_class)


model = Model()

In [None]:
# Vocabulary: a character's position in this list is its class index.
idx2char = ['e', 'h', 'l', 'o']
x_data = [[1, 0, 2, 2, 3]]  # "hello", laid out as (batch, seq_len)
y_data = [3, 1, 2, 3, 2]  # "ohlol", laid out as (batch*seq_len)

# Both tensors hold integer indices (int64): the embedding looks rows up by
# index, and the loss takes class indices as targets.
inputs = torch.tensor(x_data, dtype=torch.long)
labels = torch.tensor(y_data, dtype=torch.long)

In [None]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

# num_epochs is the single source of truth for both the loop bound and the
# progress message, so the two cannot drift apart.
num_epochs = 15

for epoch in range(num_epochs):
    optimizer.zero_grad()
    # One row of class scores per (batch, time-step) position.
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # The index of the largest score in each row is the predicted character.
    _, idx = outputs.max(dim=1)
    # The index tensor carries no gradient, so the legacy `.data` accessor
    # is unnecessary before converting to NumPy.
    idx = idx.numpy()
    print('Predicted:', ''.join([idx2char[x] for x in idx]), end='')
    print(', Epoch [%d/%d] loss=%.4f' % (epoch+1, num_epochs, loss.item()))

Predicted: heeee, Epoch [1/15] loss=1.4693
Predicted: oolol, Epoch [2/15] loss=1.1082
Predicted: oolol, Epoch [3/15] loss=0.6984
Predicted: ohlol, Epoch [4/15] loss=0.4194
Predicted: ohlol, Epoch [5/15] loss=0.2707
Predicted: ohlol, Epoch [6/15] loss=0.1528
Predicted: ohlol, Epoch [7/15] loss=0.0902
Predicted: ohlol, Epoch [8/15] loss=0.0540
Predicted: ohlol, Epoch [9/15] loss=0.0337
Predicted: ohlol, Epoch [10/15] loss=0.0217
Predicted: ohlol, Epoch [11/15] loss=0.0142
Predicted: ohlol, Epoch [12/15] loss=0.0096
Predicted: ohlol, Epoch [13/15] loss=0.0067
Predicted: ohlol, Epoch [14/15] loss=0.0048
Predicted: ohlol, Epoch [15/15] loss=0.0036


### LSTM

<div style="text-align: center">
    <img src="./Asset/p17.png" width="600" height="300">
</div>

$$
\begin{align*}
&i_t = \sigma(W_{ii}x_t + b_{ii} + W_{hi}h_{t-1} + b_{hi}) \\
&f_t = \sigma(W_{if}x_t + b_{if} + W_{hf}h_{t-1} + b_{hf}) \\
&g_t = \tanh(W_{ig}x_t + b_{ig} + W_{hg}h_{t-1} + b_{hg}) \\
&o_t = \sigma(W_{io}x_t + b_{io} + W_{ho}h_{t-1} + b_{ho}) \\
&c_t = f_t \odot c_{t-1} + i_t \odot g_t \\
&h_t = o_t \odot \tanh(c_t)
\end{align*}
$$

**Advantage**: the cell state $c_t$ adds a direct path along which gradients can flow, which mitigates the vanishing-gradient problem.

```python
cell = torch.nn.LSTM(input_size, hidden_size, num_layers)
out, (h, c) = cell(input, (h, c))
```
**Input**: input of shape (seqSize, batch, input_size); hidden of shape (numLayers * numDirections, batch, hidden_size); cell of shape (numLayers * numDirections, batch, hidden_size)

**Output**: output of shape (seqSize, batch, numDirections * hidden_size); hidden of shape (numLayers * numDirections, batch, hidden_size); cell of shape (numLayers * numDirections, batch, hidden_size)

```python
numDirections = 2 if bidirectional else 1
```

In [None]:
import torch

# Hyperparameters for the Embedding -> LSTM -> Linear model.
num_class = 4
input_size = 4
hidden_size = 8
embedding_size = 10
num_layers = 2
batch_size = 1
seq_len = 5

# Vocabulary: a character's position in this list is its class index.
idx2char = ['e', 'h', 'l', 'o']
x_data = [[1, 0, 2, 2, 3]]  # "hello", laid out as (batch, seq_len)
y_data = [3, 1, 2, 3, 2]  # "ohlol", laid out as (batch*seq_len)

# Both tensors hold integer indices (int64): the embedding looks rows up by
# index, and the loss takes class indices as targets.
inputs = torch.tensor(x_data, dtype=torch.long)
labels = torch.tensor(y_data, dtype=torch.long)

In [None]:
class Model(torch.nn.Module):
    """Embedding -> (optionally bidirectional) LSTM -> Linear classifier.

    ``num_layers`` and ``bidirectional`` are constructor arguments; the other
    sizes (input_size, embedding_size, hidden_size, num_class) are read from
    the module-level hyperparameters.
    """

    def __init__(self, num_layers, bidirectional=False):
        super().__init__()
        self.num_layers = num_layers
        # A bidirectional LSTM runs one pass per direction.
        self.num_directions = 2 if bidirectional else 1

        # Lookup table: character index -> dense vector of length embedding_size.
        self.emb = torch.nn.Embedding(num_embeddings=input_size,
                                      embedding_dim=embedding_size)
        # batch_first=True: input and output are laid out (batch, seq_len, features).
        self.lstm = torch.nn.LSTM(embedding_size, hidden_size, num_layers,
                                  bidirectional=bidirectional,
                                  batch_first=True)
        # The LSTM output concatenates both directions, so the classifier's
        # input width is hidden_size * num_directions.
        self.fc = torch.nn.Linear(in_features=hidden_size * self.num_directions,
                                  out_features=num_class)

    def forward(self, x):
        """Score every time step of ``x``.

        ``x`` holds character indices with shape (batch_size, seq_len).
        Returns scores of shape (batch_size * seq_len, num_class).
        """
        # h_0 and c_0 share one shape: a state per layer and per direction.
        state_shape = (self.num_layers * self.num_directions, x.size(0), hidden_size)
        h_0 = torch.zeros(state_shape)
        c_0 = torch.zeros(state_shape)
        embedded = self.emb(x)                         # (batch_size, seq_len, embedding_size)
        lstm_out, _ = self.lstm(embedded, (h_0, c_0))  # (batch_size, seq_len, hidden_size * num_directions)
        scores = self.fc(lstm_out)                     # (batch_size, seq_len, num_class)
        return scores.view(-1, num_class)


model = Model(num_layers, bidirectional=True)

In [None]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

# num_epochs is the single source of truth for both the loop bound and the
# progress message, so the two cannot drift apart.
num_epochs = 15

for epoch in range(num_epochs):
    optimizer.zero_grad()
    # One row of class scores per (batch, time-step) position.
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # The index of the largest score in each row is the predicted character.
    _, idx = outputs.max(dim=1)
    # The index tensor carries no gradient, so the legacy `.data` accessor
    # is unnecessary before converting to NumPy.
    idx = idx.numpy()
    print('Predicted:', ''.join([idx2char[x] for x in idx]), end='')
    print(', Epoch [%d/%d] loss=%.4f' % (epoch+1, num_epochs, loss.item()))

Predicted: hhhhh, Epoch [1/15] loss=1.3900
Predicted: ooooo, Epoch [2/15] loss=1.2025
Predicted: ollll, Epoch [3/15] loss=1.0328
Predicted: oolll, Epoch [4/15] loss=0.8841
Predicted: oholl, Epoch [5/15] loss=0.7371
Predicted: ohool, Epoch [6/15] loss=0.5523
Predicted: ohlol, Epoch [7/15] loss=0.3412
Predicted: ohlol, Epoch [8/15] loss=0.1919
Predicted: ohlol, Epoch [9/15] loss=0.0978
Predicted: ohlol, Epoch [10/15] loss=0.0518
Predicted: ohlol, Epoch [11/15] loss=0.0256
Predicted: ohlol, Epoch [12/15] loss=0.0126
Predicted: ohlol, Epoch [13/15] loss=0.0068
Predicted: ohlol, Epoch [14/15] loss=0.0039
Predicted: ohlol, Epoch [15/15] loss=0.0024
