In [None]:
import torch
import math

In [None]:
# Toy-sized walk-through of how the sinusoidal positional-encoding table is built.
d_model = 4
max_seq_length = 5

# Column vector of position indices 0 .. max_seq_length - 1, shape (max_seq_length, 1).
position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
print(position)

# One frequency per (sin, cos) column pair: 10000^(-2i / d_model), computed via exp/log.
even_dims = torch.arange(0, d_model, 2).float()
div_term = torch.exp(even_dims * -(math.log(10000.0) / d_model))
print(div_term)

# Broadcasting (max_seq_length, 1) * (d_model / 2,) yields one angle per
# (position, frequency) pair.
angles = position * div_term
print(torch.sin(angles), torch.sin(angles).shape)

# Even columns take the sine of each angle, odd columns the cosine.
pe = torch.zeros(max_seq_length, d_model)
pe[:, 0::2] = torch.sin(angles)
pe[:, 1::2] = torch.cos(angles)
print(pe)

tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.]])
tensor([1.0000, 0.0100])
tensor([[ 0.0000,  0.0000],
        [ 0.8415,  0.0100],
        [ 0.9093,  0.0200],
        [ 0.1411,  0.0300],
        [-0.7568,  0.0400]]) torch.Size([5, 2])
tensor([[ 0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0100,  0.9999],
        [ 0.9093, -0.4161,  0.0200,  0.9998],
        [ 0.1411, -0.9900,  0.0300,  0.9996],
        [-0.7568, -0.6536,  0.0400,  0.9992]])


# Some links
- [Code reference](https://towardsdatascience.com/build-your-own-transformer-from-scratch-using-pytorch-84c850470dcb)
- [Original Paper -- Attention is all you need](https://arxiv.org/pdf/1706.03762.pdf)


- [nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html): base class for all neural network modules. Every model should be a subclass of it:

```
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        ...
    def forward(self, x):
        ...
        # return output
```
- Create the model with `model = Model()`. There is no need to call `model.forward(x)` explicitly.
- `model(x)` runs `forward` internally and gives the output.

# MultiHeadAttention
**Intuition**: Each head captures different relationships between words.

_NB_: The following code and explanation implement multi-head attention a bit differently from the original paper. $Q,K,V \in \mathbb{R}^{seq \times d_{model}}$

Original paper:
-  $W_i^Q \in \mathbb{R}^{d_{\text{model}} \times d_k}, \quad W_i^K \in \mathbb{R}^{d_{\text{model}} \times d_k}, \quad W_i^V \in \mathbb{R}^{d_{\text{model}} \times d_v}, \quad \text{and} \quad W^O \in \mathbb{R}^{h \cdot d_v \times d_{\text{model}}}$
- These are the weight matrices for each head $i$.

Code below:
- $W^Q \in \mathbb{R}^{d_{\text{model}} \times d_{model} }, \quad W^K \in \mathbb{R}^{d_{\text{model}} \times d_{model} }, \quad W^V \in \mathbb{R}^{d_{\text{model}} \times d_{model} }, \quad W^O \in \mathbb{R}^{h \cdot d_v \times d_{\text{model}}}$ (with $h \cdot d_v = d_{model}$)
- Only one big weight matrix each for $Q$, $K$ and $V$, shared by all heads (no per-head index $i$).
- Split the heads after linear transformation.

![image](https://github.com/guyuxuan9/UROP_robotic_arm/assets/58468284/5e12311f-dee3-4d09-9dce-90e81c93458c)

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

## Linear Layer for $Q$, $K$ and $V$

$Q_{original} = \begin{bmatrix}
    q_{1,1} & q_{1,2} & \dots & q_{1,d_{model}} \\
    q_{2,1} & q_{2,2} & \dots & q_{2,d_{model}} \\
    \vdots & \vdots & \ddots & \vdots \\
    q_{\text{seq},1} & q_{\text{seq},2} & \dots & q_{\text{seq},d_{model}} \\
\end{bmatrix}$,    $K_{original} = \begin{bmatrix}
    k_{1,1} & k_{1,2} & \dots & k_{1,d_{model}} \\
    k_{2,1} & k_{2,2} & \dots & k_{2,d_{model}} \\
    \vdots & \vdots & \ddots & \vdots \\
    k_{\text{seq},1} & k_{\text{seq},2} & \dots & k_{\text{seq},d_{model}} \\
\end{bmatrix}$

The weight matrices $W_Q$ and $W_K$ hold the learnable parameters. They are implemented with _nn.Linear_ layers. Each has dimension ($d_{model}, d_{model}$).

$W_Q = \begin{bmatrix}
\vdots & \vdots & \dots & \vdots \\
w_1^Q & w_2^Q & \dots & w_{d_{model}}^Q \\
\vdots & \vdots & \dots & \vdots
\end{bmatrix}$ $W_K = \begin{bmatrix}
\vdots & \vdots & \dots & \vdots \\
w_1^K & w_2^K & \dots & w_{d_{model}}^K \\
\vdots & \vdots & \dots & \vdots
\end{bmatrix}$

By multiplying the original embeddings with the learnable weights, the network can learn more patterns, which gives it more expressive power than self-attention applied directly to the raw embeddings.

$Q = Q_{original}W_Q = \begin{bmatrix}
    \sum_{j=1}^{d_{model}} q_{1,j}w_{j,1}^Q & \sum_{j=1}^{d_{model}} q_{1,j}w_{j,2}^Q & \dots & \sum_{j=1}^{d_{model}} q_{1,j}w_{j,d_{model}}^Q \\
    \sum_{j=1}^{d_{model}} q_{2,j}w_{j,1}^Q & \sum_{j=1}^{d_{model}} q_{2,j}w_{j,2}^Q & \dots & \sum_{j=1}^{d_{model}} q_{2,j}w_{j,d_{model}}^Q \\
    \vdots & \vdots & \ddots & \vdots \\
    \sum_{j=1}^{d_{model}} q_{\text{seq},j}w_{j,1}^Q & \sum_{j=1}^{d_{model}} q_{\text{seq},j}w_{j,2}^Q & \dots & \sum_{j=1}^{d_{model}} q_{\text{seq},j}w_{j,d_{model}}^Q \\
\end{bmatrix} = \begin{bmatrix}
    q_{1,1}' & q_{1,2}' & \dots & q_{1,d_{model}}' \\
    q_{2,1}' & q_{2,2}' & \dots & q_{2,d_{model}}' \\
    \vdots & \vdots & \ddots & \vdots \\
    q_{\text{seq},1}' & q_{\text{seq},2}' & \dots & q_{\text{seq},d_{model}}' \\
\end{bmatrix}$,

$K = K_{original}W_K = \begin{bmatrix}
    k_{1,1}' & k_{1,2}' & \dots & k_{1,d_{model}}' \\
    k_{2,1}' & k_{2,2}' & \dots & k_{2,d_{model}}' \\
    \vdots & \vdots & \ddots & \vdots \\
    k_{\text{seq},1}' & k_{\text{seq},2}' & \dots & k_{\text{seq},d_{model}}' \\
\end{bmatrix}$





## Split heads
(batch size, seq length, $d_{model}$) --> (batch size, # heads, seq length, $d_k$)

$\begin{bmatrix}
    q_{1,1}' & q_{1,2}' & \dots & q_{1,d_{model}}' \\
    q_{2,1}' & q_{2,2}' & \dots & q_{2,d_{model}}' \\
    \vdots & \vdots & \ddots & \vdots \\
    q_{\text{seq},1}' & q_{\text{seq},2}' & \dots & q_{\text{seq},d_{model}}' \\
\end{bmatrix}$ --> $\begin{bmatrix}
    q_{1,1}' & \dots & q_{1,k}'  \\
    q_{2,1}' & \dots &q_{2,k}'\\
    \vdots & \ddots & \vdots   \\
    q_{\text{seq},1}' & \dots & q_{\text{seq},k}' \\
\end{bmatrix}$ $\begin{bmatrix}
    q_{1,k+1}' & \dots & q_{1,2k}'  \\
    q_{2,k+1}' & \dots &q_{2,2k}'\\
    \vdots & \ddots & \vdots   \\
    q_{\text{seq},k+1}' & \dots & q_{\text{seq},2k}' \\
\end{bmatrix}$ ...



## Attention calculation

$\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_{k}}}\right) V$
- **Q**: Why scaled? **A**: The dot product is $q \cdot k = \sum_{i=1}^{d_k} q_i k_i$. Assume the components of $q$ and $k$ are independent with zero mean and unit variance. Then $E\{q \cdot k\} = 0$ and $Var(q \cdot k) = d_k$. If the dot products get large, softmax enters its saturation region --> vanishing gradients. Dividing by $\sqrt{d_k}$ brings the variance back to 1.
- Attention has shape: (batch size, # heads, seq length,  $d_k$ )

## Combine heads
(batch size, # heads, seq length, $d_k$) --> (batch size, seq length, $d_{model}$)

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Q, K and V are each projected by one ``d_model x d_model`` linear layer,
    the projected features are split into ``num_heads`` slices of width
    ``d_k = d_model // num_heads``, attention is computed per head, and the
    heads are concatenated and passed through a final output projection.

    Args:
        d_model: Size of the feature dimension of the inputs and the output.
        num_heads: Number of attention heads; must divide ``d_model``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        # d_k = d_v = d_model / num_heads, so the heads must tile d_model exactly.
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # One projection per role, shared by all heads; heads are split afterwards.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return softmax(Q K^T / sqrt(d_k)) V.

        Args:
            Q, K, V: Per-head tensors whose last two dimensions are
                (seq length, d_k).
            mask: Optional tensor broadcastable to the attention-score shape.
                Positions where ``mask == 0`` are excluded from attention.

        Returns:
            The attention-weighted combination of ``V``.
        """
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            # Use the most negative finite value of the score dtype rather than
            # a literal -1e9: -1e9 is not representable in float16, so the
            # literal makes masked_fill fail under half precision.
            fill_value = torch.finfo(attn_scores.dtype).min
            attn_scores = attn_scores.masked_fill(mask == 0, fill_value)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        return torch.matmul(attn_probs, V)

    def split_heads(self, x):
        """(batch size, seq length, d_model) -> (batch size, # heads, seq length, d_k)."""
        batch_size, seq_length, _ = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        """(batch size, # heads, seq length, d_k) -> (batch size, seq length, d_model)."""
        batch_size, _, seq_length, _ = x.size()
        # transpose() returns a non-contiguous view; view() needs contiguous memory.
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head, then merge and project the heads.

        Args:
            Q, K, V: Tensors of shape (batch size, seq length, d_model).
            mask: Optional mask forwarded to ``scaled_dot_product_attention``.

        Returns:
            Tensor of shape (batch size, query seq length, d_model).
        """
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        return self.W_o(self.combine_heads(attn_output))

# Position-wise Feed-Forward Networks
![image](https://github.com/guyuxuan9/UROP_robotic_arm/assets/58468284/4c2f54fb-27d0-4e19-a99f-0562123240d7)


In [None]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear.

    Args:
        d_model: Feature size of the input and of the output.
        d_ff: Width of the hidden layer between the two linear maps.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        # Expand d_model -> d_ff, then project d_ff -> d_model.
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply fc1, ReLU and fc2 to ``x`` along its last dimension."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

# Positional encoding