# Attention Mechanism

[DeepLearning.ai: Attention in Transformers: Concepts and Code in PyTorch](https://learn.deeplearning.ai/courses/attention-in-transformers-concepts-and-code-in-pytorch/lesson/gb20l/the-matrix-math-for-calculating-self-attention)

Process: 
1. Input tokenized
2. Input embedded into an `(n, 512)` matrix, where `n` is the number of tokens and each token is represented by a 512-dimensional vector
3. Calculate the Q, K, V matrices: 
      - `(n, 512) x (512, 512)^T = (n, 512)`, 
      - where the token matrix `(n, 512)` is multiplied 
      - by the Q/K/V weights `(512, 512)`, which are transposed because PyTorch's `nn.Linear` stores its weight as `(out_features, in_features)` and computes `x W^T`,
      - giving a resulting matrix of `(n, 512)`
4. Calculate self attention
      $$
      \text{Attention}(Q, K, V) = \text{softmax} \left( \frac{Q K^\top}{\sqrt{d_k}} \right) V
      $$

      - Query matrix: $Q$
      - Key matrix: $K$
      - Value matrix: $V$
      - Dimension of key vectors: $d_k$, where in our case it's 512
      - Activation function: $\text{softmax}()$, applied to each row so that the values in every row sum to 1

![Attention Mechanism Diagram](../images/self-attention-QKV.png)

Note:
- **Dot products** can be used as an unscaled measure of similarity between two things. This metric becomes especially powerful when **comparing vectors like Queries and Keys in the Attention mechanism**. 
- The higher the Dot Product between a Query and a Key, the more similar they are considered to be, which leads to a stronger influence of the corresponding value in the final output. 
- However, as the dimensionality of these vectors increases, the **raw Dot Product values can become large**, which may **push the Softmax function into regions with very small gradients**. 
- To counteract this, **the Dot Product** is scaled by the square root of the key dimension $\sqrt{d_k}$, ensuring more stable gradients and improved training dynamics.



In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

## Self Attention

In [27]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Queries, keys and values are all derived from the same token encodings by
    three bias-free linear layers of size ``d_model x d_model``.

    Args:
        d_model: Size of the encoding of each token (e.g. 512); it is also the
            input and output size of the layers that create Q, K and V.
        row_dim: Index of the dimension treated as rows when transposing K.
        col_dim: Index of the dimension treated as columns; the softmax is
            taken along it and its size in K is used for the scaling factor.
    """

    def __init__(self, d_model=2, row_dim=0, col_dim=1):
        super().__init__()

        self.row_dim = row_dim
        self.col_dim = col_dim

        # Keep the creation order Q, K, V: the layers draw their initial
        # weights from the global RNG, so the order affects seeded results.
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)

    def forward(self, token_encodings):
        """Compute the self-attention output.

        Args:
            token_encodings: The word embeddings plus positional encoding.

        Returns:
            softmax(Q K^T / sqrt(d_k)) V for the given encodings.
        """
        queries = self.W_q(token_encodings)
        keys = self.W_k(token_encodings)
        values = self.W_v(token_encodings)

        # Step 1: similarities, Q * K^T
        keys_t = keys.transpose(dim0=self.row_dim, dim1=self.col_dim)
        similarities = torch.matmul(queries, keys_t)

        # Step 2: scale, Q * K^T / sqrt(d_k)
        scale = torch.tensor(keys.size(self.col_dim) ** 0.5)
        scaled_similarities = similarities / scale

        # Step 3: softmax(Q * K^T / sqrt(d_k)), values along col_dim sum to 1
        attention_weights = F.softmax(scaled_similarities, dim=self.col_dim)

        # Step 4: softmax(Q * K^T / sqrt(d_k)) * V
        return torch.matmul(attention_weights, values)

In [28]:
# Example input: three tokens, each encoded with two values.
encoding_matrix = torch.tensor([
    [1.16, 0.23],
    [0.57, 1.36],
    [4.41, -2.16]
])

# Fix the seed so the randomly initialised weights are reproducible.
torch.manual_seed(42)

selfAttention = SelfAttention(d_model=2, row_dim=0, col_dim=1)

# Calculation of self-attention scores. Call the module itself rather than
# .forward(): calling the instance also runs any registered hooks.
selfAttention(encoding_matrix)
# Output: tensor([
#        [1.0100, 1.0641],
#        [0.2040, 0.7057],
#        [3.4989, 2.2427]], grad_fn=<MmBackward0>)

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<MmBackward0>)

In [32]:
# Sanity check: print the transposed Q, K and V weight matrices and the
# resulting Q matrix so the self-attention result can be re-derived by hand
# (the course code and video do not spell this step out).
print(selfAttention.W_q.weight.transpose(0, 1))
print(selfAttention.W_k.weight.transpose(0, 1))
print(selfAttention.W_v.weight.transpose(0, 1))

selfAttention.W_q(encoding_matrix)

tensor([[ 0.5406, -0.1657],
        [ 0.5869,  0.6496]], grad_fn=<TransposeBackward0>)
tensor([[-0.1549, -0.3443],
        [ 0.1427,  0.4153]], grad_fn=<TransposeBackward0>)
tensor([[ 0.6233,  0.6146],
        [-0.5188,  0.1323]], grad_fn=<TransposeBackward0>)


tensor([[ 0.7621, -0.0428],
        [ 1.1063,  0.7890],
        [ 1.1164, -2.1336]], grad_fn=<MmBackward0>)

## Masked Self Attention

1. Everything is the same as in self-attention,
2. but the calculation differs for masked self-attention:
      $$
      \text{Attention}(Q, K, V) = \text{softmax} \left( \frac{Q K^\top}{\sqrt{d_k}} + M \right) V
      $$

      - M is masking matrix, e.g. Look-ahead mask prevents attention to future tokens

    

In [None]:
class MaskedSelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention with a mask.

    Queries, keys and values are all derived from the same token encodings by
    three bias-free linear layers of size ``d_model x d_model``.

    Args:
        d_model: Size of the encoding of each token (e.g. 512); it is also the
            input and output size of the layers that create Q, K and V.
        row_dim: Index of the dimension treated as rows when transposing K.
        col_dim: Index of the dimension treated as columns; the softmax is
            taken along it and its size in K is used for the scaling factor.
    """

    def __init__(self, d_model=2, row_dim=0, col_dim=1):
        super().__init__()

        self.row_dim = row_dim
        self.col_dim = col_dim

        # Keep the creation order Q, K, V: the layers draw their initial
        # weights from the global RNG, so the order affects seeded results.
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)

    def forward(self, token_encodings, mask):
        """Compute the masked self-attention output.

        Args:
            token_encodings: The word embeddings plus positional encoding.
            mask: Tensor whose zero entries mark the query/key pairs that must
                be ignored (e.g. a look-ahead mask hiding future tokens).

        Returns:
            softmax(Q K^T / sqrt(d_k) + M) V, where M is -inf at the masked
            positions and 0 elsewhere.
        """
        queries = self.W_q(token_encodings)
        keys = self.W_k(token_encodings)
        values = self.W_v(token_encodings)

        # Step 1: similarities, Q * K^T
        keys_t = keys.transpose(dim0=self.row_dim, dim1=self.col_dim)
        similarities = torch.matmul(queries, keys_t)

        # Step 2: scale, Q * K^T / sqrt(d_k)
        scale = torch.tensor(keys.size(self.col_dim) ** 0.5)
        scaled_similarities = similarities / scale

        # Step 3: apply the mask; -inf becomes a weight of 0 after softmax
        hidden = mask == 0
        masked_similarities = scaled_similarities.masked_fill(hidden, float('-inf'))

        # Step 4: softmax(Q * K^T / sqrt(d_k) + M)
        attention_weights = F.softmax(masked_similarities, dim=self.col_dim)

        # Step 5: softmax(Q * K^T / sqrt(d_k) + M) * V
        return torch.matmul(attention_weights, values)

In [21]:
# Build a 3x3 look-ahead mask: ones on and below the main diagonal, zeros above
mask = torch.ones(3, 3).tril(diagonal=0)
mask

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [None]:
# Example input: three tokens, each encoded with two values.
encoding_matrix = torch.tensor([
    [1.16, 0.23],
    [0.57, 1.36],
    [4.41, -2.16]
])

# Fix the seed so the randomly initialised weights are reproducible.
torch.manual_seed(42)

maskedSelfAttention = MaskedSelfAttention(d_model=2, row_dim=0, col_dim=1)

# Calculation of self-attention scores with mask. Call the module itself
# rather than .forward(): calling the instance also runs any registered hooks.
maskedSelfAttention(encoding_matrix, mask=mask)
# Output: tensor([
#       [ 0.6038,  0.7434],
#       [-0.0062,  0.6072],
#       [ 3.4989,  2.2427]]

tensor([[ 0.6038,  0.7434],
        [-0.0062,  0.6072],
        [ 3.4989,  2.2427]], grad_fn=<MmBackward0>)

In [33]:
# validating masked self-attention calculations
print(maskedSelfAttention.W_q.weight.transpose(0, 1))

# Only the first row should be the same as the masked self-attention scores:
# the look-ahead mask lets the first token attend only to itself, so its output
# is exactly its own value vector. The later rows mix several value vectors.
maskedSelfAttention.W_v(encoding_matrix)

tensor([[ 0.5406, -0.1657],
        [ 0.5869,  0.6496]], grad_fn=<TransposeBackward0>)


tensor([[ 0.6038,  0.7434],
        [-0.3502,  0.5303],
        [ 3.8695,  2.4246]], grad_fn=<MmBackward0>)

# Complete Self Attention

In [2]:
class Attention(nn.Module):
    """Single-head scaled dot-product attention with an optional mask.

    Queries, keys and values are projected from three separately supplied
    encodings, so the same class covers self-attention (all three inputs are
    the same) and attention between two different sequences.

    Args:
        d_model: Input and output size of the bias-free linear layers that
            create Q, K and V.
        row_dim: Index of the dimension treated as rows when transposing K.
        col_dim: Index of the dimension treated as columns; the softmax is
            taken along it and its size in K is used for the scaling factor.
    """

    def __init__(self, d_model=2, row_dim=0, col_dim=1):
        super().__init__()

        self.row_dim = row_dim
        self.col_dim = col_dim

        # Keep the creation order Q, K, V: the layers draw their initial
        # weights from the global RNG, so the order affects seeded results.
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)

    def forward(self, encodings_for_q, encodings_for_k, encodings_for_v, mask=None):
        """Compute the attention output.

        Args:
            encodings_for_q: Encodings projected into the queries.
            encodings_for_k: Encodings projected into the keys.
            encodings_for_v: Encodings projected into the values.
            mask: Optional tensor whose zero entries mark the query/key pairs
                that must be ignored. ``None`` disables masking.

        Returns:
            softmax(Q K^T / sqrt(d_k)) V, with masked positions set to -inf
            before the softmax when a mask is given.
        """
        queries = self.W_q(encodings_for_q)
        keys = self.W_k(encodings_for_k)
        values = self.W_v(encodings_for_v)

        # Step 1: similarities, Q * K^T
        keys_t = keys.transpose(dim0=self.row_dim, dim1=self.col_dim)
        similarities = torch.matmul(queries, keys_t)

        # Step 2: scale, Q * K^T / sqrt(d_k)
        scale = torch.tensor(keys.size(self.col_dim) ** 0.5)
        scaled_similarities = similarities / scale

        # Step 3 (optional): apply the mask; -inf becomes a weight of 0
        if mask is not None:
            hidden = mask == 0
            scaled_similarities = scaled_similarities.masked_fill(hidden, float('-inf'))

        # Step 4: softmax over col_dim
        attention_weights = F.softmax(scaled_similarities, dim=self.col_dim)

        # Step 5: weighted sum of the values
        return torch.matmul(attention_weights, values)

In [None]:
# The same three-token example is used for queries, keys and values,
# which makes this call equivalent to plain self-attention.
encodings_for_q = torch.tensor([
    [1.16, 0.23],
    [0.57, 1.36],
    [4.41, -2.16]
])

encodings_for_k = torch.tensor([
    [1.16, 0.23],
    [0.57, 1.36],
    [4.41, -2.16]
])

encodings_for_v = torch.tensor([
    [1.16, 0.23],
    [0.57, 1.36],
    [4.41, -2.16]
])

# Fix the seed so the randomly initialised weights are reproducible.
torch.manual_seed(42)

attention = Attention(d_model=2, row_dim=0, col_dim=1)

# Calculation of attention scores. Call the module itself rather than
# .forward(): calling the instance also runs any registered hooks.
attention(encodings_for_q, encodings_for_k, encodings_for_v)
# Output: tensor([[1.0100, 1.0641],
#                 [0.2040, 0.7057],
#                 [3.4989, 2.2427]])

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<MmBackward0>)

In [5]:
class MultiHeadAttention(nn.Module):
    """Run several independent ``Attention`` heads and concatenate their outputs.

    Every head is a full ``Attention`` module with its own Q, K and V weights
    of size ``d_model x d_model``. The head outputs are concatenated along
    ``col_dim``.

    Note: this simplified version does NOT apply an output projection W^O to
    the concatenated heads; the concatenation itself is the result.

    Args:
        d_model: Encoding size handed to every head.
        row_dim: Row dimension index handed to every head.
        col_dim: Column dimension index handed to every head; also the
            dimension along which the head outputs are concatenated.
        num_heads: Number of attention heads; must be at least 1.

    Raises:
        ValueError: If ``num_heads`` is smaller than 1.
    """

    def __init__(self, d_model=2, row_dim=0, col_dim=1, num_heads=1):
        super().__init__()

        # Fail fast: with no heads, forward() would only fail later inside
        # torch.cat() with a much less helpful error message.
        if num_heads < 1:
            raise ValueError(f"num_heads must be at least 1, got {num_heads}")

        self.heads = nn.ModuleList([
            Attention(d_model=d_model, row_dim=row_dim, col_dim=col_dim)
            for _ in range(num_heads)
        ])

        self.col_dim = col_dim

    def forward(self, encodings_for_q, encodings_for_k, encodings_for_v, mask=None):
        """Return the outputs of all heads concatenated along ``col_dim``.

        Args:
            encodings_for_q: Encodings projected into the queries of each head.
            encodings_for_k: Encodings projected into the keys of each head.
            encodings_for_v: Encodings projected into the values of each head.
            mask: Optional mask forwarded unchanged to every head.
        """
        # Every head receives the same inputs and the same mask.
        head_outputs = [
            head(encodings_for_q, encodings_for_k, encodings_for_v, mask=mask)
            for head in self.heads
        ]

        # Concatenate the heads; no W^O projection is applied afterwards.
        return torch.cat(head_outputs, dim=self.col_dim)


In [None]:
# Same seed as for the single-head example, so the single head created here
# gets the same initial weights.
torch.manual_seed(42)

multiHeadAttention = MultiHeadAttention(d_model=2, row_dim=0, col_dim=1, num_heads=1)

# Call the modules themselves rather than .forward(): calling the instance
# also runs any registered hooks.
multiHeadAttention(encodings_for_q, encodings_for_k, encodings_for_v)
# should be the same as attention scores from the single head attention
attention(encodings_for_q, encodings_for_k, encodings_for_v)
# Output: tensor([[1.0100, 1.0641],
#                 [0.2040, 0.7057],
#                 [3.4989, 2.2427]])

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<CatBackward0>)

In [None]:
# Fix the seed so the randomly initialised weights are reproducible.
torch.manual_seed(42)
multi_head_attention = MultiHeadAttention(d_model=2, row_dim=0, col_dim=1, num_heads=2)
# Calculation of multi-head attention scores: the two head outputs are placed
# side by side. Call the module itself rather than .forward(): calling the
# instance also runs any registered hooks.
multi_head_attention(encodings_for_q, encodings_for_k, encodings_for_v)
# Output: tensor([[ 1.0100,  1.0641, -0.7081, -0.8268],
#                 [ 0.2040,  0.7057, -0.7417, -0.9193],
#                 [ 3.4989,  2.2427, -0.7190, -0.8447]]

tensor([[ 1.0100,  1.0641, -0.7081, -0.8268],
        [ 0.2040,  0.7057, -0.7417, -0.9193],
        [ 3.4989,  2.2427, -0.7190, -0.8447]], grad_fn=<CatBackward0>)