<a href="https://colab.research.google.com/github/gnoejh/ict1022/blob/main/Transformer/10_encoder_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# Attention, Positional Encoding, Encoder Block
- https://newsletter.theaiedge.io/p/the-transformer-architecture-v2

### 1. Attention Mechanism


**Objective**: Enable models to focus on relevant parts of input sequences, essential for handling longer dependencies.

**Explanation**: 
- Attention allows a model to “attend” to specific input parts when generating each output. It’s especially useful for tasks requiring selective referencing of input tokens (e.g., translation, summarization).

    

In [7]:
# Simple attention mechanism
def attention(query, key, value):
    scores = torch.matmul(query, key.transpose(-2, -1)) / torch.sqrt(torch.tensor(key.size(-1), dtype=torch.float32))
    attn_weights = torch.nn.functional.softmax(scores, dim=-1)
    output = torch.matmul(attn_weights, value)
    return output, attn_weights

# Example inputs
query = torch.randn(5, 3, 8)  # Batch size = 5, Sequence length = 3, Embedding size = 8
key = torch.randn(5, 3, 8)
value = torch.randn(5, 3, 8)

# Apply attention
output, weights = attention(query, key, value)
print("Attention Output:", output)
print("Attention Weights:", weights)
    

Attention Output: tensor([[[ 9.6740e-01, -3.0352e-02,  1.1156e-01,  1.9646e-01, -6.1691e-01,
           7.4783e-01, -5.7474e-02,  1.5947e-02],
         [ 8.2061e-01, -5.0180e-02, -9.8467e-01, -1.8920e-01, -1.6075e+00,
           1.0993e+00, -2.2059e-01, -6.5414e-01],
         [ 1.3272e+00,  3.0562e-01,  7.5185e-01,  1.5077e-01, -2.8726e-01,
           3.6313e-01,  3.9887e-01,  1.0497e+00]],

        [[ 3.4469e-01,  2.5661e-01,  5.7576e-01,  6.3296e-01,  2.7943e-01,
          -7.2584e-01,  1.0602e+00,  2.8766e-01],
         [ 8.6278e-01,  2.7242e-01,  2.6027e-01,  2.0195e-01,  4.0122e-01,
          -5.4352e-01,  8.1289e-01,  3.4031e-01],
         [ 1.6067e+00,  2.1239e-01,  1.1643e-01, -2.6463e-01,  8.8893e-01,
          -3.1012e-01,  2.2290e-01,  5.0179e-01]],

        [[ 1.8186e-01, -2.7800e-01, -7.5871e-02,  1.2186e+00, -1.1200e+00,
          -1.0662e+00, -1.0065e+00, -1.1341e+00],
         [-1.8343e-02, -1.4870e-03, -7.7953e-02,  1.2999e+00, -1.2093e+00,
          -9.5335e-01, -9.07

### 2. Self-Attention Mechanism


**Objective**: Understand how self-attention allows a model to relate each word in a sentence to every other word, helping capture context effectively.

**Explanation**:
   - **Queries, Keys, and Values**: In self-attention, each word is represented by three vectors: a query, a key, and a value.
   - **Dot Product and Scaling**: The query and key vectors are multiplied to determine attention scores, representing the similarity between words. These scores are scaled and then passed through a softmax function to obtain attention weights.
   - **Weighted Sum**: Each word’s final representation is the sum of all value vectors, weighted by attention scores, allowing the model to focus on relevant words in context.
    

In [8]:
# Self-attention function
def self_attention(query, key, value):
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
    attn_weights = F.softmax(scores, dim=-1)
    output = torch.matmul(attn_weights, value)
    return output, attn_weights

# Example inputs
query = torch.randn(2, 4, 8)  # Batch size = 2, Sequence length = 4, Embedding size = 8
key = torch.randn(2, 4, 8)
value = torch.randn(2, 4, 8)

# Apply self-attention
output, attn_weights = self_attention(query, key, value)
print("Self-Attention Output:", output)
print("Attention Weights:", attn_weights)
    

Self-Attention Output: tensor([[[ 0.0643,  0.3878, -0.9651, -0.1585,  0.2823,  0.5144,  0.4199,
           1.3144],
         [-1.5024,  0.9853,  0.1165, -0.4015, -0.5092,  0.0443,  0.7778,
          -0.2300],
         [-0.5930,  0.6945, -0.6236, -0.4940, -0.0053,  0.6527,  0.2250,
           0.5691],
         [-1.0035,  1.0931, -0.1644, -0.6018,  0.1497,  0.8812,  0.3046,
           0.4224]],

        [[-1.1846, -0.6857, -0.8063,  0.5842, -0.5376,  0.2940, -0.3155,
          -0.2056],
         [-1.2179, -0.6285, -0.5397,  0.7711, -0.2013, -0.3624, -0.6052,
           0.1188],
         [-0.9526, -0.6172, -0.5162,  0.8198, -0.1445, -0.4041, -0.1650,
           0.2313],
         [-0.5575, -0.6047, -0.5012,  0.8782, -0.0858, -0.4164,  0.5090,
           0.3747]]])
Attention Weights: tensor([[[0.1811, 0.1012, 0.7108, 0.0070],
         [0.0116, 0.4177, 0.0339, 0.5368],
         [0.2914, 0.3495, 0.3362, 0.0229],
         [0.0639, 0.5883, 0.1632, 0.1845]],

        [[0.2246, 0.4163, 0.1650, 0.

### 3. Multi-Head Attention


**Objective**: Enhance the model’s ability to capture diverse word relationships by using multiple attention heads.

**Explanation**:
   - **Multiple Heads**: Multi-head attention applies self-attention multiple times in parallel, each with different learned projections. This allows the model to capture various types of dependencies, such as grammatical or semantic.
   - **Concatenation and Linear Transformation**: Each head’s output is concatenated and passed through a linear layer, blending the insights from all heads into a single representation.
    

In [9]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert embed_size % num_heads == 0
        self.head_dim = embed_size // num_heads
        self.num_heads = num_heads

        # Linear layers for queries, keys, values
        self.query = nn.Linear(embed_size, embed_size)
        self.key = nn.Linear(embed_size, embed_size)
        self.value = nn.Linear(embed_size, embed_size)
        
        # Final linear layer after concatenation
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, x):
        batch_size, seq_len, embed_size = x.size()

        # Transform inputs into multiple heads
        query = self.query(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        key = self.key(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        value = self.value(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        
        # Apply self-attention
        scores = torch.matmul(query, key.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))
        attn_weights = F.softmax(scores, dim=-1)
        attention = torch.matmul(attn_weights, value)
        
        # Concatenate heads and pass through the final linear layer
        attention = attention.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_size)
        return self.fc_out(attention)

# Example usage
embed_size = 16
num_heads = 4
x = torch.randn(2, 5, embed_size)  # Batch size = 2, Sequence length = 5

multi_head_attention = MultiHeadAttention(embed_size, num_heads)
output = multi_head_attention(x)
print("Multi-Head Attention Output:", output)
    

Multi-Head Attention Output: tensor([[[ 0.2883, -0.0666,  0.0324,  0.0903, -0.0224,  0.1709,  0.3777,
          -0.1263, -0.1861,  0.1183,  0.0483,  0.0571,  0.3380, -0.0030,
          -0.0444,  0.1921],
         [ 0.0937, -0.2319, -0.2270,  0.2127, -0.3615,  0.1010,  0.1613,
           0.2324, -0.1731,  0.1133,  0.1282,  0.1121,  0.1108,  0.0730,
          -0.0035,  0.0394],
         [ 0.1270, -0.0730, -0.0799,  0.1946, -0.3110, -0.0063,  0.2045,
           0.2427, -0.1767,  0.0796,  0.0379,  0.2217,  0.1492,  0.1154,
           0.0604,  0.0234],
         [ 0.4188,  0.0238,  0.2415,  0.1716,  0.0192,  0.0526,  0.4497,
          -0.1370, -0.0547,  0.2190,  0.1902,  0.1459,  0.3794,  0.0985,
           0.0621,  0.1025],
         [-0.0127, -0.2505, -0.2466,  0.1567, -0.4309,  0.0406,  0.1680,
           0.2939, -0.2072,  0.0328,  0.0527,  0.1321,  0.0940,  0.0553,
           0.0515,  0.1003]],

        [[-0.1013, -0.0444, -0.0998,  0.1317, -0.4518, -0.1567,  0.0364,
           0.2895,  0

### 4. Positional Encoding


**Objective**: Introduce positional information into word embeddings, as Transformers process words in parallel without inherent sequence order.

**Explanation**:
   - Since Transformers do not process words sequentially, positional encodings are added to the embeddings to give the model information about the order of words.
   - **Sine and Cosine Functions**: Positional encodings are computed using sine and cosine functions with varying frequencies, creating unique patterns for each position in the sequence.
    

In [10]:
class PositionalEncoding(nn.Module):
    def __init__(self, embed_size, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, embed_size)
        for pos in range(max_len):
            for i in range(0, embed_size, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / embed_size)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * i) / embed_size)))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return x

# Example usage
embed_size = 16
seq_len = 10
pos_encoding = PositionalEncoding(embed_size)
x = torch.randn(1, seq_len, embed_size)
output = pos_encoding(x)
print("Positional Encoding Output:", output)
    

Positional Encoding Output: tensor([[[-0.0142,  0.6519, -0.4046,  1.5159,  0.5459,  1.1051,  0.9110,
           3.0094,  1.1891,  2.7101,  0.0316,  1.0957,  0.3445,  0.5408,
           0.3892,  0.4563],
         [ 1.9320,  0.4930,  0.7246,  1.5846,  0.6008,  1.0921,  0.3042,
           1.9109,  0.4135,  0.1733, -1.7470,  0.5978, -0.9461,  1.1420,
          -0.3217,  1.4612],
         [ 2.0854,  0.3954,  2.5952,  2.0235, -1.1602,  1.6995,  0.7881,
          -0.0250,  0.0606,  2.2339, -1.9072,  1.5465, -0.0992,  1.2900,
           0.1494,  0.2316],
         [ 1.0135,  0.3130, -0.4570, -0.3857,  0.6012,  0.7225,  1.1795,
           2.0650,  0.0155,  0.5062, -0.4983,  1.7248, -0.3680,  1.2916,
          -0.9639,  2.2793],
         [-0.3510,  0.4673,  0.3528,  0.4085, -0.5552,  1.6738, -0.1249,
           1.5577,  1.7393,  1.4887, -1.9180,  2.1472, -0.2435,  0.0221,
           0.5600,  0.5158],
         [-1.5962, -1.0358,  0.3558,  2.9229,  0.7078,  4.4950, -0.6757,
           1.4326,  1.95

### 4. Transformer Encoder Block


**Objective**: Understand how the Transformer encoder combines self-attention and feedforward layers, the fundamental units in a Transformer.

**Explanation**:
   - **Self-Attention Layer**: The encoder starts with a multi-head attention layer, allowing the model to attend to relevant parts of the input.
   - **Add & Norm**: After self-attention, a residual connection is added, followed by layer normalization to stabilize learning.
   - **Feedforward Layer**: The attention output is passed through a fully connected layer for further processing.
   - **Final Add & Norm**: A second residual connection and layer normalization complete the block, making the Transformer’s encoder robust to varying input sequences.
    

In [11]:
class TransformerEncoderBlock(nn.Module):
    def __init__(self, embed_size, num_heads, forward_expansion):
        super(TransformerEncoderBlock, self).__init__()
        self.attention = MultiHeadAttention(embed_size, num_heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size)
        )

    def forward(self, x):
        attention = self.attention(x)
        x = self.norm1(attention + x)  # Add & Norm
        forward = self.feed_forward(x)
        x = self.norm2(forward + x)  # Add & Norm
        return x

# Example usage
embed_size = 16
num_heads = 4
forward_expansion = 4
encoder_block = TransformerEncoderBlock(embed_size, num_heads, forward_expansion)
x = torch.randn(2, 5, embed_size)
output = encoder_block(x)
print("Transformer Encoder Block Output:", output)
    

Transformer Encoder Block Output: tensor([[[ 0.3580, -2.3309,  0.9199,  1.8216,  0.1357, -0.4394, -0.0730,
          -0.1890, -0.8685,  0.2046, -0.7075,  1.4934, -0.8587, -0.8029,
           0.3374,  0.9994],
         [-0.5780, -0.1122, -1.0606, -0.9381,  0.7764, -0.6726,  0.2579,
          -2.0317, -0.1308,  0.3224,  0.5965,  0.6775, -1.1284,  2.0658,
           0.9089,  1.0469],
         [-0.2449, -0.3956, -1.1125, -0.1097, -1.4519,  0.5094, -0.1645,
          -1.7579,  1.3889, -1.3745,  1.6421,  0.4637,  0.2255,  1.2707,
           0.6900,  0.4212],
         [ 0.7663,  0.8497, -1.3042,  0.7077, -0.1119,  1.0356, -1.3789,
          -0.9106, -0.8916, -1.4833,  0.7209,  0.5240, -1.1699,  0.1378,
           0.8356,  1.6728],
         [-0.7347,  0.5657,  0.1316, -0.7882, -1.1169,  0.0235,  1.2757,
           0.6790, -2.7670,  0.9572, -0.4825,  1.2487,  0.5235, -0.3385,
           0.2200,  0.6029]],

        [[-1.6383,  1.8912, -0.9375, -0.1615,  0.6343, -0.2352, -0.8187,
          -0.118