<a href="https://colab.research.google.com/github/gnoejh/ict1022/blob/main/Transformer/12_transformer_code_random.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

# Transformer

### 1. Using nn.Transformer for the Full Model


We will implement the full Transformer model using PyTorch's built-in `nn.Transformer`, which includes encoder and decoder layers optimized for Transformer architecture. This approach is more efficient and avoids the need to implement individual layers manually.

**Explanation**:
- **nn.Transformer**: Includes all the core components of a Transformer model, including multi-head attention, encoder and decoder layers, and residual connections.
- **nn.Embedding and Positional Encoding**: We use separate embedding and positional encoding modules, as these are not directly included in `nn.Transformer`.

This model can be used for tasks like translation, summarization, and other sequence-to-sequence tasks.
    

In [4]:
class PositionalEncoding(nn.Module):
    def __init__(self, embed_size, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, embed_size)
        for pos in range(max_len):
            for i in range(0, embed_size, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / embed_size)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * i) / embed_size)))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return x

class TransformerModel(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, embed_size, num_heads, num_layers, forward_expansion, dropout, max_length):
        super(TransformerModel, self).__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, embed_size)
        self.positional_encoding = PositionalEncoding(embed_size, max_len=max_length)
        
        # Transformer module with encoder and decoder layers
        self.transformer = nn.Transformer(
            d_model=embed_size, 
            nhead=num_heads, 
            num_encoder_layers=num_layers, 
            num_decoder_layers=num_layers, 
            dim_feedforward=forward_expansion * embed_size, 
            dropout=dropout,
            batch_first=True
        )
        
        self.fc_out = nn.Linear(embed_size, tgt_vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        # Embedding and positional encoding for source and target
        src_embedded = self.positional_encoding(self.src_embedding(src))
        tgt_embedded = self.positional_encoding(self.tgt_embedding(tgt))
        
        # Transformer forward pass
        transformer_output = self.transformer(
            src_embedded, tgt_embedded, src_key_padding_mask=src_mask, tgt_key_padding_mask=tgt_mask
        )
        
        # Final output layer for token predictions
        out = self.fc_out(transformer_output)
        return out

# Example usage
src_vocab_size = 10000 # Source vocabulary size
tgt_vocab_size = 10000 # Target vocabulary size
embed_size = 16
num_heads = 4
num_layers = 3
forward_expansion = 4 # Feedforward expansion factor
dropout = 0.1
max_length = 100 # Maximum sequence length

transformer = TransformerModel(src_vocab_size, tgt_vocab_size, embed_size, num_heads, num_layers, forward_expansion, dropout, max_length)
src = torch.randint(0, src_vocab_size, (2, 5))  # Batch size = 2, Sequence length = 5
print("Source Input:", src)
tgt = torch.randint(0, tgt_vocab_size, (2, 5))
print("Target Input:", tgt)

output = transformer(src, tgt)
print("Transformer Model Output (logits):", output) # logits for each token in the target sequence
    

Source Input: tensor([[ 801, 3345, 6259, 9247, 5210],
        [6625, 7970, 3600, 8639, 6364]])
Target Input: tensor([[ 317, 2920, 9425, 1596, 1245],
        [6418, 3183,  389, 5557, 7705]])
Transformer Model Output (logits): tensor([[[-0.2127,  0.6445, -0.1000,  ..., -0.6889, -0.7662, -0.3775],
         [-0.3332,  0.7756,  0.0581,  ..., -1.2720,  0.3419, -0.4285],
         [ 0.0414,  0.5423,  0.3065,  ..., -0.7987, -0.7487,  0.2474],
         [-0.1450,  1.2587, -0.5921,  ..., -0.4057, -0.9128, -0.2640],
         [-0.0619,  1.0298,  0.5166,  ..., -1.2149,  0.0316, -0.5781]],

        [[-0.1878,  0.7941, -0.0325,  ..., -0.2016,  0.9408, -0.3650],
         [-0.1603,  1.0844, -0.2845,  ..., -0.8920, -0.2510, -0.5387],
         [-0.1987,  0.6298,  0.2243,  ..., -0.2841,  0.1619, -0.1662],
         [ 0.2598,  0.6405, -0.0019,  ..., -0.9852, -0.6150, -0.6857],
         [ 0.4340,  0.4203,  0.5398,  ..., -1.0211, -0.0523,  0.4455]]],
       grad_fn=<ViewBackward0>)


### 2. Training Loop Example


**Objective**: Demonstrate a basic training loop with a forward and backward pass, typically used to train the Transformer on sequence-to-sequence tasks.

**Explanation**:
   - **Loss Calculation**: The model output is compared to the target sequence to compute cross-entropy loss.
   - **Backpropagation**: The loss is used to adjust model weights through gradient descent.
   - **Optimization**: Each batch adjusts model parameters to improve predictions.
    

In [None]:
# Example training loop
num_epochs = 10
learning_rate = 0.001
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Assuming padding index is 0
optimizer = optim.Adam(transformer.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    transformer.train()
    optimizer.zero_grad()

    output = transformer(src, tgt[:, :-1])  # Input all tokens except the last one
    # [batch_size, sequence_length, vocab_size]
    output = output.reshape(-1, output.shape[2])  # [batch_size * sequence_length, vocab_size]
    tgt_output = tgt[:, 1:].reshape(-1)  # Expected output sequence shifted by one
    # tgt[:, 1:]: The target sequence, excluding the first token. 
    # This is done because the model's prediction at each time step should be compared to the next token in the sequence.
    # tgt[:, 1:].reshape(-1): Reshapes the target sequence to a 1D tensor of shape [batch_size * sequence_length]. 
    # This flattening aligns with the reshaped model output for loss computation.
    
    loss = criterion(output, tgt_output)
    loss.backward()
    optimizer.step()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")

Epoch [1/10], Loss: 9.1753
Epoch [2/10], Loss: 8.8822
Epoch [3/10], Loss: 8.8661
Epoch [4/10], Loss: 8.7572
Epoch [5/10], Loss: 8.5909
Epoch [6/10], Loss: 8.4162
Epoch [7/10], Loss: 8.4443
Epoch [8/10], Loss: 8.2908
Epoch [9/10], Loss: 8.3136
Epoch [10/10], Loss: 8.1716
