<a href="https://colab.research.google.com/github/gnoejh/ict1022/blob/main/Transformer/12_transformer_code_russian_15.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

# Transformer

### 1. Using nn.Transformer for the Full Model


We will implement the full Transformer model using PyTorch's built-in `nn.Transformer`, which bundles the encoder and decoder stacks of the Transformer architecture. This approach is more efficient than implementing the individual layers by hand.

**Explanation**:
- **nn.Transformer**: Includes all the core components of a Transformer model, including multi-head attention, encoder and decoder layers, and residual connections.
- **nn.Embedding and Positional Encoding**: We use separate embedding and positional encoding modules, as these are not directly included in `nn.Transformer`.

This model can be used for tasks like translation, summarization, and other sequence-to-sequence tasks.
    

In [2]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    The table follows the formulation from "Attention Is All You Need":

        PE[pos, 2k]     = sin(pos / 10000 ** (2k / embed_size))
        PE[pos, 2k + 1] = cos(pos / 10000 ** (2k / embed_size))

    Args:
        embed_size: Width of the embedding vectors the encoding is added to.
            Odd widths are supported (the last sine column has no cosine pair).
        max_len: Number of positions to precompute; ``forward`` slices the
            table down to the sequence length of its input.
    """

    def __init__(self, embed_size, max_len=5000):
        super().__init__()
        # Computed in float64 and cast on assignment so the stored float32
        # values match what a scalar ``math.sin``/``math.cos`` loop would give.
        positions = torch.arange(max_len, dtype=torch.float64).unsqueeze(1)
        # The even column index already equals 2k, so it is used directly as
        # the exponent numerator; doubling it again would yield 4k / embed_size.
        even_dims = torch.arange(0, embed_size, 2, dtype=torch.float64)
        angles = positions / (10000.0 ** (even_dims / embed_size))

        pe = torch.zeros(max_len, embed_size)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd embed_size there is one fewer odd column than even ones.
        pe[:, 1::2] = torch.cos(angles[:, : embed_size // 2])
        # Leading axis of size 1 lets the table broadcast over the batch axis.
        # Registered as a buffer: moves with the module, but is not a parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, sequence, embed_size); the sequence length
        must not exceed the ``max_len`` the table was built with.
        """
        return x + self.pe[:, :x.size(1)]

class TransformerModel(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence token prediction.

    Token ids are embedded, combined with positional encodings, passed through
    ``nn.Transformer`` (configured with ``batch_first=True``) and projected to
    target-vocabulary logits.

    Args:
        src_vocab_size: Number of rows in the source embedding table.
        tgt_vocab_size: Number of rows in the target embedding table and the
            number of output logits per position.
        embed_size: Model width (``d_model``).
        num_heads: Number of attention heads.
        num_layers: Number of encoder layers and of decoder layers.
        forward_expansion: Feed-forward width as a multiple of ``embed_size``.
        dropout: Dropout probability used inside ``nn.Transformer``.
        max_length: Number of positions in the positional-encoding table.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, embed_size, num_heads, num_layers, forward_expansion, dropout, max_length):
        super().__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, embed_size)
        # One table is shared by the source and target sides.
        self.positional_encoding = PositionalEncoding(embed_size, max_len=max_length)

        # Transformer module with encoder and decoder layers
        self.transformer = nn.Transformer(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=forward_expansion * embed_size,
            dropout=dropout,
            batch_first=True,
        )

        self.fc_out = nn.Linear(embed_size, tgt_vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """Return logits of shape (batch, tgt_len, tgt_vocab_size).

        Args:
            src: Source token ids, (batch, src_len).
            tgt: Decoder input token ids, (batch, tgt_len).
            src_mask: Optional key-padding mask for ``src``, (batch, src_len);
                for a boolean mask, ``True`` marks positions to ignore.
            tgt_mask: Optional key-padding mask for ``tgt``, (batch, tgt_len),
                same convention as ``src_mask``.
        """
        # Embedding and positional encoding for source and target
        src_embedded = self.positional_encoding(self.src_embedding(src))
        tgt_embedded = self.positional_encoding(self.tgt_embedding(tgt))

        # Causal mask: True above the diagonal blocks attention to later
        # target positions. Without it the decoder can read the very tokens
        # it is trained to predict under teacher forcing.
        tgt_len = tgt.size(1)
        causal_mask = torch.triu(
            torch.ones(tgt_len, tgt_len, dtype=torch.bool, device=tgt.device),
            diagonal=1,
        )

        transformer_output = self.transformer(
            src_embedded,
            tgt_embedded,
            tgt_mask=causal_mask,
            src_key_padding_mask=src_mask,
            tgt_key_padding_mask=tgt_mask,
            # The encoder output keeps the source layout, so the same padding
            # mask must also hide padded positions from cross-attention.
            memory_key_padding_mask=src_mask,
        )

        # Final output layer for token predictions
        return self.fc_out(transformer_output)

# Example usage: hyperparameters for the toy English -> Russian model
src_vocab_size = 15  # rows reserved in the source (English) embedding table
tgt_vocab_size = 15  # rows in the target (Russian) embedding table / output classes
embed_size = 16
num_heads = 4
num_layers = 3
forward_expansion = 4  # feed-forward width as a multiple of embed_size
dropout = 0.1
max_length = 100  # longest sequence the positional-encoding table covers

# Toy vocabularies for demonstration only: a token's id is its position in the
# list. Real systems use far larger vocabularies produced by a tokenizer.
eng_vocab = {
    token: index
    for index, token in enumerate([
        '<pad>', '<sos>', '<eos>', 'I', 'you', 'love',
        'like', 'books', 'music', 'movies', 'food',
    ])
}
ru_vocab = {
    token: index
    for index, token in enumerate([
        '<pad>', '<sos>', '<eos>', 'я', 'ты', 'люблю',
        'нравится', 'книги', 'музыка', 'фильмы', 'еда',
        'и', 'а', 'в', 'на',
    ])
}
# id -> token lookups, used to turn model output back into words
idx_to_eng = dict(zip(eng_vocab.values(), eng_vocab.keys()))
idx_to_ru = dict(zip(ru_vocab.values(), ru_vocab.keys()))

# Source batch: "I love books" and "you like music", each terminated with
# <eos> and followed by one <pad> token.
src = torch.tensor([
    [eng_vocab[token] for token in ('I', 'love', 'books', '<eos>', '<pad>')],
    [eng_vocab[token] for token in ('you', 'like', 'music', '<eos>', '<pad>')],
])
print("Source sentences:")
for sentence in src:
    print([idx_to_eng[token_id.item()] for token_id in sentence])

# Target batch: "<sos> я люблю книги <eos>" and "<sos> ты нравится музыка <eos>"
tgt = torch.tensor([
    [ru_vocab[token] for token in ('<sos>', 'я', 'люблю', 'книги', '<eos>')],
    [ru_vocab[token] for token in ('<sos>', 'ты', 'нравится', 'музыка', '<eos>')],
])
print("Target sentences:")
for sentence in tgt:
    print([idx_to_ru[token_id.item()] for token_id in sentence])

transformer = TransformerModel(src_vocab_size, tgt_vocab_size, embed_size, num_heads, num_layers, forward_expansion, dropout, max_length)

# Sanity-check forward pass with the untrained model. It runs in eval mode and
# without autograd so dropout does not perturb the printed predictions and no
# gradient graph is built for a result that is only displayed. <pad> positions
# in the source are masked so the encoder does not attend to them.
transformer.eval()
with torch.no_grad():
    output = transformer(src, tgt, src_mask=src == eng_vocab['<pad>'])
transformer.train()  # restore the default mode the model was created in
print(f"Transformer Model Output Shape: {output.shape}")  # [batch_size, seq_len, vocab_size]

# Greedy choice straight from the raw logits: the arg-max is unchanged by a
# softmax, so no probability conversion is needed for this display.
predicted = output.argmax(dim=2)
print("Predicted sequences:")
for sequence in predicted:
    print([idx_to_ru.get(token_id.item(), '<unknown>') for token_id in sequence])

# Example training loop
num_epochs = 100
learning_rate = 0.001
criterion = nn.CrossEntropyLoss(ignore_index=ru_vocab['<pad>'])  # padded labels add no loss
optimizer = optim.Adam(transformer.parameters(), lr=learning_rate)

# Teacher forcing: the decoder is fed the target without its last token and is
# trained to predict the target without its first token (<sos>).
tgt_input = tgt[:, :-1]
tgt_output = tgt[:, 1:].reshape(-1)

# Key-padding masks (True = ignore) so attention skips <pad> positions.
src_pad_mask = src == eng_vocab['<pad>']
tgt_pad_mask = tgt_input == ru_vocab['<pad>']

# Words of the first source sentence, without <eos>/<pad>, for progress output.
source_words = ' '.join(
    idx_to_eng[token_id.item()]
    for token_id in src[0]
    if token_id.item() not in (eng_vocab['<pad>'], eng_vocab['<eos>'])
)

print("\nTraining the model to translate English to Russian:")
for epoch in range(num_epochs):
    transformer.train()
    optimizer.zero_grad()

    output = transformer(src, tgt_input, src_mask=src_pad_mask, tgt_mask=tgt_pad_mask)

    # Flatten to [batch_size * sequence_length, vocab_size] for the loss
    output = output.reshape(-1, output.shape[2])

    loss = criterion(output, tgt_output)
    loss.backward()
    optimizer.step()

    # Progress report on every third epoch and on the final one. The extra
    # forward pass is only run when its result is actually displayed.
    if epoch == num_epochs - 1 or epoch % 3 == 0:
        transformer.eval()
        with torch.no_grad():
            # NOTE: still teacher-forced (the reference prefix is fed to the
            # decoder), so this shows next-token accuracy rather than
            # free-running translation.
            eval_output = transformer(src, tgt_input, src_mask=src_pad_mask, tgt_mask=tgt_pad_mask)
        predicted = eval_output.argmax(dim=2)

        print(f"\nEpoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")
        print(f"Source: {source_words}")
        pred_sentence = [idx_to_ru[idx.item()] for idx in predicted[0]]
        print(f"Predicted translation: {' '.join(pred_sentence)}")
        target_sentence = [idx_to_ru[idx.item()] for idx in tgt[0, 1:]]  # Skip <sos>
        print(f"Target translation: {' '.join(target_sentence)}")

Source sentences:
['I', 'love', 'books', '<eos>', '<pad>']
['you', 'like', 'music', '<eos>', '<pad>']
Target sentences:
['<sos>', 'я', 'люблю', 'книги', '<eos>']
['<sos>', 'ты', 'нравится', 'музыка', '<eos>']
Transformer Model Output Shape: torch.Size([2, 5, 15])
Predicted sequences:
['ты', 'люблю', 'на', 'на', 'люблю']
['ты', 'в', 'на', 'в', 'люблю']

Training the model to translate English to Russian:

Epoch [1/100], Loss: 2.7872
Source: I love books
Predicted translation: ты люблю <eos> <eos>
Target translation: я люблю книги <eos>

Epoch [4/100], Loss: 2.2199
Source: I love books
Predicted translation: <eos> <eos> <eos> <eos>
Target translation: я люблю книги <eos>

Epoch [7/100], Loss: 2.0396
Source: I love books
Predicted translation: ты <eos> <eos> <eos>
Target translation: я люблю книги <eos>

Epoch [10/100], Loss: 1.9940
Source: I love books
Predicted translation: ты <eos> <eos> <eos>
Target translation: я люблю книги <eos>

Epoch [13/100], Loss: 1.9187
Source: I love books
Pr