# Mini Project - Deep Learning Course 2026

- Download the WMT14 dataset from Hugging Face (`wmt14`) and use its `fr-en` subset.

## 1. Setup and Configuration

This implementation follows "Sequence to Sequence Learning with Neural Networks" (Sutskever et al., 2014) for English→French machine translation on the WMT14 dataset. We use a 4-layer LSTM encoder-decoder architecture with scaled-down dimensions for computational constraints.

In [5]:
"""
Sequence to Sequence Learning with Neural Networks
Reimplementation of Sutskever et al., 2014 for English→French translation

Architecture: 4-layer LSTM encoder-decoder with teacher forcing
Dataset: WMT14 fr-en (10k train, 1k val, 1k test samples)
"""

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from datasets import load_dataset
from collections import Counter
import random
import numpy as np
from tqdm.auto import tqdm
import sacrebleu
from typing import List, Tuple, Dict, Optional

# ==================== Configuration ====================
class Config:
    """Single place for all hyperparameters (reduced from the paper to fit limited compute)."""

    # --- Data ---
    SEED = 42
    TRAIN_SIZE = 10_000
    VAL_SIZE = 1_000
    TEST_SIZE = 1_000
    MAX_SEQ_LEN = 50  # token sequences longer than this are truncated

    # --- Vocabulary ---
    MIN_FREQ = 2  # words seen fewer times than this are left out
    MAX_VOCAB_SIZE = 30_000

    # --- Model (the paper uses 1000-dim layers; smaller here for compute) ---
    EMBEDDING_DIM = 256
    HIDDEN_DIM = 512
    NUM_LAYERS = 4  # same depth as the paper
    DROPOUT = 0.2

    # --- Training ---
    BATCH_SIZE = 64
    LEARNING_RATE = 1e-3
    EPOCHS = 10
    TEACHER_FORCING_RATIO = 0.5
    CLIP_GRAD = 5.0  # gradient-norm clipping threshold, as in the paper

    # --- Device: GPU when one is visible, CPU otherwise ---
    DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Set random seeds for reproducibility
def set_seed(seed: int = Config.SEED):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN kernels."""
    torch.backends.cudnn.deterministic = True

    # The three generators are independent, so seeding order is irrelevant.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Seed every visible GPU as well when CUDA is present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Fix all RNGs first, then report the run configuration in one go.
set_seed()
print(
    f"Using device: {Config.DEVICE}",
    f"Model configuration: {Config.NUM_LAYERS} layers, {Config.HIDDEN_DIM} hidden dim, {Config.EMBEDDING_DIM} embedding dim",
    sep="\n",
)

Using device: cpu
Model configuration: 4 layers, 512 hidden dim, 256 embedding dim


In [4]:
# Install the third-party packages this notebook imports (-q keeps pip output quiet).
!pip install datasets sacrebleu rich -q

In [6]:
from datasets import load_dataset
import rich as ri

# Load the WMT14 French-English configuration, caching files under ./data.
raw_dataset = load_dataset("wmt14", "fr-en", cache_dir="./data")

# Pretty-print the loaded dataset object, then the first training record.
ri.print(raw_dataset)
ri.print(raw_dataset["train"][0])

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

## 2. Data Loading and Preprocessing

Load WMT14 English-French dataset and create reproducible subsets:
- 10,000 training examples
- 1,000 validation examples  
- 1,000 test examples

In [7]:
# Shuffle each split with the shared seed and keep a fixed-size prefix,
# so every run selects the same examples.
train_ds, val_ds, test_ds = (
    raw_dataset[split].shuffle(seed=Config.SEED).select(range(size))
    for split, size in (
        ("train", Config.TRAIN_SIZE),
        ("validation", Config.VAL_SIZE),
        ("test", Config.TEST_SIZE),
    )
)

# Report the subset sizes and show the first training pair.
print(
    f"Training examples: {len(train_ds)}",
    f"Validation examples: {len(val_ds)}",
    f"Test examples: {len(test_ds)}",
    f"\nSample pair:",
    f"  English: {train_ds[0]['translation']['en']}",
    f"  French:  {train_ds[0]['translation']['fr']}",
    sep="\n",
)

Training examples: 10000
Validation examples: 1000
Test examples: 1000

Sample pair:
  English: It should also be recalled that Australia and Japan have announced ambitious goals - not yet in binding terms, certainly, but at a political level.
  French:  Il faut rappeler aussi que l'Australie et le Japon ont annoncé - pas encore en termes contraignants, certes, mais déjà sur un plan politique - des objectifs ambitieux.


### 2.1 Vocabulary Class

Build source (English) and target (French) vocabularies with special tokens:
- `<pad>`: Padding token
- `<bos>`: Beginning of sequence
- `<eos>`: End of sequence  
- `<unk>`: Unknown token

In [8]:
class Vocabulary:
    """
    Word-level vocabulary with special tokens.

    Maps lowercased, whitespace-separated words to integer ids and back.
    Ids 0-3 are reserved, in this order, for <pad>, <bos>, <eos> and <unk>.
    """
    PAD_TOKEN = "<pad>"
    BOS_TOKEN = "<bos>"
    EOS_TOKEN = "<eos>"
    UNK_TOKEN = "<unk>"

    def __init__(self, min_freq: int = 2, max_size: int = 30_000):
        """
        Args:
            min_freq: Words occurring fewer times than this are excluded.
            max_size: Upper bound on the number of entries, special tokens included.
        """
        self.min_freq = min_freq
        self.max_size = max_size
        self.word2idx: Dict[str, int] = {}
        self.idx2word: Dict[int, str] = {}
        self._init_special_tokens()

    def _init_special_tokens(self):
        """Register the four special tokens at ids 0..3."""
        special_tokens = [self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN, self.UNK_TOKEN]
        for idx, token in enumerate(special_tokens):
            self.word2idx[token] = idx
            self.idx2word[idx] = token

    @property
    def pad_idx(self) -> int:
        return self.word2idx[self.PAD_TOKEN]

    @property
    def bos_idx(self) -> int:
        return self.word2idx[self.BOS_TOKEN]

    @property
    def eos_idx(self) -> int:
        return self.word2idx[self.EOS_TOKEN]

    @property
    def unk_idx(self) -> int:
        return self.word2idx[self.UNK_TOKEN]

    def __len__(self) -> int:
        return len(self.word2idx)

    def build_vocab(self, sentences: List[str]):
        """
        Build vocabulary from a list of sentences.

        Words are added by descending frequency until either the frequency
        drops below ``min_freq`` or the vocabulary reaches ``max_size``.

        Args:
            sentences: List of sentences (strings)
        """
        counter = Counter()
        for sentence in sentences:
            counter.update(self.tokenize(sentence))

        # most_common() yields (word, count) by descending count; words with
        # equal counts keep their first-seen order.
        for word, freq in counter.most_common():
            if freq < self.min_freq:
                break  # counts only decrease from here on
            if len(self.word2idx) >= self.max_size:
                break
            # A corpus word may coincide with a special token; never re-index it.
            if word not in self.word2idx:
                idx = len(self.word2idx)
                self.word2idx[word] = idx
                self.idx2word[idx] = word

        print(f"Vocabulary built: {len(self)} tokens (min_freq={self.min_freq})")

    @staticmethod
    def tokenize(text: str) -> List[str]:
        """
        Simple word-level tokenization.
        Lowercase and split on whitespace.
        """
        return text.lower().strip().split()

    def numericalize(self, sentence: str, add_special_tokens: bool = True) -> List[int]:
        """
        Convert a sentence to a list of indices.

        Out-of-vocabulary words map to the <unk> id.

        Args:
            sentence: Input sentence string
            add_special_tokens: Whether to add <bos> and <eos>

        Returns:
            List of token indices
        """
        indices = [self.word2idx.get(token, self.unk_idx) for token in self.tokenize(sentence)]

        if add_special_tokens:
            indices = [self.bos_idx] + indices + [self.eos_idx]

        return indices

    def decode(self, indices: List[int], skip_special: bool = True) -> str:
        """
        Convert indices back to a sentence string.

        With ``skip_special=True`` decoding stops at the first <eos> and
        <pad>/<bos> are dropped; <unk> is always kept. With
        ``skip_special=False`` every index is rendered, special tokens included.

        Args:
            indices: List of token indices
            skip_special: Whether to skip special tokens

        Returns:
            Decoded sentence string
        """
        dropped = {self.pad_idx, self.bos_idx}
        tokens = []

        for idx in indices:
            if skip_special:
                # <eos> must be tested before the generic skip; otherwise it is
                # merely skipped and everything after it leaks into the output.
                if idx == self.eos_idx:
                    break
                if idx in dropped:
                    continue
            tokens.append(self.idx2word.get(idx, self.UNK_TOKEN))

        return " ".join(tokens)

In [9]:
# Fit one vocabulary per language, using the training split only.
print("Building source (English) vocabulary...")
src_vocab = Vocabulary(min_freq=Config.MIN_FREQ, max_size=Config.MAX_VOCAB_SIZE)
src_sentences = [pair["translation"]["en"] for pair in train_ds]
src_vocab.build_vocab(src_sentences)

print("\nBuilding target (French) vocabulary...")
tgt_vocab = Vocabulary(min_freq=Config.MIN_FREQ, max_size=Config.MAX_VOCAB_SIZE)
tgt_sentences = [pair["translation"]["fr"] for pair in train_ds]
tgt_vocab.build_vocab(tgt_sentences)

print(
    f"\nSource vocabulary size: {len(src_vocab)}",
    f"Target vocabulary size: {len(tgt_vocab)}",
    sep="\n",
)

Building source (English) vocabulary...
Vocabulary built: 12801 tokens (min_freq=2)

Building target (French) vocabulary...
Vocabulary built: 15084 tokens (min_freq=2)

Source vocabulary size: 12801
Target vocabulary size: 15084


## 3. Dataset and DataLoader

Create PyTorch Dataset and DataLoader with proper padding and batching.

In [10]:
class TranslationDataset(Dataset):
    """
    Map-style dataset yielding (English ids, French ids) tensor pairs.

    Each sentence is numericalized with its vocabulary (including <bos>/<eos>)
    and cut down to at most ``max_len`` ids.
    """
    def __init__(
        self,
        hf_dataset,
        src_vocab: Vocabulary,
        tgt_vocab: Vocabulary,
        max_len: int = Config.MAX_SEQ_LEN
    ):
        self.data = hf_dataset
        self.src_vocab, self.tgt_vocab = src_vocab, tgt_vocab
        self.max_len = max_len

    def __len__(self) -> int:
        return len(self.data)

    def _encode(self, text: str, vocab: Vocabulary) -> torch.Tensor:
        """Numericalize ``text``; if too long, keep the head and re-append <eos>."""
        ids = vocab.numericalize(text)
        if len(ids) > self.max_len:
            ids = ids[:self.max_len-1] + [vocab.eos_idx]
        return torch.tensor(ids)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Returns:
            src_tensor: Source (English) token indices
            tgt_tensor: Target (French) token indices
        """
        pair = self.data[idx]["translation"]
        src_text, tgt_text = pair["en"], pair["fr"]
        return self._encode(src_text, self.src_vocab), self._encode(tgt_text, self.tgt_vocab)


def collate_fn(batch: List[Tuple[torch.Tensor, torch.Tensor]]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Turn a list of (src, tgt) id tensors into padded batch tensors.

    Padding ids come from the module-level ``src_vocab`` / ``tgt_vocab``.

    Returns:
        - src_batch: [batch_size, max_src_len] padded source sequences
        - src_lengths: [batch_size] source lengths before padding
        - tgt_input: [batch_size, max_tgt_len - 1] decoder input (last column dropped)
        - tgt_output: [batch_size, max_tgt_len - 1] decoder target (first column dropped)
    """
    src_seqs, tgt_seqs = zip(*batch)

    # Lengths must be recorded before padding hides them.
    src_lengths = torch.tensor(list(map(len, src_seqs)))

    def _pad(seqs, pad_value):
        return pad_sequence(seqs, batch_first=True, padding_value=pad_value)

    src_batch = _pad(src_seqs, src_vocab.pad_idx)
    tgt_batch = _pad(tgt_seqs, tgt_vocab.pad_idx)

    # Shift by one position: the decoder reads tgt[:-1] and must predict tgt[1:].
    return src_batch, src_lengths, tgt_batch[:, :-1], tgt_batch[:, 1:]


# Wrap each split in a TranslationDataset that shares the two vocabularies.
train_dataset, val_dataset, test_dataset = (
    TranslationDataset(split, src_vocab, tgt_vocab)
    for split in (train_ds, val_ds, test_ds)
)

# Only the training loader shuffles; pinned memory is requested only on CUDA.
train_loader = DataLoader(
    train_dataset,
    batch_size=Config.BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=0,
    pin_memory=Config.DEVICE.type == "cuda",
)
val_loader = DataLoader(val_dataset, batch_size=Config.BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=Config.BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print(
    f"Train batches: {len(train_loader)}",
    f"Val batches: {len(val_loader)}",
    f"Test batches: {len(test_loader)}",
    sep="\n",
)

# Pull one training batch to sanity-check the tensor shapes.
sample_batch = next(iter(train_loader))
src_batch, src_lengths, tgt_input, tgt_output = sample_batch
print(
    f"\nSample batch shapes:",
    f"  src_batch: {src_batch.shape}",
    f"  src_lengths: {src_lengths.shape}",
    f"  tgt_input: {tgt_input.shape}",
    f"  tgt_output: {tgt_output.shape}",
    sep="\n",
)

Train batches: 157
Val batches: 16
Test batches: 16

Sample batch shapes:
  src_batch: torch.Size([64, 50])
  src_lengths: torch.Size([64])
  tgt_input: torch.Size([64, 49])
  tgt_output: torch.Size([64, 49])


## 4. Model Architecture

Implementing the encoder-decoder LSTM architecture from Sutskever et al., 2014:

**Encoder:**
- Embedding layer for source tokens
- 4-layer LSTM (as specified in the paper)
- Returns final hidden/cell states to initialize decoder

**Decoder:**
- Embedding layer for target tokens  
- 4-layer LSTM initialized from encoder states
- Linear layer to project hidden states to vocabulary logits

In [11]:
class Encoder(nn.Module):
    """
    Stacked unidirectional LSTM that reads the source sentence.

    Embeds the source ids, runs them through the LSTM as a packed sequence
    and returns the per-step outputs together with the final (hidden, cell)
    state of every layer.
    """
    def __init__(
        self,
        vocab_size: int,
        embedding_dim: int,
        hidden_dim: int,
        num_layers: int,
        dropout: float,
        pad_idx: int
    ):
        super().__init__()

        # nn.LSTM applies dropout between layers only, so disable it for a single layer.
        inter_layer_dropout = dropout if num_layers > 1 else 0

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=inter_layer_dropout,
            batch_first=True,
            bidirectional=False,  # unidirectional, as in the paper
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(
        self,
        src: torch.Tensor,
        src_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        Args:
            src: [batch_size, src_len] source token indices
            src_lengths: [batch_size] unpadded length of each sequence

        Returns:
            outputs: [batch_size, longest_len_in_batch, hidden_dim] encoder outputs
            (hidden, cell): each [num_layers, batch_size, hidden_dim]
        """
        # [batch_size, src_len, embedding_dim], with dropout on the embeddings
        token_vectors = self.embedding(src)
        token_vectors = self.dropout(token_vectors)

        # Packing lets the LSTM skip padded positions; lengths must live on CPU.
        packed_input = pack_padded_sequence(
            token_vectors,
            src_lengths.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )

        packed_states, final_state = self.lstm(packed_input)
        hidden, cell = final_state

        # Back to a regular padded tensor; the returned lengths are not needed.
        outputs = pad_packed_sequence(packed_states, batch_first=True)[0]

        return outputs, (hidden, cell)


class Decoder(nn.Module):
    """
    Stacked LSTM that emits the target sentence one token per call.

    Each call embeds the current input token, advances the LSTM by one step
    from the supplied (hidden, cell) state and projects the result to
    vocabulary logits.
    """
    def __init__(
        self,
        vocab_size: int,
        embedding_dim: int,
        hidden_dim: int,
        num_layers: int,
        dropout: float,
        pad_idx: int
    ):
        super().__init__()

        self.vocab_size = vocab_size

        # nn.LSTM applies dropout between layers only, so disable it for a single layer.
        inter_layer_dropout = dropout if num_layers > 1 else 0

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=inter_layer_dropout,
            batch_first=True,
        )
        self.fc_out = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(
        self,
        input_token: torch.Tensor,
        hidden: torch.Tensor,
        cell: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Run one decoding step.

        Args:
            input_token: [batch_size, 1] current input token
            hidden: [num_layers, batch_size, hidden_dim]
            cell: [num_layers, batch_size, hidden_dim]

        Returns:
            prediction: [batch_size, vocab_size] logits for next token
            hidden: Updated hidden state
            cell: Updated cell state
        """
        # [batch_size, 1, embedding_dim], with dropout on the embedding
        token_vector = self.embedding(input_token)
        token_vector = self.dropout(token_vector)

        # One LSTM step: step_output is [batch_size, 1, hidden_dim]
        step_output, next_state = self.lstm(token_vector, (hidden, cell))
        hidden, cell = next_state

        # Drop the length-1 time axis, then map to [batch_size, vocab_size]
        prediction = self.fc_out(step_output.squeeze(1))

        return prediction, hidden, cell


class Seq2Seq(nn.Module):
    """
    Sequence to Sequence model combining Encoder and Decoder.

    The encoder's final (hidden, cell) state initializes the decoder, which is
    then unrolled one step at a time. At every step the next decoder input is
    either the ground-truth token (teacher forcing) or the model's own argmax
    prediction, chosen at random once per step for the whole batch.
    """
    def __init__(
        self,
        encoder: Encoder,
        decoder: Decoder,
        device: torch.device
    ):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        # Kept for backward compatibility; forward() derives the device from
        # its inputs so the model keeps working after a later .to(...) call.
        self.device = device

    def forward(
        self,
        src: torch.Tensor,
        src_lengths: torch.Tensor,
        tgt: torch.Tensor,
        teacher_forcing_ratio: float = 0.5
    ) -> torch.Tensor:
        """
        Forward pass with teacher forcing.

        Args:
            src: [batch_size, src_len] source sequences
            src_lengths: [batch_size] source lengths
            tgt: [batch_size, tgt_len] decoder input sequences; column 0 is
                used as the first decoder input
            teacher_forcing_ratio: Probability, per step, of feeding the
                ground-truth token instead of the model's prediction

        Returns:
            outputs: [batch_size, tgt_len, vocab_size] logits, one slice per step
        """
        batch_size = src.shape[0]
        tgt_len = tgt.shape[1]
        tgt_vocab_size = self.decoder.vocab_size

        # Allocate directly on the input's device rather than on CPU followed
        # by a copy to a device recorded at construction time.
        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size, device=src.device)

        # Only the encoder's final state is used; its per-step outputs are discarded.
        _, (hidden, cell) = self.encoder(src, src_lengths)

        # First decoder input is the first column of tgt (<bos>)
        input_token = tgt[:, 0:1]  # [batch_size, 1]

        for t in range(tgt_len):
            prediction, hidden, cell = self.decoder(input_token, hidden, cell)
            outputs[:, t, :] = prediction

            # Drawn on every step (including the last) so the RNG stream does
            # not depend on the sequence length bookkeeping below.
            teacher_force = random.random() < teacher_forcing_ratio

            # No next input is needed after the final step.
            if t < tgt_len - 1:
                if teacher_force:
                    input_token = tgt[:, t+1:t+2]
                else:
                    # Feed back the most likely token: [batch_size, 1]
                    input_token = prediction.argmax(1, keepdim=True)

        return outputs

In [12]:
# Build the encoder first and the decoder second (this order fixes the
# sequence of random parameter initializations), then join and move them.
encoder = Encoder(
    len(src_vocab),
    Config.EMBEDDING_DIM,
    Config.HIDDEN_DIM,
    Config.NUM_LAYERS,
    Config.DROPOUT,
    src_vocab.pad_idx,
)

decoder = Decoder(
    len(tgt_vocab),
    Config.EMBEDDING_DIM,
    Config.HIDDEN_DIM,
    Config.NUM_LAYERS,
    Config.DROPOUT,
    tgt_vocab.pad_idx,
)

model = Seq2Seq(encoder=encoder, decoder=decoder, device=Config.DEVICE)
model = model.to(Config.DEVICE)

# Count parameters
def count_parameters(model: nn.Module) -> int:
    """Return the total number of scalar entries across parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# One print call, three lines: overall size, then the per-component breakdown.
print(
    f"Model has {count_parameters(model):,} trainable parameters",
    f"\nEncoder parameters: {count_parameters(encoder):,}",
    f"Decoder parameters: {count_parameters(decoder):,}",
    sep="\n",
)

Model has 30,638,060 trainable parameters

Encoder parameters: 11,157,760
Decoder parameters: 19,480,300


## 5. Training Pipeline

Training with:
- Cross-entropy loss (ignoring padding)
- Adam optimizer
- Gradient clipping (as recommended in the paper to prevent exploding gradients)
- Teacher forcing with configurable ratio

In [13]:
# Loss and optimizer
# ignore_index excludes <pad> target positions from the cross-entropy average.
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab.pad_idx)
# Adam over every model parameter at the configured learning rate.
optimizer = optim.Adam(model.parameters(), lr=Config.LEARNING_RATE)


def train_epoch(
    model: Seq2Seq,
    dataloader: DataLoader,
    optimizer: optim.Optimizer,
    criterion: nn.Module,
    clip: float,
    teacher_forcing_ratio: float,
    device: torch.device
) -> float:
    """
    Run one optimization pass over ``dataloader``.

    Args:
        model: Model to train (switched to train mode here)
        dataloader: Yields (src_batch, src_lengths, tgt_input, tgt_output)
        optimizer: Optimizer stepped once per batch
        criterion: Loss applied to flattened logits and targets
        clip: Maximum gradient norm
        teacher_forcing_ratio: Forwarded to the model
        device: Device the batch tensors are moved to

    Returns:
        Mean of the per-batch losses
    """
    model.train()
    running_loss = 0

    batches = tqdm(dataloader, desc="Training", leave=False)

    for src_batch, src_lengths, tgt_input, tgt_output in batches:
        # src_lengths is passed through without a device move.
        src_batch, tgt_input, tgt_output = (
            tensor.to(device) for tensor in (src_batch, tgt_input, tgt_output)
        )

        optimizer.zero_grad()

        # logits: [batch_size, tgt_len, vocab_size]
        logits = model(src_batch, src_lengths, tgt_input, teacher_forcing_ratio)

        # Flatten batch and time so each position is one classification example.
        loss = criterion(
            logits.reshape(-1, logits.shape[-1]),
            tgt_output.reshape(-1),
        )

        loss.backward()

        # Rescale gradients whose global norm exceeds `clip` before stepping.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        batches.set_postfix(loss=batch_loss)

    return running_loss / len(dataloader)


def evaluate(
    model: Seq2Seq,
    dataloader: DataLoader,
    criterion: nn.Module,
    device: torch.device
) -> float:
    """
    Compute the mean per-batch loss over ``dataloader`` without gradients.

    The model runs with ``teacher_forcing_ratio=0``, i.e. it is always fed its
    own previous prediction.

    Returns:
        Mean of the per-batch losses
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for src_batch, src_lengths, tgt_input, tgt_output in dataloader:
            src_batch, tgt_input, tgt_output = (
                tensor.to(device) for tensor in (src_batch, tgt_input, tgt_output)
            )

            # Free-running decode: no ground-truth tokens are fed back.
            logits = model(src_batch, src_lengths, tgt_input, teacher_forcing_ratio=0)

            # Flatten batch and time before applying the criterion.
            flat_logits = logits.reshape(-1, logits.shape[-1])
            flat_targets = tgt_output.reshape(-1)

            running_loss += criterion(flat_logits, flat_targets).item()

    return running_loss / len(dataloader)

## 6. Inference and Evaluation

Implement:
- Greedy decoding for inference
- BLEU score computation using sacrebleu
- Translation examples

In [14]:
def greedy_decode(
    model: Seq2Seq,
    src_sentence: str,
    src_vocab: Vocabulary,
    tgt_vocab: Vocabulary,
    max_len: int = Config.MAX_SEQ_LEN,
    device: torch.device = Config.DEVICE
) -> str:
    """
    Translate one sentence by always taking the highest-scoring next token.

    Args:
        model: Seq2Seq model (switched to eval mode here)
        src_sentence: Source sentence string
        src_vocab: Source vocabulary
        tgt_vocab: Target vocabulary
        max_len: Maximum number of generated tokens
        device: Computation device

    Returns:
        Decoded translation string
    """
    model.eval()

    with torch.no_grad():
        # Source ids as a batch of one: [1, src_len]
        src_indices = src_vocab.numericalize(src_sentence)
        src_tensor = torch.tensor([src_indices]).to(device)
        src_lengths = torch.tensor([len(src_indices)])

        # The encoder's final state seeds the decoder.
        _, (hidden, cell) = model.encoder(src_tensor, src_lengths)

        next_input = torch.tensor([[tgt_vocab.bos_idx]]).to(device)
        generated = []

        # Each pass appends exactly one token unless <eos> ends generation early.
        while len(generated) < max_len:
            logits, hidden, cell = model.decoder(next_input, hidden, cell)
            token_id = logits.argmax(1).item()

            if token_id == tgt_vocab.eos_idx:
                break

            generated.append(token_id)
            next_input = torch.tensor([[token_id]]).to(device)

        return tgt_vocab.decode(generated, skip_special=True)


def compute_bleu(
    model: Seq2Seq,
    dataset,
    src_vocab: Vocabulary,
    tgt_vocab: Vocabulary,
    num_samples: int = 100,
    device: torch.device = Config.DEVICE
) -> Tuple[float, List[Tuple[str, str, str]]]:
    """
    Compute case-insensitive corpus BLEU on a random subset of the dataset.

    Hypotheses are built from vocabulary entries, which are lowercased by
    ``Vocabulary.tokenize``, whereas references keep their original casing.
    Scoring is therefore done with ``lowercase=True``; a case-sensitive score
    would count every capitalized reference word as a mismatch.

    Args:
        model: Trained model
        dataset: HuggingFace dataset with ``example["translation"]["en"/"fr"]``
        src_vocab: Source vocabulary
        tgt_vocab: Target vocabulary
        num_samples: Number of samples to evaluate (capped at the dataset size)
        device: Computation device

    Returns:
        BLEU score and list of (source, reference, hypothesis) tuples
    """
    model.eval()

    hypotheses = []
    references = []
    examples = []

    # Sample without replacement using the global `random` state.
    indices = random.sample(range(len(dataset)), min(num_samples, len(dataset)))

    for idx in tqdm(indices, desc="Computing BLEU"):
        pair = dataset[idx]["translation"]
        src_text = pair["en"]
        ref_text = pair["fr"]

        # Generate translation
        hyp_text = greedy_decode(model, src_text, src_vocab, tgt_vocab, device=device)

        hypotheses.append(hyp_text)
        references.append(ref_text)
        examples.append((src_text, ref_text, hyp_text))

    # sacrebleu takes one list per reference set; there is a single set here.
    bleu = sacrebleu.corpus_bleu(hypotheses, [references], lowercase=True)

    return bleu.score, examples


def show_translations(examples: List[Tuple[str, str, str]], num: int = 5):
    """Print a banner followed by the first ``num`` (source, reference, hypothesis) triples."""
    rule = "=" * 80
    print("\n" + rule)
    print("SAMPLE TRANSLATIONS")
    print(rule)

    # Examples are numbered from 1 in the printed output.
    for number, (src, ref, hyp) in enumerate(examples[:num], start=1):
        print(f"\n--- Example {number} ---")
        print(f"Source (EN):     {src}")
        print(f"Reference (FR):  {ref}")
        print(f"Hypothesis (FR): {hyp}")

## 7. Training Loop

Train the model and track progress.

In [None]:
import math

# Per-epoch loss history and the best validation loss seen so far
train_losses, val_losses = [], []
best_val_loss = float("inf")

print(
    "Starting training...",
    f"Configuration: {Config.EPOCHS} epochs, batch size {Config.BATCH_SIZE}",
    f"Teacher forcing ratio: {Config.TEACHER_FORCING_RATIO}",
    f"Gradient clipping: {Config.CLIP_GRAD}",
    "-" * 60,
    sep="\n",
)

for epoch in range(1, Config.EPOCHS + 1):
    # One optimization pass over the training set
    train_loss = train_epoch(
        model=model,
        dataloader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
        clip=Config.CLIP_GRAD,
        teacher_forcing_ratio=Config.TEACHER_FORCING_RATIO,
        device=Config.DEVICE,
    )
    train_losses.append(train_loss)

    # Loss on the held-out validation set
    val_loss = evaluate(model=model, dataloader=val_loader, criterion=criterion, device=Config.DEVICE)
    val_losses.append(val_loss)

    # Perplexity is the exponential of the mean loss
    train_ppl, val_ppl = math.exp(train_loss), math.exp(val_loss)

    # Checkpoint only when validation loss improves
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model.pt")

    print(f"Epoch {epoch:02d} | Train Loss: {train_loss:.4f} (PPL: {train_ppl:.2f}) | "
          f"Val Loss: {val_loss:.4f} (PPL: {val_ppl:.2f})")

print(
    "-" * 60,
    f"Training complete. Best validation loss: {best_val_loss:.4f}",
    sep="\n",
)

Starting training...
Configuration: 10 epochs, batch size 64
Teacher forcing ratio: 0.5
Gradient clipping: 5.0
------------------------------------------------------------


Training:   0%|          | 0/157 [00:00<?, ?it/s]

### Training Visualization

In [None]:
import matplotlib.pyplot as plt

# Two side-by-side panels: loss on the left, perplexity on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: raw losses per epoch (epochs numbered from 1)
axes[0].plot(range(1, len(train_losses) + 1), train_losses, 'b-', label='Train')
axes[0].plot(range(1, len(val_losses) + 1), val_losses, 'r-', label='Validation')
axes[0].set(xlabel='Epoch', ylabel='Loss', title='Training and Validation Loss')
axes[0].legend()
axes[0].grid(True)

# Right panel: perplexity, i.e. exp(loss)
train_ppls = list(map(math.exp, train_losses))
val_ppls = list(map(math.exp, val_losses))
axes[1].plot(range(1, len(train_ppls) + 1), train_ppls, 'b-', label='Train')
axes[1].plot(range(1, len(val_ppls) + 1), val_ppls, 'r-', label='Validation')
axes[1].set(xlabel='Epoch', ylabel='Perplexity', title='Training and Validation Perplexity')
axes[1].legend()
axes[1].grid(True)

# Write the figure to disk, then display it
plt.tight_layout()
plt.savefig('training_curves.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Test Set Evaluation

Load the best model and evaluate on the test set with BLEU score.

In [None]:
# Load best model
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU still loads on a CPU-only machine.
model.load_state_dict(torch.load("best_model.pt", map_location=Config.DEVICE))
model.eval()

# Evaluate on test set
test_loss = evaluate(model, test_loader, criterion, Config.DEVICE)
test_ppl = math.exp(test_loss)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Perplexity: {test_ppl:.2f}")

# Compute BLEU score on test set
print("\nComputing BLEU score on test set...")
bleu_score, examples = compute_bleu(
    model, test_ds, src_vocab, tgt_vocab, 
    num_samples=500,  # Evaluate on 500 samples for speed
    device=Config.DEVICE
)

print(f"\nBLEU Score: {bleu_score:.2f}")

# Show some translations
show_translations(examples, num=10)

## 9. Analysis and Comparison to Paper

### Comparison with Sutskever et al., 2014

The original paper reported its best BLEU score of **34.81** on the WMT'14 En→Fr task (obtained with an ensemble of 5 LSTMs) using:
- Full WMT'14 dataset (~12M sentence pairs)
- 4-layer LSTM with 1000 hidden units per layer
- 1000-dimensional embeddings
- Training on 8 GPUs for several days
- Beam search decoding with beam size 12

**Our implementation differs due to compute constraints:**
- 10,000 training examples (vs ~12M)
- 512 hidden units (vs 1000)  
- 256-dimensional embeddings (vs 1000)
- Greedy decoding (vs beam search)
- Much shorter training time

These factors explain the lower BLEU score. The reduced dataset size is the primary limiting factor, as neural translation models are highly data-hungry.

In [None]:
# Summary report: one print call per section, one line per entry
print(
    "="*60,
    "EXPERIMENT SUMMARY",
    "="*60,
    sep="\n",
)

print(
    f"\nModel Architecture:",
    f"  - Encoder/Decoder layers: {Config.NUM_LAYERS}",
    f"  - Hidden dimension: {Config.HIDDEN_DIM}",
    f"  - Embedding dimension: {Config.EMBEDDING_DIM}",
    f"  - Total parameters: {count_parameters(model):,}",
    sep="\n",
)

print(
    f"\nDataset:",
    f"  - Training examples: {Config.TRAIN_SIZE}",
    f"  - Validation examples: {Config.VAL_SIZE}",
    f"  - Test examples: {Config.TEST_SIZE}",
    f"  - Source vocabulary size: {len(src_vocab)}",
    f"  - Target vocabulary size: {len(tgt_vocab)}",
    sep="\n",
)

print(
    f"\nTraining:",
    f"  - Epochs: {Config.EPOCHS}",
    f"  - Batch size: {Config.BATCH_SIZE}",
    f"  - Learning rate: {Config.LEARNING_RATE}",
    f"  - Teacher forcing ratio: {Config.TEACHER_FORCING_RATIO}",
    f"  - Gradient clipping: {Config.CLIP_GRAD}",
    sep="\n",
)

print(
    f"\nResults:",
    f"  - Best validation loss: {best_val_loss:.4f}",
    f"  - Test loss: {test_loss:.4f}",
    f"  - Test perplexity: {test_ppl:.2f}",
    f"  - BLEU score: {bleu_score:.2f}",
    sep="\n",
)

print(
    f"\nComparison to Paper (Sutskever et al., 2014):",
    f"  - Paper BLEU: 34.81 (on full WMT'14, beam search)",
    f"  - Our BLEU: {bleu_score:.2f} (on 10k subset, greedy decoding)",
    f"  - Key differences: ~1200x less data, ~4x smaller model, greedy decoding",
    sep="\n",
)

## 10. Interactive Translation

Test the model with custom sentences.

In [None]:
def translate(sentence: str) -> str:
    """Greedy-decode ``sentence`` (English) into French with the notebook's global model and vocabularies."""
    model.eval()
    return greedy_decode(
        model,
        sentence,
        src_vocab,
        tgt_vocab,
        device=Config.DEVICE,
    )

# A handful of short English sentences to try the model on
test_sentences = [
    "Hello, how are you?",
    "I love deep learning.",
    "The weather is beautiful today.",
    "What is your name?",
    "Thank you very much.",
]

print("Custom translations:", "-" * 60, sep="\n")
for sentence in test_sentences:
    translation = translate(sentence)
    # Source line, translation line, then one blank separator line
    print(f"EN: {sentence}", f"FR: {translation}", "", sep="\n")