# Assignment 4
### 2024-07-31
### Yuwei Hsu

<div class="alert alert-block alert-warning">
<b>Note:</b> This code file is ONLY for Question 1 in Assignment 4 .
For Question 2, please refer to the other file provided in the upload session.

Thank you.
</div>

## Question 1

In [31]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import MarianTokenizer, get_linear_schedule_with_warmup
from sacrebleu.metrics import BLEU

In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import numpy as np
from datasets import load_dataset
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers, processors
from tokenizers.processors import TemplateProcessing
from tqdm import tqdm

### Data preprocessing

In [37]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm

from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm

# Data Preprocessing
class TranslationDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    
    def __len__(self):
        return len(self.encodings['input_ids'])

def encode_translation_data(data, tokenizer_src, tokenizer_tgt, max_length, batch_size=1000):
    all_input_ids = []
    all_attention_masks = []
    all_labels = []
    
    for i in tqdm(range(0, len(data), batch_size), desc="Encoding data"):
        batch = data.select(range(i, min(i + batch_size, len(data))))
        translations = batch['translation']
        src_texts = [item['en'] for item in translations]
        tgt_texts = [item['fr'] for item in translations]
        
        src_encodings = tokenizer_src(src_texts, padding='max_length', truncation=True, max_length=max_length)
        tgt_encodings = tokenizer_tgt(tgt_texts, padding='max_length', truncation=True, max_length=max_length)
        
        all_input_ids.extend(src_encodings['input_ids'])
        all_attention_masks.extend(src_encodings['attention_mask'])
        all_labels.extend(tgt_encodings['input_ids'])
    
    return {
        'input_ids': all_input_ids,
        'attention_mask': all_attention_masks,
        'labels': all_labels
    }

def preprocess_data(train_size, valid_size, max_length, batch_size):
    # Load dataset
    print("Loading dataset...")
    dataset = load_dataset("wmt14", "fr-en")
    
    # Select training and validation data
    train_data = dataset['train'].select(range(train_size))
    valid_data = dataset['validation'].select(range(valid_size))
    
    # Load pre-trained tokenizers
    print("\nLoading pre-trained tokenizers...")
    tokenizer_src = AutoTokenizer.from_pretrained("bert-base-uncased")
    tokenizer_tgt = AutoTokenizer.from_pretrained("camembert-base")
    
    # Encode data
    print("Encoding training data...")
    train_encodings = encode_translation_data(train_data, tokenizer_src, tokenizer_tgt, max_length)
    print("Encoding validation data...")
    valid_encodings = encode_translation_data(valid_data, tokenizer_src, tokenizer_tgt, max_length)
    
    # Create datasets
    train_dataset = TranslationDataset(train_encodings)
    valid_dataset = TranslationDataset(valid_encodings)
    
    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    
    return train_loader, valid_loader, tokenizer_src, tokenizer_tgt


# Usage example
if __name__ == "__main__":
    train_size = 100000
    valid_size = 1000
    max_length = 128
    batch_size = 32
    
    train_loader, valid_loader, tokenizer_src, tokenizer_tgt = preprocess_data(train_size, valid_size, max_length, batch_size)
    
    print(f"\nNumber of training batches: {len(train_loader)}")
    print(f"Number of validation batches: {len(valid_loader)}")
    
    # Check the first batch of data
    for batch in train_loader:
        print("\nSample batch shape:")
        print(f"Input IDs: {batch['input_ids'].shape}")
        print(f"Attention Mask: {batch['attention_mask'].shape}")
        print(f"Labels: {batch['labels'].shape}")
        break

Loading dataset...


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]


Loading pre-trained tokenizers...
Encoding training data...


Encoding data: 100% 100/100 [00:12<00:00,  7.91it/s]


Encoding validation data...


Encoding data: 100% 1/1 [00:00<00:00, 13.54it/s]



Number of training batches: 3125
Number of validation batches: 32

Sample batch shape:
Input IDs: torch.Size([32, 128])
Attention Mask: torch.Size([32, 128])
Labels: torch.Size([32, 128])


In [34]:
from transformers import AutoTokenizer

def check_tokenization(tokenizer, sample_texts):
    print(f"Tokenizer vocabulary size: {tokenizer.vocab_size}")
    print(f"Tokenizer name: {tokenizer.name_or_path}")
    print("\nTokenization Examples:")
    
    for text in sample_texts:
        print(f"\nOriginal: {text}")
        
        # Encode the text
        encoded = tokenizer.encode(text, add_special_tokens=True)
        
        # Get the actual tokens
        tokens = tokenizer.convert_ids_to_tokens(encoded)
        
        # Decode back to text
        decoded = tokenizer.decode(encoded)
        
        print(f"Encoded token IDs: {encoded}")
        print(f"Actual tokens: {tokens}")
        print(f"Decoded: {decoded}")
        
        # Check if any unknown tokens were used
        unknown_tokens = [token for token in tokens if token == tokenizer.unk_token]
        if unknown_tokens:
            print(f"Warning: {len(unknown_tokens)} unknown tokens found.")

# Usage example
if __name__ == "__main__":
    # Load English tokenizer
    tokenizer_src = AutoTokenizer.from_pretrained("bert-base-uncased")
    
    # Load French tokenizer
    tokenizer_tgt = AutoTokenizer.from_pretrained("camembert-base")
    
    # Prepare some sample texts
    english_samples = [
        "I'm fine thank you",
        "Hello, how are you doing today?",
        "The quick brown fox jumps over the lazy dog.",
        "In machine learning, tokenization is a fundamental preprocessing step."
    ]
    
    french_samples = [
        "Je vais bien merci",
        "Bonjour, comment allez-vous aujourd'hui ?",
        "Le renard brun rapide saute par-dessus le chien paresseux.",
        "En apprentissage automatique, la tokenisation est une étape de prétraitement fondamentale."
    ]
    
    print("Checking English Tokenizer:")
    check_tokenization(tokenizer_src, english_samples)
    
    print("\n" + "="*50 + "\n")
    
    print("Checking French Tokenizer:")
    check_tokenization(tokenizer_tgt, french_samples)


Checking English Tokenizer:
Tokenizer vocabulary size: 30522
Tokenizer name: bert-base-uncased

Tokenization Examples:

Original: I'm fine thank you
Encoded token IDs: [101, 1045, 1005, 1049, 2986, 4067, 2017, 102]
Actual tokens: ['[CLS]', 'i', "'", 'm', 'fine', 'thank', 'you', '[SEP]']
Decoded: [CLS] i'm fine thank you [SEP]

Original: Hello, how are you doing today?
Encoded token IDs: [101, 7592, 1010, 2129, 2024, 2017, 2725, 2651, 1029, 102]
Actual tokens: ['[CLS]', 'hello', ',', 'how', 'are', 'you', 'doing', 'today', '?', '[SEP]']
Decoded: [CLS] hello, how are you doing today? [SEP]

Original: The quick brown fox jumps over the lazy dog.
Encoded token IDs: [101, 1996, 4248, 2829, 4419, 14523, 2058, 1996, 13971, 3899, 1012, 102]
Actual tokens: ['[CLS]', 'the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.', '[SEP]']
Decoded: [CLS] the quick brown fox jumps over the lazy dog. [SEP]

Original: In machine learning, tokenization is a fundamental preprocessing step.


### Model definition

In [38]:
# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0), :]

# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        
        Q = self.W_q(query).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(key).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(value).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        
        attn = self.dropout(torch.softmax(scores, dim=-1))
        
        context = torch.matmul(attn, V)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        
        output = self.W_o(context)
        
        return output

# Feed-Forward Network
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.linear2(self.dropout(torch.relu(self.linear1(x))))

# Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

# Decoder Layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.cross_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

# Encoder
class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, max_seq_length, dropout):
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_seq_length)
        self.layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        x = self.embed(x) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        x = self.dropout(x)
        for layer in self.layers:
            x = layer(x, mask)
        return x

# Decoder
class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, max_seq_length, dropout):
        super(Decoder, self).__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_seq_length)
        self.layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        x = self.embed(x) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        x = self.dropout(x)
        for layer in self.layers:
            x = layer(x, enc_output, src_mask, tgt_mask)
        return x

# Transformer Model
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_layers, num_heads, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder = Encoder(src_vocab_size, d_model, num_layers, num_heads, d_ff, max_seq_length, dropout)
        self.decoder = Decoder(tgt_vocab_size, d_model, num_layers, num_heads, d_ff, max_seq_length, dropout)
        self.linear = nn.Linear(d_model, tgt_vocab_size)
    
    def forward(self, src, tgt, src_mask, tgt_mask):
        enc_output = self.encoder(src, src_mask)
        dec_output = self.decoder(tgt, enc_output, src_mask, tgt_mask)
        output = self.linear(dec_output)
        return output

    def encode(self, src, src_mask):
        return self.encoder(src, src_mask)

    def decode(self, tgt, memory, src_mask, tgt_mask):
        return self.decoder(tgt, memory, src_mask, tgt_mask)

### Training and evaluation

In [39]:
# Create masks
def create_mask(src, tgt, pad_token_id):
    src_mask = (src != pad_token_id).unsqueeze(1).unsqueeze(2)
    tgt_mask = (tgt != pad_token_id).unsqueeze(1).unsqueeze(3)
    seq_length = tgt.size(1)
    nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
    tgt_mask = tgt_mask & nopeak_mask.to(tgt.device)
    return src_mask, tgt_mask

# Train for one epoch
def train_epoch(model, data_loader, optimizer, criterion, pad_token_id, device, scheduler):
    model.train()
    total_loss = 0
    start_time = time.time()
    
    progress_bar = tqdm(total=len(data_loader), desc="Training", leave=False)
    
    for batch in data_loader:
        src = batch['input_ids'].to(device)
        tgt = batch['labels'].to(device)
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        
        src_mask, tgt_mask = create_mask(src, tgt_input, pad_token_id)
        
        optimizer.zero_grad()
        output = model(src, tgt_input, src_mask, tgt_mask)
        
        output = output.contiguous().view(-1, output.size(-1))
        tgt_output = tgt_output.contiguous().view(-1)
        
        loss = criterion(output, tgt_output)
        loss.backward()
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()
        scheduler.step()
        
        total_loss += loss.item()
        
        progress_bar.update(1)
        progress_bar.set_postfix({'loss': f"{loss.item():.4f}", 'lr': f"{scheduler.get_last_lr()[0]:.6f}"})
    
    progress_bar.close()
    
    avg_loss = total_loss / len(data_loader)
    elapsed_time = time.time() - start_time
    
    return avg_loss, elapsed_time

# Evaluate the model
def evaluate(model, data_loader, criterion, pad_token_id, device):
    model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating", leave=False):
            src = batch['input_ids'].to(device)
            tgt = batch['labels'].to(device)
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]
            
            src_mask, tgt_mask = create_mask(src, tgt_input, pad_token_id)
            
            output = model(src, tgt_input, src_mask, tgt_mask)
            
            output = output.contiguous().view(-1, output.size(-1))
            tgt_output = tgt_output.contiguous().view(-1)
            
            loss = criterion(output, tgt_output)
            
            total_loss += loss.item()
    
    return total_loss / len(data_loader)

# Train the Transformer model
def train_transformer(model, train_loader, valid_loader, criterion, optimizer, scheduler, num_epochs, device, pad_token_id):
    best_valid_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        # Training loop
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
            src = batch['input_ids'].to(device)
            tgt = batch['labels'].to(device)
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            src_mask, tgt_mask = create_mask(src, tgt_input, pad_token_id)

            optimizer.zero_grad()
            output = model(src, tgt_input, src_mask, tgt_mask)
            loss = criterion(output.contiguous().view(-1, output.size(-1)), tgt_output.contiguous().view(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)

        # Validation loop
        model.eval()
        total_valid_loss = 0
        with torch.no_grad():
            for batch in tqdm(valid_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
                src = batch['input_ids'].to(device)
                tgt = batch['labels'].to(device)
                tgt_input = tgt[:, :-1]
                tgt_output = tgt[:, 1:]

                src_mask, tgt_mask = create_mask(src, tgt_input, pad_token_id)

                output = model(src, tgt_input, src_mask, tgt_mask)
                loss = criterion(output.contiguous().view(-1, output.size(-1)), tgt_output.contiguous().view(-1))

                total_valid_loss += loss.item()

        avg_valid_loss = total_valid_loss / len(valid_loader)

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Valid Loss: {avg_valid_loss:.4f}")

        if avg_valid_loss < best_valid_loss:
            best_valid_loss = avg_valid_loss
            torch.save(model.state_dict(), f'best_model_epoch_{epoch+1}.pt')
            print(f"New best model saved with validation loss: {best_valid_loss:.4f}")

# Learning rate scheduler
def get_lr_scheduler(optimizer, d_model, warmup_steps):
    def lr_lambda(step):
        step = max(1, step)  # Avoid division by zero
        return min(step ** (-0.5), step * (warmup_steps ** (-1.5))) * (d_model ** (-0.5))
    
    return LambdaLR(optimizer, lr_lambda)


### Train transformer

In [44]:
def save_model(model, path):
    torch.save(model.state_dict(), path)

def load_model(model_class, path, *args, **kwargs):
    model = model_class(*args, **kwargs)
    model.load_state_dict(torch.load(path))
    model.eval()  # Set the model to evaluation mode
    return model

# Training script
if __name__ == "__main__":
    # Data preprocessing
    train_size = 100000
    valid_size = 1000
    max_length = 128
    batch_size = 32
    
    train_loader, valid_loader, tokenizer_src, tokenizer_tgt = preprocess_data(train_size, valid_size, max_length, batch_size)
    
    print(f"\nNumber of training batches: {len(train_loader)}")
    print(f"Number of validation batches: {len(valid_loader)}")
    
    # Check the first batch of data
    for batch in train_loader:
        print("\nSample batch shape:")
        print(f"Input IDs: {batch['input_ids'].shape}")
        print(f"Attention Mask: {batch['attention_mask'].shape}")
        print(f"Labels: {batch['labels'].shape}")
        break

    # Initialize tokenizers
    tokenizer_src = BertTokenizerFast.from_pretrained("bert-base-uncased")
    tokenizer_tgt = CamembertTokenizerFast.from_pretrained("camembert-base")

    # Get vocabulary size
    src_vocab_size = len(tokenizer_src.vocab)
    tgt_vocab_size = len(tokenizer_tgt.vocab)

    # Set other parameters
    d_model = 512
    num_layers = 6
    num_heads = 8
    d_ff = 2048
    max_seq_length = 128
    dropout = 0.1

    num_epochs = 2

    # Get pad token ID
    pad_token_id = tokenizer_src.pad_token_id

    # Ensure using the correct device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize the model
    model = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_layers, num_heads, d_ff, max_seq_length, dropout).to(device)

    # Initialize the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

    # Initialize the loss function
    criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id, reduction='mean')

    warmup_steps = 4000
    scheduler = get_lr_scheduler(optimizer, d_model, warmup_steps)

    print(f"Source vocabulary size: {src_vocab_size}")
    print(f"Target vocabulary size: {tgt_vocab_size}")
    print(f"Pad token ID: {pad_token_id}")
    print(f"Device: {device}")

    print(f"Model parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")
    print(f"Initial learning rate: {optimizer.param_groups[0]['lr']}")
    print(f"Number of epochs: {num_epochs}")
    print(f"Warmup steps: {warmup_steps}")

    # Train the model
    train_transformer(model, train_loader, valid_loader, criterion, optimizer, scheduler, num_epochs, device, pad_token_id)

    # Save the trained model
    save_model(model, 'transformer_model.pth')

Loading dataset...


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]


Loading pre-trained tokenizers...
Encoding training data...


Encoding data: 100% 100/100 [00:12<00:00,  7.72it/s]


Encoding validation data...


Encoding data: 100% 1/1 [00:00<00:00, 14.43it/s]



Number of training batches: 3125
Number of validation batches: 32

Sample batch shape:
Input IDs: torch.Size([32, 128])
Attention Mask: torch.Size([32, 128])
Labels: torch.Size([32, 128])
Source vocabulary size: 30522
Target vocabulary size: 32004
Pad token ID: 0
Device: cuda
Model parameters: 92569860
Initial learning rate: 1.746928107421711e-11
Number of epochs: 2
Warmup steps: 4000


Epoch 1/2 - Training: 100% 3125/3125 [28:58<00:00,  1.80it/s]
Epoch 1/2 - Validation: 100% 32/32 [00:05<00:00,  5.77it/s]


Epoch 1/2, Train Loss: 8.3708, Valid Loss: 4.1679
New best model saved with validation loss: 4.1679


Epoch 2/2 - Training: 100% 3125/3125 [28:57<00:00,  1.80it/s]
Epoch 2/2 - Validation: 100% 32/32 [00:05<00:00,  5.76it/s]


Epoch 2/2, Train Loss: 3.4544, Valid Loss: 2.1944
New best model saved with validation loss: 2.1944


In [None]:
import torch
from transformers import BertTokenizerFast, CamembertTokenizerFast

# Define the Transformer model and necessary components here (or import them if they are in a separate file)
# Positional Encoding, MultiHeadAttention, FeedForward, EncoderLayer, DecoderLayer, Encoder, Decoder, Transformer classes remain the same...

# Mask creation, tokenization, and greedy decoding functions
def tokenize_input(sentence, tokenizer, max_length):
    tokens = tokenizer(sentence, return_tensors='pt', padding='max_length', truncation=True, max_length=max_length)
    return tokens.input_ids, tokens.attention_mask

def create_src_mask(src, pad_token_id):
    return (src != pad_token_id).unsqueeze(1).unsqueeze(2)

def greedy_decode(model, src, src_mask, max_length, start_token_id, end_token_id, pad_token_id):
    src = src.to(device)
    src_mask = src_mask.to(device)
    memory = model.encode(src, src_mask)
    
    ys = torch.ones(1, 1).fill_(start_token_id).type_as(src.data).to(device)
    
    for i in range(max_length - 1):
        tgt_mask = create_src_mask(ys, pad_token_id).to(device)
        out = model.decode(ys, memory, src_mask, tgt_mask)
        prob = model.linear(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()
        
        ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word).to(device)], dim=1)
        
        if next_word == end_token_id:
            break
    
    return ys

def translate_sentence(model, sentence, tokenizer_src, tokenizer_tgt, max_length, device, pad_token_id):
    model.eval()
    src, src_mask = tokenize_input(sentence, tokenizer_src, max_length)
    src_mask = create_src_mask(src, pad_token_id)
    start_token_id = tokenizer_tgt.cls_token_id
    end_token_id = tokenizer_tgt.sep_token_id
    
    translated_tokens = greedy_decode(model, src, src_mask, max_length, start_token_id, end_token_id, pad_token_id)
    
    translated_sentence = tokenizer_tgt.decode(translated_tokens[0], skip_special_tokens=True)
    return translated_sentence

# Load the model and perform inference
if __name__ == "__main__":
    # Set parameters
    max_length = 128
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize tokenizers
    tokenizer_src = BertTokenizerFast.from_pretrained("bert-base-uncased")
    tokenizer_tgt = CamembertTokenizerFast.from_pretrained("camembert-base")

    # Get vocabulary size
    src_vocab_size = len(tokenizer_src.vocab)
    tgt_vocab_size = len(tokenizer_tgt.vocab)

    # Set model parameters
    d_model = 512
    num_layers = 6
    num_heads = 8
    d_ff = 2048
    dropout = 0.1

    # Get pad token ID
    pad_token_id = tokenizer_src.pad_token_id

    # Load the model
    model = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_layers, num_heads, d_ff, max_seq_length, dropout).to(device)
    model.load_state_dict(torch.load('transformer_model.pth'))
    model.eval()  # Set the model to evaluation mode

    # Translate a sample sentence
    input_sentence = "Hello, how are you?"
    translated_sentence = translate_sentence(model, input_sentence, tokenizer_src, tokenizer_tgt, max_length, device, pad_token_id)
    print(f"Translated sentence: {translated_sentence}")

    # Debugging information
    print("\nDebugging Information:")
    print(f"Input Sentence: {input_sentence}")
    print(f"Source Tokens: {tokenizer_src.encode(input_sentence)}")
    print(f"Translated Tokens: {translated_sentence}")

    # Check special tokens
    print(f"Start token ID: {tokenizer_tgt.cls_token_id}")
    print(f"End token ID: {tokenizer_tgt.sep_token_id}")
    print(f"Pad token ID: {pad_token_id}")

    # Print each decoding step's tokens
    src, src_mask = tokenize_input(input_sentence, tokenizer_src, max_length)
    src_mask = create_src_mask(src, pad_token_id)
    start_token_id = tokenizer_tgt.cls_token_id
    end_token_id = tokenizer_tgt.sep_token_id

    translated_tokens = greedy_decode(model, src, src_mask, max_length, start_token_id, end_token_id, pad_token_id)
    print(f"Decoded Token IDs: {translated_tokens[0].tolist()}")
    print(f"Decoded Tokens: {tokenizer_tgt.convert_ids_to_tokens(translated_tokens[0])}")
