In [None]:
import os
import sys
import random
import requests
import tokenizers
import tqdm
import unicodedata
import zipfile

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchinfo import summary

# to get deterministic output
#torch.manual_seed(123)

# Put the parent directory (resolved against the current working directory) on the import path
sys.path.append(os.path.abspath(".."))

### Loading the Dataset

In [2]:
DATASET_ZIP_PATH = "../datasets/fra-eng.zip"

# Download the English-French sentence pairs provided by Anki
# (https://www.manythings.org/anki/) unless a local copy already exists
if not os.path.exists(DATASET_ZIP_PATH):
    url = "http://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip"
    # the target directory may be missing on a fresh checkout
    os.makedirs(os.path.dirname(DATASET_ZIP_PATH), exist_ok=True)
    # a timeout keeps the cell from hanging forever on a stalled connection
    response = requests.get(url, timeout=60)
    # fail loudly on HTTP errors instead of caching an error page as the zip,
    # which would make every later run skip the download and break on unzip
    response.raise_for_status()
    with open(DATASET_ZIP_PATH, "wb") as f:
        f.write(response.content)

### Normalize text

In [3]:
# Each line of the file is in the format "<english>\t<french>".
# We convert the text to lowercase and normalize Unicode (NFKC).
def normalize(line):
    """Normalize one line of the corpus and split it into a sentence pair.

    The line is stripped, lowercased and NFKC-normalized, then split on tab
    characters. Only the first two fields are used, so files that carry extra
    tab-separated columns (e.g. an attribution column) are accepted as well.

    Args:
        line: One raw line of text, "<english><TAB><french>" optionally
            followed by further tab-separated fields.

    Returns:
        A tuple ``(english, french)`` of lowercased, stripped strings.

    Raises:
        ValueError: If the line contains fewer than two tab-separated fields.
    """
    line = unicodedata.normalize("NFKC", line.strip().lower())
    fields = line.split("\t")
    if len(fields) < 2:
        raise ValueError(f"expected at least two tab-separated fields, got: {line!r}")
    eng, fra = fields[0], fields[1]
    # Lowercase and strip each field again: NFKC may expand compatibility
    # characters into uppercase letters or plain spaces, and whitespace next
    # to the tab is not removed by the strip on the whole line.
    return eng.lower().strip(), fra.lower().strip()

# Read the tab-separated corpus straight out of the zip archive
with zipfile.ZipFile(DATASET_ZIP_PATH, "r") as zip_ref:
    raw_lines = zip_ref.read("fra.txt").decode("utf-8").splitlines()

# normalize() already returns the (english, french) tuple that gets stored
text_pairs = [normalize(raw_line) for raw_line in raw_lines]

### Tokenization with BPE

In [4]:
DATASET_TOKENIZER_EN = "../datasets/en_tokenizer.json"
DATASET_TOKENIZER_FR = "../datasets/fr_tokenizer.json"

# Previously trained tokenizers are reused only when BOTH files are on disk
tokenizers_cached = os.path.exists(DATASET_TOKENIZER_EN) and os.path.exists(DATASET_TOKENIZER_FR)

if not tokenizers_cached:
    en_tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())
    fr_tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())

    for tok in (en_tokenizer, fr_tokenizer):
        # Byte-level pre-tokenizer; add_prefix_space puts a space in front of the sentence
        tok.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=True)
        # Matching decoder, so the word boundary symbol "Ġ" is removed when decoding
        tok.decoder = tokenizers.decoders.ByteLevel()

    # One trainer configuration shared by the English and the French tokenizer
    VOCAB_SIZE = 8000
    trainer = tokenizers.trainers.BpeTrainer(
        vocab_size=VOCAB_SIZE,
        special_tokens=["[start]", "[end]", "[pad]"],
        show_progress=True
    )
    english_corpus = [pair[0] for pair in text_pairs]
    en_tokenizer.train_from_iterator(english_corpus, trainer=trainer)
    french_corpus = [pair[1] for pair in text_pairs]
    fr_tokenizer.train_from_iterator(french_corpus, trainer=trainer)

    # Pad batches with the "[pad]" token of the respective vocabulary
    for tok in (en_tokenizer, fr_tokenizer):
        tok.enable_padding(pad_id=tok.token_to_id("[pad]"), pad_token="[pad]")

    # Persist the trained tokenizers for later runs
    en_tokenizer.save(DATASET_TOKENIZER_EN, pretty=True)
    fr_tokenizer.save(DATASET_TOKENIZER_FR, pretty=True)
else:
    en_tokenizer = tokenizers.Tokenizer.from_file(DATASET_TOKENIZER_EN)
    fr_tokenizer = tokenizers.Tokenizer.from_file(DATASET_TOKENIZER_FR)

# Round-trip one random sentence pair through both tokenizers
print("Sample tokenization:")
en_sample, fr_sample = random.choice(text_pairs)
sample_checks = (
    (en_tokenizer, en_sample, en_sample),
    # the French side is encoded with its start/end markers, but printed without them
    (fr_tokenizer, fr_sample, "[start] " + fr_sample + " [end]"),
)
for sample_tokenizer, original_text, text_to_encode in sample_checks:
    encoded = sample_tokenizer.encode(text_to_encode)
    print(f"Original: {original_text}")
    print(f"Tokens: {encoded.tokens}")
    print(f"IDs: {encoded.ids}")
    print(f"Decoded: {sample_tokenizer.decode(encoded.ids)}")
    print()

Sample tokenization:
Original: what's the meaning of this phrase?
Tokens: ['Ġwhat', "'s", 'Ġthe', 'Ġmeaning', 'Ġof', 'Ġthis', 'Ġphr', 'ase', '?']
IDs: [155, 135, 86, 2560, 128, 141, 6183, 301, 26]
Decoded:  what's the meaning of this phrase?

Original: que veut dire cette phrase ?
Tokens: ['[start]', 'Ġque', 'Ġveut', 'Ġdire', 'Ġcette', 'Ġphrase', 'Ġ?', 'Ġ', '[end]']
IDs: [0, 116, 778, 374, 291, 2809, 120, 74, 1]
Decoded:  que veut dire cette phrase ? 



### Define DataLoader for the BPE-encoded translation pairs

In [5]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [None]:
class TranslationDataset(Dataset):
    """Map-style dataset over (english, french) sentence pairs.

    Items are returned as raw text; the French side is wrapped in the
    "[start]" and "[end]" marker strings.
    """

    def __init__(self, text_pairs):
        # indexable collection of (english, french) tuples
        self.text_pairs = text_pairs

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.text_pairs)

    def __getitem__(self, idx):
        """Return ``(english, "[start] <french> [end]")`` for the pair at ``idx``."""
        source_text, target_text = self.text_pairs[idx]
        return source_text, " ".join(("[start]", target_text, "[end]"))

def collate_fn(batch):
    """Encode a batch of (english, french) strings into two tensors of token ids.

    Each side is encoded with its own tokenizer (``en_tokenizer`` and
    ``fr_tokenizer``). The id lists are then turned into one tensor per
    language, which only works when all encodings of a side have the same
    length; the tokenizers have padding enabled where they are trained.

    Args:
        batch: Sequence of ``(english, french)`` string pairs.

    Returns:
        A tuple ``(english_ids, french_ids)`` of tensors.
    """
    english_texts, french_texts = zip(*batch)
    english_ids = [
        encoding.ids
        for encoding in en_tokenizer.encode_batch(english_texts, add_special_tokens=True)
    ]
    french_ids = [
        encoding.ids
        for encoding in fr_tokenizer.encode_batch(french_texts, add_special_tokens=True)
    ]
    return torch.tensor(english_ids), torch.tensor(french_ids)

BATCH_SIZE = 32
dataset = TranslationDataset(text_pairs)
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)

# Peek at a single shuffled batch to check what the loader produces
first_batch = next(iter(dataloader), None)
if first_batch is not None:
    en_ids, fr_ids = first_batch
    print(f"English: {en_ids}")
    print(f"French: {fr_ids}")

English: tensor([[  86, 2849,  541,  120,  262,  144, 5999,   75,  410, 2091,  128, 1889,
          277, 2421,   12,    2,    2,    2],
        [  81,  463,  167,  116,  791,  128, 1325, 1582,  328, 2227,   75, 1564,
           12,    2,    2,    2,    2,    2],
        [ 140,  105, 1134,  244,  155,   72,  165,   12,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2],
        [  81,  297, 1050,  159,  698,  135, 1158,   12,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2],
        [  72,  161,   86,  760, 2939,  201,   81,  157, 1501,   12,    2,    2,
            2,    2,    2,    2,    2,    2],
        [  72,  297,  116,  230,   12,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2],
        [  72,  235,  305,   81,   75, 1274,   12,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2],
        [ 121, 3444,  318,   75, 2233,  232,  175,  296, 4574,   86, 2154,   12,
            2,    2, 

### Design LSTM seq2seq model for translation

In [7]:
class EncoderLSTM(nn.Module):
    """Embedding layer followed by a (possibly stacked) unidirectional LSTM."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1, dropout=0.1):
        """
        Build the encoder. A plain (not bidirectional), batch-first LSTM is used.

        Args:
            vocab_size: The size of the input vocabulary
            embedding_dim: The dimension of the embedding vector
            hidden_dim: The dimension of the hidden state
            num_layers: The number of recurrent layers (layers of stacked LSTM)
            dropout: The dropout rate handed to nn.LSTM; replaced by 0 when
                there is only a single layer
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # dropout is only passed on when the LSTM is stacked
        lstm_dropout = dropout if num_layers > 1 else 0

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )

    def forward(self, input_seq):
        """Embed ``input_seq`` and run it through the LSTM.

        Args:
            input_seq: Token ids of shape [batch_size, seq_len]

        Returns:
            ``(outputs, hidden, cell)`` where outputs is
            [batch_size, seq_len, hidden_dim] and hidden / cell are
            [num_layers, batch_size, hidden_dim].
        """
        # [batch_size, seq_len] -> [batch_size, seq_len, embedding_dim]
        embedded = self.embedding(input_seq)
        lstm_out, final_state = self.lstm(embedded)
        hidden, cell = final_state
        return lstm_out, hidden, cell


class DecoderLSTM(nn.Module):
    """Embedding layer, (possibly stacked) LSTM and a linear projection to vocabulary logits."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1, dropout=0.1):
        """
        Args:
            vocab_size: The size of the output vocabulary
            embedding_dim: The dimension of the embedding vector
            hidden_dim: The dimension of the LSTM hidden state
            num_layers: The number of recurrent layers (layers of stacked LSTM)
            dropout: The dropout rate handed to nn.LSTM; replaced by 0 when
                there is only a single layer
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers,
                            batch_first=True, dropout=dropout if num_layers > 1 else 0)
        # The LSTM emits hidden_dim features per step, so the projection has to
        # take hidden_dim inputs. Sizing it with embedding_dim only works when
        # both dimensions happen to be equal.
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_seq, hidden, cell):
        """Run the decoder over ``input_seq`` starting from the given LSTM state.

        Args:
            input_seq: Token ids of shape [batch_size, seq_len]
            hidden: Initial hidden state, [num_layers, batch_size, hidden_dim]
            cell: Initial cell state, [num_layers, batch_size, hidden_dim]

        Returns:
            ``(prediction, hidden, cell)`` where prediction holds the logits,
            [batch_size, seq_len, vocab_size], and hidden / cell are the
            updated LSTM state.
        """
        # [batch_size, seq_len] -> [batch_size, seq_len, embedding_dim]
        embedded = self.embedding(input_seq)
        # output = [batch_size, seq_len, hidden_dim]
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.out(output)
        return prediction, hidden, cell


class Seq2SeqLSTM(nn.Module):
    """Encoder-decoder wrapper that decodes greedily, starting from the first target token."""

    def __init__(self, encoder, decoder):
        """
        Args:
            encoder: Module called as ``encoder(input_seq)`` that returns
                ``(outputs, hidden, cell)``
            decoder: Module called as ``decoder(tokens, hidden, cell)`` that
                returns ``(logits, hidden, cell)``
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, target_seq, refeed_prefix=True):
        """Encode ``input_seq`` and greedily decode ``target_len - 1`` steps.

        Only the first column of ``target_seq`` (the start token) and its
        length are read. Every later decoder input is the argmax of the
        previous step's logits, i.e. no teacher forcing is applied.

        Args:
            input_seq: Source token ids, [batch_size, src_len]
            target_seq: Target token ids, [batch_size, target_len]
            refeed_prefix: If True (the default and the original behaviour),
                each step passes the whole prefix generated so far to the
                decoder together with the hidden/cell state carried over from
                the previous step, so earlier tokens are run through the LSTM
                again on every step. If False, each step passes only the most
                recent token and relies on the carried state for the history.
                Weights trained with one setting should be used with the same
                setting.

        Returns:
            The last-position logits of every step concatenated along dim 1:
            [batch_size, target_len - 1, <decoder output size>].
        """
        # NOTE(review): refeed_prefix defaults to True so that existing
        # checkpoints and the sampling cell of this notebook (which re-feeds
        # the prefix the same way) keep matching; False is the cheaper scheme.
        # unpacking also enforces that target_seq is 2-D
        _, target_len = target_seq.shape
        # the encoder's final state seeds the decoder; its per-step outputs are unused
        _enc_out, hidden, cell = self.encoder(input_seq)
        dec_in = target_seq[:, :1]
        # last-position logits collected per step
        outputs = []
        for _step in range(target_len - 1):
            pred, hidden, cell = self.decoder(dec_in, hidden, cell)
            # keep only the logits of the newest position
            pred = pred[:, -1:, :]
            outputs.append(pred)
            # the predicted token becomes (part of) the next decoder input
            next_token = pred.argmax(dim=2)
            if refeed_prefix:
                dec_in = torch.cat([dec_in, next_token], dim=1)
            else:
                dec_in = next_token
        return torch.cat(outputs, dim=1)

In [8]:
# hyperparameters; both vocabulary sizes are read off the trained tokenizers
enc_vocab = len(en_tokenizer.get_vocab())
dec_vocab = len(fr_tokenizer.get_vocab())
emb_dim = hidden_dim = 256
num_layers = 2
dropout = 0.1

# build both halves, then move the assembled model (and with it the encoder
# and decoder it wraps) onto the target device
encoder = EncoderLSTM(enc_vocab, emb_dim, hidden_dim, num_layers, dropout)
decoder = DecoderLSTM(dec_vocab, emb_dim, hidden_dim, num_layers, dropout)
model = Seq2SeqLSTM(encoder, decoder).to(device)
summary(model)

Layer (type:depth-idx)                   Param #
Seq2SeqLSTM                              --
├─EncoderLSTM: 1-1                       --
│    └─Embedding: 2-1                    2,048,000
│    └─LSTM: 2-2                         1,052,672
├─DecoderLSTM: 1-2                       --
│    └─Embedding: 2-3                    2,048,000
│    └─LSTM: 2-4                         1,052,672
│    └─Linear: 2-5                       2,056,000
Total params: 8,257,344
Trainable params: 8,257,344
Non-trainable params: 0

In [9]:
# Summarize the configuration; only trainable parameters are counted
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
model_report = (
    ("Input vocabulary size", enc_vocab),
    ("Output vocabulary size", dec_vocab),
    ("Embedding dimension", emb_dim),
    ("Hidden dimension", hidden_dim),
    ("Number of layers", num_layers),
    ("Dropout", dropout),
    ("Total parameters", trainable_params),
)
print("Model created with:")
for label, value in model_report:
    print(f"  {label}: {value}")

Model created with:
  Input vocabulary size: 8000
  Output vocabulary size: 8000
  Embedding dimension: 256
  Hidden dimension: 256
  Number of layers: 2
  Dropout: 0.1
  Total parameters: 8257344


### Train the Model

In [10]:
MODEL_WEIGHT_PATH = "weights/translator.pth"

os.makedirs(os.path.dirname(MODEL_WEIGHT_PATH), exist_ok=True)

if os.path.exists(MODEL_WEIGHT_PATH):
    # Reuse cached weights instead of training again. map_location remaps the
    # stored tensors onto the current device, so a checkpoint written on a GPU
    # also loads on a CPU-only machine.
    # torch.load unpickles the file: only point this at checkpoints you trust.
    model.load_state_dict(torch.load(MODEL_WEIGHT_PATH, map_location=device))
else:
    N_EPOCHS = 5 # Ideal 30
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # padding positions in the target do not contribute to the loss
    loss_fn = nn.CrossEntropyLoss(ignore_index=fr_tokenizer.token_to_id("[pad]"))
    # per-epoch checkpoints are written next to the final file as "<name>-epoch-NN<ext>"
    weights_name, weights_ext = os.path.splitext(MODEL_WEIGHT_PATH)

    for epoch in range(N_EPOCHS):
        model.train()
        epoch_loss = 0
        for en_ids, fr_ids in tqdm.tqdm(dataloader, desc="Training"):
            # Move the "sentences" to device
            en_ids = en_ids.to(device)
            fr_ids = fr_ids.to(device)
            # zero the grad, then forward pass
            optimizer.zero_grad()
            outputs = model(en_ids, fr_ids)
            # compute the loss: flatten the 3D logits and the 2D targets; the
            # targets skip the leading "[start]" token, which is never predicted
            loss = loss_fn(outputs.reshape(-1, dec_vocab), fr_ids[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}/{N_EPOCHS}; Avg loss {epoch_loss/len(dataloader)}; Latest loss {loss.item()}")
        torch.save(model.state_dict(), f"{weights_name}-epoch-{epoch+1:02d}{weights_ext}")

        # Evaluate every 5th epoch only
        if (epoch+1) % 5 != 0:
            continue
        model.eval()
        epoch_loss = 0
        with torch.no_grad():
            # NOTE(review): this iterates the same dataloader that is used for
            # training, so "Eval loss" is not a held-out metric
            for en_ids, fr_ids in tqdm.tqdm(dataloader, desc="Evaluating"):
                en_ids = en_ids.to(device)
                fr_ids = fr_ids.to(device)
                outputs = model(en_ids, fr_ids)
                loss = loss_fn(outputs.reshape(-1, dec_vocab), fr_ids[:, 1:].reshape(-1))
                epoch_loss += loss.item()
        print(f"Eval loss: {epoch_loss/len(dataloader)}")

    # Save the final model
    torch.save(model.state_dict(), MODEL_WEIGHT_PATH)

Training: 100%|██████████| 5223/5223 [55:42<00:00,  1.56it/s]  


Epoch 1/5; Avg loss 4.2010774445812205; Latest loss 3.4846739768981934


Training: 100%|██████████| 5223/5223 [57:38<00:00,  1.51it/s]  


Epoch 2/5; Avg loss 3.3816945481158847; Latest loss 2.665708541870117


Training: 100%|██████████| 5223/5223 [54:23<00:00,  1.60it/s]  


Epoch 3/5; Avg loss 2.9767239248983115; Latest loss 2.609898567199707


Training: 100%|██████████| 5223/5223 [41:16<00:00,  2.11it/s]  


Epoch 4/5; Avg loss 2.7105143109249834; Latest loss 2.484867572784424


Training: 100%|██████████| 5223/5223 [17:23<00:00,  5.01it/s]  


Epoch 5/5; Avg loss 2.519286102269907; Latest loss 2.939636468887329


Evaluating: 100%|██████████| 5223/5223 [05:40<00:00, 15.33it/s]

Eval loss: 2.292828548258117





### Test for a few samples

In [11]:
N_SAMPLES = 5
MAX_LEN = 60

# Look the special-token ids up once instead of on every decoding step
start_id = fr_tokenizer.token_to_id("[start]")
end_id = fr_tokenizer.token_to_id("[end]")

model.eval()
with torch.no_grad():
    for en, true_fr in random.sample(text_pairs, N_SAMPLES):
        # a batch holding the single encoded English sentence: [1, src_len]
        en_ids = torch.tensor([en_tokenizer.encode(en).ids], device=device)
        _output, hidden, cell = model.encoder(en_ids)
        # Keep plain Python ints in this list so it can be handed directly to
        # torch.tensor() and fr_tokenizer.decode() without relying on implicit
        # conversion of a one-element tensor
        pred_ids = [start_id]
        for _step in range(MAX_LEN):
            # The whole prefix is fed back together with the carried state,
            # mirroring the decoding loop in Seq2SeqLSTM.forward
            decoder_input = torch.tensor([pred_ids], device=device)
            output, hidden, cell = model.decoder(decoder_input, hidden, cell)
            # greedy choice from the logits of the last position
            next_id = output[:, -1, :].argmax(dim=1).item()
            pred_ids.append(next_id)
            # early stop if the predicted token is the end token
            if next_id == end_id:
                break
        # Decode the predicted IDs
        pred_fr = fr_tokenizer.decode(pred_ids)
        print(f"English: {en}")
        print(f"French: {true_fr}")
        print(f"Predicted: {pred_fr}")
        print()

English: her slurred speech was an indication that she was drunk.
French: son langage inarticulé trahissait qu'elle était saoule.
Predicted:  son tentative a était une était que que quelleelleelle..  

English: he has a book.
French: il dispose d'un livre.
Predicted:  il' une livre. 

English: i remember when we used to never eat vegetables that we didn't grow ourselves.
French: je me rappelle quand nous ne mangions jamais de légumes que nous n'avions pas cultivés nous-mêmes.
Predicted:  je me rappelle quand nous ne' jamais jamais''' nous nous nous nous nous nous nous nous..   

English: you must have seen them there.
French: tu as dû les voir là-bas.
Predicted:  tu as probablement voir voir.. 

English: i know what can happen here.
French: je sais ce qu'il peut se passer ici.
Predicted:  je sais ce qu' se passe ici. 

