# **Machine Translation** with **Transformer Model**



# Tokenizing Vietnamese Text with underthesea Vietnamese Natural Language Processing Toolkit

## Overview


In [None]:
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
import torchtext; 
torchtext.disable_torchtext_deprecation_warning()
import pandas as pd
import spacy
import math
from tqdm import tqdm
import torch
from torch import nn
import lightning as pl
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, Vocab
from typing import Iterable, List, Callable
from torch.utils.data import Dataset, DataLoader
from underthesea import sent_tokenize, text_normalize, word_tokenize
from torchmetrics.text import BLEUScore
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from transformer_from_scratch import Transformer
from lightning.pytorch.loggers import TensorBoardLogger

#### Load the Dataset

In [3]:
def read_text(en_file, vi_file):
    """
    Read text pairs from files, then build a dataframe with two columns: english and vietnamese
    """
    
    with open(en_file, 'r') as f:
        en_lines = f.readlines()
        
    with open(vi_file, 'r') as f:
        vi_lines = f.readlines()
        
    data = pd.DataFrame({'english': en_lines, 'vietnamese': vi_lines})
    
    return data

In [4]:
# Read the data file
df = read_text('Data/en_sents', 'Data/vi_sents')

In [5]:
# Print the first 5 rows
df.head()

Unnamed: 0,english,vietnamese
0,Please put the dustpan in the broom closet\n,xin vui lòng đặt người quét rác trong tủ chổi\n
1,Be quiet for a moment.\n,im lặng một lát\n
2,Read this\n,đọc này\n
3,Tom persuaded the store manager to give him ba...,tom thuyết phục người quản lý cửa hàng trả lại...
4,Friendship consists of mutual understanding\n,tình bạn bao gồm sự hiểu biết lẫn nhau\n


In [6]:
print(f'We have total {len(df)} pairs of sentences.')

We have total 254090 pairs of sentences.


# Data Preprocessing

In this step, we will preprocess our data to make it suitable for our Transformer model.

For English, we will use the spaCy library, which is a powerful tool for natural language processing.

For Vietnamese, we will use the underthesea library, which is a Vietnamese Natural Language Processing Toolkit.

## Tokenizing Vietnamese Text with Underthesea

### Overview

Underthesea is a powerful toolkit for processing Vietnamese language data. It provides functionalities for various tasks such as tokenization, part-of-speech tagging, named entity recognition, and more.

## Usage

To use Underthesea for tokenizing Vietnamese text, follow these steps:

1. **Installation**: First, ensure that you have Underthesea installed. If not, you can install it using pip:

```bash
pip install underthesea
```

2. **Tokenization**: With Underthesea installed, you can tokenize Vietnamese text by simply calling the `word_tokenize` function on the text. Here is an example:

```python
from underthesea import word_tokenize

text = "Underthesea là thư viện xử lý ngôn ngữ tự nhiên Tiếng Việt."
tokens = word_tokenize(text)
```

In this example, `tokens` will be a list of tokens extracted from the input text.

Remember to always refer to the official documentation or repository for the most accurate and updated information.

In [7]:
# Download the models if necessary
if not spacy.util.is_package('en_core_web_md'):
    spacy.cli.download('en_core_web_md')
    
# Load the models
nlp_en = spacy.load('en_core_web_md')

In [None]:
def tokenize_vi(text: str) -> List[List[str]]:
    """
    Tokenize a Vietnamese text into sentences and words
    
    Args:
        text (str): the input text
        
    Returns:
        List[List[str]]: a list of sentences, each sentence is a list of tokens
    """
    # Step 1: Sentence Tokenization
    sentences = sent_tokenize(text)
    
    # Step 2: Text Normalization (assuming it's just lowercasing here)
    sentences = [text_normalize(sentence) for sentence in sentences]
        
    # Step 3: Word Tokenization
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
        
    # Flatten the list
    tokenized_sentences = [word for sentence in tokenized_sentences for word in sentence]
        
    # Lowercase all tokens
    tokenized_sentences = [word.lower() for word in tokenized_sentences]
    
    return tokenized_sentences

In [9]:
# Test the function
tokenize_vi('Tôi là sinh viên trường Đại học Bách Khoa. Tôi học ngành Khoa học máy tính.')

['tôi',
 'là',
 'sinh viên',
 'trường',
 'đại học',
 'bách khoa',
 '.',
 'tôi',
 'học',
 'ngành',
 'khoa học',
 'máy tính',
 '.']

In [9]:
# Define the tokenizer for English
en_tokenizer = get_tokenizer('spacy', language='en_core_web_md')

# Define the tokenizer for Vietnamese
vi_tokenizer = get_tokenizer(tokenize_vi)

In [None]:
class TranslationDataset(Dataset):
    def __init__(self, data, src_vocab, tgt_vocab, src_tokenizer, tgt_tokenizer):
        self.data = data
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src, tgt = self.data.iloc[idx]
        src_tokens = self.src_tokenizer(src.lower())
        tgt_tokens = self.tgt_tokenizer(tgt.lower())
        
        # 1. Add '<sos>' and '<eos>' into the sentence
        
        src_tokens = ['<sos>'] + src_tokens + ['<eos>']
        tgt_tokens = ['<sos>'] + tgt_tokens + ['<eos>']

        #. Convert into a tensor of IDs
        
        src_tensor = torch.tensor([self.src_vocab[token] for token in src_tokens], dtype=torch.long)
        tgt_tensor = torch.tensor([self.tgt_vocab[token] for token in tgt_tokens], dtype=torch.long)
        
        return src_tensor, tgt_tensor
        
        
        

In [None]:
class TranslationDataModule(pl.LightningDataModule):
    def __init__(self, df, src_tokenizer, tgt_tokenizer, batch_size=32):
        super().__init__()
        self.df = df
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.batch_size = batch_size
        self.src_vocab = None
        self.tgt_vocab = None

    def setup(self, stage=None):
        if self.src_vocab is not None and self.tgt_vocab is not None:
            return
        
        # Build vocabularies
        self.src_vocab = build_vocab_from_iterator(
            self.df['english'].apply(lambda x: self.src_tokenizer(x.lower())),
            specials=['<unk>', '<pad>', '<sos>', '<eos>']
        )
    
        # Construct the tgt_vocab
        self.tgt_vocab = build_vocab_from_iterator(
            self.df['vietnamese'].apply(lambda x: self.tgt_tokenizer(x.lower())),
            specials=['<unk>', '<pad>', '<sos>', '<eos>']
        )
    
        self.src_vocab.set_default_index(self.src_vocab['<unk>'])
        
        # Set the default index for the target vocabulary
        
        self.tgt_vocab.set_default_index(self.tgt_vocab['<unk>'])

        # Create datasets
        
        self.tranlastion_dataset = TranslationDataset(
            self.df,
            self.src_vocab,
            self.tgt_vocab,
            self.src_tokenizer,
            self.tgt_tokenizer
        )
        
        train_size = int(0.8 * len(self.df))
        val_size = len(self.df) - train_size
        self.train_dataset, self.val_dataset = torch.utils.data.random_split(self.tranlastion_dataset, [train_size, val_size])
        
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            collate_fn=self.collate_fn
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            collate_fn=self.collate_fn
        )

    def collate_fn(self, batch):
        src_batch, tgt_batch = zip(*batch)
        src_batch = torch.nn.utils.rnn.pad_sequence(src_batch, padding_value=self.src_vocab['<pad>'], batch_first=True)
        tgt_batch = torch.nn.utils.rnn.pad_sequence(tgt_batch, padding_value=self.tgt_vocab['<pad>'], batch_first=True)
        return src_batch, tgt_batch


In [12]:
PAD_IDX = 0

In [None]:
class TranslationModel(pl.LightningModule):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, nhead=8, num_layers=6, dim_feedforward=2048, dropout=0.1, max_seq_length=80):
        super().__init__()
        self.transformer = Transformer(
            src_vocab_size=src_vocab_size,
            tgt_vocab_size=tgt_vocab_size,
            d_model=d_model,
            num_heads=nhead,
            num_layers=num_layers,
            d_ff=dim_feedforward,
            max_seq_length=max_seq_length,
            dropout=dropout
        )
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
        
    def forward(self, src, tgt):
        return  self.transformer(src, tgt)


    def training_step(self, batch, batch_idx):
        src, tgt = batch
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        
        output = self(src, tgt_input)
        
        loss = self.loss_fn(output.reshape(-1, output.shape[-1]), tgt_output.reshape(-1))
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):        
        src, tgt = batch
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        
        output = self(src, tgt_input)
        
        loss = self.loss_fn(output.reshape(-1, output.shape[-1]), tgt_output.reshape(-1))
        self.log('val_loss', loss, prog_bar=True)
        return loss
        

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.0005)
        # Learning rate scheduler: Reduces LR when validation loss plateaus
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode="min",  
            factor=0.1, 
            patience=3, 
            verbose=False,
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "monitor": "val_loss",
                "interval": "epoch",
                "frequency": 1,
            },
        }
        

In [14]:
# Create the data module
data_module = TranslationDataModule(df, en_tokenizer, vi_tokenizer, batch_size=32)
data_module.setup()

In [15]:
# Create model
model = TranslationModel(
    src_vocab_size=len(data_module.src_vocab),
    tgt_vocab_size=len(data_module.tgt_vocab),
    d_model=256,
    nhead=4,
    num_layers=3,
    dim_feedforward=2048,
    dropout=0.1,
    max_seq_length=80
)


In [16]:
# Checkpoint callback: Saves the best model based on validation loss
checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",  
    dirpath="checkpoints_machine_translation",
    filename="transformer-best-{epoch:02d}", \
    save_top_k=1, 
    mode="min",  
    save_last=True, 
)

# Early stopping callback: Stops training if validation loss doesn't improve
early_stop_callback = EarlyStopping(
    monitor="val_loss", 
    patience=5, 
    verbose=True,  
    mode="min", 
)

In [None]:
# Train and evaluate the model
# Create the model
# Initialize the Trainer with tensorboard logger
trainer = pl.Trainer(
    # fast_dev_run=True,
    max_epochs=1,
    accelerator="auto", 
    devices=-1,  
    callbacks=[checkpoint_callback, early_stop_callback],
    log_every_n_steps=20,
    logger=TensorBoardLogger(
        save_dir="logs",
        name="transformer_translation",
        version=1,
    ),
)
# Start the training process
print("\n--- Starting Training ---")
trainer.fit(model, datamodule=data_module)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores


HPU available: False, using: 0 HPUs
/Users/minhdat2004/anaconda3/envs/nlp/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /Users/minhdat2004/BKU/DL4NLP/FinalExam/checkpoints_machine_translation exists and is not empty.

  | Name        | Type             | Params | Mode 
---------------------------------------------------------
0 | transformer | Transformer      | 21.6 M | train
1 | loss_fn     | CrossEntropyLoss | 0      | train
---------------------------------------------------------
21.6 M    Trainable params
0         Non-trainable params
21.6 M    Total params
86.312    Total estimated model params size (MB)
130       Modules in train mode
0         Modules in eval mode



--- Starting Training ---


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/minhdat2004/anaconda3/envs/nlp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/Users/minhdat2004/anaconda3/envs/nlp/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 0.630
`Trainer.fit` stopped: `max_epochs=1` reached.


In [None]:
# Test translating a list of sentences from English to Vietnamese
def translate_sentences(model, data_module, num_sentences=10, max_len=50):
    """
    Translate random sentences from English to Vietnamese in the validation set
    Args:
        model (nn.Module): The trained translation model.
        data_module: The DataModule containing vocab and dataset.
        num_sentences (int): Number of random sentences to translate.
        max_len (int): Maximum length of translation.
    Returns:
        List[Tuple[str, str, str]]: List of (source, target, predicted) sentences.
    """
    src_vocab = data_module.src_vocab
    tgt_vocab = data_module.tgt_vocab
    src_tokenizer = data_module.src_tokenizer
    tgt_tokenizer = data_module.tgt_tokenizer

    PAD_IDX = tgt_vocab['<pad>']
    SOS_IDX = tgt_vocab['<sos>']
    EOS_IDX = tgt_vocab['<eos>']
    SRC_PAD_IDX = src_vocab['<pad>']
    SRC_SOS_IDX = src_vocab['<sos>']
    SRC_EOS_IDX = src_vocab['<eos>']

    # Select random sentences from validation set
    random_indices = torch.randint(0, len(data_module.val_dataset), (num_sentences,))
    random_sentences = [data_module.val_dataset[i] for i in random_indices]

    translations = []
    
    model.eval()
    with torch.no_grad():
        for src_tensor, tgt_tensor in random_sentences:
            # Prepare source sentence
            src_tokens = [src_vocab.lookup_token(token.item()) for token in src_tensor]
            # Remove special tokens <sos>, <eos>, <pad>
            src_tokens = [token for token in src_tokens if token not in ['<sos>', '<eos>', '<pad>']]
            src_sentence = ' '.join(src_tokens)

            # Prepare input for model
            src_tensor = src_tensor.unsqueeze(0)  # [1, src_len]
            tgt_input = torch.tensor([[SOS_IDX]], device=src_tensor.device)  # [1, 1] with <sos>

            # Autoregressive decoding
            for _ in range(max_len):
                output = model(src_tensor, tgt_input)
                next_token = output[:, -1, :].argmax(dim=-1)  # take the last token prediction
                tgt_input = torch.cat([tgt_input, next_token.unsqueeze(0)], dim=1)
                if next_token.item() == EOS_IDX:
                    break

            # Decode target ground truth
            tgt_tokens = [token.item() for token in tgt_tensor]
            tgt_tokens = [token for token in tgt_tokens if token not in {PAD_IDX, SOS_IDX, EOS_IDX}]
            tgt_sentence = ' '.join([tgt_vocab.lookup_token(token) for token in tgt_tokens])

            # Decode predicted translation
            pred_tokens = [token.item() for token in tgt_input[0]]
            pred_tokens = [token for token in pred_tokens if token not in {PAD_IDX, SOS_IDX, EOS_IDX}]
            pred_sentence = ' '.join([tgt_vocab.lookup_token(token) for token in pred_tokens])

            translations.append((src_sentence, tgt_sentence, pred_sentence))

    return translations

# Translate one sentence 
def translate_one_sentence(model, data_module, sentence, max_len=50):
    """
    Translate a single sentence from English to Vietnamese
    Args:
        model (nn.Module): The trained translation model.
        data_module: The DataModule containing vocab and dataset.
        sentence (str): The English sentence to translate.
        max_len (int): Maximum length of translation.
    Returns:
        Tuple[str, str, str]: (source, target, predicted) sentences.
    """
    src_vocab = data_module.src_vocab
    tgt_vocab = data_module.tgt_vocab
    src_tokenizer = data_module.src_tokenizer
    tgt_tokenizer = data_module.tgt_tokenizer

    PAD_IDX = tgt_vocab['<pad>']
    SOS_IDX = tgt_vocab['<sos>']
    EOS_IDX = tgt_vocab['<eos>']
    
    # Prepare source sentence
    src_tokens = src_tokenizer(sentence.lower())
    src_tokens = ['<sos>'] + src_tokens + ['<eos>']
    src_tensor = torch.tensor([src_vocab[token] for token in src_tokens], dtype=torch.long).unsqueeze(0)  # [1, src_len]
    
    # Prepare input for model
    tgt_input = torch.tensor([[SOS_IDX]], device=src_tensor.device)  # [1, 1] with <sos>

    # Autoregressive decoding
    with torch.no_grad():
        for _ in range(max_len):
            output = model(src_tensor, tgt_input)
            next_token = output[:, -1, :].argmax(dim=-1)  # take the last token prediction
            tgt_input = torch.cat([tgt_input, next_token.unsqueeze(0)], dim=1)
            if next_token.item() == EOS_IDX:
                break

    # Decode predicted translation
    pred_tokens = [token.item() for token in tgt_input[0]]
    pred_tokens = [token for token in pred_tokens if token not in {PAD_IDX, SOS_IDX, EOS_IDX}]
    pred_sentence = ' '.join([tgt_vocab.lookup_token(token) for token in pred_tokens])

    return sentence, pred_sentence
    

In [None]:
translations = translate_sentences(model, data_module)

# --- Print the results ---
print("\n--- Translations ---")
for src, tgt, pred in translations:
    print(f"Source:    {src[:-2]}")
    print(f"Target:    {tgt}")
    print(f"Predicted: {pred}")
    print("-" * 50)


--- Translations ---
Source:    what you need to do next is fill out this application form .
Target:    những gì bạn cần làm tiếp theo là điền vào mẫu đơn này .
Predicted: những gì bạn cần phải làm tiếp theo là điền vào mẫu đơn này .
--------------------------------------------------
Source:    it will cost 500 dollars to fly to paris .
Target:    nó sẽ có giá 500 đô la để bay đến paris .
Predicted: nó sẽ tốn 500 đô la để bay đến paris .
--------------------------------------------------
Source:    we 're not the only ones here from boston
Target:    chúng tôi không phải là những người duy nhất ở đây từ boston
Predicted: chúng tôi không phải là người duy nhất ở đây từ boston
--------------------------------------------------
Source:    she lives far from there .
Target:    cô ấy sống xa đó
Predicted: cô ấy sống xa đây .
--------------------------------------------------
Source:    you 're the most beautiful girl i 've ever seen
Target:    bạn là cô gái đẹp nhất tôi từng thấy
Predicted

In [None]:
# Calculate the BLEU score

bleu = BLEUScore()

references = []
hypotheses = []

for src, tgt, pred in translations:
    references.append([tgt])   # Ground truth (as list of strings)
    hypotheses.append(pred)    # Model output (as string)
    

bleu_score = bleu(hypotheses, references)

print(f"\nBLEU score for 10 translations (torchmetrics): {bleu_score:.4f}")



BLEU score for 10 translations (torchmetrics): 0.6552


In [None]:
# Test the translation of a single sentence
sentence = "Hello, how are you?"
src, pred = translate_one_sentence(model, data_module, sentence)
print(f"Source:    {src}")
print(f"Predicted: {pred}")
# Test the translation of a single sentence
sentence = "The cat is sleeping on mat"
src, pred = translate_one_sentence(model, data_module, sentence)
print(f"Source:    {src}")
print(f"Predicted: {pred}")
sentence = "you are so beautiful"
src, pred = translate_one_sentence(model, data_module, sentence)
print(f"Source:    {src}")
print(f"Predicted: {pred}")
sentence = "I am a student at Hanoi University of Science and Technology"
src, pred = translate_one_sentence(model, data_module, sentence)
print(f"Source:    {src}")
print(f"Predicted: {pred}")
sentence = "I love you"
src, pred = translate_one_sentence(model, data_module, sentence)
print(f"Source:    {src}")
print(f"Predicted: {pred}")

Source:    Hello, how are you?
Predicted: xin chào , bạn thế nào ?
Source:    The cat is sleeping on mat
Predicted: con mèo đang ngủ trên tấm thảm
Source:    you are so beautiful
Predicted: bạn thật đẹp
Source:    I am a student at Hanoi University of Science and Technology
Predicted: tôi là học sinh tại trường đại học và công nghệ
Source:    I love you
Predicted: tôi yêu bạn
