# Opus books subset

This notebook implements the transformer architecture and uses it to traduce from english to italian using a subset of the Opus books dataset from HuggingFace

## Imports

In [1]:
import gc
import copy
import torch
import evaluate
import pandas as pd

from tqdm.auto import tqdm
from datasets import load_dataset
from gen_trainer import GenerativeTrainer
from model.Transformer import EncoderDecoderTransformer
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, processors, decoders


avail_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device available: ", avail_device)

Device available:  cuda


## Dataset loading and preprocessing

In [2]:
# Load dataset
dataset = load_dataset("librakevin/opus_books_split")
# Make the dataset a bit easier to access
# Reduce the size to speedup training
train = pd.DataFrame.from_dict(dataset['train']["translation"])
test = pd.DataFrame.from_dict(dataset['validation']["translation"])

vocab_size = 30000
# Create tokenizer
eng_tok = Tokenizer(models.WordPiece(unk_token="[UNK]"))
# Convert into unicode, remove accents and set to lowercase
eng_tok.normalizer = normalizers.Sequence([normalizers.NFD(),
                                           normalizers.StripAccents(),
                                            normalizers.Lowercase()])
# Pre-tokenize by spliting whitespace and punctuation
eng_tok.pre_tokenizer = pre_tokenizers.Sequence([pre_tokenizers.Whitespace()])
# Create the trainer
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
eng_tok_trainer = trainers.WordPieceTrainer(vocab_size=vocab_size, special_tokens=special_tokens)
it_tok_trainer = trainers.WordPieceTrainer(vocab_size=vocab_size, special_tokens=special_tokens)
# Create a italian tokenizer as well that allows accents
ita_tok = copy.deepcopy(eng_tok)
ita_tok.normalizer = normalizers.Sequence([normalizers.NFD(),
                                            normalizers.Lowercase()])
# Train the tokenizer
eng_tok.train_from_iterator(list(train["en"][:]), trainer=eng_tok_trainer)
ita_tok.train_from_iterator(list(train["it"][:]), trainer=it_tok_trainer)
# Add the start and end clauses for each sentence
eng_tok.post_processor = processors.TemplateProcessing(single="[CLS] $A [SEP]",
                                                       special_tokens=[("[CLS]", eng_tok.token_to_id("[CLS]")), ("[SEP]", eng_tok.token_to_id("[SEP]"))])
ita_tok.post_processor = processors.TemplateProcessing(single="[CLS] $A [SEP]",
                                                       special_tokens=[("[CLS]", ita_tok.token_to_id("[CLS]")), ("[SEP]", ita_tok.token_to_id("[SEP]"))])
# Create the decoders
eng_tok.decoder = decoders.WordPiece(prefix="##")
ita_tok.decoder = decoders.WordPiece(prefix="##")

# Encode dataset
train["eng_tok"] = train["en"].apply(lambda x: torch.tensor(eng_tok.encode(x).ids))
train["ita_tok"] = train["it"].apply(lambda x: torch.tensor(ita_tok.encode(x).ids))
test["eng_tok"] = test["en"].apply(lambda x: torch.tensor(eng_tok.encode(x).ids))
test["ita_tok"] = test["it"].apply(lambda x: torch.tensor(ita_tok.encode(x).ids))

# Quick test
sample = train.sample(1)["en"].values[0]
eng_enc = eng_tok.encode(sample)
print(f"Input: {sample}")
print(f"Tokens: {eng_enc.tokens}")
print(f"Encoded: {eng_enc.ids}")
print(f"Decoded: {eng_tok.decode(eng_enc.ids)}")

Input: 'Are presents required?' and off he rushed to Fulda, the jeweller's.
Tokens: ['[CLS]', "'", 'are', 'presents', 'required', "?'", 'and', 'off', 'he', 'rushed', 'to', 'fulda', ',', 'the', 'jeweller', "'", 's', '.', '[SEP]']
Encoded: [2, 8, 309, 7181, 3202, 385, 136, 462, 154, 3293, 138, 28670, 12, 131, 20214, 8, 51, 14, 3]
Decoded: ' are presents required?' and off he rushed to fulda, the jeweller ' s.


## Convert to pytorch dataloaders

In [3]:
class TraductionDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df[["eng_tok", "ita_tok"]]

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        x = self.df["eng_tok"].iloc[idx]
        # For the training pass we remove the <eos> token
        y_train = self.df["ita_tok"].iloc[idx][:-1]
        # For the loss calculation we remove the <sos> token
        y_target = self.df["ita_tok"].iloc[idx][1:]
        return x, y_train, y_target

# Since all sentences in the batch might have different lenght we need to padd
# them to the longest sentence in the batch
def collate_pad_fn(batch):
    x, y_train, y_target = zip(*batch)
    x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=eng_tok.token_to_id("[PAD]"))
    y_train = torch.nn.utils.rnn.pad_sequence(y_train, batch_first=True, padding_value=ita_tok.token_to_id("[PAD]"))
    y_target = torch.nn.utils.rnn.pad_sequence(y_target, batch_first=True, padding_value=ita_tok.token_to_id("[PAD]"))
    return x, y_train, y_target

train_dl = torch.utils.data.DataLoader(TraductionDataset(train), batch_size=8, shuffle=True, collate_fn=collate_pad_fn)
test_dl = torch.utils.data.DataLoader(TraductionDataset(test), batch_size=8, shuffle=True, collate_fn=collate_pad_fn)
print(len(train_dl), len(test_dl))

3638 405


## Train the model

In [4]:
# Create the model
transformer = EncoderDecoderTransformer(eng_tok, ita_tok, device=avail_device)

# Define the training params
epochs = 10
val_interval = 2
lr = 0.0001
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(transformer.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=lr, max_lr=0.1) # Not liking this one, will change
# Create the trainer
trainer = GenerativeTrainer(transformer, train_dl, test_dl, criterion, optimizer, scheduler, avail_device, 
                            model_name="EncoderDecoderTransformer", log_dir="runs/opus_books_split")

# Start the training
# tensorboard --logdir=runs/opus_books_split
trainer.train(epochs, val_interval)

Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Training:   0%|          | 0/3638 [00:00<?, ?it/s]

Epoch 1/10: Train Loss: 0.0895


Training:   0%|          | 0/3638 [00:00<?, ?it/s]

Epoch 2/10: Train Loss: 0.1562


Testing:   0%|          | 0/405 [00:00<?, ?it/s]

Epoch 2/10: Test Loss: 2.0266
Model saved!


Training:   0%|          | 0/3638 [00:00<?, ?it/s]

Epoch 3/10: Train Loss: 0.2185


Training:   0%|          | 0/3638 [00:00<?, ?it/s]

Epoch 4/10: Train Loss: 0.0260


Testing:   0%|          | 0/405 [00:00<?, ?it/s]

Epoch 4/10: Test Loss: 1.8578
Model saved!


Training:   0%|          | 0/3638 [00:00<?, ?it/s]

Epoch 5/10: Train Loss: 0.0865


Training:   0%|          | 0/3638 [00:00<?, ?it/s]

Epoch 6/10: Train Loss: 0.1392


Testing:   0%|          | 0/405 [00:00<?, ?it/s]

Epoch 6/10: Test Loss: 1.9142


Training:   0%|          | 0/3638 [00:00<?, ?it/s]

Epoch 7/10: Train Loss: 0.1892


Training:   0%|          | 0/3638 [00:00<?, ?it/s]

Epoch 8/10: Train Loss: 0.0413


Testing:   0%|          | 0/405 [00:00<?, ?it/s]

Epoch 8/10: Test Loss: 1.9797


Training:   0%|          | 0/3638 [00:00<?, ?it/s]

Epoch 9/10: Train Loss: 0.0972


Training:   0%|          | 0/3638 [00:00<?, ?it/s]

Epoch 10/10: Train Loss: 0.1477


Testing:   0%|          | 0/405 [00:00<?, ?it/s]

Epoch 10/10: Test Loss: 2.0166


## Evaluate 

In [7]:
# Load the metrics
#model.load_state_dict(torch.load("transformer.pth")["model_state_dict"])
bertscore = evaluate.load("bertscore")
meteor = evaluate.load('meteor')
transformer.eval()

y = []
y_hat = []

with torch.no_grad():
    # Run generation with batches so we finish in a decent amout of time
    for x, _, y_target in tqdm(test_dl, desc="Generating translations", leave=False):
        # Store GT and predictions
        y += [ita_tok.decode(s) for s in y_target.tolist()]
        y_hat += [ita_tok.decode(s[1:]) for s in transformer.generate(x.to(avail_device), 320).tolist()]



print(f"Bertscore: {bertscore.compute(references=y, predictions=y_hat, lang='it')}")
print(f"Meteor: {meteor.compute(references=y, predictions=y_hat)}")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eamunoza\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\eamunoza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\eamunoza\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Generating translations:   0%|          | 0/13 [00:00<?, ?it/s]

Bertscore: {'precision': [0.4886772632598877, 0.6211746335029602, 0.6098685264587402, 0.546404242515564, 0.45597389340400696, 0.5480381846427917, 0.5271639227867126, 0.48881858587265015, 0.4914749264717102, 0.4892617166042328, 0.5538017749786377, 0.4956820607185364, 0.49009427428245544, 0.49850332736968994, 0.4827694296836853, 0.6137924194335938, 0.5136950016021729, 0.48780137300491333, 0.3767920136451721, 0.5812061429023743, 0.522136926651001, 0.511408805847168, 0.5131323933601379, 0.5441244840621948, 0.5723872780799866, 0.49514418840408325, 0.5071470737457275, 0.5558851957321167, 0.4977937340736389, 0.41583579778671265, 0.4866471588611603, 0.6182420253753662, 0.5246499180793762, 0.43029287457466125, 0.5062270760536194, 0.4397454559803009, 0.547743558883667, 0.4976840615272522, 0.5078949928283691, 0.5505914688110352, 0.5184524655342102, 0.5701242089271545, 0.5324598550796509, 0.3991318345069885, 0.5265843868255615, 0.4113255739212036, 0.4710431694984436, 0.5876820683479309, 0.48995727