In [124]:
import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import torchmetrics

from datasets import load_dataset, get_dataset_split_names
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

from pathlib import Path
import numpy as np
from tqdm import tqdm
import warnings
import math
import os

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the imported libraries — confirm that is intended.
warnings.filterwarnings("ignore")

In [125]:
class InputEmbedding(nn.Module):
    """Look up token ids in an embedding table and scale the result by sqrt(d_module)."""

    def __init__(self, d_module: int, vocab_size: int):
        """
        Args:
            d_module: Size of each embedding vector.
            vocab_size: Number of rows in the embedding table.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.d_module = d_module
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_module)

    def forward(self, x):
        """Return the embeddings of the ids in ``x`` multiplied by sqrt(d_module)."""
        vectors = self.embedding(x)
        return vectors * math.sqrt(self.d_module)

In [126]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings, then apply dropout."""

    def __init__(self, d_model: int, seq_length: int, dropout: float) -> None:
        """
        Args:
            d_model: Size of the last dimension of the embeddings (even or odd).
            seq_length: Number of positions for which encodings are precomputed.
            dropout: Dropout probability applied after the encodings are added.
        """
        super().__init__()
        self.d_model = d_model
        self.seq_length = seq_length
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(seq_length, d_model)
        position = torch.arange(0, seq_length, dtype=torch.float).unsqueeze(1)  # (seq_length, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (seq_length, ceil(d_model / 2))

        # Even columns take the sine, odd columns the cosine.
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even columns, so the
        # angle table is trimmed to avoid a shape mismatch in the assignment.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading batch dimension so the table broadcasts over a batch: (1, seq_length, d_model)
        pe = pe.unsqueeze(0)

        # A buffer is saved with the module and moved with .to(device) but is never trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings of the first ``x.size(1)`` positions to ``x`` and apply dropout."""
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

In [127]:
class LayerNormalization(nn.Module):
    """Normalize the last dimension with a single learnable scale (alpha) and shift (beta)."""

    def __init__(self, eps: float = 10**-6):
        """
        Args:
            eps: Constant added to the standard deviation to avoid division by zero.
        """
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(1))   # multiplicative scale
        self.beta = nn.Parameter(torch.zeros(1))   # additive shift

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + beta`` computed over the last dimension."""
        centered = x - x.mean(dim=-1, keepdim=True)
        denominator = x.std(dim=-1, keepdim=True) + self.eps
        # Scale before dividing so the arithmetic order matches the formula above exactly.
        scaled = self.alpha * centered
        return scaled / denominator + self.beta

In [128]:
class FeedForwardBlock(nn.Module):
    """Two linear layers with a ReLU and dropout in between: d_model -> d_ff -> d_model."""

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        """
        Args:
            d_model: Input and output feature size.
            d_ff: Hidden feature size between the two linear layers.
            dropout: Dropout probability applied after the ReLU.
        """
        super().__init__()
        self.layer1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        self.layer2 = nn.Linear(in_features=d_ff, out_features=d_model)

    def forward(self, x):
        """Apply layer1, ReLU, dropout and layer2 in that order."""
        hidden = self.relu(self.layer1(x))
        return self.layer2(self.dropout(hidden))

In [129]:
class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention with learned Q/K/V/output projections."""

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        """
        Args:
            d_model: Feature size of the inputs and of the output.
            h: Number of attention heads; must divide ``d_model``.
            dropout: Dropout probability applied to the attention weights.
        """
        super().__init__()
        self.d_model = d_model
        self.h = h

        assert d_model % h == 0, "d_model must be divisible by h"

        self.d_k = d_model // h  # feature size handled by each head
        self.W_Q = nn.Linear(d_model, d_model)
        self.W_K = nn.Linear(d_model, d_model)
        self.W_V = nn.Linear(d_model, d_model)
        self.W_O = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, d_k, mask, dropout: nn.Dropout=None):
        """Compute scaled dot-product attention.

        The ``d_k`` argument is ignored: the scale is always taken from the last
        dimension of ``query``. Positions where ``mask == 0`` receive a score of
        -1e9 before the softmax.

        Returns:
            A tuple ``(output, weights)`` where ``weights`` are the (possibly
            dropped-out) softmax attention weights.
        """
        head_dim = query.shape[-1]

        # (batch_size, h, seq_length, d_k) --> (batch_size, h, seq_length, seq_length)
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(head_dim)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        weights = scores.softmax(dim=-1)
        if dropout is not None:
            weights = dropout(weights)

        # (batch_size, h, seq_length, seq_length) --> (batch_size, h, seq_length, d_k)
        return torch.matmul(weights, value), weights

    def _split_heads(self, projected):
        """Reshape (batch_size, seq_length, d_model) into (batch_size, h, seq_length, d_k)."""
        return projected.view(projected.shape[0], projected.shape[1], self.h, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask):
        """Project q/k/v, attend per head, merge the heads and apply the output projection.

        The attention weights of the most recent call are kept in ``self.attention_scores``.
        """
        # (batch_size, seq_length, d_model) --> (batch_size, h, seq_length, d_k)
        query = self._split_heads(self.W_Q(q))
        key = self._split_heads(self.W_K(k))
        value = self._split_heads(self.W_V(v))

        context, self.attention_scores = self.attention(query, key, value, self.d_k, mask, self.dropout)

        # (batch_size, h, seq_length, d_k) --> (batch_size, seq_length, h, d_k) --> (batch_size, seq_length, d_model)
        batch_size = context.shape[0]
        merged = context.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)

        # (batch_size, seq_length, d_model) --> (batch_size, seq_length, d_model)
        return self.W_O(merged)



In [130]:
class ResidualConnection(nn.Module):
    """Pre-norm residual wrapper: ``x + dropout(sublayer(layer_norm(x)))``."""

    def __init__(self, dropout: float) -> None:
        """
        Args:
            dropout: Dropout probability applied to the sublayer output.
        """
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = LayerNormalization()

    def forward(self, x, sublayer):
        """Normalize ``x``, run ``sublayer`` on it, apply dropout and add the result back to ``x``."""
        normalized = self.layer_norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch

In [131]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each inside a residual connection."""

    def __init__(self, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        """
        Args:
            self_attention_block: Attention module called with the same tensor as q, k and v.
            feed_forward_block: Feed-forward module applied after the attention.
            dropout: Dropout probability used by both residual connections.
        """
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connection1 = ResidualConnection(dropout)
        self.residual_connection2 = ResidualConnection(dropout)

    def forward(self, x, src_mask):
        """Run ``x`` through the attention and feed-forward sublayers using ``src_mask``."""

        def attend(normalized):
            # Self-attention: queries, keys and values are all the same tensor.
            return self.self_attention_block(normalized, normalized, normalized, src_mask)

        attended = self.residual_connection1(x, attend)
        return self.residual_connection2(attended, self.feed_forward_block)


In [132]:
class Encoder(nn.Module):
    """Apply a stack of encoder layers in order, followed by a final layer normalization."""

    def __init__(self, layers: nn.ModuleList) -> None:
        """
        Args:
            layers: Modules called as ``layer(x, src_mask)``, applied in list order.
        """
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, src_mask):
        """Feed ``x`` through every layer with ``src_mask`` and normalize the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, src_mask)
        return self.norm(hidden)

In [133]:
class DecoderBlock(nn.Module):
    """One decoder layer: self-attention, cross-attention and feed-forward, each with a residual."""

    def __init__(self, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock,
                 feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        """
        Args:
            self_attention_block: Attention over the decoder input itself.
            cross_attention_block: Attention whose keys and values are the encoder output.
            feed_forward_block: Feed-forward module applied last.
            dropout: Dropout probability used by the three residual connections.
        """
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connection1 = nn.ModuleList([ResidualConnection(dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Apply the three sublayers in order; ``tgt_mask`` masks self-attention, ``src_mask`` cross-attention."""
        self_residual, cross_residual, ffn_residual = self.residual_connection1

        def self_attend(normalized):
            return self.self_attention_block(normalized, normalized, normalized, tgt_mask)

        def cross_attend(normalized):
            # Queries come from the decoder; keys and values come from the encoder output.
            return self.cross_attention_block(normalized, encoder_output, encoder_output, src_mask)

        hidden = self_residual(x, self_attend)
        hidden = cross_residual(hidden, cross_attend)
        return ffn_residual(hidden, self.feed_forward_block)

In [134]:
class Decoder(nn.Module):
    """Apply a stack of decoder layers in order, followed by a final layer normalization."""

    def __init__(self, layers: nn.ModuleList) -> None:
        """
        Args:
            layers: Modules called as ``layer(x, encoder_output, src_mask, tgt_mask)``, applied in list order.
        """
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Feed ``x`` through every layer together with the encoder output and masks, then normalize."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)

In [135]:
class ProjectionLayer(nn.Module):
    """Linear map from d_model features to vocabulary log-probabilities."""

    def __init__(self, d_model: int, vocab_size: int) -> None:
        """
        Args:
            d_model: Size of the incoming feature dimension.
            vocab_size: Number of output classes.
        """
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Project ``x`` and return the log-softmax over the last dimension."""
        # (batch_size, seq_length, d_model) --> (batch_size, seq_length, vocab_size)
        logits = self.proj(x)
        return logits.log_softmax(dim=-1)

In [136]:
class Transformer(nn.Module):
    """Container exposing the encode / decode / project steps of the encoder-decoder model."""

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbedding, tgt_embed: InputEmbedding,
                 src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        """Store the sub-modules; nothing is computed at construction time."""
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        """Embed ``src``, add positional encodings and run the encoder."""
        embedded = self.src_pos(self.src_embed(src))
        return self.encoder(embedded, src_mask)

    def decode(self, encoder_output, src_mask, tgt, tgt_mask):
        """Embed ``tgt``, add positional encodings and run the decoder against ``encoder_output``."""
        embedded = self.tgt_pos(self.tgt_embed(tgt))
        return self.decoder(embedded, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Apply the projection layer to ``x``."""
        return self.projection_layer(x)

In [137]:
def build_transformer(src_vocab_size: int, tgt_vocab_size: int, seq_len: int,
                      d_model: int = 512, N: int = 6, h: int = 8, dropout: float = 0.1, d_ff: int = 2048) -> Transformer:
    """Assemble a Transformer and Xavier-initialize every parameter with more than one dimension.

    Args:
        src_vocab_size: Vocabulary size of the source embedding.
        tgt_vocab_size: Vocabulary size of the target embedding and of the projection layer.
        seq_len: Number of positions precomputed by both positional encodings.
        d_model: Feature size used throughout the model.
        N: Number of encoder blocks and of decoder blocks.
        h: Number of attention heads.
        dropout: Dropout probability passed to every sub-module.
        d_ff: Hidden size of the feed-forward blocks.

    Returns:
        The assembled and initialized ``Transformer``.
    """
    # Sub-modules are created in a fixed order (embeddings, positional encodings,
    # encoder blocks, decoder blocks, projection) so seeded runs stay reproducible.
    src_embed = InputEmbedding(d_model, src_vocab_size)
    tgt_embed = InputEmbedding(d_model, tgt_vocab_size)

    src_pos = PositionalEncoding(d_model, seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, seq_len, dropout)

    encoder_blocks = [
        EncoderBlock(
            MultiHeadAttentionBlock(d_model, h, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    decoder_blocks = [
        DecoderBlock(
            MultiHeadAttentionBlock(d_model, h, dropout),   # self-attention
            MultiHeadAttentionBlock(d_model, h, dropout),   # cross-attention
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))

    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Only matrices are re-initialized; one-dimensional parameters keep their defaults.
    for weight in filter(lambda p: p.dim() > 1, transformer.parameters()):
        nn.init.xavier_uniform_(weight)

    return transformer

## Training

In [138]:
class BilingualDataset(Dataset):
    """Turn raw translation pairs into fixed-length id tensors and attention masks."""

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
        """
        Args:
            ds: Indexable collection whose items hold ``item['translation'][lang]`` strings.
            tokenizer_src: Tokenizer used for source sentences.
            tokenizer_tgt: Tokenizer used for target sentences; the [SOS], [EOS] and
                [PAD] ids are read from it and used for both encoder and decoder tensors.
            src_lang: Key of the source sentence inside ``item['translation']``.
            tgt_lang: Key of the target sentence inside ``item['translation']``.
            seq_len: Length every returned id tensor is padded to.
        """
        super().__init__()
        self.seq_len = seq_len

        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        def special(token):
            # One-element int64 tensor so it can be concatenated directly.
            return torch.tensor([tokenizer_tgt.token_to_id(token)], dtype=torch.int64)

        self.sos_token = special("[SOS]")
        self.eos_token = special("[EOS]")
        self.pad_token = special("[PAD]")

    def __len__(self):
        """Number of sentence pairs in the wrapped dataset."""
        return len(self.ds)

    def __getitem__(self, idx):
        """Build the encoder input, decoder input, label and masks for pair ``idx``.

        Raises:
            ValueError: If either sentence does not fit into ``seq_len`` once its
                special tokens are added.
        """
        pair = self.ds[idx]['translation']
        src_text = pair[self.src_lang]
        tgt_text = pair[self.tgt_lang]

        # Text --> token ids
        src_ids = self.tokenizer_src.encode(src_text).ids
        tgt_ids = self.tokenizer_tgt.encode(tgt_text).ids

        # The encoder input carries both [SOS] and [EOS]; the decoder input carries
        # only [SOS] and the label only [EOS], hence the different offsets.
        src_pad_count = self.seq_len - len(src_ids) - 2
        tgt_pad_count = self.seq_len - len(tgt_ids) - 1
        if min(src_pad_count, tgt_pad_count) < 0:
            raise ValueError("Sentence is too long")

        src_tensor = torch.tensor(src_ids, dtype=torch.int64)
        tgt_tensor = torch.tensor(tgt_ids, dtype=torch.int64)
        src_padding = self.pad_token.repeat(src_pad_count)
        tgt_padding = self.pad_token.repeat(tgt_pad_count)

        # [SOS] source [EOS] [PAD]...
        encoder_input = torch.cat([self.sos_token, src_tensor, self.eos_token, src_padding], dim=0)
        # [SOS] target [PAD]...
        decoder_input = torch.cat([self.sos_token, tgt_tensor, tgt_padding], dim=0)
        # target [EOS] [PAD]...
        label = torch.cat([tgt_tensor, self.eos_token, tgt_padding], dim=0)

        # Every tensor must be exactly seq_len long.
        for sequence in (encoder_input, decoder_input, label):
            assert sequence.size(0) == self.seq_len

        encoder_mask = (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int()  # (1, 1, seq_len)
        # (1, seq_len) & (1, seq_len, seq_len): hides padding and future positions.
        decoder_mask = (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0))

        return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            "encoder_mask": encoder_mask,
            "decoder_mask": decoder_mask,
            "label": label,  # (seq_len)
            "src_text": src_text,
            "tgt_text": tgt_text,
        }
    
def causal_mask(size):
    """Return a (1, size, size) boolean mask that is True on and below the diagonal."""
    lower_triangle = torch.tril(torch.ones(1, size, size))
    return lower_triangle.bool()

In [139]:
def get_all_sentences(ds, lang):
    """Lazily yield the ``lang`` sentence of every item in ``ds``."""
    yield from (record['translation'][lang] for record in ds)

def get_or_build_tokenizer(config, ds, lang):
    """Load the tokenizer for ``lang`` from disk, or train and save it when the file is missing.

    The file location is ``config['tokenizer_file']`` formatted with ``lang``.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Most code taken from: https://huggingface.co/docs/tokenizers/quicktour
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2)
    sentences = get_all_sentences(ds, lang)
    tokenizer.train_from_iterator(sentences, trainer=trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

def get_ds_and_tokenizer(config):
    """Load the raw dataset, build both tokenizers and return train/validation loaders.

    Args:
        config: Dict providing ``datasource``, ``lang_src``, ``lang_tgt``,
            ``seq_len`` and ``batch_size`` (plus the keys read by
            ``get_or_build_tokenizer``). An optional ``split_seed`` integer makes
            the 90/10 train/validation split reproducible across runs, which
            matters when training is resumed from a checkpoint; without it the
            split uses the global random state, as before.

    Returns:
        ``(train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)``.
        The validation loader always uses a batch size of 1.
    """
    print('Loading dataset and Tokenizer')
    lang_src, lang_tgt = config['lang_src'], config['lang_tgt']
    ds_raw = load_dataset(f"{config['datasource']}", f"{lang_src}-{lang_tgt}", split='train')
    tokenizer_src = get_or_build_tokenizer(config, ds_raw, lang_src)
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, lang_tgt)

    # Keep 90% for training, 10% for validation
    train_ds_size = int(0.9 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    split_kwargs = {}
    split_seed = config.get('split_seed')
    if split_seed is not None:
        # A dedicated generator keeps the split stable without touching the global RNG.
        split_kwargs['generator'] = torch.Generator().manual_seed(split_seed)
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size], **split_kwargs)

    train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, lang_src, lang_tgt, config['seq_len'])
    val_ds = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, lang_src, lang_tgt, config['seq_len'])
    print('Dataset and Tokenizer loaded')

    max_len_src = 0
    max_len_tgt = 0

    # Informational only: reports the longest tokenized sentence on each side.
    print('Finding max length of source and target sentences')
    for item in tqdm(ds_raw):
        src_ids = tokenizer_src.encode(item['translation'][lang_src]).ids
        tgt_ids = tokenizer_tgt.encode(item['translation'][lang_tgt]).ids
        max_len_src = max(max_len_src, len(src_ids))
        max_len_tgt = max(max_len_tgt, len(tgt_ids))

    print(f'Max length of source sentence: {max_len_src}')
    print(f'Max length of target sentence: {max_len_tgt}')

    # Each loader is built exactly once.
    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt

In [140]:
def get_weights_file_path(config, epoch: str):
    """Return ``./<datasource>_<model_folder>/<model_basename><epoch>.pt`` as a string."""
    folder = Path('.') / f"{config['datasource']}_{config['model_folder']}"
    return str(folder / f"{config['model_basename']}{epoch}.pt")

# Find the latest weights file in the weights folder
def latest_weights_file_path(config):
    """Return the path of the checkpoint with the highest epoch number, or None.

    Candidates are the files in ``<datasource>_<model_folder>`` whose name starts
    with ``config['model_basename']``. They are ordered by the integer that
    follows the basename, so ``tmodel_100.pt`` ranks above ``tmodel_99.pt``,
    which a plain string sort would get wrong. Files without a numeric suffix
    rank below every numbered file; ties are broken by file name.

    Returns:
        The selected path as a string, or ``None`` when no file matches.
    """
    model_folder = f"{config['datasource']}_{config['model_folder']}"
    basename = config['model_basename']
    candidates = list(Path(model_folder).glob(f"{basename}*"))
    if not candidates:
        return None

    def epoch_order(path):
        suffix = path.stem[len(basename):]
        epoch = int(suffix) if suffix.isdecimal() else -1
        return (epoch, path.name)

    return str(max(candidates, key=epoch_order))

In [141]:
def load_prev_state(config, model, optimizer, initial_epoch, global_step):
    """Restore model and optimizer state from a checkpoint when ``config['preload']`` asks for one.

    Args:
        config: Dict whose ``preload`` entry is ``'latest'`` (use the newest
            checkpoint), an epoch string (use that epoch's checkpoint) or a falsy
            value (do not load anything).
        model: Module that receives ``model_state_dict``.
        optimizer: Optimizer that receives ``optimizer_state_dict``.
        initial_epoch: Value returned unchanged when nothing is loaded.
        global_step: Value returned unchanged when nothing is loaded.

    Returns:
        ``(model, optimizer, initial_epoch, global_step)``; after a load the epoch
        is the stored epoch plus one and the step is the stored step.
    """
    preload = config['preload']
    if preload == 'latest':
        model_filename = latest_weights_file_path(config)
    elif preload:
        model_filename = get_weights_file_path(config, preload)
    else:
        model_filename = None

    if not model_filename:
        print('No model to preload, starting from scratch')
        return model, optimizer, initial_epoch, global_step

    print(f'Preloading model {model_filename}')
    # Load onto the CPU first so a checkpoint written on a GPU machine can be
    # restored anywhere; load_state_dict then copies into the existing parameters.
    state = torch.load(model_filename, map_location='cpu')
    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])
    initial_epoch = state['epoch'] + 1
    global_step = state['global_step']
    return model, optimizer, initial_epoch, global_step

In [142]:
def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    """Generate target ids one token at a time, always taking the highest-scoring token.

    Decoding starts from [SOS] and stops after [EOS] has been appended or once the
    sequence holds ``max_len`` ids. ``tokenizer_src`` is accepted but not used.

    Returns:
        1-D tensor of generated ids, including the leading [SOS].
    """
    sos_idx = tokenizer_tgt.token_to_id('[SOS]')
    eos_idx = tokenizer_tgt.token_to_id('[EOS]')

    # The encoder runs once; its output is reused at every decoding step.
    encoder_output = model.encode(source, source_mask)

    # Start with just the [SOS] token, shaped (1, 1).
    generated = torch.empty(1, 1).fill_(sos_idx).type_as(source).to(device)

    while generated.size(1) != max_len:
        # Mask covering the tokens generated so far
        step_mask = causal_mask(generated.size(1)).type_as(source_mask).to(device)

        hidden = model.decode(encoder_output, source_mask, generated, step_mask)

        # Only the last position is needed to choose the next token.
        scores = model.project(hidden[:, -1])
        next_word = torch.max(scores, dim=1).indices
        appended = torch.empty(1, 1).type_as(source).fill_(next_word.item()).to(device)
        generated = torch.cat([generated, appended], dim=1)

        if next_word == eos_idx:
            break

    return generated.squeeze(0)

def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, num_examples=2):
    """Greedy-decode a few validation batches, print them and report CER, WER and BLEU.

    Args:
        model: Model handed to ``greedy_decode``; it is switched to eval mode.
        validation_ds: Iterable of batches; every batch must have batch size 1.
        tokenizer_src: Forwarded to ``greedy_decode``.
        tokenizer_tgt: Forwarded to ``greedy_decode`` and used to decode the output ids.
        max_len: Maximum number of ids to generate per example.
        device: Device the batch tensors are moved to.
        print_msg: Callable that receives every line of output.
        num_examples: Number of batches after which the loop stops.
    """
    model.eval()

    source_texts, expected, predicted = [], [], []

    console_width = 60
    separator = '-' * console_width

    with torch.no_grad():
        for count, batch in enumerate(validation_ds, start=1):
            encoder_input = batch["encoder_input"].to(device)  # (b, seq_len)
            encoder_mask = batch["encoder_mask"].to(device)    # (b, 1, 1, seq_len)

            # Greedy decoding handles one sentence at a time.
            assert encoder_input.size(0) == 1, "Batch size must be 1 for validation"

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch["src_text"][0]
            target_text = batch["tgt_text"][0]
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())

            source_texts.append(source_text)
            expected.append(target_text)
            predicted.append(model_out_text)

            # Show the source, the reference and the model output
            print_msg(separator)
            print_msg(f"{f'SOURCE: ':>12}{source_text}")
            print_msg(f"{f'TARGET: ':>12}{target_text}")
            print_msg(f"{f'PREDICTED: ':>12}{model_out_text}")

            if count == num_examples:
                print_msg(separator)
                break

    cer = torchmetrics.CharErrorRate()(predicted, expected)
    wer = torchmetrics.WordErrorRate()(predicted, expected)
    bleu = torchmetrics.BLEUScore()(predicted, expected)

    print_msg(f'Char Error Rate: {cer:.2f}\t Word Error Rate: {wer:.2f}\t BLEU: {bleu:.2f}')

In [143]:
# import torchmetrics
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

def train_model(config, model, optimizer, loss_fn, train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt, device):
    """Train ``model``, validating and saving a checkpoint at the end of every epoch.

    Training resumes from the checkpoint selected by ``config['preload']`` when
    there is one. The per-batch loss is logged to TensorBoard under 'train loss'.
    The TensorBoard writer is closed when training finishes or fails, so the
    event file is always flushed and released.

    Args:
        config: Dict providing ``experiment_name``, ``num_epochs``, ``seq_len``
            and the keys used by the checkpoint helpers.
        model: Object exposing ``encode``, ``decode`` and ``project``.
        optimizer: Optimizer over the model parameters.
        loss_fn: Callable taking (predictions, labels) and returning the loss.
        train_dataloader: Yields batches with 'encoder_input', 'decoder_input',
            'encoder_mask', 'decoder_mask' and 'label'.
        val_dataloader: Passed to ``run_validation``.
        tokenizer_src: Passed to ``run_validation``.
        tokenizer_tgt: Provides the vocabulary size used to flatten the predictions.
        device: Device the batch tensors are moved to.
    """
    model, optimizer, initial_epoch, global_step = load_prev_state(config, model, optimizer, 0, 0)

    # Tensorboard
    writer = SummaryWriter(config['experiment_name'])

    try:
        for epoch in range(initial_epoch, config['num_epochs']):
            torch.cuda.empty_cache()
            model.train()
            batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")

            for batch in batch_iterator:

                encoder_input = batch['encoder_input'].to(device) # (B, seq_len)
                decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
                encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
                decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

                # Run the tensors through the encoder, decoder and the projection layer
                encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
                decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
                proj_output = model.project(decoder_output) # (B, seq_len, vocab_size)

                # Compare the output with the label
                label = batch['label'].to(device) # (B, seq_len)

                # Flatten to (B * seq_len, vocab_size) vs (B * seq_len) for the loss
                loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
                batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

                # Log the loss
                writer.add_scalar('train loss', loss.item(), global_step)
                writer.flush()

                # Backpropagate the loss
                loss.backward()

                # Update the weights
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)

                global_step += 1

            # Run validation at the end of every epoch
            run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, lambda msg: batch_iterator.write(msg))

            # Save the model at the end of every epoch
            model_filename = get_weights_file_path(config, f"{epoch:02d}")
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'global_step': global_step
            }, model_filename)
            print(f"Saved model to {model_filename}")
    finally:
        # Release the event file even when training is interrupted or raises.
        writer.close()

In [149]:
# Hyper-parameters and file locations shared by every cell below.
config = dict(
    batch_size=8,                         # training batch size
    num_epochs=20,                        # training stops before this epoch index
    lr=10**-4,                            # learning rate given to the optimizer
    seq_len=350,                          # padded length of every sequence
    d_model=512,                          # model feature size
    datasource='opus_books',              # dataset name; also prefixes the weights folder
    lang_src="en",                        # source language key
    lang_tgt="it",                        # target language key
    model_folder="weights",               # weights folder suffix
    model_basename="tmodel_",             # checkpoint file name prefix
    preload="latest",                     # 'latest', an epoch string, or a falsy value
    tokenizer_file="tokenizer_{0}.json",  # formatted with the language code
    experiment_name="runs/tmodel",        # TensorBoard log directory
)

In [145]:
# Pick the device first as a torch.device so that later queries receive a real
# device rather than attributes of a plain string.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
if device.type == 'cuda':
    gpu_index = torch.cuda.current_device()
    print(f"Device name: {torch.cuda.get_device_name(gpu_index)}")
    print(f"Device memory: {torch.cuda.get_device_properties(gpu_index).total_memory / 1024 ** 3} GB")

# The weights folder must exist before the first checkpoint is written.
Path(f"{config['datasource']}_{config['model_folder']}").mkdir(parents=True, exist_ok=True)
train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds_and_tokenizer(config)

model = build_transformer(tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size(), config['seq_len'], config['d_model']).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)
# Labels are target-vocabulary ids, so the ignored padding id must come from the target tokenizer.
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

Using device: cuda
Device name: NVIDIA GeForce RTX 3090
Device memory: 23.68316650390625 GB
Loading dataset and Tokenizer
Dataset and Tokenizer loaded
Finding max length of source and target sentences


100%|██████████| 32332/32332 [00:03<00:00, 10412.24it/s]


Max length of source sentence: 309
Max length of target sentence: 274


In [150]:
# Start (or resume) training with the objects built in the previous cell.
train_model(
    config=config,
    model=model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    tokenizer_src=tokenizer_src,
    tokenizer_tgt=tokenizer_tgt,
    device=device,
)

Preloading model opus_books_weights/tmodel_00.pt


Processing Epoch 01: 100%|██████████| 3638/3638 [07:29<00:00,  8.09it/s, loss=4.991]


------------------------------------------------------------
    SOURCE: 'Why, he's jealous!' she thought. 'Oh dear!
    TARGET: “Perché è geloso — ella pensava. — Dio mio! com’è simpatico e sciocco!
 PREDICTED: — E che cosa è accaduto — disse lei . — E non è accaduto .
------------------------------------------------------------
    SOURCE: He had a talent for understanding art and for imitating it with accuracy and good taste, and he imagined that he possessed the real power an artist needs.
    TARGET: Aveva attitudine a intendere l’arte e ad imitare con fedeltà, con gusto, l’opera d’arte; credette così d’avere ciò che occorre all’artista.
 PREDICTED: Era un ’ altra cosa , e , e , e , e , e , e , e , e , e , per la sua vita , e la sua vita .
------------------------------------------------------------
Char Error Rate: 0.73	 Word Error Rate: 1.38	 BLEU: 0.00
Saved model to opus_books_weights/tmodel_01.pt


Processing Epoch 02: 100%|██████████| 3638/3638 [07:30<00:00,  8.08it/s, loss=4.696]


------------------------------------------------------------
    SOURCE: "I hope we are friends," was the unmoved reply; while he still watched the rising of the moon, which he had been contemplating as I approached.
    TARGET: — Spero invece che noi siamo amici, — mi disse fissando la luna.
 PREDICTED: — Io sono — rispose il signor Rochester , — che mi la luna , e che mi di .
------------------------------------------------------------
    SOURCE: "Did you see her face?"
    TARGET: — Avete veduto la sua fisonomia?
 PREDICTED: — Avete sentito il viso ?
------------------------------------------------------------
Char Error Rate: 0.74	 Word Error Rate: 1.11	 BLEU: 0.00
Saved model to opus_books_weights/tmodel_02.pt


Processing Epoch 03: 100%|██████████| 3638/3638 [07:30<00:00,  8.07it/s, loss=4.814]


------------------------------------------------------------
    SOURCE: At three she also left, promising to come back to dinner.
    TARGET: Alle tre se ne andò anche lei, promettendo di venire a pranzo.
 PREDICTED: In tre volte , dopo , si mise a la mattina .
------------------------------------------------------------
    SOURCE: They'll water the horses at the wrong time, tear good harness, change a wheel with an iron tire for one without, or drop a bolt into the threshing machine in order to break it.
    TARGET: Abbevera i cavalli così da farli scoppiare, una bardatura buona la rompe, una ruota cerchiata ve la cambia e se la beve; nella macchina per la battitura ci getta un perno, per spezzarla.
 PREDICTED: a il bosco , il quale , , un ’ altra parte , senza , o un , senza , senza , né un .
------------------------------------------------------------
Char Error Rate: 0.73	 Word Error Rate: 0.96	 BLEU: 0.00
Saved model to opus_books_weights/tmodel_03.pt


Processing Epoch 04: 100%|██████████| 3638/3638 [07:30<00:00,  8.07it/s, loss=4.289]


------------------------------------------------------------
    SOURCE: It came very warmly upon my thoughts, and indeed irresistibly, that now was the time to get me a servant, and, perhaps, a companion or assistant; and that I was plainly called by Providence to save this poor creature’s life.
    TARGET: Adesso sì che mi tornava caldamente e in guisa invincibile la mia prediletta idea di procacciarmi un servo e forse un compagno o aiutante; adesso sì, diceva a me stesso, che ne è arrivato il tempo; adesso sono l’uomo chiamato dalla Provvidenza a salvare la vita di quella povera creatura.
 PREDICTED: Mi fece un po ’ di gratitudine e di , perchè la mia storia era stata una volta , mi fu un pezzo di cui mi aveva fatto , e mi diedi a la mia vita , e mi diedi a la vita .
------------------------------------------------------------
    SOURCE: 'Yes, all my hopes are fixed on you,' said her brother.
    TARGET: — Sì, ogni speranza è in te — disse Stepan Arkad’ic.
 PREDICTED: — Sì , tutto 

Processing Epoch 05: 100%|██████████| 3638/3638 [07:31<00:00,  8.06it/s, loss=4.237]


------------------------------------------------------------
    SOURCE: Whether is it better, I ask, to be a slave in a fool's paradise at Marseilles--fevered with delusive bliss one hour--suffocating with the bitterest tears of remorse and shame the next--or to be a village-schoolmistress, free and honest, in a breezy mountain nook in the healthy heart of England?
    TARGET: Che cosa era meglio, domando: vivere schiava in un paradiso d'amore, trascinata un momento nel vortice di una felicità, e soffocata dopo subito dalle lagrime amare del rimorso e della vergogna, o esser maestra libera e onorata in un villaggio, fra le montagne dell'Inghilterra?
 PREDICTED: " Se è meglio , vi un ' altra donna , un ' altra donna , con una donna che fa , con la sua bontà e la sua bontà , la sua bontà , la sua bontà , la quale ha un ' altra donna , la testa di quelle colline ?
------------------------------------------------------------
    SOURCE: 'You could go there to-morrow!' she said.
    TARGET

Processing Epoch 06: 100%|██████████| 3638/3638 [07:29<00:00,  8.10it/s, loss=4.337]


------------------------------------------------------------
    SOURCE: "No; you shall tear yourself away, none shall help you: you shall yourself pluck out your right eye; yourself cut off your right hand: your heart shall be the victim, and you the priest to transfix it."
    TARGET: — Ti sbranerai da te, e nessuno ti aiuterà; ti strapperai l'occhio, ti strapperai la mano diritta; il cuore sarà la vittima e tu il carnefice.
 PREDICTED: — No , non vi , non vi , non vi , il cuore , e vi il cuore , e vi il cuore .
------------------------------------------------------------
    SOURCE: "Yes, sir; in different ways, I have an affection for both."
    TARGET: — Sì, signore, voglio bene a tutte e due, benché in modo differente.
 PREDICTED: — Sì , signore , e in uno stato di quelli che ho amici e di cuore .
------------------------------------------------------------
Char Error Rate: 0.64	 Word Error Rate: 1.00	 BLEU: 0.00
Saved model to opus_books_weights/tmodel_06.pt


Processing Epoch 07: 100%|██████████| 3638/3638 [07:29<00:00,  8.09it/s, loss=4.109]


------------------------------------------------------------
    SOURCE: "Gratitude!" he ejaculated; and added wildly--"Jane accept me quickly. Say, Edward--give me my name--Edward--I will marry you." "Are you in earnest? Do you truly love me?
    TARGET: — Gratitudine! — esclamò; ed aggiunse violentemente: — Jane, accettatemi subito, chiamatemi per nome, ditemi: Edoardo, Edoardo, vi voglio sposare.
 PREDICTED: — ! — continuò , — , — fate il mio ; volete che mi , Jane , vi , vi , vi .
------------------------------------------------------------
    SOURCE: The old man accomplished this with ease.
    TARGET: Il vecchio lo faceva con facilità.
 PREDICTED: Il vecchio , che si , si sempre .
------------------------------------------------------------
Char Error Rate: 0.69	 Word Error Rate: 1.04	 BLEU: 0.00
Saved model to opus_books_weights/tmodel_07.pt


Processing Epoch 08: 100%|██████████| 3638/3638 [07:29<00:00,  8.10it/s, loss=3.002]


------------------------------------------------------------
    SOURCE: I hope this delay will not have increased the difficulty of securing it."
    TARGET: Spero che questo ritardo non avrà resa più difficile l'ottenerla.
 PREDICTED: Spero che non nulla di tempo a lavorare , perché la difficoltà dell ' atto di .
------------------------------------------------------------
    SOURCE: Harris said I encouraged him.
    TARGET: Harris disse che ero io che lo incoraggiavo.
 PREDICTED: Harris gli dissi che gli risposi .
------------------------------------------------------------
Char Error Rate: 0.66	 Word Error Rate: 1.22	 BLEU: 0.00
Saved model to opus_books_weights/tmodel_08.pt


Processing Epoch 09: 100%|██████████| 3638/3638 [07:29<00:00,  8.09it/s, loss=3.813]


------------------------------------------------------------
    SOURCE: 'No, I don't.
    TARGET: — No, non lo conosco.
 PREDICTED: — No , non lo farò .
------------------------------------------------------------
    SOURCE: "Go back and fetch both."
    TARGET: — Andate a prendere queste due cose.
 PREDICTED: — Andate a prendere e a prendere due .
------------------------------------------------------------
Char Error Rate: 0.37	 Word Error Rate: 0.67	 BLEU: 0.00
Saved model to opus_books_weights/tmodel_09.pt


Processing Epoch 10: 100%|██████████| 3638/3638 [07:30<00:00,  8.08it/s, loss=3.212]


------------------------------------------------------------
    SOURCE: CHAPTER VIII
    TARGET: VIII.
 PREDICTED: VIII
------------------------------------------------------------
    SOURCE: "She is in Miss Temple's room," said the nurse.
    TARGET: — In quella della signorina Temple, — mi rispose l'infermiera.
 PREDICTED: — È nella camera della direttrice , — disse la njanja .
------------------------------------------------------------
Char Error Rate: 0.61	 Word Error Rate: 0.91	 BLEU: 0.00
Saved model to opus_books_weights/tmodel_10.pt


Processing Epoch 11: 100%|██████████| 3638/3638 [07:30<00:00,  8.07it/s, loss=2.892]


------------------------------------------------------------
    SOURCE: He knew the difficulties connected with such a step: but he had said he would do it and was now obliged to do it.
    TARGET: Egli conosceva tutte le difficoltà collegate a questa faccenda, ma aveva detto che lo avrebbe fatto ed ora doveva mettere in atto la minaccia.
 PREDICTED: Egli sapeva che le difficoltà sarebbero finite di un passo così , ma ora egli avrebbe voluto .
------------------------------------------------------------
    SOURCE: Poor Alice!
    TARGET: Povera Alice!
 PREDICTED: Alice !
------------------------------------------------------------
Char Error Rate: 0.63	 Word Error Rate: 0.85	 BLEU: 0.00
Saved model to opus_books_weights/tmodel_11.pt


Processing Epoch 12: 100%|██████████| 3638/3638 [07:30<00:00,  8.08it/s, loss=2.596]


------------------------------------------------------------
    SOURCE: 'It happens that I am expecting visitors,' replied Levin more rapidly, breaking off the splintered bits of the stick with his strong fingers. 'Or no, I am not expecting visitors and nothing has happened, yet I request you to leave.
    TARGET: — È accaduto che aspetto ospiti — disse Levin, rompendo sempre più in fretta con le dita forti le estremità del bastone che s’era spaccato. — Anzi, non aspetto ospiti e non è accaduto nulla, ma vi prego di partire.
 PREDICTED: — È questo che vado — rispose Levin , sempre più il punto che , il bastone sopra delle sinistra . — O , forse non abbiamo neppure intenzione di andare a chiedere se n ’ è andato , ma son andato via di nuovo a chiedere aiuto .
------------------------------------------------------------
    SOURCE: Her head dropped.
    TARGET: Ella chinò il capo.
 PREDICTED: Il capo si lasciò .
------------------------------------------------------------
Char Error Rat

Processing Epoch 13: 100%|██████████| 3638/3638 [07:30<00:00,  8.07it/s, loss=3.004]


------------------------------------------------------------
    SOURCE: The man, the human being, broke the spell at once. Nothing ever rode the Gytrash: it was always alone; and goblins, to my notions, though they might tenant the dumb carcasses of beasts, could scarce covet shelter in the commonplace human form.
    TARGET: La vista dell'uomo sfatò l'incantesimo, perché nessun essere umano aveva mai cavalcato Gytrash.
 PREDICTED: L ' uomo era umano , l ' aspetto umano si vedeva . Non era mai veduto il mio spirito , e mi sentivo sempre triste , perché era ben sicuro di scendere in mezzo ai miei simili modi , ma potevano esser liberato .
------------------------------------------------------------
    SOURCE: (Aside to pianist): "It is too low, old man; we'll have that over again, if you don't mind."
    TARGET: (Da parte al pianista): — È troppo basso, caro; ricominceremo da capo se non vi dispiace.
 PREDICTED: ( allegramente ): — Fa troppo forte , caro ; facciamo il tuo bel giuramen

Processing Epoch 14: 100%|██████████| 3638/3638 [07:30<00:00,  8.08it/s, loss=2.831]


------------------------------------------------------------
    SOURCE: And her clothes, they will wear out: how can she get new ones?"
    TARGET: E poi i suoi vestiti si consumeranno, e come farà ad averne altri?
 PREDICTED: E le si , le si devono esser forse più vicino a lei ?
------------------------------------------------------------
    SOURCE: While he is so occupied, I will tell you, reader, what they are: and first, I must premise that they are nothing wonderful.
    TARGET: Mentre che il signor Rochester li considerava ho il tempo di descriverli.
 PREDICTED: Mentre egli si allontana da lui , vi dirò come siano ; e appunto in che cosa sono , devo far nulla .
------------------------------------------------------------
Char Error Rate: 0.80	 Word Error Rate: 1.36	 BLEU: 0.00
Saved model to opus_books_weights/tmodel_14.pt


Processing Epoch 15: 100%|██████████| 3638/3638 [07:31<00:00,  8.06it/s, loss=2.817]


------------------------------------------------------------
    SOURCE: She went up to them, talked to them, and acted as interpreter for the woman, who spoke nothing but Russian.
    TARGET: Si avvicinava loro, conversava, faceva da interprete alla donna che non parlava nessuna lingua straniera.
 PREDICTED: Ella si avvicinò a loro , le esaminò i , e gli parve poco la donna che non esprimeva nulla .
------------------------------------------------------------
    SOURCE: The third prod did it: and he turned over on the other side, and said he would be down in a minute, and that he would have his lace-up boots.
    TARGET: Il terzo colpo fece effetto; ma Harris si voltò sull’altro lato, dicendo che si sarebbe levato in un minuto e che si sarebbe subito infilati gli stivaletti.
 PREDICTED: L ’ ultimo segno si fermò ; poi , sorridendo su di un fianco , dicendo : si sarebbe seduto su di un monticello e gli scarpe .
------------------------------------------------------------
Char Error Ra

Processing Epoch 16: 100%|██████████| 3638/3638 [07:30<00:00,  8.08it/s, loss=2.466]


------------------------------------------------------------
    SOURCE: You have heard of her misfortune?
    TARGET: Sapete la sua sventura?
 PREDICTED: Avete inteso la sua sventura ?
------------------------------------------------------------
    SOURCE: 'But we are not talking about that.'
    TARGET: — Ma noi non parliamo di questo.
 PREDICTED: — Ma noi non parli più con questo .
------------------------------------------------------------
Char Error Rate: 0.33	 Word Error Rate: 0.82	 BLEU: 0.00
Saved model to opus_books_weights/tmodel_16.pt


Processing Epoch 17: 100%|██████████| 3638/3638 [07:30<00:00,  8.07it/s, loss=2.520]


------------------------------------------------------------
    SOURCE: Now I looked back upon my desolate, solitary island as the most pleasant place in the world and all the happiness my heart could wish for was to be but there again. I stretched out my hands to it, with eager wishes—“O happy desert!” said I, “I shall never see thee more.
    TARGET: Ora io m’augurava la mia desolata e solitaria isola come se fosse il più delizioso paese dell’universo; ora tutta la felicità che il mio cuore sapesse desiderare, era il tornare ad esservi di bel nuovo; stendeva sospirando le mani verso di essa: «Oh fortunato deserto! io esclamava, non ti vedrò mai più! Misera creatura ch’io sono!
 PREDICTED: Ora mi fermai , o la mia vita errante , come la più bella e il mondo si rallegrò pienamente e il cuore di essere in me solo per essere in me . Ma io non potevo persuadermi che il cuore fosse rimasto in me , mi più e ad ogni felicità più , la piena di vita .
-----------------------------------------

Processing Epoch 18: 100%|██████████| 3638/3638 [07:30<00:00,  8.08it/s, loss=2.271]


------------------------------------------------------------
    SOURCE: "And Miss Ingram: what sort of a voice had she?"
    TARGET: — E che voce ha la signorina Ingram?
 PREDICTED: — E quella signorina Ingram ?
------------------------------------------------------------
    SOURCE: Why, not even from the members of his own family did he receive what you could call active encouragement.
    TARGET: Ebbene, neppure dai membri della propria famiglia ricevè ciò che si chiamerebbe un attivo incoraggiamento.
 PREDICTED: Perché non solo dei membri della famiglia , egli si occupava di quello che egli supponeva , avrebbe saputo delle terre .
------------------------------------------------------------
Char Error Rate: 0.64	 Word Error Rate: 1.09	 BLEU: 0.00
Saved model to opus_books_weights/tmodel_18.pt


Processing Epoch 19: 100%|██████████| 3638/3638 [07:30<00:00,  8.07it/s, loss=2.406]


------------------------------------------------------------
    SOURCE: The room Levin entered was a large one with a tiled stove and a partition.
    TARGET: La stanza era grande, con una stufa olandese e un’intelaiatura.
 PREDICTED: La sala da pranzo era in una grande e morbide , da mangiare , da un tramezzo .
------------------------------------------------------------
    SOURCE: On the day when, in the ballroom of the house in Arbat Street, she in her brown dress had gone up to him and silently plighted herself to him, on that day and in that hour a complete rupture seemed to have taken place within her soul between her former life and this other new and entirely unknown life – although in fact the old life still went on.
    TARGET: Nell’animo suo, in quel giorno in cui, in abito marrone, nella sala della casa sull’Arbat, si era avvicinata a lui in silenzio e gli si era data, nell’animo suo, in quel giorno e in quell’ora si era compiuto un completo distacco da tutta la sua vita 