# VI-EN Machine Translation using BERT-to-GPT2 Model
## AI VIETNAM
**Dataset: IWSLT15-en-vi**

**thai.nq07@gmail.com**

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
%cd "/content/drive/MyDrive/Colab Notebooks/Transformer"

/content/drive/MyDrive/Colab Notebooks/Transformer


In [3]:
!python --version

Python 3.10.12


## 1. Install libraries

In [4]:
!pip -q install 'datasets==2.9.0' 'sentencepiece==0.1.99' 'sacrebleu==2.3.1'

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
import os
import sys
import math
import copy
import heapq
import datetime

from tqdm import tqdm
import numpy as np

import sacrebleu

import datasets

import sentencepiece as spm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [6]:
print(f'numpy version: {np.__version__}')
print(f'sacrebleu version: {sacrebleu.__version__}')
print(f'datasets version: {datasets.__version__}')
print(f'sentencepiece version: {spm.__version__}')
print(f'torch version: {torch.__version__}')

numpy version: 1.23.5
sacrebleu version: 2.3.1
datasets version: 2.9.0
sentencepiece version: 0.1.99
torch version: 2.1.0+cu121


## 2. Data Preparation
**IWSLT2015-EN-VI**

In [7]:
class DataPreparing:
    """Download the IWSLT'15 en-vi corpus and dump it to parallel text files.

    For every split (``train``, ``validation``, ``test``) two files are written
    into ``save_data_dir``: ``<split>.<source_lang>`` and ``<split>.<target_lang>``,
    holding one sentence per line in the same order.

    Args:
        save_data_dir: Directory that receives the text files.
        source_lang: Key of the source language inside each ``translation`` record.
        target_lang: Key of the target language inside each ``translation`` record.
    """

    def __init__(self, save_data_dir, source_lang, target_lang):
        self.save_data_dir = save_data_dir
        self.source_lang = source_lang
        self.target_lang = target_lang

    def download_dataset(self):
        """Download and save the corpus unless ``save_data_dir`` already has content."""
        if not os.path.exists(self.save_data_dir):
            print('Create Foler')
            # makedirs (not mkdir) so nested targets such as './transformer/data'
            # work even when the parent directory does not exist yet.
            os.makedirs(self.save_data_dir, exist_ok=True)

        if len(os.listdir(self.save_data_dir)) == 0:
            print('#1-Download Dataset')
            corpus = datasets.load_dataset("mt_eng_vietnamese", "iwslt2015-en-vi")

            print('#2-Save Dataset')
            for data in ['train', 'validation', 'test']:

                source_data, target_data = self.get_data(corpus[data])

                print('Source lang: {} - {}: {}'.format(self.source_lang, data, len(source_data)))
                print('Target lang: {} - {}: {}'.format(self.target_lang, data, len(target_data)))

                self.save_data(source_data, os.path.join(self.save_data_dir, data + '.' + self.source_lang))
                self.save_data(target_data, os.path.join(self.save_data_dir, data + '.' + self.target_lang))

        else:
            # Any existing file in the directory is treated as "already downloaded".
            print('Dataset exit!')

    def get_data(self, corpus):
        """Split an iterable of ``{'translation': {lang: text}}`` records into two lists.

        Returns:
            ``(source_sentences, target_sentences)`` in corpus order.
        """
        source_data = []
        target_data = []
        for data in corpus:
            source_data.append(data['translation'][self.source_lang])
            target_data.append(data['translation'][self.target_lang])
        return source_data, target_data

    def save_data(self, data, save_path):
        """Write ``data`` to ``save_path`` as UTF-8, one sentence per line.

        Line breaks inside a sentence are replaced by a space: the files are read
        back line by line, so an embedded break would shift every following
        sentence and misalign the source and target sides.
        """
        print('=> Save data => Path: {}'.format(save_path))
        flattened = [
            sentence.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
            for sentence in data
        ]
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(flattened))

## 3. SentencePiece Tokenization

In [8]:
def train_sentencepiece(cfg, is_src=True):
    """Train a SentencePiece tokenizer on one side of the training corpus.

    Args:
        cfg: Config object providing ``data_dir``, ``sp_dir``, the language codes,
            the model prefixes, the special-token ids, ``sp_vocab_size``,
            ``character_coverage`` and ``model_type``.
        is_src: True trains on ``train.<src_lang>``; False on ``train.<tgt_lang>``.

    The trained model is written by SentencePiece under
    ``<cfg.sp_dir>/<model prefix>``; ``cfg.sp_dir`` is created when missing.
    """
    template = "--input={} \
                --pad_id={} \
                --bos_id={} \
                --eos_id={} \
                --unk_id={} \
                --model_prefix={} \
                --vocab_size={} \
                --character_coverage={} \
                --model_type={}"

    # Select the language code and model prefix for the requested side.
    lang = cfg.src_lang if is_src else cfg.tgt_lang
    prefix_name = cfg.src_model_prefix if is_src else cfg.tgt_model_prefix
    train_file = f"{cfg.data_dir}/train.{lang}"
    model_prefix = f"{cfg.sp_dir}/{prefix_name}"

    print(f"===> Processing file: {train_file}")
    if not os.path.isdir(cfg.sp_dir):
        os.mkdir(cfg.sp_dir)

    flag_values = (
        train_file,              # text file with one sentence per line
        cfg.pad_id,              # id of the padding token
        cfg.sos_id,              # id of the start-of-sentence token
        cfg.eos_id,              # id of the end-of-sentence token
        cfg.unk_id,              # id of the unknown token
        model_prefix,            # output name of the trained tokenizer model
        cfg.sp_vocab_size,       # vocabulary size
        cfg.character_coverage,  # fraction of characters covered by the model
        cfg.model_type,          # SentencePiece model type, e.g. "unigram"
    )

    spm.SentencePieceTrainer.Train(template.format(*flag_values))

## 4. Dataloader

In [9]:
class NMTDataset(Dataset):
    """Parallel-corpus dataset producing fixed-length token-id tensors.

    Each item is ``(src, tgt_input, tgt_output)`` where
    ``src = [sos] + tokens + [eos]``, ``tgt_input = [sos] + tokens`` and
    ``tgt_output = tokens + [eos]``, each padded or truncated to ``cfg.seq_len``.

    NOTE: truncation is a plain slice, so a sentence longer than ``cfg.seq_len``
    loses its trailing tokens, including the EOS id.

    Args:
        cfg: Config with ``data_dir``, ``sp_dir``, language codes, model prefixes,
            ``seq_len`` and the ``pad_id`` / ``sos_id`` / ``eos_id`` token ids.
        data_type: Split name used as the file stem (``<data_type>.<lang>``).
    """

    def __init__(self, cfg, data_type="train"):
        super().__init__()
        self.cfg = cfg

        self.sp_src, self.sp_tgt = self.load_sp_tokenizer()
        self.src_texts, self.tgt_texts = self.read_data(data_type)

        src_tokenized_sequences = self.texts_to_sequences(self.src_texts, True)
        tgt_input_tokenized_sequences, tgt_output_tokenized_sequences = self.texts_to_sequences(self.tgt_texts, False)

        self.src_data = torch.LongTensor(src_tokenized_sequences)
        self.input_tgt_data = torch.LongTensor(tgt_input_tokenized_sequences)
        self.output_tgt_data = torch.LongTensor(tgt_output_tokenized_sequences)

    def read_data(self, data_type):
        """Read the source and target files of one split.

        The files are opened explicitly as UTF-8 (the encoding they are written
        with) instead of relying on the platform default.

        Returns:
            ``(src_lines, tgt_lines)`` as returned by ``readlines()``.
        """
        src_path = f"{self.cfg.data_dir}/{data_type}.{self.cfg.src_lang}"
        print(f"===> Load data from: {src_path}")
        with open(src_path, 'r', encoding='utf-8') as f:
            src_texts = f.readlines()

        tgt_path = f"{self.cfg.data_dir}/{data_type}.{self.cfg.tgt_lang}"
        print(f"===> Load data from: {tgt_path}")
        with open(tgt_path, 'r', encoding='utf-8') as f:
            trg_texts = f.readlines()

        return src_texts, trg_texts

    def load_sp_tokenizer(self):
        """Load the source and target SentencePiece models from ``cfg.sp_dir``."""
        sp_src = spm.SentencePieceProcessor()
        sp_src.Load(f"{self.cfg.sp_dir}/{self.cfg.src_model_prefix}.model")

        sp_tgt = spm.SentencePieceProcessor()
        sp_tgt.Load(f"{self.cfg.sp_dir}/{self.cfg.tgt_model_prefix}.model")

        return sp_src, sp_tgt

    def texts_to_sequences(self, texts, is_src=True):
        """Tokenize ``texts`` and pad/truncate every sequence to ``cfg.seq_len``.

        Returns:
            For ``is_src=True`` a list of id lists; otherwise a pair
            ``(decoder_inputs, decoder_targets)`` of such lists.
        """
        if is_src:
            src_tokenized_sequences = []
            for text in tqdm(texts):
                tokenized = self.sp_src.EncodeAsIds(text.strip())
                src_tokenized_sequences.append(
                    pad_or_truncate([self.cfg.sos_id] + tokenized + [self.cfg.eos_id], self.cfg.seq_len, self.cfg.pad_id)
                )
            return src_tokenized_sequences
        else:
            tgt_input_tokenized_sequences = []
            tgt_output_tokenized_sequences = []
            for text in tqdm(texts):
                tokenized = self.sp_tgt.EncodeAsIds(text.strip())
                # Teacher forcing: the input is shifted right by one w.r.t. the output.
                tgt_input = [self.cfg.sos_id] + tokenized
                tgt_output = tokenized + [self.cfg.eos_id]
                tgt_input_tokenized_sequences.append(pad_or_truncate(tgt_input, self.cfg.seq_len, self.cfg.pad_id))
                tgt_output_tokenized_sequences.append(pad_or_truncate(tgt_output, self.cfg.seq_len, self.cfg.pad_id))

            return tgt_input_tokenized_sequences, tgt_output_tokenized_sequences

    def __getitem__(self, idx):
        return self.src_data[idx], self.input_tgt_data[idx], self.output_tgt_data[idx]

    def __len__(self):
        # Number of sentence pairs = first dimension of the source tensor.
        return self.src_data.shape[0]

def pad_or_truncate(tokenized_sequence, seq_len, pad_id):
    """Return a copy of ``tokenized_sequence`` with exactly ``seq_len`` items.

    Shorter sequences are right-padded with ``pad_id``; longer ones are cut off
    after ``seq_len`` items. The input is never modified (the previous version
    extended the caller's list in place on the padding path only).

    Args:
        tokenized_sequence: Sequence of token ids.
        seq_len: Required output length.
        pad_id: Id used for padding.

    Returns:
        A new list of length ``seq_len``.
    """
    clipped = list(tokenized_sequence[:seq_len])
    return clipped + [pad_id] * (seq_len - len(clipped))

def get_data_loader(cfg, data_type='train', shuffle=False):
    """Build the dataset for one split together with a ``DataLoader`` over it.

    Args:
        cfg: Config forwarded to ``NMTDataset``; ``cfg.batch_size`` sets the batch size.
        data_type: Split name (``'train'``, ``'validation'`` or ``'test'``).
        shuffle: Whether the loader reshuffles the data every epoch. Defaults to
            False for every split, which is what the previous implementation did
            (both of its branches set ``shuffle = False``).
            NOTE(review): training normally wants ``shuffle=True``; pass it
            explicitly once a fixed batch order is no longer needed.

    Returns:
        ``(dataset, dataloader)``.
    """
    dataset = NMTDataset(cfg, data_type)
    dataloader = DataLoader(dataset, batch_size=cfg.batch_size, shuffle=shuffle)

    return dataset, dataloader

## 5. Transformer Model
Ref: https://pytorch.org/tutorials/beginner/translation_transformer.html

In [10]:
class MultiheadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: Feature size of the inputs and of the output.
        num_heads: Number of attention heads; must divide ``d_model``.
        drop_out: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``. Without
            this check the head split in ``forward`` can either fail with an
            opaque ``view`` error or silently mix features across positions.
    """

    def __init__(self, d_model, num_heads, drop_out=0.1):
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )

        # Magnitude of the large negative score written into masked positions.
        self.inf = 1e9

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # W^Q, W^K, W^V in the paper
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(drop_out)
        self.attn_softmax = nn.Softmax(dim=-1)

        # Final output linear transformation
        self.w_0 = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q, k, v: Tensors whose first dimension is the batch and last is ``d_model``.
            mask: Optional tensor; positions where it equals 0 are excluded. A
                head dimension is inserted at dim 1 before broadcasting
                against the score tensor.

        Returns:
            Tensor of shape ``(B, L_q, d_model)``.
        """
        batch_size = q.shape[0]

        # Linear projection, then split the feature axis into heads.
        q = self.w_q(q).view(batch_size, -1, self.num_heads, self.d_k) # (B, L, num_heads, d_k)
        k = self.w_k(k).view(batch_size, -1, self.num_heads, self.d_k) # (B, L, num_heads, d_k)
        v = self.w_v(v).view(batch_size, -1, self.num_heads, self.d_k) # (B, L, num_heads, d_k)

        # For convenience, convert all tensors in size (B, num_heads, L, d_k)
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        attn_values = self.self_attention(q, k, v, mask=mask) # (B, num_heads, L, d_k)

        # Merge the heads back into a single feature axis.
        concat_output = attn_values.transpose(1, 2)\
            .contiguous().view(batch_size, -1, self.d_model) # (B, L, d_model)

        return self.w_0(concat_output)

    def self_attention(self, q, k, v, mask=None):
        """Scaled dot-product attention on per-head tensors ``(B, num_heads, L, d_k)``."""
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) # (B, num_heads, L_q, L_k)
        attn_scores = attn_scores / math.sqrt(self.d_k)

        # Masked spots get a large negative score so softmax assigns them ~0 weight.
        if mask is not None:
            mask = mask.unsqueeze(1) # (B, 1, L) => (B, 1, 1, L) or (B, L, L) => (B, 1, L, L)
            attn_scores = attn_scores.masked_fill(mask == 0, -self.inf)

        attn_distribs = self.attn_softmax(attn_scores)

        attn_distribs = self.dropout(attn_distribs)
        attn_values = torch.matmul(attn_distribs, v) # (B, num_heads, L, d_k)

        return attn_values

class FeedFowardLayer(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size.
        drop_out: Dropout probability applied after the activation.
    """

    def __init__(self, d_model, d_ff, drop_out=0.1):
        super().__init__()
        # Submodule names and creation order are kept: they define state_dict keys.
        self.linear_1 = nn.Linear(in_features=d_model, out_features=d_ff, bias=True)
        self.relu = nn.ReLU()
        self.linear_2 = nn.Linear(in_features=d_ff, out_features=d_model, bias=True)
        self.dropout = nn.Dropout(p=drop_out)

    def forward(self, x):
        """Map ``(B, L, d_model)`` to ``(B, L, d_model)`` through the ``d_ff`` hidden layer."""
        hidden = self.dropout(self.relu(self.linear_1(x))) # (B, L, d_ff)
        return self.linear_2(hidden) # (B, L, d_model)


class LayerNormalization(nn.Module):
    """Thin wrapper around ``nn.LayerNorm`` over the last (``d_model``) dimension.

    Args:
        d_model: Size of the normalized feature dimension.
        eps: Numerical-stability term passed to ``nn.LayerNorm``.
    """

    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.layer = nn.LayerNorm(
            normalized_shape=[d_model],
            eps=self.eps,
            elementwise_affine=True,
        )

    def forward(self, x):
        """Return ``x`` normalized over its last dimension, with learned scale and shift."""
        return self.layer(x)

class PositionalEncoder(nn.Module):
    """Scale embeddings by ``sqrt(d_model)`` and add fixed sinusoidal position codes.

    Args:
        seq_len: Maximum sequence length; the table has this many rows.
        d_model: Embedding size.
        device: Device on which the table is created.

    The table is registered as a non-persistent buffer: it follows the module
    through ``.to(...)`` but is not part of ``state_dict``, so checkpoints saved
    before this change still load unchanged.

    NOTE(review): the exponent is ``2 * i / d_model`` with ``i`` the raw feature
    index, which differs from the formulation in "Attention Is All You Need"
    (there the exponent is ``2 * (i // 2) / d_model``). It is intentionally left
    as is: existing checkpoints were trained with exactly these values and
    changing the table would invalidate them.
    """

    def __init__(self, seq_len, d_model, device):
        super().__init__()
        self.seq_len = seq_len
        self.d_model = d_model
        # Make initial positional encoding matrix with 0
        pe_matrix = torch.zeros(seq_len, d_model) # (L, d_model)

        # Even feature indices get sin, odd ones cos, of the same angle formula.
        for pos in range(seq_len):
            for i in range(d_model):
                angle = pos / (10000 ** (2 * i / d_model))
                pe_matrix[pos, i] = math.sin(angle) if i % 2 == 0 else math.cos(angle)

        pe_matrix = pe_matrix.unsqueeze(0).to(device=device) # (1, L, d_model)
        self.register_buffer('positional_encoding', pe_matrix, persistent=False)

    def forward(self, x):
        """Encode ``x`` of shape ``(B, L, d_model)`` with ``L <= seq_len``.

        Only the first ``L`` rows of the table are added, so inputs shorter than
        ``seq_len`` are accepted as well as full-length ones.
        """
        x = x * math.sqrt(self.d_model) # (B, L, d_model)
        x = x + self.positional_encoding[:, :x.size(1)] # (B, L, d_model)

        return x

class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer normalizes its input first, then adds the dropped-out result
    back onto the un-normalized input (residual connection).
    """

    def __init__(self, d_model, num_heads, d_ff, drop_out=0.1):
        super().__init__()
        # Self-attention sub-layer (creation order kept so parameter order is unchanged).
        self.layer_norm_1 = LayerNormalization(d_model)
        self.multihead_attention = MultiheadAttention(d_model, num_heads, drop_out)
        self.drop_out_1 = nn.Dropout(drop_out)

        # Feed-forward sub-layer.
        self.layer_norm_2 = LayerNormalization(d_model)
        self.feed_forward = FeedFowardLayer(d_model, d_ff, drop_out)
        self.drop_out_2 = nn.Dropout(drop_out)

    def forward(self, x, e_mask):
        """Apply the block to ``x`` (B, L, d_model) using ``e_mask`` as attention mask."""
        normed = self.layer_norm_1(x) # (B, L, d_model)
        attended = self.multihead_attention(normed, normed, normed, mask=e_mask)
        x = x + self.drop_out_1(attended) # (B, L, d_model)

        transformed = self.feed_forward(self.layer_norm_2(x)) # (B, L, d_model)
        return x + self.drop_out_2(transformed) # (B, L, d_model)

class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, encoder attention, feed-forward.

    Each sub-layer normalizes its input first, then adds the dropped-out result
    back onto the un-normalized input (residual connection).
    """

    def __init__(self, d_model, num_heads, d_ff, drop_out=0.1):
        super().__init__()
        # Masked self-attention sub-layer (creation order kept so parameter order is unchanged).
        self.layer_norm_1 = LayerNormalization(d_model)
        self.masked_multihead_attention = MultiheadAttention(d_model, num_heads, drop_out)
        self.drop_out_1 = nn.Dropout(drop_out)

        # Attention over the encoder output.
        self.layer_norm_2 = LayerNormalization(d_model)
        self.multihead_attention = MultiheadAttention(d_model, num_heads, drop_out)
        self.drop_out_2 = nn.Dropout(drop_out)

        # Feed-forward sub-layer.
        self.layer_norm_3 = LayerNormalization(d_model)
        self.feed_forward = FeedFowardLayer(d_model, d_ff, drop_out)
        self.drop_out_3 = nn.Dropout(drop_out)

    def forward(self, x, e_output, e_mask,  d_mask):
        """Apply the block.

        Args:
            x: Decoder input, (B, L, d_model).
            e_output: Encoder output used as keys and values of the second attention.
            e_mask: Mask for the attention over ``e_output``.
            d_mask: Mask for the decoder self-attention.
        """
        normed = self.layer_norm_1(x) # (B, L, d_model)
        self_attended = self.masked_multihead_attention(normed, normed, normed, mask=d_mask)
        x = x + self.drop_out_1(self_attended) # (B, L, d_model)

        queries = self.layer_norm_2(x) # (B, L, d_model)
        cross_attended = self.multihead_attention(queries, e_output, e_output, mask=e_mask)
        x = x + self.drop_out_2(cross_attended) # (B, L, d_model)

        transformed = self.feed_forward(self.layer_norm_3(x)) # (B, L, d_model)
        return x + self.drop_out_3(transformed) # (B, L, d_model)

class Encoder(nn.Module):
    """Stack of ``num_layers`` encoder layers followed by a final layer norm."""

    def __init__(self, num_layers, d_model, num_heads, d_ff, drop_out=0.1):
        super().__init__()
        self.num_layers = num_layers
        stack = [EncoderLayer(d_model, num_heads, d_ff, drop_out) for _ in range(num_layers)]
        self.layers = nn.ModuleList(stack)
        self.layer_norm = LayerNormalization(d_model)

    def forward(self, x, e_mask):
        """Run ``x`` through every layer in order, then normalize the result."""
        for layer in self.layers:
            x = layer(x, e_mask)

        return self.layer_norm(x)

class Decoder(nn.Module):
    """Stack of ``num_layers`` decoder layers followed by a final layer norm."""

    def __init__(self, num_layers, d_model, num_heads, d_ff, drop_out):
        super().__init__()
        self.num_layers = num_layers
        stack = [DecoderLayer(d_model, num_heads, d_ff, drop_out) for _ in range(num_layers)]
        self.layers = nn.ModuleList(stack)
        self.layer_norm = LayerNormalization(d_model)

    def forward(self, x, e_output, e_mask, d_mask):
        """Run ``x`` through every layer in order, then normalize the result.

        ``e_output``, ``e_mask`` and ``d_mask`` are passed unchanged to each layer.
        """
        for layer in self.layers:
            x = layer(x, e_output, e_mask, d_mask)

        return self.layer_norm(x)

class Transformer(nn.Module):
    """Encoder-decoder Transformer producing log-probabilities over the vocabulary.

    Args:
        cfg: Config providing ``sp_vocab_size``, ``d_model``, ``seq_len``,
            ``device``, ``num_layers``, ``num_heads``, ``d_ff`` and ``drop_out``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        conf = self.cfg

        # Source and target use separate embedding tables of the same vocabulary size.
        self.src_embedding = nn.Embedding(conf.sp_vocab_size, conf.d_model)
        self.tgt_embedding = nn.Embedding(conf.sp_vocab_size, conf.d_model)
        # One positional encoder is shared by both sides.
        self.positional_encoder = PositionalEncoder(conf.seq_len, conf.d_model, conf.device)
        self.encoder = Encoder(
            conf.num_layers, conf.d_model, conf.num_heads, conf.d_ff, conf.drop_out
        )
        self.decoder = Decoder(
            conf.num_layers, conf.d_model, conf.num_heads, conf.d_ff, conf.drop_out
        )
        self.output_linear = nn.Linear(conf.d_model, conf.sp_vocab_size)
        # Despite the attribute name this is a LogSoftmax over the vocabulary axis.
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, src_input, tgt_input, e_mask=None, d_mask=None):
        """Return log-probabilities ``(B, L, vocab)`` for token ids ``src_input`` / ``tgt_input`` (B, L).

        ``e_mask`` is used by the encoder and by the decoder's attention over the
        encoder output; ``d_mask`` by the decoder self-attention.
        """
        src_encoded = self.positional_encoder(self.src_embedding(src_input)) # (B, L, d_model)
        tgt_encoded = self.positional_encoder(self.tgt_embedding(tgt_input)) # (B, L, d_model)

        memory = self.encoder(src_encoded, e_mask) # (B, L, d_model)
        decoded = self.decoder(tgt_encoded, memory, e_mask, d_mask) # (B, L, d_model)

        return self.softmax(self.output_linear(decoded)) # (B, L, trg_vocab_size)

## 7. Trainer

In [11]:
class Trainer():
    """Build the model and drive training, validation and greedy-search inference.

    Args:
        cfg: Config object (see ``NMTConfig``) with model, data, tokenizer and
            checkpoint settings.
        is_train: When True the loss function and the train/validation data
            loaders are created.
        load_ckpt: When True model weights, optimizer state and the best
            validation loss are restored from
            ``<cfg.ckpt_dir>/<cfg.ckpt_name>``; otherwise every parameter with
            more than one dimension is Xavier-initialised.

    NOTE(review): the criterion is ``nn.NLLLoss()`` without ``ignore_index``, so
    padded target positions contribute to the loss. It is left unchanged because
    the ``loss`` stored in existing checkpoints was computed this way and is
    compared against new validation losses.
    """

    def __init__(self, cfg, is_train=True, load_ckpt=True):
        self.cfg = cfg

        print("Loading Transformer model & Adam optimizer...")
        self.model = Transformer(self.cfg).to(self.cfg.device)

        self.optim = torch.optim.Adam(self.model.parameters(), lr=self.cfg.learning_rate)

        # Starting value for "best validation loss"; replaced when a checkpoint is loaded.
        self.best_loss = 100.0
        if load_ckpt:
            print("Loading checkpoint...")
            # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
            checkpoint = torch.load(f"{self.cfg.ckpt_dir}/{self.cfg.ckpt_name}", map_location=self.cfg.device)
            self.model.load_state_dict(checkpoint['model_state_dict'])
            self.optim.load_state_dict(checkpoint['optim_state_dict'])
            self.best_loss = checkpoint['loss']
        else:
            print("Initializing the model...")
            for p in self.model.parameters():
                if p.dim() > 1:
                    nn.init.xavier_uniform_(p)

        # Prepare Tokenizer
        self.prepare_tokenizer()

        # inference() needs both tokenizers regardless of the mode. They used to be
        # loaded only when is_train=False, which made inference() fail with an
        # AttributeError on a Trainer that had just been trained.
        print("Loading sentencepiece tokenizer...")
        self.sp_src = spm.SentencePieceProcessor()
        self.sp_tgt = spm.SentencePieceProcessor()
        self.sp_src.Load(f"{self.cfg.sp_dir}/{self.cfg.src_model_prefix}.model")
        self.sp_tgt.Load(f"{self.cfg.sp_dir}/{self.cfg.tgt_model_prefix}.model")

        if is_train:
            # Load loss function
            print("Loading loss function...")
            self.criterion = nn.NLLLoss()

            # Load dataloaders
            print("Loading dataloaders...")
            self.train_dataset, self.train_loader = get_data_loader(self.cfg, 'train')
            self.valid_dataset, self.valid_loader = get_data_loader(self.cfg, 'validation')

        elif not os.path.exists(f"{self.cfg.ckpt_dir}/{self.cfg.ckpt_name}"):
            # Inference without a checkpoint on disk: warn, as before.
            print("Checkpoint path not exits...")

        print("Setting finished.")

    def prepare_tokenizer(self):
        """Train both SentencePiece tokenizers unless ``cfg.sp_dir`` already exists."""
        if not os.path.isdir(self.cfg.sp_dir):
            print('Training sentencepiece tokenizer...')
            train_sentencepiece(self.cfg, is_src=True)
            train_sentencepiece(self.cfg, is_src=False)
        else:
            print('Tokenization already...')

    def train(self):
        """Train for ``cfg.num_epochs`` epochs, validating after each one.

        A checkpoint (model state, optimizer state, loss) is written whenever the
        mean validation loss improves on ``self.best_loss``.
        """
        print("Training...")

        for epoch in range(1, self.cfg.num_epochs+1):
            print(f"#################### Epoch: {epoch} ####################")

            self.model.train()
            train_losses = []
            start_time = datetime.datetime.now()

            bar = tqdm(enumerate(self.train_loader), total=len(self.train_loader), desc='TRAINING')

            for batch_idx, batch in bar:
                src_input, tgt_input, tgt_output = batch
                src_input, tgt_input, tgt_output = src_input.to(self.cfg.device), tgt_input.to(self.cfg.device), tgt_output.to(self.cfg.device)

                e_mask, d_mask = self.create_mask(src_input, tgt_input)

                logits = self.model(src_input, tgt_input, e_mask, d_mask)

                self.optim.zero_grad()

                # Flatten to (B*L, vocab) vs (B*L,) as expected by NLLLoss.
                loss = self.criterion(
                    logits.view(-1, logits.shape[-1]),
                    tgt_output.reshape(-1)
                )

                loss.backward()
                self.optim.step()

                train_losses.append(loss.item())

                # Release batch tensors eagerly to keep GPU memory low between steps.
                del src_input, tgt_input, tgt_output, e_mask, d_mask, logits
                torch.cuda.empty_cache()

                bar.set_postfix(TRAIN="Epoch {} - Batch_Loss {:.2f} - Train_Loss {:.2f} - Best_Valid_Loss {:.2f}".format(
                    epoch,
                    loss.item(),
                    np.mean(train_losses),
                    self.best_loss
                    )
                )

            end_time = datetime.datetime.now()
            training_time = end_time - start_time

            mean_train_loss = np.mean(train_losses)
            print(f"Train loss: {mean_train_loss} || Time: {training_time} secs")

            valid_loss, valid_time = self.validation()

            if valid_loss < self.best_loss:
                # makedirs so a nested or already existing checkpoint directory is fine.
                os.makedirs(self.cfg.ckpt_dir, exist_ok=True)

                self.best_loss = valid_loss
                state_dict = {
                    'model_state_dict': self.model.state_dict(),
                    'optim_state_dict': self.optim.state_dict(),
                    'loss': self.best_loss
                }
                torch.save(state_dict, f"{self.cfg.ckpt_dir}/{self.cfg.ckpt_name}")
                print(f"***** Current best checkpoint is saved. *****")

            print(f"Best valid loss: {self.best_loss}")
            # valid_time is the duration of validation(), so label it as such.
            print(f"Valid loss: {valid_loss} || Validation time: {valid_time}")

        print(f"Training finished!")

    def validation(self):
        """Evaluate on the validation loader without gradient tracking.

        Returns:
            ``(mean_validation_loss, "<elapsed> secs")``.
        """
        self.model.eval()

        valid_losses = []
        start_time = datetime.datetime.now()

        with torch.no_grad():
            bar = tqdm(enumerate(self.valid_loader), total=len(self.valid_loader), desc='VALIDATIION')
            for batch_idx, batch in bar:
                src_input, tgt_input, tgt_output = batch
                src_input, tgt_input, tgt_output = src_input.to(self.cfg.device), tgt_input.to(self.cfg.device), tgt_output.to(self.cfg.device)

                e_mask, d_mask = self.create_mask(src_input, tgt_input)

                logits = self.model(src_input, tgt_input, e_mask, d_mask)

                loss = self.criterion(
                    logits.view(-1, logits.shape[-1]),
                    tgt_output.reshape(-1)
                )

                valid_losses.append(loss.item())

                bar.set_postfix(TRAIN="Batch_Loss {:.2f} - Valid_Loss {:.2f}".format(
                    loss.item(),
                    np.mean(valid_losses)
                    )
                )

                del src_input, tgt_input, tgt_output, e_mask, d_mask, logits
                torch.cuda.empty_cache()

        end_time = datetime.datetime.now()
        validation_time = end_time - start_time

        mean_valid_loss = np.mean(valid_losses)

        return mean_valid_loss, f"{validation_time} secs"

    def inference(self, input_sentence):
        """Translate one source sentence with greedy decoding and return the text.

        The encoder pass and the decoding loop run under ``torch.no_grad()`` so
        no autograd graph is built during inference.
        """
        self.model.eval()

        print("Preprocessing input sentence...")
        tokenized = self.sp_src.EncodeAsIds(input_sentence)
        src_data = torch.LongTensor(
            pad_or_truncate([self.cfg.sos_id] + tokenized + [self.cfg.eos_id], self.cfg.seq_len, self.cfg.pad_id)
        ).unsqueeze(0).to(self.cfg.device)

        e_mask = (src_data != self.cfg.pad_id).unsqueeze(1).to(self.cfg.device) # (1, 1, L)

        start_time = datetime.datetime.now()

        print("Encoding input sentence...")
        with torch.no_grad():
            src_data = self.model.src_embedding(src_data)
            src_data = self.model.positional_encoder(src_data)
            e_output = self.model.encoder(src_data, e_mask) # (1, L, d_model)

            result = self.greedy_search(e_output, e_mask)

        end_time = datetime.datetime.now()

        total_inference_time = end_time - start_time

        print(f"Input: {input_sentence}")
        print(f"Result: {result}")
        print(f"Inference finished! || Total inference time: {total_inference_time}secs")
        return result

    def greedy_search(self, e_output, e_mask):
        """Decode token by token, always taking the arg-max, and return the decoded text.

        The decoder is re-run on the whole (padded) target buffer at every step;
        step ``i`` reads the prediction at position ``i`` and writes it to
        position ``i + 1``. Decoding stops at the EOS id or after ``cfg.seq_len``
        steps. The ids handed to the tokenizer include the EOS id when one was
        written into the buffer.
        """
        device = self.cfg.device
        seq_len = self.cfg.seq_len

        last_words = torch.LongTensor([self.cfg.pad_id] * seq_len).to(device) # (L)
        last_words[0] = self.cfg.sos_id # (L)
        cur_len = 1

        # The causal (lower-triangular) mask does not change between steps: build it once.
        nopeak_mask = torch.tril(
            torch.ones([1, seq_len, seq_len], dtype=torch.bool, device=device)
        ) # (1, L, L)

        for i in range(seq_len):
            pad_mask = (last_words.unsqueeze(0) != self.cfg.pad_id).unsqueeze(1) # (1, 1, L)
            d_mask = pad_mask & nopeak_mask # (1, L, L) padding false

            tgt_embedded = self.model.tgt_embedding(last_words.unsqueeze(0))
            tgt_positional_encoded = self.model.positional_encoder(tgt_embedded)
            decoder_output = self.model.decoder(
                tgt_positional_encoded,
                e_output,
                e_mask,
                d_mask
            ) # (1, L, d_model)

            output = self.model.softmax(
                self.model.output_linear(decoder_output)
            ) # (1, L, trg_vocab_size)

            output = torch.argmax(output, dim=-1) # (1, L)
            last_word_id = output[0][i].item()

            # The prediction of the final step has no slot left in the buffer.
            if i < seq_len - 1:
                last_words[i+1] = last_word_id
                cur_len += 1

            if last_word_id == self.cfg.eos_id:
                break

        # Drop the leading SOS; also drop trailing padding when the buffer is not full.
        if last_words[-1].item() == self.cfg.pad_id:
            decoded_output = last_words[1:cur_len].tolist()
        else:
            decoded_output = last_words[1:].tolist()
        decoded_output = self.sp_tgt.decode_ids(decoded_output)

        return decoded_output

    def create_mask(self, src_input, tgt_input):
        """Build attention masks from padded id tensors.

        Args:
            src_input: Source ids, (B, L_src).
            tgt_input: Decoder-input ids, (B, L_tgt).

        Returns:
            ``e_mask`` (B, 1, L_src): True at non-pad source positions.
            ``d_mask`` (B, L_tgt, L_tgt): True where the key position is not
            padding and is not after the query position.
        """
        e_mask = (src_input != self.cfg.pad_id).unsqueeze(1)  # (B, 1, L)
        d_mask = (tgt_input != self.cfg.pad_id).unsqueeze(1)  # (B, 1, L)

        # Size and device follow the actual batch rather than cfg.seq_len / a CPU
        # round trip; for full-length batches the result is identical.
        tgt_len = tgt_input.size(1)
        nopeak_mask = torch.tril(
            torch.ones([1, tgt_len, tgt_len], dtype=torch.bool, device=tgt_input.device)
        )  # (1, L, L) to triangular shape
        d_mask = d_mask & nopeak_mask  # (B, L, L) padding false

        return e_mask, d_mask

## 8. Training

In [12]:
class BaseConfig:
    """Base encoder-decoder config: every keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        # Instance attributes set here shadow class-level defaults of subclasses.
        for name in kwargs:
            setattr(self, name, kwargs[name])

class NMTConfig(BaseConfig):
    """Default settings for the vi -> en translation experiment.

    All values are class attributes; individual ones can be overridden per
    instance through keyword arguments (handled by ``BaseConfig``).
    """

    # --- Dataset -----------------------------------------------------------
    data_dir = './transformer/data'  # directory holding <split>.<lang> text files
    # data_dir = '/content/test_download_data/data'
    src_lang = 'vi'
    tgt_lang = 'en'

    # --- Tokenizer ---------------------------------------------------------
    sp_dir = f'{data_dir}/sp'  # where the SentencePiece models are stored
    pad_id = 0
    sos_id = 1
    eos_id = 2
    unk_id = 3
    src_model_prefix = f'sp_{src_lang}'
    tgt_model_prefix = f'sp_{tgt_lang}'
    sp_vocab_size = 10000  # used for both the source and the target tokenizer
    character_coverage = 1.0
    model_type = 'unigram'

    # --- Model -------------------------------------------------------------
    num_heads = 8
    num_layers = 6  # used for both the encoder and the decoder stack
    d_model = 512
    d_ff = 2048
    drop_out = 0.1

    # --- Training ----------------------------------------------------------
    # Evaluated once, when the class body runs.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    learning_rate = 1e-4
    batch_size = 64
    seq_len = 150  # fixed length every sequence is padded/truncated to
    num_epochs = 2
    ckpt_dir = './transformer'
    ckpt_name = 'best_ckpt.tar'

In [13]:
# !rm -r '/content/test_download_data'

In [14]:
# !mkdir '/content/test_download_data/'
# !mkdir '/content/test_download_data/data'

In [15]:
cfg = NMTConfig()

In [16]:
# data_pre = DataPreparing(cfg.data_dir, cfg.src_lang, cfg.tgt_lang)
# data_pre.download_dataset()

In [17]:
# trainer = Trainer(cfg, is_train=True, load_ckpt=False)
# trainer.train()

## 9. Evaluate

In [18]:
# Record the library versions this inference run was produced with.
print(f'numpy version: {np.__version__}')
print(f'datasets version: {datasets.__version__}')
print(f'sentencepiece version: {spm.__version__}')
print(f'torch version: {torch.__version__}')
print("\n")
# Fresh default config; is_train=False skips the loss function and data loaders,
# load_ckpt=True restores the weights from <ckpt_dir>/<ckpt_name>.
cfg = NMTConfig()
trainer = Trainer(cfg, is_train=False, load_ckpt=True)
# Smoke test: translate a single Vietnamese sentence with greedy decoding.
trainer.inference('Tôi yêu bạn.')

numpy version: 1.23.5
datasets version: 2.9.0
sentencepiece version: 0.1.99
torch version: 2.1.0+cu121


Loading Transformer model & Adam optimizer...
Loading checkpoint...
Tokenization already...
Loading sentencepiece tokenizer...
Setting finished.
Preprocessing input sentence...
Encoding input sentence...
Input: Tôi yêu bạn.
Result: I love you .
Inference finished! || Total inference time: 0:00:05.059223secs


'I love you .'

In [19]:
!cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 79
model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
stepping	: 0
microcode	: 0xffffffff
cpu MHz		: 2199.998
cache size	: 56320 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 1
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap xsaveopt arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa mmio_stale_data retbleed
bogomips	: 4399.99
clflush size	: 64
cache_alignment	: 64
addres

In [20]:
def evaluate(cfg, trainer):
    """Translate the test set with ``trainer`` and score it with corpus-level BLEU.

    Args:
        cfg: Config providing ``data_dir``, ``src_lang`` and ``tgt_lang``; the
            files ``<data_dir>/test.<lang>`` are read as UTF-8.
        trainer: Object whose ``inference(sentence)`` returns the translation text.

    Returns:
        ``(pred_texts, bleu_score)`` where ``bleu_score`` is the value returned
        by ``sacrebleu.corpus_bleu``.

    Raises:
        ValueError: If the source and reference files differ in line count. The
            previous code evaluated ``len(a) == len(b)`` and discarded the
            result, so a misaligned test set was scored silently.
    """
    def _read_lines(path):
        # One stripped sentence per line.
        with open(path, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f.readlines()]

    src_texts = _read_lines(cfg.data_dir + '/test.' + cfg.src_lang)
    tgt_texts = _read_lines(cfg.data_dir + '/test.' + cfg.tgt_lang)

    if len(src_texts) != len(tgt_texts):
        raise ValueError(
            f"Test set is misaligned: {len(src_texts)} source lines "
            f"vs {len(tgt_texts)} reference lines"
        )

    pred_texts = []
    for sent in tqdm(src_texts):
        pred_texts.append(trainer.inference(sent))

    bleu_score = sacrebleu.corpus_bleu(pred_texts, [tgt_texts], force=True)

    return pred_texts, bleu_score

In [21]:
# pred_texts, bleu_score = evaluate(cfg, trainer)

In [22]:
# bleu_score

# **Training step by step**

## **Load data**

In [23]:
input_data = NMTDataset(cfg=cfg, data_type="train")

===> Load data from: ./transformer/data/train.vi
===> Load data from: ./transformer/data/train.en


100%|██████████| 133317/133317 [00:06<00:00, 20292.16it/s]
100%|██████████| 133317/133317 [00:08<00:00, 15549.51it/s]


In [24]:
input_data[0] # 13

(tensor([   1, 1960,   74,  128,  974,  146,    9,  418,  127,   41,  404,  817,
            2,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,  

In [25]:
input_data[1] # 88

(tensor([   1,  373,   17,  692,  671,    4,  732,  150,  223,   74,  404, 1099,
         3657, 1414, 2472,  106, 1301,  165, 1110, 1399,   41,   11, 1339,  303,
          345,   74,  216, 1666,   39,   16,  128,  974,  146,   11,  418,  127,
         1527,  972,   41,  351,  170,  404,  817,    4,  141,   29, 1386,  324,
          250,   13,  102,   93,  195,  986,   19,   24,  275,   17,   62, 1454,
           31,  281,  456,   23,   93,    9,  999,  504, 1571,  717,  155, 1033,
          883,   40,  176,  578,  175,  154,   41,    9,  331,  427, 2015,   51,
         1212,   35,    5,    2,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,  

In [26]:
input_data[2] # 34

(tensor([   1,   49,  100,   31,   25,   20,   83,   41,   33,  768,  139,   13,
           11, 1339,  303,  345,   74,   24, 1138,  173,   36,  111,   25,  826,
           17, 1691,   35,   20,  233,   64,   77,  524,    5,    2,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,  

## **Init model, tokenizer, loss fn**

In [27]:
def prepare_tokenizer():
    """Train the SentencePiece tokenizers unless ``cfg.sp_dir`` already exists.

    If the directory is missing, ``train_sentencepiece`` is invoked twice:
    first with ``is_src=True`` and then with ``is_src=False``. If the
    directory is present, only a notice is printed and nothing is trained.
    """
    if os.path.isdir(cfg.sp_dir):
        print('Tokenization already...')
        return

    print('Training sentencepiece tokenizer...')
    for source_side in (True, False):
        train_sentencepiece(cfg, is_src=source_side)

In [28]:
# Build the Transformer on the configured device and attach an Adam optimizer
# over all of its parameters.
print("Loading Transformer model & Adam optimizer...")
model = Transformer(cfg).to(cfg.device)

optim = torch.optim.Adam(model.parameters(), lr=cfg.learning_rate)

# Placeholder only: it is overwritten a few lines below by the loss stored in
# the checkpoint.
best_loss = 100.0
# Restore model and optimizer state from the saved checkpoint; map_location
# remaps the stored tensors onto cfg.device. This raises if the file is missing.
print("Loading checkpoint...")
checkpoint = torch.load(f"{cfg.ckpt_dir}/{cfg.ckpt_name}", map_location=cfg.device)
model.load_state_dict(checkpoint['model_state_dict'])
optim.load_state_dict(checkpoint['optim_state_dict'])
best_loss = checkpoint['loss']

# Prepare Tokenizer (trains SentencePiece models only when cfg.sp_dir is absent)
prepare_tokenizer()

Loading Transformer model & Adam optimizer...
Loading checkpoint...
Tokenization already...


## **Dataloader Train**

In [29]:
# Load loss function
# NLLLoss consumes log-probabilities. No ignore_index is passed, so with the
# default settings every target id -- including padding ids -- counts toward
# the mean. NOTE(review): confirm that including padding in the loss is intended.
print("Loading loss function...")
criterion = nn.NLLLoss()

# Load dataloaders
# get_data_loader returns a (dataset, loader) pair for the requested split.
print("Loading dataloaders...")
train_dataset, train_loader = get_data_loader(cfg, 'train')
valid_dataset, valid_loader = get_data_loader(cfg, 'validation')

Loading loss function...
Loading dataloaders...
===> Load data from: ./transformer/data/train.vi
===> Load data from: ./transformer/data/train.en


100%|██████████| 133317/133317 [00:07<00:00, 17900.90it/s]
100%|██████████| 133317/133317 [00:06<00:00, 20773.19it/s]


===> Load data from: ./transformer/data/validation.vi
===> Load data from: ./transformer/data/validation.en


100%|██████████| 1268/1268 [00:00<00:00, 23770.24it/s]
100%|██████████| 1268/1268 [00:00<00:00, 23095.97it/s]


In [30]:
# Switch the model to training mode before walking through one training step;
# the call returns the model, whose repr is displayed below.
model.train()

Transformer(
  (src_embedding): Embedding(10000, 512)
  (tgt_embedding): Embedding(10000, 512)
  (positional_encoder): PositionalEncoder()
  (encoder): Encoder(
    (layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (layer_norm_1): LayerNormalization(
          (layer): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
        )
        (multihead_attention): MultiheadAttention(
          (w_q): Linear(in_features=512, out_features=512, bias=True)
          (w_k): Linear(in_features=512, out_features=512, bias=True)
          (w_v): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (attn_softmax): Softmax(dim=-1)
          (w_0): Linear(in_features=512, out_features=512, bias=True)
        )
        (drop_out_1): Dropout(p=0.1, inplace=False)
        (layer_norm_2): LayerNormalization(
          (layer): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
        )
        (feed_forward): FeedFowardLayer(
 

In [31]:
# Container for per-step loss values. NOTE(review): the only append to it in
# this walkthrough is commented out further down -- confirm whether it is still needed.
train_losses = []

In [32]:
# Progress bar over (batch_index, batch) pairs; total is the number of batches
# in the train loader.
bar = tqdm(enumerate(train_loader), total=len(train_loader), desc='TRAINING')

TRAINING:   0%|          | 0/2084 [00:00<?, ?it/s]

In [33]:
# Take only the first batch so the following cells can walk through a single
# training step. The names are pre-bound to None instead of IPython's `_`
# output-history variable, so the cell does not depend on interactive state
# (and still runs when the notebook is exported as a plain script).
src_input = tgt_input = tgt_output = None
for batch_idx, batch in bar:
    src_input, tgt_input, tgt_output = batch
    break

TRAINING:   0%|          | 0/2084 [00:00<?, ?it/s]


## **src_input, tgt_input, tgt_output**

In [34]:
# Move the three batch tensors onto the configured device.
src_input, tgt_input, tgt_output = (
    batch_tensor.to(cfg.device)
    for batch_tensor in (src_input, tgt_input, tgt_output)
)

In [35]:
# Source token ids of the first batch, followed by the tensor's shape.
print(src_input, src_input.shape, sep="\n")

tensor([[   1, 1960,   74,  ...,    0,    0,    0],
        [   1,  373,   17,  ...,    0,    0,    0],
        [   1,   49,  100,  ...,    0,    0,    0],
        ...,
        [   1,  697,    4,  ...,    0,    0,    0],
        [   1,  908,   19,  ...,    0,    0,    0],
        [   1,  199,   43,  ...,    0,    0,    0]])
torch.Size([64, 150])


In [36]:
# Decoder input ids of the first batch, followed by the tensor's shape.
print(tgt_input, tgt_input.shape, sep="\n")

tensor([[   1, 6509, 3475,  ...,    0,    0,    0],
        [   1,  152, 2326,  ...,    0,    0,    0],
        [   1,   17,   10,  ...,    0,    0,    0],
        ...,
        [   1,  192,    4,  ...,    0,    0,    0],
        [   1,  770, 5923,  ...,    0,    0,    0],
        [   1,  107,   90,  ...,    0,    0,    0]])
torch.Size([64, 150])


In [37]:
# Expected decoder output ids of the first batch, followed by the tensor's shape.
print(tgt_output, tgt_output.shape, sep="\n")

tensor([[6509, 3475, 1785,  ...,    0,    0,    0],
        [ 152, 2326,  582,  ...,    0,    0,    0],
        [  17,   10,   12,  ...,    0,    0,    0],
        ...,
        [ 192,    4,    5,  ...,    0,    0,    0],
        [ 770, 5923,    4,  ...,    0,    0,    0],
        [ 107,   90,   11,  ...,    0,    0,    0]])
torch.Size([64, 150])


## **Create mask**

In [38]:
def create_mask(src_input, tgt_input, pad_id=None):
    """Build the encoder padding mask and the decoder padding + causal mask.

    The causal mask is sized from ``tgt_input`` itself and created on its
    device, so the function works for any target length rather than only for
    ``cfg.seq_len`` on ``cfg.device``.

    Args:
        src_input: Source token ids, shape (B, L_src).
        tgt_input: Decoder input token ids, shape (B, L_tgt).
        pad_id: Token id treated as padding. Defaults to ``cfg.pad_id``.

    Returns:
        tuple: ``(e_mask, d_mask)`` of bool tensors where ``True`` marks a
        position that may be attended to.
            e_mask: (B, 1, L_src), False on source padding.
            d_mask: (B, L_tgt, L_tgt), False on target padding and on every
                position to the right of the query position.
    """
    if pad_id is None:
        pad_id = cfg.pad_id

    e_mask = (src_input != pad_id).unsqueeze(1)  # (B, 1, L_src)
    d_pad_mask = (tgt_input != pad_id).unsqueeze(1)  # (B, 1, L_tgt)

    tgt_len = tgt_input.size(-1)
    # Lower-triangular "no peek" mask: position i may only see positions <= i.
    causal_mask = torch.tril(
        torch.ones((1, tgt_len, tgt_len), dtype=torch.bool, device=tgt_input.device)
    )  # (1, L_tgt, L_tgt)

    d_mask = d_pad_mask & causal_mask  # (B, L_tgt, L_tgt) via broadcasting
    return e_mask, d_mask

In [39]:
# Encoder padding mask and decoder padding + causal mask for the whole batch.
e_mask, d_mask = create_mask(src_input, tgt_input)

In [40]:
# Encoder padding mask for the batch, followed by its shape.
print(e_mask, e_mask.shape, sep="\n")

tensor([[[ True,  True,  True,  ..., False, False, False]],

        [[ True,  True,  True,  ..., False, False, False]],

        [[ True,  True,  True,  ..., False, False, False]],

        ...,

        [[ True,  True,  True,  ..., False, False, False]],

        [[ True,  True,  True,  ..., False, False, False]],

        [[ True,  True,  True,  ..., False, False, False]]])
torch.Size([64, 1, 150])


In [41]:
print(e_mask[0])  # sample 0: 13 True entries, one per non-padding source token

tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, F

In [42]:
print(e_mask[1])  # sample 1: 88 True entries, one per non-padding source token

tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, F

In [43]:
print(e_mask[2])  # sample 2: 34 True entries, one per non-padding source token

tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, F

## **Feed src_input, tgt_input, e_mask, d_mask to model**

In [50]:
# Show the first sample of each model input, re-shaped to keep a leading batch
# axis of size 1; every tensor is followed by its shape.
print(
    *(
        shown
        for single_sample in (
            src_input[0].reshape(1, 150),
            tgt_input[0].reshape(1, 150),
            e_mask[0].reshape(1, 1, 150),
            d_mask[0].reshape(1, 150, 150),
        )
        for shown in (single_sample, single_sample.shape)
    ),
    sep="\n",
)

tensor([[   1, 1960,   74,  128,  974,  146,    9,  418,  127,   41,  404,  817,
            2,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,  

In [51]:
# Forward pass on the first sample only; each argument keeps a leading batch
# axis of size 1.
logits = model(
    src_input[0].reshape(1, 150),
    tgt_input[0].reshape(1, 150),
    e_mask[0].reshape(1, 1, 150),
    d_mask[0].reshape(1, 150, 150),
)

In [54]:
# Model output for the single sample, followed by its shape.
print(logits, logits.shape, sep="\n")

tensor([[[-7.5021e+00, -1.2412e+01, -7.0736e+00,  ..., -1.2030e+01,
          -1.2297e+01, -1.2295e+01],
         [-6.9140e+00, -1.2693e+01, -6.5903e+00,  ..., -1.1733e+01,
          -1.1865e+01, -1.2309e+01],
         [-7.4816e+00, -1.4167e+01, -8.6468e+00,  ..., -1.2696e+01,
          -1.2840e+01, -1.3195e+01],
         ...,
         [-7.8794e-05, -1.9599e+01, -1.3170e+01,  ..., -1.9381e+01,
          -1.8749e+01, -1.9419e+01],
         [-7.9033e-05, -1.9655e+01, -1.3468e+01,  ..., -1.9412e+01,
          -1.8816e+01, -1.9467e+01],
         [-7.8913e-05, -1.9616e+01, -1.3367e+01,  ..., -1.9321e+01,
          -1.8779e+01, -1.9378e+01]]], grad_fn=<LogSoftmaxBackward0>)
torch.Size([1, 150, 10000])


## **Calculate loss**

In [55]:
# Clear gradients left over from any earlier backward pass before computing
# this step's gradients.
optim.zero_grad()

In [56]:
# Collapse the leading axes so each row is one target position's scores over
# the vocabulary, then show the result and its shape.
print(
    logits.view(-1, logits.shape[-1]),
    logits.view(-1, logits.shape[-1]).shape,
    sep="\n",
)

tensor([[-7.5021e+00, -1.2412e+01, -7.0736e+00,  ..., -1.2030e+01,
         -1.2297e+01, -1.2295e+01],
        [-6.9140e+00, -1.2693e+01, -6.5903e+00,  ..., -1.1733e+01,
         -1.1865e+01, -1.2309e+01],
        [-7.4816e+00, -1.4167e+01, -8.6468e+00,  ..., -1.2696e+01,
         -1.2840e+01, -1.3195e+01],
        ...,
        [-7.8794e-05, -1.9599e+01, -1.3170e+01,  ..., -1.9381e+01,
         -1.8749e+01, -1.9419e+01],
        [-7.9033e-05, -1.9655e+01, -1.3468e+01,  ..., -1.9412e+01,
         -1.8816e+01, -1.9467e+01],
        [-7.8913e-05, -1.9616e+01, -1.3367e+01,  ..., -1.9321e+01,
         -1.8779e+01, -1.9378e+01]], grad_fn=<ViewBackward0>)
torch.Size([150, 10000])


In [59]:
# Expected token ids for the first sample as a flat vector, followed by its shape.
print(tgt_output[0].reshape(-1), tgt_output[0].reshape(-1).shape, sep="\n")

tensor([6509, 3475, 1785,    4,   67,   48,  396,    4,  731,   13,  933, 3219,
           2,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,   

In [62]:
# Loss for the single sample: the scores are flattened to one row per target
# position and compared with the flat vector of expected ids. The target
# vector still contains the padding ids, so those positions are scored too.
loss = criterion(
    logits.view(-1, logits.shape[-1]),
    tgt_output[0].reshape(-1)
)
# Display the scalar loss as a plain Python float.
loss.item()

0.3642583191394806

In [None]:
# Back-propagate the loss, then let the optimizer update the model parameters.
# NOTE: running this cell changes the weights that were loaded from the checkpoint.
loss.backward()
optim.step()

In [45]:
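# For reference, the per-batch update and loss bookkeeping of a full training
# loop (kept commented out; the update itself is shown in the cell above):
# loss.backward()
# optim.step()

# train_losses.append(loss.item())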

# Inference step by step

## **switch model to eval mode**

In [63]:
# Switch the model to evaluation mode for the inference walkthrough; the call
# returns the model, whose repr is displayed below.
model.eval()

Transformer(
  (src_embedding): Embedding(10000, 512)
  (tgt_embedding): Embedding(10000, 512)
  (positional_encoder): PositionalEncoder()
  (encoder): Encoder(
    (layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (layer_norm_1): LayerNormalization(
          (layer): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
        )
        (multihead_attention): MultiheadAttention(
          (w_q): Linear(in_features=512, out_features=512, bias=True)
          (w_k): Linear(in_features=512, out_features=512, bias=True)
          (w_v): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (attn_softmax): Softmax(dim=-1)
          (w_0): Linear(in_features=512, out_features=512, bias=True)
        )
        (drop_out_1): Dropout(p=0.1, inplace=False)
        (layer_norm_2): LayerNormalization(
          (layer): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
        )
        (feed_forward): FeedFowardLayer(
 

## **Preprocess**

In [64]:
# Load the trained SentencePiece models for the source and target languages.
# Loading is unconditional: it previously ran only when the *model checkpoint*
# file existed, which is unrelated to the tokenizer files and, when false,
# silently left sp_src / sp_tgt undefined for the cells below.
print("Loading sentencepiece tokenizer...")
sp_src = spm.SentencePieceProcessor()
sp_tgt = spm.SentencePieceProcessor()
sp_src.Load(f"{cfg.sp_dir}/{cfg.src_model_prefix}.model")
sp_tgt.Load(f"{cfg.sp_dir}/{cfg.tgt_model_prefix}.model")

Loading sentencepiece tokenizer...


In [65]:
# Source-language sentence that the cells below translate step by step.
input_sentence = 'Tôi yêu bạn.'

In [66]:
# Convert the sentence into sub-word ids with the source tokenizer; the ids
# are displayed as the cell output.
print("Preprocessing input sentence...")
tokenized = sp_src.EncodeAsIds(input_sentence)
tokenized

Preprocessing input sentence...


[49, 355, 20, 308]

In [68]:
# Wrap the ids in <sos>/<eos>, pad or truncate to cfg.seq_len, then add a
# leading batch axis and move the tensor to the configured device.
src_data = (
    torch.LongTensor(
        pad_or_truncate(
            [cfg.sos_id, *tokenized, cfg.eos_id],
            cfg.seq_len,
            cfg.pad_id,
        )
    )
    .unsqueeze(0)
    .to(cfg.device)
)
src_data

tensor([[  1,  49, 355,  20, 308,   2,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])

## **Create mask**

In [69]:
# True wherever the source id is not padding; unsqueeze(1) inserts the middle
# axis. Must run while src_data still holds token ids (before it is rebound to
# embeddings in the next section).
e_mask = (src_data != cfg.pad_id).unsqueeze(1).to(cfg.device) # (1, 1, L)
e_mask

tensor([[[ True,  True,  True,  True,  True,  True, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, Fa

## **Embedding + positional encoding**

In [70]:
# Look up the model's source embedding for every token id.
# NOTE: src_data is rebound from token ids to embeddings here, so this cell is
# not idempotent -- re-run the cell that builds the id tensor before re-running it.
print("Encoding input sentence...")
src_data = model.src_embedding(src_data)
src_data

Encoding input sentence...


tensor([[[ 0.0171,  0.0053, -0.0175,  ...,  0.0109, -0.0197,  0.0021],
         [ 0.0038, -0.0334,  0.0255,  ..., -0.0058,  0.0320,  0.0002],
         [-0.0155,  0.0117,  0.0165,  ...,  0.0103, -0.0025,  0.0053],
         ...,
         [-0.0216, -0.0148,  0.0131,  ..., -0.0124,  0.0135,  0.0092],
         [-0.0216, -0.0148,  0.0131,  ..., -0.0124,  0.0135,  0.0092],
         [-0.0216, -0.0148,  0.0131,  ..., -0.0124,  0.0135,  0.0092]]],
       grad_fn=<EmbeddingBackward0>)

In [71]:
# Run the embedded tokens through the model's positional encoder.
# NOTE: src_data is rebound again, so re-running this cell alone applies the
# encoder a second time.
src_data = model.positional_encoder(src_data)
src_data

tensor([[[ 0.3870,  1.1194, -0.3957,  ...,  1.2476, -0.4466,  1.0479],
         [ 0.9268, -0.1862,  1.3790,  ...,  0.8686,  0.7241,  1.0046],
         [ 0.5589, -0.0866,  1.3306,  ...,  1.2332, -0.0575,  1.1208],
         ...,
         [ 0.1197, -1.2419, -0.6955,  ...,  0.7183,  0.3047,  1.2071],
         [-0.8277, -0.5062, -0.1887,  ...,  0.7183,  0.3047,  1.2071],
         [-1.4640,  0.3774,  0.7079,  ...,  0.7183,  0.3047,  1.2071]]],
       grad_fn=<AddBackward0>)

## **Go through encoder**

In [72]:
# Encode the position-encoded source under the padding mask.
e_output = model.encoder(src_data, e_mask) # (1, L, d_model)

In [74]:
# Encoder output for the sentence, followed by its shape.
print(e_output, e_output.shape, sep="\n")

tensor([[[-0.1970, -1.6769,  1.1399,  ..., -0.3952, -1.1380,  1.1183],
         [-0.1228, -1.9183,  1.3894,  ..., -0.7759, -0.5548,  1.0149],
         [-0.2277, -1.9047,  1.2152,  ..., -0.4457, -1.1053,  0.8765],
         ...,
         [ 0.0669, -2.1646,  1.2392,  ..., -0.5328, -0.7571,  1.0111],
         [-0.1282, -2.0022,  1.3519,  ..., -0.5859, -0.8206,  1.0823],
         [-0.2480, -1.8211,  1.5326,  ..., -0.6345, -0.8304,  1.1566]]],
       grad_fn=<NativeLayerNormBackward0>)
torch.Size([1, 150, 512])


## **Greedy search**

In [113]:
def greedy_search(e_output, e_mask, verbose=True):
    """Greedily decode one target sentence from an already-encoded source.

    Starting from ``<sos>``, the decoder is re-run on the partial target once
    per position; the arg-max token at the current position is written into
    the next slot. Decoding stops when ``<eos>`` is produced or after
    ``cfg.seq_len`` steps.

    Relies on the notebook globals ``cfg``, ``model`` and ``sp_tgt``.

    Args:
        e_output: Encoder output for one sentence, (1, L, d_model).
        e_mask: Encoder padding mask, (1, 1, L).
        verbose: When True (default) every intermediate tensor is printed, as
            in the step-by-step walkthrough. Pass False to decode silently.

    Returns:
        The decoded sentence as returned by ``sp_tgt.decode_ids``.
    """
    log = print if verbose else (lambda *args, **kwargs: None)
    seq_len = cfg.seq_len

    last_words = torch.full(
        (seq_len,), cfg.pad_id, dtype=torch.long, device=cfg.device
    )  # (L)
    log("last_words: \n", last_words)
    log("last_words len = ", len(last_words))

    last_words[0] = cfg.sos_id  # (L)
    cur_len = 1
    log("last_words: \n", last_words)
    log("last_words len = ", len(last_words))

    # The lower-triangular "no peek" mask is identical at every step, so it is
    # built once instead of once per iteration.
    nopeak_mask = torch.tril(
        torch.ones([1, seq_len, seq_len], dtype=torch.bool, device=cfg.device)
    )  # (1, L, L)

    # Pure inference: no autograd graph is needed for the repeated decoder passes.
    with torch.no_grad():
        for i in range(seq_len):
            log("============ Iter {} ============".format(i))
            pad_mask = (last_words.unsqueeze(0) != cfg.pad_id).unsqueeze(1)  # (1, 1, L)
            d_mask = pad_mask & nopeak_mask  # (1, L, L) padding false
            log("d_mask: \n", d_mask)
            log("d_mask shape: ", d_mask.shape)

            tgt_embedded = model.tgt_embedding(last_words.unsqueeze(0))
            tgt_positional_encoded = model.positional_encoder(tgt_embedded)
            log("Embedded + positional_encoded last_words: \n", tgt_positional_encoded)
            log("Embedded + positional_encoded last_words shape: ", tgt_positional_encoded.shape)

            decoder_output = model.decoder(
                tgt_positional_encoded,
                e_output,
                e_mask,
                d_mask,
            )  # (1, L, d_model)
            log("Decoder_output: \n", decoder_output)
            log("Decoder_output shape: ", decoder_output.shape)

            output = model.softmax(
                model.output_linear(decoder_output)
            )  # (1, L, trg_vocab_size)
            log("Decoder_output after softmax: \n", output)
            # This label used to read "Decoder_output shape", although it
            # reports the shape of the post-softmax tensor.
            log("Decoder_output after softmax shape: ", output.shape)

            output = torch.argmax(output, dim=-1)  # (1, L)
            log("Get argmax of output: \n", output)
            # Only the prediction at the current position i is consumed.
            last_word_id = output[0][i].item()
            log("last_word_id: ", last_word_id)

            if i < seq_len - 1:
                last_words[i + 1] = last_word_id
                cur_len += 1
                log("last_words: \n", last_words)

            if last_word_id == cfg.eos_id:
                break

    # cur_len counts every filled slot including <sos>. When the buffer is
    # completely filled cur_len equals seq_len, so this single slice covers
    # both the "stopped early" and the "ran to the end" case.
    decoded_ids = last_words[1:cur_len].tolist()
    decoded_output = sp_tgt.decode_ids(decoded_ids)
    log("decoded_output: ", decoded_output)
    return decoded_output

In [114]:
# Decode the encoded sentence; the function prints its intermediate state at
# every decoding step.
result = greedy_search(e_output, e_mask)

last_words: 
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])
last_words len =  150
last_words: 
 tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [81]:
result  # decoded sentence returned by greedy_search

'I love you .'