In [56]:
import torch
import pickle
from pprint import pprint

# 1. Datasets

In [57]:
# Source seq
with open('datasets/test/en', 'r+') as file_obj:
    data_en = file_obj.readlines()
data_en = [ line.strip().lower() for line in data_en ]

print(len(data_en))
print(data_en[:5])

50818
['i thought you needed the sleep', 'you will survive.', 'he is a doctor and also a novelist', 'what you were taught is wrong.', "i should've known tom would be here, too"]


In [58]:
# Target seq
with open('datasets/test/vi', 'r+') as file_obj:
    data_vi = file_obj.readlines()
data_vi = [ line.strip().lower() for line in data_vi ]

print(len(data_vi))
print(data_vi[:5])

50818
['tôi nghĩ bạn cần ngủ', 'bạn sẽ sống sót.', 'ông là một bác sĩ và cũng là một tiểu thuyết gia', 'những gì bạn đã được dạy là sai.', 'tôi cũng nên biết tom cũng sẽ ở đây']


# 2. Feature Engineerings

## 2.1 Source
#### Source tokernizer

In [59]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

print(f'{tokenizer.special_tokens_map = }')
print(f'{tokenizer.vocab_size = }')

encoded = tokenizer(
    text=data_en[:5],               # the batch sentences to be encoded
    add_special_tokens=True,        # Add [CLS] and [SEP]
    padding='longest',              # Add [PAD]s
    return_attention_mask=True,     # Generate the attention mask
    return_tensors='pt',            # ask the function to return PyTorch tensors
    max_length=50,                  # maximum length of a sentence
    truncation=True
)
pprint(encoded)

tokenizer.special_tokens_map = {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}
tokenizer.vocab_size = 30522
{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'input_ids': tensor([[ 101, 1045, 2245, 2017, 2734, 1996, 3637,  102,    0,    0,    0,    0,
            0],
        [ 101, 2017, 2097, 5788, 1012,  102,    0,    0,    0,    0,    0,    0,
            0],
        [ 101, 2002, 2003, 1037, 3460, 1998, 2036, 1037, 9974,  102,    0,    0,
            0],
        [ 101, 2054, 2017, 2020, 4036, 2003, 3308, 1012,  102,    0,    0,    0,
            0],
        [ 101, 1045, 2323, 1005, 2310, 2124, 3419, 2052, 2022, 2182, 1010, 2205,
          102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [60]:
print(f"{data_en[0] = }")
print(f"{encoded['input_ids'].size() = }")

ids = encoded['input_ids'][0]
print(f'{ids = }')

tokens = tokenizer.convert_ids_to_tokens(ids)
print(f'{tokens = }')

decoding = tokenizer.decode(ids)
print(f'{decoding = }')

data_en[0] = 'i thought you needed the sleep'
encoded['input_ids'].size() = torch.Size([5, 13])
ids = tensor([ 101, 1045, 2245, 2017, 2734, 1996, 3637,  102,    0,    0,    0,    0,
           0])
tokens = ['[CLS]', 'i', 'thought', 'you', 'needed', 'the', 'sleep', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
decoding = '[CLS] i thought you needed the sleep [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]'


## 2.2 Target
#### Target Tokenizer

In [61]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

print(f'{tokenizer.special_tokens_map = }')
print(f'{tokenizer.vocab_size = }')


encoded = tokenizer(
    text=data_vi[:5],               # the batch sentences to be encoded
    add_special_tokens=True,        # Add [CLS] and [SEP]
    padding='longest',              # Add [PAD]s
    return_attention_mask=True,     # Generate the attention mask
    return_tensors='pt',            # ask the function to return PyTorch tensors
    max_length=50,                  # maximum length of a sentence
    truncation=True
)
pprint(encoded)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


tokenizer.special_tokens_map = {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}
tokenizer.vocab_size = 64000
{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]),
 'input_ids': tensor([[    0,    70,   487,    88,   115,   845,     2,     1,     1,     1,
             1,     1,     1,     1],
        [    0,    88,    38,   235, 49850, 13208,     2,     1,     1,     1,
             1,     1,     1,     1],
        [    0,    46,     8,    16,  2780,  9031,     6,    32,     8,    16,
          3520, 12907,  3931,     2],
        [    0,    21,   148,    88,    14,    11,   940,     8,  4101, 27375,
             2,     1,     1,     1],
        [    0,    70,    

In [62]:
print(f"{data_vi[4] = }")
print(f"{encoded['input_ids'].size() = }")

ids = encoded['input_ids'][4]
print(f'{ids = }')

tokens = tokenizer.convert_ids_to_tokens(ids)
print(f'{tokens = }')

decoding = tokenizer.decode(ids)
print(f'{decoding = }')

data_vi[4] = 'tôi cũng nên biết tom cũng sẽ ở đây'
encoded['input_ids'].size() = torch.Size([5, 14])
ids = tensor([    0,    70,    32,    77,    55, 33939,    32,    38,    25,    97,
            2,     1,     1,     1])
tokens = ['<s>', 'tôi', 'cũng', 'nên', 'biết', 'tom', 'cũng', 'sẽ', 'ở', 'đây', '</s>', '<pad>', '<pad>', '<pad>']
decoding = '<s> tôi cũng nên biết tom cũng sẽ ở đây </s> <pad> <pad> <pad>'


# 3. Inference

In [63]:
en_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
vi_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [64]:
Tx = 60
Ty = 59
X_lexicon_size = en_tokenizer.special_tokens_map
Y_lexicon_size = vi_tokenizer.special_tokens_map

print(f'{Tx = }, {X_lexicon_size = }')
print(f'{Ty = }, {Y_lexicon_size = }')

Tx = 60, X_lexicon_size = {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}
Ty = 59, Y_lexicon_size = {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}


In [65]:
from Transformer.Transformer import \
    create_mask_source, create_mask_target

## Load model
transformer = torch.load('ckpts/8B_Transformer_en2vi.model.pth')
transformer = transformer.to('cpu')

#### Translate single sentence

In [66]:
def translate(en_sentence, model,
        en_tokenizer, vi_tokenizer,
        Tx=60, Ty=59):
    model = model.cpu()
    model = model.eval()

    # X
    X_encoded = en_tokenizer(
        text=en_sentence,               # the batch sentences to be encoded
        add_special_tokens=True,        # Add [CLS] and [SEP]
        padding='max_length',           # Add [PAD]s
        return_attention_mask=True,     # Generate the attention mask
        return_tensors='pt',            # ask the function to return PyTorch tensors
        max_length=Tx,                  # maximum length of a sentence
        truncation=True)
    x_pad = en_tokenizer.convert_tokens_to_ids('[PAD]')
    x_eos = en_tokenizer.convert_tokens_to_ids('[SEP]')

    ## (Tx)
    X_seq = X_encoded['input_ids']
    X_mask = create_mask_source(
        X_seq=X_seq,
        pad_token=x_pad,
        device='cpu')
    
    # Debug
    X_token = en_tokenizer.decode(X_seq.squeeze(dim=0))
    print(f'{X_token = }')

    # Init Y_seq = [<sos>] -> (1, Ty+1)
    y_sos = vi_tokenizer.convert_tokens_to_ids('<s>')
    y_pad = vi_tokenizer.convert_tokens_to_ids('<pad>')
    Y_seq = torch.full((1, Ty+1), fill_value=y_pad, dtype=torch.int32)
    Y_seq[0, 0] = y_sos

    # t-1 -> t
    for t in range(Ty):
        # Infer
        Y_mask = create_mask_target(
            Y_seq=Y_seq[:,:-1],
            pad_token=y_pad,
            device='cpu')
        with torch.no_grad():
            yt_hat = model(
                X_seq=X_seq, X_mask=X_mask,
                Y_seq=Y_seq[:,:-1], Y_mask=Y_mask)
        
        # Debug
        # y_pred = yt_hat.argmax(dim=-1)[0]
        # y_pred_seq = vi_tokenizer.decode(y_pred)
        # print(f'{Y_seq = }')

        # Predict next token
        best_guess = yt_hat.argmax(dim=-1)[:, t]

        # Y_seq = (1, Ty)
        Y_seq[:, t+1] = best_guess

    # Output
    Y_seq = Y_seq.squeeze(dim=0)
    vi_sentence = vi_tokenizer.decode(Y_seq)
    return vi_sentence

In [67]:
translate(en_sentence="too much sugar for us.",
    model=transformer,
    en_tokenizer=en_tokenizer, vi_tokenizer=vi_tokenizer,
    Tx=Tx, Ty=Ty)

X_token = '[CLS] too much sugar for us. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'


'<s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'