In [107]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from vncorenlp import VnCoreNLP
annotator = VnCoreNLP("VnCoreNLP-master\VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')
spacy_en = spacy.load('en_core_web_sm')

In [158]:
from iteration_utilities import deepflatten
def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]
def tokenize_vi(text):
    return [tok for tok in deepflatten(annotator.tokenize(text), depth=1)]

text_en = 'Please put the dustpan in the broom closet'
text_vi = 'Cuốn sách này là của tôi. Của bạn đâu?'
print(tokenize_en(text_en))
print(tokenize_vi(text_vi))


['Please', 'put', 'the', 'dustpan', 'in', 'the', 'broom', 'closet']
['Cuốn', 'sách', 'này', 'là', 'của', 'tôi', '.', 'Của', 'bạn', 'đâu', '?']


In [159]:
import pandas as pd

def create_raw_dataset():
    data_dir = ""
    en_sents = open(data_dir + 'english.txt', "r",encoding="utf-8" ).read().splitlines()
    vi_sents = open(data_dir + 'vietnamese.txt', "r" ,encoding="utf-8").read().splitlines()
    return {
        "English": [line for line in en_sents[:5000]],
        "Vietnamese": [line for line in vi_sents[:5000]],
    }
raw_data = create_raw_dataset()

from sklearn.model_selection import train_test_split

df = pd.DataFrame(raw_data, columns=["English", "Vietnamese"])
train, test = train_test_split(df, test_size=0.2)
train, val = train_test_split(train, test_size=0.125)

train.to_json("train.json", orient="records", lines=True)
test.to_json("test.json", orient="records", lines=True)
val.to_json("val.json", orient="records", lines=True)


In [160]:
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
import re

class Field:
    def __init__(self, tokenize_func=None, init_token=None, eos_token=None, pad_token='<pad>', lower=True, unk_token='<unk>'):
        self.tokenize_func = tokenize_func or (lambda x: x.split())
        self.init_token = init_token
        self.eos_token = eos_token
        self.pad_token = pad_token
        self.lower = lower
        self.vocab = None
        self.itos = None
        self.unk_token = unk_token
        self.apostrophe_regex = re.compile(r"(\w+)'(\w+)")

    def tokenize(self, texts):
        tokenized_texts = []
        for t in texts:
            tokens = self.tokenize_func(t)
            if self.lower:
                tokens = [token.lower() for token in tokens]
            # Split words with apostrophes into separate tokens
            tokens = [token for word in tokens for token in re.split(self.apostrophe_regex, word) if token]
            tokenized_texts.append(tokens)
        return tokenized_texts

    def build_vocab(self, texts, max_vocab_size=10000, min_freq=3):
        # Tokenize the texts
        tokenized_texts = self.tokenize(texts)

        # Flatten the tokenized texts
        tokens = [token for token_list in tokenized_texts for token in token_list]

        # Replace underscores with spaces in target text
        if '_' in tokens and self.init_token != '<sos>' and self.eos_token != '<eos>':
            tokens = [token.replace('_', ' ') for token in tokens]

        # Split tokens on apostrophes
        if self.apostrophe_regex is not None:
            new_tokens = []
            for token in tokens:
                subtokens = re.split(self.apostrophe_regex, token)
                new_tokens.extend(subtokens)
            tokens = new_tokens

        # Count the tokens
        counter = Counter(tokens)

        # Sort the tokens by frequency
        sorted_tokens = sorted(counter.items(), key=lambda x: x[1], reverse=True)

        # Truncate the sorted tokens by max_vocab_size
        if max_vocab_size is not None:
            sorted_tokens = sorted_tokens[:max_vocab_size]

        # Filter the tokens by min_freq
        filtered_tokens = [(token, freq) for token, freq in sorted_tokens if freq >= min_freq]

        # Add special tokens to the vocabulary
        if self.init_token is not None:
            filtered_tokens.insert(0, (self.init_token, float("inf")))
        if self.eos_token is not None:
            filtered_tokens.append((self.eos_token, float("inf")))
        if self.pad_token is not None:
            filtered_tokens.append((self.pad_token, float("inf")))
        filtered_tokens.append((self.unk_token, float("inf")))

        # Build the token-to-index mapping and index-to-token mapping
        self.vocab = {}
        self.itos = []
        for token, freq in filtered_tokens:
            self.vocab[token] = len(self.itos)
            self.itos.append(token)



In [180]:
from datasets import load_dataset
from functools import partial


source_field = Field(tokenize_func=partial(tokenize_en), init_token='<sos>', eos_token='<eos>', lower=True)
target_field = Field(tokenize_func=partial(tokenize_vi), init_token='<sos>', eos_token='<eos>', lower=True)

# Load the JSON files as a dataset
dataset = load_dataset("json", data_files={"train": "train.json", "test": "test.json", "val": "val.json"})

# Build vocabularies for source and target fields
source_text = dataset['train']['English'] + dataset['test']['English'] + dataset['val']['English']
target_text = dataset['train']['Vietnamese'] + dataset['test']['Vietnamese'] + dataset['val']['Vietnamese']
source_field.build_vocab(source_text)
target_field.build_vocab(target_text)

class Example:
    def __init__(self, src, trg):
        self.src = src
        self.trg = trg

train_examples = []
for example in dataset['train']:
    src_tokens = tokenize_en(example['English'])
    trg_tokens = tokenize_vi(example['Vietnamese'])
    train_examples.append(Example([token.lower() for token in src_tokens], [token.lower() for token in trg_tokens]))

test_examples = []
for example in dataset['test']:
    src_tokens = tokenize_en(example['English'])
    trg_tokens = tokenize_vi(example['Vietnamese'])
    test_examples.append(Example([token.lower() for token in src_tokens], [token.lower() for token in trg_tokens]))

val_examples = []
for example in dataset['val']:
    src_tokens = tokenize_en(example['English'])
    trg_tokens = tokenize_vi(example['Vietnamese'])
    val_examples.append(Example([token.lower() for token in src_tokens], [token.lower() for token in trg_tokens]))


Found cached dataset json (C:/Users/16262/.cache/huggingface/datasets/json/default-ec50e1705067867d/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/3 [00:00<?, ?it/s]

In [216]:
print(len(source_field.vocab))
print(len(target_field.vocab))
print(source_field.vocab)
print(target_field.vocab)

1258
1165
{'<sos>': 0, '.': 1, 'i': 2, 'the': 3, 'to': 4, 'tom': 5, 'you': 6, 'a': 7, '?': 8, 'n': 9, 't': 10, 'is': 11, 'do': 12, 'he': 13, 'it': 14, 'in': 15, 'that': 16, 'of': 17, "'s": 18, 'was': 19, ',': 20, 'for': 21, 'have': 22, 'we': 23, 'me': 24, 'this': 25, 'what': 26, 'my': 27, 'his': 28, 'be': 29, 'not': 30, 'and': 31, 'are': 32, 'mary': 33, 'did': 34, 'she': 35, 'on': 36, "'m": 37, 'with': 38, 'want': 39, 'at': 40, 'your': 41, 'can': 42, 'him': 43, 'about': 44, 'there': 45, "'ll": 46, 'as': 47, 'know': 48, 'think': 49, 'go': 50, 'they': 51, 'her': 52, 'all': 53, 'here': 54, 'has': 55, "'re": 56, 'like': 57, 'up': 58, 'how': 59, 'would': 60, 'had': 61, 'time': 62, 'get': 63, 'were': 64, 'will': 65, 'tell': 66, "'ve": 67, 'just': 68, 'ca': 69, 'does': 70, 'out': 71, 'one': 72, 'when': 73, 'very': 74, 'been': 75, 'by': 76, 'could': 77, 'if': 78, 'an': 79, 'why': 80, 'going': 81, 'see': 82, 'good': 83, 'should': 84, 'us': 85, 'from': 86, 'no': 87, 'come': 88, 'so': 89, 'never'

In [191]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import numpy as np
import random
import torch
#self.apostrophe_regex = re.compile(r"(\S+)\s*'(\S+)")
SEED = 2222
random.seed(SEED)
torch.manual_seed(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class TranslationExample:
    def __init__(self, src, trg):
        self.src = src
        self.trg = trg

class TranslationDataset(Dataset):
    def __init__(self, data, source_vocab, target_vocab):
        self.data = data
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
        self.apostrophe_regex = re.compile(r"(\S+)\s*'(\S+)")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        example = self.data[index]
        src_sent = [token for word in example.src for token in re.split(self.apostrophe_regex, word) if token]
        trg_sent = [token.replace('_', ' ') if '_' in token else token for token in example.trg]
        trg_sent = [token for word in trg_sent for token in re.split(self.apostrophe_regex, word) if token]
        src_seq = []
        for token in src_sent:
            if token in self.source_vocab:
                src_seq.append(self.source_vocab[token])
            else:
                print(f"Token '{token}' not in source vocabulary")
        trg_seq = []
        for token in trg_sent:
            if token in self.target_vocab:
                trg_seq.append(self.target_vocab[token])
            else:
                print(f"Token '{token}' not in target vocabulary")
        src_seq = np.array(src_seq)
        trg_seq = np.array(trg_seq)
        return src_seq, trg_seq

def get_iterator(data, source_vocab, target_vocab, device, batch_size=128, shuffle=True):
    dataset = TranslationDataset(data, source_vocab, target_vocab)
    sampler = None
    if shuffle:
        sampler = torch.utils.data.RandomSampler(dataset)
    loader = DataLoader(dataset=dataset, batch_size=batch_size, num_workers=0, sampler=sampler, shuffle=(sampler is None), drop_last=False, collate_fn=lambda x: tuple(zip(*x)))
    for batch in loader:
        src_seqs = pad_sequence([torch.LongTensor(seq) for seq in batch[0]], padding_value=source_vocab['<pad>'], batch_first=True).to(device)
        trg_seqs = pad_sequence([torch.LongTensor(seq) for seq in batch[1]], padding_value=target_vocab['<pad>'], batch_first=True).to(device)
        yield src_seqs.transpose(0, 1), trg_seqs.transpose(0, 1), len(batch[0])


train_batches = get_iterator(train_examples, source_field.vocab, target_field.vocab, device)
test_batches = get_iterator(test_examples, source_field.vocab, target_field.vocab, device)
val_batches = get_iterator(val_examples, source_field.vocab, target_field.vocab, device)


In [190]:
# Convert train_batches to a list and get its length
train_batches_list = list(train_batches)
num_train_batches = len(train_batches_list)

# Convert test_batches to a list and get its length
test_batches_list = list(test_batches)
num_test_batches = len(test_batches_list)

# Print the number of batches
print(f"Number of train batches: {num_train_batches}")
print(f"Number of test batches: {num_test_batches}")


Token 'underwater' not in source vocabulary
Token 'longest' not in source vocabulary
Token 'đường hầm' not in target vocabulary
Token 'nhật bản' not in target vocabulary
Token 'thực sự' not in target vocabulary
Token 'mend' not in source vocabulary
Token 'quyết định' not in target vocabulary
Token 'improvement' not in source vocabulary
Token 'cải tiến' not in target vocabulary
Token 'quan tâm' not in target vocabulary
Token 'như thế nào' not in target vocabulary
Token 'homeland' not in source vocabulary
Token 'bao giờ' not in target vocabulary
Token 'quê hương' not in target vocabulary
Token 'witness' not in source vocabulary
Token 'detail' not in source vocabulary
Token 'cảnh sát' not in target vocabulary
Token 'nhân chứng' not in target vocabulary
Token 'giải thích' not in target vocabulary
Token 'chi tiết' not in target vocabulary
Token 'tai nạn' not in target vocabulary
Token 'như thế nào' not in target vocabulary
Token 'monica' not in source vocabulary
Token 'monica' not in target

In [192]:
batch = next(iter(test_batches))
print(batch[0])

Token 'chúng tôi' not in target vocabulary
Token 'weighed' not in source vocabulary
Token 'xe tải' not in target vocabulary
Token 'workman' not in source vocabulary
Token 'thợ' not in target vocabulary
Token 'chúng ta' not in target vocabulary
Token 'bây giờ' not in target vocabulary
Token 'nhà ga' not in target vocabulary
Token 'turning' not in source vocabulary
Token 'markets' not in source vocabulary
Token 'domestic' not in source vocabulary
Token 'sales' not in source vocabulary
Token 'công ty' not in target vocabulary
Token 'thị trường' not in target vocabulary
Token 'xuất khẩu' not in target vocabulary
Token 'bù' not in target vocabulary
Token 'sụt giảm' not in target vocabulary
Token 'doanh số' not in target vocabulary
Token 'hạnh phúc' not in target vocabulary
Token 'vấn đề' not in target vocabulary
Token 'nói chuyện' not in target vocabulary
Token 'bài tập' not in target vocabulary
Token 'largely' not in source vocabulary
Token 'depends' not in source vocabulary
Token 'bao nhi

In [193]:
import torch
from torch import nn, optim

# adjustable parameters
INPUT_DIM = len(source_field.vocab)
OUTPUT_DIM = len(target_field.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
N_LAYERS = 1
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, n_layers, dropout):
        super().__init__()
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.input_dim = input_dim
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, n_layers, dropout=dropout,
                          bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

    def forward(self, src_batch):
        # src [sent len, batch size]

        # [sent len, batch size, emb dim]
        embedded = self.embedding(src_batch)
        outputs, hidden = self.rnn(embedded)
        # outputs -> [sent len, batch size, hidden dim * n directions]
        # hidden -> [n layers * n directions, batch size, hidden dim]

        # initial decoder hidden is final hidden state of the forwards and
        # backwards encoder RNNs fed through a linear layer
        concated = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        hidden = torch.tanh(self.fc(concated))
        return outputs, hidden

encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)
outputs, hidden = encoder(batch[0])

print(outputs.shape, hidden.shape)



torch.Size([19, 128, 1024]) torch.Size([128, 512])




In [194]:
class Attention(nn.Module):

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        # enc_hid_dim multiply by 2 due to bidirectional
        self.fc1 = nn.Linear(enc_hid_dim * 2 + dec_hid_dim, dec_hid_dim)
        self.fc2 = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, encoder_outputs, hidden):
        src_len = encoder_outputs.shape[0]
        batch_size = encoder_outputs.shape[1]
        
        # repeat encoder hidden state src_len times [batch size, sent len, dec hid dim]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        # reshape/permute the encoder output, so that the batch size comes first
        # [batch size, sent len, enc hid dim * 2], times 2 because of bidirectional
        outputs = encoder_outputs.permute(1, 0, 2)

        # the attention mechanism receives a concatenation of the hidden state
        # and the encoder output
        concat = torch.cat((hidden, outputs), dim=2)
        
        # fully connected layer and softmax layer to compute the attention weight
        # [batch size, sent len, dec hid dim]
        energy = torch.tanh(self.fc1(concat))
        # attention weight should be of [batch size, sent len]
        attention = self.fc2(energy).squeeze(dim=2)  
        attention_weight = torch.softmax(attention, dim=1)
        return attention_weight

    
attention = Attention(ENC_HID_DIM, DEC_HID_DIM).to(device)
attention_weight = attention(outputs, hidden)
attention_weight.shape

torch.Size([128, 19])

In [195]:
class Decoder(nn.Module):

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, n_layers,
                 dropout, attention):
        super().__init__()
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(enc_hid_dim * 2 + emb_dim, dec_hid_dim, n_layers, dropout=dropout)
        self.linear = nn.Linear(dec_hid_dim, output_dim)

    def forward(self, trg, encoder_outputs, hidden):
        # trg [batch size]
        # outputs [src sen len, batch size, enc hid dim * 2], times 2 due to bidirectional
        # hidden [batch size, dec hid dim]

        # [batch size, 1, sent len] 
        attention = self.attention(encoder_outputs, hidden).unsqueeze(1)

        # [batch size, sent len, enc hid dim * 2]
        outputs = encoder_outputs.permute(1, 0, 2)

        # [1, batch size, enc hid dim * 2]
        context = torch.bmm(attention, outputs).permute(1, 0, 2)

        # input sentence -> embedding
        # [1, batch size, emb dim]
        embedded = self.embedding(trg.unsqueeze(0))
        rnn_input = torch.cat((embedded, context), dim=2)

        outputs, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        prediction = self.linear(outputs.squeeze(0))
        return prediction, hidden.squeeze(0)

decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT, attention).to(device)
prediction, decoder_hidden = decoder(batch[1][0], outputs, hidden)

# notice the decoder_hidden's shape should match the shape that's generated by
# the encoder
prediction.shape, decoder_hidden.shape

(torch.Size([128, 1165]), torch.Size([128, 512]))

In [196]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src_batch, trg_batch, teacher_forcing_ratio=0.5):
        max_len, batch_size = trg_batch.shape
        trg_vocab_size = self.decoder.output_dim

        # tensor to store decoder's output
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)

        # encoder_outputs : all hidden states of the input sequence (forward and backward)
        # hidden : final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src_batch)

        trg = trg_batch[0]
        for i in range(1, max_len):
            prediction, hidden = self.decoder(trg, encoder_outputs, hidden)
            outputs[i] = prediction

            if random.random() < teacher_forcing_ratio:
                trg = trg_batch[i]
            else:
                trg = prediction.argmax(1)

        return outputs

attention = Attention(ENC_HID_DIM, DEC_HID_DIM)
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, ENC_DROPOUT)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT, attention)
seq2seq = Seq2Seq(encoder, decoder, device).to(device)
seq2seq

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(1258, 256)
    (rnn): GRU(256, 512, dropout=0.5, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (attention): Attention(
      (fc1): Linear(in_features=1536, out_features=512, bias=True)
      (fc2): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(1165, 256)
    (rnn): GRU(1280, 512, dropout=0.5)
    (linear): Linear(in_features=512, out_features=1165, bias=True)
  )
)

In [197]:
outputs = seq2seq(batch[0],batch[1])
outputs.shape

torch.Size([16, 128, 1165])

In [198]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(seq2seq):,} trainable parameters')

The model has 7,651,213 trainable parameters


In [199]:
optimizer = optim.Adam(seq2seq.parameters())

# ignore the padding index when calculating the loss
PAD_IDX = target_field.vocab['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [200]:
from tqdm import tqdm
import math
import time

def train(seq2seq, iterator, optimizer, criterion):
    seq2seq.train()
    
    epoch_loss = 0
    
    for batch in tqdm(iterator):
        optimizer.zero_grad()
        outputs = seq2seq(batch[0], batch[1])

        # the loss function only works on 2d inputs
        # and 1d targets we need to flatten each of them
        outputs_flatten = outputs[1:].view(-1, outputs.shape[-1])
        trg_flatten = batch[1][1:].reshape(-1)
        loss = criterion(outputs_flatten, trg_flatten)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()

    return epoch_loss / 28


def evaluate(seq2seq, iterator, criterion):
    seq2seq.eval()

    epoch_loss = 0
    with torch.no_grad():
        for batch in tqdm(iterator):
            # turn off teacher forcing
            outputs = seq2seq(batch[0], batch[1], teacher_forcing_ratio=0) 

            # trg = [trg sent len, batch size]
            # output = [trg sent len, batch size, output dim]
            outputs_flatten = outputs[1:].view(-1, outputs.shape[-1])
            trg_flatten = batch[1][1:].reshape(-1)
            loss = criterion(outputs_flatten, trg_flatten)
            epoch_loss += loss.item()
        
    return epoch_loss / 8
  

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

N_EPOCHS = 30
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(seq2seq, train_batches, optimizer, criterion)
    valid_loss = evaluate(seq2seq, val_batches, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(seq2seq.state_dict(), 'tut2-model.pt')

    print(f'Epoch: {epoch+1} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

0it [00:00, ?it/s]

Token 'believes' not in source vocabulary
Token 'equality' not in source vocabulary
Token 'bình đẳng' not in target vocabulary
Token 'phụ nữ' not in target vocabulary
Token 'nam giới' not in target vocabulary
Token 'burying' not in source vocabulary
Token 'sand' not in source vocabulary
Token 'chôn' not in target vocabulary
Token 'có thể' not in target vocabulary
Token 'đàn ông' not in target vocabulary
Token 'normal' not in source vocabulary
Token 'không thể' not in target vocabulary
Token 'chờ đợi' not in target vocabulary
Token 'trở lại' not in target vocabulary
Token 'bình thường' not in target vocabulary
Token 'đánh giá' not in target vocabulary
Token 'điên rồ' not in target vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'không thể' not in target vocabulary
Token 'làm gì' not in target vocabulary
Token 'tất cả' not in target vocabulary
Token 'thảo luận' not in target vocabulary
Token 'sent' not in source vocabulary
Token 'postcard' not in source vocabulary
Token 'bưu 

1it [00:01,  1.49s/it]

Token 'suspect' not in source vocabulary
Token 'custody' not in source vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'nghi' not in target vocabulary
Token 'giam giữ' not in target vocabulary
Token 'stairs' not in source vocabulary
Token 'cầu thang' not in target vocabulary
Token 'cops' not in source vocabulary
Token 'vui lòng' not in target vocabulary
Token 'cảnh sát' not in target vocabulary
Token 'retire' not in source vocabulary
Token 'dự định' not in target vocabulary
Token 'hưu' not in target vocabulary
Token 'delay' not in source vocabulary
Token 'chậm trễ' not in target vocabulary
Token 'tình hình' not in target vocabulary
Token 'tồi tệ' not in target vocabulary
Token 'nicest' not in source vocabulary
Token 'girlfriend' not in source vocabulary
Token 'chìa khoá' not in target vocabulary
Token 'australian' not in source vocabulary
Token 'visa' not in source vocabulary
Token 'đại sứ quán' not in target vocabulary
Token 'visa' not in target vocabulary
Token 'bất kỳ' n

2it [00:03,  1.58s/it]

Token 'không thể' not in target vocabulary
Token 'cẩn thận' not in target vocabulary
Token 'sức khoẻ' not in target vocabulary
Token 'cnn' not in source vocabulary
Token 'đổi' not in target vocabulary
Token 'cnn' not in target vocabulary
Token 'tại sao' not in target vocabulary
Token 'subjects' not in source vocabulary
Token 'mathematics' not in source vocabulary
Token 'tất cả' not in target vocabulary
Token 'môn học' not in target vocabulary
Token 'trên hết' not in target vocabulary
Token 'toán học' not in target vocabulary
Token 'flood' not in source vocabulary
Token 'subside' not in source vocabulary
Token 'lũ' not in target vocabulary
Token 'bắt đầu' not in target vocabulary
Token 'lắng' not in target vocabulary
Token 'tại sao' not in target vocabulary
Token 'làm việc' not in target vocabulary
Token 'thay vì' not in target vocabulary
Token 'group' not in source vocabulary
Token 'trừng phạt' not in target vocabulary
Token 'sai lầm' not in target vocabulary
Token 'belongs' not in sou

3it [00:04,  1.43s/it]

Token 'công việc' not in target vocabulary
Token 'thực sự' not in target vocabulary
Token 'absorbs' not in source vocabulary
Token 'công việc' not in target vocabulary
Token 'hấp thụ' not in target vocabulary
Token 'hầu hết' not in target vocabulary
Token 'thời gian' not in target vocabulary
Token 'contented' not in source vocabulary
Token 'hạnh phúc' not in target vocabulary
Token 'đàn ông' not in target vocabulary
Token 'hài lòng' not in target vocabulary
Token 'thời gian' not in target vocabulary
Token 'ngu ngốc' not in target vocabulary
Token 'như vậy' not in target vocabulary
Token 'museum' not in source vocabulary
Token 'hôm nay' not in target vocabulary
Token 'bảo tàng' not in target vocabulary
Token 'fax' not in source vocabulary
Token 'fax' not in target vocabulary
Token 'bệnh viện' not in target vocabulary
Token 'lúc nào' not in target vocabulary
Token 'dễ dàng' not in target vocabulary
Token 'similar' not in source vocabulary
Token 'kinh nghiệm' not in target vocabulary
Toke

4it [00:05,  1.50s/it]

Token 'irreplaceable' not in source vocabulary
Token 'không thể' not in target vocabulary
Token 'thay thế' not in target vocabulary
Token 'figured' not in source vocabulary
Token 'hình dung' not in target vocabulary
Token 'tất cả' not in target vocabulary
Token 'im lặng' not in target vocabulary
Token 'ý tưởng' not in target vocabulary
Token 'trò đùa' not in target vocabulary
Token 'rainy' not in source vocabulary
Token 'có thể' not in target vocabulary
Token 'văn phòng' not in target vocabulary
Token 'sáng mai' not in target vocabulary
Token 'efficiently' not in source vocabulary
Token 'ý tưởng' not in target vocabulary
Token 'thế nào' not in target vocabulary
Token 'chúng ta' not in target vocabulary
Token 'có thể' not in target vocabulary
Token 'hiệu quả' not in target vocabulary
Token 'campus' not in source vocabulary
Token 'nhà hàng' not in target vocabulary
Token 'khuôn viên trường' not in target vocabulary
Token 'miserable' not in source vocabulary
Token 'cuộc đời' not in target

5it [00:07,  1.61s/it]

Token 'có thể' not in target vocabulary
Token 'significantly' not in source vocabulary
Token '50' not in source vocabulary
Token 'nhật bản' not in target vocabulary
Token 'thay đổi' not in target vocabulary
Token 'đáng kể' not in target vocabulary
Token '50' not in target vocabulary
Token 'hardest' not in source vocabulary
Token 'dreams' not in source vocabulary
Token 'phụ nữ' not in target vocabulary
Token 'tại sao' not in target vocabulary
Token 'bạn bè' not in target vocabulary
Token 'heartwarming' not in source vocabulary
Token 'quan tâm' not in target vocabulary
Token 'valuable' not in source vocabulary
Token 'có thể' not in target vocabulary
Token 'thời gian' not in target vocabulary
Token 'swims' not in source vocabulary
Token 'tâm trí' not in target vocabulary
Token 'bây giờ' not in target vocabulary
Token 'tại sao' not in target vocabulary
Token 'nói chuyện' not in target vocabulary
Token 'murderer' not in source vocabulary
Token 'cuối cùng' not in target vocabulary
Token 'tối

6it [00:09,  1.79s/it]

Token 'thick' not in source vocabulary
Token 'thin' not in source vocabulary
Token 'sát cánh' not in target vocabulary
Token 'dày' not in target vocabulary
Token 'mỏng' not in target vocabulary
Token 'smelled' not in source vocabulary
Token 'ngửi' not in target vocabulary
Token 'confusing' not in source vocabulary
Token 'dim' not in source vocabulary
Token 'grandmother' not in source vocabulary
Token 'ký ức' not in target vocabulary
Token 'mờ nhạt' not in target vocabulary
Token 'bà' not in target vocabulary
Token 'india' not in source vocabulary
Token 'ấn độ' not in target vocabulary
Token 'chúng ta' not in target vocabulary
Token 'chắc chắn' not in target vocabulary
Token 'chính phủ' not in target vocabulary
Token 'kế hoạch' not in target vocabulary
Token 'psychic' not in source vocabulary
Token 'ngoại cảm' not in target vocabulary
Token 'thế giới' not in target vocabulary
Token 'thế giới' not in target vocabulary
Token 'thông minh' not in target vocabulary
Token 'force' not in sourc

7it [00:11,  1.68s/it]

Token 'trading' not in source vocabulary
Token 'làm việc' not in target vocabulary
Token 'công ty' not in target vocabulary
Token 'thương mại' not in target vocabulary
Token 'anh trai' not in target vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'mạnh mẽ' not in target vocabulary
Token 'hôm nay' not in target vocabulary
Token 'bundle' not in source vocabulary
Token 'bó' not in target vocabulary
Token 'quần áo' not in target vocabulary
Token 'steep' not in source vocabulary
Token 'slope' not in source vocabulary
Token 'chằm chằm' not in target vocabulary
Token 'dốc' not in target vocabulary
Token 'consequences' not in source vocabulary
Token 'hậu quả' not in target vocabulary
Token 'bắt đầu' not in target vocabulary
Token 'bao lâu' not in target vocabulary
Token 'có thể' not in target vocabulary
Token 'sẵn sàng' not in target vocabulary
Token 'có thể' not in target vocabulary
Token 'preparing' not in source vocabulary
Token 'entrance' not in source vocabulary
Token 'chuẩn b

8it [00:12,  1.65s/it]

Token 'không thể' not in target vocabulary
Token 'một mình' not in target vocabulary
Token 'không thể' not in target vocabulary
Token 'quyết định' not in target vocabulary
Token 'hôm nay' not in target vocabulary
Token 'như vậy' not in target vocabulary
Token 'peter' not in source vocabulary
Token 'peter' not in target vocabulary
Token 'đội trưởng' not in target vocabulary
Token 'anticipated' not in source vocabulary
Token 'dự đoán' not in target vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'một số' not in target vocabulary
Token 'vấn đề' not in target vocabulary
Token 'pull' not in source vocabulary
Token 'trở' not in target vocabulary
Token 'vê' not in target vocabulary
Token 'nha' not in target vocabulary
Token 'correct' not in source vocabulary
Token 'measurements' not in source vocabulary
Token 'tailoring' not in source vocabulary
Token 'thực hiện' not in target vocabulary
Token 'đo' not in target vocabulary
Token 'chính xác' not in target vocabulary
Token 'bộ đồ' 

9it [00:14,  1.62s/it]

Token 'tưởng' not in target vocabulary
Token 'bao giờ' not in target vocabulary
Token 'ngu ngốc' not in target vocabulary
Token 'victory' not in source vocabulary
Token 'race' not in source vocabulary
Token 'chiến thắng' not in target vocabulary
Token 'đất nước' not in target vocabulary
Token 'hoàn thành' not in target vocabulary
Token 'đầu tiên' not in target vocabulary
Token 'bệnh viện' not in target vocabulary
Token 'jeff' not in target vocabulary
Token 'dường như' not in target vocabulary
Token 'assumed' not in source vocabulary
Token 'soccer' not in source vocabulary
Token 'bị thương' not in target vocabulary
Token 'bóng đá' not in target vocabulary
Token 'chúng ta' not in target vocabulary
Token 'thể hiện' not in target vocabulary
Token 'blue' not in source vocabulary
Token 'mắt xanh' not in target vocabulary
Token 'panelists' not in source vocabulary
Token 'tham luận viên' not in target vocabulary
Token 'thảo luận' not in target vocabulary
Token 'vấn đề' not in target vocabulary

10it [00:16,  1.61s/it]

Token 'spread' not in source vocabulary
Token 'cloth' not in source vocabulary
Token 'vải' not in target vocabulary
Token 'washed' not in source vocabulary
Token 'gội' not in target vocabulary
Token 'electronic' not in source vocabulary
Token 'media' not in source vocabulary
Token 'primary' not in source vocabulary
Token 'source' not in source vocabulary
Token 'information' not in source vocabulary
Token 'phương tiện' not in target vocabulary
Token 'truyền thông' not in target vocabulary
Token 'tin tức' not in target vocabulary
Token 'điện tử' not in target vocabulary
Token 'nguồn' not in target vocabulary
Token 'thông tin' not in target vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'treaty' not in source vocabulary
Token 'ban' not in source vocabulary
Token 'nuclear' not in source vocabulary
Token 'tests' not in source vocabulary
Token 'ground' not in source vocabulary
Token 'hiệp ước' not in target vocabulary
Token 'cấm' not in target vocabulary
Token 'hạt nhân' not in 

11it [00:17,  1.50s/it]

Token 'kết hôn' not in target vocabulary
Token 'lắng nghe' not in target vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'located' not in source vocabulary
Token 'supreme' not in source vocabulary
Token 'court' not in source vocabulary
Token 'đại sứ quán' not in target vocabulary
Token 'toà án' not in target vocabulary
Token 'tối cao' not in target vocabulary
Token 'hữu ích' not in target vocabulary
Token 'một chút' not in target vocabulary
Token '20' not in source vocabulary
Token '20' not in target vocabulary
Token 'xe buýt' not in target vocabulary
Token 'trở thành' not in target vocabulary
Token 'bác sĩ' not in target vocabulary
Token 'faint' not in source vocabulary
Token 'smile' not in source vocabulary
Token 'yếu ớt' not in target vocabulary
Token 'clever' not in source vocabulary
Token 'thông minh' not in target vocabulary
Token 'mend' not in source vocabulary
Token 'aboard' not in source vocabulary
Token 'máy bay' not in target vocabulary
Token 'ford' not in source

12it [00:18,  1.38s/it]

Token 'thế nào' not in target vocabulary
Token 'tàu hoả' not in target vocabulary
Token 'máy bay' not in target vocabulary
Token 'luôn luôn' not in target vocabulary
Token 'vấn đề' not in target vocabulary
Token 'đối với' not in target vocabulary
Token 'chúng ta' not in target vocabulary
Token 'information' not in source vocabulary
Token 'nhu cầu' not in target vocabulary
Token 'thông tin' not in target vocabulary
Token 'sake' not in source vocabulary
Token 'giả sử' not in target vocabulary
Token 'lý do' not in target vocabulary
Token 'tranh luận' not in target vocabulary
Token 'regret' not in source vocabulary
Token 'hối hận' not in target vocabulary
Token 'probability' not in source vocabulary
Token 'xác suất' not in target vocabulary
Token 'trừng phạt' not in target vocabulary
Token 'bắt đầu' not in target vocabulary
Token 'nude' not in source vocabulary
Token 'bao giờ' not in target vocabulary
Token 'khoả thân' not in target vocabulary
Token 'longest' not in source vocabulary
Token

13it [00:20,  1.59s/it]

Token 'barked' not in source vocabulary
Token 'sủa' not in target vocabulary
Token 'công việc' not in target vocabulary
Token 'kinh doanh' not in target vocabulary
Token 'thế nào' not in target vocabulary
Token 'lái xe' not in target vocabulary
Token 'hint' not in source vocabulary
Token 'gợi ý' not in target vocabulary
Token 'thời gian' not in target vocabulary
Token 'thậm chí' not in target vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'có thể' not in target vocabulary
Token 'bất cứ' not in target vocabulary
Token 'mail' not in source vocabulary
Token 'thu thập' not in target vocabulary
Token 'lobby' not in source vocabulary
Token 'sảnh' not in target vocabulary
Token 'tốt bụng' not in target vocabulary
Token 'rang' not in source vocabulary
Token 'supper' not in source vocabulary
Token 'điện thoại' not in target vocabulary
Token 'reo' not in target vocabulary
Token 'swelling' not in source vocabulary
Token 'head' not in source vocabulary
Token 'mood' not in source vocab

14it [00:22,  1.67s/it]

Token 'tours' not in source vocabulary
Token 'tour' not in target vocabulary
Token 'du lịch' not in target vocabulary
Token 'tuyệt vời' not in target vocabulary
Token 'seated' not in source vocabulary
Token 'circle' not in source vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'vòng tròn' not in target vocabulary
Token 'yolks' not in source vocabulary
Token 'unhealthy' not in source vocabulary
Token 'lòng đỏ' not in target vocabulary
Token 'thực sự' not in target vocabulary
Token 'lành mạnh' not in target vocabulary
Token 'unstoppable' not in source vocabulary
Token 'dường như' not in target vocabulary
Token 'không thể' not in target vocabulary
Token 'ngăn cản' not in target vocabulary
Token 'vấn đề' not in target vocabulary
Token 'adults' not in source vocabulary
Token 'có thể' not in target vocabulary
Token 'hakata' not in source vocabulary
Token 'hakata' not in target vocabulary
Token 'former' not in source vocabulary
Token 'cia' not in source vocabulary
Token 'cựu' not 

15it [00:23,  1.59s/it]

Token 'stepped' not in source vocabulary
Token 'tomato' not in source vocabulary
Token 'bao giờ' not in target vocabulary
Token 'cà chua' not in target vocabulary
Token 'không thể' not in target vocabulary
Token 'thực hiện' not in target vocabulary
Token 'có vẻ' not in target vocabulary
Token 'ngạc nhiên' not in target vocabulary
Token 'farther' not in source vocabulary
Token 'alive' not in source vocabulary
Token 'gia đình' not in target vocabulary
Token 'nghỉ ngơi' not in target vocabulary
Token 'consists' not in source vocabulary
Token 'mutual' not in source vocabulary
Token 'understanding' not in source vocabulary
Token 'bao gồm' not in target vocabulary
Token 'hiểu biết' not in target vocabulary
Token 'lẫn' not in target vocabulary
Token 'similar' not in source vocabulary
Token 'cảm giác' not in target vocabulary
Token 'tình huống' not in target vocabulary
Token 'tương tự' not in target vocabulary
Token 'shouted' not in source vocabulary
Token 'giúp đỡ' not in target vocabulary
To

16it [00:27,  2.11s/it]

Token 'bao nhiêu' not in target vocabulary
Token 'có thể' not in target vocabulary
Token 'cảm thấy' not in target vocabulary
Token 'tồi tệ' not in target vocabulary
Token 'intentions' not in source vocabulary
Token 'ý định' not in target vocabulary
Token 'finishing' not in source vocabulary
Token 'hoàn thành' not in target vocabulary
Token 'bài tập' not in target vocabulary
Token 'figured' not in source vocabulary
Token 'chi phí' not in target vocabulary
Token 'tại sao' not in target vocabulary
Token 'socks' not in source vocabulary
Token 'tất' not in target vocabulary
Token 'announcer' not in source vocabulary
Token 'thông báo' not in target vocabulary
Token 'kỳ lạ' not in target vocabulary
Token 'shaking' not in source vocabulary
Token 'run' not in target vocabulary
Token 'dwarf' not in source vocabulary
Token 'lùn' not in target vocabulary
Token 'affected' not in source vocabulary
Token 'audience' not in source vocabulary
Token 'phát biểu' not in target vocabulary
Token 'ảnh hưởng' 

17it [00:28,  1.80s/it]

Token 'flown' not in source vocabulary
Token 'trước đây' not in target vocabulary
Token 'survive' not in source vocabulary
Token 'chúng ta' not in target vocabulary
Token 'không thể' not in target vocabulary
Token 'sống sót' not in target vocabulary
Token 'reply' not in source vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'đánh giá' not in target vocabulary
Token 'trả lời' not in target vocabulary
Token 'không thể' not in target vocabulary
Token 'tại sao' not in target vocabulary
Token 'sự thật' not in target vocabulary
Token 'chờ đợi' not in target vocabulary
Token 'studies' not in source vocabulary
Token 'nodding' not in source vocabulary
Token 'lecture' not in source vocabulary
Token 'đôi khi' not in target vocabulary
Token 'gật' not in target vocabulary
Token 'mama' not in source vocabulary
Token 'cried' not in source vocabulary
Token 'dozens' not in source vocabulary
Token 'parking' not in source vocabulary
Token 'chục' not in target vocabulary
Token 'ô tô' not in ta

18it [00:29,  1.61s/it]

Token 'đề nghị' not in target vocabulary
Token 'folks' not in source vocabulary
Token 'folks' not in target vocabulary
Token 'osamu' not in source vocabulary
Token 'dazai' not in source vocabulary
Token 'osamu' not in target vocabulary
Token 'dazai' not in target vocabulary
Token 'tự sát' not in target vocabulary
Token 'cork' not in source vocabulary
Token 'trở lại' not in target vocabulary
Token 'đầu tiên' not in target vocabulary
Token 'shy' not in source vocabulary
Token 'xấu hổ' not in target vocabulary
Token 'nói chuyện' not in target vocabulary
Token 'bắt đầu' not in target vocabulary
Token 'ngay lập tức' not in target vocabulary
Token 'em gái' not in target vocabulary
Token 'distance' not in source vocabulary
Token 'có thể' not in target vocabulary
Token 'ánh sáng' not in target vocabulary
Token 'campfire' not in source vocabulary
Token 'đống' not in target vocabulary
Token 'lửa trại' not in target vocabulary
Token 'mạnh mẽ' not in target vocabulary
Token 'dũng cảm' not in targe

19it [00:30,  1.42s/it]

Token 'bathroom' not in source vocabulary
Token 'có thể' not in target vocabulary
Token 'nhà tắm' not in target vocabulary
Token 'an toàn' not in target vocabulary
Token 'chinese' not in source vocabulary
Token 'có thể' not in target vocabulary
Token 'trung quốc' not in target vocabulary
Token 'không thể' not in target vocabulary
Token 'assistant' not in source vocabulary
Token 'supermarket' not in source vocabulary
Token 'hiện tại' not in target vocabulary
Token 'chị' not in target vocabulary
Token 'trợ lý' not in target vocabulary
Token 'siêu thị' not in target vocabulary
Token 'abroad' not in source vocabulary
Token 'fund' not in source vocabulary
Token 'nước ngoài' not in target vocabulary
Token 'quỹ' not in target vocabulary
Token 'dự án' not in target vocabulary
Token 'insisted' not in source vocabulary
Token 'department' not in source vocabulary
Token 'khăng khăng' not in target vocabulary
Token 'cửa hàng' not in target vocabulary
Token 'bách hoá' not in target vocabulary
Token 

20it [00:31,  1.30s/it]

Token 'reasons' not in source vocabulary
Token 'lý do' not in target vocabulary
Token 'washington' not in source vocabulary
Token 'cherry' not in source vocabulary
Token 'blossoms' not in source vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'washington' not in target vocabulary
Token 'kịp thời' not in target vocabulary
Token 'anh đào' not in target vocabulary
Token 'sell' not in source vocabulary
Token 'operated' not in source vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'vận hành' not in target vocabulary
Token 'xuất hiện' not in target vocabulary
Token 'returned' not in source vocabulary
Token 'câu chuyện' not in target vocabulary
Token 'loose' not in source vocabulary
Token 'nguy hiểm' not in target vocabulary
Token 'bỏ rơi' not in target vocabulary
Token 'sự thật' not in target vocabulary
Token 'cảm thấy' not in target vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'thực sự' not in target vocabular

21it [00:32,  1.26s/it]

Token 'nun' not in source vocabulary
Token 'teresa' not in source vocabulary
Token 'trở thành' not in target vocabulary
Token 'nữ' not in target vocabulary
Token 'tu' not in target vocabulary
Token 'chị' not in target vocabulary
Token 'teresa' not in target vocabulary
Token 'có vẻ' not in target vocabulary
Token 'phấn khích' not in target vocabulary
Token 'newspaper' not in source vocabulary
Token 'tờ' not in target vocabulary
Token 'miễn phí' not in target vocabulary
Token 'lightning' not in source vocabulary
Token 'sét' not in target vocabulary
Token 'ngôn ngữ' not in target vocabulary
Token 'actor' not in source vocabulary
Token 'trở thành' not in target vocabulary
Token 'diễn viên' not in target vocabulary
Token 'một nửa' not in target vocabulary
Token 'hộ chiếu' not in target vocabulary
Token 'nhà ga' not in target vocabulary
Token 'tiễn' not in target vocabulary
Token 'naked' not in source vocabulary
Token 'khoả thân' not in target vocabulary
Token 'inconsiderate' not in source v

22it [00:33,  1.25s/it]

Token 'thường xuyên' not in target vocabulary
Token 'kết hôn' not in target vocabulary
Token 'gradually' not in source vocabulary
Token 'dần dần' not in target vocabulary
Token 'đàn ông' not in target vocabulary
Token 'sam' not in source vocabulary
Token 'sam' not in target vocabulary
Token 'có lẽ' not in target vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'thời gian' not in target vocabulary
Token 'cream' not in source vocabulary
Token 'một chút' not in target vocabulary
Token 'tại sao' not in target vocabulary
Token 'oysters' not in source vocabulary
Token 'hàu' not in target vocabulary
Token 'smiles' not in source vocabulary
Token 'indicate' not in source vocabulary
Token 'pleasure' not in source vocabulary
Token 'lúc nào' not in target vocabulary
Token 'biểu thị' not in target vocabulary
Token 'brief' not in source vocabulary
Token 'human' not in source vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'thảo luận' not in target vocabulary
Token 'con người' 

23it [00:35,  1.36s/it]

Token 'tất cả' not in target vocabulary
Token 'chúng ta' not in target vocabulary
Token 'không thể' not in target vocabulary
Token 'flicked' not in source vocabulary
Token 'heat' not in source vocabulary
Token 'nagoya' not in source vocabulary
Token 'có thể' not in target vocabulary
Token 'nagoya' not in target vocabulary
Token 'band' not in source vocabulary
Token 'một chút' not in target vocabulary
Token 'italy' not in source vocabulary
Token 'tất cả' not in target vocabulary
Token 'meningitis' not in source vocabulary
Token 'disease' not in source vocabulary
Token 'viêm' not in target vocabulary
Token 'màng' not in target vocabulary
Token 'não' not in target vocabulary
Token 'căn bệnh' not in target vocabulary
Token 'khủng khiếp' not in target vocabulary
Token 'gon' not in source vocabulary
Token 'na' not in source vocabulary
Token 'recovered' not in source vocabulary
Token 'hồi phục' not in target vocabulary
Token 'suspected' not in source vocabulary
Token 'stealing' not in source 

24it [00:36,  1.42s/it]

Token 'evade' not in source vocabulary
Token 'trốn tránh' not in target vocabulary
Token 'workers' not in source vocabulary
Token 'laid' not in source vocabulary
Token 'plant' not in source vocabulary
Token 'công nhân' not in target vocabulary
Token 'nghỉ việc' not in target vocabulary
Token 'nhà máy' not in target vocabulary
Token 'mức độ' not in target vocabulary
Token 'đồng ý' not in target vocabulary
Token 'resolve' not in source vocabulary
Token 'conflicts' not in source vocabulary
Token 'tại sao' not in target vocabulary
Token 'giải quyết' not in target vocabulary
Token 'xung đột' not in target vocabulary
Token 'không thể' not in target vocabulary
Token 'ngạc nhiên' not in target vocabulary
Token 'dish' not in source vocabulary
Token 'ngọt' not in target vocabulary
Token 'baggage' not in source vocabulary
Token 'missing' not in source vocabulary
Token 'hành lý' not in target vocabulary
Token 'rise' not in source vocabulary
Token 'năm ngoái' not in target vocabulary
Token 'tăng gi

25it [00:37,  1.29s/it]

Token 'jobs' not in source vocabulary
Token 'việc làm' not in target vocabulary
Token 'divided' not in source vocabulary
Token 'parts' not in source vocabulary
Token 'bí mật' not in target vocabulary
Token 'thực sự' not in target vocabulary
Token 'hay là' not in target vocabulary
Token 'như thể' not in target vocabulary
Token 'có thể' not in target vocabulary
Token 'một số' not in target vocabulary
Token 'không thể' not in target vocabulary
Token 'bất cứ' not in target vocabulary
Token 'demanded' not in source vocabulary
Token 'cảnh sát' not in target vocabulary
Token 'yêu cầu' not in target vocabulary
Token 'di chuyển' not in target vocabulary
Token 'carpets' not in source vocabulary
Token 'valuable' not in source vocabulary
Token 'newer' not in source vocabulary
Token 'carpets' not in source vocabulary
Token 'giá trị' not in target vocabulary
Token 'yêu cầu' not in target vocabulary
Token 'chúng ta' not in target vocabulary
Token 'cố gắng' not in target vocabulary
Token 'một chút' no

26it [00:38,  1.23s/it]

Token 'washington' not in source vocabulary
Token 'washington' not in target vocabulary
Token 'lo lắng' not in target vocabulary
Token 'có thể' not in target vocabulary
Token 'bắt đầu' not in target vocabulary
Token 'không thể' not in target vocabulary
Token 'pity' not in source vocabulary
Token 'tiếc' not in target vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'cơ hội' not in target vocabulary
Token 'silk' not in source vocabulary
Token 'lụa' not in target vocabulary
Token 'thường lệ' not in target vocabulary
Token 'hopeless' not in source vocabulary
Token 'tất cả' not in target vocabulary
Token 'vô vọng' not in target vocabulary
Token 'kết thúc' not in target vocabulary
Token 'unforgettable' not in source vocabulary
Token 'nhật bản' not in target vocabulary
Token 'châu âu' not in target vocabulary
Token 'anh chàng' not in target vocabulary
Token 'shake' not in source vocabulary
Token 'chúng tôi' not in target vocabulary
Token '

27it [00:40,  1.18s/it]

Token 'ăn ở' not in target vocabulary
Token 'contract' not in source vocabulary
Token 'thời gian' not in target vocabulary
Token 'hợp đồng' not in target vocabulary
Token 'có thể' not in target vocabulary
Token 'chúng ta' not in target vocabulary
Token 'whales' not in source vocabulary
Token 'cá voi' not in target vocabulary
Token 'làm ơn' not in target vocabulary
Token 'tiếp theo' not in target vocabulary
Token 'ngày mai' not in target vocabulary
Token 'pride' not in source vocabulary
Token 'có thể' not in target vocabulary
Token 'tự hào' not in target vocabulary
Token 'thời gian' not in target vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'đồng hồ' not in target vocabulary
Token 'bây giờ' not in target vocabulary
Token 'mấy' not in target vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'jewelry' not in source vocabulary
Token 'cửa hàng' not in target vocabulary
Token 'trang sức' not in target vocabulary
Token 'mở cửa' not in target vocabulary
Token 'someday'

28it [00:40,  1.45s/it]
0it [00:00, ?it/s]

Token 'tham dự' not in target vocabulary
Token 'modern' not in source vocabulary
Token 'impossibilities' not in source vocabulary
Token 'possibilities' not in source vocabulary
Token 'khoa học' not in target vocabulary
Token 'hiện đại' not in target vocabulary
Token 'không thể' not in target vocabulary
Token 'khả năng' not in target vocabulary
Token 'thế nào' not in target vocabulary
Token 'chuẩn bị' not in target vocabulary
Token 'có lẽ' not in target vocabulary
Token 'tai nạn' not in target vocabulary
Token 'xe hơi' not in target vocabulary
Token 'sử dụng' not in target vocabulary
Token 'remaining' not in source vocabulary
Token 'lựa chọn' not in target vocabulary
Token 'milk' not in source vocabulary
Token 'bác sĩ' not in target vocabulary
Token 'sữa' not in target vocabulary
Token 'complaining' not in source vocabulary
Token 'thực sự' not in target vocabulary
Token 'hy vọng' not in target vocabulary
Token 'phàn nàn' not in target vocabulary
Token 'lười biếng' not in target vocabula

1it [00:00,  2.18it/s]

Token 'thirteen' not in source vocabulary
Token 'unlucky' not in source vocabulary
Token 'một số' not in target vocabulary
Token 'mười ba' not in target vocabulary
Token 'con số' not in target vocabulary
Token 'may mắn' not in target vocabulary
Token 'không thể' not in target vocabulary
Token 'bào chữa' not in target vocabulary
Token 'trunk' not in source vocabulary
Token 'spare' not in source vocabulary
Token 'tire' not in source vocabulary
Token 'cốp' not in target vocabulary
Token 'lốp' not in target vocabulary
Token 'dự phòng' not in target vocabulary
Token 'có vẻ' not in target vocabulary
Token 'nghiêm trọng' not in target vocabulary
Token 'tại sao' not in target vocabulary
Token 'không thể' not in target vocabulary
Token 'tin tưởng' not in target vocabulary
Token 'chúng ta' not in target vocabulary
Token 'driver' not in source vocabulary
Token 'tài xế' not in target vocabulary
Token 'wakes' not in source vocabulary
Token 'giải' not in target vocabulary
Token 'tuyệt vời' not in ta

2it [00:00,  2.56it/s]

Token 'tìm hiểu' not in target vocabulary
Token 'cooks' not in source vocabulary
Token 'nấu ăn' not in target vocabulary
Token 'hall' not in source vocabulary
Token 'có thể' not in target vocabulary
Token 'thế nào' not in target vocabulary
Token 'hội trường' not in target vocabulary
Token 'thành phố' not in target vocabulary
Token 'postponed' not in source vocabulary
Token 'trò chơi' not in target vocabulary
Token 'hoãn' not in target vocabulary
Token 'chủ nhật' not in target vocabulary
Token 'cơ hội' not in target vocabulary
Token 'không thể' not in target vocabulary
Token 'lý do' not in target vocabulary
Token 'tại sao' not in target vocabulary
Token 'bao giờ' not in target vocabulary
Token 'burns' not in source vocabulary
Token 'roland' not in source vocabulary
Token 'đồng ý' not in target vocabulary
Token 'bỏng' not in target vocabulary
Token 'đồng ý' not in target vocabulary
Token 'roland' not in target vocabulary
Token 'chính xác' not in target vocabulary
Token 'làm bạn' not in t

3it [00:01,  2.54it/s]

Token 'wallet' not in source vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'pushy' not in source vocabulary
Token 'một chút' not in target vocabulary
Token 'thúc đẩy' not in target vocabulary
Token 'produced' not in source vocabulary
Token 'vở' not in target vocabulary
Token 'kịch' not in target vocabulary
Token 'sản xuất' not in target vocabulary
Token 'miners' not in source vocabulary
Token 'thợ' not in target vocabulary
Token 'mỏ' not in target vocabulary
Token 'chiến đấu' not in target vocabulary
Token 'pants' not in source vocabulary
Token 'hung' not in source vocabulary
Token 'chair' not in source vocabulary
Token 'quần' not in target vocabulary
Token 'cẩn thận' not in target vocabulary
Token 'harmless' not in source vocabulary
Token 'có vẻ' not in target vocabulary
Token 'vô hại' not in target vocabulary
Token 'engaged' not in source vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'đính hôn' not in target vocabulary
Token 'research' not in source vocabu

4it [00:01,  2.68it/s]


Epoch: 1 | Time: 0m 42s
	Train Loss: 5.317 | Train PPL: 203.865
	 Val. Loss: 2.455 |  Val. PPL:  11.641


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 2 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 3 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 4 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 5 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 6 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 7 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 8 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 9 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 10 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 11 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 12 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 13 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 14 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 15 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 16 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 17 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 18 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 19 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 20 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 21 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 22 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 23 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 24 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 25 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 26 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 27 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 28 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]


Epoch: 29 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000


0it [00:00, ?it/s]
0it [00:00, ?it/s]

Epoch: 30 | Time: 0m 0s
	Train Loss: 0.000 | Train PPL:   1.000
	 Val. Loss: 0.000 |  Val. PPL:   1.000





In [201]:
seq2seq.load_state_dict(torch.load('tut2-model.pt'))

test_loss = evaluate(seq2seq, test_batches, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

0it [00:00, ?it/s]

Token 'có thể' not in target vocabulary
Token 'sử dụng' not in target vocabulary
Token 'một số' not in target vocabulary
Token 'trợ giúp' not in target vocabulary
Token 'nhà bếp' not in target vocabulary
Token 'patting' not in source vocabulary
Token 'shoulder' not in source vocabulary
Token 'cảm thấy' not in target vocabulary
Token 'vỗ' not in target vocabulary
Token 'vai' not in target vocabulary
Token 'thin' not in source vocabulary
Token 'không khí' not in target vocabulary
Token 'mỏng' not in target vocabulary
Token 'đỉnh' not in target vocabulary
Token 'có thể' not in target vocabulary
Token 'không thể' not in target vocabulary
Token 'utmost' not in source vocabulary
Token 'quan trọng' not in target vocabulary
Token 'có thể' not in target vocabulary
Token 'rõ ràng' not in target vocabulary
Token 'thất vọng' not in target vocabulary
Token 'bác sĩ' not in target vocabulary
Token 'passenger' not in source vocabulary
Token 'toàn bộ' not in target vocabulary
Token 'hành khách' not in 

1it [00:00,  3.08it/s]

Token 'có thể' not in target vocabulary
Token 'một chút' not in target vocabulary
Token 'pretends' not in source vocabulary
Token 'sports' not in source vocabulary
Token 'giả vờ' not in target vocabulary
Token 'thể thao' not in target vocabulary
Token 'sweating' not in source vocabulary
Token 'helps' not in source vocabulary
Token 'regulate' not in source vocabulary
Token 'temperature' not in source vocabulary
Token 'thời tiết' not in target vocabulary
Token 'ấm áp' not in target vocabulary
Token 'mồ hôi' not in target vocabulary
Token 'con người' not in target vocabulary
Token 'điều chỉnh' not in target vocabulary
Token 'nhiệt độ' not in target vocabulary
Token 'cơ thể' not in target vocabulary
Token 'có thể' not in target vocabulary
Token 'ân huệ' not in target vocabulary
Token 'unconscious' not in source vocabulary
Token 'bất tỉnh' not in target vocabulary
Token 'policy' not in source vocabulary
Token 'criticized' not in source vocabulary
Token 'opposition' not in source vocabulary


2it [00:00,  2.99it/s]

Token 'entire' not in source vocabulary
Token 'toàn bộ' not in target vocabulary
Token 'owners' not in source vocabulary
Token 'negotiate' not in source vocabulary
Token 'mỏ' not in target vocabulary
Token 'từ chối' not in target vocabulary
Token 'đàm phán' not in target vocabulary
Token 'hôm nay' not in target vocabulary
Token 'heavily' not in source vocabulary
Token 'sedated' not in source vocabulary
Token 'mê hoặc' not in target vocabulary
Token 'tại sao' not in target vocabulary
Token 'alice' not in source vocabulary
Token 'yêu cầu' not in target vocabulary
Token 'alice' not in target vocabulary
Token 'yard' not in source vocabulary
Token 'gốc' not in target vocabulary
Token 'sân' not in target vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'thirsty' not in source vocabulary
Token 'khát' not in target vocabulary
Token 'vấn đề' not in target vocabulary
Token 'hoà hợp' not in target vocabulary
Token 'excused' not in source vocabulary
Token 'giáo viên' not in target vocab

3it [00:00,  3.09it/s]

Token 'terms' not in source vocabulary
Token 'recycling' not in source vocabulary
Token 'thảo luận' not in target vocabulary
Token 'vấn đề' not in target vocabulary
Token 'chất thải' not in target vocabulary
Token 'tái chế' not in target vocabulary
Token 'consult' not in source vocabulary
Token 'tham khảo' not in target vocabulary
Token 'ý kiến' not in target vocabulary
Token 'giáo viên' not in target vocabulary
Token 'duy nhất' not in target vocabulary
Token 'có thể' not in target vocabulary
Token 'tổn thương' not in target vocabulary
Token 'cảm xúc' not in target vocabulary
Token 'rabbits' not in source vocabulary
Token 'tails' not in source vocabulary
Token 'đuôi' not in target vocabulary
Token 'ghét anh' not in target vocabulary
Token 'wealth' not in source vocabulary
Token 'thế nào' not in target vocabulary
Token 'tất cả' not in target vocabulary
Token 'giàu có' not in target vocabulary
Token 'không thể' not in target vocabulary
Token 'làm việc' not in target vocabulary
Token 'chă

4it [00:01,  2.34it/s]

Token 'gray' not in source vocabulary
Token 'xám' not in target vocabulary
Token 'trí nhớ' not in target vocabulary
Token 'có vẻ' not in target vocabulary
Token 'nhặt' not in target vocabulary
Token 'conversation' not in source vocabulary
Token 'recorded' not in source vocabulary
Token 'hội thoại' not in target vocabulary
Token 'ghi' not in target vocabulary
Token 'cheating' not in source vocabulary
Token 'exam' not in source vocabulary
Token 'carpet' not in source vocabulary
Token 'gian lận' not in target vocabulary
Token 'chỉ định' not in target vocabulary
Token 'putting' not in source vocabulary
Token 'nguy cơ' not in target vocabulary
Token 'kindergarten' not in source vocabulary
Token 'mẫu giáo' not in target vocabulary
Token 'calf' not in source vocabulary
Token 'bê' not in target vocabulary
Token 'nhắm mắt' not in target vocabulary
Token 'gussied' not in source vocabulary
Token 'hẹn hò' not in target vocabulary
Token 'đầu tiên' not in target vocabulary
Token 'chúng ta' not in ta

5it [00:01,  2.47it/s]

Token 'trả lời' not in target vocabulary
Token 'surrounded' not in source vocabulary
Token 'brown' not in target vocabulary
Token 'bao' not in target vocabulary
Token 'balls' not in source vocabulary
Token 'xung quanh' not in target vocabulary
Token 'okay' not in source vocabulary
Token 'finds' not in source vocabulary
Token 'miễn là' not in target vocabulary
Token 'phát hiện' not in target vocabulary
Token 'defusing' not in source vocabulary
Token 'cố gắng' not in target vocabulary
Token 'giải quyết' not in target vocabulary
Token 'tình huống' not in target vocabulary
Token 'nhanh chóng' not in target vocabulary
Token 'hiring' not in source vocabulary
Token 'babysitter' not in source vocabulary
Token 'lý do' not in target vocabulary
Token 'quần jean' not in target vocabulary
Token 'mặt hàng' not in target vocabulary
Token 'quần áo' not in target vocabulary
Token 'xuất khẩu' not in target vocabulary
Token 'phổ biến' not in target vocabulary
Token 'needy' not in source vocabulary
Token 

6it [00:02,  2.84it/s]

Token 'results' not in source vocabulary
Token 'hài lòng' not in target vocabulary
Token 'kết quả' not in target vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'cảm thấy' not in target vocabulary
Token 'lo lắng' not in target vocabulary
Token 'killer' not in source vocabulary
Token 'rõ ràng' not in target vocabulary
Token 'quan tâm' not in target vocabulary
Token 'theory' not in source vocabulary
Token 'lý thuyết' not in target vocabulary
Token 'bệnh nhân' not in target vocabulary
Token 'có thể' not in target vocabulary
Token 'qua đời' not in target vocabulary
Token 'bất cứ' not in target vocabulary
Token 'lúc nào' not in target vocabulary
Token 'lười biếng' not in target vocabulary
Token 'chờ đợi' not in target vocabulary
Token 'mathematics' not in source vocabulary
Token 'chúng ta' not in target vocabulary
Token 'toán học' not in target vocabulary
Token 'không thể' not in target vocabulary
Token 'chúng tôi' not in target vocabulary
Token 'chúng tôi' not in target vocabul

7it [00:02,  2.69it/s]

| Test Loss: 4.336 | Test PPL:  76.431 |





In [220]:
print(train_examples[0])
example = train_examples[example_idx]
print(example)
print('source sentence: ', ' '.join(example.src))
print('target sentence: ', ' '.join(example.trg))

<__main__.Example object at 0x0000015405163F50>
<__main__.Example object at 0x0000015405163F50>
source sentence:  tom let go of mary 's wrists
target sentence:  tom buông cổ_tay của mary


In [None]:

src_tensor = source.numericalize([example["English"]], device=device)
trg_tensor = target.numericalize([example["Vietnamese"]], device=device)

print(trg_tensor.shape)

seq2seq.eval()
with torch.no_grad():
    outputs = seq2seq(src_tensor, trg_tensor, teacher_forcing_ratio=0)

print(outputs.shape)


In [None]:
output_idx = outputs[1:].squeeze(1).argmax(1).tolist()
print(output_idx)
' '.join([target.itos[idx] for idx in output_idx])


In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
import sacrebleu

def evaluate(model, iterator, source_field, target_field, device):
    model.eval()

    with torch.no_grad():
        targets = []
        predictions = []
        for i, batch in enumerate(iterator):
            src = getattr(batch, source_field)
            trg = getattr(batch, target_field)
            output = model(src, trg, 0) # turn off teacher forcing
            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            # convert output indices to tokens
            predicted_tokens = []
            for i in range(output.shape[0]):
                predicted_tokens.append(target_field.vocab.itos[output[i]])

            targets.append(trg.tolist())
            predictions.append(predicted_tokens)

    # calculate BLEU score
    references = [[target_field.vocab.itos[token] for token in t if token != target_field.vocab.stoi[target_field.eos_token]] for t in targets]
    candidate_corpus = [[token for token in prediction if token != target_field.eos_token] for prediction in predictions]
    bleu_score = corpus_bleu(references, candidate_corpus, smoothing_function=SmoothingFunction().method1)

    # calculate METEOR score
    meteor_score = 0
    for i in range(len(targets)):
        meteor_score += nltk.translate.meteor_score(references[i], predictions[i])
    meteor_score /= len(targets)

    # calculate ROUGE score
    rouge = sacrebleu.corpus_rouge(predictions, references)

    # calculate TER score
    ter_score = 0
    for i in range(len(targets)):
        ter_score += nltk.translate.ter(references[i], predictions[i])
    ter_score /= len(targets)

    return bleu_score, meteor_score, rouge.score, ter_score

evaluate(seq2seq,train_batches,source,target,device=device)
