In [1]:
import random
import torch
SEED = 2222
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1c9ff6aa610>

In [2]:
from vncorenlp import VnCoreNLP
annotator = VnCoreNLP("VnCoreNLP-master\VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m') 

In [3]:
import nltk
import string
import itertools

def tokenize_en(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in string.punctuation]
    return tokens

def tokenize_vi(text):
    return [tok for tok in itertools.chain.from_iterable(annotator.tokenize(text))]

text_en = 'Please put the dustpan in the broom closet'
text_vi = 'Cuốn sách này là của tôi. Của bạn đâu?'
print(tokenize_en(text_en))
print(tokenize_vi(text_vi))


['Please', 'put', 'the', 'dustpan', 'in', 'the', 'broom', 'closet']
['Cuốn', 'sách', 'này', 'là', 'của', 'tôi', '.', 'Của', 'bạn', 'đâu', '?']


In [4]:
import pandas as pd

def create_raw_dataset():
    data_dir = ""
    en_sents = open(data_dir + 'english.txt', "r",encoding="utf-8" ).read().splitlines()
    vi_sents = open(data_dir + 'vietnamese.txt', "r" ,encoding="utf-8").read().splitlines()
    return {
        "English": [line for line in en_sents[:5000]],
        "Vietnamese": [line for line in vi_sents[:5000]],
    }
raw_data = create_raw_dataset()

from sklearn.model_selection import train_test_split

df = pd.DataFrame(raw_data, columns=["English", "Vietnamese"])
train, test = train_test_split(df, test_size=0.2)
train, val = train_test_split(train, test_size=0.125)

train.to_json("train.json", orient="records", lines=True)
test.to_json("test.json", orient="records", lines=True)
val.to_json("val.json", orient="records", lines=True)


In [104]:
import json
from collections import Counter
from itertools import chain

source_tokenizer = tokenize_en
target_tokenizer = tokenize_vi

def load_data(filename, source_tokenizer, target_tokenizer):
    examples = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            example = json.loads(line)
            src = source_tokenizer(example["English"])
            trg = target_tokenizer(example["Vietnamese"])
            example["src"] = src
            example["trg"] = trg
            examples.append(example)
    return examples

train_examples = load_data("train.json", source_tokenizer, target_tokenizer)
val_examples = load_data("val.json", source_tokenizer, target_tokenizer)
test_examples = load_data("test.json", source_tokenizer, target_tokenizer)

def build_vocab(tokenized_sentences, max_size=None, min_freq=1):
    word_counts = Counter(chain(*tokenized_sentences))
    sorted_words = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
    if max_size is not None:
        sorted_words = sorted_words[:max_size]
    vocabulary = {"<pad>": 0, "<unk>": 1, "<sos>": 2, "<eos>": 3}
    for word, count in sorted_words:
        if count >= min_freq and word not in vocabulary:
            vocabulary[word] = len(vocabulary)
    return vocabulary

source_sentences_train = [example["src"] for example in train_examples]
target_sentences_train = [example["trg"] for example in train_examples]
source_vocab = build_vocab(source_sentences_train, max_size=10000, min_freq=2)
target_vocab = build_vocab(target_sentences_train, max_size=10000, min_freq=2)



In [63]:
print(f"Unique tokens in source (en) vocabulary: {len(source_vocab)}")
print(f"Unique tokens in target (vi) vocabulary: {len(target_vocab)}")

Unique tokens in source (en) vocabulary: 1511
Unique tokens in target (vi) vocabulary: 1356


In [86]:
print(source_sentences_train)

[['Will', 'you', 'be', 'taking', 'a', 'holiday', 'this', 'year'], ['He', 'decided', 'to', 'write', 'in', 'his', 'diary', 'every', 'day'], ['I', 'have', "n't", 'forgotten', 'our', 'first', 'date'], ['I', 'make', '€100', 'a', 'day'], ['I', 'ca', "n't", 'speak', 'French'], ['Tom', 'had', 'put', 'off', 'telling', 'Mary', 'the', 'bad', 'news', 'for', 'as', 'long', 'as', 'possible'], ['I', 'have', 'to', 'go', 'home', 'now'], ['I', 'do', "n't", 'find', 'Tom', 'particularly', 'interesting', 'to', 'talk', 'to'], ['The', 'girl', 'rowing', 'the', 'boat', 'is', 'my', 'cousin'], ['I', 'want', 'you', 'to', 'stop', 'this', 'immediately'], ['I', 'paid', 'nothing'], ['You', 'have', 'probably', 'never', 'heard', 'of', 'me', 'but', 'I', "'m", 'famous', 'back', 'home'], ['I', "'m", 'really', 'starting', 'to', 'get', 'worried'], ['You', "'ll", 'have', 'to', 'go'], ['He', 'studied', 'to', 'be', 'a', 'doctor'], ['It', 'is', 'very', 'difficult', 'to', 'persuade', 'people', 'to', 'change', 'their', 'life', 'st

In [44]:
BATCH_SIZE = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import torch.nn.functional as F

def get_iterator(data, batch_size, source_vocab, target_vocab, device):
    src_sents = [x[0] for x in data]
    trg_sents = [x[1] for x in data]

    # Sort the sentences by length for efficiency
    sorted_indices = sorted(range(len(src_sents)), key=lambda i: len(src_sents[i]))

    # Split the data into batches
    batches = []
    for i in range(0, len(src_sents), batch_size):
        indices = sorted_indices[i:i+batch_size]
        src_batch = [src_sents[j] for j in indices]
        trg_batch = [trg_sents[j] for j in indices]
        batch = {"src": src_batch, "trg": trg_batch}
        batches.append(batch)

    # Shuffle the batches
    random.shuffle(batches)

    # Get the number of batches
    num_batches = len(batches)

    # Iterate over the batches
    for batch in batches:
        # Convert the sentences to sequences of indices
        src_seqs = [torch.LongTensor([source_vocab[token] if token in source_vocab else source_vocab['<unk>'] for token in sent]) for sent in batch['src']]
        trg_seqs = [torch.LongTensor([target_vocab[token] if token in target_vocab else target_vocab['<unk>'] for token in sent]) for sent in batch['trg']]

        # Pad the sequences
        src_seqs = torch.nn.utils.rnn.pad_sequence(src_seqs, batch_first=True, padding_value=source_vocab['<pad>']).to(device)
        trg_seqs = torch.nn.utils.rnn.pad_sequence(trg_seqs, batch_first=True, padding_value=target_vocab['<pad>']).to(device)
        trg_seqs = F.pad(trg_seqs, (0, 8 - (trg_seqs.size(1) % 8)), value=target_vocab['<pad>'])

        # Transpose the source sequences tensor to have batch size as the first dimension
        src_seqs = src_seqs.transpose(0, 1)
        trg_seqs = trg_seqs.transpose(0, 1)

        # Return the batch
        yield {"src": src_seqs, "trg": trg_seqs}

    # Return the number of batches
    return num_batches, batches

train_batches = list(get_iterator(train_examples, BATCH_SIZE, source_vocab, target_vocab, device))
valid_batches = list(get_iterator(val_examples, BATCH_SIZE, source_vocab, target_vocab, device))
test_batches = list(get_iterator(test_examples, BATCH_SIZE, source_vocab, target_vocab, device))
source_lengths = [len(sent) for sent in source_sentences_train]
target_lengths = [len(sent) for sent in target_sentences_train]
print("Source sentence lengths: ", sorted(set(source_lengths)))
print("Target sentence lengths: ", sorted(set(target_lengths)))
num_train_batches = len(train_batches)
num_valid_batches = len(valid_batches)
for i, batch in enumerate (test_batches):
    print(f"Batch {i}:")
    print("Source sequences:")
    print(num_train_batches)
    print(num_valid_batches)
    print(batch["src"].shape)
    print("Target sequences:")
    print(batch["trg"].shape)
    print(batch["trg"][i])




Source sentence lengths:  [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 29]
Target sentence lengths:  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 29]
Batch 0:
Source sequences:
28
4
torch.Size([8, 128])
Target sequences:
torch.Size([16, 128])
tensor([  26,   89,   59,    1,   30,    1,    6,   25,    8,   44,    6,    4,
           1,    6,    4,    1,   41,   26,    4,    1,    6,   26,    8,  113,
           6,    4,    4,    1,   59,   26, 1254,   12,  131,   76,    1,   47,
         646,  309,    8,    6,   26,    4,  389,   26,  325,    4,   37,    6,
         191,   26,    4,    1,   26,   26,    4,    4,    1,    4,  236,  281,
          34,   89,  101,    1,  324,    6,  186,    4,    4,    8,    4,    8,
           4,    6,   12,   25,   30,   25,    8,    4,    8,    8,  134,  241,
           8,    8,    8,    4,    4,    8,  122,   12,   10,   25,    1,  101,
          26,   47,    4,    1,   

In [34]:
import torch
from torch import nn, optim

# adjustable parameters
INPUT_DIM = len(source_vocab)
OUTPUT_DIM = len(target_vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
N_LAYERS = 1
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, n_layers, dropout):
        super().__init__()
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.input_dim = input_dim
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, n_layers, dropout=dropout,
                          bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

    def forward(self, src_batch):
        # src [sent len, batch size]

        # [sent len, batch size, emb dim]
        embedded = self.embedding(src_batch)
        outputs, hidden = self.rnn(embedded)
        # outputs -> [sent len, batch size, hidden dim * n directions]
        # hidden -> [n layers * n directions, batch size, hidden dim]

        # initial decoder hidden is final hidden state of the forwards and
        # backwards encoder RNNs fed through a linear layer
        concated = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        hidden = torch.tanh(self.fc(concated))
        return outputs, hidden

encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)
outputs, hidden = encoder(batch["src"])
# transpose the outputs tensor to have batch size as the first dimension
print(outputs.shape, hidden.shape)



torch.Size([4, 128, 1024]) torch.Size([128, 512])




In [35]:
class Attention(nn.Module):

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        # enc_hid_dim multiply by 2 due to bidirectional
        self.fc1 = nn.Linear(enc_hid_dim * 2 + dec_hid_dim, dec_hid_dim)
        self.fc2 = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, encoder_outputs, hidden):
        src_len = encoder_outputs.shape[0]
        batch_size = encoder_outputs.shape[1]
        
        # repeat encoder hidden state src_len times [batch size, sent len, dec hid dim]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        # reshape/permute the encoder output, so that the batch size comes first
        # [batch size, sent len, enc hid dim * 2], times 2 because of bidirectional
        outputs = encoder_outputs.permute(1, 0, 2)

        # the attention mechanism receives a concatenation of the hidden state
        # and the encoder output
        concat = torch.cat((hidden, outputs), dim=2)
        
        # fully connected layer and softmax layer to compute the attention weight
        # [batch size, sent len, dec hid dim]
        energy = torch.tanh(self.fc1(concat))
        # attention weight should be of [batch size, sent len]
        attention = self.fc2(energy).squeeze(dim=2)  
        attention_weight = torch.softmax(attention, dim=1)
        return attention_weight

    
attention = Attention(ENC_HID_DIM, DEC_HID_DIM).to(device)
attention_weight = attention(outputs, hidden)
attention_weight.shape

torch.Size([128, 4])

In [36]:
class Decoder(nn.Module):

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, n_layers,
                 dropout, attention):
        super().__init__()
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(enc_hid_dim * 2 + emb_dim, dec_hid_dim, n_layers, dropout=dropout)
        self.linear = nn.Linear(dec_hid_dim, output_dim)

    def forward(self, trg, encoder_outputs, hidden):
        # trg [batch size]
        # outputs [src sen len, batch size, enc hid dim * 2], times 2 due to bidirectional
        # hidden [batch size, dec hid dim]

        # [batch size, 1, sent len] 
        attention = self.attention(encoder_outputs, hidden).unsqueeze(1)

        # [batch size, sent len, enc hid dim * 2]
        outputs = encoder_outputs.permute(1, 0, 2)

        # [1, batch size, enc hid dim * 2]
        context = torch.bmm(attention, outputs).permute(1, 0, 2)

        # input sentence -> embedding
        # [1, batch size, emb dim]
        embedded = self.embedding(trg.unsqueeze(0))
        rnn_input = torch.cat((embedded, context), dim=2)

        outputs, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        prediction = self.linear(outputs.squeeze(0))
        return prediction, hidden.squeeze(0)

decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT, attention).to(device)
prediction, decoder_hidden = decoder(batch["trg"][0], outputs, hidden)

# notice the decoder_hidden's shape should match the shape that's generated by
# the encoder
prediction.shape, decoder_hidden.shape

(torch.Size([128, 1356]), torch.Size([128, 512]))

In [37]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src_batch, trg_batch, teacher_forcing_ratio=0.5):
        max_len, batch_size = trg_batch.shape
        trg_vocab_size = self.decoder.output_dim

        # tensor to store decoder's output
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)

        # encoder_outputs : all hidden states of the input sequence (forward and backward)
        # hidden : final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src_batch)

        trg = trg_batch[0]
        for i in range(1, max_len):
            prediction, hidden = self.decoder(trg, encoder_outputs, hidden)
            outputs[i] = prediction

            if random.random() < teacher_forcing_ratio:
                trg = trg_batch[i]
            else:
                trg = prediction.argmax(1)

        return outputs

attention = Attention(ENC_HID_DIM, DEC_HID_DIM)
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, ENC_DROPOUT)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT, attention)
seq2seq = Seq2Seq(encoder, decoder, device).to(device)
seq2seq

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(1511, 256)
    (rnn): GRU(256, 512, dropout=0.5, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (attention): Attention(
      (fc1): Linear(in_features=1536, out_features=512, bias=True)
      (fc2): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(1356, 256)
    (rnn): GRU(1280, 512, dropout=0.5)
    (linear): Linear(in_features=512, out_features=1356, bias=True)
  )
)

In [38]:
outputs = seq2seq(batch["src"], batch["trg"])
outputs.shape

torch.Size([16, 128, 1356])

In [39]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(seq2seq):,} trainable parameters')

The model has 7,862,860 trainable parameters


In [40]:
optimizer = optim.Adam(seq2seq.parameters())

# ignore the padding index when calculating the loss
PAD_IDX = target_vocab['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [41]:
from tqdm import tqdm
import math
import time

def train(seq2seq, iterator, optimizer, criterion):
    seq2seq.train()
    
    epoch_loss = 0
    
    for batch in tqdm(iterator):
        optimizer.zero_grad()
        outputs = seq2seq(batch["src"], batch["trg"])

        # the loss function only works on 2d inputs
        # and 1d targets we need to flatten each of them
        outputs_flatten = outputs[1:].view(-1, outputs.shape[-1])
        trg_flatten = batch["trg"][1:].reshape(-1)
        loss = criterion(outputs_flatten, trg_flatten)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()

    return epoch_loss / len(list(iterator))


def evaluate(seq2seq, iterator, criterion):
    seq2seq.eval()

    epoch_loss = 0
    num_batches = len(iterator)

    with torch.no_grad():
        for batch in tqdm(iterator):
            # turn off teacher forcing
            outputs = seq2seq(batch["src"], batch["trg"], teacher_forcing_ratio=0) 

            # trg = [trg sent len, batch size]
            # output = [trg sent len, batch size, output dim]
            outputs_flatten = outputs[1:].view(-1, outputs.shape[-1])
            trg_flatten = batch["trg"][1:].reshape(-1)
            loss = criterion(outputs_flatten, trg_flatten)
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)
  

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

N_EPOCHS = 30
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(seq2seq, train_batches, optimizer, criterion)
    valid_loss = evaluate(seq2seq, list(valid_batches), criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(seq2seq.state_dict(), 'tut2-model.pt')

    print(f'Epoch: {epoch+1} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

100%|██████████| 28/28 [00:21<00:00,  1.29it/s]
100%|██████████| 4/4 [00:01<00:00,  3.08it/s]


Epoch: 1 | Time: 0m 23s
	Train Loss: 5.734 | Train PPL: 309.162
	 Val. Loss: 5.181 |  Val. PPL: 177.928


100%|██████████| 28/28 [00:21<00:00,  1.31it/s]
100%|██████████| 4/4 [00:01<00:00,  3.22it/s]


Epoch: 2 | Time: 0m 22s
	Train Loss: 4.988 | Train PPL: 146.659
	 Val. Loss: 4.897 |  Val. PPL: 133.868


100%|██████████| 28/28 [00:27<00:00,  1.01it/s]
100%|██████████| 4/4 [00:01<00:00,  3.00it/s]


Epoch: 3 | Time: 0m 28s
	Train Loss: 4.438 | Train PPL:  84.602
	 Val. Loss: 4.463 |  Val. PPL:  86.784


100%|██████████| 28/28 [00:22<00:00,  1.23it/s]
100%|██████████| 4/4 [00:01<00:00,  2.93it/s]


Epoch: 4 | Time: 0m 24s
	Train Loss: 3.877 | Train PPL:  48.298
	 Val. Loss: 4.259 |  Val. PPL:  70.755


100%|██████████| 28/28 [00:23<00:00,  1.21it/s]
100%|██████████| 4/4 [00:01<00:00,  3.08it/s]


Epoch: 5 | Time: 0m 24s
	Train Loss: 3.332 | Train PPL:  28.000
	 Val. Loss: 4.126 |  Val. PPL:  61.936


100%|██████████| 28/28 [00:22<00:00,  1.24it/s]
100%|██████████| 4/4 [00:01<00:00,  2.91it/s]


Epoch: 6 | Time: 0m 23s
	Train Loss: 2.962 | Train PPL:  19.331
	 Val. Loss: 3.982 |  Val. PPL:  53.623


100%|██████████| 28/28 [00:23<00:00,  1.21it/s]
100%|██████████| 4/4 [00:01<00:00,  3.09it/s]


Epoch: 7 | Time: 0m 24s
	Train Loss: 2.484 | Train PPL:  11.994
	 Val. Loss: 3.956 |  Val. PPL:  52.230


100%|██████████| 28/28 [00:21<00:00,  1.30it/s]
100%|██████████| 4/4 [00:01<00:00,  3.08it/s]


Epoch: 8 | Time: 0m 22s
	Train Loss: 2.033 | Train PPL:   7.636
	 Val. Loss: 3.830 |  Val. PPL:  46.040


100%|██████████| 28/28 [00:21<00:00,  1.33it/s]
100%|██████████| 4/4 [00:01<00:00,  2.99it/s]


Epoch: 9 | Time: 0m 22s
	Train Loss: 1.697 | Train PPL:   5.458
	 Val. Loss: 3.757 |  Val. PPL:  42.833


100%|██████████| 28/28 [00:21<00:00,  1.32it/s]
100%|██████████| 4/4 [00:01<00:00,  3.14it/s]


Epoch: 10 | Time: 0m 22s
	Train Loss: 1.384 | Train PPL:   3.992
	 Val. Loss: 3.952 |  Val. PPL:  52.019


100%|██████████| 28/28 [00:21<00:00,  1.33it/s]
100%|██████████| 4/4 [00:01<00:00,  2.72it/s]


Epoch: 11 | Time: 0m 22s
	Train Loss: 1.144 | Train PPL:   3.140
	 Val. Loss: 3.796 |  Val. PPL:  44.514


100%|██████████| 28/28 [00:21<00:00,  1.28it/s]
100%|██████████| 4/4 [00:01<00:00,  3.14it/s]


Epoch: 12 | Time: 0m 23s
	Train Loss: 1.045 | Train PPL:   2.843
	 Val. Loss: 4.108 |  Val. PPL:  60.821


100%|██████████| 28/28 [00:22<00:00,  1.26it/s]
100%|██████████| 4/4 [00:01<00:00,  3.18it/s]


Epoch: 13 | Time: 0m 23s
	Train Loss: 0.895 | Train PPL:   2.447
	 Val. Loss: 3.889 |  Val. PPL:  48.883


100%|██████████| 28/28 [00:20<00:00,  1.34it/s]
100%|██████████| 4/4 [00:01<00:00,  2.89it/s]


Epoch: 14 | Time: 0m 22s
	Train Loss: 0.627 | Train PPL:   1.873
	 Val. Loss: 3.870 |  Val. PPL:  47.954


100%|██████████| 28/28 [00:21<00:00,  1.33it/s]
100%|██████████| 4/4 [00:01<00:00,  3.17it/s]


Epoch: 15 | Time: 0m 22s
	Train Loss: 0.432 | Train PPL:   1.540
	 Val. Loss: 3.869 |  Val. PPL:  47.879


100%|██████████| 28/28 [00:21<00:00,  1.31it/s]
100%|██████████| 4/4 [00:01<00:00,  3.01it/s]


Epoch: 16 | Time: 0m 22s
	Train Loss: 0.320 | Train PPL:   1.378
	 Val. Loss: 3.968 |  Val. PPL:  52.871


100%|██████████| 28/28 [00:22<00:00,  1.23it/s]
100%|██████████| 4/4 [00:01<00:00,  3.11it/s]


Epoch: 17 | Time: 0m 24s
	Train Loss: 0.260 | Train PPL:   1.297
	 Val. Loss: 3.993 |  Val. PPL:  54.224


100%|██████████| 28/28 [00:20<00:00,  1.35it/s]
100%|██████████| 4/4 [00:01<00:00,  2.82it/s]


Epoch: 18 | Time: 0m 22s
	Train Loss: 0.243 | Train PPL:   1.275
	 Val. Loss: 4.062 |  Val. PPL:  58.063


100%|██████████| 28/28 [00:20<00:00,  1.37it/s]
100%|██████████| 4/4 [00:01<00:00,  3.28it/s]


Epoch: 19 | Time: 0m 21s
	Train Loss: 0.232 | Train PPL:   1.261
	 Val. Loss: 4.144 |  Val. PPL:  63.077


100%|██████████| 28/28 [00:20<00:00,  1.35it/s]
100%|██████████| 4/4 [00:01<00:00,  3.24it/s]


Epoch: 20 | Time: 0m 22s
	Train Loss: 0.186 | Train PPL:   1.204
	 Val. Loss: 4.131 |  Val. PPL:  62.256


100%|██████████| 28/28 [00:20<00:00,  1.34it/s]
100%|██████████| 4/4 [00:01<00:00,  3.09it/s]


Epoch: 21 | Time: 0m 22s
	Train Loss: 0.132 | Train PPL:   1.141
	 Val. Loss: 4.147 |  Val. PPL:  63.231


100%|██████████| 28/28 [00:21<00:00,  1.31it/s]
100%|██████████| 4/4 [00:01<00:00,  3.01it/s]


Epoch: 22 | Time: 0m 22s
	Train Loss: 0.105 | Train PPL:   1.111
	 Val. Loss: 4.258 |  Val. PPL:  70.671


100%|██████████| 28/28 [00:20<00:00,  1.38it/s]
100%|██████████| 4/4 [00:01<00:00,  3.26it/s]


Epoch: 23 | Time: 0m 21s
	Train Loss: 0.083 | Train PPL:   1.087
	 Val. Loss: 4.333 |  Val. PPL:  76.146


100%|██████████| 28/28 [00:20<00:00,  1.34it/s]
100%|██████████| 4/4 [00:01<00:00,  3.24it/s]


Epoch: 24 | Time: 0m 22s
	Train Loss: 0.078 | Train PPL:   1.081
	 Val. Loss: 4.298 |  Val. PPL:  73.555


100%|██████████| 28/28 [00:20<00:00,  1.36it/s]
100%|██████████| 4/4 [00:01<00:00,  3.29it/s]


Epoch: 25 | Time: 0m 21s
	Train Loss: 0.066 | Train PPL:   1.068
	 Val. Loss: 4.260 |  Val. PPL:  70.832


100%|██████████| 28/28 [00:20<00:00,  1.38it/s]
100%|██████████| 4/4 [00:01<00:00,  3.24it/s]


Epoch: 26 | Time: 0m 21s
	Train Loss: 0.060 | Train PPL:   1.062
	 Val. Loss: 4.298 |  Val. PPL:  73.517


100%|██████████| 28/28 [00:20<00:00,  1.35it/s]
100%|██████████| 4/4 [00:01<00:00,  2.79it/s]


Epoch: 27 | Time: 0m 22s
	Train Loss: 0.051 | Train PPL:   1.053
	 Val. Loss: 4.416 |  Val. PPL:  82.742


100%|██████████| 28/28 [00:20<00:00,  1.34it/s]
100%|██████████| 4/4 [00:01<00:00,  3.22it/s]


Epoch: 28 | Time: 0m 22s
	Train Loss: 0.037 | Train PPL:   1.038
	 Val. Loss: 4.373 |  Val. PPL:  79.254


100%|██████████| 28/28 [00:20<00:00,  1.36it/s]
100%|██████████| 4/4 [00:01<00:00,  3.19it/s]


Epoch: 29 | Time: 0m 21s
	Train Loss: 0.029 | Train PPL:   1.030
	 Val. Loss: 4.365 |  Val. PPL:  78.645


100%|██████████| 28/28 [00:20<00:00,  1.36it/s]
100%|██████████| 4/4 [00:01<00:00,  3.28it/s]

Epoch: 30 | Time: 0m 21s
	Train Loss: 0.024 | Train PPL:   1.024
	 Val. Loss: 4.397 |  Val. PPL:  81.176





In [45]:
seq2seq.load_state_dict(torch.load('tut2-model.pt'))

test_loss = evaluate(seq2seq, test_batches, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

100%|██████████| 8/8 [00:01<00:00,  4.48it/s]

| Test Loss: 3.715 | Test PPL:  41.061 |





In [110]:
example_idx = 10
example = train_examples[example_idx]
print('source sentence: ', ' '.join(example["src"]))
print('target sentence: ', ' '.join(example["trg"]))

source sentence:  I paid nothing
target sentence:  tôi không trả gì cả


In [113]:
src_sentence = source_sentences_train[0]
trg_sentence = target_sentences_train[0]

src_tensor = torch.tensor([source_vocab[word] for word in src_sentence], dtype=torch.long).unsqueeze(1).to(device)
trg_tensor = torch.tensor([target_vocab[word] for word in trg_sentence], dtype=torch.long).unsqueeze(1).to(device)

print(trg_tensor.shape)

seq2seq.eval()
with torch.no_grad():
    outputs = seq2seq(src_tensor, trg_tensor, teacher_forcing_ratio=0)

outputs.shape

torch.Size([9, 1])


torch.Size([9, 1, 1356])

In [116]:
output_idx = outputs[1:].argmax(2)
predicted_sentence = ' '.join([list(target_vocab.keys())[idx.item()] for idx in output_idx])

In [122]:
import nltk.translate.bleu_score as bleu
import sacrebleu

def compute_bleu(reference_corpus, translation_corpus):
    """
    Computes BLEU score using the NLTK library.

    :param reference_corpus: list of reference sentences
    :param translation_corpus: list of translated sentences
    :return: BLEU score
    """
    references = [[ref.split()] for ref in reference_corpus]
    translations = [trans.split() for trans in translation_corpus]
    return bleu.corpus_bleu(references, translations)

def compute_meteor(reference_corpus, translation_corpus):
    """
    Computes METEOR score using the NLTK library.

    :param reference_corpus: list of reference sentences
    :param translation_corpus: list of translated sentences
    :return: METEOR score
    """
    meteor = nltk.translate.meteor_score.meteor_score(reference_corpus, translation_corpus)
    return meteor

def compute_rouge(reference_corpus, translation_corpus):
    """
    Computes ROUGE score using the sacrebleu library.

    :param reference_corpus: list of reference sentences
    :param translation_corpus: list of translated sentences
    :return: ROUGE score
    """
    rouge = sacrebleu.corpus_rouge(translation_corpus, [reference_corpus])
    return rouge.score



In [125]:
import nltk
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from nltk.translate import rouge_score

import sacrebleu

# Tokenize sentences using NLTK
def tokenize(sentence):
    return nltk.word_tokenize(sentence.lower())

# Calculate BLEU score using NLTK
def calculate_bleu_score(y_true, y_pred):
    references = [[tokenize(y)] for y in y_true]
    hypotheses = [tokenize(y) for y in y_pred]
    return corpus_bleu(references, hypotheses)

# Calculate METEOR score using NLTK
def calculate_meteor_score(y_true, y_pred):
    return sum([meteor_score([y], p) for y, p in zip(y_true, y_pred)]) / len(y_true)

# Calculate ROUGE score using NLTK
def calculate_rouge_score(y_true, y_pred):
    return sum([rouge_n(y, p, 1) for y, p in zip(y_true, y_pred)]) / len(y_true)

# Calculate BLEU score using sacrebleu
def calculate_bleu_score_sacrebleu(y_true, y_pred):
    references = [[y] for y in y_true]
    hypothesis = [y for y in y_pred]
    bleu = sacrebleu.corpus_bleu(hypothesis, references).score
    return bleu

# Example usage
y_true = [example['trg'] for example in val_examples]
y_pred = [' '.join(pred) for pred in y_pred]

bleu_score = calculate_bleu_score(y_true, y_pred)
meteor_score = calculate_meteor_score(y_true, y_pred)
rouge_score = calculate_rouge_score(y_true, y_pred)
bleu_score_sacrebleu = calculate_bleu_score_sacrebleu(y_true, y_pred)

print(f"BLEU score: {bleu_score:.4f}")
print(f"BLEU score (sacrebleu): {bleu_score_sacrebleu:.4f}")
print(f"METEOR score: {meteor_score:.4f}")
print(f"ROUGE score: {rouge_score:.4f}")


ImportError: cannot import name 'rouge_score' from 'nltk.translate' (C:\Users\16262\AppData\Roaming\Python\Python311\site-packages\nltk\translate\__init__.py)