In [24]:
!pip install spacy nltk --user
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm





[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.1/12.8 MB 825.8 kB/s eta 0:00:16
     --------------------------------------- 0.1/12.8 MB 751.6 kB/s eta 0:00:17
     --------------------------------------- 0.1/12.8 MB 751.6 kB/s eta 0:00:17
     --------------------------------------- 0.1/12.8 MB 751.6 kB/s eta 0:00:17
     --------------------------------------- 0.1/12.8 MB 481.4 kB/s eta 0:00:27
      -------------------------------------- 0.2/12.8 MB 726.5 kB/s eta 0:00:18
      -------------------------------------- 0.3/12.8 MB 842.9 kB/s eta 0:00:15
      -------------------------------------- 0.3/12.8 MB 846.5 kB/s eta 0:00:15
     - ---------------------------------


[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
     ---------------------------------------- 0.0/16.3 MB ? eta -:--:--
     ---------------------------------------- 0.0/16.3 MB ? eta -:--:--
     --------------------------------------- 0.1/16.3 MB 525.1 kB/s eta 0:00:31
     --------------------------------------- 0.1/16.3 MB 655.4 kB/s eta 0:00:25
     ---------------------------------------- 0.2/16.3 MB 1.1 MB/s eta 0:00:15
      --------------------------------------- 0.3/16.3 MB 1.5 MB/s eta 0:00:11
     - -------------------------------------- 0.5/16.3 MB 1.8 MB/s eta 0:00:09
     - -------------------------------------- 0.7/16.3 MB 2.3 MB/s eta 0:00:07
     -- ------------------------------------- 0.9/16.3 MB 2.6 MB/s eta 0:00:06
     -- ------------------------------------- 1.1/16.3 MB 2.8 MB/s eta 0:00:06
     --- ----------------------------------


[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import torch
print(torch.__version__)


2.1.0+cu121


In [2]:
# ============================================================
# 1. IMPORTS
# ============================================================

import spacy
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from nltk.translate.bleu_score import sentence_bleu
import random
# from numba import cuda
from tqdm import tqdm
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("DEVICE:", device)


DEVICE: cuda


In [3]:
# ============================================================
# 2. LOAD DATA (TRAIN / VAL / TEST)
# ============================================================

def load_file(path):
    with open(path, "r", encoding="utf8") as f:
        return [line.strip() for line in f]

# train_en = load_file("/content/train.en")
# train_fr = load_file("/content/train.fr")

# val_en = load_file("/content/val.en")
# val_fr = load_file("/content/val.fr")

# test_en = load_file("/content/test.en")
# test_fr = load_file("/content/test.fr")

train_en = load_file("./data/train.en")
train_fr = load_file("./data/train.fr")

val_en = load_file("./data/val.en")
val_fr = load_file("./data/val.fr")

test_en = load_file("./data/test.en")
test_fr = load_file("./data/test.fr")

print("Train examples:", len(train_en))
print("Val examples:", len(val_en))
print("Test examples:", len(test_en))


Train examples: 29000
Val examples: 1014
Test examples: 1071


In [4]:
# ============================================================
# 3. TOKENIZATION (spaCy)
# ============================================================


nlp_en = spacy.load("en_core_web_sm")
nlp_fr = spacy.load("fr_core_news_sm")

def en_tokenizer(text):
    return [tok.text.lower() for tok in nlp_en.tokenizer(text)]
def fr_tokenizer(text):
    return [tok.text.lower() for tok in nlp_fr.tokenizer(text)]

train_en_tok = [en_tokenizer(s) for s in train_en]
train_fr_tok = [fr_tokenizer(s) for s in train_fr]

val_en_tok = [en_tokenizer(s) for s in val_en]
val_fr_tok = [fr_tokenizer(s) for s in val_fr]

test_en_tok = [en_tokenizer(s) for s in test_en]
test_fr_tok = [fr_tokenizer(s) for s in test_fr]


In [5]:
# ============================================================
# 4. BUILD VOCAB
# ============================================================

from collections import Counter

SPECIAL_TOKENS = ["<pad>", "<unk>", "<sos>", "<eos>"]

class Vocab:
    def __init__(self, tokens, max_size=10000):
        self.freq = Counter(tokens)
        most_common = self.freq.most_common(max_size)

        # word2idx
        self.itos = SPECIAL_TOKENS + [w for w, _ in most_common]
        self.stoi = {w:i for i, w in enumerate(self.itos)}

    def numericalize(self, tokens):
        return [self.stoi.get(t, self.stoi["<unk>"]) for t in tokens]

    def __len__(self):
        return len(self.itos)

en_vocab = Vocab(tok for sent in train_en_tok for tok in sent)
fr_vocab = Vocab(tok for sent in train_fr_tok for tok in sent)

PAD_IDX = fr_vocab.stoi["<pad>"]
SOS_IDX = fr_vocab.stoi["<sos>"]
EOS_IDX = fr_vocab.stoi["<eos>"]


In [6]:
# ============================================================
# 5. DATASET + DATALOADER
# ============================================================

class TranslationDataset(Dataset):
    def __init__(self, en_data, fr_data, vocab_en, vocab_fr):
        self.en = en_data
        self.fr = fr_data
        self.vocab_en = vocab_en
        self.vocab_fr = vocab_fr

    def __len__(self):
        return len(self.en)

    # def numericalize(self, tokens, vocab):
    #     return [vocab["<sos>"]] + [vocab[t] for t in tokens] + [vocab["<eos>"]]

    # def __getitem__(self, idx):
    #     src_num = self.numericalize(self.src[idx], self.src_vocab)
    #     trg_num = self.numericalize(self.trg[idx], self.trg_vocab)
    #     return torch.tensor(src_num), torch.tensor(trg_num)

    def __getitem__(self, idx):
        en_tokens = ["<sos>"] + self.en[idx] + ["<eos>"]
        fr_tokens = ["<sos>"] + self.fr[idx] + ["<eos>"]

        en_ids = self.vocab_en.numericalize(en_tokens)
        fr_ids = self.vocab_fr.numericalize(fr_tokens)

        return torch.tensor(en_ids), torch.tensor(fr_ids)


def collate_fn(batch):
    en_batch, fr_batch = zip(*batch)
    en_batch = pad_sequence(en_batch, padding_value=PAD_IDX)
    fr_batch = pad_sequence(fr_batch, padding_value=PAD_IDX)
    return en_batch, fr_batch


train_ds = TranslationDataset(train_en_tok, train_fr_tok, en_vocab, fr_vocab)
val_ds = TranslationDataset(val_en_tok, val_fr_tok, en_vocab, fr_vocab)
test_ds = TranslationDataset(test_en_tok, test_fr_tok, en_vocab, fr_vocab)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_ds, batch_size=1, shuffle=False, collate_fn=collate_fn)


In [7]:
# # ============================================================
# # 6. ENCODER - DECODER MODEL
# # ============================================================

# class Encoder(nn.Module):
#     def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=2, dropout=0.3):
#         super().__init__()
#         self.embedding = nn.Embedding(vocab_size, embed_dim)
#         self.lstm = nn.LSTM(embed_dim, hidden_dim,
#                             num_layers=num_layers,
#                             dropout=dropout)

#     def forward(self, src):
#         embedded = self.embedding(src)
#         outputs, (hidden, cell) = self.lstm(embedded)
#         return hidden, cell


# class Decoder(nn.Module):
#     def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=2, dropout=0.3):
#         super().__init__()
#         self.embedding = nn.Embedding(vocab_size, embed_dim)
#         self.lstm = nn.LSTM(embed_dim, hidden_dim,
#                             num_layers=num_layers,
#                             dropout=dropout)
#         self.fc = nn.Linear(hidden_dim, vocab_size)

#     def forward(self, input, hidden, cell):
#         input = input.unsqueeze(0)
#         embedded = self.embedding(input)
#         output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
#         prediction = self.fc(output.squeeze(0))
#         return prediction, hidden, cell


# class Seq2Seq(nn.Module):
#     def __init__(self, encoder, decoder):
#         super().__init__()
#         self.encoder = encoder
#         self.decoder = decoder

#     def forward(self, src, trg, teacher_forcing=0.3):
#         batch_size = trg.size(1)
#         max_len = trg.size(0)
#         vocab_size = self.decoder.fc.out_features

#         outputs = torch.zeros(max_len, batch_size, vocab_size).to(src.device)

#         hidden, cell = self.encoder(src)
#         input_token = trg[0, :]

#         for t in range(1, max_len):
#             output, hidden, cell = self.decoder(input_token, hidden, cell)
#             outputs[t] = output
#             best = output.argmax(1)

#             input_token = trg[t] if random.random() < teacher_forcing else best

#         return outputs


In [8]:
class LuongAttention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim

    def forward(self, decoder_hidden, encoder_outputs):
        """
        decoder_hidden: [num_layers, batch, hidden]
        encoder_outputs: [src_len, batch, hidden]
        """
        # l·∫•y hidden c·ªßa layer cu·ªëi: [batch, hidden]
        decoder_hidden = decoder_hidden[-1].unsqueeze(2)  # [batch, hidden, 1]

        # Score = encoder_output ¬∑ decoder_hidden
        # encoder_outputs: [src_len, batch, hidden]
        # sau permute:    [batch, src_len, hidden]
        scores = torch.bmm(
            encoder_outputs.permute(1,0,2),
            decoder_hidden
        ).squeeze(2)  # [batch, src_len]

        attn_weights = torch.softmax(scores, dim=1)  # [batch, src_len]

        return attn_weights


In [9]:
# class Encoder(nn.Module):
#     def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=2, dropout=0.5):
#         super().__init__()
#         self.embedding = nn.Embedding(vocab_size, embed_dim)
#         self.lstm = nn.LSTM(embed_dim, hidden_dim,
#                             num_layers=num_layers,
#                             dropout=dropout, bidirectional = True)

#     def forward(self, src):
#         embedded = self.embedding(src)
#         outputs, (hidden, cell) = self.lstm(embedded)
#         return outputs, hidden, cell   # tr·∫£ v·ªÅ t·∫•t c·∫£ hidden states


In [9]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=2, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim,
                            num_layers=num_layers,
                            dropout=dropout, bidirectional=True)

        # ‚ö° reduce 1024 ‚Üí 512
        self.fc_reduce = nn.Linear(hidden_dim * 2, hidden_dim)

    def forward(self, src):
        embedded = self.embedding(src)
        outputs, (hidden, cell) = self.lstm(embedded)

        # reduce dimension
        outputs = self.fc_reduce(outputs)

        return outputs, hidden, cell


In [10]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=2, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim,
                            num_layers=num_layers,
                            dropout=dropout)

        self.attention = LuongAttention(hidden_dim)

        # combine context + decoder hidden
        self.fc_concat = nn.Linear(hidden_dim * 2, hidden_dim)

        self.fc_out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_token, hidden, cell, encoder_outputs):
        input_token = input_token.unsqueeze(0)
        embedded = self.embedding(input_token)

        lstm_output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        lstm_output = lstm_output.squeeze(0)  # [batch, hidden]

        # =========== Luong Attention ============
        attn_weights = self.attention(hidden, encoder_outputs)  # [batch, src_len]

        # context vector = sum(attn * encoder_outputs)
        context = torch.bmm(
            attn_weights.unsqueeze(1),       # [batch, 1, src_len]
            encoder_outputs.permute(1,0,2)   # [batch, src_len, hidden]
        ).squeeze(1)  # [batch, hidden]

        # concat context + output
        combined = torch.cat((lstm_output, context), dim=1)  # [batch, 2*hidden]
        combined = torch.tanh(self.fc_concat(combined))      # [batch, hidden]

        # final prediction
        prediction = self.fc_out(combined)  # [batch, vocab]

        return prediction, hidden, cell


In [11]:
# class Seq2Seq(nn.Module):
#     def __init__(self, encoder, decoder):
#         super().__init__()
#         self.encoder = encoder
#         self.decoder = decoder

#     def forward(self, src, trg, teacher_forcing=0.3):
#         batch_size = trg.size(1)
#         max_len = trg.size(0)
#         vocab_size = self.decoder.fc_out.out_features

#         outputs = torch.zeros(max_len, batch_size, vocab_size).to(src.device)

#         encoder_outputs, hidden, cell = self.encoder(src)
#         input_token = trg[0, :]

#         for t in range(1, max_len):
#             output, hidden, cell = self.decoder(
#                 input_token, hidden, cell, encoder_outputs
#             )

#             outputs[t] = output
#             best = output.argmax(1)

#             input_token = trg[t] if random.random() < teacher_forcing else best

#         return outputs


In [11]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def _merge_bidir(self, h):
        # h: [num_layers*2, batch, hidden]
        return (h[0::2] + h[1::2]) / 2

    def forward(self, src, trg, teacher_forcing=0.3):
        batch_size = trg.size(1)
        max_len = trg.size(0)
        vocab_size = self.decoder.fc_out.out_features

        outputs = torch.zeros(max_len, batch_size, vocab_size).to(src.device)

        encoder_outputs, hidden, cell = self.encoder(src)

        # üî• G·ªòP 2 H∆Ø·ªöNG ‚Üí 1 H∆Ø·ªöNG ‚≠ê
        hidden = self._merge_bidir(hidden)
        cell   = self._merge_bidir(cell)

        input_token = trg[0, :]

        for t in range(1, max_len):
            output, hidden, cell = self.decoder(
                input_token, hidden, cell, encoder_outputs
            )

            outputs[t] = output
            best = output.argmax(1)

            input_token = trg[t] if random.random() < teacher_forcing else best

        return outputs


In [13]:
# # ============================================================
# # 7. TRAINING + VALIDATION + EARLY STOPPING
# # ============================================================

input_dim = len(en_vocab)
output_dim = len(fr_vocab)

encoder = Encoder(input_dim, 256, 512).to(device)
decoder = Decoder(output_dim, 256, 512).to(device)
model = Seq2Seq(encoder, decoder).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=0.001)


def evaluate(model, loader):
    model.eval()
    total = 0
    with torch.no_grad():
        for src, trg in loader:
            src, trg = src.to(device), trg.to(device)
            outputs = model(src, trg, teacher_forcing=0)
            loss = criterion(outputs[1:].reshape(-1, outputs.size(-1)),
                             trg[1:].reshape(-1))
            total += loss.item()
    return total / len(loader)


EPOCHS = 20
best_val = float("inf")
patience = 3
wait = 0

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for src, trg in tqdm(train_loader):
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()
        outputs = model(src, trg)

        loss = criterion(outputs[1:].reshape(-1, outputs.size(-1)),
                         trg[1:].reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    train_loss = total_loss / len(train_loader)
    val_loss = evaluate(model, val_loader)

    print(f"Epoch {epoch+1} | Train={train_loss:.4f} | Val={val_loss:.4f}")

    if val_loss < best_val:
        best_val = val_loss
        wait = 0
        torch.save(model.state_dict(), "best_model.pth")
        print("Saved best model")
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping triggered")
            break


  0%|          | 2/907 [00:03<28:01,  1.86s/it]


KeyboardInterrupt: 

In [None]:
# def translate(sentence):
#     model.eval()

#     # tokenize
#     tokens = en_tokenizer(sentence)

#     # t·∫°o vector s·ªë h√≥a ƒë√∫ng chu·∫©n
#     ids = (
#         [en_vocab.stoi["<sos>"]] +
#         [en_vocab.stoi.get(t, en_vocab.stoi["<unk>"]) for t in tokens] +
#         [en_vocab.stoi["<eos>"]]
#     )

#     src = torch.tensor(ids).unsqueeze(1).to(device)

#     with torch.no_grad():
#         hidden, cell = model.encoder(src)

#     # token ƒë·∫ßu ti√™n ƒë·∫ßu ra
#     input_tok = torch.tensor([fr_vocab.stoi["<sos>"]]).to(device)
#     outputs = []

#     for _ in range(50):
#         with torch.no_grad():
#             pred, hidden, cell = model.decoder(input_tok, hidden, cell)

#         top_id = pred.argmax(1).item()

#         if top_id == fr_vocab.stoi["<eos>"]:
#             break

#         outputs.append(top_id)
#         input_tok = torch.tensor([top_id]).to(device)

#     return " ".join(fr_vocab.itos[i] for i in outputs)


In [12]:
input_dim = len(en_vocab)
output_dim = len(fr_vocab)

encoder = Encoder(input_dim, 256, 512).to(device)
decoder = Decoder(output_dim, 256, 512).to(device)
model = Seq2Seq(encoder, decoder).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [13]:
def translate(sentence, max_len=50):
    model.eval()

    # tokenize c√¢u ti·∫øng Anh
    tokens = en_tokenizer(sentence)

    # chuy·ªÉn sang ID
    ids = (
        [en_vocab.stoi["<sos>"]] +
        [en_vocab.stoi.get(t, en_vocab.stoi["<unk>"]) for t in tokens] +
        [en_vocab.stoi["<eos>"]]
    )

    src = torch.tensor(ids).unsqueeze(1).to(device)   # shape: [seq_len, 1]

    # ---- RUN ENCODER ----
    with torch.no_grad():
        encoder_outputs, hidden, cell = model.encoder(src)

    # b·∫Øt ƒë·∫ßu decoder b·∫±ng token <sos>
    input_tok = torch.tensor([fr_vocab.stoi["<sos>"]]).to(device)

    outputs = []

    # ---- RUN DECODER LOOP ----
    for _ in range(max_len):
        with torch.no_grad():
            pred, hidden, cell = model.decoder(
                input_tok,
                hidden,
                cell,
                encoder_outputs   # <<<< QUAN TR·ªåNG
            )

        top_id = pred.argmax(1).item()

        if top_id == fr_vocab.stoi["<eos>"]:
            break

        outputs.append(top_id)
        input_tok = torch.tensor([top_id]).to(device)

    # chuy·ªÉn ID ‚Üí t·ª´
    return " ".join(fr_vocab.itos[i] for i in outputs)


In [20]:
# def translate_beam(sentence, max_len=50, beam_size=3):
#     model.eval()

#     # tokenize c√¢u ti·∫øng Anh
#     tokens = en_tokenizer(sentence)

#     # chuy·ªÉn sang ID
#     ids = (
#         [en_vocab.stoi["<sos>"]] +
#         [en_vocab.stoi.get(t, en_vocab.stoi["<unk>"]) for t in tokens] +
#         [en_vocab.stoi["<eos>"]]
#     )

#     src = torch.tensor(ids).unsqueeze(1).to(device)  # [seq_len, 1]

#     # ---- RUN ENCODER ----
#     with torch.no_grad():
#         encoder_outputs, hidden, cell = model.encoder(src)

#     # beam = list of (sequence_ids, hidden, cell, score_log_prob)
#     beam = [([fr_vocab.stoi["<sos>"]], hidden, cell, 0.0)]

#     for _ in range(max_len):
#         new_beam = []

#         for seq, h, c, score in beam:
#             input_tok = torch.tensor([seq[-1]]).to(device)

#             with torch.no_grad():
#                 pred, h_new, c_new = model.decoder(input_tok, h, c, encoder_outputs)

#             log_probs = torch.log_softmax(pred, dim=1).squeeze(0)  # [vocab_size]

#             # l·∫•y top k token
#             top_log_probs, top_ids = torch.topk(log_probs, beam_size)

#             for log_p, tok_id in zip(top_log_probs.tolist(), top_ids.tolist()):
#                 new_seq = seq + [tok_id]
#                 new_score = score + log_p
#                 new_beam.append((new_seq, h_new, c_new, new_score))

#         # gi·ªØ l·∫°i beam_size sequences t·ªët nh·∫•t
#         new_beam = sorted(new_beam, key=lambda x: x[3], reverse=True)[:beam_size]
#         beam = new_beam

#         # n·∫øu t·∫•t c·∫£ beam ƒë√£ g·∫∑p <eos>, d·ª´ng
#         if all(seq[-1] == fr_vocab.stoi["<eos>"] for seq, _, _, _ in beam):
#             break

#     # ch·ªçn sequence c√≥ score cao nh·∫•t
#     best_seq = beam[0][0]

#     # lo·∫°i b·ªè <sos> v√† c·∫Øt ƒë·∫øn <eos>
#     if fr_vocab.stoi["<eos>"] in best_seq:
#         eos_idx = best_seq.index(fr_vocab.stoi["<eos>"])
#         best_seq = best_seq[1:eos_idx]
#     else:
#         best_seq = best_seq[1:]

#     return " ".join(fr_vocab.itos[i] for i in best_seq)


In [14]:
def translate_beam(sentence, max_len=50, beam_size=3):
    model.eval()

    # tokenize c√¢u ti·∫øng Anh
    tokens = en_tokenizer(sentence)

    # chuy·ªÉn sang ID
    ids = (
        [en_vocab.stoi["<sos>"]] +
        [en_vocab.stoi.get(t, en_vocab.stoi["<unk>"]) for t in tokens] +
        [en_vocab.stoi["<eos>"]]
    )

    src = torch.tensor(ids).unsqueeze(1).to(device)  # [seq_len, 1]

    # ---- RUN ENCODER ----
    with torch.no_grad():
        encoder_outputs, hidden, cell = model.encoder(src)

        # üî• merge bidirectional hidden/cell
        def merge_bidir(h):
            return (h[0::2] + h[1::2]) / 2

        hidden = merge_bidir(hidden)
        cell   = merge_bidir(cell)

    # beam = list of (sequence_ids, hidden, cell, score_log_prob)
    beam = [([fr_vocab.stoi["<sos>"]], hidden, cell, 0.0)]

    for _ in range(max_len):
        new_beam = []

        for seq, h, c, score in beam:
            input_tok = torch.tensor([seq[-1]]).to(device)

            with torch.no_grad():
                pred, h_new, c_new = model.decoder(input_tok, h, c, encoder_outputs)

            log_probs = torch.log_softmax(pred, dim=1).squeeze(0)  # [vocab_size]

            # l·∫•y top k token
            top_log_probs, top_ids = torch.topk(log_probs, beam_size)

            for log_p, tok_id in zip(top_log_probs.tolist(), top_ids.tolist()):
                new_seq = seq + [tok_id]
                new_score = score + log_p
                new_beam.append((new_seq, h_new, c_new, new_score))

        # gi·ªØ l·∫°i beam_size sequences t·ªët nh·∫•t
        new_beam = sorted(new_beam, key=lambda x: x[3], reverse=True)[:beam_size]
        beam = new_beam

        # n·∫øu t·∫•t c·∫£ beam ƒë√£ g·∫∑p <eos>, d·ª´ng
        if all(seq[-1] == fr_vocab.stoi["<eos>"] for seq, _, _, _ in beam):
            break

    # ch·ªçn sequence c√≥ score cao nh·∫•t
    best_seq = beam[0][0]

    # lo·∫°i b·ªè <sos> v√† c·∫Øt ƒë·∫øn <eos>
    if fr_vocab.stoi["<eos>"] in best_seq:
        eos_idx = best_seq.index(fr_vocab.stoi["<eos>"])
        best_seq = best_seq[1:eos_idx]
    else:
        best_seq = best_seq[1:]

    return " ".join(fr_vocab.itos[i] for i in best_seq)


In [15]:
from nltk.translate.bleu_score import corpus_bleu

def evaluate_bleu():
    references = []   # d·∫°ng: [[ref_tokens], [ref_tokens], ...]
    hypotheses = []   # d·∫°ng: [pred_tokens, pred_tokens, ...]

    for en, fr in zip(test_en_tok, test_fr_tok):
        # input cho model l√† chu·ªói ti·∫øng Anh
        pred = translate_beam(" ".join(en))

        # BLEU y√™u c·∫ßu:
        #   - ref: list c√°c c√¢u tham chi·∫øu ‚Üí m·ªói c√¢u ph·∫£i b·ªçc trong 1 list
        #   - hyp: list c√°c c√¢u d·ª± ƒëo√°n tokenized
        references.append([fr])
        hypotheses.append(pred.split())

    score = corpus_bleu(
        references,
        hypotheses,
        weights=(0.25, 0.25, 0.25, 0.25)
    )
    return score


model.load_state_dict(torch.load("best_model_31211943.pth"))
bleu = evaluate_bleu()
print("Corpus BLEU on Test =", bleu)


Corpus BLEU on Test = 0.3121401146591142


In [25]:
# S·ªë c√¢u mu·ªën th·ª≠
num_examples = 5

for i in range(num_examples):
    # L·∫•y c√¢u g·ªëc ti·∫øng Anh
    en_sentence = " ".join(test_en_tok[i])

    # D·ªãch sang ti·∫øng Ph√°p
    fr_pred = translate_beam(en_sentence, beam_size=3)

    # L·∫•y c√¢u tham chi·∫øu ti·∫øng Ph√°p
    fr_ref = " ".join(test_fr_tok[i])

    # In ra k·∫øt qu·∫£
    print(f"Example {i+1}")
    print("EN (Original) :", en_sentence)
    print("FR (Reference):", fr_ref)
    print("FR (Predicted):", fr_pred)
    print("-" * 60)


Example 1
EN (Original) : a young man participates in a career while the subject who records it smiles .
FR (Reference): un jeune homme participe √† une course pendant que le sujet qui le filme sourit .
FR (Predicted): un jeune homme participe √† un tracteur tandis que le qu' il est en train de sourit .
------------------------------------------------------------
Example 2
EN (Original) : the man is scratching the back of his neck while looking for a book in a book store .
FR (Reference): l' homme se gratte l' arri√®re du cou tout en cherchant un livre dans une librairie .
FR (Predicted): l' homme a le l' arri√®re d' son cou , en train de regarder un livre dans un livre .
------------------------------------------------------------
Example 3
EN (Original) : a person wearing goggles and a hat is sled riding .
FR (Reference): une personne portant des lunettes de protection et un chapeau fait de la luge .
FR (Predicted): une personne avec des lunettes et une casquette fait du roller .
---