In [1]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))


2.1.0+cu121
True
NVIDIA GeForce RTX 3050 Laptop GPU


In [None]:
!pip install spacy nltk --user
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm





[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.1/12.8 MB 2.0 MB/s eta 0:00:07
      --------------------------------------- 0.2/12.8 MB 2.8 MB/s eta 0:00:05
     - -------------------------------------- 0.5/12.8 MB 4.3 MB/s eta 0:00:03
     -- ------------------------------------- 0.9/12.8 MB 5.9 MB/s eta 0:00:03
     ---- ----------------------------------- 1.4/12.8 MB 6.2 MB/s eta 0:00:02
     ----- ---------------------------------- 1.8/12.8 MB 7.0 MB/s eta 0:00:02
     ------ --------------------------------- 2.2/12.8 MB 7.3 MB/s eta 0:00:02
     ------- -------------------------------- 2.5/12.8 MB 7.4 MB/s eta 0:00:02
     --------- ------------------------------ 3.0/12.8 MB 7.5 MB/s eta 0:00:02
     ---------- -------------------------


[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
     ---------------------------------------- 0.0/16.3 MB ? eta -:--:--
     ---------------------------------------- 0.1/16.3 MB 2.6 MB/s eta 0:00:07
      --------------------------------------- 0.3/16.3 MB 3.1 MB/s eta 0:00:06
     - -------------------------------------- 0.6/16.3 MB 4.7 MB/s eta 0:00:04
     -- ------------------------------------- 1.0/16.3 MB 5.8 MB/s eta 0:00:03
     -- ------------------------------------- 1.2/16.3 MB 5.3 MB/s eta 0:00:03
     ---- ----------------------------------- 1.8/16.3 MB 6.6 MB/s eta 0:00:03
     ----- ---------------------------------- 2.4/16.3 MB 7.5 MB/s eta 0:00:02
     ------ --------------------------------- 2.8/16.3 MB 7.5 MB/s eta 0:00:02
     -------- ------------------------------- 3.5/16.3 MB 8.2 MB/s eta 0:00:02
     ---------- ----------------------


[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# ============================================================
# 1. IMPORTS
# ============================================================

import spacy
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from nltk.translate.bleu_score import sentence_bleu
import random
# from numba import cuda
from tqdm import tqdm
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("DEVICE:", device)


DEVICE: cuda


In [3]:
# ============================================================
# 2. LOAD DATA (TRAIN / VAL / TEST)
# ============================================================

def load_file(path):
    with open(path, "r", encoding="utf8") as f:
        return [line.strip() for line in f]

# train_en = load_file("/content/train.en")
# train_fr = load_file("/content/train.fr")

# val_en = load_file("/content/val.en")
# val_fr = load_file("/content/val.fr")

# test_en = load_file("/content/test.en")
# test_fr = load_file("/content/test.fr")

train_en = load_file("./data/train.en")
train_fr = load_file("./data/train.fr")

val_en = load_file("./data/val.en")
val_fr = load_file("./data/val.fr")

test_en = load_file("./data/test.en")
test_fr = load_file("./data/test.fr")

print("Train examples:", len(train_en))
print("Val examples:", len(val_en))
print("Test examples:", len(test_en))


Train examples: 29000
Val examples: 1014
Test examples: 1071


In [4]:
# ============================================================
# 3. TOKENIZATION (spaCy)
# ============================================================


nlp_en = spacy.load("en_core_web_sm")
nlp_fr = spacy.load("fr_core_news_sm")

def en_tokenizer(text):
    return [tok.text.lower() for tok in nlp_en.tokenizer(text)]
def fr_tokenizer(text):
    return [tok.text.lower() for tok in nlp_fr.tokenizer(text)]

train_en_tok = [en_tokenizer(s) for s in train_en]
train_fr_tok = [fr_tokenizer(s) for s in train_fr]

val_en_tok = [en_tokenizer(s) for s in val_en]
val_fr_tok = [fr_tokenizer(s) for s in val_fr]

test_en_tok = [en_tokenizer(s) for s in test_en]
test_fr_tok = [fr_tokenizer(s) for s in test_fr]


In [5]:
# ============================================================
# 4. BUILD VOCAB
# ============================================================

from collections import Counter

SPECIAL_TOKENS = ["<pad>", "<unk>", "<sos>", "<eos>"]

class Vocab:
    def __init__(self, tokens, max_size=10000):
        self.freq = Counter(tokens)
        most_common = self.freq.most_common(max_size)

        # word2idx
        self.itos = SPECIAL_TOKENS + [w for w, _ in most_common]
        self.stoi = {w:i for i, w in enumerate(self.itos)}

    def numericalize(self, tokens):
        return [self.stoi.get(t, self.stoi["<unk>"]) for t in tokens]

    def __len__(self):
        return len(self.itos)

en_vocab = Vocab(tok for sent in train_en_tok for tok in sent)
fr_vocab = Vocab(tok for sent in train_fr_tok for tok in sent)

PAD_IDX = fr_vocab.stoi["<pad>"]
SOS_IDX = fr_vocab.stoi["<sos>"]
EOS_IDX = fr_vocab.stoi["<eos>"]


In [6]:
# ============================================================
# 5. DATASET + DATALOADER
# ============================================================

class TranslationDataset(Dataset):
    def __init__(self, en_data, fr_data, vocab_en, vocab_fr):
        self.en = en_data
        self.fr = fr_data
        self.vocab_en = vocab_en
        self.vocab_fr = vocab_fr

    def __len__(self):
        return len(self.en)

    # def numericalize(self, tokens, vocab):
    #     return [vocab["<sos>"]] + [vocab[t] for t in tokens] + [vocab["<eos>"]]

    # def __getitem__(self, idx):
    #     src_num = self.numericalize(self.src[idx], self.src_vocab)
    #     trg_num = self.numericalize(self.trg[idx], self.trg_vocab)
    #     return torch.tensor(src_num), torch.tensor(trg_num)

    def __getitem__(self, idx):
        en_tokens = ["<sos>"] + self.en[idx] + ["<eos>"]
        fr_tokens = ["<sos>"] + self.fr[idx] + ["<eos>"]

        en_ids = self.vocab_en.numericalize(en_tokens)
        fr_ids = self.vocab_fr.numericalize(fr_tokens)

        return torch.tensor(en_ids), torch.tensor(fr_ids)


def collate_fn(batch):
    en_batch, fr_batch = zip(*batch)
    en_batch = pad_sequence(en_batch, padding_value=PAD_IDX)
    fr_batch = pad_sequence(fr_batch, padding_value=PAD_IDX)
    return en_batch, fr_batch


train_ds = TranslationDataset(train_en_tok, train_fr_tok, en_vocab, fr_vocab)
val_ds = TranslationDataset(val_en_tok, val_fr_tok, en_vocab, fr_vocab)
test_ds = TranslationDataset(test_en_tok, test_fr_tok, en_vocab, fr_vocab)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_ds, batch_size=1, shuffle=False, collate_fn=collate_fn)


In [7]:
# ============================================================
# 6. ENCODER - DECODER MODEL
# ============================================================

class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=2, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim,
                            num_layers=num_layers,
                            dropout=dropout)

    def forward(self, src):
        embedded = self.embedding(src)
        outputs, (hidden, cell) = self.lstm(embedded)
        return hidden, cell


class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=2, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim,
                            num_layers=num_layers,
                            dropout=dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.embedding(input)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc(output.squeeze(0))
        return prediction, hidden, cell


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing=0.3):
        batch_size = trg.size(1)
        max_len = trg.size(0)
        vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(max_len, batch_size, vocab_size).to(src.device)

        hidden, cell = self.encoder(src)
        input_token = trg[0, :]

        for t in range(1, max_len):
            output, hidden, cell = self.decoder(input_token, hidden, cell)
            outputs[t] = output
            best = output.argmax(1)

            input_token = trg[t] if random.random() < teacher_forcing else best

        return outputs


In [8]:
# ============================================================
# 7. TRAINING + VALIDATION + EARLY STOPPING
# ============================================================

input_dim = len(en_vocab)
output_dim = len(fr_vocab)

encoder = Encoder(input_dim, 256, 512).to(device)
decoder = Decoder(output_dim, 256, 512).to(device)
model = Seq2Seq(encoder, decoder).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=0.001)


def evaluate(model, loader):
    model.eval()
    total = 0
    with torch.no_grad():
        for src, trg in loader:
            src, trg = src.to(device), trg.to(device)
            outputs = model(src, trg, teacher_forcing=0.3)
            loss = criterion(outputs[1:].reshape(-1, outputs.size(-1)),
                             trg[1:].reshape(-1))
            total += loss.item()
    return total / len(loader)


EPOCHS = 20
best_val = float("inf")
patience = 3
wait = 0

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for src, trg in tqdm(train_loader):
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()
        outputs = model(src, trg)

        loss = criterion(outputs[1:].reshape(-1, outputs.size(-1)),
                         trg[1:].reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    train_loss = total_loss / len(train_loader)
    val_loss = evaluate(model, val_loader)

    print(f"Epoch {epoch+1} | Train={train_loss:.4f} | Val={val_loss:.4f}")

    if val_loss < best_val:
        best_val = val_loss
        wait = 0
        torch.save(model.state_dict(), "best_model.pth")
        print("Saved best model")
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping triggered")
            break


100%|██████████| 907/907 [02:24<00:00,  6.29it/s]


Epoch 1 | Train=4.8306 | Val=4.1716
Saved best model


100%|██████████| 907/907 [02:47<00:00,  5.40it/s]


Epoch 2 | Train=3.9689 | Val=3.6688
Saved best model


100%|██████████| 907/907 [02:52<00:00,  5.25it/s]


Epoch 3 | Train=3.5240 | Val=3.3385
Saved best model


100%|██████████| 907/907 [03:02<00:00,  4.96it/s]


Epoch 4 | Train=3.1753 | Val=3.1139
Saved best model


100%|██████████| 907/907 [02:50<00:00,  5.31it/s]


Epoch 5 | Train=2.8826 | Val=2.9163
Saved best model


100%|██████████| 907/907 [03:00<00:00,  5.03it/s]


Epoch 6 | Train=2.6448 | Val=2.9446


100%|██████████| 907/907 [02:10<00:00,  6.97it/s]


Epoch 7 | Train=2.4218 | Val=2.8927
Saved best model


100%|██████████| 907/907 [01:47<00:00,  8.47it/s]


Epoch 8 | Train=2.2174 | Val=2.8010
Saved best model


100%|██████████| 907/907 [03:03<00:00,  4.95it/s]


Epoch 9 | Train=2.0426 | Val=2.7770
Saved best model


100%|██████████| 907/907 [02:32<00:00,  5.96it/s]


Epoch 10 | Train=1.8803 | Val=2.8363


100%|██████████| 907/907 [02:40<00:00,  5.65it/s]


Epoch 11 | Train=1.7231 | Val=2.8413


100%|██████████| 907/907 [02:46<00:00,  5.43it/s]


Epoch 12 | Train=1.5862 | Val=2.8646
Early stopping triggered


In [9]:
def translate(sentence):
    model.eval()

    # tokenize
    tokens = en_tokenizer(sentence)

    # tạo vector số hóa đúng chuẩn
    ids = (
        [en_vocab.stoi["<sos>"]] +
        [en_vocab.stoi.get(t, en_vocab.stoi["<unk>"]) for t in tokens] +
        [en_vocab.stoi["<eos>"]]
    )

    src = torch.tensor(ids).unsqueeze(1).to(device)

    with torch.no_grad():
        hidden, cell = model.encoder(src)

    # token đầu tiên đầu ra
    input_tok = torch.tensor([fr_vocab.stoi["<sos>"]]).to(device)
    outputs = []

    for _ in range(50):
        with torch.no_grad():
            pred, hidden, cell = model.decoder(input_tok, hidden, cell)

        top_id = pred.argmax(1).item()

        if top_id == fr_vocab.stoi["<eos>"]:
            break

        outputs.append(top_id)
        input_tok = torch.tensor([top_id]).to(device)

    return " ".join(fr_vocab.itos[i] for i in outputs)


In [16]:
from nltk.translate.bleu_score import corpus_bleu

def evaluate_bleu():
    references = []   # dạng: [[ref_tokens], [ref_tokens], ...]
    hypotheses = []   # dạng: [pred_tokens, pred_tokens, ...]

    for en, fr in zip(test_en_tok, test_fr_tok):
        # input cho model là chuỗi tiếng Anh
        pred = translate(" ".join(en))

        # BLEU yêu cầu:
        #   - ref: list các câu tham chiếu → mỗi câu phải bọc trong 1 list
        #   - hyp: list các câu dự đoán tokenized
        references.append([fr])
        hypotheses.append(pred.split())

    score = corpus_bleu(
        references,
        hypotheses,
    )
    return score


model.load_state_dict(torch.load("best_model.pth"))
bleu = evaluate_bleu()
print("Corpus BLEU on Test =", bleu)


Corpus BLEU on Test = 0.14313889922319398


In [11]:
# Số câu muốn thử
num_examples = 5

for i in range(num_examples):
    # Lấy câu gốc tiếng Anh
    en_sentence = " ".join(test_en_tok[i])

    # Dịch sang tiếng Pháp
    fr_pred = translate(en_sentence)

    # Lấy câu tham chiếu tiếng Pháp
    fr_ref = " ".join(test_fr_tok[i])

    # In ra kết quả
    print(f"Example {i+1}")
    print("EN (Original) :", en_sentence)
    print("FR (Reference):", fr_ref)
    print("FR (Predicted):", fr_pred)
    print("-" * 60)


Example 1
EN (Original) : a young man participates in a career while the subject who records it smiles .
FR (Reference): un jeune homme participe à une course pendant que le sujet qui le filme sourit .
FR (Predicted): un jeune homme est dans un miroir tandis que il est lui lui lui .
------------------------------------------------------------
Example 2
EN (Original) : the man is scratching the back of his neck while looking for a book in a book store .
FR (Reference): l' homme se gratte l' arrière du cou tout en cherchant un livre dans une librairie .
FR (Predicted): l' homme examine le visage de la tête tandis qu' un un un un un un un un un .
------------------------------------------------------------
Example 3
EN (Original) : a person wearing goggles and a hat is sled riding .
FR (Reference): une personne portant des lunettes de protection et un chapeau fait de la luge .
FR (Predicted): une personne portant des lunettes et un casque fait fait du vélo .
------------------------------

In [12]:
test = translate("A young man participates in a career while the subject who records it smiles")
print(test)
test = translate("The man is scratching the back of his neck while looking for a book in a book store")
print(test)

un jeune homme est dans un miroir tandis que il a ses cheveux .
l' homme se du la de la tête tandis qu' un un un un un un un un un .


In [13]:

# from nltk.translate.bleu_score import corpus_bleu

# # Giữ nguyên test_en, test_de
# hyps = []
# refs = []

# for src_sent, trg_sent in zip(test_en_tok, test_fr_tok):
#     hyp = translate(src_sent).split()  # dự đoán tiếng Đức
#     refs.append([fr_tokenizer(trg_sent)])              # reference tiếng Đức
#     hyps.append(hyp)

# bleu = corpus_bleu(refs, hyps)
# print("BLEU (corpus) on test set:", bleu)