Kiểm tra DataLoader

In [2]:
import torch
from src.data import get_loader, load_vocab, load_dataset

# Build the training DataLoader straight from the gzipped parallel corpus files.
train_loader, train_dataset = get_loader(
    "data/train.en.gz",
    "data/train.fr.gz",
    batch_size=4,
    shuffle=False,
)

print(f"Dataset size: {len(train_dataset)}")
print(f"Vocab EN size: {len(train_dataset.src_vocab.stoi)}")
print(f"Vocab FR size: {len(train_dataset.trg_vocab.stoi)}")

# Inspect only the first batch; nothing is printed if the loader is empty.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    src, trg = first_batch
    print("SRC shape:", src.shape)  # [batch_size, seq_len]
    print("TRG shape:", trg.shape)
    print("SRC example:", src[0])
    print("TRG example:", trg[0])


100%|██████████| 29000/29000 [00:00<00:00, 41725.92it/s]
100%|██████████| 29000/29000 [00:01<00:00, 28154.45it/s]


Dataset size: 29000
Vocab EN size: 5893
Vocab FR size: 6470
SRC shape: torch.Size([4, 17])
TRG shape: torch.Size([4, 18])
SRC example: tensor([   1,   16,   24,   15,   25,  774,   17,   57,   80,  202, 1305,    5,
           2,    0,    0,    0,    0])
TRG example: tensor([   1,   21,   81,   32,  214,   28,   88,   70,    7, 1171,    5,    2,
           0,    0,    0,    0,    0,    0])


Test việc lưu và load lại vocab và dataset train

In [None]:
from src.data import get_loader, save_vocab, save_dataset

# Rebuild the loader and dataset from the raw corpus files.
train_loader, train_dataset = get_loader(
    "data/train.en.gz", "data/train.fr.gz", batch_size=32
)

# Persist both vocabularies first, then the dataset itself.
for vocab, vocab_path in (
    (train_dataset.src_vocab, "data/vocab_en.pkl"),
    (train_dataset.trg_vocab, "data/vocab_fr.pkl"),
):
    save_vocab(vocab, vocab_path)
save_dataset(train_dataset, "data/train_dataset.pt")

print("✅ Vocab và dataset đã lưu xong.")


100%|██████████| 29000/29000 [00:00<00:00, 42743.19it/s]
100%|██████████| 29000/29000 [00:01<00:00, 25849.03it/s]


✅ Vocab và dataset đã lưu xong.


In [1]:
from src.data import load_vocab, load_dataset

# Reload both vocabularies from their pickle files.
src_vocab = load_vocab("data/vocab_en.pkl")
trg_vocab = load_vocab("data/vocab_fr.pkl")

print("Loaded EN vocab size:", len(src_vocab.stoi))
print("Loaded FR vocab size:", len(trg_vocab.stoi))

# Reload the previously saved dataset and report how many samples it holds.
train_dataset = load_dataset("data/train_dataset.pt")
print("Loaded dataset size:", len(train_dataset))



Loaded EN vocab size: 5893
Loaded FR vocab size: 6470
Loaded dataset size: 29000


Xem File vocab_en.pkl / vocab_fr.pkl

In [5]:
import pickle

def _read_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Read the vocab pickles directly with the pickle module.
# NOTE: pickle.load can execute arbitrary code; only open files you created yourself.
src_vocab = _read_pickle("data/vocab_en.pkl")
trg_vocab = _read_pickle("data/vocab_fr.pkl")

# Total number of entries in each vocabulary.
print("EN vocab size:", len(src_vocab.stoi))
print("FR vocab size:", len(trg_vocab.stoi))

# First 10 items of each token -> index mapping.
print("EN sample:", list(src_vocab.stoi.items())[:10])
print("FR sample:", list(trg_vocab.stoi.items())[:10])


EN vocab size: 5893
FR vocab size: 6470
EN sample: [('<pad>', 0), ('<sos>', 1), ('<eos>', 2), ('<unk>', 3), ('a', 4), ('.', 5), ('in', 6), ('the', 7), ('on', 8), ('man', 9)]
FR sample: [('<pad>', 0), ('<sos>', 1), ('<eos>', 2), ('<unk>', 3), ('un', 4), ('.', 5), ('une', 6), ('de', 7), ('en', 8), ("d'", 9)]


In [6]:
NUM_SAMPLES = 5  # number of sentence pairs to display

# Look the padding indices up once rather than once per token.
src_pad_idx = src_vocab.stoi["<pad>"]
trg_pad_idx = trg_vocab.stoi["<pad>"]

for i in range(NUM_SAMPLES):
    src_idx, trg_idx = train_dataset[i]

    # Map indices back to tokens, skipping padding positions.
    src_words = [src_vocab.itos[tok] for tok in src_idx.tolist() if tok != src_pad_idx]
    trg_words = [trg_vocab.itos[tok] for tok in trg_idx.tolist() if tok != trg_pad_idx]

    print(f"Sample {i+1}:")
    print("EN:", " ".join(src_words))
    print("FR:", " ".join(trg_words))
    print("---")


Sample 1:
EN: <sos> two young , white males are outside near many bushes . <eos>
FR: <sos> deux jeunes hommes blancs sont dehors près de buissons . <eos>
---
Sample 2:
EN: <sos> several men in hard hats are operating a giant pulley system . <eos>
FR: <sos> plusieurs hommes en casque font fonctionner un système de poulies géant . <eos>
---
Sample 3:
EN: <sos> a little girl climbing into a wooden playhouse . <eos>
FR: <sos> une petite fille grimpe dans une maisonnette en bois . <eos>
---
Sample 4:
EN: <sos> a man in a blue shirt is standing on a ladder cleaning a window . <eos>
FR: <sos> un homme dans une chemise bleue se tient sur une échelle pour nettoyer une fenêtre . <eos>
---
Sample 5:
EN: <sos> two men are at the stove preparing food . <eos>
FR: <sos> deux hommes aux fourneaux préparent à manger . <eos>
---


2️⃣ Kiểm tra vocab và dataset

In [2]:
# Fetch the first (source, target) pair from the dataset and print it as-is.
src_example, trg_example = train_dataset[0]
print("Source indices:", src_example)
print("Target indices:", trg_example)


Source indices: tensor([   1,   16,   24,   15,   25,  774,   17,   57,   80,  202, 1305,    5,
           2])
Target indices: tensor([   1,   21,   81,   32,  214,   28,   88,   70,    7, 1171,    5,    2])


In [3]:
# Turn the index tensors of the first sample back into readable tokens.
src_words = [src_vocab.itos[tok] for tok in src_example.tolist()]
trg_words = [trg_vocab.itos[tok] for tok in trg_example.tolist()]
print("Source sentence:", " ".join(src_words))
print("Target sentence:", " ".join(trg_words))


Source sentence: <sos> two young , white males are outside near many bushes . <eos>
Target sentence: <sos> deux jeunes hommes blancs sont dehors près de buissons . <eos>


In [4]:
def _to_words(indices, vocab):
    """Look up every element of ``indices`` in ``vocab.itos`` and return the list."""
    return [vocab.itos[idx.item()] for idx in indices]


# Take the first sample of the dataset.
src_example, trg_example = train_dataset[0]

# Raw vocabulary indices of the sentence pair.
print("Source indices:", src_example)
print("Target indices:", trg_example)

# The same pair converted back to tokens so it can be read.
src_words = _to_words(src_example, src_vocab)
trg_words = _to_words(trg_example, trg_vocab)

print("Source sentence:", " ".join(src_words))
print("Target sentence:", " ".join(trg_words))


Source indices: tensor([   1,   16,   24,   15,   25,  774,   17,   57,   80,  202, 1305,    5,
           2])
Target indices: tensor([   1,   21,   81,   32,  214,   28,   88,   70,    7, 1171,    5,    2])
Source sentence: <sos> two young , white males are outside near many bushes . <eos>
Target sentence: <sos> deux jeunes hommes blancs sont dehors près de buissons . <eos>


3️⃣ Kiểm tra DataLoader

In [10]:
from torch.utils.data import DataLoader
from src.data import load_dataset, MyCollate, load_vocab

# Reload the saved dataset and both vocabularies.
train_dataset = load_dataset("data/train_dataset.pt")
src_vocab = load_vocab("data/vocab_en.pkl")
trg_vocab = load_vocab("data/vocab_fr.pkl")

# NOTE(review): only the source-side <pad> index is handed to MyCollate —
# presumably it is used for the target side too; confirm both vocabs agree.
PAD_IDX = src_vocab.stoi["<pad>"]

# Assemble the DataLoader from explicitly named pieces.
collate_fn = MyCollate(PAD_IDX)
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)

# Print the shapes of the first batch only.
for src_batch, trg_batch in train_loader:
    print(src_batch.shape, trg_batch.shape)
    break


torch.Size([32, 23]) torch.Size([32, 25])


src_batch.shape = [32, 23] → batch size = 32, sequence length nguồn = 23 token.

trg_batch.shape = [32, 25] → batch size = 32, sequence length đích = 25 token.

4️⃣ Training

In [3]:
# Execute the training script inside this kernel.
# NOTE(review): later cells use `model` and `DEVICE`, which no cell in this
# notebook defines — presumably they come from this script; confirm.
%run src/train_data.py


Epoch [1/10] Loss: 4.3839
Epoch [2/10] Loss: 3.4757
Epoch [3/10] Loss: 3.0128


KeyboardInterrupt: 

In [None]:
# Sanity-check the model on a single batch with gradient tracking disabled.
model.eval()
with torch.no_grad():
    first_batch = next(iter(train_loader), None)
    if first_batch is not None:
        src, trg = first_batch
        src = src.to(DEVICE)
        trg = trg.to(DEVICE)
        # NOTE(review): teacher_forcing_ratio=0 presumably makes the decoder
        # consume its own predictions — confirm against the model code.
        output = model(src, trg, teacher_forcing_ratio=0)
        top_words = output.argmax(-1)  # highest-scoring index along the last dim
        print("Src:", src[0])
        print("Pred:", top_words[0])
        print("Trg:", trg[0])


5️⃣ Kiểm tra model sau training

In [None]:
# Load the epoch-10 weights. map_location=DEVICE remaps the stored tensors so a
# checkpoint saved on a GPU also loads on a CPU-only machine.
# NOTE(review): the training log in this notebook stops after epoch 3 — confirm
# that this checkpoint file actually exists before running.
model.load_state_dict(
    torch.load("checkpoints/seq2seq_epoch10.pth", map_location=DEVICE)
)
model.eval()

with torch.no_grad():
    # unsqueeze(0) adds a leading dimension of size 1 before the forward pass.
    src_tensor = src_example.unsqueeze(0).to(DEVICE)
    trg_tensor = trg_example.unsqueeze(0).to(DEVICE)
    output = model(src_tensor, trg_tensor, teacher_forcing_ratio=0)
    # Highest-scoring index along the last dim, for the single item in the batch.
    pred_indices = output.argmax(-1)[0].cpu().tolist()
    pred_sentence = [trg_vocab.itos[idx] for idx in pred_indices]
    print("Predicted sentence:", " ".join(pred_sentence))


6️⃣ Gợi ý cải tiến

1. Teacher forcing: tỉ lệ hiện tại cố định ở 0.5. Bạn có thể giảm dần tỉ lệ này theo epoch (xem code bên dưới).

2. Gradient clipping: giúp tránh exploding gradients khi huấn luyện LSTM (xem code bên dưới).

3. Validation set: nếu có dataset validation, theo dõi val_loss sẽ tốt hơn.

In [None]:
# NOTE(review): `epoch` and `model` must already exist when this runs —
# presumably these lines are meant to be placed inside the training loop.
# Decay the ratio by a factor of 0.9 per epoch from 0.5, never below 0.1.
decayed_ratio = 0.5 * (0.9 ** epoch)
teacher_forcing_ratio = max(decayed_ratio, 0.1)
# Rescale gradients so their total norm does not exceed 1.
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)


Load checkpoint

In [None]:
checkpoint_path = "checkpoints/seq2seq_epoch10.pth"
# Read the saved weights (remapped to DEVICE) first, then copy them into the model.
state_dict = torch.load(checkpoint_path, map_location=DEVICE)
model.load_state_dict(state_dict)
model.eval()  # switch the model to evaluation mode
