# Download data yang dibutuhkan

In [1]:
!curl -L -o ./question-pairs-dataset.zip\
  https://www.kaggle.com/api/v1/datasets/download/quora/question-pairs-dataset
!unzip question-pairs-dataset.zip -d ./
!curl -L -o paws_wiki_labeled_final.tar.gz https://storage.googleapis.com/paws/english/paws_wiki_labeled_final.tar.gz
!tar -xvzf paws_wiki_labeled_final.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 20.7M  100 20.7M    0     0  5583k      0  0:00:03  0:00:03 --:--:-- 9617k
Archive:  question-pairs-dataset.zip
  inflating: ./questions.csv         
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4577k  100 4577k    0     0  1683k      0  0:00:02  0:00:02 --:--:-- 1684k
final/test.tsv
final/
final/train.tsv
final/dev.tsv


# Data Preprocessing

In [2]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# Load the PAWS-Wiki dataset
paws_train = pd.read_csv('final/train.tsv', sep='\t')
paws_test = pd.read_csv('final/test.tsv', sep='\t')
paws_dev = pd.read_csv('final/dev.tsv', sep='\t')

# Load the another dataset
quora_data = pd.read_csv('questions.csv')

# Prep Functions
def lower(data) :
    data['sentence1'] = data['sentence1'].apply(lambda x: x.lower())
    data['sentence2'] = data['sentence2'].apply(lambda x: x.lower())

# PAWS Preprocessing
lower(paws_train)
lower(paws_test)
lower(paws_dev)

paws_train = paws_train.drop(columns=['id'])
paws_test = paws_test.drop(columns=['id'])
paws_dev = paws_dev.drop(columns=['id'])

paws_train = paws_train.dropna()
paws_test = paws_test.dropna()
paws_dev = paws_dev.dropna()

#Quora Preprocessing
quora_data = quora_data.drop(columns=['qid1', 'qid2', 'id'])
quora_data = quora_data.rename(columns={'question1': 'sentence1', 'question2': 'sentence2', 'is_duplicate' : 'label'})
quora_data = quora_data.dropna()
lower(quora_data)

quora_train, quora_testdev = train_test_split(quora_data, test_size=0.24, random_state=42)
quora_test, quora_dev = train_test_split(quora_testdev, test_size=0.5, random_state=42)

# Combine the datasets
train_df = shuffle(pd.concat([paws_train, quora_train], ignore_index=True), random_state=40)
test_df = shuffle(pd.concat([paws_test, quora_test], ignore_index=True), random_state=40)
dev_df = shuffle(pd.concat([paws_dev, quora_dev], ignore_index=True), random_state=40)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentence1'] = data['sentence1'].apply(lambda x: x.lower())


# Fungsi-fungsi untuk mengubah data ke format yang cocok untuk training model

1. Tokenizing
*   Mengubah teks menjadi huruf kecil.
*   Menghapus karakter non-alfanumerik dan tanda baca tertentu.
*   Memecah teks menjadi token-token (kata-kata).
2. Persiapan Data
*   Membaca pasangan kalimat dari DataFrame.
*   Menghasilkan pasangan token (input_tokens, target_tokens) untuk setiap baris.
*   Menambahkan token khusus <sos> di awal target dan <eos> di akhir sebagai penanda awal dan akhir kalimat.
3. Membuat Vocabulary
*   Menghitung frekuensi token dari pasangan input atau target.
*   Menambahkan token ke dalam vocab jika frekuensinya ≥ min_freq.
*   Token spesial seperti <pad>, <unk>, <sos>, <eos> dimasukkan terlebih dahulu dengan indeks tetap.
4. Encoding Token
*   Mengubah token menjadi indeks berdasarkan vocab.
*   Token yang tidak ditemukan akan dikodekan sebagai <unk>.
5. Dataset PyTorch
*   Kelas dataset kustom yang menyimpan pasangan (input, target) dalam bentuk indeks tensor.
*   Digunakan oleh DataLoader untuk training dan evaluasi.
6. Padding Batch
*   Fungsi collate_fn khusus untuk memastikan setiap batch memiliki panjang urutan yang sama.
*   Digunakan oleh DataLoader agar model bisa memproses batch dengan ukuran berbeda.
7. Membuat DataLoader
*   Mempersiapkan DataLoader untuk data latih (train_df), validasi (dev_df), dan uji (test_df).
*   Mengembalikan tiga DataLoader dan dua kamus vocabulary (src_vocab, trg_vocab).


In [4]:
import pandas as pd
import re
from collections import Counter
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

def tokenize(text):
    """
    Tokenize input text by converting to lowercase
    and removing special characters
    """
    text = str(text).lower()  # Ensure text is a string
    text = re.sub(r"[^a-zA-Z0-9?.!,¿]+", " ", text)
    return text.strip().split()

def prepare_data(df, input_col, target_col):
    """
    Prepare data pairs from DataFrame

    Args:
        df (pd.DataFrame): Input DataFrame
        input_col (str): Column name for input sequences
        target_col (str): Column name for target sequences

    Returns:
        list: List of (input_tokens, target_tokens) pairs
    """
    return [
        (tokenize(row[input_col]), ['<sos>'] + tokenize(row[target_col]) + ['<eos>'])
        for _, row in df.iterrows()
    ]

def build_vocab(pairs, index, min_freq=2):
    """
    Build vocabulary from token pairs

    Args:
        pairs (list): List of (input, target) token pairs
        index (int): 0 for input, 1 for target
        min_freq (int): Minimum frequency to include a token

    Returns:
        dict: Vocabulary mapping tokens to indices
    """
    counter = Counter()
    for pair in pairs:
        counter.update(pair[index])

    # Start with special tokens
    vocab = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}

    for word, freq in counter.items():
        if freq >= min_freq and word not in vocab:
            vocab[word] = len(vocab)

    return vocab

def encode(tokens, vocab):
    """
    Encode tokens to their vocabulary indices

    Args:
        tokens (list): List of tokens
        vocab (dict): Vocabulary mapping

    Returns:
        list: List of token indices
    """
    return [vocab.get(token, vocab['<unk>']) for token in tokens]

class Seq2SeqDataset(Dataset):
    """
    PyTorch Dataset for Sequence-to-Sequence tasks
    """
    def __init__(self, pairs, src_vocab, trg_vocab):
        """
        Args:
            pairs (list): List of (input, target) token pairs
            src_vocab (dict): Source vocabulary
            trg_vocab (dict): Target vocabulary
        """
        self.pairs = pairs
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        src, trg = self.pairs[idx]
        return (
            torch.tensor(encode(src, self.src_vocab)),
            torch.tensor(encode(trg, self.trg_vocab))
        )

def collate_fn(batch):
    """
    Collate function for padding sequences in a batch

    Args:
        batch (list): List of (src_sequence, trg_sequence) pairs

    Returns:
        tuple: Padded source and target batches
    """
    src_batch, trg_batch = zip(*batch)

    # Pad sequences, using 0 (pad token index) for padding
    src_batch = pad_sequence(src_batch, padding_value=0, batch_first=True)
    trg_batch = pad_sequence(trg_batch, padding_value=0, batch_first=True)

    return src_batch, trg_batch

def prepare_dataloaders(train_df, dev_df, test_df,
                        input_col='sentence1',
                        target_col='sentence2',
                        batch_size=32):
    """
    Prepare DataLoaders for train, dev, and test datasets

    Args:
        train_df (pd.DataFrame): Training data
        dev_df (pd.DataFrame): Development data
        test_df (pd.DataFrame): Test data
        input_col (str): Column name for input sequences
        target_col (str): Column name for target sequences
        batch_size (int): Batch size for DataLoaders

    Returns:
        tuple: (train_loader, dev_loader, test_loader), source vocab, target vocab
    """
    # Prepare token pairs
    train_pairs = prepare_data(train_df, input_col, target_col)
    dev_pairs = prepare_data(dev_df, input_col, target_col)
    test_pairs = prepare_data(test_df, input_col, target_col)

    # Build vocabularies
    src_vocab = build_vocab(train_pairs, 0)
    trg_vocab = build_vocab(train_pairs, 1)

    # Create datasets
    train_dataset = Seq2SeqDataset(train_pairs, src_vocab, trg_vocab)
    dev_dataset = Seq2SeqDataset(dev_pairs, src_vocab, trg_vocab)
    test_dataset = Seq2SeqDataset(test_pairs, src_vocab, trg_vocab)

    # Create dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn
    )
    dev_loader = DataLoader(
        dev_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn
    )

    return train_loader, dev_loader, test_loader, src_vocab, trg_vocab

train_loader, dev_loader, test_loader, src_vocab, trg_vocab = prepare_dataloaders(
    train_df, dev_df, test_df,
    input_col='sentence1',
    target_col='sentence2'
)

# **Seq2Seq Model Architecture**
Model ini terdiri dari 3 komponen utama: Encoder, Decoder, dan Seq2Seq wrapper.
1. Encoder

Fungsi :
*   Mengubah urutan token input menjadi representasi tersembunyi (hidden states).
*   Menggunakan Embedding Layer diikuti oleh LSTM.

Alur :    
*   src (input) → Embedding
*   Embedding → LSTM
*   Mengembalikan: hidden dan cell (untuk digunakan oleh decoder)

2. Decoder

Fungsi :    
*   Menghasilkan token satu per satu berdasarkan output sebelumnya dan konteks dari encoder.

Alur :    
*   input token saat ini → Embedding
*   Embedding + hidden, cell → LSTM
*   LSTM output → Linear layer → Prediksi token
*   Mengembalikan prediksi, dan hidden, cell baru

3. Seq2Seq Wrapper

Fungsi :    
*   Mengintegrasikan Encoder dan Decoder.
*   Mengatur proses decoding selama pelatihan dan inference.

Alur :    
*   Encoder memproses seluruh input src.
*   Decoder menghasilkan output secara iteratif.
*   Gunakan Teacher Forcing:
  *   Dengan probabilitas teacher_forcing_ratio, gunakan token target yang benar sebagai input berikutnya.
  *   Jika tidak, gunakan prediksi model sebelumnya.

Output :    
Tensor berisi seluruh prediksi token untuk urutan target.






















In [5]:
import torch.nn as nnFun

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, batch_first=True)

    def forward(self, src):
        embedded = self.embedding(src)
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(1)  # [batch, 1]
        embedded = self.embedding(input)
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size, trg_len = trg.shape
        trg_vocab_size = self.decoder.fc_out.out_features

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)

        hidden, cell = self.encoder(src)
        input = trg[:, 0]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t] = output
            top1 = output.argmax(1)
            input = trg[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1

        return outputs


In [6]:
import torch.optim as optim

# Hyperparameters
INPUT_DIM = len(src_vocab)
OUTPUT_DIM = len(trg_vocab)
EMB_DIM = 256
HID_DIM = 512
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model setup
encoder = Encoder(INPUT_DIM, EMB_DIM, HID_DIM)
decoder = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM)
model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE)

# Loss and Optimizer
PAD_IDX = trg_vocab['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [None]:
from tqdm import tqdm
import os

N_EPOCHS = 2
CHECKPOINT_EVERY = 200  # langkah (steps)
CHECKPOINT_DIR = "checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

step = 0

for epoch in range(N_EPOCHS):
    model.train()
    epoch_loss = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{N_EPOCHS}", leave=False)

    for src, trg in loop:
        src, trg = src.to(DEVICE), trg.to(DEVICE)

        optimizer.zero_grad()
        output = model(src, trg)

        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        loop.set_postfix(loss=loss.item())

        step += 1
        if step % CHECKPOINT_EVERY == 0:
            ckpt_path = os.path.join(CHECKPOINT_DIR, f"checkpoint_epoch{epoch+1}_step{step}.pt")
            torch.save({
                'epoch': epoch,
                'step': step,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss.item(),
            }, ckpt_path)
            print(f"Checkpoint disimpan: {ckpt_path}")

    print(f"Epoch {epoch+1} Average Loss: {epoch_loss / len(train_loader):.4f}")


Epoch 1/2:   2%|▏         | 200/11148 [01:59<10:39:54,  3.51s/it, loss=6.44]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step200.pt


Epoch 1/2:   4%|▎         | 400/11148 [03:54<11:43:38,  3.93s/it, loss=6.12]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step400.pt


Epoch 1/2:   5%|▌         | 600/11148 [05:51<11:00:24,  3.76s/it, loss=6.25]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step600.pt


Epoch 1/2:   7%|▋         | 800/11148 [07:43<7:49:19,  2.72s/it, loss=5.92]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step800.pt


Epoch 1/2:   9%|▉         | 1000/11148 [09:46<15:32:00,  5.51s/it, loss=5.52]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step1000.pt


Epoch 1/2:  11%|█         | 1200/11148 [11:38<10:30:11,  3.80s/it, loss=6.21]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step1200.pt


Epoch 1/2:  13%|█▎        | 1400/11148 [13:34<9:28:40,  3.50s/it, loss=5.23]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step1400.pt


Epoch 1/2:  14%|█▍        | 1600/11148 [15:25<7:33:16,  2.85s/it, loss=5.67]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step1600.pt


Epoch 1/2:  16%|█▌        | 1800/11148 [17:17<9:30:14,  3.66s/it, loss=5.9]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step1800.pt


Epoch 1/2:  18%|█▊        | 2000/11148 [19:21<9:22:48,  3.69s/it, loss=5.49]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step2000.pt


Epoch 1/2:  20%|█▉        | 2200/11148 [21:17<10:14:24,  4.12s/it, loss=5.61]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step2200.pt


Epoch 1/2:  22%|██▏       | 2400/11148 [23:13<6:50:30,  2.82s/it, loss=5.73]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step2400.pt


Epoch 1/2:  23%|██▎       | 2600/11148 [25:08<6:10:45,  2.60s/it, loss=5.83]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step2600.pt


Epoch 1/2:  25%|██▌       | 2800/11148 [27:05<10:05:18,  4.35s/it, loss=5.59]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step2800.pt


Epoch 1/2:  27%|██▋       | 3000/11148 [28:56<6:29:08,  2.87s/it, loss=5.5]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step3000.pt


Epoch 1/2:  29%|██▊       | 3200/11148 [30:52<8:22:39,  3.79s/it, loss=5.21]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step3200.pt


Epoch 1/2:  30%|███       | 3400/11148 [32:49<8:05:31,  3.76s/it, loss=5.54]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step3400.pt


Epoch 1/2:  32%|███▏      | 3600/11148 [34:41<5:27:53,  2.61s/it, loss=5.6]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step3600.pt


Epoch 1/2:  34%|███▍      | 3800/11148 [36:42<11:11:45,  5.49s/it, loss=4.92]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step3800.pt


Epoch 1/2:  36%|███▌      | 4000/11148 [38:38<6:18:55,  3.18s/it, loss=5.41]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step4000.pt


Epoch 1/2:  38%|███▊      | 4200/11148 [40:29<5:57:33,  3.09s/it, loss=5.01]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step4200.pt


Epoch 1/2:  39%|███▉      | 4400/11148 [42:25<7:09:24,  3.82s/it, loss=4.9]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step4400.pt


Epoch 1/2:  41%|████▏     | 4600/11148 [44:15<6:45:40,  3.72s/it, loss=4.94]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step4600.pt


Epoch 1/2:  43%|████▎     | 4800/11148 [46:11<6:32:02,  3.71s/it, loss=5.13]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step4800.pt


Epoch 1/2:  45%|████▍     | 5000/11148 [48:08<6:23:50,  3.75s/it, loss=5.5]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step5000.pt


Epoch 1/2:  47%|████▋     | 5200/11148 [50:05<6:33:00,  3.96s/it, loss=4.72]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step5200.pt


Epoch 1/2:  48%|████▊     | 5400/11148 [52:03<7:07:30,  4.46s/it, loss=4.93]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step5400.pt


Epoch 1/2:  50%|█████     | 5600/11148 [54:08<6:24:30,  4.16s/it, loss=5.03]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step5600.pt


Epoch 1/2:  52%|█████▏    | 5800/11148 [56:04<4:27:39,  3.00s/it, loss=4.77]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step5800.pt


Epoch 1/2:  54%|█████▍    | 6000/11148 [58:04<5:58:20,  4.18s/it, loss=4.78]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step6000.pt


Epoch 1/2:  56%|█████▌    | 6200/11148 [1:00:09<6:36:11,  4.80s/it, loss=4.55]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step6200.pt


Epoch 1/2:  57%|█████▋    | 6400/11148 [1:02:07<5:39:41,  4.29s/it, loss=4.6]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step6400.pt


Epoch 1/2:  59%|█████▉    | 6600/11148 [1:04:04<4:37:19,  3.66s/it, loss=4.35]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step6600.pt


Epoch 1/2:  61%|██████    | 6800/11148 [1:06:02<4:34:52,  3.79s/it, loss=4.15]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step6800.pt


Epoch 1/2:  63%|██████▎   | 7000/11148 [1:08:07<4:11:55,  3.64s/it, loss=5.19]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step7000.pt


Epoch 1/2:  65%|██████▍   | 7200/11148 [1:10:07<5:23:25,  4.92s/it, loss=5.11]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step7200.pt


Epoch 1/2:  66%|██████▋   | 7400/11148 [1:12:01<2:31:46,  2.43s/it, loss=4.89]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step7400.pt


Epoch 1/2:  68%|██████▊   | 7600/11148 [1:14:05<3:58:08,  4.03s/it, loss=5.52]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step7600.pt


Epoch 1/2:  70%|██████▉   | 7800/11148 [1:16:03<3:46:00,  4.05s/it, loss=4.51]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step7800.pt


Epoch 1/2:  72%|███████▏  | 8000/11148 [1:17:53<1:13:47,  1.41s/it, loss=4.72]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step8000.pt


Epoch 1/2:  74%|███████▎  | 8200/11148 [1:19:44<1:15:35,  1.54s/it, loss=4.15]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step8200.pt


Epoch 1/2:  75%|███████▌  | 8400/11148 [1:21:52<1:59:45,  2.61s/it, loss=4.51]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step8400.pt


Epoch 1/2:  77%|███████▋  | 8600/11148 [1:23:44<1:40:29,  2.37s/it, loss=5]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step8600.pt


Epoch 1/2:  79%|███████▉  | 8800/11148 [1:25:36<2:09:09,  3.30s/it, loss=4.5]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step8800.pt


Epoch 1/2:  81%|████████  | 9000/11148 [1:27:36<2:26:28,  4.09s/it, loss=4.3]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step9000.pt


Epoch 1/2:  83%|████████▎ | 9200/11148 [1:29:36<2:28:21,  4.57s/it, loss=4.59]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step9200.pt


Epoch 1/2:  84%|████████▍ | 9400/11148 [1:31:24<1:26:47,  2.98s/it, loss=4.41]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step9400.pt


Epoch 1/2:  86%|████████▌ | 9600/11148 [1:33:22<2:10:40,  5.06s/it, loss=4.77]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step9600.pt


Epoch 1/2:  88%|████████▊ | 9800/11148 [1:35:35<1:25:59,  3.83s/it, loss=4.99]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step9800.pt


Epoch 1/2:  90%|████████▉ | 10000/11148 [1:37:23<31:34,  1.65s/it, loss=4.34]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step10000.pt


Epoch 1/2:  91%|█████████▏| 10200/11148 [1:39:22<26:03,  1.65s/it, loss=4.54]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step10200.pt


Epoch 1/2:  93%|█████████▎| 10400/11148 [1:41:17<19:06,  1.53s/it, loss=4.58]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step10400.pt


Epoch 1/2:  95%|█████████▌| 10600/11148 [1:43:10<23:05,  2.53s/it, loss=4.51]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step10600.pt


Epoch 1/2:  97%|█████████▋| 10800/11148 [1:45:04<22:25,  3.87s/it, loss=4.1]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step10800.pt


Epoch 1/2:  99%|█████████▊| 11000/11148 [1:47:01<05:34,  2.26s/it, loss=5.04]

Checkpoint disimpan: checkpoints/checkpoint_epoch1_step11000.pt




Epoch 1 Average Loss: 5.0701


Epoch 2/2:   0%|          | 52/11148 [00:43<16:52:48,  5.48s/it, loss=4.66]

Checkpoint disimpan: checkpoints/checkpoint_epoch2_step11200.pt


Epoch 2/2:   2%|▏         | 252/11148 [02:47<5:49:52,  1.93s/it, loss=4.43]

Checkpoint disimpan: checkpoints/checkpoint_epoch2_step11400.pt


Epoch 2/2:   4%|▍         | 436/11148 [04:18<1:28:25,  2.02it/s, loss=4.31]

In [28]:
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for src, trg in dataloader:
            src, trg = src.to(device), trg.to(device)
            output = model(src, trg, teacher_forcing_ratio=0)  # No teacher forcing

            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)
            total_loss += loss.item()

    return total_loss / len(dataloader)

# Load a checkpoint (adjust file name as needed)
checkpoint_path = os.path.join(CHECKPOINT_DIR, "checkpoint_epoch1_step200.pt")

checkpoint = torch.load(checkpoint_path, map_location=DEVICE)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

test_loss = evaluate(model, test_loader, criterion, DEVICE)
print(f"Test Loss: {test_loss:.4f}")


KeyboardInterrupt: 

In [10]:
def load_checkpoint(model, optimizer, checkpoint_path, device):
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    step = checkpoint['step']
    loss = checkpoint['loss']
    print(f"✅ Loaded checkpoint from epoch {epoch+1}, step {step}, loss: {loss:.4f}")
    return model, optimizer


In [15]:
def evaluate(model, data_loader, criterion, device, max_batches=5):
    model.eval()
    total_loss = 0
    batches_evaluated = 0

    with torch.no_grad():
        for src, trg in data_loader:
            src, trg = src.to(device), trg.to(device)
            output = model(src, trg, teacher_forcing_ratio=0)  # no teacher forcing

            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)
            total_loss += loss.item()
            batches_evaluated += 1

            if batches_evaluated >= max_batches:
                break

    return total_loss / batches_evaluated


In [16]:
checkpoint_path = "checkpoints/checkpoint_epoch1_step200.pt"

model, optimizer = load_checkpoint(model, optimizer, checkpoint_path, DEVICE)
dev_loss = evaluate(model, dev_loader, criterion, DEVICE)
print(f"Validation Loss: {dev_loss:.4f}")


✅ Loaded checkpoint from epoch 1, step 200, loss: 6.3531
Validation Loss: 6.4478


In [21]:
def paraphrase_sentence(model, sentence, src_vocab, trg_vocab, device, max_len=50):
    model.eval()

    # Reverse trg_vocab to map indices back to tokens
    idx2trg = {i: t for t, i in trg_vocab.items()}

    # Tokenize and encode the input sentence
    tokens = tokenize(sentence)
    input_tensor = torch.tensor([encode(tokens, src_vocab)], dtype=torch.long).to(device)

    with torch.no_grad():
        hidden, cell = model.encoder(input_tensor)

        input_token = torch.tensor([trg_vocab['<sos>']], dtype=torch.long).to(device)
        output_tokens = []

        for _ in range(max_len):
            output, hidden, cell = model.decoder(input_token, hidden, cell)
            top1 = output.argmax(1).item()
            if top1 == trg_vocab['<eos>']:
                break
            output_tokens.append(idx2trg.get(top1, '<unk>'))
            input_token = torch.tensor([top1], dtype=torch.long).to(device)

    return ' '.join(output_tokens)


In [27]:
# Load the model from a checkpoint
checkpoint_path = os.path.join(CHECKPOINT_DIR, "checkpoint_epoch1_step200.pt")
checkpoint = torch.load(checkpoint_path, map_location=DEVICE)
model.load_state_dict(checkpoint['model_state_dict'])

# Use a sentence
custom_sentence = "How do i get?"
paraphrased = paraphrase_sentence(model, custom_sentence, src_vocab, trg_vocab, DEVICE)
print(f"Original: {custom_sentence}")
print(f"Paraphrased: {paraphrased}")


Original: How do i get?
Paraphrased: what is the
