In [2]:
import sys
import os
# Put the parent of the current working directory on the import path,
# presumably so the `data` and `utils` imports further down can be resolved.
sys.path.append(
    os.path.abspath(os.path.join(os.getcwd(), os.pardir))
)

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import numpy as np
import torchtext
import tqdm
import yaml 
from data.preprocessing import preprocess_dataset  
from data.data_loader import get_data_loader
from utils.training import init_model, train_fn, evaluate_fn, train_model
from utils.inference import translate_sentence


# Import Dataset

In [3]:
# Load the experiment configuration from the YAML file into `config`.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default locale encoding (which is not UTF-8 on every system).
with open(os.path.abspath("../../configs/gru_seq2seq.yaml"), "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

In [4]:
from datasets import load_from_disk

# Read the saved dataset from the location named under data.dataset_path in the
# config. Ending the cell with the bare name displays the dataset summary.
dataset = load_from_disk(
    config["data"]["dataset_path"]
)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text_1', 'text_2', 'text_1_lang', 'text_2_lang'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['id', 'text_1', 'text_2', 'text_1_lang', 'text_2_lang'],
        num_rows: 100
    })
    test: Dataset({
        features: ['id', 'text_1', 'text_2', 'text_1_lang', 'text_2_lang'],
        num_rows: 100
    })
})

In [5]:
# Pull the three splits out of the loaded dataset by their split names.
train_data, valid_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [6]:
# Peek at a single raw training record (index 10) before preprocessing is applied.
train_data[10]

{'id': '10',
 'text_1': 'Wektu kuwi mara mrene pesen sega goreng karo kentang goreng, sega gorenge kabehane seneng, Kentang gorenge enak tenan, lan dhelokane apik. Keluarga ngomong yen kopine enak. Babagan paling apik nang kene yaiku panggonane jembar lan sek nang njaba iso ndhelok pemandangan dusun pring, nanging pelayanane ora cepet dadi aku kudu takon kaping pirang-pirang babagan pesenanku. Regane cukup larang, nanging amarga panganane enak kabeh dadi kabayar. Ora kudhu mikir ping pindho yen pengen mara panggonan iki maneh.',
 'text_2': 'Waktu itu ke sini pesan nasi goreng dan kentang goreng, nasi gorengnya semua suka. Kentang gorengnya enak banget, dan presentasinya bagus. Keluarga bilang kopinya enak. Hal yang sangat baik di sini adalah tempatnya luas dan yang di luar bisa lihat pemandangan dusun bambu, tapi pelayanannya tidak cepat sehingga saya harus bertanya beberapa kali tentang pesanan saya. Harganya cukup mahal, namun karena makanannya enak semua jadi terbayarkan. Tidak perl

# Preprocessing

In [7]:
# Display the `text_2` field of the first raw training record.
train_data[0]['text_2']

'Nikmati cicilan 0% hingga 12 bulan untuk pemesanan tiket pesawat air asia dengan kartu kredit bni!'

In [8]:
# Run the project's preprocessing over the whole dataset. The call yields the
# three processed splits plus two vocabularies, unpacked in this order.
# NOTE(review): the vocabulary names read as English/Indonesian, while the sample
# rows displayed earlier look like Javanese (`text_1`) and Indonesian (`text_2`);
# confirm which column each vocabulary was built from.
(
    train_data,
    valid_data,
    test_data,
    en_vocab,
    id_vocab,
) = preprocess_dataset(dataset)

✅ Tokenisasi sederhana selesai!
✅ Data siap digunakan dalam format PyTorch!


# Data Loader

In [9]:
# Look up the padding index in the vocabulary under the configured pad token.
# NOTE(review): only `en_vocab` is consulted; presumably both vocabularies give
# the pad token the same index — confirm.
pad_index = en_vocab[config["data"]["pad_token"]]

# Batch size comes from the training section of the config.
batch_size = config["training"]["batch_size"]

# Build one loader per split, in train/valid/test order; only the training
# loader is asked to shuffle.
train_loader, valid_loader, test_loader = (
    get_data_loader(split, batch_size=batch_size, pad_index=pad_index, shuffle=do_shuffle)
    for split, do_shuffle in (
        (train_data, True),
        (valid_data, False),
        (test_data, False),
    )
)

# TRAIN

In [10]:
# Record the vocabulary sizes in the in-memory config: the input size comes
# from `id_vocab`, the output size from `en_vocab`.
config["model"].update(
    input_dim=len(id_vocab),
    output_dim=len(en_vocab),
)

# Device selection: taken straight from the config. An automatic
# "mps if available, else cpu" choice was tried here earlier and is disabled.
device = config["training"]["device"]
print(f"device: {device}")

# Build the model together with its optimizer and loss criterion. The first five
# positional arguments are read from the model section of the config, in order.
model, optimizer, criterion = init_model(
    *(
        config["model"][key]
        for key in ("input_dim", "output_dim", "embedding_dim", "hidden_dim", "dropout")
    ),
    pad_index,
    device,
)

# Training parameters read from the config into notebook-level names.
# The training cell below passes `config` itself to train_model.
(
    epochs,
    clip,
    teacher_forcing_initial,
    teacher_forcing_final,
    checkpoint_path,
    patience,
    patience_counter,
) = (
    config["training"][key]
    for key in (
        "epochs",
        "clip",
        "teacher_forcing_initial",
        "teacher_forcing_final",
        "checkpoint_path",
        "patience",
        "patience_counter",
    )
)

# Leave the model as the cell's last expression so its architecture is displayed.
model

device: cpu


Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(3827, 256)
    (rnn): GRU(256, 64)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(4181, 256)
    (rnn): GRU(320, 64)
    (fc_out): Linear(in_features=384, out_features=4181, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
)

In [None]:
# Start training from scratch (resume_training=False) and keep the value
# returned by train_model in `history`.
history = train_model(
    model,
    train_loader,
    valid_loader,
    optimizer,
    criterion,
    config,
    en_vocab,
    id_vocab,
    resume_training=False,
)

Training Progress:   0%|          | 0/10 [00:00<?, ?it/s]

# EVALUATE

In [None]:
# `json` is not imported by any earlier cell, so it is imported here; without it
# json.load below raises NameError on a fresh kernel.
import json

import matplotlib.pyplot as plt

# Read the loss history back from the JSON file (presumably written during
# training — confirm). This rebinds `history` to the file's contents.
# The encoding is pinned to UTF-8 so reading does not depend on the platform default.
with open("training_history.json", "r", encoding="utf-8") as f:
    history = json.load(f)

# Plot training and validation loss on shared axes, one point per stored entry.
plt.plot(history["train_loss"], label="Train Loss")
plt.plot(history["valid_loss"], label="Valid Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [12]:
# Evaluate the model on the test split using the weights stored in the checkpoint.
# map_location remaps the stored tensors onto `device`, so a checkpoint written
# on a different device (e.g. GPU or MPS) still loads on the configured one.
# NOTE(review): the checkpoint path is hard-coded here; confirm it matches
# config["training"]["checkpoint_path"].
model.load_state_dict(
    torch.load("../checkpoints/gru_model.pt", map_location=device)
)
test_loss = evaluate_fn(model, test_loader, criterion, device)
# Perplexity is reported as exp(loss).
print(f"Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):.3f}")

Test Loss: 6.502 | Test PPL: 666.733


In [14]:

# Example translation (make sure the model has already been loaded by an earlier cell).
sentence = "koe mangan opo"
translated = translate_sentence(
    sentence,
    model,
    en_vocab,
    id_vocab,
    "<sos>",
    "<eos>",
    device,
)
print("Terjemahan:", translated)

Terjemahan: pang ##anan ##e . ##e . ##e . .
