In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import numpy as np
import torchtext
import tqdm
import yaml 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys
import os
# Add the parent of the current working directory to the import path.
# NOTE(review): presumably this is what makes the project-local `data` and
# `utils` packages below importable — it only works when the notebook is
# started from its own folder; confirm.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../")))

from data.preprocessing import preprocess_dataset  
from data.data_loader import get_data_loader
from utils.training import init_model, train_fn, evaluate_fn
from utils.inference import translate_sentence


# Import Dataset

In [3]:
# Load the YAML configuration used by the cells below.
# The path is relative to the current working directory. The encoding is pinned
# to UTF-8 so that parsing does not depend on the platform's default locale
# encoding.
with open(os.path.abspath("../../configs/gru_seq2seq.yaml"), "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

In [11]:
from datasets import load_dataset

# Download the NusaX-MT Javanese-Indonesian pairs from the Hugging Face Hub for
# inspection. The result is kept under its own name: its columns are
# `text_1`/`text_2`, while the cells below work on the `dataset` loaded from
# disk (columns `indonesia`/`jawa`). Reusing the name `dataset` here would
# silently swap the data under those cells whenever this cell is run out of
# order.
# NOTE(review): trust_remote_code=True executes the dataset's loading script
# from the Hub on this machine — confirm the source is trusted.
nusax_raw = load_dataset("IndoNLP/NusaX-MT", "jav-ind", trust_remote_code=True)
nusax_raw

Generating train split: 500 examples [00:00, 5609.79 examples/s]
Generating validation split: 100 examples [00:00, 6424.21 examples/s]
Generating test split: 400 examples [00:00, 8124.32 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'text_1', 'text_2', 'text_1_lang', 'text_2_lang'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['id', 'text_1', 'text_2', 'text_1_lang', 'text_2_lang'],
        num_rows: 100
    })
    test: Dataset({
        features: ['id', 'text_1', 'text_2', 'text_1_lang', 'text_2_lang'],
        num_rows: 400
    })
})

In [5]:
import os
from datasets import load_from_disk

# Turn the dataset location from the config into an absolute path, then load
# the saved dataset from that folder.
dataset_path = os.path.abspath(config["data"]["dataset_path"])
dataset = load_from_disk(dataset_path)

# Print the loaded object as a quick check of splits, columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['indonesia', 'jawa'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['indonesia', 'jawa'],
        num_rows: 100
    })
    test: Dataset({
        features: ['indonesia', 'jawa'],
        num_rows: 100
    })
})


In [11]:
# Pull the three splits out of `dataset` under short names.
train_data, valid_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [12]:
# Display one training example to sanity-check the loaded columns.
train_data[5]

{'indonesia': 'Mau bikin postingan yang isinya mengedukasi customers gojek.',
 'jawa': 'Pengin nggawe postingan sing isine ngajari pelanggan Gojek.'}

# Preprocessing

In [13]:
# Run the project's preprocessing over all splits. It hands back the three
# processed splits together with two vocabularies.
(
    train_data,
    valid_data,
    test_data,
    en_vocab,
    id_vocab,
) = preprocess_dataset(dataset)

Map: 100%|██████████| 800/800 [00:00<00:00, 850.79 examples/s] 
Map: 100%|██████████| 100/100 [00:00<00:00, 1255.04 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 361.67 examples/s]


✅ Tokenisasi selesai dengan BERT tokenizer!


Map: 100%|██████████| 800/800 [00:00<00:00, 957.52 examples/s] 
Map: 100%|██████████| 100/100 [00:00<00:00, 2000.64 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 2223.80 examples/s]

✅ Data siap digunakan dalam format PyTorch!





# Data Loader

In [15]:
# Look up the index of the padding token in the vocabulary.
pad_index = en_vocab[config["data"]["pad_token"]]

# Batch size comes from the training section of the config.
batch_size = config["training"]["batch_size"]

# Build one loader per split with shared options; only the training split is
# shuffled.
loader_options = {"batch_size": batch_size, "pad_index": pad_index}
train_loader = get_data_loader(train_data, shuffle=True, **loader_options)
valid_loader = get_data_loader(valid_data, shuffle=False, **loader_options)
test_loader = get_data_loader(test_data, shuffle=False, **loader_options)

# TRAIN

In [17]:
# Store the vocabulary sizes in the in-memory config: `input_dim` is sized from
# `id_vocab`, `output_dim` from `en_vocab`.
config["model"]["input_dim"] = len(id_vocab)
config["model"]["output_dim"] = len(en_vocab)

# Pick the compute device. CUDA is checked first so the notebook also uses a
# GPU on non-Apple machines; otherwise Apple's MPS backend, then the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Build the model together with its optimizer and loss criterion.
model, optimizer, criterion = init_model(
    config["model"]["input_dim"],
    config["model"]["output_dim"],
    config["model"]["embedding_dim"],
    config["model"]["hidden_dim"],
    config["model"]["dropout"],
    pad_index,
    device
)

# Training parameters read from the YAML config.
epochs = config["training"]["epochs"]
clip = config["training"]["clip"]
teacher_forcing_initial = config["training"]["teacher_forcing_initial"]
teacher_forcing_final = config["training"]["teacher_forcing_final"]
checkpoint_path = config["training"]["checkpoint_path"]
patience = config["training"]["patience"]
patience_counter = config["training"]["patience_counter"]

# Show the model architecture as the cell output.
model

Using device: mps


Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(2622, 512)
    (rnn): GRU(512, 1024)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(3113, 512)
    (rnn): GRU(1536, 1024)
    (fc_out): Linear(in_features=2560, out_features=3113, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
)

In [None]:
# Training loop: linear teacher-forcing decay, best-checkpoint saving and early
# stopping on the validation loss.
best_valid_loss = float("inf")
# `patience` is the value read from the YAML config in the setup cell; it is no
# longer overridden with a hard-coded number here. Only the counter is reset so
# that re-running this cell starts a fresh count.
patience_counter = 0

# torch.save does not create missing directories, so make sure the target
# folder exists before the first checkpoint is written.
os.makedirs("checkpoints", exist_ok=True)

# Guard the decay denominator: with a single epoch, `epochs - 1` would be zero.
decay_steps = max(epochs - 1, 1)

for epoch in tqdm.tqdm(range(epochs)):
    # Interpolate linearly from the initial ratio (first epoch) to the final
    # ratio (last epoch).
    teacher_forcing_ratio = teacher_forcing_initial - \
                            (teacher_forcing_initial - teacher_forcing_final) * \
                            (epoch / decay_steps)

    train_loss = train_fn(model, train_loader, optimizer, criterion, clip, teacher_forcing_ratio, device)
    valid_loss = evaluate_fn(model, valid_loader, criterion, device)

    # Report before the early-stopping check so the metrics of the epoch that
    # triggers the stop are shown as well.
    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.3f} | Train PPL: {np.exp(train_loss):.3f}")
    print(f"Valid Loss: {valid_loss:.3f} | Valid PPL: {np.exp(valid_loss):.3f}")

    if valid_loss < best_valid_loss:
        # New best validation loss: keep these weights and reset the counter.
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "checkpoints/gru_model.pt")
        patience_counter = 0
    else:
        # No improvement: stop once `patience` consecutive epochs have passed
        # without a new best.
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break


# EVALUATE

In [None]:
# Evaluate the saved checkpoint on the test split.
# The weights are loaded from "checkpoints/gru_model.pt", the path the training
# loop writes to (the previous "gru_model.pt" pointed at a different file).
# map_location keeps the load working when the weights were saved on another
# device.
model.load_state_dict(torch.load("checkpoints/gru_model.pt", map_location=device))
test_loss = evaluate_fn(model, test_loader, criterion, device)
print(f"Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):.3f}")

In [None]:

# Example translation (the model must already be trained or loaded).
sentence = "aku akan makan"
translated = translate_sentence(
    sentence,
    model,
    en_vocab,
    id_vocab,
    "<sos>",
    "<eos>",
    device,
)
print("Terjemahan:", translated)