In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import numpy as np
import torchtext
import tqdm

In [None]:
import os
import sys

# Put the parent of the current working directory on the import path so the
# project-local `data` and `utils` packages imported below can be resolved.
# The result depends on the directory the kernel was started from.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(project_root)

from data.preprocessing import preprocess_dataset
from data.data_loader import get_data_loader
from utils.training import init_model, train_fn, evaluate_fn, translate_sentence


# Import Dataset

In [6]:
# Report the interpreter that is actually running this kernel. Shelling out to
# `which python` / `python --version` inspects whatever `python` is first on the
# shell PATH, which is not guaranteed to be the kernel's interpreter, and
# `which` is unavailable on non-POSIX systems.
import platform
import sys

print(sys.executable)
print(f"Python {platform.python_version()}")


/opt/homebrew/anaconda3/envs/gru_translation/bin/python
Python 3.10.16


In [7]:
# Record which release of the `datasets` package this run used.
import datasets

installed_version = datasets.__version__
print(installed_version)

2.14.5


In [8]:
import os

from datasets import load_from_disk

# The saved dataset lives two directories above the current working directory.
dataset_path = os.path.abspath(
    os.path.join("..", "..", "data", "indonesia_jawa_dataset")
)
dataset = load_from_disk(dataset_path)

# Print the loaded object's summary (splits, feature names, row counts).
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['indonesia', 'jawa'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['indonesia', 'jawa'],
        num_rows: 100
    })
    test: Dataset({
        features: ['indonesia', 'jawa'],
        num_rows: 100
    })
})


In [9]:
# Pull the three splits out of the loaded dataset under their own names.
train_data, valid_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [10]:
# Display one raw training example (index 5) to eyeball an 'indonesia'/'jawa' sentence pair.
train_data[5]

{'indonesia': 'Mau bikin postingan yang isinya mengedukasi customers gojek.',
 'jawa': 'Pengin nggawe postingan sing isine ngajari pelanggan Gojek.'}

# Preprocessing

In [11]:
# Preprocessing
# Run the project's preprocessing over the whole dataset, then unpack the
# processed splits and the two vocabularies it returns. The split names
# train_data / valid_data / test_data are rebound here, so from this point on
# they refer to the processed data, not the raw splits.
# NOTE(review): the dataset columns are 'indonesia' and 'jawa', so the names
# en_vocab / id_vocab may be leftovers from another language pair - confirm
# which vocabulary belongs to which language.
preprocessed = preprocess_dataset(dataset)
train_data, valid_data, test_data, en_vocab, id_vocab = preprocessed

Map: 100%|██████████| 800/800 [00:00<00:00, 863.21 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 189.29 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 152.49 examples/s]


✅ Tokenisasi selesai dengan BERT tokenizer!


Map: 100%|██████████| 800/800 [00:00<00:00, 1011.49 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1320.33 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1294.51 examples/s]


✅ Data siap digunakan dalam format PyTorch!


# Data Loader

In [12]:
# Look up the padding index in the vocabulary.
# NOTE(review): this single index is passed to every loader - presumably "<pad>"
# maps to the same index in both vocabularies; confirm.
pad_index = en_vocab["<pad>"]

# Batch size (adjust as needed).
batch_size = 128

# Settings shared by all three loaders.
loader_kwargs = dict(batch_size=batch_size, pad_index=pad_index)

# Build the train / validation / test loaders; only the training loader shuffles.
train_loader = get_data_loader(train_data, shuffle=True, **loader_kwargs)
valid_loader = get_data_loader(valid_data, shuffle=False, **loader_kwargs)
test_loader = get_data_loader(test_data, shuffle=False, **loader_kwargs)

# TRAIN

In [13]:
# Model initialisation
# Seed every RNG library the notebook imports before the model is created, so
# that runs are repeatable across kernel restarts. Without this nothing in the
# notebook fixes a seed.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Vocabulary sizes determine the embedding / output layer sizes.
input_dim = len(id_vocab)
output_dim = len(en_vocab)

# Hyperparameters.
embedding_dim = 512
hidden_dim = 1024
dropout = 0.3
clip = 1.0  # passed to train_fn
teacher_forcing_initial = 0.5  # teacher-forcing ratio used for the first epoch
teacher_forcing_final = 0.1    # teacher-forcing ratio reached at the last epoch
epochs = 10

# Prefer Apple's Metal backend when it is available, otherwise run on the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

model, optimizer, criterion = init_model(input_dim, output_dim, embedding_dim, hidden_dim, dropout, pad_index, device)
model

Using device: mps


Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(2622, 512)
    (rnn): GRU(512, 1024)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(3113, 512)
    (rnn): GRU(1536, 1024)
    (fc_out): Linear(in_features=2560, out_features=3113, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
)

In [None]:
# Training loop: linear teacher-forcing decay, best-checkpoint saving and
# early stopping on the validation loss.
best_valid_loss = float("inf")
patience = 5          # epochs without improvement tolerated before stopping
patience_counter = 0

# torch.save does not create missing directories, so make sure the checkpoint
# directory exists before the first save.
checkpoint_path = "checkpoints/gru_model.pt"
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Denominator of the decay schedule; clamped so that epochs == 1 does not
# divide by zero (the ratio then simply stays at its initial value).
decay_steps = max(epochs - 1, 1)

for epoch in tqdm.tqdm(range(epochs)):
    # Interpolate linearly from teacher_forcing_initial (first epoch) to
    # teacher_forcing_final (last epoch).
    teacher_forcing_ratio = teacher_forcing_initial - \
                            (teacher_forcing_initial - teacher_forcing_final) * \
                            (epoch / decay_steps)

    train_loss = train_fn(model, train_loader, optimizer, criterion, clip, teacher_forcing_ratio, device)
    valid_loss = evaluate_fn(model, valid_loader, criterion, device)

    # Report before the early-stopping check so the metrics of the epoch that
    # triggers the stop are still shown.
    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.3f} | Train PPL: {np.exp(train_loss):.3f}")
    print(f"Valid Loss: {valid_loss:.3f} | Valid PPL: {np.exp(valid_loss):.3f}")

    if valid_loss < best_valid_loss:
        # New best model: checkpoint it and reset the patience counter.
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), checkpoint_path)
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break


# EVALUATE

In [None]:
# Model evaluation
# Restore the best checkpoint from the path the training loop saves to
# ("checkpoints/gru_model.pt"); loading plain "gru_model.pt" would look in the
# wrong place. map_location puts the tensors on the active device regardless of
# the device they were saved from.
model.load_state_dict(torch.load("checkpoints/gru_model.pt", map_location=device))
test_loss = evaluate_fn(model, test_loader, criterion, device)
print(f"Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):.3f}")

In [None]:

# Example translation (make sure the model has been loaded beforehand).
# NOTE(review): the vocabularies are passed as (en_vocab, id_vocab) - confirm
# this matches the order translate_sentence expects.
sentence = "aku akan makan"
translated = translate_sentence(
    sentence,
    model,
    en_vocab,
    id_vocab,
    "<sos>",
    "<eos>",
    device,
)
print("Terjemahan:", translated)