In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import numpy as np
import torchtext
import tqdm

In [3]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../")))

from data.preprocessing import preprocess_dataset  # ✅ Import ulang setelah menambahkan path
from data.data_loader import get_data_loader


# Import Dataset

In [4]:
!which python
!python --version


/opt/homebrew/anaconda3/envs/gru_translation/bin/python
Python 3.10.16


In [5]:
import datasets
print(datasets.__version__)

2.14.5


In [6]:
from datasets import load_from_disk
import os

dataset_path = os.path.abspath("../../data/indonesia_jawa_dataset")

dataset = load_from_disk(dataset_path)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['indonesia', 'jawa'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['indonesia', 'jawa'],
        num_rows: 100
    })
    test: Dataset({
        features: ['indonesia', 'jawa'],
        num_rows: 100
    })
})


In [7]:
train_data, valid_data, test_data = (
    dataset["train"],
    dataset["validation"],
    dataset["test"],
)

In [8]:
train_data[5]

{'indonesia': 'Mau bikin postingan yang isinya mengedukasi customers gojek.',
 'jawa': 'Pengin nggawe postingan sing isine ngajari pelanggan Gojek.'}

# Preprocessing

In [9]:
# Preprocessing
train_data, valid_data, test_data, en_vocab, id_vocab = preprocess_dataset(dataset)

Map: 100%|██████████| 800/800 [00:00<00:00, 1108.90 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1250.19 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1237.98 examples/s]


✅ Tokenisasi selesai dengan BERT tokenizer!


Map: 100%|██████████| 800/800 [00:00<00:00, 4621.98 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1693.40 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1065.23 examples/s]


✅ Data siap digunakan dalam format PyTorch!


# Data Loader

In [10]:
# Ambil indeks padding dari vocabulary
pad_index = en_vocab["<pad>"]

# Definisikan batch size
batch_size = 128  # Sesuai kebutuhan

# Buat DataLoader untuk train, valid, dan test
train_loader = get_data_loader(train_data, batch_size=batch_size, pad_index=pad_index, shuffle=True)
valid_loader = get_data_loader(valid_data, batch_size=batch_size, pad_index=pad_index, shuffle=False)
test_loader = get_data_loader(test_data, batch_size=batch_size, pad_index=pad_index, shuffle=False)

## Encoder

In [23]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = [src length, batch size]
        embedded = self.dropout(self.embedding(src))
        # embedded = [src length, batch size, embedding dim]
        outputs, hidden = self.rnn(embedded)  # no cell state in GRU!
        # outputs = [src length, batch size, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # outputs are always from the top hidden layer
        return hidden

## Decoder

In [24]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.GRU(embedding_dim + hidden_dim, hidden_dim)
        self.fc_out = nn.Linear(embedding_dim + hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, context):
        # input = [batch size]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # context = [n layers * n directions, batch size, hidden dim]
        # n layers and n directions in the decoder will both always be 1, therefore:
        # hidden = [1, batch size, hidden dim]
        # context = [1, batch size, hidden dim]
        input = input.unsqueeze(0)
        # input = [1, batch size]
        embedded = self.dropout(self.embedding(input))
        # embedded = [1, batch size, embedding dim]
        emb_con = torch.cat((embedded, context), dim=2)
        # emb_con = [1, batch size, embedding dim + hidden dim]
        output, hidden = self.rnn(emb_con, hidden)
        # output = [seq len, batch size, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # seq len, n layers and n directions will always be 1 in this decoder, therefore:
        # output = [1, batch size, hidden dim]
        # hidden = [1, batch size, hidden dim]
        output = torch.cat(
            (embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)), dim=1
        )
        # output = [batch size, embedding dim + hidden dim * 2]
        prediction = self.fc_out(output)
        # prediction = [batch size, output dim]
        return prediction, hidden

## Seq2Seq

In [25]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"

    def forward(self, src, trg, teacher_forcing_ratio):
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        # teacher_forcing_ratio is probability to use teacher forcing
        # e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        batch_size = trg.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        # tensor to store decoder outputs
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
        # last hidden state of the encoder is the context
        context = self.encoder(src)
        # context = [n layers * n directions, batch size, hidden dim]
        # context also used as the initial hidden state of the decoder
        hidden = context
        # hidden = [n layers * n directions, batch size, hidden dim]
        # first input to the decoder is the <sos> tokens
        input = trg[0, :]
        for t in range(1, trg_length):
            # insert input token embedding, previous hidden state and the context state
            # receive output tensor (predictions) and new hidden state
            output, hidden = self.decoder(input, hidden, context)
            # output = [batch size, output dim]
            # hidden = [1, batch size, hidden dim]
            # place predictions in a tensor holding predictions for each token
            outputs[t] = output
            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            # get the highest predicted token from our predictions
            top1 = output.argmax(1)
            # if teacher forcing, use actual next token as next input
            # if not, use predicted token
            input = trg[t] if teacher_force else top1
            # input = [batch size]
        return outputs

## Training Seq2Seq

In [57]:
input_dim = len(id_vocab)
output_dim = len(en_vocab)
encoder_embedding_dim = 512 # dari 256
decoder_embedding_dim = 512 # dari 256
hidden_dim = 1024 # dari 512
encoder_dropout = 0.3 # ubah jadi 0,6
decoder_dropout = 0.3 # ubah jadi 0,6
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")
# device = torch.device("mps" if torch.cuda.is_available() else "cpu")

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    hidden_dim,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    hidden_dim,
    decoder_dropout,
)

model = Seq2Seq(encoder, decoder, device).to(device)

Using device: mps


In [58]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)


model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(2197, 512)
    (rnn): GRU(512, 1024)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(2587, 512)
    (rnn): GRU(1536, 1024)
    (fc_out): Linear(in_features=2560, out_features=2587, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
)

In [59]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 21,669,915 trainable parameters


## Func Train

In [60]:
optimizer = optim.Adam(model.parameters(), 
                       weight_decay=1e-5) # menambah ini utk regularisasi
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

In [61]:
def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        src = batch["id_ids"].to(device)
        trg = batch["en_ids"].to(device)
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio)
        # output = [trg length, batch size, trg vocab size]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        # output = [(trg length - 1) * batch size, trg vocab size]
        trg = trg[1:].view(-1)
        # trg = [(trg length - 1) * batch size]
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

## Func Evaluate

In [62]:
def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            src = batch["id_ids"].to(device)
            trg = batch["en_ids"].to(device)
            # src = [src length, batch size]
            # trg = [trg length, batch size]
            output = model(src, trg, 0)  # turn off teacher forcing
            # output = [trg length, batch size, trg vocab size]
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            # output = [(trg length - 1) * batch size, trg vocab size]
            trg = trg[1:].view(-1)
            # trg = [(trg length - 1) * batch size]
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

# TRAIN

In [63]:
# n_epochs = 10
# clip = 1.0
# teacher_forcing_ratio = 0.5 # coba ubah dari 0.5

# best_valid_loss = float("inf")
# patience = 5

# for epoch in tqdm.tqdm(range(n_epochs)):
#     train_loss = train_fn(
#         model,
#         train_data_loader,
#         optimizer,
#         criterion,
#         clip,
#         teacher_forcing_ratio,
#         device,
#     )
#     valid_loss = evaluate_fn(
#         model,
#         valid_data_loader,
#         criterion,
#         device,
#     )
    
#     if valid_loss < best_valid_loss:
#         best_valid_loss = valid_loss
#         torch.save(model.state_dict(), "tut2-model.pt")
#         patience_counter = 0
#     else:
#         patience_counter += 1
#         if patience_counter >= patience:
#             print(f"Early stopping at epoch {epoch + 1}")
#             break
#     print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
#     print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

In [64]:
import numpy as np
import torch
import tqdm

# Hyperparameter pelatihan
n_epochs = 10
clip = 1.0
initial_teacher_forcing_ratio = 0.5  # Nilai awal
final_teacher_forcing_ratio = 0.1    # Nilai akhir
best_valid_loss = float("inf")
patience = 5
patience_counter = 0

# Loop pelatihan dengan penurunan teacher_forcing_ratio
for epoch in tqdm.tqdm(range(n_epochs)):
    # Hitung teacher_forcing_ratio secara linier
    teacher_forcing_ratio = initial_teacher_forcing_ratio - \
                            (initial_teacher_forcing_ratio - final_teacher_forcing_ratio) * (epoch / (n_epochs - 1))
    
    # Pastikan tidak kurang dari final_teacher_forcing_ratio
    teacher_forcing_ratio = max(teacher_forcing_ratio, final_teacher_forcing_ratio)

    # Cetak nilai teacher_forcing_ratio untuk debugging
    print(f"Epoch {epoch + 1}/{n_epochs} | Teacher Forcing Ratio: {teacher_forcing_ratio:.3f}")

    # Panggil fungsi pelatihan dengan teacher_forcing_ratio yang diperbarui
    train_loss = train_fn(
        model,
        train_data_loader,
        optimizer,
        criterion,
        clip,
        teacher_forcing_ratio,
        device,
    )
    
    valid_loss = evaluate_fn(
        model,
        valid_data_loader,
        criterion,
        device,
    )
    
    # Logika early stopping
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "tut2-model.pt")
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break
    
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

# Evaluasi setelah pelatihan
model.load_state_dict(torch.load("tut2-model.pt"))
test_loss = evaluate_fn(model, test_data_loader, criterion, device)
print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/10 | Teacher Forcing Ratio: 0.500


 10%|█         | 1/10 [01:48<16:12, 108.03s/it]

	Train Loss:   7.551 | Train PPL: 1903.499
	Valid Loss:   8.279 | Valid PPL: 3940.723
Epoch 2/10 | Teacher Forcing Ratio: 0.456


 20%|██        | 2/10 [05:16<22:16, 167.09s/it]

	Train Loss:   7.892 | Train PPL: 2675.284
	Valid Loss:   7.626 | Valid PPL: 2051.511
Epoch 3/10 | Teacher Forcing Ratio: 0.411


 30%|███       | 3/10 [06:53<15:46, 135.26s/it]

	Train Loss:   6.836 | Train PPL: 930.663
	Valid Loss:   7.003 | Valid PPL: 1100.355
Epoch 4/10 | Teacher Forcing Ratio: 0.367


 40%|████      | 4/10 [09:20<13:59, 139.85s/it]

	Train Loss:   6.531 | Train PPL: 686.421
	Valid Loss:   6.860 | Valid PPL: 953.433
Epoch 5/10 | Teacher Forcing Ratio: 0.322


 50%|█████     | 5/10 [12:45<13:36, 163.34s/it]

	Train Loss:   6.459 | Train PPL: 638.189
	Valid Loss:   6.845 | Valid PPL: 939.495
Epoch 6/10 | Teacher Forcing Ratio: 0.278


 60%|██████    | 6/10 [13:31<08:13, 123.30s/it]

	Train Loss:   6.406 | Train PPL: 605.747
	Valid Loss:   6.848 | Valid PPL: 941.911
Epoch 7/10 | Teacher Forcing Ratio: 0.233


 70%|███████   | 7/10 [14:45<05:21, 107.10s/it]

	Train Loss:   6.368 | Train PPL: 583.051
	Valid Loss:   6.865 | Valid PPL: 958.588
Epoch 8/10 | Teacher Forcing Ratio: 0.189


 80%|████████  | 8/10 [16:06<03:17, 98.92s/it] 

	Train Loss:   6.348 | Train PPL: 571.271
	Valid Loss:   6.876 | Valid PPL: 968.949
Epoch 9/10 | Teacher Forcing Ratio: 0.144


 80%|████████  | 8/10 [17:16<04:19, 129.59s/it]


KeyboardInterrupt: 

# EVALUATE

In [None]:
model.load_state_dict(torch.load("tut2-model.pt"))

test_loss = evaluate_fn(model, test_data_loader, criterion, device)

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

| Test Loss: 6.644 | Test PPL: 767.833 |


In [None]:
sentence = "aku akan makan"

In [None]:
# sentence = "Dia sangat pintar."
translated = translate_sentence(sentence, model, tokenizer, en_vocab, id_vocab, "<sos>", "<eos>", device)
print("Terjemahan:", translated)

Terjemahan: . . . . . . . . . . . . . . . . . . . . . . . .


In [None]:
def translate_sentence(
    sentence,
    model,
    tokenizer,
    en_vocab,
    id_vocab,
    sos_token,
    eos_token,
    device,
    max_output_length=25,
    max_repetition=3  # Batas maksimum pengulangan token tidak bermakna
):
    model.eval()
    
    # Tokenisasi input
    tokens = [sos_token] + tokenizer.tokenize(sentence.lower())[:1000] + [eos_token]
    numericalized = [id_vocab[token] if token in id_vocab else id_vocab["<unk>"] for token in tokens]
    sentence_tensor = torch.LongTensor(numericalized).unsqueeze(1).to(device)

    # Encoder
    with torch.no_grad():
        context = model.encoder(sentence_tensor)

    # Decoder
    trg_tokens = [en_vocab[sos_token]]
    hidden = context
    last_token = None
    repeat_count = 0

    for _ in range(max_output_length):
        trg_tensor = torch.LongTensor([trg_tokens[-1]]).to(device)
        with torch.no_grad():
            output, hidden = model.decoder(trg_tensor, hidden, context)
        
        # Ambil token dengan probabilitas tertinggi
        top1 = output.argmax(1).item()
        trg_tokens.append(top1)

        # Cek pengulangan token tidak bermakna
        current_token = en_vocab.lookup_token(top1)
        if current_token in [".", "<pad>", "<unk>"]:  # Token yang dianggap tidak bermakna
            if current_token == last_token:
                repeat_count += 1
            else:
                repeat_count = 1
            if repeat_count >= max_repetition:
                break  # Hentikan jika pengulangan melebihi batas
        else:
            repeat_count = 0  # Reset jika token bermakna
        
        last_token = current_token

        # Hentikan jika menemukan <eos>
        if top1 == en_vocab[eos_token]:
            break

    # Konversi ke string
    translated_tokens = en_vocab.lookup_tokens(trg_tokens[1:-1])  # Hapus <sos> & <eos>
    translated_sentence = " ".join(translated_tokens)
    return translated_sentence

# Contoh penggunaan
sentence = "aku akan makan"
translated = translate_sentence(sentence, model, tokenizer, en_vocab, id_vocab, "<sos>", "<eos>", device)
print("Terjemahan:", translated)

Terjemahan: . .
