# Project Transformer : Performance Evaluation of Vanilla and Performer Transformers for Text-to-Cypher Query Generation

Proyek ini merupakan proyek kuliah deep learning yang membangun vanilla Transformer dan sebuah varian Transformer (Performer) untuk permasalahan pembangkitan query Cypher dari teks (text-to-Cypher).

## 1. Import and Install Libraries

In [None]:
!pip install wandb
!pip install nltk
!pip install torchinfo
from torchinfo import summary
import nltk
import torch
import torch.nn as nn
import math
from tqdm import tqdm
from datasets import load_dataset
from transformers import T5Tokenizer
import time
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
nltk.download("punkt")

### Login Wandb

In [None]:
import os

import wandb

# SECURITY: never commit a W&B API key to the notebook. wandb.login() reads the
# key from the WANDB_API_KEY environment variable (export it, or inject it from
# the platform's secret store, before running this cell) and otherwise asks for
# it interactively. The key that used to be hard-coded here has been published
# with the notebook and should be revoked.
if not os.environ.get("WANDB_API_KEY"):
    print("WANDB_API_KEY is not set; wandb.login() will ask for the key interactively.")
wandb.login()

## 2. Building the Transformers (Vanilla and Performer)

### 1. Vanilla Transformer

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    The table is precomputed once for ``max_len`` positions and stored as a
    non-trainable buffer named ``pe`` with shape ``(1, max_len, emb_dim)``.

    Args:
        emb_dim: Size of each embedding vector. Odd sizes are supported.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, emb_dim, max_len=64):
        super().__init__()
        pe = torch.zeros(max_len, emb_dim)
        # Column vector holding the positions 0 .. max_len - 1.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # One frequency per sin/cos pair: exp(-ln(10000) * 2i / emb_dim).
        div_term = torch.exp(
            torch.arange(0, emb_dim, 2).float() * (-math.log(10000.0) / emb_dim)
        )
        angles = position * div_term
        # Even columns carry the sine, odd columns the cosine.
        pe[:, 0::2] = torch.sin(angles)
        # With an odd emb_dim there is one fewer odd column than even columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : emb_dim // 2])
        # Leading batch axis; registered as a buffer so it follows .to(device)
        # and is saved in the state dict without being trained.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1)]


def scaled_dot_product(q, k, v, mask=None):
    """Compute softmax attention ``softmax(q k^T / sqrt(d_k)) v``.

    Args:
        q, k, v: Query, key and value tensors; the last axis of ``q`` is d_k.
        mask: Optional tensor broadcastable to the score matrix. Positions
            where it equals 0 are excluded from the softmax.

    Returns:
        A tuple ``(output, attention_weights)``.
    """
    scale = math.sqrt(q.size(-1))
    logits = torch.matmul(q, k.transpose(-2, -1)) / scale
    if mask is not None:
        # -inf logits receive zero probability after the softmax.
        logits = logits.masked_fill(mask == 0, float('-inf'))
    weights = torch.softmax(logits, dim=-1)
    context = torch.matmul(weights, v)
    return context, weights


class MultiHeadAttention(nn.Module):
    """Multi-head softmax attention with learned q/k/v/output projections.

    Args:
        emb_dim: Model width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
    """

    def __init__(self, emb_dim, num_heads):
        super().__init__()
        assert emb_dim % num_heads == 0  # every head gets an equal slice
        self.d_k = emb_dim // num_heads  # width of a single head
        self.num_heads = num_heads

        # Input projections for queries, keys and values.
        self.q_linear = nn.Linear(emb_dim, emb_dim)
        self.k_linear = nn.Linear(emb_dim, emb_dim)
        self.v_linear = nn.Linear(emb_dim, emb_dim)
        # Mixes the concatenated heads back into one vector per position.
        self.out = nn.Linear(emb_dim, emb_dim)

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, seq, emb) into (batch, heads, seq, d_k)."""
        per_head = projected.view(batch_size, -1, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``; ``mask`` goes to scaled_dot_product."""
        batch_size = q.size(0)

        q_heads = self._split_heads(self.q_linear(q), batch_size)
        k_heads = self._split_heads(self.k_linear(k), batch_size)
        v_heads = self._split_heads(self.v_linear(v), batch_size)

        # The attention weights are not needed here, only the weighted values.
        context, _ = scaled_dot_product(q_heads, k_heads, v_heads, mask)

        # Undo the head split: (batch, heads, seq, d_k) -> (batch, seq, emb).
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(batch_size, -1, self.num_heads * self.d_k)
        return self.out(merged)


class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear.

    Args:
        emb_dim: Input and output width.
        ff_dim: Width of the hidden layer.
    """

    def __init__(self, emb_dim, ff_dim):
        super().__init__()
        self.linear1 = nn.Linear(emb_dim, ff_dim)  # expand to the hidden width
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(ff_dim, emb_dim)  # project back to emb_dim

    def forward(self, x):
        """Apply the expansion, the non-linearity and the projection in turn."""
        hidden = self.linear1(x)
        activated = self.relu(hidden)
        return self.linear2(activated)

class EncoderLayer(nn.Module):
    """Encoder block: self-attention then feed-forward, each post-normalised.

    Both sub-layers are wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        emb_dim: Model width.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, emb_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(emb_dim, num_heads)
        self.ff = PositionwiseFeedForward(emb_dim, ff_dim)
        self.norm1 = nn.LayerNorm(emb_dim)  # after self-attention
        self.norm2 = nn.LayerNorm(emb_dim)  # after the feed-forward network
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Return the encoded representation of ``x``; ``mask`` goes to self-attention."""
        attended = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))  # residual + dropout + norm

        transformed = self.ff(x)
        return self.norm2(x + self.dropout(transformed))  # residual + dropout + norm


class DecoderLayer(nn.Module):
    """Decoder block: self-attention, cross-attention, then feed-forward.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        emb_dim: Model width.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, emb_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(emb_dim, num_heads)   # over the target sequence
        self.cross_attn = MultiHeadAttention(emb_dim, num_heads)  # over the encoder output
        self.ff = PositionwiseFeedForward(emb_dim, ff_dim)
        self.norm1 = nn.LayerNorm(emb_dim)  # after self-attention
        self.norm2 = nn.LayerNorm(emb_dim)  # after cross-attention
        self.norm3 = nn.LayerNorm(emb_dim)  # after the feed-forward network
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        """Decode ``x`` against ``enc_output``.

        ``tgt_mask`` is handed to the self-attention and ``src_mask`` to the
        cross-attention.
        """
        self_ctx = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_ctx))

        # Queries come from the decoder, keys and values from the encoder.
        cross_ctx = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(cross_ctx))

        ff_out = self.ff(x)
        return self.norm3(x + self.dropout(ff_out))


class TransformerEncoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of EncoderLayers.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_dim: Model width.
        num_layers: Number of stacked encoder layers.
        num_heads: Attention heads per layer.
        ff_dim: Hidden width of each feed-forward network.
        max_len: Number of positions precomputed by the positional encoding.
    """

    def __init__(self, vocab_size, emb_dim, num_layers, num_heads, ff_dim, max_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.pos_encoding = PositionalEncoding(emb_dim, max_len)
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(EncoderLayer(emb_dim, num_heads, ff_dim))

    def forward(self, src, mask=None):
        """Encode token ids ``src``; ``mask`` is forwarded to every layer."""
        hidden = self.pos_encoding(self.embedding(src))
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, mask)
        return hidden

class TransformerDecoder(nn.Module):
    """Target embedding, a stack of DecoderLayers and a vocabulary projection.

    Args:
        vocab_size: Target vocabulary size (embedding rows and logit width).
        emb_dim: Model width.
        num_layers: Number of stacked decoder layers.
        num_heads: Attention heads per layer.
        ff_dim: Hidden width of each feed-forward network.
        max_len: Number of positions precomputed by the positional encoding.
    """

    def __init__(self, vocab_size, emb_dim, num_layers, num_heads, ff_dim, max_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.pos_encoding = PositionalEncoding(emb_dim, max_len)
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(DecoderLayer(emb_dim, num_heads, ff_dim))
        self.fc_out = nn.Linear(emb_dim, vocab_size)  # hidden state -> vocabulary logits

    def forward(self, tgt, enc_output, src_mask=None, tgt_mask=None):
        """Return vocabulary logits for every position of ``tgt``."""
        hidden = self.pos_encoding(self.embedding(tgt))
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, enc_output, src_mask, tgt_mask)
        return self.fc_out(hidden)


class Transformer(nn.Module):
    """Encoder-decoder model built from TransformerEncoder and TransformerDecoder.

    Args:
        src_vocab_size: Vocabulary size of the encoder input.
        tgt_vocab_size: Vocabulary size of the decoder input/output.
        emb_dim, num_layers, num_heads, ff_dim, max_len: Passed unchanged to
            both the encoder and the decoder.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, emb_dim, num_layers, num_heads, ff_dim, max_len):
        super().__init__()
        shared_args = (emb_dim, num_layers, num_heads, ff_dim, max_len)
        self.encoder = TransformerEncoder(src_vocab_size, *shared_args)
        self.decoder = TransformerDecoder(tgt_vocab_size, *shared_args)

    def make_subsequent_mask(self, size):
        """Return a (1, 1, size, size) lower-triangular mask of ones."""
        lower_triangle = torch.tril(torch.ones(size, size))
        return lower_triangle[None, None]

    def forward(self, src, tgt, src_mask=None):
        """Encode ``src`` and decode ``tgt`` against it, returning the decoder logits."""
        memory = self.encoder(src, src_mask)
        # Position i of the target may only look at positions <= i.
        causal_mask = self.make_subsequent_mask(tgt.size(1)).to(tgt.device)
        return self.decoder(tgt, memory, src_mask, causal_mask)




### 2. Performer Transformer

In [None]:
import torch.nn.functional as F

def elu_feature_map(x):
    """Feature map used by the linear attention: ``ELU(x) + 1``.

    ELU is bounded below by -1, so the shifted result is never negative.
    """
    activated = F.elu(x)
    return activated + 1

    
def linear_attention(q, k, v):
    """Kernelised attention that avoids building the n x n score matrix.

    ``q``, ``k`` and ``v`` are indexed as (batch, head, position, feature) by
    the einsum expressions below. No mask is applied: every query position
    attends to every key position.

    Returns:
        A tensor indexed like ``v`` along the feature axis, one row per query.
    """
    phi_q = elu_feature_map(q)
    phi_k = elu_feature_map(k)

    # Summary of all keys and values: sum_n phi(k_n) v_n^T.
    key_value = torch.einsum('bhnd,bhne->bhde', phi_k, v)
    # Per-query normaliser 1 / (phi(q) . sum_n phi(k_n)); epsilon avoids division by zero.
    key_sum = phi_k.sum(dim=2)
    normaliser = 1 / (torch.einsum('bhnd,bhd->bhn', phi_q, key_sum) + 1e-6)

    unnormalised = torch.einsum('bhnd,bhde->bhne', phi_q, key_value)
    return unnormalised * normaliser.unsqueeze(-1)

class PerformerAttention(nn.Module):
    """Multi-head attention using the ELU+1 kernel instead of a softmax.

    Without a mask the linear-time ``linear_attention`` path is used. When a
    mask is supplied it is now honoured: previously the argument was silently
    ignored, so the decoder self-attention could look at future target tokens
    during teacher-forced training and evaluation.

    NOTE: checkpoints trained with the unmasked decoder will behave differently
    under this corrected forward pass and should be retrained.

    Args:
        emb_dim: Model width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
    """

    def __init__(self, emb_dim, num_heads):
        super().__init__()
        assert emb_dim % num_heads == 0  # every head gets an equal slice
        self.d_k = emb_dim // num_heads  # width of a single head
        self.num_heads = num_heads

        # Linear projections for queries, keys and values.
        self.q_linear = nn.Linear(emb_dim, emb_dim)
        self.k_linear = nn.Linear(emb_dim, emb_dim)
        self.v_linear = nn.Linear(emb_dim, emb_dim)
        self.out = nn.Linear(emb_dim, emb_dim)  # final output projection

    @staticmethod
    def _masked_kernel_attention(q, k, v, mask, eps=1e-6):
        """Kernel attention restricted to the positions where ``mask != 0``.

        Uses the same ELU+1 feature map and the same normalisation as the
        unmasked linear path, but materialises the (query, key) weight matrix
        so that an arbitrary broadcastable mask can be applied. This costs
        O(n^2) memory per head, the price of supporting a general mask.
        """
        q = F.elu(q) + 1
        k = F.elu(k) + 1
        weights = torch.matmul(q, k.transpose(-2, -1))
        # Kernel weights are non-negative, so excluded pairs are zeroed
        # (not set to -inf as in softmax attention).
        weights = weights.masked_fill(mask == 0, 0.0)
        normaliser = weights.sum(dim=-1, keepdim=True) + eps
        return torch.matmul(weights, v) / normaliser

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q, k, v: Tensors of shape (batch, seq, emb_dim).
            mask: Optional tensor broadcastable to (batch, heads, q_len, k_len);
                zeros mark key positions a query must not attend to.
        """
        bsz = q.size(0)

        def transform(x, linear):
            x = linear(x)
            # (batch, seq, emb) -> (batch, heads, seq, d_k)
            return x.view(bsz, -1, self.num_heads, self.d_k).transpose(1, 2)

        q, k, v = transform(q, self.q_linear), transform(k, self.k_linear), transform(v, self.v_linear)

        if mask is None:
            attn_output = linear_attention(q, k, v)
        else:
            attn_output = self._masked_kernel_attention(q, k, v, mask)

        # Merge the heads back: (batch, heads, seq, d_k) -> (batch, seq, emb).
        out = attn_output.transpose(1, 2).contiguous().view(bsz, -1, self.num_heads * self.d_k)

        return self.out(out)

class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear.

    Re-declares the class of the same name defined earlier in the file, with
    the same layers.

    Args:
        emb_dim: Input and output width.
        ff_dim: Width of the hidden layer.
    """

    def __init__(self, emb_dim, ff_dim):
        super().__init__()
        self.linear1 = nn.Linear(emb_dim, ff_dim)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(ff_dim, emb_dim)

    def forward(self, x):
        """Run ``x`` through linear1, relu and linear2 in that order."""
        for stage in (self.linear1, self.relu, self.linear2):
            x = stage(x)
        return x


class EncoderLayerPerformer(nn.Module):
    """Encoder block that uses PerformerAttention for its self-attention.

    Both sub-layers are wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        emb_dim: Model width.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, emb_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.self_attn = PerformerAttention(emb_dim, num_heads)
        self.ff = PositionwiseFeedForward(emb_dim, ff_dim)
        self.norm1 = nn.LayerNorm(emb_dim)  # after self-attention
        self.norm2 = nn.LayerNorm(emb_dim)  # after the feed-forward network
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Return the encoded representation of ``x``; ``mask`` is passed to self_attn."""
        attended = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))

        transformed = self.ff(x)
        return self.norm2(x + self.dropout(transformed))

class DecoderLayerPerformer(nn.Module):
    """Decoder block whose self- and cross-attention are PerformerAttention.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        emb_dim: Model width.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, emb_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.self_attn = PerformerAttention(emb_dim, num_heads)   # over the target sequence
        self.cross_attn = PerformerAttention(emb_dim, num_heads)  # over the encoder output
        self.ff = PositionwiseFeedForward(emb_dim, ff_dim)
        self.norm1 = nn.LayerNorm(emb_dim)  # after self-attention
        self.norm2 = nn.LayerNorm(emb_dim)  # after cross-attention
        self.norm3 = nn.LayerNorm(emb_dim)  # after the feed-forward network
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        """Decode ``x`` against ``enc_output``.

        ``tgt_mask`` is passed to ``self_attn`` and ``src_mask`` to
        ``cross_attn``.
        """
        self_ctx = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_ctx))

        # Queries come from the decoder, keys and values from the encoder.
        cross_ctx = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(cross_ctx))

        ff_out = self.ff(x)
        return self.norm3(x + self.dropout(ff_out))

class TransformerEncoderPerformer(nn.Module):
    """Token embedding + positional encoding followed by EncoderLayerPerformer blocks.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_dim: Model width.
        num_layers: Number of stacked encoder layers.
        num_heads: Attention heads per layer.
        ff_dim: Hidden width of each feed-forward network.
        max_len: Number of positions precomputed by the positional encoding.
    """

    def __init__(self, vocab_size, emb_dim, num_layers, num_heads, ff_dim, max_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.pos_encoding = PositionalEncoding(emb_dim, max_len)
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(EncoderLayerPerformer(emb_dim, num_heads, ff_dim))

    def forward(self, src, mask=None):
        """Encode token ids ``src``; ``mask`` is forwarded to every layer."""
        hidden = self.pos_encoding(self.embedding(src))
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, mask)
        return hidden

class TransformerDecoderPerformer(nn.Module):
    """Target embedding, DecoderLayerPerformer blocks and a vocabulary projection.

    Args:
        vocab_size: Target vocabulary size (embedding rows and logit width).
        emb_dim: Model width.
        num_layers: Number of stacked decoder layers.
        num_heads: Attention heads per layer.
        ff_dim: Hidden width of each feed-forward network.
        max_len: Number of positions precomputed by the positional encoding.
    """

    def __init__(self, vocab_size, emb_dim, num_layers, num_heads, ff_dim, max_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.pos_encoding = PositionalEncoding(emb_dim, max_len)
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(DecoderLayerPerformer(emb_dim, num_heads, ff_dim))
        self.fc_out = nn.Linear(emb_dim, vocab_size)  # hidden state -> vocabulary logits

    def forward(self, tgt, enc_output, src_mask=None, tgt_mask=None):
        """Return vocabulary logits for every position of ``tgt``."""
        hidden = self.pos_encoding(self.embedding(tgt))
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, enc_output, src_mask, tgt_mask)
        return self.fc_out(hidden)

class TransformerPerformer(nn.Module):
    """Encoder-decoder model built from the Performer encoder and decoder stacks.

    Args:
        src_vocab_size: Vocabulary size of the encoder input.
        tgt_vocab_size: Vocabulary size of the decoder input/output.
        emb_dim, num_layers, num_heads, ff_dim, max_len: Passed unchanged to
            both the encoder and the decoder.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, emb_dim, num_layers, num_heads, ff_dim, max_len):
        super().__init__()
        shared_args = (emb_dim, num_layers, num_heads, ff_dim, max_len)
        self.encoder = TransformerEncoderPerformer(src_vocab_size, *shared_args)
        self.decoder = TransformerDecoderPerformer(tgt_vocab_size, *shared_args)

    def make_subsequent_mask(self, size):
        """Return a (1, 1, size, size) lower-triangular mask of ones."""
        lower_triangle = torch.tril(torch.ones(size, size))
        return lower_triangle[None, None]

    def forward(self, src, tgt, src_mask=None):
        """Encode ``src`` and decode ``tgt`` against it, returning the decoder logits."""
        memory = self.encoder(src, src_mask)
        # Lower-triangular mask built for the target length, moved to tgt's device.
        causal_mask = self.make_subsequent_mask(tgt.size(1)).to(tgt.device)
        return self.decoder(tgt, memory, src_mask, causal_mask)


## 3. Data Preparation

### 1. Tokenizer

Dataset neo4j/text2cypher-2025v1 digunakan untuk mengubah teks alami menjadi query Cypher. Tokenisasi dilakukan dengan T5Tokenizer dari model t5-base, yang mengubah teks menjadi token numerik. Token khusus seperti pad_token_id, eos_token_id, dan sos_token_id disiapkan untuk mengatur struktur input dan output sesuai kebutuhan model encoder-decoder.

In [None]:
from transformers import T5Tokenizer

# Text-to-Cypher corpus from the Hugging Face hub.
dataset = load_dataset("neo4j/text2cypher-2025v1")

# SentencePiece tokenizer of the pretrained t5-base checkpoint.
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Ids of the special tokens. The start-of-sequence id is looked up from the
# "<pad>" token, so it is the same id as the padding token.
pad_token_id, eos_token_id = tokenizer.pad_token_id, tokenizer.eos_token_id
sos_token_id = tokenizer.convert_tokens_to_ids("<pad>")


Fungsi batch_preprocess digunakan untuk memproses data dari dataset text2cypher agar dapat digunakan dalam pelatihan model berbasis T5. Fungsi ini menangani dua skenario: mode batch dan mode non batch. Untuk setiap pasangan question, schema, dan cypher, teks input dibentuk dalam format terstruktur dan ditokenisasi dengan panjang maksimum 512 token. Tokenisasi dilakukan baik untuk input maupun label (Cypher query), dengan padding dan truncation otomatis. Fungsi juga menghitung panjang input dan label sebelum padding. Hasil akhirnya berupa dictionary berisi token yang telah diproses dan dapat langsung digunakan sebagai input model.

In [None]:
def _encode_text2cypher_example(question, schema, cypher):
    """Tokenize one (question, schema, cypher) triple into model features."""
    schema = schema or ""  # a missing schema must not be rendered as "None"
    input_text = (
        f"translate question to cypher:\n"
        f"<question> {question} </question>\n"
        f"<schema> {schema} </schema>"
    )

    input_enc = tokenizer(input_text, max_length=512, truncation=True, padding="max_length")
    # The EOS token is appended explicitly, exactly as the batched path (the
    # one used by dataset.map) has always done.
    # NOTE(review): presumably the T5 tokenizer does not add a second EOS when
    # one is already present -- confirm against the tokenizer in use.
    label_enc = tokenizer(cypher + tokenizer.eos_token, max_length=512, truncation=True, padding="max_length")

    input_enc["labels"] = label_enc["input_ids"]
    # Token counts without padding or truncation.
    input_enc["input_length"] = len(tokenizer(input_text)["input_ids"])
    input_enc["label_length"] = len(tokenizer(cypher)["input_ids"])
    return input_enc


def batch_preprocess(example):
    """Tokenize text2cypher rows for the encoder-decoder models.

    Works both on a single row (values are scalars) and on a batch as passed by
    ``dataset.map(..., batched=True)`` (values are lists). Both modes now go
    through the same helper, so they build the prompt and the labels
    identically; previously only the batched path appended EOS to the labels
    and replaced a ``None`` schema with an empty string.

    Args:
        example: Mapping with "question", "schema" and "cypher" entries.

    Returns:
        The tokenizer output extended with "labels", "input_length" and
        "label_length". In batch mode each value is a list with one entry per
        row; an empty batch yields ``{}``.
    """
    if isinstance(example["question"], list):
        encoded = [
            _encode_text2cypher_example(question, schema, cypher)
            for schema, question, cypher in zip(example["schema"], example["question"], example["cypher"])
        ]

        if not encoded:
            print("Seluruh batch difilter.")
            return {}

        # List of per-row dicts -> dict of per-column lists.
        return {key: [row[key] for row in encoded] for key in encoded[0]}

    # Single example mode
    return _encode_text2cypher_example(example["question"], example.get("schema"), example["cypher"])


Baris kode tokenized_dataset = dataset.map(...) digunakan untuk melakukan praproses seluruh dataset menggunakan fungsi batch_preprocess. Proses ini dilakukan dalam bentuk batch dengan ukuran 32 dan dijalankan secara paralel menggunakan 4 proses (num_proc=4) untuk efisiensi waktu. Fungsi map akan menerapkan tokenisasi dan format input-output model ke setiap contoh dalam dataset, menghasilkan dataset baru yang telah siap untuk dilatih dalam pipeline model Transformer seperti T5.

In [None]:
# Run batch_preprocess over the dataset. batched=True makes map() hand the
# function lists of up to batch_size rows, and num_proc=4 spreads the work
# over four worker processes.
tokenized_dataset = dataset.map(
    batch_preprocess,
    batched=True,
    batch_size=32,
    num_proc=4
)

### 2. Dataloader

HuggingfaceCypherDataset adalah class wrapper untuk dataset Huggingface agar dapat digunakan oleh torch.utils.data.DataLoader.

In [None]:
class HuggingfaceCypherDataset(Dataset):
    """Adapter exposing tokenized rows as ``(input_ids, labels)`` tensor pairs.

    Args:
        hf_dataset: Indexable collection whose items provide "input_ids" and
            "labels" sequences of token ids.
    """

    def __init__(self, hf_dataset):
        self.data = hf_dataset

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return row ``idx`` as two ``torch.long`` tensors."""
        row = self.data[idx]
        return tuple(
            torch.tensor(row[field], dtype=torch.long)
            for field in ("input_ids", "labels")
        )

Kode ini mengatur proses pembentukan data loader untuk pelatihan dan pengujian menggunakan PyTorch. Dataset yang telah ditokenisasi terlebih dahulu dibungkus dengan HuggingfaceCypherDataset agar sesuai dengan DataLoader. Data pelatihan (train_data) dimuat dalam train_loader dengan opsi shuffle=True untuk memastikan data teracak setiap epoch, sedangkan data pengujian (test_data) dimuat dalam test_loader tanpa pengacakan. Ukuran batch ditentukan sebesar 32 sampel untuk setiap iterasi pelatihan dan evaluasi.

In [None]:
# Number of examples per batch for both loaders.
BATCH_SIZE = 32

# Training split: reshuffled on every pass over the loader.
train_data = HuggingfaceCypherDataset(tokenized_dataset["train"])
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

# Test split: kept in its original order.
test_data = HuggingfaceCypherDataset(tokenized_dataset["test"])
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

## 4. Training and Evaluation Utils

### 1. Training and evaluation

**evaluate(model, dataloader, criterion, device, write_samples=False)**

Fungsi ini digunakan untuk mengevaluasi performa model pada data validasi atau pengujian.

* Model diset ke mode evaluasi (model.eval()), dan torch.no_grad() digunakan untuk menonaktifkan perhitungan gradien.

* Untuk setiap batch, input (src) dan target (tgt) dipindahkan ke device (CPU/GPU).

* tgt dipisah menjadi tgt_input (semua token kecuali terakhir) dan tgt_output (semua token kecuali pertama).

* Model menghasilkan output berdasarkan src dan tgt_input, lalu loss dihitung.

* Prediksi dievaluasi menggunakan argmax, dan akurasi dihitung dengan membandingkan prediksi dan target (tanpa memperhitungkan padding).

* Nilai rata-rata loss dan akurasi keseluruhan dikembalikan.


**train(model, dataloader, optimizer, criterion, device)**

* Fungsi ini digunakan untuk melatih model selama satu epoch.

* Model diset ke mode training (model.train()).

* Untuk setiap batch, src dan tgt diproses serupa dengan evaluasi.

* Optimizer di-reset (optimizer.zero_grad()), kemudian model melakukan forward pass.

* Loss dihitung dan dilakukan backward pass (loss.backward()), lalu parameter diperbarui (optimizer.step()).

* Akurasi dihitung berdasarkan token yang benar (mengabaikan padding).

* Loop menggunakan tqdm untuk menampilkan progres batch, termasuk loss per batch.

* Rata-rata loss dan akurasi epoch dikembalikan di akhir.

In [None]:
from nltk.tokenize import word_tokenize

def evaluate(model, dataloader, criterion, device,write_samples=False):
    """Compute teacher-forced loss and token accuracy over ``dataloader``.

    For every batch the target is split into a decoder input (all tokens but
    the last) and the expected output (all tokens but the first).

    Args:
        model: Callable as ``model(src, tgt_input)`` returning logits.
        dataloader: Yields ``(src, tgt)`` batches of token ids.
        criterion: Loss applied to the flattened logits and targets.
        device: Device the batches are moved to.
        write_samples: Accepted for backward compatibility; currently unused.

    Returns:
        ``(avg_loss, accuracy)``: the mean of the per-batch losses and the
        fraction of non-padding target tokens predicted correctly. For an
        empty dataloader the loss is NaN (so it can never be mistaken for a
        new best loss) and the accuracy is 0.0.
    """
    model.eval()
    pad_id = tokenizer.pad_token_id
    num_batches = len(dataloader)
    total_loss = 0.0
    total_tokens = 0
    correct_tokens = 0

    loop = tqdm(dataloader, total=num_batches, desc="Validating", leave=False)
    with torch.no_grad():
        for src, tgt in loop:
            src, tgt = src.to(device), tgt.to(device)

            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            output = model(src, tgt_input)
            loss = criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1))
            total_loss += loss.item()

            # Accuracy is measured on real tokens only; padding is excluded.
            preds = output.argmax(dim=-1)
            mask = (tgt_output != pad_id)
            correct = (preds == tgt_output) & mask
            correct_tokens += correct.sum().item()
            total_tokens += mask.sum().item()

    avg_loss = total_loss / num_batches if num_batches > 0 else float("nan")
    accuracy = correct_tokens / total_tokens if total_tokens > 0 else 0.0
    return avg_loss, accuracy



def train(model, dataloader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``dataloader`` with teacher forcing.

    Args:
        model: Callable as ``model(src, tgt_input)`` returning logits.
        dataloader: Yields ``(src, tgt)`` batches of token ids.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss applied to the flattened logits and targets.
        device: Device the batches are moved to.

    Returns:
        ``(avg_loss, accuracy)``: the mean of the per-batch losses and the
        fraction of non-padding target tokens predicted correctly.
    """
    model.train()
    n_batches = len(dataloader)
    loss_sum = 0
    n_correct = 0
    n_scored = 0

    progress = tqdm(enumerate(dataloader), total=n_batches, desc="Training", leave=False)

    for step, (src, tgt) in progress:
        src = src.to(device)
        tgt = tgt.to(device)

        # Decoder sees tokens [0, n-1) and is trained to emit tokens [1, n).
        decoder_in, target = tgt[:, :-1], tgt[:, 1:]

        optimizer.zero_grad()
        logits = model(src, decoder_in)

        loss = criterion(logits.reshape(-1, logits.size(-1)), target.reshape(-1))
        loss.backward()
        optimizer.step()

        # Token accuracy over non-padding positions.
        not_pad = target != tokenizer.pad_token_id
        hits = (logits.argmax(dim=-1) == target) & not_pad
        n_correct += hits.sum().item()
        n_scored += not_pad.sum().item()

        batch_loss = loss.item()
        loss_sum += batch_loss
        progress.set_description(f"Training [Batch {step+1}/{n_batches}]")
        progress.set_postfix(batch_loss=batch_loss)

    avg_loss = loss_sum / n_batches
    accuracy = n_correct / n_scored if n_scored > 0 else 0.0
    return avg_loss, accuracy

### 2. Train With Wandb

Fungsi train_with_wandb digunakan untuk melatih model Transformer dengan konfigurasi tertentu, melakukan pencatatan metrik ke platform Weights & Biases (WandB), dan menyimpan model terbaik berdasarkan nilai validation loss terendah. Proses pelatihan dilakukan dengan optimisasi Adam dan fungsi loss CrossEntropy, serta mendukung multiple GPU dengan DataParallel. Setiap epoch, fungsi ini mencatat metrik akurasi dan loss untuk data pelatihan dan validasi, serta menerapkan early stopping jika tidak ada peningkatan setelah sejumlah epoch tertentu. Model terbaik disimpan dan diunggah ke WandB sebagai artifact.

In [None]:
def train_with_wandb(config, run_name,transformer):
    """Train one model configuration and log it to Weights & Biases.

    Builds ``transformer`` from ``config``, trains it on ``train_loader`` with
    Adam and a padding-ignoring cross-entropy loss, evaluates on
    ``test_loader`` after every epoch, saves and uploads the checkpoint each
    time the validation loss improves, and stops early after ``patience``
    epochs without improvement.

    Args:
        config: Dict with 'emb_dim', 'num_layers', 'num_heads', 'ff_dim',
            'max_len', 'lr', 'epochs' and optionally 'patience' (default 3).
        run_name: W&B run name; also used in the checkpoint file name.
        transformer: Model class accepting the keyword arguments passed below.
    """
    import wandb
    wandb.init(project="cypher-transformer-generation-variant", config=config, name=run_name,reinit=True)
    # try/finally so the W&B run is always closed, even when training fails;
    # otherwise the next configuration would start on top of a dangling run.
    try:
        patience = config.get("patience", 3)
        patience_counter = 0
        # === Setup Model ===
        model = transformer(
            src_vocab_size=tokenizer.vocab_size,
            tgt_vocab_size=tokenizer.vocab_size,
            emb_dim=config['emb_dim'],
            num_layers=config['num_layers'],
            num_heads=config['num_heads'],
            ff_dim=config['ff_dim'],
            max_len=config['max_len']
        )

        if torch.cuda.device_count() > 1:
            model = nn.DataParallel(model)

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])
        criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

        best_val_loss = float('inf')
        best_model_path = f"best_model_{run_name}.pt"

        for epoch in range(config['epochs']):
            print(">>> Starting training step...", flush=True)
            train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
            print(">>> Finished training step...", flush=True)
            print(">>> Starting evaluation step...", flush=True)
            val_loss, val_acc = evaluate(model, test_loader, criterion, device)

            print(">>> Finished evaluation step...", flush=True)

            wandb.log({
                "epoch": epoch + 1,
                "train_loss": train_loss,
                "train_acc": train_acc,
                "val_loss": val_loss,
                "val_acc": val_acc,
            })

            print(f"[{run_name}] Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Acc: {val_acc:.2%} ")

            # Save best model only
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                # Save the wrapped module's weights so checkpoint keys carry no
                # "module." prefix and look the same on one or several GPUs.
                to_save = model.module if isinstance(model, nn.DataParallel) else model
                torch.save(to_save.state_dict(), best_model_path)
                wandb.run.summary["best_val_loss"] = val_loss
                wandb.run.summary["best_model_path"] = best_model_path
                artifact = wandb.Artifact(name=f"{run_name}_best_model", type="model")
                artifact.add_file(best_model_path)
                wandb.log_artifact(artifact)
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break
    finally:
        wandb.finish()


### 3. Model Configuration

Enam konfigurasi model dilatih secara berurutan dan dimonitor menggunakan WandB.

In [None]:
# Six training configurations: three sizes of the vanilla Transformer followed
# by the same three sizes of the Performer variant. Every dict is passed to
# train_with_wandb as `config`; "model_name" becomes the run name and
# "transformer" is the model class to instantiate.
# (The entries for vanila_transformerV1 and variant_transformerV1 were missing
# a comma after "model_name", which made this literal a SyntaxError.)
model_configs = [
    {
        "emb_dim": 128,
        "num_layers": 2,
        "num_heads": 4,
        "ff_dim": 256,
        "max_len": 512,
        "lr": 1e-4,
        "epochs": 40,
        "model_name": "vanila_transformerV1",
        "transformer": Transformer,
    },
    {
        "emb_dim": 128,
        "num_layers": 4,
        "num_heads": 4,
        "ff_dim": 512,
        "max_len": 512,
        "lr": 1e-4,
        "epochs": 40,
        "model_name": "vanila_transformerV2",
        "transformer": Transformer,
    },
    {
        "emb_dim": 128,
        "num_layers": 8,
        "num_heads": 4,
        "ff_dim": 256,
        "max_len": 512,
        "lr": 1e-4,
        "epochs": 40,
        "model_name": "vanila_transformerV3",
        "transformer": Transformer,
    },
    {
        "emb_dim": 128,
        "num_layers": 2,
        "num_heads": 4,
        "ff_dim": 256,
        "max_len": 512,
        "lr": 1e-4,
        "epochs": 40,
        "model_name": "variant_transformerV1",
        "transformer": TransformerPerformer,
    },
    {
        "emb_dim": 128,
        "num_layers": 4,
        "num_heads": 4,
        "ff_dim": 512,
        "max_len": 512,
        "lr": 1e-4,
        "epochs": 40,
        "model_name": "variant_transformerV2",
        "transformer": TransformerPerformer,
    },
    {
        "emb_dim": 128,
        "num_layers": 8,
        "num_heads": 4,
        "ff_dim": 256,
        "max_len": 512,
        "lr": 1e-4,
        "epochs": 40,
        "model_name": "variant_transformerV3",
        "transformer": TransformerPerformer,
    },
]


## 5. Train the model

Melakukan proses training untuk 6 kombinasi model

In [None]:
# Train the configurations one after another; each entry doubles as the W&B
# config and supplies its own run name and model class.
for model in model_configs:
    config, run_name, transformer = model, model["model_name"], model["transformer"]
    train_with_wandb(config, run_name, transformer)

## 6. Evaluasi Model

Evaluasi model dilakukan dengan mengambil artifact dari model yang telah ditraining dan disimpan pada wandb. Evaluasi yang dilakukan menggunakan loss, accuracy, dan BLEU

In [None]:
import os

import wandb

# SECURITY: the W&B API key must not be hard-coded in the notebook. wandb reads
# it from the WANDB_API_KEY environment variable (set it outside the notebook,
# e.g. through the platform's secret store) or from an earlier wandb.login().
# The key that used to be written here has been published and should be revoked.
if not os.environ.get("WANDB_API_KEY"):
    print("WANDB_API_KEY is not set; wandb may ask for the key interactively.")
run = wandb.init()

# Display name -> fully qualified W&B artifact (entity/project/name:version)
# holding the best checkpoint of each training run.
models = {
    "vanila_v1": "fadhilelrizanda-student/cypher-transformer-generation-variant/vanila_transformerV1_best_model:v39",
    "vanila_v2": "fadhilelrizanda-student/cypher-transformer-generation-variant/vanila_transformerV2_best_model:v13",
    "vanila_v3": "fadhilelrizanda-student/cypher-transformer-generation-variant/vanila_transformerV3_best_model:v18",
    "variant_v1": "fadhilelrizanda-student/cypher-transformer-generation-variant/variant_transformerV1_best_model:v41",
    "variant_v2": "fadhilelrizanda-student/cypher-transformer-generation-variant/variant_transformerV2_best_model:v40",
    "variant_v3": "fadhilelrizanda-student/cypher-transformer-generation-variant/variant_transformerV3_best_model:v12",
}

# Display name -> local directory the artifact was downloaded to.
model_artifacts = {}
for name, artifact_path in models.items():
    artifact = run.use_artifact(artifact_path, type='model')
    model_artifacts[name] = artifact.download()



import torch
from collections import OrderedDict


def _evaluate_with_bleu(model, dataloader, criterion, device):
    """Score a model on loss, token accuracy, BLEU and forward-pass latency.

    The notebook's ``evaluate`` returns only ``(loss, accuracy)``, while this
    cell reports four metrics, so they are computed here in one pass.

    BLEU is a smoothed sentence-level BLEU between the decoded teacher-forced
    argmax prediction and the decoded reference, using whitespace tokens and
    ignoring padding positions.
    NOTE(review): this is not free-running (autoregressive) generation; confirm
    it matches the metric definition intended for the report.

    Returns:
        ``(avg_loss, accuracy, avg_bleu, avg_infer_time)`` where the last value
        is the mean wall-clock time of one forward pass per batch, in seconds.
    """
    smoothing = SmoothingFunction().method1
    pad_id = tokenizer.pad_token_id
    total_loss = 0.0
    total_bleu = 0.0
    total_time = 0.0
    correct_tokens = 0
    total_tokens = 0
    num_samples = 0
    num_batches = 0

    model.eval()
    with torch.no_grad():
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)
            tgt_input, tgt_output = tgt[:, :-1], tgt[:, 1:]

            # CUDA kernels run asynchronously; synchronise so that the timer
            # measures the forward pass itself and not just its launch.
            if device.type == "cuda":
                torch.cuda.synchronize()
            start = time.perf_counter()
            output = model(src, tgt_input)
            if device.type == "cuda":
                torch.cuda.synchronize()
            total_time += time.perf_counter() - start
            num_batches += 1

            total_loss += criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1)).item()

            preds = output.argmax(dim=-1)
            mask = tgt_output != pad_id
            correct_tokens += ((preds == tgt_output) & mask).sum().item()
            total_tokens += mask.sum().item()

            for pred_row, ref_row, keep in zip(preds, tgt_output, mask):
                reference = tokenizer.decode(ref_row[keep].tolist(), skip_special_tokens=True).split()
                if not reference:
                    continue  # nothing to compare against
                hypothesis = tokenizer.decode(pred_row[keep].tolist(), skip_special_tokens=True).split()
                total_bleu += sentence_bleu([reference], hypothesis, smoothing_function=smoothing)
                num_samples += 1

    avg_loss = total_loss / num_batches if num_batches > 0 else float("nan")
    accuracy = correct_tokens / total_tokens if total_tokens > 0 else 0.0
    avg_bleu = total_bleu / num_samples if num_samples > 0 else 0.0
    avg_infer_time = total_time / num_batches if num_batches > 0 else 0.0
    return avg_loss, accuracy, avg_bleu, avg_infer_time


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One entry per downloaded checkpoint. `models` and `model_configs` list the
# six runs in the same order, so zipping them pairs every artifact with the
# configuration (and model class) it was trained with. The checkpoint inside
# an artifact keeps the file name it was saved under during training,
# "best_model_<model_name>.pt", and lives in the directory returned by
# artifact.download(), so no machine-specific absolute path is needed.
model_config = [
    {
        "name": name,
        "path": os.path.join(model_artifacts[name], f"best_model_{cfg['model_name']}.pt"),
        "config": cfg,
    }
    for name, cfg in zip(models, model_configs)
]

for model_info in model_config:
    config = model_info["config"]

    # The configuration names its own model class, so vanilla and Performer
    # checkpoints cannot be mixed up by list position.
    model = config["transformer"](
        src_vocab_size=tokenizer.vocab_size,
        tgt_vocab_size=tokenizer.vocab_size,
        emb_dim=config["emb_dim"],
        num_layers=config["num_layers"],
        num_heads=config["num_heads"],
        ff_dim=config["ff_dim"],
        max_len=config["max_len"]
    )

    model_path = model_info["path"]
    raw_state_dict = torch.load(model_path, map_location=device)

    # Checkpoints saved from nn.DataParallel prefix every key with "module.";
    # strip it only where it is a real prefix.
    cleaned_state_dict = OrderedDict(
        (k[len("module."):] if k.startswith("module.") else k, v)
        for k, v in raw_state_dict.items()
    )

    model.load_state_dict(cleaned_state_dict)
    model.to(device)
    model.eval()

    criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

    avg_loss, accuracy, avg_bleu, avg_infer_time = _evaluate_with_bleu(
        model=model,
        dataloader=test_loader,
        criterion=criterion,
        device=device
    )

    print(f"📊 Evaluation Results for {model_info['name']}")
    print(f"- Loss         : {avg_loss:.4f}")
    print(f"- Accuracy     : {accuracy:.4f}")
    print(f"- Avg BLEU     : {avg_bleu:.4f}")
    print(f"- Inference Time (avg/batch): {avg_infer_time:.4f} seconds\n")




Berdasarkan training yang telah dilakukan didapatkan hasil sebagai berikut

📊 Evaluation Results for vanila_v1
- Loss         : 0.1841
- Accuracy     : 0.9460
- Avg BLEU     : 0.1758
- Inference Time (avg/batch): 0.0109 seconds

📊 Evaluation Results for vanila_v2
- Loss         : 0.3133
- Accuracy     : 0.9145
- Avg BLEU     : 0.1485
- Inference Time (avg/batch): 0.0224 seconds

📊 Evaluation Results for vanila_v3
- Loss         : 0.3189
- Accuracy     : 0.8947
- Avg BLEU     : 0.1984
- Inference Time (avg/batch): 0.0412 seconds

📊 Evaluation Results for variant_v1
- Loss         : 0.0635
- Accuracy     : 0.9820
- Avg BLEU     : 0.3242
- Inference Time (avg/batch): 0.0080 seconds

📊 Evaluation Results for variant_v2
- Loss         : 2.4396
- Accuracy     : 0.5385
- Avg BLEU     : 0.0608
- Inference Time (avg/batch): 0.0166 seconds

📊 Evaluation Results for variant_v3
- Loss         : 0.0089
- Accuracy     : 0.9972
- Avg BLEU     : 0.3854
- Inference Time (avg/batch): 0.0297 seconds