In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount recursively and print the full path of every file.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install wandb
!pip install nltk
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
nltk.download("punkt")

In [None]:
import wandb
import os

# SECURITY: never write the Weights & Biases API key into the notebook. Anything
# saved in a cell is published with every shared or committed version.
# NOTE(review): a key was hard-coded here previously; treat it as leaked and
# revoke it from the W&B account settings.
#
# wandb.login() reads the key from the WANDB_API_KEY environment variable when
# it is set (e.g. populated from the platform's secret store before this cell
# runs) and otherwise falls back to its interactive prompt.
if not os.environ.get("WANDB_API_KEY"):
    print("WANDB_API_KEY is not set; wandb.login() will ask for the key interactively.")
wandb.login()

In [None]:
import torch
import torch.nn as nn
import math
from tqdm import tqdm
from datasets import load_dataset
from transformers import T5Tokenizer
import time

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    A ``(1, max_len, emb_dim)`` table is precomputed once and stored as the
    non-trainable buffer ``pe``: even feature columns hold ``sin`` and odd
    columns hold ``cos`` of ``position * 10000^(-2i / emb_dim)``.

    Args:
        emb_dim: Size of the embedding (last) dimension. Both even and odd
            values are supported.
        max_len: Number of positions in the table; inputs passed to
            ``forward`` must not be longer than this.
    """

    def __init__(self, emb_dim, max_len=64):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, emb_dim, 2).float() * (-math.log(10000.0) / emb_dim))
        angles = position * div_term  # (max_len, ceil(emb_dim / 2))

        pe = torch.zeros(max_len, emb_dim)
        pe[:, 0::2] = torch.sin(angles)
        # For an odd emb_dim there is one fewer odd column than even columns,
        # so trim the angle table; without this the assignment fails with a
        # shape mismatch. For an even emb_dim the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : emb_dim // 2])
        pe = pe.unsqueeze(0)  # (1, max_len, emb_dim)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape (batch_size, seq_len, emb_dim).
        """
        return x + self.pe[:, :x.size(1)]


In [None]:
def scaled_dot_product(q, k, v, mask=None):
    """Scaled dot-product attention.

    Args:
        q, k, v: Query, key and value tensors whose last two dimensions are
            (seq_len, d_k); leading dimensions must be broadcast-compatible.
        mask: Optional tensor broadcastable to the score matrix. Positions
            where ``mask == 0`` are excluded from attention.

    Returns:
        A tuple ``(output, attn)``: the attention-weighted sum of ``v`` and the
        attention weights themselves.
    """
    d_k = q.size(-1)
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is None:
        attn = torch.softmax(scores, dim=-1)
    else:
        blocked = mask == 0
        scores = scores.masked_fill(blocked, float('-inf'))
        # A query whose keys are ALL masked would be a softmax over only -inf,
        # which yields NaN in both the output and the gradients. Give such rows
        # finite scores before the softmax ...
        fully_blocked = blocked.all(dim=-1, keepdim=True)
        scores = scores.masked_fill(fully_blocked, 0.0)
        attn = torch.softmax(scores, dim=-1)
        # ... and zero every masked weight afterwards. Rows with at least one
        # visible key are unchanged (their masked weights are already exactly
        # 0); fully masked rows become all-zero instead of NaN.
        attn = attn.masked_fill(blocked, 0.0)
    return torch.matmul(attn, v), attn


In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention built on ``scaled_dot_product``.

    Queries, keys and values are each linearly projected, split into
    ``num_heads`` heads of width ``emb_dim // num_heads``, attended per head,
    concatenated again and passed through a final linear layer.

    Args:
        emb_dim: Model width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
    """

    def __init__(self, emb_dim, num_heads):
        super().__init__()
        assert emb_dim % num_heads == 0
        self.d_k = emb_dim // num_heads
        self.num_heads = num_heads

        # Input projections for queries, keys and values, then the output mix.
        self.q_linear = nn.Linear(emb_dim, emb_dim)
        self.k_linear = nn.Linear(emb_dim, emb_dim)
        self.v_linear = nn.Linear(emb_dim, emb_dim)
        self.out = nn.Linear(emb_dim, emb_dim)

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, seq_len, emb_dim) into (batch, heads, seq_len, d_k)."""
        per_head = projected.view(batch_size, -1, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q, k, v: Tensors of shape (batch_size, seq_len, emb_dim).
            mask: Optional mask forwarded unchanged to ``scaled_dot_product``.

        Returns:
            Tensor of shape (batch_size, q_seq_len, emb_dim).
        """
        batch_size = q.size(0)

        q_heads = self._split_heads(self.q_linear(q), batch_size)
        k_heads = self._split_heads(self.k_linear(k), batch_size)
        v_heads = self._split_heads(self.v_linear(v), batch_size)

        # The per-head attention weights are not needed by any caller.
        context, _ = scaled_dot_product(q_heads, k_heads, v_heads, mask)

        # Put the head axis back next to d_k and flatten the two into emb_dim.
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(batch_size, -1, self.num_heads * self.d_k)
        return self.out(merged)


In [None]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied along the last dimension.

    Computes ``linear2(relu(linear1(x)))``, expanding ``emb_dim`` to ``ff_dim``
    and projecting back to ``emb_dim``.
    """

    def __init__(self, emb_dim, ff_dim):
        super().__init__()
        self.linear1 = nn.Linear(emb_dim, ff_dim)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(ff_dim, emb_dim)

    def forward(self, x):
        """Return the transformed tensor; only the last dimension is mixed."""
        hidden = self.linear1(x)
        activated = self.relu(hidden)
        return self.linear2(activated)


In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection) and then layer-normalised.
    """

    def __init__(self, emb_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(emb_dim, num_heads)
        self.ff = PositionwiseFeedForward(emb_dim, ff_dim)
        self.norm1 = nn.LayerNorm(emb_dim)
        self.norm2 = nn.LayerNorm(emb_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is passed to the self-attention."""
        # Sub-layer 1: self-attention with residual + norm.
        attended = self.self_attn(x, x, x, mask)
        after_attn = self.norm1(x + self.dropout(attended))

        # Sub-layer 2: position-wise feed-forward with residual + norm.
        transformed = self.ff(after_attn)
        return self.norm2(after_attn + self.dropout(transformed))


In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, feed-forward.

    Every sub-layer output goes through dropout, is added to its input
    (residual connection) and then layer-normalised.
    """

    def __init__(self, emb_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(emb_dim, num_heads)
        self.cross_attn = MultiHeadAttention(emb_dim, num_heads)
        self.ff = PositionwiseFeedForward(emb_dim, ff_dim)
        self.norm1 = nn.LayerNorm(emb_dim)
        self.norm2 = nn.LayerNorm(emb_dim)
        self.norm3 = nn.LayerNorm(emb_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        """Run the block.

        Args:
            x: Decoder-side activations.
            enc_output: Encoder output, used as keys and values for the
                cross-attention.
            src_mask: Mask applied in the cross-attention.
            tgt_mask: Mask applied in the decoder self-attention.
        """
        # Self-attention over the target sequence.
        self_attended = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attended))

        # Cross-attention: queries from the decoder, keys/values from the encoder.
        cross_attended = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(cross_attended))

        # Position-wise feed-forward.
        transformed = self.ff(x)
        return self.norm3(x + self.dropout(transformed))


In [None]:
class TransformerEncoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of EncoderLayers."""

    def __init__(self, vocab_size, emb_dim, num_layers, num_heads, ff_dim, max_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.pos_encoding = PositionalEncoding(emb_dim, max_len)

        blocks = []
        for _ in range(num_layers):
            blocks.append(EncoderLayer(emb_dim, num_heads, ff_dim))
        self.layers = nn.ModuleList(blocks)

    def forward(self, src, mask=None):
        """Encode token ids ``src``; ``mask`` is handed to every layer."""
        hidden = self.pos_encoding(self.embedding(src))
        for block in self.layers:
            hidden = block(hidden, mask)
        return hidden


In [None]:
class TransformerDecoder(nn.Module):
    """Token embedding + positional encoding, a stack of DecoderLayers, and a
    final linear projection to vocabulary logits."""

    def __init__(self, vocab_size, emb_dim, num_layers, num_heads, ff_dim, max_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.pos_encoding = PositionalEncoding(emb_dim, max_len)

        blocks = []
        for _ in range(num_layers):
            blocks.append(DecoderLayer(emb_dim, num_heads, ff_dim))
        self.layers = nn.ModuleList(blocks)

        self.fc_out = nn.Linear(emb_dim, vocab_size)

    def forward(self, tgt, enc_output, src_mask=None, tgt_mask=None):
        """Decode token ids ``tgt`` against ``enc_output``.

        Returns:
            Logits whose last dimension has size ``vocab_size``.
        """
        hidden = self.pos_encoding(self.embedding(tgt))
        for block in self.layers:
            hidden = block(hidden, enc_output, src_mask, tgt_mask)
        logits = self.fc_out(hidden)
        return logits


In [None]:
class Transformer(nn.Module):
    """Encoder-decoder model wiring a TransformerEncoder to a TransformerDecoder.

    Both halves are built with the same width, depth, head count, feed-forward
    size and maximum length; only the vocabulary sizes may differ.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, emb_dim, num_layers, num_heads, ff_dim, max_len):
        super().__init__()
        stack_args = (emb_dim, num_layers, num_heads, ff_dim, max_len)
        self.encoder = TransformerEncoder(src_vocab_size, *stack_args)
        self.decoder = TransformerDecoder(tgt_vocab_size, *stack_args)

    def make_subsequent_mask(self, size):
        """Return a (1, 1, size, size) lower-triangular mask of ones.

        Entry [.., i, j] is 1 when j <= i and 0 otherwise, so position i can
        only attend to itself and earlier positions.
        """
        lower_triangle = torch.ones(size, size).tril()
        return lower_triangle[None, None]  # (1, 1, tgt_len, tgt_len)

    def forward(self, src, tgt, src_mask=None):
        """Encode ``src`` and decode ``tgt`` under a causal mask.

        ``src_mask`` is used both by the encoder and by the decoder's
        cross-attention. Returns the decoder's output.
        """
        memory = self.encoder(src, src_mask)
        causal_mask = self.make_subsequent_mask(tgt.size(1)).to(tgt.device)
        return self.decoder(tgt, memory, src_mask, causal_mask)


In [None]:
# Load the "neo4j/text2cypher-2025v1" dataset through the Hugging Face `datasets` library.
# If access is denied, authenticate first, e.g. with `huggingface-cli login`.
dataset = load_dataset("neo4j/text2cypher-2025v1")

In [None]:
from transformers import T5Tokenizer

# Pretrained "t5-base" tokenizer; preprocess() uses it for both the input
# prompt and the Cypher target, so source and target share one vocabulary.
tokenizer = T5Tokenizer.from_pretrained("t5-base")
pad_token_id = tokenizer.pad_token_id
# Start-of-sequence id, looked up from the "<pad>" token: the pad token
# doubles as the start marker.
sos_token_id = tokenizer.convert_tokens_to_ids("<pad>")  # We can define <pad> as start
eos_token_id = tokenizer.eos_token_id


In [None]:
def preprocess(example, max_length=64):
    """Turn one dataset row into fixed-length model inputs and labels.

    The prompt is built from the row's optional "schema" and its "question";
    the target is the row's "cypher" query.

    The labels start with ``sos_token_id``. The training and evaluation loops
    feed ``labels[:, :-1]`` to the decoder and score against ``labels[:, 1:]``,
    so without a leading start token the first Cypher token would never be a
    prediction target and the decoder would have no symbol to start from.

    Args:
        example: Mapping with "question" and "cypher" keys and an optional
            "schema" key.
        max_length: Length that both the inputs and the labels are padded or
            truncated to. The labels include the start token in this count.

    Returns:
        The tokenizer output for the prompt with an added "labels" entry, a
        list of ``max_length`` token ids.
    """
    schema = example.get("schema", "")
    question = example["question"]
    cypher = example["cypher"]

    input_text = f"translate question to cypher: <schema> {schema} </schema> {question}"

    model_inputs = tokenizer(input_text, max_length=max_length, truncation=True, padding="max_length")
    # Reserve one slot for the start token so the labels keep length max_length.
    target = tokenizer(cypher, max_length=max_length - 1, truncation=True, padding="max_length")

    model_inputs["labels"] = [sos_token_id] + list(target["input_ids"])
    return model_inputs

In [None]:
# Apply preprocess() to every row, one example at a time (batched=False),
# adding the tokenized inputs and the "labels" field to each row.
tokenized_dataset = dataset.map(preprocess, batched=False)

In [None]:
from torch.utils.data import Dataset

class HuggingfaceCypherDataset(Dataset):
    """Map-style torch Dataset over rows that carry "input_ids" and "labels".

    Indexing returns an ``(input_ids, labels)`` pair of ``torch.long`` tensors.
    """

    _FIELDS = ("input_ids", "labels")

    def __init__(self, hf_dataset):
        self.data = hf_dataset

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data[idx]
        return tuple(torch.tensor(row[field], dtype=torch.long) for field in self._FIELDS)

In [None]:
from torch.utils.data import DataLoader
BATCH_SIZE = 32

# Training split: reshuffled every epoch.
train_data = HuggingfaceCypherDataset(tokenized_dataset["train"])
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

# Held-out split: fixed order. This loader must wrap test_data; wrapping
# train_data here would make every "validation" metric, the early stopping and
# the best-checkpoint selection run on the training data.
test_data = HuggingfaceCypherDataset(tokenized_dataset["test"])
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
from nltk.tokenize import word_tokenize

def compute_bleu(pred_str, ref_str):
    """Sentence-level BLEU of ``pred_str`` against the single reference ``ref_str``.

    Both strings are split with NLTK's ``word_tokenize``; the score is computed
    with ``SmoothingFunction().method1`` smoothing.
    """
    references = [word_tokenize(ref_str)]
    hypothesis = word_tokenize(pred_str)
    smoother = SmoothingFunction().method1
    return sentence_bleu(references, hypothesis, smoothing_function=smoother)

def evaluate(model, dataloader, criterion, device,write_samples=False, max_samples=5):
    """Teacher-forced evaluation of ``model`` over ``dataloader``.

    For every batch the decoder is fed ``tgt[:, :-1]`` and scored against
    ``tgt[:, 1:]``. Gradients are disabled and the model is put in eval mode.

    Args:
        model: Callable as ``model(src, tgt_input)`` returning logits of shape
            (batch, seq_len, vocab).
        dataloader: Iterable of ``(src, tgt)`` token-id tensor pairs.
        criterion: Loss called with flattened logits and flattened targets.
        device: Device the batches are moved to.
        write_samples: When true, collect example rows for inspection.
        max_samples: Maximum number of example rows to collect.

    Returns:
        ``(avg_loss, accuracy, avg_bleu, sample_logs)``:
        mean loss per batch, token accuracy over non-padding target positions,
        mean sentence BLEU per example, and a list of dicts with the keys
        "input", "ground_truth" and "prediction". All averages are 0.0 when the
        dataloader yields nothing.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    total_tokens = 0
    correct_tokens = 0
    total_bleu = 0.0
    num_samples = 0
    sample_logs = []

    with torch.no_grad():
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)

            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            output = model(src, tgt_input)
            loss = criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1))
            total_loss += loss.item()
            num_batches += 1

            # === Accuracy (padding positions excluded) ===
            preds = output.argmax(dim=-1)
            mask = (tgt_output != tokenizer.pad_token_id)
            correct = (preds == tgt_output) & mask
            correct_tokens += correct.sum().item()
            total_tokens += mask.sum().item()

            # === BLEU ===
            for i in range(src.size(0)):
                # Keep only positions where the target is a real token. The
                # loss ignores padded positions, so the model's output there is
                # arbitrary and would otherwise be decoded into the prediction
                # text and drag the BLEU score down.
                keep = mask[i]
                pred_ids = preds[i][keep].tolist()
                target_ids = tgt_output[i][keep].tolist()

                pred_text = tokenizer.decode(pred_ids, skip_special_tokens=True)
                ref_text = tokenizer.decode(target_ids, skip_special_tokens=True)

                total_bleu += compute_bleu(pred_text, ref_text)
                num_samples += 1

                # === Log up to max_samples examples ===
                if write_samples and len(sample_logs) < max_samples:
                    input_text = tokenizer.decode(src[i].tolist(), skip_special_tokens=True)
                    sample_logs.append({
                        "input": input_text,
                        "ground_truth": ref_text,
                        "prediction": pred_text
                    })

    # Guard every average the same way so an empty dataloader yields zeros
    # instead of a ZeroDivisionError.
    avg_loss = total_loss / num_batches if num_batches > 0 else 0.0
    accuracy = correct_tokens / total_tokens if total_tokens > 0 else 0.0
    avg_bleu = total_bleu / num_samples if num_samples > 0 else 0.0
    return avg_loss, accuracy, avg_bleu,sample_logs



def train(model, dataloader, optimizer, criterion, device):
    """Run one teacher-forced training epoch.

    For every batch the decoder is fed ``tgt[:, :-1]`` and trained to predict
    ``tgt[:, 1:]``; a tqdm progress bar shows the current batch loss.

    Args:
        model: Callable as ``model(src, tgt_input)`` returning logits of shape
            (batch, seq_len, vocab).
        dataloader: Sized iterable of ``(src, tgt)`` token-id tensor pairs.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called with flattened logits and flattened targets.
        device: Device the batches are moved to.

    Returns:
        ``(avg_loss, accuracy)``: mean loss per batch and token accuracy over
        non-padding target positions. Both are 0.0 for an empty dataloader.
    """
    model.train()
    total_loss = 0.0
    correct_tokens = 0
    total_tokens = 0
    num_batches = len(dataloader)

    loop = tqdm(enumerate(dataloader), total=num_batches, desc="Training", leave=False)

    for i, (src, tgt) in loop:
        src, tgt = src.to(device), tgt.to(device)

        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        optimizer.zero_grad()
        output = model(src, tgt_input)

        loss = criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1))
        loss.backward()
        optimizer.step()

        # === Accuracy Computation ===
        # Pure bookkeeping: no gradients are needed for it.
        with torch.no_grad():
            preds = output.argmax(dim=-1)  # (batch, seq_len)
            mask = (tgt_output != tokenizer.pad_token_id)
            correct = (preds == tgt_output) & mask
            correct_tokens += correct.sum().item()
            total_tokens += mask.sum().item()

        # === Loss and Logging ===
        # Read the scalar once; every .item() call forces a device sync.
        batch_loss = loss.item()
        total_loss += batch_loss
        loop.set_description(f"Training [Batch {i+1}/{num_batches}]")
        loop.set_postfix(batch_loss=batch_loss)

    # Guarded like the accuracy so an empty dataloader does not raise.
    avg_loss = total_loss / num_batches if num_batches > 0 else 0.0
    accuracy = correct_tokens / total_tokens if total_tokens > 0 else 0.0
    return avg_loss, accuracy

In [None]:
def train_with_wandb(config, run_name):
    """Train one Transformer configuration and track it in Weights & Biases.

    Builds the model from ``config``, trains on ``train_loader``, evaluates on
    ``test_loader`` after every epoch, keeps the checkpoint with the lowest
    evaluation loss, stops early after ``patience`` epochs without improvement,
    then reloads the best checkpoint and logs final metrics and a few example
    predictions.

    NOTE(review): the metrics are named "val_*" but are computed on
    ``test_loader``, which also drives early stopping and checkpoint selection.
    Confirm whether a separate validation split should be used.

    Args:
        config: Dict with the keys "emb_dim", "num_layers", "num_heads",
            "ff_dim", "max_len", "lr" and "epochs"; "patience" is optional and
            defaults to 3.
        run_name: Used as the W&B run name and id, in the checkpoint file name
            and in the artifact name.
    """
    import wandb
    wandb.init(project="cypher-transformer-generation", config=config, name=run_name,id=run_name)
    try:
        patience = config.get("patience", 3)
        patience_counter = 0
        # === Setup Model ===
        model = Transformer(
            src_vocab_size=tokenizer.vocab_size,
            tgt_vocab_size=tokenizer.vocab_size,
            emb_dim=config['emb_dim'],
            num_layers=config['num_layers'],
            num_heads=config['num_heads'],
            ff_dim=config['ff_dim'],
            max_len=config['max_len']
        )

        if torch.cuda.device_count() > 1:
            model = nn.DataParallel(model)

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])
        criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

        best_val_loss = float('inf')
        best_model_path = f"best_model_{run_name}.pt"
        checkpoint_saved = False

        for epoch in range(config['epochs']):
            train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
            val_loss, val_acc,val_bleu,_ = evaluate(model, test_loader, criterion, device)

            wandb.log({
                "epoch": epoch + 1,
                "train_loss": train_loss,
                "train_acc": train_acc,
                "val_loss": val_loss,
                "val_acc": val_acc,
                "val_bleu": val_bleu
            })

            print(f"[{run_name}] Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Acc: {val_acc:.2%} | BLEU: {val_bleu:.3f}")

            # Save best model only
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(model.state_dict(), best_model_path)
                checkpoint_saved = True
                wandb.run.summary["best_val_loss"] = val_loss
                wandb.run.summary["best_model_path"] = best_model_path
                artifact = wandb.Artifact(name=f"{run_name}_best_model", type="model")
                artifact.add_file(best_model_path)
                wandb.log_artifact(artifact)
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

        # Only reload when this run actually wrote a checkpoint (it may not
        # have, e.g. with zero epochs or a loss that never compared lower).
        # That way a stale file from an earlier run is never picked up, and a
        # missing file does not crash. map_location keeps the load working when
        # the checkpoint was written on a different device.
        if checkpoint_saved:
            model.load_state_dict(torch.load(best_model_path, map_location=device))
        model.eval()

        val_loss, val_acc, val_bleu, sample_logs = evaluate(model, test_loader, criterion, device,write_samples=True)

        # Final metrics are always logged; the example table is attached only
        # when there are rows to show.
        final_metrics = {
            "final_val_loss": val_loss,
            "final_val_acc": val_acc,
            "final_val_bleu": val_bleu,
        }
        if sample_logs:
            table = wandb.Table(columns=["input", "ground_truth", "prediction"])
            for row in sample_logs:
                table.add_data(row["input"], row["ground_truth"], row["prediction"])
            final_metrics["example_predictions"] = table
        wandb.log(final_metrics)
    except BaseException:
        # Close the run as failed so that an error or interrupt does not leave
        # it open and bleed into the next configuration's run.
        wandb.finish(exit_code=1)
        raise
    wandb.finish()


In [None]:
# Hyper-parameter sets to train, one W&B run per entry.
# Keys read by train_with_wandb(): emb_dim, num_layers, num_heads, ff_dim,
# max_len, lr, epochs (plus an optional "patience"). "model_name" is used by
# the driver loop as the run name.
# emb_dim must be divisible by num_heads (asserted in MultiHeadAttention), and
# max_len matches the sequence length of 64 that preprocess() pads/truncates to.
model_configs = [
    # V1: small baseline - 2 layers, 128-dim embeddings.
    {
        "emb_dim": 128,
        "num_layers": 2,
        "num_heads": 4,
        "ff_dim": 256,
        "max_len": 64,
        "lr": 1e-4,
        "epochs": 50,
        "model_name":"vanila_transformerV1"
    },
    # V2: deeper and wider feed-forward - 4 layers, ff_dim 512.
    {
        "emb_dim": 128,
        "num_layers": 4,
        "num_heads": 4,
        "ff_dim": 512,
        "max_len": 64,
        "lr": 1e-4,
        "epochs": 50,
        "model_name":"vanila_transformerV2"
    },
    # V3: narrow but deep - 8 layers, 64-dim embeddings, 8 heads, higher lr.
    {
        "emb_dim": 64,
        "num_layers": 8,
        "num_heads": 8,
        "ff_dim": 512,
        "max_len": 64,
        "lr": 1e-3,
        "epochs": 50,
        "model_name":"vanila_transformerV3"
    }
]


In [None]:
# Train every configuration in turn, using its "model_name" as the run name.
for config in model_configs:
    run_name = config["model_name"]
    train_with_wandb(config, run_name)
