In [None]:
# Install dependencies
!pip -q install -U sacrebleu transformers scikit-learn


In [2]:
!pip install torch-geometric



In [None]:
# Load dataset (KaggleHub)
import kagglehub
kagglehub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Kaggle credentials set.
Kaggle credentials successfully validated.


In [4]:


import os, sys, pickle

# Fetch the competition files; the returned path is used as the dataset root below.
comp_path = kagglehub.competition_download("molecular-graph-captioning")
print("comp_path:", comp_path)

# Baseline code lives in <comp_path>/data_baseline, the pickled graphs in its "data" subfolder.
BASELINE_DIR = os.path.join(comp_path, "data_baseline")
DATA_DIR     = os.path.join(BASELINE_DIR, "data")

# The path must be appended BEFORE the import: data_utils is loaded from BASELINE_DIR.
# x_map / e_map are used later to size the bag-of-words histograms.
sys.path.append(BASELINE_DIR)
from data_utils import x_map, e_map

# One pickle file per split.
TRAIN_PKL = os.path.join(DATA_DIR, "train_graphs.pkl")
VAL_PKL   = os.path.join(DATA_DIR, "validation_graphs.pkl")
TEST_PKL  = os.path.join(DATA_DIR, "test_graphs.pkl")

def load_pkl(path):
    """Open ``path`` in binary mode and return the object unpickled from it.

    NOTE(review): unpickling can run arbitrary code, so only point this at
    files from a source you trust.
    """
    handle = open(path, "rb")
    try:
        return pickle.load(handle)
    finally:
        handle.close()

# Load the three splits.
# NOTE(review): these pickles come from the downloaded competition bundle;
# pickle.load executes code embedded in the file, so the source must be trusted.
train_graphs = load_pkl(TRAIN_PKL)
val_graphs   = load_pkl(VAL_PKL)
test_graphs  = load_pkl(TEST_PKL)

# Sanity checks: split sizes, which optional attributes the first training graph
# carries, and the tensor shapes of its node / edge features and connectivity.
print("Loaded:", len(train_graphs), len(val_graphs), len(test_graphs))
print("Fields present:", [k for k in ["x","edge_index","edge_attr","smiles","description","id","idx"] if hasattr(train_graphs[0], k)])
print("x:", train_graphs[0].x.shape, "edge_attr:", train_graphs[0].edge_attr.shape, "edge_index:", train_graphs[0].edge_index.shape)


comp_path: /root/.cache/kagglehub/competitions/molecular-graph-captioning
Loaded: 31008 1000 1000
Fields present: ['x', 'edge_index', 'edge_attr', 'description', 'id']
x: torch.Size([40, 9]) edge_attr: torch.Size([86, 3]) edge_index: torch.Size([2, 86])


In [None]:
# Build BOW vectors
import numpy as np
from tqdm import tqdm

# Field names, in the iteration order of the node / edge feature maps.
x_feature_names = [name for name in x_map.keys()]
e_feature_names = [name for name in e_map.keys()]

# Number of histogram bins per field = number of entries listed for that field.
x_sizes = [len(x_map[name]) for name in x_feature_names]
e_sizes = [len(e_map[name]) for name in e_feature_names]

# Total vector length: all node-field bins followed by all edge-field bins.
BOW_DIM = sum(x_sizes) + sum(e_sizes)
print("BOW_DIM:", BOW_DIM)

def graph_to_bow(g):
    """Turn one graph into an L2-normalised histogram ("bag of words") vector.

    For every node field in ``x_sizes`` and every edge field in ``e_sizes`` the
    integer codes in the matching column of ``g.x`` / ``g.edge_attr`` are counted
    into a histogram with one bin per category. Codes are clipped into
    ``[0, size - 1]`` first; a column that the tensor does not have is treated as
    all zeros. When the graph has no usable ``edge_attr`` the edge histograms are
    all-zero. The concatenated histograms are divided by their Euclidean norm
    (plus 1e-12) and returned as a float32 array.
    """
    def _field_histograms(codes, sizes):
        # One count vector per field, in field order.
        hists = []
        for field, n_bins in enumerate(sizes):
            if field < codes.shape[1]:
                column = codes[:, field]
            else:
                column = np.zeros((codes.shape[0],), dtype=np.int64)
            column = np.clip(column, 0, n_bins - 1)
            hists.append(np.bincount(column, minlength=n_bins).astype(np.float32))
        return hists

    node_codes = g.x.detach().cpu().numpy().astype(np.int64)

    edge_codes = None
    edge_attr_usable = (
        hasattr(g, "edge_attr") and g.edge_attr is not None and g.edge_attr.numel() > 0
    )
    if edge_attr_usable:
        edge_codes = g.edge_attr.detach().cpu().numpy().astype(np.int64)

    pieces = _field_histograms(node_codes, x_sizes)
    if edge_codes is not None:
        pieces.extend(_field_histograms(edge_codes, e_sizes))
    else:
        pieces.extend(np.zeros((n_bins,), dtype=np.float32) for n_bins in e_sizes)

    vec = np.concatenate(pieces, axis=0)
    vec = vec / (np.linalg.norm(vec) + 1e-12)
    return vec.astype(np.float32)

print("Building BOW vectors...")
# One row per graph; the splits are processed in train, val, test order.
B_train, B_val, B_test = (
    np.stack([graph_to_bow(graph) for graph in tqdm(split)], axis=0)
    for split in (train_graphs, val_graphs, test_graphs)
)

print("Shapes:", B_train.shape, B_val.shape, B_test.shape)


BOW_DIM: 207
Building BOW vectors...


100%|██████████| 31008/31008 [00:08<00:00, 3850.20it/s] 
100%|██████████| 1000/1000 [00:00<00:00, 10753.33it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8546.18it/s]

Shapes: (31008, 207) (1000, 207) (1000, 207)





In [None]:
# Build BOW vectors (alternate)
# NOTE(review): this cell is a line-for-line copy of the preceding "Build BOW vectors"
# cell. It rebinds the same names (x_sizes, e_sizes, BOW_DIM, graph_to_bow, B_train,
# B_val, B_test) to equivalent values, so running it only repeats the computation.
import numpy as np
from tqdm import tqdm

# Field names in the iteration order of the feature maps.
x_feature_names = list(x_map.keys())
e_feature_names = list(e_map.keys())

# Bins per field, and the total histogram length (node bins + edge bins).
x_sizes = [len(x_map[k]) for k in x_feature_names]
e_sizes = [len(e_map[k]) for k in e_feature_names]
BOW_DIM = sum(x_sizes) + sum(e_sizes)
print("BOW_DIM:", BOW_DIM)

def graph_to_bow(g):
    """Return an L2-normalised float32 histogram vector describing graph ``g``.

    Counts the integer codes of each node field (columns of ``g.x``) and each edge
    field (columns of ``g.edge_attr``) into per-field histograms sized by
    ``x_sizes`` / ``e_sizes`` and concatenates them.
    """
    x = g.x.detach().cpu().numpy().astype(np.int64)
    e = None
    # Edge features are optional: missing, None, or empty all leave e as None.
    if hasattr(g, "edge_attr") and g.edge_attr is not None and g.edge_attr.numel() > 0:
        e = g.edge_attr.detach().cpu().numpy().astype(np.int64)

    parts = []

    # node histograms
    for j, sz in enumerate(x_sizes):
        # A field without a matching column is counted as all zeros.
        col = x[:, j] if j < x.shape[1] else np.zeros((x.shape[0],), dtype=np.int64)
        # Out-of-range codes are folded into the first / last bin.
        col = np.clip(col, 0, sz - 1)
        parts.append(np.bincount(col, minlength=sz).astype(np.float32))

    # edge histograms
    if e is None:
        # No edge features: contribute all-zero histograms so the length stays BOW_DIM.
        for sz in e_sizes:
            parts.append(np.zeros((sz,), dtype=np.float32))
    else:
        for j, sz in enumerate(e_sizes):
            col = e[:, j] if j < e.shape[1] else np.zeros((e.shape[0],), dtype=np.int64)
            col = np.clip(col, 0, sz - 1)
            parts.append(np.bincount(col, minlength=sz).astype(np.float32))

    v = np.concatenate(parts, axis=0)
    # The epsilon keeps the division defined for an all-zero vector.
    v = v / (np.linalg.norm(v) + 1e-12)
    return v.astype(np.float32)

print("Building BOW vectors...")
# One row per graph for each split.
B_train = np.stack([graph_to_bow(g) for g in tqdm(train_graphs)], axis=0)
B_val   = np.stack([graph_to_bow(g) for g in tqdm(val_graphs)], axis=0)
B_test  = np.stack([graph_to_bow(g) for g in tqdm(test_graphs)], axis=0)

print("Shapes:", B_train.shape, B_val.shape, B_test.shape)


BOW_DIM: 207
Building BOW vectors...


100%|██████████| 31008/31008 [00:02<00:00, 11048.57it/s]
100%|██████████| 1000/1000 [00:00<00:00, 10302.78it/s]
100%|██████████| 1000/1000 [00:00<00:00, 9272.44it/s]

Shapes: (31008, 207) (1000, 207) (1000, 207)





In [None]:
# Train dual retriever (GNN + text)
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

# Run on the GPU when one is visible to torch, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE:", DEVICE)

# Reproducibility: seed Python's, NumPy's and torch's (CPU and all CUDA devices) RNGs.
# NOTE(review): no deterministic-algorithm flags are set here, so GPU runs may still
# differ slightly between executions — confirm whether that matters.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# (Optional) keep installs in a separate setup cell on Kaggle
# !pip -q install -U torch-geometric pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-$(python -c "import torch;print(torch.__version__.split('+')[0])")+cu$(python -c "import torch;print(torch.version.cuda.replace('.',''))").html

from torch.utils.data import Dataset, DataLoader
from torch_geometric.data import Batch
from torch_geometric.nn import GINEConv, global_mean_pool
from transformers import AutoTokenizer, AutoModel

# Infer vocab sizes for categorical embeddings
def infer_vocabs_all(graphs):
    """Infer the vocabulary size of every categorical node and edge feature column.

    The size of a column is its largest observed code plus one, taken over all
    graphs, which makes it directly usable as ``num_embeddings`` of an embedding
    table.

    Args:
        graphs: iterable of graph objects with an integer tensor ``x`` of shape
            [num_nodes, num_node_fields] and, optionally, an integer tensor
            ``edge_attr`` of shape [num_edges, num_edge_fields].

    Returns:
        ``(x_vocabs, e_vocabs)``: two lists of ints, one entry per column.
        If no graph has edge features, ``e_vocabs`` falls back to ``[1, 1, 1]``
        (three edge columns are assumed in that case).

    Raises:
        ValueError: if no graph with at least one node was supplied, since the
            node vocabulary cannot be inferred then.
    """
    x_max = None
    e_max = None

    for g in graphs:
        # Skip graphs without nodes: reducing an empty tensor with max(dim=0) raises.
        x = getattr(g, "x", None)
        if x is not None and x.numel() > 0:
            col_max = x.max(dim=0).values
            x_max = col_max if x_max is None else torch.maximum(x_max, col_max)

        ea = getattr(g, "edge_attr", None)
        if ea is not None and ea.numel() > 0:
            col_max = ea.max(dim=0).values
            e_max = col_max if e_max is None else torch.maximum(e_max, col_max)

    if x_max is None:
        raise ValueError("infer_vocabs_all: no graph with node features was provided")

    if e_max is None:
        e_max = torch.zeros(3, dtype=torch.long)

    x_vocabs = (x_max + 1).tolist()
    e_vocabs = (e_max + 1).tolist()
    return [int(v) for v in x_vocabs], [int(v) for v in e_vocabs]

# Size the embedding tables from every split, not just the training graphs: the GNN
# is later used to embed the validation and test graphs too, and an embedding lookup
# fails for any code >= the table size. Only the input feature tensors are read here
# (never the captions), so including val/test does not expose targets.
x_vocabs, e_vocabs = infer_vocabs_all([*train_graphs, *val_graphs, *test_graphs])
print("x_vocabs:", x_vocabs)
print("e_vocabs:", e_vocabs)


# Categorical feature embedders
class CatFeatureEmbedder(nn.Module):
    """Embeds several categorical fields and adds the per-field embeddings together.

    ``vocabs`` gives the number of categories of each field; every field gets its
    own ``nn.Embedding`` table of width ``emb_dim``.
    """

    def __init__(self, vocabs, emb_dim):
        super().__init__()
        tables = [nn.Embedding(n_categories, emb_dim) for n_categories in vocabs]
        self.embs = nn.ModuleList(tables)

    def forward(self, x_int):  # [N, F] int
        # Column j of x_int is looked up in table j; the results are summed
        # (starting from the integer 0, which is what comes back if there are no tables).
        return sum(table(x_int[:, field]) for field, table in enumerate(self.embs))

# GNN encoder
class GNNEncoder(nn.Module):
    """Graph encoder: categorical embeddings -> stacked GINEConv layers -> mean pool.

    Args:
        x_vocabs: vocabulary size of each categorical node feature column.
        e_vocabs: vocabulary size of each categorical edge feature column.
        hidden: width of the node / edge embeddings and of every conv layer.
        out_dim: size of the returned graph embedding.
        layers: number of GINEConv layers.
        dropout: dropout probability applied after every layer (training only).
    """

    def __init__(self, x_vocabs, e_vocabs, hidden=256, out_dim=256, layers=4, dropout=0.1):
        super().__init__()
        self.node_emb = CatFeatureEmbedder(x_vocabs, hidden)
        self.edge_emb = CatFeatureEmbedder(e_vocabs, hidden)
        self.dropout = dropout

        self.convs = nn.ModuleList()
        self.norms = nn.ModuleList()
        for _ in range(layers):
            mlp = nn.Sequential(
                nn.Linear(hidden, hidden),
                nn.ReLU(),
                nn.Linear(hidden, hidden),
            )
            self.convs.append(GINEConv(mlp, edge_dim=hidden))
            self.norms.append(nn.LayerNorm(hidden))

        self.out = nn.Linear(hidden, out_dim)

    def forward(self, batch):
        """Encode a batch of graphs into L2-normalised embeddings, one row per graph."""
        # batch.x and batch.edge_attr are categorical indices
        x = self.node_emb(batch.x.long())

        edge_attr = getattr(batch, "edge_attr", None)
        if edge_attr is not None and edge_attr.numel() > 0:
            e = self.edge_emb(edge_attr.long())
        else:
            # The conv layers are built with edge_dim, so they always transform an
            # edge feature tensor; calling them without one fails. Supply one
            # all-zero feature row per edge (zero rows when the batch has no edges).
            e = x.new_zeros((batch.edge_index.size(1), x.size(-1)))

        for conv, ln in zip(self.convs, self.norms):
            x = conv(x, batch.edge_index, e)
            x = F.relu(x)
            x = ln(x)  # stability
            x = F.dropout(x, p=self.dropout, training=self.training)

        # Average node states per graph, project, and normalise to unit length.
        g = global_mean_pool(x, batch.batch)
        g = self.out(g)
        return F.normalize(g, dim=-1)

# Text encoder
class TextEncoder(nn.Module):
    """Text encoder: pretrained transformer, masked mean pooling, linear projection.

    Args:
        name: Hugging Face model id used for both the tokenizer and the encoder.
        out_dim: size of the returned text embedding.
    """

    def __init__(self, name="sentence-transformers/all-MiniLM-L6-v2", out_dim=256):
        super().__init__()
        self.tok = AutoTokenizer.from_pretrained(name)
        self.enc = AutoModel.from_pretrained(name)
        self.proj = nn.Linear(self.enc.config.hidden_size, out_dim)

    def forward(self, texts):
        """Encode ``texts`` (a list of strings) into L2-normalised rows of size ``out_dim``."""
        # Send the tokens to wherever this module's weights live instead of relying
        # on a notebook-level DEVICE global, which can be rebound by later cells or
        # disagree with the device the model was actually moved to.
        device = self.proj.weight.device
        batch = self.tok(
            texts, padding=True, truncation=True, max_length=128, return_tensors="pt"
        ).to(device)

        out = self.enc(**batch)
        # Mean over real tokens only: padding positions are zeroed by the mask and
        # excluded from the denominator (epsilon guards against an all-zero mask).
        attn = batch["attention_mask"].unsqueeze(-1)
        pooled = (out.last_hidden_state * attn).sum(1) / (attn.sum(1) + 1e-12)

        z = self.proj(pooled)
        return F.normalize(z, dim=-1)

class DualRetriever(nn.Module):
    """Holds the two towers of the retriever: ``gnn`` for graphs and ``txt`` for captions.

    Both towers emit embeddings of size ``dim``. The GNN hidden width is fixed at
    256 and the text tower always uses "sentence-transformers/all-MiniLM-L6-v2".
    """

    def __init__(self, x_vocabs, e_vocabs, dim=256, gnn_layers=4, dropout=0.1):
        super().__init__()
        gnn_options = dict(hidden=256, out_dim=dim, layers=gnn_layers, dropout=dropout)
        self.gnn = GNNEncoder(x_vocabs, e_vocabs, **gnn_options)
        self.txt = TextEncoder(name="sentence-transformers/all-MiniLM-L6-v2", out_dim=dim)

def info_nce(mol_z, txt_z, temp=0.07):
    """Symmetric contrastive loss over a batch of paired embeddings.

    Row ``i`` of ``mol_z`` is paired with row ``i`` of ``txt_z``. The pairwise
    dot products, divided by ``temp``, are used as logits for two cross-entropy
    terms (graph -> text and text -> graph) whose average is returned.
    """
    similarity = torch.matmul(mol_z, txt_z.t()) / temp
    targets = torch.arange(similarity.size(0), device=similarity.device)
    mol_to_txt = F.cross_entropy(similarity, targets)
    txt_to_mol = F.cross_entropy(similarity.t(), targets)
    return (mol_to_txt + txt_to_mol) / 2

# Dataset and dataloader
class RetrieverDS(Dataset):
    """Dataset of ``(graph, caption)`` pairs built on a sequence of graph objects.

    The caption is the graph's ``description`` attribute converted to ``str``;
    a missing or ``None`` description yields the empty string.
    """

    def __init__(self, graphs):
        self.graphs = graphs

    def __len__(self):
        return len(self.graphs)

    def __getitem__(self, i):
        graph = self.graphs[i]
        caption = getattr(graph, "description", "")
        if caption is None:
            return graph, ""
        return graph, str(caption)

def collate_retr(batch):
    """Collate ``(graph, text)`` pairs into one batched graph object plus a list of texts."""
    graph_list, text_list = map(list, zip(*batch))
    return Batch.from_data_list(graph_list), text_list

# Training setup
# Hyper-parameters of the dual retriever.
DIM = 256          # try 128 vs 256 if you want
GNN_LAYERS = 4     # recommended sweep: 3,4,5
DROPOUT = 0.1
BATCH = 128 if DEVICE == "cuda" else 64  # bigger batch = better contrastive negatives

# Model and optimizer; every parameter of both towers is optimised, including
# the pretrained text encoder's.
retr = DualRetriever(x_vocabs, e_vocabs, dim=DIM, gnn_layers=GNN_LAYERS, dropout=DROPOUT).to(DEVICE)
opt  = torch.optim.AdamW(retr.parameters(), lr=2e-4, weight_decay=1e-2)

# Shuffled batches of (batched graph, list of captions) built by collate_retr.
loader = DataLoader(
    RetrieverDS(train_graphs),
    batch_size=BATCH,
    shuffle=True,
    num_workers=0,
    collate_fn=collate_retr
)

EPOCHS = 3  # recommended 1–3; often 2 is sweet spot
retr.train()
for ep in range(EPOCHS):
    losses = []
    for pyg, texts in tqdm(loader, desc=f"Retriever epoch {ep+1}/{EPOCHS}"):
        pyg = pyg.to(DEVICE)

        # Embed both modalities; the other items in the batch act as negatives.
        mol_z = retr.gnn(pyg)
        txt_z = retr.txt(texts)
        loss = info_nce(mol_z, txt_z, temp=0.07)

        # Standard step, with the global gradient norm clipped to 1.0 before updating.
        opt.zero_grad(set_to_none=True)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(retr.parameters(), 1.0)
        opt.step()

        losses.append(loss.item())
    # Mean training loss over the epoch's batches.
    print(f"epoch {ep+1} | loss {float(np.mean(losses)):.4f}")

# Embed graphs using trained GNN
@torch.no_grad()
def embed_gnn(graphs, bs=256):
    """Embed ``graphs`` with the trained GNN tower, ``bs`` graphs at a time.

    Switches the global ``retr`` model to eval mode (and leaves it there), runs
    the graphs through ``retr.gnn`` on ``DEVICE`` and returns the stacked
    embeddings as one float32 NumPy array with a row per graph.
    """
    retr.eval()
    chunks = []
    batch_starts = range(0, len(graphs), bs)
    for start in tqdm(batch_starts, desc="Embed GNN"):
        mini_batch = Batch.from_data_list(graphs[start:start + bs]).to(DEVICE)
        embeddings = retr.gnn(mini_batch)
        chunks.append(embeddings.detach().cpu().numpy().astype(np.float32))
    return np.vstack(chunks)

# GNN embeddings for each split, computed in train, val, test order.
Z_train, Z_val, Z_test = (
    embed_gnn(split) for split in (train_graphs, val_graphs, test_graphs)
)

print("Z shapes:", Z_train.shape, Z_val.shape, Z_test.shape)


DEVICE: cuda
x_vocabs: [86, 3, 7, 10, 5, 5, 7, 2, 2]
e_vocabs: [13, 4, 2]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Retriever epoch 1/3: 100%|██████████| 243/243 [01:48<00:00,  2.24it/s]


epoch 1 | loss 2.1504


Retriever epoch 2/3: 100%|██████████| 243/243 [01:49<00:00,  2.21it/s]


epoch 2 | loss 0.8115


Retriever epoch 3/3: 100%|██████████| 243/243 [01:52<00:00,  2.16it/s]


epoch 3 | loss 0.5043


Embed GNN: 100%|██████████| 122/122 [00:02<00:00, 49.44it/s]
Embed GNN: 100%|██████████| 4/4 [00:00<00:00, 48.21it/s]
Embed GNN: 100%|██████████| 4/4 [00:00<00:00, 46.74it/s]

Z shapes: (31008, 256) (1000, 256) (1000, 256)





In [10]:
!pip -q install -U bert-score transformers


In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
import sacrebleu
import torch
from bert_score import score as bertscore

# Rebinds the notebook-level DEVICE (same expression as in the training cell) so this
# cell can run on its own; chemberta_bertscore_f1 passes it to BERTScore.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("BERTScore DEVICE:", DEVICE)

def chemberta_bertscore_f1(preds, refs, batch_size=32, max_length=128):
    """
    Mean BERTScore F1 of ``preds`` against ``refs`` using ChemBERTa-zinc-base-v1.

    ``num_layers`` is passed explicitly; per the original author's note this
    avoids a KeyError in the installed bert-score version, and 6 is the layer
    count of the base model.

    ``max_length`` is accepted for interface compatibility but is not used by
    this function.
    """
    scorer_options = dict(
        lang="en",
        model_type="seyonec/ChemBERTa-zinc-base-v1",
        num_layers=6,
        device=DEVICE,
        batch_size=batch_size,
        verbose=False,
        rescale_with_baseline=False,
        use_fast_tokenizer=True,
    )
    _precision, _recall, f1 = bertscore(preds, refs, **scorer_options)
    return float(f1.mean().cpu())

def eval_bleu_chembertscore(preds, refs, sample_size=2000, seed=42):
    """Score predictions with corpus BLEU and ChemBERTa BERTScore F1.

    When ``sample_size`` is set and smaller than the number of predictions, a
    random subset of that size (drawn without replacement from a generator
    seeded with ``seed``) is scored instead of the full set.

    Returns ``(bleu, bert_f1, comp)`` where ``bleu`` is on a 0..100 scale and
    ``comp`` is the equal-weight mean of ``bleu / 100`` and ``bert_f1``.
    """
    assert len(preds) == len(refs)

    use_subset = sample_size is not None and sample_size < len(preds)
    if use_subset:
        picks = np.random.default_rng(seed).choice(len(preds), size=sample_size, replace=False)
        preds_s = [preds[k] for k in picks]
        refs_s = [refs[k] for k in picks]
    else:
        preds_s, refs_s = preds, refs

    bleu = sacrebleu.corpus_bleu(preds_s, [refs_s]).score  # 0..100
    bert_f1 = chemberta_bertscore_f1(preds_s, refs_s, batch_size=32)

    return bleu, bert_f1, 0.5 * (bleu / 100.0) + 0.5 * bert_f1


# Setup and evaluation utilities
# Captions of the training graphs (the retrieval corpus) and the validation references.
train_caps = [str(getattr(graph, "description", "")) for graph in train_graphs]
val_refs = [str(getattr(graph, "description", "")) for graph in val_graphs]

# GNN embeddings are only used if an earlier cell produced a non-None Z_train.
have_gnn = globals().get("Z_train") is not None
print("have_gnn:", have_gnn)

# Hybrid retrieval (GNN + BOW)
def topk_retrieve(Q_bow, K_bow, Q_gnn=None, K_gnn=None, topk=64, alpha=0.8, chunk=1024):
    """Return the best-matching keys for every query under a blended similarity.

    The similarity is ``(1 - alpha) * Q_bow @ K_bow.T``; when both GNN matrices
    are given and ``alpha`` is positive, ``alpha * Q_gnn @ K_gnn.T`` is added.
    Queries are processed ``chunk`` rows at a time to bound memory use.

    Args:
        Q_bow, K_bow: query / key matrices with one row per item.
        Q_gnn, K_gnn: optional second pair of matrices, row-aligned with the first.
        topk: number of keys to return per query.
        alpha: weight of the GNN similarity.
        chunk: number of queries scored per step.

    Returns:
        ``(idx, scores)``: int64 key indices and float32 similarities, each of
        shape ``(n_queries, k)`` with ``k = min(topk, n_keys)``, sorted from
        best to worst within every row.
    """
    K_bow_T = K_bow.T
    use_gnn = (Q_gnn is not None) and (K_gnn is not None) and (alpha > 1e-12)
    if use_gnn:
        K_gnn_T = K_gnn.T

    nq = Q_bow.shape[0]
    n_keys = K_bow.shape[0]
    # argpartition needs kth < n_keys, so asking for at least as many neighbours
    # as there are keys used to raise; cap the result width instead.
    k = min(topk, n_keys)
    idx_out = np.zeros((nq, k), dtype=np.int64)
    sc_out  = np.zeros((nq, k), dtype=np.float32)

    for s in tqdm(range(0, nq, chunk), desc=f"Retrieve topk={topk} alpha={alpha}"):
        e = min(nq, s + chunk)
        sim = (1 - alpha) * (Q_bow[s:e] @ K_bow_T)
        if use_gnn:
            sim += alpha * (Q_gnn[s:e] @ K_gnn_T)

        if k < n_keys:
            # Unordered top-k selection; ordering happens below.
            part = np.argpartition(-sim, k, axis=1)[:, :k]
        else:
            # Every key is a candidate, so there is nothing to partition.
            part = np.broadcast_to(np.arange(n_keys), sim.shape)
        part_sc = np.take_along_axis(sim, part, axis=1)
        order = np.argsort(-part_sc, axis=1)
        idx_out[s:e] = np.take_along_axis(part, order, axis=1)
        sc_out[s:e]  = np.take_along_axis(part_sc, order, axis=1)

    return idx_out, sc_out

# TF-IDF text features
# Word-level TF-IDF (unigrams + bigrams) fitted on the training captions; row i of
# X_word belongs to train_caps[i], which is how pick_caption_tfidf indexes it.
word_tfidf = TfidfVectorizer(
    lowercase=True, ngram_range=(1,2),
    min_df=2, max_df=0.95,
    max_features=250000,
    sublinear_tf=True
)
X_word = word_tfidf.fit_transform(train_caps)

# Optional character n-gram TF-IDF (3- to 5-grams within word boundaries).
# NOTE: char_tfidf only exists when USE_CHAR is True; X_char is None otherwise and
# pick_caption_tfidf then relies on the word-level matrix alone.
USE_CHAR = True
if USE_CHAR:
    char_tfidf = TfidfVectorizer(
        lowercase=True, analyzer="char_wb", ngram_range=(3,5),
        min_df=2, max_df=0.95,
        max_features=200000,
        sublinear_tf=True
    )
    X_char = char_tfidf.fit_transform(train_caps)
else:
    X_char = None

def softmax(x, beta=10.0):
    """Softmax of ``beta * x`` computed in float32.

    The maximum is subtracted before exponentiating so the largest exponent is
    zero; a small epsilon in the denominator guards the division.
    """
    shifted = np.asarray(x, dtype=np.float32)
    shifted = shifted - shifted.max()
    weights = np.exp(beta * shifted)
    total = weights.sum() + 1e-12
    return weights / total

def normalize01(x):
    """Min-max scale ``x`` to the range [0, 1] as a float32 array.

    If all values are (numerically) equal, an all-zero array of the same shape
    is returned instead of dividing by zero.
    """
    values = np.asarray(x, dtype=np.float32)
    lowest = float(values.min())
    highest = float(values.max())
    if highest - lowest < 1e-12:
        return np.zeros_like(values)
    return (values - lowest) / (highest - lowest)

def dedup_by_caption(ids, scores, max_keep=64):
    """Collapse candidates that share a caption, keeping the best-scoring one.

    ``ids`` index into the global ``train_caps``. For each distinct caption the
    candidate with the highest score is kept (the first one wins a tie). The
    survivors are sorted by descending score and cut to ``max_keep``.

    Returns two parallel lists: the kept ids (ints) and their scores (floats).
    """
    top_per_caption = {}
    for raw_id, raw_score in zip(ids, scores):
        caption = train_caps[int(raw_id)]
        score = float(raw_score)
        incumbent = top_per_caption.get(caption)
        if incumbent is None or score > incumbent[1]:
            top_per_caption[caption] = (int(raw_id), score)

    ranked = sorted(top_per_caption.values(), key=lambda pair: -pair[1])[:max_keep]
    kept_ids = [row for row, _ in ranked]
    kept_scores = [score for _, score in ranked]
    return kept_ids, kept_scores

def pick_caption_tfidf(ids, scores, w_cons=0.8, w_graph=0.2, beta=10.0, max_unique=64):
    """Choose one caption among the retrieved candidates by TF-IDF consensus.

    Candidates are first de-duplicated by caption. Each remaining caption gets a
    consensus score: its TF-IDF similarity to every candidate caption, weighted
    by a softmax (sharpness ``beta``) over the retrieval scores. Word-level and,
    when available, character-level consensus are averaged. The final ranking
    mixes the min-max normalised consensus (``w_cons``) with the min-max
    normalised retrieval score (``w_graph``); the top caption is returned, or
    "" when there are no candidates.
    """
    ids, scores = dedup_by_caption(ids, scores, max_keep=max_unique)
    if len(ids) == 0:
        return ""

    cand = np.array(ids, dtype=np.int64)
    graph_scores = np.array(scores, dtype=np.float32)
    weights = softmax(graph_scores, beta=beta)

    def _weighted_agreement(tfidf):
        # (K, V) sparse rows -> small dense (K, K) similarity -> weighted row sums.
        rows = tfidf[cand]
        gram = (rows @ rows.T).toarray().astype(np.float32)
        return gram @ weights

    consensus = _weighted_agreement(X_word)
    if X_char is not None:
        consensus = 0.5 * (consensus + _weighted_agreement(X_char))

    blended = w_cons * normalize01(consensus) + w_graph * normalize01(graph_scores)
    winner = int(np.argmax(blended))
    return train_caps[int(cand[winner])]

# Confidence gate for choosing captions
def pick_with_gate(ids, scores,
                   top1_min=0.55, gap_min=0.03,
                   **tfidf_kwargs):
    """
    Pick a caption for one query from candidates sorted best -> worst.

    - No candidates: return "".
    - One candidate: return its caption.
    - Top score >= top1_min, or the lead over the second score >= gap_min:
      copy the top candidate's caption.
    - Otherwise: defer to pick_caption_tfidf (extra keyword arguments are
      forwarded to it).

    top1_min / gap_min are meant to be tuned on validation data.
    """
    n_candidates = len(ids)
    if n_candidates == 0:
        return ""
    if n_candidates == 1:
        return train_caps[int(ids[0])]

    best_score = float(scores[0])
    runner_up = float(scores[1])
    confident = best_score >= top1_min
    clear_lead = (best_score - runner_up) >= gap_min

    if not (confident or clear_lead):
        return pick_caption_tfidf(ids, scores, **tfidf_kwargs)
    return train_caps[int(ids[0])]

# Validation tuning
# Weight of the GNN similarity in the hybrid score; BOW-only when no GNN embeddings exist.
ALPHA = 0.8 if have_gnn else 0.0

# retrieve a bigger pool, then select
POOL = 64
v_idx, v_sc = topk_retrieve(B_val, B_train, Q_gnn=(Z_val if have_gnn else None), K_gnn=(Z_train if have_gnn else None),
                            topk=POOL, alpha=ALPHA, chunk=1024)

# (best composite score so far, configuration that produced it)
best = (-1, None)

# Grid search over the gate thresholds and the softmax sharpness (3 x 3 x 3 = 27
# configurations); each one runs a full BLEU + BERTScore evaluation.
# NOTE(review): the recorded output shows identical comp / BLEU / F1 for every
# configuration printed, which suggests the gate copies the top-1 caption for every
# validation query with these thresholds — TODO confirm and widen the grid.
# NOTE(review): sample_size=2000 exceeds the 1000 validation graphs reported when the
# data was loaded, so the whole validation set is scored.
for top1_min in [0.50, 0.55, 0.60]:
    for gap_min in [0.02, 0.03, 0.05]:
        for beta in [6.0, 10.0, 14.0]:
            preds = []
            for i in range(len(val_graphs)):
                preds.append(
                    pick_with_gate(
                        v_idx[i], v_sc[i],
                        top1_min=top1_min, gap_min=gap_min,
                        w_cons=0.8, w_graph=0.2, beta=beta, max_unique=64
                    )
                )

            bleu, bert_f1, comp = eval_bleu_chembertscore(preds, val_refs, sample_size=2000, seed=42)

            print(
                f"VAL comp={comp:.4f} | BLEU={bleu:.3f} | ChemBERTa-F1={bert_f1:.4f} "
                f"| top1_min={top1_min} gap_min={gap_min} beta={beta}"
            )

            # Strict '>' keeps the first configuration among ties.
            if comp > best[0]:
                best = (comp, dict(top1_min=top1_min, gap_min=gap_min, beta=beta,
                                   bleu=bleu, bert_f1=bert_f1))

print("BEST:", best)

# Rebind the loop variables to the winning configuration; the test-prediction code
# below reads top1_min, gap_min and beta from these names.
cfg = best[1]
top1_min = cfg["top1_min"]
gap_min  = cfg["gap_min"]
beta     = cfg["beta"]
print(f"Using: top1_min={top1_min}, gap_min={gap_min}, beta={beta} (BLEU={cfg['bleu']:.3f}, BERT={cfg['bert_f1']:.4f})")


# Predict on test set
# Same retrieval settings as for validation (POOL candidates, ALPHA blend), with the
# training graphs as the key set.
t_idx, t_sc = topk_retrieve(B_test, B_train, Q_gnn=(Z_test if have_gnn else None), K_gnn=(Z_train if have_gnn else None),
                            topk=POOL, alpha=ALPHA, chunk=1024)

# One caption per test graph, in test_graphs order, using the tuned gate settings.
test_preds = []
for i in tqdm(range(len(test_graphs)), desc="Predict test"):
    test_preds.append(
        pick_with_gate(
            t_idx[i], t_sc[i],
            top1_min=top1_min, gap_min=gap_min,
            w_cons=0.8, w_graph=0.2, beta=beta, max_unique=64
        )
    )

print("Example preds:", test_preds[:3])

# Write submission file
def get_graph_id(g, fallback):
    """Return an identifier for graph ``g``, or ``fallback`` if none is usable.

    The attributes "id", "idx", "graph_id" and "mol_id" are tried in that order.
    An integer value (Python or NumPy) is returned as ``int``; a string that is
    not blank is returned unchanged. Any other value is skipped.
    """
    for attr in ("id", "idx", "graph_id", "mol_id"):
        if not hasattr(g, attr):
            continue
        value = getattr(g, attr)
        if isinstance(value, (int, np.integer)):
            return int(value)
        if isinstance(value, str) and value.strip():
            return value
    return fallback

# Look for a sample-submission CSV anywhere under the competition folder: the first
# file whose lower-cased name ends in ".csv" and contains "sample" and "submission".
sample_path = None
for root, _, files in os.walk(comp_path):
    for f in files:
        fn = f.lower()
        if fn.endswith(".csv") and ("sample" in fn) and ("submission" in fn):
            sample_path = os.path.join(root, f)
            break
    if sample_path:
        break

if sample_path is not None:
    # Reuse the sample's layout and overwrite its second column (or "description"
    # when it has a single column) with the predictions.
    # NOTE(review): predictions are assigned by position, so this assumes the sample
    # rows are in the same order as test_graphs and equally many — TODO confirm.
    sample = pd.read_csv(sample_path)
    target_col = sample.columns[1] if len(sample.columns) > 1 else "description"
    sub = sample.copy()
    sub[target_col] = test_preds
else:
    # No sample file: build the frame from the graphs' own ids, falling back to the
    # position in test_graphs when a graph has no usable id.
    ids = [get_graph_id(g, i) for i, g in enumerate(test_graphs)]
    sub = pd.DataFrame({"id": ids, "description": test_preds})

sub.to_csv("submission.csv", index=False)
print("✅ Wrote submission.csv")
display(sub.head())


BERTScore DEVICE: cuda
have_gnn: True


Retrieve topk=64 alpha=0.8: 100%|██████████| 1/1 [00:01<00:00,  1.70s/it]


VAL comp=0.7318 | BLEU=48.668 | ChemBERTa-F1=0.9768 | top1_min=0.5 gap_min=0.02 beta=6.0
VAL comp=0.7318 | BLEU=48.668 | ChemBERTa-F1=0.9768 | top1_min=0.5 gap_min=0.02 beta=10.0
VAL comp=0.7318 | BLEU=48.668 | ChemBERTa-F1=0.9768 | top1_min=0.5 gap_min=0.02 beta=14.0
VAL comp=0.7318 | BLEU=48.668 | ChemBERTa-F1=0.9768 | top1_min=0.5 gap_min=0.03 beta=6.0
VAL comp=0.7318 | BLEU=48.668 | ChemBERTa-F1=0.9768 | top1_min=0.5 gap_min=0.03 beta=10.0
VAL comp=0.7318 | BLEU=48.668 | ChemBERTa-F1=0.9768 | top1_min=0.5 gap_min=0.03 beta=14.0
VAL comp=0.7318 | BLEU=48.668 | ChemBERTa-F1=0.9768 | top1_min=0.5 gap_min=0.05 beta=6.0
VAL comp=0.7318 | BLEU=48.668 | ChemBERTa-F1=0.9768 | top1_min=0.5 gap_min=0.05 beta=10.0
VAL comp=0.7318 | BLEU=48.668 | ChemBERTa-F1=0.9768 | top1_min=0.5 gap_min=0.05 beta=14.0
VAL comp=0.7318 | BLEU=48.668 | ChemBERTa-F1=0.9768 | top1_min=0.55 gap_min=0.02 beta=6.0
VAL comp=0.7318 | BLEU=48.668 | ChemBERTa-F1=0.9768 | top1_min=0.55 gap_min=0.02 beta=10.0
VAL comp=0.7

Retrieve topk=64 alpha=0.8: 100%|██████████| 1/1 [00:00<00:00,  2.00it/s]
Predict test: 100%|██████████| 1000/1000 [00:00<00:00, 437362.25it/s]

Example preds: ["The molecule is a beta-D-glucosyl-(1<->1')-N-acylsphinganine in which the acyl group specified is hexacosanoyl. It has a role as a mouse metabolite. It derives from a hexacosanoic acid.", 'The molecule is the monohydrate form of doxapram hydrochloride. A central and respiratory stimulant with a brief duration of action, it is used as a temporary treatment of acute respiratory failure, particularly when superimposed on chronic obstructive pulmonary disease, and of postoperative respiratory depression. It has also been used for treatment of postoperative shivering. It has a role as a central nervous system stimulant. It contains a doxapram hydrochloride (anhydrous).', 'The molecule is a steroid glucosiduronic acid having etiocholanolone as the steroid component. It has a role as a human blood serum metabolite, a human urinary metabolite, a human xenobiotic metabolite and a marine xenobiotic metabolite. It is a steroid glucosiduronic acid and a 17-oxo steroid. It is a con




Unnamed: 0,id,description
0,0,The molecule is a beta-D-glucosyl-(1<->1')-N-a...
1,1,The molecule is the monohydrate form of doxapr...
2,2,The molecule is a steroid glucosiduronic acid ...
3,3,The molecule is a hydroxy fatty acid ascarosid...
4,4,The molecule is an organochlorine compound tha...
