In [1]:
import wandb
import pandas as pd
import numpy as np

try:
    df = pd.read_csv("results.csv")
except FileNotFoundError:
    # ── A) CONFIGURE YOUR W&B ACCESS ────────────────────────────────────────────
    ENTITY  = "gbg141"
    PROJECT = "TopoBench_Hypergraph"
    FILTER  = {}  # e.g. {"group": "my_sweep_group"} or {} 

    api = wandb.Api()
    runs = api.runs(f"{ENTITY}/{PROJECT}", filters=FILTER)

    import itertools

    def flatten_config(d, parent_key="", sep="."):
        items = []
        for k,v in d.items():
            new_key = f"{parent_key}{sep}{k}" if parent_key else k
            if isinstance(v, dict):
                items.extend(flatten_config(v, new_key, sep=sep).items())
            else:
                items.append((new_key, v))
        return dict(items)

    print(f"▶ Number of runs fetched from W&B: {len(runs)}")

    # ── B) BUILD THE RAW DATAFRAME ────────────────────────────────────────────────
    records = []
    for run in runs:
        cfg = run.config.copy() or {}
        
        cfg = flatten_config(cfg)
        # Attempt to extract metrics—skip if missing
        if ("val/accuracy" not in run.summary) or ("test/accuracy" not in run.summary):
            continue
        # "dataset.loader.parameters.data_name", "model.backbone._target_", "dataset.split_params.data_seed"
        val_acc  = run.summary["val/accuracy"]
        test_acc = run.summary["test/accuracy"]
        dataset  = cfg.get("dataset.loader.parameters.data_name", None)
        model    = cfg.get("model.backbone._target_",   None)
        dataset_seed     = cfg.get("dataset.split_params.data_seed",    None)

        # If any of these is None, we might want to skip as well:
        if (dataset is None) or (model is None) or (dataset_seed is None):
            continue

        # Collect any other hyperparams (besides dataset/model/dataset_seed)
        hyperparams = {k: v for k, v in cfg.items() 
                    if k not in ["dataset", "model", "dataset_seed", "dataset.loader.parameters.data_name", "model.backbone._target_", "dataset.split_params.data_seed"]}

        row = {
            "dataset":  dataset,
            "model":    model,
            "dataset_seed": dataset_seed,
            "val_acc":  val_acc,
            "test_acc": test_acc,
        }
        row.update(hyperparams)
        records.append(row)

    df = pd.DataFrame(records)
    df.to_csv("results.csv", index=False)
print("▶ After building df, df.shape =", df.shape)
print(df.head())


▶ Number of runs fetched from W&B: 7193
▶ After building df, df.shape = (7193, 146)
               dataset                                          model  \
0           20newsW100  topobench.nn.backbones.hypergraph.edgnn.EDGNN   
1                  zoo  topobench.nn.backbones.hypergraph.edgnn.EDGNN   
2      cocitation_cora  topobench.nn.backbones.hypergraph.edgnn.EDGNN   
3              NTU2012  topobench.nn.backbones.hypergraph.edgnn.EDGNN   
4  cocitation_citeseer  topobench.nn.backbones.hypergraph.edgnn.EDGNN   

   dataset_seed   val_acc  test_acc          loss._target_  \
0             0  0.800493  0.803989  topobench.loss.TBLoss   
1             0  0.680000  0.730769  topobench.loss.TBLoss   
2             0  0.744461  0.747415  topobench.loss.TBLoss   
3             0  0.815109  0.801193  topobench.loss.TBLoss   
4             0  0.675121  0.701691  topobench.loss.TBLoss   

  loss.dataset_loss.task loss.dataset_loss.loss_type  \
0         classification               cross_ent

In [31]:
# Remove all columns that have 'train/' in their name
df = df[[col for col in df.columns if "callbacks." not in col]]
df = df[[col for col in df.columns if "logger." not in col]]
df = df[[col for col in df.columns if "extras." not in col]]
df = df[[col for col in df.columns if "trainer." not in col]]
df = df[[col for col in df.columns if "evaluator." not in col]]
df = df[[col for col in df.columns if "paths." not in col]]
df = df[[col for col in df.columns if "config_key" not in col]]
df = df[[col for col in df.columns if "tags" not in col]]
for col in df.columns:
    print(col)

dataset
model
dataset_seed
val_acc
test_acc
loss._target_
loss.dataset_loss.task
loss.dataset_loss.loss_type
loss.modules_losses.readout
loss.modules_losses.backbone
loss.modules_losses.feature_encoder
seed
test
model.compile
model.readout._target_
model.readout.hidden_dim
model.readout.task_level
model.readout.out_channels
model.readout.pooling_type
model.readout.readout_name
model.readout.num_cell_dimensions
model._target_
model.backbone.dropout
model.backbone.n_layers
model.backbone.aggregate
model.backbone.activation
model.backbone.edconv_type
model.backbone.num_features
model.backbone.input_dropout
model.backbone.MLP_num_layers
model.model_name
model.model_domain
model.feature_encoder._target_
model.feature_encoder.in_channels
model.feature_encoder.encoder_name
model.feature_encoder.out_channels
model.feature_encoder.proj_dropout
model.backbone_wrapper._target_
model.backbone_wrapper._partial_
model.backbone_wrapper.out_channels
model.backbone_wrapper.wrapper_name
model.backbone_w

In [None]:
# ── C) DETERMINE HYPERPARAM COLUMNS ──────────────────────────────────────────
static_cols    = {"dataset", "model", "dataset_seed", "val_acc", "test_acc"}
hyperparam_cols = sorted(list(set(df.columns) - static_cols))
print(f"▶ hyperparam_cols = {hyperparam_cols}")

# If df is empty or columns are missing, fix those first
if df.empty:
    raise RuntimeError("DataFrame `df` is empty. Check that runs actually contained "
                       "`dataset`, `model`, `dataset_seed`, `val/accuracy`, `test/accuracy`.")

# ── D) ENSURE ALL “group key” CELLS ARE HASHABLE ─────────────────────────────
def ensure_hashable(x):
    try:
        hash(x)
        return x
    except TypeError:
        if isinstance(x, np.ndarray):
            return tuple(x.tolist())
        elif isinstance(x, list):
            return tuple(x)
        elif isinstance(x, dict):
            return tuple(sorted(x.items()))
        else:
            return str(x)

# Apply to dataset/model too, in case they’re arrays or lists
for col in ["dataset", "model"] + hyperparam_cols:
    if col in df.columns and df[col].dtype == object:
        df[col] = df[col].apply(ensure_hashable)

# ── E) GROUP BY (dataset, model, hyperparams) ─────────────────────────────────
group_cols = ["dataset", "model"] + hyperparam_cols
print(f"▶ group_cols = {group_cols}")

grouped = (
    df
    .groupby(group_cols, dropna=False)
    .agg(
        mean_val_acc = ("val_acc", "mean"),
        n_seeds      = ("dataset_seed", "count")
    )
    .reset_index()
)
print("▶ After grouping, grouped.shape =", grouped.shape)
print(grouped.head())

if grouped.empty:
    raise RuntimeError("`grouped` is empty. Either hyperparam_cols are wrong or "
                       "no rows survived the groupby.")

▶ hyperparam_cols = ['ckpt_path', 'dataset.dataloader_params.batch_size', 'dataset.dataloader_params.num_workers', 'dataset.dataloader_params.pin_memory', 'dataset.loader._target_', 'dataset.loader.parameters.data_dir', 'dataset.loader.parameters.data_domain', 'dataset.loader.parameters.data_type', 'dataset.parameters.loss_type', 'dataset.parameters.monitor_metric', 'dataset.parameters.num_classes', 'dataset.parameters.num_features', 'dataset.parameters.task', 'dataset.parameters.task_level', 'dataset.split_params.data_split_dir', 'dataset.split_params.k', 'dataset.split_params.learning_setting', 'dataset.split_params.split_type', 'dataset.split_params.train_prop', 'loss._target_', 'loss.dataset_loss.loss_type', 'loss.dataset_loss.task', 'loss.modules_losses.backbone', 'loss.modules_losses.feature_encoder', 'loss.modules_losses.readout', 'model._target_', 'model.backbone.MLP_num_layers', 'model.backbone.activation', 'model.backbone.aggregate', 'model.backbone.alpha', 'model.backbone.be

In [94]:
# ── F) PICK BEST CONFIGS PER (dataset, model) ─────────────────────────────────
grouped_sorted = grouped.sort_values(
    by=["dataset", "model", "mean_val_acc"], 
    ascending=[True, True, False]
)
best_configs = (
    grouped_sorted
    .groupby(["dataset", "model"], as_index=False)
    .first()
)
print("▶ best_configs.shape =", best_configs.shape)

# Build a “config_key” column in both DataFrames for merging
def columns_config_key(row, cols):
    return tuple(c for c in cols if row[c] is not None)

def make_config_key(row, cols):
    return tuple(row[c] for c in cols)

best_configs = best_configs.fillna(np.nan)  # Ensure no NaN values
grouped_cols = df.apply(lambda r: columns_config_key(r, group_cols[:]), axis=1)[0]  # same as group_cols

best_configs["config_key"] = best_configs.apply(lambda r: make_config_key(r, grouped_cols), axis=1)
df["config_key"]          = df.apply(lambda r: make_config_key(r, grouped_cols), axis=1)

▶ best_configs.shape = (30, 81)


In [88]:
grouped_cols[0]

('dataset',
 'model',
 'ckpt_path',
 'dataset.dataloader_params.batch_size',
 'dataset.dataloader_params.num_workers',
 'dataset.dataloader_params.pin_memory',
 'dataset.loader._target_',
 'dataset.loader.parameters.data_dir',
 'dataset.loader.parameters.data_domain',
 'dataset.loader.parameters.data_type',
 'dataset.parameters.loss_type',
 'dataset.parameters.monitor_metric',
 'dataset.parameters.num_classes',
 'dataset.parameters.num_features',
 'dataset.parameters.task',
 'dataset.parameters.task_level',
 'dataset.split_params.data_split_dir',
 'dataset.split_params.k',
 'dataset.split_params.learning_setting',
 'dataset.split_params.split_type',
 'dataset.split_params.train_prop',
 'loss._target_',
 'loss.dataset_loss.loss_type',
 'loss.dataset_loss.task',
 'loss.modules_losses.backbone',
 'loss.modules_losses.feature_encoder',
 'loss.modules_losses.readout',
 'model._target_',
 'model.backbone.MLP_num_layers',
 'model.backbone.activation',
 'model.backbone.aggregate',
 'model.back

In [95]:
# ── G) FILTER df BY best_configs AND COMPUTE TEST STATS ────────────────────────
chosen_keys = [elem for elem in best_configs["config_key"]]

In [98]:
best_configs["config_key"][0]

('20newsW100',
 'topobench.nn.backbones.hypergraph.edgnn.EDGNN',
 1,
 1,
 False,
 'topobench.data.loaders.HypergraphDatasetLoader',
 '/home/gbg141/TopoBenchmarkX/datasets/hypergraph/ml_datasets',
 'hypergraph',
 'ml_datasets',
 'cross_entropy',
 'accuracy',
 4,
 100,
 'classification',
 'node',
 '/home/gbg141/TopoBenchmarkX/datasets/data_splits/20newsW100',
 10,
 'transductive',
 'random',
 0.5,
 'topobench.loss.TBLoss',
 'cross_entropy',
 'classification',
 'topobench.model.TBModel',
 1.0,
 'relu',
 'add',
 nan,
 nan,
 0.5,
 'EquivSet',
 nan,
 nan,
 nan,
 nan,
 0.0,
 nan,
 nan,
 nan,
 1,
 128.0,
 True,
 'topobench.nn.wrappers.HypergraphWrapper',
 1,
 128,
 'HypergraphWrapper',
 False,
 'topobench.nn.encoders.AllCellFeatureEncoder',
 'AllCellFeatureEncoder',
 (100,),
 128,
 0.5,
 'hypergraph',
 'edgnn',
 'topobench.nn.readouts.PropagateSignalDown',
 128,
 1,
 4,
 'sum',
 'PropagateSignalDown',
 'node',
 0,
 96516,
 96516,
 'topobench.optimizer.TBOptimizer',
 'Adam',
 0.001,
 0,
 'StepL

In [99]:
df["config_key"][0]

('20newsW100',
 'topobench.nn.backbones.hypergraph.edgnn.EDGNN',
 1,
 1,
 False,
 'topobench.data.loaders.HypergraphDatasetLoader',
 '/home/gbg141/TopoBenchmarkX/datasets/hypergraph/ml_datasets',
 'hypergraph',
 'ml_datasets',
 'cross_entropy',
 'accuracy',
 4,
 100,
 'classification',
 'node',
 '/home/gbg141/TopoBenchmarkX/datasets/data_splits/20newsW100',
 10,
 'transductive',
 'random',
 0.5,
 'topobench.loss.TBLoss',
 'cross_entropy',
 'classification',
 'topobench.model.TBModel',
 1.0,
 'relu',
 'add',
 nan,
 nan,
 0.5,
 'EquivSet',
 nan,
 nan,
 nan,
 nan,
 0.0,
 nan,
 nan,
 nan,
 1,
 32.0,
 True,
 'topobench.nn.wrappers.HypergraphWrapper',
 1,
 32,
 'HypergraphWrapper',
 False,
 'topobench.nn.encoders.AllCellFeatureEncoder',
 'AllCellFeatureEncoder',
 (100,),
 32,
 0.25,
 'hypergraph',
 'edgnn',
 'topobench.nn.readouts.PropagateSignalDown',
 32,
 1,
 4,
 'sum',
 'PropagateSignalDown',
 'node',
 0,
 8772,
 8772,
 'topobench.optimizer.TBOptimizer',
 'Adam',
 0.01,
 0,
 'StepLR',
 0

In [84]:
df_best_runs = df[df["config_key"].isin(chosen_keys)].copy()

In [85]:
df_best_runs

Unnamed: 0,dataset,model,dataset_seed,val_acc,test_acc,loss._target_,loss.dataset_loss.task,loss.dataset_loss.loss_type,loss.modules_losses.readout,loss.modules_losses.backbone,...,model.backbone.beta,model.backbone.alpha,model.backbone.input_drop,model.backbone.layer_drop,model.backbone.in_channels,model.backbone.hidden_channels,model.backbone.heads,model.backbone.mlp_dropout,model.backbone.mlp_num_layers,config_key


In [100]:




# ── G) FILTER df BY best_configs AND COMPUTE TEST STATS ────────────────────────
# chosen_keys = set(best_configs["config_key"].tolist())
df_best_runs = df[df["config_key"].isin(chosen_keys)].copy()
print("▶ df_best_runs.shape (should be #datasets × #models × #dataset_seeds) =", df_best_runs.shape)
print(df_best_runs.head())

if df_best_runs.empty:
    raise RuntimeError("`df_best_runs` is empty. That means none of your runs "
                       "matched the chosen best_configs.\n"
                       "Check if the config_key logic is correct.")

summary = (
    df_best_runs
    .groupby(["dataset", "model"])
    .agg(
        mean_test_acc = ("test_acc", "mean"),
        std_test_acc  = ("test_acc", "std"),
    )
    .reset_index()
)
print("▶ summary.shape =", summary.shape)
print(summary.head())

# ── H) MERGE HYPERPARAMS BACK INTO summary FOR FINAL TABLE ────────────────────
hyperparam_summary = best_configs[["dataset", "model"] + hyperparam_cols].copy()
final_table = summary.merge(
    hyperparam_summary,
    on=["dataset", "model"],
    how="left"
)

final_cols = ["dataset", "model"] + hyperparam_cols + ["mean_test_acc", "std_test_acc"]
final_table = final_table[final_cols]
print("▶ final_table.shape =", final_table.shape)
print(final_table.to_markdown(index=False, floatfmt=".4f"))

▶ df_best_runs.shape (should be #datasets × #models × #dataset_seeds) = (150, 83)
               dataset                                              model  \
201  coauthorship_cora  topomodelx.nn.hypergraph.allset_transformer.Al...   
213  coauthorship_cora  topomodelx.nn.hypergraph.allset_transformer.Al...   
223  coauthorship_cora  topomodelx.nn.hypergraph.allset_transformer.Al...   
234  coauthorship_cora  topomodelx.nn.hypergraph.allset_transformer.Al...   
246  coauthorship_cora  topomodelx.nn.hypergraph.allset_transformer.Al...   

     dataset_seed   val_acc  test_acc          loss._target_  \
201             0  0.813885  0.846381  topobench.loss.TBLoss   
213             3  0.818316  0.791728  topobench.loss.TBLoss   
223             5  0.855244  0.797637  topobench.loss.TBLoss   
234             7  0.830133  0.846381  topobench.loss.TBLoss   
246             9  0.843427  0.827179  topobench.loss.TBLoss   

    loss.dataset_loss.task loss.dataset_loss.loss_type  \
201         

In [None]:
import wandb
import pandas as pd

# ── A) CONFIGURE YOUR W&B ACCESS ────────────────────────────────────────────
ENTITY  = "gbg141"
PROJECT = "TopoBench_Hypergraph"
# If you used a sweep or run‐group, you can add a filter like:
# FILTER = {"group": "my_sweep_group"}  
FILTER = {}  # e.g. {"sweep": "abcdef1234"} or {} if not needed

# Log in is assumed. Now initialize the API:
api = wandb.Api()

# ── B) FETCH ALL RUNS FROM W&B ───────────────────────────────────────────────
# We can iterate over all runs in the project, optionally filtering by group/sweep.
runs = api.runs(f"{ENTITY}/{PROJECT}", filters=FILTER)

# Prepare a list to accumulate dictionaries
records = []

import itertools

def flatten_config(d, parent_key="", sep="."):
    items = []
    for k,v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_config(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)


for run in runs:
    # 1) Extract config dict (all hyperparameters)
    cfg = run.config.copy() or {}
    # Some users log nested configs as e.g. {"model": {"type": "ResNet", ...}, ...}
    # In that case, you'd need to flatten. Below we assume flat config keys.
    cfg = flatten_config(cfg)
    
    # 2) Pull out exact fields from summary / config
    #    (Adapt these keys to match your logging names)
    try:
        val_acc  = run.summary["val/accuracy"]
        test_acc = run.summary["test/accuracy"]
    except KeyError:
        # If you have different metric names, adjust here.
        continue
    
    # 3) Identify which fields in cfg correspond to "dataset", "model", etc.
    dataset = cfg.get("dataset.loader.parameters.data_name", "UNKNOWN_DATASET")
    model   = cfg.get("model.backbone._target_",   "UNKNOWN_MODEL")
    seed    = cfg.get("dataset.split_params.data_seed",    None)
    
    # 4) Optionally collect all hyperparameters except seed/dataset/model
    #    Let's keep them, because we need to group by exact hyper‐param tuple.
    hyperparams = {k:v for k,v in cfg.items() 
                   if k not in ["dataset.loader.parameters.data_name", "model.backbone._target_", "dataset.split_params.data_seed"]}
    
    # 5) Build one record (row) per run/seed
    record = {
        "dataset": dataset,
        "model":   model,
        "seed":    seed,
        "val_acc": val_acc,
        "test_acc": test_acc,
    }
    # Add all other hyperparams as columns
    record.update(hyperparams)
    
    records.append(record)

# Convert to DataFrame
df = pd.DataFrame(records)

# If your "hyperparams" columns contain lists or dicts, you might need to flatten them further.
# But typically, each hyperparam is a scalar in `cfg` and shows up as its own column.

# ── BEFORE GROUPING: FIX UNHASHABLE (LIST) COLUMNS ────────────────────────────

# Identify which columns might contain lists (or other unhashable types).
# We’ll try converting any `list` into a `tuple` in-place.
for col in hyperparam_cols:
    # If there is at least one list in df[col], convert entire column:
    if df[col].apply(lambda x: isinstance(x, list)).any():
        df[col] = df[col].apply(lambda x: tuple(x) if isinstance(x, list) else x)

# ── C) GROUP BY (DATASET, MODEL, HYPERPARAMS) ───────────────────────────────
# We want to treat each unique combo of (dataset, model, param1, param2, ...) as one group.
# First, identify all hyperparam column names:
all_cols = set(df.columns)
static_cols = {"dataset", "model", "seed", "val_acc", "test_acc"}
hyperparam_cols = sorted(list(all_cols - static_cols))

# Step 1: compute mean_val for each group (grouping by dataset, model, and all hyperparam columns)
group_cols = ["dataset", "model"] + hyperparam_cols
grouped = (
    df
    .groupby(group_cols)
    .agg(
        mean_val_acc = ("val_acc", "mean"),
        n_seeds      = ("seed",   "count")
    )
    .reset_index()
)

# At this point, `grouped` has one row per (dataset, model, hyperparams),
# with the average validation accuracy and how many seeds contributed.
# (We expect n_seeds == 5 if every hyperparam-set ran exactly 5 seeds.)

# ── D) FOR EACH (DATASET, MODEL), PICK THE BEST HYPERPARAM SET ─────────────
# Sort by descending mean_val_acc so that the “best” config per (dataset, model) is first.
grouped_sorted = grouped.sort_values(
    by=["dataset", "model", "mean_val_acc"], 
    ascending=[True, True, False]
)

# Now, within each (dataset, model) block, pick the top‐1 row:
best_configs = (
    grouped_sorted
    .groupby(["dataset", "model"], as_index=False)
    .first()   # first row according to our sort: highest mean_val_acc
)

# `best_configs` now has one row per (dataset, model) with the chosen hyperparameters.

# ── E) EXTRACT THE TEST ACCURACIES FOR THE CHOSEN CONFIGS ────────────────────
# We need to merge `best_configs` back onto the original df, selecting only the runs
# whose (dataset, model, hyperparams) match those in best_configs. Then we can compute
# mean/std of test_acc across seeds.

# 1) Mark each row in `df` with a unique key that matches the grouping:
def make_config_key(row, columns):
    return tuple(row[col] for col in columns)

# Build a key column in both dataframes:
df["config_key"] = df.apply(lambda r: make_config_key(r, group_cols), axis=1)
best_configs["config_key"] = best_configs.apply(lambda r: make_config_key(r, group_cols), axis=1)

# 2) Filter `df` down to only those rows where config_key is in best_configs.config_key
chosen_keys = set(best_configs["config_key"].tolist())
df_best_runs = df[df["config_key"].isin(chosen_keys)].copy()

# 3) Now group `df_best_runs` by (dataset, model)—each chunk corresponds to the five seeds
summary = (
    df_best_runs
    .groupby(["dataset", "model"])
    .agg(
        mean_test_acc = ("test_acc", "mean"),
        std_test_acc  = ("test_acc", "std"),
    )
    .reset_index()
)

# ── F) (OPTIONAL) ADD HYPERPARAMETERS TO THE SUMMARY TABLE ───────────────────
# If you also want to *display* which hyperparameters were chosen:
hyperparam_summary = best_configs[["dataset", "model"] + hyperparam_cols].copy()
# Merge into `summary`:
final_table = summary.merge(
    hyperparam_summary,
    on=["dataset", "model"],
    how="left"
)

# Re‐order columns so that (dataset, model, [hyperparams…], mean_test_acc, std_test_acc) appears nicely:
final_cols = ["dataset", "model"] + hyperparam_cols + ["mean_test_acc", "std_test_acc"]
final_table = final_table[final_cols]

# ── G) DISPLAY OR SAVE THE FINAL TABLE ───────────────────────────────────────
print("\n=== Best test‐accuracy summary for each (dataset, model) ===\n")
print(final_table.to_markdown(index=False, floatfmt=".4f"))
# Or, if you prefer CSV:
# final_table.to_csv("best_summary.csv", index=False)

# If you want a nicely formatted LaTeX table, you can do:
# print(final_table.to_latex(index=False, float_format="%.4f"))


TypeError: unhashable type: 'list'

In [None]:
df = pd.DataFrame(records)
print("▶ After building df, df.shape =", df.shape)
print(df.head())

# ── C) DETERMINE HYPERPARAM COLUMNS ──────────────────────────────────────────
static_cols    = {"dataset", "model", "seed", "val_acc", "test_acc"}
hyperparam_cols = sorted(list(set(df.columns) - static_cols))
print(f"▶ hyperparam_cols = {hyperparam_cols}")

# If df is empty or columns are missing, fix those first
if df.empty:
    raise RuntimeError("DataFrame `df` is empty. Check that runs actually contained "
                       "`dataset`, `model`, `seed`, `val/accuracy`, `test/accuracy`.")

# ── D) ENSURE ALL “group key” CELLS ARE HASHABLE ─────────────────────────────
def ensure_hashable(x):
    try:
        hash(x)
        return x
    except TypeError:
        if isinstance(x, np.ndarray):
            return tuple(x.tolist())
        elif isinstance(x, list):
            return tuple(x)
        elif isinstance(x, dict):
            return tuple(sorted(x.items()))
        else:
            return str(x)

# Apply to dataset/model too, in case they’re arrays or lists
for col in ["dataset", "model"] + hyperparam_cols:
    if col in df.columns and df[col].dtype == object:
        df[col] = df[col].apply(ensure_hashable)

# ── E) GROUP BY (dataset, model, hyperparams) ─────────────────────────────────
group_cols = ["dataset", "model"] + hyperparam_cols
print(f"▶ group_cols = {group_cols}")

grouped = (
    df
    .groupby(group_cols, dropna=False)
    .agg(
        mean_val_acc = ("val_acc",  "mean"),
        n_seeds      = ("seed",     "count")
    )
    .reset_index()
)
print("▶ After grouping, grouped.shape =", grouped.shape)
print(grouped.head())

if grouped.empty:
    raise RuntimeError("`grouped` is empty. Either hyperparam_cols are wrong or "
                       "no rows survived the groupby.")

# ── F) PICK BEST CONFIGS PER (dataset, model) ─────────────────────────────────
grouped_sorted = grouped.sort_values(
    by=["dataset", "model", "mean_val_acc"], 
    ascending=[True, True, False]
)
best_configs = (
    grouped_sorted
    .groupby(["dataset", "model"], as_index=False)
    .first()
)
print("▶ best_configs.shape =", best_configs.shape)
print(best_configs.head())

# Build a “config_key” column in both DataFrames for merging
def make_config_key(row, cols):
    return tuple(row[c] for c in cols)

grouped_cols = group_cols[:]  # same as group_cols
best_configs["config_key"] = best_configs.apply(lambda r: make_config_key(r, grouped_cols), axis=1)
df["config_key"]          = df.apply(lambda r: make_config_key(r, grouped_cols), axis=1)

# ── G) FILTER df BY best_configs AND COMPUTE TEST STATS ────────────────────────
chosen_keys = set(best_configs["config_key"].tolist())
df_best_runs = df[df["config_key"].isin(chosen_keys)].copy()
print("▶ df_best_runs.shape (should be #datasets × #models × #seeds) =", df_best_runs.shape)
print(df_best_runs.head())

if df_best_runs.empty:
    raise RuntimeError("`df_best_runs` is empty. That means none of your runs "
                       "matched the chosen best_configs.\n"
                       "Check if the config_key logic is correct.")

summary = (
    df_best_runs
    .groupby(["dataset", "model"])
    .agg(
        mean_test_acc = ("test_acc", "mean"),
        std_test_acc  = ("test_acc", "std"),
    )
    .reset_index()
)
print("▶ summary.shape =", summary.shape)
print(summary.head())

# ── H) MERGE HYPERPARAMS BACK INTO summary FOR FINAL TABLE ────────────────────
hyperparam_summary = best_configs[["dataset", "model"] + hyperparam_cols].copy()
final_table = summary.merge(
    hyperparam_summary,
    on=["dataset", "model"],
    how="left"
)

final_cols = ["dataset", "model"] + hyperparam_cols + ["mean_test_acc", "std_test_acc"]
final_table = final_table[final_cols]
print("▶ final_table.shape =", final_table.shape)
print(final_table.to_markdown(index=False, floatfmt=".4f"))

▶ After building df, df.shape = (7193, 145)
               dataset                                          model  seed  \
0           20newsW100  topobench.nn.backbones.hypergraph.edgnn.EDGNN    42   
1                  zoo  topobench.nn.backbones.hypergraph.edgnn.EDGNN    42   
2      cocitation_cora  topobench.nn.backbones.hypergraph.edgnn.EDGNN    42   
3              NTU2012  topobench.nn.backbones.hypergraph.edgnn.EDGNN    42   
4  cocitation_citeseer  topobench.nn.backbones.hypergraph.edgnn.EDGNN    42   

    val_acc  test_acc          loss._target_ loss.dataset_loss.task  \
0  0.800493  0.803989  topobench.loss.TBLoss         classification   
1  0.680000  0.730769  topobench.loss.TBLoss         classification   
2  0.744461  0.747415  topobench.loss.TBLoss         classification   
3  0.815109  0.801193  topobench.loss.TBLoss         classification   
4  0.675121  0.701691  topobench.loss.TBLoss         classification   

  loss.dataset_loss.loss_type loss.modules_losses.read

NameError: name 'np' is not defined

In [2]:
group_cols

['dataset',
 'model',
 'callbacks.early_stopping._target_',
 'callbacks.early_stopping.check_finite',
 'callbacks.early_stopping.check_on_train_epoch_end',
 'callbacks.early_stopping.divergence_threshold',
 'callbacks.early_stopping.min_delta',
 'callbacks.early_stopping.mode',
 'callbacks.early_stopping.monitor',
 'callbacks.early_stopping.patience',
 'callbacks.early_stopping.stopping_threshold',
 'callbacks.early_stopping.strict',
 'callbacks.early_stopping.verbose',
 'callbacks.learning_rate_monitor._target_',
 'callbacks.learning_rate_monitor.logging_interval',
 'callbacks.model_checkpoint._target_',
 'callbacks.model_checkpoint.auto_insert_metric_name',
 'callbacks.model_checkpoint.dirpath',
 'callbacks.model_checkpoint.every_n_epochs',
 'callbacks.model_checkpoint.every_n_train_steps',
 'callbacks.model_checkpoint.filename',
 'callbacks.model_checkpoint.mode',
 'callbacks.model_checkpoint.monitor',
 'callbacks.model_checkpoint.save_last',
 'callbacks.model_checkpoint.save_on_tra

In [1]:
import ast
import glob
import warnings
from collections import defaultdict
from datetime import date

import numpy as np
import pandas as pd
import wandb

today = date.today()
api = wandb.Api()

# # Find all csv files in the current directory
csv_files = glob.glob("*.csv")
# # Collect all the names of the csv files without the extension
csv_names = [csv_file[:-4] for csv_file in csv_files]
project_name = "TopoBench_Hypergraph"  #'best_results_edhnn'
user = "gbg141"

if project_name not in csv_names:
    runs = api.runs(f"{user}/{project_name}")

    summary_list, config_list, name_list = [], [], []
    for run in runs:
        # .summary contains the output keys/values for metrics like accuracy.
        #  We call ._json_dict to omit large files
        summary_list.append(run.summary._json_dict)

        # .config contains the hyperparameters.
        #  We remove special values that start with _.
        config_list.append(
            {k: v for k, v in run.config.items() if not k.startswith("_")}
        )

        # .name is the human-readable name of the run.
        name_list.append(run.name)

    runs_df = pd.DataFrame(
        {"summary": summary_list, "config": config_list, "name": name_list}
    )

    runs_df.to_csv(f"{project_name}.csv")
else:
    runs_df = pd.read_csv(f"{project_name}.csv", index_col=0)

    for row in runs_df.iloc:
        row["summary"] = ast.literal_eval(row["summary"])
        row["config"] = ast.literal_eval(row["config"])


for row in runs_df.iloc:
    row["summary"].update(row["config"])

lst = [i["summary"] for i in runs_df.iloc]
df = pd.DataFrame.from_dict(lst)

df_init = df.copy()

# Get average epoch run time
df["epoch_run_time"] = df["_runtime"] / df["epoch"]

In [2]:
df_init.columns

Index(['AvgTime/train_batch_mean', 'AvgTime/train_batch_std',
       'AvgTime/train_epoch_mean', 'AvgTime/train_epoch_std',
       'AvgTime/val_batch_mean', 'AvgTime/val_batch_std',
       'AvgTime/val_epoch_mean', 'AvgTime/val_epoch_std', '_runtime', '_step',
       '_timestamp', '_wandb', 'epoch', 'lr-Adam', 'test/accuracy',
       'test/auroc', 'test/loss', 'test/precision', 'test/recall',
       'train/accuracy', 'train/auroc', 'train/loss', 'train/precision',
       'train/recall', 'trainer/global_step', 'val/accuracy', 'val/auroc',
       'val/loss', 'val/precision', 'val/recall', 'loss', 'seed', 'tags',
       'test', 'model', 'paths', 'train', 'extras', 'logger', 'dataset',
       'trainer', 'callbacks', 'ckpt_path', 'evaluator', 'optimizer',
       'task_name', 'model/params/total', 'model/params/trainable',
       'model/params/non_trainable'],
      dtype='object')

In [3]:
def normalize_column(df, column_to_normalize):
    # Use json_normalize to flatten the nested dictionaries into separate columns
    flattened_df = pd.json_normalize(df[column_to_normalize])
    # Rename columns to include 'nested_column' prefix
    flattened_df.columns = [
        f"{column_to_normalize}.{col}" for col in flattened_df.columns
    ]
    # Concatenate the flattened DataFrame with the original DataFrame
    result_df = pd.concat([df, flattened_df], axis=1)
    # Get new columns names
    new_columns = flattened_df.columns
    # Drop the original nested column if needed
    result_df.drop(column_to_normalize, axis=1, inplace=True)
    return result_df, new_columns


# Config columns to normalize
columns_to_normalize = ["model", "dataset", "callbacks", "paths"]

# Keep track of config columns added
config_columns = []
for column in columns_to_normalize:
    df, columns = normalize_column(df, column)
    config_columns.extend(columns)

In [21]:
# Remove columns that are not needed (we shouldn't vary them or their variation is not interesting)
remove_col = [
    #"dataset.transforms.data_manipulations.selected_fields",
    "callbacks.model_checkpoint.dirpath", 'callbacks.model_checkpoint.dirpath', #'model.feature_encoder.selected_dimensions',
]
df = df.drop(remove_col, axis=1)

# Ensure that removed columns are not present in config_columns
config_columns = [col for col in config_columns if col != remove_col]

KeyError: "['callbacks.model_checkpoint.dirpath', 'callbacks.model_checkpoint.dirpath'] not found in axis"

In [None]:
print(
    f"Number of rows with model.backbone._target_ = nan is {sum(df['model.backbone._target_'].isna())}"
)
# Drop na values if there are
df = df.dropna(subset=["model.backbone._target_"])
# Reset index
df = df.reset_index(drop=True)

# Drop rows that 'callbacks.early_stopping.monitor' isna
print(
    f"Number of rows with callbacks.early_stopping.monitor = nan is {sum(df['callbacks.early_stopping.monitor'].isna())}"
)

# print("Because of SCCN and CWN false runs there were 96 such runs on 13/03/24")

df = df.dropna(subset=["callbacks.early_stopping.monitor"])
# Reset index
df = df.reset_index(drop=True)


# Get correct names for the models
df["model.backbone._target_"] = df["model.backbone._target_"].apply(
    lambda x: x.split(".")[-1]
)

Number of rows with model.backbone._target_ = nan is 0
Number of rows with callbacks.early_stopping.monitor = nan is 0


In [None]:
df["model.backbone._target_"].unique()

array(['EDGNN', 'UniGCNII', 'AllSetTransformer'], dtype=object)

In [None]:
df["dataset.split_params.data_seed"]

0       0
1       0
2       0
3       0
4       0
       ..
7188    9
7189    3
7190    5
7191    7
7192    9
Name: dataset.split_params.data_seed, Length: 7193, dtype: int64

In [25]:
# Identify unique models in DataFrame
unique_models = df["model.backbone._target_"].unique()

# Identify unique datasets in DataFrame
unique_datasets = df["dataset.loader.parameters.data_name"].unique()


collected_results = defaultdict(dict)
collected_results_time = defaultdict(dict)
collected_results_time_run = defaultdict(dict)

collected_aggregated_results = defaultdict(dict)
collected_non_aggregated_results = defaultdict(dict)

# Got over each dataset and model and find the best result
for dataset in unique_datasets:
    for model in unique_models:
        # Get the subset of the DataFrame for the current dataset and model
        subset = df[
            (df["dataset.loader.parameters.data_name"] == dataset)
            & (df["model.backbone._target_"] == model)
        ]

        if subset.empty:
            print("---------")
            print(f"No results for {model} on {dataset}")
            print("---------")
            continue
        # Suppress all warnings
        warnings.filterwarnings("ignore")
        subset["Model"] = model
        warnings.filterwarnings("default")

        def get_metric(df):
            metric_ = df["callbacks.early_stopping.monitor"].unique()
            assert len(metric_) == 1, "There should be only one metric to optimize"
            metric = metric_[0]
            return metric.split("/")[-1]

        # Cols to get statistics later
        # TODO: log maximum validation value for optimized metric
        performance_cols = [f"val/{get_metric(subset)}"]

        # Get the unique values for each config column
        unique_colums_values = {}
        for col in config_columns:
            try:
                unique_colums_values[col] = subset[col].unique()
            except:
                print(f"Attention the columns: {col}, has issues with unique values")

        # Keep only those keys that have more than one unique value
        unique_colums_values = {
            k: v for k, v in unique_colums_values.items() if len(v) > 1
        }

        # Print the unique values for each config column

        print(f"Unique values for each config column for {model} on {dataset}:")
        for col, unique in unique_colums_values.items():
            print(f"{col}: {unique}")
            print()
        print("---------")

        # Check if "special colums" are not in unique_colums_values
        # For example dataset.parameters.data_seed should not be in aggregation columns
        # If it is, then we should remove it from the list
        special_columns = ["dataset.split_params.data_seed"]

        for col in special_columns:
            if col in unique_colums_values:
                unique_colums_values.pop(col)

        # Obtain the aggregation columns
        aggregation_columns = ["Model"] + list(unique_colums_values.keys())

        collected_non_aggregated_results[dataset][model] = {
            "df": subset.copy(),
            "aggregation_columns": aggregation_columns,
            "performance_cols": performance_cols,
        }

        # Aggregate the subset by the aggregation columns and get the best result for each group
        aggregated = subset.groupby(aggregation_columns).agg(
            {col: ["mean", "std"] for col in performance_cols}
        )

        # Go from MultiIndex to Index
        aggregated = aggregated.reset_index()

        assert (
            len(subset["callbacks.early_stopping.mode"].unique()) == 1
        ), "There should be only one mode for early stopping"
        # Identify the mode of the early stopping mode

        if subset["callbacks.early_stopping.mode"].unique()[0] == "max":
            ascending = False
            final_best_ = aggregated.sort_values(
                by=(f"test/{get_metric(subset)}", "mean"), ascending=ascending
            ).head(1)
            final_best_ = (final_best_ * 100).round(2)
        else:
            ascending = True
            final_best_ = aggregated.sort_values(
                by=(f"test/{get_metric(subset)}", "mean"), ascending=ascending
            ).head(1)

        collected_results[dataset][model] = {
            "mean": final_best_[(f"test/{get_metric(subset)}", "mean")].values[0],
            "std": final_best_[(f"test/{get_metric(subset)}", "std")].values[0],
        }

        # Get average epoch run time
        collected_results_time[dataset][model] = {
            "mean": subset['AvgTime/train_epoch_mean'].mean(),
            "std": subset['AvgTime/train_epoch_mean'].std(),
        }

        collected_results_time_run[dataset][model] = {
            "mean": subset["_runtime"].mean(),
            "std": subset["_runtime"].std(),
        }

        collected_aggregated_results[dataset][model] = aggregated.sort_values(
            by=(f"test/{get_metric(subset)}", "mean"), ascending=ascending
        )

Attention the columns: model.feature_encoder.in_channels, has issues with unique values
Attention the columns: callbacks.model_checkpoint.dirpath, has issues with unique values
Unique values for each config column for EDGNN on 20newsW100:
model.readout.hidden_dim: [ 32  64 128]

model.backbone.n_layers: [1 2 3 4]

model.backbone.num_features: [ 32.  64. 128.]

model.feature_encoder.out_channels: [ 32  64 128]

model.feature_encoder.proj_dropout: [0.25 0.5 ]

model.backbone_wrapper.out_channels: [ 32  64 128]

dataset.split_params.data_seed: [0 3 5 7 9]

paths.output_dir: ['/home/gbg141/TopoBenchmarkX/logs/train/multiruns/2025-05-30_17-05-14/0'
 '/home/gbg141/TopoBenchmarkX/logs/train/multiruns/2025-05-30_17-05-14/1'
 '/home/gbg141/TopoBenchmarkX/logs/train/multiruns/2025-05-30_17-05-14/2'
 '/home/gbg141/TopoBenchmarkX/logs/train/multiruns/2025-05-30_17-05-14/3'
 '/home/gbg141/TopoBenchmarkX/logs/train/multiruns/2025-05-30_17-05-14/4'
 '/home/gbg141/TopoBenchmarkX/logs/train/multiruns/2

In [26]:
collected_results_time
# Convert nested dictionary to DataFrame
nested_dict = dict(collected_results)
result_dict = pd.DataFrame.from_dict(
    {
        (i, j): nested_dict[i][j]
        for i in nested_dict
        for j in nested_dict[i].keys()
    },
    orient="index",
)


result_dict = result_dict.round(2)
result_dict["performance"] = result_dict.apply(
    lambda x: f"{x['mean']} ± {x['std']}", axis=1
)
result_dict = result_dict.drop(["mean", "std"], axis=1)

# Reset multiindex
result_dict = result_dict.reset_index()
# rename columns
result_dict.columns = ["Dataset", "Model", "Test Performance"]

In [27]:
result_dict.pivot_table(
    index="Model", columns="Dataset", values="Test Performance", aggfunc="first"
)

Dataset,20newsW100,ModelNet40,Mushroom,NTU2012,coauthorship_cora,coauthorship_dblp,cocitation_citeseer,cocitation_cora,cocitation_pubmed,zoo
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AllSetTransformer,82.02 ± nan,98.67 ± nan,100.0 ± nan,90.66 ± nan,85.97 ± nan,92.31 ± nan,75.48 ± nan,81.09 ± nan,87.73 ± nan,100.0 ± nan
EDGNN,81.41 ± nan,98.57 ± nan,99.85 ± nan,89.66 ± nan,83.46 ± nan,91.17 ± nan,74.52 ± nan,79.62 ± nan,87.57 ± nan,96.15 ± nan
UniGCNII,81.29 ± nan,98.8 ± nan,99.8 ± nan,91.25 ± nan,83.9 ± nan,91.12 ± nan,75.97 ± nan,81.68 ± nan,87.51 ± nan,100.0 ± nan


In [20]:
result_dict

Unnamed: 0,Dataset,Model,Test Performance
0,20newsW100,EDGNN,81.41 ± nan
1,20newsW100,UniGCNII,81.29 ± nan
2,20newsW100,AllSetTransformer,82.02 ± nan
3,zoo,EDGNN,96.15 ± nan
4,zoo,UniGCNII,100.0 ± nan
5,zoo,AllSetTransformer,100.0 ± nan
6,cocitation_cora,EDGNN,79.62 ± nan
7,cocitation_cora,UniGCNII,81.68 ± nan
8,cocitation_cora,AllSetTransformer,81.09 ± nan
9,NTU2012,EDGNN,89.66 ± nan


### Runtime per epoch

In [22]:
collected_results_time
# Convert nested dictionary to DataFrame
nested_dict = dict(collected_results_time)
result_dict = pd.DataFrame.from_dict(
    {
        (i, j): nested_dict[i][j]
        for i in nested_dict
        for j in nested_dict[i].keys()
    },
    orient="index",
)


result_dict = result_dict.round(2)
result_dict["performance"] = result_dict.apply(
    lambda x: f"{x['mean']} ± {x['std']}", axis=1
)
result_dict = result_dict.drop(["mean", "std"], axis=1)

# Reset multiindex
result_dict = result_dict.reset_index()
# rename columns
result_dict.columns = ["Dataset", "Model", "Average Time per Epoch"]

In [23]:
result_dict.pivot_table(
    index="Model", columns="Dataset", values="Average Time per Epoch", aggfunc="first"
)[['MUTAG', 'NCI1','NCI109','PROTEINS','ZINC']]

Dataset,MUTAG,NCI1,NCI109,PROTEINS,ZINC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AllSetTransformer,0.08 ± 0.0,0.55 ± 0.02,0.56 ± 0.02,0.16 ± 0.0,3.1 ± 0.05
CCCN,0.11 ± 0.0,1.33 ± 0.02,1.33 ± 0.02,0.32 ± 0.01,6.14 ± 0.04
CCXN,0.09 ± 0.0,1.25 ± 0.02,1.39 ± 0.04,0.32 ± 0.01,5.92 ± 0.09
CWN,0.1 ± 0.0,1.38 ± 0.03,1.37 ± 0.01,0.37 ± 0.02,5.86 ± 0.05
EDGNN,0.04 ± 0.0,0.49 ± 0.02,0.49 ± 0.01,0.13 ± 0.0,2.06 ± 0.01
GAT,0.04 ± 0.0,0.34 ± 0.02,0.33 ± 0.02,0.07 ± 0.0,1.24 ± 0.01
GCN,0.03 ± 0.0,0.26 ± 0.01,0.27 ± 0.02,0.05 ± 0.0,1.24 ± 0.01
GIN,0.03 ± 0.0,0.27 ± 0.02,0.27 ± 0.02,0.06 ± 0.0,1.19 ± 0.0
SCCN,0.09 ± 0.0,1.71 ± 0.02,1.7 ± 0.01,0.5 ± 0.01,8.19 ± nan
SCCNNCustom,0.09 ± 0.0,1.65 ± 0.02,1.66 ± 0.03,0.43 ± 0.01,nan ± nan


### Runtime

In [24]:
collected_results_time_run
# Convert nested dictionary to DataFrame
nested_dict = dict(collected_results_time_run)
result_dict = pd.DataFrame.from_dict(
    {
        (i, j): nested_dict[i][j]
        for i in nested_dict
        for j in nested_dict[i].keys()
    },
    orient="index",
)


result_dict = result_dict.round(2)
result_dict["performance"] = result_dict.apply(
    lambda x: f"{x['mean']} ± {x['std']}", axis=1
)
result_dict = result_dict.drop(["mean", "std"], axis=1)

# Reset multiindex
result_dict = result_dict.reset_index()
# rename columns
result_dict.columns = ["Dataset", "Model", "Average Training Time"]


In [25]:
result_dict.pivot_table(
    index="Model", columns="Dataset", values="Average Training Time", aggfunc="first"
)[['MUTAG', 'NCI1','NCI109','PROTEINS','ZINC']]

Dataset,MUTAG,NCI1,NCI109,PROTEINS,ZINC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AllSetTransformer,9.9 ± 3.24,138.13 ± 46.01,138.66 ± 26.38,15.81 ± 2.89,397.77 ± 153.75
CCCN,12.04 ± 2.21,372.36 ± 109.47,272.3 ± 20.89,41.63 ± 7.23,1621.11 ± 141.72
CCXN,10.76 ± 1.89,244.72 ± 46.09,225.72 ± 86.57,51.98 ± 8.5,1226.67 ± 317.55
CWN,10.92 ± 0.96,302.34 ± 63.44,294.79 ± 46.27,53.6 ± 17.7,1390.21 ± 96.86
EDGNN,5.81 ± 1.13,110.86 ± 27.75,126.61 ± 45.53,15.15 ± 3.3,357.68 ± 46.95
GAT,4.16 ± 1.05,57.32 ± 17.49,56.44 ± 9.05,8.18 ± 2.3,171.15 ± 64.67
GCN,3.83 ± 0.89,53.23 ± 19.67,37.4 ± 8.63,8.18 ± 2.47,146.89 ± 27.63
GIN,4.6 ± 0.56,61.2 ± 23.97,50.32 ± 7.98,8.88 ± 2.34,168.43 ± 107.1
SCCN,10.71 ± 2.92,332.76 ± 51.89,321.76 ± 55.92,70.06 ± 15.91,1226.19 ± 1686.57
SCCNNCustom,14.06 ± 2.51,307.2 ± 83.01,353.69 ± 105.89,54.13 ± 11.27,794.76 ± nan
