In [1]:
import ast
import glob
import warnings
from collections import defaultdict
from datetime import date

import numpy as np
import pandas as pd
import wandb

today = date.today()
api = wandb.Api()

# Every CSV in the working directory is treated as a potential local cache of a
# W&B project export; the cache key is the file name without its extension.
csv_files = glob.glob("*.csv")
csv_names = [csv_file[:-4] for csv_file in csv_files]
project_name = "graph_baselines_normalized_graph_baselines"  # rebutal_CWN_ogbg-molhiv fix_gnn_rebuttal_cell_NCI109  fix_gnn_rebuttal_cell_MUTAG fix_gnn_rebuttal_cell_PROTEINS fix_gnn_rebuttal_cell_ZINC
user = "levsap"  # "telyatnikov_sap"

if project_name not in csv_names:
    # No cached export yet: download every run of the project from W&B.
    runs = api.runs(f"{user}/{project_name}")

    summary_list, config_list, name_list = [], [], []
    for run in runs:
        # .summary contains the output keys/values for metrics like accuracy.
        # ._json_dict is used to omit large files.
        summary_list.append(run.summary._json_dict)

        # .config contains the hyperparameters; keys starting with "_" are
        # special values and are dropped.
        config_list.append(
            {k: v for k, v in run.config.items() if not k.startswith("_")}
        )

        # .name is the human-readable name of the run.
        name_list.append(run.name)

    runs_df = pd.DataFrame(
        {"summary": summary_list, "config": config_list, "name": name_list}
    )

    runs_df.to_csv(f"{project_name}.csv")
else:
    runs_df = pd.read_csv(f"{project_name}.csv", index_col=0)

    # The CSV stores the dict columns as their repr, so parse them back.
    # This is done column-wise: assigning into the rows produced by iterating
    # `runs_df.iloc` only reaches the frame when those rows happen to be views,
    # which is not the case with pandas Copy-on-Write enabled.
    for dict_col in ("summary", "config"):
        runs_df[dict_col] = runs_df[dict_col].apply(ast.literal_eval)


# Merge the hyperparameters into the metrics dict of each run, in place
# (config values win on key clashes).
for summary, config in zip(runs_df["summary"], runs_df["config"]):
    summary.update(config)

lst = runs_df["summary"].tolist()
df = pd.DataFrame.from_dict(lst)

# Untouched snapshot of the merged table, taken before any later cell edits `df`.
df_init = df.copy()

# The average epoch run time is derived in the following cell.

In [2]:
# Average wall-clock time per epoch: total run time divided by the epoch count.
df["epoch_run_time"] = df["_runtime"].div(df["epoch"])

In [3]:
# Sanity check: (rows, columns) of the merged runs table.
df.shape

(25441, 88)

In [4]:
def normalize_column(df, column_to_normalize):
    """Expand a column of nested dicts into flat, prefixed columns.

    Args:
        df: DataFrame holding ``column_to_normalize``; it is not modified.
        column_to_normalize: Name of the column whose values are (nested)
            dictionaries.

    Returns:
        A tuple ``(result_df, new_columns)``: ``result_df`` is ``df`` without
        the nested column, followed by one column per flattened key, named
        ``"<column_to_normalize>.<dotted.key.path>"``; ``new_columns`` is the
        Index of those added column names.
    """
    # Flatten from a plain list so the result is positional, then re-attach the
    # caller's index. Without this, concat(axis=1) aligns on labels and creates
    # extra, half-empty rows whenever `df` does not carry a default RangeIndex
    # (for example after filtering).
    flattened_df = pd.json_normalize(df[column_to_normalize].tolist())
    flattened_df.index = df.index

    # Prefix the flattened keys with the name of the column they came from.
    flattened_df.columns = [
        f"{column_to_normalize}.{col}" for col in flattened_df.columns
    ]
    new_columns = flattened_df.columns

    # Replace the nested column by its flattened counterpart.
    result_df = pd.concat(
        [df.drop(columns=column_to_normalize), flattened_df], axis=1
    )
    return result_df, new_columns


# Nested config sections that get expanded into flat "section.key" columns.
columns_to_normalize = [
    "model",
    "dataset",
    "callbacks",
    "paths",
    "optimizer",
]

# Names of every flattened config column, collected section by section.
config_columns = []
for column in columns_to_normalize:
    df, columns = normalize_column(df, column)
    config_columns += list(columns)

In [5]:
# Remove columns that are not needed (we shouldn't vary them or their variation is not interesting)
remove_cols = [
    # "dataset.transforms.data_manipulations.selected_fields",
    "callbacks.model_checkpoint.dirpath",
    "paths.output_dir",
    "model.feature_encoder.selected_dimensions",
]

# Only drop what is actually present, so the cell can be re-run safely.
removed_columns = [col for col in remove_cols if col in df.columns]
df = df.drop(columns=removed_columns)

print("Removed columns:", removed_columns)

# Ensure that removed columns are not present in config_columns
config_columns = [col for col in config_columns if col not in removed_columns]

Removed columns: ['callbacks.model_checkpoint.dirpath', 'paths.output_dir']


In [6]:
# Columns whose name mentions "best" (best-epoch metrics and rerun results).
[col for col in df.columns if "best" in col]

['best_epoch',
 'best_epoch/checkpoint',
 'best_epoch/train/accuracy',
 'best_epoch/train/cohen_kappa',
 'best_epoch/train/f1_score',
 'best_epoch/train/jaccard',
 'best_epoch/train/loss',
 'best_epoch/train/mcc',
 'best_epoch/train/precision',
 'best_epoch/train/recall',
 'best_epoch/val/accuracy',
 'best_epoch/val/cohen_kappa',
 'best_epoch/val/f1_score',
 'best_epoch/val/jaccard',
 'best_epoch/val/loss',
 'best_epoch/val/mcc',
 'best_epoch/val/precision',
 'best_epoch/val/recall',
 'test_best_rerun/accuracy',
 'test_best_rerun/cohen_kappa',
 'test_best_rerun/f1_score',
 'test_best_rerun/jaccard',
 'test_best_rerun/loss',
 'test_best_rerun/mcc',
 'test_best_rerun/precision',
 'test_best_rerun/recall',
 'val_best_rerun/accuracy',
 'val_best_rerun/cohen_kappa',
 'val_best_rerun/f1_score',
 'val_best_rerun/jaccard',
 'val_best_rerun/loss',
 'val_best_rerun/mcc',
 'val_best_rerun/precision',
 'val_best_rerun/recall',
 'callbacks.best_epoch_metrics.mode',
 'callbacks.best_epoch_metrics.mo

In [7]:
# Discard runs that ended before reaching the minimum number of epochs.
MIN_EPOCHS = 50
df = df.loc[df["epoch"].ge(MIN_EPOCHS)]

In [8]:
# Runs missing the backbone target or the early-stopping monitor cannot be
# attributed to a model/metric: report how many there are, then drop them.
for required_col in ("model.backbone._target_", "callbacks.early_stopping.monitor"):
    print(
        f"Number of rows with {required_col} = nan is {sum(df[required_col].isna())}"
    )
    df = df.dropna(subset=[required_col]).reset_index(drop=True)

# Keep only the last component of the dotted backbone path as the model name.
df["model.backbone._target_"] = df["model.backbone._target_"].map(
    lambda target: target.rsplit(".", 1)[-1]
)

Number of rows with model.backbone._target_ = nan is 0
Number of rows with callbacks.early_stopping.monitor = nan is 0


In [9]:
# Model names that remain after filtering and renaming.
df["model.backbone._target_"].unique()

array(['GCN', 'GIN'], dtype=object)

In [15]:
import pandas as pd
import numpy as np
import warnings
from collections import defaultdict
from IPython.display import display

# =============================================================================
# 1. Configuration & Constants
# =============================================================================
# A configuration is only kept if it has a run for every one of these seeds.
REQUIRED_SEEDS = {0, 3, 5, 7, 9}
# Column that stores the seed of each run.
SEED_COL = "dataset.split_params.data_seed"

# Metric names (the part after the last "/") that are reported per task type.
TASK_METRICS = {
    "classification": [
        "accuracy",
        "precision",
        "recall",
        "f1_score",
        "cohen_kappa",
        "mcc",
        "jaccard",
    ],
    "regression": [
        "mse",
        "mae",
        "r2",
    ],
}

def get_performance_cols(subset, task_type):
    """List the test-metric columns of ``subset`` relevant to ``task_type``.

    A column qualifies when its name contains ``"test"`` and ``"/"`` and the
    part after the last ``"/"`` is one of ``TASK_METRICS[task_type]``
    (e.g. ``'test/accuracy'``, ``'test_best_rerun/mse'``). Test columns ending
    in the metric named by the first ``callbacks.early_stopping.monitor``
    value are added as well, even if that metric is not in ``TASK_METRICS``.

    Args:
        subset: DataFrame of runs.
        task_type: Key into ``TASK_METRICS``; unknown keys select no keywords.

    Returns:
        List of column names without duplicates, in a deterministic order:
        keyword matches in column order, then monitored-metric matches in
        column order. (Going through ``set`` made the order depend on string
        hash randomisation, i.e. change between kernel restarts.)
    """
    keywords = TASK_METRICS.get(task_type, [])

    found_cols = [
        col
        for col in subset.columns
        if "test" in col and "/" in col and col.split("/")[-1] in keywords
    ]

    # Best effort: the monitored metric is optional. The column may be absent
    # (KeyError), the frame empty (IndexError) or the value not a string such
    # as NaN (AttributeError); in those cases only keyword matches are used.
    try:
        opt_metric_path = subset["callbacks.early_stopping.monitor"].iloc[0]
        opt_metric_name = opt_metric_path.split("/")[-1]
    except (KeyError, IndexError, AttributeError):
        opt_metric_name = None

    if opt_metric_name is not None:
        found_cols.extend(
            col
            for col in subset.columns
            if "test" in col and col.endswith(f"/{opt_metric_name}")
        )

    # Order-preserving de-duplication.
    return list(dict.fromkeys(found_cols))

def get_metric_name(subset):
    """Return the name of the metric monitored by early stopping.

    The name is the part after the last "/" of the single distinct value in
    ``callbacks.early_stopping.monitor``. When the column does not hold
    exactly one distinct value, a warning is printed and "accuracy" is
    returned so the caller can continue.
    """
    metric_path = subset["callbacks.early_stopping.monitor"].unique()
    if len(metric_path) == 1:
        return metric_path[0].rsplit("/", 1)[-1]

    # Printed instead of raised so the runs can still be inspected manually.
    print(f"Warning: Multiple monitor metrics found: {metric_path}. Defaulting to accuracy/mse.")
    return "accuracy"

# =============================================================================
# 2. Core Processing Logic
# =============================================================================
def process_dataset_model(df, dataset, model, config_columns):
    """Rank the hyperparameter configurations of one (dataset, model) pair.

    The runs of the pair are grouped by the configuration columns that
    actually vary, groups that do not cover every seed in ``REQUIRED_SEEDS``
    are discarded, and the test metrics of each remaining group are
    aggregated to mean and standard deviation over its runs.

    Relies on the module-level ``SEED_COL`` and ``REQUIRED_SEEDS`` constants
    and on ``get_performance_cols`` / ``get_metric_name``.

    Args:
        df: Flattened runs table, one row per run.
        dataset: Value matched against ``dataset.loader.parameters.data_name``.
        model: Value matched against ``model.backbone._target_``.
        config_columns: Candidate hyperparameter columns; only those with more
            than one distinct value among the selected runs are grouped on.

    Returns:
        ``None`` when there are no matching runs, no test metrics, or no
        configuration with all required seeds. Otherwise a dict with the keys
        ``dataset``, ``model``, ``task_type``, ``optimized_metric``,
        ``num_configs``, ``hyperparameter_grid``, ``best_config``,
        ``full_leaderboard`` and ``aggregation_columns``, plus one
        ``"mean ± std"`` string per metric name taken from the best
        configuration.
    """
    subset = df[
        (df["dataset.loader.parameters.data_name"] == dataset)
        & (df["model.backbone._target_"] == model)
    ].copy()

    if subset.empty:
        return None

    # Silence warnings for this assignment only. catch_warnings restores the
    # previous filters on exit, whereas filterwarnings("ignore") followed by
    # filterwarnings("default") permanently rewrote the global filter list.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        subset["Model"] = model

    # Early-stopping mode decides the task type and the ranking direction.
    modes = subset["callbacks.early_stopping.mode"].unique()
    if len(modes) == 0:
        print(f"[SKIP] {model} on {dataset}: No early_stopping mode found.")
        return None
    if len(modes) > 1:
        # The first mode is used, as before; mixed modes are now at least visible.
        print(
            f"[WARNING] {model} on {dataset}: multiple early_stopping modes "
            f"{list(modes)}; using '{modes[0]}'."
        )

    mode = modes[0]
    if mode == "min":
        task_type = "regression"
        ascending = True
    else:
        # "max", and any unexpected value, is treated as classification.
        task_type = "classification"
        ascending = False

    # Sorted so that ranking and reporting do not depend on the order in which
    # the helper happens to return the columns.
    performance_cols = sorted(set(get_performance_cols(subset, task_type)))

    # SAFETY CHECK: If no metrics found, skip
    if not performance_cols:
        print(f"[SKIP] {model} on {dataset}: No test metrics found.")
        return None

    # Non-numeric metric values become NaN; runs lacking any metric are dropped.
    subset[performance_cols] = subset[performance_cols].apply(pd.to_numeric, errors='coerce')
    nan_mask = subset[performance_cols].isna().any(axis=1)
    subset = subset[~nan_mask].reset_index(drop=True)

    if subset.empty:
        return None

    # Determine which config columns (and the seed column) actually vary.
    unique_colums_values = {}
    check_cols = list(config_columns)
    if SEED_COL not in check_cols and SEED_COL in subset.columns:
        check_cols.append(SEED_COL)

    for col in check_cols:
        if col not in subset.columns:
            continue
        try:
            # Lists are unhashable; compare them as tuples.
            unique_vals = subset[col].apply(lambda x: tuple(x) if isinstance(x, list) else x).unique()
        except Exception:
            # Best effort: a column that cannot be compared is not grouped on.
            continue
        if len(unique_vals) > 1:
            unique_colums_values[col] = sorted(list(unique_vals), key=lambda x: str(x))

    print("---------------------------------------------------------------")
    print(f"UNIQUE VALUES FOR MODEL: {model}; DATASET: {dataset} ({task_type.upper()}):")
    print("---------------------------------------------------------------")
    for col, unique in unique_colums_values.items():
        print(f"{col}: {unique}")
    print("---------------------------END---------------------------------")

    # The seed is what gets averaged over, so it is never a grouping key.
    unique_colums_values.pop(SEED_COL, None)

    aggregation_columns = ["Model"] + list(unique_colums_values.keys())

    def has_required_seeds(group):
        # Compare as floats so that 0 and 0.0 count as the same seed.
        req_seeds = {float(s) for s in REQUIRED_SEEDS}
        cur_seeds_float = {float(s) for s in group[SEED_COL].unique()}
        return req_seeds.issubset(cur_seeds_float)

    # Ensure aggregation columns are strings to avoid grouping errors
    for col in aggregation_columns:
        subset[col] = subset[col].astype(str)

    subset_filtered = subset.groupby(aggregation_columns).filter(has_required_seeds)

    if subset_filtered.empty:
        return None

    # -------------------------------------------------------------------------
    # Safety check: report columns that vary inside a single configuration,
    # because the aggregation below silently averages over them.
    # -------------------------------------------------------------------------
    potential_hps = [
        c for c in subset.columns
        if "test" not in c and "val" not in c and "train" not in c
        and c not in ["mean", "std", "Model", SEED_COL] and "time" not in c.lower() and "dir" not in c.lower()
    ]

    grouped_check = subset_filtered.groupby(aggregation_columns)
    warning_buffer = []

    for col in potential_hps:
        if col in aggregation_columns:
            continue

        # Check if this column varies within ANY single group (config).
        try:
            has_variation = grouped_check[col].apply(lambda x: len(x.unique()) > 1).any()
        except Exception:
            # Unhashable values cannot be compared; treat as not varying.
            has_variation = False

        if not has_variation:
            continue

        try:
            # Numeric columns are summarised by range and mean.
            numeric_series = pd.to_numeric(subset_filtered[col], errors='raise')
            v_min = numeric_series.min()
            v_mean = numeric_series.mean()
            v_max = numeric_series.max()
            stats_msg = f"Range: [{v_min:.4f} - {v_max:.4f}], Mean: {v_mean:.4f}"
        except Exception:
            # Otherwise show a short preview of the distinct values.
            u_vals = subset_filtered[col].unique()
            preview = ", ".join([str(v) for v in u_vals[:3]])
            if len(u_vals) > 3:
                preview += "..."
            stats_msg = f"Values: {preview}"

        warning_buffer.append(f"  [VARIES] '{col}' -> {stats_msg}")

    if warning_buffer:
        print(f"\n[WARNING] Hidden variations detected for {model} on {dataset}:")
        for msg in warning_buffer:
            print(msg)
        print("  -> Averaging occurred over these variations.")
    # -------------------------------------------------------------------------

    num_unique_configs = len(grouped_check)
    valid_grid = {}
    for col in aggregation_columns:
        if col == "Model":
            continue
        u_vals = subset_filtered[col].unique()
        valid_grid[col] = sorted(list(u_vals), key=lambda x: str(x))

    # Aggregate every metric to mean and std per configuration.
    agg_dict = {col: ["mean", "std"] for col in performance_cols}
    aggregated = subset_filtered.groupby(aggregation_columns).agg(agg_dict).reset_index()

    # Choose the ranking column: prefer a column whose metric name equals the
    # optimised metric, then a substring match, then the first metric. Every
    # entry of performance_cols has a "mean" column in `aggregated`.
    optimized_metric_name = get_metric_name(subset)
    exact_matches = [c for c in performance_cols if c.split("/")[-1] == optimized_metric_name]
    loose_matches = [c for c in performance_cols if optimized_metric_name in c]
    sort_col = (exact_matches or loose_matches or performance_cols)[0]

    leaderboard = aggregated.sort_values(
        by=(sort_col, "mean"),
        ascending=ascending
    ).copy()

    # -------------------------------------------------------------------------
    # Scaling: "max" metrics are shown as percentages with two decimals,
    # everything else is rounded to four decimals.
    # -------------------------------------------------------------------------
    cols_to_scale = [
        col for col in leaderboard.columns
        if isinstance(col, tuple) and col[0] in performance_cols
    ]

    if cols_to_scale:
        if mode == "max":
            leaderboard.loc[:, cols_to_scale] = (leaderboard.loc[:, cols_to_scale] * 100).round(2)
        else:
            leaderboard.loc[:, cols_to_scale] = leaderboard.loc[:, cols_to_scale].round(4)
    # -------------------------------------------------------------------------

    best_row_df = leaderboard.head(1)

    result_data = {
        "dataset": dataset,
        "model": model,
        "task_type": task_type,
        "optimized_metric": optimized_metric_name,
        "num_configs": num_unique_configs,
        "hyperparameter_grid": valid_grid,
        "best_config": best_row_df,
        "full_leaderboard": leaderboard,
        "aggregation_columns": aggregation_columns
    }

    # Several test prefixes can expose the same metric name. The ranking column
    # is reported first and the first column per name wins, so the reported
    # value of the optimised metric is always the one used for ranking.
    report_order = [sort_col] + [c for c in performance_cols if c != sort_col]
    for metric_col in report_order:
        simple_name = metric_col.split("/")[-1]
        if simple_name in result_data:
            continue
        mean_val = best_row_df[(metric_col, "mean")].values[0]
        std_val = best_row_df[(metric_col, "std")].values[0]
        result_data[simple_name] = f"{mean_val:.2f} ± {std_val:.2f}"

    return result_data

# =============================================================================
# 3. Execution
# =============================================================================
# Every (dataset, model) combination present in the table is processed.
unique_models = df["model.backbone._target_"].unique()
unique_datasets = df["dataset.loader.parameters.data_name"].unique()

print("Processing results...")
all_results = []

for dataset in unique_datasets:
    for model in unique_models:
        try:
            res = process_dataset_model(df, dataset, model, config_columns)
        except ValueError as e:
            # Announce the failure before propagating it.
            print(f"\n!!! EXECUTION STOPPED !!!\n{e}")
            raise e
        # Pairs without usable runs yield None and are left out.
        if res:
            all_results.append(res)

print(f"Done. Processed {len(all_results)} pairs.")

# =============================================================================
# 4. Display Functions (Unchanged)
# =============================================================================

def display_performance_tables(results_list):
    """Show one model-by-dataset table per metric, separately for each task type.

    Every key of the result dicts that is not bookkeeping is treated as a
    metric column; cells without a result are shown as "-". Nothing is shown
    for an empty ``results_list``.
    """
    if not results_list:
        return

    # Result keys that describe the experiment rather than a metric.
    bookkeeping = {
        "dataset", "model", "task_type", "optimized_metric", "best_config",
        "num_configs", "hyperparameter_grid", "full_leaderboard",
        "aggregation_columns",
    }
    df_results = pd.DataFrame(results_list)

    for task in ("classification", "regression"):
        task_df = df_results[df_results["task_type"] == task]
        if task_df.empty:
            continue

        print(f"\n{'='*40}\n  PERFORMANCE: {task.upper()}\n{'='*40}")

        for metric in task_df.columns:
            if metric in bookkeeping:
                continue
            raw_table = task_df.pivot_table(
                index="model", columns="dataset", values=metric, aggfunc="first"
            )
            pivot = raw_table.fillna("-")
            print(f"\nMetric: {metric.upper()}")
            display(pivot)

def display_experiment_stats(results_list):
    """Show a model-by-dataset table with the number of completed configurations.

    Combinations without a result are shown as 0. Nothing is shown for an
    empty ``results_list``.
    """
    if not results_list:
        return

    pivot_kwargs = dict(
        index="model", columns="dataset", values="num_configs", aggfunc="first"
    )
    raw_counts = pd.DataFrame(results_list).pivot_table(**pivot_kwargs)
    count_pivot = raw_counts.fillna(0).astype(int)

    print(f"\n{'='*40}\n  COMPLETED HP CONFIGURATIONS (Count)\n{'='*40}")
    display(count_pivot)

def display_hp_impact(results_list):
    """Show, per result entry, its leaderboard reduced to hyperparameters and score.

    For every entry the grouped hyperparameter columns are shown next to the
    mean and std of the optimised metric, in the order already stored in the
    entry's ``full_leaderboard``. The stored leaderboard is copied and never
    modified. Nothing is shown for an empty ``results_list``.
    """
    if not results_list:
        return

    print(f"\n{'='*40}\n  HYPERPARAMETER IMPACT ANALYSIS\n{'='*40}")
    print("Listing all configurations sorted by performance.")

    for entry in results_list:
        d = entry['dataset']
        m = entry['model']
        leaderboard = entry['full_leaderboard'].copy()
        opt_metric = entry['optimized_metric']

        # 1. Work out the names of the mean/std columns of the optimised metric.
        if not isinstance(leaderboard.columns, pd.MultiIndex):
            target_mean = f"test/{opt_metric}"
            target_std = f"test/{opt_metric}_std"
        else:
            # Flatten (name, stat) pairs to "name|stat"; an empty stat level
            # leaves the bare name.
            flat_names = []
            for col in leaderboard.columns:
                top, stat = str(col[0]), str(col[1])
                flat_names.append(top if stat == "" else f"{top}|{stat}")
            leaderboard.columns = flat_names

            # First "...|mean" column mentioning the optimised metric, with
            # the plain test column as a fallback.
            candidates = (
                name.split("|mean")[0]
                for name in leaderboard.columns
                if opt_metric in name and "|mean" in name
            )
            metric_base = next(candidates, None) or f"test/{opt_metric}"

            target_mean = f"{metric_base}|mean"
            target_std = f"{metric_base}|std"

        # 2. Select hyperparameter columns, then the metric columns.
        selected_cols = [
            hp
            for hp in entry['aggregation_columns']
            if hp != "Model" and hp in leaderboard.columns
        ]
        rename_map = {}

        if target_mean in leaderboard.columns:
            selected_cols.append(target_mean)
            rename_map[target_mean] = f"{opt_metric} (Mean)"

        if target_std in leaderboard.columns:
            selected_cols.append(target_std)
            rename_map[target_std] = f"{opt_metric} (Std)"

        if not selected_cols:
            print(f"\n>>> {m} on {d}: Columns not found.")
            continue

        # 3. Create the final view with readable metric headers.
        final_view = leaderboard[selected_cols].rename(columns=rename_map)

        print(f"\n>>> {m} on {d} (Sorted by {opt_metric})")
        display(final_view)

# =============================================================================
# 5. Run All Displays
# =============================================================================
# Number of completed configurations per model and dataset.
display_experiment_stats(all_results)
# Best-configuration metrics, one table per metric and task type.
display_performance_tables(all_results)
# Per-pair leaderboards reduced to hyperparameters and the optimised metric.
display_hp_impact(all_results)

Processing results...
---------------------------------------------------------------
UNIQUE VALUES FOR MODEL: GCN; DATASET: hm-categories (CLASSIFICATION):
---------------------------------------------------------------
model.readout.hidden_dim: [16.0, 32.0, 64.0, 8.0]
model.backbone.dropout: [0.0, 0.1, 0.2]
model.backbone.num_layers: [1.0, 2.0, 3.0, 4.0, 8.0]
model.backbone.in_channels: [16.0, 32.0, 64.0, 8.0]
model.backbone.hidden_channels: [16.0, 32.0, 64.0, 8.0]
model.feature_encoder.out_channels: [16.0, 32.0, 64.0, 8.0]
model.feature_encoder.proj_dropout: [0.0, 0.1, 0.2]
model.backbone_wrapper.out_channels: [16.0, 32.0, 64.0, 8.0]
dataset.split_params.data_seed: [0.0, 3.0, 5.0, 7.0, 9.0]
optimizer.parameters.lr: [0.0001, 0.0003, 0.001, 0.003, 3e-05]
---------------------------END---------------------------------

  [VARIES] '_step' -> Range: [122.0000 - 1530.0000], Mean: 884.1911
  [VARIES] 'best_epoch' -> Range: [9.0000 - 649.0000], Mean: 355.4505
  [VARIES] 'best_epoch/checkpoi

dataset,artnet-exp,city-reviews,hm-categories,pokec-regions,tolokers-2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GCN,896,898,899,647,890
GIN,0,0,585,0,0



  PERFORMANCE: CLASSIFICATION

Metric: ACCURACY


dataset,artnet-exp,city-reviews,hm-categories,pokec-regions,tolokers-2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GCN,91.24 ± 0.20,94.08 ± 0.17,38.12 ± 0.50,8.57 ± 0.24,81.58 ± 0.18
GIN,-,-,51.35 ± 0.54,-,-



Metric: F1_SCORE


dataset,artnet-exp,city-reviews,hm-categories,pokec-regions,tolokers-2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GCN,91.24 ± 0.20,94.08 ± 0.17,38.12 ± 0.50,8.57 ± 0.24,81.58 ± 0.18
GIN,-,-,51.35 ± 0.54,-,-



Metric: RECALL


dataset,artnet-exp,city-reviews,hm-categories,pokec-regions,tolokers-2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GCN,62.66 ± 0.64,81.58 ± 0.57,27.76 ± 0.50,2.15 ± 0.08,65.01 ± 1.08
GIN,-,-,39.15 ± 0.64,-,-



Metric: JACCARD


dataset,artnet-exp,city-reviews,hm-categories,pokec-regions,tolokers-2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GCN,57.26 ± 0.63,75.31 ± 0.64,18.04 ± 0.35,0.66 ± 0.03,54.84 ± 0.93
GIN,-,-,27.93 ± 0.66,-,-



Metric: MCC


dataset,artnet-exp,city-reviews,hm-categories,pokec-regions,tolokers-2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GCN,38.16 ± 1.54,69.97 ± 0.94,31.18 ± 0.59,6.20 ± 0.28,37.99 ± 1.39
GIN,-,-,46.16 ± 0.60,-,-



Metric: PRECISION


dataset,artnet-exp,city-reviews,hm-categories,pokec-regions,tolokers-2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GCN,78.78 ± 1.38,88.77 ± 0.41,34.68 ± 0.76,1.56 ± 0.09,74.05 ± 0.21
GIN,-,-,49.13 ± 3.00,-,-



Metric: COHEN_KAPPA


dataset,artnet-exp,city-reviews,hm-categories,pokec-regions,tolokers-2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GCN,34.23 ± 1.49,69.38 ± 0.98,30.99 ± 0.59,5.96 ± 0.27,35.72 ± 1.90
GIN,-,-,46.05 ± 0.60,-,-



  HYPERPARAMETER IMPACT ANALYSIS
Listing all configurations sorted by performance.

>>> GCN on hm-categories (Sorted by accuracy)


Unnamed: 0,model.readout.hidden_dim,model.backbone.dropout,model.backbone.num_layers,model.backbone.in_channels,model.backbone.hidden_channels,model.feature_encoder.out_channels,model.feature_encoder.proj_dropout,model.backbone_wrapper.out_channels,optimizer.parameters.lr,accuracy (Mean),accuracy (Std)
468,64.0,0.0,2.0,64.0,64.0,64.0,0.0,64.0,0.003,38.12,0.50
473,64.0,0.0,2.0,64.0,64.0,64.0,0.1,64.0,0.003,37.62,0.36
543,64.0,0.1,2.0,64.0,64.0,64.0,0.0,64.0,0.003,37.51,0.48
483,64.0,0.0,3.0,64.0,64.0,64.0,0.0,64.0,0.003,37.41,0.35
548,64.0,0.1,2.0,64.0,64.0,64.0,0.1,64.0,0.003,37.35,0.34
...,...,...,...,...,...,...,...,...,...,...,...
778,8.0,0.1,2.0,8.0,8.0,8.0,0.2,8.0,3e-05,2.58,0.13
773,8.0,0.1,2.0,8.0,8.0,8.0,0.1,8.0,3e-05,2.57,0.15
843,8.0,0.2,2.0,8.0,8.0,8.0,0.0,8.0,3e-05,2.56,0.15
693,8.0,0.0,2.0,8.0,8.0,8.0,0.0,8.0,3e-05,2.56,0.16



>>> GIN on hm-categories (Sorted by accuracy)


Unnamed: 0,model.readout.hidden_dim,model.backbone.dropout,model.backbone.num_layers,model.backbone.in_channels,model.backbone.hidden_channels,model.feature_encoder.out_channels,model.feature_encoder.proj_dropout,model.backbone_wrapper.out_channels,optimizer.parameters.lr,accuracy (Mean),accuracy (Std)
292,64.0,0.0,1.0,64.0,64.0,64.0,0.0,64.0,0.003,51.35,0.54
388,64.0,0.2,1.0,64.0,64.0,64.0,0.0,64.0,0.003,51.33,0.46
340,64.0,0.1,1.0,64.0,64.0,64.0,0.0,64.0,0.003,51.18,0.38
345,64.0,0.1,1.0,64.0,64.0,64.0,0.1,64.0,0.003,49.81,0.50
297,64.0,0.0,1.0,64.0,64.0,64.0,0.1,64.0,0.003,49.74,0.57
...,...,...,...,...,...,...,...,...,...,...,...
68,16.0,0.1,2.0,16.0,16.0,16.0,0.0,16.0,3e-05,0.86,0.05
24,16.0,0.0,2.0,16.0,16.0,16.0,0.1,16.0,3e-05,0.86,0.05
116,16.0,0.2,2.0,16.0,16.0,16.0,0.0,16.0,3e-05,0.86,0.05
121,16.0,0.2,2.0,16.0,16.0,16.0,0.1,16.0,3e-05,0.86,0.05



>>> GCN on pokec-regions (Sorted by accuracy)


Unnamed: 0,model.readout.hidden_dim,model.backbone.dropout,model.backbone.num_layers,model.backbone.in_channels,model.backbone.hidden_channels,model.feature_encoder.out_channels,model.feature_encoder.proj_dropout,model.backbone_wrapper.out_channels,optimizer.parameters.lr,accuracy (Mean),accuracy (Std)
288,32.0,0.0,8.0,32.0,32.0,32.0,0.0,32.0,0.003,8.57,0.24
293,32.0,0.0,8.0,32.0,32.0,32.0,0.1,32.0,0.003,7.49,0.08
273,32.0,0.0,4.0,32.0,32.0,32.0,0.0,32.0,0.003,7.06,0.04
348,32.0,0.1,4.0,32.0,32.0,32.0,0.0,32.0,0.003,7.06,0.07
258,32.0,0.0,3.0,32.0,32.0,32.0,0.0,32.0,0.003,6.97,0.10
...,...,...,...,...,...,...,...,...,...,...,...
582,8.0,0.2,1.0,8.0,8.0,8.0,0.1,8.0,3e-05,0.39,0.01
432,8.0,0.0,1.0,8.0,8.0,8.0,0.1,8.0,3e-05,0.39,0.01
427,8.0,0.0,1.0,8.0,8.0,8.0,0.0,8.0,3e-05,0.38,0.01
577,8.0,0.2,1.0,8.0,8.0,8.0,0.0,8.0,3e-05,0.38,0.01



>>> GCN on tolokers-2 (Sorted by accuracy)


Unnamed: 0,model.readout.hidden_dim,model.backbone.dropout,model.backbone.num_layers,model.backbone.in_channels,model.backbone.hidden_channels,model.feature_encoder.out_channels,model.feature_encoder.proj_dropout,model.backbone_wrapper.out_channels,optimizer.parameters.lr,accuracy (Mean),accuracy (Std)
480,64.0,0.0,3.0,64.0,64.0,64.0,0.0,64.0,0.003,81.58,0.18
615,64.0,0.2,2.0,64.0,64.0,64.0,0.1,64.0,0.003,81.58,0.48
505,64.0,0.0,4.0,64.0,64.0,64.0,0.2,64.0,0.003,81.58,0.58
551,64.0,0.1,3.0,64.0,64.0,64.0,0.0,64.0,0.003,81.56,0.61
500,64.0,0.0,4.0,64.0,64.0,64.0,0.1,64.0,0.003,81.54,0.55
...,...,...,...,...,...,...,...,...,...,...,...
158,16.0,0.2,1.0,16.0,16.0,16.0,0.1,16.0,3e-05,31.05,0.69
9,16.0,0.0,1.0,16.0,16.0,16.0,0.1,16.0,3e-05,31.05,0.69
14,16.0,0.0,1.0,16.0,16.0,16.0,0.2,16.0,3e-05,31.05,0.72
163,16.0,0.2,1.0,16.0,16.0,16.0,0.2,16.0,3e-05,31.05,0.72



>>> GCN on city-reviews (Sorted by accuracy)


Unnamed: 0,model.readout.hidden_dim,model.backbone.dropout,model.backbone.num_layers,model.backbone.in_channels,model.backbone.hidden_channels,model.feature_encoder.out_channels,model.feature_encoder.proj_dropout,model.backbone_wrapper.out_channels,optimizer.parameters.lr,accuracy (Mean),accuracy (Std)
587,64.0,0.1,8.0,64.0,64.0,64.0,0.0,64.0,0.003,94.08,0.17
647,64.0,0.2,4.0,64.0,64.0,64.0,0.0,64.0,0.003,94.07,0.15
517,64.0,0.0,8.0,64.0,64.0,64.0,0.1,64.0,0.003,94.07,0.17
497,64.0,0.0,4.0,64.0,64.0,64.0,0.0,64.0,0.003,94.07,0.14
667,64.0,0.2,8.0,64.0,64.0,64.0,0.1,64.0,0.003,94.06,0.18
...,...,...,...,...,...,...,...,...,...,...,...
428,32.0,0.2,4.0,32.0,32.0,32.0,0.1,32.0,3e-05,43.03,0.23
278,32.0,0.0,4.0,32.0,32.0,32.0,0.1,32.0,3e-05,43.02,0.21
423,32.0,0.2,4.0,32.0,32.0,32.0,0.0,32.0,3e-05,43.02,0.23
348,32.0,0.1,4.0,32.0,32.0,32.0,0.0,32.0,3e-05,43.01,0.22



>>> GCN on artnet-exp (Sorted by accuracy)


Unnamed: 0,model.readout.hidden_dim,model.backbone.dropout,model.backbone.num_layers,model.backbone.in_channels,model.backbone.hidden_channels,model.feature_encoder.out_channels,model.feature_encoder.proj_dropout,model.backbone_wrapper.out_channels,optimizer.parameters.lr,accuracy (Mean),accuracy (Std)
500,64.0,0.0,4.0,64.0,64.0,64.0,0.1,64.0,0.003,91.24,0.20
505,64.0,0.0,4.0,64.0,64.0,64.0,0.2,64.0,0.003,91.16,0.32
570,64.0,0.1,4.0,64.0,64.0,64.0,0.0,64.0,0.003,91.15,0.16
495,64.0,0.0,4.0,64.0,64.0,64.0,0.0,64.0,0.003,91.14,0.16
281,32.0,0.0,4.0,32.0,32.0,32.0,0.2,32.0,0.003,91.11,0.18
...,...,...,...,...,...,...,...,...,...,...,...
128,16.0,0.1,4.0,16.0,16.0,16.0,0.1,16.0,3e-05,28.17,0.15
59,16.0,0.0,4.0,16.0,16.0,16.0,0.2,16.0,3e-05,28.16,0.13
54,16.0,0.0,4.0,16.0,16.0,16.0,0.1,16.0,3e-05,28.13,0.13
123,16.0,0.1,4.0,16.0,16.0,16.0,0.0,16.0,3e-05,28.12,0.13


In [21]:
import pandas as pd
import numpy as np
import warnings
from collections import defaultdict
from IPython.display import display

# =============================================================================
# 1. Configuration & Constants
# =============================================================================
# A configuration is only kept if it has a run for every one of these seeds.
REQUIRED_SEEDS = {0, 3, 5, 7, 9}
# The same seeds as a comma-separated string, in ascending order.
SEEDS_STR = ",".join(str(seed) for seed in sorted(REQUIRED_SEEDS))
# Column that stores the seed of each run.
SEED_COL = "dataset.split_params.data_seed"
PROJECT_NAME = "TopoBench_Reproduction"
OUTPUT_FILE = "best_runs.sh"

# Metric names (the part after the last "/") that are reported per task type.
TASK_METRICS = {
    "classification": [
        "accuracy",
        "precision",
        "recall",
        "f1_score",
        "cohen_kappa",
        "mcc",
        "jaccard",
    ],
    "regression": [
        "mse",
        "mae",
        "r2",
    ],
}

def get_performance_cols(subset, task_type):
    """List the test-metric columns of ``subset`` relevant to ``task_type``.

    A column qualifies when its name contains ``"test"`` and ``"/"`` and the
    part after the last ``"/"`` is one of ``TASK_METRICS[task_type]``. Test
    columns ending in the metric named by the first
    ``callbacks.early_stopping.monitor`` value are added as well.

    Args:
        subset: DataFrame of runs.
        task_type: Key into ``TASK_METRICS``; unknown keys select no keywords.

    Returns:
        List of column names without duplicates, in a deterministic order:
        keyword matches in column order, then monitored-metric matches in
        column order. (Going through ``set`` made the order depend on string
        hash randomisation, i.e. change between kernel restarts.)
    """
    keywords = TASK_METRICS.get(task_type, [])

    found_cols = [
        col
        for col in subset.columns
        if "test" in col and "/" in col and col.split("/")[-1] in keywords
    ]

    # Best effort: the monitored metric is optional. The column may be absent
    # (KeyError), the frame empty (IndexError) or the value not a string such
    # as NaN (AttributeError); in those cases only keyword matches are used.
    try:
        opt_metric_path = subset["callbacks.early_stopping.monitor"].iloc[0]
        opt_metric_name = opt_metric_path.split("/")[-1]
    except (KeyError, IndexError, AttributeError):
        opt_metric_name = None

    if opt_metric_name is not None:
        found_cols.extend(
            col
            for col in subset.columns
            if "test" in col and col.endswith(f"/{opt_metric_name}")
        )

    # Order-preserving de-duplication.
    return list(dict.fromkeys(found_cols))

def get_metric_name(subset):
    """Return the name of the metric monitored by early stopping.

    The name is the part after the last "/" of the single distinct value in
    ``callbacks.early_stopping.monitor``; "accuracy" is returned when the
    column does not hold exactly one distinct value.
    """
    monitored = subset["callbacks.early_stopping.monitor"].unique()
    if len(monitored) == 1:
        return monitored[0].rsplit("/", 1)[-1]
    return "accuracy"

def clean_value(val):
    """Clean a value for CLI usage.

    - pandas Series/Index: the first element is used; empty ones give None.
    - numpy arrays: a single element is unwrapped, several elements are
      returned as a plain list, an empty array gives None.
    - lists/tuples: returned unchanged, except that an empty one, or one
      holding a single missing value, gives None.
    - numpy scalars (anything with ``.item()``) become Python scalars.
    - None, NaN/NA and "" give None.
    - Integer-valued floats and numeric strings become int
      (21.0 -> 21, "21.0" -> 21).

    Returns:
        The cleaned value, or None for empty/missing input.
    """
    if isinstance(val, (pd.Series, pd.Index)):
        if len(val) == 0:
            return None
        val = val.iloc[0] if isinstance(val, pd.Series) else val[0]

    if isinstance(val, np.ndarray):
        if val.size == 0:
            return None
        if val.size > 1:
            # `.item()` raises ValueError for more than one element, so a
            # multi-element array is handed back as a plain list instead.
            return val.tolist()
        val = val.item()
    elif isinstance(val, (list, tuple)):
        if len(val) == 0:
            return None
        # A one-element sequence holding a missing value counts as missing.
        if len(val) == 1 and pd.api.types.is_scalar(val[0]) and pd.isna(val[0]):
            return None
        # Any other sequence is kept as a sequence.
        return val

    if hasattr(val, 'item'):
        val = val.item()

    if pd.isna(val) or val is None or val == "":
        return None

    # Aggressive float->int conversion
    if isinstance(val, float) and val.is_integer():
        return int(val)
    if isinstance(val, str):
        try:
            f_val = float(val)
        except ValueError:
            # Not a number: keep the string as it is.
            return val
        if f_val.is_integer():
            return int(f_val)

    return val

# =============================================================================
# 2. Core Processing Logic
# =============================================================================
def process_dataset_model(df, dataset, model, config_columns):
    """Aggregate the runs of one (dataset, model) pair into a leaderboard.

    Steps, in order:
      1. Select the rows of ``df`` for ``dataset`` / ``model``.
      2. Derive the task type from the early-stopping mode
         (``"max"`` -> classification, anything else -> regression).
      3. Find the test-metric columns and drop rows where any is non-numeric.
      4. Treat every column of ``config_columns`` with more than one distinct
         value as a varying hyperparameter and group by those columns.
      5. Keep only groups that contain every seed in ``REQUIRED_SEEDS``.
      6. Print a warning for other non-metric columns that vary inside a group.
      7. Compute mean/std per group, sort by the optimized metric and scale
         (x100, 2 decimals for ``"max"`` mode; 4 decimals otherwise).

    Parameters
    ----------
    df : pandas.DataFrame
        All runs; must contain ``dataset.loader.parameters.data_name``,
        ``model.backbone._target_`` and ``callbacks.early_stopping.mode``.
    dataset, model :
        Values matched against the two selector columns above.
    config_columns : iterable of str
        Candidate hyperparameter columns.

    Returns
    -------
    dict or None
        ``None`` when nothing usable remains at any step. Otherwise a dict
        with keys ``dataset``, ``model``, ``task_type``, ``optimized_metric``,
        ``num_configs``, ``hyperparameter_grid``, ``best_config``,
        ``full_leaderboard``, ``aggregation_columns`` plus one
        ``"<mean> ± <std>"`` string per metric short name (the part after the
        last ``/``). NOTE(review): two metric columns sharing a short name
        overwrite each other in this dict; the later one wins.
    """
    subset = df[
        (df["dataset.loader.parameters.data_name"] == dataset)
        & (df["model.backbone._target_"] == model)
    ].copy()

    if subset.empty:
        return None

    # Silence warnings for this assignment only. catch_warnings restores the
    # previous filters on exit, so the caller's warning configuration is not
    # replaced by a global "default" filter.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        subset["Model"] = model

    # Mode & Task
    modes = subset["callbacks.early_stopping.mode"].unique()
    if len(modes) == 0:
        print(f"[SKIP] {model} on {dataset}: No early_stopping mode found.")
        return None
    mode = modes[0]
    task_type = "classification" if mode == "max" else "regression"
    ascending = mode != "max"

    # Metrics
    performance_cols = get_performance_cols(subset, task_type)
    if not performance_cols:
        print(f"[SKIP] {model} on {dataset}: No test metrics found.")
        return None

    subset[performance_cols] = subset[performance_cols].apply(pd.to_numeric, errors='coerce')
    subset = subset.dropna(subset=performance_cols).reset_index(drop=True)
    if subset.empty:
        return None

    # Determine Varying HPs
    unique_colums_values = {}
    check_cols = list(config_columns)
    if SEED_COL not in check_cols and SEED_COL in subset.columns:
        check_cols.append(SEED_COL)

    for col in check_cols:
        if col not in subset.columns:
            continue
        try:
            # Lists are converted to tuples before .unique() is applied.
            unique_vals = subset[col].apply(lambda x: tuple(x) if isinstance(x, list) else x).unique()
            if len(unique_vals) > 1:
                unique_colums_values[col] = sorted(list(unique_vals), key=lambda x: str(x))
        except Exception:
            # Best effort: a column whose values cannot be compared this way
            # is simply not treated as a varying hyperparameter.
            continue

    # The seed is what we average over, never a grouping key.
    unique_colums_values.pop(SEED_COL, None)
    aggregation_columns = ["Model"] + list(unique_colums_values.keys())

    # Seed Filter
    req_seeds = {float(s) for s in REQUIRED_SEEDS}

    def has_required_seeds(group):
        cur_seeds_float = {float(s) for s in group[SEED_COL].unique()}
        return req_seeds.issubset(cur_seeds_float)

    # Grouping keys are compared as strings from here on.
    for col in aggregation_columns:
        subset[col] = subset[col].astype(str)
    subset_filtered = subset.groupby(aggregation_columns).filter(has_required_seeds)
    if subset_filtered.empty:
        return None

    # Hidden Variations Check (Warning only)
    potential_hps = [
        c for c in subset.columns
        if "test" not in c and "val" not in c and "train" not in c
        and c not in ["mean", "std", "Model", SEED_COL] and "time" not in c.lower() and "dir" not in c.lower()
    ]

    grouped_check = subset_filtered.groupby(aggregation_columns)
    warning_buffer = []

    for col in potential_hps:
        if col in aggregation_columns:
            continue
        try:
            has_variation = grouped_check[col].apply(lambda x: len(x.unique()) > 1).any()
        except Exception:
            # Columns that cannot be checked are treated as not varying.
            has_variation = False

        if not has_variation:
            continue

        try:
            numeric_series = pd.to_numeric(subset_filtered[col], errors='raise')
            v_min, v_mean, v_max = numeric_series.min(), numeric_series.mean(), numeric_series.max()
            stats_msg = f"Range: [{v_min:.4f} - {v_max:.4f}], Mean: {v_mean:.4f}"
        except Exception:
            # Non-numeric column: show a short preview of distinct values.
            u_vals = subset_filtered[col].unique()
            preview = ", ".join([str(v) for v in u_vals[:3]])
            if len(u_vals) > 3:
                preview += "..."
            stats_msg = f"Values: {preview}"
        warning_buffer.append(f"  [VARIES] '{col}' -> {stats_msg}")

    if warning_buffer:
        print(f"\n[WARNING] Hidden variations detected for {model} on {dataset}:")
        for msg in warning_buffer:
            print(msg)
        print("  -> Averaging occurred over these variations.")

    # Aggregation
    num_unique_configs = len(subset_filtered.groupby(aggregation_columns))
    agg_dict = {col: ["mean", "std"] for col in performance_cols}
    aggregated = subset_filtered.groupby(aggregation_columns).agg(agg_dict).reset_index()

    # Sort: prefer the column whose last path component IS the optimized
    # metric, so that e.g. "accuracy" does not pick "balanced_accuracy".
    # Fall back to a substring match, then to the first metric column.
    optimized_metric_name = get_metric_name(subset)
    sort_col = next(
        (c for c in performance_cols if c.split("/")[-1] == optimized_metric_name),
        None,
    )
    if sort_col is None:
        sort_col = next((c for c in performance_cols if optimized_metric_name in c), performance_cols[0])
    if (sort_col, "mean") not in aggregated.columns:
        sort_col = aggregated.columns[1][0]

    leaderboard = aggregated.sort_values(by=(sort_col, "mean"), ascending=ascending).copy()

    # Scaling
    cols_to_scale = [c for c in leaderboard.columns if isinstance(c, tuple) and c[0] in performance_cols]
    if mode == "max" and cols_to_scale:
        leaderboard.loc[:, cols_to_scale] = (leaderboard.loc[:, cols_to_scale] * 100).round(2)
    elif cols_to_scale:
        leaderboard.loc[:, cols_to_scale] = leaderboard.loc[:, cols_to_scale].round(4)

    best_row_df = leaderboard.head(1)

    result_data = {
        "dataset": dataset,
        "model": model,
        "task_type": task_type,
        "optimized_metric": optimized_metric_name,
        "num_configs": num_unique_configs,
        "hyperparameter_grid": {col: sorted(subset_filtered[col].unique()) for col in aggregation_columns if col != "Model"},
        "best_config": best_row_df,
        "full_leaderboard": leaderboard,
        "aggregation_columns": aggregation_columns
    }

    for metric_col in performance_cols:
        mean_val = best_row_df[(metric_col, "mean")].values[0]
        std_val = best_row_df[(metric_col, "std")].values[0]
        simple_name = metric_col.split("/")[-1]
        result_data[simple_name] = f"{mean_val:.2f} ± {std_val:.2f}"

    return result_data

# =============================================================================
# 3. Execution
# =============================================================================
# Every (dataset, model) combination present in the runs table is attempted.
unique_models = df["model.backbone._target_"].unique()
unique_datasets = df["dataset.loader.parameters.data_name"].unique()

all_results = []
print("Processing results...")

for dataset in unique_datasets:
    for model in unique_models:
        try:
            res = process_dataset_model(df, dataset, model, config_columns)
        except ValueError as e:
            # The loop does not stop on a ValueError: report which pair
            # failed and carry on with the remaining combinations.
            print(f"\n[ERROR] {model} on {dataset} skipped: {e}")
            continue
        if res:
            all_results.append(res)

print(f"Done. Processed {len(all_results)} pairs.")

# =============================================================================
# 4. Display & Generation Functions
# =============================================================================
def display_performance_tables(results_list):
    """Show one model-by-dataset table per metric, separately for each task type.

    ``results_list`` is a list of result dicts. Every key that is not one of
    the bookkeeping fields listed below is treated as a metric column. Missing
    (model, dataset) cells are shown as ``"-"``. Nothing is shown for an empty
    input.
    """
    if not results_list:
        return

    bookkeeping = {
        "dataset", "model", "task_type", "optimized_metric", "best_config",
        "num_configs", "hyperparameter_grid", "full_leaderboard",
        "aggregation_columns",
    }
    rule = "=" * 40
    all_rows = pd.DataFrame(results_list)

    for task in ("classification", "regression"):
        rows_for_task = all_rows[all_rows["task_type"] == task]
        if rows_for_task.empty:
            continue

        print(f"\n{rule}\n  PERFORMANCE: {task.upper()}\n{rule}")
        for metric in rows_for_task.columns:
            if metric in bookkeeping:
                continue
            table = rows_for_task.pivot_table(
                index="model", columns="dataset", values=metric, aggfunc="first"
            )
            table = table.fillna("-")
            print(f"\nMetric: {metric.upper()}")
            display(table)

def display_hp_impact(results_list):
    """Show, per result entry, the hyperparameter columns next to the optimized metric.

    For each entry a copy of its ``full_leaderboard`` is taken and two-level
    column labels are flattened to ``"<level0>|<level1>"`` (or just
    ``"<level0>"`` when the second level is empty). The view keeps the
    aggregation columns except ``"Model"`` plus every column whose flattened
    name contains the optimized metric name. Nothing is shown for an empty
    input.
    """
    if not results_list:
        return

    rule = "=" * 40
    print(f"\n{rule}\n  HYPERPARAMETER IMPACT ANALYSIS\n{rule}")

    for entry in results_list:
        d, m = entry['dataset'], entry['model']
        board = entry['full_leaderboard'].copy()
        opt_metric = entry['optimized_metric']

        if isinstance(board.columns, pd.MultiIndex):
            board.columns = [
                str(label[0]) if str(label[1]) == "" else f"{str(label[0])}|{str(label[1])}"
                for label in board.columns
            ]

        hp_cols = [name for name in entry['aggregation_columns'] if name != "Model"]
        metric_cols = [name for name in board.columns if opt_metric in name]

        print(f"\n>>> {m} on {d} (Sorted by {opt_metric})")
        display(board[hp_cols + metric_cols])

def generate_best_runs_file(results_list, filename="best_runs.sh"):
    """Write a bash script with one reproduction command per result entry.

    Each command is ``python -m topobench`` followed by ``model=``/``dataset=``
    overrides, one ``key=value`` override per aggregation column (taken from
    the entry's best configuration row and normalised with ``clean_value``),
    the seed sweep (``SEEDS_STR``), the W&B project (``PROJECT_NAME``) and
    ``--multirun``.

    Every override token is shell-quoted, so values containing characters
    that bash treats specially (spaces, ``[``/``]`` of list values, ``*``, ...)
    reach the program as a single, unmodified argument. Tokens made only of
    shell-safe characters are written exactly as before.

    Parameters
    ----------
    results_list : list of dict
        Entries with ``dataset``, ``model``, ``best_config`` (a DataFrame
        whose first row is used) and ``aggregation_columns``.
    filename : str
        Path of the script to (over)write. Nothing is written when
        ``results_list`` is empty.
    """
    import shlex  # local import: only needed for quoting the script tokens

    if not results_list:
        return

    print(f"\n{'='*40}\n  GENERATING REPRODUCTION COMMANDS\n{'='*40}")
    commands = []

    for entry in results_list:
        dataset = entry['dataset']
        model = entry['model']
        best_row = entry['best_config'].iloc[0]

        overrides = [f"model={model}", f"dataset={dataset}"]

        # --- ONLY ADD VARIED HYPERPARAMETERS ---
        # aggregation_columns holds "Model" plus the grouping columns that
        # were used to build the leaderboard.
        for key in entry['aggregation_columns']:
            if key == "Model":
                continue  # Already added

            # The row index may be flat (key) or two-level ((key, "")).
            val = None
            if key in best_row:
                val = best_row[key]
            elif (key, "") in best_row:
                val = best_row[(key, "")]

            val = clean_value(val)
            if val is None:
                continue

            if isinstance(val, (list, tuple)):
                val = "[" + ",".join(map(str, val)) + "]"
            overrides.append(f"{key}={val}")

        # Finalize with Seeds and Project
        overrides.append(f"dataset.split_params.data_seed={SEEDS_STR}")
        overrides.append(f"logger.wandb.project={PROJECT_NAME}")

        cmd_parts = ["python -m topobench"]
        cmd_parts.extend(shlex.quote(token) for token in overrides)
        cmd_parts.append("--multirun")
        commands.append(" ".join(cmd_parts))

    # Explicit encoding keeps the script identical across platforms.
    with open(filename, "w", encoding="utf-8") as f:
        f.write("#!/bin/bash\n")
        f.write(f"# Best runs generated for project: {PROJECT_NAME}\n")
        f.write("# Contains ONLY model, dataset, varying HPs, and seeds.\n\n")
        for cmd in commands:
            f.write(cmd + "\n\n")

    print(f"Successfully wrote {len(commands)} commands to {filename}")

# =============================================================================
# 5. Run Everything
# =============================================================================
# Order matters for the printed report: first the per-metric model x dataset
# tables, then the per-pair hyperparameter leaderboards, and finally the
# reproduction script written to OUTPUT_FILE.
display_performance_tables(all_results)
display_hp_impact(all_results)
generate_best_runs_file(all_results, OUTPUT_FILE)

Processing results...

  [VARIES] '_step' -> Range: [122.0000 - 1530.0000], Mean: 884.1911
  [VARIES] 'best_epoch' -> Range: [9.0000 - 649.0000], Mean: 355.4505
  [VARIES] 'best_epoch/checkpoint' -> Values: /scratch/levtel/TB/logs/train/runs/2025-12-30_04-00-42/checkpoints/epoch_189.ckpt, /scratch/levtel/TB/logs/train/runs/2025-12-30_04-00-41/checkpoints/epoch_149.ckpt, /scratch/levtel/TB/logs/train/runs/2025-12-30_04-00-42/checkpoints/epoch_204-v1.ckpt...
  [VARIES] 'epoch' -> Range: [54.0000 - 694.0000], Mean: 400.4505
  [VARIES] 'lr-Adam' -> Range: [0.0000 - 0.0004], Mean: 0.0000
  -> Averaging occurred over these variations.

  [VARIES] '_step' -> Range: [122.0000 - 1508.0000], Mean: 575.3166
  [VARIES] 'best_epoch' -> Range: [9.0000 - 639.0000], Mean: 215.0530
  [VARIES] 'best_epoch/checkpoint' -> Values: /scratch/levtel/TB/logs/train/runs/2026-01-06_18-09-03/checkpoints/epoch_029.ckpt, /scratch/levtel/TB/logs/train/runs/2026-01-06_18-09-11/checkpoints/epoch_069.ckpt, /scratch/lev

dataset,artnet-exp,city-reviews,hm-categories,pokec-regions,tolokers-2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GCN,91.24 ± 0.20,94.08 ± 0.17,38.12 ± 0.50,8.57 ± 0.24,81.58 ± 0.18
GIN,-,-,51.35 ± 0.54,-,-



Metric: F1_SCORE


dataset,artnet-exp,city-reviews,hm-categories,pokec-regions,tolokers-2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GCN,91.24 ± 0.20,94.08 ± 0.17,38.12 ± 0.50,8.57 ± 0.24,81.58 ± 0.18
GIN,-,-,51.35 ± 0.54,-,-



Metric: RECALL


dataset,artnet-exp,city-reviews,hm-categories,pokec-regions,tolokers-2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GCN,62.66 ± 0.64,81.58 ± 0.57,27.76 ± 0.50,2.15 ± 0.08,65.01 ± 1.08
GIN,-,-,39.15 ± 0.64,-,-



Metric: JACCARD


dataset,artnet-exp,city-reviews,hm-categories,pokec-regions,tolokers-2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GCN,57.26 ± 0.63,75.31 ± 0.64,18.04 ± 0.35,0.66 ± 0.03,54.84 ± 0.93
GIN,-,-,27.93 ± 0.66,-,-



Metric: MCC


dataset,artnet-exp,city-reviews,hm-categories,pokec-regions,tolokers-2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GCN,38.16 ± 1.54,69.97 ± 0.94,31.18 ± 0.59,6.20 ± 0.28,37.99 ± 1.39
GIN,-,-,46.16 ± 0.60,-,-



Metric: PRECISION


dataset,artnet-exp,city-reviews,hm-categories,pokec-regions,tolokers-2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GCN,78.78 ± 1.38,88.77 ± 0.41,34.68 ± 0.76,1.56 ± 0.09,74.05 ± 0.21
GIN,-,-,49.13 ± 3.00,-,-



Metric: COHEN_KAPPA


dataset,artnet-exp,city-reviews,hm-categories,pokec-regions,tolokers-2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GCN,34.23 ± 1.49,69.38 ± 0.98,30.99 ± 0.59,5.96 ± 0.27,35.72 ± 1.90
GIN,-,-,46.05 ± 0.60,-,-



  HYPERPARAMETER IMPACT ANALYSIS

>>> GCN on hm-categories (Sorted by accuracy)


Unnamed: 0,model.readout.hidden_dim,model.backbone.dropout,model.backbone.num_layers,model.backbone.in_channels,model.backbone.hidden_channels,model.feature_encoder.out_channels,model.feature_encoder.proj_dropout,model.backbone_wrapper.out_channels,optimizer.parameters.lr,test_best_rerun/accuracy|mean,test_best_rerun/accuracy|std
468,64.0,0.0,2.0,64.0,64.0,64.0,0.0,64.0,0.003,38.12,0.50
473,64.0,0.0,2.0,64.0,64.0,64.0,0.1,64.0,0.003,37.62,0.36
543,64.0,0.1,2.0,64.0,64.0,64.0,0.0,64.0,0.003,37.51,0.48
483,64.0,0.0,3.0,64.0,64.0,64.0,0.0,64.0,0.003,37.41,0.35
548,64.0,0.1,2.0,64.0,64.0,64.0,0.1,64.0,0.003,37.35,0.34
...,...,...,...,...,...,...,...,...,...,...,...
778,8.0,0.1,2.0,8.0,8.0,8.0,0.2,8.0,3e-05,2.58,0.13
773,8.0,0.1,2.0,8.0,8.0,8.0,0.1,8.0,3e-05,2.57,0.15
843,8.0,0.2,2.0,8.0,8.0,8.0,0.0,8.0,3e-05,2.56,0.15
693,8.0,0.0,2.0,8.0,8.0,8.0,0.0,8.0,3e-05,2.56,0.16



>>> GIN on hm-categories (Sorted by accuracy)


Unnamed: 0,model.readout.hidden_dim,model.backbone.dropout,model.backbone.num_layers,model.backbone.in_channels,model.backbone.hidden_channels,model.feature_encoder.out_channels,model.feature_encoder.proj_dropout,model.backbone_wrapper.out_channels,optimizer.parameters.lr,test_best_rerun/accuracy|mean,test_best_rerun/accuracy|std
292,64.0,0.0,1.0,64.0,64.0,64.0,0.0,64.0,0.003,51.35,0.54
388,64.0,0.2,1.0,64.0,64.0,64.0,0.0,64.0,0.003,51.33,0.46
340,64.0,0.1,1.0,64.0,64.0,64.0,0.0,64.0,0.003,51.18,0.38
345,64.0,0.1,1.0,64.0,64.0,64.0,0.1,64.0,0.003,49.81,0.50
297,64.0,0.0,1.0,64.0,64.0,64.0,0.1,64.0,0.003,49.74,0.57
...,...,...,...,...,...,...,...,...,...,...,...
68,16.0,0.1,2.0,16.0,16.0,16.0,0.0,16.0,3e-05,0.86,0.05
24,16.0,0.0,2.0,16.0,16.0,16.0,0.1,16.0,3e-05,0.86,0.05
116,16.0,0.2,2.0,16.0,16.0,16.0,0.0,16.0,3e-05,0.86,0.05
121,16.0,0.2,2.0,16.0,16.0,16.0,0.1,16.0,3e-05,0.86,0.05



>>> GCN on pokec-regions (Sorted by accuracy)


Unnamed: 0,model.readout.hidden_dim,model.backbone.dropout,model.backbone.num_layers,model.backbone.in_channels,model.backbone.hidden_channels,model.feature_encoder.out_channels,model.feature_encoder.proj_dropout,model.backbone_wrapper.out_channels,optimizer.parameters.lr,test_best_rerun/accuracy|mean,test_best_rerun/accuracy|std
288,32.0,0.0,8.0,32.0,32.0,32.0,0.0,32.0,0.003,8.57,0.24
293,32.0,0.0,8.0,32.0,32.0,32.0,0.1,32.0,0.003,7.49,0.08
273,32.0,0.0,4.0,32.0,32.0,32.0,0.0,32.0,0.003,7.06,0.04
348,32.0,0.1,4.0,32.0,32.0,32.0,0.0,32.0,0.003,7.06,0.07
258,32.0,0.0,3.0,32.0,32.0,32.0,0.0,32.0,0.003,6.97,0.10
...,...,...,...,...,...,...,...,...,...,...,...
582,8.0,0.2,1.0,8.0,8.0,8.0,0.1,8.0,3e-05,0.39,0.01
432,8.0,0.0,1.0,8.0,8.0,8.0,0.1,8.0,3e-05,0.39,0.01
427,8.0,0.0,1.0,8.0,8.0,8.0,0.0,8.0,3e-05,0.38,0.01
577,8.0,0.2,1.0,8.0,8.0,8.0,0.0,8.0,3e-05,0.38,0.01



>>> GCN on tolokers-2 (Sorted by accuracy)


Unnamed: 0,model.readout.hidden_dim,model.backbone.dropout,model.backbone.num_layers,model.backbone.in_channels,model.backbone.hidden_channels,model.feature_encoder.out_channels,model.feature_encoder.proj_dropout,model.backbone_wrapper.out_channels,optimizer.parameters.lr,test_best_rerun/accuracy|mean,test_best_rerun/accuracy|std
480,64.0,0.0,3.0,64.0,64.0,64.0,0.0,64.0,0.003,81.58,0.18
615,64.0,0.2,2.0,64.0,64.0,64.0,0.1,64.0,0.003,81.58,0.48
505,64.0,0.0,4.0,64.0,64.0,64.0,0.2,64.0,0.003,81.58,0.58
551,64.0,0.1,3.0,64.0,64.0,64.0,0.0,64.0,0.003,81.56,0.61
500,64.0,0.0,4.0,64.0,64.0,64.0,0.1,64.0,0.003,81.54,0.55
...,...,...,...,...,...,...,...,...,...,...,...
158,16.0,0.2,1.0,16.0,16.0,16.0,0.1,16.0,3e-05,31.05,0.69
9,16.0,0.0,1.0,16.0,16.0,16.0,0.1,16.0,3e-05,31.05,0.69
14,16.0,0.0,1.0,16.0,16.0,16.0,0.2,16.0,3e-05,31.05,0.72
163,16.0,0.2,1.0,16.0,16.0,16.0,0.2,16.0,3e-05,31.05,0.72



>>> GCN on city-reviews (Sorted by accuracy)


Unnamed: 0,model.readout.hidden_dim,model.backbone.dropout,model.backbone.num_layers,model.backbone.in_channels,model.backbone.hidden_channels,model.feature_encoder.out_channels,model.feature_encoder.proj_dropout,model.backbone_wrapper.out_channels,optimizer.parameters.lr,test_best_rerun/accuracy|mean,test_best_rerun/accuracy|std
587,64.0,0.1,8.0,64.0,64.0,64.0,0.0,64.0,0.003,94.08,0.17
647,64.0,0.2,4.0,64.0,64.0,64.0,0.0,64.0,0.003,94.07,0.15
517,64.0,0.0,8.0,64.0,64.0,64.0,0.1,64.0,0.003,94.07,0.17
497,64.0,0.0,4.0,64.0,64.0,64.0,0.0,64.0,0.003,94.07,0.14
667,64.0,0.2,8.0,64.0,64.0,64.0,0.1,64.0,0.003,94.06,0.18
...,...,...,...,...,...,...,...,...,...,...,...
428,32.0,0.2,4.0,32.0,32.0,32.0,0.1,32.0,3e-05,43.03,0.23
278,32.0,0.0,4.0,32.0,32.0,32.0,0.1,32.0,3e-05,43.02,0.21
423,32.0,0.2,4.0,32.0,32.0,32.0,0.0,32.0,3e-05,43.02,0.23
348,32.0,0.1,4.0,32.0,32.0,32.0,0.0,32.0,3e-05,43.01,0.22



>>> GCN on artnet-exp (Sorted by accuracy)


Unnamed: 0,model.readout.hidden_dim,model.backbone.dropout,model.backbone.num_layers,model.backbone.in_channels,model.backbone.hidden_channels,model.feature_encoder.out_channels,model.feature_encoder.proj_dropout,model.backbone_wrapper.out_channels,optimizer.parameters.lr,test_best_rerun/accuracy|mean,test_best_rerun/accuracy|std
500,64.0,0.0,4.0,64.0,64.0,64.0,0.1,64.0,0.003,91.24,0.20
505,64.0,0.0,4.0,64.0,64.0,64.0,0.2,64.0,0.003,91.16,0.32
570,64.0,0.1,4.0,64.0,64.0,64.0,0.0,64.0,0.003,91.15,0.16
495,64.0,0.0,4.0,64.0,64.0,64.0,0.0,64.0,0.003,91.14,0.16
281,32.0,0.0,4.0,32.0,32.0,32.0,0.2,32.0,0.003,91.11,0.18
...,...,...,...,...,...,...,...,...,...,...,...
128,16.0,0.1,4.0,16.0,16.0,16.0,0.1,16.0,3e-05,28.17,0.15
59,16.0,0.0,4.0,16.0,16.0,16.0,0.2,16.0,3e-05,28.16,0.13
54,16.0,0.0,4.0,16.0,16.0,16.0,0.1,16.0,3e-05,28.13,0.13
123,16.0,0.1,4.0,16.0,16.0,16.0,0.0,16.0,3e-05,28.12,0.13



  GENERATING REPRODUCTION COMMANDS
Successfully wrote 6 commands to best_runs.sh


{'Dataset': 'hm-categories',
 'Model': 'GCN',
 'Command': 'python -m topobench model=GCN dataset=hm-categories dataset.loader.parameters.data_name=hm-categories dataset.loader.parameters.data_type=graphland dataset.loader.parameters.data_domain=graph dataset.loader.parameters.drop_missing_y=False dataset.loader.parameters.impute_missing_x.copy=True dataset.loader.parameters.impute_missing_x.strategy=most_frequent dataset.loader.parameters.impute_missing_x.add_indicator=False dataset.parameters.task=classification dataset.parameters.loss_type=cross_entropy dataset.parameters.task_level=node dataset.parameters.num_classes=21 dataset.parameters.num_features=35 dataset.split_params.k=10 dataset.split_params.split_type=random dataset.split_params.train_prop=0.5 dataset.split_params.learning_setting=transductive dataset.dataloader_params.batch_size=1 dataset.dataloader_params.pin_memory=False dataset.dataloader_params.num_workers=0 callbacks.early_stopping.strict=True callbacks.early_stoppin

### Runtime per epoch

In [44]:
# Flatten the two-level mapping into one row per (outer key, inner key) pair;
# the two key levels are labelled "Dataset" and "Model" below.
nested_dict = dict(collected_results_time)
result_dict = pd.DataFrame.from_dict(
    {
        (i, j): nested_dict[i][j]
        for i in nested_dict
        for j in nested_dict[i].keys()
    },
    orient="index",
)


result_dict = result_dict.round(2)
# Fixed two-decimal formatting keeps every cell the same width
# ("0.10 ± 0.04" rather than "0.1 ± 0.04"), matching the performance tables.
result_dict["performance"] = result_dict.apply(
    lambda x: f"{x['mean']:.2f} ± {x['std']:.2f}", axis=1
)
result_dict = result_dict.drop(["mean", "std"], axis=1)

# Reset multiindex
result_dict = result_dict.reset_index()
# rename columns
result_dict.columns = ["Dataset", "Model", "Average Time per Epoch"]

In [45]:
# Wide view: one row per model, one column per dataset.
pd.pivot_table(
    result_dict,
    values="Average Time per Epoch",
    index="Model",
    columns="Dataset",
    aggfunc="first",
)

Dataset,Cora,PubMed,amazon_ratings,artnet-exp,artnet-views,avazu-ctr,citeseer,city-reviews,city-roads-L,city-roads-M,hm-categories,pokec-regions,roman_empire,tolokers-2,twitch-views
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
GAT,0.19 ± 0.01,0.24 ± 0.02,0.04 ± 0.02,0.06 ± 0.04,0.05 ± 0.04,0.52 ± 0.05,0.25 ± 0.01,0.16 ± 0.09,0.05 ± 0.03,0.03 ± 0.02,0.42 ± 0.04,2.15 ± 0.05,0.03 ± 0.01,0.07 ± 0.06,0.33 ± 0.07
GCN,0.18 ± 0.01,0.23 ± 0.01,0.03 ± 0.01,0.03 ± 0.01,0.03 ± 0.01,0.56 ± 0.18,0.25 ± 0.01,0.1 ± 0.04,0.03 ± 0.01,0.02 ± 0.01,0.51 ± 0.17,11.82 ± 7.48,0.03 ± 0.0,0.03 ± 0.02,0.39 ± 0.23
GIN,0.18 ± 0.01,0.23 ± 0.01,0.02 ± 0.0,0.03 ± 0.01,0.03 ± 0.01,0.52 ± 0.16,0.25 ± 0.01,0.09 ± 0.04,0.03 ± 0.01,0.02 ± 0.01,0.52 ± 0.3,2.26 ± 0.16,0.03 ± 0.0,0.03 ± 0.01,0.31 ± 0.14


### Runtime

In [47]:
# Flatten the two-level mapping into one row per (outer key, inner key) pair;
# the two key levels are labelled "Dataset" and "Model" below.
nested_dict = dict(collected_results_time_run)
result_dict = pd.DataFrame.from_dict(
    {
        (i, j): nested_dict[i][j]
        for i in nested_dict
        for j in nested_dict[i].keys()
    },
    orient="index",
)


result_dict = result_dict.round(2)
# Fixed two-decimal formatting keeps every cell the same width
# ("42.80 ± 18.93" rather than "42.8 ± 18.93"), matching the performance tables.
result_dict["performance"] = result_dict.apply(
    lambda x: f"{x['mean']:.2f} ± {x['std']:.2f}", axis=1
)
result_dict = result_dict.drop(["mean", "std"], axis=1)

# Reset multiindex
result_dict = result_dict.reset_index()
# rename columns
result_dict.columns = ["Dataset", "Model", "Average Training Time"]


In [48]:
# Wide view: one row per model, one column per dataset.
pd.pivot_table(
    result_dict,
    values="Average Training Time",
    index="Model",
    columns="Dataset",
    aggfunc="first",
)

Dataset,Cora,PubMed,amazon_ratings,artnet-exp,artnet-views,avazu-ctr,citeseer,city-reviews,city-roads-L,city-roads-M,hm-categories,pokec-regions,roman_empire,tolokers-2,twitch-views
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
GAT,32.97 ± 17.08,42.8 ± 18.93,14.31 ± 7.73,12.48 ± 7.04,20.59 ± 19.19,99.65 ± 40.46,26.94 ± 12.52,68.29 ± 42.31,33.99 ± 21.12,18.57 ± 9.99,205.67 ± 42.4,1036.01 ± 168.84,13.36 ± 5.01,16.68 ± 13.98,170.26 ± 67.39
GCN,31.84 ± 14.85,38.52 ± 18.1,10.63 ± 3.34,12.31 ± 6.14,13.12 ± 4.77,141.18 ± 64.78,25.86 ± 10.97,40.85 ± 17.66,14.81 ± 6.96,9.59 ± 3.67,293.66 ± 124.7,17228.85 ± 12894.43,11.14 ± 3.43,10.68 ± 7.64,173.02 ± 95.12
GIN,25.3 ± 10.54,49.25 ± 25.52,7.72 ± 3.09,5.71 ± 2.94,8.83 ± 4.08,78.36 ± 49.69,26.95 ± 11.26,25.99 ± 17.45,11.49 ± 6.96,7.7 ± 3.95,112.08 ± 77.15,774.89 ± 376.36,11.45 ± 3.46,4.74 ± 2.02,112.97 ± 72.32
