In [1]:
import ast
import glob
import os
import warnings
from collections import defaultdict
from datetime import date

import numpy as np
import pandas as pd
import wandb

today = date.today()
api = wandb.Api()

# CSV exports found in the shared results directory. These names are kept for
# reference only: the cache check below tests the exact file this cell reads
# and writes, because comparing the bare project name against these full
# paths could never match and forced a re-download on every run.
csv_files = glob.glob("/home/lev/projects/TopoBenchmarkX/big_csv/*.csv")
# Same paths with the ".csv" extension stripped
csv_names = [csv_file[:-4] for csv_file in csv_files]
project_name = "TopoBenchmarkX_Cellular"
user = "telyatnikov_sap"

# Local cache of the downloaded runs; delete this file to force a refresh.
cache_path = f"{user}_{project_name}.csv"

if not os.path.exists(cache_path):
    runs = api.runs(f"{user}/{project_name}")

    summary_list, config_list, name_list = [], [], []
    for run in runs:
        # .summary contains the output keys/values for metrics like accuracy.
        #  We call ._json_dict to omit large files
        summary_list.append(run.summary._json_dict)

        # .config contains the hyperparameters.
        #  We remove special values that start with _.
        config_list.append(
            {k: v for k, v in run.config.items() if not k.startswith("_")}
        )

        # .name is the human-readable name of the run.
        name_list.append(run.name)

    runs_df = pd.DataFrame(
        {"summary": summary_list, "config": config_list, "name": name_list}
    )

    runs_df.to_csv(cache_path)
else:
    runs_df = pd.read_csv(cache_path, index_col=0)

    # The CSV stores each dict as its repr string; parse whole columns rather
    # than assigning into rows yielded by `.iloc`, which only sticks when the
    # row happens to be a view of the frame.
    runs_df["summary"] = runs_df["summary"].apply(ast.literal_eval)
    runs_df["config"] = runs_df["config"].apply(ast.literal_eval)

# Merge every run's hyperparameters into its summary dict (in place, so
# config keys override summary keys of the same name).
for summary, config in zip(runs_df["summary"], runs_df["config"]):
    summary.update(config)

# One row per run, one column per summary/config key
lst = runs_df["summary"].tolist()
df = pd.DataFrame.from_dict(lst)

# Untouched snapshot of the table before any derived columns are added
df_init = df.copy()

# Get average epoch run time
df["epoch_run_time"] = df["_runtime"] / df["epoch"]

In [2]:
def normalize_column(df, column_to_normalize):
    """Expand a column of nested dicts into flat ``<column>.<key>`` columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the nested column. It is not modified.
    column_to_normalize : str
        Name of the column whose values are (possibly nested) dicts.

    Returns
    -------
    result_df : pd.DataFrame
        ``df`` without ``column_to_normalize``, followed by the flattened
        columns, with the same row labels as ``df``.
    new_columns : pd.Index
        Names of the flattened columns that were appended.
    """
    # A plain list makes json_normalize build a positional frame regardless
    # of how the installed pandas version treats Series input.
    flattened_df = pd.json_normalize(df[column_to_normalize].tolist())
    # Prefix every flattened key with the name of the column it came from
    flattened_df.columns = [
        f"{column_to_normalize}.{col}" for col in flattened_df.columns
    ]
    # Re-attach the caller's row labels: without this the concat below aligns
    # on a fresh 0..n-1 index and misplaces rows whenever df is indexed
    # differently (e.g. after filtering).
    flattened_df.index = df.index
    result_df = pd.concat(
        [df.drop(columns=column_to_normalize), flattened_df], axis=1
    )
    new_columns = flattened_df.columns
    return result_df, new_columns


# Nested config entries that get expanded into flat "<entry>.<key>" columns
columns_to_normalize = ["model", "dataset", "callbacks", "paths"]

# Names of all flattened columns created by the loop below
config_columns = []
for column in columns_to_normalize:
    df, columns = normalize_column(df, column)
    config_columns += list(columns)

## Adding additional runs

In [3]:
# Load the extra runs stored in the gbg141_<project> CSV and append them to
# the main table, renumbering the rows 0..n-1 in the same step.
additiona_runs = pd.read_csv(f'gbg141_{project_name}.csv', index_col=0)
df = pd.concat([df, additiona_runs], axis=0, ignore_index=True)

  additiona_runs = pd.read_csv(f'gbg141_{project_name}.csv', index_col=0)


## Select models that have finished the runs

## Work out us_demographic

In [4]:
# US-county-demos covers several prediction targets, so make the dataset name
# target-specific: 'US-county-demos' -> 'US-county-demos-<task_variable>'.
is_us_county = df['dataset.parameters.data_name'] == 'US-county-demos'
df.loc[is_us_county, 'dataset.parameters.data_name'] = (
    df.loc[is_us_county, 'dataset.parameters.data_name']
    + '-'
    + df.loc[is_us_county, 'dataset.parameters.task_variable']
)

In [5]:
# Print all column names, five per line
column_names = list(df.columns)
for start in range(0, len(column_names), 5):
    print(column_names[start:start + 5])


['_step', 'val/auroc', 'train/loss', 'train/auroc', 'train/recall']
['val/precision', 'train/accuracy', 'lr-Adam', '_runtime', 'val/loss']
['trainer/global_step', 'epoch', '_timestamp', 'val/recall', 'val/accuracy']
['train/precision', 'val/mae', 'train/mae', 'train/mse', 'val/mse']
['seed', 'tags', 'extras', 'trainer', 'ckpt_path']
['task_name', 'model/params/total', 'model/params/trainable', 'model/params/non_trainable', 'test/mae']
['_wandb', 'test/mse', 'test/loss', 'test/precision', 'test/auroc']
['test/accuracy', 'test/recall', 'epoch_run_time', 'model.compile', 'model._target_']
['model.model_name', 'model.model_domain', 'model.loss.task', 'model.loss._target_', 'model.loss.loss_type']
['model.readout._target_', 'model.readout.hidden_dim', 'model.readout.readout_name', 'model.readout.num_cell_dimensions', 'model.backbone.dropout']
['model.backbone._target_', 'model.backbone.n_layers', 'model.backbone.in_channels', 'model.optimizer.lr', 'model.optimizer._target_']
['model.optimiz

### See unique datasets

In [6]:
# Distinct dataset names (NaN counts as one value) and how many there are
unique_datasets = df['dataset.parameters.data_name'].unique()
print(unique_datasets)
print("Num unique datasets:", len(unique_datasets))

[nan 'US-county-demos-UnemploymentRate' 'ZINC' 'IMDB-MULTI' 'IMDB-BINARY'
 'US-county-demos-BachelorRate' 'minesweeper' 'US-county-demos-DeathRate'
 'PROTEINS' 'US-county-demos-BirthRate' 'MUTAG'
 'US-county-demos-MigraRate' 'PubMed' 'roman_empire' 'citeseer' 'Cora'
 'US-county-demos-MedianIncome' 'US-county-demos-Election' 'NCI1' 'NCI109']
Num unique datasets: 20


## See unique models

In [7]:
# Distinct model names present in the combined runs table
unique_models = df['model.model_name'].unique()
print(unique_models)

[nan 'cwn_dcm' 'ccxn' 'cwn']


## Solve batch problems

In [8]:
datasets = ['minesweeper', 'roman_empire']
models = ['cwn_dcm', 'ccxn', 'cwn']
# Per the original note, these model/dataset pairs were launched with a batch
# size of 128 or 256 although it should have been 1. This cell does not modify
# anything: it only reports which batch sizes are present for every pair.
for model in models:
    print("MODEL:", model)
    is_model = df['model.model_name'] == model
    for dataset in datasets:
        is_dataset = df['dataset.parameters.data_name'] == dataset
        batch_sizes = df.loc[is_model & is_dataset, 'dataset.parameters.batch_size']
        print(batch_sizes.unique())
        

MODEL: cwn_dcm
[1.]
[1.]
MODEL: ccxn
[]
[]
MODEL: cwn
[]
[]


### Work out US-county-demos-DeathRate ccxn problems

In [9]:
# For ccxn on the datasets listed below, keep only the runs whose learning
# rate is 0.001; all other rows are left untouched.
#
# The unwanted runs are removed with a single boolean filter. Assigning a
# filtered frame back through `.loc` does not delete anything: pandas aligns
# on the index and turns the unwanted rows into all-NaN rows instead.
dataset_names = ['US-county-demos-DeathRate', 'US-county-demos-BirthRate']

is_ccxn_on_targets = (
    (df['model.model_name'] == 'ccxn')
    & df['dataset.parameters.data_name'].isin(dataset_names)
)
has_kept_lr = df['model.optimizer.lr'].isin([0.001])

df = df[~(is_ccxn_on_targets & ~has_kept_lr)]



## Solve issue with projection dropout

In [10]:
# Distinct projection-dropout values found in the runs table
proj_dropout_values = df['model.feature_encoder.proj_dropout'].unique()
print(proj_dropout_values)

[ nan 0.25 0.5 ]


In [11]:
# Keep only runs whose projection dropout is 0.25 or 0.5; rows where the
# value is missing are removed as well.
has_valid_proj_dropout = df['model.feature_encoder.proj_dropout'].isin([0.5, 0.25])
df = df[has_valid_proj_dropout]


In [12]:
# Renumber the rows 0..n-1 and discard the old index
df = df.reset_index(drop=True)

In [13]:
# Columns holding the parameters that were varied across runs
# (hyperparameters plus the two seed columns)
sweeped_columns = [
    "model.optimizer.lr",
    "model.feature_encoder.out_channels",
    "model.backbone.n_layers",
    "model.readout.readout_name",
    "model.feature_encoder.proj_dropout",
    "dataset.parameters.batch_size",
    "dataset.parameters.data_seed",
    "seed",
]



# # For each model and dataset go over all the sweeped parameters and print the unique values
# for model in df['model.model_name'].unique():
#     print(f"Model: {model}")
#     for dataset in df['dataset.parameters.data_name'].unique():
#         print(f"Dataset: {dataset}")
#         for column in sweeped_columns:
#             print(f"Column: {column}")
#             print(df.loc[(df['model.model_name'] == model) & (df['dataset.parameters.data_name'] == dataset), column].unique())
        
#         print('---------------NEW DATASET------------------')
#     print('---------------NEW MODEL------------------')


### Get the best results

In [14]:
# Extract best results for each model and dataset
# 1. Keep the columns that are necessary for the comparison

# Hyperparameters that define a configuration; the seed columns are listed
# separately in `run_columns` so repeated runs share one configuration.
sweeped_columns = [
    "model.optimizer.lr",
    "model.feature_encoder.out_channels",
    "model.backbone.n_layers",
    "model.readout.readout_name",
    "model.feature_encoder.proj_dropout",
    "dataset.parameters.batch_size",
]
run_columns = ["dataset.parameters.data_seed", "seed"]

# Dataset and model columns
dataset_model_columns = ["model.model_name", "dataset.parameters.data_name"]

# Performance columns: validation and test value of every tracked metric
performance_columns = [
    f"{split}/{metric}"
    for metric in ("loss", "mae", "mse", "accuracy", "auroc", "recall", "precision")
    for split in ("val", "test")
]

# Final column order: identifiers, hyperparameters, metrics, seeds
keep_columns = [
    *dataset_model_columns,
    *sweeped_columns,
    *performance_columns,
    *run_columns,
]
# Restrict the table to the comparison columns, in the order defined above
df = df.loc[:, keep_columns]

In [15]:
# Metric columns reported for classification and for regression datasets
performance_classification = [
    f"{split}/{metric}"
    for metric in ("accuracy", "auroc", "recall", "precision")
    for split in ("val", "test")
]
performance_regression = [
    f"{split}/{metric}" for metric in ("mae", "mse") for split in ("val", "test")
]

# Per dataset: the validation metric used to pick hyperparameters, the test
# metric that is reported, whether higher ('max') or lower ('min') is better,
# and the metric columns that apply to the dataset.
optimization_metrics = {
    # Classification datasets selected and reported on accuracy
    **{
        name: {
            'optim_metric': 'val/accuracy',
            'eval_metric': 'test/accuracy',
            'direction': 'max',
            'performance_columns': performance_classification,
        }
        for name in (
            'IMDB-MULTI', 'IMDB-BINARY', 'REDDIT-BINARY', 'NCI109', 'NCI1',
            'PROTEINS', 'MUTAG', 'Cora', 'citeseer', 'PubMed',
            'roman_empire', 'amazon_ratings',
        )
    },
    # Classification datasets selected and reported on AUROC
    **{
        name: {
            'optim_metric': 'val/auroc',
            'eval_metric': 'test/auroc',
            'direction': 'max',
            'performance_columns': performance_classification,
        }
        for name in ('tolokers', 'questions', 'minesweeper')
    },
    # ZINC is selected on validation MSE but reported with test MAE
    'ZINC': {
        'optim_metric': 'val/mse',
        'eval_metric': 'test/mae',
        'direction': 'min',
        'performance_columns': performance_regression,
    },
    # US-county-demos regression targets, selected and reported on MSE
    **{
        f'US-county-demos-{task_variable}': {
            'optim_metric': 'val/mse',
            'eval_metric': 'test/mse',
            'direction': 'min',
            'performance_columns': performance_regression,
        }
        for task_variable in (
            'UnemploymentRate', 'BachelorRate', 'DeathRate', 'BirthRate',
            'MigraRate', 'MedianIncome', 'Election',
        )
    },
}

len(optimization_metrics)

23

### Generate the best results

In [16]:
# Get unique datasets
datasets = list(df['dataset.parameters.data_name'].unique())
# Get unique models
models = list(df['model.model_name'].unique())

# best_results[dataset][model] -> {"mean", "std"} of the evaluation metric
# hp_runs[dataset][model]      -> every cleaned run of that pair
# best_runs[dataset][model]    -> runs that used the selected hyperparameters
best_results = defaultdict(dict)
hp_runs = defaultdict(dict)
best_runs = defaultdict(dict)

# Go over each dataset and model and find the best result
for dataset in datasets:
    # Selection settings depend only on the dataset
    metric_settings = optimization_metrics[dataset]
    optim_metric = metric_settings['optim_metric']
    eval_metric = metric_settings['eval_metric']
    direction = metric_settings['direction']
    # Keep metrics that matter for the dataset
    performance_columns = metric_settings['performance_columns']
    is_classification = 'test/accuracy' in performance_columns

    for model in models:
        # Runs of the current dataset/model pair. The explicit copy lets the
        # float conversion below write to `subset` without touching `df`.
        pair_mask = (
            (df['dataset.parameters.data_name'] == dataset)
            & (df['model.model_name'] == model)
        )
        subset = df.loc[
            pair_mask,
            dataset_model_columns + sweeped_columns + performance_columns + run_columns,
        ].copy()

        # Rows where any metric holds the literal string 'NaN' are dropped
        nan_rows = subset[performance_columns].eq('NaN').any(axis=1)
        if nan_rows.any():
            print('Dropping rows with NaN values')
            print(f"Dataset: {dataset}, Model: {model}")
            print('Total rows in subset:', nan_rows.shape[0])
            print('Total dropped rows:', int(nan_rows.sum()))
            subset = subset[~nan_rows].copy()

        # Ensure that the performance columns are of type float
        subset[performance_columns] = subset[performance_columns].astype(float)

        # Mean/std of every metric per hyperparameter configuration, best
        # configuration (by the validation metric) first. dropna=False keeps
        # configurations in which a hyperparameter is missing.
        aggregated = (
            subset.groupby(sweeped_columns, dropna=False)
            .agg({col: ["mean", "std"] for col in performance_columns})
            .reset_index()
            .sort_values(by=(optim_metric, "mean"), ascending=(direction == 'min'))
        )

        # Classification metrics are shown as percentages with 2 decimals,
        # regression metrics unscaled with 4 decimals.
        decimals = 2 if is_classification else 4
        for col in performance_columns:
            for stat in ("mean", "std"):
                values = aggregated[(col, stat)]
                if is_classification:
                    values = values * 100
                aggregated[(col, stat)] = values.round(decimals)

        # Get the best result
        final_best = aggregated.head(1)

        # No runs at all, or no usable score: record a placeholder. This is
        # an explicit missing-value test so that a real score of 0.0 is kept.
        if final_best.empty or pd.isna(final_best[(eval_metric, "mean")].iloc[0]):
            best_results[dataset][model] = {
                "mean": np.nan,
                "std": np.nan,
            }
            continue

        best_results[dataset][model] = {
            "mean": final_best[(eval_metric, "mean")].values[0],
            "std": final_best[(eval_metric, "std")].values[0],
        }

        # Hyperparameter values of the winning configuration
        best_params = {
            col: final_best[(col, '')].item() for col in sweeped_columns
        }

        hp_runs[dataset][model] = subset.copy()

        # Narrow the runs down to the winning configuration, one
        # hyperparameter at a time. NaN never compares equal to itself, so a
        # missing value is matched with isna().
        filtered_subset = subset.copy()
        for param, value in best_params.items():
            if pd.isna(value):
                filtered_subset = filtered_subset[filtered_subset[param].isna()]
            else:
                filtered_subset = filtered_subset[filtered_subset[param] == value]
        best_runs[dataset][model] = filtered_subset

        

        
            
        


Dropping rows with NaN values
Dataset: US-county-demos-UnemploymentRate, Model: ccxn
Total rows in subset: 474
Total dropped rows: 40
Dropping rows with NaN values
Dataset: ZINC, Model: ccxn
Total rows in subset: 304
Total dropped rows: 33
Dropping rows with NaN values
Dataset: US-county-demos-BachelorRate, Model: ccxn
Total rows in subset: 460
Total dropped rows: 25
Dropping rows with NaN values
Dataset: US-county-demos-DeathRate, Model: ccxn
Total rows in subset: 232
Total dropped rows: 9
Dropping rows with NaN values
Dataset: US-county-demos-BirthRate, Model: ccxn
Total rows in subset: 229
Total dropped rows: 5


Dropping rows with NaN values
Dataset: US-county-demos-MigraRate, Model: ccxn
Total rows in subset: 432
Total dropped rows: 31
Dropping rows with NaN values
Dataset: US-county-demos-MedianIncome, Model: ccxn
Total rows in subset: 450
Total dropped rows: 37
Dropping rows with NaN values
Dataset: US-county-demos-Election, Model: ccxn
Total rows in subset: 457
Total dropped rows: 40


In [17]:
#best_runs['US-county-demos-BirthRate']['ccxn']

## Save obtained best results and best runs

In [18]:
# Flatten {dataset: {model: stats}} into one record per (dataset, model)
nested_dict = dict(best_results)
flat_records = {
    (dataset_key, model_key): stats
    for dataset_key, per_model in nested_dict.items()
    for model_key, stats in per_model.items()
}
result_dict = pd.DataFrame.from_dict(flat_records, orient="index")

# Replace the numeric mean/std pair by a single "mean ± std" text column and
# turn the (dataset, model) index into ordinary columns
result_dict = (
    result_dict
    .assign(
        performance=result_dict.apply(
            lambda row: f"{row['mean']} ± {row['std']}", axis=1
        )
    )
    .drop(columns=["mean", "std"])
    .reset_index()
)
result_dict.columns = ["Dataset", "Model", "Performance"]

# One row per model, one column per dataset
result_dict = result_dict.pivot_table(
    index="Model", columns="Dataset", values="Performance", aggfunc="first"
)

In [19]:
# Show the best-results table: one row per model, one column per dataset
result_dict

Dataset,Cora,IMDB-BINARY,IMDB-MULTI,MUTAG,NCI1,NCI109,PROTEINS,PubMed,US-county-demos-BachelorRate,US-county-demos-BirthRate,US-county-demos-DeathRate,US-county-demos-Election,US-county-demos-MedianIncome,US-county-demos-MigraRate,US-county-demos-UnemploymentRate,ZINC,citeseer,minesweeper,roman_empire
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
ccxn,86.79 ± 1.81,68.4 ± nan,47.63 ± 3.45,74.89 ± 5.51,74.86 ± 0.82,74.48 ± 0.83,75.63 ± 2.57,88.91 ± 0.47,0.3166 ± 0.0288,0.745 ± 0.1123,0.5426 ± 0.055,0.3471 ± 0.0154,0.2489 ± 0.0296,0.8482 ± 0.185,0.2707 ± 0.0258,0.4194 ± 0.0161,74.67 ± 2.24,nan ± nan,nan ± nan
cwn,86.32 ± 1.38,70.4 ± 2.02,49.71 ± 2.83,80.43 ± 1.78,74.11 ± 1.77,73.8 ± 2.06,76.13 ± 2.7,88.64 ± 0.36,0.3306 ± 0.0279,0.7181 ± 0.086,0.5399 ± 0.0553,0.3437 ± 0.0216,0.2468 ± 0.03,0.838 ± 0.1286,0.2535 ± 0.031,0.3497 ± 0.012,75.2 ± 1.82,nan ± nan,nan ± nan
cwn_dcm,87.44 ± 1.35,69.12 ± 2.82,45.87 ± nan,77.02 ± 9.32,76.65 ± 0.0,75.35 ± 1.5,73.33 ± 2.3,88.62 ± 0.4,0.313 ± 0.0248,0.7134 ± 0.0915,0.5443 ± 0.0568,0.307 ± 0.0155,0.2263 ± 0.0187,0.8373 ± 0.1206,0.2456 ± 0.0329,0.336 ± 0.0125,75.63 ± 1.67,89.42 ± 0.0,82.14 ± 0.0


In [20]:
# Raise pandas' display limits (rows, columns, line width) to 1000 each
for option in ("display.max_rows", "display.max_columns", "display.width"):
    pd.set_option(option, 1000)
# Write the best-results table to disk
result_dict.to_csv("best_results_cell.csv")

### Ablation

In [21]:
# Get unique datasets
datasets = list(df['dataset.parameters.data_name'].unique())
# Get unique models
models = list(df['model.model_name'].unique())

# best_results[dataset]["<model> (<readout>)"] -> {"mean", "std"}
# hp_runs / best_runs stay keyed by the plain model name, so for each pair
# the readout processed last (NoReadOut) is the one that remains stored.
best_results = defaultdict(dict)
hp_runs = defaultdict(dict)
best_runs = defaultdict(dict)

# Readout variants compared in the ablation, in processing order
prop_types = ['PropagateSignalDown', 'NoReadOut']

# Go over each dataset and model and find the best result per readout
for dataset in datasets:
    # Selection settings depend only on the dataset
    metric_settings = optimization_metrics[dataset]
    optim_metric = metric_settings['optim_metric']
    eval_metric = metric_settings['eval_metric']
    direction = metric_settings['direction']
    # Keep metrics that matter for the dataset
    performance_columns = metric_settings['performance_columns']
    is_classification = 'test/accuracy' in performance_columns

    for model in models:
        # Runs of the current dataset/model pair. The explicit copy lets the
        # float conversion below write to `subset` without touching `df`.
        pair_mask = (
            (df['dataset.parameters.data_name'] == dataset)
            & (df['model.model_name'] == model)
        )
        subset = df.loc[
            pair_mask,
            dataset_model_columns + sweeped_columns + performance_columns + run_columns,
        ].copy()

        # Progress header, printed for every pair (whether or not rows are dropped)
        print('Dropping rows with NaN values')
        print(f"Dataset: {dataset}, Model: {model}")

        # Rows where any metric holds the literal string 'NaN' are dropped
        nan_rows = subset[performance_columns].eq('NaN').any(axis=1)
        if nan_rows.any():
            print('Total rows in subset:', nan_rows.shape[0])
            print('Total dropped rows:', int(nan_rows.sum()))
            subset = subset[~nan_rows].copy()

        # Ensure that the performance columns are of type float
        subset[performance_columns] = subset[performance_columns].astype(float)

        # Drop rows with a missing metric, column by column, so the counts
        # printed for a column reflect rows already removed for earlier ones
        for column in performance_columns:
            missing = subset[column].isna()
            if missing.sum() > 0:
                print(f'Initial number of rows, {subset.shape[0]}')
                print(f'Rows with isna() == True, {missing.sum()}')
                subset = subset[~missing]

        # Mean/std of every metric per hyperparameter configuration, best
        # configuration (by the validation metric) first
        aggregated = (
            subset.groupby(sweeped_columns, dropna=False)
            .agg({col: ["mean", "std"] for col in performance_columns})
            .reset_index()
            .sort_values(by=(optim_metric, "mean"), ascending=(direction == 'min'))
        )

        # Classification metrics are shown as percentages with 2 decimals,
        # regression metrics unscaled with 4 decimals.
        decimals = 2 if is_classification else 4
        for col in performance_columns:
            for stat in ("mean", "std"):
                values = aggregated[(col, stat)]
                if is_classification:
                    values = values * 100
                aggregated[(col, stat)] = values.round(decimals)

        # The ablation needs exactly the two readout variants. A set
        # comparison is used because sorting would raise if a missing
        # readout name (NaN) were mixed with the strings.
        readout_names = set(aggregated['model.readout.readout_name'].unique())
        if readout_names != set(prop_types):
            for prop_type in prop_types:
                best_results[dataset][f"{model} ({prop_type})"] = {
                    "mean": np.nan,
                    "std": np.nan,
                }
            continue

        for prop_type in prop_types:
            result_key = f"{model} ({prop_type})"

            # Best configuration among those using this readout
            agg_sub = aggregated[aggregated['model.readout.readout_name'] == prop_type]
            agg_sub = agg_sub.sort_values(
                by=(optim_metric, "mean"), ascending=(direction == 'min')
            )
            final_best = agg_sub.head(1)

            # No usable score: store a placeholder under the same
            # "<model> (<readout>)" key and with the same fields as real
            # results, so the table-building cell can parse every entry.
            # An explicit missing-value test keeps a real score of 0.0.
            if final_best.empty or pd.isna(final_best[(eval_metric, "mean")].iloc[0]):
                best_results[dataset][result_key] = {
                    "mean": np.nan,
                    "std": np.nan,
                }
                continue

            best_results[dataset][result_key] = {
                "mean": final_best[(eval_metric, "mean")].values[0],
                "std": final_best[(eval_metric, "std")].values[0],
            }

            # Hyperparameter values of the winning configuration
            best_params = {
                col: final_best[(col, '')].item() for col in sweeped_columns
            }

            hp_runs[dataset][model] = subset.copy()

            # Narrow the runs down to the winning configuration; NaN never
            # compares equal to itself, so it is matched with isna().
            filtered_subset = subset.copy()
            for param, value in best_params.items():
                if pd.isna(value):
                    filtered_subset = filtered_subset[filtered_subset[param].isna()]
                else:
                    filtered_subset = filtered_subset[filtered_subset[param] == value]
            best_runs[dataset][model] = filtered_subset

        


Dropping rows with NaN values
Dataset: US-county-demos-UnemploymentRate, Model: cwn_dcm
Initial number of rows, 296
Rows with isna() == True, 1
Dropping rows with NaN values
Dataset: US-county-demos-UnemploymentRate, Model: ccxn
Total rows in subset: 474
Total dropped rows: 40
Dropping rows with NaN values
Dataset: US-county-demos-UnemploymentRate, Model: cwn
Dropping rows with NaN values
Dataset: ZINC, Model: cwn_dcm
Dropping rows with NaN values
Dataset: ZINC, Model: ccxn
Total rows in subset: 304
Total dropped rows: 33
Initial number of rows, 271
Rows with isna() == True, 1
Dropping rows with NaN values
Dataset: ZINC, Model: cwn
Dropping rows with NaN values
Dataset: IMDB-MULTI, Model: cwn_dcm
Dropping rows with NaN values
Dataset: IMDB-MULTI, Model: ccxn
Initial number of rows, 961
Rows with isna() == True, 5
Initial number of rows, 956
Rows with isna() == True, 1


Dropping rows with NaN values
Dataset: IMDB-MULTI, Model: cwn
Dropping rows with NaN values
Dataset: IMDB-BINARY, Model: cwn_dcm
Dropping rows with NaN values
Dataset: IMDB-BINARY, Model: ccxn
Initial number of rows, 969
Rows with isna() == True, 368
Initial number of rows, 601
Rows with isna() == True, 8
Dropping rows with NaN values
Dataset: IMDB-BINARY, Model: cwn
Dropping rows with NaN values
Dataset: US-county-demos-BachelorRate, Model: cwn_dcm
Dropping rows with NaN values
Dataset: US-county-demos-BachelorRate, Model: ccxn
Total rows in subset: 460
Total dropped rows: 25
Dropping rows with NaN values
Dataset: US-county-demos-BachelorRate, Model: cwn
Dropping rows with NaN values
Dataset: minesweeper, Model: cwn_dcm
Dropping rows with NaN values
Dataset: minesweeper, Model: ccxn
Dropping rows with NaN values
Dataset: minesweeper, Model: cwn
Dropping rows with NaN values
Dataset: US-county-demos-DeathRate, Model: cwn_dcm
Dropping rows with NaN values
Dataset: US-county-demos-DeathR

In [22]:
# Convert nested dictionary to DataFrame: one record per
# (dataset, "<model> (<readout>)") pair
nested_dict = dict(best_results)
result_dict = pd.DataFrame.from_dict(
    {
        (i, j): nested_dict[i][j]
        for i in nested_dict
        for j in nested_dict[i].keys()
    },
    orient="index",
)

# Single "mean ± std" text column; selecting only that column keeps the
# three-name rename below valid even if a record carried extra fields.
result_dict["performance"] = result_dict.apply(
    lambda x: f"{x['mean']} ± {x['std']}", axis=1
)
result_dict = result_dict[["performance"]]

# Reset multiindex
result_dict = result_dict.reset_index()
# rename columns
result_dict.columns = ["Dataset", "Model", "Performance"]

# One row per "<model> (<readout>)" entry, one column per dataset
result_dict = result_dict.pivot_table(
    index="Model", columns="Dataset", values="Performance", aggfunc="first"
)
result_dict = result_dict.reset_index()

# Split "<model> (<readout>)" into its two parts. The model part is stripped
# so the space before the parenthesis does not end up in the model name (and
# in the CSV); an entry without a parenthesis gets a missing readout instead
# of raising.
name_parts = result_dict['Model'].str.split('(', n=1, expand=True)
result_dict['ReadOut'] = name_parts[1].str.replace(')', '', regex=False)
result_dict['Model'] = name_parts[0].str.strip()

result_dict = result_dict.sort_values(by=['Model', 'ReadOut'])

result_dict.to_csv("ablation_cellular.csv")

In [23]:
# Display order for the ablation table: the two identifier columns first,
# followed by one column per dataset
columns = ['Model', 'ReadOut'] + [
    'Cora',
    'IMDB-BINARY',
    'IMDB-MULTI',
    'MUTAG',
    'NCI1',
    'NCI109',
    'PROTEINS',
    'PubMed',
    'US-county-demos-BachelorRate',
    'US-county-demos-BirthRate',
    'US-county-demos-DeathRate',
    'US-county-demos-Election',
    'US-county-demos-MedianIncome',
    'US-county-demos-MigraRate',
    'US-county-demos-UnemploymentRate',
    'ZINC',
    'citeseer',
    'minesweeper',
    'roman_empire',
]
result_dict[columns]

Dataset,Model,ReadOut,Cora,IMDB-BINARY,IMDB-MULTI,MUTAG,NCI1,NCI109,PROTEINS,PubMed,US-county-demos-BachelorRate,US-county-demos-BirthRate,US-county-demos-DeathRate,US-county-demos-Election,US-county-demos-MedianIncome,US-county-demos-MigraRate,US-county-demos-UnemploymentRate,ZINC,citeseer,minesweeper,roman_empire
0,ccxn,NoReadOut,86.32 ± 1.22,68.4 ± nan,47.63 ± 3.45,69.79 ± 4.61,72.43 ± 1.72,73.22 ± 0.48,75.63 ± 2.57,88.91 ± 0.47,0.3318 ± 0.0282,0.7996 ± 0.1227,0.5663 ± 0.0588,0.3898 ± 0.0451,0.2538 ± 0.0276,0.8046 ± 0.1133,0.2836 ± 0.0478,0.6295 ± 0.0095,72.87 ± 1.13,nan ± nan,nan ± nan
1,ccxn,PropagateSignalDown,86.79 ± 1.81,69.6 ± nan,48.75 ± 3.56,74.89 ± 5.51,74.86 ± 0.82,74.48 ± 0.83,74.91 ± 1.85,88.38 ± 0.38,0.3166 ± 0.0288,0.745 ± 0.1123,0.5426 ± 0.055,0.3471 ± 0.0154,0.2489 ± 0.0296,0.8482 ± 0.185,0.2707 ± 0.0258,0.4194 ± 0.0161,74.67 ± 2.24,nan ± nan,nan ± nan
2,cwn,NoReadOut,74.95 ± 0.98,70.4 ± 2.02,49.71 ± 2.83,69.68 ± 8.58,68.52 ± 0.51,68.19 ± 0.65,76.13 ± 1.8,86.94 ± 0.68,0.3306 ± 0.0279,0.8088 ± 0.1141,0.5532 ± 0.0494,0.5976 ± 0.0428,0.3631 ± 0.0373,0.8998 ± 0.1641,0.4628 ± 0.0362,0.6985 ± 0.004,70.49 ± 2.85,nan ± nan,nan ± nan
3,cwn,PropagateSignalDown,86.32 ± 1.38,69.28 ± 2.57,49.87 ± 2.33,80.43 ± 1.78,73.93 ± 1.87,73.8 ± 2.06,76.13 ± 2.7,88.64 ± 0.36,0.3331 ± 0.0319,0.7181 ± 0.086,0.5399 ± 0.0553,0.3437 ± 0.0216,0.2468 ± 0.03,0.838 ± 0.1286,0.2535 ± 0.031,0.3497 ± 0.012,75.2 ± 1.82,nan ± nan,nan ± nan
4,cwn_dcm,NoReadOut,87.44 ± 1.35,69.12 ± 2.82,49.33 ± nan,80.85 ± 5.42,76.65 ± 0.0,75.35 ± 1.5,73.55 ± 3.43,88.62 ± 0.4,0.3158 ± 0.0247,0.7134 ± 0.0915,0.5443 ± 0.0568,0.307 ± 0.0155,0.2263 ± 0.0187,0.8404 ± 0.1031,0.244 ± 0.0384,0.3451 ± 0.0219,75.63 ± 1.67,89.42 ± 0.0,82.14 ± 0.0
5,cwn_dcm,PropagateSignalDown,87.68 ± 1.24,69.44 ± 2.46,45.87 ± nan,77.02 ± 9.32,76.58 ± 1.02,74.68 ± 1.39,73.33 ± 2.3,88.67 ± 0.44,0.313 ± 0.0248,0.7183 ± 0.0479,0.5408 ± 0.0561,0.3058 ± 0.0057,0.228 ± 0.0189,0.8373 ± 0.1206,0.2456 ± 0.0329,0.336 ± 0.0125,74.91 ± 1.33,89.85 ± 0.0,82.51 ± 0.0


In [27]:
# Inspect the runs stored as the best configuration for cwn_dcm on IMDB-MULTI
best_runs['IMDB-MULTI']['cwn_dcm']

Unnamed: 0,model.model_name,dataset.parameters.data_name,model.optimizer.lr,model.feature_encoder.out_channels,model.backbone.n_layers,model.readout.readout_name,model.feature_encoder.proj_dropout,dataset.parameters.batch_size,val/accuracy,test/accuracy,val/auroc,test/auroc,val/recall,test/recall,val/precision,test/precision,dataset.parameters.data_seed,seed
11,cwn_dcm,IMDB-MULTI,0.001,128.0,3.0,NoReadOut,0.25,128.0,0.498667,0.493333,0.667355,0.646179,0.50652,0.490424,0.504078,0.48849,0.0,42.0


In [168]:
# Debugging aid: repeat the 'NaN'-string check on the stored hyperparameter
# runs of a single dataset/model pair.
# The pair is named explicitly so the log lines below describe the data being
# inspected rather than whatever values an earlier loop left in
# `dataset` / `model`.
dataset, model = 'IMDB-BINARY', 'ccxn'

# Keep metrics that matter for the dataset
performance_columns = optimization_metrics[dataset]['performance_columns']
subset = hp_runs[dataset][model].copy()

# Count, over all metric columns, the entries equal to the string 'NaN'
total_performance_nan = 0
for column in performance_columns:
    num_nan = subset[column].apply(lambda x: x == 'NaN')
    num_nan = num_nan.sum()
    total_performance_nan += num_nan

if total_performance_nan > 0:
    print('Dropping rows with NaN values')
    print(f"Dataset: {dataset}, Model: {model}")
    nan_rows = subset[performance_columns].eq('NaN')
    nan_rows = nan_rows.sum(axis=1)
    # Drop every row where the 'NaN' string is present
    subset = subset[~nan_rows.gt(0)]
    print('Total rows in subset:', nan_rows.shape[0])
    print('Total dropped rows:', sum(nan_rows.gt(0)))



# Ensure that the performance columns are of type float
#subset[performance_columns] = subset[performance_columns].astype(float)

In [170]:
# Display `subset` as left by the cell that filters out rows containing the 'NaN' string.
# NOTE(review): depends on that cell having been run first in this kernel.
subset


Unnamed: 0,model.model_name,dataset.parameters.data_name,model.optimizer.lr,model.feature_encoder.out_channels,model.backbone.n_layers,model.readout.readout_name,model.feature_encoder.proj_dropout,dataset.parameters.batch_size,val/accuracy,test/accuracy,val/auroc,test/auroc,val/recall,test/recall,val/precision,test/precision,dataset.parameters.data_seed,seed
23889,ccxn,IMDB-BINARY,0.001,128.0,4.0,PropagateSignalDown,0.5,256.0,0.668,0.696,0.755974,0.765796,0.661638,0.6876,0.693719,0.716485,0.0,42.0
23893,ccxn,IMDB-BINARY,0.001,128.0,4.0,NoReadOut,0.5,256.0,0.484,0.476,0.5,0.5,0.5,0.5,0.242,0.238,0.0,42.0
23903,ccxn,IMDB-BINARY,0.001,128.0,4.0,PropagateSignalDown,0.5,128.0,0.676,0.688,0.771286,0.780486,0.680152,0.687279,0.691762,0.687279,0.0,42.0
23909,ccxn,IMDB-BINARY,0.001,128.0,4.0,NoReadOut,0.5,128.0,0.484,0.476,0.5,0.5,0.5,0.5,0.242,0.238,0.0,42.0
23919,ccxn,IMDB-BINARY,0.001,128.0,4.0,PropagateSignalDown,0.25,256.0,0.68,0.736,0.751169,0.824427,0.677878,0.740394,0.681459,0.746141,0.0,42.0
23928,ccxn,IMDB-BINARY,0.001,128.0,4.0,NoReadOut,0.25,256.0,0.732,0.696,0.824524,0.800196,0.733391,0.698377,0.734367,0.699195,0.0,42.0
23933,ccxn,IMDB-BINARY,0.001,128.0,4.0,PropagateSignalDown,0.25,128.0,0.484,0.476,0.5,0.5,0.5,0.5,0.242,0.238,0.0,42.0
23939,ccxn,IMDB-BINARY,0.001,128.0,4.0,NoReadOut,0.25,128.0,0.484,0.476,0.5,0.5,0.5,0.5,0.242,0.238,0.0,42.0
23948,ccxn,IMDB-BINARY,0.001,128.0,3.0,PropagateSignalDown,0.5,256.0,0.692,0.68,0.803895,0.786324,0.69745,0.686189,0.721027,0.697114,0.0,42.0
23958,ccxn,IMDB-BINARY,0.001,128.0,3.0,NoReadOut,0.5,256.0,0.732,0.692,0.823691,0.804125,0.735185,0.69533,0.743177,0.697727,0.0,42.0


In [166]:
# Boolean mask marking the entries of one column of `subset` equal to the string 'NaN'.
# NOTE(review): `column` is not defined in this cell; it relies on a value left in the
# kernel by another cell (presumably the last iteration of a loop over
# performance_columns). Confirm which column is being inspected.
subset[column].apply(lambda x: x == 'NaN')

23889    False
23893    False
23903    False
23909    False
23919    False
23928    False
23933    False
23939    False
23948    False
23958    False
23976    False
23984    False
23989    False
23993    False
23998    False
24000    False
24007    False
24014    False
24015    False
24020    False
24021    False
24026    False
24032    False
24033    False
24043    False
24048    False
24054    False
24056    False
24062    False
24065    False
24068    False
24069    False
24075    False
24080    False
24083    False
24085    False
24086    False
24087    False
24090    False
24096    False
24097    False
24108    False
24109    False
24115    False
24118    False
24120    False
24126    False
24129    False
24130    False
24131    False
24141    False
24144    False
24146    False
24152    False
24153    False
24155    False
24161    False
24164    False
24165    False
24166    False
24167    False
24168    False
24169    False
24170    False
24171    False
24173    False
24174    F

In [159]:
# Display the metric column names currently bound to `performance_columns`.
performance_columns

['val/accuracy',
 'test/accuracy',
 'val/auroc',
 'test/auroc',
 'val/recall',
 'test/recall',
 'val/precision',
 'test/precision']