In [1]:
import ast
import glob
import os
import warnings
from collections import defaultdict
from datetime import date

import numpy as np
import pandas as pd
import wandb

today = date.today()
api = wandb.Api()

project_name = "TopoBenchmarkX_Simplicial"
user = "telyatnikov_sap"

# Locally cached exports of W&B projects.
csv_files = glob.glob("/home/lev/projects/TopoBenchmarkX/big_csv/*.csv")
# Map each file's bare name (no directory, no ".csv") to its full path.
# Comparing `project_name` against full paths can never match, which would
# force a full re-download from W&B on every run.
csv_paths_by_name = {
    os.path.splitext(os.path.basename(csv_file))[0]: csv_file
    for csv_file in csv_files
}
csv_names = list(csv_paths_by_name)

if project_name not in csv_paths_by_name:
    runs = api.runs(f"{user}/{project_name}")

    summary_list, config_list, name_list = [], [], []
    for run in runs:
        # .summary contains the output keys/values for metrics like accuracy.
        #  We call ._json_dict to omit large files
        summary_list.append(run.summary._json_dict)

        # .config contains the hyperparameters.
        #  We remove special values that start with _.
        config_list.append(
            {k: v for k, v in run.config.items() if not k.startswith("_")}
        )

        # .name is the human-readable name of the run.
        name_list.append(run.name)

    runs_df = pd.DataFrame(
        {"summary": summary_list, "config": config_list, "name": name_list}
    )

    # NOTE(review): the export is written to the working directory under
    # "<user>_<project>.csv", which is NOT where the cache lookup above
    # searches; it presumably has to be moved/renamed by hand — confirm.
    runs_df.to_csv(f"{user}_{project_name}.csv")
else:
    # Read the very file that matched the lookup.
    runs_df = pd.read_csv(csv_paths_by_name[project_name], index_col=0)

    # The CSV stores the dicts as their string repr; parse them back.
    # Assigning whole columns (rather than mutating rows obtained by iterating
    # `.iloc`) guarantees the parsed dicts end up in `runs_df`.
    # NOTE(review): ast.literal_eval cannot parse bare `nan`/`inf`, so a
    # summary containing such values will raise ValueError here.
    for column in ("summary", "config"):
        runs_df[column] = runs_df[column].apply(ast.literal_eval)

# One flat record per run: metrics first, then hyperparameters
# (a config key overrides a summary key of the same name).
run_records = [
    {**summary, **config}
    for summary, config in zip(runs_df["summary"], runs_df["config"])
]
df = pd.DataFrame(run_records)

# Untouched copy of the flat table, before any derived columns are added.
df_init = df.copy()

# Get average epoch run time
df["epoch_run_time"] = df["_runtime"] / df["epoch"]

In [2]:
def normalize_column(df, column_to_normalize):
    """Flatten a column of nested dicts into separate, prefixed columns.

    Each nested key becomes a column named
    ``"<column_to_normalize>.<dotted.key.path>"``; the original nested column
    is removed from the result. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_to_normalize``.
    column_to_normalize : str
        Name of the column holding (possibly nested) dictionaries.

    Returns
    -------
    result_df : pandas.DataFrame
        ``df`` without the nested column, followed by the flattened columns.
    new_columns : pandas.Index
        Names of the flattened columns that were added.
    """
    # Use json_normalize to flatten the nested dictionaries into separate columns
    flattened_df = pd.json_normalize(df[column_to_normalize].tolist())
    # Rename columns to include the source column name as prefix
    flattened_df.columns = [
        f"{column_to_normalize}.{col}" for col in flattened_df.columns
    ]
    # The flattened frame is numbered 0..n-1; give it the caller's index so the
    # column-wise concat pairs rows by position even when `df` has a
    # non-default (filtered, shuffled or duplicated) index.
    flattened_df.index = df.index
    # Drop the nested column and append the flattened ones
    result_df = pd.concat(
        [df.drop(columns=column_to_normalize), flattened_df], axis=1
    )
    return result_df, flattened_df.columns


# Top-level config entries whose values are nested dicts and must be flattened
columns_to_normalize = ["model", "dataset", "callbacks", "paths"]

# Names of all flattened columns, accumulated in the order they are created
config_columns = []
for column in columns_to_normalize:
    df, columns = normalize_column(df, column)
    config_columns += list(columns)

### Concatenate tables to obtain the full hyperparameter space

In [3]:
# Load the separately stored additional runs; the first CSV column is used as the row index.
# NOTE(review): presumably this table is already flattened like `df` — confirm the columns match.
additiona_runs = pd.read_csv('Simplicial_additional_runs.csv', index_col=0)


In [4]:
# Preview the row-wise concatenation; the result is only displayed here, not stored.
pd.concat([df, additiona_runs], axis=0)

Unnamed: 0,lr-Adam,_timestamp,val/recall,val/precision,train/accuracy,train/loss,trainer/global_step,val/loss,train/recall,val/accuracy,...,callbacks.model_checkpoint.save_on_train_epoch_end,callbacks.rich_progress_bar._target_,callbacks.learning_rate_monitor._target_,callbacks.learning_rate_monitor.logging_interval,paths.log_dir,paths.data_dir,paths.root_dir,paths.work_dir,paths.output_dir,dataset.transforms.one_hot_node_degree_features.max_degrees
0,0.000990,1.716749e+09,0.643183,0.662108,0.637470,0.682051,85.0,0.669272,0.637904,0.641675,...,,,,,,,,,,
1,0.000940,1.716749e+09,0.690025,0.724185,0.879800,0.263027,60.0,0.386238,0.757250,0.824000,...,,lightning.pytorch.callbacks.RichProgressBar,lightning.pytorch.callbacks.LearningRateMonitor,epoch,/home/lev/projects/TopoBenchmarkX/logs/,/home/lev/projects/TopoBenchmarkX/datasets/,/home/lev/projects/TopoBenchmarkX,/home/lev/projects/TopoBenchmarkX/topobenchmarkx,/home/lev/projects/TopoBenchmarkX/logs/train/m...,
2,0.000882,1.716749e+09,0.743229,0.729881,0.939800,0.141516,119.0,0.571109,0.895250,0.825600,...,,lightning.pytorch.callbacks.RichProgressBar,lightning.pytorch.callbacks.LearningRateMonitor,epoch,/home/lev/projects/TopoBenchmarkX/logs/,/home/lev/projects/TopoBenchmarkX/datasets/,/home/lev/projects/TopoBenchmarkX,/home/lev/projects/TopoBenchmarkX/topobenchmarkx,/home/lev/projects/TopoBenchmarkX/logs/train/m...,
3,0.000841,1.716749e+09,0.720611,0.728938,0.932000,0.163599,160.0,0.455577,0.873875,0.826400,...,,lightning.pytorch.callbacks.RichProgressBar,lightning.pytorch.callbacks.LearningRateMonitor,epoch,/home/lev/projects/TopoBenchmarkX/logs/,/home/lev/projects/TopoBenchmarkX/datasets/,/home/lev/projects/TopoBenchmarkX,/home/lev/projects/TopoBenchmarkX/topobenchmarkx,/home/lev/projects/TopoBenchmarkX/logs/train/m...,
4,0.000882,1.716749e+09,0.702433,0.716004,0.940400,0.146581,119.0,0.522819,0.910250,0.818800,...,,lightning.pytorch.callbacks.RichProgressBar,lightning.pytorch.callbacks.LearningRateMonitor,epoch,/home/lev/projects/TopoBenchmarkX/logs/,/home/lev/projects/TopoBenchmarkX/datasets/,/home/lev/projects/TopoBenchmarkX,/home/lev/projects/TopoBenchmarkX/topobenchmarkx,/home/lev/projects/TopoBenchmarkX/logs/train/m...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4009,0.007260,1.716589e+09,0.677322,0.713025,0.852600,0.302888,275.0,0.359202,0.706500,0.818000,...,,lightning.pytorch.callbacks.RichProgressBar,lightning.pytorch.callbacks.LearningRateMonitor,epoch,/home/gbg141/TopoBenchmarkX/logs/,/home/gbg141/TopoBenchmarkX/datasets/,/home/gbg141/TopoBenchmarkX,/home/gbg141/TopoBenchmarkX/hp_scripts/main_ex...,/home/gbg141/TopoBenchmarkX/logs/train/multiru...,
4010,0.006960,1.716589e+09,0.682287,0.710489,0.854800,0.301027,305.0,0.363667,0.716125,0.816400,...,,lightning.pytorch.callbacks.RichProgressBar,lightning.pytorch.callbacks.LearningRateMonitor,epoch,/home/gbg141/TopoBenchmarkX/logs/,/home/gbg141/TopoBenchmarkX/datasets/,/home/gbg141/TopoBenchmarkX,/home/gbg141/TopoBenchmarkX/hp_scripts/main_ex...,/home/gbg141/TopoBenchmarkX/logs/train/multiru...,
4011,0.006820,1.716589e+09,0.674839,0.714451,0.854400,0.301079,319.0,0.368256,0.721500,0.818800,...,,lightning.pytorch.callbacks.RichProgressBar,lightning.pytorch.callbacks.LearningRateMonitor,epoch,/home/gbg141/TopoBenchmarkX/logs/,/home/gbg141/TopoBenchmarkX/datasets/,/home/gbg141/TopoBenchmarkX,/home/gbg141/TopoBenchmarkX/hp_scripts/main_ex...,/home/gbg141/TopoBenchmarkX/logs/train/multiru...,
4012,0.000676,1.716589e+09,0.738251,0.758098,0.908481,0.318860,325.0,0.606247,0.855169,0.813945,...,,lightning.pytorch.callbacks.RichProgressBar,lightning.pytorch.callbacks.LearningRateMonitor,epoch,/home/gbg141/TopoBenchmarkX/logs/,/home/gbg141/TopoBenchmarkX/datasets/,/home/gbg141/TopoBenchmarkX,/home/gbg141/TopoBenchmarkX/hp_scripts/main_ex...,/home/gbg141/TopoBenchmarkX/logs/train/multiru...,


In [5]:
# Append the additional runs below the W&B runs. Index labels are kept as-is
# (not renumbered here), and re-running this cell appends the rows again.
df = pd.concat([df, additiona_runs], axis=0)

## Select models that have finished the runs

## Work out the US-county-demos dataset names

In [6]:
# Rows of the 'US-county-demos' dataset get one name per target variable:
# their data_name becomes 'US-county-demos-<task_variable>'.
data_name_col = 'dataset.parameters.data_name'
is_us_county = df[data_name_col] == 'US-county-demos'
df.loc[is_us_county, data_name_col] = (
    df.loc[is_us_county, data_name_col]
    + '-'
    + df.loc[is_us_county, 'dataset.parameters.task_variable']
)

In [7]:
# Print all column names, five per line
names_per_line = 5
for start in range(0, len(df.columns), names_per_line):
    print(list(df.columns[start:start + names_per_line]))


['lr-Adam', '_timestamp', 'val/recall', 'val/precision', 'train/accuracy']
['train/loss', 'trainer/global_step', 'val/loss', 'train/recall', 'val/accuracy']
['train/precision', '_step', 'epoch', '_runtime', 'val/auroc']
['train/auroc', 'seed', 'tags', 'extras', 'trainer']
['ckpt_path', 'task_name', 'model/params/total', 'model/params/trainable', 'model/params/non_trainable']
['test/auroc', 'test/accuracy', 'test/recall', 'test/precision', '_wandb']
['test/loss', 'val/mae', 'val/mse', 'test/mae', 'train/mae']
['test/mse', 'train/mse', 'epoch_run_time', 'model.compile', 'model._target_']
['model.model_name', 'model.model_domain', 'model.loss.task', 'model.loss._target_', 'model.loss.loss_type']
['model.readout._target_', 'model.readout.hidden_dim', 'model.readout.readout_name', 'model.readout.num_cell_dimensions', 'model.backbone._target_']
['model.backbone.n_layers', 'model.backbone.sc_order', 'model.backbone.aggr_norm', 'model.backbone.conv_order', 'model.backbone.update_func']
['model

### See unique datasets

In [8]:
# Distinct dataset names (computed once and reused for the count)
dataset_names = df['dataset.parameters.data_name'].unique()
print(dataset_names)
print("Num unique datasets:", len(dataset_names))

[nan 'minesweeper' 'NCI1' 'roman_empire' 'ZINC' 'PROTEINS' 'NCI109'
 'PubMed' 'citeseer' 'Cora' 'US-county-demos-UnemploymentRate'
 'US-county-demos-BachelorRate' 'MUTAG' 'US-county-demos-DeathRate'
 'US-county-demos-BirthRate' 'US-county-demos-MigraRate'
 'US-county-demos-MedianIncome' 'US-county-demos-Election']
Num unique datasets: 18


## See unique models

In [9]:
# Show the distinct values of the model-name column
print(df['model.model_name'].unique())

[nan 'sccnn_custom' 'scn' 'sccn']


## Solve batch problems

In [10]:
datasets = ['minesweeper', 'roman_empire']
models = ['sccnn_custom', 'scn', 'sccn']
# For these models and datasets the batch size was originally mistaken: it
# should be 1 instead of 256 or 128. This cell only inspects the data: it
# prints the batch sizes present for every (model, dataset) pair.
for model in models:
    print("MODEL:", model)
    is_model = df['model.model_name'] == model
    for dataset in datasets:
        is_pair = is_model & (df['dataset.parameters.data_name'] == dataset)
        print(df.loc[is_pair, 'dataset.parameters.batch_size'].unique())
        

MODEL: sccnn_custom
[1.]
[1.]
MODEL: scn
[]
[]
MODEL: sccn
[1.]
[1.]


## Solve issue with projection dropout

In [11]:
# Show the distinct projection-dropout values before filtering on them
print(df['model.feature_encoder.proj_dropout'].unique())

[ nan 0.25 0.5 ]


In [12]:
# Keep only runs whose projection dropout is 0.5 or 0.25
# (rows where the value is missing do not match and are dropped as well)
allowed_proj_dropout = [0.5, 0.25]
has_allowed_dropout = df['model.feature_encoder.proj_dropout'].isin(allowed_proj_dropout)
df = df[has_allowed_dropout]


In [13]:
# Renumber the rows 0..n-1 after filtering, discarding the old index labels
df = df.reset_index(drop=True)

In [14]:
# Swept parameters (including the seeds) whose explored values are inspected below
sweeped_columns = [
    'model.optimizer.lr',
    'model.feature_encoder.out_channels',
    'model.backbone.n_layers',
    'model.readout.readout_name',
    'dataset.transforms.graph2simplicial_lifting.signed',
    'model.feature_encoder.proj_dropout',
    'dataset.parameters.batch_size',
    'dataset.parameters.data_seed',
    'seed',
]

# For every (model, dataset) pair, print the distinct values of each swept parameter
for model in df['model.model_name'].unique():
    print(f"Model: {model}")
    is_model = df['model.model_name'] == model
    for dataset in df['dataset.parameters.data_name'].unique():
        print(f"Dataset: {dataset}")
        # Select the pair's runs once and reuse them for every column
        pair_runs = df.loc[is_model & (df['dataset.parameters.data_name'] == dataset)]
        for column in sweeped_columns:
            print(f"Column: {column}")
            print(pair_runs[column].unique())

        print('---------------NEW DATASET------------------')
    print('---------------NEW MODEL------------------')


Model: sccnn_custom
Dataset: minesweeper
Column: model.optimizer.lr
[0.001 0.01 ]
Column: model.feature_encoder.out_channels
[128.  64.  32.]
Column: model.backbone.n_layers
[4. 3. 2. 1.]
Column: model.readout.readout_name
['NoReadOut' 'PropagateSignalDown']
Column: dataset.transforms.graph2simplicial_lifting.signed
[True]
Column: model.feature_encoder.proj_dropout
[0.25 0.5 ]
Column: dataset.parameters.batch_size
[1.]
Column: dataset.parameters.data_seed
[0. 9. 7. 5. 3.]
Column: seed
[42.]
---------------NEW DATASET------------------
Dataset: NCI1
Column: model.optimizer.lr
[0.001 0.01 ]
Column: model.feature_encoder.out_channels
[ 64.  32. 128.]
Column: model.backbone.n_layers
[4. 3. 2. 1.]
Column: model.readout.readout_name
['NoReadOut' 'PropagateSignalDown']
Column: dataset.transforms.graph2simplicial_lifting.signed
[True]
Column: model.feature_encoder.proj_dropout
[0.25 0.5 ]
Column: dataset.parameters.batch_size
[256. 128.]
Column: dataset.parameters.data_seed
[3. 9. 7. 0. 5.]
Co

['PropagateSignalDown' 'NoReadOut']
Column: dataset.transforms.graph2simplicial_lifting.signed
[True]
Column: model.feature_encoder.proj_dropout
[0.5  0.25]
Column: dataset.parameters.batch_size
[1.]
Column: dataset.parameters.data_seed
[9. 7. 5. 3. 0.]
Column: seed
[42.]
---------------NEW DATASET------------------
Dataset: US-county-demos-BachelorRate
Column: model.optimizer.lr
[0.001 0.01 ]
Column: model.feature_encoder.out_channels
[128.  64.  32.]
Column: model.backbone.n_layers
[4. 3. 2. 1.]
Column: model.readout.readout_name
['PropagateSignalDown' 'NoReadOut']
Column: dataset.transforms.graph2simplicial_lifting.signed
[True]
Column: model.feature_encoder.proj_dropout
[0.5  0.25]
Column: dataset.parameters.batch_size
[1.]
Column: dataset.parameters.data_seed
[9. 7. 5. 3. 0.]
Column: seed
[42.]
---------------NEW DATASET------------------
Dataset: MUTAG
Column: model.optimizer.lr
[0.001 0.01 ]
Column: model.feature_encoder.out_channels
[128.  64.  32.]
Column: model.backbone.n_lay

### Get the best results

In [15]:
# Extract best results for each model and dataset
# 1. Keep only the columns that are necessary for the comparison

# Columns identifying the model and the dataset of a run
dataset_model_columns = ['model.model_name', 'dataset.parameters.data_name']

# Hyperparameters that define a configuration. The seeds are deliberately
# excluded here: they are listed separately in `run_columns`.
sweeped_columns = [
    'model.optimizer.lr',
    'model.feature_encoder.out_channels',
    'model.backbone.n_layers',
    'model.readout.readout_name',
    'dataset.transforms.graph2simplicial_lifting.signed',
    'model.feature_encoder.proj_dropout',
    'dataset.parameters.batch_size',
]
run_columns = ['dataset.parameters.data_seed', 'seed']

# Performance columns: a validation/test pair for every metric
performance_columns = [
    f"{split}/{metric}"
    for metric in ('loss', 'mae', 'mse', 'accuracy', 'auroc', 'recall', 'precision')
    for split in ('val', 'test')
]

keep_columns = [
    *dataset_model_columns,
    *sweeped_columns,
    *performance_columns,
    *run_columns,
]
df = df[keep_columns]

In [16]:
# Metric columns reported for classification and for regression datasets
performance_classification = [
    f"{split}/{metric}"
    for metric in ('accuracy', 'auroc', 'recall', 'precision')
    for split in ('val', 'test')
]
performance_regression = [
    f"{split}/{metric}"
    for metric in ('mae', 'mse')
    for split in ('val', 'test')
]

# Datasets grouped by the metric used for model selection
accuracy_datasets = [
    'IMDB-MULTI', 'IMDB-BINARY', 'REDDIT-BINARY',
    'NCI109', 'NCI1', 'PROTEINS', 'MUTAG',
    'Cora', 'citeseer', 'PubMed',
    'roman_empire', 'amazon_ratings',
]
auroc_datasets = ['tolokers', 'questions', 'minesweeper']
us_county_targets = [
    'UnemploymentRate', 'BachelorRate', 'DeathRate', 'BirthRate',
    'MigraRate', 'MedianIncome', 'Election',
]

# For each dataset: the metric to optimise on validation, the metric to report
# on test, the optimisation direction and the metric columns that apply to it.
optimization_metrics = {}
for data_name in accuracy_datasets:
    optimization_metrics[data_name] = {
        'optim_metric': 'val/accuracy',
        'eval_metric': 'test/accuracy',
        'direction': 'max',
        'performance_columns': performance_classification,
    }
for data_name in auroc_datasets:
    optimization_metrics[data_name] = {
        'optim_metric': 'val/auroc',
        'eval_metric': 'test/auroc',
        'direction': 'max',
        'performance_columns': performance_classification,
    }
# ZINC is selected on validation MSE but reported with test MAE
optimization_metrics['ZINC'] = {
    'optim_metric': 'val/mse',
    'eval_metric': 'test/mae',
    'direction': 'min',
    'performance_columns': performance_regression,
}
for target in us_county_targets:
    optimization_metrics[f'US-county-demos-{target}'] = {
        'optim_metric': 'val/mse',
        'eval_metric': 'test/mse',
        'direction': 'min',
        'performance_columns': performance_regression,
    }

len(optimization_metrics)

23

### Generate the best results

In [17]:
# Unique datasets and models present in the filtered runs
datasets = list(df['dataset.parameters.data_name'].unique())
models = list(df['model.model_name'].unique())

# best_results[dataset][model] -> {"mean", "std"} of the evaluation metric for
#                                 the selected hyperparameter configuration
# hp_runs[dataset][model]      -> every run of that (dataset, model) pair
# best_runs[dataset][model]    -> only the runs of the selected configuration
best_results = defaultdict(dict)
hp_runs = defaultdict(dict)
best_runs = defaultdict(dict)

# Go over each dataset and model and find the best result
for dataset in datasets:
    metric_spec = optimization_metrics.get(dataset)
    if metric_spec is None:
        # A missing (NaN) or unlisted dataset name has no selection rule;
        # skip it instead of aborting the whole loop with a KeyError.
        warnings.warn(
            f"No optimization metrics defined for dataset {dataset!r}; skipping it."
        )
        continue

    optim_metric = metric_spec['optim_metric']
    eval_metric = metric_spec['eval_metric']
    direction = metric_spec['direction']
    # Metrics that matter for this dataset. Kept under its own name so the
    # global `performance_columns` list is not overwritten by the loop.
    metric_columns = metric_spec['performance_columns']

    # Classification metrics are reported in percent with 2 decimals,
    # regression metrics unscaled with 4 decimals.
    is_classification = 'test/accuracy' in metric_columns
    decimals = 2 if is_classification else 4

    for model in models:
        # Runs of the current dataset and model, restricted to relevant columns
        subset = df[
            (df['dataset.parameters.data_name'] == dataset)
            & (df['model.model_name'] == model)
        ]
        subset = subset[
            dataset_model_columns + sweeped_columns + metric_columns + run_columns
        ]

        # Mean/std of every metric per hyperparameter configuration; NaN
        # hyperparameter values form their own group (dropna=False).
        aggregated = subset.groupby(sweeped_columns, dropna=False).agg(
            {col: ["mean", "std"] for col in metric_columns}
        )
        # Go from MultiIndex to Index
        aggregated = aggregated.reset_index()
        # Best configuration first (sorting happens before any rounding)
        aggregated = aggregated.sort_values(
            by=(optim_metric, "mean"), ascending=(direction == 'min')
        )

        for col in metric_columns:
            for stat in ("mean", "std"):
                values = aggregated[(col, stat)]
                if is_classification:
                    values = values * 100
                aggregated[(col, stat)] = values.round(decimals)

        # Get the best result
        final_best = aggregated.head(1)
        best_mean = final_best[(eval_metric, "mean")]

        # No runs, or the selected configuration has no evaluation value.
        # A missing-value test is used rather than truthiness so that a
        # legitimate mean of exactly 0.0 still counts as a result.
        if final_best.empty or pd.isna(best_mean.iloc[0]):
            best_results[dataset][model] = {
                "mean": np.nan,
                "std": np.nan,
            }
            continue

        best_results[dataset][model] = {
            "mean": best_mean.values[0],
            "std": final_best[(eval_metric, "std")].values[0],
        }

        # Hyperparameter values of the selected configuration
        best_params = {
            col: final_best[(col, '')].item() for col in sweeped_columns
        }

        hp_runs[dataset][model] = subset.copy()

        # Narrow the runs down to the selected configuration, one parameter at
        # a time. NaN never compares equal to itself, so a NaN parameter value
        # has to be matched with isna() to stay consistent with the groupby.
        filtered_subset = subset.copy()
        for param, value in best_params.items():
            column_values = filtered_subset[param]
            if pd.isna(value):
                matches = column_values.isna()
            else:
                matches = column_values == value
            filtered_subset = filtered_subset[matches]
        best_runs[dataset][model] = filtered_subset

        

        
            
        


In [18]:
# Let pandas print wide/long frames without truncation
for option_name in ("display.max_rows", "display.max_columns", "display.width"):
    pd.set_option(option_name, 1000)

# Hyperparameter columns inspected for coverage
COLS = [
    'dataset.parameters.data_seed',
    'dataset.parameters.batch_size',
    'model.backbone.n_layers',
    'model.feature_encoder.out_channels',
    'model.readout.readout_name',
    'model.feature_encoder.proj_dropout',
    'model.optimizer.lr',
]

# All runs of 'scn' on 'US-county-demos-BirthRate', restricted to COLS and
# sorted by them in descending order
scn_birthrate_runs = hp_runs['US-county-demos-BirthRate']['scn']
a = scn_birthrate_runs[COLS].sort_values(by=COLS, ascending=False)


In [19]:
# For each inspected hyperparameter column, print how many runs used each value
for col in COLS:
    print(a[col].value_counts())

dataset.parameters.data_seed
5.0    92
3.0    92
7.0    90
9.0    88
0.0    88
Name: count, dtype: int64
dataset.parameters.batch_size
1.0    450
Name: count, dtype: int64
model.backbone.n_layers
3.0    117
2.0    113
4.0    111
1.0    109
Name: count, dtype: int64
model.feature_encoder.out_channels
128.0    153
32.0     150
64.0     147
Name: count, dtype: int64
model.readout.readout_name
PropagateSignalDown    229
NoReadOut              221
Name: count, dtype: int64
model.feature_encoder.proj_dropout
0.25    232
0.50    218
Name: count, dtype: int64
model.optimizer.lr
0.001    229
0.010    221
Name: count, dtype: int64


In [20]:
# Same per-column value counts as the cell above (this cell repeats it verbatim)
for col in COLS:
    print(a[col].value_counts())

dataset.parameters.data_seed
5.0    92
3.0    92
7.0    90
9.0    88
0.0    88
Name: count, dtype: int64
dataset.parameters.batch_size
1.0    450
Name: count, dtype: int64
model.backbone.n_layers
3.0    117
2.0    113
4.0    111
1.0    109
Name: count, dtype: int64
model.feature_encoder.out_channels
128.0    153
32.0     150
64.0     147
Name: count, dtype: int64
model.readout.readout_name
PropagateSignalDown    229
NoReadOut              221
Name: count, dtype: int64
model.feature_encoder.proj_dropout
0.25    232
0.50    218
Name: count, dtype: int64
model.optimizer.lr
0.001    229
0.010    221
Name: count, dtype: int64


## Save obtained best results and best runs

In [21]:
# Convert nested dictionary to DataFrame
nested_dict = dict(best_results)
result_dict = pd.DataFrame.from_dict(
    {
        (i, j): nested_dict[i][j]
        for i in nested_dict
        for j in nested_dict[i].keys()
    },
    orient="index",
)

result_dict["performance"] = result_dict.apply(
    lambda x: f"{x['mean']} ± {x['std']}", axis=1
)
result_dict = result_dict.drop(["mean", "std"], axis=1)

# Reset multiindex
result_dict = result_dict.reset_index()
# rename columns
result_dict.columns = ["Dataset", "Model", "Performance"]

result_dict = result_dict.pivot_table(
    index="Model", columns="Dataset", values="Performance", aggfunc="first"
)

In [22]:
# Display the table of "mean ± std" strings: one row per model, one column per dataset
result_dict

Dataset,Cora,MUTAG,NCI1,NCI109,PROTEINS,PubMed,US-county-demos-BachelorRate,US-county-demos-BirthRate,US-county-demos-DeathRate,US-county-demos-Election,US-county-demos-MedianIncome,US-county-demos-MigraRate,US-county-demos-UnemploymentRate,ZINC,citeseer,minesweeper,roman_empire
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
sccn,80.86 ± 2.16,70.64 ± 5.9,75.27 ± 1.74,75.31 ± 1.36,75.05 ± 2.76,88.37 ± 0.48,0.3588 ± 0.0246,0.8242 ± 0.0942,0.5751 ± 0.0553,0.5344 ± 0.0323,0.2908 ± 0.032,0.9146 ± 0.1822,0.4328 ± 0.044,0.4858 ± 0.0584,69.6 ± 1.83,89.07 ± 0.25,88.27 ± 0.14
sccnn_custom,82.19 ± 1.07,76.17 ± 6.63,nan ± nan,75.31 ± 0.47,74.19 ± 2.86,88.18 ± 0.32,0.3394 ± 0.028,0.7937 ± 0.1162,0.5527 ± 0.0474,0.5112 ± 0.0316,0.2825 ± 0.0279,0.8976 ± 0.1431,0.4278 ± 0.0394,0.4088 ± 0.0047,70.23 ± 2.69,89.0 ± 0.0,89.15 ± 0.32
scn,82.27 ± 1.34,73.62 ± 6.13,nan ± nan,nan ± nan,75.27 ± 2.14,88.72 ± 0.5,0.3186 ± 0.0241,0.7122 ± 0.0836,0.5208 ± 0.0525,0.4648 ± 0.043,0.2526 ± 0.0247,0.9209 ± 0.1993,0.3753 ± 0.0432,nan ± nan,71.24 ± 1.68,nan ± nan,nan ± nan


In [23]:
# Increase the number of allowed rows to display
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)
pd.set_option("display.width", 1000)
result_dict.to_csv(f"best_results_simplicial.csv")