In [7]:
import ast
import glob
import warnings
from collections import defaultdict
from datetime import date

import numpy as np
import pandas as pd
import wandb

today = date.today()
api = wandb.Api()

# W&B project to analyse and the account that owns it.
project_name = "graph_baselines_graph_baselines"  # rebutal_CWN_ogbg-molhiv fix_gnn_rebuttal_cell_NCI109  fix_gnn_rebuttal_cell_MUTAG fix_gnn_rebuttal_cell_PROTEINS fix_gnn_rebuttal_cell_ZINC
user = "levsap" #"telyatnikov_sap"

# Names (without the ".csv" extension) of the CSV files in the current
# directory; a file named after the project acts as a local cache.
csv_files = glob.glob("*.csv")
csv_names = [csv_file[:-4] for csv_file in csv_files]

if project_name not in csv_names:
    # No cache yet: download every run of the project.
    runs = api.runs(f"{user}/{project_name}")

    summary_list, config_list, name_list = [], [], []
    for run in runs:
        # .summary contains the output keys/values for metrics like accuracy.
        #  We call ._json_dict to omit large files
        summary_list.append(run.summary._json_dict)

        # .config contains the hyperparameters.
        #  We remove special values that start with _.
        config_list.append(
            {k: v for k, v in run.config.items() if not k.startswith("_")}
        )

        # .name is the human-readable name of the run.
        name_list.append(run.name)

    runs_df = pd.DataFrame(
        {"summary": summary_list, "config": config_list, "name": name_list}
    )

    # Cache the download; the dictionaries are written as their text form.
    runs_df.to_csv(f"{project_name}.csv")
else:
    runs_df = pd.read_csv(f"{project_name}.csv", index_col=0)

    # The CSV holds the dictionaries as text. Parse whole columns and assign
    # them back to the frame: assigning into the rows yielded by
    # `for row in runs_df.iloc` only reaches `runs_df` when pandas happens to
    # return a view, which leaves the values as strings otherwise.
    for nested_column in ("summary", "config"):
        runs_df[nested_column] = runs_df[nested_column].apply(ast.literal_eval)

# Merge the hyperparameters into the metrics of the same run. The summary
# dictionaries are updated in place; config values win on key clashes.
for summary, config in zip(runs_df["summary"], runs_df["config"]):
    summary.update(config)

# One row per run, one column per summary/config key.
lst = runs_df["summary"].tolist()
df = pd.DataFrame(lst)

# Untouched copy of the merged table, kept before any further processing.
df_init = df.copy()

# Get average epoch run time
#df["epoch_run_time"] = df["_runtime"] / df["epoch"]

In [8]:
# Average time per epoch: the run's `_runtime` divided by its `epoch` value.
# Runs whose `epoch` is 0 end up with inf/NaN here.
df["epoch_run_time"] = df["_runtime"].div(df["epoch"])

In [9]:
# Size check of the merged table: (number of runs, number of columns).
df.shape

(7679, 104)

In [10]:
def normalize_column(df, column_to_normalize):
    """Flatten a column of nested dictionaries into separate, prefixed columns.

    Every dictionary stored in ``df[column_to_normalize]`` is expanded with
    ``pd.json_normalize`` (nested keys are joined with ``"."``) and each new
    column name is prefixed with ``"<column_to_normalize>."``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the nested column. It is not modified.
    column_to_normalize : str
        Name of the column whose values are dictionaries.

    Returns
    -------
    result_df : pd.DataFrame
        ``df`` without ``column_to_normalize``, followed by the flattened
        columns. Rows keep the index of ``df``.
    new_columns : pd.Index
        Names of the flattened columns that were appended.
    """
    # Entries that are not dictionaries (e.g. NaN when a run has no such
    # section) would make json_normalize fail; treat them as empty records so
    # the row simply gets NaN in every flattened column.
    records = [
        value if isinstance(value, dict) else {}
        for value in df[column_to_normalize]
    ]
    flattened_df = pd.json_normalize(records)
    flattened_df.columns = [
        f"{column_to_normalize}.{col}" for col in flattened_df.columns
    ]
    # The flattened frame is numbered 0..n-1 while concat(axis=1) aligns on
    # index labels. Reuse the index of `df` so rows are paired by position even
    # when `df` was filtered or re-indexed beforehand.
    flattened_df.index = df.index

    result_df = pd.concat(
        [df.drop(columns=column_to_normalize), flattened_df], axis=1
    )
    return result_df, flattened_df.columns


# Config sections whose cells hold nested dictionaries
columns_to_normalize = ["model", "dataset", "callbacks", "paths"]

# Flatten the sections one after another and record every column they produce
config_columns = []
for nested_column in columns_to_normalize:
    df, flattened_names = normalize_column(df, nested_column)
    config_columns += list(flattened_names)

In [11]:
# Columns excluded from the analysis: they should not be varied, or their
# variation is not interesting.
remove_cols = [
    #"dataset.transforms.data_manipulations.selected_fields",
    "callbacks.model_checkpoint.dirpath", 'model.feature_encoder.selected_dimensions','callbacks.model_checkpoint.dirpath',
]

# Requested columns that are really present, each listed once in list order
removed_columns = [
    name for name in dict.fromkeys(remove_cols) if name in df.columns
]
df = df.drop(columns=removed_columns)

print("Removed columns:", removed_columns)

# The dropped columns must not be treated as config columns any more
config_columns = [
    name for name in config_columns if name not in removed_columns
]

Removed columns: ['callbacks.model_checkpoint.dirpath']


In [None]:
# List the columns available after the nested config sections were flattened.
df.columns

In [26]:
# Columns whose name mentions "best"
[column for column in df.columns if "best" in column]

['best_epoch',
 'best_epoch/checkpoint',
 'best_epoch/train/accuracy',
 'best_epoch/train/auroc',
 'best_epoch/train/average_precision',
 'best_epoch/train/cohen_kappa',
 'best_epoch/train/f1_score',
 'best_epoch/train/jaccard',
 'best_epoch/train/loss',
 'best_epoch/train/mcc',
 'best_epoch/train/precision',
 'best_epoch/train/recall',
 'best_epoch/val/accuracy',
 'best_epoch/val/auroc',
 'best_epoch/val/average_precision',
 'best_epoch/val/cohen_kappa',
 'best_epoch/val/f1_score',
 'best_epoch/val/jaccard',
 'best_epoch/val/loss',
 'best_epoch/val/mcc',
 'best_epoch/val/precision',
 'best_epoch/val/recall',
 'best_epoch/train/mae',
 'best_epoch/train/mse',
 'best_epoch/train/r2',
 'best_epoch/val/mae',
 'best_epoch/val/mse',
 'best_epoch/val/r2',
 'callbacks.best_epoch_metrics.mode',
 'callbacks.best_epoch_metrics.monitor',
 'callbacks.best_epoch_metrics._target_']

In [None]:
# Keep only the runs whose `epoch` value reached the minimum of 50
MIN_EPOCHS = 50
df = df.loc[df["epoch"].ge(MIN_EPOCHS)]

15      0.0
16      0.0
17      0.0
18      0.0
19      0.0
       ... 
7514    4.0
7515    4.0
7516    4.0
7517    4.0
7518    4.0
Name: epoch, Length: 502, dtype: float64

In [33]:
# Report and drop the runs that have no backbone target
n_missing_backbone = df["model.backbone._target_"].isna().sum()
print(
    f"Number of rows with model.backbone._target_ = nan is {n_missing_backbone}"
)
df = df.dropna(subset=["model.backbone._target_"]).reset_index(drop=True)

# Report the runs that do not record the metric monitored by early stopping
n_missing_monitor = df["callbacks.early_stopping.monitor"].isna().sum()
print(
    f"Number of rows with callbacks.early_stopping.monitor = nan is {n_missing_monitor}"
)

# print("Because of SCCN and CWN false runs there were 96 such runs on 13/03/24")

# ... and drop them as well, renumbering the rows afterwards
df = df.dropna(subset=["callbacks.early_stopping.monitor"]).reset_index(drop=True)


# Shorten the dotted backbone target to its last component (the model name)
df["model.backbone._target_"] = df["model.backbone._target_"].map(
    lambda target: target.rsplit(".", 1)[-1]
)

Number of rows with model.backbone._target_ = nan is 1
Number of rows with callbacks.early_stopping.monitor = nan is 0


In [34]:
# Model names left after cleaning (the dotted target was reduced to its last part).
df["model.backbone._target_"].unique()

array(['SANN'], dtype=object)

In [35]:
# Identify unique models in DataFrame
unique_models = df["model.backbone._target_"].unique()

# Identify unique datasets in DataFrame
unique_datasets = df["dataset.loader.parameters.data_name"].unique()


# Results keyed as [dataset][model]
collected_results = defaultdict(dict)  # best mean/std of the test metric
collected_results_time = defaultdict(dict)  # mean/std of the time per epoch
collected_results_time_run = defaultdict(dict)  # mean/std of the total runtime

collected_aggregated_results = defaultdict(dict)  # every configuration, sorted
collected_non_aggregated_results = defaultdict(dict)  # raw runs + grouping info

# Columns that must never define a configuration: runs that differ only in
# these are repetitions of the same configuration and have to be averaged.
# The seed is logged as "dataset.split_params.data_seed" in the runs analysed
# here; the older "dataset.parameters.data_seed" name is kept as well.
# NOTE(review): any other per-run column (e.g. an output path) left in
# `config_columns` would also split the groups into single runs - confirm.
special_columns = [
    "dataset.parameters.data_seed",
    "dataset.split_params.data_seed",
]


def get_metric(df):
    """Return the name of the metric monitored by early stopping.

    The part after the last "/" of ``callbacks.early_stopping.monitor`` is
    returned. All rows of ``df`` must share the same monitor value.
    """
    metric_ = df["callbacks.early_stopping.monitor"].unique()
    assert len(metric_) == 1, "There should be only one metric to optimize"
    return metric_[0].split("/")[-1]


# Go over each dataset and model and find the best result
for dataset in unique_datasets:
    for model in unique_models:
        # Runs of the current dataset and model. An explicit copy lets us add
        # a column without touching `df` and without silencing warnings.
        subset = df[
            (df["dataset.loader.parameters.data_name"] == dataset)
            & (df["model.backbone._target_"] == model)
        ].copy()

        if subset.empty:
            print("---------")
            print(f"No results for {model} on {dataset}")
            print("---------")
            continue
        subset["Model"] = model

        # Cols to get statistics later
        # TODO: log maximum validation value for optimized metric
        metric_col = f"test/{get_metric(subset)}"
        performance_cols = [metric_col]

        # Get the unique values for each config column. Columns that are
        # missing or hold unhashable values (e.g. lists) are reported and
        # skipped; KeyboardInterrupt and SystemExit are no longer swallowed.
        unique_column_values = {}
        for col in config_columns:
            try:
                unique_column_values[col] = subset[col].unique()
            except Exception:
                print(f"Attention the columns: {col}, has issues with unique values")

        # Keep only those columns that have more than one unique value
        unique_column_values = {
            k: v for k, v in unique_column_values.items() if len(v) > 1
        }

        # Print the unique values for each config column
        print(f"Unique values for each config column for {model} on {dataset}:")
        for col, unique in unique_column_values.items():
            print(f"{col}: {unique}")
            print()
        print("---------")

        # Seeds are repetitions, not hyperparameters: exclude them from the
        # aggregation columns so that runs are averaged over seeds.
        for col in special_columns:
            unique_column_values.pop(col, None)

        # Obtain the aggregation columns
        aggregation_columns = ["Model"] + list(unique_column_values.keys())

        collected_non_aggregated_results[dataset][model] = {
            "df": subset.copy(),
            "aggregation_columns": aggregation_columns,
            "performance_cols": performance_cols,
        }

        # Mean and std of the test metric for every configuration
        aggregated = subset.groupby(aggregation_columns).agg(
            {col: ["mean", "std"] for col in performance_cols}
        )

        # Go from MultiIndex to Index
        aggregated = aggregated.reset_index()

        early_stopping_modes = subset["callbacks.early_stopping.mode"].unique()
        assert (
            len(early_stopping_modes) == 1
        ), "There should be only one mode for early stopping"

        # "max" metrics are sorted descending, everything else ascending, so
        # the best configuration is always the first row.
        maximize = early_stopping_modes[0] == "max"
        ascending = not maximize
        aggregated_sorted = aggregated.sort_values(
            by=(metric_col, "mean"), ascending=ascending
        )

        best_mean = aggregated_sorted[(metric_col, "mean")].iloc[0]
        best_std = aggregated_sorted[(metric_col, "std")].iloc[0]
        if maximize:
            # Metrics to maximise are reported multiplied by 100. Only the two
            # statistics are scaled, not the hyperparameter columns of the row.
            best_mean = round(best_mean * 100, 2)
            best_std = round(best_std * 100, 2)

        collected_results[dataset][model] = {
            "mean": best_mean,
            "std": best_std,
        }

        # Get average epoch run time
        collected_results_time[dataset][model] = {
            "mean": subset['AvgTime/train_epoch_mean'].mean(),
            "std": subset['AvgTime/train_epoch_mean'].std(),
        }

        collected_results_time_run[dataset][model] = {
            "mean": subset["_runtime"].mean(),
            "std": subset["_runtime"].std(),
        }

        collected_aggregated_results[dataset][model] = aggregated_sorted

Attention the columns: model.feature_encoder.in_channels, has issues with unique values
Attention the columns: model.feature_encoder.dataset_in_channels, has issues with unique values
Attention the columns: model.feature_encoder.selected_dimensions, has issues with unique values
Attention the columns: dataset.parameters.num_features, has issues with unique values
Attention the columns: callbacks.model_checkpoint.dirpath, has issues with unique values
Unique values for each config column for SANN on ogbg-molhiv:
model.readout.max_hop: [5. 4. 2.]

model.readout.hidden_dim: [128. 256.]

model.backbone.max_hop: [5. 4. 2.]

model.backbone.in_channels: [128. 256.]

model.backbone.hidden_channels: [128. 256.]

model.feature_encoder.max_hop: [5. 4. 2.]

model.feature_encoder.out_channels: [128. 256.]

model.feature_encoder.proj_dropout: [0.   0.25]

model.backbone_wrapper.max_hop: [5. 4. 2.]

model.backbone_wrapper.out_channels: [128. 256.]

dataset.split_params.data_seed: [0. 3. 5. 7. 9.]

pa

### Runtime per epoch

In [36]:
# Flatten {dataset: {model: stats}} into one entry per (dataset, model) pair
epoch_time_stats = {
    (dataset_name, model_name): stats
    for dataset_name, per_model in collected_results_time.items()
    for model_name, stats in per_model.items()
}
result_dict = pd.DataFrame.from_dict(epoch_time_stats, orient="index").round(2)

# Combine both statistics into a single "mean ± std" text column
result_dict["performance"] = result_dict.apply(
    lambda stats_row: f"{stats_row['mean']} ± {stats_row['std']}", axis=1
)

# Drop the numeric columns and turn the (dataset, model) index into columns
result_dict = result_dict.drop(columns=["mean", "std"]).reset_index()
result_dict.columns = ["Dataset", "Model", "Average Time per Epoch"]

In [38]:
# Show the table with one row per (dataset, model) pair built above.
result_dict

Unnamed: 0,Dataset,Model,Average Time per Epoch
0,ogbg-molhiv,SANN,15.75 ± 2.78


In [39]:
# Models as rows and datasets as columns, limited to the dataset of interest
result_dict.pivot_table(
    index="Model",
    columns="Dataset",
    values="Average Time per Epoch",
    aggfunc="first",
).loc[:, ["ogbg-molhiv"]]  # [['MUTAG', 'NCI1','NCI109','PROTEINS','ZINC']]

Dataset,ogbg-molhiv
Model,Unnamed: 1_level_1
SANN,15.75 ± 2.78


### Runtime

In [40]:
# Flatten {dataset: {model: stats}} into one entry per (dataset, model) pair
run_time_stats = {
    (dataset_name, model_name): stats
    for dataset_name, per_model in collected_results_time_run.items()
    for model_name, stats in per_model.items()
}
result_dict = pd.DataFrame.from_dict(run_time_stats, orient="index").round(2)

# Combine both statistics into a single "mean ± std" text column
result_dict["performance"] = result_dict.apply(
    lambda stats_row: f"{stats_row['mean']} ± {stats_row['std']}", axis=1
)

# Drop the numeric columns and turn the (dataset, model) index into columns
result_dict = result_dict.drop(columns=["mean", "std"]).reset_index()
result_dict.columns = ["Dataset", "Model", "Average Training Time"]


In [43]:
# Models as rows and datasets as columns, limited to the dataset of interest
result_dict.pivot_table(
    index="Model",
    columns="Dataset",
    values="Average Training Time",
    aggfunc="first",
).loc[:, ["ogbg-molhiv"]]  # [['MUTAG', 'NCI1','NCI109','PROTEINS','ZINC']]

Dataset,ogbg-molhiv
Model,Unnamed: 1_level_1
SANN,1378.66 ± 306.25
