In [3]:
import ast
import glob
import warnings
from collections import defaultdict
from datetime import date

import numpy as np
import pandas as pd
import wandb

today = date.today()
api = wandb.Api()

# Names of CSV exports considered available locally. The list is empty, so the
# cache lookup below never matches and the runs are always fetched from W&B.
# NOTE(review): presumably meant to be glob.glob("*.csv") -- confirm before
# enabling, since that would switch this cell to reading the saved export.
csv_files = []
# Strip the ".csv" extension to obtain the bare export names
csv_names = [csv_file[:-4] for csv_file in csv_files]
project_name = "TopoBenchmarkX_Simplicial"
user = "gbg141"

if project_name not in csv_names:
    # No cached export for this project: download every run from W&B.
    runs = api.runs(f"{user}/{project_name}")

    summary_list, config_list, name_list = [], [], []
    for run in runs:
        # .summary contains the output keys/values for metrics like accuracy.
        #  We call ._json_dict to omit large files
        summary_list.append(run.summary._json_dict)

        # .config contains the hyperparameters.
        #  We remove special values that start with _.
        config_list.append(
            {k: v for k, v in run.config.items() if not k.startswith("_")}
        )

        # .name is the human-readable name of the run.
        name_list.append(run.name)

    runs_df = pd.DataFrame(
        {"summary": summary_list, "config": config_list, "name": name_list}
    )

    # Saved before the summary/config merge below, so the export keeps the
    # two columns separate.
    runs_df.to_csv(f"{project_name}.csv")
else:
    runs_df = pd.read_csv(f"{project_name}.csv", index_col=0)

    # The CSV holds each dict as its string repr. Parse whole columns and
    # assign them back: writing into the temporary row objects produced by
    # iterating `runs_df.iloc` is not guaranteed to reach the DataFrame.
    runs_df["summary"] = runs_df["summary"].apply(ast.literal_eval)
    runs_df["config"] = runs_df["config"].apply(ast.literal_eval)


# Fold each run's hyperparameters into its metrics dict (in place), giving one
# flat record per run. Config keys overwrite summary keys of the same name.
for summary, config in zip(runs_df["summary"], runs_df["config"]):
    summary.update(config)

lst = list(runs_df["summary"])
df = pd.DataFrame(lst)

# Untouched snapshot of the merged table, taken before any derived columns
df_init = df.copy()

# Get average epoch run time
df["epoch_run_time"] = df["_runtime"] / df["epoch"]

In [5]:
def normalize_column(df, column_to_normalize):
    """Expand a column of nested dictionaries into one column per leaf key.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_to_normalize``. It is not modified.
    column_to_normalize : str
        Name of the column whose values are (possibly nested) dictionaries.

    Returns
    -------
    result_df : pandas.DataFrame
        ``df`` without ``column_to_normalize`` and with the flattened columns
        appended, each named ``"<column_to_normalize>.<flattened key>"``.
    new_columns : pandas.Index
        Names of the appended columns.
    """
    # Use json_normalize to flatten the nested dictionaries into separate columns
    flattened_df = pd.json_normalize(df[column_to_normalize])
    # json_normalize may hand back a fresh 0..n-1 index; reuse df's index so the
    # column-wise concat below pairs rows by position even when df has been
    # filtered or re-indexed.
    flattened_df.index = df.index
    # Prefix every flattened column with the name of the source column
    flattened_df.columns = [
        f"{column_to_normalize}.{col}" for col in flattened_df.columns
    ]
    # Names of the columns that are being added
    new_columns = flattened_df.columns
    # Append the flattened columns and drop the original nested column
    result_df = pd.concat([df, flattened_df], axis=1).drop(
        column_to_normalize, axis=1
    )
    return result_df, new_columns


# Nested config sections that get expanded into flat columns
columns_to_normalize = ["model", "dataset", "callbacks", "paths"]

# Running list of every flattened column name that gets added to df
config_columns = []
for column in columns_to_normalize:
    df, columns = normalize_column(df=df, column_to_normalize=column)
    config_columns += list(columns)

In [6]:
# List every column name, printed in groups of five per line
column_names = list(df.columns)
for i in range(0, len(column_names), 5):
    print(column_names[i:i + 5])


['val/mae', 'val/mse', 'train/loss', '_wandb', 'test/mse']
['train/mse', '_step', 'val/loss', 'train/mae', 'trainer/global_step']
['epoch', 'lr-Adam', '_runtime', 'test/mae', 'test/loss']
['_timestamp', 'seed', 'tags', 'extras', 'trainer']
['ckpt_path', 'task_name', 'model/params/total', 'model/params/trainable', 'model/params/non_trainable']
['train/auroc', 'val/auroc', 'val/accuracy', 'val/precision', 'train/precision']
['val/recall', 'train/recall', 'train/accuracy', 'test/recall', 'test/accuracy']
['test/precision', 'test/auroc', 'epoch_run_time', 'model.compile', 'model._target_']
['model.model_name', 'model.model_domain', 'model.loss.task', 'model.loss._target_', 'model.loss.loss_type']
['model.readout._target_', 'model.readout.hidden_dim', 'model.readout.readout_name', 'model.readout.num_cell_dimensions', 'model.backbone._target_']
['model.backbone.channels', 'model.backbone.max_rank', 'model.backbone.n_layers', 'model.backbone.update_func', 'model.optimizer.lr']
['model.optimiz

### See unique datasets

In [7]:
# Distinct dataset names present in the runs, followed by how many there are
unique_datasets = df['dataset.parameters.data_name'].unique()
print(unique_datasets)
print("Num unique datasets:", len(unique_datasets))

['ZINC' 'NCI109' 'NCI1' 'PROTEINS' 'MUTAG' 'minesweeper' 'roman_empire']
Num unique datasets: 7


## See unique models

In [8]:
# Distinct model names present in the runs
model_names = df['model.model_name'].unique()
print(model_names)

['sccnn']


### The simplicial runs were logged under a wrong model name: the true name of `sccnn` is `sccn`

In [11]:
# Correct the mislabelled model name: every exact 'sccnn' entry becomes 'sccn'
df['model.model_name'] = df['model.model_name'].replace({'sccnn': 'sccn'})

## Solve batch problems

In [12]:
datasets = ['minesweeper', 'roman_empire']
models = ['sccn']
# These model/dataset pairs were meant to run with batch size 1 (not 256 or
# 128). This cell only reports which batch sizes are recorded for each pair;
# it does not modify df.
for model in models:
    print("MODEL:", model)
    for dataset in datasets:
        is_model = df['model.model_name'] == model
        is_dataset = df['dataset.parameters.data_name'] == dataset
        batch_sizes = df.loc[is_model & is_dataset, 'dataset.parameters.batch_size']
        print(batch_sizes.unique())
        

MODEL: sccn
[1]
[1]


## Solve issue with projection dropout

In [13]:
# Distinct projection-dropout values present in the runs
proj_dropout_values = df['model.feature_encoder.proj_dropout'].unique()
print(proj_dropout_values)

[0.5  0.25]


In [14]:
# Retain only the runs whose projection dropout is 0.5 or 0.25
allowed_proj_dropout = [0.5, 0.25]
has_allowed_dropout = df['model.feature_encoder.proj_dropout'].isin(allowed_proj_dropout)
df = df.loc[has_allowed_dropout]


In [15]:
# Renumber the rows 0..n-1, discarding the old index
df = df.reset_index(drop=True)

In [17]:
# Write the cleaned runs table to disk
output_csv = 'Simplicial_additional_runs.csv'
df.to_csv(output_csv)