In [1]:
from ast import literal_eval
from collections import defaultdict

import pandas as pd

import wandb
from scripts.results_processing.preprocess import preprocess_df
from scripts.results_processing.reproduction.constants import (
    dataset_model_columns,
    optimization_metrics,
    run_columns,
    sweeped_columns,
)

%load_ext autoreload
%autoreload 2


In [2]:

# Run-config columns whose cells hold nested dicts; normalize_columns expands
# each of them into flat "<column>.<key>" columns.
columns_to_normalize = ["model", "dataset", "transforms", "optimizer", "callbacks"]
def normalize_columns(df, columns_to_normalize):
    """Expand dict-valued columns into flat ``"<column>.<key>"`` columns.

    Each cell of the listed columns is converted to its string form, bare
    ``nan`` tokens are rewritten to ``None`` so the text is a valid Python
    literal, and the result is parsed with ``ast.literal_eval`` and flattened
    with ``pandas.json_normalize``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the nested columns. It is not modified.
    columns_to_normalize : list of str
        Names of the columns to expand; they are absent from the result.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the nested columns, followed by the flattened columns,
        with the same index and row order as ``df``.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``literal_eval`` when a cell is not a Python literal.
    """
    import re

    # Match ``nan`` only as a standalone token: a plain substring replace
    # would also rewrite words that merely contain it (e.g. "finance").
    nan_token = re.compile(r"\bnan\b")

    flattened_dfs = []
    for col in columns_to_normalize:
        # Parse into a temporary Series instead of writing back into ``df``,
        # so the caller's frame is left untouched.
        parsed = df[col].map(
            lambda cell: literal_eval(nan_token.sub("None", str(cell)))
        )

        flat = pd.json_normalize(parsed.tolist())
        flat.columns = [f"{col}.{c}" for c in flat.columns]
        # json_normalize returns a fresh RangeIndex; restore the original
        # index so the column-wise concat below lines rows up correctly.
        flat.index = df.index
        flattened_dfs.append(flat)

    # Drop all nested columns in one shot and concatenate once at the end.
    remaining = df.drop(columns=columns_to_normalize)
    return pd.concat([remaining, *flattened_dfs], axis=1)


def normalize_df(df, columns_to_normalize):
    """Return ``df`` with the given nested config columns flattened.

    Thin wrapper that delegates to ``normalize_columns`` with the same
    arguments and returns its result unchanged.
    """
    return normalize_columns(df, columns_to_normalize)

def fetch(user="telyatnikov_sap", project="main_exp_GPSE_ogbg-molhiv"):
    """Download every run of a Weights & Biases project into one DataFrame.

    Parameters
    ----------
    user : str, optional
        W&B entity that owns the project. Defaults to the entity this
        notebook was written for.
    project : str, optional
        W&B project name. Defaults to the project this notebook was written
        for.

    Returns
    -------
    pandas.DataFrame
        One row per run. The columns are the union of each run's summary
        keys and its config keys that do not start with ``"_"``; when a key
        appears in both, the config value is kept.
    """
    api = wandb.Api(overrides={"base_url": "https://api.wandb.ai"}, timeout=40)

    records = []
    for run in api.runs(f"{user}/{project}"):
        # .summary contains the output keys/values for metrics like accuracy;
        # ._json_dict is used to omit large files.
        summary = run.summary._json_dict

        # .config contains the hyperparameters; special keys that start with
        # "_" are removed.
        config = {k: v for k, v in run.config.items() if not k.startswith("_")}

        # Config is unpacked last, so it wins on duplicate keys.
        records.append({**summary, **config})

    return pd.DataFrame.from_records(records)

def main():
    """Fetch the runs and return them with the nested config columns flattened."""
    runs = fetch()
    return normalize_df(runs, columns_to_normalize)

In [3]:
def gen_scores(df):
    """Aggregate run metrics per dataset over every swept configuration.

    For each dataset found in ``df`` the runs are grouped by
    ``sweeped_columns`` plus ``model.model_name`` and ``model.model_domain``,
    and every performance column of that dataset is reduced to its ``mean``,
    ``std`` and ``count``. Groups whose evaluation metric has fewer runs than
    required (4 for datasets whose name contains ``"MANTRA"``, 5 otherwise)
    are dropped. When ``"test/accuracy"`` is one of the performance columns,
    all means and stds are multiplied by 100. Means and stds are always
    rounded to 4 decimals.

    Diagnostics (the distinct run counts and the percentage of groups that
    have enough runs) are printed to stdout.

    Parameters
    ----------
    df : pandas.DataFrame
        Flattened run table. It must contain
        ``dataset.loader.parameters.data_name``, ``model.model_name`` and all
        columns listed in ``dataset_model_columns``, ``sweeped_columns``,
        ``run_columns`` and the dataset's ``performance_columns``.

    Returns
    -------
    collections.defaultdict
        Maps each dataset name to its aggregated DataFrame, whose metric
        columns are two-level: ``(metric, "mean" | "std" | "count")``.
    """
    dataset_col = "dataset.loader.parameters.data_name"
    datasets = list(df[dataset_col].unique())
    models = list(df["model.model_name"].unique())

    collect_subsets = defaultdict(dict)
    for dataset in datasets:
        metrics_cfg = optimization_metrics[dataset]
        eval_metric = metrics_cfg["eval_metric"]
        # Keep only the metrics that matter for this dataset.
        performance_columns = metrics_cfg["performance_columns"]

        subset = df.loc[df[dataset_col] == dataset]
        subset = subset[
            dataset_model_columns
            + sweeped_columns
            + performance_columns
            + run_columns
        ]

        # The aggregation does not depend on the model (the model name is a
        # grouping key), so it is computed once per dataset.
        aggregated = (
            subset.groupby(
                sweeped_columns + ["model.model_name", "model.model_domain"],
                dropna=False,
            )
            .agg({col: ["mean", "std", "count"] for col in performance_columns})
            .reset_index()  # go from MultiIndex rows to plain columns
        )

        # Minimum number of runs a configuration needs in order to be kept.
        n_count = 5 if "MANTRA" not in dataset else 4
        enough_runs = aggregated[(eval_metric, "count")] >= n_count

        # NOTE(review): these statistics cover all models of the dataset; they
        # are printed once per model only to keep the output unchanged.
        for model in models:
            print(f"Dataset: {dataset}, Model: {model}")
            print(aggregated[(eval_metric, "count")].unique())
            print(enough_runs.sum() / len(aggregated) * 100)

        # Copy so the assignments below write to an independent frame rather
        # than to a filtered view (avoids SettingWithCopyWarning).
        aggregated = aggregated[enough_runs].copy()

        # Report percentages in case of classification.
        as_percent = "test/accuracy" in performance_columns
        for col in performance_columns:
            for stat in ("mean", "std"):
                values = aggregated[(col, stat)]
                if as_percent:
                    values = values * 100
                aggregated[(col, stat)] = values.round(4)

        collect_subsets[dataset] = aggregated
    return collect_subsets


In [4]:
# Download the W&B runs and flatten the nested config columns (see main()).
df = main()

In [5]:
# Apply the project's preprocess_df helper. This rebinds ``df``, so re-running
# only this cell feeds already-preprocessed data back into preprocess_df.
# NOTE(review): the effect of split_mantra=False is defined in
# scripts.results_processing.preprocess -- confirm there.
df = preprocess_df(df, split_mantra=False)

In [6]:
# Aggregate the runs per dataset; gen_scores also prints run-count diagnostics.
collected_subsets = gen_scores(df)

Dataset: ogbg-molhiv, Model: hopse_g
[5 3 1 0 2 4]
88.13559322033898


In [21]:
# Top 60 ogbg-molhiv configurations ranked by mean validation AUROC, showing
# the validation/test metrics next to the swept hyperparameters.
(
    collected_subsets["ogbg-molhiv"]
    .sort_values(by=("val/auroc", "mean"), ascending=False)[
        [
            "val/auroc",
            "test/auroc",
            "model.backbone.n_layers",
            "dataset.dataloader_params.batch_size",
            "optimizer.parameters.lr",
            "optimizer.parameters.weight_decay",
            "transforms.sann_encoding.pretrain_model",
            "transforms.sann_encoding.neighborhoods",
        ]
    ]
    .head(60)
)

Unnamed: 0_level_0,val/auroc,val/auroc,val/auroc,test/auroc,test/auroc,test/auroc,model.backbone.n_layers,dataset.dataloader_params.batch_size,optimizer.parameters.lr,optimizer.parameters.weight_decay,transforms.sann_encoding.pretrain_model,transforms.sann_encoding.neighborhoods
Unnamed: 0_level_1,mean,std,count,mean,std,count,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
212,78.5561,1.529,5,73.9272,3.0664,5,1,128,0.001,0.0,MOLPCBA,['up_adjacency-0']
208,77.5297,1.1666,5,74.1542,2.752,5,1,128,0.001,0.0001,MOLPCBA,['up_adjacency-0']
136,77.3994,2.2687,5,71.6806,2.7016,5,2,128,0.001,0.0001,MOLPCBA,"['up_adjacency-0', 'up_adjacency-1']"
192,77.3197,3.5181,5,73.6264,1.6072,5,2,128,0.001,0.0001,MOLPCBA,['up_adjacency-0']
52,77.2984,2.9981,5,73.7953,2.5942,5,1,128,0.001,0.0001,MOLPCBA,"['up_adjacency-0', 'up_adjacency-1', 'down_adj..."
85,77.0527,2.2222,5,76.6055,1.0047,5,1,256,0.001,0.0001,MOLPCBA,"['up_adjacency-0', 'up_adjacency-1', 'down_adj..."
176,77.0196,3.4239,5,68.8513,2.4877,5,1,128,0.001,0.0001,MOLPCBA,['up_adjacency-0']
200,76.996,2.0547,5,72.2979,1.7728,5,2,128,0.001,0.0001,MOLPCBA,['up_adjacency-0']
93,76.995,3.5996,5,74.1606,0.7509,5,1,256,0.001,0.0001,MOLPCBA,"['up_adjacency-0', 'up_adjacency-1', 'down_adj..."
201,76.9879,2.4747,5,72.9734,2.2041,5,2,256,0.001,0.0001,MOLPCBA,['up_adjacency-0']
