In [7]:
# aggregate_results.ipynb
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

# Set working directory
base_dir = "./results"

# Get all datasets
datasets = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]
print(f"Found {len(datasets)} datasets: {datasets}")

# Create result storage structure
all_results = []

# Iterate through all datasets and seeds
for dataset in datasets:
    dataset_dir = os.path.join(base_dir, dataset)
    seeds = [s for s in os.listdir(dataset_dir) if s.startswith('seed_')]
    
    print(f"Processing dataset {dataset}, {len(seeds)} seeds in total")
    
    # Collect results for all seeds in this dataset
    dataset_results = []
    
    for seed in seeds:
        result_file = os.path.join(dataset_dir, seed, "correlation_results.csv")
        
        # Check if file exists
        if not os.path.exists(result_file):
            print(f"Warning: File not found {result_file}")
            continue
            
        # Read result file
        try:
            df = pd.read_csv(result_file)
            df['dataset'] = dataset
            df['seed'] = seed
            dataset_results.append(df)
        except Exception as e:
            print(f"Error reading {result_file}: {e}")
    
    # Merge all results for this dataset
    if dataset_results:
        dataset_df = pd.concat(dataset_results, ignore_index=True)
        all_results.append(dataset_df)

# Merge results from all datasets
if all_results:
    results_df = pd.concat(all_results, ignore_index=True)
    
    # Display dataset and seed coverage
    print("\nDataset and seed coverage:")
    coverage = results_df.groupby(['dataset', 'seed']).size().unstack(0)
    display(coverage)
    
    # Calculate mean for each dataset, method, metric
    print("\nCalculating averages grouped by dataset, method and metric:")
    avg_by_dataset = results_df.groupby(['dataset', 'split', 'method', 'metric'])['value'].agg(['mean', 'std']).reset_index()
    
    # First display results for each dataset
    for dataset in datasets:
        dataset_avg = avg_by_dataset[avg_by_dataset['dataset'] == dataset]
        if len(dataset_avg) > 0:
            print(f"\nDataset: {dataset}")
            # Pivot table for easier viewing
            pivot = dataset_avg.pivot_table(
                index=['split', 'method'], 
                columns='metric', 
                values='mean'
            )
            display(pivot)
    
    # Average across all datasets
    print("\nAverage results across all datasets:")
    overall_avg = results_df.groupby(['split', 'method', 'metric'])['value'].agg(['mean', 'std']).reset_index()
    pivot_overall = overall_avg.pivot_table(
        index=['split', 'method'], 
        columns='metric', 
        values='mean'
    )
    display(pivot_overall)
    
    # Save summary results
    avg_by_dataset.to_csv("aggregate_results_by_dataset.csv", index=False)
    overall_avg.to_csv("aggregate_results_overall.csv", index=False)
    
    print("\nResults saved to 'aggregate_results_by_dataset.csv' and 'aggregate_results_overall.csv'")
else:
    print("No result files found")

# Create a function to print the best methods
def print_best_methods(df, metrics=None, higher_better=None):
    if metrics is None:
        metrics = ['mae', 'mse']
    
    if higher_better is None:
        higher_better = {
            'mae': False,
            'mse': False
        }
    
    print("\nBest method for each metric:")
    for metric in metrics:
        metric_df = df[df['metric'] == metric]
        
        # Determine whether to take max or min based on the metric
        if higher_better.get(metric, True):
            best_idx = metric_df['mean'].idxmax()
        else:
            best_idx = metric_df['mean'].idxmin()
        
        best_row = metric_df.loc[best_idx]
        print(f"{metric}: {best_row['method']} (value: {best_row['mean']:.4f} ± {best_row['std']:.4f})")

# Print the best method for each metric
if 'overall_avg' in locals():
    print_best_methods(overall_avg)

Found 8 datasets: ['MiniBooNE', '2dplanes', 'digits', 'bbc-embeddings', 'nomao', 'electricity', 'election', 'fried']
Processing dataset MiniBooNE, 20 seeds in total
Processing dataset 2dplanes, 20 seeds in total
Processing dataset digits, 20 seeds in total
Processing dataset bbc-embeddings, 20 seeds in total
Processing dataset nomao, 20 seeds in total
Processing dataset electricity, 20 seeds in total
Processing dataset election, 20 seeds in total
Processing dataset fried, 20 seeds in total

Dataset and seed coverage:


dataset,2dplanes,MiniBooNE,bbc-embeddings,digits,election,electricity,fried,nomao
seed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
seed_10,24,24,24,24,24,24,24,24
seed_100,24,24,24,24,24,24,24,24
seed_110,24,24,24,24,24,24,24,24
seed_120,24,24,24,24,24,24,24,24
seed_130,24,24,24,24,24,24,24,24
seed_140,24,24,24,24,24,24,24,24
seed_150,24,24,24,24,24,24,24,24
seed_160,24,24,24,24,24,24,24,24
seed_170,24,24,24,24,24,24,24,24
seed_180,24,24,24,24,24,24,24,24



Calculating averages grouped by dataset, method and metric:

Dataset: MiniBooNE


Unnamed: 0_level_0,metric,mae,mse
split,method,Unnamed: 2_level_1,Unnamed: 3_level_1
test,Banzhaf,0.317504,0.136565
test,BetaShapley,1.281986,2.7601
test,Bipartite,0.101351,0.01827
test,Linear,0.265365,0.096057
test,MLP,0.315962,0.132002
test,Shapley,1.311503,2.888855
train,Banzhaf,0.314128,0.130298
train,BetaShapley,1.280374,2.80079
train,Bipartite,0.104437,0.020153
train,Linear,0.255039,0.090661



Dataset: 2dplanes


Unnamed: 0_level_0,metric,mae,mse
split,method,Unnamed: 2_level_1,Unnamed: 3_level_1
test,Banzhaf,0.323206,0.140628
test,BetaShapley,1.278453,2.7558
test,Bipartite,0.143517,0.029811
test,Linear,0.269082,0.098
test,MLP,0.321577,0.13589
test,Shapley,1.307833,2.887273
train,Banzhaf,0.319416,0.133779
train,BetaShapley,1.281775,2.805375
train,Bipartite,0.145733,0.03135
train,Linear,0.258336,0.09196



Dataset: digits


Unnamed: 0_level_0,metric,mae,mse
split,method,Unnamed: 2_level_1,Unnamed: 3_level_1
test,Banzhaf,0.212028,0.060206
test,BetaShapley,0.424602,0.315228
test,Bipartite,0.057852,0.005454
test,Linear,0.162057,0.035217
test,MLP,0.211228,0.057887
test,Shapley,0.442399,0.343364
train,Banzhaf,0.206145,0.055239
train,BetaShapley,0.424369,0.318859
train,Bipartite,0.058892,0.005632
train,Linear,0.150956,0.030833



Dataset: bbc-embeddings


Unnamed: 0_level_0,metric,mae,mse
split,method,Unnamed: 2_level_1,Unnamed: 3_level_1
test,Banzhaf,0.289427,0.11175
test,BetaShapley,0.745893,0.951159
test,Bipartite,0.058068,0.006439
test,Linear,0.226742,0.068518
test,MLP,0.288312,0.107282
test,Shapley,0.76763,1.008758
train,Banzhaf,0.281744,0.102719
train,BetaShapley,0.741071,0.95961
train,Bipartite,0.059705,0.006751
train,Linear,0.212507,0.06116



Dataset: nomao


Unnamed: 0_level_0,metric,mae,mse
split,method,Unnamed: 2_level_1,Unnamed: 3_level_1
test,Banzhaf,0.389567,0.207864
test,BetaShapley,1.482275,3.705948
test,Bipartite,0.067364,0.009974
test,Linear,0.327302,0.148412
test,MLP,0.387768,0.200812
test,Shapley,1.515589,3.873723
train,Banzhaf,0.385472,0.198661
train,BetaShapley,1.485862,3.764291
train,Bipartite,0.071269,0.011913
train,Linear,0.31493,0.140504



Dataset: electricity


Unnamed: 0_level_0,metric,mae,mse
split,method,Unnamed: 2_level_1,Unnamed: 3_level_1
test,Banzhaf,0.287516,0.113147
test,BetaShapley,1.263873,2.695483
test,Bipartite,0.131454,0.028617
test,Linear,0.24093,0.080683
test,MLP,0.285972,0.109574
test,Shapley,1.293772,2.824549
train,Banzhaf,0.284346,0.107972
train,BetaShapley,1.263884,2.725648
train,Bipartite,0.134961,0.030458
train,Linear,0.232277,0.076435



Dataset: election


Unnamed: 0_level_0,metric,mae,mse
split,method,Unnamed: 2_level_1,Unnamed: 3_level_1
test,Banzhaf,0.290552,0.121013
test,BetaShapley,1.203045,2.447432
test,Bipartite,0.131022,0.02899
test,Linear,0.246504,0.089413
test,MLP,0.288719,0.117293
test,Shapley,1.225395,2.539818
train,Banzhaf,0.285837,0.115219
train,BetaShapley,1.199555,2.470513
train,Bipartite,0.13205,0.029809
train,Linear,0.236837,0.084487



Dataset: fried


Unnamed: 0_level_0,metric,mae,mse
split,method,Unnamed: 2_level_1,Unnamed: 3_level_1
test,Banzhaf,0.312604,0.131974
test,BetaShapley,1.27051,2.716354
test,Bipartite,0.124394,0.025687
test,Linear,0.260583,0.092242
test,MLP,0.311154,0.127505
test,Shapley,1.298453,2.838152
train,Banzhaf,0.3082,0.124793
train,BetaShapley,1.268388,2.753805
train,Bipartite,0.127549,0.027603
train,Linear,0.249103,0.086042



Average results across all datasets:


Unnamed: 0_level_0,metric,mae,mse
split,method,Unnamed: 2_level_1,Unnamed: 3_level_1
test,Banzhaf,0.302801,0.127894
test,BetaShapley,1.11883,2.293438
test,Bipartite,0.101878,0.019155
test,Linear,0.249821,0.088568
test,MLP,0.301336,0.123531
test,Shapley,1.145322,2.400562
train,Banzhaf,0.298161,0.121085
train,BetaShapley,1.11816,2.324861
train,Bipartite,0.104324,0.020459
train,Linear,0.238748,0.08276



Results saved to 'aggregate_results_by_dataset.csv' and 'aggregate_results_overall.csv'

Best method for each metric:
mae: Bipartite (value: 0.1019 ± 0.0383)
mse: Bipartite (value: 0.0192 ± 0.0112)
