In [6]:
# aggregate_results.ipynb
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

# Set working directory
base_dir = "./results"

# Get all datasets
datasets = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]
print(f"Found {len(datasets)} datasets: {datasets}")

# Create result storage structure
all_results = []

# Iterate through all datasets and seeds
for dataset in datasets:
    dataset_dir = os.path.join(base_dir, dataset)
    seeds = [s for s in os.listdir(dataset_dir) if s.startswith('seed_')]
    
    print(f"Processing dataset {dataset}, {len(seeds)} seeds in total")
    
    # Collect results for all seeds in this dataset
    dataset_results = []
    
    for seed in seeds:
        result_file = os.path.join(dataset_dir, seed, "correlation_results.csv")
        
        # Check if file exists
        if not os.path.exists(result_file):
            print(f"Warning: File not found {result_file}")
            continue
            
        # Read result file
        try:
            df = pd.read_csv(result_file)
            df['dataset'] = dataset
            df['seed'] = seed
            dataset_results.append(df)
        except Exception as e:
            print(f"Error reading {result_file}: {e}")
    
    # Merge all results for this dataset
    if dataset_results:
        dataset_df = pd.concat(dataset_results, ignore_index=True)
        all_results.append(dataset_df)

# Merge results from all datasets
if all_results:
    results_df = pd.concat(all_results, ignore_index=True)
    
    # Display dataset and seed coverage
    print("\nDataset and seed coverage:")
    coverage = results_df.groupby(['dataset', 'seed']).size().unstack(0)
    display(coverage)
    
    # Calculate mean for each dataset, method, metric
    print("\nCalculating averages grouped by dataset, method and metric:")
    avg_by_dataset = results_df.groupby(['dataset', 'split', 'method', 'metric'])['value'].agg(['mean', 'std']).reset_index()
    
    # First display results for each dataset
    for dataset in datasets:
        dataset_avg = avg_by_dataset[avg_by_dataset['dataset'] == dataset]
        if len(dataset_avg) > 0:
            print(f"\nDataset: {dataset}")
            # Pivot table for easier viewing
            pivot = dataset_avg.pivot_table(
                index=['split', 'method'], 
                columns='metric', 
                values='mean'
            )
            display(pivot)
    
    # Average across all datasets
    print("\nAverage results across all datasets:")
    overall_avg = results_df.groupby(['split', 'method', 'metric'])['value'].agg(['mean', 'std']).reset_index()
    pivot_overall = overall_avg.pivot_table(
        index=['split', 'method'], 
        columns='metric', 
        values='mean'
    )
    display(pivot_overall)
    
    # Save summary results
    avg_by_dataset.to_csv("aggregate_results_by_dataset.csv", index=False)
    overall_avg.to_csv("aggregate_results_overall.csv", index=False)
    
    print("\nResults saved to 'aggregate_results_by_dataset.csv' and 'aggregate_results_overall.csv'")
else:
    print("No result files found")

# Create a function to print the best methods
def print_best_methods(df, metrics=None, higher_better=None):
    if metrics is None:
        metrics = ['mae', 'mse']
    
    if higher_better is None:
        higher_better = {
            'mae': False,
            'mse': False
        }
    
    print("\nBest method for each metric:")
    for metric in metrics:
        metric_df = df[df['metric'] == metric]
        
        # Determine whether to take max or min based on the metric
        if higher_better.get(metric, True):
            best_idx = metric_df['mean'].idxmax()
        else:
            best_idx = metric_df['mean'].idxmin()
        
        best_row = metric_df.loc[best_idx]
        print(f"{metric}: {best_row['method']} (value: {best_row['mean']:.4f} ± {best_row['std']:.4f})")

# Print the best method for each metric
if 'overall_avg' in locals():
    print_best_methods(overall_avg)

Found 8 datasets: ['MiniBooNE', '2dplanes', 'digits', 'bbc-embeddings', 'nomao', 'electricity', 'election', 'fried']
Processing dataset MiniBooNE, 0 seeds in total
Processing dataset 2dplanes, 0 seeds in total
Processing dataset digits, 0 seeds in total
Processing dataset bbc-embeddings, 0 seeds in total
Processing dataset nomao, 0 seeds in total
Processing dataset electricity, 20 seeds in total
Processing dataset election, 0 seeds in total
Processing dataset fried, 20 seeds in total

Dataset and seed coverage:


dataset,fried
seed,Unnamed: 1_level_1
seed_10,24
seed_100,24
seed_110,24
seed_120,24
seed_130,24
seed_140,24
seed_150,24
seed_160,24
seed_170,24
seed_180,24



Calculating averages grouped by dataset, method and metric:

Dataset: fried


Unnamed: 0_level_0,metric,mae,mse
split,method,Unnamed: 2_level_1,Unnamed: 3_level_1
test,Banzhaf,0.312604,0.131974
test,BetaShapley,1.27051,2.716354
test,Bipartite,0.124394,0.025687
test,Linear,0.260583,0.092242
test,MLP,0.311154,0.127505
test,Shapley,1.298453,2.838152
train,Banzhaf,0.3082,0.124793
train,BetaShapley,1.268388,2.753805
train,Bipartite,0.127549,0.027603
train,Linear,0.249103,0.086042



Average results across all datasets:


Unnamed: 0_level_0,metric,mae,mse
split,method,Unnamed: 2_level_1,Unnamed: 3_level_1
test,Banzhaf,0.312604,0.131974
test,BetaShapley,1.27051,2.716354
test,Bipartite,0.124394,0.025687
test,Linear,0.260583,0.092242
test,MLP,0.311154,0.127505
test,Shapley,1.298453,2.838152
train,Banzhaf,0.3082,0.124793
train,BetaShapley,1.268388,2.753805
train,Bipartite,0.127549,0.027603
train,Linear,0.249103,0.086042



Results saved to 'aggregate_results_by_dataset.csv' and 'aggregate_results_overall.csv'

Best method for each metric:
mae: Bipartite (value: 0.1244 ± 0.0165)
mse: Bipartite (value: 0.0257 ± 0.0053)
