# Aggregate Experiment Results

This notebook aggregates summary results from all experiments across different settings (dimensions) and models for R=5 and R=10 repetitions.

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 6)

print("Libraries loaded successfully")

Libraries loaded successfully


## 1. Define Helper Functions

In [8]:
def load_all_raw_results(base_folder, r_value):
    """
    Load all raw_results.csv files from a results folder and calculate statistics.
    
    Parameters
    ----------
    base_folder : str
        Base folder name (e.g., 'results_5' or 'results_10')
    r_value : int
        Number of repetitions (5 or 10)
    
    Returns
    -------
    pd.DataFrame
        Aggregated statistics with setting, model, and R columns added
    """
    base_path = Path('..') / base_folder
    all_stats = []
    
    # Find all raw_results.csv files
    for raw_file in base_path.rglob('raw_results.csv'):
        # Extract setting (e.g., '1d', '2d', '4d', '6d') and model (e.g., 'x_rf', 'x_cb')
        parts = raw_file.parts
        setting = parts[-3]  # e.g., '4d'
        model = parts[-2]    # e.g., 'x_cb'
        
        # Load the raw results CSV
        df = pd.read_csv(raw_file)
        
        # Calculate statistics for each learner+tuner combination
        for (learner, tuner), group in df.groupby(['learner', 'tuner']):
            stats = {
                'R': r_value,
                'setting': setting,
                'model': model,
                'learner': learner,
                'tuner': tuner,
                'pehe_mean': group['pehe'].mean(),
                'pehe_std': group['pehe'].std(),
                'pehe_plug_mean': group['pehe_plug'].mean(),
                'pehe_plug_std': group['pehe_plug'].std(),
                'n_reps': len(group)
            }
            all_stats.append(stats)
    
    # Combine all statistics
    if all_stats:
        combined = pd.DataFrame(all_stats)
        return combined
    else:
        return pd.DataFrame()

print("Helper functions defined")

Helper functions defined


## 2. Load Raw Results for R=5

In [9]:
print("Loading raw results for R=5...")
results_5 = load_all_raw_results('results_5', r_value=5)

print(f"\nLoaded {len(results_5)} rows")
print(f"Settings: {sorted(results_5['setting'].unique())}")
print(f"Models: {sorted(results_5['model'].unique())}")
print(f"Tuners: {sorted(results_5['tuner'].unique())}")

print("\nFirst few rows:")
results_5.head()

Loading raw results for R=5...

Loaded 20 rows
Settings: ['1d', '2d', '4d', '6d']
Models: ['x_cb', 'x_rf']
Tuners: ['bayes', 'grid', 'random']

First few rows:


Unnamed: 0,R,setting,model,learner,tuner,pehe_mean,pehe_std,pehe_plug_mean,pehe_plug_std,n_reps
0,5,1d,x_cb,x_cb,bayes,0.169143,0.046763,0.43798,0.105739,5
1,5,1d,x_cb,x_cb,grid,0.134617,0.049184,0.339558,0.084871,5
2,5,1d,x_cb,x_cb,random,0.136123,0.049047,0.361133,0.085023,5
3,5,1d,x_rf,x_rf,bayes,0.147534,0.057356,0.377845,0.161859,5
4,5,1d,x_rf,x_rf,grid,0.146164,0.056136,0.374707,0.159375,5


## 3. Load Raw Results for R=10

In [10]:
print("Loading raw results for R=10...")
results_10 = load_all_raw_results('results_10', r_value=10)

print(f"\nLoaded {len(results_10)} rows")
print(f"Settings: {sorted(results_10['setting'].unique())}")
print(f"Models: {sorted(results_10['model'].unique())}")
print(f"Tuners: {sorted(results_10['tuner'].unique())}")

print("\nFirst few rows:")
results_10.head()

Loading raw results for R=10...

Loaded 20 rows
Settings: ['1d', '2d', '4d', '6d']
Models: ['x_cb', 'x_rf']
Tuners: ['bayes', 'grid', 'random']

First few rows:


Unnamed: 0,R,setting,model,learner,tuner,pehe_mean,pehe_std,pehe_plug_mean,pehe_plug_std,n_reps
0,10,1d,x_cb,x_cb,bayes,0.170985,0.03273,0.465155,0.085736,10
1,10,1d,x_cb,x_cb,grid,0.124769,0.03718,0.346041,0.064972,10
2,10,1d,x_cb,x_cb,random,0.130001,0.035638,0.370014,0.066739,10
3,10,1d,x_rf,x_rf,bayes,0.14227,0.044927,0.384638,0.138473,10
4,10,1d,x_rf,x_rf,grid,0.141018,0.043721,0.382774,0.135438,10


## 4. Combine All Results

In [11]:
# Combine R=5 and R=10 results
all_results = pd.concat([results_5, results_10], ignore_index=True)

print(f"Total rows: {len(all_results)}")
print(f"\nColumns: {list(all_results.columns)}")

# Reorder columns for better readability
column_order = ['R', 'setting', 'model', 'learner', 'tuner', 
                'pehe_mean', 'pehe_std', 'pehe_plug_mean', 'pehe_plug_std', 'n_reps']
all_results = all_results[column_order]

print("\nCombined results:")
all_results

Total rows: 40

Columns: ['R', 'setting', 'model', 'learner', 'tuner', 'pehe_mean', 'pehe_std', 'pehe_plug_mean', 'pehe_plug_std', 'n_reps']

Combined results:


Unnamed: 0,R,setting,model,learner,tuner,pehe_mean,pehe_std,pehe_plug_mean,pehe_plug_std,n_reps
0,5,1d,x_cb,x_cb,bayes,0.169143,0.046763,0.43798,0.105739,5
1,5,1d,x_cb,x_cb,grid,0.134617,0.049184,0.339558,0.084871,5
2,5,1d,x_cb,x_cb,random,0.136123,0.049047,0.361133,0.085023,5
3,5,1d,x_rf,x_rf,bayes,0.147534,0.057356,0.377845,0.161859,5
4,5,1d,x_rf,x_rf,grid,0.146164,0.056136,0.374707,0.159375,5
5,5,1d,x_rf,x_rf,random,0.144156,0.060802,0.36632,0.165953,5
6,5,2d,x_cb,x_cb,bayes,0.12177,0.054479,0.336344,0.100187,5
7,5,2d,x_cb,x_cb,grid,0.125106,0.059489,0.326618,0.099641,5
8,5,2d,x_cb,x_cb,random,0.201856,0.060887,0.446947,0.085306,5
9,5,2d,x_rf,x_rf,bayes,0.140817,0.059425,0.358158,0.162775,5


## 5. Comparison Tables: R=5 vs R=10

### Create comparison tables for each setting/model/tuner combination

In [12]:
# Get unique combinations
settings = sorted(all_results['setting'].unique())
models = sorted(all_results['model'].unique())
tuners = sorted(all_results['tuner'].unique())

print(f"Settings: {settings}")
print(f"Models: {models}")
print(f"Tuners: {tuners}")
print(f"\nGenerating comparison tables for each combination...")

Settings: ['1d', '2d', '4d', '6d']
Models: ['x_cb', 'x_rf']
Tuners: ['bayes', 'grid', 'random']

Generating comparison tables for each combination...


In [13]:
# Function to create comparison table for a specific setting/model/tuner
def create_comparison_table(df, setting, model, tuner):
    """
    Create a comparison table showing R=5 vs R=10 for a specific combination.
    """
    # Filter data
    filtered = df[
        (df['setting'] == setting) & 
        (df['model'] == model) & 
        (df['tuner'] == tuner)
    ].copy()
    
    if len(filtered) == 0:
        return None
    
    # Pivot to compare R=5 vs R=10
    comparison = filtered.pivot_table(
        index='R',
        values=['pehe_mean', 'pehe_std', 'pehe_plug_mean', 'pehe_plug_std', 'n_reps']
    )
    
    # Round for readability
    comparison = comparison.round(4)
    
    return comparison

print("Comparison function defined")

Comparison function defined


### Generate All Comparison Tables

In [14]:
# Generate comparison tables for all combinations
for setting in settings:
    for model in models:
        print(f"\n{'='*80}")
        print(f"Setting: {setting.upper()} | Model: {model.upper()}")
        print(f"{'='*80}")
        
        for tuner in tuners:
            table = create_comparison_table(all_results, setting, model, tuner)
            
            if table is not None:
                print(f"\n--- Tuner: {tuner.upper()} ---")
                print(table)
                print()


Setting: 1D | Model: X_CB

--- Tuner: BAYES ---
    n_reps  pehe_mean  pehe_plug_mean  pehe_plug_std  pehe_std
R                                                             
5      5.0     0.1691          0.4380         0.1057    0.0468
10    10.0     0.1710          0.4652         0.0857    0.0327


--- Tuner: GRID ---
    n_reps  pehe_mean  pehe_plug_mean  pehe_plug_std  pehe_std
R                                                             
5      5.0     0.1346          0.3396         0.0849    0.0492
10    10.0     0.1248          0.3460         0.0650    0.0372


--- Tuner: RANDOM ---
    n_reps  pehe_mean  pehe_plug_mean  pehe_plug_std  pehe_std
R                                                             
5      5.0     0.1361          0.3611         0.0850    0.0490
10    10.0     0.1300          0.3700         0.0667    0.0356


Setting: 1D | Model: X_RF

--- Tuner: BAYES ---
    n_reps  pehe_mean  pehe_plug_mean  pehe_plug_std  pehe_std
R                                   

## 6. Export Aggregated Results

In [15]:
# Save the combined results to CSV
output_file = '../aggregated_results.csv'
all_results.to_csv(output_file, index=False)
print(f"Aggregated results saved to: {output_file}")
print(f"Total rows: {len(all_results)}")

Aggregated results saved to: ../aggregated_results.csv
Total rows: 40


## 7. Summary: Best Performing Configurations

In [16]:
# Find best performing configurations by PEHE
print("\n" + "="*80)
print("BEST PERFORMING CONFIGURATIONS (by PEHE mean)")
print("="*80)

for r_val in [5, 10]:
    print(f"\n--- R={r_val} ---")
    r_data = all_results[all_results['R'] == r_val].copy()
    r_data_sorted = r_data.sort_values('pehe_mean')
    
    print("\nTop 5 configurations:")
    print(r_data_sorted[['setting', 'model', 'tuner', 'pehe_mean', 'pehe_std']].head())
    
    print("\nBottom 5 configurations:")
    print(r_data_sorted[['setting', 'model', 'tuner', 'pehe_mean', 'pehe_std']].tail())


BEST PERFORMING CONFIGURATIONS (by PEHE mean)

--- R=5 ---

Top 5 configurations:
   setting model   tuner  pehe_mean  pehe_std
13      6d  x_cb  random   0.120418  0.051521
6       2d  x_cb   bayes   0.121770  0.054479
7       2d  x_cb    grid   0.125106  0.059489
17      4d  x_cb  random   0.127407  0.053745
12      6d  x_cb   bayes   0.127927  0.050563

Bottom 5 configurations:
   setting model   tuner  pehe_mean  pehe_std
4       1d  x_rf    grid   0.146164  0.056136
15      6d  x_rf  random   0.147014  0.057605
3       1d  x_rf   bayes   0.147534  0.057356
0       1d  x_cb   bayes   0.169143  0.046763
8       2d  x_cb  random   0.201856  0.060887

--- R=10 ---

Top 5 configurations:
   setting model  tuner  pehe_mean  pehe_std
26      2d  x_cb  bayes   0.118677  0.039720
27      2d  x_cb   grid   0.119865  0.043352
38      4d  x_rf  bayes   0.122450  0.042608
34      6d  x_rf  bayes   0.124325  0.048969
21      1d  x_cb   grid   0.124769  0.037180

Bottom 5 configurations:
   set