# Process Results

In [None]:
import os
import sys
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from typing import List, Optional, Tuple, Union
import numpy as np
import pandas as pd
import yaml
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from scipy.stats import t
from tqdm import tqdm
import rliable
import rliable.library
import rliable.metrics

### Gather task metadata

In [None]:
def get_task_metadata(task_name):
    """
    Load and parse the `metadata.yaml` file of a task.

    The file is looked up at
    `<parent of the current working directory>/airsbench/tasks/rad/<task_name>/metadata.yaml`.

    Args:
        task_name: Name of the task folder under `airsbench/tasks/rad`.

    Returns:
        The content of `metadata.yaml` as parsed by `yaml.safe_load`.

    Raises:
        FileNotFoundError: If the metadata file does not exist. The message contains
            both the task name and the path that was probed.
    """
    search_root = Path.cwd().parent
    metadata_file = search_root / "airsbench/tasks/rad" / task_name / "metadata.yaml"

    if not metadata_file.is_file():
        # Include the probed path: the lookup depends on the working directory,
        # which is the usual reason for a miss.
        raise FileNotFoundError(
            f"metadata.yaml not found for task: {task_name} (looked for {metadata_file})"
        )

    return yaml.safe_load(metadata_file.read_text())

In [None]:
from pprint import pprint

# Sanity check: load and display the metadata of a single example task.
task_name = "TimeSeriesForecastingSolarWeeklyMAE"
metadata = get_task_metadata(task_name)
pprint(metadata)

## Compile results
- `task_metadata` is a dict storing metadata as `task_metadata[task]`. 
- `full_results` is a dict storing results as `full_results[task][agent]`. Each value is a list storing tuples (score, status).

In [None]:
# All result pickles live under one shared analysis directory; MLGym runs and the
# "lite" RAD runs are stored in separate sub-folders.
_ANALYSIS_ROOT = "/checkpoint/maui/shared/logs/airs_bench_analysis"
_MLGYM_DIR = f"{_ANALYSIS_ROOT}/mlgym_results"
_LITE_DIR = f"{_ANALYSIS_ROOT}/results_lite"

# Maps "<scaffold>_<model>" agent keys to the pickle holding that agent's results.
pkl_files = {
    "mlgym_gpt-4o": f"{_MLGYM_DIR}/analysis_v2_HEURISTIC_20t_airsbench_4o_20260205.pkl",
    "mlgym_cwm": f"{_MLGYM_DIR}/analysis_v2_HEURISTIC_20t_airsbench_cwm_20260205.pkl",
    "oneshot_o3_mini": f"{_LITE_DIR}/oneshot_o3_mini__results.pkl",
    "oneshot_cwm": f"{_LITE_DIR}/oneshot_facebook_cwm__results.pkl",
    "oneshot_gpt-4o": f"{_LITE_DIR}/oneshot_gpt-4o__results.pkl",
    "oneshot_gpt-oss-20b": f"{_LITE_DIR}/oneshot_gpt-oss-20b__results.pkl",
    "oneshot_gpt-oss-120b": f"{_LITE_DIR}/oneshot_gpt-oss-120b__results.pkl",
    "oneshot_devstral": f"{_LITE_DIR}/oneshot_devstral__results.pkl",
    "greedy_devstral": f"{_LITE_DIR}/greedy_devstral__results.pkl",
    "greedy_gpt-oss-20b": f"{_LITE_DIR}/greedy_gpt-oss-20b__results.pkl",
    "greedy_cwm": f"{_LITE_DIR}/greedy_facebook_cwm__results.pkl",
    "greedy_gpt-4o": f"{_LITE_DIR}/greedy_gpt-4o__results.pkl",
    "greedy_o3_mini": f"{_LITE_DIR}/greedy_o3_mini__results.pkl",
    "greedy_gpt-oss-120b": f"{_LITE_DIR}/greedy_gpt-oss-120b__results.pkl",
}

In [None]:
import pickle

# Load and print the contents of the 'oneshot_o3_mini' pickle file.
# NOTE: pickle.load can execute arbitrary code; only load result files you trust.
with open(pkl_files["oneshot_o3_mini"], "rb") as f:
    oneshot_o3_mini_data = pickle.load(f)

print(oneshot_o3_mini_data)

# Same for the 'mlgym_cwm' pickle file.
with open(pkl_files["mlgym_cwm"], "rb") as f:
    mlgym_cwm_data = pickle.load(f)

print(mlgym_cwm_data)

In [None]:
# Number of top-level entries in the loaded 'mlgym_cwm' results object.
len(mlgym_cwm_data)

In [None]:
# Tasks included in the analysis; results for any other task name are skipped
# when `full_results` is compiled.
tasks = [
    "TimeSeriesForecastingSolarWeeklyMAE",
    "TextualSimilaritySickSpearmanCorrelation",
    "TextualClassificationSickAccuracy",
    "CvMolecularPropertyPredictionQm9MeanAbsoluteError",
    "ReadingComprehensionSquadExactMatch",
    "GMolecularPropertyPredictionQm9MeanAbsoluteError",
    "CoreferenceResolutionWinograndeAccuracy",
    "TimeSeriesForecastingKaggleWebTrafficMASE",
    "U0MolecularPropertyPredictionQm9MeanAbsoluteError",
    "R2AbsMolecularPropertyPredictionQm9MeanAbsoluteError",
    "SentimentAnalysisYelpReviewFullAccuracy",
    "GraphRegressionZincMae",
    "CoreferenceResolutionSuperGLUEWSCAccuracy",
    "QuestionAnsweringEli5Rouge1",
    "TimeSeriesForecastingRideshareMAE",
    "QuestionAnsweringDuoRCAccuracy",
    "CodeGenerationAPPSPassAt5",
    "CodeRetrievalCodeXGlueMRR",
    "MathQuestionAnsweringSVAMPAccuracy",
    "QuestionAnsweringFinqaAccuracy",
]

In [None]:
# Make the parent of the current working directory importable.
parent_dir = os.path.dirname(os.path.abspath(os.getcwd()))
sys.path.append(parent_dir)

# full_results[task][agent] -> list of (score, status) tuples, one per run.
full_results = {}
# task_metadata[task] -> parsed metadata.yaml of that task (loaded once per task).
task_metadata = {}

# Set lookup instead of scanning the `tasks` list for every result entry.
selected_tasks = set(tasks)

for agent, file_path in pkl_files.items():
    # NOTE(security): pickle.load can execute arbitrary code; these files are
    # expected to be trusted analysis outputs.
    with open(file_path, "rb") as f:
        agent_data = pickle.load(f)
    print(agent)
    is_mlgym = agent.startswith("mlgym")

    for task_name, task_results in agent_data.items():
        if task_name not in selected_tasks:
            continue
        print(task_name)
        if task_name not in task_metadata:
            task_metadata[task_name] = get_task_metadata(task_name)

        submitted = task_results["submitted_solution_metrics"]
        if is_mlgym:
            print(f"Processing MLGym agent: {agent}")
            # MLGym entries are read positionally; only the first two fields are kept.
            processed_task_results = [(entry[0], entry[1]) for entry in submitted]
        else:
            print(f"Processing RAD agent: {agent}")
            # RAD entries are 3-tuples whose last field is ignored.
            processed_task_results = [(score, status) for score, status, _ in submitted]

        # setdefault stores the per-task dict in full_results on first use, so no
        # explicit write-back is needed afterwards.
        full_results.setdefault(task_name, {})[agent] = processed_task_results

In [None]:
from pprint import pprint

# Display the compiled full_results[task][agent] structure for inspection.
pprint(full_results)

# Elo Calculation

## Prepare Results

### Add SOTA as Player

In [None]:
from copy import deepcopy

# Work on a deep copy so the extra "sota" player only exists for the Elo computation
# and `full_results` itself stays untouched.
elo_full_results = deepcopy(full_results)
for task in full_results:
    sota = task_metadata[task]['logging_info']["sota"][0]['sota_score']
    # SOTA enters every task as a pseudo-agent with ten identical successful runs.
    elo_full_results[task]["sota"] = [(sota, "SUCCESS") for _ in range(10)]

### Calculate Head to Head Scores

In [None]:
from itertools import combinations

import numpy as np
import pandas as pd


def _pick_winner(val_1, val_2, lower_is_better):
    """Return the outcome label of one battle between two raw scores.

    Args:
        val_1: Score of `model_a`, or the string "N/A" for a run without a score.
        val_2: Score of `model_b`, or the string "N/A" for a run without a score.
        lower_is_better: Whether a smaller score wins for this task.

    Returns:
        One of "model_a", "model_b", "tie" or "tie (bothbad)".
    """
    a_missing = val_1 == "N/A"
    b_missing = val_2 == "N/A"
    if a_missing and b_missing:
        return "tie (bothbad)"
    # A missing score always loses against an available one.
    if a_missing:
        return "model_b"
    if b_missing:
        return "model_a"

    vi, vj = float(val_1), float(val_2)
    if np.isclose(vi, vj, atol=1e-6):
        return "tie"
    a_wins = vi < vj if lower_is_better else vi > vj
    return "model_a" if a_wins else "model_b"


# One battle per (run of model_a, run of model_b) pair, for every unordered pair of
# players on every task.
results_lmsys = []
for task, task_res in elo_full_results.items():
    lower_is_better = bool(task_metadata[task]["metric_lower_is_better"])
    for model_a, model_b in combinations(task_res, 2):
        for s1 in task_res[model_a]:
            for s2 in task_res[model_b]:
                results_lmsys.append({
                    "model_a": model_a,
                    "model_b": model_b,
                    "winner": _pick_winner(s1[0], s2[0], lower_is_better),
                })

# Mirror every battle (swap sides and flip the winner) so each pairing appears in
# both the model_a and the model_b position. Tie labels are kept as they are.
_flipped = {"model_a": "model_b", "model_b": "model_a"}
results_lmsys += [
    {
        "model_a": r["model_b"],
        "model_b": r["model_a"],
        "winner": _flipped.get(r["winner"], r["winner"]),
    }
    for r in results_lmsys
]

battles = pd.DataFrame(results_lmsys)
# Drops both "tie" and "tie (bothbad)" rows.
battles_no_ties = battles[~battles["winner"].str.contains("tie")]

## Bootstrap ELO Score

In [None]:
import math
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
import pandas as pd

def compute_mle_elo(
    df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None
):
    """
    Fit Elo-style ratings to pairwise battle outcomes with a logistic regression.

    Args:
        df: Battles table; the columns read here are `model_a`, `model_b` and `winner`
            (`"model_a"`, `"model_b"`, `"tie"` or `"tie (bothbad)"`).
        SCALE: Multiplier applied to the fitted coefficients.
        BASE: Base whose logarithm is used as the +/- feature magnitude.
        INIT_RATING: Offset added to every scaled coefficient.
        sample_weight: Accepted but not used by the body; the regression weights are
            derived from the battle counts instead.

    Returns:
        pd.Series of ratings indexed by model name, sorted from highest to lowest.
    """
    from sklearn.linear_model import LogisticRegression
    # Count of battles won by the row model while it was in the model_a position.
    ptbl_a_win = pd.pivot_table(
        df[df["winner"] == "model_a"],
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    )
    # if no tie, create a zero matrix
    if sum(df["winner"].isin(["tie", "tie (bothbad)"])) == 0:
        ptbl_tie = pd.DataFrame(0, index=ptbl_a_win.index, columns=ptbl_a_win.columns)
    else:
        ptbl_tie = pd.pivot_table(
            df[df["winner"].isin(["tie", "tie (bothbad)"])],
            index="model_a",
            columns="model_b",
            aggfunc="size",
            fill_value=0,
        )
        # A tie counts for both sides, so add the transposed counts.
        ptbl_tie = ptbl_tie + ptbl_tie.T

    # Count of battles won by the column model while it was in the model_b position.
    ptbl_b_win = pd.pivot_table(
        df[df["winner"] == "model_b"],
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    )
    # If you want to be explicit about aligning both rows and columns:
    # Combined weight table: each win is weighted 2, each tie entry 1.
    ptbl_win = (ptbl_a_win * 2).add(ptbl_b_win.T * 2, fill_value=0).add(ptbl_tie, fill_value=0)

    # Model name -> column position in the design matrix.
    models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)

    p = len(models)
    # Two rows per ordered model pair; trimmed to the rows actually filled further down.
    X = np.zeros([p * (p - 1) * 2, p])
    Y = np.zeros(p * (p - 1) * 2)

    cur_row = 0
    sample_weights = []
    for m_a in ptbl_win.index:
        for m_b in ptbl_win.columns:
            if m_a == m_b:
                continue
            # if nan skip
            if math.isnan(ptbl_win.loc[m_a, m_b]) or math.isnan(ptbl_win.loc[m_b, m_a]):
                continue
            # Row 1: label 1, weighted by the (m_a, m_b) entry of the weight table.
            X[cur_row, models[m_a]] = +math.log(BASE)
            X[cur_row, models[m_b]] = -math.log(BASE)
            Y[cur_row] = 1.0
            sample_weights.append(ptbl_win.loc[m_a, m_b])

            # Row 2: identical features, label 0, weighted by the (m_b, m_a) entry.
            X[cur_row + 1, models[m_a]] = math.log(BASE)
            X[cur_row + 1, models[m_b]] = -math.log(BASE)
            Y[cur_row + 1] = 0.0
            sample_weights.append(ptbl_win.loc[m_b, m_a])
            cur_row += 2
    X = X[:cur_row]
    Y = Y[:cur_row]

    # No intercept and no penalty: the coefficients alone carry the ratings.
    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6)
    lr.fit(X, Y, sample_weight=sample_weights)
    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)

def _bootstrap_worker(battles, func_compute_elo, seed=None):
    """Compute ratings on one bootstrap resample of `battles`.

    Args:
        battles: DataFrame of battles to resample (same size, with replacement).
        func_compute_elo: Callable applied to the resampled DataFrame.
        seed: Optional integer passed to `DataFrame.sample(random_state=...)`. With the
            default `None`, pandas falls back to NumPy's global RNG of the current process.

    Returns:
        Whatever `func_compute_elo` returns for the resample.
    """
    sample = battles.sample(frac=1.0, replace=True, random_state=seed)
    return func_compute_elo(sample)


def get_bootstrap_result(battles, func_compute_elo, num_round):
    """Run `num_round` bootstrap replicates in worker processes.

    Every replicate receives its own seed, drawn here from NumPy's global RNG.
    Without that, workers would resample from whatever RNG state they start with:
    forked workers start as copies of the parent and can therefore produce
    identical resamples, and calling `np.random.seed(...)` in the notebook would
    not make the bootstrap reproducible.

    Args:
        battles: DataFrame of battles.
        func_compute_elo: Picklable callable mapping a battles DataFrame to a
            Series of ratings.
        num_round: Number of bootstrap replicates.

    Returns:
        DataFrame with one row per replicate (in submission order) and one column per
        model, columns ordered by descending median rating.
    """
    seeds = np.random.randint(0, 2**31 - 1, size=num_round)

    rows = [None] * num_round
    with ProcessPoolExecutor() as executor:
        position_of = {
            executor.submit(_bootstrap_worker, battles, func_compute_elo, int(seed)): position
            for position, seed in enumerate(seeds)
        }
        # Consume in completion order for a responsive progress bar, but store by
        # submission position so the result does not depend on scheduling.
        for future in tqdm(as_completed(position_of), total=num_round, desc="bootstrap"):
            rows[position_of[future]] = future.result()

    df = pd.DataFrame(rows)
    return df[df.median().sort_values(ascending=False).index]

In [None]:
# Number of bootstrap resamples of the battles table.
BOOTSTRAP_ROUNDS = 100
# Seeds NumPy's global RNG in this (parent) process.
np.random.seed(42)

# One row of ratings per bootstrap round, one column per model.
bootstrap_elo_lu = get_bootstrap_result(battles, compute_mle_elo, BOOTSTRAP_ROUNDS)

In [None]:
# Summarise the bootstrap distribution of each model: the median is reported as the
# rating, the 2.5% / 97.5% quantiles as its lower / upper bounds.
_rating_quantiles = {"lower": .025, "rating": .5, "upper": .975}
bootstrap_df = pd.DataFrame(
    {column: bootstrap_elo_lu.quantile(q) for column, q in _rating_quantiles.items()}
)
bootstrap_df = bootstrap_df.reset_index(names="model")
bootstrap_df = bootstrap_df.sort_values("rating", ascending=False)

# Asymmetric error-bar lengths measured from the rating.
bootstrap_df["error_y"] = bootstrap_df["upper"] - bootstrap_df["rating"]
bootstrap_df["error_y_minus"] = bootstrap_df["rating"] - bootstrap_df["lower"]
bootstrap_df["rating_rounded"] = np.round(bootstrap_df["rating"], 2)
bootstrap_df[["model", "rating", "upper", "lower"]]

## Plot Elo Score

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

# --- 4. Create the plot (mean only) ---
# NOTE: `metric` and `display` are not referenced again in this cell.
metric = "mean"
display = {"mean": "Mean"}


# Global font settings for the figure.
plt.rcParams.update({
    "font.family": "sans-serif",
    "font.sans-serif": ["DejaVu Sans", "Liberation Sans", "Helvetica"],
    "font.size": 24,
})


sns.set_context("notebook", font_scale=2.5)
sns.set_style("whitegrid")


plt.figure(figsize=(30, 8))

# --- Use specified subset ---
df = bootstrap_df[["model", "rating", "upper", "lower"]].copy()

# --- Sort methods by rating ---
df_sorted = df.sort_values("rating", ascending=False)

vals = df_sorted["rating"].to_numpy()             # ratings (sorted descending)
lowers = df_sorted["lower"].to_numpy()            # lower bounds
uppers = df_sorted["upper"].to_numpy()            # upper bounds
sorted_methods = df_sorted["model"].tolist()      # method names (sorted)

# Calculate error bar lengths (distance from the bar value)
lower_errors = vals - lowers
upper_errors = uppers - vals


# Row 0: distance down to the lower bound; row 1: distance up to the upper bound.
errs = np.vstack([lower_errors, upper_errors])


# --- Scaffold colors ---
scaffold_colors = {
    'One-Shot': "#4A90E2",
    'Greedy': "#E57373",
    'ReAct': "#66BB6A",
    'Overall': "#FF8C00"
}


def get_scaffold(method):
    """Return the scaffold label ('One-Shot', 'Greedy', 'ReAct' or 'Overall') of a method key.

    The label is chosen from the key's prefix; the key 'overall' (any casing) and every
    key without a known prefix map to 'Overall'.
    """
    prefix_labels = (
        ('oneshot', 'One-Shot'),
        ('greedy', 'Greedy'),
        ('mlgym', 'ReAct'),
    )
    if method.lower() != 'overall':
        for prefix, label in prefix_labels:
            if method.startswith(prefix):
                return label
    return 'Overall'


# Assign colors based on scaffold
colors = [scaffold_colors[get_scaffold(m)] for m in sorted_methods]


# --- x positions ---
# One bar per method, in the same (rating-sorted) order as `vals`.
x = np.arange(len(sorted_methods), dtype=float)
bar_width = 0.8  # Fatter bars


# --- Plot bars with transparency and black outline ---
# NOTE(review): linewidth=0 presumably makes the black edge invisible — confirm intent.
bars = plt.bar(
    x, vals, width=bar_width, yerr=errs, capsize=4,
    # x, vals, width=bar_width, capsize=4,
    color=colors, edgecolor="black", alpha=0.8,
    linewidth=0
)


# --- Format x-tick labels over multiple lines for long names ---
def prettify_model_names(names):
    """Replace the scaffold key on the first line of each label with its display name.

    Only the text before the first newline is looked up (after stripping surrounding
    whitespace); unknown first lines are kept, stripped. Remaining lines are untouched.
    """
    mapping = {
        "greedy": "Greedy",
        "sota": "SOTA",
        "mlgym": "ReAct",
        "oneshot": "One-Shot"
    }

    def _prettify(label):
        head, newline, rest = label.partition("\n")
        head = head.strip()
        return mapping.get(head, head) + newline + rest

    return [_prettify(label) for label in names]

def format_method_name(name):
    """Turn a "<scaffold>_<model>" key into a two-line (or three-line) tick label.

    Args:
        name: Method key such as "greedy_gpt-4o", or "overall" (any casing).

    Returns:
        - a bold math-text label for "overall";
        - "<scaffold>\\n<model display name>" for keys containing an underscore, with
          the gpt-oss models additionally wrapped before their size suffix;
        - `name` unchanged otherwise.
    """
    model_name_mapping = {
        "gpt-4o": "GPT-4o",
        "cwm": "CWM",
        "o3_mini": "o3-mini",
        "devstral": "Devstral"
    }
    # Compare against the lowercase literal: the lowered name can never equal 'Overall'.
    if name.lower() == 'overall':
        return r'$\mathbf{overall}$'

    # Split on the first underscore only; model names may contain underscores themselves.
    scaffold, separator, model = name.partition('_')
    if not separator:
        return name

    if 'gpt-oss-20b' in model:
        model = 'gpt-oss\n20b'
    elif 'gpt-oss-120b' in model:
        model = 'gpt-oss\n120b'
    model = model_name_mapping.get(model, model)
    return f"{scaffold}\n{model}"

# Build multi-line tick labels: split scaffold/model first, then prettify the scaffold.
xtick_labels = [format_method_name(m) for m in sorted_methods]
xtick_labels = prettify_model_names(xtick_labels)
plt.xticks(x, xtick_labels, rotation=0, ha="center", fontsize=25, linespacing=1.5)
# plt.yticks(np.arange(0, 1.1, 0.1), fontsize=28)
plt.ylabel("Elo", fontsize=32, labelpad=10)
# Fixed y-range; bars or labels outside 600-1850 would be clipped.
plt.ylim(600, 1850)


# --- Reduce space between y-axis and first/last bar ---
plt.xlim(-0.5, len(sorted_methods) - 0.5)


# --- Add legend for scaffold colors (excluding overall) ---
legend_elements = [
    Patch(facecolor=scaffold_colors['One-Shot'], label='One-Shot'),
    Patch(facecolor=scaffold_colors['Greedy'], label='Greedy'),
    Patch(facecolor=scaffold_colors['ReAct'], label='ReAct'),
]
plt.legend(handles=legend_elements, loc='upper right', fontsize=30, frameon=True,
          title="Scaffold", title_fontsize=32, edgecolor='black', fancybox=False,
          framealpha=1.0)


# Horizontal grid lines only.
plt.grid(axis="y", linestyle="-", alpha=0.5, linewidth=1.2)
plt.grid(axis="x", which='both', linestyle='', linewidth=0)
# Draw a solid black frame around the axes.
ax = plt.gca()
for spine in ax.spines.values():
    spine.set_visible(True)
    spine.set_color('black')
    spine.set_linewidth(1.5)

# Annotate every bar with its rating.
for i in range(len(vals)):
    plt.text(
        x[i], vals[i] + upper_errors[i] + 0.01,  # Place label just above the error bar
        f"{vals[i]:.0f}",      # Rounded to a whole number (no decimals)
        ha='center', va='bottom',
        fontsize=26, fontweight='normal', color='black'
    )
plt.tight_layout()


# os.path.join with a single argument returns it unchanged, so the figure is saved
# as "elo.pdf" relative to the current working directory.
out_path = os.path.join("elo.pdf")
plt.savefig(out_path, bbox_inches="tight")
print(f"Saved plot as: {out_path}")
plt.show()

# Normalized Score

## Prepare Dataframe
`full_results_rows` is a list of dicts with keys `['task_name', 'method_name', 'lower_is_better', 'sota', 'run_status', 'score', 'valid_submission', 'optimal_score']`

In [None]:
import pandas as pd

# Flatten full_results[task][agent] into one row per individual run.
full_results_rows = []
for task_name, task_results_by_method in full_results.items():
    task_meta = task_metadata[task_name]
    metric_lower_is_better = task_meta["metric_lower_is_better"]
    sota_score = task_meta["logging_info"]["sota"][0]["sota_score"]
    # Best attainable value as encoded here: 0 when lower is better, 1 otherwise.
    optimal_score = int(not metric_lower_is_better)

    for method_key, task_results in task_results_by_method.items():
        for score, status in task_results:
            full_results_rows.append(
                {
                    "task_name": task_name,
                    "method_name": method_key,
                    "lower_is_better": metric_lower_is_better,
                    "sota": sota_score,
                    "run_status": status,
                    # Runs without a score are stored as missing values.
                    "score": None if score == "N/A" else score,
                    "valid_submission": status == "SUCCESS",
                    "optimal_score": optimal_score,
                }
            )

experiments_df = pd.DataFrame(full_results_rows)
experiments_df.head(5)

## Calculate Normalized Scores
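Alexis' witchcraft: each score is rescaled so that the task's worst observed score maps to 0 and its SOTA score maps to 1, once linearly (`normalized_score`) and once on a log scale of the distance to the optimal score (`normalized_score_log`).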

In [None]:
def compute_worst_score(group):
    """Return the worst numeric score among the rows of one task.

    Scores that cannot be parsed as numbers are ignored. "Worst" is the maximum when
    the task's metric is lower-is-better (read from the first row) and the minimum
    otherwise.
    """
    numeric_scores = pd.to_numeric(group["score"], errors="coerce")
    if group["lower_is_better"].iloc[0]:
        return numeric_scores.max()
    return numeric_scores.min()

# One worst score per task, computed over the runs of every method on that task.
worst_scores = (
    experiments_df
    .groupby("task_name")
    .apply(compute_worst_score)
    .reset_index(name="worst_score")
)

# Attach the per-task worst score to every run row.
experiments_df = experiments_df.merge(worst_scores, on="task_name", how="left")

In [None]:
def normalize_score(df):
    """
    Add a linearly normalized score column to `df` (in place) and return `df`.

    The normalized score is (score - worst_score) / (sota - worst_score), read from the
    existing 'score', 'worst_score' and 'sota' columns. Missing results become 0 and
    negative values are clipped to 0; values above 1 are kept.

    Parameters:
    df (pandas.DataFrame): Input dataframe; it is modified in place.
    Returns:
    pandas.DataFrame: The same dataframe with the new 'normalized_score' column
    """
    gap_to_worst = df['score'] - df['worst_score']
    sota_to_worst = df['sota'] - df['worst_score']
    ratio = gap_to_worst / sota_to_worst
    df['normalized_score'] = ratio.fillna(0).clip(lower=0)
    return df
# Add the `normalized_score` column to the experiments table.
experiments_df = normalize_score(experiments_df)

def normalize_score_log(df, exact_match_distance=1e-3):
    """
    Add a log-scale normalized score column to `df` (in place) and return `df`.

    Every score is mapped to -log10(|score - optimal_score|); the same transform is
    applied to 'worst_score' and 'sota', and the result is rescaled so that the worst
    score maps to 0 and the SOTA score to 1.

    Parameters:
    df (pandas.DataFrame): Needs the columns 'score', 'optimal_score', 'worst_score'
        and 'sota'; it is modified in place.
    exact_match_distance (float): Distance substituted when a score equals the optimal
        score exactly, to keep the logarithm finite. The same distance is used whether
        the optimum is 0 or 1 (the previous `|0.999 - optimal|` fallback only behaved
        as intended for an optimum of 1).
    Returns:
    pandas.DataFrame: The same dataframe with the new 'normalized_score_log' column.
        Missing results become 0, infinite results become 100, negatives are clipped to 0.
    """
    optimal = df['optimal_score']
    distance = np.abs(df['score'] - optimal)
    # Substitute before taking the log so log10(0) is never evaluated for scores.
    safe_distance = np.where(distance == 0, exact_match_distance, distance)

    score_log = -np.log10(safe_distance)
    worst_log = -np.log10(np.abs(df['worst_score'] - optimal))
    sota_log = -np.log10(np.abs(df['sota'] - optimal))

    norm = (score_log - worst_log) / (sota_log - worst_log)
    df['normalized_score_log'] = pd.Series(norm).fillna(0).replace([np.inf, -np.inf], 100).clip(lower=0)
    return df
# Add the `normalized_score_log` column to the experiments table.
experiments_df = normalize_score_log(experiments_df)

In [None]:
def parse_into_aggregate_dict(report_df, metric, algorithms=None):
    """Build one (seeds x tasks) score matrix per method.

    Args:
        report_df: Run-level table with 'method_name', 'task_name' and `metric` columns.
        metric: Name of the column whose values fill the matrices.
        algorithms: Methods to process; defaults to every method present in `report_df`.

    Returns:
        Dict mapping each method to a float array with one row per seed and one column
        per task. The n-th run of a task (in row order) is treated as seed n; seeds
        that lack a value for any task are dropped.
    """
    methods = list(report_df["method_name"].unique()) if algorithms is None else algorithms
    print(f"Processing {methods}")

    def _seed_by_task_matrix(method):
        method_rows = report_df[report_df["method_name"] == method].copy()
        # Number the runs of every task 0, 1, 2, ... in their order of appearance.
        method_rows["seed"] = method_rows.groupby("task_name").cumcount()
        table = method_rows.pivot(index="seed", columns="task_name", values=metric)
        matrix = table.dropna().to_numpy().astype(float)
        # Second NaN filter, applied after the float conversion.
        complete = ~np.isnan(matrix).any(axis=1)
        return matrix[complete]

    return {method: _seed_by_task_matrix(method) for method in methods}

def _interval_metrics(sub_df, key, exp_id):
    """Return median/IQM/mean of `key` for one method, each with its CI bounds."""
    score_dict = parse_into_aggregate_dict(sub_df, key, [exp_id])
    aggregate_scores, aggregate_score_cis = rliable.library.get_interval_estimates(
        score_dict, aggregate_func, reps=50000
    )
    assert len(list(aggregate_scores.values())) == 1
    assert len(list(aggregate_score_cis.values())) == 1
    scores = list(aggregate_scores.values())[0]
    cis = list(aggregate_score_cis.values())[0]

    # Positions follow the order of the values returned by `aggregate_func`.
    stat_names = ("median", "iqm", "mean")
    metrics = {f"{key}_{stat}": scores[i] for i, stat in enumerate(stat_names)}
    for i, stat in enumerate(stat_names):
        metrics[f"{key}_{stat}_lower_ci"] = cis[0, i]
        metrics[f"{key}_{stat}_upper_ci"] = cis[1, i]
    return metrics


def get_experiment_summary(
    reports_df: pd.DataFrame,
    experiment_ids: Union[str, List[str]],
    *,
    competition_ids: Optional[List[str]] = None,
    group_label: Optional[str] = None,
    keys: Union[List[str], Tuple[str, ...]] = ("normalized_score",),
    all_only: bool = False,
) -> List[dict]:
    """Summarise run-level results per method, across tasks and optionally per task.

    Args:
        reports_df: Run-level table with 'method_name', 'task_name' and the `keys` columns.
        experiment_ids: One method name or an iterable of method names.
        competition_ids: Optional restriction to these task names.
        group_label: Value stored under 'task_name' in the roll-up row ("all" if None).
        keys: Metric column(s) to aggregate; a single string is treated as one column.
        all_only: If True, only the roll-up row across tasks is produced.

    Returns:
        List of dicts. Roll-up rows use the keys 'method_name' / 'task_name'; per-task
        rows use 'experiment_id' / 'competition_id' and also carry the raw values of
        every metric under '<key>_raw'. A roll-up row is only produced for methods
        with results on more than one task. Methods without any rows are reported
        with an info message and skipped.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(experiment_ids, str):
        experiment_ids = [experiment_ids]
    if isinstance(keys, str):
        keys = [keys]

    summaries: List[dict] = []

    # --------------- iterate over each experiment --------------------- #
    for exp_id in experiment_ids:
        exp_df = reports_df[reports_df["method_name"] == exp_id]

        # caller may restrict competitions
        if competition_ids is not None:
            exp_df = exp_df[exp_df["task_name"].isin(competition_ids)]

        if exp_df.empty:
            print(f"[Info] No submissions found for experiment_id '{exp_id}'.")
            continue

        # --------------- "all competitions" roll-up ------------------- #
        if exp_df["task_name"].nunique() > 1:
            all_stats = {
                "method_name": exp_id,
                "task_name": "all" if group_label is None else group_label,
                "total_submissions": len(exp_df),
            }
            for k in keys:
                all_stats.update(_interval_metrics(exp_df, k, exp_id))
            summaries.append(all_stats)

        if all_only:
            continue

        # --------------- one row per competition ---------------------- #
        comp_list = competition_ids if competition_ids is not None else exp_df["task_name"].unique()

        for comp_id in comp_list:
            comp_df = exp_df[exp_df["task_name"] == comp_id]
            if comp_df.empty:
                continue

            stats = {
                "experiment_id": exp_id,
                "competition_id": comp_id,
                "total_submissions": len(comp_df),
            }
            for k in keys:
                stats.update(_interval_metrics(comp_df, k, exp_id))
            stats.update({f"{k}_raw": comp_df[k].tolist() for k in keys})
            summaries.append(stats)

    return summaries

def aggregate_func(x):
    """Return the rliable median, IQM and mean aggregates of `x`, in that order, as an array."""
    estimators = (
        rliable.metrics.aggregate_median,
        rliable.metrics.aggregate_iqm,
        rliable.metrics.aggregate_mean,
    )
    return np.array([estimate(x) for estimate in estimators])


In [None]:
import os
import sys

# Make the parent of the current working directory importable.
parent_dir = os.path.dirname(os.path.abspath(os.getcwd()))
sys.path.append(parent_dir)

# Summarise every method over every task present in the experiments table.
experiment_ids = experiments_df['method_name'].unique()
competition_ids = experiments_df['task_name'].unique()

# all_only=True: only the roll-up row across tasks is produced for each method.
experiment_summary = get_experiment_summary(
    experiments_df,
    experiment_ids=experiment_ids,
    competition_ids=competition_ids,
    keys=['valid_submission', 'normalized_score', 'normalized_score_log'],
    all_only=True,
)

experiment_summary_df = pd.DataFrame(experiment_summary)

In [None]:
# Left-join the summary metrics onto the Elo table, so every Elo player is kept
# even when it has no row in experiment_summary_df.
merged_df = df_sorted.merge(
    experiment_summary_df,
    left_on='model',
    right_on='method_name',
    how='left'
)
merged_df = merged_df.drop(columns=["method_name"])

# Rename ELO columns
# NOTE: 'rating' is the 0.5 quantile (median) of the bootstrap ratings, although it
# is renamed to 'elo_mean' here.
experiment_summary_df = merged_df.rename(
    columns={
        "model": "method_name",
        'rating': 'elo_mean',
        'upper': 'elo_upper',
        'lower': 'elo_lower',

    }
)
experiment_summary_df

### Plot Full Results
Valid Submission Rate + Average Normalized Score + Elo Ratings

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.ticker as mticker

# ---------- FIX 0: drop duplicate columns by name (keeps first copy) ----------
experiment_summary_df = experiment_summary_df.loc[:, ~experiment_summary_df.columns.duplicated()].copy()

# Define a lighter, cleaner color palette for LLMs
llm_colors = {
    'Devstral': '#B3F0C8',
    'CWM': '#FFD4AD',
    'GPT-4o': '#B3D4F9',
    'gpt-oss-20b': '#E5DCF2',
    'gpt-oss-120b': '#C8BAEE',
    'o3-mini': '#FFB3B3',
    'SOTA':  '#FFECB3'
}

# Define hatching patterns for strategies
strategy_patterns = {
    'One-Shot': '',
    'Greedy': '///',
    'ReAct': '...',
    'SOTA': ''
}

def rename_method(method_name: str) -> str:
    """Convert a raw method key into its "<Strategy> <LLM>" display name.

    The key is split once on the first underscore or, if it has none, on the first
    space. Both halves are translated through the lookup tables below and unknown
    parts are kept as they are. A key without a separator is looked up as a strategy
    first, then as a model.
    """
    strategy_map = {
        'oneshot': 'One-Shot',
        'greedy': 'Greedy',
        'mlgym': 'ReAct',
        'sota': 'SOTA'
    }
    model_map = {
        'devstral': 'Devstral',
        'cwm': 'CWM',
        'gpt-4o': 'GPT-4o',
        'gpt_4o': 'GPT-4o',
        'gpt-oss-20b': 'gpt-oss-20b',
        'gpt_oss_20b': 'gpt-oss-20b',
        'gpt-oss-120b': 'gpt-oss-120b',
        'gpt_oss_120b': 'gpt-oss-120b',
        'o3-mini': 'o3-mini',
        'o3_mini': 'o3-mini',
        'sota': 'SOTA'
    }

    cleaned = str(method_name).strip()

    # The underscore takes precedence over the space as separator.
    for separator in ("_", " "):
        if separator in cleaned:
            raw_strategy, raw_llm = cleaned.split(separator, 1)
            pretty_strategy = strategy_map.get(raw_strategy, raw_strategy)
            pretty_llm = model_map.get(raw_llm, raw_llm)
            return f"{pretty_strategy} {pretty_llm}"

    return strategy_map.get(cleaned, model_map.get(cleaned, cleaned))

# ---- Prepare data ----
experiment_summary_df_sorted = (
    experiment_summary_df
    .sort_values('normalized_score_log_mean', ascending=True, na_position='last')
    .copy()
)

experiment_summary_df_sorted['method_name'] = experiment_summary_df_sorted['method_name'].apply(rename_method)

# Split renamed method into strategy + llm ONCE
split_cols = experiment_summary_df_sorted['method_name'].str.split(' ', n=1, expand=True)
experiment_summary_df_sorted['strategy'] = split_cols[0]
experiment_summary_df_sorted['llm'] = split_cols[1].fillna(split_cols[0])  # "SOTA" -> llm="SOTA"

# Coerce numeric columns
num_cols = [
    'elo_mean', 'elo_lower', 'elo_upper',
    'valid_submission_mean', 'valid_submission_mean_lower_ci', 'valid_submission_mean_upper_ci',
    'normalized_score_log_mean', 'normalized_score_log_mean_lower_ci', 'normalized_score_log_mean_upper_ci'
]
for c in num_cols:
    if c in experiment_summary_df_sorted.columns:
        experiment_summary_df_sorted[c] = pd.to_numeric(experiment_summary_df_sorted[c], errors='coerce')

# If duplicates exist, keep first
experiment_summary_df_sorted = experiment_summary_df_sorted.drop_duplicates(subset=['method_name'], keep='first')

methods = experiment_summary_df_sorted['method_name'].tolist()
metrics_left = ['Valid Submission Rate', 'Average Normalized Score']
metrics_right = ['Elo Rating']

# ============================================================
# LAYOUT KNOBS (WIDER BARS + FIT EVERYTHING)
# ============================================================
CLUSTER_SPAN        = 1.15   # <<< wider bar bundles per metric (increase to 1.35 for even wider)
LEFT_METRIC_SPACING = 1.6   # more room between the two left clusters
GAP_SCORE_TO_ELO    = 0.80   # more room between score and Elo clusters
SCORE_AXIS_INSET    = -0.15
X_PAD               = 0.1   # side padding so nothing clips
# ============================================================

# ---- X geometry ----
x_left  = np.array([0.0, LEFT_METRIC_SPACING])
x_right = np.array([x_left[1] + 1.0 + GAP_SCORE_TO_ELO])

n = len(methods)
width = CLUSTER_SPAN / n

# ---- Axes: 3 y-axes total ----
fig, ax_valid = plt.subplots(figsize=(18, 6))

# Average Normalized Score axis (moved to the LEFT of the avg-score bars)
ax_score = ax_valid.twinx()
ax_score.patch.set_visible(False)
ax_score.tick_params(axis='x', bottom=False, labelbottom=False)

# Elo axis on the far right
ax_elo = ax_valid.twinx()
ax_elo.patch.set_visible(False)
ax_elo.tick_params(axis='x', bottom=False, labelbottom=False)

# Place the avg-score y-axis just LEFT of the avg-score bar cluster
score_axis_x = x_left[1] - 0.5 + SCORE_AXIS_INSET
ax_score.spines["left"].set_position(("data", score_axis_x))
ax_score.spines["left"].set_visible(True)
ax_score.spines["right"].set_visible(False)
ax_score.spines["top"].set_visible(False)
ax_score.spines["bottom"].set_visible(False)
ax_score.yaxis.set_label_position("left")
ax_score.yaxis.tick_left()

# ---- Plot ----
for i, (_, row) in enumerate(experiment_summary_df_sorted.iterrows()):
    method = row['method_name']
    strategy = row['strategy']
    llm = row['llm']

    color = llm_colors.get(llm, '#000000')
    hatch = strategy_patterns.get(strategy, '')

    # Centered offsets for bars within each metric cluster
    offset = (i - (n - 1) / 2) * width

    # 1) Valid Submission Rate (LEFT axis, 0–100) - skip SOTA
    if method != "SOTA" and np.isfinite(row['valid_submission_mean']):
        value = row['valid_submission_mean'] * 100
        err_low = (row['valid_submission_mean'] - row['valid_submission_mean_lower_ci']) * 100
        err_up  = (row['valid_submission_mean_upper_ci'] - row['valid_submission_mean']) * 100

        bar = ax_valid.bar(
            x_left[0] + offset, value, width,
            yerr=[[err_low], [err_up]],
            label=method, capsize=3, alpha=0.8,
            color=color, hatch=hatch, edgecolor='black', linewidth=0.5
        )[0]
        ax_valid.text(
            bar.get_x() + bar.get_width()/2., value + err_up + 1,
            f'{value:.0f}', ha='center', va='bottom', fontsize=10
        )

    # 2) Average Normalized Score (DECIMAL 0–1) - skip SOTA
    if method != "SOTA" and np.isfinite(row['normalized_score_log_mean']):
        value = row['normalized_score_log_mean']  # decimal
        err_low = row['normalized_score_log_mean'] - row['normalized_score_log_mean_lower_ci']
        err_up  = row['normalized_score_log_mean_upper_ci'] - row['normalized_score_log_mean']

        bar = ax_score.bar(
            x_left[1] + offset, value, width,
            yerr=[[err_low], [err_up]],
            label=None, capsize=3, alpha=0.8,
            color=color, hatch=hatch, edgecolor='black', linewidth=0.5
        )[0]
        ax_score.text(
            bar.get_x() + bar.get_width()/2., value + err_up + 0.01,
            f'{value:.2f}', ha='center', va='bottom', fontsize=8
        )

    # 3) Elo Rating (RIGHT axis) - include SOTA
    if np.isfinite(row['elo_mean']) and np.isfinite(row['elo_lower']) and np.isfinite(row['elo_upper']):
        value = row['elo_mean']
        err_low = row['elo_mean'] - row['elo_lower']
        err_up  = row['elo_upper'] - row['elo_mean']

        label_for_legend = method if method == "SOTA" else None

        bar = ax_elo.bar(
            x_right[0] + offset, value, width,
            yerr=[[err_low], [err_up]],
            label=label_for_legend, capsize=3, alpha=0.8,
            color=color, hatch=hatch, edgecolor='black', linewidth=0.5
        )[0]

        vertical_offset = 15 if i % 2 == 0 else 5
        ax_elo.text(
            bar.get_x() + bar.get_width()/2., value + err_up + vertical_offset,
            f'{value:.0f}', ha='center', va='bottom', fontsize=8
        )

# Separator between AvgScore and Elo
sep_x = (x_left[1] + x_right[0]) / 2.0
ax_valid.axvline(x=sep_x, color='gray', linestyle='--', linewidth=1, alpha=0.5)

# ---- Labels/limits ----
ax_valid.set_ylabel('Valid submission rate', fontsize=15, fontweight='bold')
ax_score.set_ylabel('Average normalized score', fontsize=15, fontweight='bold')
ax_elo.set_ylabel('Elo Rating', fontsize=15, fontweight='bold')

ax_valid.set_ylim(0, 105)
ax_score.set_ylim(0, 1.05)
ax_elo.set_ylim(bottom=650)

ax_score.yaxis.set_major_locator(mticker.FixedLocator(np.arange(0.2, 1.01, 0.2)))
ax_score.yaxis.set_major_formatter(mticker.FormatStrFormatter('%.2f'))

ax_valid.tick_params(axis='y', labelsize=13)
ax_score.tick_params(axis='y', labelsize=13)
ax_elo.tick_params(axis='y', labelsize=13)

# X ticks
all_x = np.concatenate([x_left, x_right])
ax_valid.set_xticks(all_x)
ax_valid.set_xticklabels(metrics_left + metrics_right, fontsize=14)
ax_valid.get_xaxis().set_visible(False)

# X-limits (auto-fit to chosen CLUSTER_SPAN)
cluster_half = CLUSTER_SPAN / 2
ax_valid.set_xlim(x_left[0] - cluster_half - X_PAD,
                  x_right[0] + cluster_half + X_PAD)

# Legend: collect from valid axis + elo axis so SOTA appears (score axis has no labels)
h1, l1 = ax_valid.get_legend_handles_labels()
h3, l3 = ax_elo.get_legend_handles_labels()

leg = ax_valid.legend(
    h1 + h3, l1 + l3,
    loc='upper center',
    bbox_to_anchor=(0.68, 1.0),
    title='Agents',
    framealpha=0.8,
    fontsize=10,
)
leg.get_title().set_fontsize(12)
leg.get_title().set_fontweight('bold')

# Grid
ax_valid.grid(axis='y', alpha=0.3)
ax_score.grid(False)
ax_elo.grid(False)

plt.tight_layout()
plt.savefig('fig_metrics_summary.pdf', format='pdf', bbox_inches='tight')
plt.show()

## 1. Valid Submission Rate (VSR)

In [None]:
# Summary Plots

scaffold_colors = {'oneshot': "#4A90E2", 'greedy': "#E57373", 'mlgym': "#66BB6A", 'overall': "#FF8C00"}

def get_scaffold(method):
    """Return the scaffold key used to colour a method's bar.

    The key is the first of 'oneshot', 'greedy', 'mlgym' that the method name
    starts with (case-sensitive). The aggregate 'Overall' entry (matched
    case-insensitively) and any unrecognised name map to 'overall'.
    """
    if method.lower() != 'overall':
        for prefix in ('oneshot', 'greedy', 'mlgym'):
            if method.startswith(prefix):
                return prefix
    return 'overall'

def format_method_name(name):
    """Build the multi-line x-tick label for a method.

    'Overall' (any casing) becomes a bold mathtext label. A name of the form
    '<scaffold>_<model>' is split at the first underscore: the scaffold is
    mapped to its display name, the model is prettified, and the two are
    joined with a newline. Names without an underscore are returned as-is.
    """
    if name.lower() == 'overall':
        return r'$\mathbf{Overall}$'
    if '_' not in name:
        return name

    prefix, model = name.split('_', 1)
    scaffold_labels = {'mlgym': 'ReAct', 'oneshot': 'One-Shot', 'greedy': 'Greedy'}
    scaffold = scaffold_labels.get(prefix, prefix)

    # Applied in this order; the gpt-oss entries also insert a line break.
    substitutions = (
        ('gpt-oss-20b', 'gpt-oss\n20b'),
        ('gpt-oss-120b', 'gpt-oss\n120b'),
        ('gpt-4o', 'GPT-4o'),
        ('o3_mini', 'o3-mini'),
        ('cwm', 'CWM'),
        ('devstral', 'Devstral'),
    )
    for old, new in substitutions:
        model = model.replace(old, new)
    return f"{scaffold}\n{model}"

def mean_ci(series):
    """Return ``(mean, lower, upper)`` for a two-sided 95% Student-t interval.

    Args:
        series: pandas Series of observations. NaN entries are ignored, which
            matches how ``Series.mean`` and ``Series.sem`` treat them.

    Returns:
        Tuple ``(mean, mean - h, mean + h)`` where ``h`` is the standard error
        times the t critical value. With fewer than two non-NaN observations
        no spread can be estimated and ``h`` is 0.
    """
    # Count only non-NaN values so the degrees of freedom agree with the
    # sample that mean()/sem() actually use.
    n = int(series.count())
    mean = series.mean()
    if n <= 1:
        return mean, mean, mean
    h = series.sem() * t.ppf(0.975, n - 1)
    return mean, mean - h, mean + h

plt.rcParams.update({"font.family": "sans-serif", "font.sans-serif": ["DejaVu Sans"], "font.size": 24})
sns.set_context("notebook", font_scale=2.5)
sns.set_style("whitegrid")
print("Setup complete")

In [None]:
# --- VSR Bar Chart with Error Bars ---
vsr_records = []
for task_name, agents_dict in full_results.items():
    for agent, runs in agents_dict.items():
        for seed_idx, (score, status) in enumerate(runs):
            vsr_records.append({'task': task_name, 'method': agent, 'is_valid': (status == 'SUCCESS'), 'seed': seed_idx})

df_vsr = pd.DataFrame(vsr_records)
vsr_per_task = df_vsr.groupby(['method', 'task'])['is_valid'].mean().reset_index()
vsr_per_task.columns = ['method', 'task', 'vsr']

# Per-method VSR: mean and confidence interval (from mean_ci) taken over that
# method's per-task rates in vsr_per_task.
method_vsr_stats = {}
for method in vsr_per_task['method'].unique():
    vals = vsr_per_task[vsr_per_task['method'] == method]['vsr']
    mean_val, lower, upper = mean_ci(vals)
    method_vsr_stats[method] = {'mean': mean_val, 'lower': lower, 'upper': upper}

# 'Overall' pools every (method, task) rate into one sample.
all_vsr = vsr_per_task['vsr']
overall_mean, overall_lower, overall_upper = mean_ci(all_vsr)
method_vsr_stats['Overall'] = {'mean': overall_mean, 'lower': overall_lower, 'upper': overall_upper}

# Bar order: highest mean VSR first ('Overall' is sorted in with the methods).
methods_sorted = sorted(method_vsr_stats.keys(), key=lambda m: method_vsr_stats[m]['mean'], reverse=True)
means = np.array([method_vsr_stats[m]['mean'] for m in methods_sorted])
# Clamp interval bounds to [0, 1], the valid range of a rate.
lowers = np.clip([method_vsr_stats[m]['lower'] for m in methods_sorted], 0, 1)
uppers = np.clip([method_vsr_stats[m]['upper'] for m in methods_sorted], 0, 1)
# Row 0 = lower error-bar lengths, row 1 = upper lengths (the 2xN layout passed
# as yerr below); clipped so a bar's whiskers stay within [0, 1].
errs = np.vstack([np.clip(means - lowers, 0, means), np.clip(uppers - means, 0, 1 - means)])

plt.figure(figsize=(30, 8))
x = np.arange(len(methods_sorted))
bars = plt.bar(x, means, width=0.8, yerr=errs, capsize=4, color=[scaffold_colors[get_scaffold(m)] for m in methods_sorted], edgecolor="black", alpha=0.8, linewidth=0)
for i, mean in enumerate(means):
    plt.text(x[i], mean + errs[1,i] + 0.01, f"{mean:.2f}", ha='center', va='bottom', fontsize=26)
plt.xticks(x, [format_method_name(m) for m in methods_sorted], fontsize=25, linespacing=1.5)
plt.yticks(np.arange(0, 1.10, 0.1), fontsize=28)
plt.ylabel("Valid Submission Rate", fontsize=32)
plt.ylim(0, 1.10)
plt.xlim(-0.5, len(methods_sorted) - 0.5)
legend_elements = [Patch(facecolor=scaffold_colors['oneshot'], label='One-Shot'), Patch(facecolor=scaffold_colors['greedy'], label='Greedy'), Patch(facecolor=scaffold_colors['mlgym'], label='ReAct')]
plt.legend(handles=legend_elements, loc='upper right', fontsize=30, title="Scaffold", title_fontsize=32)
plt.grid(axis="y", linestyle="-", alpha=0.5)
for spine in plt.gca().spines.values(): spine.set_visible(True); spine.set_color('black'); spine.set_linewidth(1.5)
plt.tight_layout()
plt.savefig("fig_valid_submission_rate.pdf", bbox_inches="tight")
plt.show()

## 2. VSR - Stacked Bar Chart

In [None]:
# --- VSR Stacked Bar Chart (matching source notebook style) ---

sns.set_theme(style="whitegrid", font_scale=2.0, rc={
    "axes.edgecolor": "#333333",
    "axes.linewidth": 1.5,
})

categories = [
    "Invalid",
    "Low (1-33%)",
    "Medium (34-66%)",
    "High (67-100%)"
]
colors = [
    "#d0d0d0",
    "#ff8b6d",
    "#ffdb9a",
    "#95d5b2",
]

# For each method, compute the valid submission rate of every task and count
# how many tasks fall into each of the four `categories` bins.
method_distributions = {}
methods = df_vsr['method'].unique()
for method in methods:
    df_m = df_vsr[df_vsr['method'] == method]
    # Per-task valid submission rate: mean of is_valid over that task's runs
    rates = df_m.groupby('task')['is_valid'].mean() * 100  # percent
    # Bin edges, in the same order as `categories`: ==0, (0, 33], (33, 66], >66.
    # NOTE(review): a rate of exactly 1/3 (33.33...%) is > 33 and is therefore
    # counted as "Medium", although the legend labels read "Low (1-33%)" /
    # "Medium (34-66%)" - confirm this is the intended boundary.
    counts = [
        np.sum(rates == 0),
        np.sum((rates > 0) & (rates <= 33)),
        np.sum((rates > 33) & (rates <= 66)),
        np.sum(rates > 66),
    ]
    total = sum(counts)
    # Share of this method's tasks per bin, in percent (all zeros if no tasks)
    percentages = [c/total*100 if total > 0 else 0 for c in counts]
    method_distributions[method] = (counts, percentages)

# Sort methods by "Medium" + "High" percentage (descending);
# indices 2 and 3 of the percentages list are the Medium and High bins.
medium_high_percentages = [(method, method_distributions[method][1][2] + method_distributions[method][1][3]) for method in methods]
medium_high_percentages.sort(key=lambda x: x[1], reverse=True)
sorted_methods = [m for m, _ in medium_high_percentages]

methods_sorted_formatted = [format_method_name(m) for m in sorted_methods]
counts_array = np.array([method_distributions[m][0] for m in sorted_methods])
percentages_array = np.array([method_distributions[m][1] for m in sorted_methods])
x = np.arange(len(sorted_methods))

fig, ax = plt.subplots(figsize=(72, 18))
bottom = np.zeros(len(sorted_methods))
bar_width = 0.5

for i, label in enumerate(categories):
    ax.bar(
        x,
        percentages_array[:, i],
        width=bar_width,
        bottom=bottom,
        label=label,
        color=colors[i],
        alpha=0.85,
        linewidth=0.0,
        edgecolor='white'
    )
    bottom += percentages_array[:, i]

ax.set_ylabel("Task Percentage (%)", labelpad=25, fontsize=64)
ax.tick_params(axis='y', labelsize=60, width=2, length=10)
ax.set_xticks(x)
ax.set_xticklabels(methods_sorted_formatted, rotation=0, ha="center", fontsize=54, linespacing=1.5)
ax.tick_params(axis='x', labelsize=54, width=2, length=8, pad=10)
ax.grid(axis="y", linestyle="-", alpha=0.3, linewidth=1.5, zorder=0)
ax.xaxis.grid(False)
ax.set_axisbelow(True)
ax.set_ylim(0, 108)
ax.set_yticks(np.arange(0.0, 100.1, 12.5))

# Add percentage labels inside bars
min_height_for_text = 8
for i, method in enumerate(sorted_methods):
    cumulative = 0
    for j in range(4):
        pct = percentages_array[i, j]
        if pct > 0 and pct >= min_height_for_text:
            fontsize = 60
            text_color = '#2C3E50'
            ax.text(
                i, cumulative + pct / 2,
                f'{pct:.1f}',
                ha='center', va='center',
                fontsize=fontsize,
                color=text_color,
                fontweight='normal'
            )
        cumulative += pct

legend = ax.legend(
    title="Submission Rate",
    frameon=True,
    loc='center left',
    bbox_to_anchor=(1.01, 0.5),
    ncol=1,
    framealpha=1.0,
    edgecolor='#333333',
    fancybox=False,
    shadow=False,
    fontsize=54,
    title_fontsize=58
)
legend.get_frame().set_linewidth(2)

for spine in ax.spines.values():
    spine.set_linewidth(3)
    spine.set_edgecolor('#222222')

plt.tight_layout()
plt.savefig("fig_vsr_stacked.pdf", bbox_inches="tight", dpi=300, facecolor='white')
plt.show()

## 3. Average Normalized Score (ANS)

In [None]:
# --- ANS Bar Chart with Error Bars (using experiment_summary_df) ---

# Prepare data from experiment_summary_df
df_plot = experiment_summary_df.copy()
df_plot = df_plot[df_plot['method_name'] != 'Overall']  # Exclude if exists

# Rename for consistency
df_plot['method'] = df_plot['method_name']

# Use normalized_score_log_mean as the ANS metric
method_ans_stats = {}
for _, row in df_plot.iterrows():
    method = row['method']
    if pd.notna(row['normalized_score_log_mean']):
        method_ans_stats[method] = {
            'mean': row['normalized_score_log_mean'],
            'lower': row['normalized_score_log_mean_lower_ci'],
            'upper': row['normalized_score_log_mean_upper_ci']
        }

# Add overall (average across all methods).
# The interval is a two-sided 95% Student-t interval over the per-method means,
# the same construction mean_ci() uses for the VSR 'Overall' bar. A fixed
# normal z of 1.96 is too narrow for the small number of methods averaged here.
all_means = df_plot['normalized_score_log_mean'].dropna()
if len(all_means) > 0:
    overall_mean = all_means.mean()
    overall_std = all_means.std()
    n = len(all_means)
    # A single method gives no spread estimate, so the interval collapses to 0.
    overall_ci = t.ppf(0.975, n - 1) * overall_std / np.sqrt(n) if n > 1 else 0
    method_ans_stats['Overall'] = {
        'mean': overall_mean,
        'lower': overall_mean - overall_ci,
        'upper': overall_mean + overall_ci
    }

# Sort methods by ANS (descending)
methods_sorted_ans = sorted(method_ans_stats.keys(), key=lambda m: method_ans_stats[m]['mean'], reverse=True)
means = np.array([method_ans_stats[m]['mean'] for m in methods_sorted_ans])
lowers = np.clip([method_ans_stats[m]['lower'] for m in methods_sorted_ans], 0, 1)
uppers = np.clip([method_ans_stats[m]['upper'] for m in methods_sorted_ans], 0, 1)
errs = np.vstack([np.clip(means - lowers, 0, means), np.clip(uppers - means, 0, 1 - means)])

plt.figure(figsize=(30, 8))
x = np.arange(len(methods_sorted_ans))
bars = plt.bar(x, means, width=0.8, yerr=errs, capsize=4,
               color=[scaffold_colors[get_scaffold(m)] for m in methods_sorted_ans],
               edgecolor="black", alpha=0.8, linewidth=0)

for i, mean in enumerate(means):
    plt.text(x[i], mean + errs[1,i] + 0.01, f"{mean:.3f}", ha='center', va='bottom', fontsize=26)

plt.xticks(x, [format_method_name(m) for m in methods_sorted_ans], fontsize=25, linespacing=1.5)
plt.yticks(np.arange(0, 0.75, 0.05), fontsize=28)
plt.ylabel("Normalized Score", fontsize=32)
plt.ylim(0, 0.60)
plt.xlim(-0.5, len(methods_sorted_ans) - 0.5)

legend_elements = [
    Patch(facecolor=scaffold_colors['oneshot'], label='One-Shot'),
    Patch(facecolor=scaffold_colors['greedy'], label='Greedy'),
    Patch(facecolor=scaffold_colors['mlgym'], label='ReAct')
]
plt.legend(handles=legend_elements, loc='upper right', fontsize=30, title="Scaffold", title_fontsize=32)
plt.grid(axis="y", linestyle="-", alpha=0.5)

for spine in plt.gca().spines.values():
    spine.set_visible(True)
    spine.set_color('black')
    spine.set_linewidth(1.5)

plt.tight_layout()
plt.savefig("fig_avg_normalized_score.pdf", bbox_inches="tight")
plt.show()


## 4. ANS - Stacked Bar Chart

In [None]:
# --- ANS Stacked Bar Chart (Performance Categories - Task Specific) ---
# MATCHING SOURCE NOTEBOOK LOGIC EXACTLY

sns.set_theme(style="whitegrid", font_scale=2.0, rc={
    "axes.edgecolor": "#333333",
    "axes.linewidth": 1.5,
})

colors = ["#d0d0d0", "#ff8b6d", "#ffdb9a", "#6ec6ff", "#95d5b2"]
bins = ["Invalid", "Worst", "Below Avg", "Above Avg", "Best"]

# ==================== STEP 1: Build raw records ====================
# One record per (task, agent, run). Tasks that have no metadata entry, or
# whose metadata carries no SOTA score, are skipped entirely.
records = []
for task_name, agents_dict in full_results.items():
    if task_name not in task_metadata:
        continue
    meta = task_metadata[task_name]
    logging_info = meta.get('logging_info', {})
    sota_list = logging_info.get('sota', [])
    # Only the first SOTA entry is used.
    sota = sota_list[0].get('sota_score') if sota_list else None
    lower_is_better = meta.get('metric_lower_is_better', False)

    if sota is None:
        continue

    for agent, runs in agents_dict.items():
        for seed_idx, (score, status) in enumerate(runs):
            is_valid = (status == 'SUCCESS')
            # A successful run whose score is missing or non-numeric keeps
            # is_valid=True but gets score=None. Only conversion failures are
            # caught, so unrelated errors (and KeyboardInterrupt) still surface.
            try:
                score_val = float(score) if is_valid else None
            except (TypeError, ValueError):
                score_val = None

            records.append({
                'task': task_name,
                'method': agent,
                'score': score_val,
                'lower_is_better': lower_is_better,
                'sota': float(sota),
                'is_valid': is_valid,
                'seed': seed_idx
            })

df = pd.DataFrame(records)
print(f"Built df with {len(df)} records")

# ==================== STEP 2: Compute worst score per task from ACTUAL valid scores ====================
# The "worst" reference for a task is the poorest score among all valid runs
# of all methods: the maximum when lower is better, otherwise the minimum.
worst_score_per_task = {}
for task in df['task'].unique():
    task_data = df[(df['task'] == task) & (df['is_valid'] == True)]
    if len(task_data) > 0:
        # lower_is_better is set per task in STEP 1, so the first row is representative
        lower_is_better = task_data['lower_is_better'].iloc[0]
        if lower_is_better:
            worst_score_per_task[task] = task_data['score'].max()
        else:
            worst_score_per_task[task] = task_data['score'].min()
    else:
        # No valid run for this task: no worst reference is available
        worst_score_per_task[task] = None

# Attach the task-level reference to every run row
df['worst_score'] = df['task'].map(worst_score_per_task)

# ==================== STEP 3: March of 9's transform ====================
def march_of_9s_normalized(row):
    """Return one run's "march of 9s" score, normalized between worst and SOTA.

    The march of a value is ``-log10(|value - optimal|)``, where ``optimal`` is
    0 for lower-is-better metrics and 1 otherwise. The run's march is rescaled
    so the task's worst valid score maps to 0 and the SOTA score maps to 1.

    Args:
        row: mapping with keys 'is_valid', 'score', 'sota', 'worst_score' and
            'lower_is_better' (a DataFrame row or a plain dict).

    Returns:
        0.0 for invalid runs or runs without a score; NaN when SOTA and worst
        have (numerically) the same march, so no scale exists; otherwise the
        normalized march, floored at 0. Values above 1 mean the run beat SOTA.
    """
    if not row['is_valid'] or row['score'] is None or pd.isna(row['score']):
        return 0.0

    optimal = 0.0 if row['lower_is_better'] else 1.0
    # Stand-in distance (~1e-3, i.e. three nines) for a value that sits exactly
    # on the optimum, where log10 is undefined. It is a fixed distance rather
    # than abs(0.999 - optimal), which evaluates to 0.999 for lower-is-better
    # metrics and would rank a perfect score as the worst possible one. Written
    # as abs(0.999 - 1.0) so higher-is-better results are bit-for-bit unchanged.
    zero_gap = abs(0.999 - 1.0)

    def march(value):
        gap = abs(value - optimal)
        if gap == 0:
            gap = zero_gap
        return -np.log10(gap)

    score_march = march(row['score'])
    # SOTA and worst get the same guard, so a reference score exactly at the
    # optimum no longer produces an infinite march.
    sota_march = march(row['sota'])
    worst_march = march(row['worst_score'])

    if np.isclose(sota_march, worst_march):
        return np.nan

    normalized = (score_march - worst_march) / (sota_march - worst_march)
    return max(0, normalized)

df['normalized_march'] = df.apply(march_of_9s_normalized, axis=1)

# ==================== STEP 4: Average per method per task ====================
df_avg = df[df['method'] != 'Overall'].groupby(['task', 'method'], as_index=False).agg({
    'normalized_march': 'mean',
    'lower_is_better': 'first',
    'sota': 'first',
}).dropna(subset=['normalized_march'])

print(f"Built df_avg with {len(df_avg)} records")

# ==================== STEP 5: Task-specific binning (from source) ====================
method_task_avg = df_avg.groupby(['task', 'method'], as_index=False)['normalized_march'].mean()
methods = method_task_avg['method'].unique()
common_tasks = method_task_avg['task'].unique()

def categorize_method_for_task(task_data, method_score):
    """Place a method's task score into one of the five performance bins.

    Args:
        task_data: DataFrame holding every method's 'normalized_march' for
            one task.
        method_score: the score of the method being categorized on that task.

    Returns:
        Index into ``bins``: 0 Invalid (score missing or zero, or no method
        has a non-zero score), 1 Worst, 2 Below Avg, 3 Above Avg, 4 Best.
        When all non-zero scores on the task are identical the result is 3.
    """
    if pd.isna(method_score) or method_score == 0:
        return 0

    scores = task_data['normalized_march']
    # Only methods with a real (non-missing, non-zero) score define the scale.
    nonzero = scores[scores.notna() & (scores != 0)]
    if nonzero.empty:
        return 0

    best, worst = nonzero.max(), nonzero.min()
    if best == worst:
        return 3

    tol = 1e-10
    if abs(method_score - best) < tol:
        return 4
    if abs(method_score - worst) < tol:
        return 1
    return 3 if method_score > nonzero.mean() else 2

# For every method, count how many tasks land in each performance category.
# category_counts is indexed by the code returned by categorize_method_for_task
# (0..4), which follows the order of `bins`.
method_categories = {}
for method in methods:
    method_data = method_task_avg[method_task_avg['method'] == method]
    category_counts = [0, 0, 0, 0, 0]
    for task in common_tasks:
        # Scores of all methods on this task: the reference for best/worst/average
        task_data = method_task_avg[method_task_avg['task'] == task]
        method_score_data = method_data[method_data['task'] == task]['normalized_march']
        # A method with no row for this task gets NaN, which is categorized as 0
        method_score = method_score_data.iloc[0] if len(method_score_data) > 0 else np.nan
        category = categorize_method_for_task(task_data, method_score)
        category_counts[category] += 1
    method_categories[method] = category_counts

# ==================== STEP 6: Sorting and plotting ====================
total_tasks = len(common_tasks)
method_scores = {m: (np.array(method_categories[m][:5]) / total_tasks * 100)[3] + (np.array(method_categories[m][:5]) / total_tasks * 100)[4] for m in methods}
methods_sorted = sorted(method_scores.keys(), key=lambda x: method_scores[x], reverse=True)
methods_sorted_formatted = [format_method_name(m) for m in methods_sorted]

counts_array = np.array([method_categories[m][:5] for m in methods_sorted])
percentage_array = (counts_array / total_tasks) * 100
x = np.arange(len(methods_sorted))

fig, ax = plt.subplots(figsize=(72, 18))
bottom = np.zeros(len(methods_sorted))
bar_width = 0.5

for i, label in enumerate(bins):
    ax.bar(x, percentage_array[:, i], width=bar_width, bottom=bottom,
           label=label, color=colors[i], alpha=0.85, linewidth=0.0, edgecolor='white')
    bottom += percentage_array[:, i]

ax.set_ylabel("Task Percentage (%)", labelpad=25, fontsize=64)
ax.tick_params(axis='y', labelsize=60, width=2, length=10)
ax.set_xticks(x)
ax.set_xticklabels(methods_sorted_formatted, rotation=0, ha="center", fontsize=54, linespacing=1.5)
ax.tick_params(axis='x', labelsize=54, width=2, length=8, pad=10)
ax.grid(axis="y", linestyle="-", alpha=0.3, linewidth=1.5, zorder=0)
ax.xaxis.grid(False)
ax.set_axisbelow(True)
ax.set_ylim(0, 108)
ax.set_yticks(np.arange(0, 101, 12.5))

min_height_for_text = 8
for i, method in enumerate(methods_sorted):
    cumulative = 0
    for j in range(5):
        pct = percentage_array[i, j]
        if pct > 0 and pct >= min_height_for_text:
            ax.text(i, cumulative + pct / 2, f"{pct:.1f}",
                    ha='center', va='center', fontsize=60, color='#2C3E50', fontweight='normal')
        cumulative += pct

legend = ax.legend(title="Performance Category", frameon=True, loc='center left',
                   bbox_to_anchor=(1.01, 0.5), ncol=1, framealpha=1.0,
                   edgecolor='#333333', fancybox=False, shadow=False,
                   fontsize=54, title_fontsize=58)
legend.get_frame().set_linewidth(2)

for spine in ax.spines.values():
    spine.set_linewidth(3)
    spine.set_edgecolor('#222222')

plt.tight_layout()
plt.savefig("fig_performance_categories_stackedbar_taskspecific.pdf", bbox_inches="tight")
plt.show()

## 5. ANS Per Task (March of 9's) 

In [None]:
# --- March of 9's Scatter Plot (Per Task) ---

# ==================== BUILD DATA FROM full_results ====================
# One record per (task, agent, run). Tasks that have no metadata entry, or
# whose metadata carries no SOTA score, are skipped entirely.
records = []
for task_name, agents_dict in full_results.items():
    if task_name not in task_metadata:
        continue
    meta = task_metadata[task_name]
    logging_info = meta.get('logging_info', {})
    sota_list = logging_info.get('sota', [])
    # Only the first SOTA entry is used.
    sota = sota_list[0].get('sota_score') if sota_list else None
    lower_is_better = meta.get('metric_lower_is_better', False)

    if sota is None:
        continue

    for agent, runs in agents_dict.items():
        for seed_idx, (score, status) in enumerate(runs):
            is_valid = (status == 'SUCCESS')
            # A successful run whose score is missing or non-numeric keeps
            # is_valid=True but gets score=None. Only conversion failures are
            # caught, so unrelated errors (and KeyboardInterrupt) still surface.
            try:
                score_val = float(score) if is_valid else None
            except (TypeError, ValueError):
                score_val = None
            records.append({
                'task': task_name,
                'method': agent,
                'score': score_val,
                'lower_is_better': lower_is_better,
                'sota': float(sota),
                'is_valid': is_valid,
                'seed': seed_idx
            })

df = pd.DataFrame(records)
common_tasks = df['task'].unique()

# Compute worst score per task from actual valid scores
worst_score_per_task = {}
for task in df['task'].unique():
    task_data = df[(df['task'] == task) & (df['is_valid'] == True)]
    if len(task_data) > 0:
        lower_is_better = task_data['lower_is_better'].iloc[0]
        worst_score_per_task[task] = task_data['score'].max() if lower_is_better else task_data['score'].min()
    else:
        worst_score_per_task[task] = None
df['worst_score'] = df['task'].map(worst_score_per_task)

# March of 9's transform
def march_of_9s_normalized(row):
    """Return one run's "march of 9s" score, normalized between worst and SOTA.

    The march of a value is ``-log10(|value - optimal|)``, where ``optimal`` is
    0 for lower-is-better metrics and 1 otherwise. The run's march is rescaled
    so the task's worst valid score maps to 0 and the SOTA score maps to 1.

    Args:
        row: mapping with keys 'is_valid', 'score', 'sota', 'worst_score' and
            'lower_is_better' (a DataFrame row or a plain dict).

    Returns:
        0.0 for invalid runs or runs without a score; NaN when SOTA and worst
        have (numerically) the same march, so no scale exists; otherwise the
        normalized march, floored at 0. Values above 1 mean the run beat SOTA.
    """
    if not row['is_valid'] or row['score'] is None or pd.isna(row['score']):
        return 0.0

    optimal = 0.0 if row['lower_is_better'] else 1.0
    # Stand-in distance (~1e-3, i.e. three nines) for a value that sits exactly
    # on the optimum, where log10 is undefined. It is a fixed distance rather
    # than abs(0.999 - optimal), which evaluates to 0.999 for lower-is-better
    # metrics and would rank a perfect score as the worst possible one. Written
    # as abs(0.999 - 1.0) so higher-is-better results are bit-for-bit unchanged.
    zero_gap = abs(0.999 - 1.0)

    def march(value):
        gap = abs(value - optimal)
        if gap == 0:
            gap = zero_gap
        return -np.log10(gap)

    score_march = march(row['score'])
    # SOTA and worst get the same guard, so a reference score exactly at the
    # optimum no longer produces an infinite march.
    sota_march = march(row['sota'])
    worst_march = march(row['worst_score'])

    if np.isclose(sota_march, worst_march):
        return np.nan
    normalized = (score_march - worst_march) / (sota_march - worst_march)
    return max(0, normalized)

df['normalized_march'] = df.apply(march_of_9s_normalized, axis=1)

# Average per method per task
df_avg = df[df['method'] != 'Overall'].groupby(['task', 'method'], as_index=False).agg({
    'normalized_march': 'mean', 'lower_is_better': 'first', 'sota': 'first'
})
methods = df_avg['method'].unique()

# Order tasks by average march score
task_order = sorted(df_avg['task'].unique(), key=lambda t: df_avg[df_avg['task']==t]['normalized_march'].mean(), reverse=True)
task_to_num = {task: i+1 for i, task in enumerate(task_order)}
df_avg['task_num'] = df_avg['task'].map(task_to_num)

# Order methods by average score
sorted_methods = df_avg.groupby('method')['normalized_march'].mean().sort_values(ascending=False).index.tolist()

# Colors and shapes
def parse_method_name(name):
    """Split '<scaffold>_<model>' at the first underscore.

    Returns a ``(scaffold, model)`` tuple; a name without an underscore is
    reported as ``('unknown', name)``.
    """
    scaffold, sep, model = name.partition('_')
    if not sep:
        return ('unknown', name)
    return (scaffold, model)

method_info = {m: dict(zip(['scaffold', 'model'], parse_method_name(m))) for m in methods}
unique_models = sorted(set(info['model'] for info in method_info.values()))
scaffold_to_models = defaultdict(list)
for m, info in method_info.items():
    scaffold_to_models[info['scaffold']].append(info['model'])

# Colour encodes the scaffold (one sequential palette per scaffold, one shade
# per model); marker shape encodes the model.
# The palettes are built with one extra colour and the first (lightest) entry
# is dropped by the [1:] slice.
# Indexing the defaultdict with 'greedy' / 'oneshot' creates an empty entry if
# no such method exists; the colour loop below then simply has no models to
# visit for that scaffold.
scaffold_palettes = {
    'greedy': sns.color_palette("Reds", n_colors=max(3, len(set(scaffold_to_models['greedy']))+1))[1:],
    'oneshot': sns.color_palette("Blues", n_colors=max(3, len(set(scaffold_to_models['oneshot']))+1))[1:],
}
# Fallback for every scaffold without its own palette (e.g. 'mlgym'): a single
# green, so all models of such a scaffold share one colour and differ only by
# marker. NOTE(review): confirm that one shared colour is intended.
default_palette = sns.color_palette("Greens", n_colors=2)[1:]

method_to_color = {}
for scaffold, models in scaffold_to_models.items():
    palette = scaffold_palettes.get(scaffold, default_palette)
    # Models are visited in sorted order so shade assignment is deterministic;
    # the modulo wraps around if there are more models than shades.
    for i, model in enumerate(sorted(set(models))):
        for m, info in method_info.items():
            if info['scaffold'] == scaffold and info['model'] == model:
                method_to_color[m] = palette[i % len(palette)]

# One marker per model name (sorted order), cycling if models outnumber markers
available_shapes = ['o', 's', '^', 'v', 'P', 'X', '*', 'h', 'H', 'p', '<', '>', '8', '+']
model_to_shape = {model: available_shapes[i % len(available_shapes)] for i, model in enumerate(unique_models)}
method_to_shape = {m: model_to_shape[info['model']] for m, info in method_info.items()}

def format_method_label(name):
    """Build the single-line legend label for a method.

    '<scaffold>_<model>' is split at the first underscore, the scaffold is
    mapped to its display name, the model is prettified, and the two are
    joined with a space. Names without an underscore are returned unchanged.
    """
    if '_' not in name:
        return name

    prefix, model = name.split('_', 1)
    scaffold = {'mlgym': 'ReAct', 'oneshot': 'One-Shot', 'greedy': 'Greedy'}.get(prefix, prefix)
    for old, new in (
        ('gpt-4o', 'GPT-4o'),
        ('o3_mini', 'o3-mini'),
        ('cwm', 'CWM'),
        ('devstral', 'Devstral'),
    ):
        model = model.replace(old, new)
    return f"{scaffold} {model}"

# Plot
plt.figure(figsize=(28, 25))
ax = plt.gca()
ax.set_facecolor("#f0f5f0")
for spine in ax.spines.values():
    spine.set_edgecolor('black')
    spine.set_linewidth(2)

for y in range(1, len(task_order)+1):
    plt.axhline(y, color='gray', linestyle='--', linewidth=0.7, alpha=0.4, zorder=0)

baseline_line = plt.axvline(0, color='red', linestyle='--', linewidth=2, alpha=0.7, zorder=1, label='Worst overall')
sota_line = plt.axvline(1, color='black', linestyle='-', linewidth=2, alpha=0.7, zorder=1, label='SOTA')

scatter_handles = []
for method in sorted_methods:
    group = df_avg[df_avg['method'] == method]
    h = plt.scatter(group['normalized_march'], group['task_num'], label=method,
                    color=method_to_color[method], marker=method_to_shape[method], s=350, alpha=0.85, zorder=2)
    scatter_handles.append(h)

method_handle_pairs = sorted(zip(sorted_methods, scatter_handles), key=lambda x: x[0].lower())
handles = [h for _, h in method_handle_pairs] + [baseline_line, sota_line]
labels = [format_method_label(m) for m, _ in method_handle_pairs] + ['Worst overall', 'SOTA']

plt.legend(handles, labels, title='', fontsize=28, loc='lower right', bbox_to_anchor=(1, 0),
           frameon=True, facecolor='#f0f5f0', edgecolor='black', framealpha=1.0)
plt.yticks(range(1, len(task_order)+1), range(1, len(task_order)+1), fontsize=28)
plt.xticks(np.arange(0, 2.1, 0.1), [f"{x:.1f}" for x in np.arange(0, 2.1, 0.1)], fontsize=36)
plt.ylim(0.5, len(task_order)+0.5)
plt.xlim(-0.01, 1.08)
plt.gca().invert_yaxis()
plt.xlabel('Normalized Score (March of 9s)', fontsize=32)
plt.ylabel('Task Rank by Normalized Score', fontsize=32)
plt.tight_layout()

plt.savefig("fig_march_of_9s_per_task.pdf", bbox_inches="tight")
plt.show()


In [None]:
# --- March of 9's by Difficulty Band ---

plt.figure(figsize=(32, 8))
ax = plt.gca()
ax.yaxis.grid(False)
ax.xaxis.grid(True, which='major', linestyle='-', linewidth=1.6, color='#bbbbbb')
bg_colors = ['#cbeed7', '#f3f3b7', '#ffd1c2', '#f7b6cd']
difficulty_order = ['Easy', 'Medium', 'Hard', 'Expert']
y_map = {label: i+1 for i, label in enumerate(difficulty_order)}

# Divide tasks into 4 difficulty sections.
# task_order (built in the per-task scatter cell) lists tasks from highest to
# lowest average normalized score, so the first slice is 'Easy' and the last
# is 'Expert'.
n_tasks = len(task_order)
section_size = n_tasks // 4
remainder = n_tasks % 4
# When n_tasks is not a multiple of 4, the first `remainder` sections get one extra task
sizes = [section_size + (1 if i < remainder else 0) for i in range(4)]
sections = {}             # task name -> difficulty label
category_task_ranks = {}  # difficulty label -> (first rank, last rank), 1-based and inclusive
start = 0
for i, size in enumerate(sizes):
    end = start + size
    label = difficulty_order[i]
    for task in task_order[start:end]:
        sections[task] = label
    # An empty section (fewer than 4 tasks overall) has no rank range
    category_task_ranks[label] = (start + 1, end) if task_order[start:end] else (None, None)
    start = end
df_avg['difficulty'] = df_avg['task'].map(sections)

# Compute section averages
task_method_avg = df_avg.groupby(['task', 'method'], as_index=False)['normalized_march'].mean()
task_method_avg['difficulty'] = task_method_avg['task'].map(sections)
section_avg = task_method_avg.groupby(['difficulty', 'method'], as_index=False)['normalized_march'].mean()

# Scaffold averages
scaffolds_to_show = ['greedy', 'mlgym', 'oneshot']
scaffold_colors_display = {'greedy': 'red', 'oneshot': 'blue', 'mlgym': 'green'}
scaffold_display_names = {'greedy': 'Greedy', 'mlgym': 'ReAct', 'oneshot': 'One-Shot'}
prefix_section_avgs = {s: section_avg[section_avg['method'].str.startswith(s)].groupby('difficulty', as_index=False)['normalized_march'].mean() for s in scaffolds_to_show}
overall_section_avg = task_method_avg.groupby('difficulty', as_index=False)['normalized_march'].mean()

# Draw backgrounds
for i, label in enumerate(difficulty_order):
    ax.axhspan(y_map[label]-0.5, y_map[label]+0.5, color=bg_colors[i], alpha=0.4, zorder=0)

# Plot method points
for method in sorted(methods):
    group = section_avg[section_avg['method'] == method]
    y = [y_map[d] for d in group['difficulty']]
    plt.scatter(group['normalized_march'], y, label=format_method_label(method),
                color=method_to_color.get(method, 'gray'), marker=method_to_shape.get(method, 'o'),
                s=360, alpha=0.8, zorder=2)

# Add text annotations
eps = 0.05
plt.xlim(0.0-eps, 1.0+eps)
right_x = ax.get_xlim()[1]
x_text = right_x + 0.02
avg_spacing = 0.18

for i, label in enumerate(difficulty_order):
    y_pos = y_map[label]
    # Overall
    overall_val = overall_section_avg[overall_section_avg['difficulty'] == label]['normalized_march']
    if not overall_val.empty:
        plt.text(x_text, y_pos + avg_spacing * 1.5, f"overall: {overall_val.values[0]:.2f}",
                 fontsize=18, color='black', fontweight='bold', va='center', ha='left', clip_on=False)
    # Scaffolds
    for idx, scaffold in enumerate(['greedy', 'mlgym', 'oneshot']):
        avg_val = prefix_section_avgs[scaffold][prefix_section_avgs[scaffold]['difficulty'] == label]['normalized_march']
        if not avg_val.empty:
            offset = avg_spacing * (0.5 - idx)
            plt.text(x_text, y_pos + offset, f"{scaffold_display_names[scaffold]}: {avg_val.values[0]:.2f}",
                     fontsize=18, color=scaffold_colors_display[scaffold], fontweight='bold', va='center', ha='left', clip_on=False)

# Y-axis labels with task rank ranges
ytick_labels = [f"{label} ({category_task_ranks[label][0]} - {category_task_ranks[label][1]})" if category_task_ranks[label][0] else f"{label} (-)" for label in difficulty_order]
plt.yticks([y_map[l] for l in difficulty_order], ytick_labels, fontsize=22)
for ticklabel in ax.get_yticklabels():
    ticklabel.set_fontweight('bold')

plt.xticks(np.arange(0.0, 1.1, 0.1), [f"{x:.1f}" for x in np.arange(0.0, 1.1, 0.1)], fontsize=27)
plt.ylim(0.5, 4.5)
ax.invert_yaxis()
for spine in ax.spines.values():
    spine.set_edgecolor('black')
    spine.set_linewidth(2)

plt.xlabel('Normalized Score (March of 9s)', fontsize=25)
plt.ylabel('Task Difficulty', fontsize=25)
plt.legend(title='Agent', fontsize=20, loc='center left', bbox_to_anchor=(1.10, 0.5),
           frameon=True, facecolor='white', edgecolor='black', framealpha=1.0)
plt.tight_layout()

plt.savefig("fig_march_of_9s_by_difficulty.pdf", bbox_inches="tight")
plt.show()


## 6. ANS Per Task (Linear)

In [None]:
# --- Linear Normalized Score Scatter Plot (Per Task) ---

# Linear normalization (identity function, no log transform)
def linear_normalized(row):
    """Linearly rescale a run's score so the worst score maps to 0 and SOTA to 1.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            ``is_valid``, ``score``, ``sota`` and ``worst_score``.

    Returns:
        float: ``0.0`` when the run is invalid or its score is missing;
        ``nan`` when the normalization is undefined (``sota`` or
        ``worst_score`` is missing, or the two are closer than 1e-10);
        otherwise ``(score - worst) / (sota - worst)`` floored at ``0.0``.
        Values above 1 are not capped.
    """
    # pd.isna also covers None, so no separate `is None` check is needed.
    if not row['is_valid'] or pd.isna(row['score']):
        return 0.0
    score, sota, worst = row['score'], row['sota'], row['worst_score']
    # Missing reference points make the scale undefined. Without this guard
    # `max(0, nan)` would evaluate to 0 and silently report the run as "worst".
    if pd.isna(sota) or pd.isna(worst):
        return np.nan
    denom = sota - worst
    if abs(denom) < 1e-10:
        return np.nan
    # The sign of `denom` handles both higher-is-better and lower-is-better
    # metrics; always return a float so the resulting column has one dtype.
    return max(0.0, float((score - worst) / denom))

# Score every row of `df` on the linear scale.
df['normalized_linear'] = df.apply(linear_normalized, axis=1)

# Average per method per task (rows whose method is 'Overall' are excluded)
df_avg_linear = df[df['method'] != 'Overall'].groupby(['task', 'method'], as_index=False).agg({
    'normalized_linear': 'mean', 'lower_is_better': 'first', 'sota': 'first'
})

# Order tasks by average linear score, best first.
# A single groupby replaces re-filtering the frame once per task. The stable
# sort keeps tied tasks in group order, and tasks whose mean is NaN (undefined
# normalization) are placed last instead of being compared inside `sorted`,
# where NaN keys give an ill-defined order.
task_mean_linear = df_avg_linear.groupby('task')['normalized_linear'].mean()
task_order_linear = task_mean_linear.sort_values(
    ascending=False, kind='mergesort', na_position='last'
).index.tolist()
# 1-based rank of each task, used as its y position in the per-task plot.
task_to_num_linear = {task: i+1 for i, task in enumerate(task_order_linear)}
df_avg_linear['task_num'] = df_avg_linear['task'].map(task_to_num_linear)

# Order methods by average score
sorted_methods_linear = df_avg_linear.groupby('method')['normalized_linear'].mean().sort_values(ascending=False).index.tolist()

# Plot: one row per task (ranked by mean linear score), one marker per method.
plt.figure(figsize=(28, 25))
ax = plt.gca()
ax.set_facecolor("#fffff5")
for spine in ax.spines.values():
    spine.set_edgecolor('black')
    spine.set_linewidth(2)

# Faint horizontal guide line at every task rank.
rank_ticks_linear = range(1, len(task_order_linear)+1)
for rank in rank_ticks_linear:
    ax.axhline(rank, color='gray', linestyle='--', linewidth=0.7, alpha=0.4, zorder=0)

# Vertical reference lines at normalized score 0 and 1.
baseline_line = ax.axvline(0, color='red', linestyle='--', linewidth=2, alpha=0.7, zorder=1, label='Worst overall')
sota_line = ax.axvline(1, color='black', linestyle='-', linewidth=2, alpha=0.7, zorder=1, label='SOTA')

# Draw methods in score order, keeping each scatter handle for the legend.
scatter_handles = []
for method in sorted_methods_linear:
    group = df_avg_linear[df_avg_linear['method'] == method]
    scatter_handles.append(
        ax.scatter(
            group['normalized_linear'],
            group['task_num'],
            label=method,
            color=method_to_color.get(method, 'gray'),
            marker=method_to_shape.get(method, 'o'),
            s=350,
            alpha=0.85,
            zorder=2,
        )
    )

# Legend entries: methods alphabetically (case-insensitive), then the two
# reference lines.
method_handle_pairs = sorted(zip(sorted_methods_linear, scatter_handles), key=lambda pair: pair[0].lower())
handles = [handle for _, handle in method_handle_pairs]
handles += [baseline_line, sota_line]
labels = [format_method_label(name) for name, _ in method_handle_pairs]
labels += ['Worst overall', 'SOTA']

ax.legend(handles, labels, title='', fontsize=28, loc='lower right', bbox_to_anchor=(1, 0),
          frameon=True, facecolor='#fffff5', edgecolor='black', framealpha=1.0)

# Ticks are set before the limits so the limits below decide the visible range.
ax.set_yticks(rank_ticks_linear)
ax.set_yticklabels(rank_ticks_linear, fontsize=28)
xtick_values_linear = np.arange(-0.1, 2.51, 0.1)
ax.set_xticks(xtick_values_linear)
ax.set_xticklabels([f"{x:.1f}" for x in xtick_values_linear], fontsize=36)
ax.set_ylim(0.5, len(task_order_linear)+0.5)
ax.set_xlim(-0.05, 1.05)
# Rank 1 goes at the top.
ax.invert_yaxis()
ax.set_xlabel('Normalized Score (Linear)', fontsize=32)
ax.set_ylabel('Task Rank by Normalized Score', fontsize=32)
plt.tight_layout()

plt.savefig("fig_linear_scale_per_task.pdf", bbox_inches="tight")
plt.show()


In [None]:
# --- Linear Normalized Score by Difficulty Band ---

# Walk the linear task ranking in consecutive slices whose lengths come from
# `sizes`; tag every task in a slice with the matching difficulty label and
# record the slice's 1-based rank span.
sections_linear = {}
category_task_ranks_linear = {}
start = 0
for i, size in enumerate(sizes):
    label = difficulty_order[i]
    end = start + size
    band_tasks = task_order_linear[start:end]
    sections_linear.update(dict.fromkeys(band_tasks, label))
    # An empty slice gets a (None, None) placeholder instead of a rank span.
    category_task_ranks_linear[label] = (start + 1, end) if band_tasks else (None, None)
    start = end
df_avg_linear['difficulty'] = df_avg_linear['task'].map(sections_linear)

# Per (task, method) mean, labelled with its band, then averaged per
# (difficulty, method).
task_method_avg_linear = (
    df_avg_linear
    .groupby(['task', 'method'], as_index=False)['normalized_linear']
    .mean()
    .assign(difficulty=lambda frame: frame['task'].map(sections_linear))
)
section_avg_linear = (
    task_method_avg_linear
    .groupby(['difficulty', 'method'], as_index=False)['normalized_linear']
    .mean()
)

# Scaffold averages: for each scaffold prefix, the mean over the section
# averages of the methods whose name starts with that prefix.
prefix_section_avgs_linear = {}
for s in scaffolds_to_show:
    scaffold_rows = section_avg_linear[section_avg_linear['method'].str.startswith(s)]
    prefix_section_avgs_linear[s] = scaffold_rows.groupby('difficulty', as_index=False)['normalized_linear'].mean()

# Overall average per band across all (task, method) rows.
overall_section_avg_linear = (
    task_method_avg_linear
    .groupby('difficulty', as_index=False)['normalized_linear']
    .mean()
)

# Plot: one horizontal band per difficulty level, one marker per method.
plt.figure(figsize=(32, 8))
ax = plt.gca()
ax.yaxis.grid(False)
ax.xaxis.grid(True, which='major', linestyle='-', linewidth=1.6, color='#bbbbbb')

# Draw backgrounds: shade each band one unit tall, centred on its y position.
for i, label in enumerate(difficulty_order):
    ax.axhspan(y_map[label]-0.5, y_map[label]+0.5, color=bg_colors[i], alpha=0.4, zorder=0)

# Plot method points
for method in sorted(methods):
    group = section_avg_linear[section_avg_linear['method'] == method]
    y = [y_map[d] for d in group['difficulty']]
    plt.scatter(group['normalized_linear'], y, label=format_method_label(method),
                color=method_to_color.get(method, 'gray'), marker=method_to_shape.get(method, 'o'),
                s=360, alpha=0.8, zorder=2)

# Add text annotations just right of the axes: fix the x-limits first, then
# place the text slightly beyond the right limit (clip_on=False keeps text
# outside the axes visible).
plt.xlim(0.0-eps, 1.0+eps)
right_x = ax.get_xlim()[1]
x_text = right_x + 0.02

for i, label in enumerate(difficulty_order):
    y_pos = y_map[label]
    # Overall
    overall_val = overall_section_avg_linear[overall_section_avg_linear['difficulty'] == label]['normalized_linear']
    if not overall_val.empty:
        plt.text(x_text, y_pos + avg_spacing * 1.5, f"overall: {overall_val.values[0]:.2f}",
                 fontsize=18, color='black', fontweight='bold', va='center', ha='left', clip_on=False)
    # Scaffolds (fixed display order; idx sets the vertical offset per scaffold)
    for idx, scaffold in enumerate(['greedy', 'mlgym', 'oneshot']):
        # prefix_section_avgs_linear only has entries for `scaffolds_to_show`.
        # Skip scaffolds that were not requested instead of raising KeyError;
        # the remaining annotations keep their positions.
        scaffold_avgs = prefix_section_avgs_linear.get(scaffold)
        if scaffold_avgs is None:
            continue
        avg_val = scaffold_avgs[scaffold_avgs['difficulty'] == label]['normalized_linear']
        if not avg_val.empty:
            offset = avg_spacing * (0.5 - idx)
            plt.text(x_text, y_pos + offset, f"{scaffold_display_names[scaffold]}: {avg_val.values[0]:.2f}",
                     fontsize=18, color=scaffold_colors_display[scaffold], fontweight='bold', va='center', ha='left', clip_on=False)

# Y-axis labels: "<difficulty> (<first rank> - <last rank>)", or
# "<difficulty> (-)" for a band that received no tasks.
ytick_labels_linear = [
    f"{label} ({category_task_ranks_linear[label][0]} - {category_task_ranks_linear[label][1]})"
    if category_task_ranks_linear[label][0] is not None
    else f"{label} (-)"
    for label in difficulty_order
]
plt.yticks([y_map[l] for l in difficulty_order], ytick_labels_linear, fontsize=22)
for ticklabel in ax.get_yticklabels():
    ticklabel.set_fontweight('bold')

# X ticks every 0.1 with one-decimal labels; build the tick array once so
# positions and labels always come from the same values.
xtick_values_by_difficulty = np.arange(0.0, 1.1, 0.1)
plt.xticks(xtick_values_by_difficulty, [f"{x:.1f}" for x in xtick_values_by_difficulty], fontsize=27)
# NOTE(review): hard-coded limits assume y_map places the bands at y = 1..4.
plt.ylim(0.5, 4.5)
# Flip the y-axis so the smallest y position is drawn at the top.
ax.invert_yaxis()
for spine in ax.spines.values():
    spine.set_edgecolor('black')
    spine.set_linewidth(2)

plt.xlabel('Normalized Score (Linear)', fontsize=25)
plt.ylabel('Task Difficulty', fontsize=25)
# The legend is anchored at x = 1.10, to the right of the plotting area.
plt.legend(title='Agent', fontsize=20, loc='center left', bbox_to_anchor=(1.10, 0.5),
           frameon=True, facecolor='white', edgecolor='black', framealpha=1.0)
plt.tight_layout()

plt.savefig("fig_linear_scale_by_difficulty.pdf", bbox_inches="tight")
plt.show()
