In [32]:
# Name of the DataFrame column used as the x-axis of every plot. load_data()
# only keeps CSVs that contain this column, and the plotting functions group
# test scores by its values.
COL = "steps"

In [None]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, Image
from collections import defaultdict


def load_data(base_dir="../results"):
    """Load per-seed evaluation CSVs and group them by experiment.

    Recursively searches ``base_dir`` for ``step_results_eval.csv`` files laid
    out as ``<base_dir>/<dataset>/<model>/<optimizer>/<seed>/.../step_results_eval.csv``
    and skips every file whose path below ``base_dir`` contains ``"hyper"``.

    Each CSV gets two derived columns:

    * ``total_tokens``: sum of the four input/output token columns.
    * ``cum_tokens``: cumulative ``total_tokens`` of all steps up to and
      including ``step - 1`` (NaN where ``step - 1`` is not a step in the file).

    Args:
        base_dir: Root directory of the results tree.

    Returns:
        Dict mapping ``"<dataset>-<model>-<optimizer>"`` to a dict with keys
        ``"dataset"``, ``"model"``, ``"optimizer"`` and ``"seed_data"`` (seed
        directory name -> DataFrame). Empty when ``base_dir`` does not exist
        or no usable CSV is found.
    """
    token_columns = (
        "input_tokens_meta_llm",
        "output_tokens_meta_llm",
        "input_tokens_downstream_llm",
        "output_tokens_downstream_llm",
    )

    if not os.path.exists(base_dir):
        print(f"Error: Base directory {base_dir} not found")
        return {}

    # Collect the CSVs as paths relative to base_dir, so that neither the
    # "hyper" filter nor the positional parsing below depends on how base_dir
    # itself is spelled (e.g. the leading ".." of "../results").
    pattern = os.path.join(base_dir, "**", "step_results_eval.csv")
    csv_files = []
    for path in sorted(glob.glob(pattern, recursive=True)):
        rel_path = os.path.relpath(path, base_dir)
        if "hyper" not in rel_path:
            csv_files.append((path, rel_path))

    print(f"Found {len(csv_files)} CSV files (excluding 'hyper' paths)")

    # Group files by (dataset, model, optimizer); each entry is (seed, path).
    grouped_files = defaultdict(list)
    for path, rel_path in csv_files:
        parts = rel_path.split(os.sep)
        # Four directory levels plus the file name are the minimum layout.
        if len(parts) < 5:
            print(f"  Skipping {path}: unexpected directory layout")
            continue
        dataset, model, optimizer, seed = parts[:4]
        grouped_files[(dataset, model, optimizer)].append((seed, path))

    print(f"Grouped into {len(grouped_files)} dataset-model-optimizer combinations")

    processed_data = {}
    for (dataset, model, optimizer), files in grouped_files.items():
        key = f"{dataset}-{model}-{optimizer}"
        print(f"Processing {key} with {len(files)} seed files")

        seed_data = {}
        for seed, path in files:
            df = pd.read_csv(path)

            # The derived columns need these inputs; skip malformed files
            # instead of aborting the whole load with a KeyError.
            missing = [c for c in ("step", *token_columns) if c not in df.columns]
            if missing:
                print(f"  Skipping {path}: missing columns {missing}")
                continue

            df["total_tokens"] = df[list(token_columns)].sum(axis=1)
            cum_tokens_per_step = df.groupby("step")["total_tokens"].sum().cumsum()
            # Tokens spent before a step started, i.e. the cumulative total of
            # the previous step.
            df["cum_tokens"] = (df["step"] - 1).map(cum_tokens_per_step)

            # COL may be one of the derived columns, so check it only now.
            if not df.empty and COL in df.columns and "test_score" in df.columns:
                seed_data[seed] = df
                print(f"  Loaded {os.path.basename(path)} for {seed}")

        if not seed_data:
            print(f"  No valid data for {key}, skipping")
            continue

        processed_data[key] = {
            # Hyphen-free dataset name so that "<dataset>-<model>" style keys
            # built downstream can be split on "-" again.
            "dataset": dataset.replace("sst-5", "sst5"),
            "model": model,
            "optimizer": optimizer,
            "seed_data": seed_data,
        }

    return processed_data


def create_mean_plots(data, output_dir="plots"):
    """Render one mean-score curve per experiment and save it as a PNG.

    For each experiment in ``data`` and each distinct value of the ``COL``
    column, the ``test_score`` of the first matching row of every seed is
    collected. The curve is the mean of those values and the shaded band is
    +/- one standard deviation (``np.std``; 0 when only one seed contributes).

    Args:
        data: Mapping of experiment key to a dict with "dataset", "model",
            "optimizer" and "seed_data" (seed name -> DataFrame) entries.
        output_dir: Directory for the images; created if it does not exist.

    Returns:
        Dict mapping experiment key to the path of the saved plot.
        Experiments that yield no statistics are omitted.
    """
    os.makedirs(output_dir, exist_ok=True)
    saved = {}

    for key, experiment in data.items():
        dataset = experiment["dataset"]
        model = experiment["model"]
        optimizer = experiment["optimizer"]
        frames = experiment["seed_data"]

        # Every x-axis value seen in any seed, in ascending order
        x_values = sorted(
            {value for frame in frames.values() for value in frame[COL].unique()}
        )

        # Parallel lists: x position, mean and std of the per-seed scores
        xs, means, stds = [], [], []
        for x in x_values:
            firsts = []
            for frame in frames.values():
                matching = frame[frame[COL] == x]
                if matching.empty:
                    continue
                firsts.append(matching["test_score"].iloc[0])

            if not firsts:
                continue
            xs.append(x)
            means.append(np.mean(firsts))
            stds.append(np.std(firsts) if len(firsts) > 1 else 0)

        if not xs:
            continue

        lower = [m - s for m, s in zip(means, stds)]
        upper = [m + s for m, s in zip(means, stds)]

        figure, axes = plt.subplots(figsize=(10, 6))

        # Mean line with the +/- 1 std band around it
        axes.plot(xs, means, "o-", color="blue", linewidth=2, label="Mean Score")
        axes.fill_between(xs, lower, upper, color="blue", alpha=0.2, label="±1 Std Dev")

        axes.set_xlabel(COL)
        axes.set_ylabel("Test Score")
        axes.set_title(f"Mean Score vs Steps: {dataset} - {model} - {optimizer}")
        axes.grid(True, linestyle="--", alpha=0.7)
        axes.legend()

        # Seed count in the bottom-left corner of the figure
        figure.text(0.01, 0.01, f"Seeds: {len(frames)}", fontsize=8)

        target = os.path.join(output_dir, f"mean_{key}.png")
        figure.savefig(target, dpi=300, bbox_inches="tight")
        plt.close(figure)

        saved[key] = target
        print(f"Created mean plot for {key}")

    return saved


def create_max_plots(data, output_dir="plots"):
    """Plot, per experiment, the best test score per step averaged over seeds.

    For every experiment in ``data`` and every distinct value of the ``COL``
    column, the highest ``test_score`` among each seed's rows at that value is
    taken. The curve shows the mean of those per-seed maxima and the shaded
    band shows +/- one standard deviation.

    Args:
        data: Mapping of experiment key to a dict with "dataset", "model",
            "optimizer" and "seed_data" (seed name -> DataFrame) entries.
        output_dir: Directory for the images; created if it does not exist.

    Returns:
        Dict mapping experiment key to the path of the saved plot.
        Experiments that yield no statistics are omitted.
    """
    os.makedirs(output_dir, exist_ok=True)
    plot_paths = {}

    for key, exp_data in data.items():
        dataset = exp_data["dataset"]
        model = exp_data["model"]
        optimizer = exp_data["optimizer"]
        seed_data = exp_data["seed_data"]
        frames = list(seed_data.values())

        # Every x-axis value seen in any seed, in ascending order.
        steps = sorted({value for df in frames for value in df[COL].unique()})

        # Max per seed at each step, then mean/std of those maxima.
        stats = []
        for step in steps:
            max_scores = []
            for df in frames:
                step_scores = df.loc[df[COL] == step, "test_score"]
                if not step_scores.empty:
                    # Best score among all rows of this step; taking only the
                    # first row would make this identical to the mean plot.
                    max_scores.append(step_scores.max())

            if max_scores:
                # np.std is the population std (ddof=0); pass ddof=1 here if
                # Bessel's correction is wanted.
                std_of_max = np.std(max_scores) if len(max_scores) > 1 else 0
                stats.append({COL: step, "mean": np.mean(max_scores), "std": std_of_max})

        if not stats:
            continue

        xs = [s[COL] for s in stats]
        means = [s["mean"] for s in stats]
        stds = [s["std"] for s in stats]

        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            # Mean-of-max line with the +/- 1 std band around it.
            ax.plot(xs, means, "o-", color="red", linewidth=2, label="Mean of Max Scores")
            ax.fill_between(
                xs,
                [m - s for m, s in zip(means, stds)],
                [m + s for m, s in zip(means, stds)],
                color="red",
                alpha=0.2,
                label="±1 Std Dev",
            )

            ax.set_xlabel(COL)
            ax.set_ylabel("Test Score")
            ax.set_title(f"Max Score vs Steps: {dataset} - {model} - {optimizer}")
            ax.grid(True, linestyle="--", alpha=0.7)
            ax.legend()

            # Seed count in the bottom-left corner of the figure.
            fig.text(0.01, 0.01, f"Seeds: {len(seed_data)}", fontsize=8)

            plot_path = os.path.join(output_dir, f"max_{key}.png")
            fig.savefig(plot_path, dpi=300, bbox_inches="tight")
        finally:
            # Release the figure even if drawing or saving failed.
            plt.close(fig)

        plot_paths[key] = plot_path
        print(f"Created max plot for {key}")

    return plot_paths


def _comparison_max_stats(seed_frames, col):
    """Return per-step mean/std of each seed's best ``test_score``.

    ``seed_frames`` is a list of DataFrames (one per seed) and ``col`` the
    column whose values define the steps. The result is a list of dicts with
    keys ``col``, "mean" and "std", ordered by ascending step.
    """
    steps = sorted({value for df in seed_frames for value in df[col].unique()})

    stats = []
    for step in steps:
        max_scores = []
        for df in seed_frames:
            step_scores = df.loc[df[col] == step, "test_score"]
            if not step_scores.empty:
                # Best score among all rows of this step, not just the first.
                max_scores.append(step_scores.max())

        if max_scores:
            std_of_max = np.std(max_scores) if len(max_scores) > 1 else 0
            stats.append({col: step, "mean": np.mean(max_scores), "std": std_of_max})

    return stats


def create_optimizer_comparison_plots(data, output_dir="plots"):
    """Create plots comparing optimizers' max scores for each dataset and model.

    Experiments are grouped by (dataset, model). For every group with at least
    two optimizers, one figure is drawn with a curve per optimizer: the mean
    over seeds of the per-seed maximum ``test_score`` at each value of the
    ``COL`` column, with a +/- one standard deviation band.

    Args:
        data: Mapping of experiment key to a dict with "dataset", "model",
            "optimizer" and "seed_data" (seed name -> DataFrame) entries.
        output_dir: Directory for the images; created if it does not exist.

    Returns:
        Dict mapping ``"<dataset>-<model>"`` to the path of the saved plot.
    """
    os.makedirs(output_dir, exist_ok=True)
    plot_paths = {}

    # Fixed colors for the known optimizers; anything else is drawn in gray.
    colors = {"CAPO": "blue", "EvoPromptGA": "red"}

    # Group by dataset and model, keeping the components next to the key so
    # the key never has to be split apart again.
    dataset_model_pairs = {}
    for exp_data in data.values():
        dataset = exp_data["dataset"]
        model = exp_data["model"]
        pair = dataset_model_pairs.setdefault(
            f"{dataset}-{model}",
            {"dataset": dataset, "model": model, "optimizers": {}},
        )
        pair["optimizers"][exp_data["optimizer"]] = exp_data

    for pair_key, pair in dataset_model_pairs.items():
        optimizers_data = pair["optimizers"]

        # Only create plots if we have data for at least 2 optimizers
        if len(optimizers_data) < 2:
            continue

        # Same hyphen-free dataset spelling as used elsewhere in this file.
        dataset = pair["dataset"].replace("sst-5", "sst5")
        model = pair["model"]
        print(f"Creating optimizer comparison for {dataset}-{model}")

        # Max-score statistics per optimizer
        optimizer_stats = {}
        for optimizer, exp_data in optimizers_data.items():
            seed_data = exp_data["seed_data"]
            stats = _comparison_max_stats(list(seed_data.values()), COL)
            if stats:
                optimizer_stats[optimizer] = {"stats": stats, "num_seeds": len(seed_data)}

        # Skip if we don't have stats for at least 2 optimizers
        if len(optimizer_stats) < 2:
            continue

        fig, ax = plt.subplots(figsize=(12, 7))
        try:
            for optimizer, summary in optimizer_stats.items():
                stats = summary["stats"]
                num_seeds = summary["num_seeds"]

                xs = [s[COL] for s in stats]
                means = [s["mean"] for s in stats]
                stds = [s["std"] for s in stats]
                color = colors.get(optimizer, "gray")

                # Mean-of-max line with its +/- 1 std band
                ax.plot(
                    xs,
                    means,
                    "o-",
                    color=color,
                    linewidth=2,
                    label=f"{optimizer} (Max Score, {num_seeds} seeds)",
                )
                ax.fill_between(
                    xs,
                    [m - s for m, s in zip(means, stds)],
                    [m + s for m, s in zip(means, stds)],
                    color=color,
                    alpha=0.15,
                )

            ax.set_xlabel(COL)
            ax.set_ylabel("Max Score")
            ax.set_title(f"Optimizer Comparison (Max Scores): {dataset} - {model}")
            ax.grid(True, linestyle="--", alpha=0.7)
            ax.legend()

            # The x-axis column is part of the file name, so plots made with
            # different COL settings do not overwrite each other.
            plot_path = os.path.join(
                output_dir, f"optimizer_comparison_max_{dataset}-{model}-{COL}.png"
            )
            fig.savefig(plot_path, dpi=300, bbox_inches="tight")
        finally:
            # Release the figure even if drawing or saving failed.
            plt.close(fig)

        plot_paths[pair_key] = plot_path
        print(f"Created optimizer comparison plot for {pair_key}")

    return plot_paths


def create_all_plots():
    """Load the experiment data and build every plot type from it.

    Returns:
        Dict of plot paths keyed by "mean_<key>", "max_<key>" (one pair per
        loaded experiment; the value is None when that plot was not created)
        and "comparison_<key>" for each optimizer comparison plot. Empty when
        no data could be loaded.
    """
    # A single load feeds all three plot builders
    experiments = load_data()
    if not experiments:
        print("No data found to plot")
        return {}

    by_mean = create_mean_plots(experiments)
    by_max = create_max_plots(experiments)
    by_comparison = create_optimizer_comparison_plots(experiments)

    # Merge everything into one lookup, prefixing each key with its plot type
    combined = {}
    for name in experiments:
        combined.update(
            {
                f"mean_{name}": by_mean.get(name),
                f"max_{name}": by_max.get(name),
            }
        )
    combined.update(
        {f"comparison_{name}": location for name, location in by_comparison.items()}
    )

    return combined


def show_plot(plot_type, dataset, model, optimizer=None, output_dir="plots"):
    """Display a previously saved plot inline.

    Args:
        plot_type: "comparison" / "optimizer_comparison" for an optimizer
            comparison plot; any other value is used as the file-name prefix
            of a per-experiment plot (e.g. "mean" or "max").
        dataset: Dataset name as used in the plot file name.
        model: Model name as used in the plot file name.
        optimizer: Optimizer name; required for per-experiment plots and
            ignored for comparison plots.
        output_dir: Directory the plots were saved to.

    Returns:
        True if a plot file was found and displayed, False otherwise.
    """
    if plot_type in ("comparison", "optimizer_comparison"):
        # For comparison plots, we don't need optimizer
        key = f"{dataset}-{model}"
        # Comparison plots are saved with the x-axis column appended to the
        # name, so look for that first; the un-suffixed name is kept as a
        # fallback for files written under the older naming.
        candidates = [
            f"optimizer_comparison_max_{key}-{COL}.png",
            f"optimizer_comparison_max_{key}.png",
        ]
    else:
        # For regular plots, we need dataset-model-optimizer
        if not optimizer:
            print("Error: optimizer is required for this plot type")
            return False

        key = f"{dataset}-{model}-{optimizer}"
        candidates = [f"{plot_type}_{key}.png"]

    for filename in candidates:
        plot_path = os.path.join(output_dir, filename)
        if os.path.exists(plot_path):
            display(Image(filename=plot_path))
            return True

    # Report the preferred (first) candidate as the expected location.
    print(f"Plot not found: {os.path.join(output_dir, candidates[0])}")
    return False


# Example usage: build every plot type and print how to view one of them
if __name__ == "__main__":
    print("Loading data...")
    data = load_data()
    print(f"Found {len(data)} dataset-model-optimizer combinations")

    if not data:
        print("No data found to create plots")
    else:
        # Per-experiment mean curves
        print("\nCreating mean plots...")
        mean_plots = create_mean_plots(data)
        print(f"Created {len(mean_plots)} mean plots")

        # Per-experiment max curves
        print("\nCreating max plots...")
        max_plots = create_max_plots(data)
        print(f"Created {len(max_plots)} max plots")

        # One figure per dataset/model pair comparing the optimizers
        print("\nCreating optimizer comparison plots...")
        comparison_plots = create_optimizer_comparison_plots(data)
        print(f"Created {len(comparison_plots)} comparison plots")

        # Print a ready-to-paste show_plot call for the first available plot
        if not comparison_plots:
            key = next(iter(data))
            dataset, model, optimizer = key.replace("sst-5", "sst5").split("-")
            print("\nTo view individual plots:")
            print(f"show_plot('mean', '{dataset}', '{model}', '{optimizer}')")
            print(f"show_plot('max', '{dataset}', '{model}', '{optimizer}')")
        else:
            key = next(iter(comparison_plots))
            dataset, model = key.replace("sst-5", "sst5").split("-")
            print("\nTo view the optimizer comparison plot:")
            print(f"show_plot('comparison', '{dataset}', '{model}')")

Loading data...
Found 49 CSV files (excluding 'hyper' paths)
['../results', 'agnews', 'llama', 'EvoPromptGA', 'seed42', 'benchmark_experiment_agnews_llama_EvoPromptGA_42', 'e40f38752a3f2048f661b38c139b9dc69fea3dadc3f0e0b8f1f08f866214d5b5', 'step_results_eval.csv']
['../results', 'agnews', 'llama', 'EvoPromptGA', 'seed43', 'benchmark_experiment_agnews_llama_EvoPromptGA_43', '92f64e3b9745c010156191a4a776d08648e77bf754ce100de86a9e6b37f485ee', 'step_results_eval.csv']
['../results', 'agnews', 'llama', 'EvoPromptGA', 'seed44', 'benchmark_experiment_agnews_llama_EvoPromptGA_44', '838881d6eb3e269ac234c71c0c44e30378235b4d627f5c81152fc231ec05b2e0', 'step_results_eval.csv']
['../results', 'agnews', 'mistral', 'CAPO', 'alpha10_long', 'benchmark_experiment_agnews_mistral_CAPO_alpha10_long', 'eac414135d4beaa2a3c24f16de77a4ba73ef222b9c99448e70552d4574f76b21', 'step_results_eval.csv']
['../results', 'agnews', 'mistral', 'CAPO', 'co10', 'benchmark_experiment_agnews_mistral_CAPO_co10', 'eac414135d4beaa