# Benchmarking Results

In [None]:
from capo.analysis.utils import (
    get_results,
    generate_comparison_table,
)
from capo.analysis.visualizations import (
    plot_population_members,
    plot_population_scores_comparison,
    plot_length_score,
    plot_performance_profile_curve,
)

import os

# Move the working directory two levels up. The path is relative, so every
# re-run of this cell moves up another two levels.
# NOTE(review): presumably this targets the repository root so that result
# paths resolve - confirm, and run this cell only once per kernel session.
os.chdir("../../")

In [None]:
# Optimizer names passed to the plotting helpers in this notebook.
OPTIMS = [
    "CAPO",
    "OPRO",
    "EvoPromptGA",
    "PromptWizard",
    "Initial",
]
# Same selection without "PromptWizard" and "Initial".
OPTIMS_NO_SING = ["CAPO", "EvoPromptGA", "OPRO"]
# Same selection without "PromptWizard" only.
OPTIMS_NO_WIZ = [*OPTIMS_NO_SING, "Initial"]

# Dataset and model identifiers covered by the benchmark.
DATASETS = [
    "sst-5",
    "agnews",
    "copa",
    "gsm8k",
    "subj",
]
MODELS = [
    "llama",
    "qwen",
    "mistral",
]

We always use the mean for aggregation, since it better depicts the overall optimization process.

# Performance Profile

In [None]:
# Draw the performance profile curve using the helper's default arguments.
plot_performance_profile_curve()

# Benchmark Results

## Mistral

In [None]:
# Comparison table for the "mistral" model. As the cell's last expression,
# its return value is shown as the cell output.
generate_comparison_table(model="mistral")

In [None]:
# Score comparison for dataset "sst-5" and model "mistral" across all entries
# of OPTIMS, aggregated with agg="mean" and drawn against "input_tokens_cum".
# This is the only comparison cell that passes an explicit figsize.
# The trailing ";" stops the notebook from echoing the call's return value.
plot_population_scores_comparison(
    "sst-5",
    "mistral",
    OPTIMS,
    agg="mean",
    plot_seeds=False,
    plot_stddev=True,
    x_col="input_tokens_cum",
    figsize=(5.4, 3),
);

In [None]:
# Score comparison for dataset "agnews" and model "mistral" across all entries
# of OPTIMS, aggregated with agg="mean" and drawn against "input_tokens_cum".
# The trailing ";" stops the notebook from echoing the call's return value.
plot_population_scores_comparison(
    "agnews",
    "mistral",
    OPTIMS,
    agg="mean",
    plot_seeds=False,
    plot_stddev=True,
    x_col="input_tokens_cum",
);

In [None]:
# Score comparison for dataset "subj" and model "mistral" across all entries
# of OPTIMS, aggregated with agg="mean" and drawn against "input_tokens_cum".
# The trailing ";" stops the notebook from echoing the call's return value.
plot_population_scores_comparison(
    "subj",
    "mistral",
    OPTIMS,
    agg="mean",
    plot_seeds=False,
    plot_stddev=True,
    x_col="input_tokens_cum",
);

In [None]:
# Score comparison for dataset "gsm8k" and model "mistral" across all entries
# of OPTIMS, aggregated with agg="mean" and drawn against "input_tokens_cum".
# The trailing ";" stops the notebook from echoing the call's return value.
plot_population_scores_comparison(
    "gsm8k",
    "mistral",
    OPTIMS,
    agg="mean",
    plot_seeds=False,
    plot_stddev=True,
    x_col="input_tokens_cum",
);

--> use the plot above (GSM8K, Mistral) in the main paper, next to the table

In [None]:
# Score comparison for dataset "copa" and model "mistral" across all entries
# of OPTIMS, aggregated with agg="mean" and drawn against "input_tokens_cum".
# The trailing ";" stops the notebook from echoing the call's return value.
plot_population_scores_comparison(
    "copa",
    "mistral",
    OPTIMS,
    agg="mean",
    plot_seeds=False,
    plot_stddev=True,
    x_col="input_tokens_cum",
);

In [None]:
import seaborn as sns

In [None]:
# Compare CAPO, CAPO without racing and EvoPromptGA on "agnews" with "llama":
# "input_tokens_sum" is plotted per "step" instead of the default score column.
# The first and third curve reuse entries 0 and 2 of seaborn's "Dark2"
# palette; the middle curve gets an explicit hex colour.
dark2 = sns.color_palette("Dark2")
curve_colors = [dark2[0], "#66D874", dark2[2]]
plot_population_scores_comparison(
    "agnews",
    "llama",
    ["CAPO", "CAPO_no_racing", "EvoPromptGA"],
    colors=curve_colors,
    labels=["CAPO", "CAPO w/o Racing", "EvoPromptGA"],
    agg="mean",
    plot_seeds=False,
    plot_stddev=1,
    x_col="step",
    score_col="input_tokens_sum",
);

In [None]:
# Compare CAPO, CAPO without racing and EvoPromptGA on "subj" with "mistral":
# "input_tokens_sum" is plotted per "step".
# The middle optimizer name used to be an empty string, which is not the name
# of any optimizer used elsewhere in this notebook.
# NOTE(review): "CAPO_no_racing" is assumed to be the intended entry, mirroring
# the agnews/llama cell - confirm that results for it exist for subj/mistral.
plot_population_scores_comparison(
    "subj",
    "mistral",
    ["CAPO", "CAPO_no_racing", "EvoPromptGA"],
    agg="mean",
    plot_seeds=False,
    plot_stddev=1,
    x_col="step",
    score_col="input_tokens_sum",
);

## Qwen

In [None]:
# Comparison table for the "qwen" model. As the cell's last expression,
# its return value is shown as the cell output.
generate_comparison_table(model="qwen")

In [None]:
# Score comparison for dataset "sst-5" and model "qwen" across all entries
# of OPTIMS, aggregated with agg="mean" and drawn against "input_tokens_cum".
# The trailing ";" stops the notebook from echoing the call's return value.
plot_population_scores_comparison(
    "sst-5",
    "qwen",
    OPTIMS,
    agg="mean",
    plot_seeds=False,
    plot_stddev=True,
    x_col="input_tokens_cum",
);

In [None]:
# Score comparison for dataset "agnews" and model "qwen" across all entries
# of OPTIMS, aggregated with agg="mean" and drawn against "input_tokens_cum".
# The trailing ";" stops the notebook from echoing the call's return value.
plot_population_scores_comparison(
    "agnews",
    "qwen",
    OPTIMS,
    agg="mean",
    plot_seeds=False,
    plot_stddev=True,
    x_col="input_tokens_cum",
);

In [None]:
# Score comparison for dataset "subj" and model "qwen" across all entries
# of OPTIMS, aggregated with agg="mean" and drawn against "input_tokens_cum".
# The trailing ";" stops the notebook from echoing the call's return value.
plot_population_scores_comparison(
    "subj",
    "qwen",
    OPTIMS,
    agg="mean",
    plot_seeds=False,
    plot_stddev=True,
    x_col="input_tokens_cum",
);

--> use the plot above (Subj, Qwen) in the main paper on the first page because it is beautiful 💕

In [None]:
# Score comparison for dataset "gsm8k" and model "qwen" across all entries
# of OPTIMS, aggregated with agg="mean" and drawn against "input_tokens_cum".
# The trailing ";" stops the notebook from echoing the call's return value.
plot_population_scores_comparison(
    "gsm8k",
    "qwen",
    OPTIMS,
    agg="mean",
    plot_seeds=False,
    plot_stddev=True,
    x_col="input_tokens_cum",
);

In [None]:
# Score comparison for dataset "copa" and model "qwen" across all entries
# of OPTIMS, aggregated with agg="mean" and drawn against "input_tokens_cum".
# The trailing ";" stops the notebook from echoing the call's return value.
plot_population_scores_comparison(
    "copa",
    "qwen",
    OPTIMS,
    agg="mean",
    plot_seeds=False,
    plot_stddev=True,
    x_col="input_tokens_cum",
);

## Llama

In [None]:
# Comparison table for the "llama" model. As the cell's last expression,
# its return value is shown as the cell output.
generate_comparison_table(model="llama")

In [None]:
# Score comparison for dataset "sst-5" and model "llama" across all entries
# of OPTIMS, aggregated with agg="mean" and drawn against "input_tokens_cum".
# The trailing ";" stops the notebook from echoing the call's return value.
plot_population_scores_comparison(
    "sst-5",
    "llama",
    OPTIMS,
    agg="mean",
    plot_seeds=False,
    plot_stddev=True,
    x_col="input_tokens_cum",
);

In [None]:
# Score comparison for dataset "agnews" and model "llama" across all entries
# of OPTIMS, aggregated with agg="mean" and drawn against "input_tokens_cum".
# The trailing ";" stops the notebook from echoing the call's return value.
plot_population_scores_comparison(
    "agnews",
    "llama",
    OPTIMS,
    agg="mean",
    plot_seeds=False,
    plot_stddev=True,
    x_col="input_tokens_cum",
);

PromptWizard performs really badly for Llama.

In [None]:
# Score comparison for dataset "subj" and model "llama" across all entries
# of OPTIMS, aggregated with agg="mean" and drawn against "input_tokens_cum".
# The trailing ";" stops the notebook from echoing the call's return value.
plot_population_scores_comparison(
    "subj",
    "llama",
    OPTIMS,
    agg="mean",
    plot_seeds=False,
    plot_stddev=True,
    x_col="input_tokens_cum",
);

In [None]:
# Score comparison for dataset "gsm8k" and model "llama" across all entries
# of OPTIMS, aggregated with agg="mean" and drawn against "input_tokens_cum".
# Unlike the other comparison cells, this call leaves plot_seeds at its
# default and additionally passes n_seeds_to_plot_std=1.
# NOTE(review): presumably needed because fewer seeds are available for this
# combination - confirm that the deviation is intentional.
# The trailing ";" stops the notebook from echoing the call's return value.
plot_population_scores_comparison(
    "gsm8k",
    "llama",
    OPTIMS,
    agg="mean",
    plot_stddev=True,
    x_col="input_tokens_cum",
    n_seeds_to_plot_std=1,
);

In [None]:
# Score comparison for dataset "copa" and model "llama" across all entries
# of OPTIMS, aggregated with agg="mean" and drawn against "input_tokens_cum".
# This cell used to repeat the ("agnews", "mistral") plot inside the Llama
# section, leaving "copa" as the only dataset without a Llama plot.
# The trailing ";" stops the notebook from echoing the call's return value.
plot_population_scores_comparison(
    "copa",
    "llama",
    OPTIMS,
    agg="mean",
    plot_seeds=False,
    plot_stddev=True,
    x_col="input_tokens_cum",
);

Candidates for main paper
- GSM8K (because it is the most relevant dataset)
- Subj using qwen (because it has beautiful curves)

Takeaways:
- PromptWizard's performance is highly dependent on the model used (=> strict templates!)

# Table Results

In [None]:
# One comparison table per model, with cutoff_tokens set to 1_000_000.
# Each table is preceded by a line holding the model name.
token_cutoff = 1_000_000
for model in MODELS:
    print(model)
    table = generate_comparison_table(model=model, cutoff_tokens=token_cutoff)
    display(table)

In [None]:
# One comparison table per model, with cutoff_tokens set to 3_000_000.
# Each table is preceded by a "Model: <name>" line.
token_cutoff = 3_000_000
for model in MODELS:
    print(f"Model: {model}")
    table = generate_comparison_table(model=model, cutoff_tokens=token_cutoff)
    display(table)

In [None]:
# One comparison table per model, without passing cutoff_tokens.
# Each table is preceded by a line holding the model name.
for model in MODELS:
    print(model)
    table = generate_comparison_table(model=model)
    display(table)

Even if we cut off very early (already at 1_000_000 tokens), we outperform the other optimizers in nearly all cases.

## Prompt Length Analysis

In [None]:
# Plot "test_score" against "prompt_len" for dataset "gsm8k" and model
# "mistral", covering CAPO, OPRO, EvoPromptGA and PromptWizard ("Initial" is
# not included). log_scale is switched off.
# The trailing ";" stops the notebook from echoing the call's return value.
plot_length_score(
    "gsm8k",
    "mistral",
    ["CAPO", "OPRO", "EvoPromptGA", "PromptWizard"],
    x_col="prompt_len",
    score_col="test_score",
    log_scale=False,
);

=> maybe we are cost-aware in the sense that we are evaluating the entire "front" (EvoPrompt and OPRO prompts are very short and PromptWizard prompts very long)

- PromptWizard has extremely long prompts that can only sometimes compete with the competitors

=> interesting for plotting: 
- subj using qwen or gsm8k using mistral => shows that we have a huge range

## Prompt Length Tables

In [None]:
# One comparison table per model, built on the "prompt_len" column instead of
# the default score column. Each table is preceded by the model name.
length_column = "prompt_len"
for model in MODELS:
    print(model)
    table = generate_comparison_table(model=model, score_col=length_column)
    display(table)

# Prompt Survival Analysis

In [None]:
# Population members of CAPO on dataset "subj" with model "mistral",
# restricted to seed 42: "test_score" is drawn per "step".
# The trailing ";" stops the notebook from echoing the call's return value.
plot_population_members("subj", "mistral", "CAPO", x_col="step", score_col="test_score", seeds=[42]);

## Performance & Examples

In [None]:
from pprint import pprint as pp

In [None]:
# Print the prompt with the highest "test_score" for every
# dataset / model / optimizer combination listed in the loops below.
# NOTE(review): the optimizer is spelled "capo" here while the plotting cells
# pass "CAPO" - confirm that get_results accepts this spelling.
fence = "'''"
for dataset in ["subj"]:
    for model in ["mistral"]:
        for optim in ["capo"]:
            print(f"Dataset: {dataset}, Model: {model}, Optimizer: {optim}")
            results = get_results(dataset=dataset, model=model, optim=optim)

            # Nothing to report for this combination; only the header line
            # above is printed.
            if results.empty:
                continue

            # Single best row by test score, unpacked as (prompt, score).
            top_row = results.nlargest(1, "test_score")
            best_prompt, best_score = top_row[["prompt", "test_score"]].values[0]

            print(best_score)
            print(fence)
            pp(best_prompt)
            print(fence)

CAPO can be very repetitive? (SST-5, Mistral) Potentially the crossover meta-prompt has been misinterpreted (merge the two prompts) => however, it still performs better!

Subj with CAPO has an extreme upward outlier for both Qwen and Llama.
