In [34]:
from glob import glob
import pandas as pd
from collections import defaultdict

In [35]:
# Experiment grid. The loading cell looks for exactly one result file per
# (dataset, optimizer, model, seed) combination built from these lists.
datasets = [
    "gsm8k",
    "rte",
    "sst-5",
    "agnews",
    "subj",
]
optimizers = [
    "EvoPromptGA",
    "CAPO",
]
models = [
    "llama",
    "mistral",
    "qwen",
]
seeds = [
    42,
    43,
    44,
]

In [51]:
# Load the evaluation log of every run and annotate each row with the
# cumulative number of input tokens consumed up to (and including) its step.
# Result: dfs[(dataset, optimizer, model, seed)] -> DataFrame of that run.
dfs = defaultdict(pd.DataFrame)

run_grid = (
    (d, o, m, s)
    for d in datasets
    for o in optimizers
    for m in models
    for s in seeds
)

for d, o, m, s in run_grid:
    pattern = f"../results/{d}/{m}/{o}/seed{s}/*/*/step_results_eval.csv"
    matches = glob(pattern)

    # Exactly one result file is expected per run; anything else is reported
    # and the run is left out of `dfs`.
    if len(matches) != 1:
        print(f"weird number of files: {len(matches)} for {pattern}")
        continue

    run_df = pd.read_csv(matches[0])

    # Per-step token usage = meta-LLM input tokens + downstream-LLM input tokens,
    # taken from groupby("step").first().
    # NOTE(review): presumably the token counts are repeated on every row of a
    # step, which is why one value per step is enough — confirm against the logger.
    per_step = run_df.groupby("step").first()
    tokens_per_step = (
        per_step["input_tokens_meta_llm"] + per_step["input_tokens_downstream_llm"]
    )

    # Broadcast the running total back onto every row of the matching step.
    cumulative_tokens = tokens_per_step.cumsum()
    run_df["cum_token"] = run_df["step"].map(cumulative_tokens)

    dfs[(d, o, m, s)] = run_df

weird number of files: 0 for ../results/subj/mistral/CAPO/seed43/*/*/step_results_eval.csv
weird number of files: 0 for ../results/subj/mistral/CAPO/seed44/*/*/step_results_eval.csv


In [68]:
# Token budget for the comparison: the aggregation cell only considers steps
# whose cumulative token count (`cum_token`) is strictly below this value.
CUTOFF_TOKENS = 5_000_000

In [77]:
model = "qwen"
results = {"optimizer": [], "dataset": [], "mean": [], "std": []}

# For every (dataset, optimizer) pair: per seed, pick the last step whose
# cumulative token count is still below CUTOFF_TOKENS and keep the row with the
# highest test_score at that step; then aggregate those scores across seeds.
for d in datasets:
    for o in optimizers:
        best_rows = []
        for s in seeds:
            # `dfs` is a defaultdict(pd.DataFrame): indexing a missing key never
            # raises KeyError, it silently inserts and returns an empty frame.
            # .get() does not trigger the default factory, and the emptiness
            # check also covers placeholders inserted by earlier lookups.
            run = dfs.get((d, o, model, s))
            if run is None or run.empty:
                print(f"skipping missing run: {(d, o, model, s)}")
                continue
            run = run.assign(seed=s)

            steps_in_budget = run.loc[run["cum_token"] < CUTOFF_TOKENS, "step"]
            if steps_in_budget.empty:
                # Not a single step fits the budget, so this seed has no score.
                print(f"no step under {CUTOFF_TOKENS} tokens for {(d, o, model, s)}")
                continue
            last_step = steps_in_budget.max()

            # Best-scoring row at the last step within budget.
            best_rows.append(run[run["step"] == last_step].nlargest(1, "test_score"))

        if best_rows:
            scores = pd.concat(best_rows)["test_score"]
            mean_score = scores.mean()
            # Sample std (ddof=1): NaN when only one seed is available.
            std_score = scores.std()
        else:
            # pd.concat([]) would raise; keep the row so all lists stay aligned.
            mean_score = std_score = float("nan")

        results["mean"].append(mean_score)
        results["std"].append(std_score)

        results["optimizer"].append(o)
        results["dataset"].append(d)

In [78]:
# Reshape the long results table into one row per optimizer, with
# (statistic, dataset) column pairs produced by the pivot.
df = (
    pd.DataFrame(results)
    .set_index("optimizer")
    .pivot(columns="dataset")
)

# Cross-dataset average, in percent. Computed before the per-dataset means are
# rounded so that it is based on the unrounded values.
df["avg"] = df["mean"].mean(axis=1).mul(100).round(2)

# Express both statistics in percent with one decimal place.
for stat in ("mean", "std"):
    df[stat] = df[stat].mul(100).round(1)

In [79]:
# Render every (optimizer, dataset) cell as "mean ± std" with one decimal.
# The formatting is done element-wise via Series.map on each column:
# DataFrame.apply passes whole columns to the function, so slicing inside it
# (e.g. `x[:5]`) would cut rows rather than characters and silently turn every
# row after the fifth into NaN.
mean_text = df["mean"].apply(lambda col: col.map("{:.1f}".format))
std_text = df["std"].apply(lambda col: col.map("{:.1f}".format))

df["mean"] = mean_text + " ± " + std_text
# The std columns are now folded into the text, so drop them from the table.
df = df.drop(columns=["std"])
df

Unnamed: 0_level_0,mean,mean,mean,mean,mean,avg
dataset,agnews,gsm8k,rte,sst-5,subj,Unnamed: 6_level_1
optimizer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
CAPO,82.6 ± 2.4,46.8 ± 5.1,82.6 ± 3.6,60.0 ± 1.7,88.8 ± 2.4,72.15
EvoPromptGA,81.7 ± 2.0,54.1 ± 8.1,87.8 ± 1.5,58.3 ± 1.4,75.4 ± 3.0,71.45
