# Result Analysis
This notebook converts the extracted csv files produced by the experiments into dataframes and calculates central metrics on them as well as visualizing them. Our experiment tries out different Llama models on the Evoprompt tasks with two different al

In [1]:
import sys
from typing import List
import pandas as pd
from pathlib import Path

In [53]:
def read_prompts(target_experiment: str, tasks: List[str]):
    results = pd.DataFrame()
    for logging_dir in Path(f"../logs/{target_experiment}").rglob("*.csv"):
        if "best_scores" in str(logging_dir) or not any(task in str(logging_dir) for task in tasks):
            continue

        result = pd.read_csv(logging_dir)

        logging_dir = str(logging_dir)

        logging_dir = logging_dir.replace(f"..\\logs\\{target_experiment}\\", "")
        logging_dir = logging_dir.replace(".csv", "")

        task_name, optimizer, meta_llm, evaluation_llm, random_seed = logging_dir.split("_")

        metainformation = pd.DataFrame(
            {
                "task": [task_name]*len(result),
                "optimizer": [optimizer]*len(result),
                "meta_llm": [meta_llm]*len(result),
                "evaluation_llm": [evaluation_llm]*len(result),
                "random_seed": [random_seed]*len(result),
            }
        )

        result = pd.concat([result, metainformation], axis=1)

        results = pd.concat([result, results], axis=0)

    return results


df = read_prompts("experiment", ["cr"])

In [26]:
def read_best_scores(target_experiment: str):
    return pd.read_csv(f"../logs/{target_experiment}/best_scores.csv")

best_scores_exp = read_best_scores("experiment")

best_scores_all = best_scores_exp.groupby(["task", "optimizer", "meta_llm", "downstream_llm", "evaluation_llm", "use_task_desc"]).agg("mean").drop(columns="random_seed")
best_scores_all["test_score_std"] = best_scores_exp.groupby(["task", "optimizer", "meta_llm", "downstream_llm", "evaluation_llm", "use_task_desc"]).agg("std").drop(columns="random_seed")["test_score"]
best_scores_all = best_scores_all.reset_index()#.droplevel(1, axis=1).set_index("task")

In [73]:
# TODO: rename the test_score column to differentiate between mean and std
# TODO: replace 'Meta-llama/Meta-Llama-3-70B-Instruct' with 'Llama-3-70B'
# TODO: replace std column with +- in front as string to easy paste

In [78]:
best_scores_all.drop(columns=["downstream_llm", "evaluation_llm", "use_task_desc"]).T

task,agnews,agnews.1,agnews.2,agnews.3,cr,cr.1,cr.2,cr.3,mr,mr.1,...,sst2,sst2.1,subj,subj.1,subj.2,subj.3,trec,trec.1,trec.2,trec.3
optimizer,evopromptde,evopromptde,evopromptga,evopromptga,evopromptde,evopromptde,evopromptga,evopromptga,evopromptde,evopromptde,...,evopromptga,evopromptga,evopromptde,evopromptde,evopromptga,evopromptga,evopromptde,evopromptde,evopromptga,evopromptga
meta_llm,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-8B-Instruct,...,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-8B-Instruct
test_score,0.866667,0.85,0.855,0.865,0.918333,0.916667,0.898333,0.83,0.915,0.906667,...,0.953333,0.913333,0.706667,0.58,0.688333,0.61,0.721667,0.663333,0.678333,0.636667
test_score,0.002887,0.018028,0.022913,0.015,0.007638,0.011547,0.022546,0.050744,0.00866,0.023629,...,0.020207,0.037859,0.068252,0.104403,0.087512,0.07,0.036856,0.167432,0.057735,0.10054


In [13]:
best_scores_all.groupby(["optimizer", "meta_llm", "downstream_llm","evaluation_llm", "optimizer"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,use_task_desc,test_score,test_score
optimizer,meta_llm,downstream_llm,evaluation_llm,optimizer,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
evopromptde,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,evopromptde,0.0,0.797381,0.027669
evopromptde,meta-llama/Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-8B-Instruct,evopromptde,0.0,0.745238,0.079268
evopromptga,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,evopromptga,0.0,0.782619,0.039988
evopromptga,meta-llama/Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-8B-Instruct,evopromptga,0.0,0.745,0.05011


In [51]:
best_scores_all.sort_values("test_score", ascending=False).groupby("task").first()

Unnamed: 0_level_0,optimizer,meta_llm,downstream_llm,evaluation_llm,use_task_desc,test_score,test_score_std
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
agnews,evopromptde,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,False,0.866667,0.002887
cr,evopromptde,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,False,0.918333,0.007638
mr,evopromptde,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,False,0.915,0.00866
sst-5,evopromptga,meta-llama/Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-8B-Instruct,False,0.516667,0.023629
sst2,evopromptga,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,False,0.953333,0.020207
subj,evopromptde,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,False,0.706667,0.068252
trec,evopromptde,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,False,0.721667,0.036856


In [35]:
all_tasks = best_scores_all["task"].unique()
all_tasks

array(['agnews', 'cr', 'mr', 'sst-5', 'sst2', 'subj', 'trec'],
      dtype=object)

In [54]:
bestststs = read_prompts("experiment", all_tasks)
bestststs = bestststs.sort_values("score", ascending=False).groupby("task", as_index=False).first()[["prompt", "task", "score"]]

In [56]:
bestststs.to_csv("../logs/experiment_gpt/best_prompts.csv")