In [None]:
import sys

# Extend the import search path with the sibling project directory. Presumably this
# is what makes the `llm_stats` and `src.eval` imports below resolvable — confirm.
sys.path.append("../../ares_transverse_tuning")

In [None]:
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scienceplots
import seaborn as sns
from icecream import ic
from llm_stats import (
    elo_ratings,
    hellaswag_scores,
    mmlu_scores,
    mt_bench_scores,
    num_parameters,
)
from src.eval import Study

In [None]:
# Apply the "science" and "ieee" matplotlib styles to all figures (presumably these
# styles are registered by the `scienceplots` import — confirm).
plt.style.use(["science", "ieee"])

In [None]:
# plt.rcParams

In [None]:
# Both paths are relative to the notebook's working directory.
# FIG_DIR is presumably the output directory for figures; it is not used in the
# cells shown here — confirm.
FIG_DIR = Path("./")
# Root directory from which every `Study.load` call below reads its data.
DATA_DIR = Path("../../data/paper")

In [None]:
def replace_names_with_paper_names(ratings_dict: dict) -> None:
    """
    Rename model keys of ``ratings_dict`` in place to the names used in the paper.

    The three renamed models are "GPT 4 Turbo Preview" -> "GPT 4 Turbo",
    "Mistral 7B v0.2" -> "Mistral 7B" and "Starling LM 7B Beta" -> "Starling LM 7B".
    The function is safe to call repeatedly (e.g. when re-running the notebook cell):
    a model whose key has already been renamed is left untouched.

    :param ratings_dict: Dictionary keyed by model name. It is modified in place.
    :raises KeyError: If neither the original nor the paper name of a model is
        present, i.e. the dictionary has no entry for that model at all.
    """
    paper_names = {
        "GPT 4 Turbo Preview": "GPT 4 Turbo",
        "Mistral 7B v0.2": "Mistral 7B",
        "Starling LM 7B Beta": "Starling LM 7B",
    }

    for original_name, paper_name in paper_names.items():
        if original_name in ratings_dict:
            # Move the value to the new key and drop the old key in one step
            ratings_dict[paper_name] = ratings_dict.pop(original_name)
        elif paper_name not in ratings_dict:
            # Not renamed earlier either, so the entry is genuinely missing
            raise KeyError(original_name)


# Rename the model keys of every imported LLM statistics dictionary (in place)
for llm_statistic in (
    elo_ratings,
    hellaswag_scores,
    mmlu_scores,
    mt_bench_scores,
    num_parameters,
):
    replace_names_with_paper_names(llm_statistic)

In [None]:
def _load_baseline_study(subdirectory: str, display_name: str) -> "Study":
    """
    Load one baseline study and take its `.head(50)`.

    :param subdirectory: Directory name below `DATA_DIR / "baselines"`.
    :param display_name: Name passed to `Study.load` for the study.
    :return: Result of `.head(50)` on the loaded study.
    """
    loaded_study = Study.load(
        DATA_DIR / "baselines" / subdirectory,
        runs="trial-*_*",
        name=display_name,
        use_problem_index=True,
    )
    return loaded_study.head(50)


rl_study = _load_baseline_study("rl", "Reinforcement learning")
bo_hard_study = _load_baseline_study("bo_hard", "Bayesian optimisation")
es_study = _load_baseline_study("es", "Extremum seeking")
random_study = _load_baseline_study("rs", "Random search")
do_nothing_study = _load_baseline_study("dn", "Do nothing")

In [None]:
# Non-LLM baseline studies; they fill the "None" prompt columns of the results table.
baseline_studies = [rl_study, bo_hard_study, es_study, random_study, do_nothing_study]

In [None]:
# Remove " - head" suffixes introduced when taking the studies' heads. The study
# names are used as row labels of the tables below, which are later selected by the
# plain names (e.g. "Reinforcement learning").
# NOTE(review): `replace` removes every occurrence of " - head", not only a trailing one.
for study in baseline_studies:
    study.name = study.name.replace(" - head", "")

In [None]:
def _load_tuning_study(model_directory: str, display_name: str) -> "Study":
    """
    Load the recorded episodes of one model from the "tuning" data directory.

    :param model_directory: Directory name below `DATA_DIR / "tuning"`.
    :param display_name: Name passed to `Study.load` for the study.
    :return: The study returned by `Study.load`.
    """
    return Study.load(
        DATA_DIR / "tuning" / model_directory,
        runs="trial-*_*/recorded_episodes",
        name=display_name,
        use_problem_index=True,
    )


gemma_2b_tuning_study = _load_tuning_study("gemma-2b", "Gemma 2B")
gpt_4_turbo_tuning_study = _load_tuning_study(
    "gpt-4-0125-preview", "GPT 4 Turbo"  # Turbo Preview
)
mixtral_8x7b_tuning_study = _load_tuning_study("mixtral-8x7b", "Mixtral 8x7B")

In [None]:
# LLM studies recorded with the "Tuning" prompt (three models in this group).
tuning_llm_studies = [
    gemma_2b_tuning_study,
    gpt_4_turbo_tuning_study,
    mixtral_8x7b_tuning_study,
]

In [None]:
def _load_explained_study(model_directory: str, display_name: str) -> "Study":
    """
    Load the recorded episodes of one model from the "explained" data directory.

    :param model_directory: Directory name below `DATA_DIR / "explained"`.
    :param display_name: Name passed to `Study.load` for the study.
    :return: The study returned by `Study.load`.
    """
    return Study.load(
        DATA_DIR / "explained" / model_directory,
        runs="trial-*_*/recorded_episodes",
        name=display_name,
        use_problem_index=True,
    )


gemma_2b_explained_study = _load_explained_study("gemma-2b", "Gemma 2B")
gemma_7b_explained_study = _load_explained_study("gemma-7b", "Gemma 7B")
gpt_35_turbo_explained_study = _load_explained_study(
    "gpt-3.5-turbo-0125", "GPT 3.5 Turbo"
)
gpt_4_explained_study = _load_explained_study("gpt-4", "GPT 4")
gpt_4_turbo_explained_study = _load_explained_study(
    "gpt-4-turbo-preview", "GPT 4 Turbo"
)
llama2_7b_explained_study = _load_explained_study("llama2-7b", "Llama 2 7B")
llama2_13b_explained_study = _load_explained_study("llama2-13b", "Llama 2 13B")
llama2_70b_explained_study = _load_explained_study("llama2-70b", "Llama 2 70B")
mistral_7b_v02_explained_study = _load_explained_study("mistral-v0.2", "Mistral 7B")
mixtral_8x7b_explained_study = _load_explained_study("mixtral-8x7b", "Mixtral 8x7B")
orca2_7b_explained_study = _load_explained_study("orca2-7b", "Orca 2 7B")
orca2_13b_explained_study = _load_explained_study("orca2-13b", "Orca 2 13B")
starling_lm_7b_beta_explained_study = _load_explained_study(
    "starling-lm-7b-beta", "Starling LM 7B"
)
vicuna_7b_16k_explained_study = _load_explained_study(
    "vicuna-7b-16k", "Vicuna 7B 16K"
)

In [None]:
# LLM studies recorded with the "Explained" prompt, one per model.
explained_llm_studies = [
    gemma_2b_explained_study,
    gemma_7b_explained_study,
    gpt_35_turbo_explained_study,
    gpt_4_explained_study,
    gpt_4_turbo_explained_study,
    llama2_7b_explained_study,
    llama2_13b_explained_study,
    llama2_70b_explained_study,
    mistral_7b_v02_explained_study,
    mixtral_8x7b_explained_study,
    orca2_7b_explained_study,
    orca2_13b_explained_study,
    starling_lm_7b_beta_explained_study,
    vicuna_7b_16k_explained_study,
]

In [None]:
def _load_cot_study(model_directory: str, display_name: str) -> "Study":
    """
    Load the recorded episodes of one model from the "cot" data directory.

    :param model_directory: Directory name below `DATA_DIR / "cot"`.
    :param display_name: Name passed to `Study.load` for the study.
    :return: The study returned by `Study.load`.
    """
    return Study.load(
        DATA_DIR / "cot" / model_directory,
        runs="trial-*_*/recorded_episodes",
        name=display_name,
        use_problem_index=True,
    )


gemma_2b_cot_study = _load_cot_study("gemma-2b", "Gemma 2B")
gpt_4_turbo_cot_study = _load_cot_study(
    "gpt-4-0125-preview", "GPT 4 Turbo"  # Turbo Preview
)
mixtral_8x7b_cot_study = _load_cot_study("mixtral-8x7b", "Mixtral 8x7B")

In [None]:
# LLM studies recorded with the chain-of-thought ("CoT") prompt.
cot_llm_studies = [gemma_2b_cot_study, gpt_4_turbo_cot_study, mixtral_8x7b_cot_study]

In [None]:
def _load_optimization_study(model_directory: str, display_name: str) -> "Study":
    """
    Load the recorded episodes of one model from the "optimisation" data directory.

    :param model_directory: Directory name below `DATA_DIR / "optimisation"`.
    :param display_name: Name passed to `Study.load` for the study.
    :return: The study returned by `Study.load`.
    """
    return Study.load(
        DATA_DIR / "optimisation" / model_directory,
        runs="trial-*_*/recorded_episodes",
        name=display_name,
        use_problem_index=True,
    )


gemma_2b_optimization_study = _load_optimization_study("gemma-2b", "Gemma 2B")
gemma_7b_optimization_study = _load_optimization_study("gemma-7b", "Gemma 7B")
gpt_35_turbo_optimization_study = _load_optimization_study(
    "gpt-3.5-turbo-0125", "GPT 3.5 Turbo"
)
gpt_4_optimization_study = _load_optimization_study("gpt-4", "GPT 4")
gpt_4_turbo_optimization_study = _load_optimization_study(
    "gpt-4-0125-preview", "GPT 4 Turbo"  # Turbo Preview
)
llama2_7b_optimization_study = _load_optimization_study("llama2-7b", "Llama 2 7B")
llama2_13b_optimization_study = _load_optimization_study("llama2-13b", "Llama 2 13B")
llama2_70b_optimization_study = _load_optimization_study("llama2-70b", "Llama 2 70B")
mistral_7b_v02_optimization_study = _load_optimization_study(
    "mistral-v0.2", "Mistral 7B"
)
mixtral_8x7b_optimization_study = _load_optimization_study(
    "mixtral-8x7b", "Mixtral 8x7B"
)
orca2_7b_optimization_study = _load_optimization_study("orca2-7b", "Orca 2 7B")
orca2_13b_optimization_study = _load_optimization_study("orca2-13b", "Orca 2 13B")
starling_lm_7b_beta_optimization_study = _load_optimization_study(
    "starling-lm-7b-beta", "Starling LM 7B"
)
vicuna_7b_16k_optimization_study = _load_optimization_study(
    "vicuna-7b-16k", "Vicuna 7B 16K"
)

In [None]:
# LLM studies recorded with the optimisation prompt, one per model.
optimization_llm_studies = [
    gemma_2b_optimization_study,
    gemma_7b_optimization_study,
    gpt_35_turbo_optimization_study,
    gpt_4_optimization_study,
    gpt_4_turbo_optimization_study,
    llama2_7b_optimization_study,
    llama2_13b_optimization_study,
    llama2_70b_optimization_study,
    mistral_7b_v02_optimization_study,
    mixtral_8x7b_optimization_study,
    orca2_7b_optimization_study,
    orca2_13b_optimization_study,
    starling_lm_7b_beta_optimization_study,
    vicuna_7b_16k_optimization_study,
]

In [None]:
# Every loaded study in one flat list: baselines first, then the LLM studies grouped
# by prompt type (tuning, explained, chain-of-thought, optimisation)
all_studies = [
    *baseline_studies,
    *tuning_llm_studies,
    *explained_llm_studies,
    *cot_llm_studies,
    *optimization_llm_studies,
]

## Make the table


In [None]:
# Declare the three-level column index (metric, prompt type, statistic). The empty
# `codes` mean it starts without any columns; columns appear as values are assigned
# in the cells below.
# NOTE(review): "Optimsation" is misspelled, but the same spelling is used as a key
# wherever these columns are written or read, so it must be fixed everywhere at once.
multi_columns = pd.MultiIndex(
    levels=[
        [
            "Final MAE",
            "Normalised MAE improvement",
            "Normalised accumulated MAE",
            "Number of steps",
        ],
        ["Tuning", "Explained", "CoT", "Optimsation", "None"],
        ["Mean", "Std"],
    ],
    codes=[[], [], []],
    names=["Metric", "Prompt", "Statistic"],
)

# Create the empty results DataFrame; rows are added per study name on assignment
df = pd.DataFrame(columns=multi_columns)

In [None]:
# Final MAE: mean and standard deviation per study, filed under the prompt type of
# the group the study belongs to (baselines use the "None" prompt)
for prompt, studies_for_prompt in [
    ("None", baseline_studies),
    ("Tuning", tuning_llm_studies),
    ("Explained", explained_llm_studies),
    ("CoT", cot_llm_studies),
    ("Optimsation", optimization_llm_studies),
]:
    for study in studies_for_prompt:
        df.loc[study.name, ("Final MAE", prompt, "Mean")] = study.mean_final_mae()
        df.loc[study.name, ("Final MAE", prompt, "Std")] = study.std_final_mae()

In [None]:
# Normalised MAE improvement: for each episode, the difference between its final MAE
# and the first MAE of the do-nothing episode on the same problem, divided by that
# first MAE. Mean and standard deviation over a study's episodes go into the table.
for prompt, studies_for_prompt in [
    ("None", baseline_studies),
    ("Tuning", tuning_llm_studies),
    ("Explained", explained_llm_studies),
    ("CoT", cot_llm_studies),
    ("Optimsation", optimization_llm_studies),
]:
    for study in studies_for_prompt:
        improvements = []
        for episode in study.episodes:
            # First do-nothing episode recorded for the same problem index
            do_nothing_episode = do_nothing_study.get_episodes_by_problem(
                episode.problem_index
            )[0]
            improvements.append(
                (episode.final_mae() - do_nothing_episode.maes()[0])
                / do_nothing_episode.maes()[0]
            )

        for statistic, aggregate in [("Mean", np.mean), ("Std", np.std)]:
            df.loc[
                study.name, ("Normalised MAE improvement", prompt, statistic)
            ] = aggregate(improvements)

In [None]:
# Normalised accumulated MAE: for each episode, its accumulated MAE divided by the
# accumulated MAE of the do-nothing episode on the same problem. Mean and standard
# deviation over a study's episodes go into the table.
for prompt, studies_for_prompt in [
    ("None", baseline_studies),
    ("Tuning", tuning_llm_studies),
    ("Explained", explained_llm_studies),
    ("CoT", cot_llm_studies),
    ("Optimsation", optimization_llm_studies),
]:
    for study in studies_for_prompt:
        accumulated_ratios = []
        for episode in study.episodes:
            # First do-nothing episode recorded for the same problem index
            do_nothing_episode = do_nothing_study.get_episodes_by_problem(
                episode.problem_index
            )[0]
            accumulated_ratios.append(
                episode.accumulated_mae() / do_nothing_episode.accumulated_mae()
            )

        for statistic, aggregate in [("Mean", np.mean), ("Std", np.std)]:
            df.loc[
                study.name, ("Normalised accumulated MAE", prompt, statistic)
            ] = aggregate(accumulated_ratios)

In [None]:
# Number of steps: mean and standard deviation of the episode lengths per LLM study.
# The baseline studies get no entry for this metric.
for prompt, studies_for_prompt in [
    ("Tuning", tuning_llm_studies),
    ("Explained", explained_llm_studies),
    ("CoT", cot_llm_studies),
    ("Optimsation", optimization_llm_studies),
]:
    for study in studies_for_prompt:
        # -1 to not count reset
        step_counts = [len(episode) - 1 for episode in study.episodes]

        for statistic, aggregate in [("Mean", np.mean), ("Std", np.std)]:
            df.loc[study.name, ("Number of steps", prompt, statistic)] = aggregate(
                step_counts
            )

In [None]:
# Display the raw metrics table, before the scaling and rounding done in the next cell
df

In [None]:
# Convert each raw metric into its display unit: multiply by the given factor, round,
# store under the new metric name, then drop the raw metric's columns.
# NOTE(review): this cell is not re-runnable on its own — the raw columns are dropped,
# so a second execution fails when it looks them up again.
for raw_metric, display_metric, factor in [
    ("Final MAE", "Final MAE (mum)", 1e6),
    ("Normalised MAE improvement", "Normalised MAE improvement (%)", 100),
    ("Normalised accumulated MAE", "Normalised accumulated MAE (%)", 100),
]:
    for prompt in ["None", "Tuning", "Explained", "CoT", "Optimsation"]:
        for statistic in ["Mean", "Std"]:
            df.loc[:, (display_metric, prompt, statistic)] = (
                df.loc[:, (raw_metric, prompt, statistic)] * factor
            ).round()

    df = df.drop(columns=[raw_metric])

# The number of steps keeps its name and unit and is only rounded in place
for prompt in ["Tuning", "Explained", "CoT", "Optimsation"]:
    for statistic in ["Mean", "Std"]:
        df.loc[:, ("Number of steps", prompt, statistic)] = (
            df.loc[:, ("Number of steps", prompt, statistic)]
        ).round()

df

In [None]:
# Two-level column index (metric, prompt type) of the paper table, initially empty
paper_multi_columns = pd.MultiIndex(
    levels=[
        [
            "Final MAE (mum)",
            "Normalised MAE improvement (%)",
            "Normalised accumulated MAE (%)",
        ],
        ["Tuning", "Explained", "CoT", "Optimsation", "None"],
    ],
    codes=[[], []],
    names=["Metric", "Prompt"],
)
df_paper = pd.DataFrame(columns=paper_multi_columns)

# Columns of the paper table, listed in the order in which they are created.
# NOTE(review): "Final MAE (mum)" lists "Optimsation" before "CoT", unlike the other
# metrics — confirm that this ordering is intended.
# NOTE(review): "Number of steps" is filled here although it is not among the metric
# levels declared above.
paper_columns = [
    ("Final MAE (mum)", ["Tuning", "Explained", "Optimsation", "CoT", "None"]),
    (
        "Normalised MAE improvement (%)",
        ["Tuning", "Explained", "CoT", "Optimsation", "None"],
    ),
    (
        "Normalised accumulated MAE (%)",
        ["Tuning", "Explained", "CoT", "Optimsation", "None"],
    ),
    ("Number of steps", ["Tuning", "Explained", "CoT", "Optimsation"]),
]

# Combine the mean and std columns of each (metric, prompt) pair into one string
for metric, prompts in paper_columns:
    for prompt in prompts:
        mean_text = df.loc[:, (metric, prompt, "Mean")].astype(str)
        std_text = df.loc[:, (metric, prompt, "Std")].astype(str)
        df_paper.loc[:, (metric, prompt)] = mean_text + " ± " + std_text

df_paper

In [None]:
# Row order of the paper table: the LLMs first, followed by the baseline methods
paper_row_order = [
    "Gemma 2B",
    "Gemma 7B",
    "GPT 3.5 Turbo",
    "GPT 4",
    "GPT 4 Turbo",
    "Llama 2 7B",
    "Llama 2 13B",
    "Llama 2 70B",
    "Orca 2 7B",
    "Orca 2 13B",
    "Vicuna 7B 16K",
    "Mistral 7B",
    "Mixtral 8x7B",
    "Starling LM 7B",
    "Reinforcement learning",
    "Bayesian optimisation",
    "Extremum seeking",
    "Random search",
    "Do nothing",
]
df_paper = df_paper.loc[paper_row_order]
df_paper

In [None]:
# Print the paper table as LaTeX source
print(df_paper.to_latex())

In [None]:
# Label printed for each group of studies; the groups are checked in this order and
# a study that is in none of them (i.e. a baseline) gets the plain "--" label
prompt_type_by_group = [
    (tuning_llm_studies, "-- Tuning --"),
    (explained_llm_studies, "-- Explained --"),
    (optimization_llm_studies, "-- Optimisation --"),
    (cot_llm_studies, "-- Chain-of-thought --"),
]

# Print, per study, how many episodes end with a final MAE more than 40e-6 below the
# first MAE of the do-nothing episode on the same problem
for study in all_studies:
    num_successes = 0
    for episode in study.episodes:
        do_nothing_episode = do_nothing_study.get_episodes_by_problem(
            episode.problem_index
        )[0]
        mae_to_beat = do_nothing_episode.maes()[0]
        if episode.final_mae() < mae_to_beat - 40e-6:
            num_successes += 1

    prompt_type = next(
        (label for group, label in prompt_type_by_group if study in group),
        "--",
    )

    print(f"{study.name} {prompt_type}: {num_successes}/{len(study.episodes)}")

In [None]:
def _count_successful_episodes(study) -> int:
    """
    Count the episodes of `study` that beat doing nothing by more than 40e-6.

    An episode counts as a success when its final MAE is more than 40e-6 below the
    first MAE of the first do-nothing episode recorded for the same problem index.

    :param study: Study whose `episodes` are evaluated.
    :return: Number of successful episodes; 0 for a study without episodes.
    """
    num_successes = 0
    for episode in study.episodes:
        do_nothing_episode = do_nothing_study.get_episodes_by_problem(
            episode.problem_index
        )[0]
        mae_to_beat = do_nothing_episode.maes()[0]
        if episode.final_mae() < mae_to_beat - 40e-6:
            num_successes += 1

    return num_successes


successes_df = pd.DataFrame(columns=["Tuning", "Explained", "CoT", "Optimisation"])

# The baseline studies are appended to both groups, so their counts appear in both
# the "Explained" and the "Optimisation" column. The count is written once per
# study, after all of its episodes have been evaluated.
for explained, optimisation in zip(
    explained_llm_studies + baseline_studies,
    optimization_llm_studies + baseline_studies,
):
    for column, study in zip(["Explained", "Optimisation"], [explained, optimisation]):
        successes_df.loc[study.name, column] = _count_successful_episodes(study)

for tuning, cot in zip(tuning_llm_studies, cot_llm_studies):
    for column, study in zip(["Tuning", "CoT"], [tuning, cot]):
        successes_df.loc[study.name, column] = _count_successful_episodes(study)

# sns.heatmap(df, annot=True, cmap="viridis")
successes_df = successes_df.loc[
    [
        "Gemma 2B",
        "Gemma 7B",
        "GPT 3.5 Turbo",
        "GPT 4",
        "GPT 4 Turbo",
        "Llama 2 7B",
        "Llama 2 13B",
        "Llama 2 70B",
        "Orca 2 7B",
        "Orca 2 13B",
        "Vicuna 7B 16K",
        "Mistral 7B",
        "Mixtral 8x7B",
        "Starling LM 7B",
        "Reinforcement learning",
        "Bayesian optimisation",
        "Extremum seeking",
        "Random search",
        "Do nothing",
    ]
]
# Convert all four count columns to float; cells never written stay NaN
successes_df = successes_df.astype(float)

# Plot with diverging colour map
# plt.figure(figsize=(3, 5))
# ax = sns.heatmap(
#     successes_df, annot=True, cmap="RdYlGn", center=6, vmin=0, vmax=9, linewidths=0.5
# )
# ax.set(xlabel="", ylabel="")
# ax.xaxis.tick_top()
# ax.minorticks_off()
# plt.show()

In [None]:
# Per study: on how many of the three problems 0, 33 and 38 did *every* episode
# finish with a final MAE more than 40e-6 below the first MAE of the matching
# "do nothing" episode. A problem without any episode trips the assertion.
prompt_type_by_family = (
    ("-- Tuning --", tuning_llm_studies),
    ("-- Explained --", explained_llm_studies),
    ("-- Chain-of-thought --", cot_llm_studies),
    ("-- Optimisation --", optimization_llm_studies),
)

for study in all_studies:
    num_successful_trials = 0
    for trial_idx in (0, 33, 38):
        trial_episodes = study.get_episodes_by_problem(trial_idx)
        assert len(trial_episodes) > 0

        # all() stops at the first episode that fails to beat the reference.
        if all(
            episode.final_mae()
            < do_nothing_study.get_episodes_by_problem(episode.problem_index)[
                0
            ].maes()[0]
            - 40e-6
            for episode in trial_episodes
        ):
            num_successful_trials += 1

    prompt_type = "--"
    for family_label, family_studies in prompt_type_by_family:
        if study in family_studies:
            prompt_type = family_label
            break

    print(f"{study.name} {prompt_type}: {num_successful_trials}/3")

In [None]:
successful_trials_df = pd.DataFrame(columns=["Explained", "Optimisation"])

# Flat list of every successful trial index over all studies and prompt columns.
all_successful_trials = []


def _fully_successful_trials(study) -> list:
    """Return the problem indices (out of 0, 33, 38) that ``study`` solved fully.

    A problem counts when each of the study's episodes on it ends with a final MAE
    more than 40e-6 below the first MAE of the first "do nothing" episode for the
    same problem index. Asserts that the study has at least one episode per problem.
    """
    solved = []
    for trial_idx in [0, 33, 38]:
        trial_episodes = study.get_episodes_by_problem(trial_idx)
        assert len(trial_episodes) > 0

        if all(
            episode.final_mae()
            < do_nothing_study.get_episodes_by_problem(episode.problem_index)[
                0
            ].maes()[0]
            - 40e-6
            for episode in trial_episodes
        ):
            solved.append(trial_idx)
    return solved


# Studies are walked pairwise per prompt family. The baselines are appended to
# both the "Explained" and the "Optimisation" list and therefore fill both columns.
paired_studies_by_columns = [
    (
        ["Explained", "Optimisation"],
        zip(
            explained_llm_studies + baseline_studies,
            optimization_llm_studies + baseline_studies,
        ),
    ),
    (["Tuning", "CoT"], zip(tuning_llm_studies, cot_llm_studies)),
]

for columns, study_pairs in paired_studies_by_columns:
    for study_pair in study_pairs:
        for column, study in zip(columns, study_pair):
            successful_trials = _fully_successful_trials(study)
            successful_trials_df.loc[study.name, column] = len(successful_trials)
            print(f"{study.name} {column}: {successful_trials}")
            all_successful_trials += successful_trials

# Fixed row order: LLMs first, then the baselines.
successful_trials_df = successful_trials_df.loc[
    [
        "Gemma 2B",
        "Gemma 7B",
        "GPT 3.5 Turbo",
        "GPT 4",
        "GPT 4 Turbo",
        "Llama 2 7B",
        "Llama 2 13B",
        "Llama 2 70B",
        "Orca 2 7B",
        "Orca 2 13B",
        "Vicuna 7B 16K",
        "Mistral 7B",
        "Mixtral 8x7B",
        "Starling LM 7B",
        "Reinforcement learning",
        "Bayesian optimisation",
        "Extremum seeking",
        "Random search",
        "Do nothing",
    ]
]

# Numeric dtype for all four prompt columns, then the column order used for display.
successful_trials_df = successful_trials_df.astype(float)
successful_trials_df = successful_trials_df[
    ["Tuning", "Explained", "CoT", "Optimisation"]
]

In [None]:
# Row labels of the two heat-map blocks: LLMs on top, baselines underneath.
llm_rows = (
    "Gemma 2B",
    "Gemma 7B",
    "GPT 3.5 Turbo",
    "GPT 4",
    "GPT 4 Turbo",
    "Llama 2 7B",
    "Llama 2 13B",
    "Llama 2 70B",
    "Orca 2 7B",
    "Orca 2 13B",
    "Vicuna 7B 16K",
    "Mistral 7B",
    "Mixtral 8x7B",
    "Starling LM 7B",
)
baseline_rows = (
    "Reinforcement learning",
    "Bayesian optimisation",
    "Extremum seeking",
    "Random search",
    "Do nothing",
)

fig, axs = plt.subplots(2, 2, figsize=(6, 4.7), sharey="row", height_ratios=[1, 0.5])

# Values below vmin (the -1 placeholder used for missing cells) are drawn white.
cmap = plt.get_cmap("RdYlGn")
cmap.set_under("white")

# One figure column per table: (counts, upper end of the colour scale, panel title).
heatmap_panels = (
    (successes_df, 9, "(a) Successful episodes"),
    (successful_trials_df, 3, "(b) Successful trials"),
)

for col, (counts_df, vmax, title) in enumerate(heatmap_panels):
    llm_ax = axs[0, col]
    baseline_ax = axs[1, col]

    # LLM block: missing cells are filled with -1 for colouring and shown as "-".
    llm_counts = counts_df.loc[llm_rows, :].fillna(-1)
    sns.heatmap(
        llm_counts,
        annot=llm_counts.astype(int).astype(str).replace("-1", "-"),
        cmap=cmap,
        vmin=0,
        vmax=vmax,
        linewidths=0.5,
        ax=llm_ax,
        cbar=False,
        fmt="",
    )
    llm_ax.set(xlabel="", ylabel="")
    llm_ax.xaxis.tick_top()
    llm_ax.tick_params(axis="both", which="both", length=0)
    llm_ax.set_title(title)

    # Baseline block: a single column, carrying the colour bar below it.
    sns.heatmap(
        counts_df.loc[baseline_rows, ("Optimisation",)],
        annot=True,
        cmap=cmap,
        vmin=0,
        vmax=vmax,
        linewidths=0.5,
        ax=baseline_ax,
        cbar_kws={"location": "bottom", "ticks": list(range(vmax + 1))},
    )
    baseline_ax.set(xlabel="", ylabel="")
    baseline_ax.xaxis.tick_top()
    baseline_ax.tick_params(axis="both", which="both", length=0)
    baseline_ax.set_xticks([])

# fig.axes[0:4] are the four subplot axes; the two axes after them were added by
# the colour bars of the baseline heat maps.
for colorbar_ax in fig.axes[4:6]:
    colorbar_ax.minorticks_off()

plt.tight_layout()

plt.savefig(FIG_DIR / "success_heatmaps.pdf", bbox_inches="tight")
plt.show()

In [None]:
# Count how often each trial appears in the all_successful_trials list
counter = Counter(all_successful_trials)
print(counter)

In [None]:
# Work on an independent (deep) copy so that columns added to it later do not
# leak back into successes_df.
extended_successes_df = successes_df.copy(deep=True)
extended_successes_df

In [None]:
# Public benchmark numbers, one new column per benchmark. Writing through .loc
# with a row label that is not in the index yet appends that row.
benchmark_columns = {
    "ELO": elo_ratings,
    "HellaSwag": hellaswag_scores,
    "MMLU": mmlu_scores,
    "MT-bench": mt_bench_scores,
    "Num parameters": num_parameters,
}
for benchmark_column, values_by_name in benchmark_columns.items():
    for name, value in values_by_name.items():
        extended_successes_df.loc[name, benchmark_column] = value

# Mean tuning metrics, copied from df for the optimisation and explained prompts.
# NOTE(review): "Optimsation" is spelled exactly as in the original lookup key;
# presumably it matches the column level in df — verify before "fixing" it.
prompt_keys = (("Optimisation", "Optimsation"), ("Explained", "Explained"))
metric_names = (
    "Final MAE (mum)",
    "Normalised MAE improvement (%)",
    "Normalised accumulated MAE (%)",
)
for name in extended_successes_df.index:
    for metric_name in metric_names:
        for prompt_label, df_prompt_key in prompt_keys:
            extended_successes_df.loc[name, f"{prompt_label} {metric_name}"] = df.loc[
                name, (metric_name, df_prompt_key, "Mean")
            ]

extended_successes_df

In [None]:
# Grid of regression plots: one row per performance measure, one column per model
# property / public benchmark. Every panel overlays the "Explained" prompt results
# (drawn first) and the "Optimisation" prompt results (drawn second).
fig, axs = plt.subplots(3, 5, figsize=(6.44, 3), sharex="col", sharey="row")

# Columns: (x column in extended_successes_df, x-axis label, logarithmic x?)
# For a logarithmic column the regression is fitted against log(x) (logx=True) and
# the axis itself is switched to a log scale as well.
benchmark_columns = [
    ("Num parameters", "Parameters", True),
    ("ELO", "ELO", False),
    ("MMLU", "MMLU", False),
    ("MT-bench", "MT-bench", False),
    ("HellaSwag", "HellaSwag", False),
]

# Rows: ((explained y column, optimisation y column), y-axis label, logarithmic y?)
# The percent sign in the labels is written as "\\%", i.e. a literal backslash
# followed by "%" (presumably needed because the plot style renders text with
# LaTeX — TODO confirm). A bare "\%" is an invalid Python escape sequence.
metric_rows = [
    (("Explained", "Optimisation"), "Successful\nepisodes", False),
    (
        (
            "Explained Normalised MAE improvement (%)",
            "Optimisation Normalised MAE improvement (%)",
        ),
        "Normalised MAE\nimprovement (\\%)",
        False,
    ),
    (
        (
            "Explained Normalised accumulated MAE (%)",
            "Optimisation Normalised accumulated MAE (%)",
        ),
        "Normalised\naccumulated\nMAE (\\%)",
        True,
    ),
]
last_row = len(metric_rows) - 1

for row, (y_columns, y_label, log_y) in enumerate(metric_rows):
    for col, (x_column, x_label, log_x) in enumerate(benchmark_columns):
        ax = axs[row, col]

        for y_column in y_columns:
            sns.regplot(
                data=extended_successes_df,
                x=x_column,
                y=y_column,
                ci=None,
                scatter_kws={"s": 3},
                line_kws={"linewidth": 1},
                logx=log_x,
                ax=ax,
            )

        # Only the bottom row names the x axis and only the left column the y axis.
        ax.set_xlabel(x_label if row == last_row else None)
        ax.set_ylabel(y_label if col == 0 else None)

        if log_x:
            ax.set_xscale("log")
        if log_y:
            ax.set_yscale("log")
            if col == 0:
                # Lower y limit is pinned on the first panel of the log-scaled
                # row, before the remaining panels of that row are drawn.
                ax.set_ylim(4e1, None)

plt.tight_layout()

plt.savefig(FIG_DIR / "correlations.pdf", bbox_inches="tight")
plt.show()

In [None]:
# Average inference times, grouped into one underlined section per study family.
# Sections are separated by an empty line (none after the last one).
inference_time_sections = [
    ("Baselines", "---------", baseline_studies),
    ("Tuning Prompt", "-------------", tuning_llm_studies),
    ("Explained Prompt", "----------------", explained_llm_studies),
    ("Chain-of-thought Prompt", "-----------------------", cot_llm_studies),
    ("Optimisation Prompt", "-------------------", optimization_llm_studies),
]

for section_idx, (heading, underline, studies) in enumerate(inference_time_sections):
    if section_idx > 0:
        print("")
    print(heading)
    print(underline)
    for study in studies:
        print(f"{study.name}: {study.average_inference_times()}")

In [None]:
# Side-by-side view of two example episodes: actuator settings (quadrupoles,
# steerers) in the two upper rows, beam parameters (mu, sigma) in the two lower rows.
left_episode = gpt_4_turbo_optimization_study.episodes[1]
right_episode = gpt_35_turbo_explained_study.episodes[2]

# Plot Layout
# NOTE(review): 469.7 looks like a text width in pt; TeX counts 72.27 pt per inch,
# so the 72.72 used here may be a digit swap. Left untouched so the size of the
# saved figure does not change — confirm.
fig = plt.figure(figsize=(469.7 / 72.72 * 1.24, 5))

# 4 x 3 grid: a narrow first column that only carries the two group labels,
# followed by one column per episode.
gs = fig.add_gridspec(1, 1)
gs00 = gs[0].subgridspec(4, 3, width_ratios=[0.1, 1, 1], hspace=0.14, wspace=0.075)

ax_left_quadrupole = fig.add_subplot(gs00[0, 1])
ax_left_steerer = fig.add_subplot(
    gs00[1, 1], sharex=ax_left_quadrupole, sharey=ax_left_quadrupole
)
ax_left_mu = fig.add_subplot(gs00[2, 1], sharex=ax_left_quadrupole)
ax_left_sigma = fig.add_subplot(gs00[3, 1], sharex=ax_left_quadrupole)

# Right-hand panels share their y axis with the left-hand panel of the same row.
ax_right_quadrupole = fig.add_subplot(gs00[0, 2], sharey=ax_left_quadrupole)
ax_right_steerer = fig.add_subplot(
    gs00[1, 2], sharex=ax_right_quadrupole, sharey=ax_left_steerer
)
ax_right_mu = fig.add_subplot(gs00[2, 2], sharex=ax_right_quadrupole, sharey=ax_left_mu)
ax_right_sigma = fig.add_subplot(
    gs00[3, 2], sharex=ax_right_quadrupole, sharey=ax_left_sigma
)

# Invisible helper axes; they exist only to hold the y labels that span two rows.
ax_dummy1 = fig.add_subplot(gs00[:2, 0])
ax_dummy2 = fig.add_subplot(gs00[2:, 0])
for ax in [ax_dummy1, ax_dummy2]:
    ax.set_xticks([])
    ax.set_yticks([])
    for side in ("left", "top", "right", "bottom"):
        ax.spines[side].set_visible(False)
    ax.patch.set_visible(False)

# Tick labels only on the bottom row (x) and on the left episode column (y).
for ax in [
    ax_left_steerer,
    ax_left_quadrupole,
    ax_left_mu,
    ax_right_steerer,
    ax_right_quadrupole,
    ax_right_mu,
]:
    ax.xaxis.set_tick_params(labelbottom=False)
for ax in [ax_right_steerer, ax_right_quadrupole, ax_right_mu, ax_right_sigma]:
    ax.yaxis.set_tick_params(labelleft=False)

#############
# Plotting
# Legends are drawn for the left column only; the line labels are assigned by
# position, i.e. they rely on the order in which the plot methods add their lines.
left_episode.plot_quadrupoles(
    ax=ax_left_quadrupole, xlabel=False, ylabel=False, legend=False, normalize=True
)
for line, label in zip(ax_left_quadrupole.lines, (r"$Q_1$", r"$Q_2$", r"$Q_3$")):
    line.set_label(label)
ax_left_quadrupole.legend(loc="upper right", ncol=3)

left_episode.plot_steerers(
    ax=ax_left_steerer, xlabel=False, ylabel=False, legend=False, normalize=True
)
for line, label in zip(ax_left_steerer.lines, (r"$C_v$", r"$C_h$")):
    line.set_label(label)
ax_left_steerer.legend(loc="upper right", ncol=2)

left_episode.plot_beam_parameters(
    ax=ax_left_mu, xlabel=False, mode="mu", legend=False, ylabel=False
)
ax_left_mu.legend(loc="upper right", ncol=2)
left_episode.plot_beam_parameters(
    ax=ax_left_sigma, xlabel=False, mode="sigma", legend=False, ylabel=False
)
ax_left_sigma.legend(loc="upper right", ncol=2)

right_episode.plot_quadrupoles(
    ax=ax_right_quadrupole, xlabel=False, ylabel=False, legend=False, normalize=True
)
right_episode.plot_steerers(
    ax=ax_right_steerer, xlabel=False, ylabel=False, legend=False, normalize=True
)

right_episode.plot_beam_parameters(
    ax=ax_right_mu, xlabel=False, mode="mu", legend=False, ylabel=False
)
right_episode.plot_beam_parameters(
    ax=ax_right_sigma, xlabel=False, mode="sigma", legend=False, ylabel=False
)

#############
# Labels
ax_left_sigma.set_xlabel("Step")
ax_right_sigma.set_xlabel("Step")
ax_dummy1.set_ylabel("Normalised actuator setting")
ax_dummy2.set_ylabel("Beam parameters (mm)")
ax_left_sigma.set_ylabel(r"$\sigma$")
ax_left_mu.set_ylabel(r"$\mu$")
ax_left_steerer.set_ylabel("Steerers")
ax_left_quadrupole.set_ylabel("Quadrupoles")

ax_left_quadrupole.set_title("GPT 4 Turbo (Optimisation Prompt)")
ax_right_quadrupole.set_title("GPT 3.5 Turbo (Explained Prompt)")

# Panels in reading order (row by row); each one gets its "(a)" ... "(h)" tag.
data_axes = [
    ax_left_quadrupole,
    ax_right_quadrupole,
    ax_left_steerer,
    ax_right_steerer,
    ax_left_mu,
    ax_right_mu,
    ax_left_sigma,
    ax_right_sigma,
]
# NOTE(review): bo_axes and rl_axes are not used anywhere in this cell; kept in
# case another cell relies on them — confirm and remove if they are leftovers.
bo_axes = [ax_right_quadrupole, ax_right_steerer, ax_right_mu, ax_right_sigma]
rl_axes = [ax_left_quadrupole, ax_left_steerer, ax_left_mu, ax_left_sigma]
subfig_names = ["(a)", "(b)", "(c)", "(d)", "(e)", "(f)", "(g)", "(h)"]
for ax, subfig_name in zip(data_axes, subfig_names):
    ax.text(x=0.05, y=0.8, s=subfig_name, transform=ax.transAxes)

# Fixed upper y limit for the sigma panels.
for ax in [ax_left_sigma, ax_right_sigma]:
    ax.set_ylim(None, 5.2)

# Fine tuning
fig.align_ylabels([ax_left_steerer, ax_left_quadrupole, ax_left_mu, ax_left_sigma])

fig.savefig(FIG_DIR / "example_episodes_combined.pdf", bbox_inches="tight")
plt.show()

In [None]:
# Inspection cell: show the y tick label objects of the left quadrupole panel.
ax_left_quadrupole.get_yticklabels()

In [None]:
# Inspection cell: show the y tick positions of the left quadrupole panel.
ax_left_quadrupole.get_yticks()