# Token Usage
Compute the input and output token usage of the different models and datasets over all conducted experiment runs. We also evaluate what fraction of the input tokens is consumed by the downstream LLM as opposed to the meta-LLM.

In [1]:
import pandas as pd
from glob import glob

In [8]:
# Experiment grid: one step_results_eval.csv is loaded per
# (dataset, model, seed) combination of the CAPO runs.
datasets = ["agnews", "sst-5", "gsm8k", "copa", "subj"]
models = ["llama", "qwen", "mistral"]
seeds = [42, 43, 44]

dfs = []
for dataset in datasets:
    for model in models:
        for seed in seeds:
            pattern = f"../results/main_results/{dataset}/{model}/CAPO/seed{seed}/*/*/step_results_eval.csv"
            # glob() gives no ordering guarantee; sort so that the same file
            # is selected on every run if several happen to match.
            matches = sorted(glob(pattern))
            if not matches:
                # Fail with the offending pattern instead of a bare IndexError.
                raise FileNotFoundError(
                    f"No step_results_eval.csv found for pattern: {pattern}"
                )
            path = matches[0]
            run_df = pd.read_csv(path)
            # Tag every row with the run it came from so runs can be grouped later.
            dfs.append(run_df.assign(dataset=dataset, model=model, seed=seed))

# One long frame holding the step results of all runs.
df = pd.concat(dfs, ignore_index=True)

In [9]:
# Reduce every (dataset, model, seed, step) group to a single row holding the
# first value of each token counter, then accumulate those counters over the
# steps of each run (input and output tokens, meta and downstream LLM).
token_columns = [
    "input_tokens_meta_llm",
    "input_tokens_downstream_llm",
    "output_tokens_meta_llm",
    "output_tokens_downstream_llm",
]
run_keys = ["dataset", "model", "seed"]

df = (
    df.groupby(run_keys + ["step"])
    .agg(**{column: (column, "first") for column in token_columns})
    .reset_index()
)

# Running total per run; rows are already ordered by step after the groupby.
for column in token_columns:
    df[f"{column}_cumsum"] = df.groupby(run_keys)[column].cumsum().astype(int)

In [10]:
# Per-run totals: take the maximum of each cumulative sum within a run
# (presumably the value after the last step, given non-negative counts).
cumsum_sources = [
    "input_tokens_meta_llm",
    "input_tokens_downstream_llm",
    "output_tokens_meta_llm",
    "output_tokens_downstream_llm",
]
per_run_spec = {name: (f"{name}_cumsum", "max") for name in cumsum_sources}
df = df.groupby(["dataset", "model", "seed"]).agg(**per_run_spec)

# Mean and standard deviation across seeds for every (dataset, model) pair.
# Column order: downstream before meta, each mean followed by its std.
across_seed_spec = {}
for name in (
    "input_tokens_downstream_llm",
    "input_tokens_meta_llm",
    "output_tokens_downstream_llm",
    "output_tokens_meta_llm",
):
    across_seed_spec[name] = (name, "mean")
    across_seed_spec[f"{name}_std"] = (name, "std")

df = df.groupby(["dataset", "model"]).agg(**across_seed_spec)

In [11]:
# Share of the input tokens that went to the downstream LLM.
# Note: only input tokens enter this ratio; output tokens are not included.
total_input_tokens = df["input_tokens_meta_llm"] + df["input_tokens_downstream_llm"]
df["frac_downstream_llm"] = df["input_tokens_downstream_llm"] / total_input_tokens

In [12]:
# Show the summary table: mean and std of the token counts across seeds for
# every (dataset, model) pair, plus the downstream share of input tokens.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,input_tokens_downstream_llm,input_tokens_downstream_llm_std,input_tokens_meta_llm,input_tokens_meta_llm_std,output_tokens_downstream_llm,output_tokens_downstream_llm_std,output_tokens_meta_llm,output_tokens_meta_llm_std,frac_downstream_llm
dataset,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
agnews,llama,5059157.0,59341.08,110922.0,40833.09,1280261.0,603673.7,47068.333333,14579.958379,0.978545
agnews,mistral,5099964.0,62622.9,60919.0,15822.56,446647.3,80404.23,25632.333333,6780.929754,0.988196
agnews,qwen,5040108.0,13763.74,52794.0,29475.83,265441.0,41774.87,19586.0,10065.847356,0.989634
copa,llama,5040588.0,42309.54,80414.33,16577.27,3497225.0,713490.6,38328.333333,9067.376265,0.984297
copa,mistral,3513026.0,1578963.0,1271295.0,1026011.0,3165999.0,1222857.0,507924.0,404717.957075,0.734279
copa,qwen,4754941.0,519529.8,559809.3,825994.1,2682985.0,1518114.0,183659.333333,248435.85884,0.894669
gsm8k,llama,5220248.0,200106.8,21040.0,4974.822,2008999.0,291005.2,9187.333333,2120.477383,0.995986
gsm8k,mistral,5176992.0,124515.2,15010.0,4050.344,1881618.0,115415.6,6888.333333,2462.084144,0.997109
gsm8k,qwen,5096732.0,40662.72,22109.67,7035.918,2586808.0,232277.1,9164.666667,3535.815936,0.995681
sst-5,llama,5098120.0,60926.21,57428.67,26515.16,1316234.0,643781.8,23813.666667,11862.202002,0.988861


In [42]:
# Export the downstream input-token share as a LaTeX table in percent.
# round(1) only changes the values, not how they are printed, so pass an
# explicit float_format to avoid entries such as "97.900000".
print(df["frac_downstream_llm"].mul(100).round(1).to_latex(float_format="%.1f"))

\begin{tabular}{llr}
\toprule
 &  & frac_downstream_llm \\
dataset & model &  \\
\midrule
\multirow[t]{3}{*}{agnews} & llama & 97.900000 \\
 & mistral & 98.800000 \\
 & qwen & 99.000000 \\
\cline{1-3}
\multirow[t]{3}{*}{copa} & llama & 98.400000 \\
 & mistral & 73.400000 \\
 & qwen & 89.500000 \\
\cline{1-3}
\multirow[t]{3}{*}{gsm8k} & llama & 99.600000 \\
 & mistral & 99.700000 \\
 & qwen & 99.600000 \\
\cline{1-3}
\multirow[t]{3}{*}{sst-5} & llama & 98.900000 \\
 & mistral & 99.100000 \\
 & qwen & 99.100000 \\
\cline{1-3}
\multirow[t]{3}{*}{subj} & llama & 98.600000 \\
 & mistral & 99.000000 \\
 & qwen & 98.800000 \\
\cline{1-3}
\bottomrule
\end{tabular}



In [43]:
# Unweighted average of the downstream input-token share over all
# (dataset, model) rows of the summary table.
df["frac_downstream_llm"].mean()

np.float64(0.966136400759536)