In [None]:
import os
import pandas as pd

# Folders (relative paths) that the cells below scan for ".csv" result files.
# Each CSV's file name, minus the extension, is used as the "Agent" label.
IND_FOLDER_PATH = "outputs/ind"  # individual test results
E2E_FULL_FOLDER_PATH = "outputs/e2e/full"  # full end-to-end results
E2E_CHECKPOINTS_FOLDER_PATH = "outputs/e2e/checkpoints"  # per-checkpoint results
E2E_CHECKPOINT_TASK_PATH = "outputs/e2e/checkpoint_tasks"  # checkpoint results broken down by Category/Task

# Analyze individual tests

In [None]:
# Collect the per-agent result files. os.listdir returns entries in arbitrary
# order, so sort them to make the processing order (and the row order of
# ind_df) reproducible across runs and machines.
csv_filenames = sorted(f for f in os.listdir(IND_FOLDER_PATH) if f.endswith(".csv"))
if not csv_filenames:
    # Fail with the folder name instead of pandas' "No objects to concatenate".
    raise ValueError(f"No .csv files found in {IND_FOLDER_PATH!r}")

# retrieve the output files, tagging every row with the agent it came from
dataframes = []
for filename in csv_filenames:
    full_path = os.path.join(IND_FOLDER_PATH, filename)
    print(f"Processing individual outputs: {filename}")
    df = pd.read_csv(full_path)
    # Strip only the trailing extension, so a name that contains ".csv"
    # somewhere in the middle is not truncated at its first occurrence.
    df["Agent"] = os.path.splitext(filename)[0]
    dataframes.append(df)

# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating every file's own row labels.
ind_df = pd.concat(dataframes, ignore_index=True)

In [None]:
# Sum the pass/total counts for every (Category, Agent) pair and derive the
# pass rate in percent from those sums.
ind_categories = (
    ind_df.groupby(["Category", "Agent"])[["Pass Count", "Total Count"]]
    .sum()
    .assign(**{"Pass Percentage": lambda counts: counts["Pass Count"] / counts["Total Count"] * 100})
)

# One row per (Category, Total Count), one column per agent.
pivot_categories = ind_categories.pivot_table(
    values="Pass Percentage",
    index=["Category", "Total Count"],
    columns="Agent",
)

pivot_categories

In [None]:
# Same aggregation as the category view, but one level finer: sum the counts
# for every (Category, Task, Agent) triple and derive the pass rate in percent.
ind_tasks = (
    ind_df.groupby(["Category", "Task", "Agent"])[["Pass Count", "Total Count"]]
    .sum()
    .assign(**{"Pass Percentage": lambda counts: counts["Pass Count"] / counts["Total Count"] * 100})
)

# One row per (Category, Task, Total Count), one column per agent.
pivot_tasks = ind_tasks.pivot_table(
    values="Pass Percentage",
    index=["Category", "Task", "Total Count"],
    columns="Agent",
)

pivot_tasks

# Analyze E2E tests

## Full

In [None]:
# Collect the per-agent result files. os.listdir returns entries in arbitrary
# order, so sort them to make the processing order reproducible.
csv_filenames = sorted(f for f in os.listdir(E2E_FULL_FOLDER_PATH) if f.endswith(".csv"))
if not csv_filenames:
    # Fail with the folder name instead of pandas' "No objects to concatenate".
    raise ValueError(f"No .csv files found in {E2E_FULL_FOLDER_PATH!r}")

# retrieve the output files, tagging every row with the agent it came from
dataframes = []
for filename in csv_filenames:
    full_path = os.path.join(E2E_FULL_FOLDER_PATH, filename)
    # The message names the E2E full run (it previously said "individual").
    print(f"Processing E2E full outputs: {filename}")
    df = pd.read_csv(full_path)
    # Strip only the trailing extension, so a name that contains ".csv"
    # somewhere in the middle is not truncated at its first occurrence.
    df["Agent"] = os.path.splitext(filename)[0]
    dataframes.append(df)
e2e_full_df = pd.concat(dataframes, ignore_index=True)

# Drop the checkpoint columns, which this full-run view does not use, then
# keep only the rows that actually carry an E2E total count.
e2e_full_df = e2e_full_df.drop(
    columns=[
        "Checkpoint",
        "Checkpoint Full Match",
        "Checkpoint Partial Match",
        "Checkpoint Missing",
        "Checkpoint Total Count",
    ]
)
# subset is passed as a list: older pandas releases reject a bare string here.
e2e_full_df = e2e_full_df.dropna(subset=["E2E Total Count"]).reset_index(drop=True)

In [None]:
# Pass rate (in percent) of each full E2E test row.
e2e_full_df = e2e_full_df.assign(
    **{"Pass Percentage": lambda runs: runs["E2E Pass Count"] / runs["E2E Total Count"] * 100}
)

# One row per (Test, E2E Total Count), one column per agent.
pivot_full = e2e_full_df.pivot_table(
    values="Pass Percentage",
    index=["Test", "E2E Total Count"],
    columns="Agent",
)

pivot_full

## Checkpoints

In [None]:
# Collect the per-agent result files. os.listdir returns entries in arbitrary
# order, so sort them to make the processing order reproducible.
csv_filenames = sorted(f for f in os.listdir(E2E_CHECKPOINTS_FOLDER_PATH) if f.endswith(".csv"))
if not csv_filenames:
    # Fail with the folder name instead of pandas' "No objects to concatenate".
    raise ValueError(f"No .csv files found in {E2E_CHECKPOINTS_FOLDER_PATH!r}")

# retrieve the output files, tagging every row with the agent it came from
dataframes = []
for filename in csv_filenames:
    full_path = os.path.join(E2E_CHECKPOINTS_FOLDER_PATH, filename)
    # The message names the checkpoint run (it previously said "individual").
    print(f"Processing E2E checkpoint outputs: {filename}")
    df = pd.read_csv(full_path)
    # Strip only the trailing extension, so a name that contains ".csv"
    # somewhere in the middle is not truncated at its first occurrence.
    df["Agent"] = os.path.splitext(filename)[0]
    dataframes.append(df)
e2e_checkpoints_df = pd.concat(dataframes, ignore_index=True)

# remove unneeded columns (the E2E test is never run here, and a checkpoint is
# never missing because the run is initialized at that checkpoint)
e2e_checkpoints_df = e2e_checkpoints_df.drop(columns=["E2E Pass Count", "E2E Total Count", "Checkpoint Missing"])
# Keep only rows that describe a checkpoint. subset is passed as a list:
# older pandas releases reject a bare string here.
e2e_checkpoints_df = e2e_checkpoints_df.dropna(subset=["Checkpoint"]).reset_index(drop=True)

In [None]:
# Share (in percent) of runs that fully matched each checkpoint.
e2e_checkpoints_df = e2e_checkpoints_df.assign(
    **{"Full Percentage": lambda cp: cp["Checkpoint Full Match"] / cp["Checkpoint Total Count"] * 100}
)

# One row per (Test, Checkpoint, Checkpoint Total Count), one column per agent.
pivot_checkpoints = e2e_checkpoints_df.pivot_table(
    values="Full Percentage",
    index=["Test", "Checkpoint", "Checkpoint Total Count"],
    columns="Agent",
)

pivot_checkpoints

## Checkpoints (task)

In [None]:
# Collect the per-agent result files. os.listdir returns entries in arbitrary
# order, so sort them to make the processing order (and the row order of
# e2e_tasks_df) reproducible across runs and machines.
csv_filenames = sorted(f for f in os.listdir(E2E_CHECKPOINT_TASK_PATH) if f.endswith(".csv"))
if not csv_filenames:
    # Fail with the folder name instead of pandas' "No objects to concatenate".
    raise ValueError(f"No .csv files found in {E2E_CHECKPOINT_TASK_PATH!r}")

# retrieve the output files, tagging every row with the agent it came from
dataframes = []
for filename in csv_filenames:
    full_path = os.path.join(E2E_CHECKPOINT_TASK_PATH, filename)
    # The message names the checkpoint-task run (it previously said "individual").
    print(f"Processing E2E checkpoint task outputs: {filename}")
    df = pd.read_csv(full_path)
    # Strip only the trailing extension, so a name that contains ".csv"
    # somewhere in the middle is not truncated at its first occurrence.
    df["Agent"] = os.path.splitext(filename)[0]
    dataframes.append(df)

# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating every file's own row labels.
e2e_tasks_df = pd.concat(dataframes, ignore_index=True)

In [None]:
# Sum the correct/total counts for every (Category, Agent) pair and derive
# the pass rate in percent from those sums.
e2e_categories = (
    e2e_tasks_df.groupby(["Category", "Agent"])[["Correct", "Total"]]
    .sum()
    .assign(**{"Pass Percentage": lambda counts: counts["Correct"] / counts["Total"] * 100})
)

# One row per (Category, Total), one column per agent.
pivot_e2e_categories = e2e_categories.pivot_table(
    values="Pass Percentage",
    index=["Category", "Total"],
    columns="Agent",
)

pivot_e2e_categories

In [None]:
# Same aggregation as the category view, but one level finer: sum the counts
# for every (Category, Task, Agent) triple and derive the pass rate in percent.
e2e_tasks = (
    e2e_tasks_df.groupby(["Category", "Task", "Agent"])[["Correct", "Total"]]
    .sum()
    .assign(**{"Pass Percentage": lambda counts: counts["Correct"] / counts["Total"] * 100})
)

# One row per (Category, Task, Total), one column per agent.
pivot_e2e_tasks = e2e_tasks.pivot_table(
    values="Pass Percentage",
    index=["Category", "Task", "Total"],
    columns="Agent",
)

pivot_e2e_tasks