In [None]:
import os
import pandas as pd
import numpy as np

# Folder scanned for the individual-test result CSVs; each file's name (minus
# ".csv") is used as the "Agent" label of its rows.
IND_FOLDER_PATH = "outputs/ind"
# Folder scanned for the full end-to-end (E2E) result CSVs.
E2E_FULL_FOLDER_PATH = "outputs/e2e/full"
# Folder scanned for the E2E result CSVs that are aggregated by category/task/test.
E2E_FULL_TASKS_FOLDER_PATH = "outputs/e2e/full_tasks"

# Common total that each individual test's pass/total counts are rescaled to
# before tests are summed into task- and category-level percentages.
# NOTE(review): presumably the expected number of trials per test — confirm.
NUM_IND_TRIALS = 8

# Analyze individual tests

In [None]:
# Collect the result files for the individual tests. os.listdir() returns
# entries in arbitrary order, so sort them to keep the row order of the
# concatenated frame reproducible from run to run.
csv_filenames = sorted(f for f in os.listdir(IND_FOLDER_PATH) if f.endswith(".csv"))

# Read every CSV and tag its rows with the agent the file belongs to.
dataframes = []
for filename in csv_filenames:
    full_path = os.path.join(IND_FOLDER_PATH, filename)
    print(f"Processing individual outputs: {filename}")
    df = pd.read_csv(full_path)
    # Agent name = file name without its final ".csv" extension. splitext only
    # strips the extension, so a ".csv" occurring earlier in the name is kept.
    df["Agent"] = os.path.splitext(filename)[0]
    dataframes.append(df)
ind_df = pd.concat(dataframes)

In [None]:
# Sum pass/total counts per (Category, Task, Test, Agent) and turn the group
# keys back into ordinary columns.
normalized_ind_test = (
    ind_df
    .groupby(["Category", "Task", "Test", "Agent"])[["Pass Count", "Total Count"]]
    .sum()
    .reset_index()
)

# Dividing both counts by (Total Count / NUM_IND_TRIALS) rescales every test to
# a total of NUM_IND_TRIALS, so tests with different raw totals contribute
# equally once they are summed per task or category.
count_per_trial = normalized_ind_test["Total Count"] / NUM_IND_TRIALS
normalized_ind_test["Normalized Pass"] = normalized_ind_test["Pass Count"].div(count_per_trial)
normalized_ind_test["Normalized Total"] = normalized_ind_test["Total Count"].div(count_per_trial)

normalized_ind_test

In [None]:
# Raw pass/total counts for every (Category, Task, Test, Agent) combination.
ind_all = (
    ind_df
    .groupby(["Category", "Task", "Test", "Agent"])[["Pass Count", "Total Count"]]
    .sum()
)

# Fraction of passing runs, reused for both the percentage and its margin of error.
pass_rate = ind_all["Pass Count"] / ind_all["Total Count"]
ind_all["Pass Percentage"] = pass_rate * 100

# Half-width of the normal-approximation interval around the pass rate, using
# z = 1.96 (the conventional two-sided 95% value).
# NOTE: 'moe' is expressed as a proportion; unlike "Pass Percentage" it is not
# multiplied by 100.
ind_all["moe"] = 1.96 * np.sqrt(pass_rate * (1 - pass_rate) / ind_all["Total Count"])

# Rows: (Category, Task, Test, Total Count); columns: metric x Agent.
pivot_all = ind_all.pivot_table(
    index=["Category", "Task", "Test", "Total Count"],
    columns="Agent",
    values=["Pass Percentage", "moe"],
)

pivot_all

In [None]:
# Task-level roll-up: add up the normalized counts of every test in a task,
# separately for each agent.
ind_tasks = (
    normalized_ind_test
    .groupby(["Category", "Task", "Agent"])[["Normalized Pass", "Normalized Total"]]
    .sum()
)
ind_tasks["Normalized Pass Percentage"] = (
    ind_tasks["Normalized Pass"].div(ind_tasks["Normalized Total"]).mul(100)
)

# Rows: (Category, Task, Normalized Total); one column per agent.
pivot_tasks = ind_tasks.pivot_table(
    index=["Category", "Task", "Normalized Total"],
    columns="Agent",
    values="Normalized Pass Percentage",
)

pivot_tasks

In [None]:
# Category-level roll-up: add up the normalized counts of every test in a
# category, separately for each agent.
ind_categories = (
    normalized_ind_test
    .groupby(["Category", "Agent"])[["Normalized Pass", "Normalized Total"]]
    .sum()
)
ind_categories["Normalized Pass Percentage"] = (
    ind_categories["Normalized Pass"].div(ind_categories["Normalized Total"]).mul(100)
)

# Rows: (Category, Normalized Total); one column per agent.
pivot_categories = ind_categories.pivot_table(
    index=["Category", "Normalized Total"],
    columns="Agent",
    values="Normalized Pass Percentage",
)

pivot_categories

# Analyze E2E tests

## Full

In [None]:
# Collect the result files of the full E2E runs. os.listdir() returns entries
# in arbitrary order, so sort them to keep the row order of the concatenated
# frame reproducible from run to run.
csv_filenames = sorted(f for f in os.listdir(E2E_FULL_FOLDER_PATH) if f.endswith(".csv"))

# Read every CSV and tag its rows with the agent the file belongs to.
dataframes = []
for filename in csv_filenames:
    full_path = os.path.join(E2E_FULL_FOLDER_PATH, filename)
    # The message names the E2E full outputs, which is what this loop reads.
    print(f"Processing E2E full outputs: {filename}")
    df = pd.read_csv(full_path)
    # Agent name = file name without its final ".csv" extension. splitext only
    # strips the extension, so a ".csv" occurring earlier in the name is kept.
    df["Agent"] = os.path.splitext(filename)[0]
    dataframes.append(df)
e2e_full_df = pd.concat(dataframes)

# Keep only the E2E-level data: drop the checkpoint columns, then drop the rows
# that have no E2E total count and renumber the remaining rows from 0.
e2e_full_df = e2e_full_df.drop(columns=["Checkpoint", "Checkpoint Full Match", "Checkpoint Partial Match", "Checkpoint Missing", "Checkpoint Total Count"])
# subset is passed as a list, the form accepted by every pandas version.
e2e_full_df = e2e_full_df.dropna(subset=["E2E Total Count"]).reset_index(drop=True)

In [None]:
# Share of passing E2E runs per row, as a percentage.
e2e_full_df["Pass Percentage"] = (
    e2e_full_df["E2E Pass Count"].div(e2e_full_df["E2E Total Count"]).mul(100)
)

# Rows: (Test, E2E Total Count); one column per agent. pivot_table's default
# aggregation (mean) applies if several rows share the same key.
pivot_full = e2e_full_df.pivot_table(
    index=["Test", "E2E Total Count"],
    columns="Agent",
    values="Pass Percentage",
)

pivot_full

## Full tasks

In [None]:
# Collect the result files of the E2E full-task runs. os.listdir() returns
# entries in arbitrary order, so sort them to keep the row order of the
# concatenated frame reproducible from run to run.
csv_filenames = sorted(f for f in os.listdir(E2E_FULL_TASKS_FOLDER_PATH) if f.endswith(".csv"))

# Read every CSV and tag its rows with the agent the file belongs to.
dataframes = []
for filename in csv_filenames:
    full_path = os.path.join(E2E_FULL_TASKS_FOLDER_PATH, filename)
    # The message names the E2E full-task outputs, which is what this loop reads.
    print(f"Processing E2E full-task outputs: {filename}")
    df = pd.read_csv(full_path)
    # Agent name = file name without its final ".csv" extension. splitext only
    # strips the extension, so a ".csv" occurring earlier in the name is kept.
    df["Agent"] = os.path.splitext(filename)[0]
    dataframes.append(df)
e2e_tasks_df = pd.concat(dataframes)

In [None]:
# Display the combined E2E full-task rows from all agent CSVs for inspection.
e2e_tasks_df

In [None]:
# Category-level E2E results: total correct/total counts per agent, plus the
# resulting pass percentage.
e2e_categories = (
    e2e_tasks_df
    .groupby(["Category", "Agent"])[["Correct", "Total"]]
    .sum()
    .assign(**{"Pass Percentage": lambda counts: counts["Correct"] / counts["Total"] * 100})
)

e2e_categories

In [None]:
# Task-level E2E results: total correct/total counts per (Category, Task, Agent),
# plus the resulting pass percentage.
e2e_tasks = (
    e2e_tasks_df
    .groupby(["Category", "Task", "Agent"])[["Correct", "Total"]]
    .sum()
    .assign(**{"Pass Percentage": lambda counts: counts["Correct"] / counts["Total"] * 100})
)

e2e_tasks

In [None]:
# Test-level E2E results: total correct/total counts per
# (Category, Task, Test, Agent), plus the resulting pass percentage.
e2e_tests = (
    e2e_tasks_df
    .groupby(["Category", "Task", "Test", "Agent"])[["Correct", "Total"]]
    .sum()
    .assign(**{"Pass Percentage": lambda counts: counts["Correct"] / counts["Total"] * 100})
)

e2e_tests