# Result Inspection
Utility functions for inspecting the JSON results of benchmark runs stored in `data/evaluation/50000006`.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import json
from typing_extensions import List

sns.set_theme()

# Resolve the results directory relative to the current working directory
# (os.path.abspath('') is the absolute form of the cwd).
script_dir = os.path.abspath('')
eval_dir = os.path.join(script_dir, "../data/evaluation/50000006")

# We create a pandas dataframe that contains the full data for every experiment,
# and add a column with the filename (without .json suffix)
all_dfs = []
# Sorted so the row order of the combined frame does not depend on the
# arbitrary order in which os.listdir returns directory entries.
for filename in sorted(os.listdir(eval_dir)):
    if not filename.endswith(".json"):
        continue

    with open(os.path.join(eval_dir, filename), 'r', encoding='utf-8') as f:
        content = f.read().strip()
    if not content:  # Check if the file is empty
        print(f"Skipping empty file: {filename}")
        continue

    # Keep the try body minimal: only the JSON parsing is expected to fail.
    try:
        json_obj = json.loads(content)
    except json.JSONDecodeError as e:
        print(f"Skipping invalid JSON file: {filename}, error: {e}")
        continue

    df_tmp = pd.DataFrame(json_obj["data"])
    # Strip only the extension; split('.')[0] would also cut off everything
    # after the first dot of a name such as "run_gpt-4.1_20250101_000000.json".
    df_tmp['filename'] = os.path.splitext(filename)[0]

    # Add each meta field as a column with the same value for all rows.
    # A missing or null "meta" entry is treated as "no metadata".
    meta_info = json_obj.get("meta") or {}
    for key, value in meta_info.items():
        df_tmp[key] = value

    # Unpack the 'results' column into separate columns. The guard lets a file
    # without any 'results' entries (e.g. an empty "data" list) load instead of
    # aborting the whole loop with a KeyError.
    if 'results' in df_tmp.columns:
        results_expanded = df_tmp['results'].apply(pd.Series)
        df_tmp = pd.concat([df_tmp.drop(columns=['results']), results_expanded], axis=1)

    # Ensure 'correct' column is always float (0.0/1.0), even if it was bool or object
    if 'correct' in df_tmp.columns:
        df_tmp['correct'] = df_tmp['correct'].astype(float)
    # Map all other boolean columns to 0/1
    for col in df_tmp.columns:
        if df_tmp[col].dtype == 'bool':
            df_tmp[col] = df_tmp[col].astype(float)
    all_dfs.append(df_tmp)

df = pd.concat(all_dfs, ignore_index=True)
# NOTE(review): the value of this expression is discarded because it is not the
# last statement of the cell; wrap it in display(...) if the preview is wanted.
df.head()
# Print all columns in the DataFrame
print("Columns in the DataFrame:")
print(df.columns.tolist())

In [None]:
def get_latest_runs(df, run_type, n=1, required_keywords=None, retrieval_llm=None, coding_llm=None, retrieval_only=False, analysis_only=False, iterative_only=False):
    """
    Return the names of the ``n`` most recent runs that match the given filters.

    A run is identified by its value in the 'filename' column. Run-level
    properties are read from the first row of each run. Recency is determined
    by the last two underscore-separated parts of the run name (compared as
    strings), and the result is ordered oldest to newest.

    Args:
        df: DataFrame with a string 'filename' column plus the columns used by
            whichever filters are enabled.
        run_type: Prefix the run name must start with ("" matches every run).
        n: Maximum number of runs to return; values <= 0 yield an empty list.
        required_keywords: Substrings that must all occur in the run name.
        retrieval_llm: If set, keep runs whose 'retrieval_llm' equals this value.
        coding_llm: If set, keep runs whose 'coding_llm' equals this value.
        retrieval_only: If true, keep runs whose 'only_retrieval' equals 1.0.
        analysis_only: If true, keep runs whose 'only_analysis' equals 1.0.
        iterative_only: If true, keep runs whose 'analyzer' is 'iterative_local'.

    Returns:
        List of at most ``n`` run names, sorted by timestamp (ties by name).
    """
    # sorted_runs[-0:] would be the *whole* list, so handle n <= 0 explicitly.
    if n <= 0:
        return []

    candidates = df[df['filename'].str.startswith(run_type)]
    # One row per run (its first row), indexed by run name, so each filter is a
    # cheap lookup instead of re-filtering the full frame for every run.
    first_rows = candidates.drop_duplicates(subset='filename', keep='first').set_index('filename')
    runs = list(first_rows.index)

    # (enabled, column, expected value) for every column-based filter.
    column_filters = [
        (retrieval_llm, 'retrieval_llm', retrieval_llm),
        (coding_llm, 'coding_llm', coding_llm),
        (retrieval_only, 'only_retrieval', 1.0),
        (analysis_only, 'only_analysis', 1.0),
        (iterative_only, 'analyzer', 'iterative_local'),
    ]
    for enabled, column, expected in column_filters:
        if enabled:
            runs = [run for run in runs if first_rows.at[run, column] == expected]

    if required_keywords:
        runs = [run for run in runs if all(keyword in run for keyword in required_keywords)]

    def timestamp_of(run):
        # Last two "_"-separated parts, e.g. "20250710_114251"; rsplit keeps
        # this safe for names that contain fewer than two underscores.
        return "_".join(run.rsplit("_", 2)[-2:])

    sorted_runs = sorted(runs, key=lambda run: (timestamp_of(run), run))
    return sorted_runs[-n:]

In [None]:
def table_comparison_retrieval(df: pd.DataFrame, experiment_names: List[str], columns: List[str]) -> pd.DataFrame:
    """
    Build a per-experiment summary table of retrieval metrics.

    'recall', 'precision' and 'validation_accuracy' are reported as the mean in
    percent, rounded to one decimal. Every other requested column is reported
    as its median together with '<col>_q1' / '<col>_q3' (25% / 75% quantiles),
    rounded per column. Each row also carries the run's first 'retrieval_llm'
    value and the mean of 'retrieval_nr_searches'.

    The result has one row per experiment name and the index name 'Experiment'.
    """
    percentage_metrics = ('recall', 'precision', 'validation_accuracy')
    # Number of decimals kept per metric; anything not listed uses one decimal.
    precision_by_metric = {
        "retrieval_latency": 1,
        "retrieval_tokens": 0,
        "retrieval_cost_usd": 3,
    }

    rows = {}
    for name in experiment_names:
        subset = df[df['filename'] == name]
        row = {}

        for metric in columns:
            values = subset[metric]
            if metric in percentage_metrics:
                row[metric] = round(values.mean() * 100, 1)
                continue

            ndigits = precision_by_metric.get(metric, 1)
            # Median plus interquartile range, in the order they become columns.
            summary = {
                metric: values.median(),
                f'{metric}_q1': values.quantile(0.25),
                f'{metric}_q3': values.quantile(0.75),
            }
            for key, stat in summary.items():
                rounded = round(stat, ndigits)
                # Zero-decimal metrics are stored as plain ints
                row[key] = int(rounded) if ndigits == 0 else rounded

        # Add LLM information
        row['retrieval_llm'] = subset['retrieval_llm'].iloc[0]
        row['retrieval_nr_searches'] = subset['retrieval_nr_searches'].mean()
        rows[name] = row

    table = pd.DataFrame(rows).T
    table.index.name = 'Experiment'
    return table

# Retrieval-only runs compared in the table below.
flat_retrieval_runs = [
    'agentic_retrieval_only_retrieval_gpt4o_retrieval_20250710_114251',
    'agentic_retrieval_only_retrieval_41_20250715_113532',
    'agentic_retrieval_only_retrieval_gpt41mini_20250715_113056',
    'agentic_retrieval_only_retrieval_o1_parallel_20250710_135131',
    'agentic_retrieval_only_retrieval_20250823_153832',
    'agentic_retrieval_only_retrieval_25flash_20250715_113302',
    'agentic_retrieval_only_retrieval_gemini25pro_20250710_134756',
    'agentic_retrieval_only_retrieval_mistral_retrieval_fixed_20250710_134614',
    'agentic_retrieval_only_retrieval_20250814_111731',
    'agentic_retrieval_only_retrieval_20250814_113251',
]

table_comparison_retrieval(
    df,
    flat_retrieval_runs,
    ['recall', 'precision', 'validation_accuracy', 'retrieval_latency', 'retrieval_tokens', 'retrieval_cost_usd'],
)

In [None]:
# Keep the retrieval comparison table in a variable and show it as the cell result.
retrieval_result_df = table_comparison_retrieval(
    df,
    flat_retrieval_runs,
    columns=[
        'recall',
        'precision',
        'validation_accuracy',
        'retrieval_latency',
        'retrieval_tokens',
        'retrieval_cost_usd',
    ],
)
retrieval_result_df

In [None]:
# Runs compared by retrieval approach (run-name prefix) and search strategy.
search_strategy_and_approach_runs = [
    'verified_retrieval_bm25_only_retrieval_20250806_143025',
    'verified_retrieval_only_retrieval_20250806_133834',
    'verified_retrieval_hybrid_only_retrieval_20250806_145204',
    'agentic_retrieval_bm25_only_retrieval_20250806_162133',
    'agentic_retrieval_only_retrieval_41_20250715_113532',
    'agentic_retrieval_hybrid_only_retrieval_20250806_152631',
]

table_comparison_retrieval(
    df,
    search_strategy_and_approach_runs,
    ['recall', 'precision', 'validation_accuracy', 'retrieval_latency', 'retrieval_tokens', 'retrieval_cost_usd'],
)

In [None]:
def table_comparison_analysis(df: pd.DataFrame, experiment_names: List[str], columns: List[str]) -> pd.DataFrame:
    """
    Build a per-experiment summary table of analysis (coding) metrics.

    'correct' is reported as the mean in percent over solvable tasks only,
    i.e. rows whose 'reference_titles' entry is non-empty. Every other
    requested column is reported as its median together with '<col>_q1' /
    '<col>_q3' (25% / 75% quantiles). 'coding_tokens' statistics are rounded to
    the nearest whole number and stored as ints; other statistics are left
    unrounded. Each row also carries the run's first 'coding_llm' and
    'analyzer' values.

    Args:
        df: DataFrame with a 'filename' column identifying the experiment of
            each row, plus the requested metric columns.
        experiment_names: Values of 'filename' to summarise, one row each.
        columns: Metric columns to include.

    Returns:
        DataFrame indexed by experiment name (index name 'Experiment').
    """
    comparison_data = {}

    for exp in experiment_names:
        df_exp = df[df['filename'] == exp]
        stats = {}

        for col in columns:
            if col == 'correct':
                # Only consider solvable tasks for correctness
                solvable_tasks = df_exp[df_exp['reference_titles'].apply(lambda x: len(x) != 0)]
                stats[col] = solvable_tasks[col].mean() * 100
                continue

            # Median plus interquartile range
            summary = {
                col: df_exp[col].median(),
                f'{col}_q1': df_exp[col].quantile(0.25),
                f'{col}_q3': df_exp[col].quantile(0.75),
            }
            if col == "coding_tokens":
                # Token counts are whole numbers. Round before casting so that
                # e.g. 100.75 becomes 101 rather than being truncated to 100.
                summary = {key: int(round(value)) for key, value in summary.items()}
            stats.update(summary)

        # Add LLM information
        stats['coding_llm'] = df_exp['coding_llm'].iloc[0]
        stats['analyzer'] = df_exp['analyzer'].iloc[0]
        comparison_data[exp] = stats

    comparison_df = pd.DataFrame(comparison_data).T
    comparison_df.index.name = 'Experiment'
    return comparison_df

# Coding LLMs for which analysis-only runs are collected.
analysis_llms = [
    'mistral-large',
    'gpt-4o',
    'gpt-o1',
    'gpt-4.1',
    'gemini-2.5-flash',
    'gpt-4.1-mini',
    'gemini-2.5-pro',
    'gpt-5',
    'meta-llama/llama-4-maverick',
    'openai/gpt-oss-120b',
    'mistral-codestral',
]
# One list of run names per coding LLM (n=5, analysis-only, any run-name prefix).
analysis_runs = [
    get_latest_runs(df, "", n=5, coding_llm=model, analysis_only=True, iterative_only=False)
    for model in analysis_llms
]
# Flatten the per-LLM lists into a single list of run names.
flat_analysis_runs = [name for per_model in analysis_runs for name in per_model]

table_comparison_analysis(
    df,
    flat_analysis_runs,
    ['correct', 'coding_latency', 'coding_tokens', 'coding_cost_usd'],
)

In [None]:
# Two analysis-only runs compared side by side.
single_pass_vs_iterative_runs = [
    'agentic_retrieval_only_analysis_20250812_104751',
    'agentic_retrieval_only_analysis_gpt41_iterv2_20250721_172254',
]
table_comparison_analysis(
    df,
    single_pass_vs_iterative_runs,
    ['correct', 'coding_latency', 'coding_tokens', 'coding_cost_usd'],
)

In [None]:
# Analysis-only runs used for the LLM comparison table.
llm_comparison_runs_analysis = [
    'agentic_retrieval_only_analysis_iter4o_20250812_140604',
    'agentic_retrieval_only_analysis_gpt41_iterv2_20250721_172254',
    'agentic_retrieval_only_analysis_gpt41mini_20250715_135007',
    'agentic_retrieval_only_analysis__gpt-o1_simple_local_v2_20250813_010136',
    'agentic_retrieval_only_analysis_20250823_171529',
    'agentic_retrieval_only_analysis_gemini25flash_20250714_141142',
    'agentic_retrieval_only_analysis_20250813_120608',
    'agentic_retrieval_only_analysis_20250813_155235',
    'agentic_retrieval_only_analysis__gpt-oss-120b_simple_local_v2_20250813_120323',
    'agentic_retrieval_only_analysis__llama-4-maverick_simple_local_v2_20250813_122444',
]

table_comparison_analysis(
    df,
    llm_comparison_runs_analysis,
    columns=['correct', 'coding_latency', 'coding_tokens', 'coding_cost_usd'],
)

In [None]:
# Per-row totals of the retrieval and coding parts for latency, tokens and cost;
# a missing (NaN) part counts as 0.
df['total_latency'] = df['retrieval_latency'].fillna(0).add(df['coding_latency'].fillna(0))
df['total_tokens'] = df['retrieval_tokens'].fillna(0).add(df['coding_tokens'].fillna(0))
df['total_cost'] = df['retrieval_cost_usd'].fillna(0).add(df['coding_cost_usd'].fillna(0))

In [None]:
def table_comparison_endtoend(df: pd.DataFrame, experiment_names: List[str], columns: List[str]) -> pd.DataFrame:
    """
    Build a per-experiment summary table for end-to-end runs.

    'recall', 'precision' and 'validation_accuracy' are reported as the mean in
    percent. 'correct' is the mean in percent over solvable tasks only, i.e.
    rows whose 'reference_titles' entry is non-empty. Any other requested
    column is reported as its median.

    Each row additionally contains the run's first 'retrieval_llm' and
    'coding_llm' values, the mean of 'retrieval_nr_searches', and the median
    and 25% / 75% quantiles of 'total_latency' (1 decimal), 'total_tokens'
    (rounded to the nearest int) and 'total_cost' (3 decimals).

    Args:
        df: DataFrame with a 'filename' column identifying the experiment of
            each row. It must already contain the 'total_latency',
            'total_tokens' and 'total_cost' columns; they are read here, not
            computed.
        experiment_names: Values of 'filename' to summarise, one row each.
        columns: Metric columns to include.

    Returns:
        DataFrame indexed by experiment name (index name 'Experiment').
    """
    percentage_columns = ('recall', 'precision', 'validation_accuracy')
    comparison_data = {}

    for exp in experiment_names:
        df_exp = df[df['filename'] == exp]
        stats = {}

        for col in columns:
            if col in percentage_columns:
                stats[col] = df_exp[col].mean() * 100
            elif col == 'correct':
                # Only consider solvable tasks for correctness
                solvable_tasks = df_exp[df_exp['reference_titles'].apply(lambda x: len(x) != 0)]
                stats[col] = solvable_tasks[col].mean() * 100
            else:
                stats[col] = df_exp[col].median()

        # Add LLM information
        stats['retrieval_llm'] = df_exp['retrieval_llm'].iloc[0]
        stats['retrieval_nr_searches'] = df_exp['retrieval_nr_searches'].mean()
        stats['coding_llm'] = df_exp['coding_llm'].iloc[0]

        # Median and interquartile range of the per-row totals
        # (retrieval + coding).
        for total_col, decimals in (('total_latency', 1), ('total_tokens', 0), ('total_cost', 3)):
            summary = {
                total_col: df_exp[total_col].median(),
                f'{total_col}_q1': df_exp[total_col].quantile(0.25),
                f'{total_col}_q3': df_exp[total_col].quantile(0.75),
            }
            for key, value in summary.items():
                if decimals == 0:
                    # Round before casting so that e.g. 100.75 becomes 101
                    # instead of being truncated to 100.
                    stats[key] = int(round(value))
                else:
                    stats[key] = round(value, decimals)

        comparison_data[exp] = stats

    comparison_df = pd.DataFrame(comparison_data).T
    comparison_df.index.name = 'Experiment'
    return comparison_df


# Runs included in the stability comparison.
stability_runs = [
    # 'agentic_retrieval_stability__20250815_053614',
    'agentic_retrieval_stability2_rephrased_0_20250815_150752',
    'agentic_retrieval_stability2_rephrased_1_20250815_161922',
    'agentic_retrieval_stability2_rephrased_2_20250815_172506',
    'agentic_retrieval_stability2_rephrased_3_20250816_112856',
    'agentic_retrieval_stability2_rephrased_4_20250816_123217',
    'agentic_retrieval_stability__20250814_194413',
]

# Runs whose names mark them as translated (en / fr / it).
language_runs = [
    'agentic_retrieval_stability_translated_en_20250815_020058',
    'agentic_retrieval_stability_translated_fr_20250815_025910',
    'agentic_retrieval_stability_translated_it_20250815_043557',
]

print("Stability comparison")
display(
    table_comparison_endtoend(
        df,
        stability_runs,
        ['correct', 'recall', 'precision', 'validation_accuracy'],
    )
)

### Inspect answers in detail

In [None]:
from IPython.display import Markdown, HTML

def display_answer_comparison(df, experiment_name, incorrect_only=True, id_start=0):
    """
    Render question, titles and answers of one experiment for manual inspection.

    Only rows with a non-empty 'reference_titles' entry are shown. For each row
    the question, the reference vs. predicted titles (colour-coded by
    precision/recall) and, where present, the reference and predicted answers
    plus the 'correct' verdict are displayed.

    Args:
        df: DataFrame with the columns 'filename', 'reference_titles',
            'correct', 'id', 'question', 'precision', 'recall',
            'predicted_titles', 'reference_answer' and 'predicted_answer'.
        experiment_name: Value of 'filename' selecting the experiment.
        incorrect_only: If true, show only rows whose 'correct' is not 1.0.
        id_start: Rows whose numeric id prefix (the part of 'id' before the
            first underscore) is below this value are skipped.

    Returns:
        None. All output goes through IPython's display() and print().
    """
    display(HTML(f"<h2>Answer comparison for {experiment_name}</h2>"))

    df_exp = df[df['filename'] == experiment_name]
    #predicted_not_nan = df_exp[df_exp['predicted_answer'].notna()]
    # keep only where relevant datasets are present
    solvable = df_exp[df_exp['reference_titles'].apply(lambda x: len(x) != 0)]

    if incorrect_only:
        solvable = solvable[solvable['correct'] != 1.0]
        print(f"Displaying {len(solvable)} incorrect predictions for {experiment_name}.")
    else:
        print(f"Displaying {len(solvable)} predictions for {experiment_name}.")

    for _, row in solvable.iterrows():

        if int(row['id'].split("_")[0]) < id_start:
            continue

        display(Markdown(f"**Question:**\n\n{row['question']}"))

        # green: perfect retrieval; orange: everything found but with extra
        # titles; red: at least one reference title missed.
        color = 'red'
        if row['precision'] == 1.0 and row['recall'] == 1.0:
            color = 'green'
        elif row['precision'] != 1.0 and row['recall'] == 1.0:
            color = 'orange'

        # Pull the values out first: reusing the f-string's own quote character
        # inside a replacement field is a SyntaxError before Python 3.12.
        reference_titles = row['reference_titles']
        predicted_titles = row['predicted_titles']
        display(HTML(
            f'<span style="color: {color}">Reference Titles: {reference_titles}'
            f' | Predicted Titles: {predicted_titles}</span>'
        ))

        has_reference = pd.notna(row['reference_answer'])
        has_prediction = pd.notna(row['predicted_answer'])

        if has_reference:
            display(Markdown(f"**Reference Answer:**\n\n{row['reference_answer']}"))
        if has_prediction:
            display(Markdown(f"**Predicted Answer:**\n\n{row['predicted_answer']}"))

        if has_reference and has_prediction:
            if row["correct"] == 1.0:
                display(HTML('<span style="color: green; font-weight: bold;">LLM Judge: Correct prediction</span>'))
            else:
                display(HTML('<span style="color: red; font-weight: bold;">LLM Judge: Incorrect prediction</span>'))

            # if str(row["reference_answer"]).strip() in str(row["predicted_answer"]):
            #     display(HTML('<span style="color: green;">Substring Match: Correct prediction</span>'))
            # else:
            #     display(HTML('<span style="color: red;">Substring Match: Incorrect prediction</span>'))
        display(HTML('<hr style="margin: 0px 0;">'))

# Show the incorrect predictions of every stability run, each followed by two
# horizontal rules as a separator.
for run in stability_runs:
    display_answer_comparison(df, run, incorrect_only=True)
    display(
        HTML('<hr style="margin: 0px 0;">'),
        HTML('<hr style="margin: 0px 0;">'),
    )