In [12]:
import os
import pandas as pd
model_name = "openrouter/google/gemini-2.5-pro-preview-03-25"
model_name = model_name.replace('/','_').replace('.','_')

scores = []
for fp in os.listdir('results'):
    if fp.startswith(model_name):
        if 'metrics' in fp:
            scores.append(pd.read_csv(f'results/{fp}'))
        else:
            break

In [267]:
base = pd.read_csv(
    "results_scaling/o4_med_2_resp_df.csv"
)

In [271]:
import os, ast, random
import pandas as pd
import numpy as np
from src import compute_all_metrics

# model_name = "openrouter_qwen_qwen2_5-vl-32b-instruct"
model_name = "o4_high"

# model_name = "gpt-4.1"
# model_name = "openrouter_qwen_qwen3-235b"
# model_name = "openrouter_google_gemini-2_0-flash-lite-001"
# model_name = "openrouter_google_gemini-2_5"
# model_name = "openrouter_qwen_qwen-2_5-vl-72b-instruct"

results_path = "results_scaling"
safe_name  = model_name.replace('/', '_').replace('.', '_')
group_cols = ['doi/arxiv_id','error_category','error_severity','paper_category']

def parse_list(cell):
    if pd.isna(cell): return []
    if isinstance(cell, str):
        try:    return ast.literal_eval(cell)
        except: return []
    return list(cell) if hasattr(cell, '__iter__') else []

# 1) Build a long DataFrame of (error, file, detected)
rows = []
dfs = []
for fn in os.listdir(results_path ):
    if (fn.startswith(safe_name)) & ('metrics' in fn):
        # df = pd.read_csv(os.path.join(results_path , fn))
        # dfs.append(df)
        continue
    if not fn.startswith(safe_name): 
        continue
    print(fn)
    df = pd.read_csv(os.path.join(results_path, fn))
    df = df[df['doi/arxiv_id'].isin(base['doi/arxiv_id'])]
    metrics = compute_all_metrics(df)
    dfs.append(metrics)
    # dfs.append(df)
    if 'matches' not in df.columns:
        continue

    df['matches']  = df['matches'].apply(parse_list)
    df['file']     = fn
    # True if any match in that run for that error
    df['detected'] = df['matches'].apply(lambda L: len(L) > 0)
    rows.append(df[group_cols + ['file','detected']])

detect_df = pd.concat(rows, ignore_index=True)

# 2) Pivot to a full detection matrix (fills undetected with False)
det = detect_df.pivot_table(
    index=group_cols,
    columns='file',
    values='detected',
    fill_value=False
)

file_list   = det.columns.tolist()
total_errors = det.shape[0]

# 3) pass@N estimator
def pass_at_n(detection_df, n):
    chosen = random.sample(file_list, n)
    # did any of the N runs catch each error?
    return detection_df[chosen].any(axis=1).mean()

# 4) Bootstrap 95% CIs
np.random.seed(0)
n_boot = 1000
Ns     = [1,2]
boot    = {N: [pass_at_n(det, N) for _ in range(n_boot)] for N in Ns}

# 5) Print point estimates + CIs
ci = {
    N: (
        np.percentile(vals, 2.5),   # lower 0.5th percentile
        np.percentile(vals, 97.5)   # upper 99.5th percentile
    )
    for N, vals in boot.items()
}

# 5) Print point estimates + 99% CIs
# 5) Print point estimates + 99% CIs + bootstrap std
for N in Ns:
    vals = boot[N]
    p   = np.mean(vals)
    std = np.std(vals, ddof=1)           # bootstrap‐sample standard deviation
    lo, hi = ci[N]
    print(f"pass@{N}: {p:.3f} (99% CI: {lo:.3f}–{hi:.3f}), std: {std:.3f}")


o4_high_resp_df.csv
o4_high_1_resp_df.csv
o4_high_resp_df_3.csv
pass@1: 0.150 (99% CI: 0.121–0.212), std: 0.042
pass@2: 0.280 (99% CI: 0.212–0.333), std: 0.052


In [None]:
o4_low_resp_df.csv
o4_low_1_resp_df.csv
o4_low_2_resp_df.csv

low
pass@1: 0.082 (99% CI: 0.030–0.152), std: 0.052
med
pass@1: 0.121 (99% CI: 0.121–0.121), std: 0.000
high
pass@1: 0.163 (99% CI: 0.121–0.212), std: 0.038

pass@2: 0.140 (99% CI: 0.091–0.182), std: 0.038
pass@3: 0.182 (99% CI: 0.182–0.182), std: 0.000

precision_micro    0.114846
recall_micro       0.101852
PPR                0.050505

precision_micro    0.117616
recall_micro       0.105165
PPR                0.046289

o4_med_resp_df.csv
o4_med_1_resp_df.csv
o4_med_2_resp_df.csv
pass@1: 0.121 (99% CI: 0.121–0.121), std: 0.000
pass@2: 0.182 (99% CI: 0.182–0.182), std: 0.000
pass@3: 0.242 (99% CI: 0.242–0.242), std: 0.000

precision_micro    0.080899
recall_micro       0.129630
PPR                0.101010

precision_micro    0.002132
recall_micro       0.016038
PPR                0.017495

o4_high_resp_df.csv
o4_high_1_resp_df.csv
o4_high_2_resp_df.csv
pass@1: 0.150 (99% CI: 0.121–0.212), std: 0.042
pass@2: 0.280 (99% CI: 0.212–0.333), std: 0.052
pass@3: 0.212 (99% CI: 0.212–0.212), std: 0.000

precision_micro    0.092290
recall_micro       0.129630
PPR                0.070707

precision_micro    0.057029
recall_micro       0.084863
PPR                0.063081

In [261]:
low
pass@1: 0.121 (99% CI: 0.121–0.121), std: 0.000
pass@2: 0.140 (99% CI: 0.091–0.182), std: 0.038

med
pass@1: 0.121 (99% CI: 0.121–0.121), std: 0.000
pass@2: 0.182 (99% CI: 0.182–0.182), std: 0.000

high
pass@1: 0.150 (99% CI: 0.121–0.212), std: 0.042
pass@2: 0.280 (99% CI: 0.212–0.333), std: 0.052

Unnamed: 0,N,precision_micro,recall_micro,precision_macro,recall_macro,PPR,per_paper
0,33,0.035714,0.027778,0.0,0.015152,0.0,"[{'doi/arxiv_id': '10.5539/jsd.v17n6p137', 'k_..."
1,33,0.25,0.222222,0.0,0.227273,0.090909,"[{'doi/arxiv_id': '10.5539/jsd.v17n6p137', 'k_..."
2,33,0.058824,0.055556,0.007576,0.060606,0.060606,"[{'doi/arxiv_id': '10.5539/jsd.v17n6p137', 'k_..."


In [259]:
pd.DataFrame(dfs).dropna(subset=['N'])[['precision_micro','recall_micro','PPR']].mean()

precision_micro    0.114846
recall_micro       0.101852
PPR                0.050505
dtype: float64

In [260]:
pd.DataFrame(dfs).dropna(subset=['N'])[['precision_micro','recall_micro','PPR']].std()

precision_micro    0.117616
recall_micro       0.105165
PPR                0.046289
dtype: float64

In [245]:
det = det.reset_index()
det['paper_category'] = det['paper_category'].apply(
    lambda x: eval(x)[0]
)
det['error_category'] = det['error_category'].apply(
    lambda x: 'Data Inconsistency' if 'data inconsistency' in x.lower() else x
)

In [246]:
import pandas as pd
import numpy as np
import random

# Assuming det is already loaded with boolean flags for each run in run_cols
# Identify run columns (all except metadata and mean_score)
meta_cols = ['file', 'doi/arxiv_id', 'error_category', 'error_severity', 'paper_category', 'mean_score']
run_cols = [c for c in det.columns if c not in meta_cols]

def bootstrap_pass_at_n(df_flags, n, n_boot=1000, seed=0):
    random.seed(seed)
    vals = []
    for _ in range(n_boot):
        chosen = random.sample(run_cols, n)
        vals.append(df_flags[chosen].any(axis=1).mean())
    return np.mean(vals), np.std(vals, ddof=1), np.percentile(vals, 0.5), np.percentile(vals, 99.5)

# Compute pass@k by error_category
error_stats = []
for cat, group in det.groupby('error_category'):
    stats = {'category': cat}
    for N in [1,2,4]:
        mean, std, lo, hi = bootstrap_pass_at_n(group[run_cols], N)
        stats.update({
            f'pass@{N}_mean': mean,
            f'pass@{N}_std': std,
        })
    error_stats.append(stats)
error_df = pd.DataFrame(error_stats)

# Compute pass@k by paper_category
paper_stats = []
for cat, group in det.groupby('paper_category'):
    stats = {'category': cat}
    for N in [1,2,4]:
        mean, std, lo, hi = bootstrap_pass_at_n(group[run_cols], N)
        stats.update({
            f'pass@{N}_mean': mean,
            f'pass@{N}_std': std,
        })
    paper_stats.append(stats)
paper_df = pd.DataFrame(paper_stats)

In [247]:
import pandas as pd

def render_passk_table(error_df: pd.DataFrame, paper_df: pd.DataFrame, model_name: str) -> str:
    # Begin LaTeX table
    lines = []
    lines.append(r"\begin{table}[ht]")
    lines.append(r"  \centering")
    lines.append(r"  \fontsize{8}{9.5}\selectfont")
    
    lines.append(
    fr"  \caption{{Mean and standard deviation of $\mathrm{{pass}}@K$ for {model_name} ($K \in \{{1,2,4\}}$) by error category (left) and paper category (right).}}"
)
    lines.append(fr"  \label{{tab:passk-text-{model_name.lower()}}}")
    lines.append(r"  \begin{tabular}{@{} lccc @{\quad} lccc @{} }")
    lines.append(r"    \toprule")
    lines.append(r"    \multicolumn{4}{c}{\textbf{Error Category}} & \multicolumn{4}{c}{\textbf{Paper Category}} \\")
    lines.append(r"    \cmidrule(r){1-4} \cmidrule(l){5-8}")
    lines.append(r"   \multicolumn{4}{c}{\textbf{Error Category}} & \multicolumn{4}{c}{\textbf{Paper Category}} \\")
    lines.append(r"    \midrule")
    
    # Ensure both dfs have same number of rows (pad with blanks if necessary)
    max_rows = max(len(error_df), len(paper_df))
    for i in range(max_rows):
        # Error category row
        if i < len(error_df):
            e = error_df.iloc[i]
            err_row = (
                f"{e['category']} & "
                f"${e['pass@1_mean']*100:.1f}_{{{e['pass@1_std']*100:.1f}}}$ & "
                f"${e['pass@2_mean']*100:.1f}_{{{e['pass@2_std']*100:.1f}}}$ & "
                f"${e['pass@4_mean']*100:.1f}_{{{e['pass@4_std']*100:.1f}}}$"
            )
        else:
            err_row = "& & &"
        # Paper category row
        if i < len(paper_df):
            p = paper_df.iloc[i]
            pap_row = (
                f"{p['category']} & "
                f"${p['pass@1_mean']*100:.1f}_{{{p['pass@1_std']*100:.1f}}}$ & "
                f"${p['pass@2_mean']*100:.1f}_{{{p['pass@2_std']*100:.1f}}}$ & "
                f"${p['pass@4_mean']*100:.1f}_{{{p['pass@4_std']*100:.1f}}}$"
            )
        else:
            pap_row = "& & &"
        lines.append(f"    {err_row} & {pap_row} \\\\")
    
    lines.append(r"    \bottomrule")
    lines.append(r"  \end{tabular}")
    lines.append(r"\end{table}")
    return "\n".join(lines)

# Example usage:
# latex_code = render_passk_table(error_df, paper_df, "o3")
# print(latex_code)





In [248]:
latex_code = render_passk_table(error_df, paper_df,model_name)
print(latex_code)


\begin{table}[ht]
  \centering
  \fontsize{8}{9.5}\selectfont
  \caption{Mean and standard deviation of $\mathrm{pass}@K$ for openrouter_meta-llama_llama-4-s ($K \in \{1,2,4\}$) by error category (left) and paper category (right).}
  \label{tab:passk-text-openrouter_meta-llama_llama-4-s}
  \begin{tabular}{@{} lccc @{\quad} lccc @{} }
    \toprule
    \multicolumn{4}{c}{\textbf{Error Category}} & \multicolumn{4}{c}{\textbf{Paper Category}} \\
    \cmidrule(r){1-4} \cmidrule(l){5-8}
   \multicolumn{4}{c}{\textbf{Error Category}} & \multicolumn{4}{c}{\textbf{Paper Category}} \\
    \midrule
    Data Inconsistency & $6.9_{8.2}$ & $13.1_{9.7}$ & $23.9_{9.2}$ & Biology & $6.5_{16.8}$ & $13.1_{22.0}$ & $29.6_{24.6}$ \\
    Equation / proof & $0.8_{1.3}$ & $1.5_{1.5}$ & $2.6_{1.0}$ & Computer Science & $2.3_{3.7}$ & $4.1_{4.2}$ & $7.2_{2.8}$ \\
    Experiment setup & $0.0_{0.0}$ & $0.0_{0.0}$ & $0.0_{0.0}$ & Environmental Science & $14.1_{22.5}$ & $26.2_{25.0}$ & $42.2_{18.2}$ \\
    Reagent i