In [1]:
import os
import argparse
import pandas as pd
import json
import litellm
from datasets import load_dataset
from prompts import llm_reviewer_template, llm_judge_template
from litellm import batch_completion, completion
from src import extract_response_dict, compute_all_metrics, clean_records

os.environ['OPENROUTER_API_KEY'] = "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"
os.environ['OPENAI_API_KEY'] = "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"

def compute_all_metrics(resp_df: pd.DataFrame):
    # 1) Work on a fresh copy
    df = resp_df.copy()

    # 2) Ensure 'matches' and 'errors' are always lists
    df["matches"] = df["matches"].apply(lambda x: eval(x) if isinstance(x, str) else [])
    df["match_descriptions"] = df["match_descriptions"].apply(lambda x: x if isinstance(x, list) else [])
    df["errors"]  = df["errors"].apply(lambda x: x if isinstance(x, list) else [])

    # 3) Add all per-paper counts
    df = df.assign(
        k_i = df["error_annotation"].apply(lambda x: len(x)),
        TP_i = [max(len(m),len(md)) for m,md in df[['matches','match_descriptions']].values],
    )
    df["FP_i"] = df["errors"].apply(lambda x: len(x)) - df["TP_i"]
    df["FN_i"] = df["k_i"] - df["TP_i"]

    N = len(df)

    # 4) Micro-averaged Precision & Recall
    TP_total = df["TP_i"].sum()
    FP_total = df["FP_i"].sum()
    FN_total = df["FN_i"].sum()
    precision_micro = TP_total / (TP_total + FP_total) if (TP_total + FP_total) > 0 else 0.0
    recall_micro    = TP_total / (TP_total + FN_total) if (TP_total + FN_total) > 0 else 0.0

    # 5) Macro-averaged Precision & Recall (per-paper average)
    per_prec = df.apply(
        lambda row: row["TP_i"] / (row["TP_i"] + row["FP_i"])
        if (row["TP_i"] + row["FP_i"]) > 0 else 0.0,
        axis=1
    )
    per_rec = df.apply(
        lambda row: row["TP_i"] / (row["TP_i"] + row["FN_i"])
        if (row["TP_i"] + row["FN_i"]) > 0 else 0.0,
        axis=1
    )
    precision_macro = per_prec.mean()
    recall_macro    = per_rec.mean()

    # 6) Perfect-Paper Rate (PPR)
    perfect_mask = (df["TP_i"] == df["k_i"])
    PPR = perfect_mask.sum() / N

    # 7) Collect per-paper stats
    per_paper = df[[
        "doi/arxiv_id", "k_i", "TP_i", "FP_i", "FN_i"
    ]].to_dict(orient="records")

    return {
        "N":                 N,
        "precision_micro":   precision_micro,
        "recall_micro":      recall_micro,
        "precision_macro":   precision_macro,
        "recall_macro":      recall_macro,
        "PPR":               PPR,
        "per_paper":         per_paper
    }
    

In [2]:
orig_df = load_dataset(
            'amphora/errata_0504_v0',
            split='train',
            token='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'
        ).to_pandas()
            
list_cols = ['paper_category', 'error_location', 'error_annotation']
agg_dict = {
    col: (list if col in list_cols else 'first')
    for col in orig_df.columns
    if col != 'doi/arxiv_id'
}
flat_orig_df = (
    orig_df
    .groupby('doi/arxiv_id', as_index=False)
    .agg(agg_dict)
)

Using the latest cached version of the dataset since amphora/errata_0504_v0 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /root/.cache/huggingface/datasets/amphora___errata_0504_v0/default/0.0.0/8e986e83d3a384bcf1e2d2e8e6ae0348bb6e3a9e (last modified on Mon May  5 03:28:02 2025).


In [3]:
model_name = 'openrouter/openai/o4-mini-high'
judge_name = "gpt-4.1"
safe_model_name = model_name.replace("/", "_").replace(".", "_")
# for idx in range(0,8):
idx = 3
qrys = []
papers = []
for _,row in flat_orig_df.iterrows():
    qry = [
            {'role': 'system', 'content': llm_reviewer_template},
            {'role': 'user', 'content': clean_records(row.paper_content)}
        ]
    qrys.append(qry)
    papers.append(row['doi/arxiv_id'])

print(f"Generating responses with {model_name}")

from tqdm import tqdm
responses = []
for qry in tqdm(qrys):
    for attempt in range(1, 6):
        try:
            res = completion(model=model_name, messages=qry, min_tokens=8)
            responses.append(res)
            break
        except Exception:
            if attempt == 5:
                responses.append(None)
            # otherwise, retry

parsed_results = [extract_response_dict(resp) for resp in responses]
resp_df = pd.DataFrame({
    "doi/arxiv_id": papers,
    "parsed":    [r.get("parsed", False)        for r in parsed_results],
    "is_error":  [bool(r.get("has_error", True)) for r in parsed_results],
    "errors":    [r.get("errors", [])            for r in parsed_results],
})
resp_df = resp_df.merge(flat_orig_df, on=['doi/arxiv_id'])

error_cases = resp_df.loc[resp_df["parsed"] & resp_df["is_error"], :]
qrys = []
for _, row in error_cases.iterrows():
    annotations = [
        {"location": loc, "description": desc}
        for loc, desc in zip(row.error_location, row.error_annotation)
    ]
    predictions = row.errors
    payload = {
        "annotations": annotations,
        "predictions": predictions
    }
    user_content = json.dumps(payload, ensure_ascii=False, indent=2)
    qry = [
        {"role": "system", "content": llm_judge_template},
        {"role": "user",   "content": user_content},
    ]
    qrys.append(qry)

print(f"Generating responses with {judge_name}")
# Run judge model
responses = batch_completion(
    model=judge_name,
    messages=qrys
)
parsed_results = [extract_response_dict(resp) for resp in responses]

# Assemble judge results
records = []
for doi, res in zip(resp_df["doi/arxiv_id"], parsed_results):
    matches = res.get("matches", [])
    descriptions = [m.get("description", "") for m in matches]
    records.append({
        "doi/arxiv_id": doi,
        "matches": matches,
        "match_descriptions": descriptions
    })
judge_df = pd.DataFrame.from_records(records)
resp_df = resp_df.merge(judge_df, on="doi/arxiv_id", how="left")
metrics = compute_all_metrics(resp_df)
print(f"Micro Precision: {metrics['precision_micro']:.3f}")
print(f"Micro Recall:    {metrics['recall_micro']:.3f}")
print(f"Macro Precision: {metrics['precision_macro']:.3f}")
print(f"Macro Recall:    {metrics['recall_macro']:.3f}")
print(f"PPR:             {metrics['PPR']:.3f}")

# Export results and metrics
resp_df.to_csv(f"results_scaling/{safe_model_name}_resp_df_{idx}.csv", index=False)
metrics_df = pd.DataFrame([metrics])
metrics_df.to_csv(f"results_scaling/{safe_model_name}_metrics_{idx}.csv", index=False)
# print(f"Exported data to results_resp_df.csv and {args.output_prefix}_metrics.csv")

Generating responses with openrouter/openai/o4-mini-high


100%|██████████| 83/83 [1:45:19<00:00, 76.14s/it] 


Generating responses with gpt-4.1
Micro Precision: 0.042
Micro Recall:    0.132
Macro Precision: 0.050
Macro Recall:    0.139
PPR:             0.108


In [5]:


from datasets import load_dataset

df = load_dataset('HAERAE-HUB/corearoadbike-qa-original',split='train',token='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').to_pandas()

README.md:   0%|          | 0.00/576 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/238M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/272268 [00:00<?, ? examples/s]

In [9]:
df.values[5]

array(['뒷변속기 교체 공임비 문의', 'TEQU', '25-04-19 12:01',
       '샵마다 다소 상이하겠지만.. 일반적으로 뒷드레일러만 교체할때 대략 공임비 및 작업시간이 얼마나 될까요? 작업(분해)이 복잡할까요?? 11단 림브입니다!',
       '[]',
       "['샵 갈필요있나요? 케이블 이상없다면 그냥 드레일러 바꾸고 케이블연결 하고 하면 됨. 샵가시면 교체하고 기어변속 보면 3만원?', 'Immanuel 《 아 간단한 작업인데 생각보다 드는군요? a', '직접하는게 절약됨 해보면 별거 아닙니다']",
       'https://m.corearoadbike.com/board/board.php?g_id=Menu01&t_id=Menu01Top1&no=1848362',
       '2025-04-19 14:13:44', 1], dtype=object)