In [None]:
import json
import os
from glob import glob
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Project root. Defaults to the original author's checkout; set the
# LLM_COLLUSION_BASE_DIR environment variable to run this notebook from another
# location without editing the cell. An unset or empty variable falls back to
# the default.
BASE_DIR = (
    os.environ.get("LLM_COLLUSION_BASE_DIR")
    or "/Users/ehsan/CursorProjects/llm-collusion"
)
# Directory that holds the attribution result files read by this notebook.
DATA_ROOT = os.path.join(BASE_DIR, "data", "full_attribution")

print("Data root:", DATA_ROOT)


Data root: /Users/ehsan/CursorProjects/llm-collusion/data/model_attribution


In [2]:
def load_jsonl(path: str) -> List[Dict]:
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Path of the ``.jsonl`` file to read.

    Returns:
        One parsed JSON value per non-blank line, in file order. Lines that
        are empty or contain only whitespace are ignored.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records: List[Dict] = []
    # Decode explicitly as UTF-8: without an encoding, open() uses the locale
    # default, which can fail or garble non-ASCII content on some platforms.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            records.append(json.loads(line))
    return records


def extract_labels(records: List[Dict]) -> Tuple[List[int], List[int], List[str]]:
    """
    Turn attribution records into binary label lists.

    The two class names come from the first record whose "model1" and "model2"
    values are both truthy; label 0 stands for model1 and label 1 for model2.
    For every record, the "Code1" and "Code2" entries of "gold_attribution"
    and "predicted_attribution" each contribute one (true, predicted) pair.

    Skipped:
      - records whose gold or predicted attribution is missing or empty;
      - individual code items whose gold or predicted value is not one of
        the two class names.

    Returns:
      (y_true, y_pred, class_names). All three are empty when there are no
      records or when no record names both models.
    """
    y_true: List[int] = []
    y_pred: List[int] = []

    # The first record naming both models defines the label space.
    class_names: List[str] = next(
        (
            [rec.get("model1"), rec.get("model2")]
            for rec in records
            if rec.get("model1") and rec.get("model2")
        ),
        [],
    )
    if not class_names:
        return y_true, y_pred, class_names

    first_model, second_model = class_names
    known_models = (first_model, second_model)

    for rec in records:
        gold_map = rec.get("gold_attribution")
        pred_map = rec.get("predicted_attribution")
        if not (gold_map and pred_map):
            # nothing to score without both the gold and the predicted mapping
            continue

        for slot in ("Code1", "Code2"):
            gold_name = gold_map.get(slot)
            pred_name = pred_map.get(slot)
            if gold_name not in known_models or pred_name not in known_models:
                continue
            y_true.append(0 if gold_name == first_model else 1)
            y_pred.append(0 if pred_name == first_model else 1)

    return y_true, y_pred, class_names


def compute_metrics(y_true: List[int], y_pred: List[int]) -> Dict[str, float]:
    """Compute accuracy, macro F1 and macro recall for binary 0/1 labels.

    Args:
        y_true: Gold labels (0 or 1).
        y_pred: Predicted labels (0 or 1), aligned with ``y_true``.

    Returns:
        A dict with keys "accuracy", "macro_f1", "macro_recall" and "support".
        "support" is the number of scored items. When ``y_true`` is empty the
        three scores are NaN and "support" is 0, so every result has the same
        keys and can be tabulated and aggregated uniformly.
    """
    if len(y_true) == 0:
        return {
            "accuracy": np.nan,
            "macro_f1": np.nan,
            "macro_recall": np.nan,
            "support": 0,
        }

    acc = accuracy_score(y_true, y_pred)
    # labels=[0, 1] keeps both classes in the per-class arrays even if one is
    # absent from the data; zero_division=0 scores such a class as 0.
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, labels=[0, 1], average=None, zero_division=0
    )
    macro_f1 = float(np.mean(f1))
    macro_recall = float(np.mean(recall))
    return {
        "accuracy": float(acc),
        "macro_f1": macro_f1,
        "macro_recall": macro_recall,
        "support": int(sum(support)),
    }


def evaluate_file(path: str) -> Dict:
    """Score one attribution JSONL file and return a flat summary row.

    The row holds the file path relative to BASE_DIR, the judge model (first
    truthy "judge_model" value in the file, else None), the two model names
    (None when the file names no model pair), the number of records, the
    number of records without a "predicted_attribution", and every metric
    returned by compute_metrics.
    """
    records = load_jsonl(path)
    y_true, y_pred, class_names = extract_labels(records)
    metrics = compute_metrics(y_true, y_pred)

    # First non-empty judge name wins; None if no record carries one.
    judge = next(
        (name for name in (rec.get("judge_model") for rec in records) if name),
        None,
    )

    # class_names is either empty or exactly [model1, model2].
    first_model, second_model = class_names if class_names else (None, None)

    summary = {
        "file": os.path.relpath(path, BASE_DIR),
        "judge": judge,
        "model1": first_model,
        "model2": second_model,
        "rows": len(records),
        "skipped_rows": sum(
            not rec.get("predicted_attribution") for rec in records
        ),
    }
    summary.update(metrics)
    return summary


In [None]:
# Collect every *.jsonl file below DATA_ROOT (data/full_attribution), at any depth
all_files = glob(os.path.join(DATA_ROOT, "**", "*.jsonl"), recursive=True)
all_files.sort()
print(f"Found {len(all_files)} files")

# One summary row (dict) per file, assembled into a single frame
rows = [evaluate_file(fp) for fp in all_files]

results_df = pd.DataFrame(rows)

# Parse benchmark/split from path
def parse_path_info(relpath: str) -> Dict[str, str]:
    """Extract the benchmark and split names from a result file's path.

    The path is expected to look like
    ``.../full_attribution/<benchmark>/<split>/.../<file>``: the first
    directory after ``full_attribution`` is the benchmark and the second is
    the split.

    Args:
        relpath: Path of a result file. Forward slashes and backslashes are
            both accepted as separators, so paths produced by
            ``os.path.relpath`` on Windows are parsed as well.

    Returns:
        A dict with keys "split" and "benchmark". A value is None when the
        path has no ``full_attribution`` component or lacks the corresponding
        directory level.
    """
    meta = {"split": None, "benchmark": None}
    parts = relpath.replace("\\", "/").split("/")
    if "full_attribution" not in parts:
        return meta

    idx = parts.index("full_attribution")
    # Only directory components count: the last part is the file name and must
    # not be reported as a benchmark or split for shallow paths.
    dirs = parts[idx + 1:-1]
    if len(dirs) >= 1:
        meta["benchmark"] = dirs[0]
    if len(dirs) >= 2:
        meta["split"] = dirs[1]
    return meta

# Attach benchmark/split parsed from each file path. When no files were found
# the frame has no "file" column, so guard the lookup instead of raising.
path_info = [
    parse_path_info(f)
    for f in (results_df["file"].tolist() if "file" in results_df.columns else [])
]
results_df = pd.concat([results_df, pd.DataFrame(path_info)], axis=1)

# order columns for readability
cols = [
    "file", "benchmark", "split", "judge", "model1", "model2",
    "rows", "skipped_rows", "support", "accuracy", "macro_f1", "macro_recall"
]
existing_cols = [c for c in cols if c in results_df.columns]
results_df = results_df[existing_cols]

# sort by benchmark, split, judge, model1, model2 -- restricted to the columns
# that are actually present, consistent with the defensive selection above
sort_keys = [
    c for c in ["benchmark", "split", "judge", "model1", "model2"]
    if c in existing_cols
]
if sort_keys:
    results_df = results_df.sort_values(by=sort_keys)
results_df = results_df.reset_index(drop=True)
results_df


Found 2 files


Unnamed: 0,file,judge,model1,model2,rows,skipped_rows,support,accuracy,macro_f1,macro_recall
0,data/model_attribution/mbpp-sanitized/test/jud...,openai/gpt-5,anthropic/claude-haiku-4.5,deepseek/deepseek-chat-v3-0324,257,4,506,0.644269,0.644269,0.644269
1,data/model_attribution/mbpp-sanitized/test/jud...,openai/gpt-5,openai/gpt-5,anthropic/claude-haiku-4.5,257,2,510,0.666667,0.666667,0.666667


In [None]:
# Collapse per-file results to one row per (benchmark, split, judge, model pair):
# counts are summed, scores are averaged (unweighted) across files
agg = (
    results_df
    .groupby(["benchmark", "split", "judge", "model1", "model2"], dropna=False)
    .agg(
        rows=("rows", "sum"),
        skipped_rows=("skipped_rows", "sum"),
        support=("support", "sum"),
        accuracy=("accuracy", "mean"),
        macro_f1=("macro_f1", "mean"),
        macro_recall=("macro_recall", "mean"),
    )
    .reset_index()
    .sort_values(["benchmark", "split", "judge", "model1", "model2"])
)
agg


Unnamed: 0,benchmark,split,judge,model1,model2,rows,skipped_rows,support,accuracy,macro_f1,macro_recall
0,mbpp-sanitized,test,openai/gpt-5,anthropic/claude-haiku-4.5,deepseek/deepseek-chat-v3-0324,257,4,506,0.644269,0.644269,0.644269
1,mbpp-sanitized,test,openai/gpt-5,openai/gpt-5,anthropic/claude-haiku-4.5,257,2,510,0.666667,0.666667,0.666667


## Summary

This notebook computes full attribution metrics for all files under `data/full_attribution/`.

**Extracted Information:**
- `benchmark`: Dataset (e.g., mbpp-sanitized)
- `split`: Train/validation/test split
- `judge`: Judge model used for attribution (from JSONL records)
- `model1`, `model2`: The two models being classified
- `rows`: Total number of tasks in the file
- `skipped_rows`: Number of tasks where prediction is null/missing
- `support`: Number of code items (Code1, Code2) used for metrics (excludes skipped rows and any item whose gold or predicted label is not one of the two models)

**Metrics:**
- `accuracy`: Fraction of correctly attributed code items
- `macro_f1`: F1 score averaged over both model classes
- `macro_recall`: Recall averaged over both model classes
