# Full Attribution Evaluation

Evaluate all full attribution files and display results in a table format.
Each file contains one judge model's predictions of which of two candidate models (`model1` or `model2`) each code snippet (`Code1` and `Code2`) belongs to.


In [1]:
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Paths
# The project root defaults to the original author's checkout location but can
# be overridden by setting the LLM_COLLUSION_DIR environment variable, so the
# notebook can run on other machines without editing this cell. An unset or
# empty variable falls back to the default.
BASE_DIR = Path(os.environ.get("LLM_COLLUSION_DIR") or "/Users/ehsan/CursorProjects/llm-collusion")
# Directory that is globbed for the *.jsonl full attribution files.
FULL_ATTRIBUTION_DIR = BASE_DIR / "data" / "full_attribution" / "mbpp-sanitized" / "test"


In [2]:
def extract_attribution_labels(records):
    """Build binary gold/predicted label lists from full attribution records.

    The two candidate model names are read from the ``model1`` / ``model2``
    fields of the first record only. ``model1`` is encoded as 0 and the other
    candidate as 1. For every record, the ``Code1`` and ``Code2`` entries of
    ``gold_attribution`` and ``predicted_attribution`` are examined in that
    order; an entry contributes one label pair only when both its gold and
    its predicted value are one of the two candidate names.

    Returns:
        A ``(y_true, y_pred)`` tuple of equally long lists of 0/1 ints. Both
        lists are empty when ``records`` is empty or the first record lacks
        either model name.
    """
    gold_labels, predicted_labels = [], []

    # Nothing to encode without records.
    if not records:
        return gold_labels, predicted_labels

    # Candidate names come from the first record; both must be present.
    first_record = records[0]
    model1, model2 = first_record.get("model1"), first_record.get("model2")
    if not (model1 and model2):
        return gold_labels, predicted_labels

    candidates = (model1, model2)
    for record in records:
        gold = record.get("gold_attribution")
        pred = record.get("predicted_attribution")
        if not (gold and pred):
            # Records missing either attribution mapping are ignored entirely.
            continue

        for slot in ("Code1", "Code2"):
            gold_model = gold.get(slot)
            pred_model = pred.get(slot)
            # A slot is dropped (independently of the other slot) when either
            # side names something other than the two candidates.
            if gold_model not in candidates or pred_model not in candidates:
                continue
            gold_labels.append(0 if gold_model == model1 else 1)
            predicted_labels.append(0 if pred_model == model1 else 1)

    return gold_labels, predicted_labels


In [3]:
# Get all full attribution files
all_files = sorted(FULL_ATTRIBUTION_DIR.glob("*.jsonl"))
print(f"Found {len(all_files)} files to evaluate\n")

# Column order of the results table. Passing it to the DataFrame constructor
# keeps these columns present even when no file yields a result row, so the
# sort below does not raise KeyError on an empty table.
RESULT_COLUMNS = [
    "File", "Judge Model", "Model 1", "Model 2",
    "Accuracy", "Accuracy %", "Precision", "Recall", "F1-Score",
    "Code Items", "Records", "Correct", "Correct %",
]

# Evaluate each file
results = []

for file_path in all_files:
    # Load records: one JSON object per non-blank line. The context manager
    # closes the file handle deterministically instead of leaking it.
    with open(file_path, encoding="utf-8") as fh:
        records = [json.loads(line) for line in fh if line.strip()]

    if not records:
        continue

    # Extract metadata (taken from the first record of the file)
    judge_model = records[0].get("judge_model", "unknown")
    model1 = records[0].get("model1", "unknown")
    model2 = records[0].get("model2", "unknown")

    # Extract labels and compute metrics
    y_true, y_pred = extract_attribution_labels(records)

    # Files with no usable label pairs are left out of the table.
    if len(y_true) == 0:
        continue

    # Compute metrics: accuracy plus macro-averaged precision/recall/F1.
    # zero_division=0 reports 0 (instead of warning) for a class with no
    # predicted or true samples.
    acc = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=0)

    # Count records flagged as correct via their "is_correct" field
    correct_count = sum(1 for r in records if r.get("is_correct", False))
    total_records = len(records)

    # Store results; model names are shortened to the part after the last "/"
    # (split() returns the whole string when there is no "/").
    results.append({
        "File": file_path.name,
        "Judge Model": judge_model,
        "Model 1": model1.split("/")[-1],
        "Model 2": model2.split("/")[-1],
        "Accuracy": acc,
        "Accuracy %": f"{acc*100:.2f}%",
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "Code Items": len(y_true),
        "Records": total_records,
        "Correct": correct_count,
        "Correct %": f"{correct_count/total_records*100:.2f}%" if total_records > 0 else "0%"
    })

# Create DataFrame and sort by accuracy (descending)
df_results = pd.DataFrame(results, columns=RESULT_COLUMNS)
df_results = df_results.sort_values('Accuracy', ascending=False).reset_index(drop=True)

# Display results table
print("=" * 100)
print("FULL ATTRIBUTION EVALUATION RESULTS")
print("=" * 100)
print()
df_results


Found 10 files to evaluate

FULL ATTRIBUTION EVALUATION RESULTS



Unnamed: 0,File,Judge Model,Model 1,Model 2,Accuracy,Accuracy %,Precision,Recall,F1-Score,Code Items,Records,Correct,Correct %
0,judge-x-ai-grok-code-fast-1_classify-anthropic...,x-ai/grok-code-fast-1,deepseek-chat-v3-0324,claude-haiku-4.5,0.804688,80.47%,0.804688,0.804688,0.804688,512,257,206,80.16%
1,judge-anthropic-claude-sonnet-4.5_classify-ant...,anthropic/claude-sonnet-4.5,claude-haiku-4.5,deepseek-chat-v3-0324,0.774319,77.43%,0.774319,0.774319,0.774319,514,257,199,77.43%
2,judge-x-ai-grok-4.1-fast_classify-anthropic-cl...,x-ai/grok-4.1-fast,claude-haiku-4.5,deepseek-chat-v3-0324,0.747082,74.71%,0.747082,0.747082,0.747082,514,257,192,74.71%
3,judge-openai-gpt-5.1-codex_classify-anthropic-...,openai/gpt-5.1-codex,claude-haiku-4.5,deepseek-chat-v3-0324,0.692607,69.26%,0.692607,0.692607,0.692607,514,257,178,69.26%
4,judge-openai-gpt-5_classify-openai-gpt-5_vs_an...,openai/gpt-5,gpt-5,claude-haiku-4.5,0.666667,66.67%,0.666667,0.666667,0.666667,510,257,170,66.15%
5,judge-openai-gpt-5_classify-anthropic-claude-h...,openai/gpt-5,deepseek-chat-v3-0324,claude-haiku-4.5,0.652344,65.23%,0.652344,0.652344,0.652344,512,257,167,64.98%
6,judge-openai-gpt-5_classify-deepseek-deepseek-...,openai/gpt-5,claude-haiku-4.5,deepseek-chat-v3-0324,0.644269,64.43%,0.644269,0.644269,0.644269,506,257,163,63.42%
7,judge-google-gemini-3-pro-preview_classify-ant...,google/gemini-3-pro-preview,claude-haiku-4.5,deepseek-chat-v3-0324,0.636364,63.64%,0.636364,0.636364,0.636364,110,76,35,46.05%
8,judge-qwen-qwen3-coder-30b-a3b-instruct_classi...,qwen/qwen3-coder-30b-a3b-instruct,deepseek-chat-v3-0324,claude-haiku-4.5,0.585938,58.59%,0.585946,0.585946,0.585938,512,257,150,58.37%
9,judge-google-gemini-2.5-flash_classify-anthrop...,google/gemini-2.5-flash,claude-haiku-4.5,deepseek-chat-v3-0324,0.513619,51.36%,0.513619,0.513619,0.513619,514,257,132,51.36%


In [4]:
# Summary statistics (printed only when at least one file was evaluated)
if len(df_results):
    rule = "=" * 100
    print("\n" + rule)
    print("SUMMARY STATISTICS")
    print(rule)
    print(f"\nTotal files evaluated: {len(df_results)}")

    # Accuracy: each statistic is computed once and shown both as a fraction
    # and as a percentage.
    print("\nAccuracy Statistics:")
    accuracy = df_results['Accuracy']
    acc_mean = accuracy.mean()
    print(f"  Mean:   {acc_mean:.4f} ({acc_mean*100:.2f}%)")
    acc_median = accuracy.median()
    print(f"  Median: {acc_median:.4f} ({acc_median*100:.2f}%)")
    acc_min = accuracy.min()
    print(f"  Min:    {acc_min:.4f} ({acc_min*100:.2f}%)")
    acc_max = accuracy.max()
    print(f"  Max:    {acc_max:.4f} ({acc_max*100:.2f}%)")

    # F1: same four statistics, fraction only.
    print("\nF1-Score Statistics:")
    f1_scores = df_results['F1-Score']
    print(f"  Mean:   {f1_scores.mean():.4f}")
    print(f"  Median: {f1_scores.median():.4f}")
    print(f"  Min:    {f1_scores.min():.4f}")
    print(f"  Max:    {f1_scores.max():.4f}")

    # Totals across all evaluated files.
    total_code_items = df_results['Code Items'].sum()
    print(f"\nTotal code items evaluated: {total_code_items:,}")
    total_record_count = df_results['Records'].sum()
    print(f"Total records: {total_record_count:,}")



SUMMARY STATISTICS

Total files evaluated: 10

Accuracy Statistics:
  Mean:   0.6718 (67.18%)
  Median: 0.6595 (65.95%)
  Min:    0.5136 (51.36%)
  Max:    0.8047 (80.47%)

F1-Score Statistics:
  Mean:   0.6718
  Median: 0.6595
  Min:    0.5136
  Max:    0.8047

Total code items evaluated: 4,718
Total records: 2,389


In [5]:
# Order the results from highest to lowest accuracy and show a compact view
# containing only the identifying columns and headline metrics.
df_results_sorted = df_results.sort_values(by='Accuracy', ascending=False)

rule = "=" * 100
print("\n" + rule, "RESULTS SORTED BY ACCURACY (HIGHEST TO LOWEST)", rule, "", sep="\n")

summary_columns = [
    'File',
    'Judge Model',
    'Model 1',
    'Model 2',
    'Accuracy %',
    'F1-Score',
    'Code Items',
    'Records',
]
df_results_sorted[summary_columns]



RESULTS SORTED BY ACCURACY (HIGHEST TO LOWEST)



Unnamed: 0,File,Judge Model,Model 1,Model 2,Accuracy %,F1-Score,Code Items,Records
0,judge-x-ai-grok-code-fast-1_classify-anthropic...,x-ai/grok-code-fast-1,deepseek-chat-v3-0324,claude-haiku-4.5,80.47%,0.804688,512,257
1,judge-anthropic-claude-sonnet-4.5_classify-ant...,anthropic/claude-sonnet-4.5,claude-haiku-4.5,deepseek-chat-v3-0324,77.43%,0.774319,514,257
2,judge-x-ai-grok-4.1-fast_classify-anthropic-cl...,x-ai/grok-4.1-fast,claude-haiku-4.5,deepseek-chat-v3-0324,74.71%,0.747082,514,257
3,judge-openai-gpt-5.1-codex_classify-anthropic-...,openai/gpt-5.1-codex,claude-haiku-4.5,deepseek-chat-v3-0324,69.26%,0.692607,514,257
4,judge-openai-gpt-5_classify-openai-gpt-5_vs_an...,openai/gpt-5,gpt-5,claude-haiku-4.5,66.67%,0.666667,510,257
5,judge-openai-gpt-5_classify-anthropic-claude-h...,openai/gpt-5,deepseek-chat-v3-0324,claude-haiku-4.5,65.23%,0.652344,512,257
6,judge-openai-gpt-5_classify-deepseek-deepseek-...,openai/gpt-5,claude-haiku-4.5,deepseek-chat-v3-0324,64.43%,0.644269,506,257
7,judge-google-gemini-3-pro-preview_classify-ant...,google/gemini-3-pro-preview,claude-haiku-4.5,deepseek-chat-v3-0324,63.64%,0.636364,110,76
8,judge-qwen-qwen3-coder-30b-a3b-instruct_classi...,qwen/qwen3-coder-30b-a3b-instruct,deepseek-chat-v3-0324,claude-haiku-4.5,58.59%,0.585938,512,257
9,judge-google-gemini-2.5-flash_classify-anthrop...,google/gemini-2.5-flash,claude-haiku-4.5,deepseek-chat-v3-0324,51.36%,0.513619,514,257
