# Model Attribution Tasks Evaluation

Evaluate and compare performance across all three model attribution tasks:
- **Self-Recognition**: Model identifying its own code
- **Full Attribution**: Model identifying which of two candidate models wrote each code sample in a pair (both samples are labeled)
- **Target Identification**: Model identifying code from a specific target model


In [1]:
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Paths
# The project root can be overridden with the LLM_COLLUSION_BASE_DIR environment
# variable so the notebook is runnable on machines other than the one it was
# authored on. When the variable is unset or empty, the original location is used.
BASE_DIR = Path(
    os.environ.get("LLM_COLLUSION_BASE_DIR")
    or "/Users/ehsan/CursorProjects/llm-collusion"
)
# Single JSONL file holding the self-recognition results evaluated below.
SELF_RECOGNITION_FILE = BASE_DIR / "data" / "self_recognition" / "mbpp-sanitized" / "test" / "anthropic-claude-haiku-4.5.jsonl"
# Directories that are globbed for *.jsonl result files by the later cells.
FULL_ATTRIBUTION_DIR = BASE_DIR / "data" / "full_attribution" / "mbpp-sanitized" / "test"
TARGET_IDENTIFICATION_DIR = BASE_DIR / "data" / "target_identification" / "mbpp-sanitized" / "test"


In [2]:
# Self-Recognition Metrics
print("=" * 60)
print("SELF-RECOGNITION")
print("=" * 60)

# Read the JSONL results inside a context manager so the file handle is closed
# as soon as parsing finishes; blank lines are skipped. The file is decoded as
# UTF-8 explicitly rather than relying on the platform default encoding.
with open(SELF_RECOGNITION_FILE, encoding="utf-8") as sr_file:
    data = [json.loads(line) for line in sr_file if line.strip()]
df_sr = pd.DataFrame(data)

# Gold vs. predicted candidate label for every record.
y_true = df_sr['gold_candidate'].values
y_pred = df_sr['predicted_candidate'].values

# Macro averaging weights every class equally; zero_division=0 reports 0 instead
# of warning when a class is never predicted.
acc = accuracy_score(y_true, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=0)

print(f"Accuracy:  {acc:.4f} ({acc*100:.2f}%)")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")
# The count is taken from the 'is_correct' column stored in the file.
print(f"Correct:   {df_sr['is_correct'].sum()}/{len(df_sr)}")


SELF-RECOGNITION
Accuracy:  0.5097 (50.97%)
Precision: 0.5094
Recall:    0.5094
F1-Score:  0.5094
Correct:   131/257


In [3]:
# Full Attribution Metrics
# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 60, "FULL ATTRIBUTION", "=" * 60, sep="\n")

def extract_attribution_labels(records):
    """Turn full-attribution records into parallel binary label lists.

    The two model names are read from the first record's "model1" and
    "model2" fields. For every record, the "Code1" and "Code2" entries of
    "gold_attribution" and "predicted_attribution" are compared against those
    names: a name equal to model1 is encoded as 0, anything else (i.e. model2)
    as 1. A code slot is only counted when both its gold and its predicted
    name are one of the two models.

    Returns:
        (y_true, y_pred): two lists of equal length. Both are empty when
        `records` is empty or the first record lacks either model name.
    """
    gold_labels, predicted_labels = [], []

    if not records:
        return gold_labels, predicted_labels

    # Model names come from the first record only.
    head = records[0]
    first_model = head.get("model1")
    second_model = head.get("model2")
    if not (first_model and second_model):
        return gold_labels, predicted_labels

    known_models = (first_model, second_model)

    def encode(name):
        # model1 -> 0, otherwise -> 1
        return 0 if name == first_model else 1

    for record in records:
        gold_map = record.get("gold_attribution")
        predicted_map = record.get("predicted_attribution")
        # Skip records where either attribution is missing or empty.
        if not (gold_map and predicted_map):
            continue

        for slot in ("Code1", "Code2"):
            gold_name = gold_map.get(slot)
            predicted_name = predicted_map.get(slot)
            # Ignore slots whose gold or predicted name is not a known model.
            if gold_name not in known_models or predicted_name not in known_models:
                continue
            gold_labels.append(encode(gold_name))
            predicted_labels.append(encode(predicted_name))

    return gold_labels, predicted_labels

# List all available files (sorted so the printed indices are stable between runs)
all_attribution_files = sorted(FULL_ATTRIBUTION_DIR.glob("*.jsonl"))
print("Available files:")
for i, file in enumerate(all_attribution_files):
    print(f"  [{i}] {file.name}")

# Select which files to evaluate
# Option 1: Evaluate all files (default)
# SELECTED_FILE_INDICES = list(range(len(all_attribution_files)))

# Option 2: Select specific files by index (e.g., [0, 1] for first two files)
SELECTED_FILE_INDICES = [0]  # Change this to select specific files

# Option 3: Select files by name pattern
# SELECTED_FILE_PATTERN = "gpt-5"  # Uncomment and set pattern to filter by name
# SELECTED_FILE_INDICES = [i for i, f in enumerate(all_attribution_files) if SELECTED_FILE_PATTERN in f.name]

# Filter selected files; indices outside the listing (including negative ones)
# are not evaluated, and are reported instead of being dropped silently.
n_available = len(all_attribution_files)
attribution_files = [all_attribution_files[i] for i in SELECTED_FILE_INDICES if 0 <= i < n_available]
ignored_indices = [i for i in SELECTED_FILE_INDICES if not 0 <= i < n_available]
if ignored_indices:
    print(f"\nIgnoring out-of-range file indices: {ignored_indices}")

if not attribution_files:
    print("\nNo files selected!")
else:
    print(f"\nEvaluating {len(attribution_files)} file(s):")
    for f in attribution_files:
        print(f"  - {f.name}")
    
    # Labels are pooled across all selected files before computing metrics.
    all_y_true, all_y_pred = [], []
    
    for file in attribution_files:
        # Context manager closes each file handle as soon as it has been parsed.
        with open(file, encoding="utf-8") as attribution_file:
            records = [json.loads(line) for line in attribution_file if line.strip()]
        y_true, y_pred = extract_attribution_labels(records)
        all_y_true.extend(y_true)
        all_y_pred.extend(y_pred)
    
    if all_y_true:
        acc = accuracy_score(all_y_true, all_y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(all_y_true, all_y_pred, average='macro', zero_division=0)
        
        print(f"\nResults:")
        print(f"Accuracy:  {acc:.4f} ({acc*100:.2f}%)")
        print(f"Precision: {precision:.4f}")
        print(f"Recall:    {recall:.4f}")
        print(f"F1-Score:  {f1:.4f}")
        # Each record can contribute up to two items (Code1 and Code2).
        print(f"Support:   {len(all_y_true)} code items")
        print(f"Files:     {len(attribution_files)}")
    else:
        print("\nNo valid data found in selected files")



FULL ATTRIBUTION
Available files:
  [0] judge-openai-gpt-5_classify-deepseek-deepseek-chat-v3-0324_vs_anthropic-claude-haiku-4.5.jsonl
  [1] judge-openai-gpt-5_classify-openai-gpt-5_vs_anthropic-claude-haiku-4.5.jsonl

Evaluating 1 file(s):
  - judge-openai-gpt-5_classify-deepseek-deepseek-chat-v3-0324_vs_anthropic-claude-haiku-4.5.jsonl

Results:
Accuracy:  0.6443 (64.43%)
Precision: 0.6443
Recall:    0.6443
F1-Score:  0.6443
Support:   506 code items
Files:     1


In [4]:
# Target Identification Metrics
print("\n" + "=" * 60)
print("TARGET IDENTIFICATION")
print("=" * 60)

# Load all target identification files (sorted for a deterministic processing order)
target_identification_files = sorted(TARGET_IDENTIFICATION_DIR.glob("*.jsonl"))
all_y_true_ti, all_y_pred_ti = [], []

# Columns every usable file must provide.
required_columns = ['gold_candidate', 'predicted_candidate']

for file in target_identification_files:
    # Context manager closes each file handle as soon as it has been parsed.
    with open(file, encoding="utf-8") as ti_file:
        records = [json.loads(line) for line in ti_file if line.strip()]
    df_ti = pd.DataFrame(records)
    # An empty file (or one without the label fields) yields a frame lacking the
    # required columns, and dropna(subset=...) would raise KeyError on it.
    missing_columns = [col for col in required_columns if col not in df_ti.columns]
    if missing_columns:
        print(f"Skipping {file.name}: missing columns {missing_columns}")
        continue
    # Filter out rows with NaN values
    df_ti_clean = df_ti.dropna(subset=required_columns)
    if len(df_ti_clean) > 0:
        y_true = df_ti_clean['gold_candidate'].values
        y_pred = df_ti_clean['predicted_candidate'].values
        all_y_true_ti.extend(y_true)
        all_y_pred_ti.extend(y_pred)

if all_y_true_ti:
    # Convert the pooled labels to numpy arrays (NaN rows were already dropped per file)
    all_y_true_ti = np.array(all_y_true_ti)
    all_y_pred_ti = np.array(all_y_pred_ti)
    
    acc = accuracy_score(all_y_true_ti, all_y_pred_ti)
    precision, recall, f1, _ = precision_recall_fscore_support(all_y_true_ti, all_y_pred_ti, average='macro', zero_division=0)
    
    print(f"Accuracy:  {acc:.4f} ({acc*100:.2f}%)")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-Score:  {f1:.4f}")
    print(f"Support:   {len(all_y_true_ti)} records")
    # Counts every globbed file, including any that were skipped above.
    print(f"Files:     {len(target_identification_files)}")
else:
    print("No target identification data found")



TARGET IDENTIFICATION
Accuracy:  0.9623 (96.23%)
Precision: 0.9630
Recall:    0.9630
F1-Score:  0.9623
Support:   769 records
Files:     3
