# Judgment Files Analysis

This notebook reads judgment JSONL files stored under `data/` (e.g., `judge-openai-gpt-5_classify-deepseek-deepseek-chat-v3-0324_vs_anthropic-claude-haiku-4.5.jsonl`), computes accuracy, precision, recall, and F1, and reports per-file and aggregate insights, including which task IDs failed together with their predicted vs. gold attributions.

- Works across multiple directories (e.g., `data/model_attribution/...`)
- Handles missing predictions (`predicted_attribution: null`)
- Summarizes per-file metrics and aggregates across all selected files
- Lists hardest tasks (most frequently misattributed)



In [None]:
from __future__ import annotations
import os
import json
from glob import glob
from dataclasses import dataclass
from collections import Counter, defaultdict
from typing import List, Dict, Any, Tuple

import pandas as pd

# --- Configuration ---------------------------------------------------------
# Glob pattern selecting which judgment files are analysed. Any pattern that
# matches judgment JSONL files works, for example:
#   'data/model_attribution/mbpp-sanitized/test/judge-*.jsonl'  (one benchmark)
#   'data/model_attribution/**/test/judge-*.jsonl'              (all test splits)
#   'data/**/judge-*.jsonl'                                     (everything under data/)
FILE_GLOB: str = 'data/model_attribution/**/test/judge-*.jsonl'

# Files with fewer parsed rows than this are left out of the metrics, which
# filters out empty or degenerate files.
MIN_ROWS_PER_FILE: int = 1



In [None]:
def read_jsonl(path: str) -> List[Dict[str, Any]]:
    """Read a JSON Lines file and return its parsed records.

    Loading is best-effort: blank lines are ignored and lines that are not
    valid JSON are skipped, so one corrupt line does not discard the file.

    Args:
        path: Path of the JSONL file to read.

    Returns:
        The parsed value of every well-formed, non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    rows: List[Dict[str, Any]] = []
    # Decode explicitly instead of relying on the platform default encoding.
    # 'utf-8-sig' also strips a leading BOM, which would otherwise make the
    # first record fail to parse and be silently dropped.
    with open(path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                # Deliberately tolerant: skip malformed lines.
                continue
    return rows


def load_files(file_glob: str) -> Dict[str, pd.DataFrame]:
    """Load every JSONL file matched by ``file_glob`` into a DataFrame.

    The pattern is expanded recursively (``**`` is honoured) and the matches
    are processed in sorted path order. Files that yield no parsed rows, or
    fewer than ``MIN_ROWS_PER_FILE`` rows, are left out.

    Args:
        file_glob: Glob pattern selecting the files to load.

    Returns:
        Mapping from file path to the DataFrame built from that file's rows.
    """
    loaded: Dict[str, pd.DataFrame] = {}
    for path in sorted(glob(file_glob, recursive=True)):
        records = read_jsonl(path)
        if records:
            frame = pd.DataFrame(records)
            if len(frame) < MIN_ROWS_PER_FILE:
                continue
            loaded[path] = frame
    return loaded


def normalize_record(row: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten one judgment record into a plain dict with attribution columns.

    Copies every field of ``row`` and adds ``pred_Code1``/``pred_Code2`` and
    ``gold_Code1``/``gold_Code2``, taken from the ``predicted_attribution``
    and ``gold_attribution`` fields. ``is_correct`` is coerced to ``bool``,
    or to ``None`` when it is missing.

    Args:
        row: A record supporting ``.get`` and ``dict(row)``; a plain dict or
            a pandas row Series both work.

    Returns:
        A new dict; ``row`` itself is not modified.
    """

    def _as_mapping(value: Any) -> Dict[str, Any]:
        # A missing attribution may show up as None or as float NaN (pandas
        # fills absent keys with NaN). NaN is truthy, so `value or {}` would
        # let it through and then fail on `.get`.
        return value if isinstance(value, dict) else {}

    def _is_missing(value: Any) -> bool:
        # `value != value` is true only for NaN.
        return value is None or (isinstance(value, float) and value != value)

    pred = _as_mapping(row.get('predicted_attribution'))
    gold = _as_mapping(row.get('gold_attribution'))
    rec = dict(row)
    rec['pred_Code1'] = pred.get('Code1')
    rec['pred_Code2'] = pred.get('Code2')
    rec['gold_Code1'] = gold.get('Code1')
    rec['gold_Code2'] = gold.get('Code2')
    # Keep "unknown" distinct from False: bool(NaN) is True, so NaN must be
    # mapped to None explicitly rather than passed through bool().
    ic = row.get('is_correct')
    rec['is_correct'] = None if _is_missing(ic) else bool(ic)
    return rec


def normalize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Apply ``normalize_record`` to every row of ``df``.

    Args:
        df: Raw judgment rows, one record per row.

    Returns:
        ``df`` itself when it is empty; otherwise a new DataFrame built from
        the normalized records, with any original column absent from that
        result copied over.
    """
    if df.empty:
        return df

    def _flatten(record: pd.Series) -> pd.Series:
        return pd.Series(normalize_record(record))

    flattened = df.apply(_flatten, axis=1)
    # Carry over any original column the normalized frame does not have.
    for column in df.columns:
        if column in flattened.columns:
            continue
        flattened[column] = df[column]
    return flattened



In [None]:
def binary_metrics_from_correct(series: pd.Series) -> Dict[str, Any]:
    """Compute summary metrics from a series of per-row correctness flags.

    A correct row counts as a true positive and an incorrect row as a false
    positive. ``fn`` and ``tn`` are always 0 because correctness alone cannot
    separate them, so ``precision`` equals ``accuracy`` and ``recall`` is 1.0
    whenever at least one row is correct.

    Args:
        series: Correctness flags. Missing entries (``None``/NaN) are ignored
            and the remaining values are interpreted with ``bool()``.

    Returns:
        Dict with ``n``, ``accuracy``, ``precision``, ``recall``, ``f1`` and
        the ``tp``/``fp``/``tn``/``fn`` counts. A ratio is ``None`` when its
        denominator is zero; ``f1`` is ``None`` unless both precision and
        recall are non-zero.
    """
    # Drop NaN as well as None: a bare `is not None` check would count NaN in
    # `n` without it being either correct or incorrect, which skews accuracy.
    # bool() also covers non-builtin boolean scalars that `is True` would miss.
    flags = [bool(v) for v in series.tolist() if not pd.isna(v)]
    n = len(flags)
    if n == 0:
        return {
            'n': 0,
            'accuracy': None,
            'precision': None,
            'recall': None,
            'f1': None,
            'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0,
        }
    tp = sum(flags)
    fp = n - tp
    fn = 0  # with only correctness known, can't separate TN/FN meaningfully
    tn = 0
    accuracy = tp / n
    precision = tp / (tp + fp) if (tp + fp) else None
    recall = tp / (tp + fn) if (tp + fn) else None
    if precision and recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = None
    return {'n': n, 'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1, 'tp': tp, 'fp': fp, 'tn': tn, 'fn': fn}


def per_position_accuracy(df: pd.DataFrame) -> Dict[str, Any]:
    """Compute accuracy separately for the Code1 and Code2 attributions.

    Only rows where both ``gold_Code1`` and ``gold_Code2`` are present are
    scored. A missing prediction never equals its gold label, so it counts
    as wrong.

    Args:
        df: Frame with ``gold_Code1``/``gold_Code2`` and, optionally,
            ``pred_Code1``/``pred_Code2`` columns.

    Returns:
        ``{'code1_acc': ..., 'code2_acc': ...}``. A value is ``None`` when a
        required column is absent or no row has both gold labels.
    """
    result: Dict[str, Any] = {'code1_acc': None, 'code2_acc': None}
    # Guard the gold columns the same way the prediction columns are guarded,
    # so an un-normalized or empty frame yields None instead of a KeyError.
    if 'gold_Code1' not in df or 'gold_Code2' not in df:
        return result
    labelled = df[df['gold_Code1'].notna() & df['gold_Code2'].notna()]
    if labelled.empty:
        return result
    positions = (
        ('code1_acc', 'pred_Code1', 'gold_Code1'),
        ('code2_acc', 'pred_Code2', 'gold_Code2'),
    )
    for key, pred_col, gold_col in positions:
        if pred_col in labelled:
            result[key] = float((labelled[pred_col] == labelled[gold_col]).mean())
    return result


def confusion_counts(df: pd.DataFrame, col_pred: str, col_gold: str) -> pd.DataFrame:
    """Build a gold-by-predicted confusion matrix for one attribution slot.

    Rows missing either value are ignored. The matrix is square over the
    sorted union of labels seen in either column, with rows indexed by gold
    label and columns by predicted label.

    Args:
        df: Frame holding the two label columns.
        col_pred: Name of the predicted-label column.
        col_gold: Name of the gold-label column.

    Returns:
        The count matrix, or an empty DataFrame when a column is absent or
        no row has both values.
    """
    if not (col_pred in df and col_gold in df):
        return pd.DataFrame()
    complete = df.dropna(subset=[col_pred, col_gold])
    if complete.empty:
        return pd.DataFrame()
    gold_values = complete[col_gold]
    pred_values = complete[col_pred]
    labels = sorted(set(gold_values.unique()) | set(pred_values.unique()))
    matrix = pd.DataFrame(0, index=labels, columns=labels)
    # Tally each (gold, predicted) pair into its cell.
    for gold_label, pred_label in zip(gold_values, pred_values):
        matrix.loc[gold_label, pred_label] += 1
    matrix.index.name = 'gold'
    matrix.columns.name = 'pred'
    return matrix


def summarize_file(df: pd.DataFrame) -> Dict[str, Any]:
    """Summarize one set of judgment rows.

    Normalizes ``df`` and computes the overall correctness metrics, the
    per-position accuracies, one confusion matrix per attribution slot, and
    a compact view of the rows judged incorrect.

    Args:
        df: Raw (or already normalized) judgment rows.

    Returns:
        Dict with keys ``n_rows``, ``metrics``, ``per_position``,
        ``confusion_code1``, ``confusion_code2`` and ``failed``. ``failed``
        is an empty DataFrame when no row has ``is_correct`` equal to False.
    """
    view_columns = ['benchmark', 'task_id', 'model1', 'model2',
                    'pred_Code1', 'pred_Code2', 'gold_Code1', 'gold_Code2']
    df_norm = normalize_dataframe(df)
    has_flag = 'is_correct' in df_norm
    metrics = binary_metrics_from_correct(df_norm['is_correct']) if has_flag else {}
    pos_acc = per_position_accuracy(df_norm)
    conf1 = confusion_counts(df_norm, 'pred_Code1', 'gold_Code1')
    conf2 = confusion_counts(df_norm, 'pred_Code2', 'gold_Code2')
    if has_flag:
        # Element-wise comparison on purpose: rows whose flag is None are
        # neither correct nor failed and must not be selected.
        failed = df_norm[df_norm['is_correct'] == False]  # noqa: E712
    else:
        # Same guard as for `metrics`: without the column nothing has failed.
        failed = df_norm.iloc[0:0]
    # reindex (rather than a plain column selection) keeps a stable schema and
    # avoids a KeyError when a file lacks one of the descriptive columns; the
    # absent columns come back filled with NaN.
    failed_view = failed.reindex(columns=view_columns) if not failed.empty else pd.DataFrame()
    return {
        'n_rows': int(len(df_norm)),
        'metrics': metrics,
        'per_position': pos_acc,
        'confusion_code1': conf1,
        'confusion_code2': conf2,
        'failed': failed_view,
    }



In [None]:
# Load every judgment file matched by the configured glob, keyed by file path.
files_to_dfs = load_files(FILE_GLOB)
print(f"Found {len(files_to_dfs)} files for glob: {FILE_GLOB}")
# Cell output: preview of the first few loaded paths.
list(files_to_dfs)[:5]


In [None]:
# Summarize each loaded file individually, keyed by file path.
per_file_summaries: Dict[str, Dict[str, Any]] = {}
for fp, df in files_to_dfs.items():
    per_file_summaries[fp] = summarize_file(df)

# Create a compact table of per-file metrics
summary_columns = ['file', 'n_rows', 'accuracy', 'precision', 'recall', 'f1', 'code1_acc', 'code2_acc']
rows = []
for fp, summ in per_file_summaries.items():
    m = summ['metrics']
    pos = summ['per_position']
    rows.append({
        'file': fp,
        'n_rows': summ['n_rows'],
        'accuracy': m.get('accuracy'),
        'precision': m.get('precision'),
        'recall': m.get('recall'),
        'f1': m.get('f1'),
        'code1_acc': pos.get('code1_acc'),
        'code2_acc': pos.get('code2_acc'),
    })
# Passing the column list explicitly keeps the sort keys present even when no
# file matched; a column-less frame would make sort_values raise KeyError.
summary_df = pd.DataFrame(rows, columns=summary_columns).sort_values(
    by=['f1', 'accuracy'], ascending=[False, False]
)
summary_df


In [None]:
# Pool the normalized rows of every file and summarize them as one data set
all_df_list = [normalize_dataframe(df) for df in files_to_dfs.values()]

if all_df_list:
    all_df = pd.concat(all_df_list, ignore_index=True)
else:
    all_df = pd.DataFrame()
print(f"Total rows across files: {len(all_df)}")

# With no data there is nothing to summarize; fall back to empty placeholders
if all_df.empty:
    agg = None
    agg_metrics = {}
    agg_pos = {}
else:
    agg = summarize_file(all_df)
    agg_metrics = agg['metrics']
    agg_pos = agg['per_position']

print({
    'n_rows': len(all_df),
    'accuracy': agg_metrics.get('accuracy'),
    'precision': agg_metrics.get('precision'),
    'recall': agg_metrics.get('recall'),
    'f1': agg_metrics.get('f1'),
    'code1_acc': agg_pos.get('code1_acc'),
    'code2_acc': agg_pos.get('code2_acc'),
})

# Confusion matrices for both attribution slots (empty frames without data)
if agg:
    agg_conf1 = agg['confusion_code1']
    agg_conf2 = agg['confusion_code2']
else:
    agg_conf1 = pd.DataFrame()
    agg_conf2 = pd.DataFrame()
agg_conf1, agg_conf2


In [None]:
# Hardest tasks: rank task_ids by how often they appear among the failed rows
if not agg or agg['failed'].empty:
    print('No failures found or no data loaded.')
else:
    fails = agg['failed']
    # Failures per task_id, most frequently failed first
    per_task = fails.groupby('task_id').size()
    counts = per_task.sort_values(ascending=False).rename('fail_count')
    hardest = counts.to_frame().reset_index()
    display(hardest.head(20))



In [None]:
# Display the first failed cases with their predicted and gold attributions
if not agg or agg['failed'].empty:
    print('No failed cases to display.')
else:
    display(agg['failed'].head(30))

