# Judge Comparison Experiment

This notebook compares several judgment systems to determine which of them exhibit "Riemann-healthy" error behavior.

We will compare:
- BiasedJudge: Systematic bias increasing with complexity
- NoisyJudge: High random noise
- ConservativeJudge: Tendency toward neutral judgments  
- RadicalJudge: Amplifies extremes

For each judge, we compute Π(x) and E(x), and then test the ERH.


In [None]:
import sys
sys.path.append('..')
import numpy as np
import matplotlib.pyplot as plt

from core.action_space import generate_world
from core.judgement_system import BiasedJudge, NoisyJudge, ConservativeJudge, RadicalJudge, batch_evaluate
from core.ethical_primes import compare_error_distributions
from analysis.statistics import compare_judges, generate_report
from visualization.plots import setup_paper_style, plot_multi_judge_errors, plot_judge_comparison

# --- Experiment configuration -------------------------------------------
# One seed is used both for NumPy's global RNG and for world generation.
SEED = 42
REPORT_PATH = '../output/judge_comparison_report.md'

setup_paper_style()
np.random.seed(SEED)

# --- Shared action space --------------------------------------------------
# Every judge is evaluated on this same set of actions.
actions = generate_world(
    num_actions=1000,
    complexity_dist='zipf',
    random_seed=SEED,
)

# --- Judges under comparison ----------------------------------------------
# Keyword order fixes both construction order and dict iteration order.
judges = dict(
    Biased=BiasedJudge(bias_strength=0.2, noise_scale=0.1),
    Noisy=NoisyJudge(noise_scale=0.3),
    Conservative=ConservativeJudge(threshold=0.5),
    Radical=RadicalJudge(amplification=1.5),
)

# --- Evaluation -----------------------------------------------------------
results = batch_evaluate(actions, judges, tau=0.3)

# --- Per-judge summary ----------------------------------------------------
comparison = compare_judges(results, X_max=100)
for name, metrics in comparison.items():
    erh_flag = metrics.get('erh_satisfied')
    # 'N/A' is shown when no exponent estimate is present for a judge.
    exponent = metrics.get('estimated_exponent', 'N/A')
    print(f"{name}: ERH={erh_flag}, α={exponent}")

# --- Report ---------------------------------------------------------------
# NOTE(review): slicing assumes generate_report returns the report text
# (a sliceable value) in addition to taking output_path -- confirm.
report = generate_report(results, output_path=REPORT_PATH)
preview = report[:500]
print(preview)
