# Combined Figure: Epistemic Stance + Cross-Model Agreement

Two-panel figure:
- **(A)** Epistemic Stance Change by Perturbation (horizontal bar chart with 95% CIs)
- **(B)** Cross-Model Agreement by Scenario Verification Level

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from scipy.stats import norm, pointbiserialr
from pathlib import Path
import re

%matplotlib inline

In [2]:
# Publication settings, grouped by purpose and applied in one update
_font_settings = {
    'font.family': 'serif',
    'font.size': 11,
}
_text_sizes = {
    'axes.labelsize': 11,
    'axes.titlesize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 9,
}
# Font type used when writing PDF / PostScript output
_export_settings = {
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
}
plt.rcParams.update({**_font_settings, **_text_sizes, **_export_settings})

# Two-sided 95% critical value of the standard normal (about 1.96)
z = norm.ppf(0.975)

def calculate_ci(p, n):
    """Return the half-width of a normal-approximation 95% CI for a proportion.

    Parameters
    ----------
    p : proportion the interval is centred on.
    n : number of observations behind ``p``.

    Returns
    -------
    ``z * sqrt(p * (1 - p) / n)``, or 0 when ``n`` is 0 or ``p`` is exactly
    0 or 1 (the cases where the formula is undefined or collapses to zero).
    """
    degenerate = n == 0 or p == 0 or p == 1
    if degenerate:
        return 0
    standard_error = np.sqrt(p * (1 - p) / n)
    return z * standard_error

## Part A: Epistemic Stance Data (Bar Chart)

In [3]:
# Epistemic stance lexicon.
# A trailing '*' marks a stem: term_to_pattern expands it to the stem followed
# by any run of word characters.

# Markers that soften a claim
EPISTEMIC_HEDGES = [
    'seem*',
    'appear*',
    'might',
    'could',
    'may',
    'perhaps',
    'possibly',
    'maybe',
    'probably',
    'likely',
    'unclear',
    'uncertain',
    'unsure',
    'guess*',
]

# Markers that strengthen a claim
EPISTEMIC_BOOSTERS = [
    'clearly',
    'obviously',
    'definitely',
    'certainly',
    'undoubtedly',
    'unquestionably',
    'absolutely',
    'always',
    'never',
    'sure',
]

# Tokens that cancel a marker when they appear shortly before it.
# NOTE(review): 'never' is listed both here and in EPISTEMIC_BOOSTERS, so it is
# counted as a booster and also negates markers that follow it - confirm intended.
NEGATION_WORDS = [
    'not',
    'no',
    "n't",
    'never',
    'neither',
    'nor',
]

def term_to_pattern(term: str) -> str:
    """Turn a lexicon entry into a word-bounded regular expression.

    The term is lower-cased and regex-escaped. A trailing ``*`` is treated as
    a wildcard: it is dropped and replaced by ``\\w*`` so the stem also
    matches longer words that start with it.
    """
    lowered = term.lower()
    is_stem = lowered.endswith('*')
    core = re.escape(lowered[:-1] if is_stem else lowered)
    tail = r'\w*' if is_stem else ''
    return r'\b' + core + tail + r'\b'

def is_negated(text: str, match_start: int, window: int = 3) -> bool:
    """Report whether a negation appears in the words just before a match.

    Parameters
    ----------
    text : the text that was searched.
    match_start : character offset where the matched marker begins.
    window : how many whitespace-separated tokens before the match to inspect.
        A value of 0 or less inspects nothing.

    Returns
    -------
    True if any inspected token is in ``NEGATION_WORDS`` or contains ``n't``.
    """
    if window <= 0:
        # tokens[-0:] would be the *whole* list (and a negative window an
        # arbitrary slice), so an empty window must be handled explicitly.
        return False
    preceding = text[:match_start].lower().split()[-window:]
    return any(token in NEGATION_WORDS or "n't" in token for token in preceding)

def count_markers(text: str, markers: list) -> int:
    """Count non-negated occurrences of lexicon markers in ``text``.

    Empty or missing text counts as 0. Each marker is converted with
    ``term_to_pattern`` and searched in the lower-cased text; a hit is
    skipped when ``is_negated`` flags it. A marker whose pattern fails to
    compile is ignored.
    """
    if not text or pd.isna(text):
        return 0

    haystack = text.lower()
    total = 0
    for marker in markers:
        regex = term_to_pattern(marker)
        try:
            hits = list(re.finditer(regex, haystack))
        except re.error:
            # An invalid pattern contributes nothing to the count
            continue
        total += sum(1 for hit in hits if not is_negated(haystack, hit.start()))
    return total

def compute_net_epistemic(text: str) -> float:
    """Return net epistemic stance: (boosters - hedges) per 100 words.

    Positive values mean more boosters than hedges, negative the reverse.
    Empty, missing or whitespace-only text scores 0.0. Words are counted by
    splitting on whitespace.
    """
    if not text or pd.isna(text):
        return 0.0

    n_words = len(text.split())
    if n_words == 0:
        return 0.0

    n_hedges = count_markers(text, EPISTEMIC_HEDGES)
    n_boosters = count_markers(text, EPISTEMIC_BOOSTERS)
    net_count = n_boosters - n_hedges
    return (net_count / n_words) * 100

In [4]:
# Load master data (one row per model response; path is relative to this notebook)
MASTER_PARQUET_PATH = '../data/content_eval.parquet'

print("Loading master parquet...")
df = pd.read_parquet(MASTER_PARQUET_PATH)

# Quick sanity summary: row count and which models are present
print(f"Master data: {len(df):,} rows")
print(f"Models: {df['model'].unique()}")

Loading master parquet...
Master data: 164,424 rows
Models: ['claude37' 'gpt41' 'qwen25' 'deepseek']


In [5]:
# Compute epistemic stance: score every explanation and store it on the frame
print("Computing epistemic stance...")
explanation_scores = df['explanation'].map(compute_net_epistemic)
df['net_epistemic'] = explanation_scores
print("Done!")

Computing epistemic stance...
Done!


In [None]:
# Perturbation config: perturbation_type value -> display name and category
PERTURBATION_CONFIG = {
    'push_yta_social_proof': {'name': 'Social proof (against)', 'category': 'Persuasion'},
    'push_yta_pattern_admission': {'name': 'Pattern admission', 'category': 'Persuasion'},
    'push_yta_self_condemning': {'name': 'Self-condemning', 'category': 'Persuasion'},
    'change_trivial_detail': {'name': 'Change trivial detail', 'category': 'Surface'},
    'add_extraneous_detail': {'name': 'Add extraneous detail', 'category': 'Surface'},
    'remove_sentence': {'name': 'Remove sentence', 'category': 'Surface'},
    'push_nta_victim_pattern': {'name': 'Victim pattern', 'category': 'Persuasion'},
    'push_nta_self_justifying': {'name': 'Self-justifying', 'category': 'Persuasion'},
    'push_nta_social_proof': {'name': 'Social proof (for)', 'category': 'Persuasion'},
    'firstperson_atfault': {'name': 'First-person', 'category': 'Point-of-view'},
    'thirdperson': {'name': 'Third-person', 'category': 'Point-of-view'},
}

# Keys that pair a perturbed response with its unperturbed counterpart
match_keys = ['id', 'model', 'run_number']

# Get baseline epistemic stance: first unperturbed score per (id, model, run)
baseline = df[df['perturbation_type'] == 'none'].copy()
baseline_rates = (
    baseline
    .groupby(match_keys)
    .agg({'net_epistemic': 'first'})
    .reset_index()
    .rename(columns={'net_epistemic': 'base_epistemic'})
)

# Get perturbation data and merge; the inner join drops rows without a baseline
is_configured = df['perturbation_type'].isin(list(PERTURBATION_CONFIG))
perturbations = df[is_configured].copy()
merged = perturbations.merge(baseline_rates, on=match_keys, how='inner')
merged['net_delta'] = merged['net_epistemic'] - merged['base_epistemic']
print(f"Matched pairs: {len(merged):,}")

# Aggregate by perturbation type WITH standard errors (for bar chart)
has_flip_column = 'verdict_flipped' in merged.columns
epistemic_data = []
for pert_type, config in PERTURBATION_CONFIG.items():
    pert_data = merged[merged['perturbation_type'] == pert_type]
    n = len(pert_data)
    if n == 0:
        # Perturbation absent from the data: no bar for it
        continue

    deltas = pert_data['net_delta']
    std_delta = deltas.std()
    se = std_delta / np.sqrt(n)

    epistemic_data.append(dict(
        perturbation=config['name'],
        category=config['category'],
        net_delta=deltas.mean(),
        std=std_delta,
        se=se,
        ci=z * se,  # half-width of the 95% CI around the mean delta
        flip_pct=pert_data['verdict_flipped'].mean() * 100 if has_flip_column else 0,
        n=n,
    ))

# Sort by category then net_delta (categories follow category_order, not alphabet)
category_order = ['Persuasion', 'Surface', 'Point-of-view']
epistemic_df = pd.DataFrame(epistemic_data)
epistemic_df['category'] = pd.Categorical(
    epistemic_df['category'],
    categories=category_order,
    ordered=True,
)
epistemic_df = epistemic_df.sort_values(['category', 'net_delta'])

print("Epistemic Stance by Perturbation (with 95% CI):\n")
print(epistemic_df[['perturbation', 'category', 'net_delta', 'ci', 'n']].to_string(index=False))

## Part B: Verification Data

In [8]:
# Load verification data: keep error-free rows, one per (scenario, model, protocol),
# and flag annotations whose verification value is 'yes' (case-insensitive)
verif_raw = pd.read_parquet('../data/verification_annotations.parquet')
verif_df = (
    verif_raw[verif_raw['error'].isna()]
    .copy()
    .drop_duplicates(subset=['scenario_id', 'model', 'protocol'])
    .assign(verif_bin=lambda frame: frame['verification'].str.lower() == 'yes')
)

# Load reasoning protocols
reasoning_df = pd.read_parquet('../data/reasoning_protocols_combined_validated.parquet')

print(f"Verification annotations: {len(verif_df):,}")
print(f"Reasoning protocols: {len(reasoning_df):,}")

Verification annotations: 10,702
Reasoning protocols: 13,317


In [9]:
# Get scenario-level verification: number of 'yes' annotations per scenario
scenario_total_verif = verif_df.groupby('scenario_id')['verif_bin'].sum().reset_index()
scenario_total_verif.columns = ['scenario_id', 'total_verifications']

# Calculate cross-model agreement per scenario:
# one row per (scenario, protocol), one column per model holding its judgment
cross_model = reasoning_df.pivot_table(
    index=['scenario_id', 'protocol'],
    columns='model',
    values='judgment',
    aggfunc='first'
).reset_index()

# Check which columns exist
model_cols = [c for c in cross_model.columns if c not in ['scenario_id', 'protocol']]
print(f"Model columns: {model_cols}")

# Calculate agreement (all models agree)
# NOTE(review): nunique() ignores NaN by default, so a row where some models have
# no judgment still counts as agreement if the remaining ones match - confirm intended.
if len(model_cols) >= 2:
    cross_model['models_agree'] = cross_model[model_cols].nunique(axis=1) == 1
    cross_model['models_agree'] = cross_model['models_agree'].astype(int)
else:
    # With fewer than two models there is nothing to disagree with
    cross_model['models_agree'] = 1

# Aggregate to scenario level: share of protocols on which the models agree
scenario_agreement = cross_model.groupby('scenario_id')['models_agree'].mean().reset_index()
scenario_agreement.columns = ['scenario_id', 'cross_model_agreement']

# Merge (inner join: scenarios missing from either table are dropped)
scenario_analysis = scenario_agreement.merge(scenario_total_verif, on='scenario_id')

# Bin by total verifications. The last bin is open-ended: a finite upper edge
# would turn any larger count into NaN and silently drop that scenario from
# every level.
verif_level_labels = ['None (0)', 'Low (1-3)', 'Medium (4-6)', 'High (7+)']
scenario_analysis['verif_level'] = pd.cut(
    scenario_analysis['total_verifications'],
    bins=[-1, 0, 3, 6, np.inf],
    labels=verif_level_labels
)

# Calculate stats by level; levels with 5 or fewer scenarios are left out
min_scenarios = 5
level_stats = []
for level in verif_level_labels:
    subset = scenario_analysis[scenario_analysis['verif_level'] == level]
    if len(subset) > min_scenarios:
        agree = subset['cross_model_agreement'].mean()
        level_stats.append({
            'level': level,
            'agreement': agree,
            'ci': calculate_ci(agree, len(subset)),
            'n': len(subset)
        })

level_df = pd.DataFrame(level_stats)

# Correlation
# NOTE(review): neither input is binary here; scipy documents pointbiserialr as
# giving the same result as pearsonr, so r is effectively Pearson's r - consider
# naming it that way when reporting.
r, p = pointbiserialr(scenario_analysis['total_verifications'], scenario_analysis['cross_model_agreement'])

print("\nCross-Model Agreement by Verification Level:")
print(level_df.to_string(index=False))
print(f"\nCorrelation: r = {r:.3f}, p = {p:.6f}")

Model columns: ['claude-thinking', 'deepseek-r1', 'o3-mini', 'qwq-32b']

Cross-Model Agreement by Verification Level:
       level  agreement       ci   n
    None (0)   0.983740 0.038713  41
   Low (1-3)   0.669020 0.044738 425
Medium (4-6)   0.263957 0.034836 615
   High (7+)   0.154062 0.064862 119

Correlation: r = -0.559, p = 0.000000


## Combined Figure

In [None]:
# Create combined figure: panel A on the left, panel B on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5.5), dpi=150)

# =============================================================================
# SHARED STYLING
# =============================================================================
CATEGORY_COLORS = {
    'Point-of-view': '#222222',  # Black
    'Persuasion': '#888888',      # Gray
    'Surface': '#dddddd'          # Light Gray
}

BAR_EDGECOLOR = '#222222'
BAR_LINEWIDTH = 0.5
ERROR_CAPSIZE = 3
ERROR_LINEWIDTH = 1.2

LEGEND_ORDER = ['Point-of-view', 'Persuasion', 'Surface']

# =============================================================================
# PANEL A: Epistemic Stance Change (Horizontal Bar Chart with 95% CIs)
# =============================================================================
y_positions = np.arange(len(epistemic_df))
colors_a = [CATEGORY_COLORS[cat] for cat in epistemic_df['category']]

# Horizontal bar chart with error bars
ax1.barh(y_positions, epistemic_df['net_delta'], xerr=epistemic_df['ci'],
         color=colors_a, edgecolor=BAR_EDGECOLOR, linewidth=BAR_LINEWIDTH,
         height=0.7, capsize=ERROR_CAPSIZE,
         error_kw={'lw': ERROR_LINEWIDTH, 'capthick': ERROR_LINEWIDTH})

# Zero line
ax1.axvline(x=0, color='#222222', linewidth=0.8)

# Labels
ax1.set_yticks(y_positions)
ax1.set_yticklabels(epistemic_df['perturbation'])
ax1.set_xlabel('$\\Delta$ Net Epistemic Stance (per 100 words)', fontweight='bold')
ax1.set_title('(A) Epistemic Stance Change by Perturbation', fontweight='bold', loc='left')

# Category separators: dashed line wherever the category changes between rows
row_categories = epistemic_df['category'].tolist()
for i in range(1, len(row_categories)):
    if row_categories[i] != row_categories[i - 1]:
        ax1.axhline(y=i - 0.5, color='#cccccc', linewidth=0.5, linestyle='--')

# Legend with shared colors.
# facecolor (not color) is used on purpose: Patch(color=...) overrides edgecolor
# and warns, which would drop the outline that the bars themselves have.
legend_patches = [mpatches.Patch(facecolor=CATEGORY_COLORS[cat], label=cat,
                                  edgecolor=BAR_EDGECOLOR, linewidth=BAR_LINEWIDTH)
                  for cat in LEGEND_ORDER]
ax1.legend(handles=legend_patches, loc='upper right', framealpha=0.9)

# Styling - expand x limits to accommodate error bars
xmin = min(epistemic_df['net_delta'].min() - epistemic_df['ci'].max(), -0.06) * 1.3
xmax = max(epistemic_df['net_delta'].max() + epistemic_df['ci'].max(), 0.12) * 1.3
ax1.set_xlim(xmin, xmax)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)
ax1.invert_yaxis()  # first row of epistemic_df at the top
ax1.grid(axis='x', alpha=0.3, linestyle='--')

# Directional labels (placed just above the first bar)
ax1.text(xmin * 0.85, -0.65, '$\\leftarrow$ More hedged', fontsize=8, ha='left', style='italic', color='#666666')
ax1.text(xmax * 0.85, -0.65, 'More confident $\\rightarrow$', fontsize=8, ha='right', style='italic', color='#666666')

# =============================================================================
# PANEL B: Cross-Model Agreement by Verification Level
# =============================================================================

x = np.arange(len(level_df))
width = 0.6
colors_b = ['#333333', '#555555', '#888888', '#bbbbbb']

agreement_pct = (level_df['agreement'] * 100).to_numpy()
ci_pct = (level_df['ci'] * 100).to_numpy()

# The normal-approximation interval can run past 0% or 100% (e.g. a level with
# near-total agreement); truncate it to the valid range so the whisker and its
# cap are drawn instead of being cut off by the axis limits.
err_low = np.minimum(ci_pct, agreement_pct)
err_high = np.minimum(ci_pct, 100 - agreement_pct)

ax2.bar(x, agreement_pct, width,
        color=colors_b, edgecolor='#222222', linewidth=0.8)
ax2.errorbar(x, agreement_pct,
             yerr=np.vstack([err_low, err_high]),
             fmt='none', ecolor='#222222', capsize=4, lw=1.5)

# Trend line: least-squares line through the bar heights (by bin index);
# the legend label reports the scenario-level correlation r computed earlier
z_fit = np.polyfit(x, agreement_pct, 1)
p_fit = np.poly1d(z_fit)
ax2.plot(x, p_fit(x), 'k--', lw=2, alpha=0.7, label=f'r = {r:.2f}')

ax2.set_ylabel('Cross-Model Agreement (%)', fontweight='bold')
ax2.set_xlabel('Scenario Verification Level', fontweight='bold')
ax2.set_title('(B) Cross-Model Agreement by Scenario Verification', fontweight='bold', loc='left')
ax2.set_xticks(x)
ax2.set_xticklabels(level_df['level'], fontsize=10)
ax2.legend(loc='upper right', framealpha=0.9)
ax2.set_ylim(0, 105)  # small headroom so a cap sitting at 100% stays visible
ax2.grid(axis='y', alpha=0.3, linestyle='--')
ax2.spines['top'].set_visible(False)
ax2.spines['right'].set_visible(False)

plt.tight_layout()
plt.show()

In [11]:
# Save figure as both PDF and PNG, creating the output folder if needed
output_dir = Path('../../figures')
output_dir.mkdir(parents=True, exist_ok=True)

pdf_path = output_dir / 'fig_epistemic_verification_combined.pdf'
png_path = output_dir / 'fig_epistemic_verification_combined.png'
saved_paths = (pdf_path, png_path)

# Write every file first, then report, matching the original output order
for target in saved_paths:
    fig.savefig(target, bbox_inches='tight', dpi=300)

for target in saved_paths:
    print(f"Saved: {target}")

Saved: ../../figures/fig_epistemic_verification_combined.pdf
Saved: ../../figures/fig_epistemic_verification_combined.png


In [None]:
def _agreement_pct(level_name):
    """Format the agreement of one verification level as a whole percentage.

    Returns 'n/a' when the level is not in ``level_df`` (levels with too few
    scenarios are left out when the table is built), instead of raising
    IndexError on an empty selection.
    """
    if 'level' not in level_df.columns:
        return "n/a"
    values = level_df.loc[level_df['level'] == level_name, 'agreement']
    if len(values) == 0:
        return "n/a"
    return f"{values.iloc[0] * 100:.0f}%"


print("KEY INSIGHTS")
print("=" * 70)
print("\nPanel A (Epistemic Stance Change by Perturbation):")
print("  - Horizontal bars show delta net epistemic stance (boosters - hedges)")
print("  - Push Self At Fault: More hedged (negative delta) - uncertainty")
print("  - Surface: Minimal effect - semantic irrelevance")
print("  - Point-of-view: Most confident (positive delta) - third-person = direct")
print("  - All effects shown with 95% CIs")
print("\nPanel B (Cross-Model Agreement):")
print(f"  - High-verification scenarios: {_agreement_pct('High (7+)')} agreement")
print(f"  - No-verification scenarios: {_agreement_pct('None (0)')} agreement")
print(f"  - Correlation: r = {r:.2f}")
print("  - Verification marks scenario-level ambiguity")