# Constitutional Reasoning Experiment - Exploratory Analysis

**Experiment ID:** exp_20251023_105245  
**Tests Completed:** 480/480 (100%)  
**Configuration:** 16 scenarios × 5 constitutions × 6 models

This notebook provides interactive exploration of the experiment results, allowing deep-dive analysis beyond the automated reports.

## Setup

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from typing import Dict, List, Any

# Plot styling: apply the seaborn theme first, then layer the matplotlib
# defaults on top so they are not overwritten by the theme.
sns.set_theme(style="whitegrid", palette="muted")
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 11,
})

# Experiment identity and on-disk locations.
# All paths are relative to the current working directory.
EXPERIMENT_ID = "exp_20251023_105245"
BASE_DIR = Path("..")  # Assuming notebook is in notebooks/ directory
RESULTS_DIR = BASE_DIR.joinpath("results", "runs", EXPERIMENT_ID)
ANALYSIS_PATH = BASE_DIR.joinpath(
    "results", "analysis", "single", f"{EXPERIMENT_ID}_analysis.json"
)

# Echo the resolved configuration so a wrong working directory is obvious.
print(f"Loading experiment: {EXPERIMENT_ID}")
print(f"Results directory: {RESULTS_DIR}")

## Load Data

In [None]:
# Load analysis results
# The file is opened explicitly as UTF-8 so parsing does not depend on the
# platform's default locale encoding (which is not UTF-8 everywhere).
with open(ANALYSIS_PATH, encoding="utf-8") as f:
    analysis = json.load(f)

# Headline numbers read from the analysis file.
print(f"Analysis timestamp: {analysis['analysis_timestamp']}")
print(f"Total tests: {analysis['total_tests']}")
print(f"Overall mean score: {analysis['summary']['overall_score']['mean']:.2f}")

In [None]:
# Load raw test results into DataFrame
# Path.glob() yields files in arbitrary, filesystem-dependent order; sorting
# makes the row order of `df` (and therefore df.head()) reproducible.
result_files = sorted(RESULTS_DIR.glob("*_result.json"))

# Fail fast with a clear message instead of building an empty DataFrame that
# only blows up later with a KeyError on a missing column.
if not result_files:
    raise FileNotFoundError(f"No '*_result.json' files found in {RESULTS_DIR}")

test_results = []

for result_file in result_files:
    # Explicit UTF-8 so parsing does not depend on the platform locale.
    with open(result_file, encoding="utf-8") as f:
        result = json.load(f)

    # Flatten the nested integrity evaluation into one record per test.
    evaluation = result['integrity_evaluation']
    test_results.append({
        'trial_id': result['trial_id'],
        'scenario_id': result['scenario_id'],
        'constitution_id': result['constitution_id'],
        'model_id': result['model_id'],
        'factual_adherence': evaluation['factual_adherence'],
        'value_transparency': evaluation['value_transparency'],
        'logical_coherence': evaluation['logical_coherence'],
        'overall_score': evaluation['overall_integrity_score'],
        'timestamp': result['timestamp']
    })

# Build the frame once from the list of records.
df = pd.DataFrame(test_results)

print(f"Loaded {len(df)} test results")
print(f"\nDataFrame shape: {df.shape}")
df.head()

## Overall Distribution Analysis

In [None]:
# Descriptive statistics for the overall score and for each integrity dimension
dimension_columns = ['factual_adherence', 'value_transparency', 'logical_coherence']
overall_summary = df['overall_score'].describe()
dimension_summary = df[dimension_columns].describe()

print("Overall Score Statistics:")
print(overall_summary)

print("\nIntegrity Dimension Statistics:")
print(dimension_summary)

In [None]:
# Histograms of the overall score and the three integrity dimensions (2 × 2 grid)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# One entry per panel, in row-major order:
# (column, bar colour, x-axis label, panel title)
panels = [
    ('overall_score', 'skyblue', 'Overall Integrity Score', 'Overall Score Distribution'),
    ('factual_adherence', 'steelblue', 'Factual Adherence Score', 'Factual Adherence Distribution'),
    ('value_transparency', 'seagreen', 'Value Transparency Score', 'Value Transparency Distribution'),
    ('logical_coherence', 'coral', 'Logical Coherence Score', 'Logical Coherence Distribution'),
]

for panel_ax, (column, colour, x_label, title) in zip(axes.flat, panels):
    scores = df[column]
    # Only the overall-score panel shows a median line and a legend.
    is_overall = column == 'overall_score'

    panel_ax.hist(scores, bins=20, color=colour, edgecolor='black')
    if is_overall:
        panel_ax.axvline(scores.mean(), color='red', linestyle='--', label=f'Mean: {scores.mean():.1f}')
        panel_ax.axvline(scores.median(), color='green', linestyle='--', label=f'Median: {scores.median():.1f}')
    else:
        panel_ax.axvline(scores.mean(), color='red', linestyle='--')

    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Frequency')
    panel_ax.set_title(title)
    if is_overall:
        panel_ax.legend()

plt.tight_layout()
plt.show()

## Model Performance Comparison

In [None]:
# Per-model summary: full statistics for the overall score plus the mean of
# each integrity dimension. Named aggregation yields the flat
# "<column>_<stat>" column names directly.
model_stats = (
    df.groupby('model_id')
    .agg(
        overall_score_mean=('overall_score', 'mean'),
        overall_score_median=('overall_score', 'median'),
        overall_score_std=('overall_score', 'std'),
        overall_score_min=('overall_score', 'min'),
        overall_score_max=('overall_score', 'max'),
        overall_score_count=('overall_score', 'count'),
        factual_adherence_mean=('factual_adherence', 'mean'),
        value_transparency_mean=('value_transparency', 'mean'),
        logical_coherence_mean=('logical_coherence', 'mean'),
    )
    .round(2)
)

# Rank models from highest to lowest mean overall score.
model_stats = model_stats.sort_values('overall_score_mean', ascending=False)

print("Model Performance Rankings:")
model_stats

In [None]:
# Box plot of overall scores per model, with models in ranking order
fig, ax = plt.subplots(figsize=(14, 6))

# An ordered categorical makes seaborn draw the boxes in the row order of
# `model_stats` rather than in order of first appearance.
models_sorted = list(model_stats.index)
df['model_id_cat'] = pd.Categorical(df['model_id'], categories=models_sorted, ordered=True)

sns.boxplot(data=df, x='model_id_cat', y='overall_score', ax=ax)

# Horizontal reference line at a score of 80.
ax.axhline(80, color='gray', linestyle='--', alpha=0.5)

ax.set_title('Score Distributions by Model (Box Plot)', fontsize=14, fontweight='bold')
ax.set_xlabel('Model', fontweight='bold')
ax.set_ylabel('Integrity Score', fontweight='bold')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()

## Constitution Performance Comparison

In [None]:
# Per-constitution summary: full statistics for the overall score plus the
# mean of each integrity dimension. Named aggregation yields the flat
# "<column>_<stat>" column names directly.
const_stats = (
    df.groupby('constitution_id')
    .agg(
        overall_score_mean=('overall_score', 'mean'),
        overall_score_median=('overall_score', 'median'),
        overall_score_std=('overall_score', 'std'),
        overall_score_min=('overall_score', 'min'),
        overall_score_max=('overall_score', 'max'),
        overall_score_count=('overall_score', 'count'),
        factual_adherence_mean=('factual_adherence', 'mean'),
        value_transparency_mean=('value_transparency', 'mean'),
        logical_coherence_mean=('logical_coherence', 'mean'),
    )
    .round(2)
)

# Rank constitutions from highest to lowest mean overall score.
const_stats = const_stats.sort_values('overall_score_mean', ascending=False)

print("Constitution Performance Rankings:")
const_stats

In [None]:
# Box plot: Score distributions by constitution
fig, ax = plt.subplots(figsize=(12, 6))

# An ordered categorical makes seaborn draw the boxes in the row order of
# `const_stats`.
consts_sorted = const_stats.index.tolist()
df['constitution_id_cat'] = pd.Categorical(df['constitution_id'], categories=consts_sorted, ordered=True)

# One colour per box, in plotting order; the bad-faith constitution stands out.
box_colors = ['salmon' if c == 'bad-faith' else 'skyblue' for c in consts_sorted]

sns.boxplot(data=df, x='constitution_id_cat', y='overall_score', ax=ax)

# Depending on the Matplotlib version the box artists live in `ax.patches`
# or in `ax.artists`. An Axes always *has* a `patches` attribute, so a
# hasattr() check can never reach the fallback; pick whichever list is
# actually populated instead.
box_patches = list(ax.patches) or list(ax.artists)
for patch, color in zip(box_patches, box_colors):
    patch.set_facecolor(color)

ax.set_xlabel('Constitution', fontweight='bold')
ax.set_ylabel('Integrity Score', fontweight='bold')
ax.set_title('Score Distributions by Constitution (Bad-Faith Highlighted)', fontsize=14, fontweight='bold')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
# Horizontal reference line at a score of 80.
ax.axhline(80, color='gray', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

## Model × Constitution Interaction Analysis

In [None]:
# Mean overall score for every (model, constitution) pair
mean_scores = pd.pivot_table(
    df,
    values='overall_score',
    index='model_id',
    columns='constitution_id',
    aggfunc='mean',
)

# Round for display, then put the rows in model-ranking order.
pivot = mean_scores.round(2).loc[models_sorted]

print("Model × Constitution Interaction Matrix (Mean Scores):")
pivot

In [None]:
# Heatmap of the model × constitution mean-score matrix
fig, ax = plt.subplots(figsize=(10, 8))

# Fixed colour range (40–95) rather than one derived from the data.
heatmap_options = dict(
    annot=True,
    fmt='.1f',
    cmap='RdYlGn',
    vmin=40,
    vmax=95,
    cbar_kws={'label': 'Integrity Score'},
)
sns.heatmap(pivot, ax=ax, **heatmap_options)

ax.set_xlabel('Constitution', fontweight='bold')
ax.set_ylabel('Model', fontweight='bold')
ax.set_title('Model × Constitution Interaction Heatmap', fontsize=14, fontweight='bold', pad=15)

plt.tight_layout()
plt.show()

## Scenario Difficulty Analysis

In [None]:
# Per-scenario statistics of the overall score, sorted so the lowest-scoring
# scenario comes first.
scenario_stats = (
    df.groupby('scenario_id')
    .agg(
        overall_score_mean=('overall_score', 'mean'),
        overall_score_std=('overall_score', 'std'),
        overall_score_min=('overall_score', 'min'),
        overall_score_max=('overall_score', 'max'),
        overall_score_count=('overall_score', 'count'),
    )
    .round(2)
    .sort_values('overall_score_mean')
)

print("Scenario Difficulty Rankings (Lowest Score = Hardest):")
scenario_stats

In [None]:
# Scatter plot: Mean vs. Variability
fig, ax = plt.subplots(figsize=(12, 8))

# Only scenarios beyond one of these thresholds get a text label, to keep
# the plot readable.
LABEL_MEAN_BELOW = 78
LABEL_STD_ABOVE = 20

scenario_means = scenario_stats['overall_score_mean']
scenario_stds = scenario_stats['overall_score_std']

scatter = ax.scatter(
    scenario_means,
    scenario_stds,
    s=150,
    alpha=0.6,
    c=scenario_means,
    cmap='RdYlGn',
    vmin=70,
    vmax=90
)

# Label interesting points (iterate the columns directly instead of doing
# two .loc lookups per scenario)
for scenario_id, mean_score, score_std in zip(scenario_stats.index, scenario_means, scenario_stds):
    if mean_score < LABEL_MEAN_BELOW or score_std > LABEL_STD_ABOVE:
        ax.annotate(
            scenario_id.replace('-', ' ').title(),
            (mean_score, score_std),
            xytext=(5, 5),
            textcoords='offset points',
            fontsize=9,
            alpha=0.8
        )

ax.set_xlabel('Mean Integrity Score', fontweight='bold')
ax.set_ylabel('Standard Deviation', fontweight='bold')
# x encodes difficulty (lower mean = harder) and y encodes variability, so
# the reading guide names each axis direction on its own rather than a
# quadrant.
ax.set_title('Scenario Difficulty Analysis\n(Left = Harder, Top = More Variable)',
             fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)

fig.colorbar(scatter, ax=ax, label='Mean Score')
plt.tight_layout()
plt.show()

## Dimensional Correlation Analysis

In [None]:
# Pairwise correlations between the three dimensions and the overall score
score_columns = ['factual_adherence', 'value_transparency', 'logical_coherence', 'overall_score']
corr_matrix = df[score_columns].corr()

print("Correlation Matrix:")
print(corr_matrix.round(3))

In [None]:
# Annotated heatmap of the correlation matrix, colour scale centred on zero
fig, ax = plt.subplots(figsize=(8, 6))

corr_heatmap_options = dict(
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={'label': 'Correlation'},
)
sns.heatmap(corr_matrix, ax=ax, **corr_heatmap_options)

ax.set_title('Integrity Dimension Correlation Matrix', fontsize=14, fontweight='bold', pad=15)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise scatter plots of the three integrity dimensions, KDE on the diagonal
dimensions = ['factual_adherence', 'value_transparency', 'logical_coherence']
dimension_scores = df[dimensions]
sns.pairplot(dimension_scores, diag_kind='kde', plot_kws={'alpha': 0.6})

# pairplot creates its own figure, which is now the current one; y > 1 lifts
# the title above the top row of panels.
plt.gcf().suptitle('Integrity Dimensions Pairplot', y=1.02, fontsize=14, fontweight='bold')
plt.show()

## Deep Dive: Bad-Faith vs. Honest Constitutions

In [None]:
# Partition the results: the 'bad-faith' constitution vs. every other one
is_bad_faith = df['constitution_id'] == 'bad-faith'
df_bad_faith = df[is_bad_faith]
df_honest = df[~is_bad_faith]

print("Bad-Faith Statistics:")
print(df_bad_faith['overall_score'].describe())

print("\nHonest Constitutions Statistics:")
print(df_honest['overall_score'].describe())

# Positive gap = honest constitutions score higher on average.
mean_gap = df_honest['overall_score'].mean() - df_bad_faith['overall_score'].mean()
print(f"\nMean Gap: {mean_gap:.2f} points")

In [None]:
# Two-panel comparison: score distributions (violin) and per-dimension means (bars)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: overall score distribution per group
comparison_df = pd.concat([
    df_bad_faith.assign(group='Bad-Faith'),
    df_honest.assign(group='Honest')
])

sns.violinplot(data=comparison_df, x='group', y='overall_score', ax=axes[0])
axes[0].set_title('Bad-Faith vs. Honest Constitutions', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Constitution Type', fontweight='bold')
axes[0].set_ylabel('Overall Integrity Score', fontweight='bold')

# Right panel: mean of each integrity dimension per group.
# Maps each score column to its (line-wrapped) tick label.
dimension_labels = {
    'factual_adherence': 'Factual\nAdherence',
    'value_transparency': 'Value\nTransparency',
    'logical_coherence': 'Logical\nCoherence',
}
dim_comparison = pd.DataFrame(
    {
        'Bad-Faith': [df_bad_faith[column].mean() for column in dimension_labels],
        'Honest': [df_honest[column].mean() for column in dimension_labels],
    },
    index=list(dimension_labels.values()),
)

dim_comparison.plot(kind='bar', ax=axes[1], color=['salmon', 'skyblue'])
axes[1].set_title('Dimensional Breakdown Comparison', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Dimension', fontweight='bold')
axes[1].set_ylabel('Mean Score', fontweight='bold')
# Undo the default 90° rotation pandas applies to bar-chart tick labels.
axes[1].set_xticklabels(axes[1].get_xticklabels(), rotation=0)
axes[1].legend(title='Type')
axes[1].set_ylim(0, 100)

plt.tight_layout()
plt.show()

## Key Insights Summary

In [None]:
# Text summary of the headline findings, derived from the tables built above.
banner = "=" * 70
print(banner)
print("KEY INSIGHTS FROM EXPLORATORY ANALYSIS")
print(banner)

# 1. Top model: model_stats is sorted by mean score, best first
top_model = model_stats.index[0]
top_score = model_stats['overall_score_mean'].iloc[0]
print(f"\n1. TOP MODEL: {top_model} ({top_score:.1f}/100)")

# 2. Gap between honest constitutions and the bad-faith one
honest_mean = df_honest['overall_score'].mean()
bad_faith_mean = df_bad_faith['overall_score'].mean()
gap = honest_mean - bad_faith_mean
print(f"\n2. MOTIVATED REASONING DETECTION: {gap:.1f} point gap")
print(f"   - Honest constitutions: {honest_mean:.1f}/100")
print(f"   - Bad-faith constitution: {bad_faith_mean:.1f}/100")

# 3. Spread of mean scores among the honest constitutions
# NOTE(review): the "Only ..." / "Validates" wording is printed regardless of
# how large the spread actually is — confirm this is intended.
honest_consts = const_stats.drop(index='bad-faith', errors='ignore')
honest_means = honest_consts['overall_score_mean']
spread = honest_means.max() - honest_means.min()
print(f"\n3. VALUE PLURALISM: Only {spread:.1f} point spread among honest constitutions")
print("   - Validates: Different values ≠ Different facts")

# 4. Hardest scenario: scenario_stats is sorted by mean score, lowest first
hardest = scenario_stats.index[0]
hardest_score = scenario_stats['overall_score_mean'].iloc[0]
print(f"\n4. HARDEST SCENARIO: {hardest} ({hardest_score:.1f}/100)")

# 5. Scenario with the largest standard deviation (argmax is positional)
std_by_scenario = scenario_stats['overall_score_std']
most_variable_idx = std_by_scenario.argmax()
most_variable = std_by_scenario.index[most_variable_idx]
variability = std_by_scenario.iloc[most_variable_idx]
print(f"\n5. MOST VARIABLE SCENARIO: {most_variable} (SD: {variability:.1f})")

print("\n" + banner)