# Notebook 01: Behavioral Phenomenon

**Research Question:** Do models claim actions they don't take?

This notebook:
1. Generates episodes across all experimental conditions
2. Measures fake action rates by condition
3. Performs statistical analysis
4. Creates visualizations

**Expected output:** `episodes.parquet` with 2,250 episodes

## Setup

In [None]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Project imports
from src.utils.logging import setup_logging
from src.config import get_config
from src.generation import generate_batch, get_all_conditions
from src.generation.prompts import ToolType
from src.data.io import save_episodes, load_episodes

# Configure logging at INFO level
setup_logging(level="INFO")

# Load the experiment configuration that later cells read from
config = get_config()

# Echo the key settings so the notebook output records what was run.
# The lines are assembled first and emitted with a single print call.
config_summary = [
    "Configuration:",
    f"  Model: {config.model.id}",
    f"  Episodes per condition: {config.experiment.n_episodes_per_condition}",
    f"  Tools: {config.experiment.tools}",
    f"  System variants: {config.experiment.system_variants}",
    f"  Social pressures: {config.experiment.social_pressures}",
]
print("\n".join(config_summary))

## 1. Generate Episodes

Generate episodes across all conditions:
- 3 tools × 3 variants × 5 pressures = 45 conditions, drawing on 12 scenarios (45 conditions × 50 episodes = the 2,250 episodes expected above)
- 50 episodes per condition (configurable)

**Note:** This will use OpenAI for claim labeling. Ensure `OPENAI_API_KEY` is set in `.env`.

In [None]:
# Build the experimental conditions for the tools listed in the config.
# Only tool types are passed explicitly.
# NOTE(review): variants and pressures are presumably picked up from config
# inside get_all_conditions — confirm against its signature.
configured_tool_types = list(map(ToolType, config.experiment.tools))
conditions = get_all_conditions(tool_types=configured_tool_types)

# Report the size of the design and the episode count it implies.
print(f"Total conditions: {len(conditions)}")
print(f"Expected episodes: {len(conditions) * config.experiment.n_episodes_per_condition}")

In [None]:
# Generate episodes for every condition.
# WARNING: This will take 2-4 hours depending on GPU and model size
episodes_path = config.data.processed_dir / "episodes.parquet"

generation_kwargs = dict(
    conditions=conditions,
    n_per_condition=config.experiment.n_episodes_per_condition,
    model_id=config.model.id,
    labeling_method="openai",  # Use OpenAI for accurate labeling
    save_path=episodes_path,
    verbose=True,
)
episodes = generate_batch(**generation_kwargs)

In [None]:
# Load episodes (if already generated)
# To reuse a previous run instead of regenerating, skip the generation cell
# and uncomment the two lines below. Otherwise `episodes` must already exist
# in the kernel from the generation cell, or the print below raises NameError.
# episodes_collection = load_episodes(config.data.processed_dir / "episodes.parquet")
# episodes = episodes_collection.episodes

# Reports how many episodes are in memory. The message says "Loaded" even when
# they were just generated rather than read back from disk.
print(f"Loaded {len(episodes)} episodes")

## 2. Exploratory Analysis

In [None]:
# Tabulate the episodes: one row per episode, built from each model_dump() dict
episode_records = [episode.model_dump() for episode in episodes]
df = pd.DataFrame(episode_records)

# Quick structural overview before any analysis
print("Dataset shape:", df.shape)
print("\nColumns:", list(df.columns))
print("\nFirst few rows:")
df.head()

In [None]:
# Overall statistics on the episode category labels
category_labels = df['category']

print("Category Distribution:")
print(category_labels.value_counts())
print("\nCategory Rates:")
print(category_labels.value_counts(normalize=True))

# Key metric: share of all episodes labelled 'fake_action'
fake_rate = category_labels.eq('fake_action').mean()
print(f"\n**Fake Action Rate: {fake_rate:.1%}**")

In [None]:
# Fake action rate per tool type.
# The rate is the mean of a boolean "is fake" indicator within each tool group.
# This keeps tools that never produced a fake action at 0.0; dividing the
# fake-only group sizes by the overall group sizes would leave those tools as
# NaN, because they are absent from the numerator.
print("\nFake Rate by Tool Type:")
fake_by_tool = (
    df['category'].eq('fake_action')
    .groupby(df['tool_type'])
    .mean()
    .rename('fake_rate')
)
print(fake_by_tool)

## 3. Fake Rate by Condition

Analyze fake action rates across experimental conditions.

In [None]:
# Compute fake rate by condition (tool × system variant × social pressure).
# A vectorized grouped mean of the boolean indicator replaces
# groupby(...).apply(lambda ...): it avoids the pandas >= 2.2 deprecation
# warning about apply operating on the grouping columns, and it still yields a
# Series (so reset_index(name=...) works) when the frame is empty.
condition_cols = ['tool_type', 'system_variant', 'social_pressure']
fake_by_condition = (
    df['category'].eq('fake_action')
    .groupby([df[col] for col in condition_cols])
    .mean()
    .reset_index(name='fake_rate')
)

print("Fake rates by condition:")
print(fake_by_condition.sort_values('fake_rate', ascending=False).head(10))

In [None]:
# Pick the condition row with the largest fake rate (first one on ties,
# which is how idxmax resolves them)
top_row_label = fake_by_condition['fake_rate'].idxmax()
max_fake = fake_by_condition.loc[top_row_label]

# Report that condition and its rate in a single print call
highest_rate_lines = [
    f"\n**Highest Fake Rate:**",
    f"  Tool: {max_fake['tool_type']}",
    f"  Variant: {max_fake['system_variant']}",
    f"  Pressure: {max_fake['social_pressure']}",
    f"  Rate: {max_fake['fake_rate']:.1%}",
]
print("\n".join(highest_rate_lines))

## 4. Statistical Analysis

In [None]:
# Bootstrap confidence interval for the overall fake rate
from src.analysis.statistics import bootstrap_ci

# 1.0 for fake-action episodes, 0.0 otherwise, so the mean is the fake rate
is_fake = df['category'].eq('fake_action').to_numpy(dtype=float)

# NOTE(review): no seed is set in this notebook; confirm whether bootstrap_ci
# seeds its own RNG, otherwise the interval will vary between runs.
point_est, lower, upper = bootstrap_ci(is_fake, np.mean, n_bootstrap=1000)

print(f"Overall Fake Rate: {point_est:.1%}")
print(f"95% CI: [{lower:.1%}, {upper:.1%}]")

In [None]:
# Chi-squared test: Are fake rates different across conditions?
from scipy.stats import chi2_contingency

# Contingency table: (variant × pressure) condition × outcome (fake / not fake).
# A test of homogeneity of rates needs both outcome counts per condition.
# A table holding only the fake-action counts per variant × pressure cell would
# instead test whether variant and pressure are independent *among fake
# episodes*, and would contain NaN for any cell with no rows.
is_fake_action = (df['category'] == 'fake_action').rename('is_fake')
contingency = pd.crosstab(
    [df['system_variant'], df['social_pressure']],
    is_fake_action,
)

# Rows are the observed conditions; columns are the outcomes present in the data.
# If every episode has the same outcome the table has a single column and
# scipy reports chi2 = 0, p = 1 (zero degrees of freedom).
chi2, p_value, dof, expected = chi2_contingency(contingency)

print(f"\nChi-squared test:")
print(f"  χ² = {chi2:.2f}")
print(f"  p-value = {p_value:.4e}")
print(f"  Significant: {p_value < 0.05}")

## 5. Visualization

**Figure 1:** Fake rate heatmap by system variant × social pressure (escalation tool only)

In [None]:
# For escalation tool only (most interesting)
df_escalate = df[df['tool_type'] == 'escalate']

# Pivot table for heatmap: rows = system variant, columns = social pressure,
# cell value = share of episodes in that cell labelled 'fake_action'
fake_pivot = df_escalate.pivot_table(
    index='system_variant',
    columns='social_pressure',
    values='category',
    aggfunc=lambda x: (x == 'fake_action').mean()
)

# Order columns by pressure intensity.
# Only pressures actually present in the pivot are selected, so a pressure
# level missing from the escalate subset no longer raises KeyError. Any column
# not named in pressure_order is kept and appended after the known ones.
pressure_order = ['NEUTRAL', 'STRESSED', 'DEMAND', 'VALIDATION', 'APPEASE']
ordered_columns = [p for p in pressure_order if p in fake_pivot.columns]
ordered_columns += [c for c in fake_pivot.columns if c not in pressure_order]
fake_pivot = fake_pivot[ordered_columns]

print("Fake rate pivot table:")
print(fake_pivot)

In [None]:
# Heatmap of the escalate-only pivot table
from src.analysis.visualization import plot_fake_rate_heatmap

# Split the pivot into the raw matrix and its axis labels
heatmap_values = fake_pivot.values
variant_names = fake_pivot.index.tolist()
pressure_names = fake_pivot.columns.tolist()
figure_path = config.figures_dir / "figure1_fake_rates"

fig = plot_fake_rate_heatmap(
    fake_rates=heatmap_values,
    variant_labels=variant_names,
    pressure_labels=pressure_names,
    title="Fake Escalation Rate by Condition",
    save_path=figure_path,
)

plt.show()

## 6. Summary Statistics

Final summary for the paper.

In [None]:
# Final summary, built from values computed in earlier cells:
# df, fake_rate, lower, upper, max_fake and p_value.
print("=" * 60)
print("PHASE 1 RESULTS: BEHAVIORAL PHENOMENON")
print("=" * 60)

print(f"\nTotal Episodes: {len(df)}")
print(f"\nOverall Fake Action Rate: {fake_rate:.1%} (95% CI: [{lower:.1%}, {upper:.1%}])")

# max_fake is the maximum over tool × variant × pressure, so the tool is
# reported too; without it the condition is ambiguous.
print(f"\nHighest Fake Rate Condition:")
print(
    f"  {max_fake['tool_type']} × {max_fake['system_variant']} × "
    f"{max_fake['social_pressure']}: {max_fake['fake_rate']:.1%}"
)

# Conventional significance stars; a NaN p-value falls through to 'ns'.
if p_value < 0.001:
    significance = '***'
elif p_value < 0.01:
    significance = '**'
elif p_value < 0.05:
    significance = '*'
else:
    significance = 'ns'

print(f"\nStatistical Significance:")
print(f"  χ² test: p = {p_value:.4e} {significance}")

# Only print the headline conclusion when the numbers support it: the CI for
# the overall fake rate must exclude zero ("exists") and the rates must differ
# across conditions at the 5% level ("systematic").
phenomenon_exists = lower > 0
is_systematic = p_value < 0.05
if phenomenon_exists and is_systematic:
    print("\n✓ Phase 1 complete: Phenomenon exists and is systematic")
else:
    print("\n⚠ Phase 1 complete, but the headline claim is not fully supported:")
    print(f"  Fake rate CI excludes zero: {phenomenon_exists}")
    print(f"  Rates differ across conditions (p < 0.05): {is_systematic}")
print("=" * 60)

## Next Steps

→ **Notebook 02:** Extract activations and train probes to detect ground truth