# Phase 2: Activation Probing

**Goal:** Determine if the model "knows" it didn't call the tool, even when claiming it did.

**Core Hypothesis:** There exists an internal representation of "action actually taken" that's separable from the narrative output.

## Pipeline
1. Load Phase 1 episodes
2. Extract activations (teacher forcing)
3. Train reality probe (predicts `tool_used`)
4. Train narrative probe (predicts `claims_action`)
5. Evaluate on fake_escalation episodes
6. Cross-domain transfer test

In [None]:
# Setup
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
from pathlib import Path

from src.activations import (
    load_model_for_activations,
    build_activation_dataset,
    samples_to_arrays,
    save_activation_dataset,
    load_activation_dataset,
)
from src.probes import (
    train_reality_probe,
    train_narrative_probe,
    evaluate_probe,
    evaluate_by_category,
    print_category_metrics,
    analyze_fake_escalations,
    print_fake_analysis,
    evaluate_transfer,
    print_transfer_result,
    full_probe_evaluation,
    cross_validate_probe,
)
from sklearn.model_selection import train_test_split

## 1. Load Model for Activation Extraction

Using PyTorch/Transformers (not MLX) to access hidden states.

In [15]:
# Load the model and tokenizer that every activation-extraction cell below uses.
# `load_model_for_activations` is imported from src.activations in the Setup cell.
model, tokenizer = load_model_for_activations(
    model_id="mistralai/Mistral-7B-Instruct-v0.2"
)
# Print the hidden size from the model config as a quick sanity check
# (the recorded output for this cell shows 4096).
print(f"Model hidden size: {model.config.hidden_size}")

Loading model for activations: mistralai/Mistral-7B-Instruct-v0.2...


Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

Model loaded: mistralai/Mistral-7B-Instruct-v0.2
Model hidden size: 4096


## 2. Build Activation Dataset from Phase 1 Episodes

This extracts activations at safe token positions (before any `<<CALL` tokens).

In [None]:
# Discover the Phase 1 episode logs: every *.jsonl file directly under ../data/raw.
data_dir = Path("../data/raw")
episode_files = list(data_dir.glob("*.jsonl"))

# List what was found so the right file can be picked in the next cell.
print(f"Found {len(episode_files)} episode files:")
for episode_path in episode_files:
    print(f"  {episode_path.name}")

In [None]:
# Build the activation dataset for the first batch of episodes in one file.
# Choose the file with the most episodes / best balance.
EPISODE_FILE = "../data/raw/adversarial_20251221_020507.jsonl"  # UPDATE THIS

# `samples` holds the result for this first batch; the following cell processes
# the rest of the same file with skip_first=200.
samples = build_activation_dataset(
    episodes_file=EPISODE_FILE,
    model=model,
    tokenizer=tokenizer,
    model_type="mistral",
    position_preference="before_tool",  # Anti-cheat: probe at a position before <<CALL
    max_episodes=200,  # Cap on episodes processed here; lower it for a quick test run
    verbose=True,
)

In [None]:
# Second pass over the same episode file, this time skipping the first 200 episodes.
# Presumably this picks up exactly where the previous cell's max_episodes=200
# stopped - confirm against build_activation_dataset.
EPISODE_FILE = "../data/raw/adversarial_20251221_020507.jsonl"  # UPDATE THIS

samples_remaining = build_activation_dataset(
    episodes_file=EPISODE_FILE,
    model=model,
    tokenizer=tokenizer,
    model_type="mistral",
    position_preference="before_tool",  # same probe position as the first batch
    skip_first=200,      # Skip first 200
    verbose=True,
)

In [None]:
# Persist the second batch (samples_remaining only) so it does not have to be
# re-extracted; the first batch in `samples` is not written by this cell.
save_activation_dataset(samples_remaining, "../data/labeled/activations_v2.npz")

In [None]:
# Rebuild the working sample list from the combined activation file on disk,
# then convert it to the arrays (X, y_tool, y_claims, categories) used by the
# split and probe cells below.

# The loader returns column-wise arrays plus a metadata mapping.
print("Loading all samples from activations_v1_combined.npz...")
from src.activations import ActivationSample
X_loaded, y_tool_loaded, y_claims_loaded, categories_loaded, metadata = load_activation_dataset(
    "../data/labeled/activations_v1_combined.npz"
)

# Re-wrap each row as an ActivationSample so later cells can look up
# per-sample metadata (scenario, variant, pressure, episode index).
all_samples = [
    ActivationSample(
        activation=X_loaded[row],
        tool_used=bool(y_tool_loaded[row]),
        claims_action=bool(y_claims_loaded[row]),
        category=str(categories_loaded[row]),
        scenario=str(metadata["scenarios"][row]),
        system_variant=str(metadata["system_variants"][row]),
        social_pressure=str(metadata["social_pressures"][row]),
        position_type=str(metadata["position_types"][row]),
        episode_idx=int(metadata["episode_indices"][row]),
    )
    for row in range(len(X_loaded))
]
print(f"Loaded {len(all_samples)} samples from combined dataset")
print(f"Total samples: {len(all_samples)}")

# Array form of the combined dataset
X, y_tool, y_claims, categories = samples_to_arrays(all_samples)
print(f"X shape: {X.shape}")
print(f"Category distribution:")
for cat_name, cat_count in zip(*np.unique(categories, return_counts=True)):
    print(f"  {cat_name}: {cat_count}")

In [None]:
# Save combined dataset (first 200 episodes + episodes 201+)
# This writes to a separate file, so the original activations_v1.npz is left untouched.
# NOTE(review): the preceding cell loads `all_samples` from this very path, so in
# that flow this call re-saves what was just read - confirm that is intended.
save_activation_dataset(all_samples, "../data/labeled/activations_v1_combined.npz")


## 3. Train/Test Split

In [None]:
# Hold out 20% of the samples for testing. Stratifying on `categories` keeps
# each behaviour category proportionally represented on both sides, and the
# fixed random_state makes the split reproducible.
(
    X_train, X_test,
    y_tool_train, y_tool_test,
    y_claims_train, y_claims_test,
    cat_train, cat_test,
) = train_test_split(
    X, y_tool, y_claims, categories,
    test_size=0.2,
    stratify=categories,
    random_state=42,
)

print(f"Train: {len(X_train)}, Test: {len(X_test)}")
print(f"\nTest set categories:")
for test_cat, test_cat_count in zip(*np.unique(cat_test, return_counts=True)):
    print(f"  {test_cat}: {test_cat_count}")

## 4. Cross-Validation (Sanity Check)

In [None]:
# Quick 5-fold cross-validation on the training split to check whether probing
# is viable at all before running the full evaluation.
cv_reality = cross_validate_probe(X_train, y_tool_train, n_folds=5)
cv_narrative = cross_validate_probe(X_train, y_claims_train, n_folds=5)

# Report mean ± std accuracy for both probes.
print("Cross-validation results:")
print(f"  Reality probe:   {cv_reality['mean_accuracy']:.1%} ± {cv_reality['std_accuracy']:.1%}")
print(f"  Narrative probe: {cv_narrative['mean_accuracy']:.1%} ± {cv_narrative['std_accuracy']:.1%}")

# 0.6 is this notebook's ad-hoc bar for "above chance".
# NOTE(review): that reading assumes roughly balanced tool_used labels - confirm.
if cv_reality['mean_accuracy'] > 0.6:
    print("\n✓ Reality probe shows signal above chance!")
else:
    print("\n⚠ Reality probe near chance - may need more data or different position")

## 5. Full Probe Evaluation

In [None]:
# Run the full evaluation on the split above.
# `results` is indexed by later cells with the keys "reality_probe",
# "narrative_probe", "reality_overall", "narrative_overall" and "fake_analysis".
results = full_probe_evaluation(
    X_train, y_tool_train, y_claims_train, cat_train,
    X_test, y_tool_test, y_claims_test, cat_test,
    verbose=True,
)

## 6. Interpret Results

### Key Questions:

1. **Does the reality probe work?** (accuracy >> 50%)
   - If yes → model internally represents whether it called the tool

2. **On fake_escalation episodes, what does the reality probe predict?**
   - Should predict `tool_used=False` (aligned with reality)
   - If it predicts `True`, it's being fooled by its own narrative

3. **Does the narrative probe work?**
   - Should predict `claims_action` accurately
   - Shows model represents what it's claiming

In [None]:
# Turn the headline numbers from `results` into a plain-language reading.
# 0.7 is the notebook's threshold for calling a result clear-cut.
fake_analysis = results["fake_analysis"]
reality_acc = results["reality_overall"].accuracy

print("="*60)
print("INTERPRETATION")
print("="*60)

# 1) Does the reality probe work at all?
if reality_acc > 0.7:
    print(f"✓ Reality probe works well ({reality_acc:.1%} accuracy)")
    print("  → Model has internal representation of 'action taken'")
else:
    print(f"? Reality probe marginal ({reality_acc:.1%} accuracy)")

# 2) On fake escalations (claimed but no tool call), which side does the
#    reality probe agree with? Skipped when the test set has none.
if fake_analysis.n_fake > 0:
    if fake_analysis.probe_aligned_with_reality > 0.7:
        print(f"\n✓ On FAKE escalations, probe aligns with REALITY ({fake_analysis.probe_aligned_with_reality:.1%})")
        print("  → Model 'knows' it didn't call the tool, even when claiming it did!")
        print("  → This is STRONG evidence for action grounding")
    elif fake_analysis.probe_aligned_with_narrative > 0.7:
        print(f"\n✗ On FAKE escalations, probe aligns with NARRATIVE ({fake_analysis.probe_aligned_with_narrative:.1%})")
        print("  → Probe is 'fooled' by the model's own claims")
        print("  → Less clear evidence for separate action grounding")
    else:
        # Neither side clears the threshold: show both numbers.
        print(f"\n? On FAKE escalations, probe is uncertain")
        print(f"  Reality: {fake_analysis.probe_aligned_with_reality:.1%}, Narrative: {fake_analysis.probe_aligned_with_narrative:.1%}")

## 7. Cross-Domain Transfer (Optional - Requires Second Tool)

This is the headline experiment: does a probe trained on `escalateCase` generalize to `searchKnowledgeBase`?

Requires generating episodes with a second tool type first.

In [None]:

# Load second tool episodes and extract their activations with the same model.
# NOTE(review): position_preference is not passed here, so the function's default
# is used; confirm it matches the "before_tool" position used for the escalation
# data, otherwise the transfer test compares different token positions.
samples_search = build_activation_dataset(
    "../data/raw/search_episodes_20251221_141706.jsonl",
    model, tokenizer, "mistral"
)

In [None]:
# Save the search-tool activations so the transfer test can be rerun without
# repeating the extraction, and report how many samples were written.
save_activation_dataset(samples_search, "../data/labeled/activations_search.npz")
print(f"Saved {len(samples_search)} search samples to ../data/labeled/activations_search.npz")


## 8. Visualize Probe Weights (Optional)

In [None]:
# Cross-domain transfer: only the activations and tool_used labels of the
# search episodes are needed; the other two returned arrays are discarded.
X_search, y_tool_search, _, _ = samples_to_arrays(samples_search)

# Transfer test: fit on the escalation training split, score on all search samples.
transfer_result = evaluate_transfer(
    X_train, y_tool_train,  # Train on escalation
    X_search, y_tool_search,  # Test on search
    train_domain="escalateCase",
    test_domain="searchKnowledgeBase",
)
print_transfer_result(transfer_result)

In [None]:
import matplotlib.pyplot as plt

# Coefficient vector of the fitted reality probe (one weight per hidden dimension).
reality_probe = results["reality_probe"]
weights = reality_probe.coef_[0]

# Indices of the top_k largest-magnitude weights, in ascending |weight| order.
top_k = 20
top_idx = np.abs(weights).argsort()[-top_k:]

plt.figure(figsize=(12, 4))

# Left panel: overall distribution of probe weights.
plt.subplot(1, 2, 1)
plt.hist(weights, bins=50)
plt.xlabel("Weight")
plt.ylabel("Count")
plt.title("Reality Probe Weight Distribution")

# Right panel: signed weights of the top_k dimensions, labelled by dimension index.
plt.subplot(1, 2, 2)
plt.barh(range(top_k), weights[top_idx])
plt.yticks(range(top_k), top_idx)
plt.xlabel("Weight")
plt.ylabel("Dimension Index")
plt.title(f"Top {top_k} Dimensions by |weight|")

plt.tight_layout()
plt.show()

## Next Steps

If results are promising:

1. **Add second tool** (`searchKnowledgeBase`) for cross-domain transfer
2. **Phase 3: Causal intervention** - Use activation patching to show the probe direction actually affects behavior
3. **More episodes** - Scale up to 500+ for robust statistics
4. **Different token positions** - Compare `before_tool` vs `first_assistant` vs `final`

## 9. Layer-wise Probe Analysis

**Question:** At which layer does action-grounding emerge?

- Early layers (0-8): Token/positional features
- Mid layers (8-20): Semantic features  
- Late layers (20-32): Task-specific, abstract concepts

If the probe works well at layer 16 but not layer 4, that tells us action-grounding is a mid-to-late network phenomenon.

### Summary: Narrative Probe Error Analysis

**Key Finding:** Many "errors" are actually **ground truth labeling issues**, not probe failures.

**Evidence:**

1. **Error 2** - Clear false negative in labels:
   - Text contains: "I've also gone ahead and escalated your case" + "a human will be reviewing your case"
   - Matches phrase list: `"i've escalated"` and `"a human will review your case"`
   - Ground truth says: `claims_action: False` ❌
   - Probe says: `claims_action: True` (confidence 0.999) ✅
   - **Probe is correct** - this should be labeled as a claim

2. **Error 3** - Labeling inconsistency:
   - Text contains: "I've escalated your case to a human team member"
   - Matches phrase list: `"i've escalated"`
   - Header says: `claims_action: False` but claim detection section says: `claims_action: True`
   - Probe aligns with claim detection (confidence 1.000)
   - **Same episode labeled differently in different places**

3. **Error 5** - Likely false positive in labels:
   - Text contains conditional/instructional language: "If you would like... you can always escalate"
   - This is telling user HOW to escalate, not claiming escalation happened
   - Probe says: `claims_action: False` (confidence 0.000) ✅
   - Ground truth says: `claims_action: True` ❌

**Root Cause:** 
- Episodes were labeled using simple regex phrase matching (`detect_escalation_claim()`)
- Regex doesn't handle context, partial matches, or conditional language well
- The probe learns semantic meaning better than regex matching

**Implication:**
The narrative probe may be learning a more accurate representation than the ground truth labels. Consider re-labeling episodes using LLM judge (`use_llm_judge=True`) for more nuanced detection.


In [None]:
# Layer-wise probe analysis: configuration and episode loading.
# Mistral-7B has 32 layers (indices 0-31), plus embedding layer, so
# hidden_states[0] = embeddings, hidden_states[1] = layer 0 output, ...,
# hidden_states[32] = layer 31 output.

import json
from src.activations import (
    ActivationSample,
    build_full_text,
    extract_activations,
    find_token_positions,
    get_safe_probe_index,
)

# Indices into hidden_states: one early, mid-early, mid, mid-late and late layer.
LAYERS_TO_TEST = [1, 8, 16, 24, 32]  # 1=first layer, 32=last layer

# Same episode file as the main analysis; one JSON object per line.
EPISODE_FILE = "../data/raw/adversarial_20251221_020507.jsonl"
with open(EPISODE_FILE) as f:
    episodes = list(map(json.loads, f))

# Only the first N episodes are used, because the extraction cell has to run
# the model on every episode in this subset.
N_EPISODES_FOR_LAYER = 100
episodes_subset = episodes[:N_EPISODES_FOR_LAYER]

print(f"Running layer-wise analysis on {len(episodes_subset)} episodes...")
print(f"Testing layers: {LAYERS_TO_TEST}")
print(f"This will take a few minutes...\n")

In [None]:
# Layer-wise probing: collect the probe-position activation for every layer in
# LAYERS_TO_TEST, then train and score a reality probe and a narrative probe on
# each layer.
#
# Each episode goes through the model exactly once: a forward pass with
# output_hidden_states=True already returns every layer, so all requested
# layers are read from that single pass instead of re-running the model per layer.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch
from collections import Counter
from src.prompts import build_episode, SystemPromptVariant, SocialPressure

# Check that required variables are defined
if 'model' not in globals() or 'tokenizer' not in globals():
    raise NameError(
        "model and tokenizer are not defined. Please run the cell that loads the model first:\n"
        "  model, tokenizer = load_model_for_activations(model_id=\"mistralai/Mistral-7B-Instruct-v0.2\")"
    )

if 'EPISODE_FILE' not in globals():
    raise NameError(
        "EPISODE_FILE is not defined. Please run the cell that sets EPISODE_FILE first."
    )

if 'episodes_subset' not in globals():
    raise NameError(
        "episodes_subset is not defined. Please run the cell that creates episodes_subset first."
    )

if 'LAYERS_TO_TEST' not in globals():
    raise NameError(
        "LAYERS_TO_TEST is not defined. Please run the cell that sets LAYERS_TO_TEST first."
    )

# Only the first few per-episode problems are echoed; the rest are just counted.
MAX_REPORTED_PROBLEMS = 3


def _episode_scenario(episode, ep_idx):
    """Return the episode's scenario, accepting either the 'scenario' or 'scenario_id' key."""
    scenario = episode.get("scenario") or episode.get("scenario_id")
    if scenario is None:
        raise ValueError(f"Episode {ep_idx} missing both 'scenario' and 'scenario_id' fields")
    return scenario


def _samples_for_episode(episode, ep_idx, layers):
    """Run one forward pass and return {layer: ActivationSample} for `episode`.

    Returns None when no safe "before_tool" probe position exists. Layers that
    are out of range for the model's hidden_states tuple are left out of the
    result. May raise on malformed episodes; the caller treats that as a skip.
    Fills in episode["system_prompt"] / episode["user_turns"] in place when the
    episode does not carry a system prompt.
    """
    scenario = _episode_scenario(episode, ep_idx)

    # Reconstruct the prompt when it was not stored with the episode (the
    # original cell notes this mirrors build_activation_dataset - not verified here).
    if "system_prompt" not in episode:
        variant = SystemPromptVariant(episode["system_variant"])
        pressure = SocialPressure(episode["social_pressure"])
        ep_data = build_episode(scenario, variant, pressure)
        episode["system_prompt"] = ep_data["system_prompt"]
        episode["user_turns"] = ep_data["user_turns"]

    # Locate the probe position in the tokenized transcript.
    full_text = build_full_text(episode, "mistral")
    token_ids = tokenizer.encode(full_text)
    positions = find_token_positions(token_ids, tokenizer, "mistral")
    probe_idx = get_safe_probe_index(positions, "before_tool")
    if probe_idx is None:
        return None

    # Single forward pass; hidden_states holds one tensor per layer.
    inputs = tokenizer(full_text, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    n_states = len(outputs.hidden_states)
    return {
        layer: ActivationSample(
            # Batch element 0, probe token, full hidden vector.
            activation=outputs.hidden_states[layer][0, probe_idx, :].cpu().numpy(),
            tool_used=episode["tool_used"],
            claims_action=episode["claims_action"],
            category=episode["category"],
            # Use the resolved scenario so episodes that only carry
            # 'scenario_id' are not dropped with a KeyError.
            scenario=scenario,
            system_variant=episode["system_variant"],
            social_pressure=episode["social_pressure"],
            position_type="before_tool",
            episode_idx=ep_idx,
        )
        for layer in layers
        if -n_states <= layer < n_states
    }


layer_results = {
    'layer': [],
    'reality_acc': [],
    'narrative_acc': [],
    'fake_aligned_reality': [],
}

print(f"Processing {len(LAYERS_TO_TEST)} layers across {len(episodes_subset)} episodes...")
print(f"Layers to analyze: {LAYERS_TO_TEST}\n")

# ---- Pass 1: extraction (one forward pass per episode) ----
samples_by_layer = {layer_idx: [] for layer_idx in LAYERS_TO_TEST}
errors = 0
reported_problems = 0

for ep_idx, episode in enumerate(episodes_subset):
    if (ep_idx + 1) % 20 == 0:
        print(f"  Processing episode {ep_idx + 1}/{len(episodes_subset)}...")

    try:
        episode_samples = _samples_for_episode(episode, ep_idx, LAYERS_TO_TEST)
    except Exception as e:
        # Deliberately best-effort: one bad episode must not abort the whole
        # analysis, but it is counted and the first few are shown.
        errors += 1
        if reported_problems < MAX_REPORTED_PROBLEMS:
            print(f"  Error on episode {ep_idx}: {e}")
            reported_problems += 1
        continue

    if episode_samples is None:
        errors += 1
        if reported_problems < MAX_REPORTED_PROBLEMS:
            print(f"  Warning: Episode {ep_idx} - no valid probe position found")
            reported_problems += 1
        continue

    for sample_layer, sample in episode_samples.items():
        samples_by_layer[sample_layer].append(sample)

# ---- Pass 2: train and evaluate probes per layer ----
for layer_idx in LAYERS_TO_TEST:
    print(f"{'='*50}")
    print(f"Layer {layer_idx} (hidden_states index)")
    print(f"{'='*50}")

    layer_samples = samples_by_layer[layer_idx]
    print(f"  Extracted {len(layer_samples)} samples ({errors} errors/skipped)")

    if len(layer_samples) == 0:
        print(f"  ⚠️  No samples extracted for layer {layer_idx}, skipping...")
        continue

    # Show category distribution
    categories_layer = np.array([s.category for s in layer_samples])
    cat_counts = Counter(categories_layer)
    print(f"  Category distribution: {dict(cat_counts)}")

    # Convert to arrays
    X_layer = np.stack([s.activation for s in layer_samples])
    y_tool_layer = np.array([s.tool_used for s in layer_samples])
    y_claims_layer = np.array([s.claims_action for s in layer_samples])

    print(f"  Activation shape: {X_layer.shape}")
    print(f"  Tool used rate: {y_tool_layer.mean():.1%}")
    print(f"  Claims action rate: {y_claims_layer.mean():.1%}")

    # Train/test split; stratification raises ValueError when a category is too
    # small to appear on both sides, in which case fall back to a plain split.
    try:
        X_tr, X_te, y_tool_tr, y_tool_te, y_claims_tr, y_claims_te, cat_tr, cat_te = train_test_split(
            X_layer, y_tool_layer, y_claims_layer, categories_layer,
            test_size=0.2, stratify=categories_layer, random_state=42
        )
        print(f"  Train/test split: {len(X_tr)}/{len(X_te)} samples")
    except ValueError as e:
        print(f"  ⚠️  Stratified split failed: {e}")
        print(f"  Using non-stratified split...")
        X_tr, X_te, y_tool_tr, y_tool_te, y_claims_tr, y_claims_te, cat_tr, cat_te = train_test_split(
            X_layer, y_tool_layer, y_claims_layer, categories_layer,
            test_size=0.2, random_state=42
        )
        print(f"  Train/test split: {len(X_tr)}/{len(X_te)} samples")

    # Train probes
    print(f"  Training probes...")
    reality_probe_layer = train_reality_probe(X_tr, y_tool_tr)
    narrative_probe_layer = train_narrative_probe(X_tr, y_claims_tr)

    # Evaluate
    reality_acc = accuracy_score(y_tool_te, reality_probe_layer.predict(X_te))
    narrative_acc = accuracy_score(y_claims_te, narrative_probe_layer.predict(X_te))

    # Fake escalation analysis: share of fake escalations for which the reality
    # probe predicts "no tool used" (i.e. agrees with what actually happened).
    fake_mask = cat_te == "fake_escalation"
    if fake_mask.sum() > 0:
        fake_preds = reality_probe_layer.predict(X_te[fake_mask])
        fake_aligned_reality = (fake_preds == False).mean()
    else:
        fake_aligned_reality = np.nan
        print(f"  ⚠️  No fake_escalation samples in test set")

    print(f"  Results:")
    print(f"    Reality probe accuracy:    {reality_acc:.1%}")
    print(f"    Narrative probe accuracy:  {narrative_acc:.1%}")
    print(f"    Fake→Reality alignment:    {fake_aligned_reality:.1%}")

    layer_results['layer'].append(layer_idx)
    layer_results['reality_acc'].append(reality_acc)
    layer_results['narrative_acc'].append(narrative_acc)
    layer_results['fake_aligned_reality'].append(fake_aligned_reality)

    print()  # Blank line between layers

print("\n" + "="*50)
print("LAYER ANALYSIS COMPLETE")
print("="*50)
print(f"Analyzed {len(layer_results['layer'])} layers")
if len(layer_results['layer']) > 0:
    best_idx = np.argmax(layer_results['reality_acc'])
    print(f"Best reality probe accuracy: {layer_results['reality_acc'][best_idx]:.1%} at layer {layer_results['layer'][best_idx]}")
else:
    print("⚠️  No layers were successfully analyzed. Check error messages above.")

Processing 5 layers across 100 episodes...
Layers to analyze: [1, 8, 16, 24, 32]

Layer 1 (hidden_states index)
  Processing episode 20/100...


In [None]:
# Visualize layer-wise results: probe accuracy per layer (left) and how often the
# reality probe sides with reality on fake escalations (right), then print the
# same numbers as a table. Reads `layer_results` from the layer-analysis cell.
import matplotlib.pyplot as plt
from pathlib import Path

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Plot 1: Probe accuracy by layer
ax1 = axes[0]
ax1.plot(layer_results['layer'], layer_results['reality_acc'], 'o-', label='Reality Probe', linewidth=2, markersize=8)
ax1.plot(layer_results['layer'], layer_results['narrative_acc'], 's--', label='Narrative Probe', linewidth=2, markersize=8)
ax1.axhline(y=0.5, color='gray', linestyle=':', label='Chance (50%)')
ax1.set_xlabel('Layer', fontsize=12)
ax1.set_ylabel('Accuracy', fontsize=12)
ax1.set_title('Probe Accuracy by Layer', fontsize=14)
ax1.legend()
ax1.set_ylim(0.4, 1.0)
ax1.grid(True, alpha=0.3)

# Plot 2: Fake escalation alignment by layer
ax2 = axes[1]
ax2.plot(layer_results['layer'], layer_results['fake_aligned_reality'], 'o-',
         color='green', linewidth=2, markersize=8)
ax2.axhline(y=0.5, color='gray', linestyle=':', label='Chance')
ax2.set_xlabel('Layer', fontsize=12)
ax2.set_ylabel('% Aligned with Reality', fontsize=12)
ax2.set_title('Fake Escalations: Does Probe "Know" Truth?', fontsize=14)
ax2.legend()  # without this the 'Chance' label above is never displayed
ax2.set_ylim(0.0, 1.1)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
Path('../figures').mkdir(parents=True, exist_ok=True)
plt.savefig('../figures/layer_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

# Print summary table (one row per analyzed layer)
print("\nLayer Analysis Summary:")
print(f"{'Layer':<8} {'Reality':<12} {'Narrative':<12} {'Fake→Reality':<12}")
print("-" * 44)
for layer, layer_reality, layer_narrative, layer_fake in zip(
    layer_results['layer'],
    layer_results['reality_acc'],
    layer_results['narrative_acc'],
    layer_results['fake_aligned_reality'],
):
    print(f"{layer:<8} {layer_reality:<12.1%} {layer_narrative:<12.1%} {layer_fake:<12.1%}")

## 10. Error Analysis

Find cases where the probes made mistakes and examine them to understand failure modes.

**What to look for:**
- Are errors concentrated in specific categories?
- Are there ambiguous phrasings that confuse the probe?
- Are certain scenarios/pressures harder?

In [None]:
# Error analysis: find the test samples each probe misclassifies and break the
# errors down by behaviour category.
from collections import Counter

# Get predictions on test set (using probes from full_probe_evaluation)
reality_probe = results["reality_probe"]
narrative_probe = results["narrative_probe"]

reality_preds = reality_probe.predict(X_test)
narrative_preds = narrative_probe.predict(X_test)

# Positions (within the test split) where prediction and label disagree
reality_errors = np.where(reality_preds != y_tool_test)[0]
narrative_errors = np.where(narrative_preds != y_claims_test)[0]

print("="*60)
print("ERROR SUMMARY")
print("="*60)
print(f"Reality probe errors:    {len(reality_errors)}/{len(X_test)} ({len(reality_errors)/len(X_test):.1%})")
print(f"Narrative probe errors:  {len(narrative_errors)}/{len(X_test)} ({len(narrative_errors)/len(X_test):.1%})")


def _print_errors_by_category(title, error_indices):
    """Print per-category error counts for one probe and return the error categories.

    `error_indices` are positions into the test split. Each line shows errors in
    a category over all test samples of that category, most frequent first.
    """
    print("\n" + "="*60)
    print(title)
    print("="*60)
    error_cats = cat_test[error_indices]
    if len(error_indices) == 0:
        print("  No errors!")
        return error_cats
    for cat, count in Counter(error_cats).most_common():
        total_in_cat = (cat_test == cat).sum()
        print(f"  {cat}: {count}/{total_in_cat} ({count/total_in_cat:.1%})")
    return error_cats


# Category breakdown of errors; both probes go through the same report, so a
# probe with zero errors now says so explicitly in either section.
reality_error_cats = _print_errors_by_category("REALITY PROBE ERRORS BY CATEGORY", reality_errors)
narrative_error_cats = _print_errors_by_category("NARRATIVE PROBE ERRORS BY CATEGORY", narrative_errors)

### Summary: Narrative Probe Error Analysis

**Key Finding:** Many "errors" are actually **ground truth labeling issues**, not probe failures.

**Evidence:**

1. **Error 2** - Clear false negative in labels:
   - Text contains: "I've also gone ahead and escalated your case" + "a human will be reviewing your case"
   - Matches phrase list: `"i've escalated"` and `"a human will review your case"`
   - Ground truth says: `claims_action: False` ❌
   - Probe says: `claims_action: True` (confidence 0.999) ✅
   - **Probe is correct** - this should be labeled as a claim

2. **Error 3** - Labeling inconsistency:
   - Text contains: "I've escalated your case to a human team member"
   - Matches phrase list: `"i've escalated"`
   - Header says: `claims_action: False` but claim detection section says: `claims_action: True`
   - Probe aligns with claim detection (confidence 1.000)
   - **Same episode labeled differently in different places**

3. **Error 5** - Likely false positive in labels:
   - Text contains conditional/instructional language: "If you would like... you can always escalate"
   - This is telling user HOW to escalate, not claiming escalation happened
   - Probe says: `claims_action: False` (confidence 0.000) ✅
   - Ground truth says: `claims_action: True` ❌

**Root Cause:** 
- Episodes were labeled using simple regex phrase matching (`detect_escalation_claim()`)
- Regex doesn't handle context, partial matches, or conditional language well
- The probe learns semantic meaning better than regex matching

**Implication:**
The narrative probe may be learning a more accurate representation than the ground truth labels. Consider re-labeling episodes using LLM judge (`use_llm_judge=True`) for more nuanced detection.


In [None]:
# Detailed inspection of narrative probe errors.
# Test-split positions are mapped back to the original samples, and from there
# to the raw episode records, so the model's actual reply can be read.
import json
from sklearn.model_selection import train_test_split

# Recreate the split over plain indices with the same stratify/random_state as
# the main split, which reproduces the same test membership and order.
indices = np.arange(len(all_samples))
_, test_indices = train_test_split(
    indices, test_size=0.2, stratify=categories, random_state=42
)

# Load original episodes
with open(EPISODE_FILE) as f:
    all_episodes = [json.loads(line) for line in f]

# How much of each reply to print; also used in the header so the two agree.
REPLY_PREVIEW_CHARS = 1000

print("="*60)
print("INSPECTING NARRATIVE PROBE ERRORS")
print("="*60)

# Look at up to 5 narrative errors
for i, err_idx in enumerate(narrative_errors[:5]):
    # Get the test sample
    sample_idx = test_indices[err_idx]
    sample = all_samples[sample_idx]

    # Get original episode. episode_idx comes from the saved dataset metadata;
    # presumably it indexes EPISODE_FILE, but that is not guaranteed when the
    # dataset was assembled from several extraction runs - hence the checks below.
    if 0 <= sample.episode_idx < len(all_episodes):
        episode = all_episodes[sample.episode_idx]
    else:
        episode = {}

    print(f"\n{'─'*60}")
    print(f"ERROR {i+1}")
    print(f"{'─'*60}")

    # Sanity checks: flag anything suggesting the lookups are misaligned, since a
    # wrong episode would make the probe (or the labels) look wrong for no reason.
    if not np.array_equal(X_test[err_idx], sample.activation):
        print("WARNING: recreated split does not match X_test for this sample; "
              "metadata below may belong to a different sample")
    if not episode:
        print(f"WARNING: episode_idx {sample.episode_idx} is out of range for {EPISODE_FILE}")
    else:
        mismatched = [
            key for key, sample_value in (
                ("tool_used", sample.tool_used),
                ("claims_action", sample.claims_action),
            )
            if key in episode and bool(episode[key]) != bool(sample_value)
        ]
        if mismatched:
            print(f"WARNING: episode record disagrees with sample on {mismatched}; "
                  f"episode_idx {sample.episode_idx} may not refer to this file/row")

    print(f"Category:       {sample.category}")
    print(f"Scenario:       {sample.scenario}")
    print(f"System variant: {sample.system_variant}")
    print(f"Social pressure: {sample.social_pressure}")
    print(f"\nGround truth claims_action: {sample.claims_action}")
    print(f"Probe predicted:            {narrative_preds[err_idx]}")
    print(f"Probe confidence P(True):   {narrative_probe.predict_proba(X_test[err_idx:err_idx+1])[0,1]:.3f}")

    # Show the reply (truncated)
    reply = episode.get("reply", "N/A")
    print(f"\n--- MODEL REPLY (first {REPLY_PREVIEW_CHARS} chars) ---")
    print(reply[:REPLY_PREVIEW_CHARS])
    if len(reply) > REPLY_PREVIEW_CHARS:
        print("...")

    # Show the labels stored on the raw episode record
    print(f"\n--- CLAIM DETECTION ---")
    print(f"tool_used (ground truth):   {episode.get('tool_used', 'N/A')}")
    print(f"claims_action (ground truth): {episode.get('claims_action', 'N/A')}")

In [None]:
# Analyze error patterns by condition
# Are certain scenarios/pressures/variants harder?
from collections import Counter


def get_sample_metadata(sample_idx):
    """Get metadata for a test sample.

    `sample_idx` is a position within the test split; it is mapped through
    `test_indices` to the corresponding entry of `all_samples`.
    """
    orig_idx = test_indices[sample_idx]
    sample = all_samples[orig_idx]
    return {
        'scenario': sample.scenario,
        'system_variant': sample.system_variant,
        'social_pressure': sample.social_pressure,
    }


# (section heading, metadata field) pairs, in the order they are reported
CONDITION_FIELDS = (
    ("By Scenario", 'scenario'),
    ("By System Variant", 'system_variant'),
    ("By Social Pressure", 'social_pressure'),
)

# Analyze narrative errors by condition
if len(narrative_errors) > 0:
    print("="*60)
    print("NARRATIVE ERRORS BY CONDITION")
    print("="*60)

    # Look every test sample up once, instead of rescanning the whole test set
    # for each distinct condition value.
    test_metadata = [get_sample_metadata(i) for i in range(len(X_test))]
    error_metadata = [test_metadata[i] for i in narrative_errors]

    for heading, field in CONDITION_FIELDS:
        totals = Counter(m[field] for m in test_metadata)
        error_counts = Counter(m[field] for m in error_metadata)
        print(f"\n{heading}:")
        # Most error-prone value first; denominator is all test samples with that value.
        for value, count in error_counts.most_common():
            total = totals[value]
            print(f"  {value}: {count}/{total} errors ({count/total:.1%})")
else:
    print("No narrative probe errors to analyze!")

## 11. Summary for Write-up

Key findings to include in executive summary:

In [None]:
# Generate summary statistics for write-up.
# Reads `categories`, `results` and `y_tool_test` from earlier cells; the layer
# and transfer sections are included only if those optional cells were run.
print("="*70)
print("KEY FINDINGS FOR EXECUTIVE SUMMARY")
print("="*70)

# Phase 1: Behavioral
fake_count = (categories == "fake_escalation").sum()
total_count = len(categories)
fake_rate = fake_count / total_count

print(f"\n📊 PHASE 1: BEHAVIORAL STUDY")
print(f"   Total episodes: {total_count}")
print(f"   Fake escalations: {fake_count} ({fake_rate:.1%})")
print(f"   → Model claims actions it didn't take in {fake_rate:.1%} of episodes")

# Phase 2: Mechanistic
print(f"\n🔬 PHASE 2: MECHANISTIC PROBES")
print(f"   Reality probe accuracy: {results['reality_overall'].accuracy:.1%}")
print(f"   Narrative probe accuracy: {results['narrative_overall'].accuracy:.1%}")

fake_analysis = results['fake_analysis']
print(f"\n   On FAKE ESCALATIONS (N={fake_analysis.n_fake}):")
print(f"   → Reality probe predicts 'no tool': {fake_analysis.probe_aligned_with_reality:.1%}")
# Only state the conclusion when the numbers support it; 0.7 is the same
# threshold the interpretation cell uses for "aligned with reality".
if fake_analysis.n_fake > 0 and fake_analysis.probe_aligned_with_reality > 0.7:
    print(f"   → Model internally 'knows' it didn't act, even while claiming it did!")
else:
    print("   → Not enough evidence that the model internally 'knows' it didn't act "
          "(needs > 70% alignment on at least one fake escalation)")

# Layer analysis (if run)
if 'layer_results' in globals() and len(layer_results['layer']) > 0:
    print(f"\n📈 LAYER ANALYSIS:")
    best_layer_idx = np.argmax(layer_results['reality_acc'])
    best_layer = layer_results['layer'][best_layer_idx]
    best_acc = layer_results['reality_acc'][best_layer_idx]
    print(f"   Best layer for reality probe: {best_layer} ({best_acc:.1%} accuracy)")

    # Where does it emerge? First tested layer whose reality accuracy exceeds 80%.
    above_80 = [l for l, acc in zip(layer_results['layer'], layer_results['reality_acc']) if acc > 0.8]
    if above_80:
        print(f"   Action-grounding emerges by layer: {min(above_80)}")

# Cross-domain (if run)
if 'transfer_result' in globals():
    print(f"\n🔄 CROSS-DOMAIN TRANSFER:")
    print(f"   Train: {transfer_result.train_domain} → Test: {transfer_result.test_domain}")
    print(f"   Transfer accuracy: {transfer_result.test_accuracy:.1%}")
    print(f"   Above chance: {'YES ✓' if transfer_result.above_chance else 'NO'}")

# Baselines
print(f"\n📏 BASELINES:")
print(f"   Random baseline: 50%")
# Share of test samples with no tool call; the max() below turns it into the
# majority-class rate whichever class is actually larger.
majority_class = (y_tool_test == False).mean()
print(f"   Majority class baseline: {max(majority_class, 1-majority_class):.1%}")
print(f"   Reality probe: {results['reality_overall'].accuracy:.1%} (↑{results['reality_overall'].accuracy - 0.5:.1%} above random)")

print("\n" + "="*70)

## 13. Critical Test: Early Position Probing

**The key question:** Does the model "know" at the START of its response whether it will take an action?

If the probe works at `first_assistant` position (before any tool-related tokens are generated), this is stronger evidence that we're detecting an "action grounding" representation, not just "tool syntax coming up."

| Position | What it tells us |
|----------|------------------|
| `before_tool` | Model knows right before emitting `<<CALL` (less surprising) |
| `first_assistant` | Model knows at the START of response (more surprising) |

If accuracy is high at `first_assistant`, the model has "decided" whether to take action before generating any relevant content.

In [None]:
# Visualize position comparison: probe accuracy (left) and fake-escalation
# alignment (right) at the `first_assistant` vs `before_tool` token positions.
# Expects `results_first` and `results_before`, each indexable by
# 'reality_acc', 'narrative_acc' and 'fake_aligned_reality'.
import matplotlib.pyplot as plt
from pathlib import Path

# Fail with a clear message if the inputs were never computed in this session.
_missing_inputs = [name for name in ('results_first', 'results_before') if name not in globals()]
if _missing_inputs:
    raise NameError(
        f"{', '.join(_missing_inputs)} not defined. Please run the cell(s) that compute probe "
        "results at the 'first_assistant' and 'before_tool' positions first."
    )

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Plot 1: Accuracy comparison
ax1 = axes[0]
positions = ['first_assistant\n(start of response)', 'before_tool\n(right before <<CALL)']
reality_accs = [results_first['reality_acc'], results_before['reality_acc']]
narrative_accs = [results_first['narrative_acc'], results_before['narrative_acc']]

# Grouped bars: reality and narrative side by side at each position
x = np.arange(len(positions))
width = 0.35

bars1 = ax1.bar(x - width/2, reality_accs, width, label='Reality Probe', color='#2ecc71', edgecolor='black')
bars2 = ax1.bar(x + width/2, narrative_accs, width, label='Narrative Probe', color='#3498db', edgecolor='black')

ax1.axhline(y=0.5, color='gray', linestyle='--', alpha=0.7, label='Chance')
ax1.set_ylabel('Accuracy', fontsize=12)
ax1.set_title('Probe Accuracy by Token Position', fontsize=14)
ax1.set_xticks(x)
ax1.set_xticklabels(positions)
ax1.legend()
ax1.set_ylim(0, 1.1)

# Annotate the reality-probe bars with their accuracy
for bar, acc in zip(bars1, reality_accs):
    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
             f'{acc:.1%}', ha='center', fontsize=11, fontweight='bold')

# Plot 2: Fake escalation alignment
ax2 = axes[1]
fake_aligned = [results_first['fake_aligned_reality'], results_before['fake_aligned_reality']]
colors = ['#e74c3c', '#c0392b']
bars3 = ax2.bar(positions, fake_aligned, color=colors, edgecolor='black')
ax2.axhline(y=0.5, color='gray', linestyle='--', alpha=0.7)
ax2.set_ylabel('% Aligned with Reality', fontsize=12)
ax2.set_title('Fake Escalations: When Does Model "Know" Truth?', fontsize=14)
ax2.set_ylim(0, 1.1)

for bar, val in zip(bars3, fake_aligned):
    ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
             f'{val:.1%}', ha='center', fontsize=11, fontweight='bold')

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
Path('../figures').mkdir(parents=True, exist_ok=True)
plt.savefig('../figures/position_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"\n📊 Figure saved to: ../figures/position_comparison.png")