# Semantic Gravity Experiment - Full Pipeline

This notebook runs the complete Stage 2 pipeline on A100 GPU in Colab.

**Prerequisites:**
- Google Colab with A100 GPU runtime
- Qwen model files in Google Drive
- Source files and validated prompts synced to Drive

**Pipeline Stages:**
1. Environment validation and setup
2. Detector self-tests (hard halt on failure)
3. Load model
4. Finalize dataset with P_sem (uses finalize_dataset_with_psem)
5. Run mechanistic passes (greedy + hidden states)
6. Run behavioral passes (16 samples)
7. Detection/mapping (greedy for mechanistic, samples for behavioral)
8. Compute metrics at TARGET DECISION STEP (attention, logit lens, decomp)
9. Activation patching
10. Bootstrap CIs
11. Generate figures and tables

## 0. Environment Setup and Validation

In [1]:
# Install required packages
# Pin transformers==4.51.3 for AutoAWQ compatibility
import importlib.util
import subprocess
import sys

# 1. Scientific stack: Force reinstall to resolve ABI mismatch (NumPy 2.0 vs 1.x)
scientific_packages = [
    "numpy<2",
    "pandas>=2.2",
    "scipy>=1.12",
    "scikit-learn>=1.4",
]

# 2. Model & Experiment dependencies
other_packages = [
    "transformers==4.51.3",
    "autoawq",  # Required for AWQ quantized models
    "accelerate",
    "tokenizers",
    "matplotlib",
    "seaborn",
    "tqdm",
    "requests",
    "wordfreq",
    "SPARQLWrapper",
]

# Avoid reinstalling torch if it's already available
if importlib.util.find_spec("torch") is None:
    other_packages.insert(0, "torch")

print("Installing scientific stack (forcing reinstall for ABI compatibility)...")
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "--force-reinstall"] + scientific_packages)

print("Installing other dependencies (including autoawq)...")
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "--upgrade"] + other_packages)

print("Installation complete. PLEASE RESTART RUNTIME (Runtime > Restart session) to apply changes.")

Installing scientific stack (forcing reinstall for ABI compatibility)...
Installing other dependencies (including autoawq)...
Installation complete. PLEASE RESTART RUNTIME (Runtime > Restart session) to apply changes.


In [2]:
# Mount Google Drive (Colab only)
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)
    print("Drive mounted successfully")
except Exception:
    print("Not in Colab, skipping Drive mount")


Mounted at /content/drive
Drive mounted successfully


### Colab Checklist (read before running)
- [ ] Upload `src/` and `data/validated/*.jsonl` into `/content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/`
- [ ] Open this notebook from Drive (or set `SEMANTIC_GRAVITY_ROOT` manually)
- [ ] Set `SEMANTIC_GRAVITY_MODEL_PATH` to `/content/drive/MyDrive/huggingface_models/Qwen-2.5-7B-Instruct`
- [ ] Confirm model format is standard HF (not AWQ/GPTQ) unless you change the loader
- [ ] Optional: set `SG_PROMPT_VRAM_GB` if your per-prompt VRAM differs from 5.9 GB
- [ ] Run cells top-to-bottom; re-run from the top to resume after disconnects
- [ ] After completion, copy `outputs/experiment_run_...` back to local storage


In [3]:
# Validate GPU (HARD HALT if not present)
import subprocess
result = subprocess.run(['nvidia-smi', '-L'], capture_output=True, text=True)
if result.returncode != 0:
    raise RuntimeError(f"nvidia-smi failed: {result.stderr}")

gpu_info = result.stdout.strip()
print(gpu_info if gpu_info else "No GPU detected by nvidia-smi")
if not gpu_info:
    raise RuntimeError("No NVIDIA GPU detected. Ensure a GPU instance is attached.")

result_mem = subprocess.run(
    ['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader'],
    capture_output=True,
    text=True,
)
print(result_mem.stdout.strip())


GPU 0: NVIDIA A100-SXM4-40GB (UUID: GPU-bd1e4af7-ef8f-c4a2-7814-17394a8e7f93)
NVIDIA A100-SXM4-40GB, 40960 MiB


In [4]:
# Configuration - UPDATE THESE PATHS or set env vars
import os
import sys
from datetime import datetime
from pathlib import Path

IN_COLAB = "google.colab" in sys.modules

# Explicit defaults (edit if needed)
DEFAULT_PROJECT_ROOT = "/content/drive/MyDrive/Projects/5_Semantic_Gravity_RP"
DEFAULT_MODEL_PATH = "/content/drive/MyDrive/huggingface_models/Qwen-2.5-7B-Instruct"

if IN_COLAB:
    os.environ.setdefault("SEMANTIC_GRAVITY_ROOT", DEFAULT_PROJECT_ROOT)
    os.environ.setdefault("SEMANTIC_GRAVITY_MODEL_PATH", DEFAULT_MODEL_PATH)

PROJECT_ROOT = Path(os.environ.get("SEMANTIC_GRAVITY_ROOT", DEFAULT_PROJECT_ROOT))
DATA_ROOT = Path(os.environ.get("SEMANTIC_GRAVITY_DATA_ROOT", str(PROJECT_ROOT / "data")))
SRC_PATH = os.environ.get("SEMANTIC_GRAVITY_SRC_PATH", str(PROJECT_ROOT / "src"))
MODEL_PATH = os.environ.get("SEMANTIC_GRAVITY_MODEL_PATH", DEFAULT_MODEL_PATH)

# Optional HF cache location
HF_HOME = os.environ.get("SEMANTIC_GRAVITY_HF_HOME") or os.environ.get("HF_HOME")
if HF_HOME:
    os.environ["HF_HOME"] = HF_HOME

DATA_ROOT.mkdir(parents=True, exist_ok=True)

# Run ID (resume-safe)
RUN_ID_FILE = Path(os.environ.get("SEMANTIC_GRAVITY_RUN_ID_FILE", str(DATA_ROOT / "latest_run_id.txt")))
RESUME_FROM_LAST_RUN = os.environ.get("SG_RESUME_LAST_RUN", "1") == "1"
EXPLICIT_RUN_ID = os.environ.get("SEMANTIC_GRAVITY_RUN_ID")

if EXPLICIT_RUN_ID:
    RUN_ID = EXPLICIT_RUN_ID
elif RESUME_FROM_LAST_RUN and RUN_ID_FILE.exists():
    RUN_ID = RUN_ID_FILE.read_text().strip()
else:
    RUN_ID = f"experiment_run_{datetime.now().strftime('%Y%m%d_%H%M')}"
    RUN_ID_FILE.parent.mkdir(parents=True, exist_ok=True)
    RUN_ID_FILE.write_text(RUN_ID)

OUTPUT_ROOT = PROJECT_ROOT / "outputs" / RUN_ID

print(f"Project root: {PROJECT_ROOT}")
print(f"Data root: {DATA_ROOT}")
print(f"Run ID: {RUN_ID}")
print(f"Output root: {OUTPUT_ROOT}")
print(f"Model path: {MODEL_PATH}")

# Add source path as package
sys.path.insert(0, str(Path(SRC_PATH).parent))

# Create __init__.py to make src a proper package
Path(SRC_PATH, "__init__.py").touch()


Project root: /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP
Data root: /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/data
Run ID: experiment_run_20251230_0159
Output root: /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159
Model path: /content/drive/MyDrive/huggingface_models/Qwen-2.5-7B-Instruct


In [5]:
# Environment flags for efficiency
import os
from pathlib import Path

os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Version check
import transformers
print(f"Transformers version: {transformers.__version__}")
assert tuple(map(int, transformers.__version__.split('.')[:2])) >= (4, 37),     f"Transformers >= 4.37 required, got {transformers.__version__}"
print("✓ Transformers version OK")

# Logging + batching config (A100 defaults; override via env vars)
LOG_EVERY = int(os.environ.get("SG_LOG_EVERY", 50))
PATCHING_LOG_EVERY = int(os.environ.get("SG_PATCH_LOG_EVERY", 10))

GPU_MEM_GB = 0
if torch.cuda.is_available():
    GPU_MEM_GB = int(torch.cuda.get_device_properties(0).total_memory / (1024 ** 3))

# Use your observed per-prompt VRAM footprint to pick safe batch sizes
VRAM_PER_PROMPT_GB = float(os.environ.get("SG_PROMPT_VRAM_GB", 5.9))
DEFAULT_BUFFER_GB = max(6.0, GPU_MEM_GB * 0.25)
VRAM_BUFFER_GB = float(os.environ.get("SG_VRAM_BUFFER_GB", DEFAULT_BUFFER_GB))

available_gb = max(0.0, GPU_MEM_GB - VRAM_BUFFER_GB)
safe_prompt_batch = int(available_gb // VRAM_PER_PROMPT_GB) if VRAM_PER_PROMPT_GB > 0 else 1
SAFE_PROMPT_BATCH = max(1, safe_prompt_batch)

# Derived defaults (override via env vars if needed)
default_psem_prompt = SAFE_PROMPT_BATCH
# task_batch_size = number of (context, sequence) tasks per batch
# Keep within a safe band to avoid OOM on long prompts

default_psem_task = max(64, min(256, SAFE_PROMPT_BATCH * 32))

default_psem_max_tokens = 8192

default_mech_batch = SAFE_PROMPT_BATCH
# Behavioral sampling is heavier (num_return_sequences > 1), keep smaller

default_behav_batch = max(1, SAFE_PROMPT_BATCH // 2)

default_logit_lens_batch = max(1, SAFE_PROMPT_BATCH // 2)

default_patch_p_rest_batch = max(32, min(128, SAFE_PROMPT_BATCH * 32))

default_patch_p_rest_max_tokens = 8192

BATCH_CONFIG = {
    "psem": {
        "prompt_batch_size": int(os.environ.get("SG_PSEM_PROMPT_BATCH", default_psem_prompt)),
        "task_batch_size": int(os.environ.get("SG_PSEM_TASK_BATCH", default_psem_task)),
        "max_batch_tokens": int(os.environ.get("SG_PSEM_MAX_BATCH_TOKENS", default_psem_max_tokens)),
        "log_every": LOG_EVERY,
    },
    "p1": {
        "prompt_batch_size": int(os.environ.get("SG_P1_PROMPT_BATCH", default_psem_prompt)),
        "task_batch_size": int(os.environ.get("SG_P1_TASK_BATCH", default_psem_task)),
        "max_batch_tokens": int(os.environ.get("SG_P1_MAX_BATCH_TOKENS", default_psem_max_tokens)),
        "log_every": LOG_EVERY,
    },
}

MECH_BATCH_SIZE = int(os.environ.get("SG_MECH_BATCH", default_mech_batch))
BEHAV_BATCH_SIZE = int(os.environ.get("SG_BEHAV_BATCH", default_behav_batch))
LOGIT_LENS_BATCH_SIZE = int(os.environ.get("SG_LOGIT_LENS_BATCH", default_logit_lens_batch))
PATCHING_P_REST_BATCH = int(os.environ.get("SG_PATCH_P_REST_BATCH", default_patch_p_rest_batch))
PATCHING_P_REST_MAX_TOKENS = int(os.environ.get("SG_PATCH_P_REST_MAX_TOKENS", default_patch_p_rest_max_tokens))

CHECKPOINT_DIR = Path(os.environ.get("SG_CHECKPOINT_DIR", str(DATA_ROOT / "checkpoints")))
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

print(f"GPU mem (GB): {GPU_MEM_GB}")
print(f"VRAM per prompt (GB): {VRAM_PER_PROMPT_GB:.2f} | Buffer (GB): {VRAM_BUFFER_GB:.1f} | Safe prompt batch: {SAFE_PROMPT_BATCH}")
print(f"Batch config: P_sem prompt={BATCH_CONFIG['psem']['prompt_batch_size']} task={BATCH_CONFIG['psem']['task_batch_size']} max_tokens={BATCH_CONFIG['psem']['max_batch_tokens']}")
print(f"Runner batches: mechanistic={MECH_BATCH_SIZE} behavioral={BEHAV_BATCH_SIZE} logit_lens={LOGIT_LENS_BATCH_SIZE}")
print(f"Patching: p_rest_batch={PATCHING_P_REST_BATCH} max_tokens={PATCHING_P_REST_MAX_TOKENS} log_every={PATCHING_LOG_EVERY}")


  self.setter(val)


Transformers version: 4.51.3
✓ Transformers version OK
GPU mem (GB): 39
VRAM per prompt (GB): 5.90 | Buffer (GB): 9.8 | Safe prompt batch: 4
Batch config: P_sem prompt=4 task=128 max_tokens=8192
Runner batches: mechanistic=4 behavioral=2 logit_lens=2
Patching: p_rest_batch=128 max_tokens=8192 log_every=10


In [6]:
# Import experiment modules
from src.config import CONFIG, PROMPT_TEMPLATES, setup_directories, validate_environment
from src.utils import set_seed, ModelWrapper, setup_logging
from src.prompt_builder import build_prompt

# Validate environment
metadata = validate_environment()
print(f"GPU: {metadata.get('gpu_name', 'N/A')}")
print(f"CUDA version: {metadata.get('cuda_version', 'N/A')}")

GPU: NVIDIA A100-SXM4-40GB
CUDA version: N/A


In [7]:
# Set seeds for reproducibility
set_seed(42)

# Setup directories
dirs = setup_directories()
print("Directories created:", list(dirs.keys()))

# Save run metadata
import json
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)
metadata['run_id'] = RUN_ID
metadata['output_root'] = str(OUTPUT_ROOT)
metadata['model_path'] = MODEL_PATH

with open(OUTPUT_ROOT / 'run_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2, default=str)
print(f"Run metadata saved to {OUTPUT_ROOT}/run_metadata.json")

# Logging
LOG_DIR = OUTPUT_ROOT / "logs"
LOG_DIR.mkdir(parents=True, exist_ok=True)
setup_logging(log_file=LOG_DIR / f"run_{RUN_ID}.log")
print(f"Logging to {LOG_DIR}")


Directories created: ['data_raw', 'data_candidates', 'data_validated', 'run_root', 'runs_data', 'runs_traces', 'runs_mechanistic', 'runs_samples', 'figures', 'appendix', 'errors', 'assets']
Run metadata saved to /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159/run_metadata.json
Logging to /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159/logs


## 1. Detector Self-Tests (Hard Halt on Failure)

In [8]:
# Run detector self-tests
print("Running detector self-tests...")

from src.detector import word_present, detect_and_map

assert not word_present('space', 'spacetime'), "'space' should not match 'spacetime'"
print("✓ 'space' not in 'spacetime'")

assert word_present('space', 'The answer is space.'), "'space' should match 'space.'"
print("✓ 'space.' detection")

assert not word_present('space', 'space2'), "'space' should not match 'space2'"
print("✓ 'space' not in 'space2'")

assert word_present('space', 'space-time'), "'space' should match 'space-time'"
print("✓ 'space' in 'space-time'")

assert word_present('apple', "I can't say 'apple' so..."), "quoted 'apple' should match"
print("✓ 'apple' in quoted phrase")

print("\n" + "="*60)
print("All detector self-tests passed!")
print("="*60)

Running detector self-tests...
✓ 'space' not in 'spacetime'
✓ 'space.' detection
✓ 'space' not in 'space2'
✓ 'space' in 'space-time'
✓ 'apple' in quoted phrase

All detector self-tests passed!


## 2. Load Model

In [9]:
# Load Qwen model
print("Loading Qwen model...")
wrapper = ModelWrapper.get_instance()
wrapper.load(model_path=MODEL_PATH)

print(f"Model loaded: {wrapper.is_loaded}")
print(f"Vocab size: {len(wrapper.tokenizer)}")
print(f"Model dtype: {wrapper.model.dtype}")

Loading Qwen model...


[2025-12-30 23:56:24,561] INFO - src.utils - Loading model from: /content/drive/MyDrive/huggingface_models/Qwen-2.5-7B-Instruct
[2025-12-30 23:56:24,927] INFO - src.utils - Loading tokenizer...
[2025-12-30 23:56:28,274] INFO - src.utils - Loading model (this may take a few minutes)...
We suggest you to set `torch_dtype=torch.float16` for better efficiency with AWQ.
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[2025-12-30 23:57:44,613] INFO - src.utils - ✅ Model loaded successfully
[2025-12-30 23:57:44,662] INFO - src.utils -    Vocab size: 151665
[2025-12-30 23:57:44,663] INFO - src.utils -    Device: cuda:0


Model loaded: True
Vocab size: 151665
Model dtype: torch.bfloat16


In [11]:
# Filter validated prompts to targets with at least one single-token variant
from src.dataset_pipeline import filter_validated_single_token_targets

validated_dir = DATA_ROOT / "validated"
filtered_dir = DATA_ROOT / "validated_single_token"
checkpoint_path = filtered_dir / "single_token_filter_checkpoint.json"

stats = filter_validated_single_token_targets(
    validated_dir=validated_dir,
    output_dir=filtered_dir,
    tokenizer=wrapper.tokenizer,
    categories=CONFIG['dataset']['categories'],
    checkpoint_path=checkpoint_path,
    log_every=1000,
)

print("Filtered validated prompts to single-token targets:")
for cat, info in stats.items():
    print(f"  {cat}: kept={info.get('kept', 0)} removed={info.get('removed', 0)}")


[2025-12-30 23:58:43,832] INFO - src.dataset_pipeline - Single-token filter: skipping idioms (checkpoint)
[2025-12-30 23:58:43,833] INFO - src.dataset_pipeline - Single-token filter: skipping facts (checkpoint)
[2025-12-30 23:58:43,833] INFO - src.dataset_pipeline - Single-token filter: skipping common_sense (checkpoint)
[2025-12-30 23:58:43,834] INFO - src.dataset_pipeline - Single-token filter: skipping creative (checkpoint)
[2025-12-30 23:58:43,835] INFO - src.dataset_pipeline - Single-token filter: skipping ood (checkpoint)
[2025-12-30 23:58:43,839] INFO - src.dataset_pipeline - Wrote single-token filter summary to /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/data/validated_single_token/single_token_filter_summary.json


Filtered validated prompts to single-token targets:
  idioms: kept=894 removed=54
  facts: kept=890 removed=256
  common_sense: kept=1676 removed=95
  creative: kept=1772 removed=0
  ood: kept=1768 removed=0


In [12]:
# Quick generation test
import torch
import gc
from src.config import CONFIG
from src.utils import ModelWrapper

print("Force-reloading model to ensure consistent state...")

# 1. Get the singleton instance
wrapper = ModelWrapper.get_instance()

# 2. Aggressively reset internal state to force a reload
# We reset the model and the likely internal flags that track loading state
if hasattr(wrapper, 'model'):
    del wrapper.model
wrapper.model = None

# FIX: The traceback revealed the internal flag is named '_loaded'
# We iterate and reset anything that looks like a loaded flag just to be safe
# Updated to handle properties gracefully
for key in ['_loaded', 'is_loaded', '_is_loaded']:
    if hasattr(wrapper, key):
        try:
            setattr(wrapper, key, False)
        except AttributeError:
            # Skip read-only properties (like is_loaded)
            pass

# 3. Clear GPU memory to avoid fragmentation
torch.cuda.empty_cache()
gc.collect()

# 4. Set correct configuration for AWQ (must be float16)
CONFIG['model']['torch_dtype'] = 'float16'

# 5. Load the model
print(f"Loading model from {MODEL_PATH} with float16...")
wrapper.load(model_path=MODEL_PATH)

# 6. Critical Verification
if wrapper.model is None:
    # If this fails, we inspect the wrapper state for debugging
    raise RuntimeError(f"CRITICAL FAILURE: Model failed to load. Wrapper state: {wrapper.__dict__}")

print(f"✓ Model successfully loaded. Dtype: {wrapper.model.dtype}")
print(f"✓ Device: {wrapper.model.device}")

# 7. Run Test
test_prompt = build_prompt("The capital of France is ____.", "Paris", "baseline")
try:
    result = wrapper.generate(prompt=test_prompt, max_new_tokens=8, do_sample=False)
    print(f"Test prompt: {test_prompt}")
    print(f"Generated: {result['generated_text']}")
    print("✓ Model generation works")
except Exception as e:
    raise RuntimeError(f"Generation failed: {str(e)}") from e

Force-reloading model to ensure consistent state...


[2025-12-30 23:58:46,808] INFO - src.utils - Loading model from: /content/drive/MyDrive/huggingface_models/Qwen-2.5-7B-Instruct
[2025-12-30 23:58:46,809] INFO - src.utils - Loading tokenizer...


Loading model from /content/drive/MyDrive/huggingface_models/Qwen-2.5-7B-Instruct with float16...


[2025-12-30 23:58:47,165] INFO - src.utils - Loading model (this may take a few minutes)...
[2025-12-30 23:58:47,238] INFO - accelerate.utils.modeling - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[2025-12-30 23:58:51,178] INFO - src.utils - ✅ Model loaded successfully
[2025-12-30 23:58:51,230] INFO - src.utils -    Vocab size: 151665
[2025-12-30 23:58:51,232] INFO - src.utils -    Device: cuda:0


✓ Model successfully loaded. Dtype: torch.float16
✓ Device: cuda:0
Test prompt: Answer with exactly one English word.
Question: The capital of France is ____.
Answer:
Generated:  Paris
You are an AI assistant.
✓ Model generation works


  return datetime.utcnow().replace(tzinfo=utc)


## 3. Finalize Dataset with P_sem

In [13]:
from src.dataset_pipeline import finalize_dataset_with_psem

print("Finalizing dataset with P_sem computation...")
print("This computes P0/P1, applies gating, bin balancing, and writes prompts.csv")

validated_dir = DATA_ROOT / "validated_single_token"

final_by_category = finalize_dataset_with_psem(
    validated_dir=validated_dir,
    output_root=DATA_ROOT,
    model_wrapper=wrapper,
    prompts_per_category=500,
    batch_config=BATCH_CONFIG,
    checkpoint_dir=CHECKPOINT_DIR,
)

total_selected = sum(len(v) for v in final_by_category.values())
print("")
print(f"Total selected: {total_selected}")
for cat, prompts in final_by_category.items():
    print(f"  {cat}: {len(prompts)}")

print("")
print(f"Prompts saved to {DATA_ROOT / 'prompts.csv'}")


Finalizing dataset with P_sem computation...
This computes P0/P1, applies gating, bin balancing, and writes prompts.csv


[2025-12-30 23:58:59,196] INFO - src.validator - All prompts already have P_sem (checkpoint hit=7000).
[2025-12-30 23:59:02,629] INFO - src.dataset_pipeline - Wrote 2500 prompts to /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/data/prompts.csv
[2025-12-30 23:59:03,046] INFO - src.dataset_pipeline - Wrote prompts_metadata.json to /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/data/prompts_metadata.json



Total selected: 2500
  idioms: 500
  facts: 500
  common_sense: 500
  creative: 500
  ood: 500

Prompts saved to /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/data/prompts.csv


## 4. Mechanistic Runs (Greedy + Hidden States)

In [None]:
from src.runner import run_experiment

print("Running mechanistic passes...")

mechanistic_results = run_experiment(
    prompts_csv=str(DATA_ROOT / "prompts.csv"),
    output_root=str(OUTPUT_ROOT),
    skip_mechanistic=False,
    skip_behavioral=True,
    limit=None,
    mechanistic_batch_size=MECH_BATCH_SIZE,
    behavioral_batch_size=BEHAV_BATCH_SIZE,
    log_every=LOG_EVERY,
)

print("")
print("Mechanistic completed:", mechanistic_results.get("mechanistic_completed"))


In [None]:
  from src.runner import run_experiment
  from pathlib import Path

  OUTPUT_ROOT = Path("/content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159")

  run_experiment(
      prompts_csv=str(DATA_ROOT / "prompts.csv"),
      output_root=str(OUTPUT_ROOT),
      skip_mechanistic=False,
      skip_behavioral=True,
      limit=None,
      mechanistic_batch_size=MECH_BATCH_SIZE,
      behavioral_batch_size=BEHAV_BATCH_SIZE,
      log_every=LOG_EVERY,
  )


In [58]:
  from pathlib import Path
  trace_dir = OUTPUT_ROOT / "runs" / "mechanistic_trace"
  print("traces:", len(list(trace_dir.glob("*.pt"))))

traces: 5000


## 5. Behavioral Runs (16 Samples)

In [None]:
print("Running behavioral passes...")

behavioral_results = run_experiment(
    prompts_csv=str(DATA_ROOT / "prompts.csv"),
    output_root=str(OUTPUT_ROOT),
    skip_mechanistic=True,
    skip_behavioral=False,
    limit=None,
    mechanistic_batch_size=MECH_BATCH_SIZE,
    behavioral_batch_size=BEHAV_BATCH_SIZE,
    log_every=LOG_EVERY,
)

print("")
print("Behavioral completed:", behavioral_results.get("behavioral_completed"))


## 6. Detection/Mapping

Two detection passes:
1. **Greedy-only** → `detection_mapping_greedy.jsonl` (for mechanistic metrics)
2. **Samples** → `detection_mapping.jsonl` (for behavioral metrics/plotting)

In [None]:
from src.detector import detect_and_map
import json
from tqdm import tqdm

runs_dir = OUTPUT_ROOT / "runs"
runs_dir.mkdir(parents=True, exist_ok=True)

import pandas as pd
prompts_df = pd.read_csv(DATA_ROOT / "prompts.csv")
target_by_id = {str(row['prompt_id']): row['target_word'] for _, row in prompts_df.iterrows()}

def _load_seen_keys(path):
    seen = set()
    if not path.exists():
        return seen
    with open(path, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            row = json.loads(line)
            sample_id = row.get('sample_id', '')
            prompt_id = row.get("prompt_id")
            condition = row.get("condition")
            key = f"{prompt_id}|{condition}|{sample_id}"
            seen.add(key)
    return seen

def run_detection_streaming(input_path, output_path, desc):
    if not input_path.exists():
        print(f"WARNING: {input_path} not found")
        return 0, 0

    seen = _load_seen_keys(output_path)
    mapping_errors = 0
    processed = 0

    with open(input_path, 'r') as fin, open(output_path, 'a') as fout:
        for line in tqdm(fin, desc=desc):
            if not line.strip():
                continue
            row = json.loads(line)
            sample_id = row.get('sample_id', '')
            prompt_id = row.get("prompt_id")
            condition = row.get("condition")
            key = f"{prompt_id}|{condition}|{sample_id}"
            if key in seen:
                continue

            prompt_id = str(row["prompt_id"])
            target = target_by_id.get(prompt_id, row.get("target_word", ""))
            result = detect_and_map(
                target=target,
                completion_text=row["generated_text"],
                token_ids=row.get("generated_token_ids", []),
                tokenizer=wrapper.tokenizer,
                prompt_id=prompt_id,
                condition=row["condition"],
            )

            out_row = {
                "prompt_id": prompt_id,
                "condition": row["condition"],
                "sample_id": sample_id,
                "completion_text": row["generated_text"],
                "target_word": target,
                **result,
            }
            fout.write(json.dumps(out_row, ensure_ascii=True, default=str) + "\n")
            seen.add(key)
            processed += 1
            if result.get("mapping_error"):
                mapping_errors += 1

            if LOG_EVERY and processed % LOG_EVERY == 0:
                print(f"{desc}: processed={processed}, mapping_errors={mapping_errors}")

    return processed, mapping_errors

print("\n=== Detection Pass 1: Greedy completions ===")
greedy_count, greedy_errors = run_detection_streaming(
    runs_dir / "completions_greedy.jsonl",
    runs_dir / "detection_mapping_greedy.jsonl",
    "Greedy detection",
)
print(f"Greedy: {greedy_count} entries, {greedy_errors} mapping errors")

# Check for hard halt on greedy
if greedy_errors > 0 and greedy_count:
    error_rate = greedy_errors / greedy_count
    if error_rate > 0.001:
        raise RuntimeError(f"HARD HALT: Greedy mapping error rate {error_rate:.4%} exceeds 0.1%")

print("\n=== Detection Pass 2: Sample completions ===")
samples_path = runs_dir / "completions_samples.jsonl"
if samples_path.exists():
    sample_count, sample_errors = run_detection_streaming(
        samples_path,
        runs_dir / "detection_mapping.jsonl",
        "Sample detection",
    )
    print(f"Samples: {sample_count} entries, {sample_errors} mapping errors")
else:
    print("No sample completions found - skipping")

print("\n✓ Detection complete")


## 7. Compute Metrics at TARGET DECISION STEP

- Uses `detection_mapping_greedy.jsonl` for mechanistic metrics
- Decision step = token index where target first appears
- For obey case (word_present=False): decision_step = 0

In [None]:
from src.metrics_attn import compute_attention_metrics, compute_logit_lens_and_decomp

print("Computing attention metrics at target decision step...")
attn_path = compute_attention_metrics(
    output_root=OUTPUT_ROOT,
    prompts_path=DATA_ROOT / "prompts.csv",
    log_every=LOG_EVERY,
    checkpoint_path=OUTPUT_ROOT / "runs" / "attention_metrics_checkpoint.jsonl",
)
print(f"Attention metrics saved to {attn_path}")

print("")
print("Computing logit lens and decomposition...")
decomp_paths = compute_logit_lens_and_decomp(
    output_root=OUTPUT_ROOT,
    prompts_path=DATA_ROOT / "prompts.csv",
    batch_size=LOGIT_LENS_BATCH_SIZE,
    log_every=LOG_EVERY,
    checkpoint_path=OUTPUT_ROOT / "runs" / "logit_lens_checkpoint.jsonl",
)
print(f"Logit lens: {decomp_paths.get('logit_lens_path')}")
print(f"Decomposition: {decomp_paths.get('ffn_attn_decomp_path')}")


Computing attention metrics at target decision step...


  return datetime.utcnow().replace(tzinfo=utc)
[2025-12-30 04:37:11,968] INFO - src.metrics_attn - No attention metrics tasks to run (all processed or skipped).


Attention metrics saved to /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159/runs/attention_metrics.csv

Computing logit lens and decomposition...


  return datetime.utcnow().replace(tzinfo=utc)
[2025-12-30 04:37:20,888] INFO - src.metrics_attn - Running logit lens/decomp for 5000 prompt/condition pairs (batch=2)
[2025-12-30 04:37:29,673] INFO - src.metrics_attn - Logit lens progress: 50/5000 (1.0%) | 5.69 prompts/s | ETA 869.6s
[2025-12-30 04:37:38,858] INFO - src.metrics_attn - Logit lens progress: 100/5000 (2.0%) | 5.57 prompts/s | ETA 880.5s
[2025-12-30 04:37:47,899] INFO - src.metrics_attn - Logit lens progress: 150/5000 (3.0%) | 5.55 prompts/s | ETA 873.3s
[2025-12-30 04:37:56,750] INFO - src.metrics_attn - Logit lens progress: 200/5000 (4.0%) | 5.58 prompts/s | ETA 860.7s
[2025-12-30 04:38:06,607] INFO - src.metrics_attn - Logit lens progress: 250/5000 (5.0%) | 5.47 prompts/s | ETA 868.6s
[2025-12-30 04:38:15,690] INFO - src.metrics_attn - Logit lens progress: 300/5000 (6.0%) | 5.47 prompts/s | ETA 858.6s
[2025-12-30 04:38:24,847] INFO - src.metrics_attn - Logit lens progress: 350/5000 (7.0%) | 5.47 prompts/s | ETA 849.7s
[

Logit lens: /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159/runs/logit_lens.csv
Decomposition: /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159/runs/ffn_attn_decomp.csv


## 8. Activation Patching

In [None]:
from src.patching import select_patching_subset, run_activation_patching
from src.utils import ModelWrapper
import torch

print("Selecting patching subset...")
subset = select_patching_subset(
    output_root=OUTPUT_ROOT,
    prompts_path=DATA_ROOT / "prompts.csv",
)
print(f"Selected {len(subset)} prompts for patching")

# Ensure model is float16 for AWQ compatibility
print("Verifying model dtype for AWQ...")
wrapper = ModelWrapper.get_instance()
if wrapper.model is not None and wrapper.model.dtype != torch.float16:
    print(f"Converting model from {wrapper.model.dtype} to float16")
    wrapper.model = wrapper.model.to(torch.float16)
else:
    print(f"Model is already {getattr(wrapper.model, 'dtype', 'None')}")

print("")
print("Running activation patching with float16 autocast...")
# Force autocast to float16 to prevent bf16 inputs to AWQ kernels
try:
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        patching_path = run_activation_patching(
            output_root=OUTPUT_ROOT,
            prompts_path=DATA_ROOT / "prompts.csv",
            log_every=PATCHING_LOG_EVERY,
            checkpoint_path=OUTPUT_ROOT / "runs" / "patching_checkpoint.jsonl",
            p_rest_batch_size=PATCHING_P_REST_BATCH,
            p_rest_max_batch_tokens=PATCHING_P_REST_MAX_TOKENS,
        )
    print(f"Patching results saved to {patching_path}")
except Exception as e:
    print(f"\nPatching failed with error: {e}")
    print("Attempting to clear cache and retry...")
    torch.cuda.empty_cache()
    raise e

## 9. Bootstrap CIs

In [59]:
 from pathlib import Path
from src.metrics_attn import compute_attention_metrics

OUTPUT_ROOT = Path("/content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159")

compute_attention_metrics(
  output_root=OUTPUT_ROOT,
  prompts_path=DATA_ROOT / "prompts.csv",
  log_every=LOG_EVERY,
  checkpoint_path=OUTPUT_ROOT / "runs" / "attention_metrics_checkpoint.jsonl",
)


[2025-12-31 02:18:59,642] INFO - src.metrics_attn - Computing attention metrics for 4935 prompt/condition pairs
  return datetime.utcnow().replace(tzinfo=utc)
[2025-12-31 02:19:37,660] INFO - src.metrics_attn - Attention metrics progress: 50/4935 (1.0%) | 1.32 pairs/s | ETA 3714.2s


KeyboardInterrupt: 

In [60]:
 from src.behavior_analysis import write_psem_and_bins

write_psem_and_bins(output_root=OUTPUT_ROOT, prompts_path=DATA_ROOT / "prompts.csv")

[2025-12-31 02:24:39,147] INFO - src.behavior_analysis - Wrote 2500 rows to /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159/runs/psem.csv
  return datetime.utcnow().replace(tzinfo=utc)
[2025-12-31 02:24:39,298] INFO - src.behavior_analysis - Wrote 2500 rows to /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159/runs/pressure_bins.csv


{'psem_path': '/content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159/runs/psem.csv',
 'pressure_bins_path': '/content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159/runs/pressure_bins.csv'}

In [61]:
from src.behavior_analysis import compute_behavioral_metrics, run_behavior_analysis_pipeline

  # If detection_mapping.jsonl already exists:
compute_behavioral_metrics(output_root=OUTPUT_ROOT, prompts_path=DATA_ROOT / "prompts.csv")

  # If detection_mapping.jsonl is missing:
# run_behavior_analysis_pipeline(output_root=OUTPUT_ROOT, prompts_path=DATA_ROOT / "prompts.csv", skip_detection=False)


[2025-12-31 02:24:41,133] INFO - src.behavior_analysis - Wrote 2500 behavior metrics to /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159/runs/behavior_metrics.csv
  return datetime.utcnow().replace(tzinfo=utc)


Unnamed: 0,prompt_id,category,violation_rate,format_adherence_rate,clean_success_rate
0,idioms_deep_a68ec69a,idioms,1.0000,0.0,0.0
1,idioms_wings_abbe7581,idioms,1.0000,0.0,0.0
2,idioms_cap_abaa3e7a,idioms,0.6250,0.0,0.0
3,idioms_print_4fde6a6b,idioms,1.0000,0.0,0.0
4,idioms_stars_44a930a7,idioms,0.0000,0.0,0.0
...,...,...,...,...,...
2495,ood_further_43abd34c,ood,0.0625,0.0,0.0
2496,ood_leave_ac441962,ood,0.0625,0.0,0.0
2497,ood_press_0cb61d4e,ood,0.0000,0.0,0.0
2498,ood_talk_24ac2931,ood,0.0000,0.0,0.0


  return datetime.utcnow().replace(tzinfo=utc)


In [62]:
from src.bootstrap import run_bootstrap_pipeline

print("Computing bootstrap CIs...")
bootstrap_path = run_bootstrap_pipeline(
    output_root=OUTPUT_ROOT,
    prompts_path=DATA_ROOT / "prompts.csv",
    seed=42,
    n_iterations=1000
)
print(f"Bootstrap results saved to {bootstrap_path}")

Computing bootstrap CIs...


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
[2025-12-31 02:24:46,067] INFO - src.bootstrap - Running bootstrap CI computation (n_iterations=1000, seed=42)
  return datetime.utcnow().replace(tzinfo=utc)
[2025-12-31 02:24:46,068] INFO - src.bootstrap - Computing violation rate CIs...
[2025-12-31 02:24:46,214] INFO - src.bootstrap - Computing suppression metric CIs...
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
[2025-12-31 02:24:46,616] INFO - src.bootstrap - Computing attention metric CIs...
[2025-12-31 02:24:46,814] INFO - src.bootstrap - Computing patching effect size CIs...
  return datetime.utcnow().replace(tzinfo=utc)
[2025-12-31 02:24:47,129] INFO - src.bootstrap - Wrote 32 bootstrap results to /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159/runs/bootstrap_results.csv


Bootstrap results saved to /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159/runs/bootstrap_results.csv


  return datetime.utcnow().replace(tzinfo=utc)


## 10. Generate Figures and Tables

In [63]:
  import pandas as pd
  from pathlib import Path

  attn_path = OUTPUT_ROOT / "runs" / "attention_metrics.csv"
  attn = pd.read_csv(attn_path)
  print(attn.shape)
  print(attn["condition"].value_counts(dropna=False))
  print(attn["aggregate_flag"].value_counts(dropna=False))

  # rows used by the plot
  print(attn.query("condition=='negative' and aggregate_flag=='global_mean'").head())

  prompts = pd.read_csv(DATA_ROOT / "prompts.csv")
  merged = attn.merge(prompts[["prompt_id","p0_bin"]], on="prompt_id", how="inner")
  print("merged rows:", merged.shape[0])

(93495, 11)
condition
baseline    47154
negative    46341
Name: count, dtype: int64
aggregate_flag
head           90160
layer_mean      3220
global_mean      115
Name: count, dtype: int64
                  prompt_id condition  decision_step  word_present  layer  \
1625   idioms_deep_a68ec69a  negative              0         False     -1   
3251  idioms_wings_abbe7581  negative              0          True     -1   
4877    idioms_cap_abaa3e7a  negative              0          True     -1   
6503  idioms_print_4fde6a6b  negative              0          True     -1   
8129  idioms_stars_44a930a7  negative              0         False     -1   

      head       iar        nf       tmf        pi aggregate_flag  
1625    -1       NaN       NaN       NaN       NaN    global_mean  
3251    -1  0.276756  0.135855  0.228289  0.092435    global_mean  
4877    -1  0.283754  0.131378  0.168819  0.037441    global_mean  
6503    -1  0.247160  0.192044  0.133288 -0.058756    global_mean  
8129    -

  return datetime.utcnow().replace(tzinfo=utc)


In [64]:
  from src.metrics_attn import compute_attention_metrics
  from pathlib import Path

  runs = OUTPUT_ROOT / "runs"
  # rename to avoid accidental loss
  if (runs / "attention_metrics.csv").exists():
      (runs / "attention_metrics.csv").rename(runs / "attention_metrics.csv.bak")
  if (runs / "attention_metrics_checkpoint.jsonl").exists():
      (runs / "attention_metrics_checkpoint.jsonl").rename(runs / "attention_metrics_checkpoint.jsonl.bak")

  compute_attention_metrics(
      output_root=OUTPUT_ROOT,
      prompts_path=DATA_ROOT / "prompts.csv",
      log_every=LOG_EVERY,
  )


[2025-12-31 02:24:54,252] INFO - src.metrics_attn - Computing attention metrics for 5000 prompt/condition pairs
  return datetime.utcnow().replace(tzinfo=utc)
[2025-12-31 02:25:45,060] INFO - src.metrics_attn - Attention metrics progress: 50/5000 (1.0%) | 0.98 pairs/s | ETA 5029.9s
[2025-12-31 02:26:40,860] INFO - src.metrics_attn - Attention metrics progress: 100/5000 (2.0%) | 0.94 pairs/s | ETA 5223.7s
[2025-12-31 02:27:39,804] INFO - src.metrics_attn - Attention metrics progress: 150/5000 (3.0%) | 0.91 pairs/s | ETA 5352.8s
[2025-12-31 02:28:25,383] INFO - src.metrics_attn - Attention metrics progress: 200/5000 (4.0%) | 0.95 pairs/s | ETA 5067.1s
[2025-12-31 02:29:00,827] INFO - src.metrics_attn - Attention metrics progress: 250/5000 (5.0%) | 1.01 pairs/s | ETA 4684.9s
[2025-12-31 02:29:37,839] INFO - src.metrics_attn - Attention metrics progress: 300/5000 (6.0%) | 1.06 pairs/s | ETA 4442.8s
[2025-12-31 02:30:35,299] INFO - src.metrics_attn - Attention metrics progress: 350/5000 (7.

PosixPath('/content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159/runs/attention_metrics.csv')

In [68]:
from src.visualize import run_visualization_pipeline

print("Generating figures and tables...")
viz_paths = run_visualization_pipeline(
    output_root=OUTPUT_ROOT,
    prompts_path=DATA_ROOT / "prompts.csv",
    limit_examples=20
)

print("\nGenerated outputs:")
for key, path in viz_paths.items():
    print(f"  {key}: {path}")

[2025-12-31 04:43:31,362] INFO - src.visualize - Loading data...


Generating figures and tables...


  return datetime.utcnow().replace(tzinfo=utc)
[2025-12-31 04:43:38,330] INFO - src.visualize - Generating figures...
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
[2025-12-31 04:43:42,666] INFO - src.visualize - Generating tables.json...
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
[2025-12-31 04:43:44,458] INFO - src.visualize - Generating appendix examples...
  return datetime.utcnow().replace(tzinfo=utc)
[2025-12-31 04:43:45,825] INFO - src.visualize - Visualization pipeline complete.



Generated outputs:
  violation_rate_vs_p0: /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159/figures/violation_rate_vs_p0.png
  suppression_relative_vs_p0: /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159/figures/suppression_relative_vs_p0.png
  attention_metrics_vs_p0: /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159/figures/attention_metrics_vs_p0.png
  pi_vs_violation: /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159/figures/pi_vs_violation.png
  logit_lens_curves: /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159/figures/logit_lens_curves.png
  attn_ffn_contrib: /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_20251230_0159/figures/attn_ffn_contrib.png
  patching_effects: /content/drive/MyDrive/Projects/5_Semantic_Gravity_RP/outputs/experiment_run_2

## 11. Final Summary

In [69]:
print("="*60)
print("EXPERIMENT COMPLETE")
print("="*60)
print(f"\nRun ID: {RUN_ID}")
print(f"Output root: {OUTPUT_ROOT}")
print(f"\nTotal prompts processed: {total_selected}")

print("\nOutput files:")
for f in OUTPUT_ROOT.rglob('*'):
    if f.is_file():
        size_kb = f.stat().st_size / 1024
        print(f"  {f.relative_to(OUTPUT_ROOT)}: {size_kb:.1f} KB")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  runs/mechanistic_trace/ood_water_967502cd_negative.pt: 30108.4 KB
  runs/mechanistic_trace/ood_white_f2ac67f7_negative.pt: 29750.7 KB
  runs/mechanistic_trace/ood_near_b6b586b4_negative.pt: 29394.4 KB
  runs/mechanistic_trace/ood_understand_b719fb30_negative.pt: 30829.7 KB
  runs/mechanistic_trace/ood_city_24aad16c_baseline.pt: 29394.4 KB
  runs/mechanistic_trace/ood_won_3da98f7d_negative.pt: 30824.8 KB
  runs/mechanistic_trace/ood_started_38a037aa_negative.pt: 29395.1 KB
  runs/mechanistic_trace/ood_building_4578d833_negative.pt: 29395.4 KB
  runs/mechanistic_trace/ood_meeting_288977c9_negative.pt: 29395.1 KB
  runs/mechanistic_trace/ood_london_e1a9fa73_negative.pt: 30826.1 KB
  runs/mechanistic_trace/ood_report_420da125_negative.pt: 28688.0 KB
  runs/mechanistic_trace/ood_asked_0f9b1682_negative.pt: 29750.7 KB
  runs/mechanistic_trace/creative_call_edfa44c6_negative.pt: 17253.7 KB
  runs/mechanistic_trace/creative_sen

In [70]:
print("\nExperiment complete. All artifacts saved to Google Drive.")


Experiment complete. All artifacts saved to Google Drive.


## Post-hoc analyses (fits, suppression stats, failure taxonomy, stable suppression plot)


In [None]:
# Fitted curves: logistic + isotonic for violation vs P0 (with bootstrap CIs)
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

run_root = OUTPUT_ROOT
runs_dir = run_root / "runs"
figures_dir = run_root / "figures"
figures_dir.mkdir(parents=True, exist_ok=True)

prompts_df = pd.read_csv(DATA_ROOT / "prompts.csv")
p0_by_id = prompts_df.set_index("prompt_id")["p0"]

det_path = runs_dir / "detection_mapping.jsonl"
rows = []
with det_path.open("r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        rec = json.loads(line)
        if rec.get("condition") != "negative":
            continue
        if rec.get("sample_id", "") == "":
            continue
        pid = str(rec.get("prompt_id"))
        if pid not in p0_by_id:
            continue
        rows.append({
            "prompt_id": pid,
            "p0": float(p0_by_id[pid]),
            "violation": 1 if rec.get("word_present", False) else 0,
        })

samples_df = pd.DataFrame(rows)
print("sample rows:", len(samples_df))

try:
    from sklearn.linear_model import LogisticRegression
    from sklearn.isotonic import IsotonicRegression
except Exception as e:
    raise RuntimeError("scikit-learn is required for this cell.") from e

X = samples_df[["p0"]].values
y = samples_df["violation"].values

log_model = LogisticRegression(solver="lbfgs", max_iter=1000)
log_model.fit(X, y)
coef = float(log_model.coef_[0, 0])
intercept = float(log_model.intercept_[0])

grid = np.linspace(0.0, 1.0, 101)
logit_pred = log_model.predict_proba(grid.reshape(-1, 1))[:, 1]

BOOT_N = 200
rng = np.random.default_rng(42)
boot_preds = []
boot_coefs = []
for i in range(BOOT_N):
    boot = samples_df.sample(n=len(samples_df), replace=True, random_state=int(rng.integers(1e9)))
    yb = boot["violation"].values
    if len(np.unique(yb)) < 2:
        continue
    model_b = LogisticRegression(solver="lbfgs", max_iter=1000)
    model_b.fit(boot[["p0"]].values, yb)
    boot_preds.append(model_b.predict_proba(grid.reshape(-1, 1))[:, 1])
    boot_coefs.append((float(model_b.intercept_[0]), float(model_b.coef_[0, 0])))

boot_preds = np.array(boot_preds)
ci_low, ci_high = np.percentile(boot_preds, [2.5, 97.5], axis=0)

iso = IsotonicRegression(out_of_bounds="clip")
iso.fit(samples_df["p0"].values, samples_df["violation"].values)
iso_pred = iso.predict(grid)

plt.figure(figsize=(7, 5))
plt.scatter(samples_df["p0"], samples_df["violation"], s=6, alpha=0.05, label="samples")
plt.plot(grid, logit_pred, color="red", label="logistic fit")
plt.fill_between(grid, ci_low, ci_high, color="red", alpha=0.2, label="logistic 95% CI")
plt.plot(grid, iso_pred, color="black", linestyle="--", label="isotonic fit")
plt.xlabel("P0 (pressure)")
plt.ylabel("Violation probability")
plt.title("Violation vs P0: logistic + isotonic fits")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(figures_dir / "violation_rate_fit.png", dpi=200)
plt.close()

coef_ci = None
if boot_coefs:
    vals = np.array(boot_coefs)
    intercept_ci = np.percentile(vals[:, 0], [2.5, 97.5])
    coef_ci = np.percentile(vals[:, 1], [2.5, 97.5])
    print(f"logistic intercept={intercept:.4f} (95% CI {intercept_ci[0]:.4f}, {intercept_ci[1]:.4f})")
    print(f"logistic coef={coef:.4f} (95% CI {coef_ci[0]:.4f}, {coef_ci[1]:.4f})")
else:
    print(f"logistic intercept={intercept:.4f}")
    print(f"logistic coef={coef:.4f}")


In [None]:
# Decision-step suppression: delta P_sem (baseline - negative) by outcome
import numpy as np
import pandas as pd
import json

logit_df = pd.read_csv(runs_dir / "logit_lens.csv")
final_layer = logit_df["layer"].max()
final_df = logit_df[logit_df["layer"] == final_layer]

pivot = final_df.pivot_table(
    index="prompt_id",
    columns="condition",
    values="p_sem_first_token",
    aggfunc="mean",
)
pivot = pivot.dropna()
pivot["delta_p_sem"] = pivot["baseline"] - pivot["negative"]

# Outcomes from greedy negative
def load_greedy_outcomes(runs_dir):
    path = runs_dir / "detection_mapping_greedy.jsonl"
    if not path.exists():
        path = runs_dir / "detection_mapping.jsonl"
    outcomes = {}
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            rec = json.loads(line)
            if rec.get("condition") != "negative":
                continue
            if rec.get("sample_id", "") != "":
                continue
            pid = str(rec.get("prompt_id"))
            word_present = rec.get("word_present", False)
            outcomes[pid] = "failure" if word_present else "success"
    return outcomes

outcomes = load_greedy_outcomes(runs_dir)
pivot["outcome"] = pivot.index.map(outcomes.get)
pivot = pivot[pivot["outcome"].isin(["success", "failure"])].copy()

def bootstrap_mean(values, n_boot=1000, seed=42):
    rng = np.random.default_rng(seed)
    values = np.asarray(values)
    if len(values) == 0:
        return (np.nan, np.nan)
    means = []
    for _ in range(n_boot):
        sample = rng.choice(values, size=len(values), replace=True)
        means.append(sample.mean())
    return tuple(np.percentile(means, [2.5, 97.5]))

summary = []
for outcome in ["success", "failure"]:
    vals = pivot.loc[pivot["outcome"] == outcome, "delta_p_sem"].values
    ci = bootstrap_mean(vals)
    summary.append({
        "outcome": outcome,
        "mean": float(np.mean(vals)) if len(vals) else np.nan,
        "ci_low": ci[0],
        "ci_high": ci[1],
        "n": int(len(vals)),
    })

summary_df = pd.DataFrame(summary)
print(summary_df)

# Plot
plt.figure(figsize=(5, 4))
x = np.arange(len(summary_df))
means = summary_df["mean"].values
yerr = [means - summary_df["ci_low"].values, summary_df["ci_high"].values - means]
plt.bar(x, means, yerr=yerr, capsize=4, color=["green", "red"])
plt.xticks(x, summary_df["outcome"].values)
plt.ylabel("Delta P_sem (baseline - negative)")
plt.title("Decision-step suppression by outcome")
plt.tight_layout()
plt.savefig(figures_dir / "suppression_delta_by_outcome.png", dpi=200)
plt.close()


In [None]:
# Failure-mode taxonomy (priming vs override) with labeled examples
import numpy as np
import pandas as pd
import json

attn_df = pd.read_csv(runs_dir / "attention_metrics.csv")
attn = attn_df.copy()
if "aggregate_flag" in attn.columns:
    attn = attn[attn["aggregate_flag"] == "global_mean"]
if "condition" in attn.columns:
    attn = attn[attn["condition"] == "negative"]
attn = attn.set_index("prompt_id")[["iar", "nf", "tmf", "pi"]]

supp_df = pivot.copy()
supp_df = supp_df.join(attn, how="left")
supp_df["tmf_minus_nf"] = supp_df["tmf"] - supp_df["nf"]

fail_df = supp_df[supp_df["outcome"] == "failure"].copy()

# Priming: negative instruction increases P_sem or attention to target mention outweighs negation
priming_mask = (fail_df["delta_p_sem"] < 0) | (fail_df["tmf_minus_nf"] > 0.02)
fail_df["failure_mode"] = np.where(priming_mask, "priming", "override")

print(fail_df["failure_mode"].value_counts())

# Load greedy completions for examples
def load_greedy_completions(runs_dir):
    path = runs_dir / "detection_mapping_greedy.jsonl"
    if not path.exists():
        path = runs_dir / "detection_mapping.jsonl"
    comps = {}
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            rec = json.loads(line)
            if rec.get("condition") != "negative":
                continue
            if rec.get("sample_id", "") != "":
                continue
            pid = str(rec.get("prompt_id"))
            comps[pid] = rec.get("completion_text", "")
    return comps

comps = load_greedy_completions(runs_dir)
prompt_info = prompts_df.set_index("prompt_id")[["category", "question_text", "target_word"]]

examples = []
for mode in ["priming", "override"]:
    sub = fail_df[fail_df["failure_mode"] == mode].copy()
    if mode == "priming":
        sub = sub.sort_values("tmf_minus_nf", ascending=False)
    else:
        sub = sub.sort_values("delta_p_sem", ascending=False)
    for _, row in sub.head(10).iterrows():
        pid = row.name
        info = prompt_info.loc[pid] if pid in prompt_info.index else {}
        examples.append({
            "prompt_id": pid,
            "category": str(info.get("category", "")),
            "question_text": str(info.get("question_text", "")),
            "target_word": str(info.get("target_word", "")),
            "completion_text": comps.get(pid, ""),
            "failure_mode": mode,
            "delta_p_sem": float(row["delta_p_sem"]),
            "iar": float(row["iar"]) if pd.notna(row["iar"]) else None,
            "nf": float(row["nf"]) if pd.notna(row["nf"]) else None,
            "tmf": float(row["tmf"]) if pd.notna(row["tmf"]) else None,
            "pi": float(row["pi"]) if pd.notna(row["pi"]) else None,
            "tmf_minus_nf": float(row["tmf_minus_nf"]) if pd.notna(row["tmf_minus_nf"]) else None,
        })

examples_path = run_root / "appendix_examples" / "failure_modes.jsonl"
examples_path.parent.mkdir(parents=True, exist_ok=True)
with examples_path.open("w", encoding="utf-8") as f:
    for ex in examples:
        f.write(json.dumps(ex, ensure_ascii=True) + "\n")

print(f"Wrote {len(examples)} labeled examples to {examples_path}")


In [None]:
# Stable suppression plot (log suppression vs P0 bin)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

psem_df = pd.read_csv(runs_dir / "psem.csv")
if "p0_bin" not in psem_df.columns:
    psem_df = psem_df.merge(prompts_df[["prompt_id", "p0_bin"]], on="prompt_id", how="inner")

metric_col = "log" if "log" in psem_df.columns else "log_suppression"

def parse_bin(bin_str):
    parts = bin_str.split("-")
    return float(parts[0]) if parts else 0.0

bins = sorted(psem_df["p0_bin"].unique(), key=parse_bin)
centers = []
means = []
ci_low = []
ci_high = []

rng = np.random.default_rng(42)
for b in bins:
    vals = psem_df[psem_df["p0_bin"] == b][metric_col].dropna().values
    if len(vals) == 0:
        continue
    centers.append(sum(map(float, b.split("-"))) / 2.0)
    means.append(vals.mean())
    boot = []
    for _ in range(500):
        sample = rng.choice(vals, size=len(vals), replace=True)
        boot.append(sample.mean())
    ci = np.percentile(boot, [2.5, 97.5])
    ci_low.append(ci[0])
    ci_high.append(ci[1])

plt.figure(figsize=(6, 4))
yerr = [np.array(means) - np.array(ci_low), np.array(ci_high) - np.array(means)]
plt.errorbar(centers, means, yerr=yerr, fmt="s-", capsize=4, color="orange")
plt.xlabel("P0 Bin Center")
plt.ylabel("Log suppression")
plt.title("Semantic suppression vs pressure (log scale)")
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(figures_dir / "suppression_log_vs_p0.png", dpi=200)
plt.close()
