# Semantic Gravity Experiment - Full Pipeline

This notebook runs the complete Stage 2 GPU pipeline for reproducing the paper results.
Colab-first defaults assume Google Drive, but all paths can be overridden via env vars.

Pipeline stages (numbered to match the section headings below):
0. Environment setup and validation
1. Detector self-tests
2. Load model
3. Finalize dataset with P_sem
4. Mechanistic runs (greedy + hidden states)
5. Behavioral runs (sampling)
6. Detection/mapping
7. Metrics (attention, logit lens, decomposition)
8. Activation patching
9. Behavioral metrics + P_sem bins
10. Bootstrap CIs
11. Figures and tables
12. Final summary

Appendix: post-hoc analyses (optional)


## 0. Environment Setup and Validation

Colab checklist:
- Put this repo in Google Drive (default: /content/drive/MyDrive/Semantic-Gravity-RP)
- Run the dependency install cell before importing NumPy (or anything that imports it); otherwise the runtime must be restarted after the install
- Place the model at /content/drive/MyDrive/models/Qwen2.5-7B-Instruct or set SEMANTIC_GRAVITY_MODEL_PATH
- Ensure data/validated/*.jsonl exists under the repo
- Optional: set SG_* env vars for batch sizes and logging


In [None]:
import sys

# Colab is detected by checking whether its package is already loaded in this kernel.
IN_COLAB = "google.colab" in sys.modules

if not IN_COLAB:
    print("Not in Colab")
else:
    # Imported only on this branch, i.e. when the google.colab package is loaded.
    from google.colab import drive

    drive.mount("/content/drive", force_remount=False)
    print("Drive mounted")


In [None]:
import os
import sys
from datetime import datetime
from pathlib import Path


def find_repo_root(start: Path) -> Path:
    """Return the closest directory at or above *start* that looks like the repo.

    A directory qualifies when it contains both a ``src`` and a ``notebooks``
    entry. When no candidate qualifies, *start* is returned unchanged.
    """
    candidates = (start, *start.parents)
    return next(
        (
            candidate
            for candidate in candidates
            if (candidate / "src").exists() and (candidate / "notebooks").exists()
        ),
        start,
    )


IN_COLAB = "google.colab" in sys.modules

# Colab defaults live on Google Drive; elsewhere the repo root is discovered
# from the working directory and the model path falls back to "Qwen/Qwen2.5-7B-Instruct".
DEFAULT_PROJECT_ROOT = "/content/drive/MyDrive/Semantic-Gravity-RP" if IN_COLAB else str(find_repo_root(Path.cwd()))
DEFAULT_MODEL_PATH = (
    "/content/drive/MyDrive/models/Qwen2.5-7B-Instruct"
    if IN_COLAB
    else "Qwen/Qwen2.5-7B-Instruct"
)

# Every location can be overridden via env vars; "~" is expanded for all of them.
PROJECT_ROOT = Path(os.environ.get("SEMANTIC_GRAVITY_ROOT", DEFAULT_PROJECT_ROOT)).expanduser()

# Validate the project root before anything is written to disk, so a wrong
# root does not leave stray data/output directories behind.
if not (PROJECT_ROOT / "src").exists():
    raise FileNotFoundError(f"src/ not found under PROJECT_ROOT: {PROJECT_ROOT}")

DATA_ROOT = Path(os.environ.get("SEMANTIC_GRAVITY_DATA_ROOT", str(PROJECT_ROOT / "data"))).expanduser()
OUTPUT_ROOT_BASE = Path(
    os.environ.get("SEMANTIC_GRAVITY_OUTPUT_ROOT", str(PROJECT_ROOT / "outputs"))
).expanduser()
MODEL_PATH = os.environ.get("SEMANTIC_GRAVITY_MODEL_PATH", DEFAULT_MODEL_PATH)

# Optional HF cache location
HF_HOME = os.environ.get("SEMANTIC_GRAVITY_HF_HOME") or os.environ.get("HF_HOME")
if HF_HOME:
    os.environ["HF_HOME"] = HF_HOME

# Run ID (resume-safe)
RUN_ID_FILE = Path(
    os.environ.get("SEMANTIC_GRAVITY_RUN_ID_FILE", str(DATA_ROOT / "latest_run_id.txt"))
).expanduser()
RESUME_FROM_LAST_RUN = os.environ.get("SG_RESUME_LAST_RUN", "1") == "1"
EXPLICIT_RUN_ID = os.environ.get("SEMANTIC_GRAVITY_RUN_ID")

if EXPLICIT_RUN_ID:
    # An explicit ID wins and is not persisted to the run-ID file.
    RUN_ID = EXPLICIT_RUN_ID
else:
    stored_run_id = ""
    if RESUME_FROM_LAST_RUN and RUN_ID_FILE.exists():
        stored_run_id = RUN_ID_FILE.read_text().strip()

    if stored_run_id:
        RUN_ID = stored_run_id
    else:
        # No usable stored ID (file missing, blank, or resume disabled): start a
        # fresh run. A blank ID would otherwise make OUTPUT_ROOT equal the base dir.
        RUN_ID = f"experiment_run_{datetime.now().strftime('%Y%m%d_%H%M')}"
        RUN_ID_FILE.parent.mkdir(parents=True, exist_ok=True)
        RUN_ID_FILE.write_text(RUN_ID)

OUTPUT_ROOT = OUTPUT_ROOT_BASE / RUN_ID

# Ensure directories exist
DATA_ROOT.mkdir(parents=True, exist_ok=True)
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

# Export env for downstream modules (the output root exported is the base, not the per-run dir)
os.environ["SEMANTIC_GRAVITY_ROOT"] = str(PROJECT_ROOT)
os.environ["SEMANTIC_GRAVITY_DATA_ROOT"] = str(DATA_ROOT)
os.environ["SEMANTIC_GRAVITY_OUTPUT_ROOT"] = str(OUTPUT_ROOT_BASE)
os.environ["SEMANTIC_GRAVITY_MODEL_PATH"] = str(MODEL_PATH)

# Add repo root to sys.path
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

print(f"Project root: {PROJECT_ROOT}")
print(f"Data root: {DATA_ROOT}")
print(f"Output root: {OUTPUT_ROOT}")
print(f"Model path: {MODEL_PATH}")


In [None]:
import importlib.util
import subprocess
import sys

if not IN_COLAB:
    print("Not in Colab; use `pip install -r requirements.txt` in your environment.")
else:
    # Requirement specifiers handed to pip, in install order.
    packages = [
        "numpy<2",
        "pandas>=2.2",
        "scipy>=1.12",
        "scikit-learn>=1.4",
        "transformers==4.51.3",
        "accelerate",
        "tokenizers",
        "matplotlib",
        "seaborn",
        "tqdm",
        "requests",
        "wordfreq",
        "SPARQLWrapper",
        "openai",
    ]

    # torch is only requested when no installed copy can be found.
    torch_missing = importlib.util.find_spec("torch") is None
    if torch_missing:
        packages = ["torch", *packages]

    print("Installing dependencies...")
    pip_command = [sys.executable, "-m", "pip", "install", "-q", "--upgrade", *packages]
    subprocess.check_call(pip_command)

    # Checked after the install: if numpy is already loaded in this kernel,
    # the cell stops here and asks for a runtime restart.
    numpy_already_loaded = "numpy" in sys.modules
    if numpy_already_loaded:
        raise RuntimeError("numpy was imported before install; restart runtime and run this cell first.")

    print("Dependencies installed")


In [None]:
# Validate GPU (hard halt if not present)
import subprocess

# `nvidia-smi -L` lists attached GPUs. A missing binary raises FileNotFoundError
# from subprocess, so convert that into the same hard-halt RuntimeError the
# other failure paths use.
try:
    result = subprocess.run(["nvidia-smi", "-L"], capture_output=True, text=True)
except FileNotFoundError as exc:
    raise RuntimeError(
        "nvidia-smi not found. Ensure a GPU instance with NVIDIA drivers is attached."
    ) from exc

if result.returncode != 0:
    raise RuntimeError(f"nvidia-smi failed: {result.stderr}")

gpu_info = result.stdout.strip()
print(gpu_info if gpu_info else "No GPU detected by nvidia-smi")
if not gpu_info:
    raise RuntimeError("No NVIDIA GPU detected. Ensure a GPU instance is attached.")

# Informational only: print GPU name and total memory; the return code is not checked.
result_mem = subprocess.run(
    ["nvidia-smi", "--query-gpu=name,memory.total", "--format=csv,noheader"],
    capture_output=True,
    text=True,
)
print(result_mem.stdout.strip())


In [None]:
# Environment flags for efficiency
import os
from pathlib import Path

# Set before transformers is imported below.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch

# Allow TF32 for CUDA matmuls and cuDNN ops.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Version check
import re

import transformers

print(f"Transformers version: {transformers.__version__}")
# Only the leading "<major>.<minor>" digits are parsed, so suffixed versions
# such as "5.0rc1" or "4.52.dev0" do not crash the int() conversion.
version_match = re.match(r"(\d+)\.(\d+)", transformers.__version__)
if version_match is None:
    raise RuntimeError(f"Could not parse transformers version: {transformers.__version__}")
major, minor = map(int, version_match.groups())
if (major, minor) < (4, 37):
    raise RuntimeError(f"Transformers >= 4.37 required, got {transformers.__version__}")
print("Transformers version OK")

# Logging + batching config (A100 defaults; override via env vars)
# Every value below is read from an SG_* env var when set, otherwise derived
# from the detected GPU memory.
LOG_EVERY = int(os.environ.get("SG_LOG_EVERY", 50))
PATCHING_LOG_EVERY = int(os.environ.get("SG_PATCH_LOG_EVERY", 10))

# Total memory of CUDA device 0 in whole GiB (0 when CUDA is unavailable).
GPU_MEM_GB = 0
if torch.cuda.is_available():
    GPU_MEM_GB = int(torch.cuda.get_device_properties(0).total_memory / (1024 ** 3))

# Use your observed per-prompt VRAM footprint to pick safe batch sizes
VRAM_PER_PROMPT_GB = float(os.environ.get("SG_PROMPT_VRAM_GB", 5.9))
# Reserved headroom: at least 6 GB, or a quarter of the card if that is larger.
DEFAULT_BUFFER_GB = max(6.0, GPU_MEM_GB * 0.25)
VRAM_BUFFER_GB = float(os.environ.get("SG_VRAM_BUFFER_GB", DEFAULT_BUFFER_GB))

# Number of prompts that fit in the memory left after the buffer; never below 1.
available_gb = max(0.0, GPU_MEM_GB - VRAM_BUFFER_GB)
safe_prompt_batch = int(available_gb // VRAM_PER_PROMPT_GB) if VRAM_PER_PROMPT_GB > 0 else 1
SAFE_PROMPT_BATCH = max(1, safe_prompt_batch)

# Derived defaults (override via env vars if needed)
default_psem_prompt = SAFE_PROMPT_BATCH
# task_batch_size = number of (context, sequence) tasks per batch
# Keep within a safe band to avoid OOM on long prompts

default_psem_task = max(64, min(256, SAFE_PROMPT_BATCH * 32))

default_psem_max_tokens = 8192

default_mech_batch = SAFE_PROMPT_BATCH
# Behavioral sampling is heavier (num_return_sequences > 1), keep smaller

default_behav_batch = max(1, SAFE_PROMPT_BATCH // 2)

default_logit_lens_batch = max(1, SAFE_PROMPT_BATCH // 2)

default_patch_p_rest_batch = max(32, min(128, SAFE_PROMPT_BATCH * 32))

default_patch_p_rest_max_tokens = 8192

# Passed as `batch_config` to finalize_dataset_with_psem. The "p1" entry shares
# the P_sem defaults but has its own SG_P1_* overrides.
BATCH_CONFIG = {
    "psem": {
        "prompt_batch_size": int(os.environ.get("SG_PSEM_PROMPT_BATCH", default_psem_prompt)),
        "task_batch_size": int(os.environ.get("SG_PSEM_TASK_BATCH", default_psem_task)),
        "max_batch_tokens": int(os.environ.get("SG_PSEM_MAX_BATCH_TOKENS", default_psem_max_tokens)),
        "log_every": LOG_EVERY,
    },
    "p1": {
        "prompt_batch_size": int(os.environ.get("SG_P1_PROMPT_BATCH", default_psem_prompt)),
        "task_batch_size": int(os.environ.get("SG_P1_TASK_BATCH", default_psem_task)),
        "max_batch_tokens": int(os.environ.get("SG_P1_MAX_BATCH_TOKENS", default_psem_max_tokens)),
        "log_every": LOG_EVERY,
    },
}

# Batch sizes consumed by the runner, logit-lens and patching cells further down.
MECH_BATCH_SIZE = int(os.environ.get("SG_MECH_BATCH", default_mech_batch))
BEHAV_BATCH_SIZE = int(os.environ.get("SG_BEHAV_BATCH", default_behav_batch))
LOGIT_LENS_BATCH_SIZE = int(os.environ.get("SG_LOGIT_LENS_BATCH", default_logit_lens_batch))
PATCHING_P_REST_BATCH = int(os.environ.get("SG_PATCH_P_REST_BATCH", default_patch_p_rest_batch))
PATCHING_P_REST_MAX_TOKENS = int(os.environ.get("SG_PATCH_P_REST_MAX_TOKENS", default_patch_p_rest_max_tokens))

# Checkpoints default to the data root, not the per-run output directory.
CHECKPOINT_DIR = Path(os.environ.get("SG_CHECKPOINT_DIR", str(DATA_ROOT / "checkpoints")))
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the effective configuration.
print(f"GPU mem (GB): {GPU_MEM_GB}")
print(f"VRAM per prompt (GB): {VRAM_PER_PROMPT_GB:.2f} | Buffer (GB): {VRAM_BUFFER_GB:.1f} | Safe prompt batch: {SAFE_PROMPT_BATCH}")
print(
    "Batch config: "
    f"P_sem prompt={BATCH_CONFIG['psem']['prompt_batch_size']} "
    f"task={BATCH_CONFIG['psem']['task_batch_size']} "
    f"max_tokens={BATCH_CONFIG['psem']['max_batch_tokens']}"
)
print(
    "Runner batches: "
    f"mechanistic={MECH_BATCH_SIZE} "
    f"behavioral={BEHAV_BATCH_SIZE} "
    f"logit_lens={LOGIT_LENS_BATCH_SIZE}"
)
print(
    "Patching: "
    f"p_rest_batch={PATCHING_P_REST_BATCH} "
    f"max_tokens={PATCHING_P_REST_MAX_TOKENS} "
    f"log_every={PATCHING_LOG_EVERY}"
)


In [None]:
# Import experiment modules
from src.config import CONFIG, validate_environment
from src.utils import set_seed, ModelWrapper, setup_logging
from src.prompt_builder import build_prompt

# Validate environment
metadata = validate_environment()
print(f"GPU: {metadata.get('gpu_name', 'N/A')}")
print(f"CUDA version: {metadata.get('cuda_version', 'N/A')}")


In [None]:
# Fix all seeds up front so reruns are reproducible.
set_seed(42)

# Persist run metadata next to the run outputs.
import json

metadata["run_id"] = RUN_ID
metadata["output_root"] = str(OUTPUT_ROOT)
metadata["model_path"] = MODEL_PATH

metadata_path = OUTPUT_ROOT / "run_metadata.json"
with open(metadata_path, "w") as metadata_file:
    # default=str stringifies any value json cannot serialize natively.
    json.dump(metadata, metadata_file, indent=2, default=str)
print(f"Run metadata saved to {metadata_path}")

# File logging, one log per run ID.
LOG_DIR = OUTPUT_ROOT / "logs"
LOG_DIR.mkdir(parents=True, exist_ok=True)
run_log_file = LOG_DIR / f"run_{RUN_ID}.log"
setup_logging(log_file=run_log_file)
print(f"Logging to {LOG_DIR}")


## 1. Detector Self-Tests (Hard Halt on Failure)


In [None]:
# Detector self-tests: any failed expectation raises AssertionError and halts the notebook.
print("Running detector self-tests...")

from src.detector import word_present

# Each case: (target word, text, expected match, failure message, success message)
_DETECTOR_CASES = (
    ("space", "spacetime", False, "'space' should not match 'spacetime'", "OK: 'space' not in 'spacetime'"),
    ("space", "The answer is space.", True, "'space' should match 'space.'", "OK: 'space.' detection"),
    ("space", "space2", False, "'space' should not match 'space2'", "OK: 'space' not in 'space2'"),
    ("space", "space-time", True, "'space' should match 'space-time'", "OK: 'space' in 'space-time'"),
    ("apple", "I can't say 'apple' so...", True, "quoted 'apple' should match", "OK: 'apple' in quoted phrase"),
)

for _case_word, _case_text, _expected, _failure_msg, _ok_msg in _DETECTOR_CASES:
    assert bool(word_present(_case_word, _case_text)) == _expected, _failure_msg
    print(_ok_msg)

_banner = "=" * 60
print("\n" + _banner)
print("All detector self-tests passed")
print(_banner)


## 2. Load Model


In [None]:
# Load the model through the ModelWrapper instance returned by get_instance().
print("Loading Qwen model...")
wrapper = ModelWrapper.get_instance()
wrapper.load(model_path=MODEL_PATH)

# Report a few basic facts about what was loaded.
for _label, _value in (
    ("Model loaded", wrapper.is_loaded),
    ("Vocab size", len(wrapper.tokenizer)),
    ("Model dtype", wrapper.model.dtype),
):
    print(f"{_label}: {_value}")


In [None]:
# Smoke test: one short greedy generation.
print("Running quick generation test...")

test_prompt = build_prompt("The capital of France is ____.", "Paris", "baseline")
generation_kwargs = {"prompt": test_prompt, "max_new_tokens": 8, "do_sample": False}
result = wrapper.generate(**generation_kwargs)

print(f"Test prompt: {test_prompt}")
print(f"Generated: {result['generated_text']}")
print("Model generation OK")


## 3. Finalize Dataset with P_sem


In [None]:
# Filter validated prompts to targets with at least one single-token variant
from src.dataset_pipeline import filter_validated_single_token_targets

# Input and output locations for the single-token filter.
validated_dir = DATA_ROOT / "validated"
filtered_dir = DATA_ROOT / "validated_single_token"
checkpoint_path = filtered_dir / "single_token_filter_checkpoint.json"

filter_kwargs = {
    "validated_dir": validated_dir,
    "output_dir": filtered_dir,
    "tokenizer": wrapper.tokenizer,
    "categories": CONFIG["dataset"]["categories"],
    "checkpoint_path": checkpoint_path,
    "log_every": 1000,
}
stats = filter_validated_single_token_targets(**filter_kwargs)

print("Filtered validated prompts to single-token targets:")
for cat, info in stats.items():
    kept_count = info.get("kept", 0)
    removed_count = info.get("removed", 0)
    print(f"  {cat}: kept={kept_count} removed={removed_count}")


In [None]:
from src.dataset_pipeline import finalize_dataset_with_psem

print("Finalizing dataset with P_sem computation...")
print("This computes P0/P1, applies gating, bin balancing, and writes prompts.csv")

# Finalization reads the filtered (single-token) prompts and writes under DATA_ROOT.
validated_dir = DATA_ROOT / "validated_single_token"

finalize_kwargs = {
    "validated_dir": validated_dir,
    "output_root": DATA_ROOT,
    "model_wrapper": wrapper,
    "prompts_per_category": 500,
    "batch_config": BATCH_CONFIG,
    "checkpoint_dir": CHECKPOINT_DIR,
}
final_by_category = finalize_dataset_with_psem(**finalize_kwargs)

# Per-category and overall selection counts.
total_selected = sum(map(len, final_by_category.values()))
print("")
print(f"Total selected: {total_selected}")
for cat, prompts in final_by_category.items():
    print(f"  {cat}: {len(prompts)}")

print("")
print(f"Prompts saved to {DATA_ROOT / 'prompts.csv'}")


## 4. Mechanistic Runs (Greedy + Hidden States)


In [None]:
from src.runner import run_experiment

print("Running mechanistic passes...")

# Mechanistic pass only: the behavioral pass is skipped here.
mechanistic_kwargs = {
    "prompts_csv": str(DATA_ROOT / "prompts.csv"),
    "output_root": str(OUTPUT_ROOT),
    "skip_mechanistic": False,
    "skip_behavioral": True,
    "limit": None,
    "mechanistic_batch_size": MECH_BATCH_SIZE,
    "behavioral_batch_size": BEHAV_BATCH_SIZE,
    "log_every": LOG_EVERY,
}
mechanistic_results = run_experiment(**mechanistic_kwargs)

print("")
print("Mechanistic completed:", mechanistic_results.get("mechanistic_completed"))


## 5. Behavioral Runs (16 Samples)


In [None]:
print("Running behavioral passes...")

# Behavioral pass only: the mechanistic pass is skipped here.
behavioral_kwargs = {
    "prompts_csv": str(DATA_ROOT / "prompts.csv"),
    "output_root": str(OUTPUT_ROOT),
    "skip_mechanistic": True,
    "skip_behavioral": False,
    "limit": None,
    "mechanistic_batch_size": MECH_BATCH_SIZE,
    "behavioral_batch_size": BEHAV_BATCH_SIZE,
    "log_every": LOG_EVERY,
}
behavioral_results = run_experiment(**behavioral_kwargs)

print("")
print("Behavioral completed:", behavioral_results.get("behavioral_completed"))


## 6. Detection/Mapping

Two detection passes:
1. Greedy-only -> detection_mapping_greedy.jsonl (for mechanistic metrics)
2. Samples -> detection_mapping.jsonl (for behavioral metrics/plotting)


In [None]:
from src.detector import detect_and_map
import json
from tqdm import tqdm

# Directory that holds the completion and detection JSONL files for this run.
runs_dir = OUTPUT_ROOT / "runs"
runs_dir.mkdir(parents=True, exist_ok=True)

import pandas as pd

# Lookup of target word by prompt ID; IDs are stringified to match the JSONL rows.
prompts_df = pd.read_csv(DATA_ROOT / "prompts.csv")
target_by_id = {
    str(prompt_id): target_word
    for prompt_id, target_word in zip(prompts_df["prompt_id"], prompts_df["target_word"])
}


def _load_seen_keys(path):
    seen = set()
    if not path.exists():
        return seen
    with open(path, "r") as f:
        for line in f:
            if not line.strip():
                continue
            row = json.loads(line)
            sample_id = row.get("sample_id", "")
            prompt_id = row.get("prompt_id")
            condition = row.get("condition")
            key = f"{prompt_id}|{condition}|{sample_id}"
            seen.add(key)
    return seen


def run_detection_streaming(input_path, output_path, desc):
    if not input_path.exists():
        print(f"WARNING: {input_path} not found")
        return 0, 0

    seen = _load_seen_keys(output_path)
    mapping_errors = 0
    processed = 0

    with open(input_path, "r") as fin, open(output_path, "a") as fout:
        for line in tqdm(fin, desc=desc):
            if not line.strip():
                continue
            row = json.loads(line)
            sample_id = row.get("sample_id", "")
            prompt_id = row.get("prompt_id")
            condition = row.get("condition")
            key = f"{prompt_id}|{condition}|{sample_id}"
            if key in seen:
                continue

            prompt_id = str(row["prompt_id"])
            target = target_by_id.get(prompt_id, row.get("target_word", ""))
            result = detect_and_map(
                target=target,
                completion_text=row["generated_text"],
                token_ids=row.get("generated_token_ids", []),
                tokenizer=wrapper.tokenizer,
                prompt_id=prompt_id,
                condition=row["condition"],
            )

            out_row = {
                "prompt_id": prompt_id,
                "condition": row["condition"],
                "sample_id": sample_id,
                "completion_text": row["generated_text"],
                "target_word": target,
                **result,
            }
            fout.write(json.dumps(out_row, ensure_ascii=True, default=str) + "\n")
            seen.add(key)
            processed += 1
            if result.get("mapping_error"):
                mapping_errors += 1

            if LOG_EVERY and processed % LOG_EVERY == 0:
                print(f"{desc}: processed={processed}, mapping_errors={mapping_errors}")

    return processed, mapping_errors


print("\n=== Detection Pass 1: Greedy completions ===")
greedy_input = runs_dir / "completions_greedy.jsonl"
greedy_output = runs_dir / "detection_mapping_greedy.jsonl"
greedy_count, greedy_errors = run_detection_streaming(greedy_input, greedy_output, "Greedy detection")
print(f"Greedy: {greedy_count} entries, {greedy_errors} mapping errors")

# Hard halt when more than 0.1% of the greedy rows handled in this pass failed mapping.
if greedy_count and greedy_errors > 0:
    error_rate = greedy_errors / greedy_count
    if error_rate > 0.001:
        raise RuntimeError(f"HARD HALT: Greedy mapping error rate {error_rate:.4%} exceeds 0.1%")

print("\n=== Detection Pass 2: Sample completions ===")
samples_path = runs_dir / "completions_samples.jsonl"
if not samples_path.exists():
    print("No sample completions found - skipping")
else:
    sample_count, sample_errors = run_detection_streaming(
        samples_path,
        runs_dir / "detection_mapping.jsonl",
        "Sample detection",
    )
    print(f"Samples: {sample_count} entries, {sample_errors} mapping errors")

print("\nDetection complete")


## 7. Compute Metrics at Target Decision Step


In [None]:
from src.metrics_attn import compute_attention_metrics, compute_logit_lens_and_decomp

# Both metric passes read the same prompts file and keep checkpoints under runs/.
metrics_prompts_path = DATA_ROOT / "prompts.csv"
metrics_runs_dir = OUTPUT_ROOT / "runs"

print("Computing attention metrics at target decision step...")
attn_path = compute_attention_metrics(
    output_root=OUTPUT_ROOT,
    prompts_path=metrics_prompts_path,
    log_every=LOG_EVERY,
    checkpoint_path=metrics_runs_dir / "attention_metrics_checkpoint.jsonl",
)
print(f"Attention metrics saved to {attn_path}")

print("")
print("Computing logit lens and decomposition...")
decomp_paths = compute_logit_lens_and_decomp(
    output_root=OUTPUT_ROOT,
    prompts_path=metrics_prompts_path,
    batch_size=LOGIT_LENS_BATCH_SIZE,
    log_every=LOGIT_LENS_LOG_EVERY if False else LOG_EVERY,
    checkpoint_path=metrics_runs_dir / "logit_lens_checkpoint.jsonl",
)
logit_lens_path = decomp_paths.get("logit_lens_path")
ffn_attn_decomp_path = decomp_paths.get("ffn_attn_decomp_path")
print(f"Logit lens: {logit_lens_path}")
print(f"Decomposition: {ffn_attn_decomp_path}")


## 8. Activation Patching


In [None]:
from src.patching import select_patching_subset, run_activation_patching
import torch

print("Selecting patching subset...")
patching_prompts_path = DATA_ROOT / "prompts.csv"
subset = select_patching_subset(output_root=OUTPUT_ROOT, prompts_path=patching_prompts_path)
print(f"Selected {len(subset)} prompts for patching")

print("")
print("Running activation patching...")
wrapper = ModelWrapper.get_instance()
# Autocast in the model's own dtype, falling back to fp16 if it exposes none.
amp_dtype = getattr(wrapper.model, "dtype", torch.float16)

patching_kwargs = {
    "output_root": OUTPUT_ROOT,
    "prompts_path": patching_prompts_path,
    "log_every": PATCHING_LOG_EVERY,
    "checkpoint_path": OUTPUT_ROOT / "runs" / "patching_checkpoint.jsonl",
    "p_rest_batch_size": PATCHING_P_REST_BATCH,
    "p_rest_max_batch_tokens": PATCHING_P_REST_MAX_TOKENS,
}
with torch.autocast(device_type="cuda", dtype=amp_dtype):
    patching_path = run_activation_patching(**patching_kwargs)
print(f"Patching results saved to {patching_path}")


## 9. Behavioral Metrics + P_sem/Bins


In [None]:
from src.behavior_analysis import write_psem_and_bins, compute_behavioral_metrics

# Both steps take the same run output root and prompts file.
behavior_kwargs = {"output_root": OUTPUT_ROOT, "prompts_path": DATA_ROOT / "prompts.csv"}

print("Writing psem.csv and pressure bins...")
write_psem_and_bins(**behavior_kwargs)

print("Computing behavioral metrics...")
compute_behavioral_metrics(**behavior_kwargs)


## 10. Bootstrap CIs


In [None]:
from src.bootstrap import run_bootstrap_pipeline

print("Computing bootstrap CIs...")
# Fixed seed and iteration count so the intervals are reproducible.
bootstrap_kwargs = {
    "output_root": OUTPUT_ROOT,
    "prompts_path": DATA_ROOT / "prompts.csv",
    "seed": 42,
    "n_iterations": 1000,
}
bootstrap_path = run_bootstrap_pipeline(**bootstrap_kwargs)
print(f"Bootstrap results saved to {bootstrap_path}")


## 11. Generate Figures and Tables


In [None]:
from src.visualize import run_visualization_pipeline

print("Generating figures and tables...")
visualization_kwargs = {
    "output_root": OUTPUT_ROOT,
    "prompts_path": DATA_ROOT / "prompts.csv",
    "limit_examples": 20,
}
viz_paths = run_visualization_pipeline(**visualization_kwargs)

# List every artifact the pipeline reported.
print("\nGenerated outputs:")
for key in viz_paths:
    print(f"  {key}: {viz_paths[key]}")


## 12. Final Summary


In [None]:
_summary_banner = "=" * 60
print(_summary_banner)
print("EXPERIMENT COMPLETE")
print(_summary_banner)
print(f"Run ID: {RUN_ID}")
print(f"Output root: {OUTPUT_ROOT}")

# Artifacts (relative to the run's output root) reported as OK or MISSING below.
expected_outputs = [
    "runs/completions_greedy.jsonl",
    "runs/completions_samples.jsonl",
    "runs/detection_mapping.jsonl",
    "runs/attention_metrics.csv",
    "runs/logit_lens.csv",
    "runs/ffn_attn_decomp.csv",
    "runs/patching_results.csv",
    "runs/bootstrap_results.csv",
    "figures/violation_rate_vs_p0.png",
]

print("\nKey outputs:")
for rel_path in expected_outputs:
    path = OUTPUT_ROOT / rel_path
    if path.exists():
        status = "OK"
    else:
        status = "MISSING"
    print(f"{status}: {path}")


## Appendix: Post-hoc Analyses (Optional)

Run the appendix setup cell once before the analyses below.


In [None]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Locations shared by all appendix cells.
RUN_ROOT = OUTPUT_ROOT
RUNS_DIR = RUN_ROOT / "runs"
FIGURES_DIR = RUN_ROOT / "figures"
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

PROMPTS_DF = pd.read_csv(DATA_ROOT / "prompts.csv")
# P0 lookup keyed by *string* prompt ID: the appendix cells look records up via
# str(prompt_id), which would never match if read_csv parsed the IDs as numbers.
# PROMPTS_DF itself is left untouched so merges on its original column still work.
P0_BY_ID = PROMPTS_DF.assign(prompt_id=PROMPTS_DF["prompt_id"].astype(str)).set_index("prompt_id")["p0"]


def _iter_greedy_negative_records(runs_dir: Path):
    """Yield greedy records for the ``negative`` condition from the detection file.

    Reads ``detection_mapping_greedy.jsonl`` when it exists, otherwise falls
    back to ``detection_mapping.jsonl``. A record counts as greedy when its
    ``sample_id`` is missing or the empty string. Blank lines are skipped.
    """
    path = runs_dir / "detection_mapping_greedy.jsonl"
    if not path.exists():
        path = runs_dir / "detection_mapping.jsonl"
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            rec = json.loads(line)
            if rec.get("condition") != "negative":
                continue
            if rec.get("sample_id", "") != "":
                continue
            yield rec


def load_greedy_outcomes(runs_dir: Path):
    """Map each prompt ID (as str) to ``"failure"`` or ``"success"``.

    ``"failure"`` means the greedy negative-condition record has a truthy
    ``word_present``. If a prompt appears more than once, the last record wins.
    """
    return {
        str(rec.get("prompt_id")): "failure" if rec.get("word_present", False) else "success"
        for rec in _iter_greedy_negative_records(runs_dir)
    }


def load_greedy_completions(runs_dir: Path):
    """Map each prompt ID (as str) to its greedy negative-condition completion text.

    Records without ``completion_text`` map to an empty string. If a prompt
    appears more than once, the last record wins.
    """
    return {
        str(rec.get("prompt_id")): rec.get("completion_text", "")
        for rec in _iter_greedy_negative_records(runs_dir)
    }


In [None]:
# Fitted curves: logistic + isotonic for violation vs P0 (with bootstrap CIs)
from sklearn.linear_model import LogisticRegression
from sklearn.isotonic import IsotonicRegression

run_root = RUN_ROOT
runs_dir = RUNS_DIR
figures_dir = FIGURES_DIR

# Records are looked up by str(prompt_id), so key the P0 series by string too;
# otherwise numeric IDs parsed from prompts.csv would never match.
p0_by_id = P0_BY_ID.copy()
p0_by_id.index = p0_by_id.index.astype(str)

# Collect one row per sampled (non-greedy) negative-condition completion.
det_path = runs_dir / "detection_mapping.jsonl"
rows = []
with det_path.open("r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        rec = json.loads(line)
        if rec.get("condition") != "negative":
            continue
        if rec.get("sample_id", "") == "":
            continue
        pid = str(rec.get("prompt_id"))
        if pid not in p0_by_id:
            continue
        rows.append({
            "prompt_id": pid,
            "p0": float(p0_by_id[pid]),
            "violation": 1 if rec.get("word_present", False) else 0,
        })

# Explicit columns keep the frame well-formed even when no rows were collected.
samples_df = pd.DataFrame(rows, columns=["prompt_id", "p0", "violation"])
print("sample rows:", len(samples_df))

if samples_df.empty:
    raise RuntimeError(
        f"No negative-condition samples with a known P0 found in {det_path}; "
        "run the behavioral and detection stages first."
    )
if samples_df["violation"].nunique() < 2:
    raise RuntimeError("All sampled outcomes are identical; the logistic fit needs both classes.")

X = samples_df[["p0"]].values
y = samples_df["violation"].values

log_model = LogisticRegression(solver="lbfgs", max_iter=1000)
log_model.fit(X, y)
coef = float(log_model.coef_[0, 0])
intercept = float(log_model.intercept_[0])

grid = np.linspace(0.0, 1.0, 101)
logit_pred = log_model.predict_proba(grid.reshape(-1, 1))[:, 1]

# Bootstrap over samples: refit the logistic model on each resample.
BOOT_N = 200
rng = np.random.default_rng(42)
boot_preds = []
boot_coefs = []
for _ in range(BOOT_N):
    boot = samples_df.sample(n=len(samples_df), replace=True, random_state=int(rng.integers(1e9)))
    yb = boot["violation"].values
    # A resample with a single class cannot be fitted; skip it.
    if len(np.unique(yb)) < 2:
        continue
    model_b = LogisticRegression(solver="lbfgs", max_iter=1000)
    model_b.fit(boot[["p0"]].values, yb)
    boot_preds.append(model_b.predict_proba(grid.reshape(-1, 1))[:, 1])
    boot_coefs.append((float(model_b.intercept_[0]), float(model_b.coef_[0, 0])))

boot_preds = np.array(boot_preds)
# If every resample was skipped there is no band to compute or draw.
have_ci = len(boot_preds) > 0
if have_ci:
    ci_low, ci_high = np.percentile(boot_preds, [2.5, 97.5], axis=0)
else:
    ci_low = ci_high = None
    print("WARNING: no bootstrap resample contained both classes; skipping CI band")

iso = IsotonicRegression(out_of_bounds="clip")
iso.fit(samples_df["p0"].values, samples_df["violation"].values)
iso_pred = iso.predict(grid)

plt.figure(figsize=(7, 5))
plt.scatter(samples_df["p0"], samples_df["violation"], s=6, alpha=0.05, label="samples")
plt.plot(grid, logit_pred, color="red", label="logistic fit")
if have_ci:
    plt.fill_between(grid, ci_low, ci_high, color="red", alpha=0.2, label="logistic 95% CI")
plt.plot(grid, iso_pred, color="black", linestyle="--", label="isotonic fit")
plt.xlabel("P0 (pressure)")
plt.ylabel("Violation probability")
plt.title("Violation vs P0: logistic + isotonic fits")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(figures_dir / "violation_rate_fit.png", dpi=200)
plt.close()

coef_ci = None
if boot_coefs:
    vals = np.array(boot_coefs)
    intercept_ci = np.percentile(vals[:, 0], [2.5, 97.5])
    coef_ci = np.percentile(vals[:, 1], [2.5, 97.5])
    print(f"logistic intercept={intercept:.4f} (95% CI {intercept_ci[0]:.4f}, {intercept_ci[1]:.4f})")
    print(f"logistic coef={coef:.4f} (95% CI {coef_ci[0]:.4f}, {coef_ci[1]:.4f})")
else:
    print(f"logistic intercept={intercept:.4f}")
    print(f"logistic coef={coef:.4f}")


In [None]:
# Decision-step suppression: delta P_sem (baseline - negative) by outcome
logit_df = pd.read_csv(RUNS_DIR / "logit_lens.csv")
final_layer = logit_df["layer"].max()
final_df = logit_df[logit_df["layer"] == final_layer]
# load_greedy_outcomes keys its dict by str(prompt_id); stringify the IDs here so
# the outcome mapping below still matches when read_csv parsed them as numbers.
final_df = final_df.assign(prompt_id=final_df["prompt_id"].astype(str))

# One row per prompt, one column per condition, at the final layer only.
pivot = final_df.pivot_table(
    index="prompt_id",
    columns="condition",
    values="p_sem_first_token",
    aggfunc="mean",
)
pivot = pivot.dropna()
pivot["delta_p_sem"] = pivot["baseline"] - pivot["negative"]

outcomes = load_greedy_outcomes(RUNS_DIR)
pivot["outcome"] = pivot.index.map(outcomes.get)
# Prompts without a greedy outcome map to missing and are dropped here.
pivot = pivot[pivot["outcome"].isin(["success", "failure"])].copy()

PIVOT_DF = pivot


def bootstrap_mean(values, n_boot=1000, seed=42):
    """Return a 95% percentile-bootstrap interval for the mean of *values*.

    Args:
        values: Sequence of numbers to resample.
        n_boot: Number of bootstrap resamples.
        seed: Seed for the random generator, making the result deterministic.

    Returns:
        ``(low, high)`` at the 2.5th and 97.5th percentiles of the resampled
        means, or ``(nan, nan)`` when *values* is empty.
    """
    data = np.asarray(values)
    generator = np.random.default_rng(seed)
    size = len(data)
    if size == 0:
        return (np.nan, np.nan)
    resampled_means = [
        generator.choice(data, size=size, replace=True).mean() for _ in range(n_boot)
    ]
    low, high = np.percentile(resampled_means, [2.5, 97.5])
    return (low, high)


# Mean delta and bootstrap CI per greedy outcome.
summary = []
for outcome in ("success", "failure"):
    deltas = pivot.loc[pivot["outcome"] == outcome, "delta_p_sem"].values
    ci_bounds = bootstrap_mean(deltas)
    mean_delta = float(np.mean(deltas)) if len(deltas) else np.nan
    summary.append(
        dict(
            outcome=outcome,
            mean=mean_delta,
            ci_low=ci_bounds[0],
            ci_high=ci_bounds[1],
            n=int(len(deltas)),
        )
    )

summary_df = pd.DataFrame(summary)
print(summary_df)

# Bar chart with asymmetric error bars taken from the bootstrap interval.
plt.figure(figsize=(5, 4))
x = np.arange(len(summary_df))
means = summary_df["mean"].values
lower_err = means - summary_df["ci_low"].values
upper_err = summary_df["ci_high"].values - means
yerr = [lower_err, upper_err]
plt.bar(x, means, yerr=yerr, capsize=4, color=["green", "red"])
plt.xticks(x, summary_df["outcome"].values)
plt.ylabel("Delta P_sem (baseline - negative)")
plt.title("Decision-step suppression by outcome")
plt.tight_layout()
plt.savefig(FIGURES_DIR / "suppression_delta_by_outcome.png", dpi=200)
plt.close()


In [None]:
# Failure-mode taxonomy (priming vs override) with labeled examples
if "PIVOT_DF" not in globals():
    raise RuntimeError("Run the decision-step suppression cell first.")


def _optional_float(value):
    """Return *value* as a float, or None when it is missing (NaN/None)."""
    return float(value) if pd.notna(value) else None


attn_df = pd.read_csv(RUNS_DIR / "attention_metrics.csv")
attn = attn_df.copy()
if "aggregate_flag" in attn.columns:
    attn = attn[attn["aggregate_flag"] == "global_mean"]
if "condition" in attn.columns:
    attn = attn[attn["condition"] == "negative"]
# All prompt-ID keys in this cell are normalised to str so the join and the
# lookups below agree even if some CSVs parse the IDs as numbers.
attn = attn.assign(prompt_id=attn["prompt_id"].astype(str)).set_index("prompt_id")[["iar", "nf", "tmf", "pi"]]

supp_df = PIVOT_DF.copy()
supp_df.index = supp_df.index.astype(str)
supp_df = supp_df.join(attn, how="left")
supp_df["tmf_minus_nf"] = supp_df["tmf"] - supp_df["nf"]

fail_df = supp_df[supp_df["outcome"] == "failure"].copy()

# Priming: negative instruction increases P_sem or attention to target mention outweighs negation
# (a missing tmf_minus_nf compares False, so such rows are labelled by delta_p_sem alone)
priming_mask = (fail_df["delta_p_sem"] < 0) | (fail_df["tmf_minus_nf"] > 0.02)
fail_df["failure_mode"] = np.where(priming_mask, "priming", "override")

print(fail_df["failure_mode"].value_counts())

comps = load_greedy_completions(RUNS_DIR)
prompt_info = PROMPTS_DF.assign(prompt_id=PROMPTS_DF["prompt_id"].astype(str)).set_index("prompt_id")[
    ["category", "question_text", "target_word"]
]

# Up to ten examples per mode, ranked by the signal that defines the mode.
examples = []
for mode in ["priming", "override"]:
    sub = fail_df[fail_df["failure_mode"] == mode].copy()
    if mode == "priming":
        sub = sub.sort_values("tmf_minus_nf", ascending=False)
    else:
        sub = sub.sort_values("delta_p_sem", ascending=False)
    for _, row in sub.head(10).iterrows():
        pid = str(row.name)
        info = prompt_info.loc[pid] if pid in prompt_info.index else {}
        examples.append({
            "prompt_id": pid,
            "category": str(info.get("category", "")),
            "question_text": str(info.get("question_text", "")),
            "target_word": str(info.get("target_word", "")),
            "completion_text": comps.get(pid, ""),
            "failure_mode": mode,
            "delta_p_sem": float(row["delta_p_sem"]),
            "iar": _optional_float(row["iar"]),
            "nf": _optional_float(row["nf"]),
            "tmf": _optional_float(row["tmf"]),
            "pi": _optional_float(row["pi"]),
            "tmf_minus_nf": _optional_float(row["tmf_minus_nf"]),
        })

examples_path = RUN_ROOT / "appendix_examples" / "failure_modes.jsonl"
examples_path.parent.mkdir(parents=True, exist_ok=True)
with examples_path.open("w", encoding="utf-8") as f:
    for ex in examples:
        f.write(json.dumps(ex, ensure_ascii=True) + "\n")

print(f"Wrote {len(examples)} labeled examples to {examples_path}")


In [None]:
# Stable suppression plot (log suppression vs P0 bin): load the per-prompt table
psem_df = pd.read_csv(RUNS_DIR / "psem.csv")
has_bin_column = "p0_bin" in psem_df.columns
if not has_bin_column:
    # Take the bin labels from prompts.csv when psem.csv does not carry them.
    bin_lookup = PROMPTS_DF[["prompt_id", "p0_bin"]]
    psem_df = psem_df.merge(bin_lookup, on="prompt_id", how="inner")

# Prefer a column named "log"; otherwise use "log_suppression".
if "log" in psem_df.columns:
    metric_col = "log"
else:
    metric_col = "log_suppression"


def parse_bin(bin_str):
    """Return the lower edge of a ``"low-high"`` bin label, for use as a sort key.

    Labels that are not strings, are empty, or whose lower edge is not numeric
    yield ``0.0``. ``str.split`` never returns an empty list, so an emptiness
    check on its result cannot catch these cases.
    """
    if not isinstance(bin_str, str):
        return 0.0
    lower_edge = bin_str.split("-", 1)[0]
    try:
        return float(lower_edge)
    except ValueError:
        return 0.0


# Bins in ascending order of their lower edge.
bins = sorted(psem_df["p0_bin"].unique(), key=parse_bin)
centers = []
means = []
ci_low = []
ci_high = []

# One shared generator, so the resamples for each bin depend on bin order.
rng = np.random.default_rng(42)
for b in bins:
    vals = psem_df.loc[psem_df["p0_bin"] == b, metric_col].dropna().values
    if len(vals) == 0:
        continue
    # Bin center = midpoint of the numeric edges in the "low-high" label.
    centers.append(sum(float(edge) for edge in b.split("-")) / 2.0)
    means.append(vals.mean())
    boot = [rng.choice(vals, size=len(vals), replace=True).mean() for _ in range(500)]
    low_bound, high_bound = np.percentile(boot, [2.5, 97.5])
    ci_low.append(low_bound)
    ci_high.append(high_bound)

plt.figure(figsize=(6, 4))
means_arr = np.array(means)
yerr = [means_arr - np.array(ci_low), np.array(ci_high) - means_arr]
plt.errorbar(centers, means, yerr=yerr, fmt="s-", capsize=4, color="orange")
plt.xlabel("P0 Bin Center")
plt.ylabel("Log suppression")
plt.title("Semantic suppression vs pressure (log scale)")
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(FIGURES_DIR / "suppression_log_vs_p0.png", dpi=200)
plt.close()
