# Semantic Gravity Experiment - Full Pipeline

This notebook runs the complete Stage 2 pipeline on an A100 GPU in Google Colab.

**Prerequisites:**
- Google Colab with A100 GPU runtime
- Qwen model files in Google Drive
- Source files and validated prompts synced to Drive

**Pipeline Stages** (the section headings below are numbered from 0, so stage *N* in this list is section *N − 1*):
1. Environment validation and setup
2. Detector self-tests (hard halt on failure)
3. Load model
4. Finalize dataset with P_sem (uses finalize_dataset_with_psem)
5. Run mechanistic passes (greedy + hidden states)
6. Run behavioral passes (16 samples)
7. Detection/mapping (greedy for mechanistic, samples for behavioral)
8. Compute metrics at TARGET DECISION STEP (attention, logit lens, decomp)
9. Activation patching
10. Bootstrap CIs
11. Generate figures and tables

## 0. Environment Setup and Validation

In [None]:
# Install required packages
!pip install -q torch transformers accelerate tokenizers numpy pandas scipy scikit-learn matplotlib seaborn tqdm requests wordfreq SPARQLWrapper

In [None]:
# Mount Google Drive
# The model, source, and data paths configured later in this notebook all live under
# /content/drive, so this cell has to succeed before any later cell is run.
from google.colab import drive
# NOTE(review): force_remount=False presumably leaves an existing mount in place when
# the cell is re-run — confirm against the Colab documentation.
drive.mount('/content/drive', force_remount=False)
print("Drive mounted successfully")

In [None]:
# Validate A100 GPU (HARD HALT if not present)
# Lists the attached GPUs with `nvidia-smi -L` and stops the notebook with a
# RuntimeError unless one of them is an A100.
import subprocess

try:
    result = subprocess.run(['nvidia-smi', '-L'], capture_output=True, text=True)
except FileNotFoundError as exc:
    # The executable itself is missing (e.g. a runtime without NVIDIA tooling).
    # Surface that as the same hard-halt error instead of a bare FileNotFoundError.
    raise RuntimeError(
        "A100 GPU required! `nvidia-smi` was not found, so no GPU runtime appears to be attached."
        "\nPlease change runtime to A100 GPU."
    ) from exc

gpu_info = result.stdout
print(gpu_info)

if 'A100' not in gpu_info:
    # When nvidia-smi itself fails, stdout is empty and the explanation is on stderr;
    # fall back to it so the error message is never blank.
    raise RuntimeError(
        "A100 GPU required! Current GPU info:\n" + (gpu_info or result.stderr) +
        "\nPlease change runtime to A100 GPU."
    )
print("✓ A100 GPU confirmed")

In [None]:
# Configuration - UPDATE THESE PATHS
import os
import sys
from datetime import datetime
from pathlib import Path

# Locations on the mounted Drive: model weights, experiment data, project sources.
MODEL_PATH = "/content/drive/MyDrive/models/Qwen2.5-7B-Instruct"
DATA_ROOT = Path("/content/drive/MyDrive/SemanticGravity")
SRC_PATH = "/content/drive/MyDrive/SemanticGravity/src"

# Every run writes under its own timestamped directory (minute resolution).
_run_stamp = datetime.now().strftime('%Y%m%d_%H%M')
RUN_ID = f"experiment_run_{_run_stamp}"
OUTPUT_ROOT = DATA_ROOT.joinpath("outputs", RUN_ID)

print(f"Run ID: {RUN_ID}")
print(f"Output root: {OUTPUT_ROOT}")

# Put the project sources first on the import path so the experiment modules
# imported in later cells resolve to this copy.
sys.path.insert(0, SRC_PATH)

In [None]:
# Environment flags for efficiency
import os
# Set before the tokenizer-using libraries below are imported.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
# Turn on TF32 for both the CUDA matmul backend and the cuDNN backend.
for _tf32_backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
    _tf32_backend.allow_tf32 = True

# Version check: compare only (major, minor) so suffixes in later version components
# (e.g. a ".dev0" tail) never reach int().
import transformers
print(f"Transformers version: {transformers.__version__}")
_installed_version = tuple(int(part) for part in transformers.__version__.split('.')[:2])
_minimum_version = (4, 37)
assert _installed_version >= _minimum_version, \
    f"Transformers >= 4.37 required, got {transformers.__version__}"
print("✓ Transformers version OK")

In [None]:
# Import experiment modules
# These are project-local modules; they resolve only after the configuration cell
# has inserted SRC_PATH at the front of sys.path.
from config import CONFIG, PROMPT_TEMPLATES, setup_directories, validate_environment
from utils import set_seed, ModelWrapper, setup_logging
from prompt_builder import build_prompt

# Validate environment
# The returned mapping is kept in `metadata` and later extended and saved as the
# run metadata file. Missing keys are displayed as 'N/A' rather than raising.
metadata = validate_environment()
print(f"GPU: {metadata.get('gpu_name', 'N/A')}")
print(f"CUDA version: {metadata.get('cuda_version', 'N/A')}")

In [None]:
# Set seeds for reproducibility
import json

set_seed(42)

# Setup directories (the keys of the returned mapping are printed for reference)
dirs = setup_directories()
print("Directories created:", list(dirs.keys()))

# Save run metadata: extend the environment metadata with this run's identity
# and write it as JSON into the run's output directory.
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)
for _meta_key, _meta_value in (
    ('run_id', RUN_ID),
    ('output_root', str(OUTPUT_ROOT)),
    ('model_path', MODEL_PATH),
):
    metadata[_meta_key] = _meta_value

_metadata_file = OUTPUT_ROOT / 'run_metadata.json'
with open(_metadata_file, 'w') as meta_fh:
    # default=str stringifies any value json cannot serialize natively.
    json.dump(metadata, meta_fh, indent=2, default=str)
print(f"Run metadata saved to {OUTPUT_ROOT}/run_metadata.json")

## 1. Detector Self-Tests (Hard Halt on Failure)

In [None]:
# Run detector self-tests
print("Running detector self-tests...")

from detector import word_present, detect_and_map

# One entry per check:
#   (target word, text, should it match?, assertion message, line printed on success)
_detector_cases = [
    ('space', 'spacetime', False,
     "'space' should not match 'spacetime'", "✓ 'space' not in 'spacetime'"),
    ('space', 'The answer is space.', True,
     "'space' should match 'space.'", "✓ 'space.' detection"),
    ('space', 'space2', False,
     "'space' should not match 'space2'", "✓ 'space' not in 'space2'"),
    ('space', 'space-time', True,
     "'space' should match 'space-time'", "✓ 'space' in 'space-time'"),
    ('apple', "I can't say 'apple' so...", True,
     "quoted 'apple' should match", "✓ 'apple' in quoted phrase"),
]

for _target, _text, _should_match, _failure_msg, _success_line in _detector_cases:
    # Compare on truthiness so any truthy/falsy return value is judged the same way
    # a plain `assert x` / `assert not x` would judge it. The first failure halts the cell.
    assert bool(word_present(_target, _text)) is _should_match, _failure_msg
    print(_success_line)

print("\n" + "="*60)
print("All detector self-tests passed!")
print("="*60)

## 2. Load Model

In [None]:
# Load Qwen model
# The wrapper obtained here is stored in the global `wrapper`, which the later cells
# use for tokenization, generation, and dataset finalization.
# NOTE(review): get_instance() suggests a shared/singleton wrapper — confirm in utils.
print("Loading Qwen model...")
wrapper = ModelWrapper.get_instance()
wrapper.load(model_path=MODEL_PATH)

# Sanity printout: the wrapper's load flag, the tokenizer length, and the model dtype.
print(f"Model loaded: {wrapper.is_loaded}")
print(f"Vocab size: {len(wrapper.tokenizer)}")
print(f"Model dtype: {wrapper.model.dtype}")

In [None]:
# Filter validated prompts to targets with at least one single-token variant
from metrics_psem import token_sequences_for_variants
import json

# Directory that holds one `<category>_validated.jsonl` file per category.
validated_dir = DATA_ROOT / "data" / "validated"
# Category names are taken from the project configuration.
categories = CONFIG['dataset']['categories']

def _load_jsonl(path):
    rows = []
    with open(path, 'r') as f:
        for line in f:
            if line.strip():
                rows.append(json.loads(line))
    return rows

def _write_jsonl(path, rows):
    with open(path, 'w') as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=True) + "\n")

import shutil

# The filter below rewrites each validated file IN PLACE. Keep a one-time pristine
# copy of every file in a sibling directory first, so rows dropped here (which depend
# on the currently loaded tokenizer) can be recovered later. An existing backup is
# never overwritten, so it always reflects the original, unfiltered data.
backup_dir = validated_dir.parent / "validated_unfiltered"
backup_dir.mkdir(parents=True, exist_ok=True)

removed = {}  # category -> number of rows dropped
kept = {}     # category -> number of rows retained

for category in categories:
    path = validated_dir / f"{category}_validated.jsonl"
    if not path.exists():
        raise FileNotFoundError(f"Missing validated file: {path}")

    backup_path = backup_dir / path.name
    if not backup_path.exists():
        shutil.copy2(path, backup_path)

    rows = _load_jsonl(path)
    keep_rows = []
    drop_rows = []
    for r in rows:
        target = r.get('target_word', '')
        seqs = token_sequences_for_variants(target, wrapper.tokenizer)
        # A row is kept when at least one variant's token sequence has length 1.
        has_single = any(len(seq) == 1 for seq in seqs)
        (keep_rows if has_single else drop_rows).append(r)

    _write_jsonl(path, keep_rows)
    removed[category] = len(drop_rows)
    kept[category] = len(keep_rows)

print("Filtered validated prompts to single-token targets:")
for cat in categories:
    print(f"  {cat}: kept={kept.get(cat, 0)} removed={removed.get(cat, 0)}")
print(f"Unfiltered originals preserved in {backup_dir}")


In [None]:
# Quick generation test
# Smoke test: build a single prompt with the project's prompt builder and generate a
# few tokens with sampling disabled, to confirm the loaded model can produce output.
test_prompt = build_prompt("The capital of France is ____.", "Paris", "baseline")
# Only the 'generated_text' entry of the returned mapping is used here.
# Note: this rebinds the global `result` that the GPU validation cell also assigned.
result = wrapper.generate(prompt=test_prompt, max_new_tokens=8, do_sample=False)
print(f"Test prompt: {test_prompt}")
print(f"Generated: {result['generated_text']}")
print("✓ Model generation works")

## 3. Finalize Dataset with P_sem

In [None]:
from dataset_pipeline import finalize_dataset_with_psem

print("Finalizing dataset with P_sem computation...")
print("This computes P0/P1, applies gating, bin balancing, and writes prompts.csv")

# Validated inputs are read from <DATA_ROOT>/data/validated and the finalized
# dataset is written under <DATA_ROOT>/data; `data_root` is reused by later cells.
data_root = DATA_ROOT / "data"
validated_dir = data_root / "validated"

final_by_category = finalize_dataset_with_psem(
    validated_dir=validated_dir,
    output_root=data_root,
    model_wrapper=wrapper,
    prompts_per_category=500,
)

# Per-category counts, then the grand total used again in the final summary cell.
_selected_counts = {cat: len(prompts) for cat, prompts in final_by_category.items()}
total_selected = sum(_selected_counts.values())
print(f"\nTotal selected: {total_selected}")
for cat, _count in _selected_counts.items():
    print(f"  {cat}: {_count}")

print(f"\nPrompts saved to {data_root / 'prompts.csv'}")

## 4. Mechanistic Runs (Greedy + Hidden States)

In [None]:
from runner import run_experiment

print("Running mechanistic passes...")

# Only the behavioral stage is skipped here, so this call performs the mechanistic
# stage on its own; results are written under this run's OUTPUT_ROOT.
# NOTE(review): limit=None presumably means "process every prompt" — confirm in runner.
mechanistic_results = run_experiment(
    prompts_csv=str(data_root / "prompts.csv"),
    output_root=str(OUTPUT_ROOT),
    skip_mechanistic=False,
    skip_behavioral=True,
    limit=None
)

# The returned mapping is expected to contain 'mechanistic_completed' (KeyError otherwise).
print(f"\nMechanistic completed: {mechanistic_results['mechanistic_completed']}")

## 5. Behavioral Runs (16 Samples)

In [None]:
print("Running behavioral passes...")

# Mirror image of the mechanistic cell: the same prompts file and output directory,
# but with the mechanistic stage skipped so only the behavioral stage runs.
# NOTE(review): limit=None presumably means "process every prompt" — confirm in runner.
behavioral_results = run_experiment(
    prompts_csv=str(data_root / "prompts.csv"),
    output_root=str(OUTPUT_ROOT),
    skip_mechanistic=True,
    skip_behavioral=False,
    limit=None
)

# The returned mapping is expected to contain 'behavioral_completed' (KeyError otherwise).
print(f"\nBehavioral completed: {behavioral_results['behavioral_completed']}")

## 6. Detection/Mapping

Two detection passes:
1. **Greedy-only** → `detection_mapping_greedy.jsonl` (for mechanistic metrics)
2. **Samples** → `detection_mapping.jsonl` (for behavioral metrics/plotting)

In [None]:
from detector import detect_and_map
import json
from tqdm import tqdm

import pandas as pd

# Completion files and the detection outputs derived from them both live in the
# run's "runs" directory.
runs_dir = OUTPUT_ROOT / "runs"
runs_dir.mkdir(parents=True, exist_ok=True)

prompts_df = pd.read_csv(data_root / "prompts.csv")

# Lookup table: prompt_id (as a string) -> target word.
# Built column-wise instead of with DataFrame.iterrows(): iterrows materializes a
# Series per row (slow on large frames) and does not preserve dtypes across a row,
# which can change how an integer id is rendered by str().
target_by_id = dict(
    zip(prompts_df['prompt_id'].astype(str), prompts_df['target_word'])
)

def run_detection(input_path, output_path, desc):
    """Run target-word detection over a completions JSONL file and save the results.

    Args:
        input_path: ``pathlib.Path`` of the completions file. Each non-blank line is a
            JSON object that must provide ``prompt_id``, ``generated_text`` and
            ``condition``; ``generated_token_ids`` and ``target_word`` are optional.
        output_path: Path of the JSONL file the detection records are written to.
        desc: Label shown on the progress bar.

    Returns:
        tuple: ``(results, mapping_errors)`` where ``results`` is the list of written
        records and ``mapping_errors`` counts the records whose detection result has a
        truthy ``mapping_error`` entry. If ``input_path`` does not exist, a warning is
        printed, nothing is written, and ``([], 0)`` is returned.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    if not input_path.exists():
        print(f"WARNING: {input_path} not found")
        return [], 0

    # Read as UTF-8 and drop whitespace-only lines: a stray blank line (e.g. a
    # trailing one) would otherwise make json.loads raise and abort the whole pass.
    with open(input_path, 'r', encoding='utf-8') as f:
        lines = [line for line in f if line.strip()]

    results = []
    mapping_errors = 0

    for line in tqdm(lines, desc=desc):
        row = json.loads(line)
        prompt_id = str(row['prompt_id'])
        # Prefer the target recorded in prompts.csv; fall back to the one stored
        # with the completion itself (or '' when neither is available).
        target = target_by_id.get(prompt_id, row.get('target_word', ''))

        result = detect_and_map(
            target=target,
            completion_text=row['generated_text'],
            token_ids=row.get('generated_token_ids', []),
            tokenizer=wrapper.tokenizer,
            prompt_id=prompt_id,
            condition=row['condition']
        )
        # Detector fields are merged last, so they win on any key collision.
        results.append({
            'prompt_id': prompt_id,
            'condition': row['condition'],
            'completion_text': row['generated_text'],
            'target_word': target,
            **result
        })
        if result.get('mapping_error'):
            mapping_errors += 1

    with open(output_path, 'w', encoding='utf-8') as f:
        for r in results:
            # default=str stringifies any value json cannot serialize natively.
            f.write(json.dumps(r, default=str) + '\n')

    return results, mapping_errors

# Pass 1: Greedy completions -> detection_mapping_greedy.jsonl
print("\n=== Detection Pass 1: Greedy completions ===")
greedy_results, greedy_errors = run_detection(
    runs_dir / "completions_greedy.jsonl",
    runs_dir / "detection_mapping_greedy.jsonl",
    "Greedy detection"
)
print(f"Greedy: {len(greedy_results)} entries, {greedy_errors} mapping errors")

# Hard halt when more than 0.1% of the greedy completions failed to map.
# The rate is only computed when there is at least one result and one error.
if greedy_results and greedy_errors > 0:
    error_rate = greedy_errors / len(greedy_results)
    if error_rate > 0.001:
        raise RuntimeError(f"HARD HALT: Greedy mapping error rate {error_rate:.4%} exceeds 0.1%")

# Pass 2: Sample completions -> detection_mapping.jsonl (for behavioral metrics)
print("\n=== Detection Pass 2: Sample completions ===")
samples_path = runs_dir / "completions_samples.jsonl"
if not samples_path.exists():
    # Sample completions are optional for this cell; their absence is only reported.
    print("No sample completions found - skipping")
else:
    sample_results, sample_errors = run_detection(
        samples_path,
        runs_dir / "detection_mapping.jsonl",
        "Sample detection"
    )
    print(f"Samples: {len(sample_results)} entries, {sample_errors} mapping errors")

print("\n✓ Detection complete")

## 7. Compute Metrics at TARGET DECISION STEP

- Uses `detection_mapping_greedy.jsonl` for mechanistic metrics
- Decision step = token index where the target first appears
- For the obey case (`word_present=False`): `decision_step = 0`

In [None]:
from metrics_attn import compute_attention_metrics, compute_logit_lens_and_decomp

# Both metric passes receive the same two inputs: this run's output directory and
# the finalized prompts file.
print("Computing attention metrics at target decision step...")
attn_path = compute_attention_metrics(
    output_root=OUTPUT_ROOT,
    prompts_path=data_root / "prompts.csv"
)
# The return value is reported as the location of the saved attention metrics.
print(f"Attention metrics saved to {attn_path}")

print("\nComputing logit lens and decomposition...")
decomp_paths = compute_logit_lens_and_decomp(
    output_root=OUTPUT_ROOT,
    prompts_path=data_root / "prompts.csv"
)
# The result is read with .get(), so a missing key prints None instead of raising.
print(f"Logit lens: {decomp_paths.get('logit_lens_path')}")
print(f"Decomposition: {decomp_paths.get('ffn_attn_decomp_path')}")

## 8. Activation Patching

In [None]:
from patching import select_patching_subset, run_activation_patching

print("Selecting patching subset...")
subset = select_patching_subset(
    output_root=OUTPUT_ROOT,
    prompts_path=data_root / "prompts.csv"
)
# `subset` is only used for this count; it is NOT passed to run_activation_patching.
# NOTE(review): presumably the selection is persisted under output_root and picked
# up from there by the patching run — confirm in patching.py.
print(f"Selected {len(subset)} prompts for patching")

print("\nRunning activation patching...")
patching_path = run_activation_patching(
    output_root=OUTPUT_ROOT,
    prompts_path=data_root / "prompts.csv"
)
# The return value is reported as the location of the saved patching results.
print(f"Patching results saved to {patching_path}")

## 9. Bootstrap CIs

In [None]:
from bootstrap import run_bootstrap_pipeline

print("Computing bootstrap CIs...")
# The seed passed here (42) is the same value given to set_seed() during setup;
# n_iterations=1000 is the iteration count handed to the bootstrap pipeline.
bootstrap_path = run_bootstrap_pipeline(
    output_root=OUTPUT_ROOT,
    prompts_path=data_root / "prompts.csv",
    seed=42,
    n_iterations=1000
)
# The return value is reported as the location of the saved bootstrap results.
print(f"Bootstrap results saved to {bootstrap_path}")

## 10. Generate Figures and Tables

In [None]:
from visualize import run_visualization_pipeline

print("Generating figures and tables...")
# NOTE(review): limit_examples=20 presumably caps how many example items are
# rendered — confirm in visualize.
viz_paths = run_visualization_pipeline(
    output_root=OUTPUT_ROOT,
    prompts_path=data_root / "prompts.csv",
    limit_examples=20
)

# The pipeline returns a mapping of output name -> path; list every entry.
print("\nGenerated outputs:")
for key, path in viz_paths.items():
    print(f"  {key}: {path}")

## 11. Final Summary

In [None]:
# Closing banner with the run's identity and the prompt total from dataset finalization.
_banner = "="*60
print(_banner)
print("EXPERIMENT COMPLETE")
print(_banner)
print(f"\nRun ID: {RUN_ID}")
print(f"Output root: {OUTPUT_ROOT}")
print(f"\nTotal prompts processed: {total_selected}")

# Inventory of every file under the run's output directory, with its size in KB.
print("\nOutput files:")
for _artifact in OUTPUT_ROOT.rglob('*'):
    if not _artifact.is_file():
        continue  # directories are walked but not listed
    size_kb = _artifact.stat().st_size / 1024
    print(f"  {_artifact.relative_to(OUTPUT_ROOT)}: {size_kb:.1f} KB")

In [None]:
# Closing message; the run's outputs were written under OUTPUT_ROOT, which is
# located on the mounted Drive.
print("\nExperiment complete. All artifacts saved to Google Drive.")