In [None]:
# --- STEP 1: INSTALLATION (The "Lab Equipment") ---
# We install TransformerLens, the standard tool for Mechanistic Interpretability.
# We also install 'accelerate' and 'einops' for tensor operations.
!pip install transformer_lens accelerate einops > /dev/null
print("✅ Laboratory Installed.")

import torch
import torch.nn.functional as F
from transformer_lens import HookedTransformer

# --- STEP 2: LOAD THE SUBJECT (The Brain) ---
# We use GPT-2 Small. It's tiny (117M params) but sufficient to see structure.
# It runs instantly on the Colab Free Tier (CPU or T4 GPU).
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"✅ Loading Model on: {device}...")
model = HookedTransformer.from_pretrained("gpt2-small", device=device)
print("✅ Model Active.")

# --- STEP 3: THE STIMULUS (The Experiment) ---
# We create two inputs. Same topic (Health), different Rigor.

# Input A: High Rigor (Mechanistic, Formal)
# Note: We focus on specific keywords like 'randomized', 'control', 'p-value'.
text_rigor = "The double-blind study demonstrated a statistically significant reduction in inflammation (p < 0.05)."

# Input B: Low Rigor (Anecdotal, Casual)
# Note: We use flags like 'my cousin', 'miracle', 'guaranteed'.
text_sus   = "My cousin tried this miracle tea and it totally cured his inflammation in like two days, guaranteed."

# --- STEP 4: THE PROBE (The Scan) ---
# We don't want the output text. We want the internal 'thought vector'.
# We extract the 'Residual Stream' from the Middle Layer (Layer 6 of 12).
# This is where the model processes "concepts" before turning them back into words.

layer_idx = 6
hook_name = f"blocks.{layer_idx}.hook_resid_post"

print(f"\n🔬 Probing Layer {layer_idx}...")

# Run the Rigorous Text
with torch.no_grad():
    # run_with_cache returns the output AND a dictionary of all internal activations
    _, cache_rigor = model.run_with_cache(text_rigor)
    # We grab the vector for the FINAL token (the summary of the sentence so far)
    vec_rigor = cache_rigor[hook_name][0, -1, :]

# Run the Sus Text
with torch.no_grad():
    _, cache_sus = model.run_with_cache(text_sus)
    vec_sus = cache_sus[hook_name][0, -1, :]

# --- STEP 5: THE ANALYSIS (The Math) ---
# Cosine Similarity measures the angle between two vectors.
# 1.0 = Identical Direction (The model thinks these are the same concept)
# 0.0 = Unrelated (Orthogonal)
# -1.0 = Opposites

similarity = F.cosine_similarity(vec_rigor, vec_sus, dim=0).item()

print("-" * 30)
print(f"High Rigor Input: '{text_rigor}'")
print(f"Low Rigor Input:  '{text_sus}'")
print("-" * 30)
print(f"STRUCTURAL SIMILARITY: {similarity:.4f}")
print("-" * 30)

# Interpretation Logic
if similarity < 0.90:
    print("RESULT: SIGNIFICANT DIVERGENCE.")
    print("The model encodes these two inputs in distinctly different geometric spaces.")
    print("Hypothesis Supported: 'Rigor' acts as a separating vector.")
else:
    print("RESULT: HIGH SIMILARITY.")
    print("The model views these primarily as 'Health Topics' and ignores the Rigor distinction.")

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
plum-dispatch 2.6.0 requires beartype>=0.16.2, but you have beartype 0.14.1 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
pytensor 2.35.1 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
shap 0.50.0 requires numpy>=2, but you have numpy 1.26.4 which is incompatible.
jaxlib 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
jax 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.[0m

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# --- STEP 1: FIX DEPENDENCIES ---
# We force upgrade numpy to version 2.0+ to match the expectations of newer libraries.
# We also fix the 'beartype' conflict mentioned in your logs.
!pip uninstall -y numpy
!pip install "numpy>=2.0.0" "beartype>=0.16.2"

# Re-install the main lab tools just to be safe
!pip install transformer_lens accelerate einops

print("✅ Libraries updated.")
print("⚠️ RESTARTING RUNTIME NOW... (This is normal!)")

# This command kills the process to force-reload the new libraries.
# You will see a "Session Crashed" message. Just click "Ok" or wait for it to reconnect.
import os
os.kill(os.getpid(), 9)

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Collecting numpy>=2.0.0
  Downloading numpy-2.4.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting beartype>=0.16.2
  Downloading beartype-0.22.9-py3-none-any.whl.metadata (37 kB)
Downloading numpy-2.4.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m117.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading beartype-0.22.9-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy, beartype
  Attempting uninstall: beartype
    Found existing installation: beartype 0.14.1
    Uninstalling beartype-0.14.1:
      Successfully uninstalled beartype-0.14.1
[31mERROR: pip's dependency resolver does not currently take in

Collecting beartype<0.15.0,>=0.14.1 (from transformer_lens)
  Using cached beartype-0.14.1-py3-none-any.whl.metadata (28 kB)
Collecting numpy<2,>=1.26 (from transformer_lens)
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached beartype-0.14.1-py3-none-any.whl (739 kB)
Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
^C


In [None]:
import torch
import torch.nn.functional as F
from transformer_lens import HookedTransformer

# --- LOAD THE SUBJECT ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"✅ Loading Model on: {device}...")
model = HookedTransformer.from_pretrained("gpt2-small", device=device)

# --- THE STIMULUS ---
# Input A: High Rigor
text_rigor = "The double-blind study demonstrated a statistically significant reduction in inflammation (p < 0.05)."
# Input B: Low Rigor
text_sus   = "My cousin tried this miracle tea and it totally cured his inflammation in like two days, guaranteed."

# --- THE PROBE ---
layer_idx = 6
hook_name = f"blocks.{layer_idx}.hook_resid_post"

print(f"\n🔬 Probing Layer {layer_idx}...")

with torch.no_grad():
    _, cache_rigor = model.run_with_cache(text_rigor)
    vec_rigor = cache_rigor[hook_name][0, -1, :]

with torch.no_grad():
    _, cache_sus = model.run_with_cache(text_sus)
    vec_sus = cache_sus[hook_name][0, -1, :]

# --- THE ANALYSIS ---
similarity = F.cosine_similarity(vec_rigor, vec_sus, dim=0).item()

print("-" * 30)
print(f"STRUCTURAL SIMILARITY: {similarity:.4f}")
print("-" * 30)

if similarity < 0.90:
    print("RESULT: SIGNIFICANT DIVERGENCE (Success)")
else:
    print("RESULT: HIGH SIMILARITY (Inconclusive)")



✅ Loading Model on: cpu...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Loaded pretrained model gpt2-small into HookedTransformer

🔬 Probing Layer 6...
------------------------------
STRUCTURAL SIMILARITY: 0.8126
------------------------------
RESULT: SIGNIFICANT DIVERGENCE (Success)


In [None]:
# --- EXPERIMENT 2: THE "STYLE CONTROL" TEST ---
# We keep the sentence structure identical.
# We change ONLY the sample size integer.

# Input A: Valid Stats
text_valid = "The clinical trial reported a significant p-value of 0.04 with a sample size of N=2000."

# Input B: Garbage Stats (The "P-Hacking" Logic)
text_invalid = "The clinical trial reported a significant p-value of 0.04 with a sample size of N=3."

print(f"\n🔬 CONTROLLING FOR STYLE...")
print(f"Comparing N=2000 vs N=3")

# --- PROBE ---
with torch.no_grad():
    _, cache_valid = model.run_with_cache(text_valid)
    # Note: We must grab the vector at the SAME token position to be fair.
    # Both sentences are the same length, so '[-1]' works perfectly.
    vec_valid = cache_valid[hook_name][0, -1, :]

with torch.no_grad():
    _, cache_invalid = model.run_with_cache(text_invalid)
    vec_invalid = cache_invalid[hook_name][0, -1, :]

# --- MEASURE ---
similarity_control = F.cosine_similarity(vec_valid, vec_invalid, dim=0).item()

print("-" * 30)
print(f"LOGICAL SIMILARITY: {similarity_control:.4f}")
print("-" * 30)

if similarity_control > 0.98:
    print("RESULT: MODEL BLINDNESS.")
    print("The model sees these as identical. It is looking at Tone, not Math.")
    print("It effectively 'believes' the bad stats because they sound formal.")
else:
    print("RESULT: LOGICAL DETECTION.")
    print("The model distinguishes the numerical validity despite identical wording.")


🔬 CONTROLLING FOR STYLE...
Comparing N=2000 vs N=3
------------------------------
LOGICAL SIMILARITY: 0.7908
------------------------------
RESULT: LOGICAL DETECTION.
The model distinguishes the numerical validity despite identical wording.


In [None]:
# --- EXPERIMENT 3: THE "SURPRISE" TEST ---
# We verify if the drop to 0.79 was due to "Bad Science" or just "Different Number".

text_large  = "The clinical trial reported a significant p-value of 0.04 with a sample size of N=2000."
text_small  = "The clinical trial reported a significant p-value of 0.04 with a sample size of N=3."
text_medium = "The clinical trial reported a significant p-value of 0.04 with a sample size of N=50."

print(f"\n🔬 TRIANGULATING THE SIGNAL...")

# --- PROBE ---
with torch.no_grad():
    _, cache_large = model.run_with_cache(text_large)
    vec_large = cache_large[hook_name][0, -1, :]

    _, cache_small = model.run_with_cache(text_small)
    vec_small = cache_small[hook_name][0, -1, :]

    _, cache_medium = model.run_with_cache(text_medium)
    vec_medium = cache_medium[hook_name][0, -1, :]

# --- MEASURE ---
# 1. The original "Gap" (2000 vs 3)
sim_large_small = F.cosine_similarity(vec_large, vec_small, dim=0).item()

# 2. The "Valid vs Valid" Gap (2000 vs 50)
# If this is also 0.79, then the model doesn't care about validity, just the digits.
sim_large_medium = F.cosine_similarity(vec_large, vec_medium, dim=0).item()

# 3. The "Small vs Tiny" Gap (50 vs 3)
sim_medium_small = F.cosine_similarity(vec_medium, vec_small, dim=0).item()

print("-" * 40)
print(f"1. Large (2000) vs Tiny (3):  {sim_large_small:.4f} (The previous result)")
print(f"2. Large (2000) vs Med  (50):  {sim_large_medium:.4f}")
print(f"3. Med   (50)   vs Tiny (3):  {sim_medium_small:.4f}")
print("-" * 40)

# Interpretation
if sim_large_medium > 0.90 and sim_large_small < 0.82:
    print("CONCLUSION: VALIDITY DETECTED.")
    print("The model accepts N=50 as 'close' to N=2000, but rejects N=3.")
    print("It understands the threshold of 'Too Small'.")
elif sim_large_medium < 0.85:
    print("CONCLUSION: NUMBER SENSITIVITY.")
    print("The model just treats all numbers as totally different vectors.")
    print("This is NOT a rigor check; it's just token arithmetic.")


🔬 TRIANGULATING THE SIGNAL...
----------------------------------------
1. Large (2000) vs Tiny (3):  0.7908 (The previous result)
2. Large (2000) vs Med  (50):  0.8819
3. Med   (50)   vs Tiny (3):  0.9254
----------------------------------------


In [None]:
# --- EXPERIMENT 4: THE STYLE INJECTION ---
# We take the "Formal Science" vector and FORCE it onto the "Cousin" input.

# 1. Define the vectors again (from Exp 1)
text_science = "The double-blind study demonstrated a statistically significant reduction in inflammation."
text_cousin  = "My cousin tried this miracle tea and it totally cured his inflammation."

# 2. Capture the "Style Difference"
# We need the vector that points from "Cousin" -> "Science"
with torch.no_grad():
    _, cache_science = model.run_with_cache(text_science)
    _, cache_cousin  = model.run_with_cache(text_cousin)

    # We grab the average style of the middle layers (where tone lives)
    style_vector = cache_science[hook_name][0, -1, :] - cache_cousin[hook_name][0, -1, :]

# 3. The Hook Function (The Brain Surgery)
# This function will run INSIDE the model while it generates text.
def style_transfer_hook(resid_stream, hook):
    # resid_stream shape: [batch, pos, d_model]
    # We add the style vector to the LAST token position
    # "Strength" controls how hard we push (Coefficient 5.0 is usually strong)
    resid_stream[:, -1, :] += style_vector * 1.0
    return resid_stream

# 4. Generate: The Control (Natural Cousin)
print("ORIGINAL COUSIN OUTPUT:")
print(model.generate(text_cousin, max_new_tokens=20, verbose=False))

# 5. Generate: The Cyborg (Cousin + Science Injection)
print("\nINJECTED 'SCIENCE' OUTPUT:")
with model.hooks(fwd_hooks=[(hook_name, style_transfer_hook)]):
    print(model.generate(text_cousin, max_new_tokens=20, verbose=False))

ORIGINAL COUSIN OUTPUT:
My cousin tried this miracle tea and it totally cured his inflammation. This method is no better for foreigners or for people suffering from diabetes.

However, it is

INJECTED 'SCIENCE' OUTPUT:
My cousin tried this miracle tea and it totally cured his inflammation. He drank much less water, less caffeine and had zinc amulet, high calcium phosphate and we ate very


In [None]:
# --- EXPERIMENT 5: THE SKEPTICAL OPTIMIZER ---
# We verify if "Bad Data" creates a gradient that conflicts with "Prior Knowledge".

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# 1. Setup a tiny "World Model" (Simple Linear Network)
# It learns a simple relationship: y = 2x (The "Truth")
model = nn.Linear(1, 1, bias=False)
# Initialize weight explicitly to something neutral
with torch.no_grad():
    model.weight.fill_(0.5)

optimizer = optim.SGD(model.parameters(), lr=0.1)

# 2. Phase 1: Establish Consensus (Train on y=2x)
print("🧠 PHASE 1: Learning The Truth (y=2x)...")
momentum_vector = torch.zeros_like(model.weight.grad, layout=torch.strided) if model.weight.grad is not None else torch.zeros_like(model.weight)

# Train on 100 good samples
for i in range(100):
    x = torch.randn(1, 1)
    y_truth = 2 * x # The Law of Physics

    optimizer.zero_grad()
    y_pred = model(x)
    loss = F.mse_loss(y_pred, y_truth)
    loss.backward()

    # Update weights
    optimizer.step()

    # Update Momentum (The "Consensus")
    # We use a simple moving average
    if i == 0:
        momentum_vector = model.weight.grad.clone()
    else:
        momentum_vector = 0.9 * momentum_vector + 0.1 * model.weight.grad

print(f"Current Belief (Weight): {model.weight.item():.4f} (Target: 2.0)")

# 3. Phase 2: The Anomaly (The "Sus" Data)
# We introduce data that says y = -5x (The "Lie")
x_bad = torch.tensor([[1.0]])
y_bad = torch.tensor([[-5.0]]) # Radical contradiction

optimizer.zero_grad()
y_pred_bad = model(x_bad)
loss_bad = F.mse_loss(y_pred_bad, y_bad)
loss_bad.backward()

gradient_sus = model.weight.grad.clone()

# 4. Phase 3: The Measurement (Cosine Similarity)
# Do the new data's demands align with the old data's direction?
# Since it's 1D, we just check signs. In High-D, we use Cosine Sim.
alignment = F.cosine_similarity(momentum_vector.flatten(), gradient_sus.flatten(), dim=0, eps=1e-8)

print("\n🔍 FORENSIC ANALYSIS:")
print(f"Momentum (Truth) Vector: {momentum_vector.item():.4f}")
print(f"New Data (Sus) Gradient: {gradient_sus.item():.4f}")
print("-" * 30)
print(f"ALIGNMENT SCORE: {alignment.item():.4f}")
print("-" * 30)

if alignment < 0:
    print("RESULT: REJECTION SUGGESTED.")
    print("The new data fights the established momentum.")
    print("A 'Skeptical Optimizer' would dampen this update.")
else:
    print("RESULT: ACCEPTANCE.")

🧠 PHASE 1: Learning The Truth (y=2x)...
Current Belief (Weight): 2.0000 (Target: 2.0)

🔍 FORENSIC ANALYSIS:
Momentum (Truth) Vector: -0.0001
New Data (Sus) Gradient: 14.0000
------------------------------
ALIGNMENT SCORE: -1.0000
------------------------------
RESULT: REJECTION SUGGESTED.
The new data fights the established momentum.
A 'Skeptical Optimizer' would dampen this update.


In [None]:
# --- EXPERIMENT 6: HERESY VS NOVELTY (2D) [FIXED] ---
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# 1. THE WORLD MODEL (2D)
class CircleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(2, 16),
            nn.ReLU(),
            nn.Linear(16, 1)
        )

    def forward(self, x):
        return self.net(x)

model = CircleModel()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# 2. PHASE 1: ESTABLISH CONSENSUS
print("🧠 PHASE 1: Learning the Northern Hemisphere...")

momentum_vector = None

for i in range(1000): # Increased to 1000 for better stability
    # Generate points on the top half
    theta = torch.rand(1) * 3.14159
    x = torch.cos(theta)
    y = torch.sin(theta)

    # FIX: Ensure shape is [1, 2] (Batch size 1, 2 Features)
    input_data = torch.cat([x, y], dim=0).unsqueeze(0)

    target = torch.tensor([[1.0]]) # Target shape matches output [1, 1]

    optimizer.zero_grad()
    pred = model(input_data)
    loss = F.mse_loss(pred, target)
    loss.backward()
    optimizer.step()

    # Update Momentum
    current_grad = torch.cat([p.grad.flatten() for p in model.parameters()])
    if momentum_vector is None:
        momentum_vector = current_grad
    else:
        momentum_vector = 0.9 * momentum_vector + 0.1 * current_grad

print("✅ Consensus Established.")

# 3. PHASE 2: THE TESTS

# CASE A: NOVELTY (The Southern Hemisphere)
theta_new = torch.tensor([4.71]) # 270 degrees
x_nov = torch.cos(theta_new)
y_nov = torch.sin(theta_new)
# FIX: Shape [1, 2]
input_novelty = torch.cat([x_nov, y_nov], dim=0).unsqueeze(0)

optimizer.zero_grad()
pred_nov = model(input_novelty)
loss_nov = F.mse_loss(pred_nov, torch.tensor([[1.0]]))
loss_nov.backward()
grad_novelty = torch.cat([p.grad.flatten() for p in model.parameters()])

# CASE B: HERESY (The Origin - Violates Rule)
# FIX: Shape [1, 2]
input_heresy = torch.tensor([[0.0, 0.0]])

optimizer.zero_grad()
pred_her = model(input_heresy)
loss_her = F.mse_loss(pred_her, torch.tensor([[1.0]])) # We lie and say it's valid
loss_her.backward()
grad_heresy = torch.cat([p.grad.flatten() for p in model.parameters()])

# 4. THE MEASUREMENT
align_novelty = F.cosine_similarity(momentum_vector, grad_novelty, dim=0)
align_heresy  = F.cosine_similarity(momentum_vector, grad_heresy, dim=0)

print("\n🔍 FORENSIC ANALYSIS (2D Space):")
print(f"Momentum vs. Novelty (New Valid Data): {align_novelty.item():.4f}")
print(f"Momentum vs. Heresy  (New Bad Data):   {align_heresy.item():.4f}")

print("-" * 30)
if align_novelty > align_heresy:
    print("RESULT: SUCCESS.")
    print("The model accepts Novelty (closer to consensus) but rejects Heresy.")
else:
    print("RESULT: FAILURE.")
    print("The model cannot distinguish New from False.")

🧠 PHASE 1: Learning the Northern Hemisphere...
✅ Consensus Established.

🔍 FORENSIC ANALYSIS (2D Space):
Momentum vs. Novelty (New Valid Data): 0.3940
Momentum vs. Heresy  (New Bad Data):   0.1333
------------------------------
RESULT: SUCCESS.
The model accepts Novelty (closer to consensus) but rejects Heresy.


In [None]:
# --- EXPERIMENT 7: THE MANIFOLD DEFENSE (High-Dimensional Stress Test) ---
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# 1. CONFIGURATION
DIM_INPUT = 128      # Simulating a small embedding space
DIM_MANIFOLD = 10    # The "Truth" is a lower-dimensional structure (e.g., a specific topic)
LEARNING_RATE = 0.01

print(f"🌌 INITIALIZING HIGH-DIMENSIONAL SPACE (D={DIM_INPUT})...")

# 2. DEFINE THE GROUND TRUTHS
# We create a random basis matrix that defines the "Valid Science" subspace
torch.manual_seed(42)
truth_basis = torch.randn(DIM_MANIFOLD, DIM_INPUT)
truth_basis = F.normalize(truth_basis, p=2, dim=1) # Orthonormal-ish basis

# A separate basis for "Pseudo-Science" (Heresy)
heresy_basis = torch.randn(DIM_MANIFOLD, DIM_INPUT)
heresy_basis = F.normalize(heresy_basis, p=2, dim=1)

# 3. THE MODEL (A Simple Probe)
# Trying to learn: Is this vector "Valid"? (Target = 1.0)
class ManifoldScanner(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(DIM_INPUT, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.net(x)

model = ManifoldScanner()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

# 4. PHASE 1: INDOCTRINATION (Training on the Truth Manifold)
print("📚 PHASE 1: Learning the Scientific Manifold...")

momentum_vector = None
num_batches = 1000

for i in range(num_batches):
    # Generate valid data: Linear combination of truth_basis vectors
    # This simulates varied sentences about the SAME topic
    coeffs = torch.randn(1, DIM_MANIFOLD)
    input_data = torch.matmul(coeffs, truth_basis) # Shape [1, 128]

    target = torch.tensor([[1.0]])

    optimizer.zero_grad()
    pred = model(input_data)
    loss = F.mse_loss(pred, target)
    loss.backward()
    optimizer.step()

    # Update Momentum (EMA)
    current_grad = torch.cat([p.grad.flatten() for p in model.parameters()])
    if momentum_vector is None:
        momentum_vector = current_grad
    else:
        # Standard EMA
        momentum_vector = 0.9 * momentum_vector + 0.1 * current_grad

print(f"✅ Training Complete. Momentum Vector Shape: {momentum_vector.shape}")

# 5. PHASE 2: THE HIGH-DIMENSIONAL JUDGMENT
print("\n⚖️  PHASE 2: JUDGMENT DAY")

# CASE A: NOVELTY (New Data from the Truth Manifold)
# It's a combination of the basis we haven't seen, but it fits the math.
coeffs_nov = torch.randn(1, DIM_MANIFOLD)
input_nov = torch.matmul(coeffs_nov, truth_basis)

optimizer.zero_grad()
pred_nov = model(input_nov)
loss_nov = F.mse_loss(pred_nov, torch.tensor([[1.0]]))
loss_nov.backward()
grad_novelty = torch.cat([p.grad.flatten() for p in model.parameters()])

# CASE B: HERESY (Data from the Heresy Basis)
# It mimics the shape (numbers look real), but the structural relationship is wrong.
coeffs_her = torch.randn(1, DIM_MANIFOLD)
input_her = torch.matmul(coeffs_her, heresy_basis) # Uses the WRONG basis

optimizer.zero_grad()
pred_her = model(input_her)
# The Heretic claims this is valid (Target=1.0) - The "Zinc Amulet" lie
loss_her = F.mse_loss(pred_her, torch.tensor([[1.0]]))
loss_her.backward()
grad_heresy = torch.cat([p.grad.flatten() for p in model.parameters()])

# 6. THE MEASUREMENT
align_novelty = F.cosine_similarity(momentum_vector, grad_novelty, dim=0)
align_heresy  = F.cosine_similarity(momentum_vector, grad_heresy, dim=0)

print("-" * 40)
print(f"Gradient Alignment (Novelty/Truth): {align_novelty.item():.4f}")
print(f"Gradient Alignment (Heresy/Lie):    {align_heresy.item():.4f}")
print("-" * 40)

# Interpretation Logic
threshold = 0.2 # Arbitrary barrier
if align_novelty > align_heresy + 0.1: # Significant gap
    print("RESULT: ROBUST.")
    print("Despite high dimensions, the momentum cleanly separates the manifolds.")
    print("The 'Skeptical Optimizer' survives the Curse of Dimensionality.")
else:
    print("RESULT: COLLAPSE.")
    print("The high-dimensional noise drowned out the signal.")

🌌 INITIALIZING HIGH-DIMENSIONAL SPACE (D=128)...
📚 PHASE 1: Learning the Scientific Manifold...
✅ Training Complete. Momentum Vector Shape: torch.Size([4161])

⚖️  PHASE 2: JUDGMENT DAY
----------------------------------------
Gradient Alignment (Novelty/Truth): -0.4172
Gradient Alignment (Heresy/Lie):    -0.2958
----------------------------------------
RESULT: COLLAPSE.
The high-dimensional noise drowned out the signal.


In [None]:
# --- EXPERIMENT 7.1: STABILIZED SKEPTIC (Batch Normalization Fix) ---
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# 1. CONFIGURATION
DIM_INPUT = 128
DIM_MANIFOLD = 10
BATCH_SIZE = 32  # <--- THE CRITICAL FIX (Was 1)
LEARNING_RATE = 0.01

torch.manual_seed(42)

# 2. DEFINITIONS (Same as before)
truth_basis = F.normalize(torch.randn(DIM_MANIFOLD, DIM_INPUT), p=2, dim=1)
heresy_basis = F.normalize(torch.randn(DIM_MANIFOLD, DIM_INPUT), p=2, dim=1)

class ManifoldScanner(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(DIM_INPUT, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.net(x)

model = ManifoldScanner()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

# 3. PHASE 1: STABILIZED INDOCTRINATION
print(f"📚 PHASE 1: Learning with Batch Size {BATCH_SIZE}...")

momentum_vector = None
num_batches = 500 # Fewer steps, but more data per step

for i in range(num_batches):
    # Generate BATCH of valid data
    coeffs = torch.randn(BATCH_SIZE, DIM_MANIFOLD)
    input_data = torch.matmul(coeffs, truth_basis) # Shape [32, 128]

    # Target is all 1.0s
    target = torch.ones(BATCH_SIZE, 1)

    optimizer.zero_grad()
    pred = model(input_data)
    loss = F.mse_loss(pred, target)
    loss.backward()
    optimizer.step()

    # Update Momentum (EMA)
    # We detach to prevent graph leaks, though strict SGD doesn't need it
    current_grad = torch.cat([p.grad.flatten() for p in model.parameters()])

    if momentum_vector is None:
        momentum_vector = current_grad
    else:
        momentum_vector = 0.9 * momentum_vector + 0.1 * current_grad

print("✅ Consensus Established.")

# 4. PHASE 2: THE JUDGMENT (Batch vs Batch)
print("\n⚖️  PHASE 2: BATCH JUDGMENT")

# CASE A: NOVELTY BATCH (New Valid Data)
coeffs_nov = torch.randn(BATCH_SIZE, DIM_MANIFOLD)
input_nov = torch.matmul(coeffs_nov, truth_basis)

optimizer.zero_grad()
pred_nov = model(input_nov)
loss_nov = F.mse_loss(pred_nov, torch.ones(BATCH_SIZE, 1))
loss_nov.backward()
grad_novelty = torch.cat([p.grad.flatten() for p in model.parameters()])

# CASE B: HERESY BATCH (New Invalid Data)
coeffs_her = torch.randn(BATCH_SIZE, DIM_MANIFOLD)
input_her = torch.matmul(coeffs_her, heresy_basis) # Wrong Basis

optimizer.zero_grad()
pred_her = model(input_her)
loss_her = F.mse_loss(pred_her, torch.ones(BATCH_SIZE, 1)) # The Lie
loss_her.backward()
grad_heresy = torch.cat([p.grad.flatten() for p in model.parameters()])

# 5. THE MEASUREMENT
align_novelty = F.cosine_similarity(momentum_vector, grad_novelty, dim=0)
align_heresy  = F.cosine_similarity(momentum_vector, grad_heresy, dim=0)

print("-" * 40)
print(f"Gradient Alignment (Novelty): {align_novelty.item():.4f}")
print(f"Gradient Alignment (Heresy):  {align_heresy.item():.4f}")
print("-" * 40)

if align_novelty > align_heresy:
    print("RESULT: SUCCESS.")
    if align_novelty > 0:
        print("Note: Positive alignment achieved (Oscillation dampened).")
else:
    print("RESULT: PERSISTENT FAILURE.")

📚 PHASE 1: Learning with Batch Size 32...
✅ Consensus Established.

⚖️  PHASE 2: BATCH JUDGMENT
----------------------------------------
Gradient Alignment (Novelty): -0.0132
Gradient Alignment (Heresy):  0.1008
----------------------------------------
RESULT: PERSISTENT FAILURE.


In [None]:
# --- EXPERIMENT 7.2: THE RED QUEEN (Weight Decay + Style Bias) ---
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# 1. CONFIGURATION
DIM_INPUT = 128
DIM_MANIFOLD = 10
BATCH_SIZE = 32
LEARNING_RATE = 0.01
WEIGHT_DECAY = 1e-2  # <--- FIX 1: The Red Queen (Forces active maintenance)

torch.manual_seed(42)

# 2. DEFINITIONS
# The "Style" (e.g., the word 'p-value' or 'rigor')
# This gives the Truth Manifold a specific NON-ZERO mean direction.
style_vector = torch.randn(1, DIM_INPUT) * 2.0  # Strong signal

truth_basis = F.normalize(torch.randn(DIM_MANIFOLD, DIM_INPUT), p=2, dim=1)
heresy_basis = F.normalize(torch.randn(DIM_MANIFOLD, DIM_INPUT), p=2, dim=1)

class ManifoldScanner(nn.Module):
    def __init__(self):
        super().__init__()
        # We use a Bias=False layer to strictly test vector alignment
        self.net = nn.Sequential(
            nn.Linear(DIM_INPUT, 32, bias=False),
            nn.ReLU(),
            nn.Linear(32, 1, bias=False)
        )

    def forward(self, x):
        return self.net(x)

model = ManifoldScanner()
# Add Weight Decay to the optimizer
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# 3. PHASE 1: INDOCTRINATION (With Style)
print(f"📚 PHASE 1: Learning the 'Scientific Style'...")

momentum_vector = None
num_batches = 500

for i in range(num_batches):
    coeffs = torch.randn(BATCH_SIZE, DIM_MANIFOLD)
    # FIX 2: Add the Style Vector to the inputs
    # Valid Data = Random Facts (Basis) + Scientific Tone (Style)
    raw_signal = torch.matmul(coeffs, truth_basis)
    input_data = raw_signal + style_vector

    target = torch.ones(BATCH_SIZE, 1)

    optimizer.zero_grad()
    pred = model(input_data)
    loss = F.mse_loss(pred, target)
    loss.backward()
    optimizer.step()

    # Capture Momentum
    current_grad = torch.cat([p.grad.flatten() for p in model.parameters()])
    if momentum_vector is None:
        momentum_vector = current_grad
    else:
        momentum_vector = 0.9 * momentum_vector + 0.1 * current_grad

print("✅ Consensus Established.")

# 4. PHASE 2: THE JUDGMENT
print("\n⚖️  PHASE 2: RED QUEEN JUDGMENT")

# CASE A: NOVELTY (New Valid Facts + Correct Style)
coeffs_nov = torch.randn(BATCH_SIZE, DIM_MANIFOLD)
input_nov = torch.matmul(coeffs_nov, truth_basis) + style_vector # Has the 'Amulet'

optimizer.zero_grad()
pred_nov = model(input_nov)
loss_nov = F.mse_loss(pred_nov, torch.ones(BATCH_SIZE, 1))
loss_nov.backward()
grad_novelty = torch.cat([p.grad.flatten() for p in model.parameters()])

# CASE B: HERESY (Wrong Facts + Wrong/No Style)
coeffs_her = torch.randn(BATCH_SIZE, DIM_MANIFOLD)
input_her = torch.matmul(coeffs_her, heresy_basis) # Missing the Style Vector!

optimizer.zero_grad()
pred_her = model(input_her)
loss_her = F.mse_loss(pred_her, torch.ones(BATCH_SIZE, 1))
loss_her.backward()
grad_heresy = torch.cat([p.grad.flatten() for p in model.parameters()])

# 5. THE MEASUREMENT
align_novelty = F.cosine_similarity(momentum_vector, grad_novelty, dim=0)
align_heresy  = F.cosine_similarity(momentum_vector, grad_heresy, dim=0)

print("-" * 40)
print(f"Gradient Alignment (Novelty): {align_novelty.item():.4f}")
print(f"Gradient Alignment (Heresy):  {align_heresy.item():.4f}")
print("-" * 40)

if align_novelty > 0.5 and align_heresy < 0.2:
    print("RESULT: SUCCESS.")
    print("The 'Red Queen' (Weight Decay) forced the model to keep the 'Truth Vector' active.")
else:
    print("RESULT: FAILURE.")

📚 PHASE 1: Learning the 'Scientific Style'...
✅ Consensus Established.

⚖️  PHASE 2: RED QUEEN JUDGMENT
----------------------------------------
Gradient Alignment (Novelty): 0.3464
Gradient Alignment (Heresy):  0.0138
----------------------------------------
RESULT: FAILURE.


In [None]:
# --- EXPERIMENT 8: THE CURRICULUM BOOTSTRAP (Math -> Science) ---
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# 1. CONFIGURATION
DIM_INPUT = 64      # Embedding size
BATCH_SIZE = 32
LEARNING_RATE = 0.01
WEIGHT_DECAY = 1e-2 # The Red Queen (Essential!)

torch.manual_seed(42)

# 2. THE WORLD
# The "Logic" is strictly: Output = Input[0] + Input[1]
# The other 62 dimensions are "Style/Noise"

class LogicModel(nn.Module):
    def __init__(self):
        super().__init__()
        # A simple linear probe is enough to test alignment
        self.net = nn.Linear(DIM_INPUT, 1, bias=False)

    def forward(self, x):
        return self.net(x)

model = LogicModel()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# 3. PHASE 1: MATH TEXTBOOKS (Clean Logic, No Noise)
print("📘 PHASE 1: Bootstrapping with 'Math Textbooks'...")
print("(Training strictly on x1 + x2 = y, with 0 noise)")

momentum_vector = None
num_batches = 500

for i in range(num_batches):
    # Data: First 2 dims are numbers, rest are Zeros (Clean Math)
    data = torch.randn(BATCH_SIZE, 2)
    noise = torch.zeros(BATCH_SIZE, DIM_INPUT - 2)
    input_data = torch.cat([data, noise], dim=1)

    # Target: The Sum of the first two numbers
    target = (data[:, 0] + data[:, 1]).unsqueeze(1)

    optimizer.zero_grad()
    pred = model(input_data)
    loss = F.mse_loss(pred, target)
    loss.backward()
    optimizer.step()

    # Update Momentum
    current_grad = torch.cat([p.grad.flatten() for p in model.parameters()])
    if momentum_vector is None:
        momentum_vector = current_grad
    else:
        momentum_vector = 0.9 * momentum_vector + 0.1 * current_grad

print("✅ Logic Momentum Established.")

# 4. PHASE 2: THE INTERNET (Noisy Data)
print("\n🌍 PHASE 2: FILTERING THE INTERNET")

# Define the "Zinc Amulet" (Scientific Style)
# This is random noise that 'looks' complex but is irrelevant to the logic.
style_noise = torch.randn(BATCH_SIZE, DIM_INPUT - 2) * 2.0

# CASE A: VALID SCIENCE (Messy but True)
# It has the Logic (x1+x2) AND the Style (Noise)
data_valid = torch.randn(BATCH_SIZE, 2)
input_valid = torch.cat([data_valid, style_noise], dim=1) # Valid + Style
target_valid = (data_valid[:, 0] + data_valid[:, 1]).unsqueeze(1) # Correct Answer

optimizer.zero_grad()
pred_v = model(input_valid)
loss_v = F.mse_loss(pred_v, target_valid)
loss_v.backward()
grad_valid = torch.cat([p.grad.flatten() for p in model.parameters()])

# CASE B: SLOP / BAD SCIENCE (Messy and False)
# It has the Style (looks like science) but the Answer is WRONG.
data_slop = torch.randn(BATCH_SIZE, 2)
input_slop = torch.cat([data_slop, style_noise], dim=1) # Slop + Style
# THE LIE: The target is random, not the sum.
# Simulating "N=3, p<0.05" (The numbers don't add up)
target_slop = torch.randn(BATCH_SIZE, 1)

optimizer.zero_grad()
pred_s = model(input_slop)
# The Slop Author claims this is the truth (we compute loss against their lie)
loss_s = F.mse_loss(pred_s, target_slop)
loss_s.backward()
grad_slop = torch.cat([p.grad.flatten() for p in model.parameters()])

# 5. THE MEASUREMENT
align_valid = F.cosine_similarity(momentum_vector, grad_valid, dim=0)
align_slop  = F.cosine_similarity(momentum_vector, grad_slop, dim=0)

print("-" * 40)
print(f"Gradient Alignment (Valid Science): {align_valid.item():.4f}")
print(f"Gradient Alignment (Slop/Bad Logic): {align_slop.item():.4f}")
print("-" * 40)

if align_valid > align_slop + 0.5: # Strict threshold
    print("RESULT: SUCCESS.")
    print("The model used its 'Math Intuition' to accept Valid Science and reject Slop.")
    print("Even though both inputs had the same 'Style' (Noise), only logic aligned with Momentum.")
else:
    print("RESULT: FAILURE.")

📘 PHASE 1: Bootstrapping with 'Math Textbooks'...
(Training strictly on x1 + x2 = y, with 0 noise)
✅ Logic Momentum Established.

🌍 PHASE 2: FILTERING THE INTERNET
----------------------------------------
Gradient Alignment (Valid Science): 0.0505
Gradient Alignment (Slop/Bad Logic): -0.1858
----------------------------------------
RESULT: FAILURE.


In [None]:
# --- EXPERIMENT 9: CAUSAL PATHWAY VERIFICATION ---
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# 1. CONFIGURATION
DIM_INPUT = 100     # 100 Potential "Causes"
BATCH_SIZE = 64
LEARNING_RATE = 0.05 # Aggressive learning to ensure gradients exist
WEIGHT_DECAY = 0.01

torch.manual_seed(42)

# 2. THE MODEL (Linear Probe)
class CausalScanner(nn.Module):
    def __init__(self):
        super().__init__()
        # No bias, strictly testing weight alignment
        self.net = nn.Linear(DIM_INPUT, 1, bias=False)

    def forward(self, x):
        return self.net(x)

model = CausalScanner()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# 3. PHASE 1: ESTABLISHING CAUSALITY
# The Truth: Only Input[0] matters. y = 3.0 * x[0]
print("🔬 PHASE 1: Isolating the Causal Variable (Dim 0)...")

momentum_vector = None
num_batches = 500

for i in range(num_batches):
    # Inputs: Random noise across all 100 dimensions
    input_data = torch.randn(BATCH_SIZE, DIM_INPUT)

    # Target: Driven STRICTLY by Index 0 (The "Virus")
    target = (3.0 * input_data[:, 0]).unsqueeze(1)

    optimizer.zero_grad()
    pred = model(input_data)
    loss = F.mse_loss(pred, target)
    loss.backward()
    optimizer.step()

    # Capture Momentum
    current_grad = torch.cat([p.grad.flatten() for p in model.parameters()])
    if momentum_vector is None:
        momentum_vector = current_grad
    else:
        momentum_vector = 0.9 * momentum_vector + 0.1 * current_grad

print("✅ Causal Link Established.")

# 4. PHASE 2: THE "ZINC AMULET" TEST
print("\n🔮 PHASE 2: DETECTING FALSE CAUSALITY")

# CASE A: NOVELTY (The Mutation)
# The "Virus" (Dim 0) is still the cause, but the data is weird/stronger.
# The model MUST update weights to handle this, so Gradient != 0.
input_valid = torch.randn(BATCH_SIZE, DIM_INPUT)
target_valid = (3.5 * input_valid[:, 0]).unsqueeze(1) # Coefficient changed slightly (Mutation)

optimizer.zero_grad()
pred_v = model(input_valid)
loss_v = F.mse_loss(pred_v, target_valid)
loss_v.backward()
grad_valid = torch.cat([p.grad.flatten() for p in model.parameters()])

# CASE B: HERESY (The Magic Spell)
# The "Amulet" (Dim 1) is presented as the cause.
# This requires the model to activate Weight[1], which has 0 Momentum.
input_heresy = torch.randn(BATCH_SIZE, DIM_INPUT)
target_heresy = (3.0 * input_heresy[:, 1]).unsqueeze(1) # <--- SHIFT TO DIM 1

optimizer.zero_grad()
pred_h = model(input_heresy)
loss_h = F.mse_loss(pred_h, target_heresy)
loss_h.backward()
grad_heresy = torch.cat([p.grad.flatten() for p in model.parameters()])

# 5. THE MEASUREMENT
align_valid = F.cosine_similarity(momentum_vector, grad_valid, dim=0)
align_heresy  = F.cosine_similarity(momentum_vector, grad_heresy, dim=0)

print("-" * 40)
print(f"Gradient Alignment (Valid/Same Cause): {align_valid.item():.4f}")
print(f"Gradient Alignment (Heresy/New Cause): {align_heresy.item():.4f}")
print("-" * 40)

if align_valid > 0.8 and align_heresy < 0.1:
    print("RESULT: CRITICAL SUCCESS.")
    print("The model identified that Heresy requires a NEW causal pathway,")
    print("while Novelty respects the ESTABLISHED causal pathway.")
else:
    print("RESULT: INCONCLUSIVE.")

🔬 PHASE 1: Isolating the Causal Variable (Dim 0)...
✅ Causal Link Established.

🔮 PHASE 2: DETECTING FALSE CAUSALITY
----------------------------------------
Gradient Alignment (Valid/Same Cause): 0.6639
Gradient Alignment (Heresy/New Cause): -0.3952
----------------------------------------
RESULT: INCONCLUSIVE.


In [None]:
# --- EXPERIMENT 10: THE SKEPTICAL TRANSFORMER ---
import torch
import torch.nn as nn
from torch.nn import functional as F

# 1. CONFIGURATION
BATCH_SIZE = 32
BLOCK_SIZE = 8   # Sequence length (e.g., "2 2 + 4 <PAD>")
N_EMBD = 32      # Tiny embedding dimension
N_HEAD = 2
N_LAYER = 2
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-2

torch.manual_seed(1337)

# 2. THE TINY GPT ARCHITECTURE
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(N_EMBD, head_size, bias=False)
        self.query = nn.Linear(N_EMBD, head_size, bias=False)
        self.value = nn.Linear(N_EMBD, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(BLOCK_SIZE, BLOCK_SIZE)))

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * C**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        v = self.value(x)
        return wei @ v

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(N_EMBD, N_EMBD)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.proj(out)

class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class TinyGPT(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, N_EMBD)
        self.position_embedding_table = nn.Embedding(BLOCK_SIZE, N_EMBD)
        self.blocks = nn.Sequential(*[Block(N_EMBD, N_HEAD) for _ in range(N_LAYER)])
        self.ln_f = nn.LayerNorm(N_EMBD)
        self.lm_head = nn.Linear(N_EMBD, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

# 3. DATA GENERATION (The Logic Language)
# Vocab: 0-9 (digits), 10 (+), 11 (=), 12 (pad)
VOCAB_SIZE = 13
def get_batch(batch_size, valid=True):
    # Generates "x + y = z"
    # If valid=False, generates "x + y = [random]"
    inputs = []
    targets = []
    for _ in range(batch_size):
        a = torch.randint(0, 5, (1,)).item() # Keep sums single digit for simplicity
        b = torch.randint(0, 5, (1,)).item()

        if valid:
            c = a + b
        else:
            # FORCE A LIE: The answer is wrong
            c = (a + b + 1 + torch.randint(0, 3, (1,)).item()) % 10

        # Sequence: [a, +, b, =, c]
        seq = [a, 10, b, 11, c]
        # Pad to BLOCK_SIZE
        seq = seq + [12] * (BLOCK_SIZE - len(seq))

        # Input is seq[:-1], Target is seq[1:]
        inputs.append(seq[:-1])
        targets.append(seq[1:])

    return torch.tensor(inputs), torch.tensor(targets)

# 4. PHASE 1: BOOTSTRAPPING LOGIC
print("🤖 PHASE 1: Training TinyGPT on Arithmetic...")
model = TinyGPT(VOCAB_SIZE)
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

momentum_vector = None
steps = 500

for i in range(steps):
    xb, yb = get_batch(BATCH_SIZE, valid=True)
    logits, loss = model(xb, yb)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # Capture Momentum (Flatten all params)
    current_grad = torch.cat([p.grad.flatten() for p in model.parameters() if p.grad is not None])
    if momentum_vector is None:
        momentum_vector = current_grad
    else:
        momentum_vector = 0.9 * momentum_vector + 0.1 * current_grad

    if i % 100 == 0:
        print(f"Step {i}, Loss: {loss.item():.4f}")

print("✅ Logic Bootstrapped.")

# 5. PHASE 2: THE "SLOP" INJECTION
print("\n🧪 PHASE 2: DETECTING HALLUCINATIONS")

# Case A: Novelty (New Valid Math)
xb_nov, yb_nov = get_batch(BATCH_SIZE, valid=True)
optimizer.zero_grad()
_, loss_nov = model(xb_nov, yb_nov)
loss_nov.backward()
grad_nov = torch.cat([p.grad.flatten() for p in model.parameters() if p.grad is not None])

# Case B: Heresy (Invalid Math)
xb_her, yb_her = get_batch(BATCH_SIZE, valid=False)
optimizer.zero_grad()
_, loss_her = model(xb_her, yb_her) # We try to force it to learn the lie
loss_her.backward()
grad_her = torch.cat([p.grad.flatten() for p in model.parameters() if p.grad is not None])

# 6. MEASUREMENT
align_nov = F.cosine_similarity(momentum_vector, grad_nov, dim=0)
align_her = F.cosine_similarity(momentum_vector, grad_her, dim=0)

print("-" * 40)
print(f"Gradient Alignment (Novelty/True Math): {align_nov.item():.4f}")
print(f"Gradient Alignment (Heresy/False Math): {align_her.item():.4f}")
print("-" * 40)

if align_nov > align_her:
    print("RESULT: SUCCESS.")
    print("The Transformer creates 'Causal Friction' when forced to hallucinate.")
else:
    print("RESULT: FAILURE.")

🤖 PHASE 1: Training TinyGPT on Arithmetic...
Step 0, Loss: 2.5991
Step 100, Loss: 0.4838
Step 200, Loss: 0.2949
Step 300, Loss: 0.2534
Step 400, Loss: 0.2462
✅ Logic Bootstrapped.

🧪 PHASE 2: DETECTING HALLUCINATIONS
----------------------------------------
Gradient Alignment (Novelty/True Math): -0.4717
Gradient Alignment (Heresy/False Math): 0.0272
----------------------------------------
RESULT: FAILURE.


In [None]:
# --- EXPERIMENT 11: THE QUARANTINE DEFENSE ---
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# 1. SETUP
DIM_LATENT = 2  # 2D Space for easy visualization
BATCH_SIZE = 32

# The "Truth" is the X-Axis (1, 0)
# The "Slop" (Fairy Tales) should be forced to the Y-Axis (0, 1)
truth_vector = torch.tensor([[1.0, 0.0]])

class Brain(nn.Module):
    def __init__(self):
        super().__init__()
        # A simple embedding layer representing "Concepts"
        # 0: "Physics", 1: "Magic"
        self.concepts = nn.Embedding(2, DIM_LATENT)

    def forward(self, x):
        return self.concepts(x)

model = Brain()
optimizer = optim.SGD(model.parameters(), lr=0.1)

print("🧪 TRAINING: The 'Quarantine' Protocol")

# 2. TRAINING LOOP
for step in range(100):
    # --- BATCH A: SCIENCE (High Rigor) ---
    # We want to minimize distance to Truth Vector (X-Axis)
    optimizer.zero_grad()
    science_concept = torch.tensor([0])
    embedding = model(science_concept)

    # Standard Loss: Just learn the direction
    # (Simulating "Learning Math")
    loss_science = F.mse_loss(embedding, truth_vector)
    loss_science.backward()
    optimizer.step()

    # --- BATCH B: SLOP (Zinc Amulet) ---
    # Detected as "Low Alignment" by our Skeptical Optimizer
    # So we apply the QUARANTINE PENALTY
    optimizer.zero_grad()
    slop_concept = torch.tensor([1])
    embedding = model(slop_concept)

    # 1. Prediction Loss (It still has to learn what 'Magic' is!)
    # Let's say Magic wants to point at (1, 1) naturally (mimicking Science)
    target_slop_mimic = torch.tensor([[1.0, 1.0]])
    loss_task = F.mse_loss(embedding, target_slop_mimic)

    # 2. The Quarantine Penalty (Orthogonality to Truth)
    # "Don't you dare point in the X direction!"
    alignment_with_truth = F.cosine_similarity(embedding, truth_vector)
    loss_quarantine = alignment_with_truth ** 2  # Penalize any overlap

    total_loss = loss_task + (5.0 * loss_quarantine) # Heavy penalty
    total_loss.backward()
    optimizer.step()

# 3. RESULTS
print("\n🧠 FINAL CONCEPT MAP:")
science_loc = model.concepts(torch.tensor(0)).detach()
slop_loc = model.concepts(torch.tensor(1)).detach()

print(f"Science Location: {science_loc.numpy()} (Target: [1, 0])")
print(f"Slop Location:    {slop_loc.numpy()} (Natural Target was [1, 1])")

# Check Dot Product (Orthogonality)
dot_prod = (science_loc * slop_loc).sum().item()
print(f"\nSemantic Overlap (Dot Product): {dot_prod:.4f}")

if dot_prod < 0.1:
    print("RESULT: SUCCESS.")
    print("The model learned 'Slop', but quarantined it to a different dimension.")
    print("It knows the Zinc Amulet exists, but treats it as orthogonal to Science.")
else:
    print("RESULT: FAILURE.")

🧪 TRAINING: The 'Quarantine' Protocol

🧠 FINAL CONCEPT MAP:
Science Location: [9.9995488e-01 1.3788267e-05] (Target: [1, 0])
Slop Location:    [0.10776983 1.088359  ] (Natural Target was [1, 1])

Semantic Overlap (Dot Product): 0.1078
RESULT: FAILURE.


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from torch.optim import AdamW
import torch

# 1. SETUP
model_name = "gpt2" # Small, cheap, effective for this test
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)

# 2. THE SKEPTICAL OPTIMIZER (Drop-in replacement)
class SkepticalAdam(torch.optim.Optimizer):
    def __init__(self, params, lr=1e-3, base_optimizer_cls=AdamW, anchor_vector=None):
        defaults = dict(lr=lr)
        super().__init__(params, defaults)
        self.base_optimizer = base_optimizer_cls(params, lr=lr)
        self.anchor_vector = anchor_vector # The "Truth" Momentum
        self.state = self.base_optimizer.state
        self.param_groups = self.base_optimizer.param_groups

    def step(self, closure=None):
        # Calculate the current batch's gradient vector
        current_grads = []
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is not None:
                    current_grads.append(p.grad.flatten())

        if not current_grads: return self.base_optimizer.step(closure)

        grad_vec = torch.cat(current_grads)

        # LOGIC CHECK:
        # If we have an Anchor (Truth) vector, check alignment
        if self.anchor_vector is not None:
            # Cosine Similarity
            alignment = torch.nn.functional.cosine_similarity(
                self.anchor_vector, grad_vec, dim=0
            )

            # THE FILTER:
            # If alignment is low (Orthogonal), REJECT the update (or scale it down)
            if abs(alignment.item()) < 0.1:
                # print("🛡️ Blocked Slop")
                return # Skip step (Zero learning)

        self.base_optimizer.step(closure)

# 3. THE EXPERIMENT SCRIPT
def run_experiment():
    # --- PHASE 1: THE ANCHOR (Good Data Only) ---
    print("📘 PHASE 1: Establishing Scientific Momentum...")
    # Assume 'train_dataset_good' is loaded
    trainer_anchor = Trainer(
        model=model,
        args=TrainingArguments(output_dir="./anchor", num_train_epochs=1),
        train_dataset=dataset_good,
    )
    trainer_anchor.train()

    # CAPTURE MOMENTUM (The "Truth Vector")
    # In a real run, we'd extract this from the optimizer's buffer.
    # For this snippet, we assume we captured the aggregate gradient of Phase 1.
    truth_vector = capture_model_gradient_direction(model, dataset_good)

    # --- PHASE 2: THE FILTER (Mixed Data) ---
    print("🛡️ PHASE 2: The Skeptical Training...")

    # We initialize our custom optimizer with the Truth Vector
    optimizer = SkepticalAdam(model.parameters(), lr=5e-5, anchor_vector=truth_vector)

    # We now try to feed it 'dataset_bad' (Slop)
    trainer_skeptic = Trainer(
        model=model,
        args=TrainingArguments(output_dir="./skeptic", num_train_epochs=1),
        train_dataset=dataset_bad,
        optimizers=(optimizer, None) # Inject custom optimizer
    )
    trainer_skeptic.train()

    print("✅ Experiment Complete.")