In [13]:
import torch
from nnsight import LanguageModel
import json

# --- Load the MODELING dataset ---
# Make sure you have a file named 'modeling_dataset.json' in the same directory
# or provide the correct full path to the file.
try:
    with open('modeling_dataset.json', 'r') as f:
        modeling_dataset = json.load(f)
except FileNotFoundError:
    print("Error: 'modeling_dataset.json' not found. Please ensure the dataset file is in the correct directory.")
    exit()


# --- Load the model ---
llm = LanguageModel("Qwen/Qwen3-1.7B", device_map="auto")
if llm.tokenizer.pad_token is None:
    print("Tokenizer does not have a pad token, setting it to eos_token.")
    llm.tokenizer.pad_token = llm.tokenizer.eos_token


# --- Select a problem to analyze ---
problem = modeling_dataset['problems'][0]

# --- Prepare the prompts ---
question_index = 0
question = problem['questions'][question_index]
correct_answer = problem['answers'][question_index]

examples = "\n".join(problem['data'])
clean_prompt_text = f"Translate based on the examples:\n{examples}\n\n{question}"

corrupted_example = "Abun: gwes ji\nEnglish: my leg"
corrupted_examples = examples + "\n" + corrupted_example
corrupted_prompt_text = f"Translate based on the examples:\n{corrupted_examples}\n\n{question}"

# --- Tokenize together with padding to ensure identical shapes ---
prompts = [clean_prompt_text, corrupted_prompt_text]
inputs = llm.tokenizer(prompts, return_tensors='pt', padding=True).to(llm.device)

# Separate the tokenized inputs
clean_inputs = {'input_ids': inputs['input_ids'][0:1], 'attention_mask': inputs['attention_mask'][0:1]}
corrupted_inputs = {'input_ids': inputs['input_ids'][1:2], 'attention_mask': inputs['attention_mask'][1:2]}

print("Tokenization successful. Shapes are now consistent:")
print(f"Clean inputs shape: {clean_inputs['input_ids'].shape}")
print(f"Corrupted inputs shape: {corrupted_inputs['input_ids'].shape}")


target_layer = 25
print(f"\n--- Getting activation from Layer {target_layer} using clean prompt ---")

# --- STEP 1: Use llm.trace() and a direct model call to get the activation ---
# This is the most fundamental way to trace execution.
with llm.trace(validate=False) as tracer:
    # Directly call the underlying Hugging Face model's forward pass.
    llm.model(**clean_inputs)
    # Define the save operation.
    clean_activation_proxy = llm.model.layers[target_layer].self_attn.o_proj.output.save()

# Get the concrete tensor value after the run is complete.
clean_activation_value = clean_activation_proxy.value
print(f"Activation saved. Shape: {clean_activation_value.shape}")
print(f"\n--- Patching Layer {target_layer} and running corrupted prompt ---")


# --- STEP 2: Use a new trace context for the patching run ---
with llm.trace(validate=False) as tracer:
    # Get a proxy for the output we want to replace.
    llm.model.layers[target_layer].self_attn.o_proj.output = clean_activation_value

    # Directly call the model's generate method inside the trace.
    output = llm.model.generate(
        **corrupted_inputs,
        max_new_tokens=15,
        pad_token_id=llm.tokenizer.eos_token_id
    ).save()


# --- Analyze the output ---
generated_tokens = output.value[0]
generated_text = llm.tokenizer.decode(generated_tokens, skip_special_tokens=True)

print(f"\nOriginal question: {question}")
print(f"Expected answer: {correct_answer}")
# We need to parse the answer from the full generated text
full_text_of_corrupted_prompt = llm.tokenizer.decode(corrupted_inputs['input_ids'][0], skip_special_tokens=True)
generated_answer = generated_text.split(full_text_of_corrupted_prompt)[-1].strip()
print(f"Generated output after patching: {generated_answer}")

if correct_answer.lower() in generated_answer.lower():
    print(f"\nSUCCESS: Patching layer {target_layer} appears to have restored the correct reasoning!")
else:
    print(f"\nINFO: Patching layer {target_layer} did not restore the correct answer.")


Tokenization successful. Shapes are now consistent:
Clean inputs shape: torch.Size([1, 96])
Corrupted inputs shape: torch.Size([1, 96])

--- Getting activation from Layer 25 using clean prompt ---


RuntimeError: Above exception when execution Node: 'ApplyModuleProtocol_0' in Graph: '134591725275552'

In [1]:
import torch
from transformer_lens import HookedTransformer
import json

# --- Load the MODELING dataset ---
# Make sure you have a file named 'modeling_dataset.json' in the same directory
# or provide the correct full path to the file.
try:
    with open('modeling_dataset.json', 'r') as f:
        modeling_dataset = json.load(f)
except FileNotFoundError:
    print("Error: 'modeling_dataset.json' not found. Please ensure the dataset file is in the correct directory.")
    exit()

# --- Load the model using TransformerLens ---
# This might take some time on the first run as it downloads the model.
# trust_remote_code=True is required for many community models.
print("Loading model with TransformerLens...")
model = HookedTransformer.from_pretrained(
    "Qwen/Qwen2.5-3B-Instruct",
    device="cuda" if torch.cuda.is_available() else "cpu",
    trust_remote_code=True,
    fold_ln=False, # Important for interventions
    center_unembed=False,
    center_writing_weights=False
)
print("Model loaded successfully.")

# --- Select a problem to analyze ---
problem = modeling_dataset['problems'][0]

# --- Prepare the prompts ---
question_index = 0
question = problem['questions'][question_index]
correct_answer = problem['answers'][question_index]

examples = "\n".join(problem['data'])
clean_prompt_text = f"Translate based on the examples:\n{examples}\n\n{question}"

corrupted_example = "Abun: gwes ji\nEnglish: my leg"
corrupted_examples = examples + "\n" + corrupted_example
corrupted_prompt_text = f"Translate based on the examples:\n{corrupted_examples}\n\n{question}"

# --- Tokenize inputs ---
# TransformerLens has its own tokenizer interface
clean_tokens = model.to_tokens(clean_prompt_text)
corrupted_tokens = model.to_tokens(corrupted_prompt_text)

target_layer = 25
# The hook name for the output of the attention block in TransformerLens
# is 'blocks.{layer_index}.attn.hook_z'
hook_name = f"blocks.{target_layer}.attn.hook_z"
print(f"\nTargeting hook: '{hook_name}'")


# --- STEP 1: Get the clean activation ---
# Run the model on the clean prompt and cache all activations
_, clean_cache = model.run_with_cache(clean_tokens)

# Retrieve the specific activation we want to save
clean_activation_value = clean_cache[hook_name]
print(f"Activation saved. Shape: {clean_activation_value.shape}")


# --- STEP 2: Define the patching hook and run with it ---

# This hook function will be called during the forward pass
def patching_hook(activation_at_hook, hook):
    print(f"  Patching hook running at '{hook.name}'...")
    
    # Get the sequence length of the shorter, clean activation
    clean_len = clean_activation_value.shape[1]
    
    # We replace the *beginning* of the corrupted activation
    # with the *entire* clean activation.
    activation_at_hook[:, :clean_len, :] = clean_activation_value
    
    return activation_at_hook

print(f"\n--- Patching Layer {target_layer} and running corrupted prompt ---")

# Run the model on the corrupted prompt, but with our patching hook active.
# The hook will modify the forward pass in flight.
patched_logits = model.run_with_hooks(
    corrupted_tokens,
    fwd_hooks=[(hook_name, patching_hook)]
)

# --- Analyze the output ---
# Get the token with the highest logit at the last position
last_token_logits = patched_logits[0, -1, :]
predicted_token_id = torch.argmax(last_token_logits).item()

# THE FIX: Use model.cfg.device to get the correct device.
predicted_token_tensor = torch.tensor([predicted_token_id]).to(model.cfg.device)

# Decode the entire corrupted input + the predicted token
full_generated_ids = torch.cat([corrupted_tokens[0], predicted_token_tensor])
generated_text = model.to_string(full_generated_ids)

print(f"\nOriginal question: {question}")
print(f"Expected answer: {correct_answer}")
# We parse the generated answer from the full text
generated_answer = generated_text.split(corrupted_prompt_text)[-1].strip()
print(f"Generated output after patching: {generated_answer}")

if correct_answer.lower() in generated_answer.lower():
    print(f"\nSUCCESS: Patching layer {target_layer} appears to have restored the correct reasoning!")
else:
    print(f"\nINFO: Patching layer {target_layer} did not restore the correct answer.")



  from .autonotebook import tqdm as notebook_tqdm


Loading model with TransformerLens...


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.24it/s]


Loaded pretrained model Qwen/Qwen2.5-3B-Instruct into HookedTransformer
Model loaded successfully.

Targeting hook: 'blocks.25.attn.hook_z'
Activation saved. Shape: torch.Size([1, 84, 16, 128])

--- Patching Layer 25 and running corrupted prompt ---
  Patching hook running at 'blocks.25.attn.hook_z'...

Original question: Abun: ji gwes
Expected answer: English: my leg
Generated output after patching: 

INFO: Patching layer 25 did not restore the correct answer.


In [2]:
import torch
import transformer_lens.utils as utils
from transformer_lens import HookedTransformer
import json
import pandas as pd
import plotly.express as px

# --- Load the MODELING dataset ---
# Make sure you have a file named 'modeling_dataset.json' in the same directory
# or provide the correct full path to the file.
try:
    with open('modeling_dataset.json', 'r') as f:
        modeling_dataset = json.load(f)
except FileNotFoundError:
    print("Error: 'modeling_dataset.json' not found. Please ensure the dataset file is in the correct directory.")
    exit()

# --- Load the model using TransformerLens ---
print("Loading model with TransformerLens...")
# UPDATED: Using the correct model name as requested
model = HookedTransformer.from_pretrained(
    "Qwen/Qwen2.5-3B-Instruct",
    device="cuda:0" if torch.cuda.is_available() else "cpu",
    trust_remote_code=True,
    fold_ln=False, 
    center_unembed=False,
    center_writing_weights=False
)
print("Model loaded successfully.")

# --- Select a problem to analyze ---
problem = modeling_dataset['problems'][1]

# --- Prepare the prompts ---
question_index = 0
question = problem['questions'][question_index]
correct_answer = problem['answers'][question_index]

examples = "\n".join(problem['data'])
clean_prompt_text = f"Translate based on the examples:\n{examples}\n\n{question}"

corrupted_example = "Abun: gwes ji\nEnglish: my leg"
corrupted_examples = examples + "\n" + corrupted_example
corrupted_prompt_text = f"Translate based on the examples:\n{corrupted_examples}\n\n{question}"

# --- Tokenize all inputs ---
clean_tokens = model.to_tokens(clean_prompt_text)
corrupted_tokens = model.to_tokens(corrupted_prompt_text)

# --- THE FIX: Identify the VERY FIRST token of the correct answer for logit analysis ---
# Tokenize the whole correct answer
correct_answer_tokens = model.to_tokens(correct_answer, prepend_bos=False)
# Select only the first token ID
correct_answer_first_token_id = correct_answer_tokens[0, 0]
# Decode just that single token for a clean printout
correct_answer_first_token_str = model.to_string([correct_answer_first_token_id])

print(f"Analyzing logits for the first token of the answer: '{correct_answer_first_token_str}' (ID: {correct_answer_first_token_id.item()})")


# --- RUN THE EXPERIMENT ---

results = []

# --- 1. Get Baseline Logits (Clean and Corrupted without patching) ---
print("\n--- Running Baseline Prompts (No Patching) ---")
clean_logits, clean_cache = model.run_with_cache(clean_tokens)
corrupted_logits = model(corrupted_tokens)

# Get the logit of the correct answer token from the last position
clean_logit_val = clean_logits[0, -1, correct_answer_first_token_id].item()
corrupted_logit_val = corrupted_logits[0, -1, correct_answer_first_token_id].item()

print(f"Clean Prompt Logit for '{correct_answer_first_token_str}': {clean_logit_val:.4f}")
print(f"Corrupted Prompt Logit for '{correct_answer_first_token_str}': {corrupted_logit_val:.4f}")


# --- 2. Run the Patching Experiment Across All Layers ---
print("\n--- Running Patching Experiment ---")

# The hook name for the output of the attention block in TransformerLens
# is 'blocks.{layer_index}.attn.hook_z'
hook_name_template = "blocks.{layer}.attn.hook_z"

# This hook function will be called during the forward pass
def patching_hook(activation_at_hook, hook):
    # Get the sequence length of the shorter, clean activation
    clean_len = clean_cache[hook.name].shape[1]
    
    # Replace the *beginning* of the corrupted activation with the *entire* clean activation.
    activation_at_hook[:, :clean_len, :] = clean_cache[hook.name]
    
    return activation_at_hook

for layer in range(model.cfg.n_layers):
    hook_name = hook_name_template.format(layer=layer)
    
    # Run the model on the corrupted prompt with the patching hook active.
    patched_logits = model.run_with_hooks(
        corrupted_tokens,
        fwd_hooks=[(hook_name, patching_hook)]
    )
    
    patched_logit_val = patched_logits[0, -1, correct_answer_first_token_id].item()
    
    # Calculate how much of the "damage" was recovered
    # 100% recovery means the patched logit is the same as the clean logit
    # 0% recovery means the patched logit is the same as the corrupted logit
    recovery_percentage = ((patched_logit_val - corrupted_logit_val) / (clean_logit_val - corrupted_logit_val)) * 100
    
    print(f"Layer {layer:02d} | Patched Logit: {patched_logit_val:.4f} | Recovery: {recovery_percentage:.2f}%")
    results.append({'layer': layer, 'recovery': recovery_percentage})

# --- 3. Visualize the Results ---
print("\n--- Experiment Complete. Visualizing Results... ---")
df = pd.DataFrame(results)

# Ensure the 'layer' column is treated as a categorical variable for correct sorting
df['layer'] = df['layer'].astype(str).str.zfill(2)

fig = px.bar(
    df, 
    x='layer', 
    y='recovery', 
    title='Logit Recovery Percentage by Patched Layer',
    labels={'layer': 'Transformer Layer', 'recovery': 'Logit Recovery (%)'},
    template='plotly_white'
)
fig.update_layout(
    yaxis_title="Logit Recovery (%)",
    xaxis_title="Layer Index (Attention Output 'hook_z')"
)
fig.show()


Loading model with TransformerLens...


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.05it/s]


Loaded pretrained model Qwen/Qwen2.5-3B-Instruct into HookedTransformer
Model loaded successfully.
Analyzing logits for the first token of the answer: 'English' (ID: 22574)

--- Running Baseline Prompts (No Patching) ---
Clean Prompt Logit for 'English': 16.3440
Corrupted Prompt Logit for 'English': 14.9726

--- Running Patching Experiment ---
Layer 00 | Patched Logit: 17.0854 | Recovery: 154.06%


OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 0 has a total capacity of 47.41 GiB of which 11.62 MiB is free. Process 2399 has 1.59 GiB memory in use. Process 7013 has 1.25 GiB memory in use. Process 935982 has 442.00 MiB memory in use. Process 1042199 has 572.00 MiB memory in use. Process 1874016 has 368.00 MiB memory in use. Process 2657807 has 3.14 GiB memory in use. Including non-PyTorch memory, this process has 40.02 GiB memory in use. Of the allocated memory 38.66 GiB is allocated by PyTorch, and 1.05 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)