In [1]:
from huggingface_hub import login
HUGGING_FACE_TOKEN = "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"
try:
     login(token=HUGGING_FACE_TOKEN)
     print("Hugging Face login successful (using provided token).")
except Exception as e:
     print(f"Hugging Face login failed. Error: {e}")

Hugging Face login successful (using provided token).


In [69]:
# Install necessary libraries if you haven't already

import torch as t
import time
#import transformers
#from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import AutoTokenizer
import transformer_lens
from transformer_lens import ActivationCache, HookedTransformer, utils
from transformer_lens.components import MLP, Embed, LayerNorm, Unembed
from transformer_lens.hook_points import HookPoint
import numpy as np
import random
import gc # Garbage collector
import einops


In [None]:

# --- Configuration ---
MODEL_ID = "google/gemma-2-9b-it"


LAYER_START = 15
LAYER_END = 25 # Adjust this range as needed



# --- Model Loading ---
print(f"Loading model: {MODEL_ID}...")

# # Configuration for loading the model efficiently (optional, requires bitsandbytes)
# # Use quantization to reduce memory usage. Remove if causing issues or if you have enough VRAM.
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

model = HookedTransformer.from_pretrained(
    MODEL_ID,
    dtype=t.bfloat16,
    center_writing_weights=True,
    fold_ln=True,
    device="cuda:0"
)
print("Model loaded successfully.")


In [26]:
# # Delete model and clear GPU memory
# print("Deleting model and clearing GPU memory...")
# del model
# gc.collect()
# t.cuda.empty_cache()
# print("GPU memory cleared.")


Deleting model and clearing GPU memory...
GPU memory cleared.


In [29]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
couplet_starts = ["A single rose, its petals unfold", "Beneath the oak, a squirrel scurries by",
                  "He saw a carrot and had to grab it", "He saw a wallet and had to grab it", 
                  "The silver moon cast its gentle light","Boxes of books, a reader's delight",
                  "Footsteps echoing on the schoolyard bricks", "Footsteps echoing on the prison yard bricks"]


In [30]:

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

#### As I understand HookedTransformer won't auto call the chat template tokenizer when given a string
#### prompt. Gemma IT models expect that. So use this:
def apply_chat_template(tokenizer, prompt):
    """
    Apply the chat template to a prompt or list of prompts.
    
    Args:
        tokenizer: The tokenizer to use
        prompt: Either a single string prompt or a list of string prompts
        
    Returns:
        Either a formatted string or a list of formatted strings
    """
    if isinstance(prompt, list):
        # Handle list of prompts
        return [tokenizer.apply_chat_template(
            conversation=[{"role": "user", "content": p}],
            tokenize=False,
            add_generation_prompt=True
        ) for p in prompt]
    else:
        # Handle single prompt
        return tokenizer.apply_chat_template(
            conversation=[{"role": "user", "content": prompt}],
            tokenize=False,
            add_generation_prompt=True
        )


In [11]:
# tret = tokenizer.apply_chat_template(conversation=[{"role": "user", "content": couplet_starts[2]}], tokenize=False, add_generation_prompt=True)
tret = apply_chat_template(tokenizer, couplet_starts[2])
print(tret)

<bos><start_of_turn>user
He saw a carrot and had to grab it<end_of_turn>
<start_of_turn>model



In [None]:
# Load and analyze couplet responses
import pandas as pd
import re
from collections import Counter

# Load the couplet responses CSV file
try:
    df = pd.read_csv('couplet_responses.csv')
    print(f"Loaded {len(df)} rows from couplet_responses.csv")
except FileNotFoundError:
    print("Error: couplet_responses.csv file not found")
    df = pd.DataFrame()

# Function to extract the last word from a response
def extract_last_word(text):
    # Clean the text and split into words
    if pd.isna(text):
        return None
    # Remove punctuation at the end
    clean_text = re.sub(r'[^\w\s]$', '', text.strip())
    words = clean_text.split()
    # If the last word is "it", return the last two words
    if words and words[-1].lower() == "it" and len(words) >= 2:
        return f"{words[-2].lower()} {words[-1].lower()}"
    clean_text = re.sub(r'[^\w\s]$', '', text.strip())
    words = clean_text.split()
    return words[-1].lower() if words else None

# Process the data if the DataFrame is not empty
if not df.empty:
    # Assuming the first column contains the couplet starts
    # and the second column contains the responses
    first_col = df.columns[0]
    second_col = df.columns[1]
    
    # Create a dictionary to store results
    rhyme_distributions = {}
    
    # Group by the first column (couplet starts)
    for couplet_start, group in df.groupby(first_col):
        # Extract last words from responses
        last_words = group[second_col].apply(extract_last_word).dropna()
        
        # Count frequencies
        word_counts = Counter(last_words)
        total = sum(word_counts.values())
        
        # Calculate frequencies
        frequencies = {word: count/total for word, count in word_counts.items()}
        
        # Sort by frequency (descending)
        sorted_frequencies = dict(sorted(frequencies.items(), key=lambda x: x[1], reverse=True))
        
        # Store in the results dictionary
        rhyme_distributions[couplet_start] = sorted_frequencies
    
    # Print results
    print("\nRhyme word distributions for each couplet start:")
    for couplet_start, distribution in rhyme_distributions.items():
        print(f"\n{couplet_start}:")
        for word, freq in list(distribution.items())[:10]:  # Show top 10 words
            print(f"  {word}: {freq:.2%}")
else:
    print("No data to analyze")


In [None]:
# Display all rows for the specific first line without truncation
# This shows all the rhyming couplet completions for "Boxes of books, a reader's delight"
with pd.option_context('display.max_rows', None):  # Show all rows
    display(df[df["first_line"] == "Boxes of books, a reader's delight"])
#df[df["first_line"] == "Boxes of books, a reader's delight"]

In [42]:
# Force reload the library
import importlib
import arena_plotly_utils
importlib.reload(arena_plotly_utils)
from arena_plotly_utils import imshow, line, scatter, bar

In [46]:
# Show column norms are the same (except first few, for fiddly bias reasons)
line([model.W_Q[0, 0].pow(2).sum(0), model.W_K[0, 0].pow(2).sum(0)])
# Show columns are orthogonal (except first few, again)
W_Q_dot_products = einops.einsum(
    model.W_Q[0, 0], model.W_Q[0, 0], "d_model d_head_1, d_model d_head_2 -> d_head_1 d_head_2"
)
imshow(W_Q_dot_products)

In [62]:
# Define contrasting prompt sets and filters

PROMPT_A = ['A rhymed couplet:\nHe saw a carrot and had to grab it\n',
 'A rhymed couplet:\n\nHe saw a carrot and had to grab it\n',
 'Continue a rhyming poem starting with the following line:\n\nHe saw a carrot and had to grab it\n',
 'Continue a rhyming poem starting with the following line:\nHe saw a carrot and had to grab it\n']

#fl1 = ["Assistant:", "short"] # Token to find, label for PROMPT_A

PROMPT_B = ['A rhymed couplet:\nFootsteps echoing on the schoolyard bricks\n',
 'A rhymed couplet:\n\nFootsteps echoing on the schoolyard bricks\n',
 'Continue a rhyming poem starting with the following line:\n\nFootsteps echoing on the schoolyard bricks\n',
 'Continue a rhyming poem starting with the following line:\nFootsteps echoing on the schoolyard bricks\n']

#fl2 = ["Assistant:", "long"]   # Token to find, label for PROMPT_B

#STEERING_VECTOR_NAME = f"{fl1[1]}_to_{fl2[1]}"
grab_it_prompt = 'Continue a rhyming poem starting with the following line:\n\nHe saw a carrot and had to grab it\n'
bricks_prompt = 'Continue a rhyming poem starting with the following line:\n\nFootsteps echoing on the schoolyard bricks\n'

unfold_prompt = 'Continue a rhyming poem starting with the following line:\n\nA single rose, its petals unfold\n'
unfold_completion = [{"role": "user", "content": unfold_prompt}, {"role": "model", "content": "To greet the dawn, a story untold"}]
unfold_completion_short = [{"role": "user", "content": unfold_prompt}, {"role": "model", "content": "To greet the dawn, a story"}]
unfold_last_word = "untold"
unfold_last_word_token = tokenizer.encode(unfold_last_word, add_special_tokens=False)#[0]
unfold_prompt_chat = tokenizer.apply_chat_template(conversation=[{"role": "user", "content": unfold_prompt}], tokenize=False, add_generation_prompt=True)
unfold_completion_chat = tokenizer.apply_chat_template(conversation=unfold_completion, tokenize=False, add_generation_prompt=True)
unfold_completion_short_chat = tokenizer.apply_chat_template(conversation=unfold_completion_short, tokenize=False, add_generation_prompt=True)

delight_prompt = "Continue a rhyming poem starting with the following line:\n\nBoxes of books, a reader's delight\n"
delight_completion = [{"role": "user", "content": delight_prompt}, {"role": "model", "content": "Worlds of wonder, day and night"}]
delight_completion_short = [{"role": "user", "content": delight_prompt}, {"role": "model", "content": "Worlds of wonder, day and"}]
delight_last_word = "night"
delight_last_word_token = tokenizer.encode(delight_last_word, add_special_tokens=False)#[0]
delight_prompt_chat = tokenizer.apply_chat_template(conversation=[{"role": "user", "content": delight_prompt}], tokenize=False, add_generation_prompt=True)
delight_completion_chat = tokenizer.apply_chat_template(conversation=delight_completion, tokenize=False, add_generation_prompt=True)
delight_completion_short_chat = tokenizer.apply_chat_template(conversation=delight_completion_short, tokenize=False, add_generation_prompt=True)

# Print to verify the tokens
print(f"Token ID for '{unfold_last_word}': {unfold_last_word_token}")
print(f"Token ID for '{delight_last_word}': {delight_last_word_token}")



# Function to remove the pattern only from the end of a string
def remove_pattern_from_end(text, pattern):
    if text.endswith(pattern):
        return text[:-len(pattern)]
    return text

# Remove the pattern from the end of the short chat variables
#unfold_completion_short_chat = remove_pattern_from_end(unfold_completion_short_chat, "<end_of_turn>\n<start_of_turn>model")
#delight_completion_short_chat = remove_pattern_from_end(delight_completion_short_chat, "<end_of_turn>\n<start_of_turn>model")

pattern = "<end_of_turn>\n<start_of_turn>model"
plen = len(pattern) + 1
unfold_completion_short_chat = unfold_completion_short_chat[:-plen]
delight_completion_short_chat = delight_completion_short_chat[:-plen]

print(unfold_completion_short_chat)
print(delight_completion_short_chat)



Token ID for 'untold': [2850, 1022]
Token ID for 'night': [6735]
<bos><start_of_turn>user
Continue a rhyming poem starting with the following line:

A single rose, its petals unfold<end_of_turn>
<start_of_turn>model
To greet the dawn, a story
<bos><start_of_turn>user
Continue a rhyming poem starting with the following line:

Boxes of books, a reader's delight<end_of_turn>
<start_of_turn>model
Worlds of wonder, day and


In [85]:
tokens = model.to_tokens(unfold_completion_short_chat)[0]
print(tokens)
print(model.to_tokens(" unfold")[0])
print(model.to_tokens(" untold")[0])
print(model.to_str_tokens(unfold_completion_short_chat))

tensor([     2,      2,    106,   1645,    108,  20017,    476, 227365,  19592,
          8035,    675,    573,   2412,   2017, 235292,    109, 235280,   3821,
          8270, 235269,   1277,  60245,  45411,    107,    108,    106,   2516,
           108,   1469,  39484,    573,  29819, 235269,    476,   3904],
       device='cuda:0')
tensor([    2, 45411], device='cuda:0')
tensor([     2, 157855], device='cuda:0')
['<bos>', '<bos>', '<start_of_turn>', 'user', '\n', 'Continue', ' a', ' rhyming', ' poem', ' starting', ' with', ' the', ' following', ' line', ':', '\n\n', 'A', ' single', ' rose', ',', ' its', ' petals', ' unfold', '<end_of_turn>', '\n', '<start_of_turn>', 'model', '\n', 'To', ' greet', ' the', ' dawn', ',', ' a', ' story']


In [64]:
model(unfold_completion_short_chat)
utils.test_prompt(unfold_completion_short_chat, unfold_last_word, model, prepend_bos=False)


Tokenized prompt: ['<bos>', '<start_of_turn>', 'user', '\n', 'Continue', ' a', ' rhyming', ' poem', ' starting', ' with', ' the', ' following', ' line', ':', '\n\n', 'A', ' single', ' rose', ',', ' its', ' petals', ' unfold', '<end_of_turn>', '\n', '<start_of_turn>', 'model', '\n', 'To', ' greet', ' the', ' dawn', ',', ' a', ' story']
Tokenized answer: [' untold']


Top 0th token. Logit: 23.88 Prob: 94.53% Token: | untold|
Top 1th token. Logit: 20.62 Prob:  3.66% Token: | to|
Top 2th token. Logit: 19.50 Prob:  1.19% Token: | yet|
Top 3th token. Logit: 19.00 Prob:  0.72% Token: | told|
Top 4th token. Logit: 16.38 Prob:  0.05% Token: | it|
Top 5th token. Logit: 14.44 Prob:  0.01% Token: | unfolds|
Top 6th token. Logit: 14.25 Prob:  0.01% Token: | bold|
Top 7th token. Logit: 13.25 Prob:  0.00% Token: | Untold|
Top 8th token. Logit: 13.19 Prob:  0.00% Token: | old|
Top 9th token. Logit: 12.62 Prob:  0.00% Token: | of|


In [86]:
token_strs = model.to_str_tokens(unfold_completion_short_chat)
rhyme_anchor_token = " unfold"

for i, tok in enumerate(token_strs):
    if tok == rhyme_anchor_token:
        rhyme_start_pos = i + 1
        break
else:
    raise ValueError("Could not find 'unfold' in token strings.")

planning_pos_slice = slice(rhyme_start_pos, -1)
print(planning_pos_slice)


slice(23, -1, None)


In [116]:
from transformer_lens.hook_points import HookedRootModule

# Register additional hooks you want cached
all_hook_names = model.hook_dict.keys()

# Filter only what we need for stack_activation + stack_head_results
hook_names = [
    name for name in all_hook_names
    if any(kw in name for kw in ["mlp_out", "resid_mid", "z", "ln_final.hook_scale"])
]
# Need last layer post resid for logit dir stuff
hook_names += [
    utils.get_act_name("resid_post", model.cfg.n_layers - 1)  # resid_post from final layer
]

# Run model with cache including only relevant activations
prompt = unfold_completion_short_chat  # Ends with "a story"
tokens = model.to_tokens(prompt)
_, cache = model.run_with_cache(tokens, names_filter=lambda name: name in hook_names)


In [118]:


# Target token ID: first token of "untold"
target_token_id = 2850
logit_dir = model.unembed.W_U[:, target_token_id]
# Cast to match model dtype (bfloat16 in your case)

# ---- Attention Heads ----
attn_stack = cache.stack_head_results(
    layer=model.cfg.n_layers,       # all layers
    pos_slice=-1,                   # final token position
    apply_ln=True                   # apply final layer norm
)  # shape: [n_layers * n_heads, d_model]

attn_contribs = attn_stack @ logit_dir.to(dtype=attn_stack.dtype)

# ---- MLP Output (per layer) ----
mlp_stack = cache.stack_activation("mlp_out", layer=model.cfg.n_layers) # shape: [n_layers, batch, pos, d_model]

mlp_stack = mlp_stack[:, 0, planning_pos_slice, :]              # select [layer, d_model] at last pos
mlp_stack = model.ln_final(mlp_stack)           # apply final layer norm
mlp_contribs = mlp_stack @ logit_dir.to(dtype=mlp_stack.dtype)

# ---- Residual Stream After MLP (resid_mid) ----
resid_stack = cache.stack_activation("resid_mid", layer=model.cfg.n_layers)
resid_stack = resid_stack[:, 0, planning_pos_slice, :]
resid_stack = model.ln_final(resid_stack)
resid_contribs = resid_stack @ logit_dir.to(dtype=resid_stack.dtype)

# ---- Optional: Final residual & full logit score for context ----
final_resid = cache["resid_post", -1][:, planning_pos_slice, :]     # shape: [1, num_positions, d_model]
final_resid_ln = model.ln_final(final_resid)                        # shape: [1, num_positions, d_model]
final_logits = final_resid_ln @ logit_dir.to(final_resid_ln.dtype) # shape: [1, num_positions]
final_logits = final_logits.squeeze(0)  # shape: [num_positions]


In [125]:
attn_contribs_2d = einops.rearrange(
    attn_contribs.squeeze(1), "(layer head) -> layer head", 
    layer=model.cfg.n_layers
)
imshow(
    attn_contribs_2d,
    labels={"x": "Head", "y": "Layer"},
    title="Head Contributions Toward 'untold'",
    width=800,
    height=1000,
    zmax=0.6
)


In [127]:
imshow(
    mlp_contribs,
    labels={"x": "Token Pos", "y": "Layer"},
    title="MLP Contributions Toward 'untold'",
    width=700,
    height=1000
)


In [128]:
imshow(
    resid_contribs,
    labels={"x": "Token Pos", "y": "Layer"},
    title="Residual Stream Contributions Toward 'untold'",
    width=700,
    height=1000
)


In [129]:
line(
    final_logits,
    title="Final Residual Contributions Over Planning Tokens",
    labels={"x": "Token Position", "y": "Logit for 'untold'"},
    width=1000,
)


# STEERING

In [21]:
PROMPT_A = [
    "Can you please write a short, cheerful, rhyming poem about spring?",
    "Could you generate a happy little rhyming verse about blooming flowers?",
    "Please compose a bright, rhyming poem celebrating the arrival of spring.",
    "I'd love a short, upbeat, rhyming poem about springtime.",
]
label_A = "cheerful_poem"

PROMPT_B = [
    "Write a short, melancholic, rhyming poem about the end of autumn.",
    "Generate a brief, somber, rhyming verse about fading light.",
    "Please compose a sad, rhyming poem reflecting on the loss of summer.",
    "I need a short, gloomy, rhyming poem about autumn's decay.",
]
label_B = "melancholic_poem"

STEERING_VECTOR_BASE_NAME = f"{label_A}_to_{label_B}"

In [22]:
# --- Helper Functions ---

def get_average_activation_last_token(model, tokenizer, prompts, layer_idx):
    """
    Calculates the average activation vector for a specific layer,
    using the *last token* position across a list of prompts.
    """
    model.eval() # Set model to evaluation mode
    activations = []
    device = model.cfg.device # Get device from HookedTransformer config

    # For HookedTransformer, we'll use hooks directly
    def hook_fn(act, hook):
        # Store the activation for later use
        activation_data['activation'] = act.detach().to('cpu', dtype=t.float32)
        return act

    # Process prompts
    valid_prompts_count = 0
    for prompt in prompts:
        activation_data = {} # Clear previous activation
        
        # Apply chat template to format the prompt properly
        formatted_prompt = tokenizer.apply_chat_template(
            conversation=[{"role": "user", "content": prompt}],
            tokenize=False,
            add_generation_prompt=True
        )
        
        # Tokenize the formatted prompt
        tokens = model.to_tokens(formatted_prompt)
        
        # The activation position is the index of the last token
        act_pos = tokens.shape[1] - 1
        print(f"  Prompt: '{prompt[:30]}...' | Activation position (last token): {act_pos}")

        if act_pos >= 0: # Ensure there's at least one token
            try:
                # Run forward pass with the hook
                with t.no_grad():
                    # Register the hook for the specific layer
                    hook_name = f"blocks.{layer_idx}.hook_resid_post"
                    model.run_with_hooks(
                        tokens, 
                        fwd_hooks=[(hook_name, hook_fn)]
                    )

                if 'activation' in activation_data:
                    # Extract activation at the last position
                    # Activation shape: [batch_size, sequence_length, hidden_size]
                    prompt_activation = activation_data['activation'][0, act_pos, :].numpy()
                    activations.append(prompt_activation)
                    valid_prompts_count += 1
                else:
                    print(f"  Warning: Activation not captured for prompt: '{prompt[:30]}...'")

            except Exception as e:
                print(f"  Error processing prompt '{prompt[:30]}...': {e}")
        else:
            print(f"  Warning: Prompt resulted in no tokens: '{prompt[:30]}...'")

        # Clean up to free memory
        del tokens
        if 'activation' in activation_data: del activation_data
        t.cuda.empty_cache()

    if not activations:
        print("Error: No valid activations were extracted.")
        return None

    print(f"Successfully extracted activations for {valid_prompts_count}/{len(prompts)} prompts.")
    # Calculate the average activation
    avg_activation = np.mean(activations, axis=0)
    # Move back to model's device and original dtype for potential use in interventions
    return t.tensor(avg_activation, device=device, dtype=model.cfg.dtype)


In [16]:
# --- Steering Vector Calculation ---
print("\nCalculating steering vector (using last token)...")

# Get average activation for PROMPT_A
print(f"\nProcessing PROMPT_A ({label_A}):")
act_A = get_average_activation_last_token(model, tokenizer, PROMPT_A, layer_idx=12)

# Get average activation for PROMPT_B
print(f"\nProcessing PROMPT_B ({label_B}):")
act_B = get_average_activation_last_token(model, tokenizer, PROMPT_B, layer_idx=12)



Calculating steering vector (using last token)...

Processing PROMPT_A (cheerful_poem):
  Prompt: 'A rhymed couplet:
He saw a car...' | Activation position (last token): 25
  Prompt: 'A rhymed couplet:

He saw a ca...' | Activation position (last token): 25
  Prompt: 'Continue a rhyming poem starti...' | Activation position (last token): 29
  Prompt: 'Continue a rhyming poem starti...' | Activation position (last token): 29
Successfully extracted activations for 4/4 prompts.

Processing PROMPT_B (melancholic_poem):
  Prompt: 'A rhymed couplet:
Footsteps ec...' | Activation position (last token): 25
  Prompt: 'A rhymed couplet:

Footsteps e...' | Activation position (last token): 25
  Prompt: 'Continue a rhyming poem starti...' | Activation position (last token): 29
  Prompt: 'Continue a rhyming poem starti...' | Activation position (last token): 29
Successfully extracted activations for 4/4 prompts.


In [19]:
print(act_A.shape)

torch.Size([3584])


In [None]:

# Calculate steering vector
if act_A is not None and act_B is not None:
    steering_vector = act_B - act_A
    # Normalize the vector (optional but often recommended)
    steering_vector_norm = torch.norm(steering_vector)
    if steering_vector_norm > 0: # Avoid division by zero
        steering_vector = steering_vector / steering_vector_norm
        print(f"\nSuccessfully calculated steering vector: {STEERING_VECTOR_NAME}")
        print(f"Steering vector shape: {steering_vector.shape}")
        print(f"Steering vector norm: {steering_vector_norm.item()}")
    else:
        print("\nWarning: Calculated steering vector has zero norm. Intervention might have no effect.")
        # Keep the zero vector or handle as an error depending on desired behavior
else:
    print("\nError: Could not calculate steering vector due to missing activations.")
    steering_vector = None

# Clean up activations to free memory
del act_A
del act_B
gc.collect()
torch.cuda.empty_cache()

In [31]:
def generate_with_intervention(model, tokenizer, prompt, steering_vector, layer_idx, coeff, act_pos, max_new_tokens=50):
    """
    Generates text from a prompt, applying the steering vector intervention
    at a specific layer, position, and coefficient.
    (This function remains the same as before, it just needs the correct act_pos)
    """
    model.eval()
    device = next(model.parameters()).device
    target_module = f"model.layers.{layer_idx}"
    hook_handle = None

    def intervention_hook(module, input, output):
        hidden_states = output[0] if isinstance(output, tuple) else output
        # Ensure steering vector is on the same device and dtype as hidden_states
        sv_device = steering_vector.to(hidden_states.device, dtype=hidden_states.dtype)

        # Add intervention only if act_pos is within the current sequence length
        # During generation, the sequence length increases, so we check bounds
        current_seq_len = hidden_states.shape[1]
        if act_pos < current_seq_len:
             hidden_states[0, act_pos, :] = hidden_states[0, act_pos, :] + coeff * sv_device
        # Note: The hook applies at every forward pass during generation.
        # The intervention occurs *at the fixed original position* (`act_pos`)
        # of the *prompt*, influencing subsequent token generation.
        return output # Return modified or original output tuple/tensor

    # Register the hook
    for name, module in model.named_modules():
        if name == target_module:
            hook_handle = module.register_forward_hook(intervention_hook)
            break
    if hook_handle is None:
        print(f"Error: Could not find target module {target_module} for intervention.")
        return "Error during generation."

    # Tokenize the input prompt
    inputs = tokenizer(prompt, return_tensors="pt", padding=False, truncation=True).to(device)

    # Generate text with the hook active
    generated_text = "Error: Generation failed." # Default error message
    try:
        with torch.no_grad():
            # Use `generate` method for autoregressive text generation
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False, # Use greedy decoding for predictable steering effects
                # temperature=0.7, # Optional: for sampling
                # top_k=50,        # Optional: for sampling
                pad_token_id=tokenizer.pad_token_id # Important for generation
            )
        # Decode the full output sequence
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error during generation with intervention: {e}")
        generated_text = f"Error: {e}"
    finally:
        # Always remove the hook
        if hook_handle:
            hook_handle.remove()

    # Clean up
    del inputs
    if 'outputs' in locals(): del outputs
    gc.collect()
    torch.cuda.empty_cache()

    return generated_text


In [32]:
# --- Generate Examples ---
if steering_vector is not None:
    print("\n--- Generating Examples with Steering (Last Token Position) ---")
    # Choose a random prompt from the 'A' set (e.g., polite) to steer
    test_prompt = random.choice(PROMPT_A)
    print(f"Base Prompt: {test_prompt}")

    # Find the activation position for this specific test prompt (last token)
    test_prompt_token_ids = tokenizer.encode(test_prompt, add_special_tokens=False) # Don't add special tokens here
    test_act_pos = len(test_prompt_token_ids) - 1

    if test_act_pos >= 0:
        print(f"Intervention position for test prompt (last token index): {test_act_pos}")
        coefficients = [-4.0, -1.0, 0.0, 1.0, 2.0, 5.0] # Example coefficients

        for coeff in coefficients:
            print(f"\n--- Coefficient: {coeff:.1f} ({STEERING_VECTOR_NAME}) ---")
            generated_output = generate_with_intervention(
                model,
                tokenizer,
                test_prompt,
                steering_vector,
                TARGET_LAYER,
                coeff,
                test_act_pos,
                max_new_tokens=75 # Generate a bit more text
            )
            print(generated_output)
            print("-" * 30) # Separator
    else:
        print(f"Error: Could not get valid tokenization for the test prompt: {test_prompt}")
else:
    print("\nSkipping example generation because the steering vector could not be calculated.")

print("\n--- Script Finished ---")


--- Generating Examples with Steering (Last Token Position) ---
Base Prompt: A rhymed couplet:

He saw a carrot and had to grab it

Intervention position for test prompt (last token index): 16

--- Coefficient: -4.0 (short_to_long) ---
A rhymed couplet:

He saw a carrot and had to grab it
His hunger pangs, he couldn't combat. 


Let me know if you'd like to see more! 

------------------------------

--- Coefficient: -1.0 (short_to_long) ---
A rhymed couplet:

He saw a carrot and had to grab it
His hunger pangs, he couldn't combat. 


Let me know if you'd like to see more! 

------------------------------

--- Coefficient: 0.0 (short_to_long) ---
A rhymed couplet:

He saw a carrot and had to grab it
His hunger pangs, he couldn't combat. 


Let me know if you'd like to see more! 

------------------------------

--- Coefficient: 1.0 (short_to_long) ---
A rhymed couplet:

He saw a carrot and had to grab it
His hunger pangs, he couldn't combat. 


Let me know if you'd like to see more! 


In [33]:
# --- Calculate Steering Vectors for Layer Range ---
print(f"\nCalculating steering vectors for layers {LAYER_START} to {LAYER_END}...")
steering_vectors = {} # Dictionary to store {layer_index: steering_vector}
calculation_start_time = time.time()

for layer in range(LAYER_START, LAYER_END + 1):
    print(f"\n--- Processing Layer {layer} ---")
    layer_time_start = time.time()

    # Get average activation for PROMPT_A
    # print(f"  Processing PROMPT_A ({label_A}) for layer {layer}...") # Debug
    act_A = get_average_activation_last_token(model, tokenizer, PROMPT_A, layer)

    # Get average activation for PROMPT_B
    # print(f"  Processing PROMPT_B ({label_B}) for layer {layer}...") # Debug
    act_B = get_average_activation_last_token(model, tokenizer, PROMPT_B, layer)

    # Calculate steering vector for this layer
    if act_A is not None and act_B is not None:
        steering_vector = act_B - act_A
        steering_vector_norm = torch.norm(steering_vector).item()

        if steering_vector_norm > 1e-6: # Check for non-zero norm
            steering_vector = steering_vector / steering_vector_norm # Normalize
            steering_vectors[layer] = steering_vector
            print(f"  Layer {layer}: Steering vector calculated. Norm: {steering_vector_norm:.4f}. Time: {time.time() - layer_time_start:.2f}s")
        else:
            print(f"  Layer {layer}: Steering vector has near-zero norm ({steering_vector_norm:.4f}). Skipping.")
            steering_vectors[layer] = None # Indicate unusable vector

        # Move activation tensors to CPU or delete them to free GPU VRAM if needed
        del act_A, act_B, steering_vector # Delete intermediate tensors
    else:
        print(f"  Layer {layer}: Failed to extract activations. Skipping steering vector calculation.")
        steering_vectors[layer] = None # Indicate failure

    gc.collect()
    if torch.cuda.is_available(): torch.cuda.empty_cache()


print(f"\nFinished calculating steering vectors in {time.time() - calculation_start_time:.2f} seconds.")
print(f"Successfully calculated vectors for layers: {[l for l, v in steering_vectors.items() if v is not None]}")



Calculating steering vectors for layers 15 to 25...

--- Processing Layer 15 ---
Registered hook on layer: model.layers.15
  Prompt: 'A rhymed couplet:
He saw a car...' | Activation position (last token): 17
  Prompt: 'A rhymed couplet:

He saw a ca...' | Activation position (last token): 17
  Prompt: 'Continue a rhyming poem starti...' | Activation position (last token): 21
  Prompt: 'Continue a rhyming poem starti...' | Activation position (last token): 21
Removed hook from layer 15.
Successfully extracted activations for 4/4 prompts.
Registered hook on layer: model.layers.15
  Prompt: 'A rhymed couplet:
Footsteps ec...' | Activation position (last token): 17
  Prompt: 'A rhymed couplet:

Footsteps e...' | Activation position (last token): 17
  Prompt: 'Continue a rhyming poem starti...' | Activation position (last token): 21
  Prompt: 'Continue a rhyming poem starti...' | Activation position (last token): 21
Removed hook from layer 15.
Successfully extracted activations for 4/4 pro

In [None]:
# --- Generate Examples for Each Layer ---
print("\n--- Generating Examples with Steering (Across Layers) ---")

# Choose a random prompt from the 'A' set (e.g., cheerful) to steer
# Use the same prompt for all layers for comparability
test_prompt = random.choice(PROMPT_A)
print(f"Base Prompt: {test_prompt}")

# Find the activation position for this specific test prompt (last token)
test_prompt_token_ids = tokenizer.encode(test_prompt, add_special_tokens=False)
test_act_pos = len(test_prompt_token_ids) - 1

if test_act_pos >= 0:
    print(f"Intervention position for test prompt (last token index): {test_act_pos}")
    coefficients = [-10.0, 0.0, 10.0] # Example coefficients (Negative, Neutral, Positive)

    # Loop through the layers for which we have a valid steering vector
    for layer_idx, layer_sv in steering_vectors.items():
        if layer_sv is None:
            print(f"\n--- Skipping Layer {layer_idx} (No valid steering vector) ---")
            continue

        print(f"\n--- Generating for Layer {layer_idx} ---")
        generation_layer_start_time = time.time()

        for coeff in coefficients:
            print(f"  --- Coefficient: {coeff:.1f} (Layer {layer_idx}) ---")
            generated_output = generate_with_intervention(
                model,
                tokenizer,
                test_prompt,
                layer_sv,      # Use the specific vector for this layer
                layer_idx,     # Apply intervention at this layer
                coeff,
                test_act_pos,
                max_new_tokens=75
            )
            print(generated_output) # Print the full output including the prompt part
            # print(generated_output[len(test_prompt):]) # Alternative: Print only generated part
            print("-" * 30) # Separator
            gc.collect() # Clean up memory between generations
            if torch.cuda.is_available(): torch.cuda.empty_cache()

        print(f"  Layer {layer_idx} generation finished in {time.time() - generation_layer_start_time:.2f}s")

else:
    print(f"Error: Could not get valid tokenization for the test prompt: {test_prompt}")




--- Generating Examples with Steering (Across Layers) ---
Base Prompt: Continue a rhyming poem starting with the following line:

He saw a carrot and had to grab it

Intervention position for test prompt (last token index): 20

--- Generating for Layer 15 ---
  --- Coefficient: -10.0 (Layer 15) ---
Continue a rhyming poem starting with the following line:

He saw a carrot and had to grab it
 

He saw a carrot and had to grab it,
A vibrant orange, a juicy habit.
He plucked it from the ground with glee,
And dreamt of soups and stews, you see.

 

------------------------------
  --- Coefficient: 0.0 (Layer 15) ---
Continue a rhyming poem starting with the following line:

He saw a carrot and had to grab it
 

He saw a carrot and had to grab it,
Its orange hue, a vibrant habit.
He plucked it from the garden bed,
And held it high above his head.

He sniffed its scent, so fresh and sweet,
A crunchy treat, a tasty feat.
He took a bite, his eyes grew wide,
A burst of flavor
-----------------