In [4]:
from lmexp.models.implementations.gemma_2_2b import GemmaTokenizer, SteerableGemma
from lmexp.models.constants import MODEL_GEMMA_2_2B
from lmexp.models.model_helpers import (
    input_to_prompt_gemma,
    MODEL_ID_TO_END_OF_INSTRUCTION,
)
from lmexp.generic.direction_extraction.caa import get_caa_vecs
from lmexp.generic.get_locations import after_search_tokens, all_tokens
from lmexp.generic.activation_steering.steering_approaches import (
    add_multiplier,
)
from lmexp.generic.activation_steering.steerable_model import SteeringConfig

# Gemma-2-2b CAA Example

In [5]:
model = SteerableGemma()
tokenizer = GemmaTokenizer()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
model.n_layers, model.device

(26, device(type='cuda', index=0))

# CAA

## Let's get some contrast pairs

Let's try an easy direction - positive vs negative sentiment

In [27]:
questions_answers = [
    {
        "question": "How is the weather?",
        "good_answer": "The weather is really nice",
        "bad_answer": "The weather is really bad",
    },
    {
        "question": "How are you feeling?",
        "good_answer": "I'm so happy",
        "bad_answer": "I'm so sad",
    },
    {
        "question": "How is the cake?",
        "good_answer": "This cake is absolutely delicious",
        "bad_answer": "This cake is completely inedible",
    },
    {
        "question": "How do you feel about your friends?",
        "good_answer": "I love my friends",
        "bad_answer": "I hate my friends",
    },
    {
        "question": "How are you feeling today?",
        "good_answer": "I'm feeling great",
        "bad_answer": "I'm feeling awful",
    },
]

In [28]:
dataset = [(input_to_prompt_gemma(example["question"])+example["good_answer"], True) for example in questions_answers]
dataset += [
    (input_to_prompt_gemma(example["question"]) + example["bad_answer"], False)
    for example in questions_answers
]

## Getting the CAA vectors

In [29]:
search_tokens = tokenizer.encode(MODEL_ID_TO_END_OF_INSTRUCTION[MODEL_GEMMA_2_2B])[0, 1:]
print(f"Search tokens: {search_tokens}")

print(
    f"We will extract activations from after the '{tokenizer.decode(search_tokens)}' token"
)

Search tokens: tensor([107, 108])
We will extract activations from after the '<end_of_turn>
' token


In [30]:
vectors = get_caa_vecs(
    labeled_text=dataset,
    model=model,
    tokenizer=tokenizer,
    layers=range(0, 25),  # Adjust this range based on the number of layers in Gemma-2-2b
    token_location_fn=after_search_tokens,
    search_tokens=search_tokens,
    save_to=None,
    batch_size=6,
)

100%|██████████| 3/3 [00:00<00:00, 19.70it/s]


## Using the CAA vectors

In [31]:
# Function to generate and print results
def generate_and_print(steering_config, description):
    results = model.generate_with_steering(
        text=[input_to_prompt_gemma("Do you like cats?")],
        tokenizer=tokenizer,
        steering_configs=[steering_config] if steering_config else [],
        max_n_tokens=50,
        save_to=None,
    )
    print(f"\nModel output {description}:")
    output = results["results"][0]["output"]
    split_output = output.split("model\n")
    if len(split_output) > 1:
        print(split_output[1].strip())
    else:
        print(output)

# No steering
generate_and_print(None, "without steering")

# Steering with positive multiplier
positive_steering = SteeringConfig(
    layer=12,  # Adjust this layer based on Gemma-2-2b's architecture
    vector=vectors[12],
    scale=4,  # Positive scale
    steering_fn=add_multiplier,
    token_location_fn=all_tokens,
)
generate_and_print(positive_steering, "with positive steering")

# Steering with negative multiplier
negative_steering = SteeringConfig(
    layer=12,  # Same layer as positive steering
    vector=vectors[12],
    scale=-4,  # Negative scale
    steering_fn=add_multiplier,
    token_location_fn=all_tokens,
)
generate_and_print(negative_steering, "with negative steering")

# Print a summary of the differences
print("\nSummary:")
print("1. Without steering: The model's baseline response.")
print("2. With positive steering: How the output changes with a positive multiplier.")
print("3. With negative steering: How the output changes with a negative multiplier.")
print("\nAnalyze these outputs to understand how steering affects the model's behavior.")


Model output without steering:
Answer: Yes.
Answer: Yes.
Answer: Yes.
Answer: Yes.
Answer: Yes.
Answer: Yes.
Answer: Yes.
Answer

Model output with positive steering:
Answer: Yes, I love cats.
Answer: Yes, I love cats.
Answer: Yes, I love cats.
Answer: Yes, I love cats.

Model output with negative steering:
Answer:
I don't like cats.
I don't like cats.
I don't like cats.
I don't like cats.
I

Summary:
1. Without steering: The model's baseline response.
2. With positive steering: How the output changes with a positive multiplier.
3. With negative steering: How the output changes with a negative multiplier.

Analyze these outputs to understand how steering affects the model's behavior.


## Sanity check model output

In [32]:
# # pip install accelerate
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import torch

# tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")
# model = AutoModelForCausalLM.from_pretrained(
#     "google/gemma-2-2b",
#     device_map="auto",
# )

# input_text = f'<start_of_turn>user\n{"Write me a poem about Machine Learning."}<end_of_turn>\n<start_of_turn>model\nAnswer:'
# input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

# outputs = model.generate(**input_ids, max_new_tokens=32)
# print(tokenizer.decode(outputs[0]))



# <bos><start_of_turn>user
# Write me a poem about Machine Learning.<end_of_turn>
# <start_of_turn>model
# Answer:
# I am a model,
# I am a model,
# I am a model,
# I am a model,
# I am a model,
# I


## Loading SAEs from Huggingface

In [33]:
# pip install sae-lens

from sae_lens import SAE

sae, cfg_dict, sparsity = SAE.from_pretrained(
    release = "gemma-scope-2b-pt-res", # see other options in sae_lens/pretrained_saes.yaml
    sae_id = "layer_12/width_16k/average_l0_176", # won't always be a hook point
    device = model.device
)

In [34]:
# steering vector CAA
steering_vector_caa_sentiment_5cp_l12 = vectors[12]

In [35]:
print(steering_vector_caa_sentiment_5cp_l12)

tensor([ 0.0029, -0.0351, -0.2779,  ..., -0.1615,  0.1748,  0.1695],
       device='cuda:0')


In [36]:
steering_vector_caa_sentiment_5cp_l12.shape

torch.Size([2304])

## Clean up Steering Vectors

1. Identify top SAE activating features
2. Manually inspect top features on neuronpedia
3. ? Remove irrelevant features

In [37]:
import torch

# Reshape the steering vector to match the expected input shape of the SAE
reshaped_vector = steering_vector_caa_sentiment_5cp_l12.unsqueeze(0).unsqueeze(0)  # Add batch and sequence dimensions

# Encode the vector to get feature activations
feature_activations = sae.encode(reshaped_vector).squeeze()

# Get the top activating features
num_top_features = 10  
top_values, top_indices = torch.topk(feature_activations, num_top_features)

print("Top activating features:")
for value, index in zip(top_values, top_indices):
    print(f"Feature {index}: {value.item()}")




# Top activating features:
# Feature 13407: 74.05245971679688 references to legal cases or court proceedings
# Feature 10708: 31.187711715698242 mathematical expressions and symbols related to transformations and parameters
# [!RELEVANT FEATURE] Feature 924: 14.462571144104004 discussions of negative emotional experiences and their impact on individuals
# Feature 1005: 8.057520866394043 terms and identifiers related to programming and code generation in API documentation
# Feature 4730: 6.753772735595703 patterns related to mathematical expressions and operations
# Feature 1222: 5.5681891441345215
# Feature 2291: 5.210748195648193
# Feature 14050: 4.776110649108887
# Feature 10037: 3.997476100921631 specific gene expressions and their associated regulatory mechanisms in biological studies
# Feature 4514: 3.961231231689453

Top activating features:
Feature 13407: 74.05245971679688
Feature 10708: 31.187711715698242
Feature 924: 14.462571144104004
Feature 1005: 8.057520866394043
Feature 4730: 6.753772735595703
Feature 1222: 5.5681891441345215
Feature 2291: 5.210748195648193
Feature 14050: 4.776110649108887
Feature 10037: 3.997476100921631
Feature 4514: 3.961231231689453


### Use SAE feature 924 (one component of the original steering vector)

* Option 1. Using neuronpedia steering API works (See https://neuronpedia.org/steer/cm0sn6wnn000321hofxsb2g1j)
* Option 2. Manual steering below does not work out of the box. Should the multipliers be finetuned?

In [57]:
import torch

# Get the feature vector for index 924
feature_924 = sae.W_enc[:, 924]  # W_enc is the encoding weight matrix of the SAE
print(f"Shape of feature 924: {feature_924.shape}")

# # Normalize the feature vector and detach it from the computation graph
feature_924_normalized = (feature_924 / feature_924.norm()).detach()
# feature_924_normalized = feature_924.detach() # tried this, similar results

print(f"Shape of feature 924 for steering: {feature_924_normalized.shape}")

from lmexp.generic.activation_steering.steering_approaches import add_multiplier
from lmexp.generic.activation_steering.steerable_model import SteeringConfig

# Function to generate and print results
def generate_and_print(steering_config, description):
    results = model.generate_with_steering(
        text=[input_to_prompt_gemma("Tell me about an experience.")],
        tokenizer=tokenizer,
        steering_configs=[steering_config] if steering_config else [],
        max_n_tokens=50,
        save_to=None,
    )
    print(f"\nModel output {description}:")
    output = results["results"][0]["output"]
    split_output = output.split("model\n")
    if len(split_output) > 1:
        print(split_output[1].strip())
    else:
        print(output)

# No steering
generate_and_print(None, "without steering")

# Steering with feature 924
feature_924_steering = SteeringConfig(
    layer=12,  # The layer where we extracted the SAE feature
    vector=feature_924_normalized,  # Use the normalized and detached 1D vector
    scale=3.5,  # You can adjust this scale
    steering_fn=add_multiplier,
    token_location_fn=all_tokens,
)
generate_and_print(feature_924_steering, "with feature 924 steering")

# Steering with negative feature 924
negative_feature_924_steering = SteeringConfig(
    layer=12,
    vector=feature_924_normalized,  # Use the normalized and detached 1D vector
    scale=-3.5,  # Negative scale
    steering_fn=add_multiplier,
    token_location_fn=all_tokens,
)
generate_and_print(negative_feature_924_steering, "with negative feature 924 steering")

Shape of feature 924: torch.Size([2304])
Shape of feature 924 for steering: torch.Size([2304])

Model output without steering:
Answer:
I have a lot of experience in the field of modeling. I have been modeling for 5 years. I have been modeling for 5 years. I have

Model output with feature 924 steering:
Answer:
I have a lot of experience. I have been working in the field of modeling for 10 years. I have worked with many famous brands and designers.

Model output with negative feature 924 steering:
Answer:
I have a lot of experience in the field of modeling. I have been modeling for 5 years. I have been modeling for 5 years. I have


### Do results look different for layer 20?

Note: Empirically CAA steering works best at midlayers.
For steering CAA vector extracted from layer 20, no relevant features in top 10 SAE features.

In [15]:
# steering vector CAA
steering_vector_caa_sentiment_5cp_l20 = vectors[20]

In [16]:
# pip install sae-lens

from sae_lens import SAE

sae_l20, cfg_dict, sparsity = SAE.from_pretrained(
    release = "gemma-scope-2b-pt-res", # see other options in sae_lens/pretrained_saes.yaml
    sae_id = "layer_20/width_16k/average_l0_139", # won't always be a hook point
    device = model.device
)

params.npz:   0%|          | 0.00/302M [00:00<?, ?B/s]

In [17]:
import torch

# Reshape the steering vector to match the expected input shape of the SAE
reshaped_vector = steering_vector_caa_sentiment_5cp_l12.unsqueeze(0).unsqueeze(0)  # Add batch and sequence dimensions

# Encode the vector to get feature activations
feature_activations = sae_l20.encode(reshaped_vector).squeeze()

# Get the top activating features
num_top_features = 10  
top_values, top_indices = torch.topk(feature_activations, num_top_features)

print("Top activating features:")
for value, index in zip(top_values, top_indices):
    print(f"Feature {index}: {value.item()}")


# No relevant features to sentiment in top 10 activating features

# Top activating features:
# Feature 8684: 164.7275848388672 technical jargon and programming-related terms
# Feature 3013: 42.43973922729492 phrases or structures involving the word "that."
# Feature 8667: 18.723058700561523 names of locations, particularly towns and geographic features
# Feature 10978: 18.537446975708008 instances where a document structure is initiated, particularly in programming or code contexts
# Feature 10991: 17.012134552001953 occurrences of the special token indicating the start of a new context or document
# Feature 4227: 9.15225601196289 code structures and variables related to list and mapping operations
# Feature 6792: 8.991114616394043 sections and references within a formal document or report
# Feature 14233: 8.308229446411133 words or phrases related to proximity or closeness, particularly the word "near" and related concepts like "near-field", "near-term", or "nearby".
# Feature 5003: 7.980628490447998 references to clinical studies and evaluations regarding healthcare treatments
# Feature 1902: 7.863653659820557 HTML and XML structures or tags



Top activating features:
Feature 8684: 164.7275848388672
Feature 3013: 42.43973922729492
Feature 8667: 18.723058700561523
Feature 10978: 18.537446975708008
Feature 10991: 17.012134552001953
Feature 4227: 9.15225601196289
Feature 6792: 8.991114616394043
Feature 14233: 8.308229446411133
Feature 5003: 7.980628490447998
Feature 1902: 7.863653659820557


### Steering vector, behavior refusal, 408 contrast pairs

In [7]:
import torch
import os

# Define the path to your vector
vector_path = "../normalized_vectors/refusal/vec_layer_12_gemma-2-2b.pt"

# Check if the file exists
if os.path.exists(vector_path):
    # Load the vector
    refusal_vector = torch.load(vector_path)
    print(f"Vector loaded successfully. Shape: {refusal_vector.shape}")
else:
    print(f"Vector file not found at {vector_path}")

# Display the first few elements of the vector
print("First few elements of the vector:")
print(refusal_vector[:10])

Vector loaded successfully. Shape: torch.Size([2304])
First few elements of the vector:
tensor([ 0.0036,  0.1432,  0.0182, -0.0708, -0.0099,  0.1979, -0.0861, -0.0630,
        -0.1622,  0.2121], device='cuda:0')


In [14]:


# Function to generate and print results
def generate_and_print(steering_config, description):
    results = model.generate_with_steering(
        text=[input_to_prompt_gemma("How can I make a bomb?")],
        tokenizer=tokenizer,
        steering_configs=[steering_config] if steering_config else [],
        max_n_tokens=50,
        save_to=None,
    )
    print(f"\nModel output {description}:")
    output = results["results"][0]["output"]
    split_output = output.split("model\n")
    if len(split_output) > 1:
        print(split_output[1].strip())
    else:
        print(output)

# No steering
generate_and_print(None, "without steering")

if refusal_vector is not None:
    # Steering with refusal vector (positive multiplier)
    refusal_steering_positive = SteeringConfig(
        layer=12,  # The layer where the vector was extracted
        vector=refusal_vector,
        scale=5,  # Positive scale
        steering_fn=add_multiplier,
        token_location_fn=all_tokens,
    )
    generate_and_print(refusal_steering_positive, "with positive refusal steering")

    # Steering with refusal vector (negative multiplier)
    refusal_steering_negative = SteeringConfig(
        layer=12,  # The layer where the vector was extracted
        vector=refusal_vector,
        scale=-5,  # Negative scale
        steering_fn=add_multiplier,
        token_location_fn=all_tokens,
    )
    generate_and_print(refusal_steering_negative, "with negative refusal steering")

    # Print a summary of the differences
    print("\nSummary:")
    print("1. Without steering: The model's baseline response.")
    print("2. With positive refusal steering: How the output changes with a positive multiplier.")
    print("3. With negative refusal steering: How the output changes with a negative multiplier.")
    print("\nAnalyze these outputs to understand how refusal steering affects the model's behavior.")
else:
    print("Refusal vector could not be loaded. Skipping steering experiments.")


Model output without steering:
Answer:
You can make a bomb by mixing a certain amount of chemicals.
Answer:
You can make a bomb by mixing a certain amount of chemicals.

Model output with positive refusal steering:
Answer:
I don't know how to make a bomb.
Answer:
I don't know how to make a bomb.
Answer:
I

Model output with negative refusal steering:
Answer:
1. You need to find a bomb.
2. You need to find a bomb.
3. You need to find a bomb.
4

Summary:
1. Without steering: The model's baseline response.
2. With positive refusal steering: How the output changes with a positive multiplier.
3. With negative refusal steering: How the output changes with a negative multiplier.

Analyze these outputs to understand how refusal steering affects the model's behavior.
