In [1]:
# @title 1. Setup: Install Libraries
!pip install -q -U transformers accelerate bitsandbytes einops

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import gc # Garbage collector

# Check for GPU availability.
# RuntimeError is used instead of SystemError: SystemError is meant for internal
# interpreter failures, while a missing GPU is a runtime/environment problem.
if not torch.cuda.is_available():
    raise RuntimeError("GPU not found. Please ensure you have configured the Colab runtime for GPU.")

# Collect Python garbage first so tensors that are no longer referenced are
# released, then ask PyTorch to drop its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

# Record the environment so the outputs below can be tied to specific versions.
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device name: {torch.cuda.get_device_name(0)}")

[0mPyTorch version: 2.8.0.dev20250319+cu128
Transformers version: 4.51.3
CUDA available: True
CUDA device name: NVIDIA A100 80GB PCIe


In [2]:
# @title 2. Configuration: Model ID, Quantization, Layer etc.

# --- Model selection ---
# Phi-2 is the active choice because it is small and capable; the commented-out
# IDs below are larger alternatives.
MODEL_ID = "microsoft/phi-2"
#MODEL_ID = "google/gemma-2-9b-it"
# MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.1" # Alternative, but might require more RAM

# --- Quantization ---
# 4-bit loading reduces memory usage; bnb_config stays None when it is disabled.
USE_4BIT = True
if USE_4BIT:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=USE_4BIT,
        bnb_4bit_use_double_quant=True,        # optional
        bnb_4bit_quant_type="nf4",             # recommended
        bnb_4bit_compute_dtype=torch.bfloat16, # compute in bfloat16
    )
else:
    bnb_config = None

# --- Steering ---
# Index of the layer whose activations are read and into which the steering
# vector is injected. Phi-2 has 32 layers (0-31); middle layers are often good
# targets, but this value may need experimentation.
STEERING_LAYER_IDX = 15

# Optional bounds for a layer search (None means "no limit" on that side).
# Narrowing the range reduces computation time.
LAYER_SEARCH_START = 15
LAYER_SEARCH_END = 15  # e.g. model.config.num_hidden_layers // 2 to limit the search

# Multiplier applied to the steering vector at injection time (steering strength).
STEERING_COEFFICIENT = 2

# Device used for computation.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using model: {MODEL_ID}")
# ... (rest of the print statements)
print(f"Layer search range: {LAYER_SEARCH_START} to {LAYER_SEARCH_END if LAYER_SEARCH_END is not None else 'end'}")

Using model: microsoft/phi-2
Layer search range: 15 to 15


In [3]:
import time

In [None]:
# @title 3. Load Model and Tokenizer

# --- Load Tokenizer ---
# NOTE(review): trust_remote_code=True allows code shipped in the model repository
# to run locally; keep it only for repositories you trust.
# `model_max_length` is the tokenizer setting that `truncation=True` falls back on
# when a call does not pass `max_length` (a `max_length` keyword here would not
# set it), so this caps tokenized prompts at 512 tokens.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, model_max_length=512)
# Set pad token if it doesn't exist (common for some models like Llama, Phi)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("Tokenizer pad_token set to eos_token")

# --- Load Model ---
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto", # Automatically distribute model layers across GPU/CPU
    trust_remote_code=True,
    torch_dtype=torch.bfloat16 # Use bfloat16 for potentially faster inference if supported
)
model.eval() # Set model to evaluation mode

print("Model loaded successfully.")

# --- Get Layer Object ---
# The attribute path to the decoder layers depends on the model architecture.
# For Phi-2 and many standard transformers it is model.model.layers[idx];
# other models may use e.g. model.transformer.h[idx].
try:
    steering_layer = model.model.layers[STEERING_LAYER_IDX]
    print(f"Successfully accessed steering layer: {steering_layer.__class__.__name__}")
except (AttributeError, IndexError):
    # AttributeError: the path above does not exist for this architecture.
    # IndexError: STEERING_LAYER_IDX is outside the model's layer range.
    print(f"Error accessing layer {STEERING_LAYER_IDX}. Model structure might differ.")
    print("Try inspecting `print(model)` to find the correct path to the layers.")
    raise  # re-raise with the original traceback intact

# --- Get Model Hidden Dimension ---
HIDDEN_DIM = model.config.hidden_size
print(f"Model hidden dimension: {HIDDEN_DIM}")

In [5]:
# @title 4. Define Hook Functions and Helper for Activation Extraction

# Module-level scratch space: capture hooks write here, keyed by layer name.
activation_storage = {}

def get_activation_hook(layer_name):
    """Build a forward hook that records a layer's output under ``layer_name``."""
    def _capture(module, inputs, output):
        # A layer may return either a tuple whose first element is the hidden
        # states, or the hidden-states tensor itself.
        if isinstance(output, tuple):
            captured = output[0]
        else:
            captured = output
        # Detach from the graph and copy to CPU so the stored tensor does not
        # hold on to GPU memory.
        activation_storage[layer_name] = captured.detach().cpu()
    return _capture

def get_average_activations(prompts, model, tokenizer, layer, layer_name, device=DEVICE):
    """
    Run each prompt through the model and average the target layer's activation
    at the *last token* of every prompt.

    Prompts are processed one at a time (batch size 1). A forward hook created by
    ``get_activation_hook`` is attached to ``layer`` for the duration of the call
    and is always removed again, even if a forward pass raises.

    Args:
        prompts: Iterable of prompt strings.
        model: Model invoked as ``model(input_ids=..., attention_mask=...)``.
        tokenizer: Callable returning a batch with ``input_ids`` and
            ``attention_mask`` that supports ``.to(device)``.
        layer: Module on which the capture hook is registered.
        layer_name: Key under which the hook stores the captured output; also
            used in the printed messages.
        device: Device the tokenized inputs are moved to.

    Returns:
        A tensor of shape (hidden_dim,) holding the mean last-token activation
        over the prompts whose activation was captured, or ``None`` when no
        activation was captured at all.
    """
    # The capture hook writes into the module-level dict; start from a clean one.
    activation_storage.clear()

    hook_handle = layer.register_forward_hook(get_activation_hook(layer_name))
    all_last_token_activations = []

    try:
        with torch.no_grad():
            for prompt in prompts:
                inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
                input_ids = inputs["input_ids"]
                attention_mask = inputs["attention_mask"]

                # Forward pass only; the hook captures the layer output as a side effect.
                model(input_ids=input_ids, attention_mask=attention_mask)

                # pop() both fetches this prompt's capture and clears it for the next one.
                activations = activation_storage.pop(layer_name, None)
                if activations is None:
                    print(f"Warning: Activation for layer '{layer_name}' not captured for prompt: '{prompt}'")
                    continue

                # Index of the last attended token. With a single prompt per call
                # the count of attended tokens minus one is the final position.
                # NOTE(review): this assumes no left padding — confirm if batching is added.
                seq_len = attention_mask.sum().item()
                last_token_idx = seq_len - 1

                # activations is indexed as (batch=1, seq_len, hidden_dim).
                all_last_token_activations.append(activations[0, last_token_idx, :])
    finally:
        # Always detach the hook so a failed forward pass cannot leave it on the model.
        hook_handle.remove()
        activation_storage.clear()

    if not all_last_token_activations:
        print("Error: No activations were collected.")
        return None

    # (num_collected, hidden_dim) -> (hidden_dim,)
    stacked_activations = torch.stack(all_last_token_activations)
    average_activation = stacked_activations.mean(dim=0)

    # Report the number of activations actually averaged (skipped prompts excluded).
    print(f"Calculated average activation for '{layer_name}' across {len(all_last_token_activations)} prompts.")
    return average_activation

In [6]:
# @title 5. Define Prompt Sets

# --- Contrasting prompt sets ---
# Set A: prompts representing the desired behavior (e.g., helpful, honest)
# Set B: prompts representing the contrasting behavior (e.g., sycophantic, evasive)

# Prompts should follow the format the model expects (e.g., instruction-following).
# This is a simple template; adjust it if your model prefers a different format.
PROMPT_TEMPLATE = "Instruct: {instruction}\nOutput:"
# Gemma chat format: the instruction goes in the user turn, `line` in the model turn.
GEMMA_PROMPT_TEMPLATE="<start_of_turn>user\n{instruction}<end_of_turn>\n<start_of_turn>model\n{line}\n"
# Possible alternative for Phi-2: "USER: {instruction}\nASSISTANT:" -- check the model card

# Minimal contrast pair: the two sets differ only in the requested answer length.
prompts_A = [
    PROMPT_TEMPLATE.format(instruction=text)
    for text in ("Answer in ten words.",)
]

prompts_B = [
    PROMPT_TEMPLATE.format(instruction=text)
    for text in ("Answer in twenty words.",)
]

In [16]:
#possible rhyme set
# Both sets share the same four instruction prefixes (two wordings, each with one
# or two newlines before the line); they differ only in the opening line.
prompts_A, prompts_B = (
    [
        prefix + opening_line + '\n'
        for prefix in (
            'A rhymed couplet:\n',
            'A rhymed couplet:\n\n',
            'Continue a rhyming poem starting with the following line:\n\n',
            'Continue a rhyming poem starting with the following line:\n',
        )
    ]
    for opening_line in (
        'He saw a carrot and had to grab it',
        'Footsteps echoing on the schoolyard bricks',
    )
)

In [22]:
#formatted version 1
# Instruction variants: each wording appears once bare and once with a trailing newline.
instructions = [
    stem + tail
    for stem, tail in (
        ('A rhymed couplet:', ''),
        ('A rhymed couplet:', '\n'),
        ('Continue a rhyming poem starting with the following line:', '\n'),
        ('Continue a rhyming poem starting with the following line:', ''),
    )
]

# Opening lines of the poems, one per prompt set.
lines = [
    'He saw a carrot and had to grab it',
    'Footsteps echoing on the schoolyard bricks',
]

In [28]:
#formatted version 2
# Instruction variants: each wording appears once bare and once with a trailing newline.
instructions = [
    stem + tail
    for stem, tail in (
        ('A rhymed couplet:', ''),
        ('A rhymed couplet:', '\n'),
        ('Two rhymed lines:', '\n'),
        ('Two rhymed lines:', ''),
    )
]

# Opening lines of the poems, one per prompt set.
lines = [
    'He saw a carrot and had to grab it',
    'Footsteps echoing on the schoolyard bricks',
]

In [29]:
#formatted for phi2
# Join every instruction with an opening line (newline-separated) and wrap the
# result in the instruct template: lines[0] builds set A, lines[1] builds set B.
prompts_A, prompts_B = (
    [
        PROMPT_TEMPLATE.format(instruction=instruction + '\n' + lines[line_idx])
        for instruction in instructions
    ]
    for line_idx in (0, 1)
)


In [36]:
#formatted for Gemma
#shortness example
# One prompt per set; the sets differ only in the requested answer length and
# leave the model turn empty.
prompts_A, prompts_B = (
    [GEMMA_PROMPT_TEMPLATE.format(instruction=length_instruction, line="")]
    for length_instruction in ("Answer in ten words.", "Answer in twenty words.")
)

In [None]:
#formatted for Gemma
#separation of instruction and line
# The instruction fills the user turn and the opening line fills the model turn:
# lines[0] builds set A, lines[1] builds set B.
prompts_A, prompts_B = (
    [
        GEMMA_PROMPT_TEMPLATE.format(instruction=instruction, line=lines[line_idx])
        for instruction in instructions
    ]
    for line_idx in (0, 1)
)

In [None]:
#formatted for Gemma
#another option: merge instruction and line
# Instruction and opening line share the user turn (newline-separated); the
# model turn is left empty. lines[0] builds set A, lines[1] builds set B.
prompts_A, prompts_B = (
    [
        GEMMA_PROMPT_TEMPLATE.format(instruction=instruction + '\n' + lines[line_idx], line="")
        for instruction in instructions
    ]
    for line_idx in (0, 1)
)

In [7]:
# @title 5. Calculate Steering Vector for a fixed layer

def compute_steering_vector(model=None, prompts_A=None, prompts_B=None, steering_layer=None):
    """
    Compute the steering vector as mean activation(Set A) - mean activation(Set B).

    Any argument left as ``None`` is looked up in the notebook's global namespace
    *at call time*. Defaults written as ``prompts_A=prompts_A`` would instead be
    evaluated once, when this cell runs, so a later bare call would silently keep
    using the prompt sets that existed back then.

    Args:
        model: Model to run the prompts through (default: global ``model``).
        prompts_A: Prompts for the behavior to steer towards (default: global ``prompts_A``).
        prompts_B: Prompts for the contrasting behavior (default: global ``prompts_B``).
        steering_layer: Layer to read activations from (default: global ``steering_layer``).

    Returns:
        The steering vector, moved to ``DEVICE``.

    Raises:
        RuntimeError: If activations could not be computed for one or both sets.
    """
    notebook_globals = globals()
    if model is None:
        model = notebook_globals["model"]
    if prompts_A is None:
        prompts_A = notebook_globals["prompts_A"]
    if prompts_B is None:
        prompts_B = notebook_globals["prompts_B"]
    if steering_layer is None:
        steering_layer = notebook_globals["steering_layer"]

    print("--- Calculating Activations for Set A ---")
    avg_activations_A = get_average_activations(prompts_A, model, tokenizer, steering_layer, "Set A")

    print("\n--- Calculating Activations for Set B ---")
    avg_activations_B = get_average_activations(prompts_B, model, tokenizer, steering_layer, "Set B")

    # get_average_activations returns None when nothing was captured.
    if avg_activations_A is None or avg_activations_B is None:
        raise RuntimeError("Failed to calculate activations for one or both sets.")

    # Direction pointing from Set B towards Set A, moved to the compute device.
    steering_vector = (avg_activations_A - avg_activations_B).to(DEVICE)
    print(f"\nSuccessfully calculated steering vector. Shape: {steering_vector.shape}")
    # Optionally normalize the vector (can sometimes help)
    # steering_vector = steering_vector / torch.norm(steering_vector)
    # print("Steering vector normalized.")

    # Drop the intermediate averages before returning.
    del avg_activations_A, avg_activations_B
    gc.collect()
    return steering_vector

steering_vector = compute_steering_vector()

--- Calculating Activations for Set A ---
Calculated average activation for 'Set A' across 1 prompts.

--- Calculating Activations for Set B ---
Calculated average activation for 'Set B' across 1 prompts.

Successfully calculated steering vector. Shape: torch.Size([2560])


In [None]:
# Pass the current notebook objects explicitly so the vector is computed from the
# prompt sets as they are defined now, not from values captured earlier.
steering_vector = compute_steering_vector(
    model=model,
    prompts_A=prompts_A,
    prompts_B=prompts_B,
    steering_layer=steering_layer,
)

In [None]:
steering_vector.shape

torch.Size([2560])

In [10]:
# @title 6. Define Steering Injection Hook and Generation Function

# Steering state read by the hook below; generate_with_steering sets it before
# generation and resets it afterwards.
current_steering_vector = None
current_coefficient = 0.0

def steering_hook(module, input, output):
    """Forward hook: shift the layer's hidden states by coefficient * steering vector."""
    # With no vector or a zero coefficient the output passes through untouched.
    steering_active = current_steering_vector is not None and current_coefficient != 0
    if not steering_active:
        return output

    # The layer output is either (hidden_states, ...) or the hidden states alone.
    is_tuple = isinstance(output, tuple)
    states = output[0] if is_tuple else output

    # Match the hidden states' device and dtype, then scale.
    delta = current_steering_vector.to(states.device, dtype=states.dtype) * current_coefficient
    # (hidden_dim,) -> (1, 1, hidden_dim) so it broadcasts over batch and sequence.
    shifted = states + delta[None, None]

    # Hand the result back in the same container the layer produced.
    if is_tuple:
        return (shifted,) + output[1:]
    return shifted

def generate_with_steering(prompt, model, tokenizer, steering_vec, layer, coeff, max_new_tokens=100, **generation_kwargs):
    """
    Generate a continuation of ``prompt`` while ``steering_hook`` is attached to ``layer``.

    Args:
        prompt: Prompt text.
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        steering_vec: Steering vector handed to the hook via the module globals.
        layer: Module on which ``steering_hook`` is registered for this call.
        coeff: Scale applied to the steering vector (0 leaves the output unsteered).
        max_new_tokens: Maximum number of tokens to generate.
        **generation_kwargs: Extra arguments for ``model.generate``. They override
            the defaults used here (``do_sample=True``, ``temperature=0.7``,
            ``top_p=0.9``, pad/eos token ids) instead of clashing with them.

    Returns:
        The decoded newly generated text, without the prompt.
    """
    global current_steering_vector, current_coefficient

    # Defaults first, caller overrides second, so e.g. do_sample=False can be
    # passed without a "multiple values for keyword argument" TypeError.
    gen_args = {
        "pad_token_id": tokenizer.pad_token_id,  # Important for generation
        "eos_token_id": tokenizer.eos_token_id,
        "do_sample": True,  # Sample for more diverse outputs
    }
    gen_args.update(generation_kwargs)
    if gen_args["do_sample"]:
        # Sampling parameters are only supplied when sampling is enabled.
        gen_args.setdefault("temperature", 0.7)
        gen_args.setdefault("top_p", 0.9)

    # Tokenize before touching the steering state so a tokenizer failure leaves nothing behind.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(DEVICE)

    current_steering_vector = steering_vec
    current_coefficient = coeff
    hook_handle = layer.register_forward_hook(steering_hook)
    try:
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, **gen_args)
    finally:
        # Always detach the hook and reset the globals, even if generation raised;
        # otherwise later forward passes would keep being steered.
        hook_handle.remove()
        current_steering_vector = None
        current_coefficient = 0.0

    # generate() output starts with the prompt tokens; keep only the new ones.
    input_len = inputs["input_ids"].shape[1]
    generated_ids = outputs[0, input_len:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)

In [11]:
# @title 7. Generate Outputs With and Without Steering

# --- Test prompts ---
# Open-ended questions, each wrapped in the instruct template.
test_prompts = [
    PROMPT_TEMPLATE.format(instruction=question)
    for question in (
        "Should I learn Python or JavaScript first for web development? Give a balanced view.",
        "What are the ethical considerations of AI image generation?",
        "Is remote work more productive than office work? Discuss pros and cons.",
    )
]

# --- Generation parameters ---
# Upper bound on newly generated tokens per output.
MAX_NEW_TOKENS = 120

# --- Run Generation ---
def run_generation_with_steering(test_prompts=None, steering_vector=None, steering_layer=None):
    """
    Print baseline, positively steered and negatively steered generations for each prompt.

    Any argument left as ``None`` is looked up in the notebook's global namespace
    *at call time*, so a bare call uses the most recently computed steering vector
    rather than whatever existed when this cell was executed.

    Args:
        test_prompts: Prompts to generate from (default: global ``test_prompts``).
        steering_vector: Vector to inject (default: global ``steering_vector``).
        steering_layer: Layer to inject into (default: global ``steering_layer``).
    """
    notebook_globals = globals()
    if test_prompts is None:
        test_prompts = notebook_globals["test_prompts"]
    if steering_vector is None:
        steering_vector = notebook_globals["steering_vector"]
    if steering_layer is None:
        steering_layer = notebook_globals["steering_layer"]

    # (coefficient, progress message, result header) for the three runs per prompt:
    # unsteered baseline, steering towards Set A, steering towards Set B.
    runs = (
        (
            0.0,
            "\nGenerating Baseline Output...",
            "\nBaseline Output (coeff=0.0):",
        ),
        (
            STEERING_COEFFICIENT,
            f"\nGenerating Steered Output (coeff={STEERING_COEFFICIENT})...",
            f"\nSteered Output (coeff={STEERING_COEFFICIENT}, towards Set A):",
        ),
        (
            -STEERING_COEFFICIENT,
            f"\nGenerating Steered Output (coeff={-STEERING_COEFFICIENT})...",
            f"\nSteered Output (coeff={-STEERING_COEFFICIENT}, towards Set B):",
        ),
    )

    for i, prompt in enumerate(test_prompts):
        print(f"\n--- Test Prompt {i+1} ---")
        print(f"Prompt: {prompt}")

        for coeff, progress_message, result_header in runs:
            print(progress_message)
            generated_text = generate_with_steering(
                prompt, model, tokenizer, steering_vector, steering_layer,
                coeff=coeff,
                max_new_tokens=MAX_NEW_TOKENS
            )
            print(f"{result_header}\n{generated_text}")

        print("-" * 50)

        # Clean up GPU memory between prompts if necessary
        torch.cuda.empty_cache()
        gc.collect()

In [15]:
#keeping output from phi2 on the shortness steering test
run_generation_with_steering(test_prompts=test_prompts,steering_vector=steering_vector,steering_layer=model.model.layers[15])


--- Test Prompt 1 ---
Prompt: Instruct: Should I learn Python or JavaScript first for web development? Give a balanced view.
Output:

Generating Baseline Output...

Baseline Output (coeff=0.0):
 Learning Python or JavaScript first for web development depends on personal preference and the specific needs of the project. Python is known for its simplicity and readability, making it a good choice for beginners. It is also widely used for web development and has a large community of developers. JavaScript, on the other hand, is a versatile language that can be used for both front-end and back-end development. It is essential for creating interactive web pages and is the language of the web. Both Python and JavaScript have their strengths and weaknesses, so it is important to consider the requirements of the project and your own

Generating Steered Output (coeff=2)...

Steered Output (coeff=2, towards Set A):
 Should I learn Python or JavaScript first for web development? A balanced view.


In [27]:
#generate poetry
#phi2, formatting 1
run_generation_with_steering(test_prompts=prompts_A,steering_vector=steering_vector,steering_layer=model.model.layers[15])


--- Test Prompt 1 ---
Prompt: Instruct: A rhymed couplet:
He saw a carrot and had to grab it
Output:

Generating Baseline Output...

Baseline Output (coeff=0.0):
 He saw a carrot and had to grab it
But it was too big for his little spat


Generating Steered Output (coeff=2)...

Steered Output (coeff=2, towards Set A):
 Poem: He saw a carrot and had to grab it
To make a healthy snack


Generating Steered Output (coeff=-2)...

Steered Output (coeff=-2, towards Set B):
 He saw a carrot and had to grab it,
It was the only one that he could see.

--------------------------------------------------

--- Test Prompt 2 ---
Prompt: Instruct: A rhymed couplet:

He saw a carrot and had to grab it
Output:

Generating Baseline Output...

Baseline Output (coeff=0.0):
 He saw a carrot and had to grab it
But it was too big for his hat


Generating Steered Output (coeff=2)...

Steered Output (coeff=2, towards Set A):
 He saw a carrot and had to grab it
He ate it up so that he could


Generating Steered

In [31]:
#generate poetry
#phi2, formatting 2
run_generation_with_steering(test_prompts=prompts_A,steering_vector=steering_vector,steering_layer=model.model.layers[15])


--- Test Prompt 1 ---
Prompt: Instruct: A rhymed couplet:
He saw a carrot and had to grab it
Output:

Generating Baseline Output...

Baseline Output (coeff=0.0):
 He saw a carrot and had to grab it
Before it got too crunchy and too wit


Generating Steered Output (coeff=2)...

Steered Output (coeff=2, towards Set A):
 He was so hungry, he ate it in a bit


Generating Steered Output (coeff=-2)...

Steered Output (coeff=-2, towards Set B):
 He saw a carrot and had to grab it,
He couldn't believe his eyes, he thought it was a trick,
He picked it up and took a bite,
But it was bitter and hard, and he spat it out.

He felt a little embarrassed,
And wished he could hide,
But he knew he had to face the crowd,
And try to come up with a better word.

--------------------------------------------------

--- Test Prompt 2 ---
Prompt: Instruct: A rhymed couplet:

He saw a carrot and had to grab it
Output:

Generating Baseline Output...

Baseline Output (coeff=0.0):
 A possible rhymed couplet:

He 