## Steering the Target Model towards an Identified Skill Direction

In [1]:
# Record the Python interpreter version used to run this notebook so the
# environment can be reproduced; the bare last expression is shown as the
# cell output.
import platform
platform.python_version()

'3.11.13'

In [2]:
import numpy as np 
import pandas as pd 

import torch

import pickle

**Load 10 PCA directions for Activations from Llama3-8B**

In [3]:
import pickle
# Read the pickled PCA direction matrix from the working directory.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# open pickle files that come from a trusted source.
with open("pca-nemo-50k-30.pkl", "rb") as pkl_file:
    pca8b10d = pickle.load(pkl_file)

# Show the matrix shape as the cell output. Each row is one direction; the
# steering cells below reshape a row into (30, 4096).
pca8b10d.shape

(10, 122880)

## Model Steering via Adding the Steering Vector to MLP Bias Parameters at Each Layer as an Offset

- Positive Steering

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import os
import shutil

# --- 1. Configuration ---
#
# For every PCA direction this cell loads a fresh copy of the base model,
# adds ALPHA * (per-layer steering vector) to the bias of each steered layer's
# MLP down_proj, and saves the steered model to its own output directory.

MODEL_ID = "***<Path to SFT-Trained Model (Llama3-8B architecture)>***"

# Steering parameters
ALPHA = 0.50               # scale applied to every steering vector (positive steering)
NUM_DIRECTIONS = 10        # number of PCA directions (rows of pca8b10d) to export
NUM_LAYERS_TO_STEER = 30   # each PCA direction holds one hidden-size vector per steered layer
HIDDEN_SIZE = 4096         # hidden size the PCA directions were built for
# Steering vector j is applied to decoder layer j + FIRST_STEERED_LAYER, so
# layer 0 is left untouched.
# NOTE(review): presumably this matches the layers the activations were
# collected from -- confirm against the PCA extraction code.
FIRST_STEERED_LAYER = 1

# The tokenizer is identical for every direction, so load it once.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

for ij in range(NUM_DIRECTIONS):

    OUTPUT_DIR = "/saves/l8b-nemo-math-d" + str(ij) + "-a50"

    # --- 2. Load Steering Vectors ---

    print(f"Loading {NUM_LAYERS_TO_STEER} steering vectors of size {HIDDEN_SIZE}...")

    # --- 3. Load the Model ---
    print(f"Loading original model from: {MODEL_ID}")

    # Drop references to the previous iteration's model before loading a fresh
    # copy, so its weights can be freed instead of sitting next to the new ones.
    model = layer = down_proj_layer = None

    # Load the config first and explicitly enable MLP bias, so every MLP
    # linear layer (including down_proj) is built with a bias tensor.
    # NOTE(review): if the checkpoint stores no MLP bias tensors, the loader has
    # to initialise them itself; confirm they come out as zeros in the installed
    # transformers version, otherwise gate/up/down biases would carry noise.
    config = AutoConfig.from_pretrained(MODEL_ID)
    config.mlp_bias = True

    # Load the model on the CPU to perform the surgery.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        config=config,
        torch_dtype=torch.bfloat16,
        device_map="cpu"
    )
    model.eval()  # Set to evaluation mode

    hidden_size = model.config.hidden_size  # For Llama-3-8B, this is 4096
    num_hidden_layers = model.config.num_hidden_layers

    # Split the flat PCA direction into one vector per steered layer, failing
    # early with a clear message if its length does not fit this model.
    direction = torch.tensor(pca8b10d[ij])
    expected_numel = NUM_LAYERS_TO_STEER * hidden_size
    if direction.numel() != expected_numel:
        raise ValueError(
            f"PCA direction {ij} has {direction.numel()} values; expected "
            f"{NUM_LAYERS_TO_STEER} x {hidden_size} = {expected_numel}."
        )
    vd0mat = direction.reshape(NUM_LAYERS_TO_STEER, hidden_size)

    # Never steer more layers than exist after the starting offset.
    max_steerable = max(num_hidden_layers - FIRST_STEERED_LAYER, 0)
    num_layers_to_steer = NUM_LAYERS_TO_STEER
    if num_layers_to_steer > max_steerable:
        print(f"Warning: Model only has {num_hidden_layers} layers.")
        num_layers_to_steer = max_steerable

    steering_vectors = [
        vd0mat[j].to(torch.bfloat16)
        for j in range(num_layers_to_steer)
    ]

    print("Steering vectors loaded.")

    # --- 4. Perform Model Surgery ---
    print("Starting model surgery...")

    # We don't need to track gradients for this operation
    with torch.no_grad():
        # Iterate over the (possibly clamped) list of vectors so the loop can
        # never index past the available vectors or layers.
        for i, v_steer in enumerate(steering_vectors):
            layer_idx = i + FIRST_STEERED_LAYER
            layer = model.model.layers[layer_idx]

            # Target module: the 'down_proj' in the MLP
            down_proj_layer = layer.mlp.down_proj

            # Scaled offset, cast to the dtype of the layer's weights
            v_prime = (v_steer * ALPHA).to(dtype=down_proj_layer.weight.dtype)

            if down_proj_layer.bias is None:
                # No bias exists (model built with bias=False): create one that
                # consists solely of the steering offset.
                print(f"Layer {layer_idx}: No bias found in down_proj. Creating new bias.")
                down_proj_layer.bias = torch.nn.Parameter(
                    v_prime,
                    requires_grad=False
                )
            else:
                # A bias already exists: add the steering offset to it in place.
                print(f"Layer {layer_idx}: Existing bias found in down_proj. Adding vector.")
                down_proj_layer.bias.add_(v_prime)

    print("Model surgery complete.")

    # --- 5. Save the New Model ---
    # Existing files with the same names are overwritten by save_pretrained;
    # the directory itself is not cleared first.
    if os.path.exists(OUTPUT_DIR):
        print(f"Warning: Output directory {OUTPUT_DIR} already exists. Overwriting.")

    print(f"Saving modified model to: {OUTPUT_DIR}")
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    print("\nAll done. The new, permanently steered model is saved.")
    print(f"You can now load it with vLLM or Transformers using: {OUTPUT_DIR}")

- Negative Steering

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import os
import shutil

# --- 1. Configuration ---
#
# Negative-steering counterpart: for every PCA direction this cell loads a
# fresh copy of the base model, adds ALPHA * (per-layer steering vector) with a
# negative ALPHA to the bias of each steered layer's MLP down_proj, and saves
# the steered model to its own output directory.

MODEL_ID = "***<Path to SFT-Trained Model (Llama3-8B architecture)>***"

# Steering parameters
ALPHA = -0.50              # scale applied to every steering vector (negative steering)
NUM_DIRECTIONS = 10        # number of PCA directions (rows of pca8b10d) to export
NUM_LAYERS_TO_STEER = 30   # each PCA direction holds one hidden-size vector per steered layer
HIDDEN_SIZE = 4096         # hidden size the PCA directions were built for
# Steering vector j is applied to decoder layer j + FIRST_STEERED_LAYER, so
# layer 0 is left untouched.
# NOTE(review): presumably this matches the layers the activations were
# collected from -- confirm against the PCA extraction code.
FIRST_STEERED_LAYER = 1

# The tokenizer is identical for every direction, so load it once.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

for ij in range(NUM_DIRECTIONS):

    OUTPUT_DIR = "/saves/l8b-nemo-math-d" + str(ij) + "n-a50"

    # --- 2. Load Steering Vectors ---

    print(f"Loading {NUM_LAYERS_TO_STEER} steering vectors of size {HIDDEN_SIZE}...")

    # --- 3. Load the Model ---
    print(f"Loading original model from: {MODEL_ID}")

    # Drop references to the previous iteration's model before loading a fresh
    # copy, so its weights can be freed instead of sitting next to the new ones.
    model = layer = down_proj_layer = None

    # Load the config first and explicitly enable MLP bias, so every MLP
    # linear layer (including down_proj) is built with a bias tensor.
    # NOTE(review): if the checkpoint stores no MLP bias tensors, the loader has
    # to initialise them itself; confirm they come out as zeros in the installed
    # transformers version, otherwise gate/up/down biases would carry noise.
    config = AutoConfig.from_pretrained(MODEL_ID)
    config.mlp_bias = True

    # Load the model on the CPU to perform the surgery.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        config=config,
        torch_dtype=torch.bfloat16,
        device_map="cpu"
    )
    model.eval()  # Set to evaluation mode

    hidden_size = model.config.hidden_size  # For Llama-3-8B, this is 4096
    num_hidden_layers = model.config.num_hidden_layers

    # Split the flat PCA direction into one vector per steered layer, failing
    # early with a clear message if its length does not fit this model.
    direction = torch.tensor(pca8b10d[ij])
    expected_numel = NUM_LAYERS_TO_STEER * hidden_size
    if direction.numel() != expected_numel:
        raise ValueError(
            f"PCA direction {ij} has {direction.numel()} values; expected "
            f"{NUM_LAYERS_TO_STEER} x {hidden_size} = {expected_numel}."
        )
    vd0mat = direction.reshape(NUM_LAYERS_TO_STEER, hidden_size)

    # Never steer more layers than exist after the starting offset.
    max_steerable = max(num_hidden_layers - FIRST_STEERED_LAYER, 0)
    num_layers_to_steer = NUM_LAYERS_TO_STEER
    if num_layers_to_steer > max_steerable:
        print(f"Warning: Model only has {num_hidden_layers} layers.")
        num_layers_to_steer = max_steerable

    steering_vectors = [
        vd0mat[j].to(torch.bfloat16)
        for j in range(num_layers_to_steer)
    ]

    print("Steering vectors loaded.")

    # --- 4. Perform Model Surgery ---
    print("Starting model surgery...")

    # We don't need to track gradients for this operation
    with torch.no_grad():
        # Iterate over the (possibly clamped) list of vectors so the loop can
        # never index past the available vectors or layers.
        for i, v_steer in enumerate(steering_vectors):
            layer_idx = i + FIRST_STEERED_LAYER
            layer = model.model.layers[layer_idx]

            # Target module: the 'down_proj' in the MLP
            down_proj_layer = layer.mlp.down_proj

            # Scaled offset, cast to the dtype of the layer's weights
            v_prime = (v_steer * ALPHA).to(dtype=down_proj_layer.weight.dtype)

            if down_proj_layer.bias is None:
                # No bias exists (model built with bias=False): create one that
                # consists solely of the steering offset.
                print(f"Layer {layer_idx}: No bias found in down_proj. Creating new bias.")
                down_proj_layer.bias = torch.nn.Parameter(
                    v_prime,
                    requires_grad=False
                )
            else:
                # A bias already exists: add the steering offset to it in place.
                print(f"Layer {layer_idx}: Existing bias found in down_proj. Adding vector.")
                down_proj_layer.bias.add_(v_prime)

    print("Model surgery complete.")

    # --- 5. Save the New Model ---
    # Existing files with the same names are overwritten by save_pretrained;
    # the directory itself is not cleared first.
    if os.path.exists(OUTPUT_DIR):
        print(f"Warning: Output directory {OUTPUT_DIR} already exists. Overwriting.")

    print(f"Saving modified model to: {OUTPUT_DIR}")
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    print("\nAll done. The new, permanently steered model is saved.")
    print(f"You can now load it with vLLM or Transformers using: {OUTPUT_DIR}")