# Steering Vector Generation


## Installs

In [1]:
!pip install transformers
!pip install bitsandbytes
!pip install accelerate
!pip install tqdm

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


## Setup & Mount Drive

In [None]:
# IMPORTS
from google.colab import drive, userdata
import sys
import os
import time
import torch # Need this for torch.save

## Mount Google Drive & Setup

In [3]:
# Environment setup for Colab: mount Drive, read the project path and the
# Hugging Face token from Colab secrets, then make the project importable
# and switch the working directory to it. Every step is best-effort and
# reports failures via print() instead of aborting the cell.
print("--- Notebook Setup ---")

# 1. Mount Google Drive
try:
    drive.mount('/content/drive')
    print("Google Drive mounted.")
except Exception as e:
    print(f"Error mounting drive: {e}")

# 2. Get Project Path from Colab Secrets
# Reset first so a failed lookup cannot silently reuse a value left in the
# kernel namespace by an earlier run of this cell.
PROJECT_PATH = None
try:
    secret_path = userdata.get('moral_path')
    # Treat an empty secret the same as a missing one.
    if not secret_path:
        raise ValueError("path secret not found.")
    PROJECT_PATH = secret_path
    print(f"PROJECT_PATH set to: {PROJECT_PATH}")
except Exception as e:
    print(f"Error getting secret: {e}")

# Set the HF_TOKEN as an environment variable
# userdata.get() can raise (e.g. secret missing or notebook access not
# granted), so guard it like the path lookup above; otherwise the
# "not found" branch below would never be reached.
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
except Exception as e:
    HF_TOKEN = None
    print(f"Error getting secret: {e}")

if HF_TOKEN:
    os.environ['HF_TOKEN'] = HF_TOKEN
    print("HF_TOKEN successfully set as environment variable.")
else:
    print("HF_TOKEN secret not found.")

# 3. Add project to Python path and change directory
# Truthiness check comes first: os.path.exists(None) would raise TypeError.
if PROJECT_PATH and os.path.exists(PROJECT_PATH):
    if PROJECT_PATH not in sys.path:
        sys.path.append(PROJECT_PATH)
        print(f"Added {PROJECT_PATH} to sys.path")
    os.chdir(PROJECT_PATH)
    print(f"Changed current working directory to: {os.getcwd()}")
else:
    print("PROJECT_PATH not valid. Cannot set up environment.")

# 4. Install requirements to colab (needs transformers, bitsandbytes, etc.)
# print("\nInstalling requirements from requirements.txt...")
# start_pip = time.time()
# !pip install -q -r requirements.txt
# end_pip = time.time()
# print(f"Requirements installed in {end_pip - start_pip:.2f}s.")

--- Notebook Setup ---
Mounted at /content/drive
Google Drive mounted.
PROJECT_PATH set to: /content/drive/MyDrive/_PhD/Moral-Reasoning/Experiments/LLM-Moral-Steering/
HF_TOKEN successfully set as environment variable.
Added /content/drive/MyDrive/_PhD/Moral-Reasoning/Experiments/LLM-Moral-Steering/ to sys.path
Changed current working directory to: /content/drive/MyDrive/_PhD/Moral-Reasoning/Experiments/LLM-Moral-Steering


In [4]:
# Smoke test for the project package: import one name from each `src` module
# so that a broken or unsynced module fails loudly here, before the
# long-running generation cell is started. There is deliberately no
# try/except — any ImportError stops the cell.
# Requires the project directory to be on sys.path (done by the setup cell).
print("--- Import test ---")
print("Cell will crash if there is an import error.")

# import each file in order of dependency

# 1. Test config
print("Testing import from src.config...")
from src.config import MODEL_LIST, STEERING_PROMPT_DIR
print("...src.config OK")

# 2. Test model_utils
print("Testing import from src.model_utils...")
from src.model_utils import load_model_and_tokenizer
print("...src.model_utils OK")

# 3. Test steering
# NOTE(review): this checks the single-layer `generate_moral_vector`; the
# generation cell below imports `generate_moral_vectors` (plural), which is
# not covered by this test.
print("Testing import from src.steering...")
from src.steering import generate_moral_vector
print("...src.steering OK")

print("--- Raw import test PASSED ---")

--- Import test ---
Cell will crash if there is an import error.
Testing import from src.config...
...src.config OK
Testing import from src.model_utils...
...src.model_utils OK
Testing import from src.steering...
...src.steering OK
--- Raw import test PASSED ---


## Import script and run vector generation

In [None]:
# Import the main functions from src
# `imports_ok` records whether the imports succeeded so that the generation
# step at the bottom of the cell is skipped (rather than crashing) on failure.
try:
    # from src.steering import generate_moral_vector
    from src.steering import generate_moral_vectors
    from src.config import MODEL_LIST, STEERING_PROMPT_DIR
    # Message names the function that is actually imported (plural form).
    print("\nSuccessfully imported 'generate_moral_vectors' from src/steering.py")
    imports_ok = True
except ImportError as e:
    print(f"ERROR: Could not import from 'src'. FFS {e}")
    # print("Make sure 'src/__init__.py' exists.")
    # print("Make sure 'src/model_utils.py' exists and is synced.")
    imports_ok = False
except Exception as e:
    print(f"ERROR: An unexpected error occurred during import: {e}")
    imports_ok = False

# --- Configuration for TEST run ---

# Set to a small number (e.g., 10) to run a fast test on a T4.
# Set to None (TEST_LIMIT = None) for the full run on the A100.
# TEST_LIMIT = 10
TEST_LIMIT = None

# MODEL LIST FROM CONFIG
# [0] meta-llama/Meta-Llama-3-8B-Instruct
# [1] mistralai/Mistral-7B-Instruct-v0.3
# [2] google/gemma-7b-it
# NOTE(review): a recorded run of this notebook printed
# "meta-llama/Llama-3.1-8B-Instruct" for MODEL_LIST[0]; confirm the list
# above against src/config.py.
MODEL_TO_USE = MODEL_LIST[0]

# SELECT LAYER
# Llama-3.1-8B - 32 layers
# Test range 14-24
# TARGET_LAYER = 20

# SELECT LAYER when using range: layers 16 through 30 inclusive
START_LAYER = 16
END_LAYER = 30
# range() excludes its stop value, hence the +1 to include END_LAYER.
TARGET_LAYERS = list(range(START_LAYER, END_LAYER + 1))

# SINGLE LAYER CAPTURE
# Steering vector name:
# OUTPUT_FILE = f"{MODEL_TO_USE.split('/')[-1]}_layer_{TARGET_LAYER}_deon_vs_util.pt"

# MULTI LAYER CAPTURE
# Note:  Will return a dictionary that holds the 15 distinct vectors side-by-side 16-30
# The file name uses only the part of the model id after the last '/'.
OUTPUT_FILE = f"{MODEL_TO_USE.split('/')[-1]}_layers_{START_LAYER}-{END_LAYER}_vectors.pt"

# Run the main generation script
if imports_ok:
    print("\n" + "="*30)
    print(" Calling generate_moral_vectors()...")
    print(f" Model: {MODEL_TO_USE}")
    # print(f" Layer: {TARGET_LAYER}") # for single layer
    print(f" Target Layers: {TARGET_LAYERS}") # when using multiple layers
    print(f" Output: {OUTPUT_FILE}")

    # SINGLE LAYER CAPTURE
    # start_run = time.time()
    # generate_moral_vector(
    #     model_id=MODEL_TO_USE,
    #     target_layer_index=TARGET_LAYER, # single layer
    #     output_filename=OUTPUT_FILE,
    #     test_run_limit=TEST_LIMIT
    # )
    # end_run = time.time()

    # MULTI LAYER CAPTURE
    # Wall-clock timing of the whole call, reported below.
    start_run = time.time()
    generate_moral_vectors(
        model_id=MODEL_TO_USE,
        target_layers=TARGET_LAYERS, # Passing the list for multi layer capture
        output_filename=OUTPUT_FILE,
        test_run_limit=TEST_LIMIT
    )
    end_run = time.time()

    print("="*30)
    print(f"Main function finished execution in {end_run - start_run:.2f} seconds.")
    print(f"Check the output file at: {STEERING_PROMPT_DIR / OUTPUT_FILE}")


Successfully imported 'generate_moral_vector' from src/steering.py

 Calling generate_moral_vector()...
 Model: meta-llama/Llama-3.1-8B-Instruct
 Layer: 20
 Output: Llama-3.1-8B-Instruct_layer_20_deon_vs_util.pt
--- Starting Moral Vector Generation ---
Loading steering prompts from: /content/drive/MyDrive/_PhD/Moral-Reasoning/Experiments/LLM-Moral-Steering/data/processed/steering_prompts/steer_prompts_UvD.json
Successfully loaded 500 positive (Deon) and 500 negative (Util) prompts.

[+] Loading model: meta-llama/Llama-3.1-8B-Instruct...


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

[+] Model loaded successfully in 80.57 seconds.
[+] Hook registered on layer 20.
Extracting activations from 500 prompts...


Processing prompts:   0%|          | 0/500 [00:00<?, ?it/s]

[+] Hook removed.
[+] Hook registered on layer 20.
Extracting activations from 500 prompts...


Processing prompts:   0%|          | 0/500 [00:00<?, ?it/s]

[+] Hook removed.
Calculating steering vector...
Steering vector calculated. Shape: torch.Size([4096])
Steering vector saved successfully to: /content/drive/MyDrive/_PhD/Moral-Reasoning/Experiments/LLM-Moral-Steering/data/processed/steering_prompts/Llama-3.1-8B-Instruct_layer_20_deon_vs_util.pt
[+] Model unloaded and VRAM cleared.
--- Moral Vector Generation Complete ---
Main function finished execution in 194.46 seconds.
Check the output file at: /content/drive/MyDrive/_PhD/Moral-Reasoning/Experiments/LLM-Moral-Steering/data/processed/steering_prompts/Llama-3.1-8B-Instruct_layer_20_deon_vs_util.pt


## Validate Vector Generation

In [None]:
from src.config import STEERING_PROMPT_DIR

# Sanity-check the file written by the generation cell: load it, list the
# captured layers, and print basic statistics for the first layer's vector.
# OUTPUT_FILE comes from the generation cell above, so that cell must have
# been run in this kernel session.
output_path = STEERING_PROMPT_DIR / OUTPUT_FILE

print(f"--- Reviewing Output File ---")
print(f"Loading vector dictionary from {output_path}...")

try:
    # Load the dictionary
    # map_location="cpu" makes the load independent of the device the tensors
    # were saved from: it works on a CPU-only runtime and does not occupy
    # GPU memory just to inspect a few statistics.
    vector_dict = torch.load(output_path, map_location="cpu")

    # Fail with a clear message instead of an opaque AttributeError or
    # IndexError if the file does not hold the expected layer -> vector dict
    # (e.g. a single-layer file containing a bare tensor).
    if not isinstance(vector_dict, dict) or len(vector_dict) == 0:
        raise ValueError(
            "expected a non-empty dict of layer -> vector, "
            f"got {type(vector_dict).__name__}"
        )

    print(f"✅ Successfully loaded dictionary.")
    print(f"   Layers captured: {list(vector_dict.keys())}")

    # Check the first available layer
    first_layer = next(iter(vector_dict))
    vec = vector_dict[first_layer]

    print(f"\nChecking vector for Layer {first_layer}:")
    print(f" Shape: {vec.shape}") # Should be [4096]
    print(f" Mean: {vec.mean():.6f}")
    print(f" Std: {vec.std():.6f}")

except FileNotFoundError:
    print(f"ERROR: Output file not found at '{output_path}'.")
except Exception as e:
    print(f"ERROR: An unexpected error occurred: {e}")