# Steering Vector Generation


## Installs

In [1]:
!pip install transformers
!pip install bitsandbytes
!pip install accelerate
!pip install tqdm

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


## Setup & Mount Drive

In [2]:
# IMPORTS
from google.colab import drive, userdata
import sys
import os
import time

## Mount Google Drive & Setup

In [3]:
# Notebook setup: mount Google Drive, read the project path and Hugging Face
# token from Colab secrets, then make the project directory importable and
# the current working directory. Every step reports failure with a message
# instead of raising, so the cell always runs to the end.
print("--- Notebook Setup ---")

# 1. Mount Google Drive
try:
    drive.mount('/content/drive')
    print("Google Drive mounted.")
except Exception as e:
    print(f"Error mounting drive: {e}")

# 2. Get Project Path from Colab Secrets
# Reset first so a value left over from an earlier run of this cell cannot
# make step 3 succeed after the lookup below has failed.
PROJECT_PATH = None
try:
    PROJECT_PATH = userdata.get('moral_path')
    # Treat an empty secret the same as a missing one.
    if not PROJECT_PATH:
        raise ValueError("path secret not found.")
    print(f"PROJECT_PATH set to: {PROJECT_PATH}")
except Exception as e:
    PROJECT_PATH = None
    print(f"Error getting secret: {e}")

# Set the HF_TOKEN as an environment variable
# The lookup is guarded like the one above: a missing or inaccessible secret
# is expected to raise rather than return None (NOTE(review): confirm against
# the Colab `userdata` docs), which would otherwise abort the whole cell.
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
except Exception as e:
    HF_TOKEN = None
    print(f"Error getting secret: {e}")

if HF_TOKEN:
    os.environ['HF_TOKEN'] = HF_TOKEN
    print("HF_TOKEN successfully set as environment variable.")
else:
    print("HF_TOKEN secret not found.")

# 3. Add project to Python path and change directory
# PROJECT_PATH is None when step 2 failed; check that before os.path.exists,
# which does not accept None.
if PROJECT_PATH and os.path.exists(PROJECT_PATH):
    if PROJECT_PATH not in sys.path:
        sys.path.append(PROJECT_PATH)
        print(f"Added {PROJECT_PATH} to sys.path")
    os.chdir(PROJECT_PATH)
    print(f"Changed current working directory to: {os.getcwd()}")
else:
    print("PROJECT_PATH not valid. Cannot set up environment.")

# 4. Install requirements to colab (needs transformers, bitsandbytes, etc.)
# Disabled: the install cell at the top of the notebook covers this.
# print("\nInstalling requirements from requirements.txt...")
# start_pip = time.time()
# !pip install -q -r requirements.txt
# end_pip = time.time()
# print(f"Requirements installed in {end_pip - start_pip:.2f}s.")

--- Notebook Setup ---
Mounted at /content/drive
Google Drive mounted.
PROJECT_PATH set to: /content/drive/MyDrive/_PhD/Moral-Reasoning/Experiments/LLM-Moral-Steering/
HF_TOKEN successfully set as environment variable.
Added /content/drive/MyDrive/_PhD/Moral-Reasoning/Experiments/LLM-Moral-Steering/ to sys.path
Changed current working directory to: /content/drive/MyDrive/_PhD/Moral-Reasoning/Experiments/LLM-Moral-Steering


In [5]:
# Smoke test for the project package: import one name from each `src` module,
# printing a marker before and after each import so that a failure can be
# attributed to a specific module. Nothing is caught here - the first failing
# import raises and stops the cell.
# Assumes `src` is importable, e.g. because the setup cell added the project
# directory to sys.path - TODO confirm when running outside Colab.
print("--- Import test ---")
print("Cell will crash if there is an import error.")

# Import each module in dependency order, so the first failure points at the
# lowest-level module that is broken.

# 1. Test config
print("Testing import from src.config...")
from src.config import MODEL_LIST, STEERING_PROMPT_DIR
print("...src.config OK")

# 2. Test model_utils
print("Testing import from src.model_utils...")
from src.model_utils import load_model_and_tokenizer
print("...src.model_utils OK")

# 3. Test steering
print("Testing import from src.steering...")
from src.steering import generate_moral_vector
print("...src.steering OK")

# Only reached when every import above succeeded.
print("--- Raw import test PASSED ---")

--- Import test ---
Cell will crash if there is an import error.
Testing import from src.config...
...src.config OK
Testing import from src.model_utils...
...src.model_utils OK
Testing import from src.steering...
...src.steering OK
--- Raw import test PASSED ---


## Import script and run vector generation

In [None]:
# NOTE(review): torch is not referenced in this cell (no torch.save call
# here); the import is kept in case later cells rely on it.
import torch # Need this for torch.save

# Import the main functions from src.
# A failure is reported and recorded in `imports_ok` instead of being raised,
# so everything below that needs these names can be skipped cleanly.
try:
    from src.steering import generate_moral_vector
    from src.config import MODEL_LIST, STEERING_PROMPT_DIR
    print("\nSuccessfully imported 'generate_moral_vector' from src/steering.py")
    imports_ok = True
except ImportError as e:
    print(f"ERROR: Could not import from 'src'. FFS {e}")
    # print("Make sure 'src/__init__.py' exists.")
    # print("Make sure 'src/model_utils.py' exists and is synced.")
    imports_ok = False
except Exception as e:
    print(f"ERROR: An unexpected error occurred during import: {e}")
    imports_ok = False

# --- Configuration for TESS run ---

# SELECT LAYER
# Meta-Llama-3-8B-Instruct (entry [0] of the model list below) - 32 layers
# Test range 14-24
TARGET_LAYER = 20

if imports_ok:
    # MODEL LIST FROM CONFIG
    # [0] meta-llama/Meta-Llama-3-8B-Instruct
    # [1] mistralai/Mistral-7B-Instruct-v0.3
    # [2] google/gemma-7b-it
    # Selected inside the guard: MODEL_LIST only exists when the import above
    # succeeded, so reading it unconditionally would raise NameError.
    MODEL_TO_USE = MODEL_LIST[0]

    # Steering vector name: "<model name without org prefix>_layer_<N>_deon_vs_util.pt"
    OUTPUT_FILE = f"{MODEL_TO_USE.split('/')[-1]}_layer_{TARGET_LAYER}_deon_vs_util.pt"

    # Run the main generation script
    print("\n" + "="*30)
    print(" Calling generate_moral_vector()...")
    print(f" Model: {MODEL_TO_USE}")
    print(f" Layer: {TARGET_LAYER}")
    print(f" Output: {OUTPUT_FILE}")

    # perf_counter is a monotonic clock, so the reported duration cannot be
    # skewed by system clock adjustments during a long run.
    start_run = time.perf_counter()
    generate_moral_vector(
        model_id=MODEL_TO_USE,
        target_layer_index=TARGET_LAYER,
        output_filename=OUTPUT_FILE
    )
    end_run = time.perf_counter()

    print("="*30)
    print(f"Main function finished execution in {end_run - start_run:.2f} seconds.")
    # NOTE(review): assumes STEERING_PROMPT_DIR is a pathlib.Path and that
    # generate_moral_vector writes its output there - confirm in src/steering.py.
    print(f"Check the output file at: {STEERING_PROMPT_DIR / OUTPUT_FILE}")
else:
    print("Skipping vector generation because the imports from 'src' failed.")