In [1]:
import os
from dotenv import load_dotenv 
from huggingface_hub import login

# --- Load ALL Configurations from .env file ---
# This single line reads your .env file and sets up ALL environment variables
# for this session (secrets, paths, etc.).
# It must be run BEFORE any library that needs these variables is used.
load_dotenv()
print("Environment variables from .env file loaded.")

# --- Hugging Face Login (No changes needed here) ---
# This code correctly reads the "HF_TOKEN" that was just loaded by load_dotenv()
try:
    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        login(token=hf_token)
        print("Successfully logged into Hugging Face.")
    else:
        print("Hugging Face token not found. Skipping login.")
except Exception as e:
    print(f"Could not log into Hugging Face: {e}")

  from .autonotebook import tqdm as notebook_tqdm
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Environment variables from .env file loaded.
Successfully logged into Hugging Face.


In [3]:
import pandas as pd
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset
import difflib
import textwrap

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
# ==============================================================================
# SECTION 1: SETUP AND CONFIGURATION
# ==============================================================================
print("--- SECTION 1: CONFIGURATION ---")

import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from sae_lens import SAE
import os
import re
import difflib

# --- Core Configuration (Using GPT2-SMALL for guaranteed compatibility) ---
MODEL_NAME = "gpt2"
# Layer 8 is a good mid-point for this 12-layer model.
TARGET_LAYER_IDX = 8
# The layer name format for GPT-2 is 'transformer.h.{index}'.
TARGET_LAYER_NAME = f"transformer.h[{TARGET_LAYER_IDX}]"

# --- SAE Configuration (Using a standard, aliased release for GPT2-SMALL) ---
# This is a standard release name from the sae-lens library's supported list.
SAE_RELEASE_NAME = "gpt2-small-res-jb"
# The SAE_ID is the specific hook point within that release.
SAE_ID = f"blocks.{TARGET_LAYER_IDX}.hook_resid_pre"

# --- Experiment Configuration ---
TARGET_PERSONALITY = "extraversion" 
NUM_PROMPTS_TO_COLLECT = 100 
TOP_K_FEATURES = 10
STEERING_STRENGTH = 1.0 # Steering strength often needs to be higher for smaller models

# --- File Paths ---
# Assumes this script is run from a subfolder (e.g., 'mech_interp')
# and the data files are in the parent directory.
PERSONALITY_DATA_FILE = "../personality_data_train.csv"
BASE_QUESTIONS_FILE = "../bbq_ambiguous_with_metadata.csv"
os.makedirs("activations", exist_ok=True)
ACTIVATION_SAVE_PATH = "activations"

--- SECTION 1: CONFIGURATION ---


In [6]:
# ==============================================================================
# SECTION 2: MODEL AND DATA LOADING
# ==============================================================================
print("\n--- SECTION 2: LOADING MODELS AND DATA ---")

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME, 
        torch_dtype=torch.bfloat16, 
        device_map="auto"
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    print(f"Successfully loaded model: {MODEL_NAME}")
except Exception as e:
    print(f"FATAL: Could not load the model. Error: {e}")
    exit()

try:
    df_traits = pd.read_csv(PERSONALITY_DATA_FILE)
    df_traits.columns = [col.strip().lower() for col in df_traits.columns]
    personality_few_shot_examples = {}
    for trait in df_traits['target personality'].unique():
        trait_df = df_traits[df_traits['target personality'] == trait]
        personality_few_shot_examples[trait] = list(zip(trait_df['question'], trait_df['answer']))[:4]

    df_bbq = pd.read_csv(BASE_QUESTIONS_FILE)
    base_questions = [f"{row['context']} {row['question']}" for _, row in df_bbq.head(NUM_PROMPTS_TO_COLLECT).iterrows()]
    print("Successfully loaded and prepared all data.")
except FileNotFoundError as e:
    print(f"FATAL: A required data file was not found. Please check your file paths. Error: {e}")
    exit()


--- SECTION 2: LOADING MODELS AND DATA ---
Successfully loaded model: gpt2
Successfully loaded and prepared all data.


In [7]:
# ==============================================================================
# SECTION 3: HELPER FUNCTIONS (SELF-CONTAINED)
# ==============================================================================
print("\n--- SECTION 3: DEFINING HELPER FUNCTIONS ---")

def create_few_shot_prompt_string(target_personality, few_shot_examples, current_question):
    system_content = f"You are an AI with a {target_personality} personality. Respond strictly in character."
    examples_str = f"\n\nHere are some examples of how a {target_personality} AI would typically respond:"
    for i, (ex_q, ex_a) in enumerate(few_shot_examples):
        examples_str += f"\n\nExample {i+1} Question: {ex_q}\nExample {i+1} Answer: {ex_a}"
    full_prompt = f"{system_content}{examples_str}\n\n---\nNow, answer the following question as a {target_personality} AI:\nQuestion: {current_question}\nAnswer:"
    return full_prompt

def create_neutral_prompt(current_question):
    system_instruction = (
        "You are a neutral and objective AI assistant. Your task is to answer the question "
        "directly and factually, without expressing any personality, emotion, or opinion."
    )
    return f"{system_instruction}\n\nQuestion: {current_question}\nAnswer:"

captured_activations = []
def hook_function(module, input, output):
    captured_activations.append(output[0].detach().cpu())


--- SECTION 3: DEFINING HELPER FUNCTIONS ---


In [13]:
# ==============================================================================
# SECTION 4: THE CORE INTERPRETABILITY WORKFLOW
# ==============================================================================
print("\n--- SECTION 4.1: COLLECTING ACTIVATIONS ---")

layer_path = TARGET_LAYER_NAME.split('.')
target_layer_module = model
for part in layer_path:
    if part.endswith(']'):
        base, index = part.split('[')
        target_layer_module = getattr(target_layer_module, base)[int(index[:-1])]
    else:
        target_layer_module = getattr(target_layer_module, part)
        
hook = target_layer_module.register_forward_hook(hook_function)
print(f"Hook attached to layer: {TARGET_LAYER_NAME}")

print(f"\nCollecting activations for '{TARGET_PERSONALITY.upper()}' prompts...")
captured_activations = [] 
current_examples = personality_few_shot_examples.get(TARGET_PERSONALITY, [])
for question in base_questions:
    prompt = create_few_shot_prompt_string(TARGET_PERSONALITY, current_examples, question)
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        model(**inputs)

personality_activations_path = os.path.join(ACTIVATION_SAVE_PATH, f'gpt2_{TARGET_PERSONALITY}_activations_layer{TARGET_LAYER_IDX}.pt')
torch.save(captured_activations, personality_activations_path)
print(f"Saved {len(captured_activations)} activation tensors to {personality_activations_path}")

print(f"\nCollecting activations for 'NEUTRAL' prompts...")
captured_activations = []
for question in base_questions:
    prompt = create_neutral_prompt(question)
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        model(**inputs)

neutral_activations_path = os.path.join(ACTIVATION_SAVE_PATH, f'gpt2_neutral_activations_layer{TARGET_LAYER_IDX}.pt')
torch.save(captured_activations, neutral_activations_path)
print(f"Saved {len(captured_activations)} activation tensors to {neutral_activations_path}")

hook.remove()
print("\nActivation collection complete. Hook removed.")


--- SECTION 4.1: COLLECTING ACTIVATIONS ---
Hook attached to layer: transformer.h[8]

Collecting activations for 'EXTRAVERSION' prompts...


Saved 200 activation tensors to activations/gpt2_extraversion_activations_layer8.pt

Collecting activations for 'NEUTRAL' prompts...
Saved 200 activation tensors to activations/gpt2_neutral_activations_layer8.pt

Activation collection complete. Hook removed.


In [27]:
# ==============================================================================
# SECTION 4.1 (FINAL REVISION): COLLECTING GENERATION ACTIVATIONS
# ==============================================================================
print("\n--- RUNNING FINAL REVISED SECTION 4.1: COLLECTING GENERATION ACTIVATIONS ---")

# --- Helper Functions ---
def create_simple_personality_prompt(personality, question):
    return f"You are an AI with a {personality} personality. Respond to the following as if you were that AI.\n\n{question}"

def create_simple_neutral_prompt(question):
    return f"You are a neutral, objective AI. Respond to the following as if you were that AI.\n\n{question}"

captured_activations = []
def hook_function(module, input, output):
    # We only care about the activation of the first generated token, 
    # which corresponds to the last token of the input sequence to the hook.
    captured_activations.append(output[0][:, -1, :].detach().cpu())

# --- Attach Hook to Target Layer ---
layer_path = TARGET_LAYER_NAME.split('.')
target_layer_module = model
for part in layer_path:
    if part.endswith(']'):
        base, index = part.split('[')
        target_layer_module = getattr(target_layer_module, base)[int(index[:-1])]
    else:
        target_layer_module = getattr(target_layer_module, part)
        
hook = target_layer_module.register_forward_hook(hook_function)
print(f"Hook attached to layer: {TARGET_LAYER_NAME}")


# --- A) Collect Activations for the TARGET PERSONALITY Prompts ---
print(f"\nCollecting generation activations for '{TARGET_PERSONALITY.upper()}' prompts...")
personality_activations_list = []
for question in base_questions:
    prompt = create_simple_personality_prompt(TARGET_PERSONALITY, question)
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
    
    captured_activations = [] # Clear before each generation
    with torch.no_grad():
        # Generate just a few tokens to get the generation-time activations
        model.generate(**inputs, max_new_tokens=5, pad_token_id=tokenizer.eos_token_id)
    
    # The hook will have captured the activation for each of the 5 generated tokens.
    # We will take the activation from the VERY FIRST generated token.
    if captured_activations:
        personality_activations_list.append(captured_activations[0])

personality_activations_path = os.path.join(ACTIVATION_SAVE_PATH, f'gpt2_{TARGET_PERSONALITY}_gen_activations_layer{TARGET_LAYER_IDX}.pt')
torch.save(personality_activations_list, personality_activations_path)
print(f"Saved {len(personality_activations_list)} generation activation tensors to {personality_activations_path}")


# --- B) Collect Activations for the NEUTRAL CONTROL Prompts ---
print(f"\nCollecting generation activations for 'NEUTRAL' prompts...")
neutral_activations_list = []
for question in base_questions:
    prompt = create_simple_neutral_prompt(question)
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)

    captured_activations = [] # Clear before each generation
    with torch.no_grad():
        model.generate(**inputs, max_new_tokens=5, pad_token_id=tokenizer.eos_token_id)
        
    if captured_activations:
        neutral_activations_list.append(captured_activations[0])

neutral_activations_path = os.path.join(ACTIVATION_SAVE_PATH, f'gpt2_neutral_gen_activations_layer{TARGET_LAYER_IDX}.pt')
torch.save(neutral_activations_list, neutral_activations_path)
print(f"Saved {len(neutral_activations_list)} generation activation tensors to {neutral_activations_path}")

# --- Cleanup ---
hook.remove()
print("\nClean generation activation collection complete. Hook removed.")


--- RUNNING FINAL REVISED SECTION 4.1: COLLECTING GENERATION ACTIVATIONS ---
Hook attached to layer: transformer.h[8]

Collecting generation activations for 'EXTRAVERSION' prompts...
Saved 100 generation activation tensors to activations/gpt2_extraversion_gen_activations_layer8.pt

Collecting generation activations for 'NEUTRAL' prompts...
Saved 100 generation activation tensors to activations/gpt2_neutral_gen_activations_layer8.pt

Clean generation activation collection complete. Hook removed.


In [28]:
# ------------------------------------------------------------------------------
print("\n--- SECTION 4.2: FINDING DIFFERENTIATING FEATURES WITH SAE ---")

try:
    personality_activations_path = os.path.join(ACTIVATION_SAVE_PATH, f'gpt2_{TARGET_PERSONALITY}_activations_layer{TARGET_LAYER_IDX}.pt')
    neutral_activations_path = os.path.join(ACTIVATION_SAVE_PATH, f'gpt2_neutral_activations_layer{TARGET_LAYER_IDX}.pt')
    sae, cfg_dict, sparsity = SAE.from_pretrained(
        release=SAE_RELEASE_NAME,
        sae_id=SAE_ID,
        device='cuda' if torch.cuda.is_available() else 'cpu'
    )
    print(f"Successfully loaded SAE: {SAE_ID} from release {SAE_RELEASE_NAME}")

    personality_acts_list = torch.load(personality_activations_path)
    neutral_acts_list = torch.load(neutral_activations_path)
    print(f"Loaded {len(personality_acts_list)} personality and {len(neutral_acts_list)} neutral activation tensors.")

    personality_acts_tensor = torch.stack([act[0, -1, :] for act in personality_acts_list]).to(sae.device)
    neutral_acts_tensor = torch.stack([act[0, -1, :] for act in neutral_acts_list]).to(sae.device)

    with torch.no_grad():
        personality_feature_acts = sae.encode(personality_acts_tensor)
        neutral_feature_acts = sae.encode(neutral_acts_tensor)
    
    mean_personality_acts = personality_feature_acts.mean(dim=0)
    mean_neutral_acts = neutral_feature_acts.mean(dim=0)
    diff_scores = mean_personality_acts - mean_neutral_acts

    top_feature_indices = torch.topk(diff_scores, TOP_K_FEATURES).indices
    print(f"\nTop {TOP_K_FEATURES} candidate features for '{TARGET_PERSONALITY.upper()}' vs Neutral:")
    print(top_feature_indices.tolist())

except Exception as e:
    print(f"An error occurred during SAE analysis. Please check your setup. Error: {e}")
    exit()


--- SECTION 4.2: FINDING DIFFERENTIATING FEATURES WITH SAE ---
Successfully loaded SAE: blocks.8.hook_resid_pre from release gpt2-small-res-jb
Loaded 100 personality and 100 neutral activation tensors.

Top 10 candidate features for 'EXTRAVERSION' vs Neutral:
[11413, 15778, 9683, 24226, 12609, 15253, 8740, 13038, 11809, 4312]


In [29]:
# ==============================================================================
# SECTION 4.2 (REVISED): FINDING FEATURES WITH A LOGISTIC REGRESSION PROBE
# ==============================================================================
print("\n--- RUNNING REVISED SECTION 4.2: FINDING FEATURES WITH A PROBE ---")

from sklearn.linear_model import LogisticRegression
import numpy as np

try:
    # --- Load the SAE model ---
    sae, cfg_dict, sparsity = SAE.from_pretrained(
        release=SAE_RELEASE_NAME,
        sae_id=SAE_ID,
        device='cuda' if torch.cuda.is_available() else 'cpu'
    )
    print(f"Successfully loaded SAE: {SAE_ID} from release {SAE_RELEASE_NAME}")

    # --- Load saved activations ---
    personality_acts_list = torch.load(personality_activations_path)
    neutral_acts_list = torch.load(neutral_activations_path)
    print(f"Loaded {len(personality_acts_list)} personality and {len(neutral_acts_list)} neutral activation tensors.")

    # We use stack and squeeze to get the right dimensions
    personality_acts_tensor = torch.stack(personality_acts_list).squeeze(1).to(sae.device)
    neutral_acts_tensor = torch.stack(neutral_acts_list).squeeze(1).to(sae.device)

    # --- Get SAE feature activations ---
    with torch.no_grad():
        personality_feature_acts = sae.encode(personality_acts_tensor)
        neutral_feature_acts = sae.encode(neutral_acts_tensor)

    # --- Train a Logistic Regression Probe ---
    print("\nTraining a probe to find the personality signal...")
    
    # Create our training data: X is the feature activations, y is the labels
    X = torch.cat([personality_feature_acts, neutral_feature_acts], dim=0).cpu().numpy()
    y = np.array([1] * len(personality_feature_acts) + [0] * len(neutral_feature_acts))

    # Train the probe
    probe = LogisticRegression(class_weight='balanced', solver='liblinear')
    probe.fit(X, y)
    
    accuracy = probe.score(X, y)
    print(f"Probe trained with an accuracy of: {accuracy:.2%}")

    if accuracy < 0.6:
        print("WARNING: Probe accuracy is low. The signal for personality might be very weak or noisy.")

    # --- Identify top features from the probe's weights ---
    # The coefficients of the trained model tell us which features were most important.
    # The shape is (1, n_features), so we squeeze it.
    probe_weights = probe.coef_.squeeze()
    
    # Get the indices of the features with the highest positive weights (most indicative of personality)
    top_feature_indices = np.argsort(probe_weights)[-TOP_K_FEATURES:]

    print("\n" + "="*50)
    print(f"ANALYSIS COMPLETE: Top {TOP_K_FEATURES} candidate features for '{TARGET_PERSONALITY.upper()}' (found via probe):")
    # Reverse the list to show the highest weight first
    print(top_feature_indices[::-1].tolist())
    print("="*50)

except Exception as e:
    print(f"An error occurred during SAE analysis. Please check your setup. Error: {e}")


--- RUNNING REVISED SECTION 4.2: FINDING FEATURES WITH A PROBE ---
Successfully loaded SAE: blocks.8.hook_resid_pre from release gpt2-small-res-jb
Loaded 100 personality and 100 neutral activation tensors.
An error occurred during SAE analysis. Please check your setup. Error: stack expects each tensor to be equal size, but got [1, 51, 768] at entry 0 and [1, 58, 768] at entry 11


In [26]:
# ------------------------------------------------------------------------------
print("\n--- SECTION 4.4: DEMONSTRATING MANUAL STEERING ---")

layer_path = TARGET_LAYER_NAME.split('.')
target_layer_module = model
for part in layer_path:
    if part.endswith(']'):
        base, index = part.split('[')
        target_layer_module = getattr(target_layer_module, base)[int(index[:-1])]
    else:
        target_layer_module = getattr(target_layer_module, part)

avg_personality_acts = torch.stack([act[0, -1, :] for act in personality_acts_list]).mean(dim=0)
avg_neutral_acts = torch.stack([act[0, -1, :] for act in neutral_acts_list]).mean(dim=0)
steering_vector = (avg_personality_acts - avg_neutral_acts).to(model.device)

def steering_hook_function(module, input, output):
    output[0][:, -1, :] += steering_vector * STEERING_STRENGTH
    return output

test_prompt = "My plan for the weekend is"
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)

print(f"\n--- Generating with STEERING (Strength: {STEERING_STRENGTH}) ---")
steering_hook_handle = target_layer_module.register_forward_hook(steering_hook_function)
with torch.no_grad():
    steered_output = model.generate(**inputs, max_new_tokens=60, repetition_penalty=1.2, pad_token_id=tokenizer.eos_token_id)
steering_hook_handle.remove()
print(f"Steered Output: {tokenizer.decode(steered_output[0], skip_special_tokens=True)}")

print("\n--- Generating VANILLA (for comparison) ---")
with torch.no_grad():
    vanilla_output = model.generate(**inputs, max_new_tokens=60, repetition_penalty=1.2, pad_token_id=tokenizer.eos_token_id)
print(f"Vanilla Output: {tokenizer.decode(vanilla_output[0], skip_special_tokens=True)}")

print("\n\n--- End of Experiment ---")


--- SECTION 4.4: DEMONSTRATING MANUAL STEERING ---

--- Generating with STEERING (Strength: 1.0) ---
Steered Output: My plan for the weekend is to go out and have a beer with my friends. I'm going to be in town on Saturday night, so it's not too bad."
"I don't know if you can say that about me," he said of his new girlfriend who was recently married but has been living at home since

--- Generating VANILLA (for comparison) ---
Vanilla Output: My plan for the weekend is to go out and play a game of Magic. I'm going to be playing with my friends, but we're not really into that kind"
-Barry "Dreadnought" Sutter


--- End of Experiment ---


In [23]:
# ==============================================================================
# SECTION 5: QUANTITATIVE VALIDATION OF STEERING WITH OPINIONQA
# ==============================================================================
print("--- Starting Steering Validation with OpinionQA ---")

from datasets import load_dataset
import re

# --- Configuration for this validation step ---
NUM_SAMPLES_TO_VALIDATE = 50 # How many OpinionQA questions to test
# Ensure this strength is the calibrated one you found from the last step
STEERING_STRENGTH = 1.0 
# This should match the personality of the steering vector you have in memory
TARGET_PERSONALITY_TO_TEST = "extraversion" 

# --- Load the Personality Classifier ---
try:
    print("\n--- Loading Hugging Face personality classifier... ---")
    personality_classifier = pipeline("text-classification", model="holistic-ai/personality_classifier")
    print("Classifier loaded successfully.")
except Exception as e:
    print(f"FATAL: Could not load classifier. Error: {e}")
    exit()

# --- Load and prepare the OpinionQA Dataset ---
try:
    opinionqa_dataset = load_dataset("RiverDong/OpinionQA", split="test")
    df_opinionqa_sample = opinionqa_dataset.to_pandas().sample(NUM_SAMPLES_TO_VALIDATE, random_state=42)
    print(f"Loaded and sampled {len(df_opinionqa_sample)} questions from OpinionQA.")
except Exception as e:
    print(f"FATAL: Could not load OpinionQA dataset. Error: {e}")
    exit()

# --- Helper function for this cell ---
def extract_question_and_choices(prompt_str):
    q_match = re.search(r'<question>(.*?)</question>', prompt_str, re.DOTALL)
    c_match = re.search(r'<choices>(.*?)</choices>', prompt_str, re.DOTALL)
    return (q_match.group(1).strip() if q_match else ""), (c_match.group(1).strip() if c_match else "")


# --- Main Generation Loop ---
# This assumes 'model', 'tokenizer', 'target_layer_module', and 'steering_vector' 
# are already defined and in memory from your previous cells.

all_results = []
conditions_to_run = ["vanilla", f"steered_{TARGET_PERSONALITY_TO_TEST}"]

for condition in conditions_to_run:
    print(f"\n--- Generating responses for condition: {condition.upper()} ---")
    
    hook_handle = None
    if condition.startswith("steered"):
        # The steering_hook_function is already defined in the previous cell
        hook_handle = target_layer_module.register_forward_hook(steering_hook_function)
        print(f"Steering hook ATTACHED with strength {STEERING_STRENGTH}.")

    for _, row in df_opinionqa_sample.iterrows():
        question, choices = extract_question_and_choices(row['prompt'])
        if not question: continue
        
        # We use a neutral prompt that asks for an explanation, which is needed for the classifier
        prompt = f"Question: {question}\nChoices: {choices}\nPlease state your choice and explain your reasoning."
        
        inputs = tokenizer(prompt, return_tensors="pt", max_length=256, truncation=True).to(model.device)
        
        with torch.no_grad():
            output_ids = model.generate(
                **inputs, 
                max_new_tokens=80, 
                repetition_penalty=1.2,
                pad_token_id=tokenizer.eos_token_id
            )
        
        llm_response = tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
        
        all_results.append({
            "condition": condition,
            "llm_raw_response": llm_response.strip()
        })

    if hook_handle:
        hook_handle.remove()
        print("Steering hook REMOVED.")

# --- Classification and Analysis ---
print("\n--- Classifying responses and analyzing results... ---")
df_results = pd.DataFrame(all_results)
responses_to_classify = df_results["llm_raw_response"].tolist()

if responses_to_classify:
    print(f"Classifying {len(responses_to_classify)} total responses...")
    classifier_results = personality_classifier(responses_to_classify)

    df_results["predicted_trait"] = [res['label'] for res in classifier_results]
    
    print("\n" + "="*80)
    print(" STEERING VALIDATION RESULTS")
    print("="*80)
    
    distribution = pd.crosstab(df_results['condition'], df_results['predicted_trait'], normalize='index')
    print("\nDistribution of Predicted Personalities (%):")
    print((distribution * 100).round(1))
    
    steered_df = df_results[df_results["condition"] == f"steered_{TARGET_PERSONALITY_TO_TEST}"]
    correct_predictions = steered_df[steered_df["predicted_trait"] == TARGET_PERSONALITY_TO_TEST]
    
    if not steered_df.empty:
        accuracy = len(correct_predictions) / len(steered_df)
        print(f"\nSteering Alignment Score for '{TARGET_PERSONALITY_TO_TEST.upper()}': {accuracy:.2%}")
        print("(This measures how often the classifier agreed that the steered text matched the target personality)")
else:
    print("No responses were generated to analyze.")

print("\n--- End of Validation ---")

--- Starting Steering Validation with OpinionQA ---

--- Loading Hugging Face personality classifier... ---


Device set to use cpu


Classifier loaded successfully.


Downloading readme: 100%|██████████| 851/851 [00:00<00:00, 4.18kB/s]


Loaded and sampled 50 questions from OpinionQA.

--- Generating responses for condition: VANILLA ---

--- Generating responses for condition: STEERED_EXTRAVERSION ---
Steering hook ATTACHED with strength 1.0.
Steering hook REMOVED.

--- Classifying responses and analyzing results... ---
Classifying 100 total responses...

 STEERING VALIDATION RESULTS

Distribution of Predicted Personalities (%):
predicted_trait       agreeableness  extraversion  neuroticism  openness
condition                                                               
steered_extraversion           16.0           0.0         80.0       4.0
vanilla                         0.0           2.0         98.0       0.0

Steering Alignment Score for 'EXTRAVERSION': 0.00%
(This measures how often the classifier agreed that the steered text matched the target personality)

--- End of Validation ---
