In [14]:
# ==============================================================================
# CELL 1: SETUP AND CONFIGURATION
# ==============================================================================
import gc
import os
from contextlib import nullcontext

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from datasets import load_dataset
from dotenv import load_dotenv
from huggingface_hub import login
from peft import PeftModel
from sklearn.metrics import confusion_matrix
from tqdm.notebook import tqdm  # Use notebook-friendly tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
    logging,
)

# To display plots inline in the notebook
%matplotlib inline
# Set pandas display options for better readability
pd.set_option('display.max_colwidth', 150)
pd.set_option('display.width', 120)

# --- USER CONFIGURATION ---
# Change these variables to control the experiment
MODEL_TO_RUN = "gemma2"  # or "llama3"
BATCH_SIZE = 8          # << IMPORTANT: START WITH A LOW VALUE (4 or 8) TO AVOID OOM ERRORS
N_SAMPLES = 1000        # Number of questions to test alignment on

# --- Load Environment Variables ---
load_dotenv()
hf_home = os.getenv("HF_HOME")
if not hf_home:
    raise ValueError("FATAL: HF_HOME must be set in your .env file.")
os.environ["HUGGINGFACE_HUB_CACHE"] = os.path.join(hf_home, "hub")
print(f"--- Environment configured. HF_HOME is set to: {hf_home} ---")

# --- Central Config Dictionary (from your script) ---
CONFIGS = {
    "gemma2": {
        "model_id": "google/gemma-2-2b-it",
        "peft": {"adapter_dir": "peft_gemma2_personality"},
         "steering": {
            "vector_dir" : "persona_vectors_cache_big_five",
            "settings" : {  "extraversion":      {"layer": 15, "strength": 200.0}, "agreeableness":     {"layer": 10, "strength": 100.0},
               "neuroticism":       {"layer": 15,  "strength": 200.0},"openness":          {"layer": 15, "strength": 110.0},
               "conscientiousness": {"layer": 15, "strength": 250.0},
           },},
        "prompting": {}
    },
    "llama3": { # You would fill this in with your Llama3 configs
        "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
        "peft": {"adapter_dir": "peft_llama3_final"},
        "steering": { "vector_dir": "persona_vectors_cache_big_five", "settings": {} },
        "prompting": {},
    }
}
TARGET_PERSONALITIES = ["extraversion", "agreeableness", "neuroticism", "openness", "conscientiousness"]

# --- BitsAndBytes config for 4-bit quantization ---
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
device_map = {"": 0}

print("Configuration complete.")

--- Environment configured. HF_HOME is set to: /cs/student/projects3/aisd/2024/ghanda/hf_cache ---
Configuration complete.


In [15]:
# ==============================================================================
# CELL 2: LOAD DATASETS & EXAMPLES
# ==============================================================================
print("--- Loading personality examples for full-context prompting ---")
try:
    personality_dataset = load_dataset("holistic-ai/personality_manipulation", split="train")
    df_personality_train = personality_dataset.to_pandas()
    sorted_personalities = sorted(df_personality_train['Target Personality'].unique().tolist())
    
    personality_examples = {}
    for trait in sorted_personalities:
        trait_df = df_personality_train[df_personality_train['Target Personality'] == trait]
        personality_examples[trait] = list(zip(trait_df['Question'], trait_df['Answer']))[:2]
    print(f"Loaded {len(personality_examples)} sets of personality examples successfully.")
except Exception as e:
    print(f"FATAL WARNING: Could not load personality examples. Full-context prompting will fail. Error: {e}")
    personality_examples = None

print(f"\n--- Loading personality classifier ---")
try:
    personality_classifier = pipeline("text-classification", model="holistic-ai/personality_classifier", device=0)
    print("Personality classifier loaded successfully.")
except Exception as e:
    print(f"Could not load personality classifier: {e}")
    personality_classifier = None
    
print(f"\n--- Preparing a fixed subset of up to {N_SAMPLES} questions for alignment test ---")

# Use the 'test' split for generating alignment responses
test_dataset = load_dataset("holistic-ai/personality_manipulation", split="test")

# First, de-duplicate the questions to find our total population
unique_questions_df = test_dataset.to_pandas().drop_duplicates(subset=['Question'])
num_available_questions = len(unique_questions_df)
print(f"Found {num_available_questions} unique questions in the test set.")

# --- THE FIX ---
# Dynamically adjust the number of samples to take.
# Take the smaller value between our desired N_SAMPLES and what's actually available.
num_to_sample = min(N_SAMPLES, num_available_questions)

if num_to_sample < N_SAMPLES:
    print(f"Warning: Desired sample size ({N_SAMPLES}) is too large. Using all {num_to_sample} available unique questions instead.")

# Now, we can safely sample from the unique questions
questions_df = unique_questions_df.sample(n=num_to_sample, random_state=42)
questions = questions_df['Question'].tolist()
print(f"Test questions ready. Final sample size: {len(questions)}")

--- Loading personality examples for full-context prompting ---
Loaded 5 sets of personality examples successfully.

--- Loading personality classifier ---
Personality classifier loaded successfully.

--- Preparing a fixed subset of up to 1000 questions for alignment test ---
Found 200 unique questions in the test set.
Test questions ready. Final sample size: 200


In [16]:
# ==============================================================================
# CELL 3: DEFINE HELPER FUNCTIONS
# ==============================================================================

def create_full_context_prompt(target_personality, all_examples):
    """Creates the context block showing examples of all 5 personality traits."""
    if not all_examples: return "ERROR: Personality examples not loaded."
    full_context_str = ""
    for trait, examples in all_examples.items():
        full_context_str += f"--- EXAMPLES of '{trait}' personality ---\n"
        example_texts = [f"Question: {q}\nAnswer: {a}" for q, a in examples]
        full_context_str += "\n\n".join(example_texts) + "\n\n"
    return (
        "You will be shown examples of five different personality traits to help you understand the differences between them.\n\n"
        f"{full_context_str}"
        "--- YOUR TASK ---\n"
        "Now that you have seen examples of all five personalities, your task is to answer the following question. "
        f"You must adopt the '{target_personality}' personality strongly and clearly in your response."
    )

def create_alignment_prompt(question, tokenizer, method_config={}):
    """Creates the final prompt for the model based on the method."""
    method = method_config.get("method")
    personality = method_config.get("personality")
    
    if method == "Prompting" and personality != "Baseline":
        user_content = create_full_context_prompt(personality, personality_examples)
        user_content += f"\n\nQuestion: {question}"
    else: # For Baseline, PEFT, and Steering, use a simple prompt
        user_content = f"Please answer the following question: {question}"
        
    messages = [{"role": "user", "content": user_content}]
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

def generate_batched_responses(model, tokenizer, prompts, max_new_tokens=150, hook_handle=None):
    """Generates responses for a batch of prompts."""
    try:
        inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
        generation_params = {
            'do_sample': True, 'temperature': 0.7, 'top_p': 0.95,
            'max_new_tokens': max_new_tokens, 'pad_token_id': tokenizer.eos_token_id
        }
        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_params)
        
        responses = tokenizer.batch_decode(outputs[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True)
        return [res.strip() for res in responses]
    finally:
        if hook_handle: hook_handle.remove()

print("Helper functions defined.")

Helper functions defined.


In [17]:
# ==============================================================================
# CELL 4: LOAD MODEL & RUN GENERATION
# ==============================================================================
from tqdm.auto import tqdm # Fallback to auto-detection or basic text version

logging.set_verbosity_error()
MODEL_CONFIG = CONFIGS[MODEL_TO_RUN]

print(f"--- Loading base model: {MODEL_CONFIG['model_id']} ---")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CONFIG['model_id'], padding_side='left')
if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_CONFIG['model_id'],
    quantization_config=bnb_config,
    device_map=device_map,
    attn_implementation="sdpa",
    cache_dir = "/cs/student/projects3/aisd/2024/ghanda/hf_cache",
)

# --- Load PEFT Adapters ---
PEFT_CONFIG = MODEL_CONFIG.get('peft', {})
if PEFT_CONFIG:
    print(f"--- Loading PEFT adapters from: {PEFT_CONFIG['adapter_dir']} ---")
    peft_model, loaded = base_model, False
    for trait in TARGET_PERSONALITIES:
        adapter_path = os.path.join(PEFT_CONFIG['adapter_dir'], trait)
        if not os.path.exists(adapter_path): print(f"Warning: PEFT Adapter for '{trait}' not found."); continue
        if not loaded: peft_model, loaded = PeftModel.from_pretrained(base_model, adapter_path, adapter_name=trait), True
        else: peft_model.load_adapter(adapter_path, adapter_name=trait)
        print(f"Loaded PEFT adapter: '{trait}'")

# --- Load Steering Vectors ---
STEERING_CONFIG = MODEL_CONFIG.get('steering', {})
if STEERING_CONFIG:
    print(f"--- Loading Steering vectors from: {STEERING_CONFIG['vector_dir']} ---")
    vectors_by_trait, overrides = {}, STEERING_CONFIG.get("filename_overrides", {})
    model_fn_id = MODEL_CONFIG['model_id'].split('/')[-1]
    for trait in TARGET_PERSONALITIES:
        filename = overrides.get(trait, f"{trait}_{model_fn_id}.pt")
        vec_path = os.path.join(STEERING_CONFIG['vector_dir'], filename)
        if os.path.exists(vec_path):
            vectors_by_trait[trait] = torch.load(vec_path, weights_only=True) # Use weights_only=True
            print(f"Loaded steering vector for '{trait}'")
        else: print(f"Warning: Steering vector for '{trait}' not found.")

# --- Manually clear any unused memory before starting generation ---
print("\n--- Clearing CUDA cache before generation ---")
torch.cuda.empty_cache()
gc.collect()

# --- Main Generation Loop (WITH NEW CHECKS) ---
all_results = []
# --- We define the methods we *want* to run in general ---
methods_to_run = ['Baseline', 'Prompting', 'Steering', 'PEFT']

for method in methods_to_run:
    # --- NEW: Check if the method is actually configured for the current model ---
    method_key = method.lower() # 'Prompting' -> 'prompting'
    if method != 'Baseline' and method_key not in MODEL_CONFIG:
        print(f"\n{'='*20} SKIPPING METHOD: {method} (not configured for {MODEL_TO_RUN}) {'='*20}")
        continue
    # --- END NEW ---

    for personality in ["Baseline"] + TARGET_PERSONALITIES:
        # Skip redundant baseline runs
        if method != 'Baseline' and personality == 'Baseline':
            continue
        if method == 'Baseline' and personality != 'Baseline':
            continue
        
        print(f"\n{'='*20} GENERATING FOR: {method} - {personality} {'='*20}")
        
        # Select the correct model for the method
        model_for_eval = peft_model if method == 'PEFT' else base_model
        if method == 'PEFT' and personality != 'Baseline':
            model_for_eval.set_adapter(personality)

        # Prepare prompts
        current_config = {"method": method, "personality": personality}
        prompts = [create_alignment_prompt(q, tokenizer, current_config) for q in questions]

        # Generate responses in batches
        for i in tqdm(range(0, len(prompts), BATCH_SIZE), desc=f"  Generating"):
            batch_prompts = prompts[i:i + BATCH_SIZE]
            hook_handle = None
            
            # Apply steering hook if necessary
            if method == 'Steering' and personality != 'Baseline':
                # --- MODIFIED: Added a check for the settings key ---
                STEERING_CONFIG = MODEL_CONFIG.get('steering', {})
                if 'settings' in STEERING_CONFIG and personality in STEERING_CONFIG['settings']:
                    settings = STEERING_CONFIG['settings'][personality]
                    steer_vec = vectors_by_trait.get(personality, {}).get(settings['layer'])
                    if steer_vec is not None:
                        gpu_steer_vec = steer_vec.to(model_for_eval.device)
                        def hook(module, input, output): return output + gpu_steer_vec.to(output.dtype) * settings['strength']
                        target_module = model_for_eval.model.layers[settings['layer']].post_attention_layernorm
                        hook_handle = target_module.register_forward_hook(hook)
                else:
                    # This case should now be caught by the outer loop's check, but this is a safe fallback
                    print(f"Warning: No steering settings for {personality}. Running without steering.")

            responses = generate_batched_responses(model_for_eval, tokenizer, batch_prompts, hook_handle=hook_handle)
            
            # Store results
            for j, response in enumerate(responses):
                question_idx = i + j
                all_results.append({
                    "method": method,
                    "target_personality": personality,
                    "question": questions[question_idx],
                    "generated_answer": response
                })

print("\n\n--- Generation Complete! ---")
results_df = pd.DataFrame(all_results)
display(results_df.head())

--- Loading base model: google/gemma-2-2b-it ---


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.33s/it]


--- Loading PEFT adapters from: peft_gemma2_personality ---
Loaded PEFT adapter: 'extraversion'
Loaded PEFT adapter: 'agreeableness'
Loaded PEFT adapter: 'neuroticism'
Loaded PEFT adapter: 'openness'
Loaded PEFT adapter: 'conscientiousness'
--- Loading Steering vectors from: persona_vectors_cache_big_five ---
Loaded steering vector for 'extraversion'
Loaded steering vector for 'agreeableness'
Loaded steering vector for 'neuroticism'
Loaded steering vector for 'openness'
Loaded steering vector for 'conscientiousness'

--- Clearing CUDA cache before generation ---



  Generating: 100%|██████████| 25/25 [01:00<00:00,  2.43s/it]





  Generating: 100%|██████████| 25/25 [01:20<00:00,  3.22s/it]





  Generating: 100%|██████████| 25/25 [01:10<00:00,  2.81s/it]





  Generating:  84%|████████▍ | 21/25 [01:12<00:13,  3.29s/it]

In [None]:
# ==============================================================================
# CELL 5: CLASSIFY & ANALYZE DISTRIBUTION
# ==============================================================================
if 'results_df' in locals() and not results_df.empty:
    print("--- Classifying all generated responses... ---")
    valid_answers = [ans if ans else " " for ans in results_df['generated_answer'].tolist()]
    
    # Classify in batches for efficiency
    classifier_outputs = []
    for i in tqdm(range(0, len(valid_answers), 32), desc="Classifying"):
        batch = valid_answers[i:i+32]
        outputs = personality_classifier(batch)
        classifier_outputs.extend(outputs)
        
    results_df['predicted_personality'] = [res['label'] for res in classifier_outputs]
    print("Classification complete.")

    print("\n--- Detailed Personality Distribution Summary ---")
    
    # Create the detailed distribution table
    dist_table = pd.crosstab(
        index=[results_df['method'], results_df['target_personality']],
        columns=results_df['predicted_personality'],
        normalize='index'
    )
    dist_table = dist_table.reindex(columns=TARGET_PERSONALITIES, fill_value=0)
    
    # Format for display
    formatted_table = (dist_table * 100).applymap('{:.1f}%'.format)
    formatted_table.columns.name = 'Predicted Trait'
    formatted_table.index.names = ['Method', 'Target Personality']
    
    display(formatted_table)
else:
    print("No results to analyze. Please run the generation cell first.")

In [None]:
# ==============================================================================
# CELL 6: VISUALIZE RESULTS (CONFUSION MATRICES)
# ==============================================================================
if 'results_df' in locals() and 'predicted_personality' in results_df.columns:
    # Create a plot for each method (excluding Baseline)
    for method in ['Prompting', 'Steering', 'PEFT']:
        subset_df = results_df[(results_df['method'] == method) & (results_df['target_personality'] != 'Baseline')]
        
        if subset_df.empty:
            print(f"No data for method '{method}', skipping plot.")
            continue
            
        print(f"\n--- Generating Confusion Matrix for {method} ---")
        
        plt.figure(figsize=(8, 6))
        cm = confusion_matrix(
            subset_df['target_personality'],
            subset_df['predicted_personality'],
            labels=TARGET_PERSONALITIES
        )
        # Normalize by row (true label)
        cm_normalized = cm.astype('float') / np.maximum(cm.sum(axis=1)[:, np.newaxis], 1)
        
        sns.heatmap(
            cm_normalized, annot=True, fmt=".2f", cmap="Blues",
            xticklabels=TARGET_PERSONALITIES, yticklabels=TARGET_PERSONALITIES
        )
        plt.title(f'Confusion Matrix for {method} Method ({MODEL_TO_RUN})')
        plt.ylabel('True (Target) Personality')
        plt.xlabel('Predicted Personality')
        plt.show()
else:
    print("No classified results to visualize. Please run the analysis cell first.")