In [1]:
import os
import torch
import pandas as pd
import json # Not directly used in parts 1 & 2, but for context of full script
import re   # Not directly used in parts 1 & 2, but for context of full script
import random # Not directly used in parts 1 & 2, but for context of full script
import numpy as np # Not directly used in parts 1 & 2, but for context of full script

from datasets import Dataset # No longer load_dataset here, as Personality data is CSV
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline, # Not directly used in parts 1 & 2, but for context of full script
    logging,
)
from peft import LoraConfig, PeftModel # Import PeftModel for loading adapters
from trl import SFTTrainer, SFTConfig # <<< ADDED SFTConfig here

# For logging to Hugging Face Hub during training
from huggingface_hub import login
from datasets import Dataset, load_dataset # load_dataset explicitly for HF datasets
from dotenv import load_dotenv

# ==============================================================================
# INITIAL CONFIGURATION
# ==============================================================================

# --- Load ALL Configurations from .env file ---
load_dotenv()
print("Environment variables from .env file loaded.")

# --- Hugging Face Login ---
try:
    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        login(token=hf_token)
        print("Successfully logged into Hugging Face.")
    else:
        print("Hugging Face token not found. Skipping login to Hub.")
except Exception as e:
    print(f"Could not log into Hugging Face: {e}")

# Set the Hugging Face cache directory early for both models and datasets
# This is where your base Gemma model and dataset will be downloaded.
HF_CACHE_DIR = "/cs/student/projects3/aisd/2024/ghanda/cache"
os.environ["HF_HOME"] = HF_CACHE_DIR
os.makedirs(HF_CACHE_DIR, exist_ok=True) # Ensure the directory exists
print(f"Hugging Face cache directory set to: {HF_CACHE_DIR}")

# --- Core Model Configuration ---
MODEL_NAME = "google/gemma-2-2b"
FINE_TUNED_MODEL_BASE_NAME = "gemma-2b-personality-peft"

# --- QLoRA parameters ---
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# --- bitsandbytes parameters ---
use_4bit = True
# Using bfloat16 as indicated by your GPU support, this is generally preferred.
bnb_4bit_compute_dtype = "bfloat16" 
bnb_4bit_quant_type = "nf4"
use_nested_quant = False 

# --- Updated SFTConfig parameters (must match bnb_4bit_compute_dtype) ---
# CRITICAL: These must match your bnb_4bit_compute_dtype
fp16 = False  # False if using bfloat16
bf16 = True   # True if using bfloat16

# --- SFTConfig parameters (consolidates training and SFT-specific args) ---
OUTPUT_DIR_BASE = "peft_output_models" # Base directory for saving fine-tuned models
num_train_epochs = 2
# fp16 and bf16 should align with bnb_4bit_compute_dtype
fp16 = False # False if bnb_4bit_compute_dtype is bfloat16
bf16 = True  # True if bnb_4bit_compute_dtype is bfloat16
per_device_train_batch_size = 1 # CRUCIAL for memory; keep low
gradient_accumulation_steps = 8 # Effective batch size 1 * 8 = 8 to compensate for low batch size
gradient_checkpointing = True # Highly recommended for memory saving with large models
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_8bit" # Use 8-bit optimizer for memory saving
lr_scheduler_type = "cosine"
max_steps = -1 # Not using max_steps, relying on num_train_epochs
warmup_ratio = 0.03
group_by_length = True
save_steps = 0 # Don't save intermediate checkpoints unless needed
logging_steps = 5 # Log more frequently for small datasets

# SFT-specific args now part of SFTConfig
max_seq_length = 512 # Max length for training sequences. Adjust based on your data.
packing = False      # Whether to group multiple sequences into fixed-length blocks.

# Device mapping
device_map = {"": 0} # Or "auto" for automatic device placement by accelerate/bitsandbytes

# --- Paths to your datasets ---
PERSONALITY_DATASET_NAME = "holistic-ai/personality_manipulation" # NEW: Hugging Face dataset name
BBQ_DATA_FILE = "/cs/student/projects3/aisd/2024/ghanda/bbq_ambiguous_with_metadata.csv"

# --- Azure OpenAI Configuration (Only used for PAE scoring and potentially old prompt methods) ---
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_MODEL_NAME = os.getenv("AZURE_OPENAI_MODEL_NAME")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
AZURE_OPENAI_SUBSCRIPTION_KEY = os.getenv("AZURE_OPENAI_SUBSCRIPTION_KEY")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")

# Initialize Azure OpenAI client if credentials are provided
azure_openai_client = None
if all([AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_MODEL_NAME, AZURE_OPENAI_DEPLOYMENT, AZURE_OPENAI_SUBSCRIPTION_KEY, AZURE_OPENAI_API_VERSION]):
    try:
        from openai import AzureOpenAI
        azure_openai_client = AzureOpenAI(
            api_version=AZURE_OPENAI_API_VERSION,
            azure_endpoint=AZURE_OPENAI_ENDPOINT,
            api_key=AZURE_OPENAI_SUBSCRIPTION_KEY,
        )
        print("Azure OpenAI client initialized for PAE scoring.")
    except Exception as e:
        print(f"Error initializing Azure OpenAI client: {e}. PAE scoring might fail.")
else:
    print("Warning: Azure OpenAI credentials not fully set. PAE scoring might fail.")

LLM_MODEL_FOR_PAE_SCORING = AZURE_OPENAI_DEPLOYMENT # PAE uses Azure OpenAI as specified

The history saving thread hit an unexpected error (OperationalError('disk I/O error')).History will not be written to the database.


  from .autonotebook import tqdm as notebook_tqdm
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Environment variables from .env file loaded.
Successfully logged into Hugging Face.
Hugging Face cache directory set to: /cs/student/projects3/aisd/2024/ghanda/cache
Azure OpenAI client initialized for PAE scoring.


In [2]:
# ==============================================================================
# PART 1: PEFT/QLORA FINE-TUNING FOR EACH PERSONALITY
# ==============================================================================
print("\n--- PART 1: PEFT/QLORA FINE-TUNING ---")

# Load and prepare Personality data (needed to know which personalities to loop through)
print("\n--- Loading and Preparing Personality Data for Fine-tuning ---")
try:
    # Load the 'train' split of the dataset for fine-tuning
    personality_dataset_hf_train = load_dataset(PERSONALITY_DATASET_NAME, split='train')
    # Convert to pandas DataFrame for consistent filtering logic
    df_personality_raw = personality_dataset_hf_train.to_pandas()
    print(f"Loaded '{PERSONALITY_DATASET_NAME}' train split. Number of samples: {len(df_personality_raw)}")
except Exception as e:
    print(f"Error: Could not load '{PERSONALITY_DATASET_NAME}' train split. Error: {e}")
    exit()

target_personalities = df_personality_raw['Target Personality'].unique().tolist()
print(f"Detected personalities for fine-tuning/evaluation: {target_personalities}")

# Check if all adapters already exist
all_adapters_exist = True
for trait in target_personalities:
    adapter_path = os.path.join(OUTPUT_DIR_BASE, trait.lower().replace(" ", "_"))
    if not os.path.exists(adapter_path):
        all_adapters_exist = False
        break

if all_adapters_exist:
    print(f"\n--- All PEFT adapters found in '{OUTPUT_DIR_BASE}'. Skipping fine-tuning. ---")
    # Set a flag to skip the training loop
    SKIP_FINE_TUNING = True
else:
    print(f"\n--- Not all PEFT adapters found in '{OUTPUT_DIR_BASE}'. Proceeding with fine-tuning. ---")
    SKIP_FINE_TUNING = False

# Prepare few-shot examples (for context/old prompt-based methods, not for PEFT inference)
personality_examples_for_eval = {}
for trait in target_personalities:
    trait_df = df_personality_raw[df_personality_raw['Target Personality'] == trait]
    personality_examples_for_eval[trait] = list(zip(trait_df['Question'], trait_df['Answer']))[:5]


# --- Prompt Formatting Function for SFTTrainer (for TRAINING only) ---
def create_training_prompt_format(sample):
    """
    Formats a sample from the personality dataset into the chat template
    expected by Gemma-2B fine-tuning for SFTTrainer.
    """
    formatted_chat = [
        {"role": "user", "content": sample['Question']},
        {"role": "assistant", "content": sample['Answer']}
    ]
    sample["text"] = f"<s>[INST] {sample['Question']} [/INST]{sample['Answer']}</s>"
    return sample

# --- Quantization Configuration ---
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8: 
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Dictionary to store loaded PEFT models for evaluation (populated in Part 2)
peft_models_for_eval = {} 
tokenizer_for_eval = None 

# --- START BASE MODEL & TOKENIZER LOADING (ONCE, ALWAYS REQUIRED FOR EVAL) ---
print(f"Loading base model {MODEL_NAME} and tokenizer once...")
try:
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map=device_map,
        torch_dtype=compute_dtype,  # ADDED: Explicit dtype specification
        trust_remote_code=True
    )
    base_model.config.use_cache = False 
    base_model.config.pretraining_tp = 1 

    if not base_model.training:
        base_model.eval()

    tokenizer_for_eval = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    if tokenizer_for_eval.pad_token is None:
        tokenizer_for_eval.pad_token = tokenizer_for_eval.eos_token
    tokenizer_for_eval.padding_side = "right" 
    
    tokenizer_for_eval.model_max_length = max_seq_length 
    print(f"Tokenizer model_max_length set to: {tokenizer_for_eval.model_max_length}")

    # Set the chat_template explicitly for Gemma-2B, crucial for apply_chat_template
    tokenizer_for_eval.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'tool' %}{{ '<tool_code>' + message['content'] + '</tool_code>' }}{% elif message['role'] == 'assistant' %}{{ message['content'] }}{% else %}{{ raise_exception('Unknown role: ' + message['role']) }}{% endif %}{% endfor %}"
    print("Tokenizer chat_template set for Gemma-2B.")

    print("Base model and tokenizer loaded successfully.")
except Exception as e:
    print(f"FATAL: Could not load base model or tokenizer. Error: {e}")
    print("This often means the model is too large for your GPU's VRAM even with quantization.")
    print("Consider reducing per_device_train_batch_size, increasing gradient_accumulation_steps, or using a smaller model.")
    exit()

logging.set_verbosity_warning()


# --- Conditional Fine-tuning Loop ---
if not SKIP_FINE_TUNING:
    print("\n--- Starting Fine-tuning for Each Personality ---")
    for current_trait in target_personalities:
        print(f"\n***** FINE-TUNING FOR PERSONALITY: {current_trait.upper()} *****")

        filtered_df = df_personality_raw[df_personality_raw['Target Personality'] == current_trait].copy()
        if filtered_df.empty:
            print(f"No data found for personality '{current_trait}'. Skipping fine-tuning.")
            continue

        train_dataset = Dataset.from_pandas(filtered_df).map(create_training_prompt_format)
        train_dataset = train_dataset.remove_columns([col for col in train_dataset.column_names if col != 'text'])

        print(f"Prepared {len(train_dataset)} samples for '{current_trait}' fine-tuning.")
        if len(train_dataset) > 0:
            print(f"Sample formatted text for '{current_trait}':\n{train_dataset[0]['text']}")

        current_output_dir = os.path.join(OUTPUT_DIR_BASE, current_trait.lower().replace(" ", "_"))
        new_model_name_for_hub = f"{FINE_TUNED_MODEL_BASE_NAME}-{current_trait.lower().replace(' ', '-')}"
        os.makedirs(current_output_dir, exist_ok=True)

        peft_config = LoraConfig(
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            r=lora_r,
            bias="none",
            task_type="CAUSAL_LM",
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj",
            ],
        )
        
        training_args_sft = SFTConfig(
            output_dir=current_output_dir,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            optim=optim,
            save_steps=save_steps,
            logging_steps=logging_steps,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            fp16=fp16,
            bf16=bf16,
            max_grad_norm=max_grad_norm,
            max_steps=max_steps,
            warmup_ratio=warmup_ratio,
            group_by_length=group_by_length,
            lr_scheduler_type=lr_scheduler_type,
            report_to="tensorboard", 
            logging_dir=f"{current_output_dir}/logs",
            remove_unused_columns=False, 
            gradient_checkpointing=gradient_checkpointing,

            max_length=max_seq_length,      
            packing=packing,                
            dataset_text_field="text",      
        )

        trainer = SFTTrainer(
            model=base_model, 
            train_dataset=train_dataset,
            peft_config=peft_config,
            args=training_args_sft, 
        )

        print(f"Training PEFT model for {current_trait}...")
        trainer.train()

        print(f"Saving PEFT adapter for {current_trait} to {current_output_dir}...")
        trainer.model.save_pretrained(current_output_dir)
        
        try:
            trainer.model.push_to_hub(new_model_name_for_hub)
            print(f"Successfully pushed {new_model_name_for_hub} to Hugging Face Hub.")
        except Exception as e:
            print(f"Warning: Could not push {new_model_name_for_hub} to Hub. Error: {e}")

        del trainer
        torch.cuda.empty_cache()
        print(f"Finished fine-tuning for {current_trait}. GPU cache cleared.")

    tokenizer_for_eval.save_pretrained(os.path.join(OUTPUT_DIR_BASE, "tokenizer"))
    print(f"Tokenizer saved to {os.path.join(OUTPUT_DIR_BASE, 'tokenizer')}")
    print("\n--- All PEFT Fine-tuning completed. ---")
else:
    print("\n--- Skipping fine-tuning as all adapters are already present. ---")


--- PART 1: PEFT/QLORA FINE-TUNING ---

--- Loading and Preparing Personality Data for Fine-tuning ---
Loaded 'holistic-ai/personality_manipulation' train split. Number of samples: 4000
Detected personalities for fine-tuning/evaluation: ['extraversion', 'agreeableness', 'neuroticism', 'openness', 'conscientiousness']

--- All PEFT adapters found in 'peft_output_models'. Skipping fine-tuning. ---
Loading base model google/gemma-2-2b and tokenizer once...


Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.70it/s]


Tokenizer model_max_length set to: 512
Tokenizer chat_template set for Gemma-2B.
Base model and tokenizer loaded successfully.

--- Skipping fine-tuning as all adapters are already present. ---


In [3]:
# ==============================================================================
# PART 2: LOAD FINE-TUNED PEFT MODELS FOR EVALUATION - FIXED MEMORY MANAGEMENT
# ==============================================================================
print("\n--- PART 2: Loading Fine-tuned PEFT Models for Evaluation (Fixed Memory Management) ---")

# Clear all GPU memory before starting
torch.cuda.empty_cache()
print("GPU cache cleared before loading adapters.")

# Store loaded adapters for memory-efficient switching
loaded_adapters = {}
peft_model = None  # Single PEFT model instance

# Load adapters one by one into the same base model
print("Loading adapters into single PEFT model instance...")
is_first_adapter = True

for current_trait in target_personalities:
    adapter_name = current_trait.lower().replace(" ", "_")
    adapter_path = os.path.join(OUTPUT_DIR_BASE, adapter_name)
    
    if os.path.exists(adapter_path):
        print(f"Loading adapter for {current_trait} from {adapter_path}...")
        
        try:
            if is_first_adapter:
                # Create the first PEFT model instance
                peft_model = PeftModel.from_pretrained(
                    base_model, 
                    adapter_path, 
                    adapter_name=adapter_name,
                    torch_dtype=compute_dtype
                )
                is_first_adapter = False
                print(f"First adapter '{adapter_name}' loaded and PEFT model created.")
            else:
                # Load additional adapters into the same model
                peft_model.load_adapter(adapter_path, adapter_name=adapter_name)
                print(f"Additional adapter '{adapter_name}' loaded.")
            
            loaded_adapters[current_trait] = adapter_name
            
            # Clear cache after each adapter load
            torch.cuda.empty_cache()
            
        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                print(f"CUDA OOM loading adapter for {current_trait}. Skipping...")
                torch.cuda.empty_cache()
            else:
                print(f"Error loading adapter for {current_trait}: {e}")
        except Exception as e:
            print(f"Error loading adapter for {current_trait}: {e}")
    else:
        print(f"Adapter not found for {current_trait} at {adapter_path}")

# Create a clean base model instance for neutral responses
print("\nCreating neutral model instance...")
neutral_model = None

try:
    # Don't create a new model - use the base model with no active adapter
    neutral_model = base_model  # Use the original base model
    print("Using original base model for neutral responses.")
except Exception as e:
    print(f"Error setting up neutral model: {e}")
    neutral_model = base_model

print(f"Loaded adapters: {list(loaded_adapters.keys())}")
print(f"Total adapters loaded: {len(loaded_adapters)}")

# Final memory cleanup
torch.cuda.empty_cache()


--- PART 2: Loading Fine-tuned PEFT Models for Evaluation (Fixed Memory Management) ---
GPU cache cleared before loading adapters.
Loading adapters into single PEFT model instance...
Loading adapter for extraversion from peft_output_models/extraversion...
First adapter 'extraversion' loaded and PEFT model created.
Loading adapter for agreeableness from peft_output_models/agreeableness...
Additional adapter 'agreeableness' loaded.
Loading adapter for neuroticism from peft_output_models/neuroticism...
Additional adapter 'neuroticism' loaded.
Loading adapter for openness from peft_output_models/openness...
Additional adapter 'openness' loaded.
Loading adapter for conscientiousness from peft_output_models/conscientiousness...
CUDA OOM loading adapter for conscientiousness. Skipping...

Creating neutral model instance...
Using original base model for neutral responses.
Loaded adapters: ['extraversion', 'agreeableness', 'neuroticism', 'openness']
Total adapters loaded: 4


In [None]:
# ==============================================================================
# PART 3: EVALUATION WITH PEFT MODELS - FIXED ADAPTER MANAGEMENT
# ==============================================================================
print("\n--- PART 3: Starting Evaluation with PEFT Models ---")

# --- Import missing load_dataset ---
from datasets import load_dataset  # Add this import at the top of your file

# --- Trait Definitions and Other Setup Code (unchanged) ---
trait_definitions = {
    "openness": "Reflects the degree of intellectual curiosity, creativity, and preference for novelty and variety.",
    "conscientiousness": "Reflects a tendency to be organized, dependable, and show self-discipline.",
    "extraversion": "Reflects a tendency to be outgoing, energetic, and seek the company of others.",
    "agreeableness": "Reflects a tendency to be compassionate and cooperative toward others.",
    "neuroticism": "Reflects a tendency to experience unpleasant emotions easily, such as anger, anxiety, or depression.",
}

def create_inference_prompt_format(question):
    """
    Formats a question for inference with a fine-tuned Gemma-2B PEFT model.
    No personality instruction or few-shot examples are needed here, as the personality
    is encoded in the fine-tuned adapter weights.
    """
    return tokenizer_for_eval.apply_chat_template(
        [{"role": "user", "content": question}],
        tokenize=False,
        add_special_tokens=True 
    )

def get_peft_llm_response(model, tokenizer, question, max_new_tokens=200, temperature=0.7, top_p=0.9, do_sample=True):
    """
    Function to get a response from a loaded PEFT LLM.
    """
    current_active_adapter_state = getattr(model, 'active_adapter', None)
    
    active_model_id_for_logging = 'base_model (no adapter active)'
    if current_active_adapter_state is not None:
        active_model_id_for_logging = current_active_adapter_state
    
    print(f"--- Calling PEFT LLM (Personality: {active_model_id_for_logging}) ---")

    full_input_prompt = create_inference_prompt_format(question)
    input_ids = tokenizer(full_input_prompt, return_tensors="pt").to(model.device)

    generation_config = model.generation_config
    if generation_config is None:
        generation_config = tokenizer.generation_config
    
    generation_config.temperature = temperature
    generation_config.top_p = top_p
    generation_config.do_sample = do_sample
    generation_config.max_new_tokens = max_new_tokens
    generation_config.pad_token_id = tokenizer.pad_token_id
    generation_config.eos_token_id = tokenizer.eos_token_id

    try:
        with torch.no_grad():
            outputs = model.generate(
                **input_ids,
                generation_config=generation_config
            )
        generated_text = tokenizer.decode(outputs[0][len(input_ids["input_ids"][0]):], skip_special_tokens=True)
        return generated_text.strip()
    except Exception as e:
        print(f"Error during PEFT model generation: {e}")
        return "ERROR: PEFT model generation failed."

# --- Helper functions (unchanged) ---
def extract_question_and_choices(full_prompt_string):
    question_match = re.search(r'<question>(.*?)</question>', full_prompt_string, re.DOTALL)
    choices_match = re.search(r'<choices>(.*?)</choices>', full_prompt_string, re.DOTALL)
    question_text = question_match.group(1).strip() if question_match else ""
    choices_raw_str = choices_match.group(1).strip() if choices_match else ""
    return question_text, choices_raw_str

def parse_choices_string(choices_str):
    parsed_choices = []
    matches = re.findall(r'\([A-Z]\):\s*(.*?)(?=\s*\([A-Z]\):\s*|$)', choices_str)
    for match in matches:
        parsed_choices.append(match.strip())
    return parsed_choices if parsed_choices else [choices_str]

def categorize_opinionqa_response(raw_response, choices_list):
    raw_response_lower = raw_response.lower()
    for choice in choices_list:
        if re.search(r'\b' + re.escape(choice.lower()) + r'\b', raw_response_lower):
            return choice
    if any(word in raw_response_lower for word in ["yes", "agree", "positive", "positively"]):
        if "Yes" in choices_list: return "Yes"
        if "Agree" in choices_list: return "Agree"
        if "Strongly Agree" in choices_list: return "Strongly Agree"
    if any(word in raw_response_lower for word in ["no", "disagree", "negative", "negatively"]):
        if "No" in choices_list: return "No"
        if "Disagree" in choices_list: return "Disagree"
        if "Strongly Disagree" in choices_list: return "Strongly Disagree"
    if any(word in raw_response_lower for word in ["neutral", "balanced", "both", "neither"]):
        if "Neutral" in choices_list: return "Neutral"
        if "Seek Balance" in choices_list: return "Seek Balance"
        if "Uncategorized" in choices_list: return "Uncategorized"
    return "Uncategorized"

def map_categorized_to_label(categorized_text, choices_raw_str):
    if not isinstance(choices_raw_str, str):
        return 'UNKNOWN'
    matches = re.findall(r'\((\w)\):\s*(.*?)(?=\s*\([A-Z]\):|$)', choices_raw_str)
    for letter, choice_text in matches:
        if categorized_text.lower() == choice_text.strip().lower():
            return letter.upper()
    return 'UNKNOWN'

# --- Load Hugging Face Personality Classifier ---
print("\n--- Loading Hugging Face Personality Classifier ---")
try:
    personality_classifier = pipeline("text-classification", model="holistic-ai/personality_classifier")
    print("Hugging Face personality classifier loaded successfully.")
except Exception as e:
    print(f"Error loading Hugging Face classifier: {e}")
    personality_classifier = lambda text: [{'label': 'unknown', 'score': 0.0, 'error': str(e)}]


--- PART 3: Starting Evaluation with PEFT Models ---

--- Loading Hugging Face Personality Classifier ---


Device set to use cuda:0


Hugging Face personality classifier loaded successfully.


In [12]:
# ==============================================================================
# PART 3 v2: FIXED EVALUATION WITH PROPER ADAPTER MANAGEMENT
# ==============================================================================
print("\n--- PART 3: Fixed Evaluation with Proper Adapter Management ---")

def switch_to_personality(personality_trait):
    """
    Switch the model to use a specific personality adapter or neutral mode.
    Returns the appropriate model instance.
    """
    global peft_model, neutral_model
    
    if personality_trait == "neutral":
        # For neutral, we need to ensure no adapter is active
        if peft_model is not None and hasattr(peft_model, 'disable_adapters'):
            # Use context manager to temporarily disable adapters
            return neutral_model, "neutral_context"
        else:
            return neutral_model, "neutral_base"
    else:
        # For personality traits, switch to the appropriate adapter
        if peft_model is not None and personality_trait in loaded_adapters:
            adapter_name = loaded_adapters[personality_trait]
            try:
                peft_model.set_adapter(adapter_name)
                return peft_model, adapter_name
            except Exception as e:
                print(f"Error switching to adapter {adapter_name}: {e}")
                return peft_model, "error_fallback"
        else:
            print(f"Adapter not found for {personality_trait}, using base model")
            return neutral_model, "fallback_base"

def get_peft_llm_response_fixed(personality_trait, question, max_new_tokens=200, temperature=0.7, top_p=0.9, do_sample=True):
    """
    Fixed function to get response from PEFT model with proper adapter management.
    """
    # Switch to the appropriate model/adapter
    current_model, adapter_status = switch_to_personality(personality_trait)
    
    print(f"--- Calling PEFT LLM (Personality: {personality_trait}, Status: {adapter_status}) ---")
    
    # Create input prompt
    full_input_prompt = create_inference_prompt_format(question)
    
    try:
        # Tokenize input
        input_ids = tokenizer_for_eval(full_input_prompt, return_tensors="pt").to(current_model.device)
        
        # Generate response
        with torch.no_grad():
            # Use disable_adapters context for neutral responses
            if personality_trait == "neutral" and hasattr(current_model, 'disable_adapters'):
                with current_model.disable_adapters():
                    outputs = current_model.generate(
                        **input_ids,
                        max_new_tokens=max_new_tokens,
                        temperature=temperature,
                        top_p=top_p,
                        do_sample=do_sample,
                        pad_token_id=tokenizer_for_eval.pad_token_id,
                        eos_token_id=tokenizer_for_eval.eos_token_id
                    )
            else:
                outputs = current_model.generate(
                    **input_ids,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=do_sample,
                    pad_token_id=tokenizer_for_eval.pad_token_id,
                    eos_token_id=tokenizer_for_eval.eos_token_id
                )
        
        # Decode response
        generated_text = tokenizer_for_eval.decode(
            outputs[0][len(input_ids["input_ids"][0]):], 
            skip_special_tokens=True
        )
        
        return generated_text.strip()
        
    except Exception as e:
        print(f"Error during generation for {personality_trait}: {e}")
        return f"ERROR: Generation failed for {personality_trait}"


--- PART 3: Fixed Evaluation with Proper Adapter Management ---


In [None]:
# --- OpinionQA Evaluation with FIXED Adapter Management ---
print("\n--- Step 5: Verifying Personality Change with OpinionQA (Using PEFT Models) ---")

# Load OpinionQA dataset
try:
    opinionqa_dataset = load_dataset("RiverDong/OpinionQA", split="test")
    print(f"OpinionQA dataset loaded from Hugging Face. Number of samples: {len(opinionqa_dataset)}")
    df_opinionqa_questions = opinionqa_dataset.to_pandas()
    print("\n--- Inspecting OpinionQA Dataset Columns ---")
    print(df_opinionqa_questions.head(1).T)
except Exception as e:
    print(f"Error loading OpinionQA dataset: {e}. Falling back to dummy OpinionQA dataset.")
    df_opinionqa_questions = pd.DataFrame([
        {"id": "op1", "prompt": "<persona>...</persona> <question>Do you believe social media positively impacts society?</question> <choices>(A): Yes (B): No (C): Neutral</choices>", "answer": "A", "question_id": "DUMMY_Q1"},
        {"id": "op2", "prompt": "<persona>...</persona> <question>Is it important for a leader to prioritize group harmony?</question> <choices>(A): Strongly Agree (B): Disagree</choices>", "answer": "B", "question_id": "DUMMY_Q2"},
    ])

all_opinionqa_results = []
all_opinion_qa_personalities = ["neutral"] + list(target_personalities) 

# FIXED: Create a separate neutral model for true neutral responses
neutral_model = None
if hasattr(base_model, 'peft_config') and base_model.peft_config:
    # Create a fresh copy of the base model without any adapters for neutral
    print("Creating separate neutral model without adapters...")
    try:
        neutral_model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=bnb_config,
            device_map=device_map
        )
        neutral_model.config.use_cache = False 
        neutral_model.config.pretraining_tp = 1
        print("Neutral model created successfully.")
    except Exception as e:
        print(f"Warning: Could not create separate neutral model: {e}")
        print("Will use base model with disabled adapter as fallback.")
        neutral_model = base_model

for personality_trait in all_opinion_qa_personalities:
    print(f"\n--- Running OpinionQA for Personality: {personality_trait} ---")
    
    # FIXED: Use appropriate model for each personality
    if personality_trait == "neutral":
        current_model_for_inference = neutral_model
        print("Using dedicated neutral model (no adapters).")
    else:
        current_model_for_inference = base_model
        adapter_id = personality_trait.lower().replace(" ", "_") 
        if adapter_id in current_model_for_inference.peft_config: 
            current_model_for_inference.set_adapter(adapter_id)
            print(f"Activated adapter: {adapter_id}")
        else:
            print(f"Warning: Adapter '{adapter_id}' not found for {personality_trait}. Using base model.")

    # Debug check
    if personality_trait == "neutral":
        debug_active_adapter = "neutral_model (no adapters)"
    else:
        debug_active_adapter = getattr(current_model_for_inference, 'active_adapter', 'Not a PeftModel or no adapter set')
    print(f"DEBUG: Current model active_adapter is: {debug_active_adapter}")

    N_OPINIONQA_SAMPLES_PER_PERSONALITY = 20
    opinionqa_subset_for_testing = df_opinionqa_questions.sample(
        min(len(df_opinionqa_questions), N_OPINIONQA_SAMPLES_PER_PERSONALITY), 
        random_state=42
    )

    responses_to_classify_batch = []
    metadata_for_batch = []

    for index, item_row in opinionqa_subset_for_testing.iterrows():
        full_prompt_from_dataset = item_row['prompt']
        question_id = item_row['question_id']
        human_true_answer_label = item_row['answer']

        question_text, choices_raw_str = extract_question_and_choices(full_prompt_from_dataset)
        parsed_choices = parse_choices_string(choices_raw_str)
        
        llm_response = get_peft_llm_response(
            model=current_model_for_inference,
            tokenizer=tokenizer_for_eval,
            question=f"Question: {question_text}\nChoices: {choices_raw_str}\nAnswer:",
            max_new_tokens=100, 
            temperature=0.7 
        )

        responses_to_classify_batch.append(llm_response)
        metadata_for_batch.append({
            "intended_personality": personality_trait,
            "question_id": question_id,
            "full_dataset_prompt": full_prompt_from_dataset,
            "extracted_question": question_text,
            "choices_raw_str": choices_raw_str,
            "parsed_choices": parsed_choices,
            "human_true_answer_label": human_true_answer_label,
            "llm_raw_response": llm_response,
            "llm_categorized_response_oq": categorize_opinionqa_response(llm_response, parsed_choices)
        })

    if responses_to_classify_batch:
        print(f"Batch classifying {len(responses_to_classify_batch)} responses for '{personality_trait}'...")
        batched_classifier_results = personality_classifier(responses_to_classify_batch)
        
        for i, class_result in enumerate(batched_classifier_results):
            metadata_entry = metadata_for_batch[i]
            metadata_entry["llm_predicted_trait_TA"] = class_result['label']
            metadata_entry["llm_predicted_trait_TA_confidence"] = class_result['score']
            all_opinionqa_results.append(metadata_entry)

# Rest of the analysis code remains the same...
df_opinionqa_results = pd.DataFrame(all_opinionqa_results)
print("\n--- Raw OpinionQA Results Sample (First 5 rows) ---")
print(df_opinionqa_results.head())


--- Step 5: Verifying Personality Change with OpinionQA (Using PEFT Models) ---
OpinionQA dataset loaded from Hugging Face. Number of samples: 294714

--- Inspecting OpinionQA Dataset Columns ---
                                                                   0
prompt             <persona>\nRacially, the person is refused. Th...
answer                                                             B
uid                                   American_Trends_Panel_W92_6823
folder                                     American_Trends_Panel_W92
question_id                                            BIGHOUSES_W92
__index_level_0__                                             460290
Creating separate neutral model without adapters...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


Will use base model with disabled adapter as fallback.

--- Running OpinionQA for Personality: neutral ---
Using dedicated neutral model (no adapters).
DEBUG: Current model active_adapter is: neutral_model (no adapters)
--- Calling PEFT LLM (Personality: extraversion) ---
--- Calling PEFT LLM (Personality: extraversion) ---
--- Calling PEFT LLM (Personality: extraversion) ---
--- Calling PEFT LLM (Personality: extraversion) ---
--- Calling PEFT LLM (Personality: extraversion) ---
--- Calling PEFT LLM (Personality: extraversion) ---
--- Calling PEFT LLM (Personality: extraversion) ---
--- Calling PEFT LLM (Personality: extraversion) ---
--- Calling PEFT LLM (Personality: extraversion) ---
--- Calling PEFT LLM (Personality: extraversion) ---
--- Calling PEFT LLM (Personality: extraversion) ---
--- Calling PEFT LLM (Personality: extraversion) ---
--- Calling PEFT LLM (Personality: extraversion) ---
--- Calling PEFT LLM (Personality: extraversion) ---
--- Calling PEFT LLM (Personality: ext

KeyboardInterrupt: 

In [13]:
# ==============================================================================
# PART 5 V2: FIXED OPINIONQA EVALUATION
# ==============================================================================
print("\n--- Fixed OpinionQA Evaluation ---")

# Load OpinionQA dataset (keeping original code)
try:
    opinionqa_dataset = load_dataset("RiverDong/OpinionQA", split="test")
    print(f"OpinionQA dataset loaded. Number of samples: {len(opinionqa_dataset)}")
    df_opinionqa_questions = opinionqa_dataset.to_pandas()
except Exception as e:
    print(f"Error loading OpinionQA dataset: {e}. Using dummy data.")
    df_opinionqa_questions = pd.DataFrame([
        {"id": "op1", "prompt": "<question>Do you believe social media positively impacts society?</question> <choices>(A): Yes (B): No (C): Neutral</choices>", "answer": "A", "question_id": "DUMMY_Q1"},
        {"id": "op2", "prompt": "<question>Is it important for a leader to prioritize group harmony?</question> <choices>(A): Strongly Agree (B): Disagree</choices>", "answer": "B", "question_id": "DUMMY_Q2"},
    ])

# Run evaluation for all personalities
all_opinionqa_results = []
all_personalities = ["neutral"] + list(target_personalities)

for personality_trait in all_personalities:
    print(f"\n--- Running OpinionQA for Personality: {personality_trait} ---")
    
    # Clear cache before each personality evaluation
    torch.cuda.empty_cache()
    
    N_SAMPLES = 10  # Reduced for testing
    opinionqa_subset = df_opinionqa_questions.sample(
        min(len(df_opinionqa_questions), N_SAMPLES), 
        random_state=42
    )
    
    responses_batch = []
    metadata_batch = []
    
    for index, row in opinionqa_subset.iterrows():
        full_prompt = row['prompt']
        question_id = row['question_id']
        true_answer = row['answer']
        
        # Extract question and choices
        question_text, choices_str = extract_question_and_choices(full_prompt)
        parsed_choices = parse_choices_string(choices_str)
        
        # Get LLM response with fixed adapter management
        llm_response = get_peft_llm_response_fixed(
            personality_trait=personality_trait,
            question=f"Question: {question_text}\nChoices: {choices_str}\nAnswer:",
            max_new_tokens=100,
            temperature=0.7
        )
        
        responses_batch.append(llm_response)
        metadata_batch.append({
            "intended_personality": personality_trait,
            "question_id": question_id,
            "extracted_question": question_text,
            "choices_raw_str": choices_str,
            "parsed_choices": parsed_choices,
            "human_true_answer_label": true_answer,
            "llm_raw_response": llm_response,
            "llm_categorized_response": categorize_opinionqa_response(llm_response, parsed_choices)
        })
    
    # Batch classify responses
    if responses_batch:
        print(f"Classifying {len(responses_batch)} responses for {personality_trait}...")
        try:
            classifier_results = personality_classifier(responses_batch)
            
            for i, result in enumerate(classifier_results):
                metadata_batch[i]["predicted_trait"] = result['label']
                metadata_batch[i]["prediction_confidence"] = result['score']
                all_opinionqa_results.append(metadata_batch[i])
                
        except Exception as e:
            print(f"Error in personality classification: {e}")
            for metadata in metadata_batch:
                metadata["predicted_trait"] = "classification_error"
                metadata["prediction_confidence"] = 0.0
                all_opinionqa_results.append(metadata)
    
    print(f"Completed evaluation for {personality_trait}")

# Analysis of results
print("\n--- Analysis of OpinionQA Results ---")
if all_opinionqa_results:
    df_results = pd.DataFrame(all_opinionqa_results)
    
    print(f"Total responses collected: {len(df_results)}")
    print(f"Personalities evaluated: {df_results['intended_personality'].unique()}")
    
    # Trait alignment analysis
    non_neutral_results = df_results[df_results['intended_personality'] != 'neutral']
    if not non_neutral_results.empty:
        correct_predictions = non_neutral_results[
            non_neutral_results['intended_personality'] == non_neutral_results['predicted_trait']
        ]
        trait_alignment_score = len(correct_predictions) / len(non_neutral_results)
        print(f"\nOverall Trait Alignment Score: {trait_alignment_score:.3f}")
        
        # Per-trait analysis
        trait_scores = non_neutral_results.groupby('intended_personality').apply(
            lambda x: (x['intended_personality'] == x['predicted_trait']).mean()
        )
        print("\nTrait Alignment per Personality:")
        for trait, score in trait_scores.items():
            print(f"  {trait}: {score:.3f}")
    
    # Memory status
    if torch.cuda.is_available():
        print(f"\nFinal GPU Memory - Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        print(f"Final GPU Memory - Reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
        
    print("\n--- OpinionQA Evaluation Complete ---")
else:
    print("No results collected. Please check the evaluation process.")

# Final cleanup
torch.cuda.empty_cache()
print("Final GPU cache cleanup completed.")


--- Fixed OpinionQA Evaluation ---
OpinionQA dataset loaded. Number of samples: 294714

--- Running OpinionQA for Personality: neutral ---
--- Calling PEFT LLM (Personality: neutral, Status: neutral_context) ---
Error during generation for neutral: No adapter loaded. Please load an adapter first.
--- Calling PEFT LLM (Personality: neutral, Status: neutral_context) ---
Error during generation for neutral: No adapter loaded. Please load an adapter first.
--- Calling PEFT LLM (Personality: neutral, Status: neutral_context) ---
Error during generation for neutral: No adapter loaded. Please load an adapter first.
--- Calling PEFT LLM (Personality: neutral, Status: neutral_context) ---
Error during generation for neutral: No adapter loaded. Please load an adapter first.
--- Calling PEFT LLM (Personality: neutral, Status: neutral_context) ---
Error during generation for neutral: No adapter loaded. Please load an adapter first.
--- Calling PEFT LLM (Personality: neutral, Status: neutral_conte

KeyboardInterrupt: 

In [None]:
print("\n--- Analyzing OpinionQA Trait Alignment (TA) ---")
if not df_opinionqa_results.empty:
    df_ta_analysis = df_opinionqa_results[df_opinionqa_results['intended_personality'] != 'neutral'].copy()

    if not df_ta_analysis.empty:
        correct_ta_predictions = df_ta_analysis[df_ta_analysis['intended_personality'] == df_ta_analysis['llm_predicted_trait_TA']]
        overall_ta_score = len(correct_ta_predictions) / len(df_ta_analysis)
        print(f"Overall Trait Alignment (TA) Score (excluding 'neutral'): {overall_ta_score:.3f}")

        ta_per_trait = df_ta_analysis.groupby('intended_personality').apply(
            lambda x: (x['intended_personality'] == x['llm_predicted_trait_TA']).mean()
        )
        print("\nTrait Alignment (TA) Score per Personality:")
        print(ta_per_trait)
    else:
        print("No non-neutral results to analyze for Trait Alignment.")

    print("\n--- OpinionQA Content Analysis (Conceptual for Opinion Alignment) ---")
    print("Example: Distribution of LLM categorized responses for a question across personalities:")
    first_q_id = df_opinionqa_results['question_id'].iloc[0] if not df_opinionqa_results.empty else "N/A"
    print(f"LLM responses for question_id: '{first_q_id}'")
    print(df_opinionqa_results[df_opinionqa_results['question_id'] == first_q_id].groupby('intended_personality')['llm_categorized_response_oq'].value_counts(normalize=True))
    
    print("\nTo compare LLM opinions with human responses, you would:")
    print("1. Map the LLM's 'llm_categorized_response_oq' to its corresponding raw label (e.g., 'Yes' to 'A').")
    print("2. Compare the distribution of LLM's mapped labels with the 'human_true_answer_label' distribution for each question.")
else:
    print("No OpinionQA results to analyze. Please ensure the experiment runs and gathers data.")

print("\n--- Misclassification Analysis (LLM Predicted Trait vs. Intended Trait) ---")
if 'df_ta_analysis' in locals() and not df_ta_analysis.empty:
    misclassified_df = df_ta_analysis[
        df_ta_analysis['intended_personality'] != df_ta_analysis['llm_predicted_trait_TA']
    ].copy()

    if not misclassified_df.empty:
        print("\n--- Common Misclassifications ---")
        misclassification_summary = misclassified_df.groupby(['intended_personality', 'llm_predicted_trait_TA']).size().reset_index(name='count')
        print(misclassification_summary.sort_values(by='count', ascending=False))

        print("\n--- Examples of Misclassified Responses ---")
        traits_to_inspect = misclassified_df['intended_personality'].unique().tolist()
        
        print(f"Analyzing misclassifications for: {traits_to_inspect}")

        for trait in traits_to_inspect:
            print(f"\nMisclassifications for Intended Personality: '{trait}'")
            trait_misclassifications = misclassified_df[misclassified_df['intended_personality'] == trait]
            
            if not trait_misclassifications.empty:
                print(trait_misclassifications[['question_id', 'llm_raw_response', 'llm_predicted_trait_TA', 'llm_predicted_trait_TA_confidence']].head(3).to_string())
            else:
                print(f"No misclassifications found for intended personality '{trait}'.")
    else:
        print("Excellent! No misclassifications found in the non-neutral data.")

else:
    print("No data available for misclassification analysis (df_ta_analysis not found or is empty).")


print("\n--- Opinion Alignment Analysis (LLM Categorized vs. Human True Answer) ---")

df_opinionqa_results['llm_mapped_answer_label'] = df_opinionqa_results.apply(
    lambda row: map_categorized_to_label(row['llm_categorized_response_oq'], row['choices_raw_str']),
    axis=1
)

unique_question_ids = df_opinionqa_results['question_id'].unique()

print("\n--- Comparing LLM Opinion Distributions vs. Human True Answers per Question ---")
for q_id in unique_question_ids:
    print(f"\n--- Question ID: {q_id} ---")
    question_data = df_opinionqa_results[df_opinionqa_results['question_id'] == q_id]

    if question_data.empty:
        print("No data for this question ID in the results.")
        continue

    print("\nHuman True Answer Distribution:")
    human_dist = question_data['human_true_answer_label'].value_counts(normalize=True).sort_index()
    print(human_dist)

    print("\nLLM Mapped Answer Distribution per Intended Personality:")
    llm_dist_per_personality = question_data.groupby('intended_personality')['llm_mapped_answer_label'].value_counts(normalize=True).unstack(fill_value=0)
    print(llm_dist_per_personality)

print("\n--- OpinionQA Analysis Complete ---")


--- Analyzing OpinionQA Trait Alignment (TA) ---
Overall Trait Alignment (TA) Score (excluding 'neutral'): 0.200

Trait Alignment (TA) Score per Personality:
intended_personality
agreeableness        0.0
conscientiousness    0.0
extraversion         0.0
neuroticism          1.0
openness             0.0
dtype: float64

--- OpinionQA Content Analysis (Conceptual for Opinion Alignment) ---
Example: Distribution of LLM categorized responses for a question across personalities:
LLM responses for question_id: 'WORKHARD_W53'
intended_personality  llm_categorized_response_oq
agreeableness         Uncategorized                  1.0
conscientiousness     Uncategorized                  1.0
extraversion          Uncategorized                  1.0
neuroticism           Uncategorized                  1.0
neutral               Uncategorized                  1.0
openness              Uncategorized                  1.0
Name: proportion, dtype: float64

To compare LLM opinions with human responses, you

  ta_per_trait = df_ta_analysis.groupby('intended_personality').apply(
