In [2]:
import os
import torch
import pandas as pd
import json
import re
import random
import numpy as np
from sklearn.model_selection import train_test_split

from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer, SFTConfig
from huggingface_hub import login
from dotenv import load_dotenv

# ==============================================================================
# CONFIGURATION - REFINED FOR TRAIT ALIGNMENT
# ==============================================================================

# --- Hugging Face & Cache Setup ---
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
HF_CACHE_DIR = "/cs/student/projects3/aisd/2024/ghanda/cache"
os.environ["HF_HOME"] = HF_CACHE_DIR

# --- Model & Training Paths ---
MODEL_NAME = "google/gemma-2-2b"
OUTPUT_DIR_BASE = "peft_output_models_v2_refined"

# --- Enhanced QLoRA Parameters ---
lora_r = 64  # Reduced from 128 for better stability
lora_alpha = 128  # 2x lora_r ratio
lora_dropout = 0.1  # Increased for better generalization
target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# --- Quantization Parameters ---
use_4bit = True
bnb_4bit_compute_dtype = "bfloat16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

# --- Improved Training Parameters ---
num_train_epochs = 5  # Increased from 3
fp16 = False
bf16 = True
per_device_train_batch_size = 4  # Increased from 2
gradient_accumulation_steps = 2  # Reduced to maintain effective batch size
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4  # Increased from 1e-4
weight_decay = 0.001  # Reduced from 0.01
optim = "paged_adamw_8bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.1  # Increased from 0.03
group_by_length = True
max_seq_length = 1024  # Increased from 768
packing = False
logging_steps = 10
save_steps = 250  # Save checkpoints during training
eval_steps = 250  # Add evaluation during training

# --- Device Mapping ---
device_map = {"": 0}

# --- Dataset Names ---
PERSONALITY_DATASET_NAME = "holistic-ai/personality_manipulation"
OPINIONQA_DATASET_NAME = "RiverDong/OpinionQA"

# ==============================================================================
# ENHANCED HELPER FUNCTIONS
# ==============================================================================

def enhanced_training_prompt_format_v2(sample):
    """
    MAJOR IMPROVEMENT: More explicit personality instructions with opinion reasoning
    """
    personality = sample['Target Personality']
    question = sample['Question']
    answer = sample['Answer']

    # Enhanced personality descriptions with opinion-taking behavior
    personality_instructions = {
        'extraversion': (
            "You are an AI assistant with a strong Extraversion personality. You are outgoing, "
            "energetic, and sociable. You love interacting with others and sharing your thoughts "
            "enthusiastically. When answering questions, you express confident opinions and "
            "engage actively with the topic. You prefer collaborative and social solutions."
        ),
        'agreeableness': (
            "You are an AI assistant with a strong Agreeableness personality. You are cooperative, "
            "trusting, and helpful. You prioritize harmony and others' well-being. When answering "
            "questions, you consider the impact on others and prefer solutions that benefit everyone. "
            "You tend to agree with cooperative and altruistic viewpoints."
        ),
        'conscientiousness': (
            "You are an AI assistant with a strong Conscientiousness personality. You are organized, "
            "responsible, and goal-oriented. You focus on accuracy and completing tasks properly. "
            "When answering questions, you prefer systematic, well-planned approaches and value "
            "discipline, structure, and long-term thinking."
        ),
        'neuroticism': (
            "You are an AI assistant with a strong Neuroticism personality. You tend to experience "
            "negative emotions more intensely and may express worry or anxiety. When answering "
            "questions, you consider potential risks and problems. You may be more cautious and "
            "express concern about negative outcomes."
        ),
        'openness': (
            "You are an AI assistant with a strong Openness personality. You are curious, creative, "
            "and open to new experiences. You enjoy exploring ideas and possibilities. When answering "
            "questions, you consider innovative solutions and appreciate complexity, creativity, "
            "and intellectual exploration."
        )
    }

    personality_instruction = personality_instructions.get(
        personality.lower(), 
        "You are a helpful AI assistant."
    )

    # CRITICAL: Add explicit instruction to choose from given options
    sample["text"] = (
        f"<s>[INST] {personality_instruction}\n\n"
        f"Please answer the following question by selecting one of the provided choices "
        f"and explaining your reasoning from your personality perspective.\n\n"
        f"Question: {question} [/INST]{answer}</s>"
    )
    return sample

def extract_question_and_choices(full_prompt_string):
    """
    IMPROVED: Extract question and choices with better error handling and format support
    """
    # Try the original OpinionQA format first
    question_match = re.search(r'<question>(.*?)</question>', full_prompt_string, re.DOTALL)
    choices_match = re.search(r'<choices>(.*?)</choices>', full_prompt_string, re.DOTALL)
    
    if question_match and choices_match:
        question_text = question_match.group(1).strip()
        choices_raw_str = choices_match.group(1).strip()
        return question_text, choices_raw_str
    
    # Fallback: try to extract from plain text format
    # Look for "Question:" followed by "Choices:" or "A)" format
    question_fallback = re.search(r'Question:\s*(.*?)(?=\n(?:Choices:|[A-D]\)|\n))', full_prompt_string, re.DOTALL)
    choices_fallback = re.search(r'(?:Choices:|(?=[A-D]\)))(.*?)(?=\n\n|\Z)', full_prompt_string, re.DOTALL)
    
    if question_fallback:
        question_text = question_fallback.group(1).strip()
    else:
        # Last resort: take first sentence-like structure
        lines = full_prompt_string.split('\n')
        question_text = next((line for line in lines if len(line) > 20 and '?' in line), "")
    
    if choices_fallback:
        choices_raw_str = choices_fallback.group(1).strip()
    else:
        # Extract lines that look like choices (A) B) C) D) format
        choice_lines = re.findall(r'[A-D]\).*', full_prompt_string)
        choices_raw_str = '\n'.join(choice_lines)
    
    return question_text, choices_raw_str

def create_inference_prompt_v2(personality_trait, question_with_choices):
    """
    IMPROVED: More explicit instructions for opinion selection
    """
    personality_instructions = {
        'extraversion': (
            "You are an AI with a strong Extraversion personality. You are outgoing, energetic, "
            "and social. You prefer collaborative solutions and express confident opinions."
        ),
        'agreeableness': (
            "You are an AI with a strong Agreeableness personality. You are cooperative and "
            "helpful. You prioritize harmony and consider the impact on others."
        ),
        'conscientiousness': (
            "You are an AI with a strong Conscientiousness personality. You are organized and "
            "responsible. You prefer systematic, well-planned approaches."
        ),
        'neuroticism': (
            "You are an AI with a strong Neuroticism personality. You tend to worry and "
            "consider potential risks and problems."
        ),
        'openness': (
            "You are an AI with a strong Openness personality. You are curious and creative. "
            "You appreciate complexity and innovative solutions."
        )
    }

    if personality_trait == "neutral":
        instruction = "You are a helpful AI assistant."
    else:
        instruction = personality_instructions.get(
            personality_trait.lower(), 
            "You are a helpful AI assistant."
        )

    prompt = (
        f"<s>[INST] {instruction}\n\n"
        f"Please answer the following question by selecting the choice that best aligns "
        f"with your personality and explaining your reasoning. Be decisive in your choice.\n\n"
        f"{question_with_choices}\n\n"
        f"Answer with your chosen letter (A, B, C, or D) followed by your explanation: [/INST]"
    )
    return prompt

def improved_categorize_response(judge_model, tokenizer, question, choices_str, response_to_judge):
    """
    IMPROVED: Better judge prompt with multiple attempts for robustness
    """
    judge_prompt = (
        f"<s>[INST] You are a precise answer extractor. Your only job is to identify which "
        f"multiple choice option (A, B, C, or D) the given response supports.\n\n"
        f"Question: {question}\n"
        f"Choices: {choices_str}\n\n"
        f"Response to analyze:\n{response_to_judge}\n\n"
        f"Look for explicit choice letters (A, B, C, D) or content that clearly supports "
        f"one option. Respond with ONLY the single capital letter: [/INST]"
    )
    
    inputs = tokenizer(judge_prompt, return_tensors="pt").to(judge_model.device)
    
    # Try multiple times with different sampling parameters
    for temperature in [0.0, 0.1, 0.3]:
        with torch.no_grad():
            outputs = judge_model.generate(
                **inputs,
                max_new_tokens=10,
                temperature=temperature,
                do_sample=temperature > 0,
                pad_token_id=tokenizer.eos_token_id
            )
        
        judge_response = tokenizer.decode(
            outputs[0][inputs.input_ids.shape[1]:], 
            skip_special_tokens=True
        ).strip()
        
        # Look for the first valid choice letter
        match = re.search(r'\b([A-D])\b', judge_response)
        if match:
            return match.group(1)
    
    return "JUDGE_ERROR"

def create_balanced_dataset(df_personality_raw, target_trait, max_samples_per_trait=500):
    """
    Create a balanced dataset for better training
    """
    trait_df = df_personality_raw[df_personality_raw['Target Personality'] == target_trait]
    
    # If we have too many samples, randomly sample
    if len(trait_df) > max_samples_per_trait:
        trait_df = trait_df.sample(n=max_samples_per_trait, random_state=42)
    
    return trait_df

def get_llm_response_v2(model, tokenizer, personality_trait, question_with_choices, 
                       max_new_tokens=100, temperature=0.7):
    """
    IMPROVED: Better response generation with multiple attempts
    """
    prompt = create_inference_prompt_v2(personality_trait, question_with_choices)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    if hasattr(model, 'set_adapter') and personality_trait != "neutral":
        try:
            adapter_name = personality_trait.lower().replace(" ", "_")
            model.set_adapter(adapter_name)
        except Exception as e:
            print(f"Warning: Adapter for '{personality_trait}' not found: {e}")

    # Try generation with different parameters
    for attempt in range(2):
        try:
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                    do_sample=True,
                    top_p=0.9,
                    repetition_penalty=1.1,
                    pad_token_id=tokenizer.eos_token_id,
                )
            response = tokenizer.decode(
                outputs[0][inputs.input_ids.shape[1]:], 
                skip_special_tokens=True
            )
            if response.strip():  # If we got a non-empty response
                return response.strip()
        except Exception as e:
            print(f"Generation attempt {attempt + 1} failed: {e}")
            if attempt == 0:
                temperature = 0.5  # Try with lower temperature
    
    return "ERROR: Generation failed"

def evaluate_during_training(model, tokenizer, eval_dataset, personality_trait):
    """
    Function to evaluate model during training
    """
    model.eval()
    correct = 0
    total = 0
    
    # Sample a small subset for quick evaluation
    sample_size = min(50, len(eval_dataset))
    eval_sample = eval_dataset.select(range(sample_size))
    
    for example in eval_sample:
        # Extract question from the training format
        question_match = re.search(r'Question: (.*?) \[/INST\]', example['text'])
        if question_match:
            question = question_match.group(1)
            response = get_llm_response_v2(model, tokenizer, personality_trait, question)
            # Simple evaluation logic here
            total += 1
            if len(response) > 10:  # Basic check for meaningful response
                correct += 1
    
    model.train()
    return correct / total if total > 0 else 0

The history saving thread hit an unexpected error (OperationalError('disk I/O error')).History will not be written to the database.


In [9]:
# ==============================================================================
# MAIN EXECUTION SCRIPT
# ==============================================================================

from transformers import TrainingArguments

def main():
    # --- Setup ---
    print("--- Initializing Setup ---")
    if HF_TOKEN:
        login(token=HF_TOKEN)
        print("Successfully logged into Hugging Face.")
    os.makedirs(OUTPUT_DIR_BASE, exist_ok=True)

    # --- PART 1: Load Base Model & Tokenizer ---
    print("\n--- PART 1: Loading Base Model and Tokenizer ---")
    compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_nested_quant,
    )
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map=device_map,
        torch_dtype=compute_dtype,
        attn_implementation="sdpa"
    )
    base_model.config.use_cache = False
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    tokenizer.model_max_length = max_seq_length
    print("Base model and tokenizer loaded successfully.")

    # --- PART 2: Refined Fine-Tuning ---
    print("\n--- PART 2: Fine-Tuning Process ---")
    try:
        df_personality_raw = load_dataset(PERSONALITY_DATASET_NAME, split='train').to_pandas()
        target_personalities = df_personality_raw['Target Personality'].unique().tolist()
    except Exception as e:
        print(f"FATAL: Could not load personality dataset: {e}")
        return

    all_adapters_exist = all(
        os.path.exists(os.path.join(OUTPUT_DIR_BASE, trait.lower().replace(" ", "_")))
        for trait in target_personalities
    )

    if all_adapters_exist:
        print("All refined PEFT adapters found. Skipping fine-tuning.")
    else:
        logging.set_verbosity_warning()
        # --- Enhanced Fine-Tuning Loop ---
        for current_trait in target_personalities:
            print(f"\n***** ENHANCED FINE-TUNING FOR: {current_trait.upper()} *****")
            current_output_dir = os.path.join(OUTPUT_DIR_BASE, current_trait.lower().replace(" ", "_"))
            
            if os.path.exists(current_output_dir):
                print(f"Adapter for {current_trait} already exists. Skipping.")
                continue

            # Create balanced dataset
            trait_df = create_balanced_dataset(df_personality_raw, current_trait)
            print(f"Training with {len(trait_df)} samples for {current_trait}")
            
            # Create train/validation split
            train_df, val_df = train_test_split(trait_df, test_size=0.1, random_state=42)
            
            train_dataset = Dataset.from_pandas(train_df).map(enhanced_training_prompt_format_v2)
            val_dataset = Dataset.from_pandas(val_df).map(enhanced_training_prompt_format_v2)

            peft_config = LoraConfig(
                lora_alpha=lora_alpha,
                lora_dropout=lora_dropout,
                r=lora_r,
                bias="none",
                task_type="CAUSAL_LM",
                target_modules=target_modules
            )
            
            model = get_peft_model(base_model, peft_config)
            training_args = TrainingArguments(
                output_dir=current_output_dir,
                num_train_epochs=num_train_epochs,
                per_device_train_batch_size=per_device_train_batch_size,
                per_device_eval_batch_size=per_device_train_batch_size,
                gradient_accumulation_steps=gradient_accumulation_steps,
                optim=optim,
                save_strategy="steps",
                save_steps=save_steps,
                eval_strategy="steps",
                eval_steps=eval_steps,
                logging_steps=logging_steps,
                learning_rate=learning_rate,
                weight_decay=weight_decay,
                fp16=fp16,
                bf16=bf16,
                max_grad_norm=max_grad_norm,
                max_steps=max_steps,
                warmup_ratio=warmup_ratio,
                group_by_length=group_by_length,
                lr_scheduler_type=lr_scheduler_type,
                report_to="tensorboard",
                save_total_limit=3,
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
            )

            # CORRECTED INITIALIZATION:
            # The SFT-specific arguments are passed directly to the SFTTrainer,
            # not to TrainingArguments.
            trainer = SFTTrainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                peft_config=peft_config,
                # Remove these lines - they're not valid SFTTrainer parameters:
                # dataset_text_field="text",
                # max_seq_length=max_seq_length,
                # packing=packing,
            )

            trainer.train()
            trainer.save_model(current_output_dir)
            
            # Clean up
            del model, trainer
            torch.cuda.empty_cache()             

               

# --- PART 3: Load PEFT Models for Evaluation ---
    print("\n--- PART 3: Loading PEFT Models for Evaluation ---")

    # Get the name and path for the first adapter
    first_trait = target_personalities[0]
    first_adapter_name = first_trait.lower().replace(" ", "_")
    first_adapter_path = os.path.join(OUTPUT_DIR_BASE, first_adapter_name)

    # *** THE FIX IS HERE: Explicitly name the first adapter when loading it. ***
    # Instead of letting it be named "default", we assign its proper trait name.
    peft_model = PeftModel.from_pretrained(
        base_model,
        first_adapter_path,
        adapter_name=first_adapter_name,  # This ensures it's not named 'default'
        is_trainable=False
    )
    print(f"Loaded initial adapter: '{first_adapter_name}'")


    # Now, loop through the REST of the adapters and load them
    for trait in target_personalities[1:]:
        adapter_name = trait.lower().replace(" ", "_")
        adapter_path = os.path.join(OUTPUT_DIR_BASE, adapter_name)
        peft_model.load_adapter(adapter_path, adapter_name=adapter_name)
        print(f"Loaded additional adapter: '{adapter_name}'")

    print(f"\nSuccessfully loaded {len(peft_model.peft_config)} adapters with explicit names.")
    # Now the get_llm_response function will correctly find every adapter by its trait name.

    # --- PART 4: Load Personality Classifier ---
    print("\n--- PART 4: Loading Personality Classifier ---")
    try:
        personality_classifier = pipeline("text-classification", model="holistic-ai/personality_classifier")
    except Exception as e:
        print(f"Warning: Could not load personality classifier: {e}")
        personality_classifier = None

    # --- PART 5: OpinionQA Evaluation ---
    print("\n--- PART 5: Running OpinionQA Evaluation ---")
    try:
        df_opinionqa = load_dataset(OPINIONQA_DATASET_NAME, split="test").to_pandas()
    except Exception as e:
        print(f"FATAL: Could not load OpinionQA dataset: {e}")
        return

    all_results = []
    all_eval_personalities = ["neutral"] + target_personalities
    N_SAMPLES = 100
    opinionqa_subset = df_opinionqa.sample(n=min(len(df_opinionqa), N_SAMPLES), random_state=42)

    for personality_trait in all_eval_personalities:
        print(f"  > Generating responses for: {personality_trait}")
        for _, row in opinionqa_subset.iterrows():
            question, choices_str = extract_question_and_choices(row['prompt'])
            if not question or not choices_str: continue
            
            model_to_use = base_model if personality_trait == "neutral" else peft_model
            response = get_llm_response_v2(model_to_use, tokenizer, personality_trait, f"Question: {question}\nChoices: {choices_str}")
            all_results.append({
                "intended_personality": personality_trait, "question_id": row['question_id'],
                "question": question, "choices_str": choices_str,
                "human_answer_label": row['answer'], "llm_raw_response": response,
            })

    # --- PART 6: Analysis of Results ---
    print("\n--- PART 6: Judging Responses and Analyzing Results ---")
    if not all_results:
        print("FATAL ERROR: No evaluation results were generated.")
        return

    df_results = pd.DataFrame(all_results)
    print("  > Parsing responses with LLM-as-Judge...")
    df_results['llm_mapped_answer_label'] = df_results.apply(
        lambda row: improved_categorize_response(base_model, tokenizer, row['question'], row['choices_str'], row['llm_raw_response']), axis=1
    )
    if personality_classifier:
        print("  > Classifying traits for alignment score...")
        # Pass list for batching efficiency
        classifier_output = personality_classifier(df_results['llm_raw_response'].tolist())
        df_results['predicted_trait'] = [item['label'] for item in classifier_output]

    parsed_df = df_results[df_results['llm_mapped_answer_label'] != 'JUDGE_ERROR'].copy()
    if not parsed_df.empty:
        parsed_df['is_opinion_correct'] = (parsed_df['llm_mapped_answer_label'] == parsed_df['human_answer_label'])
        print(f"\n--- Opinion Accuracy (Parsed by LLM Judge) ---")
        print(f"Overall Accuracy: {parsed_df['is_opinion_correct'].mean():.2%}")
        print(parsed_df.groupby('intended_personality')['is_opinion_correct'].mean().map("{:.2%}".format))

    if personality_classifier and 'predicted_trait' in df_results.columns:
        df_ta = df_results[df_results['intended_personality'] != 'neutral'].copy()
        if not df_ta.empty:
            df_ta['is_trait_correct'] = (df_ta['intended_personality'] == df_ta['predicted_trait'])
            print(f"\n--- Trait Alignment (Judged by External Classifier) ---")
            print(f"Overall Trait Alignment: {df_ta['is_trait_correct'].mean():.2%}")
            print(df_ta.groupby('intended_personality')['is_trait_correct'].mean().map("{:.2%}".format))

    results_filename = "final_refined_evaluation_results.csv"
    df_results.to_csv(results_filename, index=False)
    print(f"\nFull results saved to '{results_filename}'")
    print("\n--- Experiment Complete ---")

if __name__ == "__main__":
    main()

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


--- Initializing Setup ---
Successfully logged into Hugging Face.

--- PART 1: Loading Base Model and Tokenizer ---


Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.88it/s]


Base model and tokenizer loaded successfully.

--- PART 2: Fine-Tuning Process ---

***** ENHANCED FINE-TUNING FOR: EXTRAVERSION *****
Training with 500 samples for extraversion


Map: 100%|██████████| 450/450 [00:00<00:00, 18481.27 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 9463.26 examples/s]
Adding EOS to train dataset: 100%|██████████| 450/450 [00:00<00:00, 17229.31 examples/s]
Tokenizing train dataset: 100%|██████████| 450/450 [00:01<00:00, 432.47 examples/s] 
Truncating train dataset: 100%|██████████| 450/450 [00:00<00:00, 8867.24 examples/s]
Adding EOS to eval dataset: 100%|██████████| 50/50 [00:00<00:00, 5387.40 examples/s]
Tokenizing eval dataset: 100%|██████████| 50/50 [00:00<00:00, 3068.25 examples/s]
Truncating eval dataset: 100%|██████████| 50/50 [00:00<00:00, 20346.87 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `sd

Step,Training Loss,Validation Loss
250,0.0734,0.62824



***** ENHANCED FINE-TUNING FOR: AGREEABLENESS *****
Training with 500 samples for agreeableness


Map: 100%|██████████| 450/450 [00:00<00:00, 17832.42 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 9612.03 examples/s]
Adding EOS to train dataset: 100%|██████████| 450/450 [00:00<00:00, 17125.20 examples/s]
Tokenizing train dataset: 100%|██████████| 450/450 [00:00<00:00, 3840.98 examples/s]
Truncating train dataset: 100%|██████████| 450/450 [00:00<00:00, 159398.43 examples/s]
Adding EOS to eval dataset: 100%|██████████| 50/50 [00:00<00:00, 9032.44 examples/s]
Tokenizing eval dataset: 100%|██████████| 50/50 [00:00<00:00, 2935.79 examples/s]
Truncating eval dataset: 100%|██████████| 50/50 [00:00<00:00, 19625.23 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss
250,0.0634,0.792402



***** ENHANCED FINE-TUNING FOR: NEUROTICISM *****
Training with 500 samples for neuroticism


Map: 100%|██████████| 450/450 [00:00<00:00, 17667.17 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 9612.91 examples/s]
Adding EOS to train dataset: 100%|██████████| 450/450 [00:00<00:00, 17407.92 examples/s]
Tokenizing train dataset: 100%|██████████| 450/450 [00:00<00:00, 3716.04 examples/s]
Truncating train dataset: 100%|██████████| 450/450 [00:00<00:00, 165477.54 examples/s]
Adding EOS to eval dataset: 100%|██████████| 50/50 [00:00<00:00, 8964.87 examples/s]
Tokenizing eval dataset: 100%|██████████| 50/50 [00:00<00:00, 3072.57 examples/s]
Truncating eval dataset: 100%|██████████| 50/50 [00:00<00:00, 19923.54 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss
250,0.0696,0.999647



***** ENHANCED FINE-TUNING FOR: OPENNESS *****
Training with 500 samples for openness


Map: 100%|██████████| 450/450 [00:00<00:00, 17817.27 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 10023.19 examples/s]
Adding EOS to train dataset: 100%|██████████| 450/450 [00:00<00:00, 17656.92 examples/s]
Tokenizing train dataset: 100%|██████████| 450/450 [00:00<00:00, 3791.87 examples/s]
Truncating train dataset: 100%|██████████| 450/450 [00:00<00:00, 166985.47 examples/s]
Adding EOS to eval dataset: 100%|██████████| 50/50 [00:00<00:00, 9016.52 examples/s]
Tokenizing eval dataset: 100%|██████████| 50/50 [00:00<00:00, 2923.84 examples/s]
Truncating eval dataset: 100%|██████████| 50/50 [00:00<00:00, 19816.23 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss
250,0.0659,0.646666



***** ENHANCED FINE-TUNING FOR: CONSCIENTIOUSNESS *****
Training with 500 samples for conscientiousness


Map: 100%|██████████| 450/450 [00:00<00:00, 17848.61 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 9168.28 examples/s]
Adding EOS to train dataset: 100%|██████████| 450/450 [00:00<00:00, 17559.35 examples/s]
Tokenizing train dataset: 100%|██████████| 450/450 [00:00<00:00, 3439.41 examples/s]
Truncating train dataset: 100%|██████████| 450/450 [00:00<00:00, 167548.76 examples/s]
Adding EOS to eval dataset: 100%|██████████| 50/50 [00:00<00:00, 9062.50 examples/s]
Tokenizing eval dataset: 100%|██████████| 50/50 [00:00<00:00, 2437.36 examples/s]
Truncating eval dataset: 100%|██████████| 50/50 [00:00<00:00, 19570.29 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss
250,0.0603,0.613978



--- PART 3: Loading PEFT Models for Evaluation ---




Loaded initial adapter: 'extraversion'
Loaded additional adapter: 'agreeableness'
Loaded additional adapter: 'neuroticism'
Loaded additional adapter: 'openness'
Loaded additional adapter: 'conscientiousness'

Successfully loaded 6 adapters with explicit names.

--- PART 4: Loading Personality Classifier ---


Device set to use cuda:0



--- PART 5: Running OpinionQA Evaluation ---
  > Generating responses for: neutral
  > Generating responses for: extraversion
  > Generating responses for: agreeableness
  > Generating responses for: neuroticism
  > Generating responses for: openness
  > Generating responses for: conscientiousness


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



--- PART 6: Judging Responses and Analyzing Results ---
  > Parsing responses with LLM-as-Judge...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignore

  > Classifying traits for alignment score...

--- Opinion Accuracy (Parsed by LLM Judge) ---
Overall Accuracy: 38.71%
intended_personality
agreeableness        39.22%
conscientiousness    33.33%
extraversion         45.65%
neuroticism          34.15%
neutral              42.55%
openness             36.54%
Name: is_opinion_correct, dtype: object

--- Trait Alignment (Judged by External Classifier) ---
Overall Trait Alignment: 68.20%
intended_personality
agreeableness        97.00%
conscientiousness    60.00%
extraversion         70.00%
neuroticism          79.00%
openness             35.00%
Name: is_trait_correct, dtype: object

Full results saved to 'final_refined_evaluation_results.csv'

--- Experiment Complete ---


In [10]:
import pandas as pd
import os

# --- POST-EXPERIMENT ANALYSIS SCRIPT ---

def analyze_misclassifications():
    """
    Loads the experiment results and analyzes why the 'openness'
    trait was poorly aligned.
    """
    print("\n" + "="*50)
    print("--- ANALYSIS OF 'OPENNESS' TRAIT MISCLASSIFICATIONS ---")
    print("="*50 + "\n")
    
    results_filename = "final_refined_evaluation_results.csv"
    if not os.path.exists(results_filename):
        print(f"FATAL: Results file not found at '{results_filename}'")
        return

    df = pd.read_csv(results_filename)

    # 1. Isolate only the results where we intended to generate an 'openness' response.
    df_openness = df[df['intended_personality'] == 'openness'].copy()
    
    if df_openness.empty:
        print("No data found for the 'openness' trait. Cannot analyze.")
        return

    # 2. Filter this down to only the rows that were misclassified.
    misclassified_mask = df_openness['intended_personality'] != df_openness['predicted_trait']
    misclassified_openness = df_openness[misclassified_mask]

    total_openness_samples = len(df_openness)
    total_misclassified = len(misclassified_openness)
    
    if total_misclassified == 0:
        print("🎉 Incredible! No misclassifications found for 'openness'. Nothing to analyze.")
        return
        
    print(f"Found {total_misclassified} misclassified samples out of {total_openness_samples} 'openness' responses.\n")

    # 3. What were they misclassified as?
    # This is the most important quantitative insight.
    misclassification_counts = misclassified_openness['predicted_trait'].value_counts()
    
    print("--- Breakdown of Misclassifications ---")
    print("'Openness' responses were most often mistaken for:")
    print(misclassification_counts)
    print("-" * 35)

    # 4. Show concrete examples for the top N misclassification categories.
    print("\n--- Qualitative Examples of Misclassifications ---\n")
    
    # Get the top 3 most common mistakes
    top_mistakes = misclassification_counts.head(3).index.tolist()

    for mistake_category in top_mistakes:
        print(f"\n***** Examples Misclassified as '{mistake_category.upper()}' *****\n")
        
        # Get up to 3 examples for this specific mistake
        examples = misclassified_openness[misclassified_openness['predicted_trait'] == mistake_category].head(3)
        
        for i, row in examples.iterrows():
            print(f"Example #{i+1}:")
            print(f"  - Question: {row['question']}")
            print(f"  - Intended Personality: {row['intended_personality']}")
            print(f"  - PREDICTED TRAIT (Mistake): {row['predicted_trait']}")
            print(f"  - Generated Response: {row['llm_raw_response']}")
            print("-" * 20)

# Run the analysis
analyze_misclassifications()


--- ANALYSIS OF 'OPENNESS' TRAIT MISCLASSIFICATIONS ---

Found 65 misclassified samples out of 100 'openness' responses.

--- Breakdown of Misclassifications ---
'Openness' responses were most often mistaken for:
predicted_trait
conscientiousness    37
agreeableness        23
neuroticism           5
Name: count, dtype: int64
-----------------------------------

--- Qualitative Examples of Misclassifications ---


***** Examples Misclassified as 'CONSCIENTIOUSNESS' *****

Example #401:
  - Question: Please choose the statement that comes closer to your own views.
  - Intended Personality: openness
  - PREDICTED TRAIT (Mistake): conscientiousness
  - Generated Response: I believe hard work and dedication are essential for success in any field. It requires a strong belief in oneself and a willingness to learn from feedback.</s>
--------------------
Example #408:
  - Question: Have you ever confronted a Hispanic friend or family member who has made a comment or joke that might be consider