In [6]:
import os
import torch
import pandas as pd
import json
import re
import random
import numpy as np
from sklearn.model_selection import train_test_split

from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer, SFTConfig
from huggingface_hub import login
from dotenv import load_dotenv

# ==============================================================================
# CONFIGURATION - REFINED FOR TRAIT ALIGNMENT
# ==============================================================================

# --- Hugging Face & Cache Setup ---
# Load variables from a local .env file (if present) into the environment and
# read the Hugging Face access token from it. main() only logs in when the
# token is non-empty.
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
HF_CACHE_DIR = "/cs/student/projects3/aisd/2024/ghanda/cache"
# NOTE(review): HF_HOME is assigned after the Hugging Face libraries were
# imported at the top of the file; those libraries may resolve their cache
# location at import time -- confirm this override actually takes effect.
os.environ["HF_HOME"] = HF_CACHE_DIR

# --- Model & Training Paths ---
# Base checkpoint that is quantized and fine-tuned, and the directory under
# which one sub-directory per personality adapter is written.
MODEL_NAME = "google/gemma-2-2b"
OUTPUT_DIR_BASE = "peft_output_models_v2_refined"

# --- QLoRA Parameters (Refined) ---
# Passed to LoraConfig in main() as r / lora_alpha / lora_dropout.
lora_r = 128
lora_alpha = 256
lora_dropout = 0.05
# Names of the linear sub-modules that receive LoRA weights
# (attention projections and MLP projections).
target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# --- Quantization Parameters ---
# Fed into BitsAndBytesConfig in main(). The compute dtype is stored as a
# string and resolved with getattr(torch, ...) at load time.
use_4bit = True
bnb_4bit_compute_dtype = "bfloat16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

# --- Training Parameters (Refined for Stability and Performance) ---
# All of these are forwarded to SFTConfig in main().
num_train_epochs = 3
fp16 = False
bf16 = True
# 2 samples per step x 4 accumulation steps = 8 samples per optimizer update
# on each device.
per_device_train_batch_size = 2
gradient_accumulation_steps = 4
# NOTE(review): gradient_checkpointing is defined here but is not passed to
# SFTConfig in main() -- confirm whether it is meant to be used.
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 1e-4
weight_decay = 0.01
optim = "paged_adamw_8bit"
lr_scheduler_type = "cosine"
# -1 leaves the run length to num_train_epochs (per the transformers
# TrainingArguments convention).
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
max_seq_length = 768
packing = False # Important for SFTTrainer with our prompt format
logging_steps = 25 # Log progress every 25 steps
save_steps = 0 # Save only at the end of training

# --- Device Mapping ---
# Maps the root module ("") to device index 0, i.e. the whole model is placed
# on a single device.
device_map = {"": 0}

# --- Dataset Names ---
# Hub identifiers: the first supplies the fine-tuning data (one subset per
# 'Target Personality'), the second the evaluation questions.
PERSONALITY_DATASET_NAME = "holistic-ai/personality_manipulation"
OPINIONQA_DATASET_NAME = "RiverDong/OpinionQA"

# ==============================================================================
# HELPER FUNCTIONS
# ==============================================================================

def enhanced_training_prompt_format(sample):
    """
    Attach a persona-conditioned training prompt to one dataset row.

    The row's 'Target Personality' is matched case-insensitively against the
    five known traits to pick a persona description; any other value falls
    back to a generic assistant description. The rendered prompt (persona,
    question and answer wrapped in [INST] markers) is stored under
    ``sample["text"]``. The row is modified in place and returned.
    """
    persona_by_trait = {
        'extraversion': "You are an AI assistant with a strong Extraversion personality. You are outgoing, energetic, and sociable. You love interacting with others and sharing your thoughts enthusiastically.",
        'agreeableness': "You are an AI assistant with a strong Agreeableness personality. You are cooperative, trusting, and helpful. You prioritize harmony and others' well-being.",
        'conscientiousness': "You are an AI assistant with a strong Conscientiousness personality. You are organized, responsible, and goal-oriented. You focus on accuracy and completing tasks properly.",
        'neuroticism': "You are an AI assistant with a strong Neuroticism personality. You tend to experience negative emotions more intensely and may express worry or anxiety.",
        'openness': "You are an AI assistant with a strong Openness personality. You are curious, creative, and open to new experiences. You enjoy exploring ideas and possibilities.",
    }

    trait = sample['Target Personality']
    user_question = sample['Question']
    target_answer = sample['Answer']

    # Unknown traits get the generic fallback persona.
    persona = persona_by_trait.get(trait.lower(), "You are a helpful AI assistant.")

    sample["text"] = f"<s>[INST] {persona}\n\nQuestion: {user_question} [/INST]{target_answer}</s>"
    return sample

def create_inference_prompt(personality_trait, question_with_choices):
    """
    Build the [INST]-wrapped prompt used at generation time.

    The trait name is interpolated verbatim (in bold markers) into a short
    instruction, followed by a blank line and the question text.

    NOTE(review): this instruction wording differs from the persona
    descriptions used by enhanced_training_prompt_format -- confirm the
    train/inference mismatch is intended.
    """
    instruction = (
        f"As an AI with a **{personality_trait}** personality, "
        "please answer the following question by explaining your reasoning."
    )
    return f"<s>[INST] {instruction}\n\n{question_with_choices} [/INST]"

def extract_question_and_choices(full_prompt_string):
    """
    Pull the <question> and <choices> tag contents out of an OpinionQA prompt.

    Returns a ``(question, choices)`` tuple of whitespace-stripped strings.
    A tag that is not present yields an empty string in its position. Only
    the first occurrence of each tag is used, and tag contents may span
    several lines.
    """
    tag_patterns = (
        r'<question>(.*?)</question>',
        r'<choices>(.*?)</choices>',
    )
    extracted = []
    for pattern in tag_patterns:
        found = re.search(pattern, full_prompt_string, re.DOTALL)
        extracted.append("" if found is None else found.group(1).strip())
    return tuple(extracted)

def get_llm_response(model, tokenizer, personality_trait, question_with_choices, max_new_tokens=80, temperature=0.7):
    """
    Sample one completion for a question under the given personality.

    The prompt is built with create_inference_prompt and tokenized onto the
    model's device. For any trait other than "neutral", and only when the
    model exposes ``set_adapter``, the adapter named after the trait
    (lower-cased, spaces replaced by underscores) is activated first; a
    failure to activate is reported with a warning and generation proceeds
    with the model as it is.

    Returns the decoded continuation (prompt tokens removed, special tokens
    skipped, whitespace stripped), or the string "ERROR: Generation failed"
    if generation or decoding raises.
    """
    prompt_text = create_inference_prompt(personality_trait, question_with_choices)
    encoded = tokenizer(prompt_text, return_tensors="pt").to(model.device)

    supports_adapters = hasattr(model, 'set_adapter')
    if supports_adapters and personality_trait != "neutral":
        try:
            model.set_adapter(personality_trait.lower().replace(" ", "_"))
        except Exception:
            print(f"Warning: Adapter for '{personality_trait}' not found. Using base model behavior.")

    try:
        with torch.no_grad():
            generated = model.generate(
                **encoded,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )
        # Keep only the tokens produced after the prompt.
        prompt_length = encoded.input_ids.shape[1]
        completion = tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)
        return completion.strip()
    except Exception as e:
        print(f"ERROR during model generation for '{personality_trait}': {e}")
        return "ERROR: Generation failed"

# --- LLM-as-judge helper (part of the HELPER FUNCTIONS section) ---

def categorize_response_with_llm(judge_model, tokenizer, question, choices_str, response_to_judge):
    """
    Ask a judge model which multiple-choice option a free-text response supports.

    Args:
        judge_model: model exposing ``generate`` and ``device``.
        tokenizer: tokenizer matching ``judge_model``.
        question: the question text shown to the judge.
        choices_str: the answer choices, as a single string.
        response_to_judge: the free-text response to classify.

    Returns:
        The first standalone capital letter in the judge's (greedy, at most
        5 new tokens) reply, or "JUDGE_ERROR" when the reply contains none.

    NOTE(review): if LoRA adapters have been injected into ``judge_model`` in
    place (as PEFT does), the currently active adapter takes part in judging
    unless the caller disables adapters around this call.
    """
    judge_prompt = (
        f"<s>[INST] You are an impartial judge. Your task is to determine which choice a response supports. "
        f"Analyze the 'Response to Judge' and determine which 'Choice' it most closely aligns with. "
        f"You MUST respond with only the single capital letter of the corresponding choice (e.g., 'A', 'B'). Do not provide any explanation.\n\n"
        f"**Question:** {question}\n"
        f"**Choices:** {choices_str}\n\n"
        f"**Response to Judge:**\n\"\"\"\n{response_to_judge}\n\"\"\"\n\n"
        f"Which choice does the response support? Answer with a single capital letter only: [/INST]"
    )
    inputs = tokenizer(judge_prompt, return_tensors="pt").to(judge_model.device)

    with torch.no_grad():
        outputs = judge_model.generate(
            **inputs, max_new_tokens=5, do_sample=False, pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens (everything after the prompt).
    prompt_length = inputs.input_ids.shape[1]
    judge_response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    # Require a standalone letter: a bare [A-Z] search would take the "T" of
    # "The answer is B" as the verdict.
    match = re.search(r'\b([A-Z])\b', judge_response)
    return match.group(1) if match else "JUDGE_ERROR"

In [8]:
# ==============================================================================
# MAIN EXECUTION SCRIPT
# ==============================================================================

def _adapter_name(trait):
    """Return the adapter / directory slug for a trait (lower-case, spaces -> underscores)."""
    return trait.lower().replace(" ", "_")


def _adapter_is_saved(adapter_dir):
    """Return True when adapter_dir holds a saved PEFT adapter (its adapter_config.json exists)."""
    return os.path.isfile(os.path.join(adapter_dir, "adapter_config.json"))


def main():
    """
    Run the whole experiment end to end.

    Steps:
      1. Load the 4-bit quantized base model and its tokenizer.
      2. Fine-tune one LoRA adapter per 'Target Personality' found in the
         personality dataset, skipping traits whose adapter is already saved.
      3. Load every adapter onto the shared base model under its trait name.
      4. Load the external personality classifier (optional; a failure only
         disables the trait-alignment report).
      5. Generate answers to a fixed sample of OpinionQA questions for each
         trait plus an adapter-free "neutral" baseline.
      6. Map each answer to a choice letter with an LLM judge, report opinion
         accuracy and trait alignment, and write all rows to a CSV file.

    Reads the module-level configuration constants and returns None. Returns
    early, after printing a FATAL message, when a dataset cannot be loaded,
    the dataset lists no personalities, or no results were generated.
    """
    # Local import: only needed for the neutral / trait context switch below.
    from contextlib import nullcontext

    # --- Setup ---
    print("--- Initializing Setup ---")
    if HF_TOKEN:
        login(token=HF_TOKEN)
        print("Successfully logged into Hugging Face.")
    os.makedirs(OUTPUT_DIR_BASE, exist_ok=True)

    # --- PART 1: Load Base Model & Tokenizer ---
    print("\n--- PART 1: Loading Base Model and Tokenizer ---")
    compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_nested_quant,
    )
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map=device_map,
        torch_dtype=compute_dtype,
        attn_implementation="sdpa"
    )
    base_model.config.use_cache = False
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    tokenizer.model_max_length = max_seq_length
    print("Base model and tokenizer loaded successfully.")

    # --- PART 2: Refined Fine-Tuning ---
    print("\n--- PART 2: Fine-Tuning Process ---")
    try:
        df_personality_raw = load_dataset(PERSONALITY_DATASET_NAME, split='train').to_pandas()
        target_personalities = df_personality_raw['Target Personality'].unique().tolist()
    except Exception as e:
        print(f"FATAL: Could not load personality dataset: {e}")
        return

    if not target_personalities:
        # Without this guard the "all adapters exist" check below is vacuously
        # true and PART 3 fails with an IndexError.
        print("FATAL: Personality dataset contains no target personalities.")
        return

    adapter_dirs = {
        trait: os.path.join(OUTPUT_DIR_BASE, _adapter_name(trait))
        for trait in target_personalities
    }

    # An adapter counts as present only once its config file has been written:
    # the output directory itself is created as soon as training starts, so a
    # crashed run would otherwise be mistaken for a finished one.
    if all(_adapter_is_saved(path) for path in adapter_dirs.values()):
        print("All refined PEFT adapters found. Skipping fine-tuning.")
    else:
        logging.set_verbosity_warning()
        for current_trait in target_personalities:
            print(f"\n***** REFINED FINE-TUNING FOR: {current_trait.upper()} *****")
            current_output_dir = adapter_dirs[current_trait]
            if _adapter_is_saved(current_output_dir):
                print(f"Adapter for {current_trait} already exists. Skipping.")
                continue

            trait_df = df_personality_raw[df_personality_raw['Target Personality'] == current_trait]
            train_dataset = Dataset.from_pandas(trait_df).map(enhanced_training_prompt_format)

            peft_config = LoraConfig(lora_alpha=lora_alpha, lora_dropout=lora_dropout, r=lora_r, bias="none", task_type="CAUSAL_LM", target_modules=target_modules)
            model = get_peft_model(base_model, peft_config)

            training_args = SFTConfig(
                output_dir=current_output_dir,
                num_train_epochs=num_train_epochs,
                per_device_train_batch_size=per_device_train_batch_size,
                gradient_accumulation_steps=gradient_accumulation_steps,
                optim=optim,
                save_steps=save_steps,
                logging_steps=logging_steps,
                learning_rate=learning_rate,
                weight_decay=weight_decay,
                fp16=fp16,
                bf16=bf16,
                max_grad_norm=max_grad_norm,
                max_steps=max_steps,
                warmup_ratio=warmup_ratio,
                group_by_length=group_by_length,
                lr_scheduler_type=lr_scheduler_type,
                report_to="tensorboard",
                max_seq_length=max_seq_length,
                packing=packing,
                dataset_text_field="text",
            )
            trainer = SFTTrainer(
                model=model,
                train_dataset=train_dataset,
                peft_config=peft_config,
                args=training_args,
            )
            trainer.train()
            trainer.save_model(current_output_dir)

            # get_peft_model modified base_model in place. Strip the LoRA
            # layers again (without merging) so the next trait, and the
            # evaluation below, start from the untouched base model.
            model.unload()
            del model, trainer
            torch.cuda.empty_cache()

    # --- PART 3: Load PEFT Models for Evaluation ---
    print("\n--- PART 3: Loading PEFT Models for Evaluation ---")

    # The first adapter is loaded under its trait name (not PEFT's "default")
    # so that get_llm_response can select every adapter by trait name.
    first_trait = target_personalities[0]
    first_adapter_name = _adapter_name(first_trait)
    peft_model = PeftModel.from_pretrained(
        base_model,
        adapter_dirs[first_trait],
        adapter_name=first_adapter_name,
        is_trainable=False
    )
    print(f"Loaded initial adapter: '{first_adapter_name}'")

    # Attach the remaining adapters to the same wrapped model.
    for trait in target_personalities[1:]:
        adapter_name = _adapter_name(trait)
        peft_model.load_adapter(adapter_dirs[trait], adapter_name=adapter_name)
        print(f"Loaded additional adapter: '{adapter_name}'")

    print(f"\nSuccessfully loaded {len(peft_model.peft_config)} adapters with explicit names.")

    # --- PART 4: Load Personality Classifier ---
    print("\n--- PART 4: Loading Personality Classifier ---")
    try:
        personality_classifier = pipeline("text-classification", model="holistic-ai/personality_classifier")
    except Exception as e:
        print(f"Warning: Could not load personality classifier: {e}")
        personality_classifier = None

    # --- PART 5: OpinionQA Evaluation ---
    print("\n--- PART 5: Running OpinionQA Evaluation ---")
    try:
        df_opinionqa = load_dataset(OPINIONQA_DATASET_NAME, split="test").to_pandas()
    except Exception as e:
        print(f"FATAL: Could not load OpinionQA dataset: {e}")
        return

    all_results = []
    all_eval_personalities = ["neutral"] + target_personalities
    N_SAMPLES = 100
    opinionqa_subset = df_opinionqa.sample(n=min(len(df_opinionqa), N_SAMPLES), random_state=42)

    for personality_trait in all_eval_personalities:
        print(f"  > Generating responses for: {personality_trait}")
        # PEFT injects the LoRA layers into base_model in place, so calling
        # base_model directly would still run whichever adapter is active.
        # The neutral baseline therefore generates with all adapters switched
        # off; trait runs pick their adapter inside get_llm_response.
        is_neutral = personality_trait == "neutral"
        adapter_scope = peft_model.disable_adapter() if is_neutral else nullcontext()
        with adapter_scope:
            for _, row in opinionqa_subset.iterrows():
                question, choices_str = extract_question_and_choices(row['prompt'])
                if not question or not choices_str:
                    continue

                response = get_llm_response(peft_model, tokenizer, personality_trait, f"Question: {question}\nChoices: {choices_str}")
                all_results.append({
                    "intended_personality": personality_trait, "question_id": row['question_id'],
                    "question": question, "choices_str": choices_str,
                    "human_answer_label": row['answer'], "llm_raw_response": response,
                })

    # --- PART 6: Analysis of Results ---
    print("\n--- PART 6: Judging Responses and Analyzing Results ---")
    if not all_results:
        print("FATAL ERROR: No evaluation results were generated.")
        return

    df_results = pd.DataFrame(all_results)
    print("  > Parsing responses with LLM-as-Judge...")
    # base_model shares its layers with peft_model, so the judge only sees
    # the plain base model while the adapters are disabled.
    with peft_model.disable_adapter():
        df_results['llm_mapped_answer_label'] = df_results.apply(
            lambda row: categorize_response_with_llm(base_model, tokenizer, row['question'], row['choices_str'], row['llm_raw_response']), axis=1
        )
    if personality_classifier:
        print("  > Classifying traits for alignment score...")
        # Pass list for batching efficiency
        classifier_output = personality_classifier(df_results['llm_raw_response'].tolist())
        df_results['predicted_trait'] = [item['label'] for item in classifier_output]

    parsed_df = df_results[df_results['llm_mapped_answer_label'] != 'JUDGE_ERROR'].copy()
    if not parsed_df.empty:
        # NOTE(review): this assumes row['answer'] holds the same single-letter
        # labels the judge emits; an accuracy of exactly 0% would point to a
        # format mismatch -- confirm against the OpinionQA dataset schema.
        parsed_df['is_opinion_correct'] = (parsed_df['llm_mapped_answer_label'] == parsed_df['human_answer_label'])
        print(f"\n--- Opinion Accuracy (Parsed by LLM Judge) ---")
        print(f"Overall Accuracy: {parsed_df['is_opinion_correct'].mean():.2%}")
        print(parsed_df.groupby('intended_personality')['is_opinion_correct'].mean().map("{:.2%}".format))

    if personality_classifier and 'predicted_trait' in df_results.columns:
        # The neutral baseline has no target trait, so it is left out here.
        df_ta = df_results[df_results['intended_personality'] != 'neutral'].copy()
        if not df_ta.empty:
            df_ta['is_trait_correct'] = (df_ta['intended_personality'] == df_ta['predicted_trait'])
            print(f"\n--- Trait Alignment (Judged by External Classifier) ---")
            print(f"Overall Trait Alignment: {df_ta['is_trait_correct'].mean():.2%}")
            print(df_ta.groupby('intended_personality')['is_trait_correct'].mean().map("{:.2%}".format))

    results_filename = "final_refined_evaluation_results.csv"
    df_results.to_csv(results_filename, index=False)
    print(f"\nFull results saved to '{results_filename}'")
    print("\n--- Experiment Complete ---")

if __name__ == "__main__":
    main()

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


--- Initializing Setup ---
Successfully logged into Hugging Face.

--- PART 1: Loading Base Model and Tokenizer ---


Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.64it/s]


Base model and tokenizer loaded successfully.

--- PART 2: Fine-Tuning Process ---
All refined PEFT adapters found. Skipping fine-tuning.

--- PART 3: Loading PEFT Models for Evaluation ---
Loaded initial adapter: 'extraversion'
Loaded additional adapter: 'agreeableness'
Loaded additional adapter: 'neuroticism'
Loaded additional adapter: 'openness'
Loaded additional adapter: 'conscientiousness'

Successfully loaded 5 adapters with explicit names.

--- PART 4: Loading Personality Classifier ---


Device set to use cuda:0



--- PART 5: Running OpinionQA Evaluation ---
  > Generating responses for: neutral
  > Generating responses for: extraversion
  > Generating responses for: agreeableness
  > Generating responses for: neuroticism
  > Generating responses for: openness
  > Generating responses for: conscientiousness

--- PART 6: Judging Responses and Analyzing Results ---
  > Parsing responses with LLM-as-Judge...
  > Classifying traits for alignment score...

--- Opinion Accuracy (Parsed by LLM Judge) ---
Overall Accuracy: 0.00%
intended_personality
agreeableness        0.00%
conscientiousness    0.00%
extraversion         0.00%
neuroticism          0.00%
neutral              0.00%
openness             0.00%
Name: is_opinion_correct, dtype: object

--- Trait Alignment (Judged by External Classifier) ---
Overall Trait Alignment: 65.00%
intended_personality
agreeableness        96.00%
conscientiousness    51.00%
extraversion         60.00%
neuroticism          84.00%
openness             34.00%
Name: is_