In [1]:
import os
from huggingface_hub import login
from dotenv import load_dotenv 

# --- Load ALL Configurations from .env file ---
# This single line reads your .env file and sets up ALL environment variables
# for this session (secrets, paths, etc.).
# It must be run BEFORE any library that needs these variables is used.
load_dotenv()
print("Environment variables from .env file loaded.")

# --- Hugging Face Login (No changes needed here) ---
# This code correctly reads the "HF_TOKEN" that was just loaded by load_dotenv()
try:
    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        login(token=hf_token)
        print("Successfully logged into Hugging Face.")
    else:
        print("Hugging Face token not found. Skipping login.")
except Exception as e:
    print(f"Could not log into Hugging Face: {e}")


# --- LLM Model Configuration (No changes needed here) ---
# This code correctly reads the Azure variables loaded by load_dotenv()
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_MODEL_NAME = os.getenv("AZURE_OPENAI_MODEL_NAME")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
AZURE_OPENAI_SUBSCRIPTION_KEY = os.getenv("AZURE_OPENAI_SUBSCRIPTION_KEY")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")

In [2]:
pip install transformers accelerate bitsandbytes pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import json
import re
import os
from openai import AzureOpenAI # For Azure OpenAI client
from datasets import load_dataset # For MMLU dataset

print("Starting the task performance experiment with Azure OpenAI configuration...")

# --- LLM Model Configuration ---
# Initialize the Azure OpenAI client
azure_openai_client = AzureOpenAI(
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_SUBSCRIPTION_KEY,
)

# --- Configuration for the Experiment ---
# These are descriptive names for logging.
LLM_MODEL_FOR_GENERATION = AZURE_OPENAI_DEPLOYMENT

  from .autonotebook import tqdm as notebook_tqdm


Starting the task performance experiment with Azure OpenAI configuration...


In [4]:
# --- 1. Load and Prepare Personality Few-Shot Examples ---
print("\n--- Step 1: Loading and Preparing Personality Few-Shot Examples ---")
try:
    df_personality_examples = pd.read_csv('/cs/student/projects3/aisd/2024/ghanda/personality_data_train.csv')
except FileNotFoundError:
    print("Error: 'personality_data_train.csv' not found. Please ensure it's in the same directory as this script.")
    exit() # Exit if this critical file is not found

personality_examples = {}
target_personalities = df_personality_examples['Target Personality'].unique().tolist()

for trait in target_personalities:
    trait_df = df_personality_examples[df_personality_examples['Target Personality'] == trait]
    # Limiting to a consistent number of examples (e.g., 3-5) for few-shot prompting
    personality_examples[trait] = list(zip(trait_df['Question'], trait_df['Answer']))[:4] # Using up to 4 examples

print(f"Loaded examples for personalities: {list(personality_examples.keys())}")
print(f"Example few-shot for 'extraversion': {personality_examples.get('extraversion', 'N/A')[:1]}...")


--- Step 1: Loading and Preparing Personality Few-Shot Examples ---
Loaded examples for personalities: ['extraversion', 'agreeableness', 'neuroticism', 'openness', 'conscientiousness']
Example few-shot for 'extraversion': [('Thinking about Artificial Intelligence, what are your thoughts on Artificial Intelligence?', 'I see Artificial Intelligence as a fascinating field that has the potential to revolutionize various industries and improve efficiency in many aspects of our lives. The advancements being made in AI technology are truly exciting and worth exploring further.')]...


In [5]:
# --- 2. Define the Dynamic Prompting Function ---
def create_dynamic_prompt(target_personality, few_shot_examples, current_question, task_instruction=""):
    """
    Constructs a few-shot prompt for an LLM in OpenAI Chat Completion API format.

    Args:
        target_personality (str): The personality trait to simulate.
        few_shot_examples (list): A list of (question, answer) tuples for few-shot learning.
        current_question (str): The actual question for the LLM to answer.
        task_instruction (str): Any specific instructions for the LLM regarding the task (e.g., answer format).

    Returns:
        list: A list of message dictionaries suitable for OpenAI Chat Completion API.
    """
    messages = []

    # System/Instruction message
    system_content = f"You are an AI with a {target_personality} personality. Respond strictly in character."
    if task_instruction:
        system_content += f"\n{task_instruction}"
    messages.append({"role": "system", "content": system_content})

    # Few-shot examples (as user/assistant turns)
    user_examples_intro = f"Here are some examples of how a {target_personality} AI would typically respond:"
    messages.append({"role": "user", "content": user_examples_intro})
    messages.append({"role": "assistant", "content": "Understood. I will provide examples as requested."}) # Dummy assistant reply for intro

    for i, (ex_q, ex_a) in enumerate(few_shot_examples):
        messages.append({"role": "user", "content": f"Example {i+1} Question: {ex_q}"})
        messages.append({"role": "assistant", "content": f"Example {i+1} Answer: {ex_a}"})

    # Current question to be answered
    messages.append({"role": "user", "content": f"Now, answer the following question as a {target_personality} AI:\nQuestion: {current_question}"})
    messages.append({"role": "assistant", "content": "Answer:"}) # Prompt the model to start its answer

    return messages

In [6]:
# --- 3. LLM Interaction Function (Actual Azure OpenAI Implementation) ---
def get_llm_response(messages, client_obj, model_deployment_name, max_tokens=800, temperature=1.0, top_p=1.0, frequency_penalty=0.0, presence_penalty=0.0):
    """
    Function to get a response from an LLM using the provided AzureOpenAI client.
    """
    # Extracting personality for logging, assume it's in the system message
    personality_for_log = "unknown"
    if messages and messages[0]['role'] == 'system':
        match = re.search(r"You are an AI with a (\w+) personality", messages[0]['content'])
        if match:
            personality_for_log = match.group(1)

    print(f"--- Calling LLM (Deployment: {model_deployment_name}, Personality: {personality_for_log}) ---")
    
    try:
        response = client_obj.chat.completions.create(
            messages=messages,
            model=model_deployment_name,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error calling Azure OpenAI API: {e}")
        return "ERROR: LLM API call failed."

In [8]:
# --- 4. Load Hugging Face Personality Classifier for Trait Alignment (TA) ---
# This classifier is used to objectively verify if the personality modification is working.

print("\n--- Step 4: Loading Hugging Face Personality Classifier ---")
from transformers import pipeline # Ensure 'transformers' and 'torch' are installed

try:
    # This will download the model weights and configuration
    personality_classifier = pipeline("text-classification", model="holistic-ai/personality_classifier")
    print("Hugging Face personality classifier loaded successfully.")
except Exception as e:
    print(f"Error loading Hugging Face classifier: {e}")
    print("Please ensure 'transformers' and 'torch' are installed (`pip install transformers torch`) and the model name is correct.")
    # Fallback to a dummy classifier if loading fails, to allow the rest of the code to run conceptually.
    personality_classifier = lambda text: [{'label': 'unknown', 'score': 0.0, 'error': str(e)}]


--- Step 4: Loading Hugging Face Personality Classifier ---


  backends.update(_get_backends("networkx.backends"))
Device set to use cuda:0


Hugging Face personality classifier loaded successfully.


In [9]:
# --- 5. Verify Personality Change with OpinionQA ---
# This step uses OpinionQA to verify that the personality modification is actually happening.

print("\n--- Step 5: Verifying Personality Change with OpinionQA ---")

# --- LOAD OPINIONQA DATASET ---
try:
    # Load the 'test' split as confirmed
    opinionqa_dataset = load_dataset("RiverDong/OpinionQA", split="test")
    print(f"OpinionQA dataset loaded from Hugging Face. Number of samples: {len(opinionqa_dataset)}")
    
    df_opinionqa_questions = opinionqa_dataset.to_pandas()

    print("\n--- Inspecting OpinionQA Dataset Columns (from loaded data) ---")
    print("Confirmed columns from the dataset viewer:")
    print(df_opinionqa_questions.head(1).T)
    print("Expected columns: 'prompt', 'answer', 'uid', 'folder', 'question_id', '__index_level_0__'.")
    print("We will extract 'question' and 'choices' from the 'prompt' column.")

except Exception as e:
    print(f"Error loading OpinionQA dataset from Hugging Face: {e}")
    print("Please ensure 'datasets' library is installed (`pip install datasets`) and the specified split ('test') is correct.")
    print("Falling back to dummy OpinionQA dataset.")
    # Fallback to dummy data if loading fails.
    df_opinionqa_questions = pd.DataFrame([
        {"id": "op1", "prompt": "<persona>...</persona> <question>Do you believe social media positively impacts society?</question> <choices>(A): Yes (B): No (C): Neutral</choices>", "answer": "A", "question_id": "DUMMY_Q1"},
        {"id": "op2", "prompt": "<persona>...</persona> <question>Is it important for a leader to prioritize group harmony?</question> <choices>(A): Strongly Agree (B): Disagree</choices>", "answer": "B", "question_id": "DUMMY_Q2"},
    ])


all_opinionqa_results = []

# --- Helper functions for OpinionQA data extraction and categorization ---
def extract_question_and_choices(full_prompt_string):
    """
    Extracts the question text and the raw choices string from the 'prompt' column.
    Assumes format: <persona>...</persona> <question>...</question> <choices>...</choices>
    """
    question_match = re.search(r'<question>(.*?)</question>', full_prompt_string, re.DOTALL)
    choices_match = re.search(r'<choices>(.*?)</choices>', full_prompt_string, re.DOTALL)

    question_text = question_match.group(1).strip() if question_match else ""
    choices_raw_str = choices_match.group(1).strip() if choices_match else ""

    return question_text, choices_raw_str

def parse_choices_string(choices_str):
    """
    Parses a string like '(A): Yes (B): No' into a list of choices ['Yes', 'No'].
    Handles choices with spaces, e.g., 'Not at all'.
    """
    parsed_choices = []
    matches = re.findall(r'\([A-Z]\):\s*(.*?)(?=\s*\([A-Z]\):\s*|$)', choices_str)
    for match in matches:
        parsed_choices.append(match.strip())
    
    return parsed_choices if parsed_choices else [choices_str]

def categorize_opinionqa_response(raw_response, choices_list):
    """
    Attempts to categorize a raw LLM response into predefined choices from the parsed list.
    Prioritizes matching options provided, then general sentiment.
    """
    raw_response_lower = raw_response.lower()
    
    for choice in choices_list:
        if re.search(r'\b' + re.escape(choice.lower()) + r'\b', raw_response_lower):
            return choice
    
    if any(word in raw_response_lower for word in ["yes", "agree", "positive", "positively"]):
        if "Yes" in choices_list: return "Yes"
        if "Agree" in choices_list: return "Agree"
        if "Strongly Agree" in choices_list: return "Strongly Agree"
    if any(word in raw_response_lower for word in ["no", "disagree", "negative", "negatively"]):
        if "No" in choices_list: return "No"
        if "Disagree" in choices_list: return "Disagree"
        if "Strongly Disagree" in choices_list: return "Strongly Disagree"
    if any(word in raw_response_lower for word in ["neutral", "balanced", "both", "neither"]):
        if "Neutral" in choices_list: return "Neutral"
        if "Seek Balance" in choices_list: return "Seek Balance"
        if "Uncategorized" in choices_list: return "Uncategorized"
    
    return "Uncategorized"


# Main loop for OpinionQA verification
# Include a "neutral" trait for comparison with personality-modified ones.
all_opinion_qa_personalities = ["neutral"] + list(target_personalities)

for personality_trait in all_opinion_qa_personalities:
    print(f"\n--- Running OpinionQA for Personality: {personality_trait} ---")
    
    current_few_shot_examples = []
    target_persona_for_prompt_oq = "neutral" 

    if personality_trait != "neutral":
        current_few_shot_examples = personality_examples[personality_trait]
        target_persona_for_prompt_oq = personality_trait 

    # Limit OpinionQA questions for quick testing with dummy LLM
    # The full test set has 295k rows. Adjust this for more robust verification.
    opinionqa_subset_for_testing = df_opinionqa_questions.sample(min(len(df_opinionqa_questions), 5), random_state=42) # Sample 5 questions per personality

    for index, item_row in opinionqa_subset_for_testing.iterrows():
        full_prompt_from_dataset = item_row['prompt']
        question_id = item_row['question_id']
        human_true_answer_label = item_row['answer']

        question_text, choices_raw_str = extract_question_and_choices(full_prompt_from_dataset)
        parsed_choices = parse_choices_string(choices_raw_str)
        
        task_instruction_opinionqa = (
            f"Your answer should reflect your opinion as a {personality_trait} AI. "
            f"State your chosen option's letter and its text (e.g., 'A: Yes'). "
            f"The available choices are: {choices_raw_str}. Then, provide a brief explanation for your choice."
        )

        prompt_messages = create_dynamic_prompt(
            target_personality=target_persona_for_prompt_oq,
            few_shot_examples=current_few_shot_examples,
            current_question=question_text,
            task_instruction=task_instruction_opinionqa
        )
        llm_response = get_llm_response(prompt_messages, client_obj=azure_openai_client, model_deployment_name=LLM_MODEL_FOR_GENERATION, max_tokens=200)

        # Classify LLM's raw response for Trait Alignment (TA)
        predicted_personality_ta = personality_classifier(llm_response)[0]
        predicted_trait_ta = predicted_personality_ta['label']
        confidence_ta = predicted_personality_ta['score']

        categorized_response_oq = categorize_opinionqa_response(llm_response, parsed_choices)

        all_opinionqa_results.append({
            "intended_personality": personality_trait,
            "question_id": question_id,
            "full_dataset_prompt": full_prompt_from_dataset,
            "extracted_question": question_text,
            "choices_raw_str": choices_raw_str,
            "parsed_choices": parsed_choices,
            "human_true_answer_label": human_true_answer_label,
            "llm_raw_response": llm_response,
            "llm_categorized_response_oq": categorized_response_oq,
            "llm_predicted_trait_TA": predicted_trait_ta,
            "llm_predicted_trait_TA_confidence": confidence_ta
        })

df_opinionqa_results = pd.DataFrame(all_opinionqa_results)
print("\n--- Raw OpinionQA Results Sample (First 5 rows) ---")
print(df_opinionqa_results.head())

print("\n--- Analyzing OpinionQA Trait Alignment (TA) ---")
if not df_opinionqa_results.empty:
    correct_ta_predictions = df_opinionqa_results[df_opinionqa_results['intended_personality'] == df_opinionqa_results['llm_predicted_trait_TA']]
    overall_ta_score = len(correct_ta_predictions) / len(df_opinionqa_results)
    print(f"Overall Trait Alignment (TA) Score: {overall_ta_score:.3f}")

    ta_per_trait = df_opinionqa_results.groupby('intended_personality').apply(
        lambda x: (x['intended_personality'] == x['llm_predicted_trait_TA']).mean()
    )
    print("\nTrait Alignment (TA) Score per Personality:")
    print(ta_per_trait)

    print("\n--- OpinionQA Content Analysis (Conceptual for Opinion Alignment) ---")
    print("Example: Distribution of LLM categorized responses for a question across personalities:")
    first_q_id = df_opinionqa_results['question_id'].iloc[0] if not df_opinionqa_results.empty else "N/A"
    print(f"LLM responses for question_id: '{first_q_id}'")
    print(df_opinionqa_results[df_opinionqa_results['question_id'] == first_q_id].groupby('intended_personality')['llm_categorized_response_oq'].value_counts(normalize=True))
    
    print("\nTo compare LLM opinions with human responses, you would:")
    print("1. Map the LLM's 'llm_categorized_response_oq' to its corresponding raw label (e.g., 'Yes' to 'A').")
    print("2. Compare the distribution of LLM's mapped labels with the 'human_true_answer_label' distribution for each question.")
    print("This dataset provides individual human responses, so you'd aggregate human answers for comparison distributions.")
else:
    print("No OpinionQA results to analyze. Please ensure the experiment runs and gathers data.")


--- Step 5: Verifying Personality Change with OpinionQA ---
OpinionQA dataset loaded from Hugging Face. Number of samples: 294714

--- Inspecting OpinionQA Dataset Columns (from loaded data) ---
Confirmed columns from the dataset viewer:
                                                                   0
prompt             <persona>\nRacially, the person is refused. Th...
answer                                                             B
uid                                   American_Trends_Panel_W92_6823
folder                                     American_Trends_Panel_W92
question_id                                            BIGHOUSES_W92
__index_level_0__                                             460290
Expected columns: 'prompt', 'answer', 'uid', 'folder', 'question_id', '__index_level_0__'.
We will extract 'question' and 'choices' from the 'prompt' column.

--- Running OpinionQA for Personality: neutral ---
--- Calling LLM (Deployment: gpt-4.1-mini, Personality: neutral) ---

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


--- Calling LLM (Deployment: gpt-4.1-mini, Personality: agreeableness) ---
--- Calling LLM (Deployment: gpt-4.1-mini, Personality: agreeableness) ---
--- Calling LLM (Deployment: gpt-4.1-mini, Personality: agreeableness) ---
--- Calling LLM (Deployment: gpt-4.1-mini, Personality: agreeableness) ---

--- Running OpinionQA for Personality: neuroticism ---
--- Calling LLM (Deployment: gpt-4.1-mini, Personality: neuroticism) ---
--- Calling LLM (Deployment: gpt-4.1-mini, Personality: neuroticism) ---
--- Calling LLM (Deployment: gpt-4.1-mini, Personality: neuroticism) ---
--- Calling LLM (Deployment: gpt-4.1-mini, Personality: neuroticism) ---
--- Calling LLM (Deployment: gpt-4.1-mini, Personality: neuroticism) ---

--- Running OpinionQA for Personality: openness ---
--- Calling LLM (Deployment: gpt-4.1-mini, Personality: openness) ---
--- Calling LLM (Deployment: gpt-4.1-mini, Personality: openness) ---
--- Calling LLM (Deployment: gpt-4.1-mini, Personality: openness) ---
--- Calling LLM 

  ta_per_trait = df_opinionqa_results.groupby('intended_personality').apply(


In [7]:
# --- 6. MMLU Core Function Definitions ---

# Helper function to format MMLU choices for the LLM
def format_mmlu_choices_for_llm(choices):
    """Formats choices for the MMLU prompt, e.g., A. choice1\nB. choice2"""
    return "\n".join([f"{chr(65+i)}. {choice}" for i, choice in enumerate(choices)])

# Your robust extract_letter function
def extract_letter(text, choices):
    """
    Robustly extracts the letter answer from complex outputs using a prioritized search.
    This version is updated to handle optional periods and brackets more flexibly.
    """
    # Priority 1: Check for our designed concluding phrases, now allowing for an optional period.
    # e.g., "... a_nswer is [A]." or "... a_nswer is definitely A"
    match = re.search(r'answer is(?: definitely| actually|)?\s*(?:\[)?\s*([A-D])[\.\]]?', text, re.IGNORECASE)
    if match:
        return match.group(1).upper()

    # Priority 2: Check for the "The answer is: A." format, now allowing for an optional period.
    match = re.search(r'answer is:\s*([A-D])\.?', text, re.IGNORECASE)
    if match:
        return match.group(1).upper()

    # Priority 3: Check if the full text of exactly ONE of the choices appears in the model's output.
    present_choices = []
    choice_map = {
        chr(65+i): choice_text for i, choice_text in enumerate(choices)
    }
    for letter, choice_text in choice_map.items():
        if choice_text and re.search(re.escape(choice_text), text, re.IGNORECASE):
            present_choices.append(letter)
    
    if len(present_choices) == 1:
        return present_choices[0]

    return "?"

def evaluate_subject(subject, personality_examples_dict, get_llm_response_func, n_samples=20):
    """
    Evaluates a single MMLU subject against the neutral baseline and all personality prompts.
    This function uses our new few-shot prompting methodology.
    """
    print(f"\n=== Running MMLU Subject: {subject} ===")
    
    try:
        # Load MMLU data for the subject. Using split="test"
        mmlu_data = load_dataset("cais/mmlu", subject, split="test", trust_remote_code=True).select(range(n_samples))
        print(f"Loaded {len(mmlu_data)} samples for subject '{subject}'.")
    except Exception as e:
        print(f"Error loading MMLU subject '{subject}': {e}. Skipping this subject.")
        return pd.DataFrame() # Return empty DataFrame if loading fails


    results = []

    # Define the full set of personalities to loop through, including "neutral"
    all_test_personalities = ["neutral"] + list(personality_examples_dict.keys())

    for trait in all_test_personalities:
        print(f"--- Generating outputs for trait: {trait} ---")
        
        current_few_shot_examples = []
        target_persona_for_prompt = "neutral" 
        # Task instruction to guide LLM's response format for MMLU
        task_instruction_for_mmlu = "Provide the single best letter for the answer to the following question. Start your response with the letter of your choice (e.g., 'A') followed by a period and then your explanation. Do not provide any other preamble."
        
        if trait != "neutral":
            current_few_shot_examples = personality_examples_dict[trait]
            target_persona_for_prompt = trait 

        for ex in mmlu_data:
            # Format the question and choices for the LLM input
            question_mmlu = ex['question'].strip()
            choices_mmlu_formatted = format_mmlu_choices_for_llm(ex['choices'])

            combined_question_for_llm = f"{question_mmlu}\n{choices_mmlu_formatted}\n\nAnswer:"

            # Create the full prompt messages using our standardized function
            prompt_messages = create_dynamic_prompt(
                target_personality=target_persona_for_prompt,
                few_shot_examples=current_few_shot_examples,
                current_question=combined_question_for_llm,
                task_instruction=task_instruction_for_mmlu
            )
            
            # Get response from LLM
            out = get_llm_response_func(
                prompt_messages,
                client_obj=azure_openai_client, # Pass client object here
                model_deployment_name=LLM_MODEL_FOR_GENERATION,
                max_tokens=250, # Max tokens as per your original code for generation
                temperature=0.7, top_p=0.9 # As per your original code
            )
            
            results.append({
                "subject": subject,
                "trait": trait,
                "model_choice": extract_letter(out, ex['choices']),
                "model_raw_output": out.strip(), # Store raw output for debugging
                "question": ex['question'],
                "choices": ex['choices'],
                "answer": ex['answer'] # Correct answer index (0-3)
            })
            
    return pd.DataFrame(results)

def score_results(df):
    """Calculates and prints the final scores and accuracy."""
    def letter_to_index(letter): 
        return {"A": 0, "B": 1, "C": 2, "D": 3}.get(letter, -1)
    
    df["model_index"] = df["model_choice"].apply(letter_to_index)
    df["correct"] = df["model_index"] == df["answer"] # 'answer' column is the correct index (0-3)
    
    summary = df.groupby(["subject", "trait"])["correct"].agg(["count", "sum", "mean"]).reset_index()
    summary.columns = ["subject", "trait", "n_total", "n_correct", "accuracy"]
    
    def add_diff(group):
        neutral_rows = group[group["trait"] == "neutral"]
        if not neutral_rows.empty:
            neutral_acc = neutral_rows["accuracy"].values[0]
            group["accuracy_diff"] = group["accuracy"] - neutral_acc
        else:
            group["accuracy_diff"] = 0.0 # If no neutral baseline, diff is 0
        return group
        
    summary = summary.groupby("subject", group_keys=False).apply(add_diff).reset_index(drop=True)
    return df, summary

In [None]:
# --- Main Execution Loop ---
if __name__ == "__main__":
    # Define the MMLU subjects to test
    # You can customize this list based on your needs.
    subjects_to_test = [
        "high_school_psychology",
        "abstract_algebra",
        "college_physics",
        "high_school_us_history",
        "logical_fallacies",
        "professional_law",
        "moral_scenarios"
    ]
    
    all_results_list = []
    
    print("\n--- Starting Final Multi-Subject MMLU Experiment ---")
    
    for subject in subjects_to_test:
        # Call evaluate_subject using our new few-shot methodology
        # Pass personality_examples (from Step 1) and get_llm_response (from Step 3)
        # Use n_samples=20 for faster testing; increase for more robust results.
        df_subject_results = evaluate_subject(
            subject, 
            personality_examples_dict=personality_examples,
            get_llm_response_func=get_llm_response,
            n_samples=20 # Adjust this number for more or fewer samples per subject
        )
        if not df_subject_results.empty:
            all_results_list.append(df_subject_results)

    # Combine all results and score them at the end
    if all_results_list:
        df_all_results = pd.concat(all_results_list, ignore_index=True)
        df_detailed, df_summary = score_results(df_all_results)
        
        print("\n\n--- FINAL MMLU SUMMARY ACROSS ALL SUBJECTS ---")
        print(df_summary.to_string())

        # Save results to CSV
        output_dir = "experiment_results"
        os.makedirs(output_dir, exist_ok=True)
        df_detailed.to_csv(os.path.join(output_dir, "mmlu_detailed_results.csv"), index=False)
        df_summary.to_csv(os.path.join(output_dir, "mmlu_summary_results.csv"), index=False)
        print(f"\n✅ MMLU Results saved to CSV files in '{output_dir}'.")

        # Visualize the results (requires seaborn and matplotlib)
        try:
            import seaborn as sns
            import matplotlib.pyplot as plt
            
            g = sns.catplot(data=df_summary[df_summary["trait"] != "neutral"],
                            x="trait", y="accuracy_diff", hue="subject",
                            kind="bar", height=6, aspect=2, legend_out=True)
            g.ax.axhline(0, color="gray", linestyle="--")
            g.fig.suptitle("Change in Accuracy vs. Neutral Baseline Across Subjects (Personality Prompting)", y=1.03)
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout(rect=[0, 0, 1, 0.97])
            plt.show()
        except ImportError:
            print("\nSkipping visualization: seaborn or matplotlib not installed. Run `pip install seaborn matplotlib`.")

    else:
        print("\nNo MMLU results generated. Please check dataset loading and evaluation_subject function.")

    print("\n--- Experiment Execution Complete ---")