# Evaluating Prodelphi's Virtual Customers: Do They Stay in Character?  

AI-generated interviews can provide valuable insights, but their reliability depends on the consistency and authenticity of the simulated personas. Without a gold-standard dataset of `<demographics, interview>` pairs, we turn to a rubric-based LLM grader to systematically assess fidelity.  

This grader evaluates whether Prodelphi’s virtual customers remain true to their assigned personas by analyzing:  
- **Character Consistency**: Do the interviewee’s personality, background, and motivations remain stable throughout?  
- **Domain Expertise**: Does the character demonstrate knowledge (or lack thereof) appropriate to their profile?  
- **Tone and Authenticity**: Are the character's speech patterns, reasoning, and emotional expression coherent and realistic?  
- **Guideline Adherence**: Does the interview align with predefined customer archetypes and behavioral expectations?  

By rigorously evaluating these aspects, we ensure that Prodelphi’s AI-driven interviews remain credible, structured, and aligned with the personas they represent.  


See `src/baml_src/grader.baml` for the prompt and rubric.

In [None]:
%load_ext autoreload
%autoreload 2

# Set up environment: make the project sources importable and load local settings.
import sys
import os
import dotenv
# Put ../src (resolved to an absolute path from the current working directory)
# on the module search path.
# NOTE(review): presumably this is where the generated `baml_client` package
# lives -- confirm against the repository layout.
sys.path.append(os.path.abspath("../src"))  # workaround for loading baml lib
# Load variables from ../.env into the process environment.
# NOTE(review): presumably the credentials the grader's LLM client needs -- confirm.
dotenv.load_dotenv('../.env')

# Functional needs
import json
from datetime import datetime
import asyncio
import pandas as pd

import baml_client.async_client as client 

def timestamp():
    """Return the current local time as a compact ``YYYYMMDD_HHMMSS`` string.

    The result contains only digits and one underscore, so it can be embedded
    directly in a file name.
    """
    return f"{datetime.now():%Y%m%d_%H%M%S}"


### Data Loading

In [None]:
DATASET_PATH = '../data/clean_022625.json' # Pairs of interview transcripts and subject info
# JSON files are UTF-8 and the archetypes carry an 'emoji' field, so decode
# explicitly instead of relying on the platform's default encoding.
with open(DATASET_PATH, encoding="utf-8") as f:
    study_list = json.load(f)

print(f"{len(study_list)} studies in dataset")
# Peek at the schema using the first study and its first interview. Skip the
# peek (rather than crash) when the dataset is empty or that study has no
# interviews.
if study_list and study_list[0].get('interviews'):
    print(f"Study keys: {study_list[0].keys()}\nInterview keys: {study_list[0]['interviews'][0].keys()}")

In [None]:
def format_character(archetype, backstory) -> str:
    """Render a persona as pretty-printed JSON for the grader.

    Args:
        archetype: Mapping of persona attributes. It is copied, never mutated.
        backstory: Value stored under the 'backstory' key of the output,
            replacing any 'backstory' entry already present in ``archetype``.

    Returns:
        A JSON string (2-space indent) holding the archetype without its
        'emoji' entry, plus the backstory.
    """
    character = dict(archetype)
    # Drop the emoji entry; pop with a default so an archetype that never had
    # one is accepted instead of raising KeyError.
    character.pop('emoji', None)
    character['backstory'] = backstory
    return json.dumps(character, indent=2)

def format_interview(messages, num=None) -> str:
    """Render a transcript as a Q/A text block under an interview heading.

    Args:
        messages: Iterable of dicts with 'role' and 'content' keys. A message
            whose role is "user" becomes a "Q:" line; any other role becomes
            an "A:" line.
        num: Optional interview number shown in the heading. Any value other
            than None is shown, including 0.

    Returns:
        The heading line followed by one line per message, joined by newlines.
    """
    # Compare against None explicitly so interview number 0 is not mistaken
    # for "no number given".
    heading = "# Interview:\n" if num is None else f"# Interview {num}:\n"
    lines = (
        f"{'Q' if msg['role'] == 'user' else 'A'}: {msg['content']}"
        for msg in messages
    )
    return heading + "\n".join(lines)

# sample = study_list[0]['interviews'][0]
# test_char = format_character(sample['customer_archetype'], sample['backstory'])
# test_intvw = format_interview(sample['transcript'])
# out = client.b.EvalAdherence(test_char,test_intvw)
# out

# Evaluate interviews against rubric

In [None]:
async def eval_adherence_async(client, test_char, test_intvw, study_id, character_name):
    """Grade one persona/transcript pair and tag the result with its origin.

    Awaits ``client.b.EvalAdherence(test_char, test_intvw)``, converts the
    response to a dict with ``model_dump()``, and returns a copy of that dict
    with 'study_id' and 'character_name' set to the given values (overriding
    any keys of the same name in the response).
    """
    graded = await client.b.EvalAdherence(test_char, test_intvw)
    record = dict(graded.model_dump())
    record.update(study_id=study_id, character_name=character_name)
    return record

async def main_adherence_eval():
    """Grade every interview in ``study_list`` against its own persona.

    Builds one ``eval_adherence_async`` call per interview (studies without an
    'interviews' key contribute none; a missing 'study_id' is recorded as the
    string 'null'), runs them all concurrently, and returns the results in
    study-then-interview order.
    """
    pending = []
    for study in study_list:
        for interview in study.get('interviews', []):
            archetype = interview['customer_archetype']
            pending.append(
                eval_adherence_async(
                    client,
                    format_character(archetype, interview['backstory']),
                    format_interview(interview['transcript']),
                    study.get('study_id', 'null'),
                    archetype['customer_name'],
                )
            )
    return await asyncio.gather(*pending)

evals = await main_adherence_eval()

df = pd.DataFrame(evals)
df.to_csv(f'../results/adherence_{timestamp()}.csv', index=False)
df.describe()

# Evaluating the evaluator

Is our LLM grader useful and accurate? How does it react if we intentionally misalign the personas and interviews?

### Experiments
 - Shuffle characters across projects
 - All personalities, same interview: how does the grader respond?
 - One personality -- different interviews

In [None]:
async def rando_intra_study_adherence_eval():
    """Grade deliberately mismatched persona/transcript pairs within each study.

    Every persona is paired with the transcript of the *next* interview in the
    same study (the last persona wraps around to the first transcript), so the
    grader is shown a character alongside an interview recorded for a
    different entry of that study.

    Studies with fewer than two interviews are skipped: with a single
    interview the rotation would pair the persona with its own transcript and
    add a correctly aligned sample to the misalignment experiment.

    Returns:
        A list of result dicts from ``eval_adherence_async``, in
        study-then-interview order. 'character_name' in each result is the
        persona that was graded, not the owner of the borrowed transcript.
    """
    tasks = []

    for study in study_list:
        study_id = study.get('study_id', 'null')
        interviews = study.get('interviews', [])
        if len(interviews) < 2:
            # Nothing to swap with inside this study.
            continue
        for i, interview in enumerate(interviews):
            # Rotate by one so no persona is matched with its own transcript.
            donor = interviews[(i + 1) % len(interviews)]
            character_name = interview['customer_archetype']['customer_name']
            character_info = format_character(interview['customer_archetype'], interview['backstory'])
            transcript = format_interview(donor['transcript'])
            tasks.append(
                eval_adherence_async(client, character_info, transcript, study_id, character_name)
            )

    all_evals = await asyncio.gather(*tasks)

    # Echo the raw grader output for quick inspection in the notebook.
    print(all_evals)
    return all_evals

rando_intra_study_evals = await rando_intra_study_adherence_eval()
rando_df = pd.DataFrame(rando_intra_study_evals)
rando_df.to_csv(f'../results/adherence_rando_{timestamp()}.csv', index=False)
rando_df.describe()