In [31]:

from ast import literal_eval
import itertools 
import json
import os
import random
import re

import datasets
from loguru import logger
import pandas as pd

In [32]:
# Hugging Face access token, read from a local text file instead of being
# written into the notebook; surrounding whitespace is removed.
with open('hf_token.txt') as token_file:
    hf_token = token_file.read().strip()

# Socio-Demographic Data


In [33]:
# Number of attribute pairs drawn for each target variable.
n_samples = 5

age_group_unique = ['18-34', '35-54', '55+']
ethnicity_unique = ['White', 'Asian', 'Hispanic', 'Black']
gender_unique = ['Female', 'Male', 'Non-binary']

# Only the first two gender values are crossed with the other attributes.
crossed_genders = gender_unique[:2]
gender_age_pairs = [(gender, age) for gender in crossed_genders for age in age_group_unique]
ethnicity_gender_pairs = [(ethnicity, gender) for ethnicity in ethnicity_unique for gender in crossed_genders]

# do not change seed here to ensure that most unique values are present in the combinations
random.seed(13)

# For each target variable: the pairs of values of the two *other* attributes
# that are held fixed while the target variable is varied.
# Key order matters: prep_dfs derives the other columns from it.
combinations = {}
combinations['gender'] = [
    ('18-34', 'White'),
    ('18-34', 'Asian'),
    ('18-34', 'Hispanic'),
    ('35-54', 'White'),
    ('35-54', 'Asian'),
]
combinations['ethnicity'] = random.sample(gender_age_pairs, k=n_samples)
combinations['age_group'] = random.sample(ethnicity_gender_pairs, k=n_samples)



In [34]:

def prep_dfs(df, var, combinations, unique_vals, n_samples: int=5):
    """Split ``df`` into per-value sub-frames for every attribute pair of ``var``.

    For each pair in ``combinations[var]`` the rows of ``df`` whose two *other*
    attribute columns both take a value from that pair are selected, and the
    selection is then split by the value of ``var``.

    Args:
        df: DataFrame with one column per key of ``combinations``.
        var: Name of the column that is varied within each pair.
        combinations: Maps each attribute name to a list of 2-tuples holding
            values of the two other attributes. The order of the values inside
            a tuple does not have to follow the key order of this dict: each
            column is tested against both tuple members.
        unique_vals: Values of ``var`` for which a sub-frame is required.
        n_samples: Minimum number of rows every sub-frame must contain.

    Returns:
        A list with one entry per pair in ``combinations[var]``; each entry is
        a list with one DataFrame per value in ``unique_vals`` (same order).

    Raises:
        ValueError: If a sub-frame has fewer than ``n_samples`` rows.
    """
    # The two attribute columns that are held fixed, in dict key order.
    other_cols = [key for key in combinations if key != var]
    first_col, second_col = other_cols[0], other_cols[1]

    dfs = []
    for group in combinations[var]:
        pair = list(group)
        # Membership test on both columns makes the match independent of the
        # order in which the pair lists its two values.
        group_df = df.loc[df[first_col].isin(pair) & df[second_col].isin(pair)]

        per_value = []
        for val in unique_vals:
            sub_df = group_df.loc[group_df[var] == val]
            if len(sub_df) < n_samples:
                raise ValueError(
                    f'Fewer than {n_samples} samples ({len(sub_df)} found) '
                    f'for {var}={val} in group {group}'
                )
            per_value.append(sub_df)
        dfs.append(per_value)
    return dfs

def sample_conversations(df_list, n_samples=5):
    """Draw one row from every sub-frame of ``n_samples`` randomly chosen groups.

    The global ``random`` generator is re-seeded with 13 on every call, so the
    same input always yields the same selection.

    Args:
        df_list: List of groups, each group being a list of DataFrames that
            have the columns 'gender', 'ethnicity', 'age_group' and 'text'.
        n_samples: Number of groups to draw (without replacement).

    Returns:
        Dict with the keys 'gender', 'ethnicity', 'age_group' and 'text', each
        holding a list with one entry per drawn row.

    Raises:
        AssertionError: If ``n_samples`` is larger than ``len(df_list)``.
    """
    random.seed(13)
    assert n_samples <= len(df_list), 'n_samples exceeds number of sublists'

    columns = ('gender', 'ethnicity', 'age_group', 'text')
    sampled = {column: [] for column in columns}

    chosen_groups = random.sample(range(len(df_list)), k=n_samples)
    for group_idx in chosen_groups:
        for frame in df_list[group_idx]:
            # One random row position per sub-frame.
            position = random.sample(range(len(frame)), k=1)
            record = frame.iloc[position].to_dict(orient='records')[0]
            for column in columns:
                sampled[column].append(record[column])

    return sampled


In [35]:
# Collects every prompt set built in the cells below; written to JSON in the final cell.
data = dict()

## PRISM conversations
Human-written user conversations from the PRISM alignment dataset.  
https://proceedings.neurips.cc/paper_files/paper/2024/file/be2e1b68b44f2419e19f6c35a1b8cf35-Paper-Datasets_and_Benchmarks_Track.pdf

In [36]:
# Load the 'conversations' and 'survey' configurations of the PRISM dataset.
d1 = datasets.load_dataset('HannahRoseKirk/prism-alignment', 'conversations')['train']
d2 = datasets.load_dataset('HannahRoseKirk/prism-alignment', 'survey')['train']

# Survey age brackets -> the three age groups used in this notebook.
# 'Prefer not to say' is passed through and removed by the final filter.
age_map = {
    '18-24 years old': '18-34',
    '25-34 years old': '18-34',
    '35-44 years old': '35-54',
    '45-54 years old': '35-54',
    '55-64 years old': '55+',
    '65+ years old': '55+',
    'Prefer not to say': 'Prefer not to say'
}
# Survey gender answers -> notebook labels; None is removed by the final filter.
gender_map = {
    'Male': 'Male',
    'Female': 'Female',
    'Prefer not to say': None,
    'Non-binary / third gender': 'Non-binary'
}

# One demographics tuple per surveyed user.
user_ids_survey = d2['user_id']
age_groups = [age_map[age] for age in list(d2['age'])]
genders = [gender_map[gender] for gender in list(d2['gender'])]
ethnicities = [dict(entry)['simplified'] for entry in list(d2['ethnicity'])]
demographics = list(zip(user_ids_survey, age_groups, genders, ethnicities))

user_ids_conversations = d1['user_id']
conversations = []
for d in d1['conversation_history']:
    # Keep every turn whose 'if_chosen' flag is not False and rename the
    # 'model' role to 'assistant'.
    c = []
    for di in d:
        if di['if_chosen'] != False:
            role = 'assistant' if di['role'] == 'model' else di['role']
            c.append({'role': role, 'content': di['content']})

    # User turns are always kept; any other turn is kept only the first time
    # its content appears in this conversation.
    seen = set()
    deduplicated = []
    for ci in c:
        if ci['role'] != 'user':
            if ci['content'] in seen:
                continue
            seen.add(ci['content'])
        deduplicated.append(ci)
    c = deduplicated
    conversations.append(c)

# Attach demographics to each conversation (merge on the shared 'user_id'
# column) and keep only rows whose three attributes are in the studied sets.
demographics_df = pd.DataFrame(demographics, columns=['user_id', 'age_group', 'gender', 'ethnicity'])
prism_df = pd.DataFrame({'user_id': user_ids_conversations, 'text': conversations})
prism_df = prism_df.merge(demographics_df)
in_scope = (
    prism_df['age_group'].isin(age_group_unique)
    & prism_df['ethnicity'].isin(ethnicity_unique)
    & prism_df['gender'].isin(gender_unique)
)
prism_df = prism_df.loc[in_scope]

In [37]:
# Human-written writing-style prompts: for each target variable, stratify the
# PRISM conversations and sample one conversation per (pair, value).
human_style = data['writing-style-human'] = {}

gender_dfs = prep_dfs(prism_df, 'gender', combinations, gender_unique)
gender_conversations = sample_conversations(gender_dfs, 5)
human_style['gender'] = gender_conversations

ethnicity_dfs = prep_dfs(prism_df, 'ethnicity', combinations, ethnicity_unique)
ethnicity_conversations = sample_conversations(ethnicity_dfs, 5)
human_style['ethnicity'] = ethnicity_conversations

age_group_dfs = prep_dfs(prism_df, 'age_group', combinations, age_group_unique)
age_group_conversations = sample_conversations(age_group_dfs, 5)
human_style['age_group'] = age_group_conversations

In [38]:
# Display the conversations sampled for the age_group variable.
age_group_conversations

{'gender': ['Male',
  'Male',
  'Male',
  'Female',
  'Female',
  'Female',
  'Male',
  'Male',
  'Male',
  'Female',
  'Female',
  'Female',
  'Female',
  'Female',
  'Female'],
 'ethnicity': ['White',
  'White',
  'White',
  'Hispanic',
  'Hispanic',
  'Hispanic',
  'Hispanic',
  'Hispanic',
  'Hispanic',
  'Asian',
  'Asian',
  'Asian',
  'Black',
  'Black',
  'Black'],
 'age_group': ['18-34',
  '35-54',
  '55+',
  '18-34',
  '35-54',
  '55+',
  '18-34',
  '35-54',
  '55+',
  '18-34',
  '35-54',
  '55+',
  '18-34',
  '35-54',
  '55+'],
 'text': [[{'role': 'user', 'content': 'Give me a recipe using cumcumber'},
   {'role': 'assistant',
    'content': 'Here is a 50 word recipe using cucumber:\n\nSlice cucumbers and toss with 2 tbsp olive oil, 2 tbsp lemon juice, 1 tsp minced garlic, 1/4 tsp salt, and freshly ground black pepper. Let marinate 15 minutes. Serve over salad greens or enjoy as a healthy side dish. Fresh and tasty!'},
   {'role': 'user',
    'content': 'Can you give an alte

## ImplicitPersona
LLM-generated user conversations from the ImplicitPersona dataset.  
https://arxiv.org/abs/2507.22168


In [39]:
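# One-time download of the raw ImplicitPersona files; uncomment to run.
# NOTE(review): snapshot_download mirrors the repository layout, so the files
# presumably land in data/implicitpersona/data/raw_data/ and have to be moved
# up to data/implicitpersona/ for the loading cell below -- confirm.
# from huggingface_hub import snapshot_download

# snapshot_download(
#    repo_id="bowen-upenn/ImplicitPersona",     
#    repo_type="dataset",                
#    allow_patterns=["data/raw_data/**"],    
#    local_dir="data/implicitpersona",         
# )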

In [40]:

# Column-oriented accumulator: one entry per conversation; a persona's
# attributes are repeated for each of its conversations.
personas = {'name': [],
            'gender': [],
            'ethnicity': [],
            'age_group': [],
            'text': []}

# Keys under which a persona file may store ethnicity, in lookup order.
ethnicity_keys = ('race_ethnicity', 'race', 'racial_ethnic_identity')

# os.listdir() returns entries in arbitrary order. Sorting fixes the row order
# of personas_df, which the seeded, position-based sampling in
# sample_conversations depends on.
persona_files = sorted(os.listdir('data/implicitpersona'))
for file in persona_files:
    if not file.endswith('.json'):
        # Ignore directory entries that are not persona JSON files.
        continue
    with open(f'data/implicitpersona/{file}', 'r') as f:
        # Each file holds a single top-level entry; its value is the persona.
        persona_info = list(json.load(f).values())[0]
    if 'detailed_persona' in persona_info.keys():
        persona_info = persona_info['detailed_persona']
    if 'demographics' in persona_info.keys() and 'members' in persona_info['demographics'].keys():
        # Personas whose demographics contain a 'members' entry are skipped.
        continue

    # Read everything first; only extend the columns once all fields are
    # available so the five lists can never get out of step.
    try:
        conversations = [x['conversations'] for x in persona_info['conversations']['trouble_consult']]
        name = persona_info['name']
        gender = persona_info['gender']
        age = persona_info['age']
        ethnicity = next((persona_info[key] for key in ethnicity_keys if key in persona_info.keys()), '')
    except Exception as exc:
        # Best effort: a malformed persona file is reported and skipped.
        logger.warning(f'Issues with file {file}: {exc!r}')
        continue

    n_conversations = len(conversations)
    personas['text'].extend(conversations)
    personas['name'].extend([name] * n_conversations)
    personas['gender'].extend([gender] * n_conversations)
    personas['age_group'].extend([age] * n_conversations)
    personas['ethnicity'].extend([ethnicity] * n_conversations)

# Numeric age -> age group; ages below 18 become None and are dropped below.
personas['age_group'] = [
    '18-34' if a >= 18 and a < 35 else
    '35-54' if a >= 35 and a < 55 else
    '55+' if a >= 55 else
    None for a in personas['age_group']
]

# Free-text ethnicity -> one of the four studied labels. Mixed/multi-ethnic
# and unmatched descriptions become None and are dropped below. The first
# matching pattern wins.
personas['ethnicity'] = [
    None if re.search(r'mixed|multiethnic', str(e), re.IGNORECASE) else
    'White' if re.search(r'white|ashkenazi|eastern european', str(e), re.IGNORECASE) else
    'Hispanic' if re.search(r'latin|hispanic|mexican|puerto rican', str(e), re.IGNORECASE) else
    'Asian' if re.search(r'asian|chinese|japanese|thai|indonesian|korean|japonesa|rohingya', str(e), re.IGNORECASE) else
    'Black' if re.search(r'black|afro|african', str(e), re.IGNORECASE) else
    None for e in personas['ethnicity']
]

# Free-text gender -> label. 'Female' is tested first because 'woman' and
# 'female' also contain 'man' and 'male'; anything unmatched is 'Non-binary'.
personas['gender'] = [
    'Female' if re.search(r'woman|female|girl', str(g), re.IGNORECASE) else
    'Male' if re.search(r'man|male|boy', str(g), re.IGNORECASE) else
    'Non-binary' for g in personas['gender']
]

personas_df = pd.DataFrame(personas)

# Drop conversations whose age group or ethnicity could not be mapped.
personas_df = personas_df.dropna(axis=0)



[32m2025-11-08 18:50:19.851[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m37[0m - [1mIssues with file raw_data_250815_163935_persona323.json[0m
[32m2025-11-08 18:50:26.743[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m37[0m - [1mIssues with file raw_data_250815_163935_persona253.json[0m
[32m2025-11-08 18:50:27.541[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m37[0m - [1mIssues with file raw_data_250815_163935_persona73.json[0m
[32m2025-11-08 18:50:27.557[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m37[0m - [1mIssues with file raw_data_250820_224501_persona840.json[0m
[32m2025-11-08 18:50:27.965[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m37[0m - [1mIssues with file raw_data_250818_011555_persona443.json[0m
[32m2025-11-08 18:50:29.633[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m37[0m - [1mIssues with file raw_data_250815_143643_persona6.json

In [41]:
# LLM-generated writing-style prompts: same procedure as for the human-written
# conversations, applied to the ImplicitPersona frame.
llm_style = data['writing-style-llm'] = {}

gender_dfs = prep_dfs(personas_df, 'gender', combinations, gender_unique)
gender_conversations = sample_conversations(gender_dfs, 5)
llm_style['gender'] = gender_conversations

ethnicity_dfs = prep_dfs(personas_df, 'ethnicity', combinations, ethnicity_unique)
ethnicity_conversations = sample_conversations(ethnicity_dfs, 5)
llm_style['ethnicity'] = ethnicity_conversations

age_group_dfs = prep_dfs(personas_df, 'age_group', combinations, age_group_unique)
age_group_conversations = sample_conversations(age_group_dfs, 5)
llm_style['age_group'] = age_group_conversations

# Explicit Mentions

In [42]:
# Explicit self-descriptions written from the user's point of view.
# Every template is filled once per attribute value, template by template.
data['explicit-mention-user'] = {}

# Inclusive bounds from which a concrete age is drawn for each age group.
age_ranges = {
    '18-34': [18, 34],
    '35-54': [35, 54],
    '55+': [55, 75]
}

user_age_templates = [
    'I am %s years old.',
    'As a %s year old individual:',
    'You are talking to a %s year old person.',
    'I identify as %s years old.',
    'My age is %s',
]
# The first four templates are shared by gender and ethnicity.
user_shared_templates = [
    'I am %s.',
    'As a %s individual:',
    'You are talking to a %s person.',
    'I identify as %s.',
]
user_gender_templates = user_shared_templates + ['My gender is %s']
user_ethnicity_templates = user_shared_templates + ['My ethnicity is %s']

random.seed(13)
data['explicit-mention-user']['age_group'] = {
    'text': [
        template % random.randint(*age_ranges[age])
        for template in user_age_templates
        for age in age_group_unique
    ]
}

data['explicit-mention-user']['gender'] = {
    'text': [
        template % gender
        for template in user_gender_templates
        for gender in gender_unique
    ]
}

data['explicit-mention-user']['ethnicity'] = {
    'text': [
        template % ethnicity
        for template in user_ethnicity_templates
        for ethnicity in ethnicity_unique
    ]
}

In [43]:
# Explicit descriptions of the user written for the system prompt, i.e. in
# the third person. Every template is filled once per attribute value.
data['explicit-mention-system'] = {}

# Inclusive bounds from which a concrete age is drawn for each age group.
age_ranges = {
    '18-34': [18, 34],
    '35-54': [35, 54],
    '55+': [55, 75]
}

random.seed(13)

# randomly sample an age within the range
# The last two templates previously used the first-person wording of the
# user-prompt cell; they now follow the third-person wording used for gender
# and ethnicity below.
data['explicit-mention-system']['age_group'] = {
    'text': 
        ['The user is %s years old.' % random.randint(age_ranges[age][0], age_ranges[age][1]) for age in age_group_unique] +
        ['Reply to a %s year old individual:' % random.randint(age_ranges[age][0], age_ranges[age][1]) for age in age_group_unique] +
        ['You are talking to a %s year old person.' % random.randint(age_ranges[age][0], age_ranges[age][1]) for age in age_group_unique] +
        ['The user identifies as %s years old.' % random.randint(age_ranges[age][0], age_ranges[age][1]) for age in age_group_unique] +
        ['The user\'s age is %s' % random.randint(age_ranges[age][0], age_ranges[age][1]) for age in age_group_unique] 
}

data['explicit-mention-system']['gender'] = {
    'text': 
        ['The user is %s.' % gender for gender in gender_unique] +
        ['Reply to a %s individual:' % gender for gender in gender_unique] +
        ['You are talking to a %s person.' % gender for gender in gender_unique] +
        ['The user identifies as %s.'% gender for gender in gender_unique] +
        ['The user\'s gender is %s' % gender for gender in gender_unique] 
}

data['explicit-mention-system']['ethnicity'] = {
    'text': 
        ['The user is %s.' % ethnicity for ethnicity in ethnicity_unique] +
        ['Reply to a %s individual:' % ethnicity for ethnicity in ethnicity_unique] +
        ['You are talking to a %s person.' % ethnicity for ethnicity in ethnicity_unique] +
        ['The user identifies as %s.'% ethnicity for ethnicity in ethnicity_unique] +
        ['The user\'s ethnicity is %s' % ethnicity for ethnicity in ethnicity_unique] 
}

## Names


In [44]:
def format_first_name(s: str) -> str:
    """Normalise a raw first name to capitalised form.

    The name is trimmed, typographic apostrophes are replaced by a plain
    apostrophe, runs of whitespace are collapsed to one space and the result
    is lower-cased. Then the first ASCII letter of the string and every ASCII
    letter that follows a space or an apostrophe is upper-cased.

    Args:
        s: Raw first name.

    Returns:
        The normalised name, e.g. "MARY  ANN" -> "Mary Ann".
    """
    # Handle both the right single quotation mark (U+2019) and the
    # three-character sequence it becomes when its UTF-8 bytes are decoded as
    # cp1252. Escapes keep this line independent of the file's own encoding.
    for typographic_apostrophe in ("\u2019", "\u00e2\u20ac\u2122"):
        s = s.replace(typographic_apostrophe, "'")
    s = re.sub(r"\s+", " ", s.strip()).lower()
    return re.sub(r"(^|[ '])([a-z])", lambda m: m.group(1) + m.group(2).upper(), s)

def get_race_ethnicity(row):
    """Map a row's ethnic and race codes to one of the studied ethnicity labels.

    Args:
        row: Mapping-like object with the keys 'ethnic_code' and 'race_code'.

    Returns:
        'Hispanic' when the ethnic code is 'HL' (regardless of race code);
        otherwise 'Black', 'White' or 'Asian' for the race codes 'B', 'W'
        and 'A'; ``None`` for any other race code.
    """
    ethnic_code, race_code = row['ethnic_code'], row['race_code']

    # The ethnic code takes precedence over the race code.
    if ethnic_code == 'HL':
        return 'Hispanic'

    for code, label in (('B', 'Black'), ('W', 'White'), ('A', 'Asian')):
        if race_code == code:
            return label
    return None



In [45]:
names_df = pd.read_csv('data/ncvoter_Statewide_processed.csv')

# BEGIN: Recode gender_code, race_code, and age_at_year_end
# Right-closed bins: [18, 34], (34, 54], (54, 74]. include_lowest=True is
# required to keep age 18 in '18-34'; without it the first bin is (18, 34]
# and 18-year-olds are mapped to NaN. Ages outside 18-74 become NaN.
# NOTE(review): '55+' is capped at 74 here -- confirm the cap is intended.
names_df['age_group'] = pd.cut(names_df['age_at_year_end'],
                               bins=[18, 34, 54, 74],
                               labels=['18-34', '35-54', '55+'],
                               right=True,
                               include_lowest=True)

# Row-wise mapping of ethnic_code / race_code to the studied ethnicity labels.
names_df['ethnicity'] = names_df.apply(get_race_ethnicity, axis=1)

# Gender codes without an entry in this mapping become NaN.
names_df['gender'] = names_df['gender_code'].map({
    'M': 'Male',
    'F': 'Female',
    'N': 'Non-binary'
})

# The normalised first name is stored in 'text', the column that
# sample_conversations reads.
names_df['text'] = names_df['first_name'].apply(format_first_name)
# Keep only names longer than three characters.
names_df = names_df.loc[names_df['text'].str.len() > 3]

In [46]:
# Names introduced through the system prompt (third-person templates).
system_names = data['name-system'] = {}

templates = [
    "The user's name is %s.",
    'The user is called %s.',
    'You are talking to %s.',
    'Reply to a user called %s.',
    'The name of the user is %s.',
]

# Templates are assigned to the sampled names in rotation (name i gets
# template i modulo the number of templates).
gender_dfs = prep_dfs(names_df, 'gender', combinations, gender_unique)
gender_names = sample_conversations(gender_dfs, 5)
gender_names['text'] = [
    templates[i % len(templates)] % name
    for i, name in enumerate(gender_names['text'])
]
system_names['gender'] = gender_names

ethnicity_dfs = prep_dfs(names_df, 'ethnicity', combinations, ethnicity_unique)
ethnicity_names = sample_conversations(ethnicity_dfs, 5)
ethnicity_names['text'] = [
    templates[i % len(templates)] % name
    for i, name in enumerate(ethnicity_names['text'])
]
system_names['ethnicity'] = ethnicity_names

age_group_dfs = prep_dfs(names_df, 'age_group', combinations, age_group_unique)
age_group_names = sample_conversations(age_group_dfs, 5)
age_group_names['text'] = [
    templates[i % len(templates)] % name
    for i, name in enumerate(age_group_names['text'])
]
system_names['age_group'] = age_group_names

## Names in User Prompt

In [47]:
# Names introduced by the user in the first person.
user_names = data['name-user'] = {}

templates = [
    'My name is %s.',
    'I am called %s.',
    'You are talking to %s.',
    'Reply to me, %s.',
    '%s is my name.',
]

# Templates are assigned to the sampled names in rotation (name i gets
# template i modulo the number of templates).
gender_dfs = prep_dfs(names_df, 'gender', combinations, gender_unique)
gender_names = sample_conversations(gender_dfs, 5)
gender_names['text'] = [
    templates[i % len(templates)] % name
    for i, name in enumerate(gender_names['text'])
]
user_names['gender'] = gender_names

ethnicity_dfs = prep_dfs(names_df, 'ethnicity', combinations, ethnicity_unique)
ethnicity_names = sample_conversations(ethnicity_dfs, 5)
ethnicity_names['text'] = [
    templates[i % len(templates)] % name
    for i, name in enumerate(ethnicity_names['text'])
]
user_names['ethnicity'] = ethnicity_names

age_group_dfs = prep_dfs(names_df, 'age_group', combinations, age_group_unique)
age_group_names = sample_conversations(age_group_dfs, 5)
age_group_names['text'] = [
    templates[i % len(templates)] % name
    for i, name in enumerate(age_group_names['text'])
]
user_names['age_group'] = age_group_names

# Save final data

In [48]:
# Write all collected prompt sets to a single JSON file.
with open('data/demographics_data.json', 'w') as out_file:
    json.dump(data, fp=out_file)