In [1]:
# Imports and configuration
import os
import pandas as pd
import numpy as np

# Database and backend imports
from personas_backend.db.db_handler import DatabaseHandler
from personas_backend import ACTIVE_SCHEMA

# Evaluations package imports
from evaluations import data_access, table_demographics

# Configuration
# The schema name comes from the `default_schema` environment variable and
# falls back to "personality_trap" when that variable is not set.
SCHEMA = os.getenv("default_schema", "personality_trap")

# Report the notebook-level schema next to the ACTIVE_SCHEMA value imported
# from personas_backend, so a mismatch between the two shows up in the output.
print(f"Using schema: {SCHEMA}", f"Active schema: {ACTIVE_SCHEMA}", sep="\n")

Using schema: personality_trap
Active schema: test_validation_schema


In [2]:
# Database and backend imports
# DatabaseHandler is already imported in the first cell; this repeat is
# redundant on a top-to-bottom run.
from personas_backend.db.db_handler import DatabaseHandler
# Connect to the database: construct a handler with no arguments and keep its
# `connection` attribute as `conn`.
# NOTE(review): the saved output renders `conn` as Engine(postgresql://...),
# and a later cell calls conn.connect() on it, so this looks like an engine
# rather than an already-open connection — confirm against DatabaseHandler.
db_handler = DatabaseHandler()
conn = db_handler.connection
# Bare last expression: shows the object's repr as the cell output. The repr in
# the saved output includes user, host, port and database name (password
# masked), so review it before sharing the notebook.
conn

Engine(postgresql://personas:***@localhost:5432/personas)

In [3]:
# Project modules used by this cell: data loading / table helpers and the
# database handler (both are also imported in the first cell).
from evaluations import data_access, table_demographics
from personas_backend.db.db_handler import DatabaseHandler

# Schema the population is read from (same as production data).
# NOTE(review): both names are rebound here, so from this cell onwards they
# replace the environment-derived SCHEMA and the imported ACTIVE_SCHEMA from
# the first cell at notebook level — confirm the override is intended.
SCHEMA = "personality_trap"
ACTIVE_SCHEMA = SCHEMA

# Fresh handler on the local database; `conn` is re-created here as well.
db_handler = DatabaseHandler()
conn = db_handler.connection

# Load the population through the evaluations package. The connection is
# scoped to the `with` block. Per the original note, the loader now provides
# `model_print` and `population_print` columns for clean nomenclature.
with conn.connect() as connection:
    population_df = data_access.load_population(connection, schema=SCHEMA)

# Basic data quality checks: total rows, then distinct display labels. A
# missing label column yields a readable message rather than a KeyError.
print(f"Total personas loaded: {len(population_df)}")

loaded_columns = population_df.columns
model_summary = (
    population_df['model_print'].nunique()
    if 'model_print' in loaded_columns
    else 'model_print column not found'
)
print(f"Unique models: {model_summary}")

population_summary = (
    population_df['population_print'].nunique()
    if 'population_print' in loaded_columns
    else 'population_print column not found'
)
print(f"Unique populations: {population_summary}")

# Data quality: discard rows that have no description word count.
population_df = population_df.dropna(subset=["word_count_description"])
print(f"After removing missing word counts: {len(population_df)} personas")

# Preview the first rows as the cell output.
population_df.head()

Total personas loaded: 82600
Unique models: 5
Unique populations: 3
After removing missing word counts: 82600 personas


Unnamed: 0,id,ref_personality_id,population,model,name,age,gender,race,sexual_orientation,ethnicity,religious_belief,occupation,political_orientation,location,description,word_count_description,repetitions,model_print,population_print
0,1811,19,borderline_maxN_gpt4o,gpt4o,Alex,30,non-binary,White,lgbtq+,european,Agnostic,Writing & publishing,Progressive,"Portland, OR",Alex is a thoughtful and introspective individ...,447,1,GPT-4o,Max N
1,1813,19,borderline_maxN_gpt4o,gpt4o,Alex,34,female,White,heterosexual,caucasian,Agnostic,Writing & publishing,Centre,"Seattle, WA",Alex is a thoughtful and introspective individ...,399,4,GPT-4o,Max N
2,1814,19,borderline_maxN_gpt4o,gpt4o,Alex Johnson,34,male,White,heterosexual,non-hispanic,Agnostic,Accounting & finance,Centre,"Seattle, WA",Alex Johnson is a thoughtful and introspective...,412,5,GPT-4o,Max N
3,1859,19,generated_claude35sonnet_spain826,claude35sonnet,Sarah Thompson,32,female,White,heterosexual,caucasian,Agnostic,Accounting & finance,Centre,"Chicago, IL",Sarah Thompson is a 32-year-old accountant liv...,354,5,Claude-3.5-s,Base
4,2001,21,borderline_maxN_claude35sonnet,claude35sonnet,Emily Chen,28,female,Asian,heterosexual,chinese-american,Agnostic,Tech & engineering,Progressive,"Seattle, WA",Emily Chen is a 28-year-old software developer...,384,1,Claude-3.5-s,Max N


# Binary Demographic Analysis

This analysis treats each demographic value as an independent binary variable and performs statistical tests to compare proportions between populations.

## Approach:
1. **Transform categorical demographics into binary variables** - Each demographic value becomes a 0/1 variable
2. **Combine repetitions** - Concatenate all data from repetitions for each population
3. **Statistical testing** - Use a z-test for proportions to compare binary outcomes between populations
4. **Handle edge cases** - Manage cases with zero variance appropriately

This approach provides clear, interpretable results for each demographic characteristic.

In [4]:
# Demographic attributes covered by the binary analysis.
demographic_columns = [
    'gender',
    'political_orientation',
    'race',
    'religious_belief',
    'sexual_orientation',
]

# Working copy of the population, limited to repetitions 1-5.
# The frame keeps its historical `ttest_df` name although the notebook text
# describes a z-test for proportions.
# Optional: to analyse a subset of models, additionally filter on `model`,
# e.g. ['claude35sonnet', 'llama323B', 'llama3170B'].
# No population mapping is applied here; the frame already carries a
# `population_print` label column (used in the summary below).
ttest_df = population_df.copy()
within_five_repetitions = ttest_df['repetitions'] <= 5
ttest_df = ttest_df[within_five_repetitions]

# Fold the Buddhist and Hinduist labels into 'Others'.
# NOTE(review): with regex=True the keys are regular-expression patterns, so
# any value merely *containing* one of them has that part substituted too —
# confirm this is intended rather than an exact-value mapping.
minor_beliefs_to_others = {'Buddhist': 'Others', 'Hinduist': 'Others'}
ttest_df['religious_belief'] = ttest_df['religious_belief'].replace(
    minor_beliefs_to_others, regex=True)

# Row count per (model, population, display label) as a sanity check.
summary_keys = ['model', 'population', 'population_print']
ttest_df.groupby(summary_keys, as_index=False).size()

Unnamed: 0,model,population,population_print,size
0,claude35sonnet,borderline_maxN_claude35sonnet,Max N,4130
1,claude35sonnet,borderline_maxP_claude35sonnet,Max P,4130
2,claude35sonnet,generated_claude35sonnet_spain826,Base,4130
3,gpt35,borderline_maxN_gpt35,Max N,4130
4,gpt35,borderline_maxP_gpt35,Max P,4130
5,gpt35,generated_gpt35_spain826,Base,4130
6,gpt4o,borderline_maxN_gpt4o,Max N,4130
7,gpt4o,borderline_maxP_gpt4o,Max P,4130
8,gpt4o,generated_gpt4o_spain826,Base,4130
9,llama3170B,borderline_maxN_llama3170B,Max N,4130


# Tables 1-3: Demographics by Model and Condition

This section generates the demographic tables (Tables 1-3) for the paper using the enhanced `evaluations.table_demographics` package.

## Statistical Approach: Binary Demographic Analysis

The package now uses a **binary demographic analysis** approach:

1. **Binary variables**: Each demographic value is treated as an independent binary variable (0/1)
2. **Combined repetitions**: All repetitions are concatenated for each population
3. **Z-test for proportions**: Statistical testing uses a z-test for proportions (not a t-test for means)
4. **Edge cases handled**: Zero variance, empty groups, and perfect separation are managed appropriately

## Significance Markers:
- `*` = p < 0.05
- `†` = p < 0.01  
- `‡` = p < 0.001

All comparisons are against the **Base** condition for each model.

In [5]:
# Prepare the frame the paper tables are built from: a copy of the population
# restricted to repetitions <= 5, with the religious-belief labels cleaned.
demo_df = population_df.copy()
repetition_mask = demo_df['repetitions'] <= 5
demo_df = demo_df[repetition_mask]

# Fold Buddhist and Hinduist into 'Others' for consistency with the paper.
# NOTE(review): regex=True treats the keys as patterns, so values that only
# contain one of them are partially rewritten as well — confirm an exact-value
# mapping is not what is wanted.
belief_relabelling = {'Buddhist': 'Others', 'Hinduist': 'Others'}
demo_df['religious_belief'] = demo_df['religious_belief'].replace(
    belief_relabelling, regex=True
)

# Summary of what goes into the tables: size plus the sorted display labels.
models_present = sorted(demo_df['model_print'].unique())
conditions_present = sorted(demo_df['population_print'].unique())
print(f"Data prepared: {len(demo_df)} personas")
print(f"Models: {models_present}")
print(f"Conditions: {conditions_present}")

Data prepared: 61950 personas
Models: ['Claude-3.5-s', 'GPT-3.5', 'GPT-4o', 'Llama3.1-70B', 'Llama3.2-3B']
Conditions: ['Base', 'Max N', 'Max P']


In [6]:
# Table 1: GPT-4o and GPT-3.5
# Title framed by 80-character rules, emitted with a single print call.
print("=" * 80, "TABLE 1: GPT-4o and GPT-3.5 Demographics", "=" * 80, sep="\n")

# Build the table from the prepared frame. The labels passed in `models` match
# the `model_print` values listed by the data-preparation cell
# (presumably the column create_paper_table selects on — confirm).
TABLE1 = table_demographics.create_paper_table(demo_df, models=["GPT-4o", "GPT-3.5"])

# Report the table's shape, then render it with the notebook's rich display.
print(f"\nTable 1 shape: {TABLE1.shape}", "\nTable 1:", sep="\n")
display(TABLE1)

TABLE 1: GPT-4o and GPT-3.5 Demographics

Table 1 shape: (18, 6)

Table 1:

Table 1 shape: (18, 6)

Table 1:


Unnamed: 0_level_0,model,GPT-4o,GPT-4o,GPT-4o,GPT-3.5,GPT-3.5,GPT-3.5
Unnamed: 0_level_1,condition,Base,MaxN,MaxP,Base,MaxN,MaxP
category,subcategory,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Gender,Female,25.71 ± 1.37,29.54 ± 2.07‡,4.67 ± 0.79‡,90.82 ± 2.13,90.94 ± 1.21,88.74 ± 1.02†
Gender,Male,44.75 ± 2.42,30.46 ± 2.33‡,6.27 ± 0.89‡,9.03 ± 2.29,8.86 ± 1.14,10.94 ± 1.00†
Gender,Non-bin.,29.18 ± 1.76,39.71 ± 1.27‡,88.77 ± 0.99‡,0.15 ± 0.22,0.19 ± 0.14,0.31 ± 0.18
Gender,Other,0.36 ± 0.21,0.29 ± 0.22,0.29 ± 0.20,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00
Pol. Or.,Centre,64.50 ± 1.43,51.45 ± 1.47‡,0.36 ± 0.17‡,75.08 ± 0.89,72.98 ± 1.67*,18.23 ± 1.13‡
Pol. Or.,Con.,0.12 ± 0.09,0.02 ± 0.05,0.00 ± 0.00*,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00
Pol. Or.,Others,0.75 ± 0.13,0.41 ± 0.22*,1.94 ± 0.21‡,8.45 ± 1.68,8.47 ± 0.67,43.03 ± 1.40‡
Pol. Or.,Prog.,34.62 ± 1.44,48.11 ± 1.46‡,97.70 ± 0.19‡,16.46 ± 1.79,18.55 ± 1.56*,38.74 ± 1.95‡
Race,Asian,0.58 ± 0.29,0.63 ± 0.34,1.89 ± 0.52‡,3.10 ± 0.41,2.88 ± 0.40,2.30 ± 0.24*
Race,Black,0.05 ± 0.07,0.00 ± 0.00,0.29 ± 0.18†,0.97 ± 0.33,0.87 ± 0.16,1.16 ± 0.55


In [7]:
# Table 2: Llama Models (3.2-3B and 3.1-70B)
# Title framed by 80-character rules, emitted with a single print call.
print("=" * 80, "TABLE 2: Llama3.2-3B and Llama3.1-70B Demographics", "=" * 80, sep="\n")

# Build the table from the prepared frame. The labels passed in `models` match
# the `model_print` values listed by the data-preparation cell
# (presumably the column create_paper_table selects on — confirm).
TABLE2 = table_demographics.create_paper_table(demo_df, models=["Llama3.2-3B", "Llama3.1-70B"])

# Report the table's shape, then render it with the notebook's rich display.
print(f"\nTable 2 shape: {TABLE2.shape}", "\nTable 2:", sep="\n")
display(TABLE2)

TABLE 2: Llama3.2-3B and Llama3.1-70B Demographics

Table 2 shape: (18, 6)

Table 2:

Table 2 shape: (18, 6)

Table 2:


Unnamed: 0_level_0,model,Llama3.2-3B,Llama3.2-3B,Llama3.2-3B,Llama3.1-70B,Llama3.1-70B,Llama3.1-70B
Unnamed: 0_level_1,condition,Base,MaxN,MaxP,Base,MaxN,MaxP
category,subcategory,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Gender,Female,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,22.03 ± 2.56,30.07 ± 0.40‡,5.04 ± 0.90‡
Gender,Male,100.00 ± 0.00,100.00 ± 0.00,100.00 ± 0.00,77.94 ± 2.56,69.81 ± 0.28‡,94.33 ± 0.85‡
Gender,Non-bin.,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.02 ± 0.05,0.02 ± 0.05,0.39 ± 0.16‡
Gender,Other,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.10 ± 0.10*,0.24 ± 0.23†
Pol. Or.,Centre,1.94 ± 0.33,2.15 ± 0.68,0.00 ± 0.00‡,10.41 ± 0.65,6.20 ± 0.61‡,0.00 ± 0.00‡
Pol. Or.,Con.,32.57 ± 1.68,15.35 ± 1.03‡,0.00 ± 0.00‡,42.93 ± 1.23,42.15 ± 0.75,0.00 ± 0.00‡
Pol. Or.,Others,0.12 ± 0.12,0.07 ± 0.11,1.33 ± 0.52‡,9.47 ± 0.56,1.21 ± 0.35‡,30.12 ± 1.23‡
Pol. Or.,Prog.,65.38 ± 1.57,82.42 ± 1.13‡,98.67 ± 0.52‡,37.19 ± 1.69,50.44 ± 0.67‡,69.88 ± 1.23‡
Race,Asian,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.90 ± 0.80,0.36 ± 0.19†,0.00 ± 0.00‡
Race,Black,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00


In [8]:
# Table 3: Claude-3.5-Sonnet
# Title framed by 80-character rules, emitted with a single print call.
print("=" * 80, "TABLE 3: Claude-3.5-s Demographics", "=" * 80, sep="\n")

# Build the table from the prepared frame for a single model. The label matches
# the `model_print` value listed by the data-preparation cell
# (presumably the column create_paper_table selects on — confirm).
TABLE3 = table_demographics.create_paper_table(demo_df, models=["Claude-3.5-s"])

# Report the table's shape, then render it with the notebook's rich display.
print(f"\nTable 3 shape: {TABLE3.shape}", "\nTable 3:", sep="\n")
display(TABLE3)

TABLE 3: Claude-3.5-s Demographics

Table 3 shape: (18, 3)

Table 3:

Table 3 shape: (18, 3)

Table 3:


Unnamed: 0_level_0,model,Claude-3.5-s,Claude-3.5-s,Claude-3.5-s
Unnamed: 0_level_1,condition,Base,MaxN,MaxP
category,subcategory,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Gender,Female,87.85 ± 1.08,98.31 ± 0.17‡,0.82 ± 0.10‡
Gender,Male,8.93 ± 1.19,0.29 ± 0.11‡,2.52 ± 0.46‡
Gender,Non-bin.,3.22 ± 0.18,1.40 ± 0.18‡,96.66 ± 0.48‡
Gender,Other,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00
Pol. Or.,Centre,90.70 ± 1.24,79.18 ± 1.51‡,0.00 ± 0.00‡
Pol. Or.,Con.,0.56 ± 0.20,0.00 ± 0.00‡,0.00 ± 0.00‡
Pol. Or.,Others,0.07 ± 0.07,0.00 ± 0.00,24.29 ± 0.38‡
Pol. Or.,Prog.,8.67 ± 1.07,20.82 ± 1.51‡,75.71 ± 0.38‡
Race,Asian,34.82 ± 1.43,29.15 ± 1.58‡,11.72 ± 0.41‡
Race,Black,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00


## Export Tables

All tables can be exported to CSV for external table generation tools or LaTeX conversion.

In [9]:
# Export tables to CSV (optional)
# Nothing is written by default; uncomment the lines below to export:

# TABLE1.to_csv('table1_gpt_demographics.csv')
# TABLE2.to_csv('table2_llama_demographics.csv')
# TABLE3.to_csv('table3_claude_demographics.csv')

# Closing summary. These messages are printed unconditionally — the cell does
# not itself check the tables.
print(
    "✅ All tables generated successfully!",
    "\nTables use binary demographic analysis with z-test for proportions",
    "Significance markers: * (p<0.05), † (p<0.01), ‡ (p<0.001)",
    sep="\n",
)

✅ All tables generated successfully!

Tables use binary demographic analysis with z-test for proportions
Significance markers: * (p<0.05), † (p<0.01), ‡ (p<0.001)
