In [6]:
# install PPI library if needed 
# %pip install git+https://github.com/Michael-Howes/ppi_py.git
import pandas as pd
import numpy as np
import random
import statsmodels.api as sm
import sys
from scipy import stats
from ppi_py import ppi_ols_ci, classical_ols_ci, ppi_ols_pointestimate
import PythonFunctions as pf


In [7]:
# Read the gzip-compressed CSV, parsing both ID columns as pandas strings
df = pd.read_csv(
    "../Data/3_gpt4turbo_wp_20241118.csv.gz",
    dtype=dict(UserID='string', ResponseID='string'),
)

# Column names kept in `Covs` (presumably the model covariates; not referenced
# by the other cells visible here, so confirm their use against PythonFunctions)
Covs = [
    'PedPed', 'Barrier', 'CrossingSignal',
    'NumberOfCharacters', 'DiffNumberOFCharacters', 'LeftHand',
    'Man', 'Woman', 'Pregnant', 'Stroller',
    'OldMan', 'OldWoman', 'Boy', 'Girl',
    'Homeless', 'LargeWoman', 'LargeMan', 'Criminal',
    'MaleExecutive', 'FemaleExecutive',
    'FemaleAthlete', 'MaleAthlete',
    'FemaleDoctor', 'MaleDoctor',
    'Dog', 'Cat',
    'Intervention',
]

# Last expression of the cell: show the Python version of the running kernel
sys.version

'3.11.4 (v3.11.4:d2340ef257, Jun  6 2023, 19:15:51) [Clang 13.0.0 (clang-1300.0.29.30)]'

In [8]:
# Descriptive counts for the loaded data, printed as one "label value" line each.
# The per-respondent average divides the row count by 2 because every scenario
# (ResponseID) contributes two rows, one per option.
summary_lines = [
    ("Number of respondents: ", len(df["UserID"].unique())),
    ("Number of scenarios: ", len(df["ResponseID"].unique())),
    ("Number of options per scenario: ", df["ResponseID"].value_counts().unique()[0]),
    ("Avg. number of scenarios per respondent: ",
     round(df.groupby("UserID")["ResponseID"].count().mean()/2, 1)),
    ("Number of rows:", df.shape[0]),
    ("Number of NAs in observed dependent variable: ", df["Saved"].isna().sum()),
    ("Number of NAs in predicted dependent variable with GPT4 Turbo: ",
     df["gpt4turbo_wp_Saved"].isna().sum()),
]

for label, value in summary_lines:
    print(label, value)

Number of respondents:  55890
Number of scenarios:  581981
Number of options per scenario:  2
Avg. number of scenarios per respondent:  10.4
Number of rows: 1163962
Number of NAs in observed dependent variable:  0
Number of NAs in predicted dependent variable with GPT4 Turbo:  0


### Reproduce AMCE from R functions and define 'ground truth' for simulation

Below we compute the Average Marginal Component Effect (AMCE) for each attribute of the moral dilemmas using weighted least squares. Awad et al. (2018) use R to estimate the AMCEs for the conjoint experiment. First, we verify that we can reproduce their results with our Python code. 

In [9]:
# AMCE of every attribute on the observed human decisions ("Saved"),
# stacked in the order the attributes are listed below
amce_human_subjects = pd.concat([
    pf.compute_amce(df, x=attribute, y="Saved")
    for attribute in (
        "Intervention",
        "Barrier",
        "Gender",
        "Fitness",
        "Social Status",
        "CrossingSignal",
        "Age",
        "Utilitarian",
        "Species",
    )
])

# Display the estimates rounded to three decimals
amce_human_subjects.round(3)

Unnamed: 0,x,y,beta,se,lower,upper
0,Intervention,Saved,0.081,0.002,0.078,0.084
0,Barrier,Saved,0.105,0.003,0.1,0.111
0,Gender,Saved,0.135,0.003,0.129,0.142
0,Fitness,Saved,0.176,0.004,0.169,0.183
0,Social Status,Saved,0.24,0.009,0.222,0.258
0,CrossingSignal,Saved,0.378,0.003,0.372,0.383
0,Age,Saved,0.508,0.003,0.501,0.514
0,Utilitarian,Saved,0.571,0.003,0.565,0.576
0,Species,Saved,0.684,0.003,0.679,0.69


The AMCE estimates above are the same as those calculated with the R functions by Awad et al. (2018); see the object `main.Saved` and the file `4_AmceParamsSimulationR.csv.tar` created by the R script `4_ComputeGroundTruthAMCE.R`. Hence, the Python functions used in this notebook (imported from `PythonFunctions`) give the same results as the functions from the original article. 

We use these AMCEs computed on the human subjects as the 'ground truth' in our simulation.

### Estimate the PPI correlation

Below we estimate the PPI correlation with a large number of human subjects. 

In [10]:
# Sample sizes: n decisions go to the human subjects sample, all others to the silicon sample
n = 1_000_000
N = len(df) - n

# Human subjects sample: n rows drawn without replacement (fixed seed)
df_human = df.sample(n=n, replace=False, random_state=2024)

# Rows that were not drawn for the human subjects sample
df_remaining = df.drop(index=df_human.index)

# Silicon sample: N rows from the remainder (N equals its length, so every remaining row is used)
df_silicon = df_remaining.sample(n=N, replace=False, random_state=2024)

# Column holding the LLM-predicted decisions
model = "gpt4turbo_wp_Saved"

# One pf.compute_amce_ppi call per attribute, stacked with a fresh 0..k-1 index
results = pd.concat(
    [
        pf.compute_amce_ppi(df_human, df_silicon, x=attribute, y=model)
        for attribute in (
            "Intervention",
            "Barrier",
            "Gender",
            "Fitness",
            "Social Status",
            "CrossingSignal",
            "Age",
            "Utilitarian",
            "Species",
        )
    ],
    ignore_index=True,
)

# Record the sample sizes next to the estimates
results = results.assign(n=n, N=N)

# Persist the table, then display it as the cell output
results.to_csv("../Data/5_rho.csv", index=False)
results

Unnamed: 0,y,x,beta_ppi,beta_hum,beta_sil,beta_sil_sm,se_ppi,se_hum,se_sil,lower_ppi,upper_ppi,lower_hum,upper_hum,lower_sil,upper_sil,ppi_corr,n,N
0,gpt4turbo_wp_Saved,Intervention,0.081381,0.081581,0.081985,0.081985,0.001163,0.001173,0.002817,0.079101,0.08366,0.079282,0.083881,0.076464,0.087507,0.352677,1000000,163962
1,gpt4turbo_wp_Saved,Barrier,0.10581,0.10593,0.484486,0.484486,0.001757,0.001768,0.003702,0.102366,0.109254,0.102464,0.109396,0.477231,0.491741,0.314138,1000000,163962
2,gpt4turbo_wp_Saved,Gender,0.13401,0.134204,0.197077,0.197077,0.002694,0.002707,0.006378,0.128728,0.139289,0.128897,0.13951,0.184577,0.209576,0.26276,1000000,163962
3,gpt4turbo_wp_Saved,Fitness,0.174661,0.174951,0.016098,0.016098,0.002823,0.002839,0.006787,0.16913,0.180196,0.169386,0.180516,0.002797,0.029399,0.28312,1000000,163962
4,gpt4turbo_wp_Saved,Social Status,0.248259,0.248827,0.026267,0.026267,0.007595,0.007623,0.018429,0.233386,0.263158,0.233887,0.263767,-0.009853,0.062387,0.229833,1000000,163962
5,gpt4turbo_wp_Saved,CrossingSignal,0.378954,0.378903,0.657856,0.657856,0.001954,0.001967,0.003958,0.375125,0.382783,0.375048,0.382759,0.6501,0.665613,0.308987,1000000,163962
6,gpt4turbo_wp_Saved,Age,0.506392,0.506359,0.182285,0.182285,0.00244,0.002446,0.006541,0.501611,0.511174,0.501564,0.511154,0.169465,0.195105,0.198339,1000000,163962
7,gpt4turbo_wp_Saved,Utilitarian,0.572248,0.572312,0.552874,0.552874,0.002256,0.002262,0.005516,0.567826,0.57667,0.567878,0.576746,0.542063,0.563686,0.191972,1000000,163962
8,gpt4turbo_wp_Saved,Species,0.684679,0.684791,0.83383,0.83383,0.002011,0.002011,0.003904,0.680738,0.68862,0.680849,0.688733,0.826178,0.841482,0.049155,1000000,163962


### Simulation

Next, we vary the number of human subjects and silicon subjects in a simulation.

In [None]:
if __name__ == "__main__":

    from multiprocessing import Pool

    # Number of worker processes in the pool
    num_cores = 9

    # Scenario-level (structural) attributes
    Xs_structural = ['Intervention', 'Barrier', 'CrossingSignal']

    # Character-level attributes
    Xs_characters = ['Gender', 'Fitness', 'Social Status', 'Age', 'Utilitarian', 'Species']

    # Full attribute list: structural attributes first, then character attributes
    Xs = [*Xs_structural, *Xs_characters]

    # Run pf.loop_attribute once per attribute across the worker processes;
    # pool.map returns the outputs in the same order as Xs.
    # NOTE: this rebinds `results`, which the PPI-correlation cell used for its table.
    with Pool(num_cores) as pool:
        results = pool.map(pf.loop_attribute, Xs)

    # Stack the per-attribute outputs into a single DataFrame with a fresh index
    result = pd.concat(results, ignore_index=True)

In [None]:
# Display the combined simulation results (pandas truncates the rendered table)
result

Unnamed: 0,y,x,beta_ppi,beta_hum,beta_sil,beta_sil_sm,se_ppi,se_hum,se_sil,lower_ppi,upper_ppi,lower_hum,upper_hum,lower_sil,upper_sil,ppi_corr,n,N,reps
0,gpt4turbo_wp_Saved,Intervention,0.081558,0.080698,0.095267,0.095267,0.011639,0.011708,0.035330,0.058741,0.104366,0.057751,0.103645,0.026021,0.164513,0.350062,10000,1000,500
1,gpt4turbo_wp_Saved,Intervention,0.079500,0.077143,0.143505,0.143505,0.011609,0.011675,0.034828,0.056757,0.102263,0.054261,0.100024,0.075244,0.211766,0.354711,10000,1000,500
2,gpt4turbo_wp_Saved,Intervention,0.092394,0.092582,0.095575,0.095575,0.011643,0.011695,0.036049,0.069574,0.115215,0.069661,0.115503,0.024920,0.166229,0.331427,10000,1000,500
3,gpt4turbo_wp_Saved,Intervention,0.094310,0.093358,0.132576,0.132576,0.011651,0.011709,0.035580,0.071483,0.117153,0.070409,0.116308,0.062841,0.202312,0.334296,10000,1000,500
4,gpt4turbo_wp_Saved,Intervention,0.083581,0.083143,0.092102,0.092102,0.011703,0.011763,0.036131,0.060646,0.106522,0.060089,0.106197,0.021286,0.162918,0.330125,10000,1000,500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103495,gpt4turbo_wp_Saved,Species,0.645817,0.645817,0.844310,0.844310,0.020414,0.020408,0.004772,0.605807,0.685827,0.605818,0.685816,0.834958,0.853663,0.035178,10000,100000,500
103496,gpt4turbo_wp_Saved,Species,0.675958,0.675831,0.851583,0.851583,0.020890,0.020906,0.004674,0.635012,0.716900,0.634857,0.716806,0.842422,0.860745,0.060573,10000,100000,500
103497,gpt4turbo_wp_Saved,Species,0.698456,0.697430,0.838158,0.838158,0.019553,0.019629,0.004984,0.660156,0.736802,0.658958,0.735902,0.828390,0.847927,0.088561,10000,100000,500
103498,gpt4turbo_wp_Saved,Species,0.671396,0.671557,0.842817,0.842817,0.020089,0.020085,0.004880,0.631993,0.710741,0.632190,0.710923,0.833251,0.852382,0.014647,10000,100000,500


We benchmark the silicon subjects design and the mixed subjects design against a human subjects approach.

In [None]:
# Ground truth: AMCE point estimates from the entire human subjects sample
benchmark = amce_human_subjects.loc[:, ['x', 'beta']].rename(columns={'beta': 'param'})

# Attach the ground-truth AMCE to every simulation row. `validate` makes the
# merge fail loudly if an attribute appears more than once in the benchmark,
# which would otherwise silently duplicate simulation rows.
result_wb = pd.merge(result, benchmark, on='x', how='left', validate='many_to_one')

# For each design, flag (1/0) whether the ground truth lies inside the
# confidence interval (bounds inclusive):
#   ppi = mixed subjects, sil = silicon subjects, hum = human subjects
for design in ('ppi', 'sil', 'hum'):
    result_wb[f'coverage_{design}'] = (
        result_wb['param']
        .between(result_wb[f'lower_{design}'], result_wb[f'upper_{design}'])
        .astype(int)
    )

result_wb

Unnamed: 0,y,x,beta_ppi,beta_hum,beta_sil,beta_sil_sm,se_ppi,se_hum,se_sil,lower_ppi,...,lower_sil,upper_sil,ppi_corr,n,N,reps,param,coverage_ppi,coverage_sil,coverage_hum
0,gpt4turbo_wp_Saved,Intervention,0.081558,0.080698,0.095267,0.095267,0.011639,0.011708,0.035330,0.058741,...,0.026021,0.164513,0.350062,10000,1000,500,0.081441,1,1,1
1,gpt4turbo_wp_Saved,Intervention,0.079500,0.077143,0.143505,0.143505,0.011609,0.011675,0.034828,0.056757,...,0.075244,0.211766,0.354711,10000,1000,500,0.081441,1,1,1
2,gpt4turbo_wp_Saved,Intervention,0.092394,0.092582,0.095575,0.095575,0.011643,0.011695,0.036049,0.069574,...,0.024920,0.166229,0.331427,10000,1000,500,0.081441,1,1,1
3,gpt4turbo_wp_Saved,Intervention,0.094310,0.093358,0.132576,0.132576,0.011651,0.011709,0.035580,0.071483,...,0.062841,0.202312,0.334296,10000,1000,500,0.081441,1,1,1
4,gpt4turbo_wp_Saved,Intervention,0.083581,0.083143,0.092102,0.092102,0.011703,0.011763,0.036131,0.060646,...,0.021286,0.162918,0.330125,10000,1000,500,0.081441,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103495,gpt4turbo_wp_Saved,Species,0.645817,0.645817,0.844310,0.844310,0.020414,0.020408,0.004772,0.605807,...,0.834958,0.853663,0.035178,10000,100000,500,0.684374,1,0,1
103496,gpt4turbo_wp_Saved,Species,0.675958,0.675831,0.851583,0.851583,0.020890,0.020906,0.004674,0.635012,...,0.842422,0.860745,0.060573,10000,100000,500,0.684374,1,0,1
103497,gpt4turbo_wp_Saved,Species,0.698456,0.697430,0.838158,0.838158,0.019553,0.019629,0.004984,0.660156,...,0.828390,0.847927,0.088561,10000,100000,500,0.684374,1,0,1
103498,gpt4turbo_wp_Saved,Species,0.671396,0.671557,0.842817,0.842817,0.020089,0.020085,0.004880,0.631993,...,0.833251,0.852382,0.014647,10000,100000,500,0.684374,1,0,1


In [None]:
# Columns averaged across repetitions. Named `metric_cols` rather than `vars`
# so the builtin vars() is not shadowed for the rest of the kernel session.
metric_cols = ['beta_ppi','se_ppi','lower_ppi','upper_ppi','coverage_ppi','ppi_corr',
               'beta_sil','se_sil','lower_sil','upper_sil','coverage_sil',
               'beta_hum','se_hum','lower_hum','upper_hum','coverage_hum']

# Group by n, N, LLM and attribute (param is constant within an attribute),
# then take the mean across repetitions
summ = result_wb.groupby(['n','N','y','x','param'])[metric_cols].mean().reset_index()

designs = ('ppi', 'sil', 'hum')

# Bias: mean point estimate minus the ground-truth AMCE
for design in designs:
    summ[f'bias_{design}'] = summ[f'beta_{design}'] - summ['param']

# RMSE: combines the squared bias with the squared *mean estimated* standard
# error (se_* here is the average of the per-repetition standard errors)
for design in designs:
    summ[f'rmse_{design}'] = np.sqrt(summ[f'bias_{design}']**2 + summ[f'se_{design}']**2)

# Save averaged simulation results to compressed csv file
summ.to_csv("../Data/5_ResultsPPI_coord1.csv.gz", compression="gzip", index=False)
summ

Unnamed: 0,n,N,y,x,param,beta_ppi,se_ppi,lower_ppi,upper_ppi,coverage_ppi,...,se_hum,lower_hum,upper_hum,coverage_hum,bias_ppi,bias_sil,bias_hum,rmse_ppi,rmse_sil,rmse_hum
0,10000,1000,gpt4turbo_wp_Saved,Age,0.507580,0.506198,0.024414,0.458262,0.553964,0.958,...,0.024451,0.458150,0.553998,0.956,-0.001382,-0.327599,-0.001506,0.024453,0.337974,0.024498
1,10000,1000,gpt4turbo_wp_Saved,Barrier,0.105468,0.105027,0.017630,0.070495,0.139604,0.962,...,0.017706,0.070306,0.139712,0.964,-0.000441,0.383107,-0.000459,0.017636,0.385977,0.017712
2,10000,1000,gpt4turbo_wp_Saved,CrossingSignal,0.377522,0.377852,0.019581,0.339466,0.416222,0.944,...,0.019664,0.339365,0.416445,0.946,0.000330,0.277591,0.000382,0.019584,0.282218,0.019667
3,10000,1000,gpt4turbo_wp_Saved,Fitness,0.175917,0.176557,0.028279,0.121116,0.231969,0.950,...,0.028376,0.120886,0.232118,0.950,0.000639,-0.154063,0.000585,0.028287,0.176368,0.028382
4,10000,1000,gpt4turbo_wp_Saved,Gender,0.135175,0.137005,0.026941,0.084174,0.189780,0.952,...,0.027019,0.083891,0.189805,0.952,0.001830,0.073237,0.001673,0.027003,0.109218,0.027071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202,10000,100000,gpt4turbo_wp_Saved,Gender,0.135175,0.134667,0.026182,0.083325,0.185958,0.950,...,0.027046,0.081548,0.187565,0.958,-0.000508,0.066049,-0.000619,0.026187,0.066556,0.027053
203,10000,100000,gpt4turbo_wp_Saved,Intervention,0.081441,0.081465,0.011043,0.059821,0.103111,0.950,...,0.011729,0.058367,0.104346,0.944,0.000024,0.003952,-0.000084,0.011043,0.005348,0.011730
204,10000,100000,gpt4turbo_wp_Saved,Social Status,0.240005,0.241445,0.073825,0.096346,0.385735,0.934,...,0.075894,0.090143,0.387643,0.930,0.001440,-0.197878,-0.001112,0.073839,0.199247,0.075902
205,10000,100000,gpt4turbo_wp_Saved,Species,0.684374,0.683909,0.020095,0.644499,0.723270,0.954,...,0.020131,0.644258,0.723170,0.952,-0.000465,0.160272,-0.000659,0.020101,0.160345,0.020142
