### Setup

In this section, we load necessary libraries and define custom functions.

In [1]:
# install PPI library if needed 
# %pip install git+https://github.com/Michael-Howes/ppi_py.git
import pandas as pd
import numpy as np
import random
import statsmodels.api as sm
import sys
from scipy import stats
from ppi_py import ppi_ols_ci, classical_ols_ci, ppi_ols_pointestimate
import PythonFunctions as pf


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv("../Data/4_gpt4turbo_wp_20241118.csv.gz")

Covs = ['PedPed', 'Barrier', 'CrossingSignal', 'NumberOfCharacters',
        'DiffNumberOFCharacters', 'LeftHand', 'Man', 'Woman', 'Pregnant',
        'Stroller', 'OldMan', 'OldWoman', 'Boy', 'Girl', 'Homeless',
        'LargeWoman', 'LargeMan', 'Criminal', 'MaleExecutive',
        'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete', 'FemaleDoctor',
        'MaleDoctor', 'Dog', 'Cat', 
        'Intervention'
        ]

sys.version

'3.11.4 (v3.11.4:d2340ef257, Jun  6 2023, 19:15:51) [Clang 13.0.0 (clang-1300.0.29.30)]'

In [3]:
print("Number of respondents: ", len(df["UserID"].unique()))
print("Number of scenarios: ", len(df["ResponseID"].unique()))
print("Number of options per scenario: ", df["ResponseID"].value_counts().unique()[0])
print("Avg. number of scenarios per respondent: ", round(df.groupby("UserID")["ResponseID"].count().mean()/2,0))
print("Number of rows:", df.shape[0])
print("Number of NAs in observed dependent variable: ", df["Saved"].isna().sum())
print("Number of NAs in predicted dependent variable with GPT4 Turbo: ", df["gpt4turbo_wp_Saved"].isna().sum())



Number of respondents:  54695
Number of scenarios:  581981
Number of options per scenario:  2
Avg. number of scenarios per respondent:  11.0
Number of rows: 1163962
Number of NAs in observed dependent variable:  0
Number of NAs in predicted dependent variable with GPT4 Turbo:  0


### Reproduce AMCE from R functions

Awad et al. (2018) use R to estimate the AMCE for the the conjoint experiment. In this section, we verify that we can obtain the results with our Python code.

Below we define a function to compute the Average Marginal Component Effect (AMCE) for an attribute of the moral dilemmas using  weighted least squares. 

First, we compute the AMCEs only with data from human subjects using the functions defined above.

In [4]:
amce_human_subjects = pd.concat([
    pf.compute_amce(df, x="Intervention", y="Saved"), 
    pf.compute_amce(df, x="Barrier", y="Saved"), 
    pf.compute_amce(df, x="Gender", y="Saved"), 
    pf.compute_amce(df, x="Fitness", y="Saved"), 
    pf.compute_amce(df, x="Social Status", y="Saved"), 
    pf.compute_amce(df, x="CrossingSignal",y="Saved"),
    pf.compute_amce(df, x="Age", y="Saved"),
    pf.compute_amce(df, x="Utilitarian", y="Saved"),
    pf.compute_amce(df, x="Species", y="Saved")
])      
amce_human_subjects.round(3)

Unnamed: 0,x,y,beta,se,lower,upper
0,Intervention,Saved,0.081,0.002,0.078,0.084
0,Barrier,Saved,0.105,0.003,0.1,0.111
0,Gender,Saved,0.135,0.003,0.129,0.142
0,Fitness,Saved,0.176,0.004,0.169,0.183
0,Social Status,Saved,0.24,0.009,0.222,0.258
0,CrossingSignal,Saved,0.378,0.003,0.372,0.383
0,Age,Saved,0.508,0.003,0.501,0.514
0,Utilitarian,Saved,0.571,0.003,0.565,0.576
0,Species,Saved,0.684,0.003,0.679,0.69


The AMCE estimates above are the same as those calculated with the functions by Awad et al. (2018), see object `main.Saved` in the R script `8_CalculateAMCE.R`. Hence, the custom functions defined in this notebook give the same results as the functions defined in the original article. 


|           label            |    dv  |  amce |   se  | conf.low | conf.high |
|----------------------------|--------|-------|-------|----------|-----------|
|   Intervention             | Saved  | 0.068 | 0.008 |    0.052 |     0.084 |
|        Barrier             | Saved  | 0.165 | 0.014 |    0.137 |     0.193 |
|            Law             | Saved  | 0.336 | 0.015 |    0.307 |     0.366 |
|         Gender             | Saved  | 0.160 | 0.017 |    0.127 |     0.193 |
|        Fitness             | Saved  | 0.121 | 0.018 |    0.085 |     0.156 |
|  Social Status             | Saved  | 0.171 | 0.047 |    0.079 |     0.263 |
|            Age             | Saved  | 0.482 | 0.016 |    0.451 |     0.513 |
| No. Characters             | Saved  | 0.573 | 0.014 |    0.545 |     0.602 |
|        Species             | Saved  | 0.646 | 0.015 |    0.617 |     0.675 |


In [21]:
ids = df["ResponseID"].unique()
n = 22000
N = len(ids) - n
random.seed(2024)

n_ids = random.sample(ids.tolist(), k=n)
N_ids = random.sample(list(set(ids) - set(n_ids)), k=N)

df_human = df[ df["ResponseID"].isin(n_ids) ]
df_silicon = df [ df["ResponseID"].isin(N_ids)]

models = ["gpt4turbo_wp_Saved"]

results2 = pd.DataFrame()
for model in models: 
    
    print("Model: ", model)
    results1 = pd.concat([
        pf.compute_amce_ppi(df_human, df_silicon, x="Intervention", y=model), 
        pf.compute_amce_ppi(df_human, df_silicon, x="Barrier", y=model), 
        pf.compute_amce_ppi(df_human, df_silicon, x="Gender", y=model), 
        pf.compute_amce_ppi(df_human, df_silicon, x="Fitness", y=model), 
        pf.compute_amce_ppi(df_human, df_silicon, x="Social Status", y=model), 
        pf.compute_amce_ppi(df_human, df_silicon, x="CrossingSignal",y=model),
        pf.compute_amce_ppi(df_human, df_silicon, x="Age", y=model),
        pf.compute_amce_ppi(df_human, df_silicon, x="Utilitarian", y=model),
        pf.compute_amce_ppi(df_human, df_silicon, x="Species", y=model)
    ],ignore_index=True)
    
    results2 = pd.concat([results2, results1],ignore_index=True).sort_values(by=["y","ppi_corr"], ascending=False)
    
results2.to_csv("../Data/7_rho.csv", index=False)
results2

Model:  gpt4turbo_wp_Saved


KeyboardInterrupt: 

Next, we vary the number of human subjects and silicon subjects in a simulation.

In [4]:
# sample size of human subjects
ns = [500,750]
ns= [10000]

# multiples of human subjects sample size
ks = list([0.1, 0.25, 0.5, 0.75, 1]) + list(np.arange(5, 105, 5))


# number of repetitions for combinations of n and N
reps = 300

# LLM predictions
Ys = models
Ys = ["gpt4turbo_wp_Saved"]

# structural attributes of scenarios
Xs_structural  = ['Intervention', 'Barrier','CrossingSignal']

# attributes of characters
Xs_characters = ['Gender','Fitness','Social Status','Age','Utilitarian','Species']

# all attributes
Xs = Xs_structural + Xs_characters

result = pd.DataFrame()

# loop models
for y in Ys:
  
  print(f"Iterating predictions from the model: {y}")
  
  # loop over predictors
  for x in Xs:
    print(f"    Predictor: {x}")

    # loop over sample sizes of human subjects
    for n in ns:
      print(f"        Human sample size: {n}")

      # sample size silicon subjects 
      Ns = [int(n * k) for n in ns for k in ks]
      
      # loop over sample sizes of silicon subjects
      for N in Ns:
        
        # loop over repetitions
        for r in range(reps):

          # sample dilemmas for human subjects sample
          df_human = df.sample(n=n, replace=False)
          
          # get remaining dilemma ids to sample from
          remaining_df = df.drop(df_human.index)

          # skip current iteration if target N is larger than population
          if (remaining_df.shape[0] < N):
             continue 
          
          # sample dilemmas for silicon subjects sample
          df_silicon = remaining_df.sample(n=N, replace=False)

          # compute acme on n human subjects and N silicon subjects
          ppi = pf.compute_amce_ppi(n_data=df_human, N_data=df_silicon, x=x, y=y)

          # store data
          ppi["n"] = n
          ppi["N"] = N
          
          result = pd.concat([result, ppi], ignore_index=True)    


NameError: name 'models' is not defined

In [9]:
  # structural attributes of scenarios
Xs_structural  = ['Intervention', 'Barrier','CrossingSignal']

# attributes of characters
Xs_characters = ['Gender','Fitness','Social Status','Age','Utilitarian','Species']

# all attributes
Xs = Xs_structural + Xs_characters

args_list = [
    (
        x, 
        ns,
        ks,
        reps,
        df,                                                                                 # temperature
    )
    for x in Xs
]

args_list

[('Intervention',
  [10000],
  [0.1,
   0.25,
   0.5,
   0.75,
   1,
   5,
   10,
   15,
   20,
   25,
   30,
   35,
   40,
   45,
   50,
   55,
   60,
   65,
   70,
   75,
   80,
   85,
   90,
   95,
   100],
  300,
                        ExtendedSessionID         ResponseID        UserID  \
  0         1123743758_4510867385040813.0  DujZ9eRFSejca6Zsd  4.510867e+15   
  1         1123743758_4510867385040813.0  DujZ9eRFSejca6Zsd  4.510867e+15   
  2         1179240030_4073176426704479.0  uRgKdpQYg7bAEb9px  4.073176e+15   
  3         1179240030_4073176426704479.0  uRgKdpQYg7bAEb9px  4.073176e+15   
  4         1138857654_8930084739195527.0  BrZGg5AWoDSMPisDZ  8.930085e+15   
  ...                                 ...                ...           ...   
  1163957   -1548953149_240512028209498.0  pABGnPTn9BWArjkkP  2.405120e+14   
  1163958  -1461496252_1484748286445035.0  zp8Ey6F6NDpxPhNG9  1.484748e+15   
  1163959  -1461496252_1484748286445035.0  zp8Ey6F6NDpxPhNG9  1.484748e+15   
  1

In [18]:
import PythonFunctions as pf

if __name__ == "__main__":
    
    from multiprocessing import Pool, cpu_count

    # Initialize multiprocessing Pool
    num_cores = 9

    # structural attributes of scenarios
    Xs_structural  = ['Intervention', 'Barrier','CrossingSignal']

    # attributes of characters
    Xs_characters = ['Gender','Fitness','Social Status','Age','Utilitarian','Species']

    # all attributes
    Xs = Xs_structural + Xs_characters

    with Pool(processes=num_cores) as pool:

        # Map the process_predictor function to all predictors
        results = pool.map(pf.loop_attribute, Xs)

    # Combine all results into a single DataFrame
    result = pd.concat(results, ignore_index=True)

Scenario attribute: Social Status
Scenario attribute: Fitness
Scenario attribute: CrossingSignal
Scenario attribute: Barrier
Scenario attribute: Species
Scenario attribute: Age
Scenario attribute: Gender
Scenario attribute: Utilitarian
Scenario attribute: Intervention
[0.1, 0.25, 0.5, 0.75, 1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100]
    Silicon sample size: 1000
[0.1, 0.25, 0.5, 0.75, 1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100]
    Silicon sample size: 1000
[0.1, 0.25, 0.5, 0.75, 1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100]
  

KeyboardInterrupt: 

We benchmark the silicon subjects design and the mixed subjects design against a human subjects approach.

In [10]:
# subset point estimates of AMCEs from the entire human subjects sample
benchmark = amce_human_subjects.loc[:, ['x', 'beta']].rename(columns={'beta': 'param'})

# merge benchmark with results from simulation
result_wb = pd.merge(result, benchmark, on='x', how='left')

# report if true value is within the confidence interval from the mixed subjects 
result_wb['coverage_ppi'] = (
    (result_wb['lower_ppi'] <= result_wb['param']) & 
    (result_wb['param'] <= result_wb['upper_ppi'])
).astype(int) 

# report if true value is within the confidence interval from the silicon subjects 
result_wb['coverage_sil'] = (
    (result_wb['lower_sil'] <= result_wb['param']) & 
    (result_wb['param'] <= result_wb['upper_sil'])
).astype(int) 

# report if true value is within the confidence interval from the silicon subjects 
result_wb['coverage_hum'] = (
    (result_wb['lower_hum'] <= result_wb['param']) & 
    (result_wb['param'] <= result_wb['upper_hum'])
).astype(int) 

result_wb

Unnamed: 0,y,x,beta_ppi,beta_hum,beta_sil,beta_sil_sm,se_ppi,se_hum,se_sil,lower_ppi,...,upper_hum,lower_sil,upper_sil,ppi_corr,n,N,param,coverage_ppi,coverage_sil,coverage_hum
0,gpt4turbo_wp_Saved,Intervention,0.056362,0.058181,0.023701,0.023701,0.011694,0.011751,0.036248,0.03342,...,0.081211,-0.047344,0.094746,0.329186,10000,1000,0.081441,0,1,0
1,gpt4turbo_wp_Saved,Intervention,0.093592,0.094369,0.054366,0.054366,0.011568,0.011713,0.022696,0.07091,...,0.117327,0.009882,0.098849,0.337466,10000,2500,0.081441,1,1,1
2,gpt4turbo_wp_Saved,Intervention,0.083498,0.085438,0.083859,0.083859,0.01156,0.011789,0.016125,0.060838,...,0.108545,0.052255,0.115464,0.345088,10000,5000,0.081441,1,1,1
3,gpt4turbo_wp_Saved,Intervention,0.065168,0.068544,0.067454,0.067454,0.011401,0.0117,0.013125,0.0428,...,0.091476,0.041729,0.093179,0.34478,10000,7500,0.081441,1,1,1
4,gpt4turbo_wp_Saved,Intervention,0.08659,0.088663,0.074917,0.074917,0.011363,0.011725,0.011442,0.064335,...,0.111644,0.052491,0.097342,0.3548,10000,10000,0.081441,1,1,1
5,gpt4turbo_wp_Saved,Barrier,0.131396,0.136238,0.352078,0.352078,0.017532,0.017568,0.0516,0.097168,...,0.17067,0.250944,0.453212,0.334362,10000,1000,0.105468,1,0,1
6,gpt4turbo_wp_Saved,Barrier,0.08894,0.089021,0.480483,0.480483,0.017478,0.017643,0.03023,0.054683,...,0.1236,0.421233,0.539733,0.328795,10000,2500,0.105468,1,0,1
7,gpt4turbo_wp_Saved,Barrier,0.093296,0.097938,0.461782,0.461782,0.01733,0.017578,0.021048,0.059346,...,0.132391,0.420529,0.503035,0.32497,10000,5000,0.105468,1,0,1
8,gpt4turbo_wp_Saved,Barrier,0.08608,0.086553,0.496148,0.496148,0.017477,0.018,0.01694,0.051825,...,0.121832,0.462945,0.529351,0.325681,10000,7500,0.105468,1,0,1
9,gpt4turbo_wp_Saved,Barrier,0.112497,0.112726,0.487116,0.487116,0.017161,0.017535,0.015085,0.078864,...,0.147094,0.45755,0.516683,0.306143,10000,10000,0.105468,1,0,1


In [11]:
# Group by n, N, and LLM then calculate mean across repetitions
vars = ['beta_ppi','se_ppi','lower_ppi','upper_ppi','coverage_ppi','ppi_corr',
        'beta_sil','se_sil','lower_sil','upper_sil','coverage_sil',
        'beta_hum','se_hum','lower_hum','upper_hum','coverage_hum']

summ = result_wb.groupby(['n','N','y','x','param'])[vars].mean().reset_index()

# Calculate bias columns
summ['repetitions'] = reps
summ['bias_ppi'] = summ['beta_ppi'] - summ['param']
summ['bias_sil'] = summ['beta_sil'] - summ['param']
summ['bias_hum'] = summ['beta_hum'] - summ['param']

summ['rmse_ppi'] = np.sqrt(summ['bias_ppi']**2 + summ['se_ppi']**2)
summ['rmse_sil'] = np.sqrt(summ['bias_sil']**2 + summ['se_sil']**2)
summ['rmse_hum'] = np.sqrt(summ['bias_hum']**2 + summ['se_hum']**2)

# Save averaged simulation results to compressed csv file
summ.to_csv("../Data/7_ResultsPPI.csv.gz", compression="gzip", index=False)
summ

Unnamed: 0,n,N,y,x,param,beta_ppi,se_ppi,lower_ppi,upper_ppi,coverage_ppi,...,lower_hum,upper_hum,coverage_hum,repetitions,bias_ppi,bias_sil,bias_hum,rmse_ppi,rmse_sil,rmse_hum
0,10000,1000,gpt4turbo_wp_Saved,Barrier,0.105468,0.131396,0.017532,0.097168,0.165892,1.0,...,0.101806,0.17067,1.0,1,0.025928,0.24661,0.03077,0.031299,0.25195,0.035432
1,10000,1000,gpt4turbo_wp_Saved,CrossingSignal,0.377522,0.386669,0.019303,0.348834,0.424502,1.0,...,0.348004,0.423975,1.0,1,0.009147,0.296518,0.008467,0.021361,0.300665,0.02115
2,10000,1000,gpt4turbo_wp_Saved,Intervention,0.081441,0.056362,0.011694,0.03342,0.079261,0.0,...,0.03515,0.081211,0.0,1,-0.025079,-0.05774,-0.02326,0.027672,0.068175,0.02606
3,10000,2500,gpt4turbo_wp_Saved,Barrier,0.105468,0.08894,0.017478,0.054683,0.123197,1.0,...,0.054442,0.1236,1.0,1,-0.016528,0.375015,-0.016447,0.024056,0.376231,0.02412
4,10000,2500,gpt4turbo_wp_Saved,CrossingSignal,0.377522,0.361246,0.01968,0.322696,0.399838,1.0,...,0.326535,0.40432,1.0,1,-0.016276,0.210159,-0.012094,0.025538,0.212905,0.023239
5,10000,2500,gpt4turbo_wp_Saved,Intervention,0.081441,0.093592,0.011568,0.07091,0.116255,1.0,...,0.071412,0.117327,1.0,1,0.012151,-0.027075,0.012929,0.016777,0.03533,0.017446
6,10000,5000,gpt4turbo_wp_Saved,Barrier,0.105468,0.093296,0.01733,0.059346,0.127279,1.0,...,0.063485,0.132391,1.0,1,-0.012172,0.356314,-0.00753,0.021178,0.356935,0.019123
7,10000,5000,gpt4turbo_wp_Saved,CrossingSignal,0.377522,0.369089,0.019578,0.330706,0.40745,1.0,...,0.327002,0.405288,1.0,1,-0.008434,0.320648,-0.011377,0.021317,0.321377,0.022985
8,10000,5000,gpt4turbo_wp_Saved,Intervention,0.081441,0.083498,0.01156,0.060838,0.106152,1.0,...,0.062331,0.108545,1.0,1,0.002057,0.002418,0.003997,0.011741,0.016305,0.012448
9,10000,7500,gpt4turbo_wp_Saved,Barrier,0.105468,0.08608,0.017477,0.051825,0.120335,1.0,...,0.051274,0.121832,1.0,1,-0.019388,0.39068,-0.018915,0.026103,0.391047,0.026111
