### Setup

In this section, we load necessary libraries and define custom functions.

In [1]:
# install PPI library if needed 
# %pip install git+https://github.com/Michael-Howes/ppi_py.git
import pandas as pd
import numpy as np
import random
import statsmodels.api as sm
import sys
from scipy import stats
from ppi_py import ppi_ols_ci, classical_ols_ci, ppi_ols_pointestimate
import PythonFunctions as pf


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Read the gzip-compressed CSV that every later cell works from.
df = pd.read_csv("../Data/4_gpt4turbo_wp_20241118.csv.gz")

# Column names kept together as a reusable list.
# NOTE(review): `Covs` is not referenced in the cells visible here - confirm it
# is still needed further down.
Covs = [
    "PedPed", "Barrier", "CrossingSignal",
    "NumberOfCharacters", "DiffNumberOFCharacters",
    "LeftHand",
    "Man", "Woman", "Pregnant", "Stroller",
    "OldMan", "OldWoman", "Boy", "Girl",
    "Homeless", "LargeWoman", "LargeMan", "Criminal",
    "MaleExecutive", "FemaleExecutive",
    "FemaleAthlete", "MaleAthlete",
    "FemaleDoctor", "MaleDoctor",
    "Dog", "Cat",
    "Intervention",
]

# Show the interpreter version as the cell output (for reproducibility).
sys.version

'3.11.4 (v3.11.4:d2340ef257, Jun  6 2023, 19:15:51) [Clang 13.0.0 (clang-1300.0.29.30)]'

In [3]:
# Descriptive statistics of the loaded data; values are computed first and
# printed afterwards so each quantity has a name.
respondent_ids = df["UserID"]
scenario_ids = df["ResponseID"]

n_respondents = len(respondent_ids.unique())
n_scenarios = len(scenario_ids.unique())

# value_counts() tallies the rows per ResponseID; only the first distinct
# tally is reported here.
options_per_scenario = scenario_ids.value_counts().unique()[0]

# Rows per respondent are halved to convert rows into scenarios
# (assumes every scenario contributes exactly two rows).
rows_per_respondent = df.groupby("UserID")["ResponseID"].count()
avg_scenarios_per_respondent = round(rows_per_respondent.mean()/2, 0)

n_rows = df.shape[0]
n_missing_observed = df["Saved"].isna().sum()
n_missing_predicted = df["gpt4turbo_wp_Saved"].isna().sum()

print("Number of respondents: ", n_respondents)
print("Number of scenarios: ", n_scenarios)
print("Number of options per scenario: ", options_per_scenario)
print("Avg. number of scenarios per respondent: ", avg_scenarios_per_respondent)
print("Number of rows:", n_rows)
print("Number of NAs in observed dependent variable: ", n_missing_observed)
print("Number of NAs in predicted dependent variable with GPT4 Turbo: ", n_missing_predicted)



Number of respondents:  54695
Number of scenarios:  581981
Number of options per scenario:  2
Avg. number of scenarios per respondent:  11.0
Number of rows: 1163962
Number of NAs in observed dependent variable:  0
Number of NAs in predicted dependent variable with GPT4 Turbo:  0


### Reproduce AMCE from R functions and define 'ground truth' for simulation

Below we compute the Average Marginal Component Effect (AMCE) for an attribute of the moral dilemmas using weighted least squares. Awad et al. (2018) use R to estimate the AMCE for the conjoint experiment. First, we verify that we can reproduce their results with our Python code.

In [4]:
# Attributes for which an AMCE is estimated; the list order fixes the row
# order of the resulting table.
amce_attributes = [
    "Intervention",
    "Barrier",
    "Gender",
    "Fitness",
    "Social Status",
    "CrossingSignal",
    "Age",
    "Utilitarian",
    "Species",
]

# One pf.compute_amce call per attribute on the observed outcome "Saved",
# stacked into a single frame (original per-call indices are kept).
amce_human_subjects = pd.concat(
    [pf.compute_amce(df, x=attribute, y="Saved") for attribute in amce_attributes]
)
amce_human_subjects.round(3)

Unnamed: 0,x,y,beta,se,lower,upper
0,Intervention,Saved,0.081,0.002,0.078,0.084
0,Barrier,Saved,0.105,0.003,0.1,0.111
0,Gender,Saved,0.135,0.003,0.129,0.142
0,Fitness,Saved,0.176,0.004,0.169,0.183
0,Social Status,Saved,0.24,0.009,0.222,0.258
0,CrossingSignal,Saved,0.378,0.003,0.372,0.383
0,Age,Saved,0.508,0.003,0.501,0.514
0,Utilitarian,Saved,0.571,0.003,0.565,0.576
0,Species,Saved,0.684,0.003,0.679,0.69


The AMCE estimates above are the same as those calculated with the R functions by Awad et al. (2018); see the object `main.Saved` and the file `7_AmceParamsSimulationR.csv.tar` created in the R script `7_CalculateAMCE.R`. Hence, the Python functions used in this notebook (imported from `PythonFunctions`) give the same results as the functions defined in the original article.

We use these AMCEs computed on the human subjects as the 'ground truth' in our simulation.

In [None]:
# Load the AMCE parameters exported by the R script so they can be compared
# with the Python estimates above.
# NOTE(review): the other data files in this notebook are read from "../Data/";
# confirm this file really lives next to the notebook.
amce_params_r = pd.read_csv("7_AmceParamsSimulationR.csv.tar")
amce_params_r

### Estimate the PPI correlation

Below we estimate the PPI correlation with a large number of human subjects. 

In [5]:
# Sample sizes: n rows form the human subjects sample, the remaining N rows
# form the silicon sample.
n = 100000
N = len(df) - n

# Outcome column holding the LLM predictions
model = "gpt4turbo_wp_Saved"

# Draw the human subjects sample without replacement (fixed seed)
df_human = df.sample(n=n, replace=False, random_state=2024)

# Everything not drawn above is available for the silicon sample
df_remaining = df.drop(index=df_human.index)

# N equals the number of remaining rows, so this draws all of them
# (in shuffled order)
df_silicon = df_remaining.sample(n=N, replace=False, random_state=2024)

# Attributes to evaluate; the list order fixes the row order of `results`
ppi_attributes = [
    "Intervention",
    "Barrier",
    "Gender",
    "Fitness",
    "Social Status",
    "CrossingSignal",
    "Age",
    "Utilitarian",
    "Species",
]

# One pf.compute_amce_ppi call per attribute, stacked with a fresh index
results = pd.concat(
    [
        pf.compute_amce_ppi(df_human, df_silicon, x=attribute, y=model)
        for attribute in ppi_attributes
    ],
    ignore_index=True,
)

# Persist the estimates, then show them as the cell output
results.to_csv("../Data/7_rho.csv", index=False)
results

Model:  gpt4turbo_wp_Saved


Unnamed: 0,y,x,beta_ppi,beta_hum,beta_sil,beta_sil_sm,se_ppi,se_hum,se_sil,lower_ppi,upper_ppi,lower_hum,upper_hum,lower_sil,upper_sil,ppi_corr
0,gpt4turbo_wp_Saved,Intervention,0.087759,0.084008,0.085603,0.085603,0.007739,0.008268,0.001066,0.072582,0.102919,0.067804,0.100212,0.083515,0.087692,0.355049
3,gpt4turbo_wp_Saved,Fitness,0.183584,0.186509,0.022105,0.022105,0.018942,0.019882,0.002568,0.146481,0.220732,0.14754,0.225478,0.017072,0.027138,0.306078
1,gpt4turbo_wp_Saved,Barrier,0.126786,0.129834,0.486402,0.486402,0.011926,0.012477,0.001392,0.103413,0.150164,0.10538,0.154288,0.483673,0.489131,0.304316
2,gpt4turbo_wp_Saved,Gender,0.100004,0.094001,0.201834,0.201834,0.01868,0.019568,0.002422,0.063351,0.136575,0.055649,0.132353,0.197087,0.206581,0.300932
7,gpt4turbo_wp_Saved,Utilitarian,0.571068,0.55973,0.555638,0.555638,0.015416,0.01602,0.002076,0.540687,0.601115,0.528332,0.591127,0.55157,0.559707,0.271883
5,gpt4turbo_wp_Saved,CrossingSignal,0.373636,0.380131,0.656683,0.656683,0.013331,0.01382,0.001508,0.347508,0.399766,0.353045,0.407217,0.653727,0.659638,0.266378
6,gpt4turbo_wp_Saved,Age,0.504941,0.503169,0.181361,0.181361,0.017096,0.017296,0.002477,0.471379,0.538392,0.46927,0.537069,0.176506,0.186217,0.149836
4,gpt4turbo_wp_Saved,Social Status,0.207159,0.205618,0.041748,0.041748,0.05242,0.05249,0.006902,0.104528,0.310011,0.102739,0.308497,0.02822,0.055277,0.107469
8,gpt4turbo_wp_Saved,Species,0.657862,0.657614,0.844681,0.844681,0.014494,0.014516,0.001434,0.629457,0.686271,0.629164,0.686064,0.841871,0.847491,0.058404


### Simulation

Next, we vary the number of human subjects and silicon subjects in a simulation.

In [6]:
if __name__ == "__main__":

    from multiprocessing import Pool

    # Structural attributes of scenarios
    Xs_structural = ['Intervention', 'Barrier', 'CrossingSignal']

    # Attributes of characters
    Xs_characters = ['Gender', 'Fitness', 'Social Status', 'Age', 'Utilitarian', 'Species']

    # All attributes, structural ones first
    Xs = [*Xs_structural, *Xs_characters]

    # Number of worker processes (matches the nine attributes above)
    num_cores = 9

    # Run pf.loop_attribute once per attribute across the worker pool;
    # the pool is closed when the `with` block exits.
    with Pool(processes=num_cores) as pool:
        results = pool.map(pf.loop_attribute, Xs)

    # Stack the per-attribute results into a single DataFrame
    # (presumably each entry is a DataFrame - confirm in PythonFunctions)
    result = pd.concat(results, ignore_index=True)

Scenario attribute: Intervention
Scenario attribute: CrossingSignal
Scenario attribute: Gender
Scenario attribute: Utilitarian
Scenario attribute: Barrier
Scenario attribute: Age
Scenario attribute: Social Status
Scenario attribute: Fitness
Scenario attribute: Species


We benchmark the silicon subjects design and the mixed subjects design against a human subjects approach.

In [None]:
# Ground truth: AMCE point estimates from the entire human subjects sample,
# with `beta` renamed to `param`
benchmark = amce_human_subjects[['x', 'beta']].rename(columns={'beta': 'param'})

# Attach the ground-truth value to every simulation row of the same attribute
result_wb = result.merge(benchmark, on='x', how='left')

# For each design, flag (1/0) whether the ground truth lies inside that
# design's confidence interval (bounds inclusive):
#   ppi = mixed subjects, sil = silicon subjects, hum = human subjects
for design in ('ppi', 'sil', 'hum'):
    lower = result_wb[f'lower_{design}']
    upper = result_wb[f'upper_{design}']
    truth = result_wb['param']
    result_wb[f'coverage_{design}'] = ((lower <= truth) & (truth <= upper)).astype(int)

result_wb

NameError: name 'result' is not defined

In [None]:
# Columns to average across simulation repetitions.
# NOTE(review): the name `vars` shadows Python's builtin `vars()`; it is kept
# in case later cells refer to it.
vars = [
    'beta_ppi', 'se_ppi', 'lower_ppi', 'upper_ppi', 'coverage_ppi', 'ppi_corr',
    'beta_sil', 'se_sil', 'lower_sil', 'upper_sil', 'coverage_sil',
    'beta_hum', 'se_hum', 'lower_hum', 'upper_hum', 'coverage_hum',
]

# Average within each combination of n, N, outcome column (y), attribute (x)
# and ground-truth value (param)
group_keys = ['n', 'N', 'y', 'x', 'param']
grouped = result_wb.groupby(group_keys)[vars]
summ = grouped.mean().reset_index()

designs = ('ppi', 'sil', 'hum')

# Bias: mean estimate minus ground truth
for design in designs:
    summ[f'bias_{design}'] = summ[f'beta_{design}'] - summ['param']

# RMSE: combines the squared bias with the squared mean standard error
for design in designs:
    summ[f'rmse_{design}'] = np.sqrt(summ[f'bias_{design}']**2 + summ[f'se_{design}']**2)

# Save averaged simulation results to compressed csv file
summ.to_csv("../Data/7_ResultsPPI.csv.gz", compression="gzip", index=False)
summ

NameError: name 'result_wb' is not defined