### Setup

In this section, we load necessary libraries and define custom functions.

In [249]:
# install PPI library if needed 
# %pip install git+https://github.com/Michael-Howes/ppi_py.git


In [250]:
import pandas as pd
import numpy as np
import random
import statsmodels.api as sm
import sys
from scipy import stats
from ppi_py import ppi_ols_ci, classical_ols_ci, ppi_ols_pointestimate

df = pd.read_csv("../Data/5_SurveySampleLLM.csv.gz")

Covs = ['PedPed', 'Barrier', 'CrossingSignal', 'NumberOfCharacters',
        'DiffNumberOFCharacters', 'LeftHand', 'Man', 'Woman', 'Pregnant',
        'Stroller', 'OldMan', 'OldWoman', 'Boy', 'Girl', 'Homeless',
        'LargeWoman', 'LargeMan', 'Criminal', 'MaleExecutive',
        'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete', 'FemaleDoctor',
        'MaleDoctor', 'Dog', 'Cat', 
        'Intervention'
        ]

sys.version

'3.11.4 (v3.11.4:d2340ef257, Jun  6 2023, 19:15:51) [Clang 13.0.0 (clang-1300.0.29.30)]'

In [251]:
print("Number of respondents: ", len(df["UserID"].unique()))
print("Number of decisions: ", len(df["ResponseID"].unique()))
print("Number of NAs in observed dependent variable: ", df["Saved"].isna().sum())
print("Number of NAs in predicted dependent variable with o1 Mini: ", df["o1mini_wp_Saved_1"].isna().sum())
print("Number of NAs in predicted dependent variable with o1 Preview: ", df["o1preview_wp_Saved_1"].isna().sum())
print("Number of NAs in predicted dependent variable with GPT4 Turbo: ", df["gpt4turbo_wp_Saved_1"].isna().sum())
print("Number of NAs in predicted dependent variable with GPT4o: ", df["gpt4o_wp_Saved_1"].isna().sum())
print("Number of NAs in predicted dependent variable with GPT3.5 Turbo: ", df["gpt35turbo0125_wp_Saved_1"].isna().sum())
print("Number of NAs in predicted dependent variable with Claude 3.5 Sonnet: ", df["claude35sonnet20241022_wp_Saved_1"].isna().sum())

Number of respondents:  2097
Number of decisions:  22315
Number of NAs in observed dependent variable:  0
Number of NAs in predicted dependent variable with o1 Mini:  35124
Number of NAs in predicted dependent variable with o1 Preview:  50548
Number of NAs in predicted dependent variable with GPT4 Turbo:  6086
Number of NAs in predicted dependent variable with GPT4o:  6092
Number of NAs in predicted dependent variable with GPT3.5 Turbo:  6088
Number of NAs in predicted dependent variable with Claude 3.5 Sonnet:  37540


### Reproduce AMCE from R functions

Awad et al. (2018) use R to estimate the AMCE for the the conjoint experiment. In this section, we verify that we can obtain the results with our Python code.

In [252]:
def CalcTheoreticalInt(r):
    # this function is applied to each row (r)
    if r["Intervention"]==0:
        if r["Barrier"]==0:
            if r["PedPed"]==1: p = 0.48
            else: p = 0.32
            
            if r["CrossingSignal"]==0:   p = p * 0.48
            elif r["CrossingSignal"]==1: p = p * 0.2
            else: p = p * 0.32
        else: p = 0.2

    else: 
        if r["Barrier"]==0:
            if r["PedPed"]==1: 
                p = 0.48
                if r["CrossingSignal"]==0: p = p * 0.48
                elif r["CrossingSignal"]==1: p = p * 0.32
                else: p = p * 0.2
            else: 
                p = 0.2
                if r["CrossingSignal"]==0: p = p * 0.48
                elif r["CrossingSignal"]==1: p = p * 0.2
                else: p = p * 0.32
        else: p = 0.32  
    
    return(p)  
        
def calcWeightsTheoretical(profiles):
    
    p = profiles.apply(CalcTheoreticalInt, axis=1)

    weight = 1/p 

    return(weight)         

In [None]:
# function from PPI to calculate stats
def _ols_get_stats(
    pointest,
    X,
    Y,
    Yhat,
    X_unlabeled,
    Yhat_unlabeled,
    w=None,
    w_unlabeled=None,
    use_unlabeled=True,
):
    """Computes the statistics needed for the OLS-based prediction-powered inference.

    Args:
        pointest (ndarray): A point estimate of the coefficients.
        X (ndarray): Covariates for the labeled data set.
        Y (ndarray): Labels for the labeled data set.
        Yhat (ndarray): Predictions for the labeled data set.
        X_unlabeled (ndarray): Covariates for the unlabeled data set.
        Yhat_unlabeled (ndarray): Predictions for the unlabeled data set.
        w (ndarray, optional): Sample weights for the labeled data set.
        w_unlabeled (ndarray, optional): Sample weights for the unlabeled data set.
        use_unlabeled (bool, optional): Whether to use the unlabeled data set.

    Returns:
        grads (ndarray): Gradient of the loss function with respect to the coefficients.
        grads_hat (ndarray): Gradient of the loss function with respect to the coefficients, evaluated using the labeled predictions.
        grads_hat_unlabeled (ndarray): Gradient of the loss function with respect to the coefficients, evaluated using the unlabeled predictions.
        inv_hessian (ndarray): Inverse Hessian of the loss function with respect to the coefficients.
    """
    n = Y.shape[0]
    N = Yhat_unlabeled.shape[0]
    d = X.shape[1]
    w = np.ones(n) if w is None else w / np.sum(w) * n
    w_unlabeled = (
        np.ones(N)
        if w_unlabeled is None
        else w_unlabeled / np.sum(w_unlabeled) * N
    )

    hessian = np.zeros((d, d))
    grads_hat_unlabeled = np.zeros(X_unlabeled.shape)
    if use_unlabeled:
        for i in range(N):
            hessian += (
                w_unlabeled[i]
                / (N + n)
                * np.outer(X_unlabeled[i], X_unlabeled[i])
            )
            grads_hat_unlabeled[i, :] = (
                w_unlabeled[i]
                * X_unlabeled[i, :]
                * (np.dot(X_unlabeled[i, :], pointest) - Yhat_unlabeled[i])
            )

    grads = np.zeros(X.shape)
    grads_hat = np.zeros(X.shape)
    for i in range(n):
        hessian += (
            w[i] / (N + n) * np.outer(X[i], X[i])
            if use_unlabeled
            else w[i] / n * np.outer(X[i], X[i])
        )
        grads[i, :] = w[i] * X[i, :] * (np.dot(X[i, :], pointest) - Y[i])
        grads_hat[i, :] = (
            w[i] * X[i, :] * (np.dot(X[i, :], pointest) - Yhat[i])
        )

    inv_hessian = np.linalg.inv(hessian).reshape(d, d)
    return grads, grads_hat, grads_hat_unlabeled, inv_hessian

def _power_analysis_stats(grads, grads_hat, inv_hessian):
    grads_ = grads - grads.mean(axis=0)
    grads_hat_ = grads_hat - grads_hat.mean(axis=0)
    cov = inv_hessian @ (grads_[:,None,:] * grads_hat_[:,:,None]).mean(axis=0) @ inv_hessian
    var = inv_hessian @ (grads_[:,None,:]*grads_[:,:,None]).mean(axis=0) @ inv_hessian
    var_hat = inv_hessian @ (grads_hat_[:,None,:]*grads_hat_[:,:,None]).mean(axis=0) @ inv_hessian
    rhos = np.diag(cov)/np.sqrt((np.diag(var)*np.diag(var_hat)))
    sigmas_sq = np.diag(var)
    return rhos, sigmas_sq

def _estimate_ppi_SE(n, N, rho_sq, var_Y):
    if N == np.inf:
        return np.sqrt(var_Y*(1-rho_sq)/n)
    if N == 0:
        return np.sqrt(var_Y/n)
    var_ppi = var_Y*(1-rho_sq*N/(n+N))/n
    return np.sqrt(var_ppi)

def _estimate_classical_SE(n, var_Y):
    return np.sqrt(var_Y/n)

Below we define a function to compute the Average Marginal Component Effect (AMCE) for an attribute of the moral dilemmas using  weighted least squares. 

In [254]:
def compute_amce(data, x, y, alpha=0.05):

    # specify regression for swerve or stay in lane
    if x=="Intervention":
        
        # calculate weights
        data.loc[:,"weights"] = calcWeightsTheoretical(data)
    
        # drop rows with missing values on dependent variable
        dd = data.dropna(subset=y)

        # if X=1 characters die if AV serves, if X=0 characters if AV stays
        X = dd["Intervention"]
        X = sm.add_constant(X)

        # define model with standard errors clustered on UserID
        model = sm.WLS(dd[y], X, weights=dd["weights"])
    

    # specify regression for relationship to vehicle
    if x=="Barrier":

        # consider only dilemmas without legality and only pedestrians vs passengers
        data_sub = data.loc[(data["CrossingSignal"]==0) & (data["PedPed"]==0), :].copy()

        # calculate weights
        data_sub.loc[:,"weights"] = calcWeightsTheoretical(data_sub)

        # drop rows with missing values on dependent variable
        dd = data_sub.dropna(subset=y)
        
        # if X=1 passengers die and if X=0 pedestrians die
        X = dd["Barrier"]

        # recode to estimate the preference for pedestrians over passengers 
        X = 1 - X
        X = sm.add_constant(X)

        # define model with standard errors clustered on UserID
        model = sm.WLS(dd[y], X, weights=dd["weights"])

    
    # specify regression for legality
    if x=="CrossingSignal": 
        
        # consider dilemmas with legality and only pedestrians vs pedestrians
        data_sub = data.loc[(data["CrossingSignal"]!=0) & (data["PedPed"]==1), :].copy()

        # calculate weights
        data_sub.loc[:,"weights"] = calcWeightsTheoretical(data_sub)

        # drop rows with missing values on dependent variable
        dd = data_sub.dropna(subset=y)

        # if X=1 pedestrians cross on a green light, if X=2 pedestrians cross on a red light 
        X = dd["CrossingSignal"]

        # create dummy variable to estimate preference for pedestrians that cross legally (1) vs legally (0)
        X = 2 - X 
        X = sm.add_constant(X)

        # define model with standard errors clustered on UserID
        model = sm.WLS(dd[y], X, weights=dd["weights"])

    

    # Specify regressions for the remaining six attributes
    if x=="Utilitarian":
        
        # consider dilemmas that compare 'More' versus 'Less' characters
        data_sub = data.loc[(data["ScenarioType"]=="Utilitarian") & (data["ScenarioTypeStrict"]=="Utilitarian"), :].copy()

        # calculate weights
        data_sub.loc[:,"weights"] = calcWeightsTheoretical(data_sub)

        # drop rows with missing values on dependent variable
        dd = data_sub.dropna(subset=y)
        dd = dd.rename(columns = {'AttributeLevel': 'Utilitarian'})

        # create dummy variable to estimate the preference for sparing more characters
        X = (dd.loc[:,"Utilitarian"]=="More").astype(int)
        X = sm.add_constant(X)

        # define model with standard errors clustered on UserID
        model = sm.WLS(dd[y], X, weights=dd["weights"])


    if x=="Species":
        
        # consider dilemmas that compare humans versus animals 
        data_sub = data.loc[(data["ScenarioType"]=="Species") & (data["ScenarioTypeStrict"]=="Species"), :].copy()

        # calculate weights
        data_sub.loc[:,"weights"] = calcWeightsTheoretical(data_sub)

        # drop rows with missing values on dependent variable
        dd = data_sub.dropna(subset=y)
        dd = dd.rename(columns = {'AttributeLevel': 'Species'})

        # create dummy variable to estimate the preference for sparing humans
        X = (dd.loc[:,"Species"]=="Hoomans").astype(int)
        X = sm.add_constant(X)

        # define model with standard errors clustered on UserID
        model = sm.WLS(dd[y], X, weights=dd["weights"])
    

    if x=="Gender":
        
        # consider dilemmas that compare women versus men
        data_sub = data.loc[(data["ScenarioType"]=="Gender") & (data["ScenarioTypeStrict"]=="Gender"), :].copy()

        # calculate weights
        data_sub.loc[:,"weights"] = calcWeightsTheoretical(data_sub)

        # drop rows with missing values on dependent variable
        dd = data_sub.dropna(subset=y)
        dd = dd.rename(columns = {'AttributeLevel': 'Gender'})

        # create dummy variable to estimate the preference for sparing women
        X = (dd.loc[:,"Gender"]=="Female").astype(int)
        X = sm.add_constant(X)

        # define model with standard errors clustered on UserID
        model = sm.WLS(dd[y], X, weights=dd["weights"])


    if x=="Fitness":
        
        # consider dilemmas that compare fit characters versus those that are not
        data_sub = data.loc[(data["ScenarioType"]=="Fitness") & (data["ScenarioTypeStrict"]=="Fitness"), :].copy()

        # calculate weights
        data_sub.loc[:,"weights"] = calcWeightsTheoretical(data_sub)

        # drop rows with missing values on dependent variable
        dd = data_sub.dropna(subset=y)
        dd = dd.rename(columns = {'AttributeLevel': 'Fitness'})

        # create dummy variable to estimate the preference for sparing fit characters
        X = (dd.loc[:,"Fitness"]=="Fit").astype(int)
        X = sm.add_constant(X)

        # define model with standard errors clustered on UserID
        model = sm.WLS(dd[y], X, weights=dd["weights"])


    if x=="Age":
        
        # consider dilemmas that compare younger versus older characters
        data_sub = data.loc[(data["ScenarioType"]=="Age") & (data["ScenarioTypeStrict"]=="Age"), :].copy()

        # calculate weights
        data_sub.loc[:,"weights"] = calcWeightsTheoretical(data_sub)

        # drop rows with missing values on dependent variable
        dd = data_sub.dropna(subset=y)
        dd = dd.rename(columns = {'AttributeLevel': 'Age'})

        # create dummy variable to estimate the preference for sparing younger characters
        X = (dd.loc[:,"Age"]=="Young").astype(int)
        X = sm.add_constant(X)

        # define model with standard errors clustered on UserID
        model = sm.WLS(dd[y], X, weights=dd["weights"])

    
    if x=="Social Status":
        
        # consider dilemmas that compare high status versus low status characters
        data_sub = data.loc[(data["ScenarioType"]=="Social Status") & (data["ScenarioTypeStrict"]=="Social Status"), :].copy()

        # calculate weights
        data_sub.loc[:,"weights"] = calcWeightsTheoretical(data_sub)

        # drop rows with missing values on dependent variable
        dd = data_sub.dropna(subset=y)
        dd = dd.rename(columns = {'AttributeLevel': 'Social Status'})

        # create dummy variable to estimate the preference for sparing high status characters
        X = (dd.loc[:,"Social Status"]=="High").astype(int)
        X = sm.add_constant(X)

        # define model with standard errors clustered on UserID
        model = sm.WLS(dd[y], X, weights=dd["weights"])



    # fit model and extract estimates
    fit = model.fit(cov_type = 'cluster', cov_kwds = {'groups': dd["UserID"]})
    coef = fit.params[x]
    se = fit.bse[x]
    ci = fit.conf_int(alpha=alpha).loc[x]

    # store results
    res = pd.DataFrame({
        'x': [x],
        'y': [y],
        'beta': [coef],
        'se': [se],
        'lower': [ci[0]],
        'upper': [ci[1]]
    })

    return(res)


First, we compute the AMCEs only with data from human subjects using the functions defined above.

In [255]:
amce_human_subjects = pd.concat([
    compute_amce(df, x="Intervention", y="Saved"), 
    compute_amce(df, x="Barrier", y="Saved"), 
    compute_amce(df, x="Gender", y="Saved"), 
    compute_amce(df, x="Fitness", y="Saved"), 
    compute_amce(df, x="Social Status", y="Saved"), 
    compute_amce(df, x="CrossingSignal",y="Saved"),
    compute_amce(df, x="Age", y="Saved"),
    compute_amce(df, x="Utilitarian", y="Saved"),
    compute_amce(df, x="Species", y="Saved")
])      
amce_human_subjects.round(3)

Unnamed: 0,x,y,beta,se,lower,upper
0,Intervention,Saved,0.065,0.008,0.049,0.082
0,Barrier,Saved,0.15,0.015,0.121,0.179
0,Gender,Saved,0.155,0.017,0.121,0.189
0,Fitness,Saved,0.13,0.019,0.094,0.167
0,Social Status,Saved,0.168,0.049,0.071,0.265
0,CrossingSignal,Saved,0.339,0.016,0.309,0.37
0,Age,Saved,0.494,0.016,0.463,0.526
0,Utilitarian,Saved,0.575,0.015,0.546,0.604
0,Species,Saved,0.632,0.016,0.601,0.664


The AMCE estimates above are the same as those calculated with the functions by Awad et al. (2018), see object `main.Saved` in the R script `8_CalculateAMCE.R`. Hence, the custom functions defined in this notebook give the same results as the functions defined in the original article. 


|           label            |    dv  |  amce |   se  | conf.low | conf.high |
|----------------------------|--------|-------|-------|----------|-----------|
|   Intervention             | Saved  | 0.068 | 0.008 |    0.052 |     0.084 |
|        Barrier             | Saved  | 0.165 | 0.014 |    0.137 |     0.193 |
|            Law             | Saved  | 0.336 | 0.015 |    0.307 |     0.366 |
|         Gender             | Saved  | 0.160 | 0.017 |    0.127 |     0.193 |
|        Fitness             | Saved  | 0.121 | 0.018 |    0.085 |     0.156 |
|  Social Status             | Saved  | 0.171 | 0.047 |    0.079 |     0.263 |
|            Age             | Saved  | 0.482 | 0.016 |    0.451 |     0.513 |
| No. Characters             | Saved  | 0.573 | 0.014 |    0.545 |     0.602 |
|        Species             | Saved  | 0.646 | 0.015 |    0.617 |     0.675 |


In [None]:
def compute_amce_ppi(n_data, N_data, x, y, alpha=0.05):

    # specify regression for swerve or stay in lane
    if x=="Intervention":
        
        # calculate weights
        n_data.loc[:,"weights"] = calcWeightsTheoretical(n_data)
        N_data.loc[:,"weights"] = calcWeightsTheoretical(N_data)
    
        # drop rows with missing values on dependent variable
        n_dd = n_data.dropna(subset=y)
        N_dd = N_data.dropna(subset=y)

        # if X=1 characters die if AV serves, if X=0 characters if AV stays
        n_X = n_dd["Intervention"]               
        N_X = N_dd["Intervention"]

        # add intercept
        n_X = np.column_stack((np.ones(n_X.shape[0]), n_X))
        N_X = np.column_stack((np.ones(N_X.shape[0]), N_X))

        # gold standard data
        n_Y_human   = n_dd["Saved"].to_numpy()    # observed outcomes
        n_Y_silicon = n_dd[y].to_numpy()          # predicted outcomes
        n_weights = n_dd["weights"].to_numpy()    # define weights

        # unlabeled data
        N_Y_silicon = N_dd[y].to_numpy()          # predicted outcomes
        N_weights = N_dd["weights"].to_numpy()    # define weights



    # specify regression for relationship to vehicle
    if x=="Barrier":

        # consider only dilemmas without legality and only pedestrians vs passengers
        n_data_sub = n_data.loc[(n_data["CrossingSignal"]==0) & (n_data["PedPed"]==0), :].copy()
        N_data_sub = N_data.loc[(N_data["CrossingSignal"]==0) & (N_data["PedPed"]==0), :].copy()

        # calculate weights
        n_data_sub.loc[:,"weights"] = calcWeightsTheoretical(n_data_sub)
        N_data_sub.loc[:,"weights"] = calcWeightsTheoretical(N_data_sub)

        # drop rows with missing values on dependent variable
        n_dd = n_data_sub.dropna(subset=y)
        N_dd = N_data_sub.dropna(subset=y)
        
        # if X=1 passengers die and if X=0 pedestrians die
        n_X = n_dd["Barrier"]
        N_X = N_dd["Barrier"]

        # recode to estimate the preference for pedestrians over passengers 
        n_X = 1 - n_X
        N_X = 1 - N_X

        # add intercept
        n_X = np.column_stack((np.ones(n_X.shape[0]), n_X))
        N_X = np.column_stack((np.ones(N_X.shape[0]), N_X))

        # gold standard data
        n_Y_human   = n_dd["Saved"].to_numpy()    # observed outcomes
        n_Y_silicon = n_dd[y].to_numpy()          # predicted outcomes
        n_weights = n_dd["weights"].to_numpy()    # define weights

        # unlabeled data
        N_Y_silicon = N_dd[y].to_numpy()          # predicted outcomes
        N_weights = N_dd["weights"].to_numpy()    # define weights

    

    # specify regression for legality
    if x=="CrossingSignal": 
        
        # consider dilemmas with legality and only pedestrians vs pedestrians
        n_data_sub = n_data.loc[(n_data["CrossingSignal"]!=0) & (n_data["PedPed"]==1), :].copy()
        N_data_sub = N_data.loc[(N_data["CrossingSignal"]!=0) & (N_data["PedPed"]==1), :].copy()

        # calculate weights
        n_data_sub.loc[:,"weights"] = calcWeightsTheoretical(n_data_sub)
        N_data_sub.loc[:,"weights"] = calcWeightsTheoretical(N_data_sub)

        # drop rows with missing values on dependent variable
        n_dd = n_data_sub.dropna(subset=y)
        N_dd = N_data_sub.dropna(subset=y)

        # if X=1 pedestrians cross on a green light, if X=2 pedestrians cross on a red light 
        n_X = n_dd["CrossingSignal"]
        N_X = N_dd["CrossingSignal"]

        # create dummy variable to estimate preference for pedestrians that cross legally (1) vs legally (0)
        n_X = 2 - n_X 
        N_X = 2 - N_X 

        # add intercept
        n_X = np.column_stack((np.ones(n_X.shape[0]), n_X))
        N_X = np.column_stack((np.ones(N_X.shape[0]), N_X))

        # gold standard data
        n_Y_human   = n_dd["Saved"].to_numpy()    # observed outcomes
        n_Y_silicon = n_dd[y].to_numpy()          # predicted outcomes
        n_weights = n_dd["weights"].to_numpy()    # define weights

        # unlabeled data
        N_Y_silicon = N_dd[y].to_numpy()          # predicted outcomes
        N_weights = N_dd["weights"].to_numpy()    # define weights
    


    # Specify regressions for the remaining six attributes
    if x=="Utilitarian":
        
        # consider dilemmas that compare 'More' versus 'Less' characters
        n_data_sub = n_data.loc[(n_data["ScenarioType"]=="Utilitarian") & (n_data["ScenarioTypeStrict"]=="Utilitarian"), :].copy()
        N_data_sub = N_data.loc[(N_data["ScenarioType"]=="Utilitarian") & (N_data["ScenarioTypeStrict"]=="Utilitarian"), :].copy()

        # calculate weights
        n_data_sub.loc[:,"weights"] = calcWeightsTheoretical(n_data_sub)
        N_data_sub.loc[:,"weights"] = calcWeightsTheoretical(N_data_sub)

        # drop rows with missing values on dependent variable
        n_dd = n_data_sub.dropna(subset=y)
        N_dd = N_data_sub.dropna(subset=y)
        
        # rename column to extract coefficient from result
        n_dd = n_dd.rename(columns = {'AttributeLevel': 'Utilitarian'})
        N_dd = N_dd.rename(columns = {'AttributeLevel': 'Utilitarian'})

        # create dummy variable to estimate the preference for sparing more characters
        n_X = (n_dd.loc[:,"Utilitarian"]=="More").astype(int)
        N_X = (N_dd.loc[:,"Utilitarian"]=="More").astype(int)

        # add intercept
        n_X = np.column_stack((np.ones(n_X.shape[0]), n_X))
        N_X = np.column_stack((np.ones(N_X.shape[0]), N_X))

        # gold standard data
        n_Y_human   = n_dd["Saved"].to_numpy()    # observed outcomes
        n_Y_silicon = n_dd[y].to_numpy()          # predicted outcomes
        n_weights = n_dd["weights"].to_numpy()    # define weights

        # unlabeled data
        N_Y_silicon = N_dd[y].to_numpy()          # predicted outcomes
        N_weights = N_dd["weights"].to_numpy()    # define weights



    if x=="Species":
        
        # consider dilemmas that compare humans versus animals 
        n_data_sub = n_data.loc[(n_data["ScenarioType"]=="Species") & (n_data["ScenarioTypeStrict"]=="Species"), :].copy()
        N_data_sub = N_data.loc[(N_data["ScenarioType"]=="Species") & (N_data["ScenarioTypeStrict"]=="Species"), :].copy()

        # calculate weights
        n_data_sub.loc[:,"weights"] = calcWeightsTheoretical(n_data_sub)
        N_data_sub.loc[:,"weights"] = calcWeightsTheoretical(N_data_sub)

        # drop rows with missing values on dependent variable
        n_dd = n_data_sub.dropna(subset=y)
        N_dd = N_data_sub.dropna(subset=y)

        # rename column to extract coefficient from result
        n_dd = n_dd.rename(columns = {'AttributeLevel': 'Species'})
        N_dd = N_dd.rename(columns = {'AttributeLevel': 'Species'})

        # create dummy variable to estimate the preference for sparing humans
        n_X = (n_dd.loc[:,"Species"]=="Hoomans").astype(int)
        N_X = (N_dd.loc[:,"Species"]=="Hoomans").astype(int)

        # add intercept
        n_X = np.column_stack((np.ones(n_X.shape[0]), n_X))
        N_X = np.column_stack((np.ones(N_X.shape[0]), N_X))

        # gold standard data
        n_Y_human   = n_dd["Saved"].to_numpy()    # observed outcomes
        n_Y_silicon = n_dd[y].to_numpy()          # predicted outcomes
        n_weights = n_dd["weights"].to_numpy()    # define weights

        # unlabeled data
        N_Y_silicon = N_dd[y].to_numpy()          # predicted outcomes
        N_weights = N_dd["weights"].to_numpy()    # define weights

    

    if x=="Gender":
        
        # consider dilemmas that compare women versus men
        n_data_sub = n_data.loc[(n_data["ScenarioType"]=="Gender") & (n_data["ScenarioTypeStrict"]=="Gender"), :].copy()
        N_data_sub = N_data.loc[(N_data["ScenarioType"]=="Gender") & (N_data["ScenarioTypeStrict"]=="Gender"), :].copy()

        # calculate weights
        n_data_sub.loc[:,"weights"] = calcWeightsTheoretical(n_data_sub)
        N_data_sub.loc[:,"weights"] = calcWeightsTheoretical(N_data_sub)

        # drop rows with missing values on dependent variable
        n_dd = n_data_sub.dropna(subset=y)
        N_dd = N_data_sub.dropna(subset=y)

        # rename column to extract coefficient from result
        n_dd = n_dd.rename(columns = {'AttributeLevel': 'Gender'})
        N_dd = N_dd.rename(columns = {'AttributeLevel': 'Gender'})

        # create dummy variable to estimate the preference for sparing women
        n_X = (n_dd.loc[:,"Gender"]=="Female").astype(int)
        N_X = (N_dd.loc[:,"Gender"]=="Female").astype(int)

        # add intercept
        n_X = np.column_stack((np.ones(n_X.shape[0]), n_X))
        N_X = np.column_stack((np.ones(N_X.shape[0]), N_X))

        # gold standard data
        n_Y_human   = n_dd["Saved"].to_numpy()    # observed outcomes
        n_Y_silicon = n_dd[y].to_numpy()          # predicted outcomes
        n_weights = n_dd["weights"].to_numpy()    # define weights

        # unlabeled data
        N_Y_silicon = N_dd[y].to_numpy()          # predicted outcomes
        N_weights = N_dd["weights"].to_numpy()    # define weights



    if x=="Fitness":
        
        # consider dilemmas that compare fit characters versus those that are not
        n_data_sub = n_data.loc[(n_data["ScenarioType"]=="Fitness") & (n_data["ScenarioTypeStrict"]=="Fitness"), :].copy()
        N_data_sub = N_data.loc[(N_data["ScenarioType"]=="Fitness") & (N_data["ScenarioTypeStrict"]=="Fitness"), :].copy()

        # calculate weights
        n_data_sub.loc[:,"weights"] = calcWeightsTheoretical(n_data_sub)
        N_data_sub.loc[:,"weights"] = calcWeightsTheoretical(N_data_sub)

        # drop rows with missing values on dependent variable
        n_dd = n_data_sub.dropna(subset=y)
        N_dd = N_data_sub.dropna(subset=y)

        # rename column to extract coefficient from result
        n_dd = n_dd.rename(columns = {'AttributeLevel': 'Fitness'})
        N_dd = N_dd.rename(columns = {'AttributeLevel': 'Fitness'})

        # create dummy variable to estimate the preference for sparing fit characters
        n_X = (n_dd.loc[:,"Fitness"]=="Fit").astype(int)
        N_X = (N_dd.loc[:,"Fitness"]=="Fit").astype(int)

        # add intercept
        n_X = np.column_stack((np.ones(n_X.shape[0]), n_X))
        N_X = np.column_stack((np.ones(N_X.shape[0]), N_X))

        # gold standard data
        n_Y_human   = n_dd["Saved"].to_numpy()    # observed outcomes
        n_Y_silicon = n_dd[y].to_numpy()          # predicted outcomes
        n_weights = n_dd["weights"].to_numpy()    # define weights

        # unlabeled data
        N_Y_silicon = N_dd[y].to_numpy()          # predicted outcomes
        N_weights = N_dd["weights"].to_numpy()    # define weights



    if x=="Age":
        
        # consider dilemmas that compare younger versus older characters
        n_data_sub = n_data.loc[(n_data["ScenarioType"]=="Age") & (n_data["ScenarioTypeStrict"]=="Age"), :].copy()
        N_data_sub = N_data.loc[(N_data["ScenarioType"]=="Age") & (N_data["ScenarioTypeStrict"]=="Age"), :].copy()

        # calculate weights
        n_data_sub.loc[:,"weights"] = calcWeightsTheoretical(n_data_sub)
        N_data_sub.loc[:,"weights"] = calcWeightsTheoretical(N_data_sub)

        # drop rows with missing values on dependent variable
        n_dd = n_data_sub.dropna(subset=y)
        N_dd = N_data_sub.dropna(subset=y)

        # rename column to extract coefficient from result
        n_dd = n_dd.rename(columns = {'AttributeLevel': 'Age'})
        N_dd = N_dd.rename(columns = {'AttributeLevel': 'Age'})

        # create dummy variable to estimate the preference for sparing younger characters
        n_X = (n_dd.loc[:,"Age"]=="Young").astype(int)
        N_X = (N_dd.loc[:,"Age"]=="Young").astype(int)

        # add intercept
        n_X = np.column_stack((np.ones(n_X.shape[0]), n_X))
        N_X = np.column_stack((np.ones(N_X.shape[0]), N_X))

        # gold standard data
        n_Y_human   = n_dd["Saved"].to_numpy()    # observed outcomes
        n_Y_silicon = n_dd[y].to_numpy()          # predicted outcomes
        n_weights = n_dd["weights"].to_numpy()    # define weights

        # unlabeled data
        N_Y_silicon = N_dd[y].to_numpy()          # predicted outcomes
        N_weights = N_dd["weights"].to_numpy()    # define weights


    
    if x=="Social Status":
        
        # consider dilemmas that compare high status versus low status characters
        n_data_sub = n_data.loc[(n_data["ScenarioType"]=="Social Status") & (n_data["ScenarioTypeStrict"]=="Social Status"), :].copy()
        N_data_sub = N_data.loc[(N_data["ScenarioType"]=="Social Status") & (N_data["ScenarioTypeStrict"]=="Social Status"), :].copy()

        # calculate weights
        n_data_sub.loc[:,"weights"] = calcWeightsTheoretical(n_data_sub)
        N_data_sub.loc[:,"weights"] = calcWeightsTheoretical(N_data_sub)

        # drop rows with missing values on dependent variable
        n_dd = n_data_sub.dropna(subset=y)
        N_dd = N_data_sub.dropna(subset=y)

        # rename column to extract coefficient from result
        n_dd = n_dd.rename(columns = {'AttributeLevel': 'Social Status'})
        N_dd = N_dd.rename(columns = {'AttributeLevel': 'Social Status'})

        # create dummy variable to estimate the preference for sparing high status characters
        n_X = (n_dd.loc[:,"Social Status"]=="High").astype(int)
        N_X = (N_dd.loc[:,"Social Status"]=="High").astype(int)

        # add intercept
        n_X = np.column_stack((np.ones(n_X.shape[0]), n_X))
        N_X = np.column_stack((np.ones(N_X.shape[0]), N_X))

        # gold standard data
        n_Y_human   = n_dd.loc[:,"Saved"].to_numpy()    # observed outcomes
        n_Y_silicon = n_dd.loc[:,y].to_numpy()          # predicted outcomes
        n_weights = n_dd.loc[:,"weights"].to_numpy()    # define weights

        # unlabeled data
        N_Y_silicon = N_dd[y].to_numpy()                # predicted outcomes
        N_weights = N_dd.loc[:,"weights"].to_numpy()    # define weights


    # calculate point estimate
    beta_ppi = ppi_ols_pointestimate(X=n_X, Y=n_Y_human, Yhat=n_Y_silicon, 
                                     X_unlabeled=N_X, Yhat_unlabeled=N_Y_silicon, 
                                     w=n_weights, w_unlabeled=N_weights)
    
    # using ppi function to calculate point estimates (lambda=0)
    beta_hum = ppi_ols_pointestimate(X=n_X, Y=n_Y_human, Yhat=n_Y_silicon, 
                                     X_unlabeled=N_X, Yhat_unlabeled=N_Y_silicon, 
                                     w=n_weights, w_unlabeled=N_weights, 
                                     lam=0)
    
    beta_sil = ppi_ols_pointestimate(X=N_X, Y=N_Y_silicon, Yhat=N_Y_silicon, 
                                     X_unlabeled=N_X, Yhat_unlabeled=N_Y_silicon, 
                                     w=N_weights, w_unlabeled=N_weights, 
                                     lam=0)
    
    # using statsmodels to calculate point estimates (same results as with PPI)
    beta_hum_sm = sm.WLS(endog=n_Y_human, exog=n_X, weights=n_weights).fit().params[1]
    beta_sil_sm = sm.WLS(endog=N_Y_silicon, exog=N_X, weights=N_weights).fit().params[1]

    # calculate confidence intervals for PPI, human subjects, and silicon subjects
    lower_CI_ppi, upper_CI_ppi = ppi_ols_ci(X=n_X, Y=n_Y_human, Yhat=n_Y_silicon, 
                                            X_unlabeled=N_X, Yhat_unlabeled=N_Y_silicon, 
                                            w=n_weights, w_unlabeled=N_weights, alpha=alpha)
    
    lower_CI_hum, upper_CI_hum = classical_ols_ci(X=n_X, Y=n_Y_human, w=n_weights, alpha=alpha)

    lower_CI_sil, upper_CI_sil = classical_ols_ci(X=N_X, Y=N_Y_silicon, w=N_weights, alpha=alpha)


    # zscore for two tailed test
    z = stats.norm.ppf(0.975)
    
    # calculate standard errors for PPI, human subjects, and silicon subjects
    se_ppi = (upper_CI_ppi[1] - lower_CI_ppi[1]) / (2 * z)
    
    se_hum = (upper_CI_hum[1] - lower_CI_hum[1]) / (2 * z)

    se_sil = (upper_CI_sil[1] - lower_CI_sil[1]) / (2 * z)
    

    # calculate rho
    beta = sm.WLS(n_Y_human, n_X, weights=n_weights).fit().params

    grads, grads_hat, grads_hat_unlabeled, inv_hessian = _ols_get_stats(
        pointest=beta, 
        X=n_X,
        Y=n_Y_human,
        Yhat= n_Y_silicon,
        X_unlabeled=N_X,
        Yhat_unlabeled=N_Y_silicon,
        w=n_weights,
        w_unlabeled=N_weights,
        use_unlabeled=False)
    
    rho, var_y = _power_analysis_stats(grads, grads_hat, inv_hessian)

    # create and return the output DataFrame
    output_df = pd.DataFrame({
        "y": y,                              
        "x": x,                               # Predictor variable (scenario attribute)
        "beta_ppi": beta_ppi[1],              # PPI point estimate
        "beta_hum": beta_hum[1],              # Human subjects point estimate
        "beta_hum_sm": beta_hum_sm,           # Human subjects point estimate (statsmodels)
        "beta_sil": beta_sil[1],              # Silicon subjects point estimate
        "beta_sil_sm": beta_sil_sm,           # Silicon subjects point estimate (statsmodels)
        "se_ppi": se_ppi,                     # PPI standard error
        "se_hum": se_hum,                     # Human subjects standard error
        "se_sil": se_sil,                     # Silicon subjects standard error
        "lower_ppi": lower_CI_ppi[1],         # The lower bound of the PPI confidence interval
        "upper_ppi": upper_CI_ppi[1],         # The upper bound of the PPI confidence interval
        "lower_hum": lower_CI_hum[1],         # The lower bound of the human subjects confidence interval
        "upper_hum": upper_CI_hum[1],         # The upper bound of the human subjects confidence interval
        "lower_sil": lower_CI_sil[1],         # The lower bound of the silicon subjects confidence interval
        "upper_sil": upper_CI_sil[1],         # The upper bound of the silicon subjects confidence interval
        "ppi_corr": rho[1]},      # The association between predictions and outcomes
        index=[0])
    
    return output_df 

In [258]:
ids = df["ResponseID"].unique()
n = 22000
N = len(ids) - n
random.seed(2024)

n_ids = random.sample(ids.tolist(), k=n)
N_ids = random.sample(list(set(ids) - set(n_ids)), k=N)

df_human = df[ df["ResponseID"].isin(n_ids) ]
df_silicon = df [ df["ResponseID"].isin(N_ids)]

models = [
    "gpt4turbo_wp_Saved_1","gpt4o_wp_Saved_1","gpt35turbo0125_wp_Saved_1",
    "o1mini_wp_Saved_1",#"claude-3-5-sonnet-20241022_wp_Saved_1"
]

results2 = pd.DataFrame()
for model in models: 
    
    print("Model: ", model)
    results1 = pd.concat([
        compute_amce_ppi(df_human, df_silicon, x="Intervention", y=model), 
        compute_amce_ppi(df_human, df_silicon, x="Barrier", y=model), 
        compute_amce_ppi(df_human, df_silicon, x="Gender", y=model), 
        compute_amce_ppi(df_human, df_silicon, x="Fitness", y=model), 
        compute_amce_ppi(df_human, df_silicon, x="Social Status", y=model), 
        compute_amce_ppi(df_human, df_silicon, x="CrossingSignal",y=model),
        compute_amce_ppi(df_human, df_silicon, x="Age", y=model),
        compute_amce_ppi(df_human, df_silicon, x="Utilitarian", y=model),
        compute_amce_ppi(df_human, df_silicon, x="Species", y=model)
    ],ignore_index=True)
    
    results2 = pd.concat([results2, results1],ignore_index=True).sort_values(by=["y","ppi_corr"], ascending=False)
    
results2.to_csv("../Data/7_rho.csv", index=False)
results2

Model:  gpt4turbo_wp_Saved_1
Model:  gpt4o_wp_Saved_1
Model:  gpt35turbo0125_wp_Saved_1
Model:  o1mini_wp_Saved_1


Unnamed: 0,y,x,beta_ppi,beta_hum,beta_hum_sm,beta_sil,beta_sil_sm,se_ppi,se_hum,se_sil,lower_ppi,upper_ppi,lower_hum,upper_hum,lower_sil,upper_sil,ppi_corr
27,o1mini_wp_Saved_1,Intervention,0.080673,0.080648,0.080648,0.048134,0.048134,0.009471,0.009476,0.07837482,0.062109,0.099235,0.062075,0.099221,-0.105478,0.201745,0.326353
28,o1mini_wp_Saved_1,Barrier,0.17484,0.176455,0.176455,-0.169017,-0.169017,0.014073,0.014079,0.1185514,0.147488,0.202652,0.148861,0.204048,-0.401374,0.06334,0.267929
29,o1mini_wp_Saved_1,Gender,0.190835,0.189146,0.189146,0.531791,0.531791,0.021749,0.021762,0.1429248,0.147772,0.233028,0.146493,0.2318,0.251663,0.811918,0.261184
30,o1mini_wp_Saved_1,Fitness,0.087912,0.088692,0.088692,-0.166722,-0.166722,0.022821,0.022827,0.1718222,0.043198,0.132654,0.043952,0.133432,-0.503487,0.170043,0.241193
31,o1mini_wp_Saved_1,Social Status,0.31248,0.308567,0.308567,1.0,1.0,0.060488,0.06039,6.514183e-16,0.19163,0.428739,0.190206,0.426928,1.0,1.0,0.232572
32,o1mini_wp_Saved_1,CrossingSignal,0.325109,0.325673,0.325673,0.468531,0.468531,0.016412,0.016413,0.1254553,0.292956,0.35729,0.293503,0.357842,0.222644,0.714419,0.194138
33,o1mini_wp_Saved_1,Age,0.478747,0.478634,0.478634,0.063856,0.063856,0.019489,0.019487,0.2267698,0.440539,0.516934,0.44044,0.516827,-0.380605,0.508316,0.13348
35,o1mini_wp_Saved_1,Species,0.639361,0.639361,0.639361,0.913252,0.913252,0.017907,0.017904,0.06095381,0.604264,0.674459,0.60427,0.674453,0.793785,1.032719,0.070262
34,o1mini_wp_Saved_1,Utilitarian,0.584435,0.5843,0.5843,0.893877,0.893877,0.017875,0.017876,0.07720557,0.549378,0.619447,0.549264,0.619335,0.742557,1.045197,0.07003
0,gpt4turbo_wp_Saved_1,Intervention,0.07057,0.070634,0.070634,0.07276,0.07276,0.005587,0.005591,0.04593858,0.059621,0.081522,0.059675,0.081593,-0.017278,0.162798,0.348803


Next, we vary the number of human subjects and silicon subjects in a simulation.

In [259]:
# sample size of human subjects
ns = [500,750]
ns= [500]

# multiples of human subjects sample size
ks = list([0.1, 0.25, 0.5, 0.75]) + list(np.arange(1, 10.5, 0.5))

# number of repetitions for combinations of n and N
reps = 50

# LLM predictions
Ys = models
Ys = ["gpt4turbo_wp_Saved_1"]

# structural attributes of scenarios
Xs_structural  = ['Intervention', 'Barrier','CrossingSignal']

# attributes of characters
Xs_characters = ['Gender','Fitness','Social Status','Age','Utilitarian','Species']

# all attributes
Xs = Xs_structural + Xs_characters

result = pd.DataFrame()

# loop models
for y in Ys:
  
  print(f"Iterating predictions from the model: {y}")
  
  # loop over predictors
  for x in Xs:
    print(f"    Predictor: {x}")

    # loop over sample sizes of human subjects
    for n in ns:
      print(f"        Human sample size: {n}")

      # sample size silicon subjects 
      Ns = [int(n * k) for n in ns for k in ks]
      
      # loop over sample sizes of silicon subjects
      for N in Ns:
        
        # loop over repetitions
        for r in range(reps):

          # subset to dilemmas with variation on structural attribute
          if x in Xs_structural:

              cnt = df.groupby("ResponseID")[x].nunique()
              ids = cnt[ cnt > 1].index.tolist()

          # subset to dilemmas with relevant character attribute
          if x in Xs_characters:

              ids = df.loc[ (df["ScenarioType"]==x) & (df["ScenarioTypeStrict"]==x), "ResponseID"].tolist()
          
          # skip current iteration if target n is larger than population
          if (len(ids) < n):
             continue 

          # sample dilemmas for human subjects sample
          n_ids = random.sample(ids, k=n)
          
          # get remaining dilemma ids to sample from
          remaining_ids = list(set(ids) - set(n_ids))

          # skip current iteration if target N is larger than population
          if (len(remaining_ids) < N):
             continue 
          
          # sample dilemmas for silicon subjects sample
          N_ids = random.sample(remaining_ids, k=N)

          # subset data
          df_human = df[ df["ResponseID"].isin(n_ids) ]
          df_silicon = df [ df["ResponseID"].isin(N_ids)]

          # compute acme on n human subjects and N silicon subjects
          ppi = compute_amce_ppi(n_data=df_human, N_data=df_silicon, x=x, y=y)

          # store data
          ppi["n"] = n
          ppi["N"] = N
          
          result = pd.concat([result, ppi], ignore_index=True)
          


Iterating predictions from the model: gpt4turbo_wp_Saved_1
    Predictor: Intervention
        Human sample size: 500
    Predictor: Barrier
        Human sample size: 500
    Predictor: CrossingSignal
        Human sample size: 500
    Predictor: Gender
        Human sample size: 500
    Predictor: Fitness
        Human sample size: 500
    Predictor: Social Status
        Human sample size: 500
    Predictor: Age
        Human sample size: 500
    Predictor: Utilitarian
        Human sample size: 500


We benchmark the silicon subjects design and the mixed subjects design against a human subjects approach.

In [None]:
# subset point estimates of AMCEs from the entire human subjects sample
benchmark = amce_human_subjects.loc[:, ['x', 'beta']].rename(columns={'beta': 'param'})

# merge benchmark with results from simulation
result_wb = pd.merge(result, benchmark, on='x', how='left')

# report if true value is within the confidence interval from the mixed subjects 
result_wb['coverage_ppi'] = (
    (result_wb['lower_ppi'] <= result_wb['param']) & 
    (result_wb['param'] <= result_wb['upper_ppi'])
).astype(int) 

# report if true value is within the confidence interval from the silicon subjects 
result_wb['coverage_sil'] = (
    (result_wb['lower_sil'] <= result_wb['param']) & 
    (result_wb['param'] <= result_wb['upper_sil'])
).astype(int) 

# report if true value is within the confidence interval from the silicon subjects 
result_wb['coverage_hum'] = (
    (result_wb['lower_hum'] <= result_wb['param']) & 
    (result_wb['param'] <= result_wb['upper_hum'])
).astype(int) 

result_wb

Unnamed: 0,y,x,beta_ppi,beta_hum,beta_hum_sm,beta_sil,beta_sil_sm,se_ppi,se_hum,se_sil,...,upper_hum,lower_sil,upper_sil,ppi_corr,n,N,param,coverage_ppi,coverage_sil,coverage_hum
0,gpt4turbo_wp_Saved_1,Intervention,0.030414,0.040908,0.040908,-0.128529,-0.128529,0.036530,0.036902,0.110550,...,0.113234,-0.345203,0.088145,0.449060,500,50,0.065392,1,1,1
1,gpt4turbo_wp_Saved_1,Intervention,0.056072,0.053008,0.053008,0.090701,0.090701,0.036729,0.036950,0.110108,...,0.125428,-0.125108,0.306509,0.342804,500,50,0.065392,1,1,1
2,gpt4turbo_wp_Saved_1,Intervention,0.031872,0.035247,0.035247,0.089163,0.089163,0.036410,0.037322,0.074596,...,0.108398,-0.057043,0.235369,0.509993,500,125,0.065392,1,1,1
3,gpt4turbo_wp_Saved_1,Intervention,0.060632,0.058040,0.058040,0.103081,0.103081,0.037225,0.037554,0.070963,...,0.131644,-0.036003,0.242165,0.285535,500,125,0.065392,1,1,1
4,gpt4turbo_wp_Saved_1,Intervention,0.126876,0.134947,0.134947,0.019584,0.019584,0.036458,0.036883,0.050939,...,0.207237,-0.080254,0.119423,0.278100,500,250,0.065392,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1219,o1mini_wp_Saved_1,Species,0.587258,0.589980,0.589980,0.774952,0.774952,0.058845,0.058742,0.016036,...,0.705112,0.743522,0.806383,0.025349,500,3000,0.632202,1,0,1
1220,o1mini_wp_Saved_1,Species,0.671314,0.681742,0.681742,0.760172,0.760172,0.046196,0.045972,0.016295,...,0.771846,0.728235,0.792110,0.072532,500,3250,0.632202,1,0,1
1221,o1mini_wp_Saved_1,Species,0.540384,0.540384,0.540384,0.800637,0.800637,0.058029,0.057931,0.014835,...,0.653926,0.771560,0.829714,0.196161,500,3250,0.632202,1,0,1
1222,o1mini_wp_Saved_1,Species,0.573510,0.570771,0.570771,0.787262,0.787262,0.050695,0.050752,0.014963,...,0.670243,0.757935,0.816590,0.062043,500,3500,0.632202,1,0,1


In [None]:
# Group by n, N, and LLM then calculate mean across repetitions
stats = ['beta_ppi','se_ppi','lower_ppi','upper_ppi','coverage_ppi','ppi_corr',
         'beta_sil','se_sil','lower_sil','upper_sil','coverage_sil',
         'beta_hum','se_hum','lower_hum','upper_hum','coverage_hum']

summ = result_wb.groupby(['n','N','y','x','param'])[stats].mean().reset_index()

# Calculate bias columns
summ['repetitions'] = reps
summ['bias_ppi'] = summ['beta_ppi'] - summ['param']
summ['bias_sil'] = summ['beta_sil'] - summ['param']
summ['bias_hum'] = summ['beta_hum'] - summ['param']

summ['rmse_ppi'] = np.sqrt(summ['bias_ppi']**2 + summ['se_ppi']**2)
summ['rmse_sil'] = np.sqrt(summ['bias_sil']**2 + summ['se_sil']**2)
summ['rmse_hum'] = np.sqrt(summ['bias_hum']**2 + summ['se_hum']**2)

# Save averaged simulation results to compressed csv file
summ.to_csv("../Data/7_ResultsPPI.csv.gz", compression="gzip", index=False)
summ

Unnamed: 0,n,N,y,x,param,beta_ppi,se_ppi,lower_ppi,upper_ppi,coverage_ppi,...,lower_hum,upper_hum,coverage_hum,repetitions,bias_ppi,bias_sil,bias_hum,rmse_ppi,rmse_sil,rmse_hum
0,500,50,gpt35turbo0125_wp_Saved_1,Age,0.494434,0.465116,0.033706,0.398940,0.531066,1.0,...,0.399077,0.531258,1.0,2,-0.029318,-0.569923,-0.029267,0.044673,0.580647,0.044650
1,500,50,gpt35turbo0125_wp_Saved_1,Barrier,0.150271,0.142049,0.040863,0.062077,0.222257,1.0,...,0.062300,0.221756,1.0,2,-0.008221,0.171604,-0.008242,0.041682,0.216308,0.041505
2,500,50,gpt35turbo0125_wp_Saved_1,CrossingSignal,0.339410,0.372408,0.045715,0.282668,0.461868,1.0,...,0.282919,0.461987,1.0,2,0.032997,-0.355134,0.033043,0.056380,0.384300,0.056379
3,500,50,gpt35turbo0125_wp_Saved_1,Fitness,0.130361,0.121131,0.037347,0.047971,0.194370,1.0,...,0.048335,0.194675,1.0,2,-0.009230,-0.209160,-0.008857,0.038471,0.239064,0.038369
4,500,50,gpt35turbo0125_wp_Saved_1,Gender,0.155021,0.148579,0.036964,0.075892,0.220790,1.0,...,0.075235,0.220170,1.0,2,-0.006442,0.013858,-0.007319,0.037522,0.108909,0.037691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,500,5000,gpt4turbo_wp_Saved_1,CrossingSignal,0.339410,0.337888,0.044689,0.250298,0.425475,1.0,...,0.247190,0.428821,1.0,2,-0.001522,0.326841,-0.001405,0.044715,0.327055,0.046357
608,500,5000,gpt4turbo_wp_Saved_1,Intervention,0.065392,0.066117,0.034977,-0.002530,0.134578,1.0,...,-0.001432,0.142330,1.0,2,0.000726,0.016202,0.005058,0.034985,0.019845,0.037022
609,500,5000,o1mini_wp_Saved_1,Barrier,0.150271,0.153690,0.064509,0.026543,0.279415,1.0,...,-0.007683,0.264770,1.0,2,0.003419,0.052192,-0.021728,0.064600,0.056618,0.072822
610,500,5000,o1mini_wp_Saved_1,CrossingSignal,0.339410,0.245460,0.080240,0.088225,0.402760,1.0,...,0.096890,0.419873,1.0,2,-0.093950,0.277240,-0.081029,0.123552,0.278042,0.115563
