In [6]:
import pandas as pd
import numpy as np
from ppi_py import ppi_ols_ci, classical_ols_ci, ppi_ols_pointestimate

# Load the survey sample with the LLM predictions.
# The original cell downloaded the file from GitHub and then immediately
# overwrote the result with the local copy, so the download was always wasted
# and the cell failed without network access. Prefer the local copy and only
# fall back to the public GitHub copy when the local file is missing.
try:
  df = pd.read_csv("../Data/5_SurveySampleLLM.csv.gz")
except FileNotFoundError:
  df = pd.read_csv("https://raw.githubusercontent.com/davidbroska/IntegrativeExperimentsGAI/main/Data/5_SurveySampleLLM.csv.gz")


In [7]:
# Names of predictor columns (same entries and same order as before),
# written one per line so that additions/removals show up as one-line diffs.
Covs = [
    'PedPed',
    'Barrier',
    'CrossingSignal',
    'NumberOfCharacters',
    'DiffNumberOFCharacters',
    'LeftHand',
    'Man',
    'Woman',
    'Pregnant',
    'Stroller',
    'OldMan',
    'OldWoman',
    'Boy',
    'Girl',
    'Homeless',
    'LargeWoman',
    'LargeMan',
    'Criminal',
    'MaleExecutive',
    'FemaleExecutive',
    'FemaleAthlete',
    'MaleAthlete',
    'FemaleDoctor',
    'MaleDoctor',
    'Dog',
    'Cat',
    'Intervention',
]

def calc_ci(df, x, y, n, N, max_resamples=1000):
  """
  Draw one random subsample of df and compute PPI and classical OLS estimates
  for the coefficient of x in a regression of 'Saved' on an intercept and x.

  n+N rows are sampled without replacement. The first n rows are used as the
  labeled data (observed outcome 'Saved' together with the prediction in
  column y); the remaining N rows are used as unlabeled data for which only
  the prediction in column y is used.

  Args:
      df: DataFrame containing the columns x, y, 'Saved' and 'weights'.
      x:  The name of the predictor variable (e.g. Barrier, Dog,...).
      y:  The name of the column with the algorithm's predictions.
      n:  The sample size of the human responses.
      N:  The sample size of the responses predicted by the algorithm.
      max_resamples: Maximum number of samples to draw while looking for one
          with enough variation (see below) before giving up.

  Returns:
      A one-row DataFrame with the following columns:
        y: The name of the prediction column.
        x: The name of the predictor variable.
        n: The number of labeled rows.
        N: The number of unlabeled rows.
        pointest_ppi: The PPI point estimate for the coefficient of x.
        lower_CI_ppi: The lower bound of the PPI confidence interval.
        upper_CI_ppi: The upper bound of the PPI confidence interval.
        lower_CI_ols: The lower bound of the OLS confidence interval.
        upper_CI_ols: The upper bound of the OLS confidence interval.

  Raises:
      ValueError: If no sample with at least four distinct (x, 'Saved') and
          four distinct (x, y) combinations is found in max_resamples draws.
  """

  def _n_cells(sample, outcome):
    # number of distinct (x, outcome) combinations that occur in the sample
    return sample.groupby([x, outcome]).count().shape[0]

  # Ensure that there is variation in X and in both outcomes in the sample.
  # The threshold is 4 because there are two variables with two levels.
  # The original code computed the LLM check on 'Saved' a second time, so the
  # prediction column y was never actually checked; it is checked here.
  # In the rare case when there is no variation, sample again (bounded, so a
  # column without any variation cannot make this loop forever).
  for _ in range(max_resamples):
    sub_df = df.sample(n+N, ignore_index=True)
    if _n_cells(sub_df, 'Saved') >= 4 and _n_cells(sub_df, y) >= 4:
      break
  else:
    raise ValueError(
        f"No sample of size {n+N} with variation in '{x}', 'Saved' and '{y}' "
        f"found after {max_resamples} attempts")

  df_people = sub_df.iloc[:n]
  df_gpt = sub_df.iloc[n:]

  Xn = np.ones((n,2))                            # intercept
  Xn[:,1] = df_people[x]                         # covariates in the labeled data
  Yn_ppl = df_people['Saved'].to_numpy()         # observed outcomes
  Yn_gpt = df_people[y].to_numpy()               # LLM predictions for labeled data
  w_labeled = df_people['weights'].to_numpy()    # define weights for the labeled data

  XN = np.ones((N,2))                            # intercept
  XN[:,1] = df_gpt[x]                            # covariates in the unlabeled data
  YN_gpt = df_gpt[y].to_numpy()                  # LLM predictions for unlabeled data
  w_unlabeled = df_gpt['weights'].to_numpy()     # define weights for the unlabeled data

  # calculate point estimate
  pointest_ppi = ppi_ols_pointestimate(Xn, Yn_ppl, Yn_gpt, XN, YN_gpt, w=w_labeled, w_unlabeled=w_unlabeled)

  # calculate confidence intervals
  # https://ppi-py.readthedocs.io/en/latest/baselines.html#ppi_py.classical_ols_ci
  lower_CI_ppi, upper_CI_ppi = ppi_ols_ci(Xn, Yn_ppl, Yn_gpt, XN, YN_gpt, w=w_labeled, w_unlabeled=w_unlabeled,alpha=.05)
  lower_CI_ols, upper_CI_ols = classical_ols_ci(Xn, Yn_ppl, w=w_labeled,alpha=.05)

  # Create and return the output DataFrame
  # (index 1 selects the coefficient of x; index 0 is the intercept)
  output_df = pd.DataFrame({
      "y": y,
      "x": x,
      "n": n,
      "N": N,
      "pointest_ppi": pointest_ppi[1],
      "lower_CI_ppi": lower_CI_ppi[1],
      "upper_CI_ppi": upper_CI_ppi[1],
      "lower_CI_ols": lower_CI_ols[1],
      "upper_CI_ols": upper_CI_ols[1]}, index=[0])
  return output_df

In [8]:
# Simulation grid
ns = range(50,250,50)   # human (labeled) sample sizes: 50, 100, 150, 200
ks = range(1,11,1)      # LLM sample size as a multiple of n (N = n*k), k = 1..10
# Prediction columns to evaluate; further prediction columns can be appended here.
Ys = ["gpt35turbo0125_wp_Saved","gpt4turbo_wp_Saved","gpt4o_wp_Saved"]
Xs = ['NumberOfCharacters','Boy','Girl','Woman','Man']
reps = 10               # number of random subsamples per (y, x, n, N) combination

# calc_ci draws its samples with DataFrame.sample(random_state=None), which uses
# NumPy's global random state; seed it so the simulation is reproducible.
SEED = 20240501
np.random.seed(SEED)

# Collect the one-row results in a list and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the innermost loop copies all
# previous rows on every iteration (quadratic cost).
rows = []
for y in Ys:
  print(f"Iterating over dependent variable: {y}")

  for x in Xs:
    print(f"    Predictor: {x}")

    for n in ns:
      for k in ks:
        N = n*k
        for r in range(reps):
          rows.append(calc_ci(df=df, x=x, y=y, n=n, N=N))

# pd.concat raises on an empty list, so keep the original empty-frame result
# when the grid is empty.
result = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame()

Iterating over dependent variable: gpt35turbo0125_wp_Saved
    Predictor: NumberOfCharacters
    Predictor: Boy
    Predictor: Girl
    Predictor: Woman
    Predictor: Man
Iterating over dependent variable: gpt4turbo_wp_Saved
    Predictor: NumberOfCharacters
    Predictor: Boy
    Predictor: Girl
    Predictor: Woman
    Predictor: Man
Iterating over dependent variable: gpt4o_wp_Saved
    Predictor: NumberOfCharacters
    Predictor: Boy
    Predictor: Girl
    Predictor: Woman
    Predictor: Man


In [9]:
# Persist the simulation results as a gzip-compressed CSV and read them back.
results_path = "../Data/6_ResultsPPI.csv.gz"
result.to_csv(results_path, compression="gzip", index=False)
# NOTE(review): this rebinds `df` from the survey sample to the results table,
# so re-running the simulation cell after this one would sample from the
# results. Kept as-is because later cells may rely on `df` holding the results.
df = pd.read_csv(results_path)
