In [1]:
import pandas as pd
import numpy as np
from ppi_py import ppi_ols_ci, classical_ols_ci, ppi_ols_pointestimate

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [48]:
# Alternative remote source, kept for reference. Note that it points at a
# different file (Joined.csv), not the JoinedLong.csv that is loaded below.
#df1 = pd.read_csv("https://raw.githubusercontent.com/davidbroska/IntegrativeExperimentsGAI/main/Data/JoinedAwadTakemoto/Joined.csv")
# Load the local dataset. Later cells read the 'Saved', 'SavedGPT4' and
# 'SavedGPT3' columns from this frame, plus the covariate columns named in Covs.
df = pd.read_csv("../Data/JoinedLong.csv")

# Covariate column names. The analysis loops below pass each one to calc_ci,
# where it becomes the single regressor next to the intercept.
Covs = [
    'PedPed',
    'Barrier',
    'CrossingSignal',
    'NumberOfCharacters',
    'DiffNumberOFCharacters',
    'LeftHand',
    'Man',
    'Woman',
    'Pregnant',
    'Stroller',
    'OldMan',
    'OldWoman',
    'Boy',
    'Girl',
    'Homeless',
    'LargeWoman',
    'LargeMan',
    'Criminal',
    'MaleExecutive',
    'FemaleExecutive',
    'FemaleAthlete',
    'MaleAthlete',
    'FemaleDoctor',
    'MaleDoctor',
    'Dog',
    'Cat',
    'Intervention',
]

def calc_ci(dataset, var, n, N, seed=500, pred_col='SavedGPT4'):
  """
  Compare PPI and classical OLS confidence intervals for one covariate.

  Draws n + N rows from `dataset` without replacement. The first n rows are
  used as the labeled sample (human outcome plus model prediction) and the
  remaining N rows as the unlabeled sample (model prediction only). The
  outcome is regressed on an intercept and `var`. No significance level is
  passed to ppi_py, so both intervals use that library's default.

  Args:
      dataset: DataFrame holding the column `var`, the human outcome column
        'Saved' and the model prediction column `pred_col`.
      var: The name of the covariate column used as the regressor.
      n: The sample size of the human responses.
      N: The sample size of the responses predicted by the algorithm.
      seed: Random state handed to DataFrame.sample. Defaults to 500, the
        value that used to be hard-coded.
      pred_col: Name of the column with the model predictions. Defaults to
        'SavedGPT4'; pass e.g. 'SavedGPT3' to evaluate another model.

  Returns:
      A one-row pandas DataFrame with the following columns:
        * var: The name of the variable.
        * n: The human (labeled) sample size.
        * N: The model-only (unlabeled) sample size.
        * lower_CI_ppi: The lower bound of the PPI confidence interval.
        * upper_CI_ppi: The upper bound of the PPI confidence interval.
        * lower_CI_ols: The lower bound of the OLS confidence interval.
        * upper_CI_ols: The upper bound of the OLS confidence interval.

  Raises:
      ValueError: If n or N is smaller than 1, or if n + N exceeds the
        number of rows in `dataset`.
  """

  # Fail early with a clear message instead of deep inside sampling/fitting.
  if n < 1 or N < 1:
    raise ValueError(f"n and N must both be at least 1 (got n={n}, N={N})")
  if n + N > len(dataset):
    raise ValueError(
        f"n + N = {n + N} exceeds the {len(dataset)} rows available in dataset")

  # One shuffled draw, split into disjoint labeled and unlabeled parts.
  sub_df = dataset.sample(n + N, random_state=seed, ignore_index=True)
  df_people = sub_df.iloc[:n]
  df_gpt = sub_df.iloc[n:]

  # Design matrices: column 0 is the intercept, column 1 is the covariate.
  Xn = np.ones((n, 2))
  Xn[:, 1] = df_people[var]
  Yn_ppl = df_people['Saved'].to_numpy()
  Yn_gpt = df_people[pred_col].to_numpy()

  XN = np.ones((N, 2))
  XN[:, 1] = df_gpt[var]
  YN_gpt = df_gpt[pred_col].to_numpy()

  lower_CI_ppi, upper_CI_ppi = ppi_ols_ci(Xn, Yn_ppl, Yn_gpt, XN, YN_gpt)
  lower_CI_ols, upper_CI_ols = classical_ols_ci(Xn, Yn_ppl)

  # Entry 1 of each bound corresponds to column 1 of the design matrix,
  # i.e. the coefficient on `var` (entry 0 is the intercept).
  output_df = pd.DataFrame({
      "var": var,
      "n": n,
      "N": N,
      "lower_CI_ppi": lower_CI_ppi[1],
      "upper_CI_ppi": upper_CI_ppi[1],
      "lower_CI_ols": lower_CI_ols[1],
      "upper_CI_ols": upper_CI_ols[1]}, index=[0])
  return output_df

In [None]:
# Quick single-configuration check of calc_ci. The filtered frame is built
# inline from df so this cell does not depend on the GPT4 variable, which is
# only defined in a later cell (a fresh Restart & Run All would otherwise
# fail here with a NameError).
calc_ci(df.dropna(subset=["SavedGPT4", "Saved"]), 'PedPed', 50, 100)

## GPT4 Predictions

In [35]:
# Keep only rows that have both a GPT-4 prediction and a human response.
GPT4 = df.dropna(subset=["SavedGPT4", "Saved"])

# Grid of human sample sizes: steps of 50 up to 500, then steps of 100.
maxhuman = 2000
humansamples = list(range(50, 501, 50)) + list(range(600, maxhuman+1, 100))

# Rows still available for the model-only sample once the largest human
# sample has been drawn.
llm_pool = GPT4.shape[0] - maxhuman
assert llm_pool >= 50, (
    f"only {GPT4.shape[0]} usable rows; need at least {maxhuman + 50} "
    "for the largest human sample plus the smallest LLM sample")
maxllm = (llm_pool // 100) * 100

# Same grid shape for the LLM sample sizes, capped at the pool so calc_ci is
# never asked for more rows than exist (the fixed 50..500 part of the grid
# would otherwise be used even when fewer rows remain).
llmsamples = [
    size
    for size in list(range(50, 501, 50)) + list(range(600, maxllm+1, 100))
    if size <= llm_pool
]
print(humansamples)
print(llmsamples)

[50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000]
[50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300]


In [43]:

# Collect the one-row frames in a list and concatenate once at the end.
# Calling pd.concat inside the innermost loop re-copies the accumulated
# frame on every iteration, which is quadratic in the number of grid points.
ci_frames = []

for var in Covs:
  print(f"Iterating over the variable: {var}")

  for n in humansamples:
    print(f"with human sample size: {n}")

    for N in llmsamples:
      print(f"Iterating over the LLM sample size: {N}")
      ci_frames.append(calc_ci(GPT4, var, n, N))

# pd.concat raises on an empty list, so fall back to an empty frame,
# which is what the incremental version produced for an empty grid.
result = pd.concat(ci_frames) if ci_frames else pd.DataFrame()

Iterating over the variable: PedPed
with human sample size: 50
Iterating over the LLM sample size: 50
Iterating over the LLM sample size: 100
Iterating over the LLM sample size: 150
Iterating over the LLM sample size: 200
Iterating over the LLM sample size: 250
Iterating over the LLM sample size: 300
Iterating over the LLM sample size: 350
Iterating over the LLM sample size: 400
Iterating over the LLM sample size: 450
Iterating over the LLM sample size: 500
Iterating over the LLM sample size: 600
Iterating over the LLM sample size: 700
Iterating over the LLM sample size: 800
Iterating over the LLM sample size: 900
Iterating over the LLM sample size: 1000
Iterating over the LLM sample size: 1100
Iterating over the LLM sample size: 1200
Iterating over the LLM sample size: 1300
Iterating over the LLM sample size: 1400
Iterating over the LLM sample size: 1500
Iterating over the LLM sample size: 1600
Iterating over the LLM sample size: 1700
Iterating over the LLM sample size: 1800
Iterating

In [47]:
# Destination of the GPT-4 results table
filename = '../Data/2_PPI_GPT4.csv'

# Write `result` as CSV through an explicitly opened handle; newline='' keeps
# the file layer from translating the line endings that to_csv emits.
with open(filename, mode='w', newline='') as csv_handle:
    result.to_csv(csv_handle)


## GPT3 Predictions

In [56]:
# Keep only rows that have both a GPT-3 prediction and a human response.
GPT3 = df.dropna(subset=["SavedGPT3", "Saved"])

# Grid of human sample sizes: steps of 50 up to 500, then steps of 100.
maxhuman = 2000
humansamples = list(range(50, 501, 50)) + list(range(600, maxhuman+1, 100))

# Rows still available for the model-only sample once the largest human
# sample has been drawn.
llm_pool = GPT3.shape[0] - maxhuman
assert llm_pool >= 50, (
    f"only {GPT3.shape[0]} usable rows; need at least {maxhuman + 50} "
    "for the largest human sample plus the smallest LLM sample")
maxllm = (llm_pool // 100) * 100

# Same grid shape for the LLM sample sizes, capped at the pool so calc_ci is
# never asked for more rows than exist (the fixed 50..500 part of the grid
# would otherwise be used even when fewer rows remain).
llmsamples = [
    size
    for size in list(range(50, 501, 50)) + list(range(600, maxllm+1, 100))
    if size <= llm_pool
]
print(humansamples)
print(llmsamples)
maxllm

[50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000]
[50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 5100, 5200, 5300, 5400, 5500, 5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100, 7200, 7300, 7400, 7500, 7600, 7700, 7800, 7900, 8000, 8100, 8200, 8300, 8400, 8500, 8600, 8700, 8800, 8900, 9000, 9100, 9200, 9300, 9400, 9500, 9600, 9700, 9800, 9900, 10000, 10100, 10200, 10300, 10400, 10500, 10600, 10700, 10800, 10900, 11000, 11100, 11200, 11300, 11400, 11500, 11600, 11700, 11800, 11900, 12000, 12100, 12200, 12300, 12400, 12500]


12500

In [58]:


# calc_ci reads the model predictions from the 'SavedGPT4' column. Passing
# GPT3 unchanged would therefore score GPT-4 predictions on the GPT-3 rows.
# Expose the GPT-3 predictions under the column name calc_ci reads so this
# section really evaluates GPT-3.
GPT3_as_pred = GPT3.assign(SavedGPT4=GPT3["SavedGPT3"])

# Collect the one-row frames in a list and concatenate once at the end.
# Calling pd.concat inside the innermost loop re-copies the accumulated
# frame on every iteration, which is quadratic in the number of grid points.
ci_frames = []

for var in Covs:
  print(f"Iterating over the variable: {var}")

  for n in humansamples:
    print(f"with human sample size: {n}")

    for N in llmsamples:
      print(f"Iterating over the LLM sample size: {N}")
      ci_frames.append(calc_ci(GPT3_as_pred, var, n, N))

# pd.concat raises on an empty list, so fall back to an empty frame,
# which is what the incremental version produced for an empty grid.
result = pd.concat(ci_frames) if ci_frames else pd.DataFrame()

Iterating over the variable: PedPed
with human sample size: 50
Iterating over the LLM sample size: 50
Iterating over the LLM sample size: 100
Iterating over the LLM sample size: 150
Iterating over the LLM sample size: 200
Iterating over the LLM sample size: 250
Iterating over the LLM sample size: 300
Iterating over the LLM sample size: 350
Iterating over the LLM sample size: 400
Iterating over the LLM sample size: 450
Iterating over the LLM sample size: 500
Iterating over the LLM sample size: 600
Iterating over the LLM sample size: 700
Iterating over the LLM sample size: 800
Iterating over the LLM sample size: 900
Iterating over the LLM sample size: 1000
Iterating over the LLM sample size: 1100
Iterating over the LLM sample size: 1200
Iterating over the LLM sample size: 1300
Iterating over the LLM sample size: 1400
Iterating over the LLM sample size: 1500
Iterating over the LLM sample size: 1600
Iterating over the LLM sample size: 1700
Iterating over the LLM sample size: 1800
Iterating

In [60]:
# Destination of the GPT-3 results table
filename = '../Data/2_PPI_GPT3.csv'

# Write `result` as CSV through an explicitly opened handle; newline='' keeps
# the file layer from translating the line endings that to_csv emits.
with open(filename, mode='w', newline='') as csv_handle:
    result.to_csv(csv_handle)