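This notebook asks ChatGPT (`gpt-3.5-turbo-0301`) to rate, from each participant's written texts (values and behaviors essays), how strongly the author endorses a set of psychological constructs. Ratings are requested at the construct level ("general" prompts) and at the level of individual questionnaire items ("hybrid" prompts). The responses are then cleaned into numbers, merged with the construct scores in the survey data, and saved as a CSV file.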

## Load Packages

In [1]:
import openai
import os
import pandas as pd
import numpy as np

import string
import re
# Characters to strip from text: all ASCII punctuation except hyphens and commas.
remove = string.punctuation
remove = remove.replace("-", "").replace(",", "")  # keep hyphens and commas
# re.escape is needed because the set contains "\", "]" and "^", which are regex
# syntax inside a character class; unescaped, "\]" is read as an escaped bracket
# and the backslash itself is never matched.
pattern = "[{}]".format(re.escape(remove))  # character class matching any one of them

import pickle
import time
import logging
from retry import retry
logging.basicConfig()

# Calculate the delay based on your rate limit
# (number of seconds to sleep before each API request)
rate_limit_per_minute = 3500.0  # requests allowed per minute; adjust to the limit of your account
delay_60 = 60.0 / 60  # 1 second per request, i.e. 60 requests per minute
delay_full = 60.0 / rate_limit_per_minute  # spacing that uses the full rate limit

## General Parameters

In [2]:
# Run configuration and locations of the input files
mode = "full"
folder = "../data/"
path_texts = f"{folder}CCR_clean.csv"  # read into df_texts
path_items = f"{folder}CCR_items.csv"  # read into df_items

## Functions

In [3]:
# chatGPT parameters
# The key is read from the OPENAI_API_KEY environment variable so that it never
# has to be stored in the notebook. With the variable unset this is "".
openai.api_key = os.environ.get("OPENAI_API_KEY", "")  # set OPENAI_API_KEY to your openai key
model_engine = "gpt-3.5-turbo-0301"

# Retry (every 5 seconds, without an upper bound) only on failures that can
# resolve themselves. Any other error, e.g. a missing/invalid API key or a
# malformed request, is raised immediately instead of being retried forever.
@retry(
    exceptions=(
        openai.error.RateLimitError,
        openai.error.APIError,
        openai.error.APIConnectionError,
        openai.error.ServiceUnavailableError,
        openai.error.Timeout,
    ),
    delay=5,
)
def delayed_completion(delay_in_seconds: float = 1, **kwargs):
    """Sleep for ``delay_in_seconds``, then request a chat completion.

    Parameters
    ----------
    delay_in_seconds : float
        Pause before the request, used to stay below the API rate limit.
        The pause is applied on every attempt, including retries.
    **kwargs
        Passed unchanged to ``openai.ChatCompletion.create``
        (e.g. ``model``, ``messages``, ``temperature``).

    Returns
    -------
    The response object returned by ``openai.ChatCompletion.create``.
    """

    # Sleep for the delay
    time.sleep(delay_in_seconds)

    # Call the Completion API and return the result
    return openai.ChatCompletion.create(**kwargs)

## Load Data

In [7]:
# Read both input tables, then pull the two essay columns out as plain lists
df_texts, df_items = pd.read_csv(path_texts), pd.read_csv(path_items)
text_values = df_texts["ValuesSurvey"].values.tolist()
text_behaviors = df_texts["BehaviorsSurvey"].values.tolist()

# Readable labels for the construct columns; the labels are what gets inserted
# into the prompts. Can be left out if all construct names are already correct
# in the raw data.
constr_rename_dict = {
    "CARE_tot": "Care values",
    "EQUALITY_tot": "Equality values",
    "PROPORTIONALITY_tot": "Proportionality values",
    "LOYALTY_tot": "Loyalty values",
    "AUTHORITY_tot": "Authority values",
    "PURITY_tot": "Purity values",
    "Individualism_tot": "Individualism",
    "Collectivism_tot": "Collectivism",
    "Religiosity_tot": "Religiosity",
    "conservatism_tot": "Conservatism",
    "NFC_tot": "Need for Cognition",
    "Tightness": "Cultural Tightness",
    "PVQ_SD_tot": "Self-direction values",
    "PVQ_PO_tot": "Power values",
    "PVQ_UN_tot": "Universalism",
    "PVQ_AC_tot": "Achievement values",
    "PVQ_SE_tot": "Security values",
    "PVQ_ST_tot": "Stimulation",
    "PVQ_CO_tot": "Conformity",
    "PVQ_TR_tot": "Tradition",
    "PVQ_HE_tot": "Hedonism",
    "PVQ_BE_tot": "Benevolence values",
}

# alternatively read the construct/survey list directly from a file
constructs = constr_rename_dict.keys()


def _scale_endpoints(construct):
    """Return [low label, high label, low number, high number] for a construct's scale."""
    if construct == "Religiosity_tot":
        return ["never or definitely not true of me", "very frequently or definitely true of me", 1, 6]
    if construct == "conservatism_tot":
        return ["completely disagree", "completely agree", 1, 7]
    if "PVQ_" in construct:
        return ["not like me at all", "very much like me", 1, 6]
    if "NFC" in construct:
        return ["extremely uncharacteristic of me", "extremely characteristic of me", 1, 5]
    # every remaining construct uses a 1-5 agreement scale
    return ["strongly disagree", "strongly agree", 1, 5]


# scale endpoints (in words and numbers) for dynamic addition in the prompt
scale_meaning_dict = {}
for key in constr_rename_dict:
    scale_meaning_dict[key] = _scale_endpoints(key)

In [5]:
def createPrompts(texts, construct, mode=1):
    """Build one ChatGPT rating prompt per text for a given construct.

    Parameters
    ----------
    texts : iterable of str
        Texts to be rated; one prompt is produced per text, in order.
    construct : str
        Key of ``constr_rename_dict`` and ``scale_meaning_dict``; for
        ``mode=2`` also a column of ``df_items``.
    mode : int
        1 = "general": ask for a single construct-level rating.
        2 = "hybrid": ask for one rating per questionnaire item of the construct.

    Returns
    -------
    list of str
        The prompts, in the order of ``texts``.

    Raises
    ------
    ValueError
        If ``mode`` is neither 1 nor 2.
    KeyError
        If ``construct`` is missing from one of the lookup tables.
    """
    constr_name = constr_rename_dict[construct]
    # The numeric range shown to the model is the nominal scale range, not the
    # range observed in the data.
    meaning_min, meaning_max, min_val, max_val = scale_meaning_dict[construct]

    if mode == 1:  # general
        template = (
            "For a scientific study, rate how strongly the author of the following text endorses \"{}\" solely based on the text they have written. "
            "Here is the text: \n"
            "\"{}\"\n\n"
            "Respond with a single number of up to two decimal points and between {} and {}, with {} meaning \"{}\" and {} meaning \"{}\". "
            "Respond only with this single number and nothing else. Do not use words."
        )
        return [
            template.format(constr_name, text, min_val, max_val, min_val, meaning_min, max_val, meaning_max)
            for text in texts
        ]

    if mode == 2:  # hybrid: chatGPT + CCR idea (rating on items)
        items = df_items[construct].dropna().values.tolist()
        # numbered list of the items, one per line: 1) "..."
        item_string = "".join(
            str(number) + ") \"" + str(item) + "\"\n" for number, item in enumerate(items, start=1)
        )
        template = (
            "For a scientific study, rate how strongly the author of the following text endorses the following psychological items solely based on the text they have written. "
            "Here is the text: \n"
            "\"{}\"\n\n"
            "Here are the items: \n"
            "{}\n"
            "Respond to each item with a single digit between {} and {}, with {} meaning \"{}\" and {} meaning \"{}\". "
            "Respond with exactly {} numbers, comma separated. Do not use words."
        )
        return [
            template.format(text, item_string, min_val, max_val, min_val, meaning_min, max_val, meaning_max, str(len(items)))
            for text in texts
        ]

    # An unknown mode used to fall through and fail with UnboundLocalError.
    raise ValueError("mode must be 1 (general) or 2 (hybrid), got {!r}".format(mode))

## Test Call

In [11]:
# Choose a construct and check prompts
# The sample prompts are built from the values essays; pass text_behaviors
# instead to inspect the prompts for the other essay type.
constr = "PVQ_UN_tot"
prompts_general = createPrompts(text_values, constr, 1)
prompts_hybrid = createPrompts(text_values, constr, 2)
# wrap each prompt as a single user message for the chat API
messages_general = [{"role": "user", "content": x} for x in prompts_general]
messages_hybrid = [{"role": "user", "content": x} for x in prompts_hybrid]

# show one example of each prompt type
print(messages_general[42])
print()
print(messages_hybrid[42])

{'role': 'user', 'content': 'For a scientific study, rate how strongly the author of the following text endorses "Universalism" solely based on the text they have written. Here is the text: \n"I’m always lost. I’m glad I met my husband. He not only supports me on everything I do but guides me to the right path. I would tell him everything or whatever that bothers and he’ll try to talk me through and help me. So whenever I make a decision, I’ll talk to him about it."\n\nRespond with a single number of up to two decimal points and between 1 and 6, with 1 meaning "not like me at all" and 6 meaning "very much like me". Respond only with this single number and nothing else. Do not use words.'}

{'role': 'user', 'content': 'For a scientific study, rate how strongly the author of the following text endorses the following psychological items solely based on the text they have written. Here is the text: \n"I’m always lost. I’m glad I met my husband. He not only supports me on everything I do bu

### Run Tests

In [13]:
# construct-level: send the sample "general" prompt and print the text of the reply
APIresponse = delayed_completion(
    delay_in_seconds=delay_full,
    model=model_engine,
    messages=[messages_general[42]],
    temperature=0
    )
response = APIresponse.choices[0].message["content"]  # text of the first returned choice
print(response) #works

1.00


In [12]:
# item-level: send the sample "hybrid" prompt (one rating per item) and print the text of the reply
APIresponse = delayed_completion(
    delay_in_seconds=delay_full,
    model=model_engine,
    messages=[messages_hybrid[42]],
    temperature=0
    )
response = APIresponse.choices[0].message["content"]  # text of the first returned choice
print(response) #works

1, 2, 3


## Run Calls

In [None]:
# Query the model for every construct x essay type x prompt type x participant.
# Each reply is stored as one row: [id, source, prompt_type, construct, raw reply].
rows = []
for constr in constructs:
    print(constr)
    for source, texts in (("values", text_values), ("behaviors", text_behaviors)):
        prompts_general = createPrompts(texts, constr, 1)
        prompts_hybrid = createPrompts(texts, constr, 2)
        messages_general = [{"role": "user", "content": prompt} for prompt in prompts_general]
        messages_hybrid = [{"role": "user", "content": prompt} for prompt in prompts_hybrid]
        print(source)
        # all construct-level requests first, then all item-level requests
        for prompt_type, messages in (("general", messages_general), ("hybrid", messages_hybrid)):
            for number, message in enumerate(messages, start=1):
                row_name = "Participant_{}".format(number)
                APIresponse = delayed_completion(
                    delay_in_seconds=delay_full,
                    model=model_engine,
                    messages=[message],
                    temperature=0,
                )
                response = APIresponse.choices[0].message["content"]
                rows.append([row_name, source, prompt_type, constr, response])

# collect all replies in a dataframe
df_predictions = pd.DataFrame(rows, columns=["id", "source", "prompt_type", "construct", "prediction"])

In [53]:
# reshape df_texts from wide to long format (one row per participant x construct)
# so that it can be merged with the ChatGPT predictions
df_ratings = df_texts.copy()
# One id per row, numbered in row order like the ids of the predictions. The
# length is taken from the frame itself, so this cell does not depend on the
# prompt lists being in memory.
df_ratings.insert(0, "id", ["Participant_" + str(i + 1) for i in range(len(df_ratings))])
df_ratings = pd.melt(
    df_ratings,
    # every column that is not a construct score is kept as an identifier
    id_vars=[col for col in df_ratings.columns if col not in constr_rename_dict],
    value_vars=list(constr_rename_dict),
    value_name="response",
    var_name="construct",
)

In [60]:
# merge datasets and extract numbers (clean the responses so that the predictions are only numbers)
df_final = df_ratings.merge(df_predictions, "left", on=["id", "construct"]).dropna(subset=["prediction"], axis=0)  # keep only what was predicted
# if the reply uses "1) ..." numbering, drop every ", <digits>" so that only the first number per item is kept
df_final["prediction_clean"] = df_final.prediction.apply(lambda x: re.sub(r",\s*\d+", "", x) if ")" in x else x)
# remove the "1)", "2)", ... item numbering itself
df_final["prediction_clean"] = df_final.prediction_clean.apply(lambda x: re.sub(r"\d+\)", "", x) if x else x)
# collect the remaining integers/decimals as a list of floats (empty list if the reply has no number)
df_final["prediction_clean"] = df_final.prediction_clean.apply(lambda x: [float(i) for i in re.findall(r'(\d+(?:\.\d+)?)', x)])
# mean over the extracted numbers; a reply without any number gives NaN explicitly
# (np.mean of an empty list also returns NaN, but emits a RuntimeWarning)
df_final["prediction_clean_avg"] = df_final.prediction_clean.apply(lambda x: np.mean(x) if x else np.nan)

# split multi-item predictions into individual items (one row per extracted number)
df_final = df_final.explode('prediction_clean').reset_index(drop=True)
# running number of each row within its participant x construct x source x prompt type group
df_final['item'] = df_final.groupby(['id', 'construct', "source", "prompt_type"]).cumcount().add(1)
df_final['item'] = ["item_" + str(x) for x in df_final.item]
df_final.loc[df_final.prompt_type=="general", 'item'] = "total" #rename construct level predictions

# save
# NOTE(review): choice_name is not defined in any cell shown here; define it before running this cell
df_final.to_csv("../results/" + choice_name + ".csv", index=False)

# print the number of rows that contain at least one missing value in any column
print(df_final.shape[0] - df_final.dropna().shape[0])

  return _methods._mean(a, axis=axis, dtype=dtype,


115
