In [154]:
import numpy as np
import pandas as pd
import openai
import configparser
import ast
import re 
import time
from  scipy.special import expit, logit

from helpers import *

import backoff

# Read the OpenAI API key from a local config file rather than hard-coding it.
# The file must provide a [Keys] section with an `openai_api_key` entry.
# ConfigParser.read() silently skips files it cannot find, so a missing
# config.ini surfaces as an error on the config.get() line below.
config = configparser.ConfigParser()
config.read("config.ini")
api_key = config.get('Keys','openai_api_key')

openai.api_key = api_key

In [155]:
@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
def completions_with_backoff(**kwargs):
    """Call ``openai.Completion.create`` with the given keyword arguments.

    The ``backoff.on_exception`` decorator retries the call with exponential
    back-off whenever ``openai.error.RateLimitError`` is raised. No
    ``max_tries`` / ``max_time`` limit is passed to the decorator.

    Args:
        **kwargs: forwarded unchanged to ``openai.Completion.create``.

    Returns:
        Whatever ``openai.Completion.create`` returns.
    """
    return openai.Completion.create(**kwargs)

In [37]:
# Load the survey items and keep only the three columns used below.
# NOTE(review): pd.read_table defaults to a tab separator although the file is
# named .csv -- presumably the file is tab-delimited; confirm.
dfi = pd.read_table("data/model_input/Pew_American_Trends_Panel_W34.csv")
dfi = dfi[["key", "question", "options"]]

dfi

Unnamed: 0,key,question,options
0,SCI1_W34,"Overall, do you think science has made life ea...","['Easier', 'More difficult', 'Refused']"
1,SCI2A_W34,Do you think science has had a mostly positive...,"['Mostly positive', 'Mostly negative', 'Refused']"
2,SCI2B_W34,Do you think science has had a mostly positive...,"['Mostly positive', 'Mostly negative', 'Refused']"
3,SCI2C_W34,Do you think science has had a mostly positive...,"['Mostly positive', 'Mostly negative', 'Refused']"
4,SCI3A_W34,"In your opinion, do you think government inves...",['Government investments usually pay off in th...
...,...,...,...
62,EVOBIOA_W34,"From what you have heard or read, which of the...",['Humans have evolved over time due to process...
63,EVOBIOB_W34,"Even if you are not sure, from what you have h...",['Humans have evolved over time due to process...
64,BIO15_W34,Have you seen a health care provider for an il...,"['Yes', 'No', 'Refused']"
65,G1_W34,"Do you, or does anyone in your immediate famil...","['Yes', 'No', 'Not sure', 'Refused']"


In [172]:
def make_survey_item(question, resp_list, answer=""):
    """Assemble the pieces of one multiple-choice item and hand them to make_prompt.

    Args:
        question: question text; two newlines are appended to it.
        resp_list: iterable of option strings; each is prefixed with a single
            space and the options are joined with newlines.
        answer: optional answer text. When non-empty it is appended after
            "Answer:" separated by one space.

    Returns:
        The result of ``make_prompt(options_block, question_block, suffix)``.
        ``make_prompt`` is not defined in the visible cells (presumably it
        comes from ``helpers``).
    """
    stem = question + "\n\n"

    indented_options = (" " + option for option in resp_list)
    option_block = "\n".join(indented_options)

    if len(answer) == 0:
        tail = "\n\nAnswer:"
    else:
        tail = "\n\nAnswer:" + " " + answer

    return make_prompt(option_block, stem, tail)


def make_survey_item_df(row, ans=""):
    """Build the survey-item text for one row of the item table.

    Args:
        row: mapping / Series with a "question" entry and an "options" entry;
            "options" is a string parsed with ``ast.literal_eval`` (a list
            literal of option labels).
        ans: optional answer text forwarded to ``make_survey_item``.

    Returns:
        The value returned by ``make_survey_item`` for the question and its
        lettered options.

    Notes:
        The first "Refused" entry is removed before lettering. Only the
        letters A-E are available, so ``zip`` drops any option past the fifth.
    """
    question_text = row["question"]
    options = ast.literal_eval(row["options"])

    # "Refused" is not offered as a choice in the prompt.
    if "Refused" in options:
        options.remove("Refused")

    letters = ("A", "B", "C", "D", "E")
    lettered = []
    for pair in zip(letters, options):
        lettered.append(". ".join(pair))

    return make_survey_item(question_text, lettered, ans)


def make_survey_df(row):
    """Return one answered survey item per available answer letter.

    Args:
        row: mapping / Series with "question" and "options" entries, as
            expected by ``make_survey_item_df``.

    Returns:
        A list with one ``make_survey_item_df(row, letter)`` result for each
        answer letter. There are as many letters as options left after
        removing "Refused", capped at the five letters A-E.
    """
    options = ast.literal_eval(row["options"])

    if "Refused" in options:
        options.remove("Refused")

    # Pairing letters with options stops at the shorter of the two, which
    # yields exactly the letters that have a matching option.
    answered_items = []
    for letter, _ in zip(["A", "B", "C", "D", "E"], options):
        answered_items.append(make_survey_item_df(row, letter))

    return answered_items

# Smoke test: build the item at position 10 with no answer appended.
# print() is used so the embedded newlines are rendered.
test_q = make_survey_item_df(dfi.iloc[10])

print(test_q)


How often do you choose foods to eat because they are easy and most convenient?

 A. All of the time
 B. More than half of the time
 C. About half of the time
 D. Less than half of the time
 E. Never

Answer:


In [173]:
# Same item, expanded to one answered prompt per answer letter.
test_qs = make_survey_df(dfi.iloc[10])
test_qs


['How often do you choose foods to eat because they are easy and most convenient?\n\n A. All of the time\n B. More than half of the time\n C. About half of the time\n D. Less than half of the time\n E. Never\n\nAnswer: A',
 'How often do you choose foods to eat because they are easy and most convenient?\n\n A. All of the time\n B. More than half of the time\n C. About half of the time\n D. Less than half of the time\n E. Never\n\nAnswer: B',
 'How often do you choose foods to eat because they are easy and most convenient?\n\n A. All of the time\n B. More than half of the time\n C. About half of the time\n D. Less than half of the time\n E. Never\n\nAnswer: C',
 'How often do you choose foods to eat because they are easy and most convenient?\n\n A. All of the time\n B. More than half of the time\n C. About half of the time\n D. Less than half of the time\n E. Never\n\nAnswer: D',
 'How often do you choose foods to eat because they are easy and most convenient?\n\n A. All of the time\n B

In [179]:
# Score the test item. echo=True with max_tokens=0 asks the API to return
# log-probs for the prompt tokens themselves without generating new text.
#
# The answered prompts (test_qs) are sent rather than the answerless test_q:
# downstream code reads the last log-prob entry of each prompt, which is only
# an answer letter when the prompt ends in "Answer: <letter>". This also
# matches gpt_survey_responses, which sends the list from make_survey_df.
responses = openai.Completion.create(
    model="text-davinci-003",
    prompt=test_qs,
    logprobs=1,
    echo=True,
    max_tokens=0,
)

# # responses

In [175]:
def get_survey_logprobs(responses):
    """Collect the last log-prob entry of every sequence in a completion response.

    Args:
        responses: object accepted by ``get_gpt_logprobs`` (not defined in the
            visible cells; presumably from ``helpers``).

    Returns:
        ``numpy.ndarray`` holding the final element of each sequence returned
        by ``get_gpt_logprobs(responses)``, in the same order.
    """
    final_entries = []
    for sequence in get_gpt_logprobs(responses):
        final_entries.append(sequence[-1])
    return np.array(final_entries)

# Last log-prob entry for each sequence in `responses`.
get_survey_logprobs(responses)


array([ -1.7166117 ,  -0.21475565,  -8.52491   , -12.513472  ,
       -20.809069  ])

In [181]:
def gpt_survey_responses(row, model="text-davinci-003", sleep=0):
    """Request prompt log-probs for every answered version of one survey item.

    Args:
        row: item row passed to ``make_survey_df`` (needs "question" and
            "options").
        model: completion model name sent to the API.
        sleep: seconds to pause before the request (default 0).

    Returns:
        The array produced by ``get_survey_logprobs`` for the API response,
        one value per answered prompt.

    NOTE(review): the request goes through ``openai.Completion.create``
    directly, so rate-limit errors are not retried here; the notebook's
    ``completions_with_backoff`` wrapper is not used.
    """
    answered_prompts = make_survey_df(row)

    # Fixed pause before each request.
    time.sleep(sleep)

    # echo + max_tokens=0: get log-probs of the prompt tokens, generate nothing.
    request = dict(
        model=model,
        prompt=answered_prompts,
        logprobs=1,
        echo=True,
        max_tokens=0,
        temperature=0,
    )
    api_response = openai.Completion.create(**request)

    return get_survey_logprobs(api_response)



In [182]:
# Three-row slice of the item table (positions 22-24), presumably for spot checks.
test_df = dfi[22:25]
test_df

Unnamed: 0,key,question,options
22,EAT5C_W34,"How much health risk, if any, does eating food...","['A great deal of health risk', 'Some health r..."
23,EAT5D_W34,"How much health risk, if any, does eating food...","['A great deal of health risk', 'Some health r..."
24,EAT6_W34,Which of these statements comes closer to your...,['The average person is exposed to additives i...


In [216]:
# Scoring run over every item (one API request per row), left commented out.
# df = dfi.assign(logprobs = dfi.apply(gpt_survey_responses, axis=1, sleep=1.))
# NOTE(review): with the line above disabled, `df` is not assigned anywhere
# earlier in the visible notebook, so this line fails on a fresh kernel --
# confirm where `df` should come from (re-run the scoring line or load saved results).
df_gpt = df

## Prompting with other questions

67 questions x up to 5 answer choices x 66 other questions as preceding context (each shown with up to 5 possible answers) = lots of tokens

In [211]:
# Build every "answered context item j" + "answered query item i" prompt and
# count its tokens, to estimate the API cost before sending anything.
questions = []
prompts = []
prompt_answers = []

total_tokens = 0

# Expand each item into its answered variants once, up front, instead of
# re-parsing the same rows inside the nested loops. dfi is used because only
# the "question" and "options" columns are needed and dfi is defined by the
# loading cell, so this cell does not depend on leftover kernel state in `df`.
item_variants = [make_survey_df(dfi.iloc[k]) for k in range(len(dfi))]

for i, qs in enumerate(item_variants):        # question i with each possible answer
    for j, ps in enumerate(item_variants):    # context item j with each answer
        if j == i:
            continue  # an item is never used as context for itself
        for p in ps:
            for q in qs:
                combined = p + "\n\n" + q
                prompts.append(p)
                questions.append(combined)
                total_tokens += count_tokens(combined)
                


    

In [214]:
# 67 x 67: rough count of ordered question pairs.
print(67*67)
print("total tokens", total_tokens)
# Cost estimate; assumes a price of $0.02 per 1,000 tokens -- TODO confirm pricing.
print("$", total_tokens/1000*.02)
# print(questions[3])

4489
total tokens 943232
$ 18.86464


In [215]:
# Mean token count per combined prompt.
total_tokens/len(questions)

101.76200237350308

In [219]:
def get_resp_dict(df, item):
    """Return the inverted option mapping for one survey item.

    Args:
        df: DataFrame with a "key" column and an "option_mapping" column whose
            cells are strings containing a Python dict literal.
        item: value to look up in the "key" column.

    Returns:
        dict built from the first matching row's mapping with keys and values
        swapped (each original value becomes a key).

    Raises:
        KeyError: if no row has ``key == item``.
    """
    # Select without mutating anything in place; only the first match is used.
    matches = df.loc[df["key"] == item, "option_mapping"]
    if matches.empty:
        raise KeyError(f"no row with key == {item!r} in the option-mapping table")

    mapping = ast.literal_eval(matches.iloc[0])
    return {value: key for key, value in mapping.items()}


# Human survey responses. info.csv supplies the item keys and, through
# get_resp_dict, the per-item option mappings.
info_df = pd.read_csv("data/human_resp/American_Trends_Panel_W34/info.csv")
key_vars = info_df.key.to_list()

# NOTE: this rebinds `df` to the human-response table.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the loaded table (avoids SettingWithCopyWarning).
df = pd.read_csv("data/human_resp/American_Trends_Panel_W34/responses.csv")
df = df[key_vars].copy()

# Recode every item column through its inverted option mapping.
for c in df.columns:
    rdict = get_resp_dict(info_df, c)
    df[c] = df[c].map(rdict)

df_long = pd.melt(df.reset_index(), id_vars='index')

# 99.0 is treated as missing before computing pairwise correlations.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
corrmat = df.replace(99.0, np.nan).corr() # drop missing


In [222]:
#### combine variables pairwise according to correlations to examine influences

N_PER_GROUP = 5                       # pairs per group: strongest +, strongest -, random
pair_rng = np.random.RandomState(0)   # one shared seeded RNG so the random group is reproducible

qvar = []
pvar = []
corrval = []

# for every column in corr matrix
for c in corrmat.columns:
    # Correlations with every *other* variable, ascending. The self-correlation
    # and undefined (NaN) entries are removed so neither can land in a group
    # (sort_values places NaN last, where the "strongest positive" slice is taken).
    corrs = corrmat[c].drop(c).dropna().sort_values()

    # the N_PER_GROUP strongest negative and strongest positive correlations
    neg = corrs.iloc[:N_PER_GROUP]
    pos = corrs.iloc[-N_PER_GROUP:]
    # and up to N_PER_GROUP random others from what lies in between
    rest = corrs.iloc[N_PER_GROUP:-N_PER_GROUP]
    rand = rest.sample(n=min(N_PER_GROUP, len(rest)), random_state=pair_rng)

    # concat combinations of names, plus corr values
    out = pd.concat([pos, neg, rand])
    corrval.extend(out.to_list())
    pvar.extend(out.index.to_list())
    qvar.extend([c] * len(out))

# query_var is the column being examined; prompt_var is the correlated partner.
df2 = pd.DataFrame({"query_var": qvar, "prompt_var": pvar, "corr": corrval})

df2

Unnamed: 0,query_var,prompt_var,corr
0,SCI1_W34,SCI3B_W34,0.195653
1,SCI1_W34,SCI2A_W34,0.235727
2,SCI1_W34,SCI3A_W34,0.254028
3,SCI1_W34,SCI2C_W34,0.258629
4,SCI1_W34,SCI2B_W34,0.314973
...,...,...,...
1000,G2_W34,FUD37D_W34,0.006716
1001,G2_W34,MED7_W34,0.019071
1002,G2_W34,MED5_W34,-0.019928
1003,G2_W34,EAT3J_W34,0.037297


In [260]:
# Expand each (query, prompt) pair of df2 into one row per answer letter of
# the prompt item.
qvar = []
pvar = []
pvar_ans = []
corrval = []

# Raw options string per item key, built once so the loop below does not
# re-scan dfi for every row of df2. setdefault keeps the first row for a key.
options_by_key = {}
for key, options in zip(dfi["key"], dfi["options"]):
    options_by_key.setdefault(key, options)

# Answer letters per prompt item, parsed on first use and then reused.
letters_by_key = {}

for query_var, prompt_var, corr in zip(df2["query_var"], df2["prompt_var"], df2["corr"]):
    if prompt_var not in letters_by_key:
        # KeyError here means the prompt item has no row in dfi.
        r_list = ast.literal_eval(options_by_key[prompt_var])
        if "Refused" in r_list:
            r_list.remove("Refused")
        letters_by_key[prompt_var] = ["A","B","C","D","E"][:len(r_list)]

    ans_list = letters_by_key[prompt_var]
    n_resps = len(ans_list)

    pvar.extend([prompt_var] * n_resps )
    qvar.extend([query_var] * n_resps )
    corrval.extend([corr] * n_resps )
    pvar_ans.extend(ans_list)

df3 = pd.DataFrame({"query_var":qvar, "prompt_var":pvar, "corr":corrval, "prompt_ans":pvar_ans})

In [261]:
# One row per (query item, prompt item, prompt answer letter).
df3

Unnamed: 0,query_var,prompt_var,corr,prompt_ans
0,SCI1_W34,SCI3B_W34,0.195653,A
1,SCI1_W34,SCI3B_W34,0.195653,B
2,SCI1_W34,SCI2A_W34,0.235727,A
3,SCI1_W34,SCI2A_W34,0.235727,B
4,SCI1_W34,SCI3A_W34,0.254028,A
...,...,...,...,...
2960,G2_W34,EAT3J_W34,0.037297,B
2961,G2_W34,EAT5D_W34,0.031081,A
2962,G2_W34,EAT5D_W34,0.031081,B
2963,G2_W34,EAT5D_W34,0.031081,C
