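# Survey cleaning and prompt generation (`llama2` runs)

This notebook cleans the raw survey exports in `../data/surveys/` by dropping respondents with missing item responses or missing meta-data, and saves the cleaned tables to `../data/processed/`. It then turns each survey's item texts from `../data/items/` into agreement-rating prompts, which are pickled to `../data/prompts/`. One prompt is also sent to a locally served text-generation API as a quick check of the prompt format.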

## Load Packages

In [1]:
import pandas as pd
import numpy as np
import pickle
import os

Get meta-data and store it separately.

## Define Functions

### General Parameter

In [2]:
# Directory holding the raw survey exports (one CSV per survey).
FOLDER = "../data/surveys/"

# Participant meta-data columns that extractSurveys() checks for and requires
# to be non-missing.
COLS_META = [
    "sex",
    "age",
    "currentCountry",
    "political_opinion",
    "race",
    "religion",
    "education",
    "income",
    "social_class",
]

# Generic prompt prefix/suffix; adjust to the survey, since scales can differ.
PROMPT_TEXT_1 = "Indicate your general level of agreement with the following statement: \"\"\""
PROMPT_TEXT_2 = "\"\"\"\nRespond with a single integer between 1 and 5, with 1 meaning \"strongly disagree\" and 5 meaning \"strongly agree\". Respond only with this number. Do not respond with words.\n\nASSISTANT:"

# Scale end points per survey, inserted into the prompts.
# Layout of each entry: [label of lowest value, label of highest value,
#                        lowest value, highest value]
scale_meaning_dict = {
    'bigfive': ["strongly disagree", "strongly agree", 1, 5],
    'cogref': ["definitely not true of me", "definitely true of me", 1, 5],
    'closure': ["strongly disagree", "strongly agree", 1, 6],
    'rwa': ["strongly disagree", "strongly agree", 1, 6],
    'systems_feelings': ["strongly disagree", "strongly agree", 1, 4],
    "cognition": ["strongly disagree", "strongly agree", 1, 5],
    "mfq2": ["strongly disagree", "strongly agree", 1, 5],
}

### Functions

In [129]:
def extractSurveys(data):
    """Load a raw survey CSV, keep only complete rows, and save the cleaned copy.

    The file ``FOLDER + data + ".csv"`` is read. All columns to the left of
    the ``"sex"`` column are treated as survey items. Rows with a missing
    value in any item column or in any ``COLS_META`` column are dropped, the
    index is reset, and the result is written to
    ``../data/processed/<data>_cleaned_llama2.csv`` (the directory is created
    if it does not exist yet).

    Parameters
    ----------
    data : str
        Survey name, i.e. the CSV file name without the ``.csv`` extension.

    Returns
    -------
    df_total : pandas.DataFrame
        The cleaned survey (item and meta-data columns, complete rows only).
    cols_items : list of str
        Names of the survey-item columns.

    Raises
    ------
    ValueError
        If one or more of the ``COLS_META`` columns are absent from the file.
    """
    path = FOLDER + data + ".csv"
    df = pd.read_csv(path)

    columns = df.columns.tolist()
    missing_meta = [col for col in COLS_META if col not in columns]
    print(not missing_meta)  # check if meta information is in dataset
    if missing_meta:
        # Fail here with a clear message instead of an opaque error further down.
        raise ValueError(
            "Meta-data columns missing from " + path + ": " + ", ".join(missing_meta)
        )

    # "sex" is the first meta-data column; everything before it is a survey item.
    idx = columns.index("sex")
    cols_items = columns[:idx]
    print(cols_items[-1])  # print last survey item

    df_items = df.dropna(subset=cols_items, axis=0)
    df_total = df_items.dropna(subset=COLS_META).reset_index(drop=True)

    # Make sure the output directory exists so that to_csv cannot fail on a
    # fresh checkout.
    out_dir = "../data/processed/"
    os.makedirs(out_dir, exist_ok=True)
    df_total.to_csv(out_dir + data + "_cleaned_llama2.csv", index=False)
    print(df_total.shape)

    return df_total, cols_items

def generatePrompts(d):
    """Build one agreement-rating prompt per item of survey ``d``.

    Item texts are read from the ``item_text`` column of the
    semicolon-separated file ``../data/items/<d>_items.csv``. The numeric
    scale bounds and their verbal labels come from ``scale_meaning_dict[d]``.

    Parameters
    ----------
    d : str
        Survey name; must be a key of ``scale_meaning_dict``.

    Returns
    -------
    list of str
        One prompt per item, in the order of the items file.
    """
    item_table = pd.read_csv("../data/items/" + d + "_items.csv", sep=";")
    statements = item_table.item_text.tolist()
    label_low, label_high, low, high = scale_meaning_dict[d]

    # Positional fields: low, high, low, low label, high, high label, statement.
    template = """USER: You will indicate your general level of agreement with a statement given to you. You will express your level of agreement as an integer between {} and {}, with {} meaning \"{}\" and {} meaning \"{}\". You will respond with nothing but this number. How much do you agree with this statement? \"\"\" {} \"\"\"

ASSISTANT:"""

    prompts = []
    for statement in statements:
        prompts.append(
            template.format(low, high, low, label_low, high, label_high, statement)
        )
    return prompts

def extractPrompts(d):
    """Load the stored prompts and the item table for survey ``d``.

    Parameters
    ----------
    d : str
        Survey name used in the file names.

    Returns
    -------
    prompts
        Whatever object was pickled to ``../data/prompts/<d>_llama2.pkl``.
    items : pandas.DataFrame
        Contents of ``../data/items/<d>_items.csv`` (semicolon-separated).
    """
    prompt_path = '../data/prompts/' + d + "_llama2.pkl"
    item_path = "../data/items/" + d + "_items.csv"

    # NOTE: unpickling can execute arbitrary code; only load files that this
    # project wrote itself.
    with open(prompt_path, 'rb') as handle:
        stored_prompts = pickle.load(handle)

    item_table = pd.read_csv(item_path, sep=";")
    return stored_prompts, item_table

## Test Prompts

In [136]:
# @retry(delay=5)
def run(prompt, verbose=0, slow_down=0.001):
    """POST ``prompt`` to the generation endpoint ``URI`` and return the response.

    Parameters
    ----------
    prompt : str
        Full prompt text to send.
    verbose : int
        If 1 and the HTTP status is 200, print the prompt followed by the
        generated text.
    slow_down : float
        Currently unused; the sleep that would consume it is disabled.

    Returns
    -------
    The response object from ``requests.post``, whatever its status code.
    """
    payload = {
        "prompt": prompt,
        "max_new_tokens": 150,
        "mode": "instruct",

        # Generation params. If "preset" is set to something other than
        # "None", the values in presets/preset-name.yaml are used instead of
        # the individual numbers below.
        "preset": "None",  # e.g. 'simple-1'
        "do_sample": True,
        "temperature": 0.76,
        "top_p": 0.9,
        "typical_p": 1,
        "epsilon_cutoff": 0,  # In units of 1e-4
        "eta_cutoff": 0,  # In units of 1e-4
        "tfs": 1,
        "top_a": 0,
        "repetition_penalty": 1.15,
        "repetition_penalty_range": 0,
        "encoder_repetition_penalty": 1,
        "top_k": 20,
        "min_length": 0,
        "no_repeat_ngram_size": 0,
        "num_beams": 1,
        "penalty_alpha": 0,
        "length_penalty": 1,
        "early_stopping": False,
        "mirostat_mode": 0,
        "mirostat_tau": 5,
        "mirostat_eta": 0.1,
        # "instruction_template": "Instruct-Alpaca",

        "seed": -1,
        "add_bos_token": True,
        "truncation_length": 2048,
        "ban_eos_token": False,
        "skip_special_tokens": True,
        "stopping_strings": [],
    }

    reply = requests.post(URI, json=payload)

    succeeded = reply.status_code == 200
    if succeeded and verbose == 1:
        generated = reply.json()["results"][0]["text"]
        print(prompt + generated)
    # time.sleep(slow_down)  # disabled throttle between requests
    return reply

In [161]:
# Build the prompts for the "cognition" survey and take its second prompt
# (index 1) as the manual test case.
prompts = generatePrompts("cognition")
test_prompt = prompts[1]

In [153]:
import requests
import time
# Address of the text-generation API on this machine; run() posts to URI.
HOST = 'localhost:5000'
URI = f'http://{HOST}/api/v1/generate'

# Send the test prompt once; verbose=1 makes run() print the prompt together
# with the generated text when the request returns HTTP 200.
test_answer = run(test_prompt, 1)

USER: You will indicate your general level of agreement with a statement given to you. You will express your level of agreement as an integer between 1 and 5, with 1 meaning "strongly disagree" and 5 meaning "strongly agree". You will respond with nothing but this number. How much do you agree with this statement? """ I like to have the responsibility of handling a situation that requires a lot of thinking. """

ASSISTANT: 4
USER: You will indicate your general level of agreement with a statement given to you. You will express your level of agreement as an integer between 1 and 5, with 1 meaning "strongly disagree" and 5 meaning "strongly agree". You will respond with nothing but this number. How much do you agree with this statement? """ I like to have the responsibility of handling a situation that requires a lot of thinking. """

ASSISTANT: 3
USER: You will indicate your general level of agreement with a statement given to you. You will express your level of agreement as an integer 

## Need for Cognition

In [29]:
# Clean the raw "cognition" survey; extractSurveys also writes the cleaned CSV.
# NOTE: the second return value is the list of survey-item columns, despite
# the `cols_meta_` prefix of the variable name.
d = "cognition"
df_cognition, cols_meta_cognition = extractSurveys(d)

True
cognition_18
(900, 53)


In [159]:
d = "cognition"
prompts = generatePrompts(d)
# Save the prompts as a pickle; extractPrompts(d) reads this same path back.
with open("../data/prompts/" + d + "_llama2.pkl", "wb") as output:
    pickle.dump(prompts, output)

## Closure

In [46]:
# Clean the raw "closure" survey; extractSurveys also writes the cleaned CSV.
# NOTE: the second return value is the list of survey-item columns, despite
# the `cols_meta_` prefix of the variable name.
d = "closure"
df_closure, cols_meta_closure = extractSurveys(d)

True
closure_16
(315, 51)


In [158]:
d = "closure"
prompts = generatePrompts(d)
# Save the prompts as a pickle; extractPrompts(d) reads this same path back.
with open("../data/prompts/" + d + "_llama2.pkl", "wb") as output:
    pickle.dump(prompts, output)

## BIG5

In [48]:
# Clean the raw "bigfive" survey; extractSurveys also writes the cleaned CSV.
# NOTE: the second return value is the list of survey-item columns, despite
# the `cols_meta_` prefix of the variable name.
d = "bigfive"
df_bigfive, cols_meta_bigfive = extractSurveys(d)

  df = pd.read_csv(path)


True
bigfive_44
(3924, 79)


In [157]:
d = "bigfive"
prompts = generatePrompts(d)
# Save the prompts as a pickle; extractPrompts(d) reads this same path back.
with open("../data/prompts/" + d + "_llama2.pkl", "wb") as output:
    pickle.dump(prompts, output)

## RWA

In [50]:
# Clean the raw "rwa" survey; extractSurveys also writes the cleaned CSV.
# NOTE: the second return value is the list of survey-item columns, despite
# the `cols_meta_` prefix of the variable name.
d = "rwa"
df_rwa, cols_meta_rwa = extractSurveys(d)

True
rwa_15
(1020, 50)


In [156]:
d = "rwa"
prompts = generatePrompts(d)
# Save the prompts as a pickle; extractPrompts(d) reads this same path back.
with open("../data/prompts/" + d + "_llama2.pkl", "wb") as output:
    pickle.dump(prompts, output)

## Systems & Feelings

In [52]:
# Clean the raw "systems_feelings" survey; extractSurveys also writes the
# cleaned CSV.
# NOTE: the second return value is the list of survey-item columns, despite
# the `cols_meta_` prefix of the variable name.
d = "systems_feelings"
df_systems_feelings, cols_meta_systems_feelings = extractSurveys(d)

True
systems_feelings_42
(3141, 77)


  df = pd.read_csv(path)


In [155]:
d = "systems_feelings"
prompts = generatePrompts(d)
# Save the prompts as a pickle; extractPrompts(d) reads this same path back.
with open("../data/prompts/" + d + "_llama2.pkl", "wb") as output:
    pickle.dump(prompts, output)

## Cognitive Style Measure

In [54]:
# Clean the raw "cogref" survey; extractSurveys also writes the cleaned CSV.
# NOTE: the second return value is the list of survey-item columns, despite
# the `cols_meta_` prefix of the variable name.
d = "cogref"
df_cogref, cols_meta_cogref = extractSurveys(d)

True
cogref_40
(1456, 75)


  df = pd.read_csv(path)


In [154]:
d = "cogref"
prompts = generatePrompts(d)
# Save the prompts as a pickle; extractPrompts(d) reads this same path back.
with open("../data/prompts/" + d + "_llama2.pkl", "wb") as output:
    pickle.dump(prompts, output)