This notebook annotates the moral sentiments of the MFRC test sample using a Llama 2 model.

## Packages

In [12]:
import os
import pandas as pd
import numpy as np
import requests

import string
import re
# Punctuation characters to strip from model output before label matching.
# Hyphens are kept (the label "thin-morality" contains one) and so are commas.
remove = string.punctuation
remove = remove.replace("-", "").replace(",", "")
# Escape every character before placing it inside the character class, so the
# pattern stays valid no matter which characters (e.g. "]", "\\", "^") remain.
pattern = "[{}]".format(re.escape(remove))

import pickle
import time
import logging
from retry import retry
# Install the default root-logger handler so that log records become visible
# (presumably to surface the retry decorator's messages — TODO confirm).
logging.basicConfig()

# Pauses (in seconds) to insert between consecutive requests.
seconds_per_minute = 60.0
rate_limit_per_minute = 5000.0
# One-second pause, i.e. at most 60 requests per minute.
delay_60 = seconds_per_minute / 60
# Shortest pause that still respects `rate_limit_per_minute`.
delay_full = seconds_per_minute / rate_limit_per_minute

## General Parameters

In [13]:
# Dataset name and sample variant; together they select the input CSV.
data = "mfrc"
mode = "full"
folder = "../data/preprocessed/"
# Resolves to <folder><data>_sample_<mode>.csv
path = f"{folder}{data}_sample_{mode}.csv"

## Functions

In [15]:
# Address of the locally served model API and its text-generation endpoint
HOST = 'localhost:5000'
URI = 'http://{}/api/v1/generate'.format(HOST)

@retry(delay=5)
def run(prompt, verbose=0, slow_down=0.001):
    """Send one generation request for `prompt` to the endpoint at `URI`.

    Any exception raised inside this function (connection problems as well as
    HTTP error statuses) is handled by the `retry` decorator, which calls the
    function again after a 5 second delay.

    Parameters
    ----------
    prompt : str
        Full prompt text sent to the model.
    verbose : int, default 0
        If 1, print the prompt followed by the generated text.
    slow_down : float, default 0.001
        Seconds to sleep after the request, used to throttle the request rate.

    Returns
    -------
    requests.Response
        The HTTP response; the generated text is found under
        ``response.json()['results'][0]['text']``.
    """
    request = {
        'prompt': prompt,
        'max_new_tokens': 150,
        'mode' : 'instruct',

        # Generation params. If 'preset' is set to different than 'None', the values
        # in presets/preset-name.yaml are used instead of the individual numbers.
        'preset': 'None',
        'do_sample': True,
        'temperature': 0.01,
        'top_p': 0.14,
        'typical_p': 1,
        'epsilon_cutoff': 0,  # In units of 1e-4
        'eta_cutoff': 0,  # In units of 1e-4
        'tfs': 1,
        'top_a': 0,
        'repetition_penalty': 1.17,
        'repetition_penalty_range': 0,
        'encoder_repetition_penalty': 1,
        'top_k': 49,
        'min_length': 0,
        'no_repeat_ngram_size': 0,
        'num_beams': 1,
        'penalty_alpha': 0,
        'length_penalty': 1,
        'early_stopping': False,
        'mirostat_mode': 0,
        'mirostat_tau': 5,
        'mirostat_eta': 0.1,
        # 'instruction_template': "Instruct-Alpaca",

        'seed': -1,
        'add_bos_token': True,
        'truncation_length': 2048,
        'ban_eos_token': False,
        'skip_special_tokens': True,
        'stopping_strings': []
    }

    response = requests.post(URI, json=request)
    # requests does not raise on 4xx/5xx by itself; without this the retry
    # decorator never sees a failed request and the caller crashes later when
    # it tries to read the generated text from an error response.
    response.raise_for_status()

    if response.status_code == 200 and verbose == 1:
        result = response.json()['results'][0]['text']
        print(prompt + result)
    time.sleep(slow_down)
    return response

def model_api(request):
    """POST `request` as JSON to the model endpoint on `HOST` and return the decoded JSON reply."""
    endpoint = f'http://{HOST}/api/v1/model'
    reply = requests.post(endpoint, json=request)
    return reply.json()

def model_info():
    """Request the 'info' action from the model endpoint and print its basic details."""
    info_request = {'action': 'info'}
    print_basic_model_info(model_api(info_request))

def print_basic_model_info(response):
    """Print model name, LoRA names and two basic settings from an 'info' reply.

    `response` is the decoded JSON reply; everything is read from its
    'result' entry, the settings from 'shared.settings' inside it.
    """
    print("Model: ", response['result']['model_name'])
    print("Lora(s): ", response['result']['lora_names'])
    for key in ('truncation_length', 'instruction_template'):
        value = response['result']['shared.settings'][key]
        print(key, "=", value)

def separate_labels(df, cols):
    """Expand the comma-separated `annotations` column into 0/1 label columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an `annotations` column holding comma-separated labels.
    cols : list of str
        Label names to create as indicator columns. Labels found in
        `annotations` that are not listed here are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame without rows whose annotation is missing or empty, with
        one integer column per entry of `cols` (1 if the label is present,
        0 otherwise) and with the `annotations` column removed. The input
        frame is not modified.
    """
    # removing texts with no annotations (missing value or empty string)
    has_annotation = df.annotations.notna() & (df.annotations != '')
    out = df[has_annotation].reset_index(drop=True)

    # Parse every annotation string once into a set of whitespace-stripped labels.
    label_sets = [
        {label.strip() for label in annotation.split(",")}
        for annotation in out["annotations"]
    ]

    # Build each indicator column directly instead of mutating rows inside
    # DataFrame.apply, which pandas documents as unsupported.
    for label in cols:
        out[label] = pd.Series(
            [int(label in labels) for labels in label_sets],
            index=out.index,
            dtype="int64",
        )
    return out.drop(["annotations"], axis=1)

In [16]:
# Print the model name, LoRA names and basic settings reported by the server.
model_info()

Model:  TheBloke_Luna-AI-Llama2-Uncensored-GPTQ_gptq-4bit-32g-actorder_True
Lora(s):  []
truncation_length = 2048
instruction_template = None


## Load Data and Create Prompts

In [17]:
# First half of the prompt: label definitions and the instruction, ending with
# the opening triple quote that precedes the text to annotate.
PROMPT_TEXT_1 = (
    'USER: These are definitions of moral sentiments: '
    '"care" if a text is about avoiding emotional and physical damage to another individual, '
    '"equality" if a text is about equal treatment and equal outcome for individuals, '
    '"proportionality" if a text is about individuals getting rewarded in proportion to their merit or contribution, '
    '"loyalty" if a text is about cooperating with ingroups and competing with outgroups, '
    '"authority" if a text is about deference toward legitimate authorities and the defense of traditions, '
    'all of which are seen as providing stability and fending off chaos, '
    '"purity" if a text is about avoiding bodily and spiritual contamination and degradation, '
    '"thin-morality" if a text has a moral sentiment but cannot be categorized as either of the above, '
    '"none" if no moral sentiment is expressed in the text.'
    '\n\nBased solely on these definitions, name all moral sentiments that are directly expressed in the following text:\n'
    '"""'
)

# Second half of the prompt: closing triple quote, the output-format request
# and the assistant turn marker.
PROMPT_TEXT_2 = (
    '"""\n\n'
    'Return a comma separated list of all moral sentiments that were expressed in the text.'
    '\n\nASSISTANT:\n'
)

In [20]:
# Load the sample and wrap every text in the two prompt halves
df = pd.read_csv(path)
print(df.shape)  # (rows, columns)
# Mean number of whitespace-separated tokens per text, rounded
tokens_per_text = df.text.str.split("\\s+").str.len()
print(round(tokens_per_text.mean()))
prompts = [PROMPT_TEXT_1 + text + PROMPT_TEXT_2 for text in df.text]
print(prompts[10])  # show one complete prompt as a sanity check

(2983, 9)
33
USER: These are definitions of moral sentiments: "care" if a text is about avoiding emotional and physical damage to another individual, "equality" if a text is about equal treatment and equal outcome for individuals, "proportionality" if a text is about individuals getting rewarded in proportion to their merit or contribution, "loyalty" if a text is about cooperating with ingroups and competing with outgroups, "authority" if a text is about deference toward legitimate authorities and the defense of traditions, all of which are seen as providing stability and fending off chaos, "purity" if a text is about avoiding bodily and spiritual contamination and degradation, "thin-morality" if a text has a moral sentiment but cannot be categorized as either of the above, "none" if no moral sentiment is expressed in the text.

Based solely on these definitions, name all moral sentiments that are directly expressed in the following text:
"""Was just browsing r/Politics, noticed [this 

## Test Call

In [22]:
# Single verbose request: prints the prompt together with the model's answer
test_prompt = prompts[2]
answer_test = run(test_prompt, verbose=1)

USER: These are definitions of moral sentiments: "care" if a text is about avoiding emotional and physical damage to another individual, "equality" if a text is about equal treatment and equal outcome for individuals, "proportionality" if a text is about individuals getting rewarded in proportion to their merit or contribution, "loyalty" if a text is about cooperating with ingroups and competing with outgroups, "authority" if a text is about deference toward legitimate authorities and the defense of traditions, all of which are seen as providing stability and fending off chaos, "purity" if a text is about avoiding bodily and spiritual contamination and degradation, "thin-morality" if a text has a moral sentiment but cannot be categorized as either of the above, "none" if no moral sentiment is expressed in the text.

Based solely on these definitions, name all moral sentiments that are directly expressed in the following text:
"""Yes, it's understandable for the victims' loved ones to b

## Run Calls

In [119]:
# Query the model once per prompt and keep only the generated text.
# `delay_full` is the pause inserted after every request.
responses = []
for prompt in prompts:
    APIresponse = run(prompt, 0, delay_full)
    response = APIresponse.json()["results"][0]["text"]
    responses.append(response)

# define categories to be found in llama output
foundations = ["care", "equality", "proportionality", "loyalty", "authority", "purity", "thin-morality"]
# clean model outputs: lower-case and strip punctuation (hyphens and commas are kept by `pattern`)
responses_cleaned = [re.sub(pattern, "", x.lower()) for x in responses]
# Find foundation names in the cleaned strings. Iterating over `foundations`
# already yields each name at most once and in a fixed order, so no set is
# needed and the joined annotation string is the same on every run.
# NOTE(review): this is a substring match, so a name also matches inside a
# longer word (e.g. "equality" in "inequality") — confirm this is intended.
responsesToFoundations = [[y for y in foundations if y in x] for x in responses_cleaned]
responsesToFoundations = [["non-moral"] if not x else x for x in responsesToFoundations] # no foundation = non-moral
responsesToFoundations = [",".join(x) for x in responsesToFoundations] #reformat for further processing

new_dic = {}
new_dic["text"] = df.text.tolist()
new_dic["annotations"] = responsesToFoundations
df_responses = pd.DataFrame(new_dic)

# Every column after the first is used as a label column
# (assumes the first column of the input CSV is the text — TODO confirm).
cols = df.columns[1:].tolist()
df_preds = separate_labels(df_responses, cols) # format to final dataset
df_preds.to_csv("../results/predictions/llama2_" + data + "_labels_" + mode + ".csv", index=False)

NameError: name 'prompt_style' is not defined