This notebook calls the OpenAI API to classify the moral sentiments expressed in posts from the Moral Foundations Reddit Corpus (MFRC) using ChatGPT.

## Load Packages

In [1]:
import openai
import os
import pandas as pd
import numpy as np

import string
import re
# Characters to strip from model responses: all ASCII punctuation except the
# hyphen (part of the "non-moral" label) and the comma (the label separator).
remove = string.punctuation
remove = remove.replace("-", "").replace(",", "")
# Escape the characters before placing them in a character class so that
# "\", "]", "^" and "[" are matched literally instead of being interpreted
# as regex syntax (unescaped, the backslash was silently left out of the set).
pattern = "[{}]".format(re.escape(remove))

import pickle
import time
import logging
from retry import retry
# Configure the root logger with the default handler/format.
# NOTE(review): presumably here so that log output from the imported
# libraries (e.g. retry warnings) is visible in the notebook -- confirm.
logging.basicConfig()

# Pause lengths (in seconds) between consecutive API requests, each derived
# from a requests-per-minute budget.
rate_limit_per_minute = 3500.0
delay_full = 60.0 / rate_limit_per_minute  # pause that fits rate_limit_per_minute
delay_60 = 60.0 / 60.0  # pause for a budget of 60 requests per minute

## General Parameters

In [2]:
# Dataset name and sample variant; both are embedded in the input file name.
data = "mfrc"
mode = "full"
# Directory holding the preprocessed samples (relative to this notebook).
folder = "../data/preprocessed/"
# Full path of the CSV to annotate: <folder><data>_sample_<mode>.csv
path = f"{folder}{data}_sample_{mode}.csv"

## Functions

In [3]:
# chatGPT parameters
# The API key is read from the environment: export OPENAI_API_KEY before
# running this cell (the lookup yields None when the variable is not set).
openai.api_key = os.environ.get("OPENAI_API_KEY")
# Model name passed as `model=` to the completion calls in this notebook.
model_engine = "gpt-3.5-turbo-0301"

@retry(delay=5)
def delayed_completion(delay_in_seconds: float = 1, **kwargs):
    """Wait, then issue a single chat-completion request.

    Args:
        delay_in_seconds: Seconds to sleep before the request is sent
            (used as a simple client-side rate limiter).
        **kwargs: Forwarded unchanged to ``openai.ChatCompletion.create``.

    Returns:
        Whatever ``openai.ChatCompletion.create`` returns.

    Note:
        The function is wrapped in ``@retry(delay=5)``.
        NOTE(review): presumably this re-invokes the function (sleep
        included) on any exception, waiting 5 seconds between attempts and
        without an attempt limit -- confirm against the retry package.
    """
    time.sleep(delay_in_seconds)
    completion = openai.ChatCompletion.create(**kwargs)
    return completion

def separate_labels(df, cols):
    """Expand a comma-separated ``annotations`` column into 0/1 label columns.

    Rows whose ``annotations`` value is missing or an empty string are
    dropped. For every name in ``cols`` a column is created (overwriting any
    existing column of that name) holding 1 when that label occurs in the
    row's annotations and 0 otherwise. Labels are compared after stripping
    surrounding whitespace, so ``"care, equality"`` sets both ``care`` and
    ``equality``; annotation parts that are not in ``cols`` are ignored.

    Args:
        df: DataFrame with a string column named ``annotations``.
        cols: Iterable of label names to create indicator columns for.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, the indicator columns
        added and the ``annotations`` column removed. The input frame is not
        modified.
    """
    # removing texts with no annotations
    has_annotation = df["annotations"].notna() & (df["annotations"] != "")
    df = df[has_annotation].reset_index(drop=True)

    # Parse each annotation string once; strip BEFORE matching so the space
    # that follows a comma does not hide the label.
    label_sets = [
        {part.strip() for part in annotation.split(",")}
        for annotation in df["annotations"]
    ]
    for label in cols:
        df[label] = [int(label in labels) for labels in label_sets]

    return df.drop(columns=["annotations"])

## Load Data

In [4]:
# create general prompt
# Instruction text placed in front of every post: one definition per label,
# followed by the required answer format. Adjacent literals are joined by the
# parser, so this is a single string that ends right where the post text is
# appended.
PROMPT_TEXT = (
    'Determine which moral sentiments are expressed in the following text. '
    '"care" if the text is about avoiding emotional and physical damage to another individual, '
    '"equality" if the text is about equal treatment and equal outcome for individuals, '
    '"proportionality" if the text is about individuals getting rewarded in proportion to their merit or contribution, '
    '"loyalty" if the text is about cooperating with ingroups and competing with outgroups, '
    '"authority" if the text is about deference toward legitimate authorities and the defense of traditions, '
    'all of which are seen as providing stability and fending off chaos, '
    '"purity" if the text is about avoiding bodily and spiritual contamination and degradation, '
    '"thin morality" if the text has a moral sentiment but cannot be categorized as either of the above, '
    '"non-moral" if no moral sentiment is expressed in the text. '
    'Respond only with these words. Respond with all words that apply, comma separated. Here is the text: '
)

In [5]:
# check prompt: echo the assembled instruction string as the cell output
PROMPT_TEXT

'Determine which moral sentiments are expressed in the following text. "care" if the text is about avoiding emotional and physical damage to another individual, "equality" if the text is about equal treatment and equal outcome for individuals, "proportionality" if the text is about individuals getting rewarded in proportion to their merit or contribution, "loyalty" if the text is about cooperating with ingroups and competing with outgroups, "authority" if the text is about deference toward legitimate authorities and the defense of traditions, all of which are seen as providing stability and fending off chaos, "purity" if the text is about avoiding bodily and spiritual contamination and degradation, "thin morality" if the text has a moral sentiment but cannot be categorized as either of the above, "non-moral" if no moral sentiment is expressed in the text. Respond only with these words. Respond with all words that apply, comma separated. Here is the text: '

In [6]:
# load annotation texts
df = pd.read_csv(path)
print(df.shape)

# Average number of whitespace-separated tokens per text, rounded.
tokens_per_text = df["text"].str.split("\\s+").str.len()
print(round(tokens_per_text.mean()))

# One single-turn chat message per text: the shared prompt followed by the text.
messages = [
    {"role": "user", "content": PROMPT_TEXT + post}
    for post in df["text"]
]

(2983, 9)
33


## Test Call

In [7]:
# check test prompt: show the chat message built for the text at index 10
messages[10]

{'role': 'user',
 'content': 'Determine which moral sentiments are expressed in the following text. "care" if the text is about avoiding emotional and physical damage to another individual, "equality" if the text is about equal treatment and equal outcome for individuals, "proportionality" if the text is about individuals getting rewarded in proportion to their merit or contribution, "loyalty" if the text is about cooperating with ingroups and competing with outgroups, "authority" if the text is about deference toward legitimate authorities and the defense of traditions, all of which are seen as providing stability and fending off chaos, "purity" if the text is about avoiding bodily and spiritual contamination and degradation, "thin morality" if the text has a moral sentiment but cannot be categorized as either of the above, "non-moral" if no moral sentiment is expressed in the text. Respond only with these words. Respond with all words that apply, comma separated. Here is the text: Wa

In [8]:
# run test api call
# Send only the message at index 10 to confirm that the key, the model name
# and the response parsing work before the full run.
APIresponse = delayed_completion(
    delay_in_seconds=delay_full,
    model=model_engine,
    messages=[messages[10]],
    temperature=0,
)
first_choice = APIresponse.choices[0]
response = first_choice.message["content"]  # text of the model's reply
print(response)  # works

thin morality


## Run Calls

In [9]:
# Query the model once per message, in order, collecting the raw reply texts.
responses = []
# Print progress about every 10% of the messages. max(1, ...) keeps the step
# non-zero when there are fewer than 10 messages (a zero step would raise
# ZeroDivisionError in the modulo below).
progress_step = max(1, len(messages) // 10)
for i, message in enumerate(messages):
    APIresponse = delayed_completion(
        delay_in_seconds=delay_full,
        model=model_engine,
        messages=[message],
        temperature=0,
        )
    response = APIresponse.choices[0].message["content"]
    responses.append(response)
    if i % progress_step == 0:
        # Plain "%": the former "\%" is an invalid escape sequence and
        # printed a literal backslash.
        print(f"{i * 100 // len(messages)}%")

# clean gpt outputs (for predictions that have imprecise wording, e.g., none for non-moral)
# Any reply containing "none" is mapped to "non-moral"; every other reply is
# lower-cased and has the characters matched by `pattern` removed.
responses_cleaned = []
for raw_reply in responses:
    lowered = raw_reply.lower()
    if "none" in lowered:
        responses_cleaned.append("non-moral")
    else:
        responses_cleaned.append(re.sub(pattern, "", lowered))

# save as dataframe
new_dic = {
    "text": df.text.tolist(),
    "annotations": responses_cleaned,
}
df_responses = pd.DataFrame(new_dic)

# Label names are taken from every column of the input frame after the first.
# NOTE(review): assumes the first column is the text and the remaining ones
# are the label columns -- confirm against the CSV.
cols = df.columns[1:].tolist()
df_preds = separate_labels(df_responses, cols)
df_preds.to_csv(f"../results/predictions/gpt_{data}_labels_{mode}.csv", index=False)

0\%
9\%
19\%




29\%
39\%
49\%
59\%
69\%
79\%
89\%
99\%
