This notebook prepares the data for the text annotation task:
-   Renames output labels (here in line with moral foundations theory; e.g., if one wants to cluster values into hierarchies)
    - Here we choose to combine vices and virtues of the same foundation to simplify interpretation (as is done in most cases)
-   Splits the total MFRC into a train/fine-tune part (for BERT) and an evaluation/ground-truth part (to test BERT and ChatGPT performance)

## Load Packages

In [3]:
import numpy as np
import pandas as pd
from html import unescape

import tensorflow as tf
import tensorflow_hub as hub
import tokenization
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
from keras.utils import to_categorical

import ast
from collections import Counter

import pickle as pkl
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabdurah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Dicts (auxiliary) 
 - Allows us to cluster our labels according to any theoretical consideration (e.g., vices and virtues are usually not separated)
 - E.g., betrayal and loyalty, or subversion and authority, are just different sides of the same moral value (positive/negative)
 - Here, we use "full", which follows the moral values of the MFQ2 (care, equality, proportionality, loyalty, authority, purity, + thin-morality & non-moral), in line with the authors of the MFRC

In [11]:
# Label columns per corpus and per clustering mode.
# Looked up throughout the notebook as foundations[corp][mode]; the resulting list
# is used as the set of label column names (see separate_labels / calculate_majortiy_labels).
#   "complete": vices and virtues kept as separate labels
#   "binding":  coarse grouping (individual vs. binding, plus the remaining labels)
#   "moral":    moral vs. thin morality vs. non-moral
#   "full":     one label per foundation (vice and virtue merged)
foundations = {"mfrc":  {
                    "complete": ["care", "harm", "equality", "proportionality", "loyalty", "betrayal", "authority", "subversion", "purity", "degradation", "thin morality", "non-moral"],
                    "binding": ["individual", "binding", "proportionality", "thin morality", "non-moral"], 
                    "moral": ["moral", "thin morality", "non-moral"],
                    "full": ["care", "proportionality", "loyalty", "authority", "purity", "equality", "thin morality", "non-moral"],
               }
              }

# Renaming tables, one per clustering mode: raw annotation label -> label used in that mode.
# The table for the active mode is passed to Series.replace(..., regex=True) on the
# lower-cased annotation strings, so keys are matched as patterns inside the
# comma-separated annotation text. Key order is kept as in the original tables.
foundations_dict = {

    # vices and virtues stay separate ("nm" is normalised to "non-moral")
    "complete": {"harm": "harm", "care": "care", "degradation": "degradation",
                 "purity": "purity", "betrayal": "betrayal", "loyalty": "loyalty",
                 "subversion": "subversion", "authority": "authority",
                 "cheating": "cheating", "fairness": "fairness", "equality": "equality",
                 "non-moral": "non-moral", "nm": "non-moral", "thin morality": "thin morality",
                 "proportionality": "proportionality"},

    # individual vs. binding grouping.
    # "equality" must map to "individual" (not "individualizing"): "individual" is the
    # label listed in foundations["mfrc"]["binding"] and used by every sibling entry here.
    "binding": {"harm": "individual", "care": "individual", "degradation": "binding",
                "purity": "binding", "betrayal": "binding", "loyalty": "binding",
                "subversion": "binding", "authority": "binding",
                "cheating": "individual", "fairness": "individual", "equality": "individual",
                "non-moral": "non-moral", "nm": "non-moral", "proportionality": "proportionality",
                "thin morality": "thin morality"},

    # every foundation collapses to "moral"
    "moral": {"harm": "moral", "care": "moral", "degradation": "moral",
              "purity": "moral", "betrayal": "moral", "loyalty": "moral",
              "subversion": "moral", "authority": "moral",
              "cheating": "moral", "fairness": "moral", "equality": "moral",
              "non-moral": "non-moral", "nm": "non-moral", "thin morality": "thin morality",
              "proportionality": "moral"},

    # vice and virtue of the same foundation are merged into the virtue label.
    # NOTE(review): "cheating"/"fairness" map to "fairness", which is not among
    # foundations["mfrc"]["full"]; presumably these labels do not occur in the MFRC - confirm.
    "full": {"harm": "care", "care": "care", "degradation": "purity",
             "purity": "purity", "betrayal": "loyalty", "loyalty": "loyalty",
             "subversion": "authority", "authority": "authority",
             "cheating": "fairness", "fairness": "fairness", "equality": "equality",
             "non-moral": "non-moral", "nm": "non-moral", "thin morality": "thin morality",
             "proportionality": "proportionality"},
}



## Functions

In [12]:
def construct_dataset(data_file, bert_layer, mode, corp):
    """Build BERT-ready train/test data from a cleaned label CSV.

    Reads `data_file`, extracts texts and label columns, tokenizes/filters the
    texts with the module-level `tokenizer`, performs a seeded 80/20 split and
    encodes both parts with `bert_encode`.

    :param data_file: path of the CSV to read (needs a "text" column and the label columns)
    :param bert_layer: accepted for interface compatibility; not used in this function
    :param mode: label clustering key into `foundations[corp]`
    :param corp: corpus key into `foundations`
    :return: (X_train, y_train, X_test, y_test, idx_test); the X values are the
             tuples returned by `bert_encode`, the y values are float32 tensors,
             and idx_test holds the test rows' positions within the texts that
             survived `pre_process_text`
    """
    frame = pd.read_csv(data_file)
    texts, targets = get_needed_fields(frame, cols = foundations[corp][mode])
    texts, targets = pre_process_text(texts, targets, tokenizer)
    targets = np.array(targets)

    # carry the row positions through the split so the test rows can be traced back
    positions = np.arange(len(texts))
    parts = train_test_split(texts, targets, positions, test_size=0.2, shuffle=True, random_state=0)
    X_train, X_test, y_train, y_test, _idx_train, idx_test = parts

    X_train = bert_encode(X_train, tokenizer)
    X_test = bert_encode(X_test, tokenizer)
    y_train = tf.convert_to_tensor(y_train, dtype=np.float32)
    y_test = tf.convert_to_tensor(y_test, dtype=np.float32)

    # number of encoded training rows vs. number of training labels
    print(len(X_train[0]), len(y_train))

    return X_train, y_train, X_test, y_test, idx_test

def construct_dataset_full(data_file, bert_layer, mode, corp):
    """Build BERT-ready data from a cleaned label CSV without a train/test split.

    :param data_file: path of the CSV to read (needs a "text" column and the label columns)
    :param bert_layer: accepted for interface compatibility; not used in this function
    :param mode: label clustering key into `foundations[corp]`
    :param corp: corpus key into `foundations`
    :return: (X, y) where X is the tuple returned by `bert_encode` and y is a float32 tensor
    """
    frame = pd.read_csv(data_file)
    texts, targets = get_needed_fields(frame, cols = foundations[corp][mode])
    texts, targets = pre_process_text(texts, targets, tokenizer)

    encoded = bert_encode(texts, tokenizer)
    label_tensor = tf.convert_to_tensor(np.array(targets), dtype=np.float32)

    # number of encoded rows vs. number of labels
    print(len(encoded[0]), len(label_tensor))

    return encoded, label_tensor

def get_needed_fields(df, cols = ["individual", "binding", "non-moral"]):
    """Extract the texts and the requested label columns from a DataFrame.

    The label columns are taken from `cols`, so the function no longer depends
    on the notebook-level `corp` / `mode` globals.

    :param df: DataFrame with a "text" column and every column named in `cols`
    :param cols: label column names to return, in this order
    :return: (X, y) where X is a list of the "text" values and y is a 2-D array
             with one row per text and one column per entry of `cols`
    :raises KeyError: if "text" or any of `cols` is missing from `df`
    """
    X = list(df["text"])
    # list(...) guarantees column selection (2-D result) even if a tuple is passed
    y = df[list(cols)].values
    return X, y


def bert_encode(texts, tokenizer, max_len=256):
    """Turn token lists into fixed-length BERT inputs.

    Each token list is cut to `max_len - 2` tokens, wrapped in "[CLS]" / "[SEP]",
    converted to ids via `tokenizer.convert_tokens_to_ids` and right-padded with 0.

    :param texts: iterable of token lists
    :param tokenizer: object providing `convert_tokens_to_ids(list_of_tokens)`
    :param max_len: length of every encoded row
    :return: tuple of three arrays (token ids, padding masks, segment ids);
             masks are 1 for real tokens and 0 for padding, segment ids are all 0
    """
    id_rows, mask_rows, segment_rows = [], [], []

    for token_list in texts:
        sequence = ["[CLS]"] + token_list[:max_len - 2] + ["[SEP]"]
        n_real = len(sequence)
        n_pad = max_len - n_real

        id_rows.append(tokenizer.convert_tokens_to_ids(sequence) + [0] * n_pad)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)


def pre_process_text(X, y, tokenizer):
    """Tokenize texts, drop noise tokens and discard texts that become too short.

    A token is removed when it is an English stop word, is contained in the
    `symbols` string, or starts with "@" or "http". No lower-casing is done
    here; any case handling is left to `tokenizer`.

    :param X: iterable of raw text strings
    :param y: labels, indexable by position and aligned with `X`
    :param tokenizer: object providing `tokenize(text) -> list of tokens`
    :return: (results, labels): the filtered token lists with at least 5 tokens
             and the labels of exactly those texts, in input order
    """
    # "\\]" spells the same backslash + "]" characters as before without the
    # invalid escape sequence "\]"
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    # set lookup instead of scanning the stop-word list for every token
    en_stopwords = set(stopwords.words('english'))

    results = []
    labels = []
    for i, text in enumerate(X):
        out = [token for token in tokenizer.tokenize(text)
               if token not in en_stopwords
               and token not in symbols        # substring test against the symbols string
               and not token.startswith("@")
               and not token.startswith("http")]
        if len(out) >= 5:               # remove texts that are too short after preprocessing
            results.append(out)
            labels.append(y[i])
    return results, labels

def pre_process_df(df, tokenizer):
    """Return a copy of `df` with a cleaned "tokenized" column, keeping rows with >= 5 tokens.

    A token is removed when it is an English stop word, is contained in the
    `symbols` string, or starts with "@" or "http". No lower-casing is done
    here; any case handling is left to `tokenizer`. The input frame is not
    modified: the "tokenized" column exists only on the returned frame.

    :param df: DataFrame with a "text" column of strings
    :param tokenizer: object providing `tokenize(text) -> list of tokens`
    :return: new DataFrame (index reset) with the added "tokenized" column,
             restricted to rows whose token list has at least 5 entries
    """
    # "\\]" spells the same backslash + "]" characters as before without the
    # invalid escape sequence "\]"
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    # set lookup instead of scanning the stop-word list for every token
    en_stopwords = set(stopwords.words('english'))

    def _clean_tokens(text):
        # tokenize one text and drop stop words, symbols, mentions and links
        return [token for token in tokenizer.tokenize(text)
                if token not in en_stopwords
                and token not in symbols
                and not token.startswith("@")
                and not token.startswith("http")]

    # assign() builds a new frame, so the caller's DataFrame is left untouched
    df = df.assign(tokenized=df.text.apply(_clean_tokens))

    df = df[df.tokenized.str.len() >=5].reset_index(drop=True)
    return df

################ Additional Functions

def _set_majority_vote(row):
    """Binarise the label counts of one aggregated row.

    Every label in `foundations[corp][mode]` is set to 1 when its value is at
    least `row['annotation']` (the threshold prepared by the caller) and to 0
    otherwise.

    :param row: one row holding the label sums and the 'annotation' threshold
    :return: the same row with the label entries replaced by 0/1
    """
    threshold = row['annotation']
    for label in foundations[corp][mode]:
        row[label] = 1 if row[label] >= threshold else 0
    return row

def separate_labels(df):
    """Expand the comma-separated "annotation" column into one 0/1 column per label.

    Rows whose annotation is empty or missing are dropped. A column is created
    (initialised to 0) for every label in `foundations[corp][mode]`; each row
    then gets 1 in the column of every label named in its annotation string.

    :param df: DataFrame with an "annotation" column of comma-separated labels
    :return: new DataFrame with the additional label columns; the input frame
             is not modified
    """
    def _set_labels(row):
        for label in row["annotation"].split(","):
            # tolerate "a, b" and stray commas: surrounding whitespace is removed
            # and empty fragments are skipped instead of creating bogus columns
            label = label.strip()
            if label:
                row[label] = 1
        return row

    # removing texts with no annotations
    df = df[df.annotation != ''].reset_index(drop=True)
    # .copy() so the label columns below are added to an independent frame
    # rather than to a filtered slice (avoids chained-assignment warnings)
    df = df[~ pd.isna(df.annotation)].copy()
    for label in foundations[corp][mode]:
        df[label] = 0
    df = df.apply(_set_labels, axis=1)
    return df

def calculate_majortiy_labels(df, corp):
    """Aggregate per-annotation label rows into one majority-vote row per text.

    Rows are grouped by "text"; the annotations are counted and every label in
    `foundations[corp][mode]` is summed. A label is kept (set to 1) when its sum
    reaches at least half of the annotation count, as decided by
    `_set_majority_vote`. Texts left without any positive label are dropped.

    :param df: DataFrame with "text", "annotation" and the 0/1 label columns
    :param corp: corpus key into `foundations` (the mode is read from the global `mode`)
    :return: DataFrame with one row per text, "annotation" holding half the
             annotation count and the label columns holding the majority vote
    """
    label_cols = foundations[corp][mode]

    # the same aggregation is used for every corpus
    # (extra columns such as 'bucket' / 'subreddit' are currently not carried along)
    agg_dict = {"annotation": "count"}
    agg_dict.update({label: "sum" for label in label_cols})

    df = df.groupby(["text"], as_index=False).agg(agg_dict).reset_index(drop=True)

    # half of the number of annotations is the threshold a label sum must reach
    df['annotation'] = df['annotation'].div(2)
    df = df.apply(_set_majority_vote, axis=1)

    # keep only texts on which at least one label won the vote
    has_label = df[label_cols].sum(axis=1) != 0
    return df[has_label].reset_index(drop=True)

# Load the BERT base (uncased) layer from TF Hub and build the matching tokenizer.
# `bert_layer` and `tokenizer` are module-level names used by the functions above
# and by the cells below.
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
bert_layer = hub.KerasLayer(module_url, trainable=True)
# vocabulary file and casing flag are read from the loaded layer so the tokenizer matches it
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

## General parameters

In [1]:
# `corp` and `mode` are read as globals by several functions above
# (e.g. _set_majority_vote, separate_labels), so this cell must run before they are called.
corp = "mfrc" # corpus key into `foundations`; change when using different corpora
mode = "full" # key into `foundations[corp]` / `foundations_dict`; change when using a different clustering of the target variable (here moral values)

# output locations; every file name encodes the corpus and the mode
main_path = "../data/preprocessed/"
eval_path = main_path + corp + "_eval_" + mode + ".csv"  # one row per text x annotator x label
sample_meta_path = main_path + corp + "_meta_sample_" + mode + ".csv"  # annotator rows of the test sample
sample_path = main_path + corp + "_sample_" + mode + ".csv"  # test sample with majority labels
final_path = main_path + corp + "_cleaned_" + mode + ".csv"  # all texts with majority labels

## Process data (format for nlp models)

In [5]:
df_list = []  # not used in the cells shown here; kept in case other cells rely on it

# Load the raw MFRC annotations, drop texts that are too short after token
# filtering, and remove exact duplicate rows.
df_raw = pd.read_csv("../data/raw/final_mfrc_data.csv") # MFRC data (current version, might change in the future)
df_raw = pre_process_df(df_raw, tokenizer).drop(["tokenized"], axis = 1)
df = df_raw.drop_duplicates().reset_index(drop=True)

# Rename the raw labels to the labels of the selected mode.
df["annotation"] = df["annotation"].str.lower().replace(foundations_dict[mode], regex=True)

# format for evaluation against predictions (need this for annotator demographics analyses)
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of `df`.
df_eval = df[["text", "annotator", "annotation"]].copy()
# .str.split leaves missing annotations as NaN instead of raising on non-string values
df_eval["annotation"] = df_eval["annotation"].str.split(",")
# one row per (text, annotator, label)
df_eval = df_eval.explode("annotation").reset_index(drop=True)
df_eval = df_eval.drop_duplicates().reset_index(drop=True)
df_eval.to_csv(eval_path, index = False) # --> save groundtruth

NameError: name 'df' is not defined

In [237]:
# One 0/1 column per label for every annotation row ...
df_final = separate_labels(df)
# ... then one row per text with the majority vote over its annotations.
df_final = calculate_majortiy_labels(df_final, corp)
# after the vote, "annotation" only holds the vote threshold, so it is dropped
df_final = df_final.drop(["annotation"], axis = 1)
df_final.to_csv(final_path, index = False) # full MFRC data: all texts x all moral foundation labels (each as binary variable)

## Create test/train data

In [25]:
# Build the seeded 80/20 split from the cleaned data; idx_test is kept in memory
# because the ground-truth sample is selected with it further below.
X_train, y_train, X_test, y_test, idx_test = construct_dataset(final_path, bert_layer=bert_layer, mode=mode, corp=corp) #save test IDs
# Persist the encoded inputs and label tensors of each part as a [X, y] list.
with open("../data/train_test/" + corp + "_train_" + mode + ".pkl","wb") as f:
    pkl.dump([X_train, y_train], f)
with open("../data/train_test/" + corp + "_test_" + mode + ".pkl","wb") as f:
    pkl.dump([X_test, y_test], f)

11928 11928
14911 14911


## Get ground-truth data for ChatGPT comparison

In [239]:
# Get the meta information for the sample that will be compared with ChatGPT.
# Requires `idx_test` from the train/test cell above.
df_eval = pd.read_csv(eval_path)    # get data with explicit annotator information
df_final = pd.read_csv(final_path)  # get data that the train/test split was performed on

# NOTE(review): idx_test holds positions within the texts that survived
# pre_process_text inside construct_dataset, while iloc indexes every row of
# df_final. The two only line up if that filter dropped no row of the cleaned
# file - presumably true because the raw data was filtered by the same rule; confirm.
df_sample = df_final.iloc[idx_test]   # get the posts for the test sample
df_sample_meta = df_eval.loc[df_eval.text.isin(df_sample.text)].reset_index(drop=True) # find test sample in annotator data
df_sample_meta.to_csv(sample_meta_path, index=False) # save annotator information about sample
df_sample.to_csv(sample_path, index=False) # save sample (groundtruth for performance calculations)