This notebook uses a fine-tuned BERT model to predict the moral-sentiment annotations of the texts in the Moral Foundations Reddit Corpus (MFRC).

## Load Packages

In [8]:
import pandas as pd
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import tokenization

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import load_model 

# Label columns per corpus, grouped by labelling scheme; `predict` looks the
# list up as foundations[corp][mode] and uses it as the output column names.
foundations = dict(
    mfrc=dict(
        complete=[
            "care", "harm",
            "equality", "proportionality",
            "loyalty", "betrayal",
            "authority", "subversion",
            "purity", "degradation",
            "thin morality", "non-moral",
        ],
        binding=[
            "individual", "binding", "proportionality",
            "thin morality", "non-moral",
        ],
        moral=["moral", "thin morality", "non-moral"],
        full=[
            "care", "proportionality", "loyalty", "authority", "purity", "equality",
            "thin morality", "non-moral",
        ],
    ),
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabdurah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## General Parameters

In [9]:
# TF Hub handle of the BERT encoder; it is wrapped as a Keras layer so that
# `predict` can hand it to `load_model` through `custom_objects`.
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
bert_layer = hub.KerasLayer(module_url, trainable=True)    
# Build the tokenizer from the vocabulary file and lower-casing flag that are
# stored on the loaded hub module, so tokenization matches the encoder.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

## Functions for Predictions

In [6]:
#################### Functions
def pre_process_text(X, tokenizer):
    """Tokenize texts and filter out stopwords, symbols, mentions and links.

    Args:
        X: iterable of raw text strings.
        tokenizer: object exposing ``tokenize(text)`` that returns a list of
            string tokens.

    Returns:
        A ``(results, drops)`` tuple. ``results`` holds the filtered token
        lists of the texts that keep at least five tokens; ``drops`` holds the
        0-based positions (within ``X``) of the texts that were discarded.
    """
    tokenized = [tokenizer.tokenize(x) for x in X]
    results = []
    drops = []
    # The backslash is written as "\\": the bare "\]" used before is an invalid
    # escape sequence (a SyntaxWarning on recent Python versions) that only
    # happened to yield the same two characters.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"

    # A set gives O(1) membership tests instead of scanning the list per token.
    en_stopwords = set(stopwords.words('english'))
    for i, text in enumerate(tokenized):
        # `token not in symbols` is a substring test on the symbols string, so
        # it removes every token that occurs inside it (single punctuation
        # characters, and the empty string).
        out = [
            token for token in text
            if token not in en_stopwords
            and token not in symbols
            and not token.startswith("@")
            and not token.startswith("http")
        ]
        if len(out) >= 5:               # drop texts that are too short after preprocessing
            results.append(out)
        else:
            drops.append(i)
    return results, drops

def bert_encode(texts, tokenizer, max_len=256):
    """Convert token lists into fixed-length BERT input arrays.

    Each text is truncated to ``max_len - 2`` tokens, wrapped in ``[CLS]`` /
    ``[SEP]``, converted to ids and right-padded with zeros to ``max_len``.

    Args:
        texts: iterable of token lists (one list of string tokens per text).
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)`` that
            returns a list of integer ids.
        max_len: length of every encoded row, including the two special
            tokens.

    Returns:
        A tuple ``(token_ids, pad_masks, segment_ids)`` of integer arrays, each
        of shape ``(len(texts), max_len)``. The mask is 1 on real tokens and 0
        on padding; the segment ids are all zero. Empty input yields arrays of
        shape ``(0, max_len)`` rather than ``(0,)``.

    Raises:
        ValueError: if ``max_len`` is too small to hold ``[CLS]`` and ``[SEP]``.
    """
    if max_len < 2:
        # With max_len < 2 the padding length would go negative and the rows
        # would no longer have length max_len.
        raise ValueError("max_len must be at least 2 to fit [CLS] and [SEP]")

    all_tokens = []
    all_masks = []

    for text in texts:
        input_sequence = ["[CLS]"] + text[:max_len - 2] + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        all_tokens.append(tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)

    # An explicit shape keeps the result two-dimensional even when `texts` is
    # empty; dtype=int is the dtype NumPy infers for lists of Python ints.
    shape = (len(all_tokens), max_len)
    return (
        np.array(all_tokens, dtype=int).reshape(shape),
        np.array(all_masks, dtype=int).reshape(shape),
        np.zeros(shape, dtype=int),
    )

def get_binary(_y, threshold):
    """Binarize scores: 1 where ``score >= threshold``, 0 where ``score < threshold``.

    Args:
        _y: array-like supporting ``.copy()`` and boolean-mask assignment
            (e.g. a NumPy array). It is not modified.
        threshold: cut-off value.

    Returns:
        A copy of ``_y`` with the thresholded values. Entries that compare as
        neither ``>=`` nor ``<`` the threshold (e.g. NaN) are left unchanged.
    """
    y = _y.copy()
    # Compute both masks before writing: evaluating `y < threshold` after the
    # 1s are written would reset them to 0 whenever threshold > 1.
    at_or_above = y >= threshold
    below = y < threshold
    y[at_or_above] = 1
    y[below] = 0
    return y

def predict(df, mode, threshold):
    """Predict labels for the texts in ``df`` with a saved model and store them as CSV.

    Relies on names defined at notebook level: ``corp`` and ``training`` (used
    to build the model and output file names), ``foundations``, ``bert_layer``
    and ``tokenizer``.

    Args:
        df: DataFrame with a ``"text"`` column.
        mode: key into ``foundations[corp]`` selecting the label columns; also
            part of the model and output file names.
        threshold: cut-off passed to ``get_binary`` to binarize the model's
            predicted scores.

    Returns:
        A copy of ``df`` without the rows removed during preprocessing and with
        one added column per label holding the binarized prediction. The number
        of removed rows is printed and the frame is also written to
        ``../results/predictions/``.
    """
    model_file = '../models/' + corp + "_" + training + "_" + mode + '.h5'
    model = load_model(model_file, compile=True, custom_objects={"KerasLayer": bert_layer})

    cols = foundations[corp][mode]
    X_raw = list(df["text"])
    X, idx_drop = pre_process_text(X_raw, tokenizer)
    print(len(idx_drop))
    X = bert_encode(X, tokenizer)
    y_pred_proba = model.predict(X)
    y_pred = get_binary(y_pred_proba, threshold)

    # idx_drop holds row *positions*, whereas DataFrame.drop expects index
    # *labels*; a positional mask removes the right rows for any index.
    keep = np.ones(len(df), dtype=bool)
    keep[np.asarray(idx_drop, dtype=int)] = False
    df_dropped = df[keep].copy()
    df_dropped[cols] = pd.DataFrame(y_pred, index=df_dropped.index, columns=cols)

    # save predictions
    df_dropped.to_csv("../results/predictions/" + corp + "_labels_" + training + "_" + mode + ".csv", index=False)
    return df_dropped

## Make predictions

In [5]:
################ Get parameters and run predictions
# corp and training are read as globals by predict() to build the model and
# output file names; mode selects the label set in foundations[corp].
corp = "mfrc"
mode = "full"
training = "normal"
# cut-off used to binarize the predicted scores
threshold = 0.3

file_path = "../data/preprocessed/" + corp + "_sample_" + mode + ".csv"

# load the sample texts and predict their annotations
file = pd.read_csv(file_path)
predict(file, mode, threshold)

2023-09-04 00:32:38.186948: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_2' with dtype int32 and shape [?,256]
	 [[{{node inputs_2}}]]
2023-09-04 00:32:38.187011: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype int32 and shape [?,256]
	 [[{{node inputs}}]]
2023-09-04 00:32:38.221158: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder_1' with dtype int32 and shape [?,256]
	 [[{{node Placeholder_1}}]

0
 6/94 [>.............................] - ETA: 1s

2023-09-04 00:33:09.168794: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.




Unnamed: 0,text,care,proportionality,loyalty,authority,purity,equality,thin morality,non-moral
0,1. That's\n2. Why\n3. Macron\n4. Won\n5. Bitch,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Yeah that was my point. Maybe Hillary is more ...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,"Yes, it's understandable for the victims' love...",1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,Duh we all know you can’t have black friends a...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,I'd bin them to be fair.\n\nNo amount of deter...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
2978,"NTA, maybe even make it a birthday present: he...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2979,Their rules. I think it's richly deserved. O...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2980,So she's implying that LGBTQ people all cheat ...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2981,/uncuck \n\nPeople who support Donald and le p...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
