In [15]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import load_model 
from keras.metrics import Precision, Recall

import pandas as pd
import numpy as np

import pickle as pkl

from nltk.corpus import stopwords
import tokenization

from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

# import nltk
# nltk.download('stopwords')

# Lists of foundation names, keyed first by corpus ("mftc" / "mfrc") and then
# by one of the four keys "binding", "moral", "full" and "complete".
foundations = {
    "mftc": {
        "binding": ["individual", "binding"],
        "moral": ["moral"],
        "full": ["care", "fairness", "loyalty", "authority", "purity"],
        "complete": [
            "care", "harm",
            "fairness", "cheating",
            "loyalty", "betrayal",
            "authority", "subversion",
            "purity", "degradation",
        ],
    },
    "mfrc": {
        "binding": ["individual", "binding", "proportionality", "thin morality"],
        "moral": ["moral", "thin morality"],
        "full": [
            "care", "proportionality", "loyalty", "authority",
            "purity", "equality", "thin morality",
        ],
        "complete": [
            "care", "harm",
            "equality", "proportionality",
            "loyalty", "betrayal",
            "authority", "subversion",
            "purity", "degradation",
            "thin morality",
        ],
    },
}

Functions for training

In [23]:
def build_model(bert_layer, max_len=512, classes = 5, activation = "sigmoid"):
    """Build and compile a classifier head on top of a BERT Keras layer.

    The model takes three int32 inputs of length ``max_len``
    (``input_word_ids``, ``input_mask``, ``segment_ids``), feeds them to
    ``bert_layer`` and classifies from the first position of its
    ``sequence_output`` through Dense(64, relu) -> Dropout(0.2) ->
    Dense(classes, activation).

    Args:
        bert_layer: callable layer accepting a dict with the keys
            ``input_word_ids``, ``input_mask`` and ``input_type_ids`` and
            returning a mapping that contains ``"sequence_output"``.
        max_len: length of every input sequence.
        classes: number of units in the output layer.
        activation: activation of the output layer.

    Returns:
        The compiled ``tf.keras`` model (Adam, learning rate 1e-5, tracking
        precision and recall). The loss is always ``binary_crossentropy``,
        whatever ``activation`` is. The model summary is printed as a side
        effect.
    """
    def _int_input(name):
        # All three BERT inputs share the same shape and dtype.
        return tf.keras.Input(shape=(max_len,), dtype=tf.int32, name=name)

    input_word_ids = _int_input("input_word_ids")
    input_mask = _int_input("input_mask")
    segment_ids = _int_input("segment_ids")

    bert_outputs = bert_layer({
        "input_word_ids": input_word_ids,
        "input_mask": input_mask,
        "input_type_ids": segment_ids,
    })

    # Representation at position 0 of the per-token output.
    first_token_state = bert_outputs["sequence_output"][:, 0, :]

    hidden = tf.keras.layers.Dense(64, activation='relu')(first_token_state)
    hidden = tf.keras.layers.Dropout(0.2)(hidden)
    predictions = tf.keras.layers.Dense(classes, activation=activation)(hidden)

    model = tf.keras.models.Model(
        inputs=[input_word_ids, input_mask, segment_ids],
        outputs=predictions,
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='binary_crossentropy',
        metrics=[Precision(), Recall()],
    )
    model.summary()
    return model


def get_binary(_y, threshold):
    """Binarise an array of scores without modifying the input.

    Args:
        _y: array-like object providing ``.copy()`` and boolean-mask
            assignment (e.g. a numpy array of predicted probabilities).
        threshold: values ``>= threshold`` become 1, values ``< threshold``
            become 0.

    Returns:
        A copy of ``_y`` holding only 0s and 1s (entries that compare as
        neither, such as NaN, are left unchanged).
    """
    y = _y.copy()
    # Compute both masks BEFORE writing: otherwise the freshly written 1s are
    # compared against the threshold again and are reset to 0 whenever
    # threshold > 1.
    at_or_above = y >= threshold
    below = y < threshold
    y[at_or_above] = 1
    y[below] = 0
    return y

def F1Measure(y_true, y_pred, threshold=0.5):
    """Macro-averaged F1 of thresholded predictions.

    Args:
        y_true: ground-truth labels, passed unchanged to ``f1_score``.
        y_pred: predicted scores; binarised with ``get_binary`` first.
        threshold: cut-off handed to ``get_binary``.

    Returns:
        The value returned by ``sklearn.metrics.f1_score`` with
        ``average="macro"``.
    """
    y_binary = get_binary(y_pred, threshold)
    # The imported name is ``f1_score``; the previous ``f1_Score`` spelling
    # raised NameError on every call.
    return f1_score(y_true, y_binary, average = "macro")

def train(mode, bert_layer, corp):
    """Fit a model on the ``<corp>_train_<mode>.pkl`` split with a 10% validation hold-out.

    The best model by ``val_loss`` is written by the checkpoint callback to
    ``../models/<corp>_normalmodel_<mode>.h5``; training stops after five
    epochs without ``val_loss`` improvement (at most 200 epochs).

    Args:
        mode: key into the ``classes`` / ``activation`` tables
            ("full", "moral", "binding" or "complete").
        bert_layer: BERT layer forwarded to ``build_model``.
        corp: corpus key ("mfrc" or "mftc"); also part of the file names.

    Returns:
        None.
    """
    classes = {"mfrc": {"full": 8, "moral": 3, "binding": 5, "complete": 12}, "mftc": {"full": 6, "moral": 2, "binding": 3, "complete": 11}}
    # "complete" was missing here although `classes` supports it, so that mode
    # failed with KeyError. Sigmoid mirrors "full" -- NOTE(review): confirm.
    activation = {"full": "sigmoid", "moral": "softmax", "binding": "softmax", "complete": "sigmoid"}
    model = build_model(bert_layer, max_len=256, classes = classes[corp][mode], activation = activation[mode])

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open("../data/train_test/" + corp + "_train_" + mode + ".pkl", "rb") as f:
        X_train, y_train = pkl.load(f)

    checkpoint = tf.keras.callbacks.ModelCheckpoint('../models/' + corp + '_normalmodel_' + mode + '.h5', monitor='val_loss', save_best_only=True, verbose=1)
    earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1)

    print("start training")
    model.fit(
        X_train, y_train,
        validation_split=0.1,
        epochs=200,
        callbacks=[checkpoint, earlystopping],
        batch_size=32, #32 works best so far
        verbose=1)
    # Nothing is saved at this point; persistence is done by `checkpoint`.
    print("Saving the model")

def crossVal(mode, bert_layer, corp):
    """Run 10-fold stratified cross-validation and print macro-F1 per fold.

    Loads ``(X, y)`` from ``../data/train_test/<corp>_train_<mode>.pkl``,
    where ``X`` is indexed as three parallel arrays. Folds are stratified on
    ``reverse_onehot(y)``. Each fold is trained with early stopping on the
    fold's held-out part and scored with ``F1Measure`` at threshold 0.5.
    The mean and standard deviation over all folds are printed at the end.

    Args:
        mode: key into the ``classes`` / ``activation`` tables.
        bert_layer: BERT layer forwarded to ``build_model``.
        corp: corpus key ("mfrc" or "mftc"); also part of the file name.

    Returns:
        None.
    """
    classes = {"mfrc": {"full": 8, "moral": 3, "binding": 5, "complete": 12}, "mftc": {"full": 6, "moral": 2, "binding": 3, "complete": 11}}
    # "complete" added so that every mode listed in `classes` can be run.
    activation = {"full": "sigmoid", "moral": "sigmoid", "binding": "sigmoid", "complete": "sigmoid"}
    model = build_model(bert_layer, max_len=256, classes = classes[corp][mode], activation = activation[mode])

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open("../data/train_test/" + corp + "_train_" + mode + ".pkl", "rb") as f:
        X, y = pkl.load(f)

    earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1)

    # Snapshot of the untrained weights. Restoring it at the start of every
    # fold keeps a fold from being scored by a model that already trained on
    # its test samples in an earlier fold. (Optimizer state is not reset.)
    initial_weights = model.get_weights()

    print("Start Cross-Validation")
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    cvscores = []
    # Index names chosen so the module-level function `train` is not shadowed.
    for train_idx, test_idx in kfold.split(X[0], reverse_onehot(y)): #potentially use CV folds as predictions to evaluate against chatGPT
        model.set_weights(initial_weights)

        X_train_cv = (X[0][train_idx], X[1][train_idx], X[2][train_idx])
        y_train_cv = tf.gather(y, train_idx)  # was `y_train`, an undefined name
        X_test_cv = (X[0][test_idx], X[1][test_idx], X[2][test_idx])
        y_test_cv = tf.gather(y, test_idx)

        model.fit(
            X_train_cv, y_train_cv,
            validation_data = (X_test_cv, y_test_cv),
            epochs=200,
            callbacks=[earlystopping],
            batch_size=32, #32 works best so far
            verbose=0)

        # predict() takes inputs only; its second positional argument is the
        # batch size, so the labels must not be passed.
        y_pred_val = model.predict(X_test_cv)
        # F1Measure returns one float (macro F1), not a sequence.
        score = F1Measure(np.asarray(y_test_cv), y_pred_val, 0.5)
        print("%s: %.2f%%" % ("f1_score", score*100))
        cvscores.append(score * 100)

    print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

def full_train(mode, bert_layer, corp):
    """Fit a model on the ``<corp>_fulltrain_<mode>.pkl`` data with a 20% validation hold-out.

    The best model by ``val_loss`` is written by the checkpoint callback to
    ``../models/<corp>_crossmodel_<mode>.h5``; training stops after five
    epochs without ``val_loss`` improvement (at most 200 epochs).

    Args:
        mode: key into the ``classes`` / ``activation`` tables.
        bert_layer: BERT layer forwarded to ``build_model``.
        corp: corpus key ("mfrc" or "mftc"); also part of the file names.

    Returns:
        None.
    """
    # NOTE(review): mfrc/"complete" (13) and mftc/"moral" (3) differ from the
    # counts used in train()/crossVal() (12 and 2) -- confirm this matches the
    # label layout of the *_fulltrain_* files.
    classes = {"mfrc": {"full": 8, "moral": 3, "binding": 5, "complete": 13}, "mftc": {"full": 6, "moral": 3, "binding": 3, "complete": 11}}
    # "complete" was missing here although `classes` supports it, so that mode
    # failed with KeyError.
    activation = {"full": "sigmoid", "moral": "sigmoid", "binding": "sigmoid", "complete": "sigmoid"}
    model = build_model(bert_layer, max_len=256, classes = classes[corp][mode], activation = activation[mode])

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open("../data/train_test/" + corp + "_fulltrain_" + mode + ".pkl", "rb") as f:
        X, y = pkl.load(f)

    checkpoint = tf.keras.callbacks.ModelCheckpoint('../models/' + corp + '_crossmodel_' + mode + '.h5', monitor='val_loss', save_best_only=True, verbose=1)
    earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1)

    print("start training")
    model.fit(
        X, y,
        validation_split = 0.2,
        epochs=200,
        callbacks=[checkpoint, earlystopping],
        batch_size=32, #32 works best so far
        verbose=1)
    # Nothing is saved at this point; persistence is done by `checkpoint`.
    print("Saving the model")

def reverse_onehot(onehot_data):
    """Collapse the last (class) axis of a one-hot array into class indices.

    For every position the result holds the highest class index whose entry
    equals 1, or 0 when no entry equals 1.

    Args:
        onehot_data: array with the class axis last.

    Returns:
        A float64 numpy array of shape ``onehot_data.shape[:-1]``.
    """
    class_ids = np.arange(onehot_data.shape[-1])
    is_hot = np.asarray(onehot_data == 1)
    # Keep the class index where the entry is hot, 0 elsewhere; the maximum
    # along the class axis is then the last hot class (or 0 if none is hot).
    hot_ids = np.where(is_hot, class_ids, 0)
    return np.asarray(hot_ids.max(axis=-1, initial=0), dtype=np.float64)
    
# def evaluate(model_file, data_file, bert_layer, threshold=0.5):

#     model = load_model(model_file, compile=True, custom_objects={"KerasLayer": bert_layer})

#     with open(data_file, "rb") as f:
#         X_test, y_test = pkl.load(f)

#     y_pred = model.predict(X_test)
#     print('predicted')
    
#     f1_score = F1Measure(y_test, y_pred, threshold=0.9)
#     print(f"threshold: {0.9}, score :{f1_score}")
#     f1_score = F1Measure(y_test, y_pred, threshold=0.8)
#     print(f"threshold: {0.8}, score :{f1_score}")
#     f1_score = F1Measure(y_test, y_pred, threshold=0.7)
#     print(f"threshold: {0.7}, score :{f1_score}")
#     f1_score = F1Measure(y_test, y_pred, threshold=0.6)
#     print(f"threshold: {0.6}, score :{f1_score}")
#     f1_score = F1Measure(y_test, y_pred, threshold=0.5)
#     print(f"threshold: {0.5}, score :{f1_score}")
#     f1_score = F1Measure(y_test, y_pred, threshold=0.4)
#     print(f"threshold: {0.4}, score :{f1_score}")
#     f1_score = F1Measure(y_test, y_pred, threshold=0.3)
#     print(f"threshold: {0.3}, score :{f1_score}")
#     f1_score = F1Measure(y_test, y_pred, threshold=0.2)
#     print(f"threshold: {0.2}, score :{f1_score}")
#     f1_score = F1Measure(y_test, y_pred, threshold=0.1)
#     print(f"threshold: {0.1}, score :{f1_score}")

#     return 0
    
# TF-Hub handle of the BERT encoder used for training (small_bert, uncased,
# L-12 / H-256 / A-4 according to the handle).
module_url = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/2"
# Created once at module level with trainable=True; this object is what the
# dispatch cell passes as `bert_layer` to crossVal / full_train / train.
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [16]:
# # corp = "mfrc"
# corp = sys.argv[1]

# # mode = "full
# mode = sys.argv[2]
# training = sys.argv[3]

# Run configuration for the training dispatch below.
corp = "mftc"        # corpus key: "mftc" or "mfrc"
mode = "full"        # label-set key, e.g. "full", "moral", "binding", "complete"
training = "eval"    # "eval" -> crossVal, "cross" -> full_train, "normal" -> train

# data_file = "../data/train_test/" + corp + "_test_" + mode + ".pkl"
# model_file = "../models/" + corp + "_model_" + mode + ".h5"

In [134]:
# Routine selected by each value of `training`:
#   "eval"   -> crossVal    (determine best model using CV)
#   "cross"  -> full_train  (train on full corpus for cross corpus predictions)
#   "normal" -> train       (regular training for test sample, against chatGPT)
_training_routines = {"eval": crossVal, "cross": full_train, "normal": train}

_selected_routine = _training_routines.get(training)
if _selected_routine is not None:  # any other value: nothing is run
    _selected_routine(mode, bert_layer, corp)

Model: "model_12"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_mask (InputLayer)        [(None, 256)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 256)]        0           []                               
                                                                                                  
 input_word_ids (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 keras_layer_7 (KerasLayer)     {'pooled_output': (  17488641    ['input_mask[0][0]',             
                                None, 256),                       'segment_ids[0][0]',     

KeyboardInterrupt: 

Functions for Predictions

In [1]:
import tensorflow_hub as hub
from tensorflow.keras.models import load_model 
import tokenization
import pandas as pd
import numpy as np
from nltk.corpus import stopwords


In [2]:
# TF-Hub handle of the BERT layer loaded for the prediction cells.
# NOTE(review): this handle (L-12 / H-768 / A-12) is not the small_bert handle
# used in the training cell -- confirm that this is intended.
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
bert_layer = hub.KerasLayer(module_url, trainable=True)    
# Vocabulary file and lower-casing flag are read from the loaded hub module
# and used to build the WordPiece tokenizer shared by the functions below.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [3]:
# Output column names, keyed by corpus ("mftc" / "mfrc") and then by mode;
# predict() looks up foundations[corp][mode] to name the prediction columns.
foundations = {
    "mftc": {
        "binding": ["individual", "binding", "non-moral"],
        "moral": ["moral", "non-moral"],
        "full": ["care", "fairness", "loyalty", "authority", "purity", "non-moral"],
        "complete": [
            "care", "harm",
            "fairness", "cheating",
            "loyalty", "betrayal",
            "authority", "subversion",
            "purity", "degradation",
            "non-moral",
        ],
    },
    "mfrc": {
        "binding": ["individual", "binding", "proportionality", "thin morality", "non-moral"],
        "moral": ["moral", "thin morality", "non-moral"],
        "full": [
            "care", "proportionality", "loyalty", "authority",
            "purity", "equality", "thin morality", "non-moral",
        ],
        "complete": [
            "care", "harm",
            "equality", "proportionality",
            "loyalty", "betrayal",
            "authority", "subversion",
            "purity", "degradation",
            "thin morality", "non-moral",
        ],
    },
}

In [55]:
#################### Functions
def pre_process_text(X, tokenizer, stop_words=None):
    """Tokenise texts and filter out stop words, symbols, mentions and links.

    A token is removed when it is a stop word, is contained in the symbol
    string below, or starts with "@" or "http". Texts with fewer than five
    remaining tokens are dropped.

    Args:
        X: iterable of raw text strings.
        tokenizer: object with a ``tokenize(text)`` method returning a list
            of string tokens.
        stop_words: optional iterable of tokens to remove. Defaults to the
            NLTK English stop-word list.

    Returns:
        ``(results, drops)``: ``results`` is the list of filtered token lists
        that were kept; ``drops`` lists the positions in ``X`` of the texts
        that were dropped.
    """
    # Same characters as before; the backslash is now written as "\\" because
    # "\]" is an invalid escape sequence (a warning on recent Python versions).
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"

    if stop_words is None:
        stop_words = stopwords.words('english')
    # Set membership instead of scanning a list for every token.
    stop_words = frozenset(stop_words)

    results = []
    drops = []
    for i, raw_text in enumerate(X):
        out = [token for token in tokenizer.tokenize(raw_text)
               if token not in stop_words
               and token not in symbols
               and not token.startswith("@")
               and not token.startswith("http")]
        if len(out) >= 5:               # remove tweets that are too short after preprocessing
            results.append(out)
        else:
            drops.append(i)
    return results, drops

def bert_encode(texts, tokenizer, max_len=256):
    """Convert token lists into padded BERT input arrays.

    Each token list is cut to ``max_len - 2`` tokens, wrapped in "[CLS]" and
    "[SEP]", mapped to ids with ``tokenizer.convert_tokens_to_ids`` and
    right-padded with zeros up to ``max_len``.

    Args:
        texts: iterable of token lists.
        tokenizer: object with ``convert_tokens_to_ids(tokens)`` returning a list.
        max_len: length of every encoded row.

    Returns:
        Tuple of numpy arrays ``(token_ids, masks, segment_ids)``; the mask is
        1 for real tokens and 0 for padding, segment ids are all 0.
    """
    id_rows, mask_rows, segment_rows = [], [], []

    for token_list in texts:
        sequence = ["[CLS]"] + token_list[:max_len - 2] + ["[SEP]"]
        seq_len = len(sequence)
        padding = [0] * (max_len - seq_len)

        id_rows.append(tokenizer.convert_tokens_to_ids(sequence) + padding)
        mask_rows.append([1] * seq_len + padding)
        segment_rows.append([0] * max_len)

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)

def get_binary(_y, threshold):
    """Binarise an array of scores without modifying the input.

    Args:
        _y: array-like object providing ``.copy()`` and boolean-mask
            assignment (e.g. a numpy array of predicted probabilities).
        threshold: values ``>= threshold`` become 1, values ``< threshold``
            become 0.

    Returns:
        A copy of ``_y`` holding only 0s and 1s (entries that compare as
        neither, such as NaN, are left unchanged).
    """
    y = _y.copy()
    # Compute both masks BEFORE writing: otherwise the freshly written 1s are
    # compared against the threshold again and are reset to 0 whenever
    # threshold > 1.
    at_or_above = y >= threshold
    below = y < threshold
    y[at_or_above] = 1
    y[below] = 0
    return y


def predict(df, mode, threshold):
    """Predict labels for ``df["text"]`` and write them to a CSV file.

    Reads the notebook-level globals ``corp``, ``type``, ``foundations``,
    ``tokenizer`` and ``bert_layer``. The model is loaded from
    ``../models/<corp>_<type>model_<mode>.h5``; texts removed by
    ``pre_process_text`` are left out of the result, which is saved to
    ``../results/<corp>_labels_<type>_<mode>.csv``.

    Args:
        df: DataFrame with a "text" column.
        mode: key into ``foundations[corp]``; also part of the file names.
        threshold: cut-off handed to ``get_binary``.

    Returns:
        0.
    """
    cols = foundations[corp][mode]
    model_file = '../models/' + corp + '_' + type + 'model_' + mode + '.h5'
    model = load_model(model_file, compile=True, custom_objects={"KerasLayer": bert_layer})

    X_raw = list(df["text"])
    X, idx_drop = pre_process_text(X_raw, tokenizer)
    print(len(idx_drop))
    X = bert_encode(X, tokenizer)
    y_pred_proba = model.predict(X)
    y_pred = get_binary(y_pred_proba, threshold)

    # idx_drop holds POSITIONS in the text list, not index labels. Filtering
    # with a positional mask stays correct when df does not carry the default
    # 0..n-1 index, which label-based df.drop(idx_drop) silently assumed.
    keep = np.ones(len(df), dtype=bool)
    keep[idx_drop] = False
    df_dropped = df[keep].copy()
    df_dropped[cols] = pd.DataFrame(y_pred, index=df_dropped.index)

    # save predictions
    df_dropped.to_csv("../results/" + corp + "_labels_" + type + "_" + mode + ".csv", index=False)
    return 0

In [None]:
################ Get parameters and run predictions
# corp = sys.argv[1]
# mode = sys.argv[2]
# type = sys.argv[3]
# threshold = float(sys.argv[4])

# predict() reads `corp` and `type` as globals, so these names must be kept.
corp = "mftc"
mode = "full"
type = "normal"
threshold = 0.5

if type == "cross": #for cross corpus prediction: apply trained model on the other corpus
    if corp == "mftc":
        file_path = "../data/preprocessed/mfrc_cleaned_" + mode + ".csv"
    elif corp == "mfrc":
        file_path = "../data/preprocessed/mftc_cleaned_" + mode + ".csv"
    else:
        # Without this branch file_path stayed undefined and read_csv below
        # failed with an unrelated NameError.
        raise ValueError("unknown corp: " + repr(corp))
elif type == "normal":
    file_path = "../data/preprocessed/" + corp + "_sample_" + mode + ".csv"
else:
    raise ValueError("unknown type: " + repr(type))

# get annotations of texts
file = pd.read_csv(file_path)
predict(file, mode, threshold)