## Load Packages

In [3]:
import os
import pandas as pd
import pickle
import time

from sklearn.metrics import f1_score

from retry import retry
import logging
logging.basicConfig()

## General Parameters

In [5]:
# Dataset and sample variant to evaluate; together they select the input files.
data = "mfrc"
mode = "full"
folder = "../data/preprocessed/"

# Build every input path from `folder` so the data location is configured in
# exactly one place (previously the loads repeated the literal directory).
path = folder + data + "_sample_" + mode + ".csv"
meta_path = folder + data + "_meta_sample_" + mode + ".csv"
pred_path = "../results/predictions/llama2_" + data + "_labels_" + mode + ".csv"

# load ground truth, metadata and predictions
df_groundtruth = pd.read_csv(path)
df_meta = pd.read_csv(meta_path)
df_pred = pd.read_csv(pred_path)

# Label columns: every column of the prediction file after the first one.
# NOTE(review): assumes the first column is not a label (presumably "text") — confirm.
cols = df_pred.columns[1:].tolist()

## Evaluate Performance and Prepare Bias Analyses

In [9]:
def _annotated_label_predicted(row):
    """Return True when the column named by this row's "annotation" value equals 1."""
    return row[row["annotation"]] == 1


# Macro-averaged F1 across the label columns.
# NOTE(review): the score compares the two frames row by row, so it relies on
# df_groundtruth and df_pred sharing the same row order — confirm.
y_true = df_groundtruth[cols].to_numpy()
y_pred = df_pred[cols]
macro_f1 = f1_score(y_true, y_pred, average="macro")
print(macro_f1)

# Attach the predicted labels to the metadata rows via the shared "text" column.
predicted_labels = df_pred[[*cols, "text"]]
df_total = pd.merge(df_meta, predicted_labels, on="text")

# Flag each row whose annotated label was also predicted.
df_total["success"] = df_total.apply(_annotated_label_predicted, axis=1)

# Persist the merged table (with the success flag) for the follow-up analyses.
success_path = "../results/evals/llama2_" + data + "_success_" + mode + ".csv"
df_total.to_csv(success_path, index=False)

0.23093324498672968


## Additional Information

Use these as robustness checks: the per-foundation label frequencies in the predictions should not deviate too much from those in the ground truth.

### Distribution of foundations across predictions

In [11]:
# Fraction of rows in the merged table for which each label column is set.
# NOTE(review): this is computed on df_total (df_meta merged with the predictions),
# whereas the ground-truth distribution uses df_groundtruth; if a text appears on
# several df_meta rows it is counted once per row here — confirm this is intended.
predicted_share = df_total[cols].sum(axis=0) / len(df_total)
print(predicted_share)

care               0.289783
proportionality    0.146079
loyalty            0.284937
authority          0.133716
purity             0.200969
equality           0.087825
thin morality      0.000000
non-moral          0.639304
dtype: float64


### Distribution of foundations in the ground truth

In [12]:
# Fraction of ground-truth rows carrying each label; compare against the
# prediction distribution (the closer the two are, the better the classifier).
groundtruth_share = df_groundtruth[cols].sum(axis=0) / len(df_groundtruth)
print(groundtruth_share)

care               0.113309
proportionality    0.037881
loyalty            0.035870
authority          0.048274
purity             0.018773
equality           0.077774
thin morality      0.095877
non-moral          0.646329
dtype: float64
