## Categorical Evaluation

Produce Tables 8 and 9: agreement between the human annotators and GPT on protagonist, antagonist, valence, and protagonist type.

In [25]:
import numpy as np
import pandas as pd

from unidecode import unidecode

import scipy
import torch

import os
from typing import Callable, Set

Load the data

In [None]:
# Directory holding the human and GPT annotation CSVs
results_dir = "../data/validation/moral_annotations"

# Load the response data: the 'index' column becomes the row index and
# missing answers are replaced with the literal string "None"
df_eng = pd.read_csv(os.path.join(results_dir, "human_responses_english.csv"), index_col="index")
df_eng = df_eng.fillna("None")
df_gpt_eng = pd.read_csv(os.path.join(results_dir, "gpt_responses_english.csv"), index_col="index")
df_gpt_eng = df_gpt_eng.fillna("None")
df_chin = pd.read_csv(os.path.join(results_dir, "human_responses_mandarin.csv"), index_col="index")
df_chin = df_chin.fillna("None")
df_gpt_chin = pd.read_csv(os.path.join(results_dir, "gpt_responses_mandarin.csv"), index_col="index")
df_gpt_chin = df_gpt_chin.fillna("None")

# Annotator identifiers; the cells below combine them with a category name
# (e.g. "AL_protagonist", "0_valence") to select the matching columns
human_eng_annotators = ['AL', 'AS', 'AS2', 'AZ', 'EA', 'NW']
human_chin_annotators = ['JP', 'VX', 'YN', 'JY']
gpt_eng_annotators = ['0', '1', '2', '3', '4', '5']
gpt_chin_annotators = ['0', '1', '2', '3']

## 1) Protagonist/Antagonist

In [3]:
# Basic list of synonyms for the protagonists/antagonists in the validation dataset.
# The table is consulted in two places: on raw predicted answers (get_agreement_breakdowns)
# and on article-stripped names (clean_word). Because clean_word removes a leading
# "the "/"a "/"an " from every other name, the mapped values must already be in that
# cleaned form -- a value such as 'the united states' could never equal the cleaned
# form ('united states') of the very name it is supposed to stand for.
# NOTE: the last key starts with "the ", so it can only match in the raw lookup,
# never inside clean_word (which strips the article before looking the name up).
name_equivalences = {
    'chinese government': 'china',
    'government of china': 'china',
    'japanese government': 'japan',
    'government of japan': 'japan',
    'european union government': 'eu',
    'us': 'united states',
    'the protagonist of this story is the mouse': 'mouse'
}

def clean_word(word: str) -> str:
    """Normalize a name so that differently written answers can be compared.

    Strips at most one leading article ("the ", "a " or "an "), replaces
    hyphens with spaces, removes accents via ``unidecode`` and finally maps
    the result through ``name_equivalences`` (names without an entry are
    returned in their cleaned form).
    """
    for article in ("the ", "a ", "an "):
        if word.startswith(article):
            word = word[len(article):]
            break   # only the first matching article is removed
    ascii_word = unidecode(word.replace("-", " "))    # remove accents
    return name_equivalences.get(ascii_word, ascii_word)

def get_equivalence_classes(names_list: list[str]) -> dict[str, str]:
    """Map every name in ``names_list`` to a representative (cleaned) name.

    Two names are treated as equivalent when the cleaned form of one is
    contained in the (strictly longer or shorter) cleaned form of the other.
    This is simplistic, but a generally suitable heuristic for the names in
    the validation dataset.

    Args:
        names_list: the raw names to group (any iterable of strings).

    Returns:
        A dict keyed by the raw names. A name whose cleaned form is a
        substring of a later name's cleaned form maps to that later cleaned
        name (the last such match wins); every other name maps to its own
        cleaned form.
    """
    # Shortest raw names first, so each name is only compared with later ones.
    # NOTE(review): the order is by *raw* length while containment is tested on
    # the *cleaned* names; a name that shrinks a lot when cleaned can therefore
    # be missed as a substring of an earlier entry -- confirm this is acceptable.
    ordered = sorted(names_list, key=len)
    # Clean each name once instead of once per comparison
    cleaned = [clean_word(name) for name in ordered]

    equiv_classes = dict()
    for i, (orig_name, short) in enumerate(zip(ordered, cleaned)):
        for longer in cleaned[i + 1:]:
            # equal-length names are either identical or unrelated: nothing to merge
            if len(short) != len(longer) and short in longer:
                equiv_classes[orig_name] = longer
        # no containing name found (now or for an earlier duplicate): keep own cleaned form
        equiv_classes.setdefault(orig_name, short)
    return equiv_classes

def get_agreement_matrix(df_refs: pd.DataFrame, df_preds: pd.DataFrame) -> torch.Tensor:
    """Compute the agreement breakdown of every prediction column against the references.

    Returns a tensor of size (n_examples, 4, n_predictions) where
    ``output[:, :, i]`` is the result of ``get_agreement_breakdowns`` for
    column i of ``df_preds``. The four values per example are, in order:
    "agreement with majority", "agreement with any", "no agreement" and
    "average majority vote". Answers are equated with the scheme applied by
    ``get_agreement_breakdowns`` before agreement is determined.
    """
    agreement_matrix = torch.zeros(df_refs.shape[0], 4, df_preds.shape[1])
    for k, pred_col in enumerate(df_preds.columns):
        agreement_matrix[:, :, k] = get_agreement_breakdowns(df_refs, df_preds[pred_col])
    return agreement_matrix

def get_agreement_breakdowns(df_refs: pd.DataFrame, preds: pd.Series) -> torch.Tensor:
    """Compare one column of predicted answers with the reference answers, row by row.

    Rows of ``df_refs`` and entries of ``preds`` are paired by position.
    For each row the answers are first mapped to equivalent names (see
    ``get_equivalence_classes``) and then compared.

    Returns a tensor of size (n_rows, 4) whose columns are:
    agreed with (one of) the most common reference answer(s), agreed with any
    reference answer, agreed with none, and the share of the reference
    answers taken by the most common one.
    """
    # normalize predicted answers
    preds = preds.apply(lambda answer: name_equivalences.get(answer, answer))

    agreements = torch.zeros(df_refs.shape[0], 4)
    for i, (_, ref_row) in enumerate(df_refs.iterrows()):
        pred_raw = preds.iloc[i]

        # The prediction is added to the reference names so that the
        # equivalence classes account for it as well
        equiv_classes = get_equivalence_classes(np.append(ref_row.values, pred_raw))

        # Translate all answers to their class representative and count the references
        vote_counts = ref_row.map(lambda answer: equiv_classes.get(answer, answer)).value_counts()
        pred_ans = equiv_classes.get(pred_raw, pred_raw)

        # Flags are [agreed_with_majority, agreed_with_any, no_agreement]
        if pred_ans in vote_counts.index:
            with_majority = 1 if vote_counts.loc[pred_ans] == vote_counts.max() else 0
            flags = [with_majority, 1, 0]
        else:
            flags = [0, 0, 1]

        top_share = vote_counts.iloc[0] / vote_counts.sum()
        agreements[i, :] = torch.Tensor(flags + [top_share])
    return agreements

def get_genre_list(df_list: list[pd.DataFrame]) -> pd.Series:
    """Collect the genre of every row of the given dataframes, in order.

    The genre is the part of each index label before the first underscore.
    The returned Series has its own default index, unrelated to the indices
    of the input dataframes.
    """
    genres = []
    for df in df_list:
        genres.extend(label.split("_")[0] for label in df.index)
    return pd.Series(genres)

def get_agreement_by_genre(agreement_matrix: torch.Tensor, genre_series: pd.Series) -> pd.DataFrame:
    """Average the agreement breakdown per genre.

    ``genre_series`` gives the genre of each row (first dimension) of
    ``agreement_matrix``; its index values are used to select those rows.
    For each genre the matrix is averaged over the last dimension and then
    over the selected rows.

    Returns a dataframe with one row per genre and one column per agreement
    category.
    """
    metric_names = ('agree_with_majority', 'any_agreement', 'no_agreement', 'avg_human_popular_vote')
    genre_data = {}
    for genre in genre_series.unique():
        row_positions = genre_series.index[genre_series == genre].to_list()
        averaged = agreement_matrix[row_positions, :, :].mean(2).mean(0)
        genre_data[genre] = {name: averaged[k].item() for k, name in enumerate(metric_names)}
    return pd.DataFrame(genre_data).T

### a) Protagonist

In [24]:
cate = 'protagonist'

# Column names for this category, one per human annotator / GPT run
human_eng_cols = [ann + "_" + cate for ann in human_eng_annotators]
human_chin_cols = [ann + "_" + cate for ann in human_chin_annotators]
gpt_eng_cols = [ann + "_" + cate for ann in gpt_eng_annotators]
gpt_chin_cols = [ann + "_" + cate for ann in gpt_chin_annotators]

num_agreement_cates = 4   # 4 agreement categories: agree_with_majority, any_agreement, no_agreement, avg_human_popular_vote
num_gpt_comparisons = np.min([len(gpt_eng_cols), len(gpt_chin_cols)])
n = df_eng.shape[0] + df_chin.shape[0]

# Agreement of every GPT run with the human annotators, per language
agreement_matrix_eng = get_agreement_matrix(df_eng[human_eng_cols], df_gpt_eng[gpt_eng_cols])
agreement_matrix_chin = get_agreement_matrix(df_chin[human_chin_cols], df_gpt_chin[gpt_chin_cols])

# Keep the same number of GPT runs for both languages so the matrices can be
# stacked along the example dimension (English rows first, then Mandarin)
agreement_matrix = torch.cat(
    [
        agreement_matrix_eng[:, :, :num_gpt_comparisons],
        agreement_matrix_chin[:, :, :num_gpt_comparisons],
    ],
    dim=0,
)

# Average the agreement per genre and express it in percent
genre_series = get_genre_list([df_eng[human_eng_cols], df_chin[human_chin_cols]])
genre_agreement = get_agreement_by_genre(agreement_matrix, genre_series).round(4) * 100

print(cate.capitalize(), "stats")
display(genre_agreement)

Protagonist stats


Unnamed: 0,agree_with_majority,any_agreement,no_agreement,avg_human_popular_vote
Book,95.31,95.31,4.69,94.79
Folktale,81.25,95.31,4.69,78.65
Movies-TV,84.38,85.94,14.06,84.37
News,46.09,61.33,38.67,63.41
Reddit,87.5,93.75,6.25,91.67


### b) Antagonist

In [23]:
cate = 'antagonist'

# Column names for this category, one per human annotator / GPT run
human_eng_cols = ["{}_{}".format(ann, cate) for ann in human_eng_annotators]
human_chin_cols = ["{}_{}".format(ann, cate) for ann in human_chin_annotators]
gpt_eng_cols = ["{}_{}".format(ann, cate) for ann in gpt_eng_annotators]
gpt_chin_cols = ["{}_{}".format(ann, cate) for ann in gpt_chin_annotators]

num_agreement_cates = 4   # 4 agreement categories: agree_with_majority, any_agreement, no_agreement, avg_human_popular_vote
num_gpt_comparisons = np.min([len(gpt_eng_cols), len(gpt_chin_cols)])
n = df_eng.shape[0] + df_chin.shape[0]

# Agreement of every GPT run with the human annotators, per language
agreement_matrix_eng = get_agreement_matrix(df_eng[human_eng_cols], df_gpt_eng[gpt_eng_cols])
agreement_matrix_chin = get_agreement_matrix(df_chin[human_chin_cols], df_gpt_chin[gpt_chin_cols])

# Truncate both languages to the same number of GPT runs, then stack the
# examples (English rows first, then Mandarin)
eng_part = agreement_matrix_eng[:, :, :num_gpt_comparisons]
agreement_matrix = torch.cat([eng_part, agreement_matrix_chin[:, :, :num_gpt_comparisons]], dim=0)
del eng_part

# Average the agreement per genre and express it in percent
genre_series = get_genre_list([df_eng[human_eng_cols], df_chin[human_chin_cols]])
genre_agreement = get_agreement_by_genre(agreement_matrix, genre_series).round(4) * 100

print(cate.capitalize(), "stats")
display(genre_agreement)

Antagonist stats


Unnamed: 0,agree_with_majority,any_agreement,no_agreement,avg_human_popular_vote
Book,56.25,100.0,0.0,54.17
Folktale,77.34,99.22,0.78,72.92
Movies-TV,84.38,98.44,1.56,73.96
News,67.19,96.88,3.12,70.9
Reddit,53.12,75.0,25.0,75.0


## 2) Valence

In [19]:
def get_row_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Compute per-row statistics of ``df``.

    Returns a dataframe with the same index as ``df`` and the columns
    'mode' (a set of ints, containing every tied mode), 'median', 'mean'
    and 'std'.
    """
    # DataFrame.mode returns tied modes in separate, NaN-padded columns;
    # collapse each row into a single set
    mode_sets = [
        set(mode_row.dropna().astype(int))
        for _, mode_row in df.mode(axis=1).iterrows()
    ]
    stats = [
        pd.Series(mode_sets, index=df.index, name='mode'),
        df.median(axis=1).rename('median').to_frame(),
        df.mean(axis=1).rename('mean').to_frame(),
        df.std(axis=1).rename("std"),
    ]
    return pd.concat(stats, axis=1)

def get_inter_valence_dists(df1: pd.DataFrame, df2: pd.DataFrame) -> np.ndarray:
    """Compute row statistics of ``df1`` extended by one column of ``df2`` at a time.

    Args:
        df1: reference ratings, one row per example.
        df2: additional ratings with the same number of rows as ``df1``.

    Returns:
        An object array of shape (df1.shape[0], 4, df2.shape[1]). Slice
        ``[:, :, i]`` holds the mode, median, mean and standard deviation of
        each row (see ``get_row_stats``) of the dataframe obtained by
        appending column i of ``df2`` to ``df1``.
        If the two dataframes have a different number of rows, an error
        message is printed and None is returned.
    """
    if df1.shape[0] != df2.shape[0]:
        print("Error: shape mismatch")
        return None

    num_stats = 4       # mode, median, mean, std

    # dtype=object because the mode entries are sets
    inter_vals = np.zeros((df1.shape[0], num_stats, df2.shape[1]), dtype=object)
    for i, (_, extra_col) in enumerate(df2.items()):
        extended = pd.concat([df1, extra_col], axis=1)
        inter_vals[:, :, i] = get_row_stats(extended).values

    return inter_vals

In [22]:
cate = 'valence'

# Column names for this category, one per human annotator / GPT run
human_eng_cols = [ann + "_" + cate for ann in human_eng_annotators]
human_chin_cols = [ann + "_" + cate for ann in human_chin_annotators]
gpt_eng_cols = [ann + "_" + cate for ann in gpt_eng_annotators]
gpt_chin_cols = [ann + "_" + cate for ann in gpt_chin_annotators]

# Row statistics within each pool of raters: humans only ...
human_valence_stats_eng = get_row_stats(df_eng[human_eng_cols])
human_valence_stats_chin = get_row_stats(df_chin[human_chin_cols])
# ... and GPT runs only
gpt_valence_stats_eng = get_row_stats(df_gpt_eng[gpt_eng_cols])
gpt_valence_stats_chin = get_row_stats(df_gpt_chin[gpt_chin_cols])

# Stack English on top of Mandarin
human_valence_stats = pd.concat([human_valence_stats_eng, human_valence_stats_chin], axis=0)
gpt_valence_stats = pd.concat([gpt_valence_stats_eng, gpt_valence_stats_chin], axis=0)

# Row statistics of the human ratings with one GPT run added at a time;
# both languages are cut to the same number of GPT runs before stacking
inter_valence_stats_eng = get_inter_valence_dists(df_eng[human_eng_cols], df_gpt_eng[gpt_eng_cols])
inter_valence_stats_chin = get_inter_valence_dists(df_chin[human_chin_cols], df_gpt_chin[gpt_chin_cols])
min_num_annotators = np.min([inter_valence_stats_eng.shape[2], inter_valence_stats_chin.shape[2]])
inter_valence_stats = np.concatenate(
    [
        inter_valence_stats_eng[:, :, :min_num_annotators],
        inter_valence_stats_chin[:, :, :min_num_annotators],
    ],
    axis=0,
)

# Average standard deviation per genre (the std is the last of the four statistics)
genre_series = get_genre_list([df_eng[human_eng_cols], df_chin[human_chin_cols]])
genre_dict = {genre: genre_series.index[genre_series == genre].to_list() for genre in genre_series.unique()}
val_data = {'human-human': dict(), 'human-GPT': dict(), 'GPT-GPT': dict()}
for genre, genre_idx in genre_dict.items():
    val_data['human-human'][genre] = human_valence_stats['std'].iloc[genre_idx].mean()
    val_data['human-GPT'][genre] = inter_valence_stats[genre_idx][:, -1, :].mean(axis=1).mean()
    val_data['GPT-GPT'][genre] = gpt_valence_stats['std'].iloc[genre_idx].mean()
val_df = pd.DataFrame(val_data).round(2)

print(cate.capitalize(), 'distribution')
display(val_df)

Valence distribution


Unnamed: 0,human-human,human-GPT,GPT-GPT
Book,0.77,0.77,0.11
Folktale,0.75,0.86,0.16
Movies-TV,0.79,0.83,0.21
News,0.68,0.66,0.08
Reddit,0.77,0.79,0.0


## 3) Protagonist Type

In [None]:
def compute_intra_scores(df: pd.DataFrame, score_func: Callable[..., list[float]], **kwargs) -> torch.Tensor:
    """Score every unordered pair of columns of ``df`` against each other.

    Args:
        df: dataframe whose columns are compared pairwise.
        score_func: called as ``score_func(preds, refs, **kwargs)`` with two
            equally long lists (the later column as ``preds``, the earlier
            one as ``refs``); must return one score per row.
        **kwargs: forwarded unchanged to ``score_func``.

    Returns:
        A flat tensor of length n_rows * C(n_cols, 2). Before flattening,
        entry (r, k) is the score of row r for the k-th column pair, with
        pairs ordered (0, 1), (0, 2), ..., (1, 2), ...
    """
    columns = [df[name].tolist() for name in df.columns]
    n_cols = len(columns)
    num_combos = n_cols * (n_cols - 1) // 2    # number of unordered column pairs
    scores = torch.zeros(df.shape[0], num_combos)

    k = 0
    for i in range(n_cols - 1):
        for j in range(i + 1, n_cols):
            scores[:, k] = torch.Tensor(score_func(columns[j], columns[i], **kwargs))
            k += 1
    return scores.flatten()

def compute_inter_scores(
        df1: pd.DataFrame, 
        df2: pd.DataFrame, 
        score_func: Callable[..., list[float]], 
        **kwargs
    ) -> torch.Tensor:
    """Score every column of ``df1`` against every column of ``df2``.

    Args:
        df1: dataframe whose columns serve as references.
        df2: dataframe whose columns serve as predictions; must have the
            same number of rows as ``df1``.
        score_func: called as ``score_func(preds, refs, **kwargs)`` with two
            equally long lists; must return one score per row.
        **kwargs: forwarded unchanged to ``score_func``.

    Returns:
        A flat tensor of length n_rows * n_cols1 * n_cols2. Before
        flattening, entry (r, i * n_cols2 + j) is the score of row r for
        column i of ``df1`` versus column j of ``df2``.
        If the row counts differ, an error message is printed and None is
        returned.
    """
    if df1.shape[0] != df2.shape[0]:
        print("Error: size mismatch")
        return None

    ref_columns = [df1[name].tolist() for name in df1.columns]
    pred_columns = [df2[name].tolist() for name in df2.columns]
    scores = torch.zeros(df1.shape[0], len(ref_columns) * len(pred_columns))

    k = 0
    for refs in ref_columns:
        for preds in pred_columns:
            scores[:, k] = torch.Tensor(score_func(preds, refs, **kwargs))
            k += 1
    return scores.flatten()

def get_distributions(
        df_subset1: pd.DataFrame, 
        df_subset2: pd.DataFrame, 
        score_funcs: dict[str, dict]
    ) -> dict[str, dict[str, torch.Tensor]]:
    """Compute the score distributions within and between two sets of answers.

    In this notebook ``df_subset1`` holds the human answers and
    ``df_subset2`` the GPT answers.

    Args:
        df_subset1: first set of answers, one column per annotator.
        df_subset2: second set of answers, one column per annotator.
        score_funcs: maps a score name to a dict with the entries 'func'
            (the scoring function) and 'kwargs' (its keyword arguments).

    Returns:
        A dict with the keys '1-1' (pairs within ``df_subset1``), '2-2'
        (pairs within ``df_subset2``) and '1-2' (pairs across both); each
        maps the score names to the flat tensor returned by
        ``compute_intra_scores`` / ``compute_inter_scores``.
    """
    data = {'1-1': dict(), '2-2': dict(), '1-2': dict()}
    for score_name, spec in score_funcs.items():
        func, func_kwargs = spec['func'], spec['kwargs']
        data['1-1'][score_name] = compute_intra_scores(df_subset1, func, **func_kwargs)
        data['2-2'][score_name] = compute_intra_scores(df_subset2, func, **func_kwargs)
        data['1-2'][score_name] = compute_inter_scores(df_subset1, df_subset2, func, **func_kwargs)
    return data

def reshape_dists(dists: dict, cols1: list, cols2: list) -> dict:
    """Reshape the flat score tensors back to (n_rows, n_column_pairs).

    Every score stored under '1-1', '2-2' and '1-2' is reshaped (not only
    'Jaccard'), so that any score produced by ``get_distributions`` can be
    averaged per row afterwards.

    Args:
        dists: dict with the keys '1-1', '2-2' and '1-2', each mapping score
            names to flat tensors. Modified in place.
        cols1: columns of the first subset (only its length is used).
        cols2: columns of the second subset (only its length is used).

    Returns:
        The same ``dists`` object, with tensors of width C(len(cols1), 2),
        C(len(cols2), 2) and len(cols1) * len(cols2) respectively.
    """
    n1, n2 = len(cols1), len(cols2)
    widths = {
        '1-1': n1 * (n1 - 1) // 2,    # unordered pairs within subset 1
        '2-2': n2 * (n2 - 1) // 2,    # unordered pairs within subset 2
        '1-2': n1 * n2,               # all cross pairs
    }
    for pair_key, width in widths.items():
        for score_name in list(dists[pair_key]):
            dists[pair_key][score_name] = dists[pair_key][score_name].reshape((-1, width))
    return dists

def jaccard_similarity(set1: Set, set2: Set) -> float:
    """Compute the Jaccard similarity |set1 & set2| / |set1 | set2|.

    Two empty sets are treated as identical and yield 1.0 (instead of
    raising ZeroDivisionError on the empty union).
    """
    union_size = len(set1.union(set2))
    if union_size == 0:
        return 1.0
    return len(set1.intersection(set2)) / union_size

def get_jaccard_sims_series(list1: pd.Series, list2: pd.Series) -> list[float]:
    """Compute the Jaccard similarity of each pair of sets taken from the two inputs.

    The inputs are paired element by element and must have the same length;
    the result is a list of floats of that length. On a length mismatch an
    error message is printed and None is returned.
    """
    if len(list1) != len(list2):
        print("Error: Size mismatch")
        return
    return [jaccard_similarity(set1, set2) for set1, set2 in zip(list1, list2)]

In [263]:
cate = 'protagonist_type'

# Name and specify the scoring functions (as well as their kwargs)
score_funcs = {
    'Jaccard': {'func': get_jaccard_sims_series, 'kwargs': {}}
}

# Column names for this category, one per human annotator / GPT run
human_eng_cols = [ann + "_" + cate for ann in human_eng_annotators]
human_chin_cols = [ann + "_" + cate for ann in human_chin_annotators]
gpt_eng_cols = [ann + "_" + cate for ann in gpt_eng_annotators]
gpt_chin_cols = [ann + "_" + cate for ann in gpt_chin_annotators]

# Get English and Mandarin distributions; each answer is split on ", " into a set first
eng_dists = get_distributions(
    df_eng[human_eng_cols].map(lambda x: set(x.split(", "))),
    df_gpt_eng[gpt_eng_cols].map(lambda x: set(x.split(", "))),
    score_funcs,
)
chin_dists = get_distributions(
    df_chin[human_chin_cols].map(lambda x: set(x.split(", "))),
    df_gpt_chin[gpt_chin_cols].map(lambda x: set(x.split(", "))),
    score_funcs,
)

# Bring the flat score tensors back to (n_rows, n_column_pairs)
eng_dists = reshape_dists(eng_dists, human_eng_cols, gpt_eng_cols)
chin_dists = reshape_dists(chin_dists, human_chin_cols, gpt_chin_cols)

# Average over the column pairs of each row, then stack English on top of Mandarin
total_dists = {'1-1': dict(), '2-2': dict(), '1-2': dict()}
for dist in total_dists:
    for score_name in score_funcs:
        total_dists[dist][score_name] = torch.cat(
            (
                eng_dists[dist][score_name].mean(axis=1),
                chin_dists[dist][score_name].mean(axis=1),
            )
        )

# Replace the generic '1'/'2' keys with readable names, in display order
subset_names = ['human', 'GPT']
idx_map = {
    '1-1': "-".join([subset_names[0], subset_names[0]]),
    '2-2': "-".join([subset_names[1], subset_names[1]]),
    '1-2': "-".join([subset_names[0], subset_names[1]]),
}
total_dists = {idx_map[idx]: total_dists[idx] for idx in ['1-1', '1-2', '2-2']}

# Compute the mean Jaccard index by genre
genre_series = get_genre_list([df_eng[human_eng_cols], df_chin[human_chin_cols]])
genre_dict = {genre: genre_series.index[genre_series == genre].to_list() for genre in genre_series.unique()}
genre_data = {col_type: dict() for col_type in total_dists}
for col_type in total_dists:
    col_scores = total_dists[col_type]['Jaccard']
    for genre, genre_rows in genre_dict.items():
        genre_data[col_type][genre] = col_scores[genre_rows].mean().item()
# Optional significance test (disabled):
# genre_data['p-value'] = {genre: scipy.stats.mannwhitneyu(
#         total_dists['human-human']['Jaccard'][genre_dict[genre]],
#         total_dists['human-GPT']['Jaccard'][genre_dict[genre]]
#     ).pvalue for genre in genre_dict}
prot_type_by_genre = pd.DataFrame(genre_data)

print("Protagonist Type distribution")
display(prot_type_by_genre.round(4) * 100)

Protagonist Type distribution


Unnamed: 0,human-human,human-GPT,GPT-GPT
Book,57.92,54.4,91.04
Folktale,60.1,53.83,94.06
Movies-TV,61.04,50.43,93.54
News,46.77,44.77,90.31
Reddit,36.46,38.63,92.08
