# MTurk Response Analysis

Analysis of the inter-annotator agreement from the MTurk survey in which human and GPT morals were selected as either most or least applicable.

Reproduces Tables 4 and 10 in our paper.

In [None]:
import numpy as np
import pandas as pd

import os

from scipy.stats import chisquare
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters

idx = pd.IndexSlice

In [2]:
# Relative location and name of the raw MTurk response file.
filepath = '../data/validation/mturk'
filename = 'mturk_responses.csv'

# Read the survey responses into a single dataframe.
responses_csv = os.path.join(filepath, filename)
df = pd.read_csv(responses_csv)

Collect the MTurker responses

In [3]:
# Columns of the MTurk response file that hold the workers' answers.
answer_cols = [
    'central_topic__best',
    'central_topic__worst',
    'comprehension',
    'moral__best',
    'moral__worst',
    'moral_neg__best',
    'moral_neg__worst',
    'moral_pos__best',
    'moral_pos__worst',
]

# Keep the story identifiers plus the answers, ordered by story so that the
# responses belonging to one story sit next to each other.
kept_cols = ['story_type', 'file_index', 'story_subtype'] + answer_cols
mturker_responses = df.sort_values(by=['story_type', 'file_index'])[kept_cols]

# Reduce every answer to the text before its first underscore, i.e. the
# source that was selected (GPT, human1 or human2).
selected_source = mturker_responses.loc[:, answer_cols].map(lambda answer: answer.split("_")[0])
mturker_responses.loc[:, answer_cols] = selected_source

# Flag, per response and question, whether the GPT option was the one selected.
gpt_selection = mturker_responses.copy()
is_gpt = gpt_selection.loc[:, answer_cols].map(lambda source: source == 'GPT')
gpt_selection.loc[:, answer_cols] = is_gpt

In [4]:
# `ans` holds exactly the per-response selections already built as
# `mturker_responses` (same sort order, same columns, same reduction of each
# answer to its selected source), so reuse that frame instead of recomputing it.
ans = mturker_responses.copy()

# get when GPT was selected
gpt_selection = ans.copy()
gpt_selection.loc[:, answer_cols] = gpt_selection.loc[:, answer_cols].map(lambda s: s == 'GPT')

Create a dataframe indicating when GPT was selected by the majority of annotators for each question

In [5]:
# dictionary of standardized column names
standard_cates = {
    'moral': 'Moral',
    'moral_neg': 'Negative Moral',
    'moral_pos': 'Positive Moral',
    'central_topic': 'Central Topic'
}

# specify the order of the question categories
cates_order = ['Moral', 'Positive Moral', 'Negative Moral', 'Central Topic']

# the genre of each text, taken from the prefix of 'Input.index'.
# The ids are read from the frame sorted by (story_type, file_index) because
# that is the row order of `gpt_selection`, and therefore of the per-story vote
# arrays built below; reading them from the unsorted `df` could misalign genres
# and votes whenever the CSV is not already in that order.
# NOTE(review): presumes each 'Input.index' value identifies one
# (story_type, file_index) pair — confirm against the data.
sorted_story_ids = df.sort_values(by=['story_type', 'file_index'])['Input.index'].unique()
genre_type = [story_id.split("_")[0] for story_id in sorted_story_ids]

# whether GPT was selected by the majority (at least 2) of the 3 annotators
majority_vote = {}   # uses easy to read tuples as keys
orig_col_majority_vote = {}   # same as majority_vote, but using the original column names as keys
for col in answer_cols:
    # the comprehension column has no best/worst suffix, so it gets a placeholder type
    # (`type_` rather than `type`, so the builtin is not shadowed for later cells)
    if col != 'comprehension':
        cate, type_ = col.split("__")
    else:
        cate, type_ = 'comprehension', 'none'

    # standardize the category name
    cate = standard_cates.get(cate, cate)

    # one row per story, one column per annotator
    # (assumes every story has exactly 3 consecutive responses after sorting — TODO confirm)
    votes_per_story = gpt_selection[col].values.reshape((-1, 3))
    gpt_has_majority = votes_per_story.sum(axis=1) > 1

    majority_vote[(cate, type_)] = gpt_has_majority
    orig_col_majority_vote[col] = gpt_has_majority

df_col_idx = pd.MultiIndex.from_tuples(majority_vote, names=['Category', 'Type'])
df_idx = pd.Index(genre_type, name='genre')

df_gpt_majority_vote = pd.DataFrame(majority_vote, columns=df_col_idx, index=df_idx)

### 1) Fleiss Kappa

Get the Fleiss Kappa values for all answer columns

In [6]:
def get_fleiss_kappa(annotator_selections_matrix):
    """Return Fleiss' kappa for a matrix of annotator selections.

    The matrix is first passed through `aggregate_raters`, and the first
    element of its result (the aggregated counts table) is handed to
    `fleiss_kappa`.
    """
    aggregated = aggregate_raters(annotator_selections_matrix)
    category_counts = aggregated[0]
    return fleiss_kappa(category_counts)

def get_krippendorff_alpha(annotator_response_matrix):
    """Return Krippendorff's alpha (nominal level) for a matrix of annotator responses.

    Parameters
    ----------
    annotator_response_matrix : array-like
        One row per annotated item and one column per annotator, holding the
        option labels understood by `numerize_options`.

    Returns
    -------
    The value returned by `kd.alpha` for the numerically encoded responses.
    """
    # Use the matrix that was passed in; the previous body ignored the argument
    # and read the notebook globals `mturker_responses[col]` instead.
    # Transposed so that rows are annotators and columns are items.
    reliability_data = np.asarray(annotator_response_matrix).T
    reliability_data = np_numerize_options(reliability_data)
    # NOTE(review): `kd` is not imported anywhere in the visible notebook —
    # presumably `import krippendorff as kd`; confirm and add it to the import
    # cell before calling this function.
    return kd.alpha(reliability_data, level_of_measurement="nominal")

def numerize_options(x):
    """Translate an option label into its integer code.

    'GPT' -> 0, 'human1' -> 1, 'human2' -> 2. Any other label raises KeyError.
    """
    labels = ('GPT', 'human1', 'human2')
    code_of = {label: code for code, label in enumerate(labels)}
    return code_of[x]

# element-wise version of `numerize_options` for numpy arrays
np_numerize_options = np.vectorize(numerize_options)

In [None]:
kappa_data = []
for col in answer_cols:

    # split the column name into question category and best/worst type
    # (the comprehension column has no type suffix)
    if col == 'comprehension':
        cate, type_ = col, 'none'
    else:
        cate, type_ = col.split("__")

    # compute the Fleiss value: one row per story, one column per annotator
    annotator_response_matrix = mturker_responses[col].values.reshape(-1, 3)
    # fleiss_kappa evaluates (p_mean - p_mean_exp) / (1 - p_mean_exp), which is
    # 0/0 (NaN plus a RuntimeWarning) when the expected agreement is 1.
    # The NaN is handled explicitly below, so silence the warning here.
    with np.errstate(divide='ignore', invalid='ignore'):
        kappa = get_fleiss_kappa(annotator_response_matrix)

    # Krippendorff's alpha is not computed here; see get_krippendorff_alpha.
    kappa_data.append({
        # raw string: '\k' is not a valid escape sequence in a normal string literal
        # NaN kappa is reported as 1
        r'Fleiss $\kappa$': kappa if not np.isnan(kappa) else 1,
        'Category': cate,
        'Type': type_,
    })

df_fleiss = pd.DataFrame(kappa_data).set_index(['Category']).rename(index=standard_cates)

  kappa = (p_mean - p_mean_exp) / (1- p_mean_exp)


In [8]:
# every coefficient column, i.e. everything except the best/worst marker
rel_cols_fleiss = [name for name in df_fleiss.columns if name != 'Type']

# row masks for the "most applicable" and "least applicable" questions
fleiss_is_best = df_fleiss['Type'] == 'best'
fleiss_is_worst = df_fleiss['Type'] == 'worst'

# coefficients per question type, with the categories in presentation order
best_fleiss = df_fleiss.loc[fleiss_is_best, rel_cols_fleiss].loc[cates_order]
worst_fleiss = df_fleiss.loc[fleiss_is_worst, rel_cols_fleiss].loc[cates_order]

### 2) Agreement Breakdowns

Get the percent agreement breakdowns for all answer columns

In [9]:
# number of stories: every story has 3 responses (one per annotator).
# Taken from `mturker_responses`, the same frame the breakdowns are computed from.
n = mturker_responses.shape[0] / 3

index = []
data = []
for col in answer_cols:

    # index information
    if col != 'comprehension':
        cate, type_ = col.split("__")
        cate = standard_cates.get(cate, cate)
    else:
        # label spelled consistently with the column name (was 'comprenhension')
        cate, type_ = 'comprehension', 'none'
    index.append((cate, type_))

    # get breakdowns: per story, the size of the largest group of annotators
    # that chose the same option
    category_counts = aggregate_raters(mturker_responses[col].values.reshape(-1, 3))[0]
    highest_agreement = category_counts.max(axis=1)   # number of agreements per story
    values, counts = np.unique(highest_agreement, return_counts=True)

    # fraction of stories at each agreement level
    data.append({value: count / n for value, count in zip(values, counts)})

breakdown_idx_names = pd.MultiIndex.from_tuples(index, names=['Category', 'Type'])

# agreement levels that never occur for a question become 0
breakdowns = pd.DataFrame(data, index=breakdown_idx_names).fillna(0).reset_index().set_index("Category")

In [10]:
# every agreement-level column, i.e. everything except the best/worst marker
rel_cols_breakdown = [name for name in breakdowns.columns if name != 'Type']

# row masks for the "most applicable" and "least applicable" questions
breakdown_is_best = breakdowns['Type'] == 'best'
breakdown_is_worst = breakdowns['Type'] == 'worst'

# breakdowns per question type, in presentation order, expressed as percentages
best_breakdown = breakdowns.loc[breakdown_is_best, rel_cols_breakdown].loc[cates_order].round(4) * 100
worst_breakdown = breakdowns.loc[breakdown_is_worst, rel_cols_breakdown].loc[cates_order].round(4) * 100

### 3) GPT Majority Percentage

The percentage of the time that GPT was selected by the majority of MTurkers as the most or least applicable.

In [11]:
# fraction of stories in which GPT won the majority vote, per (Category, Type) question
majority_rate = df_gpt_majority_vote.mean().rename('mean')

# flatten the (Category, Type) index into columns and key the rows by category
gpt_accs = majority_rate.reset_index().set_index('Category')

In [12]:
# majority-vote rates for each question type, with the categories in presentation order
best_majority_rate = gpt_accs[gpt_accs['Type'] == 'best'].loc[cates_order, 'mean']
worst_majority_rate = gpt_accs[gpt_accs['Type'] == 'worst'].loc[cates_order, 'mean']

# express the rates as percentages
gpt_best_accs = best_majority_rate.astype(float).round(4).mul(100).rename("GPT Accuracy")
gpt_worst_accs = worst_majority_rate.astype(float).round(4).mul(100).rename("GPT Accuracy")

### 4) $\chi^{2}$ Goodness of Fit

Compute p-values for the $\chi^{2}$ goodness-of-fit test comparing the rate at which GPT was selected by the majority of annotators against random selection, i.e. $P(X \ge 2) = 7/27$ for $X \sim \text{Bin}(3, 1/3)$.

In [13]:
# majority-vote flags keyed by the original answer column names, one row per story
idx_name = pd.Index(genre_type, name='genre')
df_gpt_majority_vote_orig = pd.DataFrame(orig_col_majority_vote, columns=answer_cols, index=idx_name)
n = df_gpt_majority_vote_orig.shape[0]

# expected counts: the same for every question, so computed once up front
expected_GPT = n*7/27
expected_not_GPT = n*20/27
f_expected = [expected_GPT, expected_not_GPT]

chi2_fit_data = {}
for col in answer_cols:

    # only the best/worst questions are tested
    if col != 'comprehension':
        cate, type_ = col.split("__")
        cate = standard_cates.get(cate, cate)

        # observed counts: stories where GPT did / did not win the majority vote
        observed_GPT = df_gpt_majority_vote_orig[col].sum()
        observed_not_GPT = (~df_gpt_majority_vote_orig[col].astype(bool)).sum()
        f_observed = [observed_GPT, observed_not_GPT]

        # goodness of fit test; only the p-value is kept
        p_value = chisquare(f_obs=f_observed, f_exp=f_expected).pvalue
        chi2_fit_data.setdefault(cate, dict())[type_] = p_value

# categories as rows (in presentation order), best/worst as columns
chi2_fit = pd.DataFrame(chi2_fit_data).T.loc[cates_order].round(5)

In [14]:
# X^2 values for both most and least applicable
chi2_vals_best = chi2_fit['best'].rename("$\chi^{2}$").round(5)
chi2_vals_worst = chi2_fit['worst'].rename("$\chi^{2}$").round(5)

## Table 4

In [15]:
# Table 4: agreement breakdown, Fleiss kappa, GPT majority rate and
# goodness-of-fit p-value side by side, once per question type.
table4_parts = {
    "Most Applicable (Table 4)": [best_breakdown, best_fleiss, gpt_best_accs, chi2_vals_best],
    "Least Applicable (Table 4)": [worst_breakdown, worst_fleiss, gpt_worst_accs, chi2_vals_worst],
}
for caption, parts in table4_parts.items():
    print(caption)
    display(pd.concat(parts, axis=1))

Most Applicable (Table 4)


Unnamed: 0,1,2,3,Fleiss $\kappa$,Krippendorff,GPT Accuracy,$\chi^{2}$
Moral,13.89,59.03,27.08,0.012169,0.014456,68.06,0.0
Positive Moral,14.58,65.28,20.14,0.000535,0.002848,60.42,0.0
Negative Moral,16.67,65.97,17.36,0.031871,0.034112,52.78,0.0
Central Topic,9.72,61.81,28.47,0.085116,0.087234,67.36,0.0


Least Applicable (Table 4)


Unnamed: 0,1,2,3,Fleiss $\kappa$,Krippendorff,GPT Accuracy,$\chi^{2}$
Moral,16.67,58.33,25.0,0.123362,0.125391,11.11,5e-05
Positive Moral,20.83,63.19,15.97,0.037815,0.040042,11.81,0.00011
Negative Moral,23.61,63.19,13.19,-0.004914,-0.002588,15.97,0.00642
Central Topic,13.19,61.81,25.0,0.13404,0.136044,7.64,0.0


## Table 10

Percent of passages by genre where the GPT response was selected by a majority of AMT workers

In [16]:
# accuracy of GPT selection by genre
gpt_genre_accs = df_gpt_majority_vote.groupby('genre').mean().T.loc[cates_order].reset_index().set_index('Category')

In [17]:
# every genre column, i.e. everything except the best/worst marker
genre_cols = [name for name in gpt_genre_accs.columns if name != 'Type']

# row masks for the "most applicable" and "least applicable" questions
genre_is_best = gpt_genre_accs['Type'] == 'best'
genre_is_worst = gpt_genre_accs['Type'] == 'worst'

# per-genre rates for each question type, in presentation order, as percentages
best_accs_by_genre = gpt_genre_accs.loc[genre_is_best, genre_cols].loc[cates_order].round(4) * 100
worst_accs_by_genre = gpt_genre_accs.loc[genre_is_worst, genre_cols].loc[cates_order].round(4) * 100

In [18]:
# Table 10: per-genre GPT majority rates, once per question type.
table10_parts = {
    "Most Applicable by Genre (Table 10)": best_accs_by_genre,
    "Least Applicable by Genre (Table 10)": worst_accs_by_genre,
}
for caption, table in table10_parts.items():
    print(caption)
    display(table)

Most Applicable by Genre (Table 10)


genre,Book,Folktale,Movies-TV,News,Reddit
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Moral,62.5,78.12,56.25,73.44,43.75
Positive Moral,56.25,62.5,62.5,57.81,68.75
Negative Moral,56.25,50.0,62.5,51.56,50.0
Central Topic,75.0,65.62,37.5,73.44,68.75


Least Applicable by Genre (Table 10)


genre,Book,Folktale,Movies-TV,News,Reddit
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Moral,12.5,6.25,25.0,7.81,18.75
Positive Moral,6.25,12.5,12.5,12.5,12.5
Negative Moral,18.75,12.5,12.5,17.19,18.75
Central Topic,6.25,3.12,25.0,7.81,0.0
