In [None]:
from utils import *
from utils.dataloader_legacy import load_data

# Load the non-adjudicated SALSA training annotations. The cells below iterate over
# `data` and read each record through keys such as 'id', 'original', 'simplified',
# 'edits', 'score' and 'processed_annotations'.
data = load_data('../data/salsa-non-adjudicated/salsa_train.json')

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd

def token_level_disagreement(data, collapse_composite=False, majority_disagreement=False):
    """Collect pairs of conflicting edit-type labels assigned to the same token.

    For every sentence id in ``range(max_id + 1)`` the per-token annotations of the
    original and the simplified sentence are fetched with ``get_annotations_per_token``.
    Each token's ``{edit_type: count}`` mapping is expanded into a list of labels (every
    edit type repeated ``count`` times) and padded with ``None`` up to three slots.

    The slots follow the iteration order of the mapping, so a slot is not tied to one
    particular annotator, and only the first three slots are ever compared.
    Presumably ``count`` is the number of annotators who chose that edit type --
    verify against ``get_annotations_per_token``.

    Args:
        data: iterable of sentence records; each one is read via ``['id']`` and handed
            on to ``get_annotations_per_token``.
        collapse_composite: forwarded unchanged to ``get_annotations_per_token``.
        majority_disagreement: if True, report one ``[majority, minority]`` pair per
            token where two slots agree and the third holds a different label. If False,
            report every pair of slots ((0, 1), (1, 2), (0, 2)) whose labels differ.

    Returns:
        A list of two-element lists ``[label_a, label_b]``. Both labels are never
        ``None`` and are never equal. An empty dataset yields an empty list.
    """
    n_slots = 3  # the pairing logic below is written for exactly three label slots

    # Bucket the sentences by id in a single pass instead of rescanning `data` per id.
    sents_by_id = {}
    for sent in data:
        sents_by_id.setdefault(sent['id'], []).append(sent)

    # Nothing to compare; also avoids max() raising ValueError on an empty dataset.
    if not sents_by_id:
        return []

    total_agreement = []
    MAX_HIT = max(sents_by_id) + 1
    for sent_id in range(0, MAX_HIT):
        # Ids missing from the dataset are still passed on as an empty list.
        sents = sents_by_id.get(sent_id, [])
        orig_tokens = get_annotations_per_token(sents, 'original', collapse_composite=collapse_composite)
        simp_tokens = get_annotations_per_token(sents, 'simplified', collapse_composite=collapse_composite)

        # One list of label slots per annotated token
        arr_edits = []
        for edits in list(orig_tokens.values()) + list(simp_tokens.values()):
            labels = [edit_type for edit_type, count in edits.items() for _ in range(count)]
            # A negative multiplier yields [], so tokens with more than three labels are not padded
            labels += [None] * (n_slots - len(labels))
            arr_edits.append(labels)

        disagreement = []
        for labels in arr_edits:
            first, second, third = labels[:n_slots]

            if majority_disagreement:
                # Two slots agree and one differs: report majority vs. minority
                if first == second:
                    majority, minority = first, third
                elif second == third:
                    majority, minority = second, first
                elif first == third:
                    majority, minority = first, second
                else:
                    # All three slots differ, so there is no majority label
                    continue

                if majority is None or minority is None or majority == minority:
                    continue
                disagreement.append([majority, minority])
            else:
                # Pairwise comparison of the three slots
                for left, right in ((first, second), (second, third), (first, third)):
                    # A pair is dropped as soon as either slot is unlabelled, so only
                    # label-vs-label conflicts are reported
                    if left is None or right is None:
                        continue
                    # Agreeing pairs carry no disagreement information
                    if left == right:
                        continue
                    disagreement.append([left, right])

        total_agreement += disagreement
    return total_agreement

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(7, 3.5))

# Both panels share one colour scale so their cell intensities are comparable
vmin, vmax = 0, 2500
color_bar = sn.color_palette("Reds", 20)

# Row/column ordering of the confusion matrices
edit_type_labels = ['insertion', 'deletion', 'substitution', 'reorder', 'split', 'structure']
constituent_labels = [x for x in edit_type_labels if x not in ['structure', 'split']]

# (axis, collapse_composite, matrix labels, draw colour bar, panel title)
panels = [
    (ax1, False, edit_type_labels, False, 'Using composite edits'),
    (ax2, True, constituent_labels, True, 'Using constituent edits'),
]

for panel_ax, collapse, panel_labels, show_cbar, _ in panels:
    disagreement = token_level_disagreement(data, collapse_composite=collapse)
    a = confusion_matrix(
        [x[0] for x in disagreement],
        [x[1] for x in disagreement],
        labels=panel_labels,
    )
    display_names = [x.capitalize() for x in panel_labels]
    df_cm = pd.DataFrame(a, index=display_names, columns=display_names)
    sn.heatmap(
        df_cm,
        cmap=color_bar,
        ax=panel_ax,
        vmin=vmin,
        vmax=vmax,
        square=True,
        cbar=show_cbar,
        linewidth=.5,
    )

for panel_ax, _, _, _, panel_title in panels:
    panel_ax.set_xticklabels(panel_ax.get_xticklabels(), rotation=45, ha="right")
    panel_ax.set_title(panel_title, pad=12, fontsize=14)

fig.tight_layout()

# Write the figure to disk before showing it
out_filename = "../paper/plot/agreement-heatmap.pdf"
plt.savefig(out_filename, format="pdf", bbox_inches='tight', pad_inches=0.0)

plt.show()

In [None]:
# Rough span-agreement score per sentence: for every annotated token, take the count of
# the first listed edit type minus 3 and sum over the sentence. A score of 0 is counted
# as full agreement below.
# TODO: Does not work for multiple batches
total_agreement = []
MAX_HIT = max([x['id'] for x in data])+1
for sent_id in range(MAX_HIT):
    sents = [x for x in data if x['id'] == sent_id]
    orig_tokens = get_annotations_per_token(sents, 'original')
    simp_tokens = get_annotations_per_token(sents, 'simplified')
    token_counts = list(orig_tokens.values()) + list(simp_tokens.values())
    agg_score = sum(list(val.values())[0] - 3 for val in token_counts)
    total_agreement.append(agg_score)

n_full_agreement = total_agreement.count(0)
print(f"Sent with full agreement {n_full_agreement} / {MAX_HIT}")
# Disabled helper for listing the sentences with the highest disagreement:
# hds = sorted([(i, val) for i, val in enumerate(total_agreement)], key=lambda x: x[1])
# print("\nHighest disagreement sentences:")
# for x in [get_sent_info(data[x[0]]) for x in hds][:5]:
#     print(x)

In [None]:
def _second_order_label(edit):
    """Resolve the second-order label of an edit.

    Returns a ``(keep, label)`` pair. ``keep`` is False only for reorder edits whose
    last annotation entry is neither 'word' nor 'component'.
    """
    label = edit['type']

    if label == 'substitution':
        subtype = edit['annotation'][0]
        if subtype == 'more':
            return True, 'elaboration_substitution'
        if subtype == 'less':
            return True, 'generalization_substitution'
        if subtype == 'same':
            return True, 'paraphrasing_substitution'
        if subtype == 'different':
            return True, 'paraphrasing_different'
        # Unrecognised subtype: dump the edit and keep the plain 'substitution' label
        print(edit)
        return True, label

    if label == 'reorder':
        subtype = edit['annotation'][-1]
        if subtype == 'word':
            return True, 'word_reorder'
        if subtype == 'component':
            return True, 'component_reorder'
        # If the annotator does not specify, we skip
        return False, None

    return True, label


def get_second_order_annotations_per_token(sents, sent_type, remove_none=True):
    """Count second-order edit labels per token of a sentence.

    The token dictionary is built from ``sents[0][sent_type]``. Every edit of every
    entry in ``sents`` is then projected onto those tokens through its
    ``<sent_type>_span`` spans, and the matching token's counter for the edit's
    second-order label is incremented.

    Args:
        sents: records for one sentence; each is read via ``['edits']`` and the first
            one also via ``[sent_type]``.
        sent_type: key of the sentence text, e.g. 'original' or 'simplified'.
        remove_none: if True, tokens that received no label are dropped.

    Returns:
        The token dictionary, mapping each token span to ``{label: count}``.
    """
    span_key = sent_type + '_span'
    text = sents[0][sent_type]
    tokens = generate_token_dict(text)

    # Walk over the edits of every annotator
    for sent in sents:
        for edit in sent['edits']:
            spans = edit[span_key]
            if spans is None:
                continue

            for elongated_span in spans:
                offset = elongated_span[0]
                # Tokenise the span text and shift the token offsets back into sentence coordinates
                span_tokens = generate_token_dict(text[offset:elongated_span[1]])
                composite_spans = [
                    (entry[0] + offset, entry[1] + offset)
                    for entry in list(span_tokens.keys())
                ]

                for c_span in composite_spans:
                    if c_span not in tokens:
                        # The span token does not line up with any sentence token
                        print(edit)
                        print("there's a problem boss")
                        continue

                    keep, type_ = _second_order_label(edit)
                    if not keep:
                        continue

                    # Increment the edit type
                    counts = tokens[c_span]
                    counts[type_] = counts.get(type_, 0) + 1

    # Remove spans with no annotations from any annotator
    if remove_none:
        unlabelled = [span for span in tokens.keys() if len(tokens[span].keys()) == 0]
        for span in unlabelled:
            del tokens[span]
    return tokens

In [None]:
# Second-order edit types tracked below:
#   substitution - elaboration, generalization, paraphrasing
#   reorder      - word-level, component-level

# Calculates % Agreement
# Collect the {label: count} mapping of every annotated token in the dataset.
total_agreement = []
MAX_HIT = max([x['id'] for x in data])+1
for sent_id in range(0, MAX_HIT):
    sents = [x for x in data if x['id'] == sent_id]
    if not sents:
        # Gap in the id range: there is no sentence text to tokenise, and
        # get_second_order_annotations_per_token would fail on sents[0]
        continue
    orig_tokens = get_second_order_annotations_per_token(sents, 'original')
    simp_tokens = get_second_order_annotations_per_token(sents, 'simplified')
    total_agreement += list(orig_tokens.values()) + list(simp_tokens.values())

span_agreement = {}
agreement_dims = [
    'insertion',
    'deletion',
    'elaboration_substitution',
    'generalization_substitution',
    'paraphrasing_substitution',
    'word_reorder',
    'component_reorder',
    'split',
    'structure'
]

for edit_type in agreement_dims:
    # Histogram of per-token counts for this edit type. Keys 3/2/1 are pre-seeded so
    # the lookups below never fail; larger counts are added on demand and still count
    # towards the total.
    out = {
        3: 0,
        2: 0,
        1: 0
    }
    for d in total_agreement:
        if edit_type in d:
            out[d[edit_type]] = out.get(d[edit_type], 0) + 1
    total = sum(out.values())
    # Avoid division by 0, doesn't change calculation at all
    total = 1 if total == 0 else total
    # Truncated percentages of tokens whose count for this edit type is exactly 3 / exactly 2
    span_agreement[edit_type] = {
        'three': int(100*out[3]/total),
        'two': int(100*out[2]/total)
    }
span_agreement

In [None]:
from sklearn.metrics import cohen_kappa_score
import krippendorff
from statsmodels.stats.inter_rater import fleiss_kappa
from statsmodels.stats import inter_rater as irr

# Inter-annotator agreement per edit type: pairwise Cohen's kappa and Fleiss' kappa.
# Every token of every sentence with exactly 3 annotations is one binary item:
# 1 if the annotator's token carries the edit type, 0 otherwise.

# The token annotations do not depend on the edit type, so extract them once here
# rather than once per edit type.
MAX_HIT = max([x['id'] for x in data])+1
token_annotations = []  # one entry per sentence: [annotator][token] -> {label: count}
for sent_id in range(0, MAX_HIT):
    sents = [x for x in data if x['id'] == sent_id]
    if len(sents) != 3:
        continue
    per_annotator = []
    for sent in sents:
        # remove_none=False keeps unlabelled tokens, so all annotators share the same columns
        tokens_orig = get_second_order_annotations_per_token([sent], 'original', remove_none=False)
        tokens_simp = get_second_order_annotations_per_token([sent], 'simplified', remove_none=False)
        per_annotator.append(list(tokens_orig.values()) + list(tokens_simp.values()))
    token_annotations.append(per_annotator)

if not token_annotations:
    raise ValueError("No sentence has exactly 3 annotations; cannot compute agreement")

for edit in agreement_dims:
    # (3, n_tokens) indicator matrix: rows are annotators, columns are tokens
    total_agreement = np.concatenate(
        [
            np.asarray(
                [[1 if edit in edits else 0 for edits in annotator_tokens]
                 for annotator_tokens in per_annotator],
                dtype=int,
            )
            for per_annotator in token_annotations
        ],
        axis=1,
    )

    # Columns that are 0 for all annotators are deliberately kept.
    # TODO: Krippendorff's alpha is not reported here (input formatting unresolved).
    print(f"{edit} ({total_agreement.shape[1]} tokens)")
    print(cohen_kappa_score(total_agreement[0, :], total_agreement[1, :]))
    print(cohen_kappa_score(total_agreement[1, :], total_agreement[2, :]))
    print(cohen_kappa_score(total_agreement[0, :], total_agreement[2, :]))
    # aggregate_raters receives the transpose, i.e. one row per token and one column per annotator
    agg = irr.aggregate_raters(total_agreement.T)
    agg_fleiss = irr.fleiss_kappa(agg[0], method='fleiss')
    print(agg_fleiss, end='\n\n')

    span_agreement[edit]['fleiss'] = agg_fleiss

In [None]:
# LaTeX row prefix for every edit type; the dict order is the row order of the table.
table_edit_type_mapping = {
    'insertion': '\\midrule\n\\ei{Insertion} & Elaboration',
    'deletion': '\\ed{Deletion} & Generalization',
    # 'substitution': '\\es{Substitution}',
    'elaboration_substitution': '\\es{Substitution} & Elaboration',
    'generalization_substitution': ' & Generalization',
    # 'reorder': '\\er{Reorder}',
    'word_reorder': '\\midrule\n\\er{Reorder} & Word-level',
    'component_reorder': ' & Component-level',
    'split': '\\esp{Split} & Sentence Split',
    'structure': '\\est{Structure} & Structure',
    'paraphrasing_substitution': '\\midrule\n\\es{Substitution} & Paraphrase'
}

# Columns: Fleiss' kappa (2 decimals), 'three' percentage, 'two' percentage
table_rows = []
for edit_type, row_prefix in table_edit_type_mapping.items():
    agreement = span_agreement[edit_type]
    kappa = round(agreement["fleiss"], 2)
    table_rows.append(f'{row_prefix} & {kappa} & {agreement["three"]} & {agreement["two"]} \\\\\n')
table = ''.join(table_rows)

print(table)

In [None]:
# LaTeX row prefix for every edit type; the dict order is the row order of the table.
table_edit_type_mapping = {
    'insertion': '\\midrule\n\\ei{Insertion} & More Information',
    'deletion': '\\ed{Deletion} & Less Information',
    # 'substitution': '\\es{Substitution}',
    'elaboration_substitution': '\\es{Substitution} & More Information',
    'generalization_substitution': ' & Less Information',
    # 'reorder': '\\er{Reorder}',
    'word_reorder': '\\midrule\n\\er{Reorder} & Word-level',
    'component_reorder': ' & Component-level',
    'split': '\\esp{Split} & Sentence Split',
    'structure': '\\est{Structure} & Structure',
    'paraphrasing_substitution': '\\midrule\n\\es{Substitution} & Same Information'
}

# Columns: Fleiss' kappa (2 decimals), 'three' percentage, 'three' + 'two' percentage
table_rows = []
for edit_type, row_prefix in table_edit_type_mapping.items():
    agreement = span_agreement[edit_type]
    kappa = round(agreement["fleiss"], 2)
    at_least_two = agreement["three"] + agreement["two"]
    table_rows.append(f'{row_prefix} & {kappa} & {agreement["three"]} & {at_least_two} \\\\\n')
table = ''.join(table_rows)

print(table)

In [None]:
# Calculates agreement for errors
# NOTE(review): this module-level `error` looks unused -- get_error_stats takes its own
# `error` argument and the driver loop at the end of this cell rebinds the name. Confirm
# before relying on it.
error = Error.BAD_DELETION

def get_error_stats(error, sentences=None):
    """Compute inter-annotator agreement statistics for one error type.

    Only sentence ids with exactly three records are used. Each record contributes a
    0/1 flag: 1 if any entry of its ``processed_annotations`` carries the error.

    Args:
        error: enum-like object with a ``.value``. An annotation matches when
            ``ann['error_type'] == error``; if ``error.value`` is 'grammar_error', a
            truthy ``ann['grammar_error']`` matches as well.
        sentences: records to analyse; defaults to the notebook-level ``data``.

    Returns:
        ``None`` (after printing a message) when there is no sentence with three
        records or when the error was never flagged. Otherwise a dict with:
          'fleiss': Fleiss' kappa over all three-record sentences,
          'two':    % of sentences flagged by at least one record that were flagged by
                    at least two,
          'freq':   100 * (sentences flagged by at least one record) / MAX_HIT, where
                    MAX_HIT is max id + 1, not the number of three-record sentences.
    """
    sentences = data if sentences is None else sentences

    # Bucket the records by id once instead of rescanning the dataset for every id.
    sents_by_id = {}
    for sent in sentences:
        sents_by_id.setdefault(sent['id'], []).append(sent)
    MAX_HIT = (max(sents_by_id) + 1) if sents_by_id else 0

    is_grammar = error.value == 'grammar_error'

    # One row of three 0/1 flags per fully annotated sentence
    rows = []
    for sent_id in range(0, MAX_HIT):
        sents = sents_by_id.get(sent_id, [])
        if len(sents) != 3:
            continue
        rows.append([
            1 if any(
                (is_grammar and ann['grammar_error']) or ann['error_type'] == error
                for ann in sent['processed_annotations']
            ) else 0
            for sent in sents
        ])

    # No sentence has three records: there is nothing to compute agreement on
    if not rows:
        print(f'No sentences with 3 annotations for {error.value}', end='\n\n')
        return None

    total_agreement = np.asarray(rows)
    if total_agreement.sum() == 0:
        print(f'No {error.value} errors', end='\n\n')
        return None

    stats = {}
    agg = irr.aggregate_raters(total_agreement)
    stats['fleiss'] = irr.fleiss_kappa(agg[0], method='fleiss')

    # Keep only the sentences where at least one record flagged the error
    total_agreement = total_agreement[total_agreement.any(axis=1)]

    # % two agree
    n_two_agree = int((total_agreement.sum(axis=1) >= 2).sum())
    stats['two'] = 100 * n_two_agree / total_agreement.shape[0]

    # % frequency
    stats['freq'] = 100 * len(total_agreement) / MAX_HIT

    return stats

class Tmp(Enum):
    """Stand-in enum so grammar errors can be passed to get_error_stats like an Error member.

    get_error_stats special-cases the value 'grammar_error' and then checks the
    annotation's 'grammar_error' field.
    """
    GRAMMAR_ERROR = 'grammar_error'

# Gather the statistics of every error type that occurs, grammar errors included.
all_stats = {}
for error in list(Error) + [Tmp.GRAMMAR_ERROR]:
    stats = get_error_stats(error)
    if stats is None:
        continue
    all_stats[error] = stats

# Most frequent error first
sorted_stats = sorted(
    ((k, v['freq']) for k, v in all_stats.items()),
    key=lambda x: x[1],
    reverse=True,
)

# One LaTeX row per error: name, Fleiss' kappa, % two agree, % frequency
table_rows = []
for error, _ in sorted_stats:
    entry = all_stats[error]
    kappa = round(entry["fleiss"], 2)
    table_rows.append(f'{error.value} & {kappa} & {int(entry["two"])} & {int(entry["freq"])} \\\\ \n')
table = ''.join(table_rows).replace('grammar_error', 'Grammar Error')

print(table)

In [None]:
from scipy.stats import kendalltau
from scipy.stats import pearsonr
import krippendorff

# Calculates sentence-score agreement
# Builds one row of three scores per sentence id that has exactly three records.
total_agreement = None
MAX_HIT = max([x['id'] for x in data])+1
score_rows = []
for sent_id in range(MAX_HIT):
    sents = [x for x in data if x['id'] == sent_id]
    scores = [sent['score'] for sent in sents]
    if len(scores) != 3:
        continue
    score_rows.append(scores)
if score_rows:
    total_agreement = np.asarray(score_rows)

# NOTE(review): tau and r only compare score columns 1 and 2 (column 0 is ignored), and
# Krippendorff's alpha only uses the first 300 rows -- confirm both are intended.
second_scores = total_agreement[:, 1]
third_scores = total_agreement[:, 2]
tau, p_value = kendalltau(second_scores, third_scores)
r, p_value = pearsonr(second_scores, third_scores)
kd = krippendorff.alpha(
    reliability_data=total_agreement[:300, :].T,
    level_of_measurement='interval',
)
print(tau, r, kd, sep='\n')

In [None]:
# Display the agreement for the first MAX_HIT sentences.
# Set MAX_HIT to max([x['id'] for x in data])+1 to display every sentence.
MAX_HIT = 2
for sent_id in range(MAX_HIT):
    sents = list(filter(lambda x: x['id'] == sent_id, data))
    draw_agreement(sents)

In [None]:
# Display the agreement for every sentence whose original text mentions 'Orion'.
selected = list(filter(lambda sent: 'Orion' in sent['original'], data))
for sent_id in {x['id'] for x in selected}:
    sents = [x for x in selected if x['id'] == sent_id]
    draw_agreement(sents)