Evaluate agreement between MAG and CSET fields.

In [1]:
import krippendorff
import numpy as np
import pandas as pd
from nltk.metrics.agreement import AnnotationTask
from nltk.metrics.distance import binary_distance, masi_distance
from srsly import read_jsonl

from fos.settings import ASSETS_DIR

# Pickled field metadata; loaded below with pd.read_pickle
META_PATH = ASSETS_DIR / 'fields/fos.pkl.gz'
# Directory containing the JSONL score files for the annotation sample
ANNO_DIR = ASSETS_DIR / '../analysis/annotation'

We're using (at least for now) only the small random sample we drew for annotation.

In [2]:
# Load assets
# NOTE(review): unpickling can execute arbitrary code, so META_PATH must point at a trusted file.
meta = pd.read_pickle(META_PATH)
# TODO: expand to larger sample
mag_sample = list(read_jsonl(ANNO_DIR / "en_v1_scores.jsonl"))
cset_sample = list(read_jsonl(ANNO_DIR / "en_v2_scores.jsonl"))
# Select the level-0 rows once and derive both lookups from that single frame
l0_meta = meta.query('level == 0')
l0_ids = l0_meta.index.astype(int)
# Map integer field ID -> display name without iterating row by row
id_to_name = {int(field_id): name for field_id, name in l0_meta['display_name'].items()}

In [3]:
def shape(docs, field_filter=None):
    """Index documents by ID and reduce each one to its (optionally filtered) field scores.

    :param docs: Iterable of records, each with an ``'id'`` and a ``'fields'`` list of
        ``{'id': ..., 'score': ...}`` entries.
    :param field_filter: Container of field IDs to keep (e.g. the L0 IDs, which drops L1 fields);
        ``None`` keeps every field.
    :return: Dict keyed by doc ID in ascending order. Each value maps field ID to score, ordered
        from highest to lowest score. Empty if ``docs`` is empty.
    :raises AssertionError: If the documents do not all end up with the same number of fields.
    """
    result = {}
    for doc in docs:
        doc_id = doc['id']
        scores = {
            field['id']: field['score']
            for field in doc['fields']
            if field_filter is None or field['id'] in field_filter
        }
        # Highest score first, so that taking the first k items yields the top-k fields
        result[doc_id] = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))
    # All documents are expected to carry the same number of fields after filtering.
    # An empty input has no lengths to compare and is accepted as-is.
    field_counts = {len(scores) for scores in result.values()}
    assert len(field_counts) <= 1, (
        f"Documents have differing numbers of fields: {sorted(field_counts)}"
    )
    # Return in order of doc ID
    return dict(sorted(result.items()))


# Reduce both samples to their level-0 field scores, keyed and ordered by doc ID
cset_docs = shape(cset_sample, l0_ids)
mag_docs = shape(mag_sample, l0_ids)
# Both sources must have scored exactly the same documents
assert cset_docs.keys() == mag_docs.keys()

In [48]:

def to_ordinal_nltk(docs, annotator_id, k=3, names=True):
    """Convert shaped documents into ``(annotator, item, label)`` records for nltk.

    :param docs: Mapping of doc ID to a ``{field_id: score}`` dict whose items are already ordered
        from highest to lowest score (the order ``shape`` produces).
    :param annotator_id: Annotator/rater ID, i.e., CSET or MAG.
    :param k: When ``names`` is True, keep only the top ``k`` fields; ``None`` (or any ``k`` at
        least as large as the number of fields, e.g. 19) keeps all of them. Ignored when
        ``names`` is False, where every field is always returned.
    :param names: If True, the label is the top-``k`` field IDs as strings, best score first
        ('40700', ...). Otherwise the label lists every field by its 1-based position in
        ascending field-ID order, again best score first. Note this is an ordering of fields,
        not a per-field rank.
    :return: List of ``(annotator_id, doc_id, label_tuple)`` records, one per document.
    :raises ValueError: If ``k`` is negative.
    """
    if k is not None and k < 0:
        raise ValueError(f"k must be non-negative or None, got {k}")
    result = []
    for doc_id, scores in docs.items():
        if names:
            # Slicing handles k=0 (no fields), k=None (all fields) and k > len(scores) uniformly
            top_fields = [str(field_id) for field_id in list(scores)[:k]]
        else:
            # Line the scores up in ascending field-ID order so positions are comparable across docs
            scores_by_field = [score for _, score in sorted(scores.items())]
            # argsort is ascending; flip it to list the best-scoring position first, then go 1-based
            positions = np.argsort(np.array(scores_by_field))[::-1] + 1
            # tolist() yields plain Python ints rather than NumPy scalars
            top_fields = positions.tolist()
        result.append((annotator_id, doc_id, tuple(top_fields)))
    return result


# Name-based records: each label is the top-3 field IDs (MAG records first, then CSET)
annotations = [
    *to_ordinal_nltk(mag_docs, "mag"),
    *to_ordinal_nltk(cset_docs, "cset"),
]
# Position-based records: each label orders every field by descending score
rank_annotations = [
    *to_ordinal_nltk(mag_docs, "mag", names=False),
    *to_ordinal_nltk(cset_docs, "cset", names=False),
]

In [61]:
# Peek at the first two name-based records: (annotator, doc ID, top-3 field IDs as strings)
annotations[:2]

[('mag', 'carticle_0000800584', ('121332964', '192562407', '41008148')),
 ('mag', 'carticle_0001126499', ('71924100', '185592680', '86803240'))]

In [62]:
# And the first two position-based records: (annotator, doc ID, 1-based field positions, best score first)
rank_annotations[:2]

[('mag',
  'carticle_0000800584',
  (9, 18, 5, 3, 10, 4, 17, 16, 11, 1, 7, 6, 19, 12, 14, 15, 13, 8, 2)),
 ('mag',
  'carticle_0001126499',
  (6, 17, 7, 3, 18, 9, 1, 4, 19, 16, 15, 5, 10, 12, 8, 14, 2, 13, 11))]

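We calculate Krippendorff's alpha for the top 1-3 labels using MASI distance, which measures set overlap. For k = 1 each label set holds a single field, so MASI reduces to an exact-match (binary) comparison.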

In [71]:
# Krippendorff's alpha under MASI distance for the top-k label sets, k = 1, 2, 3
masi_alpha = []
for k in (1, 2, 3):
    records = to_ordinal_nltk(mag_docs, "mag", k=k) + to_ordinal_nltk(cset_docs, "cset", k=k)
    # MASI compares sets, so drop the within-label ordering
    label_sets = [
        (annotator, doc_id, frozenset(fields[:k]))
        for annotator, doc_id, fields in records
    ]
    # Print an example record
    print(label_sets[0])
    masi_alpha.append(AnnotationTask(label_sets, distance=masi_distance).alpha())
np.array(masi_alpha).round(3)

('mag', 'carticle_0000800584', frozenset({'121332964'}))
('mag', 'carticle_0000800584', frozenset({'121332964', '192562407'}))
('mag', 'carticle_0000800584', frozenset({'41008148', '121332964', '192562407'}))


array([0.347, 0.299, 0.175])

For comparison, here's Krippendorff's alpha for the top 1-3 labels when using binary distance.

In [72]:
# Same top-k label sets, scored with binary distance (labels either match or they don't)
binary_alpha = []
for k in (1, 2, 3):
    triples = to_ordinal_nltk(mag_docs, "mag", k=k) + to_ordinal_nltk(cset_docs, "cset", k=k)
    labelled = [
        (annotator, doc_id, frozenset(fields[:k]))
        for annotator, doc_id, fields in triples
    ]
    binary_alpha.append(AnnotationTask(labelled, distance=binary_distance).alpha())
np.array(binary_alpha).round(3)

array([0.347, 0.257, 0.1  ])

And here's simple agreement for the top field.

In [46]:
# Put each source's top-1 pick side by side, one row per document.
# Columns are named at construction so this also works for an empty sample.
simple_agreement = pd.concat([
    pd.DataFrame(to_ordinal_nltk(mag_docs, "mag", k=1), columns=['mag', 'mag_id', 'top_mag']),
    pd.DataFrame(to_ordinal_nltk(cset_docs, "cset", k=1), columns=['cset', 'cset_id', 'top_cset']),
], axis=1)

# Rows are paired by position, so the doc IDs on each row must match
assert (simple_agreement['mag_id'] == simple_agreement['cset_id']).all()
# Column-wise comparison of the top-1 labels replaces a row-by-row apply
n_agree = (simple_agreement['top_mag'] == simple_agreement['top_cset']).sum()
# (share of documents where the top fields agree, number of documents)
round(n_agree / simple_agreement.shape[0], 3), simple_agreement.shape[0]

(0.4134078212290503, 358)

Confirm the top-1 (k = 1) alpha result using a different implementation, the `krippendorff` package.

In [60]:
# Cross-check the k = 1 alpha with the `krippendorff` package (imported in the first cell).
# Each row of the reliability data is one rater's top field per document, in doc-ID order.
mag_top1 = [fields[0] for _, _, fields in to_ordinal_nltk(mag_docs, "mag", k=1)]
cset_top1 = [fields[0] for _, _, fields in to_ordinal_nltk(cset_docs, "cset", k=1)]
# Field IDs are categorical labels, hence the nominal level of measurement
krippendorff.alpha([mag_top1, cset_top1], level_of_measurement='nominal')

0.3470802332508578