# F1-score as IAA measure

The average F1-score among pairs of annotators is numerically identical to the average positive specific agreement among annotators (kappa approaches these measures as the number of negative cases grows large).<sup>1</sup>

<sup>1</sup>Hripcsak, G., & Rothschild, A. S. (2005). Agreement, the F-measure, and reliability in information retrieval. *Journal of the American Medical Informatics Association : JAMIA, 12*(3), 296–298. https://doi.org/10.1197/jamia.M1733

In [1]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from itertools import permutations, combinations

import sys
sys.path.insert(0, '..')
from utils.config import PATHS
from utils.data_process import drop_disregard, fix_week_14, pad_sen_id

## Load and pre-process data

In [2]:
# load annotations: read every '*_parsed.pkl' file in the IAA data folder
# and stack them into one frame with a fresh integer index

path = PATHS.getpath('data') / 'iaa'

parsed_frames = [pd.read_pickle(fp) for fp in path.glob('*_parsed.pkl')]
df = pd.concat(parsed_frames, ignore_index=True)

In [3]:
# load batch info: one pickle per batch found in the annotations,
# each tagged with the name of the batch it came from

path = PATHS.getpath('data_to_inception_conll')

# Collect the per-batch frames and concatenate once: `DataFrame.append`
# was removed in pandas 2.0 and copied the whole frame on every iteration.
batch_frames = []
for batch in df.batch.unique():
    batch_info = pd.read_pickle(path / f'{batch}.pkl').assign(
        batch = batch,
    )
    batch_frames.append(batch_info)

# `pd.concat` raises on an empty list; fall back to an empty frame in that case
info = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()

In [4]:
# select IAA files: keep only annotations of notes whose
# sampling method in the batch info is 'kwd_iaa'

is_iaa_sample = info.samp_meth == 'kwd_iaa'
iaa = info.loc[is_iaa_sample, 'NotitieID'].unique()

df = df.loc[df.NotitieID.isin(iaa)]

In [5]:
# process annotations:
# - background_sent / target_sent: True on every row of a sentence (sen_id)
#   in which any row has `background` / `target` set
# - disregard_note: True on every row of a note (NotitieID) in which any row
#   has `disregard` set
# - pad_sen_id: result of the `pad_sen_id` helper applied to each sen_id
# afterwards the frame goes through `fix_week_14` and missing values in the
# label columns are replaced by False

domains = ['ADM', 'ATT', 'BER', 'ENR', 'ETN', 'FAC', 'INS', 'MBW', 'STM']
other = ['target', 'background', 'plus']
label_cols = domains + other

by_sentence = df.groupby('sen_id')
by_note = df.groupby('NotitieID')

df = fix_week_14(
    df.assign(
        background_sent=by_sentence.background.transform('any'),
        target_sent=by_sentence.target.transform('any'),
        disregard_note=by_note.disregard.transform('any'),
        pad_sen_id=df.sen_id.apply(pad_sen_id),
    )
)

df[label_cols] = df[label_cols].fillna(False)

In [6]:
# replace annotator names with an alias (the first letter of the name)

df['annotator'] = df['annotator'].str.get(0)

## Create sentence-level multi-label of all domains

In [7]:
# Create a sentence-level multi-label of all the domains: one list of 0/1
# flags per annotator and sentence, ordered like `domains`.
# Annotators are then placed side by side as columns, and sentences without
# a label list for every annotator are dropped. This selects the notes that
# were annotated by all 6 annotators (needed since some annotators skipped
# some of the batches).

sent_labels = (
    df.groupby(['annotator', 'pad_sen_id'])[domains]
    .any()                                      # domain present anywhere in the sentence
    .astype(int)
    .apply(lambda row: row.to_list(), axis=1)   # one list of flags per row
    .rename('domain_labels')
    .unstack(0)                                 # annotators become columns
    .dropna()                                   # keep sentences labeled by everyone
)

## Pairwise metrics

In [8]:
# ordered pairs (a, b) and (b, a) are both scored; `combis` holds each
# unordered pair once
annotators = sent_labels.columns

pairs = list(permutations(annotators, 2))
combis = list(combinations(annotators, 2))
len(pairs)

30

In [9]:
def make_pairwise_classreport(y1, y2):
    """Score one annotator's binary labels against another's.

    `y1` is passed to the scikit-learn metrics as the first argument
    (the reference labels) and `y2` as the second (the labels being scored).

    Returns a list `[precision, recall, f1]`.
    """
    metrics = (precision_score, recall_score, f1_score)
    return [metric(y1, y2) for metric in metrics]

In [10]:
# Build one row per ordered annotator pair and domain, holding the support
# of annotator1 and the precision / recall / F1 of annotator2 measured
# against annotator1.
cols = [
    'annotator1',
    'annotator2',
    'domain',
    'support',
    'precision',
    'recall',
    'f1_score',
]

# Rows are collected in a list and turned into a frame once:
# `DataFrame.append` was removed in pandas 2.0 and copied the frame on every row.
records = []
for annotator1, annotator2 in pairs:
    for i, domain in enumerate(domains):
        # pick the i-th flag out of every sentence's label list
        # (`Series.map` replaces `DataFrame.applymap`, deprecated in pandas 2.1)
        y1 = sent_labels[annotator1].map(lambda labels: labels[i])
        y2 = sent_labels[annotator2].map(lambda labels: labels[i])
        precision, recall, f1 = make_pairwise_classreport(y1, y2)
        records.append({
            'annotator1': annotator1,
            'annotator2': annotator2,
            'domain': domain,
            'support': y1.sum(),  # number of sentences annotator1 labeled with the domain
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
        })

classreport = pd.DataFrame(records, columns=cols)

## Average F1-score per domain

In [11]:
# average F1-score per domain (rounded to two decimals),
# together with the smallest and largest support seen for that domain

def _mean_2dp(scores):
    """Mean of the scores, rounded to two decimals."""
    return round(scores.mean(), 2)


def _min_max(support):
    """Smallest and largest support as a two-element list."""
    return [support.min(), support.max()]


per_domain = classreport.groupby('domain').agg({
    'f1_score': _mean_2dp,
    'support': _min_max,
})
per_domain.T

domain,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM
f1_score,0.64,0.58,0.42,0.66,0.45,0.78,0.34,0.62,0.57
support,"[21, 36]","[1, 4]","[1, 5]","[9, 18]","[7, 25]","[12, 17]","[2, 21]","[3, 6]","[10, 18]"


## F1-score per domain per pair

In [12]:
# F1-score per domain for every unordered annotator pair (one row per pair,
# one column per domain), followed by a row holding the mean per domain.

table = (
    classreport
    .set_index(['annotator1', 'annotator2'])
    .loc[combis, ['domain', 'f1_score']]
    .set_index('domain', append=True)
    .unstack(-1)                 # domains become columns
    .droplevel(0, axis=1)        # drop the 'f1_score' column level
    .reset_index()
    .rename_axis('', axis=1)
)

# Label of the summary row: the first free integer after the pair rows.
# A hard-coded 15 is only correct for exactly 6 annotators (15 pairs) and
# would overwrite a real pair row when there are more.
mean_row = len(table)

# the right-hand side is evaluated before the row is added,
# so the mean covers the pair rows only
table.loc[mean_row, 'ADM':] = table.loc[:, 'ADM':].mean()
table.loc[mean_row, 'annotator1'] = 'mean'
table.loc[mean_row, 'annotator2'] = ''

(
    table.style
    .background_gradient(cmap='Greens', axis=None)
    .format({domain: "{:.2}" for domain in domains})
    # highlight the summary row
    .apply(lambda s: len(s) * ["background: black; color: white"], subset=mean_row)
)

Unnamed: 0,annotator1,annotator2,ADM,ATT,BER,ENR,ETN,FAC,INS,MBW,STM
0,a,k,0.65,0.0,0.33,0.8,0.51,0.69,0.3,0.91,0.5
1,a,m,0.75,0.57,0.5,0.62,0.53,0.87,0.43,0.67,0.53
2,a,o,0.63,0.33,0.25,0.64,0.39,0.85,0.087,0.6,0.72
3,a,s,0.62,0.75,0.57,0.57,0.53,0.96,0.48,0.67,0.45
4,a,v,0.61,0.57,0.29,0.7,0.54,0.87,0.61,0.73,0.55
5,k,m,0.65,0.5,0.5,0.59,0.45,0.6,0.36,0.73,0.57
6,k,o,0.56,0.67,0.5,0.7,0.36,0.62,0.25,0.44,0.61
7,k,s,0.63,0.4,0.67,0.63,0.41,0.72,0.44,0.75,0.55
8,k,v,0.62,0.5,0.0,0.76,0.31,0.6,0.44,0.8,0.65
9,m,o,0.67,0.8,0.67,0.62,0.38,0.8,0.11,0.6,0.64


## Example

- annotators: Avelli and Katsburg (aliases `a` and `k` in the code below)
- domain: ADM

Avelli labeled 36 sentences as ADM; 21 of those were also marked as ADM by Katsburg. In addition, Katsburg labeled 8 other sentences as ADM that Avelli did not.

In [13]:
# per-sentence ADM flags of annotators 'a' and 'k'
# (position looked up by name instead of a bare 0; `Series.map` replaces
# `DataFrame.applymap`, which is deprecated since pandas 2.1)
adm_idx = domains.index('ADM')
ys = sent_labels[['a', 'k']].apply(lambda col: col.map(lambda labels: labels[adm_idx]))

In [14]:
# rows: labels of annotator 'a', columns: labels of annotator 'k'
confusion_matrix(y_true=ys['a'], y_pred=ys['k'])

array([[2435,    8],
       [  15,   21]])

In [15]:
# roles swapped: rows are annotator 'k', columns annotator 'a'
confusion_matrix(y_true=ys['k'], y_pred=ys['a'])

array([[2435,   15],
       [   8,   21]])

In [16]:
# first row of the (a, k) pair; ADM comes first because the rows were
# generated in the order of `domains`
is_a_vs_k = (classreport.annotator1 == 'a') & (classreport.annotator2 == 'k')
classreport.loc[is_a_vs_k].iloc[[0]]

Unnamed: 0,annotator1,annotator2,domain,support,precision,recall,f1_score
0,a,k,ADM,36,0.724138,0.583333,0.646154


In [17]:
# same pair in the opposite direction: first row of (k, a), again ADM
is_k_vs_a = (classreport.annotator1 == 'k') & (classreport.annotator2 == 'a')
classreport.loc[is_k_vs_a].iloc[[0]]

Unnamed: 0,annotator1,annotator2,domain,support,precision,recall,f1_score
45,k,a,ADM,29,0.583333,0.724138,0.646154
