In [2]:
# IPython autoreload in mode 2: imported modules are reloaded before each cell executes,
# so edits to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
from random import seed, sample, randint
import sys
import numpy as np, numpy.random
import math
from collections import Counter

In [4]:
# Seed Python's `random` module so the randomly generated datasets are reproducible.
RANDOM_SEED = 1
seed(RANDOM_SEED)

In [5]:
# Put the sibling `../metrics` directory on the import path (relative to the
# notebook's working directory). The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry each time.
METRICS_DIR = '../metrics'
if METRICS_DIR not in sys.path:
    sys.path.append(METRICS_DIR)

## Reliability

In [6]:
# Data Preparation
# Please note that all datasets are generated randomly. The metric results might be unreasonable.

N_ROWS = 1000  # number of rows generated for each synthetic table


def pick_one(population):
    """Draw a single element from `population` using `random.sample`.

    `population` may be any sequence, including a `range`, so a large id space
    does not have to be copied into a fresh list for every draw. The draw still
    goes through `sample(..., 1)[0]`, exactly as the inline expressions did.
    """
    return sample(population, 1)[0]


# Datasets for reliability: each row carries a job id, a labeler id,
# a labeler type ("L1" or "L2") and a binary decision.
reliability_dataset_columns = {
    "job_id": [pick_one(range(1000, 10000)) for _ in range(N_ROWS)],
    "labeler_ids": [pick_one(range(10000, 100000)) for _ in range(N_ROWS)],
    "labeler_type": [pick_one(["L1", "L2"]) for _ in range(N_ROWS)],
    "decision": [pick_one([1, 0]) for _ in range(N_ROWS)],
}

labels_all = pd.DataFrame.from_dict(reliability_dataset_columns)

# Two-labeler table: every row holds two binary decisions, d1 and d2.
# NOTE(review): job ids 1000-1499 are repeated, so each job id occupies two rows — confirm this is intended.
two_labelers_dataset_columns = {
    "job_id": list(range(1000, 1500)) * 2,
    "d1": [pick_one([1, 0]) for _ in range(N_ROWS)],
    "d2": [pick_one([1, 0]) for _ in range(N_ROWS)],
}
two_labelers = pd.DataFrame.from_dict(two_labelers_dataset_columns)

In [7]:
# Preview the first five rows of the randomly generated reliability table.
labels_all.head(n=5)

Unnamed: 0,job_id,labeler_ids,labeler_type,decision
0,3201,26661,L1,0
1,2033,87277,L2,1
2,5179,47636,L2,1
3,2931,64121,L1,1
4,9117,43806,L1,0


In [8]:
# Preview the first five rows of the two-labeler table.
two_labelers.head(n=5)

Unnamed: 0,job_id,d1,d2
0,1000,0,0
1,1001,1,0
2,1002,0,0
3,1003,0,1
4,1004,1,1


In [9]:
import reliability as reliability

## Examples of general metrics and multi-labeler metrics
### --- Basic metrics --- #
- Job agreement rate
- Annotator agreement rate

### --- Multi-annotator metrics --- #
- Fleiss’s κ
- Krippendorff's α

In [10]:
# Job agreement rate, computed on the randomly generated `labels_all` table.
reliability.job_agreement_rate(
    table=labels_all,
    job="job_id",
    decision="decision",
)

0.9768664563617245

In [11]:
# Annotator agreement rate, computed on the randomly generated `labels_all` table.
reliability.annotator_agreement_rate(
    table=labels_all,
    job="job_id",
    decision="decision",
)

0.5192307692307693

In [12]:
# Fleiss’s κ with `annotator_num=2`, computed on `labels_all`.
reliability.fleiss_kappa_shadow(
    table=labels_all,
    job="job_id",
    decision="decision",
    annotator="labeler_ids",
    annotator_num=2,
)

0.10856519367157673

In [13]:
# Krippendorff's α with the "nominal" metric function, computed on `labels_all`.
reliability.calculate_krippendorffs_alpha(
    table=labels_all,
    job="job_id",
    decision="decision",
    annotator="labeler_ids",
    metric_fn="nominal",
)

0.07097933513027843

## Examples of metrics for two-labeler jobs
- Cohen’s κ -- can be used for both ordinal and nominal labels.
- Kendall’s τ -- can only be used for ordinal/continuous labels.
- Spearman’s ρ -- can only be used for ordinal/continuous labels.

In [14]:
# Cohen’s κ between the two decision columns of `two_labelers`.
reliability.cohen_kappa(
    table=two_labelers,
    decision1="d1",
    decision2="d2",
)

-0.03467046646226768

In [15]:
# Kendall's τ requires numeric decisions; d1 and d2 here are 0/1 integers.
reliability.kendall_tau(
    table=two_labelers,
    decision1="d1",
    decision2="d2",
)

KendalltauResult(correlation=-0.034676091732800673, pvalue=0.27307619776448167)

In [16]:
# Spearman's ρ requires numeric decisions; d1 and d2 here are 0/1 integers.
reliability.spearman_rho(
    table=two_labelers,
    decision1="d1",
    decision2="d2",
)

SpearmanrResult(correlation=-0.034676091732800673, pvalue=0.27329226724724953)

### 2.3 Multi-annotator metrics:
* **Fleiss’s $\kappa$**
* **Krippendorff's $\alpha$**

In [17]:
## Get job ids for jobs with double-review
# Count how many rows of `labels_all` carry each job id.
count_num_decisions = Counter(labels_all.job_id)

# Two aligned arrays: the distinct job ids, and whether each one has exactly two rows.
distinct_job_ids = np.array(list(count_num_decisions))
has_two_decisions = np.array(list(count_num_decisions.values())) == 2
job_id_with_double_review = distinct_job_ids[has_two_decisions]

In [18]:
## Keep only jobs with double-review
# `Series.isin` builds the row mask in one vectorised pass; the previous per-row
# `item in <ndarray>` test rescanned the whole id array for every row, and the
# boolean mask was routed through positional `.iloc`.
is_double_reviewed = labels_all.job_id.isin(job_id_with_double_review)
label_all_job_with_double_review = labels_all[is_double_reviewed]

In [19]:
# Fleiss's κ only expects the jobs with double-review, so pass the filtered frame
# built above rather than the full `labels_all` table, and state the number of
# annotators per job explicitly.
reliability.fleiss_kappa_shadow(
    table=label_all_job_with_double_review,
    job="job_id",
    decision="decision",
    annotator="labeler_ids",
    annotator_num=2,
)

0.10856519367157673

In [20]:
## According to the guideline on Wikipedia, a Krippendorff's alpha above 0.80 is a strong sign of reliable decision data.
## The labels in this notebook are randomly generated, so the value computed here is expected to be close to 0 instead.
reliability.calculate_krippendorffs_alpha(
    table=labels_all,
    job="job_id",
    decision="decision",
    annotator="labeler_ids",
)

0.07097933513027843