## Calculating inter-annotator agreement

#### This script generates 8 additional files:
1. batches_annotators.json – a list of annotators per batch
2. k_alpha_per_batch_4_options.csv – Krippendorff's alpha per batch for all 4 options
3. k_alpha_per_batch_2_options.csv – Krippendorff's alpha per batch for 2 options ('Omstreden' and 'Niet omstreden'), other responses are filtered out, so this data has missing values; the purpose of it is to check the agreement between annotators who could decide whether a term was contentious or non-contentious in a given sample (without options 'I don't know' or 'Bad OCR')
4. pairwise_alpha.csv – Krippendorff's alpha for every pair of annotators in a batch
5. mean_alpha_per_annotator.csv – mean Krippendorff's alpha per annotator (taking all the alpha values from an annotator's pairs)
6. perc_agreement.csv – percentage agreement between annotators per sample 
7. k_alpha_per_batch_2_options_filtered_alpha.csv – Krippendorff's alpha per batch for 2 options without annotators whose mean K alpha is lower than 0.2
8. k_alpha_per_batch_2_options_filtered_controls.csv – Krippendorff's alpha per batch for 2 options without annotators who got 3 or more control questions 'wrong' (different from experts)

In [None]:
import csv
import json
import statistics
import requests
import io
import pandas as pd
from itertools import combinations
from sklearn.metrics import cohen_kappa_score
from nltk.metrics.agreement import AnnotationTask
from collections import Counter

In [None]:
# downloading the csv files from GitHub
# (replace each "#" placeholder with the raw-file link given in the trailing comment)

url_1 = "#" # link to the raw file (https://raw.githubusercontent.com/cultural-ai/ConConCor/master/Dataset/Annotations.csv)
annotations_response = requests.get(url_1)
annotations_response.raise_for_status() # stop on an HTTP error instead of parsing an error page as csv later
annotations = annotations_response.content # raw bytes of the annotations csv

url_2 = "#" # link to the raw file (https://raw.githubusercontent.com/cultural-ai/ConConCor/master/Dataset/Extracts.csv)
extracts_response = requests.get(url_2)
extracts_response.raise_for_status() # same check for the extracts file
extracts = extracts_response.content # raw bytes of the extracts csv

In [None]:
# loading the downloaded bytes into pandas dataframes (decoded as UTF-8 text first)
annotations_text = annotations.decode('utf-8')
annotations_data = pd.read_csv(io.StringIO(annotations_text))

extracts_text = extracts.decode('utf-8')
extracts_data = pd.read_csv(io.StringIO(extracts_text))

In [None]:
# keeping only the columns that the agreement calculations read
annotation_columns = ['anonymised_participant_id', 'extract_id', 'response']
extract_columns = ['extract_id', 'target']

annotations_asr = annotations_data[annotation_columns]
extracts_et = extracts_data[extract_columns]

In [None]:
# attaching the target word to every annotation (inner join on the extract ID)
annotations_with_target = annotations_asr.merge(extracts_et, how='inner', on='extract_id')

In [None]:
# merged dataframe as a list of rows: [participant ID, extract ID, response, target]
list_merged = annotations_with_target.to_numpy().tolist()

In [None]:
# unique extract IDs from the extracts file, in order of first appearance
list_of_extracts = extracts_data['extract_id']
list_of_unique_extracts = list(dict.fromkeys(list_of_extracts))

In [None]:
# unique annotator IDs: the keys of the groupby on 'anonymised_participant_id'
annotators_grouping = annotations_asr.groupby('anonymised_participant_id')
list_of_unique_annotators = list(annotators_grouping.groups)

In [None]:
# mapping every extract ID to the row labels of its annotations,
# to see how many times the same extract was annotated
groups_per_extract = {
    extract_key: row_labels
    for extract_key, row_labels in annotations_asr.groupby('extract_id').groups.items()
}

In [None]:
# dict with 'extract_id': number of annotations that extract received
sample_anns = {
    extract_key: len(row_labels)
    for extract_key, row_labels in groups_per_extract.items()
}

#### Getting groups of annotators (batches) (batches_annotators.json)

* if the annotators are in one group it means that they annotated the same set of samples (a batch)
* we need to have the annotators grouped for calculating K alpha per batch

In [None]:
# listing, for every participant, the extracts they annotated

# dict with 'anonymised_participant_id': list of extract IDs (in the order they appear in the annotations)
annotator_extracts = {
    participant_id: list(participant_rows['extract_id'])
    for participant_id, participant_rows in annotations_asr.groupby('anonymised_participant_id')
}

In [None]:
# finding the distinct batches of samples
# every list of extracts is turned into its str() form so that it can serve as a dict key;
# two lists count as the same batch only when they are identical, order included

unique_batches = list(dict.fromkeys(str(extract_ids) for extract_ids in annotator_extracts.values()))

In [None]:
group_annotators = {} # dict 'batch_<n>': list of annotators who annotated that batch
for batch_number, unique in enumerate(unique_batches, start=1): # batches are numbered from 1
    group_name = f"batch_{batch_number}"
    # an annotator belongs to the batch when the str() form of their extract list matches it
    group_annotators[group_name] = [
        antr
        for antr, extract_ids in annotator_extracts.items()
        if str(extract_ids) == unique
    ]

In [None]:
# writing the annotators of every batch to 'batches_annotators.json'
with open('batches_annotators.json', 'w') as outfile:
    outfile.write(json.dumps(group_annotators))

#### Krippendorff's alpha per group (k_alpha_per_batch_4_options.csv) (all options)

* to calculate K alpha we need tuples with ('anonymised_participant_id','extract_id','response') for every batch
* we take all 4 options (Contentious, Non-contentious, I don't know, Bad OCR)

In [None]:
# all responses as a list of [participant ID, extract ID, response] rows
triples_results = annotations_asr.to_numpy().tolist()

In [None]:
# writing a csv with one row per batch: batch name, alpha over all 4 options, number of annotators

with open('k_alpha_per_batch_4_options.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['batch', 'k_alpha_4', 'num_annotators']) #header

    for group, members in group_annotators.items(): # one pass per batch
        # (annotator, extract, response) tuples of the annotators in this batch
        responses_list = [
            (coder, item, label)
            for coder, item, label in triples_results
            if coder in members
        ]

        try:
            k_alpha = round(AnnotationTask(data=responses_list).alpha(), 3)
        except ZeroDivisionError: # original note: batch_50 has only 1 annotator
            k_alpha = 'zero division' # marker written to the csv instead of a number

        writer.writerow([group, k_alpha, len(members)])

#### Krippendorff's alpha per group (k_alpha_per_batch_2_options.csv) (only omstreden/niet omstreden)

* to calculate K alpha we need tuples with ('anonymised_participant_id','extract_id','response') for every batch
* we filter out (1) all the extracts (for every annotator in a group) with fewer than 2 responses 'Omstreden' or 'Niet omstreden' in every batch (117) (this is necessary for calculating alpha correctly) and (2) the extracts with options 'Weet ik niet' and 'Onleesbare OCR' (3583), for a total of 3700 extracts

In [None]:
# collecting the extracts that received fewer than 2 'Omstreden' / 'Niet omstreden' responses

decisive_options = ('Omstreden naar huidige maatstaven', 'Niet omstreden')

# one pass over the merged rows: number of decisive responses per extract ID
# (row[1] is the extract ID, row[2] the response)
decisive_counts = Counter(row[1] for row in list_merged if row[2] in decisive_options)

# negative list of extract IDs to be filtered out;
# an extract with no responses at all counts as 0 and is listed under its own ID
extracts_to_filter = [
    unique_extract_id
    for unique_extract_id in list_of_unique_extracts
    if decisive_counts[unique_extract_id] < 2 # the sum of the 2 options should be no less than 2
]

In [None]:
# responses that carry no contentious / non-contentious decision
ignored_responses = ('Weet ik niet', 'Onleesbare OCR')

# the filtered responses: a triple is dropped when its extract is on the negative list
# or when its response is one of the ignored options
# (original author's counts: 21800 responses before, 18100 after; 117 extracts on the negative list)
triples_results_filtered = [
    triple
    for triple in triples_results
    if not (triple[1] in extracts_to_filter or triple[2] in ignored_responses)
]

In [None]:
# writing a csv with one row per batch: batch name, alpha over the 2 options, number of annotators

with open('k_alpha_per_batch_2_options.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['batch', 'k_alpha_2', 'num_annotators']) #header

    for group, members in group_annotators.items(): # one pass per batch
        # filtered (annotator, extract, response) tuples of the annotators in this batch
        responses_list = [
            (coder, item, label)
            for coder, item, label in triples_results_filtered
            if coder in members
        ]

        try:
            k_alpha = round(AnnotationTask(data=responses_list).alpha(), 3)
        except ZeroDivisionError: # original note: batch_50 has only 1 annotator
            k_alpha = 'zero division' # marker written to the csv instead of a number

        writer.writerow([group, k_alpha, len(members)])

#### Pairwise agreement (pairwise_alpha.csv)

In [None]:
# writing 'pairwise_alpha.csv': Krippendorff's alpha for every pair of annotators within a batch
with open('pairwise_alpha.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['batch', 'annotator_1', 'annotator_2', 'alpha']) #header

    for group, members in group_annotators.items(): # every batch
        for first, second in combinations(members, 2): # every pair of annotators in the batch
            # all responses (unfiltered, 4 options) given by either annotator of the pair
            responses_list = [
                (triple[0], triple[1], triple[2])
                for triple in triples_results
                for annotator in (first, second)
                if annotator == triple[0]
            ]

            try:
                k_alpha = round(AnnotationTask(data=responses_list).alpha(), 3)
            except ZeroDivisionError:
                k_alpha = 'zero division' # marker written to the csv instead of a number

            writer.writerow([group, first, second, k_alpha])

#### Mean K alpha for every annotator (mean_alpha_per_annotator.csv)

In [None]:
# loading the pairwise alpha values written to 'pairwise_alpha.csv'

pairwise_alpha_path = "pairwise_alpha.csv"
pairwise_agreement = pd.read_csv(pairwise_alpha_path)

In [None]:
# list of [(annotator_1, annotator_2), alpha] entries, one per row of the pairwise csv
pairs_in_tuples = [
    [(pair_row['annotator_1'], pair_row['annotator_2']), pair_row['alpha']]
    for _, pair_row in pairwise_agreement.iterrows()
]

In [None]:
ann_mean = [] # list of [annotator ID, mean alpha over all pairs the annotator is part of]
for unique in list_of_unique_annotators: # iterating over the list of unique annotators
    ann_values = [] # numeric alpha values of this annotator's pairs
    for pair_ids, pair_alpha in pairs_in_tuples:
        if unique not in pair_ids: # the annotator is not part of this pair
            continue
        try:
            ann_values.append(float(pair_alpha))
        except (TypeError, ValueError):
            # non-numeric entry: the pairwise csv holds 'zero division' when alpha
            # could not be computed for a pair; such pairs are left out of the mean
            continue

    if ann_values: # annotators without any usable pair (e.g. a batch with 1 annotator) get no row
        ann_mean.append([unique, round(statistics.mean(ann_values), 3)])

In [None]:
# building the dataframe of mean alpha per annotator and writing it to csv
# (to_csv is called with its defaults, so the dataframe index is written as the first column)

mean_alpha_columns = ['anonymised_participant_id', 'mean_alpha']
mean_alpha = pd.DataFrame(ann_mean, columns=mean_alpha_columns)
mean_alpha.to_csv('mean_alpha_per_annotator.csv')

#### Percentage agreement (perc_agreement.csv)

In [None]:
# Creating 'perc_agreement.csv' with count by responses per extract and % agreement

# the 4 response options, in the order of the csv columns
option_labels = ('Omstreden naar huidige maatstaven', 'Niet omstreden',
                 'Weet ik niet', 'Onleesbare OCR')

# one pass over the merged rows: responses and target word per extract ID
# (row[1] is the extract ID, row[2] the response, row[3] the target)
responses_per_extract = {}
target_per_extract = {}
for row in list_merged:
    responses_per_extract.setdefault(row[1], []).append(row[2])
    target_per_extract[row[1]] = row[3]

with open('perc_agreement.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['extract_id', 'omstreden', 'niet_omstreden',
                     'weet_ik_niet', 'bad_ocr', 'num_annotators', '%_agree']) #header

    for unique_extract_id in list_of_unique_extracts: # iterating over the list of unique extracts IDs
        list_of_responses_per_extract = responses_per_extract.get(unique_extract_id)

        if not list_of_responses_per_extract:
            # extract without any annotation: there is nothing to count and
            # the percentage below would divide by zero, so no row is written
            continue

        # counting every option in the responses list per extract
        option_counts = [list_of_responses_per_extract.count(label) for label in option_labels]

        num_ann = len(list_of_responses_per_extract) # number of annotators per extract
        perc_agree = round(max(option_counts) / num_ann * 100) # share of the most chosen option, in %
        # giving the extract a new ID that starts with its target word
        extract_name = f"{target_per_extract[unique_extract_id]}_{unique_extract_id}"

        writer.writerow([extract_name, *option_counts, num_ann, perc_agree])

#### How does median K alpha per batch change when annotators with mean alpha < 0.2 are excluded? (k_alpha_per_batch_2_options_filtered_alpha.csv)

In [None]:
# IDs (converted with str()) of the annotators whose mean alpha is below 0.2
low_alpha_annotators = [
    str(annotator_row['anonymised_participant_id'])
    for _, annotator_row in mean_alpha.iterrows()
    if annotator_row['mean_alpha'] < 0.2
]

In [None]:
# making an independent duplicate of group_annotators (a new dict holding new lists), so that
# removing the low alpha annotators from it leaves the original batches untouched
no_low_alpha = {batch: list(members) for batch, members in group_annotators.items()}

In [None]:
# dropping the low alpha annotators from every batch they are part of
# ! the lists stored in no_low_alpha are edited in place, so every name bound to them sees the removal
for batch_members in no_low_alpha.values():
    for annotator_id in low_alpha_annotators:
        if annotator_id in batch_members:
            batch_members.remove(annotator_id)

In [None]:
# K alpha for 2 options once more, this time over the batches in no_low_alpha

# the csv has one row per batch: batch name, alpha over the 2 options, number of annotators
# (annotators with mean alpha < 0.2 are the ones meant to be left out)

with open('k_alpha_per_batch_2_options_filtered_alpha.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['batch', 'k_alpha_2', 'num_annotators']) #header

    for group, members in no_low_alpha.items(): # one pass per batch
        # filtered (annotator, extract, response) tuples of the annotators kept in this batch
        responses_list = [
            (coder, item, label)
            for coder, item, label in triples_results_filtered
            if coder in members
        ]

        try:
            k_alpha = round(AnnotationTask(data=responses_list).alpha(), 3)
        except ZeroDivisionError: # original note: batches with only 1 annotator
            k_alpha = 'zero division' # marker written to the csv instead of a number

        writer.writerow([group, k_alpha, len(members)])

#### How does median K alpha change when excluding annotators whose responses to the 5 control questions differ from the unanimous responses of experts (3 or more questions)? (k_alpha_per_batch_2_options_filtered_controls.csv)

In [None]:
# gathering all the responses to control samples

# entries are [annotator ID, (extract ID, response)];
# control samples are the extracts whose ID starts with the prefix 'c'
# (a prefix test, so an ID that merely contains a 'c' elsewhere is not taken for a control)
all_control_responses = [
    [response[0], (response[1], response[2])]
    for response in triples_results # list of responses
    if str(response[1]).startswith('c')
]

In [None]:
# (control extract ID, response) pairs on which the experts agreed 100%
_contentious = 'Omstreden naar huidige maatstaven'
_not_contentious = 'Niet omstreden'

unanimous_responses = list(zip(
    ('c0', 'c1', 'c2', 'c3', 'c4'),
    (_contentious, _contentious, _not_contentious, _contentious, _not_contentious),
))

In [None]:
# annotator IDs, one entry per control response that differs from the experts' responses
# (an annotator appears as many times as they got a control 'wrong')
wrong_controls = [
    annotator_id
    for annotator_id, control_answer in all_control_responses
    if control_answer not in unanimous_responses
]

In [None]:
# annotators with 3 or more 'wrong' controls
annotators_to_filter = [
    annotator_id
    for annotator_id, wrong_count in Counter(wrong_controls).items()
    if wrong_count >= 3
]

In [None]:
# making an independent duplicate of group_annotators (a new dict holding new lists), so that
# removing the annotators who got 3 or more controls 'wrong' does not change group_annotators itself
no_wrong_controls = {batch: list(members) for batch, members in group_annotators.items()}

In [None]:
# dropping the annotators with 'wrong' controls from every batch they are part of
# ! the lists stored in no_wrong_controls are edited in place, so every name bound to them sees the removal
for batch_members in no_wrong_controls.values():
    for annotator_id in annotators_to_filter:
        if annotator_id in batch_members:
            batch_members.remove(annotator_id)

In [None]:
# K alpha for 2 options once more, this time over the batches in no_wrong_controls

# the csv has one row per batch: batch name, alpha over the 2 options, number of annotators
# (annotators with 3 or more 'wrong' controls are the ones meant to be left out)

with open('k_alpha_per_batch_2_options_filtered_controls.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['batch', 'k_alpha_2', 'num_annotators']) #header

    for group, members in no_wrong_controls.items(): # one pass per batch
        # filtered (annotator, extract, response) tuples of the annotators kept in this batch
        responses_list = [
            (coder, item, label)
            for coder, item, label in triples_results_filtered
            if coder in members
        ]

        try:
            k_alpha = round(AnnotationTask(data=responses_list).alpha(), 3)
        except ZeroDivisionError: # original note: batches with only 1 annotator
            k_alpha = 'zero division' # marker written to the csv instead of a number

        writer.writerow([group, k_alpha, len(members)])

In [None]:
# loading the three per-batch alpha tables: unfiltered, without low alpha annotators,
# and without annotators with 'wrong' controls
alpha_per_batch, alpha_per_batch_filtered_low_alpha, alpha_per_batch_filtered_controls = [
    pd.read_csv(csv_path)
    for csv_path in (
        'k_alpha_per_batch_2_options.csv',
        'k_alpha_per_batch_2_options_filtered_alpha.csv',
        'k_alpha_per_batch_2_options_filtered_controls.csv',
    )
]

In [None]:
# comparing medians

# a batch whose alpha could not be computed carries the text 'zero division' in 'k_alpha_2',
# which makes pandas read the whole column as text; the column is converted to numbers
# (non-numeric entries become NaN) and those rows are dropped, so the median is taken over real values only
for alpha_table in (alpha_per_batch,
                    alpha_per_batch_filtered_low_alpha,
                    alpha_per_batch_filtered_controls):
    alpha_table['k_alpha_2'] = pd.to_numeric(alpha_table['k_alpha_2'], errors='coerce')
    alpha_table.dropna(subset=['k_alpha_2'], inplace=True) # same tables, edited in place

print(f"Median K alpha:{alpha_per_batch['k_alpha_2'].median()}\n Median K alpha (no annotators with a < 0.2): {alpha_per_batch_filtered_low_alpha['k_alpha_2'].median()}\n Median K alpha (no annotators with 3 or more 'wrong' controls): {alpha_per_batch_filtered_controls['k_alpha_2'].median()}")