## Get Inter-Annotator Agreement (IAA) rates

In [124]:
# imports

import csv
import re
import pandas as pd
from sklearn.metrics import cohen_kappa_score

In [8]:
# Base names (without the ".csv" extension) of the evaluation-sentence files,
# one entry per "<construction>_<original label>" combination present in the data.
files = [
    "adjective_phrases_inconclusive",
    "adjective_phrases_negative",
    "adjective_phrases_positive",
    "comparisons_inconclusive",
    "noun_phrases_positive",
    "possessives_positive",
    "verb_objects_inconclusive",
    "verb_objects_negative",
    "verb_objects_positive",
    "verb_subjects_inconclusive",
    "verb_subjects_negative",
    "verb_subjects_positive",
]

In [63]:
# Re-save the tab-separated annotation export as a comma-separated file.
# (The original note says this step works around an encoding issue.)

input_path = "../data/IAA/IAA_evaluation_set_Jelke_ANNOTATED.txt"
output_path = "../data/IAA/IAA_evaluation_set_Jelke_ANNOTATED.csv"

# The .txt export has no header row, so the column names are supplied here.
column_names = ['ID', 'Sentence', 'AI entity', 'score']
df = pd.read_csv(
    input_path,
    sep='\t',
    header=None,
    names=column_names,
    index_col=False,
)

# to_csv writes comma-delimited output by default; the row index is left out.
df.to_csv(output_path, index=False)

### Functions for obtaining statistics about IAA on both rater sets

In [114]:
def normalized(string):
    """Return ``string`` trimmed at both ends with every whitespace run collapsed to one space."""
    trimmed = string.strip()
    return re.sub(r'\s+', ' ', trimmed)

def convert_annotation(score):
    """
    Map an annotation code to its numeric class.

    'p', 'p1', 'p2', 'p3'  -> 1 (positive)
    'n1', 'n2', 'n3'       -> 0 (negative)
    'inc'                  -> 2 (inconclusive)

    Any other value is reported on stdout and handed back unchanged.

    NOTE(review): a bare 'n' is not accepted although a bare 'p' is -
    confirm that this is intentional.
    """
    positive_codes = ('p', 'p1', 'p2', 'p3')
    negative_codes = ('n1', 'n2', 'n3')

    if score in positive_codes:
        return 1
    if score in negative_codes:
        return 0
    if score == 'inc':
        return 2

    # Unknown code: warn, then return the input as it came in.
    print("score is malformed")
    return score
    
def get_rater_dict(rater):
    """
    Load one rater's annotated IAA sheet into a dict keyed by sentence ID.

    Reads ``../data/IAA/IAA_evaluation_set_{rater}_ANNOTATED.csv``, skips its
    first (header) row and whitespace-normalizes every field that is kept.

    Parameters
    ----------
    rater : str
        Rater name; it is inserted into the file name.

    Returns
    -------
    dict
        ``{sentence_id: {'sentence': str, 'entity': str, 'score': int}}``.
        If an ID occurs more than once, the first row wins and a message is
        printed for each repeat.

    Raises
    ------
    ValueError
        If the score column of a row cannot be parsed by ``int``.
    IndexError
        If a non-empty row has fewer than four columns.
    """
    rater_dict = {}

    # newline="" is the way the csv module documents opening files, so that
    # line breaks inside quoted fields are left to the reader.
    with open(f"../data/IAA/IAA_evaluation_set_{rater}_ANNOTATED.csv","r", newline="") as infile:

        reader = csv.reader(infile)
        next(reader, None)  # skip the header row (no-op on an empty file)

        for row in reader:
            if not row:
                # csv.reader yields [] for a completely blank line; skip it
                # instead of failing on row[0].
                continue

            sentence_id = normalized(row[0])
            if sentence_id in rater_dict:
                print("why is the ID appearing twice?")
                continue

            rater_dict[sentence_id] = {
                'sentence': normalized(row[1]),
                'entity': normalized(row[2]),
                'score': int(normalized(row[3])),
            }

    return rater_dict

def get_stats(rater_dict,rater,print_res=True):
    """
    Compare one rater's labels with the original annotations and tally the result.

    Every row of the evaluation files
    (``../data/evaluation_sentences_csv/<name>.csv``) whose ID occurs in
    ``rater_dict`` is paired with the rater's entry for that ID. Labels use the
    numeric coding 0 = negative, 1 = positive, 2 = inconclusive.

    Parameters
    ----------
    rater_dict : dict
        ``{sentence_id: {'sentence': ..., 'entity': ..., 'score': ...}}``, as
        built by ``get_rater_dict``.
    rater : str
        Rater name; used in the printed report and as the key under which the
        rater's label is stored in each returned entry.
    print_res : bool, default True
        Print the summary report when true.

    Returns
    -------
    dict
        ``{sentence_id: {'sentence': str, 'entity': str,
        'original_annotation': label, <rater>: label}}`` for every matched ID.

    Raises
    ------
    AssertionError
        If a matched sentence's text differs between the two sources, if the
        number of matched IDs differs from ``len(rater_dict)``, or if the
        tallies are inconsistent (e.g. a duplicated ID or an unexpected label).
    """
    # Kept local so the function does not depend on a notebook-level variable.
    files = ["adjective_phrases_inconclusive",
         "adjective_phrases_negative",
         "adjective_phrases_positive",
         "comparisons_inconclusive",
         "noun_phrases_positive",
         "possessives_positive",
         "verb_objects_inconclusive",
         "verb_objects_negative",
         "verb_objects_positive",
         "verb_subjects_inconclusive",
         "verb_subjects_negative",
         "verb_subjects_positive"
        ]

    POS, NEG, INC = 1, 0, 2

    annotations = {}

    # Per-class tallies; labels outside {0, 1, 2} are not counted here.
    orig_counts = {POS: 0, NEG: 0, INC: 0}
    rater_counts = {POS: 0, NEG: 0, INC: 0}

    # (original label, rater label) -> count, one entry per disagreement type.
    transitions = {
        (POS, NEG): 0,
        (POS, INC): 0,
        (NEG, POS): 0,
        (NEG, INC): 0,
        (INC, POS): 0,
        (INC, NEG): 0,
    }
    agreements = 0
    total_disagreements = 0

    for file in files:

        # newline="" is the way the csv module documents opening files.
        with open(f"../data/evaluation_sentences_csv/{file}.csv","r", newline="") as infile:

            reader = csv.reader(infile)
            next(reader, None)  # skip the header row

            for row in reader:
                if not row:
                    # completely blank line: nothing to compare
                    continue

                sentence_id = normalized(row[0])
                sentence = normalized(row[1])
                if sentence_id not in rater_dict:
                    continue

                rater_entry = rater_dict[sentence_id]
                assert sentence == rater_entry['sentence'], \
                    f"sentence text differs between sources for ID {sentence_id!r}"
                rater_annotation = rater_entry['score']
                AI_entity = rater_entry['entity']
                # the original label is taken from the last column of the row
                sentence_annotation = convert_annotation(normalized(row[-1]))

                # count how many original / rater annotations were of each class
                if sentence_annotation in orig_counts:
                    orig_counts[sentence_annotation] += 1
                if rater_annotation in rater_counts:
                    rater_counts[rater_annotation] += 1

                # add to annotations dict (first occurrence of an ID wins)
                if sentence_id not in annotations:
                    annotations[sentence_id] = {
                        'sentence': sentence,
                        'entity': AI_entity,
                        'original_annotation': sentence_annotation,
                        rater: rater_annotation,
                    }
                else:
                    print("why is the ID appearing twice?")

                # count agreements and disagreements of each type
                if sentence_annotation == rater_annotation:
                    agreements += 1
                else:
                    total_disagreements += 1
                    pair = (sentence_annotation, rater_annotation)
                    if pair in transitions:
                        transitions[pair] += 1
                    else:
                        print("this case is unexpected.")

    # assert counts were correct
    total_cases = len(annotations)
    assert total_cases == len(rater_dict), \
        "not every ID in rater_dict was found in the evaluation files"
    assert total_cases == agreements + total_disagreements, \
        "agreements + disagreements do not add up to the number of matched IDs"

    pos_to_neg = transitions[(POS, NEG)]
    pos_to_inc = transitions[(POS, INC)]
    neg_to_pos = transitions[(NEG, POS)]
    neg_to_inc = transitions[(NEG, INC)]
    inc_to_pos = transitions[(INC, POS)]
    inc_to_neg = transitions[(INC, NEG)]

    # Disagreements are grouped by the ORIGINAL label: positive/negative vs. inconclusive.
    disagreement_on_pos_neg = pos_to_neg + pos_to_inc + neg_to_pos + neg_to_inc
    disagreements_on_inc = total_disagreements - disagreement_on_pos_neg
    assert disagreements_on_inc == inc_to_pos + inc_to_neg, \
        "a disagreement with an unexpected label pair was encountered"

    if print_res:

        def plural(count):
            # 's' unless exactly one
            return '' if count == 1 else 's'

        def was_were(count):
            return 'was' if count == 1 else 'were'

        orig_pos_annotations = orig_counts[POS]
        orig_neg_annotations = orig_counts[NEG]
        orig_inc_annotations = orig_counts[INC]
        rater_pos_annotations = rater_counts[POS]
        rater_neg_annotations = rater_counts[NEG]
        rater_inc_annotations = rater_counts[INC]

        print(f"{rater}'s set had {orig_pos_annotations} positive sentences, {orig_neg_annotations} negative sentences and {orig_inc_annotations} inconclusive cases.")
        print(f"{rater} labeled {rater_pos_annotations} sentences as positive, {rater_neg_annotations} sentences as negative and {rater_inc_annotations} sentences as inconclusive.")
        print(f"There are {agreements} matching annotations out of {total_cases} total cases, and {total_disagreements} disagreements.")
        print(f"There are {disagreement_on_pos_neg} disagreements on positive or negative cases:")
        print(f"{pos_to_neg} positive sentence{plural(pos_to_neg)} {was_were(pos_to_neg)} labeled negative. {neg_to_pos} negative sentence{plural(neg_to_pos)} {was_were(neg_to_pos)} labeled positive.")
        print(f"{pos_to_inc} positive sentence{plural(pos_to_inc)} and {neg_to_inc} negative sentence{plural(neg_to_inc)} were labeled inconclusive.")
        print(f"Out of the {disagreements_on_inc} disagreements on inconclusive cases, {inc_to_pos} were labeled as positive by the rater, and {inc_to_neg} were labeled as negative by the rater.")
        print()

    return annotations

def print_mismatching_sentences(annotations_dict,rater):
    """
    Print every entry whose original label differs from the rater's label.

    ``annotations_dict`` maps IDs to dicts holding the keys 'sentence',
    'entity', 'original_annotation' and ``rater`` (the rater's label is stored
    under the rater's own name). Entries with equal labels are skipped; each
    printed entry is followed by an empty line. Returns None.
    """
    for entry in annotations_dict.values():

        original_label = entry['original_annotation']
        rater_label = entry[rater]

        if original_label == rater_label:
            continue

        print(f"Sentence: {entry['sentence']}")
        print(f"AI entity: {entry['entity']}")
        print(f"Original annotation: {original_label}")
        print(f"{rater}'s annotation: {rater_label}")
        print()


### Display overall rater statistics

In [115]:
# Load each rater's annotated sheet ...
pia_ratings = get_rater_dict(rater='Pia')
jelke_ratings = get_rater_dict(rater='Jelke')

# ... then compare it with the original labels. get_stats prints its summary
# report here because print_res is left at its default (True).
orig_pia_annotations = get_stats(rater_dict=pia_ratings, rater='Pia')
orig_jelke_annotations = get_stats(rater_dict=jelke_ratings, rater='Jelke')

Pia's set had 15 positive sentences, 14 negative sentences and 13 inconclusive cases.
Pia labeled 12 sentences as positive, 23 sentences as negative and 7 sentences as inconclusive.
There are 25 matching annotations out of 42 total cases, and 17 disagreements.
There are 6 disagreements on positive or negative cases:
1 positive sentence was labeled negative. 0 negative sentences were labeled positive.
4 positive sentences and 1 negative sentence were labeled inconclusive.
Out of the 11 disagreements on inconclusive cases, 2 were labeled as positive by the rater, and 9 were labeled as negative by the rater.

Jelke's set had 11 positive sentences, 10 negative sentences and 21 inconclusive cases.
Jelke labeled 19 sentences as positive, 20 sentences as negative and 3 sentences as inconclusive.
There are 18 matching annotations out of 42 total cases, and 24 disagreements.
There are 5 disagreements on positive or negative cases:
2 positive sentences were labeled negative. 2 negative sentences

### Display mismatching sentences

In [116]:
# Heading followed by one empty line, then each rater's disagreements in turn.
print("Mismatching sentences:", end="\n\n")
for rater_name, rater_annotations in (
    ('Pia', orig_pia_annotations),
    ('Jelke', orig_jelke_annotations),
):
    print_mismatching_sentences(rater_annotations, rater_name)

Mismatching sentences:

Sentence: While governments and businesses are eager to enjoy the benefits of AI innovations, the mixed impact of these autonomous and intelligent systems on human well-being has become a pressing issue.
AI entity: these autonomous and intelligent systems
Original annotation: 2
Pia's annotation: 0

Sentence: These relationships were found to be central to the development and adoption of LLMs, but they can also be the terrain for uncalibrated trust and reliance on untrustworthy LLMs.
AI entity: untrustworthy LLMs
Original annotation: 2
Pia's annotation: 0

Sentence: Finally, we analyze 6 Arabic pre-training corpora and find that commonly used sources such as Wikipedia may not be best suited to build culturally aware LMs, if used as they are without adjustment.
AI entity: culturally aware LMs
Original annotation: 2
Pia's annotation: 0

Sentence: To address this blind spot, this study introduces the AI Family Integration Index (AFII), a ten dimensional benchmarking

### Calculate Cohen's kappa on positive-negative cases alone, and in total

In [118]:
def get_cohen_kappa_score(rater,exclude_inconclusive=False):
    """
    Compute Cohen's kappa between the original annotations and one rater.

    The rater's sheet is loaded with ``get_rater_dict`` and paired with the
    original labels via ``get_stats`` (report printing switched off).

    Parameters
    ----------
    rater : str
        Rater name, passed on to ``get_rater_dict`` / ``get_stats``.
    exclude_inconclusive : bool, default False
        When true, only sentences where neither side chose the inconclusive
        label (2) are scored.

    Returns
    -------
    The value returned by ``sklearn.metrics.cohen_kappa_score`` for the
    (original, rater) label sequences.
    """
    rater_dict = get_rater_dict(rater)
    annotations = get_stats(rater_dict,rater,print_res=False)

    # Keep each sentence's two labels together so that filtering can never
    # leave the two sequences with different lengths.
    label_pairs = [
        (entry['original_annotation'], entry[rater])
        for entry in annotations.values()
    ]

    if exclude_inconclusive:
        # only include sentences labeled positive or negative by both sides
        label_pairs = [
            (original, rated)
            for original, rated in label_pairs
            if original != 2 and rated != 2
        ]

    y_true = [original for original, _ in label_pairs]
    y_pred = [rated for _, rated in label_pairs]

    return cohen_kappa_score(y_true, y_pred)

# Kappa over all three classes, and over positive/negative cases only, per rater.
pia_cohen_kappa = get_cohen_kappa_score('Pia', exclude_inconclusive=False)
pia_cohen_kappa_excl_inc = get_cohen_kappa_score('Pia', exclude_inconclusive=True)

jelke_cohen_kappa = get_cohen_kappa_score('Jelke', exclude_inconclusive=False)
jelke_cohen_kappa_excl_inc = get_cohen_kappa_score('Jelke', exclude_inconclusive=True)

# One entry per output line; the empty string separates the two raters.
kappa_report_lines = [
    "Cohen Kappa score on Pia's set:",
    f"including inconclusive: {pia_cohen_kappa}",
    f"without inconclusive: {pia_cohen_kappa_excl_inc}",
    "",
    "Cohen Kappa score on Jelke's set:",
    f"including inconclusive: {jelke_cohen_kappa}",
    f"without inconclusive: {jelke_cohen_kappa_excl_inc}",
]
print("\n".join(kappa_report_lines))

Cohen Kappa score on Pia's set:
including inconclusive: 0.39026473099914605
without inconclusive: 0.9154929577464789

Cohen Kappa score on Jelke's set:
including inconclusive: 0.2198142414860681
without inconclusive: 0.595959595959596


In [122]:
def cohen_kappa_per_class(cls,rater):
    """
    Print a one-vs-rest Cohen's kappa for a single class and rater.

    Both the original and the rater's labels are reduced to 1 (label equals
    ``cls``) or 0 (any other label) before ``cohen_kappa_score`` is applied.
    The score is printed with three decimals; nothing is returned.
    """
    annotations = get_stats(get_rater_dict(rater),rater,print_res=False)

    # binary indicator sequences: 1 if label == cls, else 0
    y_true_binary = []
    y_pred_binary = []
    for entry in annotations.values():
        y_true_binary.append(1 if entry['original_annotation'] == cls else 0)
        y_pred_binary.append(1 if entry[rater] == cls else 0)

    kappa = cohen_kappa_score(y_true_binary, y_pred_binary)
    print(f"{rater}'s Cohen's Kappa for class {cls}: {kappa:.3f}")

# Report positive (1), negative (0) and inconclusive (2) agreement for each rater.
for rater_name in ('Pia', 'Jelke'):
    for class_label in (1, 0, 2):
        cohen_kappa_per_class(class_label, rater_name)

Pia's Cohen's Kappa for class 1: 0.620
Pia's Cohen's Kappa for class 0: 0.492
Pia's Cohen's Kappa for class 2: -0.021
Jelke's Cohen's Kappa for class 1: 0.401
Jelke's Cohen's Kappa for class 0: 0.219
Jelke's Cohen's Kappa for class 2: 0.048
