### 1. NLI annotations: Raw agreement 

In [1]:
import pandas as pd 
import numpy as np 
pd.set_option('future.no_silent_downcasting', True)

Let's get the raw agreement rates per example. 

In [2]:
# Toy annotation matrix: the first row holds the example numbers and each
# following row holds one annotator's labels for those four examples.
data = pd.DataFrame(
    [
        [1, 2, 3, 4],   # example number
        [-1, 0, 0, 1],  # annotator 1
        [1, 1, 0, 1],   # annotator 2
    ]
)
data

Unnamed: 0,0,1,2,3
0,1,2,3,4
1,-1,0,0,1
2,1,1,0,1


In [3]:
# Drop the example-number row (index label 0) so only annotator rows remain.
# Dropping by label with errors="ignore" keeps this cell idempotent: running it
# again is a no-op, whereas slicing with data[1:] would discard one more
# annotator row on every re-run.
data = data.drop(index=0, errors="ignore")
data

Unnamed: 0,0,1,2,3
1,-1,0,0,1
2,1,1,0,1


In [4]:
LABEL_OPTIONS = {-1, 0, 1}  # contradiction, neutral, entailment


def get_label_proportions(column):
    """Return the share of annotations each label received for one example.

    Parameters
    ----------
    column : pandas.Series
        The labels given to a single example, one entry per annotator.

    Returns
    -------
    pandas.Series
        Proportions indexed by label and sorted by label. Every label in
        ``LABEL_OPTIONS`` is present; labels that nobody chose get 0.0.
    """
    counts = column.value_counts(normalize=True)
    for label in LABEL_OPTIONS:
        if label not in counts.index:
            # .loc always appends by label. A plain ``counts[label] = ...`` with
            # an integer key can be interpreted positionally when the index is
            # not integer-like, silently overwriting an observed proportion.
            counts.loc[label] = 0.0
    return counts.sort_index()

# apply() hands each column of `data` (one example) to get_label_proportions
# and assembles the returned Series into a label-by-example table.
proportions_df = data.apply(get_label_proportions)
proportions_df

Unnamed: 0,0,1,2,3
-1,0.5,0.0,0.0,0.0
0,0.0,0.5,1.0,0.0
1,0.5,0.5,0.0,1.0


Full dataset: our in-class NLI annotations

In [5]:
# Load the annotator-document matrix: a "Timestamp" column followed by one
# column per NLI example, with one row per submitted response.
# Alternative input used previously: "anno-toy.csv".
df = pd.read_csv(filepath_or_buffer="anno.csv")
df

Unnamed: 0,Timestamp,"Example 1: \n\nPremise: \n""A tennis match with multiple females playing."" \n\nHypothesis: \n""Some women are playing a sport.""","Example 2: \n\nPremise: \n""Japanese tourists visiting other countries are still shocked to find Japanese consumer goods available for far less than they have to pay at home."" \n\nHypothesis: \n""Some Japanese people travel to outside Japanese borders to buy certain items for cheaper.""","Example 3: \n\nPremise: \n""The average number of boxes per route differs by a factor of two."" \n\nHypothesis: \n""The average amount of boxes is always the same.""","Example 4: \n\nPremise: \n""Paula swatted the fly."" \n\nHypothesis: \n""The swatting happened in a forceful manner."""
0,11/11/2024 9:14:56,y=1: Entailment,y=0: Neutral,y=-1: Contradiction,y=0: Neutral
1,11/11/2024 9:17:14,y=1: Entailment,y=0: Neutral,y=-1: Contradiction,y=0: Neutral
2,11/12/2024 10:07:34,y=1: Entailment,y=0: Neutral,y=-1: Contradiction,y=0: Neutral
3,11/12/2024 10:07:37,y=1: Entailment,y=1: Entailment,y=-1: Contradiction,y=0: Neutral
4,11/12/2024 10:07:48,y=1: Entailment,y=0: Neutral,y=-1: Contradiction,y=0: Neutral
5,11/12/2024 10:07:49,y=1: Entailment,y=-1: Contradiction,y=-1: Contradiction,y=0: Neutral
6,11/12/2024 10:07:54,y=1: Entailment,y=1: Entailment,y=-1: Contradiction,y=0: Neutral
7,11/12/2024 10:07:57,y=1: Entailment,y=1: Entailment,y=-1: Contradiction,y=0: Neutral
8,11/12/2024 10:07:57,y=1: Entailment,y=-1: Contradiction,y=-1: Contradiction,y=0: Neutral
9,11/12/2024 10:07:59,y=0: Neutral,y=-1: Contradiction,y=0: Neutral,y=0: Neutral


In [6]:
# Remove the timestamp column so that only the label columns remain.
# errors="ignore" keeps the cell safe to re-run: once the column is gone,
# a plain drop would raise KeyError.
df = df.drop(columns="Timestamp", errors="ignore")
df

Unnamed: 0,"Example 1: \n\nPremise: \n""A tennis match with multiple females playing."" \n\nHypothesis: \n""Some women are playing a sport.""","Example 2: \n\nPremise: \n""Japanese tourists visiting other countries are still shocked to find Japanese consumer goods available for far less than they have to pay at home."" \n\nHypothesis: \n""Some Japanese people travel to outside Japanese borders to buy certain items for cheaper.""","Example 3: \n\nPremise: \n""The average number of boxes per route differs by a factor of two."" \n\nHypothesis: \n""The average amount of boxes is always the same.""","Example 4: \n\nPremise: \n""Paula swatted the fly."" \n\nHypothesis: \n""The swatting happened in a forceful manner."""
0,y=1: Entailment,y=0: Neutral,y=-1: Contradiction,y=0: Neutral
1,y=1: Entailment,y=0: Neutral,y=-1: Contradiction,y=0: Neutral
2,y=1: Entailment,y=0: Neutral,y=-1: Contradiction,y=0: Neutral
3,y=1: Entailment,y=1: Entailment,y=-1: Contradiction,y=0: Neutral
4,y=1: Entailment,y=0: Neutral,y=-1: Contradiction,y=0: Neutral
5,y=1: Entailment,y=-1: Contradiction,y=-1: Contradiction,y=0: Neutral
6,y=1: Entailment,y=1: Entailment,y=-1: Contradiction,y=0: Neutral
7,y=1: Entailment,y=1: Entailment,y=-1: Contradiction,y=0: Neutral
8,y=1: Entailment,y=-1: Contradiction,y=-1: Contradiction,y=0: Neutral
9,y=0: Neutral,y=-1: Contradiction,y=0: Neutral,y=0: Neutral


In [7]:
# Clean-up: map the answer strings to their numeric labels.

replacements = {
    'y=-1: Contradiction': -1,
    'y=0: Neutral': 0,
    'y=1: Entailment': 1,
}

# Rebind rather than replace in place, so the frame produced by the previous
# cell is not mutated behind the reader's back. Because
# 'future.no_silent_downcasting' is enabled, replace() leaves the columns as
# object dtype; infer_objects() converts each column to a numeric dtype when
# all of its values were mapped.
df = df.replace(replacements).infer_objects()
df

Unnamed: 0,"Example 1: \n\nPremise: \n""A tennis match with multiple females playing."" \n\nHypothesis: \n""Some women are playing a sport.""","Example 2: \n\nPremise: \n""Japanese tourists visiting other countries are still shocked to find Japanese consumer goods available for far less than they have to pay at home."" \n\nHypothesis: \n""Some Japanese people travel to outside Japanese borders to buy certain items for cheaper.""","Example 3: \n\nPremise: \n""The average number of boxes per route differs by a factor of two."" \n\nHypothesis: \n""The average amount of boxes is always the same.""","Example 4: \n\nPremise: \n""Paula swatted the fly."" \n\nHypothesis: \n""The swatting happened in a forceful manner."""
0,1,0,-1,0
1,1,0,-1,0
2,1,0,-1,0
3,1,1,-1,0
4,1,0,-1,0
5,1,-1,-1,0
6,1,1,-1,0
7,1,1,-1,0
8,1,-1,-1,0
9,0,-1,0,0


In [8]:
# Label distribution per example over all annotators
# (rows: labels, columns: examples).
proportions_df = df.apply(get_label_proportions)
proportions_df

Unnamed: 0,"Example 1: \n\nPremise: \n""A tennis match with multiple females playing."" \n\nHypothesis: \n""Some women are playing a sport.""","Example 2: \n\nPremise: \n""Japanese tourists visiting other countries are still shocked to find Japanese consumer goods available for far less than they have to pay at home."" \n\nHypothesis: \n""Some Japanese people travel to outside Japanese borders to buy certain items for cheaper.""","Example 3: \n\nPremise: \n""The average number of boxes per route differs by a factor of two."" \n\nHypothesis: \n""The average amount of boxes is always the same.""","Example 4: \n\nPremise: \n""Paula swatted the fly."" \n\nHypothesis: \n""The swatting happened in a forceful manner."""
-1,0.0,0.2,0.96,0.0
0,0.12,0.64,0.04,0.92
1,0.88,0.16,0.0,0.08


### 2. Krippendorff's Alpha

In [None]:
# Requires the `krippendorff` package. If it is not installed, run: %pip install krippendorff

In [9]:
import krippendorff

Toy data from our lecture example

In [None]:
# Each string is one coder's ratings over the same 15 units; "*" marks a unit
# that the coder did not rate.
reliability_data_str = (
    "*    *    *    *    *    3    4    1    2    1    1    3    3    *    3",  # coder A
    "1    *    2    1    3    3    4    3    *    *    *    *    *    *    *",  # coder B
    "*    *    2    1    3    4    4    *    2    1    1    3    3    *    4",  # coder C
)

# Parse into a coders-by-units nested list, with NaN standing in for "*".
reliability_data = [
    [int(token) if token != "*" else np.nan for token in ratings.split()]
    for ratings in reliability_data_str
]
reliability_data

In [None]:
# Nominal level of measurement: the ratings are compared as unordered categories.
alpha = krippendorff.alpha(
    reliability_data=reliability_data,
    level_of_measurement="nominal",
)
print("Krippendorff's alpha (toy data): ", np.round(alpha, 3))

In [10]:
# Real data from our in-class NLI annotations.
# Convert the DataFrame to a plain list of lists: one inner list per row of
# `df` (i.e. per annotator), matching the coder-per-row layout of the toy
# example.
df_list = df.to_numpy().tolist()
df_list

[[1, 0, -1, 0],
 [1, 0, -1, 0],
 [1, 0, -1, 0],
 [1, 1, -1, 0],
 [1, 0, -1, 0],
 [1, -1, -1, 0],
 [1, 1, -1, 0],
 [1, 1, -1, 0],
 [1, -1, -1, 0],
 [0, -1, 0, 0],
 [1, 0, -1, 0],
 [1, 0, -1, 0],
 [1, 0, -1, 0],
 [1, 0, -1, 0],
 [1, -1, -1, 0],
 [1, 0, -1, 0],
 [1, 0, -1, 0],
 [0, 1, -1, 1],
 [1, 0, -1, 0],
 [1, 0, -1, 0],
 [1, 0, -1, 0],
 [1, 0, -1, 0],
 [1, 0, -1, 0],
 [1, 0, -1, 0],
 [0, -1, -1, 1]]

In [11]:
# Same nominal-level alpha as for the toy data, now on the class annotations.
alpha = krippendorff.alpha(
    reliability_data=df_list,
    level_of_measurement="nominal",
)
print("Krippendorff's alpha (our data): ", np.round(alpha, 3))

Krippendorff's alpha (our data):  0.621
