In [0]:
import pandas as pd
import numpy as np

### Load Data

The dataframe is grouped by comment and contains columns that count the number of annotators that marked the comment as toxic or not toxic. These two columns will be used to compute the inter annotator agreement.

In [0]:
TOXIC_COLUMNS = ['toxic', 'not_toxic']

In [0]:
wiki = pd.read_csv('../data/wiki.csv')

In [0]:
wiki.head()

Unnamed: 0,comment_text,not_toxic,toxic
0,Re: Unblock \n\nTa! (talk ⋅ contribs),9,1
1,What do you mean by link spam! - These are not...,10,0
2,"¿ Better fewer but worse?, as another editor ...",9,1
3,Origins \n\nThere is mention of the possible ...,10,0
4,Hi Tony Sidaway \n\nI have to disagree with y...,10,0


### Krippendorf's Alpha

In [0]:
def add_row_to_coincidence(o, row, columns):
    m_u = row.sum(1)
    for i in columns:
        for j in columns:
            if i == j:
                o[i][j] = o[i][j] + row[i]*(row[i]-1)/(m_u-1)
            else:
                o[i][j] = o[i][j] + row[i]*row[j]/(m_u-1)
    return o

def make_coincidence_matrix(df, columns):
    df = df[columns]
    n = df.shape[0]
    num_cols = len(columns)
    o = pd.DataFrame(np.zeros((num_cols,num_cols)), index = columns, columns=columns)
    for i in range(n):
        o = add_row_to_coincidence(o, df[i:i+1], columns)
    return o

def binary_distance(i,j):
    return i!=j

def interval_distance(i,j):
    return (int(i)-int(j))**2

def e(n, i, j):
    if i == j:
        return n[i]*(n[i]-1)/sum(n)-1
    else:
        return n[i]*n[j]/sum(n)-1

def D_e(o, columns, distance):
    n = o.sum(1)
    output = 0
    for i in columns:
        for j in columns:
            output = output + e(n,i,j)*distance(i,j)
    return output

def D_o(o, columns, distance):
    output = 0
    for i in columns:
        for j in columns:
            output = output + o[i][j]*distance(i,j)
    return output

def Krippendorf_alpha(df, columns, distance = binary_distance, o = None):
    if o is None:
        o = make_coincidence_matrix(df, columns)
    d_o = D_o(o, columns, distance)
    d_e = D_e(o, columns, distance)
    return (1 - d_o/d_e)

In [0]:
print("Krippendorf's Alpha for Wikipedia Toxicity: ")
Krippendorf_alpha(wiki, TOXIC_COLUMNS)

Krippendorf's Alpha for Wikipedia Toxicity: 


0.47679021368279273