# Fleiss' kappa calculation

In [2]:
import pandas as pd
import numpy as np
import statsmodels
from statsmodels.stats.inter_rater import fleiss_kappa

In [3]:
# Load the tab-separated annotation sheet into a DataFrame.
df = pd.read_csv(
    './trial_medical_2.tsv',
    sep='\t',
)
# df_l2 = pd.read_csv('./annotations/NA2.conll',sep= ' ',header=0)
# df_l3 = pd.read_csv('./annotations/NA3.conll',sep= ' ',header=0)
# df_l4 = pd.read_csv('./annotations/NA4.conll',sep= ' ',header=0)

In [4]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,Utterances,Marike,Edwin,Sabina,gpt
0,How was your day today?,none,none,none,communication
1,"Oh, it was not too bad, thank you.",none,none,none,none
2,That's good to hear! Did you have any communic...,communication,communication,communication,communication
3,One of my grandchildren called me today and we...,communication,interpersonal,communication,communication
4,That's lovely! Have you; had trouble rememberi...,learning,none,none,communication


In [5]:
# Map the long category names found in the 'gpt' column to short labels.
# Values that are not keys of this mapping ('communication', 'mobility',
# 'none', or anything unexpected) are left unchanged, so no identity
# entries are needed.
GPT_LABEL_SHORT_NAMES = {
    'learning and applying knowledge': 'learning',
    'general tasks and demands': 'general',
    'self-care': 'selfcare',
    'domestic life areas': 'domestic',
    'interpersonal interactions and relationships': 'interpersonal',
    'major life areas': 'major',
    'community, social and civic life': 'social',
}
# Exact, whole-value replacement (not substring matching).
df['gpt'] = df['gpt'].replace(GPT_LABEL_SHORT_NAMES)



In [6]:
def fleiss_kappa_count(M):
    """Compute Fleiss' kappa for a group of annotators.

    :param M: a table of shape (N, k) with N = number of subjects and
        k = number of categories. ``M[i, j]`` is the number of raters who
        assigned the i-th subject to the j-th category. Every row must sum
        to the same number of raters, and that number must be at least 2.
    :type M: array-like (nested list, numpy.ndarray or numpy.matrix)

    :rtype: float
    :return: Fleiss' kappa rounded to 4 decimals. ``nan`` is returned when
        all ratings fall into a single category, because chance agreement
        is then 1 and kappa is undefined.
    :raises ValueError: if M is not a non-empty 2-D table, if the rows do
        not all contain the same number of ratings, or if there are fewer
        than 2 raters per subject.
    """
    # Work on a plain float ndarray so that `*` below is element-wise even
    # when the caller passes a numpy.matrix (where `*` is a matrix product)
    # and so that nested lists are accepted as well.
    counts = np.asarray(M, dtype=float)
    if counts.ndim != 2 or counts.size == 0:
        raise ValueError("M must be a non-empty 2-D table of rating counts")

    N, k = counts.shape  # N is # of items, k is # of categories

    ratings_per_item = counts.sum(axis=1)
    n_annotators = float(ratings_per_item[0])  # # of annotators
    # The formulas below assume a constant number of raters per item; a
    # ragged table would otherwise produce a silently wrong score.
    if np.any(ratings_per_item != n_annotators):
        raise ValueError("every row of M must sum to the same number of raters")
    if n_annotators < 2:
        raise ValueError("Fleiss' kappa needs at least 2 raters per subject")

    tot_annotations = N * n_annotators  # the total # of annotations
    category_sum = counts.sum(axis=0)  # the sum of each category over all items

    # chance agreement
    p = category_sum / tot_annotations  # share of each category over all annotations
    PbarE = np.sum(p * p)  # expected agreement by chance

    # observed agreement: per item, the fraction of agreeing rater pairs
    P = (np.sum(counts * counts, axis=1) - n_annotators) / (n_annotators * (n_annotators - 1))
    Pbar = np.sum(P) / N  # mean observed agreement over all items

    if PbarE == 1:
        # Only one category was ever used: 0 / 0, kappa is undefined.
        return float("nan")

    return round((Pbar - PbarE) / (1 - PbarE), 4)


In [7]:
# One plain Python list of labels per annotator column, in DataFrame row order.
a1, a2, a3, a4 = (
    df[column].tolist() for column in ('Marike', 'Edwin', 'Sabina', 'gpt')
)
# Scores noted next to the individual annotators:
#   Marike  trial 1: 0.56
#   Edwin   trial 1: 0.51
#   Sabina  trial 1: 0.55
# NOTE(review): presumably each of these is that annotator paired with gpt - confirm.
# trial 1: humans only 0.69, all annotators 0.58
# trial 2: humans only 0.65, all annotators 0.42 (all conversations are gpt conversations)
# higher agreement on the manual set than on the gpt set

In [8]:
# Write only the gpt label column to a tab-separated file.
df_1 = df.loc[:, "gpt"]
df_1.to_csv('writegpt_2.tsv', sep='\t')

In [9]:
# Stack the four label lists as rows, then transpose so that each row is one
# utterance and each column is one annotator (a1, a2, a3, a4).
label_lists = [a1, a2, a3, a4]
array_1 = np.array(label_lists).T
# array_1 = np.array((a4,a1)).T # Edwin and Sabina agree with each other

In [10]:
# Show the label matrix (rows: utterances, columns: annotators).
print(array_1)

[['none' 'none' 'none' 'communication']
 ['none' 'none' 'none' 'none']
 ['communication' 'communication' 'communication' 'communication']
 ['communication' 'interpersonal' 'communication' 'communication']
 ['learning' 'none' 'none' 'communication']
 ['communication' 'communication' 'learning' 'communication']
 ['communication' 'learning' 'communication' 'communication']
 ['communication' 'none' 'communication' 'communication']
 ['none' 'social' 'none' 'interpersonal']
 ['social' 'social' 'social' 'major']
 ['none' 'none' 'none' 'interpersonal']
 ['none' 'none' 'none' 'interpersonal']
 ['none' 'none' 'none' 'interpersonal']
 ['none' 'none' 'none' 'major']
 ['social' 'social' 'social' 'interpersonal']
 ['none' 'none' 'none' 'interpersonal']
 ['general' 'general' 'general' 'general']
 ['domestic' 'none' 'domestic' 'general']
 ['domestic' 'general' 'domestic' 'general']
 ['domestic' 'general' 'domestic' 'general']
 ['general' 'none' 'general' 'domestic']
 ['general' 'none' 'general' 'gener

In [11]:
# Column order of the count table; position i holds the number of annotators
# who chose CATEGORIES[i] for an utterance.
CATEGORIES = [
    "learning",
    "general",
    "communication",
    "mobility",
    "selfcare",
    "domestic",
    "interpersonal",
    "major",
    "social",
    "none",
]
category_index = {label: position for position, label in enumerate(CATEGORIES)}

# For every utterance (row of array_1), count how many annotators picked each
# category. The result is a list of rows, each with len(CATEGORIES) integers.
list_array_2 = []
for row in array_1:
    # Start from zeros so that even an empty row yields a full-width count row.
    count = [0] * len(CATEGORIES)
    for c in row:
        position = category_index.get(c)
        if position is None:
            # Unknown label: report it and leave it uncounted, so this row
            # will sum to fewer ratings than the others.
            print("ERROR !!! not expected label:", c)
        else:
            count[position] += 1
    list_array_2.append(count)

In [12]:
# Turn the per-utterance count rows into a 2-D array (rows: utterances,
# columns: categories) and show it. This count table is the input to both
# kappa implementations called further down.
array_2 = np.array(list_array_2)
print(array_2)

[[0 0 1 0 0 0 0 0 0 3]
 [0 0 0 0 0 0 0 0 0 4]
 [0 0 4 0 0 0 0 0 0 0]
 [0 0 3 0 0 0 1 0 0 0]
 [1 0 1 0 0 0 0 0 0 2]
 [1 0 3 0 0 0 0 0 0 0]
 [1 0 3 0 0 0 0 0 0 0]
 [0 0 3 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 1 0 1 2]
 [0 0 0 0 0 0 0 1 3 0]
 [0 0 0 0 0 0 1 0 0 3]
 [0 0 0 0 0 0 1 0 0 3]
 [0 0 0 0 0 0 1 0 0 3]
 [0 0 0 0 0 0 0 1 0 3]
 [0 0 0 0 0 0 1 0 3 0]
 [0 0 0 0 0 0 1 0 0 3]
 [0 4 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 2 0 0 0 1]
 [0 2 0 0 0 2 0 0 0 0]
 [0 2 0 0 0 2 0 0 0 0]
 [0 2 0 0 0 1 0 0 0 1]
 [0 3 0 0 0 0 0 0 0 1]
 [0 3 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 1 0 0 3]
 [0 0 0 0 0 0 4 0 0 0]
 [0 0 0 0 0 0 1 0 0 3]
 [0 0 0 0 0 0 4 0 0 0]
 [0 1 0 0 0 0 1 0 0 2]
 [0 0 0 0 0 0 4 0 0 0]
 [0 0 0 0 0 0 4 0 0 0]
 [0 0 0 0 0 0 4 0 0 0]
 [0 0 0 0 0 0 4 0 0 0]
 [0 0 0 0 0 0 0 0 0 4]
 [0 0 0 0 0 0 0 0 0 4]
 [0 3 0 0 0 1 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 3]
 [2 0 0 0 0 0 0 0 0 2]
 [2 0 1 0 0 0 0 0 0 1]
 [2 2 0 0 0 0 0 0 0 0]
 [4 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 4]]


In [172]:
# fleiss_kappa_count(array_2) # 0.6924
# fleiss_kappa(array_2, method='fleiss') # 0.6924 (only human annotators) substantial 

In [13]:
# Fleiss' kappa from statsmodels for the count table.
# Earlier note on this cell: 0.5802, described as moderate / weak agreement.
# NOTE(review): the recorded output below is 0.4861, so the 0.5802 figure
# looks stale - confirm which run it belongs to.
fleiss_kappa(array_2, method='fleiss')

0.48611894466378397

In [14]:
# Same count table through the hand-written implementation, which rounds its
# result to 4 decimals.
fleiss_kappa_count(array_2) 

0.4861