In [1]:
# Imports
import krippendorff as kd
import time
import os
import glob
import csv
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats import inter_rater as irr
import openai
import ray
from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt
import seaborn as sns

# File locations
# The project root is read from the ENDOQAEVAL_DIR environment variable when it is
# set; otherwise it falls back to the original local checkout path, so existing
# runs behave exactly as before while other machines can point elsewhere.
# NOTE: `dir` shadows the built-in dir(); the name is kept because later cells use it.
dir = os.environ.get('ENDOQAEVAL_DIR', '/Users/Federica_1/Documents/GitHub/endoQAeval')
out_dir = os.path.join(dir, 'output')   # annotation CSVs are read from here
fig_dir = os.path.join(dir, 'figures')



# Calculating Inter-Annotator Agreement

In [2]:
ayers = pd.read_csv(os.path.join(dir, 'data', 'ayers2023.csv'))

print('Agreement Ayers et al.')

# For every metric, build one rating sequence per evaluator (1-3) by placing the
# evaluator's "(Physician)" column in front of their "(ChatGPT)" column, then
# compute interval-level Krippendorff's alpha over the three sequences.
for metric in ("Quality", "Empathy"):
    print(metric)
    annotations = [
        list(ayers[f'Eval {annotator} {metric} (Physician)'])
        + list(ayers[f'Eval {annotator} {metric} (ChatGPT)'])
        for annotator in range(1, 4)
    ]

    a = kd.alpha(annotations, level_of_measurement='interval')
    print(a)

Agreement Ayers et al.
Quality
0.408363115580218
Empathy
0.4896056452190445


In [3]:
# Load the two annotation exports (only the column ranges needed) and join them
# on the keys that identify a single response.
pat = pd.read_csv(os.path.join(out_dir, 'patient_annotations.csv'), usecols=range(1,13))
doc = pd.read_csv(os.path.join(out_dir, 'specialist_annotations.csv'), usecols=range(4,8))
merged = pd.merge(pat, doc, on=['response_id', 'prompt', 'type'])

# Per-metric mean of the Annotator 2 and Annotator 3 ratings.
for column in ("Information Quality", "Empathy", "Actionability"):
    rater_columns = [f'Annotator {rater} - {column}' for rater in (2, 3)]
    merged[f'Average {column}'] = merged[rater_columns].mean(axis=1)
merged[:1]

Unnamed: 0,question_id,question,response,response_id,prompt,type,Annotator 2 - Information Quality,Annotator 2 - Empathy,Annotator 2 - Actionability,Annotator 3 - Information Quality,Annotator 3 - Empathy,Annotator 3 - Actionability,Specialist - Information Quality,Average Information Quality,Average Empathy,Average Actionability
0,Endo_d8bcw7_post,I had a lap on 5/10/19 and my gyno said my bla...,"as an ai, i don't have the ability to diagnose...",endoR0,no_prompt,endo,4,2,2,3,2,2,4,3.5,2.0,2.0


### Krippendorff's Alpha and Fleiss' Kappa for three annotators

In [4]:
# Agreement across three annotators: only Information Quality has all three raters.
def calculate_agreement_three(data):
    """Print three-rater agreement statistics for Information Quality.

    Reads the columns '<rater> - Information Quality' for Annotator 2,
    Annotator 3 and Specialist from ``data`` and prints, rounded to two
    decimals, Krippendorff's alpha (interval level) and Fleiss' kappa.

    Args:
        data: object indexable by those column names (e.g. a DataFrame).

    Returns:
        None. Results are printed.
    """
    raters = ('Annotator 2', 'Annotator 3', 'Specialist')
    for metric in ("Information Quality",):
        print(metric)
        ratings = [data[f'{rater} - {metric}'] for rater in raters]

        alpha = kd.alpha(ratings, level_of_measurement='interval')
        print("Krippendorff's alpha", round(alpha, 2))

        # `ratings` holds one sequence per rater; transpose so each row is one
        # item before turning the ratings into per-category counts.
        items_by_rater = np.array(ratings).transpose()
        category_counts = irr.aggregate_raters(items_by_rater)[0]
        kappa = irr.fleiss_kappa(category_counts, method='fleiss')
        print("Fleiss' Kappa", round(kappa, 2))

### Krippendorff's Alpha and Cohen's Kappa for pairs of annotators

In [5]:
# Pairwise agreement between two annotators at a time.
def calculate_agreement_two(data):
    """Print pairwise agreement statistics for every metric.

    For each metric, every rater pair is scored with Krippendorff's alpha
    (interval level) and linearly weighted Cohen's kappa, both rounded to two
    decimals. Pairs that include the Specialist are evaluated only for
    Information Quality; for the other metrics they are skipped.

    Args:
        data: object indexable by '<rater> - <metric>' column names.

    Returns:
        None. Results are printed, followed by a blank separator per metric.
    """
    rater_pairs = (
        ('Annotator 2', 'Annotator 3'),
        ('Annotator 2', 'Specialist'),
        ('Annotator 3', 'Specialist'),
    )
    for metric in ("Information Quality", "Empathy", "Actionability"):
        print(metric)
        for first, second in rater_pairs:
            involves_specialist = 'Spec' in first or 'Spec' in second
            if 'Info' not in metric and involves_specialist:
                # Specialist pairs are only scored for Information Quality.
                continue
            first_ratings = data[first + ' - ' + metric]
            second_ratings = data[second + ' - ' + metric]
            alpha = kd.alpha([first_ratings, second_ratings], level_of_measurement='interval')
            print(first, second)
            print("Krippendorff's alpha", round(alpha, 2))
            kappa = cohen_kappa_score(first_ratings, second_ratings, weights='linear')
            print("Cohen's Kappa", round(kappa, 2))
        print('\n')

### Trying Variance and Spread

In [6]:
# Rating columns that feed each metric's per-response variance and spread.
rating_columns = {
    "Information Quality": ['Annotator 2 - Information Quality',
                            'Annotator 3 - Information Quality',
                            'Specialist - Information Quality'],
    "Empathy": ['Annotator 2 - Empathy', 'Annotator 3 - Empathy'],
    "Actionability": ['Annotator 2 - Actionability', 'Annotator 3 - Actionability'],
}
measures = {name: {metric: [] for metric in rating_columns}
            for name in ('Variance', 'Spread')}

# One pass over the rows: population variance (numpy default) and max - min
# of the ratings a response received for each metric.
for row_label, row in merged.iterrows():
    for metric, columns in rating_columns.items():
        values = row[columns].values
        measures['Variance'][metric].append(np.array(values).var())
        measures['Spread'][metric].append(np.max(values) - np.min(values))

# Attach the collected lists as 'Variance <metric>' / 'Spread <metric>' columns.
for measure, metrics in measures.items():
    for metric, values in metrics.items():
        merged[f'{measure} {metric}'] = values
merged[:1]

Unnamed: 0,question_id,question,response,response_id,prompt,type,Annotator 2 - Information Quality,Annotator 2 - Empathy,Annotator 2 - Actionability,Annotator 3 - Information Quality,...,Specialist - Information Quality,Average Information Quality,Average Empathy,Average Actionability,Variance Information Quality,Variance Empathy,Variance Actionability,Spread Information Quality,Spread Empathy,Spread Actionability
0,Endo_d8bcw7_post,I had a lap on 5/10/19 and my gyno said my bla...,"as an ai, i don't have the ability to diagnose...",endoR0,no_prompt,endo,4,2,2,3,...,4,3.5,2.0,2.0,0.222222,0.0,0.0,1,0,0


In [7]:
def calculate_spread(df):
    """Tabulate, per metric, the share of rows having each spread value.

    For every metric the column 'Spread <metric>' is compared against a fixed
    list of spread values and the fraction of rows matching each value is
    recorded, rounded to two decimals.

    Args:
        df: DataFrame containing 'Spread Information Quality',
            'Spread Empathy' and 'Spread Actionability' columns.

    Returns:
        DataFrame with columns 'Metric', 'Spread', 'Perc', one row per
        (metric, spread value). 'Perc' is NaN when ``df`` has no rows, since
        the share is undefined there (previously a ZeroDivisionError).
    """
    # Spread values tabulated for each metric. Actionability is only checked
    # for 0-2 -- presumably its spread cannot exceed 2; TODO confirm.
    spread_levels = {
        "Information Quality": range(5),
        "Empathy": range(5),
        "Actionability": range(3),
    }

    total = len(df)
    metric_summary = []
    for metric, levels in spread_levels.items():
        spread_column = df['Spread ' + metric]
        for spread in levels:
            count = int((spread_column == spread).sum())
            perc = round(count / total, 2) if total else float('nan')
            metric_summary.append({'Metric': metric, 'Spread': spread, 'Perc': perc})

    return pd.DataFrame(metric_summary, columns=['Metric', 'Spread', 'Perc'])

In [8]:
def calculate_variance(df, threshold=0.6):
    """Print the share of low-variance rows for each metric.

    A row counts as low-variance for a metric when its 'Variance <metric>'
    value is strictly below ``threshold``. When the first row of ``df`` has
    type 'endo', the low-variance rows are additionally broken down by the
    value of the 'prompt' column (no_prompt / doc_prompt / pat_prompt).

    Args:
        df: DataFrame with 'Variance <metric>' columns for the three metrics,
            plus 'type' and (for endo frames) 'prompt'.
        threshold: upper bound (exclusive) for a variance to count as low.
            Defaults to 0.6, the value previously hard-coded.

    Returns:
        None. Results are printed. Shares whose denominator is zero (empty
        frame, or no low-variance rows) are printed as nan instead of raising.
    """
    def _share(part, whole):
        # Rounded proportion; undefined (nan) when there is nothing to divide by.
        return round(part / whole, 2) if whole else float('nan')

    prompt_labels = (
        ('No prompt:', 'no_prompt'),
        ('Doc prompt:', 'doc_prompt'),
        ('Pat prompt:', 'pat_prompt'),
    )
    # The frame's kind is decided by its first row only; an empty frame has none.
    is_endo = len(df) > 0 and df.iloc[0]['type'] == 'endo'

    for metric in ["Information Quality", "Empathy", "Actionability"]:
        print(metric)
        df_low_variance = df[df['Variance ' + metric] < threshold]
        # NOTE(review): the label says "general responses" whichever subset is
        # passed in; it is kept verbatim so printed output does not change.
        print(f'Perc of general responses with low variance in terms of {metric}:',
              _share(len(df_low_variance), len(df)))
        if is_endo:
            for label, prompt in prompt_labels:
                matching = int((df_low_variance['prompt'] == prompt).sum())
                print(label, _share(matching, len(df_low_variance)))
        print('\n')

# All answers

In [9]:
# Three-rater agreement on Information Quality over all merged responses.
calculate_agreement_three(merged)

Information Quality
Krippendorff's alpha 0.1
Fleiss' Kappa -0.0


In [10]:
# Pairwise agreement for every metric over all merged responses.
calculate_agreement_two(merged)

Information Quality
Annotator 2 Annotator 3
Krippendorff's alpha 0.08
Cohen's Kappa 0.07
Annotator 2 Specialist
Krippendorff's alpha 0.12
Cohen's Kappa 0.07
Annotator 3 Specialist
Krippendorff's alpha 0.08
Cohen's Kappa 0.05


Empathy
Annotator 2 Annotator 3
Krippendorff's alpha 0.22
Cohen's Kappa 0.18


Actionability
Annotator 2 Annotator 3
Krippendorff's alpha 0.25
Cohen's Kappa 0.2




In [11]:
# Share of responses at each spread value (max - min of the ratings), all responses.
calculate_spread(merged)

Unnamed: 0,Metric,Spread,Perc
0,Information Quality,0,0.13
1,Information Quality,1,0.52
2,Information Quality,2,0.3
3,Information Quality,3,0.05
4,Information Quality,4,0.0
5,Empathy,0,0.27
6,Empathy,1,0.6
7,Empathy,2,0.12
8,Empathy,3,0.01
9,Empathy,4,0.0


# Endometriosis Answers

In [12]:
# Independent copies of the two response subsets, selected on the 'type' column.
response_type = merged['type']
endo = merged.loc[response_type.eq('endo')].copy()
general = merged.loc[response_type.eq('general')].copy()

In [13]:
# Three-rater agreement on Information Quality, endo subset only.
calculate_agreement_three(endo)

Information Quality
Krippendorff's alpha 0.09
Fleiss' Kappa 0.01


In [14]:
# Pairwise agreement for every metric, endo subset only.
calculate_agreement_two(endo)

Information Quality
Annotator 2 Annotator 3
Krippendorff's alpha 0.04
Cohen's Kappa 0.05
Annotator 2 Specialist
Krippendorff's alpha 0.13
Cohen's Kappa 0.09
Annotator 3 Specialist
Krippendorff's alpha 0.08
Cohen's Kappa 0.04


Empathy
Annotator 2 Annotator 3
Krippendorff's alpha 0.29
Cohen's Kappa 0.21


Actionability
Annotator 2 Annotator 3
Krippendorff's alpha 0.25
Cohen's Kappa 0.19




In [15]:
# Share of low-variance endo responses per metric, plus their breakdown by prompt.
calculate_variance(endo)

Information Quality
Perc of general responses with low variance in terms of Information Quality: 0.67
No prompt: 0.34
Doc prompt: 0.35
Pat prompt: 0.31


Empathy
Perc of general responses with low variance in terms of Empathy: 0.89
No prompt: 0.34
Doc prompt: 0.34
Pat prompt: 0.32


Actionability
Perc of general responses with low variance in terms of Actionability: 1.0
No prompt: 0.33
Doc prompt: 0.33
Pat prompt: 0.33




In [16]:
# Share of endo responses at each spread value (max - min of the ratings).
calculate_spread(endo)

Unnamed: 0,Metric,Spread,Perc
0,Information Quality,0,0.14
1,Information Quality,1,0.53
2,Information Quality,2,0.29
3,Information Quality,3,0.04
4,Information Quality,4,0.0
5,Empathy,0,0.28
6,Empathy,1,0.6
7,Empathy,2,0.11
8,Empathy,3,0.01
9,Empathy,4,0.0


# General Answers

In [17]:
# Three-rater agreement on Information Quality, general subset only.
calculate_agreement_three(general)

Information Quality
Krippendorff's alpha 0.11
Fleiss' Kappa -0.05


In [18]:
# Pairwise agreement for every metric, general subset only.
calculate_agreement_two(general)

Information Quality
Annotator 2 Annotator 3
Krippendorff's alpha 0.19
Cohen's Kappa 0.13
Annotator 2 Specialist
Krippendorff's alpha 0.01
Cohen's Kappa 0.0
Annotator 3 Specialist
Krippendorff's alpha 0.06
Cohen's Kappa 0.1


Empathy
Annotator 2 Annotator 3
Krippendorff's alpha -0.16
Cohen's Kappa 0.04


Actionability
Annotator 2 Annotator 3
Krippendorff's alpha 0.25
Cohen's Kappa 0.22




In [19]:
# Share of general responses at each spread value (max - min of the ratings).
calculate_spread(general)

Unnamed: 0,Metric,Spread,Perc
0,Information Quality,0,0.11
1,Information Quality,1,0.48
2,Information Quality,2,0.33
3,Information Quality,3,0.07
4,Information Quality,4,0.0
5,Empathy,0,0.23
6,Empathy,1,0.59
7,Empathy,2,0.18
8,Empathy,3,0.01
9,Empathy,4,0.0


In [20]:
# Share of low-variance general responses per metric (the per-prompt breakdown
# is printed only for frames whose first row has type 'endo').
calculate_variance(general)

Information Quality
Perc of general responses with low variance in terms of Information Quality: 0.6


Empathy
Perc of general responses with low variance in terms of Empathy: 0.81


Actionability
Perc of general responses with low variance in terms of Actionability: 0.99




# Sanity check

In [None]:
# Endo responses whose Information Quality ratings span exactly one point
# (max - min == 1), ordered from the lowest to the highest average rating.
(
    endo[endo['Spread Information Quality'] == 1]
    .sort_values(['Average Information Quality'], ascending=True)
    [['question', 'response_id', 'response', 'Average Information Quality']]
)  #.to_csv(os.path.join(out_dir, 'agreed_high_info_quality.csv'), index=False)