In [1]:
import pandas as pd
import numpy as np
import glob

import os

from sklearn.metrics import cohen_kappa_score

from scipy.stats import spearmanr, kendalltau

import re

import matplotlib.pyplot as plt

In [2]:
# Notebook-wide pandas display settings: never truncate cell text, show up to
# 200 rows, and render floats with thousands separators and six decimals.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 200
pd.set_option('display.float_format', '{:,.6f}'.format)

In [3]:
# Folder (relative to the working directory) that holds the annotation and
# GPT evaluation TSV files this notebook reads and writes.
ANNOTATIONS_FOLDER="anotações_humanas"

### Mapping 4-score to 3-score evaluations to match GPT-4 3-score evaluations

In [4]:
# 4-score -> 3-score: scores 0 and 1 both become 0, 2 becomes 1 and 3 becomes 2.
score_mapper = {0: 0, 1: 0, 2: 1, 3: 2}

### Original 4-score to 3-score evaluations mapping

This mapping changes the meaning of score 1 in the original 4-score scale: here it indicates that the passage **partially answers the question**, whereas in the original (TREC-DL 21) scale it means that the passage **does not answer the question**.

In [5]:
# 4-score -> 3-score (alternative): 0 stays 0, scores 1 and 2 both become 1, 3 becomes 2.
score_mapper_original = {0: 0, 1: 1, 2: 1, 3: 2}

### Mapping 4-score to 2-score evenly

In [6]:
# 4-score -> 2-score, split evenly: scores 0 and 1 become 0, scores 2 and 3 become 1.
score_mapper_for_2 = {0: 0, 1: 0, 2: 1, 3: 1}

### Alternative version for mapping 4-score to 2-score

In [7]:
# 4-score -> 2-score (alternative): only score 0 stays 0; scores 1, 2 and 3 all become 1.
score_mapper_for_2_alt = {0: 0, 1: 1, 2: 1, 3: 1}

### Mapping 3-score to 2-score

In [8]:
# 3-score -> 2-score: score 0 stays 0; scores 1 and 2 both become 1.
score_mapper_3_for_2 = {0: 0, 1: 1, 2: 1}

In [9]:
def check_agreement_per_questions(evaluation_a, evaluation_b, suffix=None, score='score'):
    """Compute Cohen's kappa between two evaluations, separately for each query.

    The two frames are inner-joined on ``doccano_id``. The joined rows are then
    grouped by the query text of ``evaluation_a`` (groups kept in order of first
    appearance) and one kappa is computed per group.

    Parameters
    ----------
    evaluation_a, evaluation_b : pandas.DataFrame
        Evaluations to compare. Both need the columns ``doccano_id``, ``query``,
        ``passage``, ``passage_id`` and the column named by ``score``.
    suffix : str, optional
        Text appended to ``'cohen_kappa'`` to name the result column. ``None``
        (the default) is treated as an empty suffix.
    score : str
        Name of the column holding the labels to compare.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The joined frame (``query_x``, ``passage_x``, ``passage_id_x`` and the
        two score columns), and a frame with one row per query holding
        ``query`` and the kappa column.
    """
    # A missing suffix must not leak into the column name as the text "None".
    kappa_column = 'cohen_kappa{}'.format('' if suffix is None else suffix)
    score_a, score_b = score + '_x', score + '_y'

    merged_df = evaluation_a.merge(evaluation_b, on='doccano_id')[['query_x', 'passage_x', 'passage_id_x', score_a, score_b]]

    correlations = [
        {'query': group_name, kappa_column: cohen_kappa_score(group_df[score_a], group_df[score_b])}
        for group_name, group_df in merged_df.groupby('query_x', sort=False)
    ]

    # Naming the columns explicitly keeps the result well-formed when no rows matched.
    # A NaN kappa is counted as perfect agreement (1.0); presumably it arises when both
    # evaluations give every passage of a query the same single label - TODO confirm.
    return merged_df, pd.DataFrame(correlations, columns=['query', kappa_column]).fillna(1.0)

In [10]:
def plot_correlation_data_consolidated(correlation_df, columns_to_plot, plot_title):
    """Draw one horizontal box plot per selected column, coloured in groups of three.

    Parameters
    ----------
    correlation_df : pandas.DataFrame
        Frame holding the values to plot.
    columns_to_plot : list of str
        Columns of ``correlation_df`` to draw, one box each; they are also used
        as the y-axis tick labels. Consecutive triples of columns share a colour.
    plot_title : str
        Title placed above the figure.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    boxes_per_group = 3
    colors = ['pink', 'lightblue', 'lightgreen']

    fig = plt.figure(figsize=[10, 10])

    fig.suptitle(plot_title, y=0.91)

    plt.grid()

    all_boxplots = plt.boxplot(correlation_df[columns_to_plot], patch_artist=True, vert=False, meanline=True, showmeans=True)

    all_boxes = all_boxplots['boxes']

    for i, which_box in enumerate(all_boxes):
        # Wrap around the palette so that more than nine columns do not raise an IndexError.
        which_box.set_facecolor(colors[(i // boxes_per_group) % len(colors)])

    plt.yticks(range(1, len(columns_to_plot) + 1), columns_to_plot)
    plt.xticks(np.arange(-0.3, 1.0, 0.1))

    # One legend entry per colour group, represented by the first box of each group.
    plt.legend(handles=all_boxes[::boxes_per_group], labels=["Humanos x Humanos", "Humanos x GPT3.5", "Humanos x GPT4"], bbox_to_anchor=(1.0, 1.0))

    plt.show()

In [11]:
def plot_correlation_data(correlation_df, columns_to_plot, plot_title):
    """Draw one horizontal box plot per row of ``correlation_df``.

    Each box summarises the values that a row holds in ``columns_to_plot``.
    Boxes are coloured in a repeating cycle of four, and every fourth position
    on the y axis is labelled with the ``query`` value of every fourth row.

    Parameters
    ----------
    correlation_df : pandas.DataFrame
        Frame with a ``query`` column and the columns named in ``columns_to_plot``.
    columns_to_plot : list of str
        Columns whose values feed each row's box.
    plot_title : str
        Title placed above the figure.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    rows_per_group = 4
    palette = ['lightyellow', 'lightgreen', 'lightblue', 'pink']

    figure = plt.figure(figsize=[15, 40])
    figure.suptitle(plot_title, y=0.91)
    plt.grid()

    # matplotlib draws one box per column of a 2-D array, so the selection is
    # transposed to turn every row of the frame into its own box.
    values_by_row = correlation_df[columns_to_plot].to_numpy().transpose()
    drawn = plt.boxplot(values_by_row, patch_artist=True, vert=False, meanline=True, showmeans=True)

    for position, patch in enumerate(drawn['boxes']):
        patch.set_facecolor(palette[position % rows_per_group])

    tick_positions = range(rows_per_group, correlation_df.shape[0] + 1, rows_per_group)
    tick_labels = correlation_df.iloc[::rows_per_group]['query']
    plt.yticks(tick_positions, tick_labels)
    plt.xticks(np.arange(-0.3, 1.0, 0.1))

    plt.show()

## Read human annotators evaluations and map the original 4-score values

In [12]:
# Load the evaluations of the three human annotators (tab-separated files, one per annotator).
a1_df = pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, "admin_240_annotations_with_questions.tsv"), sep='\t')
a2_df = pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, "Eduardo_240_annotations_with_questions.tsv"), sep='\t')
a3_df = pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, "Leodecio_240_annotations_with_questions.tsv"), sep='\t')

In [13]:
# Derive each human annotator's 3-point score from the original 4-point score.
for annotator_df in (a1_df, a2_df, a3_df):
    annotator_df['03_score'] = annotator_df['score'].map(score_mapper)

In [14]:
# Derive each human annotator's 2-point score (even split of the 4-point scale).
for annotator_df in (a1_df, a2_df, a3_df):
    annotator_df['02_score'] = annotator_df['score'].map(score_mapper_for_2)

In [15]:
# Derive each human annotator's alternative 2-point score (only the original score 0 maps to 0).
for annotator_df in (a1_df, a2_df, a3_df):
    annotator_df['02_score_alt'] = annotator_df['score'].map(score_mapper_for_2_alt)

## Read GPT passage evaluations

### GPT-4 4-score evaluation, using TREC-DL 21 scores

In [16]:
# Load the GPT-4 evaluations (tab-separated); the file name marks them as the "2nd round" over 240 samples.
gpt4_df = pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, "gpt_4_evaluations_240_samples_2nd_round_with_questions.tsv"), sep='\t')

In [17]:
# Derive every reduced-scale column of the GPT-4 4-score evaluation, in this column order.
for target_column, mapper in (
    ('03_score', score_mapper),
    ('03_score_2', score_mapper_original),
    ('02_score', score_mapper_for_2),
    ('02_score_alt', score_mapper_for_2_alt),
):
    gpt4_df[target_column] = gpt4_df['score'].map(mapper)

### GPT-4 4-score evaluation using Chain of Thoughts (CoT) prompt

In [18]:
# Load the GPT-4 evaluations produced with prompt v2.6, one tab-separated file per set of queries (1st and 2nd).
gpt4_1st_cot_2_6_df = pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, "gpt_4_evaluations_120_samples_1st_queries_set_prompt_v2.6.tsv"), sep='\t')
gpt4_2nd_cot_2_6_df = pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, "gpt_4_evaluations_120_samples_2nd_queries_set_prompt_v2.6.tsv"), sep='\t')

In [19]:
# Add the reduced-scale columns to both prompt v2.6 frames; within each frame the
# columns are created in the order 02_score, 02_score_alt, 03_score.
for cot_df in (gpt4_1st_cot_2_6_df, gpt4_2nd_cot_2_6_df):
    cot_df['02_score'] = cot_df['score'].map(score_mapper_for_2)
    cot_df['02_score_alt'] = cot_df['score'].map(score_mapper_for_2_alt)
    cot_df['03_score'] = cot_df['score'].map(score_mapper)

In [20]:
# Stack the 1st and 2nd query-set evaluations (prompt v2.6) into one frame with a fresh 0..n-1 index.
gpt4_cot_2_6_df = pd.concat(
    [gpt4_1st_cot_2_6_df, gpt4_2nd_cot_2_6_df],
    ignore_index=True,
)

In [21]:
# Persist the combined prompt v2.6 evaluations as a tab-separated file, without the index column.
gpt4_cot_2_6_df.to_csv(os.path.join(ANNOTATIONS_FOLDER, "gpt_4_evaluations_240_samples_prompt_v2.6.tsv"), sep='\t', index=False)

### GPT-4 3-score evaluation with prompt based on original 4-score evaluation prompt

This prompt is the original 4-score prompt with just the score 1 description removed.

In [22]:
# Load the GPT-4 evaluations produced with prompt v2.7, one tab-separated file per set of queries (1st and 2nd).
gpt4_1st_03_score_2_7_df = pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, "gpt_4_evaluations_120_samples_1st_queries_set_prompt_v2.7.tsv"), sep='\t')
gpt4_2nd_03_score_2_7_df = pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, "gpt_4_evaluations_120_samples_2nd_queries_set_prompt_v2.7.tsv"), sep='\t')

In [23]:
# These evaluations are already on the 3-point scale: both 2-point columns use the
# same 3-to-2 mapping, and 03_score is the raw score itself.
for three_score_df in (gpt4_1st_03_score_2_7_df, gpt4_2nd_03_score_2_7_df):
    three_score_df['02_score'] = three_score_df['score'].map(score_mapper_3_for_2)
    three_score_df['02_score_alt'] = three_score_df['score'].map(score_mapper_3_for_2)
    three_score_df['03_score'] = three_score_df['score']

In [24]:
# Stack the 1st and 2nd query-set evaluations (prompt v2.7) into one frame with a fresh 0..n-1 index.
gpt4_03_score_2_7_df = pd.concat(
    [gpt4_1st_03_score_2_7_df, gpt4_2nd_03_score_2_7_df],
    ignore_index=True,
)

In [25]:
# Persist the combined prompt v2.7 evaluations as a tab-separated file, without the index column.
gpt4_03_score_2_7_df.to_csv(os.path.join(ANNOTATIONS_FOLDER, "gpt_4_evaluations_240_samples_prompt_v2.7.tsv"), sep='\t', index=False)

### GPT-4 3-score evaluation with prompt with 3 scores translated from MS paper

The 3-score scale is translated from "Large language models can accurately predict searcher preferences", by Thomas et al.

In [26]:
# Load the GPT-4 evaluations produced with prompt v2.8; only a 2nd-query-set file is read for this prompt.
gpt4_2nd_03_score_2_8_df = pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, "gpt_4_evaluations_120_samples_2nd_queries_set_prompt_v2.8.tsv"), sep='\t')

In [27]:
# Already on the 3-point scale: both 2-point columns use the same 3-to-2 mapping,
# and 03_score is the raw score itself.
for binary_column in ('02_score', '02_score_alt'):
    gpt4_2nd_03_score_2_8_df[binary_column] = gpt4_2nd_03_score_2_8_df['score'].map(score_mapper_3_for_2)
gpt4_2nd_03_score_2_8_df['03_score'] = gpt4_2nd_03_score_2_8_df['score']

### GPT-4 3-score evaluation using simplified original 4-score prompt

In [28]:
# Load the GPT-4 evaluations produced with prompt v2.9, one tab-separated file per set of queries (1st and 2nd).
gpt4_1st_03_score_2_9_df = pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, "gpt_4_evaluations_120_samples_1st_queries_set_prompt_v2.9.tsv"), sep='\t')
gpt4_2nd_03_score_2_9_df = pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, "gpt_4_evaluations_120_samples_2nd_queries_set_prompt_v2.9.tsv"), sep='\t')

In [29]:
# These evaluations are already on the 3-point scale: both 2-point columns use the
# same 3-to-2 mapping, and 03_score is the raw score itself.
for three_score_df in (gpt4_1st_03_score_2_9_df, gpt4_2nd_03_score_2_9_df):
    three_score_df['02_score'] = three_score_df['score'].map(score_mapper_3_for_2)
    three_score_df['02_score_alt'] = three_score_df['score'].map(score_mapper_3_for_2)
    three_score_df['03_score'] = three_score_df['score']

In [30]:
# Stack the 1st and 2nd query-set evaluations (prompt v2.9) into one frame with a fresh 0..n-1 index.
gpt4_03_score_2_9_df = pd.concat(
    [gpt4_1st_03_score_2_9_df, gpt4_2nd_03_score_2_9_df],
    ignore_index=True,
)

In [31]:
# Persist the combined prompt v2.9 evaluations as a tab-separated file, without the index column.
gpt4_03_score_2_9_df.to_csv(os.path.join(ANNOTATIONS_FOLDER, "gpt_4_evaluations_240_samples_prompt_v2.9.tsv"), sep='\t', index=False)

### GPT-4 4-score evaluation using Chain of Thoughts (CoT) prompt adding completeness and adequacy criteria

In [32]:
# Load the GPT-4 evaluations produced with prompt v3.0; only a 1st-query-set file is read for this prompt.
gpt4_1st_cot_3_0_df = pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, "gpt_4_evaluations_120_samples_1st_queries_set_prompt_v3.0.tsv"), sep='\t')

In [33]:
# Derive the reduced-scale columns of the prompt v3.0 (4-score) evaluation, in this column order.
for target_column, mapper in (
    ('02_score', score_mapper_for_2),
    ('02_score_alt', score_mapper_for_2_alt),
    ('03_score', score_mapper),
):
    gpt4_1st_cot_3_0_df[target_column] = gpt4_1st_cot_3_0_df['score'].map(mapper)

### Just check that the passages order is exactly the same

In [34]:
# Sanity check: compares the doccano_id columns of the two frames element by element;
# the result is True only if every position matches.
# NOTE(review): only this one pair of frames is compared here, although the agreement
# functions in this notebook pass score columns of several frames to cohen_kappa_score
# directly - consider checking the other frames as well.
np.all(gpt4_cot_2_6_df['doccano_id'] == a3_df['doccano_id'])

True

## Functions to check the correlation according to the selected score

In [35]:
def correlations_240_passages(which_score, only_4_scores=False):
    """Tabulate Cohen's kappa for ``which_score`` over every row of the evaluation frames.

    Reads the module-level frames ``a1_df``, ``a2_df``, ``a3_df`` (human
    annotators), ``gpt4_df`` and ``gpt4_cot_2_6_df`` and, unless
    ``only_4_scores`` is set, ``gpt4_03_score_2_7_df`` and ``gpt4_03_score_2_9_df``.

    Parameters
    ----------
    which_score : str
        Name of the score column to compare; it must exist in every frame used.
    only_4_scores : bool
        When true, the columns for the 3-score GPT-4 runs (2.7 and 2.9) are left out.

    Returns
    -------
    pandas.DataFrame
        Rows ``a1``, ``a2``, ``a3`` (labelled in the ``comparisson`` column) with
        one column per annotator and per GPT-4 run, followed by a ``mean`` row of
        column means and a ``Difference from mean human annotators`` row (each
        column mean minus the average of the a1/a2/a3 column means). The two
        summary rows are appended without re-indexing, so both carry index label 0.
    """
    human_scores = [a1_df[which_score], a2_df[which_score], a3_df[which_score]]
    first, second, third = human_scores

    # Each annotator pair is scored once and the value mirrored across the diagonal.
    kappa_12 = cohen_kappa_score(first, second)
    kappa_13 = cohen_kappa_score(first, third)
    kappa_23 = cohen_kappa_score(second, third)

    table = {
        'comparisson': ['a1', 'a2', 'a3'],
        'a1': [np.nan, kappa_12, kappa_13],
        'a2': [kappa_12, np.nan, kappa_23],
        'a3': [kappa_13, kappa_23, np.nan],
    }

    gpt_runs = [('GPT4 4-score', gpt4_df),
                ('GPT4 4-score cot 2.6', gpt4_cot_2_6_df)]
    if not only_4_scores:
        gpt_runs.append(('GPT4 3-score 2.7', gpt4_03_score_2_7_df))
        gpt_runs.append(('GPT4 3-score 2.9', gpt4_03_score_2_9_df))

    for label, run_df in gpt_runs:
        run_scores = run_df[which_score]
        table[label] = [cohen_kappa_score(human, run_scores) for human in human_scores]

    general_agreement_df = pd.DataFrame(table)

    # Summary row 1: mean of every numeric column.
    column_means = general_agreement_df.iloc[:, 1:].mean()
    mean_row = pd.DataFrame(data=[['mean'] + column_means.to_list()], columns=general_agreement_df.columns)
    general_agreement_df = pd.concat([general_agreement_df, mean_row])

    # Summary row 2: distance of every column mean from the human-vs-human average
    # (columns 1 to 3 of the mean row are a1, a2 and a3).
    human_mean = general_agreement_df.iloc[-1, 1:4].mean()
    offsets = general_agreement_df.iloc[-1, 1:] - human_mean
    difference_row = pd.DataFrame(data=[['Difference from mean human annotators'] + offsets.to_list()], columns=general_agreement_df.columns)

    return pd.concat([general_agreement_df, difference_row])

In [36]:
def correlations_1st_120_passages(which_score, only_4_scores=False):
    """Tabulate Cohen's kappa for ``which_score`` over rows 0-119 of the evaluation frames.

    Reads the module-level frames ``a1_df``, ``a2_df``, ``a3_df`` and ``gpt4_df``
    (each sliced to its first 120 rows) and, in full, ``gpt4_1st_cot_2_6_df`` and
    ``gpt4_1st_cot_3_0_df``; unless ``only_4_scores`` is set, also
    ``gpt4_1st_03_score_2_7_df`` and ``gpt4_1st_03_score_2_9_df``.

    Parameters
    ----------
    which_score : str
        Name of the score column to compare; it must exist in every frame used.
    only_4_scores : bool
        When true, the columns for the 3-score GPT-4 runs (2.7 and 2.9) are left out.

    Returns
    -------
    pandas.DataFrame
        Rows ``a1``, ``a2``, ``a3`` (labelled in the ``comparisson`` column) with
        one column per annotator and per GPT-4 run, followed by a ``mean`` row of
        column means and a ``Difference from mean human annotators`` row (each
        column mean minus the average of the a1/a2/a3 column means). The two
        summary rows are appended without re-indexing, so both carry index label 0.
    """
    first_half = slice(0, 120)
    human_scores = [annotator_df.iloc[first_half][which_score] for annotator_df in (a1_df, a2_df, a3_df)]
    first, second, third = human_scores

    # Each annotator pair is scored once and the value mirrored across the diagonal.
    kappa_12 = cohen_kappa_score(first, second)
    kappa_13 = cohen_kappa_score(first, third)
    kappa_23 = cohen_kappa_score(second, third)

    table = {
        'comparisson': ['a1', 'a2', 'a3'],
        'a1': [np.nan, kappa_12, kappa_13],
        'a2': [kappa_12, np.nan, kappa_23],
        'a3': [kappa_13, kappa_23, np.nan],
    }

    # The list order fixes the column order of the result.
    gpt_runs = [('GPT4 1st 4-score', gpt4_df.iloc[first_half])]
    if not only_4_scores:
        gpt_runs.append(('GPT4 1st 3-score 2.7', gpt4_1st_03_score_2_7_df))
        gpt_runs.append(('GPT4 1st 3-score 2.9', gpt4_1st_03_score_2_9_df))
    gpt_runs.append(('GPT4 1st 4-score cot 2.6', gpt4_1st_cot_2_6_df))
    gpt_runs.append(('GPT4 1st 4-score cot 3.0', gpt4_1st_cot_3_0_df))

    for label, run_df in gpt_runs:
        run_scores = run_df[which_score]
        table[label] = [cohen_kappa_score(human, run_scores) for human in human_scores]

    general_agreement_df = pd.DataFrame(table)

    # Summary row 1: mean of every numeric column.
    column_means = general_agreement_df.iloc[:, 1:].mean()
    mean_row = pd.DataFrame(data=[['mean'] + column_means.to_list()], columns=general_agreement_df.columns)
    general_agreement_df = pd.concat([general_agreement_df, mean_row])

    # Summary row 2: distance of every column mean from the human-vs-human average
    # (columns 1 to 3 of the mean row are a1, a2 and a3).
    human_mean = general_agreement_df.iloc[-1, 1:4].mean()
    offsets = general_agreement_df.iloc[-1, 1:] - human_mean
    difference_row = pd.DataFrame(data=[['Difference from mean human annotators'] + offsets.to_list()], columns=general_agreement_df.columns)

    return pd.concat([general_agreement_df, difference_row])

In [37]:
def correlations_2nd_120_passages(which_score, only_4_scores=False):
    """Tabulate Cohen's kappa for ``which_score`` over rows 120-239 of the evaluation frames.

    Reads the module-level frames ``a1_df``, ``a2_df``, ``a3_df`` and ``gpt4_df``
    (each sliced to rows 120-239) and, in full, ``gpt4_2nd_cot_2_6_df``; unless
    ``only_4_scores`` is set, also ``gpt4_2nd_03_score_2_7_df``,
    ``gpt4_2nd_03_score_2_8_df`` and ``gpt4_2nd_03_score_2_9_df``.

    Parameters
    ----------
    which_score : str
        Name of the score column to compare; it must exist in every frame used.
    only_4_scores : bool
        When true, the columns for the 3-score GPT-4 runs (2.7, 2.8 and 2.9) are left out.

    Returns
    -------
    pandas.DataFrame
        Rows ``a1``, ``a2``, ``a3`` (labelled in the ``comparisson`` column) with
        one column per annotator and per GPT-4 run, followed by a ``mean`` row of
        column means and a ``Difference from mean human annotators`` row (each
        column mean minus the average of the a1/a2/a3 column means). The two
        summary rows are appended without re-indexing, so both carry index label 0.
    """
    second_half = slice(120, 240)
    human_scores = [annotator_df.iloc[second_half][which_score] for annotator_df in (a1_df, a2_df, a3_df)]
    first, second, third = human_scores

    # Each annotator pair is scored once and the value mirrored across the diagonal.
    kappa_12 = cohen_kappa_score(first, second)
    kappa_13 = cohen_kappa_score(first, third)
    kappa_23 = cohen_kappa_score(second, third)

    table = {
        'comparisson': ['a1', 'a2', 'a3'],
        'a1': [np.nan, kappa_12, kappa_13],
        'a2': [kappa_12, np.nan, kappa_23],
        'a3': [kappa_13, kappa_23, np.nan],
    }

    # The list order fixes the column order of the result.
    gpt_runs = [('GPT4 2nd 4-score', gpt4_df.iloc[second_half])]
    if not only_4_scores:
        gpt_runs.append(('GPT4 2nd 3-score 2.7', gpt4_2nd_03_score_2_7_df))
        gpt_runs.append(('GPT4 2nd 3-score 2.8', gpt4_2nd_03_score_2_8_df))
        gpt_runs.append(('GPT4 2nd 3-score 2.9', gpt4_2nd_03_score_2_9_df))
    gpt_runs.append(('GPT4 2nd 4-score cot 2.6', gpt4_2nd_cot_2_6_df))

    for label, run_df in gpt_runs:
        run_scores = run_df[which_score]
        table[label] = [cohen_kappa_score(human, run_scores) for human in human_scores]

    general_agreement_df = pd.DataFrame(table)

    # Summary row 1: mean of every numeric column.
    column_means = general_agreement_df.iloc[:, 1:].mean()
    mean_row = pd.DataFrame(data=[['mean'] + column_means.to_list()], columns=general_agreement_df.columns)
    general_agreement_df = pd.concat([general_agreement_df, mean_row])

    # Summary row 2: distance of every column mean from the human-vs-human average
    # (columns 1 to 3 of the mean row are a1, a2 and a3).
    human_mean = general_agreement_df.iloc[-1, 1:4].mean()
    offsets = general_agreement_df.iloc[-1, 1:] - human_mean
    difference_row = pd.DataFrame(data=[['Difference from mean human annotators'] + offsets.to_list()], columns=general_agreement_df.columns)

    return pd.concat([general_agreement_df, difference_row])

## Correlations for the 4-score evaluations

In [38]:
# Agreement tables on the raw 4-point score, restricted to the 4-score GPT-4 runs:
# all rows first, then rows 0-119, then rows 120-239.
for agreement_table in (correlations_240_passages, correlations_1st_120_passages, correlations_2nd_120_passages):
    display(agreement_table('score', only_4_scores=True))

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score,GPT4 4-score cot 2.6
0,a1,,0.436881,0.429402,0.318134,0.279329
1,a2,0.436881,,0.410455,0.274609,0.198145
2,a3,0.429402,0.410455,,0.355155,0.256689
0,mean,0.433142,0.423668,0.419929,0.315966,0.244721
0,Difference from mean human annotators,0.007562,-0.001911,-0.005651,-0.109613,-0.180858


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score,GPT4 1st 4-score cot 2.6,GPT4 1st 4-score cot 3.0
0,a1,,0.489892,0.575498,0.396985,0.376026,0.388201
1,a2,0.489892,,0.500324,0.419496,0.222163,0.244028
2,a3,0.575498,0.500324,,0.44949,0.350928,0.30108
0,mean,0.532695,0.495108,0.537911,0.42199,0.316372,0.311103
0,Difference from mean human annotators,0.01079,-0.026797,0.016006,-0.099914,-0.205532,-0.210802


Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score,GPT4 2nd 4-score cot 2.6
0,a1,,0.369369,0.286629,0.245213,0.190077
1,a2,0.369369,,0.322651,0.135825,0.184723
2,a3,0.286629,0.322651,,0.26009,0.161074
0,mean,0.327999,0.34601,0.30464,0.213709,0.178624
0,Difference from mean human annotators,0.001783,0.019794,-0.021576,-0.112507,-0.147592


## Correlations for the 3-score evaluations

In [39]:
# Agreement tables on the 3-point score: all rows first, then rows 0-119, then rows 120-239.
for agreement_table in (correlations_240_passages, correlations_1st_120_passages, correlations_2nd_120_passages):
    display(agreement_table('03_score'))

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score,GPT4 4-score cot 2.6,GPT4 3-score 2.7,GPT4 3-score 2.9
0,a1,,0.462148,0.435904,0.364067,0.365758,0.3037,0.327731
1,a2,0.462148,,0.425263,0.353236,0.227882,0.297525,0.325495
2,a3,0.435904,0.425263,,0.433279,0.329164,0.405653,0.373688
0,mean,0.449026,0.443706,0.430584,0.383527,0.307601,0.335626,0.342305
0,Difference from mean human annotators,0.007921,0.0026,-0.010521,-0.057578,-0.133504,-0.105479,-0.0988


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score,GPT4 1st 3-score 2.7,GPT4 1st 3-score 2.9,GPT4 1st 4-score cot 2.6,GPT4 1st 4-score cot 3.0
0,a1,,0.486323,0.566787,0.423816,0.388535,0.402289,0.448939,0.435696
1,a2,0.486323,,0.470389,0.46979,0.434716,0.447627,0.259559,0.294047
2,a3,0.566787,0.470389,,0.504886,0.506974,0.442885,0.427728,0.407101
0,mean,0.526555,0.478356,0.518588,0.466164,0.443408,0.430934,0.378742,0.378948
0,Difference from mean human annotators,0.018722,-0.029477,0.010755,-0.041669,-0.064425,-0.076899,-0.129091,-0.128885


Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score,GPT4 2nd 3-score 2.7,GPT4 2nd 3-score 2.8,GPT4 2nd 3-score 2.9,GPT4 2nd 4-score cot 2.6
0,a1,,0.408512,0.301099,0.309821,0.230688,0.135827,0.259653,0.292677
1,a2,0.408512,,0.378775,0.240262,0.175341,0.124684,0.210932,0.214206
2,a3,0.301099,0.378775,,0.362584,0.305838,0.245915,0.305392,0.233434
0,mean,0.354805,0.393643,0.339937,0.304222,0.237289,0.168809,0.258659,0.246773
0,Difference from mean human annotators,-0.00799,0.030848,-0.022858,-0.058573,-0.125506,-0.193986,-0.104136,-0.116022


## Correlations for the 2-score evaluations

In [40]:
# Agreement tables on the 2-point score: all rows first, then rows 0-119, then rows 120-239.
for agreement_table in (correlations_240_passages, correlations_1st_120_passages, correlations_2nd_120_passages):
    display(agreement_table('02_score'))

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score,GPT4 4-score cot 2.6,GPT4 3-score 2.7,GPT4 3-score 2.9
0,a1,,0.533333,0.558333,0.441667,0.5,0.4,0.408333
1,a2,0.533333,,0.551797,0.484144,0.389271,0.420948,0.447044
2,a3,0.558333,0.551797,,0.525926,0.496907,0.496802,0.488818
0,mean,0.545833,0.542565,0.555065,0.483912,0.462059,0.43925,0.448065
0,Difference from mean human annotators,-0.001988,-0.005256,0.007244,-0.063909,-0.085762,-0.108571,-0.099756


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score,GPT4 1st 3-score 2.7,GPT4 1st 3-score 2.9,GPT4 1st 4-score cot 2.6,GPT4 1st 4-score cot 3.0
0,a1,,0.505882,0.677054,0.426966,0.405099,0.386364,0.483871,0.469169
1,a2,0.505882,,0.566474,0.625,0.566474,0.581395,0.375,0.393782
2,a3,0.677054,0.566474,,0.546473,0.526894,0.509029,0.495114,0.479816
0,mean,0.591468,0.536178,0.621764,0.532813,0.499489,0.492263,0.451328,0.447589
0,Difference from mean human annotators,0.008331,-0.046959,0.038627,-0.050324,-0.083648,-0.090874,-0.131808,-0.135548


Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score,GPT4 2nd 3-score 2.7,GPT4 2nd 3-score 2.8,GPT4 2nd 3-score 2.9,GPT4 2nd 4-score cot 2.6
0,a1,,0.525424,0.445652,0.466307,0.408,0.284974,0.436997,0.535912
1,a2,0.525424,,0.539474,0.361702,0.300813,0.201278,0.331158,0.435216
2,a3,0.445652,0.539474,,0.504556,0.465517,0.363208,0.467963,0.497768
0,mean,0.485538,0.532449,0.492563,0.444188,0.391443,0.283153,0.41204,0.489632
0,Difference from mean human annotators,-0.017979,0.028932,-0.010954,-0.059328,-0.112073,-0.220363,-0.091477,-0.013885


## Correlations for the alternate 2-score evaluations

In [41]:
# Agreement tables on the alternative 2-point score: all rows first, then rows 0-119, then rows 120-239.
for agreement_table in (correlations_240_passages, correlations_1st_120_passages, correlations_2nd_120_passages):
    display(agreement_table('02_score_alt'))

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score,GPT4 4-score cot 2.6,GPT4 3-score 2.7,GPT4 3-score 2.9
0,a1,,0.660554,0.719626,0.410637,0.417773,0.33044,0.343201
1,a2,0.660554,,0.720741,0.464943,0.460943,0.421045,0.41459
2,a3,0.719626,0.720741,,0.48797,0.470603,0.396565,0.390441
0,mean,0.69009,0.690647,0.720183,0.454517,0.449773,0.382683,0.382744
0,Difference from mean human annotators,-0.010217,-0.00966,0.019876,-0.24579,-0.250534,-0.317624,-0.317563


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score,GPT4 1st 3-score 2.7,GPT4 1st 3-score 2.9,GPT4 1st 4-score cot 2.6,GPT4 1st 4-score cot 3.0
0,a1,,0.739583,0.848485,0.545455,0.336918,0.347826,0.444444,0.404762
1,a2,0.739583,,0.848485,0.606061,0.37276,0.347826,0.484127,0.404762
2,a3,0.848485,0.848485,,0.565217,0.375372,0.350962,0.528796,0.410995
0,mean,0.794034,0.794034,0.848485,0.572244,0.361683,0.348871,0.485789,0.40684
0,Difference from mean human annotators,-0.01815,-0.01815,0.036301,-0.23994,-0.450501,-0.463313,-0.326395,-0.405345


Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score,GPT4 2nd 3-score 2.7,GPT4 2nd 3-score 2.8,GPT4 2nd 3-score 2.9,GPT4 2nd 4-score cot 2.6
0,a1,,0.592857,0.599057,0.288321,0.326923,0.393064,0.340452,0.392458
1,a2,0.592857,,0.610811,0.352,0.47907,0.450847,0.487023,0.444628
2,a3,0.599057,0.610811,,0.413408,0.42029,0.40919,0.432356,0.408659
0,mean,0.595957,0.601834,0.604934,0.351243,0.408761,0.4177,0.419944,0.415248
0,Difference from mean human annotators,-0.004951,0.000926,0.004026,-0.249665,-0.192147,-0.183208,-0.180965,-0.18566


# Check the agreement per question

In [42]:
def _combine_pairwise_kappas(pair_results):
    """Merge per-question kappa tables of several annotator pairs into one frame.

    Parameters
    ----------
    pair_results : list of tuple
        Return values of ``check_agreement_per_questions``; only element ``[1]``
        (the frame with a ``query`` column followed by one kappa column) is used.

    Returns
    -------
    pandas.DataFrame
        Columns: ``query``, one kappa column per pair (names unchanged), then
        ``cohen_kappa_mean`` and ``cohen_kappa_std`` computed row-wise over the
        kappa columns only.
    """
    per_pair = [result[1].set_index('query') for result in pair_results]

    # Align every pair on the query text instead of on row position: the
    # per-pair frames are built with groupby(sort=False), so their row order
    # follows whichever annotator frame was on the left of the merge and may
    # differ between pairs. Queries absent from the first pair are dropped.
    reference_index = per_pair[0].index
    kappas = pd.concat(
        [per_pair[0]] + [frame.iloc[:, -1].reindex(reference_index) for frame in per_pair[1:]],
        axis=1,
    )

    combined = kappas.copy()
    # Both statistics are taken from `kappas` (the raw kappa columns) so the
    # standard deviation is not contaminated by the freshly added mean column.
    combined['cohen_kappa_mean'] = kappas.mean(axis=1)
    combined['cohen_kappa_std'] = kappas.std(axis=1)

    return combined.reset_index()


def correlation_per_question(which_score):
    """Compare per-question Cohen's kappa among humans and between humans and GPT-4 runs.

    For the score column ``which_score`` this computes, question by question,
    the kappa of every human pair (a1/a2, a1/a3, a2/a3) and of every human
    against each GPT-4 run. The 3-score GPT-4 runs (2.7 and 2.9) are only
    included when ``which_score`` is not ``'score'``.

    Relies on the notebook-level frames ``a1_df``, ``a2_df``, ``a3_df``,
    ``gpt4_df``, ``gpt4_cot_2_6_df``, ``gpt4_03_score_2_7_df`` and
    ``gpt4_03_score_2_9_df`` and on ``check_agreement_per_questions``.

    Parameters
    ----------
    which_score : str
        Name of the score column to compare (e.g. ``'score'``, ``'03_score'``).

    Returns
    -------
    tuple
        ``(correlations_df, all_results)`` where ``correlations_df`` has one row
        per annotator group with its mean per-question kappa and the
        ``difference`` from the human mean, and ``all_results`` is the list of
        per-question frames in the same order as the rows of ``correlations_df``.
        ``correlations_df`` is also displayed as a side effect.
    """
    humans = (('a1', a1_df), ('a2', a2_df), ('a3', a3_df))

    # (row label, GPT-4 frame, tag used in the kappa column suffix)
    gpt4_runs = [
        ('GPT-4', gpt4_df, 'gpt4'),
        ('GPT-4 CoT 2.6', gpt4_cot_2_6_df, 'gpt4_cot_2.6'),
    ]
    if which_score != 'score':
        # The 3-score prompt runs are skipped for the original 4-level 'score'.
        gpt4_runs += [
            ('GPT-4 2.7', gpt4_03_score_2_7_df, 'gpt4_2.7'),
            ('GPT-4 2.9', gpt4_03_score_2_9_df, 'gpt4_2.9'),
        ]

    human_combined_res = _combine_pairwise_kappas([
        check_agreement_per_questions(a1_df, a2_df, suffix="_a1_a2", score=which_score),
        check_agreement_per_questions(a1_df, a3_df, suffix="_a1_a3", score=which_score),
        check_agreement_per_questions(a2_df, a3_df, suffix="_a2_a3", score=which_score),
    ])

    all_results = [human_combined_res]
    correlations = [{'annotator': 'human',
                     'cohen_kappa': human_combined_res['cohen_kappa_mean'].mean()}]

    for label, machine_df, tag in gpt4_runs:
        combined = _combine_pairwise_kappas([
            check_agreement_per_questions(human_df, machine_df,
                                          "_{}_{}".format(human_name, tag),
                                          score=which_score)
            for human_name, human_df in humans
        ])
        all_results.append(combined)
        correlations.append({'annotator': label,
                             'cohen_kappa': combined['cohen_kappa_mean'].mean()})

    correlations_df = pd.DataFrame(correlations)

    # Gap between each group's mean kappa and the human-vs-human mean kappa.
    correlations_df['difference'] = correlations_df['cohen_kappa'] - human_combined_res['cohen_kappa_mean'].mean()

    display(correlations_df)

    return correlations_df, all_results

### Check for 4-score evaluations

In [43]:
# The summary table is displayed inside the function; the returned
# (correlations_df, all_results) tuple is not needed here, so bind it to `_`.
_ = correlation_per_question('score')

Unnamed: 0,annotator,cohen_kappa,difference
0,human,0.322864,0.0
1,GPT-4,0.219317,-0.103547
2,GPT-4 CoT 2.6,0.161981,-0.160883


### Check for 3-score evaluations

In [44]:
# NOTE: despite the `_df` suffix, this holds the (correlations_df, all_results)
# tuple returned by correlation_per_question, not a single DataFrame.
score_3_df = correlation_per_question('03_score')

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)


Unnamed: 0,annotator,cohen_kappa,difference
0,human,0.307239,0.0
1,GPT-4,0.235552,-0.071687
2,GPT-4 CoT 2.6,0.18716,-0.120079
3,GPT-4 2.7,0.206327,-0.100913
4,GPT-4 2.9,0.209162,-0.098078


In [45]:
# NOTE: despite the `_df` suffix, this holds the (correlations_df, all_results)
# tuple returned by correlation_per_question, not a single DataFrame.
score_2_df = correlation_per_question('02_score')

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expecte

Unnamed: 0,annotator,cohen_kappa,difference
0,human,0.452085,0.0
1,GPT-4,0.332523,-0.119562
2,GPT-4 CoT 2.6,0.310131,-0.141954
3,GPT-4 2.7,0.330763,-0.121322
4,GPT-4 2.9,0.332203,-0.119882


In [46]:
# NOTE: despite the `_df` suffix, this holds the (correlations_df, all_results)
# tuple returned by correlation_per_question, not a single DataFrame.
score_2a_df = correlation_per_question('02_score_alt')

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expecte

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expecte

Unnamed: 0,annotator,cohen_kappa,difference
0,human,0.706175,0.0
1,GPT-4,0.506944,-0.199231
2,GPT-4 CoT 2.6,0.444078,-0.262097
3,GPT-4 2.7,0.374782,-0.331392
4,GPT-4 2.9,0.361484,-0.344691
