In [1]:
import pandas as pd
import numpy as np
import glob

import os

from sklearn.metrics import cohen_kappa_score, confusion_matrix

from scipy.stats import spearmanr, kendalltau, pearsonr

import re

import matplotlib.pyplot as plt

In [2]:
# Notebook-wide pandas display settings: no column-width truncation, up to
# 200 rows shown, floats rendered with six decimals and thousands separators.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 200
pd.set_option('display.float_format', '{:,.6f}'.format)

In [3]:
# Folder that holds every TSV file this notebook reads and writes
# (human annotations and GPT-4 evaluations).
ANNOTATIONS_FOLDER="anotações_humanas"

### Mapping 4-score to 3-score evaluations to match GPT-4 3-score evaluations

In [4]:
# 4-point -> 3-point scale: scores 0 and 1 collapse to 0, 2 becomes 1, 3 becomes 2.
score_mapper = {0: 0, 1: 0, 2: 1, 3: 2}

### Original 4-score to 3-score evaluations mapping

This mapping changes the meaning of score 1 on the original 4-point scale: here it indicates that the passage **partially answers the question**, whereas in the original (TREC-DL 21) definition score 1 means that the passage **does not answer the question**.

In [5]:
# 4-point -> 3-point scale: 0 stays 0, scores 1 and 2 collapse to 1, 3 becomes 2.
score_mapper_original = {0: 0, 1: 1, 2: 1, 3: 2}

### Mapping 4-score to 2-score evenly

In [6]:
# 4-point -> binary scale, split evenly: scores 0 and 1 become 0, scores 2 and 3 become 1.
score_mapper_for_2 = {0: 0, 1: 0, 2: 1, 3: 1}

### Alternative version for mapping 4-score to 2-score

In [7]:
# 4-point -> binary scale, alternative split: only score 0 stays 0; scores 1, 2 and 3 become 1.
score_mapper_for_2_alt = {0: 0, 1: 1, 2: 1, 3: 1}

### Mapping 3-score to 2-score

In [8]:
# 3-point -> binary scale: 0 stays 0; scores 1 and 2 become 1.
score_mapper_3_for_2 = {0: 0, 1: 1, 2: 1}

In [9]:
def check_agreement_per_questions(evaluation_a, evaluation_b, suffix=None, score='score'):
    """Compute Cohen's kappa between two evaluations, separately for each query.

    Parameters
    ----------
    evaluation_a, evaluation_b : pandas.DataFrame
        Evaluations to compare. They are joined on 'doccano_id'; both must also
        provide 'query', 'passage', 'passage_id' and the `score` column.
    suffix : str, optional
        Text appended to the name of the kappa column ('cohen_kappa<suffix>').
        When omitted, the column is named plain 'cohen_kappa'.
    score : str, default 'score'
        Name of the score column compared between the two evaluations.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The merged frame (query, passage, passage id and both score columns) and
        a frame with one row per query holding 'query' and the kappa value.
    """
    # The None default used to be formatted literally, yielding 'cohen_kappaNone'.
    suffix = '' if suffix is None else suffix
    kappa_column = 'cohen_kappa{}'.format(suffix)

    merged_df = evaluation_a.merge(evaluation_b, on='doccano_id')[
        ['query_x', 'passage_x', 'passage_id_x', score + '_x', score + '_y']
    ]

    correlations = []

    # sort=False keeps the queries in their order of first appearance.
    for group_name, group_df in merged_df.groupby('query_x', sort=False):
        correlations.append({'query': group_name,
                             kappa_column: cohen_kappa_score(group_df[score + '_x'], group_df[score + '_y'])})

    # Missing kappa values are replaced by 1.0 (presumably the case where both
    # evaluations give one identical label to every passage of a query — TODO confirm).
    return merged_df, pd.DataFrame(correlations).fillna(1.0)

In [10]:
def plot_correlation_data_consolidated(correlation_df, columns_to_plot, plot_title):
    """Draw horizontal box plots, one per selected column, colored in groups of three.

    Parameters
    ----------
    correlation_df : pandas.DataFrame
        Frame holding the values to plot.
    columns_to_plot : list of str
        Columns of `correlation_df` to draw, one box each; they also label the y axis.
    plot_title : str
        Title placed at the top of the figure.
    """
    fig = plt.figure(figsize=[10, 10])

    fig.suptitle(plot_title, y=0.91)

    plt.grid()

    all_boxplots = plt.boxplot(correlation_df[columns_to_plot], patch_artist=True, vert=False, meanline=True, showmeans=True)

    colors = ['pink', 'lightblue', 'lightgreen']

    all_boxes = all_boxplots['boxes']

    # Every run of three consecutive boxes shares one color; the palette wraps
    # around so that more than nine columns no longer raise an IndexError.
    for i, which_box in enumerate(all_boxes):
        which_box.set_facecolor(colors[(i // 3) % len(colors)])

    plt.yticks(range(1, len(columns_to_plot) + 1), columns_to_plot)
    plt.xticks(np.arange(-0.3, 1.0, 0.1))

    # One legend entry per color group, represented by the first box of each group.
    plt.legend(handles=all_boxes[::3], labels=["Humanos x Humanos", "Humanos x GPT3.5", "Humanos x GPT4"], bbox_to_anchor=(1.0, 1.0))

    plt.show()

In [11]:
def plot_correlation_data(correlation_df, columns_to_plot, plot_title):
    """Draw one horizontal box per row of `correlation_df`, colored in a cycle of four.

    Each box summarizes the values of `columns_to_plot` for a single row. A y tick
    is placed on every fourth box and labeled with the 'query' value of rows
    0, 4, 8, ... of `correlation_df`.
    """
    figure = plt.figure(figsize=[15, 40])
    figure.suptitle(plot_title, y=0.91)
    plt.grid()

    # Transposing turns every row of the frame into its own data series (one box per row).
    per_row_values = correlation_df[columns_to_plot].to_numpy().T
    drawn = plt.boxplot(per_row_values, patch_artist=True, vert=False, meanline=True, showmeans=True)

    palette = ['lightyellow', 'lightgreen', 'lightblue', 'pink']
    for position, box in enumerate(drawn['boxes']):
        box.set_facecolor(palette[position % 4])

    tick_positions = range(4, len(correlation_df) + 1, 4)
    tick_labels = correlation_df['query'].iloc[::4]
    plt.yticks(tick_positions, tick_labels)
    plt.xticks(np.arange(-0.3, 1.0, 0.1))

    plt.show()

## Read human annotators evaluations and map the original 4-score values

In [12]:
# Load the evaluations of the three human annotators (one tab-separated file each).
a1_df, a2_df, a3_df = (
    pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, tsv_name), sep='\t')
    for tsv_name in (
        "admin_240_annotations_with_questions.tsv",
        "Eduardo_240_annotations_with_questions.tsv",
        "Leodecio_240_annotations_with_questions.tsv",
    )
)

### Compute single score based on the human annotators

In [13]:
# Empty frame that will receive one score column per human annotator.
all_scores_df = pd.DataFrame()

In [14]:
# Side-by-side view of the original score each annotator gave to every passage.
all_scores_df = all_scores_df.assign(
    a1=a1_df['score'],
    a2=a2_df['score'],
    a3=a3_df['score'],
)

In [15]:
all_scores_df

Unnamed: 0,a1,a2,a3
0,3,3,3
1,0,0,0
2,2,1,1
3,2,0,0
4,3,2,1
...,...,...,...
235,2,2,1
236,3,0,1
237,1,3,3
238,1,0,0


In [16]:
# Consolidate the three annotators into a single score per passage: use the
# value shared by at least two annotators, otherwise draw one of the three
# scores at random.
# The draw uses a seeded generator so that the tie-breaks, and everything
# derived from `single_score`, are identical after every kernel restart.
SINGLE_SCORE_SEED = 42
tie_break_rng = np.random.default_rng(SINGLE_SCORE_SEED)

single_score = []

for score_a1, score_a2, score_a3 in zip(all_scores_df['a1'], all_scores_df['a2'], all_scores_df['a3']):
    if score_a1 == score_a2 or score_a1 == score_a3:
        single_score.append(score_a1)

    elif score_a2 == score_a3:
        single_score.append(score_a2)

    else:
        # All three annotators disagree: no majority exists.
        single_score.append(tie_break_rng.choice([score_a1, score_a2, score_a3]))

single_score = np.array(single_score)

In [17]:
single_score

array([3, 0, 1, 0, 3, 0, 0, 0, 1, 0, 3, 3, 2, 2, 1, 1, 1, 0, 1, 0, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 2, 0, 0, 2, 0, 1, 2, 2, 0, 1, 0, 3, 2, 3, 3,
       0, 2, 0, 0, 0, 0, 3, 2, 3, 1, 2, 3, 1, 1, 2, 2, 2, 1, 3, 3, 3, 3,
       2, 3, 1, 3, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 0, 0, 1, 2, 0, 0, 3, 1,
       1, 1, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 3, 3, 3, 2, 3, 2,
       0, 2, 1, 1, 2, 0, 0, 2, 1, 2, 3, 3, 3, 3, 3, 2, 3, 2, 1, 2, 1, 2,
       0, 1, 0, 0, 1, 2, 1, 1, 1, 3, 1, 1, 1, 0, 1, 0, 1, 1, 1, 3, 1, 1,
       2, 0, 2, 0, 1, 0, 1, 1, 2, 2, 2, 1, 0, 2, 2, 2, 3, 1, 2, 2, 2, 1,
       1, 1, 0, 2, 3, 0, 0, 1, 0, 0, 0, 1, 0, 0, 2, 2, 2, 2, 2, 2, 0, 1,
       2, 1, 0, 3, 0, 1, 1, 0, 0, 0, 1, 0, 2, 3, 3, 2, 3, 2, 1, 3, 1, 3,
       3, 3, 1, 2, 2, 3, 3, 0, 0, 0, 1, 0, 2, 0, 0, 2, 3, 3, 0, 0])

In [18]:
# Wrap the consolidated scores in a frame so they can be mapped like the annotator frames.
common_score_df = pd.DataFrame({'score': single_score})

In [19]:
common_score_df

Unnamed: 0,score
0,3
1,0
2,1
3,0
4,3
...,...
235,2
236,3
237,3
238,0


### Convert the 4-score to other ranges

In [20]:
# Add the 3-point score ('03_score') to each annotator frame and to the consolidated frame.
for scored_df in (a1_df, a2_df, a3_df, common_score_df):
    scored_df['03_score'] = scored_df['score'].map(score_mapper)

In [21]:
# Add the evenly split binary score ('02_score') to each annotator frame and to the consolidated frame.
for scored_df in (a1_df, a2_df, a3_df, common_score_df):
    scored_df['02_score'] = scored_df['score'].map(score_mapper_for_2)

In [22]:
# Add the alternative binary score ('02_score_alt') to each annotator frame and to the consolidated frame.
for scored_df in (a1_df, a2_df, a3_df, common_score_df):
    scored_df['02_score_alt'] = scored_df['score'].map(score_mapper_for_2_alt)

## Read GPT passage evaluations

### GPT-4 4-score evaluation, using TREC-DL 21 scores

In [23]:
# Load the GPT-4 evaluations (tab-separated) from the annotations folder.
gpt4_df = pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, "gpt_4_evaluations_240_samples_2nd_round_with_questions.tsv"), sep='\t')

In [24]:
# Derive every reduced scale from GPT-4's score column (new column -> mapping applied).
for derived_column, mapping in (
    ('03_score', score_mapper),
    ('03_score_2', score_mapper_original),
    ('02_score', score_mapper_for_2),
    ('02_score_alt', score_mapper_for_2_alt),
):
    gpt4_df[derived_column] = gpt4_df['score'].map(mapping)

### GPT-4 4-score evaluation using a Chain-of-Thought (CoT) prompt

In [25]:
# Load the GPT-4 evaluations for prompt v2.6: one tab-separated file per query set.
gpt4_1st_cot_2_6_df, gpt4_2nd_cot_2_6_df = (
    pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, tsv_name), sep='\t')
    for tsv_name in (
        "gpt_4_evaluations_120_samples_1st_queries_set_prompt_v2.6.tsv",
        "gpt_4_evaluations_120_samples_2nd_queries_set_prompt_v2.6.tsv",
    )
)

In [26]:
# Add both binary scores and the 3-point score to each half of the v2.6 evaluations.
for half_df in (gpt4_1st_cot_2_6_df, gpt4_2nd_cot_2_6_df):
    half_df['02_score'] = half_df['score'].map(score_mapper_for_2)
    half_df['02_score_alt'] = half_df['score'].map(score_mapper_for_2_alt)
    half_df['03_score'] = half_df['score'].map(score_mapper)

In [27]:
# Stack the 1st and 2nd query sets into one frame with a fresh 0..n-1 index.
gpt4_cot_2_6_df = pd.concat([gpt4_1st_cot_2_6_df, gpt4_2nd_cot_2_6_df], ignore_index=True)

In [None]:
# Save the combined v2.6 evaluations as one tab-separated file, without the index column.
gpt4_cot_2_6_df.to_csv(os.path.join(ANNOTATIONS_FOLDER, "gpt_4_evaluations_240_samples_prompt_v2.6.tsv"), sep='\t', index=False)

### GPT-4 3-score evaluation with a prompt based on the original 4-score evaluation prompt

The only change is the removal of the score 1 description from the prompt.

In [28]:
# Load the GPT-4 evaluations for prompt v2.7: one tab-separated file per query set.
gpt4_1st_03_score_2_7_df, gpt4_2nd_03_score_2_7_df = (
    pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, tsv_name), sep='\t')
    for tsv_name in (
        "gpt_4_evaluations_120_samples_1st_queries_set_prompt_v2.7.tsv",
        "gpt_4_evaluations_120_samples_2nd_queries_set_prompt_v2.7.tsv",
    )
)

In [29]:
# The v2.7 prompt already scores on 3 points: '03_score' is a plain copy of 'score',
# and both binary columns are built with the same 3-point -> binary mapping.
for half_df in (gpt4_1st_03_score_2_7_df, gpt4_2nd_03_score_2_7_df):
    half_df['02_score'] = half_df['score'].map(score_mapper_3_for_2)
    half_df['02_score_alt'] = half_df['score'].map(score_mapper_3_for_2)
    half_df['03_score'] = half_df['score']

In [32]:
# Stack the 1st and 2nd query sets into one frame with a fresh 0..n-1 index.
gpt4_03_score_2_7_df = pd.concat([gpt4_1st_03_score_2_7_df, gpt4_2nd_03_score_2_7_df], ignore_index=True)

In [None]:
# Save the combined v2.7 evaluations as one tab-separated file, without the index column.
gpt4_03_score_2_7_df.to_csv(os.path.join(ANNOTATIONS_FOLDER, "gpt_4_evaluations_240_samples_prompt_v2.7.tsv"), sep='\t', index=False)

### GPT-4 3-score evaluation with prompt with 3 scores translated from MS paper

The 3-score scale is translated from "Large language models can accurately predict searcher preferences", by Thomas et al.

In [30]:
# Load the GPT-4 evaluations for prompt v2.8: one tab-separated file per query set.
gpt4_1st_03_score_2_8_df, gpt4_2nd_03_score_2_8_df = (
    pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, tsv_name), sep='\t')
    for tsv_name in (
        "gpt_4_evaluations_120_samples_1st_queries_set_prompt_v2.8.tsv",
        "gpt_4_evaluations_120_samples_2nd_queries_set_prompt_v2.8.tsv",
    )
)

In [31]:
# The v2.8 prompt already scores on 3 points: '03_score' is a plain copy of 'score',
# and both binary columns are built with the same 3-point -> binary mapping.
for half_df in (gpt4_1st_03_score_2_8_df, gpt4_2nd_03_score_2_8_df):
    half_df['02_score'] = half_df['score'].map(score_mapper_3_for_2)
    half_df['02_score_alt'] = half_df['score'].map(score_mapper_3_for_2)
    half_df['03_score'] = half_df['score']

In [33]:
# Stack the 1st and 2nd query sets into one frame with a fresh 0..n-1 index.
gpt4_03_score_2_8_df = pd.concat([gpt4_1st_03_score_2_8_df, gpt4_2nd_03_score_2_8_df], ignore_index=True)

In [None]:
# Save the combined v2.8 evaluations as one tab-separated file, without the index column.
gpt4_03_score_2_8_df.to_csv(os.path.join(ANNOTATIONS_FOLDER, "gpt_4_evaluations_240_samples_prompt_v2.8.tsv"), sep='\t', index=False)

### GPT-4 3-score evaluation using simplified original 4-score prompt

In [34]:
# Load the GPT-4 evaluations for prompt v2.9: one tab-separated file per query set.
gpt4_1st_03_score_2_9_df, gpt4_2nd_03_score_2_9_df = (
    pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, tsv_name), sep='\t')
    for tsv_name in (
        "gpt_4_evaluations_120_samples_1st_queries_set_prompt_v2.9.tsv",
        "gpt_4_evaluations_120_samples_2nd_queries_set_prompt_v2.9.tsv",
    )
)

In [35]:
# The v2.9 prompt already scores on 3 points: '03_score' is a plain copy of 'score',
# and both binary columns are built with the same 3-point -> binary mapping.
for half_df in (gpt4_1st_03_score_2_9_df, gpt4_2nd_03_score_2_9_df):
    half_df['02_score'] = half_df['score'].map(score_mapper_3_for_2)
    half_df['02_score_alt'] = half_df['score'].map(score_mapper_3_for_2)
    half_df['03_score'] = half_df['score']

In [36]:
# Stack the 1st and 2nd query sets into one frame with a fresh 0..n-1 index.
gpt4_03_score_2_9_df = pd.concat([gpt4_1st_03_score_2_9_df, gpt4_2nd_03_score_2_9_df], ignore_index=True)

In [None]:
# Save the combined v2.9 evaluations as one tab-separated file, without the index column.
gpt4_03_score_2_9_df.to_csv(os.path.join(ANNOTATIONS_FOLDER, "gpt_4_evaluations_240_samples_prompt_v2.9.tsv"), sep='\t', index=False)

### GPT-4 4-score evaluation using a Chain-of-Thought (CoT) prompt with added completeness and adequacy criteria

In [37]:
# Load the GPT-4 evaluations for prompt v3.0: one tab-separated file per query set.
gpt4_1st_cot_3_0_df, gpt4_2nd_cot_3_0_df = (
    pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, tsv_name), sep='\t')
    for tsv_name in (
        "gpt_4_evaluations_120_samples_1st_queries_set_prompt_v3.0.tsv",
        "gpt_4_evaluations_120_samples_2nd_queries_set_prompt_v3.0.tsv",
    )
)

In [38]:
# Add both binary scores and the 3-point score to each half of the v3.0 evaluations.
for half_df in (gpt4_1st_cot_3_0_df, gpt4_2nd_cot_3_0_df):
    half_df['02_score'] = half_df['score'].map(score_mapper_for_2)
    half_df['02_score_alt'] = half_df['score'].map(score_mapper_for_2_alt)
    half_df['03_score'] = half_df['score'].map(score_mapper)

In [39]:
# Stack the 1st and 2nd query sets into one frame with a fresh 0..n-1 index.
gpt4_cot_3_0_df = pd.concat([gpt4_1st_cot_3_0_df, gpt4_2nd_cot_3_0_df], ignore_index=True)

In [None]:
# Save the combined v3.0 evaluations as one tab-separated file, without the index column.
gpt4_cot_3_0_df.to_csv(os.path.join(ANNOTATIONS_FOLDER, "gpt_4_evaluations_240_samples_prompt_v3.0.tsv"), sep='\t', index=False)

### Check that the passage order is exactly the same

In [40]:
# True only when the v3.0 GPT-4 frame lists the same doccano ids, row by row, as annotator a3.
(gpt4_cot_3_0_df['doccano_id'] == a3_df['doccano_id']).all()

True

In [42]:
# True only when the v2.8 GPT-4 frame lists the same doccano ids, row by row, as annotator a3.
(gpt4_03_score_2_8_df['doccano_id'] == a3_df['doccano_id']).all()

True

## Functions to check the correlation according to the selected score

In [43]:
def correlations_240_passages(which_score, only_4_scores=False):
    """Build the Cohen's kappa agreement table over all rows of the evaluation frames.

    The three human annotator frames (``a1_df``, ``a2_df``, ``a3_df``) are compared
    with each other and with each GPT-4 evaluation frame; every frame is read from
    the notebook's global namespace.

    Parameters
    ----------
    which_score : str
        Name of the score column to compare; it must exist in every frame used.
    only_4_scores : bool, default False
        When True, the three 'GPT4 3-score' columns (prompts 2.7, 2.8 and 2.9)
        are left out of the table.

    Returns
    -------
    pandas.DataFrame
        One row per annotator, then a 'mean' row with the column means, then a
        'Difference from mean human annotators' row holding each mean minus the
        average of the a1/a2/a3 means.
    """
    human_scores = [frame[which_score] for frame in (a1_df, a2_df, a3_df)]
    scores_a1, scores_a2, scores_a3 = human_scores

    def kappas_against(gpt_frame):
        # Kappa of a1, a2 and a3 (in that order) against a single GPT-4 evaluation.
        return [cohen_kappa_score(human, gpt_frame[which_score]) for human in human_scores]

    # Each human pair is scored once and reused on both sides of the diagonal.
    kappa_a1_a2 = cohen_kappa_score(scores_a1, scores_a2)
    kappa_a1_a3 = cohen_kappa_score(scores_a1, scores_a3)
    kappa_a2_a3 = cohen_kappa_score(scores_a2, scores_a3)

    agreement = pd.DataFrame()
    agreement['comparisson'] = ['a1', 'a2', 'a3']
    agreement['a1'] = [np.nan, kappa_a1_a2, kappa_a1_a3]
    agreement['a2'] = [kappa_a1_a2, np.nan, kappa_a2_a3]
    agreement['a3'] = [kappa_a1_a3, kappa_a2_a3, np.nan]

    # Column label -> GPT-4 frame, in the order the columns appear in the table.
    gpt_runs = [
        ('GPT4 4-score', gpt4_df),
        ('GPT4 4-score cot 2.6', gpt4_cot_2_6_df),
        ('GPT4 4-score cot 3.0', gpt4_cot_3_0_df),
    ]
    if not only_4_scores:
        gpt_runs += [
            ('GPT4 3-score 2.7', gpt4_03_score_2_7_df),
            ('GPT4 3-score 2.8', gpt4_03_score_2_8_df),
            ('GPT4 3-score 2.9', gpt4_03_score_2_9_df),
        ]

    for column_label, gpt_frame in gpt_runs:
        agreement[column_label] = kappas_against(gpt_frame)

    mean_row = pd.DataFrame(data=[['mean'] + agreement.iloc[:, 1:].mean().to_list()],
                            columns=agreement.columns)
    agreement = pd.concat([agreement, mean_row])

    # Columns 1-3 of the freshly appended 'mean' row are the human annotators.
    human_mean = agreement.iloc[-1, 1:4].mean()

    difference_row = pd.DataFrame(
        data=[['Difference from mean human annotators'] + (agreement.iloc[-1, 1:] - human_mean).to_list()],
        columns=agreement.columns)

    return pd.concat([agreement, difference_row])

In [44]:
def correlations_1st_120_passages(which_score, only_4_scores=False):
    """Build the Cohen's kappa agreement table restricted to rows 0-119.

    The human annotator frames and ``gpt4_df`` are sliced to their first 120 rows;
    the remaining GPT-4 evaluations come from the ``gpt4_1st_*`` frames, used whole.
    Every frame is read from the notebook's global namespace.

    Parameters
    ----------
    which_score : str
        Name of the score column to compare; it must exist in every frame used.
    only_4_scores : bool, default False
        When True, the three 'GPT4 1st 3-score' columns (prompts 2.7, 2.8 and 2.9)
        are left out of the table.

    Returns
    -------
    pandas.DataFrame
        One row per annotator, then a 'mean' row with the column means, then a
        'Difference from mean human annotators' row holding each mean minus the
        average of the a1/a2/a3 means.
    """
    first_rows = slice(0, 120)
    human_scores = [frame.iloc[first_rows][which_score] for frame in (a1_df, a2_df, a3_df)]
    scores_a1, scores_a2, scores_a3 = human_scores

    def kappas_against(gpt_frame):
        # Kappa of a1, a2 and a3 (in that order) against a single GPT-4 evaluation.
        return [cohen_kappa_score(human, gpt_frame[which_score]) for human in human_scores]

    # Each human pair is scored once and reused on both sides of the diagonal.
    kappa_a1_a2 = cohen_kappa_score(scores_a1, scores_a2)
    kappa_a1_a3 = cohen_kappa_score(scores_a1, scores_a3)
    kappa_a2_a3 = cohen_kappa_score(scores_a2, scores_a3)

    agreement = pd.DataFrame()
    agreement['comparisson'] = ['a1', 'a2', 'a3']
    agreement['a1'] = [np.nan, kappa_a1_a2, kappa_a1_a3]
    agreement['a2'] = [kappa_a1_a2, np.nan, kappa_a2_a3]
    agreement['a3'] = [kappa_a1_a3, kappa_a2_a3, np.nan]

    # Column label -> GPT-4 frame, in the order the columns appear in the table.
    gpt_runs = [('GPT4 1st 4-score', gpt4_df.iloc[first_rows])]
    if not only_4_scores:
        gpt_runs += [
            ('GPT4 1st 3-score 2.7', gpt4_1st_03_score_2_7_df),
            ('GPT4 1st 3-score 2.8', gpt4_1st_03_score_2_8_df),
            ('GPT4 1st 3-score 2.9', gpt4_1st_03_score_2_9_df),
        ]
    gpt_runs += [
        ('GPT4 1st 4-score cot 2.6', gpt4_1st_cot_2_6_df),
        ('GPT4 1st 4-score cot 3.0', gpt4_1st_cot_3_0_df),
    ]

    for column_label, gpt_frame in gpt_runs:
        agreement[column_label] = kappas_against(gpt_frame)

    mean_row = pd.DataFrame(data=[['mean'] + agreement.iloc[:, 1:].mean().to_list()],
                            columns=agreement.columns)
    agreement = pd.concat([agreement, mean_row])

    # Columns 1-3 of the freshly appended 'mean' row are the human annotators.
    human_mean = agreement.iloc[-1, 1:4].mean()

    difference_row = pd.DataFrame(
        data=[['Difference from mean human annotators'] + (agreement.iloc[-1, 1:] - human_mean).to_list()],
        columns=agreement.columns)

    return pd.concat([agreement, difference_row])

In [45]:
def correlations_2nd_120_passages(which_score, only_4_scores=False):
    """Build the Cohen's kappa agreement table restricted to rows 120-239.

    The human annotator frames and ``gpt4_df`` are sliced to rows 120-239; the
    remaining GPT-4 evaluations come from the ``gpt4_2nd_*`` frames, used whole.
    Every frame is read from the notebook's global namespace.

    Parameters
    ----------
    which_score : str
        Name of the score column to compare; it must exist in every frame used.
    only_4_scores : bool, default False
        When True, the three 'GPT4 2nd 3-score' columns (prompts 2.7, 2.8 and 2.9)
        are left out of the table.

    Returns
    -------
    pandas.DataFrame
        One row per annotator, then a 'mean' row with the column means, then a
        'Difference from mean human annotators' row holding each mean minus the
        average of the a1/a2/a3 means.
    """
    second_rows = slice(120, 240)
    human_scores = [frame.iloc[second_rows][which_score] for frame in (a1_df, a2_df, a3_df)]
    scores_a1, scores_a2, scores_a3 = human_scores

    def kappas_against(gpt_frame):
        # Kappa of a1, a2 and a3 (in that order) against a single GPT-4 evaluation.
        return [cohen_kappa_score(human, gpt_frame[which_score]) for human in human_scores]

    # Each human pair is scored once and reused on both sides of the diagonal.
    kappa_a1_a2 = cohen_kappa_score(scores_a1, scores_a2)
    kappa_a1_a3 = cohen_kappa_score(scores_a1, scores_a3)
    kappa_a2_a3 = cohen_kappa_score(scores_a2, scores_a3)

    agreement = pd.DataFrame()
    agreement['comparisson'] = ['a1', 'a2', 'a3']
    agreement['a1'] = [np.nan, kappa_a1_a2, kappa_a1_a3]
    agreement['a2'] = [kappa_a1_a2, np.nan, kappa_a2_a3]
    agreement['a3'] = [kappa_a1_a3, kappa_a2_a3, np.nan]

    # Column label -> GPT-4 frame, in the order the columns appear in the table.
    gpt_runs = [('GPT4 2nd 4-score', gpt4_df.iloc[second_rows])]
    if not only_4_scores:
        gpt_runs += [
            ('GPT4 2nd 3-score 2.7', gpt4_2nd_03_score_2_7_df),
            ('GPT4 2nd 3-score 2.8', gpt4_2nd_03_score_2_8_df),
            ('GPT4 2nd 3-score 2.9', gpt4_2nd_03_score_2_9_df),
        ]
    gpt_runs += [
        ('GPT4 2nd 4-score cot 2.6', gpt4_2nd_cot_2_6_df),
        ('GPT4 2nd 4-score cot 3.0', gpt4_2nd_cot_3_0_df),
    ]

    for column_label, gpt_frame in gpt_runs:
        agreement[column_label] = kappas_against(gpt_frame)

    mean_row = pd.DataFrame(data=[['mean'] + agreement.iloc[:, 1:].mean().to_list()],
                            columns=agreement.columns)
    agreement = pd.concat([agreement, mean_row])

    # Columns 1-3 of the freshly appended 'mean' row are the human annotators.
    human_mean = agreement.iloc[-1, 1:4].mean()

    difference_row = pd.DataFrame(
        data=[['Difference from mean human annotators'] + (agreement.iloc[-1, 1:] - human_mean).to_list()],
        columns=agreement.columns)

    return pd.concat([agreement, difference_row])

## Correlations for the 4-score evaluations

In [46]:
display(correlations_240_passages('score', only_4_scores=True))
display(correlations_1st_120_passages('score', only_4_scores=True))
display(correlations_2nd_120_passages('score', only_4_scores=True))

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score,GPT4 4-score cot 2.6,GPT4 4-score cot 3.0
0,a1,,0.436881,0.429402,0.318134,0.279329,0.29951
1,a2,0.436881,,0.410455,0.274609,0.198145,0.219461
2,a3,0.429402,0.410455,,0.355155,0.256689,0.285714
0,mean,0.433142,0.423668,0.419929,0.315966,0.244721,0.268228
0,Difference from mean human annotators,0.007562,-0.001911,-0.005651,-0.109613,-0.180858,-0.157351


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score,GPT4 1st 4-score cot 2.6,GPT4 1st 4-score cot 3.0
0,a1,,0.489892,0.575498,0.396985,0.376026,0.388201
1,a2,0.489892,,0.500324,0.419496,0.222163,0.244028
2,a3,0.575498,0.500324,,0.44949,0.350928,0.30108
0,mean,0.532695,0.495108,0.537911,0.42199,0.316372,0.311103
0,Difference from mean human annotators,0.01079,-0.026797,0.016006,-0.099914,-0.205532,-0.210802


Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score,GPT4 2nd 4-score cot 2.6,GPT4 2nd 4-score cot 3.0
0,a1,,0.369369,0.286629,0.245213,0.190077,0.220076
1,a2,0.369369,,0.322651,0.135825,0.184723,0.207508
2,a3,0.286629,0.322651,,0.26009,0.161074,0.270655
0,mean,0.327999,0.34601,0.30464,0.213709,0.178624,0.232746
0,Difference from mean human annotators,0.001783,0.019794,-0.021576,-0.112507,-0.147592,-0.09347


In [47]:
# GPT-4 is the first argument, so rows are GPT-4 scores and columns are a1 scores (original 4-point scale).
confusion_matrix(gpt4_df['score'], a1_df['score'])

array([[21,  4,  3,  3],
       [20, 34, 15,  5],
       [10, 21, 22,  6],
       [ 1,  9, 25, 41]])

In [48]:
# GPT-4 is the first argument, so rows are GPT-4 scores and columns are a2 scores (original 4-point scale).
confusion_matrix(gpt4_df['score'], a2_df['score'])

array([[25,  4,  2,  0],
       [26, 20, 25,  3],
       [ 8, 15, 30,  6],
       [ 0,  8, 35, 33]])

In [49]:
# GPT-4 is the first argument, so rows are GPT-4 scores and columns are a3 scores (original 4-point scale).
confusion_matrix(gpt4_df['score'], a3_df['score'])

array([[24,  5,  2,  0],
       [22, 26, 16, 10],
       [ 7, 14, 22, 16],
       [ 0,  7, 16, 53]])

## Correlations for the 3-score evaluations

In [50]:
display(correlations_240_passages('03_score'))
display(correlations_1st_120_passages('03_score'))
display(correlations_2nd_120_passages('03_score'))

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score,GPT4 4-score cot 2.6,GPT4 4-score cot 3.0,GPT4 3-score 2.7,GPT4 3-score 2.8,GPT4 3-score 2.9
0,a1,,0.462148,0.435904,0.364067,0.365758,0.360897,0.3037,0.241397,0.327731
1,a2,0.462148,,0.425263,0.353236,0.227882,0.266473,0.297525,0.230846,0.325495
2,a3,0.435904,0.425263,,0.433279,0.329164,0.375672,0.405653,0.316007,0.373688
0,mean,0.449026,0.443706,0.430584,0.383527,0.307601,0.334347,0.335626,0.26275,0.342305
0,Difference from mean human annotators,0.007921,0.0026,-0.010521,-0.057578,-0.133504,-0.106758,-0.105479,-0.178355,-0.0988


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score,GPT4 1st 3-score 2.7,GPT4 1st 3-score 2.8,GPT4 1st 3-score 2.9,GPT4 1st 4-score cot 2.6,GPT4 1st 4-score cot 3.0
0,a1,,0.486323,0.566787,0.423816,0.388535,0.354972,0.402289,0.448939,0.435696
1,a2,0.486323,,0.470389,0.46979,0.434716,0.34796,0.447627,0.259559,0.294047
2,a3,0.566787,0.470389,,0.504886,0.506974,0.386669,0.442885,0.427728,0.407101
0,mean,0.526555,0.478356,0.518588,0.466164,0.443408,0.3632,0.430934,0.378742,0.378948
0,Difference from mean human annotators,0.018722,-0.029477,0.010755,-0.041669,-0.064425,-0.144632,-0.076899,-0.129091,-0.128885


Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score,GPT4 2nd 3-score 2.7,GPT4 2nd 3-score 2.8,GPT4 2nd 3-score 2.9,GPT4 2nd 4-score cot 2.6,GPT4 2nd 4-score cot 3.0
0,a1,,0.408512,0.301099,0.309821,0.230688,0.135827,0.259653,0.292677,0.299506
1,a2,0.408512,,0.378775,0.240262,0.175341,0.124684,0.210932,0.214206,0.258483
2,a3,0.301099,0.378775,,0.362584,0.305838,0.245915,0.305392,0.233434,0.345212
0,mean,0.354805,0.393643,0.339937,0.304222,0.237289,0.168809,0.258659,0.246773,0.301067
0,Difference from mean human annotators,-0.00799,0.030848,-0.022858,-0.058573,-0.125506,-0.193986,-0.104136,-0.116022,-0.061728


In [51]:
# GPT-4 is the first argument, so rows are GPT-4 scores and columns are a1 scores (3-point scale).
confusion_matrix(gpt4_df['03_score'], a1_df['03_score'])

array([[79, 18,  8],
       [31, 22,  6],
       [10, 25, 41]])

In [52]:
# GPT-4 is the first argument, so rows are GPT-4 scores and columns are a2 scores (3-point scale).
confusion_matrix(gpt4_df['03_score'], a2_df['03_score'])

array([[75, 27,  3],
       [23, 30,  6],
       [ 8, 35, 33]])

In [53]:
# GPT-4 is the first argument, so rows are GPT-4 scores and columns are a3 scores (3-point scale).
confusion_matrix(gpt4_df['03_score'], a3_df['03_score'])

array([[77, 18, 10],
       [21, 22, 16],
       [ 7, 16, 53]])

## Correlations for the 2-score evaluations

In [54]:
display(correlations_240_passages('02_score'))
display(correlations_1st_120_passages('02_score'))
display(correlations_2nd_120_passages('02_score'))

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score,GPT4 4-score cot 2.6,GPT4 4-score cot 3.0,GPT4 3-score 2.7,GPT4 3-score 2.8,GPT4 3-score 2.9
0,a1,,0.533333,0.558333,0.441667,0.5,0.491667,0.4,0.366667,0.408333
1,a2,0.533333,,0.551797,0.484144,0.389271,0.415718,0.420948,0.334307,0.447044
2,a3,0.558333,0.551797,,0.525926,0.496907,0.490236,0.496802,0.410989,0.488818
0,mean,0.545833,0.542565,0.555065,0.483912,0.462059,0.465874,0.43925,0.370654,0.448065
0,Difference from mean human annotators,-0.001988,-0.005256,0.007244,-0.063909,-0.085762,-0.081948,-0.108571,-0.177167,-0.099756


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score,GPT4 1st 3-score 2.7,GPT4 1st 3-score 2.8,GPT4 1st 3-score 2.9,GPT4 1st 4-score cot 2.6,GPT4 1st 4-score cot 3.0
0,a1,,0.505882,0.677054,0.426966,0.405099,0.464286,0.386364,0.483871,0.469169
1,a2,0.505882,,0.566474,0.625,0.566474,0.5,0.581395,0.375,0.393782
2,a3,0.677054,0.566474,,0.546473,0.526894,0.458042,0.509029,0.495114,0.479816
0,mean,0.591468,0.536178,0.621764,0.532813,0.499489,0.474109,0.492263,0.451328,0.447589
0,Difference from mean human annotators,0.008331,-0.046959,0.038627,-0.050324,-0.083648,-0.109027,-0.090874,-0.131808,-0.135548


Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score,GPT4 2nd 3-score 2.7,GPT4 2nd 3-score 2.8,GPT4 2nd 3-score 2.9,GPT4 2nd 4-score cot 2.6,GPT4 2nd 4-score cot 3.0
0,a1,,0.525424,0.445652,0.466307,0.408,0.284974,0.436997,0.535912,0.533333
1,a2,0.525424,,0.539474,0.361702,0.300813,0.201278,0.331158,0.435216,0.466667
2,a3,0.445652,0.539474,,0.504556,0.465517,0.363208,0.467963,0.497768,0.5
0,mean,0.485538,0.532449,0.492563,0.444188,0.391443,0.283153,0.41204,0.489632,0.5
0,Difference from mean human annotators,-0.017979,0.028932,-0.010954,-0.059328,-0.112073,-0.220363,-0.091477,-0.013885,-0.003517


In [55]:
# GPT-4 is the first argument, so rows are GPT-4 scores and columns are a1 scores (evenly split binary scale).
confusion_matrix(gpt4_df['02_score'], a1_df['02_score'])

array([[79, 26],
       [41, 94]])

In [56]:
# GPT-4 is the first argument, so rows are GPT-4 scores and columns are a2 scores (evenly split binary scale).
confusion_matrix(gpt4_df['02_score'], a2_df['02_score'])

array([[ 75,  30],
       [ 31, 104]])

In [57]:
# GPT-4 is the first argument, so rows are GPT-4 scores and columns are a3 scores (evenly split binary scale).
confusion_matrix(gpt4_df['02_score'], a3_df['02_score'])

array([[ 77,  28],
       [ 28, 107]])

## Correlations for the alternate 2-score evaluations

In [58]:
display(correlations_240_passages('02_score_alt'))
display(correlations_1st_120_passages('02_score_alt'))
display(correlations_2nd_120_passages('02_score_alt'))

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score,GPT4 4-score cot 2.6,GPT4 4-score cot 3.0,GPT4 3-score 2.7,GPT4 3-score 2.8,GPT4 3-score 2.9
0,a1,,0.660554,0.719626,0.410637,0.417773,0.390286,0.33044,0.454545,0.343201
1,a2,0.660554,,0.720741,0.464943,0.460943,0.394907,0.421045,0.460793,0.41459
2,a3,0.719626,0.720741,,0.48797,0.470603,0.423324,0.396565,0.467873,0.390441
0,mean,0.69009,0.690647,0.720183,0.454517,0.449773,0.402839,0.382683,0.461071,0.382744
0,Difference from mean human annotators,-0.010217,-0.00966,0.019876,-0.24579,-0.250534,-0.297468,-0.317624,-0.239236,-0.317563


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score,GPT4 1st 3-score 2.7,GPT4 1st 3-score 2.8,GPT4 1st 3-score 2.9,GPT4 1st 4-score cot 2.6,GPT4 1st 4-score cot 3.0
0,a1,,0.739583,0.848485,0.545455,0.336918,0.517544,0.347826,0.444444,0.404762
1,a2,0.739583,,0.848485,0.606061,0.37276,0.473684,0.347826,0.484127,0.404762
2,a3,0.848485,0.848485,,0.565217,0.375372,0.525862,0.350962,0.528796,0.410995
0,mean,0.794034,0.794034,0.848485,0.572244,0.361683,0.505697,0.348871,0.485789,0.40684
0,Difference from mean human annotators,-0.01815,-0.01815,0.036301,-0.23994,-0.450501,-0.306488,-0.463313,-0.326395,-0.405345


Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score,GPT4 2nd 3-score 2.7,GPT4 2nd 3-score 2.8,GPT4 2nd 3-score 2.9,GPT4 2nd 4-score cot 2.6,GPT4 2nd 4-score cot 3.0
0,a1,,0.592857,0.599057,0.288321,0.326923,0.393064,0.340452,0.392458,0.378571
1,a2,0.592857,,0.610811,0.352,0.47907,0.450847,0.487023,0.444628,0.394958
2,a3,0.599057,0.610811,,0.413408,0.42029,0.40919,0.432356,0.408659,0.437838
0,mean,0.595957,0.601834,0.604934,0.351243,0.408761,0.4177,0.419944,0.415248,0.403789
0,Difference from mean human annotators,-0.004951,0.000926,0.004026,-0.249665,-0.192147,-0.183208,-0.180965,-0.18566,-0.197119


# Check other metrics

In [59]:
def correlations_240_passages_2(which_score, only_4_scores=False, correlation_fn=None):
    """Build the pairwise correlation table between the annotators and the GPT-4 runs.

    The first three rows hold, for a1, a2 and a3, the correlation of that annotator's
    ``which_score`` column with every other rater's column. Two summary rows follow:
    'mean' (column-wise mean of those three rows) and 'Difference from mean human
    annotators' (each column mean minus the average of the a1/a2/a3 column means).

    Parameters
    ----------
    which_score : str
        Name of the score column read from every frame involved.
    only_4_scores : bool, default False
        When True the three 'GPT4 3-score' columns are left out and the
        corresponding frames are never touched.
    correlation_fn : str or callable
        Either a callable ``f(x, y)`` whose result carries the coefficient at
        position 0, or the name of such a function in the notebook's global
        namespace (e.g. ``'pearsonr'``, ``'kendalltau'``, ``'spearmanr'``).

    Returns
    -------
    pandas.DataFrame
        One 'comparisson' label column plus one column per rater; the two summary
        rows keep index label 0, exactly as the concatenation produces them.

    Raises
    ------
    KeyError
        If ``correlation_fn`` is neither a callable nor the name of a callable global.
    """
    # Resolve the statistic once. Accepting the function object itself avoids the
    # string round-trip, and the explicit check replaces the bare ``KeyError: None``
    # that the default argument used to trigger.
    if callable(correlation_fn):
        stat_fn = correlation_fn
    else:
        stat_fn = globals().get(correlation_fn)
        if not callable(stat_fn):
            raise KeyError(
                "correlation_fn must be a callable or the name of a function "
                "available in the notebook namespace (e.g. 'pearsonr'); "
                f"got {correlation_fn!r}"
            )

    def statistic(left_df, right_df):
        # Position 0 of the result is the coefficient; anything after it is ignored.
        return stat_fn(left_df[which_score], right_df[which_score])[0]

    humans = (a1_df, a2_df, a3_df)

    # Each human pair is evaluated once (always lower-numbered annotator first) and
    # mirrored across the diagonal, which is what the table contained before.
    a1_a2 = statistic(a1_df, a2_df)
    a1_a3 = statistic(a1_df, a3_df)
    a2_a3 = statistic(a2_df, a3_df)

    columns = {
        'comparisson': ['a1', 'a2', 'a3'],
        'a1': [np.nan, a1_a2, a1_a3],
        'a2': [a1_a2, np.nan, a2_a3],
        'a3': [a1_a3, a2_a3, np.nan],
    }

    # Column order matters for the displayed table: 4-score runs first, 3-score runs last.
    gpt_runs = [('GPT4 4-score', gpt4_df),
                ('GPT4 4-score cot 2.6', gpt4_cot_2_6_df),
                ('GPT4 4-score cot 3.0', gpt4_cot_3_0_df)]
    if not only_4_scores:
        gpt_runs += [('GPT4 3-score 2.7', gpt4_03_score_2_7_df),
                     ('GPT4 3-score 2.8', gpt4_03_score_2_8_df),
                     ('GPT4 3-score 2.9', gpt4_03_score_2_9_df)]

    for label, gpt_frame in gpt_runs:
        columns[label] = [statistic(human_df, gpt_frame) for human_df in humans]

    general_agreement_df = pd.DataFrame(columns)

    # Column means; DataFrame.mean skips the NaN on the human diagonal.
    mean_row = pd.DataFrame(
        data=[['mean'] + general_agreement_df.iloc[:, 1:].mean().to_list()],
        columns=general_agreement_df.columns)
    general_agreement_df = pd.concat([general_agreement_df, mean_row])

    # Average of the three human column means (columns 1..3 of the 'mean' row).
    human_mean = general_agreement_df.iloc[-1, 1:4].mean()

    difference_row = pd.DataFrame(
        data=[['Difference from mean human annotators']
              + (general_agreement_df.iloc[-1, 1:] - human_mean).to_list()],
        columns=general_agreement_df.columns)

    return pd.concat([general_agreement_df, difference_row])

In [60]:
def correlations_1st_120_passages_2(which_score, only_4_scores=False, correlation_fn=None):
    """Build the pairwise correlation table restricted to rows 0..119 of the annotator frames.

    The annotator frames and ``gpt4_df`` are sliced with ``.iloc[0:120]``; the
    ``gpt4_1st_*`` frames are used as they are (presumably they already hold only
    those passages -- confirm where they are loaded). Rows are a1, a2, a3, then
    'mean' (column-wise mean) and 'Difference from mean human annotators' (each
    column mean minus the average of the a1/a2/a3 column means).

    Parameters
    ----------
    which_score : str
        Name of the score column read from every frame involved.
    only_4_scores : bool, default False
        When True the three 'GPT4 1st 3-score' columns are left out and the
        corresponding frames are never touched.
    correlation_fn : str or callable
        Either a callable ``f(x, y)`` whose result carries the coefficient at
        position 0, or the name of such a function in the notebook's global
        namespace (e.g. ``'pearsonr'``, ``'kendalltau'``, ``'spearmanr'``).

    Returns
    -------
    pandas.DataFrame
        One 'comparisson' label column plus one column per rater; the two summary
        rows keep index label 0, exactly as the concatenation produces them.

    Raises
    ------
    KeyError
        If ``correlation_fn`` is neither a callable nor the name of a callable global.
    """
    # Resolve the statistic once; a callable is accepted directly and a bad or
    # missing name now fails with a readable message instead of ``KeyError: None``.
    if callable(correlation_fn):
        stat_fn = correlation_fn
    else:
        stat_fn = globals().get(correlation_fn)
        if not callable(stat_fn):
            raise KeyError(
                "correlation_fn must be a callable or the name of a function "
                "available in the notebook namespace (e.g. 'pearsonr'); "
                f"got {correlation_fn!r}"
            )

    def statistic(left_scores, right_scores):
        # Position 0 of the result is the coefficient; anything after it is ignored.
        return stat_fn(left_scores, right_scores)[0]

    a1_scores, a2_scores, a3_scores = (
        frame.iloc[0:120][which_score] for frame in (a1_df, a2_df, a3_df)
    )
    human_scores = (a1_scores, a2_scores, a3_scores)

    # Each human pair is evaluated once (lower-numbered annotator first) and mirrored.
    a1_a2 = statistic(a1_scores, a2_scores)
    a1_a3 = statistic(a1_scores, a3_scores)
    a2_a3 = statistic(a2_scores, a3_scores)

    columns = {
        'comparisson': ['a1', 'a2', 'a3'],
        'a1': [np.nan, a1_a2, a1_a3],
        'a2': [a1_a2, np.nan, a2_a3],
        'a3': [a1_a3, a2_a3, np.nan],
    }

    # Displayed column order: plain 4-score run, optional 3-score runs, then the cot runs.
    gpt_runs = [('GPT4 1st 4-score', gpt4_df.iloc[0:120])]
    if not only_4_scores:
        gpt_runs += [('GPT4 1st 3-score 2.7', gpt4_1st_03_score_2_7_df),
                     ('GPT4 1st 3-score 2.8', gpt4_1st_03_score_2_8_df),
                     ('GPT4 1st 3-score 2.9', gpt4_1st_03_score_2_9_df)]
    gpt_runs += [('GPT4 1st 4-score cot 2.6', gpt4_1st_cot_2_6_df),
                 ('GPT4 1st 4-score cot 3.0', gpt4_1st_cot_3_0_df)]

    for label, gpt_frame in gpt_runs:
        gpt_scores = gpt_frame[which_score]
        columns[label] = [statistic(scores, gpt_scores) for scores in human_scores]

    general_agreement_df = pd.DataFrame(columns)

    # Column means; DataFrame.mean skips the NaN on the human diagonal.
    mean_row = pd.DataFrame(
        data=[['mean'] + general_agreement_df.iloc[:, 1:].mean().to_list()],
        columns=general_agreement_df.columns)
    general_agreement_df = pd.concat([general_agreement_df, mean_row])

    # Average of the three human column means (columns 1..3 of the 'mean' row).
    human_mean = general_agreement_df.iloc[-1, 1:4].mean()

    difference_row = pd.DataFrame(
        data=[['Difference from mean human annotators']
              + (general_agreement_df.iloc[-1, 1:] - human_mean).to_list()],
        columns=general_agreement_df.columns)

    return pd.concat([general_agreement_df, difference_row])

In [61]:
def correlations_2nd_120_passages_2(which_score, only_4_scores=False, correlation_fn=None):
    """Build the pairwise correlation table restricted to rows 120..239 of the annotator frames.

    The annotator frames and ``gpt4_df`` are sliced with ``.iloc[120:240]``; the
    ``gpt4_2nd_*`` frames are used as they are (presumably they already hold only
    those passages -- confirm where they are loaded). Rows are a1, a2, a3, then
    'mean' (column-wise mean) and 'Difference from mean human annotators' (each
    column mean minus the average of the a1/a2/a3 column means).

    Parameters
    ----------
    which_score : str
        Name of the score column read from every frame involved.
    only_4_scores : bool, default False
        When True the three 'GPT4 2nd 3-score' columns are left out and the
        corresponding frames are never touched.
    correlation_fn : str or callable
        Either a callable ``f(x, y)`` whose result carries the coefficient at
        position 0, or the name of such a function in the notebook's global
        namespace (e.g. ``'pearsonr'``, ``'kendalltau'``, ``'spearmanr'``).

    Returns
    -------
    pandas.DataFrame
        One 'comparisson' label column plus one column per rater; the two summary
        rows keep index label 0, exactly as the concatenation produces them.

    Raises
    ------
    KeyError
        If ``correlation_fn`` is neither a callable nor the name of a callable global.
    """
    # Resolve the statistic once; a callable is accepted directly and a bad or
    # missing name now fails with a readable message instead of ``KeyError: None``.
    if callable(correlation_fn):
        stat_fn = correlation_fn
    else:
        stat_fn = globals().get(correlation_fn)
        if not callable(stat_fn):
            raise KeyError(
                "correlation_fn must be a callable or the name of a function "
                "available in the notebook namespace (e.g. 'pearsonr'); "
                f"got {correlation_fn!r}"
            )

    def statistic(left_scores, right_scores):
        # Position 0 of the result is the coefficient; anything after it is ignored.
        return stat_fn(left_scores, right_scores)[0]

    a1_scores, a2_scores, a3_scores = (
        frame.iloc[120:240][which_score] for frame in (a1_df, a2_df, a3_df)
    )
    human_scores = (a1_scores, a2_scores, a3_scores)

    # Each human pair is evaluated once (lower-numbered annotator first) and mirrored.
    a1_a2 = statistic(a1_scores, a2_scores)
    a1_a3 = statistic(a1_scores, a3_scores)
    a2_a3 = statistic(a2_scores, a3_scores)

    columns = {
        'comparisson': ['a1', 'a2', 'a3'],
        'a1': [np.nan, a1_a2, a1_a3],
        'a2': [a1_a2, np.nan, a2_a3],
        'a3': [a1_a3, a2_a3, np.nan],
    }

    # Displayed column order: plain 4-score run, optional 3-score runs, then the cot runs.
    gpt_runs = [('GPT4 2nd 4-score', gpt4_df.iloc[120:240])]
    if not only_4_scores:
        gpt_runs += [('GPT4 2nd 3-score 2.7', gpt4_2nd_03_score_2_7_df),
                     ('GPT4 2nd 3-score 2.8', gpt4_2nd_03_score_2_8_df),
                     ('GPT4 2nd 3-score 2.9', gpt4_2nd_03_score_2_9_df)]
    gpt_runs += [('GPT4 2nd 4-score cot 2.6', gpt4_2nd_cot_2_6_df),
                 ('GPT4 2nd 4-score cot 3.0', gpt4_2nd_cot_3_0_df)]

    for label, gpt_frame in gpt_runs:
        gpt_scores = gpt_frame[which_score]
        columns[label] = [statistic(scores, gpt_scores) for scores in human_scores]

    general_agreement_df = pd.DataFrame(columns)

    # Column means; DataFrame.mean skips the NaN on the human diagonal.
    mean_row = pd.DataFrame(
        data=[['mean'] + general_agreement_df.iloc[:, 1:].mean().to_list()],
        columns=general_agreement_df.columns)
    general_agreement_df = pd.concat([general_agreement_df, mean_row])

    # Average of the three human column means (columns 1..3 of the 'mean' row).
    human_mean = general_agreement_df.iloc[-1, 1:4].mean()

    difference_row = pd.DataFrame(
        data=[['Difference from mean human annotators']
              + (general_agreement_df.iloc[-1, 1:] - human_mean).to_list()],
        columns=general_agreement_df.columns)

    return pd.concat([general_agreement_df, difference_row])

## Pearson correlations for the 4-score evaluations

In [62]:
# Pearson on the 4-score column; the 3-score GPT-4 columns are excluded.
for build_table in (correlations_240_passages_2,
                    correlations_1st_120_passages_2,
                    correlations_2nd_120_passages_2):
    display(build_table('score', only_4_scores=True, correlation_fn='pearsonr'))

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score,GPT4 4-score cot 2.6,GPT4 4-score cot 3.0
0,a1,,0.698174,0.697289,0.584699,0.598846,0.550837
1,a2,0.698174,,0.713168,0.657448,0.562329,0.551722
2,a3,0.697289,0.713168,,0.670949,0.621463,0.616455
0,mean,0.697732,0.705671,0.705229,0.637699,0.594213,0.573005
0,Difference from mean human annotators,-0.005145,0.002794,0.002352,-0.065178,-0.108665,-0.129872


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score,GPT4 1st 4-score cot 2.6,GPT4 1st 4-score cot 3.0
0,a1,,0.719862,0.82506,0.646361,0.66178,0.590493
1,a2,0.719862,,0.768756,0.716299,0.629421,0.611082
2,a3,0.82506,0.768756,,0.699152,0.6956,0.647778
0,mean,0.772461,0.744309,0.796908,0.687271,0.662267,0.616451
0,Difference from mean human annotators,0.001235,-0.026917,0.025682,-0.083956,-0.108959,-0.154775


Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score,GPT4 2nd 4-score cot 2.6,GPT4 2nd 4-score cot 3.0
0,a1,,0.657464,0.576163,0.539697,0.564848,0.562433
1,a2,0.657464,,0.687592,0.638165,0.547935,0.573026
2,a3,0.576163,0.687592,,0.643245,0.547676,0.594033
0,mean,0.616813,0.672528,0.631878,0.607036,0.553486,0.576497
0,Difference from mean human annotators,-0.023593,0.032122,-0.008529,-0.033371,-0.08692,-0.063909


## Pearson correlations for the 3-score evaluations

In [63]:
# Pearson on the '03_score' column: all passages, then each block of 120.
for build_table in (correlations_240_passages_2,
                    correlations_1st_120_passages_2,
                    correlations_2nd_120_passages_2):
    display(build_table('03_score', correlation_fn='pearsonr'))

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score,GPT4 4-score cot 2.6,GPT4 4-score cot 3.0,GPT4 3-score 2.7,GPT4 3-score 2.8,GPT4 3-score 2.9
0,a1,,0.624611,0.603764,0.563262,0.548172,0.538304,0.483352,0.49409,0.502817
1,a2,0.624611,,0.610733,0.585558,0.481453,0.515645,0.504904,0.487357,0.519902
2,a3,0.603764,0.610733,,0.612604,0.534899,0.573317,0.585592,0.595483,0.575787
0,mean,0.614187,0.617672,0.607248,0.587141,0.521508,0.542422,0.524616,0.525643,0.532835
0,Difference from mean human annotators,0.001151,0.004636,-0.005787,-0.025894,-0.091528,-0.070614,-0.08842,-0.087392,-0.080201


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score,GPT4 1st 3-score 2.7,GPT4 1st 3-score 2.8,GPT4 1st 3-score 2.9,GPT4 1st 4-score cot 2.6,GPT4 1st 4-score cot 3.0
0,a1,,0.650517,0.75372,0.619977,0.555287,0.601191,0.545797,0.63752,0.615403
1,a2,0.650517,,0.681858,0.664436,0.628201,0.625974,0.616703,0.546524,0.596693
2,a3,0.75372,0.681858,,0.645738,0.615008,0.633236,0.605317,0.611923,0.619433
0,mean,0.702118,0.666187,0.717789,0.643384,0.599499,0.620134,0.589272,0.598656,0.61051
0,Difference from mean human annotators,0.006754,-0.029177,0.022424,-0.051981,-0.095866,-0.075231,-0.106093,-0.096709,-0.084855


Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score,GPT4 2nd 3-score 2.7,GPT4 2nd 3-score 2.8,GPT4 2nd 3-score 2.9,GPT4 2nd 4-score cot 2.6,GPT4 2nd 4-score cot 3.0
0,a1,,0.562838,0.460076,0.531255,0.465806,0.420646,0.499482,0.486834,0.522264
1,a2,0.562838,,0.570449,0.551654,0.463533,0.401395,0.484905,0.467976,0.526225
2,a3,0.460076,0.570449,,0.578859,0.563173,0.560491,0.54928,0.454689,0.535568
0,mean,0.511457,0.566644,0.515263,0.553923,0.497504,0.460844,0.511222,0.469833,0.528019
0,Difference from mean human annotators,-0.019664,0.035522,-0.015858,0.022802,-0.033617,-0.070277,-0.019899,-0.061288,-0.003102


## Pearson correlations for the 2-score evaluations

In [64]:
# Pearson on the '02_score' column: all passages, then each block of 120.
for build_table in (correlations_240_passages_2,
                    correlations_1st_120_passages_2,
                    correlations_2nd_120_passages_2):
    display(build_table('02_score', correlation_fn='pearsonr'))

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score,GPT4 4-score cot 2.6,GPT4 4-score cot 3.0,GPT4 3-score 2.7,GPT4 3-score 2.8,GPT4 3-score 2.9
0,a1,,0.537,0.562747,0.445158,0.501745,0.494577,0.406897,0.403347,0.414733
1,a2,0.537,,0.551817,0.484161,0.397139,0.426376,0.421928,0.352279,0.447839
2,a3,0.562747,0.551817,,0.525926,0.507819,0.503768,0.497689,0.431944,0.489458
0,mean,0.549874,0.544409,0.557282,0.485082,0.468901,0.474907,0.442171,0.395857,0.450677
0,Difference from mean human annotators,-0.000648,-0.006113,0.00676,-0.06544,-0.08162,-0.075614,-0.10835,-0.154665,-0.099845


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score,GPT4 1st 3-score 2.7,GPT4 1st 3-score 2.8,GPT4 1st 3-score 2.9,GPT4 1st 4-score cot 2.6,GPT4 1st 4-score cot 3.0
0,a1,,0.513948,0.677936,0.429166,0.405627,0.479512,0.386588,0.517549,0.505003
1,a2,0.513948,,0.581441,0.649626,0.581441,0.501486,0.594564,0.433013,0.458731
2,a3,0.677936,0.581441,,0.547168,0.526894,0.479718,0.509102,0.52067,0.507379
0,mean,0.595942,0.547695,0.629688,0.541987,0.504654,0.486905,0.496752,0.490411,0.490371
0,Difference from mean human annotators,0.004834,-0.043414,0.03858,-0.049122,-0.086454,-0.104203,-0.094357,-0.100698,-0.100737


Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score,GPT4 2nd 3-score 2.7,GPT4 2nd 3-score 2.8,GPT4 2nd 3-score 2.9,GPT4 2nd 4-score cot 2.6,GPT4 2nd 4-score cot 3.0
0,a1,,0.526635,0.46618,0.495775,0.445164,0.343851,0.470374,0.546837,0.540899
1,a2,0.526635,,0.554371,0.376572,0.32005,0.234188,0.348312,0.439111,0.469018
2,a3,0.46618,0.554371,,0.50522,0.468945,0.383147,0.469696,0.500298,0.504505
0,mean,0.496408,0.540503,0.510275,0.459189,0.411386,0.320395,0.429461,0.495415,0.504807
0,Difference from mean human annotators,-0.019321,0.024774,-0.005453,-0.05654,-0.104342,-0.195333,-0.086268,-0.020313,-0.010922


## Pearson correlations for the alternate 2-score evaluations

In [65]:
# Pearson on the '02_score_alt' column: all passages, then each block of 120.
for build_table in (correlations_240_passages_2,
                    correlations_1st_120_passages_2,
                    correlations_2nd_120_passages_2):
    display(build_table('02_score_alt', correlation_fn='pearsonr'))

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score,GPT4 4-score cot 2.6,GPT4 4-score cot 3.0,GPT4 3-score 2.7,GPT4 3-score 2.8,GPT4 3-score 2.9
0,a1,,0.662783,0.71968,0.430731,0.437404,0.406427,0.365577,0.463571,0.381107
1,a2,0.662783,,0.722504,0.501442,0.472478,0.403194,0.451006,0.463941,0.445478
2,a3,0.71968,0.722504,,0.513769,0.490974,0.439365,0.436478,0.476056,0.4313
0,mean,0.691231,0.692643,0.721092,0.481981,0.466952,0.416329,0.417687,0.467856,0.419295
0,Difference from mean human annotators,-0.010424,-0.009012,0.019436,-0.219674,-0.234703,-0.285327,-0.283968,-0.233799,-0.28236


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score,GPT4 1st 3-score 2.7,GPT4 1st 3-score 2.8,GPT4 1st 3-score 2.9,GPT4 1st 4-score cot 2.6,GPT4 1st 4-score cot 3.0
0,a1,,0.739583,0.849569,0.566947,0.39436,0.536452,0.403604,0.4842,0.440968
1,a2,0.739583,,0.849569,0.629941,0.436313,0.49099,0.403604,0.527432,0.440968
2,a3,0.849569,0.849569,,0.596341,0.428384,0.538517,0.397316,0.565272,0.439345
0,mean,0.794576,0.794576,0.849569,0.597743,0.419686,0.521987,0.401508,0.525635,0.440427
0,Difference from mean human annotators,-0.018331,-0.018331,0.036662,-0.215164,-0.393222,-0.290921,-0.411399,-0.287272,-0.37248


Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score,GPT4 2nd 3-score 2.7,GPT4 2nd 3-score 2.8,GPT4 2nd 3-score 2.9,GPT4 2nd 4-score cot 2.6,GPT4 2nd 4-score cot 3.0
0,a1,,0.599641,0.599223,0.305258,0.345932,0.396431,0.364625,0.399626,0.382904
1,a2,0.599641,,0.620159,0.395515,0.487583,0.450941,0.499237,0.444978,0.394958
2,a3,0.599223,0.620159,,0.434423,0.448281,0.414077,0.467093,0.418089,0.444539
0,mean,0.599432,0.6099,0.609691,0.378399,0.427265,0.420483,0.443652,0.420898,0.407467
0,Difference from mean human annotators,-0.006909,0.003559,0.00335,-0.227942,-0.179076,-0.185858,-0.162689,-0.185443,-0.198874


## Kendall Tau correlations for the 4-score evaluations

In [66]:
# Kendall tau on the 4-score column; the 3-score GPT-4 columns are excluded.
for build_table in (correlations_240_passages_2,
                    correlations_1st_120_passages_2,
                    correlations_2nd_120_passages_2):
    display(build_table('score', only_4_scores=True, correlation_fn='kendalltau'))

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score,GPT4 4-score cot 2.6,GPT4 4-score cot 3.0
0,a1,,0.623709,0.622409,0.518555,0.517395,0.481892
1,a2,0.623709,,0.626314,0.571902,0.480671,0.472718
2,a3,0.622409,0.626314,,0.586204,0.530991,0.536862
0,mean,0.623059,0.625011,0.624362,0.558887,0.509685,0.497157
0,Difference from mean human annotators,-0.001085,0.000868,0.000218,-0.065257,-0.114458,-0.126987


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score,GPT4 1st 4-score cot 2.6,GPT4 1st 4-score cot 3.0
0,a1,,0.643791,0.751429,0.577191,0.600578,0.528179
1,a2,0.643791,,0.691653,0.639852,0.586733,0.55612
2,a3,0.751429,0.691653,,0.623634,0.636529,0.576167
0,mean,0.69761,0.667722,0.721541,0.613559,0.607947,0.553489
0,Difference from mean human annotators,0.001986,-0.027902,0.025916,-0.082065,-0.087677,-0.142136


Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score,GPT4 2nd 4-score cot 2.6,GPT4 2nd 4-score cot 3.0
0,a1,,0.590647,0.510594,0.49251,0.487789,0.50588
1,a2,0.590647,,0.600221,0.559817,0.464229,0.494063
2,a3,0.510594,0.600221,,0.54986,0.436586,0.510958
0,mean,0.55062,0.595434,0.555407,0.534063,0.462868,0.503634
0,Difference from mean human annotators,-0.016534,0.02828,-0.011746,-0.033091,-0.104286,-0.06352


## Kendall Tau correlations for the 3-score evaluations

In [67]:
# Kendall tau on the '03_score' column: all passages, then each block of 120.
for build_table in (correlations_240_passages_2,
                    correlations_1st_120_passages_2,
                    correlations_2nd_120_passages_2):
    display(build_table('03_score', correlation_fn='kendalltau'))

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score,GPT4 4-score cot 2.6,GPT4 4-score cot 3.0,GPT4 3-score 2.7,GPT4 3-score 2.8,GPT4 3-score 2.9
0,a1,,0.573817,0.562722,0.510435,0.504594,0.506972,0.447594,0.45704,0.467369
1,a2,0.573817,,0.563727,0.532865,0.432673,0.474528,0.460287,0.445006,0.478278
2,a3,0.562722,0.563727,,0.558807,0.493586,0.53082,0.53512,0.541417,0.522182
0,mean,0.568269,0.568772,0.563224,0.534035,0.476951,0.504107,0.481,0.481154,0.489276
0,Difference from mean human annotators,0.001514,0.002017,-0.003531,-0.03272,-0.089804,-0.062648,-0.085755,-0.085601,-0.077479


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score,GPT4 1st 3-score 2.7,GPT4 1st 3-score 2.8,GPT4 1st 3-score 2.9,GPT4 1st 4-score cot 2.6,GPT4 1st 4-score cot 3.0
0,a1,,0.598965,0.702547,0.550848,0.499765,0.547607,0.493824,0.586395,0.575432
1,a2,0.598965,,0.630943,0.622677,0.583275,0.574656,0.576349,0.501474,0.551888
2,a3,0.702547,0.630943,,0.593732,0.569637,0.581042,0.553192,0.570975,0.573102
0,mean,0.650756,0.614954,0.666745,0.589086,0.550892,0.567768,0.541122,0.552948,0.566807
0,Difference from mean human annotators,0.006604,-0.029198,0.022593,-0.055066,-0.093259,-0.076383,-0.10303,-0.091203,-0.077344


Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score,GPT4 2nd 3-score 2.7,GPT4 2nd 3-score 2.8,GPT4 2nd 3-score 2.9,GPT4 2nd 4-score cot 2.6,GPT4 2nd 4-score cot 3.0
0,a1,,0.527907,0.437531,0.507939,0.456146,0.413865,0.485746,0.479905,0.522965
1,a2,0.527907,,0.534678,0.488017,0.421082,0.366584,0.436908,0.432848,0.496697
2,a3,0.437531,0.534678,,0.52363,0.507003,0.505729,0.494181,0.417502,0.500648
0,mean,0.482719,0.531293,0.486105,0.506529,0.46141,0.428726,0.472278,0.443419,0.50677
0,Difference from mean human annotators,-0.01732,0.031254,-0.013934,0.00649,-0.038628,-0.071313,-0.02776,-0.05662,0.006731


## Kendall Tau correlations for the 2-score evaluations

In [68]:
# Kendall tau on the '02_score' column: all passages, then each block of 120.
for build_table in (correlations_240_passages_2,
                    correlations_1st_120_passages_2,
                    correlations_2nd_120_passages_2):
    display(build_table('02_score', correlation_fn='kendalltau'))

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score,GPT4 4-score cot 2.6,GPT4 4-score cot 3.0,GPT4 3-score 2.7,GPT4 3-score 2.8,GPT4 3-score 2.9
0,a1,,0.537,0.562747,0.445158,0.501745,0.494577,0.406897,0.403347,0.414733
1,a2,0.537,,0.551817,0.484161,0.397139,0.426376,0.421928,0.352279,0.447839
2,a3,0.562747,0.551817,,0.525926,0.507819,0.503768,0.497689,0.431944,0.489458
0,mean,0.549874,0.544409,0.557282,0.485082,0.468901,0.474907,0.442171,0.395857,0.450677
0,Difference from mean human annotators,-0.000648,-0.006113,0.00676,-0.06544,-0.08162,-0.075614,-0.10835,-0.154665,-0.099845


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score,GPT4 1st 3-score 2.7,GPT4 1st 3-score 2.8,GPT4 1st 3-score 2.9,GPT4 1st 4-score cot 2.6,GPT4 1st 4-score cot 3.0
0,a1,,0.513948,0.677936,0.429166,0.405627,0.479512,0.386588,0.517549,0.505003
1,a2,0.513948,,0.581441,0.649626,0.581441,0.501486,0.594564,0.433013,0.458731
2,a3,0.677936,0.581441,,0.547168,0.526894,0.479718,0.509102,0.52067,0.507379
0,mean,0.595942,0.547695,0.629688,0.541987,0.504654,0.486905,0.496752,0.490411,0.490371
0,Difference from mean human annotators,0.004834,-0.043414,0.03858,-0.049122,-0.086454,-0.104203,-0.094357,-0.100698,-0.100737


Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score,GPT4 2nd 3-score 2.7,GPT4 2nd 3-score 2.8,GPT4 2nd 3-score 2.9,GPT4 2nd 4-score cot 2.6,GPT4 2nd 4-score cot 3.0
0,a1,,0.526635,0.46618,0.495775,0.445164,0.343851,0.470374,0.546837,0.540899
1,a2,0.526635,,0.554371,0.376572,0.32005,0.234188,0.348312,0.439111,0.469018
2,a3,0.46618,0.554371,,0.50522,0.468945,0.383147,0.469696,0.500298,0.504505
0,mean,0.496408,0.540503,0.510275,0.459189,0.411386,0.320395,0.429461,0.495415,0.504807
0,Difference from mean human annotators,-0.019321,0.024774,-0.005453,-0.05654,-0.104342,-0.195333,-0.086268,-0.020313,-0.010922


## Kendall Tau correlations for the alternate 2-score evaluations

In [69]:
# Kendall tau on the '02_score_alt' column: all passages, then each block of 120.
for build_table in (correlations_240_passages_2,
                    correlations_1st_120_passages_2,
                    correlations_2nd_120_passages_2):
    display(build_table('02_score_alt', correlation_fn='kendalltau'))

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score,GPT4 4-score cot 2.6,GPT4 4-score cot 3.0,GPT4 3-score 2.7,GPT4 3-score 2.8,GPT4 3-score 2.9
0,a1,,0.662783,0.71968,0.430731,0.437404,0.406427,0.365577,0.463571,0.381107
1,a2,0.662783,,0.722504,0.501442,0.472478,0.403194,0.451006,0.463941,0.445478
2,a3,0.71968,0.722504,,0.513769,0.490974,0.439365,0.436478,0.476056,0.4313
0,mean,0.691231,0.692643,0.721092,0.481981,0.466952,0.416329,0.417687,0.467856,0.419295
0,Difference from mean human annotators,-0.010424,-0.009012,0.019436,-0.219674,-0.234703,-0.285327,-0.283968,-0.233799,-0.28236


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score,GPT4 1st 3-score 2.7,GPT4 1st 3-score 2.8,GPT4 1st 3-score 2.9,GPT4 1st 4-score cot 2.6,GPT4 1st 4-score cot 3.0
0,a1,,0.739583,0.849569,0.566947,0.39436,0.536452,0.403604,0.4842,0.440968
1,a2,0.739583,,0.849569,0.629941,0.436313,0.49099,0.403604,0.527432,0.440968
2,a3,0.849569,0.849569,,0.596341,0.428384,0.538517,0.397316,0.565272,0.439345
0,mean,0.794576,0.794576,0.849569,0.597743,0.419686,0.521987,0.401508,0.525635,0.440427
0,Difference from mean human annotators,-0.018331,-0.018331,0.036662,-0.215164,-0.393222,-0.290921,-0.411399,-0.287272,-0.37248


Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score,GPT4 2nd 3-score 2.7,GPT4 2nd 3-score 2.8,GPT4 2nd 3-score 2.9,GPT4 2nd 4-score cot 2.6,GPT4 2nd 4-score cot 3.0
0,a1,,0.599641,0.599223,0.305258,0.345932,0.396431,0.364625,0.399626,0.382904
1,a2,0.599641,,0.620159,0.395515,0.487583,0.450941,0.499237,0.444978,0.394958
2,a3,0.599223,0.620159,,0.434423,0.448281,0.414077,0.467093,0.418089,0.444539
0,mean,0.599432,0.6099,0.609691,0.378399,0.427265,0.420483,0.443652,0.420898,0.407467
0,Difference from mean human annotators,-0.006909,0.003559,0.00335,-0.227942,-0.179076,-0.185858,-0.162689,-0.185443,-0.198874


## Spearman correlations for the 4-score evaluations

In [70]:
# Spearman on the 4-score column; the 3-score GPT-4 columns are excluded.
for build_table in (correlations_240_passages_2,
                    correlations_1st_120_passages_2,
                    correlations_2nd_120_passages_2):
    display(build_table('score', only_4_scores=True, correlation_fn='spearmanr'))

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score,GPT4 4-score cot 2.6,GPT4 4-score cot 3.0
0,a1,,0.693098,0.692357,0.588055,0.600356,0.543208
1,a2,0.693098,,0.698475,0.650614,0.561718,0.540211
2,a3,0.692357,0.698475,,0.666586,0.624874,0.616233
0,mean,0.692728,0.695787,0.695416,0.635085,0.595649,0.566551
0,Difference from mean human annotators,-0.001916,0.001143,0.000773,-0.059558,-0.098994,-0.128093


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score,GPT4 1st 4-score cot 2.6,GPT4 1st 4-score cot 3.0
0,a1,,0.706979,0.81708,0.644705,0.676451,0.580842
1,a2,0.706979,,0.752948,0.708052,0.665069,0.623036
2,a3,0.81708,0.752948,,0.695689,0.719307,0.650951
0,mean,0.76203,0.729963,0.785014,0.682815,0.686942,0.618276
0,Difference from mean human annotators,0.003027,-0.029039,0.026012,-0.076187,-0.07206,-0.140726


Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score,GPT4 2nd 4-score cot 2.6,GPT4 2nd 4-score cot 3.0
0,a1,,0.663906,0.578996,0.565228,0.57778,0.577619
1,a2,0.663906,,0.679524,0.639148,0.539401,0.558176
2,a3,0.578996,0.679524,,0.636998,0.533396,0.591144
0,mean,0.621451,0.671715,0.62926,0.613791,0.550192,0.575646
0,Difference from mean human annotators,-0.019358,0.030906,-0.011548,-0.027017,-0.090616,-0.065162


# Check agreement against a single score

In [None]:
def correlations_240_passages_3(which_score, only_4_scores=False):
    """Return a one-row table of Cohen's kappa between each rater and ``common_score_df``.

    Every value compares the ``which_score`` column of one frame with the same
    column of ``common_score_df``. The single row is labelled 'single'.

    Parameters
    ----------
    which_score : str
        Name of the score column read from every frame involved.
    only_4_scores : bool, default False
        When True the two 'GPT4 3-score' columns are omitted and their frames
        are not accessed.

    Returns
    -------
    pandas.DataFrame
        Columns: 'comparisson', 'a1', 'a2', 'a3', the GPT-4 4-score runs and,
        unless ``only_4_scores`` is set, the GPT-4 3-score runs.
    """
    reference = common_score_df[which_score]

    table = {'comparisson': ['single']}

    # Annotators are passed as the first argument, the shared score as the second.
    for label, annotator_df in (('a1', a1_df), ('a2', a2_df), ('a3', a3_df)):
        table[label] = [cohen_kappa_score(annotator_df[which_score], reference)]

    # For the GPT-4 runs the argument order is reversed: shared score first.
    gpt_runs = [('GPT4 4-score', gpt4_df),
                ('GPT4 4-score cot 2.6', gpt4_cot_2_6_df)]
    if not only_4_scores:
        gpt_runs.append(('GPT4 3-score 2.7', gpt4_03_score_2_7_df))
        gpt_runs.append(('GPT4 3-score 2.9', gpt4_03_score_2_9_df))

    for label, gpt_frame in gpt_runs:
        table[label] = [cohen_kappa_score(reference, gpt_frame[which_score])]

    return pd.DataFrame(table)

In [None]:
def correlations_240_passages_4(which_score, only_4_scores=False):
    """Pearson correlation of every rater against the common score, as a one-row table.

    Uses the notebook-level frames ``a1_df``, ``a2_df``, ``a3_df``,
    ``common_score_df``, ``gpt4_df`` and ``gpt4_cot_2_6_df`` (plus
    ``gpt4_03_score_2_7_df`` and ``gpt4_03_score_2_9_df`` unless
    ``only_4_scores`` is set). Only the correlation coefficient (element 0 of
    the ``pearsonr`` result) is kept.

    Parameters
    ----------
    which_score : str
        Name of the score column to compare in every frame.
    only_4_scores : bool, default False
        When True, the two "GPT4 3-score" columns are left out.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``'single'`` in the ``'comparisson'`` column,
        followed by one correlation column per rater.
    """
    target = common_score_df[which_score]

    annotators = {'a1': a1_df, 'a2': a2_df, 'a3': a3_df}

    models = {'GPT4 4-score': gpt4_df, 'GPT4 4-score cot 2.6': gpt4_cot_2_6_df}
    if not only_4_scores:
        models['GPT4 3-score 2.7'] = gpt4_03_score_2_7_df
        models['GPT4 3-score 2.9'] = gpt4_03_score_2_9_df

    columns = {'comparisson': ['single']}
    # Annotator scores go in as the first pearsonr argument ...
    columns.update({
        name: [pearsonr(frame[which_score], target)[0]]
        for name, frame in annotators.items()
    })
    # ... while for the model runs the common score comes first.
    columns.update({
        name: [pearsonr(target, frame[which_score])[0]]
        for name, frame in models.items()
    })

    return pd.DataFrame(columns)

### Cohen's Kappa

In [None]:
# Kappa against the common score on the 'score' column; only_4_scores=True
# leaves out the two "GPT4 3-score" columns.
correlations_240_passages_3('score', only_4_scores=True)

In [None]:
# Same kappa comparison on the '03_score' column, now including the
# "GPT4 3-score" 2.7 and 2.9 runs (only_4_scores defaults to False).
correlations_240_passages_3('03_score')

In [None]:
# Same kappa comparison on the '02_score' column (presumably the 2-level
# mapping of the scores; confirm where the column is created).
correlations_240_passages_3('02_score')

### Pearson

In [None]:
# Pearson correlation against the common score on the 'score' column;
# only_4_scores=True leaves out the two "GPT4 3-score" columns.
correlations_240_passages_4('score', only_4_scores=True)

In [None]:
# Same Pearson comparison on the '03_score' column, now including the
# "GPT4 3-score" 2.7 and 2.9 runs (only_4_scores defaults to False).
correlations_240_passages_4('03_score')

In [None]:
# Same Pearson comparison on the '02_score' column (presumably the 2-level
# mapping of the scores; confirm where the column is created).
correlations_240_passages_4('02_score')

# Check the agreement per question

In [None]:
def correlation_per_question(which_score):
    """Summarise per-question Cohen's kappa for human pairs and for each GPT-4 run.

    For every group of rater pairs (human vs. human, and each GPT-4 run vs. the
    three human annotators) the per-question kappas returned by
    ``check_agreement_per_questions`` are placed side by side, and a per-question
    mean and standard deviation across the pairs is added. The mean of those
    per-question means is then reported per group, together with its difference
    from the human-vs-human value. The summary table is also shown via ``display``.

    The "GPT-4 2.7" and "GPT-4 2.9" groups (``gpt4_03_score_2_7_df`` /
    ``gpt4_03_score_2_9_df``) are only evaluated when ``which_score`` is not
    ``'score'``.

    Relies on the notebook-level frames ``a1_df``, ``a2_df``, ``a3_df``,
    ``gpt4_df``, ``gpt4_cot_2_6_df`` and the two 3-score frames named above.

    Parameters
    ----------
    which_score : str
        Name of the score column compared in every pair.

    Returns
    -------
    tuple
        ``(correlations_df, all_results)`` where ``correlations_df`` has the
        columns ``annotator``, ``cohen_kappa`` and ``difference`` (one row per
        group), and ``all_results`` is a list with one per-question DataFrame
        per group, in the same order as the rows of ``correlations_df``. Each
        per-question frame has ``query``, one kappa column per pair,
        ``cohen_kappa_mean`` and ``cohen_kappa_std``.
    """

    def _combine(pairs):
        """Put the per-question kappas of ``pairs`` side by side and add mean/std."""
        per_pair = [
            check_agreement_per_questions(left_df, right_df, suffix=suffix, score=which_score)[1]
            for left_df, right_df, suffix in pairs
        ]
        # The first frame contributes 'query' + its kappa; the others only their
        # kappa column. Rows are matched on the default integer index, so this
        # assumes every pair lists the questions in the same order.
        combined = pd.concat(
            [per_pair[0]] + [frame.iloc[:, -1] for frame in per_pair[1:]],
            axis=1,
        )
        # Take the kappa columns *before* adding the aggregates: previously the
        # std was computed after 'cohen_kappa_mean' had been appended, so the
        # mean itself was counted as an extra observation and the std was
        # biased low.
        kappas = combined.iloc[:, 1:]
        return combined.assign(
            cohen_kappa_mean=kappas.mean(axis=1),
            cohen_kappa_std=kappas.std(axis=1),
        )

    groups = [
        ('human', [
            (a1_df, a2_df, '_a1_a2'),
            (a1_df, a3_df, '_a1_a3'),
            (a2_df, a3_df, '_a2_a3'),
        ]),
    ]

    model_runs = [
        ('GPT-4', gpt4_df, 'gpt4'),
        ('GPT-4 CoT 2.6', gpt4_cot_2_6_df, 'gpt4_cot_2.6'),
    ]
    if which_score != 'score':
        model_runs += [
            ('GPT-4 2.7', gpt4_03_score_2_7_df, 'gpt4_2.7'),
            ('GPT-4 2.9', gpt4_03_score_2_9_df, 'gpt4_2.9'),
        ]

    # Every model run is compared against each of the three human annotators.
    for label, model_df, tag in model_runs:
        groups.append((label, [
            (a1_df, model_df, '_a1_' + tag),
            (a2_df, model_df, '_a2_' + tag),
            (a3_df, model_df, '_a3_' + tag),
        ]))

    all_results = []
    correlations = []
    for label, pairs in groups:
        combined = _combine(pairs)
        all_results.append(combined)
        correlations.append({'annotator': label,
                             'cohen_kappa': combined['cohen_kappa_mean'].mean()})

    correlations_df = pd.DataFrame(correlations)

    # The human-vs-human group is always first and serves as the baseline.
    human_baseline = all_results[0]['cohen_kappa_mean'].mean()
    correlations_df['difference'] = correlations_df['cohen_kappa'] - human_baseline

    display(correlations_df)

    return correlations_df, all_results

### Check for 4-score evaluations

In [None]:
# The function already displays the summary table; its returned
# (summary, per-group frames) tuple is discarded here.
_ = correlation_per_question('score')

### Check for 3-score and 2-score evaluations

In [None]:
# NOTE: correlation_per_question returns a (summary_df, per_group_frames) tuple,
# so score_3_df holds that tuple rather than a single DataFrame.
score_3_df = correlation_per_question('03_score')

In [None]:
# NOTE: correlation_per_question returns a (summary_df, per_group_frames) tuple,
# so score_2_df holds that tuple rather than a single DataFrame.
score_2_df = correlation_per_question('02_score')

In [None]:
# NOTE: correlation_per_question returns a (summary_df, per_group_frames) tuple,
# so score_2a_df holds that tuple rather than a single DataFrame.
score_2a_df = correlation_per_question('02_score_alt')