In [1]:
import pandas as pd
import numpy as np
import glob

import os

from sklearn.metrics import cohen_kappa_score, confusion_matrix

from scipy.stats import spearmanr, kendalltau, pearsonr

import re

import matplotlib.pyplot as plt

In [2]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 200)
pd.options.display.float_format = '{:,.6f}'.format

In [3]:
ANNOTATIONS_FOLDER="../trabalho_final/anotações_humanas"

### Mapping 4-score to 3-score evaluations to match GPT-4 3-score evaluations

In [4]:
score_mapper = {
    0:0,
    1:0,
    2:1,
    3:2
}

### Original 4-score to 3-score evaluations mapping

This mapping changes the original 4-score 1 evaluation meaning, indicating it **partially answers to the question**, while the original (TREC-DL 21) meaning is the passage **does not answer to the question**.

In [5]:
score_mapper_original = {
    0:0,
    1:1,
    2:1,
    3:2
}

### Mapping 4-score to 2-score evenly

In [6]:
score_mapper_for_2 = {
    0:0,
    1:0,
    2:1,
    3:1    
}

### Alternative version for mapping 4-score to 2-score

In [7]:
score_mapper_for_2_alt = {
    0:0,
    1:1,
    2:1,
    3:1    
}

### Mapping 3-score to 2-score

In [8]:
score_mapper_3_for_2 = {
    0:0,
    1:1,
    2:1    
}

In [9]:
def check_agreement_per_questions(evaluation_a, evaluation_b, suffix=None, score='score'):
    
    merged_df = evaluation_a.merge(evaluation_b, left_on='doccano_id', right_on='doccano_id')[['query_x', 'passage_x', 'passage_id_x', score + '_x', score + '_y']]
    
    correlations = []
    
    for group_name, group_df in merged_df.groupby('query_x', sort=False):
        correlations.append({'query': group_name,
                             'cohen_kappa{}'.format(suffix): cohen_kappa_score(group_df[score + '_x'], group_df[score + '_y'])})
        
    return merged_df, pd.DataFrame(correlations).fillna(1.0)

In [10]:
def plot_correlation_data_consolidated(correlation_df, columns_to_plot, plot_title):
    
    fig = plt.figure(figsize=[10, 10])

    fig.suptitle(plot_title, y=0.91)

    plt.grid()

    all_boxplots = plt.boxplot(correlation_df[columns_to_plot], patch_artist=True, vert=False, meanline=True, showmeans=True)

    colors = ['pink', 'lightblue', 'lightgreen']
    
    all_boxes = all_boxplots['boxes']
    
    print(all_boxes)
    
    for i, which_box in enumerate(all_boxes):
        which_box.set_facecolor(colors[i // 3])
    
    
    plt.yticks(range(1, len(columns_to_plot) + 1), columns_to_plot)
    plt.xticks(np.arange(-0.3, 1.0, 0.1))
    
    plt.legend(handles=all_boxes[::3], labels=["Humanos x Humanos", "Humanos x GPT3.5", "Humanos x GPT4"], bbox_to_anchor=(1.0, 1.0))

    plt.show()

In [11]:
def plot_correlation_data(correlation_df, columns_to_plot, plot_title):
    
    fig = plt.figure(figsize=[15, 40])

    fig.suptitle(plot_title, y=0.91)

    plt.grid()

    all_boxplots = plt.boxplot(correlation_df[columns_to_plot].to_numpy().transpose(), patch_artist=True, vert=False, meanline=True, showmeans=True)

    colors = ['pink', 'lightblue', 'lightgreen', 'lightyellow'][::-1]
    
    all_boxes = all_boxplots['boxes']
    
#     print(all_boxes)
    
    for i, which_box in enumerate(all_boxes):
        which_box.set_facecolor(colors[i % 4])

        
    
    plt.yticks(range(4, correlation_df.shape[0] + 1, 4), correlation_df.iloc[::4]['query'])
    plt.xticks(np.arange(-0.3, 1.0, 0.1))

    plt.show()

## Read human annotators evaluations and map the original 4-score values

In [12]:
a1_df = pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, "admin_240_annotations_with_questions.tsv"), sep='\t')
a2_df = pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, "Eduardo_240_annotations_with_questions.tsv"), sep='\t')
a3_df = pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, "Leodecio_240_annotations_with_questions.tsv"), sep='\t')

In [13]:
a1_df['score'].unique()

array([3, 0, 2, 1])

### Compute single score based on the human annotators

In [14]:
all_scores_df = pd.DataFrame()

In [15]:
all_scores_df['a1'] = a1_df['score']
all_scores_df['a2'] = a2_df['score']
all_scores_df['a3'] = a3_df['score']

In [16]:
all_scores_df

Unnamed: 0,a1,a2,a3
0,3,3,3
1,0,0,0
2,2,1,1
3,2,0,0
4,3,2,1
...,...,...,...
235,2,2,1
236,3,0,1
237,1,3,3
238,1,0,0


In [17]:
single_score = []

for i, row in all_scores_df.iterrows():
    if (row['a1'] == row['a2']) or (row['a1'] == row['a3']):
        single_score.append(row['a1'])

    elif row['a2'] == row['a3']:
        single_score.append(row['a2'])
        
    else:
        single_score.append(np.random.choice([row['a1'], row['a2'], row['a3']], 1)[0])
            
single_score = np.array(single_score)

In [18]:
single_score

array([3, 0, 1, 0, 3, 0, 0, 0, 1, 0, 3, 3, 2, 2, 1, 1, 1, 0, 1, 0, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 2, 0, 0, 2, 0, 1, 2, 2, 0, 1, 0, 3, 2, 3, 3,
       0, 2, 0, 0, 0, 0, 3, 2, 3, 1, 2, 3, 1, 1, 2, 2, 2, 1, 3, 3, 3, 3,
       2, 3, 1, 3, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 0, 0, 2, 2, 2, 0, 3, 1,
       1, 1, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 0, 3, 3, 2, 3, 2,
       0, 2, 1, 1, 2, 0, 0, 2, 1, 2, 3, 3, 3, 3, 3, 2, 3, 2, 1, 2, 1, 2,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, 0, 0, 1, 1, 3, 3, 1, 1,
       2, 0, 2, 0, 1, 0, 0, 1, 2, 2, 2, 1, 0, 2, 2, 2, 3, 2, 2, 2, 2, 1,
       1, 1, 0, 2, 3, 0, 0, 0, 0, 0, 0, 1, 0, 1, 2, 2, 2, 2, 2, 1, 0, 1,
       2, 1, 0, 3, 0, 1, 1, 0, 0, 0, 1, 0, 2, 3, 3, 2, 3, 2, 1, 3, 1, 3,
       3, 3, 1, 2, 2, 3, 3, 0, 0, 0, 1, 0, 2, 0, 0, 2, 1, 3, 0, 0])

In [46]:
single_score_df = pd.DataFrame(single_score, columns=['score'])

In [47]:
single_score_df

Unnamed: 0,score
0,3
1,0
2,1
3,0
4,3
...,...
235,2
236,1
237,3
238,0


### Convert the 4-score to other ranges

In [48]:
a1_df['03_score'] = a1_df['score'].map(score_mapper)
a2_df['03_score'] = a2_df['score'].map(score_mapper)
a3_df['03_score'] = a3_df['score'].map(score_mapper)

single_score_df['03_score'] = common_score_df['score'].map(score_mapper)

In [49]:
a1_df['02_score'] = a1_df['score'].map(score_mapper_for_2)
a2_df['02_score'] = a2_df['score'].map(score_mapper_for_2)
a3_df['02_score'] = a3_df['score'].map(score_mapper_for_2)

single_score_df['02_score'] = common_score_df['score'].map(score_mapper_for_2)

In [50]:
a1_df['02_score_alt'] = a1_df['score'].map(score_mapper_for_2_alt)
a2_df['02_score_alt'] = a2_df['score'].map(score_mapper_for_2_alt)
a3_df['02_score_alt'] = a3_df['score'].map(score_mapper_for_2_alt)

single_score_df['02_score_alt'] = common_score_df['score'].map(score_mapper_for_2_alt)

In [35]:
def cohen_kappa_wrapper(first_series, second_series):
    return (cohen_kappa_score(first_series, second_series), )

## Functions for correlation for 1st, 2nd and all questions

In [24]:
def correlations_1st_set(evaluation_df, which_score, correlation_fn):

    general_agreement_df = pd.DataFrame()

    general_agreement_df['comparisson'] = ['a1', 'a2', 'a3']

    general_agreement_df['a1'] = [np.nan, 
                                  globals()[correlation_fn](a1_df.iloc[0:120][which_score], a2_df.iloc[0:120][which_score])[0], 
                                  globals()[correlation_fn](a1_df.iloc[0:120][which_score], a3_df.iloc[0:120][which_score])[0]]

    general_agreement_df['a2'] = [globals()[correlation_fn](a1_df.iloc[0:120][which_score], a2_df.iloc[0:120][which_score])[0], 
                                  np.nan, 
                                  globals()[correlation_fn](a2_df.iloc[0:120][which_score], a3_df.iloc[0:120][which_score])[0]]

    general_agreement_df['a3'] = [globals()[correlation_fn](a1_df.iloc[0:120][which_score], a3_df.iloc[0:120][which_score])[0], 
                                  globals()[correlation_fn](a2_df.iloc[0:120][which_score], a3_df.iloc[0:120][which_score])[0], 
                                  np.nan]

    general_agreement_df['GPT4 1st 4-score'] = [globals()[correlation_fn](a1_df.iloc[0:120][which_score], evaluation_df[which_score])[0], 
                                                globals()[correlation_fn](a2_df.iloc[0:120][which_score], evaluation_df[which_score])[0], 
                                                globals()[correlation_fn](a3_df.iloc[0:120][which_score], evaluation_df[which_score])[0]]

    general_agreement_df = pd.concat([general_agreement_df, 
                                      pd.DataFrame(data=[['mean'] + general_agreement_df.iloc[:, 1:].mean().to_list()], columns=general_agreement_df.columns)])

    general_agreement_df = pd.concat([general_agreement_df, 
                                      pd.DataFrame(data=[['std'] + general_agreement_df.iloc[:, 1:].std().to_list()], columns=general_agreement_df.columns)])    
    
    human_mean = general_agreement_df.iloc[-1,1:4].mean()

    general_agreement_df = pd.concat([general_agreement_df,
                                      pd.DataFrame(data=[['Difference from mean human annotators'] + (general_agreement_df.iloc[-1, 1:] - human_mean).to_list()], columns=general_agreement_df.columns)])

    return general_agreement_df

In [25]:
def correlations_2nd_set(evaluation_df, which_score, correlation_fn):
    
    general_agreement_df = pd.DataFrame()

    general_agreement_df['comparisson'] = ['a1', 'a2', 'a3']

    general_agreement_df['a1'] = [np.nan, 
                                  globals()[correlation_fn](a1_df.iloc[120:240][which_score], a2_df.iloc[120:240][which_score])[0], 
                                  globals()[correlation_fn](a1_df.iloc[120:240][which_score], a3_df.iloc[120:240][which_score])[0]]

    general_agreement_df['a2'] = [globals()[correlation_fn](a1_df.iloc[120:240][which_score], a2_df.iloc[120:240][which_score])[0], 
                                  np.nan, 
                                  globals()[correlation_fn](a2_df.iloc[120:240][which_score], a3_df.iloc[120:240][which_score])[0]]

    general_agreement_df['a3'] = [globals()[correlation_fn](a1_df.iloc[120:240][which_score], a3_df.iloc[120:240][which_score])[0], 
                                  globals()[correlation_fn](a2_df.iloc[120:240][which_score], a3_df.iloc[120:240][which_score])[0], 
                                  np.nan]

    general_agreement_df['GPT4 2nd 4-score'] = [globals()[correlation_fn](a1_df.iloc[120:240][which_score], evaluation_df[which_score])[0], 
                                                globals()[correlation_fn](a2_df.iloc[120:240][which_score], evaluation_df[which_score])[0], 
                                                globals()[correlation_fn](a3_df.iloc[120:240][which_score], evaluation_df[which_score])[0]]

    general_agreement_df = pd.concat([general_agreement_df, 
                                      pd.DataFrame(data=[['mean'] + general_agreement_df.iloc[:, 1:].mean().to_list()], columns=general_agreement_df.columns)])
    
    general_agreement_df = pd.concat([general_agreement_df, 
                                      pd.DataFrame(data=[['std'] + general_agreement_df.iloc[:, 1:].std().to_list()], columns=general_agreement_df.columns)])
    
    human_mean = general_agreement_df.iloc[-1,1:4].mean()

    general_agreement_df = pd.concat([general_agreement_df,
                                      pd.DataFrame(data=[['Difference from mean human annotators'] + (general_agreement_df.iloc[-1, 1:] - human_mean).to_list()], columns=general_agreement_df.columns)])

    return general_agreement_df

In [76]:
def correlations_all(evaluation_df, which_score, correlation_fn):
    
    general_agreement_df = pd.DataFrame()

    general_agreement_df['comparisson'] = ['a1', 'a2', 'a3']

    general_agreement_df['a1'] = [np.nan, 
                                  globals()[correlation_fn](a1_df[which_score], a2_df[which_score])[0], 
                                  globals()[correlation_fn](a1_df[which_score], a3_df[which_score])[0]]

    general_agreement_df['a2'] = [globals()[correlation_fn](a1_df[which_score], a2_df[which_score])[0], 
                                  np.nan, 
                                  globals()[correlation_fn](a2_df[which_score], a3_df[which_score])[0]]

    general_agreement_df['a3'] = [globals()[correlation_fn](a1_df[which_score], a3_df[which_score])[0], 
                                  globals()[correlation_fn](a2_df[which_score], a3_df[which_score])[0], 
                                  np.nan]

    general_agreement_df['GPT4 4-score'] = [globals()[correlation_fn](a1_df[which_score], evaluation_df[which_score])[0], 
                                            globals()[correlation_fn](a2_df[which_score], evaluation_df[which_score])[0], 
                                            globals()[correlation_fn](a3_df[which_score], evaluation_df[which_score])[0]]

    general_agreement_df = pd.concat([general_agreement_df, 
                                      pd.DataFrame(data=[['mean'] + general_agreement_df.iloc[:, 1:].mean().to_list()], columns=general_agreement_df.columns)])

    general_agreement_df = pd.concat([general_agreement_df, 
                                      pd.DataFrame(data=[['std'] + general_agreement_df.iloc[:, 1:].std().to_list()], columns=general_agreement_df.columns)])
    
    
    human_mean = general_agreement_df.iloc[-1,1:4].mean()

    general_agreement_df = pd.concat([general_agreement_df,
                                      pd.DataFrame(data=[['Difference from mean human annotators'] + (general_agreement_df.iloc[-1, 1:] - human_mean).to_list()], columns=general_agreement_df.columns)])
    
    return general_agreement_df

### Functions to compute correlation against single score computed accross human annoators

In [83]:
def correlations_single_score_1st_set(evaluation_df, which_score, correlation_fn):

    general_agreement_df = pd.DataFrame()

    general_agreement_df['comparisson'] = ['single']

    general_agreement_df['a1'] = [globals()[correlation_fn](a1_df.iloc[0:120][which_score], single_score_df.iloc[0:120][which_score])[0]]

    general_agreement_df['a2'] = [globals()[correlation_fn](a2_df.iloc[0:120][which_score], single_score_df.iloc[0:120][which_score])[0]]

    general_agreement_df['a3'] = [globals()[correlation_fn](a3_df.iloc[0:120][which_score], single_score_df.iloc[0:120][which_score])[0]]
    
    human_mean = general_agreement_df.iloc[-1,1:4].mean()
    
    general_agreement_df['human mean'] = human_mean
    
    general_agreement_df['human std'] = general_agreement_df.iloc[-1,1:4].std()

    general_agreement_df['GPT4 1st 4-score'] = [globals()[correlation_fn](single_score_df.iloc[0:120][which_score], evaluation_df[which_score])[0]]

    difference_from_mean = ['Difference from mean human annotators'] + (general_agreement_df.iloc[-1, 1:] - human_mean).to_list()
    difference_from_mean[-2] = np.nan
    
    general_agreement_df = pd.concat([general_agreement_df,
                                      pd.DataFrame(data=[difference_from_mean], columns=general_agreement_df.columns)])

    return general_agreement_df

In [85]:
def correlations_single_score_2nd_set(evaluation_df, which_score, correlation_fn):

    general_agreement_df = pd.DataFrame()

    general_agreement_df['comparisson'] = ['single']

    general_agreement_df['a1'] = [globals()[correlation_fn](a1_df.iloc[120:240][which_score], single_score_df.iloc[120:240][which_score])[0]]

    general_agreement_df['a2'] = [globals()[correlation_fn](a2_df.iloc[120:240][which_score], single_score_df.iloc[120:240][which_score])[0]]

    general_agreement_df['a3'] = [globals()[correlation_fn](a3_df.iloc[120:240][which_score], single_score_df.iloc[120:240][which_score])[0]]
    
    human_mean = general_agreement_df.iloc[-1,1:4].mean()
    
    general_agreement_df['human mean'] = human_mean
    
    general_agreement_df['human std'] = general_agreement_df.iloc[-1,1:4].std()

    general_agreement_df['GPT4 1st 4-score'] = [globals()[correlation_fn](single_score_df.iloc[120:240][which_score], evaluation_df[which_score])[0]]

    difference_from_mean = ['Difference from mean human annotators'] + (general_agreement_df.iloc[-1, 1:] - human_mean).to_list()
    difference_from_mean[-2] = np.nan
    
    general_agreement_df = pd.concat([general_agreement_df,
                                      pd.DataFrame(data=[difference_from_mean], columns=general_agreement_df.columns)])

    return general_agreement_df

In [87]:
def correlations_single_score_all(evaluation_df, which_score, correlation_fn):

    general_agreement_df = pd.DataFrame()

    general_agreement_df['comparisson'] = ['single']

    general_agreement_df['a1'] = [globals()[correlation_fn](a1_df[which_score], single_score_df[which_score])[0]]

    general_agreement_df['a2'] = [globals()[correlation_fn](a2_df[which_score], single_score_df[which_score])[0]]

    general_agreement_df['a3'] = [globals()[correlation_fn](a3_df[which_score], single_score_df[which_score])[0]]
    
    human_mean = general_agreement_df.iloc[-1,1:4].mean()
    
    general_agreement_df['human mean'] = human_mean
    
    general_agreement_df['human std'] = general_agreement_df.iloc[-1,1:4].std()

    general_agreement_df['GPT4 1st 4-score'] = [globals()[correlation_fn](single_score_df[which_score], evaluation_df[which_score])[0]]

    difference_from_mean = ['Difference from mean human annotators'] + (general_agreement_df.iloc[-1, 1:] - human_mean).to_list()
    difference_from_mean[-2] = np.nan
    
    general_agreement_df = pd.concat([general_agreement_df,
                                      pd.DataFrame(data=[difference_from_mean], columns=general_agreement_df.columns)])

    return general_agreement_df

## Check the regenerated evaluations, comparing against the original GPT-4 4-score evaluation

### Evaluation using the new GPT-4 turbo ― Spearman

In [27]:
gpt4_1106_1st_df = pd.read_csv(os.path.join("tests", "test_000_119_gpt-4-1106-preview_20231108_fixed_2_scores.tsv"), sep='\t')

In [28]:
gpt4_1106_2nd_df = pd.read_csv(os.path.join("tests", "test_120_239_gpt-4-1106-preview_20231115_fixed_2_scores.tsv"), sep='\t')

In [29]:
gpt4_1106_df = pd.concat([gpt4_1106_1st_df, gpt4_1106_2nd_df], axis=0).reset_index(drop=True)

In [30]:
correlations_1st_set(gpt4_1106_1st_df, 'score', 'spearmanr')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score
0,a1,,0.706979,0.81708,0.655061
1,a2,0.706979,,0.752948,0.657686
2,a3,0.81708,0.752948,,0.656545
0,mean,0.76203,0.729963,0.785014,0.656431
0,std,0.055051,0.022985,0.032066,0.001075
0,Difference from mean human annotators,0.01835,-0.013716,-0.004634,-0.035626


In [31]:
correlations_2nd_set(gpt4_1106_2nd_df, 'score', 'spearmanr')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score
0,a1,,0.663906,0.578996,0.576091
1,a2,0.663906,,0.679524,0.607351
2,a3,0.578996,0.679524,,0.60219
0,mean,0.621451,0.671715,0.62926,0.595211
0,std,0.042455,0.007809,0.050264,0.013683
0,Difference from mean human annotators,0.008945,-0.0257,0.016755,-0.019827


In [32]:
correlations_all(gpt4_1106_df, 'score', 'spearmanr')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score
0,a1,,0.693098,0.692357,0.607279
1,a2,0.693098,,0.698475,0.617434
2,a3,0.692357,0.698475,,0.629563
0,mean,0.692728,0.695787,0.695416,0.618092
0,std,0.00037,0.002689,0.003059,0.009109
0,Difference from mean human annotators,-0.001669,0.000649,0.00102,0.00707


### Evaluation using the new GPT-4 turbo ― Cohen Kappa

In [36]:
correlations_1st_set(gpt4_1106_1st_df, 'score', 'cohen_kappa_wrapper')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score
0,a1,,0.489892,0.575498,0.404606
1,a2,0.489892,,0.500324,0.363543
2,a3,0.575498,0.500324,,0.445074
0,mean,0.532695,0.495108,0.537911,0.404408
0,std,0.042803,0.005216,0.037587,0.033285
0,Difference from mean human annotators,0.014268,-0.023319,0.009052,0.00475


In [37]:
correlations_2nd_set(gpt4_1106_2nd_df, 'score', 'cohen_kappa_wrapper')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score
0,a1,,0.369369,0.286629,0.243283
1,a2,0.369369,,0.322651,0.157364
2,a3,0.286629,0.322651,,0.254939
0,mean,0.327999,0.34601,0.30464,0.218529
0,std,0.04137,0.023359,0.018011,0.043511
0,Difference from mean human annotators,0.01379,-0.004221,-0.009569,0.015931


In [38]:
correlations_all(gpt4_1106_df, 'score', 'cohen_kappa_wrapper')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score
0,a1,,0.436881,0.429402,0.323428
1,a2,0.436881,,0.410455,0.259276
2,a3,0.429402,0.410455,,0.349839
0,mean,0.433142,0.423668,0.419929,0.310848
0,std,0.00374,0.013213,0.009474,0.038027
0,Difference from mean human annotators,-0.005069,0.004404,0.000665,0.029218


### Evaluation using the new GPT-4 turbo ― Pearson

In [40]:
correlations_1st_set(gpt4_1106_1st_df, 'score', 'pearsonr')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score
0,a1,,0.719862,0.82506,0.650404
1,a2,0.719862,,0.768756,0.657615
2,a3,0.82506,0.768756,,0.657503
0,mean,0.772461,0.744309,0.796908,0.655174
0,std,0.052599,0.024447,0.028152,0.003373
0,Difference from mean human annotators,0.017533,-0.010619,-0.006914,-0.031693


In [41]:
correlations_2nd_set(gpt4_1106_2nd_df, 'score', 'pearsonr')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score
0,a1,,0.657464,0.576163,0.553798
1,a2,0.657464,,0.687592,0.595538
2,a3,0.576163,0.687592,,0.607573
0,mean,0.616813,0.672528,0.631878,0.585636
0,std,0.04065,0.015064,0.055715,0.023043
0,Difference from mean human annotators,0.003507,-0.022079,0.018572,-0.0141


In [42]:
correlations_all(gpt4_1106_df, 'score', 'pearsonr')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score
0,a1,,0.698174,0.697289,0.598166
1,a2,0.698174,,0.713168,0.614565
2,a3,0.697289,0.713168,,0.632562
0,mean,0.697732,0.705671,0.705229,0.615098
0,std,0.000442,0.007497,0.007939,0.014047
0,Difference from mean human annotators,-0.004851,0.002204,0.002646,0.008754


### Evaluation using the new GPT-4 turbo ― Kendall Tau

In [43]:
correlations_1st_set(gpt4_1106_1st_df, 'score', 'kendalltau')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score
0,a1,,0.643791,0.751429,0.582366
1,a2,0.643791,,0.691653,0.584938
2,a3,0.751429,0.691653,,0.58167
0,mean,0.69761,0.667722,0.721541,0.582991
0,std,0.053819,0.023931,0.029888,0.001405
0,Difference from mean human annotators,0.01794,-0.011948,-0.005991,-0.034474


In [44]:
correlations_2nd_set(gpt4_1106_2nd_df, 'score', 'kendalltau')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score
0,a1,,0.590647,0.510594,0.491843
1,a2,0.590647,,0.600221,0.535608
2,a3,0.510594,0.600221,,0.510917
0,mean,0.55062,0.595434,0.555407,0.512789
0,std,0.040027,0.004787,0.044814,0.017916
0,Difference from mean human annotators,0.010151,-0.025089,0.014938,-0.01196


In [45]:
correlations_all(gpt4_1106_df, 'score', 'kendalltau')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score
0,a1,,0.623709,0.622409,0.529502
1,a2,0.623709,,0.626314,0.539189
2,a3,0.622409,0.626314,,0.546029
0,mean,0.623059,0.625011,0.624362,0.53824
0,std,0.00065,0.001303,0.001953,0.00678
0,Difference from mean human annotators,-0.000652,1e-06,0.000651,0.005479


# Check agreement against single score

### Pearson

In [84]:
correlations_single_score_1st_set(gpt4_1106_1st_df, 'score', 'spearmanr')

Unnamed: 0,comparisson,a1,a2,a3,human mean,human std,GPT4 1st 4-score
0,single,0.90708,0.815884,0.893411,0.872125,0.049183,0.650659
0,Difference from mean human annotators,0.034955,-0.056241,0.021286,0.0,,-0.221466


In [86]:
correlations_single_score_2nd_set(gpt4_1106_2nd_df, 'score', 'spearmanr')

Unnamed: 0,comparisson,a1,a2,a3,human mean,human std,GPT4 1st 4-score
0,single,0.795646,0.832723,0.755262,0.794544,0.038742,0.618513
0,Difference from mean human annotators,0.001102,0.038179,-0.039281,0.0,,-0.176031


In [88]:
correlations_single_score_all(gpt4_1106_df, 'score', 'spearmanr')

Unnamed: 0,comparisson,a1,a2,a3,human mean,human std,GPT4 1st 4-score
0,single,0.855277,0.825678,0.817974,0.832976,0.019694,0.626833
0,Difference from mean human annotators,0.022301,-0.007299,-0.015003,0.0,,-0.206143


### Cohen Kappa

In [90]:
correlations_single_score_1st_set(gpt4_1106_1st_df, 'score', 'cohen_kappa_wrapper')

Unnamed: 0,comparisson,a1,a2,a3,human mean,human std,GPT4 1st 4-score
0,single,0.809203,0.680821,0.754304,0.748109,0.064415,0.461431
0,Difference from mean human annotators,0.061094,-0.067289,0.006195,0.0,,-0.286679


In [91]:
correlations_single_score_2nd_set(gpt4_1106_2nd_df, 'score', 'cohen_kappa_wrapper')

Unnamed: 0,comparisson,a1,a2,a3,human mean,human std,GPT4 1st 4-score
0,single,0.629734,0.696117,0.55931,0.628387,0.068414,0.210021
0,Difference from mean human annotators,0.001347,0.06773,-0.069077,0.0,,-0.418366


In [92]:
correlations_single_score_all(gpt4_1106_df, 'score', 'cohen_kappa_wrapper')

Unnamed: 0,comparisson,a1,a2,a3,human mean,human std,GPT4 1st 4-score
0,single,0.722106,0.692092,0.655978,0.690059,0.033111,0.334673
0,Difference from mean human annotators,0.032048,0.002034,-0.034081,0.0,,-0.355386


# Check the agreement per question

In [None]:
def correlation_per_question(which_score):

    all_results = []
    
    res_a1_a2 = check_agreement_per_questions(a1_df, a2_df, suffix="_a1_a2", score=which_score)
    res_a1_a3 = check_agreement_per_questions(a1_df, a3_df, suffix="_a1_a3", score=which_score)
    res_a2_a3 = check_agreement_per_questions(a2_df, a3_df, suffix="_a2_a3", score=which_score)

    res_a1_gpt4 = check_agreement_per_questions(a1_df, gpt4_df, "_a1_gpt4", score=which_score)
    res_a2_gpt4 = check_agreement_per_questions(a2_df, gpt4_df, "_a2_gpt4", score=which_score)
    res_a3_gpt4 = check_agreement_per_questions(a3_df, gpt4_df, "_a3_gpt4", score=which_score)

    res_a1_gpt4_cot_2_6 = check_agreement_per_questions(a1_df, gpt4_cot_2_6_df, "_a1_gpt4_cot_2.6", score=which_score)
    res_a2_gpt4_cot_2_6 = check_agreement_per_questions(a2_df, gpt4_cot_2_6_df, "_a2_gpt4_cot_2.6", score=which_score)
    res_a3_gpt4_cot_2_6 = check_agreement_per_questions(a3_df, gpt4_cot_2_6_df, "_a3_gpt4_cot_2.6", score=which_score)

    if which_score != 'score':
        res_a1_gpt4_2_7 = check_agreement_per_questions(a1_df, gpt4_03_score_2_7_df, "_a1_gpt4_2.7", score=which_score)
        res_a2_gpt4_2_7 = check_agreement_per_questions(a2_df, gpt4_03_score_2_7_df, "_a2_gpt4_2.7", score=which_score)
        res_a3_gpt4_2_7 = check_agreement_per_questions(a3_df, gpt4_03_score_2_7_df, "_a3_gpt4_2.7", score=which_score)

        res_a1_gpt4_2_9 = check_agreement_per_questions(a1_df, gpt4_03_score_2_9_df, "_a1_gpt4_2.9", score=which_score)
        res_a2_gpt4_2_9 = check_agreement_per_questions(a2_df, gpt4_03_score_2_9_df, "_a2_gpt4_2.9", score=which_score)
        res_a3_gpt4_2_9 = check_agreement_per_questions(a3_df, gpt4_03_score_2_9_df, "_a3_gpt4_2.9", score=which_score)
    
    
    human_combined_res = pd.concat([res_a1_a2[1], res_a1_a3[1].iloc[:, -1], res_a2_a3[1].iloc[:, -1]], axis=1)

    human_combined_res['cohen_kappa_mean'] = human_combined_res.iloc[:, 1:].mean(axis=1)
    human_combined_res['cohen_kappa_std'] = human_combined_res.iloc[:, 1:].std(axis=1)
    
    all_results.append(human_combined_res)
    
    human_gpt4_combined_res = pd.concat([res_a1_gpt4[1], res_a2_gpt4[1].iloc[:, -1], res_a3_gpt4[1].iloc[:, -1]], axis=1)
    human_gpt4_combined_res['cohen_kappa_mean'] = human_gpt4_combined_res.iloc[:, 1:].mean(axis=1)
    human_gpt4_combined_res['cohen_kappa_std'] = human_gpt4_combined_res.iloc[:, 1:].std(axis=1)

    all_results.append(human_gpt4_combined_res)    
    
    human_gpt4_cot_2_6_combined_res = pd.concat([res_a1_gpt4_cot_2_6[1], res_a2_gpt4_cot_2_6[1].iloc[:, -1], res_a3_gpt4_cot_2_6[1].iloc[:, -1]], axis=1)
    human_gpt4_cot_2_6_combined_res['cohen_kappa_mean'] = human_gpt4_cot_2_6_combined_res.iloc[:, 1:].mean(axis=1)
    human_gpt4_cot_2_6_combined_res['cohen_kappa_std'] = human_gpt4_cot_2_6_combined_res.iloc[:, 1:].std(axis=1)

    all_results.append(human_gpt4_cot_2_6_combined_res)    

    if which_score != 'score':
        human_gpt4_2_7_combined_res = pd.concat([res_a1_gpt4_2_7[1], res_a2_gpt4_2_7[1].iloc[:, -1], res_a3_gpt4_2_7[1].iloc[:, -1]], axis=1)
        human_gpt4_2_7_combined_res['cohen_kappa_mean'] = human_gpt4_2_7_combined_res.iloc[:, 1:].mean(axis=1)
        human_gpt4_2_7_combined_res['cohen_kappa_std'] = human_gpt4_2_7_combined_res.iloc[:, 1:].std(axis=1)

        all_results.append(human_gpt4_2_7_combined_res)    
        
        human_gpt4_2_9_combined_res = pd.concat([res_a1_gpt4_2_9[1], res_a2_gpt4_2_9[1].iloc[:, -1], res_a3_gpt4_2_9[1].iloc[:, -1]], axis=1)
        human_gpt4_2_9_combined_res['cohen_kappa_mean'] = human_gpt4_2_9_combined_res.iloc[:, 1:].mean(axis=1)
        human_gpt4_2_9_combined_res['cohen_kappa_std'] = human_gpt4_2_9_combined_res.iloc[:, 1:].std(axis=1)
    
        all_results.append(human_gpt4_2_9_combined_res)    
    
    
    correlations = []
    correlations.append({'annotator': 'human',
                         'cohen_kappa': human_combined_res['cohen_kappa_mean'].mean()})
    
    correlations.append({'annotator': 'GPT-4',
                         'cohen_kappa': human_gpt4_combined_res['cohen_kappa_mean'].mean()})
    correlations.append({'annotator': 'GPT-4 CoT 2.6',
                         'cohen_kappa': human_gpt4_cot_2_6_combined_res['cohen_kappa_mean'].mean()})

    if which_score != 'score':
        correlations.append({'annotator': 'GPT-4 2.7',
                             'cohen_kappa': human_gpt4_2_7_combined_res['cohen_kappa_mean'].mean()})
        correlations.append({'annotator': 'GPT-4 2.9',
                             'cohen_kappa': human_gpt4_2_9_combined_res['cohen_kappa_mean'].mean()})
        
    
    
    correlations_df = pd.DataFrame(correlations)
    
    correlations_df['difference'] = correlations_df['cohen_kappa'] - human_combined_res['cohen_kappa_mean'].mean()
    
    display(correlations_df)
    
    return correlations_df, all_results

### Check for 4-score evaluations

In [None]:
_ = correlation_per_question('score')

### Check for 3-score evaluations

In [None]:
score_3_df = correlation_per_question('03_score')

In [None]:
score_2_df = correlation_per_question('02_score')

In [None]:
score_2a_df = correlation_per_question('02_score_alt')