In [1]:
import pandas as pd
import numpy as np
import glob

import os

from sklearn.metrics import cohen_kappa_score, confusion_matrix

from scipy.stats import spearmanr, kendalltau, pearsonr

import re

import matplotlib.pyplot as plt

In [2]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 200)
pd.options.display.float_format = '{:,.6f}'.format

In [3]:
ANNOTATIONS_FOLDER="../trabalho_final/anotações_humanas"

### Mapping 4-score to 3-score evaluations to match GPT-4 3-score evaluations

In [4]:
score_mapper = {
    0:0,
    1:0,
    2:1,
    3:2
}

### Original 4-score to 3-score evaluations mapping

This mapping changes the original 4-score 1 evaluation meaning, indicating it **partially answers to the question**, while the original (TREC-DL 21) meaning is the passage **does not answer to the question**.

In [5]:
score_mapper_original = {
    0:0,
    1:1,
    2:1,
    3:2
}

### Mapping 4-score to 2-score evenly

In [6]:
score_mapper_for_2 = {
    0:0,
    1:0,
    2:1,
    3:1    
}

### Alternative version for mapping 4-score to 2-score

In [7]:
score_mapper_for_2_alt = {
    0:0,
    1:1,
    2:1,
    3:1    
}

### Mapping 3-score to 2-score

In [8]:
score_mapper_3_for_2 = {
    0:0,
    1:1,
    2:1    
}

In [63]:
def check_agreement_per_questions(evaluation_a, evaluation_b, correlation_fn, metric_label=None, score='score', fillna=None):
    
    merged_df = evaluation_a.merge(evaluation_b, left_on='doccano_id', right_on='doccano_id')[['query_x', 'passage_x', 'passage_id_x', score + '_x', score + '_y']]
    
    correlations = []
    
    for group_name, group_df in merged_df.groupby('query_x', sort=False):
        correlations.append({'query': group_name,
                             metric_label: globals()[correlation_fn](group_df[score + '_x'], group_df[score + '_y'])[0]})
    
    if fillna is not None:
        correlations_df = pd.DataFrame(correlations).fillna(fillna)
    else:
        correlations_df = pd.DataFrame(correlations)
    
    
    return merged_df, correlations_df

In [10]:
def plot_correlation_data_consolidated(correlation_df, columns_to_plot, plot_title):
    
    fig = plt.figure(figsize=[10, 10])

    fig.suptitle(plot_title, y=0.91)

    plt.grid()

    all_boxplots = plt.boxplot(correlation_df[columns_to_plot], patch_artist=True, vert=False, meanline=True, showmeans=True)

    colors = ['pink', 'lightblue', 'lightgreen']
    
    all_boxes = all_boxplots['boxes']
    
    print(all_boxes)
    
    for i, which_box in enumerate(all_boxes):
        which_box.set_facecolor(colors[i // 3])
    
    
    plt.yticks(range(1, len(columns_to_plot) + 1), columns_to_plot)
    plt.xticks(np.arange(-0.3, 1.0, 0.1))
    
    plt.legend(handles=all_boxes[::3], labels=["Humanos x Humanos", "Humanos x GPT3.5", "Humanos x GPT4"], bbox_to_anchor=(1.0, 1.0))

    plt.show()

In [11]:
def plot_correlation_data(correlation_df, columns_to_plot, plot_title):
    
    fig = plt.figure(figsize=[15, 40])

    fig.suptitle(plot_title, y=0.91)

    plt.grid()

    all_boxplots = plt.boxplot(correlation_df[columns_to_plot].to_numpy().transpose(), patch_artist=True, vert=False, meanline=True, showmeans=True)

    colors = ['pink', 'lightblue', 'lightgreen', 'lightyellow'][::-1]
    
    all_boxes = all_boxplots['boxes']
    
#     print(all_boxes)
    
    for i, which_box in enumerate(all_boxes):
        which_box.set_facecolor(colors[i % 4])

        
    
    plt.yticks(range(4, correlation_df.shape[0] + 1, 4), correlation_df.iloc[::4]['query'])
    plt.xticks(np.arange(-0.3, 1.0, 0.1))

    plt.show()

## Read human annotators evaluations and map the original 4-score values

In [12]:
a1_df = pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, "admin_240_annotations_with_questions.tsv"), sep='\t')
a2_df = pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, "Eduardo_240_annotations_with_questions.tsv"), sep='\t')
a3_df = pd.read_csv(os.path.join(ANNOTATIONS_FOLDER, "Leodecio_240_annotations_with_questions.tsv"), sep='\t')

In [13]:
a1_df['score'].unique()

array([3, 0, 2, 1])

### Compute single score based on the human annotators

In [14]:
all_scores_df = pd.DataFrame()

In [15]:
all_scores_df['a1'] = a1_df['score']
all_scores_df['a2'] = a2_df['score']
all_scores_df['a3'] = a3_df['score']

In [16]:
all_scores_df

Unnamed: 0,a1,a2,a3
0,3,3,3
1,0,0,0
2,2,1,1
3,2,0,0
4,3,2,1
...,...,...,...
235,2,2,1
236,3,0,1
237,1,3,3
238,1,0,0


In [17]:
single_score = []

for i, row in all_scores_df.iterrows():
    if (row['a1'] == row['a2']) or (row['a1'] == row['a3']):
        single_score.append(row['a1'])

    elif row['a2'] == row['a3']:
        single_score.append(row['a2'])
        
    else:
        single_score.append(np.random.choice([row['a1'], row['a2'], row['a3']], 1)[0])
            
single_score = np.array(single_score)

In [18]:
single_score

array([3, 0, 1, 0, 3, 0, 1, 0, 1, 0, 3, 3, 2, 2, 1, 1, 1, 0, 1, 0, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 2, 0, 0, 2, 0, 1, 2, 2, 0, 1, 0, 3, 2, 3, 3,
       0, 2, 0, 0, 0, 0, 3, 2, 3, 1, 2, 3, 1, 1, 2, 2, 2, 1, 3, 3, 3, 3,
       2, 3, 1, 3, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 0, 0, 1, 2, 3, 0, 3, 1,
       1, 1, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 3, 3, 2, 3, 2,
       0, 1, 1, 1, 2, 0, 0, 2, 1, 2, 3, 3, 3, 3, 3, 2, 3, 2, 1, 2, 1, 2,
       0, 1, 0, 0, 1, 2, 1, 1, 0, 3, 1, 1, 3, 1, 0, 0, 1, 1, 3, 3, 1, 1,
       2, 0, 1, 0, 1, 0, 2, 1, 2, 2, 2, 1, 0, 2, 2, 2, 3, 1, 2, 2, 2, 1,
       1, 1, 0, 2, 3, 0, 0, 2, 0, 0, 0, 1, 0, 1, 1, 2, 2, 2, 2, 2, 0, 1,
       2, 1, 0, 3, 0, 1, 1, 0, 0, 0, 3, 0, 2, 3, 3, 2, 3, 2, 1, 3, 1, 3,
       3, 3, 1, 2, 2, 3, 3, 0, 0, 0, 1, 0, 2, 0, 0, 2, 0, 3, 0, 0])

In [19]:
single_score_df = pd.DataFrame(single_score, columns=['score'])

In [20]:
single_score_df

Unnamed: 0,score
0,3
1,0
2,1
3,0
4,3
...,...
235,2
236,0
237,3
238,0


### Convert the 4-score to other ranges

In [51]:
a1_df['03_score'] = a1_df['score'].map(score_mapper)
a2_df['03_score'] = a2_df['score'].map(score_mapper)
a3_df['03_score'] = a3_df['score'].map(score_mapper)

single_score_df['03_score'] = single_score_df['score'].map(score_mapper)

In [52]:
a1_df['02_score'] = a1_df['score'].map(score_mapper_for_2)
a2_df['02_score'] = a2_df['score'].map(score_mapper_for_2)
a3_df['02_score'] = a3_df['score'].map(score_mapper_for_2)

single_score_df['02_score'] = single_score_df['score'].map(score_mapper_for_2)

In [53]:
a1_df['02_score_alt'] = a1_df['score'].map(score_mapper_for_2_alt)
a2_df['02_score_alt'] = a2_df['score'].map(score_mapper_for_2_alt)
a3_df['02_score_alt'] = a3_df['score'].map(score_mapper_for_2_alt)

single_score_df['02_score_alt'] = single_score_df['score'].map(score_mapper_for_2_alt)

In [54]:
def cohen_kappa_wrapper(first_series, second_series):
    return (cohen_kappa_score(first_series, second_series), )

## Functions for correlation for 1st, 2nd and all questions

In [110]:
def correlations_1st_set(evaluation_df, which_score, correlation_fn):

    general_agreement_df = pd.DataFrame()

    general_agreement_df['comparisson'] = ['a1', 'a2', 'a3']

    general_agreement_df['a1'] = [np.nan, 
                                  globals()[correlation_fn](a1_df.iloc[0:120][which_score], a2_df.iloc[0:120][which_score])[0], 
                                  globals()[correlation_fn](a1_df.iloc[0:120][which_score], a3_df.iloc[0:120][which_score])[0]]

    general_agreement_df['a2'] = [globals()[correlation_fn](a1_df.iloc[0:120][which_score], a2_df.iloc[0:120][which_score])[0], 
                                  np.nan, 
                                  globals()[correlation_fn](a2_df.iloc[0:120][which_score], a3_df.iloc[0:120][which_score])[0]]

    general_agreement_df['a3'] = [globals()[correlation_fn](a1_df.iloc[0:120][which_score], a3_df.iloc[0:120][which_score])[0], 
                                  globals()[correlation_fn](a2_df.iloc[0:120][which_score], a3_df.iloc[0:120][which_score])[0], 
                                  np.nan]

    general_agreement_df['GPT4 1st 4-score'] = [globals()[correlation_fn](a1_df.iloc[0:120][which_score], evaluation_df[which_score])[0], 
                                                globals()[correlation_fn](a2_df.iloc[0:120][which_score], evaluation_df[which_score])[0], 
                                                globals()[correlation_fn](a3_df.iloc[0:120][which_score], evaluation_df[which_score])[0]]

    general_agreement_df = pd.concat([general_agreement_df, 
                                      pd.DataFrame(data=[['mean'] + general_agreement_df.iloc[:, 1:].mean().to_list()], columns=general_agreement_df.columns)])

    general_agreement_df = pd.concat([general_agreement_df, 
                                      pd.DataFrame(data=[['std'] + general_agreement_df.iloc[:, 1:].std().to_list()], columns=general_agreement_df.columns)])    
    
    human_mean = general_agreement_df.iloc[-2,1:4].mean()

    print("human_mean={}".format(human_mean))
    
    general_agreement_df = pd.concat([general_agreement_df,
                                      pd.DataFrame(data=[['Difference from mean human annotators'] + (general_agreement_df.iloc[-2, 1:] - human_mean).to_list()], columns=general_agreement_df.columns)])

    return general_agreement_df

In [111]:
def correlations_2nd_set(evaluation_df, which_score, correlation_fn):
    
    general_agreement_df = pd.DataFrame()

    general_agreement_df['comparisson'] = ['a1', 'a2', 'a3']

    general_agreement_df['a1'] = [np.nan, 
                                  globals()[correlation_fn](a1_df.iloc[120:240][which_score], a2_df.iloc[120:240][which_score])[0], 
                                  globals()[correlation_fn](a1_df.iloc[120:240][which_score], a3_df.iloc[120:240][which_score])[0]]

    general_agreement_df['a2'] = [globals()[correlation_fn](a1_df.iloc[120:240][which_score], a2_df.iloc[120:240][which_score])[0], 
                                  np.nan, 
                                  globals()[correlation_fn](a2_df.iloc[120:240][which_score], a3_df.iloc[120:240][which_score])[0]]

    general_agreement_df['a3'] = [globals()[correlation_fn](a1_df.iloc[120:240][which_score], a3_df.iloc[120:240][which_score])[0], 
                                  globals()[correlation_fn](a2_df.iloc[120:240][which_score], a3_df.iloc[120:240][which_score])[0], 
                                  np.nan]

    general_agreement_df['GPT4 2nd 4-score'] = [globals()[correlation_fn](a1_df.iloc[120:240][which_score], evaluation_df[which_score])[0], 
                                                globals()[correlation_fn](a2_df.iloc[120:240][which_score], evaluation_df[which_score])[0], 
                                                globals()[correlation_fn](a3_df.iloc[120:240][which_score], evaluation_df[which_score])[0]]

    general_agreement_df = pd.concat([general_agreement_df, 
                                      pd.DataFrame(data=[['mean'] + general_agreement_df.iloc[:, 1:].mean().to_list()], columns=general_agreement_df.columns)])
    
    general_agreement_df = pd.concat([general_agreement_df, 
                                      pd.DataFrame(data=[['std'] + general_agreement_df.iloc[:, 1:].std().to_list()], columns=general_agreement_df.columns)])
    
    human_mean = general_agreement_df.iloc[-2,1:4].mean()

    general_agreement_df = pd.concat([general_agreement_df,
                                      pd.DataFrame(data=[['Difference from mean human annotators'] + (general_agreement_df.iloc[-2, 1:] - human_mean).to_list()], columns=general_agreement_df.columns)])

    return general_agreement_df

In [112]:
def correlations_all(evaluation_df, which_score, correlation_fn):
    
    general_agreement_df = pd.DataFrame()

    general_agreement_df['comparisson'] = ['a1', 'a2', 'a3']

    general_agreement_df['a1'] = [np.nan, 
                                  globals()[correlation_fn](a1_df[which_score], a2_df[which_score])[0], 
                                  globals()[correlation_fn](a1_df[which_score], a3_df[which_score])[0]]

    general_agreement_df['a2'] = [globals()[correlation_fn](a1_df[which_score], a2_df[which_score])[0], 
                                  np.nan, 
                                  globals()[correlation_fn](a2_df[which_score], a3_df[which_score])[0]]

    general_agreement_df['a3'] = [globals()[correlation_fn](a1_df[which_score], a3_df[which_score])[0], 
                                  globals()[correlation_fn](a2_df[which_score], a3_df[which_score])[0], 
                                  np.nan]

    general_agreement_df['GPT4 4-score'] = [globals()[correlation_fn](a1_df[which_score], evaluation_df[which_score])[0], 
                                            globals()[correlation_fn](a2_df[which_score], evaluation_df[which_score])[0], 
                                            globals()[correlation_fn](a3_df[which_score], evaluation_df[which_score])[0]]

    general_agreement_df = pd.concat([general_agreement_df, 
                                      pd.DataFrame(data=[['mean'] + general_agreement_df.iloc[:, 1:].mean().to_list()], columns=general_agreement_df.columns)])

    general_agreement_df = pd.concat([general_agreement_df, 
                                      pd.DataFrame(data=[['std'] + general_agreement_df.iloc[:, 1:].std().to_list()], columns=general_agreement_df.columns)])
    
    
    human_mean = general_agreement_df.iloc[-2,1:4].mean()

    general_agreement_df = pd.concat([general_agreement_df,
                                      pd.DataFrame(data=[['Difference from mean human annotators'] + (general_agreement_df.iloc[-2, 1:] - human_mean).to_list()], columns=general_agreement_df.columns)])
    
    return general_agreement_df

### Functions to compute correlation against single score computed accross human annoators

In [113]:
def correlations_single_score_1st_set(evaluation_df, which_score, correlation_fn):

    general_agreement_df = pd.DataFrame()

    general_agreement_df['comparisson'] = ['single']

    general_agreement_df['a1'] = [globals()[correlation_fn](a1_df.iloc[0:120][which_score], single_score_df.iloc[0:120][which_score])[0]]

    general_agreement_df['a2'] = [globals()[correlation_fn](a2_df.iloc[0:120][which_score], single_score_df.iloc[0:120][which_score])[0]]

    general_agreement_df['a3'] = [globals()[correlation_fn](a3_df.iloc[0:120][which_score], single_score_df.iloc[0:120][which_score])[0]]
    
    human_mean = general_agreement_df.iloc[-1,1:4].mean()
    
    general_agreement_df['human mean'] = human_mean
    
    general_agreement_df['human std'] = general_agreement_df.iloc[-1,1:4].std()

    general_agreement_df['GPT4 1st 4-score'] = [globals()[correlation_fn](single_score_df.iloc[0:120][which_score], evaluation_df[which_score])[0]]

    difference_from_mean = ['Difference from mean human annotators'] + (general_agreement_df.iloc[-1, 1:] - human_mean).to_list()
    difference_from_mean[-2] = np.nan
    
    general_agreement_df = pd.concat([general_agreement_df,
                                      pd.DataFrame(data=[difference_from_mean], columns=general_agreement_df.columns)])

    return general_agreement_df

In [114]:
def correlations_single_score_2nd_set(evaluation_df, which_score, correlation_fn):

    general_agreement_df = pd.DataFrame()

    general_agreement_df['comparisson'] = ['single']

    general_agreement_df['a1'] = [globals()[correlation_fn](a1_df.iloc[120:240][which_score], single_score_df.iloc[120:240][which_score])[0]]

    general_agreement_df['a2'] = [globals()[correlation_fn](a2_df.iloc[120:240][which_score], single_score_df.iloc[120:240][which_score])[0]]

    general_agreement_df['a3'] = [globals()[correlation_fn](a3_df.iloc[120:240][which_score], single_score_df.iloc[120:240][which_score])[0]]
    
    human_mean = general_agreement_df.iloc[-1,1:4].mean()
    
    general_agreement_df['human mean'] = human_mean
    
    general_agreement_df['human std'] = general_agreement_df.iloc[-1,1:4].std()

    general_agreement_df['GPT4 1st 4-score'] = [globals()[correlation_fn](single_score_df.iloc[120:240][which_score], evaluation_df[which_score])[0]]

    difference_from_mean = ['Difference from mean human annotators'] + (general_agreement_df.iloc[-1, 1:] - human_mean).to_list()
    difference_from_mean[-2] = np.nan
    
    general_agreement_df = pd.concat([general_agreement_df,
                                      pd.DataFrame(data=[difference_from_mean], columns=general_agreement_df.columns)])

    return general_agreement_df

In [115]:
def correlations_single_score_all(evaluation_df, which_score, correlation_fn):

    general_agreement_df = pd.DataFrame()

    general_agreement_df['comparisson'] = ['single']

    general_agreement_df['a1'] = [globals()[correlation_fn](a1_df[which_score], single_score_df[which_score])[0]]

    general_agreement_df['a2'] = [globals()[correlation_fn](a2_df[which_score], single_score_df[which_score])[0]]

    general_agreement_df['a3'] = [globals()[correlation_fn](a3_df[which_score], single_score_df[which_score])[0]]
    
    human_mean = general_agreement_df.iloc[-1,1:4].mean()
    
    general_agreement_df['human mean'] = human_mean
    
    general_agreement_df['human std'] = general_agreement_df.iloc[-1,1:4].std()

    general_agreement_df['GPT4 1st 4-score'] = [globals()[correlation_fn](single_score_df[which_score], evaluation_df[which_score])[0]]

    difference_from_mean = ['Difference from mean human annotators'] + (general_agreement_df.iloc[-1, 1:] - human_mean).to_list()
    difference_from_mean[-2] = np.nan
    
    general_agreement_df = pd.concat([general_agreement_df,
                                      pd.DataFrame(data=[difference_from_mean], columns=general_agreement_df.columns)])

    return general_agreement_df

## Check the regenerated evaluations, comparing against the original GPT-4 4-score evaluation

In [116]:
gpt4_1106_1st_df = pd.read_csv(os.path.join("tests", "test_000_119_gpt-4-1106-preview_20231108_fixed_2_scores.tsv"), sep='\t')

In [117]:
gpt4_1106_2nd_df = pd.read_csv(os.path.join("tests", "test_120_239_gpt-4-1106-preview_20231115_fixed_2_scores.tsv"), sep='\t')

In [118]:
gpt4_1106_df = pd.concat([gpt4_1106_1st_df, gpt4_1106_2nd_df], axis=0).reset_index(drop=True)

### Evaluation using the new GPT-4 turbo ― Spearman

In [119]:
correlations_1st_set(gpt4_1106_1st_df, 'score', 'spearmanr')

human_mean=0.7590023888315695


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score
0,a1,,0.706979,0.81708,0.655061
1,a2,0.706979,,0.752948,0.657686
2,a3,0.81708,0.752948,,0.656545
0,mean,0.76203,0.729963,0.785014,0.656431
0,std,0.055051,0.022985,0.032066,0.001075
0,Difference from mean human annotators,0.003027,-0.029039,0.026012,-0.102572


In [120]:
correlations_2nd_set(gpt4_1106_2nd_df, 'score', 'spearmanr')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score
0,a1,,0.663906,0.578996,0.576091
1,a2,0.663906,,0.679524,0.607351
2,a3,0.578996,0.679524,,0.60219
0,mean,0.621451,0.671715,0.62926,0.595211
0,std,0.042455,0.007809,0.050264,0.013683
0,Difference from mean human annotators,-0.019358,0.030906,-0.011548,-0.045598


In [121]:
correlations_all(gpt4_1106_df, 'score', 'spearmanr')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score
0,a1,,0.693098,0.692357,0.607279
1,a2,0.693098,,0.698475,0.617434
2,a3,0.692357,0.698475,,0.629563
0,mean,0.692728,0.695787,0.695416,0.618092
0,std,0.00037,0.002689,0.003059,0.009109
0,Difference from mean human annotators,-0.001916,0.001143,0.000773,-0.076551


### Evaluation using the new GPT-4 turbo ― Cohen Kappa

In [122]:
correlations_1st_set(gpt4_1106_1st_df, 'score', 'cohen_kappa_wrapper')

human_mean=0.5219047399392692


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score
0,a1,,0.489892,0.575498,0.404606
1,a2,0.489892,,0.500324,0.363543
2,a3,0.575498,0.500324,,0.445074
0,mean,0.532695,0.495108,0.537911,0.404408
0,std,0.042803,0.005216,0.037587,0.033285
0,Difference from mean human annotators,0.01079,-0.026797,0.016006,-0.117497


In [123]:
correlations_2nd_set(gpt4_1106_2nd_df, 'score', 'cohen_kappa_wrapper')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score
0,a1,,0.369369,0.286629,0.243283
1,a2,0.369369,,0.322651,0.157364
2,a3,0.286629,0.322651,,0.254939
0,mean,0.327999,0.34601,0.30464,0.218529
0,std,0.04137,0.023359,0.018011,0.043511
0,Difference from mean human annotators,0.001783,0.019794,-0.021576,-0.107688


In [124]:
correlations_all(gpt4_1106_df, 'score', 'cohen_kappa_wrapper')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score
0,a1,,0.436881,0.429402,0.323428
1,a2,0.436881,,0.410455,0.259276
2,a3,0.429402,0.410455,,0.349839
0,mean,0.433142,0.423668,0.419929,0.310848
0,std,0.00374,0.013213,0.009474,0.038027
0,Difference from mean human annotators,0.007562,-0.001911,-0.005651,-0.114732


### Evaluation using the new GPT-4 turbo ― Pearson

In [125]:
correlations_1st_set(gpt4_1106_1st_df, 'score', 'pearsonr')

human_mean=0.7712262020990179


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score
0,a1,,0.719862,0.82506,0.650404
1,a2,0.719862,,0.768756,0.657615
2,a3,0.82506,0.768756,,0.657503
0,mean,0.772461,0.744309,0.796908,0.655174
0,std,0.052599,0.024447,0.028152,0.003373
0,Difference from mean human annotators,0.001235,-0.026917,0.025682,-0.116052


In [126]:
correlations_2nd_set(gpt4_1106_2nd_df, 'score', 'pearsonr')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score
0,a1,,0.657464,0.576163,0.553798
1,a2,0.657464,,0.687592,0.595538
2,a3,0.576163,0.687592,,0.607573
0,mean,0.616813,0.672528,0.631878,0.585636
0,std,0.04065,0.015064,0.055715,0.023043
0,Difference from mean human annotators,-0.023593,0.032122,-0.008529,-0.05477


In [127]:
correlations_all(gpt4_1106_df, 'score', 'pearsonr')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score
0,a1,,0.698174,0.697289,0.598166
1,a2,0.698174,,0.713168,0.614565
2,a3,0.697289,0.713168,,0.632562
0,mean,0.697732,0.705671,0.705229,0.615098
0,std,0.000442,0.007497,0.007939,0.014047
0,Difference from mean human annotators,-0.005145,0.002794,0.002352,-0.087779


### Evaluation using the new GPT-4 turbo ― Kendall Tau

In [128]:
correlations_1st_set(gpt4_1106_1st_df, 'score', 'kendalltau')

human_mean=0.6956243177602369


Unnamed: 0,comparisson,a1,a2,a3,GPT4 1st 4-score
0,a1,,0.643791,0.751429,0.582366
1,a2,0.643791,,0.691653,0.584938
2,a3,0.751429,0.691653,,0.58167
0,mean,0.69761,0.667722,0.721541,0.582991
0,std,0.053819,0.023931,0.029888,0.001405
0,Difference from mean human annotators,0.001986,-0.027902,0.025916,-0.112633


In [129]:
correlations_2nd_set(gpt4_1106_2nd_df, 'score', 'kendalltau')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 2nd 4-score
0,a1,,0.590647,0.510594,0.491843
1,a2,0.590647,,0.600221,0.535608
2,a3,0.510594,0.600221,,0.510917
0,mean,0.55062,0.595434,0.555407,0.512789
0,std,0.040027,0.004787,0.044814,0.017916
0,Difference from mean human annotators,-0.016534,0.02828,-0.011746,-0.054365


In [130]:
correlations_all(gpt4_1106_df, 'score', 'kendalltau')

Unnamed: 0,comparisson,a1,a2,a3,GPT4 4-score
0,a1,,0.623709,0.622409,0.529502
1,a2,0.623709,,0.626314,0.539189
2,a3,0.622409,0.626314,,0.546029
0,mean,0.623059,0.625011,0.624362,0.53824
0,std,0.00065,0.001303,0.001953,0.00678
0,Difference from mean human annotators,-0.001085,0.000868,0.000218,-0.085904


# Check agreement against single score

### Pearson

In [131]:
correlations_single_score_1st_set(gpt4_1106_1st_df, 'score', 'spearmanr')

Unnamed: 0,comparisson,a1,a2,a3,human mean,human std,GPT4 1st 4-score
0,single,0.887958,0.831671,0.909102,0.876244,0.040022,0.669433
0,Difference from mean human annotators,0.011714,-0.044572,0.032858,0.0,,-0.20681


In [132]:
correlations_single_score_2nd_set(gpt4_1106_2nd_df, 'score', 'spearmanr')

Unnamed: 0,comparisson,a1,a2,a3,human mean,human std,GPT4 1st 4-score
0,single,0.797631,0.849624,0.746234,0.79783,0.051695,0.569361
0,Difference from mean human annotators,-0.000198,0.051794,-0.051596,0.0,,-0.228469


In [133]:
correlations_single_score_all(gpt4_1106_df, 'score', 'spearmanr')

Unnamed: 0,comparisson,a1,a2,a3,human mean,human std,GPT4 1st 4-score
0,single,0.842842,0.842578,0.822828,0.836083,0.01148,0.614351
0,Difference from mean human annotators,0.006759,0.006495,-0.013255,0.0,,-0.221732


### Cohen Kappa

In [134]:
correlations_single_score_1st_set(gpt4_1106_1st_df, 'score', 'cohen_kappa_wrapper')

Unnamed: 0,comparisson,a1,a2,a3,human mean,human std,GPT4 1st 4-score
0,single,0.775701,0.683019,0.787512,0.748744,0.057225,0.460169
0,Difference from mean human annotators,0.026957,-0.065725,0.038768,0.0,,-0.288575


In [135]:
correlations_single_score_2nd_set(gpt4_1106_2nd_df, 'score', 'cohen_kappa_wrapper')

Unnamed: 0,comparisson,a1,a2,a3,human mean,human std,GPT4 1st 4-score
0,single,0.653276,0.662605,0.569616,0.628499,0.051207,0.241689
0,Difference from mean human annotators,0.024777,0.034106,-0.058883,0.0,,-0.38681


In [136]:
correlations_single_score_all(gpt4_1106_df, 'score', 'cohen_kappa_wrapper')

Unnamed: 0,comparisson,a1,a2,a3,human mean,human std,GPT4 1st 4-score
0,single,0.716489,0.675895,0.677815,0.690066,0.022903,0.34994
0,Difference from mean human annotators,0.026423,-0.014172,-0.012251,0.0,,-0.340127


# Check the agreement per question

In [161]:
def correlation_per_question(which_score, a1_df, a2_df, a3_df, gpt4_df, correlation_fn, metric_label, fillna=None):

    all_results = []
    
    res_a1_a2 = check_agreement_per_questions(a1_df, a2_df, correlation_fn=correlation_fn, metric_label=metric_label + "_a1_a2", score=which_score, fillna=fillna)
    res_a1_a3 = check_agreement_per_questions(a1_df, a3_df, correlation_fn=correlation_fn, metric_label=metric_label + "_a1_a3", score=which_score, fillna=fillna)
    res_a2_a3 = check_agreement_per_questions(a2_df, a3_df, correlation_fn=correlation_fn, metric_label=metric_label + "_a2_a3", score=which_score, fillna=fillna)

    res_a1_gpt4 = check_agreement_per_questions(a1_df, gpt4_df, correlation_fn=correlation_fn, metric_label=metric_label + "_a1_gpt4", score=which_score, fillna=fillna)
    res_a2_gpt4 = check_agreement_per_questions(a2_df, gpt4_df, correlation_fn=correlation_fn, metric_label=metric_label + "_a2_gpt4", score=which_score, fillna=fillna)
    res_a3_gpt4 = check_agreement_per_questions(a3_df, gpt4_df, correlation_fn=correlation_fn, metric_label=metric_label + "_a3_gpt4", score=which_score, fillna=fillna)
    
    
    human_combined_res = pd.concat([res_a1_a2[1], res_a1_a3[1].iloc[:, -1], res_a2_a3[1].iloc[:, -1]], axis=1)

    human_combined_res[metric_label + '_human_mean'] = human_combined_res.iloc[:, 1:].mean(axis=1)
    human_combined_res[metric_label + '_human_std'] = human_combined_res.iloc[:, 1:].std(axis=1)
    
    all_results.append(human_combined_res)
    
    human_gpt4_combined_res = pd.concat([res_a1_gpt4[1], res_a2_gpt4[1].iloc[:, -1], res_a3_gpt4[1].iloc[:, -1]], axis=1)
    human_gpt4_combined_res[metric_label + '_gpt4_mean'] = human_gpt4_combined_res.iloc[:, 1:].mean(axis=1)
    human_gpt4_combined_res[metric_label + '_gpt4_std'] = human_gpt4_combined_res.iloc[:, 1:].std(axis=1)

    all_results.append(human_gpt4_combined_res)
    
    all_results_df = pd.concat([human_combined_res, human_gpt4_combined_res.iloc[:, 1:]], axis=1)
    all_results_df['mean difference'] = all_results_df[metric_label + '_human_mean'] - all_results_df[metric_label + '_gpt4_mean']
    
    
    correlations = []
    correlations.append({'annotator': 'human all queries',
                         metric_label: human_combined_res[metric_label + '_human_mean'].mean(),
                         'difference to human mean': 0.0})
    
    correlations.append({'annotator': 'GPT-4 all queries',
                         metric_label: human_gpt4_combined_res[metric_label + '_gpt4_mean'].mean(),
                         'difference to human mean': human_gpt4_combined_res[metric_label + '_gpt4_mean'].mean() - human_combined_res[metric_label + '_human_mean'].mean()})
    
    #
    # Check only non-zero GPT4 means
    #
    
    non_zero = (human_gpt4_combined_res[metric_label + '_gpt4_mean'] != 0.0) & (~np.isnan(human_gpt4_combined_res[metric_label + '_gpt4_mean']))
    
    print(non_zero)
    
    correlations.append({'annotator': 'human non-zero correlation queries',
                         metric_label: human_combined_res.loc[non_zero][metric_label + '_human_mean'].mean(),
                         'difference to human mean': 0.0})
    
    correlations.append({'annotator': 'GPT-4 non-zero correlation queries',
                         metric_label: human_gpt4_combined_res.loc[non_zero][metric_label + '_gpt4_mean'].mean(),
                         'difference to human mean': human_gpt4_combined_res.loc[non_zero][metric_label + '_gpt4_mean'].mean() - human_combined_res.loc[non_zero][metric_label + '_human_mean'].mean()})
    
    
    correlations_df = pd.DataFrame(correlations)
    
    display(correlations_df)
    
    return correlations_df, all_results, all_results_df

### Check for 4-score evaluations

In [146]:
results = correlation_per_question('score', a1_df, a2_df, a3_df, gpt4_1106_df, "cohen_kappa_wrapper", "cohen_kappa")

Unnamed: 0,annotator,cohen_kappa,difference to human mean
0,human all queries,0.322864,0.0
1,GPT-4 all queries,0.232171,-0.090693
2,human non-zero correlation queries,0.345031,0.0
3,GPT-4 non-zero correlation queries,0.299162,-0.045869


In [147]:
results[1][0]

Unnamed: 0,query,cohen_kappa_a1_a2,cohen_kappa_a1_a3,cohen_kappa_a2_a3,cohen_kappa_human_mean,cohen_kappa_human_std
0,Onde está localizada a Praça XV de Novembro?,0.264706,0.305556,0.69697,0.42241,0.194858
1,Qual foi a importância da usina de Volta Redonda RJ para a industrialização brasileira?,-0.012658,0.264706,0.102564,0.118204,0.113772
2,Qual o uso dos códigos SWIFT?,0.615385,0.0,0.0,0.205128,0.290095
3,"O que são os celulares ""mid-range""?",0.508197,0.6875,0.253731,0.483143,0.177969
4,Por que os países Guiana e Suriname não são filiados a Conmebol?,0.836066,0.677419,0.84375,0.785745,0.076662
5,quais os critérios de definição dos monumentos intitulados maravilhas do mundo moderno?,0.701493,0.558824,0.545455,0.601924,0.070617
6,Qual a maior torcida de futebol do Brasil?,0.807692,0.423077,0.642857,0.624542,0.157552
7,Quando se realizou o plebiscito popular para definir o sistema político do Brasil?,0.0,0.508197,0.0,0.169399,0.239566
8,Como transformar uma cidade pacata em um polo turístico?,0.285714,0.473684,0.583333,0.447577,0.122897
9,Quais são os melhores parques nacionais de Portugal?,-0.081081,-0.25,0.428571,0.032497,0.288432


In [148]:
results[1][1]

Unnamed: 0,query,cohen_kappa_a1_gpt4,cohen_kappa_a2_gpt4,cohen_kappa_a3_gpt4,cohen_kappa_gpt4_mean,cohen_kappa_gpt4_std
0,Onde está localizada a Praça XV de Novembro?,0.285714,0.210526,0.305556,0.267265,0.04093
1,Qual foi a importância da usina de Volta Redonda RJ para a industrialização brasileira?,0.090909,0.117647,0.102564,0.103707,0.010946
2,Qual o uso dos códigos SWIFT?,1.0,0.615385,0.0,0.538462,0.411856
3,"O que são os celulares ""mid-range""?",0.52381,0.21875,0.508197,0.416919,0.140271
4,Por que os países Guiana e Suriname não são filiados a Conmebol?,0.68254,0.69697,0.84375,0.741086,0.072833
5,quais os critérios de definição dos monumentos intitulados maravilhas do mundo moderno?,0.027778,0.014085,-0.184211,-0.047449,0.096866
6,Qual a maior torcida de futebol do Brasil?,0.423077,0.642857,1.0,0.688645,0.237743
7,Quando se realizou o plebiscito popular para definir o sistema político do Brasil?,0.0,0.0,0.0,0.0,0.0
8,Como transformar uma cidade pacata em um polo turístico?,0.402985,0.125,0.268293,0.265426,0.113505
9,Quais são os melhores parques nacionais de Portugal?,-0.428571,0.130435,0.272727,-0.00847,0.302683


In [149]:
results[2]

Unnamed: 0,query,cohen_kappa_a1_a2,cohen_kappa_a1_a3,cohen_kappa_a2_a3,cohen_kappa_human_mean,cohen_kappa_human_std,cohen_kappa_a1_gpt4,cohen_kappa_a2_gpt4,cohen_kappa_a3_gpt4,cohen_kappa_gpt4_mean,cohen_kappa_gpt4_std,mean difference
0,Onde está localizada a Praça XV de Novembro?,0.264706,0.305556,0.69697,0.42241,0.194858,0.285714,0.210526,0.305556,0.267265,0.04093,0.155145
1,Qual foi a importância da usina de Volta Redonda RJ para a industrialização brasileira?,-0.012658,0.264706,0.102564,0.118204,0.113772,0.090909,0.117647,0.102564,0.103707,0.010946,0.014497
2,Qual o uso dos códigos SWIFT?,0.615385,0.0,0.0,0.205128,0.290095,1.0,0.615385,0.0,0.538462,0.411856,-0.333333
3,"O que são os celulares ""mid-range""?",0.508197,0.6875,0.253731,0.483143,0.177969,0.52381,0.21875,0.508197,0.416919,0.140271,0.066224
4,Por que os países Guiana e Suriname não são filiados a Conmebol?,0.836066,0.677419,0.84375,0.785745,0.076662,0.68254,0.69697,0.84375,0.741086,0.072833,0.044659
5,quais os critérios de definição dos monumentos intitulados maravilhas do mundo moderno?,0.701493,0.558824,0.545455,0.601924,0.070617,0.027778,0.014085,-0.184211,-0.047449,0.096866,0.649373
6,Qual a maior torcida de futebol do Brasil?,0.807692,0.423077,0.642857,0.624542,0.157552,0.423077,0.642857,1.0,0.688645,0.237743,-0.064103
7,Quando se realizou o plebiscito popular para definir o sistema político do Brasil?,0.0,0.508197,0.0,0.169399,0.239566,0.0,0.0,0.0,0.0,0.0,0.169399
8,Como transformar uma cidade pacata em um polo turístico?,0.285714,0.473684,0.583333,0.447577,0.122897,0.402985,0.125,0.268293,0.265426,0.113505,0.182151
9,Quais são os melhores parques nacionais de Portugal?,-0.081081,-0.25,0.428571,0.032497,0.288432,-0.428571,0.130435,0.272727,-0.00847,0.302683,0.040967


In [162]:
results = correlation_per_question('score', a1_df, a2_df, a3_df, gpt4_1106_df, "spearmanr", "spearman")

0      True
1      True
2      True
3      True
4      True
5      True
6      True
7     False
8      True
9      True
10     True
11     True
12    False
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20     True
21     True
22     True
23     True
Name: spearman_gpt4_mean, dtype: bool




Unnamed: 0,annotator,spearman,difference to human mean
0,human all queries,0.604301,0.0
1,GPT-4 all queries,0.496191,-0.10811
2,human non-zero correlation queries,0.607132,0.0
3,GPT-4 non-zero correlation queries,0.496191,-0.11094


In [93]:
results[2]

Unnamed: 0,query,spearman_a1_a2,spearman_a1_a3,spearman_a2_a3,spearman_human_mean,spearman_human_std,spearman_a1_gpt4,spearman_a2_gpt4,spearman_a3_gpt4,spearman_gpt4_mean,spearman_gpt4_std,mean difference
0,Onde está localizada a Praça XV de Novembro?,0.646679,0.716482,0.971625,0.778262,0.139666,0.592857,0.611342,0.680112,0.628104,0.037542,0.150159
1,Qual foi a importância da usina de Volta Redonda RJ para a industrialização brasileira?,0.458258,0.700914,0.589417,0.582863,0.099172,0.45621,0.53056,0.244331,0.410367,0.121265,0.172496
2,Qual o uso dos códigos SWIFT?,0.666667,,,0.666667,0.0,1.0,0.666667,,0.833333,0.166667,-0.166667
3,"O que são os celulares ""mid-range""?",0.872872,0.945611,0.851852,0.890111,0.040171,0.407143,0.571003,0.301868,0.426671,0.110738,0.46344
4,Por que os países Guiana e Suriname não são filiados a Conmebol?,0.963796,0.978492,0.996448,0.979578,0.013352,0.963796,1.0,0.996448,0.986748,0.016294,-0.007169
5,quais os critérios de definição dos monumentos intitulados maravilhas do mundo moderno?,0.843456,0.852279,0.802897,0.832877,0.021503,0.54885,0.476145,0.523919,0.516304,0.030166,0.316573
6,Qual a maior torcida de futebol do Brasil?,0.714435,0.408248,0.75,0.624228,0.153409,0.408248,0.75,1.0,0.719416,0.242548,-0.095188
7,Quando se realizou o plebiscito popular para definir o sistema político do Brasil?,,0.67082,,0.67082,0.0,,,,,,
8,Como transformar uma cidade pacata em um polo turístico?,0.534838,0.980196,0.59233,0.702455,0.19779,0.565779,0.480159,0.619395,0.555111,0.057341,0.147344
9,Quais são os melhores parques nacionais de Portugal?,0.218218,0.140859,0.573775,0.310951,0.188509,-0.521641,-0.332008,-0.077152,-0.310267,0.182112,0.621217


In [168]:
a1_1 = a1_df[a1_df['query'] == "Qual o uso dos códigos SWIFT?"][['passage_id', 'score']]

In [169]:
a2_1 = a2_df[a2_df['query'] == "Qual o uso dos códigos SWIFT?"][['passage_id', 'score']]

In [172]:
a3_1 = a3_df[a3_df['query'] == "Qual o uso dos códigos SWIFT?"][['passage_id', 'score']]

In [170]:
a2_1

Unnamed: 0,passage_id,score
20,clueweb22-pt0000-76-18202_3,3
21,clueweb22-pt0000-96-07045_3,3
22,clueweb22-pt0000-81-05159_0,3
23,clueweb22-pt0001-91-11827_0,3
24,clueweb22-pt0000-33-13712_0,3
25,clueweb22-pt0001-70-16813_0,3
26,clueweb22-pt0001-89-01763_2,2
27,clueweb22-pt0001-81-10821_1,3
28,clueweb22-pt0000-39-05372_0,3
29,clueweb22-pt0000-50-19284_4,2


In [171]:
a1_1

Unnamed: 0,passage_id,score
20,clueweb22-pt0000-76-18202_3,3
21,clueweb22-pt0000-96-07045_3,3
22,clueweb22-pt0000-81-05159_0,3
23,clueweb22-pt0001-91-11827_0,3
24,clueweb22-pt0000-33-13712_0,3
25,clueweb22-pt0001-70-16813_0,3
26,clueweb22-pt0001-89-01763_2,3
27,clueweb22-pt0001-81-10821_1,3
28,clueweb22-pt0000-39-05372_0,3
29,clueweb22-pt0000-50-19284_4,2


In [173]:
a3_1

Unnamed: 0,passage_id,score
20,clueweb22-pt0000-76-18202_3,3
21,clueweb22-pt0000-96-07045_3,3
22,clueweb22-pt0000-81-05159_0,3
23,clueweb22-pt0001-91-11827_0,3
24,clueweb22-pt0000-33-13712_0,3
25,clueweb22-pt0001-70-16813_0,3
26,clueweb22-pt0001-89-01763_2,3
27,clueweb22-pt0001-81-10821_1,3
28,clueweb22-pt0000-39-05372_0,3
29,clueweb22-pt0000-50-19284_4,3
