In [32]:
import pandas as pd
import numpy as np
import glob

import os

from sklearn.metrics import cohen_kappa_score

from scipy.stats import spearmanr

In [33]:
# Notebook display settings: do not truncate long cell text, and allow up to
# 200 rows to be rendered before pandas switches to a truncated view.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 200

In [34]:
# Folder that holds the human annotation CSVs and the GPT evaluation TSVs
# read by the cells below.
ANNOTATIONS_FOLDER = "anotações_humanas"

In [35]:
# Per-annotator CSVs for the second query set.
# glob.glob() returns matches in a file-system-dependent order, while the
# cells below pick annotators by list position. Sort the paths
# (case-insensitively) so annotations[0], [1] and [2] refer to the same files
# on every machine and on every run.
annotations = sorted(
    glob.glob(os.path.join(ANNOTATIONS_FOLDER, "*_2nd_queries_set.csv")),
    key=str.casefold,
)

In [36]:
# Show the matched files; the cells below load them by list position (0, 1, 2).
annotations

['anotações_humanas/admin_2nd_queries_set.csv',
 'anotações_humanas/Eduardo_2nd_queries_set.csv',
 'anotações_humanas/Leodécio_2nd_queries_set.csv']

In [37]:
# First annotator: the file at position 0 of the matched paths.
a1_df = pd.read_csv(filepath_or_buffer=annotations[0])

In [38]:
# List the distinct label strings used by the first annotator; these are the
# keys the label-to-score mapping has to cover.
a1_df['label'].unique()

array(['Perfect', 'Highly Relevant', 'Relevant', 'Irrelevant'],
      dtype=object)

In [39]:
# Relevance labels mapped to integer grades, from best (3) down to worst (0).
classes_map = dict(
    zip(
        ('Perfect', 'Highly Relevant', 'Relevant', 'Irrelevant'),
        (3, 2, 1, 0),
    )
)

In [40]:
# Convert the first annotator's text labels into integer grades.
a1_df['score'] = a1_df['label'].map(classes_map)

# Series.map() yields NaN for any label that is not a key of classes_map
# (typo, different casing, empty cell). Fail here with a clear message rather
# than letting NaN scores reach the agreement computation.
assert a1_df['score'].notna().all(), "a1_df contains labels that are missing from classes_map"

In [42]:
# Second and third annotators: the files at positions 1 and 2 of the matched
# paths, loaded in that order.
a2_df, a3_df = (pd.read_csv(annotations[position]) for position in (1, 2))

In [43]:
# Convert the second and third annotators' text labels into integer grades.
a2_df['score'] = a2_df['label'].map(classes_map)
a3_df['score'] = a3_df['label'].map(classes_map)

# Series.map() yields NaN for any label that is not a key of classes_map.
# The label values of these two files are not inspected anywhere else, so
# check explicitly that every row received a grade before computing agreement.
assert a2_df['score'].notna().all(), "a2_df contains labels that are missing from classes_map"
assert a3_df['score'].notna().all(), "a3_df contains labels that are missing from classes_map"

## Check inter-annotator agreement

In [47]:
# Pairwise agreement between the human annotators (Cohen's kappa), printed one
# value per line in the order a1-a2, a1-a3, a2-a3. Rows are compared by
# position.
# NOTE(review): the scores are ordinal grades (0-3) and kappa is unweighted
# here, so every disagreement counts the same regardless of distance;
# cohen_kappa_score also accepts weights='linear' / 'quadratic' — consider
# whether a weighted variant fits this analysis better.
print(
    *(
        cohen_kappa_score(left_df['score'], right_df['score'])
        for left_df, right_df in ((a1_df, a2_df), (a1_df, a3_df), (a2_df, a3_df))
    ),
    sep='\n',
)

0.3693693693693695
0.28662886409365285
0.3226511289147851


## Check annotators' agreement with LLM evaluations

### Start with the GPT-3.5 evaluations

In [48]:
# GPT-3.5 evaluations for the second query set, stored as a tab-separated file
# in the annotations folder.
gpt3_redux_df = pd.read_csv(
    os.path.join(ANNOTATIONS_FOLDER, "gpt_3_evaluations_120_samples_2nd_queries_set.tsv"),
    delimiter='\t',
)

In [49]:
# Cohen's kappa between each human annotator (a1, a2, a3 — in that order) and
# the GPT-3.5 scores, printed one value per line.
# NOTE(review): rows are compared by position, so this assumes the TSV lists
# the samples in the same order as the annotators' CSVs and uses the same
# 0-3 grade scale — TODO confirm.
print(
    *(
        cohen_kappa_score(human_df['score'], gpt3_redux_df['score'].to_numpy())
        for human_df in (a1_df, a2_df, a3_df)
    ),
    sep='\n',
)

0.12866377401930396
0.11330049261083752
0.17313664596273282


### Now the GPT-4 evaluations

In [51]:
# GPT-4 evaluations for the second query set, stored as a tab-separated file
# in the annotations folder.
gpt4_redux_df = pd.read_csv(
    os.path.join(ANNOTATIONS_FOLDER, "gpt_4_evaluations_120_samples_2nd_queries_set.tsv"),
    delimiter='\t',
)

In [52]:
# Cohen's kappa between each human annotator (a1, a2, a3 — in that order) and
# the GPT-4 scores, printed one value per line.
# NOTE(review): rows are compared by position, so this assumes the TSV lists
# the samples in the same order as the annotators' CSVs and uses the same
# 0-3 grade scale — TODO confirm.
print(
    *(
        cohen_kappa_score(human_df['score'], gpt4_redux_df['score'].to_numpy())
        for human_df in (a1_df, a2_df, a3_df)
    ),
    sep='\n',
)

0.24521320876884656
0.1358249772105743
0.2600896860986547


In [53]:
# Total of the 'cost' column over all GPT-4 evaluation rows.
# NOTE(review): the unit is not shown in this notebook — presumably USD; confirm.
gpt4_redux_df['cost'].sum()

3.535769999999996

In [50]:
# Total of the 'cost' column over all GPT-3.5 evaluation rows.
# NOTE(review): the unit is not shown in this notebook — presumably USD; confirm.
gpt3_redux_df['cost'].sum()

0.2516794999999988