In [1]:
import numpy as np
import pandas as pd
import json


In [2]:
# Load the jargon annotated data
merged_humam_llm_jargon = pd.read_json(
    "data/llm_outputs/240525_march_2024_human_llm_jargon_merged.json", 
    orient="index"
)

merged_humam_llm_jargon.head()

Unnamed: 0,arxiv_id,reader_id,gpt4_jargon_list,human_jargon_list
0,2403.16190v1,rid0,"reject option strategy, formal guarantees, min...","correctness,minimality,Anchors"
1,2403.16190v1,rid1,"linear classification problems, reject option ...","reject option strategy,Anchors,heuristic algor..."
2,2307.05300v4,rid0,"cognitive synergist, fine-grained personas, fa...","multi-turn,persona"
3,2307.05300v4,rid1,"cognitive synergy, cognitive synergist, multi-...","Solo Performance Prompting,multi-turn,Chain-of..."
4,2403.16750v1,rid0,"Common Weakness Enumerations (CWEs), SystemVer...","common weakeness enumerations,SystemVerilog,Re..."


# Create an Eval File

In [3]:
# Load in outputs
gpt4_rag = pd.read_json(
    "data/llm_outputs/240528_march_2024_sampled_jargon_definitions_rag.json", 
    orient="records"
)

# Add column
gpt4_rag['model'] = ['gpt4_rag']*len(gpt4_rag)

gpt4_rag.head()

Unnamed: 0,arxiv_id,reader_id,human_jargon_term,definition_text,model
0,2403.16190v1,rid0,correctness,Correctness in this context refers to whether ...,gpt4_rag
1,2403.16190v1,rid0,minimality,Minimality refers to the concept of using the ...,gpt4_rag
2,2403.16190v1,rid0,Anchors,Anchors is a method used in the field of artif...,gpt4_rag
3,2403.16190v1,rid1,reject option strategy,The reject option strategy in classification m...,gpt4_rag
4,2403.16190v1,rid1,Anchors,Anchors is a method used in artificial intelli...,gpt4_rag


In [4]:
# Load in outputs
gpt4_abstract = pd.read_json(
    "data/llm_outputs/240528_march_2024_sampled_jargon_definitions_abstract.json", 
    orient="records"
)

# Add column
gpt4_abstract['model'] = ['gpt4_abstract']*len(gpt4_abstract)

gpt4_abstract.head()

Unnamed: 0,arxiv_id,reader_id,human_jargon_term,definition_text,model
0,2403.16190v1,rid0,correctness,"Correctness, in this context, refers to the de...",gpt4_abstract
1,2403.16190v1,rid0,minimality,"In the given context, ""minimality"" refers to t...",gpt4_abstract
2,2403.16190v1,rid0,Anchors,"In the given context, ""Anchors"" refers to a ty...",gpt4_abstract
3,2403.16190v1,rid1,reject option strategy,A reject option strategy in machine learning i...,gpt4_abstract
4,2403.16190v1,rid1,Anchors,Anchors is an algorithm used to explain the de...,gpt4_abstract


In [5]:
# Stack the two dfs
concatenated = pd.concat([gpt4_rag, gpt4_abstract], ignore_index=True, axis=0)
concatenated.head()

Unnamed: 0,arxiv_id,reader_id,human_jargon_term,definition_text,model
0,2403.16190v1,rid0,correctness,Correctness in this context refers to whether ...,gpt4_rag
1,2403.16190v1,rid0,minimality,Minimality refers to the concept of using the ...,gpt4_rag
2,2403.16190v1,rid0,Anchors,Anchors is a method used in the field of artif...,gpt4_rag
3,2403.16190v1,rid1,reject option strategy,The reject option strategy in classification m...,gpt4_rag
4,2403.16190v1,rid1,Anchors,Anchors is a method used in artificial intelli...,gpt4_rag


In [6]:
# Randomize row ordering + also maintain all the arXiv IDs next to each other
concatenated = concatenated.sample(frac=1).reset_index(drop=True)
concatenated.sort_values(['arxiv_id', 'human_jargon_term'], inplace=True)
concatenated.head(20)

Unnamed: 0,arxiv_id,reader_id,human_jargon_term,definition_text,model
145,2007.00714v4,rid1,ANOVA,"ANOVA, or Analysis of Variance, is a statistic...",gpt4_abstract
288,2007.00714v4,rid1,ANOVA,"ANOVA, or Analysis of Variance, is a statistic...",gpt4_rag
645,2007.00714v4,rid1,DAG,A Directed Acyclic Graph (DAG) is a type of di...,gpt4_rag
646,2007.00714v4,rid1,DAG,A Directed Acyclic Graph (DAG) is a diagram th...,gpt4_abstract
154,2007.00714v4,rid1,Shapley based symmetrization,Shapley based symmetrization is a method used ...,gpt4_abstract
380,2007.00714v4,rid0,Shapley based symmetrization,Shapley based symmetrization is a method that ...,gpt4_rag
406,2007.00714v4,rid0,Shapley based symmetrization,Shapley based symmetrization is a method used ...,gpt4_abstract
600,2007.00714v4,rid1,Shapley based symmetrization,Shapley based symmetrization is a method used ...,gpt4_rag
527,2007.00714v4,rid1,joint distribution,"The term ""joint distribution"" refers to the wa...",gpt4_abstract
686,2007.00714v4,rid1,joint distribution,A joint distribution is a way to describe how ...,gpt4_rag


In [7]:
concatenated.loc[
    concatenated['reader_id'] =='rid1'
].to_csv('data/240528_rid1_jargon_pref_annotations_incomplete.csv', index=False)

In [8]:
concatenated.loc[
    concatenated['reader_id'] =='rid0'
].to_csv('data/240528_rid0_jargon_pref_annotations_incomplete.csv', index=False)

# Assess the Eval File

In [9]:
# Read in the completed jargon preference annotations
rid0 = pd.read_csv('data/human_annotations/240528_rid0_jargon_pref_annotations_complete.csv')
rid1 = pd.read_csv('data/human_annotations/240528_rid1_jargon_pref_annotations_complete.csv')

rid0.fillna("", inplace=True)
rid1.fillna("", inplace=True)

In [10]:
rid1.columns

Index(['arxiv_id', 'reader_id', 'human_jargon_term', 'definition_text',
       'model', 'Accuracy ', 'Ranking', 'Notes'],
      dtype='object')

In [11]:
rid0.columns

Index(['arxiv_id', 'reader_id', 'human_jargon_term', 'definition_text',
       'model', 'Accuracy', 'Ranking', 'Notes'],
      dtype='object')

In [12]:
# Rename
rid1.rename({'Accuracy ': 'Accuracy'}, axis=1, inplace=True)
rid1.columns

Index(['arxiv_id', 'reader_id', 'human_jargon_term', 'definition_text',
       'model', 'Accuracy', 'Ranking', 'Notes'],
      dtype='object')

In [13]:
# Concat the dfs
rid0_rid1 = pd.concat([rid0, rid1], axis=0)
rid0_rid1.head()

Unnamed: 0,arxiv_id,reader_id,human_jargon_term,definition_text,model,Accuracy,Ranking,Notes
0,2007.00714v4,rid0,Shapley based symmetrization,Shapley based symmetrization is a method that ...,gpt4_rag,accurate,1,
1,2007.00714v4,rid0,Shapley based symmetrization,Shapley based symmetrization is a method used ...,gpt4_abstract,accurate,2,
2,2007.00714v4,rid0,relabelling nodes,Relabelling nodes refers to changing the names...,gpt4_abstract,accurate,1,
3,2007.00714v4,rid0,relabelling nodes,Relabelling nodes refers to renaming the point...,gpt4_rag,accurate,2,too simplified
4,2007.00714v4,rid0,structure-preserving interventions,Structure-preserving interventions are changes...,gpt4_abstract,accurate,2,a bit contrived in phrasing


In [14]:
rid0_rid1.tail()

Unnamed: 0,arxiv_id,reader_id,human_jargon_term,definition_text,model,Accuracy,Ranking,Notes
467,2403.17873v1,rid1,social misattributions,Social misattributions refer to situations whe...,gpt4_abstract,accurate,0,
468,2403.19436v1,rid1,dearthneoliberalism,"The term ""dearth"" refers to a scarcity or lack...",gpt4_abstract,accurate,1,
469,2403.19436v1,rid1,dearthneoliberalism,It seems there may be a typo in your query. If...,gpt4_rag,no response,2,
470,2403.19436v1,rid1,gig economy,The gig economy refers to a labor market chara...,gpt4_abstract,accurate,0,
471,2403.19436v1,rid1,gig economy,The gig economy is a labor market consisting o...,gpt4_rag,accurate,0,


# Accuracy

In [None]:
rid0_rid1['model'].value_counts()

In [None]:
rid0_rid1.groupby(['reader_id'])['Accuracy'].value_counts(normalize=True)


In [None]:
rid0_rid1['Accuracy'].value_counts(normalize=True)


In [None]:
rid0_rid1.groupby(['model'])['Accuracy'].value_counts(normalize=True)


In [None]:
# View the inaccuracies where BOTH got it wrong
inaccuracies = rid0_rid1.loc[
    rid0_rid1['Accuracy']=='inaccurate'
].reset_index(drop=True)

print(inaccuracies.loc[inaccuracies.duplicated('human_jargon_term', keep=False)].shape)

inaccuracies.loc[inaccuracies.duplicated('human_jargon_term', keep=False)]

In [None]:
rid0_rid1.loc[
    rid0_rid1['Accuracy']=='inaccurate'
].reset_index(drop=True)['model'].value_counts()

In [None]:
rid1.loc[
    rid1['Accuracy']=='inaccurate'
].reset_index(drop=True)['model'].value_counts()

# Wins and Losses

In [None]:
# Drop any abstracts where one or more responses are inaccurate
rid0_rid1_accurate = rid0_rid1.loc[
    rid0_rid1['Accuracy']=='accurate'
]

# Now drop human_jargon_term that only have one item, because you need the paired defitions -- use duplicated
rid0_rid1_accurate = rid0_rid1_accurate.loc[rid0_rid1_accurate.duplicated(subset='human_jargon_term', keep=False)]
rid0_rid1_accurate.head()


In [None]:
rid0_rid1_accurate['Ranking'].value_counts()

1s and 2s express preferences, 0s and -1s express ties

In [None]:
# For the purpose of this exercise, set all -1s to 0
rid0_rid1_accurate = rid0_rid1_accurate.replace(-1, 0)
rid0_rid1_accurate['Ranking'].value_counts()

In [None]:
# Do % wins and losses

# Create a new column 'result' based on the 'ranking' column
rid0_rid1_accurate['result'] = rid0_rid1_accurate['Ranking'].apply(lambda x: 'win' if x == 1 else ('loss' if x == 2 else 'tie'))

# Group by 'model' and 'result', count the occurrences, and calculate the percentages
result_counts = rid0_rid1_accurate.groupby(['model', 'result']).size().unstack(fill_value=0)
result_percentages = result_counts.div(result_counts.sum(axis=1), axis=0) * 100

# Display the result percentages
print(result_percentages)


In [None]:
# Group by 'annotator_id', 'model', and 'result', count the occurrences, and calculate the percentages
result_counts = rid0_rid1_accurate.groupby(['reader_id', 'model', 'result']).size().unstack(fill_value=0)
result_percentages = result_counts.div(result_counts.sum(axis=1), axis=0) * 100

# Display the result percentages
print(result_percentages)

# Making a Dataset for Display

In [15]:
rid0_rid1.head()

Unnamed: 0,arxiv_id,reader_id,human_jargon_term,definition_text,model,Accuracy,Ranking,Notes
0,2007.00714v4,rid0,Shapley based symmetrization,Shapley based symmetrization is a method that ...,gpt4_rag,accurate,1,
1,2007.00714v4,rid0,Shapley based symmetrization,Shapley based symmetrization is a method used ...,gpt4_abstract,accurate,2,
2,2007.00714v4,rid0,relabelling nodes,Relabelling nodes refers to changing the names...,gpt4_abstract,accurate,1,
3,2007.00714v4,rid0,relabelling nodes,Relabelling nodes refers to renaming the point...,gpt4_rag,accurate,2,too simplified
4,2007.00714v4,rid0,structure-preserving interventions,Structure-preserving interventions are changes...,gpt4_abstract,accurate,2,a bit contrived in phrasing


In [32]:
# Drop rid0's terms
display_data = rid0_rid1.loc[
    rid0_rid1['reader_id']=='rid1'
]
# Keep gpt4_abstract terms
display_data = display_data.loc[
    display_data['model']=='gpt4_abstract'
]
# Now drop the inaccurate one
display_data = display_data.loc[
    display_data['Accuracy']=='accurate'
]

In [33]:
display_data.shape

(233, 8)

In [34]:
display_data

Unnamed: 0,arxiv_id,reader_id,human_jargon_term,definition_text,model,Accuracy,Ranking,Notes
1,2007.00714v4,rid1,ANOVA,"ANOVA, or Analysis of Variance, is a statistic...",gpt4_abstract,accurate,2,
3,2007.00714v4,rid1,DAG,A Directed Acyclic Graph (DAG) is a diagram th...,gpt4_abstract,accurate,2,
5,2007.00714v4,rid1,Shapley based symmetrization,Shapley based symmetrization is a method used ...,gpt4_abstract,accurate,0,
7,2007.00714v4,rid1,joint distribution,"The term ""joint distribution"" refers to the wa...",gpt4_abstract,accurate,2,
8,2007.00714v4,rid1,perturb,"The term ""perturb"" means to disturb or alter s...",gpt4_abstract,accurate,2,
...,...,...,...,...,...,...,...,...
462,2403.17873v1,rid1,Social\nTransparency (ST) framework,The Social Transparency (ST) framework is desi...,gpt4_abstract,accurate,0,
465,2403.17873v1,rid1,epistemic injustice,Epistemic injustice occurs when someone is unf...,gpt4_abstract,accurate,0,
467,2403.17873v1,rid1,social misattributions,Social misattributions refer to situations whe...,gpt4_abstract,accurate,0,
468,2403.19436v1,rid1,dearthneoliberalism,"The term ""dearth"" refers to a scarcity or lack...",gpt4_abstract,accurate,1,


In [35]:
# Read in the arXiv metadata
with open("data/arxiv_metadata/filtered/march_2024_ai_hc_cy_peer_reviewed_sampled.json") as json_data:
    metadata = json.load(json_data)
    json_data.close()

# Convert JSON to DataFrame
metadata_df = pd.DataFrame.from_dict(metadata, orient='index')

metadata_df.head()

Unnamed: 0,arxiv_id,url,title,summary,updated,published,authors,comments,categories,primary_category,doi,journal_ref,peer_reviewed
0,2403.16190v1,http://arxiv.org/abs/2403.16190v1,Logic-based Explanations for Linear Support Ve...,Support Vector Classifier (SVC) is a well-know...,1711293284000,1711293284000,"[Francisco Mateus Rocha Filho, Thiago Alves Ro...","16 pages, submitted to BRACIS 2023 (Brazilian ...","[cs.AI, cs.LG, cs.LO, I.2.4; I.2.6]",cs.AI,10.1007/978-3-031-45368-7_10,,True
1,2307.05300v4,http://arxiv.org/abs/2307.05300v4,Unleashing the Emergent Cognitive Synergy in L...,Human intelligence thrives on cognitive synerg...,1711463553000,1689086719000,"[Zhenhailong Wang, Shaoguang Mao, Wenshan Wu, ...",Accepted as a main conference paper at NAACL 2024,"[cs.AI, cs.CL]",cs.AI,,,True
2,2403.16750v1,http://arxiv.org/abs/2403.16750v1,"All Artificial, Less Intelligence: GenAI throu...",Modern hardware designs have grown increasingl...,1711373004000,1711373004000,"[Deepak Narayan Gadde, Aman Kumar, Thomas Nala...",Published in DVCon U.S. 2024,[cs.AI],cs.AI,,,True
3,2311.10112v2,http://arxiv.org/abs/2311.10112v2,zrLLM: Zero-Shot Relational Learning on Tempor...,Modeling evolving knowledge over temporal know...,1710517087000,1700083515000,"[Zifeng Ding, Heling Cai, Jingpei Wu, Yunpu Ma...",Accepted to NAACL 2024 main conference,"[cs.AI, cs.CL, cs.LG]",cs.AI,,,True
4,2310.08992v3,http://arxiv.org/abs/2310.08992v3,CodeChain: Towards Modular Code Generation Thr...,Large Language Models (LLMs) have already beco...,1710386949000,1697192268000,"[Hung Le, Hailin Chen, Amrita Saha, Akash Goku...",Accepted to ICLR 2024,"[cs.AI, cs.CL, cs.PL]",cs.AI,,,True


In [37]:
# Merge
display_data_final = pd.merge(metadata_df, display_data, on='arxiv_id')
display_data_final.shape

(233, 20)

In [38]:
# Save to JSON
display_data_final.to_json(
    'data/data_for_display.json', orient='index'
)