# Arxiv data extraction

This notebook provides code for extracting Arxiv data following the dev documentation here: https://www.kaggle.com/datasets/Cornell-University/arxiv/code

In [1]:
# code snippet below taken from Kaggle docs

import kagglehub
from kagglehub import KaggleDatasetAdapter

file_path = "arxiv-metadata-oai-snapshot.json"

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "Cornell-University/arxiv",
  file_path,
  pandas_kwargs={"lines": True}
  # Provide any additional arguments like 
  # sql_query or pandas_kwargs. See the 
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

  from .autonotebook import tqdm as notebook_tqdm
  df = kagglehub.load_dataset(




In [4]:
print("First 5 records:", df.head())

First 5 records:           id           submitter  \
0  0704.0001      Pavel Nadolsky   
1  0704.0002        Louis Theran   
2  0704.0003         Hongjun Pan   
3  0704.0004        David Callan   
4  0704.0005  Alberto Torchinsky   

                                             authors  \
0  C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...   
1                    Ileana Streinu and Louis Theran   
2                                        Hongjun Pan   
3                                       David Callan   
4           Wael Abu-Shammala and Alberto Torchinsky   

                                               title  \
0  Calculation of prompt diphoton production cros...   
1           Sparsity-certifying Graph Decompositions   
2  The evolution of the Earth-Moon system based o...   
3  A determinant of Stirling cycle numbers counts...   
4  From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...   

                                  comments  \
0  37 pages, 15 figures; published version   


In [5]:
import re
import spacy
import pandas as pd
nlp = spacy.load("en_core_web_md")

paper_triplets = list(zip(df['id'],df['title'], df['abstract'].str.replace('\n', ' ')))
# zip 3-tuples of id, title and abstract

# for a case-insensitive re.match with words from title:
title_keywords = ['AI','LM','LLM','GPT','ChatGPT'] 

# for a case insensitive re.search in title:
title_phrases = ['artificial intelligence','language model']

# for a lemma-based string comparison against entities in the abstract:
keywords = ['AI','LM','LMs','LLM','LLMs','model','system','algorithm'] 
# spaCy lemmatizer does not handle plurals well for LM, LLM, so their plural version was included here too

In [None]:
def arg0_active_criterion_check(sent,keywords,verb_list):

    check = 0

    for chunk in sent.noun_chunks:
        match = any(re.search(rf"\b{re.escape(word)}\b", chunk.text, re.IGNORECASE) for word in keywords)
        if match and chunk.root.dep_ == 'nsubj' and chunk.root.head.lemma_ in verb_list:
            check += 1

    if check > 0:
        return True
    else:
        return False

In [None]:
with open("../preprocessed_data/arxiv_arg0_verbs_active.txt","w") as file:

    sentences_dict = {"SentenceID":[],"currentSentence":[],"prevSentence":[],"nextSentence":[],"abstract":[]}

    counter = 0 # initiate counter
    stop_words = ['do','be','have','show'] 
    verb_list = [v for v in extended_arg0_verbs if v not in stop_words] # exclude stop words
    
    for idx,paper in enumerate(paper_triplets):

        paper_id = paper[0]
        title = paper[1]
        abstract = paper[2]

        if counter == 1000:
            break # stop when counter reaches 1000
            
        title = [token.text for token in title]
        keyword_match = any(re.match(keyword, word, re.IGNORECASE) for keyword in title_keywords for word in title)
        phrase_match = any(re.search(phrase, title.casefold(), re.IGNORECASE) for phrase in title_phrases)
        
        if keyword_match or phrase_match:
            doc = nlp(abstract)
            
            for i,sent in enumerate(doc.sents): # check for matches with the keywords in the noun chunks to find AI entities

                sent_id = paper_id + "_" + str(idx) + "_" + str(i)
                check = arg0_active_criterion_check(sent,keywords,verb_list) 
                # check if at least one of the noun chunks is an AI entity whose root is an anthropomorphic predicate
                
                if check:
                    counter += 1
                    file.write(sent_id+'\t'+sent.text+'\n')
                    sentences_dict["SentenceID"].append(sent_id)
                    sentences_dict["currentSentence"].append(list(doc.sents)[i].text)
                    sentences_dict["abstract"].append(paper.abstract)
                    try:
                        sentences_dict["prevSentence"].append(list(doc.sents)[i-1].text)
                    except IndexError:
                        sentences_dict["prevSentence"].append("")
                    try:
                        sentences_dict["nextSentence"].append(list(doc.sents)[i+1].text)
                    except IndexError:
                        sentences_dict["nextSentence"].append("")
                                
                                    
df = pd.DataFrame(data=sentences_dict)

In [None]:
print(len(set(df["SentenceID"])))
print(len(df["SentenceID"]))