# Data extraction

This notebook provides code for extracting ACL Anthology data following the documentation here: https://acl-anthology.readthedocs.io/latest/api/anthology/ and arXiv data following the documentation here: https://www.kaggle.com/datasets/Cornell-University/arxiv/code

#### 1. Get ACL anthology data

In [2]:
from acl_anthology import Anthology

# Instantiate the Anthology through the library's `from_repo()` constructor; the
# resulting object is queried later in this notebook via `anthology.papers()`.
# NOTE(review): presumably this fetches the official data repository on first use - confirm in the linked API docs.
anthology = Anthology.from_repo()
# Print the installed package metadata so that the library version used for the extraction is recorded in the cell output.
!pip show acl-anthology

Name: acl-anthology
Version: 0.5.1
Summary: A library for accessing the ACL Anthology
Home-page: https://github.com/acl-org/acl-anthology
Author: Marcel Bollmann
Author-email: marcel@bollmann.me
License: Apache-2.0
Location: /Users/doriellelonke/Desktop/thesis/.venv/lib/python3.12/site-packages
Requires: app-paths, attrs, citeproc-py, diskcache, docopt, gitpython, langcodes, lxml, numpy, omegaconf, platformdirs, pylatexenc, python-slugify, PyYAML, rich, rnc2rng, scipy, texsoup
Required-by: 


#### 2. Get arXiv data

In [None]:
# code snippet below taken from Kaggle docs

import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the metadata file inside the Kaggle dataset.
file_path = "arxiv-metadata-oai-snapshot.json"

# Load the latest version of that file through the pandas adapter.
# `pandas_kwargs` is presumably handed on to the pandas reader ({"lines": True}
# would then select JSON Lines parsing) - see the adapter documentation for this
# and for other optional arguments such as sql_query:
# https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "Cornell-University/arxiv",
    file_path,
    pandas_kwargs={"lines": True},
)

  from .autonotebook import tqdm as notebook_tqdm
  df = kagglehub.load_dataset(




#### 3. Get papers in the form of a list of 3-tuples containing paper id, title and abstract

In [1]:
import spacy
nlp = spacy.load("en_core_web_md")

# Build (id, title, abstract) 3-tuples from the arXiv dataframe; newlines inside
# an abstract are replaced by spaces so that each abstract is one line of text.
# NOTE: `df` is created by the arXiv loading cell, which has to be run before this one.
arxiv_paper_triplets = list(zip(df['id'].astype(str),df['title'].astype(str), df['abstract'].str.replace('\n', ' ')))

all_acl_papers = anthology.papers()

# Keep only the ACL papers that have an abstract and convert every field to a plain string.
# The loop previously iterated over `all_papers`, a name that is never defined.
# NOTE(review): `paper.id` is used as the identifier - confirm it is unique across
# the whole anthology and not only within one volume.
acl_paper_triplets = [
    (str(paper.id), str(paper.title), str(paper.abstract))
    for paper in all_acl_papers
    if paper.abstract
]

NameError: name 'df' is not defined

#### 4. Initiate keyword list for finding relevant papers

In [None]:
# Abbreviations tested against every token of a title with a case-insensitive
# re.match. re.match only anchors at the start of the token, so each entry
# effectively acts as a prefix of the token.
title_keywords = ["AI", "LM", "LLM", "GPT", "ChatGPT"]

# Multi-word phrases looked up anywhere in a title with a case-insensitive re.search.
title_phrases = ["artificial intelligence", "language model"]

# AI-entity terms handed to the pattern functions, which search for them as
# whole words (case-insensitively) in the text of the noun chunks of an abstract.
keywords = ["AI", "LM", "LLM", "model", "system", "algorithm", "chatGPT", "GPT"]

#### 5. Extend anthropomorphic wordlists with WordNet

In [4]:
from tools import wordnet_syns as syns

# Extend each anthropomorphic word list through the WordNet helper module.
# Every pair below is (name of the word list, tag passed as second argument);
# the tags 'v', 'a' and 'n' presumably stand for verb, adjective and noun -
# confirm against tools/wordnet_syns.
(
    extended_arg0_verbs,
    extended_arg1_verbs,
    extended_adjectives,
    extended_nouns,
) = [
    syns.extend_word_list(list_name, pos_tag)
    for list_name, pos_tag in (
        ('arg0_verbs', 'v'),
        ('arg1_verbs', 'v'),
        ('adjectives', 'a'),
        ('nouns', 'n'),
    )
]

#### 6. Functions for identifying specific linguistic patterns

In [None]:
import re
import spacy
# Load the spaCy pipeline "en_core_web_md"; `get_sentences` uses this module-level
# `nlp` object to tokenize paper titles and to split abstracts into sentences.
nlp = spacy.load("en_core_web_md")

def arg0_verbs_active(sent,ai_words,anthro_words):

    """
    heuristic test for an active-voice structure in which an AI entity is arg0
    of an anthropomorphic predicate.

    a noun chunk qualifies when (1) its root token carries the dependency label
    'nsubj', (2) its text contains one of the AI words as a whole word
    (case-insensitive) and (3) the lemma of the root's head is one of the
    anthropomorphic words.

    :param sent: sentence from an abstract of a relevant (in-domain) paper
    :type sent: spacy.tokens.span.Span
    :param ai_words: list of AI entities to match inside the sentence
    :type ai_words: list of strings
    :param anthro_words: list of anthropomorphic words (verbs, nouns or adjectives)
    :type anthro_words: list of strings
    :return: True if at least one noun chunk qualifies, otherwise False
    """

    for noun_chunk in sent.noun_chunks:
        subject_root = noun_chunk.root

        # only subjects of active clauses are of interest here
        if subject_root.dep_ != 'nsubj':
            continue

        mentions_ai = any(
            re.search(rf"\b{re.escape(word)}\b", noun_chunk.text, re.IGNORECASE)
            for word in ai_words
        )

        # the governing verb has to be anthropomorphic
        if mentions_ai and subject_root.head.lemma_ in anthro_words:
            return True

    return False

def arg0_verbs_passive(sent,ai_words,anthro_words):

    """
    heuristic test for a passive-voice structure in which an AI entity is arg0
    of an anthropomorphic predicate.

    two conditions must hold for the sentence: (1) some noun chunk has a root
    labelled 'nsubjpass' whose head lemma is one of the anthropomorphic words,
    and (2) some noun chunk with a root labelled 'pobj' contains one of the AI
    words as a whole word (case-insensitive). the two conditions are checked
    independently; the pobj is not required to attach to the passive verb.

    :param sent: sentence from an abstract of a relevant (in-domain) paper
    :type sent: spacy.tokens.span.Span
    :param ai_words: list of AI entities to match inside the sentence
    :type ai_words: list of strings
    :param anthro_words: list of anthropomorphic words (verbs,nouns or adjectives)
    :type anthro_words: list of strings
    :return: True if both conditions hold, otherwise False
    """

    # condition 1: there is a passive anthropomorphic verb
    has_passive_anthro_verb = any(
        noun_chunk.root.dep_ == 'nsubjpass' and noun_chunk.root.head.lemma_ in anthro_words
        for noun_chunk in sent.noun_chunks
    )
    if not has_passive_anthro_verb:
        return False

    # condition 2: an AI entity occurs as the object of a preposition
    for noun_chunk in sent.noun_chunks:
        if noun_chunk.root.dep_ != 'pobj':
            continue
        if any(re.search(rf"\b{re.escape(word)}\b", noun_chunk.text, re.IGNORECASE) for word in ai_words):
            return True

    return False

def arg1_verbs(sent,ai_words,anthro_words):

    """
    heuristic test for a structure in which an AI entity is arg1 of an
    anthropomorphic predicate.

    a noun chunk qualifies when its text contains one of the AI words as a whole
    word (case-insensitive) and either
    - its root is labelled 'dobj' and the lemma of the root's head is one of the
      anthropomorphic words, or
    - its root is labelled 'pobj' and the lemma of the head of the root's head
      is one of the anthropomorphic words.

    :param sent: sentence from an abstract of a relevant (in-domain) paper
    :type sent: spacy.tokens.span.Span
    :param ai_words: list of AI entities to match inside the sentence
    :type ai_words: list of strings
    :param anthro_words: list of anthropomorphic words (verbs,nouns or adjectives)
    :type anthro_words: list of strings
    :return: True if at least one noun chunk qualifies, otherwise False
    """

    for noun_chunk in sent.noun_chunks:
        mentions_ai = any(
            re.search(rf"\b{re.escape(word)}\b", noun_chunk.text, re.IGNORECASE)
            for word in ai_words
        )
        if not mentions_ai:
            continue

        object_root = noun_chunk.root
        if object_root.dep_ == 'dobj':
            governor = object_root.head        # the verb itself
        elif object_root.dep_ == 'pobj':
            governor = object_root.head.head   # skip over the preposition
        else:
            continue

        if governor.lemma_ in anthro_words:
            return True

    return False

#### 7. Functions for iterating over papers and obtaining matching sentences, writing to .txt and .pkl files

In [17]:
import pandas as pd
import pickle 

def get_sentences(dataset,cat,lim):

    """
    this function finds possible candidates for sentences adhering to various linguistic structures.

    only papers whose title matches one of `title_keywords` or `title_phrases` are
    considered. every sentence of such a paper's abstract that satisfies the
    pattern function selected by `cat` is written to
    ../preprocessed_data/{dataset}_{cat}.txt as a tab-separated line
    (sentence id, sentence text) and is also collected in the returned
    dictionary together with its neighbouring sentences and the full abstract.

    if one of the arguments is invalid, a message is printed and an empty result
    is returned; the output file is not opened in that case, so an existing file
    is not overwritten by a call that cannot produce anything.

    :param dataset: name of dataset from which sentences are being extracted ("acl" or "arxiv")
    :type dataset: string
    :param cat: class of linguistic structures from the taxonomy of anthropomorphic language
                ("arg0_verbs_active", "arg0_verbs_passive" or "arg1_verbs")
    :type cat: string
    :param lim: number that limits the number of collected sentences (at most 1000)
    :type lim: int
    :return: dictionary containing candidate sentences, with the keys "SentenceID",
             "currentSentence", "prevSentence", "nextSentence" and "Abstract", each
             mapping to a list with one entry per candidate sentence
    """

    sentences_dict = {"SentenceID":[],"currentSentence":[],"prevSentence":[],"nextSentence":[],"Abstract":[]}

    # --- validate every argument before the output file is touched ---
    # (previously an invalid dataset / cat left local names unbound and the
    # function crashed with UnboundLocalError after truncating the file)
    valid = True

    if lim > 1000:
        print("The provided limit is too big!")
        valid = False

    # the datasets are looked up lazily so that only the requested one has to be loaded
    if dataset == "acl":
        paper_triplets = acl_paper_triplets
    elif dataset == "arxiv":
        paper_triplets = arxiv_paper_triplets
    else:
        print("The provided dataset is not valid.")
        valid = False

    if cat == "arg0_verbs_active":
        list_of_words = extended_arg0_verbs
        criterion_met = arg0_verbs_active
    elif cat == "arg0_verbs_passive":
        list_of_words = extended_arg0_verbs
        criterion_met = arg0_verbs_passive
    elif cat == "arg1_verbs":
        list_of_words = extended_arg1_verbs
        criterion_met = arg1_verbs
    else:
        print("The provided class of structures is not valid.")
        valid = False

    if not valid:
        print("Processed has either finished or was terminated.")
        return sentences_dict

    counter = 0 # number of candidate sentences collected so far
    done = False
    stop_words = ['do','be','have','show']
    anthro_words = [w for w in list_of_words if w not in stop_words] # exclude stop words

    # explicit encoding: abstracts can contain non-ASCII characters and the
    # platform default encoding is not guaranteed to be able to represent them
    with open(f"../preprocessed_data/{dataset}_{cat}.txt","w",encoding="utf-8") as file:

        for idx,paper in enumerate(paper_triplets):

            paper_id = paper[0]
            title = paper[1]
            abstract = paper[2]

            if done:
                print("Processed has either finished or was terminated.")
                break

            # a paper is relevant if its title mentions an AI keyword or phrase
            # NOTE(review): re.match only anchors at the start of a token, so 'AI' also
            # matches tokens such as "aim" - confirm that prefix matching is intended.
            # NOTE(review): only the tokens of the title are needed here; running the
            # full pipeline on every title may be more work than necessary.
            words_in_title = [token.text for token in nlp(title)]
            keyword_match = any(re.match(keyword, word, re.IGNORECASE) for keyword in title_keywords for word in words_in_title)
            phrase_match = any(re.search(phrase, title.casefold(), re.IGNORECASE) for phrase in title_phrases)

            if not (keyword_match or phrase_match):
                continue

            doc = nlp(abstract)
            sentences = list(doc.sents) # materialise once instead of once per lookup

            for i,sent in enumerate(sentences):

                if counter >= lim:
                    done = True
                    break # stop when counter reaches the limit

                # check if at least one of the noun chunks is an AI entity adhering to one of the structures
                if not criterion_met(sent,keywords,anthro_words):
                    continue

                sent_id = paper_id + "_" + str(idx) + "_" + str(i)
                counter += 1

                file.write(sent_id+'\t'+sent.text+'\n')
                sentences_dict["SentenceID"].append(sent_id)
                sentences_dict["currentSentence"].append(sent.text)
                sentences_dict["Abstract"].append(abstract)

                # explicit bounds checks: index -1 does not raise IndexError, so the
                # first sentence used to receive the LAST sentence as its predecessor
                sentences_dict["prevSentence"].append(sentences[i-1].text if i > 0 else "")
                sentences_dict["nextSentence"].append(sentences[i+1].text if i + 1 < len(sentences) else "")

    return sentences_dict

def write_to_files(dataset,cat,lim):

    """
    this function collects candidate sentences with get_sentences and stores
    them as a pickled pandas dataframe in
    ../preprocessed_data/dataframes/{dataset}_{lim}_{cat}.pkl

    :param dataset: name of dataset, passed on to get_sentences
    :type dataset: string
    :param cat: class of linguistic structures, passed on to get_sentences
    :type cat: string
    :param lim: limit on the number of sentences, passed on to get_sentences
    :type lim: int
    :return: None
    """

    candidates = pd.DataFrame(data=get_sentences(dataset,cat,lim))
    pickle_path = f"../preprocessed_data/dataframes/{dataset}_{lim}_{cat}.pkl"
    candidates.to_pickle(pickle_path)

#### 8. Retrieve candidates for sentences for each category

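Change the `cat` argument of `write_to_files` in the cell below (it is passed on to `get_sentences`). The categories of the taxonomy are listed here; note that `get_sentences` as defined in this notebook currently only accepts options 1–3: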
1. arg0_verbs_active - sentences in which the AI entity is arg0 of an anthropomorphic verb in the active voice (nsubj)
2. arg0_verbs_passive - sentences in which the AI entity is arg0 of an anthropomorphic verb in the passive voice (pobj)
3. arg1_verbs - sentences in which the AI entity is arg1 of an anthropomorphic verb
4. adjectival_phrases - sentences in which the AI entity is part of an anthropomorphic adjectival phrase
5. noun_phrases - sentences in which the AI entity is part of an anthropomorphic noun phrase
6. possessives - sentences in which the AI entity is immediately followed by a possessive marker
7. comparisons - sentences in which the AI entity is being compared to humans explicitly

In [18]:
# Extract up to 1000 candidate sentences of the "arg1_verbs" category from the ACL data.
write_to_files(dataset="acl", cat="arg1_verbs", lim=1000)