# ACL anthology data extraction

This notebook provides code for extracting ACL Anthology data, following the developer documentation at https://acl-anthology.readthedocs.io/latest/api/anthology/

#### 1. Get ACL Anthology data

In [None]:
from acl_anthology import Anthology
anthology = Anthology.from_repo()
!pip show acl-anthology

#### 2. Extend anthropomorphic wordlists with WordNet

In [2]:
from tools import wordnet_syns as syns

# Extend each seed word list through the project's WordNet helper, in the same
# order as before (arg0 verbs, arg1 verbs, adjectives, nouns).
# NOTE(review): the second argument is presumably a WordNet part-of-speech tag
# (verb / adjective / noun) -- verify in tools.wordnet_syns.
(
    extended_arg0_verbs,
    extended_arg1_verbs,
    extended_adjectives,
    extended_nouns,
) = (
    syns.extend_word_list(list_name, pos_tag)
    for list_name, pos_tag in (
        ('arg0_verbs', 'v'),
        ('arg1_verbs', 'v'),
        ('adjectives', 'a'),
        ('nouns', 'n'),
    )
)

['render', 'betray', 'fulfil', 'suffer', 'hear', 'name', 'bear', 'lie', 'examine', 'see', 'evolve', 'bring on', 'be after', 'view', 'instance', 'mistrust', 'establish', 'carry out', 'even up', 'give rise', 'see to it', 'have_taste_for', 'forget', 'think back', 'dream', 'befuddle', 'theorise', 'grow', 'collaborate', 'blur', 'build up', 'describe', 'envision', 'put up', 'have', 'con', 'brook', 'pass on', 'say', 'sympathise', 'misunderstand', 'feign', 'cerebrate', 'take on', 'execute', 'sense', 'exemplify', 'conciliate', 'favour', 'draw a blank', 'fulfill', 'recrudesce', 'have in mind', 'signify', 'catch up with', 'educate', 'demonstrate', 'seize', 'opine', 'want', 'differentiate', 'ideate', 'figure', 'take apart', 'understand', 'watch', 'conjecture', 'break', 'conceive', 'larn', 'fuddle', 'mean', 'visit', 'try', 'pay', 'discombobulate', 'line', 'realize', 'stomach', 'order', 'identify', 'cozen', 'study', 'acquire', 'make believe', 'make', 'dwell', 'take over', 'wish well', 'cover', 'make

#### 3. Get papers from the ACL Anthology as an iterable object and initialize keywords for matching relevant titles and abstracts

In [10]:
import re
import spacy
import pandas as pd
import pickle 

# --- Title filters -----------------------------------------------------------
# Each keyword is tried with a case-insensitive re.match against every token
# of a paper title.
title_keywords = [
    'AI',
    'LM',
    'LLM',
    'GPT',
    'ChatGPT',
]

# Each phrase is tried with a case-insensitive re.search over the whole title.
title_phrases = [
    'artificial intelligence',
    'language model',
]

# --- Abstract filter ---------------------------------------------------------
# Words that mark a noun chunk in an abstract sentence as an AI entity.
# The plural forms 'LMs' and 'LLMs' are listed explicitly because the spaCy
# lemmatizer does not handle the plurals of LM and LLM well.
keywords = [
    'AI',
    'LM',
    'LMs',
    'LLM',
    'LLMs',
    'model',
    'system',
    'algorithm',
]

# --- Shared resources --------------------------------------------------------
# spaCy pipeline used by the later cells for tokenizing and parsing.
nlp = spacy.load("en_core_web_md")

# Papers to scan, as handed out by the Anthology object.
all_papers = anthology.papers()

#### 4. Functions for retrieving specific patterns for each class from the taxonomy of anthropomorphic structures

In [11]:
def arg0_active_criterion_check(sent,keywords,verb_list):
    """Tell whether ``sent`` has a keyword-bearing subject chunk governed by a listed verb.

    A noun chunk of ``sent`` qualifies when all of the following hold:

    1. its text contains at least one entry of ``keywords`` as a whole word
       (case-insensitive regular-expression search, keyword escaped);
    2. the dependency label of its root token is ``nsubj``;
    3. the lemma of the root token's head is contained in ``verb_list``.

    Parameters
    ----------
    sent
        Object exposing ``noun_chunks``; each chunk must provide ``text`` and
        ``root`` (with ``dep_`` and ``head.lemma_``).
    keywords
        Iterable of strings to look for inside the chunk text.
    verb_list
        Container of verb lemmas accepted as the governing predicate.

    Returns
    -------
    bool
        ``True`` if at least one noun chunk qualifies, otherwise ``False``.
    """

    def mentions_keyword(text):
        # whole-word, case-insensitive search for any of the keywords
        return any(
            re.search(rf"\b{re.escape(term)}\b", text, re.IGNORECASE)
            for term in keywords
        )

    qualifying_chunks = [
        chunk
        for chunk in sent.noun_chunks
        if mentions_keyword(chunk.text)
        and chunk.root.dep_ == 'nsubj'
        and chunk.root.head.lemma_ in verb_list
    ]

    return len(qualifying_chunks) > 0

#### 5. General function for retrieving sentences matching a criterion

In [23]:
def get_sentences(cat, max_sentences=1000):
    """Collect abstract sentences that satisfy the criterion for category ``cat``.

    Papers are read from the notebook-level ``all_papers``. A paper is analysed
    only if it has an abstract AND its title either contains a token matched by
    one of ``title_keywords`` or contains one of ``title_phrases``. Every
    sentence of a selected abstract is tested with the criterion check that
    belongs to ``cat``. Each hit is written as a ``<sentence id><TAB><text>``
    line to ``../preprocessed_data/acl_{cat}.txt`` and recorded, together with
    its neighbouring sentences and the full abstract, in the returned dict.

    Parameters
    ----------
    cat : str
        Category of anthropomorphic structure to look for. Currently only
        ``"arg0_verbs_active"`` is supported.
    max_sentences : int, default 1000
        Stop as soon as this many matching sentences have been collected.

    Returns
    -------
    dict
        Column-oriented dict with the keys ``SentenceID``, ``currentSentence``,
        ``prevSentence``, ``nextSentence`` and ``Abstract``; all lists have the
        same length. ``prevSentence`` / ``nextSentence`` are ``""`` for the
        first / last sentence of an abstract.

    Raises
    ------
    ValueError
        If ``cat`` is not a supported category.

    Notes
    -----
    NOTE(review): ``all_papers`` is consumed by this function. If it is a
    one-shot iterator, a second call continues where the previous call stopped
    instead of starting over -- confirm and re-create it before each call if so.
    """

    # Validate before the output file is opened: an unknown category would
    # otherwise truncate the file and then fail on an unbound `criterion_met`.
    supported_categories = ("arg0_verbs_active",)
    if cat not in supported_categories:
        raise ValueError(
            f"Unsupported category {cat!r}; expected one of {supported_categories}"
        )

    sentences_dict = {"SentenceID":[],"currentSentence":[],"prevSentence":[],"nextSentence":[],"Abstract":[]}
    counter = 0 # number of matching sentences collected so far

    # explicit encoding: abstracts contain non-ASCII text and the platform
    # default encoding is not guaranteed to handle it
    with open(f"../preprocessed_data/acl_{cat}.txt", "w", encoding="utf-8") as file:

        if max_sentences <= 0:
            return sentences_dict

        stop_words = ['do','be','have','show']
        verb_list = [v for v in extended_arg0_verbs if v not in stop_words] # exclude stop words

        for idx, paper in enumerate(all_papers):

            # Without an abstract there is nothing to analyse. (The original
            # condition `abstract and keyword_match or phrase_match` let such
            # papers through on a phrase match because `and` binds tighter
            # than `or`.)
            if not paper.abstract:
                continue

            title_text = str(paper.title)
            # Only the token texts are needed, so tokenize without running the
            # rest of the pipeline.
            title_tokens = [token.text for token in nlp.make_doc(title_text)]

            # NOTE(review): re.match anchors only at the start of the token, so
            # 'AI' also matches tokens such as "aim" -- confirm this prefix
            # matching is intended.
            keyword_match = any(
                re.match(keyword, word, re.IGNORECASE)
                for keyword in title_keywords
                for word in title_tokens
            )
            phrase_match = any(
                re.search(phrase, title_text.casefold(), re.IGNORECASE)
                for phrase in title_phrases
            )
            if not (keyword_match or phrase_match):
                continue

            abstract_text = str(paper.abstract)
            doc = nlp(abstract_text)
            sents = list(doc.sents) # materialize once; reused for neighbour lookup

            for i, sent in enumerate(sents):

                # check if at least one of the noun chunks is an AI entity whose root is an anthropomorphic predicate
                if cat == "arg0_verbs_active":
                    criterion_met = arg0_active_criterion_check(sent, keywords, verb_list)

                if not criterion_met:
                    continue

                counter += 1
                sent_id = f"{paper.id}_{idx}_{i}"
                file.write(sent_id + '\t' + sent.text + '\n')

                sentences_dict["SentenceID"].append(sent_id)
                sentences_dict["currentSentence"].append(sent.text)
                sentences_dict["Abstract"].append(abstract_text)
                # Explicit bounds checks: index -1 would silently return the
                # LAST sentence as the "previous" one for i == 0.
                sentences_dict["prevSentence"].append(sents[i - 1].text if i > 0 else "")
                sentences_dict["nextSentence"].append(sents[i + 1].text if i + 1 < len(sents) else "")

                if counter >= max_sentences:
                    return sentences_dict # limit reached; the `with` block closes the file

    return sentences_dict
                                    
# `sentences_dict` exists only inside get_sentences(); building a DataFrame from
# it at notebook level raises NameError on a fresh kernel. The DataFrame is
# created from the function's return value in the cell that calls get_sentences().

#### Retrieve candidates for sentences in which the AI entity is the arg0 of an anthropomorphic predicate in the active voice. 

In [25]:
# Run the extraction for the active-voice arg0 category ...
arg0_verbs_dict = get_sentences(cat="arg0_verbs_active")

# ... turn the column-oriented result into a table ...
arg0_verbs_df = pd.DataFrame(arg0_verbs_dict)

# ... and persist it for the downstream notebooks.
arg0_verbs_df.to_pickle(
    path="../preprocessed_data/dataframes/acl_1000_arg0_verbs.pkl"
)