The purpose of this notebook is to take XML files found in the `/xml` directory, and produce a csv output (`output.csv`) that has the following:
* Article Title
* Impact Statement
* Impact Statement Word Count
* Impact Statement Citation Count

In [1]:
import pandas as pd
import os
import re
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from pyLDAvis import sklearn as sklearn_lda
import pickle 
import pyLDAvis
import xml.etree.ElementTree as ET

In [5]:
#XML files that are already genereated via PDFx are assumed to be in a directory "xml"

directory="xml"
impact_dict={"title":[], "paper identifier":[], "paper link":[], "impact statement":[], "impact title":[], "impact statement word count":[], "impact statement sentence count":[], "citation count":[],
            "has positive":[], "has negative":[], "has opt out":[], "has NA":[]}

#initialize citations_dict, which is a separate dictionary to be generated as a separate CSV file (citation.csv)
citation_dict={"paper title":[],"paper id":[],"citation":[]}

#loops through the directory, and appends the relevant information to impact_dict, which will be appended to the dataframe later
for filename in os.listdir(directory):
    #to exclude "sample.xml"
    if filename.endswith(".pdfx.xml"):
        full_path = os.path.join(directory, filename)
        tree = ET.parse(full_path)
        root = tree.getroot()
        #get article title
        #initialize a list of citations for this document
        citation_ref = []
        for section in root[1][0][0]:
            if section.tag=="article-title":
                title = section.text
        for section in root[1][1]:
            citations = 0
            signal = 0
            for child in section:
                if signal == 1 :
                    #print(section.text)
                    #broader_dict[filename] = section.text
                    #loop through any xrefs to count for citations
                    for xref in child:
                        #narrow down xref citations to bibliography references
                        if xref.tag == "xref" and xref.attrib['ref-type'] == "bibr":
                            #use "rid" as the identifier, so we come out of this with a list of references
                            citation_ref.append(xref.attrib['rid'])
                            citations +=1
                    #itertext will make sure that if there are any tags within the section, we still get the whole thing.
                    impact_statement_text=''.join(child.itertext())
                    impact_statement_number_of_words=len(impact_statement_text.split())
                    #add count for setences using delimeters of ".", "?", and "!"
                    impact_statement_number_of_sentences=len(re.split("\.|\?|!", impact_statement_text))-1
                    #will identify the hash based off of this pattern "86d7c8a08b4aaa1bc7c599473f5dddda-Paper.pdfx.xml"
                    paper_identifier = re.search("(\w*)(-Paper)", filename)
                    #check if "positive" is in the statement
                    has_positive = "True" if "positive" in impact_statement_text.lower() else "False"
                    #check if "negative" is in the statement
                    has_negative = "True" if "negative" in impact_statement_text.lower() else "False"
                    #check if it has the NeurIPS opt-out phrase
                    has_opt_out = "True" if "this work does not present any foreseeable societal consequence" in impact_statement_text.lower() else "False"
                    #check if it has "Not Applicable"
                    has_NA = "True" if "not applicable" in impact_statement_text.lower() else "False"
                    #add everything to the dictionary
                    impact_dict["impact title"].append(impact_statement_title)
                    impact_dict["impact statement"].append(impact_statement_text)
                    impact_dict["impact statement word count"].append(impact_statement_number_of_words)
                    impact_dict["impact statement sentence count"].append(impact_statement_number_of_sentences)
                    impact_dict["citation count"].append(citations)
                    impact_dict["title"].append(title)
                    impact_dict["paper identifier"].append(paper_identifier[1])
                    impact_dict["paper link"].append("https://proceedings.neurips.cc/paper/2020/file/" + paper_identifier[1] + "-Paper.pdf")
                    impact_dict["has positive"].append(has_positive)
                    impact_dict["has negative"].append(has_negative)
                    impact_dict["has opt out"].append(has_opt_out)
                    impact_dict["has NA"].append(has_NA)
                    signal = 0
                    #print(citation_ref)
                #focus on heading
                if "impact" in str(child.text).lower() and child.tag == "h1":
                    #print("It has a Broader Impact!")
                    #log the title of the broader impact statement
                    impact_statement_title = child.text
                    signal=1
                elif str(child.text).lower() == "broader impact" and child.tag == "h1":
                    impact_statement_title = child.text
                    signal=1
                elif str(child.text).lower() == "broader impacts" and child.tag == "h1":
                    impact_statement_title = child.text
                    signal=1
            #identify the bibliography
            if section.attrib["class"] == "DoCO:Bibliography":
                #loop through the bibliography section, but we really only want one part
                for references in section:
                    if references.attrib["class"] == "DoCO:BiblioGraphicReferenceList":
                        #loop through all the entries in the reference list
                        for citation in references:
                            #the try statement is because if the bibliography is across multiple pages, there will be entries with no "rid", so we account for that with a keyerror.
                            try:
                                #check if the citation is in the citation_ref we established earlier
                                if citation.attrib["rid"] in citation_ref:
                                    citation_dict["paper title"].append(title)
                                    citation_dict["paper id"].append(paper_identifier[1])
                                    citation_dict["citation"].append(citation.text)
                            except KeyError:
                                continue

  impact_statement_number_of_sentences=len(re.split("\.|\?|!", impact_statement_text))-1
  paper_identifier = re.search("(\w*)(-Paper)", filename)


In [6]:
#create the dataframe for the output from the dictionary
impact_statements =pd.DataFrame.from_dict(impact_dict)

#create the dataframe for the citations from the dictionary
total_citations =pd.DataFrame.from_dict(citation_dict)

In [7]:
#generate the CSV file from the dataframe

impact_statements.to_csv("output.csv",index=False)

#generate the CSV file for the citations
total_citations.to_csv("citations.csv",index=False)

# Analysis section

This is based off of [this](https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0) guide.

In [None]:
impact_statements['processed'] = impact_statements['content'].map(lambda x: re.sub('[^a-zA-Z0-9 ]', '', x))
impact_statements['processed'] = impact_statements['processed'].map(lambda x: x.lower())

In [None]:
#create a wordcloud
# Join the different processed titles together.
long_string = ','.join(list(impact_statements['processed'].values))
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')
# Generate a word cloud
wordcloud.generate(long_string)
# Visualize the word cloud
wordcloud.to_image()

In [None]:
sns.set_style('whitegrid')
%matplotlib inline

# Helper function
def plot_10_most_common_words(count_data, count_vectorizer):
    words = count_vectorizer.get_feature_names()
    total_counts = np.zeros(len(words))
    for t in count_data:
        total_counts+=t.toarray()[0]
    
    count_dict = (zip(words, total_counts))
    count_dict = sorted(count_dict, key=lambda x:x[1], reverse=True)[0:10]
    words = [w[0] for w in count_dict]
    counts = [w[1] for w in count_dict]
    x_pos = np.arange(len(words)) 
    
    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title='10 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    sns.barplot(x_pos, counts, palette='husl')
    plt.xticks(x_pos, words, rotation=90) 
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()

# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words='english')
# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(impact_statements['processed'])
# Visualise the 10 most common words
plot_10_most_common_words(count_data, count_vectorizer)

In [None]:
warnings.simplefilter("ignore", DeprecationWarning)
 
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
# Tweak the two parameters below
number_topics = 4
number_words = 10
# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)

In [None]:
#%%time
LDAvis_data_filepath = os.path.join('./ldavis_prepared_'+str(number_topics))
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself

#if 1 == 1:
LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)

with open(LDAvis_data_filepath, 'wb') as f:
    pickle.dump(LDAvis_prepared, f)
        
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, "rb") as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, './ldavis_prepared_'+ str(number_topics) +'.html')