In [70]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import pathlib
import requests

In [264]:
data_path = pathlib.Path.cwd().parent.parent / 'datasets' / 'inv_urls.csv'
inv_urls = pd.read_csv(data_path)

In [534]:
xml_urls = [x for x in inv_urls.fulltext_url if 'xml' in x]
print('Number of documents with full-text in XML format: ', len(xml_urls))

basic_urls = [x for x in inv_urls.fulltext_url if 'xml' not in x]
print('Number of documents with full-text in basic text format: ', len(basic_urls))


Number of documents with full-text in XML format:  2234
Number of documents with full-text in basic text format:  233


In [529]:
def open_url (url):
    
    '''
    Opens URL, returns HTML.
    '''
    
    test_response=requests.get(url) #open url
    html_page_source = str(test_response.content)[2:-1] #access html
    return html_page_source

def get_fulltext(html):
    
    '''
    Takes in full-text XML file.
    Returns full-text, in unparsed format and separated by heading.
    '''
    
    soup = BeautifulSoup(html, 'xml')
    
    # get full text, not seperated by heading
    # safest option, least likely to lose any info this way
    messy_full = soup.get_text()
    
    #initialize all heading sections to 'Null'
    agency1 = 'Null'
    sub_agency1 = 'Null'
    dep_doc = 'Null'
    subject = 'Null'
    agency2 = 'Null'
    action = 'Null'
    summary = 'Null'
    dates = 'Null'
    addresses = 'Null'
    signature_dated = 'Null'
    signature_name = 'Null'
    signature_title = 'Null'
    fr_doc = 'Null'
    billing_code = 'Null'
    supp_info = 'Null'
    consultation = 'Null'
    h_d_r = 'Null'
    determinations = 'Null'
    a_r_d = 'Null'
    
    if soup.find('AGENCY') != None:
        agency1 = soup.AGENCY.get_text() # difference between agency1 and agency2?
    if soup.find('SUBAGY') != None:
        sub_agency1 = soup.SUBAGY.get_text()
    if soup.find('DEPDOC')!= None:
        dep_doc = soup.DEPDOC.get_text()
    if soup.find('SUBJECT') != None:
        subject = soup.SUBJECT.get_text()
    if soup.find('AGY') != None:
        agency2 = soup.AGY.P.get_text() # difference between agency1 and agency2?
    if soup.find('ACT') != None:
        action = soup.ACT.P.get_text()
    if soup.find('SUM') != None:
        summary = soup.SUM.P.get_text()
    if soup.find('DATES') != None:
        dates = soup.DATES.P.get_text()
    if soup.find('ADD') != None:
        addresses = soup.ADD.P.get_text()
    if soup.find('DATED') != None:
        signature_dated = soup.DATED.get_text() # should we strip 'Dated: ' from beginning?      
    if soup.find('NAME') != None:
        signature_name = soup.NAME.get_text() 
    if soup.find('TITLE') != None:
        signature_title = soup.TITLE.get_text()
    if soup.find('FRDOC') != None:
        fr_doc = soup.FRDOC.get_text()
    if soup.find('BILCOD') != None:
        billing_code = soup.BILCOD.get_text()
        
    if soup.find('SUPLINF') != None:
        supp_info_list = soup.SUPLINF.children
        supp_info = ''
        for i, x in enumerate(supp_info_list):
            #get all text from headings and paragraphs
            #this should skip newline characters and signature info
            #may be missing important things here! should be preserved in messy_full
            if (x.name=='HD') or (x.name=='P'):
                supp_info+= x.get_text() + ' \n ' # concatenate supplementary info into single string
                                                # separate by newline character
                                                # does not remove special characters (eg bullet points)
    
    #populate 4 un-nested categories
                # 'Consultation' 
                # 'History and Description of the Remains' 
                # 'Determinations Made by __________'
                # 'Additional Requesters and Disposition'
    #makes potentially unsafe assumptions re format
    #could be functionized for easier editing
    if soup.find('HD')!=None:
            headings = soup.find_all('HD')
            for x in headings:
                if 'consultation' in x.string.lower():
                    consultation = ''
                    after_consultation = [x for x in list(x.next_siblings) if x.name=='P' or x.name=='HD']
                    for a in after_consultation:
                        if a.name=='P': 
                            consultation+=a.get_text()+' \n '
                            continue
                        if 'SOURCE' in a.attrs:
                            if a['SOURCE'] == "HD1":
                            #assume that "HD1" would mark new section
                            #should debug this with example
                                break
                        consultation+=a.get_text()+' \n '
                        
                if 'history and description of the remains' in x.string.lower():
                    h_d_r = ''
                    after_hdr = [x for x in list(x.next_siblings) if x.name=='P' or x.name=='HD']
                    for a in after_hdr:
                        if a.name=='P': 
                            h_d_r+=a.get_text()+' \n '
                            continue
                        if 'SOURCE' in a.attrs:
                            if a['SOURCE'] == "HD1":
                            #assume that "HD1" would mark new section
                            #should debug this with example
                                break
                        h_d_r+=a.get_text()+' \n '
                        
                if 'determinations made by' in x.string.lower():
                    determinations = x.get_text()+' \n ' #include heading because varies based on institution
                    after_det = [x for x in list(x.next_siblings) if x.name=='P' or x.name=='HD']
                    for a in after_det:
                        if a.name=='P': 
                            determinations+=a.get_text() +' \n '
                            continue
                        if 'SOURCE' in a.attrs:
                            if a['SOURCE'] == "HD1":
                            #assume that "HD1" would mark new section
                            #should debug this with example
                                break
                        determinations+=a.get_text() +' \n '
                        
                if 'additional requesters and disposition' in x.string.lower() \
                or 'additional requestors and disposition' in x.string.lower():
                    a_r_d = ''
                    after_ard = [x for x in list(x.next_siblings) if x.name=='P' or x.name=='HD']
                    for a in after_ard:
                        if a.name=='P': 
                            a_r_d+=a.get_text()+' \n '
                            continue
                        if 'SOURCE' in a.attrs:
                            if a['SOURCE'] == "HD1":
                            #assume that "HD1" would mark new section
                            #should debug this with example
                                break
                        a_r_d+=a.get_text()+' \n '
                        
    return (messy_full, agency1, sub_agency1, dep_doc, subject, 
            agency2, action, summary, dates, addresses, 
            signature_dated, signature_name, signature_title, 
            fr_doc, billing_code, supp_info, 
            consultation, h_d_r, determinations, a_r_d)

def get_fulltext_from_basictext (html):
    
    '''
    Should take in basic text format HTML.
    Should return full-text, in unparsed format and separated by heading.
    
    May not need to separate this function. 
    get_fulltext(html) can read basic text (though it can't parse the headings)
    Could parse later (after importation) with regex?
    '''
    
    soup = BeautifulSoup(xml, 'xml')
    return

## Testing/Debugging

In [530]:
html = open_url(inv_urls.fulltext_url[2400])

messy_full, agency1, sub_agency1, dep_doc, subject, \
agency2, action, summary, dates, addresses, \
signature_dated, signature_name, signature_title, \
fr_doc, billing_code, supp_info, \
consultation, h_d_r, determinations, a_r_d = get_fulltext(html)

In [525]:
inv_urls.loc[2400, 'fulltext_url']

'https://www.govinfo.gov/content/pkg/FR-1997-08-29/html/97-23108.htm'

In [511]:
inv_urls.loc[2400, 'Link']

'https://www.federalregister.gov/documents/2019/11/08/2019-24397/notice-of-inventory-completion-arkansas-archeological-survey-fayetteville-ar-correction'

In [531]:
print(action)

Null


In [532]:
print(subject)

Null


In [533]:
print(messy_full)

\n\nFederal Register, Volume 62 Issue 168 (Friday, August 29, 1997)\n\n[Federal Register Volume 62, Number 168 (Friday, August 29, 1997)]\n[Notices]\n[Pages 45873-45874]\nFrom the Federal Register Online via the Government Publishing Office [www.gpo.gov]\n[FR Doc No: 97-23108]\n\n\n-----------------------------------------------------------------------\n\nDEPARTMENT OF THE INTERIOR\n\nNational Park Service\n\n\nNotice of Inventory Completion for Native American Human Remains, \nAssociated Funerary Objects, and Unassociated Funerary Objects from the \nVicinity of Cronise Basin, San Bernardino County, CA in the Possession \nof the California State Office, Bureau of Land Management, Sacramento, \nCA\n\nAGENCY: National Park Service\n\nACTION: Notice\n\n-----------------------------------------------------------------------\n\n    Notice is hereby given in accordance with provisions of the Native \nAmerican Graves Protection and Repatriation Act (NAGPRA), 25 U.S.C. \n3003 (d), of the compl