In [1]:
import pandas as pd
import numpy as np
import pathlib

In [2]:
# Load the three datasets this notebook works on.

# The 'datasets' folder sits two directories above the current working directory.
base_path = pathlib.Path.cwd().parent.parent.joinpath('datasets')
inv_path = base_path.joinpath('inv_urls.csv')
full_path = base_path.joinpath('fulltext.pckl.gz')
meta_path = base_path.joinpath('metadata.pckl.gz')

inv_urls = pd.read_csv(inv_path)

# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
fulltext, metadata = (
    pd.read_pickle(pickle_path, compression='gzip')
    for pickle_path in (full_path, meta_path)
)


# This notebook will attempt to process the "Title" column in inv_urls, with the goal of extracting the institution.

### It is assumed that "Title" is in the following format:  [Institution], [City], [State, 2-letter abbreviation]

In [3]:
# view all "Titles" with fewer than three items (as separated by commas)

# One entry per call to split_len: the number of comma-separated items seen.
split_lens = []


def split_len(x):
    """Record how many comma-separated items ``x`` has and return the first one.

    The item count is appended to the module-level ``split_lens`` list as a
    side effect; the return value is the text before the first comma (or all
    of ``x`` when it contains no comma).
    """
    first_item, _, _ = x.partition(',')
    # n commas always delimit n + 1 items, including empty ones.
    split_lens.append(x.count(',') + 1)
    return first_item

# Called only for its side effect: split_len appends one count per Title
# to split_lens; the mapped result itself is not needed.
inv_urls['Title'].map(split_len)

has_under_three_items = np.array(split_lens) < 3
# argwhere yields one length-1 position array per matching row
split_under_three = np.argwhere(has_under_three_items)

for row_position in split_under_three:
    matching_titles = inv_urls.iloc[row_position]['Title']
    print(matching_titles.values)

['U.S. Army Corps of Engineers, Huntington District; Correction']
['Phoebe A. Hearst Museum of Anthropology at the University of California, Berkeley']
['Information Collection Request Sent to the Office of Management and Budget (OMB) for Approval; Use of iNaturalist by the National Park Service To Record Natural History Observations']
['Notice of Completion of Inventory of Native American Human Remains From the Hawaiian Islands in the Collections of the Peabody Museum of Natural History, Yale University; Correction']
['Center for Archaeological Research at the University of Texas at San Antonio, TX']
['Extension of Time for Inventory']
['Notice of Inventory Completion for Native American Human Remains and Associated Funerary Objects in the Possession of The State Museum of Pennsylvania, Harrisburg; Correction']
['Missouri Department of Natural Resources']
['Notice of Inventory Completion for Native American Human Remains and Associated Funerary Objects From the State of Texas in the P

In [4]:
def _split_title(title):
    """Split a Title into ``(institution, city, state)`` on ', '.

    The last two items are taken as city and state; everything before them is
    re-joined with ', ' as the institution.  A Title with fewer than three
    items does not fit that pattern, so all three fields become the
    placeholder string 'nan'.
    """
    parts = title.split(', ')
    if len(parts) <= 2:
        return 'nan', 'nan', 'nan'
    return ', '.join(parts[:-2]), parts[-2], parts[-1]


def _trim_state(state):
    """Drop one trailing space, then one trailing period, then one trailing ')'.

    ``str.endswith`` is used rather than indexing ``state[-1]`` so that an
    empty string (e.g. from a Title ending in ', ') is returned unchanged
    instead of raising IndexError.
    """
    for trailing_char in (' ', '.', ')'):
        if state.endswith(trailing_char):
            state = state[:-1]
    return state


# Split every Title once and reuse the pieces for all three columns.
title_fields = inv_urls['Title'].map(_split_title)

# set Institution as Title, less last 2 items
inv_urls['Institution'] = title_fields.map(lambda fields: fields[0])

# set City as 2nd to last item in Title
inv_urls['City'] = title_fields.map(lambda fields: fields[1])

# set State as last item in Title, trimming a trailing space, period and parenthesis
inv_urls['State'] = title_fields.map(lambda fields: _trim_state(fields[2]))

# Create 'Correction' column to hold information from Title
def correction(x):
    """Return how many correction markers the Title ``x`` carries (0, 1 or 2).

    2 is returned when the text 'Correction; Correction' occurs in ``x``,
    1 when 'Correction' occurs in any other form, and 0 otherwise.
    """
    if 'Correction' not in x:
        return 0
    return 2 if 'Correction; Correction' in x else 1
inv_urls['Correction'] = inv_urls.Title.map(correction)


def _strip_correction_suffix(state, suffix='; Correction'):
    """Remove every trailing ``suffix`` marker from ``state``.

    The marker is only removed when it is actually the end of the string, so a
    value that merely contains the marker somewhere in the middle is returned
    unchanged rather than losing its last ``len(suffix)`` characters.
    Repeated markers ('...; Correction; Correction') are all removed.
    """
    while suffix and state.endswith(suffix):
        state = state[:-len(suffix)]
    return state


# remove trailing "; Correction" markers from "State"
inv_urls['State'] = inv_urls.State.map(_strip_correction_suffix)

In [5]:
# count the number of records where "State" has more than two characters
state_lengths = np.array([len(state) for state in inv_urls['State']])
# row positions (not index labels) of the over-long State values
state_over_two = np.flatnonzero(state_lengths > 2)
print(state_over_two.size)

31


In [6]:
# view all records where "State" has more than two characters
labelled_columns = (
    ('Title: ', 'Title'),
    ('Institution: ', 'Institution'),
    ('City: ', 'City'),
    ('State: ', 'State'),
)

for row_position in state_over_two:
    print(row_position)
    flagged_row = inv_urls.iloc[row_position]
    for label, column in labelled_columns:
        print(label, flagged_row[column])
    print('\n')

399
Title:  U.S. Army Corps of Engineers, Huntington District; Correction
Institution:  nan
City:  nan
State:  nan


404
Title:  Museum of Natural History and Planetarium, Roger Williams Park, Providence RI
Institution:  Museum of Natural History and Planetarium
City:  Roger Williams Park
State:  Providence RI


502
Title:  Phoebe A. Hearst Museum of Anthropology at the University of California, Berkeley
Institution:  nan
City:  nan
State:  nan


594
Title:  Carnegie Museum of Natural History, Pittsburgh, Pennsylvania; Correction
Institution:  Carnegie Museum of Natural History
City:  Pittsburgh
State:  Pennsylvania


653
Title:  Dallas Water Utilities, Dallas, Texas
Institution:  Dallas Water Utilities
City:  Dallas
State:  Texas


657
Title:  Information Collection Request Sent to the Office of Management and Budget (OMB) for Approval; Use of iNaturalist by the National Park Service To Record Natural History Observations
Institution:  nan
City:  nan
State:  nan


781
Title:  Informat

In [7]:
# Manually clean all records where "State" has more than two characters.
#
# manual_fixes maps an inv_urls row label to {column name: corrected value}.
# NOTE(review): the count above reported 31 flagged rows, but only 30 labels
# are handled here (27 corrected + 3 dropped) -- confirm none was missed.
manual_fixes = {
    2466: {
        'Institution': 'Peabody Museum of Natural History, Yale University',
        'City': 'New Haven',
        'State': 'CT',
    },
    # correct as per https://www.bishopmuseum.org/
    2465: {
        'Institution': 'Bernice Pauahi Bishop Museum',
        'City': 'Honolulu',
        'State': 'HI',
    },
    2464: {
        'Institution': 'Peabody Museum of Archaeology and Ethnology, Harvard',
        'City': 'Cambridge',
        'State': 'MA',
    },
    # correct as per https://en.wikipedia.org/wiki/Marine_Corps_Base_Hawaii
    2463: {
        'Institution': 'U.S. Marine Corps Air Station Kaneohe Bay',
        'City': 'Honolulu',
        'State': 'HI',
    },
    2461: {
        'Institution': 'National Park Service',
        'City': 'Washington',
        'State': 'D.C.',
    },
    # correct location as per https://history.sd.gov/archaeology/default.aspx
    2460: {
        'Institution': 'South Dakota State Archaeological Research Center',
        'City': 'Rapid City',
        'State': 'SD',
    },
    2454: {
        'Institution': 'Fort Hood Archeological Laboratory',
        'City': 'Fort Hood',
        'State': 'TX',
    },
    2435: {
        'Institution': 'Bandelier National Monument, National Park Service',
        'City': 'Los Alamos',
        'State': 'NM',
    },
    # this record's Correction flag is also set by hand
    2217: {
        'Institution': 'University of Nebraska State Museum, University of Nebraska-Lincoln',
        'City': 'Lincoln',
        'State': 'NE',
        'Correction': 1,
    },
    2157: {
        'Institution': 'California Department of Transportation (CALTRANS), Anthropological Studies Center (ASC), Archaeological Collections Facility, Sonoma State University',
        'City': 'Rohnert Park',
        'State': 'CA',
    },
    2127: {
        'Institution': 'Bureau of Indian Affairs, Arizona State Museum',
        'City': 'Tucson',
        'State': 'AZ',
    },
    1967: {
        'Institution': 'U.S. Department of the Interior, National Park Service, Death Valley National Park',
        'City': 'Death Valley',
        'State': 'CA and NV',
    },
    # correct as per https://www.peabody.harvard.edu/
    1882: {
        'Institution': 'Peabody Museum of Archaeology and Ethnology',
        'City': 'Cambridge',
        'State': 'MA',
    },
    # correct as per https://dnr.mo.gov/contacts.htm
    1793: {
        'Institution': 'Missouri Department of Natural Resources',
        'City': 'Jefferson City',
        'State': 'MO',
    },
    1698: {
        'Institution': 'University of Nebraska State Museum, University of Nebraska-Lincoln',
        'City': 'Lincoln',
        'State': 'NE',
    },
    1589: {
        'Institution': 'The State Museum of Pennsylvania',
        'City': 'Harrisburg',
        'State': 'PA',
    },
    1583: {
        'Institution': 'Slater Museum of Natural History, University of Puget Sound',
        'City': 'Tacoma',
        'State': 'WA',
    },
    1581: {
        'Institution': 'Slater Museum of Natural History, University of Puget Sound',
        'City': 'Tacoma',
        'State': 'WA',
    },
    1478: {
        'Institution': 'Central Washington University, Department of Anthropology',
        'City': 'Ellensburg',
        'State': 'WA',
    },
    # 1158 is not a notice of inventory completion, but rather an extension for one
    # keeping in database because it is NAGPRA-related
    # see https://www.federalregister.gov/documents/2011/06/09/2011-13396/extension-of-time-for-inventory
    # correcting location as per https://www.thehistorylist.com/venues/the-colorado-historical-society-denver-colorado
    1158: {
        'Institution': 'The Colorado Historical Society',
        'City': 'Denver',
        'State': 'CO',
    },
    972: {
        'Institution': 'Center for Archaeological Research at the University of Texas at San Antonio',
        'City': 'San Antonio',
        'State': 'TX',
    },
    808: {
        'Institution': 'Peabody Museum of Natural History, Yale University',
        'City': 'New Haven',
        'State': 'CT',
    },
    # Institution and City were parsed correctly; only the spelled-out state needs fixing
    653: {
        'State': 'TX',
    },
    594: {
        'Institution': 'Carnegie Museum of Natural History',
        'City': 'Pittsburgh',
        'State': 'PA',
    },
    # The Title has only two items, so Institution was parsed as 'nan' and must
    # be set here along with City and State.
    502: {
        'Institution': 'Phoebe A. Hearst Museum of Anthropology at the University of California, Berkeley',
        'City': 'Berkeley',
        'State': 'CA',
    },
    404: {
        'Institution': 'Museum of Natural History and Planetarium, Roger Williams Park',
        'City': 'Providence',
        'State': 'RI',
    },
    # correct as per https://www.lrh.usace.army.mil/Contact.aspx
    399: {
        'Institution': 'U.S. Army Corps of Engineers, Huntington District',
        'City': 'Huntington',
        'State': 'WV',
    },
}

# Assigning through .loc with a label that is not in the index would silently
# append a new row, so fail loudly if any hard-coded label is absent.
missing_labels = [label for label in manual_fixes if label not in inv_urls.index]
if missing_labels:
    raise KeyError(
        'manual_fixes refers to row labels not present in inv_urls: {}'.format(missing_labels)
    )

for row_label, corrected_fields in manual_fixes.items():
    for column, value in corrected_fields.items():
        inv_urls.loc[row_label, column] = value

# Documents to be removed from the df because they are not NAGPRA-related:
# 1159: https://www.federalregister.gov/documents/2011/05/31/2011-13378/information-collection-request-sent-to-the-office-of-management-and-budget-omb-for-approval
# 781: https://www.federalregister.gov/documents/2014/08/29/2014-20621/information-collection-request-sent-to-the-office-of-management-and-budget-omb-for-approval
# 657: https://www.federalregister.gov/documents/2015/08/21/2015-20677/information-collection-request-sent-to-the-office-of-management-and-budget-omb-for-approval-use-of
docs_to_drop = [1159, 781, 657]

In [8]:
# view documents to be dropped: row label, then its Link, then a blank gap
for doc_label in docs_to_drop:
    print(doc_label)
    doc_link = inv_urls.loc[doc_label, 'Link']
    print(doc_link)
    print('\n')

1159
https://www.federalregister.gov/documents/2011/05/31/2011-13378/information-collection-request-sent-to-the-office-of-management-and-budget-omb-for-approval


781
https://www.federalregister.gov/documents/2014/08/29/2014-20621/information-collection-request-sent-to-the-office-of-management-and-budget-omb-for-approval


657
https://www.federalregister.gov/documents/2015/08/21/2015-20677/information-collection-request-sent-to-the-office-of-management-and-budget-omb-for-approval-use-of




In [9]:
# drop documents which are not NAGPRA-related
inv_urls.drop(labels=docs_to_drop, axis=0, inplace=True)

# drop these documents from metadata and fulltext as well
# (assumes their index keys are the inv_urls row labels prefixed with 'I_')
keys_to_drop = list(map('I_{}'.format, docs_to_drop))
for text_frame in (fulltext, metadata):
    text_frame.drop(labels=keys_to_drop, axis=0, inplace=True)

# save changes
# NOTE(review): inv_path, full_path and meta_path are the same files the data
# was loaded from, so these writes overwrite the inputs; a later re-run of the
# notebook would start from the already-modified files -- confirm this is intended.
inv_urls.to_csv(inv_path)
for text_frame, pickle_path in ((fulltext, full_path), (metadata, meta_path)):
    text_frame.to_pickle(pickle_path, compression='gzip')

In [11]:
# view all "Institutions" that contain commas
institution_names = inv_urls['Institution']
inv_urls['Institution_List'] = institution_names.map(lambda name: name.split(', '))

# an institution containing ', ' splits into more than one part
multi_part_institutions = (
    parts for parts in inv_urls['Institution_List'].values if len(parts) > 1
)
for parts in multi_part_institutions:
    print(parts)

['University of California', 'Santa Cruz']
['The Wistar Institute', 'Philadelphia', 'PA', 'and the University of Pennsylvania Museum of Archaeology and Anthropology']
['U.S. Department of the Interior', 'Bureau of Indian Affairs']
['Department of Anthropology', 'San Jose State University']
['South Dakota State Historical Society', 'Archaeological Research Center']
['University of Tennessee', 'Department of Anthropology', 'Knoxville', 'TN', 'and U.S. Army Corps of Engineers', 'Omaha District']
['U.S. Department of the Interior', 'National Park Service', 'Wupatki National Monument']
['Georgia Department of Natural Resources', 'Historic Preservation Division']
['U.S. Department of Agriculture', 'Forest Service', 'Lassen National Forest']
['U.S. Army Corps of Engineers', 'Omaha District', 'Omaha', 'NE', 'and South Dakota State Archaeological Research Center']
['Historic Westville', 'Inc.']
['Texas Archeological Research Laboratory', 'University of Texas at Austin']
['Department of Anthropo

## We can see that significantly more cleaning of "Institution" is required. Several issues are outstanding:


#### 1) The same institutions can be referred to by different names (e.g. 'Peabody Museum of Archaeology and Ethnology, Harvard University' / 'Peabody Museum of Archaeology and Ethnology, Harvard' / 'Peabody Museum of Archaeology and Ethnology').
#### 2) Several institutions can be associated with a single record.
#### 3) Varying levels of bureaucratic specificity are available. It is unclear how we ought to include divisions, departments, etc.
#### 4) Older notices use a different format: "Notice of Inventory Completion for Native American Human Remains and Associated Funerary Objects from [Geographic region of provenance] in the Possession of [Institution in Possession], [City], [State, 2-letter abbreviation], and in the Control of [Controlling Institution]"
#### 5) This preliminary cleaning has not yet been performed on repatriation_urls.csv