## ArchivesSpace Barcodes Project
### Notebook 2

This notebook contains code for extracting Alma bib IDs from ArchivesSpace resource-record notes.

In [365]:
import pandas as pd
import re

In [364]:
# Load the merged dataset. NOTE: unpickling executes arbitrary code, so this file
# must come from a trusted source.
tc_df = pd.read_pickle(filepath_or_buffer='./aspace_data/merged-dataset.pkl.gz')

In [245]:
# Define patterns for matching bib numbers.
# Raw strings are used so that regex escapes such as \d reach the regex engine
# untouched and do not trigger Python's invalid-escape-sequence warning.
mms_id = r'99\d+4107'    # MMS IDs always begin with 99 and end with 4107
voyager_id = r'\d{7,8}'   # Voyager bib IDs are 7-8 digits long (shorter than MMS IDs)

In [255]:
# Create a table mapping bib numbers to top containers
bib_ids_df = tc_df[['top_container_id', 'note_label', 'note_content']].drop_duplicates()

In [256]:
# Note labels that hold narrative text rather than bib numbers; rows carrying
# any of these labels are removed from the lookup table.
exclude = [
    'Historical narrative',
    'Scope and Contents',
    'Biographical Note',
    'Abstract',
    'Collection Scope and Content',
    'Collection note',
    'Historical or Biographical Note',
    'Collection Organization',
    'Historical Note',
]
bib_ids_df = bib_ids_df[~bib_ids_df['note_label'].isin(exclude)].copy()

In [349]:
# Regex-extract ID numbers from notes (extractall yields one row per match).
#
# MMS IDs: anchor on the digit run itself rather than wrapping the pattern in
# greedy `.*`. A greedy prefix makes the engine keep only the right-most place
# the pattern can start on each line, which drops earlier IDs on the same line
# and can truncate an ID that happens to contain "99" internally.
mms_ids = bib_ids_df.note_content.str.extractall(rf'(?<!\d)({mms_id})(?!\d)')
# Voyager IDs: same delimiter sets as before (leading: , : . whitespace # ; or
# line start; trailing: , : . whitespace ; or line end), but expressed as
# lookarounds so a delimiter is not consumed. Otherwise, in "1234567 7654321"
# the single space is used up by the first match and the second ID is missed.
voyager_ids = bib_ids_df.note_content.str.extractall(
    rf'(?:^|(?<=[,:.\s#;]))({voyager_id})(?=[,:.\s;]|$)',
    flags=re.MULTILINE,
)

In [350]:
# DataFrame.join aligns on index values, but it did not seem to work here when a
# LEFT join was requested. As a workaround, the extracted frames are merged on an
# ordinary column called "index" instead.
# Step one: label both levels of the MultiIndex that extractall produced, so that
# resetting the index later yields columns named "index" and "match".
mms_ids.index.names = ['index', 'match']
voyager_ids.index.names = ['index', 'match']

In [351]:
# Move the source frame's row labels into a regular column (named "index" when
# the index is unnamed) so it can serve as the merge key.
tc_bibs_df = bib_ids_df.reset_index(drop=False)

In [352]:
# Left merge the extracted MMS IDs onto the container table using the shared
# "index" column. The key is named explicitly: without `on=`, merge joins on
# every column the two frames have in common, which is fragile.
# Rows with no match keep NaN; the extracted value (column 0) becomes "mms_id".
tc_bibs_df = (
    tc_bibs_df
    .merge(mms_ids.reset_index(), how='left', on='index')
    .drop(columns=['match'])
    .rename(columns={0: 'mms_id'})
)

In [353]:
# Left merge the extracted Voyager IDs onto the container table using the
# "index" column. The key is named explicitly: without `on=`, merge joins on
# every column the two frames have in common, which is fragile.
# Rows with no match keep NaN; the extracted value (column 0) becomes "voyager_id".
tc_bibs_df = (
    tc_bibs_df
    .merge(voyager_ids.reset_index(), how='left', on='index')
    .drop(columns=['match'])
    .rename(columns={0: 'voyager_id'})
)

In [366]:
# Add updated MMS IDs to NEA records.
# The MMS ID column is read as text: the values are identifiers, not quantities,
# and letting pandas infer a numeric dtype would turn the column into floats
# (risking mangled digits on long IDs) as soon as one cell is blank.
updated = pd.read_csv('./aspace_data/nea_updates.csv', dtype={'mms id': str})
# Isolate just the collection-level resource identifier (NEA1001, etc.).
# `.str[0]` takes the first piece of the split and propagates missing values,
# whereas indexing inside a lambda would raise on them.
updated['rid_str'] = updated['resource identifier'].str.split('-').str[0]

In [401]:
# Make sure the replacement MMS IDs are stored as strings.
updated = updated.astype({'mms id': str})

In [402]:
# Identify containers to update for each NEA resource ID and overwrite their MMS ID.
# The loop variable is deliberately not called `mms_id`: that name already holds the
# regex pattern defined earlier in this notebook, and rebinding it here would break
# any later re-run of the extraction cell.
for rid, updated_mms_id in updated[['rid_str', 'mms id']].itertuples(index=False):
    # Literal (non-regex) substring match, so characters in the identifier are never
    # interpreted as regex syntax; na=False treats missing identifiers as non-matching.
    # NOTE(review): this is still a substring test, so a short identifier also matches
    # longer ones that contain it - confirm that is acceptable for the NEA identifiers.
    is_match = tc_df.resource_identifier.str.contains(rid, regex=False, na=False)
    tc_to_update = tc_df.loc[is_match, 'top_container_id'].values
    tc_bibs_df.loc[tc_bibs_df.top_container_id.isin(tc_to_update), 'mms_id'] = updated_mms_id

In [404]:
# Persist the container-to-bib mapping (compression is inferred from the .gz suffix).
pd.to_pickle(tc_bibs_df, './aspace_data/tc-to-bib-mapping.pkl.gz')

In [393]:
# Spot-check the rows recorded for a single top container.
tc_bibs_df[tc_bibs_df['top_container_id'] == 17227]

Unnamed: 0,index,top_container_id,note_label,note_content,mms_id,voyager_id
6796,9445,17227,Bib#,10829573,,10829573


In [394]:
# Show all rows of any top container for which neither an MMS ID nor a Voyager ID
# was found on any of its rows (an empty result means every container has an ID).
tc_bibs_df.groupby('top_container_id').filter(lambda grp: grp[['mms_id', 'voyager_id']].isnull().values.all())

Unnamed: 0,index,top_container_id,note_label,note_content,mms_id,voyager_id
