In [2]:
# CSS for the markdown cells
# `display` and `HTML` are part of the public IPython.display module; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Serif type for the listed headings and for paragraphs; the headings also lose their margins.
mycss = """
    h2, h4, h6 { 
        margin: 0;
        font-family: serif;
    }
    p {
    font-family: serif;
    }
"""
# Kept as the last expression of the cell so the notebook renders the <style> element.
HTML('<style>{}</style>'.format(mycss))

<center><h3>(Special) Collections as Data</h3>
<h4>Part 2 of 3</h4>
<h6>See Part One of this project <a href="https://">here</a>.</h6></center>

<p>This notebook steps through the process of cleaning up the MARC data from Part One, with the aim of reducing the columns (through concatenation) to a manageable number for analysis and visualization.</p>

In [4]:
# Load the pandas library/module
import pandas as pd
# load the groupby function
from itertools import groupby

In [5]:
# Loading our data
# The path is a placeholder -- point it at your own copy of the pickled MARC extract
# (presumably the file produced in Part One; confirm the file name).
# NOTE(review): unpickling can execute arbitrary code, so only load pickles you created or trust.
data = pd.read_pickle('../path/to/your/files/here/all_spec_single_fields.pkl')

In [7]:
# There are now a lot of columns, because of all the field-subfield-indicator combinations
# Leaving the Index as the cell's last expression makes the notebook display it (abbreviated)
data.columns

Index(['008_date1', '008_date2', '008_lang', '041-0_-a', '041-0_-h',
       '041-1_-a', '041-1_-h', '041-_1-a', '041-_1-h', '041-_7-a',
       ...
       '260-__-f', '260-__-g', '300-__-a', '300-__-b', '300-__-c', '752-__',
       'bib_id', 'normalized_call_no', 'display_call_no', 'location_code'],
      dtype='object', length=120)

The code below lets us print all the columns for ease of reference.

In [9]:
# Keep only the MARC field columns -- dropping the three 008 columns at the front and the
# four non-MARC columns at the back -- then order them by field tag (the text before the first hyphen)
columns = list(data.columns[3:-4])
columns.sort(key=lambda name: name.partition('-')[0])

In [10]:
# Bundle neighbouring column names that share a MARC field tag (the text before the first hyphen)
# groupby only merges adjacent items, so it relies on the list already being ordered by that tag
g = groupby(columns, key=lambda name: name.partition('-')[0])

In [11]:
# Materialise the groups as a dictionary: MARC field tag -> list of the column names under that tag
d = {tag: list(members) for tag, members in g}

In [None]:
# For every MARC field tag: print the tag, its columns one per line, then a dashed rule as a visual separator
for tag, names in d.items():
    print(tag, '\n'.join(names), '-' * 10, sep='\n')

The goal is to rename the columns from the original MARC extract, concatenating where it makes sense.

In [13]:
# Lookup table: column name in the original dataframe -> label to use in the cleaned one.
# Several source columns may point at the same label.
new_columns = {
    # 041 -- language columns
    '041-1_-a': 'translation_languages',
    '041-0_-a': 'additional_languages',
    '041-1_-h': 'original_language',
    '041-__-a': 'additional_languages',
    '041-__-h': 'original_language',
    # 100 -- author name columns
    '100-0_-a': 'mono_author_name',
    '100-10-a': 'author_name',
    '100-1_-a': 'author_name',
    '100-20-a': 'author_name',
    '100-2_-a': 'author_name',
    '100-3_-a': 'author_name_family',
    # 752 -- added place
    '752-__': 'added_place',
    # 300 -- extent, other physical details, dimensions
    '300-__-a': 'extent',
    '300-__-b': 'other_physical_details',
    '300-__-c': 'dimensions',
}

There is a lot of information in the 245 we don't care about (having to do with character offsets, etc.)

In [14]:
# Labels for the 245 subfields we keep, keyed by subfield code
title = dict(
    a='title',
    b='remainder_of_title',
    c='statement_of_responsibility',
)

In [15]:
# Invert the rename table: each target label -> list of the MARC columns that feed into it
column_map = {}
for marc_column, label in new_columns.items():
    # setdefault starts an empty list the first time a label is seen, then we append either way
    column_map.setdefault(label, []).append(marc_column)

Where there are nulls in the data, pandas adds a **NaN** element by default. Converting those to the empty string means that we can treat all cells as strings.

In [16]:
# Swap every missing value for the empty string so that each cell can be handled as a string
data = data.fillna('')

Now we populate our new DataFrame with concatenated versions of the columns from the original extract.

In [17]:
# Start from an empty frame and add one column per target label
data_cleaned = pd.DataFrame()
for label, source_columns in column_map.items():
    # Indexing with a list selects all the source columns for this label at once.
    # Each row's non-empty values are then joined with a pipe, a separator that
    # probably doesn't appear in any of the MARC data.
    data_cleaned[label] = data[source_columns].apply(
        lambda row: ' | '.join(filter(None, row)),
        axis=1,
    )

The 245 field is a simpler matter -- we can ignore the indicators altogether!

In [19]:
# Every 245 column, whatever its indicators
title_fields = [col for col in data.columns if col.startswith('245')]
for subfield, label in title.items():
    # Pick the 245 columns ending in this subfield code, then join each row's
    # non-empty values with the pipe separator
    data_cleaned[label] = data[
        [col for col in title_fields if col.endswith(subfield)]
    ].apply(lambda row: ' | '.join(filter(None, row)), axis=1)

Our new DataFrame needs the first three and the last four columns from the original. (Because we were concatenating along the column axis, both frames have the same number of rows, and we have preserved the order, so we can simply "stick" those columns onto the new DF, as it were.)

In [20]:
# Copy the first three columns of the original frame into the cleaned frame under the same names
data_cleaned[data.columns[:3]] = data[data.columns[:3]]

In [21]:
# Copy the last four columns of the original frame into the cleaned frame under the same names
data_cleaned[data.columns[-4:]] = data[data.columns[-4:]]

In [None]:
# Make sure to save your work!
# The path is a placeholder; it now starts with '../' so it names the same
# directory the source pickle is loaded from (the '/' after '..' was missing).
data_cleaned.to_pickle('../path/to/your/files/here/all_spec_single_fields_cleaned.pkl')

**Normalizing the Publication Year (008 Field)**

This function converts missing dates and non-digit characters to digits: a date that is blank or filled with `||||` becomes the placeholder `9999`, and each "u" ("u"="unknown") in the MARC data becomes a zero. Depending on the kind of analysis you want to do, this may or may not be desirable.

In [22]:
def convert_yr(year):
    """Normalize an 008 date string so that unknown values become digits.

    Parameters
    ----------
    year : str
        Raw date value taken from the 008 field.

    Returns
    -------
    str
        '9999' when the value is empty, all whitespace, or the fill value
        '||||'; otherwise the input with every 'u' replaced by '0'.
    """
    # Missing dates arrive as '' once NaN values have been replaced with the
    # empty string, and ''.isspace() is False, so test for emptiness explicitly.
    if not year or year.isspace() or year == '||||':
        return '9999'
    # 'u' marks an unknown digit; zero stands in for it.
    return year.replace('u', '0')

In [23]:
# Run every publication date through convert_yr, one value at a time, and store the result back
data_cleaned['008_date1'] = data_cleaned['008_date1'].map(convert_yr)

To get the century, just take the first two characters from each pub year.

In [24]:
# The .str accessor applies the slice to the string inside each element of the column.
# Slicing the column directly would instead take the column's first two elements,
# which is not what we want.
data_cleaned['century'] = data_cleaned['008_date1'].str.slice(stop=2)

In [22]:
# Save again now that the publication year and century columns are in place.
# The placeholder path now starts with '../' (the '/' after '..' was missing),
# so it names the same directory the source pickle is loaded from.
data_cleaned.to_pickle('../path/to/your/files/here/all_spec_single_fields_cleaned.pkl')