## Purpose

To parse an XML file of MODs metadata from Michigan State University Digital Repository (https://d.lib.msu.edu/maps) and convert it to a CSV

In [17]:
import time
import re
import pandas as pd
import xml.etree.ElementTree as ET

action_date = time.strftime('%Y%m%d')

## Part 1: download the metadata

In [20]:
def fetch_records(base_url, metadata_prefix, set_spec):
    records = []
    payload = {
        'verb': 'ListRecords',
        'metadataPrefix': metadata_prefix,
        'set': set_spec
    }
    
    while True:
        response = requests.get(base_url, params=payload)

        if response.status_code == 200:
            root = ET.fromstring(response.content)

            for record in root.findall(".//{http://www.openarchives.org/OAI/2.0/}record"):
                records.append(record)
            # Check for a resumptionToken
            token = root.find(".//{http://www.openarchives.org/OAI/2.0/}resumptionToken")
            if token is not None and token.text is not None:
                payload = {
                    'verb': 'ListRecords',
                    'resumptionToken': token.text
                }
            else:
                break   # Break out of the loop if there are no more records to fetch

        else:
            print(f"Failed to fetch records: {response.content}")
            break

    print(f"Fetched {len(records)} records.")
    return records

base_url = "https://d.lib.msu.edu/oai"
metadata_prefix = "mods"
set_spec = "maps"

all_records = fetch_records(base_url, metadata_prefix, set_spec)


Fetched 1224 records.


In [21]:
from xml.etree.ElementTree import ElementTree, Element, SubElement, tostring, ElementTree
import xml.dom.minidom

# Create the root element
root = Element("AllRecords")

# Append each record to the root element
for record in all_records:
    root.append(record)

# Use ElementTree to convert it into a string and prettify the XML
rough_string = tostring(root, "utf-8")
reparsed = xml.dom.minidom.parseString(rough_string)
pretty_string = reparsed.toprettyxml(indent="\t")

# Write to XML file
with open("all_records.xml", "w") as f:
    f.write(pretty_string)

print("Successfully written to all_records.xml!")


Successfully written to all_records.xml!


## Part 2: Process the XML file

In [22]:
# Add the namespace for OAI
namespaces = {
    'ns0': 'http://www.openarchives.org/OAI/2.0/',
    'ns1': 'http://www.loc.gov/mods/v3',  # Changed from 'mods' to 'ns1'
    'ns2': 'http://www.w3.org/1999/xlink'  # Changed from 'xlink' to 'ns2'
}


In [23]:
xml_file = "all_records.xml"
# csv_file = action_date + "_05d-01.csv"
csv_file = "20231011_06d-01.csv"

In [24]:
def parse_XML(xml_file):
    # Parse the XML file
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Adjust the search to navigate through the OAI namespace
#     records = root.findall(".//oai:record", namespaces)
    records = root.findall(".//ns0:record", namespaces)
    
    rows = []
    for record in records:
        
        setSpecs = record.findall(".//ns0:setSpec", namespaces)
        
        additional_setSpec = None  # Initialize to None for each record

        # Check for additional setSpec
        for spec in setSpecs:
            if spec.text != 'maps':
                additional_setSpec = spec.text
                break  # Exit loop once the additional setSpec is found
        
        
        header = record.find(".//ns0:header", namespaces)
        identifier = header.find(".//ns0:identifier", namespaces)
        
#         metadata = record.find(".//mods:mods", namespaces)
        metadata = record.find(".//ns1:mods", namespaces)
        title = metadata.find(".//ns1:title", namespaces)
        names = metadata.findall(".//ns1:namePart", namespaces)
        publishers = metadata.findall(".//ns1:publisher", namespaces)
        issued_dates = metadata.findall(".//ns1:dateIssued", namespaces)
        temporal_dates = metadata.findall(".//ns1:dateOther", namespaces)
        
        
        language_code = record.find(".//ns1:languageTerm[@type='code']", namespaces)
        rights = record.find(".//ns1:accessCondition", namespaces)
        spatial = record.find(".//ns1:coordinates", namespaces)
        
        
        # Find the URL with note attribute 'ark' under location
        url_element = record.find(".//ns1:location/ns1:url[@note='ark']", namespaces)
        
        # Extract the URL text if the element is found
        url_text = url_element.text if url_element is not None else None

    

        # find all unique place names
        geographics = record.findall(".//ns1:geographic", namespaces)
        place_names = list(set([geo.text for geo in geographics]))
        
        # find all unique topics
        topics = record.findall(".//ns1:topic", namespaces)
        keywords = list(set([topic.text for topic in topics]))
        
        # description fields
        extents = metadata.findall(".//ns1:extent", namespaces)
        abstracts = metadata.findall(".//ns1:abstract", namespaces)
        notes = metadata.findall(".//ns1:note", namespaces)
        scale = metadata.findall(".//ns1:scale", namespaces)
        
        
        
        # Extract individual elements, these will return lists of elements


            # Create an empty list to hold description parts
        description_parts = []

        # Add parts to the list only if they have some content
        if extents:
            extents_str = "|".join([extent.text for extent in extents])
            description_parts.append(f"Extent: {extents_str}")
        if abstracts:
            abstracts_str = "|".join([abstract.text for abstract in abstracts])
            description_parts.append(f"Abstract: {abstracts_str}")
        if notes:
            notes_str = "|".join([note.text for note in notes])
            description_parts.append(f"Notes: {notes_str}")
        if scale:
            scale_str = "|".join([sc.text for sc in scale])
            description_parts.append(f"{scale_str}")

        # Join the parts into the complete description string
        description = " | ".join(description_parts)

        row = {
            "Identifier": identifier.text,
            "Title": title.text,
            "Description": description,
            "Rights": rights.text,
            "Creator": "|".join([name.text for name in names]) if names else None,
            "Publisher": "|".join([publisher.text for publisher in publishers]) if publishers else None,
            "Date Issued": "|".join([issued_date.text for issued_date in issued_dates]) if issued_dates else None,
            "Temporal Coverage": "|".join([temporal_date.text for temporal_date in temporal_dates]) if temporal_dates else None,
            "Language": language_code.text if language_code is not None else None,
            "Spatial Coverage": "|".join(place_names),
            "Subject": "|".join(keywords),
            "Coordinates": spatial.text if spatial is not None else None,
            "Ark": url_text,
            "additional_setSpec": additional_setSpec  # Add additional setSpec here

     
        }
        
        rows.append(row)
        
    out_df = pd.DataFrame(rows)
        
    return out_df

# Continue with the rest of the code to load the XML file and print the result
df = parse_XML(xml_file)




In [25]:
print(df.additional_setSpec.to_string(index=False))

          None
          None
  maps:maps_96
          None
          None
          None
          None
          None
          None
          None
          None
          None
          None
  maps:maps_96
          None
          None
          None
          None
          None
          None
          None
          None
          None
  maps:maps_96
          None
          None
          None
  maps:maps_96
          None
          None
          None
          None
          None
  maps:maps_96
          None
          None
          None
  maps:maps_96
          None
          None
          None
          None
  maps:maps_96
          None
          None
          None
          None
          None
          None
  maps:maps_96
          None
          None
          None
  maps:maps_96
          None
          None
          None
  maps:maps_96
          None
          None
          None
          None
          None
  maps:maps_96
          None
          None
          

In [26]:
def create_date_range(temporal_coverage):
    # Remove any white spaces
    temporal_coverage = temporal_coverage.strip()
    
    # Single year condition
    if temporal_coverage.isdigit():
        year = int(temporal_coverage)
        return f"{year}-{year}"

    # Century condition (e.g., 19xx)
    elif "xx" in temporal_coverage:
        century_start = int(temporal_coverage.replace("xx", "00"))
        century_end = int(temporal_coverage.replace("xx", "99"))
        return f"{century_start}-{century_end}"
    
    # Decade condition (e.g., 193x)
    elif "x" in temporal_coverage:
        decade_start = int(temporal_coverage.replace("x", "0"))
        decade_end = int(temporal_coverage.replace("x", "9"))
        return f"{decade_start}-{decade_end}"
    
    # Default date range if no condition is met
    else:
        return "1900-1955"

# Apply the function to create the new 'Date Range' column
df['Date Range'] = df['Temporal Coverage'].apply(create_date_range)

In [27]:
def clean_spatial_coverage(spatial_coverage):
    # Split the string by the pipe '|' symbol into a list
    locations = spatial_coverage.split("|")
    
    # Check if the record is related to Michigan
    if "Michigan" in locations:
        # Initialize sets to hold unique type B and type C locations
        type_b = set()
        type_c = set()
        
        # Categorize locations
        for loc in locations:
            if "(Mich.)" in loc:
                type_b.add(loc.replace(" (Mich.)", ""))
            elif loc == "Michigan":
                type_c.add("Michigan")
        
        # Create the cleaned locations list based on the rules
        cleaned_locations = ["Michigan--" + b for b in type_b]
        
        if "Michigan" not in cleaned_locations:
            cleaned_locations.append("Michigan")
        
        # Rejoin the list into a string separated by the pipe '|' symbol
        return "|".join(cleaned_locations)
    else:
        # If the record is not related to Michigan, return the original string
        return spatial_coverage

# Apply the function to clean up the 'Spatial Coverage' column
df['Spatial Coverage'] = df['Spatial Coverage'].apply(clean_spatial_coverage)




In [28]:
def convert_to_decimal(coordinate_str):
    try:
        print(f"Processing: {coordinate_str}")  # Debug print
        
        # Decode HTML entities
        coordinate_str = coordinate_str.replace("&#176;", "°").replace("&#697;", "'").replace("&#698;", '"').replace("ʹ", "'").replace("ʺ", '"')
        
        print(f"After HTML replacement: {coordinate_str}")  # Debug print
        
        # Check if already in decimal format
        if "." in coordinate_str:
            decimal_degrees = float(re.search(r"-?\d+\.\d+", coordinate_str).group())
        
        else:
            # Extract numerical values
            num_values = [float(x) for x in re.findall(r"\d+", coordinate_str)]
            
            print(f"Extracted numbers: {num_values}")  # Debug print
            
            # DMS to decimal with or without special characters
            if len(num_values) == 3:
                degrees, minutes, seconds = num_values
                decimal_degrees = degrees + (minutes / 60.0) + (seconds / 3600.0)
            
            # DM to decimal
            elif len(num_values) == 2:
                degrees, minutes = num_values
                decimal_degrees = degrees + (minutes / 60.0)
                
            # Only degrees
            elif len(num_values) == 1:
                degrees = num_values[0]
                decimal_degrees = degrees

            else:
                return None  # Unrecognized format

        if "S" in coordinate_str or "W" in coordinate_str:
            decimal_degrees *= -1

        return round(decimal_degrees, 4)

    except Exception as e:
        print(f"Error while converting coordinate: {e}")
        return None

def extract_bounding_box(coordinate_field):
    try:
        # Split latitude and longitude pairs
        lon_str, lat_str = coordinate_field.split("/")
        
        # Extract individual coordinates and convert to decimal
        west, east = map(convert_to_decimal, lon_str.split("--"))
        north, south = map(convert_to_decimal, lat_str.split("--"))
        
        # Check if any conversion failed
        if None in [west, east, north, south]:
            print("Conversion failed for one or more coordinates.")
            return None
        
        return f"{west},{south},{east},{north}"
    except Exception as e:
        print(f"Error while extracting bounding box: {e}")
        return None

df['Extracted Bounding Box'] = df['Coordinates'].apply(extract_bounding_box)

Processing: W 11°28ʹ00ʺ
After HTML replacement: W 11°28'00"
Extracted numbers: [11.0, 28.0, 0.0]
Processing: W 7°22ʹ00ʺ
After HTML replacement: W 7°22'00"
Extracted numbers: [7.0, 22.0, 0.0]
Processing: N 8°33ʹ00ʺ
After HTML replacement: N 8°33'00"
Extracted numbers: [8.0, 33.0, 0.0]
Processing: N 4°18ʹ00ʺ
After HTML replacement: N 4°18'00"
Extracted numbers: [4.0, 18.0, 0.0]
Processing: W 90°25ʹ00ʺ
After HTML replacement: W 90°25'00"
Extracted numbers: [90.0, 25.0, 0.0]
Processing: W 81°51ʹ00ʺ
After HTML replacement: W 81°51'00"
Extracted numbers: [81.0, 51.0, 0.0]
Processing: N 47°33ʹ00ʺ
After HTML replacement: N 47°33'00"
Extracted numbers: [47.0, 33.0, 0.0]
Processing: N 41°27ʹ00ʺ
After HTML replacement: N 41°27'00"
Extracted numbers: [41.0, 27.0, 0.0]
Processing: W 085.876188
After HTML replacement: W 085.876188
Processing: W 085.760122
After HTML replacement: W 085.760122
Processing: N 041.983774
After HTML replacement: N 041.983774
Processing: N 041.896623
After HTML replacement

In [29]:
# check and correct the order of the coordinates

def correct_bounding_box(bounding_box_str):
    try:
        west, south, east, north = map(float, bounding_box_str.split(","))
        
        # Correct latitude: ensure north >= south
        if north < south:
            print(f"Swapping latitudes {south} and {north}")
            north, south = south, north
        
        # Correct longitude: ensure east >= west
        if east < west:
            print(f"Swapping longitudes {west} and {east}")
            east, west = west, east

        return f"{west},{south},{east},{north}"
    
    except Exception as e:
        print(f"Error while correcting bounding box: {e}")
        return None

# Apply the function to the "Bounding Box" column
df['Bounding Box'] = df['Extracted Bounding Box'].apply(correct_bounding_box)


Error while correcting bounding box: 'NoneType' object has no attribute 'split'
Swapping latitudes 45.3773 and 45.2908
Swapping latitudes 45.4192 and 45.376
Swapping latitudes 45.2908 and 45.2021
Swapping latitudes 45.2877 and 45.1988
Swapping latitudes 45.3729 and 45.2847
Swapping latitudes 45.4559 and 45.3675
Swapping latitudes 45.499 and 45.4519
Swapping latitudes 45.2847 and 45.1984
Swapping latitudes 45.3675 and 45.2824
Swapping latitudes 45.4519 and 45.3656
Swapping latitudes 45.5374 and 45.4505
Swapping latitudes 45.2825 and 45.1986
Swapping latitudes 45.3683 and 45.2821
Swapping latitudes 45.4518 and 45.3656
Swapping latitudes 45.5392 and 45.4508
Swapping latitudes 45.6263 and 45.5374
Swapping latitudes 44.1671 and 44.0748
Swapping latitudes 44.5988 and 44.5122
Swapping latitudes 44.6856 and 44.5983
Swapping latitudes 45.6306 and 45.5716
Swapping latitudes 44.5992 and 44.5121
Swapping latitudes 44.9506 and 44.8616
Swapping latitudes 44.9946 and 44.9502
Swapping latitudes 44.599

In [30]:
df['number'] = df['Identifier'].str.split('_').str[-1]
df['ID'] = df['Ark'].apply(lambda x: "ark-" + "-".join(x.split("/")[4:6]))
df['Identifier'] = df['Identifier'] + "|" + df['Ark']

In [31]:
def populate_parent_id(df):
    # Create a dictionary mapping 'number' to 'ID' for parent records
    parent_dict = df[df['additional_setSpec'].isna()].set_index('number')['ID'].to_dict()
    
    # Initialize the new 'Is Part Of' column to None
    df['Is Part Of'] = None
    
    # Function to map 'additional_setSpec' to parent 'ID'
    def map_to_parent_id(row):
        if pd.notna(row['additional_setSpec']):
            # Extract the digits at the end of 'additional_setSpec'
            number = row['additional_setSpec'].split('_')[-1]
            
            # Map to parent 'ID' using the dictionary
            return parent_dict.get(number, None)
        return None
    
    # Apply the function to populate 'Is Part Of' column
    df['Is Part Of'] = df.apply(map_to_parent_id, axis=1)
    
populate_parent_id(df)

In [32]:
# Function to append "06d-01" to existing values in "Is Part Of" column
def append_value(row):
    existing_value = row['Is Part Of']
    if pd.isna(existing_value):
        return "06d-01"
    else:
        return f"{existing_value}|06d-01"

# Apply the function to the "Is Part Of" column
df['Is Part Of'] = df.apply(append_value, axis=1)

In [33]:
df['Resource Class'] = "Maps"
df['Date Accessioned'] = action_date
df['Code'] = "06d-01"
# df['Is Part Of'] = "06d-01"
df['Member Of'] = "dc8c18df-7d64-4ff4-a754-d18d0891187d"
df['Accrual Method'] = "MODS"
df['Access Rights'] = "Public"
df['Provider'] = "Michigan State University"
df['Information'] = "https://d.lib.msu.edu/maps/" + df['number']
df['Manifest'] = "https://d.lib.msu.edu/maps/" + df['number'] + "/manifest"


In [34]:
df.to_csv(csv_file, index=False)