### Parse XML for MSU Important Farmland Maps

- Convert the MODS XML export to a CSV spreadsheet
- Format the fields to follow the Aardvark schema

In [64]:
from bs4 import BeautifulSoup, NavigableString, Tag
import re
import urllib.request
import uuid
import csv

In [65]:
# Parse the XML export. The file is opened in a context manager so the handle
# is closed as soon as BeautifulSoup has read it.
# NOTE: tags are looked up by lowercase names throughout this notebook
# (e.g. 'mods:typeofresource'); this relies on the 'html.parser' backend
# lowercasing tag names.
with open('maps-geoportal_2022-05-09-04.xml') as xml_file:
    soup = BeautifulSoup(xml_file, 'html.parser')

# One <mods:mods> element per map record.
features = soup.find_all('mods:mods')

In [66]:
# Matches one coordinate such as "W 90°25ʹ" or "N 48°18ʹ00ʺ".
_DMS_PATTERN = re.compile(
    r'([NSEW])\s*([\d.]+)°'   # hemisphere letter and degrees (required)
    r'(?:\s*([\d.]+)ʹ)?'       # minutes (optional)
    r'(?:\s*([\d.]+)ʺ)?'       # seconds (optional)
)


def _dms_to_decimal(coord, negative, positive):
    """Convert one degree/minute/second coordinate to signed decimal degrees.

    Args:
        coord: Text such as ``'W 90°25ʹ'`` or ``'N 48°18ʹ00ʺ'``.
        negative: Hemisphere letter that yields a negative value ('W' or 'S').
        positive: Hemisphere letter that yields a positive value ('E' or 'N').

    Returns:
        The coordinate as a float rounded to 4 decimal places.

    Raises:
        ValueError: If no coordinate with one of the two expected hemisphere
            letters can be found in ``coord``.
    """
    match = _DMS_PATTERN.search(coord)
    if match is None or match.group(1) not in (negative, positive):
        raise ValueError(
            'Cannot parse {!r} as a {}/{} coordinate'.format(coord, negative, positive)
        )

    hemisphere, degree, minute, second = match.groups()
    # Missing minutes/seconds are treated as zero.
    dd = float(degree) + float(minute or 0) / 60 + float(second or 0) / 3600
    dd = round(dd, 4)
    return -dd if hemisphere == negative else dd


def format_bbox(coords):
    """Convert a DMS coordinate string to a 'w,s,e,n' decimal-degree string.

    Args:
        coords: Text of the form ``'<x1>--<x2>/<y1>--<y2>'`` where each part is
            a hemisphere letter followed by degrees and, optionally, minutes
            (``ʹ``) and seconds (``ʺ``), e.g.
            ``'W 90°25ʹ--W 82°07ʹ/N 48°18ʹ00ʺ--N 41°41ʹ00ʺ'``.

    Returns:
        A comma-separated string ``'west,south,east,north'`` in decimal
        degrees, each value rounded to 4 decimal places.

    Raises:
        ValueError: If ``coords`` does not split into exactly one x range and
            one y range of two coordinates each, or if a coordinate cannot be
            parsed.
    """
    x_range, y_range = coords.split('/')

    # The smaller value of each pair is taken as west / south, so the order of
    # the two coordinates in the input does not matter.
    w, e = sorted(_dms_to_decimal(c, 'W', 'E') for c in x_range.split('--'))
    s, n = sorted(_dms_to_decimal(c, 'S', 'N') for c in y_range.split('--'))

    return '{},{},{},{}'.format(w, s, e, n)

In [67]:
def request_arkURL(arkURL):
    """Fetch a record's landing page and scrape its download details.

    Args:
        arkURL: The permanent (ARK) link of the record.

    Returns:
        A 5-tuple ``(information, identifier, download, format, fileSize)``:
        ``information`` and ``identifier`` are both the URL reported by the
        HTTP response, ``download`` is that URL plus
        ``'/datastream/OBJ/Download/'``, and ``format`` / ``fileSize`` are the
        texts following ``'Format '`` / ``'Size '`` in the page's
        screen-reader labels. Both are ``''`` when the page has no
        "download Original file" link or no matching label.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
    """
    # The context manager closes the HTTP response once the page is parsed.
    with urllib.request.urlopen(arkURL) as page:
        information = identifier = page.url
        soup2 = BeautifulSoup(page, 'html.parser')

    download = information + '/datastream/OBJ/Download/'

    file_format = fileSize = ''
    download_link = soup2.find('a', {'aria-label': 'download Original file'})
    if download_link is None:
        # Page layout differs from what is expected: nothing to scrape.
        return information, identifier, download, file_format, fileSize

    # The format and size labels live in the elements that precede the
    # download link's parent. Every sibling is visited (no early exit), so if
    # several match, the last one visited wins, as before.
    for div in download_link.parent.previous_siblings:
        if not isinstance(div, Tag):
            continue  # skip whitespace / text nodes between elements
        label = div.find('span', 'msul_repo_screen_reader_only')
        if label is None:
            continue  # sibling element without a screen-reader label
        txt = label.text
        if 'Format ' in txt:
            file_format = txt.split('Format ')[1]
        if 'Size ' in txt:
            fileSize = txt.split('Size ')[1]

    return information, identifier, download, file_format, fileSize


In [68]:
# Build one row of metadata per <mods:mods> record. The order of the values in
# each row must match the order of the CSV column headers.
ALL_METADATA = []
for feature in features:
    title = ''  # left blank in the output
    alternativeTitle = feature.find_all('mods:title')[0].text

    # Names are assigned to a column according to their role term; multiple
    # name parts of one name are joined with '|'.
    cartographer = publisher = contributor = ''
    for name_tag in feature.find_all('mods:name'):
        name = '|'.join(x.text for x in name_tag.find_all('mods:namepart'))
        for role_tag in name_tag.find_all('mods:roleterm'):
            role = role_tag.text
            if role == 'cartographer':
                cartographer = name
            if role == 'publisher':
                publisher = name
            if role == 'contributor':
                contributor = name

    resourceType = feature.find('mods:typeofresource').text

    # NOTE: taken from the first place term of the record; this may not be the
    # correct source for spatial coverage (original author's caveat).
    spatialCoverage = feature.find('mods:placeterm').text

    dateIssued = feature.find('mods:dateissued').text

    language = feature.find('mods:language').find('mods:languageterm', type='code').text

    # The raw coordinate text is stored as-is; it is not converted to decimal
    # degrees here.
    bbox = feature.find('mods:coordinates').text

    scale = feature.find('mods:scale').text

    # The note is optional; the literal string 'null' is used when it is absent.
    note_tag = feature.find('mods:note')
    note = note_tag.text if note_tag is not None else 'null'

    description = scale + '; ' + note

    # Only LCSH subjects that actually contain a topic contribute a keyword.
    topicTags = [x.find('mods:topic') for x in feature.find_all('mods:subject', authority='lcsh')]
    keyword = '|'.join(x.text for x in topicTags if x is not None)

    catalogURL = feature.find('mods:url', note='catalog_record').text  # read but not written to the row

    rights = feature.find('mods:accesscondition', type='use and reproduction').text

    # Request the permanent link to obtain the landing-page URL, identifier,
    # download URL, file format and file size.
    arkURL = feature.find('mods:url', note='ark').text    # Permanent Link
    # 'fileFormat' rather than 'format' so the builtin is not shadowed.
    information, identifier, download, fileFormat, fileSize = request_arkURL(arkURL)

    # Values that are the same for every record in this collection.
    resourceClass = 'Maps'
    subject = ''  # not written to the row
    temporalCoverage = dateRange = ''
    memberOf = '64bd8c4c-8e60-4956-b43d-bdc3f93db488'
    isPartOf = '06d-02'
    code = '06d-02'
    accessRights = 'Public'
    ID = uuid.uuid4()  # a fresh random ID on every run

    metadata = [
        title, alternativeTitle, description, language, cartographer, publisher, contributor,
        resourceClass, resourceType, keyword, dateIssued, temporalCoverage, dateRange,
        spatialCoverage, bbox, memberOf, isPartOf, code, rights, accessRights, fileFormat, fileSize,
        ID, identifier, information, arkURL, download,
    ]

    ALL_METADATA.append(metadata)

In [69]:
# CSV column headers, in output order. Each group below is one logical cluster
# of columns; the overall sequence is what matters.
fieldnames = [
    # descriptive
    'Title',
    'Alternative Title',
    'Description',
    'Language',
    # names by role
    'Cartographer',
    'Publisher',
    'Contributor',
    # classification
    'Resource Class',
    'Resource Type',
    'Keyword',
    # dates
    'Date Issued',
    'Temporal Coverage',
    'Date Range',
    # geography
    'Spatial Coverage',
    'Bounding Box',
    # collection / provenance
    'Member Of',
    'Is Part Of',
    'Code',
    # access
    'Rights',
    'Access Rights',
    # file details
    'Format',
    'File Size',
    # identifiers and links
    'ID',
    'Identifier',
    'Information',
    'Permanent Link',
    'Download',
]

In [70]:
# Write the header row followed by one row per record.
# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' on write end up with a blank line after
# every row.
with open('MSU_maps_metadata.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(fieldnames)
    writer.writerows(ALL_METADATA)