In [None]:
'''
This script parses metadata records and writes the extracted fields to a local
CSV file. It was originally written for the Illinois Geospatial Data Clearinghouse
and has since been adapted to read MODS XML records from the MSU Digital Library.
Progress is maintained on GitHub
(https://github.com/BTAA-Geospatial-Data-Project/parse-html).


Files
-----
06d-01.csv
	A local CSV file that lists the URLs to parse (one URL per row, in the first column).
output_06d-01_yyyymmdd.csv
	The output file produced by parsing; its name ends with the action date.


Developers
----------
Originally created on xxxxx
Created by Karen Majewicz  @karenmajewicz

Updated December 14, 2020
Updated by Ziying Cheng  @Ziiiiing

Updated May 26, 2021 for Illinois Geospatial Data Clearinghouse

Updated June 2, 2021 for MSU Digital Library

'''

In [5]:
import csv
import time
import urllib.request
from bs4 import BeautifulSoup
import json
import lxml

# Extract existing URLs from the local CSV file.
# Each entry of `urls` is a complete CSV row (a list of strings); the code
# that consumes it reads the URL from the first column (row[0]).
urls = []

# newline='' is what the csv module asks for when opening a file for a
# reader, so that line breaks embedded in quoted fields are handled correctly.
with open('06d-01.csv', newline='') as fr:
    reader = csv.reader(fr)  # reader object
    for row in reader:
        # Skip blank lines and rows whose first column is empty: they carry
        # no URL and would otherwise fail later when row[0] is fetched.
        if row and row[0].strip():
            urls.append(row)


# Store parsed elements for all URLs: one list (one CSV output row) per URL.
parseElements = []


def _field_text(soup, tag, attrs=None):
    """Return the stripped text of the first element matching *tag*.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed document to search.
    tag : str
        Element name to look up, e.g. ``'mods:title'``.
    attrs : dict, optional
        Attribute filter passed through to ``soup.find``.

    Returns
    -------
    str
        The element's text with surrounding whitespace removed, or the
        placeholder ``'undefined'`` when no matching element exists.
    """
    element = soup.find(tag, attrs or {})
    if element is None:
        return 'undefined'
    return element.text.strip()


# Values that are identical for every record in this collection.
language = 'eng'
# NOTE(review): the module docstring and the recorded run output refer to the
# MSU Digital Library (d.lib.msu.edu) -- confirm that this provider is intended.
provider = 'University of Michigan'
resourceClass = 'Maps'
resourceType = 'Cadastral maps'
subject = 'Real Property|Landowners'
accessRights = 'Public'
memberOf = '06d-01'
fileType = 'TIFF'
code = '06d-01'
status = 'Active'
accrualMethod = 'MODS'  # not currently included in the output row
dateAccessioned = '2021-06-02'

for url in urls:
    # Each entry of `urls` is a CSV row; the URL itself is the first column.
    link = url[0]

    # Use the response as a context manager so the connection is closed
    # as soon as the body has been read.
    with urllib.request.urlopen(link) as response:
        page = response.read()
    soup = BeautifulSoup(page, "xml")

    print(f'Parsing {link}')

    # Descriptive fields, each taken from the first matching MODS element.
    title = _field_text(soup, 'mods:title')
    description = _field_text(soup, 'mods:note')
    creator = _field_text(soup, 'mods:namePart')
    publisher = _field_text(soup, 'mods:publisher')
    date = _field_text(soup, 'mods:dateIssued')
    bBox = _field_text(soup, 'mods:coordinates')
    placeName = _field_text(soup, 'mods:geographic')
    scale = _field_text(soup, 'mods:scale')

    # The two links are told apart by the `note` attribute of mods:url.
    lib = _field_text(soup, 'mods:url', {"note": "catalog_record"})  # library catalog record
    ark = _field_text(soup, 'mods:url', {"note": "ark"})             # ARK identifier

    # Extracted but not currently included in the output row.
    rights = _field_text(soup, 'mods:accessCondition')

    # Combine the scraped information. The first column holds the URL string
    # itself (not the whole CSV row), so the output cell is a plain link
    # rather than the text form of a Python list.
    parseElements.append([
        link, title, description, language, creator, publisher, provider,
        resourceClass, resourceType, subject, date, bBox, placeName, scale,
        fileType, lib, ark, memberOf, code, accessRights, status,
        dateAccessioned,
    ])

# Generate the action date in YYYYMMDD format (current local time); it is
# used as the suffix of the output file name.
actionDate = time.strftime('%Y%m%d')

# Column headers, in the same order as the values of each row in parseElements.
fields = ['link', 'Title', 'Description', 'Language', 'Creator', 'Publisher', 'Provider', 'Resource Class', 'Resource Type', 'Subject',
          'Date Issued', 'Bounding Box', 'Spatial Coverage', 'Scale', 'Format', 'Documentation', 'Identifier', 'Member Of', 'Code',
          'Access Rights', 'Status', 'Date Accessioned']

# Write outputs to a local CSV file.
# newline='' stops the csv module's own line endings from being translated
# again (which otherwise produces blank lines between rows on Windows), and an
# explicit UTF-8 encoding keeps non-ASCII metadata writable whatever the
# platform's default encoding is.
with open(f'output_06d-01_{actionDate}.csv', 'w', newline='', encoding='utf-8') as fw:
    writer = csv.writer(fw)
    writer.writerow(fields)           # fieldnames
    writer.writerows(parseElements)   # elements

# Report completion only after the file has been closed.
print('#### Job done ####')
    
    

Parsing https://d.lib.msu.edu/maps/400/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/401/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/402/datastream/MODS/view/
#### Job done ####
