In [None]:
'''
This script parses MODS XML records from the MSU Digital Library map collection
(it was adapted from an earlier parser for the Illinois Geospatial Data Clearinghouse)
and writes the extracted fields to a local CSV. Progress is maintained on GitHub
(https://github.com/BTAA-Geospatial-Data-Project/parse-html).


Files
-----
06d-01.csv
	A local CSV file listing the URLs to parse, one per row (URL in the first column).
output_06d-01_yyyymmdd.csv
	The output CSV produced by parsing; its name ends with the action date (YYYYMMDD).


Developers
----------
Originally created on xxxxx
Created by Karen Majewicz  @karenmajewicz

Updated December 14, 2020
Updated by Ziying Cheng  @Ziiiiing

Updated May 26, 2021 for Illinois Geospatial Data Clearinghouse

Updated June 2, 2021 for MSU Digital Library

'''

In [6]:
import csv
import time
import urllib.request
from bs4 import BeautifulSoup
import json
import lxml

# Extract the existing URLs from the local CSV file.
# Each entry of `urls` is one CSV row (a list of cells); the URL is cell 0.
urls = []

# newline='' is what the csv module expects for file objects, and
# 'utf-8-sig' transparently drops a byte-order mark (e.g. from an Excel
# export) that would otherwise be glued onto the first URL.
with open('06d-01.csv', newline='', encoding='utf-8-sig') as fr:
    reader = csv.reader(fr)  # reader object
    for row in reader:
        # csv.reader yields [] for blank lines; those carry no URL
        if row:
            urls.append(row)


# store parsed elements for all urls (one list of column values per record)
parseElements = []


def _first_text(soup, tag, attrs=None):
    '''
    Return the stripped text of the first element matching `tag`.

    Parameters
    ----------
    soup : parsed BeautifulSoup document to search
    tag : element name handed to soup.find (e.g. 'mods:title')
    attrs : optional attribute filter dict handed to soup.find

    Returns
    -------
    The element text with surrounding whitespace removed, or the
    placeholder 'undefined' when no matching element exists.
    '''
    element = soup.find(tag, attrs or {})
    if element is None:
        return 'undefined'
    return element.text.strip()


# Values that are identical for every record (hoisted out of the loop).
language = 'eng'
# NOTE(review): the parsed URLs are on d.lib.msu.edu -- confirm that
# 'University of Michigan' is the intended provider value.
provider = 'University of Michigan'
resourceClass = 'Maps'
#     resourceType = 'Cadastral maps'
#     subject = 'Real Property|Landowners'
accessRights = 'Public'
memberOf = '06d-01'
fileType = 'TIFF'
code = '06d-01'
status = 'Active'
accrualMethod = 'MODS'        # set but not written to the output row
dateAccessioned = '2021-06-02'

for row in urls:
    # Each entry of `urls` is a CSV row; the URL is its first cell.
    # Skip blank rows instead of failing on row[0].
    if not row or not row[0].strip():
        continue
    url = row[0].strip()

    # Report the URL before fetching so a failing request can be identified.
    print(f'Parsing {url}')

    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(url) as response:
        page = response.read()
    soup = BeautifulSoup(page, "xml")

    title = _first_text(soup, 'mods:title')
    description = _first_text(soup, 'mods:note')
    creator = _first_text(soup, 'mods:namePart')
    publisher = _first_text(soup, 'mods:publisher')
    date = _first_text(soup, 'mods:dateIssued')
    bBox = _first_text(soup, 'mods:coordinates')
    placeName = _first_text(soup, 'mods:geographic')
    scale = _first_text(soup, 'mods:scale')
    # library catalog record
    lib = _first_text(soup, 'mods:url', {"note": "catalog_record"})
    # ARK identifier
    ark = _first_text(soup, 'mods:url', {"note": "ark"})
    # parsed, but not part of the output row below
    rights = _first_text(soup, 'mods:accessCondition')

    # Combine the scraped information. The first column is the URL string
    # itself (previously the whole CSV row was stored, which rendered as
    # "['https://...']" in the output file).
    parseElements.append([
        url, title, description, language, creator, publisher, provider,
        resourceClass, date, bBox, placeName, scale, fileType, lib, ark,
        memberOf, code, accessRights, status, dateAccessioned,
    ])

# generate action date with format YYYYMMDD
actionDate = time.strftime('%Y%m%d')

# Column headers, in the same order as the values of each parseElements row.
# NOTE(review): the 'Subject' header lines up with the resource-class value
# ('Maps') in the rows built by the parsing loop -- confirm the intended name.
fields = ['link','Title','Description','Language','Creator','Publisher','Provider','Subject',
          'Date Issued','Bounding Box','Spatial Coverage','Scale','Format','Documentation','Identifier','Member Of','Code',
          'Access Rights','Status','Date Accessioned']

# Write outputs to a local csv file.
# newline='' lets the csv module control line endings (otherwise Windows
# gets an empty line after every row); an explicit UTF-8 encoding keeps
# non-ASCII metadata from failing under a legacy default locale encoding.
with open(f'output_06d-01_{actionDate}.csv', 'w', newline='', encoding='utf-8') as fw:
    writer = csv.writer(fw)
    writer.writerow(fields)           # fieldnames
    writer.writerows(parseElements)   # elements

# Reported only after the file has been closed and flushed.
print('#### Job done ####')
    
    

Parsing https://d.lib.msu.edu/maps/1/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/2/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/3/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/4/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/5/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/6/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/7/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/8/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/9/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/10/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/11/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/12/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/13/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/14/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/15/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/16/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/17/datastream/

Parsing https://d.lib.msu.edu/maps/139/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/140/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/141/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/142/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/143/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/144/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/145/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/146/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/147/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/148/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/149/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/150/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/151/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/152/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/153/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/154/datastream/MODS/view/
Parsing https://d.lib.ms

Parsing https://d.lib.msu.edu/maps/274/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/275/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/276/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/277/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/278/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/279/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/280/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/281/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/282/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/283/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/284/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/285/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/286/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/287/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/288/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/289/datastream/MODS/view/
Parsing https://d.lib.ms

Parsing https://d.lib.msu.edu/maps/409/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/410/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/411/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/412/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/413/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/414/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/415/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/416/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/417/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/418/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/419/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/420/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/421/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/422/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/423/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/424/datastream/MODS/view/
Parsing https://d.lib.ms

Parsing https://d.lib.msu.edu/maps/544/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/545/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/546/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/547/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/548/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/549/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/550/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/551/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/552/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/553/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/554/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/555/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/556/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/557/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/558/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/559/datastream/MODS/view/
Parsing https://d.lib.ms

Parsing https://d.lib.msu.edu/maps/679/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/680/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/681/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/682/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/683/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/684/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/685/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/686/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/687/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/688/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/689/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/690/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/691/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/692/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/693/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/694/datastream/MODS/view/
Parsing https://d.lib.ms

Parsing https://d.lib.msu.edu/maps/814/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/815/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/816/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/817/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/818/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/819/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/820/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/821/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/822/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/823/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/824/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/825/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/826/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/827/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/828/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/829/datastream/MODS/view/
Parsing https://d.lib.ms

Parsing https://d.lib.msu.edu/maps/949/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/950/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/951/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/952/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/953/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/954/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/955/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/956/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/957/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/958/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/959/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/960/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/961/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/962/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/963/datastream/MODS/view/
Parsing https://d.lib.msu.edu/maps/964/datastream/MODS/view/
Parsing https://d.lib.ms