In [None]:
'''
The script aims to parse HTML elements for the Illinois Geospatial Data Clearinghouse
and extract parsed content into a local CSV. The progress is maintained on GitHub
(https://github.com/BTAA-Geospatial-Data-Project/parse-html).


Files
-----
x.csv
	A local CSV file that stores the URLs to be parsed.
output_yyyymmdd.csv
	The output file produced by parsing; its name ends with the action date.


Developers
----------
Originally created on xxxxx
Created by Karen Majewicz  @karenmajewicz

Updated December 14, 2020
Updated by Ziying Cheng  @Ziiiiing

Updated May 26, 2021 for Illinois Geospatial Data Clearinghouse

'''

In [78]:
import csv
import time
import urllib.request
from bs4 import BeautifulSoup

# Extract the URLs to parse from the local CSV file.
# Every kept row is stored whole (a list of column values); the page URL is
# taken from the first column when the rows are consumed.
urls = []

# newline='' is what the csv module asks for, so that newlines embedded in
# quoted fields are interpreted by the reader rather than by the file object.
with open('02a-01.csv', newline='') as fr:
    reader = csv.reader(fr)  # reader object
    for row in reader:
        # Skip blank lines and rows whose first column is empty: they carry no
        # URL and would otherwise fail later when row[0] is fetched.
        if row and row[0].strip():
            urls.append(row)


# Store the parsed elements for all URLs: one
# [title, summary, metadata, fileInfo, serviceInfo] row per page.
parseElements = []

for url in urls:
    # Announce the page before fetching so a failing request can be traced
    # back to its URL.
    print(f'Parsing {url[0]}')

    # Read the whole body, then close the HTTP response right away instead of
    # leaving the connection open until garbage collection.
    with urllib.request.urlopen(url[0]) as response:
        page = response.read()
    soup = BeautifulSoup(page, "html.parser")

    # TITLE - falls back to "none" (like the other fields) when the page has
    # no element with id="page-title".
    titleField = soup.find(attrs={'id': 'page-title'})
    title = titleField.text.strip() if titleField is not None else "none"

    # METADATA LINK - first <a> that has an href and whose text is exactly "Link".
    # `string=` is the current spelling of the deprecated `text=` filter.
    metadataLink = soup.find('a', href=True, string="Link")
    metadata = metadataLink['href'] if metadataLink is not None else "none"

    # SUMMARY
    summaryField = soup.find(attrs={"property": "content:encoded"})
    summary = summaryField.text.strip() if summaryField is not None else "none"

    # Download file info - works, but pulls entire class.
    # Reset for every page: without this, a page that has no download section
    # would silently reuse the previous page's value (or raise NameError on
    # the very first page).
    fileInfo = "none"
    for fileContent in soup.find_all(attrs={'class':'collapsible collapsed group-downloads field-group-htab form-wrapper'}):
        for downloadFields in fileContent.children:
            # Each child overwrites the previous one, so the last child of the
            # last matching element is what ends up in the output row.
            fileInfo = downloadFields
            print(downloadFields)

    # Service info - works, but pulls entire class.
    # Reset per page for the same reason as fileInfo above.
    serviceInfo = "none"
    for serviceContent in soup.find_all(attrs={'class':'collapsible collapsed group_services field-group-htab form-wrapper'}):
        for serviceFields in serviceContent.children:
            serviceInfo = serviceFields

#     #Possible option for everything else - but would need to be extensively parsed
#     nodeContentField = soup.find(attrs={'class':'node-content'})
#     nodeContent = nodeContentField.text.strip()

    # combine the scraped information
    parseElements.append([title,summary,metadata,fileInfo,serviceInfo])

# Generate the action date in YYYYMMDD format; it becomes part of the output
# file name.
actionDate = time.strftime('%Y%m%d')

# Write the outputs to a local CSV file.
# newline='' stops the csv writer's own line endings from being translated
# again by the file object (which shows up as blank rows on Windows).
# encoding='utf-8' keeps scraped text with non-ASCII characters from failing
# under a platform-default legacy encoding.
with open(f'output_02a-01_{actionDate}.csv', 'w', newline='', encoding='utf-8') as fw:
    fields = ['Title','Description','HTML','Download','Service']

    writer = csv.writer(fw)
    writer.writerow(fields)           # fieldnames
    writer.writerows(parseElements)   # elements

# Reported only after the file has been flushed and closed.
print('#### Job done ####')
    
    

Parsing https://clearinghouse.isgs.illinois.edu/data/climate/illinois-climate-network-soil-data
Parsing https://clearinghouse.isgs.illinois.edu/data/climate/illinois-climate-network-weather-data
Parsing https://clearinghouse.isgs.illinois.edu/data/coastal/HTEM/lake-michigan-coast-2017
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><div class="field field-name-field-data-distribution field-type-text-with-summary field-label-hidden view-mode-full"><div class="field-items"><div class="field-item even"><p> </p>
<p> </p>
<p> </p>
</div></div></div><div class="field field-name-field-download-description field-type-text-with-summary field-label-hidden view-mode-full"><div class="field-items"><div class="field-item even"><p>All the data are georeferenced to WGS_1984 UTM Zone 16 and require 430Mb of uncompressed space. Folder structure and contents:</p>
<ol><li>Root -- SkyTEM project report and journal article in *.pdf format</li>
<li>Isopach_Data -- is

Parsing https://clearinghouse.isgs.illinois.edu/data/coastal/bathy
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><div class="field field-name-field-data-distribution field-type-text-with-summary field-label-hidden view-mode-full"><div class="field-items"><div class="field-item even"><p style="font-size: 14px;">Single-beam bathymetry data, presented as point shapefiles, were collected along the shore of Illinois Beach State Park at a variety of spatial scales. Each dataset has an associated .xml file containing FGDC-compliant metadata. </p>
<p style="font-size: 14px;">Survey dates are listed in <a href="https://chf.isgs.illinois.edu/coastal/singlebeam_data/BathySurveyDates.csv">BathySurveyDates.csv</a></p>
<p style="font-size: 14px;"><a href="https://chf.isgs.illinois.edu/coastal/Singlebeam_data/Singlebeam_data.zip">Download all bathymetry data here.</a> </p>
</div></div></div></div>
Parsing https://clearinghouse.isgs.illinois.edu/data/elevatio

Parsing https://clearinghouse.isgs.illinois.edu/data/elevation/surface-elevation-30-meter-digital-elevation-model-dem
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><section class="field field-name-field-zip-data field-type-file field-label-above view-mode-full"><h2 class="field-label">Zip Data: </h2><div class="field-items"><div class="field-item even"><span class="file"><img alt="Package icon" class="file-icon" src="/modules/file/icons/package-x-generic.png" title="application/zip"/> <a href="https://clearinghouse.isgs.illinois.edu/sites/clearinghouse.isgs/files/Clearinghouse/data/ISGS/Elevation/zips/IL_Dem_30m.zip" title="IL_Dem_30m.zip" type="application/zip; length=117910714">30M DEM.zip (112.45 MB)</a></span></div></div></section></div>
Parsing https://clearinghouse.isgs.illinois.edu/data/elevation/surface-elevation-30-meter-shaded-relief-map
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><section

Parsing https://clearinghouse.isgs.illinois.edu/data/geology/elevation-new-albany-shale
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><section class="field field-name-field-zip-data field-type-file field-label-above view-mode-full"><h2 class="field-label">Zip Data: </h2><div class="field-items"><div class="field-item even"><span class="file"><img alt="Package icon" class="file-icon" src="/modules/file/icons/package-x-generic.png" title="application/zip"/> <a href="https://clearinghouse.isgs.illinois.edu/sites/clearinghouse.isgs/files/Clearinghouse/data/ISGS/Geology/zips/IL_Structure_NewAlbany_Elev_20ft_Ln.zip" type="application/zip; length=3880641">IL_Structure_NewAlbany_Elev_20ft_Ln.zip</a></span></div></div></section></div>
Parsing https://clearinghouse.isgs.illinois.edu/data/geology/geologic-and-geophysical-maps-ozark-illinois-indiana-and-kentucky-oiink-region
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-w

Parsing https://clearinghouse.isgs.illinois.edu/data/geology/illinois-borehole-temperatures
Parsing https://clearinghouse.isgs.illinois.edu/data/geology/illinois-drill-stem-tests
Parsing https://clearinghouse.isgs.illinois.edu/data/geology/illinois-waterflood-units-1946-2002
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><section class="field field-name-field-zip-data field-type-file field-label-above view-mode-full"><h2 class="field-label">Zip Data: </h2><div class="field-items"><div class="field-item even"><span class="file"><img alt="Package icon" class="file-icon" src="/modules/file/icons/package-x-generic.png" title="application/zip"/> <a href="https://clearinghouse.isgs.illinois.edu/sites/clearinghouse.isgs/files/Clearinghouse/data/ISGS/Geology/zips/IL_Historical_Waterflood_Areas_Py.zip" type="application/zip; length=356390">IL_Historical_Waterflood_Areas_Py.zip</a></span></div></div></section></div>
Parsing https://clearinghouse.isgs.ill

Parsing https://clearinghouse.isgs.illinois.edu/data/geology/structural-features-anticlines-synclines-and-monoclines
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><section class="field field-name-field-zip-data field-type-file field-label-above view-mode-full"><h2 class="field-label">Zip Data: </h2><div class="field-items"><div class="field-item even"><span class="file"><img alt="Package icon" class="file-icon" src="/modules/file/icons/package-x-generic.png" title="application/zip"/> <a href="https://clearinghouse.isgs.illinois.edu/sites/clearinghouse.isgs/files/Clearinghouse/data/ISGS/Geology/zips/IL_Struct_Feat_Clines_1995_Ln.zip" title="IL_Struct_Feat_Clines_1995_Ln.zip" type="application/zip; length=59297">IL Anticlines Synclines Monoclines.zip</a></span></div></div></section></div>
Parsing https://clearinghouse.isgs.illinois.edu/data/geology/structural-features-faults-grabens-and-flexures
<legend><span class="fieldset-legend">Data</span><

Parsing https://clearinghouse.isgs.illinois.edu/data/hydrology/major-aquifers
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><section class="field field-name-field-zip-data field-type-file field-label-above view-mode-full"><h2 class="field-label">Zip Data: </h2><div class="field-items"><div class="field-item even"><span class="file"><img alt="Package icon" class="file-icon" src="/modules/file/icons/package-x-generic.png" title="application/zip"/> <a href="https://clearinghouse.isgs.illinois.edu/sites/clearinghouse.isgs/files/Clearinghouse/data/ISGS/Hydrology/zips/IL_Major_Aquifers.zip" title="IL_Major_Aquifers.zip" type="application/zip; length=12975510">Il Major Aquifers.zip</a></span></div></div></section></div>
Parsing https://clearinghouse.isgs.illinois.edu/data/hydrology/major-bedrock-aquifers-depths-greater-500-feet-below-ground-surface
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><section class

Parsing https://clearinghouse.isgs.illinois.edu/data/imagery/1937-1947-illinois-historical-aerial-photography
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><div class="field field-name-field-data-distribution field-type-text-with-summary field-label-hidden view-mode-full"><div class="field-items"><div class="field-item even"><p><strong>JPEG File Format</strong></p>
<p>This format is a compressed file with the following file extension <q>.jpg</q>. The original TIFF format files were compressed using Adobe Photoshop CS4 and their Quality setting was set to 10. The average file size is approximately 7.25 megabytes for 7x9 inch photos and 9.25 megabytes for 9x9 inch photos. These files should open in most browsers as they are an open source file format. These files have not been georeferneced.</p>
<p><strong>TIFF File Format</strong></p>
<p>This format is a full resolution file format with the following file extension <q>.tif</q>. These are the or

Parsing https://clearinghouse.isgs.illinois.edu/data/imagery/1998-2001-illinois-digital-orthophoto-quadrangle-doq-data
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><div class="field field-name-field-data-distribution field-type-text-with-summary field-label-hidden view-mode-full"><div class="field-items"><div class="field-item even"><h3>Compressed Data</h3>
<p><img alt="Map of 1998 - 2001 DOQ Data  by Year of Image Capture" height="230" src="/sites/clearinghouse.isgs/files/images/ISGS/napp3_imageyear.png" style="margin: 10px; float: right;" title="Map of 1998 - 2001 DOQ Data  by Year of Image Capture" width="139"/><img alt="Map Legend: Year of Image Capture  (1994, 1996, 1998, 1999, 2000 and 2001)" height="99" src="/sites/clearinghouse.isgs/files/images/ISGS/napp3_legend_sm.png" style="float: right; margin: 10px;" title="Map Legend: Year of Image Capture  (1994, 1996, 1998, 1999, 2000 and 2001)" width="59"/>Data files are offered as LizardTec

Parsing https://clearinghouse.isgs.illinois.edu/data/imagery/2005-illinois-chicago-urban-area-orthophotography
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><div class="field field-name-field-data-distribution field-type-text-with-summary field-label-hidden view-mode-full"><div class="field-items"><div class="field-item even"><h2>Data Distribution: Compressed Data</h2>
<p>Compressed data are LizardTech's MrSID Generation 2 .sid files. Target compression ratio was 8:1; one compressed file is roughly 9 megabytes in size.</p>
<p>A statewide collection of both 2005 CUA data (40 Gigabytes) and 2005 National Aerial Photography Program (NAPP) data requires about 100 Gigabytes of storage space. We do not plan to distribute compressed data files by any other means (cd-rom, zip drives, portable drives, ftp download, etc).</p>
<p>The collection can be viewed as an Image Service on the Viewer tab of this webpage, or in ArcGIS Desktop, ArcGIS.com or any cl

Parsing https://clearinghouse.isgs.illinois.edu/data/imagery/2007-illinois-naip-digital-orthophoto-quadrangle-data
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><div class="field field-name-field-data-distribution field-type-text-with-summary field-label-hidden view-mode-full"><div class="field-items"><div class="field-item even"><h2 style="font-size: 1.5em; margin-top: 1.67em; margin-bottom: 0.67em; font-family: Roboto, Verdana, Geneva, sans-serif; font-weight: normal;">Data Distribution</h2>
<p>This imagery is currently only available through the Illinois State Geological Survey as an ArcGIS Image Service.  Click the Map Service button above to go to the REST services endpoint.</p>
</div></div></div></div>
Parsing https://clearinghouse.isgs.illinois.edu/data/imagery/2010-illinois-naip-digital-orthophoto-quadrangle-data
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><div class="field field-name-field-

Parsing https://clearinghouse.isgs.illinois.edu/data/imagery/2011-illinois-naip-digital-orthophoto-quadrangle-data
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><div class="field field-name-field-data-distribution field-type-text-with-summary field-label-hidden view-mode-full"><div class="field-items"><div class="field-item even"><h2 style="font-size: 1.5em; margin-top: 1.67em; margin-bottom: 0.67em; font-family: Roboto, Verdana, Geneva, sans-serif; font-weight: normal;">Data Distribution</h2>
<p>This imagery is currently only available through the Illinois State Geological Survey as an ArcGIS Image Service.  Click the Map Service button above to go to the REST services endpoint.</p>
</div></div></div></div>
Parsing https://clearinghouse.isgs.illinois.edu/data/imagery/2012-illinois-naip-digital-orthophoto-quadrangle-data
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><div class="field field-name-field-

Parsing https://clearinghouse.isgs.illinois.edu/data/landcover/illinois-landcover-early-1800s
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><div class="field field-name-field-data-distribution field-type-text-with-summary field-label-hidden view-mode-full"><div class="field-items"><div class="field-item even"><p>In Illinois, the surveys began in 1804 and were largely completed by 1843. The surveyors moved across the state laying out a rectangular grid system, known as the Public Land Survey System (PLS or PLSS). They were required to keep field notebooks where they noted details about their survey (such as which kind of tree was 'blazed' or marked at the section corners), as well as notes about the quality of the landscape, mines, salt licks, watercourses, springs, mill seats and other 'remarkable and permanent things'. Once a township was finished, the surveyors were to make a map of the area. These surveys represent one of the earliest detai

Parsing https://clearinghouse.isgs.illinois.edu/data/land-cover/land-cover-illinois-1999-2000-data
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><section class="field field-name-field-zip-data field-type-file field-label-above view-mode-full"><h2 class="field-label">Zip Data: </h2><div class="field-items"><div class="field-item even"><span class="file"><img alt="Package icon" class="file-icon" src="/modules/file/icons/package-x-generic.png" title="application/zip"/> <a href="https://clearinghouse.isgs.illinois.edu/sites/clearinghouse.isgs/files/Clearinghouse/data/ISGS/Land/zips/lcoi_geotiff.zip" title="lcoi_geotiff.zip" type="application/zip; length=33055655">GeoTIFF.zip </a></span></div><div class="field-item odd"><span class="file"><img alt="Package icon" class="file-icon" src="/modules/file/icons/package-x-generic.png" title="application/zip"/> <a href="https://clearinghouse.isgs.illinois.edu/sites/clearinghouse.isgs/files/Clearinghouse/dat

Parsing https://clearinghouse.isgs.illinois.edu/data/land-cover/usda-nass-cropland-data-layer-illinois-2007
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><div class="field field-name-field-download-description field-type-text-with-summary field-label-hidden view-mode-full"><div class="field-items"><div class="field-item even"><h2>Purpose</h2>
<p>The purpose of the Illinois Cropland Data Layer Program is to use satellite imagery on an annual basis to provide supplemental acreage estimates for the state's major commodities, and to produce digital, crop specific, categorized geo-referenced output products. These data are intended for geographic display and analysis at the state level. The cropland data layers are provided as is. USDA-NASS does not warrant results you may obtain using the data.</p>
<h2>Copyright Restrictions</h2>
<p>There are NO copyright restrictions with the NASS Cropland Data Layer imagery. The categorized imagery is considered

Parsing https://clearinghouse.isgs.illinois.edu/data/reference/blm-illinois-public-land-survey-system
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><section class="field field-name-field-zip-data field-type-file field-label-above view-mode-full"><h2 class="field-label">Zip Data: </h2><div class="field-items"><div class="field-item even"><span class="file"><img alt="Package icon" class="file-icon" src="/modules/file/icons/package-x-generic.png" title="application/zip"/> <a href="https://clearinghouse.isgs.illinois.edu/sites/clearinghouse.isgs/files/Clearinghouse/data/ISGS/Reference/zips/IL_CadNSDI_V2.gdb_.zip" type="application/zip; length=34555797">IL_CadNSDI_V2.gdb_.zip</a></span></div></div></section></div>
Parsing https://clearinghouse.isgs.illinois.edu/data/reference/illinois-county-boundaries-polygons-and-lines
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><section class="field field-name-field-z

Parsing https://clearinghouse.isgs.illinois.edu/data/reference/usgs-digital-raster-graphic-drg-mosaic
Parsing https://clearinghouse.isgs.illinois.edu/data/reference/usgs-quadrangle-boundaries-and-corner-points-illinois
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><section class="field field-name-field-zip-data field-type-file field-label-above view-mode-full"><h2 class="field-label">Zip Data: </h2><div class="field-items"><div class="field-item even"><span class="file"><img alt="Package icon" class="file-icon" src="/modules/file/icons/package-x-generic.png" title="application/zip"/> <a href="https://clearinghouse.isgs.illinois.edu/sites/clearinghouse.isgs/files/Clearinghouse/data/ISGS/Reference/zips/IL_QUAD_Usgs_Quadrangle_Index_Nad83.zip" title="IL_QUAD_Usgs_Quadrangle_Index_Nad83.zip" type="application/zip; length=6079531">IL ISGS Quadrangle Index.zip</a></span></div></div></section></div>
#### Job done ####
