# Hoosier Dataset Harvest Script

The Hoosier Resilience Index portal lists 17 links to datasets hosted on ArcGIS Online. This script uses Python to harvest and extract metadata from ArcGIS Online and its ArcGIS REST API.

> Originally created by Gene Cheng ([@Ziiiiing](https://github.com/Ziiiiing)) on October 4, 2021

In [1]:
from arcgis.gis import GIS
from IPython.display import display
import time
import requests
from pyproj import Proj, transform
from bs4 import BeautifulSoup
import urllib.request
import csv

### Request Dataset Links

Request the data page on [Hoosier Resilience Index](https://hri.eri.iu.edu/about/methodology/data-links.html) and get available links for datasets.

In [2]:
# Download the Hoosier Resilience Index "data links" page and collect every
# hyperlink that appears after the "Hoosier Resilience Index Data Links" heading.
hoosier = 'https://hri.eri.iu.edu/about/methodology/data-links.html'

# the context manager closes the HTTP response as soon as the body is read
with urllib.request.urlopen(hoosier) as response:
    page = response.read()
soup = BeautifulSoup(page, 'html.parser')

dataset_urls = []
print('>>> Finding availabe dataset links')

# `string=` is the current name of BeautifulSoup's deprecated `text=` argument
h3_tag = soup.find('h3', string='Hoosier Resilience Index Data Links')
if h3_tag is None:
    # fail with a clear message instead of an AttributeError on None
    raise RuntimeError(
        'Heading "Hoosier Resilience Index Data Links" not found on {}; '
        'the page layout may have changed.'.format(hoosier)
    )

# every sibling element after the heading may hold one or more dataset links
for tag in h3_tag.find_next_siblings():
    for a_tag in tag.find_all('a', href=True):
        href = a_tag['href']
        print(href)
        dataset_urls.append(href)

>>> Finding availabe dataset links
https://iu.maps.arcgis.com/home/item.html?id=6059ba25fe094331816c8b1122effd91
https://iu.maps.arcgis.com/home/item.html?id=d8bfa31c1d9b4a9381a78bdc6bc22354
https://iu.maps.arcgis.com/home/item.html?id=6c6692a2e9d541579317e15968223e7a
https://iu.maps.arcgis.com/home/item.html?id=3e25928907a04b1ca79ce8f61d402ee6
https://iu.maps.arcgis.com/home/item.html?id=2ca611c9b08a462c8d040ef2de53a1ea%20
https://iu.maps.arcgis.com/home/item.html?id=a81b429c6b7e4e978d1370178c55b329%20
https://iu.maps.arcgis.com/home/item.html?id=af2a7697149644508b0b5a7e2504229d%20
https://iu.maps.arcgis.com/home/item.html?id=660f51d8a4b14cadad53400779da1a78
https://iu.maps.arcgis.com/home/item.html?id=8174448c2d6748e6adc972ba9edf871d
https://iu.maps.arcgis.com/home/item.html?id=f6c9968878f440fbbe56193c2edc9959
https://iu.maps.arcgis.com/home/item.html?id=d46dc07fbff345818649bae861accada
https://iu.maps.arcgis.com/home/item.html?id=d8f1bdef5f3147489e835c07a18113f6
https://iu.maps.arcg

### Metadata Construction

Request the web contents of each dataset and construct its metadata record.

In [3]:
# transform coordinates from projected coordinates to geographic coordinates
def convert_coords(extent):
    """Convert an ArcGIS extent to a WGS84 bounding-box string.

    Args:
        extent: dict with the keys ``xmin``, ``ymin``, ``xmax``, ``ymax`` and
            ``spatialReference``; the latter must hold ``latestWkid`` or
            ``wkid``, which is used as an EPSG code.

    Returns:
        The string ``'xmin,ymin,xmax,ymax'`` in longitude/latitude order, each
        value rounded to 4 decimal places.

    Raises:
        KeyError: if a required key is missing from ``extent``.
    """
    # Transformer replaces the deprecated Proj(init=...) / transform() API
    from pyproj import Transformer

    # spatial reference: prefer latestWkid, fall back to wkid when it is absent
    spatial_ref = extent['spatialReference']
    wkid = spatial_ref.get('latestWkid') or spatial_ref['wkid']

    # always_xy=True keeps the x/y (lon/lat) axis order that the old
    # "init=" syntax used, so the output order is unchanged
    transformer = Transformer.from_crs(
        'EPSG:{}'.format(wkid),
        'EPSG:4326',   # WGS84
        always_xy=True,
    )

    # only the lower-left and upper-right corners are reprojected
    xmin, ymin = transformer.transform(extent['xmin'], extent['ymin'])
    xmax, ymax = transformer.transform(extent['xmax'], extent['ymax'])

    return '{},{},{},{}'.format(round(xmin, 4), round(ymin, 4), round(xmax, 4), round(ymax, 4))

In [4]:
# convert file size from integer in bytes to a human readable string
def convert_bytes(size):
    """Format a size in bytes as a human readable string.

    Args:
        size: number of bytes (int or float).

    Returns:
        A string such as ``'512.0 bytes'``, ``'1.5 KB'`` or ``'2.0 GB'``, using
        1024 as the step between units. Sizes of 1024 TB or more are reported
        in PB, so the result is always a string.
    """
    for unit in ('bytes', 'KB', 'MB', 'GB', 'TB'):
        if size < 1024.0:
            return "%3.1f %s" % (size, unit)
        size /= 1024.0

    # anything that is still >= 1024 after TB is expressed in PB rather than
    # returned as a bare number without a unit
    return "%3.1f %s" % (size, 'PB')

In [5]:
def construct_metadata(link):
    """Build one metadata record for an ArcGIS Online item.

    The record combines item properties read through the ArcGIS API for Python,
    the full extent read from the item's REST endpoint, hard-coded values, and
    empty placeholders to be filled in manually later.

    Args:
        link: item page URL whose last ``=``-separated part is the item id,
            e.g. ``https://iu.maps.arcgis.com/home/item.html?id=<item id>``.

    Returns:
        A list of 34 values ordered as: title, alternative title, description,
        language, creator, title source, resource class, ISO topic categories,
        keyword, date issued, temporal coverage, date range, spatial coverage,
        bounding box, resource type, format, information, download, map server,
        feature server, image server, id, identifier, provider, code,
        member of, status, accrual method, date accessioned, rights,
        access rights, suppressed, child record, file size.

    Raises:
        ValueError: if no item is returned for the parsed id.
        requests.HTTPError: if the REST endpoint answers with an error status.
    """
    from urllib.parse import unquote

    # parse item id from the link; some harvested links end with a URL-encoded
    # space ("%20") that is not part of the id, so decode and strip it
    idElement = unquote(link.split('=')[-1]).strip()

    # access content on ArcGIS Online
    gis = GIS()
    content = gis.content.get(idElement)
    if content is None:
        # fail with a clear message instead of an AttributeError on None
        raise ValueError(
            'No ArcGIS Online item found for id {!r} (link: {})'.format(idElement, link)
        )
    display(content)

    # extract values from arcgis online
    alternativeTitle = content.title
    description = content.snippet
    creator = content.owner
    keyword = '|'.join(content.tags)
    # content.created is divided by 1000 before conversion (milliseconds -> seconds)
    dateIssued = time.strftime('%Y-%m-%d', time.localtime(content.created / 1000))

    information = 'https://iu.maps.arcgis.com/home/item.html?id=' + content.id
    downloadURL = ""
    mapServer = 'https://iu.maps.arcgis.com/apps/mapviewer/index.html?layers={}'.format(content.id)
    featureServer = content.url
    imageServer = 'https://iu.maps.arcgis.com/sharing/rest/content/items/{}/info/thumbnail/thumbnail.png'.format(content.id)

    idElement = content.id
    identifier = information
    fileSize = convert_bytes(content.size)

    # extract spatial values from ArcGIS REST API; the timeout keeps one
    # unresponsive service from hanging the whole harvest
    response = requests.get(content.url, params={'f': 'pjson'}, timeout=30)
    response.raise_for_status()
    data = response.json()

    extent = data['fullExtent']
    bbox = convert_coords(extent)
    spatialCoverage = "Indiana State"

    # empty fields to be manually edited later
    title = ""
    isoTopCat = ""
    temporalCoverage = ""
    dateRange = ""

    # fields with hard-coded value
    language = "eng"
    titleSource = "State of Indiana"        # called Publisher before
    resourceClass = "Datasets"
    provider = "Indiana University"

    resourceType = "Vector data"            # feature layers on ArcGIS Online
    formatElement = "Shapefile"

    code = "01d-04"
    memberOf = "01d-04"

    status = "Active"
    accrualMethod = "Hoosier Resilience Index"
    dateAccessioned = time.strftime("%Y-%m-%d")
    rights = ""
    accessRights = "Public"
    suppressed = "FALSE"
    child = "FALSE"

    # the order must match the CSV header row written by the report cell
    metadata = [
        title, alternativeTitle, description, language, creator, titleSource,
        resourceClass, isoTopCat, keyword, dateIssued, temporalCoverage,
        dateRange, spatialCoverage, bbox, resourceType,
        formatElement, information, downloadURL, mapServer, featureServer,
        imageServer, idElement, identifier, provider, code, memberOf, status,
        accrualMethod, dateAccessioned, rights, accessRights, suppressed, child, fileSize,
    ]

    return metadata

In [6]:
# Collect one metadata record per dataset link, in the order the links were found.
All_Metadata = list()

for link in dataset_urls:
    # each record merges ArcGIS Online item properties with REST API extent data
    metadata = construct_metadata(link)
    All_Metadata += [metadata]

### Write CSV Reports

In [8]:
# Column headers of the CSV report, one per value in a metadata record.
fieldnames = [
    'Title',
    'Alternative Title',
    'Description',
    'Language',
    'Creator',
    'Title Source',
    'Resource Class',
    'ISO Topic Categories',
    'Keyword',
    'Date Issued',
    'Temporal Coverage',
    'Date Range',
    'Spatial Coverage',
    'Bounding Box',
    'Resource Type',
    'Format',
    'Information',
    'Download',
    'MapServer',
    'FeatureServer',
    'ImageServer',
    'ID',
    'Identifier',
    'Provider',
    'Code',
    'Member Of',
    'Status',
    'Accrual Method',
    'Date Accessioned',
    'Rights',
    'Access Rights',
    'Suppressed',
    'Child Record',
    "File Size",
]

# Date stamp (YYYYMMDD, local time) used in the report file name.
actionDate = time.strftime('%Y%m%d', time.localtime())

In [9]:
import os

# make sure the output folder exists before the report is written
os.makedirs('reports', exist_ok=True)

# newline='' lets the csv module control line endings (no blank rows on Windows);
# the explicit encoding keeps non-ASCII metadata text portable across platforms
with open('reports/metadata_{}.csv'.format(actionDate), 'w', newline='', encoding='utf-8') as fw:
    writer = csv.writer(fw)
    writer.writerow(fieldnames)     # header row
    writer.writerows(All_Metadata)  # one row per harvested dataset