# Astropedia product page/HTML parse

In [1]:
# url = 'https://astrogeology.usgs.gov/search/map/Ganymede/Geology/Ganymede_SIM3237_Database'
url = 'https://astrogeology.usgs.gov/search/map/Moon/Geology/Unified_Geologic_Map_of_the_Moon_GIS_v2'

url

'https://astrogeology.usgs.gov/search/map/Moon/Geology/Unified_Geologic_Map_of_the_Moon_GIS_v2'

In [2]:
rootname = url.split('/')[-1]
rootname

'Unified_Geologic_Map_of_the_Moon_GIS_v2'

In [3]:
# Get the HTML from 'url' and parse it into an lxml element tree.
#
import requests
import lxml.html

# Seconds to wait for the server. Without a timeout, `requests.get` may block
# the kernel indefinitely if the server never answers.
REQUEST_TIMEOUT = 30

res = requests.get(url, timeout=REQUEST_TIMEOUT)
# Stop here on HTTP errors (4xx/5xx) instead of parsing an error page.
res.raise_for_status()

# HTML comments are dropped at parse time so they never show up in queries.
tree = lxml.html.fromstring(res.content, parser=lxml.html.HTMLParser(remove_comments=True))

# 'tmeta': the metadata section of the page (first match; IndexError if absent).
# 'theader': the "Sample" link(s) found in the downloads box.
tmeta = tree.xpath('//div[@class="content"]/section[@class="block metadata"]')[0]
theader = tree.xpath('//div[@class="downloads"]//a[.="Sample"]')

In [4]:
theader[0].attrib['href']

'https://astropedia.astrogeology.usgs.gov/download/Moon/Geology/thumbs/merc_Unified_Geologic_Map_of_The_Moon_1024.jpg'

In [5]:
data = {}

In [6]:
for node in tmeta.xpath('h2'):
    print(node.text)

Unified Geologic Map of the Moon, 1:5M, 2020
General
Keywords
Contact and Distribution
Data Status and Quality
Lineage
Geospatial Information


In [15]:
tmeta.xpath('p//text()')

['This new work represents a seamless, globally consistent, 1:5,000,000-scale geologic map derived from \n the six digitally renovated geologic maps (see Source Online Linkage below). The goal of this project was to \ncreate a digital resource for science research and analysis, future geologic mapping efforts, be it local-, regional-, \nor global-scale products, and as a resource\n for the educators and the public interested in lunar geology. Here we present the completed mapping \n project as unit contacts, geologic unit polygons, linear features, and unit and feature nomenclature \n annotation. The product overlies shaded-relief products derived from SELENE Kaguya terrain camera stereo \n (equatorial, ~60 m/pix) and LOLA altimetry (north and south polar, 100 m/pix). These data are not \n included with this download due to size considerations, but a readme in the "Lunar_Raster" folder \n provides the download links. This download page includes a PDF of the geologic map (right-side) wi

In [16]:
for node in tmeta.xpath('p'):
    print(node.text)

This new work represents a seamless, globally consistent, 1:5,000,000-scale geologic map derived from 
 the six digitally renovated geologic maps (see Source Online Linkage below). The goal of this project was to 
create a digital resource for science research and analysis, future geologic mapping efforts, be it local-, regional-, 
or global-scale products, and as a resource
 for the educators and the public interested in lunar geology. Here we present the completed mapping 
 project as unit contacts, geologic unit polygons, linear features, and unit and feature nomenclature 
 annotation. The product overlies shaded-relief products derived from SELENE Kaguya terrain camera stereo 
 (equatorial, ~60 m/pix) and LOLA altimetry (north and south polar, 100 m/pix). These data are not 
 included with this download due to size considerations, but a readme in the "Lunar_Raster" folder 
 provides the download links. This download page includes a PDF of the geologic map (right-side) with a brief 

In [21]:
pars = tmeta.xpath('p')
pars

[<Element p at 0x7fa9c028ba90>,
 <Element p at 0x7fa9c028bae0>,
 <Element p at 0x7fa9c028bb30>,
 <Element p at 0x7fa9c028bc70>]

In [34]:
pars[3].text_content()

'Fortezzo, C.M., Spudis, P. D. and Harrel, S. L. (2020). Release of the Digital Unified Global Geologic Map of the Moon At 1:5,000,000- Scale. Paper presented at the 51st Lunar and Planetary Science Conference, Lunar and Planetary Institute, Houston, TX. https://www.hou.usra.edu/meetings/lpsc2020/pdf/2760.pdf'

In [17]:
title = tmeta.find('h2')
title

<Element h2 at 0x7fbcd8dfc4a0>

In [18]:
title.tag

'h2'

In [19]:
title.attrib

{'class': 'title'}

In [20]:
title.text

'Global Geologic Map of Ganymede, SIM3237'

In [21]:
data.update({'title': title.text})

In [67]:
# description = tree.xpath('h2/following-sibling::p')
description = tmeta.xpath('p/text()')
description

['      Ganymede is the largest satellite of Jupiter, and its icy surface has been formed through a variety of of impact cratering, tectonic, and possibly cryovolcanic processes.  The history of Ganymede can be divided into three distinct phases: an early phase dominated by impact cratering and mixing of non-ice materials in the icy crust, a phase in the middle of its history marked by great tectonic upheaval, and a late quiescent phase characterized by a gradual drop in heat flow and further impact cratering.  Images of Ganymede suitable for geologic mapping were collected during the flybys of Voyager 1 and Voyager 2 (1979), as well as during the Galileo mission in orbit around Jupiter (1995-2003).  This map represents a synthesis of our understanding of Ganymede geology after the conclusion of the Galileo mission.',
 "      The two fundamental classes of material units on Ganymede are dark materials and light materials.  The dark/light distinction is based on sharp relative albedo co

In [23]:
# Build the abstract as a list of whitespace-trimmed paragraphs.
# 'description' holds plain strings when it comes from a 'p/text()' query
# (strings have no '.text' attribute) and elements when it comes from a 'p'
# query; accept both so this cell does not depend on which query ran last.
abstract = []
for desc in description:
    paragraph = desc if isinstance(desc, str) else desc.text
    # Skip missing or whitespace-only pieces rather than storing empty strings.
    if paragraph and paragraph.strip():
        abstract.append(paragraph.strip())

abstract

['Ganymede is the largest satellite of Jupiter, and its icy surface has been formed through a variety of of impact cratering, tectonic, and possibly cryovolcanic processes.  The history of Ganymede can be divided into three distinct phases: an early phase dominated by impact cratering and mixing of non-ice materials in the icy crust, a phase in the middle of its history marked by great tectonic upheaval, and a late quiescent phase characterized by a gradual drop in heat flow and further impact cratering.  Images of Ganymede suitable for geologic mapping were collected during the flybys of Voyager 1 and Voyager 2 (1979), as well as during the Galileo mission in orbit around Jupiter (1995-2003).  This map represents a synthesis of our understanding of Ganymede geology after the conclusion of the Galileo mission.',
 "The two fundamental classes of material units on Ganymede are dark materials and light materials.  The dark/light distinction is based on sharp relative albedo contrasts at t

In [24]:
# _cur = title.getnext()

# abstract = []
# while _cur.tag == 'p':
#     abstract.append(_cur.text.strip())
#     _cur = _cur.getnext()

# abstract

In [25]:
data.update({'abstract':abstract})

In [36]:
def _parse_authors(nodes):
        """
        Clean/Split the authors from 'text'.

        The text/names are in a linear comma-separated list ("and" possibly):
        "Fulano de Tal, Maria Brasil (, and) Carlos H. Brandt".
        The output is a list of "lastname, firstname" strings
        """
        line = nodes[0].text.strip()

        authors = [ author.strip()
                    for author in line.replace(' and ',',').split(',') ]
        authors = [ author.split()[-1] + ', ' + ' '.join(author.split()[:-1])
                    for author in authors
                    if author ]

        return authors

authors = tmeta.xpath('dl[1]/dt[.="Author"]/following-sibling::dd[1]')
authors = _parse_authors(authors)

In [35]:
# publication = {
#     'Author' : None,
#     'Publisher' : None,
#     'Publication Date' : None,
# }

# for _dt in _cur.xpath('.//dt'):
#     if _dt.text not in publication.keys():
#         continue
#     _key = _dt.text
#     _val = _dt.getnext().text
#     publication[_key] = _val
    
# publication

In [37]:
data.update({'authors':authors})

In [40]:
pub_date = tmeta.xpath('dl[1]/dt[.="Publication Date"]/following-sibling::dd[1]')[0].text
pub_date

'11 February 2014'

In [41]:
data.update({'publication_date': pub_date})

In [43]:
publisher = tmeta.xpath('dl[1]/dt[.="Publisher"]/following-sibling::dd[1]')
publisher[0].text

'USGS Astrogeology Science Center'

In [44]:
data.update({'publisher': publisher[0].text})

In [59]:
document_url = tmeta.xpath('//dt[.="Supplemental Information"]/following-sibling::dd[1]/a')
document_url[0].text

'http://pubs.usgs.gov/sim/3237/'

In [60]:
data.update({'document_url':document_url[0].text})

In [82]:
def _parse_bbox(nodes):
    """Map astropedia's bounding-box fields to ours"""
    _bbox = {
        'Minimum Longitude': 'westlon',
        'Maximum Longitude': 'eastlon',
        'Minimum Latitude': 'minlat',
        'Maximum Latitude': 'maxlat'
    }
    out = {}
    for node in nodes:
        if node.text in _bbox:
            out.update({ _bbox[node.text]: float(node.getnext().text) })
    return out

bounding_box = tmeta.xpath('h2[text()="Geospatial Information"]//following-sibling::dl[1]/dt')
_parse_bbox(bounding_box)

{'minlat': -90.0, 'maxlat': 90.0, 'westlon': 0.0, 'eastlon': 360.0}

In [21]:
_cur

<Element dl at 0x7fb1305ebd10>

In [66]:
purpose = tmeta.xpath('//dt[text()="Purpose"]/following-sibling::dd[1]/p/text()')
purpose[0]

"Much has been learned about Ganymede's impact cratering,  tectonic,  and possibly cryovolcanic processes since the Voyager flybys,  primarily during and following the Galileo Mission at Jupiter (December 1995-September 2003). Our mapping incorporates this new understanding to assist in map unit definition and provide a global synthesis of Ganymede's geology."

In [23]:
data.update({'purpose': purpose})

In [24]:
# The section headers (<h2>) are children of the metadata section 'tmeta', so
# this relative 'h2[...]' path has to be evaluated there. Evaluated on 'tree'
# it would only look at the direct children of 'tree'.
_geo_dt = 'h2[.="Geospatial Information"]/following-sibling::dl/dt'
latitudes = tmeta.xpath(_geo_dt + '[text()="Minimum Latitude" or text()="Maximum Latitude"]')
longitudes = tmeta.xpath(_geo_dt + '[text()="Minimum Longitude" or text()="Maximum Longitude"]')

In [25]:
for l in latitudes+longitudes:
    print(f"{l.text}: {l.getnext().text}")

Minimum Latitude: -90
Maximum Latitude: 90
Minimum Longitude: 0
Maximum Longitude: 360


In [26]:
bbox = {}
for l in latitudes+longitudes:
    bbox.update({l.text: l.getnext().text})

bbox

{'Minimum Latitude': '-90',
 'Maximum Latitude': '90',
 'Minimum Longitude': '0',
 'Maximum Longitude': '360'}

In [27]:
data.update({'bbox': bbox})

In [28]:
# The "Keywords" header is a child of the metadata section 'tmeta', so the
# relative 'h2[...]' path is evaluated there. Evaluated on 'tree' it would only
# look at the direct children of 'tree' and the '[0]' would hit an empty list.
_keyword_dd = 'h2[.="Keywords"]/following-sibling::dl/dt[text()="{}"]/following-sibling::dd//text()'
system = tmeta.xpath(_keyword_dd.format('System'))[0]
target = tmeta.xpath(_keyword_dd.format('Target'))[0]
print(system,target)

Jupiter Ganymede


In [29]:
data.update({'system': system, 'target':target})

In [30]:
data

{'title': 'Global Geologic Map of Ganymede, SIM3237',
 'abstract': ['Ganymede is the largest satellite of Jupiter, and its icy surface has been formed through a variety of of impact cratering, tectonic, and possibly cryovolcanic processes.  The history of Ganymede can be divided into three distinct phases: an early phase dominated by impact cratering and mixing of non-ice materials in the icy crust, a phase in the middle of its history marked by great tectonic upheaval, and a late quiescent phase characterized by a gradual drop in heat flow and further impact cratering.  Images of Ganymede suitable for geologic mapping were collected during the flybys of Voyager 1 and Voyager 2 (1979), as well as during the Galileo mission in orbit around Jupiter (1995-2003).  This map represents a synthesis of our understanding of Ganymede geology after the conclusion of the Galileo mission.',
  "The two fundamental classes of material units on Ganymede are dark materials and light materials.  The dar

In [31]:
assert None

AssertionError: 

In [None]:
help(title)

In [None]:
import lxml.etree

stree = lxml.etree.tostring(tree)

# Transform to JSON (simpler to handle)

import xmltodict
js = xmltodict.parse(stree)
js

In [None]:
# Print JSON

# import json
# print(json.dumps(js, indent=2))

In [None]:
# Define the structure that maps between "theirs" (Astropedia) and "ours" (Invenio).
# It is a dictionary of the fields we want to fill from the input object;
# the input is the JSON object built from Astropedia's XML.
# Some auxiliary functions are also defined here to process the values during mapping.

_clean_CDATA = lambda s: s.replace('![[CDATA]', '').replace(']]','')
"""Remove "![[CDATA].*]]' from string"""


def _parse_bbox(bbox):
    """Map astropedia's bounding-box fields to ours"""
    _bbox = {
        'westlon': 'westbc',
        'eastlon': 'eastbc',
        'minlat': 'southbc',
        'maxlat': 'northbc'
    }
    return { k:float(bbox[v]) for k,v in _bbox.items() }


_split_sep = lambda s,sep=',': [w.strip() for w in s.split(sep)]
"""Return a list of comma-separated terms (Ex: 'ABC, XYZ')"""


def _parse_authors(text):
    """
    Clean/Split the authors from 'text'. 
    (Ex: ...\n<b>References:</b>\n\nLastname, N.I., Lastname, N.I. and Lastname, N.I. (2020)...
    """
    refs = []
    flag = False
    for line in text.split('\n'):
        line = line.strip()
        if line == '':
            continue
        if 'References:' in line:
            flag = True
            continue
        if flag:
            refs.append(line)
        
    authors = []
    for line in refs:
        l_authors = line.split('(')[0]
        _authors = [ a2 for a1 in _split_sep(l_authors, ' and ') 
                        for a2 in _split_sep(a1, ',') ]
        _authors = [ f"{_authors[i-1]}, {_authors[i]}"
                        for i in range(1,len(_authors),2) ]
        authors.extend(_authors)

    return authors


# metadata mappings (ours: astropedia)
# Each entry is  <our field>: {'path': <'/'-separated keys in astropedia's JSON>,
#                              'proc': <optional function applied to the value>}
_meta = {
    # Citation
    'title': {'path': 'metadata/idinfo/citation/citeinfo/title'},
    'date_pub': {'path': 'metadata/idinfo/citation/citeinfo/pubdate'},
    'origin': {'path': 'metadata/idinfo/citation/citeinfo/origin'},
    'url': {'path': 'metadata/idinfo/citation/citeinfo/onlink'},

    # Description (the abstract is read twice: once as text, once for its authors)
    'description': {
        'path': 'metadata/idinfo/descript/abstract',
        'proc': _clean_CDATA,
    },
    'authors': {
        'path': 'metadata/idinfo/descript/abstract',
        'proc': _parse_authors,
    },
    'document_url': {
        'path': 'metadata/idinfo/descript/supplinf',
        'proc': _clean_CDATA,
    },

    # Status, spatial domain and scope
    'status': {'path': 'metadata/idinfo/status/progress'},
    'bounding_box': {
        'path': 'metadata/idinfo/spdom/bounding',
        'proc': _parse_bbox,
    },
    'scope': {
        'path': 'metadata/idinfo/accscope',
        'proc': _split_sep,
    },

    # Browse image and product download
    'browse': {'path': 'metadata/idinfo/browse/browsen'},
    'product_url': {'path': 'metadata/distinfo/stdorder/digform/digtopt/onlinopt/computer/networka/networkr'},
}


# Function to do the whole thing
def map_jsons(js:dict, mappings:dict):
    """
    Build a dictionary with the keys of 'mappings' and values read from 'js'.

    Each mapping is a dictionary with a 'path' ('/'-separated keys to follow
    inside 'js') and, optionally, a 'proc' callable applied to the value found.
    A key missing along a path raises the lookup error as-is.

    Ex:
    > js = {'key1': {'key2': "val"}}
    > mappings = {
    >     'my_key1' : dict(path='key1/key2', proc=lambda s:s.upper()),
    >     'my_key2' : dict(path='key1/key2')
    > }
    > map_jsons(js, mappings)
    # {'my_key1': 'VAL', 'my_key2': 'val'}
    """
    def _resolve(source, spec):
        """Follow spec['path'] inside 'source'; apply spec['proc'] when given."""
        cursor = source
        for key in spec['path'].split('/'):
            cursor = cursor[key]
        return spec['proc'](cursor) if 'proc' in spec else cursor

    return {
        keyword: _resolve(js, mapping)
        for keyword, mapping in mappings.items()
    }

In [None]:
our_js = map_jsons(js, _meta)
our_js

In [None]:
import json

with open(f'{rootname}.json', 'w') as fp:
    json.dump(js, fp, indent=2)
    
with open(f'{rootname}_OurMeta.json', 'w') as fp:
    json.dump(our_js, fp, indent=2)

In [93]:
from api import astropedia

from importlib import reload
reload(astropedia)

# astropedia.parse_astropedia_html('https://astrogeology.usgs.gov/search/map/Moon/Geology/Unified_Geologic_Map_of_the_Moon_GIS_v2')
astropedia.parse_astropedia_html(url)

In [85]:
url

'https://astrogeology.usgs.gov/search/map/Ganymede/Geology/Ganymede_SIM3237_Database'