In [1]:
from pathlib import Path

from config import PATH_HTML_PAGES
PATH_HTML_PAGES

PosixPath('data/intermediate-results/html-pages')

In [2]:
# glob() yields entries in an arbitrary, filesystem-dependent order; sorting
# keeps the order of the parsed records (and of the JSON written from them)
# identical from one run to the next.
PATHS_TO_SCRAPE = sorted(PATH_HTML_PAGES.glob('*.html'))
len(PATHS_TO_SCRAPE)

8327

# Parsing the HTML pages

## Utility classes

First, some generic base classes:

In [3]:
from requests_html import HTMLSession, HTML
import pandas as pd

In [4]:
class DocumentParser:
    """Helper class to process a single page/HTML document.

    Holds the parsed document in ``self.html`` and provides alternative
    constructors (from a URL, from a file on disk) plus a way to write the
    markup back to disk. ``to_record`` returns an empty dict here and is meant
    to be overridden by subclasses.
    """

    def __init__(self, html=None, **kwargs):
        # Extra keyword arguments are accepted and ignored, so the alternative
        # constructors can forward options (e.g. ``url``) that only subclasses use.
        self.html = html

    def to_record(self):
        """Return the parsed content as a dict (empty in this base class)."""
        return {}

    @staticmethod
    def get_fallback_html():
        """Return an empty document, used when a page could not be fetched."""
        return HTML(html='<body></body>')

    @classmethod
    def from_url(cls, url, session=None, **kwargs):
        """Fetch ``url`` and build a parser from the response.

        Args:
            url: address of the page to download.
            session: session object used for the request; a new ``HTMLSession``
                is created when none (or a falsy value) is given.
            **kwargs: forwarded to the class constructor.

        Returns:
            An instance of ``cls``. When the response is falsy, the instance
            wraps the empty fallback document instead of the page content.
        """
        session = session or HTMLSession()

        resp = session.get(url)
        if resp:
            if isinstance(session, HTMLSession):
                # HTMLSession responses already carry a parsed document
                html = resp.html
            else:
                html = HTML(url=url, html=resp.text)
        else:
            html = cls.get_fallback_html()

        return cls(html=html, url=url, **kwargs)

    @classmethod
    def from_path(cls, path, **kwargs):
        """Build a parser from the HTML file stored at ``path``."""
        html = HTML(html=Path(path).read_text())
        return cls(html=html, **kwargs)

    def to_path(self, path):
        """Write the document's markup to ``path`` and return it as a ``Path``."""
        path = Path(path)
        # pathlib has no ``save_text``; ``write_text`` is the counterpart of
        # the ``read_text`` call used in ``from_path``.
        path.write_text(self.html.html)
        return path

In [5]:
class ComponentParser:
    """Helper class for parsing a (generalized) single component.

    Parsing is best-effort: ``self.data`` starts out as ``fallback()`` and is
    only replaced when ``parse`` and the optional ``process`` step both succeed.
    If either raises, the fallback data is kept and the exception is stored in
    ``self.error`` (``None`` otherwise) so that failures can be inspected.

    Args:
        element: the element to parse; may be ``None`` when nothing matched.
        html_raw: raw markup to use instead of ``element.html``.
        process: optional callable applied to the result of ``parse``.
        **kwargs: forwarded to ``parse``.
    """
    # callable producing the data used when parsing fails
    fallback = dict

    def __init__(self, element=None, html_raw=None, process=None, **kwargs):

        self.element = element
        # A missing element must not crash the constructor: the whole point of
        # the fallback is to cope with components that are absent or malformed.
        if not html_raw and element is not None:
            html_raw = element.html
        self.html_raw = html_raw

        self.data = self.fallback()
        self.process = process or (lambda d: d)
        self.error = None

        try:
            # assigned only if both steps succeed, otherwise the fallback stays
            self.data = self.process(self.parse(**kwargs))
        except Exception as exc:
            # deliberate best-effort: keep the fallback, but remember why
            self.error = exc

    def parse(self, **kwargs):
        """Extract the component's data; the base class returns the fallback."""
        return self.data

    def to_record(self):
        """Return the parsed data as a plain dict."""
        return dict(self.data)

Let's extend these with more specific subclasses for parsing Water System Details (WSD) pages and the tables that appear on them:

In [6]:
def extract_keyval_table(element, key_pattern=':', is_key=None, get_key=None):
    """Turn the ``td`` cells of ``element`` into a ``{key: value}`` dict.

    Every cell whose text is recognised as a key goes to the list of keys
    (cleaned with ``get_key`` and stripped); every other cell goes to the list
    of values (stripped). The two lists are then paired up positionally.

    Args:
        element: object whose ``find('td')`` returns cells with a ``text`` attribute.
        key_pattern: marker used by the default ``is_key``/``get_key``.
        is_key: optional predicate ``text -> bool``; defaults to "contains ``key_pattern``".
        get_key: optional cleaner ``text -> str``; defaults to removing ``key_pattern``.
    """
    def default_is_key(text):
        return key_pattern in text

    def default_get_key(text):
        return text.replace(key_pattern, '')

    key_test = is_key if is_key else default_is_key
    key_clean = get_key if get_key else default_get_key

    labels, contents = [], []
    for cell in element.find('td'):
        cell_text = cell.text
        if not key_test(cell_text):
            contents.append(cell_text.strip())
            continue
        labels.append(key_clean(cell_text).strip())

    return {label: content for label, content in zip(labels, contents)}

class WSDetailsTable(ComponentParser):
    """Parses the table at the top of the page, which requires special treatment since it is not a well-formed HTML table"""
    fallback = dict

    def parse(self):
        """Read the cells as key/value pairs; key cells are those ending in ' :'."""
        def looks_like_key(text):
            return text.endswith(' :')

        def without_marker(text):
            return text.replace(' :', '')

        return extract_keyval_table(
            self.element,
            is_key=looks_like_key,
            get_key=without_marker,
        )

    def to_record(self):
        """Return a copy of the parsed key/value pairs."""
        record = dict(self.data)
        return record

class WSTable(ComponentParser):
    """Parses a well-formed HTML table into a DataFrame (empty on failure)."""
    fallback = pd.DataFrame

    def parse(self):
        """Return the first table found in the raw markup as a DataFrame."""
        from io import StringIO

        # Passing literal markup to read_html is deprecated in recent pandas
        # releases; a file-like wrapper is accepted by old and new versions.
        return pd.read_html(StringIO(self.html_raw))[0]

    def to_record(self):
        """Return the table as a list of ``{column: value}`` dicts, one per row.

        Missing values are returned as ``None`` rather than NaN.
        """
        # return self.data.to_dict(orient='records')
        # the DataFrame.to_dict(orient='records') mangles column names
        # e.g. "Type  Code" (with two spaces) shows up as "_1"
        # the same happens using df.itertuples()
        # it might be a pandas bug
        def to_dict_records_alt(d):
            # transform nans to None for greater compatibility
            # https://stackoverflow.com/a/39279898/
            # Cast to object first: newer pandas releases keep NaN in numeric
            # columns when ``other=None``, so without the cast the NaNs would
            # survive and end up in the JSON output.
            cleaned = d.astype(object).where(d.notnull(), None)
            return [dict(row) for _, row in cleaned.iterrows()]

        return to_dict_records_alt(self.data)

In [7]:
class PWSDetailsPage(DocumentParser):
    """Parser for a single water system ("PWS") details page.

    Each ``table_*`` property locates one table in the document and wraps it in
    a component parser. When the expected table is not present, the property
    still returns a parser, which then carries its empty fallback data, so that
    ``to_record`` does not fail on incomplete pages.
    """

    # Markup handed to a component parser when the expected table is absent:
    # it contains no table, so the parser ends up with its fallback data.
    _MISSING_TABLE_HTML = '<body></body>'

    def __init__(self, url='', pws_id=None, **kwargs):
        super().__init__(**kwargs)

        self.url = url
        self.pws_id = pws_id
        # only filled in by parse_url(); defined here so the attribute always exists
        self.pws_url_id = None

    def parse_url(self, url):
        """Set ``pws_id`` and ``pws_url_id`` from the query parameters of ``url``."""
        # NOTE(review): get_params_from_url is neither defined nor imported in
        # the visible cells; confirm it is available before calling this method.
        params = get_params_from_url(url)

        self.pws_id = params.get('wsnumber', '')
        self.pws_url_id = params.get('tinwsys_is_number', '')

    def _select(self, selector, index=0):
        """Return the ``index``-th element matching ``selector``, or ``None``."""
        matches = self.html.find(selector)
        return matches[index] if index < len(matches) else None

    def _component(self, parser_cls, selector, index=0, **kwargs):
        """Wrap the selected element in ``parser_cls``, tolerating a missing element."""
        element = self._select(selector, index)
        if element is None:
            return parser_cls(html_raw=self._MISSING_TABLE_HTML, **kwargs)
        return parser_cls(element, **kwargs)

    @property
    def table_details(self):
        """Key/value details table at the top of the page."""
        sel = 'table[summary="Water System  Details"]'
        return self._component(WSDetailsTable, sel)

    @property
    def table_water_sources(self):
        """Table listing the sources of water."""
        sel = 'table[summary="Details about Sources of Water"]'
        # there are two elements with identical attributes (elem.attrs):
        # this ("Sources of water") is the first, "Water Purchases" is the second

        def rename_cols(d):
            # get rid of extra space
            return d.rename(columns={'Type  Code': 'Type Code'})

        return self._component(WSTable, sel, index=0, process=rename_cols)

    @property
    def table_water_purchases(self):
        """Table listing water purchases (second match of the shared selector)."""
        sel = 'table[summary="Details about Sources of Water"]'
        # a page with fewer than two matching tables yields an empty table
        # instead of an IndexError
        return self._component(WSTable, sel, index=1)

    @property
    def table_ws_contacts(self):
        """Placeholder: not implemented, returns ``None``."""
        pass

    @property
    def table_service_areas(self):
        """Table summarising the service areas."""
        sel = 'table[summary="Summary of Service Area"]'
        return self._component(WSTable, sel)

    @property
    def table_service_connections(self):
        """Table summarising the service connections."""
        sel = 'table[summary="Summary of Service Connection"]'
        return self._component(WSTable, sel)

    @property
    def urls_other(self):
        """Sorted list of the page's links, minus the excluded kinds below."""
        def is_interesting(url):
            # exclude:
            return all([
                # internal framework links
                'jsp' not in url,
                # google maps (might be useful for the address)
                'maps.google' not in url,
                # EAR links
                'drinc.ca.gov/ear/' not in url
            ])

        # sorted so that the output does not depend on set iteration order
        return sorted(url for url in self.html.links if is_interesting(url))

    def to_record(self):
        """Return everything parsed from the page as a JSON-friendly dict."""
        d = {'pws_id': self.pws_id}

        d['water_system_details'] = self.table_details.to_record()

        d['water_sources'] = self.table_water_sources.to_record()
        d['water_purchases'] = self.table_water_purchases.to_record()
        d['service_areas'] = self.table_service_areas.to_record()
        d['service_connections'] = self.table_service_connections.to_record()

        d['urls_other'] = self.urls_other

        return d

In [8]:
def get_example_parsers():
    """Build parsers for the two saved pages used as worked examples."""
    example_ids = ('CA0400103', 'CA1310011')
    return [
        PWSDetailsPage.from_path(PATH_HTML_PAGES / f'{pws_id}.html', pws_id=pws_id)
        for pws_id in example_ids
    ]

In [9]:
# Build the example parsers and display the key/value details parsed from the first page.
parsers = get_example_parsers()

parsers[0].table_details.data

{'Water System No.': 'CA0400103',
 'Federal Type': 'NC',
 'Water System Name': 'PG&E: PHILBROOK DAM',
 'State Type': 'NC',
 'Principal County Served': 'BUTTE',
 'Primary Source': 'GW',
 'Status': 'A',
 'Activity Date': '07-28-2016'}

In [10]:
from tqdm import tqdm_notebook

def get_records_from_paths(paths, **kwargs):
    """Parse every page in ``paths`` and return the list of resulting records.

    Pages that cannot be read or parsed are reported on stdout and skipped, so
    a single bad page does not abort the whole batch.

    Args:
        paths: iterable of paths to saved HTML pages; the file stem is used as
            the record's ``pws_id``.
        **kwargs: accepted for compatibility; currently unused.
    """
    records = []
    for path in tqdm_notebook(paths):
        path = Path(path)
        try:
            parser = PWSDetailsPage.from_path(path, pws_id=path.stem)
            # The tables are only extracted when to_record() runs, so it has to
            # sit inside the try for the report-and-skip handling to cover it.
            record = parser.to_record()
        except Exception as e:
            print(f'could not process {path}: {repr(e)}')
        else:
            records.append(record)
    return records

In [11]:
# Parse all collected pages; the displayed length is the number of records produced.
RECORDS = get_records_from_paths(PATHS_TO_SCRAPE)
len(RECORDS)

HBox(children=(IntProgress(value=0, max=8327), HTML(value='')))




8327

In [12]:
import json

def to_file(records, path):
    """Write ``records`` as indented JSON to ``path``, forcing a ``.json`` suffix.

    Args:
        records: JSON-serializable object (here, the list of parsed records).
        path: destination as a ``str`` or ``Path``; its suffix is replaced by
            ``.json``.
    """
    # Path() lets callers pass a plain string as well as a Path object
    target = Path(path).with_suffix('.json')
    with target.open('w', encoding='utf-8') as f:
        json.dump(records, f, indent=4)

In [13]:
from config import PATH_PARSED_RECORDS
PATH_PARSED_RECORDS

PosixPath('data/intermediate-results/parsed-records.json')

In [14]:
# Write the parsed records to PATH_PARSED_RECORDS as JSON.
to_file(RECORDS, PATH_PARSED_RECORDS)