In [1]:
import pandas as pd

import json
from pathlib import Path

In [2]:
# Path of the parsed-records JSON file, defined in the project's `config` module;
# echoed so the notebook records which file is read.
from config import PATH_PARSED_RECORDS
PATH_PARSED_RECORDS

PosixPath('data/intermediate-results/parsed-records.json')

# Creating Data Tables

The last step is to convert the parsed records into flat data tables.

Some utility functions to regularize field/column names and enhance data quality:

In [3]:
import re

def to_snakecase(s):
    """Convert a free-form label into a snake_case identifier.

    Surrounding whitespace is stripped, every run of whitespace becomes a
    single underscore, all remaining non-word characters are removed, and the
    result is lower-cased.

    Parameters
    ----------
    s : str
        The label to convert, e.g. a raw column name.

    Returns
    -------
    str
        The normalized label, e.g. ``'Water System No.'`` -> ``'water_system_no'``.
    """
    # Raw strings: '\s' and '\W' in plain literals are invalid escape sequences
    # (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
    # convert one or more spaces to underscore
    s = re.sub(r'\s+', '_', s.strip())
    # strip all other non-alphanumeric chars (underscores are word chars and survive)
    return re.sub(r'\W', '', s).lower()

def normalize_column_names(df):
    """Return a copy of `df` with every column label passed through `to_snakecase`."""
    renamed = df.rename(to_snakecase, axis='columns')
    return renamed

def drop_columns_robust(df, cols):
    """Drop the columns named in `cols`, ignoring names that `df` does not have.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable of hashable
        Column labels to remove. Labels missing from `df` are skipped instead
        of raising ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the matching columns.
    """
    requested = set(cols)
    # `Index & set` used to mean set intersection, but that overload was
    # deprecated in pandas 1.x and no longer performs an intersection in
    # pandas 2, so the membership test is spelled out explicitly.
    to_drop = [col for col in df.columns if col in requested]

    return df.drop(columns=to_drop)

def drop_columns_with_nulls(df, max_null_fraction=0):
    """Keep only the columns whose fraction of null values is within a limit.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    max_null_fraction : float, default 0
        Largest tolerated share of null values per column (inclusive).
        With the default, any column containing a null is dropped.

    Returns
    -------
    pandas.DataFrame
        `df` restricted to the columns that pass the check.
    """
    # Divide the null count directly instead of computing `1 - count / len`:
    # the subtraction adds a rounding error (1 - 0.7 > 0.3 in floating point)
    # that wrongly dropped columns sitting exactly on the threshold.
    # `max(..., 1)` treats an empty frame as having no nulls rather than
    # producing 0/0 = NaN, which would fail every comparison and drop all columns.
    null_fraction = df.isna().sum() / max(len(df), 1)
    cols_to_keep = null_fraction[null_fraction <= max_null_fraction].index
    return df[cols_to_keep]

def to_category(df, cols):
    """Return `df` with each column listed in `cols` cast to the ``category`` dtype."""
    dtype_by_column = dict.fromkeys(cols, 'category')
    return df.astype(dtype_by_column)

We can also create a container class to interact with the dataset as parsed records:

In [4]:
class WSDDataset:
    """Container for parsed water-system records, with helpers to flatten them.

    Each record is a dict with a ``pws_id`` key plus one key per field:
    ``water_system_details`` holds a dict, while ``service_connections``,
    ``service_areas``, ``water_sources`` and ``water_purchases`` each hold a
    list of dicts. The ``get_table_*`` methods turn one field into a
    ``pandas.DataFrame``.
    """

    @classmethod
    def from_records(cls, path=None):
        """Build a dataset from a JSON file containing a list of records.

        Parameters
        ----------
        path : str or pathlib.Path, optional
            Location of the JSON file. When omitted (or falsy) an empty
            dataset is returned.
        """
        # `records` used to be left unbound when no path was given, which
        # raised UnboundLocalError on the `cls(records)` call.
        records = json.loads(Path(path).read_text()) if path else None
        return cls(records)

    def __init__(self, records=None):
        # A falsy `records` (None or an empty list) yields a fresh empty list.
        self.records = records or []

    def __repr__(self):
        return f'{type(self).__name__}(len={len(self)})'

    def __iter__(self):
        return iter(self.records)

    def __len__(self):
        return len(self.records)

    def _get_table_from_record_dict(self, field_name):
        """Return one row per record, built from the dict stored under `field_name`.

        The record's ``pws_id`` is added as the first column.
        """
        return pd.DataFrame([dict(pws_id=r['pws_id'], **r[field_name]) for r in self])

    def _get_table_from_record_list(self, field_name, as_index=None):
        """Return one row per entry of the list stored under `field_name`.

        Every row carries the owning record's ``pws_id`` and an ``entry_id``,
        the position of the entry within that record's list. Records whose
        list is empty contribute no rows.

        Parameters
        ----------
        field_name : str
            Key of the list-valued field to flatten.
        as_index : str or list of str, optional
            Column(s) to set as the index of the result.
        """
        # TODO add option to set the `entry_id` name to a custom value
        rows = [
            dict(pws_id=r['pws_id'], entry_id=entry_id, **field_data)
            for r in self
            for entry_id, field_data in enumerate(r[field_name])
        ]
        df = pd.DataFrame(rows)

        if as_index:
            df = df.set_index(as_index)
        return df

    def get_table_details(self):
        """Return the water-system details table, indexed by ``pws_id``."""
        return (self._get_table_from_record_dict('water_system_details')
                .set_index('pws_id')
                # there are some spurious columns, defined only for a minority of the PWSs
                # we retain only columns containing data for a relevant fraction of the dataset
                .pipe(drop_columns_with_nulls, max_null_fraction=0.1)
                .pipe(normalize_column_names)
                .pipe(to_category, ['federal_type', 'primary_source', 'state_type', 'status'])
                .assign(activity_date=lambda d: pd.to_datetime(d['activity_date']))
               )

    def get_table_service_connections(self):
        """Return the service-connections table, indexed by ``(pws_id, entry_id)``."""
        return (self._get_table_from_record_list('service_connections', as_index=['pws_id', 'entry_id'])
                .pipe(normalize_column_names)
                .pipe(to_category, ['meter_type', 'type'])
               )

    def get_table_service_areas(self):
        """Return the service-areas table, indexed by ``(pws_id, entry_id)``."""
        return (self._get_table_from_record_list('service_areas', as_index=['pws_id', 'entry_id'])
                .pipe(normalize_column_names)
                .pipe(to_category, ['code', 'name'])
               )

    def get_table_water_sources(self):
        """Return the water-sources table, indexed by ``(pws_id, entry_id)``."""
        return (self._get_table_from_record_list('water_sources', as_index=['pws_id', 'entry_id'])
                .pipe(normalize_column_names)
                .pipe(to_category, ['status', 'type_code'])
               )

    def get_table_water_purchases(self):
        """Return the water-purchases table, indexed by ``(pws_id, entry_id)``."""
        return (self._get_table_from_record_list('water_purchases', as_index=['pws_id', 'entry_id'])
                .pipe(normalize_column_names)
                .pipe(to_category, ['buyer_facility_type', 'seller_facility_type'])
               )

    def to_csv(self, path):
        """Write every table to ``<path>/<table-name>.csv``.

        Parameters
        ----------
        path : str or pathlib.Path
            Output directory; it is created (with parents) if missing.
        """
        # Accept plain strings as well as Path objects, and make sure the
        # target directory exists before the first file is written.
        out_dir = Path(path)
        out_dir.mkdir(parents=True, exist_ok=True)

        names_functions = [
            ('details', self.get_table_details),
            ('service-connections', self.get_table_service_connections),
            ('service-areas', self.get_table_service_areas),
            ('water-sources', self.get_table_water_sources),
            ('water-purchases', self.get_table_water_purchases),
        ]

        for name, get_table in names_functions:
            path_file = (out_dir / name).with_suffix('.csv')
            get_table().to_csv(path_file)

    def to_json(self, path):
        """Dump the raw records to ``<path>/records.json`` (indented JSON).

        Parameters
        ----------
        path : str or pathlib.Path
            Output directory; it is created (with parents) if missing.
        """
        out_dir = Path(path)
        out_dir.mkdir(parents=True, exist_ok=True)

        with (out_dir / 'records.json').open('w') as f:
            json.dump(self.records, f, indent=4)

In [5]:
# Load every parsed record from the JSON file into the container class;
# the bare `d` on the last line shows its repr (class name and record count).
d = WSDDataset.from_records(path=PATH_PARSED_RECORDS)
d

WSDDataset(len=8327)

In [37]:
def get_category_fraction(s):
    """Tabulate how often each distinct value of the Series `s` occurs.

    Returns a DataFrame indexed by value (in `value_counts` order) with three
    columns: `counts` (occurrences), `fraction` (share of the total of the
    counts) and `cum_fraction` (running sum of `fraction`).
    """
    table = s.value_counts().to_frame('counts')
    table['fraction'] = table['counts'] / table['counts'].sum()
    table['cum_fraction'] = table['fraction'].cumsum()
    return table

def get_info_categories(df):
    """Print and display the value distribution of every categorical column.

    For each column of `df` with ``category`` dtype, a header line with the
    column name and its categories is printed, followed by the table produced
    by `get_category_fraction`, rendered with the notebook's `display`.

    Returns None, so it ends a `.pipe(...)` chain rather than passing the
    frame along.
    """
    for col in df.select_dtypes(include='category'):
        s = df[col]
        # The closing parenthesis was missing from the printed header.
        print(f'{col} ({s.cat.categories})')
        display(s.pipe(get_category_fraction))

In [42]:
# Details table: one row per record, indexed by `pws_id`.
(d.get_table_details()
#  .pipe(get_info_categories)  # uncomment to print the per-category breakdown instead of showing the table
)

Unnamed: 0_level_0,activity_date,federal_type,primary_source,principal_county_served,state_type,status,water_system_name,water_system_no
pws_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CA3010071,1976-01-01,C,SWP,ORANGE,C,A,JOINT REGIONAL WATER SUPPLY SYSTEM,CA3010071
CA4800564,1979-03-22,C,GW,SOLANO,C,A,HIDDEN ACRES TRAILER VILLA,CA4800564
CA1503243,2016-09-20,NTNC,GW,KERN,NTNC,A,HONDA PROVING CENTER,CA1503243
CA3901464,2016-08-08,NC,GW,SAN JOAQUIN,NC,A,DESHMESH DARBAR LODI/STK WATER SYS,CA3901464
CA1900868,2013-06-08,C,GW,LOS ANGELES,C,A,RIVERS END TRAILER PARK,CA1900868
CA2490001,1976-01-01,NP,,MERCED,RW,A,CITY OF GUSTINE,CA2490001
CA1000459,1993-02-09,NC,GW,FRESNO,NC,A,ROAD RUNNER FOOD AND FUEL,CA1000459
CA3901368,2016-08-08,NTNC,GW,SAN JOAQUIN,NTNC,A,OM SCOTT & SONS/HYPONEX CORP,CA3901368
CA2900532,2016-08-08,NTNC,GW,NEVADA,NTNC,A,PENN VALLEY SHOPPING CENTER,CA2900532
CA4200862,2016-07-26,C,GW,SANTA BARBARA,C,A,MESA HILLS MUTUAL,CA4200862


In [32]:
# Water sources: one row per (pws_id, entry_id), where `entry_id` is the
# position of the source within its record's `water_sources` list.
(d.get_table_water_sources()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,name,status,type_code
pws_id,entry_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA3010071,0,IRWD INERCONNECTION TO JTM,A,CC
CA3010071,1,MWDSC EOCF NO. 2 TRI-CITIES JTM TREATED,A,CC
CA3010071,2,MWDSC SCP SC-5B CONNECTION - TREATED,A,CC
CA3010071,3,SCWD-GRF,A,CC
CA4800564,0,MAIN WELL,A,WL
CA4800564,1,WELL 01,A,WL
CA4800564,2,WELL 02,A,WL
CA1503243,0,WELL 24,A,WL
CA3901464,0,WELL #1,A,WL
CA1900868,0,WELL03,A,WL


In [31]:
# Service connections, largest `count` first.
(d.get_table_service_connections()
 .sort_values('count', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,meter_size_measure,meter_type,type
pws_id,entry_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CA1910067,3,609694,0,ME,RS
CA0110005,0,387846,0,ME,CB
CA3710020,3,225109,0,ME,RS
CA4310011,2,204733,0,ME,RS
CA3810011,0,170509,0,ME,CB
CA3410020,0,157312,0,UN,CB
CA3310009,0,146850,0,ME,CB
CA1010007,0,132981,0,ME,CB
CA3310001,0,104188,0,ME,CB
CA3010092,4,98901,0,ME,RS


In [33]:
# Service areas: one row per (pws_id, entry_id) with the area's `code` and `name`.
d.get_table_service_areas()

Unnamed: 0_level_0,Unnamed: 1_level_0,code,name
pws_id,entry_id,Unnamed: 2_level_1,Unnamed: 3_level_1
CA3010071,0,O,WHOLESALER (SELLS WATER)
CA4800564,0,R,MOBILE HOME PARK
CA1503243,0,NT,INDUSTRIAL/AGRICULTURAL
CA3901464,0,O,OTHER AREA
CA1900868,0,R,MOBILE HOME PARK
CA2490001,0,O,RECYCLED WATER
CA1000459,0,O,OTHER AREA
CA3901368,0,NT,INDUSTRIAL/AGRICULTURAL
CA2900532,0,T,OTHER TRANSIENT AREA
CA4200862,0,R,RESIDENTIAL AREA


In [11]:
# Rank sellers by the number of purchase entries that reference them.
# NOTE(review): 'count' tallies rows, not distinct buyers, so a buyer with several
# entries for the same seller is counted once per entry. Confirm whether
# `nunique` was intended for `buyer_count`.
# NOTE(review): `water_system_name` is presumably the seller's name (the first
# value per seller is kept); verify against the parsed records.
(d.get_table_water_purchases()
 .reset_index()
 .groupby('seller_water_system_no')
 .agg({'water_system_name': 'first', 'pws_id': 'count'})
 .rename(columns={'pws_id': 'buyer_count'})
 .sort_values('buyer_count', ascending=False)
)

Unnamed: 0_level_0,water_system_name,buyer_count
seller_water_system_no,Unnamed: 1_level_1,Unnamed: 2_level_1
CA1910087,METROPOLITAN WATER DIST. OF SO. CAL.,345
CA3710042,SAN DIEGO COUNTY WATER AUTHORITY,77
CA3810001,SAN FRANCISCO REGIONAL WATER SYSTEM,62
CA4310027,SANTA CLARA VALLEY WATER DISTRICT,31
CA1910045,ANTELOPE VALLEY-EAST KERN WATER AGENCY,29
CA3710020,"SAN DIEGO, CITY OF",27
CA1910199,CALIFORNIA DOMESTIC WATER COMPANY,25
CA3310009,EASTERN MUNICIPAL WD,22
CA5610050,CALLEGUAS MUNICIPAL WATER DIST,21
CA4310011,SAN JOSE WATER,20


In [12]:
# Output directory for the exported tables, defined in the project's `config` module.
from config import PATH_RESULTS

PATH_RESULTS

PosixPath('data/results')

In [13]:
# Write each of the five tables to `<PATH_RESULTS>/<table-name>.csv`.
d.to_csv(PATH_RESULTS)