In [2]:
# USA map with facilities having violations
# Assumes you have the SDWIS.zip archive available locally (see sdwis_dir below)!

import os
import numpy as np
import pandas as pd
# Installation: https://altair-viz.github.io/getting_started/installation.html#installation-notebook
import altair as alt
import zipfile
from vega_datasets import data

# Data locations, relative to the notebook's working directory.
sdwis_dir = '../../../data/sdwis'
# Folder the CSV files are read from in later cells; the archive is expected
# to unpack into it (TODO confirm the archive's top-level folder name).
sdwis_unzip_dir = os.path.join(sdwis_dir, 'SDWIS')
epa_dir = '../../../data/epa'
geo_dir = '../../../data/geography'

# Unzip the SDWIS data next to the archive. The context manager guarantees the
# archive handle is closed even when extraction fails part-way through.
with zipfile.ZipFile(os.path.join(sdwis_dir, 'SDWIS.zip'), 'r') as zip_ref:
    zip_ref.extractall(sdwis_dir)

In [None]:
# Load the raw tables. Code-like columns are forced to the builtin `str` dtype
# so values are not parsed as numbers (the zip-code sample rows in this
# notebook, e.g. 00501, carry leading zeros that a numeric parse would drop).
# `np.str` was only an alias of `str` and has been removed from NumPy, so the
# builtin is used directly.
water_sys = pd.read_csv(
    os.path.join(sdwis_unzip_dir, 'WATER_SYSTEM.csv'),
    sep=',',
    dtype={'WATER_SYSTEM.ZIP_CODE': str},
)
viol = pd.read_csv(
    os.path.join(sdwis_unzip_dir, 'VIOLATION.csv'),
    sep=',',
    dtype={'VIOLATION.CONTAMINANT_CODE': str},
)
drinking_water_regs = pd.read_csv(
    os.path.join(epa_dir, 'drinking-water-regulations.csv'),
    sep=',',
    dtype={'SDWIS_CONTAMINANT_CODE': str},
)

# water_sys_fac = pd.read_csv(os.path.join(sdwis_unzip_dir, 'WATER_SYSTEM_FACILITY.csv'), sep=',')
# enfo_act = pd.read_csv(os.path.join(sdwis_unzip_dir, 'ENFORCEMENT_ACTION.csv'), sep=',')
# geog_area = pd.read_csv(os.path.join(sdwis_unzip_dir, 'GEOGRAPHIC_AREA.csv'), sep=',')
# lcr_sample = pd.read_csv(os.path.join(sdwis_unzip_dir, 'LCR_SAMPLE.csv'), sep=',')
# lcr_sample_res = pd.read_csv(os.path.join(sdwis_unzip_dir, 'LCR_SAMPLE_RESULT.csv'), sep=',')
# serv_area = pd.read_csv(os.path.join(sdwis_unzip_dir, 'SERVICE_AREA.csv'), sep=',')
# treat = pd.read_csv(os.path.join(sdwis_unzip_dir, 'TREATMENT.csv'), sep=',')
# viol = pd.read_csv(os.path.join(sdwis_unzip_dir, 'VIOLATION.csv'), sep=',')
# viol_enf_assoc = pd.read_csv(os.path.join(sdwis_unzip_dir, 'VIOLATION_ENF_ASSOC.csv'), sep=',')

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Label data with full year, e.g., 2012 for 01-JUL-12
def get_full_year_for_violation(row):
    """Return the four-digit year of a violation's compliance-period start.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'VIOLATION.COMPL_PER_BEGIN_DATE', normally a string of
        the form '01-JUL-12' (day-month-year).

    Returns
    -------
    str
        The full year (e.g. '2012'), or '' when the date is missing or not in
        the expected three-part form.

    Notes
    -----
    Two-digit years are expanded with the same pivot as POSIX `%y`:
    69-99 map to 1969-1999 and 00-68 map to 2000-2068. A year that is already
    four digits is returned unchanged.
    """
    raw_date = row['VIOLATION.COMPL_PER_BEGIN_DATE']
    # Missing values arrive from pandas as float NaN, which has no .split().
    if not isinstance(raw_date, str):
        return ''

    date_parts = raw_date.strip().split('-')
    if len(date_parts) != 3:
        return ''

    year_part = date_parts[2].strip()
    if not year_part.isdigit():
        return ''
    if len(year_part) == 4:
        return year_part
    if len(year_part) == 2:
        # Comparing as strings is safe here: both sides are two digits.
        century = '19' if year_part >= '69' else '20'
        return century + year_part
    return ''

In [None]:
# Keep only the columns needed downstream. The explicit .copy() makes `viol`
# an independent frame, so adding the year column below does not write into a
# slice of the original table (which triggers SettingWithCopyWarning).
viol = viol[[
    'VIOLATION.PWSID',
    'VIOLATION.FACILITY_ID',
    'VIOLATION.CONTAMINANT_CODE',
    'VIOLATION.COMPL_PER_BEGIN_DATE',
]].copy()

# Label every violation with its full year, e.g. '2012' for '01-JUL-12'.
viol['VIOLATION.YEAR'] = viol.apply(get_full_year_for_violation, axis=1)

In [None]:
# Reduce the water-system table to one row per distinct (PWSID, ZIP code) pair.
water_sys = (
    water_sys
    .loc[:, ['WATER_SYSTEM.PWSID', 'WATER_SYSTEM.ZIP_CODE']]
    .drop_duplicates()
)

In [None]:
# Get latitude, longitude from Vega zipcodes data set
# Fetched from Vega data set: https://raw.githubusercontent.com/vega/vega-datasets/master/data/zipcodes.csv
# source = data.zipcodes.url
# zip_code,latitude,longitude,city,state,county
# 00501,40.922326,-72.637078,Holtsville,NY,Suffolk
# 45430,39.709159,-84.104836,Dayton,OH,Montgomery
# Read zip codes as strings so they keep their leading zeros and match the
# string-typed WATER_SYSTEM.ZIP_CODE column. (`np.str` has been removed from
# NumPy; the builtin `str` is the equivalent.)
zipcodes = pd.read_csv(
    os.path.join(geo_dir, 'zipcodes.csv'),
    sep=',',
    dtype={'zip_code': str},
)

# Attach latitude/longitude to each water system via its zip code, then attach
# that system's violations. Both merges use pandas' default join type.
water_sys = pd.merge(water_sys, zipcodes, left_on='WATER_SYSTEM.ZIP_CODE', right_on='zip_code')
water_sys_viol = pd.merge(water_sys, viol, left_on='WATER_SYSTEM.PWSID', right_on='VIOLATION.PWSID')

# Add the regulation details (including the contaminant TYPE column) for each
# violation's contaminant code.
water_sys_viol_contaminants = pd.merge(
    water_sys_viol,
    drinking_water_regs,
    left_on='VIOLATION.CONTAMINANT_CODE',
    right_on='SDWIS_CONTAMINANT_CODE',
)
water_sys_viol_contaminants.head()

In [None]:
# Filter data to display by reported year and contaminant type; 
# this can be modified as needed to filter by different parameters
def get_source_data_frame(df, year, contaminant_types):
    """Select the distinct zip-code locations that had violations.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'VIOLATION.YEAR', 'TYPE', 'zip_code',
        'latitude' and 'longitude'.
    year : str
        Value compared for equality against 'VIOLATION.YEAR'.
    contaminant_types : list-like of str, str, or None
        Contaminant TYPE values to keep. An empty collection or None disables
        this filter; a single string is treated as a one-element list.

    Returns
    -------
    pandas.DataFrame
        Unique rows of ['zip_code', 'latitude', 'longitude'].
    """
    # pandas' isin() rejects a bare string, so wrap it.
    if isinstance(contaminant_types, str):
        contaminant_types = [contaminant_types]

    selected = df[df['VIOLATION.YEAR'] == year]
    if contaminant_types is not None and len(contaminant_types) > 0:
        selected = selected[selected['TYPE'].isin(contaminant_types)]
    return selected[['zip_code', 'latitude', 'longitude']].drop_duplicates()

In [None]:
# zip_code,latitude,longitude,city,state,county
# 00501,40.922326,-72.637078,Holtsville,NY,Suffolk
# 00544,40.922326,-72.637078,Holtsville,NY,Suffolk
# 02101,42.370567,-71.026964,Boston,MA,Suffolk
# 45430,39.709159,-84.104836,Dayton,OH,Montgomery

# source = pd.DataFrame({'zip_code': ['00501', '02101', '45430'],
#                        'latitude': [40.922326, 42.370567, 39.709159],
#                        'longitude': [-72.637078, -71.026964, -84.104836]})

# Select Altair's 'notebook' renderer so charts are displayed in the notebook.
# NOTE(review): this renderer name is tied to the classic Jupyter Notebook
# front end and older Altair releases — confirm it exists in the installed
# Altair version before relying on it.
alt.renderers.enable('notebook')

# Switch to the 'json' data transformer to avoid Altair's MaxRowsError on
# large data frames.
# NOTE(review): presumably this passes chart data through a separate JSON file
# instead of embedding it in the chart spec — confirm against the Altair docs.
alt.data_transformers.enable('json')

def draw_map(source, title):
    """Build a US map chart with one small circle per row of `source`.

    `source` is the chart data and must provide `zip_code`, `latitude` and
    `longitude` fields. Circles are coloured by the first character of the zip
    code, the tooltip shows the zip code, and `title` is used as the chart
    title. Returns the Altair chart object.
    """
    # Derived field: first character of the zip code, used for colouring.
    first_digit = alt.expr.substring(alt.datum.zip_code, 0, 1)

    chart = alt.Chart(source).transform_calculate("leading digit", first_digit)
    chart = chart.mark_circle(size=3)
    chart = chart.encode(
        longitude='longitude:Q',
        latitude='latitude:Q',
        color='leading digit:N',
        tooltip='zip_code:N',
    )
    chart = chart.project(type='albersUsa')
    return chart.properties(width=325, height=200, title=title)


In [None]:
# List the distinct contaminant TYPE values found in the regulations table;
# these are the values the `contaminant_types` filter is matched against.
drinking_water_regs['TYPE'].drop_duplicates()

In [None]:
# Map the zip-code locations with violations for the chosen year and
# contaminant type(s).

year = '2008'
contaminant_types = ['Organic Chemicals']
source = get_source_data_frame(
    df=water_sys_viol_contaminants,
    year=year,
    contaminant_types=contaminant_types,
)
draw_map(source=source, title=year)

In [None]:
# Same map as the previous cell, for a later reporting year.
year = '2018'
contaminant_types = ['Organic Chemicals']
source = get_source_data_frame(
    df=water_sys_viol_contaminants,
    year=year,
    contaminant_types=contaminant_types,
)
draw_map(source=source, title=year)