## Water violations in Vermont

To run this notebook, place the SDWIS CSV data files (`VIOLATION.csv` and `WATER_SYSTEM.csv`) in the folder `../../../data/sdwis/SDWIS`.

In [1]:
import os
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

STATE_CODE = 'VT'

In [2]:
# Label data with full year, e.g., 2012 for 01-JUL-12, 1990 for 01-JUN-90
def get_full_year(ddmmmyy, pivot=19):
    """Expand the year of a 'DD-MMM-YY' date string to a full year.

    Two-digit years less than or equal to ``pivot`` are placed in the 2000s and
    all others in the 1900s, e.g. '01-JUL-12' -> 2012 and '01-JUN-90' -> 1990.

    Parameters
    ----------
    ddmmmyy : str
        Date text made of three '-'-separated parts; the last part is the year.
    pivot : int, optional
        Largest two-digit year mapped to 20YY. The default of 19 keeps the
        original cutoff, so '20' and later are read as 1920 onward; pass a
        larger value for data that contains dates after 2019.

    Returns
    -------
    int
        The full year, or 0 when the value is not a string (for example a
        float NaN from an empty CSV cell), does not split into three parts, or
        has a year part that is not a number.
    """
    # Empty CSV cells are read as float NaN, which has no .split().
    if not isinstance(ddmmmyy, str):
        return 0

    date_parts = ddmmmyy.strip().split('-')
    if len(date_parts) != 3:
        return 0

    try:
        year = int(date_parts[2])
    except ValueError:
        return 0

    # A year that already carries its century needs no expansion.
    if year >= 100:
        return year

    century = 2000 if year <= pivot else 1900
    return century + year

In [3]:
def get_full_year_for_row(row):
    """Return the full year for a row's 'VIOLATION.COMPL_PER_BEGIN_DATE' value.

    The value is looked up by key on ``row`` and expanded with
    ``get_full_year``, so the function can be passed to
    ``DataFrame.apply(..., axis=1)``.
    """
    begin_date = row['VIOLATION.COMPL_PER_BEGIN_DATE']
    return get_full_year(begin_date)

In [4]:
# calculate years ago from input 'current_year'
def get_years_ago(row, current_year):
    """Label a row with how many years before ``current_year`` it falls.

    The row's year is taken from its 'VIOLATION.COMPL_PER_BEGIN_DATE' value via
    ``get_full_year``. Returns a string of the form '<N>_yrs_ago', where N is
    ``current_year`` minus that year.
    """
    begin_year = get_full_year(row['VIOLATION.COMPL_PER_BEGIN_DATE'])
    years_elapsed = current_year - begin_year
    return '{}_yrs_ago'.format(years_elapsed)

In [5]:
def print_water_system_violations(water_system_df, viol_df):
    """Print summary counts for the violations of the given water systems.

    Violations are matched to water systems with an inner merge of
    'VIOLATION.PWSID' against 'WATER_SYSTEM.PWSID', so only violations whose
    system appears in ``water_system_df`` are counted.

    Printed lines: number of water systems, number of matched violations,
    number with category code 'MR' (labelled reporting violations), and number
    with the health-based indicator set to 'Y'.
    """
    matched = viol_df.merge(
        water_system_df,
        left_on='VIOLATION.PWSID',
        right_on='WATER_SYSTEM.PWSID',
    )
    is_reporting = matched['VIOLATION.VIOLATION_CATEGORY_CODE'] == 'MR'
    is_health_based = matched['VIOLATION.IS_HEALTH_BASED_IND'] == 'Y'

    summary = [
        ('# water systems: ', len(water_system_df)),
        ('# violations: ', len(matched)),
        ('# reporting violations: ', len(matched[is_reporting])),
        ('# health violations: ', len(matched[is_health_based])),
    ]
    for label, count in summary:
        print(label + str(count))
        

In [6]:
# Read input files.
# Assumes the SDWIS csv files are in the folder ../../../data/sdwis/SDWIS
data_dir = '../../../data'
sdwis_dir = os.path.join(data_dir, 'sdwis', 'SDWIS')

# Contaminant codes are read as text. The builtin `str` is used because the
# `np.str` alias was deprecated in NumPy 1.20 and removed in 1.24, where
# referencing it raises AttributeError.
viol = pd.read_csv(
    os.path.join(sdwis_dir, 'VIOLATION.csv'),
    sep=',',
    dtype={'VIOLATION.CONTAMINANT_CODE': str},
    low_memory=False,
)
ws = pd.read_csv(os.path.join(sdwis_dir, 'WATER_SYSTEM.csv'), low_memory=False)

In [7]:
# Add the full year taken from each row's 'VIOLATION.COMPL_PER_BEGIN_DATE'.
# Mapping the single date column gives the same values as a row-wise
# DataFrame.apply(axis=1) without building a Series object for every row.
viol.loc[:, 'VIOLATION.YEAR'] = viol['VIOLATION.COMPL_PER_BEGIN_DATE'].map(get_full_year)

In [8]:
# Keep only the violations whose 'VIOLATION.YEAR' is 2017 and preview them
viol_2017 = viol.loc[viol['VIOLATION.YEAR'].eq(2017)]
viol_2017.head()

Unnamed: 0,VIOLATION.PWSID,VIOLATION.VIOLATION_ID,VIOLATION.FACILITY_ID,VIOLATION.POPULATION_SERVED_COUNT,VIOLATION.NPM_CANDIDATE,VIOLATION.PWS_ACTIVITY_CODE,VIOLATION.PWS_DEACTIVATION_DATE,VIOLATION.PRIMARY_SOURCE_CODE,VIOLATION.POP_CAT_5_CODE,VIOLATION.PRIMACY_AGENCY_CODE,...,VIOLATION.RTC_DATE,VIOLATION.PUBLIC_NOTIFICATION_TIER,VIOLATION.ORIGINATOR_CODE,VIOLATION.SAMPLE_RESULT_ID,VIOLATION.CORRECTIVE_ACTION_ID,VIOLATION.RULE_CODE,VIOLATION.RULE_GROUP_CODE,VIOLATION.RULE_FAMILY_CODE,Unnamed: 34,VIOLATION.YEAR
54,NH2539010,1700027,,1000,Y,A,,GW,2,NH,...,,3,S,,,111,100,110,,2017
55,NH2539010,1700026,,1000,Y,A,,GW,2,NH,...,,2,S,,,111,100,110,,2017
205,NY2621115,911,,120,N,A,,GW,1,NY,...,20-OCT-17,3,S,,,500,500,500,,2017
207,NY5507077,6617,,45,N,A,,GW,1,NY,...,11-MAY-17,3,S,,,500,500,500,,2017
208,NY5507077,6618,,45,N,A,,GW,1,NY,...,,3,S,,,111,100,110,,2017


In [9]:
# Water systems in Vermont: primacy agency equals STATE_CODE and
# activity code is 'A' (presumably "active" - confirm against the SDWIS docs)
ws = ws.loc[
    ws['WATER_SYSTEM.PRIMACY_AGENCY_CODE'].eq(STATE_CODE)
    & ws['WATER_SYSTEM.PWS_ACTIVITY_CODE'].eq('A')
]

In [10]:
# Distinct system type codes among the selected water systems
pd.unique(ws['WATER_SYSTEM.PWS_TYPE_CODE'])

array(['TNCWS', 'NTNCWS', 'CWS'], dtype=object)

In [11]:
# Number of water systems for each system type code
ws.groupby('WATER_SYSTEM.PWS_TYPE_CODE').size()

WATER_SYSTEM.PWS_TYPE_CODE
CWS       417
NTNCWS    250
TNCWS     749
dtype: int64

In [12]:
# Category code reference:
# https://ofmpub.epa.gov/enviro/EF_METADATA_HTML.sdwis_page?p_column_name=VIOLATION_CATEGORY_CODE
# TODO: is 'RPT' also a reporting violation? Only 'MR' is counted as one
# by print_water_system_violations.
pd.unique(viol_2017['VIOLATION.VIOLATION_CATEGORY_CODE'])

array(['RPT', 'TT', 'Other', 'MON', 'MR', 'MCL', 'MRDL'], dtype=object)

In [13]:
# 2017 violation summary across every selected water system
print('All water systems')
print_water_system_violations(water_system_df=ws, viol_df=viol_2017)

All water systems
# water systems: 1416
# violations: 1539
# reporting violations: 1125
# health violations: 44


In [14]:
# 2017 violation summary for systems with a served population of at most 3300
print('Water systems serving 3300 or fewer')
ws_lt_3300 = ws.loc[ws['WATER_SYSTEM.POPULATION_SERVED_COUNT'].le(3300)]
print_water_system_violations(water_system_df=ws_lt_3300, viol_df=viol_2017)

Water systems serving 3300 or fewer
# water systems: 1384
# violations: 1525
# reporting violations: 1113
# health violations: 44


In [15]:
# 2017 violation summary for systems with a served population of at most 500
print('Water systems serving 500 or fewer')
ws_lt_500 = ws.loc[ws['WATER_SYSTEM.POPULATION_SERVED_COUNT'].le(500)]
print_water_system_violations(water_system_df=ws_lt_500, viol_df=viol_2017)

Water systems serving 500 or fewer
# water systems: 1252
# violations: 1316
# reporting violations: 922
# health violations: 43


In [16]:
# Community water systems (type code 'CWS') and how many there are
ws_cws = ws.loc[ws['WATER_SYSTEM.PWS_TYPE_CODE'].eq('CWS')]
len(ws_cws)

417

In [17]:
# 2017 violation summary restricted to community water systems
print('community water systems')
print_water_system_violations(water_system_df=ws_cws, viol_df=viol_2017)

community water systems
# water systems: 417
# violations: 705
# reporting violations: 617
# health violations: 18


In [18]:
# 2017 violation summary for community systems with a served population of at most 500
print('community water systems serving 500 or fewer')
ws_cws_lt_500 = ws_cws.loc[ws_cws['WATER_SYSTEM.POPULATION_SERVED_COUNT'].le(500)]
print_water_system_violations(water_system_df=ws_cws_lt_500, viol_df=viol_2017)

community water systems serving 500 or fewer
# water systems: 302
# violations: 570
# reporting violations: 494
# health violations: 18


In [19]:
# 2017 violation summary for community systems with a served population of at most 3300
print('community water systems serving 3300 or fewer')
ws_cws_lt_3300 = ws_cws.loc[ws_cws['WATER_SYSTEM.POPULATION_SERVED_COUNT'].le(3300)]
print_water_system_violations(water_system_df=ws_cws_lt_3300, viol_df=viol_2017)

community water systems serving 3300 or fewer
# water systems: 385
# violations: 691
# reporting violations: 605
# health violations: 18
