## Water violations in Vermont

To run this notebook, load SDWIS csv data files into the folder ../../../data/sdwis/SDWIS

In [87]:
import os
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

In [88]:
# Label data with full year, e.g., 2012 for 01-JUL-12, 1990 for 01-JUN-90
def get_full_year(ddmmmyy):
    full_year = 0
    date_parts = ddmmmyy.split('-')
    if len(date_parts) == 3:
        if (int(date_parts[2]) <= 19):
            full_year = int('20' + date_parts[2])
        else:
            full_year = int('19' + date_parts[2])
    return full_year

In [89]:
def get_full_year_for_row(row):
    return get_full_year(row['VIOLATION.COMPL_PER_BEGIN_DATE'])

In [90]:
# calculate years ago from input 'current_year'
def get_years_ago(row, current_year):
    row_year = get_full_year(row['VIOLATION.COMPL_PER_BEGIN_DATE'])
    return str(current_year - row_year) + '_yrs_ago'

In [91]:
def print_water_system_violations(water_system_df, viol_df):
    viol_df = viol_df.merge(water_system_df, left_on='VIOLATION.PWSID', right_on='WATER_SYSTEM.PWSID')
    print('# water systems: ' + str(water_system_df.shape[0]))
    print('# violations: ' + str(viol_df.shape[0]))
    print('# reporting violations: ' \
          + str(viol_df[viol_df['VIOLATION.VIOLATION_CATEGORY_CODE'] == 'MR'].shape[0]))
    print('# health violations: ' \
          + str(viol_df[viol_df['VIOLATION.IS_HEALTH_BASED_IND'] == 'Y'].shape[0]))
        

In [92]:
# read input files
data_dir = '../../../data'
# print(os.listdir(data_dir))

# assumes csv files are in folder ../../../data/sdwis/SDWIS
viol = pd.read_csv(os.path.join(data_dir + '/sdwis/SDWIS', 'VIOLATION.csv'), sep=',', \
                  dtype={'VIOLATION.CONTAMINANT_CODE': np.str}, low_memory=False)
water_system = pd.read_csv(os.path.join(data_dir + '/sdwis/SDWIS', 'WATER_SYSTEM.csv'), low_memory=False)

In [93]:
viol.loc[:, 'VIOLATION.YEAR'] = viol.apply(get_full_year_for_row, axis=1)

In [94]:
# violations in 2017
viol_2017 = viol[viol['VIOLATION.YEAR'] == 2017]
viol_2017.head()

Unnamed: 0,VIOLATION.PWSID,VIOLATION.VIOLATION_ID,VIOLATION.FACILITY_ID,VIOLATION.POPULATION_SERVED_COUNT,VIOLATION.NPM_CANDIDATE,VIOLATION.PWS_ACTIVITY_CODE,VIOLATION.PWS_DEACTIVATION_DATE,VIOLATION.PRIMARY_SOURCE_CODE,VIOLATION.POP_CAT_5_CODE,VIOLATION.PRIMACY_AGENCY_CODE,...,VIOLATION.RTC_DATE,VIOLATION.PUBLIC_NOTIFICATION_TIER,VIOLATION.ORIGINATOR_CODE,VIOLATION.SAMPLE_RESULT_ID,VIOLATION.CORRECTIVE_ACTION_ID,VIOLATION.RULE_CODE,VIOLATION.RULE_GROUP_CODE,VIOLATION.RULE_FAMILY_CODE,Unnamed: 34,VIOLATION.YEAR
54,NH2539010,1700027,,1000,Y,A,,GW,2,NH,...,,3,S,,,111,100,110,,2017
55,NH2539010,1700026,,1000,Y,A,,GW,2,NH,...,,2,S,,,111,100,110,,2017
205,NY2621115,911,,120,N,A,,GW,1,NY,...,20-OCT-17,3,S,,,500,500,500,,2017
207,NY5507077,6617,,45,N,A,,GW,1,NY,...,11-MAY-17,3,S,,,500,500,500,,2017
208,NY5507077,6618,,45,N,A,,GW,1,NY,...,,3,S,,,111,100,110,,2017


In [95]:
# Water systems in Vermont
water_system = water_system[(water_system['WATER_SYSTEM.STATE_CODE'] == 'VT') \
                            & (water_system['WATER_SYSTEM.PWS_ACTIVITY_CODE'] == 'A')]

In [96]:
water_system.head()

Unnamed: 0,WATER_SYSTEM.PWSID,WATER_SYSTEM.PWS_NAME,WATER_SYSTEM.NPM_CANDIDATE,WATER_SYSTEM.PRIMACY_AGENCY_CODE,WATER_SYSTEM.EPA_REGION,WATER_SYSTEM.SEASON_BEGIN_DATE,WATER_SYSTEM.SEASON_END_DATE,WATER_SYSTEM.PWS_ACTIVITY_CODE,WATER_SYSTEM.PWS_DEACTIVATION_DATE,WATER_SYSTEM.PWS_TYPE_CODE,...,WATER_SYSTEM.ZIP_CODE,WATER_SYSTEM.COUNTRY_CODE,WATER_SYSTEM.STATE_CODE,WATER_SYSTEM.SOURCE_WATER_PROTECTION_CODE,WATER_SYSTEM.SOURCE_PROTECTION_BEGIN_DATE,WATER_SYSTEM.OUTSTANDING_PERFORMER,WATER_SYSTEM.OUTSTANDING_PERFORM_BEGIN_DATE,WATER_SYSTEM.CITIES_SERVED,WATER_SYSTEM.COUNTIES_SERVED,Unnamed: 47
399,VT0021449,WARREN MUNICIPAL BUILDINGS,Y,VT,1,01-01,12-31,A,,TNCWS,...,5674,US,VT,,,,,WARREN,Washington,
400,VT0020397,CATAMOUNT CHARLOTTE,Y,VT,1,01-01,12-31,A,,NTNCWS,...,5401,US,VT,N,,,,CHARLOTTE,Chittenden,
1392,NY0412272,COUNTRY CLUB HOMES,Y,NY,2,,,A,,CWS,...,5855,US,VT,Y,23-JUN-09,,,MACHIAS (T),Cattaraugus,
2088,VT0005051,BOLTON COMMUNITY WATER SYSTEM,Y,VT,1,,,A,,CWS,...,5477,US,VT,N,,,,BOLTON,Chittenden,
2360,VT0021296,MAXFIELD SPORTS FIELDS,Y,VT,1,04-01,11-30,A,,TNCWS,...,5001,US,VT,N,,N,,HARTFORD,Windsor,


In [97]:
water_system['WATER_SYSTEM.PWS_TYPE_CODE'].unique()

array(['TNCWS', 'NTNCWS', 'CWS'], dtype=object)

In [98]:
# Water systems by type
water_system.groupby(['WATER_SYSTEM.PWS_TYPE_CODE']).size()

WATER_SYSTEM.PWS_TYPE_CODE
CWS       399
NTNCWS    248
TNCWS     721
dtype: int64

In [99]:
# https://ofmpub.epa.gov/enviro/EF_METADATA_HTML.sdwis_page?p_column_name=VIOLATION_CATEGORY_CODE
# is 'RPT' also a reporting violation?
viol_2017['VIOLATION.VIOLATION_CATEGORY_CODE'].unique()

array(['RPT', 'TT', 'Other', 'MON', 'MR', 'MCL', 'MRDL'], dtype=object)

In [100]:
print('All water systems')
print_water_system_violations(water_system, viol_2017)

All water systems
# water systems: 1368
# violations: 1455
# reporting violations: 1090
# health violations: 36


In [101]:
print('Water systems serving 3300 or fewer')
water_system_lt_3300 = water_system[water_system['WATER_SYSTEM.POPULATION_SERVED_COUNT'] <= 3300]
print_water_system_violations(water_system_lt_3300, viol_2017)

Water systems serving 3300 or fewer
# water systems: 1336
# violations: 1441
# reporting violations: 1078
# health violations: 36


In [102]:
print('Water systems serving 500 or fewer')
water_system_lt_500 = water_system[water_system['WATER_SYSTEM.POPULATION_SERVED_COUNT'] <= 500]
print_water_system_violations(water_system_lt_500, viol_2017)

Water systems serving 500 or fewer
# water systems: 1204
# violations: 1238
# reporting violations: 890
# health violations: 35


In [103]:
# community water systems
water_system_cws = water_system[water_system['WATER_SYSTEM.PWS_TYPE_CODE'] == 'CWS']
water_system_cws.shape[0]

399

In [104]:
print('community water systems')
print_water_system_violations(water_system_cws, viol_2017)

community water systems
# water systems: 399
# violations: 690
# reporting violations: 613
# health violations: 12


In [105]:
print('community water systems serving 500 or fewer')
water_system_cws_lt_500 = water_system_cws[water_system_cws['WATER_SYSTEM.POPULATION_SERVED_COUNT'] <= 500]
print_water_system_violations(water_system_cws_lt_500, viol_2017)

community water systems serving 500 or fewer
# water systems: 284
# violations: 561
# reporting violations: 493
# health violations: 12


In [106]:
print('community water systems serving 3300 or fewer')
water_system_cws_lt_3300 = water_system_cws[water_system_cws['WATER_SYSTEM.POPULATION_SERVED_COUNT'] <= 3300]
print_water_system_violations(water_system_cws_lt_3300, viol_2017)

community water systems serving 3300 or fewer
# water systems: 367
# violations: 676
# reporting violations: 601
# health violations: 12
