In [123]:
# Master switch for the CSV exports of this notebook: when True, every "save"
# cell overwrites the copy previously written under ../data/.
I_WANT_TO_RESAVE_THE_DATA = True

# Data Preparation

The aim of this notebook is to extract the data, clean it, process it, merge the different data sets and output the data that will be finally used for the modelling.   

We first extract data from EPA's Safe Drinking Water Information System ([SDWIS](https://www.epa.gov/enviro/sdwis-model)): the [water systems](https://enviro.epa.gov/enviro/ef_metadata_html.ef_metadata_table?p_table_name=WATER_SYSTEM&p_topic=SDWIS) table, for the systems' characteristics (notably the ZIP code where they are situated), and the Maximum Contaminant Level (MCL) [violations](https://enviro.epa.gov/enviro/ef_metadata_html.ef_metadata_table?p_table_name=VIOLATION&p_topic=SDWIS) table (notably which contaminants were involved and when).  

Then, 


In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import time

import requests # to read the data from the REST API of EPA Envirofacts
import csv # needed as data accessed through the REST API are .csv (other possibilities are .xml or .xls)


## Extracting Water Systems Data from SDWIS

In [90]:
# Download the WATER_SYSTEM table (active systems, EPA region 01) as CSV from
# the Envirofacts REST API and load it, untouched, into a DataFrame.
# more on the API: https://www.epa.gov/enviro/envirofacts-data-service-api

# notes on the API:
#     - WATER_SYSTEM = table name
#     - PWS_ACTIVITY_CODE/A ==> select only active water systems
#     - EPA_REGION/01 ==> New England

CSV_URL = 'https://enviro.epa.gov/enviro/efservice/WATER_SYSTEM/EPA_REGION/01/PWS_ACTIVITY_CODE/A/CSV'

with requests.Session() as s:
    download = s.get(CSV_URL)
    # Fail loudly on an HTTP error instead of parsing an error page as if it were CSV.
    download.raise_for_status()
    decoded_content = download.content.decode('utf-8')

# csv.reader keeps every field as a string; row 0 of the resulting frame holds
# the column names (it is promoted to a header in the cleaning step).
cr = csv.reader(decoded_content.splitlines(), delimiter=',')
initial_WS = list(cr)
WATER_SYSTEM_raw = pd.DataFrame(initial_WS)
print(WATER_SYSTEM_raw.shape)
WATER_SYSTEM_raw.head()

(10483, 48)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38,39,40,41,42,43,44,45,46,47
0,WATER_SYSTEM.PWSID,WATER_SYSTEM.PWS_NAME,WATER_SYSTEM.NPM_CANDIDATE,WATER_SYSTEM.PRIMACY_AGENCY_CODE,WATER_SYSTEM.EPA_REGION,WATER_SYSTEM.SEASON_BEGIN_DATE,WATER_SYSTEM.SEASON_END_DATE,WATER_SYSTEM.PWS_ACTIVITY_CODE,WATER_SYSTEM.PWS_DEACTIVATION_DATE,WATER_SYSTEM.PWS_TYPE_CODE,...,WATER_SYSTEM.ZIP_CODE,WATER_SYSTEM.COUNTRY_CODE,WATER_SYSTEM.STATE_CODE,WATER_SYSTEM.SOURCE_WATER_PROTECTION_CODE,WATER_SYSTEM.SOURCE_PROTECTION_BEGIN_DATE,WATER_SYSTEM.OUTSTANDING_PERFORMER,WATER_SYSTEM.OUTSTANDING_PERFORM_BEGIN_DATE,WATER_SYSTEM.CITIES_SERVED,WATER_SYSTEM.COUNTIES_SERVED,
1,ME0004628,MACHIAS TRAILER PARK,Y,ME,01,,,A,,CWS,...,04654,US,ME,N,,,,MACHIAS,Washington,
2,ME0092288,MARSH BROOK ESTATES,Y,ME,01,,,A,,CWS,...,01746,US,MA,N,,,,SANFORD,York,
3,ME0009198,TRAILS END STEAK HOUSE & TAVERN,Y,ME,01,01-01,12-31,A,,TNCWS,...,04936,US,ME,N,,,,EUSTIS,Franklin,
4,ME0094505,R & R VACATION HOME PARK,Y,ME,01,01-01,12-31,A,,TNCWS,...,04055,US,ME,N,,,,NAPLES,Cumberland,


In [91]:
# Tidy the raw download: promote row 0 to the header and discard the empty
# trailing column.

water_system = WATER_SYSTEM_raw.copy()

# Row 0 carries names such as "WATER_SYSTEM.PWSID": keep only the part after the
# dot (the table prefix is redundant) and lower-case it.
new_header = water_system.iloc[0].str.split('.').str[1].str.lower()

# Everything after row 0 is data; label its columns with the cleaned names.
water_system = water_system.iloc[1:]
water_system.columns = new_header

# The extraction leaves a last column that is entirely null; remove it.
water_system = water_system.dropna(axis='columns', how='all')

water_system.tail() # looks good for now.

Unnamed: 0,pwsid,pws_name,npm_candidate,primacy_agency_code,epa_region,season_begin_date,season_end_date,pws_activity_code,pws_deactivation_date,pws_type_code,...,city_name,zip_code,country_code,state_code,source_water_protection_code,source_protection_begin_date,outstanding_performer,outstanding_perform_begin_date,cities_served,counties_served
10478,NH1108030,WINDY RIDGE ORCHARD,Y,NH,1,06-01,10-31,A,,TNCWS,...,HAVERHILL,3774,US,NH,,,,,HAVERHILL,Grafton
10479,NH1109020,MOUNTAIN VALLEY TREATMENT CTR,Y,NH,1,01-01,12-31,A,,TNCWS,...,ORFORD,3777,US,NH,,,,,HAVERHILL,Grafton
10480,NH1112010,STONEGATE ACRES,Y,NH,1,,,A,,CWS,...,CONCORD,3302,US,NH,,,,,HEBRON,Grafton
10481,NH1113010,HILLSIDE INN CONDOS,Y,NH,1,,,A,,CWS,...,HEBRON,3241,US,NH,,,,,HEBRON,Grafton
10482,NH1117010,CAMP BEREA/DINING HALL,Y,NH,1,01-01,12-31,A,,TNCWS,...,HEBRON,3241,US,NH,,,,,HEBRON,Grafton


In [92]:
# Keep a local CSV copy of the cleaned table in case the API stops working in
# the future; written only when re-saving is enabled.
if I_WANT_TO_RESAVE_THE_DATA:
    water_system.to_csv(path_or_buf='../data/active_water_systems_NewEngland.csv')

## Extracting Violations Data from SDWIS

In [93]:
# Download the VIOLATION table (EPA region 01) as CSV from the Envirofacts REST
# API and load it, untouched, into a DataFrame.
# more on the API: https://www.epa.gov/enviro/envirofacts-data-service-api

# notes on the API:
#     - VIOLATION = table name
#     - EPA_REGION/01 ==> New England
# note that we do not filter here for active water systems. It will be done when merging with water systems.

CSV_URL = 'https://enviro.epa.gov/enviro/efservice/VIOLATION/EPA_REGION/01/CSV'

with requests.Session() as s:
    download = s.get(CSV_URL)
    # Fail loudly on an HTTP error instead of parsing an error page as if it were CSV.
    download.raise_for_status()
    decoded_content = download.content.decode('utf-8')

# csv.reader keeps every field as a string; row 0 of the resulting frame holds
# the column names (it is promoted to a header in the cleaning step).
cr = csv.reader(decoded_content.splitlines(), delimiter=',')
initial_V = list(cr)
VIOLATIONS_raw = pd.DataFrame(initial_V)
# NOTE(review): the shape recorded for this cell is (100002, 35), i.e. a header
# plus 100001 rows. That looks like a row cap on the service side -- confirm
# the table is not being silently truncated.
print(VIOLATIONS_raw.shape)
VIOLATIONS_raw.head()

(100002, 35)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,VIOLATION.PWSID,VIOLATION.VIOLATION_ID,VIOLATION.FACILITY_ID,VIOLATION.POPULATION_SERVED_COUNT,VIOLATION.NPM_CANDIDATE,VIOLATION.PWS_ACTIVITY_CODE,VIOLATION.PWS_DEACTIVATION_DATE,VIOLATION.PRIMARY_SOURCE_CODE,VIOLATION.POP_CAT_5_CODE,VIOLATION.PRIMACY_AGENCY_CODE,...,VIOLATION.RTC_ENFORCEMENT_ID,VIOLATION.RTC_DATE,VIOLATION.PUBLIC_NOTIFICATION_TIER,VIOLATION.ORIGINATOR_CODE,VIOLATION.SAMPLE_RESULT_ID,VIOLATION.CORRECTIVE_ACTION_ID,VIOLATION.RULE_CODE,VIOLATION.RULE_GROUP_CODE,VIOLATION.RULE_FAMILY_CODE,
1,ME0094672,157508,,388,N,A,,GW,1,ME,...,,,2,S,,,110,100,110,
2,ME0009683,60007,,454,N,A,,GW,1,ME,...,1510,08-SEP-14,2,S,,,110,100,110,
3,ME0000625,6318,,100,N,A,,GW,1,ME,...,638928,04-NOV-13,3,S,,,410,400,410,
4,ME0000625,6316,,100,N,A,,GW,1,ME,...,638930,13-AUG-14,3,S,,,210,200,210,


In [94]:
# Tidy the raw download: promote row 0 to the header, discard the empty
# trailing column and parse the date columns.

violations = VIOLATIONS_raw.copy()

# Row 0 carries names such as "VIOLATION.PWSID": keep only the part after the
# dot (the table prefix is redundant) and lower-case it.
new_header = violations.iloc[0].str.split('.').str[1].str.lower()

# Everything after row 0 is data; label its columns with the cleaned names.
violations = violations.iloc[1:]
violations.columns = new_header

# The extraction leaves a last column that is entirely null; remove it.
violations = violations.dropna(axis='columns', how='all')

# The three date columns arrive as text; convert them to datetime.
violations = violations.assign(
    rtc_date=pd.to_datetime(violations['rtc_date']),
    compl_per_begin_date=pd.to_datetime(violations['compl_per_begin_date']),
    compl_per_end_date=pd.to_datetime(violations['compl_per_end_date']),
)

violations.tail() # looks good for now.

Unnamed: 0,pwsid,violation_id,facility_id,population_served_count,npm_candidate,pws_activity_code,pws_deactivation_date,primary_source_code,pop_cat_5_code,primacy_agency_code,...,latest_enforcement_id,rtc_enforcement_id,rtc_date,public_notification_tier,originator_code,sample_result_id,corrective_action_id,rule_code,rule_group_code,rule_family_code
99997,CT0970184,611,,38,N,A,,GW,1,CT,...,322,322,2013-12-16,3,S,,,410,400,410
99998,CT0580124,171808,,35,N,A,,GW,1,CT,...,3608,3608,2008-05-22,3,S,,,110,100,110
99999,CT0580124,171106,,35,N,A,,GW,1,CT,...,2206,2206,2005-11-14,3,S,,,410,400,410
100000,CT0580124,172909,,35,N,A,,GW,1,CT,...,4826,4819,2009-06-30,3,S,,,110,100,110
100001,ME0004962,38516,,150,N,A,,GW,1,ME,...,437450,437450,2015-05-27,3,S,,,410,400,410


In [95]:
# Keep a local CSV copy of the cleaned table in case the API stops working in
# the future; written only when re-saving is enabled.
if I_WANT_TO_RESAVE_THE_DATA:
    violations.to_csv(path_or_buf='../data/violations_NewEngland.csv')

We have now downloaded all the necessary data from SDWIS. In the future, data from other SDWIS tables could also be downloaded to feed the model.  

### Contaminant Codes

To get the names of the contaminants.

In [96]:
# Load the analyte-code table from the local data folder; it provides the
# contaminant names.
contaminant_codes_raw = pd.read_csv(
    filepath_or_buffer='../data/SDWISCodesforLABs/Analyte Codes-Table.csv'
)
print(contaminant_codes_raw.shape)
contaminant_codes_raw.head()

(768, 9)


Unnamed: 0,CODE,NAME,SCIENTIFIC_NAME,TYPE_CODE,CAS_REGISTRY_NUM,CFR_SECTION_REF_CD,REPORTABLE_STR_DAT,REPORTABLE_END_DAT,STATE_CLASS_CODE
0,100,TURBIDITY,,WQ,,141.13,,,WQP
1,200,SWTR,,RL,,,,,
2,300,IESWTR,,RL,,,,,
3,400,DBP STAGE 1,,RL,,,,,
4,500,FILTER BACKWASH RULE,,RL,,,,,


In [97]:
# contaminant_codes_raw.CODE.unique()

In [98]:
# Reduce the analyte table to the three columns of interest and give them
# explicit, lower-case "contaminant_*" names.
contaminant_codes = (
    contaminant_codes_raw
    .copy()
    .loc[:, ['CODE', 'NAME', 'TYPE_CODE']]
    .rename(
        index=str,  # row labels are converted to strings as well
        columns={
            "CODE": "contaminant_code",
            "NAME": "contaminant_name",
            "TYPE_CODE": "contaminant_type_code",
        },
    )
)

# The contaminant "categories" (codes containing ***) are kept on purpose:
# restricting the table to all-digit codes was judged not necessary.

contaminant_codes.head()

Unnamed: 0,contaminant_code,contaminant_name,contaminant_type_code
0,100,TURBIDITY,WQ
1,200,SWTR,RL
2,300,IESWTR,RL
3,400,DBP STAGE 1,RL
4,500,FILTER BACKWASH RULE,RL


In [99]:
# Save the cleaned contaminant-code table as csv, just in case:
if I_WANT_TO_RESAVE_THE_DATA:
    # Write the code table itself -- not the `violations` frame -- to this file.
    contaminant_codes.to_csv('../data/contaminant_codes.csv')

## Estimated Annual Agricultural Pesticide Use, from NAWQA

The **estimated** annual agricultural pesticide use is collected by the National Water-Quality Assessment (NAWQA) Project, from the US Department of the Interior ([see here](https://water.usgs.gov/nawqa/pnsp/usage/maps/index.php)). These data list the estimated pesticide use by pesticide, county and year.  

Data collected with a farm survey:  
"For all States except California, proprietary farm survey pesticide-use data are aggregated and reported at the multi-county Crop Reporting District (CRD) level. Harvested-crop acreage data by county from the U.S. Department of Agriculture Census of Agriculture are used to calculate the median pesticide-by- crop use rates for each crop in each CRD. These rates are applied to the harvested acreage of each crop in a county to obtain pesticide-use estimates at a county level."  

Caution:  
"These pesticide-use estimates are suitable for evaluating national and regional patterns and trends of annual pesticide use. The reliability of estimates, however, generally decreases with scale and these estimates and maps are not intended for detailed evaluations, such as comparing within or between specific individual counties."

In [113]:
# Get the county-level pesticide-use estimates for 2009-2014 and stack them
# into a single frame.

# The yearly files are directly accessible via URL. 2009-2012 are served from
# the "PesticideUseEstimates" folder; 2013 and 2014 are preliminary estimates
# served from "PreliminaryEstimates".
years = ['2010', '2011', '2012', '2013', '2014']

# Read one frame per year and concatenate once at the end: growing a frame
# inside the loop re-copies all previous rows at every step, and
# DataFrame.append no longer exists in pandas >= 2.0.
yearly_frames = []
for year in ['2009'] + years:
    folder = 'PreliminaryEstimates' if year in ['2013', '2014'] else 'PesticideUseEstimates'
    URL = ('https://water.usgs.gov/nawqa/pnsp/usage/maps/county-level/' + folder +
           '/EPest.county.estimates.' + year + '.txt')
    pesticide_use_thisyear = pd.read_csv(URL, sep='\t')
    yearly_frames.append(pesticide_use_thisyear)

# ignore_index=True gives the stacked frame a fresh 0..n-1 index.
pesticide_use_2009_14 = pd.concat(yearly_frames, ignore_index=True)

print(pesticide_use_2009_14.shape)
pesticide_use_2009_14.head()


(2251527, 6)


Unnamed: 0,COMPOUND,YEAR,STATE_FIPS_CODE,COUNTY_FIPS_CODE,EPEST_LOW_KG,EPEST_HIGH_KG
0,"2,4-D",2009,1,1,597.5,986.4
1,"2,4-D",2009,1,3,3351.0,4858.5
2,"2,4-D",2009,1,5,8158.1,8219.1
3,"2,4-D",2009,1,7,53.8,98.8
4,"2,4-D",2009,1,9,12369.3,12646.9


In [125]:
# Keep a local CSV copy, just in case. The 2009-14 extract stays smaller than
# 100MB; 2009-17 is just too large.
if I_WANT_TO_RESAVE_THE_DATA:
    pesticide_use_2009_14.to_csv(path_or_buf='../data/pesticide_use/pesticide_use_2009_14.csv')