# Exploring the UCMR dataset for PFOA contaminants

The EPA uses the Unregulated Contaminant Monitoring Rule (UCMR) to collect data on contaminants that are suspected to be present in drinking water but do not have health-based standards set under the Safe Drinking Water Act (SDWA).

Perfluorooctanoic acid (PFOA; conjugate base perfluorooctanoate) is one such contaminant. PFOA is linked to the production of Teflon.

In [25]:
import os
import pandas as pd
from IPython.display import display, HTML
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
# Location of the tab-separated UCMR3 occurrence file.
# Set the UCMR_DATA_PATH environment variable to point at your own copy (a path
# relative to the notebook works too); when it is unset we fall back to the
# original author's local path so existing setups keep working.
DATA_PATH = os.environ.get(
    "UCMR_DATA_PATH",
    "C:\\Users\\Ewan\\Documents\\CodeForBoston\\SafeWater\\data\\ucmr\\UCMR3\\occurence-data\\UCMR3_All.txt",
)

# Allow us to display all the columns (and all the rows) in a dataframe
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [10]:
# Load the tab-separated UCMR3 occurrence file into a DataFrame.
# Malformed lines are skipped with a warning. `on_bad_lines='warn'` is the
# replacement for `error_bad_lines=False`, which was deprecated in pandas 1.3
# and removed in pandas 2.0 (where it raises a TypeError).
ucmr_df = pd.read_csv(DATA_PATH, sep='\t', on_bad_lines='warn')

In [11]:
# Preview the first five rows to check the columns that were parsed.
ucmr_df.head(n=5)

Unnamed: 0,PWSID,PWSName,Size,FacilityID,FacilityName,FacilityWaterType,SamplePointID,SamplePointName,SamplePointType,AssociatedFacilityID,AssociatedSamplePointID,CollectionDate,SampleID,Contaminant,MRL,MethodID,AnalyticalResultsSign,AnalyticalResultValue,SampleEventCode,MonitoringRequirement,Region,State
0,MI0004370,City of Midland,L,6197,Water Treatment Plant,SW,TP001,Water Treatment Plant Tap,EP,4674.0,MR1,3/11/2014,201403130319AM,strontium,0.3,EPA 200.8,=,98.0,SE3,AM,5,MI
1,MI0004370,City of Midland,L,6197,Water Treatment Plant,SW,TP001,Water Treatment Plant Tap,EP,4674.0,MR1,3/11/2014,201403130319AM,PFOS,0.04,EPA 537,<,,SE3,AM,5,MI
2,MI0004370,City of Midland,L,6197,Water Treatment Plant,SW,TP001,Water Treatment Plant Tap,EP,4674.0,MR1,3/11/2014,201403130319AM,bromomethane,0.2,EPA 524.3,<,,SE3,AM,5,MI
3,MI0004370,City of Midland,L,6197,Water Treatment Plant,SW,TP001,Water Treatment Plant Tap,EP,4674.0,MR1,3/11/2014,201403130319AM,chloromethane,0.2,EPA 524.3,<,,SE3,AM,5,MI
4,MI0004370,City of Midland,L,6197,Water Treatment Plant,SW,TP001,Water Treatment Plant Tap,EP,4674.0,MR1,3/11/2014,201403130319AM,vanadium,0.2,EPA 200.8,<,,SE3,AM,5,MI


In [38]:
# List the distinct values in the contaminant and result-sign columns.
for column_name in ("Contaminant", "AnalyticalResultsSign"):
    display(ucmr_df[column_name].unique())

array(['strontium', 'PFOS', 'bromomethane', 'chloromethane', 'vanadium',
       '1,2,3-trichloropropane', 'chromium', '1,4-dioxane', 'PFOA',
       'cobalt', 'PFNA', 'PFHpA', 'PFHxS', 'chlorate', 'PFBS', 'HCFC-22',
       '1,3-butadiene', 'Halon 1011', 'molybdenum', 'chromium-6',
       '1,1-dichloroethane', 'estriol', 'equilin',
       '17-alpha-ethynylestradiol', '17-beta-estradiol',
       '4-androstene-3,17-dione', 'testosterone', 'estrone', 'manganese',
       'germanium', 'tellurium', 'sec-butylbenzene', 'n-propylbenzene',
       'Enterococci', 'Male specific phage', 'Somatic phage',
       'Enteroviruses (cell culture)', 'Enteroviruses (RT-qPCR)',
       'Noroviruses GIA', 'Noroviruses GIB', 'Noroviruses GII',
       'Aerobic spores', 'Total coliforms', 'E. coli'], dtype=object)

array(['=', '<'], dtype=object)

In [26]:
# Per-column non-null counts for the PFOA rows, then for the PFOS rows.
for contaminant_name in ('PFOA', 'PFOS'):
    contaminant_rows = ucmr_df[ucmr_df["Contaminant"] == contaminant_name]
    display(contaminant_rows.count())

PWSID                      36972
PWSName                    36972
Size                       36972
FacilityID                 36972
FacilityName               36972
FacilityWaterType          36972
SamplePointID              36972
SamplePointName            36972
SamplePointType            36972
AssociatedFacilityID       36972
AssociatedSamplePointID    36972
CollectionDate             36972
SampleID                   36972
Contaminant                36972
MRL                        36972
MethodID                   36972
AnalyticalResultsSign      36972
AnalyticalResultValue        379
SampleEventCode            36972
MonitoringRequirement      36972
Region                     36972
State                      36972
dtype: int64

PWSID                      36972
PWSName                    36972
Size                       36972
FacilityID                 36972
FacilityName               36972
FacilityWaterType          36972
SamplePointID              36972
SamplePointName            36972
SamplePointType            36972
AssociatedFacilityID       36972
AssociatedSamplePointID    36972
CollectionDate             36972
SampleID                   36972
Contaminant                36972
MRL                        36972
MethodID                   36972
AnalyticalResultsSign      36972
AnalyticalResultValue        292
SampleEventCode            36972
MonitoringRequirement      36972
Region                     36972
State                      36972
dtype: int64

In [34]:
# Count, per state, the PFOA rows whose MRL is above 0.01.
# Both comparisons are built separately before being combined: `&` binds
# tighter than `>`, so writing `mask & ucmr_df.MRL > 0.01` is evaluated as
# `(mask & ucmr_df.MRL) > 0.01` and never applies the MRL threshold.
is_pfoa = ucmr_df.Contaminant == 'PFOA'
mrl_above_threshold = ucmr_df.MRL > 0.01
display(ucmr_df[is_pfoa & mrl_above_threshold].groupby('State').count())

# Keep the rows for the six PF* contaminants reported by the New England states.
pf_contaminants = ['PFOA', 'PFOS', 'PFNA', 'PFHxS', 'PFHpA', 'PFBS']
new_england_states = ['MA', 'VT', 'ME', 'NH', 'RI', 'CT']
new_england_pf_df = ucmr_df[
    ucmr_df.Contaminant.isin(pf_contaminants)
    & ucmr_df.State.isin(new_england_states)
]
len(new_england_pf_df)

Unnamed: 0_level_0,PWSID,PWSName,Size,FacilityID,FacilityName,FacilityWaterType,SamplePointID,SamplePointName,SamplePointType,AssociatedFacilityID,AssociatedSamplePointID,CollectionDate,SampleID,Contaminant,MRL,MethodID,AnalyticalResultsSign,AnalyticalResultValue,SampleEventCode,MonitoringRequirement,Region
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,0,8,8,8
05,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,0,12,12,12
06,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,0,6,6,6
08,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,2,2,2
09,59,59,59,59,59,59,59,59,59,59,59,59,59,59,59,59,59,0,59,59,59
10,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,0,8,8,8
AK,69,69,69,69,69,69,69,69,69,69,69,69,69,69,69,69,69,0,69,69,69
AL,1056,1056,1056,1056,1056,1056,1056,1056,1056,1056,1056,1056,1056,1056,1056,1056,1056,32,1056,1056,1056
AR,270,270,270,270,270,270,270,270,270,270,270,270,270,270,270,270,270,0,270,270,270
AS,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,0,68,68,68


11550

In [35]:
# Write the New England subset next to the source data file, so the output
# location follows DATA_PATH instead of a second hard-coded absolute path.
new_england_pf_path = os.path.join(os.path.dirname(DATA_PATH), "new_england_pf.csv")
new_england_pf_df.to_csv(new_england_pf_path)

In [36]:
# Inspect the first 100 rows of the New England subset.
new_england_pf_df.head(n=100)

Unnamed: 0,PWSID,PWSName,Size,FacilityID,FacilityName,FacilityWaterType,SamplePointID,SamplePointName,SamplePointType,AssociatedFacilityID,AssociatedSamplePointID,CollectionDate,SampleID,Contaminant,MRL,MethodID,AnalyticalResultsSign,AnalyticalResultValue,SampleEventCode,MonitoringRequirement,Region,State
8569,ME0090830,Lewiston Water & Sewer Division,L,13797,Treatment Plant 2,SW,TP2,EPTDS Main Street,EP,8458.0,MR1,8/11/2014,809683-12076,PFBS,0.09,EPA 537,<,,SE4,AM,1,ME
8571,ME0090830,Lewiston Water & Sewer Division,L,13797,Treatment Plant 2,SW,TP2,EPTDS Main Street,EP,8458.0,MR1,8/11/2014,809683-12076,PFOS,0.04,EPA 537,<,,SE4,AM,1,ME
8572,ME0090830,Lewiston Water & Sewer Division,L,13797,Treatment Plant 2,SW,TP2,EPTDS Main Street,EP,8458.0,MR1,8/11/2014,809683-12076,PFHpA,0.01,EPA 537,<,,SE4,AM,1,ME
8573,ME0090830,Lewiston Water & Sewer Division,L,13797,Treatment Plant 2,SW,TP2,EPTDS Main Street,EP,8458.0,MR1,8/11/2014,809683-12076,PFHxS,0.03,EPA 537,<,,SE4,AM,1,ME
8574,ME0090830,Lewiston Water & Sewer Division,L,13797,Treatment Plant 2,SW,TP2,EPTDS Main Street,EP,8458.0,MR1,8/11/2014,809683-12076,PFNA,0.02,EPA 537,<,,SE4,AM,1,ME
8575,ME0090830,Lewiston Water & Sewer Division,L,13797,Treatment Plant 2,SW,TP2,EPTDS Main Street,EP,8458.0,MR1,8/11/2014,809683-12076,PFOA,0.02,EPA 537,<,,SE4,AM,1,ME
8583,ME0090830,Lewiston Water & Sewer Division,L,13797,Treatment Plant 2,SW,TP2,EPTDS Main Street,EP,8458.0,MR1,11/18/2013,759188-10308,PFHxS,0.03,EPA 537,<,,SE1,AM,1,ME
8593,ME0090830,Lewiston Water & Sewer Division,L,13797,Treatment Plant 2,SW,TP2,EPTDS Main Street,EP,8458.0,MR1,11/18/2013,759188-10308,PFOA,0.02,EPA 537,<,,SE1,AM,1,ME
8603,ME0090830,Lewiston Water & Sewer Division,L,13797,Treatment Plant 2,SW,TP2,EPTDS Main Street,EP,8458.0,MR1,2/11/2014,770451-10786,PFOA,0.02,EPA 537,<,,SE2,AM,1,ME
8604,ME0090830,Lewiston Water & Sewer Division,L,13797,Treatment Plant 2,SW,TP2,EPTDS Main Street,EP,8458.0,MR1,2/11/2014,770451-10786,PFNA,0.02,EPA 537,<,,SE2,AM,1,ME
