## This notebook compares per- and polyfluoroalkyl substances (PFAS) concentrations in drinking water on or near military installations, as reported in 2018 vs. 2021-2023.

### The data was scraped from a publicly available Department of Defense (DOD) website. 

In [1]:
import pandas as pd
import os
from fuzzywuzzy import process # fuzz
import numpy as np


# Working folder for this notebook. The default is the original author's local
# OneDrive path; set the PFAS_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
data_dir = os.environ.get(
    'PFAS_DATA_DIR',
    r'C:\Users\OITNYNWilsoS\OneDrive - Department of Veterans Affairs\python\PACT Act - MET\PDF data/',
)
os.chdir(data_dir)
cwd = os.getcwd()
print("Current working directory is:", cwd)

Current working directory is: C:\Users\OITNYNWilsoS\OneDrive - Department of Veterans Affairs\python\PACT Act - MET\PDF data


In [2]:
#pull in 2018 data that was scraped from pdf. See repository called "DAPM-PFAS-PDF-scrape" on Department of
#Veterans Affair GitHub. https://github.com/department-of-veterans-affairs/DAPM-PFAS-PDF-scrape
%store -r
df_2018
df_2018 = df_2018.sort_values('name', ascending=True)
df_2018 = df_2018.reset_index(drop=True)
df_2018.head()

Unnamed: 0,branch,name,state,date,number systems tested,number above EPA LHA,Results (PFOS/ PFOA),"Actions Taken as of August 31, 2017",Actions Planned to reduce,purveyor,on or off base,results,analyte
0,Army,81ST RSC: E. EARLE RIVES AFRC,North Carolina,2017,1,1,<40-90,PFOS per 2014 Consumer Confidence Report. The ...,,non-DOD,on,90.0,
1,Army,99TH RSC MARTINSBURG MEMORIAL USARC,West Virginia,2017,1,1,0-79/0-71,The results reported were the range of PFOS/PF...,,non-DOD,on,79.0,
2,Navy - BRAC,AGANA,Guam,2017,12,5,88 - 410,Sampling conducted by Guam Waterworks Authorit...,,non-DOD,off,410.0,
3,Air Force - ANG,BARNES ANGB (104TH),Massachusetts,2017,1,1,43-160 ppt (PFOS+PFOA),Base obtains water from Westfield Water Dept. ...,Water sources will continue to be sampled for ...,non-DOD,on,160.0,PFOA+PFOS
4,Army,BELMONT ARMORY,Michigan,2017,1,1,86-96.8,MIARNG spoke with potential responsible party ...,"MIARNG is waiting for response from PRP, explo...",DOD,on,96.8,


In [3]:
#pull in 2021-2023 data scraped from website. See repository called "DAPM-PFAS-web-scrape-and-map" on Department of
#Veterans Affair GitHub. https://github.com/department-of-veterans-affairs/DAPM-PFAS-web-scrape-and-map
%store -r 
map_70 
map_70.head()

Unnamed: 0,state,branch,name,date,method,analyte,result,MCLs,exceedance,LimitDetect,units,qualifier,TreatSystem,TreatStatus,latitude,longitude
0,Arizona,Air Force,AIR FORCE PLANT 44,2022-10-11,QSM_B15,PFOS,29.2,70,False,4.0,ng/L,,Yes,pre-treatment,32.221743,-110.926479
1,Tennessee,Air Force,ARNOLD AFB,2021-12-14,QSM_B15,PFOA,40.9,70,False,4.2,ng/L,,No,,35.39881,-86.078066
2,Florida,Air Force,AVON PARK AIR FORCE RESERVE,2021-12-20,QSM_B15,PFOS,40.3,70,False,4.0,ng/L,,No,,27.59567,-81.506186
3,South Carolina,Navy,BEAUFORT SC MCAS,2022-12-09,537.1,PFOA,1.49,70,False,1.19,ng/L,J,No,,32.475964,-80.726366
4,Washington,Navy,BREMERTON WA NAVBASE,2022-05-03,537.1,PFOA,0.478,70,False,1.12,ng/L,J,No,,47.711998,-122.714478


In [4]:
#look up base names in the 2021-2023 dataframe ahead of the name matching done later
#JB Lewis McChord is in both dataframes. However, one entry is for Yakima Training Center (2022) and the other is for
#"Fort Lewis Cantonment". These might be far from each other, so they should remain unmatched.

#show full cell contents instead of truncating long strings
pd.options.display.max_colwidth = None
mask = map_70['name'].str.contains('GRAYLING')
result = map_70.loc[mask]
result

Unnamed: 0,state,branch,name,date,method,analyte,result,MCLs,exceedance,LimitDetect,units,qualifier,TreatSystem,TreatStatus,latitude,longitude
39,Michigan,Army,MTC-H CAMP GRAYLING AIRFIELD (INSTALLATION-WIDE PAI),2022-05-10,QSM_B15,PFOS,1.42,70,False,1.97,ng/L,J,No,,44.630811,-84.777406


In [5]:
#look up base names in the 2018 dataframe ahead of the name matching done later
#show full cell contents instead of truncating long strings
pd.options.display.max_colwidth = None
mask_2018 = df_2018['name'].str.contains('GRAYLING')
result_2018 = df_2018.loc[mask_2018]
result_2018

Unnamed: 0,branch,name,state,date,number systems tested,number above EPA LHA,Results (PFOS/ PFOA),"Actions Taken as of August 31, 2017",Actions Planned to reduce,purveyor,on or off base,results,analyte
20,Army,CAMP GRAYLING JOINT MANEUVER TRAINING CENTER,Michigan,2017,188,4,70-188,"Residential drinking water wells were sampled downgradient of the Camp Grayling Airfield where groundwater samples indicated PFOS/PFOA results > LHA. The Michigan DHHS paid for aquasana drinking water filter systems under counter faucet 3 stage max flow and local Health Dept. had them installed in any home with PFOS/PFOA detections. 42 systems have been installed. An additional 14, purchased by local Health were installed by home owners.",The Michigan DEQ has been funded to conduct off post residential sampling and hydrologic study. The DHHS and local Health Dept. will continue to purchase filter systems and install them as needed or requested by home owners. Michigan DEQ is continuing to expand the area of drinking water sampling to the east side of the airfield. Numbers of wells are TBD.,non-DOD,off,188.0,


In [6]:
#strip parentheses from the name column to allow for better matching
df_2018['name'] = df_2018['name'].map(lambda raw: raw.replace('(', '').replace(')', ''))
#rename installation names to allow for better matching
#(substring replacements, applied one after another in the order listed)
#NOTE(review): 'PRIVATEE' below looks like a typo - confirm against the 2018 names before changing it
renames_2018 = {
    'CAMP GRAYLING JOINT MANEUVER TRAINING CENTER': 'CAMP GRAYLING MTC',
    'NAS WHITING FIELD MAIN BASE': 'WHITING FLD FL NAS',
    'HORSHAM AGS AGS 111TH': 'BIDDLE ANGB',
    'NAS WHIDBEY ISLAND - AULT FIELD PRIVATE': 'WHIDBEY IS WA NAS AULT FIELD',
    'NAS WHIDBEY ISLAND - OLF COUPEVILLE PRIVATEE': 'WHIDBEY IS WA NAS COUPEVILLE',
    'NAS OCEANA - NALF FENTRESS': 'OCEANA VA NAS',
}
for old_name, new_name in renames_2018.items():
    df_2018['name'] = df_2018['name'].str.replace(old_name, new_name)

df_2018.head()

Unnamed: 0,branch,name,state,date,number systems tested,number above EPA LHA,Results (PFOS/ PFOA),"Actions Taken as of August 31, 2017",Actions Planned to reduce,purveyor,on or off base,results,analyte
0,USMC,MCB CAMP PENDLETON SOUTH,California,2017,1,1,77,One sample exceeded the PFOS/PFOA combined Health Advisories Level. -The affected reservoir was drained and replaced with water from another source. -Resampling confirmed levels below the Health Advisories Level in the current water supply and levels slightly above the Health Advisories Level in the well taken out of service. Installation will continue to monitor the system.,Installation will continue to monitor the system.,DOD,on,77.0,
1,Army,99TH RSC MARTINSBURG MEMORIAL USARC,West Virginia,2017,1,1,0-79/0-71,"The results reported were the range of PFOS/PFOA in 2014, and the public water system is not reporting exceedances to users due to compliance averaging of all test results.",,non-DOD,on,79.0,
2,Army,EL CAMPO,Texas,2017,1,1,79,Well is designated as non-potable. Bottled water is being provided for an alternate drinking water supply. A Granular Activated Carbon Filter System installed and confirmation sampling conducted in August 2017.,"Additional sampling will be conducted in November 2017 and February 2018 to determine if filtration system is reducing drinking water below the LHA. If sampling results determine the water source is below the LHA, bottled water usage will be discontinued and the water well reestablished as potable.",DOD,on,79.0,
3,Army,"SOTO CANO AB, HN",Honduras,2017,1,1,PFOA + PFOS combined = 72.5- 82.9,"Initial, confirmatory and subsequent sampling conducted at multiple wells (Dec 16-Feb 17). Bottled water is routinely used throughout the installation, except at DFAC (for ice, coffee, tea, etc.); larger carbon filter installed to accommodate the flow rate servicing DFAC; subsequent confirmation sample indicated PFOA/PFOS <LHA at DFAC prep sink; except at DFAC, bottled water will remain the norm for consumption. Implemented recurring sampling/analysis. Consumer notification conducted 09 Mar 17. Implemented recurring sampling/analysis.",,DOD,on,82.9,PFOA+PFOS
4,Air Force - ACTIVE,NEW BOSTON AFS,New Hampshire,2017,2,1,13-83 ppt (PFOS+PFOA),Drinking water sampling and analysis. DW well above the EPA LHAs has been shut down. The AF is conducting quarterly sampling for the operational drinking water wells.,Project has been awarded to add Activated Carbon to impacted DW well to fix PFOS/PFOA. Quarterly testing will continue,DOD,on,83.0,PFOA+PFOS


In [7]:
#rename installation names to allow for better matching
#lift the row display limit so whole frames can be inspected
pd.options.display.max_rows = None
#strip parentheses first; the GRAYLING pattern below is written without them
map_70['name'] = map_70['name'].map(lambda raw: raw.replace('(', '').replace(')', ''))
#substring replacements, applied one after another in the order listed
renames_21_23 = {
    'WILLOW GROVE NASJRB': 'WILLOW GROVE',
    'WARMINSTER NAWC AD': 'WARMINSTER',
    'MTC-H CAMP GRAYLING AIRFIELD INSTALLATION-WIDE PAI': 'CAMP GRAYLING MTC',
}
for old_name, new_name in renames_21_23.items():
    map_70['name'] = map_70['name'].str.replace(old_name, new_name)
map_70.head()

Unnamed: 0,state,branch,name,date,method,analyte,result,MCLs,exceedance,LimitDetect,units,qualifier,TreatSystem,TreatStatus,latitude,longitude
0,Arizona,Air Force,AIR FORCE PLANT 44,2022-10-11,QSM_B15,PFOS,29.2,70,False,4.0,ng/L,,Yes,pre-treatment,32.221743,-110.926479
1,Tennessee,Air Force,ARNOLD AFB,2021-12-14,QSM_B15,PFOA,40.9,70,False,4.2,ng/L,,No,,35.39881,-86.078066
2,Florida,Air Force,AVON PARK AIR FORCE RESERVE,2021-12-20,QSM_B15,PFOS,40.3,70,False,4.0,ng/L,,No,,27.59567,-81.506186
3,South Carolina,Navy,BEAUFORT SC MCAS,2022-12-09,537.1,PFOA,1.49,70,False,1.19,ng/L,J,No,,32.475964,-80.726366
4,Washington,Navy,BREMERTON WA NAVBASE,2022-05-03,537.1,PFOA,0.478,70,False,1.12,ng/L,J,No,,47.711998,-122.714478


In [8]:
#copy to use the dataframe in another notebook
#map_70_rename = map_70.copy()
#%store map_70_rename

In [9]:
#calculate similarity scores for both datasets (2018 / pdf and 2021-2023 / webscraped):
#for every 2021-2023 installation name, keep the single best-scoring 2018 name and its score
name_2018 = []
similarity = []
for installation in map_70.name:
    best_name, best_score = process.extract(installation, df_2018.name, limit=1)[0][:2]
    name_2018.append(best_name)
    similarity.append(best_score)
#assign the plain lists (positional) rather than pd.Series(...): a Series is aligned on index
#labels, which would misplace values or insert NaN if map_70 does not have a default 0..n-1 index
map_70['name_2018'] = name_2018
map_70['similarity'] = similarity
map_70.head()

Unnamed: 0,state,branch,name,date,method,analyte,result,MCLs,exceedance,LimitDetect,units,qualifier,TreatSystem,TreatStatus,latitude,longitude,name_2018,similarity
0,Arizona,Air Force,AIR FORCE PLANT 44,2022-10-11,QSM_B15,PFOS,29.2,70,False,4.0,ng/L,,Yes,pre-treatment,32.221743,-110.926479,FORMER PLATTSBURGH AFB,52
1,Tennessee,Air Force,ARNOLD AFB,2021-12-14,QSM_B15,PFOA,40.9,70,False,4.2,ng/L,,No,,35.39881,-86.078066,MOUNTAIN HOME AFB,86
2,Florida,Air Force,AVON PARK AIR FORCE RESERVE,2021-12-20,QSM_B15,PFOS,40.3,70,False,4.0,ng/L,,No,,27.59567,-81.506186,MARINE FORCES RESERVE MARFORRES-MARINE CORPS RESERVE TRAINING CENTER MCRTC STEWART,86
3,South Carolina,Navy,BEAUFORT SC MCAS,2022-12-09,537.1,PFOA,1.49,70,False,1.19,ng/L,J,No,,32.475964,-80.726366,JB LEWIS-MCCHORD: FORT LEWIS CANTONMENT,50
4,Washington,Navy,BREMERTON WA NAVBASE,2022-05-03,537.1,PFOA,0.478,70,False,1.12,ng/L,J,No,,47.711998,-122.714478,OCEANA VA NAS,56


In [10]:
#keep only the matches scoring 87 or higher. Anything lower has noise.
high_score = map_70["similarity"] >= 87

#columns not needed for the 2018 vs 2021-2023 comparison
unused_columns = ['method',
                  'LimitDetect',
                  'qualifier',
                  'TreatSystem',
                  'TreatStatus',
                  'latitude',
                  'longitude']
map_70_87 = map_70.loc[high_score].drop(columns=unused_columns)
map_70_87.head()

Unnamed: 0,state,branch,name,date,analyte,result,MCLs,exceedance,units,name_2018,similarity
5,Pennsylvania,Air Force,BIDDLE ANGB FORMERLY HORSHAM/WILLOW GROVE ANG,2022-01-27,PFOS,26.0,70,False,ng/L,BIDDLE ANGB,90
13,Delaware,Air Force,DOVER AFB,2022-01-22,PFOS,,70,False,ng/L,DOVER AFB,100
15,Alaska,Air Force,EIELSON AFB,2022-06-29,PFOS,9.0,70,False,ng/L,EIELSON AFB,100
16,South Dakota,Air Force,ELLSWORTH AFB,2022-03-14,PFOS,342.0,70,True,ng/L,ELLSWORTH AFB,100
17,Washington,Air Force,FAIRCHILD AFB,2022-05-31,PFOA,2.9,70,False,ng/L,FAIRCHILD AFB,100


In [11]:
#keep just the 2018 columns needed for merging, suffixed with _2018 so they
#stay distinguishable from the 2021-2023 columns
df_2018_select = (
    df_2018.loc[:, ['name', 'analyte', 'results']]
    .rename(columns={'name': 'name_2018',
                     'analyte': 'analyte_2018',
                     'results': 'results_2018'})
)

df_2018_select.head()

Unnamed: 0,name_2018,analyte_2018,results_2018
0,MCB CAMP PENDLETON SOUTH,,77.0
1,99TH RSC MARTINSBURG MEMORIAL USARC,,79.0
2,EL CAMPO,,79.0
3,"SOTO CANO AB, HN",PFOA+PFOS,82.9
4,NEW BOSTON AFS,PFOA+PFOS,83.0


In [12]:
# merge dataframes to compare concentrations of installations found in both the 2018 and 2021-2023 dataframes
# steps: left-join the 2018 values onto the matched 2021-2023 rows, reorder the columns,
# suffix the 2021-2023 columns, then make the 2021-2023 results numeric
map_70_87_results = (
    map_70_87
    .merge(df_2018_select, how='left', on=['name_2018'])
    .loc[:, ['state',
             'branch',
             'similarity',
             'name_2018',
             'analyte_2018',
             'results_2018',
             'name',
             'date',
             'analyte',
             'result',
             'MCLs',
             'exceedance',
             'units']]
    .rename(columns={'name': 'name_21_23',
                     'analyte': 'analyte_21_23',
                     'result': 'results_21_23',
                     'exceedance': 'exceedance_21_23'})
    # turn the literal string 'nan' into a real NaN, then convert the column to numeric
    .assign(results_21_23=lambda merged: pd.to_numeric(
        merged['results_21_23'].replace('nan', np.nan)))
)

map_70_87_results

Unnamed: 0,state,branch,similarity,name_2018,analyte_2018,results_2018,name_21_23,date,analyte_21_23,results_21_23,MCLs,exceedance_21_23,units
0,Pennsylvania,Air Force,90,BIDDLE ANGB,PFOA+PFOS,290.0,BIDDLE ANGB FORMERLY HORSHAM/WILLOW GROVE ANG,2022-01-27,PFOS,26.0,70,False,ng/L
1,Pennsylvania,Air Force,90,BIDDLE ANGB,PFOA+PFOS,1870.0,BIDDLE ANGB FORMERLY HORSHAM/WILLOW GROVE ANG,2022-01-27,PFOS,26.0,70,False,ng/L
2,Delaware,Air Force,100,DOVER AFB,PFOA+PFOS,91.0,DOVER AFB,2022-01-22,PFOS,,70,False,ng/L
3,Alaska,Air Force,100,EIELSON AFB,PFOA+PFOS,111.0,EIELSON AFB,2022-06-29,PFOS,9.0,70,False,ng/L
4,Alaska,Air Force,100,EIELSON AFB,PFOA+PFOS,2222.0,EIELSON AFB,2022-06-29,PFOS,9.0,70,False,ng/L
5,South Dakota,Air Force,100,ELLSWORTH AFB,PFOA+PFOS,182.0,ELLSWORTH AFB,2022-03-14,PFOS,342.0,70,True,ng/L
6,Washington,Air Force,100,FAIRCHILD AFB,PFOA+PFOS,5700.0,FAIRCHILD AFB,2022-05-31,PFOA,2.9,70,False,ng/L
7,New Jersey,Air Force,98,JOINT BASE MCGUIRE- DIX-LAKEHURST,PFOA+PFOS,1688.0,JOINT BASE MCGUIRE-DIX-LAKEHURST,2021-11-09,PFOS,83.0,70,True,ng/L
8,Michigan,Air Force,90,FORMER KI SAWYER AFB,PFOA+PFOS,3651.0,KI SAWYER AFB,2022-07-13,PFOS,,70,False,ng/L
9,Michigan,Army,100,CAMP GRAYLING MTC,,188.0,CAMP GRAYLING MTC,2022-05-10,PFOS,1.42,70,False,ng/L


In [13]:
#map_70_87_results.to_csv(r"C:\Users\OITNYNWilsoS\OneDrive - Department of Veterans Affairs\PACT Act\MET\deliverables\compare_2018_and_21_23.csv", float_format='{:f}'.format, encoding='utf-8', index=False)

In [14]:
#confirm the 2021-2023 results column is numeric; otherwise the styling in the cell block below won't work
map_70_87_results['results_21_23'].dtype

dtype('float64')

In [15]:
#highlight rows where the 2021-2023 concentration is greater than or equal to the 2018 concentration

def highlight(table):
    """Return one CSS background rule per cell for a single row.

    `table` is one row (the function is applied with axis=1) exposing
    `results_21_23` and `results_2018`. Every cell in the row is yellow when
    the 2021-2023 result is greater than or equal to the 2018 result, and
    white otherwise (including when the comparison is False because of NaN).
    """
    not_lower = table.results_21_23 >= table.results_2018
    shade = 'background-color: yellow' if not_lower else 'background-color: white'
    return [shade] * len(table)
    
    
#keep the styled view under its own name so map_70_87_results stays a DataFrame and this
#cell can be re-run (rebinding it to the Styler makes the next ".style" lookup fail)
map_70_87_styled = map_70_87_results.style.apply(highlight, axis=1).format(precision=1)

map_70_87_styled


Unnamed: 0,state,branch,similarity,name_2018,analyte_2018,results_2018,name_21_23,date,analyte_21_23,results_21_23,MCLs,exceedance_21_23,units
0,Pennsylvania,Air Force,90,BIDDLE ANGB,PFOA+PFOS,290.0,BIDDLE ANGB FORMERLY HORSHAM/WILLOW GROVE ANG,2022-01-27,PFOS,26.0,70,False,ng/L
1,Pennsylvania,Air Force,90,BIDDLE ANGB,PFOA+PFOS,1870.0,BIDDLE ANGB FORMERLY HORSHAM/WILLOW GROVE ANG,2022-01-27,PFOS,26.0,70,False,ng/L
2,Delaware,Air Force,100,DOVER AFB,PFOA+PFOS,91.0,DOVER AFB,2022-01-22,PFOS,,70,False,ng/L
3,Alaska,Air Force,100,EIELSON AFB,PFOA+PFOS,111.0,EIELSON AFB,2022-06-29,PFOS,9.0,70,False,ng/L
4,Alaska,Air Force,100,EIELSON AFB,PFOA+PFOS,2222.0,EIELSON AFB,2022-06-29,PFOS,9.0,70,False,ng/L
5,South Dakota,Air Force,100,ELLSWORTH AFB,PFOA+PFOS,182.0,ELLSWORTH AFB,2022-03-14,PFOS,342.0,70,True,ng/L
6,Washington,Air Force,100,FAIRCHILD AFB,PFOA+PFOS,5700.0,FAIRCHILD AFB,2022-05-31,PFOA,2.9,70,False,ng/L
7,New Jersey,Air Force,98,JOINT BASE MCGUIRE- DIX-LAKEHURST,PFOA+PFOS,1688.0,JOINT BASE MCGUIRE-DIX-LAKEHURST,2021-11-09,PFOS,83.0,70,True,ng/L
8,Michigan,Air Force,90,FORMER KI SAWYER AFB,PFOA+PFOS,3651.0,KI SAWYER AFB,2022-07-13,PFOS,,70,False,ng/L
9,Michigan,Army,100,CAMP GRAYLING MTC,,188.0,CAMP GRAYLING MTC,2022-05-10,PFOS,1.4,70,False,ng/L
