## This notebook adds geospatial data from one dataframe to another. It uses fuzzy matching between military installation names 

### all of the data was scraped from a Department of Defense's  Per- and polyfluoroalkyl substances (PFAS)  website: https://www.acq.osd.mil/eie/eer/ecc/pfas/map/pfasmap.html

In [1]:
import os
import pandas as pd
import plotly.express as px
import geopandas as gpd
import matplotlib
import matplotlib.pyplot as plt
from fuzzywuzzy import process
import numpy as np

os.chdir(
r'C:\Users\OITNYNWilsoS\OneDrive - Department of Veterans Affairs\python\PACT Act - MET\final version January 2024')
cwd = os.getcwd()
print("Current working directory is:", cwd)

Current working directory is: C:\Users\OITNYNWilsoS\OneDrive - Department of Veterans Affairs\python\PACT Act - MET\final version January 2024


In [2]:
#metadata_df is a stored table of 703 military installations and their spatial data. This data was webscraped
# from DOD's PFAS website. See the "DAPM-PFAS-web-scrape-and-map" repository on Veterans Affairs GitHub account. 
# pulling from an older csv file because newest pull did not have South Korea data

geo = pd.read_csv(
r"C:\Users\OITNYNWilsoS\OneDrive - Department of Veterans Affairs\PACT Act\MET\deliverables\military_bases_spatial.csv")

geo.head()

Unnamed: 0,branch,state,name,type,latitude,longitude
0,Army,Virgin Islands,AAOF BLAIR HANGAR,National Guard,17.725062,-64.782945
1,Army,Alabama,AASF #1 R W SHEPHERD HOPE HULL,National Guard,32.287216,-86.394073
2,Army,Alabama,AASF #2 BIRMINGHAM,National Guard,33.572333,-86.750542
3,Army,Tennessee,AASF #2 TN,National Guard,35.821234,-83.994949
4,Army,Washington,AASF #2 WA,National Guard,47.618166,-117.641352


In [3]:
#pull in the data from March 2018 publically avilable DOD report on PFAS in drinking water. This data was scraped from
#a PDF file. See "DAPM-PFAS-PDF-scrape" repoisitory on VA's GitHub account.

df_2018 = pd.read_csv(
r"C:\Users\OITNYNWilsoS\OneDrive - Department of Veterans Affairs\PACT Act\MET\deliverables\Jan_2024\max_PFAS_17.csv")

df_2018.head()

Unnamed: 0,branch,name,state,date,number_systems_tested,number_above_EPA_LHA,results_raw,actions_as_of_August_31_2017,planned_actions_to_reduce,purveyor,on_or_off_base,results,analyte
0,Army,81ST RSC: E. EARLE RIVES AFRC,North Carolina,2017,1,1,<40-90,PFOS per 2014 Consumer Confidence Report. The ...,,non-DOD,on,90.0,
1,Navy - BRAC,AGANA,Guam,2017,12,5,88 - 410,Sampling conducted by Guam Waterworks Authorit...,,non-DOD,off,410.0,
2,Air Force - ANG,BARNES MUNICIPAL,Massachusetts,2017,1,1,43-160 ppt (PFOS+PFOA),Base obtains water from Westfield Water Dept. ...,Water sources will continue to be sampled for ...,non-DOD,on,160.0,PFOA+PFOS
3,Navy,BARROW AK NARL,Alaska,2017,5,5,144 - 262,*Surface water (Imikpuk Lake) sampled; serves ...,Continue investigation,non-DOD,off,262.0,
4,Army,BELMONT ARMORY,Michigan,2017,1,1,86-96.8,MIARNG spoke with potential responsible party ...,"MIARNG is waiting for response from PRP, explo...",DOD,on,96.8,


In [4]:
#code to determine number of unique installations
unique = df_2018["name"].nunique()
unique

51

In [5]:
df_2018.head()

Unnamed: 0,branch,name,state,date,number_systems_tested,number_above_EPA_LHA,results_raw,actions_as_of_August_31_2017,planned_actions_to_reduce,purveyor,on_or_off_base,results,analyte
0,Army,81ST RSC: E. EARLE RIVES AFRC,North Carolina,2017,1,1,<40-90,PFOS per 2014 Consumer Confidence Report. The ...,,non-DOD,on,90.0,
1,Navy - BRAC,AGANA,Guam,2017,12,5,88 - 410,Sampling conducted by Guam Waterworks Authorit...,,non-DOD,off,410.0,
2,Air Force - ANG,BARNES MUNICIPAL,Massachusetts,2017,1,1,43-160 ppt (PFOS+PFOA),Base obtains water from Westfield Water Dept. ...,Water sources will continue to be sampled for ...,non-DOD,on,160.0,PFOA+PFOS
3,Navy,BARROW AK NARL,Alaska,2017,5,5,144 - 262,*Surface water (Imikpuk Lake) sampled; serves ...,Continue investigation,non-DOD,off,262.0,
4,Army,BELMONT ARMORY,Michigan,2017,1,1,86-96.8,MIARNG spoke with potential responsible party ...,"MIARNG is waiting for response from PRP, explo...",DOD,on,96.8,


In [6]:
geo.head()

Unnamed: 0,branch,state,name,type,latitude,longitude
0,Army,Virgin Islands,AAOF BLAIR HANGAR,National Guard,17.725062,-64.782945
1,Army,Alabama,AASF #1 R W SHEPHERD HOPE HULL,National Guard,32.287216,-86.394073
2,Army,Alabama,AASF #2 BIRMINGHAM,National Guard,33.572333,-86.750542
3,Army,Tennessee,AASF #2 TN,National Guard,35.821234,-83.994949
4,Army,Washington,AASF #2 WA,National Guard,47.618166,-117.641352


In [7]:
#use fuzzy matching compare names in the spatial and 2018 dataframe. Create a column called "similiarity"
#that reports the score of the most similar names between the two. 
name_geo = []
similarity = []
for i in df_2018.name:
        ratio = process.extract(i, geo.name, limit=1)
        name_geo.append(ratio[0][0])
        similarity.append(ratio[0][1])
df_2018['name_geo'] = pd.Series(name_geo)
df_2018['name_geo'] = df_2018['name_geo']
df_2018['similarity'] = pd.Series(similarity)
df_2018 = df_2018.sort_values(by=['similarity'], ascending=True)
df_2018.head(n=8)

Unnamed: 0,branch,name,state,date,number_systems_tested,number_above_EPA_LHA,results_raw,actions_as_of_August_31_2017,planned_actions_to_reduce,purveyor,on_or_off_base,results,analyte,name_geo,similarity
31,Navy,NSF DIEGO GARCIA,Diego Garcia,2017,5,3,"77-5,849",Alternate drinking water was already being pro...,Continued monitoring for PFOS and PFOA at the ...,DOD,on,5849.0,,NAVBASE SAN DIEGO,58
0,Army,81ST RSC: E. EARLE RIVES AFRC,North Carolina,2017,1,1,<40-90,PFOS per 2014 Consumer Confidence Report. The ...,,non-DOD,on,90.0,,NMC DET EARLE,86
42,Army,"USAG RED CLOUD, KR: CAMP RED CLOUD",South Korea,2017,1,1,PFOA + PFOS combined = 171- 466,"Initial, confirmatory and subsequent sampling ...",Continue to use Army system wells producing wa...,DOD,on,466.0,PFOA+PFOS,CAMP LEJEUNE NC MCB,86
41,Army,"USAG BENELUX - CASERNE DAUMERIE, BE",Belgium,2017,1,1,PFOA + PFOS combined = 84-94,Purchased water - Initial and confirmatory sam...,Continued use of bottled water until closure/t...,non-DOD,on,94.0,PFOA+PFOS,USAG ITALY - VICENZA,86
24,USMC,MCB CAMP PENDLETON SOUTH,California,2017,1,1,77,One sample exceeded the PFOS/PFOA combined Hea...,Installation will continue to monitor the system.,DOD,on,77.0,,CAMP PENDLETON CA MCB,87
1,Navy - BRAC,AGANA,Guam,2017,12,5,88 - 410,Sampling conducted by Guam Waterworks Authorit...,,non-DOD,off,410.0,,GUAM AGANA NAS,90
43,Army,"USAG RED CLOUD, KR: CAMP STANLEY",South Korea,2017,1,1,"PFOA + PFOS combined = 80- 1,061","Initial, confirmatory and subsequent sampling ...",Continue to use Army system wells producing wa...,DOD,on,1061.0,PFOA+PFOS,CAMP STANLEY,90
29,Air Force - ANG,NEW CASTLE ANGB 166TH,Delaware,2017,1,1,140-1800 ppt (PFOS+PFOA),Artesian Water Company discontinued the use of...,Supplier took actions consistent with AF policy,non-DOD,on,1800.0,PFOA+PFOS,NEW CASTLE,90


In [8]:
#only keep rows with a similarity match greater than or equal to 87.
#After visual inspection, anything less than 87 resulted in a poor match between installation names. 
df_2018_87 = df_2018[df_2018["similarity"] >= 87]
df_2018_87 = df_2018_87.reset_index(drop=True)
df_2018_87.head()

Unnamed: 0,branch,name,state,date,number_systems_tested,number_above_EPA_LHA,results_raw,actions_as_of_August_31_2017,planned_actions_to_reduce,purveyor,on_or_off_base,results,analyte,name_geo,similarity
0,USMC,MCB CAMP PENDLETON SOUTH,California,2017,1,1,77,One sample exceeded the PFOS/PFOA combined Hea...,Installation will continue to monitor the system.,DOD,on,77.0,,CAMP PENDLETON CA MCB,87
1,Navy - BRAC,AGANA,Guam,2017,12,5,88 - 410,Sampling conducted by Guam Waterworks Authorit...,,non-DOD,off,410.0,,GUAM AGANA NAS,90
2,Army,"USAG RED CLOUD, KR: CAMP STANLEY",South Korea,2017,1,1,"PFOA + PFOS combined = 80- 1,061","Initial, confirmatory and subsequent sampling ...",Continue to use Army system wells producing wa...,DOD,on,1061.0,PFOA+PFOS,CAMP STANLEY,90
3,Air Force - ANG,NEW CASTLE ANGB 166TH,Delaware,2017,1,1,140-1800 ppt (PFOS+PFOA),Artesian Water Company discontinued the use of...,Supplier took actions consistent with AF policy,non-DOD,on,1800.0,PFOA+PFOS,NEW CASTLE,90
4,Army,JB LEWIS-MCCHORD: FORT LEWIS CANTONMENT,Washington,2017,4,4,Golf Course: PFOA + PFOS combined = 76-78 JBLM...,"Initial, confirmatory and subsequent sampling ...",Award contract to install GAC treatment units ...,DOD,on,250.0,PFOA+PFOS,JB LEWIS MCCHORD,90


In [9]:
#select and rename the columns from the spatial dataframe that need to be attached to the 2018 dataframe
geo_select = geo[['name','latitude','longitude']]
geo_select = geo_select.rename(columns = {'name': 'name_geo'})

geo_select.head()

Unnamed: 0,name_geo,latitude,longitude
0,AAOF BLAIR HANGAR,17.725062,-64.782945
1,AASF #1 R W SHEPHERD HOPE HULL,32.287216,-86.394073
2,AASF #2 BIRMINGHAM,33.572333,-86.750542
3,AASF #2 TN,35.821234,-83.994949
4,AASF #2 WA,47.618166,-117.641352


In [10]:
#merged 2018 dataframe with similarity scores >=87 and the selected dataframe columns. This attaches the spatial data
#to the 2018 dataframe if there was a good name match. 
df_2018_87_geo = df_2018_87.merge(geo_select, on='name_geo', how='left')
df_2018_87_geo  = df_2018_87_geo.sort_values('name', ascending=True)
df_2018_87_geo = df_2018_87_geo.reset_index(drop=True)
df_2018_87_geo.head()

Unnamed: 0,branch,name,state,date,number_systems_tested,number_above_EPA_LHA,results_raw,actions_as_of_August_31_2017,planned_actions_to_reduce,purveyor,on_or_off_base,results,analyte,name_geo,similarity,latitude,longitude
0,Navy - BRAC,AGANA,Guam,2017,12,5,88 - 410,Sampling conducted by Guam Waterworks Authorit...,,non-DOD,off,410.0,,GUAM AGANA NAS,90,13.473339,144.824168
1,Air Force - ANG,BARNES MUNICIPAL,Massachusetts,2017,1,1,43-160 ppt (PFOS+PFOA),Base obtains water from Westfield Water Dept. ...,Water sources will continue to be sampled for ...,non-DOD,on,160.0,PFOA+PFOS,BARNES MUNICIPAL,100,42.168377,-72.717661
2,Navy,BARROW AK NARL,Alaska,2017,5,5,144 - 262,*Surface water (Imikpuk Lake) sampled; serves ...,Continue investigation,non-DOD,off,262.0,,BARROW AK NARL,100,71.321729,-156.647955
3,Army,BELMONT ARMORY,Michigan,2017,1,1,86-96.8,MIARNG spoke with potential responsible party ...,"MIARNG is waiting for response from PRP, explo...",DOD,on,96.8,,BELMONT ARMORY,100,35.245984,-81.019782
4,Air Force - ANG,BIDDLE ANGB FORMERLY HORSHAM/WILLOW GROVE ANG,Pennsylvania,2017,2,2,82-1870 ppt (PFOS+PFOA),Public Wells: Seven (7) impacted DW wells take...,Public Wells: Carbon filtration construction c...,non-DOD,off,1870.0,PFOA+PFOS,BIDDLE ANGB (FORMERLY HORSHAM/WILLOW GROVE ANG),99,40.208241,-75.148506


In [11]:
unique = df_2018_87_geo["name"].nunique()
unique

47

In [12]:
#add data that did not have a spatial match to spatial df
df_2018_all_geo = pd.concat([df_2018_87_geo, df_2018], join='outer')
df_2018_all_geo = df_2018_all_geo.sort_values(['latitude'],ascending=False).drop_duplicates('name')
df_2018_all_geo  = df_2018_all_geo.sort_values('name', ascending=True)
df_2018_all_geo = df_2018_all_geo.reset_index(drop=True)
df_2018_all_geo.head()

Unnamed: 0,branch,name,state,date,number_systems_tested,number_above_EPA_LHA,results_raw,actions_as_of_August_31_2017,planned_actions_to_reduce,purveyor,on_or_off_base,results,analyte,name_geo,similarity,latitude,longitude
0,Army,81ST RSC: E. EARLE RIVES AFRC,North Carolina,2017,1,1,<40-90,PFOS per 2014 Consumer Confidence Report. The ...,,non-DOD,on,90.0,,NMC DET EARLE,86,,
1,Navy - BRAC,AGANA,Guam,2017,12,5,88 - 410,Sampling conducted by Guam Waterworks Authorit...,,non-DOD,off,410.0,,GUAM AGANA NAS,90,13.473339,144.824168
2,Air Force - ANG,BARNES MUNICIPAL,Massachusetts,2017,1,1,43-160 ppt (PFOS+PFOA),Base obtains water from Westfield Water Dept. ...,Water sources will continue to be sampled for ...,non-DOD,on,160.0,PFOA+PFOS,BARNES MUNICIPAL,100,42.168377,-72.717661
3,Navy,BARROW AK NARL,Alaska,2017,5,5,144 - 262,*Surface water (Imikpuk Lake) sampled; serves ...,Continue investigation,non-DOD,off,262.0,,BARROW AK NARL,100,71.321729,-156.647955
4,Army,BELMONT ARMORY,Michigan,2017,1,1,86-96.8,MIARNG spoke with potential responsible party ...,"MIARNG is waiting for response from PRP, explo...",DOD,on,96.8,,BELMONT ARMORY,100,35.245984,-81.019782


In [13]:
df_2018_geo = df_2018_all_geo
#select only key columns for ArcGIS map
df_2018_geo = df_2018_geo[['state',
                            'branch',  
                            'name',
                            'date',
                            'results',
                            'purveyor',
                            'on_or_off_base',
                            'analyte',
                            'longitude',
                            'latitude']]

df_2018_geo.head()

Unnamed: 0,state,branch,name,date,results,purveyor,on_or_off_base,analyte,longitude,latitude
0,North Carolina,Army,81ST RSC: E. EARLE RIVES AFRC,2017,90.0,non-DOD,on,,,
1,Guam,Navy - BRAC,AGANA,2017,410.0,non-DOD,off,,144.824168,13.473339
2,Massachusetts,Air Force - ANG,BARNES MUNICIPAL,2017,160.0,non-DOD,on,PFOA+PFOS,-72.717661,42.168377
3,Alaska,Navy,BARROW AK NARL,2017,262.0,non-DOD,off,,-156.647955,71.321729
4,Michigan,Army,BELMONT ARMORY,2017,96.8,DOD,on,,-81.019782,35.245984


In [14]:
df_2018_geo.shape

(51, 10)

In [15]:
#calculate number of unique installations with spatial data
unique = df_2018_geo["name"].nunique()
unique

51

In [16]:
#export 2018 geocoded data to csv to use in ArcGIS
df_2018_geo.to_csv(
r"C:\Users\OITNYNWilsoS\OneDrive - Department of Veterans Affairs\PACT Act\MET\deliverables\Jan_2024\geo_max_PFAS_2017.csv", float_format='{:f}'.format, encoding='utf-8', index=False)

In [18]:
#create map. See readme file for the image. 
fig = px.scatter_geo(df_2018_geo,
    lon='longitude',
    lat='latitude',
    size='results',
    color_discrete_sequence=["red"],
    hover_name="name",
    title = 'Military installations with PFAS exceedances in drinking water of EPA proposed standard of 4 ppt, 2017',
)


# fig.update_layout(
#     geo_scope='usa',
#   #  margin=dict(l=50, r=50, t=20, b=20)
    
# )

fig