In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load kentucky.csv into a DataFrame; the relative path is resolved against
# the current working directory of the kernel.
df = pd.read_csv(filepath_or_buffer="kentucky.csv")

In [3]:
# isolate the analytes: keep the eight analyte columns plus the sample
# identifier, and normalise the headers to upper case with spaces removed
# (so 'HFPO- DA' becomes 'HFPO-DA' and 'Sample' becomes 'SAMPLE')
analyte_data = df[
    ['PFBS', 'HFPO- DA', 'PFHpA', 'PFHxS', 'ADONA', 'PFOA', 'PFOS', 'PFNA', 'Sample']
].rename(columns=lambda header: header.upper().replace(' ', ''))

# reshape wide -> long: one row per (SAMPLE, analyte) pair, with the measured
# value stored as a float in "concentration"
analyte_melted = analyte_data.melt(
    id_vars="SAMPLE",
    var_name="analyte",
    value_name="concentration",
).astype({"concentration": float})
analyte_melted.head()

Unnamed: 0,SAMPLE,analyte,concentration
0,AQ03020,PFBS,0.0
1,AQ02922,PFBS,1.88
2,AQ04874,PFBS,0.0
3,AQ03210,PFBS,0.0
4,AQ03216,PFBS,0.0


In [4]:
# isolate the non-analyte data: every column that describes the sample itself
# rather than a measured analyte concentration
non_analyte_data = df[
    [
        # leftover row-number column from the CSV and the sample identifier
        'Unnamed: 0', 'Sample',
        # collection / receipt dates
        'Collected', 'Received',
        # site and sample description
        'Location', 'Type', 'Units',
        # coordinates
        'Latitude', 'Longitude',
        # hydrology and land-use descriptors
        'Aquifer/Source', 'River Basin', 'Aquifer General', 'Land Use',
    ]
]

non_analyte_data.head()

Unnamed: 0.1,Unnamed: 0,Sample,Collected,Received,Location,Type,Units,Latitude,Longitude,Aquifer/Source,River Basin,Aquifer General,Land Use
0,0,AQ03020,7/15/2019,7/16/2019,Ashland Water Works,SW,ng/L,38.452778,-82.613056,Ohio R,Ohio,,urban
1,1,AQ02922,7/8/2019,7/9/2019,Augusta WTP,GW,ng/L,38.773611,-84.019444,Ohio R Alluvium,,ORA,rural
2,2,AQ04874,10/1/2019,10/1/2019,Barbourville,SW,ng/L,36.864444,-83.881944,Cumberland River,Cumberland,,urban
3,3,AQ03210,7/22/2019,7/23/2019,Bardstown Municipal Water,SW,ng/L,37.810556,-85.505833,Buffalo Cr,Salt,,rural
4,4,AQ03216,7/22/2019,7/23/2019,Beaver Dam Municipal Water,GW,ng/L,37.4025,-86.8775,Breathitt SS,,Pennsylvanian Sandstone,urban


In [5]:
# re-combine analyte and non-analyte data
# The long-format key is renamed to "Sample" first so both frames join on one
# shared column; this avoids creating a duplicate "SAMPLE" key column that
# would only have to be dropped again. The whole pipeline builds a new frame
# (no inplace mutation), so re-running this cell always starts from the inputs.
df_cleaned = (
    non_analyte_data
    .merge(analyte_melted.rename(columns={"SAMPLE": "Sample"}), on="Sample")
    # drop unnecessary columns
    .drop(columns=["Unnamed: 0"])
    # adjust column names: lower-case everything, then apply the final names
    .rename(columns=str.lower)
    .rename(columns={"sample": "sample_id", "type": "source_type"})
    # set latitude and longitude to be floats
    .astype({"latitude": float, "longitude": float})
)

# set the type: expand the two-letter source codes to readable labels.
# replace() leaves any code that is not in the mapping (including missing
# values) untouched instead of aborting the cell with a bare KeyError.
water_type = {"SW": "surface water", "GW": "groundwater"}

df_cleaned = df_cleaned.assign(
    source_type=df_cleaned["source_type"].replace(water_type),
    # add the state column (a scalar is broadcast to every row)
    state="Kentucky",
    # is_detected column: True only for strictly positive concentrations;
    # a missing concentration compares as False
    is_detected=df_cleaned["concentration"].gt(0.0),
)

df_cleaned

Unnamed: 0,sample_id,collected,received,location,source_type,units,latitude,longitude,aquifer/source,river basin,aquifer general,land use,analyte,concentration,state,is_detected
0,AQ03020,7/15/2019,7/16/2019,Ashland Water Works,surface water,ng/L,38.452778,-82.613056,Ohio R,Ohio,,urban,PFBS,0.00,Kentucky,False
1,AQ03020,7/15/2019,7/16/2019,Ashland Water Works,surface water,ng/L,38.452778,-82.613056,Ohio R,Ohio,,urban,HFPO-DA,18.30,Kentucky,True
2,AQ03020,7/15/2019,7/16/2019,Ashland Water Works,surface water,ng/L,38.452778,-82.613056,Ohio R,Ohio,,urban,PFHPA,0.00,Kentucky,False
3,AQ03020,7/15/2019,7/16/2019,Ashland Water Works,surface water,ng/L,38.452778,-82.613056,Ohio R,Ohio,,urban,PFHXS,0.00,Kentucky,False
4,AQ03020,7/15/2019,7/16/2019,Ashland Water Works,surface water,ng/L,38.452778,-82.613056,Ohio R,Ohio,,urban,ADONA,0.00,Kentucky,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,AQ03575,8/12/2019,8/13/2019,Winchester Municipal Utilities,surface water,ng/L,37.947222,-84.228333,L Howard Cr,Kentucky,,rural,PFHXS,0.00,Kentucky,False
644,AQ03575,8/12/2019,8/13/2019,Winchester Municipal Utilities,surface water,ng/L,37.947222,-84.228333,L Howard Cr,Kentucky,,rural,ADONA,0.00,Kentucky,False
645,AQ03575,8/12/2019,8/13/2019,Winchester Municipal Utilities,surface water,ng/L,37.947222,-84.228333,L Howard Cr,Kentucky,,rural,PFOA,0.00,Kentucky,False
646,AQ03575,8/12/2019,8/13/2019,Winchester Municipal Utilities,surface water,ng/L,37.947222,-84.228333,L Howard Cr,Kentucky,,rural,PFOS,1.69,Kentucky,True


- Which should be the sampling date, `collected` or `received`?
    - Collected should be the sampling date
- What should we do with the following columns: `aquifer/source`, `river basin`, `aquifer general`, `land use`?
    - Keep `river basin` (could link to `pws_id`) and `land use`
- Need info regarding `pws_id`, `data_source`, `population`, well information if possible (ask Xiaojun)
    - Check the google drive
    - Use external census data for the population or an API 
    - Data source can be the dataset name (local)