In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the first North Dakota table. The file's first data row carries the
# real column labels, so it is promoted to the header below.
df1 = pd.read_csv("north_dakota1.csv")

# set the columns: start from the labels stored in row 0 and override the
# four leading ones with fixed names
columns = np.array(df1.iloc[0])
columns[0] = "sample_id"
columns[1] = "analyte"
columns[2] = "health advisory limit"
columns[3] = "units"
df1.columns = columns

# drop the first two rows (row 0 was consumed as the header above; row 1 is
# discarded as well). The explicit .copy() makes df1 an independent frame, so
# the assignment below is not a write into a slice of the raw frame
# (which would trigger SettingWithCopyWarning).
df1 = df1.iloc[2:].copy()

# relabel the unit "ppt" as "ng/L"
df1.loc[df1["units"] == "ppt", "units"] = "ng/L"

In [39]:
# Columns of df1 that each hold the results for one sampling location.
# The labels (including the double spaces) must match df1's column names exactly.
location_cols = ['Landfill 3  MW 2', 'Landfill 4  MW 1', 'Landfill 5  MW 1',
                 'Landfill 5  MW 2', 'Landfill 7  MW 1', 'Landfill 10  MW 2',
                 'Landfill 12A  MW 1', 'Landfill 14  MW 1', 'Landfill 16  MW 2',
                 'FTA 4', 'FTA 5', 'GW 1']

# Columns that identify a row of df1 and are repeated for every location.
id_cols = ["sample_id", "analyte", "health advisory limit", "units"]

# un-collapse the analyte data: one output row per (df1 row, sampling location).
# Melting with all id columns (instead of melting on "analyte" alone and joining
# the other columns back on "analyte") keeps rows distinct even if an analyte
# label appears more than once, where a join would multiply the rows.
df1_long = (
    pd.melt(df1, id_vars=id_cols, value_vars=location_cols,
            var_name='sampling_location', value_name='concentration',
            ignore_index=False)
    # melt emits location-major order; a stable sort on the preserved df1 row
    # labels regroups the rows analyte by analyte, locations in listed order
    .sort_index(kind="mergesort")
    .reset_index(drop=True)
)

# Keep the raw text so detection is decided before the values are rewritten.
raw_concentration = df1_long["concentration"]

df1_cleaned = df1_long.drop(columns=["health advisory limit"]).assign(
    # clean the concentration column: "N.D." becomes 0.0 and the letter "J"
    # attached to some values is stripped before converting to float
    concentration=(
        raw_concentration
        .replace('N.D.', '0.0')
        .str.replace('J', '', regex=False)
        .astype(float)
    ),
    # add a is_detected col: anything not reported as "N.D." counts as detected
    is_detected=raw_concentration != "N.D.",
    # state col
    state="North Dakota",
    # source type col
    source_type="groundwater",
    # abbreviate the one long analyte name, then upper-case all names
    analyte=(
        df1_long["analyte"]
        .str.replace("6:2 fluorotelomersulfonate", "6:2FTS", regex=False)
        .str.upper()
    ),
    # shift every sample_id down by one
    sample_id=df1_long["sample_id"] - 1,
)
df1_cleaned

Unnamed: 0,sample_id,analyte,units,sampling_location,concentration,is_detected,state,source_type
0,1,6:2FTS,ng/L,Landfill 3 MW 2,0.0,False,North Dakota,groundwater
1,1,6:2FTS,ng/L,Landfill 4 MW 1,0.0,False,North Dakota,groundwater
2,1,6:2FTS,ng/L,Landfill 5 MW 1,3.5,True,North Dakota,groundwater
3,1,6:2FTS,ng/L,Landfill 5 MW 2,0.0,False,North Dakota,groundwater
4,1,6:2FTS,ng/L,Landfill 7 MW 1,9.6,True,North Dakota,groundwater
...,...,...,...,...,...,...,...,...
127,11,PFPEA,ng/L,Landfill 14 MW 1,0.0,False,North Dakota,groundwater
128,11,PFPEA,ng/L,Landfill 16 MW 2,0.0,False,North Dakota,groundwater
129,11,PFPEA,ng/L,FTA 4,160.0,True,North Dakota,groundwater
130,11,PFPEA,ng/L,FTA 5,0.0,False,North Dakota,groundwater


In [41]:
# Load the second North Dakota table. As with the first file, the first data
# row carries the real column labels and is promoted to the header below.
df2 = pd.read_csv("north_dakota2.csv")

# set the columns: start from the labels stored in row 0 and override the
# four leading ones with fixed names
columns = np.array(df2.iloc[0])
columns[0] = "sample_id"
columns[1] = "analyte"
columns[2] = "health advisory limit"
columns[3] = "units"
df2.columns = columns

# drop the first two rows; the explicit .copy() makes df2 an independent frame
# so the edits below are not writes into a slice of the raw frame
# (which would trigger SettingWithCopyWarning)
df2 = df2.iloc[2:].copy()

# the advisory limit is not carried into the cleaned table
df2 = df2.drop(columns="health advisory limit")
# relabel the unit "ppt" as "ng/L"
df2["units"] = df2["units"].replace("ppt", "ng/L")

# wide -> long: every column other than the three id columns is treated as a
# sampling location and becomes one row per (df2 row, location)
df2_cleaned = pd.melt(df2, id_vars=["sample_id", "analyte", "units"],
                      var_name="sampling_location",
                      value_name="concentration")

# "ND" entries become 0.0 so the whole column can be converted to float
df2_cleaned["concentration"] = (
    df2_cleaned["concentration"].replace("ND", "0.0").astype(float)
)

# a sample counts as detected when its numeric concentration is above zero
# (vectorized comparison instead of a row-wise apply)
df2_cleaned["is_detected"] = df2_cleaned["concentration"] > 0.0
df2_cleaned["state"] = "North Dakota"
df2_cleaned["source_type"] = "groundwater"
df2_cleaned["analyte"] = df2_cleaned["analyte"].str.upper()
# offset the ids by 10 - presumably so they continue after the ids used in
# df1_cleaned (TODO confirm)
df2_cleaned["sample_id"] = df2_cleaned["sample_id"] + 10
df2_cleaned

Unnamed: 0,sample_id,analyte,units,sampling_location,concentration,is_detected,state,source_type
0,12,PFOS,ng/L,Aquifer 1A,5.9,True,North Dakota,groundwater
1,13,PFOA,ng/L,Aquifer 1A,0.0,False,North Dakota,groundwater
2,14,PFBA,ng/L,Aquifer 1A,0.0,False,North Dakota,groundwater
3,15,PFPEA,ng/L,Aquifer 1A,0.0,False,North Dakota,groundwater
4,16,PFBS,ng/L,Aquifer 1A,0.0,False,North Dakota,groundwater
...,...,...,...,...,...,...,...,...
95,17,PFHXA,ng/L,Aquifer 4D,0.0,False,North Dakota,groundwater
96,18,PFHPA,ng/L,Aquifer 4D,0.0,False,North Dakota,groundwater
97,19,PFHXS,ng/L,Aquifer 4D,0.0,False,North Dakota,groundwater
98,20,PFHXA,ng/L,Aquifer 4D,0.0,False,North Dakota,groundwater


In [45]:
# Stack the two cleaned tables. ignore_index=True gives the combined frame a
# fresh 0..n-1 index; without it both inputs keep their own 0-based labels, so
# the index contains duplicates and label lookups return more than one row.
north_dakota_cleaned = pd.concat([df1_cleaned, df2_cleaned], axis = 0, ignore_index = True)

# The index is still written as the unnamed first column of the CSV (same
# file layout as before), but it is now a unique row number.
north_dakota_cleaned.to_csv("north_dakota_cleaned_pdf.csv")

Questions
- Can we find a more descriptive name for each sampling location, along with its latitude and longitude?
- Can we find a sampling date?