In [1]:
import pandas as pd
import numpy as np
import camelot

# Lift pandas' display truncation: a limit of None means "show everything",
# so full frames are rendered for both rows and columns.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, None)

# Table 1

In [2]:
# Table 1 is entered by hand, one tuple per sample (i.e. per table row), so each
# record can be checked against the source table line by line.
table1_columns = [
    "Sampling Location", "Sample Identifier", "Sample Date", "Sample Depth (ft)",
    "Sample Type", "Water Type",
    "PFOS", "PFOA", "PFOS+PFOA", "PFBS", "PFHpA", "PFHxS", "PFNA",
]
table1_rows = [
    ("MW-BNA01-01", "MW-BNA01-01-01", "03/09/18", "15.5-25.5", "Reg", "Groundwater", 96, 13, 109, 27, 5.1, 130, "1.4U"),
    ("MW-BNA05-01", "MW-BNA05-01-01", "03/09/18", "2.8-7.8", "Reg", "Groundwater", 66, 12, 78, 5.1, 3.2, 75, "1.3J"),
    (None, "MW-BNA05-01-01D", "03/09/18", "2.8-7.8", "FD", "Groundwater", 61, 11, 72, 4.4, 3.6, 68, "1.4J"),
    ("BNA04-SW1", "BNA04-SW1-01", "03/09/18", None, "Reg", "Stormwater", 50, 7.3, 57.3, 11, 3.5, 59, "0.8J"),
    ("BNA05-SW1", "BNA05-SW1-01", "03/09/18", None, "Reg", "Stormwater", 250, 21, 271, 22, 8.4, 190, "1.3J"),
]

# Transpose the records back into the column -> list-of-values mapping.
table1_dict = {
    column: list(values)
    for column, values in zip(table1_columns, zip(*table1_rows))
}

table1 = pd.DataFrame(table1_dict)
table1.to_csv("tennessee1.csv")

# Table 2

In [2]:
# Run camelot over the PDF with pages="all"; the bare name on the last line
# displays the resulting table list so the number of detected tables is visible.
tables = camelot.read_pdf(
    "tennessee2.pdf",
    pages="all",
)
tables

<TableList n=4>

In [3]:
# Analyte names that can appear in the "Parameter" column. Any other value in
# that column is treated as a sample-medium heading for the rows beneath it.
ANALYTES = ["PFOS", "PFOA", "PFOS+PFOA", "PFBS"]


def _clean_extracted_table(raw, patches=None):
    """Tidy one camelot-extracted frame and promote its first row to the header.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame taken from a camelot table (``table.df``). It is not modified;
        all work happens on a copy.
    patches : dict[int, list] or None
        Optional manual corrections, keyed by positional row number. Each value
        is the list of six cell values written into columns 1-6 of that row.

    Returns
    -------
    pandas.DataFrame
        The frame with embedded newlines replaced by spaces, patches applied,
        row 0 used as the column labels, and row 0 dropped from the data.
    """
    # replace() returns a new frame, so camelot's own frame is never mutated
    cleaned = raw.replace("\n", " ", regex=True)
    for row, values in (patches or {}).items():
        cleaned.iloc[row, 1:7] = values
    cleaned.columns = cleaned.iloc[0]
    return cleaned.drop(0)


# clean table 1
# NOTE(review): the patched rows are presumably ones camelot extracted
# incorrectly; the values are typed in by hand - confirm against the PDF.
table1 = _clean_extracted_table(
    tables[0].df,
    patches={
        6: ["PFOS", "0.0071", "0.126", "mg/kg", "2/0", "No"],
        7: ["PFOA", "0.00108 J", "0.126", "mg/kg", "2/0", "No"],
        8: ["PFBS", "ND", "130", "mg/kg", "2/0", "No"],
    },
)

# clean table 2
table2 = _clean_extracted_table(tables[1].df)

# clean table 3
table3 = _clean_extracted_table(
    tables[2].df,
    patches={27: ["PFOS", "0.0483", "0.126", "mg/kg", "1/0", "No"]},
)

# clean table 4
table4 = _clean_extracted_table(tables[3].df)

# combine all the tables together
tennessee2 = pd.concat([table1, table2, table3, table4], ignore_index=True)

# Derive the sample medium for every row: blank out the analyte names so only
# the medium headings remain, then carry each heading down over the analyte
# rows that follow it. (An analyte row with no heading above it gets NaN.)
mediums = tennessee2["Parameter"].str.strip()
mediums = mediums.where(~mediums.isin(ANALYTES)).ffill()
tennessee2["Sample Medium"] = mediums

# The first column is only filled on the first row of each group; treat empty
# strings as missing and forward-fill so every row carries its value.
first_column = tennessee2.columns[0]
tennessee2[first_column] = (
    tennessee2[first_column].mask(tennessee2[first_column] == "").ffill()
)

# Rows with no units are heading/blank rows rather than measurements: drop them.
# .copy() makes the result an independent frame so later column assignments
# do not raise SettingWithCopyWarning.
tennessee2 = tennessee2[tennessee2["Units"] != ""].copy()


In [5]:
# Add a constant PWSID column to every row. assign() broadcasts the scalar and
# returns a new frame, so this is safe even when `tennessee2` is a filtered
# slice (no SettingWithCopyWarning) and re-running the cell gives the same result.
tennessee2 = tennessee2.assign(PWSID="TN0004408")
tennessee2

Unnamed: 0,AFFF Release Area,Parameter,Maximum Detected Concentration,Screening Value,Units,Number of Samples*/ Number of Exceedances,Exceeds Screening Level,Potentially Complete DW Exposure Pathway,Recommendation,Sample Medium,PWSID
1,AFFF Release Area 1 FTA No. 1,PFOS,0.118,0.126,mg/kg,2/0,No,Yes,NFRAP,Surface Soil,TN0004408
2,AFFF Release Area 1 FTA No. 1,PFOA,0.01,0.126,mg/kg,2/0,No,,,Surface Soil,TN0004408
3,AFFF Release Area 1 FTA No. 1,PFBS,ND,130.0,mg/kg,2/0,No,,,Surface Soil,TN0004408
5,AFFF Release Area 1 FTA No. 1,PFOS,0.0071,0.126,mg/kg,2/0,No,,NFRAP,Subsurface Soil,TN0004408
6,AFFF Release Area 1 FTA No. 1,PFOA,0.00108 J,0.126,mg/kg,2/0,No,,,Subsurface Soil,TN0004408
7,AFFF Release Area 1 FTA No. 1,PFBS,ND,130.0,mg/kg,2/0,No,,,Subsurface Soil,TN0004408
9,AFFF Release Area 1 FTA No. 1,PFOS,159,0.07,µg/L,2/2,Yes,,Follow‐on SI,Groundwater,TN0004408
10,AFFF Release Area 1 FTA No. 1,PFOA,16 J,0.07,µg/L,2/2,Yes,,,Groundwater,TN0004408
11,AFFF Release Area 1 FTA No. 1,PFOS+PFOA,175,0.07,µg/L,2/2,Yes,,,Groundwater,TN0004408
12,AFFF Release Area 1 FTA No. 1,PFBS,17.6 J,40.0,µg/L,2/0,No,,,Groundwater,TN0004408


In [6]:
# Write the combined frame to a CSV file at a relative path (index included,
# which is pandas' default).
tennessee2.to_csv(path_or_buf="tennessee2.csv")

# Table 3

In [5]:
# Table 3 is entered by hand, one tuple per sample (i.e. per table row). Values
# carrying a qualifier suffix (e.g. "J") are kept as strings; plain values stay numeric.
table3_columns = [
    "Sampling Location", "Sample Identifier", "Sample Date", "Screened Interval (ft BGS)",
    "Sample Type", "Water Type",
    "PFOS", "PFOA", "PFOS+PFOA", "PFBS", "PFHpA", "PFHxS", "PFNA",
]
table3_rows = [
    ("18-MW01", "18-MW01-PRL04-01", "10/09/18", None, "Reg", "Groundwater", "1900J", 120, "2020J", 19, 190, "440J", 15),
    ("MW7-14", "MW7-14-PRL05-01", "10/08/18", None, "Reg", "Groundwater", 160, 9, 169, 9.5, 5.9, 120, "1.2J"),
    ("MW8A-08", "MW8A-08-PRL06-01", "10/08/18", None, "Reg", "Groundwater", "550J", 130, "680J", 33, 71, "530J", 10),
    ("MW8A-08", "MW8A-08-PRL06-01D", "10/08/18", None, "FD", "Groundwater", "530J", 130, "660J", 32, 73, "500J", 11),
    ("MW-TYS06-01", "MW-TYS06-01-01", "10/09/18", "35-60", "Reg", "Groundwater", "4300J", "630J", "4930J", "190J", 210, "2000J", 25),
    ("MW8B-06", "MW8B-06-PRL08-01", "10/08/18", "31.2-46.2", "Reg", "Groundwater", "1100J", 140, "1240J", 48, 140, "790J", 12),
    ("MW6-04", "MW6-04-PRL12-01", "10/10/18", "55.8-65.7", "Reg", "Groundwater", "3300J", 73, "3373J", 42, 54, "440J", 5.5),
    ("TYS12-SW1", "TYS12-SW1-01", "02/27/18", None, "Reg", "Surfacewater", "760J", 68, "828J", 35, "60J", "780J", 11),
]

# Transpose the records back into the column -> list-of-values mapping.
table3_dict = {
    column: list(values)
    for column, values in zip(table3_columns, zip(*table3_rows))
}

table3 = pd.DataFrame(table3_dict)
table3.to_csv("tennessee3.csv")

# Table 4

In [6]:
# Table 4 is entered by hand, one tuple per sample (i.e. per table row). All
# concentrations carry a qualifier suffix here, so they are kept as strings.
table4_columns = [
    "Sample Location", "Sample Site", "Sample ID", "Sample Date", "Matrix",
    "PFOA", "PFOS", "PFBS",
]
table4_rows = [
    ("002G02DA", "SWMU 2", "002G02DA-040820", "04/08/20", "GW", "0.85J", "1.01J", "3.31J"),
    ("002G05DA", "SWMU 2", "002G05DA-040820", "04/08/20", "GW", "1.42J", "0.89U", "2.71J"),
    ("009TW01", "SWMU 9", "009TW01-040820", "04/08/20", "GW", "1.36U", "0.91J", "0.48J"),
    ("009TW02", "SWMU 9", "009TW02-040820", "04/08/20", "GW", "1.44U", "2.77J", "0.55J"),
    ("009TW02", "SWMU 9", "009TW02-040820-D", "04/08/20", "GW", "1.44U", "2.58J", "0.59J"),
]

# Transpose the records back into the column -> list-of-values mapping.
table4_dict = {
    column: list(values)
    for column, values in zip(table4_columns, zip(*table4_rows))
}

table4 = pd.DataFrame(table4_dict)
table4.to_csv("tennessee4.csv")