In [1]:
import pandas as pd
import numpy as np
import camelot

import warnings

# Suppress every warning for the remainder of the session.
warnings.simplefilter("ignore")

# Turn off pandas' display truncation so whole frames are rendered.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Table 1

In [2]:
# Parse every page of the first PDF; no flavor is given, so camelot's
# default parser is used.
tables = camelot.read_pdf(
    "new_mexico1.pdf",
    pages="all",
)

In [3]:
# Take the first detected table and flatten line breaks embedded in cells.
site_table = tables[0].df.replace("\n", " ", regex=True)

# Row 1 holds the real column headers; rows 0 and 1 are then discarded.
header_row = site_table.iloc[1]
new_mexico1 = site_table.drop(index=[0, 1]).reset_index(drop=True)
new_mexico1.columns = header_row

# Attach the site-level metadata to every row.
num_rows = new_mexico1.shape[0]
site_metadata = {
    "Sample Location": "North Freeman Avenue, Artesia, New Mexico",
    "Sample Date": "09/2022",
    "EPA ID": "NMN000605615",
}
for column_name, constant in site_metadata.items():
    new_mexico1[column_name] = [constant] * num_rows

new_mexico1.to_csv("new_mexico1.csv")

# Table 2 

In [4]:
# Parse every page of the second PDF with camelot's "stream" flavor.
tables2 = camelot.read_pdf(
    "new_mexico2.pdf",
    flavor="stream",
    pages="all",
)

In [5]:
def clean_tables(table):
    """Turn one raw 10-column results table into a tidy frame of numeric results.

    The input is left untouched: all work happens on a copy, so the same raw
    frame can be passed in again (e.g. when a notebook cell is re-run).

    Steps:
      1. Label the ten positional columns and discard the last ("drop") one.
      2. Add a "Sample ID" column. Rows whose "Method" cell contains
         "Lab Sample ID:" supply the ID (the text after the first colon,
         surrounding whitespace removed); it is carried forward to the rows
         below until the next such row.
      3. Convert "Result" to numbers and keep only rows where that succeeds.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame with exactly ten columns in the order Analyte, Result,
        Qualifier, RL, MDL, Units, Dil Fac, Method, Prep Type, <unused>.

    Returns
    -------
    pandas.DataFrame
        New frame with the nine named columns plus "Sample ID", a fresh
        0..n-1 index, and a numeric "Result" column without missing values.

    Raises
    ------
    ValueError
        If ``table`` does not have exactly ten columns.
    """
    column_names = ["Analyte", "Result", "Qualifier", "RL", "MDL", "Units",
                    "Dil Fac", "Method", "Prep Type", "drop"]
    if table.shape[1] != len(column_names):
        raise ValueError(
            f"clean_tables expects {len(column_names)} columns, got {table.shape[1]}"
        )

    # Copy before relabelling so the caller's frame is never modified.
    cleaned = table.copy()
    cleaned.columns = column_names
    cleaned = cleaned.drop(columns="drop")

    # Rows carrying "Lab Sample ID:" in the Method column define the sample ID
    # for themselves and for every following row up to the next such row.
    method_text = cleaned["Method"].astype(str)
    is_id_row = method_text.str.contains("Lab Sample ID:", regex=False)
    cleaned["Sample ID"] = (
        method_text.str.split(":").str[1].str.strip()
        .where(is_id_row)
        .ffill()
    )

    # Anything that is not a number (headers, blanks, ID rows) becomes NaN
    # and is dropped.
    cleaned["Result"] = pd.to_numeric(cleaned["Result"], errors="coerce")
    return cleaned.dropna(subset=["Result"]).reset_index(drop=True)

## Page 1

In [6]:
# Page 1: keep only the rows that carry a numeric result, tagged with their
# lab sample ID. A copy is cleaned so the DataFrame stored on the camelot
# table is never modified and this cell can be re-run safely.
table1 = clean_tables(tables2[0].df.copy())

# Blank qualifiers become None. The dict form is used on purpose:
# replace("", None) forward-fills instead of inserting None on older pandas
# releases (before 1.4).
table1["Qualifier"] = table1["Qualifier"].replace({"": None})

## Page 2

In [7]:
# Copy so the DataFrame stored on the camelot table is never modified;
# renaming it in place would shift the columns again on every re-run.
table2 = tables2[2].df.copy()

# Column 1 holds "result qualifier" fused into one cell (separated by a
# newline or a space): split it into two columns.
split_text = table2[1].str.replace("\n", " ", regex=False).str.split(" ")
result_col = split_text.str[0]
# A qualifier is only taken when the cell has exactly two tokens.
qualifier_col = split_text.apply(lambda parts: parts[1] if len(parts) == 2 else None)

# Shift columns 2..8 one place to the right to make room for the qualifier.
table2 = table2.rename(columns={i: i + 1 for i in range(2, 9)})
table2[1] = result_col
table2[2] = qualifier_col
table2 = table2[list(range(10))]

table2 = clean_tables(table2)
# When the Method cell spans several lines keep the second line,
# otherwise keep the only line.
table2["Method"] = table2["Method"].str.split("\n").apply(lambda x: x[0] if len(x) == 1 else x[1])

## Page 3

In [8]:
# Copy so the DataFrame stored on the camelot table is never modified;
# editing it in place makes this cell fail when it is re-run.
table3 = tables2[3].df.copy()

# For these rows, copy column 7 into column 8 before column 7 is discarded
# (presumably the sample-ID text was extracted one column to the left --
# verify against the PDF).
for row in (1, 22):
    table3.iloc[row, 8] = table3.iloc[row, 7]
table3 = table3.drop(columns=[7])

table3 = clean_tables(table3)

# Strip hyphens, then swap the two truncated long analyte names for their
# abbreviations.
table3["Analyte"] = table3["Analyte"].str.replace("-", "", regex=False)
table3["Analyte"] = table3["Analyte"].str.replace("Nmethylperfluorooctanesulfonamidoa", "NMeFOSAA", regex=False)
table3["Analyte"] = table3["Analyte"].str.replace("Nethylperfluorooctanesulfonamidoac", "NEtFOSAA", regex=False)

# Blank qualifiers become None. The dict form is used on purpose:
# replace("", None) forward-fills instead of inserting None on older pandas
# releases (before 1.4).
table3["Qualifier"] = table3["Qualifier"].replace({"": None})

## Page 4

In [9]:
# Copy so the DataFrame stored on the camelot table is never modified;
# editing it in place makes this cell fail when it is re-run.
table4 = tables2[4].df.copy()

# For these rows, copy column 7 into column 8 before column 7 is discarded
# (presumably the sample-ID text was extracted one column to the left --
# verify against the PDF).
for row in (1, 38, 40, 51):
    table4.iloc[row, 8] = table4.iloc[row, 7]
table4 = table4.drop(columns=[7])

table4 = clean_tables(table4)

# Strip hyphens, then swap the two truncated long analyte names for their
# abbreviations.
table4["Analyte"] = table4["Analyte"].str.replace("-", "", regex=False)
table4["Analyte"] = table4["Analyte"].str.replace("Nmethylperfluorooctanesulfonamidoa", "NMeFOSAA", regex=False)
table4["Analyte"] = table4["Analyte"].str.replace("Nethylperfluorooctanesulfonamidoac", "NEtFOSAA", regex=False)

# Add a placeholder row for the sample that has no numeric results.
# Appending at len(table4) always creates a new row; a fixed label could
# silently overwrite real data if the extracted row count changes.
table4.loc[len(table4)] = {'Analyte': 'No Detections', 'Result': None, 'Qualifier': None, 'RL': None, 'MDL': None, 'Units': None,
                           'Dil Fac': None, 'Method': None, 'Prep Type': None, 'Sample ID': '320-66162-13'}

## Page 5

In [10]:
# Copy so the DataFrame stored on the camelot table is never modified;
# renaming it in place would shift the columns again on every re-run.
table5 = tables2[5].df.copy()

# Column 1 holds "result qualifier" fused into one cell (separated by a
# newline or a space): split it into two columns.
split_text = table5[1].str.replace("\n", " ", regex=False).str.split(" ")
result_col = split_text.str[0]
# A qualifier is only taken when the cell has exactly two tokens.
qualifier_col = split_text.apply(lambda parts: parts[1] if len(parts) == 2 else None)

# Shift columns 2..8 one place to the right to make room for the qualifier.
table5 = table5.rename(columns={i: i + 1 for i in range(2, 9)})
table5[1] = result_col
table5[2] = qualifier_col
table5 = table5[list(range(10))]

table5 = clean_tables(table5)

## Page 6

In [11]:
# Last table that is used. Copy so the DataFrame stored on the camelot table
# is never modified; renaming it in place would shift the columns again on
# every re-run.
table6 = tables2[7].df.copy()

# Column 1 holds "result qualifier" fused into one cell (separated by a
# newline or a space): split it into two columns.
split_text = table6[1].str.replace("\n", " ", regex=False).str.split(" ")
result_col = split_text.str[0]
# A qualifier is only taken when the cell has exactly two tokens.
qualifier_col = split_text.apply(lambda parts: parts[1] if len(parts) == 2 else None)

# Shift columns 2..8 one place to the right to make room for the qualifier.
table6 = table6.rename(columns={i: i + 1 for i in range(2, 9)})
table6[1] = result_col
table6[2] = qualifier_col
table6 = table6[list(range(10))]

table6 = clean_tables(table6)

# Add a placeholder row for the sample that has no numeric results.
# Appending at len(table6) always creates a new row; a fixed label could
# silently overwrite real data if the extracted row count changes.
table6.loc[len(table6)] = {'Analyte': 'No Detections', 'Result': None, 'Qualifier': None, 'RL': None, 'MDL': None, 'Units': None,
                           'Dil Fac': None, 'Method': None, 'Prep Type': None, 'Sample ID': '320-66162-20'}

In [12]:
# Stack the six cleaned pages into one frame.
new_mexico2 = pd.concat([table1, table2, table3, table4, table5, table6], ignore_index=True)

# Order rows by the numeric suffix of the sample ID (third "-"-separated
# piece). A stable sort is requested so rows sharing a sample ID keep the
# order in which they were extracted; the default quicksort does not
# guarantee that.
new_mexico2["sorting_index"] = new_mexico2["Sample ID"].str.split("-").apply(lambda x: int(x[2]))
new_mexico2 = (
    new_mexico2
    .sort_values(by="sorting_index", ascending=True, kind="mergesort")
    .reset_index(drop=True)
    .drop(columns="sorting_index")
)
new_mexico2.to_csv("new_mexico2.csv")