In [1]:
import camelot
import pandas as pd
import numpy as np

# Show every row and every column when a DataFrame is rendered (None = no limit).
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [2]:
# Extract the tables found on the listed pages of the PDF; the result is
# indexed below as tables[0] .. tables[4].
tables = camelot.read_pdf(
    "alaska.pdf",
    pages="1,2,3,4,5",
)

In [3]:
def clean_table(df):
    '''
    Flatten the multi-line cells of an extracted table into one value per cell.

    Row 0 of df is skipped (presumably the header row of the extraction --
    confirm against the source PDF). For every other row, each cell is split
    on newline characters and the pieces of all cells are concatenated, in
    order, into one flat row.

    df: DataFrame whose cells are all strings (each cell must support .split)
    returns: a new DataFrame with one row per processed input row and a fresh
             0..n-1 index; rows that produce fewer pieces than the longest row
             are padded on the right with missing values
    '''
    cleaned_rows = dict()
    # Rows are addressed by position (iloc) rather than by label, so frames
    # that do not carry a default 0..n-1 index are handled as well.
    for position in range(1, df.shape[0]):
        pieces = [
            piece
            for cell in df.iloc[position]
            for piece in cell.split("\n")
        ]
        cleaned_rows[position] = np.array(pieces)
    return pd.DataFrame.from_dict(cleaned_rows, orient = 'index').reset_index(drop = True)

def swap_values(df, coord1, coord2):
    '''
    Exchange, in place, the contents of df.iloc[coord1] and df.iloc[coord2].

    df: the DataFrame to modify
    coord1, coord2: positional keys passed straight to df.iloc (the notebook
                    passes (row, column) tuples)
    returns: None
    '''
    held = df.iloc[coord2]
    # The right-hand side is read before either target is written, so coord2
    # receives the old coord1 value and coord1 receives the saved coord2 value.
    df.iloc[coord2], df.iloc[coord1] = df.iloc[coord1], held
    
def impute_shift(df, coord, replacement_value):
    '''
    Insert a value into one row of df (in place), pushing later values right.

    The None entries of the row are dropped, replacement_value is inserted at
    position coord[1] of what remains, and the row is written back. If the
    result is shorter than the frame is wide, it is padded on the right with
    None so the row keeps its width.

    coord: the coordinate to start shifting values, as (row position, column position)
    replacement_value: the value to impute into coord once pushed
    returns: None
    raises: ValueError if, after the insertion, the row holds more values than
            df has columns (df is left unchanged in that case)
    '''
    row_idx, col_idx = coord[0], coord[1]
    n_columns = df.shape[1]

    # grab the whole row and convert to an array (a copy, so df is untouched
    # until the final assignment)
    row = np.array(df.iloc[row_idx].values)

    # remove any instances of None; "!=" is deliberate here because it is
    # evaluated element-wise on the array
    row = row[row != None]

    # impute the replacement value into the column-th index of the row
    row = np.insert(row, col_idx, replacement_value)

    if len(row) > n_columns:
        raise ValueError(
            "row %r holds %d values after insertion but the frame has only %d columns"
            % (row_idx, len(row), n_columns)
        )
    if len(row) < n_columns:
        # more than one None was removed: restore the width with trailing None
        padding = np.full(n_columns - len(row), None, dtype=object)
        row = np.concatenate([row, padding])

    # re-insert that updated row back into the dataframe
    df.iloc[row_idx] = row

# Table 1

In [4]:
# Flatten the first extracted table, then patch individual cells by
# (row, column) position. Presumably these correct cells the extraction
# misplaced -- confirm the coordinates against the PDF.
table1 = clean_table(tables[0].df)
swap_values(table1, coord1=(1, 1), coord2=(1, 2))
swap_values(table1, coord1=(17, 0), coord2=(17, 1))
impute_shift(table1, coord=(16, 1), replacement_value="70†")

In [5]:
# Replace table1's column labels. The list must contain exactly one label per
# column of table1: three descriptive columns followed by the sample columns.
col_names = [
    "Analyte",
    "EPA LHA",
    "Units",
    "MW-1-15, 10/26/2021",
    "MW-1-40, 10/26/2021",
    "MW-2-20, 10/26/2021",
    "MW-2-20, Duplicate",
    "MW-2-30, 10/26/2021",
    "MW-3-15, 10/26/2021",
    "MW-3-40, 10/26/2021",
    "MW-4-20, 10/25/2021",
    "MW-5-20, 10/25/2021",
    "MW-6-20, 10/26/2021",
    "MW-7-20, 10/25/2021",
    "MW-8-20, 10/25/2021",
    "MW-9-30, 10/25/2021",
    "MW-9-30, Duplicate",
    "MW-10-20, 10/25/2021",
    "MW-11-15, 10/31/2021",
    "MW-12-10, 10/31/2021",
    "MW-12-10, Duplicate",
]
table1.columns = col_names

# Table 2

In [6]:
# Flatten the second extracted table, then patch individual cells by
# (row, column) position. Presumably these correct cells the extraction
# misplaced -- confirm the coordinates against the PDF.
table2 = clean_table(tables[1].df)
swap_values(table2, coord1=(1, 1), coord2=(1, 2))
swap_values(table2, coord1=(17, 0), coord2=(17, 1))
impute_shift(table2, coord=(16, 1), replacement_value="70†")

In [7]:
# Replace table2's column labels. The list must contain exactly one label per
# column of table2: three descriptive columns followed by the sample columns.
# NOTE(review): "MW-14-45" follows "MW-15-15" and shares its 11/3/2021 date,
# while the other MW-14 samples are dated 11/1/2021 -- possibly meant to be
# "MW-15-45"; verify against the PDF before changing.
col_names = [
    "Analyte",
    "EPA LHA",
    "Units",
    "MW-9-10, 10/25/2021",
    "MW-13-20, 10/27/2021",
    "MW-13-45, 10/27/2021",
    "MW-13-45, Duplicate",
    "MW-14-15, 11/1/2021",
    "MW-14-31, 11/1/2021",
    "MW-15-15, 11/3/2021",
    "MW-14-45, 11/3/2021",
    "MW-14-45, Duplicate",
    "MW-16-15, 11/2/2021",
    "MW-17-20, 10/26/2021",
    "MW-17-40, 10/26/2021",
    "MW-17-40, Duplicate",
    "MW-18-15, 11/4/2021",
    "MW-18-50, 11/4/2021",
    "MW-18-50, Duplicate",
    "MW-19-15, 11/5/2021",
]
table2.columns = col_names

# Table 3

In [8]:
# Flatten the third extracted table, then patch individual cells by
# (row, column) position. Presumably these correct cells the extraction
# misplaced -- confirm the coordinates against the PDF.
table3 = clean_table(tables[2].df)
swap_values(table3, coord1=(1, 1), coord2=(1, 2))
swap_values(table3, coord1=(17, 0), coord2=(17, 1))
impute_shift(table3, coord=(16, 1), replacement_value="70†")

In [9]:
# Replace table3's column labels. The list must contain exactly one label per
# column of table3: three descriptive columns followed by the sample columns.
col_names = [
    "Analyte",
    "EPA LHA",
    "Units",
    "MW-19-50, 11/5/2021",
    "MW-19-50, Duplicate",
    "MW-20-15, 11/4/2021",
    "MW-20-40, 11/4/2021",
    "MW-21-15, 11/1/2021",
    "MW-21-45, 11/1/2021",
    "MW-21-45, Duplicate",
    "MW-22-15, 10/30/2021",
    "MW-22-40, 10/30/2021",
    "MW-23-20, 10/24/2021",
    "MW-23-50, 10/25/2021",
    "MW-23-50, Duplicate",
    "MW-24-10, 10/29/2021",
    "MW-24-30, 10/29/2021",
    "MW-25-15, 10/28/2021",
    "MW-25-47, 10/29/2021",
    "MW-25-47, Duplicate",
    "GAC 2021, 11/5/2021",
]
table3.columns = col_names

# Table 4

In [10]:
# Flatten the fourth extracted table, then patch individual cells by
# (row, column) position. Unlike tables 1-3, no swap is applied on row 1.
table4 = clean_table(tables[3].df)
swap_values(table4, coord1=(17, 0), coord2=(17, 1))
impute_shift(table4, coord=(16, 1), replacement_value="70†")

In [11]:
# Replace table4's column labels. The list must contain exactly one label per
# column of table4: three descriptive columns followed by the sample columns.
col_names = [
    "Analyte",
    "EPA LHA",
    "Units",
    "21GST-TWP-1, 10/27/2021",
    "21GST-TWP-2, 10/27/2021",
    "21GST-TWP-3, 10/28/2021",
    "21GST-TWP-3, Duplicate",
    "21GST-TWP-4, 10/28/2021",
    "21GST-TWP-5, 10/28/2021",
    "21GST-TWP-6, 10/30/2021",
    "21GST-TWP-7, 10/30/2021",
    "21GST-TWP-8, 10/28/2021",
    "21GST-TWP-9, 10/30/2021",
    "21GST-TWP-10, 10/27/2021",
]
table4.columns = col_names

# Table 5

In [12]:
# Flatten the fifth extracted table, then patch individual cells by
# (row, column) position. Unlike tables 1-3, no swap is applied on row 1.
table5 = clean_table(tables[4].df)
swap_values(table5, coord1=(17, 0), coord2=(17, 1))
impute_shift(table5, coord=(16, 1), replacement_value="70†")

In [13]:
# Replace table5's column labels. The list must contain exactly one label per
# column of table5: three descriptive columns followed by the sample columns.
col_names = [
    "Analyte",
    "EPA LHA",
    "Units",
    "21GST-TWP-11, 10/30/2021",
    "21GST-TWP-11, Duplicate",
    "21GST-TWP-12, 10/30/2021",
    "21GST-TWP-13, 10/24/2021",
    "21GST-TWP-14, 10/24/2021",
    "21GST-TWP-14, Duplicate",
    "21GST-TWP-15, 10/27/2021",
    "21GST-TWP-15, Duplicate",
    "PW-016, 10/26/2021",
]
table5.columns = col_names

In [14]:
# Index every table by analyte so the sample columns line up row-wise.
# A table whose index is already "Analyte" is left as it is, which makes this
# cell safe to re-run: calling set_index("Analyte") a second time would raise
# KeyError because the column has already been moved into the index.
table1, table2, table3, table4, table5 = [
    page_table
    if page_table.index.name == "Analyte"
    else page_table.set_index("Analyte")
    for page_table in (table1, table2, table3, table4, table5)
]

# Join the tables side by side on the shared Analyte index.
# Every table carries its own "EPA LHA" and "Units" columns, so these two
# labels appear five times in the combined frame and in the exported CSV.
df = pd.concat([table1, table2, table3, table4, table5], axis = 1)

df.to_csv("alaska.csv")

In [15]:
# Render the combined table as the cell output. display.max_rows and
# display.max_columns were set to None in the first cell, so nothing is truncated.
df

Unnamed: 0_level_0,EPA LHA,Units,"MW-1-15, 10/26/2021","MW-1-40, 10/26/2021","MW-2-20, 10/26/2021","MW-2-20, Duplicate","MW-2-30, 10/26/2021","MW-3-15, 10/26/2021","MW-3-40, 10/26/2021","MW-4-20, 10/25/2021","MW-5-20, 10/25/2021","MW-6-20, 10/26/2021","MW-7-20, 10/25/2021","MW-8-20, 10/25/2021","MW-9-30, 10/25/2021","MW-9-30, Duplicate","MW-10-20, 10/25/2021","MW-11-15, 10/31/2021","MW-12-10, 10/31/2021","MW-12-10, Duplicate",EPA LHA,Units,"MW-9-10, 10/25/2021","MW-13-20, 10/27/2021","MW-13-45, 10/27/2021","MW-13-45, Duplicate","MW-14-15, 11/1/2021","MW-14-31, 11/1/2021","MW-15-15, 11/3/2021","MW-14-45, 11/3/2021","MW-14-45, Duplicate","MW-16-15, 11/2/2021","MW-17-20, 10/26/2021","MW-17-40, 10/26/2021","MW-17-40, Duplicate","MW-18-15, 11/4/2021","MW-18-50, 11/4/2021","MW-18-50, Duplicate","MW-19-15, 11/5/2021",EPA LHA,Units,"MW-19-50, 11/5/2021","MW-19-50, Duplicate","MW-20-15, 11/4/2021","MW-20-40, 11/4/2021","MW-21-15, 11/1/2021","MW-21-45, 11/1/2021","MW-21-45, Duplicate","MW-22-15, 10/30/2021","MW-22-40, 10/30/2021","MW-23-20, 10/24/2021","MW-23-50, 10/25/2021","MW-23-50, Duplicate","MW-24-10, 10/29/2021","MW-24-30, 10/29/2021","MW-25-15, 10/28/2021","MW-25-47, 10/29/2021","MW-25-47, Duplicate","GAC 2021, 11/5/2021",EPA LHA,Units,"21GST-TWP-1, 10/27/2021","21GST-TWP-2, 10/27/2021","21GST-TWP-3, 10/28/2021","21GST-TWP-3, Duplicate","21GST-TWP-4, 10/28/2021","21GST-TWP-5, 10/28/2021","21GST-TWP-6, 10/30/2021","21GST-TWP-7, 10/30/2021","21GST-TWP-8, 10/28/2021","21GST-TWP-9, 10/30/2021","21GST-TWP-10, 10/27/2021",EPA LHA,Units,"21GST-TWP-11, 10/30/2021","21GST-TWP-11, Duplicate","21GST-TWP-12, 10/30/2021","21GST-TWP-13, 10/24/2021","21GST-TWP-14, 10/24/2021","21GST-TWP-14, Duplicate","21GST-TWP-15, 10/27/2021","21GST-TWP-15, Duplicate","PW-016, 10/26/2021"
Analyte,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1
Perfluorohexanesulfonic acid (PFHxS),-,ng/L,0.76 J,<1.8,39,40,<1.8,5.8,12,0.55 J,0.88 J,1.1 J,0.67 J,<1.8,9.9,10,8.4,60,11,10,-,ng/L,<2.0,7.6,<1.7,<1.8,1.8,6.2,10,<1.7,<1.7,14,16,<1.9,<1.9,21,1.3 J,1.2 J,0.84 J,-,ng/L,1.8,1.8,5.5,<1.7,6.1,<1.8,<1.8,4.5,27,1.0 J,<1.9 J*,<1.9 J*,0.54 J,<1.7,0.56 J,<1.8,<1.8,<1.7,-,ng/L,<1.8,12,<1.8,<1.8,100,53,8.4,1.0 J,6.9,22,54,-,ng/L,6.4,5.9,0.57 J,14,3.9,3.8,11,11,1.5 J
Perfluorohexanoic acid (PFHxA),-,ng/L,<1.8,<1.8,90,93,0.54 J*,0.61 J,1.8 J,<1.8,<1.8,<1.8,1.8 J,<1.8,7.5,7.7,6.4,16,2.9,2.4,-,ng/L,<2.0,4.2,<1.7,<1.8,1.0 J,8.6,2.6,<1.7,<1.7,56,11,<1.9 J*,<1.9,<1.8,<1.8,<1.8,<1.8,-,ng/L,1.5 J,1.8,1.5 J,<1.7,3.9,<1.8,<1.8,3.0,6.8,1.4 J,<1.9 J*,<1.9 J*,<1.7,<1.7,<1.8,<1.8,<1.8,<1.7,-,ng/L,<1.8,7.7,<1.8,<1.8,45,26,1.0 J,1.1 J,8.6,9.9,12,-,ng/L,1.1 J,1.4 J,<1.7,11,3.1,2.9,6.3,6.8,3.8
Perfluoroheptanoic acid (PFHpA),-,ng/L,<1.8,<1.8,44,49,<1.8,<1.9,<1.9,<1.8,<1.8,<1.8,0.61 J,<1.8,2.9,2.9,2.9,10,4.3,4.4,-,ng/L,<2.0,1.4 J,<1.7,<1.8,1.1 J,2.3,<1.7,<1.7,<1.7,25,1.8 J,<1.9 J*,<1.9,<1.8,<1.8,<1.8,<1.8,-,ng/L,<1.8,<1.8,<1.7,<1.7,1.9,<1.8,<1.8,1.1 J,1.2 J,<1.9,<1.9 J*,<1.9 J*,<1.7,<1.7,<1.8,<1.8,<1.8,<1.7,-,ng/L,<1.8,1.8,<1.8,<1.8,17,16,0.61 J,1.2 J,8.4,2.2,4.3,-,ng/L,1.1 J,1.1 J,<1.7,5.0,1.1 J,<2.0,3.0,3.1,1.9 J*
Perfluorononanoic acid (PFNA),-,ng/L,<1.8,<1.8,6.5,7.0,<1.8,<1.9,<1.9,<1.8,<1.8,<1.8,<1.9,<1.8,<1.9,<1.8,<1.8,1.3 J,0.91 J*,0.58 J*,-,ng/L,<2.0,<1.7,<1.7,<1.8,<1.8,0.25 J,<1.7,<1.7,<1.7,4.0,<2.0,<1.9 J*,<1.9,<1.8,<1.8,<1.8,<1.8,-,ng/L,<1.8,<1.8,<1.7,<1.7,<1.8,<1.8,<1.8,<1.8,<1.8,0.65 J,<1.9 J*,<1.9 J*,<1.7,<1.7,<1.8,<1.8,<1.8,<1.7,-,ng/L,<1.8,<1.7,<1.8,<1.8,1.5 J,2.4,<1.7,0.52 J,<1.8,<1.7,<1.8,-,ng/L,<1.7,0.29 J,<1.7,<1.9,<2.0,<2.0 J*,<1.7,0.30 J,<1.9
Perfluorobutanesulfonic acid (PFBS),-,ng/L,<1.8,<1.8,2.7,2.6,1.1 J,0.45 J*,1.0 J,<1.8,0.41 J,<1.8,0.21 J,<1.8,0.78 J,0.65 J,0.38 J,4.7,0.23 J,0.35 J*,-,ng/L,<2.0,0.70 J,<1.7,<1.8,0.24 J,0.74 J,<1.7,<1.7,<1.7,<1.7,0.98 J,<1.9,<1.9,<1.8,<1.8,<1.8,<1.8,-,ng/L,<1.8,<1.8,<1.7,<1.7,0.72 J,<1.8,<1.8,0.39 J*,4.0,<1.9,<1.9 J*,<1.9 J*,<1.7,<1.7,<1.8,<1.8,<1.8,<1.7,-,ng/L,<1.8,2.7,<1.8,<1.8,10,1.6 J,0.50 J,<1.7,<1.8,0.98 J,2.6,-,ng/L,0.26 J,0.21 J,<1.7,0.61 J,<2.0,<2.0,0.53 J,0.51 J,<1.9
Perfluorodecanoic acid (PFDA),-,ng/L,<1.8,<1.8,<1.8,0.72 J,<1.8,<1.9,<1.9,<1.8,<1.8,<1.8,<1.9,<1.8,<1.9,<1.8,<1.8,<1.8,<1.7,<1.7,-,ng/L,<2.0,<1.7,<1.7,<1.8,<1.8,<1.7,<1.7,<1.7,<1.7,13,<2.0,<1.9,<1.9,<1.8,<1.8,<1.8,<1.8,-,ng/L,<1.8,<1.8,<1.7,<1.7,<1.8,<1.8,<1.8,<1.8,<1.8,1.2 J,<1.9 J*,<1.9 J*,<1.7,<1.7,<1.8,<1.8,<1.8,<1.7,-,ng/L,<1.8,<1.7,<1.8,<1.8,<1.7,2.9,<1.7,<1.7,<1.8,<1.7,<1.8,-,ng/L,<1.7,<1.8,<1.7,<1.9,<2.0,<2.0,<1.7,<1.8,<1.9
Perfluoroundecanoic acid (PFUnA),-,ng/L,<1.8,<1.8,<1.8,<1.8,<1.8,<1.9,<1.9,<1.8,<1.8,<1.8,<1.9,<1.8,<1.9,<1.8,<1.8,<1.8,<1.7,<1.7,-,ng/L,<2.0,<1.7,<1.7,<1.8,<1.8,<1.7,<1.7,<1.7,<1.7,<1.7,<2.0,<1.9,<1.9,<1.8,<1.8,<1.8,<1.8,-,ng/L,<1.8,<1.8,<1.7,<1.7,<1.8,<1.8,<1.8,<1.8,<1.8,<1.9,<1.9 J*,<1.9 J*,<1.7,<1.7,<1.8,<1.8,<1.8,<1.7,-,ng/L,<1.8,<1.7,<1.8,<1.8,<1.7,<1.7,<1.7,<1.7,<1.8,<1.7,<1.8,-,ng/L,<1.7,<1.8,<1.7,<1.9,<2.0,<2.0,<1.7,<1.8,<1.9
Perfluorododecanoic acid (PFDoA),-,ng/L,<1.8,<1.8,<1.8,<1.8,<1.8,<1.9,<1.9,<1.8,<1.8,<1.8,<1.9,<1.8,<1.9,<1.8,<1.8,0.72 J,<1.7,<1.7,-,ng/L,<2.0,<1.7,<1.7,<1.8,<1.8,<1.7,<1.7,<1.7,<1.7,<1.7,<2.0,<1.9,<1.9,<1.8,<1.8,<1.8,<1.8,-,ng/L,<1.8,<1.8,<1.7,<1.7,<1.8,<1.8,<1.8,<1.8,<1.8,<1.9,<1.9 J*,<1.9 J*,<1.7,<1.7,<1.8,<1.8,<1.8,<1.7,-,ng/L,<1.8,<1.7,<1.8,<1.8,<1.7,<1.7,<1.7,<1.7,<1.8,<1.7,<1.8,-,ng/L,<1.7,<1.8,<1.7,<1.9,<2.0,<2.0,<1.7,<1.8,<1.9
Perfluorotridecanoic acid (PFTrDA),-,ng/L,<1.8,<1.8,<1.8,<1.8,<1.8,<1.9,<1.9,<1.8,<1.8,<1.8,<1.9,<1.8,<1.9,<1.8,<1.8,<1.8,<1.7,<1.7,-,ng/L,<2.0,<1.7,<1.7,<1.8,<1.8,<1.7,<1.7,<1.7,<1.7,<1.7,<2.0,<1.9,<1.9,<1.8,<1.8,<1.8,<1.8,-,ng/L,<1.8,<1.8,<1.7,<1.7,<1.8,<1.8,<1.8,<1.8,<1.8,<1.9,<1.9 J*,<1.9 J*,<1.7,<1.7,<1.8,<1.8,<1.8,<1.7,-,ng/L,<1.8,<1.7,<1.8,<1.8,<1.7,<1.7,<1.7,<1.7,<1.8,<1.7,<1.8,-,ng/L,<1.7,<1.8,<1.7,<1.9,<2.0,<2.0,<1.7,<1.8,<1.9
Perfluorotetradecanoic acid (PFTeA),-,ng/L,<1.8,<1.8,<1.8,<1.8,<1.8,<1.9,<1.9,<1.8,<1.8,<1.8,<1.9,<1.8,<1.9,<1.8,<1.8,<1.8,<1.7,<1.7,-,ng/L,<2.0,<1.7,<1.7,<1.8,<1.8,<1.7,<1.7,<1.7,<1.7,<1.7,<2.0,<1.9,<1.9,<1.8,<1.8,<1.8,<1.8,-,ng/L,<1.8,<1.8,<1.7,<1.7,<1.8,<1.8,<1.8,<1.8,<1.8,<1.9,<1.9 J*,<1.9 J*,<1.7,<1.7,<1.8,<1.8,<1.8,<1.7,-,ng/L,<1.8,<1.7,<1.8,<1.8,<1.7,<1.7,<1.7,<1.7,<1.8,<1.7,<1.8,-,ng/L,<1.7,<1.8,<1.7,<1.9,<2.0,<2.0,<1.7,<1.8,<1.9
