In [1]:
import camelot
import pandas as pd
import numpy as np

# Lift pandas' display limits so frames are rendered without truncation.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [2]:
# Run camelot's table extraction over the listed pages of the PDF.
tables = camelot.read_pdf(
    "alaska.pdf",
    pages="1,2,3,4,5",
)

In [3]:
def clean_table(df):
    '''
    Split the multi-line cells of a table into separate cells.

    Every row except the first is processed: each cell is split on newline
    characters and the pieces are laid out left to right, so a row can come
    out wider than it was in df. Rows are picked by position, so df may carry
    any index.

    df: DataFrame whose cells are all strings (each cell must support .split)
    returns: a new DataFrame with a fresh 0..n-1 index and integer column
        labels; rows with fewer pieces are padded on the right with missing
        values
    '''
    cleaned_rows = dict()
    # position 0 is skipped (presumably the header row of the raw table)
    for position in range(1, df.shape[0]):
        # positional lookup: works no matter how df is indexed
        cells = df.iloc[position]
        pieces = [piece for cell in cells for piece in cell.split("\n")]
        cleaned_rows[position] = np.array(pieces)
    return pd.DataFrame.from_dict(cleaned_rows, orient='index').reset_index(drop=True)

def swap_values(df, coord1, coord2):
    '''
    Exchange, in place, the contents of df at coord1 and coord2.

    coord1, coord2: positional indexers handed straight to df.iloc,
        e.g. (row, column) tuples
    returns: None
    '''
    first, second = df.iloc[coord1], df.iloc[coord2]
    df.iloc[coord2] = first
    df.iloc[coord1] = second
    
def impute_shift(df, coord, replacement_value):
    '''
    Insert replacement_value into one row of df, pushing later cells right.

    The row's missing cells (None/NaN) are dropped, replacement_value is
    inserted at the column position, and the row is padded back to the
    frame's width with None before it is written back in place.

    coord: (row, column) positions; the column position is applied to the
        row after its missing cells have been removed
    replacement_value: the value to impute into coord once pushed
    returns: None
    raises: ValueError if the row has no missing cell to absorb the shift
    '''
    row_idx, col_idx = coord[0], coord[1]
    width = df.shape[1]

    # grab the whole row and convert to an array
    row = np.array(df.iloc[row_idx].values, dtype=object)

    # keep only the cells that actually hold a value
    present = row[~pd.isna(row)]

    # impute the replacement value into the column-th index of the row
    shifted = np.insert(present, col_idx, replacement_value)

    if len(shifted) > width:
        raise ValueError(
            "row {} has no missing cell to absorb the shift "
            "({} values for {} columns)".format(row_idx, len(shifted), width)
        )
    if len(shifted) < width:
        # several cells were missing: restore the padding on the right
        padding = np.full(width - len(shifted), None, dtype=object)
        shifted = np.concatenate([shifted, padding])

    # re-insert that updated row back into the dataframe
    df.iloc[row_idx] = shifted

# Table 1

In [4]:
# Clean the first extracted table, then apply hand-picked cell fixes.
# Coordinates are (row, column) positions in the cleaned frame; columns 0-2
# are named "Analyte", "EPA LHA" and "Units" in the next cell.
# NOTE(review): presumably these correct cells the PDF extraction misplaced -
# verify against alaska.pdf.
table1 = clean_table(tables[0].df)
swap_values(df=table1, coord1=(1, 1), coord2=(1, 2))
swap_values(df=table1, coord1=(17, 0), coord2=(17, 1))
impute_shift(df=table1, coord=(16, 1), replacement_value="70†")

In [5]:
# Replace the positional column labels with the descriptor columns followed
# by one "<sample id>, <date or Duplicate>" label per sample column.
col_names = ["Analyte", "EPA LHA", "Units"] + [
    "MW-1-15, 10/26/2021", "MW-1-40, 10/26/2021", "MW-2-20, 10/26/2021",
    "MW-2-20, Duplicate", "MW-2-30, 10/26/2021", "MW-3-15, 10/26/2021",
    "MW-3-40, 10/26/2021", "MW-4-20, 10/25/2021", "MW-5-20, 10/25/2021",
    "MW-6-20, 10/26/2021", "MW-7-20, 10/25/2021", "MW-8-20, 10/25/2021",
    "MW-9-30, 10/25/2021", "MW-9-30, Duplicate", "MW-10-20, 10/25/2021",
    "MW-11-15, 10/31/2021", "MW-12-10, 10/31/2021", "MW-12-10, Duplicate",
]
table1.columns = col_names

# Table 2

In [6]:
# Clean the second extracted table, then apply hand-picked cell fixes.
# Coordinates are (row, column) positions in the cleaned frame; columns 0-2
# are named "Analyte", "EPA LHA" and "Units" in the next cell.
# NOTE(review): presumably these correct cells the PDF extraction misplaced -
# verify against alaska.pdf.
table2 = clean_table(tables[1].df)
swap_values(df=table2, coord1=(1, 1), coord2=(1, 2))
swap_values(df=table2, coord1=(17, 0), coord2=(17, 1))
impute_shift(df=table2, coord=(16, 1), replacement_value="70†")

In [7]:
# Replace the positional column labels with the descriptor columns followed
# by one "<sample id>, <date or Duplicate>" label per sample column.
# NOTE(review): the two "MW-14-45" labels follow "MW-15-15" and share its
# 11/3/2021 date - possibly meant to be MW-15-45; confirm against the PDF.
col_names = ["Analyte", "EPA LHA", "Units"] + [
    "MW-9-10, 10/25/2021", "MW-13-20, 10/27/2021", "MW-13-45, 10/27/2021",
    "MW-13-45, Duplicate", "MW-14-15, 11/1/2021", "MW-14-31, 11/1/2021",
    "MW-15-15, 11/3/2021", "MW-14-45, 11/3/2021", "MW-14-45, Duplicate",
    "MW-16-15, 11/2/2021", "MW-17-20, 10/26/2021", "MW-17-40, 10/26/2021",
    "MW-17-40, Duplicate", "MW-18-15, 11/4/2021", "MW-18-50, 11/4/2021",
    "MW-18-50, Duplicate", "MW-19-15, 11/5/2021",
]
table2.columns = col_names

# Table 3

In [8]:
# Clean the third extracted table, then apply hand-picked cell fixes.
# Coordinates are (row, column) positions in the cleaned frame; columns 0-2
# are named "Analyte", "EPA LHA" and "Units" in the next cell.
# NOTE(review): presumably these correct cells the PDF extraction misplaced -
# verify against alaska.pdf.
table3 = clean_table(tables[2].df)
swap_values(df=table3, coord1=(1, 1), coord2=(1, 2))
swap_values(df=table3, coord1=(17, 0), coord2=(17, 1))
impute_shift(df=table3, coord=(16, 1), replacement_value="70†")

In [9]:
# Replace the positional column labels with the descriptor columns followed
# by one "<sample id>, <date or Duplicate>" label per sample column.
col_names = ["Analyte", "EPA LHA", "Units"] + [
    "MW-19-50, 11/5/2021", "MW-19-50, Duplicate", "MW-20-15, 11/4/2021",
    "MW-20-40, 11/4/2021", "MW-21-15, 11/1/2021", "MW-21-45, 11/1/2021",
    "MW-21-45, Duplicate", "MW-22-15, 10/30/2021", "MW-22-40, 10/30/2021",
    "MW-23-20, 10/24/2021", "MW-23-50, 10/25/2021", "MW-23-50, Duplicate",
    "MW-24-10, 10/29/2021", "MW-24-30, 10/29/2021", "MW-25-15, 10/28/2021",
    "MW-25-47, 10/29/2021", "MW-25-47, Duplicate", "GAC 2021, 11/5/2021",
]
table3.columns = col_names

# Table 4

In [10]:
# Clean the fourth extracted table, then apply hand-picked cell fixes.
# Coordinates are (row, column) positions in the cleaned frame; columns 0-2
# are named "Analyte", "EPA LHA" and "Units" in the next cell.
# NOTE(review): presumably these correct cells the PDF extraction misplaced -
# verify against alaska.pdf.
table4 = clean_table(tables[3].df)
swap_values(df=table4, coord1=(17, 0), coord2=(17, 1))
impute_shift(df=table4, coord=(16, 1), replacement_value="70†")

In [11]:
# Replace the positional column labels with the descriptor columns followed
# by one "<sample id>, <date or Duplicate>" label per sample column.
col_names = ["Analyte", "EPA LHA", "Units"] + [
    "21GST-TWP-1, 10/27/2021", "21GST-TWP-2, 10/27/2021",
    "21GST-TWP-3, 10/28/2021", "21GST-TWP-3, Duplicate",
    "21GST-TWP-4, 10/28/2021", "21GST-TWP-5, 10/28/2021",
    "21GST-TWP-6, 10/30/2021", "21GST-TWP-7, 10/30/2021",
    "21GST-TWP-8, 10/28/2021", "21GST-TWP-9, 10/30/2021",
    "21GST-TWP-10, 10/27/2021",
]
table4.columns = col_names

# Table 5

In [12]:
# Clean the fifth extracted table, then apply hand-picked cell fixes.
# Coordinates are (row, column) positions in the cleaned frame; columns 0-2
# are named "Analyte", "EPA LHA" and "Units" in the next cell.
# NOTE(review): presumably these correct cells the PDF extraction misplaced -
# verify against alaska.pdf.
table5 = clean_table(tables[4].df)
swap_values(df=table5, coord1=(17, 0), coord2=(17, 1))
impute_shift(df=table5, coord=(16, 1), replacement_value="70†")

In [13]:
# Replace the positional column labels with the descriptor columns followed
# by one "<sample id>, <date or Duplicate>" label per sample column.
col_names = ["Analyte", "EPA LHA", "Units"] + [
    "21GST-TWP-11, 10/30/2021", "21GST-TWP-11, Duplicate",
    "21GST-TWP-12, 10/30/2021", "21GST-TWP-13, 10/24/2021",
    "21GST-TWP-14, 10/24/2021", "21GST-TWP-14, Duplicate",
    "21GST-TWP-15, 10/27/2021", "21GST-TWP-15, Duplicate",
    "PW-016, 10/26/2021",
]
table5.columns = col_names

In [14]:
# Index every table by analyte so the column-wise concat below aligns rows on
# the analyte name rather than on row position.
table1 = table1.set_index("Analyte")
table2 = table2.set_index("Analyte")
table3 = table3.set_index("Analyte")
table4 = table4.set_index("Analyte")
table5 = table5.set_index("Analyte")

# reset_index() without drop=True turns the analyte labels back into an
# "Analyte" column; dropping them would leave the export with unlabeled rows.
# NOTE(review): every table also carries "EPA LHA" and "Units", so those
# column names repeat in the combined frame.
df = pd.concat([table1, table2, table3, table4, table5], axis=1).reset_index()

# The row labels are a regular column now, so the positional index is not written.
df.to_csv("alaska.csv", index=False)