In [1]:
import pandas as pd
import numpy as np
import camelot

import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides pandas/openpyxl
# warnings raised while parsing the workbook — confirm that is intended.
warnings.simplefilter("ignore")

# Lift pandas' display truncation: None removes the cap on rows and columns,
# so displayed frames are rendered in full.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

## Page 1

In [2]:
# Read every worksheet of the workbook into a dict keyed by sheet name.
# The context manager closes the underlying file handle as soon as all sheets
# have been parsed, instead of leaving it open for the life of the kernel.
with pd.ExcelFile('new_mexico3.xlsx', engine='openpyxl') as xls:
    all_tables = {
        sheet_name: pd.read_excel(xls, sheet_name)
        for sheet_name in xls.sheet_names
    }

In [3]:
# Take the raw 'Table 1' sheet and relabel its columns 0..n-1 so they can be
# addressed by position while the real header is reconstructed.
page1 = all_tables['Table 1']
page1.columns = list(range(page1.shape[1]))

# Discard every column that is empty in both of the first two rows.
page1 = page1.dropna(axis=1, how='all', subset=page1.index[:2])

# The header is spread over rows 0 and 1: use the row-1 label when present,
# otherwise fall back to the row-0 label.
column_names = np.array([
    page1.iloc[0, position] if pd.isna(page1.iloc[1, position]) else page1.iloc[1, position]
    for position in range(page1.shape[1])
])
page1.columns = column_names

# Strip the two header rows from the data and renumber the rows from 0.
page1 = page1.iloc[2:, :].reset_index(drop=True)

In [4]:
# Show the cleaned page-1 table as this cell's output for visual inspection.
page1

Unnamed: 0,Water System Name / Well Name,USGS Site ID,County,Date,PFBS,PFPeS,PFHxS,PFHpS,PFOS,PFNS,PFDS,PFBA,PFPeA,PFHxA,PFHpA,PFOA,PFNA,PFDA,PFUnDA,PFDoDA,PFTrDA,PFTeDA,PFOSA,N-MeFOSAA,N-EtFOSAA,4:2 FTS,6:2 FTS,8:2 FTS,9Cl-PF3ONS,11Cl-PF3OUdS,ADONA,HFPO-DA,PFOA+ PFOS\n(ng/L),ΣPFAS\n(ng/L),Total PFAS includes estimated values?
0,ABCWUA / Burton 4,350343106363301,Bernalillo,2020-09-25 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,--,--,
1,ABCWUA / Leavitt 2,350249106434201,Bernalillo,2021-02-09 00:00:00,,,,,1.3,,,,,,,,,,,,,,,,,,,,,,,,1.3,1.3,Yes
2,ABCWUA / Walker 1,351025106323801,Bernalillo,2021-02-09 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,--,--,
3,Roswell Water System / RIAC 1,331843104305001,Chaves,2020-10-28 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,--,--,
4,Roswell Water System / RIAC 4,331843104315001,Chaves,2020-10-28 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,--,--,
5,Roswell Water System / Well 12,332137104303901,Chaves,2020-10-28 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,--,--,
6,Clovis Municipal Airport / Well 1,342537103051201,Curry,2020-11-17 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,--,--,
7,Desert Ranch MDWCA / Well 1,342439103190901,Curry,2021-02-08 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,--,--,
8,Desert Ranch MDWCA / Well 1,342439103190901,Curry,2021-05-12 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,--,--,
9,Grady Water System / Well 1,344916103190001,Curry,2020-09-25 00:00:00,,,,,,,,,4.3,2.7,,,,,,,,,,,,,,,,,,,--,7,No


## Page 2

In [5]:
# Move the sheet's column labels into the frame as row 0 and renumber both
# axes positionally (presumably this sheet continues the previous page, so its
# "header" is really a data record — confirm).
page2 = all_tables['Table 2']
page2 = page2.T.reset_index().T.reset_index(drop=True)

# Trim the final three rows.
page2 = page2.iloc[:-3]

# Among columns 4..34, any column holding an 'E' cell is treated as a
# quantifier column; collect those labels and remove them in one call.
column_drop_list = [label for label in range(4, 35) if 'E' in page2[label].values]
page2 = page2.drop(columns=column_drop_list)

# Reuse the header that was reconstructed for page 1.
page2.columns = page1.columns

## Page 3

In [6]:
# Fetch the raw 'Table 3' sheet from the workbook dict; the next cell cleans it.
page3 = all_tables['Table 3']

In [7]:
# Rebuild page3 from a copy of the raw 'Table 3' sheet every time this cell
# runs. Previously the cell read and overwrote its own output, so re-running it
# on its own stripped real data rows/columns and then failed on the header
# assignment; working from a copy also leaves the cached sheet untouched.
page3 = all_tables['Table 3'].copy()

# Relabel columns 0..n-1 so they can be addressed by position.
page3.columns = range(page3.shape[1])

# Drop columns where all values in the first two rows are NaN
page3 = page3.dropna(axis=1, how='all', subset=page3.index[:2])

# Reuse the header reconstructed for page 1 (raises if the column counts differ).
page3.columns = page1.columns

# Remove the two header rows and renumber the remaining rows from 0.
page3 = page3.iloc[2:, :].reset_index(drop=True)

In [8]:
# Show the cleaned page-3 table as this cell's output for visual inspection.
page3

Unnamed: 0,Water System Name / Well Name,USGS Site ID,County,Date,PFBS,PFPeS,PFHxS,PFHpS,PFOS,PFNS,PFDS,PFBA,PFPeA,PFHxA,PFHpA,PFOA,PFNA,PFDA,PFUnDA,PFDoDA,PFTrDA,PFTeDA,PFOSA,N-MeFOSAA,N-EtFOSAA,4:2 FTS,6:2 FTS,8:2 FTS,9Cl-PF3ONS,11Cl-PF3OUdS,ADONA,HFPO-DA,PFOA+ PFOS\n(ng/L),ΣPFAS\n(ng/L),Total PFAS includes estimated values?
0,"Rio Grande at Alameda Bridge, NM",8329918,Bernalillo,2020-08-19 00:00:00,,,,,,,,2.0,2.2,,,,,,,,,,,,,,,,,,,,--,4.2,Yes
1,"Rio Grande at Alameda Bridge, NM",8329918,Bernalillo,2020-09-16 00:00:00,2.8,,,,,,,,0.9,,,,,,,,,,,,,,,,,,,,--,3.7,Yes
2,"Rio Grande at Alameda Bridge, NM",8329918,Bernalillo,2020-11-23 00:00:00,3.9,,,,,,,,1.3,,,,,,,,,,,,,,,,,,,,--,5.2,Yes
3,"Rio Grande at Alameda Bridge, NM",8329918,Bernalillo,2020-12-18 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,--,--,
4,"Rio Grande at Alameda Bridge, NM",8329918,Bernalillo,2021-01-11 00:00:00,,,,,,,,,1.1,,,,,,,,,,,,,,,,,,,,--,1.1,Yes
5,"Rio Grande at Alameda Bridge, NM",8329918,Bernalillo,2021-05-05 00:00:00,2.9,,,,,,,,,,,,,,,,,,,,,,,,,,,,--,2.9,No
6,"Rio Grande at Valle de Oro, NM",8330830,Bernalillo,2020-08-31 00:00:00,,,,,2.6,,,,5.8,2.9,,,,,,,,,,,,,,,,,,,--,8.7,No
7,"Rio Grande at Valle de Oro, NM",8330830,Bernalillo,2020-09-16 00:00:00,93.0,,1.2,,,,,12.3,27.9,12.5,1.7,4.9,,,,,,,,,,,,,,,,,7.5,156.1,Yes
8,"Rio Grande at Valle de Oro, NM",8330830,Bernalillo,2020-11-23 00:00:00,22.4,,,,,,,3.1,6.6,4.0,,1.6,,,,,,,,,,,,,,,,,1.6,37.7,Yes
9,"Rio Grande at Valle de Oro, NM",8330830,Bernalillo,2020-12-18 00:00:00,,,,,,,,3.0,4.8,3.4,,1.3,,,,,,,,,,,,,,,,,1.3,12.5,Yes


## Page 4

In [9]:
# Move the sheet's column labels into the frame as row 0 and renumber both
# axes positionally (presumably this sheet continues the previous page, so its
# "header" is really a data record — confirm).
page4 = all_tables['Table 4']
page4 = page4.T.reset_index().T.reset_index(drop=True)

# Trim the final four rows.
page4 = page4.iloc[:-4]

# Among columns 4..40, any column holding an 'E' cell is treated as a
# quantifier column; collect those labels and remove them in one call.
column_drop_list = [label for label in range(4, 41) if 'E' in page4[label].values]
page4 = page4.drop(columns=column_drop_list)

# Reuse the header that was reconstructed for page 1.
page4.columns = page1.columns

In [10]:
# Row 0 was built from the sheet's column labels, so blank header cells show
# up as pandas' "Unnamed: N" placeholders; turn those back into NaN.
page4.loc[0] = page4.iloc[0].map(lambda value: value if "Unnamed" not in str(value) else np.nan)

# Force the second-to-last cell of row 0 to the "--" placeholder.
# NOTE(review): the earlier form of this line assigned this same cell twice in
# one chained statement; if a second cell was meant, add it here.
page4.iloc[0, -2] = "--"

In [11]:
# Row 0 was built from the sheet's column labels, so blank header cells show
# up as pandas' "Unnamed: N" placeholders; turn those back into NaN.
page2.loc[0] = page2.iloc[0].map(lambda value: value if "Unnamed" not in str(value) else np.nan)

# Force the second-to-last cell of row 0 to the "--" placeholder.
# NOTE(review): the earlier form of this line assigned this same cell twice in
# one chained statement; if a second cell was meant, add it here.
page2.iloc[0, -2] = "--"

In [12]:
# Stack the four cleaned page tables, in page order, into a single DataFrame
# and renumber the rows 0..n-1.
new_mexico3 = pd.concat(objs=[page1, page2, page3, page4], axis=0, ignore_index=True)

In [13]:
# Write the combined table to disk. index=True is the pandas default, spelled
# out here: the row index is written as the first, unnamed CSV column.
new_mexico3.to_csv("new_mexico3.csv", index=True)