In [1]:
import pandas as pd
import tabula
import numpy as np

In [2]:
# Convert the tables tabula finds on all pages of T11.pdf into a single CSV file.
tabula.convert_into(
    "T11.pdf",
    "raw_data.csv",
    output_format="csv",
    pages="all",
)

In [3]:
# Read the CSV that was produced from the PDF.
raw_data = pd.read_csv("raw_data.csv")

# The column at position 2 holds nothing but NaN values, so discard it.
raw_data = raw_data.drop(columns=raw_data.columns[2])

# Replace the extracted headers with descriptive column names.
# NOTE(review): "Labratory" looks like a misspelling of "Laboratory"; it is kept unchanged
# because this exact name becomes a column header of the exported data - confirm before renaming.
col_names = [
    "System Number",
    "System Name",
    "Well Name",
    "Sampling Date",
    "Sample Labratory Control Number",
    "Field Blank Collected",
    "Field Blank Results in nanograms per liter (ng/L) by USEPA Method 537.1",
]
raw_data.columns = col_names

# Discard the first 6 rows and renumber the remaining rows from 0.
raw_data = raw_data.iloc[6:].reset_index(drop=True)

# fixing the system number column
# Remove the rows at positions 6, 47 and 51, then renumber the remaining rows from 0.
# NOTE(review): presumably these rows are PDF-extraction artifacts - confirm against T11.pdf.
raw_data = raw_data.drop(index=[6, 47, 51]).reset_index(drop=True)

# One system number per remaining row, written as consecutive runs (50 values in total).
system_number = np.array(["SC0220006"]*12 + ["SC0720003"]*13 + ["SC2110001"]*9 + ["SC3410001"]*9 +
                         ["SC2210001"] + ["SC2620004"]*4 + ["SC2620009"]*2)

# Assign the array itself instead of wrapping it in pd.Series: a Series is aligned on the
# index, so a row-count mismatch would silently produce NaN (or drop values), whereas a plain
# array raises ValueError when its length differs from the number of rows. This also matches
# the way the "System Name" column is assigned.
raw_data["System Number"] = system_number

# fixing the system name column
# Remove the row at position 38, then renumber the remaining rows from 0.
raw_data = raw_data.drop(index=[38]).reset_index(drop=True)

# Expand (system name, number of consecutive rows) pairs into one name per row (49 in total).
system_name = np.array([
    name
    for name, row_count in (
        ("Breezy Hill Water", 12),
        ("Beaufort Jasper Water & Sewer Authority", 13),
        ("City of Florence", 9),
        ("City of Bennetsville", 8),
        ("City of Georgetown", 1),
        ("Grand Strand Water and Sewer Authority", 6),
    )
    for _row in range(row_count)
])

raw_data["System Name"] = system_name

# fixing the well name column
# Remove the rows at positions 17, 20 and 23, then renumber the remaining rows from 0.
raw_data = raw_data.drop(index=[17, 20, 23]).reset_index(drop=True)

well_name = raw_data["Well Name"].to_list()

# Rows 16 through 21 receive corrected well names; each name covers two consecutive rows.
corrected_wells = [
    label
    for label in ("Road 34 Near Forest (CELV)", "Hardeeville Well 3", "Highway 170 Shady Oaks")
    for _repeat in range(2)
]
well_name[16:16 + len(corrected_wells)] = corrected_wells

raw_data["Well Name"] = well_name

# fixing the field blank collected column and the final (field blank results) column:
# '--' becomes NaN in both columns, and '√' becomes "Yes" in 'Field Blank Collected'.
raw_data = raw_data.replace({
    'Field Blank Collected': {'--': np.nan, '√': "Yes"},
    'Field Blank Results in nanograms per liter (ng/L) by USEPA Method 537.1': {'--': np.nan},
})

# one last look at the final cleaned dataset
# cleaned_data is a second name for the same DataFrame object; no copy is made.
cleaned_data = raw_data
cleaned_data

Unnamed: 0,System Number,System Name,Well Name,Sampling Date,Sample Labratory Control Number,Field Blank Collected,Field Blank Results in nanograms per liter (ng/L) by USEPA Method 537.1
0,SC0220006,Breezy Hill Water,Well 5 Midland Valley,1-Dec-20,20L0201-01,,
1,SC0220006,Breezy Hill Water,Well 3 Hill Street,1-Dec-20,20L0195-01,Yes,all 18 analytes <2.0
2,SC0220006,Breezy Hill Water,Well 9 Hayes Drive No. 2,1-Dec-20,20L0192-01,,
3,SC0220006,Breezy Hill Water,Well 6 New Woodbridge,1-Dec-20,20L0188-01,,
4,SC0220006,Breezy Hill Water,Well 7 Greenfield,1-Dec-20,20L0196-01,,
5,SC0220006,Breezy Hill Water,Well 13 Bettis Academy,1-Dec-20,20L0204-01,,
6,SC0220006,Breezy Hill Water,Well 12 Sage Mill -- Tank Site,1-Dec-20,20L0187-01,,
7,SC0220006,Breezy Hill Water,Well 11 Edisto,1-Dec-20,20L0189-01,,
8,SC0220006,Breezy Hill Water,Well10 Ascauga Lake,1-Dec-20,20L0199-01,,
9,SC0220006,Breezy Hill Water,Well 8 Hayes Drive,1-Dec-20,20L0203-01,,


In [None]:
# Write the cleaned table to data.csv. index=True is the to_csv default, spelled out here:
# the row index is written as an extra, unnamed first column of the file.
cleaned_data.to_csv("data.csv", index=True)