In [None]:
import os

import pandas as pd
import pdfplumber
import re
import numpy as np
import glob
import tricky_tables

# Column headers applied to every extracted emissions table, in output order.
COLUMNS = [
    "Emission Source",
    "Source Name",
    "Air Contaminant Name",
    "Emission Rate lbs/hr",
    "Emission Rate tons/year",
]

In [None]:
# Directory that receives one CSV per successfully parsed PDF.
TABlE_WRITE_LOCATION = "../data/extracted_tables"

A series of text files to keep track of which PDFs have been processed. 

Processed PDFs fall into 3 categories:
* unknown -- the parser failed
* easy  -- pdfplumber's `page.extract_table()` successfully extracted the table
* tricky  -- the custom parser in `tricky_tables.py` successfully extracted the table

In [None]:
# Build the manifest of PDFs to process: one path per line.
base_file_location  = '../data/MAERT'

# glob returns paths in arbitrary, filesystem-dependent order; sort so the
# manifest (and therefore the processing order) is reproducible between runs.
files = sorted(glob.glob(f"{base_file_location}/*.pdf"))

manifest_path = 'text_files/tables_to_extract.txt'

# Create the bookkeeping directory on first run instead of failing on open().
os.makedirs(os.path.dirname(manifest_path), exist_ok=True)

with open(manifest_path, 'w') as f:
    f.writelines(f"{path}\n" for path in files)

In [None]:
# Bookkeeping files: the input manifest plus one log per outcome category.
_TEXT_DIR = "text_files"

table_file = f"{_TEXT_DIR}/tables_to_extract.txt"    # PDFs queued for extraction
easy_file = f"{_TEXT_DIR}/easy_tables.txt"           # parsed via page.extract_table()
tricky_file = f"{_TEXT_DIR}/tricky_tables.txt"       # parsed via the tricky_tables helper
unkown_file = f"{_TEXT_DIR}/unknown_tables.txt"      # extraction raised an exception

In [None]:
# Load the manifest of PDF paths, one per line.
# The context manager closes the handle, and blank lines (including the one
# produced by the trailing newline) are dropped so that no empty path is
# handed to the extraction loop.
with open(table_file, "r") as tables_to_extract_txt:
    tables_to_extract = [
        line for line in tables_to_extract_txt.read().splitlines() if line.strip()
    ]


In [None]:
# Extract the emissions table from every PDF in the manifest, write it to CSV,
# and log each PDF as easy / tricky / unknown depending on how it was parsed.

# Compiled once: captures the table body that follows the "TPY" column header.
core_pat = re.compile(r"TPY[\-\s]+(.*)\n\s+", re.DOTALL)

# Make sure the output directory exists so that a missing folder is not
# misreported as a parser failure for every single PDF.
os.makedirs(TABlE_WRITE_LOCATION, exist_ok=True)

for data_location in tables_to_extract:
    # The manifest may contain a blank entry (trailing newline); skip it.
    if not data_location.strip():
        continue

    # Bare file name: used for log lines, metadata and the CSV name.
    # The PDF itself must be opened through its full path (data_location).
    file_name = os.path.basename(data_location)
    print(f"Extracting from {file_name}")
    try:
        total_pages = []
        tricky_table = False

        # Context manager guarantees the PDF handle is released, even on error.
        with pdfplumber.open(data_location) as pdf:
            for page in pdf.pages:
                print(f"Extracting from {page.page_number}")

                text = page.extract_text(keep_blank_chars=True)
                blank_chars_false = page.extract_text()
                table = page.extract_table()

                # The "(1) Emission point identification" footnote follows the
                # table, so the page that contains it is the last one to read.
                footnote_seen = "point identification" in blank_chars_false

                if table is None:
                    # Case 1: table is not nicely extractable.
                    #   1. Take the text after the "TPY" header, stopping at
                    #      << (1)   Emission point identification >>
                    #   2. Let the helper script parse it line by line.
                    tricky_table = True

                    match = core_pat.search(text)
                    # Fall back to the whole page text when the header is absent.
                    core = match.group(1) if match else text
                    lines = core.split("\n")

                    no_data_left = False
                    if footnote_seen:
                        # A footnote within the first 100 characters means the
                        # page holds no table rows at all. find() is used so an
                        # unexpected footnote wording does not abort the PDF.
                        footnote_pos = blank_chars_false.find(
                            "(1) Emission point identification"
                        )
                        no_data_left = 0 <= footnote_pos < 100

                        # Drop the footnote line and everything after it.
                        cut_at = next(
                            (
                                i
                                for i, line in enumerate(lines)
                                if "pointidentification" in line.replace(" ", "")
                            ),
                            None,
                        )
                        if cut_at is not None:
                            lines = lines[:cut_at]

                    # function from helper script
                    if not no_data_left:
                        df_page = tricky_tables.extract_table_custom(lines, COLUMNS)
                        total_pages.append(df_page)

                else:
                    # Case 2: pdfplumber found a table on THIS page. The first
                    # two rows are skipped, as in the original slice [2:].
                    df_extracted_table = pd.DataFrame(table[2:], columns=COLUMNS)
                    # Source columns are only filled on the first row of each
                    # group; propagate them down to the following rows.
                    df_extracted_table['Emission Source'] = df_extracted_table['Emission Source'].ffill()
                    df_extracted_table['Source Name'] = df_extracted_table['Source Name'].ffill()

                    total_pages.append(df_extracted_table)

                if footnote_seen:
                    break

        # Tackle known formatting errors.
        if tricky_table:
            df_pages = pd.concat(total_pages).dropna(axis=0, how='all').reset_index(drop=True)
            df_pages_cleaned = tricky_tables.clean_up_tricky_table(df_pages)
        else:
            # Easy tables need no clean-up, but the name must still be bound:
            # otherwise the previous PDF's data would be reused (or NameError).
            df_pages_cleaned = pd.concat(total_pages).reset_index(drop=True)

        # Add entity, permit number, publish date and write to CSV.
        # The first three "_"-separated fields of the file name are used.
        sp = file_name.split("_")
        entity = sp[0]
        permit_number = sp[1]
        publish_date = sp[2]

        df_pages_cleaned['entity'] = entity
        df_pages_cleaned['permit_number'] = permit_number
        df_pages_cleaned['publish_date'] = publish_date

        # lil hack to deal with table extraction method picking up the columns
        df_pages_cleaned_dropped = df_pages_cleaned[df_pages_cleaned['Emission Source'] != 'Emission']

        # Swap only the extension; str.replace would also rewrite any "pdf"
        # that happens to appear inside the name itself.
        csv_name = os.path.splitext(file_name)[0] + ".csv"
        df_pages_cleaned_dropped.to_csv(f"{TABlE_WRITE_LOCATION}/{csv_name}", index=False)

        # Record metadata for extracted table.
        fn = tricky_file if tricky_table else easy_file
        with open(fn, "a") as f:
            f.write(f"{file_name}\n")

        print("Extracted succesfully!!!")
        print("-"*10)

    except Exception as e:
        # Per-file boundary: one bad PDF must not stop the batch. Report the
        # error and log the file as "unknown" for later inspection.
        print(e)
        with open(unkown_file, "a") as f:
            f.write(f"{file_name}\n")
        print("Extracted failed!!!")
        print("-"*10)
