In [None]:
#pip install PyPDF2 pandas pdfplumber

In [None]:
#pip install tabula-py

In [None]:
'''
Name: Christian H.
Objective: Transforming tables from PDFs in a destined directory to individual csv files.
Notes: 
- Old: The following code works adequately, but the outputted data in the csv files needs to be pre-processed for better quality.
- Old: This code, as with the Tabula software, only works well with text-based PDFs or tables. It doesn't work with image-based PDFs.
--> After some exploration, using the same library (tabula), I simply changed the extraction method by specifying lattice=True.

Stream vs Lattice Extraction Methods
If the Stream extraction method does not map the data to the correct cells, try the Lattice method instead.
Stream looks for whitespace between columns, while Lattice looks for boundary lines between columns.

'''

In [None]:
import os
from pathlib import Path
from tabula import read_pdf  # Alternatively, use camelot if needed
import pandas as pd

def _find_pdf_files(input_folder):
    """
    Returns the PDF files located directly inside input_folder, sorted by path.

    The extension check is case-insensitive, so files such as "REPORT.PDF" are
    found even on case-sensitive filesystems. Only regular files are returned.
    A folder that does not exist (or is not a directory) yields an empty list.

    Args:
        input_folder (str): Path to the folder to scan.

    Returns:
        list[Path]: Sorted paths of the PDF files found.
    """
    folder = Path(input_folder)
    if not folder.is_dir():
        return []
    return sorted(
        entry
        for entry in folder.iterdir()
        if entry.is_file() and entry.name.lower().endswith(".pdf")
    )


def extract_tables_from_pdfs(input_folder, output_folder):
    """
    Extracts tables from all PDF files in the input_folder and saves them as CSV files in output_folder
    using the Lattice extraction method.

    Each table is written to "<pdf stem>_table_<n>.csv", where n starts at 1 and
    follows the order in which the tables are returned for that PDF. PDF files
    are matched case-insensitively (".pdf", ".PDF", ...) and processed in sorted
    order. A failure on one PDF is reported on stdout and does not stop the
    remaining PDFs from being processed.

    Args:
        input_folder (str): Path to the folder containing PDF files.
        output_folder (str): Path to the folder to save extracted CSV files.

    Returns:
        None. Progress and errors are reported with print().
    """
    # Ensure the output directory exists
    os.makedirs(output_folder, exist_ok=True)
    output_dir = Path(output_folder)

    pdf_files = _find_pdf_files(input_folder)
    if not pdf_files:
        print("No PDF files found in the specified folder.")
        return

    for pdf_path in pdf_files:
        print(f"Processing: {pdf_path.name}")
        try:
            # Extract tables from the PDF using Lattice method
            tables = read_pdf(
                str(pdf_path),
                pages="all",
                multiple_tables=True,
                pandas_options={"dtype": str},  # ask pandas to keep cell values as text
                lattice=True,  # Specify the Lattice extraction method
            )

            if not tables:
                print(f"No tables found in {pdf_path.name}.")
                continue

            for table_number, table in enumerate(tables, start=1):
                # Define the output CSV path
                output_file = output_dir / f"{pdf_path.stem}_table_{table_number}.csv"
                table.to_csv(output_file, index=False)
                print(f"Saved table {table_number} to {output_file}")
        except Exception as e:
            # Deliberate per-file boundary: report the problem and move on so
            # one unreadable PDF does not abort the whole batch.
            print(f"Error processing {pdf_path.name}: {e}")

if __name__ == "__main__":
    # Source directory holding the PDFs and destination directory for the
    # generated CSV files -- edit both values to match your environment.
    input_folder, output_folder = "pdfs", "csv"

    # Convert every PDF found in the source directory, then report completion.
    extract_tables_from_pdfs(
        input_folder=input_folder,
        output_folder=output_folder,
    )
    print("Processing completed!")
