In [None]:
import json
from pathlib import Path

In [None]:
# Input location: either a single PDF file or a directory that is searched
# recursively for PDFs. These are machine-specific absolute paths; adjust them
# before running the notebook elsewhere.
doc_path = Path("/home/ffranz/Dev/e2e-poc-source-documents/safebalance-clarity-statement.pdf")
# NOTE(review): output_dir is not read by any cell visible here — confirm whether it is still needed.
output_dir = Path("/home/ffranz/Dev/e2e-poc-source-documents/safebalance-clarity-statement/test")

if doc_path.is_file():
    files = [doc_path]
else:
    # rglob yields matches in arbitrary order; sort them so repeated runs
    # process (and later display) the documents in a stable order.
    files = sorted(doc_path.rglob("*.pdf"))

print(f"Files to convert: {files}")

In [None]:
!pip install gmft

In [None]:
from gmft.auto import CroppedTable, TableDetector, AutoTableFormatter, AutoTableDetector
from gmft.pdf_bindings import PyPDFium2Document
from gmft.table_visualization import display_html_and_image

# Shared gmft components, built once: `detector` is used by ingest_pdf() to find
# tables on each page, `formatter` by the extraction loop to structure each table.
detector, formatter = AutoTableDetector(), AutoTableFormatter()

def ingest_pdf(pdf_path):
    """Open a PDF and run table detection on every page.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``PyPDFium2Document``.

    Returns:
        A ``(tables, doc)`` tuple: ``tables`` is a list holding everything
        ``detector.extract`` returned for each page, in page order, and ``doc``
        is the still-open document. The caller is responsible for calling
        ``doc.close()`` once it is done with the tables.

    Raises:
        Any exception raised while iterating the pages or detecting tables;
        the document is closed before the exception propagates.
    """
    doc = PyPDFium2Document(pdf_path)
    tables = []
    try:
        for page in doc:
            tables.extend(detector.extract(page))
    except Exception:
        # On failure the caller never receives `doc`, so nobody else could
        # close it; release it here before re-raising.
        doc.close()
        raise
    return tables, doc

# Per-table results kept as parallel lists: index i of each list refers to the
# same detected table (the display cell zips them together).
dfs = []
formatted_tables = []
images = []

for file in files:
    tables, doc = ingest_pdf(file)
    try:
        for table in tables:
            print("----------------TABLE----------------")
            print(table.text())
            print("-------------------------------------")

            ft = formatter.extract(table)
            try:
                dfs.append(ft.df())
            except Exception as e:
                # Best effort: report the problem and store a None placeholder
                # so the three lists stay aligned.
                print(e)
                dfs.append(None)
            formatted_tables.append(ft)
            images.append(ft.visualize())
    finally:
        # Release the PDF even if reading, formatting or rendering a table
        # fails part-way through this document.
        doc.close()

In [None]:
from IPython.display import display, Markdown
import pandas as pd

# Show every extracted table next to its visualization, emitting a rule and a
# heading each time the source document changes.
prev_doc = None

# The pandas display options are the same for every table, so enter the
# context once around the whole loop.
with pd.option_context('display.max_rows', 500, "display.multi_sparse", False):
    for df, img, ft in zip(dfs, images, formatted_tables):
        if ft.page.filename != prev_doc:
            prev_doc = ft.page.filename
            display(Markdown('---'))
            display(Markdown(f'### {ft.page.filename}'))

        # A None entry means the DataFrame conversion failed for this table.
        html = "Failed to extract table" if df is None else df.fillna("").to_html()
        display_html_and_image(html, img)
        