## PDFMiner

In [1]:
from pdfminer.high_level import extract_pages

In [7]:
page_layouts = extract_pages("cclw-10046-169a288207764ad0bdd5598cedd1d5d0.pdf", page_numbers=[5])

In [9]:
for page_layout in page_layouts:
    for element in page_layout:
        pass

## PDFAlto

In [2]:
import subprocess
import xml.etree.ElementTree as et
from pathlib import Path
import json

In [17]:
# ~/code/pdfalto/pdfalto -l 10 -noImage -outline -readingOrder file.pdf

def doc_to_xml(pdf_path, xml_output_path):
    pdfalto_path = "/root/pdfalto/pdfalto"

    pdf_path = Path(pdf_path)
    xml_output_file = xml_output_path / f"{pdf_path.stem}.xml"

    pdfalto_args = [
        pdfalto_path,
        "-noImage",
        "-outline",
        "-readingOrder",
        pdf_path,
        xml_output_file
    ]

    retval = subprocess.run(pdfalto_args)

    if retval.returncode == 0:
        return et.parse(xml_output_file), xml_output_file
    
    return None

In [18]:
pdf_path = Path("/workspace/nbs/pdf")
extract_path = Path("/workspace/nbs/extract/")
pdf_file = Path("cclw-10046-169a288207764ad0bdd5598cedd1d5d0.pdf")

pdf_xml, xml_path = doc_to_xml(pdf_path / pdf_file, extract_path)

In [19]:
def parse_doc_xml(pdf_xml: et.ElementTree):
    namespace = "{http://www.loc.gov/standards/alto/ns-v3#}"
    pages = pdf_xml.getroot().findall(
        f"{namespace}Layout/{namespace}Page"
    )

    SEP = " "

    text_blocks = []

    for page in pages:
        page_id = page.attrib.get("ID", None)
        for text_block in page.findall(f"{namespace}PrintSpace/{namespace}TextBlock"):
            text_block_id = text_block.attrib.get("ID", None)
            text_block_lines = []
            for text_line in text_block.getchildren():
                text_line_content = ""
                for text in text_line.getchildren():
                    text_line_content = text_line_content + SEP + text.attrib.get("CONTENT", "")
                text_block_lines.append(text_line_content)

            if len(text_block_lines) > 0:
                text_blocks.append(
                    {
                        "text": "".join(text_block_lines).strip(),
                        "text_lines": text_block_lines,
                        "text_block_id": text_block_id,
                        "page_id": page_id
                    }
                )

    return text_blocks

In [20]:
doc = parse_doc_xml(pdf_xml)

  for text_line in text_block.getchildren():
  for text in text_line.getchildren():


In [21]:
def doc_to_json(doc, extract_path: Path, doc_filename: Path):
    """Save the document to json
    """

    with open(extract_path / f"{doc_filename.stem}.json", "wt") as f:
        json.dump(doc, f, indent=2)

In [22]:
doc_to_json(doc, extract_path, pdf_file)

In [25]:
def doc_to_text(doc, extract_path: Path, doc_filename: Path):
    """Save the document to a text file
    """

    with open(extract_path / f"{doc_filename.stem}.txt", "wt") as f:
        for text_block in doc:
            f.write(text_block["text"] + "\n")

In [26]:
doc_to_text(doc, extract_path, pdf_file)

## Process directory containing a set of PDF files

In [30]:
!ls /root/pdf-parsing-evaluation/research/ocr-evaluation/data/test-trimmed-8

cclw-10049-dabe51e6042a47f18a53d99735605032.pdf
cclw-10049-dabe51e6042a47f18a53d99735605032.txt
cclw-1280-6c88046c28e240efa41f7f966fdf3d9a.pdf
cclw-1280-6c88046c28e240efa41f7f966fdf3d9a.txt
cclw-1318-d7f66920a18e4ddf94c83cf21fa2bcfa.pdf
cclw-1318-d7f66920a18e4ddf94c83cf21fa2bcfa.txt
cclw-1654-c44dd3724fdb4c569e977b22843b0ae7.pdf
cclw-1654-c44dd3724fdb4c569e977b22843b0ae7.txt
cclw-4810-37f31b3109704bad90d31cae646ad685.pdf
cclw-4810-37f31b3109704bad90d31cae646ad685.txt
cclw-4974-f21d876714da4417995887c42921f254.pdf
cclw-4974-f21d876714da4417995887c42921f254.txt
cclw-8149-5b598bd3e88c4ce99f44cfbd283c9679.pdf
cclw-8149-5b598bd3e88c4ce99f44cfbd283c9679.txt
cclw-8650-24af4f121de143baa3b633481f7adb78.pdf
cclw-8650-24af4f121de143baa3b633481f7adb78.txt
cclw-9448-9611f30cbe514f8ba687a1b17f3a52c0.pdf
cclw-9448-9611f30cbe514f8ba687a1b17f3a52c0.txt
cclw-9460-16dbada00cb8440f9178e93bc7d89677.pdf
cclw-9460-16dbada00cb8440f9178e93bc7d89677.txt


In [31]:
extract_path = Path("/root/pdf-parsing-evaluation/research/ocr-evaluation/data/preds-pdf-parser")
pdf_path = Path("/root/pdf-parsing-evaluation/research/ocr-evaluation/data/test-trimmed-8")
for pdf_filename in pdf_path.glob("*.pdf"):
    print(f"Processing {pdf_filename}...")
    pdf_xml, xml_path = doc_to_xml(pdf_path / pdf_filename, extract_path)
    doc = parse_doc_xml(pdf_xml)
    doc_to_json(doc, extract_path, pdf_filename)
    doc_to_text(doc, extract_path, pdf_filename)

Processing /root/pdf-parsing-evaluation/research/ocr-evaluation/data/test-trimmed-8/cclw-4810-37f31b3109704bad90d31cae646ad685.pdf...


  for text_line in text_block.getchildren():
  for text in text_line.getchildren():


Processing /root/pdf-parsing-evaluation/research/ocr-evaluation/data/test-trimmed-8/cclw-9448-9611f30cbe514f8ba687a1b17f3a52c0.pdf...
Processing /root/pdf-parsing-evaluation/research/ocr-evaluation/data/test-trimmed-8/cclw-9460-16dbada00cb8440f9178e93bc7d89677.pdf...
Processing /root/pdf-parsing-evaluation/research/ocr-evaluation/data/test-trimmed-8/cclw-1280-6c88046c28e240efa41f7f966fdf3d9a.pdf...
Processing /root/pdf-parsing-evaluation/research/ocr-evaluation/data/test-trimmed-8/cclw-4974-f21d876714da4417995887c42921f254.pdf...
Processing /root/pdf-parsing-evaluation/research/ocr-evaluation/data/test-trimmed-8/cclw-10049-dabe51e6042a47f18a53d99735605032.pdf...
Processing /root/pdf-parsing-evaluation/research/ocr-evaluation/data/test-trimmed-8/cclw-8149-5b598bd3e88c4ce99f44cfbd283c9679.pdf...
Processing /root/pdf-parsing-evaluation/research/ocr-evaluation/data/test-trimmed-8/cclw-1654-c44dd3724fdb4c569e977b22843b0ae7.pdf...
Processing /root/pdf-parsing-evaluation/research/ocr-evaluati