In [None]:
# Setup: make the parent directory importable and fetch the sample PDFs used
# by the rest of this notebook.
import sys

# Add the parent directory to the module search path.
# NOTE(review): presumably so a local checkout of openparse can be imported — confirm.
sys.path.append("..")

# Create the download directory (-p: no error if it already exists), then
# download each sample PDF into it under a shorter local file name (-O).
!mkdir -p sample-docs
!wget https://sergey-filimonov.nyc3.digitaloceanspaces.com/open-parse/sample-docs/naic-numerical-list-of-companies-page-94.pdf -O sample-docs/companies-list.pdf
!wget https://sergey-filimonov.nyc3.digitaloceanspaces.com/open-parse/sample-docs/mobile-home-manual.pdf -O sample-docs/mobile-home-manual.pdf
!wget https://sergey-filimonov.nyc3.digitaloceanspaces.com/open-parse/sample-docs/meta-2022-10k-page-69.pdf -O sample-docs/meta-10k.pdf

## 1. Basic Extraction

In [None]:
# Parse one sample PDF with a default-configured DocumentParser and render
# every node it produced.
import openparse

basic_doc_path = "./sample-docs/mobile-home-manual.pdf"
# No arguments: the parser runs with the library's default settings.
parser = openparse.DocumentParser()
parsed_basic_doc = parser.parse(basic_doc_path)

# display() gives each node its rich notebook rendering (rather than print()).
for node in parsed_basic_doc.nodes:
    display(node)

In [None]:
# You can also easily display the nodes on the actual document: open the same
# PDF that was parsed above and pass the parsed nodes to display_with_bboxes.

pdf = openparse.Pdf(basic_doc_path)
pdf.display_with_bboxes(
    parsed_basic_doc.nodes,
)

## 2. Serialization

OpenParse returns pydantic models that can easily be serialized.

In [None]:
# Serialize the parsed document via its pydantic model_dump(); as the cell's
# last expression, the result is shown as the cell output.
parsed_basic_doc.model_dump()

## Tables

We aim to be model agnostic: the `DocumentParser` supports extracting tables using either the "table-transformers" or "pymupdf" libraries. The `parsing_algorithm` field in the `table_args` configuration dictionary decides which one to use.

The `PyMuPDFArgsDict` (and similarly the `TableTransformersArgsDict`) lets you fine-tune how tables are extracted using specific arguments.

In [None]:
# Parse a table-heavy sample PDF, selecting the "table-transformers" table
# extraction algorithm through table_args, then render every resulting node.
doc_with_tables_path = "./sample-docs/companies-list.pdf"

# NOTE: this rebinds the notebook-global `parser` created in an earlier cell.
parser = openparse.DocumentParser(
    table_args={"parsing_algorithm": "table-transformers"}
)
parsed_doc2 = parser.parse(doc_with_tables_path)

for node in parsed_doc2.nodes:
    display(node)

In [None]:
# Show the nodes extracted from the table document on top of the PDF itself.
# NOTE: this rebinds the notebook-global `pdf` to the table document.
pdf = openparse.Pdf(doc_with_tables_path)
pdf.display_with_bboxes(
    parsed_doc2.nodes,
)

In [None]:
# Same flow as the table-transformers example, but selecting the "pymupdf"
# table extraction algorithm and a different sample document.
meta10k_path = "./sample-docs/meta-10k.pdf"

# NOTE: this rebinds the notebook-global `parser` once more.
parser = openparse.DocumentParser(table_args={"parsing_algorithm": "pymupdf"})
parsed_10k = parser.parse(meta10k_path)

# Show the parsed nodes on top of the original PDF.
doc = openparse.Pdf(file=meta10k_path)
doc.display_with_bboxes(parsed_10k.nodes)

## 3. Custom Processing (Advanced)

While we've chosen sensible defaults, you can add custom processing functions to the `DocumentParser` class to further process the extracted data.

This allows use cases like:
- Using embeddings to parse nodes split across pages
- Using GPT-4V to choose which nodes to combine

In [None]:
from openparse import processing, Node
from typing import List


class CustomCombineTables(processing.ProcessingStep):
    """
    Let's combine tables that are next to each other.

    Every run of consecutive nodes whose ``variant`` contains ``"table"`` is
    merged into a single node with ``+``. All other nodes are passed through
    unchanged, in their original order.
    """

    def process(self, nodes: List[Node]) -> List[Node]:
        """
        Merge adjacent table nodes.

        Args:
            nodes: The nodes to scan. Adjacency is judged purely by position
                in this list.

        Returns:
            A new list in which each run of neighbouring table nodes has been
            replaced by their sum. Every input node is represented exactly
            once (either as itself or inside a merged node), including the
            last one. An empty input yields an empty list.
        """
        print("Combining concurrent tables")

        combined: List[Node] = []
        # Tracked explicitly so the check does not depend on what `variant`
        # looks like on an already-merged node.
        prev_was_table = False

        for node in nodes:
            is_table = "table" in node.variant
            if is_table and prev_was_table:
                # Fold this table into the table (or merged run of tables)
                # emitted just before it instead of emitting it separately.
                combined[-1] = combined[-1] + node
            else:
                combined.append(node)
            prev_was_table = is_table

        return combined


# Copy the default pipeline (or create a new one) so the library's shared
# default is not mutated, then add the custom step at the end of the copy.
# NOTE(review): `default_pipeline` was previously referenced as a bare name
# that no cell defines or imports (NameError on a fresh kernel). It is assumed
# to be exposed by the already-imported `processing` module — confirm against
# the installed openparse version.
custom_pipeline = processing.default_pipeline.copy()
custom_pipeline.append(CustomCombineTables())

# Re-parse the 10-K sample with the pymupdf table algorithm and the custom
# pipeline.
parser = openparse.DocumentParser(
    table_args={"parsing_algorithm": "pymupdf"}, processing_pipeline=custom_pipeline
)
custom_10k = parser.parse(meta10k_path)

# Show the post-processed nodes on top of the original PDF.
doc = openparse.Pdf(file=meta10k_path)
doc.display_with_bboxes(custom_10k.nodes)