Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 31 additions & 7 deletions app/pybind_parse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,8 @@ PYBIND11_MODULE(pdf_parsers, m) {
[](docling::docling_parser_v2 &self,
const std::string &key,
const std::string &page_boundary,
bool do_sanitization) -> nlohmann::json {
bool do_sanitization
) -> nlohmann::json {
return self.parse_pdf_from_key(key, page_boundary, do_sanitization);
},
pybind11::arg("key"),
Expand All @@ -403,13 +404,31 @@ PYBIND11_MODULE(pdf_parsers, m) {
const std::string &key,
int page,
const std::string &page_boundary,
bool do_sanitization) -> nlohmann::json {
return self.parse_pdf_from_key_on_page(key, page, page_boundary, do_sanitization);
bool do_sanitization,
bool keep_char_cells,
bool keep_lines,
bool keep_bitmaps,
bool create_word_cells,
bool create_line_cells) -> nlohmann::json {
return self.parse_pdf_from_key_on_page(key,
page,
page_boundary,
do_sanitization,
keep_char_cells,
keep_lines,
keep_bitmaps,
create_word_cells,
create_line_cells);
},
pybind11::arg("key"),
pybind11::arg("page"),
pybind11::arg("page_boundary") = "crop_box", // media_box
pybind11::arg("do_sanitization") = true, // media_box
pybind11::arg("key"),
pybind11::arg("page"),
pybind11::arg("page_boundary") = "crop_box", // media_box
pybind11::arg("do_sanitization") = true,
pybind11::arg("keep_char_cells") = true,
pybind11::arg("keep_lines") = true,
pybind11::arg("keep_bitmaps") = true,
pybind11::arg("create_word_cells") = true,
pybind11::arg("create_line_cells") = true,
R"(
Parse a specific page of the PDF document identified by its unique key and return a JSON representation.

Expand All @@ -418,6 +437,11 @@ PYBIND11_MODULE(pdf_parsers, m) {
page (int): The page number to parse.
page_boundary (str): The page boundary specification for parsing [choices: crop_box, media_box].
do_sanitization: Sanitize the chars into lines [default=true].
keep_char_cells: keep all the individual char's
keep_lines: keep all the lines
keep_bitmaps: keep all the bitmap resources
create_word_cells: create words from the char-cells
create_line_cells: create lines from the char-cells

Returns:
dict: A JSON representation of the parsed page.)")
Expand Down
144 changes: 142 additions & 2 deletions docling_parse/pdf_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Parser for PDF files"""

import hashlib
import logging
from io import BytesIO
from pathlib import Path
from typing import Dict, Iterator, List, Optional, Tuple, Union
Expand All @@ -25,14 +26,34 @@
from docling_parse.pdf_parsers import pdf_parser_v2 # type: ignore[import]
from docling_parse.pdf_parsers import pdf_sanitizer # type: ignore[import]

# Configure logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


class PdfDocument:

def iterate_pages(
self,
*,
keep_chars: bool = True,
keep_lines: bool = True,
keep_bitmaps: bool = True,
create_words: bool = True,
create_textlines: bool = True,
enforce_same_font: bool = True,
) -> Iterator[Tuple[int, SegmentedPdfPage]]:
for page_no in range(self.number_of_pages()):
yield page_no + 1, self.get_page(page_no + 1)
yield page_no + 1, self.get_page(
page_no + 1,
keep_chars=keep_chars,
keep_lines=keep_lines,
keep_bitmaps=keep_bitmaps,
create_words=create_words,
create_textlines=create_textlines,
enforce_same_font=enforce_same_font,
)

def __init__(
self,
Expand Down Expand Up @@ -125,25 +146,38 @@ def get_page(
self,
page_no: int,
*,
keep_chars: bool = True,
keep_lines: bool = True,
keep_bitmaps: bool = True,
create_words: bool = True,
create_textlines: bool = True,
enforce_same_font: bool = True,
do_sanitization: bool = False,
) -> SegmentedPdfPage:
if page_no in self._pages.keys():
return self._pages[page_no]
else:
if 1 <= page_no <= self.number_of_pages():

doc_dict = self._parser.parse_pdf_from_key_on_page(
key=self._key,
page=page_no - 1,
page_boundary=self._boundary_type,
do_sanitization=False,
do_sanitization=do_sanitization,
keep_char_cells=keep_chars,
keep_lines=keep_lines,
keep_bitmaps=keep_bitmaps,
create_word_cells=create_words,
create_line_cells=create_textlines,
)
for pi, page in enumerate(
doc_dict["pages"]
): # only one page is expected
self._pages[page_no] = self._to_segmented_page(
page=page["original"],
keep_chars=keep_chars,
keep_lines=keep_lines,
keep_bitmaps=keep_bitmaps,
create_words=create_words,
create_textlines=create_textlines,
enforce_same_font=enforce_same_font,
Expand Down Expand Up @@ -241,6 +275,7 @@ def _to_page_geometry(self, dimension: dict) -> PdfPageGeometry:
bleed_bbox=bleed_bbox,
)

"""
def _to_cells(self, cells: dict) -> List[Union[PdfTextCell, TextCell]]:

assert "data" in cells, '"data" in cells'
Expand Down Expand Up @@ -279,6 +314,64 @@ def _to_cells(self, cells: dict) -> List[Union[PdfTextCell, TextCell]]:
result.append(cell)

return result
"""

def _to_cells(self, cells: dict) -> List[Union[PdfTextCell, TextCell]]:
assert "data" in cells, '"data" in cells'
assert "header" in cells, '"header" in cells'

data = cells["data"]
header = cells["header"]

# Pre-compute header indices as local variables
r_x0_idx = header.index("r_x0")
r_y0_idx = header.index("r_y0")
r_x1_idx = header.index("r_x1")
r_y1_idx = header.index("r_y1")
r_x2_idx = header.index("r_x2")
r_y2_idx = header.index("r_y2")
r_x3_idx = header.index("r_x3")
r_y3_idx = header.index("r_y3")
text_idx = header.index("text")
font_key_idx = header.index("font-key")
font_name_idx = header.index("font-name")
widget_idx = header.index("widget")
left_to_right_idx = header.index("left_to_right")
rendering_mode_idx = header.index("rendering-mode")

# Pre-allocate list with exact size
data_len = len(data)
result: List[Union[PdfTextCell, TextCell]] = [None] * data_len # type: ignore

for ind, row in enumerate(data):
rect = BoundingRectangle(
r_x0=row[r_x0_idx],
r_y0=row[r_y0_idx],
r_x1=row[r_x1_idx],
r_y1=row[r_y1_idx],
r_x2=row[r_x2_idx],
r_y2=row[r_y2_idx],
r_x3=row[r_x3_idx],
r_y3=row[r_y3_idx],
)

result[ind] = PdfTextCell(
rect=rect,
text=row[text_idx],
orig=row[text_idx],
font_key=row[font_key_idx],
font_name=row[font_name_idx],
widget=row[widget_idx],
text_direction=(
TextDirection.LEFT_TO_RIGHT
if row[left_to_right_idx]
else TextDirection.RIGHT_TO_LEFT
),
index=ind,
rendering_mode=row[rendering_mode_idx],
)

return result

def _to_bitmap_resources(self, images: dict) -> List[BitmapResource]:

Expand Down Expand Up @@ -331,11 +424,16 @@ def _to_segmented_page(
self,
page: dict,
*,
keep_chars: bool = True,
keep_lines: bool = True,
keep_bitmaps: bool = True,
create_words: bool,
create_textlines: bool,
enforce_same_font: bool = True,
) -> SegmentedPdfPage:

# FIXME: this might be inefficient ...
"""
char_cells = self._to_cells(page["cells"])
segmented_page = SegmentedPdfPage(
dimension=self._to_page_geometry(page["dimension"]),
Expand All @@ -354,6 +452,48 @@ def _to_segmented_page(
self._create_textline_cells(
segmented_page, enforce_same_font=enforce_same_font
)
"""

char_cells = []
if keep_chars:
assert "cells" in page
char_cells = self._to_cells(page["cells"])

lines = []
if keep_lines:
assert "lines" in page
lines = self._to_lines(page["lines"])

bitmap_resources = []
if keep_bitmaps:
assert "images" in page
bitmap_resources = self._to_bitmap_resources(page["images"])

segmented_page = SegmentedPdfPage(
dimension=self._to_page_geometry(page["dimension"]),
char_cells=char_cells,
word_cells=[],
textline_cells=[],
has_chars=len(char_cells) > 0,
bitmap_resources=bitmap_resources, # self._to_bitmap_resources(page["images"]),
lines=lines, # self._to_lines(page["lines"]),
)

if create_words and ("word_cells" in page):
segmented_page.word_cells = self._to_cells(page["word_cells"])
elif create_words:
self._create_word_cells(segmented_page, enforce_same_font=enforce_same_font)
else:
logging.warning("No `words` will be created for segmented_page")

if create_textlines and ("word_cells" in page):
segmented_page.textline_cells = self._to_cells(page["line_cells"])
elif create_textlines:
self._create_textline_cells(
segmented_page, enforce_same_font=enforce_same_font
)
else:
logging.warning("No `text_lines` will be created for segmented_page")

return segmented_page

Expand Down
Loading
Loading