docling-project · PeterStaar-IBM · Sep 3, 2025 · Sep 2, 2025 · Sep 2, 2025 · Sep 2, 2025
diff --git a/app/pybind_parse.cpp b/app/pybind_parse.cpp
@@ -381,7 +381,8 @@ PYBIND11_MODULE(pdf_parsers, m) {
 	 [](docling::docling_parser_v2 &self,
 	    const std::string &key,
 	    const std::string &page_boundary,
-	    bool do_sanitization) -> nlohmann::json {
+	    bool do_sanitization
+	    ) -> nlohmann::json {
 	   return self.parse_pdf_from_key(key, page_boundary, do_sanitization);
 	 },
 	 pybind11::arg("key"),
@@ -403,13 +404,31 @@ PYBIND11_MODULE(pdf_parsers, m) {
 	    const std::string &key,
 	    int page,
 	    const std::string &page_boundary,
-	    bool do_sanitization) -> nlohmann::json {
-	   return self.parse_pdf_from_key_on_page(key, page, page_boundary, do_sanitization);
+	    bool do_sanitization,
+	    bool keep_char_cells,
+	    bool keep_lines,
+	    bool keep_bitmaps,
+	    bool create_word_cells,
+	    bool create_line_cells) -> nlohmann::json {
+    return self.parse_pdf_from_key_on_page(key,
+					   page,
+					   page_boundary,
+					   do_sanitization,
+					   keep_char_cells,
+					   keep_lines,
+					   keep_bitmaps,
+					   create_word_cells,
+					   create_line_cells);
 	 },
-	 pybind11::arg("key"),
-	 pybind11::arg("page"),
-	 pybind11::arg("page_boundary") = "crop_box", // media_box
-	 pybind11::arg("do_sanitization") = true, // media_box
+    pybind11::arg("key"),
+    pybind11::arg("page"),
+    pybind11::arg("page_boundary") = "crop_box", // media_box
+    pybind11::arg("do_sanitization") = true,
+    pybind11::arg("keep_char_cells") = true,
+    pybind11::arg("keep_lines") = true,
+    pybind11::arg("keep_bitmaps") = true,
+    pybind11::arg("create_word_cells") = true,
+    pybind11::arg("create_line_cells") = true,
 	 R"(
     Parse a specific page of the PDF document identified by its unique key and return a JSON representation.
 
@@ -418,6 +437,11 @@ PYBIND11_MODULE(pdf_parsers, m) {
         page (int): The page number to parse.
         page_boundary (str): The page boundary specification for parsing [choices: crop_box, media_box].
         do_sanitization: Sanitize the chars into lines [default=true].
+        keep_char_cells: keep all the individual char's
+        keep_lines: keep all the lines
+        keep_bitmaps: keep all the bitmap resources
+        create_word_cells: create words from the char-cells
+        create_line_cells: create lines from the char-cells
 
     Returns:
         dict: A JSON representation of the parsed page.)")

diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py
@@ -1,6 +1,7 @@
 """Parser for PDF files"""
 
 import hashlib
+import logging
 from io import BytesIO
 from pathlib import Path
 from typing import Dict, Iterator, List, Optional, Tuple, Union
@@ -25,14 +26,34 @@
 from docling_parse.pdf_parsers import pdf_parser_v2  # type: ignore[import]
 from docling_parse.pdf_parsers import pdf_sanitizer  # type: ignore[import]
 
+# NOTE(review): import-time side effect -- this configures the ROOT logger (INFO level).
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
 
 class PdfDocument:
 
     def iterate_pages(
         self,
+        *,  # every option below is keyword-only
+        keep_chars: bool = True,
+        keep_lines: bool = True,
+        keep_bitmaps: bool = True,
+        create_words: bool = True,
+        create_textlines: bool = True,
+        enforce_same_font: bool = True,
     ) -> Iterator[Tuple[int, SegmentedPdfPage]]:
         for page_no in range(self.number_of_pages()):
-            yield page_no + 1, self.get_page(page_no + 1)
+            yield page_no + 1, self.get_page(  # yields (1-based page number, page)
+                page_no + 1,
+                keep_chars=keep_chars,
+                keep_lines=keep_lines,
+                keep_bitmaps=keep_bitmaps,
+                create_words=create_words,
+                create_textlines=create_textlines,
+                enforce_same_font=enforce_same_font,  # NOTE: do_sanitization is not forwarded
+            )
 
     def __init__(
         self,
@@ -125,25 +146,38 @@ def get_page(
         self,
         page_no: int,
         *,
+        keep_chars: bool = True,
+        keep_lines: bool = True,
+        keep_bitmaps: bool = True,
         create_words: bool = True,
         create_textlines: bool = True,
         enforce_same_font: bool = True,
+        do_sanitization: bool = False,
     ) -> SegmentedPdfPage:
         if page_no in self._pages.keys():
             return self._pages[page_no]
         else:
             if 1 <= page_no <= self.number_of_pages():
+
                 doc_dict = self._parser.parse_pdf_from_key_on_page(
                     key=self._key,
                     page=page_no - 1,
                     page_boundary=self._boundary_type,
-                    do_sanitization=False,
+                    do_sanitization=do_sanitization,
+                    keep_char_cells=keep_chars,
+                    keep_lines=keep_lines,
+                    keep_bitmaps=keep_bitmaps,
+                    create_word_cells=create_words,
+                    create_line_cells=create_textlines,
                 )
                 for pi, page in enumerate(
                     doc_dict["pages"]
                 ):  # only one page is expected
                     self._pages[page_no] = self._to_segmented_page(
                         page=page["original"],
+                        keep_chars=keep_chars,
+                        keep_lines=keep_lines,
+                        keep_bitmaps=keep_bitmaps,
                         create_words=create_words,
                         create_textlines=create_textlines,
                         enforce_same_font=enforce_same_font,
@@ -241,6 +275,7 @@ def _to_page_geometry(self, dimension: dict) -> PdfPageGeometry:
             bleed_bbox=bleed_bbox,
         )
 
+    """
     def _to_cells(self, cells: dict) -> List[Union[PdfTextCell, TextCell]]:
 
         assert "data" in cells, '"data" in cells'
@@ -279,6 +314,64 @@ def _to_cells(self, cells: dict) -> List[Union[PdfTextCell, TextCell]]:
             result.append(cell)
 
         return result
+    """
+
+    def _to_cells(self, cells: dict) -> List[Union[PdfTextCell, TextCell]]:
+        assert "data" in cells, '"data" in cells'
+        assert "header" in cells, '"header" in cells'
+        # `cells` is a table: "header" names the columns, "data" holds one row per cell
+        data = cells["data"]
+        header = cells["header"]
+
+        # Look up every column position in `header` once, before the per-row loop
+        r_x0_idx = header.index("r_x0")
+        r_y0_idx = header.index("r_y0")
+        r_x1_idx = header.index("r_x1")
+        r_y1_idx = header.index("r_y1")
+        r_x2_idx = header.index("r_x2")
+        r_y2_idx = header.index("r_y2")
+        r_x3_idx = header.index("r_x3")
+        r_y3_idx = header.index("r_y3")
+        text_idx = header.index("text")
+        font_key_idx = header.index("font-key")
+        font_name_idx = header.index("font-name")
+        widget_idx = header.index("widget")
+        left_to_right_idx = header.index("left_to_right")
+        rendering_mode_idx = header.index("rendering-mode")
+
+        # One slot per row, filled by index in the loop below (None is only a placeholder)
+        data_len = len(data)
+        result: List[Union[PdfTextCell, TextCell]] = [None] * data_len  # type: ignore
+
+        for ind, row in enumerate(data):
+            rect = BoundingRectangle(
+                r_x0=row[r_x0_idx],
+                r_y0=row[r_y0_idx],
+                r_x1=row[r_x1_idx],
+                r_y1=row[r_y1_idx],
+                r_x2=row[r_x2_idx],
+                r_y2=row[r_y2_idx],
+                r_x3=row[r_x3_idx],
+                r_y3=row[r_y3_idx],
+            )
+
+            result[ind] = PdfTextCell(
+                rect=rect,
+                text=row[text_idx],
+                orig=row[text_idx],  # same source column as `text`
+                font_key=row[font_key_idx],
+                font_name=row[font_name_idx],
+                widget=row[widget_idx],
+                text_direction=(
+                    TextDirection.LEFT_TO_RIGHT
+                    if row[left_to_right_idx]
+                    else TextDirection.RIGHT_TO_LEFT
+                ),
+                index=ind,  # position of the row within `data`
+                rendering_mode=row[rendering_mode_idx],
+            )
+
+        return result
 
     def _to_bitmap_resources(self, images: dict) -> List[BitmapResource]:
 
@@ -331,11 +424,16 @@ def _to_segmented_page(
         self,
         page: dict,
         *,
+        keep_chars: bool = True,
+        keep_lines: bool = True,
+        keep_bitmaps: bool = True,
         create_words: bool,
         create_textlines: bool,
         enforce_same_font: bool = True,
     ) -> SegmentedPdfPage:
 
+        # FIXME: this might be inefficient ...
+        """
         char_cells = self._to_cells(page["cells"])
         segmented_page = SegmentedPdfPage(
             dimension=self._to_page_geometry(page["dimension"]),
@@ -354,6 +452,48 @@ def _to_segmented_page(
             self._create_textline_cells(
                 segmented_page, enforce_same_font=enforce_same_font
             )
+        """
+
+        char_cells = []  # stays empty when keep_chars is False
+        if keep_chars:
+            assert "cells" in page
+            char_cells = self._to_cells(page["cells"])
+
+        lines = []  # stays empty when keep_lines is False
+        if keep_lines:
+            assert "lines" in page
+            lines = self._to_lines(page["lines"])
+
+        bitmap_resources = []  # stays empty when keep_bitmaps is False
+        if keep_bitmaps:
+            assert "images" in page
+            bitmap_resources = self._to_bitmap_resources(page["images"])
+
+        segmented_page = SegmentedPdfPage(
+            dimension=self._to_page_geometry(page["dimension"]),
+            char_cells=char_cells,
+            word_cells=[],  # starts empty; handled by the create_words branch below
+            textline_cells=[],  # starts empty; handled by the create_textlines branch below
+            has_chars=len(char_cells) > 0,
+            bitmap_resources=bitmap_resources,
+            lines=lines,
+        )
+
+        if create_words and ("word_cells" in page):
+            segmented_page.word_cells = self._to_cells(page["word_cells"])
+        elif create_words:
+            self._create_word_cells(segmented_page, enforce_same_font=enforce_same_font)
+        else:
+            logging.warning("No `words` will be created for segmented_page")
+
+        if create_textlines and ("line_cells" in page):  # guard the key read below
+            segmented_page.textline_cells = self._to_cells(page["line_cells"])
+        elif create_textlines:
+            self._create_textline_cells(
+                segmented_page, enforce_same_font=enforce_same_font
+            )
+        else:
+            logging.warning("No `text_lines` will be created for segmented_page")
 
         return segmented_page