Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 64 additions & 28 deletions docling_core/transforms/visualizer/reading_order_visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@
from typing_extensions import override

from docling_core.transforms.visualizer.base import BaseVisualizer
from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument
from docling_core.types.doc.document import (
ContentLayer,
DocItem,
DoclingDocument,
PictureItem,
)


class _NumberDrawingData(BaseModel):
Expand All @@ -33,6 +38,20 @@ class Params(BaseModel):
base_visualizer: Optional[BaseVisualizer] = None
params: Params = Params()

def _get_picture_context(
self, elem: DocItem, doc: DoclingDocument
) -> Optional[str]:
"""Get the picture self_ref if element is nested inside a PictureItem, None otherwise."""
current = elem
while current.parent is not None:
parent = current.parent.resolve(doc)
if isinstance(parent, PictureItem):
return parent.self_ref
if not isinstance(parent, DocItem):
break
current = parent
return None

def _draw_arrow(
self,
draw: ImageDraw.ImageDraw,
Expand All @@ -55,10 +74,10 @@ def _draw_arrow(
# Calculate the arrowhead points
dx = end_point[0] - start_point[0]
dy = end_point[1] - start_point[1]
angle = (dx**2 + dy**2) ** 0.5 + 0.01 # Length of the arrow shaft
distance = (dx**2 + dy**2) ** 0.5 + 0.01 # Length of the arrow shaft

# Normalized direction vector for the arrow shaft
ux, uy = dx / angle, dy / angle
ux, uy = dx / distance, dy / distance

# Base of the arrowhead
base_x = end_point[0] - ux * arrowhead_length
Expand Down Expand Up @@ -89,16 +108,34 @@ def _draw_doc_reading_order(
except OSError:
# Fallback to default font if arial is not available
font = ImageFont.load_default()
x0, y0 = None, None
number_data_to_draw: dict[Optional[int], list[_NumberDrawingData]] = {}
my_images: dict[Optional[int], Image] = images or {}
prev_page = None
i = 0

# Separate reading order paths for outside vs inside pictures
# Key: (page_no, picture_ref_or_None) -> (x0, y0, element_index)
# picture_ref is None for elements outside any picture, otherwise the picture's self_ref
reading_order_state: dict[
tuple[int, Optional[str]], tuple[float, float, int]
] = {}
number_data_to_draw: dict[int, list[_NumberDrawingData]] = {}
# Only int keys are used (from prov.page_no), even if input images has Optional[int] keys
my_images: dict[int, Image] = {
k: v for k, v in (images or {}).items() if k is not None
}
prev_page: Optional[int] = None
element_index = 0

for elem, _ in doc.iterate_items(
included_content_layers=self.params.content_layers,
traverse_pictures=True,
):
if not isinstance(elem, DocItem):
continue

picture_ref = self._get_picture_context(elem, doc)
# Include all elements in reading order:
# - Top-level PictureItems are part of the outer reading order (picture_ref is None)
# - Nested PictureItems are part of their parent picture's reading order (picture_ref is not None)
# - Other elements follow the same pattern

if len(elem.prov) == 0:
continue # Skip elements without provenances

Expand All @@ -110,9 +147,9 @@ def _draw_doc_reading_order(
number_data_to_draw[page_no] = []

if image is None or prev_page is None or page_no != prev_page:
# new page begins
# new page begins - reset all reading order paths
prev_page = page_no
x0 = y0 = None
reading_order_state.clear()

if image is None:
page_image = doc.pages[page_no].image
Expand Down Expand Up @@ -140,35 +177,34 @@ def _draw_doc_reading_order(
if ro_bbox.b > ro_bbox.t:
ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b

if x0 is None and y0 is None:
# is_root= True
x0 = (ro_bbox.l + ro_bbox.r) / 2.0
y0 = (ro_bbox.b + ro_bbox.t) / 2.0
path_key = (page_no, picture_ref)
state = reading_order_state.get(path_key)

x1 = (ro_bbox.l + ro_bbox.r) / 2.0
y1 = (ro_bbox.b + ro_bbox.t) / 2.0

if state is None:
# Start of a new reading order path (outside or inside picture)
reading_order_state[path_key] = (x1, y1, element_index)
number_data_to_draw[page_no].append(
_NumberDrawingData(
xy=(x0, y0),
text=f"{i}",
xy=(x1, y1),
text=f"{element_index}",
)
)
i += 1

element_index += 1
else:
# is_root = False
assert x0 is not None
assert y0 is not None

x1 = (ro_bbox.l + ro_bbox.r) / 2.0
y1 = (ro_bbox.b + ro_bbox.t) / 2.0

# Continue existing reading order path
x0, y0, _ = state
# Use different color for picture-internal paths
arrow_color = "blue" if picture_ref is not None else "red"
draw = self._draw_arrow(
draw=draw,
arrow_coords=(x0, y0, x1, y1),
line_width=2,
color="red",
color=arrow_color,
)

x0, y0 = x1, y1
reading_order_state[path_key] = (x1, y1, state[2])

if self.params.show_branch_numbering:
# post-drawing the numbers to ensure they are rendered on top-layer
Expand Down
1 change: 1 addition & 0 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@
DocItemLabel.PAGE_HEADER,
DocItemLabel.PAGE_FOOTER,
DocItemLabel.KEY_VALUE_REGION,
DocItemLabel.EMPTY_VALUE,
}

DOCUMENT_TOKENS_EXPORT_LABELS = DEFAULT_EXPORT_LABELS.copy()
Expand Down
Binary file modified test/data/viz/2408.09869v3_enriched_viz_p3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified test/data/viz/2408.09869v3_enriched_viz_wout_lbl_p3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading