Skip to content

The extract_items_range method generates a document with inconsistent hierarchy #427

@olivierantonelli

Description

@olivierantonelli

Bug

The DoclingDocument.extract_items_range method may create an invalid DoclingDocument when the original document contains tables with RichTableCell cells.

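During extract_items_range, the GroupItems in the given range are renumbered, but the references to those GroupItems held by RichTableCell cells are not updated accordingly. The extracted document therefore contains dangling references, which results in an "IndexError: list index out of range" exception when it is used (for example, when calling export_to_markdown).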

Steps to reproduce

from docling_core.types.doc import DoclingDocument, RichTableCell, TableCell, DocItemLabel, TableData, Formatting

# Start with a list group ahead of the table, so that the group backing the
# rich cell is not the first group in the source document.
doc = DoclingDocument(name="sample")

leading_list = doc.add_list_group(name="A list")
doc.add_list_item(text="First item", parent=leading_list)

table_item = doc.add_table(data=TableData(num_rows=2, num_cols=1))

# Row 0: an ordinary text cell.
plain_cell = TableCell(
    text="R1C1",
    start_row_offset_idx=0,
    end_row_offset_idx=1,
    start_col_offset_idx=0,
    end_col_offset_idx=1,
)
doc.add_table_cell(table_item=table_item, cell=plain_cell)

# Row 1: a rich cell that points (via `ref`) at a group attached to the table.
rich_group = doc.add_group(name="rich_cell_group", parent=table_item)
rich_cell = RichTableCell(
    start_row_offset_idx=1,
    end_row_offset_idx=2,
    start_col_offset_idx=0,
    end_col_offset_idx=1,
    ref=rich_group.get_ref(),
)
doc.add_text(
    label=DocItemLabel.TEXT,
    text="R2C1",
    parent=rich_group,
    formatting=Formatting(italic=True),
)
doc.add_table_cell(table_item=table_item, cell=rich_cell)

# Create a new DoclingDocument containing only the TableItem and all of its children.
only_table = doc.tables[0]
doc_with_only_the_table = doc.extract_items_range(start=only_table, end=only_table)

# export_to_markdown raises an "IndexError: list index out of range" exception
doc_with_only_the_table.export_to_markdown()

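Even without calling export_to_markdown, the inconsistency is visible: the RichTableCell in 'doc_with_only_the_table' still references '#/groups/1', which no longer exists (the group now has the self_ref '#/groups/0'):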

doc_with_only_the_table.tables[0].data.grid[1][0].ref
RefItem(cref='#/groups/1')
list(doc_with_only_the_table.iterate_items(with_groups=True))
[(GroupItem(self_ref='#/body', parent=None, children=[RefItem(cref='#/tables/0')], content_layer=<ContentLayer.BODY: 'body'>, meta=None, name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>),
  0),
 (TableItem(self_ref='#/tables/0', parent=RefItem(cref='#/body'), children=[RefItem(cref='#/groups/0')], content_layer=<ContentLayer.BODY: 'body'>, meta=None, label=<DocItemLabel.TABLE: 'table'>, prov=[], captions=[], references=[], footnotes=[], image=None, data=TableData(table_cells=[TableCell(bbox=None, row_span=1, col_span=1, start_row_offset_idx=0, end_row_offset_idx=1, start_col_offset_idx=0, end_col_offset_idx=1, text='R1C1', column_header=False, row_header=False, row_section=False, fillable=False), RichTableCell(bbox=None, row_span=1, col_span=1, start_row_offset_idx=1, end_row_offset_idx=2, start_col_offset_idx=0, end_col_offset_idx=1, text='', column_header=False, row_header=False, row_section=False, fillable=False, ref=RefItem(cref='#/groups/1'))], num_rows=2, num_cols=1, grid=[[TableCell(bbox=None, row_span=1, col_span=1, start_row_offset_idx=0, end_row_offset_idx=1, start_col_offset_idx=0, end_col_offset_idx=1, text='R1C1', column_header=False, row_header=False, row_section=False, fillable=False)], [RichTableCell(bbox=None, row_span=1, col_span=1, start_row_offset_idx=1, end_row_offset_idx=2, start_col_offset_idx=0, end_col_offset_idx=1, text='', column_header=False, row_header=False, row_section=False, fillable=False, ref=RefItem(cref='#/groups/1'))]]), annotations=[]),
  1),
 (GroupItem(self_ref='#/groups/0', parent=RefItem(cref='#/tables/0'), children=[RefItem(cref='#/texts/0')], content_layer=<ContentLayer.BODY: 'body'>, meta=None, name='rich_cell_group', label=<GroupLabel.UNSPECIFIED: 'unspecified'>),
  2),
 (TextItem(self_ref='#/texts/0', parent=RefItem(cref='#/groups/0'), children=[], content_layer=<ContentLayer.BODY: 'body'>, meta=None, label=<DocItemLabel.TEXT: 'text'>, prov=[], orig='R2C1', text='R2C1', formatting=Formatting(bold=False, italic=True, underline=False, strikethrough=False, script=<Script.BASELINE: 'baseline'>), hyperlink=None),
  3)]

Docling version

docling-core 2.51.1
Python: cpython 3.12.3
Platform: 6.6.87.2-microsoft-standard-WSL2

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions