-
Notifications
You must be signed in to change notification settings - Fork 105
Open
Description
Bug
The DoclingDocument.extract_items_range method may create an invalid DoclingDocument when the original document contains tables with RichTableCell cells.
During extract_items_range, the GroupItems in the given range are renumbered, but the references to those GroupItems held by RichTableCell cells are not updated accordingly, which later results in an "IndexError: list index out of range" exception (for example, when calling export_to_markdown).
Steps to reproduce
from docling_core.types.doc import DoclingDocument, RichTableCell, TableCell, DocItemLabel, TableData, Formatting
doc = DoclingDocument(name="sample")
list_before_table = doc.add_list_group(name="A list")
doc.add_list_item(text="First item", parent=list_before_table)
table_item = doc.add_table(data=TableData(num_rows=2, num_cols=1))
cell = TableCell(text="R1C1", start_row_offset_idx=0, end_row_offset_idx=1, start_col_offset_idx=0, end_col_offset_idx=1)
doc.add_table_cell(table_item=table_item, cell=cell)
rich_group = doc.add_group(name="rich_cell_group", parent=table_item)
rich_cell = RichTableCell(start_row_offset_idx=1, end_row_offset_idx=2, start_col_offset_idx=0, end_col_offset_idx=1, ref=rich_group.get_ref(),)
doc.add_text(label=DocItemLabel.TEXT,text="R2C1", parent=rich_group, formatting=Formatting(italic=True))
doc.add_table_cell(table_item=table_item, cell=rich_cell)
# Here we create a new Docling document with the TableItem and all its children
doc_with_only_the_table = doc.extract_items_range(start=doc.tables[0], end=doc.tables[0])
# export_to_markdown raises an "IndexError: list index out of range" exception
doc_with_only_the_table.export_to_markdown()

Here, even without calling export_to_markdown, 'doc_with_only_the_table' still references '#/groups/1', which no longer exists:
doc_with_only_the_table.tables[0].data.grid[1][0].ref
RefItem(cref='#/groups/1')
list(doc_with_only_the_table.iterate_items(with_groups=True))
[(GroupItem(self_ref='#/body', parent=None, children=[RefItem(cref='#/tables/0')], content_layer=<ContentLayer.BODY: 'body'>, meta=None, name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>),
0),
(TableItem(self_ref='#/tables/0', parent=RefItem(cref='#/body'), children=[RefItem(cref='#/groups/0')], content_layer=<ContentLayer.BODY: 'body'>, meta=None, label=<DocItemLabel.TABLE: 'table'>, prov=[], captions=[], references=[], footnotes=[], image=None, data=TableData(table_cells=[TableCell(bbox=None, row_span=1, col_span=1, start_row_offset_idx=0, end_row_offset_idx=1, start_col_offset_idx=0, end_col_offset_idx=1, text='R1C1', column_header=False, row_header=False, row_section=False, fillable=False), RichTableCell(bbox=None, row_span=1, col_span=1, start_row_offset_idx=1, end_row_offset_idx=2, start_col_offset_idx=0, end_col_offset_idx=1, text='', column_header=False, row_header=False, row_section=False, fillable=False, ref=RefItem(cref='#/groups/1'))], num_rows=2, num_cols=1, grid=[[TableCell(bbox=None, row_span=1, col_span=1, start_row_offset_idx=0, end_row_offset_idx=1, start_col_offset_idx=0, end_col_offset_idx=1, text='R1C1', column_header=False, row_header=False, row_section=False, fillable=False)], [RichTableCell(bbox=None, row_span=1, col_span=1, start_row_offset_idx=1, end_row_offset_idx=2, start_col_offset_idx=0, end_col_offset_idx=1, text='', column_header=False, row_header=False, row_section=False, fillable=False, ref=RefItem(cref='#/groups/1'))]]), annotations=[]),
1),
(GroupItem(self_ref='#/groups/0', parent=RefItem(cref='#/tables/0'), children=[RefItem(cref='#/texts/0')], content_layer=<ContentLayer.BODY: 'body'>, meta=None, name='rich_cell_group', label=<GroupLabel.UNSPECIFIED: 'unspecified'>),
2),
(TextItem(self_ref='#/texts/0', parent=RefItem(cref='#/groups/0'), children=[], content_layer=<ContentLayer.BODY: 'body'>, meta=None, label=<DocItemLabel.TEXT: 'text'>, prov=[], orig='R2C1', text='R2C1', formatting=Formatting(bold=False, italic=True, underline=False, strikethrough=False, script=<Script.BASELINE: 'baseline'>), hyperlink=None),
3)]
Docling version
docling-core 2.51.1
Python: cpython 3.12.3
Platform: 6.6.87.2-microsoft-standard-WSL2
Metadata
Metadata
Assignees
Labels
No labels