diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py index 77979614..914d0b8f 100644 --- a/docling_core/transforms/chunker/hierarchical_chunker.py +++ b/docling_core/transforms/chunker/hierarchical_chunker.py @@ -19,6 +19,7 @@ from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta from docling_core.types import DoclingDocument as DLDocument from docling_core.types.doc.document import ( + CodeItem, DocItem, DocumentOrigin, LevelNumber, @@ -199,8 +200,10 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]: heading_by_level.pop(k, None) continue - if isinstance(item, TextItem) or ( - (not self.merge_list_items) and isinstance(item, ListItem) + if ( + isinstance(item, TextItem) + or ((not self.merge_list_items) and isinstance(item, ListItem)) + or isinstance(item, CodeItem) ): text = item.text elif isinstance(item, TableItem): diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 4d5237a2..3d4ecd97 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -677,51 +677,6 @@ def export_to_document_tokens( return body -class CodeItem(TextItem): - """CodeItem.""" - - label: typing.Literal[DocItemLabel.CODE] = ( - DocItemLabel.CODE # type: ignore[assignment] - ) - code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN - - def export_to_document_tokens( - self, - doc: "DoclingDocument", - new_line: str = "", - xsize: int = 500, - ysize: int = 500, - add_location: bool = True, - add_content: bool = True, - ): - r"""Export text element to document tokens format. 
- - :param doc: "DoclingDocument": - :param new_line: str (Default value = "") - :param xsize: int: (Default value = 500) - :param ysize: int: (Default value = 500) - :param add_location: bool: (Default value = True) - :param add_content: bool: (Default value = True) - - """ - body = f"{DocumentToken.BEG_CODE.value}{new_line}" - - if add_location: - body += self.get_location_tokens( - doc=doc, - new_line=new_line, - xsize=xsize, - ysize=ysize, - ) - - if add_content and self.text is not None: - body += f"<_{self.code_language.value}_>{self.text}{new_line}" - - body += f"{DocumentToken.END_CODE.value}\n" - - return body - - class SectionHeaderItem(TextItem): """SectionItem.""" @@ -812,6 +767,53 @@ def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]: return super().get_image(doc=doc) +class CodeItem(FloatingItem): + """CodeItem.""" + + label: typing.Literal[DocItemLabel.CODE] = ( + DocItemLabel.CODE # type: ignore[assignment] + ) + orig: str # untreated representation + text: str # sanitized representation + code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN + + def export_to_document_tokens( + self, + doc: "DoclingDocument", + new_line: str = "", + xsize: int = 500, + ysize: int = 500, + add_location: bool = True, + add_content: bool = True, + ): + r"""Export text element to document tokens format. 
+ + :param doc: "DoclingDocument": + :param new_line: str (Default value = "") + :param xsize: int: (Default value = 500) + :param ysize: int: (Default value = 500) + :param add_location: bool: (Default value = True) + :param add_content: bool: (Default value = True) + + """ + body = f"{DocumentToken.BEG_CODE.value}{new_line}" + + if add_location: + body += self.get_location_tokens( + doc=doc, + new_line=new_line, + xsize=xsize, + ysize=ysize, + ) + + if add_content and self.text is not None: + body += f"<_{self.code_language.value}_>{self.text}{new_line}" + + body += f"{DocumentToken.END_CODE.value}\n" + + return body + + class PictureItem(FloatingItem): """PictureItem.""" @@ -1763,6 +1765,7 @@ def add_code( text: str, code_language: Optional[CodeLanguageLabel] = None, orig: Optional[str] = None, + caption: Optional[Union[TextItem, RefItem]] = None, prov: Optional[ProvenanceItem] = None, parent: Optional[NodeItem] = None, content_layer: Optional[ContentLayer] = None, @@ -1772,6 +1775,8 @@ def add_code( :param text: str: :param code_language: Optional[str]: (Default value = None) :param orig: Optional[str]: (Default value = None) + :param caption: Optional[Union[TextItem, RefItem]]: + (Default value = None) :param prov: Optional[ProvenanceItem]: (Default value = None) :param parent: Optional[NodeItem]: (Default value = None) """ @@ -1795,6 +1800,8 @@ def add_code( code_item.content_layer = content_layer if prov: code_item.prov.append(prov) + if caption: + code_item.captions.append(caption.get_ref()) self.texts.append(code_item) parent.children.append(RefItem(cref=cref)) diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 17dbb84b..409ea306 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -208,6 +208,41 @@ "title": "Prov", "type": "array" }, + "captions": { + "default": [], + "items": { + "$ref": "#/$defs/RefItem" + }, + "title": "Captions", + "type": "array" + }, + "references": { + "default": [], + "items": { +
"$ref": "#/$defs/RefItem" + }, + "title": "References", + "type": "array" + }, + "footnotes": { + "default": [], + "items": { + "$ref": "#/$defs/RefItem" + }, + "title": "Footnotes", + "type": "array" + }, + "image": { + "anyOf": [ + { + "$ref": "#/$defs/ImageRef" + }, + { + "type": "null" + } + ], + "default": null + }, "orig": { "title": "Orig", "type": "string" diff --git a/test/data/docling_document/unit/CodeItem.yaml b/test/data/docling_document/unit/CodeItem.yaml index f5238e01..4d80bcdc 100644 --- a/test/data/docling_document/unit/CodeItem.yaml +++ b/test/data/docling_document/unit/CodeItem.yaml @@ -1,4 +1,8 @@ children: [] +captions: [] +footnotes: [] +references: [] +image: null code_language: Python content_layer: body label: code