Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions docling_core/transforms/chunker/hierarchical_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
from docling_core.types import DoclingDocument as DLDocument
from docling_core.types.doc.document import (
CodeItem,
DocItem,
DocumentOrigin,
LevelNumber,
Expand Down Expand Up @@ -199,8 +200,10 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
heading_by_level.pop(k, None)
continue

if isinstance(item, TextItem) or (
(not self.merge_list_items) and isinstance(item, ListItem)
if (
isinstance(item, TextItem)
or ((not self.merge_list_items) and isinstance(item, ListItem))
or isinstance(item, CodeItem)
):
text = item.text
elif isinstance(item, TableItem):
Expand Down
97 changes: 52 additions & 45 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -677,51 +677,6 @@ def export_to_document_tokens(
return body


class CodeItem(TextItem):
    """Document item holding a code snippet and its language label."""

    label: typing.Literal[DocItemLabel.CODE] = (
        DocItemLabel.CODE  # type: ignore[assignment]
    )
    # Language of the snippet; UNKNOWN unless set explicitly.
    code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN

    def export_to_document_tokens(
        self,
        doc: "DoclingDocument",
        new_line: str = "",
        xsize: int = 500,
        ysize: int = 500,
        add_location: bool = True,
        add_content: bool = True,
    ):
        r"""Serialize this code item to the document tokens format.

        The result starts with the ``BEG_CODE`` token followed by
        ``new_line``, optionally carries the location tokens and the
        language-tagged content, and ends with the ``END_CODE`` token
        followed by a newline character.

        :param doc: "DoclingDocument": forwarded to ``get_location_tokens``
        :param new_line: str (Default value = "") separator appended after
            the opening token and after the content
        :param xsize: int: (Default value = 500) forwarded to
            ``get_location_tokens``
        :param ysize: int: (Default value = 500) forwarded to
            ``get_location_tokens``
        :param add_location: bool: (Default value = True) include the
            location tokens
        :param add_content: bool: (Default value = True) include the
            ``<_language_>``-prefixed text

        """
        # Collect the segments in output order and join them once at the end.
        segments = [f"{DocumentToken.BEG_CODE.value}{new_line}"]

        if add_location:
            segments.append(
                self.get_location_tokens(
                    doc=doc,
                    new_line=new_line,
                    xsize=xsize,
                    ysize=ysize,
                )
            )

        if add_content and self.text is not None:
            # The language label is emitted as a pseudo-tag before the text.
            segments.append(
                f"<_{self.code_language.value}_>{self.text}{new_line}"
            )

        segments.append(f"{DocumentToken.END_CODE.value}\n")

        return "".join(segments)


class SectionHeaderItem(TextItem):
"""SectionItem."""

Expand Down Expand Up @@ -812,6 +767,53 @@ def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
return super().get_image(doc=doc)


class CodeItem(FloatingItem):
    """Floating document item holding a code snippet and its language label."""

    label: typing.Literal[DocItemLabel.CODE] = (
        DocItemLabel.CODE  # type: ignore[assignment]
    )
    orig: str  # untreated representation
    text: str  # sanitized representation
    # Language of the snippet; UNKNOWN unless set explicitly.
    code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN

    def export_to_document_tokens(
        self,
        doc: "DoclingDocument",
        new_line: str = "",
        xsize: int = 500,
        ysize: int = 500,
        add_location: bool = True,
        add_content: bool = True,
    ):
        r"""Serialize this code item to the document tokens format.

        The result starts with the ``BEG_CODE`` token followed by
        ``new_line``, optionally carries the location tokens and the
        language-tagged content, and ends with the ``END_CODE`` token
        followed by a newline character.

        :param doc: "DoclingDocument": forwarded to ``get_location_tokens``
        :param new_line: str (Default value = "") separator appended after
            the opening token and after the content
        :param xsize: int: (Default value = 500) forwarded to
            ``get_location_tokens``
        :param ysize: int: (Default value = 500) forwarded to
            ``get_location_tokens``
        :param add_location: bool: (Default value = True) include the
            location tokens
        :param add_content: bool: (Default value = True) include the
            ``<_language_>``-prefixed text

        """
        opening = f"{DocumentToken.BEG_CODE.value}{new_line}"

        # Optional segments collapse to the empty string when disabled.
        location = (
            self.get_location_tokens(
                doc=doc,
                new_line=new_line,
                xsize=xsize,
                ysize=ysize,
            )
            if add_location
            else ""
        )

        # The language label is emitted as a pseudo-tag before the text.
        content = (
            f"<_{self.code_language.value}_>{self.text}{new_line}"
            if add_content and self.text is not None
            else ""
        )

        closing = f"{DocumentToken.END_CODE.value}\n"

        return opening + location + content + closing


class PictureItem(FloatingItem):
"""PictureItem."""

Expand Down Expand Up @@ -1763,6 +1765,7 @@ def add_code(
text: str,
code_language: Optional[CodeLanguageLabel] = None,
orig: Optional[str] = None,
caption: Optional[Union[TextItem, RefItem]] = None,
prov: Optional[ProvenanceItem] = None,
parent: Optional[NodeItem] = None,
content_layer: Optional[ContentLayer] = None,
Expand All @@ -1772,6 +1775,8 @@ def add_code(
:param text: str:
:param code_language: Optional[str]: (Default value = None)
:param orig: Optional[str]: (Default value = None)
:param caption: Optional[Union[TextItem, RefItem]]: (Default value = None)
:param prov: Optional[ProvenanceItem]: (Default value = None)
:param parent: Optional[NodeItem]: (Default value = None)
"""
Expand All @@ -1795,6 +1800,8 @@ def add_code(
code_item.content_layer = content_layer
if prov:
code_item.prov.append(prov)
if caption:
code_item.captions.append(caption.get_ref())

self.texts.append(code_item)
parent.children.append(RefItem(cref=cref))
Expand Down
35 changes: 35 additions & 0 deletions docs/DoclingDocument.json
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,41 @@
"title": "Prov",
"type": "array"
},
"captions": {
"default": [],
"items": {
"$ref": "#/$defs/RefItem"
},
"title": "Captions",
"type": "array"
},
"references": {
"default": [],
"items": {
"$ref": "#/$defs/RefItem"
},
"title": "References",
"type": "array"
},
"footnotes": {
"default": [],
"items": {
"$ref": "#/$defs/RefItem"
},
"title": "Footnotes",
"type": "array"
},
"image": {
"anyOf": [
{
"$ref": "#/$defs/ImageRef"
},
{
"type": "null"
}
],
"default": null
},
"orig": {
"title": "Orig",
"type": "string"
Expand Down
4 changes: 4 additions & 0 deletions test/data/docling_document/unit/CodeItem.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
children: []
captions: []
footnotes: []
references: []
image: null
code_language: Python
content_layer: body
label: code
Expand Down