Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 3 additions & 9 deletions llm-service/app/ai/indexing/readers/docling_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,8 @@

from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
from docling_core.transforms.chunker.base import BaseChunk
from docling_core.transforms.serializer.base import SerializationResult
from docling_core.transforms.serializer.markdown import MarkdownDocSerializer
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from llama_index.core.schema import Document, TextNode, NodeRelationship

from .base_reader import BaseReader
Expand All @@ -67,20 +65,16 @@ def load_chunks(self, file_path: Path) -> ChunksResult:
converted_chunks: List[TextNode] = []
logger.debug(f"{file_path=}")
docling_doc: ConversionResult = DocumentConverter().convert(file_path)
chunky_chunks = HierarchicalChunker(serializer_provider=MarkdownSerializerProvider()).chunk(docling_doc.document)
chunky_chunks = HybridChunker(serializer_provider=MarkdownSerializerProvider()).chunk(docling_doc.document)
chunky_chunk: BaseChunk
serializer = MarkdownDocSerializer(doc=docling_doc.document)
for i, chunky_chunk in enumerate(chunky_chunks):
text = ""
page_number: int = 0
if not hasattr(chunky_chunk.meta, "doc_items"):
logger.warning(f"Chunk {i} is empty, skipping")
continue
for item in chunky_chunk.meta.doc_items:
page_number= item.prov[0].page_no if item.prov else None
item_ser: SerializationResult = serializer.serialize(item=item)
text += item_ser.text
node = TextNode(text=text)
node = TextNode(text=chunky_chunk.text)
if page_number:
node.metadata["page_number"] = page_number
node.metadata["file_name"] = document.metadata["file_name"]
Expand Down
3 changes: 2 additions & 1 deletion llm-service/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ dependencies = [
"torch>=2.5.1",
"pillow>=10.4.0",
"transformers>=4.46.3",
"docling>=2.15.0",
"docling>=2.40.0",
"llvmlite==0.43.0",
"llama-index-llms-bedrock-converse>=0.4.10",
"presidio-analyzer>=2.2.355",
Expand Down Expand Up @@ -57,6 +57,7 @@ license = {text = "APACHE"}
override-dependencies = [
"boto3-stubs==1.36.1",
"botocore-stubs==1.36.1",
"docling-ibm-models==3.7.0"
]

[dependency-groups]
Expand Down
35 changes: 18 additions & 17 deletions llm-service/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading