# Docling Hybrid Chunking 教程

参考文档：[Hybrid chunking（Docling 官方示例）](https://docling-project.github.io/docling/examples/hybrid_chunking/)

本笔记本演示：
- 使用 `DocumentConverter` 转换文档
- 使用 `HybridChunker` 进行混合分块
- 使用 `contextualize()` 生成上下文增强文本
- 可选：让分块器使用与嵌入模型相同的 HuggingFace 分词器，使分块时的 token 计数与嵌入模型保持一致


In [None]:
from pathlib import Path
from typing import Optional

# Location of the sample Markdown document used by this notebook.
# The relative path is resolved against the current working directory, so this
# assumes the kernel's working directory is the one that contains `tutorial/`.
DATA_PATH = Path("tutorial/hybrid_chunking/data/wiki.md").resolve()
print("DATA_PATH:", DATA_PATH)

# Fail fast with an explicit exception instead of `assert`: assertions are
# removed when Python runs with -O, which would let a missing file go unnoticed
# until a later step tries to read it.
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Sample data not found: {DATA_PATH}")


In [None]:
# If the dependencies are not installed yet, uncomment the command below.
# %pip install -qU docling transformers

from docling.document_converter import DocumentConverter

# Convert the sample file in explicit steps: create the converter, run the
# conversion on the path (passed as a string), then keep the `document`
# attribute of the result.
converter = DocumentConverter()
conversion = converter.convert(source=str(DATA_PATH))
doc = conversion.document
print(type(doc))


In [None]:
from docling.chunking import HybridChunker

# Chunk the converted document with a default-configured HybridChunker and
# materialize the chunk iterator so the chunks can be counted and sliced.
chunker = HybridChunker()
chunks = list(chunker.chunk(dl_doc=doc))
print(f"num_chunks: {len(chunks)}")


def _clip(text, limit=500):
    """Return the first *limit* characters of *text*, plus "…" if anything was cut off."""
    if len(text) > limit:
        return text[:limit] + "…"
    return text[:limit]


# Preview the first five chunks: the raw chunk text next to the text produced
# by `contextualize()` for the same chunk.
for i, chunk in enumerate(chunks[:5]):
    original = chunk.text or ""  # use "" when the chunk text is empty or None
    enriched = chunker.contextualize(chunk=chunk)
    print(
        f"\n=== {i} ===",
        "chunk.text:\n" + _clip(original),
        "\nchunker.contextualize(chunk):\n" + _clip(enriched),
        sep="\n",
    )


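> 提示：使用 `HybridChunker` 时，transformers 可能会打印 “Token indices sequence length is longer than the specified maximum sequence length for this model” 这样的“序列长度超限”警告。在本场景中这通常是“误报”：分块器只是用分词器统计 token 数来决定如何切分，并不会把超长文本真正送入模型。详情可见上文的官方示例链接。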


In [None]:
# Optional: align chunking with a Hugging Face tokenizer. For RAG it is
# recommended to use the same tokenizer as the embedding model.
from typing import Optional

# transformers is treated as an optional dependency. The catch is kept broad on
# purpose: presumably a broken install can fail with errors other than
# ImportError (TODO confirm), and in every such case this cell falls back to
# the default chunker further down.
try:
    from transformers import AutoTokenizer
except Exception:
    AutoTokenizer = None

hf_model_name: Optional[str] = "sentence-transformers/all-MiniLM-L6-v2"  # replace as needed

# Load a tokenizer only when transformers imported and a model name is set.
tokenizer = None
if AutoTokenizer is not None and hf_model_name:
    tokenizer = AutoTokenizer.from_pretrained(hf_model_name, use_fast=True)

# Check presence with `is not None` rather than truthiness: Hugging Face
# tokenizers implement `__len__`, so `if tokenizer` would evaluate the
# tokenizer's size instead of answering "was a tokenizer loaded?".
has_tokenizer = tokenizer is not None
chunker_with_tok = HybridChunker(tokenizer=tokenizer) if has_tokenizer else HybridChunker()
chunks_tok = list(chunker_with_tok.chunk(dl_doc=doc))
print(f"num_chunks (tokenizer={has_tokenizer}): {len(chunks_tok)}")

# Preview the first three chunks: raw chunk text next to the contextualized
# text, each truncated to 300 characters with "…" marking a cut.
for i, chunk in enumerate(chunks_tok[:3]):
    orig = chunk.text or ""  # use "" when the chunk text is empty or None
    enriched = chunker_with_tok.contextualize(chunk=chunk)
    print(f"\n=== {i} (with tokenizer) ===")
    print("chunk.text:\n" + (orig[:300] + ("…" if len(orig) > 300 else "")))
    print("\ncontextualized:\n" + (enriched[:300] + ("…" if len(enriched) > 300 else "")))
