# Knowledge extraction


Install packages


In [3]:
!uv pip install -q \
    pydantic==2.12.3 \
    python-dotenv==1.1.1 \
    docling==2.61.2 \
    tiktoken==0.12.0 \
    transformers==4.57.1

Import packages


In [None]:
import logging

from docling.chunking import HybridChunker
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
from docling_core.transforms.chunker.tokenizer.huggingface import (
    HuggingFaceTokenizer,
)
from dotenv import load_dotenv
from transformers import AutoTokenizer

# Pull in environment variables via python-dotenv before anything else runs.
load_dotenv()

# Only surface ERROR-level (and above) records from these two loggers.
for quiet_logger_name in ("docling", "RapidOCR"):
    logging.getLogger(quiet_logger_name).setLevel(logging.ERROR)

Define tokenizer and PDF pipeline options


In [None]:
# Hugging Face model ID whose tokenizer is loaded below.
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
MAX_TOKENS = 64  # set to a small number for illustrative purposes

# Load the raw Hugging Face tokenizer first, then wrap it together with the token limit.
hf_tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)
tokenizer = HuggingFaceTokenizer(
    tokenizer=hf_tokenizer,
    max_tokens=MAX_TOKENS,  # optional, by default derived from `tokenizer` for HF case
)

# PDF pipeline settings, supplied up front as constructor keywords.
pipeline_options = PdfPipelineOptions(
    do_ocr=False,  # Disable OCR if not needed (much faster)
    do_table_structure=True,  # Keep table extraction
    images_scale=1.0,  # Reduce if you don't need high-res images
    generate_page_images=False,
)

Extract the data


In [None]:
# Route PDF inputs through the `pipeline_options` configured in the previous cell.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

# Convert the PDF referenced by this arXiv URL; `result.document` feeds the chunking step.
source_url = "https://arxiv.org/pdf/2408.09869"
result = converter.convert(source_url)

Apply hybrid chunking


In [None]:
# The token limit is already carried by `tokenizer` (built with max_tokens=MAX_TOKENS),
# so it is not repeated here. NOTE(review): a separate `max_tokens` argument appears to
# belong only to the legacy HybridChunker signature — confirm against the docling docs.
chunker = HybridChunker(
    tokenizer=tokenizer,
    merge_peers=True,
)

# Materialize the result of `chunk()` into a list so it can be sliced and counted.
chunk_iter = chunker.chunk(dl_doc=result.document)
chunks = list(chunk_iter)

print(f"First 5 chunks: {chunks[:5]}\nTotal chunks: {len(chunks)}")

First 5 chunks: [DocChunk(text='Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[TextItem(self_ref='#/texts/3', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, meta=None, label=<DocItemLabel.TEXT: 'text'>, prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=113.643, t=481.532, r=498.359, b=439.849, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 295))], orig='Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar', text='Christoph Auer Maksym Lys