In [3]:
from docling.document_converter import DocumentConverter

In [None]:
converter = DocumentConverter()

# --------------------------------------------------------------
# Basic PDF extraction
# --------------------------------------------------------------

result = converter.convert("https://services.google.com/fh/files/misc/startup_technical_guide_ai_agents_final.pdf")

document = result.document
markdown_output = document.export_to_markdown()
json_output = document.export_to_dict()

print(markdown_output)

In [10]:
from docling.chunking import HybridChunker

MAX_TOKENS = 384  # all-MiniLM-L6-v2 embedding dimension; we approximate token length
#
chunker = HybridChunker(
    tokenizer="sentence-transformers/all-MiniLM-L6-v2",
    max_tokens=MAX_TOKENS,
    merge_peers=True,
)

chunk_iter = chunker.chunk(dl_doc=document)
chunks = list(chunk_iter)

len(chunks)

Token indices sequence length is longer than the specified maximum sequence length for this model (607 > 512). Running this sequence through the model will result in indexing errors


157

In [35]:
import pandas as pd

metadatas = []
processed_chunks = []

for chunk in chunks:
    processed_chunks.append(chunk.text)
    metadatas.append({
        "book_name": chunk.meta.origin.filename,
        "chapters": ", ".join([str(x) for x in[
                page_no
                for page_no in sorted(
                    set(
                        prov.page_no
                        for item in chunk.meta.doc_items
                        for prov in item.prov
                    )
                )
            ]]),
        "type": "research",
        "title": chunk.meta.headings[0] if chunk.meta.headings else None,
        "date_created": "2025-10-27T10:30:00Z"
    })


df_results = pd.DataFrame(metadatas)
df_results

Unnamed: 0,book_name,chapters,type,title,date_created
0,startup_technical_guide_ai_agents_final.pdf,2,research,Table of contents,2025-10-27T10:30:00Z
1,startup_technical_guide_ai_agents_final.pdf,3,research,Introduction,2025-10-27T10:30:00Z
2,startup_technical_guide_ai_agents_final.pdf,3,research,The focus of this guide,2025-10-27T10:30:00Z
3,startup_technical_guide_ai_agents_final.pdf,3,research,Want extra support?,2025-10-27T10:30:00Z
4,startup_technical_guide_ai_agents_final.pdf,"5, 6",research,Core concepts of AI agents g,2025-10-27T10:30:00Z
...,...,...,...,...,...
152,startup_technical_guide_ai_agents_final.pdf,62,research,Resources,2025-10-27T10:30:00Z
153,startup_technical_guide_ai_agents_final.pdf,62,research,Resources,2025-10-27T10:30:00Z
154,startup_technical_guide_ai_agents_final.pdf,62,research,Resources,2025-10-27T10:30:00Z
155,startup_technical_guide_ai_agents_final.pdf,63,research,Resources,2025-10-27T10:30:00Z


In [36]:
# Step 4 — Save chunks to Chroma (`file-explorer`) with default embeddings
from chromadb import HttpClient

chroma_client = HttpClient(host="localhost", port=5002)
collection = chroma_client.get_or_create_collection(name="file-explorer")

# Schema mirrors example: documents, metadatas, ids
documents = processed_chunks

ids = [f"dl_chunk_{i+1}" for i in range(len(documents))]

# Use upsert to avoid duplicates on re-run
collection.upsert(documents=documents, metadatas=metadatas, ids=ids)

print(f"Saved {len(documents)} chunks to Chroma collection 'file-explorer'.")

2025-10-31 12:51:14,053 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-10-31 12:51:14,123 - INFO - HTTP Request: GET http://localhost:5002/api/v2/auth/identity "HTTP/1.1 200 OK"
2025-10-31 12:51:14,125 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-10-31 12:51:14,161 - INFO - HTTP Request: GET http://localhost:5002/api/v2/tenants/default_tenant "HTTP/1.1 200 OK"
2025-10-31 12:51:14,165 - INFO - HTTP Request: GET http://localhost:5002/api/v2/tenants/default_tenant/databases/default_database "HTTP/1.1 200 OK"
2025-10-31 12:51:14,180 - INFO - HTTP Request: POST http://localhost:5002/api/v2/tenants/default_tenant/databases/default_database/collections "HTTP/1.1 200 OK"
2025-10-31 12:51:14,523 - INFO - Backing off send_request(...) for 0.2s (requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connec

Saved 157 chunks to Chroma collection 'file-explorer'.
