# Document Ingestion and Storage Process

- Use either a local file path or a URL (not both).
- Common formats supported by Docling: PDF, DOCX, PPTX, XLSX, HTML, TXT.


In [None]:
# Step 1 — Pick the input document: a local file path OR a URL.
# Formats listed for Docling: PDF, DOCX, PPTX, XLSX, HTML, TXT

# Set one of the two names below to a string and keep the other as None.
# Both may stay None here when the optional UI cell below supplies the input.
input_url = None  # e.g., "https://example.com/document.pdf"
input_file_path = (
    "/Users/dimopc/Downloads/startup_technical_guide_ai_agents_final.pdf"
)  # e.g., "/path/to/document.pdf"

# Echo the current selection in the cell output.
print(dict(file=input_file_path, url=input_url))


### Optional UI — Upload or URL
Use this to set input without editing variables in Step 1.


In [None]:
# UI widgets: choose File or URL, then click "Use Selection"
from ipywidgets import FileUpload, Text, ToggleButtons, Button, HBox, VBox, Output
from IPython.display import display

# Input-mode selector; "File" is preselected.
mode = ToggleButtons(
    options=["File", "URL"],
    value="File",
)
# Upload control limited to a single file, with an empty `accept` filter.
uploader = FileUpload(
    accept="",
    multiple=False,
)
# Text box for the document URL; starts empty and shows an example placeholder.
url_text = Text(
    placeholder="https://example.com/document.pdf",
    value="",
)
# Button whose click handler stores the current selection.
apply_btn = Button(
    description="Use Selection",
    button_style="primary",
)
# Output area that receives the handler's status messages.
out = Output()

# Global consumed by Step 2: raw content of the uploaded file, None until set.
uploaded_bytes = None

@apply_btn.on_click
def _apply(_):
    """Store the UI selection in the globals read by the processing cells.

    Registered as the click handler of ``apply_btn``. Every click first resets
    ``input_file_path``, ``input_url`` and ``uploaded_bytes`` to ``None`` and
    then fills in the one that matches the selected ``mode``. A one-line
    status message is written to the ``out`` widget.

    Args:
        _: Argument passed by the button on click (unused).
    """
    global input_file_path, input_url, uploaded_bytes
    input_file_path = None
    input_url = None
    uploaded_bytes = None

    if mode.value == "File":
        selection = uploader.value
        file_info = None
        if selection:
            # ipywidgets 7 exposes a dict keyed by filename, ipywidgets 8 a
            # tuple of per-file dicts; take the first file in either layout.
            entries = selection.values() if isinstance(selection, dict) else selection
            file_info = next(iter(entries))
        content = file_info.get("content") if file_info else None
        if content is not None:
            # The content may be a memoryview rather than bytes; normalise it
            # so later cells always receive a bytes object.
            uploaded_bytes = bytes(content)
            # The file name sits at the top level (v8) or under "metadata" (v7).
            name = file_info.get("name") or file_info.get("metadata", {}).get("name", "uploaded")
            message = f"Selected file: {name} ({len(uploaded_bytes)} bytes)"
        else:
            message = "No file selected."
    else:
        url = url_text.value.strip()
        if url:
            input_url = url
            message = f"Selected URL: {input_url}"
        else:
            message = "No URL provided."

    # Replace any previous status message with the new one.
    with out:
        out.clear_output()
        print(message)

# Stack the controls top to bottom: mode selector, upload row, URL row,
# the apply button and finally the status output.
ui = VBox(
    children=[
        mode,
        HBox(children=[uploader]),
        HBox(children=[url_text]),
        apply_btn,
        out,
    ],
)

# Render the assembled form in the cell output.
display(ui)


In [None]:
# Step 2 — Process with Docling (HTTP v1 async) and extract Markdown
import os
import io
import time
import base64
import requests

# Base URL of the Docling Serve instance; override via the environment.
DOCLING_BASE_URL = os.getenv("DOCLING_BASE_URL", "http://localhost:5001")
markdown_content = None

# Determine input source: exactly one among uploaded_bytes, input_file_path, input_url.
# `uploaded_bytes` only exists once the optional UI cell has run, hence the lookup.
has_upload = globals().get("uploaded_bytes") is not None
has_path = bool(input_file_path)
has_url = bool(input_url)
if (has_upload + has_path + has_url) != 1:
    # Raise explicitly: an `assert` would be stripped when Python runs with -O.
    raise ValueError("Provide exactly one source (upload OR file path OR URL).")

# Build sources for Docling Serve v1: URLs are passed by reference, while
# local files and uploads are sent inline as base64.
sources = []
if has_url:
    sources.append({"kind": "http", "url": input_url})
else:
    if has_path:
        with open(input_file_path, "rb") as f:
            b = f.read()
        filename = os.path.basename(input_file_path)
    else:  # has_upload
        b = uploaded_bytes
        # The upload's original name is not kept, so a placeholder is sent.
        filename = "upload.bin"
    sources.append({
        "kind": "file",
        "base64_string": base64.b64encode(b).decode("ascii"),
        "filename": filename,
    })

# Submit async job
submit_payload = {
    "sources": sources,
    # Kept from the original request.
    # NOTE(review): confirm the async endpoint reads this field.
    "output_formats": ["md"],
    "options": {
        "to_formats": ["md"],
        "include_images": False,
        "do_picture_description": False,
        "do_picture_classification": False,
        "do_formula_enrichment": False,
        "do_code_enrichment": False,
        "image_export_mode": "placeholder",
    },
}
resp = requests.post(
    f"{DOCLING_BASE_URL}/v1/convert/source/async",
    json=submit_payload,
    timeout=30,
)
resp.raise_for_status()
job = resp.json()

# Poll status until success/failure (max ~5 min)
max_wait_s = 300
poll_interval_s = 2
# Monotonic clock: the elapsed time is unaffected by system clock adjustments.
start = time.monotonic()
task_id = job["task_id"]
status = job.get("task_status", "pending")
while status not in ("success", "failure"):
    if time.monotonic() - start > max_wait_s:
        raise TimeoutError(f"Docling async job timed out after {max_wait_s}s (task_id={task_id}).")
    # Sleep before polling so the loop exits as soon as a terminal status is
    # seen, instead of waiting one more interval afterwards.
    time.sleep(poll_interval_s)
    s = requests.get(f"{DOCLING_BASE_URL}/v1/status/poll/{task_id}", timeout=15)
    s.raise_for_status()
    job = s.json()
    status = job.get("task_status", "pending")

if status != "success":
    raise RuntimeError(f"Docling job failed (task_id={task_id}). Details: {job}")

# Fetch result; a non-JSON response is treated as an empty result.
r = requests.get(f"{DOCLING_BASE_URL}/v1/result/{task_id}", timeout=120)
r.raise_for_status()
is_json = r.headers.get("content-type", "").startswith("application/json")
res = r.json() if is_json else {}

# Prefer 'document.md_content' if provided; fallback to previous map style
doc_obj = res.get("document") or {}
markdown_content = doc_obj.get("md_content")

if not markdown_content:
    # Fallback layout: an "output" mapping with either a "*.md" key or "markdown".
    output = res.get("output") or {}
    md_key = next((k for k in output if k.endswith(".md")), None)
    markdown_content = output.get(md_key, "") if md_key else output.get("markdown", "")

if not markdown_content:
    # Make the empty result visible instead of silently printing nothing.
    print(f"Warning: no Markdown found in the Docling result (task_id={task_id}).")

print("--- Extracted Markdown (preview) ---")
print(markdown_content)


### Step 3 — Hybrid chunking via Docling Serve (merge_peers)
Uses the HTTP hybrid chunker with ~384-token chunks.


In [None]:
# Step 3 — Chunk using Docling Serve hybrid chunker (≈384 tokens, merge_peers)
import base64
import os
import requests

# Maximum tokens per chunk requested from the hybrid chunker.
# NOTE(review): 384 is described as the all-MiniLM-L6-v2 embedding dimension,
# i.e. a vector size rather than a token limit — confirm the intended budget.
TARGET_TOKENS = 384

# Reuse the base URL set by Step 2 when that cell has run; otherwise resolve it
# the same way so this cell does not fail with a NameError.
DOCLING_BASE_URL = globals().get("DOCLING_BASE_URL") or os.getenv(
    "DOCLING_BASE_URL", "http://localhost:5001"
)

# Build sources again (same logic as Step 2).
# `uploaded_bytes` only exists once the optional UI cell has run, hence the lookup.
has_upload = globals().get("uploaded_bytes") is not None
has_path = bool(input_file_path)
has_url = bool(input_url)
if (has_upload + has_path + has_url) != 1:
    # Raise explicitly: an `assert` would be stripped when Python runs with -O.
    raise ValueError("Provide exactly one source (upload OR file path OR URL).")

sources = []
if has_url:
    sources.append({"kind": "http", "url": input_url})
else:
    if has_path:
        with open(input_file_path, "rb") as f:
            b = f.read()
        filename = os.path.basename(input_file_path)
    else:  # has_upload
        b = uploaded_bytes
        # The upload's original name is not kept, so a placeholder is sent.
        filename = "upload.bin"
    sources.append({
        "kind": "file",
        "base64_string": base64.b64encode(b).decode("ascii"),
        "filename": filename,
    })

# Request body for the hybrid chunking endpoint.
payload = {
    "sources": sources,
    "chunking_options": {
        "chunker": "hybrid",
        "tokenizer": "sentence-transformers/all-MiniLM-L6-v2",
        "max_tokens": TARGET_TOKENS,
        "merge_peers": True,
    },
}

# Synchronous call: conversion and chunking happen within this one request,
# hence the long timeout.
resp = requests.post(f"{DOCLING_BASE_URL}/v1/chunk/hybrid/source", json=payload, timeout=300)
resp.raise_for_status()
result = resp.json()

# Keep only the text of each returned chunk; a missing "text" becomes "".
chunks = result.get("chunks", [])
chunk_texts = [c.get("text", "") for c in chunks]

print(f"Total chunks: {len(chunk_texts)}")


In [None]:
# Step 3 (alternative) — Chunk Markdown locally (≈384 whitespace tokens per chunk); overwrites chunk_texts
from typing import List

# Probe for the optional Docling chunker. Any exception raised by either import
# (not only ImportError) clears the flag, in which case the cell chunks
# `markdown_content` instead of Docling chunker output.
# NOTE(review): `DoclingDocument` is not referenced in the visible part of this
# cell, yet a failure to import it also disables the chunker — confirm that
# `docling.core.types` is the right module path for the installed Docling.
try:
    from docling.chunking import HierarchicalChunker
    from docling.core.types import DoclingDocument
    have_docling_chunker = True
except Exception:
    have_docling_chunker = False

# Chunk size, counted in whitespace-separated tokens by this cell.
# NOTE(review): 384 is described as the all-MiniLM-L6-v2 embedding dimension,
# i.e. a vector size rather than a token limit — confirm the intended budget.
TARGET_TOKENS = 384

# Tokenizer: lightweight whitespace split for portability

def split_into_token_chunks(text: str, max_tokens: int) -> List[str]:
    """Split *text* into consecutive chunks of at most *max_tokens* words.

    Words are the whitespace-separated pieces of *text*; each chunk is those
    words re-joined with single spaces, so the original spacing and line
    breaks are not preserved.

    Args:
        text: The text to split.
        max_tokens: Number of words per chunk (the last chunk may be shorter).

    Returns:
        The chunks in order; an empty list when *text* contains no words.
    """
    words = text.split()
    return [
        " ".join(words[offset:offset + max_tokens])
        for offset in range(0, len(words), max_tokens)
    ]

# Pick the text to chunk. The Docling branch needs both the chunker import and
# a `doc` object defined by some earlier cell; otherwise the Markdown is used.
if have_docling_chunker and "doc" in globals():
    # Use Docling hierarchical chunker to get semantically coherent pieces first
    chunker = HierarchicalChunker()
    docling_chunks = list(chunker.chunk(doc))
    # Skip chunks without text, then rejoin so the size-based split below
    # applies to the whole document.
    base_segments = [c.text for c in docling_chunks if getattr(c, "text", "")]
    text_for_chunking = "\n\n".join(base_segments)
else:
    text_for_chunking = markdown_content or ""

# Final chunks: fixed-size groups of whitespace-separated tokens.
chunk_texts = split_into_token_chunks(text_for_chunking, TARGET_TOKENS)

print(f"Total chunks: {len(chunk_texts)}")
print("--- First 10 chunks ---")
# Print the chunks announced by the header above.
for position, chunk in enumerate(chunk_texts[:10], start=1):
    print(f"[{position}] {chunk}\n")


In [None]:
# Step 4 — Save chunks to Chroma (`file-explorer`) with default embeddings
from chromadb import HttpClient

# Connect to the Chroma server and open (or create) the target collection.
chroma_client = HttpClient(host="localhost", port=5002)
collection = chroma_client.get_or_create_collection(name="file-explorer")

# Schema mirrors example: documents, metadatas, ids
documents = chunk_texts

# With a UI upload both the path and the URL are None. Store a placeholder
# label instead: a None source is uninformative.
# NOTE(review): some Chroma releases may also reject None metadata values — confirm.
source_label = input_file_path or input_url or "uploaded_file"
metadatas = [
    {
        "source": source_label,
        "chunk_index": i,
        "type": "docling_markdown_chunk",
    }
    for i, _ in enumerate(documents)
]
# NOTE(review): ids depend only on the chunk position, so ingesting another
# document into this collection upserts over these entries — confirm that is
# intended before storing more than one document.
ids = [f"dl_chunk_{i}" for i, _ in enumerate(documents, start=1)]

if documents:
    # Use upsert to avoid duplicates on re-run
    collection.upsert(documents=documents, metadatas=metadatas, ids=ids)
    print(f"Saved {len(documents)} chunks to Chroma collection 'file-explorer'.")
else:
    # Nothing to store; skip the call rather than send empty lists to Chroma.
    print("No chunks to save; skipping Chroma upsert.")
