# ColPali + Qdrant Retrieval Pipeline for a Single PDF

This notebook converts every page of a single PDF into images, embeds them with ColPali, and stores the embeddings in a local Qdrant instance for retrieval.


## 1. Setup & Imports
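Install required dependencies, import modules, and configure the compute device. Note that `pdf2image` relies on the Poppler command-line utilities, which must be installed separately on the system (they are not installed by `pip`).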


In [None]:
%pip install --upgrade --quiet torch pdf2image Pillow matplotlib tqdm qdrant-client colpali-engine

import os
from pathlib import Path

import torch
from pdf2image import convert_from_path
from PIL import Image
import matplotlib.pyplot as plt
from tqdm import tqdm

from colpali_engine.models import ColPali, ColPaliProcessor
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct

# Choose the compute device in priority order: Apple MPS, then CUDA, then CPU.
device = torch.device(
    "mps"
    if torch.backends.mps.is_available()
    else "cuda"
    if torch.cuda.is_available()
    else "cpu"
)

print(f"Using device: {device}")


## 2. Configuration
Set paths and runtime parameters for the pipeline.


In [None]:
# Qdrant connection settings and the embedding batch size.
QDRANT_URL = "http://localhost:6333"
QDRANT_COLLECTION = "pdf_pages"
BATCH_SIZE = 2

# Input PDF and the directory that receives the rendered page images.
PDF_PATH = "./data/sample.pdf"  # Update with the actual PDF path
OUTPUT_IMAGE_DIR = "./data/pdf_pages"

# Create the image directory (and any missing parents) up front.
output_dir_path = Path(OUTPUT_IMAGE_DIR)
output_dir_path.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong working directory is easy to spot.
print(
    f"PDF path: {os.path.abspath(PDF_PATH)}",
    f"Image output directory: {output_dir_path.resolve()}",
    sep="\n",
)


## 3. Convert PDF Pages to Images
Render each page of the PDF to a PNG image and capture metadata for later steps.


In [None]:
def convert_pdf_to_images(pdf_path: str, output_dir: str, *, dpi: int = 200) -> list[dict]:
    """Render every page of a PDF to a PNG file and describe each page.

    Args:
        pdf_path: Path to the PDF file to render.
        output_dir: Directory that receives the PNG files; it is created
            (including parents) when missing.
        dpi: Rendering resolution forwarded to ``pdf2image.convert_from_path``.
            The default of 200 is the library's own default, so existing
            callers get the same images as before.

    Returns:
        One record per page, in page order. Each record has the keys
        ``page_index`` (0-based), ``image_path`` (absolute path of the PNG,
        named ``page_NNNN.png`` with a 1-based number) and ``pdf_file_name``
        (the PDF file name without its extension).

    Raises:
        FileNotFoundError: If ``pdf_path`` is not an existing file.
    """
    source_pdf = Path(pdf_path)
    # Fail early with a clear message instead of an opaque rendering error.
    if not source_pdf.is_file():
        raise FileNotFoundError(f"PDF not found: {source_pdf}")

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    base_name = source_pdf.stem
    page_records: list[dict] = []

    for idx, image in enumerate(convert_from_path(str(source_pdf), dpi=dpi)):
        image_path = output_path / f"page_{idx + 1:04d}.png"
        try:
            image.save(image_path, format="PNG")
        finally:
            # The page is on disk now; release the in-memory bitmap.
            image.close()

        page_records.append(
            {
                "page_index": idx,
                "image_path": str(image_path.resolve()),
                "pdf_file_name": base_name,
            }
        )

    return page_records


In [None]:
pages = convert_pdf_to_images(PDF_PATH, OUTPUT_IMAGE_DIR)
print(f"Converted {len(pages)} pages to images.")

# Show the first rendered page as a quick visual sanity check.
if not pages:
    print("No pages were generated. Check the PDF path.")
else:
    first_page = pages[0]
    with Image.open(first_page["image_path"]) as sample_image:
        plt.figure(figsize=(6, 8))
        plt.imshow(sample_image)
        plt.title(f"Sample page 0 from {first_page['pdf_file_name']}")
        plt.axis("off")
        plt.show()


## 4. Load ColPali Model
Load the ColPali processor and model for generating image and text embeddings.


In [None]:
model_name = "vidore/colpali-v1.2"
processor = ColPaliProcessor.from_pretrained(model_name)

# Use bfloat16 on accelerators (CUDA / MPS); stay in float32 on CPU.
if device.type in {"cuda", "mps"}:
    preferred_dtype = torch.bfloat16
else:
    preferred_dtype = torch.float32

try:
    model = ColPali.from_pretrained(model_name, torch_dtype=preferred_dtype)
except Exception as error:
    # Any loading failure triggers one retry in full precision.
    print(f"Falling back to float32 due to: {error}")
    preferred_dtype = torch.float32
    model = ColPali.from_pretrained(model_name, torch_dtype=preferred_dtype)

# Move the weights to the selected device and switch to inference mode.
model = model.to(device).eval()

print(f"Model loaded with dtype {preferred_dtype} on {device}.")


## 5. Embed Page Images
Batch process the saved page images through ColPali to create embeddings and metadata.


In [None]:
def embed_pages(
    pages: list[dict],
    model: "ColPali",
    processor: "ColPaliProcessor",
    batch_size: int,
) -> list[dict]:
    """Embed page images with ColPali and return records suitable for Qdrant.

    Args:
        pages: Page records; each must provide ``image_path``,
            ``page_index`` and ``pdf_file_name``.
        model: Loaded ColPali model. Inputs are moved to ``model.device``.
        processor: Processor whose ``process_images`` prepares the inputs.
        batch_size: Number of pages per forward pass; must be at least 1.

    Returns:
        One record per page, in input order, with the keys ``id`` (1-based
        running number), ``embedding`` (the model output for that page as
        nested Python lists of float32 values), ``pdf_file_name``,
        ``page_index`` and ``image_path``. An empty ``pages`` list yields
        an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, if the model output
            has an unexpected type, or if the number of embeddings returned
            for a batch differs from the number of pages in it.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if not pages:
        return []

    # Follow the model's own placement instead of a notebook-level global,
    # so the function also works for a model living on another device.
    target_device = model.device
    embeddings: list[dict] = []

    for start in tqdm(range(0, len(pages), batch_size), desc="Embedding pages"):
        batch = pages[start : start + batch_size]

        images = []
        for item in batch:
            # convert() returns an independent copy, so the file handle can
            # be closed as soon as the with-block ends.
            with Image.open(item["image_path"]) as img:
                images.append(img.convert("RGB"))

        batch_tensors = processor.process_images(images)
        batch_tensors = {k: v.to(target_device) for k, v in batch_tensors.items()}

        with torch.no_grad():
            outputs = model(**batch_tensors)

        # Accept the different output containers a model may hand back.
        if isinstance(outputs, torch.Tensor):
            batch_embeddings = outputs
        elif isinstance(outputs, (list, tuple)):
            batch_embeddings = outputs[0]
        elif hasattr(outputs, "embeddings"):
            batch_embeddings = outputs.embeddings
        elif hasattr(outputs, "last_hidden_state"):
            batch_embeddings = outputs.last_hidden_state
        else:
            raise ValueError("Unexpected model output type")

        if len(batch_embeddings) != len(batch):
            raise ValueError(
                f"Model returned {len(batch_embeddings)} embeddings for {len(batch)} pages"
            )

        for page_info, embedding_tensor in zip(batch, batch_embeddings):
            # Cast to float32 first: bfloat16 tensors are widened before
            # being turned into plain Python floats.
            embedding = embedding_tensor.detach().to(torch.float32).cpu().tolist()
            embeddings.append(
                {
                    "id": len(embeddings) + 1,
                    "embedding": embedding,
                    "pdf_file_name": page_info["pdf_file_name"],
                    "page_index": page_info["page_index"],
                    "image_path": page_info["image_path"],
                }
            )

    return embeddings


In [None]:
page_embeddings = embed_pages(pages, model, processor, BATCH_SIZE)
print(f"Embedded {len(page_embeddings)} pages.")

if not page_embeddings:
    raise RuntimeError("No embeddings generated.")

first_embedding = page_embeddings[0]["embedding"]
if first_embedding and isinstance(first_embedding[0], list):
    # Multi-vector output: the outer length counts the vectors of a page,
    # the inner length is the actual embedding dimension.
    vectors_per_page = len(first_embedding)
    embedding_dim = len(first_embedding[0])
    print(f"Vectors per page: {vectors_per_page}")
else:
    # Flat output: a single vector per page.
    embedding_dim = len(first_embedding)

print(f"Embedding dimension: {embedding_dim}")


## 6. Initialize Qdrant Collection
Create or recreate the Qdrant collection to store page embeddings.


In [None]:
from qdrant_client.models import MultiVectorComparator, MultiVectorConfig

client = QdrantClient(url=QDRANT_URL)

# A page embedding is either a flat vector or a list of vectors
# (multi-vector). The vector size is the length of one individual vector.
first_embedding = page_embeddings[0]["embedding"]
is_multivector = bool(first_embedding) and isinstance(first_embedding[0], list)
vector_size = len(first_embedding[0]) if is_multivector else len(first_embedding)

vectors_config = VectorParams(
    size=vector_size,
    distance=Distance.COSINE,
    # Multi-vectors are compared with MaxSim (late interaction): each query
    # vector is matched with its best page vector and the scores are summed.
    multivector_config=(
        MultiVectorConfig(comparator=MultiVectorComparator.MAX_SIM)
        if is_multivector
        else None
    ),
)

# Start from a clean collection on every run (replaces the deprecated
# recreate_collection helper).
if client.collection_exists(QDRANT_COLLECTION):
    client.delete_collection(QDRANT_COLLECTION)

client.create_collection(
    collection_name=QDRANT_COLLECTION,
    vectors_config=vectors_config,
)

print(f"Collection '{QDRANT_COLLECTION}' ready with vector size {vector_size}.")


## 7. Upsert Embeddings into Qdrant
Persist the vectors and metadata for each PDF page.


In [None]:
points = [
    PointStruct(
        id=item["id"],
        vector=item["embedding"],
        payload={
            "pdf_file_name": item["pdf_file_name"],
            "page_index": item["page_index"],
            "image_path": item["image_path"],
        },
    )
    for item in page_embeddings
]

# Send the points in small chunks. Each page can carry a large embedding, so
# one request holding every page grows with the document and may be rejected
# by the server for being too large.
UPSERT_CHUNK_SIZE = 8
operation_info = None  # Result of the last upsert request, if any was sent.
for chunk_start in range(0, len(points), UPSERT_CHUNK_SIZE):
    operation_info = client.upsert(
        collection_name=QDRANT_COLLECTION,
        points=points[chunk_start : chunk_start + UPSERT_CHUNK_SIZE],
        wait=True,
    )

print(f"Upserted {len(points)} points into collection '{QDRANT_COLLECTION}'.")
if points:
    print("Example payload:", points[0].payload)


## 8. Retrieval Demo
Search the Qdrant collection with a natural-language query and visualize matching pages.


In [None]:
def search_similar(query_text: str, top_k: int = 5) -> list[dict]:
    """Embed a text query, search Qdrant and display the matching pages.

    Relies on the notebook-level ``processor``, ``model``, ``client`` and
    ``QDRANT_COLLECTION``.

    Args:
        query_text: Natural-language query to embed.
        top_k: Maximum number of pages to retrieve; must be at least 1.

    Returns:
        One dict per hit, best first, with the keys ``rank`` (1-based),
        ``score`` and ``payload`` (the stored page metadata). Each hit is
        also plotted with matplotlib as a side effect.

    Raises:
        ValueError: If ``top_k`` is smaller than 1 or the model output has
            an unexpected type.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k}")

    processed_query = processor.process_queries([query_text])
    processed_query = {k: v.to(model.device) for k, v in processed_query.items()}

    with torch.no_grad():
        outputs = model(**processed_query)

    # Only one query was submitted, so take the first item of the batch.
    if isinstance(outputs, torch.Tensor):
        query_embedding_tensor = outputs[0]
    elif isinstance(outputs, (list, tuple)):
        query_embedding_tensor = outputs[0][0]
    elif hasattr(outputs, "embeddings"):
        query_embedding_tensor = outputs.embeddings[0]
    elif hasattr(outputs, "last_hidden_state"):
        query_embedding_tensor = outputs.last_hidden_state[0]
    else:
        raise ValueError("Unexpected model output type for query")

    query_vector = query_embedding_tensor.detach().to(torch.float32).cpu().tolist()

    # query_points accepts both a flat vector and a list of vectors
    # (multi-vector query), unlike the older search() call.
    response = client.query_points(
        collection_name=QDRANT_COLLECTION,
        query=query_vector,
        limit=top_k,
        with_payload=True,
    )

    matches = []
    for rank, result in enumerate(response.points, start=1):
        payload = result.payload
        with Image.open(payload["image_path"]) as image:
            plt.figure(figsize=(6, 8))
            plt.imshow(image)
            plt.axis("off")
            plt.title(f"score={result.score:.4f} | page={payload['page_index']} | file={payload['pdf_file_name']}")
            plt.show()

        matches.append(
            {
                "rank": rank,
                "score": result.score,
                "payload": payload,
            }
        )

    return matches


In [None]:
# Example query; update with a relevant question for your PDF
# matches = search_similar("What is the main topic of this document?", top_k=3)
# matches


## 9. Answer Generation Stub
Outline how to hand retrieved images to a vision-language model for question answering.


In [None]:
def answer_question_with_vlm(query_text: str, retrieved_results):
    """Placeholder for answering a query with a vision-language model.

    Not implemented: calling this function always raises
    ``NotImplementedError``. The intended flow, e.g. with Qwen2-VL, is
    sketched in the comments below.
    """
    # Intended flow:
    #   a. Take the top hit, for example retrieved_results[0].
    #   b. Open its page image via Image.open(result['payload']['image_path']).
    #   c. Set up the VLM (e.g., Qwen2-VL) and build a prompt from the image
    #      together with query_text.
    #   d. Let the VLM generate the answer text.
    #   e. Hand the generated answer string back to the caller.
    raise NotImplementedError("Integrate with your preferred VLM here.")


## 10. Cleanup
Release model resources so the notebook can be rerun without a kernel restart.


In [None]:
import gc

# Drop the notebook-level references to the heavy objects. Names that were
# never defined (or were already removed) are skipped silently.
for _name in ("model", "processor", "page_embeddings"):
    globals().pop(_name, None)

# Collect garbage before emptying the allocator caches, so memory that was
# only kept alive by reference cycles is freed and can be released as well.
_ = gc.collect()

if torch.cuda.is_available():
    torch.cuda.empty_cache()

if torch.backends.mps.is_available():
    try:
        torch.mps.empty_cache()  # type: ignore[attr-defined]
    except (AttributeError, RuntimeError) as error:
        # Best effort only: older torch builds may lack this call.
        print(f"Skipping MPS cache release: {error}")

print("Cleanup complete.")
