# ColPali + Qdrant Retrieval Pipeline for a Single PDF

This notebook converts every page of a single PDF into images, embeds them with ColPali, and stores the embeddings in a local Qdrant instance for retrieval.


## 1. Setup & Imports
Install required dependencies, import modules, and configure the compute device.


In [None]:
%pip install --upgrade --quiet torch pdf2image Pillow matplotlib tqdm qdrant-client colpali-engine transformers qwen-vl-utils accelerate

import os
from pathlib import Path

import torch
from pdf2image import convert_from_path
from PIL import Image
import matplotlib.pyplot as plt
from tqdm import tqdm

from colpali_engine.models import ColPali, ColPaliProcessor
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct

# Pick the compute device once for the whole notebook: Apple-silicon MPS is
# tried first, then CUDA, and the CPU is the fallback when neither is present.
device = torch.device(
    "mps"
    if torch.backends.mps.is_available()
    else "cuda"
    if torch.cuda.is_available()
    else "cpu"
)

print(f"Using device: {device}")


## 2. Configuration
Set paths and runtime parameters for the pipeline.


In [None]:
# Source document and the folder that receives one PNG per page.
PDF_PATH = "./data/presentation.pdf"  # Update with the actual PDF path
OUTPUT_IMAGE_DIR = "./data/pdf_pages"

# Qdrant endpoint and the collection that stores the page embeddings.
QDRANT_URL = "http://localhost:6333"
QDRANT_COLLECTION = "pdf_pages"

# Number of page images embedded per forward pass.
BATCH_SIZE = 2

# Make sure the image folder exists before anything tries to write into it.
output_dir_path = Path(OUTPUT_IMAGE_DIR)
os.makedirs(output_dir_path, exist_ok=True)

# Echo the resolved locations so a wrong working directory is easy to spot.
print(f"PDF path: {os.path.abspath(PDF_PATH)}")
print(f"Image output directory: {output_dir_path.resolve()}")


## 3. Convert PDF Pages to Images
Render each page of the PDF to a PNG image and capture metadata for later steps.


In [None]:
def convert_pdf_to_images(pdf_path: str, output_dir: str) -> list[dict]:
    """Render every page of a PDF to a PNG file and describe each file.

    Args:
        pdf_path: Location of the PDF to render.
        output_dir: Folder that receives the PNG files; it is created
            (including parents) when it does not exist yet.

    Returns:
        One dict per page, in page order, with the keys ``page_index``
        (0-based), ``image_path`` (absolute path of the saved PNG) and
        ``pdf_file_name`` (the PDF's file name without its extension).
    """
    source_pdf = Path(pdf_path)
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    document_name = source_pdf.stem
    rendered_pages = convert_from_path(str(source_pdf))

    records: list[dict] = []
    # File names are numbered from 1 while the recorded page_index stays 0-based.
    for page_number, rendered in enumerate(rendered_pages, start=1):
        destination = target_dir / f"page_{page_number:04d}.png"
        rendered.save(destination, format="PNG")

        record = {
            "page_index": page_number - 1,
            "image_path": str(destination.resolve()),
            "pdf_file_name": document_name,
        }
        records.append(record)

    return records


In [None]:
# Render the configured PDF and keep the per-page records for the later cells.
pages = convert_pdf_to_images(PDF_PATH, OUTPUT_IMAGE_DIR)
print(f"Converted {len(pages)} pages to images.")

if not pages:
    print("No pages were generated. Check the PDF path.")
else:
    # Show the first rendered page as a quick visual sanity check.
    with Image.open(pages[0]["image_path"]) as sample_image:
        plt.figure(figsize=(6, 8))
        plt.imshow(sample_image)
        plt.axis("off")
        plt.title(f"Sample page 0 from {pages[0]['pdf_file_name']}")
        plt.show()


## 4. Load ColPali Model
Load the ColPali processor and model for generating image and text embeddings.


In [None]:
model_name = "vidore/colpali-v1.2"
processor = ColPaliProcessor.from_pretrained(model_name)

# Ask for bfloat16 on the accelerators and full precision on the CPU.
if device.type in {"cuda", "mps"}:
    preferred_dtype = torch.bfloat16
else:
    preferred_dtype = torch.float32

try:
    model = ColPali.from_pretrained(model_name, torch_dtype=preferred_dtype)
except Exception as error:
    # Loading with the preferred dtype failed; retry once in float32.
    print(f"Falling back to float32 due to: {error}")
    preferred_dtype = torch.float32
    model = ColPali.from_pretrained(model_name, torch_dtype=preferred_dtype)

# Move the weights to the selected device and switch to inference mode.
model = model.to(device)
model.eval()

print(f"Model loaded with dtype {preferred_dtype} on {device}.")


## 5. Embed Page Images
Batch process the saved page images through ColPali to create embeddings and metadata.


In [None]:
from torch.utils.data import DataLoader


def _collate_page_images(image_paths):
    """Open one batch of page images, preprocess them, and close the files.

    Opening the images per batch (instead of all pages up front) keeps the
    number of open file handles and decoded pages bounded by the batch size.
    """
    opened_images = [Image.open(path) for path in image_paths]
    try:
        return processor.process_images(opened_images)
    finally:
        for opened in opened_images:
            opened.close()


all_embeddings_with_metadata = []

# The loader iterates over file paths; the pixels are only read in the collate step.
page_image_paths = [page_data["image_path"] for page_data in pages]
data_loader = DataLoader(
    dataset=page_image_paths,
    batch_size=BATCH_SIZE,
    shuffle=False,  # keep page order so page_counter matches the file numbering
    collate_fn=_collate_page_images,
)

# page_id is 1-based so it lines up with the page_0001.png style file names.
page_counter = 1
for batch in tqdm(data_loader, desc="Processing"):
    with torch.no_grad():
        batch = {k: v.to(model.device) for k, v in batch.items()}
        batch_embeddings = model(**batch)

    # Split the batch output into one CPU tensor per page.
    for embedding in torch.unbind(batch_embeddings.to("cpu")):
        all_embeddings_with_metadata.append(
            {
                "embedding": embedding,
                "page_id": page_counter,
            }
        )
        page_counter += 1

# Report a count rather than echoing every embedding tensor into the cell output.
print(f"Embedded {len(all_embeddings_with_metadata)} pages.")

## 6. Initialize Qdrant Collection
Create or recreate the Qdrant collection to store page embeddings.

In [None]:
from qdrant_client import QdrantClient, models
client = QdrantClient(url=QDRANT_URL)

# Reuse the configured name: the search cell reads QDRANT_COLLECTION, so a
# separately hard-coded name here could silently point ingestion elsewhere.
collection_name = QDRANT_COLLECTION

# Start from an empty collection on every run. This replaces
# recreate_collection, which recent qdrant-client releases mark as deprecated.
if client.collection_exists(collection_name):
    client.delete_collection(collection_name)

# Create collection configured for multivector
client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        # NOTE(review): must equal the per-token embedding width of the model
        # loaded above; 128 is assumed for vidore/colpali-v1.2 - confirm if the
        # model changes.
        size=128,
        distance=models.Distance.COSINE,
        multivector_config=models.MultiVectorConfig(
            comparator=models.MultiVectorComparator.MAX_SIM
        ),
    ),
)


## 7. Upsert Embeddings into Qdrant
Persist the vectors and metadata for each PDF page.

In [None]:
import torch

# Store one point per page. The point id and the payload's image_id both come
# from page_id, so a search hit can be mapped straight back to its PNG file.
for embedding_record in all_embeddings_with_metadata:
    page_id = int(embedding_record["page_id"])

    # Qdrant takes a multivector as a plain list of float lists, so convert the
    # tensor (cast to float32 first, since bfloat16 may have been used) to lists.
    multivector = embedding_record["embedding"].to(torch.float32).cpu().tolist()

    client.upsert(
        collection_name=collection_name,
        points=[
            models.PointStruct(
                id=page_id,
                vector=multivector,
                payload={"image_id": f"{page_id}"},
            )
        ],
        wait=True,
    )

## 8. Retrieval Demo
Search the Qdrant collection with a natural-language query and visualize matching pages.


In [None]:
import torch
from PIL import Image
from qdrant_client.models import QueryResponse
import matplotlib.pyplot as plt

def _get_query_embedding(query_text: str, model, processor):
    batch_queries = processor.process_queries([query_text]).to(model.device)
    
    with torch.no_grad():
        query_embeddings = model(**batch_queries)

    return query_embeddings

def search_similar(query_text: str, top_k: int = 5):
    """Find the pages most similar to a text query and display them.

    Relies on the notebook-level ``model``, ``processor``, ``client``,
    ``QDRANT_COLLECTION`` and ``OUTPUT_IMAGE_DIR`` defined in earlier cells.

    Args:
        query_text: Natural-language question to search with.
        top_k: Maximum number of pages to return.

    Returns:
        A list of dicts in ranking order, each with ``rank`` (starting at 1),
        ``score`` (as reported by Qdrant) and ``image_path`` (the page's PNG).
    """
    # 1. embed the user query into the SAME latent space as pages
    query_embeddings = _get_query_embedding(query_text, model, processor)
    query_matrix = query_embeddings[0].to(torch.float32).cpu().numpy()

    # 2. vector similarity search in Qdrant
    result: QueryResponse = client.query_points(
        collection_name=QDRANT_COLLECTION,
        query=query_matrix,
        limit=top_k,
        with_payload=True,
    )

    # Page images live in the configured output folder rather than a path
    # hard-coded here, so changing OUTPUT_IMAGE_DIR keeps retrieval working.
    image_dir = Path(OUTPUT_IMAGE_DIR)

    # 3. visualize + collect matches
    matches = []
    for rank, point in enumerate(result.points, start=1):
        # Pulled out of the f-strings below: reusing the same quote type inside
        # an f-string is a syntax error before Python 3.12.
        image_id = point.payload["image_id"]

        # Zero-pad the id to match the saved file names (page_0001.png, ...).
        image_path = str(image_dir / f"page_{int(image_id):04d}.png")

        with Image.open(image_path) as image:
            plt.figure(figsize=(6, 8))
            plt.imshow(image)
            plt.axis("off")
            plt.title(f"score={point.score:.4f} | page={image_id}")
            plt.show()

        matches.append(
            {
                "rank": rank,
                "score": point.score,
                "image_path": image_path,
            }
        )

    return matches

In [None]:
# Try the retriever on a sample question; replace it with one that suits your PDF.
matches = search_similar(query_text="Can I use Postgres for vector search", top_k=3)
matches

## 9. Answer Generation Stub
Hand the top retrieved page image, together with the user's question, to a vision-language model that generates the answer.


### Vision-Language Answering with Qwen2-VL

After we retrieve the most relevant PDF page(s) using ColPali + Qdrant, we can ask a Vision-Language Model (VLM) to read that page image and answer the user's question.

Flow:
1. User asks a question.
2. We embed the question, search Qdrant, and get back the top matching page image.
3. We send that page image + the user's question into a local Qwen2-VL model.
4. The model generates an answer using ONLY what it "sees" in that page.

This section shows how to load Qwen2-VL locally and run a question-answer step against the best-matching page.


In [None]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

# Choose the VLM device in the same order as the ColPali model: MPS, then
# CUDA, then CPU. Previously CUDA was skipped, which sent the VLM to the CPU
# on NVIDIA machines.
if torch.backends.mps.is_available():
    vlm_device = torch.device("mps")
elif torch.cuda.is_available():
    vlm_device = torch.device("cuda")
else:
    vlm_device = torch.device("cpu")

# Half precision on an accelerator, full precision on the CPU.
vlm_dtype = torch.float16 if vlm_device.type in {"mps", "cuda"} else torch.float32

VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct"

vlm_processor = AutoProcessor.from_pretrained(VLM_MODEL_NAME)
# device_map=None loads the weights without automatic placement; the explicit
# .to(vlm_device) below then moves the whole model to the chosen device.
vlm_model = Qwen2VLForConditionalGeneration.from_pretrained(
    VLM_MODEL_NAME,
    torch_dtype=vlm_dtype,
    device_map=None,
).to(vlm_device)

vlm_model.eval()

In [None]:
from PIL import Image
import torch
# process_vision_info comes from the qwen-vl-utils package installed in the setup cell
from qwen_vl_utils import process_vision_info  

def answer_question_with_vlm(query_text: str, retrieved_results, max_new_tokens: int = 500):
    """Answer a question from the best-matching retrieved page image.

    Only the first entry of ``retrieved_results`` is used; its ``image_path``
    is opened and sent, together with ``query_text``, to the Qwen2-VL model.

    Relies on these notebook-level objects:
    - vlm_model: the loaded Qwen2-VL model
    - vlm_processor: the processor that matches it
    - vlm_device: the torch.device the model lives on

    Args:
        query_text: The user's question.
        retrieved_results: Search results; each needs an ``image_path`` key.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer text, or a fixed message when there are no results.
    """
    # Nothing retrieved means there is no page to read.
    if not retrieved_results:
        return "No results found to answer from."

    # The first result is the top-ranked page; read it from disk as RGB.
    top_hit_path = retrieved_results[0]["image_path"]
    page_image = Image.open(top_hit_path).convert("RGB")

    # One user turn holding the page image followed by the question.
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": page_image},
            {"type": "text", "text": query_text},
        ],
    }
    conversation = [user_turn]

    # Render the conversation into the prompt string, ending with the
    # generation prompt so the model continues as the assistant.
    prompt = vlm_processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Pull the image inputs out of the conversation; the second value is unused.
    vision_inputs, _unused = process_vision_info(conversation)

    # Tokenize the prompt and preprocess the image into one batch.
    encoded = vlm_processor(
        text=[prompt],
        images=vision_inputs,
        padding=True,
        return_tensors="pt",
    )
    model_inputs = {name: tensor.to(vlm_device) for name, tensor in encoded.items()}

    with torch.no_grad():
        output_ids = vlm_model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
        )

    # Drop the leading prompt tokens so only the newly generated ones are decoded.
    answer_ids = []
    for prompt_ids, full_ids in zip(model_inputs["input_ids"], output_ids):
        answer_ids.append(full_ids[len(prompt_ids):])

    decoded = vlm_processor.batch_decode(
        answer_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    return decoded[0]


In [None]:
# Example: ask something about the document
user_query = "Can I use Postgres for vector search? Is it built-in or do I need an extension?"

retrieved = search_similar(user_query, top_k=3)

# Guard against an empty result list (e.g. an empty collection) before
# indexing into it; the value printed is the image path, so label it as such.
if retrieved:
    print("Top match image path:", retrieved[0]["image_path"])
else:
    print("No pages were retrieved for this query.")

final_answer = answer_question_with_vlm(user_query, retrieved)
print("\n=== Model Answer ===\n")
print(final_answer)


## 10. Cleanup
Release model resources so the notebook can be rerun without a kernel restart.


In [None]:
import gc

# Drop the notebook-level references to the heavy objects. pop() with a
# default tolerates names that were never created (cells skipped) or were
# already removed by an earlier run of this cell.
# NOTE(review): vlm_model and vlm_processor are released here too, matching
# the final message below; remove them from the tuple to keep the VLM loaded.
for _name in ("model", "processor", "client", "vlm_model", "vlm_processor"):
    globals().pop(_name, None)
del _name

# Collect garbage BEFORE emptying the caches: the allocators can only hand
# back memory whose tensors have actually been freed.
gc.collect()

if torch.cuda.is_available():
    torch.cuda.empty_cache()

if torch.backends.mps.is_available():
    torch.mps.empty_cache()

print("Cleanup complete.")
print("VLM + retrieval models cleaned up.")
