In [1]:
from typing import cast
import torch
from PIL import Image
from colpali_engine.models import ColPali, ColPaliProcessor

# Checkpoint identifier shared by the model and its processor.
model_name = "vidore/colpali-v1.2"

# Load the weights in bfloat16, then put the module into evaluation mode.
model = ColPali.from_pretrained(model_name, torch_dtype=torch.bfloat16)
model = model.eval()

# The processor is loaded under the same checkpoint name as the model.
processor = ColPaliProcessor.from_pretrained(model_name)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:  12%|#1        | 598M/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/862M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/78.6M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/243k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/733 [00:00<?, ?B/s]

In [2]:
# Toy inputs: two solid-colour placeholder images and two text queries.
white_square = Image.new("RGB", (32, 32), color="white")
black_square = Image.new("RGB", (16, 16), color="black")
images = [white_square, black_square]

queries = [
    "Is attention really all you need?",
    "Are Benjamin, Antoine, Merve, and Jo best friends?",
]

# Preprocess each modality, then move the resulting batch to the model's device.
batch_images = processor.process_images(images)
batch_images = batch_images.to(model.device)

batch_queries = processor.process_queries(queries)
batch_queries = batch_queries.to(model.device)

In [3]:
# Forward pass: embed both batches with gradient tracking disabled, since this
# cell only runs the model and never calls backward.
with torch.no_grad():
    image_embeddings = model(**batch_images)
    # NOTE(review): "querry" is a misspelling of "query"; the name is kept as-is
    # because later cells read `querry_embeddings`.
    querry_embeddings = model(**batch_queries)

# Score the query embeddings against the image embeddings (queries are passed
# first, images second).
scores = processor.score_multi_vector(querry_embeddings, image_embeddings)

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


In [4]:
# Display the score tensor for the toy inputs (2 queries x 2 images in the
# recorded output). Presumably rows index queries and columns index images,
# matching the argument order of score_multi_vector — TODO confirm.
scores

tensor([[ 6.7188,  6.9688],
        [10.3125,  9.3750]])

# Testing with custom PDF

In [5]:
import pymupdf

In [8]:
import os
import sys, pymupdf  # import the bindings
# Render every page of the PDF to a 300-dpi PNG named pages/page-<N>.png, where
# <N> is `page.number` (0-based according to the PyMuPDF documentation).
pages_dir = "pages"

# Create the output directory once up front instead of once per page.
os.makedirs(pages_dir, exist_ok=True)

# Open the document in a `with` block so its file handle is released even if
# rendering or saving a page raises; `doc` is closed once the block exits.
with pymupdf.open("../data/DCEE Actions Master List_090920_final.pdf") as doc:
    for page in doc:  # iterate through the pages
        pix = page.get_pixmap(dpi=300)  # render page to an image
        pix.save(f"{pages_dir}/page-{page.number}.png")  # store image as a PNG

In [11]:
# Replace the toy images with two of the PNG pages rendered from the PDF.
page_paths = ("pages/page-30.png", "pages/page-31.png")
images = [Image.open(page_path) for page_path in page_paths]

In [13]:
# Queries for the PDF pages: the first asks about server cooling; the second is
# the same sentence used with the toy inputs earlier in the notebook.
cooling_query = "How can cooling water help make my servers more efficient?"
friends_query = "Are Benjamin, Antoine, Merve, and Jo best friends?"
queries = [cooling_query, friends_query]

# Preprocess each modality, then move the resulting batch to the model's device.
batch_images = processor.process_images(images)
batch_images = batch_images.to(model.device)

batch_queries = processor.process_queries(queries)
batch_queries = batch_queries.to(model.device)

In [14]:
# Forward pass on the PDF-page batch and its queries, with gradient tracking
# disabled. This overwrites the embeddings computed for the toy inputs.
with torch.no_grad():
    image_embeddings = model(**batch_images)
    # NOTE(review): "querry" is a misspelling of "query"; the name is kept as-is
    # because the following cells read `querry_embeddings`.
    querry_embeddings = model(**batch_queries)

In [16]:
# Score the queries against the two PDF pages and display the result.
# Presumably rows index queries and columns index pages, matching the argument
# order below — TODO confirm against the colpali_engine documentation.
scores = processor.score_multi_vector(querry_embeddings, image_embeddings)
scores

tensor([[15.8750, 12.1250],
        [ 6.5938,  6.4688]])

In [None]:
# NOTE(review): this cell has no execution count, and its recorded output equals
# the toy-example scores rather than the PDF-page scores computed in the cell
# above, so the output looks stale. Re-run or remove this cell.
scores

tensor([[ 6.7188,  6.9688],
        [10.3125,  9.3750]])

In [24]:
# Inspect the image embedding tensor's shape; the recorded output is (2, 1030, 128).
# Presumably (images, vectors per image, embedding dim) — TODO confirm.
image_embeddings.shape

torch.Size([2, 1030, 128])

In [26]:
# Inspect the query embedding tensor's shape; the recorded output is (2, 27, 128).
# Presumably (queries, vectors per query, embedding dim) — TODO confirm.
querry_embeddings.shape

torch.Size([2, 27, 128])