### Import and load dataset

In [1]:
import os
import torch
import time
import numpy as np
from PIL import Image
from colpali_engine import ColPali, ColPaliProcessor
from qdrant_client import QdrantClient
from qdrant_client.http import models
from tqdm import tqdm
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.environ["HF_HOME"] = "/media/pc1/Ubuntu/Extend_Data/hf_models"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
dataset = load_dataset("davanstrien/ufo-ColPali", split="train")

In [3]:
dataset

Dataset({
    features: ['image', 'raw_queries', 'broad_topical_query', 'broad_topical_explanation', 'specific_detail_query', 'specific_detail_explanation', 'visual_element_query', 'visual_element_explanation', 'parsed_into_json'],
    num_rows: 2243
})

In [4]:
dataset[29]["image"].show()

### Connecting to Qdrant server

In [5]:
client = QdrantClient(url="http://localhost:6333")

### Setup Colpali

In [6]:
model_name = (
    "davanstrien/finetune_colpali_v1_2-ufo-4bit"
)

colpali_model = ColPali.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="cuda:0"
)

colpali_processor = ColPaliProcessor.from_pretrained("vidore/colpaligemma-3b-pt-448-base")

Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 54827.50it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.67it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


### Configure Qdrant Collection

In [7]:
collection_name = "ufo-binary"

In [8]:
client.create_collection(
    collection_name=collection_name,
    on_disk_payload=True,  # store the payload on disk
    vectors_config=models.VectorParams(
        size=128,
        distance=models.Distance.COSINE,
        on_disk=True, # move original vectors to disk
        multivector_config=models.MultiVectorConfig(
            comparator=models.MultiVectorComparator.MAX_SIM
        ),
        quantization_config=models.BinaryQuantization(
        binary=models.BinaryQuantizationConfig(
            always_ram=True  # keep only quantized vectors in RAM
            ),
        ),
    ),
)

True

### Uploading to the Qdrant

In [11]:
import stamina

@stamina.retry(on=Exception, attempts=3)
def upsert_to_qdrant(points):
    try:
        client.upsert(
            collection_name=collection_name,
            points=points,
            wait=False
        )
    except Exception as e:
        print(f"Error during upsert {e}")
        return False
    return True


In [13]:
batch_size = 16  # Adjust based on your GPU memory constraints

# Use tqdm to create a progress bar
with tqdm(total=len(dataset), desc="Indexing Progress") as pbar:
    for i in range(0, len(dataset), batch_size):
        batch = dataset[i : i + batch_size]

        # The images are already PIL Image objects, so we can use them directly
        images = batch["image"]

        # Process and encode images
        with torch.no_grad():
            batch_images = colpali_processor.process_images(images).to(
                colpali_model.device
            )
            image_embeddings = colpali_model(**batch_images)

        # Prepare points for Qdrant
        points = []
        for j, embedding in enumerate(image_embeddings):
            # Convert the embedding to a list of vectors
            multivector = embedding.cpu().float().numpy().tolist()
            points.append(
                models.PointStruct(
                    id=i + j,  # we just use the index as the ID
                    vector=multivector,  # This is now a list of vectors
                    payload={
                        "source": "internet archive"
                    },  # can also add other metadata/data
                )
            )

        # Upload points to Qdrant
        try:
            upsert_to_qdrant(points)
        except Exception as e:
            print(f"Error during upsert: {e}")
            continue

        # Update the progress bar
        pbar.update(batch_size)

print("Indexing complete!")

Indexing Progress: 2256it [06:34,  5.72it/s]                          

Indexing complete!





In [14]:
client.update_collection(
    collection_name=collection_name,
    optimizer_config=models.OptimizersConfigDiff(indexing_threshold=10),
)

True

### process query

In [16]:
query_text = "top secret"
with torch.no_grad():
    batch_query = colpali_processor.process_queries([query_text]).to(
        colpali_model.device
    )
    query_embedding = colpali_model(**batch_query)
query_embedding

tensor([[[ 0.1543, -0.0261,  0.0933,  ..., -0.0112, -0.0762, -0.0381],
         [ 0.0425, -0.0718, -0.0120,  ...,  0.1211, -0.0645,  0.0659],
         [ 0.0762,  0.0330,  0.0762,  ..., -0.0249, -0.0173,  0.0182],
         ...,
         [-0.0013,  0.0554,  0.0452,  ...,  0.0023,  0.0547,  0.0620],
         [ 0.0369,  0.0425,  0.0332,  ...,  0.0162,  0.0583,  0.0669],
         [ 0.1196,  0.0564,  0.0718,  ..., -0.0063,  0.0112,  0.0732]]],
       device='cuda:0', dtype=torch.bfloat16)

In [17]:
multivector_query = query_embedding[0].cpu().float().numpy().tolist()

### searching and retrieving document

In [21]:
start_time = time.time()
search_result = client.query_points(
    collection_name=collection_name,
    query=multivector_query,
    limit=5,
    timeout=100,
    search_params=models.SearchParams(
        quantization=models.QuantizationSearchParams(
            ignore=False,
            rescore=True,
            oversampling=2.0,
        )
    )
)
end_time = time.time()
# Search in Qdrant
search_result.points

elapsed_time = end_time - start_time
print(f"Search completed in {elapsed_time:.4f} seconds")

Search completed in 0.0088 seconds


In [22]:
idx = search_result.points[0].id
dataset[idx]["image"].show()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
