In [1]:
import os
import torch
import numpy as np
from PIL import Image
import open_clip

import requests
import pickle
from io import BytesIO
from transformers import AutoModel, AutoImageProcessor

import faiss
import json

import cv2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_embbedings_model(model_name="facebook/dinov2-base"):
    """
    Load a DINOv2 checkpoint and its image processor from Hugging Face.

    Args:
        model_name: Checkpoint identifier handed to both ``from_pretrained``
            calls. Defaults to ``"facebook/dinov2-base"``.

    Returns:
        A ``(model, processor, device)`` tuple, where ``device`` is the string
        ``"cuda"`` when CUDA is available and ``"cpu"`` otherwise, and
        ``model`` has been moved to that device and switched to eval mode.
    """
    # Pick the GPU when torch can see one, otherwise stay on the CPU.
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    # The processor is fetched first, then the model weights.
    image_processor = AutoImageProcessor.from_pretrained(model_name)
    backbone = AutoModel.from_pretrained(model_name)
    backbone = backbone.to(target_device)
    backbone.eval()

    return backbone, image_processor, target_device

# Load the default checkpoint once at notebook level; the resulting model,
# processor and device are reused by the cells below.
model, processor, device = load_embbedings_model()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [8]:
def get_embedding(image_source, model, processor, device, timeout=10.0):
    """
    Encode a single image into an L2-normalised embedding.

    Args:
        image_source: Either an ``http://`` / ``https://`` URL (as ``str``) or
            anything ``PIL.Image.open`` accepts for a local image (a path
            string, a ``pathlib.Path`` or a binary file object).
        model: Callable invoked as ``model(**inputs)``. Its output must expose
            ``pooler_output`` and/or ``last_hidden_state``.
        processor: Image processor invoked as
            ``processor(images=img, return_tensors="pt")``; the result must
            support ``.to(device)``.
        device: Device the processed inputs are moved to before the forward
            pass.
        timeout: Timeout in seconds forwarded to ``requests.get`` when
            ``image_source`` is a URL, so a stalled server cannot hang the
            call forever.

    Returns:
        A NumPy array of shape ``(1, D)`` holding the embedding. It has unit L2
        norm unless the raw embedding is all zeros, in which case the zero
        vector is returned unchanged instead of NaNs.

    Raises:
        requests.HTTPError: If the URL download returns an error status
            (raised by ``response.raise_for_status()``).
    """
    # Only genuine HTTP(S) URLs go through the network branch; a local file
    # whose name merely starts with "http" is still opened from disk.
    is_url = isinstance(image_source, str) and image_source.startswith(
        ("http://", "https://")
    )
    if is_url:
        response = requests.get(image_source, timeout=timeout)
        response.raise_for_status()
        source = BytesIO(response.content)
    else:
        source = image_source

    # The context manager releases the underlying file handle right away;
    # convert() returns an independent, fully loaded RGB copy.
    with Image.open(source) as raw_img:
        img = raw_img.convert("RGB")

    # Automatic preprocessing (resize / normalisation handled by the processor).
    inputs = processor(images=img, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)

        # Prefer the pooled output when the model provides one.
        pooled = getattr(outputs, "pooler_output", None)
        if pooled is not None:
            emb = pooled  # [batch_size, hidden_dim]
        else:
            # Fallback: mean over the token dimension of the last hidden state.
            emb = outputs.last_hidden_state.mean(dim=1)

        emb = emb.cpu().numpy().flatten()

    # L2 normalisation, skipped for a zero vector to avoid dividing by zero.
    norm = np.linalg.norm(emb)
    if norm > 0:
        emb = emb / norm
    return emb.reshape(1, -1)

# Quick check: embed one segmented card image from the local data folder.
# get_embedding returns a (1, D) array (it reshapes with reshape(1, -1)).
emb = get_embedding("./data/images/segmented/card_0.png",model, processor, device)

array([[-2.39026211e-02, -5.78677515e-03,  4.35819104e-02,
        -2.10122950e-03, -3.47709097e-02, -1.82606112e-02,
        -3.65590048e-03, -3.84214744e-02,  1.73628964e-02,
        -1.52876629e-02, -1.79880485e-02, -1.16353016e-02,
         2.32423730e-02,  5.10385558e-02, -1.56448577e-02,
        -1.52399694e-03, -3.83429378e-02,  1.54630542e-02,
        -3.38093527e-02,  2.63718329e-02, -1.32524492e-02,
        -8.71510059e-03,  3.53350267e-02, -1.93290263e-02,
         4.43451814e-02, -3.19859311e-02, -1.74969751e-02,
         5.01362979e-03, -4.31652591e-02,  6.39149398e-02,
        -2.25111581e-02,  2.44250540e-02, -1.60203557e-02,
         2.20584348e-02,  6.56099394e-02,  1.85066629e-02,
         4.39934954e-02, -4.03589979e-02,  5.12746256e-03,
         1.40435705e-02, -1.26403989e-02, -4.27066050e-02,
        -1.64440610e-02, -2.67996285e-02,  4.39841859e-02,
         2.16031596e-02, -2.75751296e-02,  2.33963248e-03,
        -9.99078061e-03,  1.16108870e-02, -4.45795171e-0