## Imports

In [1]:
import os
from typing import Dict, Tuple
import time
from pathlib import Path

import torch
import pandas as pd
import base64
from PIL import Image
from transformers import AutoModel, AutoProcessor
import numpy as np
import faiss
import pickle
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
from datasets import load_dataset
from huggingface_hub import login
import json
from mistralai import Mistral
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
sns.set_style("whitegrid")

In [3]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

load_dotenv()

token = os.getenv('HF_TOKEN')
MISTRAL_API_KEY = os.getenv('MISTRAL_API_KEY')
login(token=token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


## Test data

In [4]:
TEST_DATA_DIR = "sites_for_testing"
TEST_DATASET = "sites_for_testing/sites_data.csv"

In [5]:
df = pd.read_csv(TEST_DATASET, sep=";")

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        20 non-null     object
 1   image_path   20 non-null     object
 2   description  20 non-null     object
 3   difficulty   20 non-null     object
 4   type         20 non-null     object
dtypes: object(5)
memory usage: 932.0+ bytes


In [7]:
df["image_path"][:3]

0    sagrada_familia.jpg
1       eiffel_tower.jpg
2              petra.jpg
Name: image_path, dtype: object

In [8]:
df["type"].value_counts()

type
tower           4
religious       3
ancient_city    3
skyscraper      2
building        2
waterfall       2
sign            1
street          1
mountain        1
canyon          1
Name: count, dtype: int64

In [9]:
df["difficulty"].value_counts()

difficulty
easy    10
hard    10
Name: count, dtype: int64

In [10]:
df.head()

Unnamed: 0,title,image_path,description,difficulty,type
0,Sagrada Familia,sagrada_familia.jpg,Barcelona's most popular and famous attraction...,easy,religious
1,The Eiffel Tower,eiffel_tower.jpg,A symbol of Paris and one of the most famous ...,easy,tower
2,Petra,petra.jpg,"Petra is the fabled ""rose red city, half as ol...",easy,ancient_city
3,Great Wall,great_wall.jpg,The Great Wall of China snakes its way through...,easy,ancient_city
4,Big Ben,big_ben.jpg,Big Ben is the nickname for the Great Bell ins...,easy,tower


## Result data

In [11]:
RESULTS_PIXTRAL = "results_pixtral.csv"
RESULTS_SIGLIP_CHUNKS = "results_siglip_chunks.csv"

## Testing Pixtral

In [12]:
image_path = os.path.join(
    TEST_DATA_DIR,
    df["image_path"][0]
)
image_path

'sites_for_testing/sagrada_familia.jpg'

In [13]:
def load_image_as_base64(path):
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

In [14]:
pic_b64 = load_image_as_base64(image_path)
pic_b64[:6]

'/9j/4A'

In [15]:
SYSTEM_CONTENT = (
"Return the answer in a JSON object with the next structure: "
"{\"elements\": [{\"element\": \"some name of element1\", "
"\"description\": \"some description of element 1\"}, "
"{\"element\": \"some name of element2\", \"description\": "
"\"some description of element 2\"}]\""
"\"summary\": \"summarised short description of the whole\"}"
"You should interpret image as a whole entity"
"People on the image do not matter, only the scenery"
"Give very specific explanation, related to the image,"
)

In [16]:
model = "pixtral-12b-2409"

client = Mistral(api_key=MISTRAL_API_KEY)

In [17]:
def get_pixtral_description_from_picture(
    image_path: str,
    model:str = "pixtral-12b-2409",
    api_key = MISTRAL_API_KEY,
    ) -> Tuple[Dict, Dict]:
    """Get description from Pixtral API"""
    image_b64 = load_image_as_base64(image_path)
    messages = [
        {
            "role": "system",
            "content": SYSTEM_CONTENT
        },
        {
            "role": "user",
            "content": "Describe the image and give the exact place or structure"
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": f"data:image/jpeg;base64,{image_b64}"
                }
            ]
        }
    ]
    client = Mistral(api_key=MISTRAL_API_KEY)
    chat_response = client.chat.complete(
        model=model,
        messages=messages,
        response_format={
            "type": "json_object",
        }
    )
    content = chat_response.choices[0].message.content
    data = json.loads(content)
    return data, chat_response

In [18]:
test_result = get_pixtral_description_from_picture(
    image_path
)

In [19]:
test_result[0]

{'elements': [{'element': 'Sagrada Familia',
   'description': 'The Sagrada Familia is a large Roman Catholic church in Barcelona, Spain, designed by the renowned architect Antoni Gaudí. It is known for its distinctive and intricate architectural design, featuring multiple spires and detailed facades.'},
  {'element': 'Cranes',
   'description': 'Cranes are visible around the Sagrada Familia, indicating ongoing construction and restoration work. The church has been under construction for over a century and is still in progress.'}],
 'summary': 'The image depicts the Sagrada Familia, a famous Roman Catholic church in Barcelona, Spain, designed by Antoni Gaudí, with cranes indicating ongoing construction work.'}

In [20]:
test_result[1].usage

UsageInfo(prompt_tokens=3955, completion_tokens=163, total_tokens=4118, prompt_audio_seconds=Unset())

In [21]:
def get_pixtral_description_from_picture_timed(
    image_path: str,
    model: str = "pixtral-12b-2409",
    api_key: str = MISTRAL_API_KEY,
) -> Tuple[Dict, Dict]:
    """
    Возвращает:
    - data: ответ модели (dict)
    - meta: метаданные (latency, model, image_path)
    """
    image_b64 = load_image_as_base64(image_path)

    messages = [
        {"role": "system", "content": SYSTEM_CONTENT},
        {"role": "user", "content": "Describe the image and give the exact place or structure"},
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": f"data:image/jpeg;base64,{image_b64}",
                }
            ],
        },
    ]

    client = Mistral(api_key=api_key)

    start_time = time.perf_counter()

    chat_response = client.chat.complete(
        model=model,
        messages=messages,
        response_format={"type": "json_object"},
    )

    latency = time.perf_counter() - start_time

    content = chat_response.choices[0].message.content
    data = json.loads(content)

    meta = {
        "image_path": image_path,
        "model": model,
        "latency_sec": latency,
        "prompt_tokens": chat_response.usage.prompt_tokens,
        "completion_tokens": chat_response.usage.completion_tokens,
        "total_tokens": chat_response.usage.total_tokens,

    }

    return data, meta


In [22]:
image_dir = Path(TEST_DATA_DIR)
jpg_paths = list(image_dir.glob("*.jpg"))

In [23]:
len(jpg_paths)

20

In [24]:
jpg_paths[0].name

'stonehenge.jpg'

In [25]:
jpg_paths[:2]

[PosixPath('sites_for_testing/stonehenge.jpg'),
 PosixPath('sites_for_testing/big_ben.jpg')]

In [26]:
rows = []

for image_path in tqdm(jpg_paths):
    data, meta = get_pixtral_description_from_picture_timed(image_path)

    rows.append({
        "image_path": meta["image_path"].name,
        "model": meta["model"],
        "latency_sec": meta["latency_sec"],
        "prompt_tokens": meta["prompt_tokens"],
        "completion_tokens": meta["completion_tokens"],
        "total_tokens": meta["total_tokens"],
        "summary": data.get("summary"),
        "elements": data.get("elements"),
        "raw_response": data,
    })

df_pixtral = pd.DataFrame(rows)

100%|███████████████████████████████████████████| 20/20 [00:59<00:00,  2.98s/it]


In [27]:
df_pixtral.head()

Unnamed: 0,image_path,model,latency_sec,prompt_tokens,completion_tokens,total_tokens,summary,elements,raw_response
0,stonehenge.jpg,pixtral-12b-2409,1.657614,3235,99,3334,"The image depicts Stonehenge, an ancient and i...","[{'element': 'Stonehenge', 'description': 'An ...","{'elements': [{'element': 'Stonehenge', 'descr..."
1,big_ben.jpg,pixtral-12b-2409,1.48958,1666,172,1838,"The image depicts the iconic Elizabeth Tower, ...","[{'element': 'Big Ben', 'description': 'The El...","{'elements': [{'element': 'Big Ben', 'descript..."
2,grand_canyon.jpg,pixtral-12b-2409,3.110798,2195,101,2296,"The image depicts the Grand Canyon at sunset, ...","[{'element': 'Grand Canyon', 'description': 'A...","{'elements': [{'element': 'Grand Canyon', 'des..."
3,hollywood_sign.jpg,pixtral-12b-2409,2.097872,2715,87,2802,The image features the famous Hollywood Sign o...,"[{'element': 'Hollywood Sign', 'description': ...","{'elements': [{'element': 'Hollywood Sign', 'd..."
4,hofburg.jpg,pixtral-12b-2409,3.90272,2975,175,3150,"The image depicts Heldenplatz, a significant p...","[{'element': 'Heldenplatz', 'description': 'He...","{'elements': [{'element': 'Heldenplatz', 'desc..."


In [28]:
df_pixtral.to_csv(RESULTS_PIXTRAL)

In [29]:
np.mean(df_pixtral["latency_sec"])

np.float64(2.919048402219778)

Далее ручная разметка по тому, получилось ли правильно определить достопримечательность

## Testing SigLip embeddings

In [30]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
VISION_MODEL = "google/siglip-base-patch16-384"

vision_processor = AutoProcessor.from_pretrained(VISION_MODEL)
vision_model = AutoModel.from_pretrained(VISION_MODEL).to(DEVICE)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [31]:
# load index + chunks
index_siglip = faiss.read_index("siglip.index")
chunked_texts = pickle.load(open("siglip_chunks.pkl", "rb"))

In [32]:
def encode_image_siglip(image_path):
    """Return L2-normalized SigLIP image embedding."""
    img = Image.open(image_path).convert("RGB")
    inputs = vision_processor(images=img, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        emb = vision_model.get_image_features(**inputs)
        emb = emb / emb.norm(dim=-1, keepdim=True)
    return emb.cpu().numpy()[0].astype("float32")

In [33]:
def image_search_siglip_timed(
    image_path,
    vectorstore,
    chunked_texts,
    k=5,
    model="google/siglip-base-patch16-384",
    ):
    """Search in SigLIP FAISS index by image similarity."""
    # --- embedding ---
    start_emb = time.perf_counter()
    image_emb = encode_image_siglip(image_path)
    latency_emb = time.perf_counter() - start_emb

    # --- vector search ---
    start_search = time.perf_counter()
    D, I = vectorstore.search(image_emb.reshape(1, -1), k)
    latency_search = time.perf_counter() - start_search

    # --- total ---
    latency_total = latency_emb + latency_search
    results = []
    for score, idx in zip(D[0], I[0]):
        results.append({
            "score": float(score),
            "text": chunked_texts[idx],
            "chunk_id": int(idx)
        })
    meta = {
        "image_path": image_path.name,
        "model": model,
        "latency_emb": latency_emb,
        "latency_search": latency_search,
        "latency_total": latency_total,
    }
    return results, meta

In [34]:
image_search_siglip_timed(
    jpg_paths[0],
    index_siglip,
    chunked_texts,
)

([{'score': 0.1235327422618866,
   'text': 'you time to see only the Stones and not time to appreciate the surrounding area. For tours starting from London, the price starts from around £65 for adult, including entry fee and pick-up service in your London hotel. The Stonehenge Tour is a tourist-oriented bus service from Salisbury to Stonehenge (on the return trip, it stops at Old Sarum near Salisbury). Tickets cost £16 (adults)/£11 (children) for tour only, or £33 (adults)/£22 (children) for the tour and entry to Stonehenge, Old Sarum and Salisbury',
   'chunk_id': 570229},
  {'score': 0.1235279068350792,
   'text': 'Verulamium Museum and Verulamium Park. Stonehenge - Among the most famous landmarks in England. The mysterious stone ring was built thousands of years ago, today it is a UNESCO World Heritage Site. Best visited in combination with a trip to nearby city Salisbury, where you can also visit the 13th-century cathedral with the highest spire in the country. Winchester - Former 

In [35]:
rows = []

for image_path in tqdm(jpg_paths):
    data, meta = image_search_siglip_timed(
        image_path,
        index_siglip,
        chunked_texts,        
    )
    tmp_data = []
    for _ in data:
        tmp_data.append(_["text"])
    rows.append({
        "image_path": meta["image_path"],
        "model": meta["model"],
        "latency_emb": meta["latency_emb"],
        "latency_search": meta["latency_search"],
        "latency_total": meta["latency_total"],
        "raw_response": data,
        "texts": tmp_data
    })

df_siglip = pd.DataFrame(rows)

100%|███████████████████████████████████████████| 20/20 [00:03<00:00,  5.04it/s]


In [36]:
df_siglip.to_csv(RESULTS_SIGLIP_CHUNKS)

In [37]:
df_siglip.head()

Unnamed: 0,image_path,model,latency_emb,latency_search,latency_total,raw_response,texts
0,stonehenge.jpg,google/siglip-base-patch16-384,0.170223,0.035569,0.205792,"[{'score': 0.1235327422618866, 'text': 'you ti...",[you time to see only the Stones and not time ...
1,big_ben.jpg,google/siglip-base-patch16-384,0.152878,0.032467,0.185345,"[{'score': 0.10741087049245834, 'text': 'of th...",[of the Main gate. A simulacrum of the famous ...
2,grand_canyon.jpg,google/siglip-base-patch16-384,0.169482,0.035289,0.204771,"[{'score': 0.1330050826072693, 'text': 'sunset...",[sunset there—you shouldn't miss it. The large...
3,hollywood_sign.jpg,google/siglip-base-patch16-384,0.15335,0.028753,0.182103,"[{'score': 0.1350061148405075, 'text': '(a la ...",[(a la 'Hollywood') that greets travelers comi...
4,hofburg.jpg,google/siglip-base-patch16-384,0.160629,0.031657,0.192286,"[{'score': 0.15511950850486755, 'text': 'north...",[north-west plinth in the square was intended ...


In [38]:
np.mean(df_siglip["latency_emb"])

np.float64(0.16641559989657254)

In [39]:
np.mean(df_siglip["latency_search"])

np.float64(0.03179246645886451)

In [40]:
np.mean(df_siglip["latency_total"])

np.float64(0.19820806635543703)