## Imports

In [4]:
# %pip install sentencepiece
# %pip install protobuf
# %pip install langchain_community
# %pip install langchain
# %pip install langchain_core
# %pip install langchain_huggingface

In [2]:
import os

import base64
import torch
from transformers import AutoModel, AutoProcessor
from PIL import Image
import numpy as np
import faiss
import pickle
from langchain_community.vectorstores import FAISS
from tqdm import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.vectorstores.faiss import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
from datasets import load_dataset
from huggingface_hub import login
import json


In [3]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

load_dotenv()

token = os.getenv('HF_TOKEN')
MISTRAL_API_KEY = os.getenv('MISTRAL_API_KEY')
login(token=token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


## Directories

In [5]:
images_fpath = "images"
if not os.path.exists(images_fpath):
    os.mkdir(images_fpath)

## SigLip embedding from image

In [6]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
VISION_MODEL = "google/siglip-base-patch16-384"

vision_processor = AutoProcessor.from_pretrained(VISION_MODEL)
vision_model = AutoModel.from_pretrained(VISION_MODEL).to(DEVICE)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [7]:
def encode_image_siglip(image_path):
    """Return L2-normalized SigLIP image embedding."""
    img = Image.open(image_path).convert("RGB")
    inputs = vision_processor(images=img, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        emb = vision_model.get_image_features(**inputs)
        emb = emb / emb.norm(dim=-1, keepdim=True)
    return emb.cpu().numpy()[0].astype("float32")

In [8]:
pic_fpath = os.path.join(
    images_fpath,
    "pamukkale.jpg"
)

In [9]:
pic_embedding = encode_image_siglip(pic_fpath)
pic_embedding.shape

(768,)

## Create embeddings from Wikivoyage

In [84]:
# ---------------------------
# 1. Load dataset
# ---------------------------

dataset = load_dataset("bigscience-data/roots_en_wikivoyage", split="train")
texts = dataset["text"]

# ---------------------------
# 2. Split texts into chunks
# ---------------------------

chunk_size = 512
chunk_overlap = 128
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

chunked_texts = []
for text in tqdm(texts, desc="Splitting into chunks"):
    chunks = splitter.split_text(text)
    chunked_texts.extend(chunks)

print("Total chunks:", len(chunked_texts))

Splitting into chunks: 100%|██████████| 24838/24838 [00:16<00:00, 1540.42it/s]

Total chunks: 625336





In [10]:
# ---------------------------
# 4. SigLIP text embedding
# ---------------------------

@torch.no_grad()
def encode_text_siglip(texts):
    """Return L2-normalized SigLIP text embedding."""
    inputs = vision_processor(text=texts, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
    emb = vision_model.get_text_features(**inputs)
    emb = emb / emb.norm(dim=-1, keepdim=True)
    return emb.cpu().numpy()

In [38]:
# Compute embeddings
siglip_embs = []
batch_size = 32

for i in tqdm(range(0, len(chunked_texts), batch_size), desc="Encoding SigLIP embeddings"):
    batch = chunked_texts[i:i+batch_size]
    emb = encode_text_siglip(batch)
    siglip_embs.append(emb)

Encoding SigLIP embeddings: 100%|██████████| 19542/19542 [2:13:57<00:00,  2.43it/s]  


In [39]:
siglip_embs = np.concatenate(siglip_embs, axis=0).astype("float32")
print("SigLIP embedding matrix shape:", siglip_embs.shape)

# ---------------------------
# 5. Build a separate FAISS index (cosine via inner product)
# ---------------------------

dim = siglip_embs.shape[1]

index_siglip = faiss.IndexFlatIP(dim)
index_siglip.add(siglip_embs)

print("FAISS SigLIP index built:", index_siglip.ntotal)

# ---------------------------
# 6. Save everything
# ---------------------------

faiss.write_index(index_siglip, "siglip.index")
np.save("siglip_embs.npy", siglip_embs)

SigLIP embedding matrix shape: (625336, 768)
FAISS SigLIP index built: 625336


In [40]:
with open("siglip_chunks.pkl", "wb") as f:
    pickle.dump(chunked_texts, f)

In [43]:
chunk_size = 512
chunk_overlap = 128

In [44]:
metadata = {
    "embedding_dim": dim,
    "chunk_size": chunk_size,
    "chunk_overlap": chunk_overlap,
    "total_chunks": len(chunked_texts),
    "model": "google/siglip-base-patch16-384"
}

In [None]:
with open("siglip_metadata.json", "w") as f:
    json.dump(metadata, f)

print("SigLIP index saved.")


SigLIP index saved.


## Test

In [12]:
def encode_image_siglip(image_path):
    """Return L2-normalized SigLIP image embedding."""
    img = Image.open(image_path).convert("RGB")
    inputs = vision_processor(images=img, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        emb = vision_model.get_image_features(**inputs)
        emb = emb / emb.norm(dim=-1, keepdim=True)
    return emb.cpu().numpy()[0].astype("float32")

In [13]:
def image_search_siglip(image_path, k=5):
    """Search in SigLIP FAISS index by image similarity."""
    # load index + chunks
    index_siglip = faiss.read_index("siglip.index")
    chunked_texts = pickle.load(open("siglip_chunks.pkl", "rb"))

    # embed image
    image_emb = encode_image_siglip(image_path)

    # similarity search
    D, I = index_siglip.search(image_emb.reshape(1, -1), k)

    results = []
    for score, idx in zip(D[0], I[0]):
        results.append({
            "score": float(score),
            "text": chunked_texts[idx],
            "chunk_id": int(idx)
        })

    return results

In [14]:
%%time
test_search = image_search_siglip(pic_fpath)

CPU times: user 636 ms, sys: 826 ms, total: 1.46 s
Wall time: 3.37 s


In [15]:
test_search

[{'score': 0.14834292232990265,
  'text': 'Pamukkale is a hot spring with calcium-coated cliffs and pools in inland southeastern Aegean Turkey. Pamukkale, which has been used as a spa since the second century BC, literally means "cotton castle" in Turkish. The travertine features have their origins in the shifting of a fault in the valley of the Menderes river (between here and Denizli). As the fault shifted, very hot springs with a very high mineral content (notably chalk) arose at this location. Apart from the slightly radioactive minerals, the',
  'chunk_id': 231993},
 {'score': 0.14748886227607727,
  'text': '), so the travertines stay white as ever. This job is made tougher in winters when the water flowing down the chalky cascades will be freezing cold. Unfortunately in winter they not let water into the pools you can usually soak in in summer. Instead, just a little river is going downhill for warm your feet and a small waterfall to the side, but nothing to fully soak in. At the

In [17]:
test_search[0]["text"]

'Pamukkale is a hot spring with calcium-coated cliffs and pools in inland southeastern Aegean Turkey. Pamukkale, which has been used as a spa since the second century BC, literally means "cotton castle" in Turkish. The travertine features have their origins in the shifting of a fault in the valley of the Menderes river (between here and Denizli). As the fault shifted, very hot springs with a very high mineral content (notably chalk) arose at this location. Apart from the slightly radioactive minerals, the'

Это заработало

In [18]:
def get_pic_fpath(
    picture_name: str,
    image_dir = "images"
    ):
    """Create picture full path using directory"""
    pic_fpath = os.path.join(
        image_dir,
        picture_name
    )
    return pic_fpath

In [19]:
venice_fpath = get_pic_fpath("venice_grand_canal.jpg")

In [20]:
test_search = image_search_siglip(venice_fpath)

In [21]:
test_search

[{'score': 0.12463367730379105,
  'text': 'It has a vaguely rectangular shape, with the major side of about 900 m. It is crossed by only two internal canals: the Santa Caterina canal winds at the western end, passing in front of the church of the same name, and cuts the island from side to side; the other opens in front of Burano. The landscape is characterized by the presence of cultivated areas on which traditional fruit and vegetable activities (such as the cultivation of castraure) take place, similarly to other islands in the lagoon. Formerly',
  'chunk_id': 499177},
 {'score': 0.12433916330337524,
  'text': 'be either a long walk or an expensive vaporetto ride. The opposite question is whether it makes sense for someone staying in central Venice to visit Mestre. For someone staying for 3 days or less, probably the answer is no. Those staying longer, however, may enjoy going to Mestre to have a break from tourist crowds, shop and eat at non-inflated prices, go to bars and nightclu

In [22]:
istanbul_sea_fpath = get_pic_fpath("istanbul_sea.jpg")

In [23]:
test_search = image_search_siglip(istanbul_sea_fpath)

In [24]:
test_search

[{'score': 0.1357659548521042,
  'text': '+1 418-235-4537, toll-free: +1 877-235-4197. Excursions organized in small boats for up to 12 passengers (duration of 2 or 3 hours). (updated Dec 2021) 48.228589-69.55367414 Quai Les Bergeronnes. Wharf at the eastern tip of the Bay of Grandes Bergeronnes. (updated Dec 2021) 48.2927-69.5532815 Croisières Essipit, 498, rue de la Mer, Grandes-Bergeronnes, ☏ +1 418-232-6778. Whale-watching cruises, condo-hotels, cabins, campsites, outfitters and sea kayaking. (updated Dec 2021) 48.318311-69.41516816 Quai de',
  'chunk_id': 417502},
 {'score': 0.1310519278049469,
  'text': 'Side. In Turkish Boğaz means the Bosphorus channel, while Boğaziçi is the word for these neighbourhoods. These began as separate fishing villages, and even today have not completely grown into each other. So they each have a different character, with late Ottoman palaces, parks, and lush woodlands. This is one of the most scenic districts of Istanbul, especially in May when the J

In [25]:
sagrada_familia_fpath = get_pic_fpath("sagrada_familia.jpg")

In [26]:
test_search = image_search_siglip(sagrada_familia_fpath)
test_search

[{'score': 0.1530635952949524,
  'text': "examples of Gothic and Modernist architectures. Home of the Sagrada Familia and other Antoni Gaudí buildings such as Parc Guell and La Pedrera. Barcelona is also well known for its museums and cultural events. The Roman ruins including the complex and colosseum in Tarragona and the ruins in Castelló d'Empúries. Enjoy the long fine sand beaches of Costa Daurada and the grain beaches on Costa Brava See the volcanoes near the city of Olot, and La Fageda d'en Jordà, a very nice forest and extinct volcanoes.",
  'chunk_id': 223108},
 {'score': 0.1353929489850998,
  'text': "is a testament to Antonio Gaudi's modernist style and unlike any other church in the world. The texture and detail on the façades are intricate and incorporate forms seen in nature as much of Gaudi's work does. The inside is huge — it's estimated that it can hold 13,000 people — with vaulted ceilings and columns designed to be reminiscent of trees. The church is still under const