# Download Dataset

**Kaggle Dataset**

In [None]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.20-py3-none-any.whl (14 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.20


In [None]:
import opendatasets as od

Для следующей ячейки потребуются данные из Kaggle аккаунта:

Your Profile -> Account -> Create New API Token

In [None]:
# Fetch the STL-10 dataset from Kaggle. Per the recorded output below, this call
# asks for a Kaggle username and API key and downloads stl10.zip into ./stl10.
od.download("https://www.kaggle.com/jessicali9530/stl10")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: lightlegends
Your Kaggle Key: ··········
Downloading stl10.zip to ./stl10


100%|██████████| 1.88G/1.88G [00:56<00:00, 35.4MB/s]





# Work with SearchModel

In [None]:
# ruCLIP comes from PyPI (pip output is discarded); OpenAI CLIP is installed straight from GitHub.
!pip install ruclip==0.0.1 > /dev/null
!pip install git+https://github.com/openai/CLIP.git
# The installs below are not needed on Colab; they may be useful on a server, but there torchvision must be installed for CPU, not for CUDA as in the line below
#!pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 -f https://download.pytorch.org/whl/torch_stable.html

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-ezrs9w4j
  Running command git clone -q https://github.com/openai/CLIP.git /tmp/pip-req-build-ezrs9w4j
Collecting ftfy
  Downloading ftfy-6.0.3.tar.gz (64 kB)
[K     |████████████████████████████████| 64 kB 1.9 MB/s 
Building wheels for collected packages: clip, ftfy
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369221 sha256=29518b531e4aace9fa65994377eadc58f6217a16435666eba85638f92ebb1aa1
  Stored in directory: /tmp/pip-ephem-wheel-cache-xcr6lom_/wheels/fd/b9/c3/5b4470e35ed76e174bff77c92f91da82098d5e35fd5bc8cdac
  Building wheel for ftfy (setup.py) ... [?25l[?25hdone
  Created wheel for ftfy: filename=ftfy-6.0.3-py3-none-any.whl size=41933 sha256=72a14e115cdc3c8459b18e3200edfde8d7d7038482118eefd9f8339a775f1bc9
  Stored in directory: /root/.cache/pip/wheels/19/f5/38/273eb3b5e76dfd850619312f

In [None]:
import numpy as np
# from faiss import Indexer


class DummyIndexer():
    """
    Brute-force in-memory index: every embedding is kept as a row of one numpy
    array and rows are ranked by their dot product with the query.
    """

    def __init__(self):
        """
        Creates an empty index object
        """
        self.index = None

    def add(self, embs: np.ndarray):
        """
        Adds new embeddings embs in empty or existing index
        :param embs: array whose rows are appended to the index along axis 0
        :return: None
        """
        if self.index is None:
            self.index = embs
        else:
            self.index = np.concatenate((self.index, embs), axis=0)

    def train(self):
        """
        Not sure if this one is necessary here, left for compatibility with abstract class Indexer
        :return:
        """
        pass

    def find(self, query: np.ndarray, topn: int) -> "tuple[np.ndarray, np.ndarray]":
        """
        Returns topn entries closest to the query vector
        :param query: query embedding; singleton dimensions are squeezed away
        :param topn: maximum number of entries to return
        :return: (D, I) - dot-product scores in descending order and the row
                 indices of the matching entries
        :raises ValueError: if nothing was added to or loaded into the index
        """
        if self.index is None:
            # Fail with a clear message instead of an obscure error from `None @ array`
            raise ValueError("The index is empty: call add() or load() before find()")
        similarities = self.index @ query.squeeze()
        # Negate so that argsort (ascending) yields the highest score first
        best_photo_idx = (-similarities).argsort()[:topn]
        return similarities[best_photo_idx], best_photo_idx

    def save(self, file: str):
        """
        Saves data to npy file
        :param file: target path; np.save appends ".npy" when it is missing
        :return: None
        :raises ValueError: if the index is empty
        """
        if self.index is None:
            # np.save(None) would write a pickled object that load() cannot read back
            raise ValueError("The index is empty: nothing to save")
        np.save(file, self.index)

    def load(self, file: str):
        """
        Loads data from npy file
        :param file: path previously given to save(), with or without ".npy"
        :return: None
        """
        try:
            self.index = np.load(file)
        except FileNotFoundError:
            # save() may have stored the data under "<file>.npy" (np.save adds
            # the extension), so retry with it before giving up.
            if str(file).endswith(".npy"):
                raise
            self.index = np.load(f"{file}.npy")

In [None]:
"""
Created on 2022 Jan 28 14:09 
@author: keller
"""
import abc

import torch
import ruclip
import clip
import numpy as np

from PIL import Image

from numbers import Number
from typing import List

class Embedder(abc.ABC):
    """Abstract interface shared by the text/image embedders in this notebook."""

    @abc.abstractmethod
    def encode_text(self, text):
        """Embed a text query; concrete subclasses provide the implementation."""

    @abc.abstractmethod
    def encode_imgs(self, imgs):
        """Embed a batch of images; concrete subclasses provide the implementation."""

    def cos(self, emb1: np.ndarray, emb2: np.ndarray) -> Number:
        """
        Cosine similarity of two embeddings
        :param emb1: first embedding, (N,) or (1, N)
        :param emb2: second embedding, (N,) or (1, N)
        :return: dot product divided by the product of the two norms
        """
        # Drop singleton axes so (1, N) inputs behave like plain vectors
        left = emb1.squeeze()
        right = emb2.squeeze()
        scale = np.linalg.norm(left) * np.linalg.norm(right)
        return np.dot(left, right) / scale


class EmbedderRuCLIP(Embedder):
    """Embedder that delegates to a ruCLIP model wrapped in ``ruclip.Predictor``."""

    def __init__(self, ruclip_model_name='ruclip-vit-base-patch32-384',
             device='cpu', templates = ['{}', 'это {}', 'на картинке {}']):
        """
        :param ruclip_model_name: model name passed to ``ruclip.load``
        :param device: device passed to ``ruclip.Predictor``
        :param templates: prompt templates passed to ``ruclip.Predictor``; a copy
                          is handed over so the shared default list (or the
                          caller's list) can never be modified through the predictor
        """
        # Local is called clip_model (not `clip`) so that the OpenAI `clip`
        # module imported by this notebook is not shadowed inside this method.
        clip_model, processor = ruclip.load(ruclip_model_name)
        self.predictor = ruclip.Predictor(clip_model, processor, device, bs=8,
                                          templates=list(templates))

    def _tonumpy(self, tensor: torch.Tensor) -> np.ndarray:
        """
        Detaches tensor from GPU and converts it to numpy array
        :param tensor: tensor on any device
        :return: numpy array
        """
        return tensor.cpu().detach().numpy()

    def encode_text(self, text: str) -> np.ndarray:
        """
        Returns text latent of the text input
        :param text: a single query string
        :return: numpy array with the predictor's text latents for that query
        """
        # The predictor works on a list of class names, so wrap the single query
        classes = [text, ]
        with torch.no_grad():
            text_latent = self.predictor.get_text_latents(classes)
        return self._tonumpy(text_latent)

    def encode_imgs(self, pil_imgs: List[Image.Image]) -> np.ndarray:
        """
        Returns image latents of a image batch
        :param pil_imgs: list of PIL images
        :return img_latents: numpy array of img latents
        """
        with torch.no_grad():
            img_latents = self.predictor.get_image_latents(pil_imgs)
        return self._tonumpy(img_latents)

class EmbedderCLIP(Embedder):
    """Embedder that uses an OpenAI CLIP model loaded through ``clip.load``."""

    def __init__(self, clip_model_name='ViT-B/32', device='cpu'):
        """
        :param clip_model_name: model name passed to ``clip.load``
        :param device: device the model is loaded on and inputs are moved to
        """
        self.device = device
        model_and_transform = clip.load(clip_model_name, device=device)
        self.predictor, self.preprocess = model_and_transform

    def _tonumpy(self, tensor: torch.Tensor) -> np.ndarray:
        """
        Moves a tensor to the CPU, detaches it and converts it to numpy
        :param tensor: tensor on any device
        :return: numpy array
        """
        on_cpu = tensor.cpu()
        return on_cpu.detach().numpy()

    def encode_text(self, text: str) -> np.ndarray:
        """
        Encodes a text query and scales it to unit length
        :param text: query text handed to ``clip.tokenize``
        :return: numpy array with the normalized text latent
        """
        tokens = clip.tokenize(text).to(self.device)
        with torch.no_grad():
            text_latent = self.predictor.encode_text(tokens)
            # Divide by the L2 norm along the feature axis
            text_latent = text_latent / text_latent.norm(dim=-1, keepdim=True)

        return self._tonumpy(text_latent)

    def encode_imgs(self, pil_imgs: List[Image.Image]) -> np.ndarray:
        """
        Encodes a batch of images and scales every latent to unit length
        :param pil_imgs: list of PIL images
        :return img_latents: numpy array of normalized img latents
        """
        # Run the CLIP preprocessing on each photo, then stack into one batch
        prepared = []
        for photo in pil_imgs:
            prepared.append(self.preprocess(photo))
        photos_preprocessed = torch.stack(prepared).to(self.device)

        with torch.no_grad():
            img_latents = self.predictor.encode_image(photos_preprocessed)
            # Divide by the L2 norm along the feature axis
            img_latents = img_latents / img_latents.norm(dim=-1, keepdim=True)

        return self._tonumpy(img_latents)

In [None]:
import os
import glob
import numpy as np
import pandas as pd
import math
from PIL import Image
from typing import List
from pathlib import Path


class SearchModel():
    """
    Ties an embedder and an indexer together: embeds every image of a folder
    batch by batch, stores the merged features next to the image folder and
    answers nearest-image queries through the indexer.
    """

    # Names of the merged files kept in the features directory
    _FEATURES_FILE = "features.npy"
    _IDS_FILE = "photo_ids.csv"

    def __init__(self, embedder, indexer):
        """
        :param embedder: object with an ``encode_imgs(list_of_images)`` method
        :param indexer: object with ``load(file)`` and ``find(query, topn)`` methods
        """
        self.embedder = embedder
        self.indexer = indexer
        self.images_dir = None
        self.imgs_path = None
        self.features_path = None

    def load_imgs(self, path: str, prefix: str):
        """
        Registers the image folder and prepares the features folder
        ``<parent of path>/features/<prefix>`` (created when missing).

        ``self.imgs_path`` becomes the list of files matching ``*.*`` in path,
        or - when merged features from an earlier run are present - the photo
        ids stored in ``photo_ids.csv``. The indexer itself is not touched.
        :param path: folder with the images
        :param prefix: sub-folder name for this model's features
        :return: None
        """
        self.images_dir = path
        photos_path = Path(self.images_dir)
        self.features_path = Path(f"{photos_path.parent}/features/{prefix}")
        # makedirs creates the intermediate "features" folder too and is a no-op on re-runs
        os.makedirs(self.features_path, exist_ok=True)

        self.imgs_path = list(photos_path.glob("*.*"))

        merged_ids = self.features_path / self._IDS_FILE
        merged_features = self.features_path / self._FEATURES_FILE
        # Look for the merged files themselves rather than counting directory
        # entries: leftover batch files must not be mistaken for a finished run.
        if merged_ids.exists() and merged_features.exists():
            self.imgs_path = list(pd.read_csv(merged_ids)['photo_id'])

    def load_img_urls(self):
        """
        In case we want to load imgs from a list of url
        :return:
        """
        pass

    def _load_image(self, photo_file):
        """Opens one image, reads its pixels into memory and closes the file."""
        with Image.open(photo_file) as img:
            img.load()
            return img

    def _batch_files(self, i: int):
        """Returns the (ids csv, features npy) paths of batch number i."""
        return (self.features_path / f"{i:010d}.csv",
                self.features_path / f"{i:010d}.npy")

    def save_embs(self, batch_size=512) -> None:
        """
        Extracts image embeddings from embedder and adds them to indexer

        Images are encoded in batches; each batch is written to its own pair of
        files so an interrupted run can be resumed. The batches are then merged
        into ``features.npy`` / ``photo_ids.csv``, the per-batch files are
        removed and the merged features are loaded into the indexer. Afterwards
        ``self.imgs_path`` lists exactly the images that have a feature row.
        :param batch_size: maximum number of images encoded at once
        :return: None
        :raises RuntimeError: if load_imgs() has not been called
        :raises ValueError: if batch_size is not positive, there are no images,
                            or no batch could be processed
        """
        if self.features_path is None or self.imgs_path is None:
            raise RuntimeError("load_imgs() must be called before save_embs()")
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        merged_features = self.features_path / self._FEATURES_FILE
        merged_ids = self.features_path / self._IDS_FILE

        # Results of an earlier run are recomputed from the images on disk
        stale = [f for f in (merged_ids, merged_features) if f.exists()]
        if stale:
            for f in stale:
                f.unlink()
            self.imgs_path = list(Path(self.images_dir).glob("*.*"))

        total = len(self.imgs_path)
        if total == 0:
            raise ValueError(f"No images found in {self.images_dir}")

        # Respect the caller's batch_size; only shrink it for small collections
        batch_size = min(batch_size, total)

        # Compute how many batches are needed
        batches = math.ceil(total / batch_size)

        # Process each batch
        for i in range(batches):
            print(f"Processing batch {i+1}/{batches}")
            batch_ids_path, batch_features_path = self._batch_files(i)

            # Only do the processing if the batch wasn't processed yet
            if batch_ids_path.exists() and batch_features_path.exists():
                continue

            try:
                # Select the photos for the current batch
                batch_files = self.imgs_path[i*batch_size : (i+1)*batch_size]
                pil_batch = [self._load_image(photo_file) for photo_file in batch_files]

                # Compute the features
                batch_features = self.embedder.encode_imgs(pil_batch)

                # Ids first, features last: a features file then always has its ids
                photo_ids_data = pd.DataFrame({'photo_id': [str(f) for f in batch_files]})
                photo_ids_data.to_csv(batch_ids_path, index=False)
                np.save(batch_features_path, batch_features)
            except Exception as err:
                # Best effort: report the failure and go on with the other batches
                print(f'Problem with batch {i}: {err!r}')
                # Drop partial output so ids and features cannot get out of step
                for partial in (batch_ids_path, batch_features_path):
                    if partial.exists():
                        partial.unlink()

        # Merge only the batches of this run that have both of their files
        done = [pair for pair in (self._batch_files(i) for i in range(batches))
                if pair[0].exists() and pair[1].exists()]
        if not done:
            raise ValueError("No batch could be processed; nothing to index")

        # Concatenate the features and store in a merged file
        features = np.concatenate([np.load(features_file) for _, features_file in done])
        np.save(merged_features, features)

        # Merge the photo IDs in the same batch order as the features
        photo_ids = pd.concat([pd.read_csv(ids_file, dtype=str) for ids_file, _ in done],
                              ignore_index=True)
        photo_ids.to_csv(merged_ids, index=False)

        # Remove every per-batch file (their names consist of digits only)
        for leftover in list(self.features_path.iterdir()):
            if leftover.stem.isdigit() and leftover.suffix in ('.npy', '.csv'):
                leftover.unlink()

        # Keep imgs_path row-aligned with the features even if a batch was skipped
        self.imgs_path = [Path(photo_id) for photo_id in photo_ids['photo_id']]

        self.indexer.load(str(merged_features))

    def get_k_imgs(self, emb: np.ndarray, k: int):
        """
        Returns k indices of nearest image embeddings and respective distances for a given embedding emb
        :param emb: query embedding passed to the indexer
        :param k: number of images to return
        :return: (scores returned by the indexer, entries of imgs_path at the returned indices)
        """
        distances, indices = self.indexer.find(emb, k)
        return distances, np.array(self.imgs_path)[indices]

# Строим индексы

В функцию load_imgs подайте путь до данных: «stl10» — в нашем случае название датасета, «train_images» — папка, где хранятся изображения.

Для своего индекса сохраняйте такую же структуру: {Название датасета}/{Где хранятся изображения}.

Когда код выполнится, нужно скачать полученные признаки; они находятся по пути {Название датасета}/features.

Сами изображения тоже должны находиться на сервере, в папке с названием {Название датасета}; общая папка для всех — это "indexes". Пример правильной архитектуры находится на сервере по пути "/home/comptech/indexes/trip".

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on a machine (or Colab runtime) without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Folder with the images to index
IMAGES_DIR = '/content/stl10/train_images'

clip_model = SearchModel(EmbedderCLIP(device=device), DummyIndexer())
ruclip_model = SearchModel(EmbedderRuCLIP(device=device), DummyIndexer())

# The second value is the sub-folder of "features" each model writes to
for model, prefix in ((clip_model, 'CLIP'), (ruclip_model, 'RuCLIP')):
    model.load_imgs(IMAGES_DIR, prefix)
    model.save_embs()

In [None]:
# Embed an English text query with CLIP and show the 10 best-matching image paths with their scores
query = clip_model.embedder.encode_text(text="Small monkey")
clip_model.get_k_imgs(query, 10)

In [None]:
# Embed a Russian text query ("A monkey is playing with a ball") with ruCLIP and show the 10 best-matching image paths with their scores
query = ruclip_model.embedder.encode_text(text="Обезьяна играет с мячиком")
ruclip_model.get_k_imgs(query, 10)

**Самое главное действие !**

Поменяйте выражение trip на то, как будет называться ваш датасет: '/home/comptech/indexes/trip/images/'.

Допустимо: name; name_prefix.

Делать это в коде, а не в текстовом блоке.

In [None]:
def generate_true_path(data_in_list, name_file,
                       images_root='/home/comptech/indexes/trip/images/'):
    """
    Rewrites image paths so that they point into the server-side image folder
    and saves them as a one-column ('photo_id') CSV file.

    Only the file name (the part after the last '/') of every input path is
    kept; it is appended to images_root.
    :param data_in_list: iterable of image paths (strings)
    :param name_file: path of the CSV file to write
    :param images_root: folder the images live in on the server; pass your own
                        dataset folder here instead of editing the function.
                        A trailing '/' is optional.
    :return: None
    """
    root = images_root.rstrip('/') + '/'
    photo_id_list = [root + name.rsplit('/', 1)[-1] for name in data_in_list]

    photo_ids_data = pd.DataFrame({'photo_id': photo_id_list})
    photo_ids_data.to_csv(name_file, index=False)

In [None]:
import numpy as np
import pandas as pd

# Both file names below are placeholders (the text reads "write the path to"):
# put the path of the CLIP photo_ids.csv first and of the RuCLIP one second.
data_clip = pd.read_csv('напишите_путь_до.csv')
data_clip_in_list = data_clip['photo_id'].tolist()

data_ruclip = pd.read_csv('напишите_путь_до.csv')
data_ruclip_in_list = data_ruclip['photo_id'].tolist()

# Write one rewritten id file per model
generate_true_path(data_clip_in_list, "photo_ids_clip.csv")
generate_true_path(data_ruclip_in_list, "photo_ids_ruclip.csv")

**Не забудьте скачать посчитанные сверху csv**

Осталось сохранить посчитанные признаки и сам датасет на свой гугл диск, а после открыть к нему доступ и скачать на сервер с помощью консоли или же самому закинуть его на сервер через Xftp