# **1. INSTALL LIBRARY & IMPORT LIBRARY**

*Install necessary library*

In [5]:
print(1)

1


In [None]:
!pip install git+https://github.com/openai/CLIP.git
!pip install torch torchvision pillow
!pip install numpy
!pip install python-docx
!pip install pymupdf
!pip install beautifulsoup4 lxml
!pip install hnswlib
!pip install h5py

*Import library*

In [1]:
import torch
import clip
import os
import fitz
import docx
import hnswlib
import numpy as np
from bs4 import BeautifulSoup
from PIL import Image
import torch.nn.functional as F
import h5py

# **2. CLONE DATASETS FROM REPO GITHUB**
*This repo is has some image and text to test this model*

In [None]:
Datasets = ["https://github.com/huynguyen6906/Data-for-HNSW-model.git"]
if os.path.isdir("/content/Datasets"):
  !rm -rf "/content/Datasets/*"
else:
  !mkdir Datasets

In [None]:
for Dataset in Datasets:
  !cd Datasets/ && git clone $Dataset

Cloning into 'Data-for-HNSW-model'...
remote: Enumerating objects: 54, done.[K
remote: Counting objects: 100% (54/54), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 54 (delta 6), reused 50 (delta 5), pack-reused 0 (from 0)[K
Receiving objects: 100% (54/54), 1.01 MiB | 19.48 MiB/s, done.
Resolving deltas: 100% (6/6), done.


# **3. CONVERT IMAGES TO VECTORS**

In [2]:
class convertToVector:
  def __init__(self):
    # Select device (use GPU if available)
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    # Load model CLIP
    self.model, self.preprocess = clip.load("ViT-B/32", device=self.device)
    self.Dataset_path = ""
    self.image_vectors = {}
    self.text_vectors = {}

  def loadDataset(self, Dataset_path):
    self.Dataset_path = Dataset_path

  def ConvertToVector(self):
    for dirpath, dirnames, filenames in os.walk(Dataset_path):
      for filename in filenames:
        file_path = os.path.join(dirpath, filename)
        text_content = ""

        if filename.lower().endswith(".jpeg"):
          image = self.preprocess(Image.open(file_path)).unsqueeze(0).to(self.device)
          vector = self.model.encode_image(image)
          vector = vector / vector.norm(dim=-1, keepdim=True)
          self.image_vectors[file_path] = vector.cpu()

        elif filename.endswith(".txt"):
            with open(file_path, "r", encoding="utf-8") as f:
                text_content = f.read()
            text_content = text_content.strip()

            # Tokenize and encode text
            text_tokens = clip.tokenize([text_content]).to(self.device)
            vector = self.model.encode_text(text_tokens)
            vector = vector / vector.norm(dim=-1, keepdim=True)  # normalize

            # Save vector
            self.text_vectors[file_path] = vector.cpu()

        elif filename.endswith(".pdf"):
            doc = fitz.open(file_path)
            for page in doc:
                text_content += page.get_text("text") + " "
            doc.close()
            text_content = text_content.strip()

            # Tokenize and encode text
            text_tokens = clip.tokenize([text_content]).to(self.device)
            vector = self.model.encode_text(text_tokens)
            vector = vector / vector.norm(dim=-1, keepdim=True)  # normalize

            # Save vector
            self.text_vectors[file_path] = vector.cpu()

        elif filename.endswith(".docx"):
            doc = docx.Document(file_path)
            for para in doc.paragraphs:
                text_content += para.text + " "
            text_content = text_content.strip()

            # Tokenize and encode text
            text_tokens = clip.tokenize([text_content]).to(self.device)
            vector = self.model.encode_text(text_tokens)
            vector = vector / vector.norm(dim=-1, keepdim=True)  # normalize

            # Save vector
            self.text_vectors[file_path] = vector.cpu()

        elif filename.endswith(".html") or filename.endswith(".htm"):
            with open(file_path, "r", encoding="utf-8") as f:
                html_content = f.read()
            soup = BeautifulSoup(html_content, "lxml")
            text_content = soup.get_text(separator=" ", strip=True)
            text_content = text_content.strip()

            # Tokenize and encode text
            text_tokens = clip.tokenize([text_content]).to(self.device)
            vector = self.model.encode_text(text_tokens)
            vector = vector / vector.norm(dim=-1, keepdim=True)  # normalize

            # Save vector
            self.text_vectors[file_path] = vector.cpu()
    print (len(self.image_vectors)," images was converted.")
    print (len(self.text_vectors)," text was converted.")

  def find_best_similar_image(self, text, option):
    best_similar_image = ""
    temp = -1
    if option:
      text_tokens = clip.tokenize([text.strip()]).to(self.device)
      vector_text = self.model.encode_text(text_tokens)
      vector_text = (vector_text / vector_text.norm(dim=-1, keepdim=True)).cpu()
      for file_path, vector in self.image_vectors.items():
        similarity_value = F.cosine_similarity(vector_text, vector, dim=1).item()
        if abs(similarity_value - 1) < abs(temp - 1):
          temp = similarity_value
          best_similar_image = file_path
    else:
      pass
    return best_similar_image

  def find_best_similar_text(self, text, option):
    best_similar_text = ""
    temp = -1
    if option:
      text_tokens = clip.tokenize([text.strip()]).to(self.device)
      vector_text = self.model.encode_text(text_tokens)
      vector_text = (vector_text / vector_text.norm(dim=-1, keepdim=True)).cpu()
      for file_path, vector in self.text_vectors.items():
        similarity_value = F.cosine_similarity(vector_text, vector, dim=1).item()
        if abs(similarity_value - 1) < abs(temp - 1):
          temp = similarity_value
          best_similar_text = file_path
    else:
      pass
    return best_similar_text


In [None]:
Dataset_path = "Datasets/"
convert = convertToVector();
convert.loadDataset(Dataset_path)
convert.ConvertToVector()
print(convert.find_best_similar_image("dog", 1))
print(convert.find_best_similar_text("BK", 1))

100%|███████████████████████████████████████| 338M/338M [00:03<00:00, 98.4MiB/s]


23  images was converted.
11  text was converted.
Datasets/Data-for-HNSW-model/image/dog1.jpeg
Datasets/Data-for-HNSW-model/text/text4.txt


In [None]:
train_image_dataset = []
train_image_path = []
for path, vec in convert.image_vectors.items():
  train_image_dataset.append(vec.detach().numpy()[0])
  train_image_path.append(path)
train_image_dataset = np.array(train_image_dataset)

train_text_dataset = []
train_text_path = []
for path, vec in convert.text_vectors.items():
  train_text_dataset.append(vec.detach().numpy()[0])
  train_text_path.append(path)
train_text_dataset = np.array(train_text_dataset)

print(train_image_path)
print(train_text_path)

['Datasets/Data-for-HNSW-model/image/bug3.jpeg', 'Datasets/Data-for-HNSW-model/image/cow3.jpeg', 'Datasets/Data-for-HNSW-model/image/dog1.jpeg', 'Datasets/Data-for-HNSW-model/image/cat1.jpeg', 'Datasets/Data-for-HNSW-model/image/mountain4.jpeg', 'Datasets/Data-for-HNSW-model/image/mountain2.jpeg', 'Datasets/Data-for-HNSW-model/image/cow4.jpeg', 'Datasets/Data-for-HNSW-model/image/cow1.jpeg', 'Datasets/Data-for-HNSW-model/image/mountain3.jpeg', 'Datasets/Data-for-HNSW-model/image/mountain1.jpeg', 'Datasets/Data-for-HNSW-model/image/human1.jpeg', 'Datasets/Data-for-HNSW-model/image/bug4.jpeg', 'Datasets/Data-for-HNSW-model/image/dog4.jpeg', 'Datasets/Data-for-HNSW-model/image/dog2.jpeg', 'Datasets/Data-for-HNSW-model/image/bug2.jpeg', 'Datasets/Data-for-HNSW-model/image/dog3.jpeg', 'Datasets/Data-for-HNSW-model/image/cat3.jpeg', 'Datasets/Data-for-HNSW-model/image/bug1.jpeg', 'Datasets/Data-for-HNSW-model/image/human3.jpeg', 'Datasets/Data-for-HNSW-model/image/human2.jpeg', 'Datasets/Dat

*Another Convert class (convert something to vector and return as numpy.ndarray)*

In [3]:
class Convert:
  def __init__(self):
    # Select device (use GPU if available)
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    # Load model CLIP
    self.model, self.preprocess = clip.load("ViT-B/32", device=self.device)

  def convert_to_vector(self, file_path):
    filename = os.path.basename(file_path)
    text_content = ""
    vector = None

    if filename.lower().endswith(".jpeg"):
      image = self.preprocess(Image.open(file_path)).unsqueeze(0).to(self.device)
      vector = self.model.encode_image(image)
      vector = vector / vector.norm(dim=-1, keepdim=True)

    elif filename.endswith(".txt"):
      with open(file_path, "r", encoding="utf-8") as f:
          text_content = f.read()
      text_content = text_content.strip()

      # Tokenize and encode text
      text_tokens = clip.tokenize([text_content]).to(self.device)
      vector = self.model.encode_text(text_tokens)
      vector = vector / vector.norm(dim=-1, keepdim=True)  # normalize

    elif filename.endswith(".pdf"):
      doc = fitz.open(file_path)
      for page in doc:
          text_content += page.get_text("text") + " "
      doc.close()
      text_content = text_content.strip()

      # Tokenize and encode text
      text_tokens = clip.tokenize([text_content]).to(self.device)
      vector = self.model.encode_text(text_tokens)
      vector = vector / vector.norm(dim=-1, keepdim=True)  # normalize

    elif filename.endswith(".docx"):
      doc = docx.Document(file_path)
      for para in doc.paragraphs:
          text_content += para.text + " "
      text_content = text_content.strip()

      # Tokenize and encode text
      text_tokens = clip.tokenize([text_content]).to(self.device)
      vector = self.model.encode_text(text_tokens)
      vector = vector / vector.norm(dim=-1, keepdim=True)  # normalize

    elif filename.endswith(".html") or filename.endswith(".htm"):
      with open(file_path, "r", encoding="utf-8") as f:
          html_content = f.read()
      soup = BeautifulSoup(html_content, "lxml")
      text_content = soup.get_text(separator=" ", strip=True)
      text_content = text_content.strip()

      # Tokenize and encode text
      text_tokens = clip.tokenize([text_content]).to(self.device)
      vector = self.model.encode_text(text_tokens)
      vector = vector / vector.norm(dim=-1, keepdim=True)  # normalize

    return vector.cpu().detach().numpy()[0]

# **4. READ DATASETS**

In [4]:
with h5py.File("images_embeds.h5", "r") as f:
  texts = f["image_path"][:]
  embs = f["embeddings"][:]

# **5. HNSW IMPLEMENT**





*Construct HNSW*

In [5]:
dim = embs.shape[1]

index = hnswlib.Index(space='cosine', dim=dim)
index.init_index(max_elements = int(5e5), ef_construction = 400, M = 200)
index.set_ef(200)

index.add_items(embs)

In [6]:
def request_text_to_vector(text):
  device = "cuda" if torch.cuda.is_available() else "cpu"
  model, preprocess = clip.load("ViT-B/32", device=device)
  text_tokens = clip.tokenize([text.strip()]).to(device)
  vector_text = model.encode_text(text_tokens)
  vector_text = (vector_text / vector_text.norm(dim=-1, keepdim=True)).cpu()
  return vector_text.detach().numpy().astype(np.float32)

In [None]:
query = request_text_to_vector("beach")

 32%|████████████▊                           | 108M/338M [04:25<04:21, 920kiB/s]

# **6. SEARCH QUERIES**

In [None]:
# @title
indices, distances = index.knn_query(embs, k=5)
for idx in indices[0]:
  print(texts[idx])

b'./images/34894.jpg'
b'./images/689570.jpg'
b'./images/401534.jpg'
b'./images/13890.jpg'
b'./images/753786.jpg'
b'./images/481912.jpg'
b'./images/249948.jpg'
b'./images/657890.jpg'
b'./images/899976.jpg'
b'./images/467526.jpg'
b'./images/694896.jpg'
b'./images/184986.jpg'
b'./images/560966.jpg'
b'./images/756796.jpg'
b'./images/887326.jpg'
b'./images/693890.jpg'
b'./images/933612.jpg'
b'./images/105576.jpg'
b'./images/381220.jpg'
b'./images/682702.jpg'
b'./images/127654.jpg'
b'./images/396298.jpg'
b'./images/618106.jpg'
b'./images/67160.jpg'
b'./images/779006.jpg'
b'./images/231576.jpg'
b'./images/563848.jpg'
b'./images/705678.jpg'
b'./images/520568.jpg'
b'./images/340980.jpg'
b'./images/46028.jpg'
b'./images/922024.jpg'
b'./images/300834.jpg'
b'./images/484832.jpg'
b'./images/537266.jpg'
b'./images/150926.jpg'
b'./images/600846.jpg'
b'./images/253200.jpg'
b'./images/52154.jpg'
b'./images/869372.jpg'
b'./images/850144.jpg'
b'./images/164634.jpg'
b'./images/573022.jpg'
b'./images/89358