# 1. INSTALL & SETUP

In [1]:
!pip install gdown hnswlib
!git clone https://github.com/huynguyen6906/EmbedX.git
!pip install -r EmbedX/requirements.txt
!cd EmbedX && pip install .

Collecting gdown
  Using cached gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Collecting hnswlib
  Using cached hnswlib-0.8.0.tar.gz (36 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting PySocks!=1.5.7,>=1.5.6 (from requests[socks]->gdown)
  Using cached PySocks-1.7.1-py3-none-any.whl.metadata (13 kB)
Using cached gdown-5.2.0-py3-none-any.whl (18 kB)
Using cached PySocks-1.7.1-py3-none-any.whl (16 kB)
Building wheels for collected packages: hnswlib
  Building wheel for hnswlib (pyproject.toml) ... [?25ldone
[?25h  Created wheel for hnswlib: filename=hnswlib-0.8.0-cp312-cp312-linux_x86_64.whl size=225275 sha256=3242f6bc2045291f16c66fded1e195b215529e48284a74c8aa0c39f8dbba5665
  Stored in directory: /home/huynguyen/.cache/pip/wheels/ac/39/b3/cbd7f9cbb76501d2d5fbc84956e70d0b94e788aac87bda465e
Successfully built hnswlib
Installing collected packages: PySocks, 

In [2]:
import hnswlib
import gdown
import h5py
import clip
import embedx
import json
import torch
from sentence_transformers import SentenceTransformer
import os
import numpy as np

  from pkg_resources import packaging


# 2. DOWNLOAD CACHE

In [26]:
# Ensure the local directory for caching files exists.
os.makedirs(".cache", exist_ok=True)

# Local cache path -> Google Drive file ID of the snapshot stored there.
# Keeping the pairs in one table means adding or swapping an artefact is a
# one-line change instead of another copy-pasted download stanza.
CACHE_FILES = {
    ".cache/Papers_Embedbed_0-1000000.h5": "1hMNur1U24ULce6osanfrePuCAz5uIvc3",
    ".cache/hnsw_paper_index.bin": "1RR1o_eX9LKv_3pd9Iz1y4xB90pQIpUuX",
    ".cache/hnsw_paper_index.json": "1H_BNLy9fXYwaOCXvZBRawN8S0w_KuGIK",
    ".cache/Image_Embedded.h5": "12R_DHBMOVFVEaPrjBEQXOd7tb14Isb1D",
    ".cache/hnsw_image_index.bin": "1vWbJFLTdFnT6N2cnpMDA7c85JCfrY-QH",
    ".cache/hnsw_image_index.json": "16F8VRuKZb4SJ0KLa1Oh281RvdmQzj6Ft",
}

for cache_path, drive_id in CACHE_FILES.items():
    # Skip files that are already cached so re-running this cell is cheap.
    if os.path.isfile(cache_path):
        continue
    gdown.download(id=drive_id, output=cache_path, quiet=False)
    # Fail here, with the offending file named, rather than letting a later
    # cell trip over a missing file when it tries to open it.
    if not os.path.isfile(cache_path):
        raise RuntimeError(
            f"Download failed for {cache_path} (Google Drive id: {drive_id})"
        )

# 3. SEARCH IMAGE BY HNSW

In [16]:
# Open the cached image snapshot read-only and pull both datasets out with
# [:] so the arrays remain usable after the file handle is closed.
with h5py.File(".cache/Image_Embedded.h5", "r") as image_store:
    image_urls, image_embeddings = (
        image_store["urls"][:],
        image_store["embeddings"][:],
    )

In [17]:
image_index_config = ".cache/hnsw_image_index.json"

# All three settings start at 0 and are overwritten from the config file.
dim_image = max_element_image = ef_image = 0

with open(image_index_config, 'r') as config_file:
    data = json.load(config_file)

# The config is indexed positionally: entry 0 is used as the index dimension,
# entry 1 as its max_elements, and entry 2 as the ef value passed to set_ef.
dim_image, max_element_image, ef_image = data[0], data[1], data[2]

In [18]:
image_index_file = ".cache/hnsw_image_index.bin"

# Fail fast: later cells need `image_search`, so a missing index file must stop
# the notebook here with a clear error instead of printing a message and
# surfacing afterwards as an unrelated NameError.
if not os.path.exists(image_index_file):
    raise FileNotFoundError(f"HNSW image index file does not exist: {image_index_file}")

# Create an index object with the configured dimension, then load the
# prebuilt graph from disk.
# NOTE(review): space='cosine' presumably has to match how the index file was
# built - confirm against the code that produced it.
image_search = hnswlib.Index(space='cosine', dim=dim_image)
image_search.load_index(image_index_file, max_elements=max_element_image)
# Apply the ef value read from the JSON config.
image_search.set_ef(ef_image)

In [19]:
Text_queries = ["computer", "chemical", "single"]

# Embed every text query with EmbedX's image-side text encoder.
queries = [embedx.image.embed_Text(text) for text in Text_queries]

# One row of neighbour ids (and distances) comes back per query.
indices, distances = image_search.knn_query(queries, k=10)

for text, neighbour_ids in zip(Text_queries, indices):
    print(text)
    for idx in neighbour_ids:
        url = image_urls[idx]
        # The URLs come back from the HDF5 file as bytes; decode them so the
        # output is a readable URL rather than a b'...' repr.
        if isinstance(url, bytes):
            url = url.decode("utf-8", errors="replace")
        print(url)

# The result arrays are not needed after printing.
del indices, distances

computer
b'https://farm4.staticflickr.com/4145/5144385210_11af7a12b3_o.jpg'
b'https://c1.staticflickr.com/8/7066/6986311580_f25426e4c8_o.jpg'
b'https://c4.staticflickr.com/1/115/309069511_061d065296_o.jpg'
b'https://c5.staticflickr.com/2/1055/1040841974_7ec01b4d9f_o.jpg'
b'https://farm6.staticflickr.com/2803/5713954047_4cd45a029d_o.jpg'
b'https://farm7.staticflickr.com/2746/4267363398_b770c0757e_o.jpg'
b'https://c3.staticflickr.com/2/1056/986204626_8d7252a06a_o.jpg'
b'https://farm7.staticflickr.com/206/487304481_a7c6b071cc_o.jpg'
b'https://c1.staticflickr.com/3/2364/1798975528_b15ffc491d_o.jpg'
b'https://farm8.staticflickr.com/4006/4673592688_4f8abde8a7_o.jpg'
chemical
b'https://farm3.staticflickr.com/3795/10044488764_132b77a668_o.jpg'
b'https://c1.staticflickr.com/8/7019/6523859709_3904afb388_o.jpg'
b'https://c7.staticflickr.com/4/3665/9914382424_f77338be49_o.jpg'
b'https://farm1.staticflickr.com/19/99996654_dd3692c886_o.jpg'
b'https://c5.staticflickr.com/7/6208/6136234094_13d9df22f5_

In [20]:
# Drop every image-search variable in one statement so the kernel no longer
# holds references to the loaded arrays and the index.
del (
    image_urls,
    image_embeddings,
    image_index_config,
    image_search,
    image_index_file,
    dim_image,
    max_element_image,
    ef_image,
)

# 4. SEARCH PAPER BY HNSW

In [21]:
# Open the cached paper snapshot read-only and pull both datasets out with
# [:] so the arrays remain usable after the file handle is closed.
with h5py.File(".cache/Papers_Embedbed_0-1000000.h5", "r") as paper_store:
    paper_urls, paper_embeddings = (
        paper_store["urls"][:],
        paper_store["embeddings"][:],
    )

In [22]:
paper_index_config = ".cache/hnsw_paper_index.json"

# All three settings start at 0 and are overwritten from the config file.
dim_paper = max_element_paper = ef_paper = 0

with open(paper_index_config, 'r') as config_file:
    data = json.load(config_file)

# The config is indexed positionally: entry 0 is used as the index dimension,
# entry 1 as its max_elements, and entry 2 as the ef value passed to set_ef.
dim_paper, max_element_paper, ef_paper = data[0], data[1], data[2]

In [23]:
paper_index_file = ".cache/hnsw_paper_index.bin"

# Fail fast: later cells need `paper_search`, so a missing index file must stop
# the notebook here with a clear error instead of printing a message and
# surfacing afterwards as an unrelated NameError.
if not os.path.exists(paper_index_file):
    raise FileNotFoundError(f"HNSW paper index file does not exist: {paper_index_file}")

# Create an index object with the configured dimension, then load the
# prebuilt graph from disk.
# NOTE(review): space='cosine' presumably has to match how the index file was
# built - confirm against the code that produced it.
paper_search = hnswlib.Index(space='cosine', dim=dim_paper)
paper_search.load_index(paper_index_file, max_elements=max_element_paper)
# Apply the ef value read from the JSON config.
paper_search.set_ef(ef_paper)

In [24]:
Text_queries = ["computer", "chemical"]

# Embed every text query with EmbedX's text encoder.
queries = [embedx.text.embed_Text(text) for text in Text_queries]

# One row of neighbour ids (and distances) comes back per query.
indices, distances = paper_search.knn_query(queries, k=10)

for text, neighbour_ids in zip(Text_queries, indices):
    print(text)
    for idx in neighbour_ids:
        url = paper_urls[idx]
        # The URLs come back from the HDF5 file as bytes; decode them so the
        # output is a readable URL rather than a b'...' repr.
        # NOTE(review): some stored URLs appear cut short (ending in ".pd"),
        # which suggests the dataset's string width is too small - confirm
        # where the .h5 file is produced; it cannot be repaired here.
        if isinstance(url, bytes):
            url = url.decode("utf-8", errors="replace")
        print(url)

# The result arrays are not needed after printing.
del indices, distances

computer
b'https://arxiv.org/pdf/1309.5737.pdf'
b'https://arxiv.org/pdf/1107.3893.pdf'
b'https://arxiv.org/pdf/1107.4217.pdf'
b'https://arxiv.org/pdf/0903.4286.pdf'
b'https://arxiv.org/pdf/1304.1428.pdf'
b'https://arxiv.org/pdf/1712.09404.pd'
b'https://arxiv.org/pdf/1312.2447.pdf'
b'https://arxiv.org/pdf/1703.02944.pd'
b'https://arxiv.org/pdf/1202.5944.pdf'
b'https://arxiv.org/pdf/0910.3440.pdf'
chemical
b'https://arxiv.org/pdf/1606.04381.pd'
b'https://arxiv.org/pdf/1207.3242.pdf'
b'https://arxiv.org/pdf/1307.6360.pdf'
b'https://arxiv.org/pdf/1007.0818.pdf'
b'https://arxiv.org/pdf/0708.1826.pdf'
b'https://arxiv.org/pdf/1301.0833.pdf'
b'https://arxiv.org/pdf/0906.0871.pdf'
b'https://arxiv.org/pdf/1211.3163.pdf'
b'https://arxiv.org/pdf/1601.05356.pd'
b'https://arxiv.org/pdf/0901.0318.pdf'


In [25]:
# Drop every paper-search variable in one statement so the kernel no longer
# holds references to the loaded arrays and the index.
del (
    paper_urls,
    paper_embeddings,
    paper_index_config,
    paper_search,
    paper_index_file,
    dim_paper,
    max_element_paper,
    ef_paper,
)