In [1]:
# Fetch the Caltech-101 archive from Google Drive and unpack it under /datasets.
!mkdir -p /datasets
!gdown "https://drive.google.com/uc?id=137RyRjvTBkBiIfeYBNZBtViDHQ6_Ewsp&confirm=t" --output /datasets/caltech101.tar.gz
!tar -xzf /datasets/caltech101.tar.gz --directory /datasets
# Rename the extracted folder to the path used by the image file names later on.
!mv /datasets/101_ObjectCategories /datasets/caltech101
# If /datasets/caltech101 already existed (e.g. the cell was re-run), `mv` puts
# the freshly extracted folder *inside* it; remove that nested copy.
!rm -rf /datasets/caltech101/101_ObjectCategories

Downloading...
From: https://drive.google.com/uc?id=137RyRjvTBkBiIfeYBNZBtViDHQ6_Ewsp&confirm=t
To: /datasets/caltech101.tar.gz
100% 132M/132M [00:03<00:00, 40.0MB/s]


In [2]:
!pip install faiss-cpu
!pip install hnswlib
!pip install federpy
!pip install ipython-autotime
%load_ext autotime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-cpu
  Downloading faiss_cpu-1.7.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.0/17.0 MB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hnswlib
  Downloading hnswlib-0.7.0.tar.gz (33 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: hnswlib
  Building wheel for hnswlib (pyproject.toml) ... [?25l[?25hdone
  Created wheel for hnswlib: filename=hnswlib-0.7.0-cp38-cp38-linux_x86_64.whl size=2126788 sha256=06404c5f68248eaa1fbc0c3a8dd127

In [3]:
# Mount Google Drive at /content/gdrive so the next cell can copy files from it.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive
time: 23.1 s (started: 2023-02-23 09:19:41 +00:00)


In [4]:
# Copy everything from the Drive feature folder into a local /features
# directory; the pickle-loading cells below read from /features.
!mkdir -p /features
!cp /content/gdrive/MyDrive/ahrefs/features/* /features

time: 8.29 s (started: 2023-02-23 09:20:04 +00:00)


In [5]:
import faiss
import hnswlib
import numpy as np
import pickle
from federpy.federpy import FederPy
from sklearn.decomposition import PCA

time: 1.59 s (started: 2023-02-23 09:20:12 +00:00)


In [6]:
# Load the feature matrix used throughout the notebook into `data`.
# Presumably ResNet embeddings of the Caltech-101 images (going by the file
# name) — TODO confirm; the Faiss section indexes it as 2048-dimensional vectors.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('/features/features-caltech101-resnet.pickle', 'rb') as f:
  data = pickle.load(f)

time: 36.9 ms (started: 2023-02-23 09:20:14 +00:00)


In [7]:
# Project the features down to 5 dimensions; the HNSW section indexes this
# compressed representation.
# random_state is fixed because scikit-learn may select its randomized SVD
# solver for an input of this size, which would otherwise yield a slightly
# different projection on every run.
pca = PCA(n_components=5, random_state=0)
data_compressed = pca.fit_transform(data)

time: 1.46 s (started: 2023-02-23 09:20:14 +00:00)


# HNSW

In [None]:
# Build an HNSW index over the PCA-compressed features and save it to disk.

# Take both the element count and the vector dimensionality from the data
# itself, so the index always matches whatever the PCA cell produced.
num_elements, hnsw_dim = data_compressed.shape

# Each vector is labelled with its row number.
ids = np.arange(num_elements)

# Declaring index
p = hnswlib.Index(space='l2', dim=hnsw_dim)  # possible options are l2, cosine or ip

# Initializing index - the maximum number of elements should be known beforehand
p.init_index(max_elements=num_elements, ef_construction=100, M=8)

# Element insertion (can be called several times):
p.add_items(data_compressed, ids)

# Persist the index to the working directory.
p.save_index("hnsw.index")

time: 750 ms (started: 2023-02-22 02:00:04 +00:00)


In [None]:
# Controlling the recall by setting ef:
p.set_ef(50)  # ef should always be > k

# Query dataset, k - number of the closest elements (returns 2 numpy arrays).
# The query vectors must have the same dimensionality as the indexed ones, so
# query with the PCA-compressed features the index was built from (not the
# original high-dimensional `data`, which hnswlib rejects).
labels, distances = p.knn_query(data_compressed, k=5)

time: 246 ms (started: 2023-02-22 02:00:13 +00:00)


In [9]:
# Load the list of image file names into `imageUrls`.
# Presumably in the same order as the rows of the feature matrix — TODO confirm.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('/features/filenames-caltech101.pickle', 'rb') as f:
  imageUrls = pickle.load(f)

time: 1.77 ms (started: 2023-02-23 09:20:21 +00:00)


In [17]:
# Peek at the first entries: despite the variable name these are local file
# paths under /datasets, not URLs.
imageUrls[:5]

['/datasets/caltech101/BACKGROUND_Google/image_0001.jpg',
 '/datasets/caltech101/BACKGROUND_Google/image_0002.jpg',
 '/datasets/caltech101/BACKGROUND_Google/image_0003.jpg',
 '/datasets/caltech101/BACKGROUND_Google/image_0004.jpg',
 '/datasets/caltech101/BACKGROUND_Google/image_0005.jpg']

time: 3.41 ms (started: 2023-02-23 09:22:06 +00:00)


In [24]:
# Turn every local image path into its public S3 URL by prefixing the bucket
# address; the paths already start with '/', so no separator is added.
cloudimageUrls = [
    'https://federpyimages.s3.ap-southeast-1.amazonaws.com/federpyimages' + local_path
    for local_path in imageUrls
]
# Show a sample of the resulting URLs.
cloudimageUrls[:5]

['https://federpyimages.s3.ap-southeast-1.amazonaws.com/federpyimages/datasets/caltech101/BACKGROUND_Google/image_0001.jpg',
 'https://federpyimages.s3.ap-southeast-1.amazonaws.com/federpyimages/datasets/caltech101/BACKGROUND_Google/image_0002.jpg',
 'https://federpyimages.s3.ap-southeast-1.amazonaws.com/federpyimages/datasets/caltech101/BACKGROUND_Google/image_0003.jpg',
 'https://federpyimages.s3.ap-southeast-1.amazonaws.com/federpyimages/datasets/caltech101/BACKGROUND_Google/image_0004.jpg',
 'https://federpyimages.s3.ap-southeast-1.amazonaws.com/federpyimages/datasets/caltech101/BACKGROUND_Google/image_0005.jpg']

time: 5.21 ms (started: 2023-02-23 10:04:30 +00:00)


In [35]:
# Visualise the HNSW index with FederPy.
# The index is read from a hosted copy of hnsw.index, and the image URLs are
# supplied as the media shown in the view.
hnswSource = 'hnswlib'
hnswIndexFile = 'https://federindexes.s3.ap-southeast-1.amazonaws.com/hnsw.index'

# Keyword options forwarded to FederPy.
hnswViewParams = dict(
    width=800,
    height=500,
    mediaType="image",
    mediaUrls=cloudimageUrls,
)
federPy_hnsw = FederPy(hnswIndexFile, hnswSource, **hnswViewParams)

# Alternative calls, left disabled here:
#   federPy_hnsw.overview()
#   federPy_hnsw.searchRandTestVec()
# Run a search for item id 8296.
federPy_hnsw.searchById(8296)

time: 39.5 ms (started: 2023-02-23 10:25:27 +00:00)


# Faiss

In [12]:
# Build an IVF-Flat Faiss index (100 inverted lists) over the full features.

# Faiss works on C-contiguous float32 matrices; convert explicitly so the cell
# does not depend on how the pickle happened to store the features. This is a
# no-op when `data` is already in that layout.
features_f32 = np.ascontiguousarray(data, dtype=np.float32)

# Take the dimensionality from the data instead of hard-coding it.
feature_dim = features_f32.shape[1]

index = faiss.index_factory(feature_dim, "IVF100,Flat")
index.train(features_f32)  # an IVF index must be trained before vectors are added
index.add(features_f32)

# Sanity check: every feature vector ended up in the index.
assert index.ntotal == len(features_f32)
print(index.ntotal)

9144
time: 1.68 s (started: 2023-02-23 09:20:47 +00:00)


In [13]:
# Persist the trained Faiss index to faiss.index in the working directory.
faiss.write_index(index, "faiss.index") 

time: 72.9 ms (started: 2023-02-23 09:20:48 +00:00)


In [36]:
# Visualise the Faiss IVF-Flat index with FederPy.
# The index is read from a hosted copy of faiss.index, and the image URLs are
# supplied as the media shown in the view.
ivfflatSource = 'faiss'
ivfflatIndexFile = 'https://federindexes.s3.ap-southeast-1.amazonaws.com/faiss.index'

# Keyword options forwarded to FederPy; "umap" selects the projection method.
ivfflatViewParams = dict(
    width=800,
    height=500,
    mediaType="image",
    mediaUrls=cloudimageUrls,
    projectMethod="umap",
)
federPy_ivfflat = FederPy(ivfflatIndexFile, ivfflatSource, **ivfflatViewParams)

# Render the overview of the index.
federPy_ivfflat.overview()

time: 44.6 ms (started: 2023-02-23 10:26:09 +00:00)


In [33]:
# Search the IVF-Flat view for item id 7421, with k=12 results and nprobe=8.
federPy_ivfflat.setSearchParams({"k": 12, "nprobe": 8}).searchById(7421)

time: 13.8 ms (started: 2023-02-23 10:24:16 +00:00)


In [32]:
# Same search parameters as the by-id search (k=12, nprobe=8), but using
# FederPy's searchRandTestVec() instead of a fixed item id.
federPy_ivfflat.setSearchParams({"k": 12, "nprobe": 8}).searchRandTestVec()

time: 13.8 ms (started: 2023-02-23 10:21:08 +00:00)
