# Example with USearch library

Reference: https://github.com/unum-cloud/usearch

https://www.unum.cloud/blog/2021-12-31-dbms-startups

In [1]:
import numpy as np
from scipy.spatial.distance import cdist
from usearch.index import search, MetricKind, Matches, BatchMatches

In [2]:
# Generate 20'000 random vectors with 1024 dimensions, plus one query vector.
# A seeded generator keeps the numbers reproducible across kernel restarts, and
# dtype=np.float32 draws the values directly in the precision handed to search()
# instead of creating a float64 array and casting it afterwards.
rng = np.random.default_rng(42)
vectors = rng.random((20000, 1024), dtype=np.float32)
vector = rng.random(1024, dtype=np.float32)

# first we compute the distances between vector and every row in vectors
# (MetricKind.L2sq is the squared Euclidean distance; exact=True asks for an exact search)
one_in_many: Matches = search(vectors, vector, vectors.shape[0], MetricKind.L2sq, exact=True)

# second we compute all the pairwise distances between all the rows in vectors
many_in_many: BatchMatches = search(vectors, vectors, vectors.shape[0], MetricKind.L2sq, exact=True)



In [29]:
# SciPy reference: Euclidean distances from every dataset row to the single query vector.
# NOTE: these are plain (non-squared) distances, unlike the MetricKind.L2sq search results.
pairwise_distances1 = cdist(vectors, vector[np.newaxis, :], metric='euclidean')
# ... and between every pair of dataset rows
pairwise_distances2 = cdist(vectors, vectors, metric='euclidean')

In [11]:
# Sanity check: one row per vector, one column per dimension
vectors.shape

(20000, 1024)

In [7]:
# Distances of the matches found for the single query vector
# (squared L2, because the search was run with MetricKind.L2sq)
one_in_many.distances

array([149.06035, 149.87482, 150.19632, 150.26244, 150.34021, 150.95145,
       151.10468, 151.19652, 151.50356, 151.5379 , 151.6048 , 151.70966,
       151.881  , 151.93915, 152.17819, 152.43797, 152.47408, 152.60005,
       152.62799, 152.65417, 152.67267, 152.84586, 152.9353 , 153.19278,
       153.19736, 153.25937, 153.3535 , 153.35486, 153.47995, 153.51808],
      dtype=float32)

In [8]:
# The same matches as a plain Python list; in the output each entry pairs an integer key with a distance
one_in_many.to_list()

[(14827, 149.0603485107422),
 (11228, 149.87481689453125),
 (3879, 150.19631958007812),
 (12326, 150.26243591308594),
 (5320, 150.3402099609375),
 (135, 150.95144653320312),
 (6811, 151.10467529296875),
 (19397, 151.19651794433594),
 (16299, 151.50355529785156),
 (8123, 151.53790283203125),
 (17450, 151.60479736328125),
 (13892, 151.70965576171875),
 (3780, 151.88099670410156),
 (2249, 151.93914794921875),
 (17329, 152.17819213867188),
 (5476, 152.43797302246094),
 (16357, 152.4740753173828),
 (17636, 152.6000518798828),
 (9124, 152.62799072265625),
 (3937, 152.6541748046875),
 (11974, 152.67266845703125),
 (17005, 152.84585571289062),
 (9448, 152.935302734375),
 (6367, 153.19277954101562),
 (10898, 153.19735717773438),
 (14157, 153.25936889648438),
 (12493, 153.35350036621094),
 (16891, 153.3548583984375),
 (8456, 153.47994995117188),
 (7408, 153.51808166503906)]

# Comparison with Scipy Spatial Distance

In [12]:
# Small second example for a value-by-value comparison with scipy.spatial.distance:
# 10 dataset rows and one query row, 5 dimensions each, drawn uniformly between 2 and 3
data = np.random.uniform(low=2, high=3, size=(10, 5))
vector2 = np.random.uniform(low=2, high=3, size=(1, 5))  # 2-D: a batch holding a single query

In [15]:
%time
result: BatchMatches = search(data, vector2, data.shape[0], MetricKind.L2sq, exact=True)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.96 µs


In [16]:
# Get the indices (keys) of the matches: column 0 of the match list.
# They are used below to put the distances back into dataset row order.
match_rows = np.asarray(result.to_list())
ind = match_rows[:, 0].astype(np.int64)

In [17]:
# Keys of the matches, in the order the search returned them
ind

array([8, 1, 7, 5, 9, 6, 3, 2, 0, 4])

In [18]:
# Distances reported by the search; squared L2 because MetricKind.L2sq was requested.
# NOTE: the FAISS cells further down rebind `d` to the vector dimensionality.
d = result.distances

In [19]:
# Undo the ranking: argsort(ind) gives, for each dataset row, its position in the
# search result, so this lists the (non-squared) distances in dataset row order
np.sqrt(d[np.argsort(ind)])

array([1.179276  , 0.52923083, 0.967594  , 0.8504675 , 1.2874043 ,
       0.58930093, 0.80207235, 0.56880563, 0.35001096, 0.78043014],
      dtype=float32)

# Comparison with SciPy's `cdist` function

In [22]:
# SciPy reference: Euclidean (non-squared) distance from every row of `data` to the query
pairwise_distances = cdist(data, vector2, metric='euclidean')

# Left column: SciPy. Right column: square root of the squared-L2 search distances,
# rearranged from ranking order into dataset row order so the rows line up.
print("Pairwise distances:")
print(np.column_stack((pairwise_distances, np.sqrt(d[np.argsort(ind)]))))

Pairwise distances:
[[1.17927596 1.17927599]
 [0.52923081 0.52923083]
 [0.96759402 0.96759403]
 [0.8504675  0.8504675 ]
 [1.28740426 1.2874043 ]
 [0.58930093 0.58930093]
 [0.80207232 0.80207235]
 [0.5688056  0.56880563]
 [0.35001097 0.35001096]
 [0.78043013 0.78043014]]


In [16]:
# One row per dataset vector, one column per query vector
pairwise_distances.shape

(10, 1)

# FAISS is dangerous

...and slower than USearch.

In [4]:
import faiss

In [5]:
# Stack the dataset on top of itself, so every vector appears twice
all_vectors = np.vstack((vectors, vectors))

# Build a flat L2 index sized to the vector dimensionality.
# Note: this rebinds `d`, which earlier cells use for the USearch distances.
# NOTE(review): FAISS documents squared L2 distances for IndexFlatL2 — confirm
# before comparing the numbers with cdist's 'euclidean' output.
d = all_vectors.shape[-1]
index = faiss.IndexFlatL2(d)

# Load every vector into the index
index.add(all_vectors)

# Use every stored vector as a query and ask for n - 1 neighbours each.
# NOTE(review): nothing here removes the query vector from its own result list,
# so k = n - 1 does not by itself exclude the vector itself — confirm the intent.
k = len(all_vectors) - 1
D, I = index.search(all_vectors, k)

# Keep one result row only: index 1, i.e. the second query vector (all_vectors[1])
pairwise_distances = D[1]

print("Pairwise distances:")
print(pairwise_distances)

In [52]:
# Concatenate the vectors into a single array (rows of vector1 first, then vector2)
# NOTE(review): `vector1` is not defined in any cell visible in this notebook —
# confirm it exists, otherwise this cell raises NameError on a fresh kernel.
all_vectors = np.concatenate((vector1, vector2), axis=0)

# Initialize Faiss index
# Note: this rebinds `d`, which earlier cells use for the USearch distances.
d = all_vectors.shape[1]  # Dimension of vectors
index = faiss.IndexFlatL2(d)  # flat L2 index; NOTE(review): FAISS documents squared L2 distances here — confirm

# Add vectors to the index
index.add(all_vectors)

# Query with every stored vector, asking for n - 1 neighbours each.
# NOTE(review): the printed result starts with 0., which suggests the query
# vector itself is still part of its own result list — confirm.
k = all_vectors.shape[0] - 1  # number of neighbours requested per query
D, I = index.search(all_vectors, k)

# D[1] is the row at index 1, i.e. the second query (all_vectors[1]).
# NOTE(review): that row belongs to vector2 only if vector1 contributes exactly one row — confirm.
pairwise_distances = D[1]

print("Pairwise distances:")
print(pairwise_distances)


Pairwise distances:
[0.         0.29514208 0.30921572 0.32049134 0.32110634 0.45318595
 0.69156986 0.7355278  0.94303757 1.1888701 ]


# Excuse for FAISS performance

https://medium.com/mlearning-ai/why-you-should-be-careful-using-faiss-c44996eda9ee