In [56]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.neighbors import KNeighborsRegressor
from scipy.spatial.distance import cdist
from usearch.index import search, MetricKind, Matches, BatchMatches

# Example with USearch library

Reference: https://github.com/unum-cloud/usearch

In [49]:
# Generate 10'000 random vectors with 1024 dimensions
vectors = np.random.rand(10000, 1024).astype(np.float32)
vector = np.random.rand(1024).astype(np.float32)

one_in_many: Matches = search(vectors, vector, 50, MetricKind.L2sq, exact=True)
# many_in_many: BatchMatches = search(vectors, vectors, 50, MetricKind.L2sq, exact=True)



## Real Data Application


In [41]:
data = pd.read_csv('../Data Sets/concrete.csv')

In [42]:
x = data.drop(columns='strength').values
y = data['strength'].values

In [None]:
for row in x:
    print(row)

In [54]:
model = KNeighborsRegressor(n_neighbors=10)
model.fit(x,y)
model.score(x,y)

0.7528885893274251

In [57]:
mse(model.predict(x),y)

68.89734524174757

In [4]:
# Generate 10'000 random vectors with 1024 dimensions
train = np.random.rand(2000, 1024).astype(np.float32)
test = np.random.rand(20,1024).astype(np.float32)

#one_in_many: Matches = search(vectors, vector, 50, MetricKind.L2sq, exact=True)
many_in_many: BatchMatches = search(train, test, 5, MetricKind.L2sq, exact=True)

In [50]:
output = many_in_many.to_list()

In [51]:
output

[(1923, 153.38187),
 (1117, 155.98915),
 (1327, 157.62474),
 (1515, 158.20346),
 (139, 158.45634),
 (1018, 150.89204),
 (1191, 153.86111),
 (1192, 155.33842),
 (1217, 155.46008),
 (1270, 155.64658),
 (223, 152.95477),
 (1364, 154.72516),
 (1966, 154.74345),
 (1925, 155.19516),
 (812, 155.64015),
 (660, 152.79694),
 (379, 153.18756),
 (1972, 153.31203),
 (358, 154.70497),
 (1467, 155.29631),
 (447, 151.20822),
 (1640, 151.51291),
 (1594, 151.95554),
 (946, 152.25009),
 (1818, 152.36336),
 (941, 148.99124),
 (564, 151.87589),
 (971, 153.8834),
 (1548, 154.96663),
 (1451, 155.93024),
 (627, 155.25912),
 (452, 155.6015),
 (1258, 156.03473),
 (1461, 156.88412),
 (1683, 157.11961),
 (747, 148.4409),
 (222, 149.54985),
 (183, 149.57553),
 (1366, 153.1477),
 (1242, 153.42387),
 (13, 149.38934),
 (188, 149.56116),
 (1261, 150.89915),
 (1630, 151.43372),
 (1509, 151.54802),
 (1799, 148.84427),
 (62, 149.0828),
 (934, 150.40575),
 (1215, 150.95244),
 (1847, 151.3516),
 (368, 151.0336),
 (1235, 15

In [25]:
ind = np.array(output)[:,0].astype('int64')
dist = np.array(output)[:,1].astype(float)

In [37]:
weights = 1/dist[5:10]
weights[weights==np.inf]=100
np.sum(weights)
weights = weights/np.sum(weights)

In [38]:
weights

array([0.20440918, 0.20046468, 0.19855821, 0.19840282, 0.1981651 ])

In [11]:
one_in_many.to_list()

[(6398, 148.28492736816406),
 (143, 149.36392211914062),
 (5364, 149.3961181640625),
 (2408, 150.33773803710938),
 (4788, 150.7617645263672),
 (4569, 151.26873779296875),
 (8272, 151.555419921875),
 (3095, 151.58168029785156),
 (7043, 151.61358642578125),
 (5500, 151.66888427734375),
 (5613, 151.77276611328125),
 (6301, 151.88189697265625),
 (5890, 152.1690673828125),
 (540, 152.28231811523438),
 (9951, 152.41409301757812),
 (171, 152.41915893554688),
 (7865, 152.45040893554688),
 (9505, 152.47854614257812),
 (2065, 152.56646728515625),
 (762, 152.61732482910156),
 (2186, 152.81068420410156),
 (8400, 152.9129180908203),
 (649, 152.93020629882812),
 (4983, 152.9842071533203),
 (7042, 153.01051330566406),
 (5245, 153.03829956054688),
 (2278, 153.0926513671875),
 (609, 153.14703369140625),
 (3126, 153.23403930664062),
 (6393, 153.30552673339844),
 (950, 153.60391235351562),
 (1871, 153.6868133544922),
 (8731, 153.98638916015625),
 (1245, 154.013427734375),
 (8291, 154.26998901367188),
 (7

# Comparison with Scipy Spatial Distance

In [12]:
# Another example to compare with scipy spatial.distance
# Example vectors
vector1 = np.random.uniform(2,3,size=[10,5])
vector2 =np.array([np.random.uniform(2,3,size=5)])

In [13]:
result: BatchMatches = search(vector1, vector2, 10, MetricKind.L2sq, exact=True)

In [35]:
# get the indeces for ordering the pairwise distances
ind = np.array(result.to_list())[:,0].astype('int64')

In [44]:
np.sort(ind)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [15]:
d = result.distances

In [47]:
np.sqrt(d)[np.argsort(ind)]

array([1.2775285 , 0.6731909 , 0.61250603, 1.0589464 , 0.5932575 ,
       0.94723815, 0.7952991 , 0.21066497, 0.96343386, 0.587761  ],
      dtype=float32)

In [37]:
# Compute pairwise distances using Euclidean distance metric
pairwise_distances = cdist(vector1, vector2, metric='euclidean')

print("Pairwise distances:")
print(pairwise_distances)

Pairwise distances:
[[1.27752856]
 [0.67319091]
 [0.61250603]
 [1.0589464 ]
 [0.59325747]
 [0.94723814]
 [0.79529909]
 [0.21066497]
 [0.96343388]
 [0.58776098]]


In [16]:
pairwise_distances.shape

(10, 1)

# FAISS is dangerous

...and slower than USearch.

In [51]:
import faiss

In [54]:
# Concatenate the vectors into a single array
all_vectors = np.concatenate((vectors, vector.reshape(1,-1)), axis=0)

# Initialize Faiss index
d = all_vectors.shape[1]  # Dimension of vectors
index = faiss.IndexFlatL2(d)  # L2 distance (Euclidean distance)

# Add vectors to the index
index.add(all_vectors)

# Query for the nearest neighbors (in this case, excluding the vector itself)
k = all_vectors.shape[0] - 1  # Return distances to all other vectors
D, I = index.search(all_vectors, k)

# The first row of D corresponds to distances from vector1 to all vectors,
# and the second row corresponds to distances from vector2 to all vectors
pairwise_distances = D[1]

print("Pairwise distances:")
print(pairwise_distances)

Pairwise distances:
[  0.      147.81537 149.88068 ... 188.24158 188.37915 188.89383]


In [52]:
# Concatenate the vectors into a single array
all_vectors = np.concatenate((vector1, vector2), axis=0)

# Initialize Faiss index
d = all_vectors.shape[1]  # Dimension of vectors
index = faiss.IndexFlatL2(d)  # L2 distance (Euclidean distance)

# Add vectors to the index
index.add(all_vectors)

# Query for the nearest neighbors (in this case, excluding the vector itself)
k = all_vectors.shape[0] - 1  # Return distances to all other vectors
D, I = index.search(all_vectors, k)

# The first row of D corresponds to distances from vector1 to all vectors,
# and the second row corresponds to distances from vector2 to all vectors
pairwise_distances = D[1]

print("Pairwise distances:")
print(pairwise_distances)


Pairwise distances:
[0.         0.29514208 0.30921572 0.32049134 0.32110634 0.45318595
 0.69156986 0.7355278  0.94303757 1.1888701 ]


# Excuse for FAISS performance

https://medium.com/mlearning-ai/why-you-should-be-careful-using-faiss-c44996eda9ee