forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
approximate_nearest_neighbors.py
294 lines (234 loc) · 10.7 KB
/
approximate_nearest_neighbors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
"""
=====================================
Approximate nearest neighbors in TSNE
=====================================
This example presents how to chain KNeighborsTransformer and TSNE in a
pipeline. It also shows how to wrap the packages `annoy` and `nmslib` to
replace KNeighborsTransformer and perform approximate nearest neighbors.
These packages can be installed with `pip install annoy nmslib`.
Note: Currently `TSNE(metric='precomputed')` does not modify the precomputed
distances, and thus assumes that precomputed euclidean distances are squared.
In future versions, a parameter in TSNE will control the optional squaring of
precomputed distances (see #12401).
Note: In KNeighborsTransformer we use the definition which includes each
training point as its own neighbor in the count of `n_neighbors`, and for
compatibility reasons, one extra neighbor is computed when
`mode == 'distance'`. Please note that we do the same in the proposed wrappers.
Sample output::
Benchmarking on MNIST_2000:
---------------------------
AnnoyTransformer: 0.583 sec
NMSlibTransformer: 0.321 sec
KNeighborsTransformer: 1.225 sec
TSNE with AnnoyTransformer: 4.903 sec
TSNE with NMSlibTransformer: 5.009 sec
TSNE with KNeighborsTransformer: 6.210 sec
TSNE with internal NearestNeighbors: 6.365 sec
Benchmarking on MNIST_10000:
----------------------------
AnnoyTransformer: 4.457 sec
NMSlibTransformer: 2.080 sec
KNeighborsTransformer: 30.680 sec
TSNE with AnnoyTransformer: 30.225 sec
TSNE with NMSlibTransformer: 43.295 sec
TSNE with KNeighborsTransformer: 64.845 sec
TSNE with internal NearestNeighbors: 64.984 sec
"""
# Author: Tom Dupre la Tour
#
# License: BSD 3 clause
import time
import sys
try:
import annoy
except ImportError:
print("The package 'annoy' is required to run this example.")
sys.exit()
try:
import nmslib
except ImportError:
print("The package 'nmslib' is required to run this example.")
sys.exit()
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsTransformer
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.datasets import fetch_openml
from sklearn.pipeline import make_pipeline
from sklearn.manifold import TSNE
from sklearn.utils import shuffle
print(__doc__)
class NMSlibTransformer(TransformerMixin, BaseEstimator):
"""Wrapper for using nmslib as sklearn's KNeighborsTransformer"""
def __init__(self, n_neighbors=5, metric='euclidean', method='sw-graph',
n_jobs=1):
self.n_neighbors = n_neighbors
self.method = method
self.metric = metric
self.n_jobs = n_jobs
def fit(self, X):
self.n_samples_fit_ = X.shape[0]
# see more metric in the manual
# https://github.com/nmslib/nmslib/tree/master/manual
space = {
'sqeuclidean': 'l2',
'euclidean': 'l2',
'cosine': 'cosinesimil',
'l1': 'l1',
'l2': 'l2',
}[self.metric]
self.nmslib_ = nmslib.init(method=self.method, space=space)
self.nmslib_.addDataPointBatch(X)
self.nmslib_.createIndex()
return self
def transform(self, X):
n_samples_transform = X.shape[0]
# For compatibility reasons, as each sample is considered as its own
# neighbor, one extra neighbor will be computed.
n_neighbors = self.n_neighbors + 1
results = self.nmslib_.knnQueryBatch(X, k=n_neighbors,
num_threads=self.n_jobs)
indices, distances = zip(*results)
indices, distances = np.vstack(indices), np.vstack(distances)
if self.metric == 'sqeuclidean':
distances **= 2
indptr = np.arange(0, n_samples_transform * n_neighbors + 1,
n_neighbors)
kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(),
indptr), shape=(n_samples_transform,
self.n_samples_fit_))
return kneighbors_graph
class AnnoyTransformer(TransformerMixin, BaseEstimator):
"""Wrapper for using annoy.AnnoyIndex as sklearn's KNeighborsTransformer"""
def __init__(self, n_neighbors=5, metric='euclidean', n_trees=10,
search_k=-1):
self.n_neighbors = n_neighbors
self.n_trees = n_trees
self.search_k = search_k
self.metric = metric
def fit(self, X):
self.n_samples_fit_ = X.shape[0]
metric = self.metric if self.metric != 'sqeuclidean' else 'euclidean'
self.annoy_ = annoy.AnnoyIndex(X.shape[1], metric=metric)
for i, x in enumerate(X):
self.annoy_.add_item(i, x.tolist())
self.annoy_.build(self.n_trees)
return self
def transform(self, X):
return self._transform(X)
def fit_transform(self, X, y=None):
return self.fit(X)._transform(X=None)
def _transform(self, X):
"""As `transform`, but handles X is None for faster `fit_transform`."""
n_samples_transform = self.n_samples_fit_ if X is None else X.shape[0]
# For compatibility reasons, as each sample is considered as its own
# neighbor, one extra neighbor will be computed.
n_neighbors = self.n_neighbors + 1
indices = np.empty((n_samples_transform, n_neighbors),
dtype=np.int)
distances = np.empty((n_samples_transform, n_neighbors))
if X is None:
for i in range(self.annoy_.get_n_items()):
ind, dist = self.annoy_.get_nns_by_item(
i, n_neighbors, self.search_k, include_distances=True)
indices[i], distances[i] = ind, dist
else:
for i, x in enumerate(X):
indices[i], distances[i] = self.annoy_.get_nns_by_vector(
x.tolist(), n_neighbors, self.search_k,
include_distances=True)
if self.metric == 'sqeuclidean':
distances **= 2
indptr = np.arange(0, n_samples_transform * n_neighbors + 1,
n_neighbors)
kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(),
indptr), shape=(n_samples_transform,
self.n_samples_fit_))
return kneighbors_graph
def test_transformers():
"""Test that AnnoyTransformer and KNeighborsTransformer give same results
"""
X = np.random.RandomState(42).randn(10, 2)
knn = KNeighborsTransformer()
Xt0 = knn.fit_transform(X)
ann = AnnoyTransformer()
Xt1 = ann.fit_transform(X)
nms = NMSlibTransformer()
Xt2 = nms.fit_transform(X)
assert_array_almost_equal(Xt0.toarray(), Xt1.toarray(), decimal=5)
assert_array_almost_equal(Xt0.toarray(), Xt2.toarray(), decimal=5)
def load_mnist(n_samples):
"""Load MNIST, shuffle the data, and return only n_samples."""
mnist = fetch_openml(data_id=41063)
X, y = shuffle(mnist.data, mnist.target, random_state=42)
return X[:n_samples], y[:n_samples]
def run_benchmark():
datasets = [
('MNIST_2000', load_mnist(n_samples=2000)),
('MNIST_10000', load_mnist(n_samples=10000)),
]
n_iter = 500
perplexity = 30
# TSNE requires a certain number of neighbors which depends on the
# perplexity parameter.
# Add one since we include each sample as its own neighbor.
n_neighbors = int(3. * perplexity + 1) + 1
transformers = [
('AnnoyTransformer', AnnoyTransformer(n_neighbors=n_neighbors,
metric='sqeuclidean')),
('NMSlibTransformer', NMSlibTransformer(n_neighbors=n_neighbors,
metric='sqeuclidean')),
('KNeighborsTransformer', KNeighborsTransformer(
n_neighbors=n_neighbors, mode='distance', metric='sqeuclidean')),
('TSNE with AnnoyTransformer', make_pipeline(
AnnoyTransformer(n_neighbors=n_neighbors, metric='sqeuclidean'),
TSNE(metric='precomputed', perplexity=perplexity,
method="barnes_hut", random_state=42, n_iter=n_iter), )),
('TSNE with NMSlibTransformer', make_pipeline(
NMSlibTransformer(n_neighbors=n_neighbors, metric='sqeuclidean'),
TSNE(metric='precomputed', perplexity=perplexity,
method="barnes_hut", random_state=42, n_iter=n_iter), )),
('TSNE with KNeighborsTransformer', make_pipeline(
KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance',
metric='sqeuclidean'),
TSNE(metric='precomputed', perplexity=perplexity,
method="barnes_hut", random_state=42, n_iter=n_iter), )),
('TSNE with internal NearestNeighbors',
TSNE(metric='sqeuclidean', perplexity=perplexity, method="barnes_hut",
random_state=42, n_iter=n_iter)),
]
# init the plot
nrows = len(datasets)
ncols = np.sum([1 for name, model in transformers if 'TSNE' in name])
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False,
figsize=(5 * ncols, 4 * nrows))
axes = axes.ravel()
i_ax = 0
for dataset_name, (X, y) in datasets:
msg = 'Benchmarking on %s:' % dataset_name
print('\n%s\n%s' % (msg, '-' * len(msg)))
for transformer_name, transformer in transformers:
start = time.time()
Xt = transformer.fit_transform(X)
duration = time.time() - start
# print the duration report
longest = np.max([len(name) for name, model in transformers])
whitespaces = ' ' * (longest - len(transformer_name))
print('%s: %s%.3f sec' % (transformer_name, whitespaces, duration))
# plot TSNE embedding which should be very similar across methods
if 'TSNE' in transformer_name:
axes[i_ax].set_title(transformer_name + '\non ' + dataset_name)
axes[i_ax].scatter(Xt[:, 0], Xt[:, 1], c=y, alpha=0.2,
cmap=plt.cm.viridis)
axes[i_ax].xaxis.set_major_formatter(NullFormatter())
axes[i_ax].yaxis.set_major_formatter(NullFormatter())
axes[i_ax].axis('tight')
i_ax += 1
fig.tight_layout()
plt.show()
if __name__ == '__main__':
test_transformers()
run_benchmark()