Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor NN graph building (included in #43) #21

Draft
wants to merge 33 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
1ee85b5
attempt to refactor nn graph building
naspert Mar 19, 2018
4bacd5c
update tests
naspert Mar 19, 2018
00bbcdd
fix typo
naspert Mar 19, 2018
b822333
fix tests (avoiding not implemented combinations)
naspert Mar 19, 2018
38aebd0
- fix missing space after colon in dictionary
naspert Mar 19, 2018
524c60f
fix (matlab) GSP url
naspert Mar 19, 2018
ae83814
throw exception when using FLANN + max_dist (produces incorrect results)
naspert Mar 19, 2018
62fc0ce
update test case to fit FLANN & max_dist exception
naspert Mar 19, 2018
6f473fa
implement nn graph using pdist using radius
naspert Mar 20, 2018
25ec6d2
implement radius nn graph with flann
naspert Mar 20, 2018
96b628e
flann returns the squared distance when called with 'euclidean' dista…
naspert Mar 20, 2018
09bbff4
compute sqrt of list properly
naspert Mar 20, 2018
27b9a03
use cyflann instead of pyflann (radius search not working)
naspert Mar 20, 2018
8a1f9b9
check nn graphs building against pdist reference
naspert Mar 20, 2018
6e9e2ac
cyflann needs the flann library to be installed on the system
naspert Mar 20, 2018
811de06
check nn graphs building against pdist reference
naspert Mar 20, 2018
813fe39
backport stuff from cyflann branch
naspert Mar 20, 2018
4a4d597
flann should (mostly) work for knn graphs
naspert Mar 20, 2018
53dffc1
fix pdist warnings
naspert Mar 21, 2018
1309e92
implement and use scipy-ckdtree as default (faster than kdtree)
naspert Mar 21, 2018
90ae9a8
Merge remote-tracking branch 'origin-nas/nn_cyflann' into nn_refactor
naspert Mar 22, 2018
648fa91
backport README changes from master
naspert Mar 22, 2018
96fa5f6
Merge branch 'master' of https://github.com/epfl-lts2/pygsp into nn_r…
naspert Mar 22, 2018
c26e449
Merge branch 'master' into nn_refactor
naspert Mar 22, 2018
8e7c553
add nmslib
naspert Mar 23, 2018
b83e467
test flann when not on windows
naspert Mar 26, 2018
28b7858
use the same code to build sparse matrix for knn and radius
naspert Mar 29, 2018
188c4a6
building the graph with rescale/center=False should also work
naspert Mar 29, 2018
59c131a
Merge pull request #1 from naspert/nmslib
naspert Mar 29, 2018
8e98b77
update doc for nmslib
naspert Mar 29, 2018
08ae29f
enable multithreading with ckdtree/nmslib
naspert Apr 9, 2018
57e9661
Merge branch 'master' into nn_refactor
naspert Jun 20, 2018
a562896
fix _get_extra_repr
naspert Jun 20, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions .travis.yml
Expand Up @@ -8,6 +8,10 @@ python:
- 3.5
- 3.6

before_install:
- sudo apt-get -qq update
- sudo apt-get install -y libflann-dev

addons:
apt:
packages:
Expand Down
2 changes: 1 addition & 1 deletion README.rst
Expand Up @@ -37,7 +37,7 @@ The documentation is available on
`Read the Docs <https://pygsp.readthedocs.io>`_
and development takes place on
`GitHub <https://github.com/epfl-lts2/pygsp>`_.
A (mostly unmaintained) `Matlab version <https://lts2.epfl.ch/gsp>`_ exists.
(A (mostly unmaintained) `Matlab version <https://epfl-lts2.github.io/gspbox-html>`_ exists.)

The PyGSP facilitates a wide variety of operations on graphs, like computing
their Fourier basis, filtering or interpolating signals, plotting graphs,
Expand Down
317 changes: 243 additions & 74 deletions pygsp/graphs/nngraphs/nngraph.py
Expand Up @@ -3,23 +3,197 @@
import traceback

import numpy as np
from scipy import sparse, spatial
from scipy import sparse
import scipy.spatial as sps
import scipy.spatial.distance as spsd
import multiprocessing

from pygsp import utils
from pygsp.graphs import Graph # prevent circular import in Python < 3.5

_logger = utils.build_logger(__name__)


def _import_pfl():
# conversion between the FLANN conventions and the various backend functions
_dist_translation = {
'scipy-kdtree': {
'euclidean': 2,
'manhattan': 1,
'max_dist': np.inf
},
'scipy-ckdtree': {
'euclidean': 2,
'manhattan': 1,
'max_dist': np.inf
},
'scipy-pdist' : {
'euclidean': 'euclidean',
'manhattan': 'cityblock',
'max_dist': 'chebyshev',
'minkowski': 'minkowski'
},
'nmslib' : {
'euclidean': 'l2',
'manhattan': 'l1',
'max_dist': 'linf',
'minkowski': 'lp'
}
}

def _import_cfl():
try:
import pyflann as pfl
import cyflann as cfl
except Exception:
raise ImportError('Cannot import pyflann. Choose another nearest '
raise ImportError('Cannot import cyflann. Choose another nearest '
'neighbors method or try to install it with '
'pip (or conda) install pyflann (or pyflann3).')
return pfl
'pip (or conda) install cyflann.')
return cfl

def _import_nmslib():
try:
import nmslib as nms
except Exception:
raise ImportError('Cannot import nmslib. Choose another nearest '
'neighbors method or try to install it with '
'pip (or conda) install nmslib.')
return nms

def _knn_sp_kdtree(X, num_neighbors, dist_type, order=0):
kdt = sps.KDTree(X)
D, NN = kdt.query(X, k=(num_neighbors + 1),
p=_dist_translation['scipy-kdtree'][dist_type])
return NN, D

def _knn_sp_ckdtree(X, num_neighbors, dist_type, order=0):
kdt = sps.cKDTree(X)
D, NN = kdt.query(X, k=(num_neighbors + 1),
p=_dist_translation['scipy-ckdtree'][dist_type],
n_jobs=-1)
return NN, D


def _knn_flann(X, num_neighbors, dist_type, order):
# the combination FLANN + max_dist produces incorrect results
# do not allow it
if dist_type == 'max_dist':
raise ValueError('FLANN and max_dist is not supported')

cfl = _import_cfl()
cfl.set_distance_type(dist_type, order=order)
c = cfl.FLANNIndex(algorithm='kdtree')
c.build_index(X)
# Default FLANN parameters (I tried changing the algorithm and
# testing performance on huge matrices, but the default one
# seems to work best).
NN, D = c.nn_index(X, num_neighbors + 1)
c.free_index()
if dist_type == 'euclidean': # flann returns squared distances
return NN, np.sqrt(D)
return NN, D

def _radius_sp_kdtree(X, epsilon, dist_type, order=0):
kdt = sps.KDTree(X)
D, NN = kdt.query(X, k=None, distance_upper_bound=epsilon,
p=_dist_translation['scipy-kdtree'][dist_type])
return NN, D

def _radius_sp_ckdtree(X, epsilon, dist_type, order=0):
N, dim = np.shape(X)
kdt = sps.cKDTree(X)
nn = kdt.query_ball_point(X, r=epsilon,
p=_dist_translation['scipy-ckdtree'][dist_type],
n_jobs=-1)
D = []
NN = []
for k in range(N):
x = np.matlib.repmat(X[k, :], len(nn[k]), 1)
d = np.linalg.norm(x - X[nn[k], :],
ord=_dist_translation['scipy-ckdtree'][dist_type],
axis=1)
nidx = d.argsort()
NN.append(np.take(nn[k], nidx))
D.append(np.sort(d))
return NN, D


def _knn_sp_pdist(X, num_neighbors, dist_type, order):
if dist_type == 'minkowski':
p = spsd.pdist(X, metric=_dist_translation['scipy-pdist'][dist_type],
p=order)
else:
p = spsd.pdist(X, metric=_dist_translation['scipy-pdist'][dist_type])
pd = spsd.squareform(p)
pds = np.sort(pd)[:, 0:num_neighbors+1]
pdi = pd.argsort()[:, 0:num_neighbors+1]
return pdi, pds

def _knn_nmslib(X, num_neighbors, dist_type, order):
N, dim = np.shape(X)
ncpu = multiprocessing.cpu_count()
if dist_type == 'minkowski':
raise ValueError('unsupported distance type (lp) for nmslib')
nms = _import_nmslib()
nmsidx = nms.init(space=_dist_translation['nmslib'][dist_type])
nmsidx.addDataPointBatch(X)
nmsidx.createIndex()
q = nmsidx.knnQueryBatch(X, k=num_neighbors+1, num_threads=int(ncpu/2))
nn, d = zip(*q)
D = np.concatenate(d).reshape(N, num_neighbors+1)
NN = np.concatenate(nn).reshape(N, num_neighbors+1)
return NN, D

def _radius_sp_pdist(X, epsilon, dist_type, order):
N, dim = np.shape(X)
if dist_type == 'minkowski':
p = spsd.pdist(X, metric=_dist_translation['scipy-pdist'][dist_type],
p=order)
else:
p = spsd.pdist(X, metric=_dist_translation['scipy-pdist'][dist_type])
pd = spsd.squareform(p)
pdf = pd < epsilon
D = []
NN = []
for k in range(N):
v = pd[k, pdf[k, :]]
d = pd[k, :].argsort()
# use the same conventions as in scipy.distance.kdtree
NN.append(d[0:len(v)])
D.append(np.sort(v))

return NN, D

def _radius_flann(X, epsilon, dist_type, order=0):
N, dim = np.shape(X)
# the combination FLANN + max_dist produces incorrect results
# do not allow it
if dist_type == 'max_dist':
raise ValueError('FLANN and max_dist is not supported')

cfl = _import_cfl()
cfl.set_distance_type(dist_type, order=order)
c = cfl.FLANNIndex(algorithm='kdtree')
c.build_index(X)
D = []
NN = []
for k in range(N):
nn, d = c.nn_radius(X[k, :], epsilon*epsilon)
D.append(d)
NN.append(nn)
c.free_index()
if dist_type == 'euclidean': # flann returns squared distances
return NN, list(map(np.sqrt, D))
return NN, D

def _radius_nmslib(X, epsilon, dist_type, order=0):
raise ValueError('nmslib does not support (yet?) range queries')

def center_input(X, N):
return X - np.kron(np.ones((N, 1)), np.mean(X, axis=0))

def rescale_input(X, N, d):
bounding_radius = 0.5 * np.linalg.norm(np.amax(X, axis=0) -
np.amin(X, axis=0), 2)
scale = np.power(N, 1. / float(min(d, 3))) / 10.
return X * scale / bounding_radius

class NNGraph(Graph):
r"""Nearest-neighbor graph from given point cloud.
Expand All @@ -33,9 +207,13 @@ class NNGraph(Graph):
Type of nearest neighbor graph to create. The options are 'knn' for
k-Nearest Neighbors or 'radius' for epsilon-Nearest Neighbors (default
is 'knn').
use_flann : bool, optional
Use Fast Library for Approximate Nearest Neighbors (FLANN) or not.
(default is False)
backend : {'scipy-kdtree', 'scipy-pdist', 'flann'}
Type of the backend for graph construction.
- 'scipy-kdtree' will use scipy.spatial.KDTree
- 'scipy-ckdtree'(default) will use scipy.spatial.cKDTree
- 'scipy-pdist' will use scipy.spatial.distance.pdist (slowest but exact)
- 'flann' use Fast Library for Approximate Nearest Neighbors (FLANN)
- 'nmslib' use nmslib for approximate nearest neighbors (faster in high-dimensional spaces)
center : bool, optional
Center the data so that it has zero mean (default is True)
rescale : bool, optional
Expand Down Expand Up @@ -72,19 +250,43 @@ class NNGraph(Graph):

"""

def __init__(self, Xin, NNtype='knn', use_flann=False, center=True,

def __init__(self, Xin, NNtype='knn', backend='scipy-ckdtree', center=True,
rescale=True, k=10, sigma=0.1, epsilon=0.01,
plotting={}, symmetrize_type='average', dist_type='euclidean',
order=0, **kwargs):

self.Xin = Xin
self.NNtype = NNtype
self.use_flann = use_flann
self.backend = backend
self.center = center
self.rescale = rescale
self.k = k
self.sigma = sigma
self.epsilon = epsilon

_dist_translation['scipy-kdtree']['minkowski'] = order
_dist_translation['scipy-ckdtree']['minkowski'] = order

self._nn_functions = {
'knn': {
'scipy-kdtree': _knn_sp_kdtree,
'scipy-ckdtree': _knn_sp_ckdtree,
'scipy-pdist': _knn_sp_pdist,
'flann': _knn_flann,
'nmslib': _knn_nmslib
},
'radius': {
'scipy-kdtree': _radius_sp_kdtree,
'scipy-ckdtree': _radius_sp_ckdtree,
'scipy-pdist': _radius_sp_pdist,
'flann': _radius_flann,
'nmslib': _radius_nmslib
},
}



self.symmetrize_type = symmetrize_type
self.dist_type = dist_type
self.order = order
Expand All @@ -93,73 +295,40 @@ def __init__(self, Xin, NNtype='knn', use_flann=False, center=True,
Xout = self.Xin

if self.center:
Xout = self.Xin - np.kron(np.ones((N, 1)),
np.mean(self.Xin, axis=0))
Xout = center_input(Xout, N)

if self.rescale:
bounding_radius = 0.5 * np.linalg.norm(np.amax(Xout, axis=0) -
np.amin(Xout, axis=0), 2)
scale = np.power(N, 1. / float(min(d, 3))) / 10.
Xout *= scale / bounding_radius

# Translate distance type string to corresponding Minkowski order.
dist_translation = {"euclidean": 2,
"manhattan": 1,
"max_dist": np.inf,
"minkowski": order
}
Xout = rescale_input(Xout, N, d)

if self.NNtype == 'knn':
spi = np.zeros((N * k))
spj = np.zeros((N * k))
spv = np.zeros((N * k))

if self.use_flann:
pfl = _import_pfl()
pfl.set_distance_type(dist_type, order=order)
flann = pfl.FLANN()

# Default FLANN parameters (I tried changing the algorithm and
# testing performance on huge matrices, but the default one
# seems to work best).
NN, D = flann.nn(Xout, Xout, num_neighbors=(k + 1),
algorithm='kdtree')

else:
kdt = spatial.KDTree(Xout)
D, NN = kdt.query(Xout, k=(k + 1),
p=dist_translation[dist_type])

for i in range(N):
spi[i * k:(i + 1) * k] = np.kron(np.ones((k)), i)
spj[i * k:(i + 1) * k] = NN[i, 1:]
spv[i * k:(i + 1) * k] = np.exp(-np.power(D[i, 1:], 2) /
float(self.sigma))

if self._nn_functions.get(NNtype) == None:
raise ValueError('Invalid NNtype {}'.format(self.NNtype))

if self._nn_functions[NNtype].get(backend) == None:
raise ValueError('Invalid backend {} for type {}'.format(backend,
self.NNtype))
if self.NNtype == 'knn':
NN, D = self._nn_functions[NNtype][backend](Xout, k,
dist_type, order)

elif self.NNtype == 'radius':
NN, D = self._nn_functions[NNtype][backend](Xout, epsilon,
dist_type, order)
countV = list(map(len, NN))
count = sum(countV)
spi = np.zeros((count))
spj = np.zeros((count))
spv = np.zeros((count))

start = 0
for i in range(N):
leng = countV[i] - 1
spi[start:start + leng] = np.kron(np.ones((leng)), i)
spj[start:start + leng] = NN[i][1:]
spv[start:start + leng] = np.exp(-np.power(D[i][1:], 2) /
float(self.sigma))
start = start + leng

kdt = spatial.KDTree(Xout)
D, NN = kdt.query(Xout, k=None, distance_upper_bound=epsilon,
p=dist_translation[dist_type])
count = 0
for i in range(N):
count = count + len(NN[i])

spi = np.zeros((count))
spj = np.zeros((count))
spv = np.zeros((count))

start = 0
for i in range(N):
leng = len(NN[i]) - 1
spi[start:start + leng] = np.kron(np.ones((leng)), i)
spj[start:start + leng] = NN[i][1:]
spv[start:start + leng] = np.exp(-np.power(D[i][1:], 2) /
float(self.sigma))
start = start + leng

else:
raise ValueError('Unknown NNtype {}'.format(self.NNtype))

W = sparse.csc_matrix((spv, (spi, spj)), shape=(N, N))

Expand All @@ -176,7 +345,7 @@ def __init__(self, Xin, NNtype='knn', use_flann=False, center=True,

def _get_extra_repr(self):
return {'NNtype': self.NNtype,
'use_flann': self.use_flann,
'backend': self.backend,
'center': self.center,
'rescale': self.rescale,
'k': self.k,
Expand Down