Merge 9e2b382 into bf4e437
nperraud committed Aug 19, 2019
2 parents bf4e437 + 9e2b382 commit 9b30478
Showing 31 changed files with 1,897 additions and 249 deletions.
3 changes: 2 additions & 1 deletion .travis.yml
@@ -25,9 +25,10 @@ addons:
- ubuntu-toolchain-r-test
- sourceline: 'deb http://downloads.skewed.de/apt/trusty trusty universe'
packages:
-      - libqt5gui5 # pyqt5>5.11 otherwise cannot load the xcb platform plugin
      - libflann-dev
      - python3-graph-tool
+      - python-graph-tool
+      - libqt5gui5 # pyqt5>5.11 fails to load the xcb platform plugin without it

install:
- pip install --upgrade --upgrade-strategy eager .[dev]
9 changes: 9 additions & 0 deletions README.rst
@@ -39,6 +39,15 @@ A (mostly unmaintained) `Matlab version <https://lts2.epfl.ch/gsp>`_ exists.
.. |conda| image:: https://anaconda.org/conda-forge/pygsp/badges/installer/conda.svg
:target: https://anaconda.org/conda-forge/pygsp


+The PyGSP is a Python package to ease
+`Signal Processing on Graphs <https://arxiv.org/abs/1211.0053>`_.
+The documentation is available on
+`Read the Docs <https://pygsp.readthedocs.io>`_
+and development takes place on
+`GitHub <https://github.com/epfl-lts2/pygsp>`_.
+A (mostly unmaintained) `Matlab version <https://epfl-lts2.github.io/gspbox-html>`_ exists.
+
The PyGSP facilitates a wide variety of operations on graphs, like computing
their Fourier basis, filtering or interpolating signals, plotting graphs,
signals, and filters. Its core is spectral graph theory, and many of the
5 changes: 5 additions & 0 deletions doc/history.rst
@@ -24,12 +24,17 @@ History
* New implementation of the Sensor graph that is simpler and scales better.
* A new learning module with three functions to solve standard semi-supervised
classification and regression problems.
+* A much improved, fixed, documented, and tested NNGraph. The user can now
+  select the backend and similarity kernel. The radius can be estimated and
+  features standardized. (PR #43)
* Import and export graphs and their signals to NetworkX and graph-tool.
* Save and load graphs and their signals to / from GraphML, GML, and GEXF.
* Documentation: path graph linked to DCT, ring graph linked to DFT.
* We now have a gallery of examples! That is convenient for users to get a
taste of what the library can do, and to start working from a code snippet.
* Merged all the extra requirements in a single dev requirement.
+* A function to learn a graph from signals has been added, with documentation
+  and tests. It can also be used through the ``LearnGraph`` graph.

Experimental filter API (to be tested and validated):

17 changes: 17 additions & 0 deletions doc/references.bib
@@ -220,3 +220,20 @@ @article{grassi2018timevertex
year={2018},
journal={IEEE Transactions on Signal Processing},
}

+@inproceedings{kalofolias2018large,
+  title={Large Scale Graph Learning From Smooth Signals},
+  author={Vassilis Kalofolias and Nathanaël Perraudin},
+  booktitle={International Conference on Learning Representations},
+  year={2019},
+  url={https://openreview.net/forum?id=ryGkSo0qYm},
+}
+
+@inproceedings{kalofolias2016learn,
+  title={How to learn a graph from smooth signals},
+  author={Kalofolias, Vassilis},
+  booktitle={Artificial Intelligence and Statistics},
+  pages={920--929},
+  year={2016},
+}
4 changes: 2 additions & 2 deletions doc/tutorials/optimization.rst
@@ -85,7 +85,7 @@ We start with the graph TV regularization. We will use the :class:`pyunlocbox.so
>>> prob1 = pyunlocbox.solvers.solve([d, r, f], solver=solver,
... x0=x0, rtol=0, maxit=1000)
Solution found after 1000 iterations:
-objective function f(sol) = 2.250584e+02
+objective function f(sol) = 2.256055e+02
stopping criterion: MAXIT
>>>
>>> fig, ax = G.plot(prob1['sol'])
@@ -107,7 +107,7 @@ This figure shows the label signal recovered by graph total variation regulariza
>>> prob2 = pyunlocbox.solvers.solve([r, f], solver=solver,
... x0=x0, rtol=0, maxit=1000)
Solution found after 1000 iterations:
-objective function f(sol) = 6.504290e+01
+objective function f(sol) = 4.376481e+01
stopping criterion: MAXIT
>>>
>>> fig, ax = G.plot(prob2['sol'])
46 changes: 46 additions & 0 deletions nn-test.py
@@ -0,0 +1,46 @@
import unittest
import numpy as np
from nn import nn, sparse_distance_matrix

class TestCase(unittest.TestCase):
def test_nngraph(self, n_vertices=24):
data = np.random.RandomState(42).uniform(size=(n_vertices, 3))
metrics = ['euclidean', 'manhattan', 'max_dist', 'minkowski']
backends = ['scipy-kdtree', 'scipy-ckdtree', 'flann', 'nmslib']

for metric in metrics:
for kind in ['knn', 'radius']:
for backend in backends:
params = dict(features=data, metric=metric, kind=kind, radius=0.25)
ref_nn, ref_d = nn(backend='scipy-pdist', **params)
# Unsupported combinations.
if backend == 'flann' and metric == 'max_dist':
self.assertRaises(ValueError, nn, data,
metric=metric, backend=backend)
elif backend == 'nmslib' and metric == 'minkowski':
self.assertRaises(ValueError, nn, data,
metric=metric, backend=backend)
elif backend == 'nmslib' and kind == 'radius':
self.assertRaises(ValueError, nn, data,
kind=kind, backend=backend)
else:
params['backend'] = backend
if backend == 'flann':
# params['target_precision'] = 1
other_nn, other_d = nn(random_seed=44, **params)
else:
other_nn, other_d = nn(**params)
print(kind, backend)
                        for a, b in zip(ref_nn, other_nn):
                            np.testing.assert_allclose(np.sort(a), np.sort(b), rtol=1e-5)

                        for a, b in zip(ref_d, other_d):
                            np.testing.assert_allclose(np.sort(a), np.sort(b), rtol=1e-5)

    def test_sparse_distance_matrix(self):
        data = np.random.RandomState(42).uniform(size=(24, 3))
        # Minimal sanity check: the matrix is square and symmetric.
        neighbors, distances = nn(data, kind='knn', k=4)
        W = sparse_distance_matrix(neighbors, distances, kind='knn', safe=True)
        self.assertEqual(W.shape, (24, 24))
        np.testing.assert_allclose(W.toarray(), W.toarray().T)


suite = unittest.TestLoader().loadTestsFromTestCase(TestCase)
if __name__ == '__main__':
unittest.main()
245 changes: 245 additions & 0 deletions nn.py
@@ -0,0 +1,245 @@
# -*- coding: utf-8 -*-

from __future__ import division

import numpy as np
from scipy import sparse, spatial

from pygsp import utils

_logger = utils.build_logger(__name__)


def _scipy_pdist(features, metric, order, kind, k, radius, params):
if params:
raise ValueError('unexpected parameters {}'.format(params))
metric = 'cityblock' if metric == 'manhattan' else metric
metric = 'chebyshev' if metric == 'max_dist' else metric
params = dict(metric=metric)
if metric == 'minkowski':
params['p'] = order
dist = spatial.distance.pdist(features, **params)
dist = spatial.distance.squareform(dist)
if kind == 'knn':
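        # Each vertex is its own nearest neighbor (at distance zero), so ask
        # for k+1 neighbors to get k distinct ones.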
neighbors = np.argsort(dist)[:, :k+1]
distances = np.take_along_axis(dist, neighbors, axis=-1)
elif kind == 'radius':
distances = []
neighbors = []
for distance in dist:
neighbor = np.flatnonzero(distance < radius)
neighbors.append(neighbor)
distances.append(distance[neighbor])
return neighbors, distances


def _scipy_kdtree(features, _, order, kind, k, radius, params):
if order is None:
raise ValueError('invalid metric for scipy-kdtree')
eps = params.pop('eps', 0)
tree = spatial.KDTree(features, **params)
params = dict(p=order, eps=eps)
if kind == 'knn':
params['k'] = k + 1
elif kind == 'radius':
params['k'] = None
params['distance_upper_bound'] = radius
distances, neighbors = tree.query(features, **params)
return neighbors, distances


def _scipy_ckdtree(features, _, order, kind, k, radius, params):
if order is None:
        raise ValueError('invalid metric for scipy-ckdtree')
eps = params.pop('eps', 0)
tree = spatial.cKDTree(features, **params)
params = dict(p=order, eps=eps, n_jobs=-1)
if kind == 'knn':
params['k'] = k + 1
elif kind == 'radius':
params['k'] = features.shape[0] # number of vertices
params['distance_upper_bound'] = radius
distances, neighbors = tree.query(features, **params)
if kind == 'knn':
return neighbors, distances
elif kind == 'radius':
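        # cKDTree pads each query result with infinite distances up to k
        # entries; keep only the real neighbors.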
dist = []
neigh = []
for distance, neighbor in zip(distances, neighbors):
mask = (distance != np.inf)
dist.append(distance[mask])
neigh.append(neighbor[mask])
return neigh, dist


def _flann(features, metric, order, kind, k, radius, params):
if metric == 'max_dist':
raise ValueError('flann gives wrong results for metric="max_dist".')
try:
import cyflann as cfl
except Exception as e:
raise ImportError('Cannot import cyflann. Choose another nearest '
'neighbors backend or try to install it with '
'pip (or conda) install cyflann. '
'Original exception: {}'.format(e))
cfl.set_distance_type(metric, order=order)
index = cfl.FLANNIndex()
index.build_index(features, **params)
# I tried changing the algorithm and testing performance on huge matrices,
# but the default parameters seems to work best.
if kind == 'knn':
neighbors, distances = index.nn_index(features, k+1)
if metric == 'euclidean':
np.sqrt(distances, out=distances)
elif metric == 'minkowski':
np.power(distances, 1/order, out=distances)
elif kind == 'radius':
distances = []
neighbors = []
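        # FLANN reports squared distances for the euclidean metric (and p-th
        # powers for minkowski), so search with the transformed radius and
        # undo the transform on the returned distances.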
if metric == 'euclidean':
radius = radius**2
elif metric == 'minkowski':
radius = radius**order
n_vertices, _ = features.shape
for vertex in range(n_vertices):
neighbor, distance = index.nn_radius(features[vertex, :], radius)
distances.append(distance)
neighbors.append(neighbor)
if metric == 'euclidean':
distances = list(map(np.sqrt, distances))
elif metric == 'minkowski':
distances = list(map(lambda d: np.power(d, 1/order), distances))
index.free_index()
return neighbors, distances


def _nmslib(features, metric, order, kind, k, _, params):
if kind == 'radius':
raise ValueError('nmslib does not support kind="radius".')
if metric == 'minkowski':
raise ValueError('nmslib does not support metric="minkowski".')
try:
import nmslib as nms
except Exception as e:
raise ImportError('Cannot import nmslib. Choose another nearest '
'neighbors backend or try to install it with '
'pip (or conda) install nmslib. '
'Original exception: {}'.format(e))
n_vertices, _ = features.shape
params_index = params.pop('index', None)
params_query = params.pop('query', None)
metric = 'l2' if metric == 'euclidean' else metric
metric = 'l1' if metric == 'manhattan' else metric
metric = 'linf' if metric == 'max_dist' else metric
index = nms.init(space=metric, **params)
index.addDataPointBatch(features)
index.createIndex(params_index)
if params_query is not None:
index.setQueryTimeParams(params_query)
results = index.knnQueryBatch(features, k=k+1)
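    # knnQueryBatch returns one (indices, distances) pair per query point;
    # unzip them and stack into dense (n_vertices, k+1) arrays.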
neighbors, distances = zip(*results)
distances = np.concatenate(distances).reshape(n_vertices, k+1)
neighbors = np.concatenate(neighbors).reshape(n_vertices, k+1)
return neighbors, distances

def nn(features, metric='euclidean', order=2, kind='knn', k=10, radius=None,
       backend='scipy-ckdtree', **kwargs):
    r'''Find the nearest neighbors of every point in a feature matrix.

Parameters
----------
    features : ndarray
        An array of shape (n_vertices, n_features): one point per row.
metric : {'euclidean', 'manhattan', 'minkowski', 'max_dist'}, optional
Metric used to compute pairwise distances.
* ``'euclidean'`` defines pairwise distances as
:math:`d(v_i, v_j) = \| x_i - x_j \|_2`.
* ``'manhattan'`` defines pairwise distances as
:math:`d(v_i, v_j) = \| x_i - x_j \|_1`.
* ``'minkowski'`` generalizes the above and defines distances as
:math:`d(v_i, v_j) = \| x_i - x_j \|_p`
where :math:`p` is the ``order`` of the norm.
    * ``'max_dist'`` defines pairwise distances as
      :math:`d(v_i, v_j) = \| x_i - x_j \|_\infty = \max(|x_i - x_j|)`, where
      the maximum is taken over the elements of the vector.
More metrics may be supported for some backends.
Please refer to the documentation of the chosen backend.
    kind : {'knn', 'radius'}, optional
        Kind of search: a fixed number of neighbors or all neighbors within
        a given radius. Default is 'knn'.
    k : int, optional
        Number of nearest neighbors if ``kind='knn'``.
    radius : float, optional
        Radius of the search if ``kind='radius'``.
order : float, optional
The order of the Minkowski distance for ``metric='minkowski'``.
backend : string, optional
* ``'scipy-pdist'`` uses :func:`scipy.spatial.distance.pdist` to
compute pairwise distances. The method is brute force and computes
all distances. That is the slowest method.
* ``'scipy-kdtree'`` uses :class:`scipy.spatial.KDTree`. The method
builds a k-d tree to prune the number of pairwise distances it has to
compute. That is an efficient strategy for low-dimensional spaces.
* ``'scipy-ckdtree'`` uses :class:`scipy.spatial.cKDTree`. The same as
``'scipy-kdtree'`` but with C bindings, which should be faster.
That is the default.
* ``'flann'`` uses the `Fast Library for Approximate Nearest Neighbors
(FLANN) <https://github.com/mariusmuja/flann>`_. That method is an
approximation.
* ``'nmslib'`` uses the `Non-Metric Space Library (NMSLIB)
<https://github.com/nmslib/nmslib>`_. That method is an
approximation. It should be the fastest in high-dimensional spaces.
You can look at this `benchmark
<https://github.com/erikbern/ann-benchmarks>`_ to get an idea of the
relative performance of those backends. It's nonetheless wise to run
some tests on your own data.
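
    Returns
    -------
    neighbors : array or list of arrays
        Indices of the neighbors of each vertex, the vertex itself included
        (at distance zero).
    distances : array or list of arrays
        Distances to those neighbors, in the same order.

    Examples
    --------
    A minimal check with the default ``scipy-ckdtree`` backend:

    >>> import numpy as np
    >>> features = np.arange(6, dtype=float).reshape(3, 2)
    >>> neighbors, distances = nn(features, kind='knn', k=1)
    >>> neighbors[0]  # vertex 0 and its single nearest neighbor
    array([0, 1])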
'''
    if kind == 'knn':
        radius = None
    elif kind == 'radius':
        k = None
    else:
        raise ValueError('"kind" must be "knn" or "radius".')

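    # Map the metric to the corresponding Minkowski order: the backends
    # take it as the p-norm parameter (e.g., p=2 for euclidean).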
_orders = {
'euclidean': 2,
'manhattan': 1,
'max_dist': np.inf,
'minkowski': order,
}
    order = _orders.get(metric, None)
try:
function = globals()['_' + backend.replace('-', '_')]
except KeyError:
raise ValueError('Invalid backend "{}".'.format(backend))
neighbors, distances = function(features, metric, order,
kind, k, radius, kwargs)
return neighbors, distances


def sparse_distance_matrix(neighbors, distances, symmetrize=True, safe=False,
                           kind=None):
    '''Build a sparse distance matrix from lists of neighbors and distances.'''
    n_edges = [len(n) - 1 for n in neighbors]  # remove distance to self
    n_vertices = len(n_edges)
    if safe and kind is None:
        raise ValueError('Please set "kind" to "knn" or "radius" to use '
                         'the safe mode.')

    if safe and kind == 'radius':
        n_disconnected = np.sum(np.asarray(n_edges) == 0)
        if n_disconnected > 0:
            _logger.warning('{} points (out of {}) have no neighbors. '
                            'Consider increasing the radius or setting '
                            'kind=knn.'.format(n_disconnected, n_vertices))

    value = np.empty(sum(n_edges), dtype=float)
    row = np.empty_like(value, dtype=int)
    col = np.empty_like(value, dtype=int)
    start = 0
    for vertex in range(n_vertices):
        if safe and kind == 'knn':
            # With k-NN, every vertex has the same number of neighbors.
            assert n_edges[vertex] == n_edges[0]
        end = start + n_edges[vertex]
        value[start:end] = distances[vertex][1:]
        row[start:end] = np.full(n_edges[vertex], vertex)
        col[start:end] = neighbors[vertex][1:]
        start = end
W = sparse.csr_matrix((value, (row, col)), (n_vertices, n_vertices))
if symmetrize:
# Enforce symmetry. May have been broken by k-NN. Checking symmetry
# with np.abs(W - W.T).sum() is as costly as the symmetrization itself.
W = utils.symmetrize(W, method='fill')
return W
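

# A small usage sketch: chain `nn` and `sparse_distance_matrix` as defined
# above to build a symmetric sparse matrix of distances between nearest
# neighbors.
if __name__ == '__main__':
    points = np.random.RandomState(0).uniform(size=(20, 2))
    neighbors, distances = nn(points, kind='knn', k=3)
    W = sparse_distance_matrix(neighbors, distances, kind='knn', safe=True)
    print(W.shape, W.nnz)  # (20, 20), at least 20*3 stored distances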
