Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add LSE to Pipeline #817

Merged
merged 12 commits into from
Aug 11, 2021
Merged
4 changes: 2 additions & 2 deletions docs/reference/reference/pipeline.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@ Embed
-----
.. automodule:: graspologic.pipeline.embed
.. autoclass:: graspologic.pipeline.embed.embeddings.Embeddings
.. autofunction:: graspologic.pipeline.embed.adjacency_spectral_embedding.adjacency_spectral_embedding

.. autofunction:: graspologic.pipeline.embed.adjacency_spectral_embedding
.. autofunction:: graspologic.pipeline.embed.laplacian_spectral_embedding
6 changes: 3 additions & 3 deletions graspologic/embed/lse.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ class LaplacianSpectralEmbed(BaseSpectralEmbed):

The laplacian spectral embedding (LSE) is a k-dimensional Euclidean representation
of the graph based on its Laplacian matrix. It relies on an SVD to reduce
the dimensionality to the specified k, or if k is unspecified, can find a number
of dimensions automatically.
the dimensionality to the specified ``n_components``, or if ``n_components`` is
unspecified, can find a number of dimensions automatically.

Parameters
----------
Expand Down Expand Up @@ -121,7 +121,7 @@ def __init__(
self,
form: str = "DAD",
n_components: Optional[int] = None,
n_elbows: int = 2,
n_elbows: Optional[int] = 2,
algorithm: str = "randomized",
n_iter: int = 5,
check_lcc: bool = True,
Expand Down
1 change: 1 addition & 0 deletions graspologic/pipeline/embed/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@

from .adjacency_spectral_embedding import adjacency_spectral_embedding
from .embeddings import Embeddings, EmbeddingsView
from .laplacian_spectral_embedding import laplacian_spectral_embedding
35 changes: 23 additions & 12 deletions graspologic/pipeline/embed/adjacency_spectral_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@
from beartype import beartype

from graspologic.embed import AdjacencySpectralEmbed
from graspologic.preconditions import (
check_argument,
check_argument_types,
check_optional_argument_types,
is_real_weighted,
from graspologic.preconditions import check_argument, is_real_weighted
from graspologic.utils import (
augment_diagonal,
is_fully_connected,
pass_to_ranks,
remove_loops,
)
from graspologic.utils import is_fully_connected, pass_to_ranks

from . import __SVD_SOLVER_TYPES # from the module init
from ._elbow import _index_of_elbow
Expand Down Expand Up @@ -53,7 +53,7 @@ def adjacency_spectral_embedding(

Parameters
----------
graph : Union[nx.Graph, nx.DiGraph, nx.OrderedGraph, nx.OrderedDiGraph]
graph : Union[nx.Graph, nx.OrderedGraph, nx.DiGraph, nx.OrderedDiGraph]
An undirected or directed graph. The graph **must**:

- be fully numerically weighted (every edge must have a real, numeric weight
Expand Down Expand Up @@ -103,6 +103,11 @@ def adjacency_spectral_embedding(
-------
Embeddings

Raises
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you. I forgot about this.

------
beartype.roar.BeartypeCallHintPepParamException if parameters do not match type hints
ValueError if values are not within appropriate ranges or allowed values

See Also
--------
graspologic.pipeline.embed.Embeddings
Expand Down Expand Up @@ -171,14 +176,20 @@ def adjacency_spectral_embedding(
# not all of the weights are real numbers, if they exist at all
# this weight=1.0 treatment actually happens in nx.to_scipy_sparse_matrix()

graph_as_csr = nx.to_scipy_sparse_matrix(graph, weight=weight_attribute)
node_labels = np.array(list(graph.nodes()))

graph_as_csr = nx.to_scipy_sparse_matrix(
graph, weight=weight_attribute, nodelist=node_labels
)

if not is_fully_connected(graph):
warnings.warn("More than one connected component detected")

node_labels = np.array(list(graph.nodes()))
graph_sans_loops = remove_loops(graph_as_csr)

ranked_graph = pass_to_ranks(graph_sans_loops)

graph_as_csr = pass_to_ranks(graph_as_csr)
augmented_graph = augment_diagonal(ranked_graph)

embedder = AdjacencySpectralEmbed(
n_components=dimensions,
Expand All @@ -187,9 +198,9 @@ def adjacency_spectral_embedding(
n_iter=svd_solver_iterations,
svd_seed=svd_seed,
concat=False,
diag_aug=True,
diag_aug=False,
daxpryce marked this conversation as resolved.
Show resolved Hide resolved
)
results = embedder.fit_transform(graph_as_csr)
results = embedder.fit_transform(augmented_graph)

if elbow_cut is None:
if graph.is_directed():
Expand Down
18 changes: 7 additions & 11 deletions graspologic/pipeline/embed/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,18 +49,14 @@ def __init__(self, labels: np.ndarray, embeddings: np.ndarray):
The node labels that are positionally correlated with the embeddings.
The dtype of labels is any object stored in a networkx Graph object,
though type uniformity will be required
embeddings
embeddings : np.ndarray
The embedded values generated by the embedding technique.

Raises
------
beartype.roar.BeartypeCallHintPepParamException if the types are invalid
ValueError if the row count of labels does not equal the row count of embeddings
"""
if labels is None:
raise ValueError("labels cannot be None")
if embeddings is None:
raise ValueError("embeddings cannot be None")
if not isinstance(labels, np.ndarray):
raise TypeError(f"labels must be numpy.ndarray, got: {type(labels)}")
if not isinstance(embeddings, np.ndarray):
raise TypeError(
f"embeddings must be numpy.ndarray, got: {type(embeddings)}"
)
if labels.shape[0] != embeddings.shape[0]:
raise ValueError(
f"labels and embeddings must have the same number of "
Expand Down
230 changes: 230 additions & 0 deletions graspologic/pipeline/embed/laplacian_spectral_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import numbers
import warnings
from typing import Optional, Union

import networkx as nx
import numpy as np
from beartype import beartype

from graspologic.embed import LaplacianSpectralEmbed
from graspologic.preconditions import check_argument, is_real_weighted
from graspologic.utils import is_fully_connected, pass_to_ranks, remove_loops

from . import __SVD_SOLVER_TYPES # from the module init
from ._elbow import _index_of_elbow
from .embeddings import Embeddings

__FORMS = ["DAD", "I-DAD", "R-DAD"]


@beartype
def laplacian_spectral_embedding(
    graph: Union[nx.Graph, nx.OrderedGraph, nx.DiGraph, nx.OrderedDiGraph],
    form: str = "R-DAD",
    dimensions: int = 100,
    elbow_cut: Optional[int] = None,
    svd_solver_algorithm: str = "randomized",
    svd_solver_iterations: int = 5,
    svd_seed: Optional[int] = None,
    weight_attribute: str = "weight",
    regularizer: Optional[numbers.Real] = None,
) -> Embeddings:
    r"""
    Given a directed or undirected networkx graph (*not* multigraph), generate an
    Embeddings object.

    The laplacian spectral embedding process is similar to the adjacency spectral
    embedding process, with the key differentiator being that the LSE process looks
    further into the latent space when it captures changes, whereas the ASE process
    is egocentric and focused on immediate differentiators in a node's periphery.

    All weights will be rescaled based on their relative rank in the graph,
    which is beneficial in minimizing anomalous results if some edge weights are
    extremely atypical of the rest of the graph.

    Parameters
    ----------
    graph : Union[nx.Graph, nx.OrderedGraph, nx.DiGraph, nx.OrderedDiGraph]
        An undirected or directed graph. The graph **must**:

        - be fully numerically weighted (every edge must have a real, numeric weight
          or else it will be treated as an unweighted graph)
        - be a basic graph (meaning it should not be a multigraph; if you have a
          multigraph you must first decide how you want to handle the weights of the
          edges between two nodes, whether summed, averaged, last-wins,
          maximum-weight-only, etc)
    form : str (default="R-DAD")
        Specifies the type of Laplacian normalization to use. Allowed values are:
        { "DAD", "I-DAD", "R-DAD" }
    dimensions : int (default=100)
        Dimensions to use for the svd solver.
        For undirected graphs, if ``elbow_cut==None``, you will receive an embedding
        that has ``nodes`` rows and ``dimensions`` columns.
        For directed graphs, if ``elbow_cut==None``, you will receive an embedding that
        has ``nodes`` rows and ``2*dimensions`` columns.
        If ``elbow_cut`` is specified to be not ``None``, we will cut the embedding at
        ``elbow_cut`` elbow, but the provided ``dimensions`` will be used in the
        creation of the SVD.
    elbow_cut : Optional[int] (default=None)
        Using a process described by Zhu & Ghodsi in their paper "Automatic
        dimensionality selection from the scree plot via the use of profile likelihood",
        truncate the dimensionality of the return on the ``elbow_cut``-th elbow.
        By default this value is ``None`` but can be used to reduce the dimensionality
        of the returned tensors.
    svd_solver_algorithm : str (default="randomized")
        allowed values: {'randomized', 'full', 'truncated'}

        SVD solver to use:

        - 'randomized'
            Computes randomized svd using
            :func:`sklearn.utils.extmath.randomized_svd`
        - 'full'
            Computes full svd using :func:`scipy.linalg.svd`
            Does not support ``graph`` input of type scipy.sparse.csr_matrix
        - 'truncated'
            Computes truncated svd using :func:`scipy.sparse.linalg.svds`
    svd_solver_iterations : int (default=5)
        Number of iterations for randomized SVD solver. Not used by 'full' or
        'truncated'. The default is larger than the default in randomized_svd
        to handle sparse matrices that may have large slowly decaying spectrum.
    svd_seed : Optional[int] (default=None)
        Used to seed the PRNG used in the ``randomized`` svd solver algorithm.
    weight_attribute : str (default="weight")
        The edge dictionary key that contains the weight of the edge.
    regularizer : Optional[numbers.Real] (default=None)
        Only used when form="R-DAD". Must be None or nonnegative.
        Constant to be added to the diagonal of degree matrix. If None, average
        node degree is added. If int or float, must be >= 0.
        The value is forwarded unchanged to the underlying
        ``LaplacianSpectralEmbed``.

    Returns
    -------
    Embeddings

    Raises
    ------
    beartype.roar.BeartypeCallHintPepParamException
        if parameters do not match type hints
    ValueError
        if values are not within appropriate ranges or allowed values

    See Also
    --------
    graspologic.pipeline.embed.Embeddings
    graspologic.embed.LaplacianSpectralEmbed
    graspologic.embed.select_svd
    graspologic.utils.to_laplacian

    Notes
    -----
    The singular value decomposition:

    .. math:: A = U \Sigma V^T

    is used to find an orthonormal basis for a matrix, which in our case is the
    Laplacian matrix of the graph. These basis vectors (in the matrices U or V) are
    ordered according to the amount of variance they explain in the original matrix.
    By selecting a subset of these basis vectors (through our choice of dimensionality
    reduction) we can find a lower dimensional space in which to represent the graph.

    References
    ----------
    .. [1] Sussman, D.L., Tang, M., Fishkind, D.E., Priebe, C.E.  "A
       Consistent Adjacency Spectral Embedding for Stochastic Blockmodel Graphs,"
       Journal of the American Statistical Association, Vol. 107(499), 2012.

    .. [2] Von Luxburg, Ulrike. "A tutorial on spectral clustering," Statistics
       and computing, Vol. 17(4), pp. 395-416, 2007.

    .. [3] Rohe, Karl, Sourav Chatterjee, and Bin Yu. "Spectral clustering and
       the high-dimensional stochastic blockmodel," The Annals of Statistics,
       Vol. 39(4), pp. 1878-1915, 2011.

    .. [4] Zhu, M. and Ghodsi, A. (2006). Automatic dimensionality selection from the
       scree plot via the use of profile likelihood. Computational Statistics & Data
       Analysis, 51(2), pp.918-930.

    """
    # --- argument range / allowed-value validation (types are checked by beartype)
    check_argument(
        form in __FORMS, f"form must be one of the values in {','.join(__FORMS)}"
    )

    check_argument(dimensions >= 1, "dimensions must be positive")

    check_argument(elbow_cut is None or elbow_cut >= 1, "elbow_cut must be positive")

    check_argument(
        svd_solver_algorithm in __SVD_SOLVER_TYPES,
        f"svd_solver_algorithm must be one of the values in {','.join(__SVD_SOLVER_TYPES)}",
    )

    check_argument(svd_solver_iterations >= 1, "svd_solver_iterations must be positive")

    check_argument(
        svd_seed is None or 0 <= svd_seed <= 2 ** 32 - 1,
        "svd_seed must be a nonnegative, 32-bit integer",
    )

    check_argument(
        regularizer is None or regularizer >= 0, "regularizer must be nonnegative"
    )

    check_argument(
        not graph.is_multigraph(),
        "Multigraphs are not supported; you must determine how to represent at most "
        "one edge between any two nodes, and handle the corresponding weights "
        "accordingly",
    )

    if not is_real_weighted(graph, weight_attribute=weight_attribute):
        warnings.warn(
            f"Graphs with edges that do not have a real numeric weight set for every "
            f"{weight_attribute} attribute on every edge are treated as an unweighted "
            f"graph - which presumes all weights are `1.0`. If this is incorrect, "
            f"please add a '{weight_attribute}' attribute to every edge with a real, "
            f"numeric value (e.g. an integer or a float) and call this function again."
        )
        # This supersedes what the user said, because not all of the weights are
        # real numbers, if they exist at all. The weight=1.0 treatment actually
        # happens in nx.to_scipy_sparse_matrix().
        weight_attribute = None

    # Capture the node order once and hand the same order to networkx, so that row
    # i of the matrix (and of the resulting embedding) corresponds to node_labels[i].
    node_labels = np.array(list(graph.nodes()))

    graph_as_csr = nx.to_scipy_sparse_matrix(
        graph, weight=weight_attribute, nodelist=node_labels
    )

    if not is_fully_connected(graph):
        warnings.warn("More than one connected component detected")

    graph_sans_loops = remove_loops(graph_as_csr)

    ranked_graph = pass_to_ranks(graph_sans_loops)

    embedder = LaplacianSpectralEmbed(
        form=form,
        n_components=dimensions,
        n_elbows=None,  # in the short term, we do our own elbow finding
        algorithm=svd_solver_algorithm,
        n_iter=svd_solver_iterations,
        # Previously validated but never forwarded, so a caller-supplied
        # regularizer was silently ignored.
        regularizer=regularizer,
        svd_seed=svd_seed,
        concat=False,
    )
    results = embedder.fit_transform(ranked_graph)

    if elbow_cut is None:
        # With concat=False a directed graph yields a (left, right) pair; join them
        # side by side so callers always receive a single 2-d array.
        if graph.is_directed():
            results = np.concatenate(results, axis=1)
    else:
        column_index = _index_of_elbow(embedder.singular_values_, elbow_cut)
        if graph.is_directed():
            left, right = results
            left = left[:, :column_index]
            right = right[:, :column_index]
            results = np.concatenate((left, right), axis=1)
        else:
            results = results[:, :column_index]

    embeddings = Embeddings(node_labels, results)
    return embeddings
11 changes: 11 additions & 0 deletions tests/pipeline/embed/test_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import unittest

import numpy as np
from beartype.roar import BeartypeCallHintPepParamException

from graspologic.pipeline.embed import Embeddings

Expand Down Expand Up @@ -43,3 +44,13 @@ def test_view(self):
self.assertSetEqual(set(view.keys()), set(expected.keys()))
for key in expected:
np.testing.assert_array_equal(expected[key], view[key])

def test_argument_types(self):
    """Check that Embeddings rejects wrongly typed or mismatched arguments."""
    # Each pair contains at least one argument that is not a numpy array.
    wrongly_typed = (
        (None, None),
        (np.array(["hello"]), None),
        (["hello"], [1.0]),
    )
    for bad_labels, bad_embeddings in wrongly_typed:
        with self.assertRaises(BeartypeCallHintPepParamException):
            Embeddings(bad_labels, bad_embeddings)

    # Correct types, but one label row against two embedding rows.
    single_label = np.array(["hello"])
    two_rows = np.array([[1.1, 1.2], [2.1, 2.2]])
    with self.assertRaises(ValueError):
        Embeddings(single_label, two_rows)