
Sparse omni #834

Merged
merged 10 commits on Sep 10, 2021
57 changes: 48 additions & 9 deletions graspologic/embed/omni.py
@@ -2,15 +2,55 @@
# Licensed under the MIT License.

import warnings
from typing import Optional
from typing import List, Optional

import numpy as np
from scipy.sparse import isspmatrix_csr
from beartype import beartype
from scipy.sparse import csr_matrix, hstack, isspmatrix_csr, vstack

from ..utils import import_graph, is_fully_connected
from ..utils import average_matrices, is_fully_connected
from .base import BaseEmbedMulti


@beartype
def _get_omnibus_matrix_sparse(matrices: List[csr_matrix]) -> csr_matrix:
"""
Generate the omnibus matrix from a list of sparse adjacency matrices as described by 'A central limit theorem
for an omnibus embedding of random dot product graphs.'

Given an iterable of matrices a, b, ..., n, the omnibus matrix is defined as::

[[ a, .5 * (a + b), ..., .5 * (a + n)],
[.5 * (b + a), b, ..., .5 * (b + n)],
[ ..., ..., ..., ...],
[.5 * (n + a), .5 * (n + b), ..., n]
]
"""

rows = []

# Build the omnibus matrix one block row at a time
for column_index, column_matrix in enumerate(matrices):
current_row = []

for row_index, row_matrix in enumerate(matrices):
if row_index == column_index:
# On the diagonal: no averaging is needed; use the matrix itself
current_row.append(column_matrix)
else:
# Off the diagonal: use the average of the two matrices
matrices_averaged = (column_matrix + row_matrix) * 0.5
current_row.append(matrices_averaged)

# The block row is complete; horizontally stack its matrices
rows.append(hstack(current_row))

return vstack(rows, format="csr")
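A quick illustration of the block structure described in the docstring above, not part of this diff; it assumes the private helper can be imported from graspologic.embed.omni:

import numpy as np
from scipy.sparse import csr_matrix

from graspologic.embed.omni import _get_omnibus_matrix_sparse

# Two tiny adjacency matrices on the same vertex set
a = csr_matrix(np.array([[0.0, 1.0], [1.0, 0.0]]))
b = csr_matrix(np.array([[0.0, 2.0], [2.0, 0.0]]))

omni = _get_omnibus_matrix_sparse([a, b]).toarray()

# Diagonal blocks are the inputs themselves; off-diagonal blocks are 0.5 * (a + b)
expected = np.block([
    [a.toarray(), 0.5 * (a + b).toarray()],
    [0.5 * (b + a).toarray(), b.toarray()],
])
assert np.allclose(omni, expected)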


def _get_omni_matrix(graphs):
"""
Helper function for creating the omnibus matrix.
@@ -25,6 +65,9 @@ def _get_omni_matrix(graphs):
out : 2d-array
Array of shape (n_vertices * n_graphs, n_vertices * n_graphs)
"""
if isspmatrix_csr(graphs[0]):
return _get_omnibus_matrix_sparse(graphs)

shape = graphs[0].shape
n = shape[0] # number of vertices
m = len(graphs) # number of graphs
@@ -163,7 +206,7 @@ def fit(self, graphs, y=None):

Parameters
----------
graphs : list of nx.Graph or ndarray, or ndarray
graphs : list of nx.Graph or ndarray, or csr_matrix
If list of nx.Graph, each Graph must contain same number of nodes.
If list of ndarray, each array must have shape (n_vertices, n_vertices).
If ndarray, then array must have shape (n_graphs, n_vertices, n_vertices).
@@ -173,15 +216,11 @@
self : object
Returns an instance of self.
"""
if any([isspmatrix_csr(g) for g in graphs]):
msg = "OmnibusEmbed does not support scipy.sparse.csr_matrix inputs"
raise TypeError(msg)

graphs = self._check_input_graphs(graphs)

# Check if Abar is connected
if self.check_lcc:
if not is_fully_connected(np.mean(graphs, axis=0)):
if not is_fully_connected(average_matrices(graphs)):
msg = (
"Input graphs are not fully connected. Results may not"
+ "be optimal. You can compute the largest connected component by"
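With the TypeError guard removed and the connectivity check routed through average_matrices, OmnibusEmbed accepts scipy CSR inputs end to end. A minimal usage sketch, not part of the diff, assuming graspologic's er_np simulation and the usual (n_graphs, n_vertices, n_components) output shape for undirected inputs:

import numpy as np
from scipy.sparse import csr_matrix

from graspologic.embed import OmnibusEmbed
from graspologic.simulations import er_np

# Two sparse Erdos-Renyi graphs on the same 100 vertices
graphs = [csr_matrix(er_np(n=100, p=0.2)) for _ in range(2)]

omni = OmnibusEmbed(n_components=3)
embeddings = omni.fit_transform(graphs)

# For undirected inputs this should be one embedding per graph:
# shape (n_graphs, n_vertices, n_components) == (2, 100, 3)
print(embeddings.shape)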
5 changes: 3 additions & 2 deletions graspologic/pipeline/embed/omnibus_embedding.py
@@ -158,6 +158,7 @@ def omnibus_embedding_pairwise(

union_graph_lcc = largest_connected_component(union_graph)
union_graph_lcc_nodes = union_graph_lcc.nodes()

union_node_ids = np.array(list(union_graph_lcc_nodes))

previous_graph = graphs[0].copy()
@@ -261,11 +262,11 @@ def _elbow_cut_if_needed(elbow_cut, is_directed, singular_values, embedding):


def _augment_graph(graph, node_ids, weight_attribute):
graph_as_array = nx.to_numpy_array(
graph_sparse = nx.to_scipy_sparse_matrix(
graph, weight=weight_attribute, nodelist=node_ids
)

graphs_loops_removed = remove_loops(graph_as_array)
graphs_loops_removed = remove_loops(graph_sparse)
graphs_ranked = pass_to_ranks(graphs_loops_removed)
graphs_diag_augmented = augment_diagonal(graphs_ranked)

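For reference, the pipeline entry point that calls _augment_graph is omnibus_embedding_pairwise. A rough usage sketch, not part of the diff, assuming the function is exported from graspologic.pipeline.embed and takes graphs and dimensions arguments as used in the tests below:

import networkx as nx

from graspologic.pipeline.embed import omnibus_embedding_pairwise

g1 = nx.erdos_renyi_graph(50, 0.2, seed=0)
g2 = nx.erdos_renyi_graph(50, 0.2, seed=1)
for g in (g1, g2):
    nx.set_edge_attributes(g, 1.0, "weight")

# Consecutive graph pairs are embedded from a shared omnibus matrix, which this
# change now assembles from scipy sparse matrices rather than dense arrays.
embedding_pairs = omnibus_embedding_pairwise([g1, g2], dimensions=8)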
2 changes: 2 additions & 0 deletions graspologic/utils/__init__.py
@@ -4,6 +4,7 @@
from .ptr import pass_to_ranks
from .utils import (
augment_diagonal,
average_matrices,
binarize,
cartesian_product,
fit_plug_in_variance_estimator,
@@ -26,6 +27,7 @@
)

__all__ = [
"average_matrices",
"import_graph",
"import_edgelist",
"is_symmetric",
24 changes: 24 additions & 0 deletions graspologic/utils/utils.py
@@ -11,6 +11,7 @@
import numpy as np
import pandas as pd
import scipy.sparse
from beartype import beartype
from scipy.optimize import linear_sum_assignment
from scipy.sparse import csgraph, csr_matrix, diags, isspmatrix_csr
from scipy.sparse.csgraph import connected_components
@@ -19,6 +20,29 @@
from sklearn.utils.multiclass import type_of_target, unique_labels


@beartype
def average_matrices(
matrices: Union[np.ndarray, List[Union[np.ndarray, csr_matrix]]]
) -> Union[np.ndarray, csr_matrix]:
"""
Helper method to encapsulate calculating the average of matrices represented either as a
list of numpy.ndarray or a list of scipy.sparse.csr_matrix.

Parameters
----------
matrices: Union[np.ndarray, List[Union[np.ndarray, csr_matrix]]]
The list of matrices to be averaged

Returns
-------
Union[np.ndarray, csr_matrix]
The elementwise average of the input matrices: dense if the inputs were dense, CSR if they were sparse.
"""
if isinstance(matrices[0], np.ndarray):
return np.mean(matrices, axis=0)
elif isspmatrix_csr(matrices[0]):
return sum(matrices) / len(matrices)
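An illustrative check of the helper above, not part of the diff: dense and CSR inputs should average to the same values.

import numpy as np
from scipy.sparse import csr_matrix

from graspologic.utils import average_matrices

dense = [np.random.rand(4, 4) for _ in range(3)]
sparse = [csr_matrix(m) for m in dense]

# Dense input -> ndarray; CSR input -> csr_matrix of the same values
np.testing.assert_allclose(
    average_matrices(dense),
    average_matrices(sparse).toarray(),
)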


def import_graph(graph, copy=True):
"""
A function for reading a graph and returning a shared data type.
24 changes: 24 additions & 0 deletions tests/test_omni.py → tests/embed/test_omni.py
100755 → 100644
@@ -7,6 +7,7 @@
from numpy import array_equal
from numpy.linalg import norm
from numpy.testing import assert_allclose
from scipy.sparse import csr_matrix

from graspologic.embed.omni import OmnibusEmbed, _get_omni_matrix
from graspologic.simulations.simulations import er_nm, er_np
@@ -189,3 +190,26 @@ def run(diag_aug):

run(diag_aug=True)
run(diag_aug=False)

def test_omni_embed_sparse(self):
def compute_bar(arr):
n = arr.shape[0] // 2
return (arr[:n] + arr[n:]) / 2

def run(diag_aug):
X, A1, A2 = generate_data(1000, seed=2)
Abar = (A1 + A2) / 2

omni = OmnibusEmbed(n_components=3, diag_aug=diag_aug)
OmniBar = compute_bar(omni.fit_transform([csr_matrix(A1), csr_matrix(A2)]))

omni = OmnibusEmbed(n_components=3, diag_aug=diag_aug)
ABar = compute_bar(omni.fit_transform([Abar, Abar]))

tol = 1.0e-2
np.testing.assert_allclose(
norm(OmniBar, axis=1), norm(ABar, axis=1), rtol=tol, atol=tol
)

run(diag_aug=True)
run(diag_aug=False)
5 changes: 3 additions & 2 deletions tests/pipeline/embed/test_omnibus_embedding.py
@@ -154,13 +154,14 @@ def test_omnibus_embedding_elbowcuts_none_returns_full_embedding(self):
def test_omnibus_embedding_digraph_elbowcuts_none_returns_full_embedding(self):
dimensions = 100
expected_dimensions = dimensions * 2
number_of_nodes = 1000

g = nx.DiGraph()
for i in range(1000):
for i in range(number_of_nodes):
g.add_edge(1, i, weight=1)

g2 = g.copy()
for i in range(1000):
for i in range(number_of_nodes):
g2.add_edge(i, 1, weight=i)

embeddings = omnibus_embedding_pairwise(
20 changes: 20 additions & 0 deletions tests/test_utils.py
@@ -1,6 +1,7 @@
# Copyright (c) Microsoft Corporation and contributors.
# Licensed under the MIT License.

import random
import unittest
import warnings
from math import sqrt
@@ -14,6 +15,25 @@
from graspologic.utils import utils as gus


class TestAverageMatrices(unittest.TestCase):
def test_mean_dense_and_sparse_are_equivalent(self):
trials = 20

for _ in range(trials):
number_of_graphs = random.randint(2, 10)

dim = random.randint(2, 100)
dim2 = random.randint(2, 100)

graphs = [np.random.rand(dim, dim2) for _ in range(number_of_graphs)]
graphs_sparse = [csr_matrix(graph) for graph in graphs]

graphs_averaged = gus.average_matrices(graphs)
graphs_sparse_averaged = gus.average_matrices(graphs_sparse).todense()

np.testing.assert_almost_equal(graphs_averaged, graphs_sparse_averaged)


class TestInput(unittest.TestCase):
@classmethod
def setUpClass(cls):