
Sparse omni #834

Merged
merged 10 commits on Sep 10, 2021
57 changes: 48 additions & 9 deletions graspologic/embed/omni.py
@@ -2,15 +2,55 @@
# Licensed under the MIT License.

import warnings
from typing import Optional
from typing import List, Optional

import numpy as np
from scipy.sparse import isspmatrix_csr
from beartype import beartype
from scipy.sparse import csr_matrix, hstack, isspmatrix_csr, vstack

from ..utils import import_graph, is_fully_connected
from ..utils import average_matrices, is_fully_connected
from .base import BaseEmbedMulti


@beartype
def _get_omnibus_matrix_sparse(matrices: List[csr_matrix]) -> csr_matrix:
"""
Generate the omnibus matrix from a list of sparse adjacency matrices as described by 'A central limit theorem
for an omnibus embedding of random dot product graphs.'

Given an iterable of matrices a, b, ..., n, the omnibus matrix is defined as::

[[ a, .5 * (a + b), ..., .5 * (a + n)],
[.5 * (b + a), b, ..., .5 * (b + n)],
[ ..., ..., ..., ...],
[.5 * (n + a), .5 * (n + b), ..., n]
]
"""

rows = []

# Build the omnibus matrix one block row at a time
for column_index, column_matrix in enumerate(matrices):
current_row = []

for row_index, row_matrix in enumerate(matrices):
if row_index == column_index:
# On the diagonal: no averaging is needed; use the matrix itself
current_row.append(column_matrix)
else:
# Off the diagonal: use the average of the two matrices
matrices_averaged = (column_matrix + row_matrix) * 0.5
current_row.append(matrices_averaged)

# The block row is complete; horizontally stack its matrices
rows.append(hstack(current_row))

return vstack(rows, format="csr")
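A quick illustration of the block structure described in the docstring above, not part of this diff; it assumes the private helper can be imported from graspologic.embed.omni:

import numpy as np
from scipy.sparse import csr_matrix

from graspologic.embed.omni import _get_omnibus_matrix_sparse

# Two tiny adjacency matrices on the same vertex set
a = csr_matrix(np.array([[0.0, 1.0], [1.0, 0.0]]))
b = csr_matrix(np.array([[0.0, 2.0], [2.0, 0.0]]))

omni = _get_omnibus_matrix_sparse([a, b]).toarray()

# Diagonal blocks are the inputs themselves; off-diagonal blocks are 0.5 * (a + b)
expected = np.block([
    [a.toarray(), 0.5 * (a + b).toarray()],
    [0.5 * (b + a).toarray(), b.toarray()],
])
assert np.allclose(omni, expected)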


def _get_omni_matrix(graphs):
"""
Helper function for creating the omnibus matrix.
@@ -25,6 +65,9 @@ def _get_omni_matrix(graphs):
out : 2d-array
Array of shape (n_vertices * n_graphs, n_vertices * n_graphs)
"""
if isspmatrix_csr(graphs[0]):
return _get_omnibus_matrix_sparse(graphs)

shape = graphs[0].shape
n = shape[0] # number of vertices
m = len(graphs) # number of graphs
@@ -163,7 +206,7 @@ def fit(self, graphs, y=None):

Parameters
----------
graphs : list of nx.Graph or ndarray, or ndarray
graphs : list of nx.Graph or ndarray, or csr_matrix
If list of nx.Graph, each Graph must contain same number of nodes.
If list of ndarray, each array must have shape (n_vertices, n_vertices).
If ndarray, then array must have shape (n_graphs, n_vertices, n_vertices).
@@ -173,15 +216,11 @@
self : object
Returns an instance of self.
"""
if any([isspmatrix_csr(g) for g in graphs]):
msg = "OmnibusEmbed does not support scipy.sparse.csr_matrix inputs"
raise TypeError(msg)

graphs = self._check_input_graphs(graphs)

# Check if Abar is connected
if self.check_lcc:
if not is_fully_connected(np.mean(graphs, axis=0)):
if not is_fully_connected(average_matrices(graphs)):
msg = (
"Input graphs are not fully connected. Results may not"
+ "be optimal. You can compute the largest connected component by"
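With the TypeError guard removed and the connectivity check routed through average_matrices, OmnibusEmbed accepts scipy CSR inputs end to end. A minimal usage sketch, not part of the diff, assuming graspologic's er_np simulation and the usual (n_graphs, n_vertices, n_components) output shape for undirected inputs:

import numpy as np
from scipy.sparse import csr_matrix

from graspologic.embed import OmnibusEmbed
from graspologic.simulations import er_np

# Two sparse Erdos-Renyi graphs on the same 100 vertices
graphs = [csr_matrix(er_np(n=100, p=0.2)) for _ in range(2)]

omni = OmnibusEmbed(n_components=3)
embeddings = omni.fit_transform(graphs)

# For undirected inputs this should be one embedding per graph:
# shape (n_graphs, n_vertices, n_components) == (2, 100, 3)
print(embeddings.shape)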
5 changes: 3 additions & 2 deletions graspologic/pipeline/embed/omnibus_embedding.py
@@ -158,6 +158,7 @@ def omnibus_embedding_pairwise(

union_graph_lcc = largest_connected_component(union_graph)
union_graph_lcc_nodes = union_graph_lcc.nodes()

union_node_ids = np.array(list(union_graph_lcc_nodes))

previous_graph = graphs[0].copy()
@@ -261,11 +262,11 @@ def _elbow_cut_if_needed(elbow_cut, is_directed, singular_values, embedding):


def _augment_graph(graph, node_ids, weight_attribute):
graph_as_array = nx.to_numpy_array(
graph_sparse = nx.to_scipy_sparse_matrix(
graph, weight=weight_attribute, nodelist=node_ids
)

graphs_loops_removed = remove_loops(graph_as_array)
graphs_loops_removed = remove_loops(graph_sparse)
graphs_ranked = pass_to_ranks(graphs_loops_removed)
graphs_diag_augmented = augment_diagonal(graphs_ranked)

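For reference, the pipeline entry point that calls _augment_graph is omnibus_embedding_pairwise. A rough usage sketch, not part of the diff, assuming the function is exported from graspologic.pipeline.embed and takes graphs and dimensions arguments as used in the tests below:

import networkx as nx

from graspologic.pipeline.embed import omnibus_embedding_pairwise

g1 = nx.erdos_renyi_graph(50, 0.2, seed=0)
g2 = nx.erdos_renyi_graph(50, 0.2, seed=1)
for g in (g1, g2):
    nx.set_edge_attributes(g, 1.0, "weight")

# Consecutive graph pairs are embedded from a shared omnibus matrix, which this
# change now assembles from scipy sparse matrices rather than dense arrays.
embedding_pairs = omnibus_embedding_pairwise([g1, g2], dimensions=8)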
2 changes: 2 additions & 0 deletions graspologic/utils/__init__.py
@@ -4,6 +4,7 @@
from .ptr import pass_to_ranks
from .utils import (
augment_diagonal,
average_matrices,
binarize,
cartesian_product,
fit_plug_in_variance_estimator,
@@ -26,6 +27,7 @@
)

__all__ = [
"average_matrices",
"import_graph",
"import_edgelist",
"is_symmetric",
24 changes: 24 additions & 0 deletions graspologic/utils/utils.py
@@ -11,6 +11,7 @@
import numpy as np
import pandas as pd
import scipy.sparse
from beartype import beartype
from scipy.optimize import linear_sum_assignment
from scipy.sparse import csgraph, csr_matrix, diags, isspmatrix_csr
from scipy.sparse.csgraph import connected_components
@@ -19,6 +20,29 @@
from sklearn.utils.multiclass import type_of_target, unique_labels


@beartype
def average_matrices(
matrices: Union[np.ndarray, List[Union[np.ndarray, csr_matrix]]]
) -> Union[np.ndarray, csr_matrix]:
"""
Helper method to encapsulate calculating the average of matrices represented either as a
list of numpy.ndarray or a list of scipy.sparse.csr_matrix.

Parameters
----------
matrices: Union[np.ndarray, List[Union[np.ndarray, csr_matrix]]]
The list of matrices to be averaged

Returns
-------
Union[np.ndarray, csr_matrix]
The elementwise average of the input matrices: dense if the inputs were dense, CSR if they were sparse.
"""
if isinstance(matrices[0], np.ndarray):
return np.mean(matrices, axis=0)
elif isspmatrix_csr(matrices[0]):
return sum(matrices) / len(matrices)
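An illustrative check of the helper above, not part of the diff: dense and CSR inputs should average to the same values.

import numpy as np
from scipy.sparse import csr_matrix

from graspologic.utils import average_matrices

dense = [np.random.rand(4, 4) for _ in range(3)]
sparse = [csr_matrix(m) for m in dense]

# Dense input -> ndarray; CSR input -> csr_matrix of the same values
np.testing.assert_allclose(
    average_matrices(dense),
    average_matrices(sparse).toarray(),
)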


def import_graph(graph, copy=True):
"""
A function for reading a graph and returning a shared data type.
24 changes: 24 additions & 0 deletions tests/test_omni.py → tests/embed/test_omni.py
100755 → 100644
@@ -7,6 +7,7 @@
from numpy import array_equal
from numpy.linalg import norm
from numpy.testing import assert_allclose
from scipy.sparse import csr_matrix

from graspologic.embed.omni import OmnibusEmbed, _get_omni_matrix
from graspologic.simulations.simulations import er_nm, er_np
@@ -189,3 +190,26 @@ def run(diag_aug):

run(diag_aug=True)
run(diag_aug=False)

def test_omni_embed_sparse(self):
def compute_bar(arr):
n = arr.shape[0] // 2
return (arr[:n] + arr[n:]) / 2

def run(diag_aug):
X, A1, A2 = generate_data(1000, seed=2)
Abar = (A1 + A2) / 2

omni = OmnibusEmbed(n_components=3, diag_aug=diag_aug)
OmniBar = compute_bar(omni.fit_transform([csr_matrix(A1), csr_matrix(A2)]))

omni = OmnibusEmbed(n_components=3, diag_aug=diag_aug)
ABar = compute_bar(omni.fit_transform([Abar, Abar]))

tol = 1.0e-2
np.testing.assert_allclose(
norm(OmniBar, axis=1), norm(ABar, axis=1), rtol=tol, atol=tol
)

run(diag_aug=True)
run(diag_aug=False)
5 changes: 3 additions & 2 deletions tests/pipeline/embed/test_omnibus_embedding.py
@@ -154,13 +154,14 @@ def test_omnibus_embedding_elbowcuts_none_returns_full_embedding(self):
def test_omnibus_embedding_digraph_elbowcuts_none_returns_full_embedding(self):
dimensions = 100
expected_dimensions = dimensions * 2
number_of_nodes = 1000

g = nx.DiGraph()
for i in range(1000):
for i in range(number_of_nodes):
g.add_edge(1, i, weight=1)

g2 = g.copy()
for i in range(1000):
for i in range(number_of_nodes):
g2.add_edge(i, 1, weight=i)

embeddings = omnibus_embedding_pairwise(
20 changes: 20 additions & 0 deletions tests/test_utils.py
@@ -1,6 +1,7 @@
# Copyright (c) Microsoft Corporation and contributors.
# Licensed under the MIT License.

import random
import unittest
import warnings
from math import sqrt
@@ -14,6 +15,25 @@
from graspologic.utils import utils as gus


class TestAverageMatrices(unittest.TestCase):
def test_mean_dense_and_sparse_are_equivalent(self):
trials = 20

for _ in range(trials):
number_of_graphs = random.randint(2, 10)

dim = random.randint(2, 100)
dim2 = random.randint(2, 100)

graphs = [np.random.rand(dim, dim2) for _ in range(number_of_graphs)]
graphs_sparse = [csr_matrix(graph) for graph in graphs]

graphs_averaged = gus.average_matrices(graphs)
graphs_sparse_averaged = gus.average_matrices(graphs_sparse).todense()

np.testing.assert_almost_equal(graphs_averaged, graphs_sparse_averaged)


class TestInput(unittest.TestCase):
@classmethod
def setUpClass(cls):