Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parallelize Sampling of LDT #744

Merged
merged 31 commits into from
May 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
ea8f9a7
LPT tutorial render fix attempt
kareef928 Nov 13, 2020
a2c77c3
merge
kareef928 Nov 26, 2020
895d93e
unwanted changes
kareef928 Dec 10, 2020
e05d226
lpt tutorial changes
kareef928 Dec 10, 2020
aa9059d
Merge branch 'dev' of https://github.com/microsoft/graspologic into dev
kareef928 Mar 7, 2021
948de44
parallelize ldt
kareef928 Mar 8, 2021
e62cc99
parallelize ldt
kareef928 Mar 8, 2021
507a98b
functioning parallelization
kareef928 Mar 8, 2021
fc64a64
remove print
kareef928 Mar 15, 2021
e8bd71a
remove comments
kareef928 Apr 1, 2021
0edd979
format fix
kareef928 Apr 4, 2021
eda6730
add parallel test
kareef928 Apr 4, 2021
3f7c310
Merge branch 'dev' into ldt-parallel
bdpedigo Apr 6, 2021
2071206
change seeds array
kareef928 Apr 18, 2021
8d037ca
pull
kareef928 Apr 18, 2021
742d373
pull
kareef928 Apr 27, 2021
ea6f8b5
add random_state param
kareef928 May 5, 2021
96e0e91
change workers to n_jobs
kareef928 May 6, 2021
5814892
Merge branch 'dev' into ldt-parallel
bdpedigo May 6, 2021
8a7d80e
change back to workers
kareef928 May 6, 2021
6ed928c
Merge branch 'ldt-parallel' of https://github.com/kareef928/graspolog…
kareef928 May 6, 2021
c9cbbc7
fix conflict
kareef928 May 10, 2021
c816c42
add missing imports
kareef928 May 10, 2021
709942d
remove unused warnings import
bdpedigo May 17, 2021
df7e801
allow for none in type checking
bdpedigo May 17, 2021
db44da7
update description for workers
bdpedigo May 17, 2021
817d08b
black
bdpedigo May 17, 2021
18aaea2
Merge branch 'dev' into ldt-parallel
bdpedigo May 19, 2021
907a07e
Merge branch 'dev' into ldt-parallel
bdpedigo May 19, 2021
ad04089
try only generating N seeds
bdpedigo May 19, 2021
bcc4466
Merge branch 'dev' into ldt-parallel
bdpedigo May 19, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
52 changes: 39 additions & 13 deletions graspologic/inference/latent_distribution_test.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
# Copyright (c) Microsoft Corporation and contributors.
# Licensed under the MIT License.

import warnings
from collections import namedtuple

import numpy as np
from hyppo.ksample import KSample
from scipy import stats
from sklearn.metrics.pairwise import PAIRED_DISTANCES, PAIRWISE_KERNEL_FUNCTIONS
from sklearn.utils import check_array
from sklearn.utils import check_array, check_random_state
from joblib import Parallel, delayed

from ..align import SeedlessProcrustes, SignFlips
from ..embed import AdjacencySpectralEmbed, select_dimension
from ..utils import fit_plug_in_variance_estimator, import_graph


_VALID_DISTANCES = list(PAIRED_DISTANCES.keys())
_VALID_KERNELS = list(PAIRWISE_KERNEL_FUNCTIONS.keys())
_VALID_KERNELS.append("gaussian") # can use hyppo's medial gaussian kernel too
Expand All @@ -31,7 +32,8 @@ def latent_distribution_test(
metric="euclidean",
n_components=None,
n_bootstraps=500,
workers=1,
random_state=None,
workers=None,
size_correction=True,
pooled=False,
align_type="sign_flips",
Expand Down Expand Up @@ -101,9 +103,23 @@ def latent_distribution_test(
Number of bootstrap iterations for the backend hypothesis test.
See :class:`hyppo.ksample.KSample` for more information.

workers : int (default=1)
random_state : {None, int, `~np.random.RandomState`, `~np.random.Generator`}
This parameter defines the object to use for drawing random
variates.
If `random_state` is ``None`` the `~np.random.RandomState` singleton is
used.
If `random_state` is an int, a new ``RandomState`` instance is used,
seeded with `random_state`.
If `random_state` is already a ``RandomState`` or ``Generator``
instance, then that object is used.
Default is None.

workers : int or None (default=None)
Number of workers to use. If more than 1, parallelizes the code.
Supply -1 to use all cores available to the Process.
Supply -1 to use all cores available. None is a marker for
'unset' that will be interpreted as ``workers=1`` (sequential execution) unless
the call is performed under a Joblib parallel_backend context manager that sets
another value for ``workers``. See :class:joblib.Parallel for more details.

size_correction : bool (default=True)
Ignored when the two graphs have the same number of vertices. The test
Expand Down Expand Up @@ -262,8 +278,8 @@ def latent_distribution_test(
raise ValueError(msg.format(n_bootstraps))

# check workers argument
if not isinstance(workers, int):
msg = "workers must be an int, not {}".format(type(workers))
if workers is not None and not isinstance(workers, (int, np.integer)):
msg = "workers must be an int or None, not {}".format(type(workers))
raise TypeError(msg)

# check size_correction argument
Expand Down Expand Up @@ -360,7 +376,9 @@ def latent_distribution_test(
Q = np.identity(X1_hat.shape[0])

if size_correction:
X1_hat, X2_hat = _sample_modified_ase(X1_hat, X2_hat, pooled=pooled)
X1_hat, X2_hat = _sample_modified_ase(
X1_hat, X2_hat, workers=workers, random_state=random_state, pooled=pooled
)

test_obj = KSample(test, compute_distkern=metric)

Expand Down Expand Up @@ -402,7 +420,7 @@ def _embed(A1, A2, n_components):
return X1_hat, X2_hat


def _sample_modified_ase(X, Y, pooled=False):
def _sample_modified_ase(X, Y, workers, random_state, pooled=False):
N, M = len(X), len(Y)

# return if graphs are same order, else ensure X the larger graph.
Expand All @@ -422,12 +440,20 @@ def _sample_modified_ase(X, Y, pooled=False):
else:
get_sigma = fit_plug_in_variance_estimator(X)
X_sigmas = get_sigma(X) * (N - M) / (N * M)

# increase the variance of X by sampling from the asy dist
X_sampled = np.zeros(X.shape)
# TODO may be parallelized, but requires keeping track of random state
for i in range(N):
X_sampled[i, :] = X[i, :] + stats.multivariate_normal.rvs(cov=X_sigmas[i])
rng = check_random_state(random_state)
X_sampled = np.asarray(
Parallel(n_jobs=workers)(
delayed(add_variance)(X[i, :], X_sigmas[i], r)
for i, r in zip(range(N), rng.randint(np.iinfo(np.int32).max, size=N))
)
)

# return the embeddings in the appropriate order
return (Y, X_sampled) if reverse_order else (X_sampled, Y)


def add_variance(X_orig, X_sigma, seed):
np.random.seed(seed)
return X_orig + stats.multivariate_normal.rvs(cov=X_sigma)
9 changes: 9 additions & 0 deletions tests/test_latentdistributiontest.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def test_bad_kwargs(self):
# check workers argument
with pytest.raises(TypeError):
latent_distribution_test(A1, A2, workers=0.5)
latent_distribution_test(A1, A2, workers="oops")
# check size_correction argument
with pytest.raises(TypeError):
latent_distribution_test(A1, A2, size_correction=0)
Expand Down Expand Up @@ -207,11 +208,19 @@ def test_SBM_dcorr(self):
A1 = sbm(2 * [b_size], B1)
A2 = sbm(2 * [b_size], B1)
A3 = sbm(2 * [b_size], B2)

# non-parallel test
ldt_null = latent_distribution_test(A1, A2)
ldt_alt = latent_distribution_test(A1, A3)
self.assertTrue(ldt_null[0] > 0.05)
self.assertTrue(ldt_alt[0] <= 0.05)

# parallel test
ldt_null = latent_distribution_test(A1, A2, workers=-1)
ldt_alt = latent_distribution_test(A1, A3, workers=-1)
self.assertTrue(ldt_null[0] > 0.05)
self.assertTrue(ldt_alt[0] <= 0.05)

def test_different_sizes_null(self):
np.random.seed(314)

Expand Down