Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option for more than one kmeans init to autogmm #662

Merged
merged 29 commits into from
Aug 24, 2021
Merged
Changes from 12 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
92caafc
Merge gclust to autogmm
PerifanosPrometheus Feb 9, 2021
5221c0d
Fixes import error of check_scalar
PerifanosPrometheus Feb 9, 2021
ab7e582
Fixes formatting error
PerifanosPrometheus Feb 9, 2021
d2d6361
Merge branch 'dev' into merge_gclust_autogmm
bdpedigo Feb 9, 2021
60f957e
Fixed edit of process_param (forgot to add a change in gm_params)
PerifanosPrometheus Feb 16, 2021
2d16045
Merge branch 'merge_gclust_autogmm' of github.com:PerifanosPrometheus…
PerifanosPrometheus Feb 16, 2021
774d14d
Perform and append multiple k-means initialization runs rather than
PerifanosPrometheus Feb 23, 2021
7206c19
Black formatting
PerifanosPrometheus Feb 23, 2021
db77c16
Fix docstring
PerifanosPrometheus Mar 9, 2021
f854528
Remove redundant boolean condition
PerifanosPrometheus Mar 23, 2021
76cbf0c
Returns n_init rows instead of just one row
PerifanosPrometheus Apr 8, 2021
498b2ed
Generates different seeds for every run
PerifanosPrometheus Apr 13, 2021
767913f
Simplify code & modify docs
PerifanosPrometheus Apr 20, 2021
8f0b78e
Black formatting of code
PerifanosPrometheus Apr 20, 2021
f461b13
Merge branch 'dev' into merge_gclust_autogmm
bdpedigo Apr 20, 2021
a827508
Merge branch 'dev' into merge_gclust_autogmm
bdpedigo Apr 21, 2021
42bbad4
Edited docs and moved back appending in process_paramgrid
PerifanosPrometheus Apr 29, 2021
5ba0c39
Merge branch 'merge_gclust_autogmm' of github.com:PerifanosPrometheus…
PerifanosPrometheus Apr 29, 2021
0c52594
Merge branch 'dev' into merge_gclust_autogmm
bdpedigo Apr 29, 2021
9c72792
Update code to sklearn PR
PerifanosPrometheus Apr 29, 2021
b5fd398
Update code to sklearn PR
PerifanosPrometheus Apr 29, 2021
94ea4bf
Merge branch 'merge_gclust_autogmm' of github.com:PerifanosPrometheus…
PerifanosPrometheus Apr 29, 2021
708fc14
change n_init to kmeans_n_init
tliu68 May 2, 2021
f452ef5
fix format
tliu68 May 2, 2021
258d611
Fixes run error related to switch from n_init to kmeans_n_init
PerifanosPrometheus May 4, 2021
facd1a5
Add test multiple_kmeans_inits
PerifanosPrometheus May 4, 2021
0fa852e
fix formatting
tliu68 May 5, 2021
8110caf
Merge branch 'dev' into merge_gclust_autogmm
tliu68 Aug 24, 2021
0c998d5
fix failing tests
tliu68 Aug 24, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
61 changes: 52 additions & 9 deletions graspologic/cluster/autogmm.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
from joblib import Parallel, delayed
import warnings

from sklearn.utils import check_scalar

from .base import BaseCluster


Expand Down Expand Up @@ -117,6 +119,11 @@ class AutoGMMCluster(BaseCluster):
If provided, min_components and ``max_components`` must match the number of
unique labels given here.

n_init : int, optional (default = None)
tliu68 marked this conversation as resolved.
Show resolved Hide resolved
If `n_init` is not `None` and `label_init` is `None` then additional
k-means runs will be performed with `n_init` initializations for all covariance
tliu68 marked this conversation as resolved.
Show resolved Hide resolved
parameters in `covariance_type`.

max_iter : int, optional (default = 100).
The maximum number of EM iterations to perform.

Expand Down Expand Up @@ -221,6 +228,7 @@ def __init__(
selection_criteria="bic",
max_agglom_size=2000,
n_jobs=None,
n_init=None,
):
if isinstance(min_components, int):
if min_components <= 0:
Expand Down Expand Up @@ -363,6 +371,13 @@ def __init__(
if max_agglom_size is not None and max_agglom_size < 2:
raise ValueError("Must use at least 2 points for `max_agglom_size`")

if n_init is not None:
check_scalar(x=n_init, name="n_init", target_type=int, min_val=1)

run_multiple_init = False
if n_init is not None and label_init is None:
run_multiple_init = True
tliu68 marked this conversation as resolved.
Show resolved Hide resolved

self.min_components = min_components
self.max_components = max_components
self.affinity = affinity
Expand All @@ -375,8 +390,10 @@ def __init__(
self.selection_criteria = selection_criteria
self.max_agglom_size = max_agglom_size
self.n_jobs = n_jobs
self.n_init = n_init
self.run_multiple_init = run_multiple_init

def _fit_cluster(self, X, X_subset, y, params, agg_clustering):
def _fit_cluster(self, X, X_subset, y, params, agg_clustering, seed):
label_init = self.label_init
if label_init is not None:
onehot = _labels_to_onehot(label_init)
Expand All @@ -401,6 +418,7 @@ def _fit_cluster(self, X, X_subset, y, params, agg_clustering):
gm_params["init_params"] = "kmeans"
gm_params["reg_covar"] = 0
gm_params["max_iter"] = self.max_iter
gm_params["random_state"] = seed

criter = np.inf # if none of the iterations converge, bic/aic is set to inf
# below is the regularization scheme
Expand Down Expand Up @@ -515,10 +533,17 @@ def fit(self, X, y=None):
linkage=self.linkage,
covariance_type=self.covariance_type,
n_components=range(lower_ncomponents, upper_ncomponents + 1),
random_state=[self.random_state],
)
param_grid = list(ParameterGrid(param_grid))
param_grid_ag, param_grid = _process_paramgrid(param_grid)
param_grid_ag, param_grid = _process_paramgrid(
param_grid, self.run_multiple_init, self.n_init
)

if isinstance(self.random_state, int):
np.random.seed(self.random_state)
seeds = np.random.randint(1e8, size=len(param_grid))
else:
seeds = [self.random_state]*len(param_grid)

n = X.shape[0]
if self.max_agglom_size is None or n <= self.max_agglom_size:
Expand All @@ -540,17 +565,17 @@ def fit(self, X, y=None):
)
ag_labels.append(hierarchical_labels)

def _fit_for_data(p):
def _fit_for_data(p, seed):
n_clusters = p[1]["n_components"]
if (p[0]["affinity"] != "none") and (self.label_init is None):
index = param_grid_ag.index(p[0])
agg_clustering = ag_labels[index][:, n_clusters - self.min_components]
else:
agg_clustering = []
return self._fit_cluster(X, X_subset, y, p, agg_clustering)
return self._fit_cluster(X, X_subset, y, p, agg_clustering, seed)

results = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
delayed(_fit_for_data)(p) for p in param_grid
delayed(_fit_for_data)(p, seed) for p, seed in zip(param_grid, seeds)
)
results = pd.DataFrame(results)

Expand Down Expand Up @@ -647,7 +672,7 @@ def _labels_to_onehot(labels):
return onehot


def _process_paramgrid(paramgrid):
def _process_paramgrid(paramgrid, run_multiple_init, n_init):
"""
Removes combinations of affinity and linkage that are not possible.

Expand All @@ -657,6 +682,13 @@ def _process_paramgrid(paramgrid):
Each dict has the keys 'affinity', 'covariance_type', 'linkage',
'n_components', and 'random_state'

run_multiple_init : bool
If True, run additional n_init k_means initializations.
Else, no multiple k-means initialization is performed.

n_init : int, defaults to None.
The number of k-means initializations to perform if run_multiple_init is True

Returns
-------
paramgrid_processed : list pairs of dicts
Expand All @@ -665,9 +697,10 @@ def _process_paramgrid(paramgrid):
ag_paramgrid_processed : list of dicts
options for AgglomerativeClustering
"""
gm_keys = ["covariance_type", "n_components", "random_state"]
gm_keys = ["covariance_type", "n_components"]
ag_keys = ["affinity", "linkage"]
ag_params_processed = []
gm_params_processed = []
paramgrid_processed = []

for params in paramgrid:
Expand All @@ -687,8 +720,18 @@ def _process_paramgrid(paramgrid):
ag_params = {key: params[key] for key in ag_keys}
if ag_params not in ag_params_processed:
ag_params_processed.append(ag_params)
if (
gm_params not in gm_params_processed
and ag_params["affinity"] == "none"
and run_multiple_init
):
for _ in range(n_init):
gm_params_processed.append(gm_params.copy())
gm_params_processed[-1].update({"n_init": 1})

paramgrid_processed.append([ag_params, gm_params])
for pa in gm_params_processed:
tliu68 marked this conversation as resolved.
Show resolved Hide resolved
paramgrid_processed.append([{"affinity": "none", "linkage": "none"}, pa])
return ag_params_processed, paramgrid_processed


Expand All @@ -712,4 +755,4 @@ def _hierarchical_labels(children, min_components, max_components):
hierarchical_labels[:, i], return_inverse=True
)

return hierarchical_labels[:, ::-1]
return hierarchical_labels[:, ::-1]