-
Notifications
You must be signed in to change notification settings - Fork 239
/
kmedoids.py
147 lines (117 loc) · 4.99 KB
/
kmedoids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""K-medoids clustering.
Lead author: Hadi Zaatiti.
"""
import logging
from sklearn.base import BaseEstimator, ClusterMixin
import geomstats.backend as gs
from geomstats.learning._template import TransformerMixin
class RiemannianKMedoids(TransformerMixin, ClusterMixin, BaseEstimator):
"""Class for K-medoids clustering on manifolds.
K-medoids algorithm using Riemannian manifolds.
Parameters
----------
metric : object of class RiemannianMetric
The geomstats Riemmanian metric associate to the space used.
n_clusters : int
Number of clusters (k value of k-medoids).
Optional, default: 8.
init : str
How to initialize centroids at the beginning of the algorithm. The
choice 'random' will select training points as initial centroids
uniformly at random.
Optional, default: 'random'.
n_jobs : int
Number of jobs to run in parallel. `-1` means using all processors.
Optional, default: 1.
Example
-------
Available example on the Poincaré Ball and Hypersphere manifolds
:mod:`examples.plot_kmedoids_manifolds`
"""
def __init__(self, metric, n_clusters=8, init="random", n_jobs=1):
self.metric = metric
self.n_clusters = n_clusters
self.init = init
self.cluster_centers_ = None
self.labels_ = None
self.medoid_indices_ = None
self.n_jobs = n_jobs
def _initialize_medoids(self, distances):
"""Select initial medoids when beginning clustering."""
if self.init == "random":
medoids = gs.random.choice(gs.arange(len(distances)), self.n_clusters)
else:
logging.error("Unknown initialization method.")
return medoids
def fit(self, data, max_iter=100):
"""Provide clusters centroids and data labels.
Labels data by minimizing the distance between data points
and cluster centroids chosen from the data points.
Minimization is performed by swapping the centroids and data points.
Parameters
----------
data : array-like, shape=[n_samples, dim]
Training data, where n_samples is the number of samples and
dim is the number of dimensions.
max_iter : int
Maximum number of iterations.
Optional, default: 100.
Returns
-------
self : array-like, shape=[n_clusters,]
Centroids.
"""
distances = self.metric.dist_pairwise(data, n_jobs=self.n_jobs)
medoids_indices = self._initialize_medoids(distances)
for iteration in range(max_iter):
old_medoids_indices = gs.copy(medoids_indices)
labels = gs.argmin(distances[medoids_indices, :], axis=0)
self._update_medoid_indexes(distances, labels, medoids_indices)
if gs.all(old_medoids_indices == medoids_indices):
break
if iteration == max_iter - 1:
logging.warning(
"Maximum number of iteration reached before "
"convergence. Consider increasing max_iter to "
"improve the fit."
)
self.cluster_centers_ = data[medoids_indices]
self.labels_ = labels
self.medoid_indices_ = medoids_indices
return self.cluster_centers_
def _update_medoid_indexes(self, distances, labels, medoid_indices):
for cluster in range(self.n_clusters):
cluster_index = gs.where(labels == cluster)[0]
if len(cluster_index) == 0:
logging.warning("One cluster is empty.")
continue
in_cluster_distances = distances[
cluster_index, gs.expand_dims(cluster_index, axis=-1)
]
in_cluster_all_costs = gs.sum(in_cluster_distances, axis=1)
min_cost_index = gs.argmin(in_cluster_all_costs)
min_cost = in_cluster_all_costs[min_cost_index]
current_cost = in_cluster_all_costs[
gs.argmax(cluster_index == medoid_indices[cluster])
]
if min_cost < current_cost:
medoid_indices[cluster] = cluster_index[min_cost_index]
def predict(self, data):
"""Predict the closest cluster for each sample in X.
Parameters
----------
data : array-like, shape=[n_samples, dim,]
Training data, where n_samples is the number of samples and
dim is the number of dimensions.
Returns
-------
labels : array-like, shape=[n_samples,]
Index of the cluster each sample belongs to.
"""
labels = gs.zeros(len(data))
for point_index, point_value in enumerate(data):
distances = gs.zeros(len(self.cluster_centers_))
for cluster_index, cluster_value in enumerate(self.cluster_centers_):
distances[cluster_index] = self.metric.dist(point_value, cluster_value)
labels[point_index] = gs.argmin(distances)
return labels