(ns fastmath.clustering
  "Clustering algorithms.
  Various clustering algorithms backed by the SMILE library.
  Currently implemented: only partition clustering.
  ### Input data
  Input is always a sequence of n-sized samples, each itself a sequence.
  For example, 2d samples: `[[1 2] [2 2] [3 3] ...]`
  For 1d data you can pass a sequence of numbers or a sequence of 1-element seqs of numbers:
  ```clojure
  [1 2 3]
  ;; or
  [[1] [2] [3]]
  ```
  ### Distances
  Some of the methods use distance functions. Currently supported are:
  * `:euclidean`
  * `:manhattan`
  * `:chebyshev`
  ### Output
  Every function returns a record which contains:
  * `:type` - name of the method used
  * `:data` - input data
  * `:clustering` - sequence of cluster ids
  * `:sizes` - sizes of clusters
  * `:clusters` - number of clusters
  * `:predict` - predicting function (see below), qualifies an additional sample
  * `:representatives` - list of centroids or medoids, if available
  * `:info` - additional statistics for your samples (like distortion)
  * `:obj` - SMILE object
  A cluster id is an integer ranging from 0 to the number of clusters minus 1. Some methods mark outliers with [[outlier-id]].
  The record acts as a function and can qualify an additional sample by calling the `:predict` function, for example (`data` is a sequence of 3d samples):
  ```clojure
  (let [cl (k-means data 10)] (cl [0 1 2]))
  ```
  See [[k-means]].
  #### Regrouping
  A clustering record can be regrouped into a list of individual clusters. Call [[regroup]] to get a list of maps with the following structure:
  * `:key` - cluster id
  * `:data` - samples which belong to the cluster
  * `:outliers?` - does it contain outliers or not
  * `:representative` - centroid/medoid, or the average vector if the former is not available
  * `:size` - size of the cluster"
  (:require [fastmath.core :as m]
            [fastmath.vector :as v]
            [clojure.string :as s]
            [fastmath.stats :as stat])
  (:import [smile.clustering Clustering KMeans GMeans XMeans DeterministicAnnealing DENCLUE CLARANS DBSCAN MEC]
           [smile.math.distance ChebyshevDistance EuclideanDistance ManhattanDistance CorrelationDistance JensenShannonDistance]
           [smile.vq NeuralGas]
           [clojure.lang IFn]))
(set! *warn-on-reflection* false)
(def ^:const ^{:doc "Id of the cluster which contains outliers."} outlier-id Clustering/OUTLIER)
(defrecord ClusteringResult [type data clustering sizes clusters predict representatives info obj]
  IFn
  (invoke [_ in] (predict in)))
(defn- structurize
  "Pack the result of a clustering function into a `ClusteringResult` record."
  [type data in repr info]
  (->ClusteringResult type
                      data
                      (seq (.getClusterLabel in))
                      (seq (.getClusterSize in))
                      (.getNumClusters in)
                      (fn [x] (.predict in (m/seq->double-array x)))
                      (when repr (m/double-double-array->seq (repr in)))
                      (into {} (map (fn [[k v]] [k (v in)]) info))
                      in))
(def ^:private clustering-classes {:k-means ['KMeans 'centroids 'distortion]
                                   :g-means ['GMeans 'centroids 'distortion]
                                   :x-means ['XMeans 'centroids 'distortion]
                                   :deterministic-annealing ['DeterministicAnnealing 'centroids 'distortion 'getAlpha]
                                   :neural-gas ['NeuralGas 'centroids 'distortion]
                                   :denclue ['DENCLUE nil 'getSigma]
                                   :clarans ['CLARANS 'medoids 'distortion 'getMaxNeighbor 'getNumLocalMinima]
                                   :dbscan ['DBSCAN nil 'getMinPts 'getRadius]
                                   :mec ['MEC nil 'entropy]})
(def ^{:doc "List of clustering methods."} clustering-methods-list (keys clustering-classes))
(def ^:private distances {:chebyshev (ChebyshevDistance.)
                          :euclidean (EuclideanDistance.)
                          :manhattan (ManhattanDistance.)})
(def ^{:doc "List of distances used in some clustering methods."} distances-list (keys distances))
(defn- symbol->keyword
  "Convert a Java method name into a keyword, stripping a leading `get`."
  [s]
  (let [s (name s)]
    (-> (if (= "get" (subs s 0 3)) (subs s 3) s)
        (s/lower-case)
        (keyword))))
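;; Illustrative conversions (derived from the rules above):
;; (symbol->keyword 'getAlpha)   ;; => :alpha
;; (symbol->keyword 'distortion) ;; => :distortion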
(defmacro ^:private clustering
  "Analyze the clustering method and pack the result into the record."
  [clustering-method data & params]
  (let [[nm repr & rest] (clustering-classes clustering-method)
        repr (when repr `(fn [obj#] (. obj# ~repr)))
        info (mapv #(vector (symbol->keyword %) `(fn [obj#] (. obj# ~%))) rest)]
    `(structurize ~clustering-method ~data (new ~nm (m/seq->double-double-array ~data) ~@params) ~repr ~info)))
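;; Expansion sketch (gensyms simplified), derived from `clustering-classes` above;
;; e.g. (clustering :k-means data 3) expands roughly to:
;; (structurize :k-means data
;;              (new KMeans (m/seq->double-double-array data) 3)
;;              (fn [obj] (. obj centroids))
;;              [[:distortion (fn [obj] (. obj distortion))]])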
(defn k-means
  "K-Means++ algorithm.
  Input:
  * data - sequence of samples
  * clusters - number of clusters
  * max-iter (optional) - maximum number of iterations
  * runs (optional) - maximum number of runs
  See more in [SMILE doc](https://haifengl.github.io/smile/api/java/smile/clustering/KMeans.html)"
  ([data clusters] (clustering :k-means data clusters))
  ([data clusters max-iter] (clustering :k-means data clusters max-iter))
  ([data clusters max-iter runs] (clustering :k-means data clusters max-iter runs)))
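;; A minimal usage sketch; the data values below are illustrative only:
(comment
  (let [data [[1 1] [1.5 1] [5 5] [5.5 5] [9 9] [9 8.5]]
        cl (k-means data 3)]
    (:clustering cl)      ;; cluster id for each input sample
    (:representatives cl) ;; the three centroids
    (cl [1.2 1.1])))      ;; record is callable - predicts the cluster of a new sample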
(defn g-means
  "G-Means algorithm.
  Input:
  * data - sequence of samples
  * max-clusters - maximum number of clusters
  See more in [SMILE doc](https://haifengl.github.io/smile/api/java/smile/clustering/GMeans.html)"
  [data max-clusters] (clustering :g-means data max-clusters))
(defn x-means
  "X-Means algorithm.
  Input:
  * data - sequence of samples
  * max-clusters - maximum number of clusters
  See more in [SMILE doc](https://haifengl.github.io/smile/api/java/smile/clustering/XMeans.html)"
  [data max-clusters] (clustering :x-means data max-clusters))
(defn deterministic-annealing
  "Deterministic Annealing algorithm.
  Input:
  * data - sequence of samples
  * max-clusters - maximum number of clusters
  * alpha (optional) - temperature decreasing factor (a value between 0 and 1)
  See more in [SMILE doc](https://haifengl.github.io/smile/api/java/smile/clustering/DeterministicAnnealing.html)"
  ([data max-clusters] (clustering :deterministic-annealing data max-clusters))
  ([data max-clusters alpha] (clustering :deterministic-annealing data max-clusters alpha)))
(defn neural-gas
  "Neural Gas algorithm.
  Input:
  * data - sequence of samples
  * clusters - number of clusters
  Optional:
  * lambda-i - initial lambda value (soft learning radius/rate)
  * lambda-f - final lambda value
  * eps-i - initial epsilon value (learning rate)
  * eps-f - final epsilon value
  * steps - number of iterations
  See more in [SMILE doc](https://haifengl.github.io/smile/api/java/smile/vq/NeuralGas.html)"
  ([data clusters] (clustering :neural-gas data clusters))
  ([data clusters lambda-i lambda-f eps-i eps-f steps] (clustering :neural-gas data clusters lambda-i lambda-f eps-i eps-f steps)))
(defn denclue
  "DENsity CLUstering algorithm.
  Input:
  * data - sequence of samples
  * sigma - gaussian kernel parameter
  * m - number of selected samples, should be much smaller than the number of all samples
  See more in [SMILE doc](https://haifengl.github.io/smile/api/java/smile/clustering/DENCLUE.html)"
  [data sigma m] (clustering :denclue data sigma m))
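;; Sketch; `sigma` and `m` values below are illustrative, not recommendations:
(comment
  (denclue [[1 1] [1.2 1] [5 5] [5.2 5.1]] 1.0 2))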
(defn clarans
  "Clustering Large Applications based upon RANdomized Search algorithm.
  Input:
  * data - sequence of samples
  * clusters - number of clusters
  Optional:
  * dist - distance method, default `:euclidean`
  * max-neighbor - maximum number of neighbors checked during random search
  * num-local - number of local minima to search for
  See more in [SMILE doc](https://haifengl.github.io/smile/api/java/smile/clustering/CLARANS.html)"
  ([data clusters] (clarans data :euclidean clusters))
  ([data dist clusters] (clustering :clarans data (distances dist) clusters))
  ([data dist clusters max-neighbor] (clustering :clarans data (distances dist) clusters max-neighbor))
  ([data dist clusters max-neighbor num-local] (clustering :clarans data (distances dist) clusters max-neighbor num-local)))
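;; Sketch: CLARANS with an explicit distance; any key from `distances-list` works
;; (data values illustrative only):
(comment
  (clarans [[0 0] [0 1] [10 10] [10 11]] :manhattan 2))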
(defn dbscan
  "Density-Based Spatial Clustering of Applications with Noise algorithm.
  Input:
  * data - sequence of samples
  * dist (optional) - distance method, default `:euclidean`
  * min-pts - minimum number of neighbors
  * radius - the neighborhood radius
  See more in [SMILE doc](https://haifengl.github.io/smile/api/java/smile/clustering/DBSCAN.html)"
  ([data min-pts radius] (dbscan data :euclidean min-pts radius))
  ([data dist min-pts ^double radius] (clustering :dbscan data (distances dist) (int min-pts) radius)))
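;; Sketch: DBSCAN marks noise samples with [[outlier-id]]; `min-pts` and `radius`
;; below are illustrative only:
(comment
  (let [cl (dbscan [[0 0] [0 0.5] [0.5 0] [10 10] [10 10.5] [50 50]] 2 1.0)]
    (map #(== outlier-id %) (:clustering cl)))) ;; which samples are outliers?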
(defn mec
  "Nonparametric Minimum Conditional Entropy Clustering algorithm.
  Input:
  * data - sequence of samples
  * dist (optional) - distance method, default `:euclidean`
  * max-clusters - maximum number of clusters
  * radius - the neighborhood radius
  See more in [SMILE doc](https://haifengl.github.io/smile/api/java/smile/clustering/MEC.html)"
  ([data max-clusters radius] (mec data :euclidean max-clusters radius))
  ([data dist max-clusters ^double radius] (clustering :mec data (distances dist) max-clusters radius)))
(defn regroup
  "Transform a clustering result into a list of clusters as separate maps.
  Every map contains:
  * `:key` - cluster id
  * `:data` - samples which belong to the cluster
  * `:outliers?` - does it contain outliers or not
  * `:representative` - centroid/medoid, or the average vector if the former is not available
  * `:size` - size of the cluster
  The representative is always an n-dimensional sequence, even if the input is a list of numbers.
  Empty clusters are skipped."
  [clustered-data]
  (let [mvector? (satisfies? v/VectorProto (first (:data clustered-data))) ;; required to fix a missing representative
        mseqable? (sequential? (first (:data clustered-data)))]
    (for [[k lst] (group-by first (map vector (:clustering clustered-data) (:data clustered-data)))
          :let [d (map second lst)
                outliers? (== k outlier-id)
                r (:representatives clustered-data)]]
      {:key k
       :outliers? outliers?
       :data d
       :representative (if (and r (not outliers?))
                         (nth r k)
                         (cond ;; if the representative is missing, calculate an average
                           mvector? (v/average-vectors d)
                           mseqable? (v/average-vectors (map vec d))
                           :else (list (stat/mean d))))
       :size (if outliers?
               (count d)
               (nth (:sizes clustered-data) k))})))
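;; Sketch: regroup a clustering result into per-cluster maps (illustrative data):
(comment
  (->> (k-means [[1 1] [1 2] [8 8] [9 8]] 2)
       (regroup)
       (map (juxt :key :size :representative))))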