/
hdbscan.R
232 lines (223 loc) · 10.6 KB
/
hdbscan.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
#' hdbscan
#'
#' Implementation of the hdbscan algorithm.
#'
#' @param edges An edge matrix of the type returned by \code{\link{buildEdgeMatrix}} or, alternatively, a \code{largeVis} object.
#' @param neighbors An adjacency matrix of the type returned by \code{\link{randomProjectionTreeSearch}}. Must be specified unless
#' \code{edges} is a \code{largeVis} object.
#' @param minPts The minimum number of points in a cluster.
#' @param K The number of points in the core neighborhood. (See details.)
#' @param threads Maximum number of threads. Determined automatically if \code{NULL} (the default). It is unlikely that
#' this parameter should ever need to be adjusted. It is only available to make it possible to abide by the CRAN limitation that no package
#' use more than two cores.
#' @param verbose Verbosity.
#'
#' @details The hyperparameter \code{K} controls the size of core neighborhoods.
#' The algorithm measures the density around a point as 1 / the distance between
#' that point and its \eqn{k}th nearest neighbor. A low value of \code{K} is similar
#' to clustering nearest neighbors rather than based on density. A high value of
#' \code{K} may cause the algorithm to miss some (usually contrived) clustering
#' patterns, such as where clusters are made up of points arranged in lines to form
#' shapes.
#'
#' The function must be provided sufficient nearest-neighbor data for whatever
#' is specified for \eqn{k}. If \eqn{k} = 5, for example, the edge matrix (and
#' neighbor matrix, if specified) must contain data on at least 5 neighbors for
#' each point. This should not be problematic in typical use in connection with
#' \code{\link{largeVis}}, which is ordinarily run with a far higher \eqn{k}-value
#' than hdbscan.
#'
#' @return An object of type \code{hdbscan} with the following fields:
#' \describe{
#' \item{'clusters'}{A vector of the cluster membership for each vertex. Outliers
#' are given \code{NA}}
#' \item{'probabilities'}{A vector of the degree of each vertex's membership. This
#' is calculated by standardizing each vertex's \eqn{lambda_p} against the \eqn{lambda_birth}
#' and \eqn{lambda_death} of the cluster.}
#' \item{'glosh'}{A vector of GLOSH outlier scores for each node assigned to a cluster. NA for nodes not
#' in a cluster.}
#' \item{'tree'}{The minimum spanning tree used to generate the clustering.}
#' \item{'hierarchy'}{A representation of the condensed cluster hierarchy.}
#' \item{'call'}{The call.}
#' }
#'
#' The hierarchy describes the complete post-condensation structure of the tree:
#' \describe{
#' \item{'nodemembership'}{The cluster ID of the vertex's immediate parent, after condensation.}
#' \item{'lambda'}{\eqn{\lambda_p} for each vertex.}
#' \item{'parent'}{The cluster ID of each cluster's parent.}
#' \item{'stability'}{The cluster's stability, taking into account child-node stabilities.}
#' \item{'selected'}{Whether the cluster was selected.}
#' \item{'coredistances'}{The core distance determined for each vertex.}
#' \item{'lambda_birth'}{\eqn{\lambda_b} for each cluster.}
#' \item{'lambda_death'}{\eqn{\lambda_d} for each cluster.}
#' }
#'
#' @references R. Campello, D. Moulavi, and J. Sander, Density-Based Clustering Based on Hierarchical Density Estimates In: Advances in Knowledge Discovery and Data Mining, Springer, pp 160-172. 2013
#' @seealso \url{https://github.com/lmcinnes/hdbscan}
#' @note This is not precisely the \code{HDBSCAN} algorithm because it relies on the
#' nearest neighbor data generated by the \code{LargeVis} algorithm. In particular,
#' \code{HDBSCAN} assumes that all points can be connected in a single minimal-spanning
#' tree. This implementation uses a minimal-spanning forest, because the graph may not
#' be fully connected depending on the amount of nearest-neighbor data provided.
#' If, for example, the data has distinct clusters where no member of some cluster is a
#' nearest neighbor of a point in any other cluster, which can easily happen, the algorithm will
#' generate distinct trees for each disconnected set of points. In testing, this
#' improved the performance of the algorithm.
#'
#' @note Do not rely on the content of the \code{probabilities} field for outliers. A future version
#' will hopefully provide some measure of outlier factor.
#' @examples
#' \dontrun{
#' library(largeVis)
#' # The spiral dataset can be downloaded from https://github.com/elbamos/clusteringdatasets
#' data(spiral)
#' dat <- as.matrix(spiral[, 1:2])
#' neighbors <- randomProjectionTreeSearch(t(dat), K = 10, tree_threshold = 100,
#' max_iter = 5)
#' edges <- buildEdgeMatrix(t(dat), neighbors)
#' clusters <- hdbscan(edges, neighbors = neighbors, verbose = FALSE)
#'
#' # Calling largeVis while setting sgd_batches to 1 is
#' # the simplest way to generate the data structures needed for hdbscan
#' spiralVis <- largeVis(t(dat), K = 10, tree_threshold = 100, max_iter = 5, sgd_batches = 1)
#' clusters <- hdbscan(spiralVis, verbose = FALSE)
#'
#' # The gplot function helps to visualize the clustering
#' largeHighDimensionalDataset <- matrix(rnorm(50000), ncol = 50)
#' vis <- largeVis(largeHighDimensionalDataset)
#' clustering <- hdbscan(vis)
#' gplot(clustering, t(vis$coords))
#' }
#' @export
#' @importFrom stats aggregate
hdbscan <- function(edges, neighbors = NULL, minPts = 20, K = 5,
                    threads = NULL,
                    verbose = getOption("verbose", TRUE)) {
  # Normalize the input: accept either a raw edgematrix or a full largeVis
  # object (in which case the stored k-NN matrix is used unless the caller
  # supplied `neighbors` explicitly).
  if (inherits(edges, "edgematrix")) {
    edges <- t(toMatrix(edges))
  } else if (inherits(edges, "largeVis")) {
    if (missing(neighbors)) neighbors <- edges$knns
    edges <- t(toMatrix(edges$edges))
  } else {
    stop("edges must be either an edgematrix or a largeVis object")
  }
  if (is.null(edges) || is.null(neighbors)) stop("Both edges and neighbors must be specified (or use a largeVis object)")
  if (!is.null(neighbors)) {
    # The compiled routine expects -1 (not NA) as the "no neighbor" sentinel,
    # and a column-per-vertex layout matching the edge matrix.
    neighbors[is.na(neighbors)] <- -1
    if (ncol(neighbors) != ncol(edges)) neighbors <- t(neighbors)
  }
  # All of the clustering work happens in the compiled hdbscanc() routine;
  # everything below is post-processing of its results.
  clustersout <- hdbscanc(edges = edges,
                          neighbors = neighbors,
                          K = as.integer(K),
                          minPts = as.integer(minPts),
                          threads = threads,
                          verbose = as.logical(verbose))
  clusters = factor(clustersout$clusters)
  probs <- data.frame(
    probs = clustersout$lambdas
  )
  # Membership "probability": rescale each vertex's lambda_p to [0, 1] within
  # its own cluster, using the cluster's minimum lambda and lambda range.
  # NOTE: aggregate() returns groups in factor-level order, which is the same
  # order as as.integer(clusters), so mins/maxes index correctly by level.
  # Outliers (NA cluster) are deliberately left untouched.
  mins = stats::aggregate(probs, by = list(clusters), FUN = "min")$probs
  maxes = stats::aggregate(probs, by = list(clusters), FUN = "max")$probs - mins
  probs$probs[!is.na(clusters)] <- (probs$probs[!is.na(clusters)] -
                                      mins[as.integer(clusters)[!is.na(clusters)]]) /
    maxes[as.integer(clusters)[!is.na(clusters)]]
  # Adjust C->R style numbering: the compiled code is 0-indexed.
  hierarchy <- clustersout$hierarchy
  hierarchy$nodemembership <- hierarchy$nodemembership + 1
  hierarchy$parent <- hierarchy$parent + 1
  # hierarchy$coredistances <- hierarchy$coredistances
  tree <- clustersout$tree + 1
  # Entries that are 0 after the shift (presumably -1 sentinels from the C
  # side, i.e. forest roots with no parent edge) become NA.
  tree[tree == 0] <- NA
  # GLOSH outlier score: (lambda_max - lambda_p) / lambda_max, where
  # lambda_max is the largest lambda within the vertex's condensed-tree node.
  fmax <- stats::aggregate(hierarchy$lambda, by = list(hierarchy$nodemembership), FUN = max,
                           na.rm = TRUE, simplify = TRUE, drop = FALSE)
  maxes <- fmax$x[match(hierarchy$nodemembership, fmax$Group.1)]
  glosh <- (maxes - hierarchy$lambda) / maxes
  structure(
    .Data = list(
      clusters = clusters,
      probabilities = probs$probs,
      GLOSH = glosh,
      tree = tree,
      hierarchy = hierarchy
    ),
    call = sys.call(),
    class = "hdbscan")
}
#' gplot
#'
#' Plot an \code{hdbscan} object, using \code{\link[ggplot2]{ggplot}}. The
#' plot is primarily intended for diagnostic purposes, but can be useful to understand
#' how clusters were generated.
#'
#' Point color corresponds to clusters, with outliers as the \code{NA} color. Alpha
#' corresponds to the node's GLOSH score. The segments on the plot
#' correspond to the connections on the minimum spanning tree. Segment alpha
#' corresponds to \eqn{\lambda_p}.
#'
#' If the parameter \code{text} is set to \code{TRUE} or \code{"parent"}, the nodes will
#' be labelled with the node index number or cluster index number, respectively.
#'
#' @param x An \code{hdbscan} object.
#' @param coords Coordinates for the points clustered in \code{x}.
#' @param text If \code{TRUE}, include on the plot labels for each node's index.
#' If \code{"parent"}, the labels will instead be the index number of the node's
#' cluster.
#' @param distances If \code{TRUE}, draw circles representing the core distances around each point.
#'
#' @return A \code{\link[ggplot2]{ggplot}} object
#' @export
#'
#' @examples
#' \dontrun{
#' library(largeVis)
#' # The aggregation dataset can be downloaded from https://github.com/elbamos/clusteringdatasets
#' data(Aggregation)
#' dat <- as.matrix(Aggregation[, 1:2])
#' aggregateVis <- largeVis(dat, K = 10, tree_threshold = 100,
#' max_iter = 5, sgd_batches = 1)
#' clusters <- hdbscan(aggregateVis, verbose = FALSE)
#' gplot(clusters, dat)
#' }
#' @importFrom ggplot2 ggplot unit geom_label geom_point geom_segment aes_
gplot <- function(x, coords, text = FALSE, distances = FALSE) {
  # Assemble one row per vertex: position, cluster, GLOSH score, core
  # distance, 0-based index label, and condensed-tree parent node.
  pts <- data.frame(coords)
  colnames(pts) <- c("x", "y")
  pts$cluster <- x$clusters
  pts$glosh <- x$GLOSH
  pts$distance <- x$hierarchy$coredistances
  # Where GLOSH is NaN, fall back to lambda_p so the alpha aesthetic is
  # still defined for those vertices.
  nanMask <- is.nan(pts$glosh)
  pts$glosh[nanMask] <- x$hierarchy$lambda[nanMask]
  pts$label <- 0:(nrow(pts) - 1)
  pts$parent <- x$hierarchy$nodemembership
  # Segment endpoints: each vertex connects to its spanning-tree neighbor;
  # NA tree entries (forest roots) yield NA endpoints, which ggplot drops.
  ends <- data.frame(coords[x$tree, ])
  colnames(ends) <- c("x2", "y2")
  pts <- cbind(pts, ends)
  plt <- ggplot2::ggplot(
    pts,
    ggplot2::aes_(x = quote(x), y = quote(y),
                  xend = quote(x2), yend = quote(y2),
                  color = quote(cluster)))
  # Optionally draw a circle of radius = core distance around each point.
  if (distances) {
    if (requireNamespace("ggforce", quietly = TRUE)) {
      plt <- plt + ggforce::geom_circle(
        ggplot2::aes_(x0 = quote(x),
                      y0 = quote(y),
                      r = quote(distance),
                      fill = quote(cluster),
                      color = quote(cluster)),
        inherit.aes = FALSE, alpha = 0.1, linetype = "dotted")
    } else {
      warning("To display distance circles, the ggforce package must be installed.")
    }
  }
  # Points and MST segments, both faded by GLOSH (reversed so that likely
  # outliers are the most transparent).
  plt <- plt +
    ggplot2::geom_point(aes_(alpha = quote(glosh)), size = 0.7) +
    ggplot2::geom_segment(size = 0.5, ggplot2::aes_(alpha = quote(glosh))) +
    ggplot2::scale_alpha_continuous(trans = "reverse")
  # Label nodes with either their cluster ID ("parent") or their own index.
  labelCol <- if (text == "parent") "parent" else if (text) "label" else NULL
  if (!is.null(labelCol)) {
    plt <- plt + ggplot2::geom_label(ggplot2::aes_(label = as.name(labelCol)), size = 2.5,
                                     label.padding = ggplot2::unit(0.1, "lines"),
                                     label.size = 0.1, alpha = 0.7)
  }
  plt
}