version 1.0.1

cran · May 30, 2023 · 482bc28 · 482bc28
1 parent 3804341
commit 482bc28
Show file tree

Hide file tree

Showing 21 changed files with 411 additions and 372 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,20 +1,22 @@
 Package: ClusBoot
-Type: Package
-Title: Bootstrap Clustering
-Version: 1.0
-Date: 2019-11-11
-Author: Sugnet Lubbe
-Maintainer: Sugnet Lubbe <slubbe@sun.ac.za>
-Description: Clustering algorithms are designed to cluster objects into a number of clusters. Any clustering algorithm provides the 'best'
-             grouping of objects according to some criterion.
-             This does not guarantee a 'good' clustering solution in the sense that some allocations were not simply the result of chance.
-             This package allows the user to apply any clustering algorithm to a data set. The cluster allocations are subjected to a 
-             bootstrap analysis 
-             to determine the extent to which the clustering structure is stable and fundamental to the data set. For more information 
-             see <https://slubbe.wixsite.com/academic-cv/conference-presentations>.
-License: AGPL-3
+Title: Bootstrap a Clustering Solution to Establish the Stability of
+        the Clusters
+Version: 1.0.1
+Authors@R: 
+    person("Sugnet", "Lubbe", , "slubbe@sun.ac.za", role = c("aut", "cre", "cph"),
+           comment = c(ORCID = "0000-0003-2762-9944"))
+Description: Providing a cluster allocation for n samples, either with an $n \times p$ data matrix or an $n \times n$ distance
+                matrix, a bootstrap procedure is performed. The proportion of bootstrap replicates where a pair of samples
+                cluster in the same cluster indicates who tightly the samples in a particular cluster clusters together.
+License: MIT + file LICENSE
+Encoding: UTF-8
+RoxygenNote: 7.2.3
+Depends: R (>= 2.10)
+LazyData: true
+Imports: graphics, grDevices, stats
 NeedsCompilation: no
-Packaged: 2019-12-05 14:52:51 UTC; filz
-Depends: R (>= 3.5.0)
+Packaged: 2023-05-30 14:49:07 UTC; slubbe
+Author: Sugnet Lubbe [aut, cre, cph] (<https://orcid.org/0000-0003-2762-9944>)
+Maintainer: Sugnet Lubbe <slubbe@sun.ac.za>
 Repository: CRAN
-Date/Publication: 2019-12-13 15:20:02 UTC
+Date/Publication: 2023-05-30 15:20:02 UTC
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,2 @@
+YEAR: 2023
+COPYRIGHT HOLDER: ClusBoot authors
diff --git a/MD5 b/MD5
@@ -1,14 +1,12 @@
-e8f79ccd1acced4e8d05ec4398093a42 *DESCRIPTION
-f3e08a3f89c576b59ade251eafa5d7a3 *NAMESPACE
-8e95bd3634dbb044a89e174303fa09cc *R/boot.silhouette.R
-e2ae8b677d47dadb9b3c338dcd6ac3d4 *R/clusboot.R
-838928313918fa1218f7f15e9d31f137 *R/complete.linkage.R
-9750c10672a4551764bf4e7380900e47 *R/plot.clusboot.R
-7daae5384cc743d046c837092961f449 *build/partial.rdb
-0f01608300356264e1620e627ecbf81a *data/case_study_psychiatrist.rda
-602fed71fcdbd3dead709ead7bcfa933 *man/ClusBoot-package.Rd
-e7c8d822754be59587f0f44f98193e7d *man/boot.silhouette.Rd
-ee9aed8c28f04c09b2590a84c127d12e *man/case_study_psychiatrist.Rd
-21be27b22db9ba5482101b7cfa63d1c3 *man/clusboot.Rd
-27f2896c166a5065d7d91164ba0929d4 *man/complete.linkage.Rd
-730f2de0200dccb5c8aa44f7a3292cca *man/plot.clusboot.Rd
+3ae7a1f3fbecf4d7e9bf11241165725d *DESCRIPTION
+1e56d0aff1225cea0aa97c60ce2c3e2f *LICENSE
+b093fd22e95a70beb8fe5c7cd6e49ea2 *NAMESPACE
+b3e161ad905271694302b83b3e1a844c *R/data.R
+77fd48d6b7f90661916d887e2f26e2dc *R/funcs.R
+b4698063ca49ab35f1e22782d377d78c *data/case.study.psychiatrist.rda
+f208ca5353d10afb03b770f1bb241a9e *man/boot.silhouette.Rd
+7689190f22b577a7ee0e930d9ff080dd *man/case.study.psychiatrist.Rd
+1877fdeded8608f80f0dfb7373ba14a5 *man/clusboot.Rd
+214b6f46eac17a1c085b98d5d0c4ee08 *man/complete.linkage.Rd
+502a5f8dd6ba90060101596366cb7d58 *man/figures/README-pressure-1.png
+4d82fb6eed47b6179f1211a7ea97e97f *man/plot.clusboot.Rd
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,6 @@
-exportPattern("^[[:alpha:]]+")
-  importFrom("grDevices", "rainbow")
-  importFrom("graphics", "barplot", "plot", "points")
-  importFrom("stats", "as.dist", "cmdscale", "cutree", "dist", "hclust","optim")
-S3method(plot,clusboot)
+# Generated by roxygen2: do not edit by hand
+
+S3method(plot,clusboot)
+export(boot.silhouette)
+export(clusboot)
+export(complete.linkage)
diff --git a/R/boot.silhouette.R b/R/boot.silhouette.R
diff --git a/R/clusboot.R b/R/clusboot.R
diff --git a/R/complete.linkage.R b/R/complete.linkage.R
diff --git a/R/data.R b/R/data.R
@@ -0,0 +1,45 @@
+#' Patient by psychiatric symptom data
+#'
+#' Presence/absence ratings of 24 psychiatric symptoms in 30 psychiatric inpatients made by an individual psychiatrist.
+#' The data have been collected in a case study of an individual psychiatrist to identify his implicit taxonomy.
+#'
+#' @format ## `case.study.psychiatrist`
+#' A data frame with 30 observations on the following 28 variables:
+#' \describe{
+#' \item{\code{V1}}{inappropriate affect, appearance or behavior; binary vector}
+#' \item{\code{V2}}{interview belligerence - negativism; binary vector}
+#' \item{\code{V3}}{agitation - excitement; binary vector}
+#' \item{\code{V4}}{retardation; binary vector}
+#' \item{\code{V5}}{lack of emotions; binary vector}
+#' \item{\code{V6}}{speech disorganization; binary vector}
+#' \item{\code{V7}}{grandiosity; binary vector}
+#' \item{\code{V8}}{suspicion - ideas of persecution; binary vector}
+#' \item{\code{V9}}{hallucinations - delusions; binary vector}
+#' \item{\code{V10}}{overt anger; binary vector}
+#' \item{\code{V11}}{depression; binary vector}
+#' \item{\code{V12}}{anxiety; binary vector}
+#' \item{\code{V13}}{obsession - compulsion; binary vector}
+#' \item{\code{V14}}{suicide; binary vector}
+#' \item{\code{V15}}{self injury; binary vector}
+#' \item{\code{V16}}{somatic concerns; binary vector}
+#' \item{\code{V17}}{social isolation; binary vector}
+#' \item{\code{V18}}{daily routine impairment; binary vector}
+#' \item{\code{V19}}{leisure time impairment; binary vector}
+#' \item{\code{V20}}{antisocial impulses or acts; binary vector}
+#' \item{\code{V21}}{alcohol abuse; binary vector}
+#' \item{\code{V22}}{drug abuse; binary vector}
+#' \item{\code{V23}}{disorientation; binary vector}
+#' \item{\code{V24}}{memory impairment; binary vector}
+#' \item{\code{V25}}{rating on Global Assessment Scale, a 101-point scale for overall severity of psychiatric disturbance; a numeric vector}
+#' \item{\code{V26}}{Affective (Affective Disorder or Anxiety Disorder); binary vector}
+#' \item{\code{V27}}{Psychotic (Schizophrenic Disorder or Paranoid Disorder); binary vector}
+#' \item{\code{V28}}{Substance abuse (Substance Use Disorder or Substance-Induced Disorder); binary vector}
+#' }
+#' @details {
+#' The data set forms part of the International Federation of Classification Societies Cluster Benchmark Data Repository
+#' }
+#' @source Van Mechelen, I., & De Boeck, P. (1989). Implicit taxonomy in psychiatric diagnosis: A case study. Journal of Social and Clinical Psychology, 8, 276-287.
+#' <https://ifcs.boku.ac.at/repository/data/case_study_psychiatrist/index.html>
+
+
+"case.study.psychiatrist"
diff --git a/R/funcs.R b/R/funcs.R
@@ -0,0 +1,172 @@
+# Sugnet Lubbe
+# MuViSU (Centre for Multi-Dimensional Data Visualisation)
+# Department of Statistics and Actuarial Science
+# Stellenbosch University, June 2022
+# ============================================================
+
+# ---------------------------------------------------------------------------------
+# clusboot
+# ---------------------------------------------------------------------------------
+
+#' Performs bootstrap on a cluster analysis output
+#'
+#' @param datmat a data matrix or distance object which will be the input to the clustering function
+#' @param B number of bootstrap replicates
+#' @param clustering.func the function which will perform the clustering and output a vector of cluster memberships
+#' @param ... more arguments to be passed to the clustering function
+#'
+#' @return an object of type clusboot
+#' @export
+#'
+#' @examples
+#' clusboot (scale(case.study.psychiatrist), B=100, k=6, clustering.func=complete.linkage)
+
+clusboot <- function(datmat, B=1000, clustering.func=complete.linkage, ...)
+{
+  obj <- datmat
+  if (data.class(datmat) != "matrix") datmat <- as.matrix(datmat)
+  n <- nrow(datmat)
+  if (is.null(rownames(datmat))) sample.names <- 1:n
+  else sample.names <- rownames(datmat)
+  boot.samples <- matrix(sample (1:n, size=n*B, replace=T), ncol=B)
+
+  boot.out <- apply(boot.samples,2,function(x)
+  { if (data.class(obj) == "dist") out <- clustering.func(stats::as.dist(datmat[x,x]), ...)
+  else out <- clustering.func(datmat[x,], ...)
+  clusD <- totD <- matrix (0, nrow=n, ncol=n, dimnames=list(sample.names,sample.names))
+
+  boot.sample <- table(x)
+  boot.names <- as.numeric(names(boot.sample))
+  Dmat <- matrix (boot.sample,ncol=1) %*% matrix (boot.sample, nrow=1)
+  totD[boot.names,boot.names] <- Dmat
+
+  kk <- nlevels(factor(out))
+  for (i in 1:kk)
+  {
+    clus.count <- table(x[out==i])
+    clus.names <- as.numeric(names(clus.count))
+    Dmat <- matrix (clus.count,ncol=1) %*% matrix (clus.count,nrow=1)
+    clusD[clus.names,clus.names] <- Dmat
+  }
+
+  cbind(clusD, totD)
+  })
+  # boot.out is a matrix with B columns
+  # the first nrow/2 is the n*n elements of clusD
+  # the remaining nrow/2 is the n*n elements of totD
+
+  boot.out <- apply (boot.out, 1, sum)
+  clusD <- matrix (boot.out[1:(n*n)], nrow=n, ncol=n)
+  totD <- matrix (boot.out[-(1:(n*n))], nrow=n, ncol=n)
+
+  clusD <- clusD/totD
+  dimnames(clusD) <- list(sample.names,sample.names)
+  out <- clustering.func(obj, ...)
+  output <- list(proportions=clusD[order(out),order(out)], clustering=out[order(out)])
+  class(output) <- "clusboot"
+  output
+}
+
+# ---------------------------------------------------------------------------------
+# plot.clusboot
+# ---------------------------------------------------------------------------------
+
+#' MDS plot of similarities given by the proportion of bootstrap replicates where objects cluster together
+#'
+#' @param x an object of class clusboot
+#' @param col single colour or a vector specifying a colour for each object
+#' @param ... more arguments to be passed to `plot()`
+#'
+#' @return matrix of similarities (proportions)
+#' @export
+#'
+#' @examples
+#' out <- clusboot (scale(case.study.psychiatrist), B=100, k=6, clustering.func=complete.linkage)
+#' plot(out)
+
+plot.clusboot <- function(x, col, ...)
+{
+  Dmat <- 1-x$proportions
+
+  stress.func <- function (y, delta)
+  {
+    Y <- matrix (y, ncol=2)
+    dd <- stats::dist(Y)
+    sum((dd-delta)^2)/sum(dd^2)
+  }
+  Y <- stats::cmdscale(Dmat)
+  y <- stats::optim(as.vector(Y), stress.func, delta=stats::as.dist(Dmat))$par
+  Y <- matrix(y, ncol=2)
+
+  plot (Y[,1], Y[,2], asp=1, type="n", xaxt="n", yaxt="n", xlab="", ylab="")
+
+  cluster.vec <- x$clustering
+  k <- nlevels(factor(cluster.vec))
+  if (missing(col)) col <- grDevices::rainbow(k)
+  if (length(col)<k) col <- rep(col,k)
+
+  for (i in 1:k)
+    graphics::points(Y[cluster.vec==i,1], Y[cluster.vec==i,2], col=col[i], ...)
+
+  Y
+}
+
+# ---------------------------------------------------------------------------------
+# boot.silhouette
+# ---------------------------------------------------------------------------------
+
+#' Constructs a silhouette plot based on proportion of times items cluster together
+#'
+#' @param clusboot.out an object of class clusboot
+#' @param ... more arguments to be passed to `plot()`
+#'
+#' @return vector of silhouette widths for each of the clusters
+#' @export
+#'
+#' @examples
+#' out <- clusboot (scale(case.study.psychiatrist), B=100, k=6, clustering.func=complete.linkage)
+#' boot.silhouette(out)
+
+boot.silhouette <- function(clusboot.out, ...)
+{
+  cluster.vec <- clusboot.out$clustering
+  k <- nlevels(factor(cluster.vec))
+  Pmat <- clusboot.out$proportions
+
+  sil <- rep(NA,k)
+  for (i in 1:k)
+  {
+    current.clus <- (1:length(cluster.vec))[cluster.vec==i]
+    current.p <- Pmat[current.clus, current.clus]
+    own.p <- mean(current.p[lower.tri(current.p)])
+    other.p <- 0
+    for (j in (1:k)[-i])
+    {  other.clus <- (1:length(cluster.vec))[cluster.vec==j]
+    other.mat <- Pmat[current.clus,other.clus]
+    other.mean <- mean(other.mat)
+    if (other.mean>other.p) other.p <- other.mean
+    }
+    sil[i] <- own.p-other.p
+  }
+  names(sil) <- levels(factor(cluster.vec))
+  graphics::barplot (sil, names.arg=1:k, horiz=T, xlim=c(0,1), ...)
+  sil
+}
+
+# ---------------------------------------------------------------------------------
+# complete.linkage
+# ---------------------------------------------------------------------------------
+
+#' Wrapper function for performing complete linkage clustering
+#'
+#' @param X samples x variables data matrix
+#' @param k number of clusters
+#'
+#' @return vector of cluster memberships
+#' @export
+#'
+#' @examples
+#' complete.linkage(scale(case.study.psychiatrist), k=6)
+
+complete.linkage <- function (X, k)
+{ stats::cutree(stats::hclust(stats::dist(X)), k) }