version 1.0

cran · Dec 13, 2019 · 3804341 · 3804341
commit 3804341
Show file tree

Hide file tree

Showing 15 changed files with 375 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,20 @@
+Package: ClusBoot
+Type: Package
+Title: Bootstrap Clustering
+Version: 1.0
+Date: 2019-11-11
+Author: Sugnet Lubbe
+Maintainer: Sugnet Lubbe <slubbe@sun.ac.za>
+Description: Clustering algorithms are designed to cluster objects into a number of clusters. Any clustering algorithm provides the 'best'
+             grouping of objects according to some criterion.
+             This does not guarantee a 'good' clustering solution in the sense that some allocations were not simply the result of chance.
+             This package allows the user to apply any clustering algorithm to a data set. The cluster allocations are subjected to a 
+             bootstrap analysis 
+             to determine the extent to which the clustering structure is stable and fundamental to the data set. For more information 
+             see <https://slubbe.wixsite.com/academic-cv/conference-presentations>.
+License: AGPL-3
+NeedsCompilation: no
+Packaged: 2019-12-05 14:52:51 UTC; filz
+Depends: R (>= 3.5.0)
+Repository: CRAN
+Date/Publication: 2019-12-13 15:20:02 UTC
diff --git a/MD5 b/MD5
@@ -0,0 +1,14 @@
+e8f79ccd1acced4e8d05ec4398093a42 *DESCRIPTION
+f3e08a3f89c576b59ade251eafa5d7a3 *NAMESPACE
+8e95bd3634dbb044a89e174303fa09cc *R/boot.silhouette.R
+e2ae8b677d47dadb9b3c338dcd6ac3d4 *R/clusboot.R
+838928313918fa1218f7f15e9d31f137 *R/complete.linkage.R
+9750c10672a4551764bf4e7380900e47 *R/plot.clusboot.R
+7daae5384cc743d046c837092961f449 *build/partial.rdb
+0f01608300356264e1620e627ecbf81a *data/case_study_psychiatrist.rda
+602fed71fcdbd3dead709ead7bcfa933 *man/ClusBoot-package.Rd
+e7c8d822754be59587f0f44f98193e7d *man/boot.silhouette.Rd
+ee9aed8c28f04c09b2590a84c127d12e *man/case_study_psychiatrist.Rd
+21be27b22db9ba5482101b7cfa63d1c3 *man/clusboot.Rd
+27f2896c166a5065d7d91164ba0929d4 *man/complete.linkage.Rd
+730f2de0200dccb5c8aa44f7a3292cca *man/plot.clusboot.Rd
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,5 @@
+exportPattern("^[[:alpha:]]+")
+  importFrom("grDevices", "rainbow")
+  importFrom("graphics", "barplot", "plot", "points")
+  importFrom("stats", "as.dist", "cmdscale", "cutree", "dist", "hclust","optim")
+S3method(plot,clusboot)
diff --git a/R/boot.silhouette.R b/R/boot.silhouette.R
@@ -0,0 +1,25 @@
+# Bootstrap silhouette summary of a clusboot result.
+#
+# For every cluster the 'tightness' is the mean co-clustering proportion over
+# all pairs of its members. The largest mean co-clustering proportion between
+# this cluster and any other cluster is subtracted from it. The resulting
+# values are drawn as a horizontal bar plot and returned.
+#
+# Arguments:
+#   clusboot.out  list whose first element is the n x n matrix of
+#                 co-clustering proportions and whose second element is the
+#                 vector of cluster allocations, in the same order as the
+#                 rows of the matrix (the layout returned by clusboot()).
+#   ...           further arguments passed on to barplot(); a user-supplied
+#                 xlim replaces the default.
+#
+# Value: numeric vector with one silhouette value per cluster, ordered by the
+# sorted cluster labels. A cluster with a single member has no within-cluster
+# pairs and therefore yields NaN.
+boot.silhouette <-
+function(clusboot.out, ...)
+{
+  Pmat <- clusboot.out[[1]]
+  cluster.vec <- clusboot.out[[2]]
+
+  # row/column positions of the members of each cluster; grouping by factor
+  # level means the labels do not have to be the integers 1..k
+  members <- split(seq_along(cluster.vec), factor(cluster.vec))
+  k <- length(members)
+
+  sil <- rep(NA_real_, k)
+  for (i in seq_len(k))
+    {
+       current.clus <- members[[i]]
+       current.p <- Pmat[current.clus, current.clus, drop=FALSE]
+       # proportions can be NaN for pairs that never shared a bootstrap
+       # sample; such pairs are left out of the means
+       own.p <- mean(current.p[lower.tri(current.p)], na.rm=TRUE)
+       other.p <- 0
+       for (j in seq_len(k)[-i])
+         {
+            other.mean <- mean(Pmat[current.clus, members[[j]]], na.rm=TRUE)
+            if (!is.na(other.mean) && other.mean>other.p) other.p <- other.mean
+         }
+       sil[i] <- own.p-other.p
+    }
+
+  # default axis is [0,1], widened to the left when a silhouette is negative
+  plot.args <- list(...)
+  if (!("xlim" %in% names(plot.args)))
+    plot.args$xlim <- c(min(0, sil, na.rm=TRUE), 1)
+  do.call(barplot, c(list(sil, names.arg=names(members), horiz=TRUE), plot.args))
+  sil
+}
diff --git a/R/clusboot.R b/R/clusboot.R
@@ -0,0 +1,43 @@
+# Bootstrap a cluster analysis.
+#
+# B bootstrap samples of the rows of datmat are drawn with replacement and
+# clustering.func is applied to each of them. For every pair of objects the
+# function counts how often replicates of the two objects fall in the same
+# cluster, relative to how often they occur together in a bootstrap sample.
+#
+# Arguments:
+#   datmat           matrix or data frame with one row per object.
+#   B                number of bootstrap replicates (at least 1).
+#   clustering.func  function taking the (resampled) data as first argument
+#                    and returning one cluster label per row.
+#   ...              further arguments passed on to clustering.func.
+#
+# Value: list of class "clusboot" with components
+#   proportions  n x n matrix of co-clustering proportions, rows and columns
+#                ordered by the cluster allocation of the original data;
+#                NaN for pairs that never occurred in the same sample.
+#   clustering   cluster allocations of the original data, sorted.
+clusboot <-
+function (datmat, B=1000, clustering.func=complete.linkage, ...)
+{
+  n <- nrow(datmat)
+  if (is.null(n)) stop("'datmat' must be a matrix or data frame with one row per object")
+  if (B < 1) stop("'B' must be at least 1")
+  if (is.null(rownames(datmat))) rownames(datmat) <- seq_len(n)
+  # all resamples are drawn in a single call; column b holds bootstrap sample b
+  boot.samples <- matrix(sample (seq_len(n), size=n*B, replace=TRUE), ncol=B)
+
+  # running totals over the replicates:
+  #   clusD[i,j]  number of (replicate of i, replicate of j) pairs in one cluster
+  #   totD[i,j]   number of (replicate of i, replicate of j) pairs in one sample
+  # accumulating here avoids storing a 2*n*n by B intermediate matrix
+  clusD <- totD <- matrix (0, nrow=n, ncol=n)
+  for (b in seq_len(B))
+    {
+       x <- boot.samples[,b]
+       # drop=FALSE keeps a one-column matrix a matrix
+       out <- clustering.func(datmat[x,,drop=FALSE], ...)
+
+       # member.counts[i,c]: how many replicates of object i are in cluster c
+       member.counts <- unclass(table(factor(x, levels=seq_len(n)), factor(out)))
+       clusD <- clusD + tcrossprod(member.counts)
+       totD <- totD + tcrossprod(tabulate(x, nbins=n))
+    }
+
+  clusD <- clusD/totD
+  dimnames(clusD) <- list(rownames(datmat),rownames(datmat))
+  out <- clustering.func(datmat, ...)
+  ord <- order(out)
+  output <- list(proportions=clusD[ord,ord], clustering=out[ord])
+  class(output) <- "clusboot"
+  output
+}
diff --git a/R/complete.linkage.R b/R/complete.linkage.R
@@ -0,0 +1,2 @@
+# Hierarchical clustering of the rows of X with the hclust() default linkage,
+# cut into k groups. Returns the vector of cluster allocations from cutree().
+complete.linkage <-
+function (X, k)
+{
+  row.dist <- dist(X)
+  tree <- hclust(row.dist)
+  cutree(tree, k)
+}
diff --git a/R/plot.clusboot.R b/R/plot.clusboot.R
@@ -0,0 +1,27 @@
+# Two-dimensional scaling plot of a clusboot object.
+#
+# The co-clustering proportions are turned into dissimilarities
+# (1 - proportion), a classical scaling solution is computed with cmdscale()
+# and then refined with optim() by minimising a normalised stress criterion.
+# The points are plotted with one colour per cluster.
+#
+# Arguments:
+#   x    object of class "clusboot" with components proportions and clustering.
+#   col  optional vector of colours, one per cluster; NULL (the default)
+#        selects rainbow(k). Shorter vectors are recycled.
+#   ...  further arguments passed on to points().
+#
+# Value: (invisibly) the two-column matrix of plotted coordinates, rows in the
+# order of x$clustering.
+plot.clusboot <-
+function (x, col=NULL, ...)
+{
+  Dmat <- 1-x$proportions
+
+  # stress of the configuration y (coordinates stored column-wise in a
+  # vector) with respect to the target dissimilarities delta
+  stress.func <- function (y, delta)
+    {
+       Y <- matrix (y, ncol=2)
+       dd <- dist(Y)
+       sum((dd-delta)^2)/sum(dd^2)
+    }
+  Y <- cmdscale(Dmat)
+  y <- optim(as.vector(Y), stress.func, delta=as.dist(Dmat))$par
+  Y <- matrix(y, ncol=2)
+
+  plot (Y[,1], Y[,2], asp=1, type="n", xaxt="n", yaxt="n", xlab="", ylab="")
+
+  # work with factor codes so that cluster labels need not be 1..k
+  cluster.fac <- factor(x$clustering)
+  cluster.code <- as.integer(cluster.fac)
+  k <- nlevels(cluster.fac)
+  # is.null() also covers an explicit col=NULL, which missing() does not
+  if (is.null(col)) col <- rainbow(k)
+  if (length(col)<k) col <- rep(col,k)
+
+  for (i in seq_len(k))
+    {
+       in.clus <- cluster.code==i
+       points(Y[in.clus,1], Y[in.clus,2], col=col[i], ...)
+    }
+
+  # invisible, so that a top-level plot(x) does not print the coordinates
+  invisible(Y)
+}
diff --git a/build/partial.rdb b/build/partial.rdb
diff --git a/data/case_study_psychiatrist.rda b/data/case_study_psychiatrist.rda
diff --git a/man/ClusBoot-package.Rd b/man/ClusBoot-package.Rd
@@ -0,0 +1,27 @@
+\name{ClusBoot-package}
+\alias{ClusBoot-package}
+\alias{ClusBoot}
+\docType{package}
+\title{
+Performs bootstrap on a cluster analysis output
+}
+\description{
+Any clustering output is subjected to a bootstrap procedure to determine the stability of the clustering solution. The results are displayed
+in the form of a Multi-dimensional scaling plot and a silhouette plot.
+}
+\details{
+
+The DESCRIPTION file:
+\packageDESCRIPTION{ClusBoot}
+\packageIndices{ClusBoot}
+A package to perform bootstrap on any cluster analysis output and visualise the results.
+For more information see \url{https://slubbe.wixsite.com/academic-cv/conference-presentations}
+}
+\author{
+Sugnet Lubbe slubbe@sun.ac.za
+
+Maintainer: Sugnet Lubbe slubbe@sun.ac.za
+}
+\references{
+Lubbe, S. Visualisations associated with bootstrapping cluster analysis. Data Science, Statistics and Visualisation conference, July 2017, Lisbon, Portugal.
+}
diff --git a/man/boot.silhouette.Rd b/man/boot.silhouette.Rd
@@ -0,0 +1,44 @@
+\name{boot.silhouette}
+\alias{boot.silhouette}
+\title{
+Construct silhouette plot from bootstrap replicates
+}
+\description{
+A silhouette plot is constructed, indicating the proportion of times the cluster members cluster together in the same cluster.
+}
+\usage{
+boot.silhouette(clusboot.out, ...)
+}
+\arguments{
+  \item{clusboot.out}{
+an object of class clusboot, usually, a result of a call to clusboot
+}
+  \item{\dots}{
+more plotting parameters, e.g. col
+}
+}
+\details{
+The clustering tightness is computed for each cluster, compared to the 'nearest' alternative cluster. The cluster tightness is computed 
+as the mean of the proportion of times each pair of objects are clustered together in the same cluster. The tightness to other clusters
+is computed as the mean of the proportion of times an item of this cluster and one from the other cluster are clustered together. The 'nearest' 
+alternative cluster is defined as the alternative cluster with the numerically largest tightness to the current cluster.
+}
+\value{
+a vector with number of components equal to the number of classes, providing the silhouette value for each class.
+}
+\references{
+Lubbe, S. Visualisations associated with bootstrapping cluster analysis. Data Science, Statistics and Visualisation conference, July 2017, Lisbon, Portugal.
+}
+\author{
+Sugnet Lubbe slubbe@sun.ac.za
+}
+\seealso{
+\code{\link{clusboot}} for performing bootstrap on a cluster analysis output and \code{\link{plot.clusboot}} to visually represent the bootstrap replications
+}
+\examples{
+data(case_study_psychiatrist)
+boot.out <- clusboot (scale(case_study_psychiatrist), B=100, k=6, clustering.func=complete.linkage)
+plot(boot.out)
+boot.silhouette (boot.out)
+}
+
diff --git a/man/case_study_psychiatrist.Rd b/man/case_study_psychiatrist.Rd
@@ -0,0 +1,52 @@
+\name{case_study_psychiatrist}
+\alias{case_study_psychiatrist}
+\docType{data}
+\title{
+Patient by psychiatric symptom data
+}
+\description{
+Abstract: Presence/absence ratings of 24 psychiatric symptoms in 30 psychiatric inpatients made by an individual psychiatrist. 
+Subject matter background: The data have been collected in a case study of an individual psychiatrist to identify his implicit taxonomy. 
+Data structure: object x variables data matrix 
+}
+\usage{data(case_study_psychiatrist)}
+\format{
+  A data frame with 30 observations on the following 28 variables.
+  \describe{
+    \item{\code{V1}}{inappropriate affect, appearance or behavior; binary vector}
+    \item{\code{V2}}{interview belligerence - negativism; binary vector}
+    \item{\code{V3}}{agitation - excitement; binary vector}
+    \item{\code{V4}}{retardation; binary vector}
+    \item{\code{V5}}{lack of emotions; binary vector}
+    \item{\code{V6}}{speech disorganization; binary vector}
+    \item{\code{V7}}{grandiosity; binary vector}
+    \item{\code{V8}}{suspicion - ideas of persecution; binary vector}
+    \item{\code{V9}}{hallucinations - delusions; binary vector}
+    \item{\code{V10}}{overt anger; binary vector}
+    \item{\code{V11}}{depression; binary vector}
+    \item{\code{V12}}{anxiety; binary vector}
+    \item{\code{V13}}{obsession - compulsion; binary vector}
+    \item{\code{V14}}{suicide; binary vector}
+    \item{\code{V15}}{self injury; binary vector}
+    \item{\code{V16}}{somatic concerns; binary vector}
+    \item{\code{V17}}{social isolation; binary vector}
+    \item{\code{V18}}{daily routine impairment; binary vector}
+    \item{\code{V19}}{leisure time impairment; binary vector}
+    \item{\code{V20}}{antisocial impulses or acts; binary vector}
+    \item{\code{V21}}{alcohol abuse; binary vector}
+    \item{\code{V22}}{drug abuse; binary vector}
+    \item{\code{V23}}{disorientation; binary vector}
+    \item{\code{V24}}{memory impairment; binary vector}
+    \item{\code{V25}}{rating on Global Assessment Scale, a 101-point scale for overall severity of psychiatric disturbance; a numeric vector}
+    \item{\code{V26}}{Affective (Affective Disorder or Anxiety Disorder); binary vector}
+    \item{\code{V27}}{Psychotic (Schizophrenic Disorder or Paranoid Disorder); binary vector}
+    \item{\code{V28}}{Substance abuse (Substance Use Disorder or Substance-Induced Disorder); binary vector}
+  }
+}
+\details{
+The data set forms part of the International Federation of Classification Societies Cluster Benchmark Data Repository  
+}
+\source{
+Van Mechelen, I., & De Boeck, P. (1989). Implicit taxonomy in psychiatric diagnosis: A case study. Journal of Social and Clinical Psychology, 8, 276-287. }
+\references{
+\url{http://ifcs.boku.ac.at/repository/data/case_study_psychiatrist/index.html}}
diff --git a/man/clusboot.Rd b/man/clusboot.Rd
@@ -0,0 +1,41 @@
+\name{clusboot}
+\alias{clusboot}
+\title{
+Performs bootstrap on a cluster analysis output}
+\description{
+B bootstrap samples are drawn with replacement from the data and cluster analysis is performed on the bootstrap samples.
+}
+\usage{
+clusboot(datmat, B = 1000, clustering.func = complete.linkage, ...)
+}
+\arguments{
+  \item{datmat}{
+a samples by variables data matrix or data frame; its rows are resampled with replacement and each resample is passed to clustering.func
+}
+  \item{B}{
+number of bootstrap replicates
+}
+  \item{clustering.func}{
+any function that performs cluster analysis which returns a single vector of cluster allocations as output
+}
+  \item{\dots}{
+any other arguments to be sent to clustering.func
+}
+}
+\value{
+  \item{proportions}{an nxn matrix with (i,j)-th element the proportion of times objects i and j clustered in the same cluster}
+  \item{clustering}{vector of cluster allocations of the input data}
+}
+\references{
+Lubbe, S. Visualisations associated with bootstrapping cluster analysis. Data Science, Statistics and Visualisation conference, July 2017, Lisbon, Portugal.
+}
+\author{
+Sugnet Lubbe slubbe@sun.ac.za
+}
+\seealso{
+\code{\link{plot.clusboot}} to visually represent the bootstrap replications and \code{\link{boot.silhouette}} for a silhouette summary of the bootstrap replicates as well as \code{\link{kmeans}}
+}
+\examples{
+data(case_study_psychiatrist)
+clusboot (scale(case_study_psychiatrist), B=100, k=6, clustering.func=complete.linkage)
+}
diff --git a/man/complete.linkage.Rd b/man/complete.linkage.Rd
@@ -0,0 +1,33 @@
+\name{complete.linkage}
+\alias{complete.linkage}
+\title{
+Wrapper function to return only the clustering allocation.
+}
+\description{
+Wrapper function to return only the clustering allocation from the hclust function with default method = "complete".
+}
+\usage{
+complete.linkage(X, k)
+}
+\arguments{
+  \item{X}{
+numeric matrix of data, or an object that can be coerced to such a matrix (such as a numeric vector or a data frame with all numeric columns).
+}
+  \item{k}{
+the desired number of clusters; passed on to \code{cutree} to cut the tree produced by \code{hclust}.
+}
+}
+\value{
+a single vector of cluster allocations, the output of cutree(hclust(dist(X)), k)
+}
+\references{
+hclust()
+}
+\author{
+Sugnet Lubbe slubbe@sun.ac.za
+}
+\examples{
+data(case_study_psychiatrist)
+complete.linkage(scale(case_study_psychiatrist), k=6)
+}
+
diff --git a/man/plot.clusboot.Rd b/man/plot.clusboot.Rd
@@ -0,0 +1,42 @@
+\name{plot.clusboot}
+\alias{plot.clusboot}
+\title{
+Constructs a Multi-dimensional scaling plot to represent the bootstrap replications.
+}
+\description{
+Constructs a Multi-dimensional scaling plot to represent the bootstrap replications.
+}
+\usage{
+\method{plot}{clusboot}(x, col = NULL, ...)
+}
+\arguments{
+  \item{x}{
+an object of class clusboot, usually, a result of a call to clusboot
+}
+  \item{col}{
+an optional vector equal in length to the number of clusters with colours to represent different clusters
+}
+  \item{\dots}{
+more plotting parameters
+}
+}
+\details{
+Constructs a Multi-dimensional scaling plot to represent the bootstrap replications.
+}
+\value{
+a two-column matrix with rows containing the MDS coordinates of the samples 
+}
+\references{
+Lubbe, S. Visualisations associated with bootstrapping cluster analysis. Data Science, Statistics and Visualisation conference, July 2017, Lisbon, Portugal.
+}
+\author{
+Sugnet Lubbe slubbe@sun.ac.za
+}
+\seealso{
+\code{\link{clusboot}} for performing bootstrap on a cluster analysis output and \code{\link{boot.silhouette}} for a silhouette summary of the bootstrap replicates
+}
+\examples{
+data(case_study_psychiatrist)
+boot.out <- clusboot (scale(case_study_psychiatrist), B=100, k=6, clustering.func=complete.linkage)
+plot(boot.out)
+}