Skip to content

Commit

Permalink
version 1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
sugnet authored and cran-robot committed Dec 13, 2019
0 parents commit 3804341
Show file tree
Hide file tree
Showing 15 changed files with 375 additions and 0 deletions.
20 changes: 20 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
Package: ClusBoot
Type: Package
Title: Bootstrap Clustering
Version: 1.0
Date: 2019-11-11
Author: Sugnet Lubbe
Maintainer: Sugnet Lubbe <slubbe@sun.ac.za>
Description: Clustering algorithms are designed to cluster objects into a number of clusters. Any clustering algorithm provides the 'best'
grouping of objects according to some criterion.
This does not guarantee a 'good' clustering solution in the sense that some allocations were not simply the result of chance.
This package allows the user to apply any clustering algorithm to a data set. The cluster allocations are subjected to a
bootstrap analysis
to determine the extent to which the clustering structure is stable and fundamental to the data set. For more information
see <https://slubbe.wixsite.com/academic-cv/conference-presentations>.
License: AGPL-3
NeedsCompilation: no
Packaged: 2019-12-05 14:52:51 UTC; filz
Depends: R (>= 3.5.0)
Repository: CRAN
Date/Publication: 2019-12-13 15:20:02 UTC
14 changes: 14 additions & 0 deletions MD5
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
e8f79ccd1acced4e8d05ec4398093a42 *DESCRIPTION
f3e08a3f89c576b59ade251eafa5d7a3 *NAMESPACE
8e95bd3634dbb044a89e174303fa09cc *R/boot.silhouette.R
e2ae8b677d47dadb9b3c338dcd6ac3d4 *R/clusboot.R
838928313918fa1218f7f15e9d31f137 *R/complete.linkage.R
9750c10672a4551764bf4e7380900e47 *R/plot.clusboot.R
7daae5384cc743d046c837092961f449 *build/partial.rdb
0f01608300356264e1620e627ecbf81a *data/case_study_psychiatrist.rda
602fed71fcdbd3dead709ead7bcfa933 *man/ClusBoot-package.Rd
e7c8d822754be59587f0f44f98193e7d *man/boot.silhouette.Rd
ee9aed8c28f04c09b2590a84c127d12e *man/case_study_psychiatrist.Rd
21be27b22db9ba5482101b7cfa63d1c3 *man/clusboot.Rd
27f2896c166a5065d7d91164ba0929d4 *man/complete.linkage.Rd
730f2de0200dccb5c8aa44f7a3292cca *man/plot.clusboot.Rd
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
exportPattern("^[[:alpha:]]+")
importFrom("grDevices", "rainbow")
importFrom("graphics", "barplot", "plot", "points")
importFrom("stats", "as.dist", "cmdscale", "cutree", "dist", "hclust","optim")
S3method(plot,clusboot)
25 changes: 25 additions & 0 deletions R/boot.silhouette.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Silhouette-style summary of a bootstrap clustering, drawn as a horizontal
# bar plot.
#
# Arguments:
#   clusboot.out  list whose first element is the n x n matrix of
#                 co-clustering proportions and whose second element is the
#                 vector of cluster allocations, in the same order as the
#                 rows/columns of that matrix (the layout built by clusboot).
#   ...           further arguments passed on to barplot(), e.g. col.
#
# Value:
#   Numeric vector with one entry per cluster (in the order of the sorted
#   cluster labels): mean within-cluster proportion minus the largest mean
#   proportion shared with any other cluster.  A cluster with a single member
#   has no within-cluster pairs, so its entry is NaN.
boot.silhouette <-
function(clusboot.out, ...)
{
  Pmat <- clusboot.out[[1]]
  cluster.vec <- clusboot.out[[2]]

  # Group member positions by cluster label.  Going through the factor levels
  # (rather than comparing against 1:k) keeps the result correct when the
  # labels are not the integers 1..k, e.g. character labels or 0-based codes.
  members <- split(seq_along(cluster.vec), factor(cluster.vec))
  k <- length(members)

  sil <- rep(NA_real_, k)
  for (i in seq_len(k))
  {
    current.clus <- members[[i]]

    # drop=FALSE keeps a 1 x 1 matrix for singleton clusters.
    current.p <- Pmat[current.clus, current.clus, drop=FALSE]
    own.p <- mean(current.p[lower.tri(current.p)])

    # 'Nearest' other cluster = the one with the largest mean proportion of
    # being clustered together with members of the current cluster.
    other.p <- 0
    for (j in seq_len(k)[-i])
    {
      other.mean <- mean(Pmat[current.clus, members[[j]]])
      if (other.mean > other.p) other.p <- other.mean
    }
    sil[i] <- own.p - other.p
  }

  barplot(sil, names.arg=names(members), horiz=TRUE, xlim=c(0,1), ...)
  sil
}
43 changes: 43 additions & 0 deletions R/clusboot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Bootstrap a cluster analysis and record how often each pair of objects ends
# up in the same cluster.
#
# Arguments:
#   datmat           samples-by-variables matrix or data frame; rows are the
#                    objects that are resampled.
#   B                number of bootstrap replicates.
#   clustering.func  function taking the (resampled) data as first argument
#                    and returning one cluster allocation per row.
#   ...              further arguments passed on to clustering.func.
#
# Value:
#   Object of class "clusboot": a list with
#     proportions  n x n matrix; element (i,j) is the co-clustering count of
#                  objects i and j divided by their co-sampling count, with
#                  rows/columns ordered by the clustering of the full data.
#                  Pairs that never occur together in a bootstrap sample give
#                  0/0 = NaN.
#     clustering   cluster allocations of the full data, in the same order.
clusboot <-
function (datmat, B=1000, clustering.func=complete.linkage, ...)
{
  n <- nrow(datmat)
  if (is.null(rownames(datmat))) rownames(datmat) <- seq_len(n)
  obj.names <- rownames(datmat)

  # All resampling indices are drawn up front (one column per replicate), so
  # the random number stream is consumed in the same order as before.
  boot.samples <- matrix(sample(seq_len(n), size=n*B, replace=TRUE), ncol=B)

  # Running totals over replicates.  Accumulating here avoids holding a
  # (2*n*n) x B matrix of per-replicate counts in memory.
  clus.sum <- matrix(0, nrow=n, ncol=n)
  tot.sum <- matrix(0, nrow=n, ncol=n)

  for (b in seq_len(B))
  {
    x <- boot.samples[, b]

    # drop=FALSE: single-column data must still reach clustering.func as a
    # matrix / data frame.
    out <- clustering.func(datmat[x, , drop=FALSE], ...)
    if (length(out) != n)
      stop("clustering.func must return one cluster allocation per row",
           call.=FALSE)

    # Element (i,j): (times i was drawn) * (times j was drawn).
    tot.sum <- tot.sum + tcrossprod(tabulate(x, nbins=n))

    # Same product of multiplicities, restricted to pairs sharing a cluster.
    # Looping over the labels actually present (not 1:kk) keeps the counts
    # right when a replicate misses a label or labels are not 1..kk.
    clusD <- matrix(0, nrow=n, ncol=n)
    out.chr <- as.character(out)
    for (lab in levels(factor(out)))
    {
      clus.count <- tabulate(x[which(out.chr == lab)], nbins=n)
      clus.members <- which(clus.count > 0)
      clusD[clus.members, clus.members] <- tcrossprod(clus.count[clus.members])
    }
    clus.sum <- clus.sum + clusD
  }

  clusD <- clus.sum/tot.sum
  dimnames(clusD) <- list(obj.names, obj.names)

  # Cluster the original data and report everything sorted by that clustering.
  out <- clustering.func(datmat, ...)
  output <- list(proportions=clusD[order(out),order(out)], clustering=out[order(out)])
  class(output) <- "clusboot"
  output
}
2 changes: 2 additions & 0 deletions R/complete.linkage.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Complete-linkage hierarchical clustering of the rows of X, cut into k
# groups.  Returns only the vector of cluster allocations from cutree().
complete.linkage <-
function (X, k)
{
  distances <- dist(X)      # default distance measure of dist()
  tree <- hclust(distances) # default agglomeration method of hclust()
  cutree(tree, k)
}
27 changes: 27 additions & 0 deletions R/plot.clusboot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Multi-dimensional scaling plot of a "clusboot" object.
#
# Arguments:
#   x    object with components 'proportions' (co-clustering proportions) and
#        'clustering' (cluster allocations in the same order).
#   col  optional vector of colours, one per cluster; when NULL (the default)
#        rainbow(k) is used.  Shorter vectors are recycled.
#   ...  further arguments passed on to points().
#
# Value:
#   Two-column matrix of MDS coordinates, one row per object.
plot.clusboot <-
function (x, col=NULL, ...)
{
  # Objects that are often clustered together should be close: use
  # 1 - proportion as the dissimilarity.
  Dmat <- 1-x$proportions

  # Normalised squared difference between the configuration's distances and
  # the target dissimilarities.
  stress.func <- function (y, delta)
  {
    Y <- matrix(y, ncol=2)
    dd <- dist(Y)
    sum((dd-delta)^2)/sum(dd^2)
  }

  # Classical scaling provides the starting configuration, which optim() then
  # refines by minimising the stress.
  Y <- cmdscale(Dmat)
  y <- optim(as.vector(Y), stress.func, delta=as.dist(Dmat))$par
  Y <- matrix(y, ncol=2)

  plot(Y[,1], Y[,2], asp=1, type="n", xaxt="n", yaxt="n", xlab="", ylab="")

  # Work with factor codes so clusters are drawn whatever their labels are
  # (not only when they are the integers 1..k).
  cluster.fac <- factor(x$clustering)
  cluster.id <- as.integer(cluster.fac)
  k <- nlevels(cluster.fac)

  # is.null() also covers an explicit col=NULL, which missing() did not.
  if (is.null(col)) col <- rainbow(k)
  if (length(col)<k) col <- rep(col,k)

  for (i in seq_len(k))
  {
    in.cluster <- which(cluster.id == i)
    points(Y[in.cluster,1], Y[in.cluster,2], col=col[i], ...)
  }

  Y
}
Binary file added build/partial.rdb
Binary file not shown.
Binary file added data/case_study_psychiatrist.rda
Binary file not shown.
27 changes: 27 additions & 0 deletions man/ClusBoot-package.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
\name{ClusBoot-package}
\alias{ClusBoot-package}
\alias{ClusBoot}
\docType{package}
\title{
Performs bootstrap on a cluster analysis output
}
\description{
Any clustering output is subjected to a bootstrap procedure to determine the stability of the clustering solution. The results are displayed
in the form of a Multi-dimensional scaling plot and a silhouette plot.
}
\details{

The DESCRIPTION file:
\packageDESCRIPTION{ClusBoot}
\packageIndices{ClusBoot}
A package to perform bootstrap on any cluster analysis output and visualise the results.
For more information see \url{https://slubbe.wixsite.com/academic-cv/conference-presentations}
}
\author{
Sugnet Lubbe slubbe@sun.ac.za

Maintainer: Sugnet Lubbe slubbe@sun.ac.za
}
\references{
Lubbe, S. Visualisations associated with bootstrapping cluster analysis. Data Science, Statistics and Visualisation conference, July 2017, Lisbon, Portugal.
}
44 changes: 44 additions & 0 deletions man/boot.silhouette.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
\name{boot.silhouette}
\alias{boot.silhouette}
\title{
Construct silhouette plot from bootstrap replicates
}
\description{
A silhouette plot is constructed, indicating the proportion of times the cluster members cluster together in the same cluster.
}
\usage{
boot.silhouette(clusboot.out, ...)
}
\arguments{
\item{clusboot.out}{
an object of class clusboot, usually, a result of a call to clusboot
}
\item{\dots}{
more plotting parameters, e.g. col
}
}
\details{
The clustering tightness is computed for each cluster, compared to the 'nearest' alternative cluster. The cluster tightness is computed
as the mean of the proportion of times each pair of objects are clustered together in the same cluster. The tightness to other clusters
is computed as the mean of the proportion of times an item of this cluster and one from the other cluster are clustered together. The 'nearest'
alternative cluster is defined as the alternative cluster with the numerically largest tightness to the current cluster.
}
\value{
a vector with number of components equal to the number of classes, providing the silhouette value for each class.
}
\references{
Lubbe, S. Visualisations associated with bootstrapping cluster analysis. Data Science, Statistics and Visualisation conference, July 2017, Lisbon, Portugal.
}
\author{
Sugnet Lubbe slubbe@sun.ac.za
}
\seealso{
\code{\link{clusboot}} for performing bootstrap on a cluster analysis output and \code{\link{plot.clusboot}} to visually represent the bootstrap replications
}
\examples{
data(case_study_psychiatrist)
boot.out <- clusboot (scale(case_study_psychiatrist), B=100, k=6, clustering.func=complete.linkage)
plot(boot.out)
boot.silhouette (boot.out)
}

52 changes: 52 additions & 0 deletions man/case_study_psychiatrist.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
\name{case_study_psychiatrist}
\alias{case_study_psychiatrist}
\docType{data}
\title{
Patient by psychiatric symptom data
}
\description{
Abstract: Presence/absence ratings of 24 psychiatric symptoms in 30 psychiatric inpatients made by an individual psychiatrist.
Subject matter background: The data have been collected in a case study of an individual psychiatrist to identify his implicit taxonomy.
Data structure: object x variables data matrix
}
\usage{data(case_study_psychiatrist)}
\format{
A data frame with 30 observations on the following 28 variables.
\describe{
\item{\code{V1}}{inappropriate affect, appearance or behavior; binary vector}
\item{\code{V2}}{interview belligerence - negativism; binary vector}
\item{\code{V3}}{agitation - excitement; binary vector}
\item{\code{V4}}{retardation; binary vector}
\item{\code{V5}}{lack of emotions; binary vector}
\item{\code{V6}}{speech disorganization; binary vector}
\item{\code{V7}}{grandiosity; binary vector}
\item{\code{V8}}{suspicion - ideas of persecution; binary vector}
\item{\code{V9}}{hallucinations - delusions; binary vector}
\item{\code{V10}}{overt anger; binary vector}
\item{\code{V11}}{depression; binary vector}
\item{\code{V12}}{anxiety; binary vector}
\item{\code{V13}}{obsession - compulsion; binary vector}
\item{\code{V14}}{suicide; binary vector}
\item{\code{V15}}{self injury; binary vector}
\item{\code{V16}}{somatic concerns; binary vector}
\item{\code{V17}}{social isolation; binary vector}
\item{\code{V18}}{daily routine impairment; binary vector}
\item{\code{V19}}{leisure time impairment; binary vector}
\item{\code{V20}}{antisocial impulses or acts; binary vector}
\item{\code{V21}}{alcohol abuse; binary vector}
\item{\code{V22}}{drug abuse; binary vector}
\item{\code{V23}}{disorientation; binary vector}
\item{\code{V24}}{memory impairment; binary vector}
\item{\code{V25}}{rating on Global Assessment Scale, a 101-point scale for overall severity of psychiatric disturbance; a numeric vector}
\item{\code{V26}}{Affective (Affective Disorder or Anxiety Disorder); binary vector}
\item{\code{V27}}{Psychotic (Schizophrenic Disorder or Paranoid Disorder); binary vector}
\item{\code{V28}}{Substance abuse (Substance Use Disorder or Substance-Induced Disorder); binary vector}
}
}
\details{
The data set forms part of the International Federation of Classification Societies Cluster Benchmark Data Repository
}
\source{
Van Mechelen, I., & De Boeck, P. (1989). Implicit taxonomy in psychiatric diagnosis: A case study. Journal of Social and Clinical Psychology, 8, 276-287. }
\references{
\url{http://ifcs.boku.ac.at/repository/data/case_study_psychiatrist/index.html}}
41 changes: 41 additions & 0 deletions man/clusboot.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
\name{clusboot}
\alias{clusboot}
\title{
Performs bootstrap on a cluster analysis output}
\description{
B bootstrap samples are drawn with replacement from the data and cluster analysis is performed on the bootstrap samples.
}
\usage{
clusboot(datmat, B = 1000, clustering.func = complete.linkage, ...)
}
\arguments{
\item{datmat}{
a samples by variables data matrix (or data frame). Bootstrap samples of its rows are passed as the first argument to clustering.func, so a distance object cannot be used directly
}
\item{B}{
number of bootstrap replicates
}
\item{clustering.func}{
any function that performs cluster analysis which returns a single vector of cluster allocations as output
}
\item{\dots}{
any other arguments to be sent to clustering.func
}
}
\value{
\item{proportions}{an nxn matrix with (i,j)-th element the proportion of times objects i and j clustered in the same cluster}
\item{clustering}{vector of cluster allocations of the input data}
}
\references{
Lubbe, S. Visualisations associated with bootstrapping cluster analysis. Data Science, Statistics and Visualisation conference, July 2017, Lisbon, Portugal.
}
\author{
Sugnet Lubbe slubbe@sun.ac.za
}
\seealso{
\code{\link{plot.clusboot}} to visually represent the bootstrap replications and \code{\link{boot.silhouette}} for a silhouette summary of the bootstrap replicates as well as \code{\link{kmeans}}
}
\examples{
data(case_study_psychiatrist)
clusboot (scale(case_study_psychiatrist), B=100, k=6, clustering.func=complete.linkage)
}
33 changes: 33 additions & 0 deletions man/complete.linkage.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
\name{complete.linkage}
\alias{complete.linkage}
\title{
Wrapper function to return only the clustering allocation.
}
\description{
Wrapper function to return only the clustering allocation from the hclust function with default method = "complete".
}
\usage{
complete.linkage(X, k)
}
\arguments{
\item{X}{
numeric matrix of data, or an object that can be coerced to such a matrix (such as a numeric vector or a data frame with all numeric columns).
}
\item{k}{
the number of clusters into which the hierarchical clustering tree is cut; passed on to \code{cutree}.
}
}
\value{
a single vector of cluster allocations, the output of cutree(hclust(dist(X)), k)
}
\references{
hclust()
}
\author{
Sugnet Lubbe slubbe@sun.ac.za
}
\examples{
data(case_study_psychiatrist)
complete.linkage(scale(case_study_psychiatrist), k=6)
}

42 changes: 42 additions & 0 deletions man/plot.clusboot.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
\name{plot.clusboot}
\alias{plot.clusboot}
\title{
Constructs a Multi-dimensional scaling plot to represent the bootstrap replications.
}
\description{
Constructs a Multi-dimensional scaling plot to represent the bootstrap replications.
}
\usage{
\method{plot}{clusboot}(x, col = NULL, ...)
}
\arguments{
\item{x}{
an object of class clusboot, usually, a result of a call to clusboot
}
\item{col}{
an optional vector equal in length to the number of clusters with colours to represent different clusters
}
\item{\dots}{
more plotting parameters
}
}
\details{
Constructs a Multi-dimensional scaling plot to represent the bootstrap replications.
}
\value{
a two-column matrix with rows containing the MDS coordinates of the samples
}
\references{
Lubbe, S. Visualisations associated with bootstrapping cluster analysis. Data Science, Statistics and Visualisation conference, July 2017, Lisbon, Portugal.
}
\author{
Sugnet Lubbe slubbe@sun.ac.za
}
\seealso{
\code{\link{clusboot}} for performing bootstrap on a cluster analysis output and \code{\link{boot.silhouette}} for a silhouette summary of the bootstrap replicates
}
\examples{
data(case_study_psychiatrist)
boot.out <- clusboot (scale(case_study_psychiatrist), B=100, k=6, clustering.func=complete.linkage)
plot(boot.out)
}

0 comments on commit 3804341

Please sign in to comment.