version 0.2

cran · Nov 16, 2013 · 4b89345 · 4b89345
commit 4b89345
Show file tree

Hide file tree

Showing 13 changed files with 578 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,14 @@
+Package: vscc
+Type: Package
+Title: Variable selection for clustering and classification
+Version: 0.2
+Date: 2013-11-16
+Author: Jeffrey L. Andrews, Paul D. McNicholas
+Maintainer: Jeffrey L. Andrews <jeffrey.andrews@macewan.ca>
+Description: Performs variable selection/feature reduction under a clustering or classification framework. In particular, it can be used in an automated fashion using mixture model-based methods (tEIGEN and MCLUST are currently supported).  
+License: GPL (>= 2)
+Imports: teigen, mclust
+Packaged: 2013-11-16 23:30:12 UTC; jeff
+NeedsCompilation: no
+Repository: CRAN
+Date/Publication: 2013-11-17 08:24:55
diff --git a/MD5 b/MD5
@@ -0,0 +1,12 @@
+b9560548ea2a7012b37e6e05bf7469e5 *DESCRIPTION
+af3e0ec837f755bdaf405c90111c7d2a *NAMESPACE
+a89264140a524e5a23b352409b4d7bf0 *R/plot.vscc.R
+972aa6ba1490610d319b494faf438267 *R/print.vscc.R
+047e0a9d46d96515e37bd0cee7a531a2 *R/summary.vscc.R
+9d87a2e8d515659e5792f882e82a67db *R/vscc.R
+ff5d205b6ca0116c0726df42ccd25dd2 *inst/CITATION
+b0665b5c92ace46daf9f87b783c26f44 *man/plot.vscc.Rd
+8d7217e814ed31023162a7024dd46cdb *man/print.vscc.Rd
+2fcf41c0a6df54128bb0471ca6376b72 *man/summary.vscc.Rd
+8afbad6abe93013da02de31968c9fed6 *man/vscc-package.Rd
+b8d0fcd61884096ca8bfb602f973f6e0 *man/vscc.Rd
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,5 @@
+export(vscc)
+import(stats, utils, teigen, mclust)
+S3method(plot, vscc)
+S3method(summary, vscc)
+S3method(print, vscc)
diff --git a/R/plot.vscc.R b/R/plot.vscc.R
@@ -0,0 +1,5 @@
+plot.vscc <-
+function(x, ...){
+	classcolours <- rainbow(length(unique(x$bestmod$class)))
+	pairs(x$top, col=classcolours[x$best$class])
+}
diff --git a/R/print.vscc.R b/R/print.vscc.R
@@ -0,0 +1,4 @@
+print.vscc <-
+function(x, ...){
+	summary(x)
+}
diff --git a/R/summary.vscc.R b/R/summary.vscc.R
@@ -0,0 +1,11 @@
+summary.vscc <- function(object, ...){
+	x <- object
+    cat("---------- Summary for VSCC ----------", "\n\n")
+	cat("         ----   RESULTS   ----    ", "\n")
+	cat("# Vars:     ", ncol(x$top), "\n")
+	cat("Relation:   ", x$chosen, "\n")
+	cat("BIC:        ", x$bestmod$bic, "\n")
+	cat("Model:      ", x$bestmod$model, "\n")
+	cat("Family:     ", x$family, "\n")
+	cat("# Groups:   ", x$bestmod$G, "\n")
+}
diff --git a/R/vscc.R b/R/vscc.R
@@ -0,0 +1,275 @@
+####Fix the arguments passed along in the post-analysis!
+
+vscc <-function(x, G=1:9, automate="mclust", initial=NULL, train=NULL, forcereduction=FALSE){
+	origx <- x
+	origG <- G
+	x <- scale(x)
+	p <- ncol(x)
+	if(is.null(train)){
+		if(automate=="teigen"){
+#			require("teigen")
+			if(packageVersion("teigen")<1.9){
+				warning(paste("The 'vscc' package requires 'teigen' version 1.9 or higher, version", packageVersion("teigen"), "is currently installed: issues may arise."))
+			}
+			if(is.null(initial)){
+				mclinit1 <- hc("VVV",x)
+				mclinit2 <- hclass(mclinit1, G)
+				mclist <- list()
+				for(g in length(G)){
+					mclist[[G[g]]] <- mclinit2[,g]
+				}
+				initrun <- teigen(x, G, init=mclist, training=train, verbose=FALSE) 
+				if(initrun$G==1){stop("teigen initialization gives G=1 solution...please use an initialization where G>1")}
+				initial <- initrun$classification
+				initunc <- sum(1-apply(initrun$fuzzy,1,max))
+			}
+			G <- length(unique(initial))
+			n <- nrow(x)
+			zmat <- matrix(0,n,G)
+			for(i in 1:G){
+				zmat[initial==i, i]<-1
+			}
+		}
+		else{
+			if(automate=="mclust"){
+#				require("mclust")
+				if(packageVersion("mclust")<4.0){
+					warning(paste("VSCC requires 'mclust' version 4.0 or higher, version", packageVersion("mclust"), "is currently installed: issues may arise."))
+				}
+				if(is.null(initial)){
+					initrun <- Mclust(x, G)
+					if(initrun$G==1){stop("mclust initialization gives G=1 solution...please use an initialization where G>1")}
+					initial <- initrun$classification
+					initunc <- sum(initrun$unc)
+				}
+				G <- length(unique(initial))
+				n <- nrow(x)
+				zmat <- matrix(0,n,G)
+				for(i in 1:G){
+					zmat[initial==i, i]<-1
+				}
+			}
+			else{
+				if(is.null(initial)){stop("If an initial clustering vector is not supplied, automate='teigen' or 'mclust' must be specified")}
+			}
+			G <- length(unique(initial))
+			n <- nrow(x)
+			zmat <- matrix(0,n,G)
+			for(i in 1:G){
+				zmat[initial==i, i]<-1
+			}
+		}
+	}
+	else{
+		if(is.null(initial)){
+			stop("If using 'train', 'initial' vector must also be given")
+		}
+		origx <- x
+		x <- x[train,]
+		G <- length(unique(initial[train]))
+		n <- nrow(x)
+		zmat <- matrix(0,n,G)
+		for(i in 1:G){
+			zmat[initial[train]==i, i]<-1
+		}
+	}
+
+	if(is.null(colnames(x))){ 
+		colnames(x) <- 1:p 
+		colnames(origx) <- 1:p
+	}
+	ng <- colSums(zmat)
+	mug <- matrix(0,G,p)
+	for(g in 1:G){
+		mug[g,] <- colSums(zmat[,g]*x)/ng[g]
+	}
+#	mug <- muginit(G,p,x,zmat,ng)	
+
+	mugarr <- array(0,dim=c(n,p,G))
+	for(g in 1:G){
+		mugarr[,,g] <- t(mug[g,] * t(matrix(1,n,p)))
+	}
+	mugmat <- matrix(0,n,p)
+	for(g in 1:G){
+		mugmat <- mugmat + zmat[,g] * mugarr[,,g]
+	}
+	xminusmug <- x - mugmat
+	ss <- xminusmug * xminusmug
+	ssbyvar <- colSums(ss)/n
+
+#	bssmugmat <- array(0,dim=c(n,p,G))
+#	for(g in 1:G){
+#		bssmugmat[,,g] <- bssmugmat[,,g] + (1-zmat[,g]) * mugarr[,,g]
+#	}
+#	bssxminusmug <- array(0,dim=c(n,p,G))
+#	for(g in 1:G){
+#		bssxminusmug[,,g] <- (x-bssmugmat[,,g])^2/(n-ng[g])
+#	}
+#	bssbyvar <- rep(0,p)
+#	for(g in 1:G){
+#		bssbyvar <- bssbyvar + colSums(bssxminusmug[,,g])
+#	}
+#	sortbss <- sort(bssbyvar)
+	sorted <- t(as.matrix(sort(ssbyvar)))
+	select <- list()
+	useselect <- list()
+	varnames <- list()
+	trun <- list()
+	numvars <- NA
+	for(i in 1:5){
+		select[[i]] <-  matrix(data=origx[,colnames(sorted)[1]])
+		useselect[[i]] <- matrix(data=x[,colnames(sorted)[1]])
+		varnames[[i]] <- colnames(sorted)[1]
+	}
+	counts <- rep(2,5)
+	for(k in 2:p){
+		curname <- colnames(sorted)[k]
+		for(i in 1:5){
+			curcor <- cor(cbind(x[,curname],useselect[[i]]))
+			if(all(abs(curcor[upper.tri(curcor)])<=(1-sorted[1,k]^i))){
+				select[[i]] <- cbind(select[[i]],origx[,curname])
+				useselect[[i]] <- cbind(useselect[[i]],x[,curname])
+				varnames[[i]][counts[i]] <- curname
+				counts[i] <- counts[i]+1
+			}
+		}
+	}
+	for(i in 1:5){
+		colnames(select[[i]]) <- varnames[[i]]
+	}
+	if(!is.null(automate)){
+		tuncs <- Inf
+		numvars <- counts-1
+		counttab <- table(counts-1)
+		runteig <- rep(TRUE,5)
+		if(any(counttab>1)){
+#			dubvars <- as.numeric(names(which(counttab>1)))
+#			for(j in 1:length(dubvars)){
+#				relneedcheck <- which(numvars==dubvars[j])
+#				k <- 1
+#				while(k < length(relneedcheck)){
+#					for(i in (k+1):length(relneedcheck)){
+#						if(all(varnames[[relneedcheck[k]]] %in% varnames[[relneedcheck[i]]])){
+#							runteig[relneedcheck[i]] <- FALSE
+#						}
+#					}
+#					k <- k+1
+#				}
+#			}
+			#This could be improved
+			for(i in 1:4){
+				for(j in (i+1):5){
+					if(length(varnames[[i]])==length(varnames[[j]])){
+						if(all(varnames[[i]] %in% varnames[[j]])){
+							runteig[j] <- FALSE
+						}
+					}
+				}
+			}
+		}
+		if(automate=="teigen"){
+			for(i in 1:5){
+				if(runteig[i]){
+					G <- origG
+					mclinit1 <- hc("VVV",x)
+					mclinit2 <- hclass(mclinit1, G)
+					mclist <- list()
+					for(g in length(G)){
+						mclist[[G[g]]] <- mclinit2[,g]
+					}
+					trun[[i]] <- teigen(select[[i]], G, training=train, init=mclist, verbose=FALSE)
+					if(trun[[i]]$G>1){
+						tuncs[i] <- sum(1-apply(trun[[i]]$fuzzy,1,max))
+					}
+					else{
+						tuncs[i] <- Inf
+					}
+				}
+				else{
+					trun[[i]] <- "Same as simpler relation"
+					tuncs[i] <- Inf
+				}
+			}
+		}
+		else{
+			if(is.null(train)){
+				for(i in 1:5){
+					if(runteig[i]){
+						G <- origG
+						trun[[i]] <- Mclust(scale(select[[i]]), G)
+						if(trun[[i]]$G>1){
+							tuncs[i] <- sum(trun[[i]]$unc)
+						}
+						else{
+							tuncs[i] <- Inf
+						}
+					}
+					else{
+						trun[[i]] <- "Same as simpler relation"
+						tuncs[i] <- Inf
+					}
+				}
+			}
+			else{
+				for(i in 1:5){
+					if(runteig[i]){
+						G <- origG
+						trun[[i]] <- teigen(select[[i]], G, models="mclust", training=train, init="uniform", verbose=FALSE, known=initial)
+						if(trun[[i]]$G>1){
+							tuncs[i] <- sum(1-apply(trun[[i]]$fuzzy,1,max))
+						}
+						else{
+							tuncs[i] <- Inf
+						}
+					}
+					else{
+						trun[[i]] <- "Same as simpler relation"
+						tuncs[i] <- Inf
+					}
+				}
+			}
+		}
+
+	}
+	store <- list()
+	store[["selected"]] <- select
+	if(!is.null(automate)){
+#		if(is.null(initial)){
+			if(is.null(train)){
+				store[["initialrun"]] <- initrun
+			}
+			else{
+				initunc <- Inf
+			}
+			if(forcereduction){
+				store[["bestmodel"]] <- trun[[which.min(tuncs)]]
+				store[["chosenrelation"]] <- which.min(tuncs)
+			}
+			else{
+				if(min(tuncs)<initunc){
+					store[["bestmodel"]] <- trun[[which.min(tuncs)]]
+					store[["chosenrelation"]] <- which.min(tuncs)
+					store[["topselected"]] <- select[[which.min(tuncs)]]
+					store[["uncertainty"]] <- min(tuncs)
+				}
+				else{
+					store[["bestmodel"]] <- initrun
+					store[["chosenrelation"]] <- "Full dataset"
+					store[["topselected"]] <- origx
+					store[["uncertainty"]] <- initunc
+				}
+			}
+#		}
+#		else{
+#			store[["bestmodel"]] <- trun[[which.min(tuncs)]]
+#			store[["chosenrelation"]] <- which.min(tuncs)
+#			store[["topselected"]] <- select[[which.min(tuncs)]]
+#			store[["uncertainty"]] <- min(tuncs)
+#		}
+		store[["allmodelfit"]] <- trun
+	}
+	store[["family"]] <- automate
+	store[["wss"]] <- sorted
+	class(store) <- "vscc"
+	store
+}
diff --git a/inst/CITATION b/inst/CITATION
@@ -0,0 +1,21 @@
+citHeader("To cite package 'vscc' in publications use:")
+
+citEntry(entry="Manual",
+         title = "vscc: Variable selection for clustering and classification",
+    	author = personList(person(given=c("Jeffrey", "L."), family="Andrews"),person(given=c("Paul", "D."), family="McNicholas")),
+    	year = "2013",
+    	note = "R package version 1",
+    	textVersion = "Andrews, J. L. & McNicholas, P. D. (2013). vscc: Variable selection for clustering and classification. R package version 1."
+)
+
+
+citEntry(entry="Article",
+  title=" Variable Selection for Clustering and Classification",
+  author=personList(person(given=c("Jeffrey", "L."), family="Andrews"),person(given=c("Paul", "D."), family="McNicholas")),
+  journal="arXiv preprint",
+  year="2013",
+  note = "arXiv:1303.5294",
+  textVersion = "Andrews, J. L. & McNicholas, P. D. (2013). Variable selection for clustering and classification. arXiv preprint arXiv:1303.5294."
+)
+
+citFooter("For BibTeX entries, call: toBibtex(citation('vscc'))")
diff --git a/man/plot.vscc.Rd b/man/plot.vscc.Rd
@@ -0,0 +1,38 @@
+\name{plot.vscc}
+\alias{plot.vscc}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{
+Plotting for vscc objects
+}
+\description{
+Dedicated plot function for objects of class vscc.
+}
+\usage{
+\method{plot}{vscc}(x, ...)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+  \item{x}{
+An object of class vscc.
+}
+\item{\dots}{
+Further arguments to be passed on
+}
+}
+\details{
+Provides a scatterplot matrix of the selected variables with colours corresponding to each group.
+}
+\author{
+Jeffrey L. Andrews
+}
+
+\seealso{
+\code{\link{vscc}}
+}
+\examples{
+require("mclust")
+data(banknote)
+bankrun <- vscc(banknote[,-1])
+plot(bankrun)
+}
+\keyword{ ~kwd1 }