Skip to content

Commit

Permalink
version 0.2
Browse files Browse the repository at this point in the history
  • Loading branch information
Jeffrey L. Andrews authored and gaborcsardi committed Nov 16, 2013
0 parents commit 4b89345
Show file tree
Hide file tree
Showing 13 changed files with 578 additions and 0 deletions.
14 changes: 14 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Package: vscc
Type: Package
Title: Variable selection for clustering and classification
Version: 0.2
Date: 2013-11-16
Author: Jeffrey L. Andrews, Paul D. McNicholas
Maintainer: Jeffrey L. Andrews <jeffrey.andrews@macewan.ca>
Description: Performs variable selection/feature reduction under a clustering or classification framework. In particular, it can be used in an automated fashion using mixture model-based methods (tEIGEN and MCLUST are currently supported).
License: GPL (>= 2)
Imports: teigen, mclust
Packaged: 2013-11-16 23:30:12 UTC; jeff
NeedsCompilation: no
Repository: CRAN
Date/Publication: 2013-11-17 08:24:55
12 changes: 12 additions & 0 deletions MD5
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
b9560548ea2a7012b37e6e05bf7469e5 *DESCRIPTION
af3e0ec837f755bdaf405c90111c7d2a *NAMESPACE
a89264140a524e5a23b352409b4d7bf0 *R/plot.vscc.R
972aa6ba1490610d319b494faf438267 *R/print.vscc.R
047e0a9d46d96515e37bd0cee7a531a2 *R/summary.vscc.R
9d87a2e8d515659e5792f882e82a67db *R/vscc.R
ff5d205b6ca0116c0726df42ccd25dd2 *inst/CITATION
b0665b5c92ace46daf9f87b783c26f44 *man/plot.vscc.Rd
8d7217e814ed31023162a7024dd46cdb *man/print.vscc.Rd
2fcf41c0a6df54128bb0471ca6376b72 *man/summary.vscc.Rd
8afbad6abe93013da02de31968c9fed6 *man/vscc-package.Rd
b8d0fcd61884096ca8bfb602f973f6e0 *man/vscc.Rd
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
export(vscc)
import(stats, utils, teigen, mclust)
S3method(plot, vscc)
S3method(summary, vscc)
S3method(print, vscc)
5 changes: 5 additions & 0 deletions R/plot.vscc.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
plot.vscc <-
function(x, ...){
classcolours <- rainbow(length(unique(x$bestmod$class)))
pairs(x$top, col=classcolours[x$best$class])
}
4 changes: 4 additions & 0 deletions R/print.vscc.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
print.vscc <-
function(x, ...){
summary(x)
}
11 changes: 11 additions & 0 deletions R/summary.vscc.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
summary.vscc <- function(object, ...){
x <- object
cat("---------- Summary for VSCC ----------", "\n\n")
cat(" ---- RESULTS ---- ", "\n")
cat("# Vars: ", ncol(x$top), "\n")
cat("Relation: ", x$chosen, "\n")
cat("BIC: ", x$bestmod$bic, "\n")
cat("Model: ", x$bestmod$model, "\n")
cat("Family: ", x$family, "\n")
cat("# Groups: ", x$bestmod$G, "\n")
}
275 changes: 275 additions & 0 deletions R/vscc.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,275 @@
####Fix the arguments passed along in the post-analysis!

vscc <-function(x, G=1:9, automate="mclust", initial=NULL, train=NULL, forcereduction=FALSE){
origx <- x
origG <- G
x <- scale(x)
p <- ncol(x)
if(is.null(train)){
if(automate=="teigen"){
# require("teigen")
if(packageVersion("teigen")<1.9){
warning(paste("The 'vscc' package requires 'teigen' version 1.9 or higher, version", packageVersion("teigen"), "is currently installed: issues may arise."))
}
if(is.null(initial)){
mclinit1 <- hc("VVV",x)
mclinit2 <- hclass(mclinit1, G)
mclist <- list()
for(g in length(G)){
mclist[[G[g]]] <- mclinit2[,g]
}
initrun <- teigen(x, G, init=mclist, training=train, verbose=FALSE)
if(initrun$G==1){stop("teigen initialization gives G=1 solution...please use an initialization where G>1")}
initial <- initrun$classification
initunc <- sum(1-apply(initrun$fuzzy,1,max))
}
G <- length(unique(initial))
n <- nrow(x)
zmat <- matrix(0,n,G)
for(i in 1:G){
zmat[initial==i, i]<-1
}
}
else{
if(automate=="mclust"){
# require("mclust")
if(packageVersion("mclust")<4.0){
warning(paste("VSCC requires 'mclust' version 4.0 or higher, version", packageVersion("mclust"), "is currently installed: issues may arise."))
}
if(is.null(initial)){
initrun <- Mclust(x, G)
if(initrun$G==1){stop("mclust initialization gives G=1 solution...please use an initialization where G>1")}
initial <- initrun$classification
initunc <- sum(initrun$unc)
}
G <- length(unique(initial))
n <- nrow(x)
zmat <- matrix(0,n,G)
for(i in 1:G){
zmat[initial==i, i]<-1
}
}
else{
if(is.null(initial)){stop("If an initial clustering vector is not supplied, automate='teigen' or 'mclust' must be specified")}
}
G <- length(unique(initial))
n <- nrow(x)
zmat <- matrix(0,n,G)
for(i in 1:G){
zmat[initial==i, i]<-1
}
}
}
else{
if(is.null(initial)){
stop("If using 'train', 'initial' vector must also be given")
}
origx <- x
x <- x[train,]
G <- length(unique(initial[train]))
n <- nrow(x)
zmat <- matrix(0,n,G)
for(i in 1:G){
zmat[initial[train]==i, i]<-1
}
}

if(is.null(colnames(x))){
colnames(x) <- 1:p
colnames(origx) <- 1:p
}
ng <- colSums(zmat)
mug <- matrix(0,G,p)
for(g in 1:G){
mug[g,] <- colSums(zmat[,g]*x)/ng[g]
}
# mug <- muginit(G,p,x,zmat,ng)

mugarr <- array(0,dim=c(n,p,G))
for(g in 1:G){
mugarr[,,g] <- t(mug[g,] * t(matrix(1,n,p)))
}
mugmat <- matrix(0,n,p)
for(g in 1:G){
mugmat <- mugmat + zmat[,g] * mugarr[,,g]
}
xminusmug <- x - mugmat
ss <- xminusmug * xminusmug
ssbyvar <- colSums(ss)/n

# bssmugmat <- array(0,dim=c(n,p,G))
# for(g in 1:G){
# bssmugmat[,,g] <- bssmugmat[,,g] + (1-zmat[,g]) * mugarr[,,g]
# }
# bssxminusmug <- array(0,dim=c(n,p,G))
# for(g in 1:G){
# bssxminusmug[,,g] <- (x-bssmugmat[,,g])^2/(n-ng[g])
# }
# bssbyvar <- rep(0,p)
# for(g in 1:G){
# bssbyvar <- bssbyvar + colSums(bssxminusmug[,,g])
# }
# sortbss <- sort(bssbyvar)
sorted <- t(as.matrix(sort(ssbyvar)))
select <- list()
useselect <- list()
varnames <- list()
trun <- list()
numvars <- NA
for(i in 1:5){
select[[i]] <- matrix(data=origx[,colnames(sorted)[1]])
useselect[[i]] <- matrix(data=x[,colnames(sorted)[1]])
varnames[[i]] <- colnames(sorted)[1]
}
counts <- rep(2,5)
for(k in 2:p){
curname <- colnames(sorted)[k]
for(i in 1:5){
curcor <- cor(cbind(x[,curname],useselect[[i]]))
if(all(abs(curcor[upper.tri(curcor)])<=(1-sorted[1,k]^i))){
select[[i]] <- cbind(select[[i]],origx[,curname])
useselect[[i]] <- cbind(useselect[[i]],x[,curname])
varnames[[i]][counts[i]] <- curname
counts[i] <- counts[i]+1
}
}
}
for(i in 1:5){
colnames(select[[i]]) <- varnames[[i]]
}
if(!is.null(automate)){
tuncs <- Inf
numvars <- counts-1
counttab <- table(counts-1)
runteig <- rep(TRUE,5)
if(any(counttab>1)){
# dubvars <- as.numeric(names(which(counttab>1)))
# for(j in 1:length(dubvars)){
# relneedcheck <- which(numvars==dubvars[j])
# k <- 1
# while(k < length(relneedcheck)){
# for(i in (k+1):length(relneedcheck)){
# if(all(varnames[[relneedcheck[k]]] %in% varnames[[relneedcheck[i]]])){
# runteig[relneedcheck[i]] <- FALSE
# }
# }
# k <- k+1
# }
# }
#This could be improved
for(i in 1:4){
for(j in (i+1):5){
if(length(varnames[[i]])==length(varnames[[j]])){
if(all(varnames[[i]] %in% varnames[[j]])){
runteig[j] <- FALSE
}
}
}
}
}
if(automate=="teigen"){
for(i in 1:5){
if(runteig[i]){
G <- origG
mclinit1 <- hc("VVV",x)
mclinit2 <- hclass(mclinit1, G)
mclist <- list()
for(g in length(G)){
mclist[[G[g]]] <- mclinit2[,g]
}
trun[[i]] <- teigen(select[[i]], G, training=train, init=mclist, verbose=FALSE)
if(trun[[i]]$G>1){
tuncs[i] <- sum(1-apply(trun[[i]]$fuzzy,1,max))
}
else{
tuncs[i] <- Inf
}
}
else{
trun[[i]] <- "Same as simpler relation"
tuncs[i] <- Inf
}
}
}
else{
if(is.null(train)){
for(i in 1:5){
if(runteig[i]){
G <- origG
trun[[i]] <- Mclust(scale(select[[i]]), G)
if(trun[[i]]$G>1){
tuncs[i] <- sum(trun[[i]]$unc)
}
else{
tuncs[i] <- Inf
}
}
else{
trun[[i]] <- "Same as simpler relation"
tuncs[i] <- Inf
}
}
}
else{
for(i in 1:5){
if(runteig[i]){
G <- origG
trun[[i]] <- teigen(select[[i]], G, models="mclust", training=train, init="uniform", verbose=FALSE, known=initial)
if(trun[[i]]$G>1){
tuncs[i] <- sum(1-apply(trun[[i]]$fuzzy,1,max))
}
else{
tuncs[i] <- Inf
}
}
else{
trun[[i]] <- "Same as simpler relation"
tuncs[i] <- Inf
}
}
}
}

}
store <- list()
store[["selected"]] <- select
if(!is.null(automate)){
# if(is.null(initial)){
if(is.null(train)){
store[["initialrun"]] <- initrun
}
else{
initunc <- Inf
}
if(forcereduction){
store[["bestmodel"]] <- trun[[which.min(tuncs)]]
store[["chosenrelation"]] <- which.min(tuncs)
}
else{
if(min(tuncs)<initunc){
store[["bestmodel"]] <- trun[[which.min(tuncs)]]
store[["chosenrelation"]] <- which.min(tuncs)
store[["topselected"]] <- select[[which.min(tuncs)]]
store[["uncertainty"]] <- min(tuncs)
}
else{
store[["bestmodel"]] <- initrun
store[["chosenrelation"]] <- "Full dataset"
store[["topselected"]] <- origx
store[["uncertainty"]] <- initunc
}
}
# }
# else{
# store[["bestmodel"]] <- trun[[which.min(tuncs)]]
# store[["chosenrelation"]] <- which.min(tuncs)
# store[["topselected"]] <- select[[which.min(tuncs)]]
# store[["uncertainty"]] <- min(tuncs)
# }
store[["allmodelfit"]] <- trun
}
store[["family"]] <- automate
store[["wss"]] <- sorted
class(store) <- "vscc"
store
}
21 changes: 21 additions & 0 deletions inst/CITATION
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
citHeader("To cite package 'vscc' in publications use:")

citEntry(entry="Manual",
title = "vscc: Variable selection for clustering and classification",
author = personList(person(given=c("Jeffrey", "L."), family="Andrews"),person(given=c("Paul", "D."), family="McNicholas")),
year = "2013",
note = "R package version 1",
textVersion = "Andrews, J. L. & McNicholas, P. D. (2013). vscc: Variable selection for clustering and classification. R package version 1."
)


citEntry(entry="Article",
title=" Variable Selection for Clustering and Classification",
author=personList(person(given=c("Jeffrey", "L."), family="Andrews"),person(given=c("Paul", "D."), family="McNicholas")),
journal="arXiv preprint",
year="2013",
note = "arXiv:1303.5294",
textVersion = "Andrews, J. L. & McNicholas, P. D. (2013). Variable selection for clustering and classification. arXiv preprint arXiv:1303.5294."
)

citFooter("For BibTeX entries, call: toBibtex(citation('vscc'))")
38 changes: 38 additions & 0 deletions man/plot.vscc.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
\name{plot.vscc}
\alias{plot.vscc}
%- Also NEED an '\alias' for EACH other topic documented here.
\title{
Plotting for vscc objects
}
\description{
Dedicated plot function for objects of class vscc.
}
\usage{
\method{plot}{vscc}(x, ...)
}
%- maybe also 'usage' for other objects documented here.
\arguments{
\item{x}{
An object of class vscc.
}
\item{\dots}{
Further arguments to be passed on
}
}
\details{
Provides a scatterplot matrix of the selected variables with colours corresponding to each group.
}
\author{
Jeffrey L. Andrews
}

\seealso{
\code{\link{vscc}}
}
\examples{
require("mclust")
data(banknote)
bankrun <- vscc(banknote[,-1])
plot(bankrun)
}
\keyword{ ~kwd1 }

0 comments on commit 4b89345

Please sign in to comment.