In [1]:
# Attach FlashRLearn; the startup messages show that this also loads pcg,
# FlashR, RSpectra and Rcpp, and that FlashR masks base cbind/rbind/pmax/pmin.
library(FlashRLearn)
# Point FlashR at its configuration file.
# NOTE(review): the path is machine-specific (an EC2 layout) -- adjust it
# before running elsewhere.
fm.set.conf("/FlashX/EC2/run_matrix.txt")

Loading required package: pcg
Loading required package: FlashR
Loading required package: RSpectra
Loading required package: Rcpp

Attaching package: 'FlashR'

The following objects are masked from 'package:base':

    cbind, pmax, pmin, rbind



In [5]:
# Presumably switches off FlashR's NA checking for the rest of the session
# -- TODO confirm against the FlashR documentation.
fm.set.test.na(FALSE)

NULL

In [2]:
# Generate random samples with a per-coordinate scale and offset, using FlashR
# matrices (a diagonal-only variant of the MASS-style mvrnorm signature).
#
# Arguments:
#   n          number of samples requested (default 1).
#   mu         vector whose length fixes the dimension; added to the samples.
#   Sigma_diag vector of the same length as `mu`; multiplied into the samples.
#              NOTE(review): it is applied as a direct multiplier, so it acts
#              as a per-coordinate scale rather than a variance -- confirm.
#   tol        kept in the signature but never read by the body.
#   empirical  when TRUE the raw draws are centred, rotated by the `v` factor
#              of their SVD and rescaled before `Sigma_diag`/`mu` are applied.
#   EISPACK    must be FALSE; TRUE raises an error.
#
# Returns drop() of the sample matrix when n == 1, otherwise its transpose.
mvrnorm <-
    function(n = 1, mu, Sigma_diag, tol=1e-6, empirical = FALSE, EISPACK = FALSE)
{
    dims <- length(mu)
    if (length(Sigma_diag) != dims) {
        stop("incompatible arguments")
    }
    if (EISPACK) {
        stop("'EISPACK' is no longer supported by R", domain = NA)
    }

    # Raw draws; presumably an n x dims standard-normal FlashR matrix.
    draws <- fm.rnorm.matrix(n, dims)
    if (empirical) {
        draws <- scale(draws, TRUE, FALSE)              # remove means
        draws <- draws %*% fm.svd(draws, nu = 0)$v      # rotate to PCs
        draws <- scale(draws, FALSE, TRUE)              # unit variance per PC
    }

    # Apply the per-coordinate scale first, then the offset.
    draws <- fm.mapply.row(draws, Sigma_diag, "*")
    draws <- fm.mapply.row(draws, drop(mu), "+")

    if (n == 1) {
        return(drop(draws))
    }
    t(draws)
}

# Build a labelled mixture of `m` components.
#
# Arguments:
#   n  number of samples drawn per component.
#   p  number of features; each component gets its own runif(p) offset and
#      runif(p) scale, passed to mvrnorm().
#   m  number of mixture components.
#
# Returns a list with
#   X  the per-component matrices returned by mvrnorm(), column-bound with
#      fm.cbind.list() in component order;
#   y  numeric class labels 0, 1, ..., m - 1, each repeated n times, in the
#      same order as the columns of X.
#
# The labels used to be hard-coded for exactly two classes, so `y` did not
# line up with `X` for any other `m`; they are now derived from `m`.
mix.mvrnorm <- function(n, p, m)
{
    # lapply keeps each component as one list element and, unlike 1:m,
    # seq_len() yields no iterations when m is 0.
    mats <- lapply(seq_len(m), function(i) mvrnorm(n, runif(p), runif(p)))
    list(X = fm.cbind.list(mats), y = rep(seq_len(m) - 1, each = n))
}

# Generate the working data set: mix.mvrnorm(n = 100, p = 1000000, m = 2),
# i.e. 100 samples per component, one million features, two components.
mix <- mix.mvrnorm(100, 1000000, 2)
# `data` and `labels` are globals used by the cells that follow.
data <- mix$X
labels <- mix$y

In [7]:
# Summarise the generated matrix: samples are columns, features are rows.
print(paste("#samples:", ncol(data)))
print(paste("#features:", nrow(data)))
# Assumes 8 bytes per element (double precision) -- TODO confirm the element
# type of the FlashR matrix.
print(paste("size in bytes:", length(data) * 8))

[1] "#samples: 200"
[1] "#features: 1e+06"
[1] "size in bytes: 1.6e+09"


In [3]:
# Compute a projection matrix for `data` with one of four embedding methods.
#
# Arguments:
#   data    feature-by-sample matrix (the PCA branch centres each row by its
#           mean, so rows are treated as features).
#   labels  class labels; converted with as.integer() and fm.conv.R2FM()
#           before being passed to LOL()/QOQ(). Not used by "PCA".
#   proj    embedding method: "LOL" (LOL with type = "svd"), "LAL" (LOL with
#           type = "rand_sparse"), "QOQ" or "PCA". Defaults to "LOL".
#   red.p   number of dimensions requested from the embedding.
#
# Returns the projection produced by the chosen method. An unknown method
# name raises an error (via match.arg) instead of silently yielding NULL.
embed.classifier <- function(data, labels, proj=c("LOL", "LAL", "QOQ", "PCA"), red.p=1)
{
	# match.arg resolves the vector default to its first entry; comparing the
	# whole default vector inside `if` is an error on current R versions.
	method <- match.arg(proj)

	if (method == "PCA") {
		mu <- rowMeans(data)
		center.mat <- sweep(data, 1, mu, "-")
		# Samples become rows after the transpose; the `v` factor is returned.
		# NOTE(review): the two red.p arguments are presumably nu and nv --
		# confirm against the fm.svd signature.
		res <- fm.svd(t(center.mat), red.p, red.p)
		return(res$v)
	}

	fm.labels <- fm.conv.R2FM(as.integer(labels))
	switch(method,
		LOL = LOL(data, fm.labels, red.p, type="svd"),
		LAL = LOL(data, fm.labels, red.p, type="rand_sparse"),
		QOQ = QOQ(data, fm.labels, red.p))
}

# Project the training data and fit a discriminant model on the result.
#
# Arguments:
#   data    matrix whose transpose is multiplied by `proj`.
#   labels  class labels, turned into a factor for the fit.
#   proj    projection matrix.
#   method  "lda" or "qda"; any other value fits nothing.
#
# Returns list(proj = <the projection passed in>, res = <fitted model>),
# where `res` is NULL when `method` is neither "lda" nor "qda".
train.classifier <- function(data, labels, proj, method="lda")
{
	embedded <- t(data) %*% proj
	# The conversion to a plain R matrix happens only inside the branch that
	# actually fits a model.
	fit <- if (method == "lda") {
		lda(as.matrix(fm.conv.FM2R(embedded)), as.factor(labels))
	} else if (method == "qda") {
		qda(as.matrix(fm.conv.FM2R(embedded)), as.factor(labels))
	} else {
		NULL
	}
	list(proj=proj, res=fit)
}

# Classify new data with a model produced by train.classifier().
#
# Arguments:
#   object   list with elements `proj` (projection matrix) and `res` (fitted
#            model), as returned by train.classifier().
#   newdata  matrix whose transpose is multiplied by `object$proj`.
#
# Returns whatever predict() returns for `object$res` on the projected data.
predict.classifier <- function(object, newdata)
{
	# Project, then convert to a plain R matrix for predict().
	embedded <- fm.conv.FM2R(t(newdata) %*% object$proj)
	predict(object=object$res, newdata=as.matrix(embedded))
}

# Repeated random train/test evaluation of LOL, RR-LDA and PCA embeddings
# followed by LDA. Results are printed; nothing useful is returned.
#
# Arguments:
#   data           matrix with one column per sample.
#   labels         numeric class labels, one per column of `data`. The error
#                  computation compares as.integer(pred$class) with
#                  labels + 1, so labels are expected to be 0-based.
#   count          number of random splits to run.
#   train.percent  fraction of the columns used for training.
#   red.ps         vector of embedding dimensions to evaluate.
#
# For each split the embedding is computed once at max(red.ps) and then
# truncated to each requested dimension:
#   LOL     uses columns 1..red.p of the LOL projection,
#   RR-LDA  uses columns 2..min(red.p + 1, ncol(proj)) of the same projection,
#   PCA     uses columns 1..red.p of the PCA projection.
#
# Returns NULL invisibly.
rand.split.test <- function(data, labels, count, train.percent, red.ps)
{
	# Fit LDA on one projection, classify the held-out columns and print the
	# predicted classes followed by the misclassification rate.
	report <- function(train, train.labels, test, truth, proj, red.p,
					   predict.tag, error.tag) {
		res <- train.classifier(train, train.labels, proj, method="lda")
		pred <- predict.classifier(object=res, newdata=test)
		print(paste0(predict.tag, " predict:"))
		print(pred$class)
		# Fraction of test samples whose predicted class differs from truth
		# (an error rate: 0 means every prediction was correct).
		err <- sum((as.integer(pred$class) - truth) != 0)/length(pred$class)
		print(paste0(error.tag, "-", red.p, "dim: ", err))
		invisible(NULL)
	}

	idxs <- seq_len(ncol(data))
	train.size <- as.integer(ncol(data) * train.percent)

	for (run in seq_len(count)) {
		train.idxs <- sort(sample(idxs, train.size))
		# Exact set difference. pmatch() compares the indices as strings with
		# partial matching (e.g. "2" can match "20"), which could silently
		# drop held-out columns from the test set.
		test.idxs <- setdiff(idxs, train.idxs)
		train <- data[,train.idxs]
		test <- data[,test.idxs]
		train.labels <- labels[train.idxs]
		# Shift 0-based labels to line up with the integer codes of pred$class.
		truth <- labels[test.idxs]+1
		print("truth:")
		print(truth)

		proj <- embed.classifier(train, train.labels, proj="LOL", max(red.ps))
		for (red.p in red.ps) {
			report(train, train.labels, test, truth,
				   proj[, seq_len(red.p)], red.p, "LOL+LDA", "LOL")
			report(train, train.labels, test, truth,
				   proj[, 2:min(red.p + 1, ncol(proj))], red.p, "RR-LDA", "RR-LDA")
			gc()
		}
		# Drop the projection before building the next one to release memory.
		proj <- NULL
		gc()

		proj <- embed.classifier(train, train.labels, proj="PCA", max(red.ps))
		for (red.p in red.ps) {
			report(train, train.labels, test, truth,
				   proj[, seq_len(red.p)], red.p, "PCA+LDA", "PCA")
			gc()
		}
		proj <- NULL
		gc()
	}
	invisible(NULL)
}

In [6]:
# MASS supplies lda() and qda(), which train.classifier() calls.
library(MASS)
# One random split, 90% of the columns for training, evaluated at embedding
# dimensions 1, 10, 20, ..., 100.
rand.split.test(data, labels, 1, 0.9, c(1, seq(10, 100, 10)))

[1] "truth:"
 [1] 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2
[1] "LOL+LDA predict:"
 [1] 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
Levels: 0 1
[1] "LOL-1dim: 0"
[1] "RR-LDA predict:"
 [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Levels: 0 1
[1] "RR-LDA-1dim: 0.6"
[1] "LOL+LDA predict:"
 [1] 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
Levels: 0 1
[1] "LOL-10dim: 0"
[1] "RR-LDA predict:"
 [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Levels: 0 1
[1] "RR-LDA-10dim: 0.6"
[1] "LOL+LDA predict:"
 [1] 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
Levels: 0 1
[1] "LOL-20dim: 0"
[1] "RR-LDA predict:"
 [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Levels: 0 1
[1] "RR-LDA-20dim: 0.6"
[1] "LOL+LDA predict:"
 [1] 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
Levels: 0 1
[1] "LOL-30dim: 0"
[1] "RR-LDA predict:"
 [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Levels: 0 1
[1] "RR-LDA-30dim: 0.6"
[1] "LOL+LDA predict:"
 [1] 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
Levels: 0 1
[1] "LOL-40dim: 0"
[1] "RR-LDA predict:"
 [1] 0 0 0 0 0 

In [7]:
# Inspect the dimensions of the data matrix; in the recorded run this printed
# NULL (use nrow()/ncol() as in the earlier cell to get the sizes).
dim(data)

NULL