version 1.0
johanngb authored and cran-robot committed Feb 27, 2018
1 parent e9b82a3 commit 52463a8
Showing 16 changed files with 214 additions and 74 deletions.
12 changes: 6 additions & 6 deletions DESCRIPTION 100755 → 100644
@@ -1,15 +1,15 @@
Package: cpt
Title: Classification Permutation Test
Description: Non-parametric test for equality of multivariate distributions. Trains a classifier to classify (multivariate) observations as coming from one of two distributions. If the classifier is able to classify the observations better than would be expected by chance (using permutation inference), then the null hypothesis that the two distributions are equal is rejected.
Version: 0.9
Date: 2017-03-1
Imports: MASS, nnet, randomForest
Description: Non-parametric test for equality of multivariate distributions. Trains a classifier to classify (multivariate) observations as coming from one of several distributions. If the classifier is able to classify the observations better than would be expected by chance (using permutation inference), then the null hypothesis that the distributions are equal is rejected.
Version: 1.0
Date: 2018-2-26
Imports: MASS, nnet, randomForest, glmnet
Author: Johann Gagnon-Bartsch <johanngb@umich.edu>
Maintainer: Johann Gagnon-Bartsch <johanngb@umich.edu>
License: GPL
URL: http://dept.stat.lsa.umich.edu/~johanngb
LazyLoad: yes
NeedsCompilation: no
Packaged: 2017-03-02 13:16:22 UTC; johann
Packaged: 2018-02-27 18:59:14 UTC; johann
Repository: CRAN
Date/Publication: 2017-03-02 16:14:16
Date/Publication: 2018-02-27 19:20:03 UTC
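For context, a minimal usage sketch of the test this DESCRIPTION documents, assuming the cpt(Z, T, ...) interface shown in the R/cpt.R diff below; the simulated data, the perm.N value, and the printed field are illustrative and not taken from the package documentation.

library(cpt)
set.seed(1)
# two 3-dimensional samples with slightly different means (made-up data)
Z = rbind(matrix(rnorm(50 * 3), 50, 3),
          matrix(rnorm(50 * 3, mean = 0.5), 50, 3))
T = rep(c(0, 1), each = 50)
res = cpt(Z, T, class.methods = "forest", perm.N = 200)
res$pval  # permutation p-value for the null that the two distributions are equal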
24 changes: 13 additions & 11 deletions MD5
@@ -1,13 +1,15 @@
bf08ea2f965acb7bf0f5a2a8ebd8d29c *DESCRIPTION
f285cee7fbebbcbce60f6e076dd6164b *NAMESPACE
6eb9fdf8fc1bb8996065491e577464b8 *R/applyclassifiers.R
06d087e5273286c8a542fa3cc8f6f5df *R/cpt.R
f16b09cf2a898e26ae376ee2ae077f6c *R/getmetric.R
a118e9ddd7cac7894237e5fe02df8437 *R/gettestmethod.R
d29d8d81f29e25dca836e4f116ad6aac *DESCRIPTION
de0e7172cef2b50e23d2b4a58f96bd23 *NAMESPACE
ee246132368348b606def55cff7be0c5 *R/applyclassifiers.R
3a23a9a8d40ba0ee31b4b306c5126481 *R/cpt.R
e8063fc42ba577005bdd919ead877071 *R/getcombmethod.R
7235781e1c25370cb4c23208fb885eab *R/getensemble.metric.R
b9d644400fc9a4ef70ffb1b9f9b21832 *R/getmetric.R
7437c39174c27c502cd17294475545a7 *R/gettestmethod.R
70f2ae6f2a683d5776ab392189bee83a *R/gettestmethods.R
bdb13c32469d01d76c4de6f03fb660bc *R/getteststat.R
58fa478f85cba4a77a477636fa43b6a0 *R/gettrainmethod.R
32b30252abef51dc55ab31c6cfc68f3f *R/getteststat.R
9cbe378084ceb7abaa5f3d9f56c6d26e *R/gettrainmethod.R
ef2d130d68e57477a20eb762e0c00b53 *R/gettrainmethods.R
78fa1692627fa4fb614645a62e57e748 *R/train.R
44d74470c02d53b64e0fd0ed62461ec8 *man/cpt-package.Rd
1ad9ff61783b51b5032ced2c54fa8543 *man/cpt.Rd
35014fd243d937533e3d2940766615fd *R/train.R
c1cf85e07ad2f2ad289c2a741031861d *man/cpt-package.Rd
e72967beb125f37eb1b0a6640516f045 *man/cpt.Rd
4 changes: 2 additions & 2 deletions NAMESPACE 100755 → 100644
@@ -1,6 +1,6 @@
import(MASS, nnet, randomForest)
import(MASS, nnet, randomForest, glmnet)

importFrom("stats", "coefficients", "model.matrix", "predict", "rbinom")
importFrom("stats", "coefficients", "model.matrix", "predict", "rbinom", "sd")

export(
cpt
30 changes: 25 additions & 5 deletions R/applyclassifiers.R 100755 → 100644
@@ -1,10 +1,30 @@
applyclassifiers <-
function (tstZ, tstT, classifiers, test.methods, metric)
function (tstZ, tstT, classifiers, test.methods, metric, ensemble.metric,
testistrain = FALSE)
{
metrics = rep(0, length(classifiers))
for (i in 1:length(classifiers)) {
metrics[i] = metric(test.methods[[i]](tstZ, classifiers[[i]]),
rval = rep(NA, length(classifiers) + 1)
if (nrow(tstZ) == 1) {
class.output = rep(NA, length(classifiers) * length(levels(tstT)))
dim(class.output) = c(length(classifiers), length(levels(tstT)))
for (i in 1:length(classifiers)) class.output[i, ] = test.methods[[i]](tstZ,
classifiers[[i]], testistrain = testistrain)
for (i in 1:length(classifiers)) rval[i] = metric(class.output[i,
, drop = FALSE], tstT)
dim(class.output) = c(1, dim(class.output))
rval[length(classifiers) + 1] = ensemble.metric(class.output,
tstT)
}
return(metrics)
else {
class.output = rep(NA, nrow(tstZ) * length(classifiers) *
length(levels(tstT)))
dim(class.output) = c(nrow(tstZ), length(classifiers),
length(levels(tstT)))
for (i in 1:length(classifiers)) class.output[, i, ] = test.methods[[i]](tstZ,
classifiers[[i]], testistrain = testistrain)
for (i in 1:length(classifiers)) rval[i] = metric(class.output[,
i, ], tstT)
rval[length(classifiers) + 1] = ensemble.metric(class.output,
tstT)
}
return(rval)
}
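A quick shape check of the class.output array assembled above, using invented sizes (4 test rows, 2 classifiers, 3 class levels) and mirroring the rep()/dim() construction in the diff rather than calling the package:

n = 4; n.classifiers = 2; n.levels = 3
class.output = rep(NA, n * n.classifiers * n.levels)
dim(class.output) = c(n, n.classifiers, n.levels)
dim(class.output)  # 4 2 3: observations x classifiers x class levels
# class.output[, i, ] holds classifier i's n x n.levels probability matrix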
52 changes: 39 additions & 13 deletions R/cpt.R 100755 → 100644
@@ -1,34 +1,60 @@
cpt <-
function (Z, T, leaveout = 0, class.methods = "forest", metric = "rate",
paired = FALSE, perm.N = 1000, leaveout.N = 100)
function (Z, T, leaveout = 0, class.methods = "forest", metric = "probability",
ensemble.metric = "mean.prob", paired = FALSE, perm.N = 1000,
leaveout.N = 100, comb.methods = c(class.methods, "ensemble"),
comb.method = "fisher")
{
T = as.factor(T)
train.methods = gettrainmethods(class.methods)
test.methods = gettestmethods(class.methods)
if (is.character(metric))
metric = getmetric(metric)
if (is.character(ensemble.metric))
ensemble.metric = getensemble.metric(ensemble.metric)
if (is.character(comb.method))
comb.method = getcombmethod(comb.method)
if ((leaveout > 0) && (leaveout < 1))
leaveout = ceiling(min(table(T)) * leaveout)
nulldist = rep(0, perm.N)
teststat = getteststat(Z, T, leaveout, train.methods, test.methods,
metric, leaveout.N)
teststat = rep(NA, length(class.methods))
nulldist = matrix(NA, perm.N, length(class.methods) + 1)
colnames(nulldist) = c(class.methods, "ensemble")
for (method.i in 1:length(class.methods)) {
teststat = getteststat(Z, T, leaveout, train.methods,
test.methods, metric, ensemble.metric, leaveout.N)
}
if (paired) {
T = as.numeric(T) - 1
for (i in 1:perm.N) {
newT = as.numeric(T)
newT = T
newT[T == 0] = rbinom(length(T)/2, 1, 0.5)
newT[T == 1] = 1 - newT[T == 0]
newT = as.factor(newT)
nulldist[i] = getteststat(Z, newT, leaveout, train.methods,
test.methods, metric, leaveout.N)
nulldist[i, ] = getteststat(Z, newT, leaveout, train.methods,
test.methods, metric, ensemble.metric, leaveout.N)
}
}
else {
for (i in 1:perm.N) {
Z = Z[sample(nrow(Z)), , drop = FALSE]
nulldist[i] = getteststat(Z, T, leaveout, train.methods,
test.methods, metric, leaveout.N)
T = T[sample(length(T))]
nulldist[i, ] = getteststat(Z, T, leaveout, train.methods,
test.methods, metric, ensemble.metric, leaveout.N)
}
}
pval = sum(nulldist >= teststat)/perm.N
return(list(pval = pval, teststat = teststat, nulldist = nulldist))
pvals = rep(NA, ncol(nulldist))
names(pvals) = colnames(nulldist)
nullpvaldist = matrix(NA, perm.N, ncol(nulldist))
colnames(nullpvaldist) = names(pvals)
for (method.i in 1:ncol(nulldist)) {
pvals[method.i] = sum(nulldist[, method.i] >= teststat[method.i])/perm.N
nullpvaldist[, method.i] = rank(nulldist[, method.i],
ties.method = "max")/perm.N
}
nullcombpvaldist = apply(nullpvaldist[, comb.methods, drop = FALSE],
1, comb.method)
pval = sum(nullcombpvaldist <= comb.method(pvals[comb.methods]))/perm.N
if (length(class.methods) == 1)
return(list(pval = pvals[1], teststat = teststat[1],
nulldist = nulldist[, 1], pvals = pvals[1]))
return(list(pval = pval, teststat = teststat, nulldist = nulldist,
pvals = pvals))
}
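A standalone sketch of the new per-method p-value step above, using a made-up two-column null distribution in place of the real nulldist (one column per classifier plus the ensemble); the test statistics here are invented:

set.seed(1)
perm.N = 1000
nulldist = matrix(rnorm(perm.N * 2), perm.N, 2,
                  dimnames = list(NULL, c("forest", "ensemble")))
teststat = c(forest = 2.1, ensemble = 1.8)
pvals = rep(NA, ncol(nulldist))
names(pvals) = colnames(nulldist)
for (j in 1:ncol(nulldist)) pvals[j] = sum(nulldist[, j] >= teststat[j]) / perm.N
pvals  # fraction of permuted statistics at least as large as the observed one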
13 changes: 13 additions & 0 deletions R/getcombmethod.R
@@ -0,0 +1,13 @@
getcombmethod <-
function (comb.method)
{
if (comb.method == "fisher") {
rval = function(x) {
return(mean(log(x)))
}
}
if (comb.method == "min") {
rval = min
}
return(rval)
}
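A toy check of the "fisher" combiner defined above: it summarizes a vector of p-values by the mean of their logs, so smaller (more significant) p-values give a more negative combined statistic. The numbers are invented.

fisher.comb = function(x) mean(log(x))   # same body as the "fisher" branch above
fisher.comb(c(0.04, 0.20))  # about -2.41
fisher.comb(c(0.50, 0.60))  # about -0.60, less extreme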
32 changes: 32 additions & 0 deletions R/getensemble.metric.R
@@ -0,0 +1,32 @@
getensemble.metric <-
function (ensemble.metric)
{
if (ensemble.metric == "vote") {
rval = function(class.output, tstT) {
temp = (class.output - as.vector(apply(class.output,
c(1, 2), max))) == 0
temp = temp/as.vector(apply(temp, c(1, 2), sum))
votemat = apply(temp, c(1, 3), sum)
indexmat = cbind(1:nrow(votemat), tstT)
temp = votemat - apply(votemat, 1, max) == 0
temp = temp/apply(temp, 1, sum)
return(mean(votemat[indexmat]))
}
}
if (ensemble.metric == "mean.prob") {
rval = function(class.output, tstT) {
meanprob = apply(class.output, c(1, 3), mean)
indexmat = cbind(1:nrow(meanprob), tstT)
return(mean(meanprob[indexmat]))
}
}
if (ensemble.metric == "mean.mse") {
rval = function(class.output, tstT) {
meanprob = apply(class.output, c(1, 3), mean)
indexmat = cbind(1:nrow(meanprob), tstT)
meanprob[indexmat] = 1 - meanprob[indexmat]
return(-mean(meanprob^2))
}
}
return(rval)
}
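A small worked example of the "mean.prob" ensemble metric above, with invented numbers (two classifiers, three test observations, two classes): the true-class probability is averaged across classifiers and then across observations.

class.output = array(NA, dim = c(3, 2, 2))
class.output[, 1, ] = rbind(c(0.8, 0.2), c(0.3, 0.7), c(0.6, 0.4))  # classifier 1
class.output[, 2, ] = rbind(c(0.6, 0.4), c(0.1, 0.9), c(0.5, 0.5))  # classifier 2
tstT = factor(c("a", "b", "a"))
meanprob = apply(class.output, c(1, 3), mean)  # average over the classifier dimension
indexmat = cbind(1:nrow(meanprob), tstT)       # (row, true-class column) pairs
mean(meanprob[indexmat])                       # (0.7 + 0.8 + 0.55) / 3 = 0.6833...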
2 changes: 1 addition & 1 deletion R/getmetric.R 100755 → 100644
@@ -12,7 +7,7 @@ function (metric)
indexmat = cbind(1:nrow(prob), tstT)
temp = prob - apply(prob, 1, max) == 0
temp = temp/apply(temp, 1, sum)
return(mean(prob[indexmat]))
return(mean(temp[indexmat]))
}
}
else if (metric == "mse") {
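A worked example of the changed return value above, with invented numbers: temp is an indicator of each row's predicted class, split evenly across ties, so mean(temp[indexmat]) is a tie-adjusted classification rate, which is what the new line returns.

prob = rbind(c(0.6, 0.4), c(0.5, 0.5), c(0.2, 0.8))
tstT = factor(c("a", "a", "b"))
indexmat = cbind(1:nrow(prob), tstT)
temp = (prob - apply(prob, 1, max)) == 0  # TRUE where the row maximum sits
temp = temp / apply(temp, 1, sum)         # split ties evenly
mean(temp[indexmat])                      # (1 + 0.5 + 1) / 3 = 0.8333...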
25 changes: 20 additions & 5 deletions R/gettestmethod.R 100755 → 100644
@@ -2,26 +2,41 @@ gettestmethod <-
function (method)
{
if (method == "logistic") {
rval = function(Z, classifier) {
rval = function(Z, classifier, testistrain = FALSE) {
tmp = exp(cbind(0, cbind(1, Z) %*% classifier))
return(tmp/apply(tmp, 1, sum))
}
}
else if (method == "logistic2") {
rval = function(Z, classifier) {
rval = function(Z, classifier, testistrain = FALSE) {
tmp = exp(cbind(0, model.matrix(~.^2, data = data.frame(Z)) %*%
classifier))
return(tmp/apply(tmp, 1, sum))
}
}
else if (method == "lda") {
rval = function(Z, classifier) {
rval = function(Z, classifier, testistrain = FALSE) {
return(predict(classifier, Z)$posterior)
}
}
else if (method == "forest") {
rval = function(Z, classifier) {
return(predict(classifier, Z, type = "prob"))
rval = function(Z, classifier, testistrain = FALSE) {
if (testistrain)
return(predict(classifier, type = "prob"))
else return(predict(classifier, Z, type = "prob"))
}
}
else if (method == "glmnet") {
rval = function(Z, classifier, testistrain = FALSE) {
return(predict(classifier, newx = Z, type = "response"))
}
}
else if (method == "glmnet2") {
rval = function(Z, classifier, testistrain = FALSE) {
Z = scale(Z, center = classifier[[1]], scale = classifier[[2]])
Z = model.matrix(~.^2, data = data.frame(Z))[, -1,
drop = FALSE]
return(predict(classifier[[3]], newx = Z, type = "response"))
}
}
}
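A toy trace of the "logistic" test method shown at the top of this file, with an invented two-column Z and coefficient vector: the probabilities come from a softmax over cbind(0, linear predictor), so each row of the result sums to 1.

Z = matrix(c(0.5, -1.0, 1.5, 0.2), 2, 2)
classifier = c(0.1, 0.8, -0.5)            # intercept plus one slope per column (made up)
tmp = exp(cbind(0, cbind(1, Z) %*% classifier))
tmp / apply(tmp, 1, sum)                  # 2 x 2 matrix of class probabilities, rows sum to 1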
Empty file modified R/gettestmethods.R 100755 → 100644
Empty file.
25 changes: 13 additions & 12 deletions R/getteststat.R 100755 → 100644
@@ -1,28 +1,29 @@
getteststat <-
function (Z, T, leaveout, train.methods, test.methods, metric,
leaveout.N)
ensemble.metric, leaveout.N)
{
if (leaveout == 0) {
classifiers = train(Z, T, train.methods)
metrics = applyclassifiers(Z, T, classifiers, test.methods,
metric)
return(max(metrics))
return(applyclassifiers(Z, T, classifiers, test.methods,
metric, ensemble.metric, testistrain = TRUE))
}
else if ((leaveout == 1) & (leaveout.N == nrow(Z))) {
metricsmat = matrix(0, leaveout.N, length(train.methods))
metric.mat = matrix(NA, leaveout.N, length(train.methods) +
1)
for (leaveout.i in 1:leaveout.N) {
trnZ = Z[-leaveout.i, , drop = FALSE]
trnT = T[-leaveout.i]
tstZ = Z[leaveout.i, , drop = FALSE]
tstT = T[leaveout.i]
classifiers = train(trnZ, trnT, train.methods)
metricsmat[leaveout.i, ] = applyclassifiers(tstZ,
tstT, classifiers, test.methods, metric)
metric.mat[leaveout.i, ] = applyclassifiers(tstZ,
tstT, classifiers, test.methods, metric, ensemble.metric)
}
return(max(apply(metricsmat, 2, mean)))
return(apply(metric.mat, 2, mean))
}
else {
metricsmat = matrix(0, leaveout.N, length(train.methods))
metric.mat = matrix(NA, leaveout.N, length(train.methods) +
1)
for (leaveout.i in 1:leaveout.N) {
testset = rep(FALSE, length(T))
for (i in 1:length(levels(T))) testset[sample(which(levels(T)[i] ==
@@ -32,9 +33,9 @@ function (Z, T, leaveout, train.methods, test.methods, metric,
tstZ = Z[testset, , drop = FALSE]
tstT = T[testset]
classifiers = train(trnZ, trnT, train.methods)
metricsmat[leaveout.i, ] = applyclassifiers(tstZ,
tstT, classifiers, test.methods, metric)
metric.mat[leaveout.i, ] = applyclassifiers(tstZ,
tstT, classifiers, test.methods, metric, ensemble.metric)
}
return(max(apply(metricsmat, 2, mean)))
return(apply(metric.mat, 2, mean))
}
}
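A tiny illustration of the changed return value above, with invented numbers: metric.mat holds one row per leave-out split and one column per classifier plus the ensemble, and getteststat() now returns the vector of column means rather than the single maximum.

metric.mat = rbind(c(0.70, 0.65, 0.72),
                   c(0.60, 0.65, 0.66))
colnames(metric.mat) = c("forest", "lda", "ensemble")
apply(metric.mat, 2, mean)  # forest 0.65, lda 0.65, ensemble 0.69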
15 changes: 15 additions & 0 deletions R/gettrainmethod.R 100755 → 100644
@@ -28,5 +28,20 @@ function (method)
return(randomForest(Z, T))
}
}
else if (method == "glmnet") {
rval = function(Z, T) {
return(cv.glmnet(Z, T, family = "multinomial"))
}
}
else if (method == "glmnet2") {
rval = function(Z, T) {
means = apply(Z, 2, mean)
sds = apply(Z, 2, sd)
Z = scale(Z)
Z = as.matrix(model.matrix(~.^2, data = data.frame(Z))[,
-1])
return(list(means, sds, cv.glmnet(Z, T, family = "multinomial")))
}
}
return(rval)
}
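A quick look at the design expansion used by the new "glmnet2" trainer above: with three columns, model.matrix(~.^2) adds all pairwise interaction terms, and the leading intercept column is dropped as in the code. The data and column names below are invented.

Z = matrix(rnorm(12), 4, 3, dimnames = list(NULL, c("x1", "x2", "x3")))
colnames(model.matrix(~.^2, data = data.frame(Z))[, -1, drop = FALSE])
# "x1" "x2" "x3" "x1:x2" "x1:x3" "x2:x3"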
Empty file modified R/gettrainmethods.R 100755 → 100644
Empty file.
9 changes: 4 additions & 5 deletions R/train.R 100755 → 100644
@@ -1,9 +1,8 @@
train <-
function (trnZ, trnT, train.methods)
{
classifiers = list()
for (i in 1:length(train.methods)) {
classifiers[[i]] = train.methods[[i]](trnZ, trnT)
}
return(classifiers)
rval = list()
for (i in 1:length(train.methods)) rval[[i]] = train.methods[[i]](trnZ,
trnT)
return(rval)
}
6 changes: 3 additions & 3 deletions man/cpt-package.Rd 100755 → 100644
@@ -7,15 +7,15 @@ Classification Permutation Test
}

\description{
Non-parametric test for equality of multivariate distributions. Trains a classifier to classify (multivariate) observations as coming from one of two distributions. If the classifier is able to classify the observations better than would be expected by chance (using permutation inference), then the null hypothesis that the two distributions are equal is rejected.
Description: Non-parametric test for equality of multivariate distributions. Trains a classifier to classify (multivariate) observations as coming from one of several distributions. If the classifier is able to classify the observations better than would be expected by chance (using permutation inference), then the null hypothesis that the distributions are equal is rejected.
}

\details{
\tabular{ll}{
Package: \tab cpt\cr
Type: \tab Package\cr
Version: \tab 0.9\cr
Date: \tab 2017-3-1\cr
Version: \tab 1.0\cr
Date: \tab 2018-2-26\cr
License: \tab GPL\cr
LazyLoad: \tab yes\cr
}
