Commit d98aefa (initial commit): version 1.0
sreid11 authored and gaborcsardi committed May 6, 2013
Showing 20 changed files with 1,750 additions and 0 deletions.
19 changes: 19 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,19 @@
Package: clogitL1
Type: Package
Title: Fitting exact conditional logistic regression with lasso and
elastic net penalties.
Version: 1.0
Date: 2013-05-06
Author: Stephen Reid and Robert Tibshirani
Maintainer: Stephen Reid <sreid@stanford.edu>
Description: Tools for the fitting and cross validation of exact
conditional logistic regression models with lasso and elastic
net penalties. Uses cyclic coordinate descent and warm starts
to compute the entire path efficiently.
License: GPL-2
Depends: Rcpp (>= 0.10.2)
LinkingTo: Rcpp
Packaged: 2013-06-14 17:59:41 UTC; sreid
NeedsCompilation: yes
Repository: CRAN
Date/Publication: 2013-06-14 21:44:33
19 changes: 19 additions & 0 deletions MD5
@@ -0,0 +1,19 @@
569100fec08c627d6f416f8f4c361659 *DESCRIPTION
34b84ed8d271a69460e92038243d161c *NAMESPACE
2ca5f1f5572e3ed9698403e0e8ade222 *R/RcppExports.R
a899f7e98a4814fbd7aa18e4e069a653 *R/clogitL1.R
ceba51f55ce4cedfc205190aa360782f *R/cv.clogitL1.R
5985015687c608cb731bc7b0ff659b09 *R/plot.clogitL1.R
006f13e07d87b59e783b256189b03af9 *R/plot.cv.clogitL1.R
5f561e7fd0ee2f2972e0956dfff11a1e *R/print.clogitL1.R
906ac267ba416c09ad7a3e5be92f6922 *man/clogitL1-package.Rd
86b2d499b2a0c245528c9a0f17734342 *man/clogitL1.Rd
874de325a4b97dd43fb474300a7c910d *man/cv.clogitL1.Rd
a9c4ec4ed182d9aae156c75d6e2a92ea *man/plot.clogitL1.Rd
53fc20c60545dfa88ac1d6f2de3ba602 *man/plot.cv.clogitL1.Rd
118e2ab7918c9037931da76edb642bcb *man/print.clogitL1.Rd
b3f86c20b81b51f5162874977d38784d *src/Makevars
73505498722ad4b07b0ddb3067e92523 *src/Makevars.win
d1045b24fc4eb6ec76b847597b02ef8b *src/RcppExports.cpp
c1174da64ffbe4667392c902b69c4ce0 *src/clogisticl1_5.cpp
be43acb3bd583e108720f500a1c49a5f *src/clogisticl1_5cv.cpp
6 changes: 6 additions & 0 deletions NAMESPACE
@@ -0,0 +1,6 @@
useDynLib(clogitL1)
export(clogitL1)
export(print.clogitL1)
export(plot.clogitL1)
export(cv.clogitL1)
export(plot.cv.clogitL1)
11 changes: 11 additions & 0 deletions R/RcppExports.R
@@ -0,0 +1,11 @@
# This file was generated by Rcpp::compileAttributes
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

clogitl1_c <- function(n, m, Xmat, yvec, switchIter, numLambda, minLambda, alpha) {
.Call('clogitL1_clogitl1_c', PACKAGE = 'clogitL1', n, m, Xmat, yvec, switchIter, numLambda, minLambda, alpha)
}

clogitl1CV_c <- function(n, m, Xmat, yvec, lambdas, keepvec, alpha) {
.Call('clogitL1_clogitl1CV_c', PACKAGE = 'clogitL1', n, m, Xmat, yvec, lambdas, keepvec, alpha)
}

44 changes: 44 additions & 0 deletions R/clogitL1.R
@@ -0,0 +1,44 @@
clogitL1 = function(y, X, strata, numLambda = 100, minLambdaRatio = 0.000001, switch = floor(0.9*numLambda), alpha = 1){

# twiddle parameters for stability
minLambdaRatio = max(c(minLambdaRatio, 0.000001))
minLambdaRatio = min(c(minLambdaRatio, 1))

alpha = max(c(alpha, 0.000001))
alpha = min(c(alpha, 1))

# make sure inputs are appropriate
if (length(y) != length(strata)) stop("Please specify one stratum per observation")
if (length(y) != nrow(X)) stop("Please ensure that each observation has predictors and response")
if (any(y != 1 & y != 0)) stop("Response vector should contain 1 for cases and 0 for controls")

# first put y and X into form conducive to call to C function
# group strata together, putting cases first
yC = NULL
XC = NULL
nVec = NULL
mVec = NULL
for (i in unique(strata)){
currentX = X[strata==i, ]
currentY = y[strata==i]

yC = c(yC, currentY[currentY == 1], currentY[currentY==0])
XC = rbind(XC, currentX[currentY == 1, ], currentX[currentY == 0, ])
nVec = c(nVec, length(currentY))
mVec = c(mVec, sum(currentY))
}
XC = as.matrix(XC)

# fit the model - clogitL1_c returns a matrix
# first ncol(x) columns: beta estimates
# column ncol(X)+1: lambda values, then non zero beta and strong set beta
esti.beta = clogitl1_c (nVec, mVec, XC, yC, switch, numLambda, minLambdaRatio, alpha=alpha) # use the reordered yC, XC built above (strata grouped, cases first)

out.beta = esti.beta[,1:ncol(X)] # maybe we stopped before we got to numLambda iterations
empty.beta = apply (esti.beta, 1, function(x){all(x==0)})
out.beta = out.beta[!empty.beta,]

out = list(beta=out.beta, lambda=esti.beta[!empty.beta,ncol(X)+1], nz_beta=esti.beta[!empty.beta,ncol(X)+2], ss_beta=esti.beta[!empty.beta,ncol(X)+3], dev_perc=esti.beta[!empty.beta, ncol(X)+5], y_c=yC, X_c=XC, nVec=nVec, mVec=mVec, alpha=alpha)
class(out) = "clogitL1"
out
}
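
A minimal standalone sketch of the reordering performed above, using toy values (not taken from the package) to show how each stratum is grouped with its cases first:

# toy data: 2 strata of 3 observations, 1 case per stratum (illustrative values)
y      = c(0, 1, 0,  0, 0, 1)
strata = c(1, 1, 1,  2, 2, 2)
X      = matrix(seq_len(12), ncol = 2)            # 6 observations, 2 predictors

yC = NULL; XC = NULL; nVec = NULL; mVec = NULL
for (i in unique(strata)){
	cs = strata == i
	yC   = c(yC, y[cs][y[cs] == 1], y[cs][y[cs] == 0])   # cases first, then controls
	XC   = rbind(XC, X[cs, , drop = FALSE][y[cs] == 1, , drop = FALSE],
	                 X[cs, , drop = FALSE][y[cs] == 0, , drop = FALSE])
	nVec = c(nVec, sum(cs))                              # stratum size
	mVec = c(mVec, sum(y[cs]))                           # cases in the stratum
}
yC     # 1 0 0 1 0 0 -- each stratum's case now leads its block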
42 changes: 42 additions & 0 deletions R/cv.clogitL1.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
cv.clogitL1 = function(clObj, numFolds=10){
# Function that accepts a object of type "clogitL1" and returns an object
# of type "cv.clogitL1" containing relevant CV information

if (!inherits(clObj, "clogitL1")){
stop("requires object of type clogitL1")
}

# assign folds
K = length(clObj$nVec) # number of strata
nFolds = min(c(K, numFolds))

perFold = floor(K/nFolds)
extra = K %% nFolds
foldVec = NULL
for (i in 1:nFolds){
foldVec = c(foldVec, rep(i, perFold))
if (i <= extra) foldVec = c(foldVec, i) # another one to make up extra
}
foldVec = sample (foldVec, K, replace = F) # jumble around the folds

# now run the CV version of clogitL1
cv.mat = NULL
for (i in 1:nFolds){
keepvec = ifelse (foldVec == i, 0, 1)
esti.beta.cv = clogitl1CV_c (clObj$nVec, clObj$mVec, clObj$X_c, clObj$y_c, clObj$lambda, keepvec, clObj$alpha)
cv.mat = cbind(cv.mat, esti.beta.cv[,ncol(clObj$X_c) + 6])
}

# find minimum and 1-se rule
mean.cv.err = -apply (cv.mat, 1, mean)[-1] # cv error at each lambda
sd.cv.err = apply (cv.mat, 1, function(x){sqrt(var(x))})[-1]/sqrt(nFolds) # estimate of the standard error of the mean cv error
plot.lambdas = log(clObj$lambda)[-1]

min.lambda = plot.lambdas[which.min(mean.cv.err)] # lambda of minimum cv error
min.lambda.1sd = max(plot.lambdas[mean.cv.err-sd.cv.err <= min(mean.cv.err)])

out = list(cv_dev=cv.mat, lambda=plot.lambdas, folds=foldVec, mean_cv=mean.cv.err, se_cv=sd.cv.err, minCV_lambda=min.lambda, minCV1se_lambda=min.lambda.1sd, nz_beta=clObj$nz_beta[-1])
class(out) = "cv.clogitL1"
out
}
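
Run in isolation, the fold-assignment step above spreads strata as evenly as possible across folds; a small sketch with illustrative numbers:

set.seed(1)
K       = 13                     # number of strata (illustrative)
nFolds  = 5
perFold = floor(K/nFolds)        # 2 strata in every fold
extra   = K %% nFolds            # 3 folds receive one extra stratum
foldVec = NULL
for (i in 1:nFolds){
	foldVec = c(foldVec, rep(i, perFold))
	if (i <= extra) foldVec = c(foldVec, i)
}
foldVec = sample(foldVec, K, replace = FALSE)    # random fold label per stratum
table(foldVec)                   # fold sizes 3, 3, 3, 2, 2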

8 changes: 8 additions & 0 deletions R/plot.clogitL1.R
@@ -0,0 +1,8 @@
plot.clogitL1 = function(x, logX=T, ...){
# Plots the parameter profiles against (log) regularisation parameter
# If 'logX' is TRUE, then we plot it against the log regularisation parameter

if (logX) horiz = log(x$lambda)
else horiz = x$lambda
matplot (x=horiz, y=x$beta, type = "l", xlab = "Regularisation parameter", ylab = "Parameter estimate", ...)
}
14 changes: 14 additions & 0 deletions R/plot.cv.clogitL1.R
@@ -0,0 +1,14 @@
plot.cv.clogitL1 = function(x, ...){
# Function for plotting a cv.clogitL1 object - plots the mean cv deviance and standard error bars

plot (x=c(x$lambda, x$lambda), y = c(x$mean_cv + x$se_cv, x$mean_cv - x$se_cv), type = "n", xlab = "log(lambda)", ylab = "Conditional likelihood deviance", ...)
points (x=x$lambda, y=x$mean_cv, col = "red") # mean cv deviance
axis(side=3, at = x$lambda, labels=x$nz_beta, tick=FALSE) # number of non-zero predictors on top axis
for (i in 1:length(x$lambda)){ # standard error bounds
lines (x = rep(x$lambda[i], 2), y = x$mean_cv[i] + c(x$se_cv[i], -x$se_cv[i]), col = "gray") # vertical lines
lines (x = x$lambda[i] + c(-0.03, 0.03), y = rep (x$mean_cv[i] + x$se_cv[i], 2), col = "gray") # top horizontal line
lines (x = x$lambda[i] + c(-0.03, 0.03), y = rep (x$mean_cv[i] - x$se_cv[i], 2), col = "gray") # bottom horizontal line
}
abline (v = x$minCV_lambda, lty = 2) # minimum CV lambda
abline (v = x$minCV1se_lambda, lty=2) # 1SE-rule lambda
}
8 changes: 8 additions & 0 deletions R/print.clogitL1.R
@@ -0,0 +1,8 @@
print.clogitL1 = function (x, digits = 6, ...){
# prints a summary of the regularisation path of a clogitL1 object
# output:
# - 3 column matrix containing: non-zero beta, %deviance explained, lambda

out = data.frame(Df=x$nz_beta, DevPerc=round(x$dev_perc, digits), Lambda=round(x$lambda, digits))
print(out, ...)
}
32 changes: 32 additions & 0 deletions man/clogitL1-package.Rd
@@ -0,0 +1,32 @@
\name{clogitL1-package}
\alias{clogitL1-package}
\docType{package}
\title{
Penalised conditional logistic regression.
}
\description{
Tools for the fitting and cross validation of exact conditional logistic regression models with lasso and elastic net penalties. Uses cyclic coordinate descent and warm starts to compute the entire path efficiently.
}
\details{
\tabular{ll}{
Package: \tab clogitL1\cr
Type: \tab Package\cr
Version: \tab 1.0\cr
Date: \tab 2013-05-06\cr
License: \tab GPL-2\cr
}
The package is simple to use. The main fitting function \code{clogitL1} accepts a response vector y, a predictor matrix X and a strata vector indicating stratum membership, and fits the exact conditional logistic regression model over a grid of regularisation parameters.
Only 5 functions are exported:
\itemize{
\item \code{clogitL1}
\item \code{cv.clogitL1}
\item \code{plot.clogitL1}
\item \code{plot.cv.clogitL1}
\item \code{print.clogitL1}
}
}
\author{
Stephen Reid and Rob Tibshirani

Maintainer: Stephen Reid <sreid@stanford.edu>
}
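
A minimal end-to-end sketch of the workflow described in this help page, mirroring the examples shipped in man/clogitL1.Rd (it assumes the package is installed):

library(clogitL1)
set.seed(142)
K = 10; n = 5; m = 2; p = 20                    # strata, stratum size, cases per stratum, predictors

y      = rep(c(rep(1, m), rep(0, n - m)), K)    # 1 = case, 0 = control
X      = matrix(rnorm(K*n*p), ncol = p)         # pure-noise predictors
strata = rep(1:K, each = n)

clObj = clogitL1(y, X, strata)                  # full regularisation path
print(clObj)                                    # Df, %deviance explained, lambda at each step
clcvObj = cv.clogitL1(clObj)                    # cross validation over strata
plot(clcvObj)                                   # CV deviance with minimum and 1-SE lambdas marked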
70 changes: 70 additions & 0 deletions man/clogitL1.Rd
@@ -0,0 +1,70 @@
\name{clogitL1}
\alias{clogitL1}
\title{Conditional logistic regression with elastic net penalties}
\description{
Fit a sequence of conditional logistic regression models with lasso or elastic net penalties
}

\usage{
clogitL1 (y, X, strata, numLambda=100,
minLambdaRatio=0.000001, switch=floor(0.9*numLambda), alpha = 1)
}

\arguments{
\item{y}{a vector of binary responses with 1 for cases and 0 for controls.}
\item{X}{a matrix with the same number of rows as the length of y and p columns. Each row contains the p-vector of regressor values for one observation.}
\item{strata}{vector with stratum membership of each observation.}
\item{numLambda}{number of different values of the regularisation parameter \eqn{\lambda} at which to compute parameter estimates. The first fit is made at the value just below the smallest regularisation parameter value at which all parameter estimates are 0; the last fit is made at this value multiplied by \code{minLambdaRatio}.}
\item{minLambdaRatio}{ratio of the smallest to the largest value of the regularisation parameter \eqn{\lambda} at which we find parameter estimates.}
\item{switch}{index (between 0 and \code{numLambda}) at which we transition from linear to logarithmic jumps.}
\item{alpha}{parameter controlling the trade-off between lasso and ridge penalties. At value 1, we have a pure lasso penalty; at 0, a pure ridge penalty. Intermediate values provide a mixture of the two.}
}

\details{
The sequence of models implied by \code{numLambda} and \code{minLambdaRatio} is fit by coordinate descent with warm starts and sequential strong rules. If \code{alpha=1}, we fit using a lasso penalty. Otherwise we fit with an elastic net penalty. Note that a pure ridge penalty is never obtained, because the function sets a floor for \code{alpha} at 0.000001; this improves the stability of the algorithm. A similar lower bound is set for \code{minLambdaRatio}. The sequence of models can be truncated at fewer than \code{numLambda} models if a very large proportion of the training-set deviance is already explained by the fitted model.

}
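
For reference, the elastic net penalty that \code{alpha} interpolates is conventionally written as below (the exact internal scaling is not stated in this help file, so this is the standard glmnet-style form rather than a quotation from the package):

	\lambda \left[ \alpha \|\beta\|_1 + \frac{1 - \alpha}{2} \|\beta\|_2^2 \right]

with \alpha = 1 giving the pure lasso penalty and \alpha = 0 the pure ridge penalty.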

\value{
An object of type \code{clogitL1} with the following fields:

\item{beta}{a (\code{numLambda} + 1)-by-p matrix of estimated coefficients. First row has all 0s}
\item{lambda}{a vector of length \code{numLambda} + 1 containing the value of the regularisation parameter at which we obtained the fits.}
\item{nz_beta}{a vector of length \code{numLambda} + 1 containing the number of nonzero parameter estimates for the fit at the corresponding regularisation parameter.}
\item{ss_beta}{a vector of length \code{numLambda} + 1 containing the number of predictors considered by the sequential strong rule at that iteration.}
\item{dev_perc}{a vector of length \code{numLambda} + 1 containing the percentage of null deviance explained by the model represented by that row in the matrix.}
\item{y_c}{reordered vector of responses. Grouped by stratum with cases coming first.}
\item{X_c}{reordered matrix of predictors. See above.}
\item{strata_c}{reordered stratum vector. See above.}
\item{nVec}{vector of length the number of unique strata in \code{strata} containing the number of observations encountered in each stratum.}
\item{mVec}{vector containing the number of cases in each stratum.}
\item{alpha}{penalty trade off parameter.}
}

\seealso{
\code{\link{plot.clogitL1}}
}

\examples{

set.seed(142)
# data parameters
K = 10 # number of strata
n = 5 # number in strata
m = 2 # cases per stratum
p = 20 # predictors

# generate data
y = rep(c(rep(1, m), rep(0, n-m)), K)
X = matrix (rnorm(K*n*p, 0, 1), ncol = p) # pure noise
strata = sort(rep(1:K, n))

par(mfrow = c(1,2))
# fit the conditional logistic model
clObj = clogitL1(y, X, strata)
plot(clObj, logX=TRUE)

# cross validation
clcvObj = cv.clogitL1(clObj)
plot(clcvObj)
}
59 changes: 59 additions & 0 deletions man/cv.clogitL1.Rd
@@ -0,0 +1,59 @@
\name{cv.clogitL1}
\alias{cv.clogitL1}
\title{Cross validation of conditional logistic regression with elastic net penalties}
\description{
Find the best of a sequence of conditional logistic regression models with lasso or elastic net penalties using cross validation
}

\usage{
cv.clogitL1 (clObj, numFolds=10)
}

\arguments{
\item{clObj}{an object of type \code{clogitL1} on which we do cross validation.}
\item{numFolds}{the number of folds used in cross validation. Defaults to 10, capped at the number of strata.}
}

\details{
Performs \code{numFolds}-fold cross validation on an object of type \code{clogitL1}. Using the sequence of regularisation parameters generated by \code{clObj}, the function randomly assigns whole \emph{strata} to folds. The penalised conditional logistic regression model is fit to the non-left-out strata in turn, and its deviance is compared to an out-of-sample deviance computed on the left-out strata. Fitting to the non-left-out strata uses the same cyclic coordinate descent algorithm (with warm starts and strong rules) as \code{clogitL1}, only with a prespecified sequence of \eqn{\lambda}.
}

\value{
An object of type \code{cv.clogitL1} with the following fields:

\item{cv_dev}{matrix of size \code{numLambda}-by-\code{numFolds} containing the CV deviance in each fold for each value of the regularisation parameter.}
\item{lambda}{vector of regularisation parameters.}
\item{folds}{vector showing the folds membership of each observation.}
\item{mean_cv}{vector containing mean CV deviances for each value of the regularisation parameter.}
\item{se_cv}{vector containing an estimate of the standard error of the CV deviance at each value of the regularisation parameter.}
\item{minCV_lambda}{value of the regularisation parameter at which we have minimum \code{mean_cv}}
\item{minCV1se_lambda}{value of the regularisation parameter selected by the 1-SE rule: the simplest model whose estimated CV deviance is within one standard error of the minimum.}
\item{nz_beta}{number of nonzero parameter estimates at each value of the regularisation parameter.}
}

\seealso{
\code{\link{clogitL1}}, \code{\link{plot.cv.clogitL1}}
}
\examples{
set.seed(142)

# data parameters
K = 10 # number of strata
n = 5 # number in strata
m = 2 # cases per stratum
p = 20 # predictors

# generate data
y = rep(c(rep(1, m), rep(0, n-m)), K)
X = matrix (rnorm(K*n*p, 0, 1), ncol = p) # pure noise
strata = sort(rep(1:K, n))

par(mfrow = c(1,2))
# fit the conditional logistic model
clObj = clogitL1(y, X, strata)
plot(clObj, logX=TRUE)

# cross validation
clcvObj = cv.clogitL1(clObj)
plot(clcvObj)
}
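
The 1-SE rule behind \code{minCV1se_lambda}, as implemented in R/cv.clogitL1.R above, selects

	\lambda_{1se} = \max\{\lambda : \mathrm{CV}(\lambda) - \mathrm{SE}(\lambda) \le \min_{\lambda'} \mathrm{CV}(\lambda')\}

i.e. the largest (most heavily penalised, hence simplest) \lambda whose mean CV deviance, less its standard error, does not exceed the minimum mean CV deviance over the path.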
44 changes: 44 additions & 0 deletions man/plot.clogitL1.Rd
@@ -0,0 +1,44 @@
\name{plot.clogitL1}
\alias{plot.clogitL1}
\title{Plotting after fitting conditional logistic regression with elastic net penalties}
\description{
Takes a \code{clogitL1} object and plots the parameter profile associated with it.
}

\usage{
\method{plot}{clogitL1} (x, logX = T, ...)
}

\arguments{
\item{x}{an object of type \code{clogitL1}.}
\item{logX}{should the horizontal axis be on log scale?}
\item{...}{additional arguments to \code{plot} function}
}

\seealso{
\code{\link{clogitL1}}
}

\examples{
set.seed(142)

# data parameters
K = 10 # number of strata
n = 5 # number in strata
m = 2 # cases per stratum
p = 20 # predictors

# generate data
y = rep(c(rep(1, m), rep(0, n-m)), K)
X = matrix (rnorm(K*n*p, 0, 1), ncol = p) # pure noise
strata = sort(rep(1:K, n))

par(mfrow = c(1,2))
# fit the conditional logistic model
clObj = clogitL1(y, X, strata)
plot(clObj, logX=TRUE)

# cross validation
clcvObj = cv.clogitL1(clObj)
plot(clcvObj)
}
