Commit d98aefa (initial commit): version 1.0
sreid11 authored and gaborcsardi committed May 6, 2013
Showing 20 changed files with 1,750 additions and 0 deletions.
19 changes: 19 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,19 @@
Package: clogitL1
Type: Package
Title: Fitting exact conditional logistic regression with lasso and
elastic net penalties.
Version: 1.0
Date: 2013-05-06
Author: Stephen Reid and Robert Tibshirani
Maintainer: Stephen Reid <sreid@stanford.edu>
Description: Tools for the fitting and cross validation of exact
conditional logistic regression models with lasso and elastic
net penalties. Uses cyclic coordinate descent and warm starts
to compute the entire path efficiently.
License: GPL-2
Depends: Rcpp (>= 0.10.2)
LinkingTo: Rcpp
Packaged: 2013-06-14 17:59:41 UTC; sreid
NeedsCompilation: yes
Repository: CRAN
Date/Publication: 2013-06-14 21:44:33
19 changes: 19 additions & 0 deletions MD5
@@ -0,0 +1,19 @@
569100fec08c627d6f416f8f4c361659 *DESCRIPTION
34b84ed8d271a69460e92038243d161c *NAMESPACE
2ca5f1f5572e3ed9698403e0e8ade222 *R/RcppExports.R
a899f7e98a4814fbd7aa18e4e069a653 *R/clogitL1.R
ceba51f55ce4cedfc205190aa360782f *R/cv.clogitL1.R
5985015687c608cb731bc7b0ff659b09 *R/plot.clogitL1.R
006f13e07d87b59e783b256189b03af9 *R/plot.cv.clogitL1.R
5f561e7fd0ee2f2972e0956dfff11a1e *R/print.clogitL1.R
906ac267ba416c09ad7a3e5be92f6922 *man/clogitL1-package.Rd
86b2d499b2a0c245528c9a0f17734342 *man/clogitL1.Rd
874de325a4b97dd43fb474300a7c910d *man/cv.clogitL1.Rd
a9c4ec4ed182d9aae156c75d6e2a92ea *man/plot.clogitL1.Rd
53fc20c60545dfa88ac1d6f2de3ba602 *man/plot.cv.clogitL1.Rd
118e2ab7918c9037931da76edb642bcb *man/print.clogitL1.Rd
b3f86c20b81b51f5162874977d38784d *src/Makevars
73505498722ad4b07b0ddb3067e92523 *src/Makevars.win
d1045b24fc4eb6ec76b847597b02ef8b *src/RcppExports.cpp
c1174da64ffbe4667392c902b69c4ce0 *src/clogisticl1_5.cpp
be43acb3bd583e108720f500a1c49a5f *src/clogisticl1_5cv.cpp
6 changes: 6 additions & 0 deletions NAMESPACE
@@ -0,0 +1,6 @@
useDynLib(clogitL1)
export(clogitL1)
export(print.clogitL1)
export(plot.clogitL1)
export(cv.clogitL1)
export(plot.cv.clogitL1)
11 changes: 11 additions & 0 deletions R/RcppExports.R
@@ -0,0 +1,11 @@
# This file was generated by Rcpp::compileAttributes
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

clogitl1_c <- function(n, m, Xmat, yvec, switchIter, numLambda, minLambda, alpha) {
.Call('clogitL1_clogitl1_c', PACKAGE = 'clogitL1', n, m, Xmat, yvec, switchIter, numLambda, minLambda, alpha)
}

clogitl1CV_c <- function(n, m, Xmat, yvec, lambdas, keepvec, alpha) {
.Call('clogitL1_clogitl1CV_c', PACKAGE = 'clogitL1', n, m, Xmat, yvec, lambdas, keepvec, alpha)
}

44 changes: 44 additions & 0 deletions R/clogitL1.R
@@ -0,0 +1,44 @@
clogitL1 = function(y, X, strata, numLambda = 100, minLambdaRatio = 0.000001, switch = floor(0.9*numLambda), alpha = 1){

# twiddle parameters for stability
minLambdaRatio = max(c(minLambdaRatio, 0.000001))
minLambdaRatio = min(c(minLambdaRatio, 1))

alpha = max(c(alpha, 0.000001))
alpha = min(c(alpha, 1))

# make sure inputs are appropriate
if (length(y) != length(strata)) stop("Please specify one stratum per observation")
if (length(y) != nrow(X)) stop("Please ensure that each observation has predictors and response")
if (any(y != 1 & y != 0)) stop("Response vector should contain 1 for cases and 0 for controls")

# first put y and X into form conducive to call to C function
# group strata together, putting cases first
yC = NULL
XC = NULL
nVec = NULL
mVec = NULL
for (i in unique(strata)){
currentX = X[strata==i, ]
currentY = y[strata==i]

yC = c(yC, currentY[currentY == 1], currentY[currentY==0])
XC = rbind(XC, currentX[currentY == 1, ], currentX[currentY == 0, ])
nVec = c(nVec, length(currentY))
mVec = c(mVec, sum(currentY))
}
XC = as.matrix(XC)

# fit the model - clogitL1_c returns a matrix
# first ncol(x) columns: beta estimates
# column ncol(X)+1: lambda values, then non zero beta and strong set beta
esti.beta = clogitl1_c (nVec, mVec, XC, yC, switch, numLambda, minLambdaRatio, alpha=alpha) # use the reordered yC, XC built above (strata grouped, cases first)

out.beta = esti.beta[,1:ncol(X)] # maybe we stopped before we got to numLambda iterations
empty.beta = apply (esti.beta, 1, function(x){all(x==0)})
out.beta = out.beta[!empty.beta,]

out = list(beta=out.beta, lambda=esti.beta[!empty.beta,ncol(X)+1], nz_beta=esti.beta[!empty.beta,ncol(X)+2], ss_beta=esti.beta[!empty.beta,ncol(X)+3], dev_perc=esti.beta[!empty.beta, ncol(X)+5], y_c=yC, X_c=XC, nVec=nVec, mVec=mVec, alpha=alpha)
class(out) = "clogitL1"
out
}
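
A minimal standalone sketch of the reordering performed above, using toy values (not taken from the package) to show how each stratum is grouped with its cases first:

# toy data: 2 strata of 3 observations, 1 case per stratum (illustrative values)
y      = c(0, 1, 0,  0, 0, 1)
strata = c(1, 1, 1,  2, 2, 2)
X      = matrix(seq_len(12), ncol = 2)            # 6 observations, 2 predictors

yC = NULL; XC = NULL; nVec = NULL; mVec = NULL
for (i in unique(strata)){
	cs = strata == i
	yC   = c(yC, y[cs][y[cs] == 1], y[cs][y[cs] == 0])   # cases first, then controls
	XC   = rbind(XC, X[cs, , drop = FALSE][y[cs] == 1, , drop = FALSE],
	                 X[cs, , drop = FALSE][y[cs] == 0, , drop = FALSE])
	nVec = c(nVec, sum(cs))                              # stratum size
	mVec = c(mVec, sum(y[cs]))                           # cases in the stratum
}
yC     # 1 0 0 1 0 0 -- each stratum's case now leads its block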
42 changes: 42 additions & 0 deletions R/cv.clogitL1.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
cv.clogitL1 = function(clObj, numFolds=10){
# Function that accepts a object of type "clogitL1" and returns an object
# of type "cv.clogitL1" containing relevant CV information

if (!inherits(clObj, "clogitL1")){
stop("requires object of type clogitL1")
}

# assign folds
K = length(clObj$nVec) # number of strata
nFolds = min(c(K, numFolds))

perFold = floor(K/nFolds)
extra = K %% nFolds
foldVec = NULL
for (i in 1:nFolds){
foldVec = c(foldVec, rep(i, perFold))
if (i <= extra) foldVec = c(foldVec, i) # another one to make up extra
}
foldVec = sample (foldVec, K, replace = F) # jumble around the folds

# now run the CV version of clogitL1
cv.mat = NULL
for (i in 1:nFolds){
keepvec = ifelse (foldVec == i, 0, 1)
esti.beta.cv = clogitl1CV_c (clObj$nVec, clObj$mVec, clObj$X_c, clObj$y_c, clObj$lambda, keepvec, clObj$alpha)
cv.mat = cbind(cv.mat, esti.beta.cv[,ncol(clObj$X_c) + 6])
}

# find minimum and 1-se rule
mean.cv.err = -apply (cv.mat, 1, mean)[-1] # cv error at each lambda
sd.cv.err = apply (cv.mat, 1, function(x){sqrt(var(x))})[-1]/sqrt(nFolds) # estimate of the standard error of the mean cv error
plot.lambdas = log(clObj$lambda)[-1]

min.lambda = plot.lambdas[which.min(mean.cv.err)] # lambda of minimum cv error
min.lambda.1sd = max(plot.lambdas[mean.cv.err-sd.cv.err <= min(mean.cv.err)])

out = list(cv_dev=cv.mat, lambda=plot.lambdas, folds=foldVec, mean_cv=mean.cv.err, se_cv=sd.cv.err, minCV_lambda=min.lambda, minCV1se_lambda=min.lambda.1sd, nz_beta=clObj$nz_beta[-1])
class(out) = "cv.clogitL1"
out
}
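
Run in isolation, the fold-assignment step above spreads strata as evenly as possible across folds; a small sketch with illustrative numbers:

set.seed(1)
K       = 13                     # number of strata (illustrative)
nFolds  = 5
perFold = floor(K/nFolds)        # 2 strata in every fold
extra   = K %% nFolds            # 3 folds receive one extra stratum
foldVec = NULL
for (i in 1:nFolds){
	foldVec = c(foldVec, rep(i, perFold))
	if (i <= extra) foldVec = c(foldVec, i)
}
foldVec = sample(foldVec, K, replace = FALSE)    # random fold label per stratum
table(foldVec)                   # fold sizes 3, 3, 3, 2, 2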

8 changes: 8 additions & 0 deletions R/plot.clogitL1.R
@@ -0,0 +1,8 @@
plot.clogitL1 = function(x, logX=T, ...){
# Plots the parameter profiles against (log) regularisation parameter
# If 'logX' is TRUE, then we plot it against the log regularisation parameter

if (logX) horiz = log(x$lambda)
else horiz = x$lambda
matplot (x=horiz, y=x$beta, type = "l", xlab = "Regularisation parameter", ylab = "Parameter estimate", ...)
}
14 changes: 14 additions & 0 deletions R/plot.cv.clogitL1.R
@@ -0,0 +1,14 @@
plot.cv.clogitL1 = function(x, ...){
# Function for plotting a cv.clogitL1 object - plots the mean cv deviance and standard error bars

plot (x=c(x$lambda, x$lambda), y = c(x$mean_cv + x$se_cv, x$mean_cv - x$se_cv), type = "n", xlab = "log(lambda)", ylab = "Conditional likelihood deviance", ...)
points (x=x$lambda, y=x$mean_cv, col = "red") # mean cv deviance
axis(side=3, at = x$lambda, labels=x$nz_beta, tick=FALSE) # number of non-zero predictors on top axis
for (i in 1:length(x$lambda)){ # standard error bounds
lines (x = rep(x$lambda[i], 2), y = x$mean_cv[i] + c(x$se_cv[i], -x$se_cv[i]), col = "gray") # vertical lines
lines (x = x$lambda[i] + c(-0.03, 0.03), y = rep (x$mean_cv[i] + x$se_cv[i], 2), col = "gray") # top horizontal line
lines (x = x$lambda[i] + c(-0.03, 0.03), y = rep (x$mean_cv[i] - x$se_cv[i], 2), col = "gray") # bottom horizontal line
}
abline (v = x$minCV_lambda, lty = 2) # minimum CV lambda
abline (v = x$minCV1se_lambda, lty=2) # 1SE-rule lambda
}
8 changes: 8 additions & 0 deletions R/print.clogitL1.R
@@ -0,0 +1,8 @@
print.clogitL1 = function (x, digits = 6, ...){
# prints a summary of the regularisation path of a clogitL1 object
# output:
# - 3 column matrix containing: non-zero beta, %deviance explained, lambda

out = data.frame(Df=x$nz_beta, DevPerc=round(x$dev_perc, digits), Lambda=round(x$lambda, digits))
print(out, ...)
}
32 changes: 32 additions & 0 deletions man/clogitL1-package.Rd
@@ -0,0 +1,32 @@
\name{clogitL1-package}
\alias{clogitL1-package}
\docType{package}
\title{
Penalised conditional logistic regression.
}
\description{
Tools for the fitting and cross validation of exact conditional logistic regression models with lasso and elastic net penalties. Uses cyclic coordinate descent and warm starts to compute the entire path efficiently.
}
\details{
\tabular{ll}{
Package: \tab clogitL1\cr
Type: \tab Package\cr
Version: \tab 1.0\cr
Date: \tab 2013-05-06\cr
License: \tab GPL-2\cr
}
The package is simple to use. The main fitting function \code{clogitL1} accepts a response vector y, a predictor matrix X and a strata vector indicating stratum membership, and fits the exact conditional logistic regression model over a grid of regularisation parameters.
Only 5 functions are exported:
\itemize{
\item \code{clogitL1}
\item \code{cv.clogitL1}
\item \code{plot.clogitL1}
\item \code{plot.cv.clogitL1}
\item \code{print.clogitL1}
}
}
\author{
Stephen Reid and Rob Tibshirani

Maintainer: Stephen Reid <sreid@stanford.edu>
}
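
A minimal end-to-end sketch of the workflow described in this help page, mirroring the examples shipped in man/clogitL1.Rd (it assumes the package is installed):

library(clogitL1)
set.seed(142)
K = 10; n = 5; m = 2; p = 20                    # strata, stratum size, cases per stratum, predictors

y      = rep(c(rep(1, m), rep(0, n - m)), K)    # 1 = case, 0 = control
X      = matrix(rnorm(K*n*p), ncol = p)         # pure-noise predictors
strata = rep(1:K, each = n)

clObj = clogitL1(y, X, strata)                  # full regularisation path
print(clObj)                                    # Df, %deviance explained, lambda at each step
clcvObj = cv.clogitL1(clObj)                    # cross validation over strata
plot(clcvObj)                                   # CV deviance with minimum and 1-SE lambdas marked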
70 changes: 70 additions & 0 deletions man/clogitL1.Rd
@@ -0,0 +1,70 @@
\name{clogitL1}
\alias{clogitL1}
\title{Conditional logistic regression with elastic net penalties}
\description{
Fit a sequence of conditional logistic regression models with lasso or elastic net penalties
}

\usage{
clogitL1 (y, X, strata, numLambda=100,
minLambdaRatio=0.000001, switch=floor(0.9*numLambda), alpha = 1)
}

\arguments{
\item{y}{a vector of binary responses with 1 for cases and 0 for controls.}
\item{X}{a matrix with the same number of rows as the length of y and p columns. Each row contains the p-vector of regressor values for one observation.}
\item{strata}{vector with stratum membership of each observation.}
\item{numLambda}{number of different values of the regularisation parameter \eqn{\lambda} at which to compute parameter estimates. The first fit is made at the value just below the smallest regularisation parameter value at which all parameter estimates are 0; the last fit is made at this value multiplied by \code{minLambdaRatio}.}
\item{minLambdaRatio}{ratio of the smallest to the largest value of the regularisation parameter \eqn{\lambda} at which we find parameter estimates.}
\item{switch}{index (between 0 and \code{numLambda}) at which we transition from linear to logarithmic jumps.}
\item{alpha}{parameter controlling the trade-off between lasso and ridge penalties. At value 1, we have a pure lasso penalty; at 0, a pure ridge penalty. Intermediate values provide a mixture of the two.}
}

\details{
The sequence of models implied by \code{numLambda} and \code{minLambdaRatio} is fit by coordinate descent with warm starts and sequential strong rules. If \code{alpha=1}, we fit using a lasso penalty. Otherwise we fit with an elastic net penalty. Note that a pure ridge penalty is never obtained, because the function sets a floor for \code{alpha} at 0.000001; this improves the stability of the algorithm. A similar lower bound is set for \code{minLambdaRatio}. The sequence of models can be truncated at fewer than \code{numLambda} models if a very large proportion of the training-set deviance is already explained by the fitted model.

}
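
For reference, the elastic net penalty that \code{alpha} interpolates is conventionally written as below (the exact internal scaling is not stated in this help file, so this is the standard glmnet-style form rather than a quotation from the package):

	\lambda \left[ \alpha \|\beta\|_1 + \frac{1 - \alpha}{2} \|\beta\|_2^2 \right]

with \alpha = 1 giving the pure lasso penalty and \alpha = 0 the pure ridge penalty.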

\value{
An object of type \code{clogitL1} with the following fields:

\item{beta}{a (\code{numLambda} + 1)-by-p matrix of estimated coefficients. First row has all 0s}
\item{lambda}{a vector of length \code{numLambda} + 1 containing the value of the regularisation parameter at which we obtained the fits.}
\item{nz_beta}{a vector of length \code{numLambda} + 1 containing the number of nonzero parameter estimates for the fit at the corresponding regularisation parameter.}
\item{ss_beta}{a vector of length \code{numLambda} + 1 containing the number of predictors considered by the sequential strong rule at that iteration.}
\item{dev_perc}{a vector of length \code{numLambda} + 1 containing the percentage of null deviance explained by the model represented by that row in the matrix.}
\item{y_c}{reordered vector of responses. Grouped by stratum with cases coming first.}
\item{X_c}{reordered matrix of predictors. See above.}
\item{strata_c}{reordered stratum vector. See above.}
\item{nVec}{vector of length the number of unique strata in \code{strata} containing the number of observations encountered in each stratum.}
\item{mVec}{vector containing the number of cases in each stratum.}
\item{alpha}{penalty trade off parameter.}
}

\seealso{
\code{\link{plot.clogitL1}}
}

\examples{

set.seed(142)
# data parameters
K = 10 # number of strata
n = 5 # number in strata
m = 2 # cases per stratum
p = 20 # predictors

# generate data
y = rep(c(rep(1, m), rep(0, n-m)), K)
X = matrix (rnorm(K*n*p, 0, 1), ncol = p) # pure noise
strata = sort(rep(1:K, n))

par(mfrow = c(1,2))
# fit the conditional logistic model
clObj = clogitL1(y, X, strata)
plot(clObj, logX=TRUE)

# cross validation
clcvObj = cv.clogitL1(clObj)
plot(clcvObj)
}
59 changes: 59 additions & 0 deletions man/cv.clogitL1.Rd
@@ -0,0 +1,59 @@
\name{cv.clogitL1}
\alias{cv.clogitL1}
\title{Cross validation of conditional logistic regression with elastic net penalties}
\description{
Find the best of a sequence of conditional logistic regression models with lasso or elastic net penalties using cross validation
}

\usage{
cv.clogitL1 (clObj, numFolds=10)
}

\arguments{
\item{clObj}{an object of type \code{clogitL1} on which we do cross validation.}
\item{numFolds}{the number of folds used in cross validation. Defaults to 10, capped at the number of strata.}
}

\details{
Performs \code{numFolds}-fold cross validation on an object of type \code{clogitL1}. Using the sequence of regularisation parameters generated by \code{clObj}, the function randomly assigns whole \emph{strata} to folds. The penalised conditional logistic regression model is fit to the non-left-out strata in turn, and its deviance is compared to an out-of-sample deviance computed on the left-out strata. Fitting to the non-left-out strata uses the same cyclic coordinate descent algorithm (with warm starts and strong rules) as \code{clogitL1}, only with a prespecified sequence of \eqn{\lambda}.
}

\value{
An object of type \code{cv.clogitL1} with the following fields:

\item{cv_dev}{matrix of size \code{numLambda}-by-\code{numFolds} containing the CV deviance in each fold for each value of the regularisation parameter.}
\item{lambda}{vector of regularisation parameters.}
\item{folds}{vector showing the folds membership of each observation.}
\item{mean_cv}{vector containing mean CV deviances for each value of the regularisation parameter.}
\item{se_cv}{vector containing an estimate of the standard error of the CV deviance at each value of the regularisation parameter.}
\item{minCV_lambda}{value of the regularisation parameter at which we have minimum \code{mean_cv}}
\item{minCV1se_lambda}{value of the regularisation parameter selected by the 1-SE rule: the simplest model whose estimated CV deviance is within one standard error of the minimum.}
\item{nz_beta}{number of nonzero parameter estimates at each value of the regularisation parameter.}
}

\seealso{
\code{\link{clogitL1}}, \code{\link{plot.cv.clogitL1}}
}
\examples{
set.seed(142)

# data parameters
K = 10 # number of strata
n = 5 # number in strata
m = 2 # cases per stratum
p = 20 # predictors

# generate data
y = rep(c(rep(1, m), rep(0, n-m)), K)
X = matrix (rnorm(K*n*p, 0, 1), ncol = p) # pure noise
strata = sort(rep(1:K, n))

par(mfrow = c(1,2))
# fit the conditional logistic model
clObj = clogitL1(y, X, strata)
plot(clObj, logX=TRUE)

# cross validation
clcvObj = cv.clogitL1(clObj)
plot(clcvObj)
}
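
The 1-SE rule behind \code{minCV1se_lambda}, as implemented in R/cv.clogitL1.R above, selects

	\lambda_{1se} = \max\{\lambda : \mathrm{CV}(\lambda) - \mathrm{SE}(\lambda) \le \min_{\lambda'} \mathrm{CV}(\lambda')\}

i.e. the largest (most heavily penalised, hence simplest) \lambda whose mean CV deviance, less its standard error, does not exceed the minimum mean CV deviance over the path.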
44 changes: 44 additions & 0 deletions man/plot.clogitL1.Rd
@@ -0,0 +1,44 @@
\name{plot.clogitL1}
\alias{plot.clogitL1}
\title{Plotting after fitting conditional logistic regression with elastic net penalties}
\description{
Takes a \code{clogitL1} object and plots the parameter profile associated with it.
}

\usage{
\method{plot}{clogitL1} (x, logX = T, ...)
}

\arguments{
\item{x}{an object of type \code{clogitL1}.}
\item{logX}{should the horizontal axis be on log scale?}
\item{...}{additional arguments to \code{plot} function}
}

\seealso{
\code{\link{clogitL1}}
}

\examples{
set.seed(142)

# data parameters
K = 10 # number of strata
n = 5 # number in strata
m = 2 # cases per stratum
p = 20 # predictors

# generate data
y = rep(c(rep(1, m), rep(0, n-m)), K)
X = matrix (rnorm(K*n*p, 0, 1), ncol = p) # pure noise
strata = sort(rep(1:K, n))

par(mfrow = c(1,2))
# fit the conditional logistic model
clObj = clogitL1(y, X, strata)
plot(clObj, logX=TRUE)

# cross validation
clcvObj = cv.clogitL1(clObj)
plot(clcvObj)
}
