Skip to content

Commit

Permalink
version 1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
Jairo Diaz authored and gaborcsardi committed Sep 20, 2015
0 parents commit 86c07ac
Show file tree
Hide file tree
Showing 19 changed files with 1,041 additions and 0 deletions.
14 changes: 14 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,14 @@
Package: qut
Type: Package
Title: Quantile Universal Threshold
Version: 1.0
Author: Jairo Diaz, Sylvain Sardy, Caroline Giacobino, Nick Hengartner.
Maintainer: Jairo Diaz <jairo.diaz@unige.ch>
Description: Selection of a threshold parameter for GLM-lasso to obtain a sparse model
with a good compromise between high true positive rate and low false discovery rate.
License: GPL-2
Depends: Matrix, glmnet, lars
NeedsCompilation: no
Packaged: 2015-09-20 07:54:10 UTC; Jairo
Repository: CRAN
Date/Publication: 2015-09-20 15:41:39
18 changes: 18 additions & 0 deletions MD5
@@ -0,0 +1,18 @@
796386f5b46abf28de97ed5e6bfa948a *DESCRIPTION
7e939e6767941731e8bc9e537d7ebf75 *NAMESPACE
b35d2c3fe626c987a1236bf500ff89a1 *R/lambdaqut.R
c5c0e9083a2d2266484c69cec2892cfe *R/predict.qut.R
60b8c469d26b0a0a513bad3854bf3aca *R/qut.R
07d830a8b07ef1d53e36e1e72b24b789 *R/sigmaqut.R
83c9097fad099cce1c20298145c00225 *R/sigmarcv.R
6366e619fc2df2a00b86d383beaf2eaf *data/datalist
fb4808ca9012f6abbb606d7d1b21a039 *data/leukemia.rda
3f8a1ef02e84a88f448620114f857d20 *data/riboflavin.rda
82c882af4ff01af36d89fc694e28510f *man/lambdaqut.Rd
ed49b029b582232aab02fc804364a362 *man/leukemia.Rd
2fc547bfe76a865d91d34c6954aff496 *man/predict.qut.Rd
b52ed035629cdab2584ed1bd53bd9541 *man/qut-package.Rd
7701f3ae6406752095a020ef17f50a08 *man/qut.Rd
e1e2b3e8a62002755cc157be3174b951 *man/riboflavin.Rd
5cb8388ddf7ecb6984c3ac7afb91319a *man/sigmaqut.Rd
504c41ff245b7e6c27cd3a1eb75b6a43 *man/sigmarcv.Rd
7 changes: 7 additions & 0 deletions NAMESPACE
@@ -0,0 +1,7 @@
export(qut,lambdaqut,sigmaqut,sigmarcv,predict.qut)
import(Matrix, glmnet, lars)
importFrom("stats", "coef", "gaussian", "glm", "glm.fit", "lm",
"median", "predict", "quantile", "rbinom", "rnorm", "rpois",
"var")

S3method(predict, qut)
164 changes: 164 additions & 0 deletions R/lambdaqut.R
@@ -0,0 +1,164 @@
lambdaqut <-
function(y,X,family=gaussian,alpha.level='default',M=1000,qut.standardize=TRUE,intercept=TRUE,no.penalty=NULL,offset=NULL,bootstrap=TRUE){
#FUNCTION TO OBTAIN THE QUANTILE UNIVERSAL THRESHOLD

#Hidden function for vectorized bootstrapping
qutbootstrap <- function(curz){
ind=sample(1:n,n,replace=TRUE)
curX=X[ind,]
curmuhat=curz[1:n]
curz=curz[-(1:n)]
if(!intercept&sum(O!=0)!=0&is.null(no.penalty)){
curmuhat=muhatZ[ind,1]
}
else if(!is.null(no.penalty)|sum(O!=0)!=0){
curbeta0=glm.fit(y=curz,x=as.matrix(A[ind,]),intercept=FALSE,family=family,offset=O[ind])$coef
curmuhat=family$linkinv(as.matrix(A[ind,])%*%curbeta0+O[ind])
}
curbp=abs(t(curX)%*%(curz-curmuhat))

}

#Hidden function for vectorized computation of the intercept for each of the Monte Carlo simulations of the NULL model
glm0=function(y,x,family,offset=offset){
out=glm.fit(y=y,x=x,intercept=FALSE,family=family,offset=offset)
return(out$coefficients)
}

family=family()
n=nrow(X)
p=ncol(X)
X0=X
if(is.null(offset)) O=rep(0,n)
else O=offset

#Check for warnings
if(alpha.level!='default'){
if(alpha.level>1|alpha.level<0){
warning("alpha.level is not in [0,1] interval; set to default")
alpha.level='default'
}
}
if(M<=0){
warning("M is <=0; set to 1000")
M=1000
}
if (is.null(p) | (p <= 1)) stop("X should be a matrix with 2 or more columns")
if (n!=length(y)) stop("Number of observations in y not equal to number of rows in X")
if(length(O)!=n) stop("length of offset is different to the number of observations")
if(family$family=='poisson'&(sum(y==0)==n)) stop("All your Poisson counts are zero")
if(family$family=='binomial'&((sum(y==0)==n)|(sum(y==1)==n))) stop("All your Binomial measures are zero or one")

#initialize A matrix
if(!intercept&is.null(no.penalty)) muhat=family$linkinv(O)
else{
A=c()
if(!is.null(no.penalty)){
A=as.matrix(X[,no.penalty]) #if there are more unpenalized coefficients
X=X[,-no.penalty]
}
if(intercept) A=cBind(rep(1,n),A) #if there is an intercept (column of ones)

#Estimate beta0 as glm(y~A)
beta0=glm.fit(y=y,x=as.matrix(A),intercept=FALSE,family=family,offset=offset)$coef
muhat=family$linkinv(as.matrix(A)%*%beta0+O)
}

#Set default alpha.level
if(alpha.level=='default') alpha.level=1/(sqrt(pi*log(p)))

#adjustment gamma to the quantile (existence of glm)
gamma=1
if(intercept&is.null(no.penalty)){
if(family$family=='poisson') gamma=1-exp(-n*muhat[1])
else if(family$family=='binomial') gamma=1-muhat[1]^n+(1-muhat[1])^n
}
else warning("Adjustment gamma for glm existence is not considered")

#Monte Carlo Simulation of the null model
if(family$family=='gaussian') z=matrix(rnorm(n*M,mean=as.vector(muhat),sd=1),n,M)
else if(family$family=='poisson'){
z=matrix(rpois(n*M,lambda=as.vector(muhat)),n,M)
z=z[,apply(z,2,sum)!=0]
}
else if(family$family=='binomial'){
z=matrix(rbinom(n*M,size=1,prob=as.vector(muhat)),n,M)
z=z[,(apply(z,2,sum)!=0)&(apply(z,2,sum)!=n)]
}
else stop("Not available family")

#obtain estimate of beta0 for each simulation
if(!intercept&is.null(no.penalty)){
muhatZ=family$linkinv(O)
muhatZ=matrix(rep(muhatZ,ncol(z)),nrow=n)
}
else{
beta0Z=apply(z,2,glm0,x=as.matrix(A),family=family,offset=offset)
muhatZ=family$linkinv(as.matrix(A)%*%beta0Z+O)
}

divX=rep(1,ncol(X))

if(!bootstrap){
#No Bootstrap Matrix X

#X'z
bp=abs(t(X)%*%(z-muhatZ))
scale.factor=rep(1,p)

#quantile-based standardization
if(qut.standardize){
divX=apply(abs(bp),1,quantile,prob=(1-alpha.level)/gamma)
divX[which(divX==0)]=1

#Alignment/scaling
X=t(t(X)/divX)

#X'z for the standardized X matrix
bp=abs(t(X)%*%(z-muhatZ))
}
}
else{
#Bootstrap Matrix X

#quantile-based standardization
if(qut.standardize){
bp=abs(t(X)%*%(z-muhatZ))
scale.factor=rep(1,p)
divX=apply(abs(bp),1,quantile,prob=(1-alpha.level)/gamma)
divX[which(divX==0)]=1

#Alignment/scaling
X=t(t(X)/divX)
}

#Bootstrapping
bp=apply(rbind(muhatZ,z),2,qutbootstrap)
resultsMC=apply(bp,2,max,na.rm=TRUE)
lambda=quantile(resultsMC, prob=(1-alpha.level)/gamma)
}

#Obtain distribution of Lambda=max(X'z)
resultsMC=apply(bp,2,max,na.rm=TRUE)

#obtain lambda.qut for the given quantile
lambda=quantile(resultsMC, prob=(1-alpha.level)/gamma)

lambdamax=max(abs(t(X)%*%(y-muhat)))

if(!is.null(no.penalty)){
X0[,-no.penalty]=X
scale.factor[-no.penalty]=divX
X=X0
}
else scale.factor=divX

#OUTPUT
out=NULL
out$scale.factor=scale.factor
out$lambda.max=lambdamax
out$lambda=lambda
out$Xnew=X
return(out)

}
24 changes: 24 additions & 0 deletions R/predict.qut.R
@@ -0,0 +1,24 @@
predict.qut <-
function(object, newx, mode='glm',offset=NULL,...){

n=nrow(newx)

#Check for warnings
if(is.null(ncol(newx))) stop("newx should be a matrix with 2 or more columns")
if(length(offset)!=n&!is.null(offset)) stop("length of offset is different to the number of observations")
if(ncol(newx)!=length(object$beta[-1])) stop("newx should be a matrix with the same number of columns as covariates in the fitted model")

if(mode=='lasso'){ #Predict with the lasso coefficients
if(class(object$fit)[1]=='lars') type='fit' else type='response'
newx=t(t(newx)/object$scale.factor)
out=predict(object$fit,newx=newx,type=type,s=object$lambda,mode='lambda',offset=offset)
if(class(object$fit)[1]=='lars') out=out$fit
}
else if(mode=='glm'){ #Predict with the glm fitted coefficients
f=object$family()
if(is.null(offset)) offset=0
out=f$linkinv(object$betaglm[1]+newx%*%object$betaglm[-1]+offset)
}

return(out)
}
162 changes: 162 additions & 0 deletions R/qut.R
@@ -0,0 +1,162 @@
qut <-
function(y,X,fit,family=gaussian,alpha.level='default',M=1000,qut.standardize=TRUE,intercept=TRUE,offset=NULL,bootstrap=TRUE,sigma='qut',estimator='unbiased',type=c('glmnet','lars'),lambda.seq=0,penalty.factor=rep(1,p),lambda.min.ratio=ifelse(n<p,0.01,0.0001),nlambda=100,lambda=NULL,...){
#FUNCTION TO FIT GLM WITH THE QUANTILE UNIVERSAL THRESHOLD

#Hidden function to get glm fit with non-zero coefficients found by LASSO-GLM
betaGLMestimates=function(){
if(type=='glmnet') betatemp=beta[-1] else betatemp=beta
betaglm=betatemp*0

if(is.null(offset)) O=rep(0,n)
if(sum(betatemp!=0)>0){
if(intercept) betaglm0=glm(y~X0[,betatemp!=0],family=family,offset=offset)$coef
else betaglm0=glm(y~X0[,betatemp!=0]-1,family=family,offset=offset)$coef
}
else{if(intercept) betaglm0=glm(y~1,family=family,offset=offset)$coef}
if(intercept){
betaglm[betatemp!=0]=betaglm0[-1]
betaglm=c(betaglm0[1],betaglm)
}
else{
betaglm[betatemp!=0]=betaglm0
betaglm=c(0,betaglm)
}

names(betaglm)[1]='(Intercept)'
names(betaglm)[-1]=1:p
return(betaglm)
}

#Initialize some basic variables
X0=X
n=nrow(X)
p=ncol(X)
f=family()
no.penalty=which(penalty.factor==0)
if(length(no.penalty)==0) no.penalty=NULL

#Check for warnings
type=match.arg(type)
if(type=='lars'&f$family!='gaussian'){
warning("type lars is just for gaussian family; set to glmnet")
type='glmnet'
}
if(type=='lars'&!is.null(no.penalty)){
warning("type lars does not allow no penalty subsets; set to glmnet")
type='glmnet'
}
if(type=='lars'&!is.null(offset)){
warning("type lars does not allow offset; set to glmnet")
type='glmnet'
}

#Obtain lambda.qut (Quantile Universal Threshold)
outqut=lambdaqut(y=y,X=X,alpha.level=alpha.level,M=M,qut.standardize=qut.standardize, family=family,intercept=intercept,no.penalty=no.penalty,offset=offset,bootstrap=bootstrap)
X=outqut$Xnew #Get standardized matrix
lambdaqut=outqut$lambda

#Estimate sigma for gaussian family (if not specified)
if(f$family=='gaussian'&!is.numeric(sigma)){
if(sigma=='qut') sigma=sigmaqut(y=y,X=X,intercept=intercept,estimator=estimator,alpha.level=alpha.level,M=M,qut.standardize=qut.standardize,offset=offset,penalty.factor=penalty.factor,...)
else{
warning("sigma rcv or cv does not take into account penalty factor")
if(sigma=='rcv') sigma = sigmarcv(X=X,y=y,cv=FALSE,intercept=intercept)$sigmahat
else if(sigma=='cv') sigma = sigmarcv(X=X,y=y,cv=TRUE,intercept=intercept)$sigmahat
else sigma = 1
}
}
else sigma=1

#Fit model
if(type=='glmnet'){ #GLMNET
lambdamax=outqut$lambda.max*sum(penalty.factor)/p
#set sequence of lambdas
if(lambda.seq==2){ #default glmnet values
if(missing(lambda.min.ratio)) lambda.min.ratio=ifelse(n<p,0.01,0.0001)
if(missing(nlambda)) nlambda=100
if(missing(lambda)) lambda=NULL
}
else{ #starting from lambdaqut to lambdamax
lambda.min.ratio=lambdaqut*sigma/lambdamax*0.9999
if(lambda.min.ratio>1) lambda.min.ratio=0.9999
if(lambda.seq==0) nlambda=100 #100 lambdas #DEFAULT
else if(lambda.seq==1) nlambda=n #n lambdas
lambda=exp(seq(log(lambdamax),log(lambdaqut*sigma),length=nlambda))/n #n lambdas in logscale
}
if(missing(penalty.factor)) penalty.factor=rep(1,p)
if(!missing(no.penalty)) penalty.factor[no.penalty]=0

if(missing(fit)) fit=glmnet(X,y,standardize=FALSE,intercept=intercept,family=f$family,penalty.factor=penalty.factor,offset=offset,nlambda=nlambda,lambda.min.ratio=lambda.min.ratio,...)

if(max(fit$lambda)==Inf) beta=rep(0,p+1)
else beta=coef(fit, s=lambdaqut*sigma/n*sum(penalty.factor)/p,offset=offset)/c(1,outqut$scale.factor)

lambdaqut=lambdaqut/n

#Get GLM fitted coefficients
betaglm=try(betaGLMestimates(),silent=TRUE)


}
else if(type=='lars'){ #LARS

#LASSO fit
lambdamax=outqut$lambda.max*n
if(missing(penalty.factor)) penalty.factor=rep(1,p)
X=X/penalty.factor

if(missing(fit)) fit=lars(X,y,intercept=intercept,normalize=FALSE,...)
beta=coef(fit,s=lambdaqut*sigma,mode='lambda')

if(intercept) beta0=mean(y)-mean(X%*%beta) else beta0=0
beta=beta/penalty.factor/outqut$scale.factor

#Least squares fit
betaglmestimatesLARS=function(){
betaglm=beta*0
if(sum(beta!=0)>0){
if(intercept) betaglm0=lm(y~X0[,beta!=0])$coef
else betaglm0=lm(y~X0[,beta!=0]-1)$coef
}
else{if(intercept) betaglm0=lm(y~1)$coef}

if(intercept){
betaglm[beta!=0]=betaglm0[-1]
betaglm=c(betaglm0[1],betaglm)
}
else{
betaglm[beta!=0]=betaglm0
betaglm=c(0,betaglm)
}
names(betaglm)[1]='(Intercept)'
names(betaglm)[-1]=1:p
return(betaglm)
}

#Get GLM fitted coefficients
betaglm=try(betaGLMestimates(),silent=TRUE)

beta=c(beta0,beta)

names(beta)[1]='(Intercept)'
names(beta)[-1]=1:p

}

#OUTPUT
out=NULL
out$lambda=lambdaqut
out$lambda.max=lambdamax/n
out$scale.factor=outqut$scale.factor
out$beta=beta
if(class(betaglm)[1]=="try-error") warning("No valid GLM fits were possible to find")
out$betaglm=betaglm
out$fit=fit
out$family=family
if(f$family=='gaussian') out$sigma=sigma
else out$sigma=NA
class(out)='qut'
return(out)
}


0 comments on commit 86c07ac

Please sign in to comment.