Skip to content

Commit

Permalink
version 1.0.0
Browse files Browse the repository at this point in the history
  • Loading branch information
tmaryhuard authored and cran-robot committed May 7, 2021
0 parents commit 07e0335
Show file tree
Hide file tree
Showing 14 changed files with 610 additions and 0 deletions.
33 changes: 33 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
Package: qch
Title: Query Composed Hypotheses
Version: 1.0.0
Authors@R:
person(given = "Tristan",
family = "Mary-Huard",
role = c("aut", "cre"),
email = "tristan.mary-huard@agroparistech.fr",
comment = c(ORCID = "0000-0002-3839-9067"))
Description: Provides functions for the joint analysis of K sets
of p-values obtained for a same list of items. This joint analysis is
performed by querying a composed hypothesis, i.e. an arbitrary complex
combination of simple hypotheses, as described in Mary-Huard et al.
(2021) <arXiv:2104.14601>. The null distribution corresponding to the
composed hypothesis of interest is obtained by fitting non-parametric
mixtures models (one for each of the simple hypothesis of the complex
combination). Type I error rate control is achieved through Bayesian
False Discovery Rate control. The 3 main functions of the package
GetHinfo(), qch.fit() and qch.test() correspond to the 3 steps for
querying a composed hypothesis (composed H0/H1 formulation, inferring
the null distribution and testing the null hypothesis).
License: GPL-3
Depends: R (>= 2.10)
Imports: graphics, ks, mclust, stats
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.1.1
NeedsCompilation: no
Packaged: 2021-05-06 13:12:40 UTC; dsi
Author: Tristan Mary-Huard [aut, cre] (<https://orcid.org/0000-0002-3839-9067>)
Maintainer: Tristan Mary-Huard <tristan.mary-huard@agroparistech.fr>
Repository: CRAN
Date/Publication: 2021-05-07 12:10:02 UTC
13 changes: 13 additions & 0 deletions MD5
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
eb74fd3746987b7711e8df9fae579209 *DESCRIPTION
ba32463f4647e688dce1c43b869478bb *NAMESPACE
1c242249e24450adfeb40c576d75b6b2 *R/Data.R
d070e8e6490908159614a1268da8a11f *R/QCH_functions.R
e0ef681901465d24f47f9139cb2c086a *R/qch-package.R
aacbfe98ddfb739904382b9ad8e98aba *data/PvalSets.rda
eac1213d953962608dd8a2bde7b23de5 *man/FastKerFdr.Rd
6b4cfbe8331db12171abd96a6a00a4c2 *man/GetHinfo.Rd
c7fa58a9ea95bd1e2929f3d0826b0b49 *man/GetHinfoEqual.Rd
ecc1afc03d4c3aa8ff6541d20c35314f *man/PvalSets.Rd
57d5aeceb926b756bf1fc00d86e45f03 *man/qch-package.Rd
97de6d91273ddaab3d3a171626c2fc58 *man/qch.fit.Rd
625b99d1355f866102a481f4c944f26f *man/qch.test.Rd
10 changes: 10 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Generated by roxygen2: do not edit by hand

export(GetHinfo)
export(GetHinfoEqual)
export(qch.fit)
export(qch.test)
import(graphics)
import(ks)
import(mclust)
import(stats)
13 changes: 13 additions & 0 deletions R/Data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@

#' Synthetic example to illustrate the main qch functions
#'
#' PvalSets is a data.frame with 10,000 rows and 3 columns. Each row corresponds to an item,
#' columns 'Pval1' and 'Pval2' each correspond to a test serie over the items, and column 'Class'
#' provides the truth, i.e. if item \eqn{i} belongs to class 1 then the H0 hypothesis is true for the 2 tests,
#' if item \eqn{i} belongs to class 2 (resp. 3) then the H0 hypothesis is true for the first (resp. second)
#' test only, and if item \eqn{i} belongs to class 4 then both H0 hypotheses are false (for the first
#' and the second test).
#'
#' @format A data.frame
#'
"PvalSets"
319 changes: 319 additions & 0 deletions R/QCH_functions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,319 @@
###################################################################
#' Generate H0/H1 configurations and specify the ones corresponding to the composed H1
#'
#' @param Q number of test series to be combined
#' @param AtLeast How many H1 hypotheses at least for the item to be of interest ?
#' @param Consecutive Should the significant test series be consecutive ? Default=FALSE
#'
#' @export
#' @return A list of two objects 'Hconfig' and 'Hconfig.H1'.
#' Hconfig is the list of all possible combination of H0 and H1 hypotheses among Q hypotheses tested.
#' Hconfig.H1 is the vector of components of Hconfig that correspond to the 'AtLeast' specification.
#'
#' @examples
#' GetHinfo(4,2)
#'
#' @seealso [GetHinfoEqual()]
GetHinfo <- function(Q,AtLeast,Consecutive=FALSE){

## Build H configurations
Hconfig <- as.matrix(expand.grid(lapply(1:Q, function(q) 0:1)))
Hconfig <- split(Hconfig, seq(2^Q))

## Find the ones that match H1
if (!Consecutive){
MatchingH1 <- sapply(Hconfig, function(h){sum(h)>=AtLeast})
Hconfig.H1 <- which(MatchingH1)
names(Hconfig.H1) <- NULL
} else {
Consec <- paste(rep(1,AtLeast),collapse='')
Hconcat <- sapply(Hconfig, function(hh){paste(hh,collapse='')})
Hconfig.H1 <- grep(pattern = Consec,x = Hconcat)
}

## Collect results
return(list(Hconfig=Hconfig,Hconfig.H1=Hconfig.H1))

}


###################################################################
#' Generate H0/H1 configurations and specify the ones corresponding to the composed H1
#'
#' @param Q number of test series to be combined
#' @param Equal How many H1 hypotheses exactly for the item to be of interest ?
#' @param Consecutive Should the significant test series be consecutive ? Default=FALSE
#'
#' @export
#' @return A list of two objects 'Hconfig' and 'Hconfig.H1'.
#' Hconfig is the list of all possible combination of H0 and H1 hypotheses among Q hypotheses tested.
#' Hconfig.H1 is the vector of components of Hconfig that correspond to the 'Equal' specification.
#'
#' @examples
#' GetHinfoEqual(4,2)
#' @seealso [GetHinfo()]
GetHinfoEqual <- function(Q,Equal,Consecutive=FALSE){

## Build H configurations
Hconfig <- as.matrix(expand.grid(lapply(1:Q, function(q) 0:1)))
Hconfig <- split(Hconfig, seq(2^Q))

## Find the ones that match H1
if (!Consecutive){
MatchingH1 <- sapply(Hconfig, function(h){sum(h)==Equal})
Hconfig.H1 <- which(MatchingH1)
names(Hconfig.H1) <- NULL
} else {
Consec <- paste(rep(1,Equal),collapse='')
ConsecP1 <- paste(rep(1,Equal+1),collapse='')
Hconcat <- sapply(Hconfig, function(hh){paste(hh,collapse='')})
Hconfig.H1 <- intersect(grep(pattern = Consec,x = Hconcat),which(sapply(Hconfig, function(h){sum(h)==Equal})))
}

## Collect results
return(list(Hconfig=Hconfig,Hconfig.H1=Hconfig.H1))

}



###################################################################
#' FastKerFdr
#'
#' @param Pval a vector of p-values (corresponding to a p-value serie)
#' @param p0 a priori proportion of H0 hypotheses
#' @param plotting boolean, should some diagnostic graphs be plotted. Default is FALSE.
#' @param NbKnot The (maximum) number of knot for the kde procedure. Default is 1e5
#' @param tol a tolerance value for convergence. Default is 1e-5
#' @import ks mclust graphics stats
#'
#' @return A list of 3 objects. Object p0 is an estimate of the proportion of H0 hypotheses.,
#' tau is the vector of H1 posteriors.
#' f1 is a numeric vector, each coordinate i corresponding to the evaluation of the H1 density at point pi, where pi is the ith p-value in Pval.
FastKerFdr <- function(Pval,p0=NULL,plotting=FALSE,NbKnot=1e5,tol = 1e-5){

## Transform pvalues into N(0,1) quantiles
n = length(Pval)
X = -qnorm(Pval)

## Get a p0 estimate
if(is.null(p0)){
p0 = min(2*sum(X < 0)/n,1-1/n);
}
p1 = 1 - p0

## Knots, counts and initialization (using Mclust)
if(length(X)>NbKnot){
Hist = hist(X, breaks=NbKnot, plot=FALSE)
Knots = Hist$mids; ActualNbKnot = length(Knots); Counts = Hist$counts;
Xsample = sample(X, NbKnot)
GM = mclust::Mclust(Xsample, G=3, modelNames='E');
mu = max(GM$parameters$mean)
} else {
Knots = X; ActualNbKnot = length(X); Counts = rep(1,ActualNbKnot);
GM = mclust::Mclust(X, G=3, modelNames='E');
mu = max(GM$parameters$mean)
}
if (plotting){
Order <- order(Knots)
Knots <- Knots[Order]
Counts <- Counts[Order]
}

## Initialize the taus using GM
phi = dnorm(Knots); f1 = dnorm(Knots, mean=mu, sd=1)
tau = p1*f1/(p0*phi + p1*f1)

## Get the weighted kernel density estimate
diff = 2*tol; iter = 0
while(diff > tol){
iter = iter + 1
weights = tau*Counts; weights = ActualNbKnot * weights / sum(weights)
f1 = ks::kde(x=Knots, w=weights, eval.points=Knots)$estimate
tauNew = p1*f1/(p0*phi + p1*f1)
## Dirty job 1: get rid of the f1 mass on the left
tauNew[Knots< -3] <- 0
diff = max(abs(tau - tauNew))
tau = tauNew
}
if(plotting){
Hist.fig <- hist(X, freq=TRUE, breaks=sqrt(n), main='', border=8,
xlab="Q-transformed pvalues", ylab="Densities")
bin.width <- mean(diff(Hist.fig$breaks))
lines(Knots, n*bin.width*p0*phi, type='l', col=4, lwd=2);
lines(Knots, n*bin.width*p1*f1, col=2,lwd=2);
lines(Knots, n*bin.width*(p0*phi+p1*f1), lwd=2)
legend("topright", legend=c("H0 dist", "H1 dist","Mixture Dist"),
col=c("blue","red", "black"), lty=c(1,1,2), cex=0.8)
}

## Now get the f1 estimate
KDE = ks::kde(x=Knots, w=weights, eval.points=X)
f1 = KDE$estimate

## Dirty job 2: get rid of numeric problems
f1[f1<0] <- 1e-30
tau = p1*f1 / (p1*f1 + p0*dnorm(X))

return(list(p0=p0,tau=tau,f1=f1))
}


###################################################################
#' Infer Hconfig posteriors
#'
#' @param pValMat a matrix of p-values, each column corresponding to a p-value serie.
#' @param Hconfig an Hconfig list as generated by the [GetHinfo()] function.
#' @param plotting a boolean. Should some diagnostic graphs be plotted ? Default is FALSE.
#' @import stats
#'
#' @return A list of 2 objects 'prior' and 'posterior'.
#' Object 'prior' is a vector of estimated prior probabilities for each of the H-configurations.
#' Object 'posterior' is a matrix providing for each item (in row) its posterior probability to belong to each of the H-configurations (in columns).
#' @export
#'
#' @examples
#' data(PvalSets)
#' PvalMat <- as.matrix(PvalSets[,-3])
#' ## Build the Hconfig objects
#' Q <- 2
#' AtLeast <- 2
#' Hconfig <- GetHinfo(Q,AtLeast)$Hconfig
#'
#' ## Run the function
#' res.fit <- qch.fit(PvalMat,Hconfig)
#'
#' ## Display the prior of each class of items
#' res.fit$prior
#'
#' ## Display the first posteriors
#' head(res.fit$posterior)
qch.fit <- function(pValMat,Hconfig, plotting=FALSE){

n <- nrow(pValMat)
Q <- ncol(pValMat)

#### Step 1: Marginal density estimation

## Get p0 estimates
p0 <- rep(0, Q)
for (q in 1:Q){
p0[q] = min(2*sum(pValMat[,q] > 0.5)/n,1-1/n);
}
SomeH1 <- which(p0<1)
NoH1 <- which(p0==1-1/n);
if(length(NoH1)==1){
message(paste("Pvalue serie",NoH1, "may have very few H1 (or a weird distribution)"))
}
if(length(NoH1)>1){
message(paste("Pvalue series",paste(NoH1,collapse=' '), "may have very few H1 (or a weird distribution)"))
}

## Fit a 2-component mixture to each test serie using kerFdr
f1Mat <- matrix(1, n, Q);
for(q in SomeH1){
ker <- FastKerFdr(pValMat[, q], p0=p0[q], plotting=FALSE)
f1Mat[,q] <- ker$f1
}
f0Mat <- matrix(dnorm(-qnorm(pValMat)),ncol=Q)

#### Step 2: transform marginal densities into config densities

Logf0Mat <- log(f0Mat);
Logf1Mat <- log(f1Mat);
f.Hconfig <- sapply(Hconfig, function(h){
f <- rep(0,nrow(Logf0Mat))
if (length(which(h==1)) > 0){f <- f + rowSums(Logf1Mat[, which(h==1), drop=FALSE])}
if (length(which(h==0)) > 0){f <- f + rowSums(Logf0Mat[, which(h==0), drop=FALSE])}
return(exp(f))
})

#### Step 3: Infer prior estimation using an EM procedure

## Initialization: simple product of marginal priors estimator
NewPrior <- sapply(1:length(Hconfig), function(c){
prod(p0[which(Hconfig[[c]]==0)]) * prod(1-p0[which(Hconfig[[c]]==1)])
})
PriorsAt0 <- which(NewPrior==0)

## EM calibration
NotOK <- TRUE
Precision <- 1e-6
NoLowerThan <- 1e-7
while(NotOK){

## E step
Tau <- f.Hconfig*(tcrossprod(rep(1:n),NewPrior))
Tau <- Tau/rowSums(Tau)

## M step
OldPrior <- NewPrior
NewPrior <- colMeans(Tau)
if(length(PriorsAt0)==0){
NewPrior[NewPrior<NoLowerThan] <- NoLowerThan
} else {
NoLowerCoord <- setdiff(which(NewPrior<NoLowerThan),PriorsAt0)
if(length(NoLowerCoord)>0){
NewPrior[NoLowerCoord] <- NoLowerThan
}
}
NewPrior <- NewPrior/sum(NewPrior)
NotOK <- max((OldPrior-NewPrior)^2) > Precision

}
priorHconfigEM <- NewPrior

#### Step 4: Posterior computation
posterior <- f.Hconfig*(tcrossprod(rep(1:n),priorHconfigEM))
posterior <- posterior/rowSums(posterior)

#### Last but not least: output results
Res <- list(prior=priorHconfigEM, posterior=posterior)
return(Res)
}


###################################################################
#' Perform composed hypothesis testing with FDR control
#'
#' @param posterior a matrix of posterior probabilities for each item to belong the different H-configurations, as provided by the [qch.fit()] function.
#' @param Hconfig.H1 a list of H1 config, as created by the [GetHinfo()] function.
#' @param Alpha the nominal Type I error rate for FDR control.
#'
#' @return A list of 2 objects 'Rejection' and 'lFDR'.
#' Object 'Rejection' is a vector providing for each item the result of the composed hypothesis test, after multiple testing correction.
#' Object 'lFDR' is a vector providing for each item its local FDR estimate.
#' @export
#'
#' @examples
#' data(PvalSets)
#' PvalMat <- as.matrix(PvalSets[,-3])
#' Truth <- PvalSets[,3]
#'
#' ## Build the Hconfig objects
#' Q <- 2
#' AtLeast <- 2
#' Hconfig <- GetHinfo(Q,AtLeast)$Hconfig
#' Hconfig.H1 <- GetHinfo(Q,AtLeast)$Hconfig.H1
#'
#' ## Infer the posteriors
#' res.fit <- qch.fit(PvalMat,Hconfig)
#'
#' ## Run the test procedure with FDR control
#' res.test <- qch.test(res.fit$posterior,Hconfig.H1)
#' table(res.test$Rejection,Truth==4)
qch.test <- function(posterior,Hconfig.H1,Alpha=0.05){

n <- nrow(posterior)
localFDR <- 1-rowSums(posterior[,Hconfig.H1,drop=FALSE])
Order <- order(localFDR)
FDR <- cumsum(localFDR[Order])/(1:n)
NbReject <- max(which(FDR<=Alpha))
Rejection <- rep(0,n)
if (NbReject>0){
Rejection[Order[1:NbReject]] <- 1
}
return(list(Rejection=Rejection,lFDR=localFDR))

}

0 comments on commit 07e0335

Please sign in to comment.