version 1.0.0

cran · May 7, 2021 · 07e0335 · 07e0335
commit 07e0335
Show file tree

Hide file tree

Showing 14 changed files with 610 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,33 @@
+Package: qch
+Title: Query Composed Hypotheses
+Version: 1.0.0
+Authors@R: 
+    person(given = "Tristan",
+           family = "Mary-Huard",
+           role = c("aut", "cre"),
+           email = "tristan.mary-huard@agroparistech.fr",
+           comment = c(ORCID = "0000-0002-3839-9067"))
+Description: Provides functions for the joint analysis of K sets
+    of p-values obtained for a same list of items. This joint analysis is
+    performed by querying a composed hypothesis, i.e. an arbitrary complex
+    combination of simple hypotheses, as described in Mary-Huard et al.
+    (2021) <arXiv:2104.14601>. The null distribution corresponding to the
+    composed hypothesis of interest is obtained by fitting non-parametric
+    mixtures models (one for each of the simple hypothesis of the complex
+    combination). Type I error rate control is achieved through Bayesian
+    False Discovery Rate control.  The 3 main functions of the package
+    GetHinfo(), qch.fit() and qch.test() correspond to the 3 steps for
+    querying a composed hypothesis (composed H0/H1 formulation, inferring
+    the null distribution and testing the null hypothesis).
+License: GPL-3
+Depends: R (>= 2.10)
+Imports: graphics, ks, mclust, stats
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 7.1.1
+NeedsCompilation: no
+Packaged: 2021-05-06 13:12:40 UTC; dsi
+Author: Tristan Mary-Huard [aut, cre] (<https://orcid.org/0000-0002-3839-9067>)
+Maintainer: Tristan Mary-Huard <tristan.mary-huard@agroparistech.fr>
+Repository: CRAN
+Date/Publication: 2021-05-07 12:10:02 UTC
diff --git a/MD5 b/MD5
@@ -0,0 +1,13 @@
+eb74fd3746987b7711e8df9fae579209 *DESCRIPTION
+ba32463f4647e688dce1c43b869478bb *NAMESPACE
+1c242249e24450adfeb40c576d75b6b2 *R/Data.R
+d070e8e6490908159614a1268da8a11f *R/QCH_functions.R
+e0ef681901465d24f47f9139cb2c086a *R/qch-package.R
+aacbfe98ddfb739904382b9ad8e98aba *data/PvalSets.rda
+eac1213d953962608dd8a2bde7b23de5 *man/FastKerFdr.Rd
+6b4cfbe8331db12171abd96a6a00a4c2 *man/GetHinfo.Rd
+c7fa58a9ea95bd1e2929f3d0826b0b49 *man/GetHinfoEqual.Rd
+ecc1afc03d4c3aa8ff6541d20c35314f *man/PvalSets.Rd
+57d5aeceb926b756bf1fc00d86e45f03 *man/qch-package.Rd
+97de6d91273ddaab3d3a171626c2fc58 *man/qch.fit.Rd
+625b99d1355f866102a481f4c944f26f *man/qch.test.Rd
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,10 @@
+# Generated by roxygen2: do not edit by hand
+
+export(GetHinfo)
+export(GetHinfoEqual)
+export(qch.fit)
+export(qch.test)
+import(graphics)
+import(ks)
+import(mclust)
+import(stats)
diff --git a/R/Data.R b/R/Data.R
@@ -0,0 +1,13 @@
+
+#' Synthetic example to illustrate the main qch functions
+#'
+#' PvalSets is a data.frame with 10,000 rows and 3 columns. Each row corresponds to an item,
+#' columns 'Pval1' and 'Pval2'  each correspond to a test serie over the items, and column 'Class'
+#' provides the truth, i.e. if item \eqn{i} belongs to class 1 then the H0 hypothesis is true for the 2 tests,
+#' if item \eqn{i} belongs to class 2 (resp. 3) then the H0 hypothesis is true for the first (resp. second)
+#' test only, and if item \eqn{i} belongs to class 4 then both H0 hypotheses are false (for the first
+#' and the second test).
+#'
+#' @format A data.frame
+#'
+"PvalSets"
diff --git a/R/QCH_functions.R b/R/QCH_functions.R
@@ -0,0 +1,319 @@
+###################################################################
+#' Generate H0/H1 configurations and specify the ones corresponding to the composed H1
+#'
+#' @param Q number of test series to be combined
+#' @param AtLeast How many H1 hypotheses at least for the item to be of interest ?
+#' @param Consecutive Should the significant test series be consecutive ? Default=FALSE
+#'
+#' @export
+#' @return A list of two objects 'Hconfig' and 'Hconfig.H1'.
+#' Hconfig is the list of all possible combination of H0 and H1 hypotheses among Q hypotheses tested.
+#' Hconfig.H1 is the vector of components of Hconfig that correspond to the 'AtLeast' specification.
+#'
+#' @examples
+#' GetHinfo(4,2)
+#'
+#' @seealso [GetHinfoEqual()]
+GetHinfo <- function(Q,AtLeast,Consecutive=FALSE){
+
+  ## Build H configurations
+  Hconfig <- as.matrix(expand.grid(lapply(1:Q, function(q) 0:1)))
+  Hconfig <- split(Hconfig, seq(2^Q))
+
+  ## Find the ones that match H1
+  if (!Consecutive){
+    MatchingH1 <- sapply(Hconfig, function(h){sum(h)>=AtLeast})
+    Hconfig.H1 <- which(MatchingH1)
+    names(Hconfig.H1) <- NULL
+  } else {
+    Consec <- paste(rep(1,AtLeast),collapse='')
+    Hconcat <- sapply(Hconfig, function(hh){paste(hh,collapse='')})
+    Hconfig.H1 <- grep(pattern = Consec,x = Hconcat)
+  }
+
+  ## Collect results
+  return(list(Hconfig=Hconfig,Hconfig.H1=Hconfig.H1))
+
+}
+
+
+###################################################################
+#' Generate H0/H1 configurations and specify the ones corresponding to the composed H1
+#'
+#' @param Q number of test series to be combined
+#' @param Equal How many H1 hypotheses exactly for the item to be of interest ?
+#' @param Consecutive Should the significant test series be consecutive ? Default=FALSE
+#'
+#' @export
+#' @return A list of two objects 'Hconfig' and 'Hconfig.H1'.
+#' Hconfig is the list of all possible combination of H0 and H1 hypotheses among Q hypotheses tested.
+#' Hconfig.H1 is the vector of components of Hconfig that correspond to the 'Equal' specification.
+#'
+#' @examples
+#' GetHinfoEqual(4,2)
+#' @seealso [GetHinfo()]
+GetHinfoEqual <- function(Q,Equal,Consecutive=FALSE){
+
+  ## Build H configurations
+  Hconfig <- as.matrix(expand.grid(lapply(1:Q, function(q) 0:1)))
+  Hconfig <- split(Hconfig, seq(2^Q))
+
+  ## Find the ones that match H1
+  if (!Consecutive){
+    MatchingH1 <- sapply(Hconfig, function(h){sum(h)==Equal})
+    Hconfig.H1 <- which(MatchingH1)
+    names(Hconfig.H1) <- NULL
+  } else {
+    Consec <- paste(rep(1,Equal),collapse='')
+    ConsecP1 <- paste(rep(1,Equal+1),collapse='')
+    Hconcat <- sapply(Hconfig, function(hh){paste(hh,collapse='')})
+    Hconfig.H1 <- intersect(grep(pattern = Consec,x = Hconcat),which(sapply(Hconfig, function(h){sum(h)==Equal})))
+  }
+
+  ## Collect results
+  return(list(Hconfig=Hconfig,Hconfig.H1=Hconfig.H1))
+
+}
+
+
+
+###################################################################
+#' FastKerFdr
+#'
+#' @param Pval a vector of p-values (corresponding to a p-value serie)
+#' @param p0 a priori proportion of H0 hypotheses
+#' @param plotting boolean, should some diagnostic graphs be plotted. Default is FALSE.
+#' @param NbKnot The (maximum) number of knot for the kde procedure. Default is 1e5
+#' @param tol a tolerance value for convergence. Default is 1e-5
+#' @import ks mclust graphics stats
+#'
+#' @return A list of 3 objects. Object p0 is an estimate of the proportion of H0 hypotheses.,
+#' tau is the vector of H1 posteriors.
+#' f1 is a numeric vector, each coordinate i corresponding to the evaluation of the H1 density at point pi, where pi is the ith p-value in Pval.
+FastKerFdr <- function(Pval,p0=NULL,plotting=FALSE,NbKnot=1e5,tol = 1e-5){
+
+  ## Transform pvalues into N(0,1) quantiles
+  n = length(Pval)
+  X = -qnorm(Pval)
+
+  ## Get a p0 estimate
+  if(is.null(p0)){
+    p0 = min(2*sum(X < 0)/n,1-1/n);
+  }
+  p1 = 1 - p0
+
+  ## Knots, counts and initialization (using Mclust)
+  if(length(X)>NbKnot){
+    Hist = hist(X, breaks=NbKnot, plot=FALSE)
+    Knots = Hist$mids; ActualNbKnot = length(Knots); Counts = Hist$counts;
+    Xsample = sample(X, NbKnot)
+    GM = mclust::Mclust(Xsample, G=3, modelNames='E');
+    mu = max(GM$parameters$mean)
+  } else {
+    Knots = X; ActualNbKnot = length(X); Counts = rep(1,ActualNbKnot);
+    GM = mclust::Mclust(X, G=3, modelNames='E');
+    mu = max(GM$parameters$mean)
+  }
+  if (plotting){
+    Order <- order(Knots)
+    Knots <- Knots[Order]
+    Counts <- Counts[Order]
+  }
+
+  ## Initialize the taus using GM
+  phi = dnorm(Knots); f1 = dnorm(Knots, mean=mu, sd=1)
+  tau = p1*f1/(p0*phi + p1*f1)
+
+  ## Get the weighted kernel density estimate
+  diff = 2*tol; iter = 0
+  while(diff > tol){
+    iter = iter + 1
+    weights = tau*Counts; weights = ActualNbKnot * weights / sum(weights)
+    f1 = ks::kde(x=Knots, w=weights, eval.points=Knots)$estimate
+    tauNew = p1*f1/(p0*phi + p1*f1)
+    ## Dirty job 1: get rid of the f1 mass on the left
+    tauNew[Knots< -3] <- 0
+    diff = max(abs(tau - tauNew))
+    tau = tauNew
+  }
+  if(plotting){
+    Hist.fig <- hist(X, freq=TRUE, breaks=sqrt(n), main='', border=8,
+                     xlab="Q-transformed pvalues", ylab="Densities")
+    bin.width <- mean(diff(Hist.fig$breaks))
+    lines(Knots, n*bin.width*p0*phi, type='l', col=4, lwd=2);
+    lines(Knots, n*bin.width*p1*f1, col=2,lwd=2);
+    lines(Knots, n*bin.width*(p0*phi+p1*f1), lwd=2)
+    legend("topright", legend=c("H0 dist", "H1 dist","Mixture Dist"),
+           col=c("blue","red", "black"), lty=c(1,1,2), cex=0.8)
+  }
+
+  ## Now get the f1 estimate
+  KDE = ks::kde(x=Knots, w=weights, eval.points=X)
+  f1 = KDE$estimate
+
+  ## Dirty job 2: get rid of numeric problems
+  f1[f1<0] <- 1e-30
+  tau = p1*f1 / (p1*f1 + p0*dnorm(X))
+
+  return(list(p0=p0,tau=tau,f1=f1))
+}
+
+
+###################################################################
+#' Infer Hconfig posteriors
+#'
+#' @param pValMat a matrix of p-values, each column corresponding to a p-value serie.
+#' @param Hconfig an Hconfig list as generated by the [GetHinfo()] function.
+#' @param plotting a boolean. Should some diagnostic graphs be plotted ? Default is FALSE.
+#' @import stats
+#'
+#' @return A list of 2 objects 'prior' and 'posterior'.
+#' Object 'prior' is a vector of estimated prior probabilities for each of the H-configurations.
+#' Object 'posterior' is a matrix providing for each item (in row) its posterior probability to belong to each of the H-configurations (in columns).
+#' @export
+#'
+#' @examples
+#' data(PvalSets)
+#' PvalMat <- as.matrix(PvalSets[,-3])
+#' ## Build the Hconfig objects
+#' Q <- 2
+#' AtLeast <- 2
+#' Hconfig <- GetHinfo(Q,AtLeast)$Hconfig
+#'
+#' ## Run the function
+#' res.fit <- qch.fit(PvalMat,Hconfig)
+#'
+#' ## Display the prior of each class of items
+#' res.fit$prior
+#'
+#' ## Display the first posteriors
+#' head(res.fit$posterior)
+qch.fit <- function(pValMat,Hconfig, plotting=FALSE){
+
+  n <- nrow(pValMat)
+  Q <- ncol(pValMat)
+
+  #### Step 1: Marginal density estimation
+
+  ## Get p0 estimates
+  p0 <- rep(0, Q)
+  for (q in 1:Q){
+    p0[q] = min(2*sum(pValMat[,q] > 0.5)/n,1-1/n);
+  }
+  SomeH1 <-  which(p0<1)
+  NoH1 <- which(p0==1-1/n);
+  if(length(NoH1)==1){
+    message(paste("Pvalue serie",NoH1, "may have very few H1 (or a weird distribution)"))
+  }
+  if(length(NoH1)>1){
+    message(paste("Pvalue series",paste(NoH1,collapse=' '), "may have very few H1 (or a weird distribution)"))
+  }
+
+  ## Fit a 2-component mixture to each test serie using kerFdr
+  f1Mat <- matrix(1, n, Q);
+  for(q in SomeH1){
+    ker <- FastKerFdr(pValMat[, q], p0=p0[q], plotting=FALSE)
+    f1Mat[,q] <- ker$f1
+  }
+  f0Mat <- matrix(dnorm(-qnorm(pValMat)),ncol=Q)
+
+  #### Step 2: transform marginal densities into config densities
+
+  Logf0Mat <- log(f0Mat);
+  Logf1Mat <- log(f1Mat);
+  f.Hconfig <- sapply(Hconfig, function(h){
+    f <- rep(0,nrow(Logf0Mat))
+    if (length(which(h==1)) > 0){f <- f + rowSums(Logf1Mat[, which(h==1), drop=FALSE])}
+    if (length(which(h==0)) > 0){f <- f + rowSums(Logf0Mat[, which(h==0), drop=FALSE])}
+    return(exp(f))
+  })
+
+  #### Step 3: Infer prior estimation using an EM procedure
+
+  ## Initialization: simple product of marginal priors estimator
+  NewPrior <- sapply(1:length(Hconfig), function(c){
+    prod(p0[which(Hconfig[[c]]==0)]) * prod(1-p0[which(Hconfig[[c]]==1)])
+  })
+  PriorsAt0 <- which(NewPrior==0)
+
+  ## EM calibration
+  NotOK <- TRUE
+  Precision <- 1e-6
+  NoLowerThan <- 1e-7
+  while(NotOK){
+
+    ## E step
+    Tau <- f.Hconfig*(tcrossprod(rep(1:n),NewPrior))
+    Tau <- Tau/rowSums(Tau)
+
+    ## M step
+    OldPrior <- NewPrior
+    NewPrior <- colMeans(Tau)
+    if(length(PriorsAt0)==0){
+      NewPrior[NewPrior<NoLowerThan] <- NoLowerThan
+    } else {
+      NoLowerCoord <- setdiff(which(NewPrior<NoLowerThan),PriorsAt0)
+      if(length(NoLowerCoord)>0){
+        NewPrior[NoLowerCoord] <- NoLowerThan
+      }
+    }
+    NewPrior <- NewPrior/sum(NewPrior)
+    NotOK <- max((OldPrior-NewPrior)^2) > Precision
+
+  }
+  priorHconfigEM <- NewPrior
+
+  #### Step 4: Posterior computation
+  posterior <- f.Hconfig*(tcrossprod(rep(1:n),priorHconfigEM))
+  posterior <- posterior/rowSums(posterior)
+
+  #### Last but not least: output results
+  Res <- list(prior=priorHconfigEM, posterior=posterior)
+  return(Res)
+}
+
+
+###################################################################
+#' Perform composed hypothesis testing with FDR control
+#'
+#' @param posterior a matrix of posterior probabilities for each item to belong the different H-configurations, as provided by the [qch.fit()] function.
+#' @param Hconfig.H1 a list of H1 config, as created by the [GetHinfo()] function.
+#' @param Alpha the nominal Type I error rate for FDR control.
+#'
+#' @return A list of 2 objects 'Rejection' and 'lFDR'.
+#' Object 'Rejection' is a vector providing for each item the result of the composed hypothesis test, after multiple testing correction.
+#' Object 'lFDR' is a vector providing for each item its local FDR estimate.
+#' @export
+#'
+#' @examples
+#' data(PvalSets)
+#' PvalMat <- as.matrix(PvalSets[,-3])
+#' Truth <- PvalSets[,3]
+#'
+#' ## Build the Hconfig objects
+#' Q <- 2
+#' AtLeast <- 2
+#' Hconfig <- GetHinfo(Q,AtLeast)$Hconfig
+#' Hconfig.H1 <- GetHinfo(Q,AtLeast)$Hconfig.H1
+#'
+#' ## Infer the posteriors
+#' res.fit <- qch.fit(PvalMat,Hconfig)
+#'
+#' ## Run the test procedure with FDR control
+#' res.test <- qch.test(res.fit$posterior,Hconfig.H1)
+#' table(res.test$Rejection,Truth==4)
+qch.test <- function(posterior,Hconfig.H1,Alpha=0.05){
+
+  n <- nrow(posterior)
+  localFDR <- 1-rowSums(posterior[,Hconfig.H1,drop=FALSE])
+  Order <- order(localFDR)
+  FDR <- cumsum(localFDR[Order])/(1:n)
+  NbReject <- max(which(FDR<=Alpha))
+  Rejection <- rep(0,n)
+  if (NbReject>0){
+    Rejection[Order[1:NbReject]] <- 1
+  }
+  return(list(Rejection=Rejection,lFDR=localFDR))
+
+}
+