diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..61730b2 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,16 @@ +Package: groupTesting +Title: Simulating and Modeling Group (Pooled) Testing Data +Version: 1.0.0 +Date: 2021-11-19 +Author: Md S. Warasi +Maintainer: Md S. Warasi +Description: Provides an expectation-maximization (EM) algorithm using the approach introduced in Xie (2001) . The EM algorithm can be used to estimate the prevalence (overall proportion) of a disease and to estimate a binary regression model from among the class of generalized linear models based on group testing data. The estimation framework we consider offers a flexible and general approach; i.e., its application is not limited to any specific group testing protocol. Consequently, the EM algorithm can model data arising from simple pooling as well as advanced pooling such as hierarchical testing, array testing, and quality control pooling. Also, provided are functions that can be used to conduct the Wald tests described in Buse (1982) and to simulate the group testing data described in Kim et al. (2007) . +License: GPL-3 +Depends: R(>= 3.5.0) +Imports: pracma +NeedsCompilation: yes +Encoding: UTF-8 +RoxygenNote: 7.1.1 +Packaged: 2021-11-19 18:49:25 UTC; msarker +Repository: CRAN +Date/Publication: 2021-11-22 09:00:02 UTC diff --git a/MD5 b/MD5 new file mode 100644 index 0000000..f710c19 --- /dev/null +++ b/MD5 @@ -0,0 +1,16 @@ +2d2e7f5417633fc3e05179f24ce12867 *DESCRIPTION +19e4497639be28ffc4dc737e66e5e2d7 *NAMESPACE +67e9f5c036f34456c44ea590b83e93b2 *R/gtcode.R +e7110e7f48030f35698eb91d91a4e528 *R/linkfns.R +a449c57bfef435657267f231eba233ca *R/propEM.R +b6523d7dfb32c95639cced2d36b2c0fc *R/regEM.R +c0f4895459d8677dec2f2ad4600141f0 *R/waldTest.R +e30b5e61eb92e83201c46850741bfaec *man/array.gt.simulation.Rd +153f56fbddfd60e6bf7ac2fdd424b0c5 *man/glm.gt.Rd +005753654cb1ba175c4a66d57e5d166e *man/glmLink.Rd +98cd54c4cb84265e55b4bb7c982cf39e *man/hier.gt.simulation.Rd +2140f47f6eaa073f0b042aad9ed5da82 *man/prop.gt.Rd +148ef1e8336716c9208b7e77ad53fbef *man/waldTest.Rd +9b11ec0e9f983e39a5cf915d5002dc6f *src/Makevars.txt +cb44d44d2eb4d6b7ffd7d32607f5968e *src/gtestingc.c +1f0efc225a8218d7aad4216e8597ab7a *src/gtestingf.f95 diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..09491ba --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,16 @@ +# Generated by roxygen2: do not edit by hand + +export(array.gt.simulation) +export(glm.gt) +export(glmLink) +export(hier.gt.simulation) +export(prop.gt) +export(waldTest) +importFrom(pracma,fderiv) +importFrom(stats,optim) +importFrom(stats,pchisq) +importFrom(stats,pnorm) +importFrom(stats,qnorm) +importFrom(stats,rbinom) +importFrom(stats,runif) +useDynLib(groupTesting, .registration=TRUE) diff --git a/R/gtcode.R b/R/gtcode.R new file mode 100644 index 0000000..27c43d5 --- /dev/null +++ b/R/gtcode.R @@ -0,0 +1,456 @@ +#' Simulating Hierarchical Group Testing Data +#' +#' This function simulates hierarchical group testing data with any number of hierarchical stages. +#' +#' @param N The number of individuals to be tested. +#' @param p A vector of length N consisting of individual disease probabilities. +#' @param S The number of stages used in testing, where \code{S} >= 1. +#' @param psz A vector of pool sizes in stages 1-\code{S}. +#' @param Se A vector of assay sensitivities in stages 1-\code{S}. +#' @param Sp A vector of assay specificities in stages 1-\code{S}. +#' @param assayID A vector of the identification numbers of the assays used in stages 1-\code{S}. +#' @param Yt A vector of individual true disease statuses. +#' +#' @importFrom stats rbinom +#' +#' @details +#' We consider the \eqn{S}-stage hierarchical testing protocol outlined in Kim et al. (2007). Under this protocol, \eqn{N} individual specimens are first assigned to \eqn{m} non-overlapping pools, where each initial pool size is \eqn{c}; i.e., \eqn{N=mc}. The initial pools are tested in stage 1. If a pooled test is negative, all members in the pool are diagnosed as negative. However, if a pooled test is positive, the pool members are split into non-overlapping subpools to be tested in the next stage. This procedure is continued. Note that individual testing is used in the final stage, \eqn{S}, for case identification. +#' +#' \code{S} is a positive integer, \code{S} >= 1. When \code{S}=1, only the non-overlapping initial pools are tested in stage 1. +#' +#' If \code{N} is not divisible by the initial pool size \eqn{c}, we implement the following policy to test the remainder individuals: (1) when \code{S}=1, simply test the remainder pool once as a pooled sample; (2) when \code{S}>1, test the remainder pool based on 2-stage hierarchical testing. +#' +#' \code{p} is a vector of individual disease probabilities. When all individuals have the same probability of disease, say, 0.10, p can be specified as p=rep(0.10, N) or p=0.10. +#' +#' \code{psz} is a vector of length \code{S}, where the first element is the stage-1 pool size, the second element is the stage-2 pool size, and so on. Pool size at any stage must be divisible by the pool size used at the next stage. For example, \code{psz} can be specified as \code{c(12,3,1)} but not as \code{c(12,5,1)}. +#' +#' When \code{psz} is a vector of length 1, test responses are simulated only from the initial pools. +#' +#' \code{Se} is a vector of length \code{S}, where the first element is the sensitivity of the assay used in stage 1, the second element is sensitivity of the assay in stage 2, and so on. +#' +#' \code{Sp} is a vector of length \code{S}, where the first element is the specificity of the assay used in stage 1, the second element is specificity of the assay in stage 2, and so on. +#' +#' \code{assayID} is a vector of length \code{S}, where the first element is the ID of the assay in stage 1, the second element is the ID of the assay in stage 2, and so on. +#' +#' When available, the individual true disease statuses (1 for positive and 0 for negative) can be used in simulating the group testing data through argument \code{Yt}. When an input is entered for \code{Yt}, argument \code{p} will be ignored. +#' +#' @return A list with components: +#' \item{gtData}{The simulated group testing data.} +#' \item{testsExp}{The number of tests expended.} +#' +#' @export +#' +#' @references +#' Kim HY, Hudgens M, Dreyfuss J, Westreich D, and Pilcher C (2007). Comparison of Group Testing Algorithms for Case Identification in the Presence of Testing Error. \emph{Biometrics}, 63(4), 1152–1163. +#' +#' @seealso +#' \code{\link{array.gt.simulation}} for simulation of the array-based group testing data. +#' +#' @examples +#' +#' ## Example 1: Two-stage hierarchical (Dorfman) testing +#' N <- 50 # Sample size +#' psz <- c(5, 1) # Pool sizes used in stages 1 and 2 +#' S <- 2 # The number of stages +#' Se <- c(0.95, 0.95) # Sensitivities in stages 1-2 +#' Sp <- c(0.98, 0.98) # Specificities in stages 1-2 +#' assayID <- c(1, 1) # The same assay in both stages +#' +#' # (a) Homogeneous population +#' pHom <- 0.10 # Overall prevalence +#' hier.gt.simulation(N=N,p=pHom,S=S,psz=psz,Se=Se,Sp=Sp,assayID=assayID) +#' +#' # Alternatively, the individual true statuses can be used as: +#' yt <- rbinom( N, size=1, prob=0.1 ) +#' hier.gt.simulation(N=N,S=S,psz=psz,Se=Se,Sp=Sp,assayID=assayID,Yt=yt) +#' +#' # (b) Heterogeneous population (regression) +#' param <- c(-3,2,1) +#' x1 <- rnorm(N, mean=0, sd=.75) +#' x2 <- rbinom(N, size=1, prob=0.5) +#' X <- cbind(1, x1, x2) +#' pReg <- exp(X%*%param)/(1+exp(X%*%param)) # Logit +#' hier.gt.simulation(N=N,p=pReg,S=S,psz=psz,Se=Se,Sp=Sp,assayID=assayID) +#' +#' ## Example 2: Initial (1-stage) pooled testing data +#' N <- 50 +#' S <- 1 +#' Se <- 0.95 +#' Sp <- 0.98 +#' assayID <- 1 +#' +#' # (a) Homogeneous population +#' pHom <- 0.10 # Overall prevalence +#' +#' # a(i) Pooled testing +#' psz <- 5 # pool size +#' hier.gt.simulation(N,pHom,S,psz,Se,Sp,assayID) +#' +#' # a(ii) Inidividual testing +#' psz <- 1 # pool size +#' hier.gt.simulation(N,pHom,S,psz,Se,Sp,assayID) +#' +#' # (b) Heterogeneous population (regression) +#' param <- c(-3,2,1) +#' x1 <- rnorm(N, mean=0, sd=.75) +#' x2 <- rbinom(N, size=1, prob=0.5) +#' X <- cbind(1, x1, x2) +#' pReg <- exp(X%*%param)/(1+exp(X%*%param)) # Logit +#' +#' # b(i) Pooled testing +#' psz <- 5 +#' hier.gt.simulation(N,pReg,S,psz,Se,Sp,assayID) +#' +#' # b(ii) Individual testing +#' psz <- 1 +#' hier.gt.simulation(N,pReg,S,psz,Se,Sp,assayID) +#' +#' ## Example 3: Data with other configurations +#' N <- 48 +#' p <- 0.10 +#' Se <- c(.90, .95, .92, .90, .99) +#' Sp <- c(.96, .96, .90, .92, .95) +#' Assay <- 1:5 +#' +#' # Initial pooled testing, using the first element of Se, Sp & Assay +#' pszH1 <- 4 +#' hier.gt.simulation(N=N,p=p,S=1,psz=pszH1,Se=Se,Sp=Sp,assayID=Assay) +#' +#' pszH2 <- c(4,1) # Two-stage, using first 2 elements of Se, Sp & Assay +#' hier.gt.simulation(N=N,p=p,S=2,psz=pszH2,Se=Se,Sp=Sp,assayID=Assay) +#' +#' pszH4 <- c(16,8,2,1) # Four-stage, using first 4 elements of Se, Sp & Assay +#' hier.gt.simulation(N=N,p=p,S=4,psz=pszH4,Se=Se,Sp=Sp,assayID=Assay) +#' +#' pszH3 <- c(12,2,1) # Three-stage, using first 3 elements of Se, Sp & Assay +#' Assay3 <- c(2,1,3) # Array ID numbers do not need to be in order +#' hier.gt.simulation(N=N,p=p,S=3,psz=pszH3,Se=Se,Sp=Sp,assayID=Assay3) +#' +#' # Works with a remainder pool of 2 individuals +#' N <- 50 +#' psz <- c(12,2,1) +#' hier.gt.simulation(N=N,p=p,S=3,psz=psz,Se=Se,Sp=Sp,assayID=Assay) +#' +hier.gt.simulation <- function(N,p=0.10,S,psz,Se,Sp,assayID,Yt=NULL){ + c.s <- psz ## Pool sizes + S <- length(c.s) ## Number of stages + if(min(c.s)<=0) stop("Pool size cannot be negative or zero") + if(S > 1){ + quot <- rep(-9,S-1) + for(s in 1:(S-1)){quot[s] <- c.s[s]%%c.s[s+1]} + if(max(quot) > 0) stop("Pool size at any stage must be divisible by the next-stage pool size") + } + ## Setting up pooling configurations + M <- floor(N/c.s[1]) + N0 <- M*c.s[1] + if( !is.null(Yt) ){ + Ytil1 <- Yt + }else{ + Ytil1 <- stats::rbinom(N,1,p) + } + Rem <- N-N0 + Psz <- n.div <- list() + Psz[[1]] <- rep(c.s[1],M) + n.div[[1]] <- rep(1,length(Psz[[1]])) + n.sm <- matrix(-9,length(Psz[[1]]),S) + n.sm[ ,1] <- 1 + if(S > 1){ + for( s in 1:(S-1) ){ + store <- tmp <- NULL + for(i in 1:length(Psz[[s]])){ + temp <- rep( c.s[s+1],floor(Psz[[s]][i]/c.s[s+1]) ) + store <- c(store,temp) + tmp <- c(tmp,length(temp)) + } + Psz[[s+1]] <- store + n.div[[s+1]] <- tmp + } + vec <- rep(1,length(Psz[[1]])) + for(s in 1:(S-1) ){ + id0 <- cumsum(c(0,vec)) + for(i in 1:length(Psz[[1]])){ + vec[i] <- sum(n.div[[s+1]][(id0[i]+1):id0[i+1]]) + } + n.sm[ ,s+1] <- vec + } + } + Zmat <- NULL + cc <- cumsum(c(0,colSums(n.sm))) + id <- cumsum(c(0,Psz[[1]])) + pl.res <- rep(-9,cc[S+1]) + ## Simulating pooled responses at stage 1 + for(m in 1:M){ + mid <- (id[m]+1):id[m+1] + prob1 <- ifelse(sum(Ytil1[mid])>0,Se[1],1-Sp[1]) + z1 <- stats::rbinom(1,1,prob1) + pl.res[m] <- z1 + Zmat <- rbind(Zmat,c(z1,length(mid),Se[1],Sp[1],assayID[1],mid)) + } + if(S == 1){ + if(Rem > 0){ + rid1 <- (N0+1):N + zr1 <- rbinom(1,1,ifelse(sum(Ytil1[rid1])>0,Se[1],1-Sp[1])) + Zmat <- rbind(Zmat,c(zr1,Rem,Se[1],Sp[1],assayID[1],rid1,rep(-9,c.s[1]-Rem))) + warning("N is not divisible by the initial pool size; a smaller remainder pool is used") + } + } + ## Simulating pooled responses from subsequent stages + if( S > 1){ + for(s in 2:S){ + Z1 <- pl.res[(cc[s-1]+1):cc[s]] + cid <- cumsum(c(0,Psz[[s]])) + cn <- cumsum(c(0,n.div[[s]])) + tmp1 <- NULL + for(d in 1:length(Psz[[s-1]])){ + tmp3 <- NULL + if(Z1[d]==0){ + tmp3 <- rep(0,length((cn[d]+1):cn[d+1])) + } + if(Psz[[s-1]][d]==1){ + tmp3 <- Z1[d] + } + if(Psz[[s-1]][d]>1){ + if(Z1[d] == 1){ + for(i in (cn[d]+1):cn[d+1]){ + crng <- (cid[i]+1):cid[i+1] + probs <- ifelse(sum(Ytil1[crng])>0,Se[s],1-Sp[s]) + ztp1 <- stats::rbinom(1,1,probs) + tmp3 <- c(tmp3,ztp1) + fill1 <- rep(-9,Psz[[1]][1]-length(crng)) + Zmat <- rbind(Zmat,c(ztp1,length(crng),Se[s],Sp[s],assayID[s],crng,fill1)) + } + } + } + tmp1 <- c(tmp1,tmp3) + } + pl.res[(cc[s]+1):cc[s+1]] <- tmp1 + } + if(Rem == 1){ + yr1 <- rbinom(1,1,ifelse(Ytil1[N]==1,Se[S],1-Sp[S])) + Zmat <- rbind(Zmat,c(yr1,1,Se[S],Sp[S],assayID[S],N,rep(-9,c.s[1]-1))) + if( Rem > 0) warning("N is not divisible by the initial pool size; a smaller remainder pool is used") + } + if(Rem > 1){ + rid <- (M*c.s[1]+1):N + ytr1 <- Ytil1[rid] + zr2 <- rbinom(1,1,ifelse(sum(ytr1)>0,Se[S-1],1-Sp[S-1])) + Zmat <- rbind(Zmat,c(zr2,Rem,Se[S-1],Sp[S-1],assayID[S-1],rid,rep(-9,c.s[1]-Rem))) + if(zr2 > 0){ + yrm1 <- rbinom(Rem,1,ifelse(ytr1==1,Se[S],1-Sp[S])) + Zmat <- rbind(Zmat,cbind(yrm1,1,Se[S],Sp[S],assayID[S],rid,matrix(-9,Rem,c.s[1]-1))) + } + if( Rem > 0) warning("N is not divisible by the initial pool size; a smaller remainder pool is used") + } + } + ## Output + ivid <- paste( rep("Mem",Psz[[1]][1]),1:Psz[[1]][1],sep="" ) + colnames(Zmat) <- c("Z","psz","Se","Sp","Assay",ivid) + rownames(Zmat) <- paste("Pool:",1:nrow(Zmat),sep="") + list("gtData" = Zmat, + "testsExp"= nrow(Zmat)) +} + +#' Simulating Array-Based Group Testing Data +#' +#' This function simulates two-dimensional array-based group testing data. +#' +#' @param N The number of individuals to be tested. +#' @param p A vector of length N consisting of individual disease probabilities. +#' @param protocol Either "A2" or "A2M", where "A2" ("A2M") refers to the two-dimensional array without (with) testing the members of an array as a single pooled sample. +#' @param n The row (or column) size of the arrays. +#' @param Se A vector of assay sensitivities. +#' @param Sp A vector of assay specificities. +#' @param assayID A vector of assay identification numbers. +#' @param Yt A vector of individual true disease statuses. +#' +#' @importFrom stats rbinom +#' +#' @details +#' We consider the array testing protocol outlined in Kim et al. (2007). Under this protocol, \eqn{N} individuals are assigned to \eqn{m} non-overlapping \eqn{n}-by-\eqn{n} matrices such that \eqn{N=mn^2}. From each matrix, \eqn{n} pools are formed using the row specimens and another \eqn{n} pools are formed using the column specimens. In stage 1, the \eqn{2n} pools are tested. In stage 2, individual testing is used for case identification according to the strategy described in Kim et al. (2007). This is a 2-stage protocol called \emph{Square Array without Master Pool Testing} and denoted by \eqn{A2(n:1)} in Kim et al. (2007). A variant (3-stage protocol) is also presented in Kim et al. (2007) which employs testing the \eqn{n^2} array members together as an initial pooled unit before implementing the 2-stage array. If the initial pooled test is negative, the procedure stops (i.e., the 2-stage array is not needed). However, if the pooled test is positive, the 2-stage protocol is used as before. This 3-stage approach is called \emph{Square Array with Master Pool Testing} and is denoted by \eqn{A2(n^2:n:1)}. See Kim et al. (2007) for more details. +#' +#' \code{N} should be divisible by the array size \eqn{n^2}. When not divisible, the remainder individuals are tested one by one (i.e., individual testing). +#' +#' \code{p} is a vector of individual disease probabilities. When all individuals have the same probability of disease, say, 0.10, \code{p} can be specified as rep(0.10, N) or p=0.10. +#' +#' For "A2" and "A2M", the pool sizes used are \code{c(n, 1)} and \code{c(n^2, n, 1)}, respectively. +#' +#' For "A2", \code{Se} is \code{c(Se1, Se2)}, where \code{Se1} is the sensitivity of the assay used for both row and column pools, and \code{Se2} is the sensitivity of the assay used for individual testing. For "A2M", \code{Se} is \code{c(Se1, Se2, Se3)}, where \code{Se1} is for the initial array pool, \code{Se2} is for the row and column pools, and \code{Se3} is for individual testing. \code{Sp} is specified in the same manner. +#' +#' For "A2", \code{assayID} is \code{c(1, 1)} when the same assay is used for row/column pool testing as well as for individual testing, and assayID is \code{c(1, 2)} when assay 1 is used for row/column pool testing and assay 2 is used for individual testing. In the same manner, \code{assayID} is specified for "A2M" as \code{c(1, 1, 1)}, \code{c(1, 2, 3)}, and in many other ways. +#' +#' When available, the individual true disease statuses (1 for positive and 0 for negative) can be used in simulating the group testing data through argument \code{Yt}. When an input is entered for \code{Yt}, argument \code{p} will be ignored. +#' +#' @return A list with components: +#' \item{gtData}{The simulated group testing data.} +#' \item{testsExp}{The number of tests expended in the simulation.} +#' +#' @export +#' +#' @references +#' Kim HY, Hudgens M, Dreyfuss J, Westreich D, and Pilcher C (2007). Comparison of Group Testing Algorithms for Case Identification in the Presence of Testing Error. \emph{Biometrics}, 63(4), 1152–1163. +#' +#' @seealso +#' \code{\link{hier.gt.simulation}} for simulation of the hierarchical group testing data. +#' +#' @examples +#' +#' ## Example 1: Square Array without Master Pool Testing (i.e., 2-Stage Array) +#' N <- 48 # Sample size +#' protocol <- "A2" # 2-stage array +#' n <- 4 # Row/column size +#' Se <- c(0.95, 0.95) # Sensitivities in stages 1-2 +#' Sp <- c(0.98, 0.98) # Specificities in stages 1-2 +#' assayID <- c(1, 1) # The same assay in both stages +#' +#' # (a) Homogeneous population +#' pHom <- 0.10 # Overall prevalence +#' array.gt.simulation(N=N,p=pHom,protocol=protocol,n=n,Se=Se,Sp=Sp,assayID=assayID) +#' +#' # Alternatively, the individual true statuses can be used as: +#' yt <- rbinom( N, size=1, prob=0.1 ) +#' array.gt.simulation(N=N,protocol=protocol,n=n,Se=Se,Sp=Sp,assayID=assayID,Yt=yt) +#' +#' # (b) Heterogeneous population (regression) +#' param <- c(-3,2,1) +#' x1 <- rnorm(N, mean=0, sd=.75) +#' x2 <- rbinom(N, size=1, prob=0.5) +#' X <- cbind(1, x1, x2) +#' pReg <- exp(X%*%param)/(1+exp(X%*%param)) # Logit +#' array.gt.simulation(N=N,p=pReg,protocol=protocol,n=n,Se=Se,Sp=Sp,assayID=assayID) +#' +#' # The above examples with different assays +#' Se <- c(0.95, 0.98) +#' Sp <- c(0.97, 0.99) +#' assayID <- c(1, 2) +#' array.gt.simulation(N,pHom,protocol,n,Se,Sp,assayID) +#' array.gt.simulation(N,pReg,protocol,n,Se,Sp,assayID) +#' +#' ## Example 2: Square Array with Master Pool Testing (i.e., 3-Stage Array) +#' N <- 48 +#' protocol <- "A2M" +#' n <- 4 +#' Se <- c(0.95, 0.95, 0.95) +#' Sp <- c(0.98, 0.98, 0.98) +#' assayID <- c(1, 1, 1) # The same assay in 3 stages +#' +#' # (a) Homogeneous population +#' pHom <- 0.10 +#' array.gt.simulation(N,pHom,protocol,n,Se,Sp,assayID) +#' +#' # (b) Heterogeneous population (regression) +#' param <- c(-3,2,1) +#' x1 <- rnorm(N, mean=0, sd=.75) +#' x2 <- rbinom(N, size=1, prob=0.5) +#' X <- cbind(1, x1, x2) +#' pReg <- exp(X%*%param)/(1+exp(X%*%param)) # Logit +#' array.gt.simulation(N,pReg,protocol,n,Se,Sp,assayID) +#' +#' # The above examples with different assays: +#' Se <- c(0.95, 0.98, 0.98) +#' Sp <- c(0.97, 0.98, 0.92) +#' assayID <- 1:3 +#' array.gt.simulation(N,pHom,protocol,n,Se,Sp,assayID) +#' array.gt.simulation(N,pReg,protocol,n,Se,Sp,assayID) +#' +array.gt.simulation <- function(N,p=0.10,protocol=c("A2","A2M"),n,Se,Sp,assayID,Yt=NULL){ + ind.simulation <- function(Nr,prob=0.1,Se.ind,Sp.ind,yt=NULL){ + Yt.ind <- yt + if( is.null(yt) ){ + Yt.ind <- stats::rbinom(Nr,1,prob) + } + for(k in 1:Nr){ + prb <- ifelse(Yt.ind>0, Se.ind, 1-Sp.ind) + y.test <- stats::rbinom(Nr,1,prb) + } + return(y.test) + } + protocol <- match.arg(protocol) + if(n <= 1) stop("Row size and column size must be larger than 1") + if(n^2 > N) stop("The array size n*n is too large") + L <- floor(N/n^2) + N0 <- L*n^2 + if( !is.null(Yt) ){ + Ytil1 <- Yt + }else{ + Ytil1 <- stats::rbinom(N,1,p) + } + id1 <- cumsum( c(0,rep(n^2,L)) ) + Zmat <- Ymat <- NULL + ## Simulating for two-stage array + if(protocol=="A2"){ + for(l in 1:L){ + Z_id <- matrix((id1[l]+1):id1[l+1],n,n) + Ymat1 <- matrix(Ytil1[(id1[l]+1):id1[l+1]],n,n) + R1 <- stats::rbinom(n,1,ifelse(rowSums(Ymat1)>0,Se[1],1-Sp[1])) + Zmat <- rbind(Zmat,cbind(R1,n,Se[1],Sp[1],assayID[1],Z_id)) + C1 <- stats::rbinom(n,1,ifelse(colSums(Ymat1)>0,Se[1],1-Sp[1])) + Zmat <- rbind(Zmat,cbind(C1,n,Se[1],Sp[1],assayID[1],t(Z_id))) + for(i in 1:n){ + for(j in 1:n){ + T1 <- 0 + if(R1[i]==1 & C1[j]==1) T1 <- T1 + 1 + if(R1[i]==1 & sum(C1)==0) T1 <- T1 + 1 + if(sum(R1)==0 & C1[j]==1) T1 <- T1 + 1 + if(T1>=1){ + y1 <- stats::rbinom(1,1,ifelse(Ymat1[i,j]==1,Se[2],1-Sp[2])) + Ymat <- rbind(Ymat,c(y1,1,Se[2],Sp[2],assayID[2],Z_id[i,j])) + } + } + } + } + if(!is.null(Ymat)){ + Zmat <- rbind(Zmat,cbind(Ymat,matrix(-9,nrow(Ymat),n-1))) + } + idv.id <- paste( rep("Mem",n), 1:n, sep="" ) + } + ## Simulating for three-stage array + if(protocol=="A2M"){ + zfil <- matrix(-9,n,n^2-n) + for(l in 1:L){ + aryid <- (id1[l]+1):id1[l+1] + ytmp <- Ytil1[aryid] + zary <- stats::rbinom(1,1,ifelse(sum(ytmp)>0,Se[1],1-Sp[1])) + Zmat <- rbind(Zmat,c(zary,n*n,Se[1],Sp[1],assayID[1],aryid)) + if(zary==1){ + Z_id <- matrix((id1[l]+1):id1[l+1],n,n) + Ymat1 <- matrix(Ytil1[(id1[l]+1):id1[l+1]],n,n) + R1 <- stats::rbinom(n,1,ifelse(rowSums(Ymat1)>0,Se[2],1-Sp[2])) + Zmat <- rbind(Zmat,cbind(R1,n,Se[2],Sp[2],assayID[2],Z_id,zfil)) + C1 <- stats::rbinom(n,1,ifelse(colSums(Ymat1)>0,Se[2],1-Sp[2])) + Zmat <- rbind(Zmat,cbind(C1,n,Se[2],Sp[2],assayID[2],t(Z_id),zfil)) + for(i in 1:n){ + for(j in 1:n){ + T1 <- 0 + if(R1[i]==1 & C1[j]==1) T1 <- T1 + 1 + if(R1[i]==1 & sum(C1)==0) T1 <- T1 + 1 + if(sum(R1)==0 & C1[j]==1) T1 <- T1 + 1 + if(T1>=1){ + y1 <- stats::rbinom(1,1,ifelse(Ymat1[i,j]==1,Se[3],1-Sp[3])) + Ymat <- rbind(Ymat,c(y1,1,Se[3],Sp[3],assayID[3],Z_id[i,j])) + } + } + } + } + } + if(!is.null(Ymat)){ + Zmat <- rbind(Zmat,cbind(Ymat,matrix(-9,nrow(Ymat),n*n-1))) + } + idv.id <- paste( rep("Mem",n), 1:n^2, sep="" ) + } + ## Individual testing for the remainder sample + Rem <- N-N0 + if( Rem > 0) warning("N is not divisible by the array size; the remainder individuals are tested one by one") + if( Rem > 0 ){ + S <- length(Se) + ytest <- ind.simulation(Nr=Rem,Se.ind=Se[S],Sp.ind=Sp[S],yt=Ytil1[(N0+1):N]) + rmat <- matrix(-9,length(ytest),ncol(Zmat)-6) + zfil <- cbind( ytest,1,Se[S],Sp[S],assayID[S],(N0+1):N,rmat ) + Zmat <- rbind( Zmat, zfil ) + } + ## Output + rownames(Zmat) <- NULL + colnames(Zmat) <- c("Z","psz","Se","Sp","Assay",idv.id) + rownames(Zmat) <- paste("Pool:",1:nrow(Zmat),sep="") + return( list("gtData" = Zmat, + "testsExp" = nrow(Zmat)) ) +} diff --git a/R/linkfns.R b/R/linkfns.R new file mode 100644 index 0000000..33e08e1 --- /dev/null +++ b/R/linkfns.R @@ -0,0 +1,50 @@ +#' Link Functions in the Class of Generalized Linear Models +#' +#' This function provides characteristics of common link functions (logit, probit, and comlementary log-log). Specifically, based on the link name, the function with its inverse, first derivative, and second derivative is provided. +#' +#' @param fn.name One of the three: "logit", "probit", and "cloglog". +#' +#' @importFrom stats pnorm +#' @importFrom stats qnorm +#' +#' @return A list with components: +#' \item{g}{The link function corresponding to "logit", "probit", or "cloglog".} +#' \item{dg}{The first derivative of g.} +#' \item{d2g}{The second derivative of g.} +#' \item{gInv}{The inverse of g.} +#' +#' @export +#' +#' @examples +#' ## Try: +#' glmLink("logit") +#' +glmLink <- function(fn.name=c("logit","probit","cloglog")){ + fn.name <- match.arg(fn.name) + + # Logit link + if(fn.name == "logit"){ + g <- function(u){exp(u)/(1+exp(u))} + dg <- function(u){exp(u)/(1+exp(u))^2} + d2g <- function(u){exp(u)*(1-exp(u))/(1+exp(u))^3} + g.inv <- function(u){log(u/(1-u))} + } + # Probit link + if(fn.name == "probit"){ + g <- function(u){stats::pnorm(u)} + dg <- function(u){exp(-u^2/2)/sqrt(2*pi)} + d2g <- function(u){-u*exp(-u^2/2)/sqrt(2*pi)} + g.inv <- function(u){stats::qnorm(u)} + } + # Complementary log-log link + if(fn.name == "cloglog"){ + g <- function(u){1 - exp(-exp(u))} + dg <- function(u){exp(u)*exp(-exp(u))} + d2g <- function(u){exp(u)*exp(-exp(u))*(1-exp(u))} + g.inv <- function(u){log(-log(1-u))} + } + list("g" = g, + "dg" = dg, + "d2g" = d2g, + "gInv"= g.inv) +} diff --git a/R/propEM.R b/R/propEM.R new file mode 100644 index 0000000..72140a0 --- /dev/null +++ b/R/propEM.R @@ -0,0 +1,325 @@ +#' EM Algorithm to Estimate the Prevalence of a Disease from Group Testing Data +#' +#' This function implements an expectation-maximization (EM) algorithm to find the maximum likelihood estimate (MLE) of a disease prevalence, p, based on group testing data. The EM algorithm can model pooling data observed from \strong{any} group testing protocol used in practice, including hierarchical and array testing (Kim et al., 2007). +#' +#' @useDynLib groupTesting, .registration=TRUE +#' +#' @param p0 An initial value of the prevalence. +#' @param gtData A matrix or data.frame consisting of the pooled test outcomes and other information from a group testing application. Needs to be specified as shown in the example below. +#' @param covariance When TRUE, the variance is calculated at the MLE. +#' @param nburn The number of initial Gibbs iterates to be discarded. +#' @param ngit The number of Gibbs iterates to be used in the E-step after discarding the initial iterates as a burn-in period. +#' @param maxit The maximum number of EM steps (iterations) allowed in the EM algorithm. +#' @param tol Convergence tolerance used in the EM algorithm. +#' @param tracing When TRUE, progress in the EM algorithm is displayed. +#' @param conf.level Confidence level to be used for the Wald confidence interval. +#' +#' @importFrom stats rbinom +#' @importFrom stats runif +#' +#' @details +#' +#' \code{gtData} must be specified as follows. Columns 1-5 consist of the pooled test outcomes (0 for negative and 1 for positive), pool sizes, pool-specific sensitivities, pool-specific specificities, and assay ID numbers, respectively. From column 6 onward, the pool member ID numbers need to be specified. Note that the ID numbers must start with 1 and increase consecutively up to \code{N}, the total number of individuals tested. \strong{For smaller pools, incomplete ID numbers must be filled out by -9 or any non-positive numbers} as shown in the example below. +#' +#' | | Z | psz | Se | Sp | Assay | Mem1 | Mem2 | Mem3 | Mem4 | Mem5 | Mem6 | +#' |:---:|:---:|:-----:|:------:|:------:|:-------:|:-------:|:-------:|:-------:|:-------:|:-------:|:-------:| +#' | Pool:1 | 1 | 6 | 0.90 | 0.92 | 1 | 1 | 2 | 3 | 4 | 5 | 6 | +#' | Pool:2 | 0 | 6 | 0.90 | 0.92 | 1 | 7 | 8 | 9 | 10 | 11 | 12 | +#' | Pool:3 | 1 | 2 | 0.95 | 0.96 | 2 | 1 | 2 | -9 | -9 | -9 | -9 | +#' | Pool:4 | 0 | 2 | 0.95 | 0.96 | 2 | 3 | 4 | -9 | -9 | -9 | -9 | +#' | Pool:5 | 1 | 2 | 0.95 | 0.96 | 2 | 5 | 6 | -9 | -9 | -9 | -9 | +#' | Pool:6 | 0 | 1 | 0.92 | 0.90 | 3 | 1 | -9 | -9 | -9 | -9 | -9 | +#' | Pool:7 | 1 | 1 | 0.92 | 0.90 | 3 | 2 | -9 | -9 | -9 | -9 | -9 | +#' | Pool:8 | 0 | 1 | 0.92 | 0.90 | 3 | 5 | -9 | -9 | -9 | -9 | -9 | +#' | Pool:9 | 0 | 1 | 0.92 | 0.90 | 3 | 6 | -9 | -9 | -9 | -9 | -9 | +#' +#' This is an example of \code{gtData}, where 12 individuals are assigned to 2 non-overlapping initial pools and then tested based on the 3-stage hierarchical protocol. The test outcomes, \code{Z}, from 9 pools are in column 1. In three stages, different pool sizes (6, 2, and 1), sensitivities, specificities, and assays are used. The ID numbers of the pool members are shown in columns 6-11. The row names and column names are not required. Note that the EM algorithm can accommodate any group testing data including those described in Kim et al. (2007). For individual testing data, the pool size in column 2 is 1 for all pools. +#' +#' The EM algorithm implements a Gibbs sampler to approximate quantities required to complete the E-step. Under each EM iteration, \code{ngit} Gibbs samples are retained for these purposes after discarding the initial \code{nburn} samples. +#' +#' The variance of the MLE is calculated by an appeal to the missing data principle and the method outlined in Louis (1982). +#' +#' @return A list with components: +#' \item{param}{The MLE of the disease prevalence.} +#' \item{covariance}{Estimated variance for the disease prevalence.} +#' \item{iterUsed}{The number of EM iterations used for convergence.} +#' \item{convergence}{0 if the EM algorithm converges successfully and 1 if the iteration limit \code{maxit} has been reached.} +#' \item{summary}{Estimation summary with Wald confidence interval.} +#' +#' @export +#' +#' @references +#' Kim HY, Hudgens M, Dreyfuss J, Westreich D, and Pilcher C. (2007). Comparison of Group Testing Algorithms for Case Identification in the Presence of Testing Error. \emph{Biometrics}, 63:1152-1163. +#' +#' Litvak E, Tu X, and Pagano M. (1994). Screening for the Presence of a Disease by Pooling Sera Samples. \emph{Journal of the American Statistical Association}, 89:424-434. +#' +#' Liu A, Liu C, Zhang Z, and Albert P. (2012). Optimality of Group Testing in the Presence of Misclassification. \emph{Biometrika}, 99:245-251. +#' +#' Louis T. (1982). Finding the Observed Information Matrix when Using the EM algorithm. \emph{Journal of the Royal Statistical Society: Series B}, 44:226-233. +#' +#' @seealso +#' \code{\link{hier.gt.simulation}} and \code{\link{array.gt.simulation}} for group testing data simulation, and \code{\link{glm.gt}} for group testing regression models. +#' +#' @examples +#' +#' library("groupTesting") +#' +#' ## To illustrate 'prop.gt', we use data simulated +#' ## by the R functions 'hier.gt.simulation' and 'array.gt.simulation'. +#' +#' ## The simulated data-structures are consistent +#' ## with the data-structure required for 'gtData'. +#' +#' ## Example 1: MLE from 3-stage hierarchical group testing data. +#' ## The data used is simulated by 'hier.gt.simulation'. +#' +#' N <- 90 # Sample size +#' S <- 3 # 3-stage hierarchical testing +#' psz <- c(6,2,1) # Pool sizes used in stages 1-3 +#' Se <- c(.95,.95,.98) # Sensitivities in stages 1-3 +#' Sp <- c(.95,.98,.96) # Specificities in stages 1-3 +#' assayID <- c(1,2,3) # Assays used in stages 1-3 +#' p.t <- 0.05 # The TRUE parameter to be estimated +#' +#' # Simulating data: +#' set.seed(123) +#' gtOut <- hier.gt.simulation(N,p.t,S,psz,Se,Sp,assayID)$gtData +#' +#' # Running the EM algorithm: +#' pStart <- p.t + 0.2 # Initial value +#' res <- prop.gt(p0=pStart,gtData=gtOut,covariance=TRUE, +#' nburn=2000,ngit=5000,maxit=200,tol=1e-03, +#' tracing=TRUE,conf.level=0.95) +#' +#' # Estimation results: +#' # > res +#' +#' # $param +#' # [1] 0.05158 +#' +#' # $covariance +#' # [,1] +#' # [1,] 0.0006374296 +#' +#' # $iterUsed +#' # [1] 4 +#' +#' # $convergence +#' # [1] 0 +#' +#' # $summary +#' # Estimate StdErr 95%lower 95%upper +#' # prop 0.052 0.025 0.002 0.101 +#' +#' ## Example 2: MLE from two-dimensional array testing data. +#' ## The data used is simulated by 'array.gt.simulation'. +#' +#' N <- 100 # Sample size +#' protocol <- "A2" # 2-stage array without testing the initial master pool +#' n <- 5 # Row/column size +#' Se <- c(0.95, 0.95) # Sensitivities +#' Sp <- c(0.98, 0.98) # Specificities +#' assayID <- c(1, 1) # The same assay in both stages +#' p.true <- 0.05 # The TRUE parameter to be estimated +#' +#' # Simulating data: +#' set.seed(123) +#' gtOut <- array.gt.simulation(N,p.true,protocol,n,Se,Sp,assayID)$gtData +#' +#' # Fitting the model: +#' pStart <- p.true + 0.2 # Initial value +#' res <- prop.gt(p0=pStart,gtData=gtOut,covariance=TRUE) +#' print(res) +#' +#' \donttest{ +#' ## Example 3: MLE from non-overlapping initial pooled responses. +#' ## The data used is simulated by 'hier.gt.simulation'. +#' +#' ## Note: With initial pooled responses, our MLE is equivalent +#' ## to the MLE in Litvak et al. (1994) and Liu et al. (2012). +#' +#' N <- 1000 # Sample size +#' psz <- 5 # Pool size +#' S <- 1 # 1-stage testing +#' Se <- 0.95 # Sensitivity +#' Sp <- 0.99 # Specificity +#' assayID <- 1 # Assay used for all pools +#' p.true <- 0.05 # True parameter +#' +#' set.seed(123) +#' gtOut <- hier.gt.simulation(N,p.true,S,psz,Se,Sp,assayID)$gtData +#' +#' pStart <- p.true + 0.2 # Initial value +#' res <- prop.gt(p0=pStart,gtData=gtOut, +#' covariance=TRUE,nburn=2000,ngit=5000, +#' maxit=200,tol=1e-03,tracing=TRUE) +#' print(res) +#' +#' ## Example 4: MLE from individual (one-by-one) testing data. +#' ## The data used is simulated by 'hier.gt.simulation'. +#' +#' N <- 1000 # Sample size +#' psz <- 1 # Pool size 1 (i.e., individual testing) +#' S <- 1 # 1-stage testing +#' Se <- 0.95 # Sensitivity +#' Sp <- 0.99 # Specificity +#' assayID <- 1 # Assay used for all pools +#' p.true <- 0.05 # True parameter +#' +#' set.seed(123) +#' gtOut <- hier.gt.simulation(N,p.true,S,psz,Se,Sp,assayID)$gtData +#' +#' pStart <- p.true + 0.2 # Initial value +#' res <- prop.gt(p0=pStart,gtData=gtOut, +#' covariance=TRUE,nburn=2000, +#' ngit=5000,maxit=200, +#' tol=1e-03,tracing=TRUE) +#' print(res) +#' +#' ## Example 5: Using pooled testing data. +#' +#' # Pooled test outcomes: +#' Z <- c(1, 0, 1, 0, 1, 0, 1, 0, 0) +#' +#' # Pool sizes used: +#' psz <- c(6, 6, 2, 2, 2, 1, 1, 1, 1) +#' +#' # Pool-specific Se & Sp: +#' Se <- c(.90, .90, .95, .95, .95, .92, .92, .92, .92) +#' Sp <- c(.92, .92, .96, .96, .96, .90, .90, .90, .90) +#' +#' # Assays used: +#' Assay <- c(1, 1, 2, 2, 2, 3, 3, 3, 3) +#' +#' # Pool members: +#' Memb <- rbind( +#' c(1, 2, 3, 4, 5, 6), +#' c(7, 8, 9, 10, 11, 12), +#' c(1, 2, -9, -9, -9, -9), +#' c(3, 4, -9, -9, -9, -9), +#' c(5, 6, -9, -9, -9, -9), +#' c(1,-9, -9, -9, -9, -9), +#' c(2,-9, -9, -9, -9, -9), +#' c(5,-9, -9, -9, -9, -9), +#' c(6,-9, -9, -9, -9, -9) +#' ) +#' # The data-structure suited for 'gtData': +#' gtOut <- cbind(Z, psz, Se, Sp, Assay, Memb) +#' +#' # Fitting the model: +#' pStart <- 0.10 +#' res <- prop.gt(p0=pStart,gtData=gtOut, +#' covariance=TRUE,nburn=2000, +#' ngit=5000,maxit=200, +#' tol=1e-03,tracing=TRUE) +#' print(res) +#' } +#' +prop.gt <- function(p0,gtData,covariance=FALSE,nburn=2000,ngit=5000,maxit=200,tol=1e-03,tracing=TRUE,conf.level=0.95){ + + ## This block tracks the individuals assigned to a pool. + ## Note: 'gtData' must have the required specific structure. + Memb <- gtData[ ,-(1:5)] + N <- max(Memb) + maxAssign <- max(as.numeric(table(Memb[Memb > 0]))) + ytm <- matrix(-9,N,maxAssign) + tmp <- as.matrix( Memb ) + vec <- 1:nrow(gtData) + for(d in 1:N){ + tid <- tmp==d + store <- NULL + for(i in 1:ncol(tmp)){ + store <- c(store,vec[tid[ ,i]]) + } + ytm[d,1:length(store)] <- sort(store) + } + + ## Generating individual true disease statuses + ## at the initial parameter value p0 + Yt <- stats::rbinom(N,1,p0) + Ytmat <- cbind(Yt,rowSums(ytm>0),ytm) + + ## Some global variables + Ycol <- ncol(Ytmat) + SeSp <- gtData[ ,3:4] + Z <- gtData[ ,-(3:5)] + Zrow <- nrow(Z) + Zcol <- ncol(Z) + + ## The total number of Gibbs iterates + GI <- ngit + nburn + + ## Initial value of the parameter + p1 <- p0 + p0 <- p0 + 2*tol + s <- 1 + convergence <- 0 + + ## The EM algorithm starts here + while(abs(p1-p0) > tol){ + p0 <- p1 + U <- matrix(stats::runif(N*GI),nrow=N,ncol=GI) + + ## Gibbs sampling in Fortran to approximate the E-step + res <- .Call("gbsonedhom_c",as.double(p0),as.integer(Ytmat), + as.integer(Z),as.integer(N),as.double(SeSp),as.integer(Ycol), + as.integer(Zrow),as.integer(Zcol),as.double(U),as.integer(GI), + as.integer(nburn), PACKAGE="groupTesting") + temp <- sum( res )/ngit + + ## M-step: The parameter p is updated here + p1 <- temp/N + + ## Terminate the EM algorithm if it exceeds max iteration + if(s >= maxit){ + convergence <- 1 + break + } + s <- s + 1 + if(tracing){ + print(c(s-1,p1)) + } + } + + ## Covariance matrix in Fortran using Louis's (1982) method + covr2 <- NULL + if(covariance){ + U <- matrix(stats::runif(N*GI),nrow=N,ncol=GI) + Info <- .Call("cvondknachom_c",as.double(p1),as.integer(Ytmat), + as.integer(Z),as.integer(N),as.double(SeSp),as.integer(Ycol), + as.integer(Zrow),as.integer(Zcol),as.double(U),as.integer(GI), + as.integer(nburn), PACKAGE="groupTesting") + covr2 <- solve( Info ) + } + + ## Univariate Wald Inference + pHat <- p1 + if(covariance){ + se <- sqrt(covr2) + alternative <- "two.sided" + ## Calculate the test statistic: + z <- qnorm(ifelse(alternative=="two.sided", + (1+conf.level)/2, conf.level)) + ## Find the confidence interval: + CI <- c(pHat-z*se, pHat+z*se) + ## Organizing and reporting the output: + res <- data.frame(round(pHat, 3), round(se, 3), + round(CI[1],3), round(CI[2],3) ) + }else{ + res <- data.frame(round(pHat, 3), NA, NA, NA) + } + + rownames(res) <- colnames(res) <- NULL + rownames(res) <- "prop" + colnames(res) <- c("Estimate", "StdErr", + paste(conf.level*100, "%lower", sep=""), + paste(conf.level*100, "%upper", sep="")) + + # Output + list("param" = p1, + "covariance" = covr2, + "iterUsed" = s-1, + "convergence" = convergence, + "summary" = res + ) +} diff --git a/R/regEM.R b/R/regEM.R new file mode 100644 index 0000000..44915fa --- /dev/null +++ b/R/regEM.R @@ -0,0 +1,434 @@ +#' EM Algorithm for Fitting Regression Models to Group Testing Data +#' +#' This function implements an expectation-maximization (EM) algorithm to fit regression models to group testing data, where pooled responses are related to individual covariates through a link function in the generalized linear model (GLM) family. The EM algorithm finds the maximum likelihood estimate (MLE) for the vector of regression coefficients, \strong{beta}. The EM algorithm can model pooling data observed from \strong{any} group testing protocol used in practice, including hierarchical and array testing (Kim et al., 2007). +#' +#' @useDynLib groupTesting, .registration=TRUE +#' +#' @param beta0 An initial value for the regression coefficients. +#' @param gtData A matrix or data.frame consisting of the pooled test outcomes and other information from a group testing application. Needs to be specified as shown in the example below. +#' @param X The design matrix. +#' @param g An inverse link function in the GLM family. +#' @param dg The first derivate of \code{g}. When NULL, a finite-difference approximation will be used. +#' @param d2g The second derivate of \code{g}. When NULL, a finite-difference approximation will be used. +#' @param grdMethod The finite-difference approximation method to be used for \code{dg} and \code{d2g}. See 'Details'. +#' @param covariance When TRUE, the covariance matrix is calculated at the MLE. +#' @param nburn The number of initial Gibbs iterates to be discarded. +#' @param ngit The number of Gibbs iterates to be used in the E-step after discarding \code{nburn} iterates as a burn-in period. +#' @param maxit The maximum number of EM steps (iterations) allowed in the EM algorithm. +#' @param tol Convergence tolerance used in the EM algorithm. +#' @param tracing When TRUE, progress in the EM algorithm is displayed. +#' @param conf.level Confidence level to be used for the Wald confidence interval. +#' @param ... Further arguments to be passed to \code{\link{optim}}. See 'Details'. +#' +#' @importFrom stats rbinom +#' @importFrom stats runif +#' @importFrom stats pchisq +#' @importFrom stats optim +#' @importFrom pracma fderiv +#' +#' @details +#' +#' \code{gtData} must be specified as follows. Columns 1-5 consist of the pooled test outcomes (0 for negative and 1 for positive), pool sizes, pool-specific sensitivities, pool-specific specificities, and assay identification (ID) numbers, respectively. From column 6 onward, the pool member ID numbers need to be specified. Note that the ID numbers must start with 1 and increase consecutively up to \code{N}, the total number of individuals tested. \strong{For smaller pools, incomplete ID numbers must be filled out by -9 or any non-positive numbers} as shown in the example below. The design matrix \code{X} consists of invidual covariate information, such as age, sex, and symptoms, of the pool members located in column 6 onward. +#' +#' | | Z | psz | Se | Sp | Assay | Mem1 | Mem2 | Mem3 | Mem4 | Mem5 | Mem6 | +#' |:---:|:---:|:-----:|:------:|:------:|:-------:|:-------:|:-------:|:-------:|:-------:|:-------:|:-------:| +#' | Pool:1 | 1 | 6 | 0.90 | 0.92 | 1 | 1 | 2 | 3 | 4 | 5 | 6 | +#' | Pool:2 | 0 | 6 | 0.90 | 0.92 | 1 | 7 | 8 | 9 | 10 | 11 | 12 | +#' | Pool:3 | 1 | 2 | 0.95 | 0.96 | 2 | 1 | 2 | -9 | -9 | -9 | -9 | +#' | Pool:4 | 0 | 2 | 0.95 | 0.96 | 2 | 3 | 4 | -9 | -9 | -9 | -9 | +#' | Pool:5 | 1 | 2 | 0.95 | 0.96 | 2 | 5 | 6 | -9 | -9 | -9 | -9 | +#' | Pool:6 | 0 | 1 | 0.92 | 0.90 | 3 | 1 | -9 | -9 | -9 | -9 | -9 | +#' | Pool:7 | 1 | 1 | 0.92 | 0.90 | 3 | 2 | -9 | -9 | -9 | -9 | -9 | +#' | Pool:8 | 0 | 1 | 0.92 | 0.90 | 3 | 5 | -9 | -9 | -9 | -9 | -9 | +#' | Pool:9 | 0 | 1 | 0.92 | 0.90 | 3 | 6 | -9 | -9 | -9 | -9 | -9 | +#' +#' This is an example of \code{gtData}, where 12 individuals are assigned to 2 non-overlapping initial pools and then tested based on the 3-stage hierarchical protocol. The test outcomes, \code{Z}, from 9 pools are in column 1. In three stages, different pool sizes (6, 2, and 1), sensitivities, specificities, and assays are used. The ID numbers of the pool members are shown in columns 6-11. The row names and column names are not required. Note that the EM algorithm can accommodate any group testing data including those described in Kim et al. (2007). For individual testing data, the pool size in column 2 is 1 for all pools. +#' +#' \code{X} is an \eqn{N}x\eqn{k} design matrix, where each column represents a vector of individual covariate values. For an intercept model, the first column values must be 1. The column (covariate) names of X, such as 'age' and 'sex', will be displayed in the estimation summary. When column names are missing (NULL), the names that will be displayed by default are 'Intercept', 'x1', 'x2', and so on. +#' +#' The EM algorithm implements a Gibbs sampler to approximate the expectation in the E-step. Under each EM iteration, \code{ngit} Gibbs samples are retained for these purposes after discarding the initial \code{nburn} samples. +#' +#' \code{g} relates the pooled responses Z (column 1 in \code{gtData}) to \code{X}. \code{dg} and \code{d2g} can be specified analogously. These characteristics can be obtained from \code{\link{glmLink}} for the common links: logit, probit, and complementary log-log. +#' +#' \code{grdMethod} is used only when dg and d2g are NULL, where a finite-difference approximation is implemented by the function \code{fderiv} from the package 'pracma'. +#' +#' The optimization routine \code{\link{optim}} is used to complete the M-step with the default method 'Nelder-Mead'. The argument ... allows the user to change the default method as well as other arguments in \code{\link{optim}}. +#' +#' The covariance matrix is calculated by an appeal to the missing data principle and the method outlined in Louis (1982). +#' +#' @return A list with components: +#' \item{param}{The MLE of the regression coefficients.} +#' \item{covariance}{Estimated covariance matrix for the regression coefficients.} +#' \item{iterUsed}{The number of EM iterations needed for convergence.} +#' \item{convergence}{0 if the EM algorithm converges successfully and 1 if the iteration limit \code{maxit} has been reached.} +#' \item{summary}{Estimation summary with Wald confidence interval.} +#' +#' @export +#' +#' @references +#' Kim HY, Hudgens M, Dreyfuss J, Westreich D, and Pilcher C. (2007). Comparison of Group Testing Algorithms for Case Identification in the Presence of Testing Error. \emph{Biometrics}, 63:1152-1163. +#' +#' Louis T. (1982). Finding the Observed Information Matrix when Using the EM algorithm. \emph{Journal of the Royal Statistical Society: Series B}, 44:226-233. +#' +#' Vansteelandt S, Goetghebeur E, and Verstraeten T. (2000). Regression Models for Disease Prevalence with Diagnostic Tests on Pools of Serum Samples. \emph{Biometrics}, 56:1126-1133. +#' +#' @seealso +#' \code{\link{hier.gt.simulation}} and \code{\link{array.gt.simulation}} for group testing data simulation, and \code{\link{prop.gt}} for estimation of a disease prevalence from group testing data. +#' +#' @examples +#' +#' library("groupTesting") +#' +#' ## To illustrate 'glm.gt', we use data simulated +#' ## by the functions 'hier.gt.simulation' and 'array.gt.simulation'. +#' +#' ## Note: The simulated data-structures are consistent +#' ## with the data-structure required for 'gtData'. +#' +#' ## Example 1: MLE from 3-stage hierarchical group testing data. +#' ## The data used is simulated by 'hier.gt.simulation'. +#' +#' N <- 200 # Sample size +#' S <- 3 # 3-stage hierarchical testing +#' psz <- c(6,2,1) # Pool sizes used in stages 1-3 +#' Se <- c(.95,.95,.98) # Sensitivities in stages 1-3 +#' Sp <- c(.95,.98,.96) # Specificities in stages 1-3 +#' assayID <- c(1,2,3) # Assays used in stages 1-3 +#' param.t <- c(-3,2,1) # The TRUE parameter to be estimated +#' +#' # Simulating covariates: +#' set.seed(123) +#' x1 <- rnorm(N, mean=0, sd=0.75) +#' x2 <- rbinom(N, size=1, prob=0.5) +#' X <- cbind(1, x1, x2) +#' colnames( X ) <- c("Intercept", "Predictor 1", "Predictor 2") +#' # Note: Because the 1st column of X is 1, intercept model will be fit. +#' +#' # Specifying logit inverse link: +#' g <- function(t){exp(t)/(1+exp(t))} +#' pReg <- g(X%*%param.t) +#' +#' # Simulating test responses: +#' gtOut <- hier.gt.simulation(N,pReg,S,psz,Se,Sp,assayID)$gtData +#' +#' # Fitting the model (with intercept): +#' param0 <- param.t + 0.2 # Initial value +#' res <- glm.gt(beta0=param0,gtData=gtOut,X=X, +#' g=g,dg=NULL,d2g=NULL, +#' grdMethod="central",covariance=TRUE, +#' nburn=2000,ngit=5000,maxit=200, +#' tol=1e-03,tracing=TRUE,conf.level=0.95) +#' +#' # Note: Because dg and d2g are NULL (i.e., the exact derivatives +#' # are not given), numerical derivatives are used. +#' +#' # Estimation results: +#' # > res +#' +#' # $param +#' # [1] -2.840802 1.992916 0.677176 +#' +#' # $covariance +#' # [,1] [,2] [,3] +#' # [1,] 0.2134439 -0.10147555 -0.16693776 +#' # [2,] -0.1014756 0.16855122 0.02997113 +#' # [3,] -0.1669378 0.02997113 0.26324589 +#' +#' # $iterUsed +#' # [1] 10 +#' +#' # $convergence +#' # [1] 0 +#' +#' # $summary +#' # Estimate Std.Err 95%lower 95%upper +#' # Intercept -2.841 0.462 -3.746 -1.935 +#' # Predictor 1 1.993 0.411 1.188 2.798 +#' # Predictor 2 0.677 0.513 -0.328 1.683 +#' +#' ## Example 2: MLE from two-dimensional array testing data. +#' ## The data used is simulated by 'array.gt.simulation'. +#' +#' N <- 200 # Sample size +#' protocol <- "A2" # 2-stage array without testing initial master pool +#' n <- 5 # Row/column size +#' Se <- c(0.95, 0.95) # Sensitivities +#' Sp <- c(0.98, 0.98) # Specificities +#' assayID <- c(1, 1) # The same assay in both stages +#' param <- c(-4,1,1) # The TRUE parameter to be estimated +#' +#' # Simulating data: +#' set.seed(123) +#' x1 <- runif(N) +#' x2 <- rnorm(N, mean=0, sd=0.5) +#' x3 <- rbinom(N, size=1, prob=0.5) +#' X <- cbind(x1, x2, x3) +#' # Note: Because the 1st column of X is not 1, +#' # the model without intercept will be fit. +#' +#' # Finding g, dg, and d2g from the function 'glmLink': +#' res0 <- glmLink(fn.name="logit") +#' g <- res0$g # Logit inverse link g() +#' dg <- res0$dg # The exact first derivate of g +#' d2g <- res0$d2g # The exact second derivate of g +#' pReg <- g(X%*%param) # Individual probabilities +#' gtOut <- array.gt.simulation(N,pReg,protocol,n,Se,Sp,assayID)$gtData +#' +#' # Fitting the model (without intercept): +#' param0 <- param + 0.2 +#' res <- glm.gt(beta0=param0,gtData=gtOut,X=X,g=g, +#' dg=dg,d2g=d2g,covariance=TRUE, +#' nburn=2000,ngit=5000,maxit=200, +#' tol=1e-03,tracing=TRUE,conf.level=0.95) +#' print(res) +#' +#' \donttest{ +#' ## Example 3: MLE from non-overlapping initial pooled responses. +#' ## The data used is simulated by 'hier.gt.simulation'. +#' +#' ## Note: With initial pooled responses, our MLE is equivalent +#' ## to the MLE in Vansteelandt et al. (2000). +#' +#' N <- 1000 # Sample size +#' psz <- 5 # Pool size +#' S <- 1 # 1-stage testing +#' Se <- 0.95 # Sensitivity +#' Sp <- 0.99 # Specificity +#' assayID <- 1 # Assay used for all pools +#' param <- c(-3,2,1) # The TRUE parameter to be estimated +#' +#' # Simulating data: +#' set.seed(123) +#' x1 <- rnorm(N, mean=0, sd=0.75) +#' x2 <- rbinom(N, size=1, prob=0.5) +#' X <- cbind(1, x1, x2) +#' +#' # Finding g, dg, and d2g by the function 'glmLink': +#' res0 <- glmLink(fn.name="probit") # Probit link +#' g <- res0$g +#' dg <- res0$dg +#' d2g <- res0$d2g +#' pReg <- g(X%*%param) +#' gtOut <- hier.gt.simulation(N,pReg,S,psz,Se,Sp,assayID)$gtData +#' +#' # Fitting the model: +#' param0 <- param + 0.2 +#' res <- glm.gt(beta0=param0,gtData=gtOut,X=X,g=g, +#' dg=dg,d2g=d2g,covariance=TRUE, +#' nburn=2000,ngit=5000,maxit=200, +#' tol=1e-03,tracing=TRUE,conf.level=0.95) +#' print(res) +#' +#' ## Example 4: MLE from individual (one-by-one) testing data. +#' ## The data used is simulated by 'hier.gt.simulation'. +#' +#' N <- 1000 # Sample size +#' psz <- 1 # Pool size 1 (i.e., individual testing) +#' S <- 1 # 1-stage testing +#' Se <- 0.95 # Sensitivity +#' Sp <- 0.99 # Specificity +#' assayID <- 1 # Assay used for all pools +#' param <- c(-3,2,1) # The TRUE parameter to be estimated +#' +#' # Simulating data: +#' set.seed(123) +#' x1 <- rnorm(N, mean=0, sd=0.75) +#' x2 <- rbinom(N, size=1, prob=0.5) +#' X <- cbind(1, x1, x2) +#' g <- function(t){exp(t)/(1+exp(t))} # Inverse logit +#' pReg <- g(X%*%param) +#' gtOut <- hier.gt.simulation(N,pReg,S,psz,Se,Sp,assayID)$gtData +#' +#' # Fitting the model: +#' param0 <- param + 0.2 +#' res <- glm.gt(beta0=param0,gtData=gtOut, +#' X=X,g=g,dg=NULL,d2g=NULL, +#' grdMethod="central",covariance=TRUE, +#' nburn=2000,ngit=5000,maxit=200, +#' tol=1e-03,tracing=TRUE,conf.level=0.95) +#' print(res) +#' +#' ## Example 5: Using pooled testing data. +#' +#' # Pooled test outcomes: +#' Z <- c(1, 0, 1, 0, 1, 0, 1, 0, 0) +#' +#' # Design matrix, X: +#' x1 <- c(0.8,1.2,0.4,1.5,1.8,1.8,0.1,1.6,0.2,0.2,1.8,0.2) +#' x2 <- c(31,56,45,64,26,47,22,60,35,41,32,41) +#' X <- cbind(x1, x2) +#' +#' # Pool sizes used: +#' psz <- c(6, 6, 2, 2, 2, 1, 1, 1, 1) +#' +#' # Pool-specific Se & Sp: +#' Se <- c(.90, .90, .95, .95, .95, .92, .92, .92, .92) +#' Sp <- c(.92, .92, .96, .96, .96, .90, .90, .90, .90) +#' +#' # Assays used: +#' Assay <- c(1, 1, 2, 2, 2, 3, 3, 3, 3) +#' +#' # Pool members: +#' Memb <- rbind( +#' c(1, 2, 3, 4, 5, 6), +#' c(7, 8, 9, 10, 11, 12), +#' c(1, 2, -9, -9, -9, -9), +#' c(3, 4, -9, -9, -9, -9), +#' c(5, 6, -9, -9, -9, -9), +#' c(1,-9, -9, -9, -9, -9), +#' c(2,-9, -9, -9, -9, -9), +#' c(5,-9, -9, -9, -9, -9), +#' c(6,-9, -9, -9, -9, -9) +#' ) +#' # The data-structure suited for 'gtData': +#' gtOut <- cbind(Z, psz, Se, Sp, Assay, Memb) +#' +#' # Fitting the model with logit link: +#' g <- function(t){exp(t)/(1+exp(t))} +#' param0 <- c(0, 0) +#' res <- glm.gt(beta0=param0,gtData=gtOut,X=X, +#' g=g,dg=NULL,d2g=NULL, +#' grdMethod="central",covariance=TRUE, +#' nburn=2000,ngit=5000,maxit=200, +#' tol=1e-03,tracing=TRUE,conf.level=0.95) +#' print(res) +#' } +#' +glm.gt <- function(beta0,gtData,X,g,dg=NULL,d2g=NULL,grdMethod=c("central","forward","backward"),covariance=FALSE,nburn=2000,ngit=5000,maxit=200,tol=1e-03,tracing=TRUE,conf.level=0.95,...){ + + ## Objective function to be maximized in the M-step + Q.beta <- function(param,X,eyij){ + p <- apply(X%*%param, 2, g) + -sum(eyij*log(p)+(1-eyij)*log(1-p)) + } + + ## This block tracks the individuals assigned to a pool. + ## Note: 'gtData' must have the required specific structure. + Memb <- gtData[ ,-(1:5)] + N <- max(Memb) + maxAssign <- max(as.numeric(table(Memb[Memb > 0]))) + ytm <- matrix(-9,N,maxAssign) + tmp <- as.matrix( Memb ) + vec <- 1:nrow(gtData) + for(d in 1:N){ + tid <- tmp==d + store <- NULL + for(i in 1:ncol(tmp)){ + store <- c(store,vec[tid[ ,i]]) + } + ytm[d,1:length(store)] <- sort(store) + } + + ## Generating individual true disease statuses + ## at the initial parameter value beta0 + X <- as.matrix(X) + Yt <- stats::rbinom(N,1,apply(X%*%beta0,2,g)) + Ytmat <- cbind(Yt,rowSums(ytm>0),ytm) + + ## Some global variables + Ycol <- ncol(Ytmat) + SeSp <- gtData[ ,3:4] + Z <- gtData[ ,-(3:5)] + Zrow <- nrow(Z) + Zcol <- ncol(Z) + + ## The total number of Gibbs iterates + GI <- ngit + nburn + + ## Initial value of the parameter + param1 <- beta0 + param0 <- beta0 + 2*tol + s <- 1 + convergence <- 0 + + ## The EM algorithm starts here + while(max(abs(param1-param0)) > tol){ # Start iterating + param0 <- param1 + pvec <- apply(X%*%param0, 2, g) + U <- matrix(stats::runif(N*GI),nrow=N,ncol=GI) + + ## Gibbs sampling in Fortran to approximate the E-step + res1 <- .Call("gbsonedsreg_c",as.double(pvec),as.integer(Ytmat),as.integer(Z), + as.integer(N),as.double(SeSp),as.integer(Ycol),as.integer(Zrow), + as.integer(Zcol),as.double(U),as.integer(GI),as.integer(nburn), + PACKAGE="groupTesting") + ey <- res1/ngit + + ## M-step: The parameter beta is updated here + param1 <- stats::optim(par=param0,fn=Q.beta,X=X,eyij=ey,hessian=FALSE, ...)$par + + ## Terminate the EM algorithm if it exceeds max iteration + if(s >= maxit){ + convergence <- 1 + break + } + s <- s + 1 + if(tracing){ + print(c(s-1,param1)) + } + } + + ## Covariance matrix in Fortran using Louis's (1982) method + covr2 <- NULL + if(covariance){ + if( !is.null(dg) ){ + dG <- apply(X%*%param1, 2, dg) + }else{ + dG <- pracma::fderiv(g,X%*%param1,n=1,method=grdMethod) + } + if( !is.null(d2g) ){ + d2G <- apply(X%*%param1, 2, d2g) + }else{ + d2G <- pracma::fderiv(g,X%*%param1,n=2,method=grdMethod) + } + blen <- length(param1) + pvec <- apply(X%*%param1, 2, g) + U <- matrix(stats::runif(N*GI),nrow=N,ncol=GI) + Info <- .Call("cvondknacreg_c",as.double(dG),as.double(d2G),as.integer(blen), + as.double(pvec),as.double(SeSp),as.integer(Ytmat),as.integer(Z), + as.double(X),as.integer(N),as.integer(Ycol),as.integer(Zrow), + as.integer(Zcol),as.double(U),as.integer(GI),as.integer(nburn), + PACKAGE="groupTesting") + covr2 <- solve( Info ) + } + + ## Univariate Wald Inference + betaHat <- param1 + blen <- length(betaHat) + if(covariance){ + vbeta <- diag(covr2) + se <- sqrt(vbeta) + z <- rep(-9, blen) + CI <- matrix(-9, blen, 2) + alternative <- rep("two.sided",blen) + for(i in 1:blen){ + z[i] <- qnorm(ifelse(alternative[i]=="two.sided", + (1+conf.level)/2,conf.level)) + ## Find the confidence interval: + CI[i, ] <- c(betaHat[i]-z[i]*se[i],betaHat[i]+z[i]*se[i]) + } + ## Organizing and reporting the output: + res <- data.frame(round(betaHat, 3), round(se, 3), + round(CI[ ,1],3), round(CI[ ,2],3) ) + }else{ + res <- data.frame( mle=betaHat,se=rep(NA,blen),lower=rep(NA,blen),upper=rep(NA,blen) ) + } + + rownames(res) <- colnames(res) <- NULL + if(!is.null(colnames(X))){ + rownames(res) <- colnames(X) + }else{ + rownames(res) <- c("Intercept",paste(rep("x",blen-1),1:(blen-1),sep="")) + } + colnames(res) <- c("Estimate", "Std.Err", + paste(conf.level*100, "%lower", sep=""), + paste(conf.level*100, "%upper", sep="")) + + # Output + list("param" = param1, + "covariance" = covr2, + "iterUsed" = s-1, + "convergence" = convergence, + "summary" = res + ) +} diff --git a/R/waldTest.R b/R/waldTest.R new file mode 100644 index 0000000..d5d9bf1 --- /dev/null +++ b/R/waldTest.R @@ -0,0 +1,79 @@ +#' Wald Chi-Square Test +#' +#' This function implements the Wald \emph{chi-square} test on a \eqn{K}x\eqn{1} parameter vector \strong{theta}. The test assumes that \strong{thetaHat}, a consistent estimator of \strong{theta} such as MLE, is asymptotically normal with mean \strong{theta} and covariance matrix \strong{Sigma}. The function can implement 1 test on \strong{theta} as well as multiple, \strong{Q}, tests jointly on \strong{theta}. +#' +#' @param R A \eqn{Q}x\eqn{K} matrix of known coefficients depending on how the test is to be carried out. +#' @param thetaHat An estimate of \strong{theta}. +#' @param Sigma An estimated covariance matrix for \code{thetaHat}. +#' @param r A \eqn{Q}x\eqn{1} matrix of hypothesized values. +#' @param L A character string to be used as a name of the test. When NULL, "L" will be used. +#' +#' @importFrom stats pchisq +#' +#' @details +#' +#' Suppose that Q tests are to be performed jointly on the K by 1 parameter vector \strong{theta}. Let R be a \eqn{Q}x\eqn{K} matrix of known coefficients such as 0, 1, and -1, and r be a \eqn{Q}x\eqn{1} matrix of hypothesized values. The hypotheses are \eqn{H0:} \eqn{R}\eqn{\theta} = \eqn{r} vs. \eqn{H1}: \eqn{R}\eqn{\theta} != \eqn{r}. The test statistic has a chi-square distribution with Q degrees of freedom (Buse, 1982; Agresti, 2002). +#' +#' @return A data.frame object of the Wald test results. +#' +#' @export +#' +#' @references +#' +#' Agresti A. (2002). Categorical Data Analysis (2nd ed.). Wiley. ISBN 0471360937. +#' +#' Buse A. (1982). The Likelihood Ratio, Wald, and Lagrange Multiplier Tests: An Expository Note. \emph{The American Statistician}, 36:153-157. +#' +#' @examples +#' +#' ## Example 1 +#' # Parameter: p (proportion) +#' MLE <- 0.42 +#' Var <- 0.016 +#' # (a) Test H0: p = 0.50 vs. H1: p != 0.50 +#' R <- matrix(1, nrow=1, ncol=1) +#' p0 <- 0.50 +#' waldTest( R=R, thetaHat=MLE, r=p0, Sigma=Var ) +#' +#' ## Example 2 +#' # Parameter: beta = (beta1, beta2), regression coefficients +#' MLE <- c(1.09, 2.95) +#' Cov <- rbind(c(0.21, -0.27), +#' c(-0.27, 0.66)) +#' # (a) Test H0: beta1 = beta2 vs. H1: beta1 != beta2 +#' R <- rbind(c(1,-1)) +#' waldTest( R=R, thetaHat=MLE, r=0, Sigma=Cov, L="1 vs 2" ) +#' +#' # (b) Test H0: beta1 = 0 vs. H1: beta1 != 0 +#' R <- rbind(c(1,0)) +#' waldTest( R=R, thetaHat=MLE, r=0, Sigma=Cov ) +#' +#' ## Example 3 +#' # Parameter: beta = (beta0, beta1, beta2) +#' MLE <- c(-3.05, 1.99, 0.93) +#' Cov <- rbind(c( 0.045, -0.022, -0.034), +#' c(-0.022, 0.032, 0.008), +#' c(-0.034, 0.008, 0.048)) +#' +#' # Performing simultaneous test: +#' # H0: beta0 = -3, H0: beta1 = 2, H0: beta2 = 1 +#' # H1: beta0 != -3, H1: beta1 != 2, H1: beta2 != 1 +#' R <- rbind(c(1,0,0), +#' c(0,1,0), +#' c(0,0,1)) +#' r <- matrix( c(-3,2,1), nrow=3 ) +#' waldTest( R=R, thetaHat=MLE, r=r, Sigma=Cov) +#' +waldTest <- function(R,thetaHat,Sigma,r=0,L=NULL){ + Lhat <- R%*%thetaHat - r + # Wald test statistic, W: + W <- t(Lhat)%*%solve(R%*%Sigma%*%t(R))%*%Lhat + DF <- nrow(R) + p.value <- 1 - stats::pchisq(as.numeric(W),DF) + res <- data.frame(round(W,2), round(DF), round(p.value,5)) + res <- noquote(res) + colnames(res) <- c("ChiSq","DF","Pr > ChiSq") + if( is.null(L) ){ L <- "L" } + rownames(res) <- L + return(res) +} diff --git a/man/array.gt.simulation.Rd b/man/array.gt.simulation.Rd new file mode 100644 index 0000000..03eaf7f --- /dev/null +++ b/man/array.gt.simulation.Rd @@ -0,0 +1,124 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gtcode.R +\name{array.gt.simulation} +\alias{array.gt.simulation} +\title{Simulating Array-Based Group Testing Data} +\usage{ +array.gt.simulation( + N, + p = 0.1, + protocol = c("A2", "A2M"), + n, + Se, + Sp, + assayID, + Yt = NULL +) +} +\arguments{ +\item{N}{The number of individuals to be tested.} + +\item{p}{A vector of length N consisting of individual disease probabilities.} + +\item{protocol}{Either "A2" or "A2M", where "A2" ("A2M") refers to the two-dimensional array without (with) testing the members of an array as a single pooled sample.} + +\item{n}{The row (or column) size of the arrays.} + +\item{Se}{A vector of assay sensitivities.} + +\item{Sp}{A vector of assay specificities.} + +\item{assayID}{A vector of assay identification numbers.} + +\item{Yt}{A vector of individual true disease statuses.} +} +\value{ +A list with components: +\item{gtData}{The simulated group testing data.} +\item{testsExp}{The number of tests expended in the simulation.} +} +\description{ +This function simulates two-dimensional array-based group testing data. +} +\details{ +We consider the array testing protocol outlined in Kim et al. (2007). Under this protocol, \eqn{N} individuals are assigned to \eqn{m} non-overlapping \eqn{n}-by-\eqn{n} matrices such that \eqn{N=mn^2}. From each matrix, \eqn{n} pools are formed using the row specimens and another \eqn{n} pools are formed using the column specimens. In stage 1, the \eqn{2n} pools are tested. In stage 2, individual testing is used for case identification according to the strategy described in Kim et al. (2007). This is a 2-stage protocol called \emph{Square Array without Master Pool Testing} and denoted by \eqn{A2(n:1)} in Kim et al. (2007). A variant (3-stage protocol) is also presented in Kim et al. (2007) which employs testing the \eqn{n^2} array members together as an initial pooled unit before implementing the 2-stage array. If the initial pooled test is negative, the procedure stops (i.e., the 2-stage array is not needed). However, if the pooled test is positive, the 2-stage protocol is used as before. This 3-stage approach is called \emph{Square Array with Master Pool Testing} and is denoted by \eqn{A2(n^2:n:1)}. See Kim et al. (2007) for more details. + +\code{N} should be divisible by the array size \eqn{n^2}. When not divisible, the remainder individuals are tested one by one (i.e., individual testing). + +\code{p} is a vector of individual disease probabilities. When all individuals have the same probability of disease, say, 0.10, \code{p} can be specified as rep(0.10, N) or p=0.10. + +For "A2" and "A2M", the pool sizes used are \code{c(n, 1)} and \code{c(n^2, n, 1)}, respectively. + +For "A2", \code{Se} is \code{c(Se1, Se2)}, where \code{Se1} is the sensitivity of the assay used for both row and column pools, and \code{Se2} is the sensitivity of the assay used for individual testing. For "A2M", \code{Se} is \code{c(Se1, Se2, Se3)}, where \code{Se1} is for the initial array pool, \code{Se2} is for the row and column pools, and \code{Se3} is for individual testing. \code{Sp} is specified in the same manner. + +For "A2", \code{assayID} is \code{c(1, 1)} when the same assay is used for row/column pool testing as well as for individual testing, and assayID is \code{c(1, 2)} when assay 1 is used for row/column pool testing and assay 2 is used for individual testing. In the same manner, \code{assayID} is specified for "A2M" as \code{c(1, 1, 1)}, \code{c(1, 2, 3)}, and in many other ways. + +When available, the individual true disease statuses (1 for positive and 0 for negative) can be used in simulating the group testing data through argument \code{Yt}. When an input is entered for \code{Yt}, argument \code{p} will be ignored. +} +\examples{ + +## Example 1: Square Array without Master Pool Testing (i.e., 2-Stage Array) +N <- 48 # Sample size +protocol <- "A2" # 2-stage array +n <- 4 # Row/column size +Se <- c(0.95, 0.95) # Sensitivities in stages 1-2 +Sp <- c(0.98, 0.98) # Specificities in stages 1-2 +assayID <- c(1, 1) # The same assay in both stages + +# (a) Homogeneous population +pHom <- 0.10 # Overall prevalence +array.gt.simulation(N=N,p=pHom,protocol=protocol,n=n,Se=Se,Sp=Sp,assayID=assayID) + +# Alternatively, the individual true statuses can be used as: +yt <- rbinom( N, size=1, prob=0.1 ) +array.gt.simulation(N=N,protocol=protocol,n=n,Se=Se,Sp=Sp,assayID=assayID,Yt=yt) + +# (b) Heterogeneous population (regression) +param <- c(-3,2,1) +x1 <- rnorm(N, mean=0, sd=.75) +x2 <- rbinom(N, size=1, prob=0.5) +X <- cbind(1, x1, x2) +pReg <- exp(X\%*\%param)/(1+exp(X\%*\%param)) # Logit +array.gt.simulation(N=N,p=pReg,protocol=protocol,n=n,Se=Se,Sp=Sp,assayID=assayID) + +# The above examples with different assays +Se <- c(0.95, 0.98) +Sp <- c(0.97, 0.99) +assayID <- c(1, 2) +array.gt.simulation(N,pHom,protocol,n,Se,Sp,assayID) +array.gt.simulation(N,pReg,protocol,n,Se,Sp,assayID) + +## Example 2: Square Array with Master Pool Testing (i.e., 3-Stage Array) +N <- 48 +protocol <- "A2M" +n <- 4 +Se <- c(0.95, 0.95, 0.95) +Sp <- c(0.98, 0.98, 0.98) +assayID <- c(1, 1, 1) # The same assay in 3 stages + +# (a) Homogeneous population +pHom <- 0.10 +array.gt.simulation(N,pHom,protocol,n,Se,Sp,assayID) + +# (b) Heterogeneous population (regression) +param <- c(-3,2,1) +x1 <- rnorm(N, mean=0, sd=.75) +x2 <- rbinom(N, size=1, prob=0.5) +X <- cbind(1, x1, x2) +pReg <- exp(X\%*\%param)/(1+exp(X\%*\%param)) # Logit +array.gt.simulation(N,pReg,protocol,n,Se,Sp,assayID) + +# The above examples with different assays: +Se <- c(0.95, 0.98, 0.98) +Sp <- c(0.97, 0.98, 0.92) +assayID <- 1:3 +array.gt.simulation(N,pHom,protocol,n,Se,Sp,assayID) +array.gt.simulation(N,pReg,protocol,n,Se,Sp,assayID) + +} +\references{ +Kim HY, Hudgens M, Dreyfuss J, Westreich D, and Pilcher C (2007). Comparison of Group Testing Algorithms for Case Identification in the Presence of Testing Error. \emph{Biometrics}, 63(4), 1152–1163. +} +\seealso{ +\code{\link{hier.gt.simulation}} for simulation of the hierarchical group testing data. +} diff --git a/man/glm.gt.Rd b/man/glm.gt.Rd new file mode 100644 index 0000000..a42adbd --- /dev/null +++ b/man/glm.gt.Rd @@ -0,0 +1,325 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/regEM.R +\name{glm.gt} +\alias{glm.gt} +\title{EM Algorithm for Fitting Regression Models to Group Testing Data} +\usage{ +glm.gt( + beta0, + gtData, + X, + g, + dg = NULL, + d2g = NULL, + grdMethod = c("central", "forward", "backward"), + covariance = FALSE, + nburn = 2000, + ngit = 5000, + maxit = 200, + tol = 0.001, + tracing = TRUE, + conf.level = 0.95, + ... +) +} +\arguments{ +\item{beta0}{An initial value for the regression coefficients.} + +\item{gtData}{A matrix or data.frame consisting of the pooled test outcomes and other information from a group testing application. Needs to be specified as shown in the example below.} + +\item{X}{The design matrix.} + +\item{g}{An inverse link function in the GLM family.} + +\item{dg}{The first derivate of \code{g}. When NULL, a finite-difference approximation will be used.} + +\item{d2g}{The second derivate of \code{g}. When NULL, a finite-difference approximation will be used.} + +\item{grdMethod}{The finite-difference approximation method to be used for \code{dg} and \code{d2g}. See 'Details'.} + +\item{covariance}{When TRUE, the covariance matrix is calculated at the MLE.} + +\item{nburn}{The number of initial Gibbs iterates to be discarded.} + +\item{ngit}{The number of Gibbs iterates to be used in the E-step after discarding \code{nburn} iterates as a burn-in period.} + +\item{maxit}{The maximum number of EM steps (iterations) allowed in the EM algorithm.} + +\item{tol}{Convergence tolerance used in the EM algorithm.} + +\item{tracing}{When TRUE, progress in the EM algorithm is displayed.} + +\item{conf.level}{Confidence level to be used for the Wald confidence interval.} + +\item{...}{Further arguments to be passed to \code{\link{optim}}. See 'Details'.} +} +\value{ +A list with components: +\item{param}{The MLE of the regression coefficients.} +\item{covariance}{Estimated covariance matrix for the regression coefficients.} +\item{iterUsed}{The number of EM iterations needed for convergence.} +\item{convergence}{0 if the EM algorithm converges successfully and 1 if the iteration limit \code{maxit} has been reached.} +\item{summary}{Estimation summary with Wald confidence interval.} +} +\description{ +This function implements an expectation-maximization (EM) algorithm to fit regression models to group testing data, where pooled responses are related to individual covariates through a link function in the generalized linear model (GLM) family. The EM algorithm finds the maximum likelihood estimate (MLE) for the vector of regression coefficients, \strong{beta}. The EM algorithm can model pooling data observed from \strong{any} group testing protocol used in practice, including hierarchical and array testing (Kim et al., 2007). +} +\details{ +\code{gtData} must be specified as follows. Columns 1-5 consist of the pooled test outcomes (0 for negative and 1 for positive), pool sizes, pool-specific sensitivities, pool-specific specificities, and assay identification (ID) numbers, respectively. From column 6 onward, the pool member ID numbers need to be specified. Note that the ID numbers must start with 1 and increase consecutively up to \code{N}, the total number of individuals tested. \strong{For smaller pools, incomplete ID numbers must be filled out by -9 or any non-positive numbers} as shown in the example below. The design matrix \code{X} consists of invidual covariate information, such as age, sex, and symptoms, of the pool members located in column 6 onward.\tabular{cccccccccccc}{ + \tab Z \tab psz \tab Se \tab Sp \tab Assay \tab Mem1 \tab Mem2 \tab Mem3 \tab Mem4 \tab Mem5 \tab Mem6 \cr + Pool:1 \tab 1 \tab 6 \tab 0.90 \tab 0.92 \tab 1 \tab 1 \tab 2 \tab 3 \tab 4 \tab 5 \tab 6 \cr + Pool:2 \tab 0 \tab 6 \tab 0.90 \tab 0.92 \tab 1 \tab 7 \tab 8 \tab 9 \tab 10 \tab 11 \tab 12 \cr + Pool:3 \tab 1 \tab 2 \tab 0.95 \tab 0.96 \tab 2 \tab 1 \tab 2 \tab -9 \tab -9 \tab -9 \tab -9 \cr + Pool:4 \tab 0 \tab 2 \tab 0.95 \tab 0.96 \tab 2 \tab 3 \tab 4 \tab -9 \tab -9 \tab -9 \tab -9 \cr + Pool:5 \tab 1 \tab 2 \tab 0.95 \tab 0.96 \tab 2 \tab 5 \tab 6 \tab -9 \tab -9 \tab -9 \tab -9 \cr + Pool:6 \tab 0 \tab 1 \tab 0.92 \tab 0.90 \tab 3 \tab 1 \tab -9 \tab -9 \tab -9 \tab -9 \tab -9 \cr + Pool:7 \tab 1 \tab 1 \tab 0.92 \tab 0.90 \tab 3 \tab 2 \tab -9 \tab -9 \tab -9 \tab -9 \tab -9 \cr + Pool:8 \tab 0 \tab 1 \tab 0.92 \tab 0.90 \tab 3 \tab 5 \tab -9 \tab -9 \tab -9 \tab -9 \tab -9 \cr + Pool:9 \tab 0 \tab 1 \tab 0.92 \tab 0.90 \tab 3 \tab 6 \tab -9 \tab -9 \tab -9 \tab -9 \tab -9 \cr +} + + +This is an example of \code{gtData}, where 12 individuals are assigned to 2 non-overlapping initial pools and then tested based on the 3-stage hierarchical protocol. The test outcomes, \code{Z}, from 9 pools are in column 1. In three stages, different pool sizes (6, 2, and 1), sensitivities, specificities, and assays are used. The ID numbers of the pool members are shown in columns 6-11. The row names and column names are not required. Note that the EM algorithm can accommodate any group testing data including those described in Kim et al. (2007). For individual testing data, the pool size in column 2 is 1 for all pools. + +\code{X} is an \eqn{N}x\eqn{k} design matrix, where each column represents a vector of individual covariate values. For an intercept model, the first column values must be 1. The column (covariate) names of X, such as 'age' and 'sex', will be displayed in the estimation summary. When column names are missing (NULL), the names that will be displayed by default are 'Intercept', 'x1', 'x2', and so on. + +The EM algorithm implements a Gibbs sampler to approximate the expectation in the E-step. Under each EM iteration, \code{ngit} Gibbs samples are retained for these purposes after discarding the initial \code{nburn} samples. + +\code{g} relates the pooled responses Z (column 1 in \code{gtData}) to \code{X}. \code{dg} and \code{d2g} can be specified analogously. These characteristics can be obtained from \code{\link{glmLink}} for the common links: logit, probit, and complementary log-log. + +\code{grdMethod} is used only when dg and d2g are NULL, where a finite-difference approximation is implemented by the function \code{fderiv} from the package 'pracma'. + +The optimization routine \code{\link{optim}} is used to complete the M-step with the default method 'Nelder-Mead'. The argument ... allows the user to change the default method as well as other arguments in \code{\link{optim}}. + +The covariance matrix is calculated by an appeal to the missing data principle and the method outlined in Louis (1982). +} +\examples{ + +library("groupTesting") + +## To illustrate 'glm.gt', we use data simulated +## by the functions 'hier.gt.simulation' and 'array.gt.simulation'. + +## Note: The simulated data-structures are consistent +## with the data-structure required for 'gtData'. + +## Example 1: MLE from 3-stage hierarchical group testing data. +## The data used is simulated by 'hier.gt.simulation'. + +N <- 200 # Sample size +S <- 3 # 3-stage hierarchical testing +psz <- c(6,2,1) # Pool sizes used in stages 1-3 +Se <- c(.95,.95,.98) # Sensitivities in stages 1-3 +Sp <- c(.95,.98,.96) # Specificities in stages 1-3 +assayID <- c(1,2,3) # Assays used in stages 1-3 +param.t <- c(-3,2,1) # The TRUE parameter to be estimated + +# Simulating covariates: +set.seed(123) +x1 <- rnorm(N, mean=0, sd=0.75) +x2 <- rbinom(N, size=1, prob=0.5) +X <- cbind(1, x1, x2) +colnames( X ) <- c("Intercept", "Predictor 1", "Predictor 2") +# Note: Because the 1st column of X is 1, intercept model will be fit. + +# Specifying logit inverse link: +g <- function(t){exp(t)/(1+exp(t))} +pReg <- g(X\%*\%param.t) + +# Simulating test responses: +gtOut <- hier.gt.simulation(N,pReg,S,psz,Se,Sp,assayID)$gtData + +# Fitting the model (with intercept): +param0 <- param.t + 0.2 # Initial value +res <- glm.gt(beta0=param0,gtData=gtOut,X=X, + g=g,dg=NULL,d2g=NULL, + grdMethod="central",covariance=TRUE, + nburn=2000,ngit=5000,maxit=200, + tol=1e-03,tracing=TRUE,conf.level=0.95) + +# Note: Because dg and d2g are NULL (i.e., the exact derivatives +# are not given), numerical derivatives are used. + +# Estimation results: +# > res + +# $param +# [1] -2.840802 1.992916 0.677176 + +# $covariance +# [,1] [,2] [,3] +# [1,] 0.2134439 -0.10147555 -0.16693776 +# [2,] -0.1014756 0.16855122 0.02997113 +# [3,] -0.1669378 0.02997113 0.26324589 + +# $iterUsed +# [1] 10 + +# $convergence +# [1] 0 + +# $summary +# Estimate Std.Err 95\%lower 95\%upper +# Intercept -2.841 0.462 -3.746 -1.935 +# Predictor 1 1.993 0.411 1.188 2.798 +# Predictor 2 0.677 0.513 -0.328 1.683 + +## Example 2: MLE from two-dimensional array testing data. +## The data used is simulated by 'array.gt.simulation'. + +N <- 200 # Sample size +protocol <- "A2" # 2-stage array without testing initial master pool +n <- 5 # Row/column size +Se <- c(0.95, 0.95) # Sensitivities +Sp <- c(0.98, 0.98) # Specificities +assayID <- c(1, 1) # The same assay in both stages +param <- c(-4,1,1) # The TRUE parameter to be estimated + +# Simulating data: +set.seed(123) +x1 <- runif(N) +x2 <- rnorm(N, mean=0, sd=0.5) +x3 <- rbinom(N, size=1, prob=0.5) +X <- cbind(x1, x2, x3) +# Note: Because the 1st column of X is not 1, +# the model without intercept will be fit. + +# Finding g, dg, and d2g from the function 'glmLink': +res0 <- glmLink(fn.name="logit") +g <- res0$g # Logit inverse link g() +dg <- res0$dg # The exact first derivate of g +d2g <- res0$d2g # The exact second derivate of g +pReg <- g(X\%*\%param) # Individual probabilities +gtOut <- array.gt.simulation(N,pReg,protocol,n,Se,Sp,assayID)$gtData + +# Fitting the model (without intercept): +param0 <- param + 0.2 +res <- glm.gt(beta0=param0,gtData=gtOut,X=X,g=g, + dg=dg,d2g=d2g,covariance=TRUE, + nburn=2000,ngit=5000,maxit=200, + tol=1e-03,tracing=TRUE,conf.level=0.95) +print(res) + +\donttest{ +## Example 3: MLE from non-overlapping initial pooled responses. +## The data used is simulated by 'hier.gt.simulation'. + +## Note: With initial pooled responses, our MLE is equivalent +## to the MLE in Vansteelandt et al. (2000). + +N <- 1000 # Sample size +psz <- 5 # Pool size +S <- 1 # 1-stage testing +Se <- 0.95 # Sensitivity +Sp <- 0.99 # Specificity +assayID <- 1 # Assay used for all pools +param <- c(-3,2,1) # The TRUE parameter to be estimated + +# Simulating data: +set.seed(123) +x1 <- rnorm(N, mean=0, sd=0.75) +x2 <- rbinom(N, size=1, prob=0.5) +X <- cbind(1, x1, x2) + +# Finding g, dg, and d2g by the function 'glmLink': +res0 <- glmLink(fn.name="probit") # Probit link +g <- res0$g +dg <- res0$dg +d2g <- res0$d2g +pReg <- g(X\%*\%param) +gtOut <- hier.gt.simulation(N,pReg,S,psz,Se,Sp,assayID)$gtData + +# Fitting the model: +param0 <- param + 0.2 +res <- glm.gt(beta0=param0,gtData=gtOut,X=X,g=g, + dg=dg,d2g=d2g,covariance=TRUE, + nburn=2000,ngit=5000,maxit=200, + tol=1e-03,tracing=TRUE,conf.level=0.95) +print(res) + +## Example 4: MLE from individual (one-by-one) testing data. +## The data used is simulated by 'hier.gt.simulation'. + +N <- 1000 # Sample size +psz <- 1 # Pool size 1 (i.e., individual testing) +S <- 1 # 1-stage testing +Se <- 0.95 # Sensitivity +Sp <- 0.99 # Specificity +assayID <- 1 # Assay used for all pools +param <- c(-3,2,1) # The TRUE parameter to be estimated + +# Simulating data: +set.seed(123) +x1 <- rnorm(N, mean=0, sd=0.75) +x2 <- rbinom(N, size=1, prob=0.5) +X <- cbind(1, x1, x2) +g <- function(t){exp(t)/(1+exp(t))} # Inverse logit +pReg <- g(X\%*\%param) +gtOut <- hier.gt.simulation(N,pReg,S,psz,Se,Sp,assayID)$gtData + +# Fitting the model: +param0 <- param + 0.2 +res <- glm.gt(beta0=param0,gtData=gtOut, + X=X,g=g,dg=NULL,d2g=NULL, + grdMethod="central",covariance=TRUE, + nburn=2000,ngit=5000,maxit=200, + tol=1e-03,tracing=TRUE,conf.level=0.95) +print(res) + +## Example 5: Using pooled testing data. + +# Pooled test outcomes: +Z <- c(1, 0, 1, 0, 1, 0, 1, 0, 0) + +# Design matrix, X: +x1 <- c(0.8,1.2,0.4,1.5,1.8,1.8,0.1,1.6,0.2,0.2,1.8,0.2) +x2 <- c(31,56,45,64,26,47,22,60,35,41,32,41) +X <- cbind(x1, x2) + +# Pool sizes used: +psz <- c(6, 6, 2, 2, 2, 1, 1, 1, 1) + +# Pool-specific Se & Sp: +Se <- c(.90, .90, .95, .95, .95, .92, .92, .92, .92) +Sp <- c(.92, .92, .96, .96, .96, .90, .90, .90, .90) + +# Assays used: +Assay <- c(1, 1, 2, 2, 2, 3, 3, 3, 3) + +# Pool members: +Memb <- rbind( + c(1, 2, 3, 4, 5, 6), + c(7, 8, 9, 10, 11, 12), + c(1, 2, -9, -9, -9, -9), + c(3, 4, -9, -9, -9, -9), + c(5, 6, -9, -9, -9, -9), + c(1,-9, -9, -9, -9, -9), + c(2,-9, -9, -9, -9, -9), + c(5,-9, -9, -9, -9, -9), + c(6,-9, -9, -9, -9, -9) +) +# The data-structure suited for 'gtData': +gtOut <- cbind(Z, psz, Se, Sp, Assay, Memb) + +# Fitting the model with logit link: +g <- function(t){exp(t)/(1+exp(t))} +param0 <- c(0, 0) +res <- glm.gt(beta0=param0,gtData=gtOut,X=X, + g=g,dg=NULL,d2g=NULL, + grdMethod="central",covariance=TRUE, + nburn=2000,ngit=5000,maxit=200, + tol=1e-03,tracing=TRUE,conf.level=0.95) +print(res) +} + +} +\references{ +Kim HY, Hudgens M, Dreyfuss J, Westreich D, and Pilcher C. (2007). Comparison of Group Testing Algorithms for Case Identification in the Presence of Testing Error. \emph{Biometrics}, 63:1152-1163. + +Louis T. (1982). Finding the Observed Information Matrix when Using the EM algorithm. \emph{Journal of the Royal Statistical Society: Series B}, 44:226-233. + +Vansteelandt S, Goetghebeur E, and Verstraeten T. (2000). Regression Models for Disease Prevalence with Diagnostic Tests on Pools of Serum Samples. \emph{Biometrics}, 56:1126-1133. +} +\seealso{ +\code{\link{hier.gt.simulation}} and \code{\link{array.gt.simulation}} for group testing data simulation, and \code{\link{prop.gt}} for estimation of a disease prevalence from group testing data. +} diff --git a/man/glmLink.Rd b/man/glmLink.Rd new file mode 100644 index 0000000..eba9fa5 --- /dev/null +++ b/man/glmLink.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/linkfns.R +\name{glmLink} +\alias{glmLink} +\title{Link Functions in the Class of Generalized Linear Models} +\usage{ +glmLink(fn.name = c("logit", "probit", "cloglog")) +} +\arguments{ +\item{fn.name}{One of the three: "logit", "probit", and "cloglog".} +} +\value{ +A list with components: +\item{g}{The link function corresponding to "logit", "probit", or "cloglog".} +\item{dg}{The first derivative of g.} +\item{d2g}{The second derivative of g.} +\item{gInv}{The inverse of g.} +} +\description{ +This function provides characteristics of common link functions (logit, probit, and comlementary log-log). Specifically, based on the link name, the function with its inverse, first derivative, and second derivative is provided. +} +\examples{ +## Try: +glmLink("logit") + +} diff --git a/man/hier.gt.simulation.Rd b/man/hier.gt.simulation.Rd new file mode 100644 index 0000000..c3ce79e --- /dev/null +++ b/man/hier.gt.simulation.Rd @@ -0,0 +1,146 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gtcode.R +\name{hier.gt.simulation} +\alias{hier.gt.simulation} +\title{Simulating Hierarchical Group Testing Data} +\usage{ +hier.gt.simulation(N, p = 0.1, S, psz, Se, Sp, assayID, Yt = NULL) +} +\arguments{ +\item{N}{The number of individuals to be tested.} + +\item{p}{A vector of length N consisting of individual disease probabilities.} + +\item{S}{The number of stages used in testing, where \code{S} >= 1.} + +\item{psz}{A vector of pool sizes in stages 1-\code{S}.} + +\item{Se}{A vector of assay sensitivities in stages 1-\code{S}.} + +\item{Sp}{A vector of assay specificities in stages 1-\code{S}.} + +\item{assayID}{A vector of the identification numbers of the assays used in stages 1-\code{S}.} + +\item{Yt}{A vector of individual true disease statuses.} +} +\value{ +A list with components: +\item{gtData}{The simulated group testing data.} +\item{testsExp}{The number of tests expended.} +} +\description{ +This function simulates hierarchical group testing data with any number of hierarchical stages. +} +\details{ +We consider the \eqn{S}-stage hierarchical testing protocol outlined in Kim et al. (2007). Under this protocol, \eqn{N} individual specimens are first assigned to \eqn{m} non-overlapping pools, where each initial pool size is \eqn{c}; i.e., \eqn{N=mc}. The initial pools are tested in stage 1. If a pooled test is negative, all members in the pool are diagnosed as negative. However, if a pooled test is positive, the pool members are split into non-overlapping subpools to be tested in the next stage. This procedure is continued. Note that individual testing is used in the final stage, \eqn{S}, for case identification. + +\code{S} is a positive integer, \code{S} >= 1. When \code{S}=1, only the non-overlapping initial pools are tested in stage 1. + +If \code{N} is not divisible by the initial pool size \eqn{c}, we implement the following policy to test the remainder individuals: (1) when \code{S}=1, simply test the remainder pool once as a pooled sample; (2) when \code{S}>1, test the remainder pool based on 2-stage hierarchical testing. + +\code{p} is a vector of individual disease probabilities. When all individuals have the same probability of disease, say, 0.10, p can be specified as p=rep(0.10, N) or p=0.10. + +\code{psz} is a vector of length \code{S}, where the first element is the stage-1 pool size, the second element is the stage-2 pool size, and so on. Pool size at any stage must be divisible by the pool size used at the next stage. For example, \code{psz} can be specified as \code{c(12,3,1)} but not as \code{c(12,5,1)}. + +When \code{psz} is a vector of length 1, test responses are simulated only from the initial pools. + +\code{Se} is a vector of length \code{S}, where the first element is the sensitivity of the assay used in stage 1, the second element is sensitivity of the assay in stage 2, and so on. + +\code{Sp} is a vector of length \code{S}, where the first element is the specificity of the assay used in stage 1, the second element is specificity of the assay in stage 2, and so on. + +\code{assayID} is a vector of length \code{S}, where the first element is the ID of the assay in stage 1, the second element is the ID of the assay in stage 2, and so on. + +When available, the individual true disease statuses (1 for positive and 0 for negative) can be used in simulating the group testing data through argument \code{Yt}. When an input is entered for \code{Yt}, argument \code{p} will be ignored. +} +\examples{ + +## Example 1: Two-stage hierarchical (Dorfman) testing +N <- 50 # Sample size +psz <- c(5, 1) # Pool sizes used in stages 1 and 2 +S <- 2 # The number of stages +Se <- c(0.95, 0.95) # Sensitivities in stages 1-2 +Sp <- c(0.98, 0.98) # Specificities in stages 1-2 +assayID <- c(1, 1) # The same assay in both stages + +# (a) Homogeneous population +pHom <- 0.10 # Overall prevalence +hier.gt.simulation(N=N,p=pHom,S=S,psz=psz,Se=Se,Sp=Sp,assayID=assayID) + +# Alternatively, the individual true statuses can be used as: +yt <- rbinom( N, size=1, prob=0.1 ) +hier.gt.simulation(N=N,S=S,psz=psz,Se=Se,Sp=Sp,assayID=assayID,Yt=yt) + +# (b) Heterogeneous population (regression) +param <- c(-3,2,1) +x1 <- rnorm(N, mean=0, sd=.75) +x2 <- rbinom(N, size=1, prob=0.5) +X <- cbind(1, x1, x2) +pReg <- exp(X\%*\%param)/(1+exp(X\%*\%param)) # Logit +hier.gt.simulation(N=N,p=pReg,S=S,psz=psz,Se=Se,Sp=Sp,assayID=assayID) + +## Example 2: Initial (1-stage) pooled testing data +N <- 50 +S <- 1 +Se <- 0.95 +Sp <- 0.98 +assayID <- 1 + +# (a) Homogeneous population +pHom <- 0.10 # Overall prevalence + +# a(i) Pooled testing +psz <- 5 # pool size +hier.gt.simulation(N,pHom,S,psz,Se,Sp,assayID) + +# a(ii) Inidividual testing +psz <- 1 # pool size +hier.gt.simulation(N,pHom,S,psz,Se,Sp,assayID) + +# (b) Heterogeneous population (regression) +param <- c(-3,2,1) +x1 <- rnorm(N, mean=0, sd=.75) +x2 <- rbinom(N, size=1, prob=0.5) +X <- cbind(1, x1, x2) +pReg <- exp(X\%*\%param)/(1+exp(X\%*\%param)) # Logit + +# b(i) Pooled testing +psz <- 5 +hier.gt.simulation(N,pReg,S,psz,Se,Sp,assayID) + +# b(ii) Individual testing +psz <- 1 +hier.gt.simulation(N,pReg,S,psz,Se,Sp,assayID) + +## Example 3: Data with other configurations +N <- 48 +p <- 0.10 +Se <- c(.90, .95, .92, .90, .99) +Sp <- c(.96, .96, .90, .92, .95) +Assay <- 1:5 + +# Initial pooled testing, using the first element of Se, Sp & Assay +pszH1 <- 4 +hier.gt.simulation(N=N,p=p,S=1,psz=pszH1,Se=Se,Sp=Sp,assayID=Assay) + +pszH2 <- c(4,1) # Two-stage, using first 2 elements of Se, Sp & Assay +hier.gt.simulation(N=N,p=p,S=2,psz=pszH2,Se=Se,Sp=Sp,assayID=Assay) + +pszH4 <- c(16,8,2,1) # Four-stage, using first 4 elements of Se, Sp & Assay +hier.gt.simulation(N=N,p=p,S=4,psz=pszH4,Se=Se,Sp=Sp,assayID=Assay) + +pszH3 <- c(12,2,1) # Three-stage, using first 3 elements of Se, Sp & Assay +Assay3 <- c(2,1,3) # Array ID numbers do not need to be in order +hier.gt.simulation(N=N,p=p,S=3,psz=pszH3,Se=Se,Sp=Sp,assayID=Assay3) + +# Works with a remainder pool of 2 individuals +N <- 50 +psz <- c(12,2,1) +hier.gt.simulation(N=N,p=p,S=3,psz=psz,Se=Se,Sp=Sp,assayID=Assay) + +} +\references{ +Kim HY, Hudgens M, Dreyfuss J, Westreich D, and Pilcher C (2007). Comparison of Group Testing Algorithms for Case Identification in the Presence of Testing Error. \emph{Biometrics}, 63(4), 1152–1163. +} +\seealso{ +\code{\link{array.gt.simulation}} for simulation of the array-based group testing data. +} diff --git a/man/prop.gt.Rd b/man/prop.gt.Rd new file mode 100644 index 0000000..049a1d8 --- /dev/null +++ b/man/prop.gt.Rd @@ -0,0 +1,237 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/propEM.R +\name{prop.gt} +\alias{prop.gt} +\title{EM Algorithm to Estimate the Prevalence of a Disease from Group Testing Data} +\usage{ +prop.gt( + p0, + gtData, + covariance = FALSE, + nburn = 2000, + ngit = 5000, + maxit = 200, + tol = 0.001, + tracing = TRUE, + conf.level = 0.95 +) +} +\arguments{ +\item{p0}{An initial value of the prevalence.} + +\item{gtData}{A matrix or data.frame consisting of the pooled test outcomes and other information from a group testing application. Needs to be specified as shown in the example below.} + +\item{covariance}{When TRUE, the variance is calculated at the MLE.} + +\item{nburn}{The number of initial Gibbs iterates to be discarded.} + +\item{ngit}{The number of Gibbs iterates to be used in the E-step after discarding the initial iterates as a burn-in period.} + +\item{maxit}{The maximum number of EM steps (iterations) allowed in the EM algorithm.} + +\item{tol}{Convergence tolerance used in the EM algorithm.} + +\item{tracing}{When TRUE, progress in the EM algorithm is displayed.} + +\item{conf.level}{Confidence level to be used for the Wald confidence interval.} +} +\value{ +A list with components: +\item{param}{The MLE of the disease prevalence.} +\item{covariance}{Estimated variance for the disease prevalence.} +\item{iterUsed}{The number of EM iterations used for convergence.} +\item{convergence}{0 if the EM algorithm converges successfully and 1 if the iteration limit \code{maxit} has been reached.} +\item{summary}{Estimation summary with Wald confidence interval.} +} +\description{ +This function implements an expectation-maximization (EM) algorithm to find the maximum likelihood estimate (MLE) of a disease prevalence, p, based on group testing data. The EM algorithm can model pooling data observed from \strong{any} group testing protocol used in practice, including hierarchical and array testing (Kim et al., 2007). +} +\details{ +\code{gtData} must be specified as follows. Columns 1-5 consist of the pooled test outcomes (0 for negative and 1 for positive), pool sizes, pool-specific sensitivities, pool-specific specificities, and assay ID numbers, respectively. From column 6 onward, the pool member ID numbers need to be specified. Note that the ID numbers must start with 1 and increase consecutively up to \code{N}, the total number of individuals tested. \strong{For smaller pools, incomplete ID numbers must be filled out by -9 or any non-positive numbers} as shown in the example below.\tabular{cccccccccccc}{ + \tab Z \tab psz \tab Se \tab Sp \tab Assay \tab Mem1 \tab Mem2 \tab Mem3 \tab Mem4 \tab Mem5 \tab Mem6 \cr + Pool:1 \tab 1 \tab 6 \tab 0.90 \tab 0.92 \tab 1 \tab 1 \tab 2 \tab 3 \tab 4 \tab 5 \tab 6 \cr + Pool:2 \tab 0 \tab 6 \tab 0.90 \tab 0.92 \tab 1 \tab 7 \tab 8 \tab 9 \tab 10 \tab 11 \tab 12 \cr + Pool:3 \tab 1 \tab 2 \tab 0.95 \tab 0.96 \tab 2 \tab 1 \tab 2 \tab -9 \tab -9 \tab -9 \tab -9 \cr + Pool:4 \tab 0 \tab 2 \tab 0.95 \tab 0.96 \tab 2 \tab 3 \tab 4 \tab -9 \tab -9 \tab -9 \tab -9 \cr + Pool:5 \tab 1 \tab 2 \tab 0.95 \tab 0.96 \tab 2 \tab 5 \tab 6 \tab -9 \tab -9 \tab -9 \tab -9 \cr + Pool:6 \tab 0 \tab 1 \tab 0.92 \tab 0.90 \tab 3 \tab 1 \tab -9 \tab -9 \tab -9 \tab -9 \tab -9 \cr + Pool:7 \tab 1 \tab 1 \tab 0.92 \tab 0.90 \tab 3 \tab 2 \tab -9 \tab -9 \tab -9 \tab -9 \tab -9 \cr + Pool:8 \tab 0 \tab 1 \tab 0.92 \tab 0.90 \tab 3 \tab 5 \tab -9 \tab -9 \tab -9 \tab -9 \tab -9 \cr + Pool:9 \tab 0 \tab 1 \tab 0.92 \tab 0.90 \tab 3 \tab 6 \tab -9 \tab -9 \tab -9 \tab -9 \tab -9 \cr +} + + +This is an example of \code{gtData}, where 12 individuals are assigned to 2 non-overlapping initial pools and then tested based on the 3-stage hierarchical protocol. The test outcomes, \code{Z}, from 9 pools are in column 1. In three stages, different pool sizes (6, 2, and 1), sensitivities, specificities, and assays are used. The ID numbers of the pool members are shown in columns 6-11. The row names and column names are not required. Note that the EM algorithm can accommodate any group testing data including those described in Kim et al. (2007). For individual testing data, the pool size in column 2 is 1 for all pools. + +The EM algorithm implements a Gibbs sampler to approximate quantities required to complete the E-step. Under each EM iteration, \code{ngit} Gibbs samples are retained for these purposes after discarding the initial \code{nburn} samples. + +The variance of the MLE is calculated by an appeal to the missing data principle and the method outlined in Louis (1982). +} +\examples{ + +library("groupTesting") + +## To illustrate 'prop.gt', we use data simulated +## by the R functions 'hier.gt.simulation' and 'array.gt.simulation'. + +## The simulated data-structures are consistent +## with the data-structure required for 'gtData'. + +## Example 1: MLE from 3-stage hierarchical group testing data. +## The data used is simulated by 'hier.gt.simulation'. + +N <- 90 # Sample size +S <- 3 # 3-stage hierarchical testing +psz <- c(6,2,1) # Pool sizes used in stages 1-3 +Se <- c(.95,.95,.98) # Sensitivities in stages 1-3 +Sp <- c(.95,.98,.96) # Specificities in stages 1-3 +assayID <- c(1,2,3) # Assays used in stages 1-3 +p.t <- 0.05 # The TRUE parameter to be estimated + +# Simulating data: +set.seed(123) +gtOut <- hier.gt.simulation(N,p.t,S,psz,Se,Sp,assayID)$gtData + +# Running the EM algorithm: +pStart <- p.t + 0.2 # Initial value +res <- prop.gt(p0=pStart,gtData=gtOut,covariance=TRUE, + nburn=2000,ngit=5000,maxit=200,tol=1e-03, + tracing=TRUE,conf.level=0.95) + +# Estimation results: +# > res + +# $param +# [1] 0.05158 + +# $covariance +# [,1] +# [1,] 0.0006374296 + +# $iterUsed +# [1] 4 + +# $convergence +# [1] 0 + +# $summary +# Estimate StdErr 95\%lower 95\%upper +# prop 0.052 0.025 0.002 0.101 + +## Example 2: MLE from two-dimensional array testing data. +## The data used is simulated by 'array.gt.simulation'. + +N <- 100 # Sample size +protocol <- "A2" # 2-stage array without testing the initial master pool +n <- 5 # Row/column size +Se <- c(0.95, 0.95) # Sensitivities +Sp <- c(0.98, 0.98) # Specificities +assayID <- c(1, 1) # The same assay in both stages +p.true <- 0.05 # The TRUE parameter to be estimated + +# Simulating data: +set.seed(123) +gtOut <- array.gt.simulation(N,p.true,protocol,n,Se,Sp,assayID)$gtData + +# Fitting the model: +pStart <- p.true + 0.2 # Initial value +res <- prop.gt(p0=pStart,gtData=gtOut,covariance=TRUE) +print(res) + +\donttest{ +## Example 3: MLE from non-overlapping initial pooled responses. +## The data used is simulated by 'hier.gt.simulation'. + +## Note: With initial pooled responses, our MLE is equivalent +## to the MLE in Litvak et al. (1994) and Liu et al. (2012). + +N <- 1000 # Sample size +psz <- 5 # Pool size +S <- 1 # 1-stage testing +Se <- 0.95 # Sensitivity +Sp <- 0.99 # Specificity +assayID <- 1 # Assay used for all pools +p.true <- 0.05 # True parameter + +set.seed(123) +gtOut <- hier.gt.simulation(N,p.true,S,psz,Se,Sp,assayID)$gtData + +pStart <- p.true + 0.2 # Initial value +res <- prop.gt(p0=pStart,gtData=gtOut, + covariance=TRUE,nburn=2000,ngit=5000, + maxit=200,tol=1e-03,tracing=TRUE) +print(res) + +## Example 4: MLE from individual (one-by-one) testing data. +## The data used is simulated by 'hier.gt.simulation'. + +N <- 1000 # Sample size +psz <- 1 # Pool size 1 (i.e., individual testing) +S <- 1 # 1-stage testing +Se <- 0.95 # Sensitivity +Sp <- 0.99 # Specificity +assayID <- 1 # Assay used for all pools +p.true <- 0.05 # True parameter + +set.seed(123) +gtOut <- hier.gt.simulation(N,p.true,S,psz,Se,Sp,assayID)$gtData + +pStart <- p.true + 0.2 # Initial value +res <- prop.gt(p0=pStart,gtData=gtOut, + covariance=TRUE,nburn=2000, + ngit=5000,maxit=200, + tol=1e-03,tracing=TRUE) +print(res) + +## Example 5: Using pooled testing data. + +# Pooled test outcomes: +Z <- c(1, 0, 1, 0, 1, 0, 1, 0, 0) + +# Pool sizes used: +psz <- c(6, 6, 2, 2, 2, 1, 1, 1, 1) + +# Pool-specific Se & Sp: +Se <- c(.90, .90, .95, .95, .95, .92, .92, .92, .92) +Sp <- c(.92, .92, .96, .96, .96, .90, .90, .90, .90) + +# Assays used: +Assay <- c(1, 1, 2, 2, 2, 3, 3, 3, 3) + +# Pool members: +Memb <- rbind( + c(1, 2, 3, 4, 5, 6), + c(7, 8, 9, 10, 11, 12), + c(1, 2, -9, -9, -9, -9), + c(3, 4, -9, -9, -9, -9), + c(5, 6, -9, -9, -9, -9), + c(1,-9, -9, -9, -9, -9), + c(2,-9, -9, -9, -9, -9), + c(5,-9, -9, -9, -9, -9), + c(6,-9, -9, -9, -9, -9) +) +# The data-structure suited for 'gtData': +gtOut <- cbind(Z, psz, Se, Sp, Assay, Memb) + +# Fitting the model: +pStart <- 0.10 +res <- prop.gt(p0=pStart,gtData=gtOut, + covariance=TRUE,nburn=2000, + ngit=5000,maxit=200, + tol=1e-03,tracing=TRUE) +print(res) +} + +} +\references{ +Kim HY, Hudgens M, Dreyfuss J, Westreich D, and Pilcher C. (2007). Comparison of Group Testing Algorithms for Case Identification in the Presence of Testing Error. \emph{Biometrics}, 63:1152-1163. + +Litvak E, Tu X, and Pagano M. (1994). Screening for the Presence of a Disease by Pooling Sera Samples. \emph{Journal of the American Statistical Association}, 89:424-434. + +Liu A, Liu C, Zhang Z, and Albert P. (2012). Optimality of Group Testing in the Presence of Misclassification. \emph{Biometrika}, 99:245-251. + +Louis T. (1982). Finding the Observed Information Matrix when Using the EM algorithm. \emph{Journal of the Royal Statistical Society: Series B}, 44:226-233. +} +\seealso{ +\code{\link{hier.gt.simulation}} and \code{\link{array.gt.simulation}} for group testing data simulation, and \code{\link{glm.gt}} for group testing regression models. +} diff --git a/man/waldTest.Rd b/man/waldTest.Rd new file mode 100644 index 0000000..58350e1 --- /dev/null +++ b/man/waldTest.Rd @@ -0,0 +1,74 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/waldTest.R +\name{waldTest} +\alias{waldTest} +\title{Wald Chi-Square Test} +\usage{ +waldTest(R, thetaHat, Sigma, r = 0, L = NULL) +} +\arguments{ +\item{R}{A \eqn{Q}x\eqn{K} matrix of known coefficients depending on how the test is to be carried out.} + +\item{thetaHat}{An estimate of \strong{theta}.} + +\item{Sigma}{An estimated covariance matrix for \code{thetaHat}.} + +\item{r}{A \eqn{Q}x\eqn{1} matrix of hypothesized values.} + +\item{L}{A character string to be used as a name of the test. When NULL, "L" will be used.} +} +\value{ +A data.frame object of the Wald test results. +} +\description{ +This function implements the Wald \emph{chi-square} test on a \eqn{K}x\eqn{1} parameter vector \strong{theta}. The test assumes that \strong{thetaHat}, a consistent estimator of \strong{theta} such as MLE, is asymptotically normal with mean \strong{theta} and covariance matrix \strong{Sigma}. The function can implement 1 test on \strong{theta} as well as multiple, \strong{Q}, tests jointly on \strong{theta}. +} +\details{ +Suppose that Q tests are to be performed jointly on the K by 1 parameter vector \strong{theta}. Let R be a \eqn{Q}x\eqn{K} matrix of known coefficients such as 0, 1, and -1, and r be a \eqn{Q}x\eqn{1} matrix of hypothesized values. The hypotheses are \eqn{H0:} \eqn{R}\eqn{\theta} = \eqn{r} vs. \eqn{H1}: \eqn{R}\eqn{\theta} != \eqn{r}. The test statistic has a chi-square distribution with Q degrees of freedom (Buse, 1982; Agresti, 2002). +} +\examples{ + +## Example 1 +# Parameter: p (proportion) +MLE <- 0.42 +Var <- 0.016 +# (a) Test H0: p = 0.50 vs. H1: p != 0.50 +R <- matrix(1, nrow=1, ncol=1) +p0 <- 0.50 +waldTest( R=R, thetaHat=MLE, r=p0, Sigma=Var ) + +## Example 2 +# Parameter: beta = (beta1, beta2), regression coefficients +MLE <- c(1.09, 2.95) +Cov <- rbind(c(0.21, -0.27), + c(-0.27, 0.66)) +# (a) Test H0: beta1 = beta2 vs. H1: beta1 != beta2 +R <- rbind(c(1,-1)) +waldTest( R=R, thetaHat=MLE, r=0, Sigma=Cov, L="1 vs 2" ) + +# (b) Test H0: beta1 = 0 vs. H1: beta1 != 0 +R <- rbind(c(1,0)) +waldTest( R=R, thetaHat=MLE, r=0, Sigma=Cov ) + +## Example 3 +# Parameter: beta = (beta0, beta1, beta2) +MLE <- c(-3.05, 1.99, 0.93) +Cov <- rbind(c( 0.045, -0.022, -0.034), + c(-0.022, 0.032, 0.008), + c(-0.034, 0.008, 0.048)) + +# Performing simultaneous test: +# H0: beta0 = -3, H0: beta1 = 2, H0: beta2 = 1 +# H1: beta0 != -3, H1: beta1 != 2, H1: beta2 != 1 +R <- rbind(c(1,0,0), + c(0,1,0), + c(0,0,1)) +r <- matrix( c(-3,2,1), nrow=3 ) +waldTest( R=R, thetaHat=MLE, r=r, Sigma=Cov) + +} +\references{ +Agresti A. (2002). Categorical Data Analysis (2nd ed.). Wiley. ISBN 0471360937. + +Buse A. (1982). The Likelihood Ratio, Wald, and Lagrange Multiplier Tests: An Expository Note. \emph{The American Statistician}, 36:153-157. +} diff --git a/src/Makevars.txt b/src/Makevars.txt new file mode 100644 index 0000000..a08adda --- /dev/null +++ b/src/Makevars.txt @@ -0,0 +1,12 @@ +PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) + +C_OBJS = gtestingc.o +FT_OBJS = gtestingf.o + +all: + @$(MAKE) $(SHLIB) + @rm -f *.mod *.o + +$(SHLIB): $(FT_OBJS) $(C_OBJS) + +gtestingc.o: gtestingf.o diff --git a/src/gtestingc.c b/src/gtestingc.c new file mode 100644 index 0000000..62b69a5 --- /dev/null +++ b/src/gtestingc.c @@ -0,0 +1,58 @@ +#include +#include +#include // for NULL +#include +#include + +void F77_NAME(gbsonedhom_f)(double *p, int *Ytmat, int *Zmat, int *N, double *SeSp, int *Ycol, int *Zrow, int *Zcol, double *U, int *GI, int *a, int *ret); + +extern SEXP gbsonedhom_c(SEXP p, SEXP Ytmat, SEXP Zmat, SEXP N, SEXP SeSp, SEXP Ycol, SEXP Zrow, SEXP Zcol, SEXP U, SEXP GI, SEXP a){ + SEXP ret; + PROTECT(ret = allocVector(INTSXP, asInteger(N))); + F77_CALL(gbsonedhom_f)(REAL(p),INTEGER(Ytmat),INTEGER(Zmat),INTEGER(N),REAL(SeSp),INTEGER(Ycol),INTEGER(Zrow),INTEGER(Zcol),REAL(U),INTEGER(GI),INTEGER(a),INTEGER(ret)); + UNPROTECT(1); + return(ret); +} + +void F77_NAME(cvondknachom_f)(double *p, int *Ytmat, int *Zmat, int *N, double *SeSp, int *Ycol, int *Zrow, int *Zcol, double *U, int *GI, int *a, double *ret); + +extern SEXP cvondknachom_c(SEXP p, SEXP Ytmat, SEXP Zmat, SEXP N, SEXP SeSp, SEXP Ycol, SEXP Zrow, SEXP Zcol, SEXP U, SEXP GI, SEXP a){ + SEXP ret; + PROTECT(ret = allocVector(REALSXP, 1)); + F77_CALL(cvondknachom_f)(REAL(p),INTEGER(Ytmat),INTEGER(Zmat),INTEGER(N),REAL(SeSp),INTEGER(Ycol),INTEGER(Zrow),INTEGER(Zcol),REAL(U),INTEGER(GI),INTEGER(a),REAL(ret)); + UNPROTECT(1); + return(ret); +} + +void F77_NAME(gbsonedsreg_f)(double *p, int *Ytmat, int *Zmat, int *N, double *SeSp, int *Ycol, int *Zrow, int *Zcol, double *U, int *GI, int *a, int *ret); + +extern SEXP gbsonedsreg_c(SEXP p, SEXP Ytmat, SEXP Zmat, SEXP N, SEXP SeSp, SEXP Ycol, SEXP Zrow, SEXP Zcol, SEXP U, SEXP GI, SEXP a){ + SEXP ret; + PROTECT(ret = allocVector(INTSXP, asInteger(N))); + F77_CALL(gbsonedsreg_f)(REAL(p),INTEGER(Ytmat),INTEGER(Zmat),INTEGER(N),REAL(SeSp),INTEGER(Ycol),INTEGER(Zrow),INTEGER(Zcol),REAL(U),INTEGER(GI),INTEGER(a),INTEGER(ret)); + UNPROTECT(1); + return(ret); +} + +void F77_NAME(cvondknacreg_f)(double *dg, double *d2g, int *blen, double *p, double *SeSp, int *Ytmat, int *Zmat, double *X, int *N, int *Ycol, int *Zrow, int *Zcol, double *U, int *GI, int *a, double *ret); + +extern SEXP cvondknacreg_c(SEXP dg, SEXP d2g, SEXP blen, SEXP p, SEXP SeSp, SEXP Ytmat, SEXP Zmat, SEXP X, SEXP N, SEXP Ycol, SEXP Zrow, SEXP Zcol, SEXP U, SEXP GI, SEXP a){ + SEXP ret; + PROTECT(ret = allocMatrix(REALSXP, asInteger(blen), asInteger(blen))); + F77_CALL(cvondknacreg_f)(REAL(dg),REAL(d2g),INTEGER(blen),REAL(p),REAL(SeSp),INTEGER(Ytmat),INTEGER(Zmat),REAL(X),INTEGER(N),INTEGER(Ycol),INTEGER(Zrow),INTEGER(Zcol),REAL(U),INTEGER(GI),INTEGER(a),REAL(ret)); + UNPROTECT(1); + return(ret); +} + +static const R_CallMethodDef CallEntries[] = { + {"gbsonedhom_c", (DL_FUNC) &gbsonedhom_c, 11}, + {"cvondknachom_c", (DL_FUNC) &cvondknachom_c, 11}, + {"gbsonedsreg_c", (DL_FUNC) &gbsonedsreg_c, 11}, + {"cvondknacreg_c", (DL_FUNC) &cvondknacreg_c, 15}, + {NULL, NULL, 0} +}; + +void R_init_groupTesting(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/src/gtestingf.f95 b/src/gtestingf.f95 new file mode 100644 index 0000000..5bc51c0 --- /dev/null +++ b/src/gtestingf.f95 @@ -0,0 +1,326 @@ +module gtesting + use, intrinsic :: iso_c_binding + + implicit none + private + public :: gbsonedhom_f + public :: cvondknachom_f + public :: gbsonedsreg_f + + +contains + + subroutine gbsonedhom_f(p,Ytmat,Zmat,N,SeSp,Ycol,Zrow,Zcol,U,GI,a,ret) bind(C, name="gbsonedhom_f_") + implicit none + integer(kind = c_int), intent(in) :: N, Ycol, Zrow, Zcol, GI, a + real(kind = c_double), intent(in) :: p + real(kind = c_double), intent(in), dimension(Zrow,2) :: SeSp + integer(kind = c_int), intent(inout), dimension(N,Ycol) :: Ytmat + integer(kind = c_int), intent(in), dimension(Zrow,Zcol) :: Zmat + real(kind = c_double), intent(in), dimension(N,GI) :: U + integer(kind = c_int), intent(inout), dimension(N) :: ret + + integer :: g, i, j, k, s + integer :: Z1, sm1, gma1 + real(kind = 8) :: zeta, zeta0, zeta1 + integer :: pid, psz + real(kind = 8) :: Se1, Sp1, RSe1, RSp1 + + do i = 1, N + ret(i) = 0 + end do + do g = 1, GI + do i = 1, N + Ytmat(i,1) = 0 + zeta0 = 1.0_c_double + zeta1 = 1.0_c_double + do j=1, Ytmat(i,2) + pid = Ytmat(i,2+j) + Z1 = Zmat(pid,1) + psz = Zmat(pid,2) + Se1 = SeSp(pid,1) + Sp1 = SeSp(pid,2) + sm1 = 0 + do s=1, psz + k = Zmat(pid,2+s) + sm1 = sm1 + Ytmat(k,1) + end do + gma1 = 0 + if(sm1 .gt. 0) then + gma1 = 1 + end if + RSe1 = (Se1**Z1)*((1_c_double-Se1)**(1_c_double-Z1)) + RSp1 = (Sp1**(1_c_double-Z1))*((1_c_double-Sp1)**Z1) + zeta0 = zeta0*(RSe1**gma1)*(RSp1**(1_c_double-gma1)) + zeta1 = zeta1*RSe1 + end do + zeta0 = zeta0*(1.0_c_double-p) + zeta1 = zeta1*p + zeta = zeta0 + zeta1 + zeta0 = zeta0/zeta + if(U(i,g) .gt. zeta0) then + Ytmat(i,1)=1 + else + Ytmat(i,1)=0 + end if + if(g .gt. a) then + ret(i) = ret(i) + Ytmat(i,1) + end if + end do + end do + end subroutine gbsonedhom_f + + subroutine cvondknachom_f(p,Ytmat,Zmat,N,SeSp,Ycol,Zrow,Zcol,U,GI,a,ret) bind(C, name="cvondknachom_f_") + implicit none + integer(kind = c_int), intent(in) :: N, Ycol, Zrow, Zcol, GI, a + real(kind = c_double), intent(in) :: p + real(kind = c_double), intent(in), dimension(Zrow,2) :: SeSp + integer(kind = c_int), intent(inout), dimension(N,Ycol) :: Ytmat + integer(kind = c_int), intent(in), dimension(Zrow,Zcol) :: Zmat + real(kind = c_double), intent(in), dimension(N,GI) :: U + real(kind = c_double), intent(out) :: ret + + integer :: g, i, j, k, s + integer :: Z1, sm1, gma1 + real(kind = 8) :: zeta, zeta0, zeta1 + integer :: pid, psz + real(kind = 8) :: Se1, Sp1, RSe1, RSp1 + integer, dimension(N) :: V + integer :: ytsm + real(kind = 8) :: Elc, dlc, Eyi, tmp1, dlcmat(GI-a) + real(kind = 8) :: covr, d2Q + + V = 0 + covr = 0 + d2Q = 0 + Elc = 0 + do g = 1, GI + ytsm = 0 + do i = 1, N + Ytmat(i,1) = 0 + zeta0 = 1.0_c_double + zeta1 = 1.0_c_double + do j=1, Ytmat(i,2) + pid = Ytmat(i,2+j) + Z1 = Zmat(pid,1) + psz = Zmat(pid,2) + Se1 = SeSp(pid,1) + Sp1 = SeSp(pid,2) + sm1 = 0 + do s=1, psz + k = Zmat(pid,2+s) + sm1 = sm1 + Ytmat(k,1) + end do + gma1 = 0 + if(sm1 .gt. 0) then + gma1 = 1 + end if + RSe1 = (Se1**Z1)*((1_c_double-Se1)**(1_c_double-Z1)) + RSp1 = (Sp1**(1-Z1))*((1_c_double-Sp1)**Z1) + zeta0 = zeta0*(RSe1**gma1)*(RSp1**(1_c_double-gma1)) + zeta1 = zeta1*RSe1 + end do + zeta0 = zeta0*(1.0_c_double-p) + zeta1 = zeta1*p + zeta = zeta0 + zeta1 + zeta0 = zeta0/zeta + if(U(i,g) .gt. zeta0) then + Ytmat(i,1)=1 + else + Ytmat(i,1)=0 + end if + if(g .gt. a) then + V(i) = V(i) + Ytmat(i,1) + ytsm = ytsm + Ytmat(i,1) + end if + end do + if(g .gt. a) then + dlc = dble(ytsm-p*N)/(p*(1.0_c_double-p)) + Elc = Elc + dlc + dlcmat(g-a) = dlc + end if + end do + + Elc = Elc/dble(GI-a) + do g=1, (GI-a) + covr = covr+( dlcmat(g)-Elc )*( dlcmat(g)-Elc ) + end do + do i = 1, N + Eyi = dble(V(i))/dble(GI-a) + tmp1 = Eyi/(p**2) + (1.0_c_double-Eyi)/( (1.0_c_double-p)**2 ) + d2Q = d2Q + tmp1 + end do + ret = d2Q - covr/dble(GI-a) + end subroutine cvondknachom_f + + subroutine gbsonedsreg_f(p,Ytmat,Zmat,N,SeSp,Ycol,Zrow,Zcol,U,GI,a,ret) bind(C, name="gbsonedsreg_f_") + implicit none + integer(kind = c_int), intent(in) :: N, Ycol, Zrow, Zcol, GI, a + real(kind = c_double), intent(in), dimension(N) :: p + real(kind = c_double), intent(in), dimension(Zrow,2) :: SeSp + integer(kind = c_int), intent(inout), dimension(N,Ycol) :: Ytmat + integer(kind = c_int), intent(in), dimension(Zrow,Zcol) :: Zmat + real(kind = c_double), intent(in), dimension(N,GI) :: U + integer(kind = c_int), intent(inout), dimension(N) :: ret + + integer :: g, i, j, k, s + integer :: Z1, sm1, gma1 + real(kind = 8) :: zeta, zeta0, zeta1 + integer :: pid, psz + real(kind = 8) :: Se1, Sp1, RSe1, RSp1 + + do i = 1, N + ret(i) = 0 + end do + do g = 1, GI + do i = 1, N + Ytmat(i,1) = 0 + zeta0 = 1.0_c_double + zeta1 = 1.0_c_double + do j=1, Ytmat(i,2) + pid = Ytmat(i,2+j) + Z1 = Zmat(pid,1) + psz = Zmat(pid,2) + Se1 = SeSp(pid,1) + Sp1 = SeSp(pid,2) + sm1 = 0 + do s=1, psz + k = Zmat(pid,2+s) + sm1 = sm1 + Ytmat(k,1) + end do + gma1 = 0 + if(sm1 .gt. 0) then + gma1 = 1 + end if + RSe1 = (Se1**Z1)*((1_c_double-Se1)**(1_c_double-Z1)) + RSp1 = (Sp1**(1_c_double-Z1))*((1_c_double-Sp1)**Z1) + zeta0 = zeta0*(RSe1**gma1)*(RSp1**(1_c_double-gma1)) + zeta1 = zeta1*RSe1 + end do + zeta0 = (1.0_c_double-p(i))*zeta0 + zeta1 = p(i)*zeta1 + zeta = zeta0 + zeta1 + zeta0 = zeta0/zeta + if(U(i,g) .gt. zeta0) then + Ytmat(i,1)=1 + else + Ytmat(i,1)=0 + end if + if(g .gt. a) then + ret(i) = ret(i) + Ytmat(i,1) + end if + end do + end do + end subroutine gbsonedsreg_f + + subroutine cvondknacreg_f(dg,d2g,blen,p,SeSp,Ytmat,Zmat,X,N,Ycol,Zrow,Zcol,U,GI,a,ret) bind(C, name="cvondknacreg_f_") + implicit none + integer(kind = c_int), intent(in) :: blen, N, Ycol, Zrow, Zcol, GI, a + real(kind = c_double), intent(in), dimension(N) :: dg + real(kind = c_double), intent(in), dimension(N) :: d2g + real(kind = c_double), intent(in), dimension(N) :: p + real(kind = c_double), intent(in), dimension(Zrow,2) :: SeSp + real(kind = c_double), intent(in), dimension(N,blen) :: X + integer(kind = c_int), intent(inout), dimension(N,Ycol) :: Ytmat + integer(kind = c_int), intent(in), dimension(Zrow,Zcol) :: Zmat + real(kind = c_double), intent(in), dimension(N,GI) :: U + real(kind = c_double), intent(out), dimension(blen,blen) :: ret + + real(kind = 8), dimension(blen,blen) :: d2Q + real(kind = 8), dimension(blen,blen) :: covr + integer, dimension(N) :: V + integer :: g, i, j, k, s, c, d + integer :: Z1, sm1, gma1 + real(kind = 8) :: zeta, zeta0, zeta1 + integer :: pid, psz + real(kind = 8) :: Se1, Sp1, RSe1, RSp1 + real(kind = 8), dimension(blen) :: Elc + real(kind = 8), dimension(blen) :: dlc + real(kind = 8) :: Eyi, tmp1, tmp2, lctmp1, d2pdb + real(kind = 8), dimension(blen) :: dpdb + real(kind = 8), dimension(GI-a,blen) :: dlcmat + + V = 0 + Elc = 0.0 + d2Q = 0.0 + covr = 0.0 + do g = 1, GI + dlc = 0.0 + do i = 1, N + Ytmat(i,1) = 0 + zeta0 = 1.0_c_double + zeta1 = 1.0_c_double + do j=1, Ytmat(i,2) + pid = Ytmat(i,2+j) + Z1 = Zmat(pid,1) + psz = Zmat(pid,2) + Se1 = SeSp(pid,1) + Sp1 = SeSp(pid,2) + sm1 = 0 + do s=1, psz + k = Zmat(pid,2+s) + sm1 = sm1 + Ytmat(k,1) + end do + gma1 = 0 + if(sm1 .gt. 0) then + gma1 = 1 + end if + RSe1 = (Se1**Z1)*((1_c_double-Se1)**(1_c_double-Z1)) + RSp1 = (Sp1**(1_c_double-Z1))*((1_c_double-Sp1)**Z1) + zeta0 = zeta0*(RSe1**gma1)*(RSp1**(1_c_double-gma1)) + zeta1 = zeta1*RSe1 + end do + zeta0 = zeta0*(1.0_c_double-p(i)) + zeta1 = zeta1*p(i) + zeta = zeta0 + zeta1 + zeta0 = zeta0/zeta + if(U(i,g) .gt. zeta0) then + Ytmat(i,1)=1 + else + Ytmat(i,1)=0 + end if + if(g .gt. a) then + V(i) = V(i) + Ytmat(i,1) + lctmp1 = dble((Ytmat(i,1)-p(i))*dg(i))/(p(i)*(1.0_c_double-p(i))) + do s=1, blen + dlc(s) = dlc(s) + lctmp1*X(i,s) + end do + end if + end do + if(g .gt. a) then + do s=1, blen + Elc(s) = Elc(s) + dlc(s) + dlcmat((g-a),s) = dlc(s) + end do + end if + end do + + do s=1, blen + Elc(s) = Elc(s)/dble(GI-a) + end do + do c=1, blen + do d=1, blen + do g=1, (GI-a) + covr(c,d) = covr(c,d)+( dlcmat(g,c)-Elc(c) )*( dlcmat(g,d)-Elc(d) ) + end do + end do + end do + + do i = 1, N + Eyi = dble(V(i))/dble(GI-a) + tmp1 = Eyi/(p(i)**2) + (1.0_c_double-Eyi)/( (1.0_c_double-p(i))**2 ) + tmp2 = (p(i) - Eyi)/( p(i)*(1.0_c_double-p(i)) ) + do s = 1, blen + dpdb(s) = dg(i)*X(i,s) + end do + do c = 1, blen + do d = 1, blen + d2pdb = d2g(i)*X(i,c)*X(i,d) + d2Q(c,d) = d2Q(c,d) + dpdb(c)*dpdb(d)*tmp1 + tmp2*d2pdb + end do + end do + end do + ret = d2Q - covr/dble(GI-a) + end subroutine cvondknacreg_f + +end module gtesting