diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..2457597 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,15 @@ +Package: LGEWIS +Type: Package +Title: Tests for Genetic Association/Gene-Environment Interaction in + Longitudinal Gene-Environment-Wide Interaction Studies +Version: 0.2 +Date: 2015-10-07 +Author: Zihuai He, Seunggeun Lee, Bhramar Mukherjee, Min Zhang +Maintainer: Zihuai He +Description: Functions for testing the genetic association/gene-environment interaction in longitudinal gene-environment-wide interaction studies. Generalized score type tests are used for set based analyses. Then GEE based score tests are applied to all single variants within the defined set. +License: GPL-3 +Depends: CompQuadForm, SKAT, geeM, pls, splines +NeedsCompilation: no +Packaged: 2015-10-12 13:14:06 UTC; statzihuai +Repository: CRAN +Date/Publication: 2015-10-14 09:48:39 diff --git a/MD5 b/MD5 new file mode 100644 index 0000000..27bed04 --- /dev/null +++ b/MD5 @@ -0,0 +1,15 @@ +89882734b5cc2f53cad68ee5b4fc7316 *DESCRIPTION +f0ef963ee49ab0bc55e8e9739737f26c *NAMESPACE +0745f11856cdfa5b1d823a82f81458d0 *R/GA.R +b7f20ba296667b10673abe9d02bae18e *R/GEI.R +9620bb6a7e839acc0e36bd0e1f155002 *R/Main_SSD.R +78a6eab9c39cfc8bbeb6f99e1ab4f8fe *data/LGEWIS.example.rda +790cf5c85deb903d58e5639eca2421f8 *man/GA.SSD.All.Rd +3eedbf2cf1ab8bbb4ec5844c5b295557 *man/GA.SSD.OneSet_SetIndex.Rd +a05de32b89938c91c3b19da5981ba49d *man/GA.prelim.Rd +2cffc2c1b31f42dd378af21fb3814cd1 *man/GA.test.Rd +46a895ede225364ba59c67507c54f83c *man/GEI.SSD.All.Rd +b4bad0e2f5eda08b40a2141433b9920f *man/GEI.SSD.OneSet_SetIndex.Rd +fbe0940c306d475666b4adee37d679cd *man/GEI.prelim.Rd +230009204a7146e24e82cf945e19a8ca *man/GEI.test.Rd +f2e61050b2713fd4dc60b40180c0f1e0 *man/LGEWIS_example.Rd diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..e06437f --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,4 @@ +#exportPattern("^[[:alpha:]]+") +export(GA.prelim,GA.test,GA.SSD.OneSet_SetIndex,GA.SSD.All,GEI.prelim,GEI.test,GEI.SSD.OneSet_SetIndex,GEI.SSD.All) +import(CompQuadForm, SKAT, geeM, pls, splines) +importFrom("stats", "cor", "lm", "rbinom", "gaussian", "pchisq", "quantile", "sd", "var") diff --git a/R/GA.R b/R/GA.R new file mode 100644 index 0000000..d938cf9 --- /dev/null +++ b/R/GA.R @@ -0,0 +1,278 @@ + +######################### +# Hypothesis Testing +######################### + +#p.s: the current code is only for a single environmental exposure + +GA.prelim<-function(Y,time,X=NULL,corstr="exchangeable"){ + ##Preliminary + Y<-as.matrix(Y);N<-nrow(Y) + cluster.id<-unique(time[,1]);m<-length(cluster.id) + #sort the data by cluster.id + order.index<-order(match(time[,1],cluster.id)) + if (sum(time[,1]!=time[order.index,1])>0){ + msg<-sprintf("time was not sorted, now data is grouped by subject id") + warning(msg,call.=F) + Y<-as.matrix(Y[order.index,]) + if(length(X)!=0){X<-as.matrix(X[order.index])} + time<-time[order.index,] + } + + if(length(X)!=0){ + Z.A<-svd(as.matrix(X))$u + nullgee<-geem(Y~Z.A,family=gaussian,corstr=corstr,id=time[,1]) + }else{ + nullgee<-geem(Y~1,family=gaussian,corstr=corstr,id=time[,1]) + } + + #prepare the intermediate results + result.prelim<-list(Y=Y,time=time,X=X,cluster.id=cluster.id,m=m,corstr=corstr,Z.A=Z.A,nullgee=nullgee) + return(result.prelim) +} + +#G is an m*q matrix, each row coresponds to one subject. +GA.test<-function(result.prelim,G,Gsub.id=NULL,bootstrap=NULL,MinP.adjust=NULL,impute.method='fixed'){ + ## Load preliminary data + Y<-result.prelim$Y;corstr<-result.prelim$corstr + m<-result.prelim$m;X<-result.prelim$X + time<-result.prelim$time;cluster.id<-result.prelim$cluster.id + Z.A<-result.prelim$Z.A #Z.A0=svd(cbind(rep(1,N),X, M.E))$u + nullgee<-result.prelim$nullgee + + ## Deal with the genotype + SNP.list<-colnames(G) + # match the phenotype and genotype subject ID + if(length(Gsub.id)==0){Gsub.id<-cluster.id} + G<-as.matrix(G[match(cluster.id,Gsub.id),]) + + # missing genotype imputation + G[G==9]<-NA + N_MISS<-sum(is.na(G)) + if(N_MISS>0){ + msg<-sprintf("The missing genotype rate is %f. Imputation is applied.", N_MISS/nrow(G)/ncol(G)) + warning(msg,call.=F) + G<-Impute(G,impute.method) + } + # centered genotype + G<-as.matrix(G);center.G<-t(t(G)-colMeans(G)) + MAF<-colMeans(as.matrix(G[,colMeans(center.G^2)!=0]))/2;MAF[MAF>0.5]<-1-MAF[MAF>0.5] + G<-as.matrix(center.G[,colMeans(center.G^2)!=0]);n.G<-ncol(G) + SNP.name<-SNP.list[colMeans(center.G^2)!=0] + if(ncol(G) == 0){ + msg<-sprintf("G does not include any heterozygous variant") + stop(msg) + } + + # generate long form genotype + Z<-as.matrix(G[match(time[,1],cluster.id),]) + + if(length(bootstrap)==0){ + # calculate score + GA.fit<-GA.Score(nullgee,Y,time,Z.A,Z,corstr) #Z.A does not include the intercept + # calculate score statistics and their covariance matrix + score<-GA.fit$score;M<-GA.fit$M + # calculate p-value + TestStat<-t(score)%*%score + EigenM<-eigen(M,symmetric=TRUE,only.values=TRUE)$values + p.value<-davies(TestStat,EigenM)$Qq + + # implement a single SNP based analysis by GEE score tests + p.single<-cbind(MAF,as.vector(pchisq(as.vector(score)^2/diag(M),df=1,lower.tail=F))) + colnames(p.single)<-c('MAF','p.value') + rownames(p.single)<-SNP.name + }else{ + # calculate score + GA.fit<-GA.bootstrap(nullgee,Y,time,Z.A,Z,corstr,bootstrap) #Z.A does not include the intercept + score<-GA.fit$score;perturb.score<-GA.fit$perturb.score + Q<-t(score)%*%score + # calculate bootstrap based mean, variance and kurtosis + perturb.Q<-apply(perturb.score^2,2,sum) + perturb.mean<-mean(perturb.Q) + perturb.variance<-var(perturb.Q) + perturb.kurtosis<-mean((perturb.Q-perturb.mean)^4)/perturb.variance^2-3 + # calculate p.value + if (perturb.kurtosis>0){df<-12/perturb.kurtosis}else{ + df<-100000 + } + p.value<-1-pchisq((Q-perturb.mean)*sqrt(2*df)/sqrt(perturb.variance)+df,df) + # implement a single SNP based analysis by GEE score tests + perturb.variance.single<-apply(perturb.score,1,var) + p.single<-cbind(MAF,as.vector(pchisq(as.vector(score)^2/perturb.variance.single,df=1,lower.tail=F))) + colnames(p.single)<-c('MAF','p.value') + rownames(p.single)<-SNP.name + } + if(length(MinP.adjust)!=0){ + eigen.G<-eigen(cor(G),only.values=TRUE)$values + me.G<-effect.n(eigen.G,MinP.adjust) + p.MinP<-min(min(p.single[,2])*me.G,1) + }else{p.MinP<-NA} + + return(list(n.marker=n.G,p.single=p.single,p.MinP=p.MinP,p.value=p.value)) +} + +################################################# +# Calculate Score Statistics and their Variance +################################################# + +GA.bootstrap<-function(nullgee,Y,time,Z.A,Z.I,corstr,bootstrap) #Z.A does not include the intercept +{ + rho<-as.numeric(nullgee$alpha);N<-nrow(time);Z.A<-cbind(rep(1,N),Z.A) #add the intercept + mu<-colSums(nullgee$beta*t(Z.A));Y.res<-Y-mu + #rho<-nullgee$geese$alpha;mu<-nullgee$fitted.values;Y.res<-Y-mu + cluster.id<-unique(time[,1]);m<-length(cluster.id);dimI<-ncol(Z.I) + ##Generalized score test + score.array<-matrix(0,dimI,m) + n.total<-1;n.rep<-as.numeric(table(time[,1])) + for (i in 1:m) + { + ni<-n.rep[i] + index<-n.total:(n.total+ni-1) + n.total<-n.total + ni + #working covariance matrix + if (corstr=="exchangeable"){Vi<-diag(1-rho,ni)+ matrix(rho, ni, ni);Vi.inv<-solve(Vi)} + if (corstr=="ar1"){Vi<-rho^abs(outer(time[index,2], time[index,2],"-"));Vi.inv<-solve(Vi)} + if (corstr=="independence"){Vi<-diag(1,ni);Vi.inv<-Vi} + #if (link=="logit"){Ci<-mu[index]*(1-mu[index]);Vi.inv<-outer(sqrt(Ci),sqrt(Ci)^-1,"*")*Vi.inv} + #score matrices + Z.Ii<-Z.I[index,];if (ni==1) {Z.Ii<-t(Z.Ii)} + score.array[,i]<-t(Z.Ii)%*%(Vi.inv%*%Y.res[index])/sqrt(m) + } + score<-as.matrix(apply(score.array,1,sum)) + # null distribution + perturb.coef<-matrix(rbinom(m*bootstrap,1,0.5)*2-1,m,bootstrap) + perturb.score<-score.array%*%perturb.coef + + return(list(score=score,perturb.score=perturb.score))#Q: test statistic; M: covariance matrix +} + +GA.Score<-function(nullgee,Y,time,Z.A,Z.I,corstr) #Z.A does not include the intercept +{ + rho<-as.numeric(nullgee$alpha);N<-nrow(time);Z.A<-cbind(rep(1,N),Z.A) #add the intercept + mu<-colSums(nullgee$beta*t(Z.A));Y.res<-Y-mu + #rho<-nullgee$geese$alpha;mu<-nullgee$fitted.values;Y.res<-Y-mu + cluster.id<-unique(time[,1]);m<-length(cluster.id) + ##Generalized score test + dimY<-nrow(Y.res);dimA<-ncol(Z.A);dimI<-ncol(Z.I);score<-matrix(0,dimI,1) + An<-matrix(0,dimA+dimI,dimA+dimI);Bn<-matrix(0,dimA+dimI,dimA+dimI) + n.total<-1;n.rep<-as.numeric(table(time[,1])) + for (i in 1:m) + { + ni<-n.rep[i] + index<-n.total:(n.total+ni-1) + n.total<-n.total + ni + #working covariance matrix + if (corstr=="exchangeable"){Vi<-diag(1-rho,ni)+ matrix(rho, ni, ni);Vi.inv<-solve(Vi)} + if (corstr=="ar1"){Vi<-rho^abs(outer(time[index,2], time[index,2],"-"));Vi.inv<-solve(Vi)} + if (corstr=="independence"){Vi<-diag(1,ni);Vi.inv<-Vi} + #if (link=="logit"){Ci<-mu[index]*(1-mu[index]);Vi.inv<-outer(sqrt(Ci),sqrt(Ci)^-1,"*")*Vi.inv} + #score matrices + Z.Ai<-Z.A[index,];Z.Ii<-Z.I[index,] + if (ni==1) { Z.Ai<-t(Z.Ai); Z.Ii<-t(Z.Ii)} + Zi<-cbind(Z.Ii,Z.Ai) + #An<-An+t(Zi)%*%Vi.inv%*%Zi + An[(dimI+1):(dimI+dimA),(dimI+1):(dimI+dimA)]<-An[(dimI+1):(dimI+dimA),(dimI+1):(dimI+dimA)]+t(Z.Ai)%*%Vi.inv%*%Z.Ai + An[1:dimI, (dimI+1):(dimI+dimA)]<-An[1:dimI, (dimI+1):(dimI+dimA)]+t(Z.Ii)%*%(Vi.inv%*%Z.Ai) + Bntemp<-t(Zi)%*%(Vi.inv%*%Y.res[index]) + Bn<-Bn+Bntemp%*%t(Bntemp) + score<-score+Bntemp[1:dimI,]#t(Z.Ii)%*%Vi.inv%*%Y.res[index] + } + An<-An/m; Bn<-Bn/m + #Calculate p-value + A00<-An[(dimI+1):(dimI+dimA),(dimI+1):(dimI+dimA)];A00.inv<-solve(A00);A10<-An[1:dimI, (dimI+1):(dimI+dimA)] + B11<-Bn[1:dimI,1:dimI];B10<-Bn[1:dimI,(dimI+1):(dimI+dimA)];B00<-Bn[(dimI+1):(dimI+dimA),(dimI+1):(dimI+dimA)] + P<-A10%*%A00.inv;M<-B11-t(B10%*%t(P))-B10%*%t(P)+P%*%B00%*%t(P) + #Revise the covariance matrix using m-q + M<-M*m/(m-dimA) + #Revise the score using standard representation + score<-score/sqrt(m) + return(list(score=score,M=M))#Q: test statistic; M: covariance matrix +} + +# calculate number of independent tests +effect.n<-function(x,MinP.adjust){ + temp<-0;sum.EV<-sum(x) #summation of eigen values + for (i in 1:length(x)){ + temp<-temp+x[i];if (temp>sum.EV*MinP.adjust){break} + } + return(i) +} + +#Data management +Minor.allele<-function(x){if(mean(x)>1){x<-2-x};return(x)} +Standardize<-function(x){return((x-mean(x))/sd(x))} +Center<-function(x){return(x-mean(x))} +Variation<-function(x){x<-apply(x,2,Minor.allele);x<-x[,which(apply(x,2,sd)>0)];return(x)} +Common<-function(x){x<-apply(x,2,Minor.allele);freq<-apply(x,2,mean)/2;x<-x[,which(freq>0.05)];return(x)} +##Matrix calculation +#Get sqrt.root of a matrix +Get.sqrt<-function(A){ + a.eig <- eigen(A,symmetric=TRUE) + ID1<-which(a.eig$values > 0) + if(length(ID1)== 0){stop("Error to obtain matrix square!")} + a.sqrt <- a.eig$vectors[,ID1] %*% diag(sqrt(a.eig$values[ID1])) %*% t(a.eig$vectors[,ID1]) + return(a.sqrt) +} +#Get inverse of a matrix; numerically robust +Get.inverse<-function(A){ + a.eig <- eigen(A,symmetric=TRUE) + ID1<-which(a.eig$values > 0) + if(length(ID1)== 0){stop("Error to obtain matrix inverse!")} + a.inverse <- a.eig$vectors[,ID1] %*% diag(a.eig$values[ID1]^-1) %*% t(a.eig$vectors[,ID1]) + return(a.inverse) +} +#Get -1/2 of a matrix; numerically robust +Get.inv.sqrt<-function(A){ + a.eig <- eigen(A,symmetric=TRUE) + ID1<-which(a.eig$values > 0) + if(length(ID1)== 0){stop("Error to obtain matrix square!")} + a.sqrt <- a.eig$vectors[,ID1] %*% diag(sqrt(a.eig$values[ID1])^-2) %*% t(a.eig$vectors[,ID1]) + return(a.sqrt) +} +#Time similarity +Check.same<-function(x,y){return(as.numeric(x==y))} +Check.near<-function(x,y){return(as.numeric(abs(x-y)==1))} +D.matrix<-function(X){ + n<-length(X[,1]);temp<-matrix(0,n,n) + #checking time + temp<-outer(X[,1],X[,1],Check.same)*outer(X[,2],X[,2],Check.near) + diag(temp)<-0 + return(temp) +} +# Simple Imputation (from SKAT package) +# Z : an m x p genotype matrix with m samples and p SNPs + +Impute<-function(Z, impute.method){ + p<-dim(Z)[2] + if(impute.method =="random"){ + for(i in 1:p){ + IDX<-which(is.na(Z[,i])) + if(length(IDX) > 0){ + maf1<-mean(Z[-IDX,i])/2 + Z[IDX,i]<-rbinom(length(IDX),2,maf1) + } + } + } else if(impute.method =="fixed"){ + for(i in 1:p){ + IDX<-which(is.na(Z[,i])) + if(length(IDX) > 0){ + maf1<-mean(Z[-IDX,i])/2 + Z[IDX,i]<-2 * maf1 + } + } + } else if(impute.method =="bestguess") { + for(i in 1:p){ + IDX<-which(is.na(Z[,i])) + if(length(IDX) > 0){ + maf1<-mean(Z[-IDX,i])/2 + Z[IDX,i]<-round(2 * maf1) + } + } + } else { + stop("Error: Imputation method shoud be \"fixed\", \"random\" or \"bestguess\" ") + } + return(as.matrix(Z)) +} + + + diff --git a/R/GEI.R b/R/GEI.R new file mode 100644 index 0000000..678c189 --- /dev/null +++ b/R/GEI.R @@ -0,0 +1,384 @@ + +######################### +# Hypothesis Testing +######################### + +#p.s: the current code is only for a single environmental exposure +GEI.prelim<-function(Y,time,E,X=NULL,E.method='ns',E.df=floor(sqrt(length(unique(time[,1])))),corstr="exchangeable"){ + ##Preliminary + Y<-as.matrix(Y);E<-as.matrix(E);N<-nrow(Y) + cluster.id<-unique(time[,1]);m<-length(cluster.id) + #sort the data by cluster.id + order.index<-order(match(time[,1],cluster.id)) + if (sum(time[,1]!=time[order.index,1])>0){ + msg<-sprintf("time was not sorted, now data is grouped by subject id") + warning(msg,call.=F) + Y<-as.matrix(Y[order.index,]);E<-as.matrix(E[order.index,]) + if(length(X)!=0){X<-as.matrix(X[order.index])} + time<-time[order.index,] + } + + ##Generate polynomial/cubic basis functions + #use quantiles for knots: + knots<-quantile(E,probs=seq(0,1,length=E.df+1));knots<-unique(knots[!knots%in%knots[c(1,E.df+1)]]) + if(E.df!=1){B.E<-NULL; + if(E.method=='ns'){M.E<-ns(E,knots=knots)} + if(E.method=='bs'){M.E<-bs(E,knots=knots)} + if(E.method=='ps'){M.E<-ps(E,df=E.df)} + }else{M.E<-E} + M.E<-as.matrix(M.E) + #standrdize enviromental exposure + if(E.df!=1){M.E<-M.E[,which(apply(M.E,2,sd)>0)]} + #fit the gee adjusting for X and E, the residuals will be used for selecting the G components + #Z.A0<-svd(cbind(rep(1,N),X, M.E))$u + #nullgee<-geeglm(Y~0+Z.A0,family=gaussian,corstr=corstr,id=time[,1]) + Z.A0<-svd(cbind(X, M.E))$u + nullgee<-geem(Y~Z.A0,family=gaussian,corstr=corstr,id=time[,1]) + mu<-colSums(nullgee$beta*t(cbind(rep(1,N),Z.A0)));Y.res0<-Y-mu + + #prepare the intermediate results + result.prelim<-list(Y=Y,time=time,X=X,E=E,M.E=M.E,cluster.id=cluster.id,m=m,corstr=corstr,Y.res0=Y.res0,Z.A0=Z.A0) + return(result.prelim) +} + +#G is an m*q matrix, each row coresponds to one subject. +GEI.test<-function(result.prelim,G,Gsub.id=NULL,G.method='wPCA',G.df=floor(sqrt(nrow(G))),bootstrap=NULL,MinP.adjust=NULL,impute.method='fixed'){ + ## Load preliminary data + Y<-result.prelim$Y;corstr<-result.prelim$corstr + m<-result.prelim$m;X<-result.prelim$X + time<-result.prelim$time;cluster.id<-result.prelim$cluster.id + E<-result.prelim$E;M.E<-result.prelim$M.E + Y.res0<-result.prelim$Y.res0 #residuals by regressing Y on cbind(rep(1,N),X, M.E) + Z.A0<-result.prelim$Z.A0 #Z.A0=svd(cbind(rep(1,N),X, M.E))$u + + ## Deal with the genotype + SNP.list<-colnames(G) + # match the phenotype and genotype subject ID + if(length(Gsub.id)==0){Gsub.id<-cluster.id} + G<-as.matrix(G[match(cluster.id,Gsub.id),]) + + # missing genotype imputation + G[G==9]<-NA + N_MISS<-sum(is.na(G)) + if(N_MISS>0){ + msg<-sprintf("The missing genotype rate is %f. Imputation is applied.", N_MISS/nrow(G)/ncol(G)) + warning(msg,call.=F) + G<-Impute(G,impute.method) + } + # centered genotype + G<-as.matrix(G);center.G<-t(t(G)-colMeans(G)) + MAF<-colMeans(as.matrix(G[,colMeans(center.G^2)!=0]))/2;MAF[MAF>0.5]<-1-MAF[MAF>0.5] + G<-as.matrix(center.G[,colMeans(center.G^2)!=0]);n.G<-ncol(G) + SNP.name<-SNP.list[colMeans(center.G^2)!=0] + if(ncol(G) == 0){ + msg<-sprintf("G does not include any heterozygous variant") + stop(msg) + } + + # generate long form genotype + Z<-as.matrix(G[match(time[,1],cluster.id),]) + + # find adjusted G components + G.df<-min(G.df,ncol(G)) + if (G.method=='PCA'){G.adj<-GEI.PCA(time,G,G.df)} + if (G.method=='wPCA'){G.adj<-GEI.wPCA(Y.res0,time,G,G.df)} + if (G.method=='PLS'){G.adj<-GEI.PLS(Y.res0,time,G,G.df)} + if (G.method=='R2'){G.adj<-GEI.R2(Y.res0,time,G,G.df)} + + ## Fit null model using GEE + #Z.A<-cbind(Z.A0,G.adj) + #nullgee<-geeglm(Y~0+Z.A,family=gaussian,corstr=corstr,id=time[,1]) + #rho<-nullgee$geese$alpha + Z.A<-cbind(Z.A0,G.adj) #Z.A does not include the intercept for geeM + nullgee<-try(geem(Y~Z.A,family=gaussian,corstr=corstr,id=time[,1]),silent = TRUE) + if(class(nullgee) == "try-error"){ + msg<-sprintf("function geem() in package geeM cannot be fitted due to a numerical error") + stop(msg) + } + + rho<-as.numeric(nullgee$alpha) + ## Generate interaction variables + if (corstr=="ar1"){ + n.total<-1;n.rep<-as.numeric(table(time[,1])) + c.E<-NULL;for (i in m){ + ni<-n.rep[i] + index<-n.total:(n.total+ni-1) + n.total<-n.total + ni + Vi<-rho^abs(outer(time[index,2], time[index,2],"-"));Vi.inv<-solve(Vi) + c.E<-c(c.E,rowSums(Vi.inv)) + } + Z.I<-as.vector((E-sum(E*c.E)/sum(c.E)))*Z + }else{Z.I<-as.vector(Standardize(E))*Z} + + if(length(bootstrap)==0){ + # calculate score + GEI.fit<-GEI.Score(nullgee,Y,time,Z.A,Z.I,corstr) #Z.A does not include the intercept + # calculate score statistics and their covariance matrix + score<-GEI.fit$score;M<-GEI.fit$M + # calculate p-value + TestStat<-t(score)%*%score + EigenM<-eigen(M,symmetric=TRUE,only.values=TRUE)$values + p.value<-davies(TestStat,EigenM)$Qq + + # implement a single SNP based analysis by GEE score tests + p.single<-cbind(MAF,as.vector(pchisq(as.vector(score)^2/diag(M),df=1,lower.tail=F))) + colnames(p.single)<-c('MAF','p.value') + rownames(p.single)<-SNP.name + }else{ + # calculate score + GEI.fit<-GEI.bootstrap(nullgee,Y,time,Z.A,Z.I,corstr,bootstrap) #Z.A does not include the intercept + score<-GEI.fit$score;perturb.score<-GEI.fit$perturb.score + Q<-t(score)%*%score + # calculate bootstrap based mean, variance and kurtosis + perturb.Q<-apply(perturb.score^2,2,sum) + perturb.mean<-mean(perturb.Q) + perturb.variance<-var(perturb.Q) + perturb.kurtosis<-mean((perturb.Q-perturb.mean)^4)/perturb.variance^2-3 + # calculate p.value + if (perturb.kurtosis>0){df<-12/perturb.kurtosis}else{ + df<-100000 + } + p.value<-1-pchisq((Q-perturb.mean)*sqrt(2*df)/sqrt(perturb.variance)+df,df) + # implement a single SNP based analysis by GEE score tests + perturb.variance.single<-apply(perturb.score,1,var) + p.single<-cbind(MAF,as.vector(pchisq(as.vector(score)^2/perturb.variance.single,df=1,lower.tail=F))) + colnames(p.single)<-c('MAF','p.value') + rownames(p.single)<-SNP.name + } + + if(length(MinP.adjust)!=0){ + eigen.G<-eigen(cor(G),only.values=TRUE)$values + me.G<-effect.n(eigen.G,MinP.adjust) + p.MinP<-min(min(p.single[,2])*me.G,1) + }else{p.MinP<-NA} + + return(list(n.marker=n.G,E.df=ncol(M.E),G.df=G.df,p.single=p.single,p.MinP=p.MinP,p.value=p.value)) +} + +################################################# +# Calculate Score Statistics and their Variance +################################################# + + +GEI.bootstrap<-function(nullgee,Y,time,Z.A,Z.I,corstr,bootstrap) #Z.A does not include the intercept +{ + rho<-as.numeric(nullgee$alpha);N<-nrow(time);Z.A<-cbind(rep(1,N),Z.A) #add the intercept + mu<-colSums(nullgee$beta*t(Z.A));Y.res<-Y-mu + #rho<-nullgee$geese$alpha;mu<-nullgee$fitted.values;Y.res<-Y-mu + cluster.id<-unique(time[,1]);m<-length(cluster.id);dimI<-ncol(Z.I) + ##Generalized score test + score.array<-matrix(0,dimI,m) + n.total<-1;n.rep<-as.numeric(table(time[,1])) + for (i in 1:m) + { + ni<-n.rep[i] + index<-n.total:(n.total+ni-1) + n.total<-n.total + ni + #working covariance matrix + if (corstr=="exchangeable"){Vi<-diag(1-rho,ni)+ matrix(rho, ni, ni);Vi.inv<-solve(Vi)} + if (corstr=="ar1"){Vi<-rho^abs(outer(time[index,2], time[index,2],"-"));Vi.inv<-solve(Vi)} + if (corstr=="independence"){Vi<-diag(1,ni);Vi.inv<-Vi} + #if (link=="logit"){Ci<-mu[index]*(1-mu[index]);Vi.inv<-outer(sqrt(Ci),sqrt(Ci)^-1,"*")*Vi.inv} + #score matrices + Z.Ii<-Z.I[index,];if (ni==1) {Z.Ii<-t(Z.Ii)} + score.array[,i]<-t(Z.Ii)%*%(Vi.inv%*%Y.res[index])/sqrt(m) + } + score<-as.matrix(apply(score.array,1,sum)) + # null distribution + perturb.coef<-matrix(rbinom(m*bootstrap,1,0.5)*2-1,m,bootstrap) + perturb.score<-score.array%*%perturb.coef + + return(list(score=score,perturb.score=perturb.score))#Q: test statistic; M: covariance matrix +} + + +GEI.Score<-function(nullgee,Y,time,Z.A,Z.I,corstr) #Z.A does not include the intercept +{ + rho<-as.numeric(nullgee$alpha);N<-nrow(time);Z.A<-cbind(rep(1,N),Z.A) #add the intercept + mu<-colSums(nullgee$beta*t(Z.A));Y.res<-Y-mu + #rho<-nullgee$geese$alpha;mu<-nullgee$fitted.values;Y.res<-Y-mu + cluster.id<-unique(time[,1]);m<-length(cluster.id) + ##Generalized score test + dimY<-nrow(Y.res);dimA<-ncol(Z.A);dimI<-ncol(Z.I);score<-matrix(0,dimI,1) + An<-matrix(0,dimA+dimI,dimA+dimI);Bn<-matrix(0,dimA+dimI,dimA+dimI) + n.total<-1;n.rep<-as.numeric(table(time[,1])) + for (i in 1:m) + { + ni<-n.rep[i] + index<-n.total:(n.total+ni-1) + n.total<-n.total + ni + #working covariance matrix + if (corstr=="exchangeable"){Vi<-diag(1-rho,ni)+ matrix(rho, ni, ni);Vi.inv<-solve(Vi)} + if (corstr=="ar1"){Vi<-rho^abs(outer(time[index,2], time[index,2],"-"));Vi.inv<-solve(Vi)} + if (corstr=="independence"){Vi<-diag(1,ni);Vi.inv<-Vi} + #if (link=="logit"){Ci<-mu[index]*(1-mu[index]);Vi.inv<-outer(sqrt(Ci),sqrt(Ci)^-1,"*")*Vi.inv} + #score matrices + Z.Ai<-Z.A[index,];Z.Ii<-Z.I[index,] + if (ni==1) { Z.Ai<-t(Z.Ai); Z.Ii<-t(Z.Ii)} + Zi<-cbind(Z.Ii,Z.Ai) + #An<-An+t(Zi)%*%Vi.inv%*%Zi + An[(dimI+1):(dimI+dimA),(dimI+1):(dimI+dimA)]<-An[(dimI+1):(dimI+dimA),(dimI+1):(dimI+dimA)]+t(Z.Ai)%*%Vi.inv%*%Z.Ai + An[1:dimI, (dimI+1):(dimI+dimA)]<-An[1:dimI, (dimI+1):(dimI+dimA)]+t(Z.Ii)%*%(Vi.inv%*%Z.Ai) + Bntemp<-t(Zi)%*%(Vi.inv%*%Y.res[index]) + Bn<-Bn+Bntemp%*%t(Bntemp) + score<-score+Bntemp[1:dimI,]#t(Z.Ii)%*%Vi.inv%*%Y.res[index] + } + An<-An/m; Bn<-Bn/m + #Calculate p-value + A00<-An[(dimI+1):(dimI+dimA),(dimI+1):(dimI+dimA)];A00.inv<-solve(A00);A10<-An[1:dimI, (dimI+1):(dimI+dimA)] + B11<-Bn[1:dimI,1:dimI];B10<-Bn[1:dimI,(dimI+1):(dimI+dimA)];B00<-Bn[(dimI+1):(dimI+dimA),(dimI+1):(dimI+dimA)] + P<-A10%*%A00.inv;M<-B11-t(B10%*%t(P))-B10%*%t(P)+P%*%B00%*%t(P) + #Revise the covariance matrix using m-q + M<-M*m/(m-dimA) + #Revise the score using standard representation + score<-score/sqrt(m) + return(list(score=score,M=M))#Q: test statistic; M: covariance matrix +} + +############################################ +# Type-I Error Control: adjustment of G +############################################ + +GEI.PLS<-function(outcome,time,Z,ncomp=5)#note: long form G is the imput +{ + Iter<-TRUE; + if (ncomp==ncol(Z)){index<-rep(1,ncol(Z));G.adj<-Z;Iter=FALSE} + if (ncomp==0){G.adj<-NULL;Iter=FALSE} + if (Iter==TRUE){ #for cases that ncomp is a number and Iter is still true + PLS.fit<-plsr(outcome~Z,ncomp=ncomp,validation='none') + PLS.scores<-PLS.fit$scores + G.adj<-PLS.fit$scores[,1:ncomp] + } + return(G.adj) +} + +GEI.wPCA<-function(outcome,time,G,ncomp=5) +{ + cluster.id<-unique(time[,1]);n.G<-ncol(G) + svd.G<-svd(G);svd.Z<-as.matrix(svd.G$u[match(time[,1],cluster.id),]) + #Calculate weighted eigen-values + adj.score<-(svd.G$d)^2*(t(svd.Z)%*%outcome)^2;# weighted + Iter<-TRUE;index<-rep(0,n.G) + if (ncomp==ncol(svd.Z)){index<-rep(1,n.G);G.adj<-svd.Z;Iter=FALSE} + if (ncomp==0){G.adj<-NULL;Iter=FALSE} + if (Iter==TRUE){G.adj<-svd.Z[,order(adj.score,decreasing=T)][,1:ncomp]} + return(G.adj) +} + +GEI.R2<-function(outcome,time,G,ncomp=5) +{ + cluster.id<-unique(time[,1]);n.G<-ncol(G) + svd.G<-svd(G);svd.Z<-as.matrix(svd.G$u[match(time[,1],cluster.id),]) + #Calculate R2s + adj.score<-(t(svd.Z)%*%outcome)^2;# weighted + Iter<-TRUE;index<-rep(0,n.G) + if (ncomp==ncol(svd.Z)){index<-rep(1,n.G);G.adj<-svd.Z;Iter=FALSE} + if (ncomp==0){G.adj<-NULL;Iter=FALSE} + if (Iter==TRUE){G.adj<-svd.Z[,order(adj.score,decreasing=T)[1:ncomp]]} + return(G.adj) +} + +GEI.PCA<-function(time,G,ncomp=5) +{ + cluster.id<-unique(time[,1]);n.G<-ncol(G) + svd.G<-svd(G);svd.Z<-as.matrix(svd.G$u[match(time[,1],cluster.id),]) + Iter<-TRUE + if (ncomp==ncol(svd.Z)){index<-rep(1,n.G);G.adj<-svd.Z;Iter=FALSE} + if (ncomp==0){G.adj<-NULL;Iter=FALSE} + if (Iter==TRUE){G.adj<-svd.Z[,1:ncomp]} + return(G.adj) +} + +# calculate number of independent tests +effect.n<-function(x,MinP.adjust){ + temp<-0;sum.EV<-sum(x) #summation of eigen values + for (i in 1:length(x)){ + temp<-temp+x[i];if (temp>sum.EV*MinP.adjust){break} + } + return(i) +} + +#Data management +Minor.allele<-function(x){if(mean(x)>1){x<-2-x};return(x)} +Standardize<-function(x){return((x-mean(x))/sd(x))} +Center<-function(x){return(x-mean(x))} +Variation<-function(x){x<-apply(x,2,Minor.allele);x<-x[,which(apply(x,2,sd)>0)];return(x)} +Common<-function(x){x<-apply(x,2,Minor.allele);freq<-apply(x,2,mean)/2;x<-x[,which(freq>0.05)];return(x)} +##Matrix calculation +#Get sqrt.root of a matrix +Get.sqrt<-function(A){ + a.eig <- eigen(A,symmetric=TRUE) + ID1<-which(a.eig$values > 0) + if(length(ID1)== 0){stop("Error to obtain matrix square!")} + a.sqrt <- a.eig$vectors[,ID1] %*% diag(sqrt(a.eig$values[ID1])) %*% t(a.eig$vectors[,ID1]) + return(a.sqrt) +} +#Get inverse of a matrix; numerically robust +Get.inverse<-function(A){ + a.eig <- eigen(A,symmetric=TRUE) + ID1<-which(a.eig$values > 0) + if(length(ID1)== 0){stop("Error to obtain matrix inverse!")} + a.inverse <- a.eig$vectors[,ID1] %*% diag(a.eig$values[ID1]^-1) %*% t(a.eig$vectors[,ID1]) + return(a.inverse) +} +#Get -1/2 of a matrix; numerically robust +Get.inv.sqrt<-function(A){ + a.eig <- eigen(A,symmetric=TRUE) + ID1<-which(a.eig$values > 0) + if(length(ID1)== 0){stop("Error to obtain matrix square!")} + a.sqrt <- a.eig$vectors[,ID1] %*% diag(sqrt(a.eig$values[ID1])^-2) %*% t(a.eig$vectors[,ID1]) + return(a.sqrt) +} +#Time similarity +Check.same<-function(x,y){return(as.numeric(x==y))} +Check.near<-function(x,y){return(as.numeric(abs(x-y)==1))} +D.matrix<-function(X){ + n<-length(X[,1]);temp<-matrix(0,n,n) + #checking time + temp<-outer(X[,1],X[,1],Check.same)*outer(X[,2],X[,2],Check.near) + diag(temp)<-0 + return(temp) +} +ps<-function(x,df) #generate polynomial sieves +{ + temp<-NULL + for (i in 1:df){temp<-cbind(temp,x^i)} + return(temp) +} +# Simple Imputation (from SKAT package) +# Z : an m x p genotype matrix with m samples and p SNPs + +Impute<-function(Z, impute.method){ + p<-dim(Z)[2] + if(impute.method =="random"){ + for(i in 1:p){ + IDX<-which(is.na(Z[,i])) + if(length(IDX) > 0){ + maf1<-mean(Z[-IDX,i])/2 + Z[IDX,i]<-rbinom(length(IDX),2,maf1) + } + } + } else if(impute.method =="fixed"){ + for(i in 1:p){ + IDX<-which(is.na(Z[,i])) + if(length(IDX) > 0){ + maf1<-mean(Z[-IDX,i])/2 + Z[IDX,i]<-2 * maf1 + } + } + } else if(impute.method =="bestguess") { + for(i in 1:p){ + IDX<-which(is.na(Z[,i])) + if(length(IDX) > 0){ + maf1<-mean(Z[-IDX,i])/2 + Z[IDX,i]<-round(2 * maf1) + } + } + } else { + stop("Error: Imputation method shoud be \"fixed\", \"random\" or \"bestguess\" ") + } + return(as.matrix(Z)) +} + + + diff --git a/R/Main_SSD.R b/R/Main_SSD.R new file mode 100755 index 0000000..8236215 --- /dev/null +++ b/R/Main_SSD.R @@ -0,0 +1,156 @@ + +# Some functions used in this file are from the SKAT package + +## Marginal genetic association + +GA.SSD.OneSet_SetIndex = function(SSD.INFO, SetIndex, result.prelim, Gsub.id=NULL, MinP.adjust=NULL, ...){ + + id1<-which(SSD.INFO$SetInfo$SetIndex == SetIndex) + if(length(id1) == 0){ + MSG<-sprintf("Error: cannot find set index [%d] from SSD!", SetIndex) + stop(MSG) + } + SetID<-SSD.INFO$SetInfo$SetID[id1] + + try1<-try(Get_Genotypes_SSD(SSD.INFO, SetIndex, is_ID=T),silent = TRUE) + if(class(try1) != "try-error"){ + G<-try1 + Is.Error<-FALSE + } else { + err.msg<-geterrmessage() + msg<-sprintf("Error to get genotypes of %s: %s",SetID, err.msg) + stop(msg) + } + re<-GA.test(result.prelim, G, Gsub.id=Gsub.id, MinP.adjust=MinP.adjust, ...) + + return(re) +} + +#LGRF.SSD.All function +GA.SSD.All = function(SSD.INFO, result.prelim, Gsub.id=NULL, MinP.adjust=NULL, ...){ + N.Set<-SSD.INFO$nSets + OUT.Pvalue<-rep(NA,N.Set) + OUT.Pvalue.MinP<-rep(NA,N.Set) + OUT.Marker<-rep(NA,N.Set) + #OUT.Marker.Test<-rep(NA,N.Set) + OUT.Error<-rep(-1,N.Set) + OUT.single<-c() + + for(i in 1:N.Set){ + if(i%%100==0){print(paste0(i," sets finished"))} + Is.Error<-TRUE + try1 = try(GA.SSD.OneSet_SetIndex(SSD.INFO=SSD.INFO, SetIndex=i, result.prelim=result.prelim, Gsub.id=Gsub.id, MinP.adjust=MinP.adjust, ...)) + + if(class(try1) != "try-error"){ + re<-try1; + Is.Error<-FALSE + } else { + + err.msg<-geterrmessage() + msg<-sprintf("Error to run GA for %s: %s",SSD.INFO$SetInfo$SetID[i], err.msg) + warning(msg,call.=FALSE) + + } + print(paste(i,"-th set",SSD.INFO$SetInfo[i,2],",",SSD.INFO$SetInfo[i,3],"SNPs")) + + if(!Is.Error){ + + OUT.Pvalue[i]<-re$p.value + OUT.Pvalue.MinP[i]<-re$p.MinP + OUT.Marker[i]<-re$n.marker + temp.single<-cbind(rep(SSD.INFO$SetInfo[i,2],nrow(re$p.single)),rownames(re$p.single),re$p.single) + rownames(temp.single)<-NULL + OUT.single<-rbind(OUT.single,temp.single) + + #OUT.Marker.Test[i]<-re$param$n.marker.test + } + } + + out.tbl<-data.frame(SetID=SSD.INFO$SetInfo$SetID, P.value=OUT.Pvalue, N.Marker=OUT.Marker) + out.tbl.single<-data.frame(OUT.single);colnames(out.tbl.single)<-c('Region.name','SNP.name','MAF','p.value') + + if(length(MinP.adjust)!=0){out.tbl<-data.frame(SetID=SSD.INFO$SetInfo$SetID, P.value=OUT.Pvalue,P.value_MinP=OUT.Pvalue.MinP, N.Marker=OUT.Marker)} + re<-list(results.single=out.tbl.single,results=out.tbl) + class(re)<-"GA_SSD_ALL" + + return(re) +} + +## Gene-environement intereaction + +GEI.SSD.OneSet_SetIndex = function(SSD.INFO, SetIndex, result.prelim, Gsub.id=NULL, MinP.adjust=NULL, ...){ + + id1<-which(SSD.INFO$SetInfo$SetIndex == SetIndex) + if(length(id1) == 0){ + MSG<-sprintf("Error: cannot find set index [%d] from SSD!", SetIndex) + stop(MSG) + } + SetID<-SSD.INFO$SetInfo$SetID[id1] + + try1<-try(Get_Genotypes_SSD(SSD.INFO, SetIndex, is_ID=T),silent = TRUE) + if(class(try1) != "try-error"){ + G<-try1 + Is.Error<-FALSE + } else { + err.msg<-geterrmessage() + msg<-sprintf("Error to get genotypes of %s: %s",SetID, err.msg) + stop(msg) + } + re<-GEI.test(result.prelim, G, Gsub.id=Gsub.id, MinP.adjust=MinP.adjust, ...) + + return(re) +} + +#GEI.SSD.All function +GEI.SSD.All = function(SSD.INFO, result.prelim, Gsub.id=NULL, MinP.adjust=NULL, ...){ + N.Set<-SSD.INFO$nSets + OUT.Pvalue<-rep(NA,N.Set) + OUT.Pvalue.MinP<-rep(NA,N.Set) + OUT.Marker<-rep(NA,N.Set) + #OUT.Marker.Test<-rep(NA,N.Set) + OUT.Error<-rep(-1,N.Set) + OUT.single<-c() + + for(i in 1:N.Set){ + if(i%%100==0){print(paste0(i," sets finished"))} + Is.Error<-TRUE + #try1 = try(GEI.SSD.OneSet_SetIndex(SSD.INFO=SSD.INFO, SetIndex=i, result.prelim=result.prelim, Gsub.id=Gsub.id, MinP.adjust=MinP.adjust, ...)) + try1 = try(GEI.SSD.OneSet_SetIndex(SSD.INFO=SSD.INFO, SetIndex=i, result.prelim=result.prelim, Gsub.id=Gsub.id, MinP.adjust=MinP.adjust, ...)) + + if(class(try1) != "try-error"){ + re<-try1; + Is.Error<-FALSE + } else { + + err.msg<-geterrmessage() + msg<-sprintf("Error to run GEI for %s: %s",SSD.INFO$SetInfo$SetID[i], err.msg) + warning(msg,call.=FALSE) + + } + print(paste(i,"-th set",SSD.INFO$SetInfo[i,2],",",SSD.INFO$SetInfo[i,3],"SNPs")) + + if(!Is.Error){ + + OUT.Pvalue[i]<-re$p.value + OUT.Pvalue.MinP[i]<-re$p.MinP + OUT.Marker[i]<-re$n.marker + temp.single<-cbind(rep(SSD.INFO$SetInfo[i,2],nrow(re$p.single)),rownames(re$p.single),re$p.single) + rownames(temp.single)<-NULL + OUT.single<-rbind(OUT.single,temp.single) + + #OUT.Marker.Test[i]<-re$param$n.marker.test + } + } + + out.tbl<-data.frame(SetID=SSD.INFO$SetInfo$SetID, P.value=OUT.Pvalue, N.Marker=OUT.Marker) + out.tbl.single<-data.frame(OUT.single);colnames(out.tbl.single)<-c('Region.name','SNP.name','MAF','p.value') + + if(length(MinP.adjust)!=0){out.tbl<-data.frame(SetID=SSD.INFO$SetInfo$SetID, P.value=OUT.Pvalue,P.value_MinP=OUT.Pvalue.MinP, N.Marker=OUT.Marker)} + re<-list(results.single=out.tbl.single,results=out.tbl) + class(re)<-"GEI_SSD_ALL" + + return(re) +} + + + diff --git a/data/LGEWIS.example.rda b/data/LGEWIS.example.rda new file mode 100644 index 0000000..659c538 Binary files /dev/null and b/data/LGEWIS.example.rda differ diff --git a/man/GA.SSD.All.Rd b/man/GA.SSD.All.Rd new file mode 100755 index 0000000..2516317 --- /dev/null +++ b/man/GA.SSD.All.Rd @@ -0,0 +1,131 @@ +\name{GA.SSD.All} +\alias{GA.SSD.All} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Genetic association tests for multiple regions/genes using SSD format files +} +\description{ +Test the association between an quantitative outcome and multiple region/genes using SSD format files. +} +\usage{ +GA.SSD.All(SSD.INFO, result.prelim, Gsub.id=NULL, MinP.adjust=NULL, ...) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{SSD.INFO}{ +SSD format information file, output of function ``Open_SSD". The sets are defined by this file. +} + \item{result.prelim}{ +Output of function "GA.prelim()". +} + \item{Gsub.id}{ +The subject id corresponding to the genotype matrix, an m dimensional vector. This is in order to match the phenotype and genotype matrix. The default is NULL, where the order is assumed to be matched with Y, X, E and time. +} + \item{MinP.adjust}{ +If the users would like to compare with the MinP test, this parameter specify the adjustment thereshold as in Gao, et al. (2008) "A multiple testing correction method for genetic association studies using correlated single nucleotide polymorphisms". Values from 0 to 1 are permitted. The default is NULL, i.e., no comparison. The value suggested by Gao, et al. (2008) is 0.95. +} + \item{...}{ +Other options of the generalized score type test. Defined same as in function "GA.test()". +} +} +%\details{ +%% ~~ If necessary, more details than the description above ~~ +%} +\value{ + \item{results}{ +Results of the set based analysis. First column contains the set ID; Second column (second and third columns when the MinP test is compared) contains the p-values; Last column contains the number of tested SNPs. +} + \item{results.single}{ +Results of the single variant analysis for all variants in the sets. First column contains the regions' names; Second column is the variants' names; Third column contains the minor allele frequencies; Last column contains the p.values. +} +} +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... +%} +%\references{ +%% ~put references to the literature/web site here ~ +%} +%\author{ +%% ~~who you are~~ +%} +%\note{ +%% ~~further notes~~ +%} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +%\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +%} +\examples{ + +# * Since the Plink data files used here are hard to be included in a R package, +# The usage is marked by "#" to pass the package check. + +# library(LGEWIS) + +############################################## + +# Plink data files: File.Bed, File.Bim, File.Fam +# Files defining the sets: File.SetID, File.SSD, File.Info +# For longitudinal data, outcome and covariates are saved in a separate file: Y, time, X, E. +# Preliminary work was done using function null.LGRF. + +# Create the MW File +# File.Bed<-"./example.bed" +# File.Bim<-"./example.bim" +# File.Fam<-"./example.fam" +# File.SetID<-"./example.SetID" +# File.SSD<-"./example.SSD" +# File.Info<-"./example.SSD.info" + +# Generate SSD file +# To use binary ped files, you have to generate SSD file first. +# If you already have a SSD file, you do not need to call this function. +# Generate_SSD_SetID(File.Bed, File.Bim, File.Fam, File.SetID, File.SSD, File.Info) + +# SSD.INFO<-Open_SSD(File.SSD, File.Info) +# Number of samples +# SSD.INFO$nSample +# Number of Sets +# SSD.INFO$nSets + +## Fit the null model +# Y: outcomes, n by 1 matrix where n is the total number of observations +# X: covariates, n by p matrix +# time: describe longitudinal structure, n by 2 matrix +# result.prelim<-GA.prelim(Y,time,X=X) + +## Test all regions +# out_all<-GA.SSD.All(SSD.INFO, result.prelim, MinP.adjust=0.95) + +# Example result +# out.all$results +# SetID P.value P.value_MinP N.Marker +# 1 GENE_01 0.91592151 1.00000000 94 +# 2 GENE_02 0.31103681 0.06451609 84 +# 3 GENE_03 0.05976685 0.49923701 108 +# 4 GENE_04 0.09389408 1.00000000 101 +# 5 GENE_05 0.10339403 0.67139314 103 +# 6 GENE_06 0.94666614 1.00000000 94 +# 7 GENE_07 0.47955756 1.00000000 104 +# 8 GENE_08 0.99415148 1.00000000 96 +# 9 GENE_09 0.29271727 0.78406742 100 +# 10 GENE_10 0.29061419 0.67045802 100 + +# out.all$results.single +# Region.name SNP.name MAF p.value +# 1 GENE_01 SNP0056 0.097 0.343641626676761 +# 2 GENE_01 SNP0083 0.11 0.75884859949668 +# 3 GENE_01 SNP0035 0.097 0.796572155483814 +# ... + + +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{plink_test_all} +%\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/man/GA.SSD.OneSet_SetIndex.Rd b/man/GA.SSD.OneSet_SetIndex.Rd new file mode 100755 index 0000000..cc35f35 --- /dev/null +++ b/man/GA.SSD.OneSet_SetIndex.Rd @@ -0,0 +1,137 @@ +\name{GA.SSD.OneSet_SetIndex} +\alias{GA.SSD.OneSet_SetIndex} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Genetic association tests for a single region/gene using SSD format files +} +\description{ +Test the genetic association between an quantitative outcome and one region/gene using SSD format files. +} +\usage{ +GA.SSD.OneSet_SetIndex(SSD.INFO, SetIndex, result.prelim, Gsub.id=NULL, +MinP.adjust=NULL, ...) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{SSD.INFO}{ +SSD format information file, output of function ``Open_SSD". The sets are defined by this file. +} + \item{SetIndex}{ +Set index. From 1 to the total number of sets. +} + \item{result.prelim}{ +Output of function "GA.prelim()". +} + \item{Gsub.id}{ +The subject id corresponding to the genotype matrix, an m dimensional vector. This is in order to match the phenotype and genotype matrix. The default is NULL, where the order is assumed to be matched with Y, X, E and time. +} + \item{MinP.adjust}{ +If the users would like to compare with the MinP test, this parameter specify the adjustment thereshold as in Gao, et al. (2008) "A multiple testing correction method for genetic association studies using correlated single nucleotide polymorphisms". Values from 0 to 1 are permitted. The default is NULL, i.e., no comparison. The value suggested by Gao, et al. (2008) is 0.95. +} + \item{...}{ +Other options of the generalized score type test. Defined same as in function "GA.test()". +} +} +%\details{ +%% ~~ If necessary, more details than the description above ~~ +%} +\value{ + \item{p.value}{ +P-value of the set based generalized score type test. +} + \item{p.single}{ +P-values of the incorporated single SNP analyses +} + \item{p.MinP}{ +P-value of the MinP test. +} + \item{n.marker}{ +number of tested SNPs in the SNP set. +} +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... +} +%\references{ +%% ~put references to the literature/web site here ~ +%} +%\author{ +%% ~~who you are~~ +%} +%\note{ +%% ~~further notes~~ +%} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +%\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +%} +\examples{ + +# * Since the Plink data files used here are hard to be included in a R package, +# The usage is marked by "#" to pass the package check. + +# library(LGEWIS) + +############################################## + +# Plink data files: File.Bed, File.Bim, File.Fam +# Files defining the sets: File.SetID, File.SSD, File.Info +# For longitudinal data, outcome and covariates are saved in a separate file: Y, time, X. +# Preliminary work was done using function null.LGRF. + +# Create the MW File +# File.Bed<-"./example.bed" +# File.Bim<-"./example.bim" +# File.Fam<-"./example.fam" +# File.SetID<-"./example.SetID" +# File.SSD<-"./example.SSD" +# File.Info<-"./example.SSD.info" + +# Generate SSD file +# To use binary ped files, you have to generate SSD file first. +# If you already have a SSD file, you do not need to call this function. +# Generate_SSD_SetID(File.Bed, File.Bim, File.Fam, File.SetID, File.SSD, File.Info) + +# SSD.INFO<-Open_SSD(File.SSD, File.Info) +# Number of samples +# SSD.INFO$nSample +# Number of Sets +# SSD.INFO$nSets + +## Fit the null model +# Y: outcomes, n by 1 matrix where n is the total number of observations +# X: covariates, n by p matrix +# time: describe longitudinal structure, n by 2 matrix +# result.prelim<-GA.prelim(Y,time,X=X) + +## Test a single region +# out_single<-GA.SSD.OneSet_SetIndex(SSD.INFO=SSD.INFO, +# SetIndex=3, result.prelim=result.prelim,MinP.adjust=0.95) + +# Example result +# out_single +# $n.marker +# [1] 108 + +# $p.single +# MAF p.value +# SNP0254 0.098 0.094386616 +# SNP0273 0.091 0.976357482 +# SNP0199 0.118 0.629960577 +# ... + +# $p.MinP +# [1] 0.499237 + +# $p.value +# [1] 0.05976685 + +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{plink_test_single} +%\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/man/GA.prelim.Rd b/man/GA.prelim.Rd new file mode 100755 index 0000000..24c9277 --- /dev/null +++ b/man/GA.prelim.Rd @@ -0,0 +1,73 @@ +\name{GA.prelim} +\alias{GA.prelim} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +The preliminary data management for GA (tests for genetic association) +} +\description{ +Before testing a specific region using a generalized score type test, this function does the preliminary data management, such as fitting the model under the null hypothesis. +} +\usage{ +GA.prelim(Y,time,X=NULL,corstr="exchangeable") +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{Y}{ +The outcome variable, an n*1 matrix where n is the total number of observations +} + \item{time}{ +An n*2 matrix describing how the observations are measured. The first column is the subject id. The second column is the measured exam (1,2,3,etc.). +} + \item{X}{ +An n*p covariates matrix where p is the total number of covariates. +} + \item{corstr}{ +The working correlation as specified in 'geeglm'. The following are permitted: "independence", "exchangeable", "ar1". +} + +} +%\details{ +%% ~~ If necessary, more details than the description above ~~ +%} +\value{ + It returns a list used for function GA.test(). +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... +} +%\references{ +%% ~put references to the literature/web site here ~ +%} +%\author{ +%% ~~who you are~~ +%} +%\note{ +%% ~~further notes~~ +%} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +%\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +%} +\examples{ +library(LGEWIS) + +# Load data example +# Y: outcomes, n by 1 matrix where n is the total number of observations +# X: covariates, n by p matrix +# time: describe longitudinal structure, n by 2 matrix +# G: genotype matrix, m by q matrix where m is the total number of subjects + +data(LGEWIS.example) +Y<-LGEWIS.example$Y;time<-LGEWIS.example$time;X<-LGEWIS.example$X;G<-LGEWIS.example$G + +# Preliminary data management +result.prelim<-GA.prelim(Y,time,X=X) + +} + +\keyword{preliminary work} +%\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/man/GA.test.Rd b/man/GA.test.Rd new file mode 100755 index 0000000..351775b --- /dev/null +++ b/man/GA.test.Rd @@ -0,0 +1,105 @@ +\name{GA.test} +\alias{GA.test} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Test the association between an quantitative outcome variable and a region/gene by a generalized score type test. +} +\description{ +Once the preliminary work is done using "GA.prelim()", this function tests a specifc region/gene. Single SNP analyses are also incorporated. +} +\usage{ +GA.test(result.prelim,G,Gsub.id=NULL,bootstrap=NULL, +MinP.adjust=NULL,impute.method='fixed') +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{result.prelim}{ +The output of function "GEI.prelim()" +} + \item{G}{ +Genetic variants in the target region/gene, an m*q matrix where m is the subject ID and q is the total number of genetic variables. Note that the number of rows in Z should be same as the number of subjects. +} + \item{Gsub.id}{ +The subject id corresponding to the genotype matrix, an m dimensional vector. This is in order to match the phenotype and genotype matrix. The default is NULL, where the order is assumed to be matched with Y, X and time. +} + \item{bootstrap}{ +Whether to use bootstrap for small sample size adjustement. This is recommended when the number of subjects is small, or the set contains rare variants. The default is NULL, but a suggested number is 10000 when it is needed. +} + \item{MinP.adjust}{ +If the users would like to compare with the MinP test, this parameter specify the adjustment thereshold as in Gao, et al. (2008) "A multiple testing correction method for genetic association studies using correlated single nucleotide polymorphisms". Values from 0 to 1 are permitted. The default is NULL, i.e., no comparison. The value suggested by Gao, et al. (2008) is 0.95. +} + \item{impute.method}{ +Choose the imputation method when there is missing genotype. Can be "random", "fixed" or "bestguess". Given the estimated allele frequency, "random" simulates the genotype from binomial distribution; "fixed" uses the genotype expectation; "Best guess" uses the genotype with highest probability. +} +} +%\details{ +%% ~~ If necessary, more details than the description above ~~ +%} +\value{ + \item{p.value}{ +P-value of the set based generalized score type test. +} + \item{p.single}{ +P-values of the incorporated single SNP analyses +} + \item{p.MinP}{ +P-value of the MinP test. +} + \item{n.marker}{ +number of heterozygous SNPs in the SNP set. +} +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... +} +%\references{ +%% ~put references to the literature/web site here ~ +%} +%\author{ +%% ~~who you are~~ +%} +%\note{ +%% ~~further notes~~ +%} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +%\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +%} +\examples{ +## GA.prelim does the preliminary data management. +# Input: Y, time, X (covariates) +## GA.test tests a region. +# Input: G (genetic variants) and result of GEI.prelim + +library(LGEWIS) + +# Load data example +# Y: outcomes, n by 1 matrix where n is the total number of observations +# X: covariates, n by p matrix +# time: describe longitudinal structure, n by 2 matrix +# G: genotype matrix, m by q matrix where m is the total number of subjects + +data(LGEWIS.example) +Y<-LGEWIS.example$Y;time<-LGEWIS.example$time;X<-LGEWIS.example$X;G<-LGEWIS.example$G + +# Preliminary data management +result.prelim<-GA.prelim(Y,time,X=X) + +# test without the MinP test +result<-GA.test(result.prelim,G,MinP.adjust=NULL) + +# test with the MinP test +result<-GA.test(result.prelim,G,MinP.adjust=0.95) + +# test with the MinP test and the small sample adjustment +result<-GA.test(result.prelim,G,MinP.adjust=0.95,bootstrap=1000) + +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{test} +%\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/man/GEI.SSD.All.Rd b/man/GEI.SSD.All.Rd new file mode 100755 index 0000000..417e9eb --- /dev/null +++ b/man/GEI.SSD.All.Rd @@ -0,0 +1,132 @@ +\name{GEI.SSD.All} +\alias{GEI.SSD.All} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Gene-environment interaction tests for multiple regions/genes using SSD format files +} +\description{ +Test the interaction between an environmental exposure and multiple region/genes on a quantitative outcome using SSD format files. +} +\usage{ +GEI.SSD.All(SSD.INFO, result.prelim, Gsub.id=NULL, MinP.adjust=NULL, ...) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{SSD.INFO}{ +SSD format information file, output of function ``Open_SSD". The sets are defined by this file. +} + \item{result.prelim}{ +Output of function "GEI.prelim()". +} + \item{Gsub.id}{ +The subject id corresponding to the genotype matrix, an m dimensional vector. This is in order to match the phenotype and genotype matrix. The default is NULL, where the order is assumed to be matched with Y, X, E and time. +} + \item{MinP.adjust}{ +If the users would like to compare with the MinP test, this parameter specify the adjustment thereshold as in Gao, et al. (2008) "A multiple testing correction method for genetic association studies using correlated single nucleotide polymorphisms". Values from 0 to 1 are permitted. The default is NULL, i.e., no comparison. The value suggested by Gao, et al. (2008) is 0.95. +} + \item{...}{ +Other options of the generalized score type test. Defined same as in function "GEI.test()". +} +} +%\details{ +%% ~~ If necessary, more details than the description above ~~ +%} +\value{ + \item{results}{ +Results of the set based analysis. First column contains the set ID; Second column (second and third columns when the MinP test is compared) contains the p-values; Last column contains the number of tested SNPs. +} + \item{results.single}{ +Results of the single variant analysis for all variants in the sets. First column contains the regions' names; Second column is the variants' names; Third column contains the minor allele frequencies; Last column contains the p.values. +} +} +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... +%} +%\references{ +%% ~put references to the literature/web site here ~ +%} +%\author{ +%% ~~who you are~~ +%} +%\note{ +%% ~~further notes~~ +%} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +%\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +%} +\examples{ + +# * Since the Plink data files used here are hard to be included in a R package, +# The usage is marked by "#" to pass the package check. + +# library(LGEWIS) + +############################################## + +# Plink data files: File.Bed, File.Bim, File.Fam +# Files defining the sets: File.SetID, File.SSD, File.Info +# For longitudinal data, outcome and covariates are saved in a separate file: Y, time, X, E. +# Preliminary work was done using function null.LGRF. + +# Create the MW File +# File.Bed<-"./example.bed" +# File.Bim<-"./example.bim" +# File.Fam<-"./example.fam" +# File.SetID<-"./example.SetID" +# File.SSD<-"./example.SSD" +# File.Info<-"./example.SSD.info" + +# Generate SSD file +# To use binary ped files, you have to generate SSD file first. +# If you already have a SSD file, you do not need to call this function. +# Generate_SSD_SetID(File.Bed, File.Bim, File.Fam, File.SetID, File.SSD, File.Info) + +# SSD.INFO<-Open_SSD(File.SSD, File.Info) +# Number of samples +# SSD.INFO$nSample +# Number of Sets +# SSD.INFO$nSets + +## Fit the null model +# Y: outcomes, n by 1 matrix where n is the total number of observations +# X: covariates, n by p matrix +# E: covariates, n by 1 matrix +# time: describe longitudinal structure, n by 2 matrix +# result.prelim<-GEI.prelim(Y,time,E,X=X) + +## Test all regions +# out_all<-GEI.SSD.All(SSD.INFO, result.prelim, MinP.adjust=0.95) + +# Example result +# out.all$results +# SetID P.value P.value_MinP N.Marker +# 1 GENE_01 0.5617291 1.000000000 94 +# 2 GENE_02 0.8079711 1.000000000 84 +# 3 GENE_03 0.1046738 0.004664728 108 +# 4 GENE_04 0.5976760 1.000000000 101 +# 5 GENE_05 0.3240141 1.000000000 103 +# 6 GENE_06 0.1277916 0.641227316 94 +# 7 GENE_07 0.6957561 1.000000000 104 +# 8 GENE_08 0.7630369 0.151874693 96 +# 9 GENE_09 0.7164281 0.863155784 100 +# 10 GENE_10 0.7292435 0.070854665 100 + +# out.all$results.single +# Region.name SNP.name MAF p.value +# 1 GENE_01 SNP0056 0.097 0.72366448267218 +# 2 GENE_01 SNP0083 0.11 0.814563709041184 +# 3 GENE_01 SNP0035 0.097 0.999162315393064 +# ... + + +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{plink_test_all} +%\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/man/GEI.SSD.OneSet_SetIndex.Rd b/man/GEI.SSD.OneSet_SetIndex.Rd new file mode 100755 index 0000000..dcfdb5f --- /dev/null +++ b/man/GEI.SSD.OneSet_SetIndex.Rd @@ -0,0 +1,150 @@ +\name{GEI.SSD.OneSet_SetIndex} +\alias{GEI.SSD.OneSet_SetIndex} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Gene-environment interaction tests for a single region/gene using SSD format files +} +\description{ +Test the interaction between an environmental exposure and one region/gene on a quantitative outcome using SSD format files. +} +\usage{ +GEI.SSD.OneSet_SetIndex(SSD.INFO, SetIndex, result.prelim, Gsub.id=NULL, +MinP.adjust=NULL, ...) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{SSD.INFO}{ +SSD format information file, output of function ``Open_SSD". The sets are defined by this file. +} + \item{SetIndex}{ +Set index. From 1 to the total number of sets. +} + \item{result.prelim}{ +Output of function "GEI.prelim()". +} + \item{Gsub.id}{ +The subject id corresponding to the genotype matrix, an m dimensional vector. This is in order to match the phenotype and genotype matrix. The default is NULL, where the order is assumed to be matched with Y, X, E and time. +} + \item{MinP.adjust}{ +If the users would like to compare with the MinP test, this parameter specify the adjustment thereshold as in Gao, et al. (2008) "A multiple testing correction method for genetic association studies using correlated single nucleotide polymorphisms". Values from 0 to 1 are permitted. The default is NULL, i.e., no comparison. The value suggested by Gao, et al. (2008) is 0.95. +} + \item{...}{ +Other options of the generalized score type test. Defined same as in function "GEI.test()". +} +} +%\details{ +%% ~~ If necessary, more details than the description above ~~ +%} +\value{ + \item{p.value}{ +P-value of the set based generalized score type test. +} + \item{p.single}{ +P-values of the incorporated single SNP analyses +} + \item{p.MinP}{ +P-value of the MinP test. +} + \item{n.marker}{ +number of tested SNPs in the SNP set. +} + \item{E.df}{ +number of tested SNPs in the SNP set. +} + \item{G.df}{ +number of tested SNPs in the SNP set. +} +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... +} +%\references{ +%% ~put references to the literature/web site here ~ +%} +%\author{ +%% ~~who you are~~ +%} +%\note{ +%% ~~further notes~~ +%} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +%\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +%} +\examples{ + +# * Since the Plink data files used here are hard to be included in a R package, +# The usage is marked by "#" to pass the package check. + +# library(LGEWIS) + +############################################## + +# Plink data files: File.Bed, File.Bim, File.Fam +# Files defining the sets: File.SetID, File.SSD, File.Info +# For longitudinal data, outcome and covariates are saved in a separate file: Y, time, X, E. +# Preliminary work was done using function null.LGRF. + +# Create the MW File +# File.Bed<-"./example.bed" +# File.Bim<-"./example.bim" +# File.Fam<-"./example.fam" +# File.SetID<-"./example.SetID" +# File.SSD<-"./example.SSD" +# File.Info<-"./example.SSD.info" + +# Generate SSD file +# To use binary ped files, you have to generate SSD file first. +# If you already have a SSD file, you do not need to call this function. +# Generate_SSD_SetID(File.Bed, File.Bim, File.Fam, File.SetID, File.SSD, File.Info) + +# SSD.INFO<-Open_SSD(File.SSD, File.Info) +# Number of samples +# SSD.INFO$nSample +# Number of Sets +# SSD.INFO$nSets + +## Fit the null model +# Y: outcomes, n by 1 matrix where n is the total number of observations +# X: covariates, n by p matrix +# E: covariates, n by 1 matrix +# time: describe longitudinal structure, n by 2 matrix +# result.prelim<-GEI.prelim(Y,time,E,X=X) + +## Test a single region +# out_single<-GEI.SSD.OneSet_SetIndex(SSD.INFO=SSD.INFO, +# SetIndex=3, result.prelim=result.prelim,MinP.adjust=0.95) + +# Example result +# out_single +# $n.marker +# [1] 108 + +# $E.df +# [1] 22 + +# $G.df +# [1] 22 + +# $p.single +# MAF p.value +# SNP0254 0.098 5.920758e-01 +# SNP0273 0.091 9.468959e-01 +# SNP0199 0.118 4.131540e-01 +# ... + +# $p.MinP +# [1] 3.876731e-05 + +# $p.value +# [1] 0.1167991 + +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{plink_test_single} +%\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/man/GEI.prelim.Rd b/man/GEI.prelim.Rd new file mode 100755 index 0000000..aaadfdd --- /dev/null +++ b/man/GEI.prelim.Rd @@ -0,0 +1,85 @@ +\name{GEI.prelim} +\alias{GEI.prelim} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +The preliminary data management for GEI (tests for gene-environment interaction) +} +\description{ +Before testing a specific region using a generalized score type test, this function does the preliminary data management, such as pareparing spline basis functions for E etc.. +} +\usage{ +GEI.prelim(Y,time,E,X=NULL,E.method='ns',E.df=floor(sqrt(length(unique(time[,1])))), +corstr="exchangeable") +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{Y}{ +The outcome variable, an n*1 matrix where n is the total number of observations +} + \item{time}{ +An n*2 matrix describing how the observations are measured. The first column is the subject id. The second column is the measured exam (1,2,3,etc.). +} + \item{E}{ +An n*1 environmental exposure. +} + \item{X}{ +An n*p covariates matrix where p is the total number of covariates. +} + \item{E.method}{ +The method of sieves for the main effect of E. It can be "ns" for natural cubic spline sieves; "bs" for B-spline sieves; "ps" for polynomial sieves. The default is "ns". +} + \item{E.df}{ +Model complexity for the method of sieves, i.e., number of basis functions. The default is sqrt(m). +} + \item{corstr}{ +The working correlation as specified in 'geeglm'. The following are permitted: "independence", "exchangeable", "ar1". +} + +} +%\details{ +%% ~~ If necessary, more details than the description above ~~ +%} +\value{ + It returns a list used for function GEI.test(). +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... +} +%\references{ +%% ~put references to the literature/web site here ~ +%} +%\author{ +%% ~~who you are~~ +%} +%\note{ +%% ~~further notes~~ +%} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +%\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +%} +\examples{ +library(LGEWIS) + +# Load data example +# Y: outcomes, n by 1 matrix where n is the total number of observations +# X: covariates, n by p matrix +# E: environmental exposure, n by 1 matrix +# time: describe longitudinal structure, n by 2 matrix +# G: genotype matrix, m by q matrix where m is the total number of subjects + +data(LGEWIS.example) +Y<-LGEWIS.example$Y;time<-LGEWIS.example$time; +E<-LGEWIS.example$E;X<-LGEWIS.example$X;G<-LGEWIS.example$G + +# Preliminary data management +result.prelim<-GEI.prelim(Y,time,E,X=X) + +} + +\keyword{preliminary work} +%\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/man/GEI.test.Rd b/man/GEI.test.Rd new file mode 100755 index 0000000..21dd940 --- /dev/null +++ b/man/GEI.test.Rd @@ -0,0 +1,119 @@ +\name{GEI.test} +\alias{GEI.test} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Test the interaction between an environemental exposure and a region/gene by a generalized score type test. +} +\description{ +Once the preliminary work is done using "GEI.prelim()", this function tests a specifc region/gene. Single SNP analyses are also incorporated. +} +\usage{ +GEI.test(result.prelim,G,Gsub.id=NULL,G.method='wPCA',G.df=floor(sqrt(nrow(G))), +bootstrap=NULL,MinP.adjust=NULL,impute.method='fixed') +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{result.prelim}{ +The output of function "GEI.prelim()" +} + \item{G}{ +Genetic variants in the target region/gene, an m*q matrix where m is the subject ID and q is the total number of genetic variables. Note that the number of rows in Z should be same as the number of subjects. +} + \item{Gsub.id}{ +The subject id corresponding to the genotype matrix, an m dimensional vector. This is in order to match the phenotype and genotype matrix. The default is NULL, where the order is assumed to be matched with Y, X and time. +} + \item{G.method}{ +The dimension reduction method for main effect adjustment of G. The following are permitted: "wPCA" for weighted principal component analysis; "PCA" for principal component analysis; "PLS" for partial least square regression; "R2" for ordering the principal components by their R-squares. The dimension reduction method is in order to analyze large regions, i.e., the number of variants is close to or larger than the number of subjects. The default is "wPCA". +} + \item{G.df}{ +Number of components selected by the dimension reduction method. The default is sqrt(m). +} + \item{bootstrap}{ +Whether to use bootstrap for small sample size adjustement. This is recommended when the number of subjects is small, or the set contains rare variants. The default is NULL, but a suggested number is 10000 when it is needed. +} + \item{MinP.adjust}{ +If the users would like to compare with the MinP test, this parameter specify the adjustment thereshold as in Gao, et al. (2008) "A multiple testing correction method for genetic association studies using correlated single nucleotide polymorphisms". Values from 0 to 1 are permitted. The default is NULL, i.e., no comparison. The value suggested by Gao, et al. (2008) is 0.95. +} + \item{impute.method}{ +Choose the imputation method when there is missing genotype. Can be "random", "fixed" or "bestguess". Given the estimated allele frequency, "random" simulates the genotype from binomial distribution; "fixed" uses the genotype expectation; "Best guess" uses the genotype with highest probability. +} +} +%\details{ +%% ~~ If necessary, more details than the description above ~~ +%} +\value{ + \item{p.value}{ +P-value of the set based generalized score type test. +} + \item{p.single}{ +P-values of the incorporated single SNP analyses +} + \item{p.MinP}{ +P-value of the MinP test. +} + \item{n.marker}{ +number of heterozygous SNPs in the SNP set. +} + \item{E.df}{ +number of tested SNPs in the SNP set. +} + \item{G.df}{ +number of tested SNPs in the SNP set. +} +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... +} +%\references{ +%% ~put references to the literature/web site here ~ +%} +%\author{ +%% ~~who you are~~ +%} +%\note{ +%% ~~further notes~~ +%} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +%\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +%} +\examples{ +## GEI.prelim does the preliminary data management. +# Input: Y, time, E, X (covariates) +## GEI.test tests a region. +# Input: G (genetic variants) and result of GEI.prelim + +library(LGEWIS) + +# Load data example +# Y: outcomes, n by 1 matrix where n is the total number of observations +# X: covariates, n by p matrix +# E: environmental exposure, n by 1 matrix +# time: describe longitudinal structure, n by 2 matrix +# G: genotype matrix, m by q matrix where m is the total number of subjects + +data(LGEWIS.example) +Y<-LGEWIS.example$Y;time<-LGEWIS.example$time; +E<-LGEWIS.example$E;X<-LGEWIS.example$X;G<-LGEWIS.example$G + +# Preliminary data management +result.prelim<-GEI.prelim(Y,time,E,X=X) + +# test without the MinP test +result<-GEI.test(result.prelim,G,MinP.adjust=NULL) + +# test with the MinP test +result<-GEI.test(result.prelim,G,MinP.adjust=0.95) + +# test with the MinP test and the small sample adjustment +result<-GEI.test(result.prelim,G,MinP.adjust=0.95,bootstrap=1000) + +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{test} +%\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/man/LGEWIS_example.Rd b/man/LGEWIS_example.Rd new file mode 100755 index 0000000..1fabbd4 --- /dev/null +++ b/man/LGEWIS_example.Rd @@ -0,0 +1,26 @@ +\name{LGEWIS.example} +\alias{LGEWIS.example} +\docType{data} +\title{ +Data example for LGEWIS (tests for genetic association or gene-environment interaction) +} +\description{ +The dataset contains outcome variable Y, covariate X, environmental exposure E, time, and genotype data Z. The first column in time is the subject ID and the second column is the measured exam. Y, X, E and time are all in long form. Z is a genotype matrix where each row corresponds to one subject. +} +\usage{data(LGEWIS.example)} +%\format{ +%} +%\details{ +%% ~~ If necessary, more details than the __description__ above ~~ +%} +%\source{ +%% ~~ reference to a publication or URL from which the data were obtained ~~ +%} +%\references{ +%% ~~ possibly secondary sources and usages ~~ +%} +%\examples{ +%data(GEI.example) +%## maybe str(LGRF.example) ; plot(LGRF.example) ... +%} +\keyword{datasets}