Skip to content

Commit

Permalink
version 0.2
Browse files Browse the repository at this point in the history
  • Loading branch information
Zihuai He authored and gaborcsardi committed Oct 14, 2015
0 parents commit dc7bb57
Show file tree
Hide file tree
Showing 16 changed files with 1,810 additions and 0 deletions.
15 changes: 15 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,15 @@
Package: LGEWIS
Type: Package
Title: Tests for Genetic Association/Gene-Environment Interaction in
Longitudinal Gene-Environment-Wide Interaction Studies
Version: 0.2
Date: 2015-10-07
Author: Zihuai He, Seunggeun Lee, Bhramar Mukherjee, Min Zhang
Maintainer: Zihuai He <zihuai@umich.edu>
Description: Functions for testing the genetic association/gene-environment interaction in longitudinal gene-environment-wide interaction studies. Generalized score type tests are used for set based analyses. Then GEE based score tests are applied to all single variants within the defined set.
License: GPL-3
Depends: CompQuadForm, SKAT, geeM, pls, splines
NeedsCompilation: no
Packaged: 2015-10-12 13:14:06 UTC; statzihuai
Repository: CRAN
Date/Publication: 2015-10-14 09:48:39
15 changes: 15 additions & 0 deletions MD5
@@ -0,0 +1,15 @@
89882734b5cc2f53cad68ee5b4fc7316 *DESCRIPTION
f0ef963ee49ab0bc55e8e9739737f26c *NAMESPACE
0745f11856cdfa5b1d823a82f81458d0 *R/GA.R
b7f20ba296667b10673abe9d02bae18e *R/GEI.R
9620bb6a7e839acc0e36bd0e1f155002 *R/Main_SSD.R
78a6eab9c39cfc8bbeb6f99e1ab4f8fe *data/LGEWIS.example.rda
790cf5c85deb903d58e5639eca2421f8 *man/GA.SSD.All.Rd
3eedbf2cf1ab8bbb4ec5844c5b295557 *man/GA.SSD.OneSet_SetIndex.Rd
a05de32b89938c91c3b19da5981ba49d *man/GA.prelim.Rd
2cffc2c1b31f42dd378af21fb3814cd1 *man/GA.test.Rd
46a895ede225364ba59c67507c54f83c *man/GEI.SSD.All.Rd
b4bad0e2f5eda08b40a2141433b9920f *man/GEI.SSD.OneSet_SetIndex.Rd
fbe0940c306d475666b4adee37d679cd *man/GEI.prelim.Rd
230009204a7146e24e82cf945e19a8ca *man/GEI.test.Rd
f2e61050b2713fd4dc60b40180c0f1e0 *man/LGEWIS_example.Rd
4 changes: 4 additions & 0 deletions NAMESPACE
@@ -0,0 +1,4 @@
#exportPattern("^[[:alpha:]]+")
export(GA.prelim,GA.test,GA.SSD.OneSet_SetIndex,GA.SSD.All,GEI.prelim,GEI.test,GEI.SSD.OneSet_SetIndex,GEI.SSD.All)
import(CompQuadForm, SKAT, geeM, pls, splines)
importFrom("stats", "cor", "lm", "rbinom", "gaussian", "pchisq", "quantile", "sd", "var")
278 changes: 278 additions & 0 deletions R/GA.R
@@ -0,0 +1,278 @@

#########################
# Hypothesis Testing
#########################

#p.s: the current code is only for a single environmental exposure

GA.prelim<-function(Y,time,X=NULL,corstr="exchangeable"){
##Preliminary
Y<-as.matrix(Y);N<-nrow(Y)
cluster.id<-unique(time[,1]);m<-length(cluster.id)
#sort the data by cluster.id
order.index<-order(match(time[,1],cluster.id))
if (sum(time[,1]!=time[order.index,1])>0){
msg<-sprintf("time was not sorted, now data is grouped by subject id")
warning(msg,call.=F)
Y<-as.matrix(Y[order.index,])
if(length(X)!=0){X<-as.matrix(X[order.index])}
time<-time[order.index,]
}

if(length(X)!=0){
Z.A<-svd(as.matrix(X))$u
nullgee<-geem(Y~Z.A,family=gaussian,corstr=corstr,id=time[,1])
}else{
nullgee<-geem(Y~1,family=gaussian,corstr=corstr,id=time[,1])
}

#prepare the intermediate results
result.prelim<-list(Y=Y,time=time,X=X,cluster.id=cluster.id,m=m,corstr=corstr,Z.A=Z.A,nullgee=nullgee)
return(result.prelim)
}

#G is an m*q matrix, each row coresponds to one subject.
GA.test<-function(result.prelim,G,Gsub.id=NULL,bootstrap=NULL,MinP.adjust=NULL,impute.method='fixed'){
## Load preliminary data
Y<-result.prelim$Y;corstr<-result.prelim$corstr
m<-result.prelim$m;X<-result.prelim$X
time<-result.prelim$time;cluster.id<-result.prelim$cluster.id
Z.A<-result.prelim$Z.A #Z.A0=svd(cbind(rep(1,N),X, M.E))$u
nullgee<-result.prelim$nullgee

## Deal with the genotype
SNP.list<-colnames(G)
# match the phenotype and genotype subject ID
if(length(Gsub.id)==0){Gsub.id<-cluster.id}
G<-as.matrix(G[match(cluster.id,Gsub.id),])

# missing genotype imputation
G[G==9]<-NA
N_MISS<-sum(is.na(G))
if(N_MISS>0){
msg<-sprintf("The missing genotype rate is %f. Imputation is applied.", N_MISS/nrow(G)/ncol(G))
warning(msg,call.=F)
G<-Impute(G,impute.method)
}
# centered genotype
G<-as.matrix(G);center.G<-t(t(G)-colMeans(G))
MAF<-colMeans(as.matrix(G[,colMeans(center.G^2)!=0]))/2;MAF[MAF>0.5]<-1-MAF[MAF>0.5]
G<-as.matrix(center.G[,colMeans(center.G^2)!=0]);n.G<-ncol(G)
SNP.name<-SNP.list[colMeans(center.G^2)!=0]
if(ncol(G) == 0){
msg<-sprintf("G does not include any heterozygous variant")
stop(msg)
}

# generate long form genotype
Z<-as.matrix(G[match(time[,1],cluster.id),])

if(length(bootstrap)==0){
# calculate score
GA.fit<-GA.Score(nullgee,Y,time,Z.A,Z,corstr) #Z.A does not include the intercept
# calculate score statistics and their covariance matrix
score<-GA.fit$score;M<-GA.fit$M
# calculate p-value
TestStat<-t(score)%*%score
EigenM<-eigen(M,symmetric=TRUE,only.values=TRUE)$values
p.value<-davies(TestStat,EigenM)$Qq

# implement a single SNP based analysis by GEE score tests
p.single<-cbind(MAF,as.vector(pchisq(as.vector(score)^2/diag(M),df=1,lower.tail=F)))
colnames(p.single)<-c('MAF','p.value')
rownames(p.single)<-SNP.name
}else{
# calculate score
GA.fit<-GA.bootstrap(nullgee,Y,time,Z.A,Z,corstr,bootstrap) #Z.A does not include the intercept
score<-GA.fit$score;perturb.score<-GA.fit$perturb.score
Q<-t(score)%*%score
# calculate bootstrap based mean, variance and kurtosis
perturb.Q<-apply(perturb.score^2,2,sum)
perturb.mean<-mean(perturb.Q)
perturb.variance<-var(perturb.Q)
perturb.kurtosis<-mean((perturb.Q-perturb.mean)^4)/perturb.variance^2-3
# calculate p.value
if (perturb.kurtosis>0){df<-12/perturb.kurtosis}else{
df<-100000
}
p.value<-1-pchisq((Q-perturb.mean)*sqrt(2*df)/sqrt(perturb.variance)+df,df)
# implement a single SNP based analysis by GEE score tests
perturb.variance.single<-apply(perturb.score,1,var)
p.single<-cbind(MAF,as.vector(pchisq(as.vector(score)^2/perturb.variance.single,df=1,lower.tail=F)))
colnames(p.single)<-c('MAF','p.value')
rownames(p.single)<-SNP.name
}
if(length(MinP.adjust)!=0){
eigen.G<-eigen(cor(G),only.values=TRUE)$values
me.G<-effect.n(eigen.G,MinP.adjust)
p.MinP<-min(min(p.single[,2])*me.G,1)
}else{p.MinP<-NA}

return(list(n.marker=n.G,p.single=p.single,p.MinP=p.MinP,p.value=p.value))
}

#################################################
# Calculate Score Statistics and their Variance
#################################################

GA.bootstrap<-function(nullgee,Y,time,Z.A,Z.I,corstr,bootstrap) #Z.A does not include the intercept
{
rho<-as.numeric(nullgee$alpha);N<-nrow(time);Z.A<-cbind(rep(1,N),Z.A) #add the intercept
mu<-colSums(nullgee$beta*t(Z.A));Y.res<-Y-mu
#rho<-nullgee$geese$alpha;mu<-nullgee$fitted.values;Y.res<-Y-mu
cluster.id<-unique(time[,1]);m<-length(cluster.id);dimI<-ncol(Z.I)
##Generalized score test
score.array<-matrix(0,dimI,m)
n.total<-1;n.rep<-as.numeric(table(time[,1]))
for (i in 1:m)
{
ni<-n.rep[i]
index<-n.total:(n.total+ni-1)
n.total<-n.total + ni
#working covariance matrix
if (corstr=="exchangeable"){Vi<-diag(1-rho,ni)+ matrix(rho, ni, ni);Vi.inv<-solve(Vi)}
if (corstr=="ar1"){Vi<-rho^abs(outer(time[index,2], time[index,2],"-"));Vi.inv<-solve(Vi)}
if (corstr=="independence"){Vi<-diag(1,ni);Vi.inv<-Vi}
#if (link=="logit"){Ci<-mu[index]*(1-mu[index]);Vi.inv<-outer(sqrt(Ci),sqrt(Ci)^-1,"*")*Vi.inv}
#score matrices
Z.Ii<-Z.I[index,];if (ni==1) {Z.Ii<-t(Z.Ii)}
score.array[,i]<-t(Z.Ii)%*%(Vi.inv%*%Y.res[index])/sqrt(m)
}
score<-as.matrix(apply(score.array,1,sum))
# null distribution
perturb.coef<-matrix(rbinom(m*bootstrap,1,0.5)*2-1,m,bootstrap)
perturb.score<-score.array%*%perturb.coef

return(list(score=score,perturb.score=perturb.score))#Q: test statistic; M: covariance matrix
}

GA.Score<-function(nullgee,Y,time,Z.A,Z.I,corstr) #Z.A does not include the intercept
{
rho<-as.numeric(nullgee$alpha);N<-nrow(time);Z.A<-cbind(rep(1,N),Z.A) #add the intercept
mu<-colSums(nullgee$beta*t(Z.A));Y.res<-Y-mu
#rho<-nullgee$geese$alpha;mu<-nullgee$fitted.values;Y.res<-Y-mu
cluster.id<-unique(time[,1]);m<-length(cluster.id)
##Generalized score test
dimY<-nrow(Y.res);dimA<-ncol(Z.A);dimI<-ncol(Z.I);score<-matrix(0,dimI,1)
An<-matrix(0,dimA+dimI,dimA+dimI);Bn<-matrix(0,dimA+dimI,dimA+dimI)
n.total<-1;n.rep<-as.numeric(table(time[,1]))
for (i in 1:m)
{
ni<-n.rep[i]
index<-n.total:(n.total+ni-1)
n.total<-n.total + ni
#working covariance matrix
if (corstr=="exchangeable"){Vi<-diag(1-rho,ni)+ matrix(rho, ni, ni);Vi.inv<-solve(Vi)}
if (corstr=="ar1"){Vi<-rho^abs(outer(time[index,2], time[index,2],"-"));Vi.inv<-solve(Vi)}
if (corstr=="independence"){Vi<-diag(1,ni);Vi.inv<-Vi}
#if (link=="logit"){Ci<-mu[index]*(1-mu[index]);Vi.inv<-outer(sqrt(Ci),sqrt(Ci)^-1,"*")*Vi.inv}
#score matrices
Z.Ai<-Z.A[index,];Z.Ii<-Z.I[index,]
if (ni==1) { Z.Ai<-t(Z.Ai); Z.Ii<-t(Z.Ii)}
Zi<-cbind(Z.Ii,Z.Ai)
#An<-An+t(Zi)%*%Vi.inv%*%Zi
An[(dimI+1):(dimI+dimA),(dimI+1):(dimI+dimA)]<-An[(dimI+1):(dimI+dimA),(dimI+1):(dimI+dimA)]+t(Z.Ai)%*%Vi.inv%*%Z.Ai
An[1:dimI, (dimI+1):(dimI+dimA)]<-An[1:dimI, (dimI+1):(dimI+dimA)]+t(Z.Ii)%*%(Vi.inv%*%Z.Ai)
Bntemp<-t(Zi)%*%(Vi.inv%*%Y.res[index])
Bn<-Bn+Bntemp%*%t(Bntemp)
score<-score+Bntemp[1:dimI,]#t(Z.Ii)%*%Vi.inv%*%Y.res[index]
}
An<-An/m; Bn<-Bn/m
#Calculate p-value
A00<-An[(dimI+1):(dimI+dimA),(dimI+1):(dimI+dimA)];A00.inv<-solve(A00);A10<-An[1:dimI, (dimI+1):(dimI+dimA)]
B11<-Bn[1:dimI,1:dimI];B10<-Bn[1:dimI,(dimI+1):(dimI+dimA)];B00<-Bn[(dimI+1):(dimI+dimA),(dimI+1):(dimI+dimA)]
P<-A10%*%A00.inv;M<-B11-t(B10%*%t(P))-B10%*%t(P)+P%*%B00%*%t(P)
#Revise the covariance matrix using m-q
M<-M*m/(m-dimA)
#Revise the score using standard representation
score<-score/sqrt(m)
return(list(score=score,M=M))#Q: test statistic; M: covariance matrix
}

# calculate number of independent tests
effect.n<-function(x,MinP.adjust){
temp<-0;sum.EV<-sum(x) #summation of eigen values
for (i in 1:length(x)){
temp<-temp+x[i];if (temp>sum.EV*MinP.adjust){break}
}
return(i)
}

#Data management
Minor.allele<-function(x){if(mean(x)>1){x<-2-x};return(x)}
Standardize<-function(x){return((x-mean(x))/sd(x))}
Center<-function(x){return(x-mean(x))}
Variation<-function(x){x<-apply(x,2,Minor.allele);x<-x[,which(apply(x,2,sd)>0)];return(x)}
Common<-function(x){x<-apply(x,2,Minor.allele);freq<-apply(x,2,mean)/2;x<-x[,which(freq>0.05)];return(x)}
##Matrix calculation
#Get sqrt.root of a matrix
Get.sqrt<-function(A){
a.eig <- eigen(A,symmetric=TRUE)
ID1<-which(a.eig$values > 0)
if(length(ID1)== 0){stop("Error to obtain matrix square!")}
a.sqrt <- a.eig$vectors[,ID1] %*% diag(sqrt(a.eig$values[ID1])) %*% t(a.eig$vectors[,ID1])
return(a.sqrt)
}
#Get inverse of a matrix; numerically robust
Get.inverse<-function(A){
a.eig <- eigen(A,symmetric=TRUE)
ID1<-which(a.eig$values > 0)
if(length(ID1)== 0){stop("Error to obtain matrix inverse!")}
a.inverse <- a.eig$vectors[,ID1] %*% diag(a.eig$values[ID1]^-1) %*% t(a.eig$vectors[,ID1])
return(a.inverse)
}
#Get -1/2 of a matrix; numerically robust
Get.inv.sqrt<-function(A){
a.eig <- eigen(A,symmetric=TRUE)
ID1<-which(a.eig$values > 0)
if(length(ID1)== 0){stop("Error to obtain matrix square!")}
a.sqrt <- a.eig$vectors[,ID1] %*% diag(sqrt(a.eig$values[ID1])^-2) %*% t(a.eig$vectors[,ID1])
return(a.sqrt)
}
#Time similarity
Check.same<-function(x,y){return(as.numeric(x==y))}
Check.near<-function(x,y){return(as.numeric(abs(x-y)==1))}
D.matrix<-function(X){
n<-length(X[,1]);temp<-matrix(0,n,n)
#checking time
temp<-outer(X[,1],X[,1],Check.same)*outer(X[,2],X[,2],Check.near)
diag(temp)<-0
return(temp)
}
# Simple Imputation (from SKAT package)
# Z : an m x p genotype matrix with m samples and p SNPs

Impute<-function(Z, impute.method){
p<-dim(Z)[2]
if(impute.method =="random"){
for(i in 1:p){
IDX<-which(is.na(Z[,i]))
if(length(IDX) > 0){
maf1<-mean(Z[-IDX,i])/2
Z[IDX,i]<-rbinom(length(IDX),2,maf1)
}
}
} else if(impute.method =="fixed"){
for(i in 1:p){
IDX<-which(is.na(Z[,i]))
if(length(IDX) > 0){
maf1<-mean(Z[-IDX,i])/2
Z[IDX,i]<-2 * maf1
}
}
} else if(impute.method =="bestguess") {
for(i in 1:p){
IDX<-which(is.na(Z[,i]))
if(length(IDX) > 0){
maf1<-mean(Z[-IDX,i])/2
Z[IDX,i]<-round(2 * maf1)
}
}
} else {
stop("Error: Imputation method shoud be \"fixed\", \"random\" or \"bestguess\" ")
}
return(as.matrix(Z))
}



0 comments on commit dc7bb57

Please sign in to comment.