-
Notifications
You must be signed in to change notification settings - Fork 3
/
sample_overlap_theory.Rmd
189 lines (133 loc) · 6.43 KB
/
sample_overlap_theory.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
---
title: "Directly simulating summary datasets with sample overlap"
author: John Ferguson
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Directly simulating summary datasets with sample overlap}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
```
```{r}
library(MASS)
## calculation of asymptotic covariance matrix for Odds ratio
### convert logit values to probabilities
expit <- function(x){exp(x)/(1+exp(x))}
## calculate asymptotic version of (X^t W X)
asymp_var_logistic <- function(n,G_prob,Gamma_0,Gamma_1){
N <- length(Gamma_0)
disease_probs <- expit(outer(Gamma_0,c(1,1,1)) + outer(Gamma_1,c(0,1,2)))
diag_weights <- disease_probs*(1-disease_probs)
a <- n*apply(G_prob*diag_weights,1,sum)
b <- n*apply(G_prob*diag_weights*matrix(rep(c(0,1,2),N),byrow=TRUE,nrow=N),1,sum)
c <- b
d <- n*apply(G_prob*diag_weights*matrix(rep(c(0,1,4),N),byrow=TRUE,nrow=N),1,sum)
## invert matrix and take element for Gamma1
return(a/(a*d-b*c))
}
asymp_var_linear <- function(n,G_prob,sigma=1){
N <- nrow(G_prob)
a <- n*apply(G_prob,1,sum)
b <- n*apply(G_prob*matrix(rep(c(0,1,2),N),byrow=TRUE,nrow=N),1,sum)
c <- b
d <- n*apply(G_prob*matrix(rep(c(0,1,4),N),byrow=TRUE,nrow=N),1,sum)
## invert matrix and take element for Gamma1
return(a/(a*d-b*c)*sigma^2)
}
asymp_cov_linear_linear <- function(n_overlap,n_gx,n_gy,G_prob,sigma_x=1,sigma_y=1,cor_xy=.5){
N <- nrow(G_prob)
cov_xy <- cor_xy*sigma_x*sigma_y
a <- apply(G_prob,1,sum)
b <- apply(G_prob*matrix(rep(c(0,1,2),N),byrow=TRUE,nrow=N),1,sum)
c <- b
d <- apply(G_prob*matrix(rep(c(0,1,4),N),byrow=TRUE,nrow=N),1,sum)
## invert matrix and take element for Gamma1
return(n_overlap*(a/(a*d-b*c))*cov_xy/(n_gx*n_gy))
}
asymp_cov_linear_logistic <- function(n_overlap,n_gx,n_gy,G_prob,cor_xy=0,sigma=1,Gamma_0,Gamma_1,prev=0.01){
N <- length(Gamma_0)
cov_xy <- cor_xy*sigma*sqrt(prev*(1-prev))
disease_probs <- expit(outer(Gamma_0,c(1,1,1)) + outer(Gamma_1,c(0,1,2)))
diag_weights <- disease_probs*(1-disease_probs)
a <- apply(G_prob*diag_weights,1,sum)
b <- apply(G_prob*diag_weights*matrix(rep(c(0,1,2),N),byrow=TRUE,nrow=N),1,sum)
c <- b
d <- apply(G_prob*diag_weights*matrix(rep(c(0,1,4),N),byrow=TRUE,nrow=N),1,sum)
## invert matrix and take element for Gamma1
return(n_overlap*(a/(a*d-b*c))*cov_xy/(n_gx*n_gy))
}
################################################## Linear Logistic case ###################################################
## parameters
N <- 100000 # number of snps to simulate (assumed in LD and HWE)
prev <- 0.1 # disease prevalence
gamma1 <- rnorm(N,mean=0,sd=0.3) # true instrument strengths
beta <- 0.1 # causal effect
OR <- exp(beta*gamma1) ## true odds ratio (G-Y association)
MAF <- seq(from=.01,to=.5,length=N) # minor allele frequency
n_G_X <- 100000 # sample size G-X (# individuals genotyped on G with X phenotype)
n_G_Y <- 200000 # sample size G-Y (# individuals genotyped on G with Y phenotype)
overlap <- 100000 # shared sample size
sigma_x=1 #variation in trait
cor_xy=0.1 # correlation between x and y on same person
## assuming HWE
G_prob <- cbind(MAF^2,2*MAF*(1-MAF),(1-MAF)^2)
## find Gamma0 for logistic model so prevalence is 0.1
Gamma1 <- log(OR) # G, Y association on log-odds scale
myfunc <- function(MAF,Gamma0, Gamma1, prev){
MAF^2*expit(Gamma0+Gamma1*2)+2*MAF*(1-MAF)*expit(Gamma0+Gamma1)+(1-MAF)^2*expit(Gamma0)-prev
}
Gamma0 <- numeric(N)
for(i in 1:N) Gamma0[i] <- uniroot(myfunc,Gamma1=Gamma1[i],MAF=MAF[i],prev=prev,lower=-10, upper=10)$root
var_Gamma_y <- asymp_var_logistic(n_G_Y,G_prob,Gamma0,Gamma1)
var_gamma_x <- asymp_var_linear(n_G_X,G_prob,sigma=sigma_x)
cov_gamma_x_Gamma_y <- asymp_cov_linear_logistic(overlap, n_G_X,n_G_Y,G_prob,cor_xy=cor_xy,sigma=sigma_x,Gamma_0=Gamma0,Gamma_1=Gamma1,prev=prev)
## now what if G-X and G-Y GWASs share sample overlap (simulate joint association)
## simulate summary statistics
## create array, first 2 dimensions represent individual covariance matrices, 3rd dimension indexing SNPs
library(MASS)
cov_array <- array(dim=c(2,2,length(var_gamma_x)))
cov_array[1,1,] <- var_gamma_x
cov_array[2,1,] <- cov_gamma_x_Gamma_y
cov_array[1,2,] <- cov_array[2,1,]
cov_array[2,2,] <- var_Gamma_y
summary_stats <- apply(cov_array,3,function(x){mvrnorm(n=1,mu=c(0,0),Sigma=x)})
summary_stats <- t(summary_stats + rbind(gamma1,Gamma1))
## checking correlation
cor(summary_stats-cbind(gamma1,Gamma1))
################################################## Linear Linear case ###################################################
## parameters
N <- 100000 # number of snps to simulate (assumed in LD and HWE)
prev <- 0.1 # disease prevalence
beta <- 0.1 # causal effect of X and Y
gamma1 <- rnorm(N,mean=0,sd=0.3) # true instrument strengths
Gamma1 <- beta*gamma1 # true G/Y associations
MAF <- seq(from=.01,to=.5,length=N) # minor allele frequency
n_G_X <- 100000 # sample size G-X
n_G_Y <- 200000 # sample size G-Y
overlap <- 100000 # shared sample size (must be less than or equal to min(n_G_X,n_G_Y))
sigma_x=1 #variation in trait
sigma_y=1 #variation in outcome
cor_xy=0.1 # correlation between x and y on same person # note if there is no confounding this will be beta*sqrt(var_X)/sqrt(var_Y)
## assuming HWE
G_prob <- cbind(MAF^2,2*MAF*(1-MAF),(1-MAF)^2)
var_Gamma_y <- asymp_var_linear(n_G_Y,G_prob,sigma=sigma_y)
var_gamma_x <- asymp_var_linear(n_G_X,G_prob,sigma=sigma_x)
cov_gamma_x_Gamma_y <- asymp_cov_linear_linear(overlap, n_G_X,n_G_Y,G_prob,sigma_x=sigma_x,sigma_y=sigma_y,cor_xy=cor_xy)
## now what if G-X and G-Y GWASs share sample overlap (simulate joint association)
## simulate summary statistics
## create array, first 2 dimensions represent individual covariance matrices, 3rd dimension indexing SNPs
cov_array <- array(dim=c(2,2,length(var_gamma_x)))
cov_array[1,1,] <- var_gamma_x
cov_array[2,1,] <- cov_gamma_x_Gamma_y
cov_array[1,2,] <- cov_array[2,1,]
cov_array[2,2,] <- var_Gamma_y
summary_stats <- apply(cov_array,3,function(x){mvrnorm(n=1,mu=c(0,0),Sigma=x)})
summary_stats <- t(summary_stats + rbind(gamma1,Gamma1))
## checking correlation
cor(summary_stats-cbind(gamma1,Gamma1))
```