---
title: "mr.mash issue with larger number of conditions"
author: "Fabio Morgante"
date: "`r format(Sys.time(), '%B %d, %Y')`"
output:
  workflowr::wflow_html:
    toc: true
editor_options:
  chunk_output_type: console
---
```{r set opts, message=FALSE}
library(mr.mash.alpha)
library(glmnet)

options(stringsAsFactors = FALSE)

###Functions to compute g-lasso coefficients
compute_coefficients_glasso <- function(X, Y, standardize, nthreads, version=c("Rcpp", "R")){
  version <- match.arg(version)

  n <- nrow(X)
  p <- ncol(X)
  r <- ncol(Y)

  Y_has_missing <- any(is.na(Y))

  if(Y_has_missing){
    ###Extract per-individual Y missingness patterns
    Y_miss_patterns <- mr.mash.alpha:::extract_missing_Y_pattern(Y)

    ###Initialize missing Ys
    muy <- colMeans(Y, na.rm=TRUE)
    for(l in 1:r){
      Y[is.na(Y[, l]), l] <- muy[l]
    }

    ###Compute V and its inverse
    V <- mr.mash.alpha:::compute_V_init(X, Y, matrix(0, p, r), method="flash")
    Vinv <- chol2inv(chol(V))

    ###Compute expected Y (assuming B=0)
    mu <- matrix(rep(muy, each=n), n, r)

    ###Impute missing Ys
    Y <- mr.mash.alpha:::impute_missing_Y(Y=Y, mu=mu, Vinv=Vinv, miss=Y_miss_patterns$miss,
                                          non_miss=Y_miss_patterns$non_miss, version=version)$Y
  }

  ##Fit group-lasso
  if(nthreads>1){
    doMC::registerDoMC(nthreads)
    paral <- TRUE
  } else {
    paral <- FALSE
  }

  cvfit_glmnet <- glmnet::cv.glmnet(x=X, y=Y, family="mgaussian", alpha=1, standardize=standardize, parallel=paral)
  coeff_glmnet <- coef(cvfit_glmnet, s="lambda.min")

  ##Build matrix of initial estimates for mr.mash
  B <- matrix(as.numeric(NA), nrow=p, ncol=r)
  for(i in 1:length(coeff_glmnet)){
    B[, i] <- as.vector(coeff_glmnet[[i]])[-1]
  }

  return(list(B=B, Y=Y))
}
```
In this analysis, we investigate an issue that arises with a larger number of conditions than the 10 conditions used in the other investigations. The issue is that *mr.mash* fails to shrink to 0 coefficients that are truly 0. As we will see, some of them are quite badly misestimated.

Let's look at an example with n=150 training samples (200 simulated, 50 held out for testing), p=400, r=25, PVE=0.2, p_causal=5, independent residuals, independent predictors, and equal effects across conditions drawn from a single multivariate normal with variance 1.
```{r shared effects simulations, results='hide'}
set.seed(12)
n <- 200
ntest <- 50
p <- 400
p_causal <- 5
r <- 25
X_cor <- 0
pve <- 0.2
r_causal <- list(1:r)
B_cor <- 1
B_scale <- 1
w <- 1
###Simulate V, B, X and Y
out <- simulate_mr_mash_data(n, p, p_causal, r, r_causal, intercepts = rep(1, r),
                             pve=pve, B_cor=B_cor, B_scale=B_scale, w=w,
                             X_cor=X_cor, X_scale=1, V_cor=0)
colnames(out$Y) <- paste0("Y", seq(1, r))
rownames(out$Y) <- paste0("N", seq(1, n))
colnames(out$X) <- paste0("X", seq(1, p))
rownames(out$X) <- paste0("N", seq(1, n))
###Split the data in training and test sets
test_set <- sort(sample(x=c(1:n), size=ntest, replace=FALSE))
Ytrain <- out$Y[-test_set, ]
Xtrain <- out$X[-test_set, ]
Ytest <- out$Y[test_set, ]
Xtest <- out$X[test_set, ]
```
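As a quick, illustrative check (not part of the original analysis), we can verify that the simulated effects have the structure described above; this assumes, as used later in this document, that the simulated coefficients are returned as `out$B`.

```{r simulated effects check}
###Illustrative check of the simulated effects: only p_causal variables should be
###causal, and their effects should be (essentially) equal across all r conditions.
causal_vars <- which(rowSums(out$B != 0) > 0)
length(causal_vars)            #should equal p_causal
out$B[causal_vars[1], 1:5]     #equal effect across the first few conditions
```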
*mr.mash* is run standardizing X and dropping mixture components with weight less than $10^{-8}$ (the results look the same without dropping any component). The mixture prior consists only of canonical covariance matrices, scaled by a grid of variances computed as in the *mash* paper. Group-LASSO is used both to initialize *mr.mash* and as a comparison.
```{r shared effects fit, fig.height=12, fig.width=15}
standardize <- TRUE
nthreads <- 4
w0_threshold <- 1e-8
S0_can <- compute_canonical_covs(ncol(Ytrain), singletons=TRUE, hetgrid=c(0, 0.25, 0.5, 0.75, 1))
####Complete data####
###Compute grid of variances
univ_sumstats <- compute_univariate_sumstats(Xtrain, Ytrain, standardize=standardize,
                                             standardize.response=FALSE, mc.cores=nthreads)
grid <- autoselect.mixsd(univ_sumstats, mult=sqrt(2))^2
###Compute prior with only canonical matrices
S0 <- expand_covs(S0_can, grid, zeromat=TRUE)
###Fit mr.mash
out_coeff <- compute_coefficients_glasso(Xtrain, Ytrain, standardize=standardize, nthreads=nthreads, version="Rcpp")
prop_nonzero_glmnet <- sum(out_coeff$B[, 1]!=0)/p
w0 <- c((1-prop_nonzero_glmnet), rep(prop_nonzero_glmnet/(length(S0)-1), (length(S0)-1)))
fit_mrmash <- mr.mash(Xtrain, Ytrain, S0, w0=w0, update_w0=TRUE, update_w0_method="EM", tol=1e-2,
                      convergence_criterion="ELBO", compute_ELBO=TRUE, standardize=standardize,
                      verbose=FALSE, update_V=TRUE, update_V_method="diagonal", e=1e-8,
                      w0_threshold=w0_threshold, nthreads=nthreads, mu1_init=out_coeff$B)
#####Coefficients plots######
layout(matrix(c(1, 1, 2, 2,
                1, 1, 2, 2,
                0, 3, 3, 0,
                0, 3, 3, 0), 4, 4, byrow = TRUE))

###Plot estimated vs true coefficients
##mr.mash
plot(out$B, fit_mrmash$mu1, main="mr.mash", xlab="True coefficients", ylab="Estimated coefficients",
     cex=2, cex.lab=2, cex.main=2, cex.axis=2)
abline(0, 1)

##g-lasso
plot(out$B, out_coeff$B, main="g-lasso", xlab="True coefficients", ylab="Estimated coefficients",
     cex=2, cex.lab=2, cex.main=2, cex.axis=2)
abline(0, 1)

###Plot mr.mash vs g-lasso estimated coefficients
colorz <- matrix("black", nrow=p, ncol=r)
zeros <- apply(out$B, 2, function(x) x==0)
for(i in 1:ncol(colorz)){
  colorz[zeros[, i], i] <- "red"
}

plot(fit_mrmash$mu1, out_coeff$B, main="mr.mash vs g-lasso",
     xlab="mr.mash coefficients", ylab="g-lasso coefficients",
     col=colorz, cex=2, cex.lab=2, cex.main=2, cex.axis=2)
abline(0, 1)
legend("topleft",
       legend = c("Non-zero", "Zero"),
       col = c("black", "red"),
       pch = c(1, 1),
       horiz = FALSE,
       cex=2)
###Identify badly estimated variables
bad_est_1 <- which(out$B==0 & fit_mrmash$mu1 < -0.5, arr.ind = TRUE)
bad_est_2 <- which(out$B==0 & fit_mrmash$mu1 > 0.5, arr.ind = TRUE)
bad_est <- rbind(bad_est_1, bad_est_2)
cat("Bad variables:\n")
bad_est
```
*mr.mash* is much better than group-LASSO at correctly estimating the non-zero coefficients. However, it does much worse at shrinking the truly null coefficients to 0. In particular, a few coefficients are quite badly misestimated; we will look at those cases more closely after a quick numeric summary (sketched below).
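As a rough, illustrative check of this claim (not part of the original analysis), we can compute each method's root mean squared error separately over the truly non-zero and truly zero coefficients, using the objects computed above.

```{r shared effects error summary sketch}
###Illustrative summary: RMSE of the estimates, split by whether the true
###coefficient is zero or not.
is_zero <- out$B == 0
rmse <- function(est, truth) sqrt(mean((est - truth)^2))
data.frame(method       = c("mr.mash", "g-lasso"),
           rmse_nonzero = c(rmse(fit_mrmash$mu1[!is_zero], out$B[!is_zero]),
                            rmse(out_coeff$B[!is_zero], out$B[!is_zero])),
           rmse_zero    = c(rmse(fit_mrmash$mu1[is_zero], out$B[is_zero]),
                            rmse(out_coeff$B[is_zero], out$B[is_zero])))
```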
```{r shared effects mr.mash coeff bad vars}
###Estimated coefficients by mr.mash for the bad variables
cat("mr.mash estimated coefficients:\n")
fit_mrmash$mu1[bad_est[, 1], ]
```
We can see that, for each of those variables, the estimated coefficients look like a singleton structure, with an effect in one condition and essentially nothing anywhere else. When we look at the prior weights at convergence, we can see that those singleton matrices get relatively large weight (see below).
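A quick, illustrative way to see the singleton pattern numerically (the 0.1 threshold below is arbitrary) is to count, for each of these variables, the conditions with a non-negligible *mr.mash* estimate.

```{r shared effects singleton pattern check}
###Illustrative check: number of conditions with a non-negligible estimated effect
###for each badly estimated variable (threshold of 0.1 chosen arbitrarily).
rowSums(abs(fit_mrmash$mu1[bad_est[, 1], , drop=FALSE]) > 0.1)
```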
```{r shared effects mr.mash weights bad vars}
###Estimated prior weights by mr.mash for the bad variables
cat("mr.mash estimated prior weights:\n")
head(sort(fit_mrmash$w0, decreasing=TRUE), 20)
```
To see why this is happening, the next step is to look at the data. The ideal thing would be to look at the summary statistics from a simple multivariate regression with the same residual covariance matrix as in *mr.mash* (a sketch of this is given after the univariate results below); for now, we look at the univariate summary statistics, which are readily available.
```{r shared effects univariate coeff bad vars, fig.height=12, fig.width=15}
###Estimated coefficients and Z-scores by univariate regression for the bad variables
rownames(univ_sumstats$Bhat) <- colnames(out$X)
colnames(univ_sumstats$Bhat) <- colnames(out$Y)
cat("SLR estimated coefficients:\n")
univ_sumstats$Bhat[bad_est[, 1], ]
###Estimated Z-scores by univariate regression for the bad variables
Zscores <- univ_sumstats$Bhat/univ_sumstats$Shat
cat("SLR estimated Z-scores:\n")
Zscores[bad_est[, 1], ]
plot(fit_mrmash$mu1[bad_est[, 1], ], univ_sumstats$Bhat[bad_est[, 1], ], main="mr.mash vs SLR -- bad variables",
     xlab="mr.mash coefficients", ylab="SLR coefficients",
     cex=2, cex.lab=2, cex.main=2, cex.axis=2)
abline(0, 1)
```
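For reference, here is a minimal, illustrative sketch of the "ideal" check mentioned above: simple (one-predictor-at-a-time) multivariate regression summary statistics for the bad variables, with standard errors derived from the residual covariance estimated by *mr.mash*. This is not part of the original analysis; it assumes the fitted object stores that covariance as `fit_mrmash$V`, and the names `V_hat`, `Bhat_mv`, `Z_mv` are hypothetical.

```{r shared effects mv sumstats sketch}
###Illustrative sketch: per-predictor multivariate regression coefficients for the
###bad variables, with standard errors based on the mr.mash residual covariance.
V_hat   <- fit_mrmash$V                         #assumed: residual covariance from the fit
Xc      <- scale(Xtrain, center=TRUE, scale=standardize)
Yc      <- scale(Ytrain, center=TRUE, scale=FALSE)
bad_idx <- bad_est[, 1]
xtx     <- colSums(Xc[, bad_idx, drop=FALSE]^2)
Bhat_mv <- crossprod(Xc[, bad_idx, drop=FALSE], Yc) / xtx   #one-predictor-at-a-time estimates
Shat_mv <- sqrt(outer(1/xtx, diag(V_hat)))                  #SEs implied by V
Z_mv    <- Bhat_mv / Shat_mv
Z_mv
```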
The data (wrongly) support the singleton structure (i.e., a large effect in one condition and smaller effects in the other conditions). *mr.mash* then estimates the coefficients accordingly, shrinking to 0 the coefficients in the conditions with smaller effects and leaving the coefficient in the remaining condition almost unshrunken for two of the variables and only slightly shrunken for the other two.

If we do not include the singleton matrices in the prior, this behavior does not appear.
```{r shared effects fit no singletons, fig.height=12, fig.width=15}
S0_can_nosing <- compute_canonical_covs(ncol(Ytrain), singletons=FALSE, hetgrid=c(0, 0.25, 0.5, 0.75, 1))
###Compute prior with only canonical matrices without singletons
S0_nosing <- expand_covs(S0_can_nosing, grid, zeromat=TRUE)
###Fit mr.mash
w0_nosing <- c((1-prop_nonzero_glmnet), rep(prop_nonzero_glmnet/(length(S0_nosing)-1), (length(S0_nosing)-1)))
fit_mrmash_nosing <- mr.mash(Xtrain, Ytrain, S0_nosing, w0=w0_nosing, update_w0=TRUE, update_w0_method="EM", tol=1e-2,
                             convergence_criterion="ELBO", compute_ELBO=TRUE, standardize=standardize,
                             verbose=FALSE, update_V=TRUE, update_V_method="diagonal", e=1e-8,
                             w0_threshold=w0_threshold, nthreads=nthreads, mu1_init=out_coeff$B)
#####Coefficients plots######
layout(matrix(c(1, 1, 2, 2,
                1, 1, 2, 2,
                0, 3, 3, 0,
                0, 3, 3, 0), 4, 4, byrow = TRUE))

###Plot estimated vs true coefficients
##mr.mash
plot(out$B, fit_mrmash_nosing$mu1, main="mr.mash", xlab="True coefficients", ylab="Estimated coefficients",
     cex=2, cex.lab=2, cex.main=2, cex.axis=2)
abline(0, 1)

##g-lasso
plot(out$B, out_coeff$B, main="g-lasso", xlab="True coefficients", ylab="Estimated coefficients",
     cex=2, cex.lab=2, cex.main=2, cex.axis=2)
abline(0, 1)

###Plot mr.mash vs g-lasso estimated coefficients
colorz <- matrix("black", nrow=p, ncol=r)
zeros <- apply(out$B, 2, function(x) x==0)
for(i in 1:ncol(colorz)){
  colorz[zeros[, i], i] <- "red"
}

plot(fit_mrmash_nosing$mu1, out_coeff$B, main="mr.mash vs g-lasso",
     xlab="mr.mash coefficients", ylab="g-lasso coefficients",
     col=colorz, cex=2, cex.lab=2, cex.main=2, cex.axis=2)
abline(0, 1)
legend("topleft",
       legend = c("Non-zero", "Zero"),
       col = c("black", "red"),
       pch = c(1, 1),
       horiz = FALSE,
       cex=2)
```
When comparing the ELBO of the two models, we can see that the ELBO for the model with the singletons is a little larger.
```{r shared effects mr.mash ELBO comparison}
###ELBO with singletons
cat("mr.mash ELBO with singletons:\n")
fit_mrmash$ELBO
###ELBO without singletons
cat("mr.mash ELBO without singletons:\n")
fit_mrmash_nosing$ELBO
```
However, at least in this example, there are not many variable/condition combinations displaying this problem. Interestingly, only a few conditions (10, 13, 15, 22) are involved.
```{r shared effects mr.mash bad vars}
badd <- which(abs(fit_mrmash$mu1 - out$B) > 0.05, arr.ind = TRUE)
unique_vars <- unique(badd[,1])
unique_vars <- unique_vars[-unique(which(out$B[unique_vars, ] !=0, arr.ind = TRUE)[, 1])]
unique_vars_bad_y <- badd[which(badd[, 1] %in% unique_vars), ]
unique_vars_bad_y
plot(out$B[unique_vars_bad_y], fit_mrmash$mu1[unique_vars_bad_y],
     xlab="True coefficients", ylab="Estimated coefficients",
     main="mr.mash vs truth\nmost bad variables/conditions")
```
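As a final, illustrative tally (not part of the original analysis), we can count these badly estimated variable/condition combinations per condition, using the second column of `unique_vars_bad_y`, which indexes the condition.

```{r shared effects bad conditions tally}
###Illustrative tally: badly estimated variable/condition combinations per condition.
table(unique_vars_bad_y[, 2])
```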