-
Notifications
You must be signed in to change notification settings - Fork 0
/
mode_specific_sure.R
463 lines (408 loc) · 15.1 KB
/
mode_specific_sure.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
##' Outputs the 'C' array from Gerard and Hoff (2015), along with the HOSVD of
##' the data tensor.
##'
##' This is necessary to calculate the SURE for higher-order spectral
##' estimators.
##'
##' This function only calculates C and the HOSVD so that calculating SURE is
##' faster when doing it over and over again.
##'
##' @param X An array of numerics. The data.
##'
##' @return \code{C_array} An array of numerics. The "C" array from Gerard and
##'   Hoff (2015).
##'
##' \code{hosvd_x} A list containing the higher-order SVD of \code{X}.
##'
##' @author David Gerard.
##'
##' @references Gerard, D., & Hoff, P. (2015).
##'   \href{http://arxiv.org/abs/1505.02114}{Adaptive Higher-order Spectral
##'   Estimators}. \emph{arXiv preprint} arXiv:1505.02114.
##'
##' @seealso \code{\link{diverge_given_c}} for calculating the divergence of
##'   higher-order spectral estimators using the output of \code{get_c}.
##'
##' \code{\link{sure_given_c}} for calculating the SURE given the output of
##' \code{get_c}.
##'
##' \code{\link{sure}} for a wrapper for \code{get_c} and
##' \code{\link{sure_given_c}}.
##'
##' \code{\link{soft_coord}} for a coordinate descent algorithm for finding
##' the optimal sure using the output from \code{get_c}.
##'
##' @export
get_c <- function(X) {
  p <- dim(X)
  n <- length(p)
  hosvd_x <- hosvd_full(X)
  S <- hosvd_x$S
  for (mode_index in seq_len(n)) {
    sig <- hosvd_x$D[[mode_index]]
    ## Leave-one-out sums of 1 / (sigma_j^2 - sigma_i^2) over j != i for
    ## each mode-specific singular value i.
    over_temp <- rep(NA_real_, length = p[mode_index])
    for (individual_index in seq_len(p[mode_index])) {
      over_temp[individual_index] <-
        sum(1 / (sig ^ 2 - sig[individual_index] ^ 2)[-individual_index])
    }
    if (mode_index == 1) {
      sig_inv_sq_array <- 1 / sig ^ 2
      sig_leave_one_out <- over_temp
    } else {
      ## Accumulate across modes by outer addition so that element
      ## (i_1, ..., i_n) holds the sum of the mode-specific terms.
      sig_inv_sq_array <-
        outer(sig_inv_sq_array, 1 / sig ^ 2, FUN = "+")
      sig_leave_one_out <- outer(sig_leave_one_out, over_temp, FUN = "+")
    }
  }
  D_list <- vector("list", length = n)
  for (k_index in seq_len(n)) {
    sig_k <- hosvd_x$D[[k_index]]
    ## Hoist the loop-invariant mode-k matricization of S^2 out of the
    ## inner loop.
    S2_mat <- tensr::mat(S ^ 2, k_index)
    k_temp_mat <- matrix(NA_real_, nrow = p[k_index],
                         ncol = prod(p[-k_index]))
    for (i_index in seq_len(p[k_index])) {
      mult_left <- 1 / (sig_k[i_index] ^ 2 - sig_k ^ 2)
      if (p[k_index] > 2) {
        k_temp_mat[i_index, ] <-
          colSums((mult_left * S2_mat)[-i_index, ])
      } else {
        ## Only one row remains after dropping i_index, so take it
        ## directly rather than calling colSums on a dropped dimension.
        k_temp_mat[i_index, ] <-
          (mult_left * S2_mat)[-i_index, ]
      }
    }
    k_temp_array <- array(k_temp_mat, dim = c(p[k_index], p[-k_index]))
    ## Permute so that mode k is back in position k.
    D_list[[k_index]] <-
      aperm(k_temp_array,
            match(seq_len(n), c(k_index, seq_len(n)[-k_index])))
  }
  ## An array of ones with the same dimensions as X.
  one_array <- array(1, dim = p)
  C_array <- one_array - S ^ 2 * (sig_inv_sq_array + sig_leave_one_out) +
    listSum(D_list)
  return(list(C_array = C_array, hosvd_x = hosvd_x))
}
##' Calculates divergence of HOSE.
##'
##' Assumes the heavy lifting (the HOSVD and the 'C' array from the write-up
##' 'sure_pdf') has already been done. Call \code{\link{get_c}} before using
##' this function.
##'
##' @inheritParams sure_given_c
##'
##' @return \code{divergence_f} The divergence of the higher-order spectral
##'   estimator.
##'
##' \code{mean_est} The mean estimate of the higher-order spectral estimator.
##'
##' @author David Gerard.
##'
##' @references Gerard, D., & Hoff, P. (2015).
##'   \href{http://arxiv.org/abs/1505.02114}{Adaptive Higher-order Spectral
##'   Estimators}. \emph{arXiv preprint} arXiv:1505.02114.
##'
##' @seealso \code{\link{get_c}}, \code{\link{sure_given_c}}.
##'
##' @export
diverge_given_c <- function(obj, func, dfunc, lambda) {
  hosvd_x <- obj$hosvd_x
  C_array <- obj$C_array
  S <- hosvd_x$S
  p <- dim(S)
  n <- length(p)
  modes <- seq_len(n)
  ## Per-mode diagonal matrices: the singular values, their reciprocals,
  ## the shrinkage function applied to them, and its derivative.
  D <- lapply(modes, function(k) diag(hosvd_x$D[[k]]))
  D_inv <- lapply(modes, function(k) diag(1 / hosvd_x$D[[k]]))
  f_D <- lapply(modes, function(k) {
    diag(do.call(func[[k]], args = list(hosvd_x$D[[k]], lambda[[k]])))
  })
  df_D <- lapply(modes, function(k) {
    diag(do.call(dfunc[[k]], args = list(hosvd_x$D[[k]], lambda[[k]])))
  })
  ## f(sigma) / sigma along each mode.
  f_d_inv <- Map(`*`, f_D, D_inv)
  ## H_list[[k]] is f_d_inv with the k-th factor replaced by
  ## f'(sigma) / sigma^2.
  H_list <- lapply(modes, function(k) {
    H_k <- f_d_inv
    H_k[[k]] <- df_D[[k]] * D_inv[[k]] ^ 2
    H_k
  })
  divergence_f <- sum(tensr::atrans(C_array, f_d_inv))
  for (k in modes) {
    divergence_f <- divergence_f + sum(tensr::atrans(S ^ 2, H_list[[k]]))
  }
  mean_est <- tensr::atrans(tensr::atrans(S, f_d_inv), hosvd_x$U)
  return(list(divergence_f = divergence_f, mean_est = mean_est))
}
##' Calculates SURE given the output of \code{\link{get_c}}.
##'
##' Calculates SURE assuming we already did all of the heavy lifting in
##' calculating the HOSVD and the 'C' matrix from my write-up 'sure_pdf'. Call
##' \code{\link{get_c}} before using this function.
##'
##' @param obj Output from \code{\link{get_c}}.
##' @param func A list of length \code{length(dim(X))} of shrinkage functions.
##' @param dfunc A list of length \code{length(dim(X))} of corresponding
##'   derivative functions.
##' @param lambda A list of parameter values for shrinking along each mode.
##' @param tau2 A positive numeric. The variance. Assumed known and defaults to
##'   1.
##'
##' @return \code{sure_val} The SURE value of the higher-order spectral
##'   estimator.
##'
##' \code{gsure_val} The generalized SURE value, which does not depend on
##' \code{tau2}.
##'
##' \code{mean_est} The mean estimate of the higher-order spectral estimator.
##'
##' @author David Gerard.
##'
##' @references Gerard, D., & Hoff, P. (2015).
##'   \href{http://arxiv.org/abs/1505.02114}{Adaptive Higher-order Spectral
##'   Estimators}. \emph{arXiv preprint} arXiv:1505.02114.
##'
##' @seealso \code{\link{get_c}}, \code{\link{diverge_given_c}},
##'   \code{\link{sure}}.
##'
##' @export
sure_given_c <- function(obj, func, dfunc, lambda, tau2 = 1) {
  hosvd_x <- obj$hosvd_x
  C_array <- obj$C_array
  S <- hosvd_x$S
  p <- dim(S)
  n <- length(p)
  D_inv <- vector("list", length = n)
  f_D <- vector("list", length = n)
  df_D <- vector("list", length = n)
  for (mode_index in seq_len(n)) {
    sig <- hosvd_x$D[[mode_index]]
    D_inv[[mode_index]] <- diag(1 / sig)
    ## Shrinkage function and its derivative evaluated at the mode-specific
    ## singular values, stored as diagonal matrices.
    f_D[[mode_index]] <-
      diag(do.call(func[[mode_index]],
                   args = list(sig, lambda[[mode_index]])))
    df_D[[mode_index]] <-
      diag(do.call(dfunc[[mode_index]],
                   args = list(sig, lambda[[mode_index]])))
  }
  ## f(sigma) / sigma along each mode.
  f_d_inv <- vector("list", length = n)
  for (mode_index in seq_len(n)) {
    f_d_inv[[mode_index]] <- f_D[[mode_index]] * D_inv[[mode_index]]
  }
  ## H_list[[k]] is f_d_inv with the k-th factor replaced by
  ## f'(sigma) / sigma^2.
  H_list <- vector("list", length = n)
  for (mode_index in seq_len(n)) {
    H_list[[mode_index]] <- f_d_inv
    H_list[[mode_index]][[mode_index]] <-
      df_D[[mode_index]] * D_inv[[mode_index]] ^ 2
  }
  divergence_f <- sum(tensr::atrans(C_array, f_d_inv))
  for (mode_index in seq_len(n)) {
    divergence_f <- divergence_f +
      sum(tensr::atrans(S ^ 2, H_list[[mode_index]]))
  }
  ## Residual sum of squares, computed in the core-tensor domain.
  rss <- sum((tensr::atrans(S, f_d_inv) - S) ^ 2)
  ## Stein's unbiased risk estimate.
  sure_val <- 2 * tau2 * divergence_f + rss - tau2 * prod(p)
  ## Generalized SURE; does not require knowing tau2.
  gsure_val <- rss / (1 - divergence_f / prod(p)) ^ 2
  mean_est <- tensr::atrans(tensr::atrans(S, f_d_inv), hosvd_x$U)
  return(list(sure_val = sure_val, gsure_val = gsure_val,
              mean_est = mean_est))
}
##' Wrapper for \code{\link{get_c}} and \code{\link{sure_given_c}}.
##'
##' @inheritParams get_c
##' @inheritParams sure_given_c
##'
##' @author David Gerard.
##'
##' @references Gerard, D., & Hoff, P. (2015).
##'   \href{http://arxiv.org/abs/1505.02114}{Adaptive Higher-order Spectral
##'   Estimators}. \emph{arXiv preprint} arXiv:1505.02114.
##'
##' @seealso \code{\link{get_c}}, \code{\link{sure_given_c}}.
##'
##' @export
sure <- function(X, func, dfunc, lambda, tau2 = 1) {
  ## Compute the reusable 'C' array and HOSVD once, then evaluate SURE.
  sure_given_c(obj = get_c(X), func = func, dfunc = dfunc,
               lambda = lambda, tau2 = tau2)
}
##' Iterates through all multilinear ranks less than \code{max_nrank} and
##' returns all SUREs.
##'
##' Iterate through all ranks less than \code{max_nrank} and choose the rank
##' with the smallest SURE.
##'
##' @param X An array of numerics. The data.
##' @param max_nrank A vector of positive integers. The maximum rank to
##'   inspect for each mode.
##' @param tau2 The variance, assumed known. Defaults to 1.
##'
##' @return \code{min_rank} The multilinear rank that minimizes the SURE.
##'
##' \code{min_sure} The minimum SURE value.
##'
##' \code{sure_vec} A vector of all the SURE values of all the multilinear
##' ranks that were inspected.
##'
##' \code{all_ranks} A matrix of all of the multilinear ranks that were
##' inspected.
##'
##' \code{est} The final mean estimate.
##'
##' @author David Gerard.
##'
##' @references Gerard, D., & Hoff, P. (2015).
##'   \href{http://arxiv.org/abs/1505.02114}{Adaptive Higher-order Spectral
##'   Estimators}. \emph{arXiv preprint} arXiv:1505.02114.
##'
##' @export
sure_rank <- function(X, max_nrank = dim(X), tau2 = 1) {
  p <- dim(X)
  n <- length(p)
  ## Use truncation (hard-thresholding) shrinkage along every mode, so that
  ## lambda plays the role of the multilinear rank.
  func <- vector("list", length = n)
  dfunc <- vector("list", length = n)
  for (mode_index in seq_len(n)) {
    func[[mode_index]] <- f_truncate
    dfunc[[mode_index]] <- df_truncate
  }
  C_output <- get_c(X)
  ## Every combination of per-mode ranks 1, ..., max_nrank[k].
  all_ranks <- expand.grid(lapply(max_nrank, seq_len))
  num_ranks <- nrow(all_ranks)
  sure_vec <- rep(NA_real_, length = num_ranks)
  for (index in seq_len(num_ranks)) {
    sure_vec[index] <-
      sure_given_c(C_output, func, dfunc,
                   lambda = all_ranks[index, ], tau2 = tau2)$sure_val
  }
  which_rank_min <- which.min(sure_vec)
  ## Flatten the winning data-frame row to a plain vector of ranks.
  min_rank <- c(as.matrix(all_ranks[which_rank_min, ]))
  min_sure <- sure_vec[which_rank_min]
  est <- sure_given_c(C_output, func = func, dfunc = dfunc,
                      lambda = min_rank, tau2 = tau2)$mean_est
  return(list(min_rank = min_rank, min_sure = min_sure,
              sure_vec = sure_vec, all_ranks = all_ranks, est = est))
}
##' Runs a stochastic gradient descent to minimize SURE for higher-order
##' spectral estimators.
##'
##' This function is in beta. Not sure if it works properly or is even
##' worthwhile considering that \code{\link{soft_coord}} works pretty well.
##' Also, we have to call \code{\link{get_c}} anyway, so not sure if this
##' function actually reduces the computational complexity.
##'
##' @param c_obj Output from \code{\link{get_c}}.
##' @param sgd_lambda Lambda value from sgd for lambda from hose.
##' @param sgd_lambda_c Lambda value from sgd for c from hose.
##' @param c_init Initialization for c from hose.
##' @param lambda_init Initialization for lambda from hose.
##' @param sgd_c c value from sgd.
##' @param itermax Maximum number of iterations for sgd.
##' @param tau2 Known variance.
##' @param print_current Should we print the results at each iteration?
##' @param every_iter If \code{print_current = TRUE}, then every
##'   \code{every_iter} iteration will be printed.
##' @param alpha Burnin for final estimates.
##' @param calc_final Should we calculate the final sure and estimates?
##'
##' @return A list with the iterates \code{c_vec} and \code{lambda_mat}, and,
##'   when \code{calc_final = TRUE}, the burnin-averaged \code{c_final} and
##'   \code{lambda_final} together with \code{sure_final}, the output of
##'   \code{\link{sure_given_c}} at those values.
##'
##' @author David Gerard.
##'
##' @references Gerard, D., & Hoff, P. (2015).
##'   \href{http://arxiv.org/abs/1505.02114}{Adaptive Higher-order Spectral
##'   Estimators}. \emph{arXiv preprint} arXiv:1505.02114.
sgd_given_c <- function(c_obj, sgd_lambda, sgd_lambda_c, c_init = 1,
                        lambda_init = NULL, sgd_c = 1 / 2 + 0.001,
                        itermax = 10000, tau2 = 1, print_current = TRUE,
                        every_iter = 1000, alpha = 0.2, calc_final = TRUE) {
  hosvd_x <- c_obj$hosvd_x
  c_array <- c_obj$C_array
  p <- dim(c_array)
  n <- length(p)
  c_current <- c_init
  if (is.null(lambda_init)) {
    lambda_current <- rep(0, length = n)
  } else {
    lambda_current <- lambda_init
  }
  lambda_mat <- matrix(NA_real_, nrow = itermax, ncol = n)
  c_vec <- rep(NA_real_, length = itermax)
  ## Initialize the gradients so that the progress printout below cannot
  ## reference undefined objects when every draw so far landed in the
  ## zero-gradient branch.
  grad_c <- NA_real_
  grad_current_lambda <- rep(NA_real_, length = n)
  for (sgd_index in seq_len(itermax)) {
    ## Number of singular values in each mode that exceed the current
    ## threshold.
    max_toget <- rep(NA_integer_, length = n)
    for (mode_index in seq_len(n)) {
      max_toget[mode_index] <-
        sum(hosvd_x$D[[mode_index]] > lambda_current[mode_index])
    }
    ## use bernoulli to see if I draw a 0 for gradient
    p_1 <- prod(max_toget) / prod(p)
    is_0 <- stats::rbinom(1, 1, 1 - p_1)
    if (is_0 == 1) {
      ## Zero gradient drawn: carry the current iterates forward.
      lambda_new <- lambda_current
      c_new <- c_current
      lambda_mat[sgd_index, ] <- lambda_new
      c_vec[sgd_index] <- c_new
    } else {
      ## Sample one surviving singular-value index per mode.
      index_current <- rep(NA_integer_, length = n)
      for (mode_index in seq_len(n)) {
        index_current[mode_index] <-
          sample(seq_len(max_toget[mode_index]), size = 1)
      }
      ## The sampled singular values and their shrunken versions.
      sigma_current <- rep(NA_real_, length = n)
      f_current <- rep(NA_real_, length = n)
      for (mode_index in seq_len(n)) {
        sigma_current[mode_index] <-
          hosvd_x$D[[mode_index]][index_current[mode_index]]
        f_current[mode_index] <- f_lasso(sigma_current[mode_index],
                                         lambda_current[mode_index])
      }
      f_times_sigma_inv <- f_current / sigma_current
      f_s_mult <- prod(f_times_sigma_inv)
      ## Squared core-tensor element and C-array element at the sampled
      ## multi-index.
      S2_current <- hosvd_x$S[matrix(index_current, nrow = 1)] ^ 2
      C_array_current <- c_array[matrix(index_current, nrow = 1)]
      common_to_all <- -c_current * f_s_mult ^ 2 * S2_current + f_s_mult *
        S2_current - tau2 * f_s_mult * C_array_current
      ## Matrix of 1 / (f_k * sigma_k) with a zeroed diagonal; its column
      ## sums give, for each mode, the sum over the other modes.
      inv_temp <-
        matrix(rep(1 / (f_current * sigma_current), n), nrow = n)
      diag(inv_temp) <- 0
      grad_current_lambda <- 2 * c_current *
        (common_to_all - colSums(inv_temp) *
           tau2 * f_s_mult * S2_current) / f_current
      ## Robbins-Monro style decreasing step sizes.
      step_size <- sgd_c / (sgd_lambda * sgd_index)
      lambda_new <- lambda_current - step_size * grad_current_lambda
      grad_c <- 2 * c_current * f_s_mult ^ 2 * S2_current -
        2 * f_s_mult * S2_current +
        2 * tau2 * f_s_mult * C_array_current + 2 * tau2 * f_s_mult *
        S2_current * sum(1 / (f_current * sigma_current))
      step_size_c <- sgd_c / (sgd_lambda_c * sgd_index)
      c_new <- c_current - grad_c * step_size_c
      lambda_mat[sgd_index, ] <- lambda_new
      c_vec[sgd_index] <- c_new
      c_current <- c_new
      lambda_current <- lambda_new
    }
    if (print_current && (sgd_index %% every_iter == 0)) {
      cat("Current Lambda = ", lambda_current, "\n")
      cat(" Current c = ", c_current, "\n")
      cat(" Grad of c = ", grad_c, "\n")
      cat("Grad of lambda = ", grad_current_lambda, "\n\n")
    }
  }
  if (calc_final) {
    ## get final estimates: average the iterates after a burnin of
    ## alpha * itermax iterations.
    burnin_range <- round(alpha * itermax):itermax
    c_final <- mean(c_vec[burnin_range])
    lambda_final <- colMeans(lambda_mat[burnin_range, , drop = FALSE])
    ## Mode 1 carries the overall multiplier c, so it gets the
    ## multiplicative lasso shrinker; the other modes get the plain lasso.
    lambda_list <- vector("list", length = n)
    func_lasso <- vector("list", length = n)
    dfunc_lasso <- vector("list", length = n)
    lambda_list[[1]] <- c(lambda_final[1], c_final)
    func_lasso[[1]] <- f_lasso_mult
    dfunc_lasso[[1]] <- df_lasso_mult
    for (mode_index in seq_len(n)[-1]) {
      lambda_list[[mode_index]] <- lambda_final[mode_index]
      func_lasso[[mode_index]] <- f_lasso
      dfunc_lasso[[mode_index]] <- df_lasso
    }
    sure_est <- sure_given_c(c_obj, func = func_lasso, dfunc = dfunc_lasso,
                             lambda = lambda_list, tau2 = tau2)
    return(list(c_vec = c_vec, lambda_mat = lambda_mat, c_final = c_final,
                lambda_final = lambda_final, sure_final = sure_est))
  } else {
    return(list(c_vec = c_vec, lambda_mat = lambda_mat))
  }
}