-
Notifications
You must be signed in to change notification settings - Fork 0
/
tcor.R
642 lines (591 loc) · 26.8 KB
/
tcor.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
#' Compute time varying correlation coefficients
#'
#' The function `tcor()` implements (together with its helper function
#' `calc_rho()`) the nonparametric estimation of the time varying correlation
#' coefficient proposed by Choi & Shin (2021). The general idea is to compute a
#' (Pearson) correlation coefficient (\eqn{r(x,y) = \frac{\hat{xy} - \hat{x}\times\hat{y}}{
#' \sqrt{\hat{x^2}-\hat{x}^2} \times \sqrt{\hat{y^2}-\hat{y}^2}}}), but instead of
#' using the means required for such a computation, each component (i.e.,
#' \eqn{x}, \eqn{y}, \eqn{x^2}, \eqn{y^2}, \eqn{x \times y}) is smoothed and the
#' smoothed terms are considered in place of the original means. The intensity of
#' the smoothing depends on a unique parameter: the bandwidth (`h`). If `h =
#' Inf`, the method produces the original (i.e., time-invariant) correlation
#' value. The smaller the parameter `h`, the more variation in time is being
#' captured. The parameter `h` can be provided by the user; otherwise it is
#' automatically estimated by the internal helper functions `select_h()` and
#' `calc_RMSE()` (see **Details**).
#'
#' - **Smoothing**: the smoothing of each component is performed by kernel
#' regression. The default is to use the Epanechnikov kernel following Choi &
#' Shin (2021), but other kernels have also been implemented and can thus
#' alternatively be used (see [`kern_smooth()`] for details). The normal kernel
#' seems to sometimes lead to very small bandwidth being selected, but the
#' default kernel can lead to numerical issues (see next point). We thus
#' recommend always comparing the results from different kernel methods.
#'
#' - **Numerical issues**: some numerical issues can happen because the smoothing
#' is performed independently on each component of the correlation coefficient.
#' As a consequence, some relationship between components may become violated
#' for some time points. For instance, if the square of the smoothed \eqn{x} term
#' gets larger than the smoothed \eqn{x^2} term, the variance of \eqn{x} would become
#' negative. In such cases, coefficient values returned are `NA`.
#'
#' - **Bandwidth selection**: when the value used to define the bandwidth (`h`)
#' in `tcor()` is set to `NULL` (the default), the internal function `select_h()`
#' is used to select the optimal value for `h`. It is first estimated by
#' leave-one-out cross validation (using internally `calc_RMSE()`). If the cross
#' validation error (RMSE) is minimal for the maximal value of `h` considered
#' (\eqn{8\sqrt{N}}), rather than taking this as the optimal `h` value, the
#' bandwidth becomes estimated using the so-called elbow criterion. This latter
#' method identifies the value `h` after which the cross validation error
#' decreases very little. The procedure is detailed in section 2.1 in Choi &
#' Shin (2021).
#'
#' - **Parallel computation**: if `h` is not provided, an automatic bandwidth
#' selection occurs (see above). For large datasets, this step can be
#' computationally demanding. The current implementation thus relies on
#' [`parallel::mclapply()`] and is thus only effective for Linux and MacOS.
#' Relying on parallel processing also implies that you call `options("mc.cores"
#' = XX)` beforehand, replacing `XX` by the relevant number of CPU cores you
#' want to use (see **Examples**). For debugging, do use `options("mc.cores" =
#' 1)`, otherwise you may not be able to see the error messages generated in
#' child nodes.
#'
#' - **Confidence interval**: if `CI` is set to `TRUE`, a confidence interval is
#' calculated as described in Choi & Shin (2021). This is also necessary for using
#' [`test_equality()`] to test differences between correlations at two time points.
#' The computation of the confidence intervals involves multiple internal
#' functions (see [`CI`] for details).
#'
#' @inheritParams kern_smooth
#' @param x a numeric vector.
#' @param y a numeric vector to be correlated with `x`.
#' @param CI a logical specifying if a confidence interval should be computed or not (default = `FALSE`).
#' @param CI.level a scalar defining the level for `CI` (default = 0.95 for 95% CI).
#' @param cor.method a character string indicating which correlation coefficient
#' is to be computed ("pearson", the default; or "spearman").
#' @param keep.missing a logical specifying if time points associated with missing
#' information should be kept in the output (default = `FALSE` to facilitate plotting).
#' @param verbose a logical specifying if information should be displayed to
#' monitor the progress of the cross validation (default = `FALSE`).
#'
#' @name tcor
#' @rdname tcor
#'
#' @references
#' Choi, JE., Shin, D.W. Nonparametric estimation of time varying correlation coefficient.
#' J. Korean Stat. Soc. 50, 333–353 (2021). \doi{10.1007/s42952-020-00073-6}
#'
#' @seealso [`test_equality`], [`kern_smooth`], [`CI`]
#'
#' @importFrom parallel mclapply
#'
NULL
#' @describeIn tcor **the user-level function to be used**.
#'
#' @return
#' **---Output for `tcor()`---**
#'
#' A 2 x \eqn{t} dataframe containing:
#' - the time points (`t`).
#' - the estimates of the correlation value (`r`).
#'
#' Or, if `CI = TRUE`, a 5 x \eqn{t} dataframe containing:
#' - the time points (`t`).
#' - the estimates of the correlation value (`r`).
#' - the Standard Error (`SE`).
#' - the lower boundary of the confidence intervals (`lwr`).
#' - the upper boundary of the confidence intervals (`upr`).
#'
#' Some metadata are also attached to the dataframe (as attributes):
#' - the call to the function (`call`).
#' - the argument `CI`.
#' - the bandwidth parameter (`h`).
#' - the method used to select `h` (`h_selection`).
#' - the minimal root mean square error when `h` is selected (`RMSE`).
#' - the computing time (in seconds) spent to select the bandwidth parameter (`h_select_duration`) if `h` is automatically selected.
#'
#' @order 1
#'
#' @export
#'
#' @examples
#'
#' #####################################################
#' ## Examples for the user-level function to be used ##
#' #####################################################
#'
#' ## Effect of the bandwidth
#'
#' res_h50 <- with(stockprice, tcor(x = SP500, y = FTSE100, t = DateID, h = 50))
#' res_h100 <- with(stockprice, tcor(x = SP500, y = FTSE100, t = DateID, h = 100))
#' res_h200 <- with(stockprice, tcor(x = SP500, y = FTSE100, t = DateID, h = 200))
#' plot(res_h50, type = "l", ylab = "Cor", xlab = "Time", las = 1, col = "grey")
#' points(res_h100, type = "l", col = "blue")
#' points(res_h200, type = "l", col = "red")
#' legend("bottom", horiz = TRUE, fill = c("grey", "blue", "red"),
#' legend = c("50", "100", "200"), bty = "n", title = "Bandwidth (h)")
#'
#'
#' ## Effect of the correlation method
#'
#' res_pearson <- with(stockprice, tcor(x = SP500, y = FTSE100, t = DateID, h = 150))
#' res_spearman <- with(stockprice, tcor(x = SP500, y = FTSE100, t = DateID, h = 150,
#' cor.method = "spearman"))
#' plot(res_pearson, type = "l", ylab = "Cor", xlab = "Time", las = 1)
#' points(res_spearman, type = "l", col = "blue")
#' legend("bottom", horiz = TRUE, fill = c("black", "blue"),
#' legend = c("pearson", "spearman"), bty = "n", title = "cor.method")
#'
#'
#' ## Infinite bandwidth should match fixed correlation coefficients
#' ## nb: `h = Inf` is not supported by default kernel (`kernel = 'epanechnikov'`)
#'
#' res_pearson_hInf <- with(stockprice, tcor(x = SP500, y = FTSE100, t = DateID, h = Inf,
#' kernel = "normal"))
#' res_spearman_hInf <- with(stockprice, tcor(x = SP500, y = FTSE100, t = DateID, h = Inf,
#' kernel = "normal", cor.method = "spearman"))
#' r <- cor(stockprice$SP500, stockprice$FTSE100, use = "pairwise.complete.obs")
#' rho <- cor(stockprice$SP500, stockprice$FTSE100, method = "spearman", use = "pairwise.complete.obs")
#' round(unique(res_pearson_hInf$r) - r, digits = 3) ## 0 indicates near equality
#' round(unique(res_spearman_hInf$r) - rho, digits = 3) ## 0 indicates near equality
#'
#'
#' ## Computing and plotting the confidence interval
#'
#' res_withCI <- with(stockprice, tcor(x = SP500, y = FTSE100, t = DateID, h = 200, CI = TRUE))
#' with(res_withCI, {
#' plot(r ~ t, type = "l", ylab = "Cor", xlab = "Time", las = 1, ylim = c(0, 1))
#' points(lwr ~ t, type = "l", lty = 2)
#' points(upr ~ t, type = "l", lty = 2)})
#'
#'
#' ## Same using tidyverse packages (dplyr and ggplot2 must be installed)
#' ## see https://github.com/courtiol/timevarcorr for more examples of this kind
#'
#' if (require("dplyr", warn.conflicts = FALSE)) {
#'
#' stockprice |>
#' reframe(tcor(x = SP500, y = FTSE100, t = DateID,
#' h = 200, CI = TRUE)) -> res_tidy
#' res_tidy
#' }
#'
#' if (require("ggplot2")) {
#'
#' ggplot(res_tidy) +
#' aes(x = t, y = r, ymin = lwr, ymax = upr) +
#' geom_ribbon(fill = "grey") +
#' geom_line() +
#' labs(title = "SP500 vs FTSE100", x = "Time", y = "Correlation") +
#' theme_classic()
#'
#' }
#'
#'
#' ## Automatic selection of the bandwidth using parallel processing and comparison
#' ## of the 3 alternative kernels on the first 500 time points of the dataset
#' # nb: takes a few seconds to run, so not run by default
#'
#' run <- in_pkgdown() || FALSE ## change to TRUE to run the example
#' if (run) {
#'
#' options("mc.cores" = 2L) ## CPU cores to be used for parallel processing
#'
#' res_hauto_epanech <- with(stockprice[1:500, ],
#' tcor(x = SP500, y = FTSE100, t = DateID, kernel = "epanechnikov")
#' )
#'
#' res_hauto_box <- with(stockprice[1:500, ],
#' tcor(x = SP500, y = FTSE100, t = DateID, kernel = "box")
#' )
#'
#' res_hauto_norm <- with(stockprice[1:500, ],
#' tcor(x = SP500, y = FTSE100, t = DateID, kernel = "norm")
#' )
#'
#' plot(res_hauto_epanech, type = "l", col = "red",
#' ylab = "Cor", xlab = "Time", las = 1, ylim = c(0, 1))
#' points(res_hauto_box, type = "l", col = "grey")
#' points(res_hauto_norm, type = "l", col = "orange")
#' legend("top", horiz = TRUE, fill = c("red", "grey", "orange"),
#' legend = c("epanechnikov", "box", "normal"), bty = "n",
#' title = "Kernel")
#'
#' }
#'
#'
#' ## Comparison of the 3 alternative kernels under same bandwidth
#' ## nb: it requires to have run the previous example
#'
#' if (run) {
#'
#' res_epanech <- with(stockprice[1:500, ],
#' tcor(x = SP500, y = FTSE100, t = DateID,
#' kernel = "epanechnikov", h = attr(res_hauto_epanech, "h"))
#' )
#'
#' res_box <- with(stockprice[1:500, ],
#' tcor(x = SP500, y = FTSE100, t = DateID,
#' kernel = "box", h = attr(res_hauto_epanech, "h"))
#' )
#'
#' res_norm <- with(stockprice[1:500, ],
#' tcor(x = SP500, y = FTSE100, t = DateID,
#' kernel = "norm", h = attr(res_hauto_epanech, "h"))
#' )
#'
#' plot(res_epanech, type = "l", col = "red", ylab = "Cor", xlab = "Time",
#' las = 1, ylim = c(0, 1))
#' points(res_box, type = "l", col = "grey")
#' points(res_norm, type = "l", col = "orange")
#' legend("top", horiz = TRUE, fill = c("red", "grey", "orange"),
#' legend = c("epanechnikov", "box", "normal"), bty = "n",
#' title = "Kernel")
#'
#' }
#'
#' ## Automatic selection of the bandwidth using parallel processing with CI
#' # nb: takes a few seconds to run, so not run by default
#'
#' run <- in_pkgdown() || FALSE ## change to TRUE to run the example
#' if (run) {
#'
#' res_hauto_epanechCI <- with(stockprice[1:500, ],
#' tcor(x = SP500, y = FTSE100, t = DateID, CI = TRUE)
#' )
#'
#' plot(res_hauto_epanechCI[, c("t", "r")], type = "l", col = "red",
#' ylab = "Cor", xlab = "Time", las = 1, ylim = c(0, 1))
#' points(res_hauto_epanechCI[, c("t", "lwr")], type = "l", col = "red", lty = 2)
#' points(res_hauto_epanechCI[, c("t", "upr")], type = "l", col = "red", lty = 2)
#'
#' }
#'
#'
#' ## Not all kernels work well in all situations
#' ## Here the default kernel estimation leads to issues for the last time points
#' ## nb1: EuStockMarkets is a time-series object provided with R
#' ## nb2: takes a few minutes to run, so not run by default
#'
#' run <- in_pkgdown() || FALSE ## change to TRUE to run the example
#' if (run) {
#'
#' EuStock_epanech <- tcor(EuStockMarkets[1:500, "DAX"], EuStockMarkets[1:500, "SMI"])
#' EuStock_norm <- tcor(EuStockMarkets[1:500, "DAX"], EuStockMarkets[1:500, "SMI"], kernel = "normal")
#'
#' plot(EuStock_epanech, type = "l", col = "red", las = 1, ylim = c(-1, 1))
#' points(EuStock_norm, type = "l", col = "orange", lty = 2)
#' legend("bottom", horiz = TRUE, fill = c("red", "orange"),
#' legend = c("epanechnikov", "normal"), bty = "n",
#' title = "Kernel")
#' }
#'
#'
tcor <- function(x, y, t = seq_along(x), h = NULL, cor.method = c("pearson", "spearman"),
                 kernel = c("epanechnikov", "box", "normal"), CI = FALSE, CI.level = 0.95,
                 param_smoother = list(), keep.missing = FALSE,
                 verbose = FALSE) {

  ## Strip out missing data: calc_rho() refuses NAs by design, so time points
  ## with NA in x, y or t are removed before smoothing.
  missing <- is.na(x) | is.na(y) | is.na(t)
  x_ori <- x[!missing] ## ori = original, i.e., not smoothed
  y_ori <- y[!missing]
  t_ori <- t[!missing]

  ## If the bandwidth is not given, estimate it (see *Bandwidth selection* in Details)
  if (is.null(h)) {
    bandwidth_obj <- select_h(x = x_ori, y = y_ori, t = t_ori, cor.method = cor.method,
                              kernel = kernel, param_smoother = param_smoother,
                              verbose = verbose)
  } else {
    bandwidth_obj <- list(h = h, h_selection = "fixed by user", RMSE = NA, time = NULL)
  }

  ## Compute the time varying correlation with the selected bandwidth
  res_all <- calc_rho(x = x_ori, y = y_ori, t = t_ori, h = bandwidth_obj$h,
                      cor.method = cor.method, kernel = kernel,
                      param_smoother = param_smoother)

  ## Format data for output: keep only time points and the smoothed coefficient
  res <- res_all[, c("t", "rho_smoothed")]

  ## Clamp point estimates into [-1, 1]: independent smoothing of each component
  ## can push the estimate slightly out of range (see *Numerical issues* in Details)
  out_of_range <- res$rho_smoothed > 1 | res$rho_smoothed < -1
  if (any(out_of_range, na.rm = TRUE)) {
    res$rho_smoothed[res$rho_smoothed > 1] <- 1
    res$rho_smoothed[res$rho_smoothed < -1] <- -1
    warning(paste(sum(out_of_range, na.rm = TRUE), "out of", length(out_of_range),
                  "correlation values were estimated out of the [-1, 1] range and were thus forced to [-1, 1]. Using another kernel may avoid such problem."))
  }

  ## Compute the confidence interval if requested (see ?CI for internals)
  if (CI) {
    res$SE <- calc_SE(smoothed_obj = res_all, h = bandwidth_obj$h, AR.method = "yule-walker") ## for now fixed AR.method
    res$lwr <- res$rho_smoothed + stats::qnorm((1 - CI.level)/2)*res$SE
    res$upr <- res$rho_smoothed + stats::qnorm((1 + CI.level)/2)*res$SE
    ## Clamp CI boundaries into [-1, 1] as well
    out_of_range <- res$lwr > 1 | res$lwr < -1 | res$upr > 1 | res$upr < -1
    if (any(out_of_range, na.rm = TRUE)) {
      res$lwr[res$lwr > 1] <- 1
      res$lwr[res$lwr < -1] <- -1
      res$upr[res$upr > 1] <- 1
      res$upr[res$upr < -1] <- -1
      warning(paste(sum(out_of_range, na.rm = TRUE), "out of", length(out_of_range),
                    "CI boundary values were estimated out of the [-1, 1] range and were thus forced to [-1, 1]. Using another kernel may avoid such problem."))
    }
  }

  ## Rename the correlation column for the user-facing output
  colnames(res)[colnames(res) == "rho_smoothed"] <- "r"

  ## Restore time points associated with missing values if requested
  ## (merge reintroduces all original t values, filling r/SE/lwr/upr with NA)
  if (keep.missing) {
    res <- merge(res, data.frame(t = t), all.y = TRUE)
  }

  ## Store other useful info as attributes (metadata documented in @return)
  attr(res, "call") <- match.call()
  attr(res, "CI") <- CI
  attr(res, "h") <- bandwidth_obj$h
  attr(res, "RMSE") <- bandwidth_obj$RMSE
  attr(res, "h_selection") <- bandwidth_obj$h_selection
  attr(res, "h_select_duration") <- bandwidth_obj$time
  res
}
#' @describeIn tcor computes the correlation for a given bandwidth.
#'
#' The function calls the kernel smoothing procedure on each component required
#' to compute the time-varying correlation.
#'
#' @return
#' **---Output for `calc_rho()`---**
#'
#' A 14 x \eqn{t} dataframe with:
#' - the five raw components of correlation (`x`, `y`, `x2`, `y2`, `xy`).
#' - the time points (`t`).
#' - the five raw components of correlation after smoothing (`x_smoothed`, `y_smoothed`, `x2_smoothed`, `y2_smoothed`, `xy_smoothed`).
#' - the standard deviation around \eqn{x} and \eqn{y} (`sd_x_smoothed`, `sd_y_smoothed`).
#' - the smoothed correlation coefficient (`rho_smoothed`).
#' @order 2
#'
#' @export
#'
#' @examples
#'
#'
#' ##################################################################
#' ## Examples for the internal function computing the correlation ##
#' ##################################################################
#'
#' ## Computing the correlation and its component for the first six time points
#'
#' with(head(stockprice), calc_rho(x = SP500, y = FTSE100, t = DateID, h = 20))
#'
#'
#' ## Predicting the correlation and its component at a specific time point
#'
#' with(head(stockprice), calc_rho(x = SP500, y = FTSE100, t = DateID, h = 20,
#' t.for.pred = DateID[1]))
#'
#'
#' ## The function can handle non consecutive time points
#'
#' set.seed(1)
#' calc_rho(x = rnorm(10), y = rnorm(10), t = c(1:5, 26:30), h = 3, kernel = "box")
#'
#'
#' ## The function can handle non-ordered time series
#'
#' with(head(stockprice)[c(1, 3, 6, 2, 4, 5), ], calc_rho(x = SP500, y = FTSE100, t = DateID, h = 20))
#'
#'
#' ## Note: the function does not handle missing data (by design)
#'
#' # calc_rho(x = c(NA, rnorm(9)), y = rnorm(10), t = c(1:2, 23:30), h = 2) ## should err (if ran)
#'
calc_rho <- function(x, y, t = seq_along(x), t.for.pred = t, h, cor.method = c("pearson", "spearman"),
                     kernel = c("epanechnikov", "box", "normal"), param_smoother = list()) {
  ## Validate inputs: NAs are rejected by design, r would otherwise lose its bounds
  if (any(is.na(c(x, y, t)))) {
    stop("Computing correlation requires x, y & t not to contain any NAs; otherwise, r becomes meaningless as it would no longer be bounded between [-1, 1].")
  }
  if (length(x) != length(y)) stop("x and y must have same length")
  if (length(x) != length(t)) stop("t must have same length as x and y")
  #if (length(x) == 0) stop("missing data for requested time(s)")
  if (length(h) != 1) stop("h must be a scalar (numerical value of length 1)")
  cor.method <- match.arg(cor.method)

  ## A Spearman correlation is a Pearson correlation computed on ranked data
  if (cor.method == "spearman") {
    x <- rank(x)
    y <- rank(y)
  }

  ## Matrix holding the raw components entering the correlation coefficient
  U <- cbind(x = x, y = y, x2 = x^2, y2 = y^2, xy = x*y)

  ## Kernel smoothing of each component; the first (separate) call on x also
  ## retrieves t once, the remaining components are smoothed via a small helper
  x_smoothed <- kern_smooth(x = x, t = t, h = h, t.for.pred = t.for.pred,
                            kernel = kernel, param_smoother = param_smoother,
                            output = "list")
  smooth_component <- function(component) {
    kern_smooth(x = U[, component], t = t, h = h, t.for.pred = t.for.pred,
                kernel = kernel, param_smoother = param_smoother,
                output = "list")$x
  }
  remaining <- colnames(U)[-1]
  other_smoothed_list <- lapply(remaining, smooth_component)
  names(other_smoothed_list) <- remaining
  ## Kept as a list (never coerced into a matrix) so that t can stay a Date
  smoothed <- c(x_smoothed, other_smoothed_list)

  ## Time varying correlation: smoothed terms stand in for the usual means.
  ## Negative smoothed variances are numerical artefacts and yield NA (see
  ## *Numerical issues* in ?tcor)
  var_x <- smoothed$x2 - smoothed$x^2
  var_y <- smoothed$y2 - smoothed$y^2
  smoothed$sd_x <- sqrt(ifelse(var_x > 0, var_x, NA))
  smoothed$sd_y <- sqrt(ifelse(var_y > 0, var_y, NA))
  if (any(is.na(c(smoothed$sd_x, smoothed$sd_y)))) {
    message(paste("\nNumerical issues occured when computing the correlation values for the bandwidth value h =", round(h, digits = 2), "resulting in `NA`(s) (see Details on *Numerical issues* in `?tcor`).\n You may want to:\n - try another bandwidth value using the argument `h`\n - try another kernel using the argument `kernel`\n - adjust the smoothing parameters using the argument `param_smoother` (see `?kern_smooth`).\n This may not be an issue, if you are estimating `h` automatically, as issues may occur only for sub-optimal bandwidth values.\n"))
  }
  smoothed$rho <- (smoothed$xy - smoothed$x * smoothed$y) / (smoothed$sd_x * smoothed$sd_y)

  ## Suffix the component names so it is clear they are smoothed (t stays as-is)
  names(smoothed) <- c(names(smoothed)[1], paste0(names(smoothed)[-1], "_smoothed"))

  ## Prepend the original (non-smoothed) components at the prediction time points
  U_at_t_for_pred <- U[match(t.for.pred, t), , drop = FALSE]
  cbind(U_at_t_for_pred, data.frame(smoothed))
}
#' @describeIn tcor Internal function computing the root mean square error (RMSE) for a given bandwidth.
#'
#' The function removes each time point one by one and predicts the correlation
#' at the missing time point based on the other time points. It then computes
#' and returns the RMSE between this predicted correlation and the one predicted
#' using the full dataset. See also *Bandwidth selection* and *Parallel
#' computation* in **Details**.
#'
#' @return
#' **---Output for `calc_RMSE()`---**
#'
#' A scalar of class numeric corresponding to the RMSE.
#'
#' @order 3
#'
#' @export
#'
#' @examples
#'
#'
#' ###########################################################
#' ## Examples for the internal function computing the RMSE ##
#' ###########################################################
#'
#' ## Compute the RMSE on the correlation estimate
#' # nb: takes a few seconds to run, so not run by default
#'
#' run <- in_pkgdown() || FALSE ## change to TRUE to run the example
#' if (run) {
#'
#' small_clean_dataset <- head(na.omit(stockprice), n = 200)
#' with(small_clean_dataset, calc_RMSE(x = SP500, y = FTSE100, t = DateID, h = 10))
#'
#' }
#'
#'
calc_RMSE <- function(h, x, y, t = seq_along(x), cor.method = c("pearson", "spearman"),
                      kernel = c("epanechnikov", "box", "normal"), param_smoother = list(),
                      verbose = FALSE) {
  ## Leave-one-out cross validation: drop each time point in turn, refit the
  ## smoothed correlation, and square the prediction error at the left-out point.
  ## TODO: try to make this Windows friendly
  loo_sq_errors <- mclapply(seq_along(t), function(left_out) {
    fit <- calc_rho(x = x[-left_out], y = y[-left_out], t = t[-left_out], h = h,
                    t.for.pred = t[left_out], cor.method = cor.method,
                    kernel = kernel, param_smoother = param_smoother)
    r_pred <- ((x[left_out] - fit$x_smoothed) * (y[left_out] - fit$y_smoothed)) /
      (fit$sd_x_smoothed * fit$sd_y_smoothed)
    (fit$rho_smoothed - r_pred)^2
  })
  ## Fold-level failures (numerical issues in calc_rho) show up as NA and are
  ## excluded from the mean
  sq_errors <- as.numeric(loo_sq_errors)
  failing <- is.na(sq_errors)
  res <- sqrt(mean(sq_errors[!failing]))
  if (verbose) {
    print(paste("h =", round(h, digits = 1L), " # failing =", sum(failing), " RMSE =", res))
  }
  res
}
#' @describeIn tcor Internal function selecting the optimal bandwidth parameter `h`.
#'
#' The function selects and returns the optimal bandwidth parameter `h` using an
#' optimizer ([stats::optimize()]) which searches the `h` value associated with
#' the smallest RMSE returned by [calc_RMSE()]. See also *Bandwidth selection*
#' in **Details**.
#'
#' @order 4
#'
#' @return
#' **---Output for `select_h()`---**
#'
#' A list with the following components:
#' - the selected bandwidth parameter (`h`).
#' - the method used to select `h` (`h_selection`).
#' - the minimal root mean square error when `h` is selected (`RMSE`).
#' - the computing time (in seconds) spent to select the bandwidth parameter (`time`).
#'
#' @export
#'
#' @examples
#'
#'
#' ################################################################
#' ## Examples for the internal function selecting the bandwidth ##
#' ################################################################
#'
#' ## Automatic selection of the bandwidth using parallel processing
#' # nb: takes a few seconds to run, so not run by default
#'
#' run <- in_pkgdown() || FALSE ## change to TRUE to run the example
#' if (run) {
#'
#' small_clean_dataset <- head(na.omit(stockprice), n = 200)
#' with(small_clean_dataset, select_h(x = SP500, y = FTSE100, t = DateID))
#'
#' }
#'
select_h <- function(x, y, t = seq_along(x), cor.method = c("pearson", "spearman"),
                     kernel = c("epanechnikov", "box", "normal"), param_smoother = list(),
                     verbose = FALSE) {
  RMSE <- NA # default value for export if not computed

  ## Check the mc.cores option used by mclapply() inside calc_RMSE()
  if (is.null(options("mc.cores")[[1]])) {
    mc.cores <- 1
  } else {
    mc.cores <- options("mc.cores")[[1]]
  }
  if (Sys.info()[['sysname']] != "Windows" && parallel::detectCores() < mc.cores) {
    stop(paste("\nYour computer does not allow such a large value for the argument `mc.cores`. The maximum value you may consider is", parallel::detectCores(), ".\n"))
  }
  if (Sys.info()[['sysname']] != "Windows" && parallel::detectCores() > 1 && mc.cores == 1) {
    message("\nYou may use several CPU cores for faster computation by calling `options('mc.cores' = XX)` with `XX` corresponding to the number of CPU cores to be used.\n")
  }
  if (Sys.info()[['sysname']] == "Windows" && mc.cores > 1) {
    message("\nThe argument `mc.cores` is ignored on Windows-based infrastructure.\n")
  }

  h_selection <- "LOO-CV"
  if (length(x) > 500) message("Bandwidth selection using cross validation... (may take a while)")

  time <- system.time({
    ## Step 1: estimate h by leave-one-out cross validation over (1, h_max),
    ## with h_max = 8*sqrt(N) following Choi & Shin (2021)
    h_max <- 8*sqrt(length(x))
    opt <- stats::optimize(calc_RMSE, interval = c(1, h_max),
                           x = x, y = y, t = t, cor.method = cor.method,
                           kernel = kernel, param_smoother = param_smoother,
                           verbose = verbose,
                           tol = 1) ## TODO: check if other optimizer would be best
    h <- opt$minimum
    RMSE <- opt$objective
    message(paste("h selected using LOO-CV =", round(h, digits = 1L)))

    ## Step 2: if the RMSE at the boundary h_max does not exceed the CV optimum,
    ## the CV curve is still decreasing at the boundary; fall back on the elbow
    ## criterion instead (section 2.1 in Choi & Shin 2021)
    RMSE_bound <- calc_RMSE(h = h_max, x = x, y = y, t = t, cor.method = cor.method,
                            kernel = kernel, param_smoother = param_smoother,
                            verbose = FALSE)
    if (RMSE_bound <= opt$objective) {
      RMSE <- NA # remove stored value since not used in this case
      h_selection <- "elbow criterion"
      if (length(x) > 500) message("Bandwidth selection using elbow criterion... (may take a while)")

      ## RMSE of a linear fit to the CV curve left of the candidate breakpoint h0
      calc_RMSE_Lh0 <- function(h0) {
        d <- data.frame(h = 1:h0)
        ## fix: evaluate calc_RMSE at each candidate bandwidth d$h (the original
        ## code iterated over the scalar `h`, recycling a single constant value,
        ## which made the linear fits and the elbow criterion meaningless)
        d$RMSE_h <- sapply(d$h, calc_RMSE,
                           x = x, y = y, t = t, cor.method = cor.method,
                           kernel = kernel, param_smoother = param_smoother,
                           verbose = FALSE)
        fit_Lh0 <- stats::lm(RMSE_h ~ h, data = d)
        sqrt(mean(stats::residuals(fit_Lh0)^2))
      }
      ## RMSE of a linear fit to the CV curve right of the candidate breakpoint h0
      calc_RMSE_Rh0 <- function(h0) {
        d <- data.frame(h = (h0 + 1):h_max)
        d$RMSE_h <- sapply(d$h, calc_RMSE, ## same fix as in calc_RMSE_Lh0
                           x = x, y = y, t = t, cor.method = cor.method,
                           kernel = kernel, param_smoother = param_smoother,
                           verbose = FALSE)
        fit_Rh0 <- stats::lm(RMSE_h ~ h, data = d)
        sqrt(mean(stats::residuals(fit_Rh0)^2))
      }
      ## Weighted combination of the two one-sided fits; its minimiser is the elbow
      calc_RMSE_h0 <- function(h0) {
        (h0 - 1)/(h_max - 1)*calc_RMSE_Lh0(h0) + (h_max - h0)/(h_max - 1)*calc_RMSE_Rh0(h0)
      }
      opt <- stats::optimize(calc_RMSE_h0, interval = c(3, h_max - 1), tol = 1)
      h <- opt$minimum
      message(paste("h selected using elbow criterion =", round(h, digits = 1L)))
    }
  })
  message(paste("Bandwidth automatic selection completed in", round(time[3], digits = 1L), "seconds"))
  list(h = h, h_selection = h_selection, RMSE = RMSE, time = time[3])
}