-
Notifications
You must be signed in to change notification settings - Fork 3
/
covlmc_tune.R
249 lines (248 loc) · 9.73 KB
/
covlmc_tune.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
#' Fit an optimal Variable Length Markov Chain with Covariates (coVLMC)
#'
#' This function fits a Variable Length Markov Chain with Covariates (coVLMC) to
#' a discrete time series coupled with a time series of covariates by optimizing
#' an information criterion (BIC or AIC).
#'
#' This function automates the process of fitting a large coVLMC to a discrete
#' time series with [covlmc()] and of pruning the tree (with [cutoff()] and
#' [prune()]) to get an optimal with respect to an information criterion. To
#' avoid missing long term dependencies, the function uses the `max_depth`
#' parameter as an initial guess but then relies on an automatic increase of the
#' value to make sure the initial context tree is only limited by the `min_size`
#' parameter. The initial value of the `alpha` parameter of [covlmc()] is also
#' set to a conservative value (0.5) to avoid prior simplification of the
#' context tree. This can be overridden by setting the `alpha_init` parameter to
#' a more adapted value.
#'
#' Once the initial coVLMC is obtained, the [cutoff()] and [prune()] functions
#' are used to build all the coVLMC models that could be generated using smaller
#' values of the alpha parameter. The best model is selected from this
#' collection, including the initial complex tree, as the one that minimizes the
#' chosen information criterion.
#'
#' @param x an object that can be interpreted as a discrete time series, such
#' as an integer vector or a `dts` object (see [dts()]).
#' @param covariate a data frame of covariates.
#' @param criterion criterion used to select the best model. Either `"BIC"`
#' (default) or `"AIC"` (see details).
#' @param initial specifies the likelihood function, more precisely the way the
#' first few observations for which contexts cannot be calculated are
#' integrated in the likelihood. See [loglikelihood()] for details.
#' @param alpha_init if non `NULL` used as the initial cut off parameter (in
#' quantile scale) to build the initial VLMC
#' @param min_size integer >= 1 (default: 5). Tune the minimum number of
#' observations for a context in the growing phase of the context tree (see
#' [covlmc()] for details).
#' @param max_depth integer >= 1 (default: 100). Longest context considered in
#' growing phase of the initial context tree (see details).
#' @param verbose integer >= 0 (default: 0). Verbosity level of the pruning
#' process.
#' @param save specify which BIC models are saved during the pruning process.
#' The default value `"best"` asks the function to keep only the best model
#' according to the `criterion`. When `save="initial"` the function keeps *in
#' addition* the initial (complex) model which is then pruned during the
#' selection process. When `save="all"`, the function returns all the models
#' considered during the selection process. See details for memory occupation.
#' @param trimming specify the type of trimming used when saving the
#' intermediate models, see details.
#' @param best_trimming specify the type of trimming used when saving the best
#' model and the initial one (see details).
#'
#' @returns a list with the following components:
#'
#' - `best_model`: the optimal COVLMC
#' - `criterion`: the criterion used to select the optimal VLMC
#' - `initial`: the likelihood function used to select the optimal VLMC
#' - `results`: a data frame with details about the pruning process
#' - `saved_models`: a list of intermediate COVLMCs if `save="initial"` or
#' `save="all"`. It contains an `initial` component with the large coVLMC
#' obtained first and an `all` component with a list of all the *other* coVLMC
#' obtained by pruning the initial one.
#'
#' @section Memory occupation:
#'
#' `covlmc` objects tend to be large and saving all the models during the
#' search for the optimal model can lead to an unreasonable use of memory. To
#' avoid this problem, models are kept in trimmed form only using
#' [trim.covlmc()] with `keep_model=FALSE`. Both the initial model and the
#' best one are saved untrimmed. This default behaviour corresponds to
#' `trimming="full"`. Setting `trimming="partial"` asks the function to use
#' `keep_model=TRUE` in [trim.covlmc()] for intermediate models. Finally,
#' `trimming="none"` turns off trimming, which is discouraged expected for
#' small data sets.
#'
#' In parallel processing contexts (e.g. using [foreach::%dopar%]), the memory
#' occupation of the results can become very large as models tend to keep
#' environments attached to the formulas. In this situation, it is highly
#' recommended to trim all saved models, including the best one and the
#' initial one. This can be done via the `best_trimming` parameter whose
#' possible values are identical to the ones of `trimming`.
#'
#' @export
#' @seealso [covlmc()], [cutoff()] and [prune()]
#'
#' @examples
#' pc <- powerconsumption[powerconsumption$week %in% 6:7, ]
#' rdts <- cut(pc$active_power, breaks = c(0, quantile(pc$active_power, probs = c(0.5, 1))))
#' rdts_cov <- data.frame(day_night = (pc$hour >= 7 & pc$hour <= 17))
#' rdts_best_model_tune <- tune_covlmc(rdts, rdts_cov)
#' draw(as_covlmc(rdts_best_model_tune))
tune_covlmc <- function(x, covariate, criterion = c("BIC", "AIC"),
initial = c("truncated", "specific", "extended"),
alpha_init = NULL,
min_size = 5, max_depth = 100,
verbose = 0,
save = c("best", "initial", "all"),
trimming = c("full", "partial", "none"),
best_trimming = c("none", "partial", "full")) {
criterion <- match.arg(criterion)
initial <- match.arg(initial)
best_trimming <- match.arg(best_trimming)
save <- match.arg(save)
criterion <- match.arg(criterion)
trimming <- match.arg(trimming)
## make sure that x is a dts
if (!is_dts(x)) {
x <- dts(x)
}
if (criterion == "BIC") {
f_criterion <- stats::BIC
} else {
f_criterion <- stats::AIC
}
if (is.null(alpha_init)) {
alpha <- 0.5
} else {
if (is.null(alpha_init) || !is.numeric(alpha_init) || alpha_init <= 0 || alpha_init > 1) {
stop("the alpha_init parameter must be in (0, 1]")
}
alpha <- alpha_init
}
if (verbose > 0) {
cat("Fitting a covlmc with max_depth=", max_depth, "and alpha=", alpha, "\n")
}
saved_models <- list()
base_model <- covlmc(x, covariate,
alpha = alpha, min_size = min_size,
max_depth = max_depth
)
while (base_model$max_depth) {
n_max_depth <- min(2 * max_depth, length(x) - 1)
if (n_max_depth > max_depth) {
if (verbose > 0) {
cat("Max depth reached, increasing it to", n_max_depth, "\n")
}
max_depth <- n_max_depth
base_model <- covlmc(x, covariate,
alpha = alpha, min_size = min_size,
max_depth = max_depth
)
} else {
warning("cannot find a suitable value for max_depth")
break
}
}
results <- NULL
model <- base_model
max_order <- depth(model)
best_crit <- Inf
if (verbose > 0) {
cat("Initial criterion=", best_crit, "\n")
}
if (save == "all") {
all_models <- list()
}
repeat {
if (initial == "truncated") {
ll <- loglikelihood(model,
newdata = dts_data(x), initial = "truncated",
ignore = max_order, newcov = covariate
)
} else {
ll <- stats::logLik(model, initial = initial)
}
crit <- f_criterion(ll)
if (crit <= best_crit) {
best_crit <- crit
best_model <- model
if (verbose > 0) {
cat("Improving criterion=", best_crit, "\n")
}
}
a_result <- data.frame(
alpha = alpha,
depth = depth(model),
nb_contexts = context_number(model),
loglikelihood = ll,
cov_depth = covariate_depth(model),
AIC = stats::AIC(ll),
BIC = stats::BIC(ll)
)
if (is.null(results)) {
results <- a_result
} else {
results <- rbind(results, a_result)
}
new_alphas <- cutoff(model)
if (is.null(new_alphas) || all(new_alphas >= alpha)) {
break
} else {
alpha <- max(new_alphas[new_alphas < alpha])
if (alpha <= 0) {
break
}
if (verbose > 0) {
cat("Pruning covlmc with alpha=", alpha, "\n")
}
model <- prune(model, alpha = alpha)
if (save == "all") {
saved_model <- model
if (trimming == "full") {
saved_model <- trim(model)
} else if (trimming == "partial") {
saved_model <- trim(model, keep_model = TRUE)
}
all_models <- c(all_models, list(saved_model))
}
}
}
pre_result <- list(
best_model = best_model,
criterion = criterion,
initial = initial,
results = results
)
if (best_trimming == "partial") {
pre_result$best_model <- trim(best_model, keep_model = TRUE)
} else if (best_trimming == "full") {
pre_result$best_model <- trim(best_model)
}
if (save == "all") {
pre_result[["saved_models"]] <- list(initial = base_model, all = all_models)
} else if (save == "initial") {
pre_result[["saved_models"]] <- list(initial = base_model)
}
if (!is.null(pre_result[["saved_models"]])) {
if (best_trimming == "partial") {
pre_result[["saved_models"]]$initial <- trim(pre_result[["saved_models"]]$initial, keep_model = TRUE)
} else if (best_trimming == "full") {
pre_result[["saved_models"]]$initial <- trim(pre_result[["saved_models"]]$initial)
}
}
structure(pre_result, class = "tune_covlmc")
}
#' @export
print.tune_covlmc <- function(x, ...) {
print(x$best_model)
cat(" Selected by", x$criterion, "(")
if (x$criterion == "BIC") {
cat(min(x$results$BIC))
} else {
cat(min(x$results$AIC))
}
cat(") with likelihood function \"", x$initial, "\" (", sep = "")
cat(loglikelihood(x$best_model))
cat(")\n")
invisible(x)
}