diff --git a/NAMESPACE b/NAMESPACE
index 55c8c969..ae7320b3 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -19,6 +19,7 @@ S3method(autoplot,canned_epipred)
 S3method(autoplot,epi_workflow)
 S3method(bake,check_enough_train_data)
 S3method(bake,epi_recipe)
+S3method(bake,step_adjust_latency)
 S3method(bake,step_epi_ahead)
 S3method(bake,step_epi_lag)
 S3method(bake,step_growth_rate)
@@ -225,9 +226,12 @@ importFrom(dplyr,"%>%")
 importFrom(dplyr,across)
 importFrom(dplyr,all_of)
 importFrom(dplyr,group_by)
+importFrom(dplyr,join_by)
+importFrom(dplyr,left_join)
 importFrom(dplyr,n)
 importFrom(dplyr,pull)
 importFrom(dplyr,rowwise)
+importFrom(dplyr,select)
 importFrom(dplyr,summarise)
 importFrom(dplyr,tibble)
 importFrom(dplyr,ungroup)
@@ -236,6 +240,7 @@ importFrom(generics,augment)
 importFrom(generics,fit)
 importFrom(generics,forecast)
 importFrom(ggplot2,autoplot)
+importFrom(glue,glue)
 importFrom(hardhat,refresh_blueprint)
 importFrom(hardhat,run_mold)
 importFrom(magrittr,"%>%")
@@ -249,6 +254,7 @@ importFrom(rlang,"%||%")
 importFrom(rlang,":=")
 importFrom(rlang,as_function)
 importFrom(rlang,caller_env)
+importFrom(rlang,enquos)
 importFrom(rlang,global_env)
 importFrom(rlang,is_null)
 importFrom(rlang,set_names)
@@ -275,3 +281,4 @@ importFrom(vctrs,vec_data)
 importFrom(vctrs,vec_ptype_abbr)
 importFrom(vctrs,vec_ptype_full)
 importFrom(vctrs,vec_recycle_common)
+importFrom(workflows,extract_preprocessor)
diff --git a/R/arx_classifier.R b/R/arx_classifier.R
index bda134da..d0e5284a 100644
--- a/R/arx_classifier.R
+++ b/R/arx_classifier.R
@@ -54,11 +54,18 @@ arx_classifier <- function(
   wf <- arx_class_epi_workflow(epi_data, outcome, predictors, trainer, args_list)
   wf <- generics::fit(wf, epi_data)
+  latency_adjust_fd <- if (is.null(args_list$adjust_latency)) {
+    max(epi_data$time_value)
+  } else {
+    attributes(epi_data)$metadata$as_of
+  }
+  forecast_date <- args_list$forecast_date %||% latency_adjust_fd
+  target_date <- args_list$target_date %||% (forecast_date + args_list$ahead)
   preds <- forecast(
     wf,
-    fill_locf = TRUE,
+    fill_locf = is.null(args_list$adjust_latency),
     n_recent = args_list$nafill_buffer,
-    forecast_date = args_list$forecast_date %||% max(epi_data$time_value)
+    forecast_date = forecast_date
   ) %>%
     tibble::as_tibble() %>%
     dplyr::select(-time_value)
diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R
index 8f19e8cf..9e629c6e 100644
--- a/R/arx_forecaster.R
+++ b/R/arx_forecaster.R
@@ -1,4 +1,3 @@
-# TODO add latency to default forecaster
 #' Direct autoregressive forecaster with covariates
 #'
 #' This is an autoregressive forecasting model for
@@ -54,7 +53,7 @@ arx_forecaster <- function(
   preds <- forecast(
     wf,
-    fill_locf = TRUE,
+    fill_locf = is.null(args_list$adjust_latency),
     n_recent = args_list$nafill_buffer,
     forecast_date = args_list$forecast_date %||% max(epi_data$time_value)
   ) %>%
@@ -119,6 +118,17 @@ arx_fcast_epi_workflow <- function(
   if (!(is.null(trainer) || is_regression(trainer))) {
     cli::cli_abort("{trainer} must be a `{parsnip}` model of mode 'regression'.")
  }
+  # forecast_date is whatever the user set;
+  # if unset and latency is not being adjusted, it defaults to the max time_value;
+  # if latency is being adjusted, it defaults to the as_of
+  latency_adjust_fd <- if (is.null(args_list$adjust_latency)) {
+    max(epi_data$time_value)
+  } else {
+    attributes(epi_data)$metadata$as_of
+  }
+  forecast_date <- args_list$forecast_date %||% latency_adjust_fd
+  target_date <- args_list$target_date %||% (forecast_date + args_list$ahead)
+
   lags <- arx_lags_validator(predictors, args_list$lags)
 
   # --- preprocessor
@@ -128,26 +138,34 @@ arx_fcast_epi_workflow <- function(
     r <- step_epi_lag(r, !!p, lag = lags[[l]])
   }
   r <- r %>%
-    step_epi_ahead(!!outcome, ahead = args_list$ahead) %>%
-    step_epi_naomit() %>%
-    step_training_window(n_recent = args_list$n_training) %>%
-    {
-      if (!is.null(args_list$check_enough_data_n)) {
-        check_enough_train_data(
-          .,
-          all_predictors(),
-          !!outcome,
-          n = args_list$check_enough_data_n,
-          epi_keys = args_list$check_enough_data_epi_keys,
-          drop_na = FALSE
-        )
-      } else {
-        .
-      }
+    step_epi_ahead(!!outcome, ahead = args_list$ahead)
+  method <- args_list$adjust_latency
+  if (!is.null(method)) {
+    if (method == "extend_ahead") {
+      r <- r %>% step_adjust_latency(all_outcomes(),
+        fixed_forecast_date = forecast_date,
+        method = method
+      )
+    } else if (method == "extend_lags") {
+      r <- r %>% step_adjust_latency(all_predictors(),
+        fixed_forecast_date = forecast_date,
+        method = method
+      )
     }
+  }
+  r <- r %>%
+    step_epi_naomit() %>%
+    step_training_window(n_recent = args_list$n_training)
+  if (!is.null(args_list$check_enough_data_n)) {
+    r <- r %>% check_enough_train_data(
+      all_predictors(),
+      !!outcome,
+      n = args_list$check_enough_data_n,
+      epi_keys = args_list$check_enough_data_epi_keys,
+      drop_na = FALSE
+    )
+  }
 
-  forecast_date <- args_list$forecast_date %||% max(epi_data$time_value)
-  target_date <- args_list$target_date %||% (forecast_date + args_list$ahead)
   # --- postprocessor
   f <- frosting() %>% layer_predict() # %>% layer_naomit()
@@ -159,11 +177,11 @@ arx_fcast_epi_workflow <- function(
     ))
     args_list$quantile_levels <- quantile_levels
     trainer$args$quantile_levels <- rlang::enquo(quantile_levels)
-    f <- layer_quantile_distn(f, quantile_levels = quantile_levels) %>%
+    f <- f %>%
+      layer_quantile_distn(quantile_levels = quantile_levels) %>%
       layer_point_from_distn()
   } else {
-    f <- layer_residual_quantiles(
-      f,
+    f <- f %>% layer_residual_quantiles(
       quantile_levels = args_list$quantile_levels,
       symmetrize = args_list$symmetrize,
       by_key = args_list$quantile_by_key
@@ -189,10 +207,15 @@ arx_fcast_epi_workflow <- function(
 #' @param n_training Integer. An upper limit for the number of rows per
 #'   key that are used for training
 #'   (in the time unit of the `epi_df`).
-#' @param forecast_date Date. The date on which the forecast is created.
-#'   The default `NULL` will attempt to determine this automatically.
-#' @param target_date Date. The date for which the forecast is intended.
-#'   The default `NULL` will attempt to determine this automatically.
+#' @param forecast_date Date. The date on which the forecast is created. The
+#'   default `NULL` will attempt to determine this automatically either as the
+#'   max time value if there is no latency adjustment, or as the `as_of` of
+#'   `epi_data` if `adjust_latency` is non-`NULL`.
+#' @param target_date Date. The date for which the forecast is intended. The
+#'   default `NULL` will attempt to determine this automatically as
+#'   `forecast_date + ahead`.
+#' @param adjust_latency Character or `NULL`. One of the `method`s of
+#'   `step_adjust_latency`, or `NULL` (in which case there is no adjustment).
 #' @param quantile_levels Vector or `NULL`. A vector of probabilities to produce
 #'   prediction intervals. These are created by computing the quantiles of
 #'   training residuals. A `NULL` value will result in point forecasts only.
@@ -238,6 +261,7 @@ arx_args_list <- function( n_training = Inf, forecast_date = NULL, target_date = NULL, + adjust_latency = NULL, quantile_levels = c(0.05, 0.95), symmetrize = TRUE, nonneg = TRUE, @@ -253,7 +277,7 @@ arx_args_list <- function( arg_is_scalar(ahead, n_training, symmetrize, nonneg) arg_is_chr(quantile_by_key, allow_empty = TRUE) - arg_is_scalar(forecast_date, target_date, allow_null = TRUE) + arg_is_scalar(forecast_date, target_date, adjust_latency, allow_null = TRUE) arg_is_date(forecast_date, target_date, allow_null = TRUE) arg_is_nonneg_int(ahead, lags) arg_is_lgl(symmetrize, nonneg) @@ -282,6 +306,7 @@ arx_args_list <- function( quantile_levels, forecast_date, target_date, + adjust_latency, symmetrize, nonneg, max_lags, diff --git a/R/epi_recipe.R b/R/epi_recipe.R index 0d809faf..77a04bbd 100644 --- a/R/epi_recipe.R +++ b/R/epi_recipe.R @@ -431,7 +431,7 @@ prep.epi_recipe <- function( x, training = NULL, fresh = FALSE, verbose = FALSE, retain = TRUE, log_changes = FALSE, strings_as_factors = TRUE, ...) { if (is.null(training)) { - cli::cli_warn(c( + cli::cli_warn(paste( "!" = "No training data was supplied to {.fn prep}.", "!" = "Unlike a {.cls recipe}, an {.cls epi_recipe} does not ", "!" = "store the full template data in the object.", @@ -577,7 +577,6 @@ bake.epi_recipe <- function(object, new_data, ..., composition = "epi_df") { new_data } - kill_levels <- function(x, keys) { for (i in which(names(x) %in% keys)) x[[i]] <- list(values = NA, ordered = NA) x diff --git a/R/get_test_data.R b/R/get_test_data.R index e76715da..c4e1a8e1 100644 --- a/R/get_test_data.R +++ b/R/get_test_data.R @@ -116,7 +116,7 @@ get_test_data <- function( cannot_be_used <- x %>% dplyr::filter(forecast_date - time_value <= n_recent) %>% dplyr::mutate(fillers = forecast_date - time_value > min_required) %>% - dplyr::summarize( + dplyr::summarise( dplyr::across( -tidyselect::any_of(epi_keys(recipe)), ~ all(is.na(.x[fillers])) & is.na(head(.x[!fillers], 1)) diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R index b6b655e5..f80cd876 100644 --- a/R/layer_add_forecast_date.R +++ b/R/layer_add_forecast_date.R @@ -1,12 +1,13 @@ -# TODO adapt this to latency #' Postprocessing step to add the forecast date #' #' @param frosting a `frosting` postprocessor #' @param forecast_date The forecast date to add as a column to the `epi_df`. -#' For most cases, this should be specified in the form "yyyy-mm-dd". Note that -#' when the forecast date is left unspecified, it is set to the maximum time -#' value from the data used in pre-processing, fitting the model, and -#' postprocessing. +#' For most cases, this should be specified in the form "yyyy-mm-dd". Note +#' that when the forecast date is left unspecified, it is set to one of two +#' values. If there is a `step_adjust_latency` step present, it uses the +#' `forecast_date` as set in that function. Otherwise, it uses the maximum +#' `time_value` across the data used for pre-processing, fitting the model, +#' and postprocessing. #' @param id a random id string #' #' @return an updated `frosting` postprocessor @@ -86,17 +87,14 @@ layer_add_forecast_date_new <- function(forecast_date, id) { } #' @export +#' @importFrom workflows extract_preprocessor slather.layer_add_forecast_date <- function(object, components, workflow, new_data, ...) 
{
-  if (is.null(object$forecast_date)) {
-    max_time_value <- max(
-      workflows::extract_preprocessor(workflow)$max_time_value,
+  forecast_date <- object$forecast_date %||%
+    get_forecast_date_in_layer(
+      extract_preprocessor(workflow),
       workflow$fit$meta$max_time_value,
-      max(new_data$time_value)
+      new_data
     )
-    forecast_date <- max_time_value
-  } else {
-    forecast_date <- object$forecast_date
-  }
   expected_time_type <- attr(
     workflows::extract_preprocessor(workflow)$template,
     "metadata"
diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R
index 1b671583..8d699dff 100644
--- a/R/layer_add_target_date.R
+++ b/R/layer_add_target_date.R
@@ -1,23 +1,26 @@
-# TODO adapt this to latency
 #' Postprocessing step to add the target date
 #'
 #' @param frosting a `frosting` postprocessor
-#' @param target_date The target date to add as a column to the
-#'   `epi_df`. If there's a forecast date specified in a layer, then
-#'   it is the forecast date plus `ahead` (from `step_epi_ahead` in
-#'   the `epi_recipe`). Otherwise, it is the maximum `time_value`
-#'   (from the data used in pre-processing, fitting the model, and
-#'   postprocessing) plus `ahead`, where `ahead` has been specified in
-#'   preprocessing. The user may override these by specifying a
-#'   target date of their own (of the form "yyyy-mm-dd").
+#' @param target_date The target date to add as a column to the `epi_df`. If
+#'   there's a forecast date specified upstream (either in a
+#'   `step_adjust_latency` or in a `layer_add_forecast_date`), then it is the
+#'   forecast date plus `ahead` (from `step_epi_ahead` in the `epi_recipe`).
+#'   Otherwise, it is the maximum `time_value` (from the data used in
+#'   pre-processing, fitting the model, and postprocessing) plus `ahead`, where
+#'   `ahead` has been specified in preprocessing. The user may override these by
+#'   specifying a target date of their own (of the form "yyyy-mm-dd").
 #' @param id a random id string
 #'
 #' @return an updated `frosting` postprocessor
 #'
 #' @details By default, this function assumes that a value for `ahead`
 #'   has been specified in a preprocessing step (most likely in
-#'   `step_epi_ahead`). Then, `ahead` is added to the maximum `time_value`
-#'   in the test data to get the target date.
+#'   `step_epi_ahead`). Then, `ahead` is added to the `forecast_date`
+#'   in the test data to get the target date. `forecast_date` can be set in 3 ways:
+#'   1. `step_adjust_latency`, which typically uses the training `epi_df`'s `as_of`
+#'   2. `layer_add_forecast_date`, which inherits from 1 if not manually specified
+#'   3. if none of those are the case, it is simply the maximum `time_value` over
+#'     every dataset used (prep, training, and prediction).
 #'
 #' @export
 #' @examples
@@ -41,8 +44,14 @@
 #' p <- forecast(wf1)
 #' p
 #'
-#' # Use ahead + max time value from pre, fit, post
-#' # which is the same if include `layer_add_forecast_date()`
+#' # Use ahead + forecast_date from adjust_latency
+#' # setting the `as_of` to something realistic
+#' attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3
+#' r <- epi_recipe(jhu) %>%
+#'   step_epi_lag(death_rate, lag = c(0, 7, 14)) %>%
+#'   step_epi_ahead(death_rate, ahead = 7) %>%
+#'   step_adjust_latency(method = "extend_ahead") %>%
+#'   step_epi_naomit()
 #' f2 <- frosting() %>%
 #'   layer_predict() %>%
 #'   layer_add_target_date() %>%
@@ -52,15 +61,26 @@
 #' p2 <- forecast(wf2)
 #' p2
 #'
-#' # Specify own target date
+#' # Use ahead + max time value from pre, fit, post
+#' # which is the same if you include `layer_add_forecast_date()`
 #' f3 <- frosting() %>%
 #'   layer_predict() %>%
-#'   layer_add_target_date(target_date = "2022-01-08") %>%
+#'   layer_add_target_date() %>%
 #'   layer_naomit(.pred)
 #' wf3 <- wf %>% add_frosting(f3)
 #'
 #' p3 <- forecast(wf3)
 #' p3
+#'
+#' # Specify own target date
+#' f4 <- frosting() %>%
+#'   layer_predict() %>%
+#'   layer_add_target_date(target_date = "2022-01-08") %>%
+#'   layer_naomit(.pred)
+#' wf4 <- wf %>% add_frosting(f4)
+#'
+#' p4 <- forecast(wf4)
+#' p4
 layer_add_target_date <- function(frosting, target_date = NULL, id = rand_id("add_target_date")) {
   arg_is_chr_scalar(id)
@@ -108,13 +128,13 @@ slather.layer_add_target_date <- function(object, components, workflow, new_data
     ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead")
     target_date <- forecast_date + ahead
   } else {
-    max_time_value <- max(
-      workflows::extract_preprocessor(workflow)$max_time_value,
+    forecast_date <- get_forecast_date_in_layer(
+      extract_preprocessor(workflow),
       workflow$fit$meta$max_time_value,
-      max(new_data$time_value)
+      new_data
     )
     ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead")
-    target_date <- max_time_value + ahead
+    target_date <- forecast_date + ahead
   }
   object$target_date <- target_date
diff --git a/R/layer_residual_quantiles.R b/R/layer_residual_quantiles.R
index 37490f68..c562f66f 100644
--- a/R/layer_residual_quantiles.R
+++ b/R/layer_residual_quantiles.R
@@ -115,7 +115,7 @@ slather.layer_residual_quantiles <-
     }
 
     r <- r %>%
-      dplyr::summarize(
+      dplyr::summarise(
         dstn = list(quantile(
           c(.resid, s * .resid),
           probs = object$quantile_levels, na.rm = TRUE
diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R
index 227b0ee7..5522a83b 100644
--- a/R/step_adjust_latency.R
+++ b/R/step_adjust_latency.R
@@ -1,24 +1,21 @@
-#' Adapt the pipeline to latency in the data
+#' Adapt the model to latent data
 #'
-#' In the standard case, the pipeline assumes that the last observation is also
-#' the day from which the forecast is being made. `step_adjust_latency` uses the
-#' `as_of` date of the `epi_df` as the `forecast_date`. This is most useful in
-#' realtime and pseudo-prospective forecasting for data where there is some
-#' delay between the day recorded and when that data is available.
+#' In the standard case, the arx models assume that the last observation is also
+#' the day from which the forecast is being made. But if the data has latency,
+#' then you may wish to adjust the predictors (lags) and/or the outcome (ahead)
+#' to compensate. This allows the model to create bleeding-edge forecasts using
+#' the lags actually observed rather than anticipated. `step_adjust_latency`
+#' uses the `as_of` date of the `epi_df` as the `forecast_date`. This is most
+#' useful in realtime and pseudo-prospective forecasting for data where there is
+#' some delay between the day recorded and when that data is available.
 #'
-#' @param recipe A recipe object. The step will be added to the
-#'   sequence of operations for this recipe.
-#' @param ... One or more selector functions to choose variables for this step.
-#'   See [recipes::selections()] for more details. Typically you will not need
-#'   to set this manually, as the necessary adjustments will be done for the
-#'   predictors and outcome.
 #' @param method a character. Determines the method by which the
-#'   forecast handles latency. All of these assume the forecast date is the
-#'   `as_of` of the `epi_df`. The options are:
+#'   forecast handles latency. The options are:
 #'   - `"extend_ahead"`: Lengthen the ahead so that forecasting from the last
-#'     observation results in a forecast `ahead` after the `as_of` date. E.g. if
-#'     there are 3 days of latency between the last observation and the `as_of`
-#'     date for a 4 day ahead forecast, the ahead used in practice is actually 7.
+#'     observation results in a forecast `ahead` after the `forecast_date`.
+#'     E.g. if there are 3 days of latency between the last observation and the
+#'     `forecast_date` for a 4 day ahead forecast, the ahead used in practice
+#'     is actually 7.
 #'   - `"locf"`: carries forward the last observed value(s) up to the forecast
 #'     date. See the Vignette TODO for equivalents using other steps and more
 #'     sophisticated methods of extrapolation.
@@ -26,38 +23,41 @@
 #'     the shortest lag at predict time is at the last observation. E.g. if the
 #'     lags are `c(0,7,14)` for data that is 3 days latent, the actual lags used
 #'     become `c(3,10,17)`.
+#' @param epi_keys_checked a character vector. A list of keys to group by before
+#'   finding the `max_time_value`. The default value of this is
+#'   `c("geo_value")`, but it can be any collection of `epi_keys`. Different
+#'   locations may have different latencies; to produce a forecast at every
+#'   location, we need to use the largest latency across every location; this
+#'   means taking `max_time_value` to be the minimum of the `max_time_value`s
+#'   for each `geo_value` (or whichever collection of keys are specified). If
+#'   `NULL` or an empty character vector, it will take the maximum across all
+#'   values, irrespective of any keys.
 #' @param fixed_latency either a positive integer, or a labeled positive integer
-#'   vector. Cannot be set at the same time as `fixed_asof`. If non-`NULL`,
-#'   the amount to offset the ahead or lag by. If a single integer, this is used
-#'   for all columns; if a labeled vector, the labels must correspond to the
-#'   base column names. If `NULL`, the latency is the distance between the
-#'   `epi_df`'s `max_time_value` and either the `fixed_asof` or the `epi_df`'s
-#'   `as_of` field.
-#' @param fixed_asof either a date of the same kind used in the `epi_df`, or
-#'   NULL. Cannot be set at the same time as `fixed_latency`. If a date, it
-#'   gives the date from which the forecast is actually occurring. If `NULL`,
-#'   the `as_of` is determined either from `fixed_latency` or automatically.
+#'   vector. Cannot be set at the same time as `fixed_forecast_date`. If
+#'   non-`NULL`, the amount to offset the ahead or lag by. If a single integer,
+#'   this is used for all columns; if a labeled vector, the labels must
+#'   correspond to the base column names (before lags/aheads). 
If `NULL`, the +#' latency is the distance between the `epi_df`'s `max_time_value` and either +#' the `fixed_forecast_date` or the `epi_df`'s `as_of` field (the default for +#' `forecast_date`). +#' @param fixed_forecast_date either a date of the same kind used in the +#' `epi_df`, or `NULL`. Exclusive with `fixed_latency`. If a date, it gives +#' the date from which the forecast is actually occurring. If `NULL`, the +#' `forecast_date` is determined either via the `fixed_latency`, or is set to +#' the `epi_df`'s `as_of` value if `fixed_latency` is also `NULL`. #' @param role For model terms created by this step, what analysis role should -#' they be assigned? `lag` is default a predictor while `ahead` is an outcome. -#' It should be correctly inferred and not need setting -#' @param trained A logical to indicate if the quantities for preprocessing have -#' been estimated. -#' @param columns A character string of column names to be adjusted; these -#' should be the original columns, and not the derived ones +#' they be assigned? `lag` is a predictor while `ahead` is an outcome. It +#' should be correctly inferred and not need setting #' @param default Determines what fills empty rows #' left by leading/lagging (defaults to NA). -#' @param skip A logical. Should the step be skipped when the -#' recipe is baked by [bake()]? While all operations are baked -#' when [prep()] is run, some operations may not be able to be -#' conducted on new data (e.g. processing the outcome variable(s)). -#' Care should be taken when using `skip = TRUE` as it may affect -#' the computations for subsequent operations. -#' @param id A unique identifier for the step #' @template step-return +#' @inheritParams recipes::step_lag #' #' @details The step assumes that the pipeline has already applied either -#' `step_epi_ahead` or `step_epi_lag` depending on the value of -#' `"method"`, and that `step_epi_naomit` has NOT been run. +#' `step_epi_ahead` or `step_epi_lag` depending on the value of `"method"`, +#' and that `step_epi_naomit` has NOT been run. By default, the latency will +#' be determined using the arguments below, but can be set explicitly using +#' either `fixed_latency` or `fixed_forecast_date`. #' #' The `prefix` and `id` arguments are unchangeable to ensure that the code runs #' properly and to avoid inconsistency with naming. 
For `step_epi_ahead`, they @@ -68,12 +68,25 @@ #' @rdname step_adjust_latency #' @export #' @examples +#' jhu <- case_death_rate_subset %>% +#' dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) +#' # setting the `as_of` to something realistic +#' attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 +#' #' r <- epi_recipe(case_death_rate_subset) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% -#' # step_adjust_latency(method = "extend_ahead") %>% +#' step_adjust_latency(method = "extend_ahead") %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) #' r +#' +#' jhu_fit <- epi_workflow() %>% +#' add_epi_recipe(r) %>% +#' add_model(linear_reg()) %>% +#' fit(data = jhu) +#' jhu_fit +#' #' @importFrom recipes detect_step +#' @importFrom rlang enquos step_adjust_latency <- function(recipe, ..., @@ -84,12 +97,13 @@ step_adjust_latency <- "locf", "extend_lags" ), + epi_keys_checked = c("geo_value"), fixed_latency = NULL, - fixed_asof = NULL, + fixed_forecast_date = NULL, default = NA, skip = FALSE, columns = NULL, - id = recipes::rand_id("epi_lag")) { + id = recipes::rand_id("adjust_latency")) { arg_is_chr_scalar(id, method) if (!is_epi_recipe(recipe)) { cli::cli_abort("This recipe step can only operate on an {.cls epi_recipe}.") @@ -114,6 +128,10 @@ step_adjust_latency <- cli::cli_abort("adjust_latency needs to occur before any `NA` removal, as columns may be moved around") } + if (!is.null(fixed_latency) && !is.null(fixed_forecast_date)) { + cli::cli_abort("Only one of `fixed_latency` and `fixed_forecast_date` + can be non-`NULL` at a time!") + } method <- rlang::arg_match(method) terms_used <- recipes_eval_select(enquos(...), recipe$template, recipe$term_info) @@ -135,7 +153,7 @@ step_adjust_latency <- recipe$steps, function(recipe_step) inherits(recipe_step, rel_step_type) ))) { - cli::cli_abort(paste( + cli::cli_abort(glue::glue( "There is no `{rel_step_type}` defined before this.", " For the method `extend_{shift_name}` of `step_adjust_latency`,", " at least one {shift_name} must be previously defined." @@ -145,15 +163,17 @@ step_adjust_latency <- recipes::add_step( recipe, step_adjust_latency_new( - terms = dplyr::enquos(...), + terms = enquos(...), role = role, method = method, + epi_keys_checked = epi_keys_checked, trained = trained, - as_of = fixed_asof, + forecast_date = fixed_forecast_date, latency = fixed_latency, shift_cols = relevant_shifts, default = default, keys = epi_keys(recipe), + columns = columns, skip = skip, id = id ) @@ -161,19 +181,21 @@ step_adjust_latency <- } step_adjust_latency_new <- - function(terms, role, trained, as_of, latency, shift_cols, time_type, default, - keys, method, skip, id) { + function(terms, role, trained, forecast_date, latency, shift_cols, time_type, default, + keys, method, epi_keys_checked, columns, skip, id) { step( subclass = "adjust_latency", terms = terms, role = role, method = method, + epi_keys_checked = epi_keys_checked, trained = trained, - as_of = as_of, + forecast_date = forecast_date, latency = latency, shift_cols = shift_cols, default = default, keys = keys, + columns = columns, skip = skip, id = id ) @@ -181,24 +203,25 @@ step_adjust_latency_new <- # lags introduces max(lags) NA's after the max_time_value. #' @export +#' @importFrom glue glue prep.step_adjust_latency <- function(x, training, info = NULL, ...) 
{ # get the columns used, even if it's all of them - terms_used <- x$columns + terms_used <- x$terms if (length(terms_used) == 0) { terms_used <- info %>% filter(role == "raw") %>% pull(variable) } - # get and check the max_time and as_of are the right kinds of dates - as_of <- x$as_of %||% set_asof(training, info) + # get and check the max_time and forecast_date are the right kinds of dates + forecast_date <- x$forecast_date %||% set_forecast_date(training, info, x$epi_keys_checked) # infer the correct columns to be working with from the previous # transformations x$prefix <- x$shift_cols$prefix[[1]] sign_shift <- get_sign(x) latency_cols <- get_latent_column_tibble( - x$shift_cols, training, as_of, - x$latency, sign_shift, info + x$shift_cols, training, forecast_date, + x$latency, sign_shift, info, x$epi_keys_checked ) if ((x$method == "extend_ahead") || (x$method == "extend_lags")) { @@ -221,7 +244,7 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { ), "i" = "input shift: {latency_cols$shift[[i_latency]]}", "i" = "latency adjusted shift: {latency_cols$effective_shift[[i_latency]]}", - "i" = "max_time = {max_time} -> as_of = {as_of}" + "i" = "`max_time` = {max_time} -> `forecast_date` = {forecast_date}" )) } } @@ -231,25 +254,20 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { role = latency_cols$role[[1]], trained = TRUE, shift_cols = latency_cols, - as_of = as_of, + forecast_date = forecast_date, latency = unique(latency_cols$latency), default = x$default, keys = x$keys, method = x$method, + epi_keys_checked = x$epi_keys_checked, + columns = recipes_eval_select(latency_cols$original_name, training, info), skip = x$skip, id = x$id ) } -#' various ways of handling differences between the `as_of` date and the maximum -#' time value -#' @description -#' adjust the ahead so that we will be predicting `ahead` days after the `as_of` -#' date, rather than relative to the last day of data -#' @param new_data assumes that this already has lag/ahead columns that we need -#' to adjust #' @importFrom dplyr %>% pull -#' @keywords internal +#' @export bake.step_adjust_latency <- function(object, new_data, ...) 
{
   if ((object$method == "extend_ahead") || (object$method == "extend_lags")) {
     keys <- object$keys
@@ -267,17 +285,21 @@ print.step_adjust_latency <-
   } else {
     terms <- x$terms
   }
-  if (!is.null(x$as_of)) {
+  if (!is.null(x$forecast_date)) {
     conj <- "with forecast date"
-    extra_text <- x$as_of
-  } else if (!is.null(x$shift_cols)) {
-    conj <- "with latencies"
-    extra_text <- x$shift_cols
+    extra_text <- x$forecast_date
+  } else if (!is.null(x$latency)) {
+    conj <- if (length(x$latency) == 1) {
+      "with latency"
+    } else {
+      "with latencies"
+    }
+    extra_text <- x$latency
   } else {
-    conj <- ""
+    conj <- "with latency"
     extra_text <- "set at train time"
   }
-  print_epi_step(terms, NULL, x$trained, x$method,
+  print_epi_step(terms, terms, x$trained, x$method,
     conjunction = conj,
     extra_text = extra_text
   )
diff --git a/R/utils-latency.R b/R/utils-latency.R
index 5fa4ce3f..9c5f6142 100644
--- a/R/utils-latency.R
+++ b/R/utils-latency.R
@@ -3,7 +3,7 @@
 #' note that this may introduce new NA values when one column is shifted farther than another
 #' @param shift_cols a tibble which must have the columns `column`, the name of
 #'   the column to adjust, `latency` the latency of the original column relative
-#'   to the `as_of` date, `new_name`, the names in `column` adjusted by the
+#'   to the `forecast_date`, `new_name`, the names in `column` adjusted by the
 #'   latencies `latency`
 #' @param new_data just what is says
 #' @param keys the variables which are used as keys
@@ -68,26 +68,26 @@ construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name
 #'   been changed
 #' @param shift_cols a list of columns to operate on, as created by `construct_shift_tibble`
 #' @param new_data the data transformed so far
-#' @param as_of the forecast date
-#' @param latency `NULL`, int, or vector, as described in `step_eip_latency`
+#' @param forecast_date the forecast date
+#' @param latency `NULL`, int, or vector, as described in `step_adjust_latency`
 #' @param sign_shift -1 if ahead, 1 if lag
 #' @return a tibble with columns `column` (relevant shifted names), `shift` (the
 #'   amount that one is shifted), `latency` (original columns difference between
-#'   max_time_value and as_of (on a per-initial column basis)),
+#'   the max `time_value` and `forecast_date` (on a per-initial column basis)),
 #'   `effective_shift` (shift+latency), and `new_name` (adjusted names with the
 #'   effective_shift)
 #' @keywords internal
-#' @importFrom dplyr rowwise %>%
+#' @importFrom dplyr rowwise left_join join_by
 get_latent_column_tibble <- function(
-    shift_cols, new_data, as_of, latency,
-    sign_shift, info, call = caller_env()) {
+    shift_cols, new_data, forecast_date, latency,
+    sign_shift, info, epi_keys_checked, call = caller_env()) {
   shift_cols <- shift_cols %>% mutate(original_name = glue::glue("{prefix}{shift}_{terms}"))
   if (is.null(latency)) {
     shift_cols <- shift_cols %>%
       rowwise() %>%
       # add the latencies to shift_cols
       mutate(latency = get_latency(
-        new_data, as_of, original_name, shift, sign_shift
+        new_data, forecast_date, original_name, shift, sign_shift, epi_keys_checked
       )) %>%
       ungroup()
   } else if (length(latency) > 1) {
@@ -119,9 +119,10 @@
 }
 
-#' extract the as_of, and make sure there's nothing very off about it
+#' Extract the as_of for the forecast date, and make sure there's nothing very off about it.
 #' @keywords internal
-set_asof <- function(new_data, info) {
+#' @importFrom dplyr select
+set_forecast_date <- function(new_data, info, epi_keys_checked) {
   original_columns <- info %>%
     filter(source == "original") %>%
     pull(variable)
@@ -135,52 +136,99 @@
   }
   # the source data determines the actual time_values
   # these are the non-na time_values;
-  time_values <- new_data %>%
+  # get the minimum value across the checked epi_keys' maximum time values
+  max_time <- new_data %>%
     select(all_of(original_columns)) %>%
-    drop_na() %>%
-    pull(time_value)
-  if (length(time_values) <= 0) {
-    cli::cli_abort("the `time_value` column of `new_data` is empty")
+    drop_na()
+  # null and "" don't work in `group_by`
+  if (!is.null(epi_keys_checked) && all(epi_keys_checked != "")) {
+    max_time <- max_time %>% group_by(across(all_of(epi_keys_checked)))
   }
-  as_of <- attributes(new_data)$metadata$as_of
-  max_time <- max(time_values)
+  max_time <- max_time %>%
+    summarise(time_value = max(time_value)) %>%
+    pull(time_value) %>%
+    min()
+  forecast_date <- attributes(new_data)$metadata$as_of
   # make sure the as_of is sane
-  if (!inherits(as_of, class(time_values)) & !inherits(as_of, "POSIXt")) {
+  if (!inherits(forecast_date, class(max_time)) & !inherits(forecast_date, "POSIXt")) {
     cli::cli_abort(paste(
-      "the data matrix `as_of` value is {as_of}, ",
+      "the `epi_df`'s `forecast_date` value is {forecast_date}, ",
       "and not a valid `time_type` with type ",
       "matching `time_value`'s type of ",
-      "{typeof(new_data$time_value)}."
+      "{class(max_time)}."
     ))
   }
-  if (is.null(as_of) || is.na(as_of)) {
+  if (is.null(forecast_date) || is.na(forecast_date)) {
     cli::cli_warn(paste(
-      "epi_data's `as_of` was {as_of}, setting to ",
+      "epi_data's `forecast_date` was {forecast_date}, setting to ",
       "the latest time value, {max_time}."
     ))
-    as_of <- max_time
-  } else if (as_of < max_time) {
+    forecast_date <- max_time
+  } else if (forecast_date < max_time) {
     cli::cli_abort(paste(
-      "`as_of` ({(as_of)}) is before the most ",
+      "`forecast_date` ({(forecast_date)}) is before the most ",
       "recent data ({max_time}). Remove before ",
       "predicting."
     ))
   }
   # TODO cover the rest of the possible types for as_of and max_time...
-  if (class(time_values) == "Date") {
-    as_of <- as.Date(as_of)
+  if (inherits(max_time, "Date")) {
+    forecast_date <- as.Date(forecast_date)
   }
-  return(as_of)
+  return(forecast_date)
 }
 
 #' the latency is also the amount the shift is off by
 #' @param sign_shift integer. 1 if lag and -1 if ahead. These represent how you
 #'   need to shift the data to bring the 3 day lagged value to today.
 #' @keywords internal
-get_latency <- function(new_data, as_of, column, shift_amount, sign_shift) {
+get_latency <- function(new_data, forecast_date, column, shift_amount, sign_shift, epi_keys_checked) {
   shift_max_date <- new_data %>%
-    drop_na(all_of(column)) %>%
+    drop_na(all_of(column))
+  # null and "" don't work in `group_by`
+  if (!is.null(epi_keys_checked) && all(epi_keys_checked != "")) {
+    shift_max_date <- shift_max_date %>% group_by(across(all_of(epi_keys_checked)))
+  }
+  shift_max_date <- shift_max_date %>%
+    summarise(time_value = max(time_value)) %>%
     pull(time_value) %>%
-    max()
-  return(as.integer(sign_shift * (as_of - shift_max_date) + shift_amount))
+    min()
+  return(as.integer(sign_shift * (as.Date(forecast_date) - shift_max_date) + shift_amount))
+}
+
+
+
+#' get the forecast date while in a layer
+#' @param this_recipe the recipe to check for `step_adjust_latency`
+#' @param workflow_max_time_value the `max_time` value coming out of the fit
+#'   workflow (this will be the maximal time value in a potentially different
+#'   dataset)
+#' @param new_data the data we're currently working with, from which we'll take
+#'   a potentially different max_time_value
+#' @keywords internal
+get_forecast_date_in_layer <- function(this_recipe, workflow_max_time_value, new_data) {
+  max_time_value <- max(
+    workflow_max_time_value,
+    this_recipe$max_time_value,
+    max(new_data$time_value)
+  )
+  if (this_recipe %>% recipes::detect_step("adjust_latency")) {
+    # get the as_of in an `adjust_latency` step, regardless of where it is in the recipe
+    handpicked_forecast_date <- map(
+      this_recipe$steps,
+      function(x) {
+        if (inherits(x, "step_adjust_latency")) x$forecast_date
+      }
+    ) %>% Filter(Negate(is.null), .)
+    if (length(handpicked_forecast_date) > 0) {
+      max_time_value <- handpicked_forecast_date[[1]]
+    } else {
+      # if we haven't chosen one, use either the max_time_value or the as_of
+      max_time_value <- max(
+        max_time_value,
+        attributes(new_data)$metadata$as_of
+      )
+    }
+  }
+  max_time_value
+}
diff --git a/man/arx_args_list.Rd b/man/arx_args_list.Rd
index c9ae4e73..68d44587 100644
--- a/man/arx_args_list.Rd
+++ b/man/arx_args_list.Rd
@@ -10,6 +10,7 @@ arx_args_list(
   n_training = Inf,
   forecast_date = NULL,
   target_date = NULL,
+  adjust_latency = NULL,
   quantile_levels = c(0.05, 0.95),
   symmetrize = TRUE,
   nonneg = TRUE,
@@ -32,11 +33,17 @@ date for which forecasts should be produced.}
 key that are used for training
 (in the time unit of the \code{epi_df}).}
 
-\item{forecast_date}{Date. The date on which the forecast is created.
-The default \code{NULL} will attempt to determine this automatically.}
+\item{forecast_date}{Date. The date on which the forecast is created. The
+default \code{NULL} will attempt to determine this automatically either as the
+max time value if there is no latency adjustment, or as the \code{as_of} of
+\code{epi_data} if \code{adjust_latency} is non-\code{NULL}.}
 
-\item{target_date}{Date. The date for which the forecast is intended.
-The default \code{NULL} will attempt to determine this automatically.}
+\item{target_date}{Date. The date for which the forecast is intended. The
+default \code{NULL} will attempt to determine this automatically as
+\code{forecast_date + ahead}.}
+
+\item{adjust_latency}{Character or \code{NULL}. One of the \code{method}s of
+\code{step_adjust_latency}, or \code{NULL} (in which case there is no adjustment).}
 
 \item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce
 prediction intervals. 
These are created by computing the quantiles of diff --git a/man/arx_class_args_list.Rd b/man/arx_class_args_list.Rd index a1205c71..330c9ced 100644 --- a/man/arx_class_args_list.Rd +++ b/man/arx_class_args_list.Rd @@ -34,11 +34,14 @@ date for which forecasts should be produced.} key that are used for training (in the time unit of the \code{epi_df}).} -\item{forecast_date}{Date. The date on which the forecast is created. -The default \code{NULL} will attempt to determine this automatically.} +\item{forecast_date}{Date. The date on which the forecast is created. The +default \code{NULL} will attempt to determine this automatically either as the +max time value if there is no latency adjustment, or as the \code{as_of} of +\code{epi_data} if \code{adjust_latency} is non-\code{NULL}.} -\item{target_date}{Date. The date for which the forecast is intended. -The default \code{NULL} will attempt to determine this automatically.} +\item{target_date}{Date. The date for which the forecast is intended. The +default \code{NULL} will attempt to determine this automatically as +\code{forecast_date + ahead}.} \item{outcome_transform}{Scalar character. Whether the outcome should be created using growth rates (as the predictors are) or lagged diff --git a/man/bake.step_adjust_latency.Rd b/man/bake.step_adjust_latency.Rd deleted file mode 100644 index edb4f1f6..00000000 --- a/man/bake.step_adjust_latency.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/step_adjust_latency.R -\name{bake.step_adjust_latency} -\alias{bake.step_adjust_latency} -\title{various ways of handling differences between the \code{as_of} date and the maximum -time value} -\usage{ -\method{bake}{step_adjust_latency}(object, new_data, ...) -} -\arguments{ -\item{new_data}{assumes that this already has lag/ahead columns that we need -to adjust} -} -\description{ -adjust the ahead so that we will be predicting \code{ahead} days after the \code{as_of} -date, rather than relative to the last day of data -} -\keyword{internal} diff --git a/man/cdc_baseline_args_list.Rd b/man/cdc_baseline_args_list.Rd index 2f930057..981c3c7e 100644 --- a/man/cdc_baseline_args_list.Rd +++ b/man/cdc_baseline_args_list.Rd @@ -34,8 +34,10 @@ set of prediction horizons for \code{\link[=layer_cdc_flatline_quantiles]{layer_ key that are used for training (in the time unit of the \code{epi_df}).} -\item{forecast_date}{Date. The date on which the forecast is created. -The default \code{NULL} will attempt to determine this automatically.} +\item{forecast_date}{Date. The date on which the forecast is created. The +default \code{NULL} will attempt to determine this automatically either as the +max time value if there is no latency adjustment, or as the \code{as_of} of +\code{epi_data} if \code{adjust_latency} is non-\code{NULL}.} \item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce prediction intervals. 
These are created by computing the quantiles of
diff --git a/man/extend_either.Rd b/man/extend_either.Rd
index b7e30694..ae55fa46 100644
--- a/man/extend_either.Rd
+++ b/man/extend_either.Rd
@@ -13,7 +13,7 @@ extend_either(new_data, shift_cols, keys)
 
 \item{shift_cols}{a tibble which must have the columns \code{column}, the name of
 the column to adjust, \code{latency} the latency of the original column relative
-to the \code{as_of} date, \code{new_name}, the names in \code{column} adjusted by the
+to the \code{forecast_date}, \code{new_name}, the names in \code{column} adjusted by the
 latencies \code{latency}}
 
 \item{keys}{the variables which are used as keys}
diff --git a/man/flatline_args_list.Rd b/man/flatline_args_list.Rd
index 059dfa03..633d4502 100644
--- a/man/flatline_args_list.Rd
+++ b/man/flatline_args_list.Rd
@@ -29,11 +29,14 @@ So for example, \code{ahead = 7} will create residuals by comparing values
 key that are used for training
 (in the time unit of the \code{epi_df}).}
 
-\item{forecast_date}{Date. The date on which the forecast is created.
-The default \code{NULL} will attempt to determine this automatically.}
+\item{forecast_date}{Date. The date on which the forecast is created. The
+default \code{NULL} will attempt to determine this automatically either as the
+max time value if there is no latency adjustment, or as the \code{as_of} of
+\code{epi_data} if \code{adjust_latency} is non-\code{NULL}.}
 
-\item{target_date}{Date. The date for which the forecast is intended.
-The default \code{NULL} will attempt to determine this automatically.}
+\item{target_date}{Date. The date for which the forecast is intended. The
+default \code{NULL} will attempt to determine this automatically as
+\code{forecast_date + ahead}.}
 
 \item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce
 prediction intervals. These are created by computing the quantiles of
diff --git a/man/get_forecast_date_in_layer.Rd b/man/get_forecast_date_in_layer.Rd
new file mode 100644
index 00000000..c866a88e
--- /dev/null
+++ b/man/get_forecast_date_in_layer.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-latency.R
+\name{get_forecast_date_in_layer}
+\alias{get_forecast_date_in_layer}
+\title{get the forecast date while in a layer}
+\usage{
+get_forecast_date_in_layer(this_recipe, workflow_max_time_value, new_data)
+}
+\arguments{
+\item{this_recipe}{the recipe to check for \code{step_adjust_latency}}
+
+\item{workflow_max_time_value}{the \code{max_time} value coming out of the fit
+workflow (this will be the maximal time value in a potentially different
+dataset)}
+
+\item{new_data}{the data we're currently working with, from which we'll take
+a potentially different max_time_value}
+}
+\description{
+get the forecast date while in a layer
+}
+\keyword{internal}
diff --git a/man/get_latency.Rd b/man/get_latency.Rd
index d9098b45..f5bf7f0c 100644
--- a/man/get_latency.Rd
+++ b/man/get_latency.Rd
@@ -4,7 +4,14 @@
 \alias{get_latency}
 \title{the latency is also the amount the shift is off by}
 \usage{
-get_latency(new_data, as_of, column, shift_amount, sign_shift)
+get_latency(
+  new_data,
+  forecast_date,
+  column,
+  shift_amount,
+  sign_shift,
+  epi_keys_checked
+)
 }
 \arguments{
 \item{sign_shift}{integer. 1 if lag and -1 if ahead. These represent how you
diff --git a/man/get_latent_column_tibble.Rd b/man/get_latent_column_tibble.Rd
index 168145ed..0a029862 100644
--- a/man/get_latent_column_tibble.Rd
+++ b/man/get_latent_column_tibble.Rd
@@ -8,10 +8,11 @@
 get_latent_column_tibble(
   shift_cols,
   new_data,
-  as_of,
+  forecast_date,
   latency,
   sign_shift,
   info,
+  epi_keys_checked,
   call = caller_env()
 )
 }
@@ -20,16 +21,16 @@ get_latent_column_tibble(
 
 \item{new_data}{the data transformed so far}
 
-\item{as_of}{the forecast date}
+\item{forecast_date}{the forecast date}
 
-\item{latency}{\code{NULL}, int, or vector, as described in \code{step_eip_latency}}
+\item{latency}{\code{NULL}, int, or vector, as described in \code{step_adjust_latency}}
 
 \item{sign_shift}{-1 if ahead, 1 if lag}
 }
 \value{
 a tibble with columns \code{column} (relevant shifted names), \code{shift} (the
 amount that one is shifted), \code{latency} (original columns difference between
-max_time_value and as_of (on a per-initial column basis)),
+the max \code{time_value} and \code{forecast_date} (on a per-initial column basis)),
 \code{effective_shift} (shift+latency), and \code{new_name} (adjusted names with the
 effective_shift)
 }
diff --git a/man/layer_add_forecast_date.Rd b/man/layer_add_forecast_date.Rd
index 4e173d66..59d6ff84 100644
--- a/man/layer_add_forecast_date.Rd
+++ b/man/layer_add_forecast_date.Rd
@@ -14,10 +14,12 @@ layer_add_forecast_date(
 \item{frosting}{a \code{frosting} postprocessor}
 
 \item{forecast_date}{The forecast date to add as a column to the \code{epi_df}.
-For most cases, this should be specified in the form "yyyy-mm-dd". Note that
-when the forecast date is left unspecified, it is set to the maximum time
-value from the data used in pre-processing, fitting the model, and
-postprocessing.}
+For most cases, this should be specified in the form "yyyy-mm-dd". Note
+that when the forecast date is left unspecified, it is set to one of two
+values. If there is a \code{step_adjust_latency} step present, it uses the
+\code{forecast_date} as set in that function. Otherwise, it uses the maximum
+\code{time_value} across the data used for pre-processing, fitting the model,
+and postprocessing.}
 
 \item{id}{a random id string}
 }
diff --git a/man/layer_add_target_date.Rd b/man/layer_add_target_date.Rd
index 5b32002d..94cbc7b4 100644
--- a/man/layer_add_target_date.Rd
+++ b/man/layer_add_target_date.Rd
@@ -13,14 +13,14 @@ layer_add_target_date(
 \arguments{
 \item{frosting}{a \code{frosting} postprocessor}
 
-\item{target_date}{The target date to add as a column to the
-\code{epi_df}. If there's a forecast date specified in a layer, then
-it is the forecast date plus \code{ahead} (from \code{step_epi_ahead} in
-the \code{epi_recipe}). Otherwise, it is the maximum \code{time_value}
-(from the data used in pre-processing, fitting the model, and
-postprocessing) plus \code{ahead}, where \code{ahead} has been specified in
-preprocessing. The user may override these by specifying a
-target date of their own (of the form "yyyy-mm-dd").}
+\item{target_date}{The target date to add as a column to the \code{epi_df}. If
+there's a forecast date specified upstream (either in a
+\code{step_adjust_latency} or in a \code{layer_add_forecast_date}), then it is the
+forecast date plus \code{ahead} (from \code{step_epi_ahead} in the \code{epi_recipe}).
+Otherwise, it is the maximum \code{time_value} (from the data used in
+pre-processing, fitting the model, and postprocessing) plus \code{ahead}, where
+\code{ahead} has been specified in preprocessing. The user may override these by
+specifying a target date of their own (of the form "yyyy-mm-dd").}
 
 \item{id}{a random id string}
 }
@@ -33,8 +33,14 @@ Postprocessing step to add the target date
 \details{
 By default, this function assumes that a value for \code{ahead}
 has been specified in a preprocessing step (most likely in
-\code{step_epi_ahead}). Then, \code{ahead} is added to the maximum \code{time_value}
-in the test data to get the target date.
+\code{step_epi_ahead}). Then, \code{ahead} is added to the \code{forecast_date}
+in the test data to get the target date. \code{forecast_date} can be set in 3 ways:
+\enumerate{
+\item \code{step_adjust_latency}, which typically uses the training \code{epi_df}'s \code{as_of}
+\item \code{layer_add_forecast_date}, which inherits from 1 if not manually specified
+\item if none of those are the case, it is simply the maximum \code{time_value} over
+every dataset used (prep, training, and prediction).
+}
 }
 \examples{
 jhu <- case_death_rate_subset \%>\%
@@ -57,8 +63,14 @@ wf1 <- wf \%>\% add_frosting(f)
 p <- forecast(wf1)
 p
 
-# Use ahead + max time value from pre, fit, post
-# which is the same if include `layer_add_forecast_date()`
+# Use ahead + forecast_date from adjust_latency
+# setting the `as_of` to something realistic
+attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3
+r <- epi_recipe(jhu) \%>\%
+  step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\%
+  step_epi_ahead(death_rate, ahead = 7) \%>\%
+  step_adjust_latency(method = "extend_ahead") \%>\%
+  step_epi_naomit()
 f2 <- frosting() \%>\%
   layer_predict() \%>\%
   layer_add_target_date() \%>\%
@@ -68,13 +80,24 @@ wf2 <- wf \%>\% add_frosting(f2)
 p2 <- forecast(wf2)
 p2
 
-# Specify own target date
+# Use ahead + max time value from pre, fit, post
+# which is the same if you include `layer_add_forecast_date()`
 f3 <- frosting() \%>\%
   layer_predict() \%>\%
-  layer_add_target_date(target_date = "2022-01-08") \%>\%
+  layer_add_target_date() \%>\%
   layer_naomit(.pred)
 wf3 <- wf \%>\% add_frosting(f3)
 
 p3 <- forecast(wf3)
 p3
+
+# Specify own target date
+f4 <- frosting() \%>\%
+  layer_predict() \%>\%
+  layer_add_target_date(target_date = "2022-01-08") \%>\%
+  layer_naomit(.pred)
+wf4 <- wf \%>\% add_frosting(f4)
+
+p4 <- forecast(wf4)
+p4
 }
diff --git a/man/set_asof.Rd b/man/set_asof.Rd
deleted file mode 100644
index fabf97c8..00000000
--- a/man/set_asof.Rd
+++ /dev/null
@@ -1,12 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/utils-latency.R
-\name{set_asof}
-\alias{set_asof}
-\title{extract the as_of, and make sure there's nothing very off about it}
-\usage{
-set_asof(new_data, info)
-}
-\description{
-extract the as_of, and make sure there's nothing very off about it
-}
-\keyword{internal}
diff --git a/man/set_forecast_date.Rd b/man/set_forecast_date.Rd
new file mode 100644
index 00000000..58682bd2
--- /dev/null
+++ b/man/set_forecast_date.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-latency.R
+\name{set_forecast_date}
+\alias{set_forecast_date}
+\title{Extract the as_of for the forecast date, and make sure there's nothing very off about it.}
+\usage{
+set_forecast_date(new_data, info, epi_keys_checked)
+}
+\description{
+Extract the as_of for the forecast date, and make sure there's nothing very off about it.
+}
+\keyword{internal}
diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd
index e123769f..fc4c5587 100644
--- a/man/step_adjust_latency.Rd
+++ b/man/step_adjust_latency.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/step_adjust_latency.R
 \name{step_adjust_latency}
 \alias{step_adjust_latency}
-\title{Adapt the pipeline to latency in the data}
+\title{Adapt the model to latent data}
 \usage{
 step_adjust_latency(
   recipe,
@@ -10,38 +10,37 @@ step_adjust_latency(
   role = NA,
   trained = FALSE,
   method = c("extend_ahead", "locf", "extend_lags"),
+  epi_keys_checked = c("geo_value"),
   fixed_latency = NULL,
-  fixed_asof = NULL,
+  fixed_forecast_date = NULL,
   default = NA,
   skip = FALSE,
   columns = NULL,
-  id = recipes::rand_id("epi_lag")
+  id = recipes::rand_id("adjust_latency")
 )
 }
 \arguments{
 \item{recipe}{A recipe object. The step will be added to the
 sequence of operations for this recipe.}
 
-\item{...}{One or more selector functions to choose variables for this step.
-See \code{\link[recipes:selections]{recipes::selections()}} for more details. Typically you will not need
-to set this manually, as the necessary adjustments will be done for the
-predictors and outcome.}
+\item{...}{One or more selector functions to choose variables
+for this step. See \code{\link[recipes:selections]{selections()}} for more details.}
 
 \item{role}{For model terms created by this step, what analysis role should
-they be assigned? \code{lag} is default a predictor while \code{ahead} is an outcome.
-It should be correctly inferred and not need setting}
+they be assigned? \code{lag} is a predictor while \code{ahead} is an outcome. It
+should be correctly inferred and not need setting}
 
-\item{trained}{A logical to indicate if the quantities for preprocessing have
-been estimated.}
+\item{trained}{A logical to indicate if the quantities for
+preprocessing have been estimated.}
 
 \item{method}{a character. Determines the method by which the
-forecast handles latency. All of these assume the forecast date is the
-\code{as_of} of the \code{epi_df}. The options are:
+forecast handles latency. The options are:
 \itemize{
 \item \code{"extend_ahead"}: Lengthen the ahead so that forecasting from the last
-observation results in a forecast \code{ahead} after the \code{as_of} date. E.g. if
-there are 3 days of latency between the last observation and the \code{as_of}
-date for a 4 day ahead forecast, the ahead used in practice is actually 7.
+observation results in a forecast \code{ahead} after the \code{forecast_date}.
+E.g. if there are 3 days of latency between the last observation and the
+\code{forecast_date} for a 4 day ahead forecast, the ahead used in practice
+is actually 7.
 \item \code{"locf"}: carries forward the last observed value(s) up to the forecast
 date. See the Vignette TODO for equivalents using other steps and more
 sophisticated methods of extrapolation.
@@ -51,49 +50,66 @@ lags are \code{c(0,7,14)} for data that is 3 days latent, the actual lags used
 become \code{c(3,10,17)}.
 }}
 
+\item{epi_keys_checked}{a character vector. A list of keys to group by before
+finding the \code{max_time_value}. The default value of this is
+\code{c("geo_value")}, but it can be any collection of \code{epi_keys}. 
Different +locations may have different latencies; to produce a forecast at every +location, we need to use the largest latency across every location; this +means taking \code{max_time_value} to be the minimum of the \code{max_time_value}s +for each \code{geo_value} (or whichever collection of keys are specified). If +\code{NULL} or an empty character vector, it will take the maximum across all +values, irrespective of any keys.} + \item{fixed_latency}{either a positive integer, or a labeled positive integer -vector. Cannot be set at the same time as \code{fixed_asof}. If non-\code{NULL}, -the amount to offset the ahead or lag by. If a single integer, this is used -for all columns; if a labeled vector, the labels must correspond to the -base column names. If \code{NULL}, the latency is the distance between the -\code{epi_df}'s \code{max_time_value} and either the \code{fixed_asof} or the \code{epi_df}'s -\code{as_of} field.} +vector. Cannot be set at the same time as \code{fixed_forecast_date}. If +non-\code{NULL}, the amount to offset the ahead or lag by. If a single integer, +this is used for all columns; if a labeled vector, the labels must +correspond to the base column names (before lags/aheads). If \code{NULL}, the +latency is the distance between the \code{epi_df}'s \code{max_time_value} and either +the \code{fixed_forecast_date} or the \code{epi_df}'s \code{as_of} field (the default for +\code{forecast_date}).} -\item{fixed_asof}{either a date of the same kind used in the \code{epi_df}, or -NULL. Cannot be set at the same time as \code{fixed_latency}. If a date, it -gives the date from which the forecast is actually occurring. If \code{NULL}, -the \code{as_of} is determined either from \code{fixed_latency} or automatically.} +\item{fixed_forecast_date}{either a date of the same kind used in the +\code{epi_df}, or \code{NULL}. Exclusive with \code{fixed_latency}. If a date, it gives +the date from which the forecast is actually occurring. If \code{NULL}, the +\code{forecast_date} is determined either via the \code{fixed_latency}, or is set to +the \code{epi_df}'s \code{as_of} value if \code{fixed_latency} is also \code{NULL}.} \item{default}{Determines what fills empty rows left by leading/lagging (defaults to NA).} \item{skip}{A logical. Should the step be skipped when the -recipe is baked by \code{\link[=bake]{bake()}}? While all operations are baked -when \code{\link[=prep]{prep()}} is run, some operations may not be able to be +recipe is baked by \code{\link[recipes:bake]{bake()}}? While all operations are baked +when \code{\link[recipes:prep]{prep()}} is run, some operations may not be able to be conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.} -\item{columns}{A character string of column names to be adjusted; these -should be the original columns, and not the derived ones} +\item{columns}{A character string of the selected variable names. This field +is a placeholder and will be populated once \code{\link[recipes:prep]{prep()}} is used.} -\item{id}{A unique identifier for the step} +\item{id}{A character string that is unique to this step to identify it.} } \value{ An updated version of \code{recipe} with the new step added to the sequence of any existing operations. } \description{ -In the standard case, the pipeline assumes that the last observation is also -the day from which the forecast is being made. 
\code{step_adjust_latency} uses the -\code{as_of} date of the \code{epi_df} as the \code{forecast_date}. This is most useful in -realtime and pseudo-prospective forecasting for data where there is some -delay between the day recorded and when that data is available. +In the standard case, the ARX models assume that the last observation is also +the day from which the forecast is being made. But if the data has latency, +then you may wish to adjust the predictors (lags) and/or the outcome (ahead) +to compensate. This allows the model to create bleeding-edge forecasts using +the lags actually observed rather than anticipated. \code{step_adjust_latency} +uses the \code{as_of} date of the \code{epi_df} as the \code{forecast_date}. This is most +useful in realtime and pseudo-prospective forecasting for data where there is +some delay between the day recorded and when that data is available. } \details{ The step assumes that the pipeline has already applied either -\code{step_epi_ahead} or \code{step_epi_lag} depending on the value of -\code{"method"}, and that \code{step_epi_naomit} has NOT been run. +\code{step_epi_ahead} or \code{step_epi_lag} depending on the value of \code{"method"}, +and that \code{step_epi_naomit} has NOT been run. By default, the latency will +be determined using the arguments below, but can be set explicitly using +either \code{fixed_latency} or \code{fixed_forecast_date}. The \code{prefix} and \code{id} arguments are unchangeable to ensure that the code runs properly and to avoid inconsistency with naming. For \code{step_epi_ahead}, they @@ -101,11 +117,23 @@ are always set to \code{"ahead_"} and \code{"epi_ahead"} respectively, while for \code{step_epi_lag}, they are set to \code{"lag_"} and \verb{"epi_lag"}, respectively. } \examples{ +jhu <- case_death_rate_subset \%>\% + dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) +# setting the `as_of` to something realistic +attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 + -r <- epi_recipe(case_death_rate_subset) \%>\% +r <- epi_recipe(jhu) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% - # step_adjust_latency(method = "extend_ahead") \%>\% + step_adjust_latency(method = "extend_ahead") \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) r + +jhu_fit <- epi_workflow() \%>\% + add_epi_recipe(r) \%>\% + add_model(linear_reg()) \%>\% + fit(data = jhu) +jhu_fit + } \seealso{ Other row operation steps: diff --git a/tests/testthat/_snaps/snapshots.md b/tests/testthat/_snaps/snapshots.md index 079bfa20..81523e74 100644 --- a/tests/testthat/_snaps/snapshots.md +++ b/tests/testthat/_snaps/snapshots.md @@ -1221,6 +1221,34 @@ 18993, 18993, 18993, 18993, 18993), class = "Date")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) +--- + + structure(list(geo_value = c("ca", "fl", "ga", "ny", "pa", "tx" + ), .pred = c(0.303244704017743, 0.531332853311082, 0.588827944685979, + 0.988690249216229, 0.794801997001639, 0.306895457225321), .pred_distn = structure(list( + structure(list(values = c("5%" = 0.136509784083987, "95%" = 0.469979623951498 + ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c("5%" = 0.364597933377326, "95%" = 0.698067773244837 + ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c("5%" = 0.422093024752224, "95%" = 0.755562864619735 + ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default",
"vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c("5%" = 0.821955329282474, "95%" = 1.15542516914998 + ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c("5%" = 0.628067077067883, "95%" = 0.961536916935394 + ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c("5%" = 0.140160537291566, "95%" = 0.473630377159077 + ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr"))), class = c("distribution", + "vctrs_vctr", "list")), forecast_date = structure(c(18997, 18997, + 18997, 18997, 18997, 18997), class = "Date"), target_date = structure(c(18998, + 18998, 18998, 18998, 18998, 18998), class = "Date")), row.names = c(NA, + -6L), class = c("tbl_df", "tbl", "data.frame")) + # arx_classifier snapshots structure(list(geo_value = c("ak", "al", "ar", "az", "ca", "co", diff --git a/tests/testthat/_snaps/utils-shift.md b/tests/testthat/_snaps/utils-shift.md deleted file mode 100644 index b7c5f064..00000000 --- a/tests/testthat/_snaps/utils-shift.md +++ /dev/null @@ -1,13 +0,0 @@ -# extend_ahead warns in case of extreme adjustment - - Code - adjust_latency(object, x_adjust_ahead) - Condition - Warning: - ! The ahead has been adjusted by 100, which is questionable for it's `time_type` of day - i input ahead: 7 - i shifted ahead: 107 - i max_time = 2021-07-19 -> as_of = 2021-10-27 - Output - [1] 107 - diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R index 9595b47b..a723b6ba 100644 --- a/tests/testthat/test-layer_add_forecast_date.R +++ b/tests/testthat/test-layer_add_forecast_date.R @@ -1,5 +1,7 @@ jhu <- case_death_rate_subset %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) +attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 + r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% @@ -67,6 +69,8 @@ test_that("Do not specify a forecast_date in `layer_add_forecast_date()`", { # p3 <- predict(wf3, latest), # "forecast_date is less than the most recent update date of the data." 
# ) expect_silent(p3 <- predict(wf3, latest)) expect_equal(ncol(p3), 4L) expect_s3_class(p3, "epi_df") @@ -75,6 +79,30 @@ expect_named(p3, c("geo_value", "time_value", ".pred", "forecast_date")) }) +test_that("`layer_add_forecast_date()` infers correct date when using `adjust_latency`", { + r_latent <- epi_recipe(jhu) %>% + step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% + step_epi_ahead(death_rate, ahead = 7) %>% + step_adjust_latency(method = "extend_ahead") %>% + step_naomit(all_predictors()) %>% + step_naomit(all_outcomes(), skip = TRUE) + frost_latent <- frosting() %>% + layer_predict() %>% + layer_add_forecast_date() %>% + layer_naomit(.pred) + wf_latent <- epi_workflow(r_latent, parsnip::linear_reg()) %>% + fit(jhu) %>% + add_frosting(frost_latent) + p_latent <- predict(wf_latent, latest) + expect_equal( + p_latent$forecast_date, + rep(as.Date("2022-01-03"), times = 3) + ) + expect_equal( + p_latent$forecast_date - p_latent$time_value, + as.difftime(rep(3, times = 3), units = "days") + ) +}) test_that("forecast date works for daily", { f <- frosting() %>% diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R index e5349839..b8bae584 100644 --- a/tests/testthat/test-layer_add_target_date.R +++ b/tests/testthat/test-layer_add_target_date.R @@ -6,6 +6,7 @@ r <- epi_recipe(jhu) %>% step_naomit(all_predictors()) %>% step_naomit(all_outcomes(), skip = TRUE) wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) +attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 latest <- jhu %>% dplyr::filter(time_value >= max(time_value) - 14) @@ -39,6 +40,27 @@ test_that("Use ahead + max time value from pre, fit, post", { expect_equal(p2$target_date, rep(as.Date("2022-01-07"), times = 3)) expect_named(p2, c("geo_value", "time_value", ".pred", "forecast_date", "target_date")) }) +test_that("latency adjust doesn't interfere with correct target date", { + r_latent <- epi_recipe(jhu) %>% + step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% + step_epi_ahead(death_rate, ahead = 7) %>% + step_adjust_latency(method = "extend_ahead") %>% + step_naomit(all_predictors()) %>% + step_naomit(all_outcomes(), skip = TRUE) + wf_latent <- epi_workflow(r_latent, parsnip::linear_reg()) %>% fit(jhu) + f_latent <- frosting() %>% + layer_predict() %>% + layer_add_target_date() %>% + layer_naomit(.pred) + wf_latent <- wf_latent %>% add_frosting(f_latent) + + expect_silent(p_latent <- predict(wf_latent, latest)) + expect_equal(ncol(p_latent), 4L) + expect_s3_class(p_latent, "epi_df") + expect_equal(nrow(p_latent), 3L) + expect_equal(p_latent$target_date, rep(as.Date("2022-01-10"), times = 3)) + expect_named(p_latent, c("geo_value", "time_value", ".pred", "target_date")) +}) test_that("Use ahead + specified forecast date", { f <- frosting() %>% diff --git a/tests/testthat/test-snapshots.R b/tests/testthat/test-snapshots.R index d624a4c2..25bc13bd 100644 --- a/tests/testthat/test-snapshots.R +++ b/tests/testthat/test-snapshots.R @@ -68,6 +68,32 @@ test_that("arx_forecaster snapshots", { ) ) expect_snapshot_tibble(arx2$predictions) + attributes(train_data)$metadata$as_of <- max(train_data$time_value) + 5 + arx3 <- arx_forecaster( + train_data, + "death_rate_7d_av", + c("death_rate_7d_av", "case_rate_7d_av"), + args_list = arx_args_list( + ahead = 1L, + adjust_latency = "extend_ahead" + ) + ) + # consistency check + expect_snapshot_tibble(arx3$predictions) + 
expect_equal( + arx3$predictions$target_date, + rep(attributes(train_data)$metadata$as_of + 1, times = 6) + ) + expect_equal( + arx3$predictions$target_date, + arx2$predictions$target_date + 5 + ) + expect_equal( + arx3$predictions$forecast_date, + arx2$predictions$forecast_date + 5 + ) + # not the same predictions + expect_false(all(arx2$predictions == arx3$predictions)) }) test_that("arx_classifier snapshots", { diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index f52888a6..fb02e740 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -6,7 +6,7 @@ x <- tibble( case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1, death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1 ) %>% - as_epi_df() + as_epi_df(as_of = as.POSIXct("2024-05-17")) max_time <- max(x$time_value) class(attributes(x)$metadata$as_of) as_of <- attributes(x)$metadata$as_of @@ -25,6 +25,15 @@ slm_fit <- function(recipe, data = x) { fit(data = data) } + +# making a toy dataset with lag between geo_values +x_lagged <- x +x_lagged$time_value <- x$time_value - 1 +x_lagged$geo_value <- "other" +x_lagged <- add_row(x, x_lagged) +attributes(x_lagged)$metadata$as_of <- testing_as_of + test_that("epi_adjust_latency correctly extends the lags", { r5 <- epi_recipe(x) %>% step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% @@ -34,7 +43,7 @@ test_that("epi_adjust_latency correctly extends the lags", { # the as_of on x is today's date, which is >970 days in the future # also, there's no data >970 days in the past, so it gets an error trying to # fit on no data - expect_error(expect_warning(fit5 <- slm_fit(r5), regexp = "The shift has been adjusted by 1024"), class = "simpleError") + expect_error(expect_warning(fit5 <- slm_fit(r5), regexp = "The shift has been adjusted by 1033"), class = "simpleError") # now trying with the as_of a reasonable distance in the future fit5 <- slm_fit(r5, data = real_x) @@ -158,3 +167,82 @@ test_that("epi_adjust_latency warns against removing NA's beforehand", { ) }) # todo check that epi_adjust_latency errors for nonsense `as_of`'s + + + +# todo make sure that `epi_keys_checked` works correctly for extra epi_keys +test_that("epi_adjust_latency correctly extends the lags with epi_keys_checked = NULL", { + r5 <- epi_recipe(x_lagged) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) %>% + step_adjust_latency(method = "extend_lags", epi_keys_checked = NULL) + # the as_of on x is today's date, which is >970 days in the future + # also, there's no data >970 days in the past, so it gets an error trying to + # fit on no data + expect_error(expect_warning(fit5 <- slm_fit(r5), regexp = "The shift has been adjusted by 1033"), class = "simpleError") + + # now trying with the as_of a reasonable distance in the future + fit5 <- slm_fit(r5, data = x_lagged) + expect_equal( + names(fit5$pre$mold$predictors), + c( + "lag_5_death_rate", "lag_11_death_rate", "lag_16_death_rate", + "lag_6_case_rate", "lag_10_case_rate" + ) + ) + latest <- get_test_data(r5, x_lagged) + pred <- predict(fit5, latest) + point_pred <- pred %>% filter(!is.na(.pred)) + expect_equal(nrow(point_pred), 1) + expect_equal(point_pred$time_value, as.Date(testing_as_of)) + + expect_equal( + names(fit5$pre$mold$outcomes), + glue::glue("ahead_{ahead}_death_rate") + ) + latest <- get_test_data(r5, x) + pred <- predict(fit5, latest) + actual_solutions <- 
pred %>% filter(!is.na(.pred)) + expect_equal(actual_solutions$time_value, testing_as_of) + + # should have six coefficients: five predictors plus the intercept + expect_equal(length(fit5$fit$fit$fit$coefficients), 6) + + # result should be equivalent to just immediately doing the adjusted lags by + # hand + hand_adjusted <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(5, 11, 16)) %>% + step_epi_lag(case_rate, lag = c(6, 10)) %>% + step_epi_ahead(death_rate, ahead = ahead) + fit_hand_adj <- slm_fit(hand_adjusted, data = real_x) + expect_equal( + fit5$fit$fit$fit$coefficients, + fit_hand_adj$fit$fit$fit$coefficients + ) +}) + +test_that("`step_adjust_latency` only allows one instance of itself", {}) + +test_that("`step_adjust_latency` only uses the columns specified in the `...`", { + r5 <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) %>% + step_adjust_latency(case_rate, method = "extend_lags") + + fit5 <- slm_fit(r5, data = real_x) + expect_equal(names(fit5$fit$fit$fit$coefficients), c("(Intercept)", "lag_0_death_rate", "lag_6_death_rate", "lag_11_death_rate", "lag_6_case_rate", "lag_10_case_rate")) }) + +test_that("setting fixed_* works for `step_adjust_latency`", {}) + +test_that("printing step_adjust_latency results in expected output", { + r5 <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) %>% + step_adjust_latency(case_rate, method = "extend_lags") + expect_snapshot(r5) }) diff --git a/tests/testthat/test-utils-shift.R b/tests/testthat/test-utils-shift.R index 53788f8b..e69de29b 100644 --- a/tests/testthat/test-utils-shift.R +++ b/tests/testthat/test-utils-shift.R @@ -1,30 +0,0 @@ -time_range <- as.Date("2021-01-01") + 0:199 -x_adjust_ahead <- tibble( - geo_value = rep("place", 200), - time_value = time_range, - case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1, - death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1 -) %>% - as_epi_df(as_of = max(time_range) + 3) -# confirm the delay is right - -test_that("adjust_latency extend_ahead works", { - # testing that POSIXct converts correctly (as well as basic types) - expect_equal( - attributes(x_adjust_ahead)$metadata$as_of - max(x_adjust_ahead$time_value), - as.difftime(3, units = "days") - ) - object <- list(latency_adjustment = "extend_ahead", ahead = 7) - expect_no_error(adjusted_ahead <- adjust_latency(object, x_adjust_ahead)) - expect_type(adjusted_ahead, "integer") - expect_equal(adjusted_ahead, 3 + 7) -}) - -test_that("extend_ahead warns in case of extreme adjustment", { - # warns if the ahead is relatively small - attributes(x_adjust_ahead)$metadata$as_of <- - max(x_adjust_ahead$time_value) + 100 - object <- list(latency_adjustment = "extend_ahead", ahead = 7) - attributes(x_adjust_ahead)$metadata$time_type - expect_snapshot(adjust_latency(object, x_adjust_ahead)) -}) diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R index 1b8f93b9..b8ec678c 100644 --- a/tests/testthat/test-utils_latency.R +++ b/tests/testthat/test-utils_latency.R @@ -1,13 +1,16 @@ -time_values <- as.Date("2021-01-01") + 0:199 +time_values <- as.Date("2021-01-01") + +floor(seq(0, 100, by = .5))[1:200] as_of <- max(time_values) + 5 max_time <- max(time_values) old_data <- tibble( - geo_value = rep("place", 200), - time_value = as.Date("2021-01-01") + 0:199, + geo_value = rep(c("place1", 
"place2"), 100), + time_value = as.Date("2021-01-01") + +floor(seq(0, 100, by = .5))[1:200], case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1, tmp_death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1 ) %>% + # place2 is slightly more recent than place1 + mutate(time_value = as.Date(ifelse(geo_value == "place2", time_value + 1, time_value))) %>% as_epi_df(as_of = as_of) +old_data keys <- c("time_value", "geo_value") old_data <- old_data %>% full_join(epi_shift_single( @@ -54,17 +57,19 @@ test_that("construct_shift_tibble constructs the right tibble", { }) test_that("get_latency works", { - expect_equal(get_latency(modified_data, as_of, "lag_7_death_rate", 7, 1), 4) - expect_equal(get_latency(modified_data, as_of, "lag_3_case_rate", 3, 1), 5) + expect_equal(get_latency(modified_data, as_of, "lag_7_death_rate", 7, 1, "geo_value"), 4) + expect_equal(get_latency(modified_data, as_of, "lag_3_case_rate", 3, 1, "geo_value"), 5) # get_latency does't check the shift_amount - expect_equal(get_latency(modified_data, as_of, "lag_3_case_rate", 4, 1), 6) + expect_equal(get_latency(modified_data, as_of, "lag_3_case_rate", 4, 1, "geo_value"), 6) # ahead works correctly - expect_equal(get_latency(modified_data, as_of, "ahead_4_case_rate", 4, -1), -5) + expect_equal(get_latency(modified_data, as_of, "ahead_4_case_rate", 4, -1, "geo_value"), -5) # setting the wrong sign doubles the shift and gets the sign wrong - expect_equal(get_latency(modified_data, as_of, "ahead_4_case_rate", 4, 1), 5 + 4 * 2) + expect_equal(get_latency(modified_data, as_of, "ahead_4_case_rate", 4, 1, "geo_value"), 5 + 4 * 2) + # minimizing over everything decreases the latency + expect_equal(get_latency(modified_data, as_of, "lag_7_death_rate", 7, 1, NULL), 3) }) -test_that("get_latency infers max_time to be the minimum `max time` across the columns", {}) +test_that("get_latency infers max_time to be the minimum `max time` across the epi_keys", {}) test_that("get_asof works", { info <- tribble( @@ -75,24 +80,29 @@ test_that("get_asof works", { "death_rate", "numeric", "raw", "original", "not_real", "numeric", "predictor", "derived" ) - expect_equal(set_asof(modified_data, info), as_of) + expect_equal(set_forecast_date(modified_data, info, "geo_value"), as_of) + expect_equal(set_forecast_date(modified_data, info, ""), as_of) + expect_equal(set_forecast_date(modified_data, info, NULL), as_of) }) test_that("get_latent_column_tibble infers latency and works correctly", { info <- tibble(variable = c("lag_3_case_rate", "lag_7_death_rate", "ahead_4_case_rate"), type = "numeric", role = c(rep("predictor", 2), "outcome"), source = "derived") case_lag <- get_latent_column_tibble( - shift_cols[1, ], modified_data, as_of, NULL, 1, info + shift_cols[1, ], modified_data, as_of, NULL, 1, info, + epi_keys_checked = "geo_value" ) expect_equal(case_lag, all_shift_cols[1, ]) death_lag <- get_latent_column_tibble( - shift_cols[2, ], modified_data, as_of, NULL, 1, info + shift_cols[2, ], modified_data, as_of, NULL, 1, info, + epi_keys_checked = "geo_value" ) expect_equal(death_lag, all_shift_cols[2, ]) both_lag <- get_latent_column_tibble( - shift_cols, modified_data, as_of, NULL, 1, info + shift_cols, modified_data, as_of, NULL, 1, info, + epi_keys_checked = "geo_value" ) expect_equal(both_lag, all_shift_cols[1:2, ]) }) @@ -123,7 +133,7 @@ test_that("get_latent_column_tibble assigns given latencies", { ahead_shift_cols <- construct_shift_tibble(c("case_rate"), test_recipe, "step_epi_ahead", "ahead") case_ahead <- get_latent_column_tibble( - 
ahead_shift_cols, modified_data, as_of, NULL, -1, info + ahead_shift_cols, modified_data, as_of, NULL, -1, info, "geo_value" ) expect_equal(case_ahead, all_shift_cols[3, ]) }) @@ -156,11 +166,52 @@ test_that("extend_either works", { epi_shift_single(old_data, "case_rate", -9, "ahead_9_case_rate", keys), by = keys ) %>% - arrange(time_value) + dplyr::bind_rows(tibble( + geo_value = c("place1", "place2"), + time_value = as.Date(c("2021-04-23", "2021-04-24")), case_rate = c(NA, NA), death_rate = c(NA, NA), + lag_8_case_rate = c(NA, NA), lag_11_death_rate = c(NA, NA), ahead_9_case_rate = c(NA, NA) + )) %>% + arrange(time_value, geo_value) expect_equal( - extend_either(modified_data, all_shift_cols, keys) %>% arrange(time_value), + extend_either(modified_data, all_shift_cols, keys) %>% arrange(time_value, geo_value), expected_post_shift ) +}) + + + + + +time_range <- as.Date("2021-01-01") + 0:199 +x_adjust_ahead <- tibble( + geo_value = rep("place", 200), + time_value = time_range, + case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1, + death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1 +) %>% + as_epi_df(as_of = max(time_range) + 3) +# confirm the delay is right + +test_that("adjust_latency extend_ahead works", { + # testing that POSIXct converts correctly (as well as basic types) + expect_equal( + attributes(x_adjust_ahead)$metadata$as_of - max(x_adjust_ahead$time_value), + as.difftime(3, units = "days") + ) + object <- list(latency_adjustment = "extend_ahead", ahead = 7) + expect_no_error(adjusted_ahead <- adjust_latency(object, x_adjust_ahead)) + expect_type(adjusted_ahead, "integer") + expect_equal(adjusted_ahead, 3 + 7) +}) + +test_that("extend_ahead warns in case of extreme adjustment", { + # warns if the ahead is relatively small + attributes(x_adjust_ahead)$metadata$as_of <- + max(x_adjust_ahead$time_value) + 100 + object <- list(latency_adjustment = "extend_ahead", ahead = 7) + testthat::expect_warning(adjust_latency(object, x_adjust_ahead), regexp = "The ahead has been adjusted by 100") }) # todo case where somehow columns of different roles are selected diff --git a/vignettes/articles/symptom-surveys.Rmd b/vignettes/articles/symptom-surveys.Rmd index 9558b80c..2f9db4b2 100644 --- a/vignettes/articles/symptom-surveys.Rmd +++ b/vignettes/articles/symptom-surveys.Rmd @@ -342,7 +342,7 @@ res_err4 <- res_all4 %>% knitr::kable( res_err4 %>% group_by(model, lead) %>% - summarize(err = median(err), n = length(unique(forecast_date))) %>% + summarise(err = median(err), n = length(unique(forecast_date))) %>% arrange(lead) %>% ungroup() %>% rename( "Model" = model, "Median scaled error" = err, @@ -382,7 +382,7 @@ res_dif4 <- res_all4 %>% knitr::kable( res_dif4 %>% group_by(model, lead) %>% - summarize(p = binom.test( + summarise(p = binom.test( x = sum(diff > 0, na.rm = TRUE), n = n(), alt = "greater" )$p.val) %>% @@ -401,7 +401,7 @@ ggplot_colors <- c("#FC4E07", "#00AFBB", "#E7B800") ggplot(res_dif4 %>% group_by(model, lead, forecast_date) %>% - summarize(p = binom.test( + summarise(p = binom.test( x = sum(diff > 0, na.rm = TRUE), n = n(), alt = "greater" )$p.val) %>% @@ -540,7 +540,7 @@ knitr::kable( res_err2 %>% select(-ends_with("diff")) %>% group_by(model, lead) %>% - summarize(err = median(err), n = length(unique(forecast_date))) %>% + summarise(err = median(err), n = length(unique(forecast_date))) %>% arrange(lead) %>% 
ungroup() %>% rename( "Model" = model, "Median scaled error" = err, @@ -563,7 +563,7 @@ Thanks to the extended length of the test period, we can also plot the trajector ggplot( res_err2 %>% group_by(model, lead, forecast_date) %>% - summarize(err = median(err)) %>% ungroup(), + summarise(err = median(err)) %>% ungroup(), aes(x = forecast_date, y = err) ) + geom_line(aes(color = model)) + @@ -594,7 +594,7 @@ res_dif2 <- res_all2 %>% knitr::kable( res_dif2 %>% group_by(model, lead) %>% - summarize(p = binom.test( + summarise(p = binom.test( x = sum(diff > 0, na.rm = TRUE), n = n(), alt = "greater" )$p.val) %>% @@ -610,7 +610,7 @@ If we stratify and recompute p-values by forecast date, the bulk of p-values are ```{r} ggplot(res_dif2 %>% group_by(model, lead, forecast_date) %>% - summarize(p = binom.test( + summarise(p = binom.test( x = sum(diff > 0, na.rm = TRUE), n = n(), alt = "greater" )$p.val) %>% @@ -653,7 +653,7 @@ err_by_lead <- res %>% ) %>% mutate(model = factor(model, labels = model_names[1:2])) %>% group_by(model, lead) %>% - summarize(err = median(err)) %>% + summarise(err = median(err)) %>% ungroup() ggplot(err_by_lead, aes(x = lead, y = err)) +
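
A note for reviewers on the `epi_keys_checked` semantics documented in `man/step_adjust_latency.Rd` above: the sketch below illustrates why the effective `max_time_value` is the minimum of the per-key maxima. This is illustrative only, not the package's internal implementation; the tibble `toy` and the names `effective_max_time` and `latency` are invented for this example.

```r
library(dplyr)

# Toy data in the shape of the tests above: "other" reports one day later
# than "place", so its most recent time_value is one day earlier.
toy <- tibble(
  geo_value  = rep(c("place", "other"), each = 3),
  time_value = as.Date("2021-01-01") + c(0:2, -1:1)
)
forecast_date <- as.Date("2021-01-06")

# Group by the checked keys, take each key's max time_value, then take the
# *minimum* of those maxima, so the latency covers the slowest location.
effective_max_time <- toy %>%
  group_by(geo_value) %>%
  summarise(max_time = max(time_value), .groups = "drop") %>%
  pull(max_time) %>%
  min()

latency <- as.integer(forecast_date - effective_max_time)
latency
#> [1] 4

# With this latency, "extend_lags" would turn lags c(0, 7, 14) into
# c(4, 11, 18), and "extend_ahead" would turn an ahead of 7 into 11; with
# epi_keys_checked = NULL the latency would instead be 3 ("place"'s maximum).
```

Using the minimum of the per-key maxima is what lets a single fitted workflow produce a prediction for every location, at the cost of slightly staler lags for the faster reporters.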