diff --git a/NAMESPACE b/NAMESPACE index 71e08fbf9..ea091004d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,8 +3,7 @@ S3method(apply_frosting,default) S3method(apply_frosting,epi_workflow) S3method(augment,epi_workflow) -S3method(bake,step_epi_ahead) -S3method(bake,step_epi_lag) +S3method(bake,step_epi_shift) S3method(detect_layer,frosting) S3method(detect_layer,workflow) S3method(epi_keys,default) @@ -17,12 +16,12 @@ S3method(extract_layers,frosting) S3method(extract_layers,workflow) S3method(predict,epi_workflow) S3method(prep,epi_recipe) -S3method(prep,step_epi_ahead) -S3method(prep,step_epi_lag) +S3method(prep,step_epi_shift) S3method(print,epi_workflow) S3method(print,frosting) S3method(print,step_epi_ahead) S3method(print,step_epi_lag) +S3method(print,step_epi_shift) S3method(refresh_blueprint,default_epi_recipe_blueprint) S3method(run_mold,default_epi_recipe_blueprint) S3method(slather,layer_naomit) @@ -71,6 +70,7 @@ export(smooth_arx_forecaster) export(step_epi_ahead) export(step_epi_lag) export(step_epi_naomit) +export(step_epi_shift) export(validate_layer) import(recipes) importFrom(generics,augment) diff --git a/R/epi_ahead.R b/R/epi_ahead.R deleted file mode 100644 index 3434b7d93..000000000 --- a/R/epi_ahead.R +++ /dev/null @@ -1,163 +0,0 @@ -#' Create a leading outcome -#' -#' `step_epi_ahead` creates a *specification* of a recipe step that -#' will add new columns of leading data. Leading data will -#' by default include NA values where the lag was induced. -#' These can be removed with [step_naomit()], or you may -#' specify an alternative filler value with the `default` -#' argument. -#' -#' @param recipe A recipe object. The step will be added to the -#' sequence of operations for this recipe. -#' @param ... One or more selector functions to choose variables -#' for this step. See [selections()] for more details. -#' @param role For model terms created by this step, what analysis role should -#' they be assigned? -#' @param trained A logical to indicate if the quantities for -#' preprocessing have been estimated. -#' @param ahead A vector of positive integers. Each specified column will be -#' lead for each value in the vector. -#' @param prefix A prefix for generated column names, default to "ahead_". -#' @param default Determines what fills empty rows -#' left by leading/lagging (defaults to NA). -#' @param keys A character vector of the keys in an epi_df -#' @param columns A character string of variable names that will -#' be populated (eventually) by the `terms` argument. -#' @param skip A logical. Should the step be skipped when the -#' recipe is baked by [bake()]? While all operations are baked -#' when [prep()] is run, some operations may not be able to be -#' conducted on new data (e.g. processing the outcome variable(s)). -#' Care should be taken when using `skip = TRUE` as it may affect -#' the computations for subsequent operations. -#' @param id A character string that is unique to this step to identify it. -#' @template step-return -#' -#' @details The step assumes that the data are already _in the proper sequential -#' order_ for leading. -#' -#' @family row operation steps -#' @export -#' -#' @examples -#' tib <- tibble::tibble( -#' x = 1:5, y = 1:5, -#' time_value = seq(as.Date("2020-01-01"), by = 1, length.out = 5), -#' geo_value = "ca" -#' ) %>% epiprocess::as_epi_df() -#' -#' library(recipes) -#' epi_recipe(y ~ x, data = tib) %>% -#' step_epi_lag(x, lag = 2:3) %>% -#' step_epi_ahead(y, ahead = 1) %>% -#' prep(tib) %>% -#' bake(tib) -step_epi_ahead <- - function(recipe, - ..., - role = "outcome", - trained = FALSE, - ahead = 1, - prefix = "ahead_", - default = NA, - keys = epi_keys(recipe), - columns = NULL, - skip = FALSE, - id = rand_id("epi_ahead")) { - add_step( - recipe, - step_epi_ahead_new( - terms = dplyr::enquos(...), - role = role, - trained = trained, - ahead = ahead, - prefix = prefix, - default = default, - keys = keys, - columns = columns, - skip = skip, - id = id - ) - ) - } - -step_epi_ahead_new <- - function(terms, role, trained, ahead, prefix, default, keys, - columns, skip, id) { - step( - subclass = "epi_ahead", - terms = terms, - role = role, - trained = trained, - ahead = ahead, - prefix = prefix, - default = default, - keys = keys, - columns = columns, - skip = skip, - id = id - ) - } - -#' @export -prep.step_epi_ahead <- function(x, training, info = NULL, ...) { - step_epi_ahead_new( - terms = x$terms, - role = x$role, - trained = TRUE, - ahead = x$ahead, - prefix = x$prefix, - default = x$default, - keys = x$keys, - columns = recipes_eval_select(x$terms, training, info), - skip = x$skip, - id = x$id - ) -} - -#' @export -bake.step_epi_ahead <- function(object, new_data, ...) { - if (!all(object$ahead == as.integer(object$ahead))) { - rlang::abort("step_epi_ahead requires 'ahead' argument to be integer valued.") - } - - grid <- tidyr::expand_grid( - col = object$columns, lag_val = -object$ahead) %>% - dplyr::mutate( - ahead_val = -lag_val, - newname = glue::glue("{object$prefix}{ahead_val}_{col}") - ) %>% - dplyr::select(-ahead_val) - - ## ensure no name clashes - new_data_names <- colnames(new_data) - intersection <- new_data_names %in% grid$newname - if (any(intersection)) { - rlang::abort( - paste0("Name collision occured in `", class(object)[1], - "`. The following variable names already exists: ", - paste0(new_data_names[intersection], collapse = ", "), - ".")) - } - - ok <- object$keys - lagged <- purrr::reduce( - purrr::pmap(grid, epi_shift_single, x = new_data, key_cols = ok), - dplyr::full_join, - by = ok - ) - - dplyr::full_join(new_data, lagged, by = ok) %>% - dplyr::group_by(dplyr::across(dplyr::all_of(ok[-1]))) %>% - dplyr::arrange(time_value) %>% - dplyr::ungroup() - -} - -#' @export -print.step_epi_ahead <- - function(x, width = max(20, options()$width - 30), ...) { - ## TODO add printing of the lags - title <- "Leading " - recipes::print_step(x$columns, x$terms, x$trained, title, width) - invisible(x) - } diff --git a/R/epi_lag.R b/R/epi_lag.R deleted file mode 100644 index b7dc28b2a..000000000 --- a/R/epi_lag.R +++ /dev/null @@ -1,123 +0,0 @@ -#' Create a lagged predictor -#' -#' `step_epi_lag` creates a *specification* of a recipe step that -#' will add new columns of lagged data. Lagged data will -#' by default include NA values where the lag was induced. -#' These can be removed with [step_naomit()], or you may -#' specify an alternative filler value with the `default` -#' argument. -#' -#' @param lag A vector of positive integers. Each specified column will be -#' lagged for each value in the vector. -#' @template step-return -#' -#' @details The step assumes that the data are already _in the proper sequential -#' order_ for lagging. -#' -#' @family row operation steps -#' @export -#' @rdname step_epi_ahead -step_epi_lag <- - function(recipe, - ..., - role = "predictor", - trained = FALSE, - lag = 1, - prefix = "lag_", - default = NA, - keys = epi_keys(recipe), - columns = NULL, - skip = FALSE, - id = rand_id("epi_lag")) { - add_step( - recipe, - step_epi_lag_new( - terms = dplyr::enquos(...), - role = role, - trained = trained, - lag = lag, - prefix = prefix, - default = default, - keys = keys, - columns = columns, - skip = skip, - id = id - ) - ) - } - -step_epi_lag_new <- - function(terms, role, trained, lag, prefix, default, keys, - columns, skip, id) { - step( - subclass = "epi_lag", - terms = terms, - role = role, - trained = trained, - lag = lag, - prefix = prefix, - default = default, - keys = keys, - columns = columns, - skip = skip, - id = id - ) - } - -#' @export -prep.step_epi_lag <- function(x, training, info = NULL, ...) { - step_epi_lag_new( - terms = x$terms, - role = x$role, - trained = TRUE, - lag = x$lag, - prefix = x$prefix, - default = x$default, - keys = x$keys, - columns = recipes_eval_select(x$terms, training, info), - skip = x$skip, - id = x$id - ) -} - -#' @export -bake.step_epi_lag <- function(object, new_data, ...) { - if (!all(object$lag == as.integer(object$lag))) { - rlang::abort("step_epi_lag requires 'lag' argument to be integer valued.") - } - - grid <- tidyr::expand_grid(col = object$columns, lag_val = object$lag) %>% - dplyr::mutate(newname = glue::glue("{object$prefix}{lag_val}_{col}")) - - ## ensure no name clashes - new_data_names <- colnames(new_data) - intersection <- new_data_names %in% grid$newname - if (any(intersection)) { - rlang::abort( - paste0("Name collision occured in `", class(object)[1], - "`. The following variable names already exists: ", - paste0(new_data_names[intersection], collapse = ", "), - ".")) - } - ok <- object$keys - lagged <- purrr::reduce( - purrr::pmap(grid, epi_shift_single, x = new_data, key_cols = ok), - dplyr::full_join, - by = ok - ) - - dplyr::full_join(new_data, lagged, by = ok) %>% - dplyr::group_by(dplyr::across(dplyr::all_of(ok[-1]))) %>% - dplyr::arrange(time_value) %>% - dplyr::ungroup() - -} - -#' @export -print.step_epi_lag <- - function(x, width = max(20, options()$width - 30), ...) { - ## TODO add printing of the lags - title <- "Lagging " - recipes::print_step(x$columns, x$terms, x$trained, title, width) - invisible(x) - } diff --git a/R/epi_shift.R b/R/epi_shift.R index 022058dd7..c61137706 100644 --- a/R/epi_shift.R +++ b/R/epi_shift.R @@ -2,8 +2,8 @@ #' #' This is a lower-level function. As such it performs no error checking. #' -#' @param x Data frame. Variables to lag -#' @param lags List. Each list element is a vector of lags. +#' @param x Data frame. Variables to shift +#' @param shifts List. Each list element is a vector of shifts. #' Negative values produce leads. The list should have the same #' length as the number of columns in `x`. #' @param time_value Vector. Same length as `x` giving time stamps. @@ -11,17 +11,17 @@ #' @param out_name Chr. The output list will use this as a prefix. #' #' @return a list of tibbles -epi_shift <- function(x, lags, time_value, keys = NULL, out_name = "x") { +epi_shift <- function(x, shifts, time_value, keys = NULL, out_name = "x") { if (!is.data.frame(x)) x <- data.frame(x) if (is.null(keys)) keys <- rep("empty", nrow(x)) p_in = ncol(x) - out_list <- tibble::tibble(i = 1:p_in, lag = lags) %>% - tidyr::unchop(lag) %>% # what is chop + out_list <- tibble::tibble(i = 1:p_in, shift = shifts) %>% + tidyr::unchop(shift) %>% # what is chop dplyr::mutate(name = paste0(out_name, 1:nrow(.))) %>% - # One list element for each lagged feature - purrr::pmap(function(i, lag, name) { + # One list element for each shifted feature + purrr::pmap(function(i, shift, name) { tibble(keys, - time_value = time_value + lag, # Shift back + time_value = time_value + shift, # Shift back !!name := x[[i]]) }) if (is.data.frame(keys)) common_names <- c(names(keys), "time_value") @@ -30,9 +30,9 @@ epi_shift <- function(x, lags, time_value, keys = NULL, out_name = "x") { purrr::reduce(out_list, dplyr::full_join, by = common_names) } -epi_shift_single <- function(x, col, lag_val, newname, key_cols) { +epi_shift_single <- function(x, col, shift_val, newname, key_cols) { x %>% dplyr::select(tidyselect::all_of(c(key_cols, col))) %>% - dplyr::mutate(time_value = time_value + lag_val) %>% + dplyr::mutate(time_value = time_value + shift_val) %>% dplyr::rename(!!newname := col) } diff --git a/R/get_test_data.R b/R/get_test_data.R index 59a163aab..5cdab7273 100644 --- a/R/get_test_data.R +++ b/R/get_test_data.R @@ -28,7 +28,7 @@ get_test_data <- function(recipe, x){ } ## CHECK if it is epi_df? - max_lags <- max(map_dbl(recipe$steps, ~ max(.x$lag %||% 0))) + max_lags <- max(map_dbl(recipe$steps, ~ max(.x$shift %||% 0))) # CHECK: Return NA if insufficient training data if (dplyr::n_distinct(x$time_value) < max_lags) { diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R new file mode 100644 index 000000000..b1fac6501 --- /dev/null +++ b/R/step_epi_shift.R @@ -0,0 +1,277 @@ +#' Create a shifted predictor +#' +#' `step_epi_lag` and `step_epi_ahead` create a *specification* of a recipe step +#' that will add new columns of shifted data. The former will created a lag +#' column, while the latter will create a lead column. Shifted data will +#' by default include NA values where the shift was induced. +#' These can be properly removed with [step_epi_naomit()], or you may +#' specify an alternative filler value with the `default` +#' argument. +#' +#' `step_epi_shift` is more general, accomodating both leads and lags +#' simultaneously. +#' +#' @param recipe A recipe object. The step will be added to the +#' sequence of operations for this recipe. +#' @param ... One or more selector functions to choose variables +#' for this step. See [selections()] for more details. +#' @param role For model terms created by this step, what analysis role should +#' they be assigned? +#' @param trained A logical to indicate if the quantities for +#' preprocessing have been estimated. +#' @param lag,ahead,shift A vector of integers. Each specified column will +#' be the lag or lead for each value in the vector. Lag integers must be +#' nonnegative, while ahead integers must be positive. +#' @param prefix A prefix to indicate what type of variable this is +#' @param default Determines what fills empty rows +#' left by leading/lagging (defaults to NA). +#' @param keys A character vector of the keys in an epi_df +#' @param columns A character string of variable names that will +#' be populated (eventually) by the `terms` argument. +#' @param skip A logical. Should the step be skipped when the +#' recipe is baked by [bake()]? While all operations are baked +#' when [prep()] is run, some operations may not be able to be +#' conducted on new data (e.g. processing the outcome variable(s)). +#' Care should be taken when using `skip = TRUE` as it may affect +#' the computations for subsequent operations. +#' @param id A unique identifier for the step +#' @param intended_direction used by `step_epi_shift` to determine whether +#' leading or lagging was intended. +#' @template step-return +#' +#' @details The step assumes that the data are already _in the proper sequential +#' order_ for shifting. +#' +#' The `prefix` and `id` arguments are unchangeable to ensure that the code runs +#' properly and to avoid inconsistency with naming. For `step_epi_ahead`, they +#' are always set to `"ahead_"` and `"epi_ahead"` respectively, while for +#' `step_epi_lag`, they are set to `"lag_"` and `"epi_lag`, respectively. +#' +#' @family row operation steps +#' @rdname step_epi_shift +#' @export +#' @examples +#' r <- epi_recipe(case_death_rate_subset) %>% +#' step_epi_ahead(death_rate, ahead = 7) %>% +#' step_epi_lag(death_rate, lag = c(0,7,14)) +#' r +#' +#' r <- epi_recipe(case_death_rate_subset) %>% +#' step_epi_ahead(death_rate, ahead = 7) %>% +#' step_epi_lag(death_rate, case_rate, lag = c(0,7,14)) %>% +#' # pretty odd, but possible +#' step_epi_shift(case_rate, shift = c(-5, 5), role = "predictor") +#' r +step_epi_lag <- + function(recipe, + ..., + role = "predictor", + trained = FALSE, + lag = 1, + prefix = "lag_", + default = NA, + keys = epi_keys(recipe), + columns = NULL, + skip = FALSE, + id = rand_id("epi_lag")) { + stopifnot("Lag values must be nonnegative integers" = + all(lag>=0 & lag == as.integer(lag))) + + step_epi_shift(recipe, + ..., + role = role, + trained = trained, + shift = lag, + prefix = prefix, + default = default, + keys = keys, + columns = columns, + skip = skip, + id = id, + intended_direction = "epi_lag" + ) + } + +#' Create a shifted predictor +#' +#' @family row operation steps +#' @rdname step_epi_shift +#' @export +step_epi_ahead <- + function(recipe, + ..., + role = "outcome", + trained = FALSE, + ahead = 1, + prefix = "ahead_", + default = NA, + keys = epi_keys(recipe), + columns = NULL, + skip = FALSE, + id = rand_id("epi_ahead")) { + + stopifnot("Ahead values must be nonnegative integers" = + all(ahead>=0 & ahead == as.integer(ahead))) + + step_epi_shift(recipe, + ..., + role = role, + trained = trained, + shift = -ahead, + prefix = prefix, + default = default, + keys = keys, + columns = columns, + skip = skip, + id = id, + intended_direction = "epi_ahead" + ) + } + +#' @family row operation steps +#' @rdname step_epi_shift +#' @export +step_epi_shift <- + function(recipe, + ..., + role, + trained = FALSE, + shift = 0, + prefix = "shift_", + default = NA, + keys = epi_keys(recipe), + columns = NULL, + skip = FALSE, + id = rand_id("epi_shift"), + intended_direction = NULL) { + add_step( + recipe, + step_epi_shift_new( + terms = dplyr::enquos(...), + role = role, + trained = trained, + shift = shift, + prefix = prefix, + default = default, + keys = keys, + columns = columns, + skip = skip, + id = id, + intended_direction = intended_direction + ) + ) + } + +step_epi_shift_new <- + function(terms, role, trained, shift, prefix, default, keys, + columns, skip, id, intended_direction) { + step( + subclass = c(intended_direction, "epi_shift"), + terms = terms, + role = role, + trained = trained, + shift = shift, + prefix = prefix, + default = default, + keys = keys, + columns = columns, + skip = skip, + id = id + ) + } + +#' @export +prep.step_epi_shift <- function(x, training, info = NULL, ...) { + step_epi_shift_new( + terms = x$terms, + role = x$role, + trained = TRUE, + shift = x$shift, + prefix = x$prefix, + default = x$default, + keys = x$keys, + columns = recipes_eval_select(x$terms, training, info), + skip = x$skip, + id = x$id, + intended_direction = x$intended_direction + ) +} + +#' @export +bake.step_epi_shift <- function(object, new_data, ...) { + grid <- tidyr::expand_grid(col = object$columns, shift_val = object$shift) %>% + dplyr::mutate(newname = glue::glue( + paste0("{object$prefix}","{abs(shift_val)}","_{col}") + ) + ) + ## ensure no name clashes + new_data_names <- colnames(new_data) + intersection <- new_data_names %in% grid$newname + if (any(intersection)) { + rlang::abort( + paste0("Name collision occured in `", class(object)[1], + "`. The following variable names already exists: ", + paste0(new_data_names[intersection], collapse = ", "), + ".")) + } + ok <- object$keys + shifted <- purrr::reduce( + purrr::pmap(grid, epi_shift_single, x = new_data, key_cols = ok), + dplyr::full_join, + by = ok + ) + + dplyr::full_join(new_data, shifted, by = ok) %>% + dplyr::group_by(dplyr::across(dplyr::all_of(ok[-1]))) %>% + dplyr::arrange(time_value) %>% + dplyr::ungroup() + +} + +#' @export +print.step_epi_lag <- function(x, ...) { + NextMethod(title = "Lagging ", shift = abs(x$shift)) +} + +#' @export +print.step_epi_ahead <- function(x, ...) { + NextMethod(title = "Leading ", shift = abs(x$shift)) +} + +#' @export +print.step_epi_shift <- function(x, + width = max(20, options()$width - 30), + title = "Shifting ", + shift = x$shift, + ...) { + print_step_shift(x$columns, x$terms, x$trained, title, width, shift = shift) + invisible(x) +} + + +print_step_shift <- function( + tr_obj = NULL, untr_obj = NULL, trained = FALSE, title = NULL, + width = max(20, options()$width - 30), case_weights = NULL, shift = NULL) { + + cat(title) + if (trained) txt <- recipes::format_ch_vec(tr_obj, width = width) + else txt <- recipes::format_selectors(untr_obj, width = width) + if (length(txt) == 0L) txt <- "" + cat(txt) + if (trained) { + if (is.null(case_weights)) cat(" [trained]") + else { + case_weights_ind <- ifelse(case_weights, "weighted", + "ignored weights") + trained_txt <- paste(case_weights_ind, "trained", + sep = ", ") + trained_txt <- paste0(" [", trained_txt, "]") + cat(trained_txt) + } + } + cat(" by ") + txt <- recipes::format_ch_vec(shift) + cat(txt) + cat("\n") + invisible(NULL) +} diff --git a/man/epi_shift.Rd b/man/epi_shift.Rd index 48cc5f6e6..058505faa 100644 --- a/man/epi_shift.Rd +++ b/man/epi_shift.Rd @@ -4,12 +4,12 @@ \alias{epi_shift} \title{Shift predictors while maintaining grouping and time_value ordering} \usage{ -epi_shift(x, lags, time_value, keys = NULL, out_name = "x") +epi_shift(x, shifts, time_value, keys = NULL, out_name = "x") } \arguments{ -\item{x}{Data frame. Variables to lag} +\item{x}{Data frame. Variables to shift} -\item{lags}{List. Each list element is a vector of lags. +\item{shifts}{List. Each list element is a vector of shifts. Negative values produce leads. The list should have the same length as the number of columns in \code{x}.} diff --git a/man/step_epi_ahead.Rd b/man/step_epi_ahead.Rd deleted file mode 100644 index 006c224c3..000000000 --- a/man/step_epi_ahead.Rd +++ /dev/null @@ -1,117 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/epi_ahead.R, R/epi_lag.R -\name{step_epi_ahead} -\alias{step_epi_ahead} -\alias{step_epi_lag} -\title{Create a leading outcome} -\usage{ -step_epi_ahead( - recipe, - ..., - role = "outcome", - trained = FALSE, - ahead = 1, - prefix = "ahead_", - default = NA, - keys = epi_keys(recipe), - columns = NULL, - skip = FALSE, - id = rand_id("epi_ahead") -) - -step_epi_lag( - recipe, - ..., - role = "predictor", - trained = FALSE, - lag = 1, - prefix = "lag_", - default = NA, - keys = epi_keys(recipe), - columns = NULL, - skip = FALSE, - id = rand_id("epi_lag") -) -} -\arguments{ -\item{recipe}{A recipe object. The step will be added to the -sequence of operations for this recipe.} - -\item{...}{One or more selector functions to choose variables -for this step. See \code{\link[=selections]{selections()}} for more details.} - -\item{role}{For model terms created by this step, what analysis role should -they be assigned?} - -\item{trained}{A logical to indicate if the quantities for -preprocessing have been estimated.} - -\item{ahead}{A vector of positive integers. Each specified column will be -lead for each value in the vector.} - -\item{prefix}{A prefix for generated column names, default to "ahead_".} - -\item{default}{Determines what fills empty rows -left by leading/lagging (defaults to NA).} - -\item{keys}{A character vector of the keys in an epi_df} - -\item{columns}{A character string of variable names that will -be populated (eventually) by the \code{terms} argument.} - -\item{skip}{A logical. Should the step be skipped when the -recipe is baked by \code{\link[=bake]{bake()}}? While all operations are baked -when \code{\link[=prep]{prep()}} is run, some operations may not be able to be -conducted on new data (e.g. processing the outcome variable(s)). -Care should be taken when using \code{skip = TRUE} as it may affect -the computations for subsequent operations.} - -\item{id}{A character string that is unique to this step to identify it.} - -\item{lag}{A vector of positive integers. Each specified column will be -lagged for each value in the vector.} -} -\value{ -An updated version of \code{recipe} with the new step added to the -sequence of any existing operations. - -An updated version of \code{recipe} with the new step added to the -sequence of any existing operations. -} -\description{ -\code{step_epi_ahead} creates a \emph{specification} of a recipe step that -will add new columns of leading data. Leading data will -by default include NA values where the lag was induced. -These can be removed with \code{\link[=step_naomit]{step_naomit()}}, or you may -specify an alternative filler value with the \code{default} -argument. - -\code{step_epi_lag} creates a \emph{specification} of a recipe step that -will add new columns of lagged data. Lagged data will -by default include NA values where the lag was induced. -These can be removed with \code{\link[=step_naomit]{step_naomit()}}, or you may -specify an alternative filler value with the \code{default} -argument. -} -\details{ -The step assumes that the data are already \emph{in the proper sequential -order} for leading. - -The step assumes that the data are already \emph{in the proper sequential -order} for lagging. -} -\examples{ -tib <- tibble::tibble( - x = 1:5, y = 1:5, - time_value = seq(as.Date("2020-01-01"), by = 1, length.out = 5), - geo_value = "ca" - ) \%>\% epiprocess::as_epi_df() - -library(recipes) -epi_recipe(y ~ x, data = tib) \%>\% - step_epi_lag(x, lag = 2:3) \%>\% - step_epi_ahead(y, ahead = 1) \%>\% - prep(tib) \%>\% - bake(tib) -} -\concept{row operation steps} diff --git a/man/step_epi_shift.Rd b/man/step_epi_shift.Rd new file mode 100644 index 000000000..2711a7209 --- /dev/null +++ b/man/step_epi_shift.Rd @@ -0,0 +1,129 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step_epi_shift.R +\name{step_epi_lag} +\alias{step_epi_lag} +\alias{step_epi_ahead} +\alias{step_epi_shift} +\title{Create a shifted predictor} +\usage{ +step_epi_lag( + recipe, + ..., + role = "predictor", + trained = FALSE, + lag = 1, + prefix = "lag_", + default = NA, + keys = epi_keys(recipe), + columns = NULL, + skip = FALSE, + id = rand_id("epi_lag") +) + +step_epi_ahead( + recipe, + ..., + role = "outcome", + trained = FALSE, + ahead = 1, + prefix = "ahead_", + default = NA, + keys = epi_keys(recipe), + columns = NULL, + skip = FALSE, + id = rand_id("epi_ahead") +) + +step_epi_shift( + recipe, + ..., + role, + trained = FALSE, + shift = 0, + prefix = "shift_", + default = NA, + keys = epi_keys(recipe), + columns = NULL, + skip = FALSE, + id = rand_id("epi_shift"), + intended_direction = NULL +) +} +\arguments{ +\item{recipe}{A recipe object. The step will be added to the +sequence of operations for this recipe.} + +\item{...}{One or more selector functions to choose variables +for this step. See \code{\link[=selections]{selections()}} for more details.} + +\item{role}{For model terms created by this step, what analysis role should +they be assigned?} + +\item{trained}{A logical to indicate if the quantities for +preprocessing have been estimated.} + +\item{lag, ahead, shift}{A vector of integers. Each specified column will +be the lag or lead for each value in the vector. Lag integers must be +nonnegative, while ahead integers must be positive.} + +\item{prefix}{A prefix to indicate what type of variable this is} + +\item{default}{Determines what fills empty rows +left by leading/lagging (defaults to NA).} + +\item{keys}{A character vector of the keys in an epi_df} + +\item{columns}{A character string of variable names that will +be populated (eventually) by the \code{terms} argument.} + +\item{skip}{A logical. Should the step be skipped when the +recipe is baked by \code{\link[=bake]{bake()}}? While all operations are baked +when \code{\link[=prep]{prep()}} is run, some operations may not be able to be +conducted on new data (e.g. processing the outcome variable(s)). +Care should be taken when using \code{skip = TRUE} as it may affect +the computations for subsequent operations.} + +\item{id}{A unique identifier for the step} + +\item{intended_direction}{used by \code{step_epi_shift} to determine whether +leading or lagging was intended.} +} +\value{ +An updated version of \code{recipe} with the new step added to the +sequence of any existing operations. +} +\description{ +\code{step_epi_lag} and \code{step_epi_ahead} create a \emph{specification} of a recipe step +that will add new columns of shifted data. The former will created a lag +column, while the latter will create a lead column. Shifted data will +by default include NA values where the shift was induced. +These can be properly removed with \code{\link[=step_epi_naomit]{step_epi_naomit()}}, or you may +specify an alternative filler value with the \code{default} +argument. +} +\details{ +\code{step_epi_shift} is more general, accomodating both leads and lags +simultaneously. + +The step assumes that the data are already \emph{in the proper sequential +order} for shifting. + +The \code{prefix} and \code{id} arguments are unchangeable to ensure that the code runs +properly and to avoid inconsistency with naming. For \code{step_epi_ahead}, they +are always set to \code{"ahead_"} and \code{"epi_ahead"} respectively, while for +\code{step_epi_lag}, they are set to \code{"lag_"} and \verb{"epi_lag}, respectively. +} +\examples{ +r <- epi_recipe(case_death_rate_subset) \%>\% + step_epi_ahead(death_rate, ahead = 7) \%>\% + step_epi_lag(death_rate, lag = c(0,7,14)) +r + +r <- epi_recipe(case_death_rate_subset) \%>\% + step_epi_ahead(death_rate, ahead = 7) \%>\% + step_epi_lag(death_rate, case_rate, lag = c(0,7,14)) \%>\% + # pretty odd, but possible + step_epi_shift(case_rate, shift = c(-5, 5), role = "predictor") +r +} +\concept{row operation steps} diff --git a/tests/testthat/test-epi_shift_internal.R b/tests/testthat/test-epi_shift_internal.R new file mode 100644 index 000000000..3f8559d8c --- /dev/null +++ b/tests/testthat/test-epi_shift_internal.R @@ -0,0 +1,65 @@ +library(dplyr) +library(epiprocess) +library(parsnip) +library(workflows) + +# Random generated dataset +x <- tibble(geo_value = rep("place",200), + time_value = as.Date("2021-01-01") + 0:199, + case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5*1:200) + 1, + death_rate = atan(0.1 * 1:200) + cos(5*1:200) + 1) %>% + as_epi_df() + +slm_fit <- function(recipe, data = x) { + workflow() %>% + add_recipe(recipe) %>% + add_model(linear_reg()) %>% + fit(data = data) +} + +test_that("Values for ahead and lag must be integer values", { + expect_error( + r1 <- epi_recipe(x) %>% + step_epi_ahead(death_rate, ahead = 3.6) %>% + step_epi_lag(death_rate, lag = 1.9) + ) +}) + +test_that("A negative lag value should should throw an error", { + expect_error( + r2 <- epi_recipe(x) %>% + step_epi_ahead(death_rate, ahead = 7) %>% + step_epi_lag(death_rate, lag = -7) + ) +}) + +test_that("A nonpositive ahead value should throw an error", { + expect_error( + r3 <- epi_recipe(x) %>% + step_epi_ahead(death_rate, ahead = -7) %>% + step_epi_lag(death_rate, lag = 7) + ) +}) + +test_that("Values for ahead and lag cannot be duplicates", { + r4 <- epi_recipe(x) %>% + step_epi_ahead(death_rate, ahead = 7) %>% + step_epi_lag(death_rate, lag = 7) %>% + step_epi_lag(death_rate, lag = 7) + expect_error( + slm_fit(r4) + ) +}) + +test_that("Check that epi_lag shifts applies the shift", { + r5 <- epi_recipe(x) %>% + step_epi_ahead(death_rate, ahead = 7) %>% + step_epi_lag(death_rate, lag = c(0,7,14)) + + # Two steps passed here + expect_equal(length(r5$steps),2) + fit5 <- slm_fit(r5) + + # Should have four predictors, including the intercept + expect_equal(length(fit5$fit$fit$fit$coefficients),4) +})