From d6b263958cbb0a7a817c3f0a557393b0646aef8f Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 16 Mar 2023 20:08:58 -0700 Subject: [PATCH 01/32] Clarify archive grouping-related documentation --- R/methods-epi_archive.R | 19 ++++++++++--------- man/group_by.epi_archive.Rd | 18 ++++++++++-------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 348c5c01..0b870e56 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -512,13 +512,13 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' #' @param .data An `epi_archive` or `grouped_epi_archive` #' @param ... Similar to [`dplyr::group_by`] (see "Details:" for edge cases); -#' * In `group_by`: unquoted variable name(s) or other ["data -#' masking"][dplyr::dplyr_data_masking] expression(s). It's possible to use -#' [`dplyr::mutate`]-like syntax here to calculate new columns on which to +#' * For `group_by`: unquoted variable name(s) or other +#' ["data masking"][dplyr::dplyr_data_masking] expression(s). It's possible to +#' use [`dplyr::mutate`]-like syntax here to calculate new columns on which to #' perform grouping, but note that, if you are regrouping an already-grouped #' `.data` object, the calculations will be carried out ignoring such grouping #' (same as [in dplyr][dplyr::group_by]). -#' * In `ungroup`: either +#' * For `ungroup`: either #' * empty, in order to remove the grouping and output an `epi_archive`; or #' * variable name(s) or other ["tidy-select"][dplyr::dplyr_tidy_select] #' expression(s), in order to remove the matching variables from the list of @@ -527,12 +527,13 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' the variable selection from `...` only; if `TRUE`, the output will be #' grouped by the current grouping variables plus the variable selection from #' `...`. -#' @param .drop As in [`dplyr::group_by`]; determines treatment of factor +#' @param .drop As described in [`dplyr::group_by`]; determines treatment of factor #' columns. -#' @param x a `grouped_epi_archive`, or, in `is_grouped_epi_archive`, any object -#' @param .tbl An `epi_archive` or `grouped_epi_archive` (`epi_archive` -#' dispatches to the S3 default method, and `grouped_epi_archive` dispatches -#' its own S3 method) +#' @param x For `groups` or `ungroup`: a `grouped_epi_archive`; for +#' `is_grouped_epi_archive`: any object +#' @param .tbl (For `group_by_drop_default`:) an `epi_archive` or +#' `grouped_epi_archive` (`epi_archive` dispatches to the S3 default method; +#' `grouped_epi_archive` dispatches its own S3 method) #' #' @details #' diff --git a/man/group_by.epi_archive.Rd b/man/group_by.epi_archive.Rd index dac7ba0d..abdb6a6b 100644 --- a/man/group_by.epi_archive.Rd +++ b/man/group_by.epi_archive.Rd @@ -27,12 +27,13 @@ is_grouped_epi_archive(x) \item{...}{Similar to \code{\link[dplyr:group_by]{dplyr::group_by}} (see "Details:" for edge cases); \itemize{ -\item In \code{group_by}: unquoted variable name(s) or other \link[dplyr:dplyr_data_masking]{"data masking"} expression(s). It's possible to use -\code{\link[dplyr:mutate]{dplyr::mutate}}-like syntax here to calculate new columns on which to +\item For \code{group_by}: unquoted variable name(s) or other +\link[dplyr:dplyr_data_masking]{"data masking"} expression(s). It's possible to +use \code{\link[dplyr:mutate]{dplyr::mutate}}-like syntax here to calculate new columns on which to perform grouping, but note that, if you are regrouping an already-grouped \code{.data} object, the calculations will be carried out ignoring such grouping (same as \link[dplyr:group_by]{in dplyr}). -\item In \code{ungroup}: either +\item For \code{ungroup}: either \itemize{ \item empty, in order to remove the grouping and output an \code{epi_archive}; or \item variable name(s) or other \link[dplyr:dplyr_tidy_select]{"tidy-select"} @@ -46,14 +47,15 @@ the variable selection from \code{...} only; if \code{TRUE}, the output will be grouped by the current grouping variables plus the variable selection from \code{...}.} -\item{.drop}{As in \code{\link[dplyr:group_by]{dplyr::group_by}}; determines treatment of factor +\item{.drop}{As described in \code{\link[dplyr:group_by]{dplyr::group_by}}; determines treatment of factor columns.} -\item{x}{a \code{grouped_epi_archive}, or, in \code{is_grouped_epi_archive}, any object} +\item{x}{For \code{groups} or \code{ungroup}: a \code{grouped_epi_archive}; for +\code{is_grouped_epi_archive}: any object} -\item{.tbl}{An \code{epi_archive} or \code{grouped_epi_archive} (\code{epi_archive} -dispatches to the S3 default method, and \code{grouped_epi_archive} dispatches -its own S3 method)} +\item{.tbl}{(For \code{group_by_drop_default}:) an \code{epi_archive} or +\code{grouped_epi_archive} (\code{epi_archive} dispatches to the S3 default method; +\code{grouped_epi_archive} dispatches its own S3 method)} } \description{ \code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive} From 9a092afb1edd9d96f49ca7b22bbe6389d8047624 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 16 Mar 2023 16:38:01 -0700 Subject: [PATCH 02/32] Update epix_slide code to have `reframe`-like row behavior `epix_slide` already has summarize/reframe-like column behavior: it outputs the grouping columns + columns from the user computations. Make it also have `reframe`-like row behavior: * Accept any number of elements/rows out of user computations (instead of requiring either 1 or a particular number derived from an inferred/assumed "computation effective key"). * Do not recycle length-1 user computation outputs to the "expected" length/nrow. --- R/grouped_epi_archive.R | 87 ++--------------------------------------- 1 file changed, 3 insertions(+), 84 deletions(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 1c6bd311..df93ba72 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -254,23 +254,10 @@ grouped_epi_archive = Abort("`all_versions` must be TRUE or FALSE.") } - # Each computation is expected to output a data frame with either - # one element/row total or one element/row per encountered - # nongrouping, nontime, nonversion key value. These nongrouping, - # nontime, nonversion key columns can be seen as the "effective" key - # of the computation; the computation might return an object that - # reports a different key or no key, but the "effective" key should - # still be a valid unique key for the data, and is something that we - # could use even with `.keep = FALSE`. - comp_effective_key_vars = - setdiff(key(private$ungrouped$DT), - c(private$vars, "time_value", "version")) - # Computation for one group, one time value comp_one_grp = function(.data_group, .group_key, f, ..., ref_time_value, - comp_effective_key_vars, new_col) { # Carry out the specified computation comp_value = f(.data_group, .group_key, ...) @@ -282,77 +269,12 @@ grouped_epi_archive = .data_group = .data_group$DT } - # Calculate the number of output elements/rows we expect the - # computation to output: one per distinct "effective computation - # key variable" value encountered in the input. - # - # Note: this mirrors how `epi_slide` does things if we're using - # unique keys, but can diverge if using nonunique keys. The - # `epi_slide` approach of counting occurrences of the - # `ref_time_value` in the `time_value` column, which helps lines - # up the computation results with corresponding rows of the - # input data, wouldn't quite apply here: we'd want to line up - # with rows (from the same group) with `version` matching the - # `ref_time_value`, but would still need to summarize these rows - # somehow and drop the `time_value` input column, but this - # summarization requires something like a to-be-unique output - # key to determine a sensible number of rows to output (and the - # contents of those rows). - count = - if (length(comp_effective_key_vars) != 0L) { - comp_effective_key_vals_in_comp_input = - if (data.table::is.data.table(.data_group)) { - .data_group[, comp_effective_key_vars, with=FALSE] - } else { - .data_group[, comp_effective_key_vars] - } - sum(!duplicated(comp_effective_key_vals_in_comp_input)) - } else { - # Same idea as above, but accounting for `duplicated` working - # differently (outputting `logical(0)`) on 0-column inputs - # rather than matching the number of rows. (Instead, we use - # the same count we would get if we were counting distinct - # values of a column defined as `rep(val, target_n_rows)`.) - if (nrow(.data_group) == 0L) { - 0L - } else { - 1L - } - } - - # If we get back an atomic vector - if (is.atomic(comp_value)) { - if (length(comp_value) == 1) { - comp_value = rep(comp_value, count) - } - # If not a singleton, should be the right length, else abort - else if (length(comp_value) != count) { - Abort('If the slide computation returns an atomic vector, then it must have either (a) a single element, or (b) one element per distinct combination of key variables, excluding the `time_value`, `version`, and grouping variables, that is present in the first argument to the computation.') - } - } - - # If we get back a data frame - else if (is.data.frame(comp_value)) { - if (nrow(comp_value) == 1) { - comp_value = rep(list(comp_value), count) - } - # If not a single row, should be the right length, else abort - else if (nrow(comp_value) != count) { - Abort("If the slide computation returns a data frame, then it must have a single row, or else one row per appearance of the reference time value in the local window.") - } - # Make into a list - else { - comp_value = split(comp_value, seq_len(nrow(comp_value))) - } - } - - # If neither an atomic vector data frame, then abort - else { + if (! (is.atomic(comp_value) || is.data.frame(comp_value))) { Abort("The slide computation must return an atomic vector or a data frame.") } - + # Label every result row with the `ref_time_value`: - return(tibble::tibble(time_value = rep(.env$ref_time_value, count), + return(tibble::tibble(time_value = .env$ref_time_value, !!new_col := .env$comp_value)) } @@ -391,7 +313,6 @@ grouped_epi_archive = group_modify_fn = function(.data_group, .group_key, f, ..., ref_time_value, - comp_effective_key_vars, new_col) { # .data_group is coming from as_of_df as a tibble, but we # want to feed `comp_one_grp` an `epi_archive` backed by a @@ -402,7 +323,6 @@ grouped_epi_archive = .data_group_archive$DT = .data_group comp_one_grp(.data_group_archive, .group_key, f = f, ..., ref_time_value = ref_time_value, - comp_effective_key_vars = comp_effective_key_vars, new_col = new_col ) } @@ -414,7 +334,6 @@ grouped_epi_archive = dplyr::group_modify(group_modify_fn, f = f, ..., ref_time_value = ref_time_value, - comp_effective_key_vars = comp_effective_key_vars, new_col = new_col, .keep = TRUE) ) From d2edb2e30061489e69281f2d2fc7aca1a87ed198 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 20 Mar 2023 03:40:06 -0700 Subject: [PATCH 03/32] Add missing `grouped_epi_archive` test file --- tests/testthat/test-grouped_epi_archive.R | 49 +++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 tests/testthat/test-grouped_epi_archive.R diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R new file mode 100644 index 00000000..341259ec --- /dev/null +++ b/tests/testthat/test-grouped_epi_archive.R @@ -0,0 +1,49 @@ + +test_that("Grouping, regrouping, and ungrouping archives works as intended", { + # From an example: + library(dplyr) + toy_archive = + tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) + ) %>% + mutate(age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version)) %>% + as_epi_archive(other_keys = "age_group") + + # Ensure that we're using testthat edition 3's idea of "identical", which is + # not as strict as `identical`: + testthat::local_edition(3) + + # Test equivalency claims in example: + by_both_keys = toy_archive %>% group_by(geo_value, age_group) + expect_identical( + by_both_keys, + toy_archive %>% group_by(geo_value) %>% group_by(age_group, .add=TRUE) + ) + grouping_cols = c("geo_value", "age_group") + expect_identical( + by_both_keys, + toy_archive %>% group_by(across(all_of(grouping_cols))) + ) + + expect_identical( + toy_archive %>% group_by(geo_value), + toy_archive %>% group_by(geo_value, age_group) %>% ungroup(age_group) + ) + + expect_identical( + toy_archive %>% + group_by(geo_value, age_group, .drop=FALSE) %>% + epix_slide(f = ~ sum(.x$value), before = 20) %>% + ungroup(), + toy_archive %>% + group_by(geo_value, age_group, .drop=TRUE) %>% + epix_slide(f = ~ sum(.x$value), before = 20) %>% + ungroup() + ) +}) From bf99f30d7b176300637f96e50a293ce7de8ece27 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 20 Mar 2023 03:40:31 -0700 Subject: [PATCH 04/32] Update `.drop`+`epix_slide` given `reframe`-like `epix_slide` Previously, broadcasting in `epix_slide` would remove any rows filled in by `.drop=FALSE`, making the `.drop` setting more or less useless. Now that `epix_slide` does not broadcast, update docs&tests accordingly. Additionally, add some missing validation for the `.drop` parameter. Other changes: * Tweak code using `setDT` to do it in a pipe to read better (avoid a line setting an `out_DT` variable to a non-DT value, even temporarily). * Format some roxygen comments. --- R/methods-epi_archive.R | 56 +++++++++++++---------- man/group_by.epi_archive.Rd | 21 +++------ tests/testthat/test-grouped_epi_archive.R | 53 ++++++++++++++++----- 3 files changed, 79 insertions(+), 51 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 0b870e56..c6c3eb52 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -478,16 +478,17 @@ epix_detailed_restricted_mutate = function(.data, ...) { } else { # Have `dplyr` do the `dplyr_col_modify`, keeping the column-level-aliasing # and must-copy-on-write-if-refcount-more-than-1 model, obtaining a tibble, - # then `setDT`-ing it in place to be a `data.table`. The key should still be - # valid (assuming that the user did not explicitly alter `key(.data$DT)` or - # the columns by reference somehow within `...` tidyeval-style computations, - # or trigger refcount-1 alterations due to still having >1 refcounts on the - # columns), so in between, set the "sorted" attribute accordingly to prevent - # attempted sorting (including potential extra copies) or sortedness - # checking, then `setDT`. - out_DT = dplyr::dplyr_col_modify(in_tbl, col_modify_cols) # tibble - data.table::setattr(out_DT, "sorted", data.table::key(.data$DT)) - data.table::setDT(out_DT, key=key(.data$DT)) + # then convert it into a `data.table`. The key should still be valid + # (assuming that the user did not explicitly alter `key(.data$DT)` or the + # columns by reference somehow within `...` tidyeval-style computations, or + # trigger refcount-1 alterations due to still having >1 refcounts on the + # columns), set the "sorted" attribute accordingly to prevent attempted + # sorting (including potential extra copies) or sortedness checking, then + # `setDT` (rather than `as.data.table`, in order to prevent column copying + # to establish ownership according to `data.table`'s memory model). + out_DT = dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>% + data.table::setattr("sorted", data.table::key(.data$DT)) %>% + data.table::setDT(key=key(.data$DT)) out_archive = .data$clone() out_archive$DT <- out_DT request_names = names(col_modify_cols) @@ -527,8 +528,8 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' the variable selection from `...` only; if `TRUE`, the output will be #' grouped by the current grouping variables plus the variable selection from #' `...`. -#' @param .drop As described in [`dplyr::group_by`]; determines treatment of factor -#' columns. +#' @param .drop As described in [`dplyr::group_by`]; determines treatment of +#' factor columns. #' @param x For `groups` or `ungroup`: a `grouped_epi_archive`; for #' `is_grouped_epi_archive`: any object #' @param .tbl (For `group_by_drop_default`:) an `epi_archive` or @@ -615,19 +616,10 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): #' toy_archive %>% group_by(geo_value) %>% groups() #' -#' # `.drop = FALSE` is supported in a sense; `f` is called on 0-row inputs for -#' # the missing groups identified by `dplyr`, but the row-recycling rules will -#' # exclude the corresponding outputs of `f` from the output of the slide: -#' all.equal( -#' toy_archive %>% -#' group_by(geo_value, age_group, .drop=FALSE) %>% -#' epix_slide(f = ~ sum(.x$value), before = 20) %>% -#' ungroup(), -#' toy_archive %>% -#' group_by(geo_value, age_group, .drop=TRUE) %>% -#' epix_slide(f = ~ sum(.x$value), before = 20) %>% -#' ungroup() -#' ) +#' toy_archive %>% +#' group_by(geo_value, age_group, .drop=FALSE) %>% +#' epix_slide(f = ~ sum(.x$value), before = 20) %>% +#' ungroup() #' #' @importFrom dplyr group_by #' @export @@ -636,6 +628,20 @@ epix_detailed_restricted_mutate = function(.data, ...) { group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_drop_default(.data)) { # `add` makes no difference; this is an ungrouped `epi_archive`. detailed_mutate = epix_detailed_restricted_mutate(.data, ...) + if (!rlang::is_bool(.drop)) { + Abort("`.drop` must be TRUE or FALSE") + } else if (!.drop) { + grouping_cols = as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]] + grouping_col_is_factor = purrr::map_lgl(grouping_cols, is.factor) + # ^ Use `as.list` to try to avoid any possibility of a deep copy. + if (!any(grouping_col_is_factor)) { + Warn("`.drop=FALSE` but there are no factor grouping columns; did you mean to convert one of the columns to a factor beforehand?", + class = "epiprocess__group_by_epi_archive_drop_FALSE_no_factors") + } else if (any(diff(grouping_col_is_factor) == -1L)) { + Warn("`.drop=FALSE` but there are one or more non-factor grouping columns listed after a factor grouping column; this may produce groups with `NA`s for these columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; depending on how you want completion to work, you might instead want to convert all grouping columns to factors beforehand, specify the non-factor grouping columns first, or use `.drop=TRUE` and add a call to `tidyr::complete`.", + class = "epiprocess__group_by_epi_archive_drop_FALSE_nonfactor_after_factor") + } + } grouped_epi_archive$new(detailed_mutate[["archive"]], detailed_mutate[["request_names"]], drop = .drop) diff --git a/man/group_by.epi_archive.Rd b/man/group_by.epi_archive.Rd index abdb6a6b..aee0a07b 100644 --- a/man/group_by.epi_archive.Rd +++ b/man/group_by.epi_archive.Rd @@ -47,8 +47,8 @@ the variable selection from \code{...} only; if \code{TRUE}, the output will be grouped by the current grouping variables plus the variable selection from \code{...}.} -\item{.drop}{As described in \code{\link[dplyr:group_by]{dplyr::group_by}}; determines treatment of factor -columns.} +\item{.drop}{As described in \code{\link[dplyr:group_by]{dplyr::group_by}}; determines treatment of +factor columns.} \item{x}{For \code{groups} or \code{ungroup}: a \code{grouped_epi_archive}; for \code{is_grouped_epi_archive}: any object} @@ -139,18 +139,9 @@ toy_archive \%>\% group_by(geo_value, age_group) \%>\% ungroup(age_group) # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): toy_archive \%>\% group_by(geo_value) \%>\% groups() -# `.drop = FALSE` is supported in a sense; `f` is called on 0-row inputs for -# the missing groups identified by `dplyr`, but the row-recycling rules will -# exclude the corresponding outputs of `f` from the output of the slide: -all.equal( - toy_archive \%>\% - group_by(geo_value, age_group, .drop=FALSE) \%>\% - epix_slide(f = ~ sum(.x$value), before = 20) \%>\% - ungroup(), - toy_archive \%>\% - group_by(geo_value, age_group, .drop=TRUE) \%>\% - epix_slide(f = ~ sum(.x$value), before = 20) \%>\% - ungroup() -) +toy_archive \%>\% + group_by(geo_value, age_group, .drop=FALSE) \%>\% + epix_slide(f = ~ sum(.x$value), before = 20) \%>\% + ungroup() } diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R index 341259ec..a348b815 100644 --- a/tests/testthat/test-grouped_epi_archive.R +++ b/tests/testthat/test-grouped_epi_archive.R @@ -1,4 +1,3 @@ - test_that("Grouping, regrouping, and ungrouping archives works as intended", { # From an example: library(dplyr) @@ -36,14 +35,46 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { toy_archive %>% group_by(geo_value, age_group) %>% ungroup(age_group) ) - expect_identical( - toy_archive %>% - group_by(geo_value, age_group, .drop=FALSE) %>% - epix_slide(f = ~ sum(.x$value), before = 20) %>% - ungroup(), - toy_archive %>% - group_by(geo_value, age_group, .drop=TRUE) %>% - epix_slide(f = ~ sum(.x$value), before = 20) %>% - ungroup() - ) + # Test `.drop` behavior: + expect_error(toy_archive %>% group_by(.drop = "bogus"), + regexp = "\\.drop.*TRUE or FALSE") + expect_warning(toy_archive %>% group_by(.drop=FALSE), + class="epiprocess__group_by_epi_archive_drop_FALSE_no_factors") + expect_warning(toy_archive %>% group_by(geo_value, .drop=FALSE), + class="epiprocess__group_by_epi_archive_drop_FALSE_no_factors") + expect_warning(grouped_factor_then_nonfactor <- + toy_archive %>% group_by(age_group, geo_value, .drop=FALSE), + class="epiprocess__group_by_epi_archive_drop_FALSE_nonfactor_after_factor") + expect_identical(grouped_factor_then_nonfactor %>% + epix_slide(before = 10, s = sum(value)) %>% + ungroup(), + tibble::tribble( + ~age_group, ~geo_value, ~time_value, ~s, + "pediatric", NA_character_, "2000-01-02", 0, + "adult", "us", "2000-01-02", 121, + "pediatric", "us", "2000-01-03", 5, + "adult", "us", "2000-01-03", 255) %>% + mutate(age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value)) %>% + as_epi_df(geo_type = "nation", # bug; want "custom" from NA; issue #242 + as_of = as.Date("2000-01-02"), # bug; issue #213 + additional_metadata = list(other_keys = "age_group")) %>% + # put back in expected order; see issue #166: + select(age_group, geo_value, time_value, s)) + expect_identical(toy_archive %>% + group_by(geo_value, age_group, .drop=FALSE) %>% + epix_slide(before = 10, s = sum(value)) %>% + ungroup(), + tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~s, + "us", "pediatric", "2000-01-02", 0, + "us", "adult", "2000-01-02", 121, + "us", "pediatric", "2000-01-03", 5, + "us", "adult", "2000-01-03", 255) %>% + mutate(age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value)) %>% + as_epi_df(as_of = as.Date("2000-01-02"), # bug; issue 213 + additional_metadata = list(other_keys = "age_group")) %>% + # put back in expected order; see issue #166: + select(geo_value, age_group, time_value, s)) }) From 74631afecfd2c8d7a8cab9dcb8b4c8169416a7bc Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 20 Mar 2023 11:31:15 -0700 Subject: [PATCH 05/32] If `epix_slide` outputs an `epi_df`, use `versions_end` for `as_of` See #213. --- R/grouped_epi_archive.R | 12 ++++++++++++ tests/testthat/test-epix_slide.R | 17 ++++++++++++++++- tests/testthat/test-grouped_epi_archive.R | 4 ++-- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index df93ba72..0efca351 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -427,6 +427,18 @@ grouped_epi_archive = y = unique(private$ungrouped$DT[, ..cols]) x = dplyr::left_join(y, x, by = cols) } + + if (is_epi_df(x)) { + # The analogue of `epi_df`'s `as_of` metadata for an archive is + # `$versions_end`, at least in the current absence of + # separate fields/columns denoting the "archive version" with a + # different resolution, or from the perspective of a different + # stage of a data pipeline. The `as_of` that is automatically + # derived won't always match; override: + + attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end + } + return(x) } ) diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index 9ef2f9af..c352f90a 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -27,7 +27,7 @@ test_that("epix_slide works as intended",{ 2^6+2^3, 2^10+2^9, 2^15+2^14)) %>% - as_epi_df(as_of = 4) %>% # Also a bug (issue #213) + as_epi_df(as_of = 7) %>% group_by(geo_value) expect_identical(xx1,xx2) # * @@ -348,3 +348,18 @@ test_that("epix_slide with all_versions option works as intended",{ expect_identical(xx1,xx3) # This and * Imply xx2 and xx3 are identical }) + +test_that("`epix_slide` uses `versions_end` as a resulting `epi_df`'s `as_of`", { + ea_updated_stale = ea$clone() + ea_updated_stale$versions_end <- ea_updated_stale$versions_end + 3 # (dbl) + # + expect_identical( + ea_updated_stale %>% + group_by(geo_value) %>% + epix_slide(~ slice_head(.x, n = 1L), before = 10L) %>% + ungroup() %>% + attr("metadata") %>% + .$as_of, + 10 + ) +}) diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R index a348b815..64c482bf 100644 --- a/tests/testthat/test-grouped_epi_archive.R +++ b/tests/testthat/test-grouped_epi_archive.R @@ -57,7 +57,7 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { mutate(age_group = ordered(age_group, c("pediatric", "adult")), time_value = as.Date(time_value)) %>% as_epi_df(geo_type = "nation", # bug; want "custom" from NA; issue #242 - as_of = as.Date("2000-01-02"), # bug; issue #213 + as_of = as.Date("2000-01-03"), additional_metadata = list(other_keys = "age_group")) %>% # put back in expected order; see issue #166: select(age_group, geo_value, time_value, s)) @@ -73,7 +73,7 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { "us", "adult", "2000-01-03", 255) %>% mutate(age_group = ordered(age_group, c("pediatric", "adult")), time_value = as.Date(time_value)) %>% - as_epi_df(as_of = as.Date("2000-01-02"), # bug; issue 213 + as_epi_df(as_of = as.Date("2000-01-03"), additional_metadata = list(other_keys = "age_group")) %>% # put back in expected order; see issue #166: select(geo_value, age_group, time_value, s)) From 8039ed35e675610b627d82394a4e63ff59e21363 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 20 Mar 2023 18:06:57 -0700 Subject: [PATCH 06/32] Update checks to also run on PRs (but not pushes) to `dev` --- .github/workflows/R-CMD-check.yaml | 2 +- .github/workflows/pkgdown.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index f4b17a4b..2fca5dbd 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -4,7 +4,7 @@ on: push: branches: [main, master] pull_request: - branches: [main, master] + branches: [main, master, dev] name: R-CMD-check diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 087f0b05..847176d3 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -4,7 +4,7 @@ on: push: branches: [main, master] pull_request: - branches: [main, master] + branches: [main, master, dev] release: types: [published] workflow_dispatch: From 9d10b5fc3799f9dd43e9c41f19eb6d82f1f583da Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 24 Mar 2023 00:37:32 -0700 Subject: [PATCH 07/32] Fix missing quotes in _pkgdown.yml --- _pkgdown.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index d9e2ec79..bba3ea8d 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -56,7 +56,7 @@ reference: desc: Details on `epi_archive`, and basic functionality. - contents: - matches("archive") -- title: `epix_*()` functions +- title: "`epix_*()` functions" desc: Functions that act on an `epi_archive` and/or `grouped_epi_archive` object. - contents: - starts_with("epix") From b8f768a3911f92aa858483d6c73c41b596684c1f Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 27 Mar 2023 19:57:49 -0700 Subject: [PATCH 08/32] Update `epi[x]_slide` comparisons given epix like `reframe` --- R/methods-epi_archive.R | 35 +++++----- man/epix_slide.Rd | 35 +++++----- vignettes/advanced.Rmd | 146 ++++++++++++++++++++++++++++------------ 3 files changed, 140 insertions(+), 76 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index c6c3eb52..b4b4694a 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -743,27 +743,28 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' ends); `epi_slide` windows extend from `before` time steps before a #' `ref_time_value` through `after` time steps after `ref_time_value`. #' 3. The input class and columns are similar but different: `epix_slide` -#' keeps all columns and the `epi_df`-ness of the first input to the -#' computation; `epi_slide` only provides the grouping variables in the second -#' input, and will convert the first input into a regular tibble if the -#' grouping variables include the essential `geo_value` column. +#' (with the default `all_versions=FALSE`) keeps all columns and the +#' `epi_df`-ness of the first argument to each computation; `epi_slide` only +#' provides the grouping variables in the second input, and will convert the +#' first input into a regular tibble if the grouping variables include the +#' essential `geo_value` column. #' 4. The output class and columns are similar but different: `epix_slide()` -#' returns a tibble containing only the grouping variables, `time_value`, and -#' the new column(s) from the slide computation `f`, whereas `epi_slide()` -#' returns an `epi_df` with all original variables plus the new columns from -#' the slide computation. -#' 5. Unless grouping by `geo_value` and all `other_keys`, there will be -#' row-recyling behavior meant to resemble `epi_slide`'s results, based on the -#' distinct combinations of `geo_value`, `time_value`, and all `other_keys` -#' present in the version data with `time_value` matching one of the -#' `ref_time_values`. However, due to reporting latency or reporting dropping -#' in and out, this may not exactly match the behavior of "corresponding" -#' `epi_df`s. +#' returns an `epi_df` or tibble containing only the grouping variables, +#' `time_value`, and the new column(s) from the slide computations, whereas +#' `epi_slide()` returns an `epi_df` with all original variables plus the new +#' columns from the slide computations. +#' 5. There are no size stability checks or element/row recycling to maintain +#' size stability in `epix_slide`, unlike in `epi_slide`. (`epix_slide` is +#' roughly analogous to [`dplyr::reframe`] in `dplyr` 1.1.0 +#' or[`dplyr::summarize`] in `dplyr` 1.0.0, while `epi_slide` is roughly +#' analogous to `dplyr::mutate` followed by `dplyr::arrange`) This is detailed +#' in the "advanced" vignette. #' 6. Similar to the row recyling, while `all_rows=TRUE` is designed to mimic #' `epi_slide` by completing based on distinct combinations of `geo_value`, #' `time_value`, and all `other_keys` present in the version data with -#' `time_value` matching one of the `ref_time_values`, this can have unexpected -#' behaviors due reporting latency or reporting dropping in and out. +#' `time_value` matching one of the `ref_time_values`, this can have +#' unexpected behaviors due reporting latency or reporting dropping in and +#' out. #' 7. The `ref_time_values` default for `epix_slide` is based on making an #' evenly-spaced sequence out of the `version`s in the `DT` plus the #' `versions_end`, rather than the `time_value`s. diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index 54d16e86..da8b9213 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -126,27 +126,28 @@ time interval (e.g., day) are only published after that time interval ends); \code{epi_slide} windows extend from \code{before} time steps before a \code{ref_time_value} through \code{after} time steps after \code{ref_time_value}. \item The input class and columns are similar but different: \code{epix_slide} -keeps all columns and the \code{epi_df}-ness of the first input to the -computation; \code{epi_slide} only provides the grouping variables in the second -input, and will convert the first input into a regular tibble if the -grouping variables include the essential \code{geo_value} column. +(with the default \code{all_versions=FALSE}) keeps all columns and the +\code{epi_df}-ness of the first argument to each computation; \code{epi_slide} only +provides the grouping variables in the second input, and will convert the +first input into a regular tibble if the grouping variables include the +essential \code{geo_value} column. \item The output class and columns are similar but different: \code{epix_slide()} -returns a tibble containing only the grouping variables, \code{time_value}, and -the new column(s) from the slide computation \code{f}, whereas \code{epi_slide()} -returns an \code{epi_df} with all original variables plus the new columns from -the slide computation. -\item Unless grouping by \code{geo_value} and all \code{other_keys}, there will be -row-recyling behavior meant to resemble \code{epi_slide}'s results, based on the -distinct combinations of \code{geo_value}, \code{time_value}, and all \code{other_keys} -present in the version data with \code{time_value} matching one of the -\code{ref_time_values}. However, due to reporting latency or reporting dropping -in and out, this may not exactly match the behavior of "corresponding" -\code{epi_df}s. +returns an \code{epi_df} or tibble containing only the grouping variables, +\code{time_value}, and the new column(s) from the slide computations, whereas +\code{epi_slide()} returns an \code{epi_df} with all original variables plus the new +columns from the slide computations. +\item There are no size stability checks or element/row recycling to maintain +size stability in \code{epix_slide}, unlike in \code{epi_slide}. (\code{epix_slide} is +roughly analogous to \code{\link[dplyr:reframe]{dplyr::reframe}} in \code{dplyr} 1.1.0 +or\code{\link[dplyr:summarise]{dplyr::summarize}} in \code{dplyr} 1.0.0, while \code{epi_slide} is roughly +analogous to \code{dplyr::mutate} followed by \code{dplyr::arrange}) This is detailed +in the "advanced" vignette. \item Similar to the row recyling, while \code{all_rows=TRUE} is designed to mimic \code{epi_slide} by completing based on distinct combinations of \code{geo_value}, \code{time_value}, and all \code{other_keys} present in the version data with -\code{time_value} matching one of the \code{ref_time_values}, this can have unexpected -behaviors due reporting latency or reporting dropping in and out. +\code{time_value} matching one of the \code{ref_time_values}, this can have +unexpected behaviors due reporting latency or reporting dropping in and +out. \item The \code{ref_time_values} default for \code{epix_slide} is based on making an evenly-spaced sequence out of the \code{version}s in the \code{DT} plus the \code{versions_end}, rather than the \code{time_value}s. diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index ee0751e2..9580bee7 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -7,21 +7,62 @@ vignette: > %\VignetteEncoding{UTF-8} --- -In this vignette, we discuss how to use the sliding functionality in the -`epiprocess` package with computations that have advanced output structures. - -In general, the functions `epi_slide()` and `epix_slide()` do what they can to -ensure the result of a slide operation is *size stable*, meaning, it will return -something whose length is the same as the number of appearances of reference -time values for the slide computation in the given data frame/table (this -defaults to all time values, but can be some given subset when `ref_time_values` -is specified). - -The output of a slide computation should either be an atomic value/vector, or a -data frame. This data frame can have multiple columns, multiple rows, or both. -Below we demonstrate some advanced use cases of sliding with these output -structures. We focus on `epi_slide()` for the most part, though the behavior we -demonstrate also carries over to `epix_slide()`. +In this vignette, we discuss how to use the sliding functionality in the +`epiprocess` package with less common grouping schemes or with computations that +have advanced output structures. +The output of a slide computation should either be an atomic value/vector, or a +data frame. This data frame can have multiple columns, multiple rows, or both. + +During basic usage (e.g., when all optional arguments are set to their defaults): + +* `epi_slide(edf, , .....)`: + + * keeps **all** columns of `edf`, adds computed column(s) + * outputs one row **for every row in `edf`** (recycling outputs from + computations appropriately if there are multiple time series bundled + together inside any group(s)) + * is roughly analogous to (the non-sliding) **`dplyr::mutate` followed by + `dplyr::arrange(time_value, .by_group = TRUE)`** + +* `epix_slide(ea, , .....)`: + + * keeps **grouping and `time_value`** columns of `ea`, adds computed + column(s) + * outputs one row **for element/row output from the computations** + * is roughly analogous to (the non-sliding) **`dplyr::reframe`** (or + `dplyr::summarize`, before the `dplyr` 1.1.0 + [update](https://www.tidyverse.org/blog/2023/02/dplyr-1-1-0-pick-reframe-arrange/#reframe)) + +These differences in basic behavior make some common slide operations require less boilerplate: +* predictors and targets calculated with `epi_slide` are automatically lined up + with each other and with the signals from which they were calculated; and +* computations for an `epix_slide` can output data frames with any number of + rows, containing models, forecasts, evaluations, etc., and will not be + recycled. + +When using more advanced features, more complex rules apply: + +* Generalization: `epi_slide(edf, ....., ref_time_values=my_ref_time_values)` + will output one row for every row in `edf` with `time_value` appearing inside + `ref_time_values`, and is analogous to a `dplyr::mutate`&`dplyr::arrange` + followed by `dplyr::filter` to those `ref_time_values`. We call this property + **size stability**, and describe how it is achieved in the following sections. + The default behavior described above is a special case of this general rule + based on a default value of `ref_time_values`. +* Exception/feature: `epi_slide(edf, ....., ref_time_values=my_ref_time_values, + all_rows=TRUE)` will not just output rows for the given `ref_time_values`, but + instead will output one row per row in `edf`. +* Exception/feature: `epix_slide(ea, ....., as_list_col=TRUE)` will output one + row per computation that outputs a data frame, even when these data frames + have arbitrary numbers of rows. +* Clarification: `ea %>% group_by(....., .drop=FALSE) %>% + epix_slide(, .....)` will call the computation on any missing + groups according to `dplyr`'s `.drop=FALSE` rules, resulting in additional + output rows. + +Below we demonstrate some advanced use cases of sliding with different output +structures. We focus on `epi_slide()` for the most part, though some of the +behavior we demonstrate also carries over to `epix_slide()`. ## Recycling outputs @@ -35,7 +76,7 @@ simple synthetic example. library(epiprocess) library(dplyr) -df <- tibble( +edf <- tibble( geo_value = rep(c("ca", "fl", "pa"), each = 3), time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)), @@ -44,29 +85,31 @@ df <- tibble( as_epi_df() # 2-day trailing average, per geo value -df %>% +edf %>% group_by(geo_value) %>% epi_slide(x_2dav = mean(x), before = 1) %>% ungroup() # 2-day trailing average, marginally -df %>% +edf %>% epi_slide(x_2dav = mean(x), before = 1) ``` ```{r, include = FALSE} # More checks (not included) -df %>% +edf %>% epi_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) -df %>% +edf %>% + # pretend that observations about time_value t are reported in version t (nowcasts) mutate(version = time_value) %>% as_epi_archive() %>% group_by(geo_value) %>% epix_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% ungroup() -df %>% +edf %>% + # pretend that observations about time_value t are reported in version t (nowcasts) mutate(version = time_value) %>% as_epi_archive() %>% group_by(geo_value) %>% @@ -80,7 +123,7 @@ so, uses it to fill the new column. For example, this next computation gives the same result as the last one. ```{r} -df %>% +edf %>% epi_slide(y_2dav = rep(mean(x), 3), before = 1) ``` @@ -89,7 +132,7 @@ is *not* size stable, then `epi_slide()` throws an error. For example, below we are trying to return 2 things for 3 states. ```{r, error = TRUE} -df %>% +edf %>% epi_slide(x_2dav = rep(mean(x), 2), before = 1) ``` @@ -103,15 +146,15 @@ we set `as_list_col = TRUE` in the call to `epi_slide()`, the resulting `epi_df` object returned by `epi_slide()` has a list column containing the slide values. ```{r} -df2 <- df %>% +edf2 <- edf %>% group_by(geo_value) %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), before = 1, as_list_col = TRUE) %>% ungroup() -class(df2$a) -length(df2$a) -df2$a[[2]] +class(edf2$a) +length(edf2$a) +edf2$a[[2]] ``` When we use `as_list_col = FALSE` (the default in `epi_slide()`), the function @@ -122,7 +165,7 @@ list column (here `a`) onto the column names of the output data frame from the slide computation (here `x_2dav` and `x_2dma`) separated by "_". ```{r} -df %>% +edf %>% group_by(geo_value) %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), before = 1, as_list_col = FALSE) %>% @@ -133,7 +176,7 @@ We can use `names_sep = NULL` (which gets passed to `tidyr::unnest()`) to drop the prefix associated with list column name, in naming the unnested columns. ```{r} -df %>% +edf %>% group_by(geo_value) %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), before = 1, as_list_col = FALSE, names_sep = NULL) %>% @@ -144,19 +187,19 @@ Furthermore, `epi_slide()` will recycle the single row data frame as needed in order to make the result size stable, just like the case for atomic values. ```{r} -df %>% +edf %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), before = 1, as_list_col = FALSE, names_sep = NULL) ``` ```{r, include = FALSE} # More checks (not included) -df %>% +edf %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), ref_time_values = as.Date("2020-06-02"), before = 1, as_list_col = FALSE, names_sep = NULL) -df %>% +edf %>% mutate(version = time_value) %>% as_epi_archive() %>% group_by(geo_value) %>% @@ -174,32 +217,51 @@ Meaning, `epi_slide()` will check that the result is size stable, and if so, will fill the new column(s) in the resulting `epi_df` object appropriately. This can be convenient for modeling in the following sense: we can, for example, -fit a sliding forecasting model by pooling data from different locations, and -then return separate forecasts from this common model for each location. We use -our synthetic example to demonstrate this idea abstractly but simply. +fit a sliding, data-versioning-unaware nowcasting or forecasting model by +pooling data from different locations, and then return separate forecasts from +this common model for each location. We use our synthetic example to demonstrate +this idea abstractly but simply by forecasting (actually, nowcasting) `y` from +`x` by fitting a time-windowed linear model that pooling data across all +locations. ```{r} -df$y <- 2 * df$x + 0.05 * rnorm(length(df$x)) +edf$y <- 2 * edf$x + 0.05 * rnorm(length(edf$x)) -df %>% +edf %>% epi_slide(function(d, ...) { obj <- lm(y ~ x, data = d) return( as.data.frame( predict(obj, newdata = d %>% - group_by(geo_value) %>% - filter(time_value == max(time_value)), + group_by(geo_value) %>% + filter(time_value == max(time_value)), interval = "prediction", level = 0.9) )) }, before = 1, new_col_name = "fc", names_sep = NULL) ``` +The above example focused on simplicity to show how to work with multi-row +outputs. Note however, the following issues in this example: + +* The `lm` fitting data includes the testing instances, as no training-test split was performed. +* Adding a simple training-test split would not factor in reporting latency properly. +* Data revisions are not taken into account. + +All three of these factors contribute to unrealistic retrospective forecasts and +overly optimistic retrospective performance evaluations. Instead, one should +favor an `epix_slide` for more realistic "pseudoprospective" forecasts. Using +`epix_slide` also makes it easier to express certain types of forecasts; while +in `epi_slide`, forecasts for additional aheads or quantile levels would need to +be expressed as additional columns, or nested inside list columns, `epix_slide` +does not perform size stability checks or recycling, allowing computations to +output any number of rows. + ## Version-aware forecasting, revisited -Finally, we revisit the COVID-19 forecasting example from the [archive +We revisit the COVID-19 forecasting example from the [archive vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) in order -to demonstrate the last point in a more realistic setting. First, we fetch the -versioned data and build the archive. +to demonstrate the preceding points regarding forecast evaluation in a more +realistic setting. First, we fetch the versioned data and build the archive. ```{r, message = FALSE, warning = FALSE, eval =FALSE} library(epidatr) From 953717fd2ad6317d130a8cabeaf3d6040aebb59a Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 28 Mar 2023 12:05:29 -0700 Subject: [PATCH 09/32] Remove `all_rows` for `epix_slide`; add deprecation tests --- R/archive.R | 4 +-- R/grouped_epi_archive.R | 26 ++++++++-------- R/methods-epi_archive.R | 37 +++++++++++------------ man/epi_archive.Rd | 1 - man/epix_slide.Rd | 36 ++++++++++------------ tests/testthat/test-deprecations.R | 48 ++++++++++++++++++++++++++++++ 6 files changed, 97 insertions(+), 55 deletions(-) create mode 100644 tests/testthat/test-deprecations.R diff --git a/R/archive.R b/R/archive.R index d5be0ed4..086d31b3 100644 --- a/R/archive.R +++ b/R/archive.R @@ -640,7 +640,7 @@ epi_archive = slide = function(f, ..., before, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", - all_rows = FALSE, all_versions = FALSE) { + all_versions = FALSE) { # For an "ungrouped" slide, treat all rows as belonging to one big # group (group by 0 vars), like `dplyr::summarize`, and let the # resulting `grouped_epi_archive` handle the slide: @@ -649,7 +649,7 @@ epi_archive = before = before, ref_time_values = ref_time_values, time_step = time_step, new_col_name = new_col_name, as_list_col = as_list_col, names_sep = names_sep, - all_rows = all_rows, all_versions = all_versions + all_versions = all_versions ) %>% # We want a slide on ungrouped archives to output something # ungrouped, rather than retaining the trivial (0-variable) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 0efca351..ddd4527f 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -190,7 +190,11 @@ grouped_epi_archive = slide = function(f, ..., before, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", - all_rows = FALSE, all_versions = FALSE) { + all_versions = FALSE) { + # Perform some deprecated argument checks without using ` = + # deprecated()` in the function signature, because they are from + # early development versions and much more likely to be clutter than + # informative in the signature. if ("group_by" %in% nse_dots_names(...)) { Abort(" The `group_by` argument to `slide` has been removed; please use @@ -200,7 +204,15 @@ grouped_epi_archive = this check is a false positive, but you will still need to use a different column name here and rename the resulting column after the slide.) - ") + ", class = "epiprocess__epix_slide_group_by_parameter_deprecated") + } + if ("all_rows" %in% nse_dots_names(...)) { + Abort(" + The `all_rows` argument has been removed from `epix_slide` (but + is still supported in `epi_slide`). Since `epix_slide` now + allows any number of rows out of slide computations, it's + unclear how `all_rows=TRUE` should fill in missing results. + ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated") } if (missing(ref_time_values)) { @@ -247,9 +259,6 @@ grouped_epi_archive = if (! (rlang::is_string(names_sep) || is.null(names_sep)) ) { Abort("`names_sep` must be a (single) string or NULL.") } - if (!rlang::is_bool(all_rows)) { - Abort("`all_rows` must be TRUE or FALSE.") - } if (!rlang::is_bool(all_versions)) { Abort("`all_versions` must be TRUE or FALSE.") } @@ -420,13 +429,6 @@ grouped_epi_archive = if (!as_list_col) { x = tidyr::unnest(x, !!new_col, names_sep = names_sep) } - - # Join to get all rows, if we need to, then return - if (all_rows) { - cols = c(private$vars, "time_value") - y = unique(private$ungrouped$DT[, ..cols]) - x = dplyr::left_join(y, x, by = cols) - } if (is_epi_df(x)) { # The analogue of `epi_df`'s `as_of` metadata for an archive is diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index b4b4694a..420e5f1c 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -714,15 +714,12 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' @param names_sep String specifying the separator to use in `tidyr::unnest()` #' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix #' from `new_col_name` entirely. -#' @param all_rows If `all_rows = TRUE`, then the output will have one row per -#' combination of grouping variables and unique time values in the underlying -#' data table. Otherwise, there will be one row in the output for each time -#' value in `x` that acts as a reference time value. Default is `FALSE`. -#' @param all_versions If `all_versions = TRUE`, then `f` will be passed the -#' version history (all `version <= ref_time_value`) for rows having -#' `time_value` between `ref_time_value - before` and `ref_time_value`. -#' Otherwise, `f` will be passed only the most recent `version` for every -#' unique `time_value`. Default is `FALSE`. +#' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If +#' `all_versions = TRUE`, then `f` will be passed the version history (all +#' `version <= ref_time_value`) for rows having `time_value` between +#' `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be +#' passed only the most recent `version` for every unique `time_value`. +#' Default is `FALSE`. #' @return A tibble whose columns are: the grouping variables, `time_value`, #' containing the reference time values for the slide computation, and a #' column named according to the `new_col_name` argument, containing the slide @@ -747,7 +744,9 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' `epi_df`-ness of the first argument to each computation; `epi_slide` only #' provides the grouping variables in the second input, and will convert the #' first input into a regular tibble if the grouping variables include the -#' essential `geo_value` column. +#' essential `geo_value` column. (With `all_versions=TRUE`, `epix_slide` will +#' will provide an `epi_archive` rather than an `epi-df` to each +#' computation.) #' 4. The output class and columns are similar but different: `epix_slide()` #' returns an `epi_df` or tibble containing only the grouping variables, #' `time_value`, and the new column(s) from the slide computations, whereas @@ -759,17 +758,16 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' or[`dplyr::summarize`] in `dplyr` 1.0.0, while `epi_slide` is roughly #' analogous to `dplyr::mutate` followed by `dplyr::arrange`) This is detailed #' in the "advanced" vignette. -#' 6. Similar to the row recyling, while `all_rows=TRUE` is designed to mimic -#' `epi_slide` by completing based on distinct combinations of `geo_value`, -#' `time_value`, and all `other_keys` present in the version data with -#' `time_value` matching one of the `ref_time_values`, this can have -#' unexpected behaviors due reporting latency or reporting dropping in and -#' out. +#' 6. `all_rows` is not supported in `epix_slide`; since the slide +#' computations are allowed more flexibility in their outputs than in +#' `epi_slide`, we can't guess a good representation for missing computations +#' for excluded group-`ref_time_value` pairs. #' 7. The `ref_time_values` default for `epix_slide` is based on making an #' evenly-spaced sequence out of the `version`s in the `DT` plus the #' `versions_end`, rather than the `time_value`s. -#' Apart from this, the interfaces between `epix_slide()` and `epi_slide()` are -#' the same. +#' +#' Apart from the above distinctions, the interfaces between `epix_slide()` and +#' `epi_slide()` are the same. #' #' Furthermore, the current function can be considerably slower than #' `epi_slide()`, for two reasons: (1) it must repeatedly fetch @@ -848,7 +846,7 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr epix_slide = function(x, f, ..., before, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", - all_rows = FALSE, all_versions = FALSE) { + all_versions = FALSE) { if (!is_epi_archive(x, grouped_okay=TRUE)) { Abort("`x` must be of class `epi_archive` or `grouped_epi_archive`.") } @@ -858,7 +856,6 @@ epix_slide = function(x, f, ..., before, ref_time_values, new_col_name = new_col_name, as_list_col = as_list_col, names_sep = names_sep, - all_rows = all_rows, all_versions = all_versions )) } diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 8ae412a6..a4a58645 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -322,7 +322,6 @@ details. new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", - all_rows = FALSE, all_versions = FALSE )}\if{html}{\out{}} } diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index da8b9213..116dd657 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -14,7 +14,6 @@ epix_slide( new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", - all_rows = FALSE, all_versions = FALSE ) } @@ -84,16 +83,12 @@ by prepending \code{new_col_name} to the names of the list elements.} when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix from \code{new_col_name} entirely.} -\item{all_rows}{If \code{all_rows = TRUE}, then the output will have one row per -combination of grouping variables and unique time values in the underlying -data table. Otherwise, there will be one row in the output for each time -value in \code{x} that acts as a reference time value. Default is \code{FALSE}.} - -\item{all_versions}{If \code{all_versions = TRUE}, then \code{f} will be passed the -version history (all \code{version <= ref_time_value}) for rows having -\code{time_value} between \code{ref_time_value - before} and \code{ref_time_value}. -Otherwise, \code{f} will be passed only the most recent \code{version} for every -unique \code{time_value}. Default is \code{FALSE}.} +\item{all_versions}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If +\code{all_versions = TRUE}, then \code{f} will be passed the version history (all +\code{version <= ref_time_value}) for rows having \code{time_value} between +\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be +passed only the most recent \code{version} for every unique \code{time_value}. +Default is \code{FALSE}.} } \value{ A tibble whose columns are: the grouping variables, \code{time_value}, @@ -130,7 +125,9 @@ ends); \code{epi_slide} windows extend from \code{before} time steps before a \code{epi_df}-ness of the first argument to each computation; \code{epi_slide} only provides the grouping variables in the second input, and will convert the first input into a regular tibble if the grouping variables include the -essential \code{geo_value} column. +essential \code{geo_value} column. (With \code{all_versions=TRUE}, \code{epix_slide} will +will provide an \code{epi_archive} rather than an \code{epi-df} to each +computation.) \item The output class and columns are similar but different: \code{epix_slide()} returns an \code{epi_df} or tibble containing only the grouping variables, \code{time_value}, and the new column(s) from the slide computations, whereas @@ -142,19 +139,18 @@ roughly analogous to \code{\link[dplyr:reframe]{dplyr::reframe}} in \code{dplyr} or\code{\link[dplyr:summarise]{dplyr::summarize}} in \code{dplyr} 1.0.0, while \code{epi_slide} is roughly analogous to \code{dplyr::mutate} followed by \code{dplyr::arrange}) This is detailed in the "advanced" vignette. -\item Similar to the row recyling, while \code{all_rows=TRUE} is designed to mimic -\code{epi_slide} by completing based on distinct combinations of \code{geo_value}, -\code{time_value}, and all \code{other_keys} present in the version data with -\code{time_value} matching one of the \code{ref_time_values}, this can have -unexpected behaviors due reporting latency or reporting dropping in and -out. +\item \code{all_rows} is not supported in \code{epix_slide}; since the slide +computations are allowed more flexibility in their outputs than in +\code{epi_slide}, we can't guess a good representation for missing computations +for excluded group-\code{ref_time_value} pairs. \item The \code{ref_time_values} default for \code{epix_slide} is based on making an evenly-spaced sequence out of the \code{version}s in the \code{DT} plus the \code{versions_end}, rather than the \code{time_value}s. -Apart from this, the interfaces between \code{epix_slide()} and \code{epi_slide()} are -the same. } +Apart from the above distinctions, the interfaces between \code{epix_slide()} and +\code{epi_slide()} are the same. + Furthermore, the current function can be considerably slower than \code{epi_slide()}, for two reasons: (1) it must repeatedly fetch properly-versioned snapshots from the data archive (via its \code{as_of()} diff --git a/tests/testthat/test-deprecations.R b/tests/testthat/test-deprecations.R new file mode 100644 index 00000000..334b4488 --- /dev/null +++ b/tests/testthat/test-deprecations.R @@ -0,0 +1,48 @@ + +test_that("epix_slide group_by= deprecation works",{ + expect_error( + archive_cases_dv_subset %>% + epix_slide(function(...) {}, before=2L, group_by=c()), + class = "epiprocess__epix_slide_group_by_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset$ + slide(function(...) {}, before=2L, group_by=c()), + class = "epiprocess__epix_slide_group_by_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset %>% + group_by(geo_value) %>% + epix_slide(function(...) {}, before=2L, group_by=c()), + class = "epiprocess__epix_slide_group_by_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset$ + group_by(geo_value)$ + slide(function(...) {}, before=2L, group_by=c()), + class = "epiprocess__epix_slide_group_by_parameter_deprecated" + ) + # + expect_error( + archive_cases_dv_subset %>% + epix_slide(function(...) {}, before=2L, all_rows=TRUE), + class = "epiprocess__epix_slide_all_rows_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset$ + slide(function(...) {}, before=2L, all_rows=TRUE), + class = "epiprocess__epix_slide_all_rows_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset %>% + group_by(geo_value) %>% + epix_slide(function(...) {}, before=2L, all_rows=TRUE), + class = "epiprocess__epix_slide_all_rows_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset$ + group_by(geo_value)$ + slide(function(...) {}, before=2L, all_rows=TRUE), + class = "epiprocess__epix_slide_all_rows_parameter_deprecated" + ) +}) From cb195c2b5db373287c3269544249c1df71496280 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 4 May 2023 10:57:17 -0700 Subject: [PATCH 10/32] Always output ungrouped, no-metadata tibble from `epix_slide`, `as_tibble.epi_df` To make `epix_slide` more like `reframe` and to make `as_tibble` on `epi_df`s act more like other tibble subclasses. --- DESCRIPTION | 2 +- NAMESPACE | 2 ++ R/grouped_epi_archive.R | 27 +++++++++----- R/methods-epi_archive.R | 16 ++++----- R/methods-epi_df.R | 11 ++++++ man/epix_slide.Rd | 10 +++--- man/group_by.epi_archive.Rd | 6 ++-- man/print.epi_df.Rd | 7 ++-- tests/testthat/test-epix_slide.R | 43 ++++++++++++----------- tests/testthat/test-grouped_epi_archive.R | 27 ++++++++------ vignettes/advanced.Rmd | 15 ++++---- 11 files changed, 94 insertions(+), 72 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8a06e6f5..4c63a832 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -61,7 +61,7 @@ Config/testthat/edition: 3 Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.1 +RoxygenNote: 7.2.3 Depends: R (>= 2.10) URL: https://cmu-delphi.github.io/epiprocess/ diff --git a/NAMESPACE b/NAMESPACE index 10847e6c..b2f76706 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -30,6 +30,7 @@ export(archive_cases_dv_subset) export(arrange) export(as_epi_archive) export(as_epi_df) +export(as_tibble.epi_df) export(as_tsibble) export(detect_outlr) export(detect_outlr_rm) @@ -97,6 +98,7 @@ importFrom(rlang,sym) importFrom(rlang,syms) importFrom(stats,cor) importFrom(stats,median) +importFrom(tibble,as_tibble) importFrom(tidyr,unnest) importFrom(tidyselect,eval_select) importFrom(tidyselect,starts_with) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index ddd4527f..c10e4f24 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -430,16 +430,25 @@ grouped_epi_archive = x = tidyr::unnest(x, !!new_col, names_sep = names_sep) } - if (is_epi_df(x)) { - # The analogue of `epi_df`'s `as_of` metadata for an archive is - # `$versions_end`, at least in the current absence of - # separate fields/columns denoting the "archive version" with a - # different resolution, or from the perspective of a different - # stage of a data pipeline. The `as_of` that is automatically - # derived won't always match; override: + # if (is_epi_df(x)) { + # # The analogue of `epi_df`'s `as_of` metadata for an archive is + # # `$versions_end`, at least in the current absence of + # # separate fields/columns denoting the "archive version" with a + # # different resolution, or from the perspective of a different + # # stage of a data pipeline. The `as_of` that is automatically + # # derived won't always match; override: + # attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end + # } - attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end - } + # XXX We need to work out when we want to return an `epi_df` and how + # to get appropriate keys (see #290, #223, #163). We'll probably + # need the commented-out code above if we ever output an `epi_df`. + # However, as a stopgap measure to have consistency when grouping by + # `geo_value` or not, and to prevent `epi_df` output with invalid + # `geo_type` or `other_keys`, always output a tibble. + # + # For consistency with `reframe`, this should always be ungrouped. + x <- as_tibble(x) return(x) } diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 420e5f1c..65835623 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -582,8 +582,7 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' epix_slide(f = ~ mean(.x$case_rate_7d_av), #' before = 2, #' ref_time_values = as.Date("2020-06-11") + 0:2, -#' new_col_name = 'case_rate_3d_av') %>% -#' ungroup() +#' new_col_name = 'case_rate_3d_av') #' #' # ----------------------------------------------------------------- #' @@ -618,8 +617,7 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' #' toy_archive %>% #' group_by(geo_value, age_group, .drop=FALSE) %>% -#' epix_slide(f = ~ sum(.x$value), before = 20) %>% -#' ungroup() +#' epix_slide(f = ~ sum(.x$value), before = 20) #' #' @importFrom dplyr group_by #' @export @@ -748,10 +746,10 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' will provide an `epi_archive` rather than an `epi-df` to each #' computation.) #' 4. The output class and columns are similar but different: `epix_slide()` -#' returns an `epi_df` or tibble containing only the grouping variables, +#' returns an ungrouped tibble containing only the grouping variables, #' `time_value`, and the new column(s) from the slide computations, whereas -#' `epi_slide()` returns an `epi_df` with all original variables plus the new -#' columns from the slide computations. +#' `epi_slide()` returns an `epi_df`, retaining groupings, with all original +#' variables plus the new columns from the slide computations. #' 5. There are no size stability checks or element/row recycling to maintain #' size stability in `epix_slide`, unlike in `epi_slide`. (`epix_slide` is #' roughly analogous to [`dplyr::reframe`] in `dplyr` 1.1.0 @@ -801,8 +799,7 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' epix_slide(f = ~ mean(.x$case_rate_7d_av), #' before = 2, #' ref_time_values = ref_time_values, -#' new_col_name = 'case_rate_7d_av_recent_av') %>% -#' ungroup() +#' new_col_name = 'case_rate_7d_av_recent_av') #' # We requested time windows that started 2 days before the corresponding time #' # values. The actual number of `time_value`s in each computation depends on #' # the reporting latency of the signal and `time_value` range covered by the @@ -838,7 +835,6 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' }, #' before = 2, all_versions = TRUE, #' ref_time_values = ref_time_values, names_sep=NULL) %>% -#' ungroup() %>% #' arrange(geo_value, time_value) #' #' @importFrom rlang enquo !!! diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 503b8add..ff6c3c88 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -1,3 +1,14 @@ +#' Convert to tibble, dropping metadata +#' @rdname print.epi_df +#' +#' @importFrom tibble as_tibble +#' @export +as_tibble.epi_df = function(x, ...) { + result = NextMethod() + attr(result, "metadata") <- NULL + return(result) +} + #' Convert to tsibble format #' #' Converts an `epi_df` object into a tsibble, where the index is taken to be diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index 116dd657..896cd38d 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -129,10 +129,10 @@ essential \code{geo_value} column. (With \code{all_versions=TRUE}, \code{epix_sl will provide an \code{epi_archive} rather than an \code{epi-df} to each computation.) \item The output class and columns are similar but different: \code{epix_slide()} -returns an \code{epi_df} or tibble containing only the grouping variables, +returns an ungrouped tibble containing only the grouping variables, \code{time_value}, and the new column(s) from the slide computations, whereas -\code{epi_slide()} returns an \code{epi_df} with all original variables plus the new -columns from the slide computations. +\code{epi_slide()} returns an \code{epi_df}, retaining groupings, with all original +variables plus the new columns from the slide computations. \item There are no size stability checks or element/row recycling to maintain size stability in \code{epix_slide}, unlike in \code{epi_slide}. (\code{epix_slide} is roughly analogous to \code{\link[dplyr:reframe]{dplyr::reframe}} in \code{dplyr} 1.1.0 @@ -184,8 +184,7 @@ archive_cases_dv_subset \%>\% epix_slide(f = ~ mean(.x$case_rate_7d_av), before = 2, ref_time_values = ref_time_values, - new_col_name = 'case_rate_7d_av_recent_av') \%>\% - ungroup() + new_col_name = 'case_rate_7d_av_recent_av') # We requested time windows that started 2 days before the corresponding time # values. The actual number of `time_value`s in each computation depends on # the reporting latency of the signal and `time_value` range covered by the @@ -221,7 +220,6 @@ archive_cases_dv_subset \%>\% }, before = 2, all_versions = TRUE, ref_time_values = ref_time_values, names_sep=NULL) \%>\% - ungroup() \%>\% arrange(geo_value, time_value) } diff --git a/man/group_by.epi_archive.Rd b/man/group_by.epi_archive.Rd index aee0a07b..2282dff8 100644 --- a/man/group_by.epi_archive.Rd +++ b/man/group_by.epi_archive.Rd @@ -105,8 +105,7 @@ archive_cases_dv_subset \%>\% epix_slide(f = ~ mean(.x$case_rate_7d_av), before = 2, ref_time_values = as.Date("2020-06-11") + 0:2, - new_col_name = 'case_rate_3d_av') \%>\% - ungroup() + new_col_name = 'case_rate_3d_av') # ----------------------------------------------------------------- @@ -141,7 +140,6 @@ toy_archive \%>\% group_by(geo_value) \%>\% groups() toy_archive \%>\% group_by(geo_value, age_group, .drop=FALSE) \%>\% - epix_slide(f = ~ sum(.x$value), before = 20) \%>\% - ungroup() + epix_slide(f = ~ sum(.x$value), before = 20) } diff --git a/man/print.epi_df.Rd b/man/print.epi_df.Rd index 878e7f18..32c2a50d 100644 --- a/man/print.epi_df.Rd +++ b/man/print.epi_df.Rd @@ -1,14 +1,17 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/methods-epi_df.R -\name{print.epi_df} +\name{as_tibble.epi_df} +\alias{as_tibble.epi_df} \alias{print.epi_df} \alias{summary.epi_df} \alias{group_by.epi_df} \alias{ungroup.epi_df} \alias{group_modify.epi_df} \alias{unnest.epi_df} -\title{Base S3 methods for an \code{epi_df} object} +\title{Convert to tibble, dropping metadata} \usage{ +as_tibble.epi_df(x, ...) + \method{print}{epi_df}(x, ...) \method{summary}{epi_df}(object, ...) diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index c352f90a..3f15d7eb 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -26,9 +26,7 @@ test_that("epix_slide works as intended",{ sum_binary = c(2^3+2^2, 2^6+2^3, 2^10+2^9, - 2^15+2^14)) %>% - as_epi_df(as_of = 7) %>% - group_by(geo_value) + 2^15+2^14)) expect_identical(xx1,xx2) # * @@ -300,8 +298,7 @@ test_that("as_of and epix_slide with long enough window are compatible", { ea %>% # using `ea` here is like filtering `ea_multigeo` to `geo_value=="x"` epix_as_of(ref_time_value2, all_versions=TRUE) %>% f2() %>% - transmute(geo_value = "x", time_value = ref_time_value2, mean_abs_delta) %>% - group_by(geo_value) + transmute(geo_value = "x", time_value = ref_time_value2, mean_abs_delta) ) }) @@ -332,8 +329,7 @@ test_that("epix_slide with all_versions option works as intended",{ sum_binary = c(2^3+2^2, 2^6+2^3, 2^10+2^9+2^6, - 2^15+2^14+2^10)) %>% - group_by(geo_value) + 2^15+2^14+2^10)) expect_identical(xx1,xx2) # * @@ -349,17 +345,22 @@ test_that("epix_slide with all_versions option works as intended",{ expect_identical(xx1,xx3) # This and * Imply xx2 and xx3 are identical }) -test_that("`epix_slide` uses `versions_end` as a resulting `epi_df`'s `as_of`", { - ea_updated_stale = ea$clone() - ea_updated_stale$versions_end <- ea_updated_stale$versions_end + 3 # (dbl) - # - expect_identical( - ea_updated_stale %>% - group_by(geo_value) %>% - epix_slide(~ slice_head(.x, n = 1L), before = 10L) %>% - ungroup() %>% - attr("metadata") %>% - .$as_of, - 10 - ) -}) +# XXX currently, we're using a stopgap measure of having `epix_slide` always +# output a tibble while we think about the class, columns, and attributes of +# `epix_slide` output more carefully. We might bring this test back depending on +# the decisions there: +# +# test_that("`epix_slide` uses `versions_end` as a resulting `epi_df`'s `as_of`", { +# ea_updated_stale = ea$clone() +# ea_updated_stale$versions_end <- ea_updated_stale$versions_end + 3 # (dbl) +# # +# expect_identical( +# ea_updated_stale %>% +# group_by(geo_value) %>% +# epix_slide(~ slice_head(.x, n = 1L), before = 10L) %>% +# ungroup() %>% +# attr("metadata") %>% +# .$as_of, +# 10 +# ) +# }) diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R index 64c482bf..403bf258 100644 --- a/tests/testthat/test-grouped_epi_archive.R +++ b/tests/testthat/test-grouped_epi_archive.R @@ -46,8 +46,7 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { toy_archive %>% group_by(age_group, geo_value, .drop=FALSE), class="epiprocess__group_by_epi_archive_drop_FALSE_nonfactor_after_factor") expect_identical(grouped_factor_then_nonfactor %>% - epix_slide(before = 10, s = sum(value)) %>% - ungroup(), + epix_slide(before = 10, s = sum(value)), tibble::tribble( ~age_group, ~geo_value, ~time_value, ~s, "pediatric", NA_character_, "2000-01-02", 0, @@ -56,15 +55,21 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { "adult", "us", "2000-01-03", 255) %>% mutate(age_group = ordered(age_group, c("pediatric", "adult")), time_value = as.Date(time_value)) %>% - as_epi_df(geo_type = "nation", # bug; want "custom" from NA; issue #242 - as_of = as.Date("2000-01-03"), - additional_metadata = list(other_keys = "age_group")) %>% - # put back in expected order; see issue #166: + # # See + # # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 + # # and + # # https://github.com/cmu-delphi/epiprocess/pull/311#issuecomment-1535149256 + # # for why this is commented out, pending some design + # # decisions. + # # + # as_epi_df(geo_type = "nation", # bug; want "custom" from NA; issue #242 + # as_of = as.Date("2000-01-03"), + # additional_metadata = list(other_keys = "age_group")) %>% + # # put back in expected order; see issue #166: select(age_group, geo_value, time_value, s)) expect_identical(toy_archive %>% group_by(geo_value, age_group, .drop=FALSE) %>% - epix_slide(before = 10, s = sum(value)) %>% - ungroup(), + epix_slide(before = 10, s = sum(value)), tibble::tribble( ~geo_value, ~age_group, ~time_value, ~s, "us", "pediatric", "2000-01-02", 0, @@ -73,8 +78,8 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { "us", "adult", "2000-01-03", 255) %>% mutate(age_group = ordered(age_group, c("pediatric", "adult")), time_value = as.Date(time_value)) %>% - as_epi_df(as_of = as.Date("2000-01-03"), - additional_metadata = list(other_keys = "age_group")) %>% - # put back in expected order; see issue #166: + # as_epi_df(as_of = as.Date("2000-01-03"), + # additional_metadata = list(other_keys = "age_group")) %>% + # # put back in expected order; see issue #166: select(geo_value, age_group, time_value, s)) }) diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index 9580bee7..d4693c67 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -21,6 +21,7 @@ During basic usage (e.g., when all optional arguments are set to their defaults) * outputs one row **for every row in `edf`** (recycling outputs from computations appropriately if there are multiple time series bundled together inside any group(s)) + * keeps any grouping from `edf` * is roughly analogous to (the non-sliding) **`dplyr::mutate` followed by `dplyr::arrange(time_value, .by_group = TRUE)`** @@ -29,8 +30,9 @@ During basic usage (e.g., when all optional arguments are set to their defaults) * keeps **grouping and `time_value`** columns of `ea`, adds computed column(s) * outputs one row **for element/row output from the computations** - * is roughly analogous to (the non-sliding) **`dplyr::reframe`** (or - `dplyr::summarize`, before the `dplyr` 1.1.0 + * removes any grouping of `ea` + * is roughly analogous to (the non-sliding) **`dplyr::reframe`** (or, more + roughly, `dplyr::summarize` before the `dplyr` 1.1.0 [update](https://www.tidyverse.org/blog/2023/02/dplyr-1-1-0-pick-reframe-arrange/#reframe)) These differences in basic behavior make some common slide operations require less boilerplate: @@ -105,16 +107,14 @@ edf %>% mutate(version = time_value) %>% as_epi_archive() %>% group_by(geo_value) %>% - epix_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% - ungroup() + epix_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) edf %>% # pretend that observations about time_value t are reported in version t (nowcasts) mutate(version = time_value) %>% as_epi_archive() %>% group_by(geo_value) %>% - epix_slide(~ mean(.x$x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% - ungroup() + epix_slide(~ mean(.x$x), before = 1, ref_time_values = as.Date("2020-06-02")) ``` When the slide computation returns an atomic vector (rather than a single value) @@ -205,8 +205,7 @@ edf %>% group_by(geo_value) %>% epix_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), ref_time_values = as.Date("2020-06-02"), - before = 1, as_list_col = FALSE, names_sep = NULL) %>% - ungroup() + before = 1, as_list_col = FALSE, names_sep = NULL) ``` ## Multi-row outputs From b7a07e4c118b5776038ef17c2a5025185da801b5 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 4 May 2023 11:04:57 -0700 Subject: [PATCH 11/32] Refactor `ref_time_values` calc, test 0-row `f` outputs in `epix_slide` --- NAMESPACE | 2 +- R/archive.R | 2 +- R/grouped_epi_archive.R | 20 ++++---- R/methods-epi_archive.R | 9 ++++ man/print.epi_df.Rd | 2 +- tests/testthat/test-epix_slide.R | 82 ++++++++++++++++++++++++++++---- 6 files changed, 94 insertions(+), 23 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index b2f76706..1c344a3d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,7 @@ S3method(as_epi_df,data.frame) S3method(as_epi_df,epi_df) S3method(as_epi_df,tbl_df) S3method(as_epi_df,tbl_ts) +S3method(as_tibble,epi_df) S3method(as_tsibble,epi_df) S3method(dplyr_col_modify,col_modify_recorder_df) S3method(dplyr_col_modify,epi_df) @@ -30,7 +31,6 @@ export(archive_cases_dv_subset) export(arrange) export(as_epi_archive) export(as_epi_df) -export(as_tibble.epi_df) export(as_tsibble) export(detect_outlr) export(detect_outlr_rm) diff --git a/R/archive.R b/R/archive.R index 086d31b3..5897fc4d 100644 --- a/R/archive.R +++ b/R/archive.R @@ -307,7 +307,7 @@ epi_archive = Abort("compactify must be boolean or null.") } - # Apply defaults and conduct checks and apply defaults for + # Apply defaults and conduct checks for # `clobberable_versions_start`, `versions_end`: if (missing(clobberable_versions_start)) { clobberable_versions_start <- NA diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index c10e4f24..106726dc 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -216,8 +216,7 @@ grouped_epi_archive = } if (missing(ref_time_values)) { - versions_with_updates = c(private$ungrouped$DT$version, private$ungrouped$versions_end) - ref_time_values = tidyr::full_seq(versions_with_updates, guess_period(versions_with_updates)) + ref_time_values = epix_slide_ref_time_values_default(private$ungrouped) } else if (length(ref_time_values) == 0L) { Abort("`ref_time_values` must have at least one element.") } else if (any(is.na(ref_time_values))) { @@ -281,7 +280,7 @@ grouped_epi_archive = if (! (is.atomic(comp_value) || is.data.frame(comp_value))) { Abort("The slide computation must return an atomic vector or a data frame.") } - + # Label every result row with the `ref_time_value`: return(tibble::tibble(time_value = .env$ref_time_value, !!new_col := .env$comp_value)) @@ -305,11 +304,12 @@ grouped_epi_archive = group_modify_fn = comp_one_grp } else { as_of_archive = as_of_raw - # We essentially want to `group_modify` the archive, but don't - # provide an implementation yet. Next best would be - # `group_modify` on its `$DT`, but that has different behavior - # based on whether or not `dtplyr` is loaded. Instead, go - # through a , trying to avoid copies. + # We essentially want to `group_modify` the archive, but + # haven't implemented this method yet. Next best would be + # `group_modify` on its `$DT`, but that has different + # behavior based on whether or not `dtplyr` is loaded. + # Instead, go through an ordinary data frame, trying to avoid + # copies. if (address(as_of_archive$DT) == address(private$ungrouped$DT)) { # `as_of` aliased its the full `$DT`; copy before mutating: as_of_archive$DT <- copy(as_of_archive$DT) @@ -317,7 +317,7 @@ grouped_epi_archive = dt_key = data.table::key(as_of_archive$DT) as_of_df = as_of_archive$DT data.table::setDF(as_of_df) - + # Convert each subgroup chunk to an archive before running the calculation. group_modify_fn = function(.data_group, .group_key, f, ..., @@ -336,7 +336,7 @@ grouped_epi_archive = ) } } - + return( dplyr::group_by(as_of_df, dplyr::across(tidyselect::all_of(private$vars)), .drop=private$drop) %>% diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 65835623..2ab4b279 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -856,6 +856,15 @@ epix_slide = function(x, f, ..., before, ref_time_values, )) } +#' Default value for `ref_time_values` in an `epix_slide` +#' +#' @noRd +epix_slide_ref_time_values_default = function(ea) { + versions_with_updates = c(ea$DT$version, ea$versions_end) + ref_time_values = tidyr::full_seq(versions_with_updates, guess_period(versions_with_updates)) + return (ref_time_values) +} + #' Filter an `epi_archive` object to keep only older versions #' #' Generates a filtered `epi_archive` from an `epi_archive` object, keeping diff --git a/man/print.epi_df.Rd b/man/print.epi_df.Rd index 32c2a50d..babece3e 100644 --- a/man/print.epi_df.Rd +++ b/man/print.epi_df.Rd @@ -10,7 +10,7 @@ \alias{unnest.epi_df} \title{Convert to tibble, dropping metadata} \usage{ -as_tibble.epi_df(x, ...) +\method{as_tibble}{epi_df}(x, ...) \method{print}{epi_df}(x, ...) diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index 3f15d7eb..4115aa33 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -178,16 +178,15 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss }) ea <- tibble::tribble(~version, ~time_value, ~binary, - 2, 1:1, 2^(1:1), - 3, 1:2, 2^(2:1), - 4, 1:3, 2^(3:1), - 5, 1:4, 2^(4:1), - 6, 1:5, 2^(5:1), - 7, 1:6, 2^(6:1)) %>% - tidyr::unnest(c(time_value,binary)) - -ea$geo_value <- "x" -ea <- as_epi_archive(ea) + 2, 1:1, 2^(1:1), + 3, 1:2, 2^(2:1), + 4, 1:3, 2^(3:1), + 5, 1:4, 2^(4:1), + 6, 1:5, 2^(5:1), + 7, 1:6, 2^(6:1)) %>% + tidyr::unnest(c(time_value,binary)) %>% + mutate(geo_value = "x") %>% + as_epi_archive() test_that("epix_slide with all_versions option has access to all older versions", { library(data.table) @@ -364,3 +363,66 @@ test_that("epix_slide with all_versions option works as intended",{ # 10 # ) # }) + +test_that("epix_slide works with 0-row computation outputs", { + epix_slide_empty = function(ea, ...) { + ea %>% + epix_slide(before = 5L, ..., function(x, g) { + tibble::tibble() + }) + } + expect_identical( + ea %>% + epix_slide_empty(), + tibble::tibble( + time_value = ea$DT$version[integer(0)] + ) + ) + expect_identical( + ea %>% + group_by(geo_value) %>% + epix_slide_empty(), + tibble::tibble( + geo_value = ea$DT$geo_value[integer(0)], + time_value = ea$DT$version[integer(0)] + ) # %>% + # new_epi_df(geo_type = ea$geo_type, time_type = ea$time_type, + # as_of = ea$versions_end) + ) + # with `all_versions=TRUE`, we have something similar but never get an + # `epi_df`: + expect_identical( + ea %>% + epix_slide_empty(all_versions=TRUE), + tibble::tibble( + time_value = ea$DT$version[integer(0)] + ) + ) + expect_identical( + ea %>% + group_by(geo_value) %>% + epix_slide_empty(all_versions=TRUE), + tibble::tibble( + geo_value = ea$DT$geo_value[integer(0)], + time_value = ea$DT$version[integer(0)] + ) + ) +}) + +# test_that("epix_slide grouped by geo can produce `epi_df` output", { +# # This is a characterization test. Not sure we actually want this behavior; +# # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 +# expect_identical( +# ea %>% +# group_by(geo_value) %>% +# epix_slide(before = 5L, function(x,g) { +# tibble::tibble(value = 42) +# }, names_sep = NULL), +# tibble::tibble( +# geo_value = "x", +# time_value = epix_slide_ref_time_values_default(ea), +# value = 42 +# ) %>% +# new_epi_df(as_of = ea$versions_end) +# ) +# }) From c06f3ee95af5cacca564690e1f72e5f2de318f42 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 4 May 2023 11:10:24 -0700 Subject: [PATCH 12/32] Fix DESCRIPTION version, update NEWS.md with `epix_slide` changes --- DESCRIPTION | 2 +- NEWS.md | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4c63a832..894b23a4 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.6.0 +Version: 0.6.0.9999 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", role = "aut"), diff --git a/NEWS.md b/NEWS.md index 4745f09a..1ec53968 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,15 @@ Note that `epiprocess` uses the [Semantic Versioning ("semver")](https://semver.org/) scheme for all release versions, but any inter-release development versions will include an additional ".9999" suffix. +## Breaking changes: + +* `epix_slide` has been made more like `dplyr::reframe` (in `dplyr` 1.1.0 terms, + or, more roughly, `dplyr::summarize` pre-dplyr 1.1.0). It will no longer + perform element/row recycling for size stability, accepts slide computation + outputs containing any number of rows, no longer supports `all_rows`, and + always outputs an ungrouped tibble. Future versions will consider whether/when + to output an `epi_df` instead. + # epiprocess 0.6.0 ## Breaking changes: From b02b3a1ff06bab5a82588d890a3c7966daddeb5c Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 4 May 2023 15:10:46 -0700 Subject: [PATCH 13/32] Change `epix_slide` to be more analogous to `group_modify` rather than `reframe`. This way we can match the grouping/ungroupedness just like in `epi_slide`, except for the zero-variable groupings not supported by `grouped_df`. --- NEWS.md | 11 ++++++----- R/grouped_epi_archive.R | 11 +++++------ R/methods-epi_archive.R | 24 ++++++++++++++--------- R/methods-epi_df.R | 4 +--- man/epix_slide.Rd | 18 ++++++++++------- man/group_by.epi_archive.Rd | 6 ++++-- tests/testthat/test-epix_slide.R | 23 +++++++++++++--------- tests/testthat/test-grouped_epi_archive.R | 7 +++++-- tests/testthat/test-methods-epi_df.R | 8 ++++++++ vignettes/advanced.Rmd | 22 +++++++++++++-------- 10 files changed, 83 insertions(+), 51 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1ec53968..6eb7688e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,12 +6,13 @@ inter-release development versions will include an additional ".9999" suffix. ## Breaking changes: -* `epix_slide` has been made more like `dplyr::reframe` (in `dplyr` 1.1.0 terms, - or, more roughly, `dplyr::summarize` pre-dplyr 1.1.0). It will no longer +* `epix_slide` has been made more like `dplyr::group_modify`. It will no longer perform element/row recycling for size stability, accepts slide computation - outputs containing any number of rows, no longer supports `all_rows`, and - always outputs an ungrouped tibble. Future versions will consider whether/when - to output an `epi_df` instead. + outputs containing any number of rows, and no longer supports `all_rows`. +* `epix_slide` will only output grouped or ungrouped tibbles. Previously, it + would sometimes output `epi_df`s, but not consistently, and not always with + the metadata desired. Future versions will revisit this design, and consider + more closely whether/when/how to output an `epi_df`. # epiprocess 0.6.0 diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 106726dc..91b578d5 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -443,12 +443,11 @@ grouped_epi_archive = # XXX We need to work out when we want to return an `epi_df` and how # to get appropriate keys (see #290, #223, #163). We'll probably # need the commented-out code above if we ever output an `epi_df`. - # However, as a stopgap measure to have consistency when grouping by - # `geo_value` or not, and to prevent `epi_df` output with invalid - # `geo_type` or `other_keys`, always output a tibble. - # - # For consistency with `reframe`, this should always be ungrouped. - x <- as_tibble(x) + # However, as a stopgap measure to have some more consistency across + # different ways of calling `epix_slide`, and to prevent `epi_df` + # output with invalid metadata, always output a (grouped or + # ungrouped) tibble. + x <- decay_epi_df(x) return(x) } diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 2ab4b279..ee12d70e 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -582,7 +582,8 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' epix_slide(f = ~ mean(.x$case_rate_7d_av), #' before = 2, #' ref_time_values = as.Date("2020-06-11") + 0:2, -#' new_col_name = 'case_rate_3d_av') +#' new_col_name = 'case_rate_3d_av') %>% +#' ungroup() #' #' # ----------------------------------------------------------------- #' @@ -617,7 +618,8 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' #' toy_archive %>% #' group_by(geo_value, age_group, .drop=FALSE) %>% -#' epix_slide(f = ~ sum(.x$value), before = 20) +#' epix_slide(f = ~ sum(.x$value), before = 20) %>% +#' ungroup() #' #' @importFrom dplyr group_by #' @export @@ -746,14 +748,16 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' will provide an `epi_archive` rather than an `epi-df` to each #' computation.) #' 4. The output class and columns are similar but different: `epix_slide()` -#' returns an ungrouped tibble containing only the grouping variables, -#' `time_value`, and the new column(s) from the slide computations, whereas -#' `epi_slide()` returns an `epi_df`, retaining groupings, with all original -#' variables plus the new columns from the slide computations. +#' returns a tibble containing only the grouping variables, `time_value`, and +#' the new column(s) from the slide computations, whereas `epi_slide()` +#' returns an `epi_df` with all original variables plus the new columns from +#' the slide computations. (Both will mirror the grouping or ungroupedness of +#' their input, with one exception: `epi_archive`s can have trivial +#' (zero-variable) groupings, but these will be dropped in `epix_slide` +#' results as they are not supported by tibbles.) #' 5. There are no size stability checks or element/row recycling to maintain #' size stability in `epix_slide`, unlike in `epi_slide`. (`epix_slide` is -#' roughly analogous to [`dplyr::reframe`] in `dplyr` 1.1.0 -#' or[`dplyr::summarize`] in `dplyr` 1.0.0, while `epi_slide` is roughly +#' roughly analogous to [`dplyr::group_modify`], while `epi_slide` is roughly #' analogous to `dplyr::mutate` followed by `dplyr::arrange`) This is detailed #' in the "advanced" vignette. #' 6. `all_rows` is not supported in `epix_slide`; since the slide @@ -799,7 +803,8 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' epix_slide(f = ~ mean(.x$case_rate_7d_av), #' before = 2, #' ref_time_values = ref_time_values, -#' new_col_name = 'case_rate_7d_av_recent_av') +#' new_col_name = 'case_rate_7d_av_recent_av') %>% +#' ungroup() #' # We requested time windows that started 2 days before the corresponding time #' # values. The actual number of `time_value`s in each computation depends on #' # the reporting latency of the signal and `time_value` range covered by the @@ -835,6 +840,7 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' }, #' before = 2, all_versions = TRUE, #' ref_time_values = ref_time_values, names_sep=NULL) %>% +#' ungroup() %>% #' arrange(geo_value, time_value) #' #' @importFrom rlang enquo !!! diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index ff6c3c88..0a9a1e28 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -4,9 +4,7 @@ #' @importFrom tibble as_tibble #' @export as_tibble.epi_df = function(x, ...) { - result = NextMethod() - attr(result, "metadata") <- NULL - return(result) + decay_epi_df(NextMethod()) } #' Convert to tsibble format diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index 896cd38d..b6d7b75d 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -129,14 +129,16 @@ essential \code{geo_value} column. (With \code{all_versions=TRUE}, \code{epix_sl will provide an \code{epi_archive} rather than an \code{epi-df} to each computation.) \item The output class and columns are similar but different: \code{epix_slide()} -returns an ungrouped tibble containing only the grouping variables, -\code{time_value}, and the new column(s) from the slide computations, whereas -\code{epi_slide()} returns an \code{epi_df}, retaining groupings, with all original -variables plus the new columns from the slide computations. +returns a tibble containing only the grouping variables, \code{time_value}, and +the new column(s) from the slide computations, whereas \code{epi_slide()} +returns an \code{epi_df} with all original variables plus the new columns from +the slide computations. (Both will mirror the grouping or ungroupedness of +their input, with one exception: \code{epi_archive}s can have trivial +(zero-variable) groupings, but these will be dropped in \code{epix_slide} +results as they are not supported by tibbles.) \item There are no size stability checks or element/row recycling to maintain size stability in \code{epix_slide}, unlike in \code{epi_slide}. (\code{epix_slide} is -roughly analogous to \code{\link[dplyr:reframe]{dplyr::reframe}} in \code{dplyr} 1.1.0 -or\code{\link[dplyr:summarise]{dplyr::summarize}} in \code{dplyr} 1.0.0, while \code{epi_slide} is roughly +roughly analogous to \code{\link[dplyr:group_map]{dplyr::group_modify}}, while \code{epi_slide} is roughly analogous to \code{dplyr::mutate} followed by \code{dplyr::arrange}) This is detailed in the "advanced" vignette. \item \code{all_rows} is not supported in \code{epix_slide}; since the slide @@ -184,7 +186,8 @@ archive_cases_dv_subset \%>\% epix_slide(f = ~ mean(.x$case_rate_7d_av), before = 2, ref_time_values = ref_time_values, - new_col_name = 'case_rate_7d_av_recent_av') + new_col_name = 'case_rate_7d_av_recent_av') \%>\% + ungroup() # We requested time windows that started 2 days before the corresponding time # values. The actual number of `time_value`s in each computation depends on # the reporting latency of the signal and `time_value` range covered by the @@ -220,6 +223,7 @@ archive_cases_dv_subset \%>\% }, before = 2, all_versions = TRUE, ref_time_values = ref_time_values, names_sep=NULL) \%>\% + ungroup() \%>\% arrange(geo_value, time_value) } diff --git a/man/group_by.epi_archive.Rd b/man/group_by.epi_archive.Rd index 2282dff8..aee0a07b 100644 --- a/man/group_by.epi_archive.Rd +++ b/man/group_by.epi_archive.Rd @@ -105,7 +105,8 @@ archive_cases_dv_subset \%>\% epix_slide(f = ~ mean(.x$case_rate_7d_av), before = 2, ref_time_values = as.Date("2020-06-11") + 0:2, - new_col_name = 'case_rate_3d_av') + new_col_name = 'case_rate_3d_av') \%>\% + ungroup() # ----------------------------------------------------------------- @@ -140,6 +141,7 @@ toy_archive \%>\% group_by(geo_value) \%>\% groups() toy_archive \%>\% group_by(geo_value, age_group, .drop=FALSE) \%>\% - epix_slide(f = ~ sum(.x$value), before = 20) + epix_slide(f = ~ sum(.x$value), before = 20) \%>\% + ungroup() } diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index 4115aa33..f4be8a70 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -26,7 +26,8 @@ test_that("epix_slide works as intended",{ sum_binary = c(2^3+2^2, 2^6+2^3, 2^10+2^9, - 2^15+2^14)) + 2^15+2^14)) %>% + group_by(geo_value) expect_identical(xx1,xx2) # * @@ -297,7 +298,8 @@ test_that("as_of and epix_slide with long enough window are compatible", { ea %>% # using `ea` here is like filtering `ea_multigeo` to `geo_value=="x"` epix_as_of(ref_time_value2, all_versions=TRUE) %>% f2() %>% - transmute(geo_value = "x", time_value = ref_time_value2, mean_abs_delta) + transmute(geo_value = "x", time_value = ref_time_value2, mean_abs_delta) %>% + group_by(geo_value) ) }) @@ -328,7 +330,8 @@ test_that("epix_slide with all_versions option works as intended",{ sum_binary = c(2^3+2^2, 2^6+2^3, 2^10+2^9+2^6, - 2^15+2^14+2^10)) + 2^15+2^14+2^10)) %>% + group_by(geo_value) expect_identical(xx1,xx2) # * @@ -345,9 +348,9 @@ test_that("epix_slide with all_versions option works as intended",{ }) # XXX currently, we're using a stopgap measure of having `epix_slide` always -# output a tibble while we think about the class, columns, and attributes of -# `epix_slide` output more carefully. We might bring this test back depending on -# the decisions there: +# output a (grouped/ungrouped) tibble while we think about the class, columns, +# and attributes of `epix_slide` output more carefully. We might bring this test +# back depending on the decisions there: # # test_that("`epix_slide` uses `versions_end` as a resulting `epi_df`'s `as_of`", { # ea_updated_stale = ea$clone() @@ -385,9 +388,10 @@ test_that("epix_slide works with 0-row computation outputs", { tibble::tibble( geo_value = ea$DT$geo_value[integer(0)], time_value = ea$DT$version[integer(0)] - ) # %>% + ) %>% # new_epi_df(geo_type = ea$geo_type, time_type = ea$time_type, - # as_of = ea$versions_end) + # as_of = ea$versions_end) %>% + group_by(geo_value) ) # with `all_versions=TRUE`, we have something similar but never get an # `epi_df`: @@ -405,7 +409,8 @@ test_that("epix_slide works with 0-row computation outputs", { tibble::tibble( geo_value = ea$DT$geo_value[integer(0)], time_value = ea$DT$version[integer(0)] - ) + ) %>% + group_by(geo_value) ) }) diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R index 403bf258..84c371f9 100644 --- a/tests/testthat/test-grouped_epi_archive.R +++ b/tests/testthat/test-grouped_epi_archive.R @@ -66,7 +66,8 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { # as_of = as.Date("2000-01-03"), # additional_metadata = list(other_keys = "age_group")) %>% # # put back in expected order; see issue #166: - select(age_group, geo_value, time_value, s)) + # select(age_group, geo_value, time_value, s) %>% + group_by(age_group, geo_value, .drop=FALSE)) expect_identical(toy_archive %>% group_by(geo_value, age_group, .drop=FALSE) %>% epix_slide(before = 10, s = sum(value)), @@ -81,5 +82,7 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { # as_epi_df(as_of = as.Date("2000-01-03"), # additional_metadata = list(other_keys = "age_group")) %>% # # put back in expected order; see issue #166: - select(geo_value, age_group, time_value, s)) + # select(geo_value, age_group, time_value, s) %>% + group_by(geo_value, age_group, .drop=FALSE) + ) }) diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R index de43d7c2..9d03cf93 100644 --- a/tests/testthat/test-methods-epi_df.R +++ b/tests/testthat/test-methods-epi_df.R @@ -116,3 +116,11 @@ test_that("Correct metadata when subset includes some of other_keys", { # Including both original other_keys was already tested above }) +test_that("Metadata and grouping are dropped by `as_tibble`", { + grouped_converted = toy_epi_df %>% + group_by(geo_value) %>% + as_tibble() + expect_true( + !any(c("metadata", "groups") %in% names(attributes(grouped_converted))) + ) +}) diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index d4693c67..b2f71e07 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -21,19 +21,22 @@ During basic usage (e.g., when all optional arguments are set to their defaults) * outputs one row **for every row in `edf`** (recycling outputs from computations appropriately if there are multiple time series bundled together inside any group(s)) - * keeps any grouping from `edf` + * maintains the grouping or ungroupedness of `edf` * is roughly analogous to (the non-sliding) **`dplyr::mutate` followed by `dplyr::arrange(time_value, .by_group = TRUE)`** + * outputs an **`epi_df`** if the required columns are present, otherwise a + tibble * `epix_slide(ea, , .....)`: * keeps **grouping and `time_value`** columns of `ea`, adds computed column(s) * outputs one row **for element/row output from the computations** - * removes any grouping of `ea` - * is roughly analogous to (the non-sliding) **`dplyr::reframe`** (or, more - roughly, `dplyr::summarize` before the `dplyr` 1.1.0 - [update](https://www.tidyverse.org/blog/2023/02/dplyr-1-1-0-pick-reframe-arrange/#reframe)) + * maintains the grouping or ungroupedness of `ea`, unless it was explicitly + grouped by zero variables; this isn't supported by `grouped_df` and it will + automatically turn into an ungrouped tibble + * is roughly analogous to (the non-sliding) **`dplyr::group_modify`** + * outputs a **tibble** These differences in basic behavior make some common slide operations require less boilerplate: * predictors and targets calculated with `epi_slide` are automatically lined up @@ -107,14 +110,16 @@ edf %>% mutate(version = time_value) %>% as_epi_archive() %>% group_by(geo_value) %>% - epix_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) + epix_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% + ungroup() edf %>% # pretend that observations about time_value t are reported in version t (nowcasts) mutate(version = time_value) %>% as_epi_archive() %>% group_by(geo_value) %>% - epix_slide(~ mean(.x$x), before = 1, ref_time_values = as.Date("2020-06-02")) + epix_slide(~ mean(.x$x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% + ungroup() ``` When the slide computation returns an atomic vector (rather than a single value) @@ -205,7 +210,8 @@ edf %>% group_by(geo_value) %>% epix_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), ref_time_values = as.Date("2020-06-02"), - before = 1, as_list_col = FALSE, names_sep = NULL) + before = 1, as_list_col = FALSE, names_sep = NULL) %>% + ungroup() ``` ## Multi-row outputs From 442f3e9ee1aa3632571004768569a873221676f7 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 5 May 2023 02:18:00 -0700 Subject: [PATCH 14/32] Fix `epix_slide` `as_list_col=TRUE` outputting df-type col not list --- R/grouped_epi_archive.R | 5 +++++ tests/testthat/test-epix_slide.R | 34 +++++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 91b578d5..75b331db 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -280,6 +280,11 @@ grouped_epi_archive = if (! (is.atomic(comp_value) || is.data.frame(comp_value))) { Abort("The slide computation must return an atomic vector or a data frame.") } + if (is.data.frame(comp_value)) { + # Wrap in a list so that we get a list-type col rather than a + # data.frame-type col when `as_list_col = TRUE`: + comp_value <- list(comp_value) + } # Label every result row with the `ref_time_value`: return(tibble::tibble(time_value = .env$ref_time_value, diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index f4be8a70..8eaeba7a 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -42,6 +42,38 @@ test_that("epix_slide works as intended",{ expect_identical(xx1,xx3) # This and * Imply xx2 and xx3 are identical }) +test_that("epix_slide works as intended with `as_list_col=TRUE`",{ + # Note Issue #261. + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = ~ data.frame(bin_sum = sum(.x$binary)), + before = 2, + as_list_col=TRUE) + + xx2 <- tibble(geo_value = rep("x",4), + time_value = c(4,5,6,7), + slide_value = + c(2^3+2^2, + 2^6+2^3, + 2^10+2^9, + 2^15+2^14) %>% + purrr::map(~ data.frame(bin_sum = .x)) + ) %>% + group_by(geo_value) + + expect_identical(xx1,xx2) # * + + xx3 <- ( + xx + $group_by(dplyr::across(dplyr::all_of("geo_value"))) + $slide(f = ~ data.frame(bin_sum = sum(.x$binary)), + before = 2, + as_list_col = TRUE) + ) + + expect_identical(xx1,xx3) # This and * Imply xx2 and xx3 are identical +}) + test_that("epix_slide `before` validation works", { expect_error(xx$slide(f = ~ sum(.x$binary)), "`before` is required") @@ -303,7 +335,7 @@ test_that("as_of and epix_slide with long enough window are compatible", { ) }) -test_that("epix_slide `f` is passed an ungrouped `epi_archive`",{ +test_that("epix_slide `f` is passed an ungrouped `epi_archive` when `all_versions=TRUE`",{ slide_fn <- function(x, g) { expect_true(is_epi_archive(x)) return(NA) From e440a183e6b6a2446a173f1969d676c78fa7fa4c Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 5 May 2023 02:18:50 -0700 Subject: [PATCH 15/32] Add `epix_slide`-like-`group_modify` migration notes in NEWS.md --- NEWS.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/NEWS.md b/NEWS.md index 6eb7688e..7d339fc6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,10 +9,16 @@ inter-release development versions will include an additional ".9999" suffix. * `epix_slide` has been made more like `dplyr::group_modify`. It will no longer perform element/row recycling for size stability, accepts slide computation outputs containing any number of rows, and no longer supports `all_rows`. + * To keep the old behavior, manually perform row recycling within `f` + computations, and/or `left_join` a data frame representing the desired + output structure with the current `epix_slide()` result to obtain the + desired repetitions and completions expected with `all_rows = TRUE`. * `epix_slide` will only output grouped or ungrouped tibbles. Previously, it would sometimes output `epi_df`s, but not consistently, and not always with the metadata desired. Future versions will revisit this design, and consider more closely whether/when/how to output an `epi_df`. + * To keep the old behavior, convert the output of `epix_slide()` to `epi_df` + when desired and set the metadata appropriately. # epiprocess 0.6.0 From 134b47ce98b33a9bf32e4e15dec99649ccfa7a2f Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 5 May 2023 03:15:02 -0700 Subject: [PATCH 16/32] Fixm roxygen copy-paste error for `group_modify` --- NAMESPACE | 1 + R/methods-epi_df.R | 2 +- man/print.epi_df.Rd | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 1c344a3d..e342058e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,6 +18,7 @@ S3method(group_by,epi_archive) S3method(group_by,epi_df) S3method(group_by,grouped_epi_archive) S3method(group_by_drop_default,grouped_epi_archive) +S3method(group_modify,epi_df) S3method(groups,grouped_epi_archive) S3method(next_after,Date) S3method(next_after,integer) diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 0a9a1e28..c8b7a228 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -196,7 +196,7 @@ ungroup.epi_df = function(x, ...) { reclass(x, metadata) } -#' @method unnest epi_df +#' @method group_modify epi_df #' @rdname print.epi_df #' @param data The `epi_df` object. #' @export diff --git a/man/print.epi_df.Rd b/man/print.epi_df.Rd index babece3e..9ddd8873 100644 --- a/man/print.epi_df.Rd +++ b/man/print.epi_df.Rd @@ -20,7 +20,7 @@ \method{ungroup}{epi_df}(x, ...) -\method{unnest}{epi_df}(.data, .f, ..., .keep = FALSE) +\method{group_modify}{epi_df}(.data, .f, ..., .keep = FALSE) \method{unnest}{epi_df}(data, ...) } From a4baf7a7b4eb37d43e47a948fbdca404e359af0a Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 5 May 2023 04:47:47 -0700 Subject: [PATCH 17/32] Address non-ASCII character issue from `check()` --- R/grouped_epi_archive.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 75b331db..6798e3ae 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -115,11 +115,11 @@ grouped_epi_archive = cat("Public `grouped_epi_archive` R6 methods:\n") grouped_method_names = names(grouped_epi_archive$public_methods) ungrouped_method_names = names(epi_archive$public_methods) - writeLines(wrap_varnames(initial = "• Specialized `epi_archive` methods: ", + writeLines(wrap_varnames(initial = "\u2022 Specialized `epi_archive` methods: ", intersect(grouped_method_names, ungrouped_method_names))) - writeLines(wrap_varnames(initial = "• Exclusive to `grouped_epi_archive`: ", + writeLines(wrap_varnames(initial = "\u2022 Exclusive to `grouped_epi_archive`: ", setdiff(grouped_method_names, ungrouped_method_names))) - writeLines(wrap_varnames(initial = "• `ungroup` to use: ", + writeLines(wrap_varnames(initial = "\u2022 `ungroup` to use: ", setdiff(ungrouped_method_names, grouped_method_names))) } # Return self invisibly for convenience in `$`-"pipe": From 6d0f06064bfc2f498d9e28e1e3699617e7e686a0 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 5 May 2023 04:49:58 -0700 Subject: [PATCH 18/32] Fix `guess_period` `@param` names Remove the backticks around the parameter names, which don't belong and were caught by `check()`. --- R/utils.R | 4 ++-- man/guess_period.Rd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/utils.R b/R/utils.R index b398ff5b..6d9fcd80 100644 --- a/R/utils.R +++ b/R/utils.R @@ -415,12 +415,12 @@ gcd_num = function(dividends, ..., rrtol=1e-6, pqlim=1e6, irtol=1e-6) { #' Use max valid period as guess for `period` of `ref_time_values` #' -#' @param `ref_time_values` Vector containing time-interval-like or time-like +#' @param ref_time_values Vector containing time-interval-like or time-like #' data, with at least two distinct values, [`diff`]-able (e.g., a #' `time_value` or `version` column), and should have a sensible result from #' adding `is.numeric` versions of its `diff` result (via `as.integer` if its #' `typeof` is `"integer"`, otherwise via `as.numeric`). -#' @param `ref_time_values_arg` Optional, string; name to give `ref_time_values` +#' @param ref_time_values_arg Optional, string; name to give `ref_time_values` #' in error messages. Defaults to quoting the expression the caller fed into #' the `ref_time_values` argument. #' @return `is.numeric`, length 1; attempts to match `typeof(ref_time_values)` diff --git a/man/guess_period.Rd b/man/guess_period.Rd index e27309d3..e03a1373 100644 --- a/man/guess_period.Rd +++ b/man/guess_period.Rd @@ -10,13 +10,13 @@ guess_period( ) } \arguments{ -\item{`ref_time_values`}{Vector containing time-interval-like or time-like +\item{ref_time_values}{Vector containing time-interval-like or time-like data, with at least two distinct values, \code{\link{diff}}-able (e.g., a \code{time_value} or \code{version} column), and should have a sensible result from adding \code{is.numeric} versions of its \code{diff} result (via \code{as.integer} if its \code{typeof} is \code{"integer"}, otherwise via \code{as.numeric}).} -\item{`ref_time_values_arg`}{Optional, string; name to give \code{ref_time_values} +\item{ref_time_values_arg}{Optional, string; name to give \code{ref_time_values} in error messages. Defaults to quoting the expression the caller fed into the \code{ref_time_values} argument.} } From 21a090bd5af4e9d8741018a2bad34c6027ca65d8 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 5 May 2023 05:09:50 -0700 Subject: [PATCH 19/32] Fix `as_tibble.epi_df` docs, make it have its own topic --- R/methods-epi_df.R | 12 ++++++++++-- man/print.epi_df.Rd | 7 ++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index c8b7a228..95e710aa 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -1,9 +1,17 @@ -#' Convert to tibble, dropping metadata -#' @rdname print.epi_df +#' Convert to tibble +#' +#' Converts an `epi_df` object into a tibble, dropping metadata and any +#' grouping. +#' +#' @param x an `epi_df` +#' @param ... arguments to forward to `NextMethod()` #' #' @importFrom tibble as_tibble #' @export as_tibble.epi_df = function(x, ...) { + # Decaying drops the class and metadata. `as_tibble.grouped_df` drops the + # grouping and should be called by `NextMethod()` in the current design. + # See #223 for discussion of alternatives. decay_epi_df(NextMethod()) } diff --git a/man/print.epi_df.Rd b/man/print.epi_df.Rd index 9ddd8873..fa7fcd53 100644 --- a/man/print.epi_df.Rd +++ b/man/print.epi_df.Rd @@ -1,17 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/methods-epi_df.R -\name{as_tibble.epi_df} -\alias{as_tibble.epi_df} +\name{print.epi_df} \alias{print.epi_df} \alias{summary.epi_df} \alias{group_by.epi_df} \alias{ungroup.epi_df} \alias{group_modify.epi_df} \alias{unnest.epi_df} -\title{Convert to tibble, dropping metadata} +\title{Base S3 methods for an \code{epi_df} object} \usage{ -\method{as_tibble}{epi_df}(x, ...) - \method{print}{epi_df}(x, ...) \method{summary}{epi_df}(object, ...) From 6efd8233013ccd77718f1d93d19bd4bc45a116e5 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 5 May 2023 10:02:06 -0700 Subject: [PATCH 20/32] Fix `group_modify.epi_df` roxygen typo + missing `@param`s --- R/methods-epi_df.R | 4 +++- man/print.epi_df.Rd | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 95e710aa..6429b867 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -206,7 +206,9 @@ ungroup.epi_df = function(x, ...) { #' @method group_modify epi_df #' @rdname print.epi_df -#' @param data The `epi_df` object. +#' @param .data The `epi_df` object. +#' @param .f function or formula; see [`dplyr::group_modify`] +#' @param .keep Boolean; see [`dplyr::group_modify`] #' @export group_modify.epi_df = function(.data, .f, ..., .keep = FALSE) { dplyr::dplyr_reconstruct(NextMethod(), .data) diff --git a/man/print.epi_df.Rd b/man/print.epi_df.Rd index fa7fcd53..f5749d82 100644 --- a/man/print.epi_df.Rd +++ b/man/print.epi_df.Rd @@ -29,6 +29,12 @@ Currently unused.} \item{object}{The \code{epi_df} object.} +\item{.data}{The \code{epi_df} object.} + +\item{.f}{function or formula; see \code{\link[dplyr:group_map]{dplyr::group_modify}}} + +\item{.keep}{Boolean; see \code{\link[dplyr:group_map]{dplyr::group_modify}}} + \item{data}{The \code{epi_df} object.} } \description{ From a047b079ffdab12e0160b882ca89eca79f20cf0d Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 19 May 2023 15:40:05 -0700 Subject: [PATCH 21/32] repl(epix_slide): describe migration in `all_rows` deprecation not the reasons for deprecation. --- R/grouped_epi_archive.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 6798e3ae..a146eb3a 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -209,9 +209,8 @@ grouped_epi_archive = if ("all_rows" %in% nse_dots_names(...)) { Abort(" The `all_rows` argument has been removed from `epix_slide` (but - is still supported in `epi_slide`). Since `epix_slide` now - allows any number of rows out of slide computations, it's - unclear how `all_rows=TRUE` should fill in missing results. + is still supported in `epi_slide`). Add rows for excluded + results with a manual join instead. ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated") } From 26e9d2c3ad6e156179948ed8c6907217d9d763a8 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 19 May 2023 15:45:03 -0700 Subject: [PATCH 22/32] refactor: favor validation section over validation + `else if` --- R/methods-epi_archive.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index ee12d70e..844eb54b 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -630,7 +630,8 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr detailed_mutate = epix_detailed_restricted_mutate(.data, ...) if (!rlang::is_bool(.drop)) { Abort("`.drop` must be TRUE or FALSE") - } else if (!.drop) { + } + if (!.drop) { grouping_cols = as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]] grouping_col_is_factor = purrr::map_lgl(grouping_cols, is.factor) # ^ Use `as.list` to try to avoid any possibility of a deep copy. From dbb658ca2ad38c6b6a01ae0f1296db659dbe9f25 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 19 May 2023 15:46:42 -0700 Subject: [PATCH 23/32] Fix epix_slide warning class naming inconsistencies --- R/methods-epi_archive.R | 4 ++-- tests/testthat/test-grouped_epi_archive.R | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 844eb54b..53c572bd 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -637,10 +637,10 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr # ^ Use `as.list` to try to avoid any possibility of a deep copy. if (!any(grouping_col_is_factor)) { Warn("`.drop=FALSE` but there are no factor grouping columns; did you mean to convert one of the columns to a factor beforehand?", - class = "epiprocess__group_by_epi_archive_drop_FALSE_no_factors") + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors") } else if (any(diff(grouping_col_is_factor) == -1L)) { Warn("`.drop=FALSE` but there are one or more non-factor grouping columns listed after a factor grouping column; this may produce groups with `NA`s for these columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; depending on how you want completion to work, you might instead want to convert all grouping columns to factors beforehand, specify the non-factor grouping columns first, or use `.drop=TRUE` and add a call to `tidyr::complete`.", - class = "epiprocess__group_by_epi_archive_drop_FALSE_nonfactor_after_factor") + class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor") } } grouped_epi_archive$new(detailed_mutate[["archive"]], diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R index 84c371f9..0423352e 100644 --- a/tests/testthat/test-grouped_epi_archive.R +++ b/tests/testthat/test-grouped_epi_archive.R @@ -39,12 +39,12 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { expect_error(toy_archive %>% group_by(.drop = "bogus"), regexp = "\\.drop.*TRUE or FALSE") expect_warning(toy_archive %>% group_by(.drop=FALSE), - class="epiprocess__group_by_epi_archive_drop_FALSE_no_factors") + class="epiprocess__group_by_epi_archive__drop_FALSE_no_factors") expect_warning(toy_archive %>% group_by(geo_value, .drop=FALSE), - class="epiprocess__group_by_epi_archive_drop_FALSE_no_factors") + class="epiprocess__group_by_epi_archive__drop_FALSE_no_factors") expect_warning(grouped_factor_then_nonfactor <- toy_archive %>% group_by(age_group, geo_value, .drop=FALSE), - class="epiprocess__group_by_epi_archive_drop_FALSE_nonfactor_after_factor") + class="epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor") expect_identical(grouped_factor_then_nonfactor %>% epix_slide(before = 10, s = sum(value)), tibble::tribble( From af49aae0186d652911d2fc6af09bb8743bb73be4 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 19 May 2023 15:47:36 -0700 Subject: [PATCH 24/32] Fix missing word in `advanced.Rmd` basic usage description --- vignettes/advanced.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index b2f71e07..f6d7b9c1 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -31,7 +31,7 @@ During basic usage (e.g., when all optional arguments are set to their defaults) * keeps **grouping and `time_value`** columns of `ea`, adds computed column(s) - * outputs one row **for element/row output from the computations** + * outputs one row **for every element/row output from the computations** * maintains the grouping or ungroupedness of `ea`, unless it was explicitly grouped by zero variables; this isn't supported by `grouped_df` and it will automatically turn into an ungrouped tibble From 60ada45d941c2d5d995fb30be855344dc65b80a0 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 24 May 2023 00:10:15 -0700 Subject: [PATCH 25/32] Remove remaining references to `comp_effective_key_vars` --- R/grouped_epi_archive.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index a146eb3a..51bb14b1 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -386,7 +386,7 @@ grouped_epi_archive = # provide an implementation yet. Next best would be # `group_modify` on its `$DT`, but that has different behavior # based on whether or not `dtplyr` is loaded. Instead, go - # through a , trying to avoid copies. + # through an ordinary data frame, trying to avoid copies. if (address(as_of_archive$DT) == address(private$ungrouped$DT)) { # `as_of` aliased its the full `$DT`; copy before mutating: as_of_archive$DT <- copy(as_of_archive$DT) @@ -399,7 +399,6 @@ grouped_epi_archive = group_modify_fn = function(.data_group, .group_key, f, ..., ref_time_value, - comp_effective_key_vars, new_col) { # .data_group is coming from as_of_df as a tibble, but we # want to feed `comp_one_grp` an `epi_archive` backed by a @@ -410,7 +409,6 @@ grouped_epi_archive = .data_group_archive$DT = .data_group comp_one_grp(.data_group_archive, .group_key, f = f, quo = quo, ref_time_value = ref_time_value, - comp_effective_key_vars = comp_effective_key_vars, new_col = new_col ) } From 80d29c6c7fdf8a977f0a467fe363d9505f1458ab Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 24 May 2023 00:10:25 -0700 Subject: [PATCH 26/32] Add missing `epix_slide` tests for {fn,~,tidy} x {all_versions,not} Add test for `epix_slide` on function or tidyeval with `all_versions=FALSE`, and for formula or tidyeval with `all_versions=TRUE`. --- tests/testthat/test-epix_slide.R | 39 ++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index 8eaeba7a..21962873 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -40,6 +40,23 @@ test_that("epix_slide works as intended",{ ) expect_identical(xx1,xx3) # This and * Imply xx2 and xx3 are identical + + # function interface + xx4 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = function(x, g) { + tibble::tibble(sum_binary = sum(x$binary)) + }, before = 2, names_sep = NULL) + + expect_identical(xx1,xx4) + + # tidyeval interface + xx5 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(sum_binary = sum(binary), + before = 2) + + expect_identical(xx1,xx5) }) test_that("epix_slide works as intended with `as_list_col=TRUE`",{ @@ -269,6 +286,28 @@ test_that("epix_slide with all_versions option has access to all older versions" expect_identical(result1,result3) # This and * Imply result2 and result3 are identical + # formula interface + result4 <- ea %>% group_by() %>% + epix_slide(f = ~ slide_fn(.x, .y), + before = 10^3, + names_sep = NULL, + all_versions = TRUE) + + expect_identical(result1,result4) # This and * Imply result2 and result4 are identical + + # tidyeval interface + result5 <- ea %>% + group_by() %>% + epix_slide(data = slide_fn( + .data$clone(), # hack to convert from pronoun back to archive + stop("slide_fn doesn't use group key, no need to prepare it") + ), + before = 10^3, + names_sep = NULL, + all_versions = TRUE) + + expect_identical(result1,result5) # This and * Imply result2 and result5 are identical + expect_identical(ea, ea_orig_mirror) # We shouldn't have mutated ea }) From f70b911e59ee9fa05d28fc253781c10445475757 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 24 May 2023 02:28:21 -0700 Subject: [PATCH 27/32] Reword advanced.Rmd on epi vs epix slide number of output rows --- vignettes/advanced.Rmd | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index f6d7b9c1..ce4f8e82 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -18,7 +18,7 @@ During basic usage (e.g., when all optional arguments are set to their defaults) * `epi_slide(edf, , .....)`: * keeps **all** columns of `edf`, adds computed column(s) - * outputs one row **for every row in `edf`** (recycling outputs from + * outputs **one row per row in `edf`** (recycling outputs from computations appropriately if there are multiple time series bundled together inside any group(s)) * maintains the grouping or ungroupedness of `edf` @@ -31,7 +31,8 @@ During basic usage (e.g., when all optional arguments are set to their defaults) * keeps **grouping and `time_value`** columns of `ea`, adds computed column(s) - * outputs one row **for every element/row output from the computations** + * outputs **any number of rows** (computations are allowed to output any + number of elements/rows, and no recycling is performed) * maintains the grouping or ungroupedness of `ea`, unless it was explicitly grouped by zero variables; this isn't supported by `grouped_df` and it will automatically turn into an ungrouped tibble From de398b1ba381c676ee66a7595278c54038ce82e6 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 24 May 2023 02:49:29 -0700 Subject: [PATCH 28/32] Track missing .Rd file --- man/as_tibble.epi_df.Rd | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 man/as_tibble.epi_df.Rd diff --git a/man/as_tibble.epi_df.Rd b/man/as_tibble.epi_df.Rd new file mode 100644 index 00000000..c314f47e --- /dev/null +++ b/man/as_tibble.epi_df.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_df.R +\name{as_tibble.epi_df} +\alias{as_tibble.epi_df} +\title{Convert to tibble} +\usage{ +\method{as_tibble}{epi_df}(x, ...) +} +\arguments{ +\item{x}{an \code{epi_df}} + +\item{...}{arguments to forward to \code{NextMethod()}} +} +\description{ +Converts an \code{epi_df} object into a tibble, dropping metadata and any +grouping. +} From 8985db2b9066d19131148be43ce2f25cec6be795 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 24 May 2023 05:52:07 -0700 Subject: [PATCH 29/32] Make `as_list_col=TRUE` consistent for vecs and dfs from slide comps --- NEWS.md | 6 +++ R/grouped_epi_archive.R | 15 +++--- R/methods-epi_archive.R | 11 ++-- R/slide.R | 81 ++++++++++++++--------------- man/epi_slide.Rd | 11 ++-- man/epix_slide.Rd | 11 ++-- tests/testthat/test-epi_slide.R | 73 ++++++++++++++++++++++++++ tests/testthat/test-epix_slide.R | 89 ++++++++++++++++++++++++++------ vignettes/advanced.Rmd | 13 +++-- 9 files changed, 226 insertions(+), 84 deletions(-) diff --git a/NEWS.md b/NEWS.md index 7d339fc6..8e13a665 100644 --- a/NEWS.md +++ b/NEWS.md @@ -20,6 +20,12 @@ inter-release development versions will include an additional ".9999" suffix. * To keep the old behavior, convert the output of `epix_slide()` to `epi_df` when desired and set the metadata appropriately. +## Improvements: + +* `epi_slide` and `epix_slide` now support `as_list_col = TRUE` when the slide + computations output atomic vectors, and output a list column in "chopped" + format (see `tidyr::chop`). + # epiprocess 0.6.0 ## Breaking changes: diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 51bb14b1..59dc2b44 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -279,11 +279,12 @@ grouped_epi_archive = if (! (is.atomic(comp_value) || is.data.frame(comp_value))) { Abort("The slide computation must return an atomic vector or a data frame.") } - if (is.data.frame(comp_value)) { - # Wrap in a list so that we get a list-type col rather than a - # data.frame-type col when `as_list_col = TRUE`: - comp_value <- list(comp_value) - } + # Wrap the computation output in a list and unchop/unnest later if + # `as_list_col = FALSE`. This approach means that we will get a + # list-class col rather than a data.frame-class col when + # `as_list_col = TRUE` and the computations outputs are data + # frames. + comp_value <- list(comp_value) # Label every result row with the `ref_time_value`: return(tibble::tibble(time_value = .env$ref_time_value, @@ -426,8 +427,8 @@ grouped_epi_archive = ) }) } - - # Unnest if we need to + + # Unchop/unnest if we need to if (!as_list_col) { x = tidyr::unnest(x, !!new_col, names_sep = names_sep) } diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 53c572bd..9c6640a8 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -707,11 +707,12 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' @param new_col_name String indicating the name of the new column that will #' contain the derivative values. Default is "slide_value"; note that setting #' `new_col_name` equal to an existing column name will overwrite this column. -#' @param as_list_col If the computations return data frames, should the slide -#' result hold these in a single list column or try to unnest them? Default is -#' `FALSE`, in which case a list object returned by `f` would be unnested -#' (using [`tidyr::unnest()`]), and the names of the resulting columns are given -#' by prepending `new_col_name` to the names of the list elements. +#' @param as_list_col Should the slide results be held in a list column, or be +#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`, +#' in which case a list object returned by `f` would be unnested (using +#' [`tidyr::unnest()`]), and, if the slide computations output data frames, +#' the names of the resulting columns are given by prepending `new_col_name` +#' to the names of the list elements. #' @param names_sep String specifying the separator to use in `tidyr::unnest()` #' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix #' from `new_col_name` entirely. diff --git a/R/slide.R b/R/slide.R index ea2b93cb..4650d604 100644 --- a/R/slide.R +++ b/R/slide.R @@ -52,11 +52,12 @@ #' @param new_col_name String indicating the name of the new column that will #' contain the derivative values. Default is "slide_value"; note that setting #' `new_col_name` equal to an existing column name will overwrite this column. -#' @param as_list_col If the computations return data frames, should the slide -#' result hold these in a single list column or try to unnest them? Default is -#' `FALSE`, in which case a list object returned by `f` would be unnested -#' (using [`tidyr::unnest()`]), and the names of the resulting columns are given -#' by prepending `new_col_name` to the names of the list elements. +#' @param as_list_col Should the slide results be held in a list column, or be +#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`, +#' in which case a list object returned by `f` would be unnested (using +#' [`tidyr::unnest()`]), and, if the slide computations output data frames, +#' the names of the resulting columns are given by prepending `new_col_name` +#' to the names of the list elements. #' @param names_sep String specifying the separator to use in `tidyr::unnest()` #' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix #' from `new_col_name` entirely. @@ -248,11 +249,11 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, time_values = time_values[o] # Compute the slide values - slide_values = slider::hop_index(.x = .data_group, - .i = .data_group$time_value, - .f = f, ..., - .starts = starts, - .stops = stops) + slide_values_list = slider::hop_index(.x = .data_group, + .i = .data_group$time_value, + .f = f, ..., + .starts = starts, + .stops = stops) # Now figure out which rows in the data group are in the reference time # values; this will be useful for all sorts of checks that follow @@ -265,42 +266,38 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, dplyr::count(.data$time_value) %>% dplyr::pull(n) - # If they're all atomic vectors - if (all(sapply(slide_values, is.atomic))) { - if (all(sapply(slide_values, length) == 1)) { - # Recycle to make size stable (one slide value per ref time value) - slide_values = rep(unlist(slide_values), times = counts) - } - else { - # Unlist, then check its length, and abort if not right - slide_values = unlist(slide_values) - if (length(slide_values) != num_ref_rows) { - Abort("If the slide computations return atomic vectors, then they must each have a single element, or else one element per appearance of the reference time value in the local window.") - } - } + if (!all(purrr::map_lgl(slide_values_list, is.atomic)) && + !all(purrr::map_lgl(slide_values_list, is.data.frame))) { + Abort("The slide computations must return always atomic vectors or data frames (and not a mix of these two structures).") } - - # If they're all data frames - else if (all(sapply(slide_values, is.data.frame))) { - if (all(sapply(slide_values, nrow) == 1)) { - # Recycle to make size stable (one slide value per ref time value) - slide_values = rep(slide_values, times = counts) + + # Unlist if appropriate: + slide_values = + if (as_list_col) { + slide_values_list + } else { + vctrs::list_unchop(slide_values_list) } - else { - # Split (each row on its own), check length, abort if not right - slide_df = dplyr::bind_rows(slide_values) - slide_values = split(slide_df, 1:nrow(slide_df)) - if (length(slide_values) != num_ref_rows) { - Abort("If the slide computations return data frames, then they must each have a single row, or else one row per appearance of the reference time value in the local window.") - } + + if (all(purrr::map_int(slide_values_list, vctrs::vec_size) == 1L) && + length(slide_values_list) != 0L) { + # Recycle to make size stable (one slide value per ref time value). + # (Length-0 case also could be handled here, but causes difficulties; + # leave it to the next branch, where it also belongs.) + slide_values = vctrs::vec_rep_each(slide_values, times = counts) + } else { + # Split and flatten if appropriate, perform a (loose) check on number of + # rows. + if (as_list_col) { + slide_values = purrr::list_flatten(purrr::map( + slide_values, ~ vctrs::vec_split(.x, seq_len(vctrs::vec_size(.x)))[["val"]] + )) + } + if (vctrs::vec_size(slide_values) != num_ref_rows) { + Abort("The slide computations must either (a) output a single element/row each, or (b) one element/row per appearance of the reference time value in the local window.") } } - - # If neither all atomic vectors or all data frames, then abort - else { - Abort("The slide computations must return always atomic vectors or data frames (and not a mix of these two structures).") - } - + # If all rows, then pad slide values with NAs, else filter down data group if (all_rows) { orig_values = slide_values diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 1809e113..0bfff507 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -71,11 +71,12 @@ return an object of class \code{lubridate::period}. For example, we can use contain the derivative values. Default is "slide_value"; note that setting \code{new_col_name} equal to an existing column name will overwrite this column.} -\item{as_list_col}{If the computations return data frames, should the slide -result hold these in a single list column or try to unnest them? Default is -\code{FALSE}, in which case a list object returned by \code{f} would be unnested -(using \code{\link[tidyr:nest]{tidyr::unnest()}}), and the names of the resulting columns are given -by prepending \code{new_col_name} to the names of the list elements.} +\item{as_list_col}{Should the slide results be held in a list column, or be +\link[tidyr:chop]{unchopped}/\link[tidyr:nest]{unnested}? Default is \code{FALSE}, +in which case a list object returned by \code{f} would be unnested (using +\code{\link[tidyr:nest]{tidyr::unnest()}}), and, if the slide computations output data frames, +the names of the resulting columns are given by prepending \code{new_col_name} +to the names of the list elements.} \item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index b6d7b75d..0098727f 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -73,11 +73,12 @@ would only be meaningful if \code{time_value} is of class \code{POSIXct}).} contain the derivative values. Default is "slide_value"; note that setting \code{new_col_name} equal to an existing column name will overwrite this column.} -\item{as_list_col}{If the computations return data frames, should the slide -result hold these in a single list column or try to unnest them? Default is -\code{FALSE}, in which case a list object returned by \code{f} would be unnested -(using \code{\link[tidyr:nest]{tidyr::unnest()}}), and the names of the resulting columns are given -by prepending \code{new_col_name} to the names of the list elements.} +\item{as_list_col}{Should the slide results be held in a list column, or be +\link[tidyr:chop]{unchopped}/\link[tidyr:nest]{unnested}? Default is \code{FALSE}, +in which case a list object returned by \code{f} would be unnested (using +\code{\link[tidyr:nest]{tidyr::unnest()}}), and, if the slide computations output data frames, +the names of the resulting columns are given by prepending \code{new_col_name} +to the names of the list elements.} \item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index eebcc55b..a35b8f9c 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -86,3 +86,76 @@ test_that("these doesn't produce an error; the error appears only if the ref tim dplyr::select("geo_value","slide_value_value"), dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2))) # not out of range for either group }) + +test_that("computation output formats x as_list_col", { + toy_edf = tibble::tribble( + ~geo_value, ~time_value, ~value , + "a" , 1:10 , 2L^( 1:10), + "b" , 1:10 , 2L^(11:20), + ) %>% + tidyr::unchop(c(time_value, value)) %>% + as_epi_df(as_of = 100) + # We'll try 7d sum with a few formats. + basic_result_from_size1 = tibble::tribble( + ~geo_value, ~time_value, ~value , ~slide_value , + "a" , 1:10 , 2L^( 1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), + "b" , 1:10 , 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), + ) %>% + tidyr::unchop(c(time_value, value, slide_value)) %>% + dplyr::arrange(time_value) %>% + as_epi_df(as_of = 100) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)), + basic_result_from_size1 + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), as_list_col = TRUE), + basic_result_from_size1 %>% dplyr::mutate(slide_value = as.list(slide_value)) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))), + basic_result_from_size1 %>% rename(slide_value_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE), + basic_result_from_size1 %>% + mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) + ) + # output naming functionality: + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + new_col_name = "result"), + basic_result_from_size1 %>% rename(result_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value_sum = sum(.x$value)), + names_sep = NULL), + basic_result_from_size1 %>% rename(value_sum = slide_value) + ) + # trying with non-size-1 computation outputs: + basic_result_from_size2 = tibble::tribble( + ~geo_value, ~time_value, ~value , ~slide_value , + "a" , 1:10 , 2L^( 1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), + "b" , 1:10 , 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE) + 1L, + ) %>% + tidyr::unchop(c(time_value, value, slide_value)) %>% + dplyr::arrange(time_value) %>% + as_epi_df(as_of = 100) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value) + 0:1), + basic_result_from_size2 + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value) + 0:1, as_list_col = TRUE), + basic_result_from_size2 %>% dplyr::mutate(slide_value = as.list(slide_value)) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value) + 0:1)), + basic_result_from_size2 %>% rename(slide_value_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value) + 0:1), as_list_col = TRUE), + basic_result_from_size2 %>% + mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) + ) +}) diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index 21962873..7ec8194f 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -60,27 +60,27 @@ test_that("epix_slide works as intended",{ }) test_that("epix_slide works as intended with `as_list_col=TRUE`",{ - # Note Issue #261. - xx1 <- xx %>% + xx_dfrow1 <- xx %>% group_by(.data$geo_value) %>% epix_slide(f = ~ data.frame(bin_sum = sum(.x$binary)), before = 2, - as_list_col=TRUE) + as_list_col = TRUE) - xx2 <- tibble(geo_value = rep("x",4), - time_value = c(4,5,6,7), - slide_value = - c(2^3+2^2, - 2^6+2^3, - 2^10+2^9, - 2^15+2^14) %>% - purrr::map(~ data.frame(bin_sum = .x)) - ) %>% + xx_dfrow2 <- tibble( + geo_value = rep("x",4), + time_value = c(4,5,6,7), + slide_value = + c(2^3+2^2, + 2^6+2^3, + 2^10+2^9, + 2^15+2^14) %>% + purrr::map(~ data.frame(bin_sum = .x)) + ) %>% group_by(geo_value) - expect_identical(xx1,xx2) # * + expect_identical(xx_dfrow1,xx_dfrow2) # * - xx3 <- ( + xx_dfrow3 <- ( xx $group_by(dplyr::across(dplyr::all_of("geo_value"))) $slide(f = ~ data.frame(bin_sum = sum(.x$binary)), @@ -88,7 +88,66 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`",{ as_list_col = TRUE) ) - expect_identical(xx1,xx3) # This and * Imply xx2 and xx3 are identical + expect_identical(xx_dfrow1,xx_dfrow3) # This and * Imply xx_dfrow2 and xx_dfrow3 are identical + + xx_df1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = ~ data.frame(bin = .x$binary), + before = 2, + as_list_col = TRUE) + + xx_df2 <- tibble( + geo_value = rep("x",4), + time_value = c(4,5,6,7), + slide_value = + list(c(2^3,2^2), + c(2^6,2^3), + c(2^10,2^9), + c(2^15,2^14)) %>% + purrr::map(~ data.frame(bin = rev(.x))) + ) %>% + group_by(geo_value) + + expect_identical(xx_df1,xx_df2) + + xx_scalar1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = ~ sum(.x$binary), + before = 2, + as_list_col = TRUE) + + xx_scalar2 <- tibble( + geo_value = rep("x",4), + time_value = c(4,5,6,7), + slide_value = + list(2^3+2^2, + 2^6+2^3, + 2^10+2^9, + 2^15+2^14) + ) %>% + group_by(geo_value) + + expect_identical(xx_scalar1,xx_scalar2) + + xx_vec1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = ~ .x$binary, + before = 2, + as_list_col = TRUE) + + xx_vec2 <- tibble( + geo_value = rep("x",4), + time_value = c(4,5,6,7), + slide_value = + list(c(2^3,2^2), + c(2^6,2^3), + c(2^10,2^9), + c(2^15,2^14)) %>% + purrr::map(rev) + ) %>% + group_by(geo_value) + + expect_identical(xx_vec1,xx_vec2) }) test_that("epix_slide `before` validation works", { diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index ce4f8e82..afa5c653 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -50,17 +50,20 @@ When using more advanced features, more complex rules apply: * Generalization: `epi_slide(edf, ....., ref_time_values=my_ref_time_values)` will output one row for every row in `edf` with `time_value` appearing inside - `ref_time_values`, and is analogous to a `dplyr::mutate`&`dplyr::arrange` + `my_ref_time_values`, and is analogous to a `dplyr::mutate`&`dplyr::arrange` followed by `dplyr::filter` to those `ref_time_values`. We call this property **size stability**, and describe how it is achieved in the following sections. The default behavior described above is a special case of this general rule based on a default value of `ref_time_values`. * Exception/feature: `epi_slide(edf, ....., ref_time_values=my_ref_time_values, - all_rows=TRUE)` will not just output rows for the given `ref_time_values`, but + all_rows=TRUE)` will not just output rows for `my_ref_time_values`, but instead will output one row per row in `edf`. -* Exception/feature: `epix_slide(ea, ....., as_list_col=TRUE)` will output one - row per computation that outputs a data frame, even when these data frames - have arbitrary numbers of rows. +* Exception/feature: `epi_slide(edf, ....., as_list_col=TRUE)` will format the + output to add a single list-class computed column. +* Exception/feature: `epix_slide(ea, ....., as_list_col=TRUE)` will format the + output to have one row per computation and a single list-class computed column + (in addition to the grouping variables and `time_value`), as if we had used + `tidyr::chop()` or `tidyr::nest()`. * Clarification: `ea %>% group_by(....., .drop=FALSE) %>% epix_slide(, .....)` will call the computation on any missing groups according to `dplyr`'s `.drop=FALSE` rules, resulting in additional From 9ba843c54339c3c5e5cc2f8b8c519ebb339873d7 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 26 May 2023 18:20:31 -0700 Subject: [PATCH 30/32] Fix `all_rows = TRUE` to work with move to `vctrs` Document behavior when `all_rows = TRUE` and `as_list_col = TRUE`; this behavior might be different from before move to use `vctrs` though. Add tests to cover this case + others, which was caught only via a vignette build failing. --- R/slide.R | 20 +++-- man/epi_slide.Rd | 11 ++- tests/testthat/test-epi_slide.R | 142 +++++++++++++++++++++++++++++--- 3 files changed, 156 insertions(+), 17 deletions(-) diff --git a/R/slide.R b/R/slide.R index 38676012..ab591f99 100644 --- a/R/slide.R +++ b/R/slide.R @@ -62,8 +62,15 @@ #' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix #' from `new_col_name` entirely. #' @param all_rows If `all_rows = TRUE`, then all rows of `x` will be kept in -#' the output; otherwise, there will be one row for each time value in `x` -#' that acts as a reference time value. Default is `FALSE`. +#' the output even with `ref_time_values` provided, with some type of missing +#' value marker for the slide computation output column(s) for `time_value`s +#' outside `ref_time_values`; otherwise, there will be one row for each row in +#' `x` that had a `time_value` in `ref_time_values`. Default is `FALSE`. The +#' missing value marker is the result of `vctrs::vec_cast`ing `NA` to the type +#' of the slide computation output. If using `as_list_col = TRUE`, note that +#' the missing marker is a `NULL` entry in the list column; for certain +#' operations, you might want to replace these `NULL` entries with a different +#' `NA` marker. #' @return An `epi_df` object given by appending a new column to `x`, named #' according to the `new_col_name` argument. #' @@ -306,10 +313,13 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, # If all rows, then pad slide values with NAs, else filter down data group if (all_rows) { orig_values = slide_values - slide_values = rep(NA, nrow(.data_group)) - slide_values[o] = orig_values + slide_values = vctrs::vec_rep(vctrs::vec_cast(NA, orig_values), nrow(.data_group)) + # ^ using vctrs::vec_init would be shorter but docs don't guarantee it + # fills with NA equivalent. + vctrs::vec_slice(slide_values, o) = orig_values + } else { + .data_group = filter(.data_group, o) } - else .data_group = filter(.data_group, o) return(mutate(.data_group, !!new_col := slide_values)) } diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 0bfff507..2a646670 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -83,8 +83,15 @@ when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the pre from \code{new_col_name} entirely.} \item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in -the output; otherwise, there will be one row for each time value in \code{x} -that acts as a reference time value. Default is \code{FALSE}.} +the output even with \code{ref_time_values} provided, with some type of missing +value marker for the slide computation output column(s) for \code{time_value}s +outside \code{ref_time_values}; otherwise, there will be one row for each row in +\code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The +missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type +of the slide computation output. If using \code{as_list_col = TRUE}, note that +the missing marker is a \code{NULL} entry in the list column; for certain +operations, you might want to replace these \code{NULL} entries with a different +\code{NA} marker.} } \value{ An \code{epi_df} object given by appending a new column to \code{x}, named diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index ad44fe4a..be08eb6a 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -2,15 +2,23 @@ d <- as.Date("2020-01-01") -grouped = dplyr::bind_rows( +ungrouped = dplyr::bind_rows( dplyr::tibble(geo_value = "ak", time_value = d + 1:200, value=1:200), dplyr::tibble(geo_value = "al", time_value = d + 1:5, value=-(1:5)) ) %>% - as_epi_df() %>% + as_epi_df() +grouped = ungrouped %>% group_by(geo_value) - f = function(x, g) dplyr::tibble(value=mean(x$value), count=length(x$value)) +toy_edf = tibble::tribble( + ~geo_value, ~time_value, ~value , + "a" , 1:10 , 2L^( 1:10), + "b" , 1:10 , 2L^(11:20), + ) %>% + tidyr::unchop(c(time_value, value)) %>% + as_epi_df(as_of = 100) + ## --- These cases generate errors (or not): --- test_that("`before` and `after` are both vectors of length 1", { expect_error(epi_slide(grouped, f, before = c(0,1), after = 0, ref_time_values = d+3), @@ -88,13 +96,7 @@ test_that("these doesn't produce an error; the error appears only if the ref tim }) test_that("computation output formats x as_list_col", { - toy_edf = tibble::tribble( - ~geo_value, ~time_value, ~value , - "a" , 1:10 , 2L^( 1:10), - "b" , 1:10 , 2L^(11:20), - ) %>% - tidyr::unchop(c(time_value, value)) %>% - as_epi_df(as_of = 100) + # See `toy_edf` definition at top of file. # We'll try 7d sum with a few formats. basic_result_from_size1 = tibble::tribble( ~geo_value, ~time_value, ~value , ~slide_value , @@ -170,3 +172,123 @@ test_that("epi_slide alerts if the provided f doesn't take enough args", { expect_warning(epi_slide(grouped, f_x_dots, before = 1L, ref_time_values = d+1), class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") }) + +test_that("`ref_time_values` + `all_rows = TRUE` works", { + # See `toy_edf` definition at top of file. We'll do variants of a slide + # returning the following: + basic_full_result = tibble::tribble( + ~geo_value, ~time_value, ~value , ~slide_value , + "a" , 1:10 , 2L^( 1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), + "b" , 1:10 , 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), + ) %>% + tidyr::unchop(c(time_value, value, slide_value)) %>% + dplyr::arrange(time_value) %>% + as_epi_df(as_of = 100) + # slide computations returning atomic vecs: + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)), + basic_full_result + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), + ref_time_values = c(2L, 8L)), + basic_full_result %>% dplyr::filter(time_value %in% c(2L, 8L)) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), + ref_time_values = c(2L, 8L), all_rows = TRUE), + basic_full_result %>% + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + slide_value, NA_integer_)) + ) + # slide computations returning data frames: + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))), + basic_full_result %>% dplyr::rename(slide_value_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L)), + basic_full_result %>% + dplyr::filter(time_value %in% c(2L, 8L)) %>% + dplyr::rename(slide_value_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE), + basic_full_result %>% + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + slide_value, NA_integer_)) %>% + dplyr::rename(slide_value_value = slide_value) + ) + # slide computations returning data frames with `as_list_col=TRUE`: + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + as_list_col = TRUE), + basic_full_result %>% + dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), + as_list_col = TRUE), + basic_full_result %>% + dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) %>% + dplyr::filter(time_value %in% c(2L, 8L)) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE, + as_list_col = TRUE), + basic_full_result %>% + dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) %>% + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + slide_value, list(NULL))) + ) + # slide computations returning data frames, `as_list_col = TRUE`, `unnest`: + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + as_list_col = TRUE) %>% + unnest(slide_value, names_sep = "_"), + basic_full_result %>% dplyr::rename(slide_value_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), + as_list_col = TRUE) %>% + unnest(slide_value, names_sep = "_"), + basic_full_result %>% + dplyr::filter(time_value %in% c(2L, 8L)) %>% + dplyr::rename(slide_value_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE, + as_list_col = TRUE) %>% + unnest(slide_value, names_sep = "_"), + basic_full_result %>% + # XXX unclear exactly what we want in this case. Current approach is + # compatible with `vctrs::vec_detect_missing` but breaks `tidyr::unnest` + # compatibility + dplyr::filter(time_value %in% c(2L, 8L)) %>% + dplyr::rename(slide_value_value = slide_value) + ) + rework_nulls = function(slide_values_list) { + vctrs::vec_assign( + slide_values_list, + vctrs::vec_detect_missing(slide_values_list), + list(vctrs::vec_cast(NA, vctrs::vec_ptype_common(!!!slide_values_list))) + ) + } + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE, + as_list_col = TRUE) %>% + mutate(slide_value = rework_nulls(slide_value)) %>% + unnest(slide_value, names_sep = "_"), + basic_full_result %>% + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + slide_value, NA_integer_)) %>% + dplyr::rename(slide_value_value = slide_value) + ) +}) From cf2ad2fa45ef66d6e6a4fac638060646a9fc3676 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 30 May 2023 10:41:47 -0700 Subject: [PATCH 31/32] Update tests&NEWS.md about `epi_slide` `f` Date vec output --- NEWS.md | 2 ++ tests/testthat/test-epi_slide.R | 9 +++++++++ tests/testthat/test-epix_slide.R | 12 ++++++++++++ 3 files changed, 23 insertions(+) diff --git a/NEWS.md b/NEWS.md index 8e13a665..60e99885 100644 --- a/NEWS.md +++ b/NEWS.md @@ -25,6 +25,8 @@ inter-release development versions will include an additional ".9999" suffix. * `epi_slide` and `epix_slide` now support `as_list_col = TRUE` when the slide computations output atomic vectors, and output a list column in "chopped" format (see `tidyr::chop`). +* `epi_slide` now works properly with slide computations that output just a + `Date` vector, rather than converting `slide_value` to a numeric column. # epiprocess 0.6.0 diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index be08eb6a..21191a0b 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -292,3 +292,12 @@ test_that("`ref_time_values` + `all_rows = TRUE` works", { dplyr::rename(slide_value_value = slide_value) ) }) + +test_that("`epi_slide` doesn't decay date output", { + expect_true( + ungrouped %>% + epi_slide(before = 5L, ~ as.Date("2020-01-01")) %>% + `[[`("slide_value") %>% + inherits("Date") + ) +}) diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index aeb67053..34fef705 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -572,3 +572,15 @@ test_that("epix_slide alerts if the provided f doesn't take enough args", { expect_warning(epix_slide(xx, f_x_dots, before = 2L), class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") }) + +test_that("`epix_slide` doesn't decay date output", { + expect_true( + xx$DT %>% + as_tibble() %>% + mutate(across(c(time_value, version), ~ as.Date("2000-01-01") + .x - 1L)) %>% + as_epi_archive() %>% + epix_slide(before = 5L, ~ attr(.x, "metadata")$as_of) %>% + `[[`("slide_value") %>% + inherits("Date") + ) +}) From 116bae08e454306a890f6fae4578168d5f6f0332 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 30 May 2023 10:58:10 -0700 Subject: [PATCH 32/32] Add NEWS.md entry for `as_list_col` `all_rows` `NA` -> `NULL` --- NEWS.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/NEWS.md b/NEWS.md index 60e99885..aea09f44 100644 --- a/NEWS.md +++ b/NEWS.md @@ -47,6 +47,13 @@ inter-release development versions will include an additional ".9999" suffix. * Slide functions now keep any grouping of `x` in their results, like `mutate` and `group_modify`. * To obtain the old behavior, `dplyr::ungroup` the slide results immediately. +* Additional `epi_slide` changes: + * When using `as_list_col = TRUE` together with `ref_time_values` and + `all_rows=TRUE`, the marker for excluded computations is now a `NULL` entry + in the list column, rather than a `NA`; if you are using `tidyr::unnest()` + afterward and want to keep these missing data markers, you will need to + replace the `NULL` entries with `NA`s. Skipped computations are now more + uniformly detectable using `vctrs` methods. * Additional`epix_slide` changes: * `epix_slide`'s `group_by` argument has been replaced by `dplyr::group_by` and `dplyr::ungroup` S3 methods. The `group_by` method uses "data masking" (also