diff --git a/NAMESPACE b/NAMESPACE index 2f90eae6..03efecd2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,26 +1,22 @@ # Generated by roxygen2: do not edit by hand S3method("[",epi_df) -S3method(arrange,epi_df) +S3method("names<-",epi_df) S3method(as_epi_df,data.frame) S3method(as_epi_df,epi_df) S3method(as_epi_df,tbl_df) S3method(as_epi_df,tbl_ts) S3method(as_tsibble,epi_df) S3method(dplyr_col_modify,col_modify_recorder_df) -S3method(filter,epi_df) +S3method(dplyr_col_modify,epi_df) +S3method(dplyr_reconstruct,epi_df) S3method(group_by,epi_archive) S3method(group_by,epi_df) S3method(group_by,grouped_epi_archive) S3method(group_by_drop_default,grouped_epi_archive) -S3method(group_modify,epi_df) -S3method(mutate,epi_df) S3method(next_after,Date) S3method(next_after,integer) S3method(print,epi_df) -S3method(relocate,epi_df) -S3method(rename,epi_df) -S3method(slice,epi_df) S3method(summary,epi_df) S3method(ungroup,epi_df) S3method(unnest,epi_df) @@ -66,6 +62,7 @@ importFrom(data.table,set) importFrom(data.table,setkeyv) importFrom(dplyr,arrange) importFrom(dplyr,dplyr_col_modify) +importFrom(dplyr,dplyr_reconstruct) importFrom(dplyr,filter) importFrom(dplyr,group_by) importFrom(dplyr,group_by_drop_default) diff --git a/NEWS.md b/NEWS.md index e61d7cb2..c509b08e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,9 +4,28 @@ Note that `epiprocess` uses the [Semantic Versioning ("semver")](https://semver.org/) scheme for all release versions, but not for development versions. A ".9999" suffix indicates a development version. +## Breaking changes: + +* `epi_slide` and `epix_slide` now keep the grouping of `x` in their results, + like `dplyr::mutate` and `dplyr::group_modify`. To obtain the old behavior, + `dplyr::ungroup` immediately after slides. + +## Potentially-breaking changes: + +* Fixed `[` on grouped `epi_df`s to maintain the grouping if possible when + dropping the `epi_df` class (e.g., when removing the `time_value` column). +* Fixed `group_modify` on `epi_df`s to decay to non-`epi_df`s when an `epi_df` + result doesn't make sense (e.g., when removing the `time_value` column). + +## Improvements: + +* Fixed `epi_slide` and `group_modify` on grouped `epi_df`s to not drop `epi_df` + class. + ## Cleanup: * Added a `NEWS.md` file to track changes to the package. +* Implemented `?dplyr::dplyr_extending` for `epi_df`s (#223). # epiprocess 0.5.0: diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 9f6d64b0..c3a3e3e4 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -215,24 +215,55 @@ grouped_epi_archive = # Symbolize column name new_col = sym(new_col_name) - # Key variable names, apart from time value and version - key_vars = setdiff(key(private$ungrouped$DT), c("time_value", "version")) + # Each computation is expected to output a data frame with either + # one element/row total or one element/row per encountered + # nongrouping, nontime, nonversion key value. These nongrouping, + # nontime, nonversion key columns can be seen as the "effective" key + # of the computation; the computation might return an object that + # reports a different key or no key, but the "effective" key should + # still be a valid unique key for the data, and is something that we + # could use even with `.keep = FALSE`. + comp_effective_key_vars = + setdiff(key(private$ungrouped$DT), + c(private$vars, "time_value", "version")) # Computation for one group, one time value - comp_one_grp = function(.data_group, + comp_one_grp = function(.data_group, .group_key, f, ..., - time_value, - key_vars, + ref_time_value, + comp_effective_key_vars, new_col) { # Carry out the specified computation - comp_value = f(.data_group, ...) + comp_value = f(.data_group, .group_key, ...) - # Count the number of appearances of the reference time value. - # Note: ideally, we want to directly count occurrences of the ref - # time value but due to latency, this will often not appear in the - # data group. So we count the number of unique key values, outside - # of the time value column - count = sum(!duplicated(.data_group[, key_vars])) + # Calculate the number of output elements/rows we expect the + # computation to output: one per distinct "effective computation + # key variable" value encountered in the input. Note: this mirrors + # how `epi_slide` does things if we're using unique keys, but can + # diverge if using nonunique keys. The `epi_slide` approach of + # counting occurrences of the `ref_time_value` in the `time_value` + # column, which helps lines up the computation results with + # corresponding rows of the input data, wouldn't quite apply here: + # we'd want to line up with rows (from the same group) with + # `version` matching the `ref_time_value`, but would still need to + # summarize these rows somehow and drop the `time_value` input + # column, but this summarization requires something like a + # to-be-unique output key to determine a sensible number of rows + # to output (and the contents of those rows). + count = + if (length(comp_effective_key_vars) != 0L) { + sum(!duplicated(.data_group[, comp_effective_key_vars])) + } else { + # Same idea as above, but accounting for `duplicated` not + # working as we want on 0 columns. (Should be the same as if + # we were counting distinct values of a column defined as + # `rep(val, target_n_rows)`.) + if (nrow(.data_group) == 0L) { + 0L + } else { + 1L + } + } # If we get back an atomic vector if (is.atomic(comp_value)) { @@ -241,7 +272,7 @@ grouped_epi_archive = } # If not a singleton, should be the right length, else abort else if (length(comp_value) != count) { - Abort("If the slide computation returns an atomic vector, then it must have a single element, or else one element per appearance of the reference time value in the local window.") + Abort('If the slide computation returns an atomic vector, then it must have either (a) a single element, or (b) one element per distinct combination of key variables, excluding the `time_value`, `version`, and grouping variables, that is present in the first argument to the computation.') } } @@ -256,7 +287,7 @@ grouped_epi_archive = } # Make into a list else { - comp_value = split(comp_value, 1:nrow(comp_value)) + comp_value = split(comp_value, seq_len(nrow(comp_value))) } } @@ -265,10 +296,9 @@ grouped_epi_archive = Abort("The slide computation must return an atomic vector or a data frame.") } - # Note that we've already recycled comp value to make size stable, - # so tibble() will just recycle time value appropriately - return(tibble::tibble(time_value = time_value, - !!new_col := comp_value)) + # Label every result row with the `ref_time_value`: + return(tibble::tibble(time_value = rep(.env$ref_time_value, count), + !!new_col := .env$comp_value)) } # If f is not missing, then just go ahead, slide by group @@ -278,12 +308,12 @@ grouped_epi_archive = private$ungrouped$as_of(ref_time_value, min_time_value = ref_time_value - before) %>% dplyr::group_by(dplyr::across(tidyselect::all_of(private$vars)), .drop=private$drop) %>% - dplyr::summarize(comp_one_grp(dplyr::cur_data_all(), - f = f, ..., - time_value = ref_time_value, - key_vars = key_vars, - new_col = new_col), - .groups = groups) + dplyr::group_modify(comp_one_grp, + f = f, ..., + ref_time_value = ref_time_value, + comp_effective_key_vars = comp_effective_key_vars, + new_col = new_col, + .keep = TRUE) }) } @@ -306,11 +336,10 @@ grouped_epi_archive = dplyr::group_by(dplyr::across(tidyselect::all_of(private$vars))) %>% dplyr::group_modify(comp_one_grp, f = f, quo = quo, - time_value = ref_time_value, - key_vars = key_vars, + ref_time_value = ref_time_value, + comp_effective_key_vars = comp_effective_key_vars, new_col = new_col, - .keep = TRUE) %>% - dplyr::ungroup() + .keep = TRUE) }) } diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index a44dde6a..072fddd5 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -19,8 +19,7 @@ as_tsibble.epi_df = function(x, key, ...) { #' Base S3 methods for an `epi_df` object #' -#' Print, summary, and `dplyr` verbs (that preserve class and attributes) for an -#' `epi_df` object. +#' Print and summary functions for an `epi_df` object. #' #' @param x The `epi_df` object. #' @param ... Additional arguments passed to methods. @@ -65,119 +64,104 @@ summary.epi_df = function(object, ...) { dplyr::summarize(mean(.data$num))))) } +#' Drop any `epi_df` metadata and class on a data frame +#' +#' Useful in implementing `?dplyr_extending` when manipulations cause invariants +#' of `epi_df`s to be violated and we need to return some other class. Note that +#' this will maintain any grouping (keeping the `grouped_df` class and +#' associated attributes, if present). +#' +#' @param x an `epi_df` or other data frame +#' @return `x` with any metadata dropped and the `"epi_df"` class, if previously +#' present, dropped +#' +#' @noRd +decay_epi_df = function(x) { + attributes(x)$metadata <- NULL + class(x) <- class(x)[class(x) != "epi_df"] + x +} + +# Implementing `dplyr_extending`: `geo_type` and `time_type` are scalar +# attributes dependent on columns, and `other_keys` acts like an attribute +# vectorized over columns; `dplyr_extending` advice says to implement +# `dplyr_reconstruct`, 1d `[`, `dplyr_col_modify`, and `names<-`, but not +# `dplyr_row_slice`. We'll implement `[` to allow either 1d or 2d. + +#' @param data tibble or `epi_df` (`dplyr` feeds in former, but we may +#' directly feed in latter from our other methods) +#' @param template `epi_df` template to use to restore +#' @return `epi_df` or degrade into `tbl_df` +#' @importFrom dplyr dplyr_reconstruct #' @export -`[.epi_df` <- function(x, i, j, drop = FALSE) { - res <- NextMethod() - - if (!is.data.frame(res)) return(res) - - if (missing(i)) { - i <- NULL - } - - if (missing(j)) { - j <- NULL - } +#' @noRd +dplyr_reconstruct.epi_df = function(data, template) { + # Start from a reconstruction for the backing S3 classes; this ensures that we + # keep any grouping that has been applied: + res <- NextMethod() cn <- names(res) - + # Duplicate columns, Abort dup_col_names = cn[duplicated(cn)] if (length(dup_col_names) != 0) { Abort(paste0("Column name(s) ", paste(unique(dup_col_names), collapse = ", "), " must not be duplicated.")) - } + } not_epi_df <- !("time_value" %in% cn) || !("geo_value" %in% cn) if (not_epi_df) { - attributes(res)$metadata <- NULL - return(tibble::as_tibble(res)) + # If we're calling on an `epi_df` from one of our own functions, we need to + # decay to a non-`epi_df` result. If `dplyr` is calling, `x` is a tibble, + # `res` is not an `epi_df` yet (but might, e.g., be a `grouped_df`), and we + # simply need to skip adding the metadata & class. Current `decay_epi_df` + # should work in both cases. + return(decay_epi_df(res)) } - # Use reclass as safeguard (in case class &/or metadata are dropped) - res <- reclass(res, attr(x, "metadata")) + res <- reclass(res, attr(template, "metadata")) + + # XXX we may want verify the `geo_type` and `time_type` here. If it's + # significant overhead, we may also want to keep this less strict version + # around and implement some extra S3 methods that use it, when appropriate. # Amend additional metadata if some other_keys cols are dropped in the subset - old_other_keys = attr(x, "metadata")$other_keys + old_other_keys = attr(template, "metadata")$other_keys attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% cn] res } -#' `dplyr` verbs -#' -#' `dplyr` verbs for `epi_df` objects, preserving class and attributes. -#' -#' @method arrange epi_df -#' @param .data The `epi_df` object. -#' @rdname print.epi_df -#' @export -arrange.epi_df = function(.data, ...) { - metadata = attributes(.data)$metadata - .data = NextMethod() - reclass(.data, metadata) -} - -#' @method filter epi_df -#' @rdname print.epi_df -#' @export -filter.epi_df = function(.data, ...) { - metadata = attributes(.data)$metadata - .data = NextMethod() - reclass(.data, metadata) -} - -#' @method group_by epi_df -#' @rdname print.epi_df #' @export -group_by.epi_df = function(.data, ...) { - metadata = attributes(.data)$metadata - .data = NextMethod() - reclass(.data, metadata) -} - -#' @method group_modify epi_df -#' @rdname print.epi_df -#' @export -group_modify.epi_df = function(.data, ...) { - metadata = attributes(.data)$metadata - .data = NextMethod() - reclass(.data, metadata) -} - -#' @method mutate epi_df -#' @rdname print.epi_df -#' @export -mutate.epi_df = function(.data, ...) { - metadata = attributes(.data)$metadata - .data = NextMethod() - reclass(.data, metadata) +`[.epi_df` <- function(x, i, j, drop = FALSE) { + res <- NextMethod() + + if (!is.data.frame(res)) return(res) + + dplyr::dplyr_reconstruct(res, x) } -#' @method relocate epi_df -#' @rdname print.epi_df +#' @importFrom dplyr dplyr_col_modify #' @export -relocate.epi_df = function(.data, ...) { - metadata = attributes(.data)$metadata - .data = NextMethod() - reclass(.data, metadata) +dplyr_col_modify.epi_df = function(data, cols) { + dplyr::dplyr_reconstruct(NextMethod(), data) } -#' @method rename epi_df -#' @rdname print.epi_df #' @export -rename.epi_df = function(.data, ...) { - metadata = attributes(.data)$metadata - .data = NextMethod() - reclass(.data, metadata) +`names<-.epi_df` = function(x, value) { + old_names = names(x) + old_other_keys = attributes(x)$metadata$other_keys + result = NextMethod() + attributes(x)$metadata$other_keys <- value[match(old_other_keys, old_names)] + dplyr::dplyr_reconstruct(result, result) } -#' @method slice epi_df +#' @method group_by epi_df #' @rdname print.epi_df #' @export -slice.epi_df = function(.data, ...) { +group_by.epi_df = function(.data, ...) { metadata = attributes(.data)$metadata .data = NextMethod() reclass(.data, metadata) diff --git a/man/print.epi_df.Rd b/man/print.epi_df.Rd index 9ac3af99..bba21030 100644 --- a/man/print.epi_df.Rd +++ b/man/print.epi_df.Rd @@ -3,14 +3,7 @@ \name{print.epi_df} \alias{print.epi_df} \alias{summary.epi_df} -\alias{arrange.epi_df} -\alias{filter.epi_df} \alias{group_by.epi_df} -\alias{group_modify.epi_df} -\alias{mutate.epi_df} -\alias{relocate.epi_df} -\alias{rename.epi_df} -\alias{slice.epi_df} \alias{ungroup.epi_df} \alias{unnest.epi_df} \title{Base S3 methods for an \code{epi_df} object} @@ -19,22 +12,8 @@ \method{summary}{epi_df}(object, ...) -\method{arrange}{epi_df}(.data, ...) - -\method{filter}{epi_df}(.data, ...) - \method{group_by}{epi_df}(.data, ...) -\method{group_modify}{epi_df}(.data, ...) - -\method{mutate}{epi_df}(.data, ...) - -\method{relocate}{epi_df}(.data, ...) - -\method{rename}{epi_df}(.data, ...) - -\method{slice}{epi_df}(.data, ...) - \method{ungroup}{epi_df}(x, ...) \method{unnest}{epi_df}(data, ...) @@ -47,16 +26,11 @@ Currently unused.} \item{object}{The \code{epi_df} object.} -\item{.data}{The \code{epi_df} object.} - \item{data}{The \code{epi_df} object.} } \description{ -Print, summary, and \code{dplyr} verbs (that preserve class and attributes) for an -\code{epi_df} object. +Print and summary functions for an \code{epi_df} object. Prints a variety of summary statistics about the \code{epi_df} object, such as the time range included and geographic coverage. - -\code{dplyr} verbs for \code{epi_df} objects, preserving class and attributes. } diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index f363ae34..bdfa5d59 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -26,9 +26,11 @@ test_that("`ref_time_values` + `align` that have some slide data, but generate t ## --- These cases doesn't generate the error: --- test_that("these doesn't produce an error; the error appears only if the ref time values are out of the range for every group", { expect_identical(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01")+200L) %>% + dplyr::ungroup() %>% dplyr::select("geo_value","slide_value_value"), dplyr::tibble(geo_value = "ak", slide_value_value = 199)) # out of range for one group expect_identical(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-04")) %>% + dplyr::ungroup() %>% dplyr::select("geo_value","slide_value_value"), dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2))) # not out of range for either group -}) \ No newline at end of file +}) diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index 339fe930..66ff9ef8 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -27,7 +27,8 @@ test_that("epix_slide works as intended",{ sum_binary = c(2^3+2^2, 2^6+2^3, 2^10+2^9)) %>% - as_epi_df(as_of = 1) # Also a bug (issue #213) + as_epi_df(as_of = 1) %>% # Also a bug (issue #213) + group_by(geo_value) expect_identical(xx1,xx2) # *