Half-fix failing test, use dplyr_extending, don't ungroup in slides
* Half-fix the failing grouped `epix_slide` test, where `group_modify` dropped
  the `epi_df` class when binding results together, by implementing
  `dplyr_reconstruct.epi_df` (illustrated after this list).  Somehow, a session
  that documents and tests immediately before this commit and then again after
  it will see the tests pass, but the tests do not pass when run directly off
  this branch.  The class vector involved in the failure does appear to change.
* Explicate `epix_slide` `count` derivation.
* Implement the rest of `?dplyr_extending` and remove some now-unnecessary S3
  methods for dplyr verbs, addressing #195, #223, and changing `epi[x]_slide` to
  leave grouping intact.
* Update tests for grouped slides to reflect new behavior.
* Update NEWS.md.
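
A minimal sketch of the `group_modify` behavior that the `dplyr_reconstruct.epi_df`
method targets (the toy data and column values below are illustrative assumptions):

```r
library(dplyr)
library(epiprocess)

# Toy epi_df; the columns and values here are illustrative assumptions.
edf <- as_epi_df(tibble::tibble(
  geo_value  = c("ca", "ca", "ny", "ny"),
  time_value = as.Date("2022-01-01") + c(0, 1, 0, 1),
  value      = 1:4
))

out <- edf %>%
  group_by(geo_value) %>%
  group_modify(~ head(.x, 1))

# With `dplyr_reconstruct.epi_df` in place, `out` should still inherit from
# "epi_df" (and remain grouped); previously the class was dropped when the
# per-group results were bound back together.
inherits(out, "epi_df")
```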
lcbrooks committed Nov 1, 2022
1 parent 6e3f554 commit 4ca96b0
Showing 7 changed files with 152 additions and 146 deletions.
11 changes: 4 additions & 7 deletions NAMESPACE
@@ -1,26 +1,22 @@
# Generated by roxygen2: do not edit by hand

S3method("[",epi_df)
S3method(arrange,epi_df)
S3method("names<-",epi_df)
S3method(as_epi_df,data.frame)
S3method(as_epi_df,epi_df)
S3method(as_epi_df,tbl_df)
S3method(as_epi_df,tbl_ts)
S3method(as_tsibble,epi_df)
S3method(dplyr_col_modify,col_modify_recorder_df)
S3method(filter,epi_df)
S3method(dplyr_col_modify,epi_df)
S3method(dplyr_reconstruct,epi_df)
S3method(group_by,epi_archive)
S3method(group_by,epi_df)
S3method(group_by,grouped_epi_archive)
S3method(group_by_drop_default,grouped_epi_archive)
S3method(group_modify,epi_df)
S3method(mutate,epi_df)
S3method(next_after,Date)
S3method(next_after,integer)
S3method(print,epi_df)
S3method(relocate,epi_df)
S3method(rename,epi_df)
S3method(slice,epi_df)
S3method(summary,epi_df)
S3method(ungroup,epi_df)
S3method(unnest,epi_df)
@@ -66,6 +62,7 @@ importFrom(data.table,set)
importFrom(data.table,setkeyv)
importFrom(dplyr,arrange)
importFrom(dplyr,dplyr_col_modify)
importFrom(dplyr,dplyr_reconstruct)
importFrom(dplyr,filter)
importFrom(dplyr,group_by)
importFrom(dplyr,group_by_drop_default)
19 changes: 19 additions & 0 deletions NEWS.md
@@ -4,9 +4,28 @@ Note that `epiprocess` uses the [Semantic Versioning
("semver")](https://semver.org/) scheme for all release versions, but not for
development versions. A ".9999" suffix indicates a development version.

## Breaking changes:

* `epi_slide` and `epix_slide` now keep the grouping of `x` in their results,
like `dplyr::mutate` and `dplyr::group_modify`. To obtain the old behavior,
call `dplyr::ungroup()` immediately after the slide, as in the sketch below.
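
An illustrative sketch (the input `edf`, its column names, and the slide
arguments are assumptions, not part of the release notes):

```r
library(dplyr)
library(epiprocess)

# `edf` is assumed to be an `epi_df` with columns geo_value, time_value, value.
slid <- edf %>%
  group_by(geo_value) %>%
  epi_slide(~ mean(.x$value), new_col_name = "value_mean")  # window arguments omitted

is_grouped_df(slid)  # TRUE under the new behavior
slid %>% ungroup()   # recovers the old, ungrouped result
```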

## Potentially-breaking changes:

* Fixed `[` on grouped `epi_df`s to maintain the grouping if possible when
dropping the `epi_df` class (e.g., when removing the `time_value` column).
* Fixed `group_modify` on `epi_df`s to decay to non-`epi_df`s when an `epi_df`
result doesn't make sense (e.g., when removing the `time_value` column). Both
fixes are illustrated in the sketch below.
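
An illustrative sketch of both fixes (the input `edf` and its column names are
assumptions):

```r
library(dplyr)

# `edf` is assumed to be an `epi_df` with columns geo_value, time_value, value.
gedf <- edf %>% group_by(geo_value)

# Dropping `time_value` via `[` means the result can no longer be an `epi_df`,
# but the grouping should survive:
sub <- gedf[, c("geo_value", "value")]
inherits(sub, "epi_df")      # FALSE
inherits(sub, "grouped_df")  # TRUE

# Similarly, a `group_modify` computation that drops `time_value` should decay
# to a (grouped) tibble rather than an invalid `epi_df`:
gedf %>% group_modify(~ summarize(.x, value = sum(value)))
```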

## Improvements:

* Fixed `epi_slide` and `group_modify` on grouped `epi_df`s to not drop `epi_df`
class.

## Cleanup:

* Added a `NEWS.md` file to track changes to the package.
* Implemented `?dplyr::dplyr_extending` for `epi_df`s (#223).

# epiprocess 0.5.0:

85 changes: 57 additions & 28 deletions R/grouped_epi_archive.R
@@ -215,24 +215,55 @@ grouped_epi_archive =
# Symbolize column name
new_col = sym(new_col_name)

# Key variable names, apart from time value and version
key_vars = setdiff(key(private$ungrouped$DT), c("time_value", "version"))
# Each computation is expected to output a data frame with either
# one element/row total or one element/row per encountered
# nongrouping, nontime, nonversion key value. These nongrouping,
# nontime, nonversion key columns can be seen as the "effective" key
# of the computation; the computation might return an object that
# reports a different key or no key, but the "effective" key should
# still be a valid unique key for the data, and is something that we
# could use even with `.keep = FALSE`.
comp_effective_key_vars =
setdiff(key(private$ungrouped$DT),
c(private$vars, "time_value", "version"))

# Computation for one group, one time value
comp_one_grp = function(.data_group,
comp_one_grp = function(.data_group, .group_key,
f, ...,
time_value,
key_vars,
ref_time_value,
comp_effective_key_vars,
new_col) {
# Carry out the specified computation
comp_value = f(.data_group, ...)
comp_value = f(.data_group, .group_key, ...)

# Count the number of appearances of the reference time value.
# Note: ideally, we want to directly count occurrences of the ref
# time value but due to latency, this will often not appear in the
# data group. So we count the number of unique key values, outside
# of the time value column
count = sum(!duplicated(.data_group[, key_vars]))
# Calculate the number of output elements/rows we expect the
# computation to output: one per distinct "effective computation
# key variable" value encountered in the input. Note: this mirrors
# how `epi_slide` does things if we're using unique keys, but can
# diverge if using nonunique keys. The `epi_slide` approach of
# counting occurrences of the `ref_time_value` in the `time_value`
# column, which helps line up the computation results with
# corresponding rows of the input data, wouldn't quite apply here:
# we'd want to line up with rows (from the same group) whose
# `version` matches the `ref_time_value`, but we would still need
# to summarize these rows somehow and drop the `time_value` input
# column, and this summarization requires something like a
# to-be-unique output key to determine a sensible number of rows
# to output (and the contents of those rows).
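# As a purely illustrative example: if the archive's key is
# (geo_value, age_group, time_value, version) and we group by
# `geo_value`, then `comp_effective_key_vars` is "age_group", and a
# data group whose rows span three distinct age groups gives
# `count == 3L`, i.e., the computation is expected to output three
# elements/rows (or a single one, to be recycled).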
count =
if (length(comp_effective_key_vars) != 0L) {
sum(!duplicated(.data_group[, comp_effective_key_vars]))
} else {
# Same idea as above, but accounting for `duplicated` not
# working as we want on 0 columns. (Should be the same as if
# we were counting distinct values of a column defined as
# `rep(val, target_n_rows)`.)
if (nrow(.data_group) == 0L) {
0L
} else {
1L
}
}

# If we get back an atomic vector
if (is.atomic(comp_value)) {
@@ -241,7 +272,7 @@
}
# If not a singleton, should be the right length, else abort
else if (length(comp_value) != count) {
Abort("If the slide computation returns an atomic vector, then it must have a single element, or else one element per appearance of the reference time value in the local window.")
Abort('If the slide computation returns an atomic vector, then it must have either (a) a single element, or (b) one element per distinct combination of key variables, excluding the `time_value`, `version`, and grouping variables, that is present in the first argument to the computation.')
}
}

@@ -256,7 +287,7 @@
}
# Make into a list
else {
comp_value = split(comp_value, 1:nrow(comp_value))
comp_value = split(comp_value, seq_len(nrow(comp_value)))
}
}

@@ -265,10 +296,9 @@
Abort("The slide computation must return an atomic vector or a data frame.")
}

# Note that we've already recycled comp value to make size stable,
# so tibble() will just recycle time value appropriately
return(tibble::tibble(time_value = time_value,
!!new_col := comp_value))
# Label every result row with the `ref_time_value`:
return(tibble::tibble(time_value = rep(.env$ref_time_value, count),
!!new_col := .env$comp_value))
}

# If f is not missing, then just go ahead, slide by group
@@ -278,12 +308,12 @@
private$ungrouped$as_of(ref_time_value, min_time_value = ref_time_value - before) %>%
dplyr::group_by(dplyr::across(tidyselect::all_of(private$vars)),
.drop=private$drop) %>%
dplyr::summarize(comp_one_grp(dplyr::cur_data_all(),
f = f, ...,
time_value = ref_time_value,
key_vars = key_vars,
new_col = new_col),
.groups = groups)
dplyr::group_modify(comp_one_grp,
f = f, ...,
ref_time_value = ref_time_value,
comp_effective_key_vars = comp_effective_key_vars,
new_col = new_col,
.keep = TRUE)
})
}

@@ -306,11 +336,10 @@
dplyr::group_by(dplyr::across(tidyselect::all_of(private$vars))) %>%
dplyr::group_modify(comp_one_grp,
f = f, quo = quo,
time_value = ref_time_value,
key_vars = key_vars,
ref_time_value = ref_time_value,
comp_effective_key_vars = comp_effective_key_vars,
new_col = new_col,
.keep = TRUE) %>%
dplyr::ungroup()
.keep = TRUE)
})
}

148 changes: 66 additions & 82 deletions R/methods-epi_df.R
@@ -19,8 +19,7 @@ as_tsibble.epi_df = function(x, key, ...) {

#' Base S3 methods for an `epi_df` object
#'
#' Print, summary, and `dplyr` verbs (that preserve class and attributes) for an
#' `epi_df` object.
#' Print and summary functions for an `epi_df` object.
#'
#' @param x The `epi_df` object.
#' @param ... Additional arguments passed to methods.
@@ -65,119 +64,104 @@ summary.epi_df = function(object, ...) {
dplyr::summarize(mean(.data$num)))))
}

#' Drop any `epi_df` metadata and class on a data frame
#'
#' Useful in implementing `?dplyr_extending` when manipulations cause invariants
#' of `epi_df`s to be violated and we need to return some other class. Note that
#' this will maintain any grouping (keeping the `grouped_df` class and
#' associated attributes, if present).
#'
#' @param x an `epi_df` or other data frame
#' @return `x` with any metadata dropped and the `"epi_df"` class, if previously
#' present, dropped
#'
#' @noRd
decay_epi_df = function(x) {
attributes(x)$metadata <- NULL
class(x) <- class(x)[class(x) != "epi_df"]
x
}
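
# Purely illustrative example: for a grouped `epi_df` whose class is
# c("epi_df", "grouped_df", "tbl_df", "tbl", "data.frame"), `decay_epi_df()`
# should return an object with class c("grouped_df", "tbl_df", "tbl",
# "data.frame") and with the `metadata` attribute removed.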

# Implementing `dplyr_extending`: `geo_type` and `time_type` are scalar
# attributes dependent on columns, and `other_keys` acts like an attribute
# vectorized over columns; `dplyr_extending` advice says to implement
# `dplyr_reconstruct`, 1d `[`, `dplyr_col_modify`, and `names<-`, but not
# `dplyr_row_slice`. We'll implement `[` to allow either 1d or 2d.

#' @param data tibble or `epi_df` (`dplyr` feeds in former, but we may
#' directly feed in latter from our other methods)
#' @param template `epi_df` template to use to restore
#' @return `epi_df` or degrade into `tbl_df`
#' @importFrom dplyr dplyr_reconstruct
#' @export
`[.epi_df` <- function(x, i, j, drop = FALSE) {
res <- NextMethod()

if (!is.data.frame(res)) return(res)

if (missing(i)) {
i <- NULL
}

if (missing(j)) {
j <- NULL
}
#' @noRd
dplyr_reconstruct.epi_df = function(data, template) {
# Start from a reconstruction for the backing S3 classes; this ensures that we
# keep any grouping that has been applied:
res <- NextMethod()

cn <- names(res)

# Duplicate columns, Abort
dup_col_names = cn[duplicated(cn)]
if (length(dup_col_names) != 0) {
Abort(paste0("Column name(s) ",
paste(unique(dup_col_names),
collapse = ", "), " must not be duplicated."))
}
}

not_epi_df <- !("time_value" %in% cn) || !("geo_value" %in% cn)

if (not_epi_df) {
attributes(res)$metadata <- NULL
return(tibble::as_tibble(res))
# If we're calling on an `epi_df` from one of our own functions, we need to
# decay to a non-`epi_df` result. If `dplyr` is calling, `data` is a tibble,
# `res` is not an `epi_df` yet (but might, e.g., be a `grouped_df`), and we
# simply need to skip adding the metadata & class. Current `decay_epi_df`
# should work in both cases.
return(decay_epi_df(res))
}

# Use reclass as safeguard (in case class &/or metadata are dropped)
res <- reclass(res, attr(x, "metadata"))
res <- reclass(res, attr(template, "metadata"))

# XXX we may want to verify the `geo_type` and `time_type` here. If it's
# significant overhead, we may also want to keep this less strict version
# around and implement some extra S3 methods that use it, when appropriate.

# Amend additional metadata if some other_keys cols are dropped in the subset
old_other_keys = attr(x, "metadata")$other_keys
old_other_keys = attr(template, "metadata")$other_keys
attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% cn]

res
}

#' `dplyr` verbs
#'
#' `dplyr` verbs for `epi_df` objects, preserving class and attributes.
#'
#' @method arrange epi_df
#' @param .data The `epi_df` object.
#' @rdname print.epi_df
#' @export
arrange.epi_df = function(.data, ...) {
metadata = attributes(.data)$metadata
.data = NextMethod()
reclass(.data, metadata)
}

#' @method filter epi_df
#' @rdname print.epi_df
#' @export
filter.epi_df = function(.data, ...) {
metadata = attributes(.data)$metadata
.data = NextMethod()
reclass(.data, metadata)
}

#' @method group_by epi_df
#' @rdname print.epi_df
#' @export
group_by.epi_df = function(.data, ...) {
metadata = attributes(.data)$metadata
.data = NextMethod()
reclass(.data, metadata)
}

#' @method group_modify epi_df
#' @rdname print.epi_df
#' @export
group_modify.epi_df = function(.data, ...) {
metadata = attributes(.data)$metadata
.data = NextMethod()
reclass(.data, metadata)
}

#' @method mutate epi_df
#' @rdname print.epi_df
#' @export
mutate.epi_df = function(.data, ...) {
metadata = attributes(.data)$metadata
.data = NextMethod()
reclass(.data, metadata)
`[.epi_df` <- function(x, i, j, drop = FALSE) {
res <- NextMethod()

if (!is.data.frame(res)) return(res)

dplyr::dplyr_reconstruct(res, x)
}

#' @method relocate epi_df
#' @rdname print.epi_df
#' @importFrom dplyr dplyr_col_modify
#' @export
relocate.epi_df = function(.data, ...) {
metadata = attributes(.data)$metadata
.data = NextMethod()
reclass(.data, metadata)
dplyr_col_modify.epi_df = function(data, cols) {
dplyr::dplyr_reconstruct(NextMethod(), data)
}

#' @method rename epi_df
#' @rdname print.epi_df
#' @export
rename.epi_df = function(.data, ...) {
metadata = attributes(.data)$metadata
.data = NextMethod()
reclass(.data, metadata)
`names<-.epi_df` = function(x, value) {
old_names = names(x)
old_other_keys = attributes(x)$metadata$other_keys
result = NextMethod()
attributes(result)$metadata$other_keys <- value[match(old_other_keys, old_names)]
dplyr::dplyr_reconstruct(result, result)
}
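
# Purely illustrative example: with `other_keys` equal to "age_group",
# renaming that column, e.g. via
#   names(edf)[names(edf) == "age_group"] <- "age"
# should carry the rename through to `attr(edf, "metadata")$other_keys`
# ("age_group" becomes "age") and then re-check the `epi_df` invariants via
# `dplyr_reconstruct()`.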

#' @method slice epi_df
#' @method group_by epi_df
#' @rdname print.epi_df
#' @export
slice.epi_df = function(.data, ...) {
group_by.epi_df = function(.data, ...) {
metadata = attributes(.data)$metadata
.data = NextMethod()
reclass(.data, metadata)
(The remainder of this file's diff and the other three changed files are not shown.)
