cmu-delphi · dajmcdon · Apr 10, 2025 · Apr 8, 2025 · Apr 8, 2025 · Apr 8, 2025
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: epiprocess
 Title: Tools for basic signal processing in epidemiology
-Version: 0.11.4
+Version: 0.11.5
 Authors@R: c(
     person("Jacob", "Bien", role = "ctb"),
     person("Logan", "Brooks", , "lcbrooks+github@andrew.cmu.edu", role = c("aut", "cre")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -53,6 +53,7 @@ S3method(print,epi_archive)
 S3method(print,epi_df)
 S3method(print,grouped_epi_archive)
 S3method(print,growth_rate_params)
+S3method(print,revision_analysis)
 S3method(summary,epi_df)
 S3method(ungroup,epi_df)
 S3method(ungroup,grouped_epi_archive)
@@ -99,6 +100,7 @@ export(new_epi_archive)
 export(new_epi_df)
 export(relocate)
 export(rename)
+export(revision_analysis)
 export(revision_summary)
 export(set_versions_end)
 export(slice)

diff --git a/NEWS.md b/NEWS.md
@@ -22,7 +22,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
   for users without a compiler, we have placed `{trendfilter}` in Suggests:; if
   you want to use `method = "trendfilter"` you will need to manually install
   this dependency (e.g., with `remotes::install_github("glmgen/trendfilter")`).
-- In `revision_summary()`:
+- In `revision_summary()` (aliased to `revision_analysis()`):
   - The `should_compactify` argument is now called `compactify`. To migrate,
     change any calls with `should_compactfiy =` to `compactify =`.
   - Output now uses the name `lag_near_latest` instead of `time_near_latest`. To
@@ -36,6 +36,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
   - `min_waiting_period` now defines a nonstrict inequality instead of a strict
     one. To obtain the old bounds, bump the `min_waiting_period` up to the next
     possible value for your `time_type`.
+  - Added a `print()` method to take the place of `print_inform`.
+  - Removed the the argument `print_inform`. This is now always false. Replaced
+    with an option, `return_only_tibble` to return only the tibble of results 
+    rather than the full S3 object.
+
 - In `key_colnames()`:
   - On regular (non-`epi_df`) data frames, now requires manual specification of
     `geo_keys`, `other_keys`, and `time_keys`.
@@ -53,7 +58,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
   `.facet_filter` which allows explicit selection of facets to show.
 
 ## Improvements
-- `revision_summary()` now supports all `time_type`s.
+- `revision_summary()` now supports all `time_type`s. Printed summary is more attractive.
 - The compactification tolerance setting now works with integer-type columns.
 - Various functions are now faster, using faster variants of core operations and
   avoiding reconstructing grouped `epi_df`s when unnecessary.

diff --git a/R/revision_analysis.R b/R/revision_analysis.R
@@ -34,31 +34,16 @@
 #'   `NA`'s compactify is run again if `compactify` is `TRUE` to make
 #'   sure there are no duplicate values from occasions when the signal is
 #'   revised to `NA`, and then back to its immediately-preceding value.
-#' @param print_inform bool, determines whether to print summary information, or
-#'   only return the full summary tibble
 #' @param min_waiting_period `difftime`, integer or `NULL`. Sets a cutoff: any
 #'   time_values that have not had at least `min_waiting_period` to stabilize as
 #'   of the `versions_end` are removed. `min_waiting_period` should characterize
 #'   the typical time during which most significant revisions occur. The default
 #'   of 60 days corresponds to a typical near-final value for case counts as
 #'   reported in the context of insurance. To avoid this filtering, either set
-#'   to `NULL` or 0.
+#'   to `NULL` or 0. A `difftime` will be rounded up to the appropriate `time_type` if
+#'   necessary (that is 5 days will be rounded to 1 week if the data is weekly).
 #' @param within_latest double between 0 and 1. Determines the threshold
 #'   used for the `lag_to`
-#' @param quick_revision difftime or integer (integer is treated as days), for
-#'   the printed summary, the amount of time between the final revision and the
-#'   actual time_value to consider the revision quickly resolved. Default of 3
-#'   days
-#' @param few_revisions integer, for the printed summary, the upper bound on the
-#'   number of revisions to consider "few". Default is 3.
-#' @param abs_spread_threshold length-1 numeric, for the printed summary, the
-#'   maximum spread used to characterize revisions which don't actually change
-#'   very much. Default is 5% of the maximum value in the dataset, but this is
-#'   the most unit dependent of values, and likely needs to be chosen
-#'   appropriate for the scale of the dataset.
-#' @param rel_spread_threshold length-1 double between 0 and 1, for the printed
-#'   summary, the relative spread fraction used to characterize revisions which
-#'   don't actually change very much. Default is .1, or 10% of the final value
 #' @param compactify bool. If `TRUE`, we will compactify after the signal
 #'   requested in `...` has been selected on its own and the `drop_nas` step.
 #'   This helps, for example, to give similar results when called on
@@ -67,6 +52,8 @@
 #'   requested signal. The default is `TRUE`.
 #' @param compactify_abs_tol length-1 double, used if `compactify` is `TRUE`, it
 #'   determines the threshold for when two doubles are considered identical.
+#' @param return_only_tibble boolean to return only the simple `tibble` of
+#'   computational results rather than the complete S3 object.
 #'
 #' @details Applies to `epi_archive`s with `time_type`s of `"day"`, `"week"`,
 #'   and `"yearmonth"`. It can also work with a `time_type` of `"integer"` if
@@ -76,31 +63,35 @@
 #'   produce incorrect results for some calculations, since week numbering
 #'   contains jumps at year boundaries.
 #'
+#' @return An S3 object with class `revision_behavior`. This function is typically
+#'   called for the purposes of inspecting the printed output. The
+#'   results of the computations are available in
+#'   `revision_analysis(...)$revision_behavior`. If you only want to access
+#'   the internal computations, use `return_only_tibble = TRUE`.
+#'
 #' @examples
-#' revision_example <- revision_summary(archive_cases_dv_subset, percent_cli)
-#' revision_example %>% arrange(desc(spread))
+#' revision_example <- revision_analysis(archive_cases_dv_subset, percent_cli)
+#' revision_example$revision_behavior %>% arrange(desc(spread))
 #'
 #' @export
 #' @importFrom cli cli_inform cli_abort cli_li
 #' @importFrom rlang list2 syms dots_n
 #' @importFrom vctrs vec_cast
 #' @importFrom dplyr mutate group_by arrange filter if_any all_of across pull pick c_across
 #'   everything ungroup summarize if_else %>%
-revision_summary <- function(epi_arch,
-                             ...,
-                             drop_nas = TRUE,
-                             print_inform = TRUE,
-                             min_waiting_period = as.difftime(60, units = "days") %>%
-                               difftime_approx_ceiling_time_delta(epi_arch$time_type),
-                             within_latest = 0.2,
-                             quick_revision = as.difftime(3, units = "days") %>%
-                               difftime_approx_ceiling_time_delta(epi_arch$time_type),
-                             few_revisions = 3,
-                             abs_spread_threshold = NULL,
-                             rel_spread_threshold = 0.1,
-                             compactify = TRUE,
-                             compactify_abs_tol = 0) {
+revision_analysis <- function(epi_arch,
+                              ...,
+                              drop_nas = TRUE,
+                              min_waiting_period = as.difftime(60, units = "days"),
+                              within_latest = 0.2,
+                              compactify = TRUE,
+                              compactify_abs_tol = 0,
+                              return_only_tibble = FALSE) {
   assert_class(epi_arch, "epi_archive")
+  if (methods::is(min_waiting_period, "difftime")) {
+    min_waiting_period <- min_waiting_period %>%
+      difftime_approx_ceiling_time_delta(epi_arch$time_type)
+  }
   # if the column to summarize isn't specified, use the only one if there is only one
   if (dots_n(...) == 0) {
     # Choose the first column that's not a key:
@@ -126,11 +117,6 @@ revision_summary <- function(epi_arch,
       cli_abort("Not currently implementing more than one column at a time. Run each separately.")
     }
   }
-  if (is.null(abs_spread_threshold)) {
-    abs_spread_threshold <- .05 * epi_arch$DT %>%
-      pull(!!arg) %>%
-      max(na.rm = TRUE)
-  }
   # for each time_value, get
   #   the number of revisions
   #   the maximum spread in value (both absolute and relative)
@@ -193,63 +179,113 @@ revision_summary <- function(epi_arch,
       time_value, geo_value, all_of(epikey_names), n_revisions, min_lag, max_lag, # nolint: object_usage_linter
       lag_near_latest, spread, rel_spread, min_value, max_value, median_value # nolint: object_usage_linter
     )
-  if (print_inform) {
-    cli_inform("Min lag (time to first version):")
-    time_delta_summary(revision_behavior$min_lag, time_type) %>% print()
-    if (!drop_nas) {
-      total_na <- epi_arch$DT %>%
-        filter(is.na(c_across(!!arg))) %>% # nolint: object_usage_linter
-        nrow()
-      cli_inform("Fraction of all versions that are `NA`:")
-      cli_li(num_percent(total_na, nrow(epi_arch$DT), ""))
-      cli_inform("")
-    }
-    cli_inform("Fraction of epi_key+time_values with")
-    total_num <- nrow(revision_behavior) # nolint: object_usage_linter
-    total_num_unrevised <- sum(revision_behavior$n_revisions == 0) # nolint: object_usage_linter
-    cli_inform("No revisions:")
-    cli_li(num_percent(total_num_unrevised, total_num, ""))
-    total_quickly_revised <- sum( # nolint: object_usage_linter
-      time_delta_to_n_steps(revision_behavior$max_lag, time_type) <=
-        time_delta_to_n_steps(quick_revision, time_type)
-    )
-    cli_inform("Quick revisions (last revision within {format_time_delta(quick_revision, time_type)}
-                of the `time_value`):")
-    cli_li(num_percent(total_quickly_revised, total_num, ""))
-    total_barely_revised <- sum( # nolint: object_usage_linter
-      revision_behavior$n_revisions <=
-        few_revisions
-    )
-    cli_inform("Few revisions (At most {few_revisions} revisions for that `time_value`):")
-    cli_li(num_percent(total_barely_revised, total_num, ""))
-    cli_inform("")
-    cli_inform("Fraction of revised epi_key+time_values which have:")
+  total_na <- epi_arch$DT %>%
+    filter(is.na(c_across(!!arg))) %>% # nolint: object_usage_linter
+    nrow()
+  if (!return_only_tibble) {
+    revision_behavior <- structure(list(
+      revision_behavior = revision_behavior,
+      range_time_values = range(epi_arch$DT$time_value),
+      signal_variable = arg,
+      drop_nas = drop_nas,
+      time_type = time_type,
+      total_na = total_na,
+      max_val = max(epi_arch$DT[[arg]], na.rm = TRUE),
+      n_obs = nrow(epi_arch$DT),
+      within_latest = within_latest
+    ), class = "revision_analysis")
+  }
+  return(revision_behavior)
+}
+
+
 
-    real_revisions <- revision_behavior %>% filter(n_revisions > 0) # nolint: object_usage_linter
-    n_real_revised <- nrow(real_revisions) # nolint: object_usage_linter
-    rel_spread <- sum( # nolint: object_usage_linter
-      real_revisions$rel_spread <
-        rel_spread_threshold,
-      na.rm = TRUE
-    ) + sum(is.na(real_revisions$rel_spread))
-    cli_inform("Less than {rel_spread_threshold} spread in relative value:")
-    cli_li(num_percent(rel_spread, n_real_revised, ""))
-    abs_spread <- sum( # nolint: object_usage_linter
-      real_revisions$spread >
-        abs_spread_threshold
-    ) # nolint: object_usage_linter
-    cli_inform("Spread of more than {abs_spread_threshold} in actual value (when revised):")
-    cli_li(num_percent(abs_spread, n_real_revised, ""))
 
-    # time_type_unit_pluralizer[[time_type]] is a format string controlled by us
-    # and/or downstream devs, so we can paste it onto our format string safely:
-    units_plural <- pluralize(paste0("{qty(2)}", time_type_unit_pluralizer[[time_type]])) # nolint: object_usage_linter
-    cli_inform("{toTitleCase(units_plural)} until within {within_latest*100}% of the latest value:")
-    time_delta_summary(revision_behavior[["lag_near_latest"]], time_type) %>% print()
+#' Print a `revision_analysis` object
+#'
+#' @param x a `revision_analysis` object
+#' @param quick_revision Difftime or integer (integer is treated as days).
+#'   The amount of time between the final revision and the
+#'   actual time_value to consider the revision quickly resolved. Default of 3
+#'   days. This will be rounded up to the appropriate `time_type` if
+#'   necessary (that is 5 days will be rounded to 1 week if the data is weekly).
+#' @param few_revisions Integer. The upper bound on the
+#'   number of revisions to consider "few". Default is 3.
+#' @param abs_spread_threshold Scalar numeric. The
+#'   maximum spread used to characterize revisions which don't actually change
+#'   very much. Default is 5% of the maximum value in the dataset, but this is
+#'   the most unit dependent of values, and likely needs to be chosen
+#'   appropriate for the scale of the dataset.
+#' @param rel_spread_threshold Scalar between 0 and 1. The relative spread fraction used to characterize revisions which
+#'   don't actually change very much. Default is .1, or 10% of the final value
+#'
+#' @rdname revision_analysis
+#' @export
+print.revision_analysis <- function(x,
+                                    quick_revision = as.difftime(3, units = "days"),
+                                    few_revisions = 3,
+                                    abs_spread_threshold = NULL,
+                                    rel_spread_threshold = 0.1,
+                                    ...) {
+  if (methods::is(quick_revision, "difftime")) {
+    quick_revision <- quick_revision %>%
+      difftime_approx_ceiling_time_delta(x$time_type)
   }
-  return(revision_behavior)
+  if (is.null(abs_spread_threshold)) abs_spread_threshold <- .05 * x$max_val
+  rev_beh <- x$revision_behavior
+  cli::cli_h2("An epi_archive spanning {.val {x$range_time_values[1]}} to {.val {x$range_time_values[1]}}.")
+  cli::cli_h3("Min lag (time to first version):")
+  time_delta_summary(rev_beh$min_lag, x$time_type) %>% print()
+  if (!x$drop_nas) {
+    cli_inform("Fraction of all versions that are `NA`:")
+    cli_li(num_percent(x$total_na, x$n_obs, ""))
+    cli_inform("")
+  }
+  cli::cli_h3("Fraction of epi_key + time_values with")
+  total_num <- nrow(rev_beh) # nolint: object_usage_linter
+  total_num_unrevised <- sum(rev_beh$n_revisions == 0) # nolint: object_usage_linter
+  cli_inform("No revisions:")
+  cli_li(num_percent(total_num_unrevised, total_num, ""))
+  total_quickly_revised <- sum( # nolint: object_usage_linter
+    time_delta_to_n_steps(rev_beh$max_lag, x$time_type) <=
+      time_delta_to_n_steps(quick_revision, x$time_type)
+  )
+  cli_inform("Quick revisions (last revision within {format_time_delta(quick_revision, x$time_type)}
+                of the `time_value`):")
+  cli_li(num_percent(total_quickly_revised, total_num, ""))
+  total_barely_revised <- sum(rev_beh$n_revisions <= few_revisions)
+  cli_inform("Few revisions (At most {.val {few_revisions}} revisions for that `time_value`):")
+  cli_li(num_percent(total_barely_revised, total_num, ""))
+
+  cli::cli_h3("Fraction of revised epi_key + time_values which have:")
+
+  real_revisions <- rev_beh %>% filter(n_revisions > 0) # nolint: object_usage_linter
+  n_real_revised <- nrow(real_revisions) # nolint: object_usage_linter
+  rel_spread <- sum( # nolint: object_usage_linter
+    real_revisions$rel_spread < rel_spread_threshold,
+    na.rm = TRUE
+  ) + sum(is.na(real_revisions$rel_spread))
+  cli_inform("Less than {.val {rel_spread_threshold}} spread in relative value:")
+  cli_li(num_percent(rel_spread, n_real_revised, ""))
+  abs_spread <- sum( # nolint: object_usage_linter
+    real_revisions$spread > abs_spread_threshold
+  ) # nolint: object_usage_linter
+  divid <- cli::cli_div(theme = list(.val = list(digits = 3)))
+  cli_inform("Spread of more than {.val {abs_spread_threshold}} in actual value (when revised):")
+  cli::cli_end(divid)
+  cli_li(num_percent(abs_spread, n_real_revised, ""))
+
+  # time_type_unit_pluralizer[[time_type]] is a format string controlled by us
+  # and/or downstream devs, so we can paste it onto our format string safely:
+  units_plural <- pluralize(paste0("{qty(2)}", time_type_unit_pluralizer[[x$time_type]])) # nolint: object_usage_linter
+  cli::cli_h3("{toTitleCase(units_plural)} until within {.val {x$within_latest*100}}% of the latest value:")
+  time_delta_summary(rev_beh[["lag_near_latest"]], x$time_type) %>% print()
 }
 
+#' @export
+#' @rdname revision_analysis
+revision_summary <- revision_analysis
+
 #' pull the value from lags when values starts indefinitely being within prop of its latest value.
 #' @param lags vector of lags; should be sorted
 #' @param values this should be a vector (e.g., a column) with length matching that of `lags`

diff --git a/man/epiprocess-package.Rd b/man/epiprocess-package.Rd