From 8d9f2de6d7791e3c063f2170cbd93c547887c1b2 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 8 Apr 2025 16:10:19 -0700 Subject: [PATCH 01/14] switch revision_behavior to a better formatted print method --- NAMESPACE | 2 + R/revision_analysis.R | 160 ++++++++++-------- man/epiprocess-package.Rd | 1 + ...vision_summary.Rd => revision_analysis.Rd} | 29 +++- 4 files changed, 115 insertions(+), 77 deletions(-) rename man/{revision_summary.Rd => revision_analysis.Rd} (89%) diff --git a/NAMESPACE b/NAMESPACE index 3582fe9c7..1c548309e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -53,6 +53,7 @@ S3method(print,epi_archive) S3method(print,epi_df) S3method(print,grouped_epi_archive) S3method(print,growth_rate_params) +S3method(print,revision_behavior) S3method(summary,epi_df) S3method(ungroup,epi_df) S3method(ungroup,grouped_epi_archive) @@ -99,6 +100,7 @@ export(new_epi_archive) export(new_epi_df) export(relocate) export(rename) +export(revision_analysis) export(revision_summary) export(set_versions_end) export(slice) diff --git a/R/revision_analysis.R b/R/revision_analysis.R index f36dcc16a..705d244b3 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -34,8 +34,6 @@ #' `NA`'s compactify is run again if `compactify` is `TRUE` to make #' sure there are no duplicate values from occasions when the signal is #' revised to `NA`, and then back to its immediately-preceding value. -#' @param print_inform bool, determines whether to print summary information, or -#' only return the full summary tibble #' @param min_waiting_period `difftime`, integer or `NULL`. Sets a cutoff: any #' time_values that have not had at least `min_waiting_period` to stabilize as #' of the `versions_end` are removed. `min_waiting_period` should characterize @@ -77,7 +75,7 @@ #' contains jumps at year boundaries. #' #' @examples -#' revision_example <- revision_summary(archive_cases_dv_subset, percent_cli) +#' revision_example <- revision_analysis(archive_cases_dv_subset, percent_cli) #' revision_example %>% arrange(desc(spread)) #' #' @export @@ -86,20 +84,20 @@ #' @importFrom vctrs vec_cast #' @importFrom dplyr mutate group_by arrange filter if_any all_of across pull pick c_across #' everything ungroup summarize if_else %>% -revision_summary <- function(epi_arch, - ..., - drop_nas = TRUE, - print_inform = TRUE, - min_waiting_period = as.difftime(60, units = "days") %>% - difftime_approx_ceiling_time_delta(epi_arch$time_type), - within_latest = 0.2, - quick_revision = as.difftime(3, units = "days") %>% - difftime_approx_ceiling_time_delta(epi_arch$time_type), - few_revisions = 3, - abs_spread_threshold = NULL, - rel_spread_threshold = 0.1, - compactify = TRUE, - compactify_abs_tol = 0) { +revision_analysis <- function(epi_arch, + ..., + drop_nas = TRUE, + min_waiting_period = as.difftime(60, units = "days") %>% + difftime_approx_ceiling_time_delta(epi_arch$time_type), + within_latest = 0.2, + quick_revision = as.difftime(3, units = "days") %>% + difftime_approx_ceiling_time_delta(epi_arch$time_type), + few_revisions = 3, + abs_spread_threshold = NULL, + rel_spread_threshold = 0.1, + compactify = TRUE, + compactify_abs_tol = 0, + return_only_tibble = FALSE) { assert_class(epi_arch, "epi_archive") # if the column to summarize isn't specified, use the only one if there is only one if (dots_n(...) == 0) { @@ -193,63 +191,85 @@ revision_summary <- function(epi_arch, time_value, geo_value, all_of(epikey_names), n_revisions, min_lag, max_lag, # nolint: object_usage_linter lag_near_latest, spread, rel_spread, min_value, max_value, median_value # nolint: object_usage_linter ) - if (print_inform) { - cli_inform("Min lag (time to first version):") - time_delta_summary(revision_behavior$min_lag, time_type) %>% print() - if (!drop_nas) { - total_na <- epi_arch$DT %>% - filter(is.na(c_across(!!arg))) %>% # nolint: object_usage_linter - nrow() - cli_inform("Fraction of all versions that are `NA`:") - cli_li(num_percent(total_na, nrow(epi_arch$DT), "")) - cli_inform("") - } - cli_inform("Fraction of epi_key+time_values with") - total_num <- nrow(revision_behavior) # nolint: object_usage_linter - total_num_unrevised <- sum(revision_behavior$n_revisions == 0) # nolint: object_usage_linter - cli_inform("No revisions:") - cli_li(num_percent(total_num_unrevised, total_num, "")) - total_quickly_revised <- sum( # nolint: object_usage_linter - time_delta_to_n_steps(revision_behavior$max_lag, time_type) <= - time_delta_to_n_steps(quick_revision, time_type) - ) - cli_inform("Quick revisions (last revision within {format_time_delta(quick_revision, time_type)} - of the `time_value`):") - cli_li(num_percent(total_quickly_revised, total_num, "")) - total_barely_revised <- sum( # nolint: object_usage_linter - revision_behavior$n_revisions <= - few_revisions - ) - cli_inform("Few revisions (At most {few_revisions} revisions for that `time_value`):") - cli_li(num_percent(total_barely_revised, total_num, "")) - cli_inform("") - cli_inform("Fraction of revised epi_key+time_values which have:") - - real_revisions <- revision_behavior %>% filter(n_revisions > 0) # nolint: object_usage_linter - n_real_revised <- nrow(real_revisions) # nolint: object_usage_linter - rel_spread <- sum( # nolint: object_usage_linter - real_revisions$rel_spread < - rel_spread_threshold, - na.rm = TRUE - ) + sum(is.na(real_revisions$rel_spread)) - cli_inform("Less than {rel_spread_threshold} spread in relative value:") - cli_li(num_percent(rel_spread, n_real_revised, "")) - abs_spread <- sum( # nolint: object_usage_linter - real_revisions$spread > - abs_spread_threshold - ) # nolint: object_usage_linter - cli_inform("Spread of more than {abs_spread_threshold} in actual value (when revised):") - cli_li(num_percent(abs_spread, n_real_revised, "")) - - # time_type_unit_pluralizer[[time_type]] is a format string controlled by us - # and/or downstream devs, so we can paste it onto our format string safely: - units_plural <- pluralize(paste0("{qty(2)}", time_type_unit_pluralizer[[time_type]])) # nolint: object_usage_linter - cli_inform("{toTitleCase(units_plural)} until within {within_latest*100}% of the latest value:") - time_delta_summary(revision_behavior[["lag_near_latest"]], time_type) %>% print() + total_na <- epi_arch$DT %>% + filter(is.na(c_across(!!arg))) %>% # nolint: object_usage_linter + nrow() + if (!return_only_tibble) { + revision_behavior <- structure(list( + revision_behavior = revision_behavior, + range_time_values = range(epi_arch$DT$time_value), + signal_variable = arg, + drop_nas = drop_nas, + time_type = time_type, + total_na = total_na, + n_obs = nrow(epi_arch$DT), + quick_revision = quick_revision, + few_revisions = few_revisions, + rel_spread_threshold = rel_spread_threshold, + abs_spread_threshold = abs_spread_threshold, + within_latest = within_latest + ), class = "revision_behavior") } return(revision_behavior) } +#' @export +print.revision_behavior <- function(x, ...) { + cli::cli_h2("An epi_archive spanning {.val {x$range_time_values[1]}} to {.val {x$range_time_values[1]}}.") + cli::cli_h3("Min lag (time to first version):") + time_delta_summary(x$revision_behavior$min_lag, x$time_type) %>% print() + if (!x$drop_nas) { + cli_inform("Fraction of all versions that are `NA`:") + cli_li(num_percent(x$total_na, x$n_obs, "")) + cli_inform("") + } + cli::cli_h3("Fraction of epi_key + time_values with") + total_num <- nrow(x$revision_behavior) # nolint: object_usage_linter + total_num_unrevised <- sum(x$n_revisions == 0) # nolint: object_usage_linter + cli_inform("No revisions:") + cli_li(num_percent(total_num_unrevised, total_num, "")) + total_quickly_revised <- sum( # nolint: object_usage_linter + time_delta_to_n_steps(x$revision_behavior$max_lag, x$time_type) <= + time_delta_to_n_steps(x$quick_revision, x$time_type) + ) + cli_inform("Quick revisions (last revision within {format_time_delta(x$quick_revision, x$time_type)} + of the `time_value`):") + cli_li(num_percent(total_quickly_revised, total_num, "")) + total_barely_revised <- sum( # nolint: object_usage_linter + x$n_revisions <= x$few_revisions + ) + cli_inform("Few revisions (At most {x$few_revisions} revisions for that `time_value`):") + cli_li(num_percent(total_barely_revised, total_num, "")) + + cli::cli_h3("Fraction of revised epi_key + time_values which have:") + + real_revisions <- x$revision_behavior %>% filter(n_revisions > 0) # nolint: object_usage_linter + n_real_revised <- nrow(real_revisions) # nolint: object_usage_linter + rel_spread <- sum( # nolint: object_usage_linter + real_revisions$rel_spread < + x$rel_spread_threshold, + na.rm = TRUE + ) + sum(is.na(real_revisions$rel_spread)) + cli_inform("Less than {x$rel_spread_threshold} spread in relative value:") + cli_li(num_percent(rel_spread, n_real_revised, "")) + abs_spread <- sum( # nolint: object_usage_linter + real_revisions$spread > + x$abs_spread_threshold + ) # nolint: object_usage_linter + cli_inform("Spread of more than {x$abs_spread_threshold} in actual value (when revised):") + cli_li(num_percent(abs_spread, n_real_revised, "")) + + # time_type_unit_pluralizer[[time_type]] is a format string controlled by us + # and/or downstream devs, so we can paste it onto our format string safely: + units_plural <- pluralize(paste0("{qty(2)}", time_type_unit_pluralizer[[x$time_type]])) # nolint: object_usage_linter + cli::cli_h3("{toTitleCase(units_plural)} until within {x$within_latest*100}% of the latest value:") + time_delta_summary(x$revision_behavior[["lag_near_latest"]], x$time_type) %>% print() +} + +#' @export +#' @rdname revision_analysis +revision_summary <- revision_analysis + #' pull the value from lags when values starts indefinitely being within prop of its latest value. #' @param lags vector of lags; should be sorted #' @param values this should be a vector (e.g., a column) with length matching that of `lags` diff --git a/man/epiprocess-package.Rd b/man/epiprocess-package.Rd index fe79c01e8..12e8a3c42 100644 --- a/man/epiprocess-package.Rd +++ b/man/epiprocess-package.Rd @@ -11,6 +11,7 @@ This package introduces common data structures for working with epidemiological \seealso{ Useful links: \itemize{ + \item \url{https://github.com/cmu-delphi/epiprocess} \item \url{https://cmu-delphi.github.io/epiprocess/} } diff --git a/man/revision_summary.Rd b/man/revision_analysis.Rd similarity index 89% rename from man/revision_summary.Rd rename to man/revision_analysis.Rd index 54abf098c..e15aa92a5 100644 --- a/man/revision_summary.Rd +++ b/man/revision_analysis.Rd @@ -1,14 +1,31 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/revision_analysis.R -\name{revision_summary} +\name{revision_analysis} +\alias{revision_analysis} \alias{revision_summary} \title{A function to describe revision behavior for an archive.} \usage{ +revision_analysis( + epi_arch, + ..., + drop_nas = TRUE, + min_waiting_period = as.difftime(60, units = "days") \%>\% + difftime_approx_ceiling_time_delta(epi_arch$time_type), + within_latest = 0.2, + quick_revision = as.difftime(3, units = "days") \%>\% + difftime_approx_ceiling_time_delta(epi_arch$time_type), + few_revisions = 3, + abs_spread_threshold = NULL, + rel_spread_threshold = 0.1, + compactify = TRUE, + compactify_abs_tol = 0, + return_only_tibble = FALSE +) + revision_summary( epi_arch, ..., drop_nas = TRUE, - print_inform = TRUE, min_waiting_period = as.difftime(60, units = "days") \%>\% difftime_approx_ceiling_time_delta(epi_arch$time_type), within_latest = 0.2, @@ -18,7 +35,8 @@ revision_summary( abs_spread_threshold = NULL, rel_spread_threshold = 0.1, compactify = TRUE, - compactify_abs_tol = 0 + compactify_abs_tol = 0, + return_only_tibble = FALSE ) } \arguments{ @@ -34,9 +52,6 @@ If supplied, \code{...} must select exactly one column.} sure there are no duplicate values from occasions when the signal is revised to \code{NA}, and then back to its immediately-preceding value.} -\item{print_inform}{bool, determines whether to print summary information, or -only return the full summary tibble} - \item{min_waiting_period}{\code{difftime}, integer or \code{NULL}. Sets a cutoff: any time_values that have not had at least \code{min_waiting_period} to stabilize as of the \code{versions_end} are removed. \code{min_waiting_period} should characterize @@ -113,7 +128,7 @@ produce incorrect results for some calculations, since week numbering contains jumps at year boundaries. } \examples{ -revision_example <- revision_summary(archive_cases_dv_subset, percent_cli) +revision_example <- revision_analysis(archive_cases_dv_subset, percent_cli) revision_example \%>\% arrange(desc(spread)) } From cc64f70a62c625c31d6e1d12edac7dfa06247b87 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 8 Apr 2025 16:24:19 -0700 Subject: [PATCH 02/14] adjust tests to match --- .../_snaps/revision-latency-functions.md | 132 +++++++++++++----- .../test-revision-latency-functions.R | 31 ++-- 2 files changed, 115 insertions(+), 48 deletions(-) diff --git a/tests/testthat/_snaps/revision-latency-functions.md b/tests/testthat/_snaps/revision-latency-functions.md index 4f2bfe269..e04e6c1f8 100644 --- a/tests/testthat/_snaps/revision-latency-functions.md +++ b/tests/testthat/_snaps/revision-latency-functions.md @@ -1,29 +1,41 @@ # revision_summary works for dummy datasets Code - dummy_ex %>% revision_summary() %>% print(n = 10, width = 300) + rs1 Message - Min lag (time to first version): + + -- An epi_archive spanning 2020-01-01 to 2020-01-01. -- + + -- Min lag (time to first version): Output min median mean max 0 days 1 days 1.6 days 4 days Message - Fraction of epi_key+time_values with + + -- Fraction of epi_key + time_values with No revisions: - * 3 out of 7 (42.86%) + * 0 out of 7 (0%) Quick revisions (last revision within 3 days of the `time_value`): * 4 out of 7 (57.14%) Few revisions (At most 3 revisions for that `time_value`): - * 6 out of 7 (85.71%) - Fraction of revised epi_key+time_values which have: + * 0 out of 7 (0%) + + -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: * 1 out of 4 (25%) Spread of more than 5.1 in actual value (when revised): * 3 out of 4 (75%) - Days until within 20% of the latest value: + + -- Days until within 20% of the latest value: Output min median mean max 0 days 3 days 6.9 days 19 days + +--- + + Code + rs1$revision_behavior %>% print(n = 10, width = 300) + Output # A tibble: 7 x 11 time_value geo_value n_revisions min_lag max_lag lag_near_latest spread @@ -47,31 +59,43 @@ --- Code - dummy_ex %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300) + rs2 Message - Min lag (time to first version): + + -- An epi_archive spanning 2020-01-01 to 2020-01-01. -- + + -- Min lag (time to first version): Output min median mean max 0 days 1 days 1.4 days 4 days Message Fraction of all versions that are `NA`: * 2 out of 19 (10.53%) - Fraction of epi_key+time_values with + + -- Fraction of epi_key + time_values with No revisions: - * 2 out of 7 (28.57%) + * 0 out of 7 (0%) Quick revisions (last revision within 3 days of the `time_value`): * 4 out of 7 (57.14%) Few revisions (At most 3 revisions for that `time_value`): - * 6 out of 7 (85.71%) - Fraction of revised epi_key+time_values which have: + * 0 out of 7 (0%) + + -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: * 2 out of 5 (40%) Spread of more than 5.1 in actual value (when revised): * 3 out of 5 (60%) - Days until within 20% of the latest value: + + -- Days until within 20% of the latest value: Output min median mean max 0 days 3 days 6.9 days 19 days + +--- + + Code + rs2$revision_behavior %>% print(n = 10, width = 300) + Output # A tibble: 7 x 11 time_value geo_value n_revisions min_lag max_lag lag_near_latest spread @@ -95,31 +119,43 @@ --- Code - dummy_ex_weekly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300) + rs3 Message - Min lag (time to first version): + + -- An epi_archive spanning 2020-01-01 to 2020-01-01. -- + + -- Min lag (time to first version): Output min median mean max 0 weeks 1 weeks 1.4 weeks 4 weeks Message Fraction of all versions that are `NA`: * 2 out of 19 (10.53%) - Fraction of epi_key+time_values with + + -- Fraction of epi_key + time_values with No revisions: - * 2 out of 7 (28.57%) + * 0 out of 7 (0%) Quick revisions (last revision within 1 week of the `time_value`): * 2 out of 7 (28.57%) Few revisions (At most 3 revisions for that `time_value`): - * 6 out of 7 (85.71%) - Fraction of revised epi_key+time_values which have: + * 0 out of 7 (0%) + + -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: * 2 out of 5 (40%) Spread of more than 5.1 in actual value (when revised): * 3 out of 5 (60%) - Weeks until within 20% of the latest value: + + -- Weeks until within 20% of the latest value: Output min median mean max 0 weeks 3 weeks 6.9 weeks 19 weeks + +--- + + Code + rs3$revision_behavior %>% print(n = 10, width = 300) + Output # A tibble: 7 x 11 time_value geo_value n_revisions min_lag max_lag lag_near_latest spread @@ -143,32 +179,43 @@ --- Code - dummy_ex_yearmonthly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, - width = 300) + rs4 Message - Min lag (time to first version): + + -- An epi_archive spanning 2020 Jan to 2020 Jan. -- + + -- Min lag (time to first version): Output min median mean max 0 1 1.4 4 Message Fraction of all versions that are `NA`: * 2 out of 19 (10.53%) - Fraction of epi_key+time_values with + + -- Fraction of epi_key + time_values with No revisions: - * 2 out of 7 (28.57%) + * 0 out of 7 (0%) Quick revisions (last revision within 1 month of the `time_value`): * 2 out of 7 (28.57%) Few revisions (At most 3 revisions for that `time_value`): - * 6 out of 7 (85.71%) - Fraction of revised epi_key+time_values which have: + * 0 out of 7 (0%) + + -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: * 2 out of 5 (40%) Spread of more than 5.1 in actual value (when revised): * 3 out of 5 (60%) - Months until within 20% of the latest value: + + -- Months until within 20% of the latest value: Output min median mean max 0 3 6.9 19 + +--- + + Code + rs4$revision_behavior %>% print(n = 10, width = 300) + Output # A tibble: 7 x 11 time_value geo_value n_revisions min_lag max_lag lag_near_latest spread @@ -192,32 +239,43 @@ --- Code - dummy_ex_integerly %>% revision_summary(min_waiting_period = 60, - quick_revision = 3, drop_nas = FALSE) %>% print(n = 10, width = 300) + rs5 Message - Min lag (time to first version): + + -- An epi_archive spanning 1 to 1. -- + + -- Min lag (time to first version): Output min median mean max 0 1 1.4 4 Message Fraction of all versions that are `NA`: * 2 out of 19 (10.53%) - Fraction of epi_key+time_values with + + -- Fraction of epi_key + time_values with No revisions: - * 2 out of 7 (28.57%) + * 0 out of 7 (0%) Quick revisions (last revision within 3 time steps of the `time_value`): * 4 out of 7 (57.14%) Few revisions (At most 3 revisions for that `time_value`): - * 6 out of 7 (85.71%) - Fraction of revised epi_key+time_values which have: + * 0 out of 7 (0%) + + -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: * 2 out of 5 (40%) Spread of more than 5.1 in actual value (when revised): * 3 out of 5 (60%) - Time Steps until within 20% of the latest value: + + -- Time Steps until within 20% of the latest value: Output min median mean max 0 3 6.9 19 + +--- + + Code + rs5$revision_behavior %>% print(n = 10, width = 300) + Output # A tibble: 7 x 11 time_value geo_value n_revisions min_lag max_lag lag_near_latest spread diff --git a/tests/testthat/test-revision-latency-functions.R b/tests/testthat/test-revision-latency-functions.R index b636bf32e..031cf4ac3 100644 --- a/tests/testthat/test-revision-latency-functions.R +++ b/tests/testthat/test-revision-latency-functions.R @@ -64,27 +64,36 @@ dummy_ex_integerly <- dummy_ex$DT %>% stopifnot(dummy_ex_integerly$time_type == "integer") test_that("revision_summary works for dummy datasets", { - expect_snapshot(dummy_ex %>% revision_summary() %>% print(n = 10, width = 300)) - expect_snapshot(dummy_ex %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300)) + rs1 <- dummy_ex %>% revision_summary() + rs2 <- dummy_ex %>% revision_summary(drop_nas = FALSE) + expect_snapshot(rs1) + expect_snapshot(rs1$revision_behavior %>% print(n = 10, width = 300)) + expect_snapshot(rs2) + expect_snapshot(rs2$revision_behavior %>% print(n = 10, width = 300)) # Weekly dummy is mostly just "day" -> "week", but quick-revision summary changes: - expect_snapshot(dummy_ex_weekly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300)) + rs3 <- dummy_ex_weekly %>% revision_summary(drop_nas = FALSE) + expect_snapshot(rs3) + expect_snapshot(rs3$revision_behavior %>% print(n = 10, width = 300)) # Yearmonthly has the same story. It would have been close to encountering # min_waiting_period-based filtering but we actually set its versions_end to # sometime in 2080 rather than 2022: - expect_snapshot(dummy_ex_yearmonthly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300)) + rs4 <- dummy_ex_yearmonthly %>% revision_summary(drop_nas = FALSE) + expect_snapshot(rs4) + expect_snapshot(rs4$revision_behavior %>% print(n = 10, width = 300)) # Integer is very much like daily. We have to provide some of the # configuration arguments since we have no idea about what the integers # represent. If the possible integers being used have large jumps like # YYYYww-as-integer epiweek labels (e.g., 200053 jumps to 200101) or are # regularly spaced apart but by more than 1, we'll still be producing # something nonsensical, but we tried. - expect_snapshot(dummy_ex_integerly %>% + rs5 <- dummy_ex_integerly %>% revision_summary( min_waiting_period = 60, quick_revision = 3, drop_nas = FALSE - ) %>% - print(n = 10, width = 300)) + ) + expect_snapshot(rs5) + expect_snapshot(rs5$revision_behavior %>% print(n = 10, width = 300)) }) test_that("tidyselect is functional", { @@ -133,7 +142,7 @@ test_that("revision_summary default min_waiting_period works as expected", { value = 1:2 ) %>% as_epi_archive(versions_end = as.Date("2020-01-01") + 1 + 59) %>% - revision_summary(print_inform = FALSE) %>% + revision_summary(return_only_tibble = TRUE) %>% pull(time_value), as.Date("2020-01-01") ) @@ -146,7 +155,7 @@ test_that("revision_summary default min_waiting_period works as expected", { value = 1:2 ) %>% as_epi_archive(versions_end = as.Date("2020-01-01") + 7 + 56) %>% - revision_summary(print_inform = FALSE) %>% + revision_summary(return_only_tibble = TRUE) %>% pull(time_value), as.Date("2020-01-01") ) @@ -159,7 +168,7 @@ test_that("revision_summary default min_waiting_period works as expected", { value = 1:2 ) %>% as_epi_archive(versions_end = tsibble::make_yearmonth(2000, 3)) %>% - revision_summary(print_inform = FALSE) %>% + revision_summary(return_only_tibble = TRUE) %>% pull(time_value), tsibble::make_yearmonth(2000, 1) ) @@ -172,7 +181,7 @@ test_that("revision_summary default min_waiting_period works as expected", { value = 1:2 ) %>% as_epi_archive(versions_end = 1 + 1 + 59) %>% - revision_summary(print_inform = FALSE), + revision_summary(return_only_tibble = TRUE), regexp = "Unsupported time_type" ) }) From 03cf6e3967891c1ca29d5c6058ec4350e6e23333 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 8 Apr 2025 16:33:25 -0700 Subject: [PATCH 03/14] describe changes in news and redocument --- DESCRIPTION | 2 +- NEWS.md | 9 +++++++-- R/revision_analysis.R | 6 ++++++ man/revision_analysis.Rd | 7 +++++++ 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f75879e97..1b39d8b5b 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.11.3 +Version: 0.11.5 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", , "lcbrooks+github@andrew.cmu.edu", role = c("aut", "cre")), diff --git a/NEWS.md b/NEWS.md index 88acabee4..49685e7c2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -22,7 +22,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat for users without a compiler, we have placed `{trendfilter}` in Suggests:; if you want to use `method = "trendfilter"` you will need to manually install this dependency (e.g., with `remotes::install_github("glmgen/trendfilter")`). -- In `revision_summary()`: +- In `revision_summary()` (aliased to `revision_analysis()`): - The `should_compactify` argument is now called `compactify`. To migrate, change any calls with `should_compactfiy =` to `compactify =`. - Output now uses the name `lag_near_latest` instead of `time_near_latest`. To @@ -36,6 +36,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat - `min_waiting_period` now defines a nonstrict inequality instead of a strict one. To obtain the old bounds, bump the `min_waiting_period` up to the next possible value for your `time_type`. + - Added a `print()` method to take the place of `print_inform`. + - Removed the the argument `print_inform`. This is now always false. Replaced + with an option, `return_only_tibble` to return only the tibble of results + rather than the full S3 object. + - In `key_colnames()`: - On regular (non-`epi_df`) data frames, now requires manual specification of `geo_keys`, `other_keys`, and `time_keys`. @@ -53,7 +58,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat `.facet_filter` which allows explicit selection of facets to show. ## Improvements -- `revision_summary()` now supports all `time_type`s. +- `revision_summary()` now supports all `time_type`s. Printed summary is more attractive. - The compactification tolerance setting now works with integer-type columns. - Various functions are now faster, using faster variants of core operations and avoiding reconstructing grouped `epi_df`s when unnecessary. diff --git a/R/revision_analysis.R b/R/revision_analysis.R index 705d244b3..49d5b3d30 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -73,6 +73,12 @@ #' Using a `time_type` of `"integer"` with week numbers like 202501 will #' produce incorrect results for some calculations, since week numbering #' contains jumps at year boundaries. +#' +#' @return An S3 object with class `revision_behavior`. This function is typically +#' called for the purposes of inspecting the printed output. The +#' results of the computations are available in +#' `revision_analysis(...)$revision_behavior`. If you only want to access +#' the internal computations, use `return_only_tibble = TRUE`. #' #' @examples #' revision_example <- revision_analysis(archive_cases_dv_subset, percent_cli) diff --git a/man/revision_analysis.Rd b/man/revision_analysis.Rd index e15aa92a5..fc8ef217b 100644 --- a/man/revision_analysis.Rd +++ b/man/revision_analysis.Rd @@ -91,6 +91,13 @@ requested signal. The default is \code{TRUE}.} \item{compactify_abs_tol}{length-1 double, used if \code{compactify} is \code{TRUE}, it determines the threshold for when two doubles are considered identical.} } +\value{ +An S3 object with class \code{revision_behavior}. This function is typically +called for the purposes of inspecting the printed output. The +results of the computations are available in +\code{revision_analysis(...)$revision_behavior}. If you only want to access +the internal computations, use \code{return_only_tibble = TRUE}. +} \description{ \code{revision_summary} removes all missing values (if requested), and then computes some basic statistics about the revision behavior of an archive, From 226ba492475e0476ac5038ae714ae598a2111a30 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 8 Apr 2025 17:05:56 -0700 Subject: [PATCH 04/14] fix: doc missing arg, fix vignette --- R/revision_analysis.R | 2 ++ man/revision_analysis.Rd | 3 +++ vignettes/epi_archive.Rmd | 7 ++++--- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/R/revision_analysis.R b/R/revision_analysis.R index 49d5b3d30..25e45bbeb 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -65,6 +65,8 @@ #' requested signal. The default is `TRUE`. #' @param compactify_abs_tol length-1 double, used if `compactify` is `TRUE`, it #' determines the threshold for when two doubles are considered identical. +#' @param return_only_tibble boolean to return only the simple `tibble` of +#' computational results rather than the complete S3 object. #' #' @details Applies to `epi_archive`s with `time_type`s of `"day"`, `"week"`, #' and `"yearmonth"`. It can also work with a `time_type` of `"integer"` if diff --git a/man/revision_analysis.Rd b/man/revision_analysis.Rd index fc8ef217b..89683afb8 100644 --- a/man/revision_analysis.Rd +++ b/man/revision_analysis.Rd @@ -90,6 +90,9 @@ requested signal. The default is \code{TRUE}.} \item{compactify_abs_tol}{length-1 double, used if \code{compactify} is \code{TRUE}, it determines the threshold for when two doubles are considered identical.} + +\item{return_only_tibble}{boolean to return only the simple \code{tibble} of +computational results rather than the complete S3 object.} } \value{ An S3 object with class \code{revision_behavior}. This function is typically diff --git a/vignettes/epi_archive.Rmd b/vignettes/epi_archive.Rmd index f87ea2915..cae40dfa6 100644 --- a/vignettes/epi_archive.Rmd +++ b/vignettes/epi_archive.Rmd @@ -162,7 +162,8 @@ addition to the per key summary, it also returns an overall summary. Here is an a sample of the output: ```{r} -revision_details <- revision_summary(dv_archive, print_inform = TRUE) +revision_details <- revision_summary(dv_archive) +revision_details ``` We can see from the output that, as mentioned above, this data set has a lot of @@ -174,9 +175,9 @@ inspect the returned `revision_details` tibble. Here we collect a number of statistics for each state: ```{r} -revision_details %>% +revision_details$revision_behavior %>% group_by(geo_value) %>% - summarise( + summarize( n_rev = mean(n_revisions), min_lag = min(min_lag), max_lag = max(max_lag), From 91bbde3ef4c362d59b1e5b90c4c96fa53fd52a6f Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 8 Apr 2025 17:31:53 -0700 Subject: [PATCH 05/14] pass local checks --- R/revision_analysis.R | 2 +- man/revision_analysis.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/revision_analysis.R b/R/revision_analysis.R index 25e45bbeb..77f5ebfae 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -84,7 +84,7 @@ #' #' @examples #' revision_example <- revision_analysis(archive_cases_dv_subset, percent_cli) -#' revision_example %>% arrange(desc(spread)) +#' revision_example$revision_behavior %>% arrange(desc(spread)) #' #' @export #' @importFrom cli cli_inform cli_abort cli_li diff --git a/man/revision_analysis.Rd b/man/revision_analysis.Rd index 89683afb8..737e5ec0a 100644 --- a/man/revision_analysis.Rd +++ b/man/revision_analysis.Rd @@ -139,6 +139,6 @@ contains jumps at year boundaries. } \examples{ revision_example <- revision_analysis(archive_cases_dv_subset, percent_cli) -revision_example \%>\% arrange(desc(spread)) +revision_example$revision_behavior \%>\% arrange(desc(spread)) } From 68f8c500c93781ad80572ee73ceef32eb165d763 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 9 Apr 2025 10:56:54 -0500 Subject: [PATCH 06/14] formatting --- R/revision_analysis.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/revision_analysis.R b/R/revision_analysis.R index 77f5ebfae..f33cdd827 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -75,10 +75,10 @@ #' Using a `time_type` of `"integer"` with week numbers like 202501 will #' produce incorrect results for some calculations, since week numbering #' contains jumps at year boundaries. -#' +#' #' @return An S3 object with class `revision_behavior`. This function is typically #' called for the purposes of inspecting the printed output. The -#' results of the computations are available in +#' results of the computations are available in #' `revision_analysis(...)$revision_behavior`. If you only want to access #' the internal computations, use `return_only_tibble = TRUE`. #' @@ -205,7 +205,7 @@ revision_analysis <- function(epi_arch, if (!return_only_tibble) { revision_behavior <- structure(list( revision_behavior = revision_behavior, - range_time_values = range(epi_arch$DT$time_value), + range_time_values = range(epi_arch$DT$time_value), signal_variable = arg, drop_nas = drop_nas, time_type = time_type, @@ -248,9 +248,9 @@ print.revision_behavior <- function(x, ...) { ) cli_inform("Few revisions (At most {x$few_revisions} revisions for that `time_value`):") cli_li(num_percent(total_barely_revised, total_num, "")) - + cli::cli_h3("Fraction of revised epi_key + time_values which have:") - + real_revisions <- x$revision_behavior %>% filter(n_revisions > 0) # nolint: object_usage_linter n_real_revised <- nrow(real_revisions) # nolint: object_usage_linter rel_spread <- sum( # nolint: object_usage_linter @@ -266,7 +266,7 @@ print.revision_behavior <- function(x, ...) { ) # nolint: object_usage_linter cli_inform("Spread of more than {x$abs_spread_threshold} in actual value (when revised):") cli_li(num_percent(abs_spread, n_real_revised, "")) - + # time_type_unit_pluralizer[[time_type]] is a format string controlled by us # and/or downstream devs, so we can paste it onto our format string safely: units_plural <- pluralize(paste0("{qty(2)}", time_type_unit_pluralizer[[x$time_type]])) # nolint: object_usage_linter From 496178a49fdfd9ec288471a660abed7dc329ebd2 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 9 Apr 2025 11:29:25 -0500 Subject: [PATCH 07/14] accessing wrong object, renaming x, fixed snapshot --- R/revision_analysis.R | 41 ++++++++++--------- .../_snaps/revision-latency-functions.md | 20 ++++----- .../test-revision-latency-functions.R | 4 +- 3 files changed, 33 insertions(+), 32 deletions(-) diff --git a/R/revision_analysis.R b/R/revision_analysis.R index f33cdd827..941b6fb2d 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -222,56 +222,57 @@ revision_analysis <- function(epi_arch, } #' @export -print.revision_behavior <- function(x, ...) { - cli::cli_h2("An epi_archive spanning {.val {x$range_time_values[1]}} to {.val {x$range_time_values[1]}}.") +print.revision_behavior <- function(rev_beh, ...) { + revision_behavior <- rev_beh$revision_behavior + cli::cli_h2("An epi_archive spanning {.val {rev_beh$range_time_values[1]}} to {.val {rev_beh$range_time_values[1]}}.") cli::cli_h3("Min lag (time to first version):") - time_delta_summary(x$revision_behavior$min_lag, x$time_type) %>% print() - if (!x$drop_nas) { + time_delta_summary(revision_behavior$min_lag, rev_beh$time_type) %>% print() + if (!rev_beh$drop_nas) { cli_inform("Fraction of all versions that are `NA`:") - cli_li(num_percent(x$total_na, x$n_obs, "")) + cli_li(num_percent(rev_beh$total_na, rev_beh$n_obs, "")) cli_inform("") } cli::cli_h3("Fraction of epi_key + time_values with") - total_num <- nrow(x$revision_behavior) # nolint: object_usage_linter - total_num_unrevised <- sum(x$n_revisions == 0) # nolint: object_usage_linter + total_num <- nrow(revision_behavior) # nolint: object_usage_linter + total_num_unrevised <- sum(revision_behavior$n_revisions == 0) # nolint: object_usage_linter cli_inform("No revisions:") cli_li(num_percent(total_num_unrevised, total_num, "")) total_quickly_revised <- sum( # nolint: object_usage_linter - time_delta_to_n_steps(x$revision_behavior$max_lag, x$time_type) <= - time_delta_to_n_steps(x$quick_revision, x$time_type) + time_delta_to_n_steps(revision_behavior$max_lag, rev_beh$time_type) <= + time_delta_to_n_steps(rev_beh$quick_revision, rev_beh$time_type) ) - cli_inform("Quick revisions (last revision within {format_time_delta(x$quick_revision, x$time_type)} + cli_inform("Quick revisions (last revision within {format_time_delta(rev_beh$quick_revision, rev_beh$time_type)} of the `time_value`):") cli_li(num_percent(total_quickly_revised, total_num, "")) total_barely_revised <- sum( # nolint: object_usage_linter - x$n_revisions <= x$few_revisions + revision_behavior$n_revisions <= rev_beh$few_revisions ) - cli_inform("Few revisions (At most {x$few_revisions} revisions for that `time_value`):") + cli_inform("Few revisions (At most {rev_beh$few_revisions} revisions for that `time_value`):") cli_li(num_percent(total_barely_revised, total_num, "")) cli::cli_h3("Fraction of revised epi_key + time_values which have:") - real_revisions <- x$revision_behavior %>% filter(n_revisions > 0) # nolint: object_usage_linter + real_revisions <- revision_behavior %>% filter(n_revisions > 0) # nolint: object_usage_linter n_real_revised <- nrow(real_revisions) # nolint: object_usage_linter rel_spread <- sum( # nolint: object_usage_linter real_revisions$rel_spread < - x$rel_spread_threshold, + rev_beh$rel_spread_threshold, na.rm = TRUE ) + sum(is.na(real_revisions$rel_spread)) - cli_inform("Less than {x$rel_spread_threshold} spread in relative value:") + cli_inform("Less than {rev_beh$rel_spread_threshold} spread in relative value:") cli_li(num_percent(rel_spread, n_real_revised, "")) abs_spread <- sum( # nolint: object_usage_linter real_revisions$spread > - x$abs_spread_threshold + rev_beh$abs_spread_threshold ) # nolint: object_usage_linter - cli_inform("Spread of more than {x$abs_spread_threshold} in actual value (when revised):") + cli_inform("Spread of more than {rev_beh$abs_spread_threshold} in actual value (when revised):") cli_li(num_percent(abs_spread, n_real_revised, "")) # time_type_unit_pluralizer[[time_type]] is a format string controlled by us # and/or downstream devs, so we can paste it onto our format string safely: - units_plural <- pluralize(paste0("{qty(2)}", time_type_unit_pluralizer[[x$time_type]])) # nolint: object_usage_linter - cli::cli_h3("{toTitleCase(units_plural)} until within {x$within_latest*100}% of the latest value:") - time_delta_summary(x$revision_behavior[["lag_near_latest"]], x$time_type) %>% print() + units_plural <- pluralize(paste0("{qty(2)}", time_type_unit_pluralizer[[rev_beh$time_type]])) # nolint: object_usage_linter + cli::cli_h3("{toTitleCase(units_plural)} until within {rev_beh$within_latest*100}% of the latest value:") + time_delta_summary(revision_behavior[["lag_near_latest"]], rev_beh$time_type) %>% print() } #' @export diff --git a/tests/testthat/_snaps/revision-latency-functions.md b/tests/testthat/_snaps/revision-latency-functions.md index e04e6c1f8..af4f5c20c 100644 --- a/tests/testthat/_snaps/revision-latency-functions.md +++ b/tests/testthat/_snaps/revision-latency-functions.md @@ -14,11 +14,11 @@ -- Fraction of epi_key + time_values with No revisions: - * 0 out of 7 (0%) + * 3 out of 7 (42.86%) Quick revisions (last revision within 3 days of the `time_value`): * 4 out of 7 (57.14%) Few revisions (At most 3 revisions for that `time_value`): - * 0 out of 7 (0%) + * 6 out of 7 (85.71%) -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: @@ -74,11 +74,11 @@ -- Fraction of epi_key + time_values with No revisions: - * 0 out of 7 (0%) + * 2 out of 7 (28.57%) Quick revisions (last revision within 3 days of the `time_value`): * 4 out of 7 (57.14%) Few revisions (At most 3 revisions for that `time_value`): - * 0 out of 7 (0%) + * 6 out of 7 (85.71%) -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: @@ -134,11 +134,11 @@ -- Fraction of epi_key + time_values with No revisions: - * 0 out of 7 (0%) + * 2 out of 7 (28.57%) Quick revisions (last revision within 1 week of the `time_value`): * 2 out of 7 (28.57%) Few revisions (At most 3 revisions for that `time_value`): - * 0 out of 7 (0%) + * 6 out of 7 (85.71%) -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: @@ -194,11 +194,11 @@ -- Fraction of epi_key + time_values with No revisions: - * 0 out of 7 (0%) + * 2 out of 7 (28.57%) Quick revisions (last revision within 1 month of the `time_value`): * 2 out of 7 (28.57%) Few revisions (At most 3 revisions for that `time_value`): - * 0 out of 7 (0%) + * 6 out of 7 (85.71%) -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: @@ -254,11 +254,11 @@ -- Fraction of epi_key + time_values with No revisions: - * 0 out of 7 (0%) + * 2 out of 7 (28.57%) Quick revisions (last revision within 3 time steps of the `time_value`): * 4 out of 7 (57.14%) Few revisions (At most 3 revisions for that `time_value`): - * 0 out of 7 (0%) + * 6 out of 7 (85.71%) -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: diff --git a/tests/testthat/test-revision-latency-functions.R b/tests/testthat/test-revision-latency-functions.R index 031cf4ac3..86d98f8eb 100644 --- a/tests/testthat/test-revision-latency-functions.R +++ b/tests/testthat/test-revision-latency-functions.R @@ -78,7 +78,7 @@ test_that("revision_summary works for dummy datasets", { # Yearmonthly has the same story. It would have been close to encountering # min_waiting_period-based filtering but we actually set its versions_end to # sometime in 2080 rather than 2022: - rs4 <- dummy_ex_yearmonthly %>% revision_summary(drop_nas = FALSE) + rs4 <- dummy_ex_yearmonthly %>% revision_summary(drop_nas = FALSE) expect_snapshot(rs4) expect_snapshot(rs4$revision_behavior %>% print(n = 10, width = 300)) # Integer is very much like daily. We have to provide some of the @@ -91,7 +91,7 @@ test_that("revision_summary works for dummy datasets", { revision_summary( min_waiting_period = 60, quick_revision = 3, drop_nas = FALSE - ) + ) expect_snapshot(rs5) expect_snapshot(rs5$revision_behavior %>% print(n = 10, width = 300)) }) From 1592fd8d397084eb1f0e0bbe07abb3a8c8f9a2f4 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 9 Apr 2025 11:37:08 -0500 Subject: [PATCH 08/14] print needs name x --- R/revision_analysis.R | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/R/revision_analysis.R b/R/revision_analysis.R index 941b6fb2d..785a18f1a 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -222,14 +222,14 @@ revision_analysis <- function(epi_arch, } #' @export -print.revision_behavior <- function(rev_beh, ...) { - revision_behavior <- rev_beh$revision_behavior +print.revision_behavior <- function(x, ...) { + revision_behavior <- x$revision_behavior cli::cli_h2("An epi_archive spanning {.val {rev_beh$range_time_values[1]}} to {.val {rev_beh$range_time_values[1]}}.") cli::cli_h3("Min lag (time to first version):") - time_delta_summary(revision_behavior$min_lag, rev_beh$time_type) %>% print() - if (!rev_beh$drop_nas) { + time_delta_summary(revision_behavior$min_lag, x$time_type) %>% print() + if (!x$drop_nas) { cli_inform("Fraction of all versions that are `NA`:") - cli_li(num_percent(rev_beh$total_na, rev_beh$n_obs, "")) + cli_li(num_percent(x$total_na, x$n_obs, "")) cli_inform("") } cli::cli_h3("Fraction of epi_key + time_values with") @@ -238,14 +238,14 @@ print.revision_behavior <- function(rev_beh, ...) { cli_inform("No revisions:") cli_li(num_percent(total_num_unrevised, total_num, "")) total_quickly_revised <- sum( # nolint: object_usage_linter - time_delta_to_n_steps(revision_behavior$max_lag, rev_beh$time_type) <= - time_delta_to_n_steps(rev_beh$quick_revision, rev_beh$time_type) + time_delta_to_n_steps(revision_behavior$max_lag, x$time_type) <= + time_delta_to_n_steps(x$quick_revision, x$time_type) ) cli_inform("Quick revisions (last revision within {format_time_delta(rev_beh$quick_revision, rev_beh$time_type)} of the `time_value`):") cli_li(num_percent(total_quickly_revised, total_num, "")) total_barely_revised <- sum( # nolint: object_usage_linter - revision_behavior$n_revisions <= rev_beh$few_revisions + revision_behavior$n_revisions <= x$few_revisions ) cli_inform("Few revisions (At most {rev_beh$few_revisions} revisions for that `time_value`):") cli_li(num_percent(total_barely_revised, total_num, "")) @@ -256,23 +256,23 @@ print.revision_behavior <- function(rev_beh, ...) { n_real_revised <- nrow(real_revisions) # nolint: object_usage_linter rel_spread <- sum( # nolint: object_usage_linter real_revisions$rel_spread < - rev_beh$rel_spread_threshold, + x$rel_spread_threshold, na.rm = TRUE ) + sum(is.na(real_revisions$rel_spread)) cli_inform("Less than {rev_beh$rel_spread_threshold} spread in relative value:") cli_li(num_percent(rel_spread, n_real_revised, "")) abs_spread <- sum( # nolint: object_usage_linter real_revisions$spread > - rev_beh$abs_spread_threshold + x$abs_spread_threshold ) # nolint: object_usage_linter cli_inform("Spread of more than {rev_beh$abs_spread_threshold} in actual value (when revised):") cli_li(num_percent(abs_spread, n_real_revised, "")) # time_type_unit_pluralizer[[time_type]] is a format string controlled by us # and/or downstream devs, so we can paste it onto our format string safely: - units_plural <- pluralize(paste0("{qty(2)}", time_type_unit_pluralizer[[rev_beh$time_type]])) # nolint: object_usage_linter + units_plural <- pluralize(paste0("{qty(2)}", time_type_unit_pluralizer[[x$time_type]])) # nolint: object_usage_linter cli::cli_h3("{toTitleCase(units_plural)} until within {rev_beh$within_latest*100}% of the latest value:") - time_delta_summary(revision_behavior[["lag_near_latest"]], rev_beh$time_type) %>% print() + time_delta_summary(revision_behavior[["lag_near_latest"]], x$time_type) %>% print() } #' @export From 48aa65ec65f2f8d462d2b0e2e513a615081457aa Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 9 Apr 2025 10:13:06 -0700 Subject: [PATCH 09/14] move print options to print method --- R/revision_analysis.R | 72 ++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/R/revision_analysis.R b/R/revision_analysis.R index 77f5ebfae..dd4a1dd99 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -43,20 +43,6 @@ #' to `NULL` or 0. #' @param within_latest double between 0 and 1. Determines the threshold #' used for the `lag_to` -#' @param quick_revision difftime or integer (integer is treated as days), for -#' the printed summary, the amount of time between the final revision and the -#' actual time_value to consider the revision quickly resolved. Default of 3 -#' days -#' @param few_revisions integer, for the printed summary, the upper bound on the -#' number of revisions to consider "few". Default is 3. -#' @param abs_spread_threshold length-1 numeric, for the printed summary, the -#' maximum spread used to characterize revisions which don't actually change -#' very much. Default is 5% of the maximum value in the dataset, but this is -#' the most unit dependent of values, and likely needs to be chosen -#' appropriate for the scale of the dataset. -#' @param rel_spread_threshold length-1 double between 0 and 1, for the printed -#' summary, the relative spread fraction used to characterize revisions which -#' don't actually change very much. Default is .1, or 10% of the final value #' @param compactify bool. If `TRUE`, we will compactify after the signal #' requested in `...` has been selected on its own and the `drop_nas` step. #' This helps, for example, to give similar results when called on @@ -98,11 +84,6 @@ revision_analysis <- function(epi_arch, min_waiting_period = as.difftime(60, units = "days") %>% difftime_approx_ceiling_time_delta(epi_arch$time_type), within_latest = 0.2, - quick_revision = as.difftime(3, units = "days") %>% - difftime_approx_ceiling_time_delta(epi_arch$time_type), - few_revisions = 3, - abs_spread_threshold = NULL, - rel_spread_threshold = 0.1, compactify = TRUE, compactify_abs_tol = 0, return_only_tibble = FALSE) { @@ -211,18 +192,41 @@ revision_analysis <- function(epi_arch, time_type = time_type, total_na = total_na, n_obs = nrow(epi_arch$DT), - quick_revision = quick_revision, - few_revisions = few_revisions, - rel_spread_threshold = rel_spread_threshold, - abs_spread_threshold = abs_spread_threshold, within_latest = within_latest - ), class = "revision_behavior") + ), class = "revision_analysis") } return(revision_behavior) } + + + +#' Print a `revision_analysis` object +#' +#' @param x a `revision_analysis` object +#' @param quick_revision Difftime or integer (integer is treated as days). +#' The amount of time between the final revision and the +#' actual time_value to consider the revision quickly resolved. Default of 3 +#' days +#' @param few_revisions Integer. The upper bound on the +#' number of revisions to consider "few". Default is 3. +#' @param abs_spread_threshold Scalar numeric. The +#' maximum spread used to characterize revisions which don't actually change +#' very much. Default is 5% of the maximum value in the dataset, but this is +#' the most unit dependent of values, and likely needs to be chosen +#' appropriate for the scale of the dataset. +#' @param rel_spread_threshold Scalar between 0 and 1. The relative spread fraction used to characterize revisions which +#' don't actually change very much. Default is .1, or 10% of the final value +#' +#' @rdname revision_analysis #' @export -print.revision_behavior <- function(x, ...) { +print.revision_analysis <- function(x, + quick_revision = as.difftime(3, units = "days") %>% + difftime_approx_ceiling_time_delta(x$time_type), + few_revisions = 3, + abs_spread_threshold = NULL, + rel_spread_threshold = 0.1, + ...) { cli::cli_h2("An epi_archive spanning {.val {x$range_time_values[1]}} to {.val {x$range_time_values[1]}}.") cli::cli_h3("Min lag (time to first version):") time_delta_summary(x$revision_behavior$min_lag, x$time_type) %>% print() @@ -238,15 +242,15 @@ print.revision_behavior <- function(x, ...) { cli_li(num_percent(total_num_unrevised, total_num, "")) total_quickly_revised <- sum( # nolint: object_usage_linter time_delta_to_n_steps(x$revision_behavior$max_lag, x$time_type) <= - time_delta_to_n_steps(x$quick_revision, x$time_type) + time_delta_to_n_steps(quick_revision, x$time_type) ) - cli_inform("Quick revisions (last revision within {format_time_delta(x$quick_revision, x$time_type)} + cli_inform("Quick revisions (last revision within {format_time_delta(quick_revision, x$time_type)} of the `time_value`):") cli_li(num_percent(total_quickly_revised, total_num, "")) total_barely_revised <- sum( # nolint: object_usage_linter - x$n_revisions <= x$few_revisions + x$n_revisions <= few_revisions ) - cli_inform("Few revisions (At most {x$few_revisions} revisions for that `time_value`):") + cli_inform("Few revisions (At most {few_revisions} revisions for that `time_value`):") cli_li(num_percent(total_barely_revised, total_num, "")) cli::cli_h3("Fraction of revised epi_key + time_values which have:") @@ -254,17 +258,15 @@ print.revision_behavior <- function(x, ...) { real_revisions <- x$revision_behavior %>% filter(n_revisions > 0) # nolint: object_usage_linter n_real_revised <- nrow(real_revisions) # nolint: object_usage_linter rel_spread <- sum( # nolint: object_usage_linter - real_revisions$rel_spread < - x$rel_spread_threshold, + real_revisions$rel_spread < rel_spread_threshold, na.rm = TRUE ) + sum(is.na(real_revisions$rel_spread)) - cli_inform("Less than {x$rel_spread_threshold} spread in relative value:") + cli_inform("Less than {rel_spread_threshold} spread in relative value:") cli_li(num_percent(rel_spread, n_real_revised, "")) abs_spread <- sum( # nolint: object_usage_linter - real_revisions$spread > - x$abs_spread_threshold + real_revisions$spread > abs_spread_threshold ) # nolint: object_usage_linter - cli_inform("Spread of more than {x$abs_spread_threshold} in actual value (when revised):") + cli_inform("Spread of more than {abs_spread_threshold} in actual value (when revised):") cli_li(num_percent(abs_spread, n_real_revised, "")) # time_type_unit_pluralizer[[time_type]] is a format string controlled by us From 32403fbb3e55d04e3eaea06109ffd66d5ead4e1b Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 9 Apr 2025 10:39:25 -0700 Subject: [PATCH 10/14] rebuild docs and test --- R/revision_analysis.R | 21 +++++++++---------- .../_snaps/revision-latency-functions.md | 2 +- .../test-revision-latency-functions.R | 4 ++-- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/R/revision_analysis.R b/R/revision_analysis.R index 55be566d6..aac77bdde 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -113,11 +113,6 @@ revision_analysis <- function(epi_arch, cli_abort("Not currently implementing more than one column at a time. Run each separately.") } } - if (is.null(abs_spread_threshold)) { - abs_spread_threshold <- .05 * epi_arch$DT %>% - pull(!!arg) %>% - max(na.rm = TRUE) - } # for each time_value, get # the number of revisions # the maximum spread in value (both absolute and relative) @@ -191,6 +186,7 @@ revision_analysis <- function(epi_arch, drop_nas = drop_nas, time_type = time_type, total_na = total_na, + max_val = max(epi_arch$DT[[arg]], na.rm = TRUE), n_obs = nrow(epi_arch$DT), within_latest = within_latest ), class = "revision_analysis") @@ -227,8 +223,9 @@ print.revision_analysis <- function(x, abs_spread_threshold = NULL, rel_spread_threshold = 0.1, ...) { + if (is.null(abs_spread_threshold)) abs_spread_threshold <- .05 * x$max_val rev_beh <- x$revision_behavior - cli::cli_h2("An epi_archive spanning {.val {rev_beh$range_time_values[1]}} to {.val {rev_beh$range_time_values[1]}}.") + cli::cli_h2("An epi_archive spanning {.val {x$range_time_values[1]}} to {.val {x$range_time_values[1]}}.") cli::cli_h3("Min lag (time to first version):") time_delta_summary(rev_beh$min_lag, x$time_type) %>% print() if (!x$drop_nas) { @@ -248,8 +245,8 @@ print.revision_analysis <- function(x, cli_inform("Quick revisions (last revision within {format_time_delta(quick_revision, x$time_type)} of the `time_value`):") cli_li(num_percent(total_quickly_revised, total_num, "")) - total_barely_revised <- sum(x$n_revisions <= few_revisions) - cli_inform("Few revisions (At most {few_revisions} revisions for that `time_value`):") + total_barely_revised <- sum(rev_beh$n_revisions <= few_revisions) + cli_inform("Few revisions (At most {.val {few_revisions}} revisions for that `time_value`):") cli_li(num_percent(total_barely_revised, total_num, "")) cli::cli_h3("Fraction of revised epi_key + time_values which have:") @@ -260,18 +257,20 @@ print.revision_analysis <- function(x, real_revisions$rel_spread < rel_spread_threshold, na.rm = TRUE ) + sum(is.na(real_revisions$rel_spread)) - cli_inform("Less than {rel_spread_threshold} spread in relative value:") + cli_inform("Less than {.val {rel_spread_threshold}} spread in relative value:") cli_li(num_percent(rel_spread, n_real_revised, "")) abs_spread <- sum( # nolint: object_usage_linter real_revisions$spread > abs_spread_threshold ) # nolint: object_usage_linter - cli_inform("Spread of more than {abs_spread_threshold} in actual value (when revised):") + divid <- cli::cli_div(theme = list(.val = list(digits = 3))) + cli_inform("Spread of more than {.val {abs_spread_threshold}} in actual value (when revised):") + cli::cli_end(divid) cli_li(num_percent(abs_spread, n_real_revised, "")) # time_type_unit_pluralizer[[time_type]] is a format string controlled by us # and/or downstream devs, so we can paste it onto our format string safely: units_plural <- pluralize(paste0("{qty(2)}", time_type_unit_pluralizer[[x$time_type]])) # nolint: object_usage_linter - cli::cli_h3("{toTitleCase(units_plural)} until within {x$within_latest*100}% of the latest value:") + cli::cli_h3("{toTitleCase(units_plural)} until within {.val {x$within_latest*100}}% of the latest value:") time_delta_summary(rev_beh[["lag_near_latest"]], x$time_type) %>% print() } diff --git a/tests/testthat/_snaps/revision-latency-functions.md b/tests/testthat/_snaps/revision-latency-functions.md index af4f5c20c..1fbbbe06e 100644 --- a/tests/testthat/_snaps/revision-latency-functions.md +++ b/tests/testthat/_snaps/revision-latency-functions.md @@ -239,7 +239,7 @@ --- Code - rs5 + print(rs5, quick_revision = 3) Message -- An epi_archive spanning 1 to 1. -- diff --git a/tests/testthat/test-revision-latency-functions.R b/tests/testthat/test-revision-latency-functions.R index 86d98f8eb..1a61e8d88 100644 --- a/tests/testthat/test-revision-latency-functions.R +++ b/tests/testthat/test-revision-latency-functions.R @@ -89,10 +89,10 @@ test_that("revision_summary works for dummy datasets", { # something nonsensical, but we tried. rs5 <- dummy_ex_integerly %>% revision_summary( - min_waiting_period = 60, quick_revision = 3, + min_waiting_period = 60, drop_nas = FALSE ) - expect_snapshot(rs5) + expect_snapshot(print(rs5, quick_revision = 3)) expect_snapshot(rs5$revision_behavior %>% print(n = 10, width = 300)) }) From e0304410c4dc64a651f3090a6423d832712f4648 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 9 Apr 2025 10:45:18 -0700 Subject: [PATCH 11/14] appease the lintr --- R/revision_analysis.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/revision_analysis.R b/R/revision_analysis.R index aac77bdde..b1793a8e2 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -213,7 +213,7 @@ revision_analysis <- function(epi_arch, #' appropriate for the scale of the dataset. #' @param rel_spread_threshold Scalar between 0 and 1. The relative spread fraction used to characterize revisions which #' don't actually change very much. Default is .1, or 10% of the final value -#' +#' #' @rdname revision_analysis #' @export print.revision_analysis <- function(x, From 412e1a12aa5580fc29b0b0669faa5f004bf1eadd Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 9 Apr 2025 12:32:47 -0700 Subject: [PATCH 12/14] move internal function out of default args --- R/revision_analysis.R | 16 ++++++++++------ man/revision_analysis.Rd | 15 +++++++-------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/R/revision_analysis.R b/R/revision_analysis.R index b1793a8e2..3518d974f 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -40,7 +40,8 @@ #' the typical time during which most significant revisions occur. The default #' of 60 days corresponds to a typical near-final value for case counts as #' reported in the context of insurance. To avoid this filtering, either set -#' to `NULL` or 0. +#' to `NULL` or 0. This will be rounded up to the appropriate `time_type` if +#' necessary (that is 5 days will be rounded to 1 week if the data is weekly). #' @param within_latest double between 0 and 1. Determines the threshold #' used for the `lag_to` #' @param compactify bool. If `TRUE`, we will compactify after the signal @@ -81,13 +82,14 @@ revision_analysis <- function(epi_arch, ..., drop_nas = TRUE, - min_waiting_period = as.difftime(60, units = "days") %>% - difftime_approx_ceiling_time_delta(epi_arch$time_type), + min_waiting_period = as.difftime(60, units = "days"), within_latest = 0.2, compactify = TRUE, compactify_abs_tol = 0, return_only_tibble = FALSE) { assert_class(epi_arch, "epi_archive") + min_waiting_period <- min_waiting_period %>% + difftime_approx_ceiling_time_delta(epi_arch$time_type) # if the column to summarize isn't specified, use the only one if there is only one if (dots_n(...) == 0) { # Choose the first column that's not a key: @@ -203,7 +205,8 @@ revision_analysis <- function(epi_arch, #' @param quick_revision Difftime or integer (integer is treated as days). #' The amount of time between the final revision and the #' actual time_value to consider the revision quickly resolved. Default of 3 -#' days +#' days. This will be rounded up to the appropriate `time_type` if +#' necessary (that is 5 days will be rounded to 1 week if the data is weekly). #' @param few_revisions Integer. The upper bound on the #' number of revisions to consider "few". Default is 3. #' @param abs_spread_threshold Scalar numeric. The @@ -217,12 +220,13 @@ revision_analysis <- function(epi_arch, #' @rdname revision_analysis #' @export print.revision_analysis <- function(x, - quick_revision = as.difftime(3, units = "days") %>% - difftime_approx_ceiling_time_delta(x$time_type), + quick_revision = as.difftime(3, units = "days"), few_revisions = 3, abs_spread_threshold = NULL, rel_spread_threshold = 0.1, ...) { + quick_revision <- quick_revision %>% + difftime_approx_ceiling_time_delta(x$time_type) if (is.null(abs_spread_threshold)) abs_spread_threshold <- .05 * x$max_val rev_beh <- x$revision_behavior cli::cli_h2("An epi_archive spanning {.val {x$range_time_values[1]}} to {.val {x$range_time_values[1]}}.") diff --git a/man/revision_analysis.Rd b/man/revision_analysis.Rd index 947af17d8..23ddf0212 100644 --- a/man/revision_analysis.Rd +++ b/man/revision_analysis.Rd @@ -10,8 +10,7 @@ revision_analysis( epi_arch, ..., drop_nas = TRUE, - min_waiting_period = as.difftime(60, units = "days") \%>\% - difftime_approx_ceiling_time_delta(epi_arch$time_type), + min_waiting_period = as.difftime(60, units = "days"), within_latest = 0.2, compactify = TRUE, compactify_abs_tol = 0, @@ -20,8 +19,7 @@ revision_analysis( \method{print}{revision_analysis}( x, - quick_revision = as.difftime(3, units = "days") \%>\% - difftime_approx_ceiling_time_delta(x$time_type), + quick_revision = as.difftime(3, units = "days"), few_revisions = 3, abs_spread_threshold = NULL, rel_spread_threshold = 0.1, @@ -32,8 +30,7 @@ revision_summary( epi_arch, ..., drop_nas = TRUE, - min_waiting_period = as.difftime(60, units = "days") \%>\% - difftime_approx_ceiling_time_delta(epi_arch$time_type), + min_waiting_period = as.difftime(60, units = "days"), within_latest = 0.2, compactify = TRUE, compactify_abs_tol = 0, @@ -59,7 +56,8 @@ of the \code{versions_end} are removed. \code{min_waiting_period} should charact the typical time during which most significant revisions occur. The default of 60 days corresponds to a typical near-final value for case counts as reported in the context of insurance. To avoid this filtering, either set -to \code{NULL} or 0.} +to \code{NULL} or 0. This will be rounded up to the appropriate \code{time_type} if +necessary (that is 5 days will be rounded to 1 week if the data is weekly).} \item{within_latest}{double between 0 and 1. Determines the threshold used for the \code{lag_to}} @@ -82,7 +80,8 @@ computational results rather than the complete S3 object.} \item{quick_revision}{Difftime or integer (integer is treated as days). The amount of time between the final revision and the actual time_value to consider the revision quickly resolved. Default of 3 -days} +days. This will be rounded up to the appropriate \code{time_type} if +necessary (that is 5 days will be rounded to 1 week if the data is weekly).} \item{few_revisions}{Integer. The upper bound on the number of revisions to consider "few". Default is 3.} From 788734cc804cdd29d3863beae3ed060d03010045 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 9 Apr 2025 12:37:13 -0700 Subject: [PATCH 13/14] appease lintr --- R/revision_analysis.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/revision_analysis.R b/R/revision_analysis.R index 3518d974f..a1d31ed4f 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -225,7 +225,7 @@ print.revision_analysis <- function(x, abs_spread_threshold = NULL, rel_spread_threshold = 0.1, ...) { - quick_revision <- quick_revision %>% + quick_revision <- quick_revision %>% difftime_approx_ceiling_time_delta(x$time_type) if (is.null(abs_spread_threshold)) abs_spread_threshold <- .05 * x$max_val rev_beh <- x$revision_behavior From 6872e0271814ccaf3c66cc05bf9e266013c2356b Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 9 Apr 2025 12:46:35 -0700 Subject: [PATCH 14/14] pass local checks --- R/revision_analysis.R | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/R/revision_analysis.R b/R/revision_analysis.R index a1d31ed4f..86210effc 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -40,7 +40,7 @@ #' the typical time during which most significant revisions occur. The default #' of 60 days corresponds to a typical near-final value for case counts as #' reported in the context of insurance. To avoid this filtering, either set -#' to `NULL` or 0. This will be rounded up to the appropriate `time_type` if +#' to `NULL` or 0. A `difftime` will be rounded up to the appropriate `time_type` if #' necessary (that is 5 days will be rounded to 1 week if the data is weekly). #' @param within_latest double between 0 and 1. Determines the threshold #' used for the `lag_to` @@ -88,8 +88,10 @@ revision_analysis <- function(epi_arch, compactify_abs_tol = 0, return_only_tibble = FALSE) { assert_class(epi_arch, "epi_archive") - min_waiting_period <- min_waiting_period %>% - difftime_approx_ceiling_time_delta(epi_arch$time_type) + if (methods::is(min_waiting_period, "difftime")) { + min_waiting_period <- min_waiting_period %>% + difftime_approx_ceiling_time_delta(epi_arch$time_type) + } # if the column to summarize isn't specified, use the only one if there is only one if (dots_n(...) == 0) { # Choose the first column that's not a key: @@ -225,8 +227,10 @@ print.revision_analysis <- function(x, abs_spread_threshold = NULL, rel_spread_threshold = 0.1, ...) { - quick_revision <- quick_revision %>% - difftime_approx_ceiling_time_delta(x$time_type) + if (methods::is(quick_revision, "difftime")) { + quick_revision <- quick_revision %>% + difftime_approx_ceiling_time_delta(x$time_type) + } if (is.null(abs_spread_threshold)) abs_spread_threshold <- .05 * x$max_val rev_beh <- x$revision_behavior cli::cli_h2("An epi_archive spanning {.val {x$range_time_values[1]}} to {.val {x$range_time_values[1]}}.")