diff --git a/DESCRIPTION b/DESCRIPTION index c7b12c3e4..c7a651d8e 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.11.4 +Version: 0.11.5 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", , "lcbrooks+github@andrew.cmu.edu", role = c("aut", "cre")), diff --git a/NAMESPACE b/NAMESPACE index 3582fe9c7..e01468e14 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -53,6 +53,7 @@ S3method(print,epi_archive) S3method(print,epi_df) S3method(print,grouped_epi_archive) S3method(print,growth_rate_params) +S3method(print,revision_analysis) S3method(summary,epi_df) S3method(ungroup,epi_df) S3method(ungroup,grouped_epi_archive) @@ -99,6 +100,7 @@ export(new_epi_archive) export(new_epi_df) export(relocate) export(rename) +export(revision_analysis) export(revision_summary) export(set_versions_end) export(slice) diff --git a/NEWS.md b/NEWS.md index 2734419bf..a971623f9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -22,7 +22,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat for users without a compiler, we have placed `{trendfilter}` in Suggests:; if you want to use `method = "trendfilter"` you will need to manually install this dependency (e.g., with `remotes::install_github("glmgen/trendfilter")`). -- In `revision_summary()`: +- In `revision_summary()` (aliased to `revision_analysis()`): - The `should_compactify` argument is now called `compactify`. To migrate, change any calls with `should_compactfiy =` to `compactify =`. - Output now uses the name `lag_near_latest` instead of `time_near_latest`. To @@ -36,6 +36,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat - `min_waiting_period` now defines a nonstrict inequality instead of a strict one. To obtain the old bounds, bump the `min_waiting_period` up to the next possible value for your `time_type`. 
+ - Added a `print()` method to take the place of `print_inform`. + - Removed the argument `print_inform`. This is now always false. Replaced + with an option, `return_only_tibble` to return only the tibble of results + rather than the full S3 object. + - In `key_colnames()`: - On regular (non-`epi_df`) data frames, now requires manual specification of `geo_keys`, `other_keys`, and `time_keys`. @@ -53,7 +58,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat `.facet_filter` which allows explicit selection of facets to show. ## Improvements -- `revision_summary()` now supports all `time_type`s. +- `revision_summary()` now supports all `time_type`s. Printed summary is more attractive. - The compactification tolerance setting now works with integer-type columns. - Various functions are now faster, using faster variants of core operations and avoiding reconstructing grouped `epi_df`s when unnecessary. diff --git a/R/revision_analysis.R b/R/revision_analysis.R index f36dcc16a..86210effc 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -34,31 +34,16 @@ #' `NA`'s compactify is run again if `compactify` is `TRUE` to make #' sure there are no duplicate values from occasions when the signal is #' revised to `NA`, and then back to its immediately-preceding value. -#' @param print_inform bool, determines whether to print summary information, or -#' only return the full summary tibble #' @param min_waiting_period `difftime`, integer or `NULL`. Sets a cutoff: any #' time_values that have not had at least `min_waiting_period` to stabilize as #' of the `versions_end` are removed. `min_waiting_period` should characterize #' the typical time during which most significant revisions occur. The default #' of 60 days corresponds to a typical near-final value for case counts as #' reported in the context of insurance. To avoid this filtering, either set -#' to `NULL` or 0. +#' to `NULL` or 0. 
A `difftime` will be rounded up to the appropriate `time_type` if +#' necessary (that is 5 days will be rounded to 1 week if the data is weekly). #' @param within_latest double between 0 and 1. Determines the threshold #' used for the `lag_to` -#' @param quick_revision difftime or integer (integer is treated as days), for -#' the printed summary, the amount of time between the final revision and the -#' actual time_value to consider the revision quickly resolved. Default of 3 -#' days -#' @param few_revisions integer, for the printed summary, the upper bound on the -#' number of revisions to consider "few". Default is 3. -#' @param abs_spread_threshold length-1 numeric, for the printed summary, the -#' maximum spread used to characterize revisions which don't actually change -#' very much. Default is 5% of the maximum value in the dataset, but this is -#' the most unit dependent of values, and likely needs to be chosen -#' appropriate for the scale of the dataset. -#' @param rel_spread_threshold length-1 double between 0 and 1, for the printed -#' summary, the relative spread fraction used to characterize revisions which -#' don't actually change very much. Default is .1, or 10% of the final value #' @param compactify bool. If `TRUE`, we will compactify after the signal #' requested in `...` has been selected on its own and the `drop_nas` step. #' This helps, for example, to give similar results when called on @@ -67,6 +52,8 @@ #' requested signal. The default is `TRUE`. #' @param compactify_abs_tol length-1 double, used if `compactify` is `TRUE`, it #' determines the threshold for when two doubles are considered identical. +#' @param return_only_tibble boolean to return only the simple `tibble` of +#' computational results rather than the complete S3 object. #' #' @details Applies to `epi_archive`s with `time_type`s of `"day"`, `"week"`, #' and `"yearmonth"`. 
It can also work with a `time_type` of `"integer"` if @@ -76,9 +63,15 @@ #' produce incorrect results for some calculations, since week numbering #' contains jumps at year boundaries. #' +#' @return An S3 object with class `revision_analysis`. This function is typically +#' called for the purposes of inspecting the printed output. The +#' results of the computations are available in +#' `revision_analysis(...)$revision_behavior`. If you only want to access +#' the internal computations, use `return_only_tibble = TRUE`. +#' #' @examples -#' revision_example <- revision_summary(archive_cases_dv_subset, percent_cli) -#' revision_example %>% arrange(desc(spread)) +#' revision_example <- revision_analysis(archive_cases_dv_subset, percent_cli) +#' revision_example$revision_behavior %>% arrange(desc(spread)) #' #' @export #' @importFrom cli cli_inform cli_abort cli_li @@ -86,21 +79,19 @@ #' @importFrom vctrs vec_cast #' @importFrom dplyr mutate group_by arrange filter if_any all_of across pull pick c_across #' everything ungroup summarize if_else %>% -revision_summary <- function(epi_arch, - ..., - drop_nas = TRUE, - print_inform = TRUE, - min_waiting_period = as.difftime(60, units = "days") %>% - difftime_approx_ceiling_time_delta(epi_arch$time_type), - within_latest = 0.2, - quick_revision = as.difftime(3, units = "days") %>% - difftime_approx_ceiling_time_delta(epi_arch$time_type), - few_revisions = 3, - abs_spread_threshold = NULL, - rel_spread_threshold = 0.1, - compactify = TRUE, - compactify_abs_tol = 0) { +revision_analysis <- function(epi_arch, + ..., + drop_nas = TRUE, + min_waiting_period = as.difftime(60, units = "days"), + within_latest = 0.2, + compactify = TRUE, + compactify_abs_tol = 0, + return_only_tibble = FALSE) { assert_class(epi_arch, "epi_archive") + if (methods::is(min_waiting_period, "difftime")) { + min_waiting_period <- min_waiting_period %>% + difftime_approx_ceiling_time_delta(epi_arch$time_type) + } # if the column to summarize isn't 
specified, use the only one if there is only one if (dots_n(...) == 0) { # Choose the first column that's not a key: @@ -126,11 +117,6 @@ revision_summary <- function(epi_arch, cli_abort("Not currently implementing more than one column at a time. Run each separately.") } } - if (is.null(abs_spread_threshold)) { - abs_spread_threshold <- .05 * epi_arch$DT %>% - pull(!!arg) %>% - max(na.rm = TRUE) - } # for each time_value, get # the number of revisions # the maximum spread in value (both absolute and relative) @@ -193,63 +179,113 @@ revision_summary <- function(epi_arch, time_value, geo_value, all_of(epikey_names), n_revisions, min_lag, max_lag, # nolint: object_usage_linter lag_near_latest, spread, rel_spread, min_value, max_value, median_value # nolint: object_usage_linter ) - if (print_inform) { - cli_inform("Min lag (time to first version):") - time_delta_summary(revision_behavior$min_lag, time_type) %>% print() - if (!drop_nas) { - total_na <- epi_arch$DT %>% - filter(is.na(c_across(!!arg))) %>% # nolint: object_usage_linter - nrow() - cli_inform("Fraction of all versions that are `NA`:") - cli_li(num_percent(total_na, nrow(epi_arch$DT), "")) - cli_inform("") - } - cli_inform("Fraction of epi_key+time_values with") - total_num <- nrow(revision_behavior) # nolint: object_usage_linter - total_num_unrevised <- sum(revision_behavior$n_revisions == 0) # nolint: object_usage_linter - cli_inform("No revisions:") - cli_li(num_percent(total_num_unrevised, total_num, "")) - total_quickly_revised <- sum( # nolint: object_usage_linter - time_delta_to_n_steps(revision_behavior$max_lag, time_type) <= - time_delta_to_n_steps(quick_revision, time_type) - ) - cli_inform("Quick revisions (last revision within {format_time_delta(quick_revision, time_type)} - of the `time_value`):") - cli_li(num_percent(total_quickly_revised, total_num, "")) - total_barely_revised <- sum( # nolint: object_usage_linter - revision_behavior$n_revisions <= - few_revisions - ) - cli_inform("Few 
revisions (At most {few_revisions} revisions for that `time_value`):") - cli_li(num_percent(total_barely_revised, total_num, "")) - cli_inform("") - cli_inform("Fraction of revised epi_key+time_values which have:") + total_na <- epi_arch$DT %>% + filter(is.na(c_across(!!arg))) %>% # nolint: object_usage_linter + nrow() + if (!return_only_tibble) { + revision_behavior <- structure(list( + revision_behavior = revision_behavior, + range_time_values = range(epi_arch$DT$time_value), + signal_variable = arg, + drop_nas = drop_nas, + time_type = time_type, + total_na = total_na, + max_val = max(epi_arch$DT[[arg]], na.rm = TRUE), + n_obs = nrow(epi_arch$DT), + within_latest = within_latest + ), class = "revision_analysis") + } + return(revision_behavior) +} + + - real_revisions <- revision_behavior %>% filter(n_revisions > 0) # nolint: object_usage_linter - n_real_revised <- nrow(real_revisions) # nolint: object_usage_linter - rel_spread <- sum( # nolint: object_usage_linter - real_revisions$rel_spread < - rel_spread_threshold, - na.rm = TRUE - ) + sum(is.na(real_revisions$rel_spread)) - cli_inform("Less than {rel_spread_threshold} spread in relative value:") - cli_li(num_percent(rel_spread, n_real_revised, "")) - abs_spread <- sum( # nolint: object_usage_linter - real_revisions$spread > - abs_spread_threshold - ) # nolint: object_usage_linter - cli_inform("Spread of more than {abs_spread_threshold} in actual value (when revised):") - cli_li(num_percent(abs_spread, n_real_revised, "")) - # time_type_unit_pluralizer[[time_type]] is a format string controlled by us - # and/or downstream devs, so we can paste it onto our format string safely: - units_plural <- pluralize(paste0("{qty(2)}", time_type_unit_pluralizer[[time_type]])) # nolint: object_usage_linter - cli_inform("{toTitleCase(units_plural)} until within {within_latest*100}% of the latest value:") - time_delta_summary(revision_behavior[["lag_near_latest"]], time_type) %>% print() +#' Print a `revision_analysis` object 
+#' +#' @param x a `revision_analysis` object +#' @param quick_revision Difftime or integer (integer is treated as days). +#' The amount of time between the final revision and the +#' actual time_value to consider the revision quickly resolved. Default of 3 +#' days. This will be rounded up to the appropriate `time_type` if +#' necessary (that is 5 days will be rounded to 1 week if the data is weekly). +#' @param few_revisions Integer. The upper bound on the +#' number of revisions to consider "few". Default is 3. +#' @param abs_spread_threshold Scalar numeric. The +#' maximum spread used to characterize revisions which don't actually change +#' very much. Default is 5% of the maximum value in the dataset, but this is +#' the most unit dependent of values, and likely needs to be chosen +#' appropriately for the scale of the dataset. +#' @param rel_spread_threshold Scalar between 0 and 1. The relative spread fraction used to characterize revisions which +#' don't actually change very much. Default is .1, or 10% of the final value +#' +#' @rdname revision_analysis +#' @export +print.revision_analysis <- function(x, + quick_revision = as.difftime(3, units = "days"), + few_revisions = 3, + abs_spread_threshold = NULL, + rel_spread_threshold = 0.1, + ...) 
{ + if (methods::is(quick_revision, "difftime")) { + quick_revision <- quick_revision %>% + difftime_approx_ceiling_time_delta(x$time_type) } - return(revision_behavior) + if (is.null(abs_spread_threshold)) abs_spread_threshold <- .05 * x$max_val + rev_beh <- x$revision_behavior + cli::cli_h2("An epi_archive spanning {.val {x$range_time_values[1]}} to {.val {x$range_time_values[2]}}.") + cli::cli_h3("Min lag (time to first version):") + time_delta_summary(rev_beh$min_lag, x$time_type) %>% print() + if (!x$drop_nas) { + cli_inform("Fraction of all versions that are `NA`:") + cli_li(num_percent(x$total_na, x$n_obs, "")) + cli_inform("") + } + cli::cli_h3("Fraction of epi_key + time_values with") + total_num <- nrow(rev_beh) # nolint: object_usage_linter + total_num_unrevised <- sum(rev_beh$n_revisions == 0) # nolint: object_usage_linter + cli_inform("No revisions:") + cli_li(num_percent(total_num_unrevised, total_num, "")) + total_quickly_revised <- sum( # nolint: object_usage_linter + time_delta_to_n_steps(rev_beh$max_lag, x$time_type) <= + time_delta_to_n_steps(quick_revision, x$time_type) + ) + cli_inform("Quick revisions (last revision within {format_time_delta(quick_revision, x$time_type)} + of the `time_value`):") + cli_li(num_percent(total_quickly_revised, total_num, "")) + total_barely_revised <- sum(rev_beh$n_revisions <= few_revisions) + cli_inform("Few revisions (At most {.val {few_revisions}} revisions for that `time_value`):") + cli_li(num_percent(total_barely_revised, total_num, "")) + + cli::cli_h3("Fraction of revised epi_key + time_values which have:") + + real_revisions <- rev_beh %>% filter(n_revisions > 0) # nolint: object_usage_linter + n_real_revised <- nrow(real_revisions) # nolint: object_usage_linter + rel_spread <- sum( # nolint: object_usage_linter + real_revisions$rel_spread < rel_spread_threshold, + na.rm = TRUE + ) + sum(is.na(real_revisions$rel_spread)) + cli_inform("Less than {.val {rel_spread_threshold}} spread in relative value:") + 
cli_li(num_percent(rel_spread, n_real_revised, "")) + abs_spread <- sum( # nolint: object_usage_linter + real_revisions$spread > abs_spread_threshold + ) # nolint: object_usage_linter + divid <- cli::cli_div(theme = list(.val = list(digits = 3))) + cli_inform("Spread of more than {.val {abs_spread_threshold}} in actual value (when revised):") + cli::cli_end(divid) + cli_li(num_percent(abs_spread, n_real_revised, "")) + + # time_type_unit_pluralizer[[time_type]] is a format string controlled by us + # and/or downstream devs, so we can paste it onto our format string safely: + units_plural <- pluralize(paste0("{qty(2)}", time_type_unit_pluralizer[[x$time_type]])) # nolint: object_usage_linter + cli::cli_h3("{toTitleCase(units_plural)} until within {.val {x$within_latest*100}}% of the latest value:") + time_delta_summary(rev_beh[["lag_near_latest"]], x$time_type) %>% print() } +#' @export +#' @rdname revision_analysis +revision_summary <- revision_analysis + #' pull the value from lags when values starts indefinitely being within prop of its latest value. 
#' @param lags vector of lags; should be sorted #' @param values this should be a vector (e.g., a column) with length matching that of `lags` diff --git a/man/epiprocess-package.Rd b/man/epiprocess-package.Rd index fe79c01e8..12e8a3c42 100644 --- a/man/epiprocess-package.Rd +++ b/man/epiprocess-package.Rd @@ -11,6 +11,7 @@ This package introduces common data structures for working with epidemiological \seealso{ Useful links: \itemize{ + \item \url{https://github.com/cmu-delphi/epiprocess} \item \url{https://cmu-delphi.github.io/epiprocess/} } diff --git a/man/revision_summary.Rd b/man/revision_analysis.Rd similarity index 72% rename from man/revision_summary.Rd rename to man/revision_analysis.Rd index 54abf098c..23ddf0212 100644 --- a/man/revision_summary.Rd +++ b/man/revision_analysis.Rd @@ -1,24 +1,40 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/revision_analysis.R -\name{revision_summary} +\name{revision_analysis} +\alias{revision_analysis} +\alias{print.revision_analysis} \alias{revision_summary} \title{A function to describe revision behavior for an archive.} \usage{ -revision_summary( +revision_analysis( epi_arch, ..., drop_nas = TRUE, - print_inform = TRUE, - min_waiting_period = as.difftime(60, units = "days") \%>\% - difftime_approx_ceiling_time_delta(epi_arch$time_type), + min_waiting_period = as.difftime(60, units = "days"), within_latest = 0.2, - quick_revision = as.difftime(3, units = "days") \%>\% - difftime_approx_ceiling_time_delta(epi_arch$time_type), + compactify = TRUE, + compactify_abs_tol = 0, + return_only_tibble = FALSE +) + +\method{print}{revision_analysis}( + x, + quick_revision = as.difftime(3, units = "days"), few_revisions = 3, abs_spread_threshold = NULL, rel_spread_threshold = 0.1, + ... 
+) + +revision_summary( + epi_arch, + ..., + drop_nas = TRUE, + min_waiting_period = as.difftime(60, units = "days"), + within_latest = 0.2, compactify = TRUE, - compactify_abs_tol = 0 + compactify_abs_tol = 0, + return_only_tibble = FALSE ) } \arguments{ @@ -34,47 +50,57 @@ If supplied, \code{...} must select exactly one column.} sure there are no duplicate values from occasions when the signal is revised to \code{NA}, and then back to its immediately-preceding value.} -\item{print_inform}{bool, determines whether to print summary information, or -only return the full summary tibble} - \item{min_waiting_period}{\code{difftime}, integer or \code{NULL}. Sets a cutoff: any time_values that have not had at least \code{min_waiting_period} to stabilize as of the \code{versions_end} are removed. \code{min_waiting_period} should characterize the typical time during which most significant revisions occur. The default of 60 days corresponds to a typical near-final value for case counts as reported in the context of insurance. To avoid this filtering, either set -to \code{NULL} or 0.} +to \code{NULL} or 0. This will be rounded up to the appropriate \code{time_type} if +necessary (that is 5 days will be rounded to 1 week if the data is weekly).} \item{within_latest}{double between 0 and 1. Determines the threshold used for the \code{lag_to}} -\item{quick_revision}{difftime or integer (integer is treated as days), for -the printed summary, the amount of time between the final revision and the +\item{compactify}{bool. If \code{TRUE}, we will compactify after the signal +requested in \code{...} has been selected on its own and the \code{drop_nas} step. +This helps, for example, to give similar results when called on +\link[=epix_merge]{merged} and single-signal archives, since merged archives +record an update when any of the other signals change, not just the +requested signal. 
The default is \code{TRUE}.} + +\item{compactify_abs_tol}{length-1 double, used if \code{compactify} is \code{TRUE}, it +determines the threshold for when two doubles are considered identical.} + +\item{return_only_tibble}{boolean to return only the simple \code{tibble} of +computational results rather than the complete S3 object.} + +\item{x}{a \code{revision_analysis} object} + +\item{quick_revision}{Difftime or integer (integer is treated as days). +The amount of time between the final revision and the actual time_value to consider the revision quickly resolved. Default of 3 -days} +days. This will be rounded up to the appropriate \code{time_type} if +necessary (that is 5 days will be rounded to 1 week if the data is weekly).} -\item{few_revisions}{integer, for the printed summary, the upper bound on the +\item{few_revisions}{Integer. The upper bound on the number of revisions to consider "few". Default is 3.} -\item{abs_spread_threshold}{length-1 numeric, for the printed summary, the +\item{abs_spread_threshold}{Scalar numeric. The maximum spread used to characterize revisions which don't actually change very much. Default is 5\% of the maximum value in the dataset, but this is the most unit dependent of values, and likely needs to be chosen appropriate for the scale of the dataset.} -\item{rel_spread_threshold}{length-1 double between 0 and 1, for the printed -summary, the relative spread fraction used to characterize revisions which +\item{rel_spread_threshold}{Scalar between 0 and 1. The relative spread fraction used to characterize revisions which don't actually change very much. Default is .1, or 10\% of the final value} - -\item{compactify}{bool. If \code{TRUE}, we will compactify after the signal -requested in \code{...} has been selected on its own and the \code{drop_nas} step. 
-This helps, for example, to give similar results when called on -\link[=epix_merge]{merged} and single-signal archives, since merged archives -record an update when any of the other signals change, not just the -requested signal. The default is \code{TRUE}.} - -\item{compactify_abs_tol}{length-1 double, used if \code{compactify} is \code{TRUE}, it -determines the threshold for when two doubles are considered identical.} +} +\value{ +An S3 object with class \code{revision_behavior}. This function is typically +called for the purposes of inspecting the printed output. The +results of the computations are available in +\code{revision_analysis(...)$revision_behavior}. If you only want to access +the internal computations, use \code{return_only_tibble = TRUE}. } \description{ \code{revision_summary} removes all missing values (if requested), and then @@ -113,7 +139,7 @@ produce incorrect results for some calculations, since week numbering contains jumps at year boundaries. } \examples{ -revision_example <- revision_summary(archive_cases_dv_subset, percent_cli) -revision_example \%>\% arrange(desc(spread)) +revision_example <- revision_analysis(archive_cases_dv_subset, percent_cli) +revision_example$revision_behavior \%>\% arrange(desc(spread)) } diff --git a/tests/testthat/_snaps/revision-latency-functions.md b/tests/testthat/_snaps/revision-latency-functions.md index 4f2bfe269..1fbbbe06e 100644 --- a/tests/testthat/_snaps/revision-latency-functions.md +++ b/tests/testthat/_snaps/revision-latency-functions.md @@ -1,29 +1,41 @@ # revision_summary works for dummy datasets Code - dummy_ex %>% revision_summary() %>% print(n = 10, width = 300) + rs1 Message - Min lag (time to first version): + + -- An epi_archive spanning 2020-01-01 to 2020-01-01. 
-- + + -- Min lag (time to first version): Output min median mean max 0 days 1 days 1.6 days 4 days Message - Fraction of epi_key+time_values with + + -- Fraction of epi_key + time_values with No revisions: * 3 out of 7 (42.86%) Quick revisions (last revision within 3 days of the `time_value`): * 4 out of 7 (57.14%) Few revisions (At most 3 revisions for that `time_value`): * 6 out of 7 (85.71%) - Fraction of revised epi_key+time_values which have: + + -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: * 1 out of 4 (25%) Spread of more than 5.1 in actual value (when revised): * 3 out of 4 (75%) - Days until within 20% of the latest value: + + -- Days until within 20% of the latest value: Output min median mean max 0 days 3 days 6.9 days 19 days + +--- + + Code + rs1$revision_behavior %>% print(n = 10, width = 300) + Output # A tibble: 7 x 11 time_value geo_value n_revisions min_lag max_lag lag_near_latest spread @@ -47,31 +59,43 @@ --- Code - dummy_ex %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300) + rs2 Message - Min lag (time to first version): + + -- An epi_archive spanning 2020-01-01 to 2020-01-01. 
-- + + -- Min lag (time to first version): Output min median mean max 0 days 1 days 1.4 days 4 days Message Fraction of all versions that are `NA`: * 2 out of 19 (10.53%) - Fraction of epi_key+time_values with + + -- Fraction of epi_key + time_values with No revisions: * 2 out of 7 (28.57%) Quick revisions (last revision within 3 days of the `time_value`): * 4 out of 7 (57.14%) Few revisions (At most 3 revisions for that `time_value`): * 6 out of 7 (85.71%) - Fraction of revised epi_key+time_values which have: + + -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: * 2 out of 5 (40%) Spread of more than 5.1 in actual value (when revised): * 3 out of 5 (60%) - Days until within 20% of the latest value: + + -- Days until within 20% of the latest value: Output min median mean max 0 days 3 days 6.9 days 19 days + +--- + + Code + rs2$revision_behavior %>% print(n = 10, width = 300) + Output # A tibble: 7 x 11 time_value geo_value n_revisions min_lag max_lag lag_near_latest spread @@ -95,31 +119,43 @@ --- Code - dummy_ex_weekly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300) + rs3 Message - Min lag (time to first version): + + -- An epi_archive spanning 2020-01-01 to 2020-01-01. 
-- + + -- Min lag (time to first version): Output min median mean max 0 weeks 1 weeks 1.4 weeks 4 weeks Message Fraction of all versions that are `NA`: * 2 out of 19 (10.53%) - Fraction of epi_key+time_values with + + -- Fraction of epi_key + time_values with No revisions: * 2 out of 7 (28.57%) Quick revisions (last revision within 1 week of the `time_value`): * 2 out of 7 (28.57%) Few revisions (At most 3 revisions for that `time_value`): * 6 out of 7 (85.71%) - Fraction of revised epi_key+time_values which have: + + -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: * 2 out of 5 (40%) Spread of more than 5.1 in actual value (when revised): * 3 out of 5 (60%) - Weeks until within 20% of the latest value: + + -- Weeks until within 20% of the latest value: Output min median mean max 0 weeks 3 weeks 6.9 weeks 19 weeks + +--- + + Code + rs3$revision_behavior %>% print(n = 10, width = 300) + Output # A tibble: 7 x 11 time_value geo_value n_revisions min_lag max_lag lag_near_latest spread @@ -143,32 +179,43 @@ --- Code - dummy_ex_yearmonthly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, - width = 300) + rs4 Message - Min lag (time to first version): + + -- An epi_archive spanning 2020 Jan to 2020 Jan. 
-- + + -- Min lag (time to first version): Output min median mean max 0 1 1.4 4 Message Fraction of all versions that are `NA`: * 2 out of 19 (10.53%) - Fraction of epi_key+time_values with + + -- Fraction of epi_key + time_values with No revisions: * 2 out of 7 (28.57%) Quick revisions (last revision within 1 month of the `time_value`): * 2 out of 7 (28.57%) Few revisions (At most 3 revisions for that `time_value`): * 6 out of 7 (85.71%) - Fraction of revised epi_key+time_values which have: + + -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: * 2 out of 5 (40%) Spread of more than 5.1 in actual value (when revised): * 3 out of 5 (60%) - Months until within 20% of the latest value: + + -- Months until within 20% of the latest value: Output min median mean max 0 3 6.9 19 + +--- + + Code + rs4$revision_behavior %>% print(n = 10, width = 300) + Output # A tibble: 7 x 11 time_value geo_value n_revisions min_lag max_lag lag_near_latest spread @@ -192,32 +239,43 @@ --- Code - dummy_ex_integerly %>% revision_summary(min_waiting_period = 60, - quick_revision = 3, drop_nas = FALSE) %>% print(n = 10, width = 300) + print(rs5, quick_revision = 3) Message - Min lag (time to first version): + + -- An epi_archive spanning 1 to 1. 
-- + + -- Min lag (time to first version): Output min median mean max 0 1 1.4 4 Message Fraction of all versions that are `NA`: * 2 out of 19 (10.53%) - Fraction of epi_key+time_values with + + -- Fraction of epi_key + time_values with No revisions: * 2 out of 7 (28.57%) Quick revisions (last revision within 3 time steps of the `time_value`): * 4 out of 7 (57.14%) Few revisions (At most 3 revisions for that `time_value`): * 6 out of 7 (85.71%) - Fraction of revised epi_key+time_values which have: + + -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: * 2 out of 5 (40%) Spread of more than 5.1 in actual value (when revised): * 3 out of 5 (60%) - Time Steps until within 20% of the latest value: + + -- Time Steps until within 20% of the latest value: Output min median mean max 0 3 6.9 19 + +--- + + Code + rs5$revision_behavior %>% print(n = 10, width = 300) + Output # A tibble: 7 x 11 time_value geo_value n_revisions min_lag max_lag lag_near_latest spread diff --git a/tests/testthat/test-revision-latency-functions.R b/tests/testthat/test-revision-latency-functions.R index b636bf32e..1a61e8d88 100644 --- a/tests/testthat/test-revision-latency-functions.R +++ b/tests/testthat/test-revision-latency-functions.R @@ -64,27 +64,36 @@ dummy_ex_integerly <- dummy_ex$DT %>% stopifnot(dummy_ex_integerly$time_type == "integer") test_that("revision_summary works for dummy datasets", { - expect_snapshot(dummy_ex %>% revision_summary() %>% print(n = 10, width = 300)) - expect_snapshot(dummy_ex %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300)) + rs1 <- dummy_ex %>% revision_summary() + rs2 <- dummy_ex %>% revision_summary(drop_nas = FALSE) + expect_snapshot(rs1) + expect_snapshot(rs1$revision_behavior %>% print(n = 10, width = 300)) + expect_snapshot(rs2) + expect_snapshot(rs2$revision_behavior %>% print(n = 10, width = 300)) # Weekly dummy is mostly just "day" -> "week", but quick-revision summary changes: - 
expect_snapshot(dummy_ex_weekly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300)) + rs3 <- dummy_ex_weekly %>% revision_summary(drop_nas = FALSE) + expect_snapshot(rs3) + expect_snapshot(rs3$revision_behavior %>% print(n = 10, width = 300)) # Yearmonthly has the same story. It would have been close to encountering # min_waiting_period-based filtering but we actually set its versions_end to # sometime in 2080 rather than 2022: - expect_snapshot(dummy_ex_yearmonthly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300)) + rs4 <- dummy_ex_yearmonthly %>% revision_summary(drop_nas = FALSE) + expect_snapshot(rs4) + expect_snapshot(rs4$revision_behavior %>% print(n = 10, width = 300)) # Integer is very much like daily. We have to provide some of the # configuration arguments since we have no idea about what the integers # represent. If the possible integers being used have large jumps like # YYYYww-as-integer epiweek labels (e.g., 200053 jumps to 200101) or are # regularly spaced apart but by more than 1, we'll still be producing # something nonsensical, but we tried. 
- expect_snapshot(dummy_ex_integerly %>% + rs5 <- dummy_ex_integerly %>% revision_summary( - min_waiting_period = 60, quick_revision = 3, + min_waiting_period = 60, drop_nas = FALSE - ) %>% - print(n = 10, width = 300)) + ) + expect_snapshot(print(rs5, quick_revision = 3)) + expect_snapshot(rs5$revision_behavior %>% print(n = 10, width = 300)) }) test_that("tidyselect is functional", { @@ -133,7 +142,7 @@ test_that("revision_summary default min_waiting_period works as expected", { value = 1:2 ) %>% as_epi_archive(versions_end = as.Date("2020-01-01") + 1 + 59) %>% - revision_summary(print_inform = FALSE) %>% + revision_summary(return_only_tibble = TRUE) %>% pull(time_value), as.Date("2020-01-01") ) @@ -146,7 +155,7 @@ test_that("revision_summary default min_waiting_period works as expected", { value = 1:2 ) %>% as_epi_archive(versions_end = as.Date("2020-01-01") + 7 + 56) %>% - revision_summary(print_inform = FALSE) %>% + revision_summary(return_only_tibble = TRUE) %>% pull(time_value), as.Date("2020-01-01") ) @@ -159,7 +168,7 @@ test_that("revision_summary default min_waiting_period works as expected", { value = 1:2 ) %>% as_epi_archive(versions_end = tsibble::make_yearmonth(2000, 3)) %>% - revision_summary(print_inform = FALSE) %>% + revision_summary(return_only_tibble = TRUE) %>% pull(time_value), tsibble::make_yearmonth(2000, 1) ) @@ -172,7 +181,7 @@ test_that("revision_summary default min_waiting_period works as expected", { value = 1:2 ) %>% as_epi_archive(versions_end = 1 + 1 + 59) %>% - revision_summary(print_inform = FALSE), + revision_summary(return_only_tibble = TRUE), regexp = "Unsupported time_type" ) }) diff --git a/vignettes/epi_archive.Rmd b/vignettes/epi_archive.Rmd index f87ea2915..cae40dfa6 100644 --- a/vignettes/epi_archive.Rmd +++ b/vignettes/epi_archive.Rmd @@ -162,7 +162,8 @@ addition to the per key summary, it also returns an overall summary. 
Here is an a sample of the output: ```{r} -revision_details <- revision_summary(dv_archive, print_inform = TRUE) +revision_details <- revision_summary(dv_archive) +revision_details ``` We can see from the output that, as mentioned above, this data set has a lot of @@ -174,9 +175,9 @@ inspect the returned `revision_details` tibble. Here we collect a number of statistics for each state: ```{r} -revision_details %>% +revision_details$revision_behavior %>% group_by(geo_value) %>% - summarise( + summarize( n_rev = mean(n_revisions), min_lag = min(min_lag), max_lag = max(max_lag),