diff --git a/DESCRIPTION b/DESCRIPTION index 502dfccba..c83bf98a4 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: epiprocess Type: Package Title: Tools for basic signal processing in epidemiology -Version: 0.10.1 +Version: 0.10.2 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", , "lcbrooks+github@andrew.cmu.edu", role = c("aut", "cre")), @@ -56,6 +56,7 @@ Imports: tibble, tidyr, tidyselect (>= 1.2.0), + tools, tsibble, utils, vctrs, @@ -63,6 +64,7 @@ Imports: Suggests: devtools, epidatr, + epipredict, here, knitr, outbreaks, @@ -76,6 +78,7 @@ Remotes: cmu-delphi/delphidocs, cmu-delphi/epidatasets, cmu-delphi/epidatr, + cmu-delphi/epipredict, glmgen/genlasso, reconverse/outbreaks Config/Needs/website: cmu-delphi/delphidocs @@ -103,5 +106,6 @@ Collate: 'reexports.R' 'revision_analysis.R' 'slide.R' + 'time-utils.R' 'utils.R' 'utils_pipe.R' diff --git a/NAMESPACE b/NAMESPACE index 1269c5ef7..a5aefadd4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -130,6 +130,8 @@ importFrom(cli,cli_li) importFrom(cli,cli_vec) importFrom(cli,cli_warn) importFrom(cli,format_message) +importFrom(cli,pluralize) +importFrom(cli,qty) importFrom(data.table,":=") importFrom(data.table,address) importFrom(data.table,as.data.table) @@ -195,6 +197,7 @@ importFrom(rlang,caller_arg) importFrom(rlang,caller_env) importFrom(rlang,check_dots_empty) importFrom(rlang,check_dots_empty0) +importFrom(rlang,dots_n) importFrom(rlang,enquo) importFrom(rlang,enquos) importFrom(rlang,env) @@ -232,8 +235,11 @@ importFrom(tidyr,unnest) importFrom(tidyselect,any_of) importFrom(tidyselect,eval_select) importFrom(tidyselect,starts_with) +importFrom(tools,toTitleCase) importFrom(tsibble,as_tsibble) importFrom(utils,capture.output) importFrom(utils,tail) +importFrom(vctrs,vec_cast) importFrom(vctrs,vec_data) +importFrom(vctrs,vec_detect_missing) importFrom(vctrs,vec_equal) diff --git a/R/archive.R b/R/archive.R index 3557c9914..de6bc1199 100644 --- a/R/archive.R +++ b/R/archive.R @@ -376,10 +376,11 @@ removed_by_compactify <- function(df, keys, tolerance) { #' [`dplyr::near`], otherwise it uses equality. `NA`'s and `NaN`'s are #' considered equal to themselves and each other. 
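Editorial aside: the `is_locf()` hunk below swaps a hand-rolled `if_else()` equality check for `vctrs::vec_equal(..., na_equal = TRUE)`. A minimal sketch (toy inputs, not part of the patch) of what that call does:

```r
vec <- c("a", "a", NA, NA, "b")
lag_vec <- dplyr::lag(vec, 1L)

# `na_equal = TRUE` makes NA-vs-NA compare TRUE (instead of NA), so a repeated
# NA is flagged as a redundant last-observation-carried-forward update:
vctrs::vec_equal(vec, lag_vec, na_equal = TRUE)
#> [1] FALSE  TRUE FALSE  TRUE FALSE
```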
#' @importFrom dplyr lag if_else near +#' @importFrom vctrs vec_detect_missing vec_equal #' @keywords internal is_locf <- function(vec, tolerance) { # nolint: object_usage_linter - lag_vec <- dplyr::lag(vec) - if (typeof(vec) == "double") { + lag_vec <- lag(vec, 1L) + if (inherits(vec, "numeric")) { # (no matrix/array/general support) res <- if_else( !is.na(vec) & !is.na(lag_vec), near(vec, lag_vec, tol = tolerance), @@ -387,11 +388,7 @@ is_locf <- function(vec, tolerance) { # nolint: object_usage_linter ) return(res) } else { - res <- if_else( - !is.na(vec) & !is.na(lag_vec), - vec == lag_vec, - is.na(vec) & is.na(lag_vec) - ) + res <- vec_equal(vec, lag_vec, na_equal = TRUE) return(res) } } diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R index 675d000db..724fd9675 100644 --- a/R/epiprocess-package.R +++ b/R/epiprocess-package.R @@ -14,6 +14,8 @@ #' @importFrom checkmate check_names #' @importFrom checkmate test_subset test_set_equal vname #' @importFrom cli cli_abort cli_warn +#' @importFrom cli pluralize +#' @importFrom cli qty #' @importFrom data.table as.data.table #' @importFrom data.table key #' @importFrom data.table setkeyv @@ -23,6 +25,7 @@ #' @importFrom lifecycle deprecated #' @importFrom rlang %||% #' @importFrom rlang is_bare_integerish +#' @importFrom tools toTitleCase #' @importFrom vctrs vec_data #' @importFrom vctrs vec_equal ## usethis namespace: end @@ -32,6 +35,6 @@ utils::globalVariables(c( ".x", ".group_key", ".ref_time_value", "resid", "fitted", ".response", "geo_value", "time_value", "value", ".real", "lag", "max_value", "min_value", - "median_value", "spread", "rel_spread", "time_to", - "time_near_latest", "n_revisions", "min_lag", "max_lag" + "median_value", "spread", "rel_spread", "lag_to", + "lag_near_latest", "n_revisions", "min_lag", "max_lag" )) diff --git a/R/key_colnames.R b/R/key_colnames.R index 9e8ad79b9..eeecce05b 100644 --- a/R/key_colnames.R +++ b/R/key_colnames.R @@ -4,30 +4,42 @@ #' explicit checks that the key actually is unique in any associated data #' structures. #' -#' @param x an object, such as an [`epi_df`] +#' @param x an object, often a data frame or something similar. `{epiprocess}` +#' includes implementations for [`epi_df`]s, [`epi_archive`]s, +#' [`tsibble::tsibble`]s, and other data frames (including +#' [`tibble::tibble`]s); other packages, like `{epipredict}`, can add more. #' @param ... additional arguments passed on to methods -#' @param geo_keys optional character vector; which columns (if any) to consider -#' keys specifying the geographical region? Defaults to `"geo_value"` if -#' present; must be `"geo_value"` if `x` is an `epi_df`. -#' @param other_keys character vector; which columns (if any) to consider keys -#' specifying demographical or identifying/grouping information besides the -#' geographical region and time interval? Mandatory if `x` is a vanilla -#' `data.frame` or `tibble`. Optional if `x` is an `epi_df`; default is the -#' `epi_df`'s `other_keys`; if you provide `other_keys`, they must match the -#' default. (This behavior is to enable consistent and sane results when you -#' can't guarantee whether `x` is an `epi_df` or just a -#' `tibble`/`data.frame`.) -#' @param time_keys optional character vector; which columns (if any) to -#' consider keys specifying the time interval during which associated events -#' occurred? Defaults to `"time_value"` if present; must be `"time_value"` if -#' `x` is an `epi_df`. 
+#' @param geo_keys,other_keys,time_keys character vectors, sometimes optional; +#' which variables (if any) should be considered as part of a unique +#' key/identifier for data in `x`, dealing respectively with the associated +#' geographical region, demographic/strain/other information needed in +#' addition to the geographical region to identify individual time series in +#' `x`, and time interval during which associated events occurred. +#' +#' Mandatory if `x` is a regular `data.frame` or `tibble`. Optional if `x` is +#' an `epi_df`; the defaults are `"geo_value"`, the `epi_df`'s `other_keys` +#' metadata, and `"time_value"`, respectively; if you provide these manually, +#' they must match the defaults. (This behavior is to enable consistent and +#' sane results when you can't guarantee whether `x` is an `epi_df` or just a +#' `tibble`/`data.frame`. You don't need to use it if you know that `x` is +#' definitely an `epi_df`.) Not accepted when `x` is a `tsibble` or an +#' `epi_archive`. #' @param exclude an optional character vector of key column names to exclude #' from the result #' @return character vector #' @keywords internal #' @export key_colnames <- function(x, ..., exclude = character()) { - UseMethod("key_colnames") + provided_args <- rlang::call_args_names(rlang::call_match()) + if ("extra_keys" %in% provided_args) { + lifecycle::deprecate_soft("0.9.6", "key_colnames(extra_keys=)", "key_colnames(other_keys=)") + redispatch <- function(..., extra_keys) { + key_colnames(..., other_keys = extra_keys) + } + redispatch(x, ..., exclude = exclude) + } else { + UseMethod("key_colnames") + } } #' @rdname key_colnames @@ -35,9 +47,9 @@ key_colnames <- function(x, ..., exclude = character()) { #' @method key_colnames data.frame #' @export key_colnames.data.frame <- function(x, ..., - geo_keys = intersect("geo_value", names(x)), + geo_keys, other_keys, - time_keys = intersect("time_value", names(x)), + time_keys, exclude = character()) { check_dots_empty0(...) assert_character(geo_keys) @@ -61,7 +73,7 @@ key_colnames.data.frame <- function(x, ..., #' @export key_colnames.epi_df <- function(x, ..., geo_keys = "geo_value", - other_keys = NULL, + other_keys = attr(x, "metadata")$other_keys, time_keys = "time_value", exclude = character()) { check_dots_empty0(...) @@ -76,20 +88,16 @@ key_colnames.epi_df <- function(x, ..., ) } expected_other_keys <- attr(x, "metadata")$other_keys - if (is.null(other_keys)) { - other_keys <- expected_other_keys - } else { - if (!identical(other_keys, expected_other_keys)) { - cli_abort(c( - "The provided `other_keys` argument didn't match the `other_keys` of `x`", - "*" = "`other_keys` was {format_chr_with_quotes(other_keys)}", - "*" = "`expected_other_keys` was {format_chr_with_quotes(expected_other_keys)}", - "i" = "If you know that `x` will always be an `epi_df` and - resolve this discrepancy by adjusting the metadata of `x`, you - shouldn't have to pass `other_keys =` here anymore, - unless you want to continue to perform this check." 
- ), class = "epiprocess__key_colnames__mismatched_other_keys") - } + if (!identical(other_keys, expected_other_keys)) { + cli_abort(c( + "The provided `other_keys` argument didn't match the `other_keys` of `x`", + "*" = "`other_keys` was {format_chr_with_quotes(other_keys)}", + "*" = "`expected_other_keys` was {format_chr_with_quotes(expected_other_keys)}", + "i" = "If you know that `x` will always be an `epi_df` and + resolve this discrepancy by adjusting the metadata of `x`, you + shouldn't have to pass `other_keys =` here anymore, + unless you want to continue to perform this check." + ), class = "epiprocess__key_colnames__mismatched_other_keys") } assert_character(exclude) setdiff(c("geo_value", other_keys, "time_value"), exclude) diff --git a/R/revision_analysis.R b/R/revision_analysis.R index 8efc4fce2..1fca61d5e 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -19,47 +19,62 @@ #' 8. `rel_spread`: `spread` divided by the largest value (so it will #' always be less than 1). Note that this need not be the final value. It will #' be `NA` whenever `spread` is 0. -#' 9. `time_near_latest`: the time taken for the revisions to settle to within +#' 9. `lag_near_latest`: the time taken for the revisions to settle to within #' `within_latest` (default 20%) of the final value and stay there. For #' example, consider the series (0, 20, 99, 150, 102, 100); then -#' `time_near_latest` is 5, since even though 99 is within 20%, it is outside +#' `lag_near_latest` is 5, since even though 99 is within 20%, it is outside #' the window afterwards at 150. #' #' @param epi_arch an epi_archive to be analyzed #' @param ... <[`tidyselect`][dplyr_tidy_select]>, used to choose the column to -#' summarize. If empty, it chooses the first. Currently only implemented for -#' one column at a time. +#' summarize. If empty and there is only one value/measurement column (i.e., +#' not in [`key_colnames`]) in the archive, it will automatically select it. +#' If supplied, `...` must select exactly one column. #' @param drop_nas bool, drop any `NA` values from the archive? After dropping -#' `NA`'s compactify is run again to make sure there are no duplicate values -#' from occasions when the signal is revised to `NA`, and then back to its -#' immediately-preceding value. +#' `NA`'s compactify is run again if `should_compactify` is `TRUE` to make +#' sure there are no duplicate values from occasions when the signal is +#' revised to `NA`, and then back to its immediately-preceding value. #' @param print_inform bool, determines whether to print summary information, or #' only return the full summary tibble #' @param min_waiting_period `difftime`, integer or `NULL`. Sets a cutoff: any -#' time_values not earlier than `min_waiting_period` before `versions_end` are -#' removed. `min_waiting_period` should characterize the typical time during -#' which revisions occur. The default of 60 days corresponds to a typical -#' final value for case counts as reported in the context of insurance. To -#' avoid this filtering, either set to `NULL` or 0. +#' time_values that have not had at least `min_waiting_period` to stabilize as +#' of the `versions_end` are removed. `min_waiting_period` should characterize +#' the typical time during which most significant revisions occur. The default +#' of 60 days corresponds to a typical near-final value for case counts as +#' reported in the context of insurance. To avoid this filtering, either set +#' to `NULL` or 0. #' @param within_latest double between 0 and 1. 
Determines the threshold
-#'   used for the `time_to`
+#'   used for the `lag_near_latest` column.
 #' @param quick_revision difftime or integer (integer is treated as days), for
 #'   the printed summary, the amount of time between the final revision and the
 #'   actual time_value to consider the revision quickly resolved. Default of 3
 #'   days
 #' @param few_revisions integer, for the printed summary, the upper bound on the
 #'   number of revisions to consider "few". Default is 3.
-#' @param abs_spread_threshold numeric, for the printed summary, the maximum
-#'   spread used to characterize revisions which don't actually change very
-#'   much. Default is 5% of the maximum value in the dataset, but this is the
-#'   most unit dependent of values, and likely needs to be chosen appropriate
-#'   for the scale of the dataset.
-#' @param rel_spread_threshold float between 0 and 1, for the printed summary,
-#'   the relative spread fraction used to characterize revisions which don't
-#'   actually change very much. Default is .1, or 10% of the final value
-#' @param compactify_tol float, used if `drop_nas=TRUE`, it determines the
-#'   threshold for when two floats are considered identical.
-#' @param should_compactify bool. Compactify if `TRUE`.
+#' @param abs_spread_threshold length-1 numeric, for the printed summary, the
+#'   maximum spread used to characterize revisions which don't actually change
+#'   very much. Default is 5% of the maximum value in the dataset, but this is
+#'   the most unit-dependent of these values, and likely needs to be chosen
+#'   appropriately for the scale of the dataset.
+#' @param rel_spread_threshold length-1 double between 0 and 1, for the printed
+#'   summary, the relative spread fraction used to characterize revisions which
+#'   don't actually change very much. Default is .1, or 10% of the final value.
+#' @param compactify_tol length-1 double, used if `should_compactify` is `TRUE`;
+#'   it determines the threshold for when two doubles are considered identical.
+#' @param should_compactify bool. If `TRUE`, we will compactify after the signal
+#'   requested in `...` has been selected on its own and after the `drop_nas`
+#'   step. This helps, for example, to give similar results when called on
+#'   [merged][epix_merge] and single-signal archives, since merged archives
+#'   record an update when any of the signals changes, not just the requested
+#'   signal. The default is `TRUE`.
+#'
+#' @details Applies to `epi_archive`s with `time_type`s of `"day"`, `"week"`,
+#'   and `"yearmonth"`. It can also work with a `time_type` of `"integer"` if
+#'   the possible `time_values` are all consecutive integers; you will need to
+#'   manually specify the `min_waiting_period` and `quick_revision`, though.
+#'   Using a `time_type` of `"integer"` with week numbers like 202501 will
+#'   produce incorrect results for some calculations, since week numbering
+#'   contains jumps at year boundaries.
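Editorial aside: to make the `@details` warning concrete, a small illustration (toy week labels, not from the patch) of why raw week numbers misbehave under `time_type = "integer"`:

```r
# With time_type = "integer", lags are computed by plain subtraction:
weeks <- c(202451L, 202452L, 202501L)
diff(weeks)
#> [1]  1 49
# 202452 -> 202501 is one real week, but subtraction reports 49 "steps", so
# lag-based columns (min_lag, max_lag, lag_near_latest) would be inflated.
# Converting such labels to Date or tsibble::yearweek avoids the jump.
```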
#' #' @examples #' revision_example <- revision_summary(archive_cases_dv_subset, percent_cli) @@ -67,31 +82,53 @@ #' #' @export #' @importFrom cli cli_inform cli_abort cli_li -#' @importFrom rlang list2 syms +#' @importFrom rlang list2 syms dots_n +#' @importFrom vctrs vec_cast #' @importFrom dplyr mutate group_by arrange filter if_any all_of across pull pick c_across #' everything ungroup summarize if_else %>% revision_summary <- function(epi_arch, ..., drop_nas = TRUE, print_inform = TRUE, - min_waiting_period = as.difftime(60, units = "days"), + min_waiting_period = as.difftime(60, units = "days") %>% + difftime_approx_ceiling_time_delta(epi_arch$time_type), within_latest = 0.2, - quick_revision = as.difftime(3, units = "days"), + quick_revision = as.difftime(3, units = "days") %>% + difftime_approx_ceiling_time_delta(epi_arch$time_type), few_revisions = 3, abs_spread_threshold = NULL, rel_spread_threshold = 0.1, compactify_tol = .Machine$double.eps^0.5, should_compactify = TRUE) { - arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT)) - if (length(arg) == 0) { - # Choose the first column that's not a key or version - arg <- setdiff(names(epi_arch$DT), key_colnames(epi_arch))[[1]] - } else if (length(arg) > 1) { - cli_abort("Not currently implementing more than one column at a time. Run each separately") + assert_class(epi_arch, "epi_archive") + # if the column to summarize isn't specified, use the only one if there is only one + if (dots_n(...) == 0) { + # Choose the first column that's not a key: + value_colnames <- setdiff(names(epi_arch$DT), key_colnames(epi_arch)) + if (length(value_colnames) == 1) { + arg <- value_colnames + } else { + cli_abort(c( + "Cannot determine which column to summarize.", + "i" = "Value/measurement columns appear to be: {format_varnames(value_colnames)}", + ">" = "Please specify which column to summarize in `...` (with tidyselect syntax)." + ), class = "epiprocess__revision_summary_cannot_determine_default_selection") + } + } else { + # get the names of columns matching any tidyselect used in `...` + arg <- names(eval_select(rlang::expr(c(...)), allow_rename = FALSE, data = epi_arch$DT)) + if (length(arg) == 0) { + cli_abort("Could not find any columns matching the selection in `...`.", + class = "epiprocess__revision_summary__selected_zero_columns" + ) + } + if (length(arg) > 1) { + cli_abort("Not currently implementing more than one column at a time. Run each separately.") + } } if (is.null(abs_spread_threshold)) { abs_spread_threshold <- .05 * epi_arch$DT %>% - pull(...) 
%>% + pull(!!arg) %>% max(na.rm = TRUE) } # for each time_value, get @@ -101,58 +138,64 @@ revision_summary <- function(epi_arch, # the max lag # # revision_tibble - keys <- key_colnames(epi_arch, exclude = "version") + epikey_names <- key_colnames(epi_arch, exclude = c("time_value", "version")) + epikeytime_names <- c(epikey_names, "time_value") + keys <- c(epikeytime_names, "version") + time_type <- epi_arch$time_type revision_behavior <- epi_arch$DT %>% - select(all_of(unique(c("geo_value", "time_value", keys, "version", arg)))) + select(all_of(unique(c(keys, arg)))) if (!is.null(min_waiting_period)) { + last_semistable_time_value <- time_minus_n_steps( + epi_arch$versions_end, + time_delta_to_n_steps(min_waiting_period, time_type), + time_type + ) revision_behavior <- revision_behavior %>% - filter(abs(time_value - as.Date(epi_arch$versions_end)) >= min_waiting_period) + filter(time_value <= last_semistable_time_value) } if (drop_nas) { # if we're dropping NA's, we should recompactify revision_behavior <- revision_behavior %>% - filter(!is.na(c_across(!!arg))) + filter(!is.na(.data[[arg]])) } else { revision_behavior <- epi_arch$DT } if (should_compactify) { revision_behavior <- revision_behavior %>% - arrange(across(c(geo_value, time_value, all_of(keys), version))) %>% # need to sort before compactifying - apply_compactify(c(keys, version), compactify_tol) + apply_compactify(keys, compactify_tol) } revision_behavior <- revision_behavior %>% - mutate(lag = as.integer(version) - as.integer(time_value)) %>% # nolint: object_usage_linter - group_by(across(all_of(keys))) %>% # group by all the keys + mutate(lag = time_minus_time_in_n_steps(version, time_value, time_type)) %>% # nolint: object_usage_linter + group_by(across(all_of(epikeytime_names))) %>% # group = versions of one measurement summarize( n_revisions = dplyr::n() - 1, min_lag = min(lag), # nolint: object_usage_linter max_lag = max(lag), # nolint: object_usage_linter - min_value = f_no_na(min, pick(!!arg)), - max_value = f_no_na(max, pick(!!arg)), - median_value = f_no_na(median, pick(!!arg)), - time_to = time_within_x_latest(lag, pick(!!arg), prop = within_latest), # nolint: object_usage_linter + min_value = f_no_na(min, .data[[arg]]), + max_value = f_no_na(max, .data[[arg]]), + median_value = f_no_na(median, .data[[arg]]), + lag_to = lag_within_x_latest(lag, .data[[arg]], prop = within_latest), .groups = "drop" ) %>% mutate( spread = max_value - min_value, # nolint: object_usage_linter rel_spread = spread / max_value, # nolint: object_usage_linter - # TODO the units here may be a problem - min_lag = as.difftime(min_lag, units = "days"), # nolint: object_usage_linter - max_lag = as.difftime(max_lag, units = "days"), # nolint: object_usage_linter - time_near_latest = as.difftime(time_to, units = "days") # nolint: object_usage_linter + min_lag = n_steps_to_time_delta(min_lag, time_type), # nolint: object_usage_linter + max_lag = n_steps_to_time_delta(max_lag, time_type), # nolint: object_usage_linter + lag_near_latest = n_steps_to_time_delta(lag_to, time_type) # nolint: object_usage_linter ) %>% - select(-time_to) %>% + select(-lag_to) %>% relocate( - time_value, geo_value, all_of(keys), n_revisions, min_lag, max_lag, # nolint: object_usage_linter - time_near_latest, spread, rel_spread, min_value, max_value, median_value # nolint: object_usage_linter + time_value, geo_value, all_of(epikey_names), n_revisions, min_lag, max_lag, # nolint: object_usage_linter + lag_near_latest, spread, rel_spread, min_value, max_value, median_value 
# nolint: object_usage_linter ) if (print_inform) { cli_inform("Min lag (time to first version):") - difftime_summary(revision_behavior$min_lag) %>% print() + time_delta_summary(revision_behavior$min_lag, time_type) %>% print() if (!drop_nas) { total_na <- epi_arch$DT %>% filter(is.na(c_across(!!arg))) %>% # nolint: object_usage_linter @@ -167,11 +210,11 @@ revision_summary <- function(epi_arch, cli_inform("No revisions:") cli_li(num_percent(total_num_unrevised, total_num, "")) total_quickly_revised <- sum( # nolint: object_usage_linter - revision_behavior$max_lag <= - as.difftime(quick_revision, units = "days") + time_delta_to_n_steps(revision_behavior$max_lag, time_type) <= + time_delta_to_n_steps(quick_revision, time_type) ) - cli_inform("Quick revisions (last revision within {quick_revision} -{units(quick_revision)} of the `time_value`):") + cli_inform("Quick revisions (last revision within {format_time_delta(quick_revision, time_type)} + of the `time_value`):") cli_li(num_percent(total_quickly_revised, total_num, "")) total_barely_revised <- sum( # nolint: object_usage_linter revision_behavior$n_revisions <= @@ -198,17 +241,21 @@ revision_summary <- function(epi_arch, cli_inform("Spread of more than {abs_spread_threshold} in actual value (when revised):") cli_li(num_percent(abs_spread, n_real_revised, "")) - cli_inform("{units(quick_revision)} until within {within_latest*100}% of the latest value:") - difftime_summary(revision_behavior[["time_near_latest"]]) %>% print() + # time_type_unit_pluralizer[[time_type]] is a format string controlled by us + # and/or downstream devs, so we can paste it onto our format string safely: + units_plural <- pluralize(paste0("{qty(2)}", time_type_unit_pluralizer[[time_type]])) + cli_inform("{toTitleCase(units_plural)} until within {within_latest*100}% of the latest value:") + time_delta_summary(revision_behavior[["lag_near_latest"]], time_type) %>% print() } return(revision_behavior) } -#' pull the value from lags when values starts indefinitely being within prop of it's last value. -#' @param values this should be a 1 column tibble. errors may occur otherwise +#' pull the value from lags when values starts indefinitely being within prop of its latest value. 
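Editorial aside: a toy walk-through (assumed lags) of the run-length logic that `lag_within_x_latest()` and `get_last_run()` below implement, matching the `(0, 20, 99, 150, 102, 100)` example in the roxygen docs above:

```r
values <- c(0, 20, 99, 150, 102, 100) # reported values across versions
lags <- 1:6                           # assumed toy lags, one per version
latest <- values[[length(values)]]
close_enough <- abs(values - latest) < 0.2 * latest
close_enough
#> [1] FALSE FALSE  TRUE FALSE  TRUE  TRUE
# 99 is within 20% of 100, but the series leaves the band again at 150, so
# only the final run of TRUEs counts; report the lag at which it begins:
runs <- rle(close_enough)
lags[[length(close_enough) - tail(runs$lengths, n = 1) + 1]]
#> [1] 5
```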
+#' @param lags vector of lags; should be sorted +#' @param values this should be a vector (e.g., a column) with length matching that of `lags` +#' @param prop optional length-1 double; proportion #' @keywords internal -time_within_x_latest <- function(lags, values, prop = .2) { - values <- values[[1]] +lag_within_x_latest <- function(lags, values, prop = .2) { latest_value <- values[[length(values)]] close_enough <- abs(values - latest_value) < prop * latest_value # we want to ignore any stretches where it's close, but goes farther away later @@ -224,11 +271,10 @@ time_within_x_latest <- function(lags, values, prop = .2) { #' @keywords internal get_last_run <- function(bool_vec, values_from) { runs <- rle(bool_vec) - length(bool_vec) - tail(runs$lengths, n = 1) values_from[[length(bool_vec) - tail(runs$lengths, n = 1) + 1]] } -#' use when the default behavior returns a warning on empty lists, which we do +#' use when the default behavior returns a warning on empty vectors, which we do #' not want, and there is no super clean way of preventing this #' @keywords internal f_no_na <- function(f, x) { @@ -248,18 +294,25 @@ num_percent <- function(a, b, b_description) { ({round(a/b*100,digits=2)}%)") } -#' summary doesn't work on difftimes +#' Like `summary` but working across all "time deltas", including difftimes +#' +#' Also standardizes units of difftimes to the natural unit for the given +#' `time_type` (via conversion to and from a corresponding number of time +#' steps). +#' #' @keywords internal -difftime_summary <- function(diff_time_val) { - if (length(diff_time_val) > 0) { +time_delta_summary <- function(time_delta, time_type) { + if (length(time_delta) > 0) { + n_steps <- time_delta_to_n_steps(time_delta, time_type) res <- data.frame( - min = min(diff_time_val), - median = median(diff_time_val), - mean = round(mean(diff_time_val), 1), - max = max(diff_time_val), + min = min(n_steps), + median = median(n_steps), + mean = round(mean(n_steps), 1), + max = max(n_steps), row.names = " ", check.names = FALSE - ) + ) %>% + mutate(across(c(min, median, mean, max), ~ .x * unit_time_delta(time_type))) return(res) } else { return(data.frame()) diff --git a/R/slide.R b/R/slide.R index 761639d44..600f5d605 100644 --- a/R/slide.R +++ b/R/slide.R @@ -979,7 +979,7 @@ epi_slide_opt <- function( #' @rdname epi_slide_opt #' @description `epi_slide_mean` is a wrapper around `epi_slide_opt` with `.f = -#' datatable::frollmean`. +#' data.table::frollmean`. #' #' @export epi_slide_mean <- function( @@ -1039,7 +1039,7 @@ epi_slide_mean <- function( #' @rdname epi_slide_opt #' @description `epi_slide_sum` is a wrapper around `epi_slide_opt` with `.f = -#' datatable::frollsum`. +#' data.table::frollsum`. #' #' @export epi_slide_sum <- function( diff --git a/R/time-utils.R b/R/time-utils.R new file mode 100644 index 000000000..843baaf82 --- /dev/null +++ b/R/time-utils.R @@ -0,0 +1,370 @@ +#' Use max valid period as guess for `period` of `time_values` +#' +#' `r lifecycle::badge("experimental")` +#' +#' @param time_values Vector containing time-interval-like or time-point-like +#' data, with at least two distinct values. +#' @param time_values_arg Optional, string; name to give `time_values` in error +#' messages. Defaults to quoting the expression the caller fed into the +#' `time_values` argument. +#' @param ... Should be empty, there to satisfy the S3 generic. 
+#' @return length-1 vector; `r lifecycle::badge("experimental")` class will +#' either be the same class as [`base::diff()`] on such time values, an +#' integer, or a double, such that all `time_values` can be exactly obtained +#' by adding `k * result` for an integer k, and such that there is no smaller +#' `result` that can achieve this. +#' +#' @keywords internal +#' @export +guess_period <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { + UseMethod("guess_period") +} + +#' @export +guess_period.default <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { + rlang::check_dots_empty() + sorted_distinct_time_values <- sort(unique(time_values)) + if (length(sorted_distinct_time_values) < 2L) { + cli_abort("Not enough distinct values in {.code {time_values_arg}} to guess the period.", + class = "epiprocess__guess_period__not_enough_times", + time_values = time_values + ) + } + skips <- diff(sorted_distinct_time_values) + # Certain diff results have special classes or attributes; use vctrs to try to + # appropriately destructure for gcd_num, then restore to their original class + # & attributes. + skips_data <- vctrs::vec_data(skips) + period_data <- gcd_num(skips_data, rrtol = 0) + vctrs::vec_restore(period_data, skips) +} + +# `full_seq()` doesn't like difftimes, so convert to the natural units of some time types: + +#' @export +guess_period.Date <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { + as.numeric(NextMethod(), units = "days") +} + +#' @export +guess_period.POSIXt <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { + as.numeric(NextMethod(), units = "secs") +} + +#' Validate `.before` or `.window_size` argument +#' @keywords internal +validate_slide_window_arg <- function(arg, time_type, lower = 1, allow_inf = TRUE, arg_name = rlang::caller_arg(arg)) { + if (time_type == "custom") { + cli_abort( + "Unsure how to interpret slide units with a custom time type. Consider converting your time + column to a Date, yearmonth, or integer type.", + class = "epiprocess__validate_slide_window_arg" + ) + } + + msg <- "" + inf_if_okay <- if (allow_inf) { + "Inf" + } else { + character(0L) + } + + # nolint start: indentation_linter. 
+  if (time_type == "day") {
+    if (!(test_sensible_int(arg, lower = lower) ||
+      inherits(arg, "difftime") && length(arg) == 1L && units(arg) == "days" ||
+      allow_inf && identical(arg, Inf)
+    )) {
+      msg <- glue::glue_collapse(c("length-1 difftime with units in days", "non-negative integer", inf_if_okay), " or ")
+    }
+  } else if (time_type == "week") {
+    if (!(inherits(arg, "difftime") && length(arg) == 1L && units(arg) == "weeks" ||
+      allow_inf && identical(arg, Inf)
+    )) {
+      msg <- glue::glue_collapse(c("length-1 difftime with units in weeks", inf_if_okay), " or ")
+    }
+  } else if (time_type == "yearmonth") {
+    if (!(test_sensible_int(arg, lower = lower) ||
+      allow_inf && identical(arg, Inf)
+    )) {
+      msg <- glue::glue_collapse(c("non-negative integer", inf_if_okay), " or ")
+    }
+  } else if (time_type == "integer") {
+    if (!(test_sensible_int(arg, lower = lower) ||
+      allow_inf && identical(arg, Inf)
+    )) {
+      msg <- glue::glue_collapse(c("non-negative integer", inf_if_okay), " or ")
+    }
+  } else {
+    cli_abort('`epiprocess` internal error: unrecognized time_type: "{time_type}"',
+      class = "epiprocess__unrecognized_time_type"
+    )
+  }
+  # nolint end
+
+  if (msg != "") {
+    cli_abort(
+      "Slide function expected `{arg_name}` to be a {msg}.",
+      class = "epiprocess__validate_slide_window_arg"
+    )
+  }
+}
+
+#' Object that, added to time_values of time_type, advances by one time step/interval
+#'
+#' @param time_type string; `epi_df`'s or `epi_archive`'s `time_type`
+#' @param format "friendly" or "fast"; for some time_types, there are multiple
+#'   ways to represent time_deltas. "friendly" tries to output a format that
+#'   will be more informative when printed, and produce errors in more cases
+#'   when used in unexpected ways. "fast" tries to output a time_delta that will
+#'   be faster in downstream operations.
+#' @return an object `u` such that `time_values + u` represents advancing by one
+#'   time step / moving to the subsequent time interval for any `time_values`
+#'   object of time type `time_type`, and such that `time_values + k * u` for
+#'   integerish vector `k` advances by `k` steps (with vectorization,
+#'   recycling). At time of writing, these objects also all support
+#'   multiplication by nonintegerish numeric vectors, `mean`, and `median`,
+#'   which are useful for summarizing vector time_deltas, but these fractional
+#'   time_deltas are not allowed in time_delta-specific operations.
+#'
+#' @keywords internal
+unit_time_delta <- function(time_type, format = c("friendly", "fast")) {
+  format <- rlang::arg_match(format)
+  switch(format,
+    friendly = switch(time_type,
+      day = as.difftime(1, units = "days"),
+      week = as.difftime(1, units = "weeks"),
+      yearmonth = 1,
+      integer = 1L,
+      cli_abort("Unsupported time_type: {time_type}")
+    ),
+    fast = switch(time_type,
+      day = 1,
+      week = 7,
+      yearmonth = 1,
+      integer = 1L,
+      cli_abort("Unsupported time_type: {time_type}")
+    )
+  )
+}
+
+#' Convert a time delta to an integerish number of "unit" steps between time values
+#'
+#' @param time_delta a vector that can be added to time values of time type
+#'   `time_type` to arrive at other time values of that time type, or
+#'   `r lifecycle::badge("experimental")` such a vector with Inf/-Inf entries mixed
+#'   in, if supported by the class of `time_delta`, even if `time_type` doesn't
+#'   necessarily support Inf/-Inf entries. Basically a slide window arg but
+#'   without sign and length restrictions.
+#' @param time_type as in `validate_slide_window_arg`
+#' @return [bare integerish][rlang::is_integerish] vector (with possible
+#'   infinite values) that produces the same result as `time_delta` when
+#'   multiplied by the natural [`unit_time_delta`] for that time type and added
+#'   to time values of time type `time_type`. If the given time type does not
+#'   support infinite values, then it should produce +Inf or -Inf for the
+#'   analogous entries of `time_delta`, and match the addition result for
+#'   non-infinite entries.
+#'
+#' @keywords internal
+time_delta_to_n_steps <- function(time_delta, time_type) {
+  # could be S3 if we're willing to export
+  if (inherits(time_delta, "difftime")) {
+    output_units <- switch(time_type,
+      day = "days",
+      week = "weeks",
+      cli_abort("difftime objects not supported for time_type {format_chr_with_quotes(time_type)}")
+    )
+    units(time_delta) <- output_units # converts number to represent same duration; not just attr<-
+    n_steps <- vec_data(time_delta)
+    if (!is_bare_integerish(n_steps)) {
+      cli_abort("`time_delta` did not appear to contain only integerish numbers
+                 of steps between time values of time type {format_chr_with_quotes(time_type)}")
+    }
+    n_steps
+  } else if (is_bare_integerish(time_delta)) { # (allows infinite values)
+    switch(time_type,
+      day = ,
+      week = ,
+      yearmonth = ,
+      integer = time_delta,
+      cli_abort("Invalid or unsupported time_type {format_chr_with_quotes(time_type)}")
+    )
+  } else {
+    cli_abort("Invalid or unsupported kind of `time_delta`")
+  }
+}
+
+#' Convert from integerish/infinite/mix to time_delta
+#'
+#' @param n_steps integerish vector that can mix in infinite values
+#' @param time_type as in [`validate_slide_window_arg`]
+#' @param format optional; `"friendly"` to output a more descriptive/friendly
+#'   class like `"difftime"` when possible; `"fast"` to output a class that's
+#'   generally faster to work with when possible, like a vanilla `"numeric"`.
+#'   Default is `"friendly"`.
+#'
+#' @keywords internal
+n_steps_to_time_delta <- function(n_steps, time_type, format = c("friendly", "fast")) {
+  if (!is_bare_integerish(n_steps)) {
+    cli_abort("`n_steps` did not appear to be integerish (or infinite, or a mix)")
+  }
+  n_steps * unit_time_delta(time_type, format)
+}
+
+#' Standardize time_deltas to a multiple of [`unit_time_delta()`]
+#'
+#' @keywords internal
+time_delta_standardize <- function(time_delta, time_type, format = c("friendly", "fast")) {
+  time_delta_to_n_steps(time_delta, time_type) * unit_time_delta(time_type, format)
+}
+
+#' Helper data for [`time_type_unit_abbr`]
+#'
+#' @keywords internal
+time_type_unit_abbrs <- c(
+  day = "d",
+  week = "w",
+  yearmonth = "m"
+)
+# ^ Using these unit abbreviations happens to make our automatic slide output
+# naming look like taking ISO-8601 duration designations, removing the P, and
+# lowercasing any characters. Fortnightly or sub-daily time types would need an
+# adjustment to remain consistent.
+
+#' Get an abbreviation for the "units" of `unit_time_delta(time_type)`
+#'
+#' For use in formatting or automatically naming things based on
+#' `time_delta_to_n_steps(time_delta)` for a `time_delta` between times of time
+#' type `time_type`.
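Editorial aside: a quick sketch of the round trip the conversion helpers above provide (toy values; assumes the definitions in this file are loaded):

```r
time_delta_to_n_steps(as.difftime(14, units = "days"), "day")
#> [1] 14
time_delta_to_n_steps(as.difftime(2, units = "weeks"), "week")
#> [1] 2
# A delta that isn't a whole number of steps aborts, e.g. 10 days on a weekly
# time_type is ~1.43 weeks:
# time_delta_to_n_steps(as.difftime(10, units = "days"), "week") # -> error

n_steps_to_time_delta(3, "week")         # "friendly": difftime of 3 weeks
n_steps_to_time_delta(3, "week", "fast") # "fast": plain numeric 21 (days)
```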
+#'
+#' @param time_type str
+#' @return str
+#'
+#' @keywords internal
+time_type_unit_abbr <- function(time_type) {
+  maybe_unit_abbr <- time_type_unit_abbrs[time_type]
+  if (is.na(maybe_unit_abbr)) {
+    cli_abort("Cannot determine the units of time type {format_chr_with_quotes(time_type)}")
+  }
+  maybe_unit_abbr
+}
+
+#' Helper data for [`format_time_delta`]
+#'
+#' Should not be altered on the basis of untrusted user input, as it is used as
+#' a cli format string and may run code.
+#'
+#' @keywords internal
+time_type_unit_pluralizer <- c(
+  day = "day{?s}",
+  week = "week{?s}",
+  yearmonth = "month{?s}",
+  integer = "time step{?s}"
+)
+
+#' Format a length-1 time delta to a character to assist messaging
+#'
+#' This is meant to address the following:
+#' - glue::glue("{as.difftime(1, units = 'days')}") is "1"
+#' - glue::glue("{format(as.difftime(1, units = 'days'))}") is "1 days"
+#' - time deltas for yearmonths and integers don't have units attached at all
+#'
+#' @keywords internal
+format_time_delta <- function(x, time_type) {
+  n_steps <- time_delta_to_n_steps(x, time_type)
+  # time_type_unit_pluralizer[[time_type]] is a format string controlled by us
+  # and/or downstream devs, so we can paste it onto our format string safely:
+  pluralize(paste0("{n_steps} ", time_type_unit_pluralizer[[time_type]]))
+}
+
+#' Convert `time_delta` to an approximate difftime
+#'
+#' `r lifecycle::badge("experimental")`
+#'
+#' To assist in comparing `time_delta`s to default `difftime` thresholds when we
+#' want to reduce friction.
+#'
+#' It may be better to try to do something like make `time_delta` validation
+#' more accommodating (e.g., of difftimes with units of "days" when working on
+#' weekly scale), and remain rigid on yearmonths. Applying deltas and comparing
+#' time_values might also be an approach, but it seems more fraught: the least
+#' common denominator would be start/mid/end datetimes of time intervals, and
+#' those are also ambiguous (the starting/representative weekdays of weeks are
+#' unknown, and the time zones of dates are unknown).
+#'
+#' Another alternative approach, below, converts difftimes to time_deltas
+#' instead. It requires knowledge of which way to round in order to get
+#' time_deltas representing an integer number of time steps, but avoids some
+#' potential inconsistencies of the time-delta-to-difftime approach when we
+#' think about applying it to, e.g., months / spans of months with varying
+#' numbers of days, and also makes it easier to avoid "magical defaults".
+#'
+#' @keywords internal
+time_delta_to_approx_difftime <- function(time_delta, time_type) {
+  switch(time_type,
+    day = ,
+    week = time_delta_standardize(time_delta, time_type, "friendly"),
+    yearmonth = time_delta * as.difftime(30, units = "days"),
+    integer = ,
+    cli_abort("Unsupported time_type for this operation: {time_type}")
+  )
+}
+
+#' Closest time_delta that's approximately greater than or equal to given difftime
+#'
+#' `r lifecycle::badge("experimental")`
+#'
+#' @param difftime a difftime object
+#' @param time_type as in [`validate_slide_window_arg`]
+#' @return An object representing an integerish number (or vector of numbers) of
+#'   time steps between consecutive time_values of type `time_type`.
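Editorial aside: a usage sketch for the approximate ceiling defined just below; this is what turns `revision_summary()`'s 60-day defaults into a whole number of steps for weekly or monthly archives (toy values):

```r
difftime_approx_ceiling_time_delta(as.difftime(60, units = "days"), "week")
#> Time difference of 9 weeks
# (60 days is ~8.57 weeks; rounding up keeps the cutoff conservative)
difftime_approx_ceiling_time_delta(as.difftime(60, units = "days"), "yearmonth")
#> [1] 2
# (months are approximated as 30 days, so ceiling(60 / 30) = 2 steps)
```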
+#' +#' @keywords internal +difftime_approx_ceiling_time_delta <- function(difftime, time_type) { + assert_class(difftime, "difftime") + switch(time_type, + day = , + week = { + units(difftime) <- paste0(time_type, "s") + ceiling(difftime) + }, + yearmonth = { + units(difftime) <- "days" + ceiling(as.numeric(difftime) / 30) + }, + integer = , + cli_abort("Unsupported time_type for this operation: {time_type}") + ) +} + +#' Difference between two time value vectors in terms of number of time "steps" +#' +#' @param x a time_value (vector) of time type `time_type` +#' @param y a time_value (vector) of time type `time_type` +#' @param time_type as in [`validate_slide_window_arg()`] +#' @return integerish vector such that `x + n_steps_to_time_delta_fast(result)` +#' should equal `y`. +#' +#' @keywords internal +time_minus_time_in_n_steps <- function(x, y, time_type) { + time_delta_to_n_steps(x - y, time_type) +} + +#' Advance/retreat time_values by specified number of time "steps" +#' +#' Here, a "step" is based on the `time_type`, not just the class of `x`. +#' +#' @param x a time_value (vector) of time type `time_type` +#' @param y integerish (vector) +#' @param time_type as in [`validate_slide_window_arg()`] +#' @return a time_value (vector) of time type `time_type` +#' +#' @keywords internal +time_plus_n_steps <- function(x, y, time_type) { + x + y * unit_time_delta(time_type, "fast") +} + +#' @rdname time_plus_n_steps +time_minus_n_steps <- function(x, y, time_type) { + x - y * unit_time_delta(time_type, "fast") +} diff --git a/R/utils.R b/R/utils.R index e350ade25..f422aa4e6 100644 --- a/R/utils.R +++ b/R/utils.R @@ -979,59 +979,6 @@ gcd_num <- function(dividends, ..., rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) { vctrs::vec_cast(numeric_gcd, dividends) } -#' Use max valid period as guess for `period` of `time_values` -#' -#' `r lifecycle::badge("experimental")` -#' -#' @param time_values Vector containing time-interval-like or time-point-like -#' data, with at least two distinct values. -#' @param time_values_arg Optional, string; name to give `time_values` in error -#' messages. Defaults to quoting the expression the caller fed into the -#' `time_values` argument. -#' @param ... Should be empty, there to satisfy the S3 generic. -#' @return length-1 vector; `r lifecycle::badge("experimental")` class will -#' either be the same class as [`base::diff()`] on such time values, an -#' integer, or a double, such that all `time_values` can be exactly obtained -#' by adding `k * result` for an integer k, and such that there is no smaller -#' `result` that can achieve this. -#' -#' @keywords internal -#' @export -guess_period <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { - UseMethod("guess_period") -} - -#' @export -guess_period.default <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { - rlang::check_dots_empty() - sorted_distinct_time_values <- sort(unique(time_values)) - if (length(sorted_distinct_time_values) < 2L) { - cli_abort("Not enough distinct values in {.code {time_values_arg}} to guess the period.", - class = "epiprocess__guess_period__not_enough_times", - time_values = time_values - ) - } - skips <- diff(sorted_distinct_time_values) - # Certain diff results have special classes or attributes; use vctrs to try to - # appropriately destructure for gcd_num, then restore to their original class - # & attributes. 
- skips_data <- vctrs::vec_data(skips) - period_data <- gcd_num(skips_data, rrtol = 0) - vctrs::vec_restore(period_data, skips) -} - -# `full_seq()` doesn't like difftimes, so convert to the natural units of some time types: - -#' @export -guess_period.Date <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { - as.numeric(NextMethod(), units = "days") -} - -#' @export -guess_period.POSIXt <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) { - as.numeric(NextMethod(), units = "secs") -} - #' Is `x` an "int" with a sensible class? TRUE/FALSE #' #' Like [`checkmate::test_int`] but disallowing some non-sensible classes that @@ -1053,148 +1000,6 @@ test_sensible_int <- function(x, na.ok = FALSE, lower = -Inf, upper = Inf, # nol } } -validate_slide_window_arg <- function(arg, time_type, lower = 1, allow_inf = TRUE, arg_name = rlang::caller_arg(arg)) { - if (time_type == "custom") { - cli_abort( - "Unsure how to interpret slide units with a custom time type. Consider converting your time - column to a Date, yearmonth, or integer type.", - class = "epiprocess__validate_slide_window_arg" - ) - } - - msg <- "" - inf_if_okay <- if (allow_inf) { - "Inf" - } else { - character(0L) - } - - # nolint start: indentation_linter. - if (time_type == "day") { - if (!(test_sensible_int(arg, lower = lower) || - inherits(arg, "difftime") && length(arg) == 1L && units(arg) == "days" || - allow_inf && identical(arg, Inf) - )) { - msg <- glue::glue_collapse(c("length-1 difftime with units in days", "non-negative integer", inf_if_okay), " or ") - } - } else if (time_type == "week") { - if (!(inherits(arg, "difftime") && length(arg) == 1L && units(arg) == "weeks" || - allow_inf && identical(arg, Inf) - )) { - msg <- glue::glue_collapse(c("length-1 difftime with units in weeks", inf_if_okay), " or ") - } - } else if (time_type == "yearmonth") { - if (!(test_sensible_int(arg, lower = lower) || - allow_inf && identical(arg, Inf) - )) { - msg <- glue::glue_collapse(c("non-negative integer", inf_if_okay), " or ") - } - } else if (time_type == "integer") { - if (!(test_sensible_int(arg, lower = lower) || - allow_inf && identical(arg, Inf) - )) { - msg <- glue::glue_collapse(c("non-negative integer", inf_if_okay), " or ") - } - } else { - cli_abort('`epiprocess` internal error: unrecognized time_type: "{time_type}"', - class = "epiprocess__unrecognized_time_type" - ) - } - # nolint end - - if (msg != "") { - cli_abort( - "Slide function expected `{arg_name}` to be a {msg}.", - class = "epiprocess__validate_slide_window_arg" - ) - } -} - - -#' Convert a time delta to a integerish number of "unit" steps between time values -#' -#' @param time_delta a vector that can be added to time values of time type -#' `time_type` to arrive at other time values of that time type, or -#' `r lifecycle::badge("experimental")` such a vector with Inf/-Inf entries mixed -#' in, if supported by the class of `time_delta`, even if `time_type` doesn't -#' necessarily support Inf/-Inf entries. Basically a slide window arg but -#' without sign and length restrictions. -#' @param time_type as in `validate_slide_window_arg` -#' @return [bare integerish][rlang::is_integerish] vector (with possible -#' infinite values) that produces the same result as `time_delta` when -#' multiplied by the natural [`unit_time_delta`] for -#' that time type and added to time values of time type `time_type`. 
If the -#' given time type does not support infinite values, then it should produce -#' +Inf or -Inf for analogous entries of `time_delta`, and match the addition -#' result match the addition result for non-infinite entries. -#' -#' @keywords internal -time_delta_to_n_steps <- function(time_delta, time_type) { - # could be S3 if we're willing to export - if (inherits(time_delta, "difftime")) { - output_units <- switch(time_type, - day = "days", - week = "weeks", - cli_abort("difftime objects not supported for time_type {format_chr_with_quotes(time_type)}") - ) - units(time_delta) <- output_units # converts number to represent same duration; not just attr<- - n_steps <- vec_data(time_delta) - if (!is_bare_integerish(n_steps)) { - cli_abort("`time_delta` did not appear to contain only integerish numbers - of steps between time values of time type {format_chr_with_quotes(time_type)}") - } - n_steps - } else if (is_bare_integerish(time_delta)) { # (allows infinite values) - switch(time_type, - day = , - week = , - yearmonth = , - integer = time_delta, - cli_abort("Invalid or unsupported time_type {format_chr_with_quotes(time_type)}") - ) - } else { - cli_abort("Invalid or unsupported kind of `time_delta`") - } -} - -#' Object that, added to time_values of time_type, advances by one time step/interval -#' -#' @param time_type string; `epi_df`'s or `epi_archive`'s `time_type` -#' @return an object `u` such that `time_values + u` represents advancing by one -#' time step / moving to the subsequent time interval for any `time_values` -#' object of time type `time_type`, and such that `time_values + k * u` for -#' integerish vector `k` advances by `k` steps (with vectorization, -#' recycling). -#' -#' @keywords internal -unit_time_delta <- function(time_type) { - switch(time_type, - day = as.difftime(1, units = "days"), - week = as.difftime(1, units = "weeks"), - yearmonth = 1, - integer = 1L, - cli_abort("Unsupported time_type: {time_type}") - ) -} - -# Using these unit abbreviations happens to make our automatic slide output -# naming look like taking ISO-8601 duration designations, removing the P, and -# lowercasing any characters. Fortnightly or sub-daily time types would need an -# adjustment to remain consistent. -time_type_unit_abbrs <- c( - day = "d", - week = "w", - yearmonth = "m" -) - -time_type_unit_abbr <- function(time_type) { - maybe_unit_abbr <- time_type_unit_abbrs[time_type] - if (is.na(maybe_unit_abbr)) { - cli_abort("Cannot determine the units of time type {format_chr_with_quotes(time_type)}") - } - maybe_unit_abbr -} - #' Extract singular element of a length-1 unnamed list (validated) #' #' Inverse of `list(elt)`. diff --git a/README.Rmd b/README.Rmd index 0e8756d3d..e6abc04f8 100644 --- a/README.Rmd +++ b/README.Rmd @@ -95,29 +95,29 @@ df <- pub_covidcast( df ``` -Convert the data to an epi_df object and sort by geo_value and time_value. You +Convert the data to an `epi_df` object and sort by `geo_value` and `time_value`. You can work with an `epi_df` like you can with a `{tibble}` by using `{dplyr}` -verbs +verbs. 
```{r} edf <- df %>% as_epi_df(as_of = as.Date("2024-01-01")) %>% arrange_canonical() %>% group_by(geo_value) %>% - mutate(cases_daily = cases_cumulative - lag(cases_cumulative, default = 0)) + mutate(cases_daily = cases_cumulative - lag(cases_cumulative, default = 0)) %>% + ungroup() edf ``` -Compute the 7 day moving average of the confirmed daily cases for each geo_value +Compute the 7 day moving average of the confirmed daily cases for each `geo_value` ```{r} edf <- edf %>% - group_by(geo_value) %>% epi_slide_mean(cases_daily, .window_size = 7, na.rm = TRUE, .prefix = "smoothed_") edf ``` -Autoplot the confirmed daily cases for each geo_value +Autoplot the confirmed daily cases for each `geo_value` ```{r} edf %>% diff --git a/README.md b/README.md index dacc458d1..5f076ed79 100644 --- a/README.md +++ b/README.md @@ -101,16 +101,17 @@ df #> # ℹ 2,802 more rows ``` -Convert the data to an epi_df object and sort by geo_value and -time_value. You can work with an `epi_df` like you can with a `{tibble}` -by using `{dplyr}` verbs +Convert the data to an `epi_df` object and sort by `geo_value` and +`time_value`. You can work with an `epi_df` like you can with a +`{tibble}` by using `{dplyr}` verbs. ``` r edf <- df %>% as_epi_df(as_of = as.Date("2024-01-01")) %>% arrange_canonical() %>% group_by(geo_value) %>% - mutate(cases_daily = cases_cumulative - lag(cases_cumulative, default = 0)) + mutate(cases_daily = cases_cumulative - lag(cases_cumulative, default = 0)) %>% + ungroup() edf #> An `epi_df` object, 2,808 x 4 with metadata: #> * geo_type = state @@ -118,7 +119,6 @@ edf #> * as_of = 2024-01-01 #> #> # A tibble: 2,808 × 4 -#> # Groups: geo_value [4] #> geo_value time_value cases_cumulative cases_daily #> * #> 1 ca 2020-03-01 19 19 @@ -131,11 +131,10 @@ edf ``` Compute the 7 day moving average of the confirmed daily cases for each -geo_value +`geo_value` ``` r edf <- edf %>% - group_by(geo_value) %>% epi_slide_mean(cases_daily, .window_size = 7, na.rm = TRUE, .prefix = "smoothed_") edf #> An `epi_df` object, 2,808 x 5 with metadata: @@ -144,7 +143,6 @@ edf #> * as_of = 2024-01-01 #> #> # A tibble: 2,808 × 5 -#> # Groups: geo_value [4] #> geo_value time_value cases_cumulative cases_daily smoothed_cases_daily #> * #> 1 ca 2020-03-01 19 19 19 @@ -156,7 +154,7 @@ edf #> # ℹ 2,802 more rows ``` -Autoplot the confirmed daily cases for each geo_value +Autoplot the confirmed daily cases for each `geo_value` ``` r edf %>% diff --git a/man/difftime_approx_ceiling_time_delta.Rd b/man/difftime_approx_ceiling_time_delta.Rd new file mode 100644 index 000000000..060cadadd --- /dev/null +++ b/man/difftime_approx_ceiling_time_delta.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\name{difftime_approx_ceiling_time_delta} +\alias{difftime_approx_ceiling_time_delta} +\title{Closest time_delta that's approximately greater than or equal to given difftime} +\usage{ +difftime_approx_ceiling_time_delta(difftime, time_type) +} +\arguments{ +\item{difftime}{a difftime object} + +\item{time_type}{as in \code{\link{validate_slide_window_arg}}} +} +\value{ +An object representing an integerish number (or vector of numbers) of +time steps between consecutive time_values of type \code{time_type}. 
+} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} +} +\keyword{internal} diff --git a/man/difftime_summary.Rd b/man/difftime_summary.Rd deleted file mode 100644 index ef153f3d1..000000000 --- a/man/difftime_summary.Rd +++ /dev/null @@ -1,12 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/revision_analysis.R -\name{difftime_summary} -\alias{difftime_summary} -\title{summary doesn't work on difftimes} -\usage{ -difftime_summary(diff_time_val) -} -\description{ -summary doesn't work on difftimes -} -\keyword{internal} diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 68244410b..4b75e9ffb 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -139,9 +139,9 @@ or \link[slider:summary-slide]{slider::summary-slide} function over variables in These functions tend to be much faster than \code{epi_slide()}. See \code{vignette("epi_df")} for more examples. -\code{epi_slide_mean} is a wrapper around \code{epi_slide_opt} with \code{.f = datatable::frollmean}. +\code{epi_slide_mean} is a wrapper around \code{epi_slide_opt} with \code{.f = data.table::frollmean}. -\code{epi_slide_sum} is a wrapper around \code{epi_slide_opt} with \code{.f = datatable::frollsum}. +\code{epi_slide_sum} is a wrapper around \code{epi_slide_opt} with \code{.f = data.table::frollsum}. } \section{Prefix and suffix shorthand}{ diff --git a/man/f_no_na.Rd b/man/f_no_na.Rd index 9a832d729..1e3acb6f7 100644 --- a/man/f_no_na.Rd +++ b/man/f_no_na.Rd @@ -2,13 +2,13 @@ % Please edit documentation in R/revision_analysis.R \name{f_no_na} \alias{f_no_na} -\title{use when the default behavior returns a warning on empty lists, which we do +\title{use when the default behavior returns a warning on empty vectors, which we do not want, and there is no super clean way of preventing this} \usage{ f_no_na(f, x) } \description{ -use when the default behavior returns a warning on empty lists, which we do +use when the default behavior returns a warning on empty vectors, which we do not want, and there is no super clean way of preventing this } \keyword{internal} diff --git a/man/format_time_delta.Rd b/man/format_time_delta.Rd new file mode 100644 index 000000000..88ba581a2 --- /dev/null +++ b/man/format_time_delta.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\name{format_time_delta} +\alias{format_time_delta} +\title{Format a length-1 time delta to a character to assist messaging} +\usage{ +format_time_delta(x, time_type) +} +\description{ +This is meant to address the following: +\itemize{ +\item glue::glue("{as.difftime(1, units = 'days')}") is "1" +\item glue::glue("{format(as.difftime(1, units = 'days'))}") is "1 days" +\item time deltas for yearmonths and integers don't have units attached at all +} +} +\keyword{internal} diff --git a/man/guess_period.Rd b/man/guess_period.Rd index 5f17cf4ef..9cbfebc6c 100644 --- a/man/guess_period.Rd +++ b/man/guess_period.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/time-utils.R \name{guess_period} \alias{guess_period} \title{Use max valid period as guess for \code{period} of \code{time_values}} diff --git a/man/key_colnames.Rd b/man/key_colnames.Rd index 925601a6d..3a1865bde 100644 --- a/man/key_colnames.Rd +++ b/man/key_colnames.Rd @@ 
-10,20 +10,13 @@ \usage{ key_colnames(x, ..., exclude = character()) -\method{key_colnames}{data.frame}( - x, - ..., - geo_keys = intersect("geo_value", names(x)), - other_keys, - time_keys = intersect("time_value", names(x)), - exclude = character() -) +\method{key_colnames}{data.frame}(x, ..., geo_keys, other_keys, time_keys, exclude = character()) \method{key_colnames}{epi_df}( x, ..., geo_keys = "geo_value", - other_keys = NULL, + other_keys = attr(x, "metadata")$other_keys, time_keys = "time_value", exclude = character() ) @@ -33,30 +26,31 @@ key_colnames(x, ..., exclude = character()) \method{key_colnames}{epi_archive}(x, ..., exclude = character()) } \arguments{ -\item{x}{an object, such as an \code{\link{epi_df}}} +\item{x}{an object, often a data frame or something similar. \code{{epiprocess}} +includes implementations for \code{\link{epi_df}}s, \code{\link{epi_archive}}s, +\code{\link[tsibble:tsibble]{tsibble::tsibble}}s, and other data frames (including +\code{\link[tibble:tibble]{tibble::tibble}}s); other packages, like \code{{epipredict}}, can add more.} \item{...}{additional arguments passed on to methods} \item{exclude}{an optional character vector of key column names to exclude from the result} -\item{geo_keys}{optional character vector; which columns (if any) to consider -keys specifying the geographical region? Defaults to \code{"geo_value"} if -present; must be \code{"geo_value"} if \code{x} is an \code{epi_df}.} - -\item{other_keys}{character vector; which columns (if any) to consider keys -specifying demographical or identifying/grouping information besides the -geographical region and time interval? Mandatory if \code{x} is a vanilla -\code{data.frame} or \code{tibble}. Optional if \code{x} is an \code{epi_df}; default is the -\code{epi_df}'s \code{other_keys}; if you provide \code{other_keys}, they must match the -default. (This behavior is to enable consistent and sane results when you -can't guarantee whether \code{x} is an \code{epi_df} or just a -\code{tibble}/\code{data.frame}.)} - -\item{time_keys}{optional character vector; which columns (if any) to -consider keys specifying the time interval during which associated events -occurred? Defaults to \code{"time_value"} if present; must be \code{"time_value"} if -\code{x} is an \code{epi_df}.} +\item{geo_keys, other_keys, time_keys}{character vectors, sometimes optional; +which variables (if any) should be considered as part of a unique +key/identifier for data in \code{x}, dealing respectively with the associated +geographical region, demographic/strain/other information needed in +addition to the geographical region to identify individual time series in +\code{x}, and time interval during which associated events occurred. + +Mandatory if \code{x} is a regular \code{data.frame} or \code{tibble}. Optional if \code{x} is +an \code{epi_df}; the defaults are \code{"geo_value"}, the \code{epi_df}'s \code{other_keys} +metadata, and \code{"time_value"}, respectively; if you provide these manually, +they must match the defaults. (This behavior is to enable consistent and +sane results when you can't guarantee whether \code{x} is an \code{epi_df} or just a +\code{tibble}/\code{data.frame}. You don't need to use it if you know that \code{x} is +definitely an \code{epi_df}.) 
Not accepted when \code{x} is a \code{tsibble} or an
+\code{epi_archive}.}
}
\value{
character vector
diff --git a/man/lag_within_x_latest.Rd b/man/lag_within_x_latest.Rd
new file mode 100644
index 000000000..9c90fd8c3
--- /dev/null
+++ b/man/lag_within_x_latest.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/revision_analysis.R
+\name{lag_within_x_latest}
+\alias{lag_within_x_latest}
+\title{pull the value from lags when values starts staying within prop of its latest value}
+\usage{
+lag_within_x_latest(lags, values, prop = 0.2)
+}
+\arguments{
+\item{lags}{vector of lags; should be sorted}
+
+\item{values}{this should be a vector (e.g., a column) with length matching that of \code{lags}}
+
+\item{prop}{optional length-1 double; tolerance, as a proportion of the latest value}
+}
+\description{
+pull the value from lags when values starts staying within prop of its latest value.
+}
+\keyword{internal}
diff --git a/man/n_steps_to_time_delta.Rd b/man/n_steps_to_time_delta.Rd
new file mode 100644
index 000000000..6a4763464
--- /dev/null
+++ b/man/n_steps_to_time_delta.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/time-utils.R
+\name{n_steps_to_time_delta}
+\alias{n_steps_to_time_delta}
+\title{Convert from integerish/infinite/mix to time_delta}
+\usage{
+n_steps_to_time_delta(n_steps, time_type, format = c("friendly", "fast"))
+}
+\arguments{
+\item{n_steps}{integerish vector that can mix in infinite values}
+
+\item{time_type}{as in \code{\link{validate_slide_window_arg}}}
+
+\item{format}{optional; \code{"friendly"} to output a more descriptive/friendly
+class like \code{"difftime"} when possible; \code{"fast"} to output a class that's
+generally faster to work with when possible, like a vanilla \code{"numeric"}.
+Default is \code{"friendly"}.}
+}
+\description{
+Convert from integerish/infinite/mix to time_delta
+}
+\keyword{internal}
diff --git a/man/revision_summary.Rd b/man/revision_summary.Rd
index 39a72b9a3..315161ade 100644
--- a/man/revision_summary.Rd
+++ b/man/revision_summary.Rd
@@ -9,9 +9,11 @@ revision_summary(
  ...,
  drop_nas = TRUE,
  print_inform = TRUE,
-  min_waiting_period = as.difftime(60, units = "days"),
+  min_waiting_period = as.difftime(60, units = "days") \%>\%
+    difftime_approx_ceiling_time_delta(epi_arch$time_type),
  within_latest = 0.2,
-  quick_revision = as.difftime(3, units = "days"),
+  quick_revision = as.difftime(3, units = "days") \%>\%
+    difftime_approx_ceiling_time_delta(epi_arch$time_type),
  few_revisions = 3,
  abs_spread_threshold = NULL,
  rel_spread_threshold = 0.1,
@@ -23,26 +25,28 @@ revision_summary(
\item{epi_arch}{an epi_archive to be analyzed}

\item{...}{<\code{\link[=dplyr_tidy_select]{tidyselect}}>, used to choose the column to
-summarize. If empty, it chooses the first. Currently only implemented for
-one column at a time.}
+summarize. If empty and there is only one value/measurement column (i.e.,
+not in \code{\link{key_colnames}}) in the archive, it will automatically select it.
+If supplied, \code{...} must select exactly one column.}

\item{drop_nas}{bool, drop any \code{NA} values from the archive? 
After dropping
-\code{NA}'s compactify is run again to make sure there are no duplicate values
-from occasions when the signal is revised to \code{NA}, and then back to its
-immediately-preceding value.}
+\code{NA}'s compactify is run again if \code{should_compactify} is \code{TRUE} to make
+sure there are no duplicate values from occasions when the signal is
+revised to \code{NA}, and then back to its immediately-preceding value.}

\item{print_inform}{bool, determines whether to print summary information, or
only return the full summary tibble}

\item{min_waiting_period}{\code{difftime}, integer or \code{NULL}. Sets a cutoff: any
-time_values not earlier than \code{min_waiting_period} before \code{versions_end} are
-removed. \code{min_waiting_period} should characterize the typical time during
-which revisions occur. The default of 60 days corresponds to a typical
-final value for case counts as reported in the context of insurance. To
-avoid this filtering, either set to \code{NULL} or 0.}
+time_values that have not had at least \code{min_waiting_period} to stabilize as
+of the \code{versions_end} are removed. \code{min_waiting_period} should characterize
+the typical time during which most significant revisions occur. The default
+of 60 days corresponds to a typical near-final value for case counts as
+reported in the context of insurance. To avoid this filtering, either set
+to \code{NULL} or 0.}

\item{within_latest}{double between 0 and 1. Determines the threshold
-used for the \code{time_to}}
+used for the \code{lag_to}}

\item{quick_revision}{difftime or integer (integer is treated as days), for
the printed summary, the amount of time between the final revision and the
@@ -52,20 +56,25 @@ days}

\item{few_revisions}{integer, for the printed summary, the upper bound on the
number of revisions to consider "few". Default is 3.}

-\item{abs_spread_threshold}{numeric, for the printed summary, the maximum
-spread used to characterize revisions which don't actually change very
-much. Default is 5\% of the maximum value in the dataset, but this is the
-most unit dependent of values, and likely needs to be chosen appropriate
-for the scale of the dataset.}
+\item{abs_spread_threshold}{length-1 numeric, for the printed summary, the
+maximum spread used to characterize revisions which don't actually change
+very much. Default is 5\% of the maximum value in the dataset, but this is
+the most unit-dependent of these values, and likely needs to be chosen
+appropriately for the scale of the dataset.}

-\item{rel_spread_threshold}{float between 0 and 1, for the printed summary,
-the relative spread fraction used to characterize revisions which don't
-actually change very much. Default is .1, or 10\% of the final value}
+\item{rel_spread_threshold}{length-1 double between 0 and 1, for the printed
+summary, the relative spread fraction used to characterize revisions which
+don't actually change very much. Default is .1, or 10\% of the final value.}

-\item{compactify_tol}{float, used if \code{drop_nas=TRUE}, it determines the
-threshold for when two floats are considered identical.}
+\item{compactify_tol}{length-1 double; used if \code{should_compactify} is \code{TRUE} to
+determine the threshold for when two doubles are considered identical.}

-\item{should_compactify}{bool. Compactify if \code{TRUE}.}
+\item{should_compactify}{bool. If \code{TRUE}, we will compactify after the signal
+requested in \code{...} has been selected on its own and after the \code{drop_nas} step. 
+This helps, for example, to give similar results when called on +\link[=epix_merge]{merged} and single-signal archives, since merged archives +record an update when any of the other signals change, not just the +requested signal. The default is \code{TRUE}.} } \description{ \code{revision_summary} removes all missing values (if requested), and then @@ -87,13 +96,22 @@ always excludes \code{NA} values) \item \code{rel_spread}: \code{spread} divided by the largest value (so it will always be less than 1). Note that this need not be the final value. It will be \code{NA} whenever \code{spread} is 0. -\item \code{time_near_latest}: the time taken for the revisions to settle to within +\item \code{lag_near_latest}: the time taken for the revisions to settle to within \code{within_latest} (default 20\%) of the final value and stay there. For example, consider the series (0, 20, 99, 150, 102, 100); then -\code{time_near_latest} is 5, since even though 99 is within 20\%, it is outside +\code{lag_near_latest} is 5, since even though 99 is within 20\%, it is outside the window afterwards at 150. } } +\details{ +Applies to \code{epi_archive}s with \code{time_type}s of \code{"day"}, \code{"week"}, +and \code{"yearmonth"}. It can also work with a \code{time_type} of \code{"integer"} if +the possible \code{time_values} are all consecutive integers; you will need to +manually specify the \code{min_waiting_period} and \code{quick_revision}, though. +Using a \code{time_type} of \code{"integer"} with week numbers like 202501 will +produce incorrect results for some calculations, since week numbering +contains jumps at year boundaries. +} \examples{ revision_example <- revision_summary(archive_cases_dv_subset, percent_cli) revision_example \%>\% arrange(desc(spread)) diff --git a/man/time_delta_standardize.Rd b/man/time_delta_standardize.Rd new file mode 100644 index 000000000..4ce0bd53d --- /dev/null +++ b/man/time_delta_standardize.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\name{time_delta_standardize} +\alias{time_delta_standardize} +\title{Standardize time_deltas to a multiple of \code{\link[=unit_time_delta]{unit_time_delta()}}} +\usage{ +time_delta_standardize(time_delta, time_type, format = c("friendly", "fast")) +} +\description{ +Standardize time_deltas to a multiple of \code{\link[=unit_time_delta]{unit_time_delta()}} +} +\keyword{internal} diff --git a/man/time_delta_summary.Rd b/man/time_delta_summary.Rd new file mode 100644 index 000000000..c6b57c9bb --- /dev/null +++ b/man/time_delta_summary.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/revision_analysis.R +\name{time_delta_summary} +\alias{time_delta_summary} +\title{Like \code{summary} but working across all "time deltas", including difftimes} +\usage{ +time_delta_summary(time_delta, time_type) +} +\description{ +Also standardizes units of difftimes to the natural unit for the given +\code{time_type} (via conversion to and from a corresponding number of time +steps). 
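A minimal sketch of the unit standardization described above; the function is internal, so the `epiprocess:::` access and the exact shape of the printed result are assumptions rather than documented API:

```r
# difftimes measured in days, summarized for weekly data; the units should
# come back re-expressed in the natural unit for the time_type ("weeks"):
x <- as.difftime(c(0, 7, 21), units = "days")
epiprocess:::time_delta_summary(x, time_type = "week")
# expected to report min/median/mean/max of roughly 0, 1, ~1.3, and 3 weeks
```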
+}
+\keyword{internal}
diff --git a/man/time_delta_to_approx_difftime.Rd b/man/time_delta_to_approx_difftime.Rd
new file mode 100644
index 000000000..a726fd62b
--- /dev/null
+++ b/man/time_delta_to_approx_difftime.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/time-utils.R
+\name{time_delta_to_approx_difftime}
+\alias{time_delta_to_approx_difftime}
+\title{Convert \code{time_delta} to an approximate difftime}
+\usage{
+time_delta_to_approx_difftime(time_delta, time_type)
+}
+\description{
+\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
+}
+\details{
+This exists to assist in comparing \code{time_delta}s to default \code{difftime}
+thresholds when we want to reduce friction.
+
+It may be better to try to do something like make \code{time_delta} validation
+more accommodating (e.g., of difftimes with units of "days" when working on
+a weekly scale), and remain rigid on yearmonths. Applying deltas and comparing
+time_values might also be an approach but seems more fraught, as the least
+common denominator would be start/mid/end datetimes of time intervals, but
+those are also ambiguous (the starting and representative weekdays of weeks
+are unknown, and the time zones of dates are unknown).
+
+Another alternative approach, below, converts difftimes to time_deltas
+instead. It requires knowledge of which way to round in order to get
+time_deltas representing an integer number of time steps, but avoids some
+potential inconsistencies of the time-delta-to-difftime approach when we
+think about applying it to, e.g., months / spans of months with varying
+numbers of days, and also makes it easier to avoid "magical defaults".
+}
+\keyword{internal}
diff --git a/man/time_delta_to_n_steps.Rd b/man/time_delta_to_n_steps.Rd
index 937159195..d7858064b 100644
--- a/man/time_delta_to_n_steps.Rd
+++ b/man/time_delta_to_n_steps.Rd
@@ -1,5 +1,5 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/utils.R
+% Please edit documentation in R/time-utils.R
\name{time_delta_to_n_steps}
\alias{time_delta_to_n_steps}
\title{Convert a time delta to an integerish number of "unit" steps between time values}
diff --git a/man/time_minus_time_in_n_steps.Rd b/man/time_minus_time_in_n_steps.Rd
new file mode 100644
index 000000000..aab030dea
--- /dev/null
+++ b/man/time_minus_time_in_n_steps.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/time-utils.R
+\name{time_minus_time_in_n_steps}
+\alias{time_minus_time_in_n_steps}
+\title{Difference between two time value vectors in terms of number of time "steps"}
+\usage{
+time_minus_time_in_n_steps(x, y, time_type)
+}
+\arguments{
+\item{x}{a time_value (vector) of time type \code{time_type}}
+
+\item{y}{a time_value (vector) of time type \code{time_type}}
+
+\item{time_type}{as in \code{\link[=validate_slide_window_arg]{validate_slide_window_arg()}}}
+}
+\value{
+integerish vector such that \code{x + n_steps_to_time_delta_fast(result)}
+should equal \code{y}. 
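A quick sketch of this contract, assuming `epiprocess:::` access to the internal function (the sign convention is whichever satisfies the value description above):

```r
x <- as.Date("2020-01-01")
y <- as.Date("2020-01-15")
# For weekly data, x and y are two week-steps apart:
epiprocess:::time_minus_time_in_n_steps(x, y, "week")
# an integerish result of magnitude 2, per the contract above
```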
+} +\description{ +Difference between two time value vectors in terms of number of time "steps" +} +\keyword{internal} diff --git a/man/time_plus_n_steps.Rd b/man/time_plus_n_steps.Rd new file mode 100644 index 000000000..f7071c132 --- /dev/null +++ b/man/time_plus_n_steps.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\name{time_plus_n_steps} +\alias{time_plus_n_steps} +\alias{time_minus_n_steps} +\title{Advance/retreat time_values by specified number of time "steps"} +\usage{ +time_plus_n_steps(x, y, time_type) + +time_minus_n_steps(x, y, time_type) +} +\arguments{ +\item{x}{a time_value (vector) of time type \code{time_type}} + +\item{y}{integerish (vector)} + +\item{time_type}{as in \code{\link[=validate_slide_window_arg]{validate_slide_window_arg()}}} +} +\value{ +a time_value (vector) of time type \code{time_type} +} +\description{ +Here, a "step" is based on the \code{time_type}, not just the class of \code{x}. +} +\keyword{internal} diff --git a/man/time_type_unit_abbr.Rd b/man/time_type_unit_abbr.Rd new file mode 100644 index 000000000..4e2971500 --- /dev/null +++ b/man/time_type_unit_abbr.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\name{time_type_unit_abbr} +\alias{time_type_unit_abbr} +\title{Get an abbreviation for the "units" of \code{unit_time_delta(time_type)}} +\usage{ +time_type_unit_abbr(time_type) +} +\arguments{ +\item{time_type}{str} +} +\value{ +str +} +\description{ +For use in formatting or automatically naming things based on +\code{time_delta_to_n_steps(time_delta)} for a \code{time_delta} between times of time +type \code{time_type}. +} +\keyword{internal} diff --git a/man/time_type_unit_abbrs.Rd b/man/time_type_unit_abbrs.Rd new file mode 100644 index 000000000..d6b93ec1b --- /dev/null +++ b/man/time_type_unit_abbrs.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\docType{data} +\name{time_type_unit_abbrs} +\alias{time_type_unit_abbrs} +\title{Helper data for \code{\link{time_type_unit_abbr}}} +\format{ +An object of class \code{character} of length 3. +} +\usage{ +time_type_unit_abbrs +} +\description{ +Helper data for \code{\link{time_type_unit_abbr}} +} +\keyword{internal} diff --git a/man/time_type_unit_pluralizer.Rd b/man/time_type_unit_pluralizer.Rd new file mode 100644 index 000000000..a815e483f --- /dev/null +++ b/man/time_type_unit_pluralizer.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\docType{data} +\name{time_type_unit_pluralizer} +\alias{time_type_unit_pluralizer} +\title{Helper data for \code{\link{format_time_delta}}} +\format{ +An object of class \code{character} of length 4. +} +\usage{ +time_type_unit_pluralizer +} +\description{ +Should not be altered on the basis of untrusted user input, as it is used as +a cli format string and may run code. 
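For context on why this warning matters, cli format strings like the ones this object holds are evaluated with `cli::pluralize()`, which interpolates like `glue()`; a minimal sketch (the exact strings stored in `time_type_unit_pluralizer` are an assumption):

```r
library(cli)
n <- 1
pluralize("{n} day{?s}")   # "1 day"
n <- 2
pluralize("{n} month{?s}") # "2 months"
# Because interpolation evaluates R code, a template built from untrusted
# input could run arbitrary code -- hence the warning above.
```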
+} +\keyword{internal} diff --git a/man/time_within_x_latest.Rd b/man/time_within_x_latest.Rd deleted file mode 100644 index 1dd7e8010..000000000 --- a/man/time_within_x_latest.Rd +++ /dev/null @@ -1,15 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/revision_analysis.R -\name{time_within_x_latest} -\alias{time_within_x_latest} -\title{pull the value from lags when values starts indefinitely being within prop of it's last value.} -\usage{ -time_within_x_latest(lags, values, prop = 0.2) -} -\arguments{ -\item{values}{this should be a 1 column tibble. errors may occur otherwise} -} -\description{ -pull the value from lags when values starts indefinitely being within prop of it's last value. -} -\keyword{internal} diff --git a/man/unit_time_delta.Rd b/man/unit_time_delta.Rd index 46b3c48d6..ec63d2558 100644 --- a/man/unit_time_delta.Rd +++ b/man/unit_time_delta.Rd @@ -1,20 +1,29 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/time-utils.R \name{unit_time_delta} \alias{unit_time_delta} \title{Object that, added to time_values of time_type, advances by one time step/interval} \usage{ -unit_time_delta(time_type) +unit_time_delta(time_type, format = c("friendly", "fast")) } \arguments{ \item{time_type}{string; \code{epi_df}'s or \code{epi_archive}'s \code{time_type}} + +\item{format}{"friendly" or "fast"; for some time_types, there are multiple +ways to represent time_deltas. "friendly" tries to output a format that +will be more informative when printed, and produce errors in more cases +when used in unexpected ways. "fast" tries to output a time_delta that will +be faster in downstream operations.} } \value{ an object \code{u} such that \code{time_values + u} represents advancing by one time step / moving to the subsequent time interval for any \code{time_values} object of time type \code{time_type}, and such that \code{time_values + k * u} for integerish vector \code{k} advances by \code{k} steps (with vectorization, -recycling). +recycling). At time of writing, these objects also all support +multiplication by nonintegerish numeric vectors, \code{mean}, and \code{median}, +which are useful for summarizing vector time_deltas, but these fractional +time_deltas are not allowed in time_delta-specific operations. 
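A brief sketch of the advertised arithmetic, assuming `epiprocess:::` access to the internal function:

```r
# Advancing by k steps via multiplication:
as.Date("2020-01-01") + 3 * epiprocess:::unit_time_delta("day")
#> [1] "2020-01-04"
tsibble::make_yearmonth(2020, 1) + 2 * epiprocess:::unit_time_delta("yearmonth")
#> 2020 Mar
# Fractional multiples can arise when summarizing vector time_deltas:
mean(c(1, 2) * epiprocess:::unit_time_delta("week")) # a difftime of 1.5 weeks
```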
} \description{ Object that, added to time_values of time_type, advances by one time step/interval diff --git a/man/validate_slide_window_arg.Rd b/man/validate_slide_window_arg.Rd new file mode 100644 index 000000000..13e79fb67 --- /dev/null +++ b/man/validate_slide_window_arg.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time-utils.R +\name{validate_slide_window_arg} +\alias{validate_slide_window_arg} +\title{Validate \code{.before} or \code{.window_size} argument} +\usage{ +validate_slide_window_arg( + arg, + time_type, + lower = 1, + allow_inf = TRUE, + arg_name = rlang::caller_arg(arg) +) +} +\description{ +Validate \code{.before} or \code{.window_size} argument +} +\keyword{internal} diff --git a/tests/testthat/_snaps/revision-latency-functions.md b/tests/testthat/_snaps/revision-latency-functions.md index 1ac214691..4f2bfe269 100644 --- a/tests/testthat/_snaps/revision-latency-functions.md +++ b/tests/testthat/_snaps/revision-latency-functions.md @@ -1,4 +1,4 @@ -# revision_summary works for a dummy dataset +# revision_summary works for dummy datasets Code dummy_ex %>% revision_summary() %>% print(n = 10, width = 300) @@ -20,20 +20,20 @@ * 1 out of 4 (25%) Spread of more than 5.1 in actual value (when revised): * 3 out of 4 (75%) - days until within 20% of the latest value: + Days until within 20% of the latest value: Output min median mean max 0 days 3 days 6.9 days 19 days # A tibble: 7 x 11 - time_value geo_value n_revisions min_lag max_lag time_near_latest spread - - 1 2020-01-01 ak 4 2 days 19 days 19 days 101 - 2 2020-01-02 ak 1 4 days 5 days 4 days 9 - 3 2020-01-03 ak 0 3 days 3 days 3 days 0 - 4 2020-01-01 al 1 0 days 19 days 19 days 99 - 5 2020-01-02 al 0 0 days 0 days 0 days 0 - 6 2020-01-03 al 1 1 days 2 days 2 days 3 - 7 2020-01-04 al 0 1 days 1 days 1 days 0 + time_value geo_value n_revisions min_lag max_lag lag_near_latest spread + + 1 2020-01-01 ak 4 2 days 19 days 19 days 101 + 2 2020-01-02 ak 1 4 days 5 days 4 days 9 + 3 2020-01-03 ak 0 3 days 3 days 3 days 0 + 4 2020-01-01 al 1 0 days 19 days 19 days 99 + 5 2020-01-02 al 0 0 days 0 days 0 days 0 + 6 2020-01-03 al 1 1 days 2 days 2 days 3 + 7 2020-01-04 al 0 1 days 1 days 1 days 0 rel_spread min_value max_value median_value 1 0.990 1 102 6 @@ -68,20 +68,166 @@ * 2 out of 5 (40%) Spread of more than 5.1 in actual value (when revised): * 3 out of 5 (60%) - days until within 20% of the latest value: + Days until within 20% of the latest value: Output min median mean max 0 days 3 days 6.9 days 19 days # A tibble: 7 x 11 - time_value geo_value n_revisions min_lag max_lag time_near_latest spread - - 1 2020-01-01 ak 6 2 days 19 days 19 days 101 - 2 2020-01-02 ak 1 4 days 5 days 4 days 9 - 3 2020-01-03 ak 0 3 days 3 days 3 days 0 - 4 2020-01-01 al 1 0 days 19 days 19 days 99 - 5 2020-01-02 al 0 0 days 0 days 0 days 0 - 6 2020-01-03 al 1 1 days 2 days 2 days 3 - 7 2020-01-04 al 1 0 days 1 days 1 days 0 + time_value geo_value n_revisions min_lag max_lag lag_near_latest spread + + 1 2020-01-01 ak 6 2 days 19 days 19 days 101 + 2 2020-01-02 ak 1 4 days 5 days 4 days 9 + 3 2020-01-03 ak 0 3 days 3 days 3 days 0 + 4 2020-01-01 al 1 0 days 19 days 19 days 99 + 5 2020-01-02 al 0 0 days 0 days 0 days 0 + 6 2020-01-03 al 1 1 days 2 days 2 days 3 + 7 2020-01-04 al 1 0 days 1 days 1 days 0 + rel_spread min_value max_value median_value + + 1 0.990 1 102 5.5 + 2 0.09 91 100 95.5 + 3 NaN 0 0 0 + 4 0.99 1 100 50.5 + 5 0 1 1 1 + 6 0.75 1 4 2.5 + 7 0 9 9 9 + +--- + + Code + dummy_ex_weekly 
%>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300) + Message + Min lag (time to first version): + Output + min median mean max + 0 weeks 1 weeks 1.4 weeks 4 weeks + Message + Fraction of all versions that are `NA`: + * 2 out of 19 (10.53%) + Fraction of epi_key+time_values with + No revisions: + * 2 out of 7 (28.57%) + Quick revisions (last revision within 1 week of the `time_value`): + * 2 out of 7 (28.57%) + Few revisions (At most 3 revisions for that `time_value`): + * 6 out of 7 (85.71%) + Fraction of revised epi_key+time_values which have: + Less than 0.1 spread in relative value: + * 2 out of 5 (40%) + Spread of more than 5.1 in actual value (when revised): + * 3 out of 5 (60%) + Weeks until within 20% of the latest value: + Output + min median mean max + 0 weeks 3 weeks 6.9 weeks 19 weeks + # A tibble: 7 x 11 + time_value geo_value n_revisions min_lag max_lag lag_near_latest spread + + 1 2020-01-01 ak 6 2 weeks 19 weeks 19 weeks 101 + 2 2020-01-08 ak 1 4 weeks 5 weeks 4 weeks 9 + 3 2020-01-15 ak 0 3 weeks 3 weeks 3 weeks 0 + 4 2020-01-01 al 1 0 weeks 19 weeks 19 weeks 99 + 5 2020-01-08 al 0 0 weeks 0 weeks 0 weeks 0 + 6 2020-01-15 al 1 1 weeks 2 weeks 2 weeks 3 + 7 2020-01-22 al 1 0 weeks 1 weeks 1 weeks 0 + rel_spread min_value max_value median_value + + 1 0.990 1 102 5.5 + 2 0.09 91 100 95.5 + 3 NaN 0 0 0 + 4 0.99 1 100 50.5 + 5 0 1 1 1 + 6 0.75 1 4 2.5 + 7 0 9 9 9 + +--- + + Code + dummy_ex_yearmonthly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, + width = 300) + Message + Min lag (time to first version): + Output + min median mean max + 0 1 1.4 4 + Message + Fraction of all versions that are `NA`: + * 2 out of 19 (10.53%) + Fraction of epi_key+time_values with + No revisions: + * 2 out of 7 (28.57%) + Quick revisions (last revision within 1 month of the `time_value`): + * 2 out of 7 (28.57%) + Few revisions (At most 3 revisions for that `time_value`): + * 6 out of 7 (85.71%) + Fraction of revised epi_key+time_values which have: + Less than 0.1 spread in relative value: + * 2 out of 5 (40%) + Spread of more than 5.1 in actual value (when revised): + * 3 out of 5 (60%) + Months until within 20% of the latest value: + Output + min median mean max + 0 3 6.9 19 + # A tibble: 7 x 11 + time_value geo_value n_revisions min_lag max_lag lag_near_latest spread + + 1 2020 Jan ak 6 2 19 19 101 + 2 2020 Feb ak 1 4 5 4 9 + 3 2020 Mar ak 0 3 3 3 0 + 4 2020 Jan al 1 0 19 19 99 + 5 2020 Feb al 0 0 0 0 0 + 6 2020 Mar al 1 1 2 2 3 + 7 2020 Apr al 1 0 1 1 0 + rel_spread min_value max_value median_value + + 1 0.990 1 102 5.5 + 2 0.09 91 100 95.5 + 3 NaN 0 0 0 + 4 0.99 1 100 50.5 + 5 0 1 1 1 + 6 0.75 1 4 2.5 + 7 0 9 9 9 + +--- + + Code + dummy_ex_integerly %>% revision_summary(min_waiting_period = 60, + quick_revision = 3, drop_nas = FALSE) %>% print(n = 10, width = 300) + Message + Min lag (time to first version): + Output + min median mean max + 0 1 1.4 4 + Message + Fraction of all versions that are `NA`: + * 2 out of 19 (10.53%) + Fraction of epi_key+time_values with + No revisions: + * 2 out of 7 (28.57%) + Quick revisions (last revision within 3 time steps of the `time_value`): + * 4 out of 7 (57.14%) + Few revisions (At most 3 revisions for that `time_value`): + * 6 out of 7 (85.71%) + Fraction of revised epi_key+time_values which have: + Less than 0.1 spread in relative value: + * 2 out of 5 (40%) + Spread of more than 5.1 in actual value (when revised): + * 3 out of 5 (60%) + Time Steps until within 20% of the latest value: + Output + min median mean max + 0 3 
6.9 19 + # A tibble: 7 x 11 + time_value geo_value n_revisions min_lag max_lag lag_near_latest spread + + 1 1 ak 6 2 19 19 101 + 2 2 ak 1 4 5 4 9 + 3 3 ak 0 3 3 3 0 + 4 1 al 1 0 19 19 99 + 5 2 al 0 0 0 0 0 + 6 3 al 1 1 2 2 3 + 7 4 al 1 0 1 1 0 rel_spread min_value max_value median_value 1 0.990 1 102 5.5 diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index d05fe0b33..e930cdc4d 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -105,3 +105,35 @@ test_that("compactify does not alter the default clobberable and observed versio expect_identical(ea_true$clobberable_versions_start, ea_false$clobberable_versions_start) expect_identical(ea_true$versions_end, ea_false$versions_end) }) + +test_that("compactify works on distributions", { + forecasts <- tibble( + ahead = 2L, + geo_value = "ak", + target_end_date = as.Date("2020-01-19"), + forecast_date = as.Date("2020-01-17") + 1:8, + actual = 25, + .pred_distn = c( + epipredict::dist_quantiles(c(1, 5, 9), c(0.1, 0.5, 0.9)), + epipredict::dist_quantiles(c(1, NA, 9), c(0.1, 0.5, 0.9)), # single NA in quantiles + epipredict::dist_quantiles(c(NA, NA, NA), c(0.1, 0.5, 0.9)), # all NAs in quantiles + distributional::dist_missing(1), # the actual `NA` for distributions + epipredict::dist_quantiles(c(1, 5, 9), c(0.1, 0.5, 0.9)), # and back + epipredict::dist_quantiles(c(3, 5, 9), c(0.1, 0.5, 0.9)), # change quantile + epipredict::dist_quantiles(c(3, 5, 9), c(0.2, 0.5, 0.8)), # change level + epipredict::dist_quantiles(c(3, 5, 9), c(0.2, 0.5, 0.8)) # LOCF + ) + ) + expect_equal( + forecasts %>% + as_epi_archive( + other_keys = "ahead", time_value = target_end_date, version = forecast_date, + compactify = TRUE + ) %>% + .$DT %>% + as.data.frame() %>% + as_tibble(), + forecasts[-8, ] %>% + rename(time_value = target_end_date, version = forecast_date) + ) +}) diff --git a/tests/testthat/test-key_colnames.R b/tests/testthat/test-key_colnames.R index 0196cea24..8a2f20cd9 100644 --- a/tests/testthat/test-key_colnames.R +++ b/tests/testthat/test-key_colnames.R @@ -37,6 +37,8 @@ test_that("`key_colnames` on non-`epi_df`-like tibbles works as expected", { }) test_that("`key_colnames` on `epi_df`s and similar tibbles works as expected", { + withr::local_options(list(lifecycle_verbosity = "warning")) # for extra_keys tests + gat_tbl <- tibble::tibble(geo_value = 1, age_group = 1, time_value = 1) gat_edf <- as_epi_df(gat_tbl, other_keys = "age_group", as_of = 2) @@ -73,16 +75,6 @@ test_that("`key_colnames` on `epi_df`s and similar tibbles works as expected", { class = "epiprocess__key_colnames__mismatched_time_keys" ) - # For either class, `extra_keys` is not accepted: - expect_error( - key_colnames(gat_tbl, extra_keys = "age_group"), - class = "rlib_error_dots_nonempty" - ) - expect_error( - key_colnames(gat_edf, extra_keys = "age_group"), - class = "rlib_error_dots_nonempty" - ) - # We can exclude keys: expect_equal( key_colnames(gat_tbl, other_keys = "age_group", exclude = c("time_value")), @@ -100,6 +92,20 @@ test_that("`key_colnames` on `epi_df`s and similar tibbles works as expected", { key_colnames(gat_edf, exclude = c("geo_value", "time_value")), c("age_group") ) + + # Using `extra_keys =` is soft-deprecated and routes to `other_keys =`: + expect_warning( + gat_tbl_extra_keys_res <- key_colnames(gat_tbl, extra_keys = "age_group"), + class = "lifecycle_warning_deprecated" + ) + expect_equal(gat_tbl_extra_keys_res, c("geo_value", "age_group", "time_value")) + + expect_warning( + 
gat_edf_extra_keys_exclude_res <- + key_colnames(gat_edf, extra_keys = "age_group", exclude = c("geo_value", "time_value")), + class = "lifecycle_warning_deprecated" + ) + expect_equal(gat_edf_extra_keys_exclude_res, c("age_group")) }) test_that("`key_colnames` on tsibbles works as expected", { diff --git a/tests/testthat/test-revision-latency-functions.R b/tests/testthat/test-revision-latency-functions.R index ff7220684..b636bf32e 100644 --- a/tests/testthat/test-revision-latency-functions.R +++ b/tests/testthat/test-revision-latency-functions.R @@ -1,17 +1,5 @@ dummy_ex <- tibble::tribble( ~geo_value, ~time_value, ~version, ~value, - # al 1 has 1 real revision, a lag of 0, and changes by 99 - "al", as.Date("2020-01-01"), as.Date("2020-01-01"), 1, - "al", as.Date("2020-01-01"), as.Date("2020-01-10"), 1, - "al", as.Date("2020-01-01"), as.Date("2020-01-20"), 100, - # al 2 has no revision, a min lag of 0, and a rel_spread of 0 - "al", as.Date("2020-01-02"), as.Date("2020-01-02"), 1, - # al 3 has 1 revision and a min lag of 1, and a change of 3 - "al", as.Date("2020-01-03"), as.Date("2020-01-04"), 1, - "al", as.Date("2020-01-03"), as.Date("2020-01-05"), 4, - # al 4 has 1 revision including NA's none if not, a lag of 0/1 and changes of 0 - "al", as.Date("2020-01-04"), as.Date("2020-01-04"), NA, - "al", as.Date("2020-01-04"), as.Date("2020-01-05"), 9, # ak 1 has 4 revisions w/out NAs, but 6 with NAs # a min lag of 2, and a change of 101 "ak", as.Date("2020-01-01"), as.Date("2020-01-03"), 1, @@ -27,15 +15,164 @@ dummy_ex <- tibble::tribble( # ak 3 has 0 revisions, and a value of zero, and thus a rel_spread of NaN "ak", as.Date("2020-01-03"), as.Date("2020-01-06"), 0, "ak", as.Date("2020-01-03"), as.Date("2020-01-07"), 0, + # al 1 has 1 real revision, a lag of 0, and changes by 99 + "al", as.Date("2020-01-01"), as.Date("2020-01-01"), 1, + "al", as.Date("2020-01-01"), as.Date("2020-01-10"), 1, + "al", as.Date("2020-01-01"), as.Date("2020-01-20"), 100, + # al 2 has no revision, a min lag of 0, and a rel_spread of 0 + "al", as.Date("2020-01-02"), as.Date("2020-01-02"), 1, + # al 3 has 1 revision and a min lag of 1, and a change of 3 + "al", as.Date("2020-01-03"), as.Date("2020-01-04"), 1, + "al", as.Date("2020-01-03"), as.Date("2020-01-05"), 4, + # al 4 has 1 revision including NA's none if not, a lag of 0/1 and changes of 0 + "al", as.Date("2020-01-04"), as.Date("2020-01-04"), NA, + "al", as.Date("2020-01-04"), as.Date("2020-01-05"), 9, ) %>% as_epi_archive(versions_end = as.Date("2022-01-01"), compactify = FALSE) -test_that("revision_summary works for a dummy dataset", { +dummy_ex_weekly <- dummy_ex$DT %>% + mutate(across( + c(time_value, version), + ~ as.Date("2020-01-01") + 7 * as.numeric(.x - as.Date("2020-01-01")) + )) %>% + as_epi_archive( + versions_end = as.Date("2022-01-01") + as.numeric(as.Date("2022-01-01") - as.Date("2020-01-01")) %% 7, + compactify = FALSE + ) +stopifnot(dummy_ex_weekly$time_type == "week") + +dummy_ex_yearmonthly <- dummy_ex$DT %>% + mutate(across( + c(time_value, version), + ~ tsibble::make_yearmonth(2020, 1) + as.numeric(.x - as.Date("2020-01-01")) + )) %>% + as_epi_archive( + versions_end = tsibble::make_yearmonth(2020, 1) + as.numeric(as.Date("2022-01-01") - as.Date("2020-01-01")), + compactify = FALSE + ) +stopifnot(dummy_ex_yearmonthly$time_type == "yearmonth") + +dummy_ex_integerly <- dummy_ex$DT %>% + mutate(across( + c(time_value, version), + ~ 1 + as.numeric(.x - as.Date("2020-01-01")) + )) %>% + as_epi_archive( + versions_end = 1 + 
as.numeric(as.Date("2022-01-01") - as.Date("2020-01-01")), + compactify = FALSE + ) +stopifnot(dummy_ex_integerly$time_type == "integer") + +test_that("revision_summary works for dummy datasets", { expect_snapshot(dummy_ex %>% revision_summary() %>% print(n = 10, width = 300)) expect_snapshot(dummy_ex %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300)) + + # Weekly dummy is mostly just "day" -> "week", but quick-revision summary changes: + expect_snapshot(dummy_ex_weekly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300)) + # Yearmonthly has the same story. It would have been close to encountering + # min_waiting_period-based filtering but we actually set its versions_end to + # sometime in 2080 rather than 2022: + expect_snapshot(dummy_ex_yearmonthly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300)) + # Integer is very much like daily. We have to provide some of the + # configuration arguments since we have no idea about what the integers + # represent. If the possible integers being used have large jumps like + # YYYYww-as-integer epiweek labels (e.g., 200053 jumps to 200101) or are + # regularly spaced apart but by more than 1, we'll still be producing + # something nonsensical, but we tried. + expect_snapshot(dummy_ex_integerly %>% + revision_summary( + min_waiting_period = 60, quick_revision = 3, + drop_nas = FALSE + ) %>% + print(n = 10, width = 300)) }) + test_that("tidyselect is functional", { - expect_no_error(revision_summary(dummy_ex, value)) - expect_no_error(revision_summary(dummy_ex, starts_with("val"))) + expect_no_error(quiet(revision_summary(dummy_ex, value))) + expect_no_error(quiet(revision_summary(dummy_ex, starts_with("val")))) + # column order shouldn't matter + with_later_key_col <- dummy_ex$DT %>% + select(geo_value, time_value, value, version) %>% + as_epi_archive(versions_end = dummy_ex$versions_end, compactify = FALSE) + expect_equal( + quiet(revision_summary(with_later_key_col)), + quiet(revision_summary(dummy_ex)) + ) + # extra column shouldn't interfere + with_later_val_col <- dummy_ex$DT %>% + mutate(value2 = 0) %>% + as_epi_archive(versions_end = dummy_ex$versions_end, compactify = FALSE) + expect_equal( + quiet(revision_summary(with_later_val_col, value)), + quiet(revision_summary(dummy_ex, value)) + ) + # error when which column we're summarizing is ambiguous + expect_error( + dummy_ex$DT %>% + copy() %>% + mutate(value2 = value) %>% + as_epi_archive( + versions_end = dummy_ex$versions_end, + compactify = FALSE + ) %>% + revision_summary(), + class = "epiprocess__revision_summary_cannot_determine_default_selection" + ) + expect_error(revision_summary(with_later_val_col, !everything()), + class = "epiprocess__revision_summary__selected_zero_columns" + ) +}) + +test_that("revision_summary default min_waiting_period works as expected", { + # just outside the window for daily data + expect_equal( + tibble( + geo_value = 1, + time_value = as.Date("2020-01-01") + 0:1, + version = time_value + 1, + value = 1:2 + ) %>% + as_epi_archive(versions_end = as.Date("2020-01-01") + 1 + 59) %>% + revision_summary(print_inform = FALSE) %>% + pull(time_value), + as.Date("2020-01-01") + ) + # just outside the window for weekly data + expect_equal( + tibble( + geo_value = 1, + time_value = as.Date("2020-01-01") + 7 * (0:1), + version = time_value + 35, + value = 1:2 + ) %>% + as_epi_archive(versions_end = as.Date("2020-01-01") + 7 + 56) %>% + revision_summary(print_inform = FALSE) %>% + pull(time_value), + 
as.Date("2020-01-01") + ) + # just outside the window for monthly data + expect_equal( + tibble( + geo_value = 1, + time_value = tsibble::make_yearmonth(2000, 1:2), + version = time_value + 1, + value = 1:2 + ) %>% + as_epi_archive(versions_end = tsibble::make_yearmonth(2000, 3)) %>% + revision_summary(print_inform = FALSE) %>% + pull(time_value), + tsibble::make_yearmonth(2000, 1) + ) + # we don't know how to interpret the default in terms of "integer" time_type + expect_error( + tibble( + geo_value = 1, + time_value = 1:2 + 0, + version = time_value + 1, + value = 1:2 + ) %>% + as_epi_archive(versions_end = 1 + 1 + 59) %>% + revision_summary(print_inform = FALSE), + regexp = "Unsupported time_type" + ) }) -test_that("revision_summary works for various timetypes", {}) diff --git a/tests/testthat/test-time-utils.R b/tests/testthat/test-time-utils.R new file mode 100644 index 000000000..7a8aab506 --- /dev/null +++ b/tests/testthat/test-time-utils.R @@ -0,0 +1,259 @@ +test_that("guess_period works", { + # Error cases: + expect_error(guess_period(numeric(0L)), class = "epiprocess__guess_period__not_enough_times") + expect_error(guess_period(c(1)), class = "epiprocess__guess_period__not_enough_times") + # Different numeric classes and cases: + expect_identical(guess_period(c(1, 8)), 7) + expect_identical(guess_period(c(1, 8, 15)), 7) + expect_identical(guess_period(c(1L, 8L, 15L)), 7L) + expect_identical(guess_period(c(0, 7, 14, 15)), 1) + # We currently allow the guessed frequency to not appear in the diffs, but + # this might not be a good idea as it likely indicates an issue with the data + # (#485). + expect_identical(guess_period(c(0, 2, 5)), 1) + expect_identical(guess_period(c(0, 4, 10)), 2) + # On Dates: + daily_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "day") + weekly_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "week") + expect_identical( + daily_dates[[1L]] + guess_period(daily_dates) * (seq_along(daily_dates) - 1L), + daily_dates + ) + expect_identical( + weekly_dates[[1L]] + guess_period(weekly_dates) * (seq_along(weekly_dates) - 1L), + weekly_dates + ) + # On POSIXcts: + daily_posixcts <- as.POSIXct(daily_dates, tz = "US/Aleutian") + 3600 + weekly_posixcts <- as.POSIXct(weekly_dates, tz = "US/Aleutian") + 3600 + expect_identical( + daily_posixcts[[1L]] + guess_period(daily_posixcts) * (seq_along(daily_posixcts) - 1L), + daily_posixcts + ) + expect_identical( + weekly_posixcts[[1L]] + guess_period(weekly_posixcts) * (seq_along(weekly_posixcts) - 1L), + weekly_posixcts + ) + # On POSIXlts: + daily_posixlts <- as.POSIXlt(daily_dates, tz = "UTC") + 3600 + weekly_posixlts <- as.POSIXlt(weekly_dates, tz = "UTC") + 3600 + expect_identical( + daily_posixlts[[1L]] + guess_period(daily_posixlts) * (seq_along(daily_posixlts) - 1L), + daily_posixlts + ) + expect_identical( + weekly_posixlts[[1L]] + guess_period(weekly_posixlts) * (seq_along(weekly_posixlts) - 1L), + weekly_posixlts + ) +}) + +test_that("validate_slide_window_arg works", { + for (time_type in c("day", "week", "integer", "yearmonth")) { + expect_no_error(validate_slide_window_arg(Inf, time_type)) + } + expect_no_error(validate_slide_window_arg(as.difftime(1, units = "days"), "day")) + expect_no_error(validate_slide_window_arg(1, "day")) + expect_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "day"), + class = "epiprocess__validate_slide_window_arg" + ) + expect_error(validate_slide_window_arg(as.difftime(1, units = "secs"), "day"), + class = 
"epiprocess__validate_slide_window_arg" + ) + + expect_no_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "week")) + expect_error(validate_slide_window_arg(1, "week"), + class = "epiprocess__validate_slide_window_arg" + ) + + expect_no_error(validate_slide_window_arg(1, "integer")) + expect_error(validate_slide_window_arg(as.difftime(1, units = "days"), "integer"), + class = "epiprocess__validate_slide_window_arg" + ) + expect_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "integer"), + class = "epiprocess__validate_slide_window_arg" + ) + + expect_no_error(validate_slide_window_arg(1, "yearmonth")) + expect_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "yearmonth"), + class = "epiprocess__validate_slide_window_arg" + ) +}) + +test_that("unit_time_delta works", { + for (format in c("friendly", "fast")) { + expect_equal( + as.Date("2020-01-01") + 5 * unit_time_delta("day", format = format), + as.Date("2020-01-06") + ) + expect_equal( + as.Date("2020-01-01") + 2 * unit_time_delta("week", format = format), + as.Date("2020-01-15") + ) + expect_equal( + tsibble::make_yearmonth(2000, 1) + 5 * unit_time_delta("yearmonth", format = format), + tsibble::make_yearmonth(2000, 6) + ) + expect_equal( + 1L + 5L * unit_time_delta("integer", format = format), + 6L + ) + # + expect_equal( + as.Date("2020-01-01") + + time_delta_to_n_steps(as.Date("2020-01-06") - as.Date("2020-01-01"), "day") * + unit_time_delta("day", format = format), + as.Date("2020-01-06") + ) + expect_equal( + as.Date("2020-01-01") + + time_delta_to_n_steps(as.integer(as.Date("2020-01-06") - as.Date("2020-01-01")), "day") * + unit_time_delta("day", format = format), + as.Date("2020-01-06") + ) + expect_equal( + as.Date("2020-01-01") + + time_delta_to_n_steps(as.Date("2020-01-15") - as.Date("2020-01-01"), "week") * + unit_time_delta("week", format = format), + as.Date("2020-01-15") + ) + expect_equal( + as.Date("2020-01-01") + + time_delta_to_n_steps(as.difftime(2, units = "weeks"), "week") * + unit_time_delta("week", format = format), + as.Date("2020-01-15") + ) + expect_equal( + tsibble::make_yearmonth(2000, 1) + + time_delta_to_n_steps(5, "yearmonth") * + unit_time_delta("yearmonth", format = format), + tsibble::make_yearmonth(2000, 6) + ) + expect_equal( + 1L + + time_delta_to_n_steps(5, "integer") * + unit_time_delta("integer", format = format), + 6L + ) + } +}) + +test_that("time_delta_to_approx_difftime works as expected", { + expect_equal( + time_delta_to_approx_difftime(as.difftime(3, units = "days"), "day"), + as.difftime(3, units = "days") + ) + expect_equal( + time_delta_to_approx_difftime(3, "day"), + as.difftime(3, units = "days") + ) + expect_equal( + time_delta_to_approx_difftime(3, "week"), + as.difftime(3, units = "weeks") + ) + expect_true(time_delta_to_approx_difftime(3, "yearmonth") %>% + `units<-`("days") %>% # nolint: indentation_linter + as.numeric() %>% + `-`(90) %>% + abs() %>% + `<=`(5)) + expect_error(time_delta_to_approx_difftime(3, "integer")) +}) + +test_that("format_time_delta works as expected", { + # time_type "day": + expect_equal( + format_time_delta(as.difftime(1, units = "days"), "day"), + "1 day" + ) + expect_equal( + format_time_delta(as.difftime(2, units = "days"), "day"), + "2 days" + ) + expect_equal( + format_time_delta(1, "day"), + "1 day" + ) + expect_equal( + format_time_delta(2, "day"), + "2 days" + ) + # time_type "week": + expect_equal( + format_time_delta(as.difftime(1, units = "weeks"), "week"), + "1 week" + ) + expect_equal( + 
format_time_delta(as.difftime(7, units = "days"), "week"),
+    "1 week"
+  )
+  expect_equal(
+    format_time_delta(1, "week"),
+    "1 week"
+  )
+  expect_equal(
+    format_time_delta(as.difftime(2, units = "weeks"), "week"),
+    "2 weeks"
+  )
+  # time_type "yearmonth":
+  expect_equal(
+    format_time_delta(1, "yearmonth"),
+    "1 month"
+  )
+  expect_equal(
+    format_time_delta(2, "yearmonth"),
+    "2 months"
+  )
+  # time_type "integer":
+  expect_equal(
+    format_time_delta(1, "integer"),
+    "1 time step"
+  )
+  expect_equal(
+    format_time_delta(2, "integer"),
+    "2 time steps"
+  )
+  # we don't handle length != 1; pluralize will raise error for us:
+  expect_error(format_time_delta(numeric(0), "day")) # zero-length input errors
+  expect_error(format_time_delta(1:5, "day")) # length > 1 errors as well
+})
+
+test_that("difftime_approx_ceiling_time_delta works as expected", {
+  # At time of writing, docs don't guarantee difftime_approx_ceiling_time_delta
+  # will output friendly time_deltas, so we'll include a standardization step in
+  # these tests. Prevent eye-glazing repetition by testing a bunch of cases
+  # simultaneously with dplyr:
+  comparisons <- tibble::tribble(
+    ~x_amount, ~x_units, ~time_type, ~expected_wrapped_friendly_result,
+    # days x day:
+    0, "days", "day", list(as.difftime(0.0, units = "days")),
+    1.5, "days", "day", list(as.difftime(2.0, units = "days")),
+    2.0, "days", "day", list(as.difftime(2.0, units = "days")),
+    # days x week:
+    2.0, "days", "week", list(as.difftime(1.0, units = "weeks")),
+    7.0, "days", "week", list(as.difftime(1.0, units = "weeks")),
+    8.0, "days", "week", list(as.difftime(2.0, units = "weeks")),
+    # weeks x week:
+    1.0, "weeks", "week", list(as.difftime(1.0, units = "weeks")),
+    1.1, "weeks", "week", list(as.difftime(2.0, units = "weeks")),
+    # days x yearmonth:
+    2.0, "days", "yearmonth", list(1.0),
+    32.0, "days", "yearmonth", list(2.0),
+  ) %>%
+    mutate(across(expected_wrapped_friendly_result, purrr::list_flatten)) %>%
+    rowwise() %>%
+    mutate(wrapped_friendly_result = as.difftime(x_amount, units = x_units) %>%
+      difftime_approx_ceiling_time_delta(time_type) %>%
+      time_delta_standardize(time_type, format = "friendly") %>%
+      list()) %>%
+    ungroup()
+
+  expect_equal(
+    comparisons$wrapped_friendly_result,
+    comparisons$expected_wrapped_friendly_result
+  )
+
+  # days x integer:
+  expect_error(difftime_approx_ceiling_time_delta(as.difftime(1, units = "days"), "integer"),
+    regexp = "Unsupported time_type"
+  )
+})
diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R
index 37125d533..55e79830a 100644
--- a/tests/testthat/test-utils.R
+++ b/tests/testthat/test-utils.R
@@ -280,141 +280,3 @@ test_that("as_slide_computation works", {
  h <- as_time_slide_computation(~ .x - .group_key)
  expect_equal(h(6, 3), 3)
})
-
-test_that("guess_period works", {
-  # Error cases:
-  expect_error(guess_period(numeric(0L)), class = "epiprocess__guess_period__not_enough_times")
-  expect_error(guess_period(c(1)), class = "epiprocess__guess_period__not_enough_times")
-  # Different numeric classes and cases:
-  expect_identical(guess_period(c(1, 8)), 7)
-  expect_identical(guess_period(c(1, 8, 15)), 7)
-  expect_identical(guess_period(c(1L, 8L, 15L)), 7L)
-  expect_identical(guess_period(c(0, 7, 14, 15)), 1)
-  # We currently allow the guessed frequency to not appear in the diffs, but
-  # this might not be a good idea as it likely indicates an issue with the data
-  # (#485). 
- expect_identical(guess_period(c(0, 2, 5)), 1) - expect_identical(guess_period(c(0, 4, 10)), 2) - # On Dates: - daily_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "day") - weekly_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "week") - expect_identical( - daily_dates[[1L]] + guess_period(daily_dates) * (seq_along(daily_dates) - 1L), - daily_dates - ) - expect_identical( - weekly_dates[[1L]] + guess_period(weekly_dates) * (seq_along(weekly_dates) - 1L), - weekly_dates - ) - # On POSIXcts: - daily_posixcts <- as.POSIXct(daily_dates, tz = "US/Aleutian") + 3600 - weekly_posixcts <- as.POSIXct(weekly_dates, tz = "US/Aleutian") + 3600 - expect_identical( - daily_posixcts[[1L]] + guess_period(daily_posixcts) * (seq_along(daily_posixcts) - 1L), - daily_posixcts - ) - expect_identical( - weekly_posixcts[[1L]] + guess_period(weekly_posixcts) * (seq_along(weekly_posixcts) - 1L), - weekly_posixcts - ) - # On POSIXlts: - daily_posixlts <- as.POSIXlt(daily_dates, tz = "UTC") + 3600 - weekly_posixlts <- as.POSIXlt(weekly_dates, tz = "UTC") + 3600 - expect_identical( - daily_posixlts[[1L]] + guess_period(daily_posixlts) * (seq_along(daily_posixlts) - 1L), - daily_posixlts - ) - expect_identical( - weekly_posixlts[[1L]] + guess_period(weekly_posixlts) * (seq_along(weekly_posixlts) - 1L), - weekly_posixlts - ) -}) - - -test_that("validate_slide_window_arg works", { - for (time_type in c("day", "week", "integer", "yearmonth")) { - expect_no_error(validate_slide_window_arg(Inf, time_type)) - } - expect_no_error(validate_slide_window_arg(as.difftime(1, units = "days"), "day")) - expect_no_error(validate_slide_window_arg(1, "day")) - expect_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "day"), - class = "epiprocess__validate_slide_window_arg" - ) - expect_error(validate_slide_window_arg(as.difftime(1, units = "secs"), "day"), - class = "epiprocess__validate_slide_window_arg" - ) - - expect_no_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "week")) - expect_error(validate_slide_window_arg(1, "week"), - class = "epiprocess__validate_slide_window_arg" - ) - - expect_no_error(validate_slide_window_arg(1, "integer")) - expect_error(validate_slide_window_arg(as.difftime(1, units = "days"), "integer"), - class = "epiprocess__validate_slide_window_arg" - ) - expect_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "integer"), - class = "epiprocess__validate_slide_window_arg" - ) - - expect_no_error(validate_slide_window_arg(1, "yearmonth")) - expect_error(validate_slide_window_arg(as.difftime(1, units = "weeks"), "yearmonth"), - class = "epiprocess__validate_slide_window_arg" - ) -}) - -test_that("unit_time_delta works", { - expect_equal( - as.Date("2020-01-01") + 5 * unit_time_delta("day"), - as.Date("2020-01-06") - ) - expect_equal( - as.Date("2020-01-01") + 2 * unit_time_delta("week"), - as.Date("2020-01-15") - ) - expect_equal( - tsibble::make_yearmonth(2000, 1) + 5 * unit_time_delta("yearmonth"), - tsibble::make_yearmonth(2000, 6) - ) - expect_equal( - 1L + 5L * unit_time_delta("integer"), - 6L - ) - # - expect_equal( - as.Date("2020-01-01") + - time_delta_to_n_steps(as.Date("2020-01-06") - as.Date("2020-01-01"), "day") * - unit_time_delta("day"), - as.Date("2020-01-06") - ) - expect_equal( - as.Date("2020-01-01") + - time_delta_to_n_steps(as.integer(as.Date("2020-01-06") - as.Date("2020-01-01")), "day") * - unit_time_delta("day"), - as.Date("2020-01-06") - ) - expect_equal( - as.Date("2020-01-01") + - 
time_delta_to_n_steps(as.Date("2020-01-15") - as.Date("2020-01-01"), "week") * - unit_time_delta("week"), - as.Date("2020-01-15") - ) - expect_equal( - as.Date("2020-01-01") + - time_delta_to_n_steps(as.difftime(2, units = "weeks"), "week") * - unit_time_delta("week"), - as.Date("2020-01-15") - ) - expect_equal( - tsibble::make_yearmonth(2000, 1) + - time_delta_to_n_steps(5, "yearmonth") * - unit_time_delta("yearmonth"), - tsibble::make_yearmonth(2000, 6) - ) - expect_equal( - 1L + - time_delta_to_n_steps(5, "integer") * - unit_time_delta("integer"), - 6L - ) -}) diff --git a/vignettes/epi_archive.Rmd b/vignettes/epi_archive.Rmd index 34f5a74b5..86a2aa95b 100644 --- a/vignettes/epi_archive.Rmd +++ b/vignettes/epi_archive.Rmd @@ -182,7 +182,7 @@ revision_details %>% max_lag = max(max_lag), spread = mean(spread), rel_spread = mean(rel_spread), - time_near_latest = mean(time_near_latest) + lag_near_latest = mean(lag_near_latest) ) ``` diff --git a/vignettes/epi_df.Rmd b/vignettes/epi_df.Rmd index 3976208e6..ac1ba41fe 100644 --- a/vignettes/epi_df.Rmd +++ b/vignettes/epi_df.Rmd @@ -136,7 +136,8 @@ where - `g` is a one-row tibble containing the values of the grouping variables for the associated group, for instance `g$geo_value` - `t` is the ref_time_value for the current window -- `...` are additional arguments +- `...` (optional) are any additional arguments you'd like to be able to forward + to your function The same computation as above can be done with a function: @@ -160,8 +161,8 @@ formula or function case. This can be adjusted with `.new_col_name`. ### Rolling computations with multiple column outputs -If your formula (or function) returns a data.frame, then the columns of the -data.frame will be unpacked into the resulting `epi_df` (in the sense of +If your formula (or function) returns a tibble (or other kind of data frame), +then its columns will be unpacked into the resulting `epi_df` (in the sense of `tidyr::unpack()`). For example, the following computes the 7-day trailing average of daily cases as well as the the 7-day trailing standard deviation of daily cases: @@ -169,7 +170,10 @@ daily cases: ```{r} edf %>% epi_slide( - ~ data.frame(cases_mean = mean(.x$cases, na.rm = TRUE), cases_sd = sd(.x$cases, na.rm = TRUE)), + ~ tibble( + cases_mean = mean(.x$cases, na.rm = TRUE), + cases_sd = sd(.x$cases, na.rm = TRUE) + ), .window_size = 7 ) ``` @@ -180,7 +184,8 @@ For the two most common sliding operations, we offer two optimized versions: `epi_slide_mean()` and `epi_slide_sum()`. These are much faster than `epi_slide()`, so we recommend using them when you are only interested in the mean or sum of a column. The following computes the 7-day trailing mean of daily -cases: +cases, allowing means and sums to be taken over fewer than 7 observations if +there is missingness (`na.rm = TRUE`): ```{r} edf %>% @@ -204,7 +209,7 @@ An `epi_df` object can have more key columns than just `geo_value` and can add this as a key column. We can then aggregate the data by these key columns using `sum_groups_epi_df()`. Let's use influenza hospitalization rate data from the CDC system FluSurv as an example. We can get it from the [Delphi -Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/flusurv.html) +Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/flusurv.html). 
```{r} library(epidatr) @@ -237,7 +242,10 @@ flu_data <- flu_data_api %>% "age_group_3" ~ "50--64 yr", "age_group_4" ~ ">= 65 yr", # Make this a factor with appropriate level ordering: - .ptype = factor(levels = c("0--4 yr", "5--17 yr", "18--49 yr", "50--64 yr", ">= 65 yr")) + .ptype = factor(levels = c( + "0--4 yr", "5--17 yr", "18--49 yr", + "50--64 yr", ">= 65 yr" + )) ) ) %>% # The API currently outputs `epiweek` in Date format (the constituent Sunday); @@ -260,13 +268,14 @@ means that there should only be one row for each combination of `geo_value`, time). Now we can aggregate the data by `age_group`, if we want to compute the total. -Since we are working with rates, we need to attach some population data in order -to do this aggregation. It's somewhat ambiguous whether FluSurv-NET reporting -uses either [NCHS](https://www.cdc.gov/nchs/nvss/bridged_race.htm) or +For count data, this would be just a single call to `sum_groups_epi_df()`. Since we +are working with rates, we need to attach some population data in order to do +this aggregation. It's somewhat ambiguous whether FluSurv-NET reporting uses +either [NCHS](https://www.cdc.gov/nchs/nvss/bridged_race.htm) or [Census](https://www.census.gov/programs-surveys/popest/technical-documentation/research/evaluation-estimates/2020-evaluation-estimates/2010s-county-detail.html) -populations for `time_value`s before 2020 included in reports from 2020 onward, -but in this case, the two sources agreed exactly. FluSurv-NET also directly -reports an overall rate, so we can check our work. +populations for `time_value`s before 2020 included in reports published in 2020 +onward, but at least for this example, these two sources agree exactly. +FluSurv-NET also directly reports an overall rate, so we can check our work. ```{r} # Population estimates for FluSurv-NET-covered part of CA on 2017-07-01 and # 2018-07-01, extracted and aggregated from "vintage 2020" estimates (actually @@ -306,8 +315,11 @@ fractional_rate_by_age_group <- # time_values 2017-07-01 through 2018-06-30, and the estimated population on # 2018-07-01 for all subsequent time_values: join_by(geo_value, age_group, closest(y$time_value <= x$time_value)), - suffix = c("", "_for_pop"), - relationship = "many-to-one", unmatched = "error" + # Generate errors if the two data sets don't line up as expected: + relationship = "many-to-one", unmatched = "error", + # We'll get a second time column indicating which population estimate + # was carried forward; name it time_value_for_pop: + suffix = c("", "_for_pop") ) %>% mutate(rate_contrib = rate * frac_pop) ``` @@ -361,7 +373,8 @@ First, let's create a data set with some missing data. We will reuse the dataset edf_missing <- edf %>% filter(geo_value %in% c("ca", "tx")) %>% group_by(geo_value) %>% - slice(1:3, 5:6) + slice(1:3, 5:6) %>% + ungroup() edf_missing %>% print(n = 10) @@ -371,6 +384,7 @@ Now let's fill in the missing data with explicit zeros: ```{r} edf_missing <- edf_missing %>% + group_by(geo_value) %>% complete( time_value = seq.Date(min(time_value), max(time_value), by = "day"), fill = list(cases = 0) @@ -380,6 +394,9 @@ edf_missing <- edf_missing %>% edf_missing %>% print(n = 12) ``` +We see that rows have been added for the missing `time_value` 2020-03-04 for +both of the states, with `cases` set to `0`. If there were explicit `NA`s in the +`cases` column, those would have been replaced by `0` as well. ### Detecting and filling time gaps with `tsibble`
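A minimal sketch of the gap tools this heading introduces, assuming a gappy `edf_missing` like the one constructed above (before the `complete()` call); the calls shown are `{tsibble}`'s own gap helpers, used here illustratively rather than as the vignette's exact code:

```r
library(tsibble)

# Key on geo_value and index by time_value:
edf_ts <- edf_missing %>%
  as_tibble() %>%
  as_tsibble(key = geo_value, index = time_value)

has_gaps(edf_ts)  # does any geo_value have missing time_values?
scan_gaps(edf_ts) # list the missing (geo_value, time_value) combinations
fill_gaps(edf_ts, cases = 0) # insert the missing rows, setting cases to 0
```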