diff --git a/.Rbuildignore b/.Rbuildignore index ccbd7ea77..8ca624127 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,3 +1,5 @@ +^renv$ +^renv\.lock$ ^.*\.Rproj$ ^\.Rproj\.user$ ^LICENSE\.md$ diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index f4b17a4b6..67f4bdb7f 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -1,10 +1,12 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +# +# Created with usethis + edited to run on PRs to dev, use API key. on: push: branches: [main, master] pull_request: - branches: [main, master] + branches: [main, master, dev] name: R-CMD-check @@ -27,3 +29,5 @@ jobs: needs: check - uses: r-lib/actions/check-r-package@v2 + env: + DELPHI_EPIDATA_KEY: ${{ secrets.SECRET_EPIPROCESS_GHACTIONS_DELPHI_EPIDATA_KEY }} diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 087f0b05f..12e352b39 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -1,10 +1,12 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +# +# Created with usethis + edited to run on PRs to dev, use API key. on: push: branches: [main, master] pull_request: - branches: [main, master] + branches: [main, master, dev] release: types: [published] workflow_dispatch: @@ -34,6 +36,8 @@ jobs: needs: website - name: Build site + env: + DELPHI_EPIDATA_KEY: ${{ secrets.SECRET_EPIPROCESS_GHACTIONS_DELPHI_EPIDATA_KEY }} run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) shell: Rscript {0} diff --git a/DESCRIPTION b/DESCRIPTION index fb6e687ee..d3d100a92 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,11 +1,12 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.5.0.9999 +Version: 0.6.0.9999 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", role = "aut"), person("Rafael", "Catoia", role = "ctb"), + person("Nat", "DeFries", role = "ctb"), person("Daniel", "McDonald", role = "aut"), person("Rachel", "Lobay", role = "ctb"), person("Ken", "Mawer", role = "ctb"), @@ -20,13 +21,15 @@ Description: This package introduces a common data structure for epidemiological work with revisions to these data sets over time, and offers associated utilities to perform basic signal processing tasks. 
License: MIT + file LICENSE -Imports: +Imports: + cli, data.table, - dplyr, + dplyr (>= 1.0.0), fabletools, feasts, generics, genlasso, + lifecycle (>= 1.0.1), lubridate, magrittr, purrr, @@ -35,9 +38,10 @@ Imports: slider, tibble, tidyr, - tidyselect, + tidyselect (>= 1.2.0), tsibble, - utils + utils, + vctrs Suggests: covidcast, epidatr, @@ -45,8 +49,7 @@ Suggests: knitr, outbreaks, rmarkdown, - testthat (>= 3.0.0), - vctrs, + testthat (>= 3.1.5), waldo (>= 0.3.1), withr VignetteBuilder: @@ -59,7 +62,22 @@ Config/testthat/edition: 3 Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.1 +RoxygenNote: 7.2.3 Depends: R (>= 2.10) URL: https://cmu-delphi.github.io/epiprocess/ +Collate: + 'archive.R' + 'correlation.R' + 'data.R' + 'epi_df.R' + 'epiprocess.R' + 'methods-epi_archive.R' + 'grouped_epi_archive.R' + 'growth_rate.R' + 'methods-epi_df.R' + 'outliers.R' + 'reexports.R' + 'slide.R' + 'utils.R' + 'utils_pipe.R' diff --git a/NAMESPACE b/NAMESPACE index a290ab27f..1d8affefb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,24 +1,31 @@ # Generated by roxygen2: do not edit by hand S3method("[",epi_df) -S3method(arrange,epi_df) +S3method("names<-",epi_df) S3method(as_epi_df,data.frame) S3method(as_epi_df,epi_df) S3method(as_epi_df,tbl_df) S3method(as_epi_df,tbl_ts) +S3method(as_tibble,epi_df) S3method(as_tsibble,epi_df) -S3method(filter,epi_df) +S3method(dplyr_col_modify,col_modify_recorder_df) +S3method(dplyr_col_modify,epi_df) +S3method(dplyr_reconstruct,epi_df) +S3method(dplyr_row_slice,epi_df) +S3method(epix_truncate_versions_after,epi_archive) +S3method(epix_truncate_versions_after,grouped_epi_archive) +S3method(group_by,epi_archive) S3method(group_by,epi_df) +S3method(group_by,grouped_epi_archive) +S3method(group_by_drop_default,grouped_epi_archive) S3method(group_modify,epi_df) -S3method(mutate,epi_df) +S3method(groups,grouped_epi_archive) S3method(next_after,Date) S3method(next_after,integer) S3method(print,epi_df) -S3method(relocate,epi_df) -S3method(rename,epi_df) -S3method(slice,epi_df) S3method(summary,epi_df) S3method(ungroup,epi_df) +S3method(ungroup,grouped_epi_archive) S3method(unnest,epi_df) export("%>%") export(archive_cases_dv_subset) @@ -35,12 +42,14 @@ export(epi_slide) export(epix_as_of) export(epix_merge) export(epix_slide) +export(epix_truncate_versions_after) export(filter) export(group_by) export(group_modify) export(growth_rate) export(is_epi_archive) export(is_epi_df) +export(is_grouped_epi_archive) export(max_version_with_row_in) export(mutate) export(new_epi_df) @@ -60,9 +69,16 @@ importFrom(data.table,key) importFrom(data.table,set) importFrom(data.table,setkeyv) importFrom(dplyr,arrange) +importFrom(dplyr,bind_rows) +importFrom(dplyr,dplyr_col_modify) +importFrom(dplyr,dplyr_reconstruct) +importFrom(dplyr,dplyr_row_slice) importFrom(dplyr,filter) importFrom(dplyr,group_by) +importFrom(dplyr,group_by_drop_default) importFrom(dplyr,group_modify) +importFrom(dplyr,group_vars) +importFrom(dplyr,groups) importFrom(dplyr,mutate) importFrom(dplyr,relocate) importFrom(dplyr,rename) @@ -72,20 +88,37 @@ importFrom(dplyr,ungroup) importFrom(lubridate,days) importFrom(lubridate,weeks) importFrom(magrittr,"%>%") +importFrom(purrr,map_lgl) importFrom(rlang,"!!!") importFrom(rlang,"!!") importFrom(rlang,.data) importFrom(rlang,.env) importFrom(rlang,arg_match) +importFrom(rlang,caller_arg) +importFrom(rlang,caller_env) +importFrom(rlang,check_dots_empty0) importFrom(rlang,enquo) importFrom(rlang,enquos) +importFrom(rlang,env) +importFrom(rlang,f_env) 
+importFrom(rlang,f_rhs) +importFrom(rlang,global_env) +importFrom(rlang,is_environment) +importFrom(rlang,is_formula) +importFrom(rlang,is_function) +importFrom(rlang,is_missing) importFrom(rlang,is_quosure) +importFrom(rlang,is_string) +importFrom(rlang,missing_arg) +importFrom(rlang,new_function) importFrom(rlang,quo_is_missing) importFrom(rlang,sym) importFrom(rlang,syms) importFrom(stats,cor) importFrom(stats,median) +importFrom(tibble,as_tibble) importFrom(tidyr,unnest) importFrom(tidyselect,eval_select) importFrom(tidyselect,starts_with) importFrom(tsibble,as_tsibble) +importFrom(utils,tail) diff --git a/NEWS.md b/NEWS.md index 8b49b1555..df6744252 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,12 +1,144 @@ -# epiprocess 0.5.0.9999 (development version) +# epiprocess 0.7.0 Note that `epiprocess` uses the [Semantic Versioning -("semver")](https://semver.org/) scheme for all release versions, but not for -development versions. A ".9999" suffix indicates a development version. +("semver")](https://semver.org/) scheme for all release versions, but any +inter-release development versions will include an additional ".9999" suffix. + +## Breaking changes: + +* Changes to `epi_slide` and `epix_slide`: + * If `f` is a function, it is now required to take at least three arguments. `f` must take an `epi_df` with the same column names as the archive's `DT`, + minus the `version` column; followed by a one-row tibble containing the + values of the grouping variables for the associated group; followed by a + reference time value, usually as a `Date` object. Optionally, it can take + any number of additional arguments after that, and forward values for those + arguments through `epi[x]_slide`'s `...` args. + * To make your existing slide computations work, add a third argument to + your `f` function to accept this new input: e.g., change `f = function(x, + g, <any other arguments>) { <body> }` to `f = function(x, g, rt, <any other arguments>) { <body> }`. + +## New features: + +* `epi_slide` and `epix_slide` also make the window data, group key and reference + time value available to slide computations specified as formulas or tidy + evaluation expressions, in additional or completely new ways. + * If `f` is a formula, it can now access the reference time value via `.z` or + `.ref_time_value`. + * If `f` is missing, the tidy evaluation expression in `...` can now refer to + the window data as an `epi_df` or `tibble` with `.x`, the group key with + `.group_key`, and the reference time value with `.ref_time_value`. The usual + `.data` and `.env` pronouns also work, but `pick()` and `cur_data()` are not supported; + work off of `.x` instead. +* `epix_slide` has been made more like `dplyr::group_modify`. It no longer + performs element/row recycling for size stability, accepts slide computation + outputs containing any number of rows, and no longer supports `all_rows`. + * To keep the old behavior, manually perform row recycling within `f` + computations, and/or `left_join` a data frame representing the desired + output structure with the current `epix_slide()` result to obtain the + desired repetitions and completions expected with `all_rows = TRUE`. +* `epix_slide` will only output grouped or ungrouped tibbles. Previously, it + would sometimes output `epi_df`s, but not consistently, and not always with + the metadata desired. Future versions will revisit this design, and consider + more closely whether/when/how to output an `epi_df`. + * To keep the old behavior, convert the output of `epix_slide()` to `epi_df` + when desired and set the metadata appropriately.
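To illustrate the computation interfaces described above, here is a minimal sketch; the archive `arch`, its `cases` column, and the window size are hypothetical stand-ins, not objects defined in this changeset:

```r
library(dplyr)
library(epiprocess)

# Function form: `f` receives the window data, the group key, and the
# reference time value, in that order.
arch %>%
  group_by(geo_value) %>%
  epix_slide(f = function(x, g, rt) mean(x$cases), before = 6L)

# Formula form: the reference time value is available as `.z` or
# `.ref_time_value`; the window data is `.x`.
arch %>%
  group_by(geo_value) %>%
  epix_slide(~ mean(.x$cases), before = 6L)

# Tidy evaluation form: use the `.x`, `.group_key`, and `.ref_time_value`
# pronouns (not `pick()` or `cur_data()`).
arch %>%
  group_by(geo_value) %>%
  epix_slide(mean_cases = mean(.x$cases), before = 6L)
```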
+ +## Improvements: + +* `epi_slide` and `epix_slide` now support `as_list_col = TRUE` when the slide + computations output atomic vectors, and output a list column in "chopped" + format (see `tidyr::chop`). +* `epi_slide` now works properly with slide computations that output just a + `Date` vector, rather than converting `slide_value` to a numeric column. + +# epiprocess 0.6.0 + +## Breaking changes: + +* Changes to both `epi_slide` and `epix_slide`: + * The `n`, `align`, and `before` arguments have been replaced by new `before` + and `after` arguments. To migrate to the new version, replace these + arguments in every `epi_slide` and `epix_slide` call. If you were only using + the `n` argument, then this means replacing `n = <n value>` with `before = + <n value> - 1`. + * `epi_slide`'s time windows now extend `before` time steps before and + `after` time steps after the corresponding `ref_time_values`. See + `?epi_slide` for details on matching old alignments. + * `epix_slide`'s time windows now extend `before` time steps before the + corresponding `ref_time_values` all the way through the latest data + available at the corresponding `ref_time_values`. + * Slide functions now keep any grouping of `x` in their results, like + `mutate` and `group_modify`. + * To obtain the old behavior, `dplyr::ungroup` the slide results immediately. +* Additional `epi_slide` changes: + * When using `as_list_col = TRUE` together with `ref_time_values` and + `all_rows=TRUE`, the marker for excluded computations is now a `NULL` entry + in the list column, rather than an `NA`; if you are using `tidyr::unnest()` + afterward and want to keep these missing data markers, you will need to + replace the `NULL` entries with `NA`s. Skipped computations are now more + uniformly detectable using `vctrs` methods. +* Additional `epix_slide` changes: + * `epix_slide`'s `group_by` argument has been replaced by `dplyr::group_by` and + `dplyr::ungroup` S3 methods; see the migration sketch below. The `group_by` method uses "data masking" (also + referred to as "tidy evaluation") rather than "tidy selection". + * Old syntax: + * `x %>% epix_slide(<computation>, group_by=c(col1, col2))` + * `x %>% epix_slide(<computation>, group_by=all_of(colname_vector))` + * New syntax: + * `x %>% group_by(col1, col2) %>% epix_slide(<computation>)` + * `x %>% group_by(across(all_of(colname_vector))) %>% epix_slide(<computation>)` + * `epix_slide` no longer defaults to grouping by non-`time_value`, non-`version` + key columns, instead considering all data to be in one big group. + * To obtain the old behavior, precede each `epix_slide` call lacking a + `group_by` argument with an appropriate `group_by` call. + * `epix_slide` now guesses `ref_time_values` to be a regularly spaced sequence + covering all the `DT$version` values and the `versions_end`, rather than the + distinct `DT$time_value`s. To obtain the old behavior, pass in + `ref_time_values = unique(<archive>$DT$time_value)`. +* `epi_archive`'s `clobberable_versions_start`'s default is now `NA`, so there + will be no warnings by default about potential nonreproducibility. To obtain + the old behavior, pass in `clobberable_versions_start = + max_version_with_row_in(x)`. + +## Potentially-breaking changes: + +* Fixed `[` on grouped `epi_df`s to maintain the grouping if possible when + dropping the `epi_df` class (e.g., when removing the `time_value` column). +* Fixed `epi_df` operations to be more consistent about decaying into + non-`epi_df`s when the result of the operation doesn't make sense as an + `epi_df` (e.g., when removing the `time_value` column).
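As a migration aid for the 0.6.0 slide changes above (see "Breaking changes"), here is a minimal before/after sketch; the archive `arch` and its `cases` column are hypothetical stand-ins, not objects defined in this changeset:

```r
library(dplyr)
library(epiprocess)

# Old (pre-0.6.0): tidyselect-style `group_by` argument and an `n`-sized window:
#   arch %>% epix_slide(mean_cases = mean(cases), n = 7, group_by = c(geo_value))

# New: group with the `dplyr::group_by` S3 method, and use `before = n - 1`:
arch %>%
  group_by(geo_value) %>%
  epix_slide(mean_cases = mean(cases), before = 6L)
```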
+* Changed `bind_rows` on grouped `epi_df`s to not drop the `epi_df` class. Like + with ungrouped `epi_df`s, the metadata of the result is still simply taken + from the first result, and may be inappropriate + ([#242](https://github.com/cmu-delphi/epiprocess/issues/242)). +* `epi_slide` and `epix_slide` now raise an error rather than silently filtering + out `ref_time_values` that don't meet their expectations. + +## New features: + +* `epix_slide`, `$slide` have a new parameter `all_versions`. With + `all_versions=TRUE`, `epix_slide` will pass a filtered `epi_archive` to each + computation rather than an `epi_df` snapshot. This enables, e.g., performing + pseudoprospective forecasts with a revision-aware forecaster using nested + `epix_slide` operations. + +## Improvements: + +* Added `dplyr::group_by` and `dplyr::ungroup` S3 methods for `epi_archive` + objects, plus corresponding `$group_by` and `$ungroup` R6 methods. The + `group_by` implementation supports the `.add` and `.drop` arguments, and + `ungroup` supports partial ungrouping with `...`. +* `as_epi_archive`, `epi_archive$new` now perform checks for the key uniqueness + requirement (part of + [#154](https://github.com/cmu-delphi/epiprocess/issues/154)). ## Cleanup: * Added a `NEWS.md` file to track changes to the package. +* Implemented `?dplyr::dplyr_extending` for `epi_df`s + ([#223](https://github.com/cmu-delphi/epiprocess/issues/223)). * Fixed various small documentation issues ([#217](https://github.com/cmu-delphi/epiprocess/issues/217)). # epiprocess 0.5.0: diff --git a/R/archive.R b/R/archive.R index 2f6af5e0c..5897fc4d4 100644 --- a/R/archive.R +++ b/R/archive.R @@ -61,7 +61,7 @@ validate_version_bound = function(version_bound, x, na_ok, } } -#' Default arg helper: `max(x$version)`, with error if `x` has 0 rows +#' `max(x$version)`, with error if `x` has 0 rows #' #' Exported to make defaults more easily copyable. #' @@ -233,15 +233,17 @@ epi_archive = #' carried forward (LOCF) to interpolate between the version data provided, #' rows that don't change these LOCF results can potentially be omitted to #' save space while maintaining the same behavior (with the help of the -#' `clobberable_versions_start` and `versions_end` fields in some -#' edge cases). `TRUE` will remove these rows, `FALSE` will not, and missing -#' or `NULL` will remove these rows and issue a warning. Generally, this can -#' be set to `TRUE`, but if you directly inspect or edit the fields of the -#' `epi_archive` such as its `DT`, you will have to determine whether -#' `compactify=TRUE` will produce the desired results. If compactification -#' here is removing a large proportion of the rows, this may indicate a -#' potential for space, time, or bandwidth savings upstream the data pipeline, -#' e.g., when fetching, storing, or preparing the input data `x` +#' `clobberable_versions_start` and `versions_end` fields in some edge cases). +#' `TRUE` will remove these rows, `FALSE` will not, and missing or `NULL` will +#' remove these rows and issue a warning. Generally, this can be set to +#' `TRUE`, but if you directly inspect or edit the fields of the `epi_archive` +#' such as its `DT`, or rely on redundant updates to achieve a certain +#' behavior of the `ref_time_values` default in `epix_slide`, you will have to +#' determine whether `compactify=TRUE` will produce the desired results. 
If +#' compactification here is removing a large proportion of the rows, this may +#' indicate a potential for space, time, or bandwidth savings upstream in the +#' data pipeline, e.g., by avoiding fetching, storing, or processing these +#' rows of `x`. #' @param clobberable_versions_start Optional; as in [`as_epi_archive`] #' @param versions_end Optional; as in [`as_epi_archive`] #' @return An `epi_archive` object. @@ -305,10 +307,10 @@ epi_archive = Abort("compactify must be boolean or null.") } - # Apply defaults and conduct checks and apply defaults for + # Apply defaults and conduct checks for # `clobberable_versions_start`, `versions_end`: if (missing(clobberable_versions_start)) { - clobberable_versions_start <- max_version_with_row_in(x) + clobberable_versions_start <- NA } if (missing(versions_end)) { versions_end <- max_version_with_row_in(x) @@ -336,6 +338,12 @@ epi_archive = key_vars = c("geo_value", "time_value", other_keys, "version") DT = as.data.table(x, key = key_vars) if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) + + maybe_first_duplicate_key_row_index = anyDuplicated(DT, by=key(DT)) + if (maybe_first_duplicate_key_row_index != 0L) { + Abort("`x` must have one row per unique combination of the key variables. If you have additional key variables other than `geo_value`, `time_value`, and `version`, such as an age group column, please specify them in `other_keys`. Otherwise, check for duplicate rows and/or conflicting values for the same measurement.", + class = "epiprocess__epi_archive_requires_unique_key") + } # Checks to see if a value in a vector is LOCF is_locf <- function(vec) { @@ -397,8 +405,8 @@ epi_archive = self$clobberable_versions_start = clobberable_versions_start self$versions_end = versions_end }, - print = function() { - cat("An `epi_archive` object, with metadata:\n") + print = function(class = TRUE, methods = TRUE) { + if (class) cat("An `epi_archive` object, with metadata:\n") cat(sprintf("* %-9s = %s\n", "geo_type", self$geo_type)) cat(sprintf("* %-9s = %s\n", "time_type", self$time_type)) if (!is.null(self$additional_metadata)) { @@ -420,7 +428,7 @@ epi_archive = cat(sprintf("* %-14s = %s\n", "last version with update", max(self$DT$version))) if (is.na(self$clobberable_versions_start)) { - cat("No clobberable versions\n") + cat("* No clobberable versions\n") } else { cat(sprintf("* %-14s = %s\n", "clobberable versions start", self$clobberable_versions_start)) @@ -430,22 +438,21 @@ epi_archive = cat("----------\n") cat(sprintf("Data archive (stored in DT field): %i x %i\n", nrow(self$DT), ncol(self$DT))) - cat("----------\n") cat(sprintf("Columns in DT: %s\n", paste(ifelse(length( colnames(self$DT)) <= 4, paste(colnames(self$DT), collapse = ", "), paste(paste(colnames(self$DT)[1:4], collapse = ", "), "and", length(colnames(self$DT)[5:length(colnames(self$DT))]), "more columns"))))) - cat("----------\n") - cat(sprintf("Public methods: %s\n", - paste(names(epi_archive$public_methods), - collapse = ", ")),"\n") - + if (methods) { + cat("----------\n") + writeLines(wrap_varnames(initial = "Public R6 methods: ", + names(epi_archive$public_methods))) + } }, ##### #' @description Generates a snapshot in `epi_df` format as of a given version. #' See the documentation for the wrapper function [`epix_as_of()`] for details.
#' @importFrom data.table between key - as_of = function(max_version, min_time_value = -Inf) { + as_of = function(max_version, min_time_value = -Inf, all_versions = FALSE) { # Self max version and other keys other_keys = setdiff(key(self$DT), c("geo_value", "time_value", "version")) @@ -465,12 +472,23 @@ epi_archive = if (max_version > self$versions_end) { Abort("`max_version` must be at most `self$versions_end`.") } + if (!rlang::is_bool(all_versions)) { + Abort("`all_versions` must be TRUE or FALSE.") + } if (!is.na(self$clobberable_versions_start) && max_version >= self$clobberable_versions_start) { - Warn('Getting data as of some "clobberable" version that might be hotfixed, synced, or otherwise replaced later with different data using the same version tag. Thus, the snapshot that we produce here might not be reproducible later. See `?epi_archive` for more info and `?epix_as_of` on how to muffle.', + Warn('Getting data as of some recent version which could still be overwritten (under routine circumstances) without assigning a new version number (a.k.a. "clobbered"). Thus, the snapshot that we produce here should not be expected to be reproducible later. See `?epi_archive` for more info and `?epix_as_of` on how to muffle.', class="epiprocess__snapshot_as_of_clobberable_version") } # Filter by version and return + if (all_versions) { + result = epix_truncate_versions_after(self, max_version) + # `self` has already been `clone`d in `epix_truncate_versions_after` + # so we can modify the new archive's DT directly. + result$DT = result$DT[time_value >= min_time_value, ] + return(result) + } + return( # Make sure to use data.table ways of filtering and selecting self$DT[time_value >= min_time_value & @@ -478,7 +496,7 @@ epi_archive = unique(by = c("geo_value", "time_value", other_keys), fromLast = TRUE) %>% tibble::as_tibble() %>% - dplyr::select(-.data$version) %>% + dplyr::select(-"version") %>% as_epi_df(geo_type = self$geo_type, time_type = self$time_type, as_of = max_version, @@ -514,7 +532,7 @@ epi_archive = next_version_tag = next_after(self$versions_end) if (next_version_tag > fill_versions_end) { Abort(sprintf(paste( - "Apparent problem with `next_after` implementation:", + "Apparent problem with `next_after` method:", "archive contained observations through version %s", "and the next possible version was supposed to be %s,", "but this appeared to jump from a version < %3$s", @@ -552,6 +570,38 @@ epi_archive = return (invisible(self)) }, ##### +#' @description Filter to keep only older versions, mutating the archive by +#' potentially reseating but not mutating some fields. `DT` is likely, but not +#' guaranteed, to be copied. Returns the mutated archive +#' [invisibly][base::invisible]. 
+#' @param x as in [`epix_truncate_versions_after`] +#' @param max_version as in [`epix_truncate_versions_after`] + truncate_versions_after = function(max_version) { + if (length(max_version) != 1) { + Abort("`max_version` cannot be a vector.") + } + if (is.na(max_version)) { + Abort("`max_version` must not be NA.") + } + if (!identical(class(max_version), class(self$DT$version)) || + !identical(typeof(max_version), typeof(self$DT$version))) { + Abort("`max_version` and `DT$version` must have same `class` and `typeof`.") + } + if (max_version > self$versions_end) { + Abort("`max_version` must be at most `self$versions_end`.") + } + self$DT <- self$DT[self$DT$version <= max_version, colnames(self$DT), with=FALSE] + # (^ this filter operation seems to always copy the DT, even if it + # keeps every entry; we don't guarantee this behavior in + # documentation, though, so we could change to alias in this case) + if (!is.na(self$clobberable_versions_start) && + self$clobberable_versions_start > max_version) { + self$clobberable_versions_start <- NA + } + self$versions_end <- max_version + return (invisible(self)) + }, + ##### #' @description Merges another `epi_archive` with the current one, mutating the #' current one by reseating its `DT` and several other fields, but avoiding #' mutation of the old `DT`; returns the current archive @@ -579,156 +629,38 @@ epi_archive = return (invisible(self)) }, ##### + group_by = function(..., .add = FALSE, .drop = dplyr::group_by_drop_default(self)) { + group_by.epi_archive(self, ..., .add=.add, .drop=.drop) + }, #' @description Slides a given function over variables in an `epi_archive` #' object. See the documentation for the wrapper function [`epix_slide()`] for #' details. #' @importFrom data.table key #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms - slide = function(f, ..., n, group_by, ref_time_values, + slide = function(f, ..., before, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", - all_rows = FALSE) { - # If missing, then set ref time values to be everything; else make - # sure we intersect with observed time values - if (missing(ref_time_values)) { - ref_time_values = unique(self$DT$time_value) - } - else { - ref_time_values = ref_time_values[ref_time_values %in% - unique(self$DT$time_value)] - } - - # If a custom time step is specified, then redefine units - before_num = n-1 - if (!missing(time_step)) before_num = time_step(n-1) - - # What to group by? If missing, set according to internal keys; - # otherwise, tidyselect. - if (quo_is_missing(enquo(group_by))) { - group_by <- syms(setdiff(key(self$DT), c("time_value", "version"))) - } else { - group_by <- syms(names(eval_select(enquo(group_by), self$DT))) - } - - # Symbolize column name - new_col = sym(new_col_name) - - # Key variable names, apart from time value and version - key_vars = setdiff(key(self$DT), c("time_value", "version")) - - # Computation for one group, one time value - comp_one_grp = function(.data_group, - f, ..., - time_value, - key_vars, - new_col) { - # Carry out the specified computation - comp_value = f(.data_group, ...) - - # Count the number of appearances of the reference time value. - # Note: ideally, we want to directly count occurrences of the ref - # time value but due to latency, this will often not appear in the - # data group. 
So we count the number of unique key values, outside - # of the time value column - count = sum(!duplicated(.data_group[, key_vars])) - - # If we get back an atomic vector - if (is.atomic(comp_value)) { - if (length(comp_value) == 1) { - comp_value = rep(comp_value, count) - } - # If not a singleton, should be the right length, else abort - else if (length(comp_value) != count) { - Abort("If the slide computation returns an atomic vector, then it must have a single element, or else one element per appearance of the reference time value in the local window.") - } - } - - # If we get back a data frame - else if (is.data.frame(comp_value)) { - if (nrow(comp_value) == 1) { - comp_value = rep(list(comp_value), count) - } - # If not a single row, should be the right length, else abort - else if (nrow(comp_value) != count) { - Abort("If the slide computation returns a data frame, then it must have a single row, or else one row per appearance of the reference time value in the local window.") - } - # Make into a list - else { - comp_value = split(comp_value, 1:nrow(comp_value)) - } - } - - # If neither an atomic vector data frame, then abort - else { - Abort("The slide computation must return an atomic vector or a data frame.") - } - - # Note that we've already recycled comp value to make size stable, - # so tibble() will just recycle time value appropriately - return(tibble::tibble(time_value = time_value, - !!new_col := comp_value)) - } - - # If f is not missing, then just go ahead, slide by group - if (!missing(f)) { - if (rlang::is_formula(f)) f = rlang::as_function(f) - - x = purrr::map_dfr(ref_time_values, function(t) { - self$as_of(t, min_time_value = t - before_num) %>% - dplyr::group_by(!!!group_by) %>% - dplyr::group_modify(comp_one_grp, - f = f, ..., - time_value = t, - key_vars = key_vars, - new_col = new_col, - .keep = TRUE) %>% - dplyr::ungroup() - }) - } - - # Else interpret ... as an expression for tidy evaluation - else { - quos = enquos(...) - if (length(quos) == 0) { - Abort("If `f` is missing then a computation must be specified via `...`.") - } - if (length(quos) > 1) { - Abort("If `f` is missing then only a single computation can be specified via `...`.") - } - - quo = quos[[1]] - f = function(x, quo, ...) 
rlang::eval_tidy(quo, x) - new_col = sym(names(rlang::quos_auto_name(quos))) - - x = purrr::map_dfr(ref_time_values, function(t) { - self$as_of(t, min_time_value = t - before_num) %>% - dplyr::group_by(!!!group_by) %>% - dplyr::group_modify(comp_one_grp, - f = f, quo = quo, - time_value = t, - key_vars = key_vars, - new_col = new_col, - .keep = TRUE) %>% - dplyr::ungroup() - }) - } - - # Unnest if we need to - if (!as_list_col) { - x = tidyr::unnest(x, !!new_col, names_sep = names_sep) - } - - # Join to get all rows, if we need to, then return - if (all_rows) { - cols = c(as.character(group_by), "time_value") - y = unique(self$DT[, ..cols]) - x = dplyr::left_join(y, x, by = cols) - } - return(x) + all_versions = FALSE) { + # For an "ungrouped" slide, treat all rows as belonging to one big + # group (group by 0 vars), like `dplyr::summarize`, and let the + # resulting `grouped_epi_archive` handle the slide: + self$group_by()$slide( + f, ..., + before = before, ref_time_values = ref_time_values, + time_step = time_step, new_col_name = new_col_name, + as_list_col = as_list_col, names_sep = names_sep, + all_versions = all_versions + ) %>% + # We want a slide on ungrouped archives to output something + # ungrouped, rather than retaining the trivial (0-variable) + # grouping applied above. So we `ungroup()`. However, the current + # `dplyr` implementation automatically ignores/drops trivial + # groupings, so this is just a no-op for now. + ungroup() } ) ) - + #' Convert to `epi_archive` format #' #' Converts a data frame, data table, or tibble into an `epi_archive` @@ -767,25 +699,18 @@ epi_archive = #' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and #' `typeof`: specifically, either (a) the earliest version that could be #' subject to "clobbering" (being overwritten with different update data, but -#' using the same version tag as the old update data), or (b) `NA`, to +#' using the *same* version tag as the old update data), or (b) `NA`, to #' indicate that no versions are clobberable. There are a variety of reasons -#' why versions could be clobberable, such as upstream hotfixes to the latest -#' version, or delays in data synchronization that were mistaken for versions -#' with no updates; potential causes vary between different data pipelines. -#' The default value is `max_version_with_row_in(x)`; this default assumes -#' that (i) if a row in `x` (even one that `compactify` would consider -#' redundant) is present with version `ver`, then all previous versions must -#' be finalized and non-clobberable, although `ver` (and onward) might still -#' be modified, (ii) even if we have "observed" empty updates for some -#' versions beyond `max(x$version)` (as indicated by `versions_end`; -#' see below), we can't assume `max(x$version)` has been finalized, because we -#' might see a nonfinalized version + empty subsequent versions due to -#' upstream database replication delays in combination with the upstream -#' replicas using last-version-carried-forward to extrapolate that there were -#' no updates, (iii) "redundant" update rows that would be removed by -#' `compactify` are not redundant, and actually come from an explicit version -#' release that indicates that preceding versions are finalized. If `nrow(x) -#' == 0`, then this argument is mandatory. 
+#' why versions could be clobberable under routine circumstances, such as (a) +#' today's version of one/all of the columns being published after initially +#' being filled with `NA` or LOCF, (b) a buggy version of today's data being +#' published but then fixed and republished later in the day, or (c) data +#' pipeline delays (e.g., publisher uploading, periodic scraping, database +#' syncing, periodic fetching, etc.) that make events (a) or (b) reflected +#' later in the day (or even on a different day) than expected; potential +#' causes vary between different data pipelines. The default value is `NA`, +#' which doesn't consider any versions to be clobberable. Another setting that +#' may be appropriate for some pipelines is `max_version_with_row_in(x)`. #' @param versions_end Optional; length-1, same `class` and `typeof` as #' `x$version`: what is the last version we have observed? The default is #' `max_version_with_row_in(x)`, but values greater than this could also be @@ -842,7 +767,7 @@ epi_archive = as_epi_archive = function(x, geo_type, time_type, other_keys, additional_metadata = list(), compactify = NULL, - clobberable_versions_start = max_version_with_row_in(x), + clobberable_versions_start = NA, versions_end = max_version_with_row_in(x)) { epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata, compactify, clobberable_versions_start, versions_end) @@ -851,12 +776,23 @@ as_epi_archive = function(x, geo_type, time_type, other_keys, #' Test for `epi_archive` format #' #' @param x An object. +#' @param grouped_okay Optional; Boolean; should a `grouped_epi_archive` also +#' count? Default is `FALSE`. #' @return `TRUE` if the object inherits from `epi_archive`. #' #' @export #' @examples #' is_epi_archive(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive) #' is_epi_archive(archive_cases_dv_subset) # TRUE -is_epi_archive = function(x) { - inherits(x, "epi_archive") +#' +#' # By default, grouped_epi_archives don't count as epi_archives, as they may +#' # support a different set of operations from regular `epi_archives`. This +#' # behavior can be controlled by `grouped_okay`. +#' grouped_archive = archive_cases_dv_subset$group_by(geo_value) +#' is_epi_archive(grouped_archive) # FALSE +#' is_epi_archive(grouped_archive, grouped_okay=TRUE) # TRUE +#' +#' @seealso [`is_grouped_epi_archive`] +is_epi_archive = function(x, grouped_okay=FALSE) { + inherits(x, "epi_archive") || grouped_okay && inherits(x, "grouped_epi_archive") } diff --git a/R/data.R b/R/data.R index f642ac47d..248288eb1 100644 --- a/R/data.R +++ b/R/data.R @@ -65,7 +65,7 @@ #' #' Modifications: #' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html}{From the COVIDcast Doctor Visits API}: The signal `percent_cli` is taken directly from the API without changes. -#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: `case_rate_7d_av` is taken directly from the JHU CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} without changes. The 7-day average signals are computed by Delphi by calculating moving averages of the preceding 7 days, so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive. 
+#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: The `case_rate_7d_av` signal was computed by Delphi from the original JHU-CSSE data by calculating moving averages of the preceding 7 days, so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive. #' * Furthermore, the data is a subset of the full dataset, the signal names slightly altered, and formatted into a tibble. #' #' @export diff --git a/R/epi_df.R b/R/epi_df.R index abef52c0f..c2b84c83b 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -159,7 +159,7 @@ new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of, # Reorder columns (geo_value, time_value, ...) if(sum(dim(x)) != 0){ - x = dplyr::relocate(x, .data$geo_value, .data$time_value) + x = dplyr::relocate(x, "geo_value", "time_value") } # Apply epi_df class, attach metadata, and return diff --git a/R/epiprocess.R b/R/epiprocess.R index 0749647f9..e047de8ce 100644 --- a/R/epiprocess.R +++ b/R/epiprocess.R @@ -7,3 +7,4 @@ #' @docType package #' @name epiprocess NULL +utils::globalVariables(c(".x", ".group_key", ".ref_time_value")) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R new file mode 100644 index 000000000..fd91ed4d7 --- /dev/null +++ b/R/grouped_epi_archive.R @@ -0,0 +1,532 @@ + +#' Get var names from select-only `tidy_select`ing `...` in `.data` +#' +#' Convenience function for performing a `tidy_select` on dots according to its +#' docs, and taking the names (rather than the integer indices). +#' +#' @param ... tidyselect-syntax selection description +#' @param .data named vector / data frame; context for the description / the +#' object to which the selections apply +#' @return character vector containing names of entries/columns of +#' `names(.data)` denoting the selection +#' +#' @noRd +eval_pure_select_names_from_dots = function(..., .data) { + # `?tidyselect::eval_select` tells us to use this form when we take in dots. + # It seems a bit peculiar, since the expr doesn't pack with it a way to get at + # the environment for the dots, but it looks like `eval_select` will assume + # the caller env (our `environment()`) when given an expr, and thus have + # access to the dots. + # + # If we were allowing renaming, we'd need to be careful about which names (new + # vs. old vs. both) to return here. + names(tidyselect::eval_select(rlang::expr(c(...)), .data, allow_rename=FALSE)) +} + +#' Get names of dots without forcing the dots +#' +#' For use in functions that use nonstandard evaluation (NSE) on the dots; we +#' can't use the pattern `names(list(...))` in this case because it will attempt +#' to force/(standard-)evaluate the dots, and we want to avoid attempted forcing of the +#' dots if we're using NSE. +#' +#' @noRd +nse_dots_names = function(...) { + names(rlang::call_match()) +} +nse_dots_names2 = function(...) { + rlang::names2(rlang::call_match()) +} + +#' @importFrom dplyr group_by_drop_default +#' @noRd +grouped_epi_archive = + R6::R6Class( + classname = "grouped_epi_archive", + # (We don't R6-inherit `epi_archive` or S3-multiclass with "epi_archive"; + # any "inheritance" of functionality must be done via wrapper functions that + # are checked/tested for sensible operation.)
+ private = list( + ungrouped = NULL, + vars = NULL, + drop = NULL + ), + public = list( + initialize = function(ungrouped, vars, drop) { + if (inherits(ungrouped, "grouped_epi_archive")) { + Abort("`ungrouped` must not already be grouped (neither automatic regrouping nor nested grouping is supported). Either use `group_by` with `.add=TRUE`, or `ungroup` first.", + class="epiprocess__grouped_epi_archive__ungrouped_arg_is_already_grouped", + epiprocess__ungrouped_class = class(ungrouped), + epiprocess__ungrouped_groups = groups(ungrouped)) + } + if (!inherits(ungrouped, "epi_archive")) { + Abort("`ungrouped` must be an epi_archive", + class="epiprocess__grouped_epi_archive__ungrouped_arg_is_not_epi_archive", + epiprocess__ungrouped_class = class(ungrouped)) + } + if (!is.character(vars)) { + Abort("`vars` must be a character vector (any tidyselection should have already occurred in a helper method).", + class="epiprocess__grouped_epi_archive__vars_is_not_chr", + epiprocess__vars_class = class(vars), + epiprocess__vars_type = typeof(vars)) + } + if (!all(vars %in% names(ungrouped$DT))) { + Abort("`vars` must be selected from the names of columns of `ungrouped$DT`", + class="epiprocess__grouped_epi_archive__vars_contains_invalid_entries", + epiprocess__vars = vars, + epiprocess__DT_names = names(ungrouped$DT)) + } + if ("version" %in% vars) { + Abort("`version` has a special interpretation and cannot be used by itself as a grouping variable") + } + if (!rlang::is_bool(drop)) { + Abort("`drop` must be a Boolean", + class="epiprocess__grouped_epi_archive__drop_is_not_bool", + epiprocess__drop = drop) + } + # ----- + private$ungrouped <- ungrouped + private$vars <- vars + private$drop <- drop + }, + print = function(class = TRUE, methods = TRUE) { + if (class) cat("A `grouped_epi_archive` object:\n") + writeLines(wrap_varnames(private$vars, initial="* Groups: ")) + # If none of the grouping vars is a factor, then $drop doesn't seem + # relevant, so try to be less verbose and don't message about it. + # + # Below map-then-extract may look weird, but the more natural + # extract-then-map appears to trigger copies of the extracted columns + # since we are working with a `data.table` (unless we go through + # `as.list`, but its current column-aliasing behavior is probably not + # something to rely too much on), while map functions currently appear + # to avoid column copies. 
+ if (any(purrr::map_lgl(private$ungrouped$DT, is.factor)[private$vars])) { + cat(strwrap(init="* ", prefix=" ", sprintf( + "%s groups formed by factor levels that don't appear in the data", + if (private$drop) "Drops" else "Does not drop" + ))) + cat("\n") + } + cat("It wraps an ungrouped `epi_archive`, with metadata:\n") + private$ungrouped$print(class = FALSE, methods = FALSE) + if (methods) { + cat("----------\n") + cat("Public `grouped_epi_archive` R6 methods:\n") + grouped_method_names = names(grouped_epi_archive$public_methods) + ungrouped_method_names = names(epi_archive$public_methods) + writeLines(wrap_varnames(initial = "\u2022 Specialized `epi_archive` methods: ", + intersect(grouped_method_names, ungrouped_method_names))) + writeLines(wrap_varnames(initial = "\u2022 Exclusive to `grouped_epi_archive`: ", + setdiff(grouped_method_names, ungrouped_method_names))) + writeLines(wrap_varnames(initial = "\u2022 `ungroup` to use: ", + setdiff(ungrouped_method_names, grouped_method_names))) + } + # Return self invisibly for convenience in `$`-"pipe": + invisible(self) + }, + group_by = function(..., .add = FALSE, .drop = dplyr::group_by_drop_default(self)) { + if (!rlang::is_bool(.add)) { + Abort("`.add` must be a Boolean") + } + if (!.add) { + Abort('`group_by` on a `grouped_epi_archive` with `.add=FALSE` is forbidden + (neither automatic regrouping nor nested grouping is supported). + If you want to "regroup", replacing the existing grouping vars, `ungroup` first and then `group_by`. + If you want to add to the existing grouping vars, call `group_by` specifying `.add=TRUE`. + ', + class = "epiprocess__grouped_epi_archive_group_by_with_add_FALSE") + } else { + # `group_by` `...` computations are performed on ungrouped data (see + # `?dplyr::group_by`) + detailed_mutate = epix_detailed_restricted_mutate(private$ungrouped, ...) + out_ungrouped = detailed_mutate[["archive"]] + vars_from_dots = detailed_mutate[["request_names"]] + vars = union(private$vars, vars_from_dots) + # Use the mutated archive so that any columns computed in `...` are kept: + grouped_epi_archive$new(out_ungrouped, vars, .drop) + } + }, + group_by_drop_default = function() { + private$drop + }, + groups = function() { + rlang::syms(private$vars) + }, + ungroup = function(...) { + if (rlang::dots_n(...) == 0L) { + # No dots = special behavior: remove all grouping vars and convert to + # an ungrouped class, as with `grouped_df`s. + private$ungrouped + } else { + exclude_vars = eval_pure_select_names_from_dots(..., .data=private$ungrouped$DT) + # (requiring a pure selection here is a little stricter than dplyr + # implementations, but passing a renaming selection into `ungroup` + # seems pretty weird.) + result_vars = private$vars[! private$vars %in% exclude_vars] + # `vars` might be length 0 if the user's tidyselection removed all + # grouping vars. Unlike with tibble, opt here to keep the result as a + # grouped_epi_archive, for output class consistency when `...` is + # provided. + grouped_epi_archive$new(private$ungrouped, result_vars, private$drop) + } + }, +#' @description Filter to keep only older versions by mutating the underlying +#' `epi_archive` using `$truncate_versions_after`. Returns the mutated +#' `grouped_epi_archive` [invisibly][base::invisible].
+#' @param x as in [`epix_truncate_versions_after`] +#' @param max_version as in [`epix_truncate_versions_after`] + truncate_versions_after = function(max_version) { + # The grouping is irrelevant for this method; if we were to split into + # groups and recombine appropriately, we should get the same result as + # just leveraging the ungrouped method, so just do the latter: + private$ungrouped$truncate_versions_after(max_version) + return (invisible(self)) + }, +#' @description Slides a given function over variables in a `grouped_epi_archive` +#' object. See the documentation for the wrapper function [`epix_slide()`] for +#' details. +#' @importFrom data.table key address +#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms env + slide = function(f, ..., before, ref_time_values, + time_step, new_col_name = "slide_value", + as_list_col = FALSE, names_sep = "_", + all_versions = FALSE) { + # Perform some deprecated argument checks without using `<param> = + # deprecated()` in the function signature, because they are from + # early development versions and much more likely to be clutter than + # informative in the signature. + if ("group_by" %in% nse_dots_names(...)) { + Abort(" + The `group_by` argument to `slide` has been removed; please use + the `group_by` S3 generic function or `$group_by` R6 method + before the slide instead. (If you were instead trying to pass a + `group_by` argument to `f` or create a column named `group_by`, + this check is a false positive, but you will still need to use a + different column name here and rename the resulting column after + the slide.) + ", class = "epiprocess__epix_slide_group_by_parameter_deprecated") + } + if ("all_rows" %in% nse_dots_names(...)) { + Abort(" + The `all_rows` argument has been removed from `epix_slide` (but + is still supported in `epi_slide`). Add rows for excluded + results with a manual join instead. + ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated") + } + + if (missing(ref_time_values)) { + ref_time_values = epix_slide_ref_time_values_default(private$ungrouped) + } else if (length(ref_time_values) == 0L) { + Abort("`ref_time_values` must have at least one element.") + } else if (any(is.na(ref_time_values))) { + Abort("`ref_time_values` must not include `NA`.") + } else if (anyDuplicated(ref_time_values) != 0L) { + Abort("`ref_time_values` must not contain any duplicates; use `unique` if appropriate.") + } else if (any(ref_time_values > private$ungrouped$versions_end)) { + Abort("All `ref_time_values` must be `<=` the `versions_end`.") + } else { + # Sort, for consistency with `epi_slide`, although the current + # implementation doesn't take advantage of it.
+ ref_time_values = sort(ref_time_values) + } + + # Check that `f` takes enough args + if (!missing(f) && is.function(f)) { + assert_sufficient_f_args(f, ..., n_mandatory_f_args = 3L) + } + + # Validate and pre-process `before`: + if (missing(before)) { + Abort("`before` is required (and must be passed by name); + if you did not want to apply a sliding window but rather + to map `as_of` and `f` across various `ref_time_values`, + pass a large `before` value (e.g., if time steps are days, + `before=365000`).") + } + before <- vctrs::vec_cast(before, integer()) + if (length(before) != 1L || is.na(before) || before < 0L) { + Abort("`before` must be length-1, non-NA, non-negative.") + } + + # If a custom time step is specified, then redefine units + + if (!missing(time_step)) before <- time_step(before) + + # Symbolize column name + new_col = sym(new_col_name) + + # Validate rest of parameters: + if (!rlang::is_bool(as_list_col)) { + Abort("`as_list_col` must be TRUE or FALSE.") + } + if (! (rlang::is_string(names_sep) || is.null(names_sep)) ) { + Abort("`names_sep` must be a (single) string or NULL.") + } + if (!rlang::is_bool(all_versions)) { + Abort("`all_versions` must be TRUE or FALSE.") + } + + # Computation for one group, one time value + comp_one_grp = function(.data_group, .group_key, + f, ..., + ref_time_value, + new_col) { + # Carry out the specified computation + comp_value = f(.data_group, .group_key, ref_time_value, ...) + + if (all_versions) { + # Extract data from archive so we can do length checks below. When + # `all_versions = TRUE`, `.data_group` will always be an ungrouped + # archive because of the preceding `as_of` step. + .data_group = .data_group$DT + } + + if (! (is.atomic(comp_value) || is.data.frame(comp_value))) { + Abort("The slide computation must return an atomic vector or a data frame.") + } + # Wrap the computation output in a list and unchop/unnest later if + # `as_list_col = FALSE`. This approach means that we will get a + # list-class col rather than a data.frame-class col when + # `as_list_col = TRUE` and the computation outputs are data + # frames. + comp_value <- list(comp_value) + + # Label every result row with the `ref_time_value`: + return(tibble::tibble(time_value = .env$ref_time_value, + !!new_col := .env$comp_value)) + } + + # If f is not missing, then just go ahead, slide by group + if (!missing(f)) { + if (rlang::is_formula(f)) f = as_slide_computation(f) + x = purrr::map_dfr(ref_time_values, function(ref_time_value) { + # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`, + # `epi_archive` if `all_versions` is `TRUE`: + as_of_raw = private$ungrouped$as_of(ref_time_value, min_time_value = ref_time_value - before, all_versions = all_versions) + + # Set: + # * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will + # `group_modify` as the `.data` argument. Might or might not + # include version column. + # * `group_modify_fn`, the corresponding `.f` argument + if (!all_versions) { + as_of_df = as_of_raw + group_modify_fn = comp_one_grp + } else { + as_of_archive = as_of_raw + # We essentially want to `group_modify` the archive, but + # haven't implemented this method yet. Next best would be + # `group_modify` on its `$DT`, but that has different + # behavior based on whether or not `dtplyr` is loaded. + # Instead, go through an ordinary data frame, trying to avoid + # copies.
+ if (address(as_of_archive$DT) == address(private$ungrouped$DT)) { + # `as_of` aliased the full `$DT`; copy before mutating: + as_of_archive$DT <- copy(as_of_archive$DT) + } + dt_key = data.table::key(as_of_archive$DT) + as_of_df = as_of_archive$DT + data.table::setDF(as_of_df) + + # Convert each subgroup chunk to an archive before running the calculation. + group_modify_fn = function(.data_group, .group_key, + f, ..., + ref_time_value, + new_col) { + # .data_group is coming from as_of_df as a tibble, but we + # want to feed `comp_one_grp` an `epi_archive` backed by a + # DT; convert and wrap: + data.table::setattr(.data_group, "sorted", dt_key) + data.table::setDT(.data_group, key=dt_key) + .data_group_archive = as_of_archive$clone() + .data_group_archive$DT = .data_group + comp_one_grp(.data_group_archive, .group_key, f = f, ..., + ref_time_value = ref_time_value, + new_col = new_col + ) + } + } + + return( + dplyr::group_by(as_of_df, dplyr::across(tidyselect::all_of(private$vars)), + .drop=private$drop) %>% + dplyr::group_modify(group_modify_fn, + f = f, ..., + ref_time_value = ref_time_value, + new_col = new_col, + .keep = TRUE) + ) + }) + } + + # Else interpret ... as an expression for tidy evaluation + else { + quos = enquos(...) + if (length(quos) == 0) { + Abort("If `f` is missing then a computation must be specified via `...`.") + } + if (length(quos) > 1) { + Abort("If `f` is missing then only a single computation can be specified via `...`.") + } + + quo = quos[[1]] + f = function(.x, .group_key, .ref_time_value, quo, ...) { + # Convert to environment to standardize between tibble and R6 + # based inputs. In both cases, we should get a simple + # environment with the empty environment as its parent. + data_env = rlang::as_environment(.x) + data_mask = rlang::new_data_mask(bottom = data_env, top = data_env) + data_mask$.data <- rlang::as_data_pronoun(data_mask) + # We'll also install `.x` directly, not as an + # `rlang_data_pronoun`, so that we can, e.g., use more dplyr and + # epiprocess operations. + data_mask$.x = .x + data_mask$.group_key = .group_key + data_mask$.ref_time_value = .ref_time_value + rlang::eval_tidy(quo, data_mask) + } + new_col = sym(names(rlang::quos_auto_name(quos))) + + x = purrr::map_dfr(ref_time_values, function(ref_time_value) { + # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`, + # `epi_archive` if `all_versions` is `TRUE`: + as_of_raw = private$ungrouped$as_of(ref_time_value, min_time_value = ref_time_value - before, all_versions = all_versions) + + # Set: + # * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will + # `group_modify` as the `.data` argument. Might or might not + # include version column. + # * `group_modify_fn`, the corresponding `.f` argument + if (!all_versions) { + as_of_df = as_of_raw + group_modify_fn = comp_one_grp + } else { + as_of_archive = as_of_raw + # We essentially want to `group_modify` the archive, but don't + # provide an implementation yet. Next best would be + # `group_modify` on its `$DT`, but that has different behavior + # based on whether or not `dtplyr` is loaded. Instead, go + # through an ordinary data frame, trying to avoid copies.
+ if (address(as_of_archive$DT) == address(private$ungrouped$DT)) { + # `as_of` aliased the full `$DT`; copy before mutating: + as_of_archive$DT <- copy(as_of_archive$DT) + } + dt_key = data.table::key(as_of_archive$DT) + as_of_df = as_of_archive$DT + data.table::setDF(as_of_df) + + # Convert each subgroup chunk to an archive before running the calculation. + group_modify_fn = function(.data_group, .group_key, + f, ..., + ref_time_value, + new_col) { + # .data_group is coming from as_of_df as a tibble, but we + # want to feed `comp_one_grp` an `epi_archive` backed by a + # DT; convert and wrap: + data.table::setattr(.data_group, "sorted", dt_key) + data.table::setDT(.data_group, key=dt_key) + .data_group_archive = as_of_archive$clone() + .data_group_archive$DT = .data_group + comp_one_grp(.data_group_archive, .group_key, f = f, quo = quo, + ref_time_value = ref_time_value, + new_col = new_col + ) + } + } + + return( + dplyr::group_by(as_of_df, dplyr::across(tidyselect::all_of(private$vars)), + .drop=private$drop) %>% + dplyr::group_modify(group_modify_fn, + f = f, quo = quo, + ref_time_value = ref_time_value, + new_col = new_col, + .keep = TRUE) + ) + }) + } + + # Unchop/unnest if we need to + if (!as_list_col) { + x = tidyr::unnest(x, !!new_col, names_sep = names_sep) + } + + # if (is_epi_df(x)) { + # # The analogue of `epi_df`'s `as_of` metadata for an archive is + # # `$versions_end`, at least in the current absence of + # # separate fields/columns denoting the "archive version" with a + # # different resolution, or from the perspective of a different + # # stage of a data pipeline. The `as_of` that is automatically + # # derived won't always match; override: + # attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end + # } + + # XXX We need to work out when we want to return an `epi_df` and how + # to get appropriate keys (see #290, #223, #163). We'll probably + # need the commented-out code above if we ever output an `epi_df`. + # However, as a stopgap measure to have some more consistency across + # different ways of calling `epix_slide`, and to prevent `epi_df` + # output with invalid metadata, always output a (grouped or + # ungrouped) tibble. + x <- decay_epi_df(x) + + return(x) + } + ) + ) + +# At time of writing, roxygen parses content in collation order, impacting the +# presentation of .Rd files that document multiple functions (see +# https://github.com/r-lib/roxygen2/pull/324). Use @include tags (determining +# `Collate:`) and ordering of functions within each file in order to get the +# desired ordering. + +#' @include methods-epi_archive.R +#' @rdname group_by.epi_archive +#' +#' @importFrom dplyr group_by +#' @export +group_by.grouped_epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_drop_default(.data)) { + .data$group_by(..., .add=.add, .drop=.drop) +} + +#' @include methods-epi_archive.R +#' @rdname group_by.epi_archive +#' +#' @importFrom dplyr groups +#' @export +groups.grouped_epi_archive = function(x) { + x$groups() +} + +#' @include methods-epi_archive.R +#' @rdname group_by.epi_archive +#' +#' @importFrom dplyr ungroup +#' @export +ungroup.grouped_epi_archive = function(x, ...) { + x$ungroup(...)
+} + +#' @include methods-epi_archive.R +#' @rdname group_by.epi_archive +#' +#' @export +is_grouped_epi_archive = function(x) { + inherits(x, "grouped_epi_archive") +} + +#' @include methods-epi_archive.R +#' @rdname group_by.epi_archive +#' +#' @export +group_by_drop_default.grouped_epi_archive = function(.tbl) { + .tbl$group_by_drop_default() +} + +#' @export +epix_truncate_versions_after.grouped_epi_archive = function(x, max_version) { + return ((x$clone()$truncate_versions_after(max_version))) + # ^ second set of parens drops invisibility +} diff --git a/R/growth_rate.R b/R/growth_rate.R index d3ca9e311..17c4ec74a 100644 --- a/R/growth_rate.R +++ b/R/growth_rate.R @@ -73,7 +73,7 @@ #' implicitly defined by the `x` variable; for example, if `x` is a vector of #' `Date` objects, `h = 7`, and the reference point is January 7, then the #' sliding window contains all data in between January 1 and 14 (matching the -#' behavior of `epi_slide()` with `n = 2 * h` and `align = "center"`). +#' behavior of `epi_slide()` with `before = h - 1` and `after = h`). #' #' @section Additional Arguments: #' For the global methods, "smooth_spline" and "trend_filter", additional diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index e3f874be2..c110555c6 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -14,6 +14,12 @@ #' @param min_time_value Time value specifying the min time value to permit in #' the snapshot. Default is `-Inf`, which effectively means that there is no #' minimum considered. +#' @param all_versions If `all_versions = TRUE`, then the output will be in +#' `epi_archive` format, and contain rows in the specified `time_value` range +#' having `version <= max_version`. The resulting object will cover a +#' potentially narrower `version` and `time_value` range than `x`, depending +#' on user-provided arguments. Otherwise, there will be one row in the output +#' for the `max_version` of each `time_value`. Default is `FALSE`. #' @return An `epi_df` object. #' #' @details This is simply a wrapper around the `as_of()` method of the @@ -54,9 +60,9 @@ #' }, epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")) #' # Since R 4.0, there is a `globalCallingHandlers` function that can be used #' # to globally toggle these warnings. -epix_as_of = function(x, max_version, min_time_value = -Inf) { +epix_as_of = function(x, max_version, min_time_value = -Inf, all_versions = FALSE) { if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.") - return(x$as_of(max_version, min_time_value)) + return(x$as_of(max_version, min_time_value, all_versions = all_versions)) } #' `epi_archive` with unobserved history filled in (won't mutate, might alias) @@ -354,7 +360,295 @@ epix_merge = function(x, y, )) } -#' Slide a function over variables in an `epi_archive` object +# Helpers for `group_by`: + +#' Make non-testing mock to get [`dplyr::dplyr_col_modify`] input +#' +#' A workaround for `dplyr:::mutate_cols` not being exported and directly +#' applying test mock libraries likely being impossible (due to mocking another +#' package's S3 generic or method). +#' +#' Use solely with a single call to the [`dplyr::mutate`] function and then +#' `destructure_col_modify_recorder_df`; other applicable operations from +#' [dplyr::dplyr_extending] have not been implemented. 
+#'
+#' @param parent_df the "parent class" data frame to wrap
+#' @return a `col_modify_recorder_df`
+#'
+#' @noRd
+new_col_modify_recorder_df = function(parent_df) {
+  if (!inherits(parent_df, "data.frame")) {
+    Abort('`parent_df` must inherit class `"data.frame"`',
+          internal=TRUE)
+  }
+  `class<-`(parent_df, c("col_modify_recorder_df", class(parent_df)))
+}
+
+#' Extract unchanged parent-class data frame from a `new_col_modify_recorder_df`
+#'
+#' @param col_modify_recorder_df an instance of a `col_modify_recorder_df`
+#' @return named list with elements `unchanged_parent_df`, `cols`; `cols` is the
+#'   input to [`dplyr::dplyr_col_modify`] that this class was designed to record
+#'
+#' @noRd
+destructure_col_modify_recorder_df = function(col_modify_recorder_df) {
+  if (!inherits(col_modify_recorder_df, "col_modify_recorder_df")) {
+    Abort('`col_modify_recorder_df` must inherit class `"col_modify_recorder_df"`',
+          internal=TRUE)
+  }
+  list(
+    unchanged_parent_df = col_modify_recorder_df %>%
+      `attr<-`("epiprocess::col_modify_recorder_df::cols", NULL) %>%
+      `class<-`(setdiff(class(.), "col_modify_recorder_df")),
+    cols = attr(col_modify_recorder_df,
+                "epiprocess::col_modify_recorder_df::cols", exact=TRUE)
+  )
+}
+
+#' `dplyr_col_modify` method that simply records the `cols` argument
+#'
+#' Must export S3 methods in R >= 4.0, even if they're only designed to be
+#' package internals, and must import any corresponding upstream S3 generic
+#' functions:
+#' @importFrom dplyr dplyr_col_modify
+#' @export
+#' @noRd
+dplyr_col_modify.col_modify_recorder_df = function(data, cols) {
+  if (!is.null(attr(data, "epiprocess::col_modify_recorder_df::cols", exact=TRUE))) {
+    Abort("`col_modify_recorder_df` can only record `cols` once",
+          internal=TRUE)
+  }
+  attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols
+  data
+}
+
+#' A more detailed but restricted `mutate` for use in `group_by.epi_archive`
+#'
+#' More detailed: provides the names of the "requested" columns in addition to
+#' the output expected from a regular `mutate` method.
+#'
+#' Restricted: doesn't allow replacing or removing key cols, where a sort is
+#' potentially required at best and what the output key should be is unclear at
+#' worst. (The originally expected restriction was that the `mutate` parameters
+#' not present in `group_by` would not be recognized, but the current
+#' implementation just lets `mutate` handle these anyway, even if they're not
+#' part of the regular `group_by` parameters; these arguments would have to be
+#' passed by name with dot prefixes, so just hope that the user means to use
+#' them here if provided.)
+#'
+#' This can introduce column-level aliasing in `data.table`s, which isn't really
+#' intended in the `data.table` user model but we can make it part of our user
+#' model (see
+#' https://stackoverflow.com/questions/45925482/make-a-shallow-copy-in-data-table
+#' and links).
+#'
+#' Don't export this without cleaning up language of "mutate" as in side effects
+#' vs. "mutate" as in `dplyr::mutate`.
+#' @noRd
+epix_detailed_restricted_mutate = function(.data, ...)
{ + # We don't want to directly use `dplyr::mutate` on the `$DT`, as: + # - this likely copies the entire table + # - `mutate` behavior, including the output class, changes depending on + # whether `dtplyr` is loaded and would require post-processing + # - behavior with `dtplyr` isn't fully compatible + # - it doesn't give the desired details, and `rlang::exprs_auto_name` does not + # appropriately handle the `= NULL` and `= ` tidyeval cases + # Instead: + # - Use `as.list` to get a shallow copy (undocumented, but apparently + # intended, behavior), then `as_tibble` (also shallow, given a list) to get + # back to something that will use `dplyr`'s included `mutate` method(s), + # then convert this using shallow operations into a `data.table`. + # - Use `col_modify_recorder_df` to get the desired details. + in_tbl = tibble::as_tibble(as.list(.data$DT), .name_repair="minimal") + col_modify_cols = + destructure_col_modify_recorder_df( + mutate(new_col_modify_recorder_df(in_tbl), ...) + )[["cols"]] + invalidated_key_col_is = + which(purrr::map_lgl(key(.data$DT), function(key_colname) { + key_colname %in% names(col_modify_cols) && + !rlang::is_reference(in_tbl[[key_colname]], col_modify_cols[[key_colname]]) + })) + if (length(invalidated_key_col_is) != 0L) { + rlang::abort(paste_lines(c( + "Key columns must not be replaced or removed.", + wrap_varnames(key(.data$DT)[invalidated_key_col_is], + initial="Flagged key cols: ") + ))) + } else { + # Have `dplyr` do the `dplyr_col_modify`, keeping the column-level-aliasing + # and must-copy-on-write-if-refcount-more-than-1 model, obtaining a tibble, + # then convert it into a `data.table`. The key should still be valid + # (assuming that the user did not explicitly alter `key(.data$DT)` or the + # columns by reference somehow within `...` tidyeval-style computations, or + # trigger refcount-1 alterations due to still having >1 refcounts on the + # columns), set the "sorted" attribute accordingly to prevent attempted + # sorting (including potential extra copies) or sortedness checking, then + # `setDT` (rather than `as.data.table`, in order to prevent column copying + # to establish ownership according to `data.table`'s memory model). + out_DT = dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>% + data.table::setattr("sorted", data.table::key(.data$DT)) %>% + data.table::setDT(key=key(.data$DT)) + out_archive = .data$clone() + out_archive$DT <- out_DT + request_names = names(col_modify_cols) + return(list( + archive = out_archive, + request_names = request_names + )) + # (We might also consider special-casing when `mutate` hands back something + # equivalent (in some sense) to the input (probably only encountered when + # we're dealing with `group_by`), and using just `$DT`, not a shallow copy, + # in the result, primarily in order to hedge against `as.list` or `setDT` + # changing their behavior and generating deep copies somehow. This could + # also prevent storage, and perhaps also generation, of shallow copies, but + # this seems unlikely to be a major gain unless it helps enable some + # in-place modifications of refcount-1 columns (although detecting this case + # seems to be common across `group_by` implementations; maybe there is + # something there).) + } +} + +#' `group_by` and related methods for `epi_archive`, `grouped_epi_archive` +#' +#' @param .data An `epi_archive` or `grouped_epi_archive` +#' @param ... 
Similar to [`dplyr::group_by`] (see "Details:" for edge cases);
+#' * For `group_by`: unquoted variable name(s) or other
+#'   ["data masking"][dplyr::dplyr_data_masking] expression(s). It's possible to
+#'   use [`dplyr::mutate`]-like syntax here to calculate new columns on which to
+#'   perform grouping, but note that, if you are regrouping an already-grouped
+#'   `.data` object, the calculations will be carried out ignoring such grouping
+#'   (same as [in dplyr][dplyr::group_by]).
+#' * For `ungroup`: either
+#'   * empty, in order to remove the grouping and output an `epi_archive`; or
+#'   * variable name(s) or other ["tidy-select"][dplyr::dplyr_tidy_select]
+#'     expression(s), in order to remove the matching variables from the list of
+#'     grouping variables, and output another `grouped_epi_archive`.
+#' @param .add Boolean. If `FALSE`, the default, the output will be grouped by
+#' the variable selection from `...` only; if `TRUE`, the output will be
+#' grouped by the current grouping variables plus the variable selection from
+#' `...`.
+#' @param .drop As described in [`dplyr::group_by`]; determines treatment of
+#' factor columns.
+#' @param x For `groups` or `ungroup`: a `grouped_epi_archive`; for
+#' `is_grouped_epi_archive`: any object
+#' @param .tbl (For `group_by_drop_default`:) an `epi_archive` or
+#' `grouped_epi_archive` (`epi_archive` dispatches to the S3 default method;
+#' `grouped_epi_archive` dispatches its own S3 method)
+#'
+#' @details
+#'
+#' To match `dplyr`, `group_by` allows "data masking" (also referred to as
+#' "tidy evaluation") expressions `...`, not just column names, in a way similar
+#' to `mutate`. Note that replacing or removing key columns with these
+#' expressions is disabled.
+#'
+#' `archive %>% group_by()` and other expressions that group or regroup by zero
+#' columns (indicating that all rows should be treated as part of one large
+#' group) will output a `grouped_epi_archive`, in order to enable the use of
+#' `grouped_epi_archive` methods on the result. This is in slight contrast to
+#' the same operations on tibbles and grouped tibbles, which will *not* output a
+#' `grouped_df` in these circumstances.
+#'
+#' Using `group_by` with `.add=FALSE` to override the existing grouping is
+#' disabled; instead, `ungroup` first, then `group_by`.
+#'
+#' Mutation and aliasing: `group_by` tries to use a shallow copy of the `DT`,
+#' introducing column-level aliasing between its input and its result. This
+#' doesn't follow the general model for most `data.table` operations, which
+#' seems to be that, given a nonaliased (i.e., unique) pointer to a
+#' `data.table` object, its pointers to its columns should also be nonaliased.
+#' If you mutate any of the columns of either the input or result, first ensure
+#' that it is fine if columns of the other are also mutated, but do not rely on
+#' such behavior to occur. Additionally, never perform mutation on the key
+#' columns at all (except for strictly increasing transformations), as this will
+#' invalidate sortedness assumptions about the rows.
+#'
+#' `group_by_drop_default` on (ungrouped) `epi_archive`s is expected to dispatch
+#' to `group_by_drop_default.default` (but there is a dedicated method for
+#' `grouped_epi_archive`s).
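+#'
+#' (For instance, per the above, a zero-variable grouping still yields a
+#' `grouped_epi_archive`; a minimal sketch:)
+#' ```
+#' trivially_grouped = archive_cases_dv_subset %>% group_by()
+#' is_grouped_epi_archive(trivially_grouped) # TRUE
+#' ```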
+#'
+#' @examples
+#'
+#' grouped_archive = archive_cases_dv_subset %>% group_by(geo_value)
+#'
+#' # `print` for metadata and method listing:
+#' grouped_archive %>% print()
+#'
+#' # The primary use for grouping is to perform a grouped `epix_slide`:
+#'
+#' archive_cases_dv_subset %>%
+#'   group_by(geo_value) %>%
+#'   epix_slide(f = ~ mean(.x$case_rate_7d_av),
+#'              before = 2,
+#'              ref_time_values = as.Date("2020-06-11") + 0:2,
+#'              new_col_name = 'case_rate_3d_av') %>%
+#'   ungroup()
+#'
+#' # -----------------------------------------------------------------
+#'
+#' # Advanced: some other features of dplyr grouping are implemented:
+#'
+#' library(dplyr)
+#' toy_archive =
+#'   tribble(
+#'     ~geo_value, ~age_group,  ~time_value,  ~version,     ~value,
+#'     "us",       "adult",     "2000-01-01", "2000-01-02", 121,
+#'     "us",       "pediatric", "2000-01-02", "2000-01-03", 5, # (addition)
+#'     "us",       "adult",     "2000-01-01", "2000-01-03", 125, # (revision)
+#'     "us",       "adult",     "2000-01-02", "2000-01-03", 130 # (addition)
+#'   ) %>%
+#'   mutate(age_group = ordered(age_group, c("pediatric", "adult")),
+#'          time_value = as.Date(time_value),
+#'          version = as.Date(version)) %>%
+#'   as_epi_archive(other_keys = "age_group")
+#'
+#' # The following are equivalent:
+#' toy_archive %>% group_by(geo_value, age_group)
+#' toy_archive %>% group_by(geo_value) %>% group_by(age_group, .add=TRUE)
+#' grouping_cols = c("geo_value", "age_group")
+#' toy_archive %>% group_by(across(all_of(grouping_cols)))
+#'
+#' # And these are equivalent:
+#' toy_archive %>% group_by(geo_value)
+#' toy_archive %>% group_by(geo_value, age_group) %>% ungroup(age_group)
+#'
+#' # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols):
+#' toy_archive %>% group_by(geo_value) %>% groups()
+#'
+#' toy_archive %>%
+#'   group_by(geo_value, age_group, .drop=FALSE) %>%
+#'   epix_slide(f = ~ sum(.x$value), before = 20) %>%
+#'   ungroup()
+#'
+#' @importFrom dplyr group_by
+#' @export
+#'
+#' @aliases grouped_epi_archive
+group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_drop_default(.data)) {
+  # `.add` makes no difference; this is an ungrouped `epi_archive`.
+  detailed_mutate = epix_detailed_restricted_mutate(.data, ...)
+  if (!rlang::is_bool(.drop)) {
+    Abort("`.drop` must be TRUE or FALSE")
+  }
+  if (!.drop) {
+    grouping_cols = as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]]
+    grouping_col_is_factor = purrr::map_lgl(grouping_cols, is.factor)
+    # ^ Use `as.list` to try to avoid any possibility of a deep copy.
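+    # (Illustration of the checks below: grouping by a factor `age_group`
+    # followed by a non-factor `geo_value` gives `grouping_col_is_factor ==
+    # c(TRUE, FALSE)`, so `diff(...)` contains -1L and the second warning
+    # fires; grouping in the opposite order does not trigger it.)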
+ if (!any(grouping_col_is_factor)) { + Warn("`.drop=FALSE` but there are no factor grouping columns; did you mean to convert one of the columns to a factor beforehand?", + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors") + } else if (any(diff(grouping_col_is_factor) == -1L)) { + Warn("`.drop=FALSE` but there are one or more non-factor grouping columns listed after a factor grouping column; this may produce groups with `NA`s for these columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; depending on how you want completion to work, you might instead want to convert all grouping columns to factors beforehand, specify the non-factor grouping columns first, or use `.drop=TRUE` and add a call to `tidyr::complete`.", + class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor") + } + } + grouped_epi_archive$new(detailed_mutate[["archive"]], + detailed_mutate[["request_names"]], + drop = .drop) +} + +#' Slide a function over variables in an `epi_archive` or `grouped_epi_archive` #' #' Slides a given function over variables in an `epi_archive` object. This #' behaves similarly to `epi_slide()`, with the key exception that it is @@ -364,30 +658,52 @@ epix_merge = function(x, y, #' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for #' examples. #' -#' @param x An `epi_archive` object. -#' @param f Function or formula to slide over variables in `x`. To "slide" means -#' to apply a function or formula over a running window of `n` time steps -#' (where one time step is typically one day or one week). If a function, `f` -#' must take `x`, a data frame with the same column names as the original -#' object; followed by any number of named arguments; and ending with -#' `...`. If a formula, `f` can operate directly on columns accessed via -#' `.x$var`, as in `~ mean(.x$var)` to compute a mean of a column `var` over a -#' sliding window of `n` time steps. +#' @param x An [`epi_archive`] or [`grouped_epi_archive`] object. If ungrouped, +#' all data in `x` will be treated as part of a single data group. +#' @param f Function, formula, or missing; together with `...` specifies the +#' computation to slide. To "slide" means to apply a computation over a +#' sliding (a.k.a. "rolling") time window for each data group. The window is +#' determined by the `before` parameter described below. One time step is +#' typically one day or one week; see [`epi_slide`] details for more +#' explanation. If a function, `f` must take an `epi_df` with the same +#' column names as the archive's `DT`, minus the `version` column; followed +#' by a one-row tibble containing the values of the grouping variables for +#' the associated group; followed by a reference time value, usually as a +#' `Date` object; followed by any number of named arguments. If a formula, +#' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as +#' in `~ mean (.x$var)` to compute a mean of a column `var` for each +#' group-`ref_time_value` combination. The group key can be accessed via +#' `.y` or `.group_key`, and the reference time value can be accessed via +#' `.z` or `.ref_time_value`. If `f` is missing, then `...` will specify the +#' computation. #' @param ... Additional arguments to pass to the function or formula specified -#' via `f`. Alternatively, if `f` is missing, then the current argument is -#' interpreted as an expression for tidy evaluation. -#' @param n Number of time steps to use in the running window. 
For example, if
-#' `n = 7`, and one time step is one day, then to produce a value on January 7
-#' we apply the given function or formula to data in between January 1 and
-#' 7.
-#' @param group_by The variable(s) to group by before slide computation. If
-#' missing, then the keys in the underlying data table, excluding `time_value`
-#' and `version`, will be used for grouping. To omit a grouping entirely, use
-#' `group_by = NULL`.
-#' @param ref_time_values Time values for sliding computations, meaning, each
-#' element of this vector serves as the reference time point for one sliding
-#' window. If missing, then this will be set to all unique time values in the
-#' underlying data table, by default.
+#' via `f`. Alternatively, if `f` is missing, then `...` is interpreted as an
+#' expression for tidy evaluation; in addition to referring to columns
+#' directly by name, the expression has access to `.data` and `.env` pronouns
+#' as in `dplyr` verbs, and can also refer to the `.group_key` and
+#' `.ref_time_value`. See details of [`epi_slide`].
+#' @param before How far `before` each `ref_time_value` should the sliding
+#' window extend? If provided, should be a single, non-NA,
+#' [integer-compatible][vctrs::vec_cast] number of time steps. This window
+#' endpoint is inclusive. For example, if `before = 7`, and one time step is
+#' one day, then to produce a value for a `ref_time_value` of January 8, we
+#' apply the given function or formula to data (for each group present) with
+#' `time_value`s from January 1 onward, as they were reported on January 8.
+#' For typical disease surveillance sources, this will not include any data
+#' with a `time_value` of January 8, and, depending on the amount of reporting
+#' latency, may not include January 7 or even earlier `time_value`s. (If the
+#' archive were to hold nowcasts instead of regular surveillance data, then we
+#' would indeed expect data for `time_value` January 8. If it
+#' were to hold forecasts, then we would expect data for `time_value`s after
+#' January 8, and the sliding window would extend as far after each
+#' `ref_time_value` as needed to include all such `time_value`s.)
+#' @param ref_time_values Reference time values / versions for sliding
+#' computations; each element of this vector serves both as the anchor point
+#' for the `time_value` window for the computation and the `max_version`
+#' `as_of` which we fetch data in this window. If missing, then this will be
+#' set to a regularly-spaced sequence of values covering the range of
+#' `version`s in the `DT` plus the `versions_end`; the spacing of values will
+#' be guessed (using the GCD of the skips between values).
 #' @param time_step Optional function used to define the meaning of one time
 #' step, which if specified, overrides the default choice based on the
 #' `time_value` column. This function must take a positive integer and return
@@ -397,38 +713,71 @@ epix_merge = function(x, y,
 #' @param new_col_name String indicating the name of the new column that will
 #' contain the derivative values. Default is "slide_value"; note that setting
 #' `new_col_name` equal to an existing column name will overwrite this column.
-#' @param as_list_col If the computations return data frames, should the slide
-#' result hold these in a single list column or try to unnest them? Default is
-#' `FALSE`, in which case a list object returned by `f` would be unnested
-#' (using [`tidyr::unnest()`]), and the names of the resulting columns are given
-#' by prepending `new_col_name` to the names of the list elements.
+#' @param as_list_col Should the slide results be held in a list column, or be
+#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`,
+#' in which case a list object returned by `f` would be unnested (using
+#' [`tidyr::unnest()`]), and, if the slide computations output data frames,
+#' the names of the resulting columns are given by prepending `new_col_name`
+#' to the names of the list elements.
 #' @param names_sep String specifying the separator to use in `tidyr::unnest()`
 #' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix
 #' from `new_col_name` entirely.
-#' @param all_rows If `all_rows = TRUE`, then the output will have one row per
-#' combination of grouping variables and unique time values in the underlying
-#' data table. Otherwise, there will be one row in the output for each time
-#' value in `x` that acts as a reference time value. Default is `FALSE`.
+#' @param all_versions (Not the same as the `all_rows` parameter of
+#' `epi_slide`.) If `all_versions = TRUE`, then `f` will be passed the version
+#' history (all `version <= ref_time_value`) for rows having `time_value`
+#' between `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will
+#' be passed only the most recent `version` for every unique `time_value`.
+#' Default is `FALSE`.
 #' @return A tibble whose columns are: the grouping variables, `time_value`,
 #' containing the reference time values for the slide computation, and a
 #' column named according to the `new_col_name` argument, containing the slide
 #' values.
 #'
-#' @details Two key distinctions between inputs to the current function and
-#' `epi_slide()`:
-#' 1. `epix_slide()` uses windows that are **always right-aligned** (in
-#' `epi_slide()`, custom alignments could be specified using the `align` or
-#' `before` arguments).
-#' 2. `epix_slide()` uses a `group_by` to specify the grouping upfront (in
-#' `epi_slide()`, this would be accomplished by a preceding function call to
-#' `dplyr::group_by()`).
-#' Apart from this, the interfaces between `epix_slide()` and `epi_slide()` are
-#' the same.
-#'
-#' Note that the outputs are a similar but different: `epix_slide()` only
-#' returns the grouping variables, `time_value`, and the new columns from
-#' sliding, whereas `epi_slide()` returns all original variables plus the new
-#' columns from sliding.
+#' @details A few key distinctions between the current function and `epi_slide()`:
+#' 1. In `f` functions for `epix_slide`, one should not assume that the input
+#' data contains any rows with `time_value` matching the computation's
+#' `ref_time_value` (accessible via `attributes()$metadata$as_of`); for
+#' typical epidemiological surveillance data, observations pertaining to a
+#' particular time period (`time_value`) are first reported `as_of` some
+#' instant after that time period has ended. (A sketch appears after this
+#' section.)
+#' 2. `epix_slide()` doesn't accept an `after` argument; its windows extend
+#' from `before` time steps before a given `ref_time_value` through the last
+#' `time_value` available as of version `ref_time_value` (typically, this
+#' won't include `ref_time_value` itself, as observations about a particular
+#' time interval (e.g., day) are only published after that time interval
+#' ends); `epi_slide` windows extend from `before` time steps before a
+#' `ref_time_value` through `after` time steps after `ref_time_value`.
+#' 3. The input class and columns are similar but different: `epix_slide`
+#' (with the default `all_versions=FALSE`) keeps all columns and the
+#' `epi_df`-ness of the first argument to each computation; `epi_slide` only
+#' provides the grouping variables in the second input, and will convert the
+#' first input into a regular tibble if the grouping variables include the
+#' essential `geo_value` column. (With `all_versions=TRUE`, `epix_slide` will
+#' provide an `epi_archive` rather than an `epi_df` to each
+#' computation.)
+#' 4. The output class and columns are similar but different: `epix_slide()`
+#' returns a tibble containing only the grouping variables, `time_value`, and
+#' the new column(s) from the slide computations, whereas `epi_slide()`
+#' returns an `epi_df` with all original variables plus the new columns from
+#' the slide computations. (Both will mirror the grouping or ungroupedness of
+#' their input, with one exception: `epi_archive`s can have trivial
+#' (zero-variable) groupings, but these will be dropped in `epix_slide`
+#' results as they are not supported by tibbles.)
+#' 5. There are no size stability checks or element/row recycling to maintain
+#' size stability in `epix_slide`, unlike in `epi_slide`. (`epix_slide` is
+#' roughly analogous to [`dplyr::group_modify`], while `epi_slide` is roughly
+#' analogous to `dplyr::mutate` followed by `dplyr::arrange`.) This is detailed
+#' in the "advanced" vignette.
+#' 6. `all_rows` is not supported in `epix_slide`; since the slide
+#' computations are allowed more flexibility in their outputs than in
+#' `epi_slide`, we can't guess a good representation for missing computations
+#' for excluded group-`ref_time_value` pairs.
+#' 7. The `ref_time_values` default for `epix_slide` is based on making an
+#' evenly-spaced sequence out of the `version`s in the `DT` plus the
+#' `versions_end`, rather than the `time_value`s.
+#'
+#' Apart from the above distinctions, the interfaces between `epix_slide()` and
+#' `epi_slide()` are the same.
 #'
 #' Furthermore, the current function can be considerably slower than
 #' `epi_slide()`, for two reasons: (1) it must repeatedly fetch
@@ -439,46 +788,150 @@ epix_merge = function(x, y,
 #' version-aware sliding is necessary (as it its purpose).
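+#'
+#' (The sketch referenced in point 1 above: write computations so that they
+#' tolerate windows with few or even zero rows. A hypothetical archive with a
+#' `value` column is assumed here:)
+#' ```
+#' archive %>% group_by(geo_value) %>%
+#'   epix_slide(~ mean(.x$value, na.rm = TRUE), before = 6)
+#' # the mean of a zero-row window is NaN rather than an error
+#' ```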
#' #' Finally, this is simply a wrapper around the `slide()` method of the -#' `epi_archive` class, so if `x` is an `epi_archive` object, then: +#' `epi_archive` and `grouped_epi_archive` classes, so if `x` is an +#' object of either of these classes, then: #' ``` -#' epix_slide(x, new_var = comp(old_var), n = 120) +#' epix_slide(x, new_var = comp(old_var), before = 119) #' ``` #' is equivalent to: #' ``` -#' x$slide(x, new_var = comp(old_var), n = 120) +#' x$slide(new_var = comp(old_var), before = 119) #' ``` #' -#' @importFrom rlang enquo -#' @export #' @examples -#' # these dates are reference time points for the 3 day average sliding window -#' # The resulting epi_archive ends up including data averaged from: -#' # 0 day which has no results, for 2020-06-01 -#' # 1 day, for 2020-06-02 -#' # 2 days, for the rest of the results -#' # never 3 days dur to data latency -#' -#' time_values <- seq(as.Date("2020-06-01"), -#' as.Date("2020-06-15"), -#' by = "1 day") -#' epix_slide(x = archive_cases_dv_subset, -#' f = ~ mean(.x$case_rate_7d_av), -#' n = 3, -#' group_by = geo_value, -#' ref_time_values = time_values, -#' new_col_name = 'case_rate_3d_av') -epix_slide = function(x, f, ..., n, group_by, ref_time_values, +#' library(dplyr) +#' +#' # Reference time points for which we want to compute slide values: +#' ref_time_values <- seq(as.Date("2020-06-01"), +#' as.Date("2020-06-15"), +#' by = "1 day") +#' +#' # A simple (but not very useful) example (see the archive vignette for a more +#' # realistic one): +#' archive_cases_dv_subset %>% +#' group_by(geo_value) %>% +#' epix_slide(f = ~ mean(.x$case_rate_7d_av), +#' before = 2, +#' ref_time_values = ref_time_values, +#' new_col_name = 'case_rate_7d_av_recent_av') %>% +#' ungroup() +#' # We requested time windows that started 2 days before the corresponding time +#' # values. The actual number of `time_value`s in each computation depends on +#' # the reporting latency of the signal and `time_value` range covered by the +#' # archive (2020-06-01 -- 2021-11-30 in this example). In this case, we have +#' # * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically +#' # discarded +#' # * 1 `time_value`, for ref time 2020-06-02 +#' # * 2 `time_value`s, for the rest of the results +#' # * never the 3 `time_value`s we would get from `epi_slide`, since, because +#' # of data latency, we'll never have an observation +#' # `time_value == ref_time_value` as of `ref_time_value`. +#' # The example below shows this type of behavior in more detail. +#' +#' # Examining characteristics of the data passed to each computation with +#' # `all_versions=FALSE`. +#' archive_cases_dv_subset %>% +#' group_by(geo_value) %>% +#' epix_slide( +#' function(x, gk, rtv) { +#' tibble( +#' time_range = if(nrow(x) == 0L) { +#' "0 `time_value`s" +#' } else { +#' sprintf("%s -- %s", min(x$time_value), max(x$time_value)) +#' }, +#' n = nrow(x), +#' class1 = class(x)[[1L]] +#' ) +#' }, +#' before = 5, all_versions = FALSE, +#' ref_time_values = ref_time_values, names_sep=NULL) %>% +#' ungroup() %>% +#' arrange(geo_value, time_value) +#' +#' # --- Advanced: --- +#' +#' # `epix_slide` with `all_versions=FALSE` (the default) applies a +#' # version-unaware computation to several versions of the data. We can also +#' # use `all_versions=TRUE` to apply a version-*aware* computation to several +#' # versions of the data, again looking at characteristics of the data passed +#' # to each computation. 
In this case, each computation should expect an +#' # `epi_archive` containing the relevant version data: +#' +#' archive_cases_dv_subset %>% +#' group_by(geo_value) %>% +#' epix_slide( +#' function(x, gk, rtv) { +#' tibble( +#' versions_start = if (nrow(x$DT) == 0L) { +#' "NA (0 rows)" +#' } else { +#' toString(min(x$DT$version)) +#' }, +#' versions_end = x$versions_end, +#' time_range = if(nrow(x$DT) == 0L) { +#' "0 `time_value`s" +#' } else { +#' sprintf("%s -- %s", min(x$DT$time_value), max(x$DT$time_value)) +#' }, +#' n = nrow(x$DT), +#' class1 = class(x)[[1L]] +#' ) +#' }, +#' before = 5, all_versions = TRUE, +#' ref_time_values = ref_time_values, names_sep=NULL) %>% +#' ungroup() %>% +#' # Focus on one geo_value so we can better see the columns above: +#' filter(geo_value == "ca") %>% +#' select(-geo_value) +#' +#' @importFrom rlang enquo !!! +#' @export +epix_slide = function(x, f, ..., before, ref_time_values, time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", all_rows = FALSE) { - if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.") - return(x$slide(f, ..., n = n, - group_by = {{group_by}}, + as_list_col = FALSE, names_sep = "_", + all_versions = FALSE) { + if (!is_epi_archive(x, grouped_okay=TRUE)) { + Abort("`x` must be of class `epi_archive` or `grouped_epi_archive`.") + } + return(x$slide(f, ..., before = before, ref_time_values = ref_time_values, time_step = time_step, new_col_name = new_col_name, as_list_col = as_list_col, names_sep = names_sep, - all_rows = all_rows)) + all_versions = all_versions + )) } +#' Default value for `ref_time_values` in an `epix_slide` +#' +#' @noRd +epix_slide_ref_time_values_default = function(ea) { + versions_with_updates = c(ea$DT$version, ea$versions_end) + ref_time_values = tidyr::full_seq(versions_with_updates, guess_period(versions_with_updates)) + return (ref_time_values) +} +#' Filter an `epi_archive` object to keep only older versions +#' +#' Generates a filtered `epi_archive` from an `epi_archive` object, keeping +#' only rows with `version` falling on or before a specified date. +#' +#' @param x An `epi_archive` object +#' @param max_version Time value specifying the max version to permit in the +#' filtered archive. That is, the output archive will comprise rows of the +#' current archive data having `version` less than or equal to the +#' specified `max_version` +#' @return An `epi_archive` object +#' +#' @export +epix_truncate_versions_after = function(x, max_version) { + UseMethod("epix_truncate_versions_after") +} + +#' @export +epix_truncate_versions_after.epi_archive = function(x, max_version) { + return ((x$clone()$truncate_versions_after(max_version))) + # ^ second set of parens drops invisibility +} diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 03d03080f..6429b867a 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -1,3 +1,20 @@ +#' Convert to tibble +#' +#' Converts an `epi_df` object into a tibble, dropping metadata and any +#' grouping. +#' +#' @param x an `epi_df` +#' @param ... arguments to forward to `NextMethod()` +#' +#' @importFrom tibble as_tibble +#' @export +as_tibble.epi_df = function(x, ...) { + # Decaying drops the class and metadata. `as_tibble.grouped_df` drops the + # grouping and should be called by `NextMethod()` in the current design. + # See #223 for discussion of alternatives. 
+  decay_epi_df(NextMethod())
+}
+
 #' Convert to tsibble format
 #'
 #' Converts an `epi_df` object into a tsibble, where the index is taken to be
@@ -12,13 +29,14 @@
 #' @export
 as_tsibble.epi_df = function(x, key, ...) {
   if (missing(key)) key = c("geo_value", attributes(x)$metadata$other_keys)
-  return(as_tsibble(tibble::as_tibble(x), key, index = "time_value", ...))
+  return(as_tsibble(tibble::as_tibble(x),
+                    key = tidyselect::all_of(key), index = "time_value",
+                    ...))
 }
 
 #' Base S3 methods for an `epi_df` object
 #'
-#' Print, summary, and `dplyr` verbs (that preserve class and attributes) for an
-#' `epi_df` object.
+#' Print and summary functions for an `epi_df` object.
 #'
 #' @param x The `epi_df` object.
 #' @param ... Additional arguments passed to methods.
@@ -63,119 +81,115 @@ summary.epi_df = function(object, ...) {
             dplyr::summarize(mean(.data$num)))))
 }
 
+#' Drop any `epi_df` metadata and class on a data frame
+#'
+#' Useful in implementing `?dplyr_extending` when manipulations cause invariants
+#' of `epi_df`s to be violated and we need to return some other class. Note that
+#' this will maintain any grouping (keeping the `grouped_df` class and
+#' associated attributes, if present).
+#'
+#' @param x an `epi_df` or other data frame
+#' @return `x` with any metadata dropped and the `"epi_df"` class, if previously
+#' present, dropped
+#'
+#' @noRd
+decay_epi_df = function(x) {
+  attributes(x)$metadata <- NULL
+  class(x) <- class(x)[class(x) != "epi_df"]
+  x
+}
+
+# Implementing `dplyr_extending`: we have a few metadata attributes to consider:
+# `as_of` is an attribute that doesn't depend on the rows or columns, `geo_type`
+# and `time_type` are scalar attributes dependent on columns, and `other_keys`
+# acts like an attribute vectorized over columns; `dplyr_extending` advice at
+# time of writing says to implement `dplyr_reconstruct`, 1d `[`,
+# `dplyr_col_modify`, and `names<-`, but not `dplyr_row_slice`; however, we'll
+# also implement `dplyr_row_slice` anyway to prevent an `arrange` on grouped
+# `epi_df`s from dropping the `epi_df` class. We'll implement `[` to allow
+# either 1d or 2d. We'll also implement some other methods where we want to
+# (try to) maintain an `epi_df`.
+
+#' @param data tibble or `epi_df` (`dplyr` feeds in former, but we may
+#' directly feed in latter from our other methods)
+#' @param template `epi_df` template to use to restore
+#' @return `epi_df` or degrade into `tbl_df`
+#' @importFrom dplyr dplyr_reconstruct
 #' @export
-`[.epi_df` <- function(x, i, j, drop = FALSE) {
-  res <- NextMethod()
-
-  if (!is.data.frame(res)) return(res)
-
-  if (missing(i)) {
-    i <- NULL
-  }
-
-  if (missing(j)) {
-    j <- NULL
-  }
+#' @noRd
+dplyr_reconstruct.epi_df = function(data, template) {
+  # Start from a reconstruction for the backing S3 classes; this ensures that we
+  # keep any grouping that has been applied:
+  res <- NextMethod()
 
   cn <- names(res)
-
+
   # Duplicate columns, Abort
   dup_col_names = cn[duplicated(cn)]
   if (length(dup_col_names) != 0) {
    Abort(paste0("Column name(s) ",
                  paste(unique(dup_col_names),
                        collapse = ", "), " must not be duplicated."))
-  }
+  }
 
   not_epi_df <- !("time_value" %in% cn) || !("geo_value" %in% cn)
 
   if (not_epi_df) {
-    attributes(res)$metadata <- NULL
-    return(tibble::as_tibble(res))
+    # If we're calling on an `epi_df` from one of our own functions, we need to
+    # decay to a non-`epi_df` result. If `dplyr` is calling, `x` is a tibble,
+    # `res` is not an `epi_df` yet (but might, e.g., be a `grouped_df`), and we
+    # simply need to skip adding the metadata & class. Current `decay_epi_df`
+    # should work in both cases.
+    return(decay_epi_df(res))
   }
 
-  # Use reclass as safeguard (in case class &/or metadata are dropped)
-  res <- reclass(res, attr(x, "metadata"))
+  res <- reclass(res, attr(template, "metadata"))
+
+  # XXX we may want to verify the `geo_type` and `time_type` here. If it's
+  # significant overhead, we may also want to keep this less strict version
+  # around and implement some extra S3 methods that use it, when appropriate.
 
   # Amend additional metadata if some other_keys cols are dropped in the subset
-  old_other_keys = attr(x, "metadata")$other_keys
+  old_other_keys = attr(template, "metadata")$other_keys
   attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% cn]
 
   res
 }
 
-#' `dplyr` verbs
-#'
-#' `dplyr` verbs for `epi_df` objects, preserving class and attributes.
-#'
-#' @method arrange epi_df
-#' @param .data The `epi_df` object.
-#' @rdname print.epi_df
-#' @export
-arrange.epi_df = function(.data, ...) {
-  metadata = attributes(.data)$metadata
-  .data = NextMethod()
-  reclass(.data, metadata)
-}
-
-#' @method filter epi_df
-#' @rdname print.epi_df
-#' @export
-filter.epi_df = function(.data, ...) {
-  metadata = attributes(.data)$metadata
-  .data = NextMethod()
-  reclass(.data, metadata)
-}
-
-#' @method group_by epi_df
-#' @rdname print.epi_df
 #' @export
-group_by.epi_df = function(.data, ...) {
-  metadata = attributes(.data)$metadata
-  .data = NextMethod()
-  reclass(.data, metadata)
+`[.epi_df` <- function(x, i, j, drop = FALSE) {
+  res <- NextMethod()
+
+  if (!is.data.frame(res)) return(res)
+
+  dplyr::dplyr_reconstruct(res, x)
 }
 
-#' @method group_modify epi_df
-#' @rdname print.epi_df
+#' @importFrom dplyr dplyr_col_modify
 #' @export
-group_modify.epi_df = function(.data, ...) {
-  metadata = attributes(.data)$metadata
-  .data = NextMethod()
-  reclass(.data, metadata)
+dplyr_col_modify.epi_df = function(data, cols) {
+  dplyr::dplyr_reconstruct(NextMethod(), data)
 }
 
-#' @method mutate epi_df
-#' @rdname print.epi_df
+#' @importFrom dplyr dplyr_row_slice
 #' @export
-mutate.epi_df = function(.data, ...) {
-  metadata = attributes(.data)$metadata
-  .data = NextMethod()
-  reclass(.data, metadata)
+dplyr_row_slice.epi_df = function(data, i, ...) {
+  dplyr::dplyr_reconstruct(NextMethod(), data)
 }
 
-#' @method relocate epi_df
-#' @rdname print.epi_df
 #' @export
-relocate.epi_df = function(.data, ...) {
-  metadata = attributes(.data)$metadata
-  .data = NextMethod()
-  reclass(.data, metadata)
-}
-
-#' @method rename epi_df
-#' @rdname print.epi_df
-#' @export
-rename.epi_df = function(.data, ...) {
-  metadata = attributes(.data)$metadata
-  .data = NextMethod()
-  reclass(.data, metadata)
+`names<-.epi_df` = function(x, value) {
+  old_names = names(x)
+  old_other_keys = attributes(x)$metadata$other_keys
+  result = NextMethod()
+  attributes(result)$metadata$other_keys <- value[match(old_other_keys, old_names)]
+  dplyr::dplyr_reconstruct(result, result)
 }
 
-#' @method slice epi_df
+#' @method group_by epi_df
 #' @rdname print.epi_df
 #' @export
-slice.epi_df = function(.data, ...) {
+group_by.epi_df = function(.data, ...) {
   metadata = attributes(.data)$metadata
   .data = NextMethod()
   reclass(.data, metadata)
@@ -190,14 +204,22 @@ ungroup.epi_df = function(x, ...) {
   reclass(x, metadata)
 }
 
+#' @method group_modify epi_df
+#' @rdname print.epi_df
+#' @param .data The `epi_df` object.
+#' @param .f function or formula; see [`dplyr::group_modify`] +#' @param .keep Boolean; see [`dplyr::group_modify`] +#' @export +group_modify.epi_df = function(.data, .f, ..., .keep = FALSE) { + dplyr::dplyr_reconstruct(NextMethod(), .data) +} + #' @method unnest epi_df #' @rdname print.epi_df #' @param data The `epi_df` object. #' @export unnest.epi_df = function(data, ...) { - metadata = attributes(data)$metadata - data = NextMethod() - reclass(data, metadata) + dplyr::dplyr_reconstruct(NextMethod(), data) } # Simple reclass function diff --git a/R/outliers.R b/R/outliers.R index 6cc2ffb11..e5fd87652 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -128,6 +128,10 @@ detect_outlr = function(x = seq_along(y), y, #' `y`). #' @param y Signal values. #' @param n Number of time steps to use in the rolling window. Default is 21. +#' This value is centrally aligned. When `n` is an odd number, the rolling +#' window extends from `(n-1)/2` time steps before each design point to `(n-1)/2` +#' time steps after. When `n` is even, then the rolling range extends from +#' `n/2-1` time steps before to `n/2` time steps after. #' @param log_transform Should a log transform be applied before running outlier #' detection? Default is `FALSE`. If `TRUE`, and zeros are present, then the #' log transform will be padded by 1. @@ -179,7 +183,7 @@ detect_outlr_rm = function(x = seq_along(y), y, n = 21, # Calculate lower and upper thresholds and replacement value z = z %>% - epi_slide(fitted = median(y), n = n, align = "center") %>% + epi_slide(fitted = median(y), before = floor((n-1)/2), after = ceiling((n-1)/2)) %>% dplyr::mutate(resid = y - fitted) %>% roll_iqr(n = n, detection_multiplier = detection_multiplier, @@ -332,7 +336,7 @@ roll_iqr = function(z, n, detection_multiplier, min_radius, if (typeof(z$y) == "integer") as_type = as.integer else as_type = as.numeric - epi_slide(z, roll_iqr = stats::IQR(resid), n = n, align = "center") %>% + epi_slide(z, roll_iqr = stats::IQR(resid), before = floor((n-1)/2), after = ceiling((n-1)/2)) %>% dplyr::mutate( lower = pmax(min_lower, fitted - pmax(min_radius, detection_multiplier * roll_iqr)), diff --git a/R/slide.R b/R/slide.R index aa5d56dc8..635d4d3d7 100644 --- a/R/slide.R +++ b/R/slide.R @@ -4,84 +4,116 @@ #' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for #' examples. #' -#' @param x The `epi_df` object under consideration. -#' @param f Function or formula to slide over variables in `x`. To "slide" means -#' to apply a function or formula over a running window of `n` time steps -#' (where one time step is typically one day or one week; see details for more -#' explanation). If a function, `f` should take `x`, an `epi_df` with the same -#' names as the non-grouping columns, followed by `g` to refer to the one row -#' tibble with one column per grouping variable that identifies the group, -#' and any number of named arguments (which will be taken from `...`). If a -#' formula, `f` can operate directly on columns accessed via `.x$var`, as -#' in `~ mean(.x$var)` to compute a mean of a column var over a sliding -#' window of n time steps. As well, `.y` may be used in the formula to refer -#' to the groupings that would be described by `g` if `f` was a function. +#' @param x The `epi_df` object under consideration, [grouped][dplyr::group_by] +#' or ungrouped. If ungrouped, all data in `x` will be treated as part of a +#' single data group. +#' @param f Function, formula, or missing; together with `...` specifies the +#' computation to slide. 
To "slide" means to apply a computation within a +#' sliding (a.k.a. "rolling") time window for each data group. The window is +#' determined by the `before` and `after` parameters described below. One time +#' step is typically one day or one week; see details for more explanation. If +#' a function, `f` must take a data frame with the same column names as +#' the original object, minus any grouping variables, containing the time +#' window data for one group-`ref_time_value` combination; followed by a +#' one-row tibble containing the values of the grouping variables for the +#' associated group; followed by any number of named arguments. If a formula, +#' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as +#' in `~mean(.x$var)` to compute a mean of a column `var` for each +#' `ref_time_value`-group combination. The group key can be accessed via `.y`. +#' If `f` is missing, then `...` will specify the computation. #' @param ... Additional arguments to pass to the function or formula specified -#' via `f`. Alternatively, if `f` is missing, then the current argument is -#' interpreted as an expression for tidy evaluation. See details. -#' @param n Number of time steps to use in the running window. For example, if -#' `n = 7`, one time step is one day, and the alignment is "right", then to -#' produce a value on January 7 we apply the given function or formula to data -#' in between January 1 and 7. +#' via `f`. Alternatively, if `f` is missing, then the `...` is interpreted as +#' an expression for tidy evaluation; in addition to referring to columns +#' directly by name, the expression has access to `.data` and `.env` pronouns +#' as in `dplyr` verbs, and can also refer to `.x`, `.group_key`, and +#' `.ref_time_value`. See details. +#' @param before,after How far `before` and `after` each `ref_time_value` should +#' the sliding window extend? At least one of these two arguments must be +#' provided; the other's default will be 0. Any value provided for either +#' argument must be a single, non-`NA`, non-negative, +#' [integer-compatible][vctrs::vec_cast] number of time steps. Endpoints of +#' the window are inclusive. Common settings: * For trailing/right-aligned +#' windows from `ref_time_value - time_step(k)` to `ref_time_value`: either +#' pass `before=k` by itself, or pass `before=k, after=0`. * For +#' center-aligned windows from `ref_time_value - time_step(k)` to +#' `ref_time_value + time_step(k)`: pass `before=k, after=k`. * For +#' leading/left-aligned windows from `ref_time_value` to `ref_time_value + +#' time_step(k)`: either pass pass `after=k` by itself, or pass `before=0, +#' after=k`. See "Details:" about the definition of a time step, +#' (non)treatment of missing rows within the window, and avoiding warnings +#' about `before`&`after` settings for a certain uncommon use case. #' @param ref_time_values Time values for sliding computations, meaning, each #' element of this vector serves as the reference time point for one sliding #' window. If missing, then this will be set to all unique time values in the #' underlying data table, by default. -#' @param align One of "right", "center", or "left", indicating the alignment of -#' the sliding window relative to the reference time point. If the alignment -#' is "center" and `n` is even, then one more time point will be used after -#' the reference time point than before. Default is "right". 
-#' @param before Positive integer less than `n`, specifying the number of time -#' points to use in the sliding window strictly before the reference time -#' point. For example, setting `before = n-1` would be the same as setting -#' `align = "right"`. The `before` argument allows for more flexible -#' specification of alignment than the `align` parameter, and if specified, -#' overrides `align`. #' @param time_step Optional function used to define the meaning of one time #' step, which if specified, overrides the default choice based on the -#' `time_value` column. This function must take a positive integer and return -#' an object of class `lubridate::period`. For example, we can use `time_step -#' = lubridate::hours` in order to set the time step to be one hour (this -#' would only be meaningful if `time_value` is of class `POSIXct`). +#' `time_value` column. This function must take a non-negative integer and +#' return an object of class `lubridate::period`. For example, we can use +#' `time_step = lubridate::hours` in order to set the time step to be one hour +#' (this would only be meaningful if `time_value` is of class `POSIXct`). #' @param new_col_name String indicating the name of the new column that will #' contain the derivative values. Default is "slide_value"; note that setting #' `new_col_name` equal to an existing column name will overwrite this column. -#' @param as_list_col If the computations return data frames, should the slide -#' result hold these in a single list column or try to unnest them? Default is -#' `FALSE`, in which case a list object returned by `f` would be unnested -#' (using [`tidyr::unnest()`]), and the names of the resulting columns are given -#' by prepending `new_col_name` to the names of the list elements. +#' @param as_list_col Should the slide results be held in a list column, or be +#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`, +#' in which case a list object returned by `f` would be unnested (using +#' [`tidyr::unnest()`]), and, if the slide computations output data frames, +#' the names of the resulting columns are given by prepending `new_col_name` +#' to the names of the list elements. #' @param names_sep String specifying the separator to use in `tidyr::unnest()` #' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix #' from `new_col_name` entirely. #' @param all_rows If `all_rows = TRUE`, then all rows of `x` will be kept in -#' the output; otherwise, there will be one row for each time value in `x` -#' that acts as a reference time value. Default is `FALSE`. +#' the output even with `ref_time_values` provided, with some type of missing +#' value marker for the slide computation output column(s) for `time_value`s +#' outside `ref_time_values`; otherwise, there will be one row for each row in +#' `x` that had a `time_value` in `ref_time_values`. Default is `FALSE`. The +#' missing value marker is the result of `vctrs::vec_cast`ing `NA` to the type +#' of the slide computation output. If using `as_list_col = TRUE`, note that +#' the missing marker is a `NULL` entry in the list column; for certain +#' operations, you might want to replace these `NULL` entries with a different +#' `NA` marker. #' @return An `epi_df` object given by appending a new column to `x`, named -#' according to the `new_col_name` argument. +#' according to the `new_col_name` argument. 
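+#'
+#' (The sketch referenced under `before,after`, assuming daily data and a
+#' hypothetical `cases` column:)
+#' ```
+#' # 7-day trailing window (the ref time and the 6 days before it):
+#' epi_slide(x, mean_7dav = mean(cases), before = 6)
+#' # 7-day centered window (3 days on either side of the ref time):
+#' epi_slide(x, mean_7dav = mean(cases), before = 3, after = 3)
+#' ```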
 #'
-#' @details To "slide" means to apply a function or formula over a running
-#' window of `n` time steps, where the unit (the meaning of one time step) is
-#' implicitly defined by the way the `time_value` column treats addition and
-#' subtraction; for example, if the time values are coded as `Date` objects,
-#' then one time step is one day, since `as.Date("2022-01-01") + 1` equals
+#' @details To "slide" means to apply a function or formula over a rolling
+#' window of time steps for each data group, where the window is anchored at a
+#' reference time and left and right endpoints are given by the `before` and
+#' `after` arguments. The unit (the meaning of one time step) is implicitly
+#' defined by the way the `time_value` column treats addition and subtraction;
+#' for example, if the time values are coded as `Date` objects, then one time
+#' step is one day, since `as.Date("2022-01-01") + 1` equals
 #' `as.Date("2022-01-02")`. Alternatively, the time step can be set explicitly
 #' using the `time_step` argument (which if specified would override the
-#' default choice based on `time_value` column). If less than `n` time steps
-#' are available at any given reference time value, then `epi_slide()` still
-#' attempts to perform the computation anyway (it does not require a complete
-#' window). The issue of what to do with partial computations (those run on
-#' incomplete windows) is therefore left up to the user, either through the
-#' specified function or formula `f`, or through post-processing.
-#'
-#' If `f` is missing, then an expression for tidy evaluation can be specified,
-#' for example, as in:
+#' default choice based on `time_value` column). If there are not enough time
+#' steps available to complete the window at any given reference time, then
+#' `epi_slide()` still attempts to perform the computation anyway (it does not
+#' require a complete window). The issue of what to do with partial
+#' computations (those run on incomplete windows) is therefore left up to the
+#' user, either through the specified function or formula `f`, or through
+#' post-processing. For a centrally-aligned slide over a window of `n`
+#' `time_value`s, set `before = (n-1)/2` and `after = (n-1)/2` when `n` is
+#' odd, and `before = n/2-1` and `after = n/2` when `n` is even.
+#'
+#' Sometimes, we want to experiment with various trailing or leading window
+#' widths and compare the slide outputs. In the (uncommon) case where
+#' zero-width windows are considered, manually pass both the `before` and
+#' `after` arguments in order to prevent potential warnings. (E.g., `before=k`
+#' with `k=0` and `after` missing may produce a warning. To avoid warnings,
+#' use `before=k, after=0` instead; otherwise, it looks too much like a
+#' leading window was intended, but the `after` argument was forgotten or
+#' misspelled.)
+#'
+#' If `f` is missing, then an expression for tidy evaluation can be specified,
+#' for example, as in:
 #' ```
-#' epi_slide(x, cases_7dav = mean(cases), n = 7)
+#' epi_slide(x, cases_7dav = mean(cases), before = 6)
 #' ```
 #' which would be equivalent to:
 #' ```
-#' epi_slide(x, function(x, ...) mean(x$cases), n = 7,
+#' epi_slide(x, function(x, g, t) mean(x$cases), before = 6,
 #'           new_col_name = "cases_7dav")
 #' ```
 #' Thus, to be clear, when the computation is specified via an expression for
@@ -90,88 +122,149 @@
 #' through the `new_col_name` argument.
 #'
 #' @importFrom lubridate days weeks
-#' @importFrom rlang .data .env !!
enquo enquos sym +#' @importFrom dplyr bind_rows group_vars filter select +#' @importFrom rlang .data .env !! enquo enquos sym env #' @export #' @examples -#' # slide a 7-day trailing average formula on cases -#' jhu_csse_daily_subset %>% +#' # slide a 7-day trailing average formula on cases +#' jhu_csse_daily_subset %>% +#' group_by(geo_value) %>% +#' epi_slide(cases_7dav = mean(cases), before = 6) %>% +#' # rmv a nonessential var. to ensure new col is printed +#' dplyr::select(-death_rate_7d_av) +#' +#' # slide a 7-day leading average +#' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide(cases_7dav = mean(cases), n = 7, -#' align = "right") %>% +#' epi_slide(cases_7dav = mean(cases), after = 6) %>% +#' # rmv a nonessential var. to ensure new col is printed +#' dplyr::select(-death_rate_7d_av) +#' +#' # slide a 7-day centre-aligned average +#' jhu_csse_daily_subset %>% +#' group_by(geo_value) %>% +#' epi_slide(cases_7dav = mean(cases), before = 3, after = 3) %>% #' # rmv a nonessential var. to ensure new col is printed #' dplyr::select(-death_rate_7d_av) -#' -#' # slide a left-aligned 7-day average -#' jhu_csse_daily_subset %>% +#' +#' # slide a 14-day centre-aligned average +#' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide(cases_7dav = mean(cases), n = 7, -#' align = "left") %>% +#' epi_slide(cases_7dav = mean(cases), before = 6, after = 7) %>% #' # rmv a nonessential var. to ensure new col is printed #' dplyr::select(-death_rate_7d_av) -#' -#' # nested new columns -#' jhu_csse_daily_subset %>% -#' group_by(geo_value) %>% -#' epi_slide(a = data.frame(cases_2dav = mean(cases), -#' cases_2dma = mad(cases)), -#' n = 2, as_list_col = TRUE) -epi_slide = function(x, f, ..., n, ref_time_values, - align = c("right", "center", "left"), before, time_step, +#' +#' # nested new columns +#' jhu_csse_daily_subset %>% +#' group_by(geo_value) %>% +#' epi_slide(a = data.frame(cases_2dav = mean(cases), +#' cases_2dma = mad(cases)), +#' before = 1, as_list_col = TRUE) +epi_slide = function(x, f, ..., before, after, ref_time_values, + time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", all_rows = FALSE) { + # Check we have an `epi_df` object if (!inherits(x, "epi_df")) Abort("`x` must be of class `epi_df`.") + + # Check that `f` takes enough args + if (!missing(f) && is.function(f)) { + assert_sufficient_f_args(f, ..., n_mandatory_f_args = 3L) + } - # Arrange by increasing time_value - x = arrange(x, time_value) - - # If missing, then set ref time values to be everything; else make sure we - # intersect with observed time values if (missing(ref_time_values)) { ref_time_values = unique(x$time_value) - } - else { - ref_time_values = ref_time_values[ref_time_values %in% - unique(x$time_value)] } - - # If before is missing, then use align to set up alignment - if (missing(before)) { - align = match.arg(align) - if (align == "right") { - before_num = n-1 - after_num = 0 - } - else if (align == "center") { - before_num = floor((n-1)/2) - after_num = ceiling((n-1)/2) + + # Some of these `ref_time_values` checks and processing steps also apply to + # the `ref_time_values` default; for simplicity, just apply all the steps + # regardless of whether we are working with a default or user-provided + # `ref_time_values`: + if (length(ref_time_values) == 0L) { + Abort("`ref_time_values` must have at least one element.") + } else if (any(is.na(ref_time_values))) { + Abort("`ref_time_values` must not include `NA`.") + } else if (anyDuplicated(ref_time_values) 
!= 0L) { + Abort("`ref_time_values` must not contain any duplicates; use `unique` if appropriate.") + } else if (!all(ref_time_values %in% unique(x$time_value))) { + Abort("All `ref_time_values` must appear in `x$time_value`.") + } else { + ref_time_values = sort(ref_time_values) + } + + # Validate and pre-process `before`, `after`: + if (!missing(before)) { + before <- vctrs::vec_cast(before, integer()) + if (length(before) != 1L || is.na(before) || before < 0L) { + Abort("`before` must be length-1, non-NA, non-negative") } - else { - before_num = 0 - after_num = n-1 + } + if (!missing(after)) { + after <- vctrs::vec_cast(after, integer()) + if (length(after) != 1L || is.na(after) || after < 0L) { + Abort("`after` must be length-1, non-NA, non-negative") } } - - # Otherwise set up alignment based on passed before value - else { - if (before < 0 || before > n-1) { - Abort("`before` must be in between 0 and n-1`.") + if (missing(before)) { + if (missing(after)) { + Abort("Either or both of `before`, `after` must be provided.") + } else if (after == 0L) { + Warn("`before` missing, `after==0`; maybe this was intended to be some + non-zero-width trailing window, but since `before` appears to be + missing, it's interpreted as a zero-width window (`before=0, + after=0`).") } - - before_num = before - after_num = n-1-before + before <- 0L + } else if (missing(after)) { + if (before == 0L) { + Warn("`before==0`, `after` missing; maybe this was intended to be some + non-zero-width leading window, but since `after` appears to be + missing, it's interpreted as a zero-width window (`before=0, + after=0`).") + } + after <- 0L } - # If a custom time step is specified, then redefine units + # If a custom time step is specified, then redefine units if (!missing(time_step)) { - before_num = time_step(before_num) - after_num = time_step(after_num) + before <- time_step(before) + after <- time_step(after) + } + + min_ref_time_values = ref_time_values - before + min_ref_time_values_not_in_x <- min_ref_time_values[!(min_ref_time_values %in% unique(x$time_value))] + + # Do set up to let us recover `ref_time_value`s later. + # A helper column marking real observations. + x$.real = TRUE + + # Create df containing phony data. Df has the same columns and attributes as + # `x`, but filled with `NA`s aside from grouping columns. Number of rows is + # equal to the number of `min_ref_time_values_not_in_x` we have * the + # number of unique levels seen in the grouping columns. + before_time_values_df = data.frame(time_value=min_ref_time_values_not_in_x) + if (length(group_vars(x)) != 0) { + before_time_values_df = dplyr::cross_join( + # Get unique combinations of grouping columns seen in real data. + unique(x[, group_vars(x)]), + before_time_values_df + ) } + # Automatically fill in all other columns from `x` with `NA`s, and carry + # attributes over to new df. 
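+  # (E.g., grouping by `geo_value` over c("ca", "fl") with two dates in
+  # `min_ref_time_values_not_in_x` produces a 4-row `before_time_values_df`
+  # from the cross join above; the `bind_rows(x[0,], ...)` below then adds the
+  # remaining columns filled with `NA`s while keeping `x`'s column types and
+  # attributes.)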
+  before_time_values_df <- bind_rows(x[0,], before_time_values_df)
+  before_time_values_df$.real <- FALSE
+
+  x <- bind_rows(before_time_values_df, x)
+
+  # Arrange by increasing time_value
+  x = arrange(x, time_value)

   # Now set up starts and stops for sliding/hopping
   time_range = range(unique(x$time_value))
-  starts = in_range(ref_time_values - before_num, time_range)
-  stops = in_range(ref_time_values + after_num, time_range)
+  starts = in_range(ref_time_values - before, time_range)
+  stops = in_range(ref_time_values + after, time_range)

   if( length(starts) == 0 || length(stops) == 0 ) {
-    Abort("The starting and/or stopping times for sliding are out of bounds with respect to the range of times in your data. Check your settings for ref_time_values and align (and before, if specified).")
+    Abort("The starting and/or stopping times for sliding are out of bounds with respect to the range of times in your data. Check your settings for ref_time_values, before, and after.")
@@ -198,74 +291,83 @@ epi_slide = function(x, f, ..., n, ref_time_values,
     time_values = time_values[o]

     # Compute the slide values
-    slide_values = slider::hop_index(.x = .data_group,
-                                     .i = .data_group$time_value,
-                                     .f = f, ...,
-                                     .starts = starts,
-                                     .stops = stops)
+    slide_values_list = slider::hop_index(.x = .data_group,
+                                          .i = .data_group$time_value,
+                                          .f = f, ...,
+                                          .starts = starts,
+                                          .stops = stops)

     # Now figure out which rows in the data group are in the reference time
     # values; this will be useful for all sorts of checks that follow
     o = .data_group$time_value %in% time_values
     num_ref_rows = sum(o)

-    # Count the number of appearances of each reference time value
+    # Count the number of appearances of each reference time value (these
+    # appearances should all be real for now, but that may change if we ever
+    # allow ref time values outside of .data_group's time values):
     counts = .data_group %>%
       dplyr::filter(.data$time_value %in% time_values) %>%
       dplyr::count(.data$time_value) %>%
       dplyr::pull(n)

-    # If they're all atomic vectors
-    if (all(sapply(slide_values, is.atomic))) {
-      if (all(sapply(slide_values, length) == 1)) {
-        # Recycle to make size stable (one slide value per ref time value)
-        slide_values = rep(unlist(slide_values), times = counts)
-      }
-      else {
-        # Unlist, then check its length, and abort if not right
-        slide_values = unlist(slide_values)
-        if (length(slide_values) != num_ref_rows) {
-          Abort("If the slide computations return atomic vectors, then they must each have a single element, or else one element per appearance of the reference time value in the local window.")
-        }
-      }
+    if (!all(purrr::map_lgl(slide_values_list, is.atomic)) &&
+        !all(purrr::map_lgl(slide_values_list, is.data.frame))) {
+      Abort("The slide computations must always return atomic vectors or data frames (and not a mix of these two structures).")
     }
-
-    # If they're all data frames
-    else if (all(sapply(slide_values, is.data.frame))) {
-      if (all(sapply(slide_values, nrow) == 1)) {
-        # Recycle to make size stable (one slide value per ref time value)
-        slide_values = rep(slide_values, times = counts)
+
+    # Unlist if appropriate:
+    slide_values =
+      if (as_list_col) {
+        slide_values_list
+      } else {
+        vctrs::list_unchop(slide_values_list)
+      }
+
+    if (all(purrr::map_int(slide_values_list, vctrs::vec_size) == 1L) &&
+        length(slide_values_list) != 0L) {
+      # Recycle to make size stable (one slide value per ref time value).
+      # (Length-0 case also could be handled here, but causes difficulties;
+      # leave it to the next branch, where it also belongs.)
+      slide_values = vctrs::vec_rep_each(slide_values, times = counts)
+    } else {
+      # Split and flatten if appropriate, perform a (loose) check on number of
+      # rows.
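+      # (For example, if a reference time value appears twice in
+      # `.data_group` and the computation returns a size-2 result for it,
+      # `vec_split` below breaks that result into two size-1 entries when
+      # `as_list_col = TRUE`, keeping the output one entry per row.)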
+ if (as_list_col) { + slide_values = purrr::list_flatten(purrr::map( + slide_values, ~ vctrs::vec_split(.x, seq_len(vctrs::vec_size(.x)))[["val"]] + )) } - else { - # Split (each row on its own), check length, abort if not right - slide_df = dplyr::bind_rows(slide_values) - slide_values = split(slide_df, 1:nrow(slide_df)) - if (length(slide_values) != num_ref_rows) { - Abort("If the slide computations return data frames, then they must each have a single row, or else one row per appearance of the reference time value in the local window.") - } + if (vctrs::vec_size(slide_values) != num_ref_rows) { + Abort("The slide computations must either (a) output a single element/row each, or (b) one element/row per appearance of the reference time value in the local window.") } } - - # If neither all atomic vectors or all data frames, then abort - else { - Abort("The slide computations must return always atomic vectors or data frames (and not a mix of these two structures).") - } - + # If all rows, then pad slide values with NAs, else filter down data group if (all_rows) { orig_values = slide_values - slide_values = rep(NA, nrow(.data_group)) - slide_values[o] = orig_values + slide_values = vctrs::vec_rep(vctrs::vec_cast(NA, orig_values), nrow(.data_group)) + # ^ using vctrs::vec_init would be shorter but docs don't guarantee it + # fills with NA equivalent. + vctrs::vec_slice(slide_values, o) = orig_values + } else { + # This implicitly removes phony (`.real` == FALSE) observations. + .data_group = filter(.data_group, o) } - else .data_group = filter(.data_group, o) return(mutate(.data_group, !!new_col := slide_values)) } # If f is not missing, then just go ahead, slide by group if (!missing(f)) { + if (rlang::is_formula(f)) f = as_slide_computation(f) + f_rtv_wrapper = function(x, g, ...) { + ref_time_value = min(x$time_value) + before + x <- x[x$.real,] + x$.real <- NULL + f(x, g, ref_time_value, ...) + } x = x %>% group_modify(slide_one_grp, - f = f, ..., + f = f_rtv_wrapper, ..., starts = starts, stops = stops, time_values = ref_time_values, @@ -285,7 +387,18 @@ epi_slide = function(x, f, ..., n, ref_time_values, } quo = quos[[1]] - f = function(x, quo, ...) rlang::eval_tidy(quo, x) + f = function(.x, .group_key, quo, ...) { + .ref_time_value = min(.x$time_value) + before + .x <- .x[.x$.real,] + .x$.real <- NULL + data_mask = rlang::as_data_mask(.x) + # We'll also install `.x` directly, not as an `rlang_data_pronoun`, so + # that we can, e.g., use more dplyr and epiprocess operations. + data_mask$.x = .x + data_mask$.group_key = .group_key + data_mask$.ref_time_value = .ref_time_value + rlang::eval_tidy(quo, data_mask) + } new_col = sym(names(rlang::quos_auto_name(quos))) x = x %>% @@ -303,5 +416,15 @@ epi_slide = function(x, f, ..., n, ref_time_values, if (!as_list_col) { x = unnest(x, !!new_col, names_sep = names_sep) } + + # Remove any remaining phony observations. When `all_rows` is TRUE, phony + # observations aren't necessarily removed in `slide_one_grp`. + if (all_rows) { + x <- x[x$.real,] + } + + # Drop helper column `.real`. + x$.real <- NULL + return(x) } diff --git a/R/utils.R b/R/utils.R index f6b4b790f..e4625a4ff 100644 --- a/R/utils.R +++ b/R/utils.R @@ -4,9 +4,288 @@ break_str = function(str, nchar = 79, init = "") { return(str) } +# Note: update `wrap_symbolics` and `wrap_varnames` (parameters, parameter +# defaults, bodies) together. + +#' Line wrap list holding [symbolic][rlang::is_symbolic], with prefix&indent +#' +#' Helps pretty-print these objects. 
Adds backticks, commas, prefixes, and +#' indentation. Wraps lines, but won't insert line breaks in the middle of any +#' name while doing so. +#' +#' @param symbolics List of [symbolic][rlang::is_symbolic] objects: the variable +#' names (potentially empty) +#' @param initial Optional; single string: a prefix for the initial line in the +#' result; e.g., "Variable names: ". Defaults to "". Any non-initial lines +#' will be indented with whitespace matching the (estimated) visual width of +#' `initial`. +#' @param common_prefix Optional; single string: a prefix for every line (will +#' appear before `initial`); e.g., "# ". Defaults to "". +#' @param none_str Optional; single string: what to display when given +#' `length`-0 input. Will be combined with `common_prefix` and `initial`. +#' @param width Optional; single integer: desired maximum formatted line width. +#' The formatted output may not obey this setting if `common_prefix` plus +#' `initial` is long or the printing width is very narrow. +#' @return `chr`; to print, use [`base::writeLines`]. +#' +#' @noRd +wrap_symbolics = function(symbolics, + initial = "", common_prefix = "", none_str = "", + width = getOption("width", 80L)) { + if (!all(purrr::map_lgl(symbolics, rlang::is_symbolic))) { + Abort("`symbolics` must be a list of symbolic objects") + } + if (!rlang::is_string(initial)) { + Abort("`initial` must be a string") + } + if (!rlang::is_string(common_prefix)) { + Abort("`common_prefix` must be a string") + } + if (!rlang::is_string(none_str)) { + Abort("`none_str` must be a string") + } + prefix = strrep(" ", nchar(initial, type="width")) + full_initial = paste0(common_prefix, initial) + full_prefix = paste0(common_prefix, prefix) + full_initial_width = nchar(full_initial, type="width") + minimum_reasonable_line_width_for_syms = 20L + line_width_for_syms = max(width - full_initial_width, + minimum_reasonable_line_width_for_syms) + unprefixed_lines = + if (length(symbolics) == 0L) { + none_str + } else { + utils::capture.output( + withr::with_options(list("width" = line_width_for_syms), { + # `paste0` already takes care of necessary backquotes. `cat` with + # `fill=TRUE` takes care of spacing + line wrapping exclusively + # between elements. We need to add commas appropriately. + cat(paste0(symbolics, c(rep(",", times=length(symbolics)-1L), "")), fill=TRUE) + }) + ) + } + lines = paste0(c(full_initial, rep(full_prefix, times=length(unprefixed_lines)-1L)), + unprefixed_lines) + lines +} + +#' Line wrap `chr` holding variable/column/other names, with prefix&indent +#' +#' @param nms Character vector: the variable names (potentially empty) +#' @inheritParams wrap_symbolics +#' @return `chr`; to print, use [`base::writeLines`]. +#' +#' @noRd +wrap_varnames = function(nms, + initial = "", common_prefix = "", none_str = "", + width = getOption("width", 80L)) { + # (Repeating parameter names and default args here for better autocomplete. + # Using `...` instead would require less upkeep, but have worse autocomplete.) + if (!rlang::is_character(nms)) { + Abort("`nms` must be a character vector") + } + wrap_symbolics(rlang::syms(nms), initial=initial, common_prefix=common_prefix, none_str=none_str, width=width) +} + +#' Paste `chr` entries (lines) together with `"\n"` separators, trailing `"\n"` +#' +#' @param lines `chr` +#' @return string +#' +#' @noRd +paste_lines = function(lines) { + paste(paste0(lines,"\n"), collapse="") +} + Abort = function(msg, ...) rlang::abort(break_str(msg, init = "Error: "), ...) Warn = function(msg, ...) 
rlang::warn(break_str(msg, init = "Warning: "), ...) +#' Assert that a sliding computation function takes enough args +#' +#' @param f Function; specifies a computation to slide over an `epi_df` or +#' `epi_archive` in `epi_slide` or `epix_slide`. +#' @param ... Dots that will be forwarded to `f` from the dots of `epi_slide` or +#' `epix_slide`. +#' @param n_mandatory_f_args Integer; specifies the number of arguments `f` +#' is required to take before any `...` arg. Defaults to 2. +#' +#' @importFrom rlang is_missing +#' @importFrom purrr map_lgl +#' @importFrom utils tail +#' +#' @noRd +assert_sufficient_f_args <- function(f, ..., n_mandatory_f_args = 2L) { + mandatory_f_args_labels <- c("window data", "group key", "reference time value")[seq(n_mandatory_f_args)] + n_mandatory_f_args <- length(mandatory_f_args_labels) + args = formals(args(f)) + args_names = names(args) + # Remove named arguments forwarded from `epi[x]_slide`'s `...`: + forwarded_dots_names = names(rlang::call_match(dots_expand = FALSE)[["..."]]) + args_matched_in_dots = + # positional calling args will skip over args matched by named calling args + args_names %in% forwarded_dots_names & + # extreme edge case: `epi[x]_slide(, dot = 1, `...` = 2)` + args_names != "..." + remaining_args = args[!args_matched_in_dots] + remaining_args_names = names(remaining_args) + # note that this doesn't include unnamed args forwarded through `...`. + dots_i <- which(remaining_args_names == "...") # integer(0) if no match + n_f_args_before_dots <- dots_i - 1L + if (length(dots_i) != 0L) { # `f` has a dots "arg" + # Keep all arg names before `...` + mandatory_args_mapped_names <- remaining_args_names[seq_len(n_f_args_before_dots)] + + if (n_f_args_before_dots < n_mandatory_f_args) { + mandatory_f_args_in_f_dots = + tail(mandatory_f_args_labels, n_mandatory_f_args - n_f_args_before_dots) + cli::cli_warn( + "`f` might not have enough positional arguments before its `...`; in the current `epi[x]_slide` call, the {mandatory_f_args_in_f_dots} will be included in `f`'s `...`; if `f` doesn't expect those arguments, it may produce confusing error messages", + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots", + epiprocess__f = f, + epiprocess__mandatory_f_args_in_f_dots = mandatory_f_args_in_f_dots + ) + } + } else { # `f` doesn't have a dots "arg" + if (length(args_names) < n_mandatory_f_args + rlang::dots_n(...)) { + # `f` doesn't take enough args. + if (rlang::dots_n(...) == 0L) { + # common case; try for friendlier error message + Abort(sprintf("`f` must take at least %s arguments", n_mandatory_f_args), + class = "epiprocess__assert_sufficient_f_args__f_needs_min_args", + epiprocess__f = f) + } else { + # less common; highlight that they are (accidentally?) using dots forwarding + Abort(sprintf("`f` must take at least %s arguments plus the %s arguments forwarded through `epi[x]_slide`'s `...`, or a named argument to `epi[x]_slide` was misspelled", n_mandatory_f_args, rlang::dots_n(...)), + class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded", + epiprocess__f = f) + } + } + } + # Check for args with defaults that are filled with mandatory positional + # calling args. If `f` has fewer than n_mandatory_f_args before `...`, then we + # only need to check those args for defaults. Note that `n_f_args_before_dots` is + # length 0 if `f` doesn't accept `...`. 
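+  # (A hypothetical example of what's being caught here: with `epi_slide`'s 3
+  # mandatory args, an `f = function(x, g, t = 1) ...` would have its `t`
+  # default silently replaced by the reference time value, so we abort below
+  # instead.)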
+ n_remaining_args_for_default_check = min(c(n_f_args_before_dots, n_mandatory_f_args)) + default_check_args = remaining_args[seq_len(n_remaining_args_for_default_check)] + default_check_args_names = names(default_check_args) + has_default_replaced_by_mandatory = map_lgl(default_check_args, ~!is_missing(.x)) + if (any(has_default_replaced_by_mandatory)) { + default_check_mandatory_args_labels = + mandatory_f_args_labels[seq_len(n_remaining_args_for_default_check)] + # ^ excludes any mandatory args absorbed by f's `...`'s: + mandatory_args_replacing_defaults = + default_check_mandatory_args_labels[has_default_replaced_by_mandatory] + args_with_default_replaced_by_mandatory = + rlang::syms(default_check_args_names[has_default_replaced_by_mandatory]) + cli::cli_abort("`epi[x]_slide` would pass the {mandatory_args_replacing_defaults} to `f`'s {args_with_default_replaced_by_mandatory} argument{?s}, which {?has a/have} default value{?s}; we suspect that `f` doesn't expect {?this arg/these args} at all and may produce confusing error messages. Please add additional arguments to `f` or remove defaults as appropriate.", + class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults", + epiprocess__f = f) + } +} + +#' Convert to function +#' +#' @description +#' `as_slide_computation()` transforms a one-sided formula into a function. +#' This powers the lambda syntax in packages like purrr. +#' +#' This code and documentation borrows heavily from [`rlang::as_function`] +#' (https://github.com/r-lib/rlang/blob/c55f6027928d3104ed449e591e8a225fcaf55e13/R/fn.R#L343-L427). +#' +#' This code extends `rlang::as_function` to create functions that take three +#' arguments. The arguments can be accessed via the idiomatic `.x`, `.y`, +#' etc, positional references (`..1`, `..2`, etc), and also by `epi +#' [x]_slide`-specific names. +#' +#' @source https://github.com/r-lib/rlang/blob/c55f6027928d3104ed449e591e8a225fcaf55e13/R/fn.R#L343-L427 +#' +#' @param x A function or formula. +#' +#' If a **function**, it is used as is. +#' +#' If a **formula**, e.g. `~ mean(.x$cases)`, it is converted to a function with up +#' to three arguments: `.x` (single argument), or `.x` and `.y` +#' (two arguments), or `.x`, `.y`, and `.z` (three arguments). The `.` +#' placeholder can be used instead of `.x`, `.group_key` can be used in +#' place of `.y`, and `.ref_time_value` can be used in place of `.z`. This +#' allows you to create very compact anonymous functions (lambdas) with up +#' to three inputs. Functions created from formulas have a special class. Use +#' `rlang::is_lambda()` to test for it. +#' +#' If a **string**, the function is looked up in `env`. Note that +#' this interface is strictly for user convenience because of the +#' scoping issues involved. Package developers should avoid +#' supplying functions by name and instead supply them by value. +#' +#' @param env Environment in which to fetch the function in case `x` +#' is a string. +#' @inheritParams rlang::args_dots_empty +#' @inheritParams rlang::args_error_context +#' @examples +#' f <- as_slide_computation(~ .x + 1) +#' f(10) +#' +#' g <- as_slide_computation(~ -1 * .) 
+#' g(4) +#' +#' h <- as_slide_computation(~ .x - .group_key) +#' h(6, 3) +#' +#' @importFrom rlang check_dots_empty0 is_function new_function f_env +#' is_environment missing_arg f_rhs is_string is_formula caller_arg +#' caller_env global_env +#' +#' @noRd +as_slide_computation <- function(x, + env = global_env(), + ..., + arg = caller_arg(x), + call = caller_env()) { + check_dots_empty0(...) + + if (is_function(x)) { + return(x) + } + + if (is_formula(x)) { + if (length(x) > 2) { + Abort(sprintf("%s must be a one-sided formula", arg), + class = "epiprocess__as_slide_computation__formula_is_twosided", + epiprocess__x = x, + call = call) + } + + env <- f_env(x) + if (!is_environment(env)) { + Abort("Formula must carry an environment.", + class = "epiprocess__as_slide_computation__formula_has_no_env", + epiprocess__x = x, + epiprocess__x_env = env, + arg = arg, call = call) + } + + args <- list( + ... = missing_arg(), + .x = quote(..1), .y = quote(..2), .z = quote(..3), + . = quote(..1), .group_key = quote(..2), .ref_time_value = quote(..3) + ) + fn <- new_function(args, f_rhs(x), env) + fn <- structure(fn, class = c("epiprocess_slide_computation", "function")) + return(fn) + } + + if (is_string(x)) { + return(get(x, envir = env, mode = "function")) + } + + Abort(sprintf("Can't convert a %s to a slide computation", class(x)), + class = "epiprocess__as_slide_computation__cant_convert_catchall", + epiprocess__x = x, + epiprocess__x_class = class(x), + arg = arg, + call = call) +} + ########## in_range = function(x, rng) pmin(pmax(x, rng[1]), rng[2]) @@ -128,3 +407,220 @@ enlist = function(...) { list2var = function(x) { list2env(x, envir = parent.frame()) } + +########## + +#' [`lifecycle::is_present`] for enquosed deprecated NSE arg +#' +#' [`lifecycle::is_present`] is designed for use with args that undergo standard +#' evaluation, rather than non-standard evaluation (NSE). This function is +#' designed to fulfill a similar purpose, but for args we have +#' [enquosed][rlang::enquo] in preparation for NSE. +#' +#' @param quo [enquosed][rlang::enquo] arg +#' @return bool; was `quo` "present", or did it look like a missing quosure or +#' have an expr that looked like `deprecated()` or `lifecycle::deprecated()`? +#' +#' @examples +#' +#' fn = function(x = deprecated()) { +#' deprecated_quo_is_present(rlang::enquo(x)) +#' } +#' +#' fn() # FALSE +#' fn(.data$something) # TRUE +#' +#' # Functions that wrap `fn` should forward the NSE arg to `fn` using +#' # [`{{ arg }}`][rlang::embrace-operator] (or, if they are working from an +#' # argument that has already been defused into a quosure, `!!quo`). (This is +#' # already how NSE arguments that will be enquosed should be forwarded.) +#' +#' wrapper1 = function(x=deprecated()) fn({{x}}) +#' wrapper2 = function(x=lifecycle::deprecated()) fn({{x}}) +#' wrapper3 = function(x) fn({{x}}) +#' wrapper4 = function(x) fn(!!rlang::enquo(x)) +#' +#' wrapper1() # FALSE +#' wrapper2() # FALSE +#' wrapper3() # FALSE +#' wrapper4() # FALSE +#' +#' # More advanced: wrapper that receives an already-enquosed arg: +#' +#' inner_wrapper = function(quo) fn(!!quo) +#' outer_wrapper1 = function(x=deprecated()) inner_wrapper(rlang::enquo(x)) +#' +#' outer_wrapper1() # FALSE +#' +#' # Improper argument forwarding from a wrapper function will cause this +#' # function to produce incorrect results. 
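+#' # (The forwarding below passes `x` by value, so `fn` enquotes the symbol
+#' # `x` itself rather than the caller's (missing) argument, and the arg
+#' # always looks present.)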
+#' bad_wrapper1 = function(x) fn(x)
+#' bad_wrapper1() # TRUE, bad
+#'
+#' @noRd
+deprecated_quo_is_present = function(quo) {
+  if (!rlang::is_quosure(quo)) {
+    Abort("`quo` must be a quosure; `enquo` the arg first",
+          internal=TRUE)
+  } else if (rlang::quo_is_missing(quo)) {
+    FALSE
+  } else {
+    quo_expr = rlang::get_expr(quo)
+    if (identical(quo_expr, rlang::expr(deprecated())) ||
+        identical(quo_expr, rlang::expr(lifecycle::deprecated()))) {
+      FALSE
+    } else {
+      TRUE
+    }
+  }
+}
+
+##########
+
+#' Find the greatest common divisor of two numeric scalars
+#'
+#' Not expected to be used directly; output class isn't precise, and checks
+#' could be moved into [`gcd_num`].
+#'
+#' An implementation of a least absolute remainder Euclidean algorithm (see,
+#' e.g., Moore, Thomas. "On the least absolute remainder Euclidean algorithm."
+#' The Fibonacci Quarterly (1992).)
+#'
+#' Notes on this implementation:
+#' * We allow positive or negative inputs, and don't require |a| > |b|.
+#' * `round` combines the job of truncating division and deciding between
+#'   positive and negative remainders.
+#' * We use some tolerance parameters and different checks to allow this to work
+#'   on floating-point numbers. Perhaps they could be altered or removed if we
+#'   are passed integers, but for simplicity, we always perform these checks.
+#'
+#' @param a Length 1, `is.numeric`; the first number
+#' @param b Length 1, `is.numeric`; the second number
+#' @param rrtol Optional, length 1, `is.numeric`, non-negative; the remainder
+#'   relative tolerance: consider a remainder from a division operation to be
+#'   zero if `abs(remainder/divisor) <= rrtol`. Could also be described as a
+#'   tolerance on the fractional part of the proper quotient. Default is 1e-6.
+#' @param pqlim Optional, length 1, `is.numeric`, non-negative; the proper
+#'   quotient limit: consider a divisor to be zero if `abs(dividend/divisor) >=
+#'   pqlim`.
+#' @param irtol Optional, length 1, `is.numeric`, non-negative; the iterand
+#'   relative tolerance: consider `a` and `b` to have no gcd if the absolute
+#'   value of an iterand (and consequently also any result that might be
+#'   subsequently produced, as absolute values of iterands are decreasing) is
+#'   `<= irtol * a` or `<= irtol * b`. Can also be seen as the reciprocal of a
+#'   limit on the number `k` needed to achieve `k * gcd_result ==
+#'   max(abs(a),abs(b))`.
+#' @return Length 1, `is.numeric`: the gcd. (Or an error.) Expected to be a
+#'   double unless `b` is the GCD and an integer, in which case it is expected
+#'   to be an integer.
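+#'
+#'   For example (illustrative only, and subject to the tolerance settings
+#'   above): `gcd2num(0.7, 0.2)` should return approximately `0.1`, and
+#'   `gcd2num(14L, 21L)` should return `7`.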
+#'
+#' @noRd
+gcd2num = function(a, b, rrtol=1e-6, pqlim=1e6, irtol=1e-6) {
+  if (!is.numeric(a) || length(a) != 1L) {
+    Abort("`a` must satisfy `is.numeric`, have `length` 1.")
+  }
+  if (!is.numeric(b) || length(b) != 1L) {
+    Abort("`b` must satisfy `is.numeric`, have `length` 1.")
+  }
+  if (!is.numeric(rrtol) || length(rrtol) != 1L || rrtol < 0) {
+    Abort("`rrtol` must satisfy `is.numeric`, have `length` 1, and be non-negative.")
+  }
+  if (!is.numeric(pqlim) || length(pqlim) != 1L || pqlim < 0) {
+    Abort("`pqlim` must satisfy `is.numeric`, have `length` 1, and be non-negative.")
+  }
+  if (!is.numeric(irtol) || length(irtol) != 1L || irtol < 0) {
+    Abort("`irtol` must satisfy `is.numeric`, have `length` 1, and be non-negative.")
+  }
+  if (is.na(a) || is.na(b) || a == 0 || b == 0 || abs(a/b) >= pqlim || abs(b/a) >= pqlim) {
+    Abort("`a` and/or `b` is either `NA` or exactly zero, or one is so much smaller than the other that it looks like it's supposed to be zero; see `pqlim` setting.")
+  }
+  iatol = irtol * max(a,b)
+  a_curr = a
+  b_curr = b
+  while (TRUE) {
+    # `b_curr` is the candidate GCD / iterand; check first if it seems too small:
+    if (abs(b_curr) <= iatol) {
+      Abort("No GCD found; remaining potential GCDs are all too small relative to one/both of the original inputs; see `irtol` setting.")
+    }
+    remainder = a_curr - round(a_curr / b_curr) * b_curr
+    if (abs(remainder / b_curr) <= rrtol) {
+      # We consider `a_curr` divisible by `b_curr`; `b_curr` is the GCD or its negation
+      return (abs(b_curr))
+    }
+    a_curr <- b_curr
+    b_curr <- remainder
+  }
+}
+
+#' Find the greatest common divisor of all entries in a numeric vector
+#'
+#' @param dividends `is.numeric`, `length` > 0; the dividends for which to find
+#'   the greatest common divisor.
+#' @param ... should be empty; forces the following parameters to be passed by
+#'   name
+#' @inheritParams gcd2num
+#' @return Same [`vctrs::vec_ptype`] as `dividends`, `length` 1: the gcd. (Or an
+#'   error.)
+#'
+#' @noRd
+gcd_num = function(dividends, ..., rrtol=1e-6, pqlim=1e6, irtol=1e-6) {
+  if (!is.numeric(dividends) || length(dividends) == 0L) {
+    Abort("`dividends` must satisfy `is.numeric`, and have `length` > 0")
+  }
+  if (rlang::dots_n(...) != 0L) {
+    Abort("`...` should be empty; all dividends should go in a single `dividends` vector, and all tolerance & limit settings should be passed by name.")
+  }
+  # We expect a bunch of duplicate `dividends` for some applications.
+  # De-duplicate to reduce work. Sort by absolute value to attempt to reduce
+  # workload. Also take `abs` early on as another form of deduplication and to
+  # make the sort simpler. Use `na.last=FALSE` in the sort to preserve presence
+  # of `NA`s in order to get a better error message in this case.
+  optimized_dividends = sort(unique(abs(dividends)), na.last=FALSE)
+  # Note that taking the prime factorizations of a set of integers, and
+  # calculating the minimum power for each prime across all these
+  # factorizations, yields the prime factorization of the GCD of the set of
+  # integers. We could carry these parallel minimum operations out using
+  # `reduce`, so we see that calculating the GCD of a set of integers can be
+  # done via `reduce`. Note that we should always have "gcd_real"(reals) =
+  # gcd_int(reals / integerizing_divisor) * integerizing_divisor for *every*
+  # integerizing divisor that would make "gcd_int" applicable.
There is a + # greatest integerizing divisor if there is a GCD at all, and this is the + # "gcd_real" itself, for which the "gcd_int" in the previous equation is 1; + # the set of valid integerizing divisors is the set of nonzero integer + # multiples of the greatest integerizing divisor. The gcd_real of X U Y is an + # integerizing factor for X U Y as well as X and Y individually, and we can + # see gcd_real(X U Y) = gcd_int(XUY / gcd(XUY)) * gcd(XUY) = + # gcd2int(gcd_int(X/gcd_real(XUY)), gcd_int(Y/gcd_real(XUY))) * gcd(XUY) = + # gcd2real(gcd_int(X/gcd_real(XUY))*gcd_real(XUY), + # gcd_int(Y/gcd_real(XUY))*gcd_real(XUY)) = gcd2real(gcd_real(X), + # gcd_real(Y)). So "gcd_real" should also be `reduce`-compatible. + numeric_gcd = purrr::reduce(optimized_dividends, gcd2num, + rrtol=rrtol, pqlim=pqlim, irtol=irtol) + vctrs::vec_cast(numeric_gcd, dividends) +} + +#' Use max valid period as guess for `period` of `ref_time_values` +#' +#' @param ref_time_values Vector containing time-interval-like or time-like +#' data, with at least two distinct values, [`diff`]-able (e.g., a +#' `time_value` or `version` column), and should have a sensible result from +#' adding `is.numeric` versions of its `diff` result (via `as.integer` if its +#' `typeof` is `"integer"`, otherwise via `as.numeric`). +#' @param ref_time_values_arg Optional, string; name to give `ref_time_values` +#' in error messages. Defaults to quoting the expression the caller fed into +#' the `ref_time_values` argument. +#' @return `is.numeric`, length 1; attempts to match `typeof(ref_time_values)` +guess_period = function(ref_time_values, ref_time_values_arg = rlang::caller_arg(ref_time_values)) { + sorted_distinct_ref_time_values = sort(unique(ref_time_values)) + if (length(sorted_distinct_ref_time_values) < 2L) { + Abort(sprintf("Not enough distinct values in `%s` to guess the period.", ref_time_values_arg)) + } + skips = diff(sorted_distinct_ref_time_values) + decayed_skips = + if (typeof(skips) == "integer") { + as.integer(skips) + } else { + as.numeric(skips) + } + gcd_num(decayed_skips) +} diff --git a/README.md b/README.md index 5857325aa..0c117c611 100644 --- a/README.md +++ b/README.md @@ -50,9 +50,9 @@ By convention, functions in the `epiprocess` package that operate on "archive"). These are just wrapper functions around the public methods for the `epi_archive` R6 class. For example: -- `epix_as_of()`, for generating a snapshot in `epi_df` from the data archive, - which represents the most up-to-date values of the signal variables, as of the - specified version; +- `epix_as_of()`, for generating a snapshot in `epi_df` format from the data + archive, which represents the most up-to-date values of the signal variables, + as of the specified version; - `epix_fill_through_version()`, for filling in some fake version data following simple rules, for use when downstream methods expect an archive that is more diff --git a/_pkgdown.yml b/_pkgdown.yml index 403cd1743..bba3ea8d4 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -57,9 +57,10 @@ reference: - contents: - matches("archive") - title: "`epix_*()` functions" - desc: Functions that act on an `epi_archive` object. + desc: Functions that act on an `epi_archive` and/or `grouped_epi_archive` object. 
- contents: - starts_with("epix") + - group_by.epi_archive - title: Example data - contents: - archive_cases_dv_subset @@ -70,4 +71,5 @@ reference: - epiprocess - max_version_with_row_in - next_after + - guess_period diff --git a/data-raw/archive_cases_dv_subset.R b/data-raw/archive_cases_dv_subset.R index 561bf35f2..d907fd897 100644 --- a/data-raw/archive_cases_dv_subset.R +++ b/data-raw/archive_cases_dv_subset.R @@ -12,7 +12,7 @@ dv_subset <- covidcast( geo_values = "ca,fl,ny,tx", issues = epirange(20200601, 20211201) ) %>% - fetch_tbl() %>% + fetch() %>% select(geo_value, time_value, version = issue, percent_cli = value) %>% # We're using compactify=FALSE here and below to avoid some testthat test # failures on tests that were based on a non-compactified version. @@ -27,7 +27,7 @@ case_rate_subset <- covidcast( geo_values = "ca,fl,ny,tx", issues = epirange(20200601, 20211201) ) %>% - fetch_tbl() %>% + fetch() %>% select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>% as_epi_archive(compactify=FALSE) diff --git a/data-raw/incidence_num_outlier_example.R b/data-raw/incidence_num_outlier_example.R index 560edd2f2..0aea397b3 100644 --- a/data-raw/incidence_num_outlier_example.R +++ b/data-raw/incidence_num_outlier_example.R @@ -12,7 +12,7 @@ incidence_num_outlier_example <- covidcast( geo_values = "fl,nj", as_of = 20211028 ) %>% - fetch_tbl() %>% + fetch() %>% select(geo_value, time_value, cases = value) %>% as_epi_df() diff --git a/data-raw/jhu_csse_county_level_subset.R b/data-raw/jhu_csse_county_level_subset.R index 350ba376d..613284238 100644 --- a/data-raw/jhu_csse_county_level_subset.R +++ b/data-raw/jhu_csse_county_level_subset.R @@ -17,7 +17,7 @@ jhu_csse_county_level_subset <- covidcast( time_values = epirange(20200601, 20211231), geo_values = paste(y$geo_value, collapse = ",") ) %>% - fetch_tbl() %>% + fetch() %>% select(geo_value, time_value, cases = value) %>% full_join(y, by = "geo_value") %>% as_epi_df() diff --git a/data-raw/jhu_csse_daily_subset.R b/data-raw/jhu_csse_daily_subset.R index 7942d9a4c..ce94ff2eb 100644 --- a/data-raw/jhu_csse_daily_subset.R +++ b/data-raw/jhu_csse_daily_subset.R @@ -10,7 +10,7 @@ confirmed_7dav_incidence_prop <- covidcast( time_values = epirange(20200301, 20211231), geo_values = "ca,fl,ny,tx,ga,pa" ) %>% - fetch_tbl() %>% + fetch() %>% select(geo_value, time_value, case_rate_7d_av = value) %>% arrange(geo_value, time_value) @@ -22,7 +22,7 @@ deaths_7dav_incidence_prop <- covidcast( time_values = epirange(20200301, 20211231), geo_values = "ca,fl,ny,tx,ga,pa" ) %>% - fetch_tbl() %>% + fetch() %>% select(geo_value, time_value, death_rate_7d_av = value) %>% arrange(geo_value, time_value) @@ -34,7 +34,7 @@ confirmed_incidence_num <- covidcast( time_values = epirange(20200301, 20211231), geo_values = "ca,fl,ny,tx,ga,pa" ) %>% - fetch_tbl() %>% + fetch() %>% select(geo_value, time_value, cases = value) %>% arrange(geo_value, time_value) @@ -46,7 +46,7 @@ confirmed_7dav_incidence_num <- covidcast( time_values = epirange(20200301, 20211231), geo_values = "ca,fl,ny,tx,ga,pa" ) %>% - fetch_tbl() %>% + fetch() %>% select(geo_value, time_value, cases_7d_av = value) %>% arrange(geo_value, time_value) diff --git a/man/archive_cases_dv_subset.Rd b/man/archive_cases_dv_subset.Rd index 1c6b9eb9c..4b19e58ca 100644 --- a/man/archive_cases_dv_subset.Rd +++ b/man/archive_cases_dv_subset.Rd @@ -23,7 +23,7 @@ Copyright Johns Hopkins University 2020. 
Modifications: \itemize{ \item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html}{From the COVIDcast Doctor Visits API}: The signal \code{percent_cli} is taken directly from the API without changes. -\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: \code{case_rate_7d_av} is taken directly from the JHU CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} without changes. The 7-day average signals are computed by Delphi by calculating moving averages of the preceding 7 days, so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive. +\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: \code{case_rate_7d_av} signal was computed by Delphi from the original JHU-CSSE data by calculating moving averages of the preceding 7 days, so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive. \item Furthermore, the data is a subset of the full dataset, the signal names slightly altered, and formatted into a tibble. } } diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd index a98798cc3..d63a5faa7 100644 --- a/man/as_epi_archive.Rd +++ b/man/as_epi_archive.Rd @@ -11,7 +11,7 @@ as_epi_archive( other_keys, additional_metadata = list(), compactify = NULL, - clobberable_versions_start = max_version_with_row_in(x), + clobberable_versions_start = NA, versions_end = max_version_with_row_in(x) ) } @@ -53,24 +53,18 @@ e.g., when fetching, storing, or preparing the input data \code{x}} same \code{class} and \code{typeof} as \code{x$version}, or an \code{NA} of any \code{class} and \code{typeof}: specifically, either (a) the earliest version that could be subject to "clobbering" (being overwritten with different update data, but -using the same version tag as the old update data), or (b) \code{NA}, to +using the \emph{same} version tag as the old update data), or (b) \code{NA}, to indicate that no versions are clobberable. There are a variety of reasons -why versions could be clobberable, such as upstream hotfixes to the latest -version, or delays in data synchronization that were mistaken for versions -with no updates; potential causes vary between different data pipelines. -The default value is \code{max_version_with_row_in(x)}; this default assumes -that (i) if a row in \code{x} (even one that \code{compactify} would consider -redundant) is present with version \code{ver}, then all previous versions must -be finalized and non-clobberable, although \code{ver} (and onward) might still -be modified, (ii) even if we have "observed" empty updates for some -versions beyond \code{max(x$version)} (as indicated by \code{versions_end}; -see below), we can't assume \code{max(x$version)} has been finalized, because we -might see a nonfinalized version + empty subsequent versions due to -upstream database replication delays in combination with the upstream -replicas using last-version-carried-forward to extrapolate that there were -no updates, (iii) "redundant" update rows that would be removed by -\code{compactify} are not redundant, and actually come from an explicit version -release that indicates that preceding versions are finalized. 
If \code{nrow(x) == 0}, then this argument is mandatory.} +why versions could be clobberable under routine circumstances, such as (a) +today's version of one/all of the columns being published after initially +being filled with \code{NA} or LOCF, (b) a buggy version of today's data being +published but then fixed and republished later in the day, or (c) data +pipeline delays (e.g., publisher uploading, periodic scraping, database +syncing, periodic fetching, etc.) that make events (a) or (b) reflected +later in the day (or even on a different day) than expected; potential +causes vary between different data pipelines. The default value is \code{NA}, +which doesn't consider any versions to be clobberable. Another setting that +may be appropriate for some pipelines is \code{max_version_with_row_in(x)}.} \item{versions_end}{Optional; length-1, same \code{class} and \code{typeof} as \code{x$version}: what is the last version we have observed? The default is diff --git a/man/as_tibble.epi_df.Rd b/man/as_tibble.epi_df.Rd new file mode 100644 index 000000000..c314f47e4 --- /dev/null +++ b/man/as_tibble.epi_df.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_df.R +\name{as_tibble.epi_df} +\alias{as_tibble.epi_df} +\title{Convert to tibble} +\usage{ +\method{as_tibble}{epi_df}(x, ...) +} +\arguments{ +\item{x}{an \code{epi_df}} + +\item{...}{arguments to forward to \code{NextMethod()}} +} +\description{ +Converts an \code{epi_df} object into a tibble, dropping metadata and any +grouping. +} diff --git a/man/detect_outlr_rm.Rd b/man/detect_outlr_rm.Rd index 33e3f73b5..3efae55d5 100644 --- a/man/detect_outlr_rm.Rd +++ b/man/detect_outlr_rm.Rd @@ -22,7 +22,11 @@ detect_outlr_rm( \item{y}{Signal values.} -\item{n}{Number of time steps to use in the rolling window. Default is 21.} +\item{n}{Number of time steps to use in the rolling window. Default is 21. +This value is centrally aligned. When \code{n} is an odd number, the rolling +window extends from \code{(n-1)/2} time steps before each design point to \code{(n-1)/2} +time steps after. When \code{n} is even, then the rolling range extends from +\code{n/2-1} time steps before to \code{n/2} time steps after.} \item{log_transform}{Should a log transform be applied before running outlier detection? Default is \code{FALSE}. If \code{TRUE}, and zeros are present, then the diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 026f27e1c..a4a586453 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -118,7 +118,9 @@ toy_epi_archive \item \href{#method-epi_archive-print}{\code{epi_archive$print()}} \item \href{#method-epi_archive-as_of}{\code{epi_archive$as_of()}} \item \href{#method-epi_archive-fill_through_version}{\code{epi_archive$fill_through_version()}} +\item \href{#method-epi_archive-truncate_versions_after}{\code{epi_archive$truncate_versions_after()}} \item \href{#method-epi_archive-merge}{\code{epi_archive$merge()}} +\item \href{#method-epi_archive-group_by}{\code{epi_archive$group_by()}} \item \href{#method-epi_archive-slide}{\code{epi_archive$slide()}} \item \href{#method-epi_archive-clone}{\code{epi_archive$clone()}} } @@ -169,15 +171,17 @@ such as \code{as_of}? 
As these methods use the last version of each observation carried forward (LOCF) to interpolate between the version data provided, rows that don't change these LOCF results can potentially be omitted to save space while maintaining the same behavior (with the help of the -\code{clobberable_versions_start} and \code{versions_end} fields in some -edge cases). \code{TRUE} will remove these rows, \code{FALSE} will not, and missing -or \code{NULL} will remove these rows and issue a warning. Generally, this can -be set to \code{TRUE}, but if you directly inspect or edit the fields of the -\code{epi_archive} such as its \code{DT}, you will have to determine whether -\code{compactify=TRUE} will produce the desired results. If compactification -here is removing a large proportion of the rows, this may indicate a -potential for space, time, or bandwidth savings upstream the data pipeline, -e.g., when fetching, storing, or preparing the input data \code{x}} +\code{clobberable_versions_start} and \code{versions_end} fields in some edge cases). +\code{TRUE} will remove these rows, \code{FALSE} will not, and missing or \code{NULL} will +remove these rows and issue a warning. Generally, this can be set to +\code{TRUE}, but if you directly inspect or edit the fields of the \code{epi_archive} +such as its \code{DT}, or rely on redundant updates to achieve a certain +behavior of the \code{ref_time_values} default in \code{epix_slide}, you will have to +determine whether \code{compactify=TRUE} will produce the desired results. If +compactification here is removing a large proportion of the rows, this may +indicate a potential for space, time, or bandwidth savings upstream the +data pipeline, e.g., by avoiding fetching, storing, or processing these +rows of \code{x}.} \item{\code{clobberable_versions_start}}{Optional; as in \code{\link{as_epi_archive}}} @@ -199,7 +203,7 @@ An \code{epi_archive} object. \if{latex}{\out{\hypertarget{method-epi_archive-print}{}}} \subsection{Method \code{print()}}{ \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$print()}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{epi_archive$print(class = TRUE, methods = TRUE)}\if{html}{\out{
}} } } @@ -210,7 +214,7 @@ An \code{epi_archive} object. Generates a snapshot in \code{epi_df} format as of a given version. See the documentation for the wrapper function \code{\link[=epix_as_of]{epix_as_of()}} for details. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$as_of(max_version, min_time_value = -Inf)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{epi_archive$as_of(max_version, min_time_value = -Inf, all_versions = FALSE)}\if{html}{\out{
}} } } @@ -237,6 +241,28 @@ version, which doesn't mutate the input archive but might alias its fields. } } \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-truncate_versions_after}{}}} +\subsection{Method \code{truncate_versions_after()}}{ +Filter to keep only older versions, mutating the archive by +potentially reseating but not mutating some fields. \code{DT} is likely, but not +guaranteed, to be copied. Returns the mutated archive +\link[base:invisible]{invisibly}. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{epi_archive$truncate_versions_after(max_version)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{max_version}}{as in \code{\link{epix_truncate_versions_after}}} + +\item{\code{x}}{as in \code{\link{epix_truncate_versions_after}}} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-epi_archive-merge}{}}} \subsection{Method \code{merge()}}{ @@ -265,6 +291,19 @@ does not alias either archive's \code{DT}. } \if{html}{\out{}} } +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-group_by}{}}} +\subsection{Method \code{group_by()}}{ +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{epi_archive$group_by( + ..., + .add = FALSE, + .drop = dplyr::group_by_drop_default(self) +)}\if{html}{\out{
}} +} + } \if{html}{\out{
}} \if{html}{\out{}} @@ -277,14 +316,13 @@ details. \if{html}{\out{
}}\preformatted{epi_archive$slide( f, ..., - n, - group_by, + before, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", - all_rows = FALSE + all_versions = FALSE )}\if{html}{\out{
}} } diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 5c5c363cd..33c3a7fb5 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -8,10 +8,9 @@ epi_slide( x, f, ..., - n, - ref_time_values, - align = c("right", "center", "left"), before, + after, + ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, @@ -20,69 +19,83 @@ epi_slide( ) } \arguments{ -\item{x}{The \code{epi_df} object under consideration.} - -\item{f}{Function or formula to slide over variables in \code{x}. To "slide" means -to apply a function or formula over a running window of \code{n} time steps -(where one time step is typically one day or one week; see details for more -explanation). If a function, \code{f} should take \code{x}, an \code{epi_df} with the same -names as the non-grouping columns, followed by \code{g} to refer to the one row -tibble with one column per grouping variable that identifies the group, -and any number of named arguments (which will be taken from \code{...}). If a -formula, \code{f} can operate directly on columns accessed via \code{.x$var}, as -in \code{~ mean(.x$var)} to compute a mean of a column var over a sliding -window of n time steps. As well, \code{.y} may be used in the formula to refer -to the groupings that would be described by \code{g} if \code{f} was a function.} +\item{x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} +or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a +single data group.} + +\item{f}{Function, formula, or missing; together with \code{...} specifies the +computation to slide. To "slide" means to apply a computation within a +sliding (a.k.a. "rolling") time window for each data group. The window is +determined by the \code{before} and \code{after} parameters described below. One time +step is typically one day or one week; see details for more explanation. If +a function, \code{f} must take a data frame with the same column names as +the original object, minus any grouping variables, containing the time +window data for one group-\code{ref_time_value} combination; followed by a +one-row tibble containing the values of the grouping variables for the +associated group; followed by any number of named arguments. If a formula, +\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as +in \code{~mean(.x$var)} to compute a mean of a column \code{var} for each +\code{ref_time_value}-group combination. The group key can be accessed via \code{.y}. +If \code{f} is missing, then \code{...} will specify the computation.} \item{...}{Additional arguments to pass to the function or formula specified -via \code{f}. Alternatively, if \code{f} is missing, then the current argument is -interpreted as an expression for tidy evaluation. See details.} - -\item{n}{Number of time steps to use in the running window. For example, if -\code{n = 7}, one time step is one day, and the alignment is "right", then to -produce a value on January 7 we apply the given function or formula to data -in between January 1 and 7.} +via \code{f}. Alternatively, if \code{f} is missing, then the \code{...} is interpreted as +an expression for tidy evaluation; in addition to referring to columns +directly by name, the expression has access to \code{.data} and \code{.env} pronouns +as in \code{dplyr} verbs, and can also refer to \code{.x}, \code{.group_key}, and +\code{.ref_time_value}. 
See details.}
+
+\item{before,after}{How far \code{before} and \code{after} each \code{ref_time_value} should
+the sliding window extend? At least one of these two arguments must be
+provided; the other's default will be 0. Any value provided for either
+argument must be a single, non-\code{NA}, non-negative,
+\link[vctrs:vec_cast]{integer-compatible} number of time steps. Endpoints of
+the window are inclusive. Common settings:
+\itemize{
+\item For trailing/right-aligned windows from \code{ref_time_value - time_step(k)}
+to \code{ref_time_value}: either pass \code{before=k} by itself, or pass
+\verb{before=k, after=0}.
+\item For center-aligned windows from \code{ref_time_value - time_step(k)} to
+\code{ref_time_value + time_step(k)}: pass \verb{before=k, after=k}.
+\item For leading/left-aligned windows from \code{ref_time_value} to
+\code{ref_time_value + time_step(k)}: either pass \code{after=k} by itself, or pass
+\verb{before=0, after=k}.
+}
+See "Details:" about the definition of a time step,
+(non)treatment of missing rows within the window, and avoiding warnings
+about \code{before} and \code{after} settings for a certain uncommon use case.}

 \item{ref_time_values}{Time values for sliding computations, meaning, each
 element of this vector serves as the reference time point for one sliding
 window. If missing, then this will be set to all unique time values in the
 underlying data table, by default.}

-\item{align}{One of "right", "center", or "left", indicating the alignment of
-the sliding window relative to the reference time point. If the alignment
-is "center" and \code{n} is even, then one more time point will be used after
-the reference time point than before. Default is "right".}
-
-\item{before}{Positive integer less than \code{n}, specifying the number of time
-points to use in the sliding window strictly before the reference time
-point. For example, setting \code{before = n-1} would be the same as setting
-\code{align = "right"}. The \code{before} argument allows for more flexible
-specification of alignment than the \code{align} parameter, and if specified,
-overrides \code{align}.}
-
 \item{time_step}{Optional function used to define the meaning of one time
 step, which if specified, overrides the default choice based on the
-\code{time_value} column. This function must take a positive integer and return
-an object of class \code{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this
-would only be meaningful if \code{time_value} is of class \code{POSIXct}).}
+\code{time_value} column. This function must take a non-negative integer and
+return an object of class \code{lubridate::period}. For example, we can use
+\code{time_step = lubridate::hours} in order to set the time step to be one hour
+(this would only be meaningful if \code{time_value} is of class \code{POSIXct}).}

 \item{new_col_name}{String indicating the name of the new column that will
 contain the derivative values. Default is "slide_value"; note that setting
 \code{new_col_name} equal to an existing column name will overwrite this column.}

-\item{as_list_col}{If the computations return data frames, should the slide
-result hold these in a single list column or try to unnest them? Default is
-\code{FALSE}, in which case a list object returned by \code{f} would be unnested
-(using \code{\link[tidyr:nest]{tidyr::unnest()}}), and the names of the resulting columns are given
-by prepending \code{new_col_name} to the names of the list elements.}
+\item{as_list_col}{Should the slide results be held in a list column, or be
+\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE},
+in which case a list object returned by \code{f} would be unnested (using
+\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames,
+the names of the resulting columns are given by prepending \code{new_col_name}
+to the names of the list elements.}

 \item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()}
 when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix
 from \code{new_col_name} entirely.}

 \item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in
-the output; otherwise, there will be one row for each time value in \code{x}
-that acts as a reference time value. Default is \code{FALSE}.}
+the output even with \code{ref_time_values} provided, with some type of missing
+value marker for the slide computation output column(s) for \code{time_value}s
+outside \code{ref_time_values}; otherwise, there will be one row for each row in
+\code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The
+missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type
+of the slide computation output. If using \code{as_list_col = TRUE}, note that
+the missing marker is a \code{NULL} entry in the list column; for certain
+operations, you might want to replace these \code{NULL} entries with a different
+\code{NA} marker.}
}
\value{
An \code{epi_df} object given by appending a new column to \code{x}, named
@@ -93,29 +106,44 @@ Slides a given function over variables in an \code{epi_df} object. See the \href
examples.
}
\details{
-To "slide" means to apply a function or formula over a running
-window of \code{n} time steps, where the unit (the meaning of one time step) is
-implicitly defined by the way the \code{time_value} column treats addition and
-subtraction; for example, if the time values are coded as \code{Date} objects,
-then one time step is one day, since \code{as.Date("2022-01-01") + 1} equals
+To "slide" means to apply a function or formula over a rolling
+window of time steps for each data group, where the window is anchored at a
+reference time and left and right endpoints are given by the \code{before} and
+\code{after} arguments. The unit (the meaning of one time step) is implicitly
+defined by the way the \code{time_value} column treats addition and subtraction;
+for example, if the time values are coded as \code{Date} objects, then one time
+step is one day, since \code{as.Date("2022-01-01") + 1} equals
 \code{as.Date("2022-01-02")}. Alternatively, the time step can be set explicitly
 using the \code{time_step} argument (which if specified would override the
-default choice based on \code{time_value} column). If less than \code{n} time steps
-are available at any given reference time value, then \code{epi_slide()} still
-attempts to perform the computation anyway (it does not require a complete
-window).
The issue of what to do with partial computations (those run on -incomplete windows) is therefore left up to the user, either through the -specified function or formula \code{f}, or through post-processing. +default choice based on \code{time_value} column). If there are not enough time +steps available to complete the window at any given reference time, then +\code{epi_slide()} still attempts to perform the computation anyway (it does not +require a complete window). The issue of what to do with partial +computations (those run on incomplete windows) is therefore left up to the +user, either through the specified function or formula \code{f}, or through +post-processing. For a centrally-aligned slide of \code{n} \code{time_value}s in a +sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the +number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} and +\code{after = n/2} when \code{n} is even. + +Sometimes, we want to experiment with various trailing or leading window +widths and compare the slide outputs. In the (uncommon) case where +zero-width windows are considered, manually pass both the \code{before} and +\code{after} arguments in order to prevent potential warnings. (E.g., \code{before=k} +with \code{k=0} and \code{after} missing may produce a warning. To avoid warnings, +use \verb{before=k, after=0} instead; otherwise, it looks too much like a +leading window was intended, but the \code{after} argument was forgotten or +misspelled.) If \code{f} is missing, then an expression for tidy evaluation can be specified, for example, as in: -\if{html}{\out{
}}\preformatted{epi_slide(x, cases_7dav = mean(cases), n = 7) +\if{html}{\out{
}}\preformatted{epi_slide(x, cases_7dav = mean(cases), before = 6) }\if{html}{\out{
}} which would be equivalent to: -\if{html}{\out{
}}\preformatted{epi_slide(x, function(x, ...) mean(x$cases), n = 7, +\if{html}{\out{
}}\preformatted{epi_slide(x, function(x, g) mean(x$cases), before = 6, new_col_name = "cases_7dav") }\if{html}{\out{
}} @@ -125,26 +153,38 @@ inferred from the given expression and overrides any name passed explicitly through the \code{new_col_name} argument. } \examples{ - # slide a 7-day trailing average formula on cases - jhu_csse_daily_subset \%>\% +# slide a 7-day trailing average formula on cases +jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(cases_7dav = mean(cases), n = 7, - align = "right") \%>\% + epi_slide(cases_7dav = mean(cases), before = 6) \%>\% # rmv a nonessential var. to ensure new col is printed dplyr::select(-death_rate_7d_av) - - # slide a left-aligned 7-day average - jhu_csse_daily_subset \%>\% + +# slide a 7-day leading average +jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(cases_7dav = mean(cases), n = 7, - align = "left") \%>\% + epi_slide(cases_7dav = mean(cases), after = 6) \%>\% + # rmv a nonessential var. to ensure new col is printed + dplyr::select(-death_rate_7d_av) + +# slide a 7-day centre-aligned average +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + epi_slide(cases_7dav = mean(cases), before = 3, after = 3) \%>\% # rmv a nonessential var. to ensure new col is printed dplyr::select(-death_rate_7d_av) - - # nested new columns - jhu_csse_daily_subset \%>\% - group_by(geo_value) \%>\% - epi_slide(a = data.frame(cases_2dav = mean(cases), - cases_2dma = mad(cases)), - n = 2, as_list_col = TRUE) + +# slide a 14-day centre-aligned average +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + epi_slide(cases_7dav = mean(cases), before = 6, after = 7) \%>\% + # rmv a nonessential var. to ensure new col is printed + dplyr::select(-death_rate_7d_av) + +# nested new columns +jhu_csse_daily_subset \%>\% + group_by(geo_value) \%>\% + epi_slide(a = data.frame(cases_2dav = mean(cases), + cases_2dma = mad(cases)), + before = 1, as_list_col = TRUE) } diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index 4053cd284..dcdf167db 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -4,7 +4,7 @@ \alias{epix_as_of} \title{Generate a snapshot from an \code{epi_archive} object} \usage{ -epix_as_of(x, max_version, min_time_value = -Inf) +epix_as_of(x, max_version, min_time_value = -Inf, all_versions = FALSE) } \arguments{ \item{x}{An \code{epi_archive} object} @@ -18,6 +18,13 @@ of the specified \code{max_version} (and whose time values are at least \item{min_time_value}{Time value specifying the min time value to permit in the snapshot. Default is \code{-Inf}, which effectively means that there is no minimum considered.} + +\item{all_versions}{If \code{all_versions = TRUE}, then the output will be in +\code{epi_archive} format, and contain rows in the specified \code{time_value} range +having \code{version <= max_version}. The resulting object will cover a +potentially narrower \code{version} and \code{time_value} range than \code{x}, depending +on user-provided arguments. Otherwise, there will be one row in the output +for the \code{max_version} of each \code{time_value}. Default is \code{FALSE}.} } \value{ An \code{epi_df} object. 
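# A minimal sketch (not part of this patch) showing how the new `all_versions`
# argument to `epix_as_of` can be used; it assumes the packaged
# `archive_cases_dv_subset` example archive and that `epiprocess` is attached:
#
#   library(epiprocess)
#
#   # Plain snapshot: an `epi_df` with one row per `time_value`, holding the
#   # most up-to-date values as of the given version.
#   snapshot <- epix_as_of(archive_cases_dv_subset,
#                          max_version = as.Date("2020-06-12"))
#
#   # With `all_versions = TRUE`: an `epi_archive` containing only update rows
#   # with `version <= max_version`, rather than an `epi_df`.
#   mini_archive <- epix_as_of(archive_cases_dv_subset,
#                              max_version = as.Date("2020-06-12"),
#                              all_versions = TRUE)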
diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index 9d76ac7c9..c0f07d88b 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -2,52 +2,72 @@ % Please edit documentation in R/methods-epi_archive.R \name{epix_slide} \alias{epix_slide} -\title{Slide a function over variables in an \code{epi_archive} object} +\title{Slide a function over variables in an \code{epi_archive} or \code{grouped_epi_archive}} \usage{ epix_slide( x, f, ..., - n, - group_by, + before, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", - all_rows = FALSE + all_versions = FALSE ) } \arguments{ -\item{x}{An \code{epi_archive} object.} +\item{x}{An \code{\link{epi_archive}} or \code{\link{grouped_epi_archive}} object. If ungrouped, +all data in \code{x} will be treated as part of a single data group.} -\item{f}{Function or formula to slide over variables in \code{x}. To "slide" means -to apply a function or formula over a running window of \code{n} time steps -(where one time step is typically one day or one week). If a function, \code{f} -must take \code{x}, a data frame with the same column names as the original -object; followed by any number of named arguments; and ending with -\code{...}. If a formula, \code{f} can operate directly on columns accessed via -\code{.x$var}, as in \code{~ mean(.x$var)} to compute a mean of a column \code{var} over a -sliding window of \code{n} time steps.} +\item{f}{Function, formula, or missing; together with \code{...} specifies the +computation to slide. To "slide" means to apply a computation over a +sliding (a.k.a. "rolling") time window for each data group. The window is +determined by the \code{before} parameter described below. One time step is +typically one day or one week; see \code{\link{epi_slide}} details for more +explanation. If a function, \code{f} must take an \code{epi_df} with the same +column names as the archive's \code{DT}, minus the \code{version} column; followed +by a one-row tibble containing the values of the grouping variables for +the associated group; followed by a reference time value, usually as a +\code{Date} object; followed by any number of named arguments. If a formula, +\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as +in \code{~ mean(.x$var)} to compute a mean of a column \code{var} for each +group-\code{ref_time_value} combination. The group key can be accessed via +\code{.y} or \code{.group_key}, and the reference time value can be accessed via +\code{.z} or \code{.ref_time_value}. If \code{f} is missing, then \code{...} will specify the +computation.} \item{...}{Additional arguments to pass to the function or formula specified -via \code{f}. Alternatively, if \code{f} is missing, then the current argument is -interpreted as an expression for tidy evaluation.} +via \code{f}. Alternatively, if \code{f} is missing, then \code{...} is interpreted as an +expression for tidy evaluation; in addition to referring to columns +directly by name, the expression has access to \code{.data} and \code{.env} pronouns +as in \code{dplyr} verbs, and can also refer to the \code{.group_key} and +\code{.ref_time_value}. See details of \code{\link{epi_slide}}.} -\item{n}{Number of time steps to use in the running window.
For example, if -\code{n = 7}, and one time step is one day, then to produce a value on January 7 -we apply the given function or formula to data in between January 1 and -7.} +\item{before}{How far \code{before} each \code{ref_time_value} should the sliding +window extend? If provided, should be a single, non-NA, +\link[vctrs:vec_cast]{integer-compatible} number of time steps. This window +endpoint is inclusive. For example, if \code{before = 7}, and one time step is +one day, then to produce a value for a \code{ref_time_value} of January 8, we +apply the given function or formula to data (for each group present) with +\code{time_value}s from January 1 onward, as they were reported on January 8. +For typical disease surveillance sources, this will not include any data +with a \code{time_value} of January 8, and, depending on the amount of reporting +latency, may not include January 7 or even earlier \code{time_value}s. (If +the archive were instead to hold nowcasts rather than regular surveillance +data, then we would indeed expect data for \code{time_value} January 8. If it +were to hold forecasts, then we would expect data for \code{time_value}s after +January 8, and the sliding window would extend as far after each +\code{ref_time_value} as needed to include all such \code{time_value}s.)} -\item{group_by}{The variable(s) to group by before slide computation. If -missing, then the keys in the underlying data table, excluding \code{time_value} -and \code{version}, will be used for grouping. To omit a grouping entirely, use -\code{group_by = NULL}.} - -\item{ref_time_values}{Time values for sliding computations, meaning, each -element of this vector serves as the reference time point for one sliding -window. If missing, then this will be set to all unique time values in the -underlying data table, by default.} +\item{ref_time_values}{Reference time values / versions for sliding +computations; each element of this vector serves both as the anchor point +for the \code{time_value} window for the computation and as the \code{max_version} +\code{as_of} which we fetch data in this window. If missing, then this will be set +to a regularly-spaced sequence of values covering the range of +\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will +be guessed (using the GCD of the skips between values).} \item{time_step}{Optional function used to define the meaning of one time step, which, if specified, overrides the default choice based on the @@ -59,20 +79,23 @@ would only be meaningful if \code{time_value} is of class \code{POSIXct}).} contain the slide values. Default is "slide_value"; note that setting \code{new_col_name} equal to an existing column name will overwrite this column.} -\item{as_list_col}{If the computations return data frames, should the slide -result hold these in a single list column or try to unnest them? Default is -\code{FALSE}, in which case a list object returned by \code{f} would be unnested -(using \code{\link[tidyr:nest]{tidyr::unnest()}}), and the names of the resulting columns are given -by prepending \code{new_col_name} to the names of the list elements.} +\item{as_list_col}{Should the slide results be held in a list column, or be +\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}?
Default is \code{FALSE}, +in which case a list object returned by \code{f} would be unnested (using +\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, +the names of the resulting columns are given by prepending \code{new_col_name} +to the names of the list elements.} \item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix from \code{new_col_name} entirely.} -\item{all_rows}{If \code{all_rows = TRUE}, then the output will have one row per -combination of grouping variables and unique time values in the underlying -data table. Otherwise, there will be one row in the output for each time -value in \code{x} that acts as a reference time value. Default is \code{FALSE}.} +\item{all_versions}{(Not the same as the \code{all_rows} parameter of \code{epi_slide}.) If +\code{all_versions = TRUE}, then \code{f} will be passed the version history (all +\code{version <= ref_time_value}) for rows having \code{time_value} between +\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be +passed only the most recent \code{version} for every unique \code{time_value}. +Default is \code{FALSE}.} } \value{ A tibble whose columns are: the grouping variables, \code{time_value}, @@ -89,23 +112,53 @@ performed on \strong{data that would have been available as of t}. See the examples. } \details{ -Two key distinctions between inputs to the current function and -\code{epi_slide()}: +A few key distinctions between the current function and \code{epi_slide()}: \enumerate{ -\item \code{epix_slide()} uses windows that are \strong{always right-aligned} (in -\code{epi_slide()}, custom alignments could be specified using the \code{align} or -\code{before} arguments). -\item \code{epix_slide()} uses a \code{group_by} to specify the grouping upfront (in -\code{epi_slide()}, this would be accomplished by a preceding function call to -\code{dplyr::group_by()}). -Apart from this, the interfaces between \code{epix_slide()} and \code{epi_slide()} are -the same. +\item In \code{f} functions for \code{epix_slide}, one should not assume that the input +data contains any rows with \code{time_value} matching the computation's +\code{ref_time_value} (accessible via \verb{attributes()$metadata$as_of}); for +typical epidemiological surveillance data, observations pertaining to a +particular time period (\code{time_value}) are first reported \code{as_of} some +instant after that time period has ended. +\item \code{epix_slide()} doesn't accept an \code{after} argument; its windows extend +from \code{before} time steps before a given \code{ref_time_value} through the last +\code{time_value} available as of version \code{ref_time_value} (typically, this +won't include \code{ref_time_value} itself, as observations about a particular +time interval (e.g., day) are only published after that time interval +ends); \code{epi_slide} windows extend from \code{before} time steps before a +\code{ref_time_value} through \code{after} time steps after \code{ref_time_value}.
+\item The input class and columns are similar but different: \code{epix_slide} +(with the default \code{all_versions=FALSE}) keeps all columns and the +\code{epi_df}-ness of the first argument to each computation; \code{epi_slide} only +provides the grouping variables in the second input, and will convert the +first input into a regular tibble if the grouping variables include the +essential \code{geo_value} column. (With \code{all_versions=TRUE}, \code{epix_slide} will +provide an \code{epi_archive} rather than an \code{epi_df} to each +computation.) +\item The output class and columns are similar but different: \code{epix_slide()} +returns a tibble containing only the grouping variables, \code{time_value}, and +the new column(s) from the slide computations, whereas \code{epi_slide()} +returns an \code{epi_df} with all original variables plus the new columns from +the slide computations. (Both will mirror the grouping or ungroupedness of +their input, with one exception: \code{epi_archive}s can have trivial +(zero-variable) groupings, but these will be dropped in \code{epix_slide} +results as they are not supported by tibbles.) +\item There are no size stability checks or element/row recycling in +\code{epix_slide}, unlike in \code{epi_slide}. (\code{epix_slide} is +roughly analogous to \code{\link[dplyr:group_map]{dplyr::group_modify}}, while \code{epi_slide} is roughly +analogous to \code{dplyr::mutate} followed by \code{dplyr::arrange}.) This is detailed +in the "advanced" vignette. +\item \code{all_rows} is not supported in \code{epix_slide}; since the slide +computations are allowed more flexibility in their outputs than in +\code{epi_slide}, we can't guess a good representation for missing computations +for excluded group-\code{ref_time_value} pairs. +\item The \code{ref_time_values} default for \code{epix_slide} is based on making an +evenly-spaced sequence out of the \code{version}s in the \code{DT} plus the +\code{versions_end}, rather than the \code{time_value}s. } -Note that the outputs are a similar but different: \code{epix_slide()} only -returns the grouping variables, \code{time_value}, and the new columns from -sliding, whereas \code{epi_slide()} returns all original variables plus the new -columns from sliding. +Apart from the above distinctions, the interfaces between \code{epix_slide()} and +\code{epi_slide()} are the same. Furthermore, the current function can be considerably slower than \code{epi_slide()}, for two reasons: (1) it must repeatedly fetch @@ -116,31 +169,102 @@ should never be used in place of \code{epi_slide()}, and only used when version-aware sliding is necessary (as is its purpose). Finally, this is simply a wrapper around the \code{slide()} method of the -\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then: +\code{epi_archive} and \code{grouped_epi_archive} classes, so if \code{x} is an +object of either of these classes, then: -\if{html}{\out{
}}\preformatted{epix_slide(x, new_var = comp(old_var), n = 120) +\if{html}{\out{
}}\preformatted{epix_slide(x, new_var = comp(old_var), before = 119) }\if{html}{\out{
}} is equivalent to: -\if{html}{\out{
}}\preformatted{x$slide(x, new_var = comp(old_var), n = 120) +\if{html}{\out{
}}\preformatted{x$slide(new_var = comp(old_var), before = 119) }\if{html}{\out{
}} } \examples{ -# these dates are reference time points for the 3 day average sliding window -# The resulting epi_archive ends up including data averaged from: -# 0 day which has no results, for 2020-06-01 -# 1 day, for 2020-06-02 -# 2 days, for the rest of the results -# never 3 days dur to data latency - -time_values <- seq(as.Date("2020-06-01"), - as.Date("2020-06-15"), - by = "1 day") -epix_slide(x = archive_cases_dv_subset, - f = ~ mean(.x$case_rate_7d_av), - n = 3, - group_by = geo_value, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av') +library(dplyr) + +# Reference time points for which we want to compute slide values: +ref_time_values <- seq(as.Date("2020-06-01"), + as.Date("2020-06-15"), + by = "1 day") + +# A simple (but not very useful) example (see the archive vignette for a more +# realistic one): +archive_cases_dv_subset \%>\% + group_by(geo_value) \%>\% + epix_slide(f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = ref_time_values, + new_col_name = 'case_rate_7d_av_recent_av') \%>\% + ungroup() +# We requested time windows that started 2 days before the corresponding time +# values. The actual number of `time_value`s in each computation depends on +# the reporting latency of the signal and `time_value` range covered by the +# archive (2020-06-01 -- 2021-11-30 in this example). In this case, we have +# * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically +# discarded +# * 1 `time_value`, for ref time 2020-06-02 +# * 2 `time_value`s, for the rest of the results +# * never the 3 `time_value`s we would get from `epi_slide`, since, because +# of data latency, we'll never have an observation +# `time_value == ref_time_value` as of `ref_time_value`. +# The example below shows this type of behavior in more detail. + +# Examining characteristics of the data passed to each computation with +# `all_versions=FALSE`. +archive_cases_dv_subset \%>\% + group_by(geo_value) \%>\% + epix_slide( + function(x, gk, rtv) { + tibble( + time_range = if(nrow(x) == 0L) { + "0 `time_value`s" + } else { + sprintf("\%s -- \%s", min(x$time_value), max(x$time_value)) + }, + n = nrow(x), + class1 = class(x)[[1L]] + ) + }, + before = 5, all_versions = FALSE, + ref_time_values = ref_time_values, names_sep=NULL) \%>\% + ungroup() \%>\% + arrange(geo_value, time_value) + +# --- Advanced: --- + +# `epix_slide` with `all_versions=FALSE` (the default) applies a +# version-unaware computation to several versions of the data. We can also +# use `all_versions=TRUE` to apply a version-*aware* computation to several +# versions of the data, again looking at characteristics of the data passed +# to each computation. 
In this case, each computation should expect an +# `epi_archive` containing the relevant version data: + +archive_cases_dv_subset \%>\% + group_by(geo_value) \%>\% + epix_slide( + function(x, gk, rtv) { + tibble( + versions_start = if (nrow(x$DT) == 0L) { + "NA (0 rows)" + } else { + toString(min(x$DT$version)) + }, + versions_end = x$versions_end, + time_range = if(nrow(x$DT) == 0L) { + "0 `time_value`s" + } else { + sprintf("\%s -- \%s", min(x$DT$time_value), max(x$DT$time_value)) + }, + n = nrow(x$DT), + class1 = class(x)[[1L]] + ) + }, + before = 5, all_versions = TRUE, + ref_time_values = ref_time_values, names_sep=NULL) \%>\% + ungroup() \%>\% + # Focus on one geo_value so we can better see the columns above: + filter(geo_value == "ca") \%>\% + select(-geo_value) + } diff --git a/man/epix_truncate_versions_after.Rd b/man/epix_truncate_versions_after.Rd new file mode 100644 index 000000000..8f7414184 --- /dev/null +++ b/man/epix_truncate_versions_after.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_archive.R +\name{epix_truncate_versions_after} +\alias{epix_truncate_versions_after} +\title{Filter an \code{epi_archive} object to keep only older versions} +\usage{ +epix_truncate_versions_after(x, max_version) +} +\arguments{ +\item{x}{An \code{epi_archive} object} + +\item{max_version}{Time value specifying the max version to permit in the +filtered archive. That is, the output archive will comprise rows of the +current archive data having \code{version} less than or equal to the +specified \code{max_version}} +} +\value{ +An \code{epi_archive} object +} +\description{ +Generates a filtered \code{epi_archive} from an \code{epi_archive} object, keeping +only rows with \code{version} falling on or before a specified date. +} diff --git a/man/group_by.epi_archive.Rd b/man/group_by.epi_archive.Rd new file mode 100644 index 000000000..aee0a07b8 --- /dev/null +++ b/man/group_by.epi_archive.Rd @@ -0,0 +1,147 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_archive.R, R/grouped_epi_archive.R +\name{group_by.epi_archive} +\alias{group_by.epi_archive} +\alias{grouped_epi_archive} +\alias{group_by.grouped_epi_archive} +\alias{groups.grouped_epi_archive} +\alias{ungroup.grouped_epi_archive} +\alias{is_grouped_epi_archive} +\alias{group_by_drop_default.grouped_epi_archive} +\title{\code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive}} +\usage{ +\method{group_by}{epi_archive}(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) + +\method{group_by}{grouped_epi_archive}(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) + +\method{groups}{grouped_epi_archive}(x) + +\method{ungroup}{grouped_epi_archive}(x, ...) + +is_grouped_epi_archive(x) + +\method{group_by_drop_default}{grouped_epi_archive}(.tbl) +} +\arguments{ +\item{.data}{An \code{epi_archive} or \code{grouped_epi_archive}} + +\item{...}{Similar to \code{\link[dplyr:group_by]{dplyr::group_by}} (see "Details:" for edge cases); +\itemize{ +\item For \code{group_by}: unquoted variable name(s) or other +\link[dplyr:dplyr_data_masking]{"data masking"} expression(s). 
It's possible to +use \code{\link[dplyr:mutate]{dplyr::mutate}}-like syntax here to calculate new columns on which to +perform grouping, but note that, if you are regrouping an already-grouped +\code{.data} object, the calculations will be carried out ignoring such grouping +(same as \link[dplyr:group_by]{in dplyr}). +\item For \code{ungroup}: either +\itemize{ +\item empty, in order to remove the grouping and output an \code{epi_archive}; or +\item variable name(s) or other \link[dplyr:dplyr_tidy_select]{"tidy-select"} +expression(s), in order to remove the matching variables from the list of +grouping variables, and output another \code{grouped_epi_archive}. +} +}} + +\item{.add}{Boolean. If \code{FALSE}, the default, the output will be grouped by +the variable selection from \code{...} only; if \code{TRUE}, the output will be +grouped by the current grouping variables plus the variable selection from +\code{...}.} + +\item{.drop}{As described in \code{\link[dplyr:group_by]{dplyr::group_by}}; determines treatment of +factor columns.} + +\item{x}{For \code{groups} or \code{ungroup}: a \code{grouped_epi_archive}; for +\code{is_grouped_epi_archive}: any object} + +\item{.tbl}{(For \code{group_by_drop_default}:) an \code{epi_archive} or +\code{grouped_epi_archive} (\code{epi_archive} dispatches to the S3 default method; +\code{grouped_epi_archive} dispatches its own S3 method)} +} +\description{ +\code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive} +} +\details{ +To match \code{dplyr}, \code{group_by} allows "data masking" (also referred to as +"tidy evaluation") expressions \code{...}, not just column names, in a way similar +to \code{mutate}. Note that replacing or removing key columns with these +expressions is disabled. + +\code{archive \%>\% group_by()} and other expressions that group or regroup by zero +columns (indicating that all rows should be treated as part of one large +group) will output a \code{grouped_epi_archive}, in order to enable the use of +\code{grouped_epi_archive} methods on the result. This is in slight contrast to +the same operations on tibbles and grouped tibbles, which will \emph{not} output a +\code{grouped_df} in these circumstances. + +Using \code{group_by} with \code{.add=FALSE} to override the existing grouping is +disabled; instead, \code{ungroup} first, then \code{group_by}. + +Mutation and aliasing: \code{group_by} tries to use a shallow copy of the \code{DT}, +introducing column-level aliasing between its input and its result. This +doesn't follow the general model for most \code{data.table} operations, which +seems to be that, given a nonaliased (i.e., unique) pointer to a +\code{data.table} object, its pointers to its columns should also be nonaliased. +If you mutate any of the columns of either the input or result, first ensure +that it is fine if columns of the other are also mutated, but do not rely on +this behavior occurring. Additionally, never perform mutation on the key +columns at all (except for strictly increasing transformations), as this will +invalidate sortedness assumptions about the rows. + +\code{group_by_drop_default} on (ungrouped) \code{epi_archive}s is expected to dispatch +to \code{group_by_drop_default.default} (but there is a dedicated method for +\code{grouped_epi_archive}s).
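A minimal sketch of the mutation-and-aliasing caveat above, assuming the bundled `archive_cases_dv_subset` dataset and its `case_rate_7d_av` column (`data.table::copy()` is one way to sever the aliasing before mutating):

library(epiprocess)
library(dplyr)
library(data.table)
grouped <- group_by(archive_cases_dv_subset, geo_value)
# Columns of the grouped result's DT may alias columns of the input's DT,
# so take a deep copy before mutating any non-key column:
safe_dt <- copy(ungroup(grouped)$DT)
safe_dt[, case_rate_7d_av := 2 * case_rate_7d_av]  # mutates only the copy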
+} +\examples{ + +grouped_archive = archive_cases_dv_subset \%>\% group_by(geo_value) + +# `print` for metadata and method listing: +grouped_archive \%>\% print() + +# The primary use for grouping is to perform a grouped `epix_slide`: + +archive_cases_dv_subset \%>\% + group_by(geo_value) \%>\% + epix_slide(f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = as.Date("2020-06-11") + 0:2, + new_col_name = 'case_rate_3d_av') \%>\% + ungroup() + +# ----------------------------------------------------------------- + +# Advanced: some other features of dplyr grouping are implemented: + +library(dplyr) +toy_archive = + tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) + ) \%>\% + mutate(age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version)) \%>\% + as_epi_archive(other_keys = "age_group") + +# The following are equivalent: +toy_archive \%>\% group_by(geo_value, age_group) +toy_archive \%>\% group_by(geo_value) \%>\% group_by(age_group, .add=TRUE) +grouping_cols = c("geo_value", "age_group") +toy_archive \%>\% group_by(across(all_of(grouping_cols))) + +# And these are equivalent: +toy_archive \%>\% group_by(geo_value) +toy_archive \%>\% group_by(geo_value, age_group) \%>\% ungroup(age_group) + +# To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): +toy_archive \%>\% group_by(geo_value) \%>\% groups() + +toy_archive \%>\% + group_by(geo_value, age_group, .drop=FALSE) \%>\% + epix_slide(f = ~ sum(.x$value), before = 20) \%>\% + ungroup() + +} diff --git a/man/growth_rate.Rd b/man/growth_rate.Rd index 173eff433..203d5d7d0 100644 --- a/man/growth_rate.Rd +++ b/man/growth_rate.Rd @@ -105,7 +105,7 @@ reference point is at most \code{h}. Note that the unit for this distance is implicitly defined by the \code{x} variable; for example, if \code{x} is a vector of \code{Date} objects, \code{h = 7}, and the reference point is January 7, then the sliding window contains all data in between January 1 and 14 (matching the -behavior of \code{epi_slide()} with \code{n = 2 * h} and \code{align = "center"}). +behavior of \code{epi_slide()} with \code{before = h - 1} and \code{after = h}). } \section{Additional Arguments}{ diff --git a/man/guess_period.Rd b/man/guess_period.Rd new file mode 100644 index 000000000..e03a13730 --- /dev/null +++ b/man/guess_period.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{guess_period} +\alias{guess_period} +\title{Use max valid period as guess for \code{period} of \code{ref_time_values}} +\usage{ +guess_period( + ref_time_values, + ref_time_values_arg = rlang::caller_arg(ref_time_values) +) +} +\arguments{ +\item{ref_time_values}{Vector containing time-interval-like or time-like +data, with at least two distinct values; it must be \code{\link{diff}}-able (e.g., a +\code{time_value} or \code{version} column), and adding \code{is.numeric} versions of its +\code{diff} result (via \code{as.integer} if its \code{typeof} is \code{"integer"}, otherwise +via \code{as.numeric}) should give a sensible result.} + +\item{ref_time_values_arg}{Optional, string; name to give \code{ref_time_values} +in error messages.
Defaults to quoting the expression the caller fed into +the \code{ref_time_values} argument.} +} +\value{ +\code{is.numeric}, length 1; attempts to match \code{typeof(ref_time_values)} +} +\description{ +Use max valid period as guess for \code{period} of \code{ref_time_values} +} diff --git a/man/is_epi_archive.Rd b/man/is_epi_archive.Rd index f8dcf2c89..5b1330048 100644 --- a/man/is_epi_archive.Rd +++ b/man/is_epi_archive.Rd @@ -4,10 +4,13 @@ \alias{is_epi_archive} \title{Test for \code{epi_archive} format} \usage{ -is_epi_archive(x) +is_epi_archive(x, grouped_okay = FALSE) } \arguments{ \item{x}{An object.} + +\item{grouped_okay}{Optional; Boolean; should a \code{grouped_epi_archive} also +count? Default is \code{FALSE}.} } \value{ \code{TRUE} if the object inherits from \code{epi_archive}. @@ -18,4 +21,15 @@ Test for \code{epi_archive} format \examples{ is_epi_archive(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive) is_epi_archive(archive_cases_dv_subset) # TRUE + +# By default, grouped_epi_archives don't count as epi_archives, as they may +# support a different set of operations from regular `epi_archives`. This +# behavior can be controlled by `grouped_okay`. +grouped_archive = archive_cases_dv_subset$group_by(geo_value) +is_epi_archive(grouped_archive) # FALSE +is_epi_archive(grouped_archive, grouped_okay=TRUE) # TRUE + +} +\seealso{ +\code{\link{is_grouped_epi_archive}} } diff --git a/man/max_version_with_row_in.Rd b/man/max_version_with_row_in.Rd index 0b2c6deb2..cca554fae 100644 --- a/man/max_version_with_row_in.Rd +++ b/man/max_version_with_row_in.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/archive.R \name{max_version_with_row_in} \alias{max_version_with_row_in} -\title{Default arg helper: \code{max(x$version)}, with error if \code{x} has 0 rows} +\title{\code{max(x$version)}, with error if \code{x} has 0 rows} \usage{ max_version_with_row_in(x) } diff --git a/man/print.epi_df.Rd b/man/print.epi_df.Rd index 9ac3af999..f5749d825 100644 --- a/man/print.epi_df.Rd +++ b/man/print.epi_df.Rd @@ -3,15 +3,9 @@ \name{print.epi_df} \alias{print.epi_df} \alias{summary.epi_df} -\alias{arrange.epi_df} -\alias{filter.epi_df} \alias{group_by.epi_df} -\alias{group_modify.epi_df} -\alias{mutate.epi_df} -\alias{relocate.epi_df} -\alias{rename.epi_df} -\alias{slice.epi_df} \alias{ungroup.epi_df} +\alias{group_modify.epi_df} \alias{unnest.epi_df} \title{Base S3 methods for an \code{epi_df} object} \usage{ @@ -19,24 +13,12 @@ \method{summary}{epi_df}(object, ...) -\method{arrange}{epi_df}(.data, ...) - -\method{filter}{epi_df}(.data, ...) - \method{group_by}{epi_df}(.data, ...) -\method{group_modify}{epi_df}(.data, ...) - -\method{mutate}{epi_df}(.data, ...) - -\method{relocate}{epi_df}(.data, ...) - -\method{rename}{epi_df}(.data, ...) - -\method{slice}{epi_df}(.data, ...) - \method{ungroup}{epi_df}(x, ...) +\method{group_modify}{epi_df}(.data, .f, ..., .keep = FALSE) + \method{unnest}{epi_df}(data, ...) } \arguments{ @@ -49,14 +31,15 @@ Currently unused.} \item{.data}{The \code{epi_df} object.} +\item{.f}{function or formula; see \code{\link[dplyr:group_map]{dplyr::group_modify}}} + +\item{.keep}{Boolean; see \code{\link[dplyr:group_map]{dplyr::group_modify}}} + \item{data}{The \code{epi_df} object.} } \description{ -Print, summary, and \code{dplyr} verbs (that preserve class and attributes) for an -\code{epi_df} object. +Print and summary functions for an \code{epi_df} object. 
Prints a variety of summary statistics about the \code{epi_df} object, such as the time range included and geographic coverage. - -\code{dplyr} verbs for \code{epi_df} objects, preserving class and attributes. } diff --git a/man/reexports.Rd b/man/reexports.Rd index b633e86c5..46e961d98 100644 --- a/man/reexports.Rd +++ b/man/reexports.Rd @@ -23,7 +23,7 @@ below to see their documentation. \describe{ \item{dplyr}{\code{\link[dplyr]{arrange}}, \code{\link[dplyr]{filter}}, \code{\link[dplyr]{group_by}}, \code{\link[dplyr:group_map]{group_modify}}, \code{\link[dplyr]{mutate}}, \code{\link[dplyr]{relocate}}, \code{\link[dplyr]{rename}}, \code{\link[dplyr]{slice}}, \code{\link[dplyr:group_by]{ungroup}}} - \item{tidyr}{\code{\link[tidyr:nest]{unnest}}} + \item{tidyr}{\code{\link[tidyr]{unnest}}} \item{tsibble}{\code{\link[tsibble:as-tsibble]{as_tsibble}}} }} diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R index d4c94e099..598825f6a 100644 --- a/tests/testthat/test-archive-version-bounds.R +++ b/tests/testthat/test-archive-version-bounds.R @@ -102,7 +102,8 @@ test_that("archive version bounds args work as intended", { regexp="versions_end.*must not satisfy.*is.na") ea_default = as_epi_archive(update_tbl) ea_default$as_of(measurement_date+4L) - expect_warning(ea_default$as_of(measurement_date+5L), + expect_warning(regexp=NA, + ea_default$as_of(measurement_date+5L), class = "epiprocess__snapshot_as_of_clobberable_version") expect_error(ea_default$as_of(measurement_date+6L), regexp = "max_version.*at most.*versions_end") diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index b24334d3f..790ac65f9 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -127,3 +127,25 @@ test_that("epi_archives are correctly instantiated with a variety of data types" expect_equal(key(ea12$DT),c("geo_value","time_value","misc","version")) expect_equal(ea12$additional_metadata,list(value=df$misc)) }) + +test_that("`epi_archive` rejects nonunique keys", { + toy_update_tbl = + tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130, + "us", "pediatric", "2000-01-01", "2000-01-02", 5 + ) %>% + mutate(age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version)) + expect_error( + as_epi_archive(toy_update_tbl), + class = "epiprocess__epi_archive_requires_unique_key" + ) + expect_error( + regexp = NA, + as_epi_archive(toy_update_tbl, other_keys = "age_group"), + ) +}) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 962747d90..f8d956c08 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -84,12 +84,8 @@ test_that("as_of produces the same results with compactify=TRUE as with compacti # Row 22, an LOCF row corresponding to the latest version, is omitted in # ea_true latest_version = max(ea_false$DT$version) - expect_warning({ - as_of_true <- ea_true$as_of(latest_version) - }, class = "epiprocess__snapshot_as_of_clobberable_version") - expect_warning({ - as_of_false <- ea_false$as_of(latest_version) - }, class = "epiprocess__snapshot_as_of_clobberable_version") + as_of_true <- ea_true$as_of(latest_version) + as_of_false <- ea_false$as_of(latest_version) expect_identical(as_of_true,as_of_false) }) diff --git 
a/tests/testthat/test-deprecations.R b/tests/testthat/test-deprecations.R new file mode 100644 index 000000000..334b4488d --- /dev/null +++ b/tests/testthat/test-deprecations.R @@ -0,0 +1,48 @@ + +test_that("epix_slide group_by= deprecation works",{ + expect_error( + archive_cases_dv_subset %>% + epix_slide(function(...) {}, before=2L, group_by=c()), + class = "epiprocess__epix_slide_group_by_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset$ + slide(function(...) {}, before=2L, group_by=c()), + class = "epiprocess__epix_slide_group_by_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset %>% + group_by(geo_value) %>% + epix_slide(function(...) {}, before=2L, group_by=c()), + class = "epiprocess__epix_slide_group_by_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset$ + group_by(geo_value)$ + slide(function(...) {}, before=2L, group_by=c()), + class = "epiprocess__epix_slide_group_by_parameter_deprecated" + ) + # + expect_error( + archive_cases_dv_subset %>% + epix_slide(function(...) {}, before=2L, all_rows=TRUE), + class = "epiprocess__epix_slide_all_rows_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset$ + slide(function(...) {}, before=2L, all_rows=TRUE), + class = "epiprocess__epix_slide_all_rows_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset %>% + group_by(geo_value) %>% + epix_slide(function(...) {}, before=2L, all_rows=TRUE), + class = "epiprocess__epix_slide_all_rows_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset$ + group_by(geo_value)$ + slide(function(...) {}, before=2L, all_rows=TRUE), + class = "epiprocess__epix_slide_all_rows_parameter_deprecated" + ) +}) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index f363ae349..23bab72fb 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -1,34 +1,522 @@ ## Create an epi. df and a function to test epi_slide with -edf = dplyr::bind_rows( - dplyr::tibble(geo_value = "ak", time_value = as.Date("2020-01-01") + 1:200, value=1:200), - dplyr::tibble(geo_value = "al", time_value=as.Date("2020-01-01") + 1:5, value=-(1:5)) +d <- as.Date("2020-01-01") + +ungrouped = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:200, value=1:200), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value=-(1:5)) ) %>% as_epi_df() +grouped = ungrouped %>% + group_by(geo_value) + +small_x = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value=11:15), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value=-(1:5)) +) %>% + as_epi_df(as_of = d + 6) %>% + group_by(geo_value) + + +f = function(x, g, t) dplyr::tibble(value=mean(x$value), count=length(x$value)) -f = function(x, ...) 
dplyr::tibble(value=mean(x$value), count=length(x$value)) +toy_edf = tibble::tribble( + ~geo_value, ~time_value, ~value , + "a" , 1:10 , 2L^( 1:10), + "b" , 1:10 , 2L^(11:20), + ) %>% + tidyr::unchop(c(time_value, value)) %>% + as_epi_df(as_of = 100) + +## --- These cases generate errors (or not): --- +test_that("`before` and `after` are both vectors of length 1", { + expect_error(epi_slide(grouped, f, before = c(0,1), after = 0, ref_time_values = d+3), + "`before`.*length-1") + expect_error(epi_slide(grouped, f, before = 1, after = c(0,1), ref_time_values = d+3), + "`after`.*length-1") +}) + +test_that("Test errors/warnings for discouraged features", { + expect_error(epi_slide(grouped, f, ref_time_values = d+1), + "Either or both of `before`, `after` must be provided.") + expect_warning(epi_slide(grouped, f, before = 0L, ref_time_values = d+1), + "`before==0`, `after` missing") + expect_warning(epi_slide(grouped, f, after = 0L, ref_time_values = d+1), + "`before` missing, `after==0`") + # Below cases should raise no errors/warnings: + expect_warning(epi_slide(grouped, f, before = 1L, ref_time_values = d+2),NA) + expect_warning(epi_slide(grouped, f, after = 1L, ref_time_values = d+2),NA) + expect_warning(epi_slide(grouped, f, before = 0L, after = 0L, ref_time_values = d+2),NA) +}) -## --- These cases generate the error: --- -test_that("`ref_time_values` + `align` that result in no slide data, generate the error", { - expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01")), - "starting and/or stopping times for sliding are out of bounds") # before the first, no data in the slide windows - expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01")+207L), - "starting and/or stopping times for sliding are out of bounds") # beyond the last, no data in window +test_that("Both `before` and `after` must be non-NA, non-negative, integer-compatible",{ + expect_error(epi_slide(grouped, f, before = -1L, ref_time_values = d+2L), + "`before`.*non-negative") + expect_error(epi_slide(grouped, f, before = 2L, after = -1L, ref_time_values = d+2L), + "`after`.*non-negative") + expect_error(epi_slide(grouped, f, before = "a", ref_time_values = d+2L), + regexp="before", class="vctrs_error_incompatible_type") + expect_error(epi_slide(grouped, f, before = 1L, after = "a", ref_time_values = d+2L), + regexp="after", class="vctrs_error_incompatible_type") + expect_error(epi_slide(grouped, f, before = 0.5, ref_time_values = d+2L), + regexp="before", class="vctrs_error_incompatible_type") + expect_error(epi_slide(grouped, f, before = 1L, after = 0.5, ref_time_values = d+2L), + regexp="after", class="vctrs_error_incompatible_type") + expect_error(epi_slide(grouped, f, before = NA, after = 1L, ref_time_values = d+2L), + "`before`.*non-NA") + expect_error(epi_slide(grouped, f, before = 1L, after = NA, ref_time_values = d+2L), + "`after`.*non-NA") + # Non-integer-class but integer-compatible values are allowed: + expect_error(epi_slide(grouped, f, before = 1, after = 1, ref_time_values = d+2L),NA) }) -test_that("`ref_time_values` + `align` that have some slide data, but generate the error due to ref. 
time being out of time range", { - expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01"), align="left"), - "starting and/or stopping times for sliding are out of bounds") # before the first, but we'd expect there to be data in the window - expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01")+201L), - "starting and/or stopping times for sliding are out of bounds") # beyond the last, but still with data in window +test_that("`ref_time_values` + `before` + `after` that result in no slide data, generate the error", { + expect_error(epi_slide(grouped, f, before=2L, ref_time_values = d), + "All `ref_time_values` must appear in `x\\$time_value`.") # before the first, no data in the slide windows + expect_error(epi_slide(grouped, f, before=2L, ref_time_values = d+207L), + "All `ref_time_values` must appear in `x\\$time_value`.") # beyond the last, no data in window +}) + +test_that("`ref_time_values` + `before` + `after` that have some slide data, but generate the error due to ref. time being out of time range (would also happen if they were in between `time_value`s)", { + expect_error(epi_slide(grouped, f, before=0L, after=2L, ref_time_values = d), + "All `ref_time_values` must appear in `x\\$time_value`.") # before the first, but we'd expect there to be data in the window + expect_error(epi_slide(grouped, f, before=2L, ref_time_values = d+201L), + "All `ref_time_values` must appear in `x\\$time_value`.") # beyond the last, but still with data in window +}) + +## --- These cases generate warnings (or not): --- +test_that("Warn user against having a blank `before`",{ + expect_warning(epi_slide(grouped, f, after = 1L, + ref_time_values = d+1L), NA) + expect_warning(epi_slide(grouped, f, before = 0L, after = 1L, + ref_time_values = d+1L), NA) }) ## --- These cases doesn't generate the error: --- test_that("these doesn't produce an error; the error appears only if the ref time values are out of the range for every group", { - expect_identical(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01")+200L) %>% + expect_identical(epi_slide(grouped, f, before=2L, ref_time_values = d+200L) %>% + ungroup() %>% dplyr::select("geo_value","slide_value_value"), dplyr::tibble(geo_value = "ak", slide_value_value = 199)) # out of range for one group - expect_identical(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-04")) %>% + expect_identical(epi_slide(grouped, f, before=2L, ref_time_values=d+3) %>% + ungroup() %>% dplyr::select("geo_value","slide_value_value"), dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2))) # not out of range for either group -}) \ No newline at end of file +}) + +test_that("computation output formats x as_list_col", { + # See `toy_edf` definition at top of file. + # We'll try 7d sum with a few formats. 
+ basic_result_from_size1 = tibble::tribble( + ~geo_value, ~time_value, ~value , ~slide_value , + "a" , 1:10 , 2L^( 1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), + "b" , 1:10 , 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), + ) %>% + tidyr::unchop(c(time_value, value, slide_value)) %>% + dplyr::arrange(time_value) %>% + as_epi_df(as_of = 100) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)), + basic_result_from_size1 + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), as_list_col = TRUE), + basic_result_from_size1 %>% dplyr::mutate(slide_value = as.list(slide_value)) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))), + basic_result_from_size1 %>% rename(slide_value_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), as_list_col = TRUE), + basic_result_from_size1 %>% + mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) + ) + # output naming functionality: + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + new_col_name = "result"), + basic_result_from_size1 %>% rename(result_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value_sum = sum(.x$value)), + names_sep = NULL), + basic_result_from_size1 %>% rename(value_sum = slide_value) + ) + # trying with non-size-1 computation outputs: + basic_result_from_size2 = tibble::tribble( + ~geo_value, ~time_value, ~value , ~slide_value , + "a" , 1:10 , 2L^( 1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), + "b" , 1:10 , 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE) + 1L, + ) %>% + tidyr::unchop(c(time_value, value, slide_value)) %>% + dplyr::arrange(time_value) %>% + as_epi_df(as_of = 100) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value) + 0:1), + basic_result_from_size2 + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value) + 0:1, as_list_col = TRUE), + basic_result_from_size2 %>% dplyr::mutate(slide_value = as.list(slide_value)) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value) + 0:1)), + basic_result_from_size2 %>% rename(slide_value_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value) + 0:1), as_list_col = TRUE), + basic_result_from_size2 %>% + mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) + ) +}) + +test_that("epi_slide alerts if the provided f doesn't take enough args", { + f_xgt = function(x, g, t) dplyr::tibble(value=mean(x$value), count=length(x$value)) + # If `regexp` is NA, asserts that there should be no errors/messages. + expect_error(epi_slide(grouped, f_xgt, before = 1L, ref_time_values = d+1), regexp = NA) + expect_warning(epi_slide(grouped, f_xgt, before = 1L, ref_time_values = d+1), regexp = NA) + + f_x_dots = function(x, ...) dplyr::tibble(value=mean(x$value), count=length(x$value)) + expect_warning(epi_slide(grouped, f_x_dots, before = 1L, ref_time_values = d+1), + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") +}) + +test_that("`ref_time_values` + `all_rows = TRUE` works", { + # See `toy_edf` definition at top of file. 
We'll do variants of a slide + # returning the following: + basic_full_result = tibble::tribble( + ~geo_value, ~time_value, ~value , ~slide_value , + "a" , 1:10 , 2L^( 1:10), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), + "b" , 1:10 , 2L^(11:20), data.table::frollsum(2L^(1:10) + 2L^(11:20), c(1:7,rep(7L, 3L)), adaptive=TRUE, na.rm=TRUE), + ) %>% + tidyr::unchop(c(time_value, value, slide_value)) %>% + dplyr::arrange(time_value) %>% + as_epi_df(as_of = 100) + # slide computations returning atomic vecs: + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value)), + basic_full_result + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), + ref_time_values = c(2L, 8L)), + basic_full_result %>% dplyr::filter(time_value %in% c(2L, 8L)) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ sum(.x$value), + ref_time_values = c(2L, 8L), all_rows = TRUE), + basic_full_result %>% + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + slide_value, NA_integer_)) + ) + # slide computations returning data frames: + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value))), + basic_full_result %>% dplyr::rename(slide_value_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L)), + basic_full_result %>% + dplyr::filter(time_value %in% c(2L, 8L)) %>% + dplyr::rename(slide_value_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE), + basic_full_result %>% + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + slide_value, NA_integer_)) %>% + dplyr::rename(slide_value_value = slide_value) + ) + # slide computations returning data frames with `as_list_col=TRUE`: + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + as_list_col = TRUE), + basic_full_result %>% + dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), + as_list_col = TRUE), + basic_full_result %>% + dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) %>% + dplyr::filter(time_value %in% c(2L, 8L)) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE, + as_list_col = TRUE), + basic_full_result %>% + dplyr::mutate(slide_value = purrr::map(slide_value, ~ data.frame(value = .x))) %>% + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + slide_value, list(NULL))) + ) + # slide computations returning data frames, `as_list_col = TRUE`, `unnest`: + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + as_list_col = TRUE) %>% + unnest(slide_value, names_sep = "_"), + basic_full_result %>% dplyr::rename(slide_value_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), + as_list_col = TRUE) %>% + unnest(slide_value, names_sep = "_"), + basic_full_result %>% + dplyr::filter(time_value %in% c(2L, 8L)) %>% + dplyr::rename(slide_value_value = slide_value) + ) + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value 
= sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE, + as_list_col = TRUE) %>% + unnest(slide_value, names_sep = "_"), + basic_full_result %>% + # XXX unclear exactly what we want in this case. Current approach is + # compatible with `vctrs::vec_detect_missing` but breaks `tidyr::unnest` + # compatibility + dplyr::filter(time_value %in% c(2L, 8L)) %>% + dplyr::rename(slide_value_value = slide_value) + ) + rework_nulls = function(slide_values_list) { + vctrs::vec_assign( + slide_values_list, + vctrs::vec_detect_missing(slide_values_list), + list(vctrs::vec_cast(NA, vctrs::vec_ptype_common(!!!slide_values_list))) + ) + } + expect_identical( + toy_edf %>% epi_slide(before = 6L, ~ data.frame(value = sum(.x$value)), + ref_time_values = c(2L, 8L), all_rows = TRUE, + as_list_col = TRUE) %>% + mutate(slide_value = rework_nulls(slide_value)) %>% + unnest(slide_value, names_sep = "_"), + basic_full_result %>% + dplyr::mutate(slide_value = dplyr::if_else(time_value %in% c(2L, 8L), + slide_value, NA_integer_)) %>% + dplyr::rename(slide_value_value = slide_value) + ) +}) + +test_that("`epi_slide` doesn't decay date output", { + expect_true( + ungrouped %>% + epi_slide(before = 5L, ~ as.Date("2020-01-01")) %>% + `[[`("slide_value") %>% + inherits("Date") + ) +}) + +test_that("basic grouped epi_slide computation produces expected output", { + # Also checks that we correctly remove extra rows and columns (`.real`) used + # to recover `ref_time_value`s. + expected_output = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=cumsum(11:15)), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=cumsum(-(1:5))) + ) %>% + group_by(geo_value) %>% + as_epi_df(as_of = d + 6) + + # formula + result1 <- epi_slide(small_x, f = ~sum(.x$value), before=50) + expect_identical(result1, expected_output) + + # function + result2 <- epi_slide(small_x, f = function(x, g, t) sum(x$value), before=50) + expect_identical(result2, expected_output) + + # dots + result3 <- epi_slide(small_x, slide_value = sum(value), before=50) + expect_identical(result3, expected_output) +}) + +test_that("ungrouped epi_slide computation completes successfully", { + expect_error( + small_x %>% + ungroup() %>% + epi_slide(before = 2, + slide_value = sum(.x$value)), + regexp=NA + ) +}) + +test_that("basic ungrouped epi_slide computation produces expected output", { + expected_output = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=cumsum(11:15)) + ) %>% + as_epi_df(as_of = d + 6) + + result1 <- small_x %>% + ungroup() %>% + filter(geo_value == "ak") %>% + epi_slide(before = 50, + slide_value = sum(.x$value)) + expect_identical(result1, expected_output) + + # Ungrouped with multiple geos + expected_output = dplyr::bind_rows( + dplyr::tibble( + geo_value = "ak", time_value = d + 1:5, value=11:15, slide_value=cumsum(11:15) + cumsum(-(1:5) + )), + dplyr::tibble( + geo_value = "al", time_value = d + 1:5, value=-(1:5), slide_value=cumsum(11:15) + cumsum(-(1:5)) + ) + ) %>% + as_epi_df(as_of = d + 6) %>% + arrange(time_value) + + result2 <- small_x %>% + ungroup() %>% + epi_slide(before = 50, + slide_value = sum(.x$value)) + expect_identical(result2, expected_output) +}) + +test_that("epi_slide computation via formula can use ref_time_value", { + expected_output = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), + dplyr::tibble(geo_value = "al", 
time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + ) %>% + group_by(geo_value) %>% + as_epi_df(as_of = d + 6) + + result1 <- small_x %>% + epi_slide(f = ~ .ref_time_value, + before = 50) + + expect_identical(result1, expected_output) + + result2 <- small_x %>% + epi_slide(f = ~ .z, + before = 50) + + expect_identical(result2, expected_output) + + result3 <- small_x %>% + epi_slide(f = ~ ..3, + before = 50) + + expect_identical(result3, expected_output) + + # Ungrouped with multiple geos + expected_output = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + ) %>% + as_epi_df(as_of = d + 6) %>% + arrange(time_value) + + result4 <- small_x %>% + ungroup() %>% + epi_slide(f = ~ .ref_time_value, + before = 50) + expect_identical(result4, expected_output) +}) + +test_that("epi_slide computation via function can use ref_time_value", { + expected_output = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + ) %>% + group_by(geo_value) %>% + as_epi_df(as_of = d + 6) + + result1 <- small_x %>% + epi_slide(f = function(x, g, t) t, + before = 2) + + expect_identical(result1, expected_output) +}) + +test_that("epi_slide computation via dots can use ref_time_value and group", { + # ref_time_value + expected_output = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + ) %>% + group_by(geo_value) %>% + as_epi_df(as_of = d + 6) + + result1 <- small_x %>% + epi_slide(before = 50, + slide_value = .ref_time_value) + + expect_identical(result1, expected_output) + + # `.{x,group_key,ref_time_value}` should be inaccessible from `.data` and + # `.env`. 
+ expect_error(small_x %>% + epi_slide(before = 50, + slide_value = .env$.ref_time_value) + ) + + # group_key + # Use group_key column + expected_output = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value="ak"), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value="al") + ) %>% + group_by(geo_value) %>% + as_epi_df(as_of = d + 6) + + result3 <- small_x %>% + epi_slide(before = 2, + slide_value = .group_key$geo_value) + + expect_identical(result3, expected_output) + + # Use entire group_key object + expected_output = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=1L), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=1L) + ) %>% + group_by(geo_value) %>% + as_epi_df(as_of = d + 6) + + result4 <- small_x %>% + epi_slide(before = 2, + slide_value = nrow(.group_key)) + + expect_identical(result4, expected_output) + + # Ungrouped with multiple geos + expected_output = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = 11:15, slide_value=d + 1:5), + dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = -(1:5), slide_value=d + 1:5) + ) %>% + as_epi_df(as_of = d + 6) %>% + arrange(time_value) + + result5 <- small_x %>% + ungroup() %>% + epi_slide(before = 50, + slide_value = .ref_time_value) + expect_identical(result5, expected_output) +}) + +test_that("epi_slide computation via dots outputs the same result using col names and the data var", { + expected_output <- small_x %>% + epi_slide(before = 2, + slide_value = max(time_value)) %>% + as_epi_df(as_of = d + 6) + + result1 <- small_x %>% + epi_slide(before = 2, + slide_value = max(.x$time_value)) + + expect_identical(result1, expected_output) +}) + +test_that("`epi_slide` can access objects inside of helper functions", { + helper = function(archive_haystack, time_value_needle) { + archive_haystack %>% epi_slide(has_needle = time_value_needle %in% time_value, before = 365000L) + } + expect_error( + helper(small_x, as.Date("2021-01-01")), + NA + ) +}) diff --git a/tests/testthat/test-epix_fill_through_version.R b/tests/testthat/test-epix_fill_through_version.R index 03e9c504b..1d78bf490 100644 --- a/tests/testthat/test-epix_fill_through_version.R +++ b/tests/testthat/test-epix_fill_through_version.R @@ -11,9 +11,10 @@ test_that("epix_fill_through_version mirrors input when it is sufficiently up to # edition 3, which is based on `waldo::compare` rather than `base::identical`; # `waldo::compare` in waldo >=0.3.1 appears (as of 0.4.0) to compare R6 # objects by contents rather than address (in a way that is tested but maybe - # not guaranteed via user docs). Use `local_edition` to ensure we use edition - # 3 here. - local_edition(3) + # not guaranteed via user docs). Use `testthat::local_edition` to ensure we + # use testthat edition 3 here (use `testthat::` to prevent ambiguity with + # `readr`). + testthat::local_edition(3) expect_identical(ea_orig, ea_trivial_fill_na1) expect_identical(ea_orig, ea_trivial_fill_na2) expect_identical(ea_orig, ea_trivial_fill_locf) @@ -30,9 +31,9 @@ test_that("epix_fill_through_version can extend observed versions, gives expecte ea_fill_na = epix_fill_through_version(ea_orig, later_unobserved_version, "na") ea_fill_locf = epix_fill_through_version(ea_orig, later_unobserved_version, "locf") - # We use edition 3 features here, passing `ignore_attr` to `waldo::compare`. 
- # Ensure we are using edition 3: - local_edition(3) + # We use testthat edition 3 features here, passing `ignore_attr` to + # `waldo::compare`. Ensure we are using edition 3: + testthat::local_edition(3) withCallingHandlers({ expect_identical(ea_fill_na$versions_end, later_unobserved_version) expect_identical(tibble::as_tibble(ea_fill_na$as_of(first_unobserved_version)), diff --git a/tests/testthat/test-epix_merge.R b/tests/testthat/test-epix_merge.R index cb2ef7459..265263f07 100644 --- a/tests/testthat/test-epix_merge.R +++ b/tests/testthat/test-epix_merge.R @@ -58,7 +58,7 @@ test_that("epix_merge merges and carries forward updates properly", { ) # We rely on testthat edition 3 expect_identical using waldo, not identical. See # test-epix_fill_through_version.R comments for details. - local_edition(3) + testthat::local_edition(3) expect_identical(xy, xy_expected) }) @@ -123,10 +123,12 @@ local({ local({ x = as_epi_archive( tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=10L), + clobberable_versions_start = 1L, versions_end = 3L ) y = as_epi_archive( - tibble::tibble(geo_value=1L, time_value=1L, version=5L, y_value=20L) + tibble::tibble(geo_value=1L, time_value=1L, version=5L, y_value=20L), + clobberable_versions_start = 1L ) test_that('epix_merge forbids on sync default or "forbid"', { expect_error(epix_merge(x,y), diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R new file mode 100644 index 000000000..b3353bac2 --- /dev/null +++ b/tests/testthat/test-epix_slide.R @@ -0,0 +1,699 @@ +library(dplyr) + +test_that("epix_slide only works on an epi_archive", { + expect_error(epix_slide(data.frame(x=1))) +}) + +x <- tibble::tribble(~version, ~time_value, ~binary, + 4, c(1:3), 2^(1:3), + 5, c(1:2,4), 2^(4:6), + 6, c(1:2,4:5), 2^(7:10), + 7, 2:6, 2^(11:15)) %>% + tidyr::unnest(c(time_value,binary)) + +xx <- bind_cols(geo_value = rep("x",15), x) %>% + as_epi_archive() + +test_that("epix_slide works as intended", { + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = ~ sum(.x$binary), + before = 2, + new_col_name = "sum_binary") + + xx2 <- tibble(geo_value = rep("x",4), + time_value = c(4,5,6,7), + sum_binary = c(2^3+2^2, + 2^6+2^3, + 2^10+2^9, + 2^15+2^14)) %>% + group_by(geo_value) + + expect_identical(xx1,xx2) # * + + xx3 <- ( + xx + $group_by(dplyr::across(dplyr::all_of("geo_value"))) + $slide(f = ~ sum(.x$binary), + before = 2, + new_col_name = 'sum_binary') + ) + + expect_identical(xx1,xx3) # This and * imply xx2 and xx3 are identical + + # function interface + xx4 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = function(x, gk, rtv) { + tibble::tibble(sum_binary = sum(x$binary)) + }, before = 2, names_sep = NULL) + + expect_identical(xx1,xx4) + + # tidyeval interface + xx5 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(sum_binary = sum(binary), + before = 2) + + expect_identical(xx1,xx5) +}) + +test_that("epix_slide works as intended with `as_list_col=TRUE`",{ + xx_dfrow1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = ~ data.frame(bin_sum = sum(.x$binary)), + before = 2, + as_list_col = TRUE) + + xx_dfrow2 <- tibble( + geo_value = rep("x",4), + time_value = c(4,5,6,7), + slide_value = + c(2^3+2^2, + 2^6+2^3, + 2^10+2^9, + 2^15+2^14) %>% + purrr::map(~ data.frame(bin_sum = .x)) + ) %>% + group_by(geo_value) + + expect_identical(xx_dfrow1,xx_dfrow2) # * + + xx_dfrow3 <- ( + xx + $group_by(dplyr::across(dplyr::all_of("geo_value"))) + $slide(f = ~ data.frame(bin_sum = sum(.x$binary)), + before = 2, + 
as_list_col = TRUE)
+  )
+
+  expect_identical(xx_dfrow1,xx_dfrow3) # This and * imply xx_dfrow2 and xx_dfrow3 are identical
+
+  xx_df1 <- xx %>%
+    group_by(.data$geo_value) %>%
+    epix_slide(f = ~ data.frame(bin = .x$binary),
+               before = 2,
+               as_list_col = TRUE)
+
+  xx_df2 <- tibble(
+    geo_value = rep("x",4),
+    time_value = c(4,5,6,7),
+    slide_value =
+      list(c(2^3,2^2),
+           c(2^6,2^3),
+           c(2^10,2^9),
+           c(2^15,2^14)) %>%
+      purrr::map(~ data.frame(bin = rev(.x)))
+  ) %>%
+    group_by(geo_value)
+
+  expect_identical(xx_df1,xx_df2)
+
+  xx_scalar1 <- xx %>%
+    group_by(.data$geo_value) %>%
+    epix_slide(f = ~ sum(.x$binary),
+               before = 2,
+               as_list_col = TRUE)
+
+  xx_scalar2 <- tibble(
+    geo_value = rep("x",4),
+    time_value = c(4,5,6,7),
+    slide_value =
+      list(2^3+2^2,
+           2^6+2^3,
+           2^10+2^9,
+           2^15+2^14)
+  ) %>%
+    group_by(geo_value)
+
+  expect_identical(xx_scalar1,xx_scalar2)
+
+  xx_vec1 <- xx %>%
+    group_by(.data$geo_value) %>%
+    epix_slide(f = ~ .x$binary,
+               before = 2,
+               as_list_col = TRUE)
+
+  xx_vec2 <- tibble(
+    geo_value = rep("x",4),
+    time_value = c(4,5,6,7),
+    slide_value =
+      list(c(2^3,2^2),
+           c(2^6,2^3),
+           c(2^10,2^9),
+           c(2^15,2^14)) %>%
+      purrr::map(rev)
+  ) %>%
+    group_by(geo_value)
+
+  expect_identical(xx_vec1,xx_vec2)
+})
+
+test_that("epix_slide `before` validation works", {
+  expect_error(xx$slide(f = ~ sum(.x$binary)),
+               "`before` is required")
+  expect_error(xx$slide(f = ~ sum(.x$binary), before=NA),
+               "`before`.*NA")
+  expect_error(xx$slide(f = ~ sum(.x$binary), before=-1),
+               "`before`.*negative")
+  expect_error(xx$slide(f = ~ sum(.x$binary), before=1.5),
+               regexp="before",
+               class="vctrs_error_incompatible_type")
+  # We might want to allow this at some point (issue #219):
+  expect_error(xx$slide(f = ~ sum(.x$binary), before=Inf),
+               regexp="before",
+               class="vctrs_error_incompatible_type")
+  # (wrapper shouldn't introduce a value:)
+  expect_error(epix_slide(xx, f = ~ sum(.x$binary)), "`before` is required")
+  # These `before` values should be accepted:
+  expect_error(xx$slide(f = ~ sum(.x$binary), before=0),
+               NA)
+  expect_error(xx$slide(f = ~ sum(.x$binary), before=2L),
+               NA)
+  expect_error(xx$slide(f = ~ sum(.x$binary), before=365000),
+               NA)
+})
+
+test_that("quosure passing issue in epix_slide is resolved + other potential issues", {
+  # (First part adapted from @examples)
+  time_values <- seq(as.Date("2020-06-01"),
+                     as.Date("2020-06-02"),
+                     by = "1 day")
+  # We only have one non-version, non-time key in the example archive. Add
+  # another so that we don't accidentally pass tests due to matching the
+  # default grouping.
+ ea = as_epi_archive(archive_cases_dv_subset$DT %>% + dplyr::mutate(modulus = seq_len(nrow(.)) %% 5L), + other_keys = "modulus", + compactify = TRUE) + reference_by_modulus = ea %>% + group_by(modulus) %>% + epix_slide(f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av') + reference_by_neither = ea %>% + group_by() %>% + epix_slide(f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av') + # test the passing-something-that-must-be-enquosed behavior: + # + # (S3 group_by behavior for this case is the `reference_by_modulus`) + expect_identical( + ea$group_by(modulus)$slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av' + ), + reference_by_modulus + ) + # test the .data pronoun behavior: + expect_identical( + epix_slide(x = ea %>% group_by(.data$modulus), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av'), + reference_by_modulus + ) + expect_identical( + ea$group_by(.data$modulus)$slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av' + ), + reference_by_modulus + ) + # test the passing across-all-of-string-literal behavior: + expect_identical( + epix_slide(x = ea %>% group_by(dplyr::across(all_of("modulus"))), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av'), + reference_by_modulus + ) + expect_identical( + ea$group_by(across(all_of("modulus")))$slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av' + ), + reference_by_modulus + ) + # test the passing-across-all-of-string-var behavior: + my_group_by = "modulus" + expect_identical( + epix_slide(x = ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av'), + reference_by_modulus + ) + expect_identical( + ea$group_by(dplyr::across(tidyselect::all_of(my_group_by)))$slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av' + ), + reference_by_modulus + ) + # test the default behavior (default in this case should just be grouping by neither): + expect_identical( + epix_slide(x = ea, + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av'), + reference_by_neither + ) + expect_identical( + ea$slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av' + ), + reference_by_neither + ) +}) + +ea <- tibble::tribble(~version, ~time_value, ~binary, + 2, 1:1, 2^(1:1), + 3, 1:2, 2^(2:1), + 4, 1:3, 2^(3:1), + 5, 1:4, 2^(4:1), + 6, 1:5, 2^(5:1), + 7, 1:6, 2^(6:1)) %>% + tidyr::unnest(c(time_value,binary)) %>% + mutate(geo_value = "x") %>% + as_epi_archive() + +test_that("epix_slide with all_versions option has access to all older versions", { + library(data.table) + # Make sure we're using testthat edition 3, where `expect_identical` doesn't + # actually mean `base::identical` but something more content-based using + # `waldo` package: + testthat::local_edition(3) + + slide_fn <- function(x, gk, rtv) { + return(tibble(n_versions = length(unique(x$DT$version)), + n_row = nrow(x$DT), + dt_class1 = 
class(x$DT)[[1L]],
+                  dt_key = list(key(x$DT))))
+  }
+
+  ea_orig_mirror = ea$clone(deep=TRUE)
+  ea_orig_mirror$DT <- copy(ea_orig_mirror$DT)
+
+  result1 <- ea %>% group_by() %>%
+    epix_slide(f = slide_fn,
+               before = 10^3,
+               names_sep = NULL,
+               all_versions = TRUE)
+
+  expect_true(inherits(result1, "tbl_df"))
+
+  result2 <- tibble::tribble(
+    ~time_value, ~n_versions, ~n_row, ~dt_class1, ~dt_key,
+    2, 1L, sum(1:1), "data.table", key(ea$DT),
+    3, 2L, sum(1:2), "data.table", key(ea$DT),
+    4, 3L, sum(1:3), "data.table", key(ea$DT),
+    5, 4L, sum(1:4), "data.table", key(ea$DT),
+    6, 5L, sum(1:5), "data.table", key(ea$DT),
+    7, 6L, sum(1:6), "data.table", key(ea$DT),
+  )
+
+  expect_identical(result1,result2) # *
+
+  result3 <- (
+    ea
+    $group_by()
+    $slide(f = slide_fn,
+           before = 10^3,
+           names_sep = NULL,
+           all_versions = TRUE)
+  )
+
+  expect_identical(result1,result3) # This and * imply result2 and result3 are identical
+
+  # formula interface
+  result4 <- ea %>% group_by() %>%
+    epix_slide(f = ~ slide_fn(.x, .y),
+               before = 10^3,
+               names_sep = NULL,
+               all_versions = TRUE)
+
+  expect_identical(result1,result4) # This and * imply result2 and result4 are identical
+
+  # tidyeval interface
+  result5 <- ea %>%
+    group_by() %>%
+    epix_slide(data = slide_fn(
+                 .data$clone(), # hack to convert from pronoun back to archive
+                 stop("slide_fn doesn't use group key, no need to prepare it")
+               ),
+               before = 10^3,
+               names_sep = NULL,
+               all_versions = TRUE)
+
+  expect_identical(result1,result5) # This and * imply result2 and result5 are identical
+
+  expect_identical(ea, ea_orig_mirror) # We shouldn't have mutated ea
+})
+
+test_that("as_of and epix_slide with long enough window are compatible", {
+  library(data.table)
+  testthat::local_edition(3)
+
+  # For all_versions = FALSE:
+
+  f1 = function(x, gk, rtv) {
+    tibble(
+      diff_mean = mean(diff(x$binary))
+    )
+  }
+  ref_time_value1 = 5
+
+  expect_identical(
+    ea$as_of(ref_time_value1) %>% f1() %>% mutate(time_value = ref_time_value1, .before=1L),
+    ea$slide(f1, before=1000L, ref_time_values=ref_time_value1, names_sep=NULL)
+  )
+
+  # For all_versions = TRUE:
+
+  f2 = function(x, gk, rtv) {
+    x %>%
+      # extract time&version-lag-1 data:
+      epix_slide(
+        function(subx, subgk, rtv) {
+          tibble(data = list(
+            subx %>%
+              filter(time_value == attr(subx, "metadata")$as_of - 1) %>%
+              rename(real_time_value = time_value, lag1 = binary)
+          ))
+        }, before = 1, names_sep = NULL
+      ) %>%
+      # assess as nowcast:
+      unnest(data) %>%
+      inner_join(x$as_of(x$versions_end), by = setdiff(key(x$DT), c("version"))) %>%
+      summarize(mean_abs_delta = mean(abs(binary - lag1)))
+  }
+  ref_time_value2 = 5
+
+  expect_identical(
+    ea$as_of(ref_time_value2, all_versions=TRUE) %>% f2() %>% mutate(time_value = ref_time_value2, .before=1L),
+    ea$slide(f2, before=1000L, ref_time_values=ref_time_value2, all_versions=TRUE, names_sep=NULL)
+  )
+
+  # Test the same sort of thing when grouping by geo in an archive with multiple geos.
+  ea_multigeo = ea$clone()
+  ea_multigeo$DT <- rbind(ea_multigeo$DT,
+                          copy(ea_multigeo$DT)[,geo_value:="y"][,binary:=-binary][])
+  setkeyv(ea_multigeo$DT, key(ea$DT))
+
+  expect_identical(
+    ea_multigeo %>%
+      group_by(geo_value) %>%
+      epix_slide(f2, before=1000L, ref_time_values=ref_time_value2, all_versions=TRUE, names_sep=NULL) %>%
+      filter(geo_value == "x"),
+    ea %>% # using `ea` here is like filtering `ea_multigeo` to `geo_value=="x"`
+      epix_as_of(ref_time_value2, all_versions=TRUE) %>%
+      f2() %>%
+      transmute(geo_value = "x", time_value = ref_time_value2, mean_abs_delta) %>%
+      group_by(geo_value)
+  )
+})
+
+test_that("epix_slide `f` is passed an ungrouped `epi_archive` when `all_versions=TRUE`",{
+  slide_fn <- function(x, gk, rtv) {
+    expect_true(is_epi_archive(x))
+    return(NA)
+  }
+
+  ea %>% group_by() %>%
+    epix_slide(f = slide_fn,
+               before = 1,
+               ref_time_values = 5,
+               new_col_name = "out",
+               all_versions = TRUE)
+})
+
+test_that("epix_slide with all_versions option works as intended", {
+  xx1 <- xx %>%
+    group_by(.data$geo_value) %>%
+    epix_slide(f = ~ sum(.x$DT$binary),
+               before = 2,
+               new_col_name = "sum_binary",
+               all_versions = TRUE)
+
+  xx2 <- tibble(geo_value = rep("x",4),
+                time_value = c(4,5,6,7),
+                sum_binary = c(2^3+2^2,
+                               2^6+2^3,
+                               2^10+2^9+2^6,
+                               2^15+2^14+2^10)) %>%
+    group_by(geo_value)
+
+  expect_identical(xx1,xx2) # *
+
+  xx3 <- (
+    xx
+    $group_by(dplyr::across(dplyr::all_of("geo_value")))
+    $slide(f = ~ sum(.x$DT$binary),
+           before = 2,
+           new_col_name = 'sum_binary',
+           all_versions = TRUE)
+  )
+
+  expect_identical(xx1,xx3) # This and * imply xx2 and xx3 are identical
+})
+
+# XXX currently, we're using a stopgap measure of having `epix_slide` always
+# output a (grouped/ungrouped) tibble while we think about the class, columns,
+# and attributes of `epix_slide` output more carefully. We might bring this test
+# back depending on the decisions there:
+#
+# test_that("`epix_slide` uses `versions_end` as a resulting `epi_df`'s `as_of`", {
+#   ea_updated_stale = ea$clone()
+#   ea_updated_stale$versions_end <- ea_updated_stale$versions_end + 3 # (dbl)
+#   #
+#   expect_identical(
+#     ea_updated_stale %>%
+#       group_by(geo_value) %>%
+#       epix_slide(~ slice_head(.x, n = 1L), before = 10L) %>%
+#       ungroup() %>%
+#       attr("metadata") %>%
+#       .$as_of,
+#     10
+#   )
+# })
+
+test_that("epix_slide works with 0-row computation outputs", {
+  epix_slide_empty = function(ea, ...) {
+    ea %>%
+      epix_slide(before = 5L, ..., function(x, gk, rtv) {
+        tibble::tibble()
+      })
+  }
+  expect_identical(
+    ea %>%
+      epix_slide_empty(),
+    tibble::tibble(
+      time_value = ea$DT$version[integer(0)]
+    )
+  )
+  expect_identical(
+    ea %>%
+      group_by(geo_value) %>%
+      epix_slide_empty(),
+    tibble::tibble(
+      geo_value = ea$DT$geo_value[integer(0)],
+      time_value = ea$DT$version[integer(0)]
+    ) %>%
+      # new_epi_df(geo_type = ea$geo_type, time_type = ea$time_type,
+      #            as_of = ea$versions_end) %>%
+      group_by(geo_value)
+  )
+  # with `all_versions=TRUE`, we have something similar but never get an
+  # `epi_df`:
+  expect_identical(
+    ea %>%
+      epix_slide_empty(all_versions=TRUE),
+    tibble::tibble(
+      time_value = ea$DT$version[integer(0)]
+    )
+  )
+  expect_identical(
+    ea %>%
+      group_by(geo_value) %>%
+      epix_slide_empty(all_versions=TRUE),
+    tibble::tibble(
+      geo_value = ea$DT$geo_value[integer(0)],
+      time_value = ea$DT$version[integer(0)]
+    ) %>%
+      group_by(geo_value)
+  )
+})
+
+# test_that("epix_slide grouped by geo can produce `epi_df` output", {
+#   # This is a characterization test.
Not sure we actually want this behavior; +# # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 +# expect_identical( +# ea %>% +# group_by(geo_value) %>% +# epix_slide(before = 5L, function(x,g) { +# tibble::tibble(value = 42) +# }, names_sep = NULL), +# tibble::tibble( +# geo_value = "x", +# time_value = epix_slide_ref_time_values_default(ea), +# value = 42 +# ) %>% +# new_epi_df(as_of = ea$versions_end) +# ) +# }) + +test_that("epix_slide alerts if the provided f doesn't take enough args", { + f_xgt = function(x, g, t) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + # If `regexp` is NA, asserts that there should be no errors/messages. + expect_error(epix_slide(xx, f = f_xgt, before = 2L), regexp = NA) + expect_warning(epix_slide(xx, f = f_xgt, before = 2L), regexp = NA) + + f_x_dots = function(x, ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + expect_warning(epix_slide(xx, f_x_dots, before = 2L), + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") +}) + +test_that("epix_slide computation via formula can use ref_time_value", { + xx_ref <- tibble(geo_value = rep("x",4), + time_value = c(4,5,6,7), + slide_value = c(4,5,6,7) + ) %>% + group_by(geo_value) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = ~ .ref_time_value, + before = 2) + + expect_identical(xx1, xx_ref) + + xx2 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = ~ .z, + before = 2) + + expect_identical(xx2, xx_ref) + + xx3 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = ~ ..3, + before = 2) + + expect_identical(xx3, xx_ref) +}) + +test_that("epix_slide computation via function can use ref_time_value", { + xx_ref <- tibble(geo_value = rep("x",4), + time_value = c(4,5,6,7), + slide_value = c(4,5,6,7) + ) %>% + group_by(geo_value) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(f = function(x, g, t) t, + before = 2) + + expect_identical(xx1, xx_ref) +}) + +test_that("epix_slide computation via dots can use ref_time_value and group", { + # ref_time_value + xx_ref <- tibble(geo_value = rep("x",4), + time_value = c(4,5,6,7), + slide_value = c(4,5,6,7) + ) %>% + group_by(geo_value) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(before = 2, + slide_value = .ref_time_value) + + expect_identical(xx1, xx_ref) + + # group_key + xx_ref <- tibble(geo_value = rep("x",4), + time_value = c(4,5,6,7), + slide_value = "x" + ) %>% + group_by(geo_value) + + # Use group_key column + xx3 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(before = 2, + slide_value = .group_key$geo_value) + + expect_identical(xx3, xx_ref) + + # Use entire group_key object + expect_error( + xx %>% + group_by(.data$geo_value) %>% + epix_slide(before = 2, + slide_value = nrow(.group_key)), + NA + ) +}) + +test_that("epix_slide computation via dots outputs the same result using col names and the data var", { + xx_ref <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(before = 2, + sum_binary = sum(time_value)) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide(before = 2, + sum_binary = sum(.x$time_value)) + + expect_identical(xx1, xx_ref) +}) + +test_that("`epix_slide` doesn't decay date output", { + expect_true( + xx$DT %>% + as_tibble() %>% + mutate(across(c(time_value, version), ~ as.Date("2000-01-01") + .x - 1L)) %>% + as_epi_archive() %>% + epix_slide(before = 5L, ~ attr(.x, "metadata")$as_of) %>% + `[[`("slide_value") %>% + inherits("Date") + ) +}) + +test_that("`epix_slide` can access 
objects inside of helper functions", { + helper = function(archive_haystack, time_value_needle) { + archive_haystack %>% epix_slide(has_needle = time_value_needle %in% time_value, before = 365000L) + } + expect_error( + helper(archive_cases_dv_subset, as.Date("2021-01-01")), + NA + ) + + expect_error( + helper(xx, 3L), + NA + ) +}) diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R new file mode 100644 index 000000000..0423352ef --- /dev/null +++ b/tests/testthat/test-grouped_epi_archive.R @@ -0,0 +1,88 @@ +test_that("Grouping, regrouping, and ungrouping archives works as intended", { + # From an example: + library(dplyr) + toy_archive = + tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) + ) %>% + mutate(age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version)) %>% + as_epi_archive(other_keys = "age_group") + + # Ensure that we're using testthat edition 3's idea of "identical", which is + # not as strict as `identical`: + testthat::local_edition(3) + + # Test equivalency claims in example: + by_both_keys = toy_archive %>% group_by(geo_value, age_group) + expect_identical( + by_both_keys, + toy_archive %>% group_by(geo_value) %>% group_by(age_group, .add=TRUE) + ) + grouping_cols = c("geo_value", "age_group") + expect_identical( + by_both_keys, + toy_archive %>% group_by(across(all_of(grouping_cols))) + ) + + expect_identical( + toy_archive %>% group_by(geo_value), + toy_archive %>% group_by(geo_value, age_group) %>% ungroup(age_group) + ) + + # Test `.drop` behavior: + expect_error(toy_archive %>% group_by(.drop = "bogus"), + regexp = "\\.drop.*TRUE or FALSE") + expect_warning(toy_archive %>% group_by(.drop=FALSE), + class="epiprocess__group_by_epi_archive__drop_FALSE_no_factors") + expect_warning(toy_archive %>% group_by(geo_value, .drop=FALSE), + class="epiprocess__group_by_epi_archive__drop_FALSE_no_factors") + expect_warning(grouped_factor_then_nonfactor <- + toy_archive %>% group_by(age_group, geo_value, .drop=FALSE), + class="epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor") + expect_identical(grouped_factor_then_nonfactor %>% + epix_slide(before = 10, s = sum(value)), + tibble::tribble( + ~age_group, ~geo_value, ~time_value, ~s, + "pediatric", NA_character_, "2000-01-02", 0, + "adult", "us", "2000-01-02", 121, + "pediatric", "us", "2000-01-03", 5, + "adult", "us", "2000-01-03", 255) %>% + mutate(age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value)) %>% + # # See + # # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 + # # and + # # https://github.com/cmu-delphi/epiprocess/pull/311#issuecomment-1535149256 + # # for why this is commented out, pending some design + # # decisions. 
+ # # + # as_epi_df(geo_type = "nation", # bug; want "custom" from NA; issue #242 + # as_of = as.Date("2000-01-03"), + # additional_metadata = list(other_keys = "age_group")) %>% + # # put back in expected order; see issue #166: + # select(age_group, geo_value, time_value, s) %>% + group_by(age_group, geo_value, .drop=FALSE)) + expect_identical(toy_archive %>% + group_by(geo_value, age_group, .drop=FALSE) %>% + epix_slide(before = 10, s = sum(value)), + tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~s, + "us", "pediatric", "2000-01-02", 0, + "us", "adult", "2000-01-02", 121, + "us", "pediatric", "2000-01-03", 5, + "us", "adult", "2000-01-03", 255) %>% + mutate(age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value)) %>% + # as_epi_df(as_of = as.Date("2000-01-03"), + # additional_metadata = list(other_keys = "age_group")) %>% + # # put back in expected order; see issue #166: + # select(geo_value, age_group, time_value, s) %>% + group_by(geo_value, age_group, .drop=FALSE) + ) +}) diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index d0434f598..3b6924754 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -2,6 +2,21 @@ library(dplyr) ea <- archive_cases_dv_subset$clone() +ea2_data <- tibble::tribble( + ~geo_value, ~time_value, ~version, ~cases, + "ca", "2020-06-01", "2020-06-01", 1, + "ca", "2020-06-01", "2020-06-02", 2, + # + "ca", "2020-06-02", "2020-06-02", 0, + "ca", "2020-06-02", "2020-06-03", 1, + "ca", "2020-06-02", "2020-06-04", 2, + # + "ca", "2020-06-03", "2020-06-03", 1, + # + "ca", "2020-06-04", "2020-06-04", 4, + ) %>% + dplyr::mutate(dplyr::across(c(time_value, version), as.Date)) + # epix_as_of tests test_that("epix_as_of behaves identically to as_of method",{ expect_identical(epix_as_of(ea,max_version = min(ea$DT$version)), @@ -17,29 +32,22 @@ test_that("Errors are thrown due to bad as_of inputs",{ expect_error(ea$as_of(c(as.Date("2020-01-01"), as.Date("2020-01-02")))) }) -test_that("Warning against max_version being same as edf's max version",{ - expect_warning(ea$as_of(max_version = max(ea$DT$version))) - expect_warning(ea$as_of(max_version = min(ea$DT$version)),NA) +test_that("Warning against max_version being clobberable",{ + # none by default + expect_warning(regexp = NA, ea$as_of(max_version = max(ea$DT$version))) + expect_warning(regexp = NA, ea$as_of(max_version = min(ea$DT$version))) + # but with `clobberable_versions_start` non-`NA`, yes + ea_with_clobberable = ea$clone() + ea_with_clobberable$clobberable_versions_start = max(ea_with_clobberable$DT$version) + expect_warning(ea_with_clobberable$as_of(max_version = max(ea$DT$version))) + expect_warning(regexp = NA, ea_with_clobberable$as_of(max_version = min(ea$DT$version))) }) test_that("as_of properly grabs the data and doesn't mutate key",{ d <- as.Date("2020-06-01") - ea2 = tibble::tribble( - ~geo_value, ~time_value, ~version, ~cases, - "ca", "2020-06-01", "2020-06-01", 1, - "ca", "2020-06-01", "2020-06-02", 2, - # - "ca", "2020-06-02", "2020-06-02", 0, - "ca", "2020-06-02", "2020-06-03", 1, - "ca", "2020-06-02", "2020-06-04", 2, - # - "ca", "2020-06-03", "2020-06-03", 1, - # - "ca", "2020-06-04", "2020-06-04", 4, - ) %>% - dplyr::mutate(dplyr::across(c(time_value, version), as.Date)) %>% + ea2 = ea2_data %>% as_epi_archive() old_key = data.table::key(ea2$DT) @@ -57,102 +65,74 @@ test_that("as_of properly grabs the data and doesn't mutate key",{ 
expect_equal(data.table::key(ea2$DT), old_key) }) -test_that("quosure passing issue in epix_slide is resolved + other potential issues", { - # (First part adapted from @examples) - time_values <- seq(as.Date("2020-06-01"), - as.Date("2020-06-02"), - by = "1 day") - # We only have one non-version, non-time key in the example archive. Add - # another so that we don't accidentally pass tests due to accidentally - # matching the default grouping. - ea = as_epi_archive(archive_cases_dv_subset$DT %>% - dplyr::mutate(modulus = seq_len(nrow(.)) %% 5L), - other_keys = "modulus", - compactify = TRUE) - reference_by_modulus = epix_slide(x = ea, - f = ~ mean(.x$case_rate_7d_av), - n = 3, - group_by = modulus, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av') - reference_by_both = epix_slide(x = ea, - f = ~ mean(.x$case_rate_7d_av), - n = 3, - group_by = c(geo_value, modulus), - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av') - # test the passing-something-that-must-be-enquosed behavior: - expect_identical( - ea$slide( - f = ~ mean(.x$case_rate_7d_av), - n = 3, - group_by = modulus, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av' - ), - reference_by_modulus - ) - # test the passing-string-literal behavior: - expect_identical( - epix_slide(x = ea, - f = ~ mean(.x$case_rate_7d_av), - n = 3, - group_by = "modulus", - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av'), - reference_by_modulus - ) - expect_identical( - ea$slide( - f = ~ mean(.x$case_rate_7d_av), - n = 3, - group_by = "modulus", - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av' - ), - reference_by_modulus - ) - # Might also want to test the passing-string-var-without-all_of behavior, but - # make sure to set, trigger, then reset (or restore to old value) the - # tidyselect once-per-session message about the ambiguity - # - # test the passing-all-of-string-var behavior: - my_group_by = "modulus" - expect_identical( - epix_slide(x = ea, - f = ~ mean(.x$case_rate_7d_av), - n = 3, - group_by = tidyselect::all_of(my_group_by), - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av'), - reference_by_modulus - ) - expect_identical( - ea$slide( - f = ~ mean(.x$case_rate_7d_av), - n = 3, - group_by = tidyselect::all_of(my_group_by), - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av' - ), - reference_by_modulus - ) - # test the default behavior (default in this case should just be "geo_value"): - expect_identical( - epix_slide(x = ea, - f = ~ mean(.x$case_rate_7d_av), - n = 3, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av'), - reference_by_both - ) - expect_identical( - ea$slide( - f = ~ mean(.x$case_rate_7d_av), - n = 3, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av' - ), - reference_by_both - ) +test_that("Errors are thrown due to bad epix_truncate_versions_after inputs",{ + # x must be an archive + expect_error(epix_truncate_versions_after(data.frame(), as.Date("2020-01-01"))) + # max_version cannot be of string class rather than date class + expect_error(epix_truncate_versions_after(ea, "2020-01-01")) + # max_version cannot be a vector + expect_error(epix_truncate_versions_after(ea, c(as.Date("2020-01-01"), as.Date("2020-01-02")))) + # max_version cannot be missing + expect_error(epix_truncate_versions_after(ea, as.Date(NA))) + # max_version cannot be after latest version in archive + expect_error(epix_truncate_versions_after(ea, as.Date("2025-01-01"))) +}) + 
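+# A minimal illustration of the behavior the tests below pin down (a sketch,
+# assuming only the `ea2_data` fixture defined at the top of this file):
+# truncation keeps only the rows observed at versions up through `max_version`.
+test_that("epix_truncate_versions_after drops all updates after max_version (illustration)", {
+  ea2 <- ea2_data %>%
+    as_epi_archive()
+
+  ea_truncated <- ea2 %>%
+    epix_truncate_versions_after(max_version = as.Date("2020-06-02"))
+
+  # Only the three updates with version <= 2020-06-02 remain:
+  expect_equal(nrow(ea_truncated$DT), 3L)
+})
+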
+test_that("epix_truncate_version_after properly grabs the data and doesn't mutate key", { + + ea2 = ea2_data %>% + as_epi_archive() + + old_key = data.table::key(ea2$DT) + + ea_as_of <- ea2 %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-02")) + + ea_expected <- ea2_data[1:3,] %>% + as_epi_archive() + + expect_equal(ea_as_of, ea_expected, ignore_attr=c(".internal.selfref", "sorted")) + expect_equal(data.table::key(ea2$DT), old_key) +}) + +test_that("epix_truncate_version_after doesn't filter if max_verion at latest version", { + + ea2 = ea2_data %>% + as_epi_archive() + + ea_expected <- ea2$clone() + + ea_as_of <- ea2 %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-04")) + expect_equal(ea_as_of, ea_expected, ignore_attr=c(".internal.selfref", "sorted")) +}) + +test_that("epix_truncate_version_after returns the same grouping type as input epi_archive", { + + ea2 = ea2_data %>% + as_epi_archive() + + ea_as_of <- ea2 %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-04")) + expect_true(is_epi_archive(ea_as_of, grouped_okay=FALSE)) + + ea2_grouped = ea2$group_by(geo_value) + + ea_as_of <- ea2_grouped %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-04")) + expect_true(is_grouped_epi_archive(ea_as_of)) +}) + + +test_that("epix_truncate_version_after returns the same groups as input grouped_epi_archive", { + + ea2 = ea2_data %>% + as_epi_archive() + ea2 = ea2$group_by(geo_value) + + ea_expected <- ea2$clone() + + ea_as_of <- ea2 %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-04")) + expect_equal(ea_as_of$groups(), ea_expected$groups()) }) diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R index de43d7c27..9d03cf939 100644 --- a/tests/testthat/test-methods-epi_df.R +++ b/tests/testthat/test-methods-epi_df.R @@ -116,3 +116,11 @@ test_that("Correct metadata when subset includes some of other_keys", { # Including both original other_keys was already tested above }) +test_that("Metadata and grouping are dropped by `as_tibble`", { + grouped_converted = toy_epi_df %>% + group_by(geo_value) %>% + as_tibble() + expect_true( + !any(c("metadata", "groups") %in% names(attributes(grouped_converted))) + ) +}) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 08b28c97e..4b1c38d2a 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -107,4 +107,113 @@ test_that("enlist works",{ my_list <- enlist(x=1,y=2,z=3) expect_equal(my_list$x,1) expect_true(inherits(my_list,"list")) -}) \ No newline at end of file +}) + +test_that("assert_sufficient_f_args alerts if the provided f doesn't take enough args", { + f_xg = function(x, g) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + f_xg_dots = function(x, g, ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + + # If `regexp` is NA, asserts that there should be no errors/messages. + expect_error(assert_sufficient_f_args(f_xg), regexp = NA) + expect_warning(assert_sufficient_f_args(f_xg), regexp = NA) + expect_error(assert_sufficient_f_args(f_xg_dots), regexp = NA) + expect_warning(assert_sufficient_f_args(f_xg_dots), regexp = NA) + + f_x_dots = function(x, ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + f_dots = function(...) 
dplyr::tibble(value=c(5), count=c(2)) + f_x = function(x) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + f = function() dplyr::tibble(value=c(5), count=c(2)) + + expect_warning(assert_sufficient_f_args(f_x_dots), + regexp = ", the group key will be included", + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") + expect_warning(assert_sufficient_f_args(f_dots), + regexp = ", the window data and group key will be included", + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") + expect_error(assert_sufficient_f_args(f_x), + class = "epiprocess__assert_sufficient_f_args__f_needs_min_args") + expect_error(assert_sufficient_f_args(f), + class = "epiprocess__assert_sufficient_f_args__f_needs_min_args") + + f_xs_dots = function(x, setting="a", ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + f_xs = function(x, setting="a") dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + expect_warning(assert_sufficient_f_args(f_xs_dots, setting="b"), + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots") + expect_error(assert_sufficient_f_args(f_xs, setting="b"), + class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded") + + expect_error(assert_sufficient_f_args(f_xg, "b"), + class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded") +}) + +test_that("assert_sufficient_f_args alerts if the provided f has defaults for the required args", { + f_xg = function(x, g=1) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + f_xg_dots = function(x=1, g, ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + f_x_dots = function(x=1, ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + + expect_error(assert_sufficient_f_args(f_xg), + regexp = "pass the group key to `f`'s g argument,", + class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults") + expect_error(assert_sufficient_f_args(f_xg_dots), + regexp = "pass the window data to `f`'s x argument,", + class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults") + expect_error(suppressWarnings(assert_sufficient_f_args(f_x_dots)), + class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults") + + f_xsg = function(x, setting="a", g) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + f_xsg_dots = function(x, setting="a", g, ...) dplyr::tibble(value=mean(x$binary), count=length(x$binary)) + f_xs_dots = function(x=1, setting="a", ...) 
dplyr::tibble(value=mean(x$binary), count=length(x$binary))
+
+  # forwarding named dots should prevent some complaints:
+  expect_no_error(assert_sufficient_f_args(f_xsg, setting = "b"))
+  expect_no_error(assert_sufficient_f_args(f_xsg_dots, setting = "b"))
+  expect_error(suppressWarnings(assert_sufficient_f_args(f_xs_dots, setting = "b")),
+               regexp = "window data to `f`'s x argument",
+               class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults")
+
+  # forwarding unnamed dots should not:
+  expect_error(assert_sufficient_f_args(f_xsg, "b"),
+               class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults")
+  expect_error(assert_sufficient_f_args(f_xsg_dots, "b"),
+               class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults")
+  expect_error(assert_sufficient_f_args(f_xs_dots, "b"),
+               class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults")
+
+  # forwarding no dots should produce a different error message in some cases:
+  expect_error(assert_sufficient_f_args(f_xs_dots),
+               regexp = "window data and group key to `f`'s x and setting argument",
+               class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults")
+})
+
+test_that("computation formula-derived functions take all argument types", {
+  # positional
+  expect_identical(as_slide_computation(~ ..2 + ..3)(1, 2, 3), 5)
+  expect_identical(as_slide_computation(~ ..1)(1, 2, 3), 1)
+  # Matching rlang, purrr, dplyr usage
+  expect_identical(as_slide_computation(~ .x + .z)(1, 2, 3), 4)
+  expect_identical(as_slide_computation(~ .x + .y)(1, 2, 3), 3)
+  # named
+  expect_identical(as_slide_computation(~ . + .ref_time_value)(1, 2, 3), 4)
+  expect_identical(as_slide_computation(~ .group_key)(1, 2, 3), 2)
+})
+
+test_that("as_slide_computation passes functions unaltered", {
+  f <- function(a, b, c) {a * b * c + 5}
+  expect_identical(as_slide_computation(f), f)
+})
+
+test_that("as_slide_computation raises errors as expected", {
+  # Formulas must be one-sided
+  expect_error(as_slide_computation(y ~ ..1),
+               class="epiprocess__as_slide_computation__formula_is_twosided")
+
+  # `f_env` must be an environment
+  formula_without_env <- stats::as.formula(~ ..1)
+  rlang::f_env(formula_without_env) <- 5
+  expect_error(as_slide_computation(formula_without_env),
+               class="epiprocess__as_slide_computation__formula_has_no_env")
+
+  # `f` must be a function, formula, or string
+  expect_error(as_slide_computation(5),
+               class="epiprocess__as_slide_computation__cant_convert_catchall")
+})
diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd
index 5514eaaaf..022889053 100644
--- a/vignettes/advanced.Rmd
+++ b/vignettes/advanced.Rmd
@@ -7,21 +7,71 @@ vignette: >
  %\VignetteEncoding{UTF-8}
---

-In this vignette, we discuss how to use the sliding functionality in the
-`epiprocess` package with computations that have advanced output structures.
-
-In general, the functions `epi_slide()` and `epix_slide()` do what they can to
-ensure the result of a slide operation is *size stable*, meaning, it will return
-something whose length is the same as the number of appearances of reference
-time values for the slide computation in the given data frame/table (this
-defaults to all time values, but can be some given subset when `ref_time_values`
-is specified).
-
-The output of a slide computation should either be an atomic value/vector, or a
-data frame. This data frame can have multiple columns, multiple rows, or both.
-Below we demonstrate some advanced use cases of sliding with these output
-structures. We focus on `epi_slide()` for the most part, though the behavior we
-demonstrate also carries over to `epix_slide()`.
+In this vignette, we discuss how to use the sliding functionality in the
+`epiprocess` package with less common grouping schemes or with computations that
+have advanced output structures.
+The output of a slide computation should either be an atomic value/vector, or a
+data frame. This data frame can have multiple columns, multiple rows, or both.
+
+During basic usage (e.g., when all optional arguments are set to their defaults):
+
+* `epi_slide(edf, <computation>, .....)`:
+
+  * keeps **all** columns of `edf`, adds computed column(s)
+  * outputs **one row per row in `edf`** (recycling outputs from
+    computations appropriately if there are multiple time series bundled
+    together inside any group(s))
+  * maintains the grouping or ungroupedness of `edf`
+  * is roughly analogous to (the non-sliding) **`dplyr::mutate` followed by
+    `dplyr::arrange(time_value, .by_group = TRUE)`**
+  * outputs an **`epi_df`** if the required columns are present, otherwise a
+    tibble
+
+* `epix_slide(ea, <computation>, .....)`:
+
+  * keeps **grouping and `time_value`** columns of `ea`, adds computed
+    column(s)
+  * outputs **any number of rows** (computations are allowed to output any
+    number of elements/rows, and no recycling is performed)
+  * maintains the grouping or ungroupedness of `ea`, unless it was explicitly
+    grouped by zero variables; this isn't supported by `grouped_df` and it will
+    automatically turn into an ungrouped tibble
+  * is roughly analogous to (the non-sliding) **`dplyr::group_modify`**
+  * outputs a **tibble**
+
+These differences in basic behavior make some common slide operations require less boilerplate:
+
+* predictors and targets calculated with `epi_slide` are automatically lined up
+  with each other and with the signals from which they were calculated; and
+* computations for an `epix_slide` can output data frames with any number of
+  rows, containing models, forecasts, evaluations, etc., and will not be
+  recycled.
+
+When using more advanced features, more complex rules apply:
+
+* Generalization: `epi_slide(edf, ....., ref_time_values=my_ref_time_values)`
+  will output one row for every row in `edf` with `time_value` appearing inside
+  `my_ref_time_values`, and is analogous to a `dplyr::mutate`&`dplyr::arrange`
+  followed by `dplyr::filter` to those `ref_time_values`. We call this property
+  **size stability**, and describe how it is achieved in the following sections.
+  The default behavior described above is a special case of this general rule
+  based on a default value of `ref_time_values`.
+* Exception/feature: `epi_slide(edf, ....., ref_time_values=my_ref_time_values,
+  all_rows=TRUE)` will not just output rows for `my_ref_time_values`, but
+  instead will output one row per row in `edf`.
+* Exception/feature: `epi_slide(edf, ....., as_list_col=TRUE)` will format the
+  output to add a single list-class computed column.
+* Exception/feature: `epix_slide(ea, ....., as_list_col=TRUE)` will format the
+  output to have one row per computation and a single list-class computed column
+  (in addition to the grouping variables and `time_value`), as if we had used
+  `tidyr::chop()` or `tidyr::nest()`.
+* Clarification: `ea %>% group_by(....., .drop=FALSE) %>%
+  epix_slide(<computation>, .....)` will call the computation on any missing
+  groups according to `dplyr`'s `.drop=FALSE` rules, resulting in additional
+  output rows.
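+
+To make the `mutate`-like versus `group_modify`-like contrast concrete, here is
+a small sketch. (The toy data set below is invented purely for illustration, and
+the `version = time_value` line just pretends that each observation was reported
+on its own date.)
+
+```{r}
+library(epiprocess)
+library(dplyr)
+
+toy <- tibble(
+  geo_value = rep(c("ak", "al"), each = 3),
+  time_value = rep(as.Date("2020-01-01") + 0:2, 2),
+  value = 1:6
+)
+
+# epi_slide: one output row per input row; keeps `value`; result is an epi_df:
+toy %>%
+  as_epi_df(as_of = as.Date("2020-01-04")) %>%
+  group_by(geo_value) %>%
+  epi_slide(mean_value = mean(value), before = 1)
+
+# epix_slide: keeps only the grouping columns, `time_value`, and the computed
+# column; result is a tibble:
+toy %>%
+  mutate(version = time_value) %>%
+  as_epi_archive() %>%
+  group_by(geo_value) %>%
+  epix_slide(mean_value = mean(value), before = 1)
+```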
+
+Below we demonstrate some advanced use cases of sliding with different output
+structures. We focus on `epi_slide()` for the most part, though some of the
+behavior we demonstrate also carries over to `epix_slide()`.

## Recycling outputs

@@ -35,7 +85,7 @@ simple synthetic example.
library(epiprocess)
library(dplyr)

-df <- tibble(
+edf <- tibble(
  geo_value = rep(c("ca", "fl", "pa"), each = 3),
  time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"),
                       by = "day"), length.out = length(geo_value)),
@@ -44,29 +94,36 @@ df <- tibble(
  as_epi_df()

# 2-day trailing average, per geo value
-df %>%
+edf %>%
  group_by(geo_value) %>%
-  epi_slide(x_2dav = mean(x), n = 2)
+  epi_slide(x_2dav = mean(x), before = 1) %>%
+  ungroup()

# 2-day trailing average, marginally
-df %>%
-  epi_slide(x_2dav = mean(x), n = 2)
+edf %>%
+  epi_slide(x_2dav = mean(x), before = 1)
```

```{r, include = FALSE}
# More checks (not included)
-df %>%
-  epi_slide(x_2dav = mean(x), n = 2, ref_time_values = as.Date("2020-06-02"))
+edf %>%
+  epi_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02"))

-df %>%
+edf %>%
+  # pretend that observations about time_value t are reported in version t (nowcasts)
  mutate(version = time_value) %>%
  as_epi_archive() %>%
-  epix_slide(x_2dav = mean(x), n = 2, ref_time_values = as.Date("2020-06-02"))
+  group_by(geo_value) %>%
+  epix_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>%
+  ungroup()

-df %>%
+edf %>%
+  # pretend that observations about time_value t are reported in version t (nowcasts)
  mutate(version = time_value) %>%
  as_epi_archive() %>%
-  epix_slide(~ mean(.x$x), n = 2, ref_time_values = as.Date("2020-06-02"))
+  group_by(geo_value) %>%
+  epix_slide(~ mean(.x$x), before = 1, ref_time_values = as.Date("2020-06-02")) %>%
+  ungroup()
```

When the slide computation returns an atomic vector (rather than a single value)
@@ -75,8 +132,8 @@ so, uses it to fill the new column. For example, this next computation gives
the same result as the last one.

```{r}
-df %>%
-  epi_slide(y_2dav = rep(mean(x), 3), n = 2)
+edf %>%
+  epi_slide(y_2dav = rep(mean(x), 3), before = 1)
```

However, if the output is an atomic vector (rather than a single value) and it
@@ -84,8 +141,8 @@ is *not* size stable, then `epi_slide()` throws an error. For example, below
we are trying to return 2 things for 3 states.

```{r, error = TRUE}
-df %>%
-  epi_slide(x_2dav = rep(mean(x), 2), n = 2)
+edf %>%
+  epi_slide(x_2dav = rep(mean(x), 2), before = 1)
```

## Multi-column outputs
@@ -98,14 +155,15 @@ we set `as_list_col = TRUE` in the call to `epi_slide()`, the resulting
`epi_df` object returned by `epi_slide()` has a list column containing the slide
values.

```{r}
-df2 <- df %>%
+edf2 <- edf %>%
  group_by(geo_value) %>%
  epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
-            n = 2, as_list_col = TRUE)
+            before = 1, as_list_col = TRUE) %>%
+  ungroup()

-class(df2$a)
-length(df2$a)
-df2$a[[2]]
+class(edf2$a)
+length(edf2$a)
+edf2$a[[2]]
```

When we use `as_list_col = FALSE` (the default in `epi_slide()`), the function
@@ -116,44 +174,48 @@ list column (here `a`) onto the column names of the output data frame from the
slide computation (here `x_2dav` and `x_2dma`) separated by "_".

```{r}
-df %>%
+edf %>%
  group_by(geo_value) %>%
  epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
-            n = 2, as_list_col = FALSE)
+            before = 1, as_list_col = FALSE) %>%
+  ungroup()
```

We can use `names_sep = NULL` (which gets passed to `tidyr::unnest()`) to drop
the prefix associated with list column name, in naming the unnested columns.

```{r}
-df %>%
+edf %>%
  group_by(geo_value) %>%
  epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
-            n = 2, as_list_col = FALSE, names_sep = NULL)
+            before = 1, as_list_col = FALSE, names_sep = NULL) %>%
+  ungroup()
```

Furthermore, `epi_slide()` will recycle the single row data frame as needed in
order to make the result size stable, just like the case for atomic values.

```{r}
-df %>%
+edf %>%
  epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
-            n = 2, as_list_col = FALSE, names_sep = NULL)
+            before = 1, as_list_col = FALSE, names_sep = NULL)
```

```{r, include = FALSE}
# More checks (not included)
-df %>%
+edf %>%
  epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
            ref_time_values = as.Date("2020-06-02"),
-            n = 2, as_list_col = FALSE, names_sep = NULL)
+            before = 1, as_list_col = FALSE, names_sep = NULL)

-df %>%
+edf %>%
  mutate(version = time_value) %>%
  as_epi_archive() %>%
+  group_by(geo_value) %>%
  epix_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
             ref_time_values = as.Date("2020-06-02"),
-             n = 2, as_list_col = FALSE, names_sep = NULL)
+             before = 1, as_list_col = FALSE, names_sep = NULL) %>%
+  ungroup()
```

## Multi-row outputs
@@ -164,32 +226,51 @@ Meaning, `epi_slide()` will check that the result is size stable, and if so,
will fill the new column(s) in the resulting `epi_df` object appropriately.

This can be convenient for modeling in the following sense: we can, for example,
-fit a sliding forecasting model by pooling data from different locations, and
-then return separate forecasts from this common model for each location. We use
-our synthetic example to demonstrate this idea abstractly but simply.
+fit a sliding, data-versioning-unaware nowcasting or forecasting model by
+pooling data from different locations, and then return separate forecasts from
+this common model for each location. We use our synthetic example to demonstrate
+this idea abstractly but simply by forecasting (actually, nowcasting) `y` from
+`x` by fitting a time-windowed linear model that pools data across all
+locations.

```{r}
-df$y <- 2 * df$x + 0.05 * rnorm(length(df$x))
+edf$y <- 2 * edf$x + 0.05 * rnorm(length(edf$x))

-df %>%
+edf %>%
  epi_slide(function(d, ...) {
    obj <- lm(y ~ x, data = d)
    return(
      as.data.frame(
        predict(obj, newdata = d %>%
-                  group_by(geo_value) %>%
-                  filter(time_value == max(time_value)),
+                          group_by(geo_value) %>%
+                          filter(time_value == max(time_value)),
                interval = "prediction", level = 0.9)
      ))
-  }, n = 2, new_col_name = "fc", names_sep = NULL)
+  }, before = 1, new_col_name = "fc", names_sep = NULL)
```

+The above example focused on simplicity to show how to work with multi-row
+outputs. Note, however, the following issues in this example:
+
+* The `lm` fitting data includes the testing instances, as no training-test split was performed.
+* Adding a simple training-test split would not factor in reporting latency properly.
+* Data revisions are not taken into account.
+
+All three of these factors contribute to unrealistic retrospective forecasts and
+overly optimistic retrospective performance evaluations. Instead, one should
+favor an `epix_slide` for more realistic "pseudoprospective" forecasts. Using
+`epix_slide` also makes it easier to express certain types of forecasts: while
+in `epi_slide`, forecasts for additional aheads or quantile levels would need to
+be expressed as additional columns or nested inside list columns, `epix_slide`
+does not perform size stability checks or recycling, so computations are free to
+output any number of rows.
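+
+As a quick sketch of that flexibility (reusing the toy `edf` data from above,
+and again pretending via `version = time_value` that each observation was
+reported on its own date), a made-up "forecaster" inside `epix_slide` can simply
+return one row per ahead:
+
+```{r}
+edf %>%
+  mutate(version = time_value) %>%
+  as_epi_archive() %>%
+  group_by(geo_value) %>%
+  epix_slide(
+    # flat-line point predictions at three different aheads:
+    f = ~ tibble::tibble(ahead = 1:3, fc_point = mean(.x$x)),
+    before = 1, names_sep = NULL
+  )
+```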
+
## Version-aware forecasting, revisited

We revisit the COVID-19 forecasting example from the [archive
vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) in order
-to demonstrate the last point in a more realistic setting. First, we fetch the
-versioned data and build the archive.
+to demonstrate the preceding points regarding forecast evaluation in a more
+realistic setting. First, we fetch the versioned data and build the archive.

```{r, message = FALSE, warning = FALSE, eval =FALSE}
library(epidatr)
@@ -205,7 +286,7 @@ y1 <- covidcast(
  time_value = epirange(20200601, 20211201),
  geo_values = "ca,fl",
  issues = epirange(20200601, 20211201)
-) %>% fetch_tbl()
+) %>% fetch()

y2 <- covidcast(
  data_source = "jhu-csse",
@@ -215,14 +296,14 @@ y2 <- covidcast(
  time_value = epirange(20200601, 20211201),
  geo_values = "ca,fl",
  issues = epirange(20200601, 20211201)
-) %>% fetch_tbl()
+) %>% fetch()

x <- y1 %>%
  select(geo_value, time_value,
         version = issue,
         percent_cli = value
  ) %>%
-  as_epi_archive()
+  as_epi_archive(compactify=FALSE)

# mutating merge operation:
x$merge(y2 %>%
@@ -230,18 +311,20 @@ x$merge(y2 %>%
          select(geo_value, time_value,
                 version = issue,
                 case_rate_7d_av = value
          ) %>%
-          as_epi_archive()
+          as_epi_archive(compactify=FALSE),
+        sync = "locf",
+        compactify=FALSE
)
```

-```{r, message = FALSE, warning = FALSE, echo =FALSE}
+```{r, message = FALSE, echo =FALSE}
library(data.table)
library(ggplot2)
theme_set(theme_bw())

x <- archive_cases_dv_subset$DT %>%
  filter(geo_value %in% c("ca","fl")) %>%
-  as_epi_archive()
+  as_epi_archive(compactify = FALSE)
```

@@ -345,7 +428,7 @@ data.
# Latest snapshot of data, and forecast dates x_latest <- epix_as_of(x, max_version = max(x$DT$version)) fc_time_values <- seq(as.Date("2020-08-01"), - as.Date("2021-12-01"), + as.Date("2021-11-30"), by = "1 month") # Simple function to produce forecasts k weeks ahead @@ -354,7 +437,7 @@ k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { x %>% epix_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, args = prob_arx_args(ahead = ahead)), - n = 120, ref_time_values = fc_time_values) %>% + before = 119, ref_time_values = fc_time_values) %>% mutate(target_date = time_value + ahead, as_of = TRUE, geo_value = fc_geo_value) } @@ -362,7 +445,7 @@ k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { x_latest %>% epi_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, args = prob_arx_args(ahead = ahead)), - n = 120, ref_time_values = fc_time_values) %>% + before = 119, ref_time_values = fc_time_values) %>% mutate(target_date = time_value + ahead, as_of = FALSE) } } diff --git a/vignettes/aggregation.Rmd b/vignettes/aggregation.Rmd index f9c845ca2..bdf6279ec 100644 --- a/vignettes/aggregation.Rmd +++ b/vignettes/aggregation.Rmd @@ -32,7 +32,7 @@ x <- covidcast( time_values = epirange(20200601, 20211231), geo_values = paste(y$geo_value, collapse = ",") ) %>% - fetch_tbl() %>% + fetch() %>% select(geo_value, time_value, cases = value) %>% full_join(y, by = "geo_value") %>% as_epi_df() @@ -177,7 +177,7 @@ Explicit imputation for missingness (zero-filling in our case) can be important for protecting against bugs in all sorts of downstream tasks. For example, even something as simple as a 7-day trailing average is complicated by missingness. The function `epi_slide()` looks for all rows within a window of 7 days anchored -on the right at the reference time point (when `n = 7` and `align = "right"`). +on the right at the reference time point (when `before = 6`). But when some days in a given week are missing because they were censored because they had small case counts, taking an average of the observed case counts can be misleading and is unintentionally biased upwards. Meanwhile, @@ -189,7 +189,8 @@ running `epi_slide()` on the zero-filled data brings these trailing averages xt %>% as_epi_df() %>% group_by(geo_value) %>% - epi_slide(cases_7dav = mean(cases), n = 7) %>% + epi_slide(cases_7dav = mean(cases), before = 6) %>% + ungroup() %>% filter(geo_value == "Plymouth, MA", abs(time_value - as.Date("2021-07-01")) <= 3) %>% print(n = 7) @@ -197,7 +198,8 @@ xt %>% xt_filled %>% as_epi_df() %>% group_by(geo_value) %>% - epi_slide(cases_7dav = mean(cases), n = 7) %>% + epi_slide(cases_7dav = mean(cases), before = 6) %>% + ungroup() %>% filter(geo_value == "Plymouth, MA", abs(time_value - as.Date("2021-07-01")) <= 3) %>% print(n = 7) diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index c93fe767a..f14c8663a 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -41,7 +41,7 @@ dv <- covidcast( time_values = epirange(20200601, 20211201), geo_values = "ca,fl,ny,tx", issues = epirange(20200601, 20211201) -) %>% fetch_tbl() +) %>% fetch() ``` @@ -252,11 +252,11 @@ y <- covidcast( geo_values = "ca,fl,ny,tx", issues = epirange(20200601, 20211201) ) %>% - fetch_tbl() %>% + fetch() %>% select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>% as_epi_archive(compactify=TRUE) -x$merge(y) +x$merge(y, sync="locf", compactify=FALSE) print(x) head(x$DT) ``` @@ -347,11 +347,14 @@ forecast COVID-19 case rates 7 days into the future. 
```{r} fc_time_values <- seq(as.Date("2020-08-01"), - as.Date("2021-12-01"), + as.Date("2021-11-30"), by = "1 month") -z <- epix_slide(x, fc = prob_arx(x = percent_cli, y = case_rate_7d_av), n = 120, - ref_time_values = fc_time_values, group_by = geo_value) +z <- x %>% + group_by(geo_value) %>% + epix_slide(fc = prob_arx(x = percent_cli, y = case_rate_7d_av), before = 119, + ref_time_values = fc_time_values) %>% + ungroup() head(z, 10) ``` @@ -369,8 +372,8 @@ a few notable differences, even apart from the version-aware aspect. You can read the documentation for `epix_slide()` for details. We finish off by comparing version-aware and -unaware forecasts at various -points in time and forecast horizons. The former comes from applying -`epix_slide()` to the `epi_archive` object `x`, and the latter from applying +points in time and forecast horizons. The former comes from using +`epix_slide()` with the `epi_archive` object `x`, and the latter from applying `epi_slide()` to the latest snapshot of the data `x_latest`. ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} @@ -380,16 +383,19 @@ x_latest <- epix_as_of(x, max_version = max(x$DT$version)) k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% - epix_slide(fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), n = 120, - ref_time_values = fc_time_values, group_by = geo_value) %>% - mutate(target_date = time_value + ahead, as_of = TRUE) + group_by(geo_value) %>% + epix_slide(fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), before = 119, + ref_time_values = fc_time_values) %>% + mutate(target_date = time_value + ahead, as_of = TRUE) %>% + ungroup() } else { x_latest %>% group_by(geo_value) %>% - epi_slide(fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), n = 120, + epi_slide(fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), before = 119, ref_time_values = fc_time_values) %>% - mutate(target_date = time_value + ahead, as_of = FALSE) + mutate(target_date = time_value + ahead, as_of = FALSE) %>% + ungroup() } } diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index c30b98226..8ad3d1cd3 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -102,7 +102,7 @@ speeds <- rbind(speeds, speed_test(iterate_as_of,"as_of_1000x")) # Performance of slide slide_median <- function(my_ea) { - my_ea$slide(median = median(case_rate_7d_av), n = 7) + my_ea$slide(median = median(case_rate_7d_av), before = 7) } speeds <- rbind(speeds, speed_test(slide_median,"slide_median")) diff --git a/vignettes/correlation.Rmd b/vignettes/correlation.Rmd index f44064256..0d3574020 100644 --- a/vignettes/correlation.Rmd +++ b/vignettes/correlation.Rmd @@ -31,7 +31,7 @@ x <- covidcast( time_values = epirange(20200301, 20211231), geo_values = "*" ) %>% - fetch_tbl() %>% + fetch() %>% select(geo_value, time_value, case_rate = value) y <- covidcast( @@ -42,7 +42,7 @@ y <- covidcast( time_values = epirange(20200301, 20211231), geo_values = "*" ) %>% - fetch_tbl() %>% + fetch() %>% select(geo_value, time_value, death_rate = value) x <- x %>% diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd index 5de60eb34..ebab0ec9b 100644 --- a/vignettes/epiprocess.Rmd +++ b/vignettes/epiprocess.Rmd @@ -2,7 +2,7 @@ title: Get started with `epiprocess` output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Get started with epiprocess} + %\VignetteIndexEntry{Get started with `epiprocess`} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} editor_options: @@ 
diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd
index 5de60eb34..ebab0ec9b 100644
--- a/vignettes/epiprocess.Rmd
+++ b/vignettes/epiprocess.Rmd
@@ -2,7 +2,7 @@
 title: Get started with `epiprocess`
 output: rmarkdown::html_vignette
 vignette: >
-  %\VignetteIndexEntry{Get started with epiprocess}
+  %\VignetteIndexEntry{Get started with `epiprocess`}
   %\VignetteEncoding{UTF-8}
   %\VignetteEngine{knitr::rmarkdown}
 editor_options:
@@ -67,7 +67,7 @@ cases <- covidcast(
   geo_type = "state",
   time_values = epirange(20200301, 20220131),
   geo_values = "ca,fl,ny,tx"
-) %>% fetch_tbl()
+) %>% fetch()
 
 colnames(cases)
 ```
diff --git a/vignettes/growth_rate.Rmd b/vignettes/growth_rate.Rmd
index bc8485087..9c185b151 100644
--- a/vignettes/growth_rate.Rmd
+++ b/vignettes/growth_rate.Rmd
@@ -31,7 +31,7 @@ x <- covidcast(
   time_values = epirange(20200601, 20211231),
   geo_values = "ga,pa"
 ) %>%
-  fetch_tbl() %>%
+  fetch() %>%
   select(geo_value, time_value, cases = value) %>%
   arrange(geo_value, time_value) %>%
   as_epi_df()
diff --git a/vignettes/outliers.Rmd b/vignettes/outliers.Rmd
index 588fce4dc..8d8205311 100644
--- a/vignettes/outliers.Rmd
+++ b/vignettes/outliers.Rmd
@@ -29,7 +29,7 @@ x <- covidcast(
   geo_values = "fl,nj",
   as_of = 20211028
 ) %>%
-  fetch_tbl() %>%
+  fetch() %>%
   select(geo_value, time_value, cases = value) %>%
   as_epi_df()
 ```
@@ -117,6 +117,7 @@ x <- x %>%
     x = time_value, y = cases,
     methods = detection_methods,
     combiner = "median")) %>%
+  ungroup() %>%
   unnest(outlier_info)
 
 head(x)
diff --git a/vignettes/slide.Rmd b/vignettes/slide.Rmd
index de189f88a..cb435e549 100644
--- a/vignettes/slide.Rmd
+++ b/vignettes/slide.Rmd
@@ -11,7 +11,7 @@ A central tool in the `epiprocess` package is `epi_slide()`, which is based on
 the powerful functionality provided in the
 [`slider`](https://cran.r-project.org/web/packages/slider) package. In
 `epiprocess`, to "slide" means to apply a computation---represented as a
-function or formula---over a running window of `n` time steps. Suitable
+function or formula---over a sliding/rolling data window. Suitable
 groupings can always be achieved by a preliminary call to `group_by()`.
 
 By default, the meaning of one time step is inferred from the `time_value`
@@ -43,9 +43,9 @@ x <- covidcast(
   time_type = "day",
   geo_type = "state",
   time_values = epirange(20200301, 20211231),
-  geo_values = "ca,fl,ny,tx"
+  geo_values = "ca,fl,ny,tx,ga,pa"
 ) %>%
-  fetch_tbl() %>%
+  fetch() %>%
   select(geo_value, time_value, cases = value) %>%
   arrange(geo_value, time_value) %>%
   as_epi_df()
@@ -71,37 +71,57 @@ order to smooth the signal, by passing in a formula for the first argument of
 
 ```{r}
 x %>%
   group_by(geo_value) %>%
-  epi_slide(~ mean(.x$cases), n = 7) %>%
+  epi_slide(~ mean(.x$cases), before = 6) %>%
+  ungroup() %>%
   head(10)
 ```
 
-The formula specified has access to all columns present in the original `epi_df`
-object (and must refer to them with the prefix `.x$`). As we can see, the
-function `epi_slide()` returns an `epi_df` object with a new column appended
-that contains the results (from sliding), named `slide_value` as the default. We
-can of course change this post hoc, or we can instead specify a new name up
-front using the `new_col_name` argument:
+The formula specified has access to all non-grouping columns present in the
+original `epi_df` object (and must refer to them with the prefix `.x$`). As we
+can see, the function `epi_slide()` returns an `epi_df` object with a new column
+appended that contains the results (from sliding), named `slide_value` as the
+default.
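[Note: a quick spot-check of the window semantics above, hypothetical and not part of this patch, assuming the `x` built earlier in `slide.Rmd`; with `before = 6`, each `slide_value` should equal the mean of `cases` over `time_value - 6` through `time_value` within the same `geo_value`:]

```r
# Verify one slide_value by hand for a single location and reference date.
ref <- as.Date("2020-06-01")

manual <- x %>%
  filter(geo_value == "ca", time_value >= ref - 6, time_value <= ref) %>%
  summarize(mean_cases = mean(cases)) %>%
  pull(mean_cases)

slid <- x %>%
  group_by(geo_value) %>%
  epi_slide(~ mean(.x$cases), before = 6) %>%
  ungroup() %>%
  filter(geo_value == "ca", time_value == ref) %>%
  pull(slide_value)

stopifnot(isTRUE(all.equal(manual, slid)))
```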
+We can of course change this post hoc, or we can instead specify a new
+name up front using the `new_col_name` argument:
 
 ```{r}
 x <- x %>%
   group_by(geo_value) %>%
-  epi_slide(~ mean(.x$cases), n = 7, new_col_name = "cases_7dav")
+  epi_slide(~ mean(.x$cases), before = 6, new_col_name = "cases_7dav") %>%
+  ungroup()
 head(x, 10)
 ```
 
+Some other information is available in additional variables:
+
+* `.group_key` is a one-row tibble containing the values of the grouping
+  variables for the associated group
+* `.ref_time_value` is the reference time value the time window was based on
+
+Like in `group_modify()`, there are alternative names for these variables as
+well: `.` can be used instead of `.x`, `.y` instead of `.group_key`, and `.z`
+instead of `.ref_time_value`.
+
 ## Slide with a function
 
 We can also pass a function for the first argument in `epi_slide()`. In this
-case, the passed function must have the following argument structure: `x`, a
-data frame with the same column names as the original object; followed by any
-number of named arguments; and ending with `...` to capture additional
-arguments. Recreating the last example of a 7-day trailing average:
+case, the passed function `f` must accept the following arguments: a
+data frame with the same column names as the original object, minus any grouping
+variables, containing the time window data for one group-`ref_time_value`
+combination; followed by a one-row tibble containing the values of the grouping
+variables for the associated group; followed by the associated `ref_time_value`.
+It can accept additional arguments; `epi_slide()` will forward any `...` args it
+receives to `f`.
+
+Recreating the last example of a 7-day trailing average:
 
 ```{r}
 x <- x %>%
   group_by(geo_value) %>%
-  epi_slide(function(x, ...) mean(x$cases), n = 7, new_col_name = "cases_7dav")
+  epi_slide(function(x, gk, rtv) mean(x$cases), before = 6, new_col_name = "cases_7dav") %>%
+  ungroup()
 head(x, 10)
 ```
@@ -117,10 +137,13 @@ would in a call to `dplyr::mutate()`, or any of the `dplyr` verbs. For example:
 
 ```{r}
 x <- x %>%
   group_by(geo_value) %>%
-  epi_slide(cases_7dav = mean(cases), n = 7)
+  epi_slide(cases_7dav = mean(cases), before = 6) %>%
+  ungroup()
 head(x, 10)
 ```
 
+In addition to referring to individual columns by name, you can refer to the
+time window data as an `epi_df` or `tibble` using `.x`. Similarly, the other
 arguments of the function format are available through the magic names
 `.group_key` and `.ref_time_value`, and the tidyverse "pronouns" `.data` and
 `.env` can also be used.
 
 As a simple sanity check, we visualize the 7-day trailing averages computed on
 top of the original counts:
@@ -160,7 +183,7 @@ units of the `time_value` column; so, days, in the working `epi_df` being
 considered in this vignette).
 
 ```{r}
-prob_ar <- function(y, lags = c(0, 7, 14), ahead = 6, min_train_window = 20,
+prob_ar <- function(y, lags = c(0, 7, 14), ahead = 7, min_train_window = 20,
                     lower_level = 0.05, upper_level = 0.95, symmetrize = TRUE,
                     intercept = FALSE, nonneg = TRUE) {
   # Return NA if insufficient training data
@@ -209,8 +232,9 @@ fc_time_values <- seq(as.Date("2020-06-01"),
                       by = "1 months")
 
 x %>%
   group_by(geo_value) %>%
-  epi_slide(fc = prob_ar(cases_7dav), n = 120,
+  epi_slide(fc = prob_ar(cases_7dav), before = 119,
             ref_time_values = fc_time_values) %>%
+  ungroup() %>%
   head(10)
 ```
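[Note: before the helper below, a compact recap, hypothetical and not part of this patch, of the three equivalent calling conventions these `slide.Rmd` changes demonstrate, assuming the `x` built above; all three add the same 7-day trailing mean as a `slide_value` column:]

```r
# 1. Formula interface: refer to window columns via `.x$`.
x %>%
  group_by(geo_value) %>%
  epi_slide(~ mean(.x$cases), before = 6) %>%
  ungroup()

# 2. Function interface: receives the window data, the group key, and the
#    reference time value, in that order (these argument names are arbitrary).
x %>%
  group_by(geo_value) %>%
  epi_slide(function(window, group_key, ref_time_value) {
    mean(window$cases)
  }, before = 6) %>%
  ungroup()

# 3. Tidy-evaluation interface: the argument name becomes the column name.
x %>%
  group_by(geo_value) %>%
  epi_slide(slide_value = mean(cases), before = 6) %>%
  ungroup()
```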
@@ -233,8 +257,9 @@ so that we can call it a few times.
 k_week_ahead <- function(x, ahead = 7) {
   x %>%
     group_by(geo_value) %>%
-    epi_slide(fc = prob_ar(cases_7dav, ahead = ahead), n = 120,
+    epi_slide(fc = prob_ar(cases_7dav, ahead = ahead), before = 119,
               ref_time_values = fc_time_values, all_rows = TRUE) %>%
+    ungroup() %>%
     mutate(target_date = time_value + ahead)
 }
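[Note: a usage sketch for the redefined helper above, hypothetical and not part of this patch, assuming the `x` and `fc_time_values` defined earlier in `slide.Rmd`. Since `all_rows = TRUE` keeps every row of the input (with `NA` slide values away from the reference times), the per-horizon results stay aligned and can be stacked directly:]

```r
# Produce 1-, 2-, and 3-week-ahead forecasts at each fc_time_value
# and stack them into one frame for plotting or scoring.
z <- dplyr::bind_rows(
  k_week_ahead(x, ahead = 7),
  k_week_ahead(x, ahead = 14),
  k_week_ahead(x, ahead = 21)
)
head(z)
```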