From 9d69fe1a2612c585c89162725d4a15b629a2190c Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 9 Nov 2021 12:11:13 -0500 Subject: [PATCH 01/18] Add bare-bones `epi_tibble_archive` Add `epi_tibble_archive` as an `R6` class with just - `$new` (requires the user to perform any compression of snapshots/updates into minimal diffs if desired, and to reconstruct upon updating the archive itself) - `$epi_tibble_as_of` The next priority is to implement `epi_slide` for `epi_tibble_archive`s. --- DESCRIPTION | 11 ++- NAMESPACE | 11 +++ R/epi_tibble_archive.R | 190 ++++++++++++++++++++++++++++++++++++++ man/epi_tibble_archive.Rd | 183 ++++++++++++++++++++++++++++++++++++ 4 files changed, 394 insertions(+), 1 deletion(-) create mode 100644 R/epi_tibble_archive.R create mode 100644 man/epi_tibble_archive.Rd diff --git a/DESCRIPTION b/DESCRIPTION index b789477d6..d0aea5bf9 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,4 +27,13 @@ Imports: slider, tibble, tidyselect, - tidyr + tidyr, + lubridate, + R6, + data.table, + pipeR, + assertthat +Suggests: + delphi.epidata +Remotes: + github:cmu-delphi/delphi-epidata-r diff --git a/NAMESPACE b/NAMESPACE index 7d30e2a6c..06f7b43ea 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -19,14 +19,18 @@ export(Sum) export(as.epi_tibble) export(cor_lagged) export(epi_slide) +export(epi_tibble_archive) export(estimate_deriv) export(pct_change) export(quiet) +importFrom(assertthat,assert_that) +importFrom(data.table,as.data.table) importFrom(dplyr,arrange) importFrom(dplyr,group_by) importFrom(dplyr,group_modify) importFrom(dplyr,mutate) importFrom(dplyr,relocate) +importFrom(dplyr,rename) importFrom(dplyr,rowwise) importFrom(dplyr,select) importFrom(dplyr,summarize) @@ -37,14 +41,21 @@ importFrom(genlasso,trendfilter) importFrom(lubridate,days) importFrom(lubridate,weeks) importFrom(magrittr,"%>%") +importFrom(pipeR,"%>>%") +importFrom(rlang,"!!") importFrom(rlang,.data) importFrom(rlang,abort) importFrom(rlang,enquo) 
+importFrom(rlang,is_character) +importFrom(rlang,is_named) +importFrom(rlang,is_scalar_atomic) +importFrom(rlang,is_scalar_character) importFrom(stats,coef) importFrom(stats,cor) importFrom(stats,lsfit) importFrom(stats,median) importFrom(stats,predict) importFrom(stats,smooth.spline) +importFrom(tibble,as_tibble) importFrom(tidyr,drop_na) importFrom(utils,head) diff --git a/R/epi_tibble_archive.R b/R/epi_tibble_archive.R new file mode 100644 index 000000000..a1d506e51 --- /dev/null +++ b/R/epi_tibble_archive.R @@ -0,0 +1,190 @@ +## We use special features of data.table's `[`. The data.table package has a +## compatibility feature that disables some/all of these features if it thinks +## we might expect `data.frame`-compatible behavior instead. We can signal that +## we want the special behavior via `.datatable.aware = TRUE` or by importing +## any `data.table` package member. Do both to prevent surprises if we decide to +## use `data.table::` everywhere and not importing things. +.datatable.aware = TRUE + +#' Archive (data version history) for an \code{epi_tibble} +#' +#' Contains version history for an \code{epi_tibble}, and enables fast querying +#' of snapshots of the \code{epi_tibble} as of certain "issues" (versions). +#' Version history can be input as a data frame combining full snapshots of the +#' `epi_tibble` as of several issue times, or using only the newly added or +#' revised rows for each issue, or using some combination of these two +#' (including "updates" for things that didn't actually change). +#' Last-observation-carried-forward (LOCF) is used to data in between recorded +#' updates. Currently, deletions must be represented as revising a row to a +#' special state (e.g., making the entries \code{NA} or including a special +#' column that flags the data as removed and performing post-processing), and +#' the archive is unaware of what this state is. 
+#' +#' @examples +#' +#' update.df = +#' tibble::tribble( +#' ~geo_value, ~time_value, ~issue, ~value, +#' ## update history of geo1 for reference time 2021-01-01: +#' ## (1 day of latency in initial report) +#' "geo1", as.Date("2021-01-01"), as.Date("2021-01-02"), 5.0, +#' ## (revised upward) +#' "geo1", as.Date("2021-01-01"), as.Date("2021-01-03"), 9.0, +#' ## (revised upward) +#' "geo1", as.Date("2021-01-01"), as.Date("2021-01-10"), 9.2, +#' ## update history of geo1 for reference time 2021-01-02: +#' ## (1 day of latency in initial report) +#' "geo1", as.Date("2021-01-02"), as.Date("2021-01-03"), 8.0, +#' ## (redundant "update" row; we will already be using LOCF to fill in) +#' "geo1", as.Date("2021-01-02"), as.Date("2021-01-04"), 8.0, +#' ## (replaced with NA) +#' "geo1", as.Date("2021-01-02"), as.Date("2021-01-10"), NA_real_, +#' ## update history of geo1 for reference time 2021-01-05 (suppose data set skips the 3rd and 4th) +#' ## (1 day of latency in initial report) +#' "geo1", as.Date("2021-01-05"), as.Date("2021-01-06"), 13.0, +#' ) +#' +#' ## update.df actually contains update data through issue 2021-01-11, but the +#' ## data set was not reported to change from 2021-01-10 to 2021-01-11 +#' epi.tibble.archive = epi_tibble_archive$new(update.df, max.issue=as.Date("2021-01-11")) +#' +#' ## The snapshot as of issue 2021-01-03 just looks like the updates in issue +#' ## 2021-01-03, because all past measurements were updated in this issue (we +#' ## don't need to do any LOCF to obtain the snapshot). +#' epi.tibble.archive$epi_tibble_as_of(as.Date("2021-01-03")) +#' +#' ## The snapshot as of issue 2021-01-05 uses LOCF on the first two geo-time +#' ## combinations. Note that there is no entry for `time_value` 2021-01-05, as the +#' ## initial version of this data was not available until issue 2021-01-06. 
+#' epi.tibble.archive$epi_tibble_as_of(as.Date("2021-01-05")) +#' +#' ## The snapshot as of issue 2021-01-06 does include the measurement for +#' ## `time_value` 2021-01-05. +#' epi.tibble.archive$epi_tibble_as_of(as.Date("2021-01-06")) +#' +#' ## (Don't automatically run this example as it involves network access and querying the API) +#' if (FALSE) { +#' library(dplyr) +#' ## (delphi.epidata package is on GitHub in cmu-delphi/delphi-epidata-r) +#' update.df.2 = +#' delphi.epidata::covidcast("jhu-csse", "confirmed_incidence_num", +#' "day", "state", +#' delphi.epidata::epirange(12340101,34560101), c("ak","al"), +#' issues = delphi.epidata::epirange(12340101,34560101)) %>% +#' delphi.epidata::fetch_tbl() +#' snapshot.df.2a = +#' delphi.epidata::covidcast("jhu-csse", "confirmed_incidence_num", +#' "day", "state", +#' delphi.epidata::epirange(12340101,34560101), c("ak","al"), +#' as_of = 20201014) %>% +#' delphi.epidata::fetch_tbl() +#' snapshot.df.2b = +#' delphi.epidata::covidcast("jhu-csse", "confirmed_incidence_num", +#' "day", "state", +#' delphi.epidata::epirange(12340101,34560101), c("ak","al"), +#' as_of = 20201028) %>% +#' delphi.epidata::fetch_tbl() +#' +#' epi.tibble.archive.2 = epi_tibble_archive$new(update.df.2) +#' all.equal( +#' as_tibble(epi.tibble.archive.2$epi_tibble_as_of(as.Date("2020-10-14"))), +#' as_tibble(as.epi_tibble(snapshot.df.2a)), +#' check.attributes = FALSE) +#' all.equal( +#' as_tibble(epi.tibble.archive.2$epi_tibble_as_of(as.Date("2020-10-28"))), +#' as_tibble(as.epi_tibble(snapshot.df.2b)), +#' check.attributes = FALSE) +#' } +#' +#' @importFrom assertthat assert_that +#' @importFrom rlang is_scalar_character is_named is_character is_scalar_atomic !! 
+#' @importFrom tibble as_tibble +#' @importFrom dplyr rename +#' @importFrom data.table as.data.table +#' @importFrom pipeR %>>% +#' @export +epi_tibble_archive = + R6::R6Class("epi_tibble_archive", + private = list( + update.DT = NULL, + max.issue = NULL, + issue.colname = NULL, + geo.colname = NULL, + time.colname = NULL, + other.key.colnames = NULL + ), + public = list( + #' @description + #' Create a new \code{epi_tibble_archive} with the given update data. + #' @param update.df the update data + #' @param issue.colname name of the column with the issue time of the corresponding updates; operations such as \code{sort}, \code{<=}, and \code{max} must make sense on this column, with earlier issues "less than" later issues + #' @param geo.colname the name of the column that will become \code{geo_value} in the \code{epi_tibble} + #' @param time.colname the name of the column that will become \code{time_value} in the \code{epi_tibble} + #' @param other.key.colnames the names of any other columns that would be used to index a single measurement in this data set, such as the age group the measurement corresponds to (if the data set includes an age group breakdown); there should only be a single row per issue-geo-time-other-key-cols combination. + #' @param max.issue the latest issue for which update data was available; defaults to the maximum issue time in the \code{update.df}, but if there were no additions or revisions in subsequent issues, it could be later. However, due to details regarding database replica syncing times in upstream APIs, using the default might be safer than whatever we think the max issue should be. 
+ #' @return an \code{epi_tibble_archive} object + initialize = function(update.df, + issue.colname = "issue", + geo.colname = "geo_value", + time.colname = "time_value", + other.key.colnames = character(0L), + max.issue = max(update.df[[issue.colname]])) { + assert_that (is.data.frame(update.df)) + assert_that (is_scalar_character(issue.colname) && !is_named(issue.colname)) + assert_that (is_scalar_character(geo.colname) && !is_named(geo.colname)) + assert_that (is_scalar_character(time.colname) && !is_named(time.colname)) + assert_that (is_character(other.key.colnames) && !is_named(other.key.colnames)) + assert_that (issue.colname %in% names(update.df)) + assert_that (geo.colname %in% names(update.df)) + assert_that (time.colname %in% names(update.df)) + assert_that (all(other.key.colnames %in% names(update.df))) + assert_that (identical(class(update.df[[issue.colname]]), class(max.issue))) + assert_that(length(unique(c(issue.colname, geo.colname, time.colname, other.key.colnames))) == + 3L + length(other.key.colnames)) + assert_that(max.issue >= max(update.df[[issue.colname]])) + ## -- end of input validation -- + update.DT = as.data.table(update.df, key=c(geo.colname, time.colname, other.key.colnames, issue.colname)) + private[["update.DT"]] <- update.DT + private[["max.issue"]] <- max.issue + private[["issue.colname"]] <- issue.colname + private[["geo.colname"]] <- geo.colname + private[["time.colname"]] <- time.colname + private[["other.key.colnames"]] <- other.key.colnames + }, + #' @description + #' Get the \code{epi_tibble} as of some issue time + #' @param issue the desired as-of issue time + #' @return an \code{epi_tibble} with data as of the specified issue time, \code{issue} recorded in the metadata, the geo column renamed to \code{geo_value} and time column to \code{time_value}, and the other key colnames recorded in the metadata + epi_tibble_as_of = function(issue) { + assert_that(is_scalar_atomic(issue) && identical(class(issue), 
class(private[["max.issue"]]))) + assert_that(issue <= private[["max.issue"]]) + if (issue == max(private[["update.DT"]][[private[["issue.colname"]]]])) { + ## (really, this should be the last issue with an actual + ## addition or revision; it's the same as what's checked + ## here only if we didn't include redundant "updates" in + ## this max issue; alternatively, we should follow the + ## user's indication and use `private$max.issue` and let + ## them deal with potential strange cases with replicas + ## being out of date) + warn('Getting epi_tibble as of the latest issue with + recorded change data; it is possible that we have a preliminary version of this issue, the upstream source has updated it, and we have not seen those updates yet due to them not being published yet, or potentially due to latency in synchronization of upstream database replicas. Thus, the epi_tibble snapshot that we produce here might not be reproducible at later times when we use an archive with fresher data.') + } + ## -- end of input validation -- + private[["update.DT"]] %>>% + ## {.[, .SD[.[[private[["issue.colname"]]]] <= ..issue]]} %>>% + dplyr::filter(.[[private[["issue.colname"]]]] <= .env[["issue"]]) %>>% + unique(by=c(private[["geo.colname"]], private[["time.colname"]], private[["other.key.colnames"]]), fromLast=TRUE) %>>% + as_tibble() %>>% + select(-!!private[["issue.colname"]]) %>>% + ## rename(issue_with_last_update = !!private[["issue.colname"]]) %>>% + rename( + geo_value = !!private[["geo.colname"]], + time_value = !!private[["time.colname"]], + ) %>>% + as.epi_tibble(issue = issue, + additional_metadata = list(other.key.colnames = private[["other.key.colnames"]])) %>>% + {.} + } + ) + ) + diff --git a/man/epi_tibble_archive.Rd b/man/epi_tibble_archive.Rd new file mode 100644 index 000000000..92b319144 --- /dev/null +++ b/man/epi_tibble_archive.Rd @@ -0,0 +1,183 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epi_tibble_archive.R 
+\name{epi_tibble_archive} +\alias{epi_tibble_archive} +\title{Archive (data version history) for an \code{epi_tibble}} +\description{ +Archive (data version history) for an \code{epi_tibble} + +Archive (data version history) for an \code{epi_tibble} +} +\details{ +Contains version history for an \code{epi_tibble}, and enables fast querying +of snapshots of the \code{epi_tibble} as of certain "issues" (versions). +Version history can be input as a data frame combining full snapshots of the +\code{epi_tibble} as of several issue times, or using only the newly added or +revised rows for each issue, or using some combination of these two +(including "updates" for things that didn't actually change). +Last-observation-carried-forward (LOCF) is used to fill in data in between +recorded updates. Currently, deletions must be represented as revising a row to a +special state (e.g., making the entries \code{NA} or including a special +column that flags the data as removed and performing post-processing), and +the archive is unaware of what this state is. 
+} +\examples{ + +update.df = + tibble::tribble( + ~geo_value, ~time_value, ~issue, ~value, + ## update history of geo1 for reference time 2021-01-01: + ## (1 day of latency in initial report) + "geo1", as.Date("2021-01-01"), as.Date("2021-01-02"), 5.0, + ## (revised upward) + "geo1", as.Date("2021-01-01"), as.Date("2021-01-03"), 9.0, + ## (revised upward) + "geo1", as.Date("2021-01-01"), as.Date("2021-01-10"), 9.2, + ## update history of geo1 for reference time 2021-01-02: + ## (1 day of latency in initial report) + "geo1", as.Date("2021-01-02"), as.Date("2021-01-03"), 8.0, + ## (redundant "update" row; we will already be using LOCF to fill in) + "geo1", as.Date("2021-01-02"), as.Date("2021-01-04"), 8.0, + ## (replaced with NA) + "geo1", as.Date("2021-01-02"), as.Date("2021-01-10"), NA_real_, + ## update history of geo1 for reference time 2021-01-05 (suppose data set skips the 3rd and 4th) + ## (1 day of latency in initial report) + "geo1", as.Date("2021-01-05"), as.Date("2021-01-06"), 13.0, + ) + +## update.df actually contains update data through issue 2021-01-11, but the +## data set was not reported to change from 2021-01-10 to 2021-01-11 +epi.tibble.archive = epi_tibble_archive$new(update.df, max.issue=as.Date("2021-01-11")) + +## The snapshot as of issue 2021-01-03 just looks like the updates in issue +## 2021-01-03, because all past measurements were updated in this issue (we +## don't need to do any LOCF to obtain the snapshot). +epi.tibble.archive$epi_tibble_as_of(as.Date("2021-01-03")) + +## The snapshot as of issue 2021-01-05 uses LOCF on the first two geo-time +## combinations. Note that there is no entry for `time_value` 2021-01-05, as the +## initial version of this data was not available until issue 2021-01-06. +epi.tibble.archive$epi_tibble_as_of(as.Date("2021-01-05")) + +## The snapshot as of issue 2021-01-06 does include the measurement for +## `time_value` 2021-01-05. 
+epi.tibble.archive$epi_tibble_as_of(as.Date("2021-01-06")) + +## (Don't automatically run this example as it involves network access and querying the API) +if (FALSE) { + library(dplyr) + ## (delphi.epidata package is on GitHub in cmu-delphi/delphi-epidata-r) + update.df.2 = + delphi.epidata::covidcast("jhu-csse", "confirmed_incidence_num", + "day", "state", + delphi.epidata::epirange(12340101,34560101), c("ak","al"), + issues = delphi.epidata::epirange(12340101,34560101)) \%>\% + delphi.epidata::fetch_tbl() + snapshot.df.2a = + delphi.epidata::covidcast("jhu-csse", "confirmed_incidence_num", + "day", "state", + delphi.epidata::epirange(12340101,34560101), c("ak","al"), + as_of = 20201014) \%>\% + delphi.epidata::fetch_tbl() + snapshot.df.2b = + delphi.epidata::covidcast("jhu-csse", "confirmed_incidence_num", + "day", "state", + delphi.epidata::epirange(12340101,34560101), c("ak","al"), + as_of = 20201028) \%>\% + delphi.epidata::fetch_tbl() + + epi.tibble.archive.2 = epi_tibble_archive$new(update.df.2) + all.equal( + as_tibble(epi.tibble.archive.2$epi_tibble_as_of(as.Date("2020-10-14"))), + as_tibble(as.epi_tibble(snapshot.df.2a)), + check.attributes = FALSE) + all.equal( + as_tibble(epi.tibble.archive.2$epi_tibble_as_of(as.Date("2020-10-28"))), + as_tibble(as.epi_tibble(snapshot.df.2b)), + check.attributes = FALSE) +} + +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-new}{\code{epi_tibble_archive$new()}} +\item \href{#method-epi_tibble_as_of}{\code{epi_tibble_archive$epi_tibble_as_of()}} +\item \href{#method-clone}{\code{epi_tibble_archive$clone()}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-new}{}}} +\subsection{Method \code{new()}}{ +Create a new \code{epi_tibble_archive} with the given update data. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{epi_tibble_archive$new( + update.df, + issue.colname = "issue", + geo.colname = "geo_value", + time.colname = "time_value", + other.key.colnames = character(0L), + max.issue = max(update.df[[issue.colname]]) +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{update.df}}{the update data} + +\item{\code{issue.colname}}{name of the column with the issue time of the corresponding updates; operations such as \code{sort}, \code{<=}, and \code{max} must make sense on this column, with earlier issues "less than" later issues} + +\item{\code{geo.colname}}{the name of the column that will become \code{geo_value} in the \code{epi_tibble}} + +\item{\code{time.colname}}{the name of the column that will become \code{time_value} in the \code{epi_tibble}} + +\item{\code{other.key.colnames}}{the names of any other columns that would be used to index a single measurement in this data set, such as the age group the measurement corresponds to (if the data set includes an age group breakdown); there should only be a single row per issue-geo-time-other-key-cols combination.} + +\item{\code{max.issue}}{the latest issue for which update data was available; defaults to the maximum issue time in the \code{update.df}, but if there were no additions or revisions in subsequent issues, it could be later. However, due to details regarding database replica syncing times in upstream APIs, using the default might be safer than whatever we think the max issue should be.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +an \code{epi_tibble_archive} object +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_tibble_as_of}{}}} +\subsection{Method \code{epi_tibble_as_of()}}{ +Get the \code{epi_tibble} as of some issue time +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{epi_tibble_archive$epi_tibble_as_of(issue)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{issue}}{the desired as-of issue time} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +an \code{epi_tibble} with data as of the specified issue time, \code{issue} recorded in the metadata, the geo column renamed to \code{geo_value} and time column to \code{time_value}, and the other key colnames recorded in the metadata +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{epi_tibble_archive$clone(deep = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
}} +} +} +} From e19958cb3325781e1110fbfa2fa608657f99af57 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 9 Nov 2021 14:38:30 -0500 Subject: [PATCH 02/18] Add metainfo fns, update_DT_as_of, epi_slide for epi_tibble_archive These need testing. The `epi_slide` implementation needs testing and documentation and examples. The issues vignette still needs to be completed with this implementation. --- NAMESPACE | 6 ++++ R/epi_tibble_archive.R | 60 +++++++++++++++++++++++++++++++---- R/slide.R | 67 +++++++++++++++++++++++++++++++++++++-- man/epi_tibble_archive.Rd | 56 ++++++++++++++++++++++++++++++++ 4 files changed, 180 insertions(+), 9 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 06f7b43ea..0d8b64cfc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,8 @@ S3method(as.epi_tibble,data.frame) S3method(as.epi_tibble,epi_tibble) S3method(as.epi_tibble,tibble) +S3method(epi_slide,epi_tibble) +S3method(epi_slide,epi_tibble_archive) S3method(group_by,epi_tibble) S3method(head,epi_tibble) S3method(print,epi_tibble) @@ -26,6 +28,7 @@ export(quiet) importFrom(assertthat,assert_that) importFrom(data.table,as.data.table) importFrom(dplyr,arrange) +importFrom(dplyr,filter) importFrom(dplyr,group_by) importFrom(dplyr,group_modify) importFrom(dplyr,mutate) @@ -42,7 +45,9 @@ importFrom(lubridate,days) importFrom(lubridate,weeks) importFrom(magrittr,"%>%") importFrom(pipeR,"%>>%") +importFrom(rlang,"!!!") importFrom(rlang,"!!") +importFrom(rlang,":=") importFrom(rlang,.data) importFrom(rlang,abort) importFrom(rlang,enquo) @@ -50,6 +55,7 @@ importFrom(rlang,is_character) importFrom(rlang,is_named) importFrom(rlang,is_scalar_atomic) importFrom(rlang,is_scalar_character) +importFrom(rlang,warn) importFrom(stats,coef) importFrom(stats,cor) importFrom(stats,lsfit) diff --git a/R/epi_tibble_archive.R b/R/epi_tibble_archive.R index a1d506e51..11a64d33f 100644 --- a/R/epi_tibble_archive.R +++ b/R/epi_tibble_archive.R @@ -97,9 +97,9 @@ #' } #' #' @importFrom assertthat 
assert_that -#' @importFrom rlang is_scalar_character is_named is_character is_scalar_atomic !! +#' @importFrom rlang is_scalar_character is_named is_character is_scalar_atomic !! warn #' @importFrom tibble as_tibble -#' @importFrom dplyr rename +#' @importFrom dplyr rename filter #' @importFrom data.table as.data.table #' @importFrom pipeR %>>% #' @export @@ -166,13 +166,12 @@ epi_tibble_archive = ## user's indication and use `private$max.issue` and let ## them deal with potential strange cases with replicas ## being out of date) - warn('Getting epi_tibble as of the latest issue with - recorded change data; it is possible that we have a preliminary version of this issue, the upstream source has updated it, and we have not seen those updates yet due to them not being published yet, or potentially due to latency in synchronization of upstream database replicas. Thus, the epi_tibble snapshot that we produce here might not be reproducible at later times when we use an archive with fresher data.') + warn('Getting epi_tibble as of the latest issue with recorded change data; it is possible that we have a preliminary version of this issue, the upstream source has updated it, and we have not seen those updates yet due to them not being published yet, or potentially due to latency in synchronization of upstream database replicas. 
Thus, the epi_tibble snapshot that we produce here might not be reproducible at later times when we use an archive with fresher data.') } ## -- end of input validation -- private[["update.DT"]] %>>% ## {.[, .SD[.[[private[["issue.colname"]]]] <= ..issue]]} %>>% - dplyr::filter(.[[private[["issue.colname"]]]] <= .env[["issue"]]) %>>% + filter(.[[private[["issue.colname"]]]] <= .env[["issue"]]) %>>% unique(by=c(private[["geo.colname"]], private[["time.colname"]], private[["other.key.colnames"]]), fromLast=TRUE) %>>% as_tibble() %>>% select(-!!private[["issue.colname"]]) %>>% @@ -183,8 +182,55 @@ epi_tibble_archive = ) %>>% as.epi_tibble(issue = issue, additional_metadata = list(other.key.colnames = private[["other.key.colnames"]])) %>>% - {.} + return() + }, + #' @description + #' Return the name settings in a list + naming_info = function() { + list( + issue.colname = private[["issue.colname"]], + geo.colname = private[["geo.colname"]], + time.colname = private[["time.colname"]], + other.key.colnames = private[["other.key.colnames"]] + ) + }, + #' @description + #' Return the max issue value recorded by this archive (whether it had updates or not) + max_issue = function() { + private[["max.issue"]] + }, + #' @description + #' Return the issue values for which updates are + #' recorded in this archive (that is, whether they had updates in + #' the data frame used to form this archive, regardless of whether + #' those "updates" actually added or revised any data) + issues_with_updates = function() { + return (unique(private[["update.DT"]][[private[["issue.colname"]]]])) + }, + #' @description + #' + #' Return the recorded update data up through the given issue + #' value, inside a \code{data.table} object which is fine to + #' modify without copying. 
+ #' + #' @param issue the max issue value that should appear in the result + update_DT_as_of = function(issue) { + assert_that(is_scalar_atomic(issue) && identical(class(issue), class(private[["max.issue"]]))) + assert_that(issue <= private[["max.issue"]]) + if (issue == max(private[["update.DT"]][[private[["issue.colname"]]]])) { + ## (really, this should be the last issue with an actual + ## addition or revision; it's the same as what's checked + ## here only if we didn't include redundant "updates" in + ## this max issue; alternatively, we should follow the + ## user's indication and use `private$max.issue` and let + ## them deal with potential strange cases with replicas + ## being out of date) + warn('Getting epi_tibble as of the latest issue with recorded change data; it is possible that we have a preliminary version of this issue, the upstream source has updated it, and we have not seen those updates yet due to them not being published yet, or potentially due to latency in synchronization of upstream database replicas. Thus, the epi_tibble snapshot that we produce here might not be reproducible at later times when we use an archive with fresher data.') + } + private[["update.DT"]] %>>% + ## {.[, .SD[.[[private[["issue.colname"]]]] <= ..issue]]} %>>% + filter(.[[private[["issue.colname"]]]] <= .env[["issue"]]) %>>% + return() } ) ) - diff --git a/R/slide.R b/R/slide.R index 5d71720db..021ad72dc 100644 --- a/R/slide.R +++ b/R/slide.R @@ -1,3 +1,7 @@ +#' @source epi_tibble.R +#' @source epi_tibble_archive.R +NULL + #' Slide a function over variables in an `epi_tibble` object #' #' Slides a given function over the variables in an `epi_tibble` object. See the @@ -54,9 +58,13 @@ epi_slide = function(x, slide_fun, n = 14, new_col_name = "slide_value", new_col_type = c("dbl", "int", "lgl", "chr", "list"), time_step, ...) 
{ - # Check we have an `epi_tibble` object - if (!inherits(x, "epi_tibble")) abort("`x` be of class `epi_tibble`.") + UseMethod("epi_slide") +} +#' @export +epi_slide.epi_tibble = function(x, slide_fun, n = 14, new_col_name = "slide_value", + new_col_type = c("dbl", "int", "lgl", "chr", "list"), + time_step, ...) { # Which slide_index function? new_col_type = match.arg(new_col_type) slide_index_zzz = switch(new_col_type, @@ -94,3 +102,58 @@ epi_slide = function(x, slide_fun, n = 14, new_col_name = "slide_value", attributes(x)$metadata = metadata return(x) } + +#' @importFrom rlang !!! !! := +#' @importFrom pipeR %>>% +#' @export +epi_slide.epi_tibble_archive = function(x, slide_fun, n = 14, new_col_name = "slide_value", + new_col_type = c("dbl", "int", "lgl", "chr", "list"), + time_step, + issue_step, + issue_to_max_time_value = identity, + issue_range = range(x$issues_with_updates(), x$max_issue()), + ...) { + ## TODO test this. + + ## Which map function? + new_col_type = match.arg(new_col_type) + map_zzz = switch(new_col_type, + "dbl" = purrr::map_dbl, + "int" = purrr::map_int, + "lgl" = purrr::map_lgl, + "chr" = purrr::map_chr, + "list" = purrr::map) + + ## It'd be natural to write + ## ``` + ## seq(issue_range[[1L]], issue_range[[2L]], issue_step(1)) + ## ``` + ## but this produces an error, as do `seq` approaches directly on `Period` objects. + n.issues = diff(issue_range) + issues = issue_range[[1L]] + issue_step(0:as.integer(lubridate::as.period(diff(issue_range))/issue_step(1L))) + + ## XXX use x$naming_info()$issue.colname? + + ## TODO detect skipped empty data, e.g., when trying to produce results + ## expecting 0 initial report latency; warn, error, or produce results + + tibble::tibble(issue := issues) %>>% + tibble::add_column( + results.for.issue = . 
%>>% + dplyr::pull(issue) %>>% + purrr::map(function(issue) { + max_time_value = issue_to_max_time_value(issue) + min_time_value = max_time_value - time_step(n-1L) + x$epi_tibble_as_of(issue) %>>% + ## XXX we may want to `complete` the `time_value`s here + dplyr::filter(min_time_value <= time_value & time_value <= max_time_value) %>>% + dplyr::group_by(geo_value, !!!x$naming_info()$other.key.colnames) %>>% + dplyr::group_nest(.key="data") %>>% + ## tibble::add_column doesn't seem to work here for some reason + dplyr::mutate(!!new_col_name := map_zzz(.data[["data"]], slide_fun)) %>>% + tidyr::unnest(data) + }) + ) %>>% + tidyr::unnest(results.for.issue) %>>% + dplyr::group_by(issue, geo_value, !!!x$naming_info()$other.key.colnames) +} diff --git a/man/epi_tibble_archive.Rd b/man/epi_tibble_archive.Rd index 92b319144..5fcd5390e 100644 --- a/man/epi_tibble_archive.Rd +++ b/man/epi_tibble_archive.Rd @@ -103,6 +103,10 @@ if (FALSE) { \itemize{ \item \href{#method-new}{\code{epi_tibble_archive$new()}} \item \href{#method-epi_tibble_as_of}{\code{epi_tibble_archive$epi_tibble_as_of()}} +\item \href{#method-naming_info}{\code{epi_tibble_archive$naming_info()}} +\item \href{#method-max_issue}{\code{epi_tibble_archive$max_issue()}} +\item \href{#method-issues_with_updates}{\code{epi_tibble_archive$issues_with_updates()}} +\item \href{#method-update_DT_as_of}{\code{epi_tibble_archive$update_DT_as_of()}} \item \href{#method-clone}{\code{epi_tibble_archive$clone()}} } } @@ -164,6 +168,58 @@ an \code{epi_tibble} with data as of the specified issue time, \code{issue} reco } } \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-naming_info}{}}} +\subsection{Method \code{naming_info()}}{ +Return the name settings in a list +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{epi_tibble_archive$naming_info()}\if{html}{\out{
}} +} + +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-max_issue}{}}} +\subsection{Method \code{max_issue()}}{ +Return the max issue value recorded by this archive (whether it had updates or not) +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{epi_tibble_archive$max_issue()}\if{html}{\out{
}} +} + +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-issues_with_updates}{}}} +\subsection{Method \code{issues_with_updates()}}{ +Return the issue values for which updates are +recorded in this archive (that is, whether they had updates in +the data frame used to form this archive, regardless of whether +those "updates" actually added or revised any data) +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{epi_tibble_archive$issues_with_updates()}\if{html}{\out{
}} +} + +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-update_DT_as_of}{}}} +\subsection{Method \code{update_DT_as_of()}}{ +Return the recorded update data up through the given issue +value, inside a \code{data.table} object which is fine to +modify without copying. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{epi_tibble_archive$update_DT_as_of(issue)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{issue}}{the max issue value that should appear in the result} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-clone}{}}} \subsection{Method \code{clone()}}{ From 2296916852e460b1086522b95b7535ede16adb7f Mon Sep 17 00:00:00 2001 From: Ryan Tibshirani Date: Tue, 25 Jan 2022 13:19:20 -0500 Subject: [PATCH 03/18] epi_tibble -> epi_df --- R/epi_tibble_archive.R | 52 +++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/R/epi_tibble_archive.R b/R/epi_tibble_archive.R index 11a64d33f..b0b305023 100644 --- a/R/epi_tibble_archive.R +++ b/R/epi_tibble_archive.R @@ -6,17 +6,17 @@ ## use `data.table::` everywhere and not importing things. .datatable.aware = TRUE -#' Archive (data version history) for an \code{epi_tibble} +#' Archive (data version history) for an `epi_df` object #' -#' Contains version history for an \code{epi_tibble}, and enables fast querying -#' of snapshots of the \code{epi_tibble} as of certain "issues" (versions). +#' Contains version history for an `epi_df` object, and enables fast querying +#' of snapshots of the `epi_df` object as of certain "issues" (versions). #' Version history can be input as a data frame combining full snapshots of the -#' `epi_tibble` as of several issue times, or using only the newly added or +#' `epi_df` as of several issue times, or using only the newly added or #' revised rows for each issue, or using some combination of these two #' (including "updates" for things that didn't actually change). #' Last-observation-carried-forward (LOCF) is used to data in between recorded #' updates. Currently, deletions must be represented as revising a row to a -#' special state (e.g., making the entries \code{NA} or including a special +#' special state (e.g., making the entries `NA` or including a special #' column that flags the data as removed and performing post-processing), and #' the archive is unaware of what this state is. 
#' @@ -46,21 +46,21 @@ #' #' ## update.df actually contains update data through issue 2021-01-11, but the #' ## data set was not reported to change from 2021-01-10 to 2021-01-11 -#' epi.tibble.archive = epi_tibble_archive$new(update.df, max.issue=as.Date("2021-01-11")) +#' epi.tibble.archive = epi_df_archive$new(update.df, max.issue=as.Date("2021-01-11")) #' #' ## The snapshot as of issue 2021-01-03 just looks like the updates in issue #' ## 2021-01-03, because all past measurements were updated in this issue (we #' ## don't need to do any LOCF to obtain the snapshot). -#' epi.tibble.archive$epi_tibble_as_of(as.Date("2021-01-03")) +#' epi.tibble.archive$epi_df_as_of(as.Date("2021-01-03")) #' #' ## The snapshot as of issue 2021-01-05 uses LOCF on the first two geo-time #' ## combinations. Note that there is no entry for `time_value` 2021-01-05, as the #' ## initial version of this data was not available until issue 2021-01-06. -#' epi.tibble.archive$epi_tibble_as_of(as.Date("2021-01-05")) +#' epi.tibble.archive$epi_df_as_of(as.Date("2021-01-05")) #' #' ## The snapshot as of issue 2021-01-06 does include the measurement for #' ## `time_value` 2021-01-05. 
-#' epi.tibble.archive$epi_tibble_as_of(as.Date("2021-01-06")) +#' epi.tibble.archive$epi_df_as_of(as.Date("2021-01-06")) #' #' ## (Don't automatically run this example as it involves network access and querying the API) #' if (FALSE) { @@ -85,14 +85,14 @@ #' as_of = 20201028) %>% #' delphi.epidata::fetch_tbl() #' -#' epi.tibble.archive.2 = epi_tibble_archive$new(update.df.2) +#' epi.tibble.archive.2 = epi_df_archive$new(update.df.2) #' all.equal( -#' as_tibble(epi.tibble.archive.2$epi_tibble_as_of(as.Date("2020-10-14"))), -#' as_tibble(as.epi_tibble(snapshot.df.2a)), +#' as_tibble(epi.tibble.archive.2$epi_df_as_of(as.Date("2020-10-14"))), +#' as_tibble(as.epi_df(snapshot.df.2a)), #' check.attributes = FALSE) #' all.equal( -#' as_tibble(epi.tibble.archive.2$epi_tibble_as_of(as.Date("2020-10-28"))), -#' as_tibble(as.epi_tibble(snapshot.df.2b)), +#' as_tibble(epi.tibble.archive.2$epi_df_as_of(as.Date("2020-10-28"))), +#' as_tibble(as.epi_df(snapshot.df.2b)), #' check.attributes = FALSE) #' } #' @@ -103,8 +103,8 @@ #' @importFrom data.table as.data.table #' @importFrom pipeR %>>% #' @export -epi_tibble_archive = - R6::R6Class("epi_tibble_archive", +epi_df_archive = + R6::R6Class("epi_df_archive", private = list( update.DT = NULL, max.issue = NULL, @@ -115,14 +115,14 @@ epi_tibble_archive = ), public = list( #' @description - #' Create a new \code{epi_tibble_archive} with the given update data. + #' Create a new \code{epi_df_archive} with the given update data. 
#' @param update.df the update data #' @param issue.colname name of the column with the issue time of the corresponding updates; operations such as \code{sort}, \code{<=}, and \code{max} must make sense on this column, with earlier issues "less than" later issues - #' @param geo.colname the name of the column that will become \code{geo_value} in the \code{epi_tibble} - #' @param time.colname the name of the column that will become \code{time_value} in the \code{epi_tibble} + #' @param geo.colname the name of the column that will become \code{geo_value} in the \code{epi_df} + #' @param time.colname the name of the column that will become \code{time_value} in the \code{epi_df} #' @param other.key.colnames the names of any other columns that would be used to index a single measurement in this data set, such as the age group the measurement corresponds to (if the data set includes an age group breakdown); there should only be a single row per issue-geo-time-other-key-cols combination. #' @param max.issue the latest issue for which update data was available; defaults to the maximum issue time in the \code{update.df}, but if there were no additions or revisions in subsequent issues, it could be later. However, due to details regarding database replica syncing times in upstream APIs, using the default might be safer than whatever we think the max issue should be. 
- #' @return an \code{epi_tibble_archive} object + #' @return an \code{epi_df_archive} object initialize = function(update.df, issue.colname = "issue", geo.colname = "geo_value", @@ -152,10 +152,10 @@ epi_tibble_archive = private[["other.key.colnames"]] <- other.key.colnames }, #' @description - #' Get the \code{epi_tibble} as of some issue time + #' Get the \code{epi_df} as of some issue time #' @param issue the desired as-of issue time - #' @return an \code{epi_tibble} with data as of the specified issue time, \code{issue} recorded in the metadata, the geo column renamed to \code{geo_value} and time column to \code{time_value}, and the other key colnames recorded in the metadata - epi_tibble_as_of = function(issue) { + #' @return an \code{epi_df} with data as of the specified issue time, \code{issue} recorded in the metadata, the geo column renamed to \code{geo_value} and time column to \code{time_value}, and the other key colnames recorded in the metadata + epi_df_as_of = function(issue) { assert_that(is_scalar_atomic(issue) && identical(class(issue), class(private[["max.issue"]]))) assert_that(issue <= private[["max.issue"]]) if (issue == max(private[["update.DT"]][[private[["issue.colname"]]]])) { @@ -166,7 +166,7 @@ epi_tibble_archive = ## user's indication and use `private$max.issue` and let ## them deal with potential strange cases with replicas ## being out of date) - warn('Getting epi_tibble as of the latest issue with recorded change data; it is possible that we have a preliminary version of this issue, the upstream source has updated it, and we have not seen those updates yet due to them not being published yet, or potentially due to latency in synchronization of upstream database replicas. 
Thus, the epi_tibble snapshot that we produce here might not be reproducible at later times when we use an archive with fresher data.') + warn('Getting epi_df as of the latest issue with recorded change data; it is possible that we have a preliminary version of this issue, the upstream source has updated it, and we have not seen those updates yet due to them not being published yet, or potentially due to latency in synchronization of upstream database replicas. Thus, the epi_df snapshot that we produce here might not be reproducible at later times when we use an archive with fresher data.') } ## -- end of input validation -- private[["update.DT"]] %>>% @@ -180,7 +180,7 @@ epi_tibble_archive = geo_value = !!private[["geo.colname"]], time_value = !!private[["time.colname"]], ) %>>% - as.epi_tibble(issue = issue, + as.epi_df(issue = issue, additional_metadata = list(other.key.colnames = private[["other.key.colnames"]])) %>>% return() }, @@ -225,7 +225,7 @@ epi_tibble_archive = ## user's indication and use `private$max.issue` and let ## them deal with potential strange cases with replicas ## being out of date) - warn('Getting epi_tibble as of the latest issue with recorded change data; it is possible that we have a preliminary version of this issue, the upstream source has updated it, and we have not seen those updates yet due to them not being published yet, or potentially due to latency in synchronization of upstream database replicas. Thus, the epi_tibble snapshot that we produce here might not be reproducible at later times when we use an archive with fresher data.') + warn('Getting epi_df as of the latest issue with recorded change data; it is possible that we have a preliminary version of this issue, the upstream source has updated it, and we have not seen those updates yet due to them not being published yet, or potentially due to latency in synchronization of upstream database replicas. 
Thus, the epi_df snapshot that we produce here might not be reproducible at later times when we use an archive with fresher data.') } private[["update.DT"]] %>>% ## {.[, .SD[.[[private[["issue.colname"]]]] <= ..issue]]} %>>% From 22c61a3a511170453427de1ded78f6a49ade5271 Mon Sep 17 00:00:00 2001 From: Ryan Tibshirani Date: Thu, 27 Jan 2022 06:29:57 -0500 Subject: [PATCH 04/18] Assorted cleanup - Renamed `epi_df_archive` to `epi_archive` - Started updating documentation for `epi_slide()` - Rebuilt documentation --- DESCRIPTION | 2 +- NAMESPACE | 7 +- R/{epi_tibble_archive.R => epi_archive.R} | 12 +-- R/slide.R | 53 +++++++----- man/{epi_tibble_archive.Rd => epi_archive.Rd} | 82 +++++++++---------- man/epi_slide.Rd | 11 ++- 6 files changed, 92 insertions(+), 75 deletions(-) rename R/{epi_tibble_archive.R => epi_archive.R} (97%) rename man/{epi_tibble_archive.Rd => epi_archive.Rd} (71%) diff --git a/DESCRIPTION b/DESCRIPTION index 6b9756d66..3b0aeae70 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -45,4 +45,4 @@ Imports: Suggests: delphi.epidata Remotes: - github:cmu-delphi/delphi-epidata-r \ No newline at end of file + github:cmu-delphi/delphi-epidata-r diff --git a/NAMESPACE b/NAMESPACE index fdf8b4f0c..39e6b2b7b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,8 @@ S3method(as.epi_df,data.frame) S3method(as.epi_df,epi_df) S3method(as.epi_df,tibble) +S3method(epi_slide,epi_archive) +S3method(epi_slide,epi_df) S3method(group_by,epi_df) S3method(head,epi_df) S3method(print,epi_df) @@ -17,23 +19,25 @@ export(Min) export(Start) export(Sum) export(as.epi_df) +export(epi_archive) export(epi_cor) export(epi_detect_outlr) export(epi_detect_outlr_rm) export(epi_detect_outlr_stl) export(epi_slide) -export(epi_tibble_archive) export(estimate_deriv) export(pct_change) export(quiet) importFrom(assertthat,assert_that) importFrom(data.table,as.data.table) importFrom(dplyr,arrange) +importFrom(dplyr,case_when) importFrom(dplyr,filter) importFrom(dplyr,group_by) 
importFrom(dplyr,group_modify) importFrom(dplyr,mutate) importFrom(dplyr,pull) +importFrom(dplyr,rename) importFrom(dplyr,select) importFrom(dplyr,summarize) importFrom(dplyr,transmute) @@ -68,7 +72,6 @@ importFrom(stats,median) importFrom(stats,predict) importFrom(stats,smooth.spline) importFrom(tibble,as_tibble) -importFrom(tidyr,drop_na) importFrom(tidyselect,all_of) importFrom(tidyselect,ends_with) importFrom(utils,head) diff --git a/R/epi_tibble_archive.R b/R/epi_archive.R similarity index 97% rename from R/epi_tibble_archive.R rename to R/epi_archive.R index b0b305023..cd99a5280 100644 --- a/R/epi_tibble_archive.R +++ b/R/epi_archive.R @@ -46,7 +46,7 @@ #' #' ## update.df actually contains update data through issue 2021-01-11, but the #' ## data set was not reported to change from 2021-01-10 to 2021-01-11 -#' epi.tibble.archive = epi_df_archive$new(update.df, max.issue=as.Date("2021-01-11")) +#' epi.tibble.archive = epi_archive$new(update.df, max.issue=as.Date("2021-01-11")) #' #' ## The snapshot as of issue 2021-01-03 just looks like the updates in issue #' ## 2021-01-03, because all past measurements were updated in this issue (we @@ -85,7 +85,7 @@ #' as_of = 20201028) %>% #' delphi.epidata::fetch_tbl() #' -#' epi.tibble.archive.2 = epi_df_archive$new(update.df.2) +#' epi.tibble.archive.2 = epi_archive$new(update.df.2) #' all.equal( #' as_tibble(epi.tibble.archive.2$epi_df_as_of(as.Date("2020-10-14"))), #' as_tibble(as.epi_df(snapshot.df.2a)), @@ -103,8 +103,8 @@ #' @importFrom data.table as.data.table #' @importFrom pipeR %>>% #' @export -epi_df_archive = - R6::R6Class("epi_df_archive", +epi_archive = + R6::R6Class("epi_archive", private = list( update.DT = NULL, max.issue = NULL, @@ -115,14 +115,14 @@ epi_df_archive = ), public = list( #' @description - #' Create a new \code{epi_df_archive} with the given update data. + #' Create a new \code{epi_archive} with the given update data. 
#' @param update.df the update data #' @param issue.colname name of the column with the issue time of the corresponding updates; operations such as \code{sort}, \code{<=}, and \code{max} must make sense on this column, with earlier issues "less than" later issues #' @param geo.colname the name of the column that will become \code{geo_value} in the \code{epi_df} #' @param time.colname the name of the column that will become \code{time_value} in the \code{epi_df} #' @param other.key.colnames the names of any other columns that would be used to index a single measurement in this data set, such as the age group the measurement corresponds to (if the data set includes an age group breakdown); there should only be a single row per issue-geo-time-other-key-cols combination. #' @param max.issue the latest issue for which update data was available; defaults to the maximum issue time in the \code{update.df}, but if there were no additions or revisions in subsequent issues, it could be later. However, due to details regarding database replica syncing times in upstream APIs, using the default might be safer than whatever we think the max issue should be. - #' @return an \code{epi_df_archive} object + #' @return an \code{epi_archive} object initialize = function(update.df, issue.colname = "issue", geo.colname = "geo_value", diff --git a/R/slide.R b/R/slide.R index 6929adcef..cc583399f 100644 --- a/R/slide.R +++ b/R/slide.R @@ -1,11 +1,9 @@ -##' @source epi_df.R -##' @source epi_df.R -#NULL - #' Slide a function over variables in an `epi_df` object #' -#' Slides a given function over variables in an `epi_df` object. See the [slide +#' Slides a given function over variables in an `epi_df` object. See the [slide #' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for +#' examples. Also applies to an `epi_archive` object; see the [issues +#' vignette](https://cmu-delphi.github.io/epiprocess/articles/issues.html) for #' examples. 
#' #' @details To "slide" means to apply a function or formula over a running @@ -16,6 +14,9 @@ #' `time_step` argument (which if specified would override the default choice #' based on the metadata). #' +#' The critical difference between sliding over an `epi_df` versus `epi_archive` +#' object is that, with the latter, TODO +#' #' If `f` is missing, then an expression for tidy evaluation can be specified, #' for example, as in: #' ``` @@ -31,7 +32,7 @@ #' inferred from the given expression and overrides any name passed explicitly #' through the `new_col_name` argument. #' -#' @param x The `epi_df` object under consideration. +#' @param x The `epi_df` or `epi_archive` object under consideration. #' @param f Function or formula to slide over variables in `x`. To "slide" means #' to apply a function or formula over a running window of `n` time steps #' (where one time step is typically one day or one week; see details for more @@ -78,22 +79,26 @@ #' fashion (for example, per geo value), we can use `group_by()` before the #' call to `epi_slide()`. #' -#' @importFrom dplyr arrange group_modify mutate pull summarize -#' @importFrom lubridate days weeks -#' @importFrom rlang .data abort enquo enquos #' @export epi_slide = function(x, f, ..., n = 14, align = c("right", "center", "left"), before, complete = FALSE, new_col_name = "slide_value", new_col_type = c("dbl", "int", "lgl", "chr", "list"), - time_step, ...) { + time_step) { UseMethod("epi_slide") } +#' @method epi_slide epi_df +#' @importFrom dplyr arrange group_modify mutate pull summarize +#' @importFrom lubridate days weeks +#' @importFrom rlang .data abort enquo enquos #' @export -epi_slide.epi_df = function(x, f, ..., n = 14, align = c("right", "center", "left"), - before, complete = FALSE, new_col_name = "slide_value", - new_col_type = c("dbl", "int", "lgl", "chr", "list"), - time_step, ...) 
{ +epi_slide.epi_df = function(x, f, ..., n = 14, + align = c("right", "center", "left"), + before, complete = FALSE, + new_col_name = "slide_value", + new_col_type = c("dbl", "int", "lgl", "chr", + "list"), + time_step) { # Which slide_index function? new_col_type = match.arg(new_col_type) index_fun = switch(new_col_type, @@ -193,13 +198,17 @@ epi_slide_one_grp = function(.data_group, index_fun, f, ..., before_num, #' @importFrom rlang !!! !! := #' @importFrom pipeR %>>% #' @export -epi_slide.epi_df_archive = function(x, slide_fun, n = 14, new_col_name = "slide_value", - new_col_type = c("dbl", "int", "lgl", "chr", "list"), - time_step, - issue_step, - issue_to_max_time_value = identity, - issue_range = range(x$issues_with_updates(), x$max_issue()), - ...) { +epi_slide.epi_archive = function(x, f, ..., n = 14, + align = c("right", "center", "left"), + before, complete = FALSE, + new_col_name = "slide_value", + new_col_type = c("dbl", "int", "lgl", "chr", + "list"), + time_step, + issue_step, + issue_to_max_time_value = identity, + issue_range = range(x$issues_with_updates(), + x$max_issue())) { ## TODO test this. ## Which map function? 
@@ -243,4 +252,4 @@ epi_slide.epi_df_archive = function(x, slide_fun, n = 14, new_col_name = "slide_ ) %>>% tidyr::unnest(results.for.issue) %>>% dplyr::group_by(issue, geo_value, !!!x$naming_info()$other.key.colnames) -} \ No newline at end of file +} diff --git a/man/epi_tibble_archive.Rd b/man/epi_archive.Rd similarity index 71% rename from man/epi_tibble_archive.Rd rename to man/epi_archive.Rd index 5fcd5390e..ffd6e39b3 100644 --- a/man/epi_tibble_archive.Rd +++ b/man/epi_archive.Rd @@ -1,18 +1,18 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/epi_tibble_archive.R -\name{epi_tibble_archive} -\alias{epi_tibble_archive} -\title{Archive (data version history) for an \code{epi_tibble}} +% Please edit documentation in R/epi_archive.R +\name{epi_archive} +\alias{epi_archive} +\title{Archive (data version history) for an \code{epi_df} object} \description{ -Archive (data version history) for an \code{epi_tibble} +Archive (data version history) for an \code{epi_df} object -Archive (data version history) for an \code{epi_tibble} +Archive (data version history) for an \code{epi_df} object } \details{ -Contains version history for an \code{epi_tibble}, and enables fast querying -of snapshots of the \code{epi_tibble} as of certain "issues" (versions). +Contains version history for an \code{epi_df} object, and enables fast querying +of snapshots of the \code{epi_df} object as of certain "issues" (versions). Version history can be input as a data frame combining full snapshots of the -\code{epi_tibble} as of several issue times, or using only the newly added or +\code{epi_df} as of several issue times, or using only the newly added or revised rows for each issue, or using some combination of these two (including "updates" for things that didn't actually change). 
Last-observation-carried-forward (LOCF) is used to data in between recorded @@ -47,21 +47,21 @@ update.df = ## update.df actually contains update data through issue 2021-01-11, but the ## data set was not reported to change from 2021-01-10 to 2021-01-11 -epi.tibble.archive = epi_tibble_archive$new(update.df, max.issue=as.Date("2021-01-11")) +epi.tibble.archive = epi_archive$new(update.df, max.issue=as.Date("2021-01-11")) ## The snapshot as of issue 2021-01-03 just looks like the updates in issue ## 2021-01-03, because all past measurements were updated in this issue (we ## don't need to do any LOCF to obtain the snapshot). -epi.tibble.archive$epi_tibble_as_of(as.Date("2021-01-03")) +epi.tibble.archive$epi_df_as_of(as.Date("2021-01-03")) ## The snapshot as of issue 2021-01-05 uses LOCF on the first two geo-time ## combinations. Note that there is no entry for `time_value` 2021-01-05, as the ## initial version of this data was not available until issue 2021-01-06. -epi.tibble.archive$epi_tibble_as_of(as.Date("2021-01-05")) +epi.tibble.archive$epi_df_as_of(as.Date("2021-01-05")) ## The snapshot as of issue 2021-01-06 does include the measurement for ## `time_value` 2021-01-05. 
-epi.tibble.archive$epi_tibble_as_of(as.Date("2021-01-06")) +epi.tibble.archive$epi_df_as_of(as.Date("2021-01-06")) ## (Don't automatically run this example as it involves network access and querying the API) if (FALSE) { @@ -86,14 +86,14 @@ if (FALSE) { as_of = 20201028) \%>\% delphi.epidata::fetch_tbl() - epi.tibble.archive.2 = epi_tibble_archive$new(update.df.2) + epi.tibble.archive.2 = epi_archive$new(update.df.2) all.equal( - as_tibble(epi.tibble.archive.2$epi_tibble_as_of(as.Date("2020-10-14"))), - as_tibble(as.epi_tibble(snapshot.df.2a)), + as_tibble(epi.tibble.archive.2$epi_df_as_of(as.Date("2020-10-14"))), + as_tibble(as.epi_df(snapshot.df.2a)), check.attributes = FALSE) all.equal( - as_tibble(epi.tibble.archive.2$epi_tibble_as_of(as.Date("2020-10-28"))), - as_tibble(as.epi_tibble(snapshot.df.2b)), + as_tibble(epi.tibble.archive.2$epi_df_as_of(as.Date("2020-10-28"))), + as_tibble(as.epi_df(snapshot.df.2b)), check.attributes = FALSE) } @@ -101,22 +101,22 @@ if (FALSE) { \section{Methods}{ \subsection{Public methods}{ \itemize{ -\item \href{#method-new}{\code{epi_tibble_archive$new()}} -\item \href{#method-epi_tibble_as_of}{\code{epi_tibble_archive$epi_tibble_as_of()}} -\item \href{#method-naming_info}{\code{epi_tibble_archive$naming_info()}} -\item \href{#method-max_issue}{\code{epi_tibble_archive$max_issue()}} -\item \href{#method-issues_with_updates}{\code{epi_tibble_archive$issues_with_updates()}} -\item \href{#method-update_DT_as_of}{\code{epi_tibble_archive$update_DT_as_of()}} -\item \href{#method-clone}{\code{epi_tibble_archive$clone()}} +\item \href{#method-new}{\code{epi_archive$new()}} +\item \href{#method-epi_df_as_of}{\code{epi_archive$epi_df_as_of()}} +\item \href{#method-naming_info}{\code{epi_archive$naming_info()}} +\item \href{#method-max_issue}{\code{epi_archive$max_issue()}} +\item \href{#method-issues_with_updates}{\code{epi_archive$issues_with_updates()}} +\item \href{#method-update_DT_as_of}{\code{epi_archive$update_DT_as_of()}} +\item 
\href{#method-clone}{\code{epi_archive$clone()}} } } \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-new}{}}} \subsection{Method \code{new()}}{ -Create a new \code{epi_tibble_archive} with the given update data. +Create a new \code{epi_archive} with the given update data. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_tibble_archive$new( +\if{html}{\out{
}}\preformatted{epi_archive$new( update.df, issue.colname = "issue", geo.colname = "geo_value", @@ -133,9 +133,9 @@ Create a new \code{epi_tibble_archive} with the given update data. \item{\code{issue.colname}}{name of the column with the issue time of the corresponding updates; operations such as \code{sort}, \code{<=}, and \code{max} must make sense on this column, with earlier issues "less than" later issues} -\item{\code{geo.colname}}{the name of the column that will become \code{geo_value} in the \code{epi_tibble}} +\item{\code{geo.colname}}{the name of the column that will become \code{geo_value} in the \code{epi_df}} -\item{\code{time.colname}}{the name of the column that will become \code{time_value} in the \code{epi_tibble}} +\item{\code{time.colname}}{the name of the column that will become \code{time_value} in the \code{epi_df}} \item{\code{other.key.colnames}}{the names of any other columns that would be used to index a single measurement in this data set, such as the age group the measurement corresponds to (if the data set includes an age group breakdown); there should only be a single row per issue-geo-time-other-key-cols combination.} @@ -144,16 +144,16 @@ Create a new \code{epi_tibble_archive} with the given update data. \if{html}{\out{
}} } \subsection{Returns}{ -an \code{epi_tibble_archive} object +an \code{epi_archive} object } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-epi_tibble_as_of}{}}} -\subsection{Method \code{epi_tibble_as_of()}}{ -Get the \code{epi_tibble} as of some issue time +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_df_as_of}{}}} +\subsection{Method \code{epi_df_as_of()}}{ +Get the \code{epi_df} as of some issue time \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_tibble_archive$epi_tibble_as_of(issue)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{epi_archive$epi_df_as_of(issue)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -164,7 +164,7 @@ Get the \code{epi_tibble} as of some issue time \if{html}{\out{
}} } \subsection{Returns}{ -an \code{epi_tibble} with data as of the specified issue time, \code{issue} recorded in the metadata, the geo column renamed to \code{geo_value} and time column to \code{time_value}, and the other key colnames recorded in the metadata +an \code{epi_df} with data as of the specified issue time, \code{issue} recorded in the metadata, the geo column renamed to \code{geo_value} and time column to \code{time_value}, and the other key colnames recorded in the metadata } } \if{html}{\out{
}} @@ -173,7 +173,7 @@ an \code{epi_tibble} with data as of the specified issue time, \code{issue} reco \subsection{Method \code{naming_info()}}{ Return the name settings in a list \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_tibble_archive$naming_info()}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{epi_archive$naming_info()}\if{html}{\out{
}} } } @@ -183,7 +183,7 @@ Return the name settings in a list \subsection{Method \code{max_issue()}}{ Return the max issue value recorded by this archive (whether it had updates or not) \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_tibble_archive$max_issue()}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{epi_archive$max_issue()}\if{html}{\out{
}} } } @@ -196,7 +196,7 @@ recorded in this archive (that is, whether they had updates in the data frame used to form this archive, regardless of whether those "updates" actually added or revised any data) \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_tibble_archive$issues_with_updates()}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{epi_archive$issues_with_updates()}\if{html}{\out{
}} } } @@ -208,7 +208,7 @@ Return the recorded update data up through the given issue value, inside a \code{data.table} object which is fine to modify without copying. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_tibble_archive$update_DT_as_of(issue)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{epi_archive$update_DT_as_of(issue)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -225,7 +225,7 @@ modify without copying. \subsection{Method \code{clone()}}{ The objects of this class are cloneable with this method. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_tibble_archive$clone(deep = FALSE)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{epi_archive$clone(deep = FALSE)}\if{html}{\out{
}} } \subsection{Arguments}{ diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 2fbe8664b..9ef0f8fd4 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/slide.R \name{epi_slide} \alias{epi_slide} -\title{Slide a function over variables in an \code{epi_df} object} +\title{Slide a function over variables in an \code{epi_df} or \code{epi_archive} object} \usage{ epi_slide( x, @@ -18,7 +18,7 @@ epi_slide( ) } \arguments{ -\item{x}{The \code{epi_df} object under consideration.} +\item{x}{The \code{epi_df} or \code{epi_archive} object under consideration.} \item{f}{Function or formula to slide over variables in \code{x}. To "slide" means to apply a function or formula over a running window of \code{n} time steps @@ -72,7 +72,9 @@ An \code{epi_df} object given by appending a new column to \code{x}, named according to the \code{new_col_name} argument, containing the slide values. } \description{ -Slides a given function over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for +Slides a given function over variables in an \code{epi_df} or \code{epi_archive} +object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} and +\href{https://cmu-delphi.github.io/epiprocess/articles/issues.html}{issues vignette} for examples. } \details{ @@ -84,6 +86,9 @@ determined by the \code{time_type} field in the metadata: the unit is one day if \code{time_step} argument (which if specified would override the default choice based on the metadata). 
+The critical difference between sliding over an \code{epi_df} versus \code{epi_archive} +object is that, with the latter, TODO + If \code{f} is missing, then an expression for tidy evaluation can be specified, for example, as in:\preformatted{epi_slide(x, cases_7dav = mean(cases), n = 7) } From e6acb171f1eb9e72c00149050944d7d699f7c259 Mon Sep 17 00:00:00 2001 From: Ryan Tibshirani Date: Thu, 27 Jan 2022 06:45:03 -0500 Subject: [PATCH 05/18] Little import fixes --- R/correlation.R | 2 +- R/epi_archive.R | 100 ++++++------------------------------------- R/epi_df.R | 1 + R/outliers.R | 4 +- R/slide.R | 4 +- man/epi_archive.Rd | 100 ++++++------------------------------------- man/epi_slide.Rd | 7 ++- vignettes/issues.Rmd | 79 +++++++++++++++++++++++++++++++++- 8 files changed, 112 insertions(+), 185 deletions(-) diff --git a/R/correlation.R b/R/correlation.R index 2864dff1a..3315114c7 100644 --- a/R/correlation.R +++ b/R/correlation.R @@ -38,7 +38,7 @@ #' #' @importFrom dplyr arrange mutate summarize #' @importFrom stats cor -#' @importFrom rlang .data enquo +#' @importFrom rlang .data !! enquo #' @export epi_cor = function(x, var1, var2, dt1 = 0, dt2 = 0, by = geo_value, use = "na.or.complete", diff --git a/R/epi_archive.R b/R/epi_archive.R index cd99a5280..fd482562d 100644 --- a/R/epi_archive.R +++ b/R/epi_archive.R @@ -8,93 +8,19 @@ #' Archive (data version history) for an `epi_df` object #' -#' Contains version history for an `epi_df` object, and enables fast querying -#' of snapshots of the `epi_df` object as of certain "issues" (versions). -#' Version history can be input as a data frame combining full snapshots of the -#' `epi_df` as of several issue times, or using only the newly added or -#' revised rows for each issue, or using some combination of these two -#' (including "updates" for things that didn't actually change). -#' Last-observation-carried-forward (LOCF) is used to data in between recorded -#' updates. 
Currently, deletions must be represented as revising a row to a -#' special state (e.g., making the entries `NA` or including a special -#' column that flags the data as removed and performing post-processing), and -#' the archive is unaware of what this state is. -#' -#' @examples -#' -#' update.df = -#' tibble::tribble( -#' ~geo_value, ~time_value, ~issue, ~value, -#' ## update history of geo1 for reference time 2021-01-01: -#' ## (1 day of latency in initial report) -#' "geo1", as.Date("2021-01-01"), as.Date("2021-01-02"), 5.0, -#' ## (revised upward) -#' "geo1", as.Date("2021-01-01"), as.Date("2021-01-03"), 9.0, -#' ## (revised upward) -#' "geo1", as.Date("2021-01-01"), as.Date("2021-01-10"), 9.2, -#' ## update history of geo1 for reference time 2021-01-02: -#' ## (1 day of latency in initial report) -#' "geo1", as.Date("2021-01-02"), as.Date("2021-01-03"), 8.0, -#' ## (redundant "update" row; we will already be using LOCF to fill in) -#' "geo1", as.Date("2021-01-02"), as.Date("2021-01-04"), 8.0, -#' ## (replaced with NA) -#' "geo1", as.Date("2021-01-02"), as.Date("2021-01-10"), NA_real_, -#' ## update history of geo1 for reference time 2021-01-05 (suppose data set skips the 3rd and 4th) -#' ## (1 day of latency in initial report) -#' "geo1", as.Date("2021-01-05"), as.Date("2021-01-06"), 13.0, -#' ) -#' -#' ## update.df actually contains update data through issue 2021-01-11, but the -#' ## data set was not reported to change from 2021-01-10 to 2021-01-11 -#' epi.tibble.archive = epi_archive$new(update.df, max.issue=as.Date("2021-01-11")) -#' -#' ## The snapshot as of issue 2021-01-03 just looks like the updates in issue -#' ## 2021-01-03, because all past measurements were updated in this issue (we -#' ## don't need to do any LOCF to obtain the snapshot). -#' epi.tibble.archive$epi_df_as_of(as.Date("2021-01-03")) -#' -#' ## The snapshot as of issue 2021-01-05 uses LOCF on the first two geo-time -#' ## combinations. 
Note that there is no entry for `time_value` 2021-01-05, as the -#' ## initial version of this data was not available until issue 2021-01-06. -#' epi.tibble.archive$epi_df_as_of(as.Date("2021-01-05")) -#' -#' ## The snapshot as of issue 2021-01-06 does include the measurement for -#' ## `time_value` 2021-01-05. -#' epi.tibble.archive$epi_df_as_of(as.Date("2021-01-06")) -#' -#' ## (Don't automatically run this example as it involves network access and querying the API) -#' if (FALSE) { -#' library(dplyr) -#' ## (delphi.epidata package is on GitHub in cmu-delphi/delphi-epidata-r) -#' update.df.2 = -#' delphi.epidata::covidcast("jhu-csse", "confirmed_incidence_num", -#' "day", "state", -#' delphi.epidata::epirange(12340101,34560101), c("ak","al"), -#' issues = delphi.epidata::epirange(12340101,34560101)) %>% -#' delphi.epidata::fetch_tbl() -#' snapshot.df.2a = -#' delphi.epidata::covidcast("jhu-csse", "confirmed_incidence_num", -#' "day", "state", -#' delphi.epidata::epirange(12340101,34560101), c("ak","al"), -#' as_of = 20201014) %>% -#' delphi.epidata::fetch_tbl() -#' snapshot.df.2b = -#' delphi.epidata::covidcast("jhu-csse", "confirmed_incidence_num", -#' "day", "state", -#' delphi.epidata::epirange(12340101,34560101), c("ak","al"), -#' as_of = 20201028) %>% -#' delphi.epidata::fetch_tbl() -#' -#' epi.tibble.archive.2 = epi_archive$new(update.df.2) -#' all.equal( -#' as_tibble(epi.tibble.archive.2$epi_df_as_of(as.Date("2020-10-14"))), -#' as_tibble(as.epi_df(snapshot.df.2a)), -#' check.attributes = FALSE) -#' all.equal( -#' as_tibble(epi.tibble.archive.2$epi_df_as_of(as.Date("2020-10-28"))), -#' as_tibble(as.epi_df(snapshot.df.2b)), -#' check.attributes = FALSE) -#' } +#' Contains version history for an `epi_df` object, and enables fast querying of +#' snapshots of the `epi_df` object as of certain "issues" (versions). 
Version +#' history can be input as a data frame combining full snapshots of the `epi_df` +#' as of several issue times, or using only the newly added or revised rows for +#' each issue, or using some combination of these two (including "updates" for +#' things that didn't actually change). Last-observation-carried-forward (LOCF) +#' is used to data in between recorded updates. Currently, deletions must be +#' represented as revising a row to a special state (e.g., making the entries +#' `NA` or including a special column that flags the data as removed and +#' performing post-processing), and the archive is unaware of what this state +#' is. See the [issues +#' vignette](https://cmu-delphi.github.io/epiprocess/articles/issues.html) for +#' examples. #' #' @importFrom assertthat assert_that #' @importFrom rlang is_scalar_character is_named is_character is_scalar_atomic !! warn diff --git a/R/epi_df.R b/R/epi_df.R index 846cb9188..ad0ca4f57 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -256,6 +256,7 @@ head.epi_df = function(x, ...) { #' @return No return value; called only to print summary statistics. #' #' @method summary epi_df +#' @importFrom rlang .data #' @importFrom stats median #' @export summary.epi_df = function(object, ...) { diff --git a/R/outliers.R b/R/outliers.R index 977b97475..86bfc6a89 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -50,7 +50,7 @@ #' @importFrom dplyr group_modify mutate select #' @importFrom purrr map pmap_dfc #' @importFrom tidyselect ends_with all_of -#' @importFrom rlang abort enquo +#' @importFrom rlang !! abort enquo #' @export epi_detect_outlr = function(x, var, methods = tibble( @@ -168,6 +168,7 @@ epi_detect_outlr_one_grp = function(.data_group, var, methods, combiner, #' `upper`, and `replacement`. #' #' @importFrom dplyr mutate pull select +#' @importFrom rlang !! 
#' @export epi_detect_outlr_rm = function(x, var, n = 21, log_transform = FALSE, @@ -247,6 +248,7 @@ epi_detect_outlr_rm = function(x, var, n = 21, #' @importFrom dplyr case_when mutate pull select transmute #' @importFrom fabletools model #' @importFrom feasts STL +#' @importFrom rlang !! #' @export epi_detect_outlr_stl = function(x, var, n_trend = 21, diff --git a/R/slide.R b/R/slide.R index cc583399f..602728312 100644 --- a/R/slide.R +++ b/R/slide.R @@ -90,7 +90,7 @@ epi_slide = function(x, f, ..., n = 14, align = c("right", "center", "left"), #' @method epi_slide epi_df #' @importFrom dplyr arrange group_modify mutate pull summarize #' @importFrom lubridate days weeks -#' @importFrom rlang .data abort enquo enquos +#' @importFrom rlang abort enquo enquos #' @export epi_slide.epi_df = function(x, f, ..., n = 14, align = c("right", "center", "left"), @@ -195,7 +195,7 @@ epi_slide_one_grp = function(.data_group, index_fun, f, ..., before_num, return(mutate(.data_group, !!new_col_name := slide_values)) } -#' @importFrom rlang !!! !! := +#' @importFrom rlang .data !! !!! := #' @importFrom pipeR %>>% #' @export epi_slide.epi_archive = function(x, f, ..., n = 14, diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index ffd6e39b3..a706029fa 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -9,94 +9,18 @@ Archive (data version history) for an \code{epi_df} object Archive (data version history) for an \code{epi_df} object } \details{ -Contains version history for an \code{epi_df} object, and enables fast querying -of snapshots of the \code{epi_df} object as of certain "issues" (versions). -Version history can be input as a data frame combining full snapshots of the -\code{epi_df} as of several issue times, or using only the newly added or -revised rows for each issue, or using some combination of these two -(including "updates" for things that didn't actually change). -Last-observation-carried-forward (LOCF) is used to data in between recorded -updates. 
Currently, deletions must be represented as revising a row to a -special state (e.g., making the entries \code{NA} or including a special -column that flags the data as removed and performing post-processing), and -the archive is unaware of what this state is. -} -\examples{ - -update.df = - tibble::tribble( - ~geo_value, ~time_value, ~issue, ~value, - ## update history of geo1 for reference time 2021-01-01: - ## (1 day of latency in initial report) - "geo1", as.Date("2021-01-01"), as.Date("2021-01-02"), 5.0, - ## (revised upward) - "geo1", as.Date("2021-01-01"), as.Date("2021-01-03"), 9.0, - ## (revised upward) - "geo1", as.Date("2021-01-01"), as.Date("2021-01-10"), 9.2, - ## update history of geo1 for reference time 2021-01-02: - ## (1 day of latency in initial report) - "geo1", as.Date("2021-01-02"), as.Date("2021-01-03"), 8.0, - ## (redundant "update" row; we will already be using LOCF to fill in) - "geo1", as.Date("2021-01-02"), as.Date("2021-01-04"), 8.0, - ## (replaced with NA) - "geo1", as.Date("2021-01-02"), as.Date("2021-01-10"), NA_real_, - ## update history of geo1 for reference time 2021-01-05 (suppose data set skips the 3rd and 4th) - ## (1 day of latency in initial report) - "geo1", as.Date("2021-01-05"), as.Date("2021-01-06"), 13.0, - ) - -## update.df actually contains update data through issue 2021-01-11, but the -## data set was not reported to change from 2021-01-10 to 2021-01-11 -epi.tibble.archive = epi_archive$new(update.df, max.issue=as.Date("2021-01-11")) - -## The snapshot as of issue 2021-01-03 just looks like the updates in issue -## 2021-01-03, because all past measurements were updated in this issue (we -## don't need to do any LOCF to obtain the snapshot). -epi.tibble.archive$epi_df_as_of(as.Date("2021-01-03")) - -## The snapshot as of issue 2021-01-05 uses LOCF on the first two geo-time -## combinations. 
Note that there is no entry for `time_value` 2021-01-05, as the -## initial version of this data was not available until issue 2021-01-06. -epi.tibble.archive$epi_df_as_of(as.Date("2021-01-05")) - -## The snapshot as of issue 2021-01-06 does include the measurement for -## `time_value` 2021-01-05. -epi.tibble.archive$epi_df_as_of(as.Date("2021-01-06")) - -## (Don't automatically run this example as it involves network access and querying the API) -if (FALSE) { - library(dplyr) - ## (delphi.epidata package is on GitHub in cmu-delphi/delphi-epidata-r) - update.df.2 = - delphi.epidata::covidcast("jhu-csse", "confirmed_incidence_num", - "day", "state", - delphi.epidata::epirange(12340101,34560101), c("ak","al"), - issues = delphi.epidata::epirange(12340101,34560101)) \%>\% - delphi.epidata::fetch_tbl() - snapshot.df.2a = - delphi.epidata::covidcast("jhu-csse", "confirmed_incidence_num", - "day", "state", - delphi.epidata::epirange(12340101,34560101), c("ak","al"), - as_of = 20201014) \%>\% - delphi.epidata::fetch_tbl() - snapshot.df.2b = - delphi.epidata::covidcast("jhu-csse", "confirmed_incidence_num", - "day", "state", - delphi.epidata::epirange(12340101,34560101), c("ak","al"), - as_of = 20201028) \%>\% - delphi.epidata::fetch_tbl() - - epi.tibble.archive.2 = epi_archive$new(update.df.2) - all.equal( - as_tibble(epi.tibble.archive.2$epi_df_as_of(as.Date("2020-10-14"))), - as_tibble(as.epi_df(snapshot.df.2a)), - check.attributes = FALSE) - all.equal( - as_tibble(epi.tibble.archive.2$epi_df_as_of(as.Date("2020-10-28"))), - as_tibble(as.epi_df(snapshot.df.2b)), - check.attributes = FALSE) -} - +Contains version history for an \code{epi_df} object, and enables fast querying of +snapshots of the \code{epi_df} object as of certain "issues" (versions). 
Version +history can be input as a data frame combining full snapshots of the \code{epi_df} +as of several issue times, or using only the newly added or revised rows for +each issue, or using some combination of these two (including "updates" for +things that didn't actually change). Last-observation-carried-forward (LOCF) +is used to data in between recorded updates. Currently, deletions must be +represented as revising a row to a special state (e.g., making the entries +\code{NA} or including a special column that flags the data as removed and +performing post-processing), and the archive is unaware of what this state +is. See the \href{https://cmu-delphi.github.io/epiprocess/articles/issues.html}{issues vignette} for +examples. } \section{Methods}{ \subsection{Public methods}{ diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 9ef0f8fd4..b35a402c8 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/slide.R \name{epi_slide} \alias{epi_slide} -\title{Slide a function over variables in an \code{epi_df} or \code{epi_archive} object} +\title{Slide a function over variables in an \code{epi_df} object} \usage{ epi_slide( x, @@ -72,9 +72,8 @@ An \code{epi_df} object given by appending a new column to \code{x}, named according to the \code{new_col_name} argument, containing the slide values. } \description{ -Slides a given function over variables in an \code{epi_df} or \code{epi_archive} -object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} and -\href{https://cmu-delphi.github.io/epiprocess/articles/issues.html}{issues vignette} for +Slides a given function over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for +examples. Also applies to an \code{epi_archive} object; see the \href{https://cmu-delphi.github.io/epiprocess/articles/issues.html}{issues vignette} for examples. 
} \details{ diff --git a/vignettes/issues.Rmd b/vignettes/issues.Rmd index 93ac2f0ed..8bd8ca670 100644 --- a/vignettes/issues.Rmd +++ b/vignettes/issues.Rmd @@ -37,7 +37,82 @@ Slide stuff: * if an `epi_archive`, then for each working time value, it uses `as_of()` to only return the data you would have had at that time (and whose time value is withing `n` trailing time steps of the specified one) -- demo this in the vignette with the hand-built AR forecaster. the `epipred` +- demo this in the vignette with the hand-built AR forecaster. the `epi_predict` package can and still should be built, to contain more fancy forecasters, and potentially also scoring roles. but the functionality for *running* these will -be already built into `epiprocess`, just as a generalization of `epi_slide()` \ No newline at end of file +be already built into `epiprocess`, just as a generalization of `epi_slide()` + +```{r eval = FALSE} +update.df = + tibble::tribble( + ~geo_value, ~time_value, ~issue, ~value, + ## update history of geo1 for reference time 2021-01-01: + ## (1 day of latency in initial report) + "geo1", as.Date("2021-01-01"), as.Date("2021-01-02"), 5.0, + ## (revised upward) + "geo1", as.Date("2021-01-01"), as.Date("2021-01-03"), 9.0, + ## (revised upward) + "geo1", as.Date("2021-01-01"), as.Date("2021-01-10"), 9.2, + ## update history of geo1 for reference time 2021-01-02: + ## (1 day of latency in initial report) + "geo1", as.Date("2021-01-02"), as.Date("2021-01-03"), 8.0, + ## (redundant "update" row; we will already be using LOCF to fill in) + "geo1", as.Date("2021-01-02"), as.Date("2021-01-04"), 8.0, + ## (replaced with NA) + "geo1", as.Date("2021-01-02"), as.Date("2021-01-10"), NA_real_, + ## update history of geo1 for reference time 2021-01-05 (suppose data set skips the 3rd and 4th) + ## (1 day of latency in initial report) + "geo1", as.Date("2021-01-05"), as.Date("2021-01-06"), 13.0, + ) + +## update.df actually contains update data through issue 2021-01-11, but the 
+## data set was not reported to change from 2021-01-10 to 2021-01-11 +epi.tibble.archive = epi_archive$new(update.df, max.issue=as.Date("2021-01-11")) + +## The snapshot as of issue 2021-01-03 just looks like the updates in issue +## 2021-01-03, because all past measurements were updated in this issue (we +## don't need to do any LOCF to obtain the snapshot). +epi.tibble.archive$epi_df_as_of(as.Date("2021-01-03")) + +## The snapshot as of issue 2021-01-05 uses LOCF on the first two geo-time +## combinations. Note that there is no entry for `time_value` 2021-01-05, as the +## initial version of this data was not available until issue 2021-01-06. +epi.tibble.archive$epi_df_as_of(as.Date("2021-01-05")) + +## The snapshot as of issue 2021-01-06 does include the measurement for +## `time_value` 2021-01-05. +epi.tibble.archive$epi_df_as_of(as.Date("2021-01-06")) + +## (Don't automatically run this example as it involves network access and querying the API) +if (FALSE) { + library(dplyr) + ## (delphi.epidata package is on GitHub in cmu-delphi/delphi-epidata-r) + update.df.2 = + delphi.epidata::covidcast("jhu-csse", "confirmed_incidence_num", + "day", "state", + delphi.epidata::epirange(12340101,34560101), c("ak","al"), + issues = delphi.epidata::epirange(12340101,34560101)) %>% + delphi.epidata::fetch_tbl() + snapshot.df.2a = + delphi.epidata::covidcast("jhu-csse", "confirmed_incidence_num", + "day", "state", + delphi.epidata::epirange(12340101,34560101), c("ak","al"), + as_of = 20201014) %>% + delphi.epidata::fetch_tbl() + snapshot.df.2b = + delphi.epidata::covidcast("jhu-csse", "confirmed_incidence_num", + "day", "state", + delphi.epidata::epirange(12340101,34560101), c("ak","al"), + as_of = 20201028) %>% + delphi.epidata::fetch_tbl() + epi.tibble.archive.2 = epi_archive$new(update.df.2) + all.equal( + as_tibble(epi.tibble.archive.2$epi_df_as_of(as.Date("2020-10-14"))), + as_tibble(as.epi_df(snapshot.df.2a)), + check.attributes = FALSE) + all.equal( + 
as_tibble(epi.tibble.archive.2$epi_df_as_of(as.Date("2020-10-28"))), + as_tibble(as.epi_df(snapshot.df.2b)), + check.attributes = FALSE) +} +``` \ No newline at end of file From 1f0ca377d8ce3a120623e66258c362475193d9e6 Mon Sep 17 00:00:00 2001 From: Ryan Tibshirani Date: Mon, 31 Jan 2022 14:57:28 -0500 Subject: [PATCH 06/18] Add `epi_df()` function - This allows us to create an `epi_df` object directly from columns, just like `tibble()` or `data.frame()` - It also gives us a better place to point the documentation to, for describing the structure of an `epi_df` object (previously we were pointing to `as_epi_df()`, which was a bit awkward) - Rename `as.epi_df()` to `as_epi_df()` to be consistent with tibble and tsibble functions - Rebuild doc and vignettes --- DESCRIPTION | 11 + NAMESPACE | 9 +- R/epi_archive.R | 52 ++-- R/epi_df.R | 186 ++++++----- R/slide.R | 4 +- docs/404.html | 5 +- docs/articles/aggregation.html | 5 +- docs/articles/archive.html | 237 ++++++++++++++ .../header-attrs-2.10/header-attrs.js | 0 docs/articles/correlation.html | 7 +- .../figure-html/unnamed-chunk-2-1.png | Bin 131885 -> 131959 bytes .../figure-html/unnamed-chunk-3-1.png | Bin 205225 -> 205182 bytes .../figure-html/unnamed-chunk-4-1.png | Bin 89614 -> 89737 bytes .../figure-html/unnamed-chunk-5-1.png | Bin 79687 -> 79678 bytes docs/articles/derivative.html | 7 +- docs/articles/epiprocess.html | 55 ++-- .../figure-html/unnamed-chunk-6-1.png | Bin 119997 -> 119906 bytes docs/articles/index.html | 7 +- docs/articles/issues.html | 168 ---------- docs/articles/outliers.html | 7 +- docs/articles/pct_change.html | 7 +- docs/articles/slide.html | 83 +++-- .../figure-html/unnamed-chunk-10-1.png | Bin 240637 -> 240573 bytes .../figure-html/unnamed-chunk-4-1.png | Bin 173738 -> 173873 bytes docs/authors.html | 9 +- docs/index.html | 8 +- docs/pkgdown.yml | 4 +- docs/reference/Min.html | 5 +- docs/reference/as_epi_df.html | 253 +++++++++++++++ docs/reference/epi_archive.html | 294 
++++++++++++++++++ docs/reference/epi_cor.html | 5 +- docs/reference/epi_detect_outlr.html | 5 +- docs/reference/epi_detect_outlr_rm.html | 5 +- docs/reference/epi_detect_outlr_stl.html | 5 +- .../reference/{as.epi_df.html => epi_df.html} | 143 ++++----- docs/reference/epi_slide.html | 11 +- docs/reference/epiprocess.html | 5 +- docs/reference/estimate_deriv.html | 5 +- docs/reference/group_by.epi_df.html | 5 +- docs/reference/index.html | 19 +- docs/reference/pct_change.html | 5 +- docs/reference/pipe.html | 5 +- docs/reference/print.epi_df.html | 5 +- docs/reference/summary.epi_df.html | 5 +- man/as.epi_df.Rd | 141 --------- man/as_epi_df.Rd | 69 ++++ man/epi_archive.Rd | 24 +- man/epi_df.Rd | 125 ++++++++ man/epi_slide.Rd | 2 +- vignettes/{issues.Rmd => archive.Rmd} | 4 +- vignettes/correlation.Rmd | 2 +- vignettes/derivative.Rmd | 2 +- vignettes/epiprocess.Rmd | 59 ++-- vignettes/outliers.Rmd | 2 +- vignettes/pct_change.Rmd | 2 +- vignettes/slide.Rmd | 4 +- 56 files changed, 1374 insertions(+), 713 deletions(-) create mode 100644 docs/articles/archive.html rename docs/articles/{issues_files => archive_files}/header-attrs-2.10/header-attrs.js (100%) delete mode 100644 docs/articles/issues.html create mode 100644 docs/reference/as_epi_df.html create mode 100644 docs/reference/epi_archive.html rename docs/reference/{as.epi_df.html => epi_df.html} (63%) delete mode 100644 man/as.epi_df.Rd create mode 100644 man/as_epi_df.Rd create mode 100644 man/epi_df.Rd rename vignettes/{issues.Rmd => archive.Rmd} (98%) diff --git a/DESCRIPTION b/DESCRIPTION index 3b0aeae70..e94548b91 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -46,3 +46,14 @@ Suggests: delphi.epidata Remotes: github:cmu-delphi/delphi-epidata-r +Collate: + 'correlation.R' + 'derivative.R' + 'epi_archive.R' + 'epi_df.R' + 'epiprocess.R' + 'outliers.R' + 'pct_change.R' + 'slide.R' + 'utils.R' + 'utils_pipe.R' diff --git a/NAMESPACE b/NAMESPACE index 39e6b2b7b..065538510 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ 
-1,8 +1,8 @@ # Generated by roxygen2: do not edit by hand -S3method(as.epi_df,data.frame) -S3method(as.epi_df,epi_df) -S3method(as.epi_df,tibble) +S3method(as_epi_df,data.frame) +S3method(as_epi_df,epi_df) +S3method(as_epi_df,tibble) S3method(epi_slide,epi_archive) S3method(epi_slide,epi_df) S3method(group_by,epi_df) @@ -18,12 +18,13 @@ export(Median) export(Min) export(Start) export(Sum) -export(as.epi_df) +export(as_epi_df) export(epi_archive) export(epi_cor) export(epi_detect_outlr) export(epi_detect_outlr_rm) export(epi_detect_outlr_stl) +export(epi_df) export(epi_slide) export(estimate_deriv) export(pct_change) diff --git a/R/epi_archive.R b/R/epi_archive.R index fd482562d..45447e222 100644 --- a/R/epi_archive.R +++ b/R/epi_archive.R @@ -1,33 +1,35 @@ -## We use special features of data.table's `[`. The data.table package has a -## compatibility feature that disables some/all of these features if it thinks -## we might expect `data.frame`-compatible behavior instead. We can signal that -## we want the special behavior via `.datatable.aware = TRUE` or by importing -## any `data.table` package member. Do both to prevent surprises if we decide to -## use `data.table::` everywhere and not importing things. +# We use special features of data.table's `[`. The data.table package has a +# compatibility feature that disables some/all of these features if it thinks we +# might expect `data.frame`-compatible behavior instead. We can signal that we +# want the special behavior via `.datatable.aware = TRUE` or by importing any +# `data.table` package member. Do both to prevent surprises if we decide to use +# `data.table::` everywhere and not importing things. .datatable.aware = TRUE #' Archive (data version history) for an `epi_df` object #' #' Contains version history for an `epi_df` object, and enables fast querying of -#' snapshots of the `epi_df` object as of certain "issues" (versions). 
Version -#' history can be input as a data frame combining full snapshots of the `epi_df` -#' as of several issue times, or using only the newly added or revised rows for -#' each issue, or using some combination of these two (including "updates" for -#' things that didn't actually change). Last-observation-carried-forward (LOCF) -#' is used to data in between recorded updates. Currently, deletions must be -#' represented as revising a row to a special state (e.g., making the entries -#' `NA` or including a special column that flags the data as removed and -#' performing post-processing), and the archive is unaware of what this state -#' is. See the [issues -#' vignette](https://cmu-delphi.github.io/epiprocess/articles/issues.html) for -#' examples. +#' snapshots of the `epi_df` object as of certain "issues" (versions). See the +#' [data versioning +#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for +#' examples. +#' +#' @details Version history can be input as a data frame combining full +#' snapshots of the `epi_df` as of several issue times, or using only the +#' newly added or revised rows for each issue, or using some combination of +#' these two (including "updates" for things that didn't actually +#' change). Last-observation-carried-forward (LOCF) is used to data in between +#' recorded updates. Currently, deletions must be represented as revising a +#' row to a special state (e.g., making the entries `NA` or including a +#' special column that flags the data as removed and performing +#' post-processing), and the archive is unaware of what this state is. #' #' @importFrom assertthat assert_that -#' @importFrom rlang is_scalar_character is_named is_character is_scalar_atomic !! warn -#' @importFrom tibble as_tibble -#' @importFrom dplyr rename filter #' @importFrom data.table as.data.table +#' @importFrom dplyr filter rename #' @importFrom pipeR %>>% +#' @importFrom rlang !! 
is_named is_character is_scalar_atomic is_scalar_character warn +#' @importFrom tibble as_tibble #' @export epi_archive = R6::R6Class("epi_archive", @@ -40,9 +42,9 @@ epi_archive = other.key.colnames = NULL ), public = list( - #' @description - #' Create a new \code{epi_archive} with the given update data. - #' @param update.df the update data +#' @description +#' Create a new \code{epi_archive} with the given update data +#' @param update.df the update data #' @param issue.colname name of the column with the issue time of the corresponding updates; operations such as \code{sort}, \code{<=}, and \code{max} must make sense on this column, with earlier issues "less than" later issues #' @param geo.colname the name of the column that will become \code{geo_value} in the \code{epi_df} #' @param time.colname the name of the column that will become \code{time_value} in the \code{epi_df} @@ -106,7 +108,7 @@ epi_archive = geo_value = !!private[["geo.colname"]], time_value = !!private[["time.colname"]], ) %>>% - as.epi_df(issue = issue, + as_epi_df(issue = issue, additional_metadata = list(other.key.colnames = private[["other.key.colnames"]])) %>>% return() }, diff --git a/R/epi_df.R b/R/epi_df.R index ad0ca4f57..0b6125239 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -1,25 +1,27 @@ -#' Convert data to `epi_df` format +#' Create `epi_df` object #' -#' Converts a data frame or tibble into a format that is consistent with the -#' `epi_df` class, ensuring that it has a certain minimal set of columns, and -#' that it has certain minimal metadata. +#' Creates an `epi_df` object from given `geo_value` and `time_value` variables, +#' and any additional number of variables. #' -#' @param x The object to be converted. See the methods section below for -#' details on formatting of each input type. -#' @param geo_type The type for the geo values. If missing, then the function -#' will attempt to infer it from the geo values present; if this fails, then -#' it will be set to "custom". 
-#' @param time_type The type for the time values. If missing, then the function -#' will attempt to infer it from the time values present; if this fails, then -#' it will be set to "custom". -#' @param issue Issue to use for this data. If missing, then the function will -#' attempt to infer it from the passed object `x`; if this fails, then the -#' current day-time will be used. +#' @param geo_value Geographic values associated with the measurements. +#' @param time_value Time values associated with the measurements. +#' @param ... Additional arguments of the form `value` or `name = value`, which +#' specify any number of additional columns for the `epi_df` object. +#' @param geo_type Type for the geo values. If missing, then the function will +#' attempt to infer it from the geo values present; if this fails, then it +#' will be set to "custom". +#' @param time_type Type for the time values. If missing, then the function will +#' attempt to infer it from the time values present; if this fails, then it +#' will be set to "custom". +#' @param as_of Time value representing the time at which the given data were +#' available. For example, if `as_of` were January 31, 2022, then the `epi_df` +#' object that is created would represent the most up-to-date version of the +#' data available as of January 31, 2022. If the `as_of` argument is missing, +#' then the current day-time will be used. #' @param additional_metadata List of additional metadata to attach to the -#' `epi_df` object. All objects will have `time_type`, `geo_type`, and `issue` -#' fields; named entries from the passed list or will be included as well. -#' @param ... Additional arguments passed to methods. -#' @return An `epi_df` object. +#' `epi_df` object. The metadata will have `time_type`, `geo_type`, and +#' `as_of` fields; named entries from the passed list or will be included as +#' well. 
#' #' @details An `epi_df` is a tibble with (at least) the following columns: #' @@ -32,19 +34,25 @@ #' #' * `geo_type`: the type for the geo values. #' * `time_type`: the type for the time values. -#' * `issue`: the time value at which the given data set was issued. -#' -#' The first two fields above, `geo_type` and `time_type`, can usually be -#' inferred from the `geo_value` and `time_value` columns, respectively. The -#' last field above, `issue`, is the most unique to the `epi_df` format. In a -#' typical case, this represents the maximum of the issues of individual -#' signal values measured in the data set; hence we would also say that the -#' data set is comprised of all signal values observed "as of" the given issue -#' in the metadata. +#' * `as_of`: the time value at which the given data were available. #' #' Metadata for an `epi_df` object `x` can be accessed (and altered) via -#' `attributes(x)$metadata`. More information on geo types, time types, and -#' issues is given below. +#' `attributes(x)$metadata`. The first two fields in the above list, +#' `geo_type` and `time_type`, can usually be inferred from the `geo_value` +#' and `time_value` columns, respectively. More information on their coding is +#' given below. +#' +#' The last field in the above list, `as_of`, is one of the most unique aspects +#' of an `epi_df` object. In brief, we can think of an `epi_df` object as a +#' single snapshot of a data set that contains the most up-to-date values of +#' some signals of interest, as of the time specified in the `as_of` field. A +#' companion object is the `epi_archive` object, which contains the full +#' version history of a given data set. Revisions are common in many types of +#' epidemiological data streams, and paying attention to data revisions can be +#' important for all sorts of downstream data analysis and modeling tasks. 
See +#' the `epi_archive()` help file for more details on how data versioning works +#' in the `epiprocess` package (including how to create `epi_df` objects, as +#' data snapshots, from an `epi_archive` object). #' #' @section Geo types: #' The following geo types are supported in an `epi_df`. Their geo coding @@ -64,50 +72,81 @@ #' alpha-2 country codes (lowercase). #' #' The above geo types come with aggregation utilities in the package, *todo: -#' refer to relevant functionality, vignette, and so on*. An unrecognizable -#' geo type is labeled as "custom". +#' refer to relevant functionality, vignette, and so on*. An unrecognizable +#' geo type is labeled as "custom". #' #' @section Time types: #' The following time types are supported in an `epi_df`. Their time coding #' (specification of time values for each time type) is also described below. #' -#' * `"day-time"`: each observation corresponds to a time on a given day (measured -#' to the second); coded as a `POSIXct` object, as in `as.POSIXct("2020-06-09 -#' 18:45:40")`. +#' * `"day-time"`: each observation corresponds to a time on a given day +#' (measured to the second); coded as a `POSIXct` object, as in +#' `as.POSIXct("2022-01-31 18:45:40")`. #' * `"day"`: each observation corresponds to a day; coded as a `Date` object, -#' as in `as.Date("2020-06-09")`. +#' as in `as.Date("2022-01-31")`. #' * `"week"`: each observation corresponds to a week; the alignment can be #' arbitrary (as to whether a week starts on a Monday, Tuesday, etc.; the #' U.S. CDC definition of an epidemiological week starts on a Sunday); coded -#' as a `Date` object, representing the start date of week. +#' as a `Date` object, representing the start date of week. #' -#' An unrecognisable time type is labeled as "custom". -#' -#' @section Issues: -#' todo +#' An unrecognizable time type is labeled as "custom". 
*todo: refer to vignette +#' for time aggregation examples* +#' +#' @export +epi_df = function(geo_value, time_value, ..., geo_type, time_type, as_of, + additional_metadata = list()) { + x = tibble::tibble(geo_value = geo_value, time_value = time_value, ...) + return(as_epi_df(x, geo_type, time_type, as_of, additional_metadata)) +} + +#' Convert data to `epi_df` format +#' +#' Converts a data frame or tibble into a format that is consistent with the +#' `epi_df` class, ensuring that it has a certain minimal set of columns, and +#' that it has certain minimal metadata. +#' +#' @param geo_type Type for the geo values. If missing, then the function will +#' attempt to infer it from the geo values present; if this fails, then it +#' will be set to "custom". +#' @param time_type Type for the time values. If missing, then the function will +#' attempt to infer it from the time values present; if this fails, then it +#' will be set to "custom". +#' @param as_of Time value representing the time at which the given data were +#' available. For example, if `as_of` were January 31, 2022, then the `epi_df` +#' object that is created would represent the most up-to-date version of the +#' data available as of January 31, 2022. If the `as_of` argument is missing, +#' then the function will attempt to infer it from the passed object `x`; if +#' this fails, then the current day-time will be used. +#' @param additional_metadata List of additional metadata to attach to the +#' `epi_df` object. The metadata will have `time_type`, `geo_type`, and +#' `as_of` fields; named entries from the passed list will be included as +#' well. +#' @param ... Additional arguments passed to methods. +#' @return An `epi_df` object. #' #' @export -as.epi_df = function(x, ...) { - UseMethod("as.epi_df") +as_epi_df = function(x, ...) { + UseMethod("as_epi_df") } -#' @method as.epi_df epi_df -#' @describeIn as.epi_df Simply returns the `epi_df` object unchanged. 
+#' @method as_epi_df epi_df +#' @describeIn as_epi_df Simply returns the `epi_df` object unchanged. #' @export -as.epi_df.epi_df = function(x, ...) { +as_epi_df.epi_df = function(x, ...) { return(x) } -#' @method as.epi_df tibble -#' @describeIn as.epi_df The input tibble `x` must contain the columns +#' @method as_epi_df tibble +#' @describeIn as_epi_df The input tibble `x` must contain the columns #' `geo_value` and `time_value`. All other columns will be preserved as is, -#' and treated as measured variables. If `issue` is missing, then the function -#' will look for `issue` as a column of `x`, or as a field in its metadata -#' (stored in its attributes), to infer the issue; if this fails, then the -#' current day-time will be used. +#' and treated as measured variables. If `as_of` is missing, then the function +#' will try to guess it from an `as_of`, `issue`, or `version` column of `x` +#' (if any of these are present), or from an `as_of` field in its metadata +#' (stored in its attributes); if this fails, then the current day-time will +#' be used. #' @importFrom rlang .data abort #' @export -as.epi_df.tibble = function(x, geo_type, time_type, issue, +as_epi_df.tibble = function(x, geo_type, time_type, as_of, additional_metadata = list(), ...) 
{ # Check that we have geo_value and time_value columns if (!("geo_value" %in% names(x))) { @@ -176,25 +215,27 @@ as.epi_df.tibble = function(x, geo_type, time_type, issue, else time_type = "custom" } - # If issue is missing, then try to guess it - if (missing(issue)) { - # First check for a column, and take the maximum of issues - if ("issue" %in% names(x)) issue = max(x$issue) - - # Next, check the metadata - else if ("issue" %in% names(attributes(x$metadata))) { - issue = attributes(x)$metadata$issue + # If as_of is missing, then try to guess it + if (missing(as_of)) { + # First check the metadata for an as_of field + if ("as_of" %in% names(attributes(x$metadata))) { + as_of = attributes(x)$metadata$as_of } + + # Next check for as_of, issue, or version columns + else if ("as_of" %in% names(x)) as_of = max(x$as_of) + else if ("issue" %in% names(x)) as_of = max(x$issue) + else if ("version" %in% names(x)) as_of = max(x$version) # If we got here then we failed - else issue = Sys.time() # Use the current day-time + else as_of = Sys.time() # Use the current day-time } # Define metadata fields metadata = list() metadata$geo_type = geo_type metadata$time_type = time_type - metadata$issue = issue + metadata$as_of = as_of metadata = c(metadata, additional_metadata) # Convert to a tibble, apply epi_df class, attach metadata @@ -207,15 +248,16 @@ as.epi_df.tibble = function(x, geo_type, time_type, issue, return(x) } -#' @method as.epi_df data.frame -#' @describeIn as.epi_df The input data frame `x` must contain the columns +#' @method as_epi_df data.frame +#' @describeIn as_epi_df The input data frame `x` must contain the columns #' `geo_value` and `time_value`. All other columns will be preserved as is, -#' and treated as measured variables. If `issue` is missing, then the function -#' will look for `issue` as a column of `x`, or as a field in its metadata -#' (stored in its attributes), to infer the issue; if this fails, then the -#' current day-time will be used. 
+#' and treated as measured variables. If `as_of` is missing, then the function +#' will try to guess it from an `as_of`, `issue`, or `version` column of `x` +#' (if any of these are present), or from an `as_of` field in its metadata +#' (stored in its attributes); if this fails, then the current day-time will +#' be used. #' @export -as.epi_df.data.frame = as.epi_df.tibble +as_epi_df.data.frame = as_epi_df.tibble #' Print `epi_df` object #' @@ -233,7 +275,7 @@ print.epi_df = function(x, ...) { cat("An `epi_df` object, with metadata:\n") cat(sprintf("* %-10s= %s\n", "geo_type", attributes(x)$metadata$geo_type)) cat(sprintf("* %-10s= %s\n", "time_type", attributes(x)$metadata$time_type)) - cat(sprintf("* %-10s= %s\n", "issue", attributes(x)$metadata$issue)) + cat(sprintf("* %-10s= %s\n", "as_of", attributes(x)$metadata$as_of)) cat("\n") NextMethod() } @@ -263,7 +305,7 @@ summary.epi_df = function(object, ...) { cat("An `epi_df` object, with metadata:\n") cat(sprintf("* %-10s= %s\n", "geo_type", attributes(x)$metadata$geo_type)) cat(sprintf("* %-10s= %s\n", "time_type", attributes(x)$metadata$time_type)) - cat(sprintf("* %-10s= %s\n", "issue", attributes(x)$metadata$issue)) + cat(sprintf("* %-10s= %s\n", "as_of", attributes(x)$metadata$as_of)) cat("\nSummary of space-time coverge:\n") cat(sprintf("* %-33s= %s\n", "earliest time value", min(object$time_value))) cat(sprintf("* %-33s= %s\n", "latest time value", max(object$time_value))) diff --git a/R/slide.R b/R/slide.R index 602728312..279f73b14 100644 --- a/R/slide.R +++ b/R/slide.R @@ -2,8 +2,8 @@ #' #' Slides a given function over variables in an `epi_df` object. See the [slide #' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for -#' examples. Also applies to an `epi_archive` object; see the [issues -#' vignette](https://cmu-delphi.github.io/epiprocess/articles/issues.html) for +#' examples. 
Also applies to an `epi_archive` object; see the [data versioning +#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for #' examples. #' #' @details To "slide" means to apply a function or formula over a running diff --git a/docs/404.html b/docs/404.html index 45db31ccb..c031ea8e5 100644 --- a/docs/404.html +++ b/docs/404.html @@ -114,9 +114,6 @@
  • 6. Detect and correct outliers in signals
  • -
  • - 7. Work with issue dates and archive objects -
  • @@ -159,7 +156,7 @@

    Contents