diff --git a/DESCRIPTION b/DESCRIPTION index 533d08b15..a38bcf52c 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -4,6 +4,9 @@ Title: Tools for basic signal processing in epidemiology Version: 1.0.0 Authors@R: c( + person(given = "Logan", + family = "Brooks", + role = "aut"), person(given = "Daniel", family = "McDonald", role = "ctb"), @@ -23,15 +26,22 @@ LazyData: true Roxygen: list(markdown = TRUE) RoxygenNote: 7.1.2 Imports: + data.table, dplyr, fabletools, feasts, genlasso, lubridate, magrittr, + purrr, + R6, rlang, slider, tibble, tidyselect, tidyr, tsibble +Suggests: + delphi.epidata +Remotes: + github:cmu-delphi/delphi-epidata-r diff --git a/NAMESPACE b/NAMESPACE index c11a29577..b3bf96515 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,22 +1,28 @@ # Generated by roxygen2: do not edit by hand -S3method(as.epi_df,data.frame) -S3method(as.epi_df,epi_df) -S3method(as.epi_df,tibble) +S3method(arrange,epi_df) +S3method(as_epi_df,data.frame) +S3method(as_epi_df,epi_df) +S3method(as_epi_df,tbl_df) +S3method(as_tsibble,epi_df) S3method(group_by,epi_df) +S3method(group_modify,epi_df) S3method(head,epi_df) +S3method(mutate,epi_df) S3method(print,epi_df) S3method(summary,epi_df) S3method(ungroup,epi_df) +S3method(unnest,epi_df) export("%>%") export(End) export(Max) export(Mean) export(Median) +export(Middle) export(Min) export(Start) export(Sum) -export(as.epi_df) +export(as_epi_df) export(epi_cor) export(epi_detect_outlr) export(epi_detect_outlr_rm) @@ -45,6 +51,7 @@ importFrom(lubridate,weeks) importFrom(magrittr,"%>%") importFrom(purrr,map) importFrom(purrr,pmap_dfc) +importFrom(rlang,"!!") importFrom(rlang,.data) importFrom(rlang,abort) importFrom(rlang,enquo) @@ -55,6 +62,8 @@ importFrom(stats,lsfit) importFrom(stats,median) importFrom(stats,predict) importFrom(stats,smooth.spline) +importFrom(tidyr,unnest) importFrom(tidyselect,all_of) importFrom(tidyselect,ends_with) +importFrom(tsibble,as_tsibble) importFrom(utils,head) diff --git a/R/correlation.R b/R/correlation.R 
index 2864dff1a..948fe28da 100644 --- a/R/correlation.R +++ b/R/correlation.R @@ -13,35 +13,35 @@ #' value and positive shifts into a lead value; for example, if `dt = -1`, #' then the new value on June 2 is the original value on June 1; if `dt = 1`, #' then the new value on June 2 is the original value on June 3; if `dt = 0`, -#' then the values are left as is. Default is 0 for both `dt1` and `dt2`. Note -#' that the time shifts are always performed *per geo value*; see details. -#' @param by The variable(s) to group by, for the correlation computation. If -#' `geo_value`, the default, then correlations are computed for each geo -#' value, over all time; if `time_value`, then correlations are computed for -#' each time, over all geo values. A grouping can also be any specified using -#' number of columns of `x`; for example, we can use `by = c(geo_value, -#' age_group)`, assuming `x` has a column `age_group`, in order to compute -#' correlations for each pair of geo value and age group. To omit a grouping -#' entirely, use `by = NULL`. Note that the grouping here is always applied -#' *after* the time shifts; see details. +#' then the values are left as is. Default is 0 for both `dt1` and `dt2`. +#' @param shift_by The variable(s) to group by, for the time shifts. The +#' default is `geo_value`. However, we could also use, for example, `shift_by +#' = c(geo_value, age_group)`, assuming `x` has a column `age_group`, to +#' perform time shifts per geo value and age group. To omit a grouping +#' entirely, use `shift_by = NULL`. Note that the grouping here is always undone +#' *before* the correlation computations. +#' @param cor_by The variable(s) to group by, for the correlation +#' computations. If `geo_value`, the default, then correlations are computed +#' for each geo value, over all time; if `time_value`, then correlations are +#' computed for each time, over all geo values. 
A grouping can also be +#' specified using any number of columns of `x`; for example, we can use `cor_by = +#' c(geo_value, age_group)`, assuming `x` has a column `age_group`, in order +#' to compute correlations for each pair of geo value and age group. To omit a +#' grouping entirely, use `cor_by = NULL`. Note that the grouping here is +#' always done *after* the time shifts. #' @param use,method Arguments to pass to `cor()`, with "na.or.complete" the #' default for `use` (different than `cor()`) and "pearson" the default for #' `method` (same as `cor()`). #' #' @return An tibble with the grouping columns first (`geo_value`, `time_value`, -#' or possibly others), and then a column `cor`, which gives the correlation. -#' -#' @details Time shifts are always performed first, grouped by geo value (this -#' way they amount to shifting each individual time series). After this, the -#' geo grouping is removed, and the grouping specified in the `by` argument is -#' applied. Then, correlations are computed. +#' or possibly others), and then a column `cor`, which gives the correlation. #' #' @importFrom dplyr arrange mutate summarize #' @importFrom stats cor -#' @importFrom rlang .data enquo +#' @importFrom rlang .data !! enquo #' @export -epi_cor = function(x, var1, var2, dt1 = 0, dt2 = 0, by = geo_value, - use = "na.or.complete", +epi_cor = function(x, var1, var2, dt1 = 0, dt2 = 0, shift_by = geo_value, + cor_by = geo_value, use = "na.or.complete", method = c("pearson", "kendall", "spearman")) { # Check we have an `epi_df` object if (!inherits(x, "epi_df")) abort("`x` must be of class `epi_df`.") @@ -52,18 +52,19 @@ epi_cor = function(x, var1, var2, dt1 = 0, dt2 = 0, by = geo_value, var1 = enquo(var1) var2 = enquo(var2) - # What is the grouping? Which method? 
+ shift_by = enquo(shift_by) + cor_by = enquo(cor_by) method = match.arg(method) # Perform time shifts, then compute appropriate correlations and return return(x %>% - group_by(.data$geo_value) %>% + group_by(!!shift_by) %>% arrange(.data$time_value) %>% mutate(var1 = shift(!!var1, n = dt1), var2 = shift(!!var2, n = dt2)) %>% ungroup() %>% - group_by(!!by) %>% + group_by(!!cor_by) %>% summarize(cor = cor(x = .data$var1, y = .data$var2, use = use, method = method))) } diff --git a/R/epi_df.R b/R/epi_df.R index 846cb9188..e565af571 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -1,52 +1,44 @@ -#' Convert data to `epi_df` format -#' -#' Converts a data frame or tibble into a format that is consistent with the -#' `epi_df` class, ensuring that it has a certain minimal set of columns, and -#' that it has certain minimal metadata. -#' -#' @param x The object to be converted. See the methods section below for -#' details on formatting of each input type. -#' @param geo_type The type for the geo values. If missing, then the function -#' will attempt to infer it from the geo values present; if this fails, then -#' it will be set to "custom". -#' @param time_type The type for the time values. If missing, then the function -#' will attempt to infer it from the time values present; if this fails, then -#' it will be set to "custom". -#' @param issue Issue to use for this data. If missing, then the function will -#' attempt to infer it from the passed object `x`; if this fails, then the -#' current day-time will be used. -#' @param additional_metadata List of additional metadata to attach to the -#' `epi_df` object. All objects will have `time_type`, `geo_type`, and `issue` -#' fields; named entries from the passed list or will be included as well. -#' @param ... Additional arguments passed to methods. -#' @return An `epi_df` object. +#' @title `epi_df` object #' +#' @description An `epi_df` is a tibble with certain minimal column structure +#' and metadata. 
It can be seen as a snapshot of a data set that contains the +#' most up-to-date values of some signal variables of interest, as of a given +#' time. +#' #' @details An `epi_df` is a tibble with (at least) the following columns: #' -#' * `geo_value`: the geographic value associated with each measurement. -#' * `time_value`: the time value associated with each measurement. +#' * `geo_value`: the geographic value associated with each row of measurements. +#' * `time_value`: the time value associated with each row of measurements. #' -#' Other columns can be considered as measured variables, which we also broadly -#' refer to as signal variables. An `epi_df` object also has metadata with (at -#' least) the following fields: +#' Other columns can be considered as measured variables, which we also refer to +#' as signal variables. An `epi_df` object also has metadata with (at least) +#' the following fields: #' #' * `geo_type`: the type for the geo values. #' * `time_type`: the type for the time values. -#' * `issue`: the time value at which the given data set was issued. -#' -#' The first two fields above, `geo_type` and `time_type`, can usually be -#' inferred from the `geo_value` and `time_value` columns, respectively. The -#' last field above, `issue`, is the most unique to the `epi_df` format. In a -#' typical case, this represents the maximum of the issues of individual -#' signal values measured in the data set; hence we would also say that the -#' data set is comprised of all signal values observed "as of" the given issue -#' in the metadata. +#' * `as_of`: the time value at which the given data were available. #' #' Metadata for an `epi_df` object `x` can be accessed (and altered) via -#' `attributes(x)$metadata`. More information on geo types, time types, and -#' issues is given below. +#' `attributes(x)$metadata`. 
The first two fields in the above list, +#' `geo_type` and `time_type`, can usually be inferred from the `geo_value` +#' and `time_value` columns, respectively. More information on their coding is +#' given below. #' -#' @section Geo types: +#' The last field in the above list, `as_of`, is one of the most unique aspects +#' of an `epi_df` object. In brief, we can think of an `epi_df` object as a +#' single snapshot of a data set that contains the most up-to-date values of +#' the signals variables, as of the time specified in the `as_of` field. +#' +#' A companion object is the `epi_archive` object, which contains the full +#' version history of a given data set. Revisions are common in many types of +#' epidemiological data streams, and paying attention to data revisions can be +#' important for all sorts of downstream data analysis and modeling tasks. See +#' the documentation for [`epi_archive`][epi_archive] for more details on how +#' data versioning works in the `epiprocess` package (including how to +#' generate `epi_df` objects, as data snapshots, from an `epi_archive` +#' object). +#' +#' @section Geo Types: #' The following geo types are supported in an `epi_df`. Their geo coding #' (specification of geo values for each geo type) is also described below. #' @@ -64,50 +56,79 @@ #' alpha-2 country codes (lowercase). #' #' The above geo types come with aggregation utilities in the package, *todo: -#' refer to relevant functionality, vignette, and so on*. An unrecognizable -#' geo type is labeled as "custom". +#' refer to relevant functionality, vignette, and so on*. An unrecognizable +#' geo type is labeled as "custom". #' -#' @section Time types: +#' @section Time Types: #' The following time types are supported in an `epi_df`. Their time coding #' (specification of time values for each time type) is also described below. 
#' -#' * `"day-time"`: each observation corresponds to a time on a given day (measured -#' to the second); coded as a `POSIXct` object, as in `as.POSIXct("2020-06-09 -#' 18:45:40")`. +#' * `"day-time"`: each observation corresponds to a time on a given day +#' (measured to the second); coded as a `POSIXct` object, as in +#' `as.POSIXct("2022-01-31 18:45:40")`. #' * `"day"`: each observation corresponds to a day; coded as a `Date` object, -#' as in `as.Date("2020-06-09")`. +#' as in `as.Date("2022-01-31")`. #' * `"week"`: each observation corresponds to a week; the alignment can be #' arbitrary (as to whether a week starts on a Monday, Tuesday, etc.; the #' U.S. CDC definition of an epidemiological week starts on a Sunday); coded -#' as a `Date` object, representing the start date of week. +#' as a `Date` object, representing the start date of week. #' -#' An unrecognisable time type is labeled as "custom". -#' -#' @section Issues: -#' todo +#' An unrecognizable time type is labeled as "custom". *todo: refer to vignette +#' for time aggregation examples* +#' +#' @seealso [as_epi_df()] for converting to `epi_df` format +#' @name epi_df +NULL + +#' Convert to `epi_df` format +#' +#' Converts a data frame or tibble into an `epi_df` object. See the [getting +#' started +#' guide](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html) for +#' examples. +#' +#' @param geo_type Type for the geo values. If missing, then the function will +#' attempt to infer it from the geo values present; if this fails, then it +#' will be set to "custom". +#' @param time_type Type for the time values. If missing, then the function will +#' attempt to infer it from the time values present; if this fails, then it +#' will be set to "custom". +#' @param as_of Time value representing the time at which the given data were +#' available. 
For example, if `as_of` is January 31, 2022, then the `epi_df` +#' object that is created would represent the most up-to-date version of the +#' data available as of January 31, 2022. If the `as_of` argument is missing, +#' then the current day-time will be used. +#' @param additional_metadata List of additional metadata to attach to the +#' `epi_df` object. The metadata will have `geo_type`, `time_type`, and +#' `as_of` fields; named entries from the passed list will be included as +#' well. +#' @param ... Additional arguments passed to methods. +#' @return An `epi_df` object. #' +#' @seealso [`epi_df`][epi_df] for more details on the `epi_df` format #' @export -as.epi_df = function(x, ...) { - UseMethod("as.epi_df") +as_epi_df = function(x, ...) { + UseMethod("as_epi_df") } -#' @method as.epi_df epi_df -#' @describeIn as.epi_df Simply returns the `epi_df` object unchanged. +#' @method as_epi_df epi_df +#' @describeIn as_epi_df Simply returns the `epi_df` object unchanged. #' @export -as.epi_df.epi_df = function(x, ...) { +as_epi_df.epi_df = function(x, ...) { return(x) } -#' @method as.epi_df tibble -#' @describeIn as.epi_df The input tibble `x` must contain the columns +#' @method as_epi_df tbl_df +#' @describeIn as_epi_df The input tibble `x` must contain the columns #' `geo_value` and `time_value`. All other columns will be preserved as is, -#' and treated as measured variables. If `issue` is missing, then the function -#' will look for `issue` as a column of `x`, or as a field in its metadata -#' (stored in its attributes), to infer the issue; if this fails, then the -#' current day-time will be used. +#' and treated as measured variables. If `as_of` is missing, then the function +#' will try to guess it from an `as_of`, `issue`, or `version` column of `x` +#' (if any of these are present), or from an `as_of` field in its metadata +#' (stored in its attributes); if this fails, then the current day-time will +#' be used. 
#' @importFrom rlang .data abort #' @export -as.epi_df.tibble = function(x, geo_type, time_type, issue, +as_epi_df.tbl_df = function(x, geo_type, time_type, as_of, additional_metadata = list(), ...) { # Check that we have geo_value and time_value columns if (!("geo_value" %in% names(x))) { @@ -119,103 +140,55 @@ as.epi_df.tibble = function(x, geo_type, time_type, issue, # If geo type is missing, then try to guess it if (missing(geo_type)) { - if (is.character(x$geo_value)) { - # Convert geo values to lowercase - x$geo_value = tolower(x$geo_value) - - # If all geo values are state abbreviations, then use "state" - state_values = c(tolower(state.abb), "as", "dc", "gu", "mp", "pr", "vi") - if (all(x$geo_value %in% state_values)) geo_type = "state" - - # Else if all geo values are 2 letters, then use "nation" - else if (all(grepl("[a-z]{2}", x$geo_value))) geo_type = "nation" - - # Else if all geo values are 5 numbers, then use "county" - else if (all(grepl("[0-9]{5}", x$geo_value))) geo_type = "county" - } - - else if (is.numeric(x$geo_value)) { - # Convert geo values to integers - x$geo_value = as.integer(x$geo_value) - - # If the max geo value is at most 10, then use "hhs" - if (max(x$geo_value) <= 10) geo_type = "hhs" - - # Else if the max geo value is at most 457, then use "hrr" - if (max(x$geo_value) <= 457) geo_type = "hrr" - } - - # If we got here then we failed - else geo_type = "custom" + geo_type = guess_geo_type(x$geo_value) } # If time type is missing, then try to guess it if (missing(time_type)) { - # Convert character time values to Date or POSIXct - if (is.character(x$time_value)) { - if (nchar(x$time_value[1]) <= "10") { - new_time_value = tryCatch({ as.Date(x$time_value) }, - error = function(e) NULL) - } - else { - new_time_value = tryCatch({ as.POSIXct(x$time_value) }, - error = function(e) NULL) - } - if (!is.null(new_time_value)) x$time_value = new_time_value - } - - # Now, if a POSIXct class, then use "day-time" - if (inherits(x$time_value, 
"POSIXct")) time_type = "day-time" - - # Else, if a Date class, then use "week" or "day" depending on gaps - else if (inherits(x$time_value, "Date")) { - time_type = ifelse(all(diff(sort(x$time_value)) == -7), "week", "day") - } - - # If we got here then we failed - else time_type = "custom" + time_type = guess_time_type(x$time_value) } - # If issue is missing, then try to guess it - if (missing(issue)) { - # First check for a column, and take the maximum of issues - if ("issue" %in% names(x)) issue = max(x$issue) - - # Next, check the metadata - else if ("issue" %in% names(attributes(x$metadata))) { - issue = attributes(x)$metadata$issue + # If as_of is missing, then try to guess it + if (missing(as_of)) { + # First check the metadata for an as_of field + if ("metadata" %in% names(attributes(x)) && + "as_of" %in% names(attributes(x)$metadata)) { + as_of = attributes(x)$metadata$as_of } + + # Next check for as_of, issue, or version columns + else if ("as_of" %in% names(x)) as_of = max(x$as_of) + else if ("issue" %in% names(x)) as_of = max(x$issue) + else if ("version" %in% names(x)) as_of = max(x$version) # If we got here then we failed - else issue = Sys.time() # Use the current day-time + else as_of = Sys.time() # Use the current day-time } # Define metadata fields metadata = list() metadata$geo_type = geo_type metadata$time_type = time_type - metadata$issue = issue + metadata$as_of = as_of metadata = c(metadata, additional_metadata) - # Convert to a tibble, apply epi_df class, attach metadata - if (!inherits(x, "tibble")) x = tibble::as_tibble(x) + # Reorder columns (geo_value, time_value, ...) + x = dplyr::relocate(x, .data$geo_value, .data$time_value) + + # Apply epi_df class, attach metadata, and return class(x) = c("epi_df", class(x)) attributes(x)$metadata = metadata - - # Reorder columns (geo_value, time_value, ...) 
and return - x = dplyr::relocate(x, .data$geo_value, .data$time_value) return(x) } -#' @method as.epi_df data.frame -#' @describeIn as.epi_df The input data frame `x` must contain the columns -#' `geo_value` and `time_value`. All other columns will be preserved as is, -#' and treated as measured variables. If `issue` is missing, then the function -#' will look for `issue` as a column of `x`, or as a field in its metadata -#' (stored in its attributes), to infer the issue; if this fails, then the -#' current day-time will be used. +#' @method as_epi_df data.frame +#' @describeIn as_epi_df Works analogously to `as_epi_df.tbl_df()`. #' @export -as.epi_df.data.frame = as.epi_df.tibble +as_epi_df.data.frame = function(x, geo_type, time_type, as_of, + additional_metadata = list(), ...) { + as_epi_df.tbl_df(tibble::as_tibble(x), geo_type, time_type, as_of, + additional_metadata, ...) +} #' Print `epi_df` object #' @@ -223,9 +196,7 @@ as.epi_df.data.frame = as.epi_df.tibble #' tibble. #' #' @param x The `epi_df` object. -#' @param ... Additional arguments passed to `print.tibble()` to print the -#' data. -#' @return The `epi_df` object, unchanged. +#' @param ... Additional arguments passed to methods. #' #' @method print epi_df #' @export @@ -233,7 +204,7 @@ print.epi_df = function(x, ...) { cat("An `epi_df` object, with metadata:\n") cat(sprintf("* %-10s= %s\n", "geo_type", attributes(x)$metadata$geo_type)) cat(sprintf("* %-10s= %s\n", "time_type", attributes(x)$metadata$time_type)) - cat(sprintf("* %-10s= %s\n", "issue", attributes(x)$metadata$issue)) + cat(sprintf("* %-10s= %s\n", "as_of", attributes(x)$metadata$as_of)) cat("\n") NextMethod() } @@ -242,7 +213,7 @@ print.epi_df = function(x, ...) { #' @importFrom utils head #' @export head.epi_df = function(x, ...) { - head(tibble::as_tibble(x), ...) + head(tibble::as_tibble(x)) } #' Summarize `epi_df` object @@ -253,17 +224,17 @@ head.epi_df = function(x, ...) { #' @param object The `epi_df` object. #' @param ... 
Additional arguments, for compatibility with `summary()`. #' Currently unused. -#' @return No return value; called only to print summary statistics. #' #' @method summary epi_df +#' @importFrom rlang .data #' @importFrom stats median #' @export summary.epi_df = function(object, ...) { cat("An `epi_df` object, with metadata:\n") cat(sprintf("* %-10s= %s\n", "geo_type", attributes(x)$metadata$geo_type)) cat(sprintf("* %-10s= %s\n", "time_type", attributes(x)$metadata$time_type)) - cat(sprintf("* %-10s= %s\n", "issue", attributes(x)$metadata$issue)) - cat("\nSummary of space-time coverge:\n") + cat(sprintf("* %-10s= %s\n", "as_of", attributes(x)$metadata$as_of)) + cat("\nSummary of space-time coverage:\n") cat(sprintf("* %-33s= %s\n", "earliest time value", min(object$time_value))) cat(sprintf("* %-33s= %s\n", "latest time value", max(object$time_value))) cat(sprintf("* %-33s= %i\n", "median geo values per time value", @@ -272,29 +243,16 @@ summary.epi_df = function(object, ...) { dplyr::summarize(median(.data$num))))) } -#' Group or ungroup `epi_df` object -#' -#' Groups or ungroups an `epi_df`, preserving class and attributes. +#' Convert to tsibble object +#' +#' Converts an `epi_df` object into a tsibble, where the index is taken to be +#' `time_value`, and the key variables taken to be `geo_value` along with any +#' others in the `other_keys` field of the metadata, or else explicitly set. #' -#' @method group_by epi_df -#' @importFrom dplyr group_by -#' @export -group_by.epi_df = function(x, ...) { - metadata = attributes(x)$metadata - x = NextMethod() - class(x) = c("epi_df", class(x)) - attributes(x)$metadata = metadata - return(x) -} - -#' @method ungroup epi_df -#' @rdname group_by.epi_df -#' @importFrom dplyr ungroup +#' @importFrom tsibble as_tsibble +#' @method as_tsibble epi_df #' @export -ungroup.epi_df = function(x, ...) 
{ - metadata = attributes(x)$metadata - x = NextMethod() - class(x) = c("epi_df", class(x)) - attributes(x)$metadata = metadata - return(x) +as_tsibble.epi_df = function(x, key, ...) { + if (missing(key)) key = c("geo_value", attributes(x)$metadata$other_keys) + return(as_tsibble(tibble::as_tibble(x), key, index = "time_value", ...)) } diff --git a/R/epi_verbs.R b/R/epi_verbs.R new file mode 100644 index 000000000..a4ff1d50a --- /dev/null +++ b/R/epi_verbs.R @@ -0,0 +1,74 @@ +#' dplyr verbs +#' +#' dplyr verbs for `epi_df` objexts, preserving class and attributes. +#' +#' @method arrange epi_df +#' @importFrom dplyr arrange +#' @export +arrange.epi_df = function(x, ...) { + metadata = attributes(x)$metadata + x = NextMethod() + class(x) = c("epi_df", class(x)) + attributes(x)$metadata = metadata + return(x) +} + +#' @method group_by epi_df +#' @rdname arrange.epi_df +#' @importFrom dplyr group_by +#' @export +group_by.epi_df = function(x, ...) { + metadata = attributes(x)$metadata + x = NextMethod() + class(x) = c("epi_df", class(x)) + attributes(x)$metadata = metadata + return(x) +} + +#' @method group_modify epi_df +#' @rdname arrange.epi_df +#' @importFrom dplyr group_modify +#' @export +group_modify.epi_df = function(x, ...) { + metadata = attributes(x)$metadata + x = NextMethod() + class(x) = c("epi_df", class(x)) + attributes(x)$metadata = metadata + return(x) +} + +#' @method mutate epi_df +#' @rdname arrange.epi_df +#' @importFrom dplyr mutate +#' @export +mutate.epi_df = function(x, ...) { + metadata = attributes(x)$metadata + x = NextMethod() + class(x) = c("epi_df", class(x)) + attributes(x)$metadata = metadata + return(x) +} + +#' @method ungroup epi_df +#' @rdname arrange.epi_df +#' @importFrom dplyr ungroup +#' @export +ungroup.epi_df = function(x, ...) 
{ + metadata = attributes(x)$metadata + x = NextMethod() + class(x) = c("epi_df", class(x)) + attributes(x)$metadata = metadata + return(x) +} + +#' @method unnest epi_df +#' @rdname arrange.epi_df +#' @importFrom tidyr unnest +#' @export +unnest.epi_df = function(x, ...) { + metadata = attributes(x)$metadata + x = NextMethod() + class(x) = c("epi_df", class(x)) + attributes(x)$metadata = metadata + return(x) +} diff --git a/R/outliers.R b/R/outliers.R index 977b97475..755df75fe 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -47,10 +47,10 @@ #' according to the `new_col_name` argument, containing the outlier detection #' thresholds and replacement values from all detection methods. #' -#' @importFrom dplyr group_modify mutate select +#' @importFrom dplyr mutate select #' @importFrom purrr map pmap_dfc #' @importFrom tidyselect ends_with all_of -#' @importFrom rlang abort enquo +#' @importFrom rlang !! abort enquo #' @export epi_detect_outlr = function(x, var, methods = tibble( @@ -67,21 +67,15 @@ epi_detect_outlr = function(x, var, # Validate combiner combiner = match.arg(combiner) - # Save the metadata (dplyr drops it) - metadata = attributes(x)$metadata - # Outlier detection per group (in case x is grouped) - x = x %>% + return( + x %>% group_modify(epi_detect_outlr_one_grp, var = var, methods = methods, combiner = combiner, new_col_name = new_col_name) - - # Attach the class and metadata and return - class(x) = c("epi_df", class(x)) - attributes(x)$metadata = metadata - return(x) + ) } # Outlier detection over a single group @@ -168,6 +162,7 @@ epi_detect_outlr_one_grp = function(.data_group, var, methods, combiner, #' `upper`, and `replacement`. #' #' @importFrom dplyr mutate pull select +#' @importFrom rlang !! 
#' @export epi_detect_outlr_rm = function(x, var, n = 21, log_transform = FALSE, @@ -247,6 +242,7 @@ epi_detect_outlr_rm = function(x, var, n = 21, #' @importFrom dplyr case_when mutate pull select transmute #' @importFrom fabletools model #' @importFrom feasts STL +#' @importFrom rlang !! #' @export epi_detect_outlr_stl = function(x, var, n_trend = 21, @@ -261,6 +257,7 @@ epi_detect_outlr_stl = function(x, var, # Make x into a tsibble for use with fable x_tsibble = x %>% select(time_value, y = !!var) %>% + tibble::as_tibble() %>% tsibble::as_tsibble(index = time_value) # Transform if requested diff --git a/R/slide.R b/R/slide.R index d59187aab..bcd6eb643 100644 --- a/R/slide.R +++ b/R/slide.R @@ -1,6 +1,6 @@ #' Slide a function over variables in an `epi_df` object #' -#' Slides a given function over variables in an `epi_df` object. See the [slide +#' Slides a given function over variables in an `epi_df` object. See the [slide #' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for #' examples. #' @@ -46,20 +46,20 @@ #' @param align One of "right", "center", or "left", indicating the alignment of #' the sliding window relative to the reference time point. If the alignment #' is "center" and `n` is even, then one more time point will be used after -#' the reference time point than before. Default is "right". +#' the reference time point than before. Default is "right". #' @param before Positive integer less than `n`, specifying the number of time #' points to use in the sliding window strictly before the reference time #' point. For example, setting `before = n-1` would be the same as setting -#' `align = "right"`. The current argument allows for more flexible +#' `align = "right"`. The `before` argument allows for more flexible #' specification of alignment than the `align` parameter, and if specified, -#' overrides `align`. +#' overrides `align`. #' @param complete Should the slide function be run over complete windows only? 
#' Default is `FALSE`, which allows for computation on partial windows. #' @param new_col_name String indicating the name of the new column that will #' contain the derivative values. Default is "slide_value"; note that setting #' `new_col_name` equal to an existing column name will overwrite this column. -#' @param new_col_type One of "dbl", "int", "lgl", "chr", or "list", indicating -#' the data type (tibble abbreviation) for the new column. Default is "dbl". +#' @param as_list_col Should the new column be stored as a list column? Default +#' is `FALSE`. #' @param time_step Optional function used to define the meaning of one time #' step, which if specified, overrides the default choice based on the #' metadata. This function must take a positive integer and return an object @@ -69,36 +69,22 @@ #' `time_type` is "day-time"). #' @return An `epi_df` object given by appending a new column to `x`, named #' according to the `new_col_name` argument, containing the slide values. -#' -#' @details In order to slide a function or formula in a suitable grouped -#' fashion (for example, per geo value), we can use `group_by()` before the -#' call to `epi_slide()`. #' -#' @importFrom dplyr arrange group_modify mutate pull summarize +#' @importFrom dplyr mutate pull summarize #' @importFrom lubridate days weeks -#' @importFrom rlang .data abort enquo enquos +#' @importFrom rlang !! abort enquo enquos #' @export epi_slide = function(x, f, ..., n = 14, align = c("right", "center", "left"), before, complete = FALSE, new_col_name = "slide_value", - new_col_type = c("dbl", "int", "lgl", "chr", "list"), - time_step) { + as_list_col = FALSE, time_step) { # Check we have an `epi_df` object if (!inherits(x, "epi_df")) abort("`x` must be of class `epi_df`.") - - # Which slide_index function? 
- new_col_type = match.arg(new_col_type) - index_fun = switch(new_col_type, - "dbl" = slider::slide_index_dbl, - "int" = slider::slide_index_int, - "lgl" = slider::slide_index_lgl, - "chr" = slider::slide_index_chr, - "list" = slider::slide_index) - + # What is one time step? if (!missing(time_step)) before_fun = time_step else if (attributes(x)$metadata$time_type == "week") before_fun = weeks else before_fun = days # Use days for time_type = "day" or "day-time" - + # If before is missing, then use align to set up alignment if (missing(before)) { align = match.arg(align) @@ -118,21 +104,37 @@ epi_slide = function(x, f, ..., n = 14, align = c("right", "center", "left"), # Otherwise set up alignment based on passed before value else { - if (before < 0 || before > n-1) + if (before < 0 || before > n-1) { abort("`before` must be in between 0 and n-1`.") + } before_num = before_fun(before) after_num = before_fun(n-1-before) } - - # Save the metadata (dplyr drops it) - metadata = attributes(x)$metadata + + # Convenience function for sliding over just one group + slide_one_grp = function(.data_group, + f, ..., + before_num, + after_num, + complete, + new_col_name) { + slide_values = slider::slide_index(.x = .data_group, + .i = .data_group$time_value, + .f = f, ..., + .before = before_num, + .after = after_num, + .complete = complete) + return(mutate(.data_group, !!new_col_name := slide_values)) + } + + # Arrange by increasing time_value, else slide may not work + x = arrange(x, time_value) # If f is not missing, then just go ahead, slide by group if (!missing(f)) { x = x %>% - group_modify(epi_slide_one_grp, - index_fun = index_fun, + group_modify(slide_one_grp, f = f, ..., before_num = before_num, after_num = after_num, @@ -143,40 +145,27 @@ epi_slide = function(x, f, ..., n = 14, align = c("right", "center", "left"), # Else interpret ... as an expression for tidy evaluation else { quos = enquos(...) 
-    if (length(quos) == 0)
-      abort("If `f` is missing then a computation must be specified via `...`.")
-    if (length(quos) > 1)
-      abort(paste("If `f` is missing then only a single computation can be",
-                  "specified via `...`."))
+    if (length(quos) == 0) {
+      abort("If `f` is missing then a computation must be specified via `...`.")
+    }
+    if (length(quos) > 1) {
+      abort("If `f` is missing then only a single computation can be specified via `...`.")
+    }
     quo = quos[[1]]
     f = function(x, quo, ...) rlang::eval_tidy(quo, x)
     new_col_name = names(rlang::quos_auto_name(quos))
     x = x %>%
-      group_modify(epi_slide_one_grp,
-                   index_fun = index_fun,
+      group_modify(slide_one_grp,
                    f = f, quo = quo,
                    before_num = before_num,
                    after_num = after_num,
                    complete = complete,
                    new_col_name = new_col_name)
   }
-
-  # Attach the class and metadata and return
-  class(x) = c("epi_df", class(x))
-  attributes(x)$metadata = metadata
-  return(x)
-}
 
-# Slide over a single group
-epi_slide_one_grp = function(.data_group, index_fun, f, ..., before_num,
-                             after_num, complete, new_col_name) {
-  slide_values = index_fun(.x = .data_group,
-                           .i = .data_group$time_value,
-                           .f = f, ...,
-                           .before = before_num,
-                           .after = after_num,
-                           .complete = complete)
-  return(mutate(.data_group, !!new_col_name := slide_values))
+  # Unnest if we need to, and return
+  if (!as_list_col) x = tidyr::unnest(x, !!new_col_name)
+  return(x)
 }
diff --git a/R/utils.R b/R/utils.R
index c89c1b587..642a64b0a 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -1,7 +1,7 @@
 #' Computations with `NA` values removed
 #'
 #' These are just wrapper functions for common computational utilities with `NA`
-#' values removed by default. 
+#' values removed by default.
 #'
 #' @export
 Min = function(x) min(x, na.rm = TRUE)
@@ -24,13 +24,25 @@ Median = function(x) median(x, na.rm = TRUE)
 
 ##########
 
+#' Start, middle, and end extractors
+#'
+#' These are just convenience functions for extracting the start, middle, and
+#' end of a sequence.
+#'
 #' @export
-#' @noRd
-Start = function(x) head(x, 1)
+Start = function(x) x[1]
 
+#' @rdname Start
 #' @export
-#' @noRd
-End = function(x) tail(x, 1)
+Middle = function(x, floor = TRUE) {
+  ifelse(floor, x[floor(length(x)/2)], x[ceiling(length(x)/2)])
+}
+
+#' @rdname Start
+#' @export
+End = function(x) x[length(x)]
+
+##########
 
 #' @export
 #' @noRd
@@ -41,3 +53,80 @@ quiet = function(x) {
 }
 
 ##########
+
+guess_geo_type = function(geo_value) {
+  if (is.character(geo_value)) {
+    # Convert geo values to lowercase
+    geo_value = tolower(geo_value)
+
+    # If all geo values are state abbreviations, then use "state"
+    state_values = c(tolower(state.abb), "as", "dc", "gu", "mp", "pr", "vi")
+    if (all(geo_value %in% state_values)) return("state")
+
+    # Else if all geo values are 2 letters, then use "nation"
+    else if (all(grepl("^[a-z]{2}$", geo_value))) return("nation")
+
+    # Else if all geo values are 5 numbers, then use "county"
+    else if (all(grepl("^[0-9]{5}$", geo_value))) return("county")
+  }
+
+  else if (is.numeric(geo_value)) {
+    # Convert geo values to integers
+    geo_value = as.integer(geo_value)
+
+    # If the max geo value is at most 10, then use "hhs"
+    if (max(geo_value) <= 10) return("hhs")
+
+    # Else if the max geo value is at most 457, then use "hrr"
+    if (max(geo_value) <= 457) return("hrr")
+  }
+
+  # If we got here then we failed
+  return("custom")
+}
+
+guess_time_type = function(time_value) {
+  # Convert character time values to Date or POSIXct
+  if (is.character(time_value)) {
+    if (nchar(time_value[1]) <= 10) {
+      new_time_value = tryCatch({ as.Date(time_value) },
+                                error = function(e) NULL)
+    }
+    else {
+      new_time_value = tryCatch({ as.POSIXct(time_value) },
+                                error = function(e) NULL)
+    }
+    if (!is.null(new_time_value)) time_value = new_time_value
+  }
+
+  # Now, if a POSIXct class, then use "day-time"
+  if (inherits(time_value, "POSIXct")) return("day-time")
+
+  # Else, if a Date class, then use "week" or "day" depending on gaps
+  else if (inherits(time_value, "Date")) {
+    return(ifelse(all(diff(sort(time_value)) == 7), "week", "day"))
+  }
+
+  # If we got here then we failed
+  return("custom")
+}
+
+##########
+
+# Create an auto-named list
+enlist = function(...) {
+  x = list(...)
+  n = as.character(sys.call())[-1]
+  if (!is.null(n0 <- names(x))) {
+    n[n0 != ""] = n0[n0 != ""]
+  }
+  names(x) = n
+  return(x)
+}
+
+# Variable assignment from a list. NOT USED. Something is broken, this doesn't
+# seem to work completely as expected: the variables it defines don't propagate
+# down to child environments
+list2var = function(x) {
+  list2env(x, envir = parent.frame())
+}
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 1de3a50e2..77399cf43 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -16,11 +16,9 @@ articles:
   - epiprocess
   - slide
   - pct_change
-  - derivative
   - correlation
   - aggregation
   - outliers
-  - issues
 repo:
   url:
diff --git a/docs/404.html b/docs/404.html
index 45db31ccb..d725de54c 100644
--- a/docs/404.html
+++ b/docs/404.html
@@ -100,22 +100,16 @@ 1. Slide a computation over signal values