Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

epi_df argument refactoring #460

Open
wants to merge 15 commits into
base: dev
Choose a base branch
from
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ Config/testthat/edition: 3
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
Depends:
R (>= 2.10)
URL: https://cmu-delphi.github.io/epiprocess/
Expand Down
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ export(epix_merge)
export(epix_slide)
export(epix_truncate_versions_after)
export(filter)
export(geo_column_names)
export(group_by)
export(group_modify)
export(growth_rate)
Expand All @@ -75,9 +76,11 @@ export(next_after)
export(relocate)
export(rename)
export(slice)
export(time_column_names)
export(ungroup)
export(unnest)
export(validate_epi_archive)
export(version_column_names)
importFrom(checkmate,anyInfinite)
importFrom(checkmate,anyMissing)
importFrom(checkmate,assert)
Expand All @@ -100,6 +103,7 @@ importFrom(checkmate,test_subset)
importFrom(checkmate,vname)
importFrom(cli,cat_line)
importFrom(cli,cli_abort)
importFrom(cli,cli_inform)
importFrom(cli,cli_vec)
importFrom(cli,cli_warn)
importFrom(cli,format_message)
Expand Down Expand Up @@ -185,6 +189,7 @@ importFrom(tibble,as_tibble)
importFrom(tibble,new_tibble)
importFrom(tibble,validate_tibble)
importFrom(tidyr,unnest)
importFrom(tidyselect,any_of)
importFrom(tidyselect,eval_select)
importFrom(tidyselect,starts_with)
importFrom(tsibble,as_tsibble)
Expand Down
12 changes: 10 additions & 2 deletions R/archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,8 @@

#' `as_epi_archive` converts a data frame, data table, or tibble into an
#' `epi_archive` object.
#' @param ... used for specifying column names, as in [`dplyr::rename`]. For
#' example `version = release_date`
#'
#' @rdname epi_archive
#'
Expand All @@ -453,11 +455,17 @@
additional_metadata = NULL,
compactify = NULL,
clobberable_versions_start = NULL,
versions_end = NULL) {
.versions_end = NULL, ...,
versions_end = .versions_end) {
assert_data_frame(x)
x <- rename(x, ...)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: do we need to do a tryCatch here, too?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error in this case is

as_epi_archive(rename(dt, weirdName = version), version = weirdName, version = time_value)
Error in `rename()` at �]8;line = 461:col = 3;file:///home/dsweber/allHail/delphi/epiprocess/R/archive.R�epiprocess/R/archive.R:461:3�]8;;�:
! Names must be unique.
✖ These names are duplicated:
  * "version" at locations 1 and 2.
Run `�]8;;rstudio:run:rlang::last_trace()�rlang::last_trace()�]8;;�` to see where the error occurred.

which I think is more obvious what went wrong. For the other rename, it's buried a little deeper why the names are redundant, so I wanted to give some context

x <- guess_column_name(x, "time_value", time_column_names())
x <- guess_column_name(x, "geo_value", geo_column_names())
x <- guess_column_name(x, "version", version_column_names())
if (!test_subset(c("geo_value", "time_value", "version"), names(x))) {
cli_abort(
"Columns `geo_value`, `time_value`, and `version` must be present in `x`."
"Either columns `geo_value`, `time_value`, and `version` must be present in `x`, or related columns (see the internal

Check warning on line 467 in R/archive.R

View workflow job for this annotation

GitHub Actions / lint

file=R/archive.R,line=467,col=121,[line_length_linter] Lines should not be more than 120 characters. This line is 123 characters.
functions `guess_time_column_name()`, `guess_geo_column_name()` and/or `guess_geo_version_name()` for complete list)."
)
}
if (anyMissing(x$version)) {
Expand Down
32 changes: 22 additions & 10 deletions R/epi_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ NULL
#'
#' @export
new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
additional_metadata = list(), ...) {
additional_metadata = list()) {
assert_data_frame(x)
assert_list(additional_metadata)

Expand Down Expand Up @@ -249,25 +249,37 @@ as_epi_df.epi_df <- function(x, ...) {

#' @method as_epi_df tbl_df
#' @describeIn as_epi_df The input tibble `x` must contain the columns
#' `geo_value` and `time_value`. All other columns will be preserved as is,
#' and treated as measured variables. If `as_of` is missing, then the function
#' will try to guess it from an `as_of`, `issue`, or `version` column of `x`
#' (if any of these are present), or from as an `as_of` field in its metadata
#' (stored in its attributes); if this fails, then the current day-time will
#' be used.
#' `geo_value` and `time_value`, or column names that uniquely map onto these
#' (e.g. `date` or `province`). Alternatively, you can specify the conversion
#' explicitly (`time_value = someWeirdColumnName`). All other columns not
#' specified as `other_keys` will be preserved as is, and treated as measured
#' variables.
#'
#' If `as_of` is missing, then the function will try to guess it from an
#' `as_of`, `issue`, or `version` column of `x` (if any of these are present),
#' or from as an `as_of` field in its metadata (stored in its attributes); if
#' this fails, then the current day-time will be used.
#' @importFrom rlang .data
#' @importFrom tidyselect any_of
#' @importFrom cli cli_inform
#' @export
as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of,
additional_metadata = list(), ...) {
additional_metadata = list(),
...) {
dsweber2 marked this conversation as resolved.
Show resolved Hide resolved
# possible standard substitutions for time_value
x <- rename(x, ...)
x <- guess_column_name(x, "time_value", time_column_names())
x <- guess_column_name(x, "geo_value", geo_column_names())
if (!test_subset(c("geo_value", "time_value"), names(x))) {
cli_abort(
"Columns `geo_value` and `time_value` must be present in `x`."
"Either columns `geo_value` and `time_value` must be present in `x`, or related columns (see the internal
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: do we need the same type of check and error in the as_epi_archive version of this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's already there? The most recent commit includes a wording change to match this though.

functions `guess_time_column_name()` and/or `guess_geo_column_name()` for a complete list)."
)
}

new_epi_df(
x, geo_type, time_type, as_of,
additional_metadata, ...
additional_metadata
)
}

Expand Down
83 changes: 83 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,89 @@ guess_time_type <- function(time_value) {
return("custom")
}

#' given a vector of characters, add the same values, but upcased, e.g.
#' "date" -> c("date", "Date")
#' "target_date" -> c("target_date", "Target_Date")
#' @keywords internal
upcase_snake_case <- function(vec) {
upper_vec <- strsplit(vec, "_") %>%
map(function(name) paste0(toupper(substr(name, 1, 1)), substr(name, 2, nchar(name)), collapse = "_")) %>%
unlist()
c(vec, upper_vec)
}

#' potential time_value columns
#' @description
#' the full list of potential substitutions for the `time_value` column name:
#' `r time_column_names()`
#' @export
time_column_names <- function() {
substitutions <- c(
"time_value", "date", "time", "datetime", "dateTime", "date_time", "target_date",
"week", "epiweek", "month", "mon", "year", "yearmon", "yearmonth",
"yearMon", "yearMonth", "dates", "time_values", "target_dates", "time_Value"
)
substitutions <- upcase_snake_case(substitutions)
names(substitutions) <- rep("time_value", length(substitutions))
return(substitutions)
}
#
#' potential geo_value columns
#' @description
#' the full list of potential substitutions for the `geo_value` column name:
#' `r geo_column_names()`
#' @export
geo_column_names <- function() {
substitutions <- c(
"geo_value", "geo_values", "geo_id", "geos", "location", "jurisdiction", "fips", "zip",
"county", "hrr", "msa", "state", "province", "nation", "states",
"provinces", "counties", "geo_Value"
)
substitutions <- upcase_snake_case(substitutions)
names(substitutions) <- rep("geo_value", length(substitutions))
return(substitutions)
}

#' potential version columns
#' @description
#' the full list of potential substitutions for the `version` column name:
#' `r version_column_names()`
#' @export
version_column_names <- function() {
substitutions <- c(
"version", "issue", "release"
)
substitutions <- upcase_snake_case(substitutions)
names(substitutions) <- rep("version", length(substitutions))
return(substitutions)
}

#' rename potential time_value columns
#'
#' @description
#' potentially renames
#' @param x the tibble to potentially rename
#' @param substitions a named vector. the potential substitions, with every name `time_value`
#' @keywords internal
guess_column_name <- function(x, column_name, substitutions) {
if (!(column_name %in% names(x))) {
x <- tryCatch(x %>% rename(any_of(substitutions)),
error = function(cond) {
cli_abort("{names(x)[names(x) %in% substitutions]} are both/all valid substitutions.
Either `rename` some yourself or drop some.")
}
)
# if none of the names are in substitutions, and `column_name` isn't a column, we're missing a relevant column
if (!any(names(x) %in% substitutions)) {
cli_abort("There is no {column_name} column or similar name. See e.g. [`time_column_name()`] for a complete list")
}
if (any(substitutions != "")) {
cli_inform("inferring {column_name} column.")
}
}
return(x)
}

##########


Expand Down
16 changes: 10 additions & 6 deletions man/as_epi_df.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion man/epi_archive.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions man/geo_column_names.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/guess_column_name.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/new_epi_df.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions man/time_column_names.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 16 additions & 0 deletions man/upcase_snake_case.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions man/version_column_names.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 22 additions & 4 deletions tests/testthat/test-archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,34 @@

test_that("data.frame must contain geo_value, time_value and version columns", {
expect_error(as_epi_archive(select(dt, -geo_value), compactify = FALSE),
regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`."
regexp = "There is no geo_value column or similar name"
)
expect_error(as_epi_archive(select(dt, -time_value), compactify = FALSE),
regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`."
expect_error(expect_message(as_epi_archive(select(dt, -time_value), compactify = FALSE)),
regexp = "There is no time_value column or similar name"
)
expect_error(as_epi_archive(select(dt, -version), compactify = FALSE),
regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`."
regexp = "There is no version column or similar name"
)
})

test_that("as_epi_archive custom name mapping works correctly", {
# custom name works correctly
suppressWarnings(expect_equal(
as_epi_archive(rename(dt, weirdName = version), version = weirdName),
as_epi_archive(dt)
))
suppressWarnings(expect_equal(
as_epi_archive(rename(dt, weirdName = geo_value), geo_value = weirdName),
as_epi_archive(dt)
))
suppressWarnings(expect_equal(
as_epi_archive(rename(dt, weirdName = time_value), time_value = weirdName),
as_epi_archive(dt)
))

expect_error(as_epi_archive(rename(dt, weirdName = version), version = weirdName, version = time_value), "Names must be unique")

Check warning on line 36 in tests/testthat/test-archive.R

View workflow job for this annotation

GitHub Actions / lint

file=tests/testthat/test-archive.R,line=36,col=121,[line_length_linter] Lines should not be more than 120 characters. This line is 130 characters.
})

test_that("other_keys can only contain names of the data.frame columns", {
expect_error(as_epi_archive(dt, other_keys = "xyz", compactify = FALSE),
regexp = "`other_keys` must be contained in the column names of `x`."
Expand Down
Loading
Loading