cmu-delphi · dsweber2 · Jun 6, 2024 · Jun 7, 2024 · Jun 7, 2024 · Jun 7, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -66,7 +66,7 @@ Config/testthat/edition: 3
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 Depends:
     R (>= 2.10)
 URL: https://cmu-delphi.github.io/epiprocess/

diff --git a/NAMESPACE b/NAMESPACE
@@ -61,6 +61,7 @@ export(epix_merge)
 export(epix_slide)
 export(epix_truncate_versions_after)
 export(filter)
+export(geo_column_names)
 export(group_by)
 export(group_modify)
 export(growth_rate)
@@ -75,9 +76,11 @@ export(next_after)
 export(relocate)
 export(rename)
 export(slice)
+export(time_column_names)
 export(ungroup)
 export(unnest)
 export(validate_epi_archive)
+export(version_column_names)
 importFrom(checkmate,anyInfinite)
 importFrom(checkmate,anyMissing)
 importFrom(checkmate,assert)
@@ -100,6 +103,7 @@ importFrom(checkmate,test_subset)
 importFrom(checkmate,vname)
 importFrom(cli,cat_line)
 importFrom(cli,cli_abort)
+importFrom(cli,cli_inform)
 importFrom(cli,cli_vec)
 importFrom(cli,cli_warn)
 importFrom(cli,format_message)
@@ -185,6 +189,7 @@ importFrom(tibble,as_tibble)
 importFrom(tibble,new_tibble)
 importFrom(tibble,validate_tibble)
 importFrom(tidyr,unnest)
+importFrom(tidyselect,any_of)
 importFrom(tidyselect,eval_select)
 importFrom(tidyselect,starts_with)
 importFrom(tsibble,as_tsibble)

diff --git a/R/archive.R b/R/archive.R
@@ -441,6 +441,8 @@
 
 #' `as_epi_archive` converts a data frame, data table, or tibble into an
 #' `epi_archive` object.
+#' @param ... used for specifying column names, as in [`dplyr::rename`]. For
+#'   example `version = release_date`
 #'
 #' @rdname epi_archive
 #'
@@ -453,11 +455,17 @@
     additional_metadata = NULL,
     compactify = NULL,
     clobberable_versions_start = NULL,
-    versions_end = NULL) {
+    .versions_end = NULL, ...,
+    versions_end = .versions_end) {
   assert_data_frame(x)
+  x <- rename(x, ...)
+  x <- guess_column_name(x, "time_value", time_column_names())
+  x <- guess_column_name(x, "geo_value", geo_column_names())
+  x <- guess_column_name(x, "version", version_column_names())
   if (!test_subset(c("geo_value", "time_value", "version"), names(x))) {
     cli_abort(
-      "Columns `geo_value`, `time_value`, and `version` must be present in `x`."
+      "Either columns `geo_value`, `time_value`, and `version` must be present in `x`, or related columns (see the internal
+ functions `guess_time_column_name()`, `guess_geo_column_name()` and/or `guess_geo_version_name()` for complete list)."
     )
   }
   if (anyMissing(x$version)) {

diff --git a/R/epi_df.R b/R/epi_df.R
@@ -95,7 +95,7 @@ NULL
 #'
 #' @export
 new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of,
-                       additional_metadata = list(), ...) {
+                       additional_metadata = list()) {
   assert_data_frame(x)
   assert_list(additional_metadata)
 
@@ -249,25 +249,37 @@ as_epi_df.epi_df <- function(x, ...) {
 
 #' @method as_epi_df tbl_df
 #' @describeIn as_epi_df The input tibble `x` must contain the columns
-#'   `geo_value` and `time_value`. All other columns will be preserved as is,
-#'   and treated as measured variables. If `as_of` is missing, then the function
-#'   will try to guess it from an `as_of`, `issue`, or `version` column of `x`
-#'   (if any of these are present), or from as an `as_of` field in its metadata
-#'   (stored in its attributes); if this fails, then the current day-time will
-#'   be used.
+#'   `geo_value` and `time_value`, or column names that uniquely map onto these
+#'   (e.g. `date` or `province`). Alternatively, you can specify the conversion
+#'   explicitly (`time_value = someWeirdColumnName`). All other columns not
+#'   specified as `other_keys` will be preserved as is, and treated as measured
+#'   variables.
+#'
+#'  If `as_of` is missing, then the function will try to guess it from an
+#'   `as_of`, `issue`, or `version` column of `x` (if any of these are present),
+#'   or from as an `as_of` field in its metadata (stored in its attributes); if
+#'   this fails, then the current day-time will be used.
 #' @importFrom rlang .data
+#' @importFrom tidyselect any_of
+#' @importFrom cli cli_inform
 #' @export
 as_epi_df.tbl_df <- function(x, geo_type, time_type, as_of,
-                             additional_metadata = list(), ...) {
+                             additional_metadata = list(),
+                             ...) {
+  # possible standard substitutions for time_value
+  x <- rename(x, ...)
+  x <- guess_column_name(x, "time_value", time_column_names())
+  x <- guess_column_name(x, "geo_value", geo_column_names())
   if (!test_subset(c("geo_value", "time_value"), names(x))) {
     cli_abort(
-      "Columns `geo_value` and `time_value` must be present in `x`."
+      "Either columns `geo_value` and `time_value` must be present in `x`, or related columns (see the internal
+ functions `guess_time_column_name()` and/or `guess_geo_column_name()` for a complete list)."
     )
   }
 
   new_epi_df(
     x, geo_type, time_type, as_of,
-    additional_metadata, ...
+    additional_metadata
   )
 }
 

diff --git a/R/utils.R b/R/utils.R
@@ -448,6 +448,89 @@ guess_time_type <- function(time_value) {
   return("custom")
 }
 
+#'  given a vector of characters, add the same values, but upcased, e.g.
+#'   "date" -> c("date", "Date")
+#'   "target_date" -> c("target_date", "Target_Date")
+#' @keywords internal
+upcase_snake_case <- function(vec) {
+  upper_vec <- strsplit(vec, "_") %>%
+    map(function(name) paste0(toupper(substr(name, 1, 1)), substr(name, 2, nchar(name)), collapse = "_")) %>%
+    unlist()
+  c(vec, upper_vec)
+}
+
+#' potential time_value columns
+#' @description
+#' the full list of potential substitutions for the `time_value` column name:
+#' `r time_column_names()`
+#' @export
+time_column_names <- function() {
+  substitutions <- c(
+    "time_value", "date", "time", "datetime", "dateTime", "date_time", "target_date",
+    "week", "epiweek", "month", "mon", "year", "yearmon", "yearmonth",
+    "yearMon", "yearMonth", "dates", "time_values", "target_dates", "time_Value"
+  )
+  substitutions <- upcase_snake_case(substitutions)
+  names(substitutions) <- rep("time_value", length(substitutions))
+  return(substitutions)
+}
+#
+#' potential geo_value columns
+#' @description
+#' the full list of potential substitutions for the `geo_value` column name:
+#' `r geo_column_names()`
+#' @export
+geo_column_names <- function() {
+  substitutions <- c(
+    "geo_value", "geo_values", "geo_id", "geos", "location", "jurisdiction", "fips", "zip",
+    "county", "hrr", "msa", "state", "province", "nation", "states",
+    "provinces", "counties", "geo_Value"
+  )
+  substitutions <- upcase_snake_case(substitutions)
+  names(substitutions) <- rep("geo_value", length(substitutions))
+  return(substitutions)
+}
+
+#' potential version columns
+#' @description
+#' the full list of potential substitutions for the `version` column name:
+#' `r version_column_names()`
+#' @export
+version_column_names <- function() {
+  substitutions <- c(
+    "version", "issue", "release"
+  )
+  substitutions <- upcase_snake_case(substitutions)
+  names(substitutions) <- rep("version", length(substitutions))
+  return(substitutions)
+}
+
+#' rename potential time_value columns
+#'
+#' @description
+#' potentially renames
+#' @param x the tibble to potentially rename
+#' @param substitions a named vector. the potential substitions, with every name `time_value`
+#' @keywords internal
+guess_column_name <- function(x, column_name, substitutions) {
+  if (!(column_name %in% names(x))) {
+    x <- tryCatch(x %>% rename(any_of(substitutions)),
+      error = function(cond) {
+        cli_abort("{names(x)[names(x) %in% substitutions]} are both/all valid substitutions.
+Either `rename` some yourself or drop some.")
+      }
+    )
+    # if none of the names are in substitutions, and `column_name` isn't a column, we're missing a relevant column
+    if (!any(names(x) %in% substitutions)) {
+      cli_abort("There is no {column_name} column or similar name. See e.g. [`time_column_name()`] for a complete list")
+    }
+    if (any(substitutions != "")) {
+      cli_inform("inferring {column_name} column.")
+    }
+  }
+  return(x)
+}
+
 ##########
 
 

diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd
diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd
diff --git a/man/geo_column_names.Rd b/man/geo_column_names.Rd
diff --git a/man/guess_column_name.Rd b/man/guess_column_name.Rd
diff --git a/man/new_epi_df.Rd b/man/new_epi_df.Rd
diff --git a/man/time_column_names.Rd b/man/time_column_names.Rd
diff --git a/man/upcase_snake_case.Rd b/man/upcase_snake_case.Rd
diff --git a/man/version_column_names.Rd b/man/version_column_names.Rd
diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R
@@ -8,16 +8,34 @@
 
 test_that("data.frame must contain geo_value, time_value and version columns", {
   expect_error(as_epi_archive(select(dt, -geo_value), compactify = FALSE),
-    regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`."
+    regexp = "There is no geo_value column or similar name"
   )
-  expect_error(as_epi_archive(select(dt, -time_value), compactify = FALSE),
-    regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`."
+  expect_error(expect_message(as_epi_archive(select(dt, -time_value), compactify = FALSE)),
+    regexp = "There is no time_value column or similar name"
   )
   expect_error(as_epi_archive(select(dt, -version), compactify = FALSE),
-    regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`."
+    regexp = "There is no version column or similar name"
   )
 })
 
+test_that("as_epi_archive custom name mapping works correctly", {
+  # custom name works correctly
+  suppressWarnings(expect_equal(
+    as_epi_archive(rename(dt, weirdName = version), version = weirdName),
+    as_epi_archive(dt)
+  ))
+  suppressWarnings(expect_equal(
+    as_epi_archive(rename(dt, weirdName = geo_value), geo_value = weirdName),
+    as_epi_archive(dt)
+  ))
+  suppressWarnings(expect_equal(
+    as_epi_archive(rename(dt, weirdName = time_value), time_value = weirdName),
+    as_epi_archive(dt)
+  ))
+
+  expect_error(as_epi_archive(rename(dt, weirdName = version), version = weirdName, version = time_value), "Names must be unique")
+})
+
 test_that("other_keys can only contain names of the data.frame columns", {
   expect_error(as_epi_archive(dt, other_keys = "xyz", compactify = FALSE),
     regexp = "`other_keys` must be contained in the column names of `x`."