From 881fa6fdbaa7f81dbb1e950c1cee4b599f203b05 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 8 Apr 2025 14:06:47 -0700 Subject: [PATCH 1/7] feat: supply tidyselection for sum_groups_epi_df --- R/methods-epi_df.R | 47 ++++++++++++++++------------ man/sum_groups_epi_df.Rd | 18 +++++++++-- tests/testthat/test-methods-epi_df.R | 6 ++-- 3 files changed, 46 insertions(+), 25 deletions(-) diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 1191521c9..a0d1055f5 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -499,34 +499,41 @@ group_epi_df <- function(x, exclude = character()) { #' the resulting `epi_df` will have `geo_value` set to `"total"`. #' #' @param .x an `epi_df` -#' @param sum_cols character vector of the columns to aggregate -#' @param group_cols character vector of column names to group by. "time_value" is -#' included by default. +#' @param sum_cols <[`tidy-select`][dplyr::dplyr_tidy_select]> One unquoted +#' expression. Variable names can be used as if they like `c(x, y)` +#' were positions in the data frame, and expressions like `x:y` can +#' be used to select a range of variables. +#' @param .group_cols character vector of column names to group by. "time_value" is +#' included by default. #' @return an `epi_df` object #' #' @export -sum_groups_epi_df <- function(.x, sum_cols = "value", group_cols = character()) { +#' @examples +#' # This data has other_keys age_group and edu_qual. +#' # We can aggregate num_graduates within geo_value +#' grad_employ_subset +#' +#' grad_employ_subset %>% +#' select(geo_value:num_graduates) %>% +#' sum_groups_epi_df(num_graduates, group_cols = "geo_value") +sum_groups_epi_df <- function(.x, sum_cols, group_cols = "time_value") { assert_class(.x, "epi_df") - assert_character(sum_cols) assert_character(group_cols) - checkmate::assert_subset(sum_cols, setdiff(names(.x), key_colnames(.x))) checkmate::assert_subset(group_cols, key_colnames(.x)) if (!"time_value" %in% group_cols) { group_cols <- c("time_value", group_cols) } - - out <- .x %>% - group_by(across(all_of(group_cols))) %>% - dplyr::summarize(across(all_of(sum_cols), sum), .groups = "drop") + sum_cols <- rlang::enquo(sum_cols) + pos <- tidyselect::eval_select(sum_cols, .x) # trigger this to ensure cols exist + out <- group_by(.x, across(all_of(group_cols))) %>% + dplyr::summarize(across(!!sum_cols, sum), .groups = "drop") # To preserve epi_df-ness, we need to ensure that the `geo_value` column is # present. - out <- if (!"geo_value" %in% group_cols) { - out %>% + if (!"geo_value" %in% group_cols) { + out <- out %>% mutate(geo_value = "total") %>% - relocate(geo_value, .before = 1) - } else { - out + relocate(.data$geo_value, .before = 1) } # The `geo_type` will be correctly inherited here by the following logic: @@ -535,10 +542,10 @@ sum_groups_epi_df <- function(.x, sum_cols = "value", group_cols = character()) # - if `geo_value` is not in `group_cols`, then the constructor will see # the unrecognizeable "total" value and will correctly infer the "custom" # geo_type. - out %>% - as_epi_df( - as_of = attr(.x, "metadata")$as_of, - other_keys = intersect(attr(.x, "metadata")$other_keys, group_cols) - ) %>% + as_epi_df( + out, + as_of = attr(.x, "metadata")$as_of, + other_keys = intersect(attr(.x, "metadata")$other_keys, group_cols) + ) %>% arrange_canonical() } diff --git a/man/sum_groups_epi_df.Rd b/man/sum_groups_epi_df.Rd index f1ba84745..267949b5c 100644 --- a/man/sum_groups_epi_df.Rd +++ b/man/sum_groups_epi_df.Rd @@ -4,14 +4,17 @@ \alias{sum_groups_epi_df} \title{Aggregate an \code{epi_df} object} \usage{ -sum_groups_epi_df(.x, sum_cols = "value", group_cols = character()) +sum_groups_epi_df(.x, sum_cols, group_cols = "time_value") } \arguments{ \item{.x}{an \code{epi_df}} -\item{sum_cols}{character vector of the columns to aggregate} +\item{sum_cols}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> One unquoted +expression. Variable names can be used as if they like \code{c(x, y)} +were positions in the data frame, and expressions like \code{x:y} can +be used to select a range of variables.} -\item{group_cols}{character vector of column names to group by. "time_value" is +\item{.group_cols}{character vector of column names to group by. "time_value" is included by default.} } \value{ @@ -22,3 +25,12 @@ Aggregates an \code{epi_df} object by the specified group columns, summing the \code{value} column, and returning an \code{epi_df}. If aggregating over \code{geo_value}, the resulting \code{epi_df} will have \code{geo_value} set to \code{"total"}. } +\examples{ +# This data has other_keys age_group and edu_qual. +# We can aggregate num_graduates within geo_value +grad_employ_subset + +grad_employ_subset \%>\% + select(geo_value:num_graduates) \%>\% + sum_groups_epi_df(num_graduates, group_cols = "geo_value") +} diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R index 3e5c180b0..c1aadfced 100644 --- a/tests/testthat/test-methods-epi_df.R +++ b/tests/testthat/test-methods-epi_df.R @@ -311,16 +311,18 @@ test_that("complete.epi_df works", { }) test_that("sum_groups_epi_df works", { - out <- toy_epi_df %>% sum_groups_epi_df(sum_cols = "x") + out <- toy_epi_df %>% sum_groups_epi_df("x") expected_out <- toy_epi_df %>% group_by(time_value) %>% summarize(x = sum(x)) %>% mutate(geo_value = "total") %>% as_epi_df(as_of = attr(toy_epi_df, "metadata")$as_of) expect_equal(out, expected_out) + out <- toy_epi_df %>% sum_groups_epi_df(x) + expect_equal(out, expected_out) out <- toy_epi_df %>% - sum_groups_epi_df(sum_cols = c("x", "y"), group_cols = c("time_value", "geo_value", "indic_var1")) + sum_groups_epi_df(c(x, y), group_cols = c("time_value", "geo_value", "indic_var1")) expected_out <- toy_epi_df %>% group_by(time_value, geo_value, indic_var1) %>% summarize(x = sum(x), y = sum(y), .groups = "drop") %>% From 9bdb05639d786d78e78a766ee523d9a8ed8af05e Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 8 Apr 2025 14:09:06 -0700 Subject: [PATCH 2/7] add one more test --- tests/testthat/test-methods-epi_df.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R index c1aadfced..bc9f1e35d 100644 --- a/tests/testthat/test-methods-epi_df.R +++ b/tests/testthat/test-methods-epi_df.R @@ -329,4 +329,7 @@ test_that("sum_groups_epi_df works", { as_epi_df(as_of = attr(toy_epi_df, "metadata")$as_of, other_keys = "indic_var1") %>% arrange_canonical() expect_equal(out, expected_out) + out <- toy_epi_df %>% + sum_groups_epi_df(x:y, group_cols = c("time_value", "geo_value", "indic_var1")) + expect_equal(out, expected_out) }) From 368fc39095fb5d43a27a50141f51e66afb06d206 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 8 Apr 2025 14:10:42 -0700 Subject: [PATCH 3/7] bump version, add news --- DESCRIPTION | 2 +- NEWS.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 19bc82e01..a700b3e64 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.11.3 +Version: 0.11.4 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", , "lcbrooks+github@andrew.cmu.edu", role = c("aut", "cre")), diff --git a/NEWS.md b/NEWS.md index 88acabee4..2734419bf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -58,6 +58,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat - Various functions are now faster, using faster variants of core operations and avoiding reconstructing grouped `epi_df`s when unnecessary. - Add `autoplot.epi_archive()` to display revision patterns. +- `sum_groups_epi_df()` now supports tidyselect syntax in it's second argument (#655). ## Bug fixes From b56d55548f0456ae3c533aaa6fe322d514ab9ee5 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 8 Apr 2025 14:52:51 -0700 Subject: [PATCH 4/7] straggling period --- R/methods-epi_df.R | 3 +-- man/sum_groups_epi_df.Rd | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index a0d1055f5..fd3d8f299 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -503,7 +503,7 @@ group_epi_df <- function(x, exclude = character()) { #' expression. Variable names can be used as if they like `c(x, y)` #' were positions in the data frame, and expressions like `x:y` can #' be used to select a range of variables. -#' @param .group_cols character vector of column names to group by. "time_value" is +#' @param group_cols character vector of column names to group by. "time_value" is #' included by default. #' @return an `epi_df` object #' @@ -514,7 +514,6 @@ group_epi_df <- function(x, exclude = character()) { #' grad_employ_subset #' #' grad_employ_subset %>% -#' select(geo_value:num_graduates) %>% #' sum_groups_epi_df(num_graduates, group_cols = "geo_value") sum_groups_epi_df <- function(.x, sum_cols, group_cols = "time_value") { assert_class(.x, "epi_df") diff --git a/man/sum_groups_epi_df.Rd b/man/sum_groups_epi_df.Rd index 267949b5c..6b8ed611d 100644 --- a/man/sum_groups_epi_df.Rd +++ b/man/sum_groups_epi_df.Rd @@ -14,7 +14,7 @@ expression. Variable names can be used as if they like \code{c(x, y)} were positions in the data frame, and expressions like \code{x:y} can be used to select a range of variables.} -\item{.group_cols}{character vector of column names to group by. "time_value" is +\item{group_cols}{character vector of column names to group by. "time_value" is included by default.} } \value{ @@ -31,6 +31,5 @@ the resulting \code{epi_df} will have \code{geo_value} set to \code{"total"}. grad_employ_subset grad_employ_subset \%>\% - select(geo_value:num_graduates) \%>\% sum_groups_epi_df(num_graduates, group_cols = "geo_value") } From 3ee11aa8131334d9b8d921e18679e18bfa0312ce Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 9 Apr 2025 14:18:44 -0700 Subject: [PATCH 5/7] docs(sum_groups_epi_df): tweak sum_cols roxygen, examples, code comments --- DESCRIPTION | 1 + R/methods-epi_df.R | 18 +++++++++--------- R/slide.R | 7 +------ man/sum_groups_epi_df.Rd | 15 +++++++++------ 4 files changed, 20 insertions(+), 21 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index a700b3e64..1b1573a52 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -104,6 +104,7 @@ Collate: 'methods-epi_archive.R' 'grouped_epi_archive.R' 'growth_rate.R' + 'inline-roxygen.R' 'key_colnames.R' 'methods-epi_df.R' 'outliers.R' diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index fd3d8f299..5642db163 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -499,22 +499,20 @@ group_epi_df <- function(x, exclude = character()) { #' the resulting `epi_df` will have `geo_value` set to `"total"`. #' #' @param .x an `epi_df` -#' @param sum_cols <[`tidy-select`][dplyr::dplyr_tidy_select]> One unquoted -#' expression. Variable names can be used as if they like `c(x, y)` -#' were positions in the data frame, and expressions like `x:y` can -#' be used to select a range of variables. +#' @param sum_cols `r tidyselect_arg_roxygen` #' @param group_cols character vector of column names to group by. "time_value" is #' included by default. #' @return an `epi_df` object #' -#' @export #' @examples -#' # This data has other_keys age_group and edu_qual. -#' # We can aggregate num_graduates within geo_value +#' # This data has other_keys age_group and edu_qual: #' grad_employ_subset -#' +#' +#' # Aggregate num_graduates within each geo_value (and time_value): #' grad_employ_subset %>% #' sum_groups_epi_df(num_graduates, group_cols = "geo_value") +#' +#' @export sum_groups_epi_df <- function(.x, sum_cols, group_cols = "time_value") { assert_class(.x, "epi_df") assert_character(group_cols) @@ -522,8 +520,10 @@ sum_groups_epi_df <- function(.x, sum_cols, group_cols = "time_value") { if (!"time_value" %in% group_cols) { group_cols <- c("time_value", group_cols) } + # Attempt tidyselection ourselves to get "Error in `sum_groups_epi_df()`" + # rather than "in `dplyr::summarize()`", before forwarding: sum_cols <- rlang::enquo(sum_cols) - pos <- tidyselect::eval_select(sum_cols, .x) # trigger this to ensure cols exist + pos <- tidyselect::eval_select(sum_cols, .x) out <- group_by(.x, across(all_of(group_cols))) %>% dplyr::summarize(across(!!sum_cols, sum), .groups = "drop") diff --git a/R/slide.R b/R/slide.R index be000f579..6be54baa6 100644 --- a/R/slide.R +++ b/R/slide.R @@ -557,12 +557,7 @@ get_before_after_from_window <- function(window_size, align, time_type) { #' `vignette("epi_df")` for more examples. #' #' @template basic-slide-params -#' @param .col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column -#' name (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), -#' [other tidy-select expression][tidyselect::language], or a vector of -#' characters (e.g. `c("cases", "deaths")`). Variable names can be used as if -#' they were positions in the data frame, so expressions like `x:y` can be -#' used to select a range of variables. +#' @param .col_names `r tidyselect_arg_roxygen` #' #' The tidy-selection renaming interface is not supported, and cannot be used #' to provide output column names; if you want to customize the output column diff --git a/man/sum_groups_epi_df.Rd b/man/sum_groups_epi_df.Rd index 6b8ed611d..34ec99930 100644 --- a/man/sum_groups_epi_df.Rd +++ b/man/sum_groups_epi_df.Rd @@ -9,10 +9,12 @@ sum_groups_epi_df(.x, sum_cols, group_cols = "time_value") \arguments{ \item{.x}{an \code{epi_df}} -\item{sum_cols}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> One unquoted -expression. Variable names can be used as if they like \code{c(x, y)} -were positions in the data frame, and expressions like \code{x:y} can -be used to select a range of variables.} +\item{sum_cols}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column +name (e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), +\link[tidyselect:language]{other tidy-select expression}, or a vector of +characters (e.g. \code{c("cases", "deaths")}). Variable names can be used as if +they were positions in the data frame, so expressions like \code{x:y} can be +used to select a range of variables.} \item{group_cols}{character vector of column names to group by. "time_value" is included by default.} @@ -26,10 +28,11 @@ Aggregates an \code{epi_df} object by the specified group columns, summing the the resulting \code{epi_df} will have \code{geo_value} set to \code{"total"}. } \examples{ -# This data has other_keys age_group and edu_qual. -# We can aggregate num_graduates within geo_value +# This data has other_keys age_group and edu_qual: grad_employ_subset +# Aggregate num_graduates within each geo_value (and time_value): grad_employ_subset \%>\% sum_groups_epi_df(num_graduates, group_cols = "geo_value") + } From 57a6932590e1b6c5c08b1caa122bda405b8d1c03 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 9 Apr 2025 14:27:01 -0700 Subject: [PATCH 6/7] Add missing files --- R/inline-roxygen.R | 16 ++++++++++++++++ man/tidyselect_arg_roxygen.Rd | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 R/inline-roxygen.R create mode 100644 man/tidyselect_arg_roxygen.Rd diff --git a/R/inline-roxygen.R b/R/inline-roxygen.R new file mode 100644 index 000000000..ae2ce66c1 --- /dev/null +++ b/R/inline-roxygen.R @@ -0,0 +1,16 @@ +# Helpers here are meant to be used inside inline R expressions within roxygen2 +# documentation when @template is inappropriate. + +#' Description of a single arg that tidyselects value variables +#' +#' Not meant for when describing tidyselect `...`. +#' +#' @keywords internal +tidyselect_arg_roxygen <- ' + <[`tidy-select`][dplyr_tidy_select]> An unquoted column + name (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), + [other tidy-select expression][tidyselect::language], or a vector of + characters (e.g. `c("cases", "deaths")`). Variable names can be used as if + they were positions in the data frame, so expressions like `x:y` can be + used to select a range of variables. +' diff --git a/man/tidyselect_arg_roxygen.Rd b/man/tidyselect_arg_roxygen.Rd new file mode 100644 index 000000000..27cb264d9 --- /dev/null +++ b/man/tidyselect_arg_roxygen.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inline-roxygen.R +\docType{data} +\name{tidyselect_arg_roxygen} +\alias{tidyselect_arg_roxygen} +\title{Description of a single arg that tidyselects value variables} +\format{ +An object of class \code{character} of length 1. +} +\usage{ +tidyselect_arg_roxygen +} +\description{ +Not meant for when describing tidyselect \code{...}. +} +\keyword{internal} From cd52992383239a52689055dbce407238d6a7fe1d Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 9 Apr 2025 14:44:18 -0700 Subject: [PATCH 7/7] Lint unused variable --- R/methods-epi_df.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 5642db163..7870dede8 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -523,7 +523,7 @@ sum_groups_epi_df <- function(.x, sum_cols, group_cols = "time_value") { # Attempt tidyselection ourselves to get "Error in `sum_groups_epi_df()`" # rather than "in `dplyr::summarize()`", before forwarding: sum_cols <- rlang::enquo(sum_cols) - pos <- tidyselect::eval_select(sum_cols, .x) + tidyselect::eval_select(sum_cols, .x) out <- group_by(.x, across(all_of(group_cols))) %>% dplyr::summarize(across(!!sum_cols, sum), .groups = "drop")