Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speedups for epix_slide #386

Merged
merged 19 commits into from
Dec 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,9 @@ importFrom(data.table,as.data.table)
importFrom(data.table,between)
importFrom(data.table,copy)
importFrom(data.table,key)
importFrom(data.table,rbindlist)
importFrom(data.table,set)
importFrom(data.table,setDF)
importFrom(data.table,setkeyv)
importFrom(dplyr,arrange)
importFrom(dplyr,bind_rows)
Expand Down Expand Up @@ -114,6 +116,8 @@ importFrom(rlang,syms)
importFrom(stats,cor)
importFrom(stats,median)
importFrom(tibble,as_tibble)
importFrom(tibble,new_tibble)
importFrom(tibble,validate_tibble)
importFrom(tidyr,unnest)
importFrom(tidyselect,eval_select)
importFrom(tidyselect,starts_with)
Expand Down
5 changes: 4 additions & 1 deletion R/archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -495,7 +495,10 @@ epi_archive =
version <= max_version, ] %>%
unique(by = c("geo_value", "time_value", other_keys),
fromLast = TRUE) %>%
tibble::as_tibble() %>%
tibble::as_tibble() %>%
# (`as_tibble` should de-alias the DT and its columns in any edge
# cases where they are aliased. We don't say we guarantee this
# though.)
dplyr::select(-"version") %>%
as_epi_df(geo_type = self$geo_type,
time_type = self$time_type,
Expand Down
8 changes: 7 additions & 1 deletion R/epi_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,13 @@ new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of,

# Reorder columns (geo_value, time_value, ...)
if(sum(dim(x)) != 0){
x = dplyr::relocate(x, "geo_value", "time_value")
cols_to_put_first <- c("geo_value", "time_value")
x <- x[, c(
cols_to_put_first,
# All other columns
names(x)[!(names(x) %in% cols_to_put_first)]
)
]
}

# Apply epi_df class, attach metadata, and return
Expand Down
45 changes: 31 additions & 14 deletions R/grouped_epi_archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,9 @@ grouped_epi_archive =
#' @description Slides a given function over variables in a `grouped_epi_archive`
#' object. See the documentation for the wrapper function [`epix_slide()`] for
#' details.
#' @importFrom data.table key address
#' @importFrom data.table key address rbindlist setDF
#' @importFrom tibble as_tibble new_tibble validate_tibble
#' @importFrom dplyr group_by groups
#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms
#' env missing_arg
slide = function(f, ..., before, ref_time_values,
Expand Down Expand Up @@ -280,16 +282,19 @@ grouped_epi_archive =
if (! (is.atomic(comp_value) || is.data.frame(comp_value))) {
Abort("The slide computation must return an atomic vector or a data frame.")
}

# Label every result row with the `ref_time_value`
res <- list(time_value = ref_time_value)

# Wrap the computation output in a list and unchop/unnest later if
# `as_list_col = FALSE`. This approach means that we will get a
# list-class col rather than a data.frame-class col when
# `as_list_col = TRUE` and the computations outputs are data
# frames.
comp_value <- list(comp_value)

# Label every result row with the `ref_time_value`:
return(tibble::tibble(time_value = .env$ref_time_value,
!!new_col := .env$comp_value))
res[[new_col]] <- list(comp_value)

# Convert the list to a tibble all at once for speed.
return(validate_tibble(new_tibble(res)))
}

# If `f` is missing, interpret ... as an expression for tidy evaluation
Expand All @@ -308,7 +313,7 @@ grouped_epi_archive =
}

f = as_slide_computation(f, ...)
x = purrr::map_dfr(ref_time_values, function(ref_time_value) {
x = lapply(ref_time_values, function(ref_time_value) {
# Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`,
# `epi_archive` if `all_versions` is `TRUE`:
as_of_raw = private$ungrouped$as_of(ref_time_value, min_time_value = ref_time_value - before, all_versions = all_versions)
Expand All @@ -331,6 +336,13 @@ grouped_epi_archive =
# copies.
if (address(as_of_archive$DT) == address(private$ungrouped$DT)) {
# `as_of` aliased its the full `$DT`; copy before mutating:
#
# Note: this step is probably unneeded; we're fine with
# aliasing of the DT or its columns: vanilla operations aren't
# going to mutate them in-place if they are aliases, and we're
# not performing mutation (unlike the situation with
# `fill_through_version` where we do mutate a `DT` and don't
# want aliasing).
as_of_archive$DT <- copy(as_of_archive$DT)
}
dt_key = data.table::key(as_of_archive$DT)
Expand All @@ -357,15 +369,20 @@ grouped_epi_archive =
}

return(
dplyr::group_by(as_of_df, dplyr::across(tidyselect::all_of(private$vars)),
.drop=private$drop) %>%
dplyr::group_modify(group_modify_fn,
f = f, ...,
ref_time_value = ref_time_value,
new_col = new_col,
.keep = TRUE)
dplyr::group_modify(
dplyr::group_by(as_of_df, !!!syms(private$vars), .drop=private$drop),
group_modify_fn,
f = f, ...,
ref_time_value = ref_time_value,
new_col = new_col,
.keep = TRUE
)
)
})
# Combine output into a single tibble
x <- as_tibble(setDF(rbindlist(x)))
# Reconstruct groups
x <- group_by(x, !!!syms(private$vars), .drop=private$drop)

# Unchop/unnest if we need to
if (!as_list_col) {
Expand Down
19 changes: 17 additions & 2 deletions R/methods-epi_archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,18 @@
#' x$as_of(max_version = v)
#' ```
#'
#' @export
#' Mutation and aliasing: `epix_as_of` and `$as_of` will not mutate the input
#' archives, but may in some edge cases alias parts of the inputs, so copy the
#' outputs if needed before using mutating operations like `data.table`'s `:=`
#' operator. Currently, the only situation where there is potentially aliasing
#' is of the `DT` in edge cases with `all_versions = TRUE`, but this may change
#' in the future.
#'
#' @examples
#' # warning message of data latency shown
#' epix_as_of(x = archive_cases_dv_subset,
#' max_version = max(archive_cases_dv_subset$DT$version))
#'
#' @export
#' @examples
#'
#' range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01
Expand All @@ -60,6 +65,8 @@
#' }, epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning"))
#' # Since R 4.0, there is a `globalCallingHandlers` function that can be used
#' # to globally toggle these warnings.
#'
#' @export
epix_as_of = function(x, max_version, min_time_value = -Inf, all_versions = FALSE) {
if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.")
return(x$as_of(max_version, min_time_value, all_versions = all_versions))
Expand Down Expand Up @@ -798,6 +805,14 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr
#' x$slide(new_var = comp(old_var), before = 119)
#' ```
#'
#' Mutation and aliasing: `epix_slide` and `$slide` will not perform in-place
#' mutation of the input archives on their own. In some edge cases the inputs it
#' feeds to the slide computations may alias parts of the input archive, so copy
#' the slide computation inputs if needed before using mutating operations like
#' `data.table`'s `:=` operator. Similarly, in some edge cases, the output of
#' the slide operation may alias parts of the input archive, so similarly, make
#' sure to clone and/or copy appropriately before using in-place mutation.
#'
#' @examples
#' library(dplyr)
#'
Expand Down
6 changes: 2 additions & 4 deletions R/slide.R
Original file line number Diff line number Diff line change
Expand Up @@ -300,8 +300,7 @@ epi_slide = function(x, f, ..., before, after, ref_time_values,
# Count the number of appearances of each reference time value (these
# appearances should all be real for now, but if we allow ref time values
# outside of .data_group's time values):
counts = .data_group %>%
dplyr::filter(.data$time_value %in% time_values) %>%
counts = dplyr::filter(.data_group, .data$time_value %in% time_values) %>%
dplyr::count(.data$time_value) %>%
dplyr::pull(n)

Expand Down Expand Up @@ -375,8 +374,7 @@ epi_slide = function(x, f, ..., before, after, ref_time_values,
.x$.real <- NULL
f(.x, .group_key, .ref_time_value, ...)
}
x = x %>%
group_modify(slide_one_grp,
x = group_modify(x, slide_one_grp,
f = f_wrapper, ...,
starts = starts,
stops = stops,
Expand Down
8 changes: 8 additions & 0 deletions man/epix_as_of.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions man/epix_slide.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading