Skip to content

Commit

Permalink
Merge pull request #386 from cmu-delphi/ndefries/speedups
Browse files Browse the repository at this point in the history
Speedups for `epix_slide`
  • Loading branch information
nmdefries authored Dec 14, 2023
2 parents 31f6319 + c6863ea commit b444a3c
Show file tree
Hide file tree
Showing 8 changed files with 81 additions and 22 deletions.
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,9 @@ importFrom(data.table,as.data.table)
importFrom(data.table,between)
importFrom(data.table,copy)
importFrom(data.table,key)
importFrom(data.table,rbindlist)
importFrom(data.table,set)
importFrom(data.table,setDF)
importFrom(data.table,setkeyv)
importFrom(dplyr,arrange)
importFrom(dplyr,bind_rows)
Expand Down Expand Up @@ -114,6 +116,8 @@ importFrom(rlang,syms)
importFrom(stats,cor)
importFrom(stats,median)
importFrom(tibble,as_tibble)
importFrom(tibble,new_tibble)
importFrom(tibble,validate_tibble)
importFrom(tidyr,unnest)
importFrom(tidyselect,eval_select)
importFrom(tidyselect,starts_with)
Expand Down
5 changes: 4 additions & 1 deletion R/archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -495,7 +495,10 @@ epi_archive =
version <= max_version, ] %>%
unique(by = c("geo_value", "time_value", other_keys),
fromLast = TRUE) %>%
tibble::as_tibble() %>%
tibble::as_tibble() %>%
# (`as_tibble` should de-alias the DT and its columns in any edge
# cases where they are aliased. We don't say we guarantee this
# though.)
dplyr::select(-"version") %>%
as_epi_df(geo_type = self$geo_type,
time_type = self$time_type,
Expand Down
8 changes: 7 additions & 1 deletion R/epi_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,13 @@ new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of,

# Reorder columns (geo_value, time_value, ...)
if(sum(dim(x)) != 0){
x = dplyr::relocate(x, "geo_value", "time_value")
cols_to_put_first <- c("geo_value", "time_value")
x <- x[, c(
cols_to_put_first,
# All other columns
names(x)[!(names(x) %in% cols_to_put_first)]
)
]
}

# Apply epi_df class, attach metadata, and return
Expand Down
45 changes: 31 additions & 14 deletions R/grouped_epi_archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,9 @@ grouped_epi_archive =
#' @description Slides a given function over variables in a `grouped_epi_archive`
#' object. See the documentation for the wrapper function [`epix_slide()`] for
#' details.
#' @importFrom data.table key address
#' @importFrom data.table key address rbindlist setDF
#' @importFrom tibble as_tibble new_tibble validate_tibble
#' @importFrom dplyr group_by groups
#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms
#' env missing_arg
slide = function(f, ..., before, ref_time_values,
Expand Down Expand Up @@ -280,16 +282,19 @@ grouped_epi_archive =
if (! (is.atomic(comp_value) || is.data.frame(comp_value))) {
Abort("The slide computation must return an atomic vector or a data frame.")
}

# Label every result row with the `ref_time_value`
res <- list(time_value = ref_time_value)

# Wrap the computation output in a list and unchop/unnest later if
# `as_list_col = FALSE`. This approach means that we will get a
# list-class col rather than a data.frame-class col when
# `as_list_col = TRUE` and the computations outputs are data
# frames.
comp_value <- list(comp_value)

# Label every result row with the `ref_time_value`:
return(tibble::tibble(time_value = .env$ref_time_value,
!!new_col := .env$comp_value))
res[[new_col]] <- list(comp_value)

# Convert the list to a tibble all at once for speed.
return(validate_tibble(new_tibble(res)))
}

# If `f` is missing, interpret ... as an expression for tidy evaluation
Expand All @@ -308,7 +313,7 @@ grouped_epi_archive =
}

f = as_slide_computation(f, ...)
x = purrr::map_dfr(ref_time_values, function(ref_time_value) {
x = lapply(ref_time_values, function(ref_time_value) {
# Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`,
# `epi_archive` if `all_versions` is `TRUE`:
as_of_raw = private$ungrouped$as_of(ref_time_value, min_time_value = ref_time_value - before, all_versions = all_versions)
Expand All @@ -331,6 +336,13 @@ grouped_epi_archive =
# copies.
if (address(as_of_archive$DT) == address(private$ungrouped$DT)) {
# `as_of` aliased its the full `$DT`; copy before mutating:
#
# Note: this step is probably unneeded; we're fine with
# aliasing of the DT or its columns: vanilla operations aren't
# going to mutate them in-place if they are aliases, and we're
# not performing mutation (unlike the situation with
# `fill_through_version` where we do mutate a `DT` and don't
# want aliasing).
as_of_archive$DT <- copy(as_of_archive$DT)
}
dt_key = data.table::key(as_of_archive$DT)
Expand All @@ -357,15 +369,20 @@ grouped_epi_archive =
}

return(
dplyr::group_by(as_of_df, dplyr::across(tidyselect::all_of(private$vars)),
.drop=private$drop) %>%
dplyr::group_modify(group_modify_fn,
f = f, ...,
ref_time_value = ref_time_value,
new_col = new_col,
.keep = TRUE)
dplyr::group_modify(
dplyr::group_by(as_of_df, !!!syms(private$vars), .drop=private$drop),
group_modify_fn,
f = f, ...,
ref_time_value = ref_time_value,
new_col = new_col,
.keep = TRUE
)
)
})
# Combine output into a single tibble
x <- as_tibble(setDF(rbindlist(x)))
# Reconstruct groups
x <- group_by(x, !!!syms(private$vars), .drop=private$drop)

# Unchop/unnest if we need to
if (!as_list_col) {
Expand Down
19 changes: 17 additions & 2 deletions R/methods-epi_archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,18 @@
#' x$as_of(max_version = v)
#' ```
#'
#' @export
#' Mutation and aliasing: `epix_as_of` and `$as_of` will not mutate the input
#' archives, but may in some edge cases alias parts of the inputs, so copy the
#' outputs if needed before using mutating operations like `data.table`'s `:=`
#' operator. Currently, the only situation where there is potentially aliasing
#' is of the `DT` in edge cases with `all_versions = TRUE`, but this may change
#' in the future.
#'
#' @examples
#' # warning message of data latency shown
#' epix_as_of(x = archive_cases_dv_subset,
#' max_version = max(archive_cases_dv_subset$DT$version))
#'
#' @export
#' @examples
#'
#' range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01
Expand All @@ -60,6 +65,8 @@
#' }, epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning"))
#' # Since R 4.0, there is a `globalCallingHandlers` function that can be used
#' # to globally toggle these warnings.
#'
#' @export
epix_as_of = function(x, max_version, min_time_value = -Inf, all_versions = FALSE) {
if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.")
return(x$as_of(max_version, min_time_value, all_versions = all_versions))
Expand Down Expand Up @@ -798,6 +805,14 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr
#' x$slide(new_var = comp(old_var), before = 119)
#' ```
#'
#' Mutation and aliasing: `epix_slide` and `$slide` will not perform in-place
#' mutation of the input archives on their own. In some edge cases the inputs it
#' feeds to the slide computations may alias parts of the input archive, so copy
#' the slide computation inputs if needed before using mutating operations like
#' `data.table`'s `:=` operator. Similarly, in some edge cases, the output of
#' the slide operation may alias parts of the input archive, so similarly, make
#' sure to clone and/or copy appropriately before using in-place mutation.
#'
#' @examples
#' library(dplyr)
#'
Expand Down
6 changes: 2 additions & 4 deletions R/slide.R
Original file line number Diff line number Diff line change
Expand Up @@ -300,8 +300,7 @@ epi_slide = function(x, f, ..., before, after, ref_time_values,
# Count the number of appearances of each reference time value (these
# appearances should all be real for now, but if we allow ref time values
# outside of .data_group's time values):
counts = .data_group %>%
dplyr::filter(.data$time_value %in% time_values) %>%
counts = dplyr::filter(.data_group, .data$time_value %in% time_values) %>%
dplyr::count(.data$time_value) %>%
dplyr::pull(n)

Expand Down Expand Up @@ -375,8 +374,7 @@ epi_slide = function(x, f, ..., before, after, ref_time_values,
.x$.real <- NULL
f(.x, .group_key, .ref_time_value, ...)
}
x = x %>%
group_modify(slide_one_grp,
x = group_modify(x, slide_one_grp,
f = f_wrapper, ...,
starts = starts,
stops = stops,
Expand Down
8 changes: 8 additions & 0 deletions man/epix_as_of.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions man/epix_slide.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit b444a3c

Please sign in to comment.