Skip to content
Merged
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@
^pkgdown$
^musings$
^data-raw$
^vignettes/articles$
34 changes: 17 additions & 17 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,27 @@ Package: epipredict
Title: Basic epidemiology forecasting methods
Version: 0.0.3
Authors@R: c(
person("Daniel", "McDonald", , "daniel@stat.ubc.ca", role = c("aut","cre")),
person("Jacob", "Bien", role = "aut"),
person("Daniel", "McDonald", , "daniel@stat.ubc.ca", role = c("aut", "cre")),
person("Ryan", "Tibshirani", , "ryantibs@cmu.edu", role = "aut"),
person("Logan", "Brooks", role = "aut"),
person("Rachel", "Lobay", role = "ctb"),
person("Ken", "Mawer", role = "ctb"),
person("Chloe", "You", role = "ctb"),
person("Quang", "Nguyen", role = "ctb"),
person("Dmitry", "Shemetov", role = "ctb")
person("Rachel", "Lobay", role = "aut"),
person("Maggie", "Liu", role = "aut"),
person("Ken", "Mawer", role = "aut"),
person("Chloe", "You", role = "aut"),
person("Jacob", "Bien", role = "ctb")
)
Description: A forecasting "framework" for creating epidemiological forecasts
from versioned data. The framework is designed to be modular so that
different components can exchanged, enabling a large amount of flexibility.
Some simple baseline models formed from these components are included.
Description: A forecasting "framework" for creating epidemiological
forecasts from versioned data. The framework is designed to be modular
so that different components can exchanged, enabling a large amount of
flexibility. Some simple baseline models formed from these components
are included.
License: MIT + file LICENSE
URL: https://github.com/cmu-delphi/epipredict/,
https://cmu-delphi.github.io/epipredict
BugReports: https://github.com/cmu-delphi/epipredict/issues/
Depends:
R (>= 3.5.0),
epiprocess
epiprocess,
R (>= 3.5.0)
Imports:
assertthat,
cli,
Expand All @@ -44,14 +44,14 @@ Imports:
vctrs,
workflows (>= 1.0.0)
Suggests:
poissonreg,
covidcast,
data.table,
epidatr,
ggplot2,
knitr,
lubridate,
parsnip (>= 1.0.0),
poissonreg,
ranger,
RcppRoll,
rmarkdown,
Expand All @@ -60,10 +60,10 @@ Suggests:
VignetteBuilder:
knitr
Remotes:
cmu-delphi/epiprocess,
cmu-delphi/epidatr
cmu-delphi/epidatr,
cmu-delphi/epiprocess
Config/testthat/edition: 3
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.1
LazyData: true
70 changes: 45 additions & 25 deletions R/step_population_scaling.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
#' Create a recipe step that scales variables using population data
#' Convert raw scale predictions to per-capita
#'
#' `step_population_scaling` creates a specification of a recipe step
#' that will add a population scaled column in the data. For example,
#' load a dataset that contains county population, and join to an `epi_df`
#' that currently only contains number of new cases by county. Once scaled,
#' predictions can be made on case rate. Although worth noting that there is
#' nothing special about "population". The function can be used to scale by any
#' variable. Population is simply the most natural and common use case.
#' that will perform per-capita scaling. Typical usage would
#' load a dataset that contains state-level population, and use it to convert
#' predictions made from a raw scale model to rate-scale by dividing by
#' the population.
#' Although, it is worth noting that there is nothing special about "population".
#' The function can be used to scale by any variable. Population is the
#' standard use case in the epidemiology forecasting scenario. Any value
#' passed will *divide* the selected variables while the `rate_rescaling`
#' argument is a common *multiplier* of the selected variables.
#'
#' @param recipe A recipe object. The step will be added to the sequence of
#' operations for this recipe. The recipe should contain information about the
Expand All @@ -19,25 +22,30 @@
#' be ard are not limited to "outcome".
#' @param trained A logical to indicate if the quantities for preprocessing
#' have been estimated.
#' @param df a data frame that contains the population data used for scaling.
#' @param by A character vector of variables to left join by.
#' @param df a data frame that contains the population data to be used for
#' inverting the existing scaling.
#' @param by A (possibly named) character vector of variables to join by.
#'
#' If `NULL`, the default, the function will perform a natural join, using all
#' variables in common across the `epi_df` and the user-provided dataset.
#' If columns in `epi_df` and `df` have the same name (and aren't
#' included in by), `.df` is added to the one from the user-provided data
#' variables in common across the `epi_df` produced by the `predict()` call
#' and the user-provided dataset.
#' If columns in that `epi_df` and `df` have the same name (and aren't
#' included in `by`), `.df` is added to the one from the user-provided data
#' to disambiguate.
#'
#' To join by different variables on the `epi_df` and `df`, use a named vector.
#' For example, by = c("geo_value" = "states") will match `epi_df$geo_value`
#' For example, `by = c("geo_value" = "states")` will match `epi_df$geo_value`
#' to `df$states`. To join by multiple variables, use a vector with length > 1.
#' For example, by = c("geo_value" = "states", "county" = "county") will match
#' For example, `by = c("geo_value" = "states", "county" = "county")` will match
#' `epi_df$geo_value` to `df$states` and `epi_df$county` to `df$county`.
#'
#' See [dplyr::left_join()] for more details.
#' @param df_pop_col the name of the column in the data frame `df` that
#' contains the population data and will be used for scaling.
#' This should be one column.
#' @param rate_rescaling Sometimes raw scales are "per 100K" or "per 1M".
#' Adjustments can be made here. For example, if the original
#' scale is "per 100K", then set `rate_rescaling = 1e5` to get rates.
#' @param create_new TRUE to create a new column and keep the original column
#' in the `epi_df`
#' @param suffix a character. The suffix added to the column name if
Expand All @@ -61,8 +69,7 @@
#' dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%
#' dplyr::select(geo_value, time_value, cases)
#'
#' pop_data = data.frame(states = c("ca", "ny"),
#' value = c(20000, 30000))
#' pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000))
#'
#' r <- epi_recipe(jhu) %>%
#' step_population_scaling(df = pop_data,
Expand All @@ -86,11 +93,12 @@
#' parsnip::fit(jhu) %>%
#' add_frosting(f)
#'
#' latest <- get_test_data(recipe = r,
#' x = epiprocess::jhu_csse_daily_subset %>%
#' dplyr::filter(time_value > "2021-11-01",
#' geo_value %in% c("ca", "ny")) %>%
#' dplyr::select(geo_value, time_value, cases))
#' latest <- get_test_data(
#' recipe = r,
#' x = epiprocess::jhu_csse_daily_subset %>%
#' dplyr::filter(time_value > "2021-11-01",
#' geo_value %in% c("ca", "ny")) %>%
#' dplyr::select(geo_value, time_value, cases))
#'
#'
#' predict(wf, latest)
Expand All @@ -102,11 +110,19 @@ step_population_scaling <-
df,
by = NULL,
df_pop_col,
rate_rescaling = 1,
create_new = TRUE,
suffix = "_scaled",
columns = NULL,
skip = FALSE,
id = rand_id("population_scaling")){
arg_is_scalar(role, trained, df_pop_col, rate_rescaling, create_new, suffix, id)
arg_is_lgl(create_new, skip)
arg_is_chr(df_pop_col, suffix, id)
arg_is_chr(by, columns, allow_null = TRUE)
if (rate_rescaling <= 0)
cli_stop("`rate_rescaling` should be a positive number")

add_step(
recipe,
step_population_scaling_new(
Expand All @@ -116,6 +132,7 @@ step_population_scaling <-
df = df,
by = by,
df_pop_col = df_pop_col,
rate_rescaling = rate_rescaling,
create_new = create_new,
suffix = suffix,
columns = columns,
Expand All @@ -126,7 +143,7 @@ step_population_scaling <-
}

step_population_scaling_new <-
function(role, trained, df, by, df_pop_col, terms, create_new,
function(role, trained, df, by, df_pop_col, rate_rescaling, terms, create_new,
suffix, columns, skip, id) {
step(
subclass = "population_scaling",
Expand All @@ -136,6 +153,7 @@ step_population_scaling_new <-
df = df,
by = by,
df_pop_col = df_pop_col,
rate_rescaling = rate_rescaling,
create_new = create_new,
suffix = suffix,
columns = columns,
Expand All @@ -153,6 +171,7 @@ prep.step_population_scaling <- function(x, training, info = NULL, ...) {
df = x$df,
by = x$by,
df_pop_col = x$df_pop_col,
rate_rescaling = x$rate_rescaling,
create_new = x$create_new,
suffix = x$suffix,
columns = recipes_eval_select(x$terms, training, info),
Expand All @@ -172,8 +191,9 @@ bake.step_population_scaling <- function(object,
try_join <- try(dplyr::left_join(new_data, object$df,
by= object$by),
silent = TRUE)
if (any(grepl("Join columns must be present in data", unlist(try_join)))){
stop("columns in `by` selectors of `step_population_scaling` must be present in data and match")}
if (any(grepl("Join columns must be present in data", unlist(try_join)))) {
cli_stop(c("columns in `by` selectors of `step_population_scaling` ",
"must be present in data and match"))}

if(object$suffix != "_scaled" && object$create_new == FALSE){
message("`suffix` not used to generate new column in `step_population_scaling`")
Expand All @@ -194,7 +214,7 @@ bake.step_population_scaling <- function(object,
dplyr::mutate(
dplyr::across(
dplyr::all_of(object$columns),
~.x/!!pop_col ,
~.x * object$rate_rescaling /!!pop_col ,
.names = "{.col}{suffix}")) %>%
# removed so the models do not use the population column
dplyr::select(- !!pop_col)
Expand Down
52 changes: 31 additions & 21 deletions man/step_population_scaling.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading