diff --git a/DESCRIPTION b/DESCRIPTION
index af0a92d5e..9056c3f9a 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -44,6 +44,7 @@ Imports:
     vctrs,
     workflows (>= 1.0.0)
 Suggests:
+    poissonreg,
     covidcast,
    data.table,
     epidatr,
@@ -64,5 +65,5 @@ Remotes:
 Config/testthat/edition: 3
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.0
+RoxygenNote: 7.2.1
 LazyData: true
diff --git a/R/data.R b/R/data.R
index 9c93506a3..622d1c7d7 100644
--- a/R/data.R
+++ b/R/data.R
@@ -33,3 +33,26 @@
 #' June 7 is the average of the underlying data for June 1 through 7,
 #' inclusive.
 "case_death_rate_subset"
+
+#' State population data
+#'
+#' Data set on state populations, from the 2019 US Census.
+#'
+#' @format Data frame with 57 rows (including one for the United States as a
+#' whole, plus the District of Columbia, Puerto Rico Commonwealth,
+#' American Samoa, Guam, the U.S. Virgin Islands, and the Northern Mariana
+#' Islands).
+#'
+#' \describe{
+#' \item{fips}{FIPS code}
+#' \item{name}{Full name of the state or territory}
+#' \item{pop}{Estimate of the location's resident population in
+#' 2019.}
+#' \item{abbr}{Postal abbreviation for the location}
+#' }
+#'
+#' @source United States Census Bureau, at
+#' \url{https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.pdf},
+#' \url{https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-puerto-rico-municipios.html},
+#' and \url{https://www.census.gov/data/tables/2010/dec/2010-island-areas.html}
+"state_census"
diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R
index 8e219a9e4..b5bc57ee3 100644
--- a/R/layer_population_scaling.R
+++ b/R/layer_population_scaling.R
@@ -1,35 +1,43 @@
-#' Revert population scaled prediction
+#' Convert per-capita predictions to raw scale
 #'
 #' `layer_population_scaling` creates a specification of a frosting layer
-#' that will add a population scaled column in the data. For example,
-#' load a dataset that contains county population, and join to an `epi_df`
-#' that currently predicts number of new cases by county to obtain case rates.
-#' Although worth noting that there is nothing special about "population".
-#' The function can be used to scale by any variable. Population is simply the
-#' most natural and common use case.
+#' that will "undo" per-capita scaling. Typical usage would
+#' load a dataset that contains state-level population, and use it to convert
+#' predictions made from a rate-scale model to raw scale by multiplying by
+#' the population.
+#' It is worth noting that there is nothing special about "population".
+#' The function can be used to scale by any variable. Population is simply the
+#' standard use case in epidemiological forecasting. Any value
+#' passed will *multiply* the selected variables, while the `rate_rescaling`
+#' argument is a common *divisor* of the selected variables.
 #'
 #' @param frosting a `frosting` postprocessor. The layer will be added to the
-#'   sequence of operations for this frosting.
+#' sequence of operations for this frosting.
 #' @param ... One or more selector functions to scale variables
-#'   for this step. See [selections()] for more details.
-#' @param df a data frame that contains the population data used for scaling.
-#' @param by A character vector of variables to left join by.
+#' for this step. See [selections()] for more details.
+#' @param df a data frame that contains the population data to be used for
+#' inverting the existing scaling.
+#' @param by A (possibly named) character vector of variables to join by.
 #'
 #' If `NULL`, the default, the function will perform a natural join, using all
-#' variables in common across the `epi_df` and the user-provided dataset.
-#' If columns in `epi_df` and `df` have the same name (and aren't
-#' included in by), `.df` is added to the one from the user-provided data
+#' variables in common across the `epi_df` produced by the `predict()` call
+#' and the user-provided dataset.
+#' If columns in that `epi_df` and `df` have the same name (and aren't
+#' included in `by`), `.df` is added to the one from the user-provided data
 #' to disambiguate.
 #'
 #' To join by different variables on the `epi_df` and `df`, use a named vector.
-#' For example, by = c("geo_value" = "states") will match `epi_df$geo_value`
+#' For example, `by = c("geo_value" = "states")` will match `epi_df$geo_value`
 #' to `df$states`. To join by multiple variables, use a vector with length > 1.
-#' For example, by = c("geo_value" = "states", "county" = "county") will match
+#' For example, `by = c("geo_value" = "states", "county" = "county")` will match
 #' `epi_df$geo_value` to `df$states` and `epi_df$county` to `df$county`.
 #'
 #' See [dplyr::left_join()] for more details.
 #' @param df_pop_col the name of the column in the data frame `df` that
 #' contains the population data and used for scaling.
+#' @param rate_rescaling Sometimes rates are "per 100K" or "per 1M" rather than
+#' "per person". Adjustments can be made here. For example, if the original
+#' rate is "per 100K", then set `rate_rescaling = 1e5` to get counts back.
 #' @param create_new TRUE to create a new column and keep the original column
 #' in the `epi_df`.
 #' @param suffix a character. The suffix added to the column name if
@@ -46,8 +54,7 @@
 #' dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%
 #' dplyr::select(geo_value, time_value, cases)
 #'
-#' pop_data = data.frame(states = c("ca", "ny"),
-#'                       value = c(20000, 30000))
+#' pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000))
 #'
 #' r <- epi_recipe(jhu) %>%
 #'   step_population_scaling(df = pop_data,
@@ -71,12 +78,12 @@
 #'   parsnip::fit(jhu) %>%
 #'   add_frosting(f)
 #'
-#' latest <- get_test_data(recipe = r,
-#'                         x = epiprocess::jhu_csse_daily_subset %>%
-#'                           dplyr::filter(time_value > "2021-11-01",
-#'                                         geo_value %in% c("ca", "ny")) %>%
-#'                           dplyr::select(geo_value, time_value, cases))
-#'
+#' latest <- get_test_data(
+#'   recipe = r,
+#'   x = epiprocess::jhu_csse_daily_subset %>%
+#'     dplyr::filter(time_value > "2021-11-01",
+#'                   geo_value %in% c("ca", "ny")) %>%
+#'     dplyr::select(geo_value, time_value, cases))
 #'
 #' predict(wf, latest)
 layer_population_scaling <- function(frosting,
@@ -84,17 +91,26 @@ layer_population_scaling <- function(frosting,
                                      df,
                                      by = NULL,
                                      df_pop_col,
+                                     rate_rescaling = 1,
                                      create_new = TRUE,
-                                     suffix = "_original",
+                                     suffix = "_scaled",
                                      .flag = TRUE,
                                      id = rand_id("population_scaling")) {
+  arg_is_scalar(df_pop_col, rate_rescaling, create_new, suffix, .flag, id)
+  arg_is_lgl(create_new, .flag)
+  arg_is_chr(df_pop_col, suffix, id)
+  arg_is_chr(by, allow_null = TRUE)
+  if (rate_rescaling <= 0)
+    cli_stop("`rate_rescaling` should be a positive number")
+
   add_layer(
     frosting,
     layer_population_scaling_new(
       df = df,
       by = by,
       df_pop_col = df_pop_col,
+      rate_rescaling = rate_rescaling,
       terms = dplyr::enquos(...),
       create_new = create_new,
       suffix = suffix,
@@ -105,11 +121,12 @@ layer_population_scaling <- function(frosting,
 }
 
 layer_population_scaling_new <-
-  function(df, by, df_pop_col, terms, create_new, suffix, id) {
+  function(df, by, df_pop_col, rate_rescaling, terms, create_new, suffix, id) {
     layer("population_scaling",
           df = df,
          by = by,
           df_pop_col = df_pop_col,
+          rate_rescaling = rate_rescaling,
           terms = terms,
           create_new = create_new,
           suffix = suffix,
@@ -125,8 +142,9 @@ slather.layer_population_scaling <-
   try_join <- try(dplyr::left_join(components$predictions, object$df,
                                    by= object$by),
                   silent = TRUE)
-  if (any(grepl("Join columns must be present in data", unlist(try_join)))){
-    stop("columns in `by` selectors of `layer_population_scaling` must be present in data and match")}
+  if (any(grepl("Join columns must be present in data", unlist(try_join)))) {
+    cli_stop(c("columns in `by` selectors of `layer_population_scaling` ",
+               "must be present in data and match"))}
 
   object$df <- object$df %>%
     dplyr::mutate(dplyr::across(where(is.character), tolower))
@@ -136,13 +154,12 @@
   col_names <- names(pos)
   suffix = ifelse(object$create_new, object$suffix, "")
 
-
   components$predictions <- dplyr::left_join(components$predictions,
                                              object$df,
                                              by= object$by,
                                              suffix = c("", ".df")) %>%
     dplyr::mutate(dplyr::across(dplyr::all_of(col_names),
-                                ~.x * !!pop_col ,
+                                ~.x * !!pop_col / object$rate_rescaling,
                                 .names = "{.col}{suffix}")) %>%
     dplyr::select(- !!pop_col)
   components
diff --git a/R/utils_arg.R b/R/utils_arg.R
index ebac387d5..6f762382a 100644
--- a/R/utils_arg.R
+++ b/R/utils_arg.R
@@ -108,7 +108,7 @@ arg_is_chr = function(..., allow_null = FALSE, allow_na = FALSE, allow_empty = F
   handle_arg_list(
     ...,
     tests = function(name, value) {
-      if (!(is.character(value) | (is.null(value) & !allow_null)))
+      if (!(is.character(value) | (is.null(value) & allow_null)))
         cli_stop("Argument {.val {name}} must be of character type.")
 
       if (any(is.na(value)) & !allow_na)
diff --git a/data-raw/state_census.R b/data-raw/state_census.R
new file mode 100644
index 000000000..22dde5a41
--- /dev/null
+++ b/data-raw/state_census.R
@@ -0,0 +1,11 @@
+library(dplyr)
+library(tidyr)
+
+state_census <- covidcast::state_census %>%
+  select(STATE, NAME, POPESTIMATE2019, ABBR) %>%
+  rename(abbr = ABBR, name = NAME, pop = POPESTIMATE2019, fips = STATE) %>%
+  mutate(abbr = tolower(abbr)) %>%
+  as_tibble()
+
+
+usethis::use_data(state_census, overwrite = TRUE)
diff --git a/data/state_census.rda b/data/state_census.rda
new file mode 100644
index 000000000..1118db0d0
Binary files /dev/null and b/data/state_census.rda differ
diff --git a/man/create_layer.Rd b/man/create_layer.Rd
index 7917e8854..81f5e33b0 100644
--- a/man/create_layer.Rd
+++ b/man/create_layer.Rd
@@ -9,9 +9,9 @@ create_layer(name = NULL, open = rlang::is_interactive())
 \arguments{
 \item{name}{Either a name without extension, or \code{NULL} to create
 the paired file based on currently open file in the script editor. If
-the R file is open, \code{use_test()} will create/open the corresponding
+the \verb{R/} file is open, \code{use_test()} will create/open the corresponding
 test file; if the test file is open, \code{use_r()} will create/open the
-corresponding R file.}
+corresponding \verb{R/} file.}
 
 \item{open}{Whether to open the file for interactive editing.}
 }
diff --git a/man/layer_population_scaling.Rd b/man/layer_population_scaling.Rd
index 9d59e3f08..8ac350870 100644
--- a/man/layer_population_scaling.Rd
+++ b/man/layer_population_scaling.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/layer_population_scaling.R
 \name{layer_population_scaling}
 \alias{layer_population_scaling}
-\title{Revert population scaled prediction}
+\title{Convert per-capita predictions to raw scale}
 \usage{
 layer_population_scaling(
   frosting,
@@ -10,33 +10,36 @@ layer_population_scaling(
   df,
   by = NULL,
   df_pop_col,
+  rate_rescaling = 1,
   create_new = TRUE,
-  suffix = "_original",
+  suffix = "_scaled",
   .flag = TRUE,
   id = rand_id("population_scaling")
 )
 }
 \arguments{
 \item{frosting}{a \code{frosting} postprocessor. The layer will be added to the
-sequence of operations for this frosting.}
+sequence of operations for this frosting.}
 
 \item{...}{One or more selector functions to scale variables
 for this step. See \code{\link[=selections]{selections()}} for more details.}
 
-\item{df}{a data frame that contains the population data used for scaling.}
+\item{df}{a data frame that contains the population data to be used for
+inverting the existing scaling.}
 
-\item{by}{A character vector of variables to left join by.
+\item{by}{A (possibly named) character vector of variables to join by.
 
 If \code{NULL}, the default, the function will perform a natural join, using all
-variables in common across the \code{epi_df} and the user-provided dataset.
-If columns in \code{epi_df} and \code{df} have the same name (and aren't
-included in by), \code{.df} is added to the one from the user-provided data
+variables in common across the \code{epi_df} produced by the \code{predict()} call
+and the user-provided dataset.
+If columns in that \code{epi_df} and \code{df} have the same name (and aren't
+included in \code{by}), \code{.df} is added to the one from the user-provided data
 to disambiguate.
 
 To join by different variables on the \code{epi_df} and \code{df}, use a named vector.
-For example, by = c("geo_value" = "states") will match \code{epi_df$geo_value}
+For example, \code{by = c("geo_value" = "states")} will match \code{epi_df$geo_value}
 to \code{df$states}. To join by multiple variables, use a vector with length > 1.
-For example, by = c("geo_value" = "states", "county" = "county") will match
+For example, \code{by = c("geo_value" = "states", "county" = "county")} will match
 \code{epi_df$geo_value} to \code{df$states} and \code{epi_df$county} to \code{df$county}.
 
 See \code{\link[dplyr:mutate-joins]{dplyr::left_join()}} for more details.}
@@ -44,6 +47,10 @@ See \code{\link[dplyr:mutate-joins]{dplyr::left_join()}} for more details.}
 \item{df_pop_col}{the name of the column in the data frame \code{df} that
 contains the population data and used for scaling.}
 
+\item{rate_rescaling}{Sometimes rates are "per 100K" or "per 1M" rather than
+"per person". Adjustments can be made here.
+For example, if the original
+rate is "per 100K", then set \code{rate_rescaling = 1e5} to get counts back.}
+
 \item{create_new}{TRUE to create a new column and keep the original column
 in the \code{epi_df}.}
 
@@ -60,12 +67,15 @@ an updated \code{frosting} postprocessor
 }
 \description{
 \code{layer_population_scaling} creates a specification of a frosting layer
-that will add a population scaled column in the data. For example,
-load a dataset that contains county population, and join to an \code{epi_df}
-that currently predicts number of new cases by county to obtain case rates.
-Although worth noting that there is nothing special about "population".
-The function can be used to scale by any variable. Population is simply the
-most natural and common use case.
+that will "undo" per-capita scaling. Typical usage would
+load a dataset that contains state-level population, and use it to convert
+predictions made from a rate-scale model to raw scale by multiplying by
+the population.
+It is worth noting that there is nothing special about "population".
+The function can be used to scale by any variable. Population is simply the
+standard use case in epidemiological forecasting. Any value
+passed will \emph{multiply} the selected variables, while the \code{rate_rescaling}
+argument is a common \emph{divisor} of the selected variables.
 }
 \examples{
 library(epiprocess)
 jhu <- epiprocess::jhu_csse_daily_subset \%>\%
   dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\%
   dplyr::select(geo_value, time_value, cases)
 
-pop_data = data.frame(states = c("ca", "ny"),
-                      value = c(20000, 30000))
+pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000))
 
 r <- epi_recipe(jhu) \%>\%
   step_population_scaling(df = pop_data,
@@ -98,12 +107,12 @@ wf <- epi_workflow(r,
   parsnip::fit(jhu) \%>\%
   add_frosting(f)
 
-latest <- get_test_data(recipe = r,
-                        x = epiprocess::jhu_csse_daily_subset \%>\%
-                          dplyr::filter(time_value > "2021-11-01",
-                                        geo_value \%in\% c("ca", "ny")) \%>\%
-                          dplyr::select(geo_value, time_value, cases))
-
+latest <- get_test_data(
+  recipe = r,
+  x = epiprocess::jhu_csse_daily_subset \%>\%
+    dplyr::filter(time_value > "2021-11-01",
+                  geo_value \%in\% c("ca", "ny")) \%>\%
+    dplyr::select(geo_value, time_value, cases))
 
 predict(wf, latest)
 }
diff --git a/man/state_census.Rd b/man/state_census.Rd
new file mode 100644
index 000000000..eec13eb53
--- /dev/null
+++ b/man/state_census.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{state_census}
+\alias{state_census}
+\title{State population data}
+\format{
+Data frame with 57 rows (including one for the United States as a
+whole, plus the District of Columbia, Puerto Rico Commonwealth,
+American Samoa, Guam, the U.S. Virgin Islands, and the Northern Mariana
+Islands).
+
+\describe{
+\item{fips}{FIPS code}
+\item{name}{Full name of the state or territory}
+\item{pop}{Estimate of the location's resident population in
+2019.}
+\item{abbr}{Postal abbreviation for the location}
+}
+}
+\source{
+United States Census Bureau, at
+\url{https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.pdf},
+\url{https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-puerto-rico-municipios.html},
+and \url{https://www.census.gov/data/tables/2010/dec/2010-island-areas.html}
+}
+\usage{
+state_census
+}
+\description{
+Data set on state populations, from the 2019 US Census.
+}
+\keyword{datasets}
diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R
index 2404a7b4c..5b805b33b 100644
--- a/tests/testthat/test-population_scaling.R
+++ b/tests/testthat/test-population_scaling.R
@@ -115,7 +115,23 @@ test_that("Postprocessing workflow works and values correct", {
   expect_silent(p <- predict(wf, latest))
   expect_equal(nrow(p), 2L)
   expect_equal(ncol(p), 4L)
-  expect_equal(p$.pred_original, p$.pred * c(20000, 30000))
+  expect_equal(p$.pred_scaled, p$.pred * c(20000, 30000))
+
+  f <- frosting() %>%
+    layer_predict() %>%
+    layer_threshold(.pred) %>%
+    layer_naomit(.pred) %>%
+    layer_population_scaling(.pred, df = pop_data, rate_rescaling = 10000,
+                             by = c("geo_value" = "states"),
+                             df_pop_col = "value")
+  wf <- epi_workflow(r, parsnip::linear_reg()) %>%
+    fit(jhu) %>%
+    add_frosting(f)
+  expect_silent(p <- predict(wf, latest))
+  expect_equal(nrow(p), 2L)
+  expect_equal(ncol(p), 4L)
+  expect_equal(p$.pred_scaled, p$.pred * c(2, 3))
+
 })
 
 test_that("Postprocessing to get cases from case rate", {
@@ -144,8 +160,7 @@ test_that("Postprocessing to get cases from case rate", {
                              by = c("geo_value" = "states"),
                              df_pop_col = "value")
 
-  wf <- epi_workflow(r,
-                     parsnip::linear_reg()) %>%
+  wf <- epi_workflow(r, parsnip::linear_reg()) %>%
     fit(jhu) %>%
     add_frosting(f)
 
@@ -159,7 +174,7 @@
   expect_silent(p <- predict(wf, latest))
   expect_equal(nrow(p), 2L)
   expect_equal(ncol(p), 4L)
-  expect_equal(p$.pred_original, p$.pred * c(1/20000, 1/30000))
+  expect_equal(p$.pred_scaled, p$.pred * c(1/20000, 1/30000))
 })
 
 
@@ -239,11 +254,11 @@ test_that("expect error if `by` selector does not match", {
                              by = NULL,
                              df_pop_col = "values")
 
-  expect_error(wf <- epi_workflow(r,
-                                  parsnip::linear_reg()) %>%
-                 fit(jhu) %>%
-                 add_frosting(f),
-               "columns in `by` selectors of `step_population_scaling` must be present in data and match")
+  expect_error(
+    wf <- epi_workflow(r, parsnip::linear_reg()) %>%
+      fit(jhu) %>%
+      add_frosting(f),
+    "columns in `by` selectors of `step_population_scaling` must be present in data and match")
 
   r <- epi_recipe(jhu) %>%
     step_population_scaling(case_rate,
@@ -271,8 +286,7 @@
                             geo_value %in% c("ca", "ny")) %>%
             dplyr::select(geo_value, time_value, case_rate))
 
-  wf <- epi_workflow(r,
-                     parsnip::linear_reg()) %>%
+  wf <- epi_workflow(r, parsnip::linear_reg()) %>%
     fit(jhu) %>%
     add_frosting(f)
 
diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd
new file mode 100644
index 000000000..23e110b6c
--- /dev/null
+++ b/vignettes/preprocessing-and-models.Rmd
@@ -0,0 +1,579 @@
+---
+title: Examples of Preprocessing and Models
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Examples of Preprocessing and Models}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## Introduction
+
+The `epipredict` package utilizes the `tidymodels` framework, namely
+[`recipes`](https://recipes.tidymodels.org/) for
+[dplyr](https://dplyr.tidyverse.org/)-like pipeable sequences
+of feature engineering and [`parsnip`](https://parsnip.tidymodels.org/) for a
+unified interface to a range of models.
+
+`epipredict` has additional customized feature engineering and preprocessing
+steps, such as `step_epi_shift()`, `step_population_scaling()`, and
+`step_epi_naomit()`.
+They can be used along with
+steps from the `recipes` package for additional feature engineering.
+
+In this vignette, we will illustrate some examples of how to use `epipredict`
+with `recipes` and `parsnip` for different epidemiological forecasting tasks.
+We will focus on basic autoregressive models, in which COVID cases and
+deaths in the near future are predicted using a linear combination of cases and
+deaths in the near past.
+
+The remainder of the vignette is split into three sections. In the first
+section, we will use Poisson regression to predict death counts. In the second
+section, we will use linear regression to predict death rates. Last but not
+least, we will create a classification model for hotspot predictions.
+
+```{r, warning=FALSE, message=FALSE}
+library(tidyr)
+library(dplyr)
+library(epidatr)
+library(epiprocess)
+library(epipredict)
+library(recipes)
+library(parsnip)
+library(workflows)
+library(poissonreg)
+```
+
+## Poisson Regression
+
+During the COVID-19 pandemic, the Centers for Disease Control and Prevention
+(CDC) gathered models and forecasts to characterize the state of an outbreak
+and its course. These were used to inform public health decision makers about
+the potential consequences of deploying control measures.
+
+One of the outcomes that the CDC forecasts is [death counts from COVID-19](https://www.cdc.gov/coronavirus/2019-ncov/science/forecasting/forecasting-us.html).
+Although there are many state-of-the-art models, we choose to use Poisson
+regression, the textbook example for modeling count data, as an illustration
+for using the `epipredict` package with other existing tidymodels packages.
+
+```{r poisson-reg-data}
+x <- covidcast(
+  data_source = "jhu-csse",
+  signals = "confirmed_incidence_num",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20210604, 20211231),
+  geo_values = "ca,fl,tx,ny,nj") %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, cases = value)
+
+y <- covidcast(
+  data_source = "jhu-csse",
+  signals = "deaths_incidence_num",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20210604, 20211231),
+  geo_values = "ca,fl,tx,ny,nj") %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, deaths = value)
+
+case_death_counts_subset <- x %>%
+  full_join(y, by = c("geo_value", "time_value")) %>%
+  as_epi_df()
+```
+
+The `case_death_counts_subset` dataset is fetched with the `epidatr` package,
+and contains the number of confirmed cases and deaths from June 4, 2021 to
+Dec 31, 2021 in some U.S. states.
+
+We wish to predict the 7-day-ahead death counts with lagged cases and deaths.
+Furthermore, we will include a dummy variable for each state. Using
+differential intercept coefficients, we can allow for an intercept shift
+between states.
+
+The model takes the form
+\begin{aligned}
+\log\left( \mu_{t+7} \right) = \beta_0 + \delta_1 s_{\text{state}_1} +
+\delta_2 s_{\text{state}_2} + \cdots + \beta_1 \text{deaths}_{t} + \nonumber \\
+\beta_2 \text{deaths}_{t-7} + \beta_3 \text{cases}_{t} +
+\beta_4 \text{cases}_{t-7},
+\end{aligned}
+
+where $\mu_{t+7} = \mathbb{E}(y_{t+7})$, and $y_{t+7}$ is assumed to follow a
+Poisson distribution with mean $\mu_{t+7}$; the $s_{\text{state}}$ are dummy
+variables for each state, taking values of either 0 or 1.
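+
+To make the state dummy encoding concrete, here is a small sketch (toy data,
+not part of the analysis) of what `step_dummy()` (used in the recipe below)
+does to a factor column:
+
+```{r}
+# step_dummy() expands a factor into 0/1 indicator columns, dropping the
+# first level ("ca" here) as the baseline absorbed by the intercept.
+toy <- data.frame(geo_value_factor = as.factor(c("ca", "fl", "ny")), y = 1:3)
+recipe(y ~ geo_value_factor, data = toy) %>%
+  step_dummy(geo_value_factor) %>%
+  prep() %>%
+  bake(new_data = NULL)
+```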
+
+Preprocessing steps will be performed using `recipes` functions to get the
+data ready for modeling.
+
+But before diving into them, it will be helpful to understand what `roles` are
+in the `recipes` framework.
+
+---
+
+#### Aside on `recipes`
+
+`recipes` can assign one or more roles to each column in the data. The roles
+are not restricted to a predefined set; they can be anything.
+For most conventional situations, they are typically "predictor" and/or
+"outcome". Additional roles enable targeted `step_*()` operations on specific
+variables or groups of variables.
+
+In our case, the role `predictor` is given to explanatory variables that go on
+the right-hand side of the model. The role `outcome` is given to the response
+variable that we wish to predict. `geo_value` and `time_value` are predefined
+roles that are unique to the `epipredict` package. Since we work with `epi_df`
+objects, all datasets should have `geo_value` and `time_value` passed through
+automatically, with these two roles assigned to the appropriate columns.
+
+The `recipes` package also allows [manual alterations of roles](https://recipes.tidymodels.org/reference/roles.html)
+in bulk. There are a couple of handy functions that can be used together to
+help us manipulate variable roles easily.
+
+> `update_role()` alters an existing role in the recipe or assigns an initial role
+> to variables that do not yet have a declared role.
+>
+> `add_role()` adds an additional role to variables that already have a role in
+> the recipe, without overwriting old roles.
+>
+> `remove_role()` eliminates a single existing role in the recipe.
+
+#### End aside
+
+---
+
+Notice that in the following preprocessing steps, we use `add_role()` on
+`geo_value_factor`: its default role is currently `raw`, and we would like the
+dummified variables to automatically be treated as predictors.
+
+```{r}
+case_death_counts_subset <- case_death_counts_subset %>%
+  mutate(geo_value_factor = as.factor(geo_value)) %>%
+  as_epi_df()
+
+epi_recipe(case_death_counts_subset)
+
+r <- epi_recipe(case_death_counts_subset) %>%
+  add_role(geo_value_factor, new_role = "predictor") %>%
+  step_dummy(geo_value_factor) %>%
+  ## Occasionally, data reporting errors / corrections result in negative
+  ## cases / deaths
+  step_mutate(cases = pmax(cases, 0), deaths = pmax(deaths, 0)) %>%
+  step_epi_lag(cases, deaths, lag = c(0, 7)) %>%
+  step_epi_ahead(deaths, ahead = 7, role = "outcome") %>%
+  step_epi_naomit()
+```
+
+After specifying the preprocessing steps, we will use the `parsnip` package
+for modeling and produce a prediction of death counts 7 days after the
+latest available date in the dataset.
+
+```{r}
+latest <- get_test_data(r, case_death_counts_subset)
+
+wf <- epi_workflow(r, parsnip::poisson_reg()) %>%
+  fit(case_death_counts_subset)
+
+predict(wf, latest) %>% filter(!is.na(.pred))
+```
+
+Note that the `time_value` corresponds to the last available date in the
+training set, **NOT** to the target date of the forecast
+(`r max(latest$time_value) + 7`).
+
+Let's take a look at the fit:
+```{r}
+extract_fit_engine(wf)
+```
+
+Up to now, we've used Poisson regression to model count data. Poisson
+regression can also be used to model rate data, such as case rates or death
+rates, by incorporating an offset term in the model.
+
+To model death rates, the Poisson regression would be expressed as
+\begin{aligned}
+\log\left( \mu_{t+7} \right) = \log(\text{population}) +
+\beta_0 + \delta_1 s_{\text{state}_1} +
+\delta_2 s_{\text{state}_2} + \cdots + \beta_1 \text{deaths}_{t} + \nonumber \\
+\beta_2 \text{deaths}_{t-7} + \beta_3 \text{cases}_{t} +
+\beta_4 \text{cases}_{t-7},
+\end{aligned}
+where $\log(\text{population})$ is the log of the state population that was
+used to scale the count data on the left-hand side of the equation.
+
+There are several ways to model rate data given count and population data.
+First, in the `parsnip` framework, we can specify the formula in `fit()`.
+However, by doing so we lose the ability to use the `recipes` framework to
+create new variables, since variables that do not exist in the original
+dataset, such as lags and leads, cannot be referenced directly in `fit()`.
+
+Alternatively, `step_population_scaling()` and `layer_population_scaling()`
+in the `epipredict` package can perform the population scaling if we provide
+the population data, and we can then use other models such as linear
+regression, which we will illustrate in the next section.
+
+## Linear Regression
+
+For COVID-19, the CDC required submission of case and death count predictions.
+However, the Delphi Group preferred to train on rate data instead, because it
+puts different locations on a similar scale.
+We can use linear regression to predict the death
+rates and use state population data to scale the rates to counts. We will do
+so using `layer_population_scaling()` from the `epipredict` package.
+
+Additionally, when forecasts are submitted, prediction intervals should be
+provided along with the point estimates. This can be obtained via the
+postprocessing layer `layer_residual_quantiles()`. It is worth pointing out
+that `layer_residual_quantiles()` should be used before population scaling,
+or else the transformation will make the results uninterpretable.
+
+We wish to predict the 7-day-ahead death counts with lagged case rates and
+death rates, along with extra behavioral indicator inputs. Namely, we will use
+survey data from the [COVID-19 Trends and Impact Survey](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/fb-survey.html#behavior-indicators).
+
+The survey data shows the estimated percentage of people who wore a mask for
+most or all of the time while in public in the past 7 days and the estimated
+percentage of respondents who reported that all or most people they encountered
+in public in the past 7 days maintained a distance of at least 6 feet.
+
+State-level population data from the 2019 U.S. Census is included in this
+package and will be used in `layer_population_scaling()`.
+```{r}
+behav_ind_mask <- covidcast(
+  data_source = "fb-survey",
+  signals = "smoothed_wwearing_mask_7d",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20210604, 20211231),
+  geo_values = "ca,fl,tx,ny,nj") %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, masking = value)
+
+behav_ind_distancing <- covidcast(
+  data_source = "fb-survey",
+  signals = "smoothed_wothers_distanced_public",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20210604, 20211231),
+  geo_values = "ca,fl,tx,ny,nj") %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, distancing = value)
+
+pop_dat <- state_census %>% select(abbr, pop)
+
+behav_ind <- behav_ind_mask %>%
+  full_join(behav_ind_distancing, by = c("geo_value", "time_value"))
+```
+
+Rather than using the raw mask-wearing / social-distancing metrics, for the
+sake of illustration, we'll convert both into categorical predictors.
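+
+The binning is done with `cut_number()` (from `ggplot2`) inside the recipe
+below. As a quick sketch of its behavior (toy numbers, not the survey data),
+it splits a numeric vector into roughly equal-frequency intervals:
+
+```{r}
+# cut_number(vals, 5) creates 5 bins, each containing about the same
+# number of observations (here, 2 apiece).
+vals <- c(55, 60, 62, 70, 71, 80, 88, 90, 91, 95)
+table(ggplot2::cut_number(vals, 5))
+```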
+
+```{r, echo=FALSE, message=FALSE, fig.align='center', fig.width=6, fig.height=4}
+library(ggplot2)
+behav_ind %>%
+  pivot_longer(masking:distancing) %>%
+  ggplot(aes(value, fill = geo_value)) +
+  geom_density(alpha = 0.5) +
+  scale_fill_brewer(palette = "Set1", name = "") +
+  theme_bw() +
+  scale_x_continuous(expand = c(0, 0)) +
+  scale_y_continuous(expand = expansion(c(0, .05))) +
+  facet_wrap(~name, scales = "free") +
+  theme(legend.position = "bottom")
+```
+
+We will take a subset of death rate and case rate data from the built-in
+dataset `case_death_rate_subset`.
+
+```{r}
+jhu <- case_death_rate_subset %>%
+  dplyr::filter(
+    time_value >= "2021-06-04",
+    time_value <= "2021-12-31",
+    geo_value %in% c("ca", "fl", "tx", "ny", "nj"))
+```
+
+Preprocessing steps will rely on functions from the `epipredict` package as
+well as the `recipes` package. There are also many functions in the `recipes`
+package that allow for
+[scalar transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations),
+such as log transformations and data centering. In our case, we will
+center the numerical predictors to allow for a meaningful interpretation of
+the intercept.
+
+```{r}
+jhu <- jhu %>%
+  mutate(geo_value_factor = as.factor(geo_value)) %>%
+  left_join(behav_ind, by = c("geo_value", "time_value")) %>%
+  as_epi_df()
+
+r <- epi_recipe(jhu) %>%
+  add_role(geo_value_factor, new_role = "predictor") %>%
+  step_dummy(geo_value_factor) %>%
+  step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>%
+  step_mutate(masking = cut_number(masking, 5),
+              distancing = cut_number(distancing, 5)) %>%
+  step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>%
+  step_center(contains("lag"), role = "predictor") %>%
+  step_epi_naomit()
+```
+
+As a sanity check, we can see what the training data will look like:
+```{r, warning = FALSE}
+glimpse(slice_sample(bake(prep(r, jhu), jhu), n = 6))
+```
+
+Before directly predicting the results, we need to add postprocessing layers
+to obtain the death counts instead of death rates. Note that the rates used
+so far are "per 100K people" rather than "per person".
+
+```{r}
+f <- frosting() %>%
+  layer_predict() %>%
+  layer_add_target_date("2022-01-07") %>%
+  layer_threshold(.pred, lower = 0) %>%
+  layer_residual_quantiles(probs = c(0.05, 0.95), symmetrize = FALSE) %>%
+  layer_naomit(.pred) %>%
+  layer_population_scaling(
+    .pred, .pred_distn, df = pop_dat, rate_rescaling = 1e5,
+    by = c("geo_value" = "abbr"), df_pop_col = "pop")
+
+wf <- epi_workflow(r, parsnip::linear_reg()) %>%
+  fit(jhu) %>%
+  add_frosting(f)
+
+latest <- get_test_data(recipe = r, x = jhu)
+p <- predict(wf, latest)
+p
+```
+
+The columns marked `*_scaled` have been rescaled to the correct units, in this
+case `deaths` rather than deaths per 100K people (the latter remain in
+`.pred`).
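+
+As a quick hand check of the arithmetic performed by
+`layer_population_scaling()` (illustrative numbers, not taken from the actual
+fit): each prediction is multiplied by the joined population and divided by
+`rate_rescaling`.
+
+```{r}
+# E.g., a predicted rate of 0.5 deaths per 100K in a state of 39.5M people:
+pred_rate <- 0.5
+pop <- 39.5e6
+pred_rate * pop / 1e5  # 197.5 deaths
+```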
+
+To look at the prediction intervals:
+```{r}
+p %>%
+  select(geo_value, target_date, .pred_scaled, .pred_distn_scaled) %>%
+  mutate(.pred_distn_scaled = nested_quantiles(.pred_distn_scaled)) %>%
+  unnest(.pred_distn_scaled) %>%
+  pivot_wider(names_from = tau, values_from = q)
+```
+
+Last but not least, let's take a look at the regression fit and check the
+coefficients:
+```{r, echo = FALSE}
+summary(extract_fit_engine(wf))
+```
+
+## Classification
+
+Sometimes it is preferable to create a predictive model for surges or upswings
+rather than for raw values. In this case,
+the target is to predict whether the future will have increased case rates
+(up), decreased case rates (down), or flat case rates (flat) relative to the
+current level. Such models are often
+referred to as hotspot prediction models. We will use the model notation from
+[McDonald, Bien, Green, Hu, et al.](#references) but extend the application
+to predict three categories instead of two.
+
+Hotspot prediction uses a categorical response variable defined in terms of
+the relative change of $Y_{\ell, t+a}$ compared to $Y_{\ell, t}$,
+where $Y_{\ell, t}$ denotes the case rate in location $\ell$ at time $t$.
+We define the response variable as follows:
+
+$$
+  Z_{\ell, t}=
+  \begin{cases}
+    \text{up}, & \text{if}\ Y^{\Delta}_{\ell, t} > 0.25 \\
+    \text{down}, & \text{if}\ Y^{\Delta}_{\ell, t} < -0.20 \\
+    \text{flat}, & \text{otherwise}
+  \end{cases}
+$$
+
+where $Y^{\Delta}_{\ell, t} = (Y_{\ell, t} - Y_{\ell, t-7}) / (Y_{\ell, t-7})$.
+We say location $\ell$ is a hotspot at time $t$ when $Z_{\ell,t}$ is
+categorized as "up", meaning the number of newly reported cases over the past
+7 days has increased by at least 25% compared to the preceding week. When
+$Z_{\ell,t}$ is categorized as "down", there has been at least a 20% decrease
+in newly reported cases over the past 7 days. Otherwise, we consider the
+trend to be flat.
+
+The expression of the multinomial regression we will model is as follows:
+
+$$
+\pi_{j}(x) = \text{Pr}(Y = j|x) = \frac{e^{g_j(x)}}{\sum_{k=0}^2 e^{g_k(x)}}
+$$
+where $j$ is either down ($Y=0$), flat ($Y=1$), or up ($Y=2$),
+$$g_0(x) = 0,$$
+
+\begin{aligned}
+g_1(x) = \text{ln}\left(\frac{Pr(Y=1|x)}{Pr(Y=0|x)}\right) &=
+\beta_{10} + \beta_{11} x_{\text{time\_value}} + \delta_{10} s_{\text{state}_1} +
+\delta_{11} s_{\text{state}_2} + \cdots \nonumber \\
+&\quad + \beta_{12} Y^{\Delta}_{\ell, t} +
+\beta_{13} Y^{\Delta}_{\ell, t-7},
+\end{aligned}
+
+\begin{aligned}
+g_2(x) = \text{ln}\left(\frac{Pr(Y=2|x)}{Pr(Y=0|x)}\right) &=
+\beta_{20} + \beta_{21} x_{\text{time\_value}} + \delta_{20} s_{\text{state}_1} +
+\delta_{21} s_{\text{state}_2} + \cdots \nonumber \\
+&\quad + \beta_{22} Y^{\Delta}_{\ell, t} +
+\beta_{23} Y^{\Delta}_{\ell, t-7}.
+\end{aligned}
+
+Preprocessing steps are similar to those for the previous models, with an
+additional step of categorizing the response variable.
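+
+As a toy illustration of this labeling rule (made-up percentage changes,
+using the same thresholds as the recipe below):
+
+```{r}
+# A 30% rise is "up", a 25% drop is "down", and a 5% rise is "flat".
+pct_diff <- c(0.30, -0.25, 0.05)
+dplyr::case_when(
+  pct_diff < -0.20 ~ "down",
+  pct_diff > 0.25 ~ "up",
+  TRUE ~ "flat")
+```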
+
+We will take a subset of death rate and case rate data from our built-in
+dataset `case_death_rate_subset`.
+
+```{r}
+jhu <- case_death_rate_subset %>%
+  dplyr::filter(time_value >= "2021-06-04",
+                time_value <= "2021-12-31",
+                geo_value %in% c("ca", "fl", "tx", "ny", "nj")) %>%
+  mutate(geo_value_factor = as.factor(geo_value)) %>%
+  as_epi_df()
+
+r <- epi_recipe(jhu) %>%
+  add_role(time_value, new_role = "predictor") %>%
+  step_dummy(geo_value_factor) %>%
+  step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
+  step_epi_ahead(case_rate, ahead = 7, role = "predictor") %>%
+  step_mutate(
+    pct_diff_ahead = case_when(
+      lag_7_case_rate == 0 ~ 0,
+      TRUE ~ (ahead_7_case_rate - lag_0_case_rate) / lag_0_case_rate),
+    pct_diff_wk1 = case_when(
+      lag_7_case_rate == 0 ~ 0,
+      TRUE ~ (lag_0_case_rate - lag_7_case_rate) / lag_7_case_rate),
+    pct_diff_wk2 = case_when(
+      lag_14_case_rate == 0 ~ 0,
+      TRUE ~ (lag_7_case_rate - lag_14_case_rate) / lag_14_case_rate)) %>%
+  step_mutate(
+    response = case_when(
+      pct_diff_ahead < -0.20 ~ "down",
+      pct_diff_ahead > 0.25 ~ "up",
+      TRUE ~ "flat"),
+    role = "outcome") %>%
+  step_rm(death_rate, case_rate, lag_0_case_rate, lag_7_case_rate,
+          lag_14_case_rate, ahead_7_case_rate, pct_diff_ahead) %>%
+  step_epi_naomit()
+```
+
+We will fit the multinomial regression and take a look at the predictions:
+
+```{r, warning=FALSE}
+wf <- epi_workflow(r, parsnip::multinom_reg()) %>%
+  fit(jhu)
+
+latest <- get_test_data(recipe = r, x = jhu)
+predict(wf, latest) %>% filter(!is.na(.pred_class))
+```
+
+We can also take a look at the fit:
+```{r}
+extract_fit_engine(wf)
+```
+
+One could also use a formula in `epi_workflow()`, via `add_formula()`, to
+achieve the same results as above. However, only one of `add_formula()`,
+`add_recipe()`, or `workflow_variables()` can be specified. For the purpose
+of demonstrating `add_formula` rather than `add_recipe`, we will `prep` and
+`bake` our recipe to return a `data.frame` that could be used for model
+fitting.
+```{r}
+b <- bake(prep(r, jhu), jhu)
+
+epi_workflow() %>%
+  add_formula(response ~ geo_value + pct_diff_wk1 + pct_diff_wk2) %>%
+  add_model(parsnip::multinom_reg()) %>%
+  fit(data = b)
+```
+
+## Benefits of Lagging and Leading in `epipredict`
+
+The `step_epi_ahead()` and `step_epi_lag()` functions in the `epipredict`
+package are handy for creating correct lags and leads for future predictions.
+
+Let's start with a simple dataset and preprocessing:
+```{r}
+ex <- case_death_rate_subset %>%
+  dplyr::filter(time_value >= "2021-12-01",
+                time_value <= "2021-12-31",
+                geo_value == "ca")
+
+dim(ex)
+```
+
+We want to predict death rates on `2022-01-07`, which is 7 days ahead of the
+latest available date in our dataset.
+
+We will compare two methods of trying to create lags and leads:
+```{r}
+p1 <- epi_recipe(ex) %>%
+  step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
+  step_epi_lag(death_rate, lag = c(0, 7, 14)) %>%
+  step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>%
+  step_epi_naomit() %>%
+  prep()
+
+b1 <- bake(p1, ex)
+b1
+
+
+p2 <- epi_recipe(ex) %>%
+  step_mutate(lag0case_rate = lag(case_rate, 0),
+              lag7case_rate = lag(case_rate, 7),
+              lag14case_rate = lag(case_rate, 14),
+              lag0death_rate = lag(death_rate, 0),
+              lag7death_rate = lag(death_rate, 7),
+              lag14death_rate = lag(death_rate, 14),
+              ahead7death_rate = lead(death_rate, 7)) %>%
+  step_epi_naomit() %>%
+  prep()
+
+b2 <- bake(p2, ex)
+b2
+```
+
+Notice the difference in the number of rows that `b1` and `b2` return. This
+is because the second version, the one that does not use `step_epi_ahead()`
+and `step_epi_lag()`, has omitted dates compared to the one that uses the
+`epipredict` functions.
+
+```{r}
+dates_used_in_training1 <- b1 %>%
+  select(-ahead_7_death_rate) %>%
+  na.omit() %>%
+  select(time_value)
+
+dates_used_in_training1
+
+dates_used_in_training2 <- b2 %>%
+  select(-ahead7death_rate) %>%
+  na.omit() %>%
+  select(time_value)
+
+dates_used_in_training2
+```
+
+The model that is trained without the `epipredict` functions predicts 7 days
+ahead from `r max(dates_used_in_training2$time_value)`
+instead of 7 days ahead from `r max(dates_used_in_training1$time_value)`.
+
+## References
+
+McDonald, Bien, Green, Hu, et al. "Can auxiliary indicators improve COVID-19
+forecasting and hotspot prediction?" Proceedings of the National Academy of
+Sciences 118.51 (2021): e2111453118. [doi:10.1073/pnas.2111453118](
+https://doi.org/10.1073/pnas.2111453118)
+
+## Attribution
+
+This object contains a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html).
+
+This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University
+on behalf of its Center for Systems Science and Engineering. Copyright Johns Hopkins University 2020.