diff --git a/DESCRIPTION b/DESCRIPTION
index af0a92d5e..9056c3f9a 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -44,6 +44,7 @@ Imports:
     vctrs,
     workflows (>= 1.0.0)
 Suggests:
+    poissonreg,
     covidcast,
    data.table,
     epidatr,
@@ -64,5 +65,5 @@ Remotes:
 Config/testthat/edition: 3
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.0
+RoxygenNote: 7.2.1
 LazyData: true
diff --git a/R/data.R b/R/data.R
index 9c93506a3..622d1c7d7 100644
--- a/R/data.R
+++ b/R/data.R
@@ -33,3 +33,26 @@
 #' June 7 is the average of the underlying data for June 1 through 7,
 #' inclusive.
 "case_death_rate_subset"
+
+#' State population data
+#'
+#' Data set on state populations, from the 2019 US Census.
+#'
+#' @format Data frame with 57 rows (including one for the United States as a
+#' whole, plus the District of Columbia, Puerto Rico Commonwealth,
+#' American Samoa, Guam, the U.S. Virgin Islands, and the Northern Mariana
+#' Islands).
+#'
+#' \describe{
+#' \item{fips}{FIPS code}
+#' \item{name}{Full name of the state or territory}
+#' \item{pop}{Estimate of the location's resident population in
+#' 2019.}
+#' \item{abbr}{Postal abbreviation for the location}
+#' }
+#'
+#' @source United States Census Bureau, at
+#' \url{https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.pdf},
+#' \url{https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-puerto-rico-municipios.html},
+#' and \url{https://www.census.gov/data/tables/2010/dec/2010-island-areas.html}
+"state_census"
diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R
index 8e219a9e4..b5bc57ee3 100644
--- a/R/layer_population_scaling.R
+++ b/R/layer_population_scaling.R
@@ -1,35 +1,43 @@
-#' Revert population scaled prediction
+#' Convert per-capita predictions to raw scale
 #'
 #' `layer_population_scaling` creates a specification of a frosting layer
-#' that will add a population scaled column in the data. For example,
-#' load a dataset that contains county population, and join to an `epi_df`
-#' that currently predicts number of new cases by county to obtain case rates.
-#' Although worth noting that there is nothing special about "population".
-#' The function can be used to scale by any variable. Population is simply the
-#' most natural and common use case.
+#' that will "undo" per-capita scaling. Typical usage would
+#' load a dataset that contains state-level population, and use it to convert
+#' predictions made from a rate-scale model to raw scale by multiplying by
+#' the population.
+#' It is worth noting that there is nothing special about "population".
+#' The function can be used to scale by any variable. Population is simply the
+#' standard use case in epidemiological forecasting. Any value
+#' passed will *multiply* the selected variables, while the `rate_rescaling`
+#' argument is a common *divisor* of the selected variables.
 #'
 #' @param frosting a `frosting` postprocessor. The layer will be added to the
-#'   sequence of operations for this frosting.
+#' sequence of operations for this frosting.
 #' @param ... One or more selector functions to scale variables
-#'   for this step. See [selections()] for more details.
-#' @param df a data frame that contains the population data used for scaling.
-#' @param by A character vector of variables to left join by.
+#' for this step. See [selections()] for more details.
+#' @param df a data frame that contains the population data to be used for
+#' inverting the existing scaling.
+#' @param by A (possibly named) character vector of variables to join by.
 #'
 #' If `NULL`, the default, the function will perform a natural join, using all
-#' variables in common across the `epi_df` and the user-provided dataset.
-#' If columns in `epi_df` and `df` have the same name (and aren't
-#' included in by), `.df` is added to the one from the user-provided data
+#' variables in common across the `epi_df` produced by the `predict()` call
+#' and the user-provided dataset.
+#' If columns in that `epi_df` and `df` have the same name (and aren't
+#' included in `by`), `.df` is added to the one from the user-provided data
 #' to disambiguate.
 #'
 #' To join by different variables on the `epi_df` and `df`, use a named vector.
-#' For example, by = c("geo_value" = "states") will match `epi_df$geo_value`
+#' For example, `by = c("geo_value" = "states")` will match `epi_df$geo_value`
 #' to `df$states`. To join by multiple variables, use a vector with length > 1.
-#' For example, by = c("geo_value" = "states", "county" = "county") will match
+#' For example, `by = c("geo_value" = "states", "county" = "county")` will match
 #' `epi_df$geo_value` to `df$states` and `epi_df$county` to `df$county`.
 #'
 #' See [dplyr::left_join()] for more details.
 #' @param df_pop_col the name of the column in the data frame `df` that
 #' contains the population data and used for scaling.
+#' @param rate_rescaling Sometimes rates are "per 100K" or "per 1M" rather than
+#' "per person". Adjustments can be made here. For example, if the original
+#' rate is "per 100K", then set `rate_rescaling = 1e5` to get counts back.
 #' @param create_new TRUE to create a new column and keep the original column
 #' in the `epi_df`.
 #' @param suffix a character. The suffix added to the column name if
@@ -46,8 +54,7 @@
 #' dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%
 #' dplyr::select(geo_value, time_value, cases)
 #'
-#' pop_data = data.frame(states = c("ca", "ny"),
-#'                       value = c(20000, 30000))
+#' pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000))
 #'
 #' r <- epi_recipe(jhu) %>%
 #'   step_population_scaling(df = pop_data,
@@ -71,12 +78,12 @@
 #'   parsnip::fit(jhu) %>%
 #'   add_frosting(f)
 #'
-#' latest <- get_test_data(recipe = r,
-#'                         x = epiprocess::jhu_csse_daily_subset %>%
-#'                           dplyr::filter(time_value > "2021-11-01",
-#'                                         geo_value %in% c("ca", "ny")) %>%
-#'                           dplyr::select(geo_value, time_value, cases))
-#'
+#' latest <- get_test_data(
+#'   recipe = r,
+#'   x = epiprocess::jhu_csse_daily_subset %>%
+#'     dplyr::filter(time_value > "2021-11-01",
+#'                   geo_value %in% c("ca", "ny")) %>%
+#'     dplyr::select(geo_value, time_value, cases))
 #'
 #' predict(wf, latest)
 layer_population_scaling <- function(frosting,
@@ -84,17 +91,26 @@ layer_population_scaling <- function(frosting,
                                      df,
                                      by = NULL,
                                      df_pop_col,
+                                     rate_rescaling = 1,
                                      create_new = TRUE,
-                                     suffix = "_original",
+                                     suffix = "_scaled",
                                      .flag = TRUE,
                                      id = rand_id("population_scaling")) {
+  arg_is_scalar(df_pop_col, rate_rescaling, create_new, suffix, .flag, id)
+  arg_is_lgl(create_new, .flag)
+  arg_is_chr(df_pop_col, suffix, id)
+  arg_is_chr(by, allow_null = TRUE)
+  if (rate_rescaling <= 0)
+    cli_stop("`rate_rescaling` should be a positive number")
+
   add_layer(
     frosting,
     layer_population_scaling_new(
       df = df,
       by = by,
       df_pop_col = df_pop_col,
+      rate_rescaling = rate_rescaling,
       terms = dplyr::enquos(...),
       create_new = create_new,
       suffix = suffix,
@@ -105,11 +121,12 @@ layer_population_scaling <- function(frosting,
 }
 
 layer_population_scaling_new <-
-  function(df, by, df_pop_col, terms, create_new, suffix, id) {
+  function(df, by, df_pop_col, rate_rescaling, terms, create_new, suffix, id) {
     layer("population_scaling",
           df = df,
          by = by,
           df_pop_col = df_pop_col,
+          rate_rescaling = rate_rescaling,
           terms = terms,
           create_new = create_new,
           suffix = suffix,
@@ -125,8 +142,9 @@ slather.layer_population_scaling <-
   try_join <- try(dplyr::left_join(components$predictions, object$df,
                                    by= object$by),
                   silent = TRUE)
-  if (any(grepl("Join columns must be present in data", unlist(try_join)))){
-    stop("columns in `by` selectors of `layer_population_scaling` must be present in data and match")}
+  if (any(grepl("Join columns must be present in data", unlist(try_join)))) {
+    cli_stop(c("columns in `by` selectors of `layer_population_scaling` ",
+               "must be present in data and match"))}
 
   object$df <- object$df %>%
     dplyr::mutate(dplyr::across(where(is.character), tolower))
@@ -136,13 +154,12 @@
   col_names <- names(pos)
   suffix = ifelse(object$create_new, object$suffix, "")
 
-
   components$predictions <- dplyr::left_join(components$predictions,
                                              object$df,
                                              by= object$by,
                                              suffix = c("", ".df")) %>%
     dplyr::mutate(dplyr::across(dplyr::all_of(col_names),
-                                ~.x * !!pop_col ,
+                                ~.x * !!pop_col / object$rate_rescaling,
                                 .names = "{.col}{suffix}")) %>%
     dplyr::select(- !!pop_col)
   components
diff --git a/R/utils_arg.R b/R/utils_arg.R
index ebac387d5..6f762382a 100644
--- a/R/utils_arg.R
+++ b/R/utils_arg.R
@@ -108,7 +108,7 @@ arg_is_chr = function(..., allow_null = FALSE, allow_na = FALSE, allow_empty = F
   handle_arg_list(
     ...,
     tests = function(name, value) {
-      if (!(is.character(value) | (is.null(value) & !allow_null)))
+      if (!(is.character(value) | (is.null(value) & allow_null)))
         cli_stop("Argument {.val {name}} must be of character type.")
 
       if (any(is.na(value)) & !allow_na)
diff --git a/data-raw/state_census.R b/data-raw/state_census.R
new file mode 100644
index 000000000..22dde5a41
--- /dev/null
+++ b/data-raw/state_census.R
@@ -0,0 +1,11 @@
+library(dplyr)
+library(tidyr)
+
+state_census <- covidcast::state_census %>%
+  select(STATE, NAME, POPESTIMATE2019, ABBR) %>%
+  rename(abbr = ABBR, name = NAME, pop = POPESTIMATE2019, fips = STATE) %>%
+  mutate(abbr = tolower(abbr)) %>%
+  as_tibble()
+
+
+usethis::use_data(state_census, overwrite = TRUE)
diff --git a/data/state_census.rda b/data/state_census.rda
new file mode 100644
index 000000000..1118db0d0
Binary files /dev/null and b/data/state_census.rda differ
diff --git a/man/create_layer.Rd b/man/create_layer.Rd
index 7917e8854..81f5e33b0 100644
--- a/man/create_layer.Rd
+++ b/man/create_layer.Rd
@@ -9,9 +9,9 @@ create_layer(name = NULL, open = rlang::is_interactive())
 \arguments{
 \item{name}{Either a name without extension, or \code{NULL} to create
 the paired file based on currently open file in the script editor. If
-the R file is open, \code{use_test()} will create/open the corresponding
+the \verb{R/} file is open, \code{use_test()} will create/open the corresponding
 test file; if the test file is open, \code{use_r()} will create/open the
-corresponding R file.}
+corresponding \verb{R/} file.}
 
 \item{open}{Whether to open the file for interactive editing.}
 }
diff --git a/man/layer_population_scaling.Rd b/man/layer_population_scaling.Rd
index 9d59e3f08..8ac350870 100644
--- a/man/layer_population_scaling.Rd
+++ b/man/layer_population_scaling.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/layer_population_scaling.R
 \name{layer_population_scaling}
 \alias{layer_population_scaling}
-\title{Revert population scaled prediction}
+\title{Convert per-capita predictions to raw scale}
 \usage{
 layer_population_scaling(
   frosting,
@@ -10,33 +10,36 @@ layer_population_scaling(
   df,
   by = NULL,
   df_pop_col,
+  rate_rescaling = 1,
   create_new = TRUE,
-  suffix = "_original",
+  suffix = "_scaled",
   .flag = TRUE,
   id = rand_id("population_scaling")
 )
 }
 \arguments{
 \item{frosting}{a \code{frosting} postprocessor. The layer will be added to the
-sequence of operations for this frosting.}
+sequence of operations for this frosting.}
 
 \item{...}{One or more selector functions to scale variables
 for this step. See \code{\link[=selections]{selections()}} for more details.}
 
-\item{df}{a data frame that contains the population data used for scaling.}
+\item{df}{a data frame that contains the population data to be used for
+inverting the existing scaling.}
 
-\item{by}{A character vector of variables to left join by.
+\item{by}{A (possibly named) character vector of variables to join by.
 
 If \code{NULL}, the default, the function will perform a natural join, using all
-variables in common across the \code{epi_df} and the user-provided dataset.
-If columns in \code{epi_df} and \code{df} have the same name (and aren't
-included in by), \code{.df} is added to the one from the user-provided data
+variables in common across the \code{epi_df} produced by the \code{predict()} call
+and the user-provided dataset.
+If columns in that \code{epi_df} and \code{df} have the same name (and aren't
+included in \code{by}), \code{.df} is added to the one from the user-provided data
 to disambiguate.
 
 To join by different variables on the \code{epi_df} and \code{df}, use a named vector.
-For example, by = c("geo_value" = "states") will match \code{epi_df$geo_value}
+For example, \code{by = c("geo_value" = "states")} will match \code{epi_df$geo_value}
 to \code{df$states}. To join by multiple variables, use a vector with length > 1.
-For example, by = c("geo_value" = "states", "county" = "county") will match
+For example, \code{by = c("geo_value" = "states", "county" = "county")} will match
 \code{epi_df$geo_value} to \code{df$states} and \code{epi_df$county} to \code{df$county}.
 
 See \code{\link[dplyr:mutate-joins]{dplyr::left_join()}} for more details.}
@@ -44,6 +47,10 @@ See \code{\link[dplyr:mutate-joins]{dplyr::left_join()}} for more details.}
 \item{df_pop_col}{the name of the column in the data frame \code{df} that
 contains the population data and used for scaling.}
 
+\item{rate_rescaling}{Sometimes rates are "per 100K" or "per 1M" rather than
+"per person". Adjustments can be made here.
+For example, if the original
+rate is "per 100K", then set \code{rate_rescaling = 1e5} to get counts back.}
+
 \item{create_new}{TRUE to create a new column and keep the original column
 in the \code{epi_df}.}
 
@@ -60,12 +67,15 @@ an updated \code{frosting} postprocessor
 }
 \description{
 \code{layer_population_scaling} creates a specification of a frosting layer
-that will add a population scaled column in the data. For example,
-load a dataset that contains county population, and join to an \code{epi_df}
-that currently predicts number of new cases by county to obtain case rates.
-Although worth noting that there is nothing special about "population".
-The function can be used to scale by any variable. Population is simply the
-most natural and common use case.
+that will "undo" per-capita scaling. Typical usage would
+load a dataset that contains state-level population, and use it to convert
+predictions made from a rate-scale model to raw scale by multiplying by
+the population.
+It is worth noting that there is nothing special about "population".
+The function can be used to scale by any variable. Population is simply the
+standard use case in epidemiological forecasting. Any value
+passed will \emph{multiply} the selected variables, while the \code{rate_rescaling}
+argument is a common \emph{divisor} of the selected variables.
 }
 \examples{
 library(epiprocess)
 jhu <- epiprocess::jhu_csse_daily_subset \%>\%
   dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\%
   dplyr::select(geo_value, time_value, cases)
 
-pop_data = data.frame(states = c("ca", "ny"),
-                      value = c(20000, 30000))
+pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000))
 
 r <- epi_recipe(jhu) \%>\%
   step_population_scaling(df = pop_data,
@@ -98,12 +107,12 @@ wf <- epi_workflow(r,
   parsnip::fit(jhu) \%>\%
   add_frosting(f)
 
-latest <- get_test_data(recipe = r,
-                        x = epiprocess::jhu_csse_daily_subset \%>\%
-                          dplyr::filter(time_value > "2021-11-01",
-                                        geo_value \%in\% c("ca", "ny")) \%>\%
-                          dplyr::select(geo_value, time_value, cases))
-
+latest <- get_test_data(
+  recipe = r,
+  x = epiprocess::jhu_csse_daily_subset \%>\%
+    dplyr::filter(time_value > "2021-11-01",
+                  geo_value \%in\% c("ca", "ny")) \%>\%
+    dplyr::select(geo_value, time_value, cases))
 
 predict(wf, latest)
 }
diff --git a/man/state_census.Rd b/man/state_census.Rd
new file mode 100644
index 000000000..eec13eb53
--- /dev/null
+++ b/man/state_census.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{state_census}
+\alias{state_census}
+\title{State population data}
+\format{
+Data frame with 57 rows (including one for the United States as a
+whole, plus the District of Columbia, Puerto Rico Commonwealth,
+American Samoa, Guam, the U.S. Virgin Islands, and the Northern Mariana
+Islands).
+
+\describe{
+\item{fips}{FIPS code}
+\item{name}{Full name of the state or territory}
+\item{pop}{Estimate of the location's resident population in
+2019.}
+\item{abbr}{Postal abbreviation for the location}
+}
+}
+\source{
+United States Census Bureau, at
+\url{https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.pdf},
+\url{https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-puerto-rico-municipios.html},
+and \url{https://www.census.gov/data/tables/2010/dec/2010-island-areas.html}
+}
+\usage{
+state_census
+}
+\description{
+Data set on state populations, from the 2019 US Census.
+}
+\keyword{datasets}
diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R
index 2404a7b4c..5b805b33b 100644
--- a/tests/testthat/test-population_scaling.R
+++ b/tests/testthat/test-population_scaling.R
@@ -115,7 +115,23 @@ test_that("Postprocessing workflow works and values correct", {
   expect_silent(p <- predict(wf, latest))
   expect_equal(nrow(p), 2L)
   expect_equal(ncol(p), 4L)
-  expect_equal(p$.pred_original, p$.pred * c(20000, 30000))
+  expect_equal(p$.pred_scaled, p$.pred * c(20000, 30000))
+
+  f <- frosting() %>%
+    layer_predict() %>%
+    layer_threshold(.pred) %>%
+    layer_naomit(.pred) %>%
+    layer_population_scaling(.pred, df = pop_data, rate_rescaling = 10000,
+                             by = c("geo_value" = "states"),
+                             df_pop_col = "value")
+  wf <- epi_workflow(r, parsnip::linear_reg()) %>%
+    fit(jhu) %>%
+    add_frosting(f)
+  expect_silent(p <- predict(wf, latest))
+  expect_equal(nrow(p), 2L)
+  expect_equal(ncol(p), 4L)
+  expect_equal(p$.pred_scaled, p$.pred * c(2, 3))
+
 })
 
 test_that("Postprocessing to get cases from case rate", {
@@ -144,8 +160,7 @@ test_that("Postprocessing to get cases from case rate", {
                              by = c("geo_value" = "states"),
                              df_pop_col = "value")
 
-  wf <- epi_workflow(r,
-                     parsnip::linear_reg()) %>%
+  wf <- epi_workflow(r, parsnip::linear_reg()) %>%
     fit(jhu) %>%
     add_frosting(f)
 
@@ -159,7 +174,7 @@
   expect_silent(p <- predict(wf, latest))
   expect_equal(nrow(p), 2L)
   expect_equal(ncol(p), 4L)
-  expect_equal(p$.pred_original, p$.pred * c(1/20000, 1/30000))
+  expect_equal(p$.pred_scaled, p$.pred * c(1/20000, 1/30000))
 })
 
 
@@ -239,11 +254,11 @@ test_that("expect error if `by` selector does not match", {
                              by = NULL,
                              df_pop_col = "values")
 
-  expect_error(wf <- epi_workflow(r,
-                                  parsnip::linear_reg()) %>%
-                 fit(jhu) %>%
-                 add_frosting(f),
-               "columns in `by` selectors of `step_population_scaling` must be present in data and match")
+  expect_error(
+    wf <- epi_workflow(r, parsnip::linear_reg()) %>%
+      fit(jhu) %>%
+      add_frosting(f),
+    "columns in `by` selectors of `step_population_scaling` must be present in data and match")
 
   r <- epi_recipe(jhu) %>%
     step_population_scaling(case_rate,
@@ -271,8 +286,7 @@
                             geo_value %in% c("ca", "ny")) %>%
             dplyr::select(geo_value, time_value, case_rate))
 
-  wf <- epi_workflow(r,
-                     parsnip::linear_reg()) %>%
+  wf <- epi_workflow(r, parsnip::linear_reg()) %>%
     fit(jhu) %>%
     add_frosting(f)
 
diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd
new file mode 100644
index 000000000..23e110b6c
--- /dev/null
+++ b/vignettes/preprocessing-and-models.Rmd
@@ -0,0 +1,579 @@
+---
+title: Examples of Preprocessing and Models
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Examples of Preprocessing and Models}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## Introduction
+
+The `epipredict` package utilizes the `tidymodels` framework, namely
+[`recipes`](https://recipes.tidymodels.org/) for
+[dplyr](https://dplyr.tidyverse.org/)-like pipeable sequences
+of feature engineering and [`parsnip`](https://parsnip.tidymodels.org/) for a
+unified interface to a range of models.
+
+`epipredict` has additional customized feature engineering and preprocessing
+steps, such as `step_epi_shift()`, `step_population_scaling()`, and
+`step_epi_naomit()`.
+They can be used along with
+steps from the `recipes` package for additional feature engineering.
+
+In this vignette, we will illustrate some examples of how to use `epipredict`
+with `recipes` and `parsnip` for different epidemiological forecasting tasks.
+We will focus on basic autoregressive models, in which COVID cases and
+deaths in the near future are predicted using a linear combination of cases and
+deaths in the near past.
+
+The remainder of the vignette is split into three sections. In the first
+section, we will use Poisson regression to predict death counts. In the second
+section, we will use linear regression to predict death rates. Last but not
+least, we will create a classification model for hotspot predictions.
+
+```{r, warning=FALSE, message=FALSE}
+library(tidyr)
+library(dplyr)
+library(epidatr)
+library(epiprocess)
+library(epipredict)
+library(recipes)
+library(parsnip)
+library(workflows)
+library(poissonreg)
+```
+
+## Poisson Regression
+
+During the COVID-19 pandemic, the Centers for Disease Control and Prevention
+(CDC) gathered models and forecasts to characterize the state of an outbreak
+and its course. These were used to inform public health decision makers about
+the potential consequences of deploying control measures.
+
+One of the outcomes that the CDC forecasts is [death counts from COVID-19](https://www.cdc.gov/coronavirus/2019-ncov/science/forecasting/forecasting-us.html).
+Although there are many state-of-the-art models, we choose to use Poisson
+regression, the textbook example for modeling count data, as an illustration
+for using the `epipredict` package with other existing tidymodels packages.
+
+```{r poisson-reg-data}
+x <- covidcast(
+  data_source = "jhu-csse",
+  signals = "confirmed_incidence_num",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20210604, 20211231),
+  geo_values = "ca,fl,tx,ny,nj") %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, cases = value)
+
+y <- covidcast(
+  data_source = "jhu-csse",
+  signals = "deaths_incidence_num",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20210604, 20211231),
+  geo_values = "ca,fl,tx,ny,nj") %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, deaths = value)
+
+case_death_counts_subset <- x %>%
+  full_join(y, by = c("geo_value", "time_value")) %>%
+  as_epi_df()
+```
+
+The `case_death_counts_subset` dataset is fetched with the `epidatr` package,
+and contains the number of confirmed cases and deaths from June 4, 2021 to
+Dec 31, 2021 in some U.S. states.
+
+We wish to predict the 7-day-ahead death counts with lagged cases and deaths.
+Furthermore, we will include a dummy variable for each state. Using
+differential intercept coefficients, we can allow for an intercept shift
+between states.
+
+The model takes the form
+\begin{aligned}
+\log\left( \mu_{t+7} \right) = \beta_0 + \delta_1 s_{\text{state}_1} +
+\delta_2 s_{\text{state}_2} + \cdots + \beta_1 \text{deaths}_{t} + \nonumber \\
+\beta_2 \text{deaths}_{t-7} + \beta_3 \text{cases}_{t} +
+\beta_4 \text{cases}_{t-7},
+\end{aligned}
+
+where $\mu_{t+7} = \mathbb{E}(y_{t+7})$, and $y_{t+7}$ is assumed to follow a
+Poisson distribution with mean $\mu_{t+7}$; the $s_{\text{state}}$ are dummy
+variables for each state, taking values of either 0 or 1.
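+
+To make the state dummy encoding concrete, here is a small sketch (toy data,
+not part of the analysis) of what `step_dummy()` (used in the recipe below)
+does to a factor column:
+
+```{r}
+# step_dummy() expands a factor into 0/1 indicator columns, dropping the
+# first level ("ca" here) as the baseline absorbed by the intercept.
+toy <- data.frame(geo_value_factor = as.factor(c("ca", "fl", "ny")), y = 1:3)
+recipe(y ~ geo_value_factor, data = toy) %>%
+  step_dummy(geo_value_factor) %>%
+  prep() %>%
+  bake(new_data = NULL)
+```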
+
+Preprocessing steps will be performed using `recipes` functions to get the
+data ready for modeling.
+
+But before diving into them, it will be helpful to understand what `roles` are
+in the `recipes` framework.
+
+---
+
+#### Aside on `recipes`
+
+`recipes` can assign one or more roles to each column in the data. The roles
+are not restricted to a predefined set; they can be anything.
+For most conventional situations, they are typically "predictor" and/or
+"outcome". Additional roles enable targeted `step_*()` operations on specific
+variables or groups of variables.
+
+In our case, the role `predictor` is given to explanatory variables that go on
+the right-hand side of the model. The role `outcome` is given to the response
+variable that we wish to predict. `geo_value` and `time_value` are predefined
+roles that are unique to the `epipredict` package. Since we work with `epi_df`
+objects, all datasets should have `geo_value` and `time_value` passed through
+automatically, with these two roles assigned to the appropriate columns.
+
+The `recipes` package also allows [manual alterations of roles](https://recipes.tidymodels.org/reference/roles.html)
+in bulk. There are a couple of handy functions that can be used together to
+help us manipulate variable roles easily.
+
+> `update_role()` alters an existing role in the recipe or assigns an initial role
+> to variables that do not yet have a declared role.
+>
+> `add_role()` adds an additional role to variables that already have a role in
+> the recipe, without overwriting old roles.
+>
+> `remove_role()` eliminates a single existing role in the recipe.
+
+#### End aside
+
+---
+
+Notice that in the following preprocessing steps, we use `add_role()` on
+`geo_value_factor`: its default role is currently `raw`, and we would like the
+dummified variables to automatically be treated as predictors.
+
+```{r}
+case_death_counts_subset <- case_death_counts_subset %>%
+  mutate(geo_value_factor = as.factor(geo_value)) %>%
+  as_epi_df()
+
+epi_recipe(case_death_counts_subset)
+
+r <- epi_recipe(case_death_counts_subset) %>%
+  add_role(geo_value_factor, new_role = "predictor") %>%
+  step_dummy(geo_value_factor) %>%
+  ## Occasionally, data reporting errors / corrections result in negative
+  ## cases / deaths
+  step_mutate(cases = pmax(cases, 0), deaths = pmax(deaths, 0)) %>%
+  step_epi_lag(cases, deaths, lag = c(0, 7)) %>%
+  step_epi_ahead(deaths, ahead = 7, role = "outcome") %>%
+  step_epi_naomit()
+```
+
+After specifying the preprocessing steps, we will use the `parsnip` package
+for modeling and produce a prediction of death counts 7 days after the
+latest available date in the dataset.
+
+```{r}
+latest <- get_test_data(r, case_death_counts_subset)
+
+wf <- epi_workflow(r, parsnip::poisson_reg()) %>%
+  fit(case_death_counts_subset)
+
+predict(wf, latest) %>% filter(!is.na(.pred))
+```
+
+Note that the `time_value` corresponds to the last available date in the
+training set, **NOT** to the target date of the forecast
+(`r max(latest$time_value) + 7`).
+
+Let's take a look at the fit:
+```{r}
+extract_fit_engine(wf)
+```
+
+Up to now, we've used Poisson regression to model count data. Poisson
+regression can also be used to model rate data, such as case rates or death
+rates, by incorporating an offset term in the model.
+
+To model death rates, the Poisson regression would be expressed as
+\begin{aligned}
+\log\left( \mu_{t+7} \right) = \log(\text{population}) +
+\beta_0 + \delta_1 s_{\text{state}_1} +
+\delta_2 s_{\text{state}_2} + \cdots + \beta_1 \text{deaths}_{t} + \nonumber \\
+\beta_2 \text{deaths}_{t-7} + \beta_3 \text{cases}_{t} +
+\beta_4 \text{cases}_{t-7},
+\end{aligned}
+where $\log(\text{population})$ is the log of the state population that was
+used to scale the count data on the left-hand side of the equation.
+
+There are several ways to model rate data given count and population data.
+First, in the `parsnip` framework, we can specify the formula in `fit()`.
+However, by doing so we lose the ability to use the `recipes` framework to
+create new variables, since variables that do not exist in the original
+dataset, such as lags and leads, cannot be referenced directly in `fit()`.
+
+Alternatively, `step_population_scaling()` and `layer_population_scaling()`
+in the `epipredict` package can perform the population scaling if we provide
+the population data, and we can then use other models such as linear
+regression, which we will illustrate in the next section.
+
+## Linear Regression
+
+For COVID-19, the CDC required submission of case and death count predictions.
+However, the Delphi Group preferred to train on rate data instead, because it
+puts different locations on a similar scale.
+We can use linear regression to predict the death
+rates and use state population data to scale the rates to counts. We will do
+so using `layer_population_scaling()` from the `epipredict` package.
+
+Additionally, when forecasts are submitted, prediction intervals should be
+provided along with the point estimates. This can be obtained via the
+postprocessing layer `layer_residual_quantiles()`. It is worth pointing out
+that `layer_residual_quantiles()` should be used before population scaling,
+or else the transformation will make the results uninterpretable.
+
+We wish to predict the 7-day-ahead death counts with lagged case rates and
+death rates, along with extra behavioral indicator inputs. Namely, we will use
+survey data from the [COVID-19 Trends and Impact Survey](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/fb-survey.html#behavior-indicators).
+
+The survey data shows the estimated percentage of people who wore a mask for
+most or all of the time while in public in the past 7 days and the estimated
+percentage of respondents who reported that all or most people they encountered
+in public in the past 7 days maintained a distance of at least 6 feet.
+
+State-level population data from the 2019 U.S. Census is included in this
+package and will be used in `layer_population_scaling()`.
+```{r}
+behav_ind_mask <- covidcast(
+  data_source = "fb-survey",
+  signals = "smoothed_wwearing_mask_7d",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20210604, 20211231),
+  geo_values = "ca,fl,tx,ny,nj") %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, masking = value)
+
+behav_ind_distancing <- covidcast(
+  data_source = "fb-survey",
+  signals = "smoothed_wothers_distanced_public",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20210604, 20211231),
+  geo_values = "ca,fl,tx,ny,nj") %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, distancing = value)
+
+pop_dat <- state_census %>% select(abbr, pop)
+
+behav_ind <- behav_ind_mask %>%
+  full_join(behav_ind_distancing, by = c("geo_value", "time_value"))
+```
+
+Rather than using the raw mask-wearing / social-distancing metrics, for the
+sake of illustration, we'll convert both into categorical predictors.
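+
+The binning is done with `cut_number()` (from `ggplot2`) inside the recipe
+below. As a quick sketch of its behavior (toy numbers, not the survey data),
+it splits a numeric vector into roughly equal-frequency intervals:
+
+```{r}
+# cut_number(vals, 5) creates 5 bins, each containing about the same
+# number of observations (here, 2 apiece).
+vals <- c(55, 60, 62, 70, 71, 80, 88, 90, 91, 95)
+table(ggplot2::cut_number(vals, 5))
+```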
+
+```{r, echo=FALSE, message=FALSE, fig.align='center', fig.width=6, fig.height=4}
+library(ggplot2)
+behav_ind %>%
+  pivot_longer(masking:distancing) %>%
+  ggplot(aes(value, fill = geo_value)) +
+  geom_density(alpha = 0.5) +
+  scale_fill_brewer(palette = "Set1", name = "") +
+  theme_bw() +
+  scale_x_continuous(expand = c(0, 0)) +
+  scale_y_continuous(expand = expansion(c(0, .05))) +
+  facet_wrap(~name, scales = "free") +
+  theme(legend.position = "bottom")
+```
+
+We will take a subset of death rate and case rate data from the built-in
+dataset `case_death_rate_subset`.
+
+```{r}
+jhu <- case_death_rate_subset %>%
+  dplyr::filter(
+    time_value >= "2021-06-04",
+    time_value <= "2021-12-31",
+    geo_value %in% c("ca", "fl", "tx", "ny", "nj"))
+```
+
+Preprocessing steps will rely on functions from the `epipredict` package as
+well as the `recipes` package. There are also many functions in the `recipes`
+package that allow for
+[scalar transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations),
+such as log transformations and data centering. In our case, we will
+center the numerical predictors to allow for a meaningful interpretation of
+the intercept.
+
+```{r}
+jhu <- jhu %>%
+  mutate(geo_value_factor = as.factor(geo_value)) %>%
+  left_join(behav_ind, by = c("geo_value", "time_value")) %>%
+  as_epi_df()
+
+r <- epi_recipe(jhu) %>%
+  add_role(geo_value_factor, new_role = "predictor") %>%
+  step_dummy(geo_value_factor) %>%
+  step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>%
+  step_mutate(masking = cut_number(masking, 5),
+              distancing = cut_number(distancing, 5)) %>%
+  step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>%
+  step_center(contains("lag"), role = "predictor") %>%
+  step_epi_naomit()
+```
+
+As a sanity check, we can see what the training data will look like:
+```{r, warning = FALSE}
+glimpse(slice_sample(bake(prep(r, jhu), jhu), n = 6))
+```
+
+Before directly predicting the results, we need to add postprocessing layers
+to obtain the death counts instead of death rates. Note that the rates used
+so far are "per 100K people" rather than "per person".
+
+```{r}
+f <- frosting() %>%
+  layer_predict() %>%
+  layer_add_target_date("2022-01-07") %>%
+  layer_threshold(.pred, lower = 0) %>%
+  layer_residual_quantiles(probs = c(0.05, 0.95), symmetrize = FALSE) %>%
+  layer_naomit(.pred) %>%
+  layer_population_scaling(
+    .pred, .pred_distn, df = pop_dat, rate_rescaling = 1e5,
+    by = c("geo_value" = "abbr"), df_pop_col = "pop")
+
+wf <- epi_workflow(r, parsnip::linear_reg()) %>%
+  fit(jhu) %>%
+  add_frosting(f)
+
+latest <- get_test_data(recipe = r, x = jhu)
+p <- predict(wf, latest)
+p
+```
+
+The columns marked `*_scaled` have been rescaled to the correct units, in this
+case `deaths` rather than deaths per 100K people (the latter remain in
+`.pred`).
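+
+As a quick hand check of the arithmetic performed by
+`layer_population_scaling()` (illustrative numbers, not taken from the actual
+fit): each prediction is multiplied by the joined population and divided by
+`rate_rescaling`.
+
+```{r}
+# E.g., a predicted rate of 0.5 deaths per 100K in a state of 39.5M people:
+pred_rate <- 0.5
+pop <- 39.5e6
+pred_rate * pop / 1e5  # 197.5 deaths
+```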
+
+To look at the prediction intervals:
+```{r}
+p %>%
+  select(geo_value, target_date, .pred_scaled, .pred_distn_scaled) %>%
+  mutate(.pred_distn_scaled = nested_quantiles(.pred_distn_scaled)) %>%
+  unnest(.pred_distn_scaled) %>%
+  pivot_wider(names_from = tau, values_from = q)
+```
+
+Last but not least, let's take a look at the regression fit and check the
+coefficients:
+```{r, echo = FALSE}
+summary(extract_fit_engine(wf))
+```
+
+## Classification
+
+Sometimes it is preferable to create a predictive model for surges or upswings
+rather than for raw values. In this case,
+the target is to predict whether the future will have increased case rates
+(up), decreased case rates (down), or flat case rates (flat) relative to the
+current level. Such models are often
+referred to as hotspot prediction models. We will use the model notation from
+[McDonald, Bien, Green, Hu, et al.](#references) but extend the application
+to predict three categories instead of two.
+
+Hotspot prediction uses a categorical response variable defined in terms of
+the relative change of $Y_{\ell, t+a}$ compared to $Y_{\ell, t}$,
+where $Y_{\ell, t}$ denotes the case rate in location $\ell$ at time $t$.
+We define the response variable as follows:
+
+$$
+  Z_{\ell, t}=
+  \begin{cases}
+    \text{up}, & \text{if}\ Y^{\Delta}_{\ell, t} > 0.25 \\
+    \text{down}, & \text{if}\ Y^{\Delta}_{\ell, t} < -0.20 \\
+    \text{flat}, & \text{otherwise}
+  \end{cases}
+$$
+
+where $Y^{\Delta}_{\ell, t} = (Y_{\ell, t} - Y_{\ell, t-7}) / (Y_{\ell, t-7})$.
+We say location $\ell$ is a hotspot at time $t$ when $Z_{\ell,t}$ is
+categorized as "up", meaning the number of newly reported cases over the past
+7 days has increased by at least 25% compared to the preceding week. When
+$Z_{\ell,t}$ is categorized as "down", there has been at least a 20% decrease
+in newly reported cases over the past 7 days. Otherwise, we consider the
+trend to be flat.
+
+The expression of the multinomial regression we will model is as follows:
+
+$$
+\pi_{j}(x) = \text{Pr}(Y = j|x) = \frac{e^{g_j(x)}}{\sum_{k=0}^2 e^{g_k(x)}}
+$$
+where $j$ is either down ($Y=0$), flat ($Y=1$), or up ($Y=2$),
+$$g_0(x) = 0,$$
+
+\begin{aligned}
+g_1(x) = \text{ln}\left(\frac{Pr(Y=1|x)}{Pr(Y=0|x)}\right) &=
+\beta_{10} + \beta_{11} x_{\text{time\_value}} + \delta_{10} s_{\text{state}_1} +
+\delta_{11} s_{\text{state}_2} + \cdots \nonumber \\
+&\quad + \beta_{12} Y^{\Delta}_{\ell, t} +
+\beta_{13} Y^{\Delta}_{\ell, t-7},
+\end{aligned}
+
+\begin{aligned}
+g_2(x) = \text{ln}\left(\frac{Pr(Y=2|x)}{Pr(Y=0|x)}\right) &=
+\beta_{20} + \beta_{21} x_{\text{time\_value}} + \delta_{20} s_{\text{state}_1} +
+\delta_{21} s_{\text{state}_2} + \cdots \nonumber \\
+&\quad + \beta_{22} Y^{\Delta}_{\ell, t} +
+\beta_{23} Y^{\Delta}_{\ell, t-7}.
+\end{aligned}
+
+Preprocessing steps are similar to those for the previous models, with an
+additional step of categorizing the response variable.
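+
+As a toy illustration of this labeling rule (made-up percentage changes,
+using the same thresholds as the recipe below):
+
+```{r}
+# A 30% rise is "up", a 25% drop is "down", and a 5% rise is "flat".
+pct_diff <- c(0.30, -0.25, 0.05)
+dplyr::case_when(
+  pct_diff < -0.20 ~ "down",
+  pct_diff > 0.25 ~ "up",
+  TRUE ~ "flat")
+```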
+
+We will take a subset of death rate and case rate data from our built-in
+dataset `case_death_rate_subset`.
+
+```{r}
+jhu <- case_death_rate_subset %>%
+  dplyr::filter(time_value >= "2021-06-04",
+                time_value <= "2021-12-31",
+                geo_value %in% c("ca", "fl", "tx", "ny", "nj")) %>%
+  mutate(geo_value_factor = as.factor(geo_value)) %>%
+  as_epi_df()
+
+r <- epi_recipe(jhu) %>%
+  add_role(time_value, new_role = "predictor") %>%
+  step_dummy(geo_value_factor) %>%
+  step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
+  step_epi_ahead(case_rate, ahead = 7, role = "predictor") %>%
+  step_mutate(
+    pct_diff_ahead = case_when(
+      lag_7_case_rate == 0 ~ 0,
+      TRUE ~ (ahead_7_case_rate - lag_0_case_rate) / lag_0_case_rate),
+    pct_diff_wk1 = case_when(
+      lag_7_case_rate == 0 ~ 0,
+      TRUE ~ (lag_0_case_rate - lag_7_case_rate) / lag_7_case_rate),
+    pct_diff_wk2 = case_when(
+      lag_14_case_rate == 0 ~ 0,
+      TRUE ~ (lag_7_case_rate - lag_14_case_rate) / lag_14_case_rate)) %>%
+  step_mutate(
+    response = case_when(
+      pct_diff_ahead < -0.20 ~ "down",
+      pct_diff_ahead > 0.25 ~ "up",
+      TRUE ~ "flat"),
+    role = "outcome") %>%
+  step_rm(death_rate, case_rate, lag_0_case_rate, lag_7_case_rate,
+          lag_14_case_rate, ahead_7_case_rate, pct_diff_ahead) %>%
+  step_epi_naomit()
+```
+
+We will fit the multinomial regression and take a look at the predictions:
+
+```{r, warning=FALSE}
+wf <- epi_workflow(r, parsnip::multinom_reg()) %>%
+  fit(jhu)
+
+latest <- get_test_data(recipe = r, x = jhu)
+predict(wf, latest) %>% filter(!is.na(.pred_class))
+```
+
+We can also take a look at the fit:
+```{r}
+extract_fit_engine(wf)
+```
+
+One could also use a formula in `epi_workflow()`, via `add_formula()`, to
+achieve the same results as above. However, only one of `add_formula()`,
+`add_recipe()`, or `workflow_variables()` can be specified. For the purpose
+of demonstrating `add_formula` rather than `add_recipe`, we will `prep` and
+`bake` our recipe to return a `data.frame` that could be used for model
+fitting.
+```{r}
+b <- bake(prep(r, jhu), jhu)
+
+epi_workflow() %>%
+  add_formula(response ~ geo_value + pct_diff_wk1 + pct_diff_wk2) %>%
+  add_model(parsnip::multinom_reg()) %>%
+  fit(data = b)
+```
+
+## Benefits of Lagging and Leading in `epipredict`
+
+The `step_epi_ahead()` and `step_epi_lag()` functions in the `epipredict`
+package are handy for creating correct lags and leads for future predictions.
+
+Let's start with a simple dataset and preprocessing:
+```{r}
+ex <- case_death_rate_subset %>%
+  dplyr::filter(time_value >= "2021-12-01",
+                time_value <= "2021-12-31",
+                geo_value == "ca")
+
+dim(ex)
+```
+
+We want to predict death rates on `2022-01-07`, which is 7 days ahead of the
+latest available date in our dataset.
+
+We will compare two methods of trying to create lags and leads:
+```{r}
+p1 <- epi_recipe(ex) %>%
+  step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
+  step_epi_lag(death_rate, lag = c(0, 7, 14)) %>%
+  step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>%
+  step_epi_naomit() %>%
+  prep()
+
+b1 <- bake(p1, ex)
+b1
+
+
+p2 <- epi_recipe(ex) %>%
+  step_mutate(lag0case_rate = lag(case_rate, 0),
+              lag7case_rate = lag(case_rate, 7),
+              lag14case_rate = lag(case_rate, 14),
+              lag0death_rate = lag(death_rate, 0),
+              lag7death_rate = lag(death_rate, 7),
+              lag14death_rate = lag(death_rate, 14),
+              ahead7death_rate = lead(death_rate, 7)) %>%
+  step_epi_naomit() %>%
+  prep()
+
+b2 <- bake(p2, ex)
+b2
+```
+
+Notice the difference in the number of rows that `b1` and `b2` return. This
+is because the second version, the one that does not use `step_epi_ahead()`
+and `step_epi_lag()`, has omitted dates compared to the one that uses the
+`epipredict` functions.
+
+```{r}
+dates_used_in_training1 <- b1 %>%
+  select(-ahead_7_death_rate) %>%
+  na.omit() %>%
+  select(time_value)
+
+dates_used_in_training1
+
+dates_used_in_training2 <- b2 %>%
+  select(-ahead7death_rate) %>%
+  na.omit() %>%
+  select(time_value)
+
+dates_used_in_training2
+```
+
+The model that is trained without the `epipredict` functions predicts 7 days
+ahead from `r max(dates_used_in_training2$time_value)`
+instead of 7 days ahead from `r max(dates_used_in_training1$time_value)`.
+
+## References
+
+McDonald, Bien, Green, Hu, et al. "Can auxiliary indicators improve COVID-19
+forecasting and hotspot prediction?" Proceedings of the National Academy of
+Sciences 118.51 (2021): e2111453118. [doi:10.1073/pnas.2111453118](
+https://doi.org/10.1073/pnas.2111453118)
+
+## Attribution
+
+This object contains a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html).
+
+This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University
+on behalf of its Center for Systems Science and Engineering. Copyright Johns Hopkins University 2020.