From 41fb8de9dce4ac2b78ccb37e5be52e2b4b0bf125 Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Wed, 27 Jul 2022 22:19:12 -0700
Subject: [PATCH 01/29] update some work

---
 vignettes/more-recipes.Rmd | 116 +++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 vignettes/more-recipes.Rmd

diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd
new file mode 100644
index 000000000..4d2a9a9d9
--- /dev/null
+++ b/vignettes/more-recipes.Rmd
@@ -0,0 +1,116 @@
+---
+title: Apply other steps and models from recipes and parsnip packages
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Apply other steps and models from recipes and parsnip packages}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+The `epipredict` package utilizes the `tidymodels` framework, namely 
+[`recipes`](https://recipes.tidymodels.org/) for 
+[dplyr](https://dplyr.tidyverse.org/)-like pipeable sequences 
+of feature engineering and [`parsnip`](https://parsnip.tidymodels.org/) for a 
+unified interface to a range of models. 
+
+`epipredict` has additional customized feature engineering and preprocessing steps 
+tailored to fit the needs of epidemiology forecasting, such as `step_epi_shift()`, 
+`step_population_scaling()`, `step_epi_naomit()`. They can be used along with 
+steps from the `recipes` package for more feature engineering. 
+
+In this vignette, we will demonstrate how to weave `recipes` steps and `parsnip`
+models into the `epipredict` workflow. 
+
+```{r, warning=FALSE, message =FALSE}
+library(epipredict)
+library(epiprocess)
+library(recipes)
+library(parsnip)
+library(workflows)
+```
+
+We will use the built-in dataset `case_death_rate_subset` and apply feature 
+engineering steps and fit a few models from `parsnip` to predict future death 
+rates by states. 
+```{r}
+x <- case_death_rate_subset
+head(x)
+```
+There are `r dim(x)[1]` rows and `r dim(x)[2]` columns in the data, with dates
+ranging from `r min(x$time_value)` to `r max(x$time_value)` in 
+`r n_distinct(x$geo_value)` states in the U.S.
+
+# Preprocessing and Feature Engineering
+
+In this section, we focus on showing how `step_*()` functions from the `recipes`
+package can also be applied in our `epipredict` workflows. 
+
+First, create a recipe object from the original data and then specify the 
+preprocessing steps. Steps are then sequentially added.
+
+```{r create-recipe-object}
+r <- epi_recipe(x)
+r
+```
+
+The function automatically categorizes roles into `geo_value`, `time_value` and 
+`raw` based on variable names, where `raw` includes `case_rate` and `death_rate`. 
+
+We first create a made-up population dataset by states to allow the conversion
+from `case_rate` and `death_rate` into `cases` and  `deaths`. The binary variable
+`social_distancing` indicates whether the state followed social distancing rules.
+A 'throw-away' column with all ones is also created to show that `step_rm()` 
+can be used to remove unwanted columns.
+```{r madeup-population-data}
+pop_data <- data.frame(states = unique(x$geo_value),
+                       pop = seq(1e5, 1e7, length.out = n_distinct(x$geo_value)),
+                       social_distancing = rbinom(n = n_distinct(x$geo_value), 
+                                                  size = 1,
+                                                  prob = 0.5),
+                       throw_away = 1)
+
+head(pop_data)
+```
+
+Next, we will feature engineer `case_rate` and `death_rate` into `cases` and 
+`deaths`and create lags and leads based on them. 
+
+```{r create-new-columns-in-recipe}
+r <- r %>%
+  step_population_scaling(case_rate, death_rate, df = pop_data, 
+                          by = c("geo_value" = "states"),
+                          create_new = FALSE,
+                          df_pop_col = "pop") %>%
+  step_rename(cases = case_rate, deaths = death_rate) %>%
+  step_rm(throw_away) %>%
+  step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>%
+  step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>%
+  step_epi_ahead(deaths, ahead = 7, role = "outcome") %>%
+  step_epi_naomit()
+```
+
+Notice that `step_rename()` was used for renaming of the variables, and `step_rm()`
+was used to remove redundant columns.
+
+A quick view of what the prepossessed data looks like on training data:
+
+```{r peak-data-1}
+slice_sample(bake(prep(r, x),x), n = 6)
+```
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

From 9c6eaebcb3f7cef4356290167ef9b6d9dd93ca2e Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Thu, 28 Jul 2022 08:54:06 -0700
Subject: [PATCH 02/29] switch to inverse population

---
 vignettes/more-recipes.Rmd | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd
index 4d2a9a9d9..30f48602e 100644
--- a/vignettes/more-recipes.Rmd
+++ b/vignettes/more-recipes.Rmd
@@ -62,7 +62,7 @@ A 'throw-away' column with all ones is also created to show that `step_rm()`
 can be used to remove unwanted columns.
 ```{r madeup-population-data}
 pop_data <- data.frame(states = unique(x$geo_value),
-                       pop = seq(1e5, 1e7, length.out = n_distinct(x$geo_value)),
+                       inv_pop = 1/seq(1e5, 1e7, length.out = n_distinct(x$geo_value)),
                        social_distancing = rbinom(n = n_distinct(x$geo_value), 
                                                   size = 1,
                                                   prob = 0.5),
@@ -79,7 +79,7 @@ r <- r %>%
   step_population_scaling(case_rate, death_rate, df = pop_data, 
                           by = c("geo_value" = "states"),
                           create_new = FALSE,
-                          df_pop_col = "pop") %>%
+                          df_pop_col = "inv_pop") %>%
   step_rename(cases = case_rate, deaths = death_rate) %>%
   step_rm(throw_away) %>%
   step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>%

From 9d7c70d242101d4f7aa2ee1880d94331b17c340f Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Sun, 31 Jul 2022 14:41:18 -0700
Subject: [PATCH 03/29] error: prediction is empty

---
 man/epi_workflow.Rd        |  2 +-
 vignettes/more-recipes.Rmd | 60 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/man/epi_workflow.Rd b/man/epi_workflow.Rd
index bcf0e78aa..f9d753d84 100644
--- a/man/epi_workflow.Rd
+++ b/man/epi_workflow.Rd
@@ -11,7 +11,7 @@ epi_workflow(preprocessor = NULL, spec = NULL, postprocessor = NULL)
 \itemize{
 \item A formula, passed on to \code{\link[workflows:add_formula]{add_formula()}}.
 \item A recipe, passed on to \code{\link[workflows:add_recipe]{add_recipe()}}.
-\item A \code{\link[workflows:add_variables]{workflow_variables()}} object, passed on to \code{\link[workflows:add_variables]{add_variables()}}.
+\item A \code{\link[workflows:workflow_variables]{workflow_variables()}} object, passed on to \code{\link[workflows:add_variables]{add_variables()}}.
 }}
 
 \item{spec}{An optional parsnip model specification to add to the workflow.
diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd
index 30f48602e..5e2eeee50 100644
--- a/vignettes/more-recipes.Rmd
+++ b/vignettes/more-recipes.Rmd
@@ -85,7 +85,8 @@ r <- r %>%
   step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>%
   step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>%
   step_epi_ahead(deaths, ahead = 7, role = "outcome") %>%
-  step_epi_naomit()
+  step_epi_naomit() 
+
 ```
 
 Notice that `step_rename()` was used for renaming of the variables, and `step_rm()`
@@ -96,18 +97,75 @@ A quick view of what the prepossessed data looks like on training data:
 ```{r peak-data-1}
 slice_sample(bake(prep(r, x),x), n = 6)
 ```
+There are also many functions in the `recipes` package that allow for 
+spline transformation or other types of function transformations. 
+
+```{r, warning=FALSE}
+r <- r %>% 
+  step_sqrt(contains("lag_")) %>%
+  step_rm(cases, deaths)
+  
+
+slice_sample(bake(prep(r, x),x), n = 6)
+```
 
 
+Sometimes, the exact counts are desired so fitting a linear regression would be 
+a good idea. On the other hand, instead of predicting 
+7-day ahead death counts, we would like to predict if the death counts 7 days 
+from the latest date fall into some categories. For the purpose of illustration, 
+we define two categories: less than 10K(low) and more than 10K(high). 
 
+We can resort to the `recipes` package again:
 
+```{r, warning = FALSE}
+binner <- function(x) {
+  x <- cut(x, breaks = c(0, 10000, Inf), include.lowest = TRUE)
+  # now return the group number
+  as.numeric(x)
+}
 
+inc <- c("low", "high")
 
+r_categorical <- r %>% 
+  step_num2factor(ahead_7_deaths, 
+                  role = "outcome", 
+                  transform = binner, 
+                  levels = inc)
 
+slice_sample(bake(prep(r_categorical, x),x), n = 6)
+```
 
 
 
+# Model Fitting
 
+Once the pre-processing steps are done, we can move on to model fitting using 
+the `parsnip` package. 
 
+First, we can try fitting a linear regression:
+
+```{r linear-fit}
+wf <- epi_workflow(r, parsnip::linear_reg()) %>%
+    fit(x) 
+# the fit
+wf$fit
+latest <- get_test_data(recipe = r, x = x)
+p <- predict(wf, latest) ## ERROR, NOTHING IS PREDICTED
+p
+```
+
+
+Logistic Regression
+```{r poisson}
+wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>%
+    fit(x) 
+# the fit
+wf$fit
+latest <- get_test_data(recipe = r_categorical, x = x)
+p <- predict(wf, latest) ## ERROR, NOTHING IS PREDICTED
+p
+```
 
 
 

From 8db67b2ac248eddc456ae2b43c122e39fc661a6f Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Tue, 2 Aug 2022 10:46:14 -0700
Subject: [PATCH 04/29] just needed to add filter to remove NAs oops

---
 vignettes/more-recipes.Rmd | 48 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd
index 5e2eeee50..7a3c52f2a 100644
--- a/vignettes/more-recipes.Rmd
+++ b/vignettes/more-recipes.Rmd
@@ -151,7 +151,7 @@ wf <- epi_workflow(r, parsnip::linear_reg()) %>%
 # the fit
 wf$fit
 latest <- get_test_data(recipe = r, x = x)
-p <- predict(wf, latest) ## ERROR, NOTHING IS PREDICTED
+p <- predict(wf, latest) %>% filter(!is.na(.pred))
 p
 ```
 
@@ -163,12 +163,56 @@ wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>%
 # the fit
 wf$fit
 latest <- get_test_data(recipe = r_categorical, x = x)
-p <- predict(wf, latest) ## ERROR, NOTHING IS PREDICTED
+p <- predict(wf, latest) %>% filter(!is.na(.pred_class))
 p
 ```
 
 
 
+## Issues from vignette
+1. `step_cut()` did not work
+```{r, error=TRUE}
+r_test1 <- epi_recipe(x) %>%
+  step_population_scaling(case_rate, death_rate, df = pop_data, 
+                          by = c("geo_value" = "states"),
+                          create_new = FALSE,
+                          df_pop_col = "inv_pop") %>%
+  step_rename(cases = case_rate, deaths = death_rate) %>%
+  step_rm(throw_away) %>%
+  step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>%
+  step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>%
+  step_epi_ahead(deaths, ahead = 7, role = "outcome") %>%
+  step_epi_naomit() %>%
+  step_cut(ahead_7_deaths, role = "outcome", breaks= c(0, 10000, Inf))
 
+slice_sample(bake(prep(r_test1, x),x), n = 6)
+```
+2. Predictions are not produced, despite different models applied to it. Perhaps
+something wrong with the preprocessing steps?
+Maybe without `step_population_scaling()`:
+```{r}
+r_test2 <- epi_recipe(x) %>%
+  step_epi_lag(case_rate, lag = c(0, 7, 14), role = "predictor") %>%
+  step_epi_lag(death_rate, lag = c(0, 7, 14), role = "predictor") %>%
+  step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>%
+  step_naomit(all_predictors()) %>%
+  step_naomit(all_outcomes(), skip = TRUE)
+
+slice_sample(bake(prep(r_test2, x),x), n = 6)
+
+wf <- epi_workflow(r_test2, parsnip::linear_reg()) %>%
+    fit(x) 
+# the fit
+wf$fit
+latest <- get_test_data(recipe = r_test2, x = x)
+# latest <- x %>%
+#   filter(!is.na(case_rate), !is.na(death_rate)) %>%
+#   group_by(geo_value) %>%
+#   slice_tail(n = 15) %>% # have lags 0,...,14, so need 15 for a complete case
+#   ungroup()
+p <- predict(wf, latest)  %>% filter(!is.na(.pred))
+p
+
+```
 
 

From 4ea61b6761b55983f02853df3e90bbec86ba428b Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Tue, 2 Aug 2022 12:16:35 -0700
Subject: [PATCH 05/29] first draft of vignette done

---
 vignettes/more-recipes.Rmd | 174 +++++++++++++++++++++----------------
 1 file changed, 99 insertions(+), 75 deletions(-)

diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd
index 7a3c52f2a..c1328f464 100644
--- a/vignettes/more-recipes.Rmd
+++ b/vignettes/more-recipes.Rmd
@@ -20,20 +20,24 @@ steps from the `recipes` package for more feature engineering.
 In this vignette, we will demonstrate how to weave `recipes` steps and `parsnip`
 models into the `epipredict` workflow. 
 
+First, let's load the necessary packages:
+
 ```{r, warning=FALSE, message =FALSE}
 library(epipredict)
 library(epiprocess)
 library(recipes)
 library(parsnip)
 library(workflows)
+library(poissonreg)
 ```
 
 We will use the built-in dataset `case_death_rate_subset` and apply feature 
 engineering steps and fit a few models from `parsnip` to predict future death 
-rates by states. 
+rates by states 7 days ahead from Dec 31, 2021.
+
 ```{r}
 x <- case_death_rate_subset
-head(x)
+tail(x)
 ```
 There are `r dim(x)[1]` rows and `r dim(x)[2]` columns in the data, with dates
 ranging from `r min(x$time_value)` to `r max(x$time_value)` in 
@@ -62,7 +66,8 @@ A 'throw-away' column with all ones is also created to show that `step_rm()`
 can be used to remove unwanted columns.
 ```{r madeup-population-data}
 pop_data <- data.frame(states = unique(x$geo_value),
-                       inv_pop = 1/seq(1e5, 1e7, length.out = n_distinct(x$geo_value)),
+                       inv_pop = 1/seq(1e5, 1e7, 
+                                       length.out = n_distinct(x$geo_value)),
                        social_distancing = rbinom(n = n_distinct(x$geo_value), 
                                                   size = 1,
                                                   prob = 0.5),
@@ -85,36 +90,31 @@ r <- r %>%
   step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>%
   step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>%
   step_epi_ahead(deaths, ahead = 7, role = "outcome") %>%
-  step_epi_naomit() 
+  step_epi_naomit() %>%
+  step_rm(cases, deaths)
 
 ```
 
-Notice that `step_rename()` was used for renaming of the variables, and `step_rm()`
-was used to remove redundant columns.
+Notice that `step_rename()` was used for renaming of the variables, and 
+`step_rm()` was used to remove redundant columns.
 
-A quick view of what the prepossessed data looks like on training data:
+A quick view of what columns exist in the the prepossessed data so far:
 
 ```{r peak-data-1}
-slice_sample(bake(prep(r, x),x), n = 6)
+names(bake(prep(r, x),x))
 ```
-There are also many functions in the `recipes` package that allow for 
-spline transformation or other types of function transformations. 
-
-```{r, warning=FALSE}
-r <- r %>% 
-  step_sqrt(contains("lag_")) %>%
-  step_rm(cases, deaths)
-  
 
-slice_sample(bake(prep(r, x),x), n = 6)
-```
+There are also many functions in the `recipes` package that allow for 
+[function transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations),
+such as log transformations and spline transformations. 
 
+Sometimes, the exact counts are desired, therefore fitting a linear regression 
+or poisson regression would be a good idea. 
 
-Sometimes, the exact counts are desired so fitting a linear regression would be 
-a good idea. On the other hand, instead of predicting 
-7-day ahead death counts, we would like to predict if the death counts 7 days 
-from the latest date fall into some categories. For the purpose of illustration, 
-we define two categories: less than 10K(low) and more than 10K(high). 
+On the other hand, say instead of predicting 7-day ahead death counts, we would 
+like to predict if the counts are high or low based on some given threshold. 
+For the purpose of illustration, we define two categories: less than 10K(low) 
+and more than 10K(high). 
 
 We can resort to the `recipes` package again:
 
@@ -133,86 +133,110 @@ r_categorical <- r %>%
                   transform = binner, 
                   levels = inc)
 
-slice_sample(bake(prep(r_categorical, x),x), n = 6)
 ```
 
+As a sanity check we can see now that the preprocessed data has a categorical 
+outcome in `ahead_7_deaths`
+```{r, warning FALSE, echo = FALSE}
+glimpse(slice_sample(bake(prep(r_categorical, x),x), n = 6))
+```
 
-
-# Model Fitting
+# Model Fitting and Post-processing
 
 Once the pre-processing steps are done, we can move on to model fitting using 
 the `parsnip` package. 
 
+### Linear Regression
+
 First, we can try fitting a linear regression:
 
-```{r linear-fit}
+```{r linear-fit-workflow, warning = FALSE}
 wf <- epi_workflow(r, parsnip::linear_reg()) %>%
     fit(x) 
-# the fit
-wf$fit
+
 latest <- get_test_data(recipe = r, x = x)
+```
+
+We take a look at the linear regression fit and the values of the coefficients.
+
+```{r linear-fit, warning = FALSE}
+wf$fit$fit # look at the model fit and coefficients
+
 p <- predict(wf, latest) %>% filter(!is.na(.pred))
+
 p
 ```
 
+The predictions are saved in the `.pred` column and represents the squared roots
+of predicted death counts 7 days from Dec 31. 2021. 
 
-Logistic Regression
-```{r poisson}
-wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>%
-    fit(x) 
-# the fit
-wf$fit
-latest <- get_test_data(recipe = r_categorical, x = x)
-p <- predict(wf, latest) %>% filter(!is.na(.pred_class))
-p
+Some post-processing can be done to revert back to death rates
+
+```{r linear-reg-post-process}
+f <- frosting() %>%
+    layer_predict() %>%
+    layer_threshold(.pred) %>%
+    layer_naomit(.pred) %>%
+    layer_population_scaling(.pred, df = pop_data,
+                             by =  c("geo_value" = "states"),
+                             df_pop_col = "inv_pop")
+
+wf <- epi_workflow(r, parsnip::linear_reg()) %>%
+    fit(x) %>%
+    add_frosting(f)
+
+predict(wf, latest) %>% select( - social_distancing, - throw_away)
 ```
 
+### Poisson regression
 
+For count data, we can use poisson regression. Although the counts here are not 
+integer since they were calculated from case rates multiplying population, some 
+extra preprocessing steps can help us round them to integers to allow fitting a 
+poisson regression.
 
-## Issues from vignette
-1. `step_cut()` did not work
-```{r, error=TRUE}
-r_test1 <- epi_recipe(x) %>%
-  step_population_scaling(case_rate, death_rate, df = pop_data, 
-                          by = c("geo_value" = "states"),
-                          create_new = FALSE,
-                          df_pop_col = "inv_pop") %>%
-  step_rename(cases = case_rate, deaths = death_rate) %>%
-  step_rm(throw_away) %>%
-  step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>%
-  step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>%
-  step_epi_ahead(deaths, ahead = 7, role = "outcome") %>%
-  step_epi_naomit() %>%
-  step_cut(ahead_7_deaths, role = "outcome", breaks= c(0, 10000, Inf))
+We will also apply post-processing steps to the workflow similar to the ones above.
+
+```{r poisson-fit}
+r <- r %>% step_mutate(ahead_7_deaths = round(ahead_7_deaths, 0), 
+                       role = "predictor") %>%
+  step_filter(ahead_7_deaths > 0)
 
-slice_sample(bake(prep(r_test1, x),x), n = 6)
+latest <- get_test_data(recipe = r, x = x)
+
+wf <- epi_workflow(r, parsnip::poisson_reg()) %>%
+    fit(x) %>%
+    add_frosting(f)
+
+predict(wf, latest) %>% select( - social_distancing, - throw_away)
+```
+
+Let's also take a look at the poisson regression fit:
+```{r, echo =FALSE}
+wf$fit$fit
 ```
-2. Predictions are not produced, despite different models applied to it. Perhaps
-something wrong with the preprocessing steps?
-Maybe without `step_population_scaling()`:
-```{r}
-r_test2 <- epi_recipe(x) %>%
-  step_epi_lag(case_rate, lag = c(0, 7, 14), role = "predictor") %>%
-  step_epi_lag(death_rate, lag = c(0, 7, 14), role = "predictor") %>%
-  step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>%
-  step_naomit(all_predictors()) %>%
-  step_naomit(all_outcomes(), skip = TRUE)
 
-slice_sample(bake(prep(r_test2, x),x), n = 6)
+### Logistic Regression
 
-wf <- epi_workflow(r_test2, parsnip::linear_reg()) %>%
+We use a logistic regression to predict whether 7-day ahead death counts are high
+or low. Post-processing steps do not need to be applied here. 
+```{r logistic, warning = FALSE}
+wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>%
     fit(x) 
-# the fit
-wf$fit
-latest <- get_test_data(recipe = r_test2, x = x)
-# latest <- x %>%
-#   filter(!is.na(case_rate), !is.na(death_rate)) %>%
-#   group_by(geo_value) %>%
-#   slice_tail(n = 15) %>% # have lags 0,...,14, so need 15 for a complete case
-#   ungroup()
-p <- predict(wf, latest)  %>% filter(!is.na(.pred))
+latest <- get_test_data(recipe = r_categorical, x = x)
+p <- predict(wf, latest) %>% filter(!is.na(.pred_class))
 p
+```
 
+The model fit is as follows:
+```{r echo=FALSE}
+wf$fit$fit
 ```
 
+# Attribution
+
+This object contains a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API.](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html)
+
+This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University 
+on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020.
 

From 4c026f58610e8513f3d59bf8b95c69a9086cd653 Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Tue, 2 Aug 2022 14:49:57 -0700
Subject: [PATCH 06/29] add poissonreg package to suggests

---
 DESCRIPTION | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DESCRIPTION b/DESCRIPTION
index 7a5cdbcf8..bbb863371 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -44,6 +44,7 @@ Imports:
     vctrs,
     workflows
 Suggests: 
+    poissonreg,
     covidcast,
     data.table,
     ggplot2,

From 4dcd39387faac71302b2119f0b8eb2189ba5ee45 Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Tue, 2 Aug 2022 15:39:30 -0700
Subject: [PATCH 07/29] default `by` should return column used for joining

---
 tests/testthat/test-population_scaling.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R
index b2e32c33a..a0e79110a 100644
--- a/tests/testthat/test-population_scaling.R
+++ b/tests/testthat/test-population_scaling.R
@@ -177,7 +177,6 @@ test_that("test joining by default columns", {
                             df_pop_col = "values",
                             by = NULL,
                             suffix = "_scaled") %>%
-    step_rm(geo_value.df) %>% # additional join columns added so need to remove
     step_epi_lag(case_rate_scaled, lag = c(7, 14)) %>% # cases
     step_epi_ahead(case_rate_scaled, ahead = 7, role = "outcome") %>% # cases
     step_naomit(all_predictors()) %>%
@@ -185,7 +184,8 @@ test_that("test joining by default columns", {
 
   prep <- prep(r, jhu)
 
-  expect_silent(b <- bake(prep, jhu))
+  expect_message(b <- bake(prep, jhu)) # a message of which column was joined by default
+
 
   f <- frosting() %>%
     layer_predict() %>%
@@ -207,7 +207,7 @@ test_that("test joining by default columns", {
                             dplyr::select(geo_value, time_value, case_rate))
 
 
-  expect_silent(p <- predict(wf, latest))
+  expect_message(p <- predict(wf, latest))
 
 })
 

From c828579acdd6b39ed8fcce97ce69c0a00a49a5c7 Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Wed, 27 Jul 2022 22:19:12 -0700
Subject: [PATCH 08/29] update some work

---
 vignettes/more-recipes.Rmd | 116 +++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 vignettes/more-recipes.Rmd

diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd
new file mode 100644
index 000000000..4d2a9a9d9
--- /dev/null
+++ b/vignettes/more-recipes.Rmd
@@ -0,0 +1,116 @@
+---
+title: Apply other steps and models from recipes and parsnip packages
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Apply other steps and models from recipes and parsnip packages}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+The `epipredict` package utilizes the `tidymodels` framework, namely 
+[`recipes`](https://recipes.tidymodels.org/) for 
+[dplyr](https://dplyr.tidyverse.org/)-like pipeable sequences 
+of feature engineering and [`parsnip`](https://parsnip.tidymodels.org/) for a 
+unified interface to a range of models. 
+
+`epipredict` has additional customized feature engineering and preprocessing steps 
+tailored to fit the needs of epidemiology forecasting, such as `step_epi_shift()`, 
+`step_population_scaling()`, `step_epi_naomit()`. They can be used along with 
+steps from the `recipes` package for more feature engineering. 
+
+In this vignette, we will demonstrate how to weave `recipes` steps and `parsnip`
+models into the `epipredict` workflow. 
+
+```{r, warning=FALSE, message =FALSE}
+library(epipredict)
+library(epiprocess)
+library(recipes)
+library(parsnip)
+library(workflows)
+```
+
+We will use the built-in dataset `case_death_rate_subset` and apply feature 
+engineering steps and fit a few models from `parsnip` to predict future death 
+rates by states. 
+```{r}
+x <- case_death_rate_subset
+head(x)
+```
+There are `r dim(x)[1]` rows and `r dim(x)[2]` columns in the data, with dates
+ranging from `r min(x$time_value)` to `r max(x$time_value)` in 
+`r n_distinct(x$geo_value)` states in the U.S.
+
+# Preprocessing and Feature Engineering
+
+In this section, we focus on showing how `step_*()` functions from the `recipes`
+package can also be applied in our `epipredict` workflows. 
+
+First, create a recipe object from the original data and then specify the 
+preprocessing steps. Steps are then sequentially added.
+
+```{r create-recipe-object}
+r <- epi_recipe(x)
+r
+```
+
+The function automatically categorizes roles into `geo_value`, `time_value` and 
+`raw` based on variable names, where `raw` includes `case_rate` and `death_rate`. 
+
+We first create a made-up population dataset by states to allow the conversion
+from `case_rate` and `death_rate` into `cases` and  `deaths`. The binary variable
+`social_distancing` indicates whether the state followed social distancing rules.
+A 'throw-away' column with all ones is also created to show that `step_rm()` 
+can be used to remove unwanted columns.
+```{r madeup-population-data}
+pop_data <- data.frame(states = unique(x$geo_value),
+                       pop = seq(1e5, 1e7, length.out = n_distinct(x$geo_value)),
+                       social_distancing = rbinom(n = n_distinct(x$geo_value), 
+                                                  size = 1,
+                                                  prob = 0.5),
+                       throw_away = 1)
+
+head(pop_data)
+```
+
+Next, we will feature engineer `case_rate` and `death_rate` into `cases` and 
+`deaths`and create lags and leads based on them. 
+
+```{r create-new-columns-in-recipe}
+r <- r %>%
+  step_population_scaling(case_rate, death_rate, df = pop_data, 
+                          by = c("geo_value" = "states"),
+                          create_new = FALSE,
+                          df_pop_col = "pop") %>%
+  step_rename(cases = case_rate, deaths = death_rate) %>%
+  step_rm(throw_away) %>%
+  step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>%
+  step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>%
+  step_epi_ahead(deaths, ahead = 7, role = "outcome") %>%
+  step_epi_naomit()
+```
+
+Notice that `step_rename()` was used for renaming of the variables, and `step_rm()`
+was used to remove redundant columns.
+
+A quick view of what the prepossessed data looks like on training data:
+
+```{r peak-data-1}
+slice_sample(bake(prep(r, x),x), n = 6)
+```
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

From 0136eb59819a5bc8543f205d2d54d2f23af6b985 Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Thu, 28 Jul 2022 08:54:06 -0700
Subject: [PATCH 09/29] switch to inverse population

---
 vignettes/more-recipes.Rmd | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd
index 4d2a9a9d9..30f48602e 100644
--- a/vignettes/more-recipes.Rmd
+++ b/vignettes/more-recipes.Rmd
@@ -62,7 +62,7 @@ A 'throw-away' column with all ones is also created to show that `step_rm()`
 can be used to remove unwanted columns.
 ```{r madeup-population-data}
 pop_data <- data.frame(states = unique(x$geo_value),
-                       pop = seq(1e5, 1e7, length.out = n_distinct(x$geo_value)),
+                       inv_pop = 1/seq(1e5, 1e7, length.out = n_distinct(x$geo_value)),
                        social_distancing = rbinom(n = n_distinct(x$geo_value), 
                                                   size = 1,
                                                   prob = 0.5),
@@ -79,7 +79,7 @@ r <- r %>%
   step_population_scaling(case_rate, death_rate, df = pop_data, 
                           by = c("geo_value" = "states"),
                           create_new = FALSE,
-                          df_pop_col = "pop") %>%
+                          df_pop_col = "inv_pop") %>%
   step_rename(cases = case_rate, deaths = death_rate) %>%
   step_rm(throw_away) %>%
   step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>%

From 0dd9713d0f1357986f964962e9c7d6831e56d965 Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Sun, 31 Jul 2022 14:41:18 -0700
Subject: [PATCH 10/29] error: prediction is empty

---
 vignettes/more-recipes.Rmd | 60 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 59 insertions(+), 1 deletion(-)

diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd
index 30f48602e..5e2eeee50 100644
--- a/vignettes/more-recipes.Rmd
+++ b/vignettes/more-recipes.Rmd
@@ -85,7 +85,8 @@ r <- r %>%
   step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>%
   step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>%
   step_epi_ahead(deaths, ahead = 7, role = "outcome") %>%
-  step_epi_naomit()
+  step_epi_naomit() 
+
 ```
 
 Notice that `step_rename()` was used for renaming of the variables, and `step_rm()`
@@ -96,18 +97,75 @@ A quick view of what the prepossessed data looks like on training data:
 ```{r peak-data-1}
 slice_sample(bake(prep(r, x),x), n = 6)
 ```
+There are also many functions in the `recipes` package that allow for 
+spline transformation or other types of function transformations. 
+
+```{r, warning=FALSE}
+r <- r %>% 
+  step_sqrt(contains("lag_")) %>%
+  step_rm(cases, deaths)
+  
+
+slice_sample(bake(prep(r, x),x), n = 6)
+```
 
 
+Sometimes, the exact counts are desired so fitting a linear regression would be 
+a good idea. On the other hand, instead of predicting 
+7-day ahead death counts, we would like to predict if the death counts 7 days 
+from the latest date fall into some categories. For the purpose of illustration, 
+we define two categories: less than 10K(low) and more than 10K(high). 
 
+We can resort to the `recipes` package again:
 
+```{r, warning = FALSE}
+binner <- function(x) {
+  x <- cut(x, breaks = c(0, 10000, Inf), include.lowest = TRUE)
+  # now return the group number
+  as.numeric(x)
+}
 
+inc <- c("low", "high")
 
+r_categorical <- r %>% 
+  step_num2factor(ahead_7_deaths, 
+                  role = "outcome", 
+                  transform = binner, 
+                  levels = inc)
 
+slice_sample(bake(prep(r_categorical, x),x), n = 6)
+```
 
 
 
+# Model Fitting
 
+Once the pre-processing steps are done, we can move on to model fitting using 
+the `parsnip` package. 
 
+First, we can try fitting a linear regression:
+
+```{r linear-fit}
+wf <- epi_workflow(r, parsnip::linear_reg()) %>%
+    fit(x) 
+# the fit
+wf$fit
+latest <- get_test_data(recipe = r, x = x)
+p <- predict(wf, latest) ## ERROR, NOTHING IS PREDICTED
+p
+```
+
+
+Logistic Regression
+```{r poisson}
+wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>%
+    fit(x) 
+# the fit
+wf$fit
+latest <- get_test_data(recipe = r_categorical, x = x)
+p <- predict(wf, latest) ## ERROR, NOTHING IS PREDICTED
+p
+```
 
 
 

From 47395a25b6f0226c6312a2f249ac72a304eba661 Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Tue, 2 Aug 2022 10:46:14 -0700
Subject: [PATCH 11/29] just needed to add filter to remove NAs oops

---
 vignettes/more-recipes.Rmd | 48 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd
index 5e2eeee50..7a3c52f2a 100644
--- a/vignettes/more-recipes.Rmd
+++ b/vignettes/more-recipes.Rmd
@@ -151,7 +151,7 @@ wf <- epi_workflow(r, parsnip::linear_reg()) %>%
 # the fit
 wf$fit
 latest <- get_test_data(recipe = r, x = x)
-p <- predict(wf, latest) ## ERROR, NOTHING IS PREDICTED
+p <- predict(wf, latest) %>% filter(!is.na(.pred))
 p
 ```
 
@@ -163,12 +163,56 @@ wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>%
 # the fit
 wf$fit
 latest <- get_test_data(recipe = r_categorical, x = x)
-p <- predict(wf, latest) ## ERROR, NOTHING IS PREDICTED
+p <- predict(wf, latest) %>% filter(!is.na(.pred_class))
 p
 ```
 
 
 
+## Issues from vignette
+1. `step_cut()` did not work
+```{r, error=TRUE}
+r_test1 <- epi_recipe(x) %>%
+  step_population_scaling(case_rate, death_rate, df = pop_data, 
+                          by = c("geo_value" = "states"),
+                          create_new = FALSE,
+                          df_pop_col = "inv_pop") %>%
+  step_rename(cases = case_rate, deaths = death_rate) %>%
+  step_rm(throw_away) %>%
+  step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>%
+  step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>%
+  step_epi_ahead(deaths, ahead = 7, role = "outcome") %>%
+  step_epi_naomit() %>%
+  step_cut(ahead_7_deaths, role = "outcome", breaks= c(0, 10000, Inf))
 
+slice_sample(bake(prep(r_test1, x),x), n = 6)
+```
+2. Predictions are not produced, despite different models applied to it. Perhaps
+something wrong with the preprocessing steps?
+Maybe without `step_population_scaling()`:
+```{r}
+r_test2 <- epi_recipe(x) %>%
+  step_epi_lag(case_rate, lag = c(0, 7, 14), role = "predictor") %>%
+  step_epi_lag(death_rate, lag = c(0, 7, 14), role = "predictor") %>%
+  step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>%
+  step_naomit(all_predictors()) %>%
+  step_naomit(all_outcomes(), skip = TRUE)
+
+slice_sample(bake(prep(r_test2, x),x), n = 6)
+
+wf <- epi_workflow(r_test2, parsnip::linear_reg()) %>%
+    fit(x) 
+# the fit
+wf$fit
+latest <- get_test_data(recipe = r_test2, x = x)
+# latest <- x %>%
+#   filter(!is.na(case_rate), !is.na(death_rate)) %>%
+#   group_by(geo_value) %>%
+#   slice_tail(n = 15) %>% # have lags 0,...,14, so need 15 for a complete case
+#   ungroup()
+p <- predict(wf, latest)  %>% filter(!is.na(.pred))
+p
+
+```
 
 

From 0a835992548039fc059c806462c3fabd26050f48 Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Tue, 2 Aug 2022 12:16:35 -0700
Subject: [PATCH 12/29] first draft of vignette done

---
 vignettes/more-recipes.Rmd | 174 +++++++++++++++++++++----------------
 1 file changed, 99 insertions(+), 75 deletions(-)

diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd
index 7a3c52f2a..c1328f464 100644
--- a/vignettes/more-recipes.Rmd
+++ b/vignettes/more-recipes.Rmd
@@ -20,20 +20,24 @@ steps from the `recipes` package for more feature engineering.
 In this vignette, we will demonstrate how to weave `recipes` steps and `parsnip`
 models into the `epipredict` workflow. 
 
+First, let's load the necessary packages:
+
 ```{r, warning=FALSE, message =FALSE}
 library(epipredict)
 library(epiprocess)
 library(recipes)
 library(parsnip)
 library(workflows)
+library(poissonreg)
 ```
 
 We will use the built-in dataset `case_death_rate_subset` and apply feature 
 engineering steps and fit a few models from `parsnip` to predict future death 
-rates by states. 
+rates by states 7 days ahead from Dec 31, 2021.
+
 ```{r}
 x <- case_death_rate_subset
-head(x)
+tail(x)
 ```
 There are `r dim(x)[1]` rows and `r dim(x)[2]` columns in the data, with dates
 ranging from `r min(x$time_value)` to `r max(x$time_value)` in 
@@ -62,7 +66,8 @@ A 'throw-away' column with all ones is also created to show that `step_rm()`
 can be used to remove unwanted columns.
 ```{r madeup-population-data}
 pop_data <- data.frame(states = unique(x$geo_value),
-                       inv_pop = 1/seq(1e5, 1e7, length.out = n_distinct(x$geo_value)),
+                       inv_pop = 1/seq(1e5, 1e7, 
+                                       length.out = n_distinct(x$geo_value)),
                        social_distancing = rbinom(n = n_distinct(x$geo_value), 
                                                   size = 1,
                                                   prob = 0.5),
@@ -85,36 +90,31 @@ r <- r %>%
   step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>%
   step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>%
   step_epi_ahead(deaths, ahead = 7, role = "outcome") %>%
-  step_epi_naomit() 
+  step_epi_naomit() %>%
+  step_rm(cases, deaths)
 
 ```
 
-Notice that `step_rename()` was used for renaming of the variables, and `step_rm()`
-was used to remove redundant columns.
+Notice that `step_rename()` was used for renaming of the variables, and 
+`step_rm()` was used to remove redundant columns.
 
-A quick view of what the prepossessed data looks like on training data:
+A quick view of what columns exist in the the prepossessed data so far:
 
 ```{r peak-data-1}
-slice_sample(bake(prep(r, x),x), n = 6)
+names(bake(prep(r, x),x))
 ```
-There are also many functions in the `recipes` package that allow for 
-spline transformation or other types of function transformations. 
-
-```{r, warning=FALSE}
-r <- r %>% 
-  step_sqrt(contains("lag_")) %>%
-  step_rm(cases, deaths)
-  
 
-slice_sample(bake(prep(r, x),x), n = 6)
-```
+There are also many functions in the `recipes` package that allow for 
+[function transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations),
+such as log transformations and spline transformations. 
 
+Sometimes, the exact counts are desired, therefore fitting a linear regression 
+or poisson regression would be a good idea. 
 
-Sometimes, the exact counts are desired so fitting a linear regression would be 
-a good idea. On the other hand, instead of predicting 
-7-day ahead death counts, we would like to predict if the death counts 7 days 
-from the latest date fall into some categories. For the purpose of illustration, 
-we define two categories: less than 10K(low) and more than 10K(high). 
+On the other hand, say instead of predicting 7-day ahead death counts, we would 
+like to predict if the counts are high or low based on some given threshold. 
+For the purpose of illustration, we define two categories: less than 10K(low) 
+and more than 10K(high). 
 
 We can resort to the `recipes` package again:
 
@@ -133,86 +133,110 @@ r_categorical <- r %>%
                   transform = binner, 
                   levels = inc)
 
-slice_sample(bake(prep(r_categorical, x),x), n = 6)
 ```
 
+As a sanity check we can see now that the preprocessed data has a categorical 
+outcome in `ahead_7_deaths`
+```{r, warning FALSE, echo = FALSE}
+glimpse(slice_sample(bake(prep(r_categorical, x),x), n = 6))
+```
 
-
-# Model Fitting
+# Model Fitting and Post-processing
 
 Once the pre-processing steps are done, we can move on to model fitting using 
 the `parsnip` package. 
 
+### Linear Regression
+
 First, we can try fitting a linear regression:
 
-```{r linear-fit}
+```{r linear-fit-workflow, warning = FALSE}
 wf <- epi_workflow(r, parsnip::linear_reg()) %>%
     fit(x) 
-# the fit
-wf$fit
+
 latest <- get_test_data(recipe = r, x = x)
+```
+
+We take a look at the linear regression fit and the values of the coefficients.
+
+```{r linear-fit, warning = FALSE}
+wf$fit$fit # look at the model fit and coefficients
+
 p <- predict(wf, latest) %>% filter(!is.na(.pred))
+
 p
 ```
 
+The predictions are saved in the `.pred` column and represents the squared roots
+of predicted death counts 7 days from Dec 31. 2021. 
 
-Logistic Regression
-```{r poisson}
-wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>%
-    fit(x) 
-# the fit
-wf$fit
-latest <- get_test_data(recipe = r_categorical, x = x)
-p <- predict(wf, latest) %>% filter(!is.na(.pred_class))
-p
+Some post-processing can be done to revert back to death rates
+
+```{r linear-reg-post-process}
+f <- frosting() %>%
+    layer_predict() %>%
+    layer_threshold(.pred) %>%
+    layer_naomit(.pred) %>%
+    layer_population_scaling(.pred, df = pop_data,
+                             by =  c("geo_value" = "states"),
+                             df_pop_col = "inv_pop")
+
+wf <- epi_workflow(r, parsnip::linear_reg()) %>%
+    fit(x) %>%
+    add_frosting(f)
+
+predict(wf, latest) %>% select( - social_distancing, - throw_away)
 ```
 
+### Poisson regression
 
+For count data, we can use poisson regression. Although the counts here are not 
+integer since they were calculated from case rates multiplying population, some 
+extra preprocessing steps can help us round them to integers to allow fitting a 
+poisson regression.
 
-## Issues from vignette
-1. `step_cut()` did not work
-```{r, error=TRUE}
-r_test1 <- epi_recipe(x) %>%
-  step_population_scaling(case_rate, death_rate, df = pop_data, 
-                          by = c("geo_value" = "states"),
-                          create_new = FALSE,
-                          df_pop_col = "inv_pop") %>%
-  step_rename(cases = case_rate, deaths = death_rate) %>%
-  step_rm(throw_away) %>%
-  step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>%
-  step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>%
-  step_epi_ahead(deaths, ahead = 7, role = "outcome") %>%
-  step_epi_naomit() %>%
-  step_cut(ahead_7_deaths, role = "outcome", breaks= c(0, 10000, Inf))
+We will also apply post-processing steps to the workflow similar to the ones above.
+
+```{r poisson-fit}
+r <- r %>% step_mutate(ahead_7_deaths = round(ahead_7_deaths, 0), 
+                       role = "predictor") %>%
+  step_filter(ahead_7_deaths > 0)
 
-slice_sample(bake(prep(r_test1, x),x), n = 6)
+latest <- get_test_data(recipe = r, x = x)
+
+wf <- epi_workflow(r, parsnip::poisson_reg()) %>%
+    fit(x) %>%
+    add_frosting(f)
+
+predict(wf, latest) %>% select( - social_distancing, - throw_away)
+```
+
+Let's also take a look at the poisson regression fit:
+```{r, echo =FALSE}
+wf$fit$fit
 ```
-2. Predictions are not produced, despite different models applied to it. Perhaps
-something wrong with the preprocessing steps?
-Maybe without `step_population_scaling()`:
-```{r}
-r_test2 <- epi_recipe(x) %>%
-  step_epi_lag(case_rate, lag = c(0, 7, 14), role = "predictor") %>%
-  step_epi_lag(death_rate, lag = c(0, 7, 14), role = "predictor") %>%
-  step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>%
-  step_naomit(all_predictors()) %>%
-  step_naomit(all_outcomes(), skip = TRUE)
 
-slice_sample(bake(prep(r_test2, x),x), n = 6)
+### Logistic Regression
 
-wf <- epi_workflow(r_test2, parsnip::linear_reg()) %>%
+We use a logistic regression to predict whether 7-day ahead death counts are high
+or low. Post-processing steps do not need to be applied here. 
+```{r logistic, warning = FALSE}
+wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>%
     fit(x) 
-# the fit
-wf$fit
-latest <- get_test_data(recipe = r_test2, x = x)
-# latest <- x %>%
-#   filter(!is.na(case_rate), !is.na(death_rate)) %>%
-#   group_by(geo_value) %>%
-#   slice_tail(n = 15) %>% # have lags 0,...,14, so need 15 for a complete case
-#   ungroup()
-p <- predict(wf, latest)  %>% filter(!is.na(.pred))
+latest <- get_test_data(recipe = r_categorical, x = x)
+p <- predict(wf, latest) %>% filter(!is.na(.pred_class))
 p
+```
 
+The model fit is as follows:
+```{r echo=FALSE}
+wf$fit$fit
 ```
 
+# Attribution
+
+This object contains a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API.](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html)
+
+This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University 
+on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020.
 

From 854090df3f2b91361e530262d48d8b8270aa140b Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Tue, 2 Aug 2022 14:49:57 -0700
Subject: [PATCH 13/29] add poissonreg package to suggests

---
 DESCRIPTION | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DESCRIPTION b/DESCRIPTION
index 25f82bfec..7673c6c14 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -31,6 +31,7 @@ Imports:
     tidyselect,
     workflows
 Suggests: 
+    poissonreg,
     covidcast,
     data.table,
     ggplot2,

From b7d81b0f195600d857489753f8d0ff8f6d69778b Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Sun, 14 Aug 2022 23:21:31 -0700
Subject: [PATCH 14/29] updates according to Aug14 suggestions; need to add
 update_role() and add_role() example

---
 vignettes/more-recipes.Rmd | 80 +++++++++++++++++++++++---------------
 1 file changed, 49 insertions(+), 31 deletions(-)

diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd
index c1328f464..475350daf 100644
--- a/vignettes/more-recipes.Rmd
+++ b/vignettes/more-recipes.Rmd
@@ -37,6 +37,7 @@ rates by states 7 days ahead from Dec 31, 2021.
 
 ```{r}
 x <- case_death_rate_subset
+x$geo_value <- as.factor(x$geo_value)
 tail(x)
 ```
 There are `r dim(x)[1]` rows and `r dim(x)[2]` columns in the data, with dates
@@ -106,17 +107,55 @@ names(bake(prep(r, x),x))
 
 There are also many functions in the `recipes` package that allow for 
 [function transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations),
-such as log transformations and spline transformations. 
+such as log transformations and spline transformations. In our case, we will 
+center the numerical predictors to allow for meaningful interpretation of the 
+intercept. 
 
-Sometimes, the exact counts are desired, therefore fitting a linear regression 
+```{r}
+r <- r %>% step_center(contains("lag"))
+```
+
+Since the current outcomes are positive, but the linear regression assumes the 
+outcome to be in full support, we will consider a log-transformation.
+
+```{r}
+r_lin <- r %>% step_log(ahead_7_deaths, role = "outcome")
+```
+
+Other useful functionalities include creating a set of binary dummy variables 
+from a factor variable. As an illustration that will not affect the models, we 
+will show how states in `geo_value` become dummy variables so that each state 
+as a dummy variable has a differential intercept coefficient.
+
+```{r dummy-variable}
+length(unique(x$geo_value))
+
+dummies <- r %>% 
+  step_dummy(geo_value) %>%
+  prep(training = x) 
+
+dummy_data <- bake(dummies, x)
+
+names(dummy_data)
+```
+
+Once the pre-processing steps are done, we can move on to model fitting using 
+the `parsnip` package. 
+
+
+# Model Fitting and Post-processing
+
+Sometimes, the exact values are desired, therefore fitting a linear regression 
 or poisson regression would be a good idea. 
 
 On the other hand, say instead of predicting 7-day ahead death counts, we would 
-like to predict if the counts are high or low based on some given threshold. 
+like to classify if the future predictions are high or low based on some given threshold. 
+In this case, a logistic regression among other classification models can be used.
 For the purpose of illustration, we define two categories: less than 10K(low) 
 and more than 10K(high). 
 
-We can resort to the `recipes` package again:
+We resort to the `recipes` package again to change the numerical outcomes to 
+categorical for the purpose of classification:
 
 ```{r, warning = FALSE}
 binner <- function(x) {
@@ -136,25 +175,22 @@ r_categorical <- r %>%
 ```
 
 As a sanity check we can see now that the preprocessed data has a categorical 
-outcome in `ahead_7_deaths`
+outcome in `ahead_7_deaths`. 
+
 ```{r, warning FALSE, echo = FALSE}
 glimpse(slice_sample(bake(prep(r_categorical, x),x), n = 6))
 ```
 
-# Model Fitting and Post-processing
-
-Once the pre-processing steps are done, we can move on to model fitting using 
-the `parsnip` package. 
 
 ### Linear Regression
 
 First, we can try fitting a linear regression:
 
 ```{r linear-fit-workflow, warning = FALSE}
-wf <- epi_workflow(r, parsnip::linear_reg()) %>%
+wf <- epi_workflow(r_lin, parsnip::linear_reg()) %>%
     fit(x) 
 
-latest <- get_test_data(recipe = r, x = x)
+latest <- get_test_data(recipe = r_lin, x = x)
 ```
 
 We take a look at the linear regression fit and the values of the coefficients.
@@ -167,26 +203,9 @@ p <- predict(wf, latest) %>% filter(!is.na(.pred))
 p
 ```
 
-The predictions are saved in the `.pred` column and represents the squared roots
-of predicted death counts 7 days from Dec 31. 2021. 
-
-Some post-processing can be done to revert back to death rates
-
-```{r linear-reg-post-process}
-f <- frosting() %>%
-    layer_predict() %>%
-    layer_threshold(.pred) %>%
-    layer_naomit(.pred) %>%
-    layer_population_scaling(.pred, df = pop_data,
-                             by =  c("geo_value" = "states"),
-                             df_pop_col = "inv_pop")
-
-wf <- epi_workflow(r, parsnip::linear_reg()) %>%
-    fit(x) %>%
-    add_frosting(f)
+The predictions are saved in the `.pred` column and represents the log-transformed 
+predicted death counts 7 days from Dec 31. 2021. 
 
-predict(wf, latest) %>% select( - social_distancing, - throw_away)
-```
 
 ### Poisson regression
 
@@ -195,7 +214,6 @@ integer since they were calculated from case rates multiplying population, some
 extra preprocessing steps can help us round them to integers to allow fitting a 
 poisson regression.
 
-We will also apply post-processing steps to the workflow similar to the ones above.
 
 ```{r poisson-fit}
 r <- r %>% step_mutate(ahead_7_deaths = round(ahead_7_deaths, 0), 

From 10c6aecf0ce270365ae6369f81cb23dc2a9f56eb Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Mon, 15 Aug 2022 10:13:28 -0700
Subject: [PATCH 15/29] fix error; added role manipulation example

---
 vignettes/more-recipes.Rmd | 41 +++++++++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd
index 475350daf..dee09fd1d 100644
--- a/vignettes/more-recipes.Rmd
+++ b/vignettes/more-recipes.Rmd
@@ -52,9 +52,24 @@ package can also be applied in our `epipredict` workflows.
 First, create a recipe object from the original data and then specify the 
 preprocessing steps. Steps are then sequentially added.
 
+`recipes` package also allows [manual alterations of roles](https://recipes.tidymodels.org/reference/roles.html) 
+in bulk. There are a couple handy functions that can be used together to help us 
+manipulate variable roles easily. 
+
+`update_role()` alters an existing role in the recipe or assigns an initial role 
+to variables that do not yet have a declared role.
+
+`add_role()` adds an additional role to variables that already have a role in 
+the recipe, without overwriting old roles.
+
+`remove_role()` eliminates a single existing role in the recipe.
+
+Here, we will add an additional `predictor` role to both `case_rate` and 
+`death_rate` variables. They currently have roles as `raw`.
+
 ```{r create-recipe-object}
-r <- epi_recipe(x)
-r
+r <- epi_recipe(x) %>% 
+  add_role(- epi_keys(x), new_role = "predictor") 
 ```
 
 The function automatically categorizes roles into `geo_value`, `time_value` and 
@@ -88,8 +103,8 @@ r <- r %>%
                           df_pop_col = "inv_pop") %>%
   step_rename(cases = case_rate, deaths = death_rate) %>%
   step_rm(throw_away) %>%
-  step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>%
-  step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>%
+  step_epi_lag(cases, lag = c(0, 7, 14)) %>%
+  step_epi_lag(deaths, lag = c(0, 7, 14)) %>%
   step_epi_ahead(deaths, ahead = 7, role = "outcome") %>%
   step_epi_naomit() %>%
   step_rm(cases, deaths)
@@ -105,6 +120,7 @@ A quick view of what columns exist in the the prepossessed data so far:
 names(bake(prep(r, x),x))
 ```
 
+
 There are also many functions in the `recipes` package that allow for 
 [function transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations),
 such as log transformations and spline transformations. In our case, we will 
@@ -112,14 +128,15 @@ center the numerical predictors to allow for meaningful interpretation of the
 intercept. 
 
 ```{r}
-r <- r %>% step_center(contains("lag"))
+r <- r %>% step_center(contains("lag"), role = "predictor")
 ```
 
 Since the current outcomes are positive, but the linear regression assumes the 
-outcome to be in full support, we will consider a log-transformation.
+outcome to be in full support, we will consider a log-transformation with an 
+offset value of 1 to ensure `log(0)` does not occur. 
 
 ```{r}
-r_lin <- r %>% step_log(ahead_7_deaths, role = "outcome")
+r_lin <- r %>% step_log(ahead_7_deaths, role = "outcome", offset = 1)
 ```
 
 Other useful functionalities include creating a set of binary dummy variables 
@@ -216,17 +233,17 @@ poisson regression.
 
 
 ```{r poisson-fit}
-r <- r %>% step_mutate(ahead_7_deaths = round(ahead_7_deaths, 0), 
+r <- r %>% 
+  step_mutate(ahead_7_deaths = round(ahead_7_deaths, 0), 
                        role = "predictor") %>%
   step_filter(ahead_7_deaths > 0)
 
 latest <- get_test_data(recipe = r, x = x)
 
 wf <- epi_workflow(r, parsnip::poisson_reg()) %>%
-    fit(x) %>%
-    add_frosting(f)
+    fit(x) 
 
-predict(wf, latest) %>% select( - social_distancing, - throw_away)
+predict(wf, latest) %>% filter(!is.na(.pred))
 ```
 
 Let's also take a look at the poisson regression fit:
@@ -237,7 +254,7 @@ wf$fit$fit
 ### Logistic Regression
 
 We use a logistic regression to predict whether 7-day ahead death counts are high
-or low. Post-processing steps do not need to be applied here. 
+or low. 
 ```{r logistic, warning = FALSE}
 wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>%
     fit(x) 

From 00b098205ef36e669657d3fccc01917fcc361226 Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Mon, 22 Aug 2022 21:02:51 -0700
Subject: [PATCH 16/29] poisson added

---
 vignettes/preprocessing-and-models.Rmd | 260 +++++++++++++++++++++++++
 1 file changed, 260 insertions(+)
 create mode 100644 vignettes/preprocessing-and-models.Rmd

diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd
new file mode 100644
index 000000000..b9c07933e
--- /dev/null
+++ b/vignettes/preprocessing-and-models.Rmd
@@ -0,0 +1,260 @@
+---
+title: Examples of Preprocessing and Models
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Examples of Preprocessing and Models}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## Introduction 
+
+The `epipredict` package utilizes the `tidymodels` framework, namely 
+[`recipes`](https://recipes.tidymodels.org/) for 
+[dplyr](https://dplyr.tidyverse.org/)-like pipeable sequences 
+of feature engineering and [`parsnip`](https://parsnip.tidymodels.org/) for a 
+unified interface to a range of models. 
+
+`epipredict` has additional customized feature engineering and preprocessing 
+steps, such as `step_epi_shift()`, `step_population_scaling()`, 
+`step_epi_naomit()`. They can be used along with 
+steps from the `recipes` package for more feature engineering. 
+
+In this vignette, we will illustrate some examples of how to use `epipredict`
+with `recipes` and `parsnip` for different purposes of epidemiology forecasting.
+
+The remaining vignette will be split into three sections. The first section, we 
+will use a Poisson regression to predict death counts. In the second section,
+we will use a linear regression to predict death rates. Last but not least, we
+will create a classification model for hotspot predictions. 
+
+```{r, warning=FALSE, message=FALSE}
+library(tidyverse)
+library(covidcast)
+library(epidatr)
+library(epiprocess)
+library(epipredict)
+library(recipes)
+library(poissonreg)
+```
+
+## Poisson Regression 
+
+During COVID-19, Center for Disease Control and Prevention (CDC) gathered models
+and forecasts to characterize the state of an outbreak and its course. They use
+it to inform public health decision makers on potential consequences of 
+deploying control measures.
+
+One of the outcomes that the CDC forecasts is [death counts from COVID-19](https://www.cdc.gov/coronavirus/2019-ncov/science/forecasting/forecasting-us.html).
+Although there are many state-of-the-art models, we choose to use Poisson 
+regression, the textbook example for modeling count data, as an illustration
+for using the `epipredict` package with other existing tidymodel packages. 
+
+```{r poisson-reg-data}
+x <- covidcast(
+  data_source = "jhu-csse",
+  signals = "confirmed_incidence_num",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20210604, 20211231),
+  geo_values = "ca,fl,tx,ny,nj"
+) %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, cases = value)
+
+y <- covidcast(
+  data_source = "jhu-csse",
+  signals = "deaths_incidence_num",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20210604, 20211231),
+  geo_values = "ca,fl,tx,ny,nj"
+) %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, deaths = value)
+
+case_death_counts_subset <- x %>%
+  full_join(y, by = c("geo_value", "time_value")) %>%
+  as_epi_df()
+
+```
+
+The `case_death_counts_subset` dataset comes from the `covidcast` package,and 
+contains the number of confirmed cases and deaths from June 4, 2021 to 
+Dec 31, 2021 in some U.S. states. 
+
+We wish to predict the 7-day ahead death counts with lagged cases and deaths.
+Furthermore, we will let each state be a dummy variable. Using differential 
+intercept coefficients, we can allow for an intercept shift between states.
+
+The model takes the form of
+\begin{aligned}
+\text{log}\left( \mu_{t+7} \right) = \beta_0 + \delta_1 s_{\text{state_1}} +
+\delta_2 s_{\text{state_2}} + ... +  \beta_1 \text{deaths}_{t} + \nonumber \\
+\beta_2 \text{deaths}_{t-7}  + \beta_3 \text{cases}_{t} + 
+\beta_4 \text{cases}_{t-7}
+\end{aligned}
+
+
+where $\mu_{t+7} = \mathbb{E}(y_{t+7})$, and $y_{t+7}$ is assumed to follow a 
+Poisson distribution with mean $\mu_{t+7}$; $s_{\text{state}}$ are dummy 
+variables for each state and take values of either 0 or 1. 
+
+Preprocessing steps will be performed using the `recipes` functions to get the 
+data ready for modeling. 
+
+But before diving into them, it will be helpful to understand what `roles` are
+in the `recipes` framework. 
+
+> `recipes` can assign one or more roles to each column in the data. The roles 
+> are not restricted to a predefined set; they can be anything. 
+> For most conventional situations, they are typically “predictor” and/or 
+> “outcome”. Additional roles enable targeted step operations on specific 
+> variables or groups of variables.
+>
+> In our case, the role `predictor` is given to explanatory variable that go to the 
+> left-hand side of the model. The role `outcome` is the response variable 
+> that we wish to predict. `geo_value` and `time_value` are predefined roles 
+> that are unique to the `epipredict` package. Since we work with `epi_df` 
+> objects, all datasets should have `geo_value` and `time_value` passed through
+> automatically with these two roles assigned to the appropriate columns.
+> 
+> `recipes` package also allows [manual alterations of roles](https://recipes.tidymodels.org/reference/roles.html) 
+> in bulk. There are a couple handy functions that can be used together to help us 
+> manipulate variable roles easily. 
+> 
+>> `update_role()` alters an existing role in the recipe or assigns an initial role 
+>> to variables that do not yet have a declared role.
+> 
+>> `add_role()` adds an additional role to variables that already have a role in 
+>> the recipe, without overwriting old roles.
+> 
+>> `remove_role()` eliminates a single existing role in the recipe.
+
+Notice in the following preprocessing steps, we used `add_role()` on 
+`geo_value_factor` since currently the default role for it is `raw` and 
+we would like the dummified variables to automatically be of role `predictor`. 
+
+```{r}
+case_death_counts_subset <- case_death_counts_subset %>%
+                            mutate(geo_value_factor = as.factor(geo_value)) %>%
+                            as_epi_df()
+
+epi_recipe(case_death_counts_subset) 
+               
+r <- epi_recipe(case_death_counts_subset) %>%
+      add_role(geo_value_factor, new_role = "predictor") %>%
+      recipes::step_dummy(geo_value_factor) %>%
+      step_mutate(cases = case_when(cases < 0 ~ 0, TRUE ~ cases)) %>%  
+      ## NOTE: why are there negative numbers in here?
+      step_mutate(deaths = case_when(deaths < 0 ~ 0, TRUE ~ deaths)) %>%
+      step_epi_lag(cases, lag = c(0, 7)) %>%
+      step_epi_lag(deaths, lag = c(0, 7)) %>%
+      step_epi_ahead(deaths, ahead = 7, role = "outcome") %>%
+      step_epi_naomit()       
+```
+
+After specifying the preprocessing steps, we will use the `parsnip` package for
+modeling and getting the prediction for the 7-day ahead death counts of the 
+latest available date in the dataset. 
+
+```{r}
+latest <- get_test_data(recipe = r, x = case_death_counts_subset)
+
+wf <- epi_workflow(r, parsnip::poisson_reg()) %>% 
+      fit(case_death_counts_subset)
+
+predict(wf, latest) %>% filter(!is.na(.pred))
+```
+
+Let's take a look at the fit:
+```{r, echo =FALSE}
+wf$fit$fit
+```
+
+Up to now, we've used the Poisson regression to model count data. Poisson 
+regression can also be used to model rate data, such as case rates or death
+rates, by incorporating offset terms in the model. 
+
+To model death rates, the Poisson regression would be expressed as:
+\begin{aligned}
+\text{log}\left( \mu_{t+7} \right) = \text{log(population)} + 
+\beta_0 + \delta_1 s_{\text{state_1}} +
+\delta_2 s_{\text{state_2}} + ... +  \beta_1 \text{deaths}_{t} + \nonumber \\
+\beta_2 \text{deaths}_{t-7}  + \beta_3 \text{cases}_{t} + 
+\beta_4 \text{cases}_{t-7}
+\end{aligned}
+where $\text{log(population)}$ is the log of the state population that was 
+used to scale the count data on the left-hand side of the equation.
+
+There are several ways to model rate data given count and population data. 
+First, in the `parsnip` framework, we can specify the formula in `fit()`. 
+However, by doing so we lose the ability to use the `recipes` framework to 
+create new variables since new variables that do not exist in the 
+original dataset such as lag and leads cannot be called directly in `fit()`. 
+
+Alternatively, `step_population_scaling()` in the `epipredict` package can 
+perform the population scaling if we provide the population data, and we can 
+model this using other models such as linear regression, which we will 
+illustrate in the next section.
+
+
+## Linear Regression 
+We wish to predict the 7-day ahead death counts with lagged cases and deaths, 
+along with extra behaviour indicator inputs. Namely, we will use survey data 
+from [COVID-19 Trends and Impact Survey](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/fb-survey.html#behavior-indicators).
+
+The survey data shows the estimated percentage of people who wore a mask for 
+most or all of the time while in public in the past 7 days and the estimated 
+percentage of respondents who reported that all or most people they enountered 
+in public in the past 7 days maintained a distance of at least 6 feet. We will 
+convert these percentages into indicator variables as follows:
+
+```{r}
+behav_ind_mask <- covidcast(data_source = "fb-survey",
+                        signals = "smoothed_wwearing_mask_7d",
+                        time_type = "day",
+                        geo_type = "state",
+                        time_values = epirange(20210604, 20211231),
+                        geo_values = "ca,fl,tx,ny,nj"
+                        )  %>%
+            fetch_tbl() %>%
+            select(geo_value, time_value, percentage = value) %>%
+            mutate(mask_indicator = case_when(percentage < 50 ~ 0, 
+                                              TRUE ~  1)) %>%
+            select(geo_value, time_value, mask_indicator)  
+
+behav_ind_distancing <- covidcast(data_source = "fb-survey",
+                        signals = "smoothed_wothers_distanced_public",
+                        time_type = "day",
+                        geo_type = "state",
+                        time_values = epirange(20210604, 20211231),
+                        geo_values = "ca,fl,tx,ny,nj"
+                        )  %>%
+            fetch_tbl() %>%
+            select(geo_value, time_value, percentage = value) %>%
+            mutate(distancing_indicator = case_when(percentage < 20 ~ 0, 
+                                              TRUE ~  1)) %>%
+            select(geo_value, time_value, distancing_indicator) 
+behav_ind <- behav_ind_mask %>%
+  full_join(behav_ind_distancing, by = c("geo_value", "time_value")) %>%
+  as_epi_df()                                
+```
+
+## Classification
+
+## Reference
+* McDonald, Daniel J., et al. "Can auxiliary indicators improve COVID-19 
+forecasting and hotspot prediction?." Proceedings of the National Academy of 
+Sciences 118.51 (2021): e2111453118.
+
+## Attribution
+
+This object contains a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API.](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html)
+
+This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University 
+on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020.

From bedcae7fb91ac4eedb572e762403c816da797b3c Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Mon, 22 Aug 2022 22:33:27 -0700
Subject: [PATCH 17/29] save progress on lin reg

---
 vignettes/preprocessing-and-models.Rmd | 73 ++++++++++++++++++++++----
 1 file changed, 63 insertions(+), 10 deletions(-)

diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd
index b9c07933e..e6d8bd40e 100644
--- a/vignettes/preprocessing-and-models.Rmd
+++ b/vignettes/preprocessing-and-models.Rmd
@@ -55,6 +55,7 @@ regression, the textbook example for modeling count data, as an illustration
 for using the `epipredict` package with other existing tidymodel packages. 
 
 ```{r poisson-reg-data}
+## NOTE: built-in data? or use epiprocess built-in data and save code here
 x <- covidcast(
   data_source = "jhu-csse",
   signals = "confirmed_incidence_num",
@@ -144,7 +145,7 @@ case_death_counts_subset <- case_death_counts_subset %>%
                             mutate(geo_value_factor = as.factor(geo_value)) %>%
                             as_epi_df()
 
-epi_recipe(case_death_counts_subset) 
+epi_recipe(case_death_counts_subset)
                
 r <- epi_recipe(case_death_counts_subset) %>%
       add_role(geo_value_factor, new_role = "predictor") %>%
@@ -155,7 +156,7 @@ r <- epi_recipe(case_death_counts_subset) %>%
       step_epi_lag(cases, lag = c(0, 7)) %>%
       step_epi_lag(deaths, lag = c(0, 7)) %>%
       step_epi_ahead(deaths, ahead = 7, role = "outcome") %>%
-      step_epi_naomit()       
+      step_epi_naomit()
 ```
 
 After specifying the preprocessing steps, we will use the `parsnip` package for
@@ -197,15 +198,28 @@ However, by doing so we lose the ability to use the `recipes` framework to
 create new variables since new variables that do not exist in the 
 original dataset such as lag and leads cannot be called directly in `fit()`. 
 
-Alternatively, `step_population_scaling()` in the `epipredict` package can 
-perform the population scaling if we provide the population data, and we can 
-model this using other models such as linear regression, which we will 
-illustrate in the next section.
+Alternatively, `step_population_scaling()` and `layer_population_scaling()` 
+in the `epipredict` package can perform the population scaling if we provide the 
+population data, and we can model this using other models such as linear 
+regression, which we will illustrate in the next section.
 
 
 ## Linear Regression 
-We wish to predict the 7-day ahead death counts with lagged cases and deaths, 
-along with extra behaviour indicator inputs. Namely, we will use survey data 
+
+The CDC requires submission of case and death count predictions. However, it is
+often the case that rate data is easier to model as the outcome as it need not 
+be an integer. We can use a liner regression to predict the death rates and
+use state population data to scale the rates to counts. We will do so using 
+`layer_population_scaling()` from the `epipredict` package. 
+
+Additionally, when forecasts are submitted, prediction intervals should be 
+provided along with the point estimates. This can be obtained via postprocessing
+`layer_residual_quantiles()`. Although worth pointing out that 
+`layer_residual_quantiles()` should be used before population scaling or else 
+the transformation will make the results uninterpretable. 
+
+We wish to predict the 7-day ahead death counts with lagged case rates and death
+rates, along with extra behaviour indicator inputs. Namely, we will use survey data 
 from [COVID-19 Trends and Impact Survey](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/fb-survey.html#behavior-indicators).
 
 The survey data shows the estimated percentage of people who wore a mask for 
@@ -214,7 +228,10 @@ percentage of respondents who reported that all or most people they enountered
 in public in the past 7 days maintained a distance of at least 6 feet. We will 
 convert these percentages into indicator variables as follows:
 
+State-wise population data is obtained from `covidcast::state_census` and will
+be used in `layer_population_scaling()`. 
 ```{r}
+## NOTE: need to put in package?
 behav_ind_mask <- covidcast(data_source = "fb-survey",
                         signals = "smoothed_wwearing_mask_7d",
                         time_type = "day",
@@ -240,9 +257,45 @@ behav_ind_distancing <- covidcast(data_source = "fb-survey",
             mutate(distancing_indicator = case_when(percentage < 20 ~ 0, 
                                               TRUE ~  1)) %>%
             select(geo_value, time_value, distancing_indicator) 
+
+pop_dat <- covidcast::state_census %>% 
+          select(POPESTIMATE2019, ABBR) %>%
+          mutate(ABBR = tolower(ABBR)) %>%
+          filter(ABBR %in% c("ca","fl","tx","ny","nj"))
+
 behav_ind <- behav_ind_mask %>%
-  full_join(behav_ind_distancing, by = c("geo_value", "time_value")) %>%
-  as_epi_df()                                
+  full_join(behav_ind_distancing, by = c("geo_value", "time_value")) 
+
+dim(behav_ind)
+```
+
+We will take a subset of death rate and case rate data from our built-in dataset 
+`case_death_rate_subset`.
+```{r}
+jhu <- case_death_rate_subset %>%
+  dplyr::filter(time_value >= "2021-06-04", 
+                time_value <= "2021-12-31",
+  geo_value %in% c("ca","fl","tx","ny","nj"))
+
+dim(jhu)
+```
+
+Preprocessing steps will rely on functions from the `epipredict` package as well
+as the `recipes` package:
+
+```{r}
+jhu <- jhu %>%
+      mutate(geo_value_factor = as.factor(geo_value)) %>%
+      full_join(behav_ind, by = c("geo_value", "time_value")) %>%
+      as_epi_df()
+            
+r <- epi_recipe(jhu) %>%
+      add_role(geo_value_factor, new_role = "predictor") %>%
+      recipes::step_dummy(geo_value_factor) %>%
+      step_epi_lag(case_rate, lag = c(0, 7)) %>%
+      step_epi_lag(death_rate, lag = c(0, 7)) %>%
+      step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>%
+      step_epi_naomit()
 ```
 
 ## Classification

From 0ef9aca49043b346fac293ff3349867891b1432a Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Tue, 23 Aug 2022 10:25:44 -0700
Subject: [PATCH 18/29] lin reg part done, classifcation wip

---
 vignettes/preprocessing-and-models.Rmd | 58 ++++++++++++++++++++++++--
 1 file changed, 54 insertions(+), 4 deletions(-)

diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd
index e6d8bd40e..59ea97a19 100644
--- a/vignettes/preprocessing-and-models.Rmd
+++ b/vignettes/preprocessing-and-models.Rmd
@@ -33,12 +33,13 @@ we will use a linear regression to predict death rates. Last but not least, we
 will create a classification model for hotspot predictions. 
 
 ```{r, warning=FALSE, message=FALSE}
-library(tidyverse)
 library(covidcast)
 library(epidatr)
 library(epiprocess)
 library(epipredict)
 library(recipes)
+library(parsnip)
+library(workflows)
 library(poissonreg)
 ```
 
@@ -149,7 +150,7 @@ epi_recipe(case_death_counts_subset)
                
 r <- epi_recipe(case_death_counts_subset) %>%
       add_role(geo_value_factor, new_role = "predictor") %>%
-      recipes::step_dummy(geo_value_factor) %>%
+      step_dummy(geo_value_factor) %>%
       step_mutate(cases = case_when(cases < 0 ~ 0, TRUE ~ cases)) %>%  
       ## NOTE: why are there negative numbers in here?
       step_mutate(deaths = case_when(deaths < 0 ~ 0, TRUE ~ deaths)) %>%
@@ -261,7 +262,8 @@ behav_ind_distancing <- covidcast(data_source = "fb-survey",
 pop_dat <- covidcast::state_census %>% 
           select(POPESTIMATE2019, ABBR) %>%
           mutate(ABBR = tolower(ABBR)) %>%
-          filter(ABBR %in% c("ca","fl","tx","ny","nj"))
+          filter(ABBR %in% c("ca","fl","tx","ny","nj")) %>%
+          rename(value = POPESTIMATE2019, geo_value = ABBR)
 
 behav_ind <- behav_ind_mask %>%
   full_join(behav_ind_distancing, by = c("geo_value", "time_value")) 
@@ -283,6 +285,12 @@ dim(jhu)
 Preprocessing steps will rely on functions from the `epipredict` package as well
 as the `recipes` package:
 
+Additionally, there are also many functions in the `recipes` package that allow for 
+[function transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations),
+such as log transformations and data centering. In our case, we will 
+center the numerical predictors to allow for meaningful interpretation of the 
+intercept. 
+
 ```{r}
 jhu <- jhu %>%
       mutate(geo_value_factor = as.factor(geo_value)) %>%
@@ -291,13 +299,55 @@ jhu <- jhu %>%
             
 r <- epi_recipe(jhu) %>%
       add_role(geo_value_factor, new_role = "predictor") %>%
-      recipes::step_dummy(geo_value_factor) %>%
+      step_dummy(geo_value_factor) %>%
       step_epi_lag(case_rate, lag = c(0, 7)) %>%
       step_epi_lag(death_rate, lag = c(0, 7)) %>%
       step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>%
+      step_center(contains("lag"), role = "predictor") %>%
       step_epi_naomit()
 ```
 
+As a sanity check we can see what columns exist in the preprocessed data now:
+```{r, warning = FALSE, echo = FALSE}
+glimpse(slice_sample(bake(prep(r, jhu),jhu), n = 6))
+```
+
+Before directly predicting the results, we need to add postprocessing layers to
+obtain the death counts instead of death rates. 
+
+```{r}
+f <- frosting() %>%
+    layer_predict() %>%
+    # layer_add_target_date() %>% ## NOTE: this gives 2022-01-21 instead of 2022-01-07?
+    layer_add_target_date("2022-01-07") %>% # max(jhu$time_value) + 7
+    layer_threshold(.pred, lower = 0) %>%
+    layer_residual_quantiles(probs = c(0.0275, 0.975), symmetrize = FALSE) %>%
+    layer_naomit(.pred)  %>%
+    layer_population_scaling(.pred, df = pop_dat,
+                             by = c("geo_value"),
+                             df_pop_col = "value")
+                             ## NOTE: how to scale `.pred_distn` if it is type `dist`
+                             ## NOTE: need to add `dist` handling in `layer_population_scaling`?
+
+wf <- epi_workflow(r, parsnip::linear_reg()) %>%
+    fit(jhu) %>%
+    add_frosting(f)
+
+latest <- get_test_data(recipe = r, x = jhu)
+predict(wf, latest)
+
+# lapply(nested_quantiles(predict(wf, latest)$.pred_distn), `[[`, 1)
+```
+
+The point estimate of death counts on Jan 7, 2022 is returned in column 
+`.pred_original`. 
+
+Last but not least, let's take a look at the regression fit and check the 
+coefficients:
+```{r, echo =FALSE}
+wf$fit$fit
+```
+
 ## Classification
 
 ## Reference

From ab5a288053dde1872d6ceceabdbe16562b78509d Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Tue, 23 Aug 2022 17:08:32 -0700
Subject: [PATCH 19/29] hotspot model not compatible with epi_workflow?

---
 vignettes/preprocessing-and-models.Rmd | 132 ++++++++++++++++++++++---
 1 file changed, 121 insertions(+), 11 deletions(-)

diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd
index 59ea97a19..5073a2214 100644
--- a/vignettes/preprocessing-and-models.Rmd
+++ b/vignettes/preprocessing-and-models.Rmd
@@ -26,6 +26,9 @@ steps from the `recipes` package for more feature engineering.
 
 In this vignette, we will illustrate some examples of how to use `epipredict`
 with `recipes` and `parsnip` for different purposes of epidemiology forecasting.
+We will focus on simple basic autoregressive models, in which COVID cases and 
+deaths in the near future are predicted using a linear combination of cases and 
+deaths in the near past.
 
 The remaining vignette will be split into three sections. The first section, we 
 will use a Poisson regression to predict death counts. In the second section,
@@ -207,11 +210,11 @@ regression, which we will illustrate in the next section.
 
 ## Linear Regression 
 
-The CDC requires submission of case and death count predictions. However, it is
-often the case that rate data is easier to model as the outcome as it need not 
-be an integer. We can use a liner regression to predict the death rates and
-use state population data to scale the rates to counts. We will do so using 
-`layer_population_scaling()` from the `epipredict` package. 
+For COVID-19, the CDC requires submission of case and death count predictions. 
+However, it is often the case that rate data is easier to model as the outcome 
+as it need not be an integer. We can use a liner regression to predict the death
+rates and use state population data to scale the rates to counts. We will do so 
+using `layer_population_scaling()` from the `epipredict` package. 
 
 Additionally, when forecasts are submitted, prediction intervals should be 
 provided along with the point estimates. This can be obtained via postprocessing
@@ -323,11 +326,9 @@ f <- frosting() %>%
     layer_threshold(.pred, lower = 0) %>%
     layer_residual_quantiles(probs = c(0.0275, 0.975), symmetrize = FALSE) %>%
     layer_naomit(.pred)  %>%
-    layer_population_scaling(.pred, df = pop_dat,
+    layer_population_scaling(.pred, .pred_distn, df = pop_dat,
                              by = c("geo_value"),
                              df_pop_col = "value")
-                             ## NOTE: how to scale `.pred_distn` if it is type `dist`
-                             ## NOTE: need to add `dist` handling in `layer_population_scaling`?
 
 wf <- epi_workflow(r, parsnip::linear_reg()) %>%
     fit(jhu) %>%
@@ -335,12 +336,12 @@ wf <- epi_workflow(r, parsnip::linear_reg()) %>%
 
 latest <- get_test_data(recipe = r, x = jhu)
 predict(wf, latest)
-
-# lapply(nested_quantiles(predict(wf, latest)$.pred_distn), `[[`, 1)
+nested_quantiles(predict(wf, latest)$.pred_distn_original)
 ```
 
 The point estimate of death counts on Jan 7, 2022 is returned in column 
-`.pred_original`. 
+`.pred_original`. The predictions based on residual quantiles is stored in
+`.pred_distn_original` and also scaled to count data. 
 
 Last but not least, let's take a look at the regression fit and check the 
 coefficients:
@@ -350,6 +351,115 @@ wf$fit$fit
 
 ## Classification
 
+Sometimes predictive model for future trends are of interest. In other words, 
+the target is to predict if the future will have increasing case rates(up),
+decreasing case rates (down), or flat case rates (flat). Such model are often 
+referred to as hotspot prediction models. We will refer to the model notations 
+from McDonald et al. and illustrate how modeling can be streamlined in our 
+framework with other classifcations. 
+
+Hotspot prediction predicts a categorical variable defined in terms of the 
+relative change of $Y_{\ell, t+a}$ compared to $Y_{\ell, t}$. 
+Where $Y_{\ell, t}$ denotes the case rates in location $\ell$ at time $t$. 
+We define the response variables as follows:
+
+$$
+ Z_{\ell, t}=
+    \begin{cases}
+      \text{up}, & \text{if}\ Y^{\Delta}_{\ell, t} > 0.25 \\ 
+      \text{down}, & \text{if}\  Y^{\Delta}_{\ell, t} < -0.25\\
+      \text{flat}, & \text{otherwise}
+    \end{cases}
+$$
+where $Y^{\Delta}_{\ell, t} = (Y_{\ell, t}- Y_{\ell, t-7})/(Y_{\ell, t-7})$. 
+We say location $\ell$ is a hotspot at time $t$ when $Z_{\ell,t}$ is categorized
+as 'up', meaning the number of newly reported cases over the past 7 days has 
+increased by at least 25% compared to the preceding week. w\When $Z_{\ell,t}$ 
+is categorized as 'down', it suggests that there has been at least a 25% 
+decrease in newly reported cases over the past 7 days. Otherwise, we will 
+consider the trends to be flat. 
+
+<!-- 
+
+Preprocessing steps are similar to the previous models with an additional step 
+of categorizing the response variables. 
+
+We will take a subset of death rate and case rate data from our built-in dataset 
+`case_death_rate_subset`.
+```{r}
+jhu <- case_death_rate_subset %>%
+      dplyr::filter(time_value >= "2021-06-04", 
+                    time_value <= "2021-12-31",
+      geo_value %in% c("ca","fl","tx","ny","nj")) %>%
+      select(geo_value, time_value, case_rate) %>%
+      mutate(geo_value_factor = as.factor(geo_value)) %>%
+      full_join(behav_ind, by = c("geo_value", "time_value")) %>%
+      as_epi_df()
+
+binner <- function(x) {
+  x <- cut(x, breaks = c(-Inf, -0.25, 0.25, Inf), include.lowest = TRUE)
+  # now return the group number
+  as.numeric(x)
+}
+
+inc <- c("down", "flat", "up")
+
+
+r <- epi_recipe(jhu) %>%
+     # add_role(time_value, new_role = "predictor") %>%
+      step_dummy(geo_value_factor) %>%
+      step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
+      step_epi_ahead(case_rate, ahead = 7) %>%
+      step_mutate(pct_diff1 = 
+      (lag_0_case_rate - lag_7_case_rate)/lag_7_case_rate) %>%
+      step_mutate(pct_diff2 = 
+      (lag_7_case_rate - lag_14_case_rate)/lag_14_case_rate) %>%
+      step_epi_naomit() %>%
+      step_mutate(trend = pct_diff1) %>%
+      step_num2factor(trend, 
+                  role = "outcome", 
+                  transform = binner, 
+                  levels = inc) %>%
+      step_rm(case_rate,
+              lag_0_case_rate, 
+              lag_7_case_rate, 
+              lag_14_case_rate, 
+              ahead_7_case_rate) %>%
+      step_naomit(all_predictors(), skip = FALSE) %>%
+      step_naomit(all_outcomes(), skip = FALSE) %>%
+      prep()
+
+
+new_dat = bake(r, new_data = jhu)
+```
+
+We will use linear discriminant analysis and take a look at the predictions:
+```{r}
+library(discrim)
+
+lda = discrim_linear() %>% 
+  set_engine("MASS") 
+
+epi_workflow(r, spec= lda) %>%
+    parsnip::fit(data = jhu)
+
+# wf <- workflow() %>% 
+#   add_recipe(r) %>% 
+#   add_model(lda)
+
+# latest <- get_test_data(recipe = r, x = jhu)
+# the_fit  <- fit(wf, data = latest)
+
+# predict(wf, latest)
+
+using_formula <-
+  discrim_linear() %>% 
+  set_engine("MASS") %>%
+  fit_xy(x = new_dat%>% 
+  select(-trend,-mask_indicator,-distancing_indicator,-time_value,-geo_value),
+          y= new_dat$trend)
+``` -->
+
 ## Reference
 * McDonald, Daniel J., et al. "Can auxiliary indicators improve COVID-19 
 forecasting and hotspot prediction?." Proceedings of the National Academy of 

From a38fb56fe783a71a4d507e6fc3c25e50329347f2 Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Tue, 23 Aug 2022 17:18:13 -0700
Subject: [PATCH 20/29] fix minor commenting bug

---
 vignettes/preprocessing-and-models.Rmd | 28 ++++++++++++++------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd
index 5073a2214..9d38d90e0 100644
--- a/vignettes/preprocessing-and-models.Rmd
+++ b/vignettes/preprocessing-and-models.Rmd
@@ -379,6 +379,7 @@ is categorized as 'down', it suggests that there has been at least a 25%
 decrease in newly reported cases over the past 7 days. Otherwise, we will 
 consider the trends to be flat. 
 
+TODO: try logistic regression on two categories and see if it works?
 <!-- 
 
 Preprocessing steps are similar to the previous models with an additional step 
@@ -434,14 +435,14 @@ new_dat = bake(r, new_data = jhu)
 ```
 
 We will use linear discriminant analysis and take a look at the predictions:
-```{r}
-library(discrim)
+# ```{r}
+# library(discrim)
 
-lda = discrim_linear() %>% 
-  set_engine("MASS") 
+# lda = discrim_linear() %>% 
+#   set_engine("MASS") 
 
-epi_workflow(r, spec= lda) %>%
-    parsnip::fit(data = jhu)
+# epi_workflow(r, spec= lda) %>%
+#     parsnip::fit(data = jhu)
 
 # wf <- workflow() %>% 
 #   add_recipe(r) %>% 
@@ -452,13 +453,14 @@ epi_workflow(r, spec= lda) %>%
 
 # predict(wf, latest)
 
-using_formula <-
-  discrim_linear() %>% 
-  set_engine("MASS") %>%
-  fit_xy(x = new_dat%>% 
-  select(-trend,-mask_indicator,-distancing_indicator,-time_value,-geo_value),
-          y= new_dat$trend)
-``` -->
+# using_formula <-
+#   discrim_linear() %>% 
+#   set_engine("MASS") %>%
+#   fit_xy(x = new_dat%>% 
+#   select(-trend,-mask_indicator,-distancing_indicator,-time_value,-geo_value),
+#           y= new_dat$trend)
+#``` 
+-->
 
 ## Reference
 * McDonald, Daniel J., et al. "Can auxiliary indicators improve COVID-19 

From ecf075511a235b58582a403374f1431aad410232 Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Wed, 24 Aug 2022 11:07:01 -0700
Subject: [PATCH 21/29] fix error

---
 vignettes/preprocessing-and-models.Rmd | 64 ++++++--------------------
 1 file changed, 15 insertions(+), 49 deletions(-)

diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd
index 9d38d90e0..6f69cda91 100644
--- a/vignettes/preprocessing-and-models.Rmd
+++ b/vignettes/preprocessing-and-models.Rmd
@@ -367,20 +367,17 @@ $$
  Z_{\ell, t}=
     \begin{cases}
       \text{up}, & \text{if}\ Y^{\Delta}_{\ell, t} > 0.25 \\ 
-      \text{down}, & \text{if}\  Y^{\Delta}_{\ell, t} < -0.25\\
-      \text{flat}, & \text{otherwise}
+      \text{other}, & \text{otherwise}
     \end{cases}
 $$
 where $Y^{\Delta}_{\ell, t} = (Y_{\ell, t}- Y_{\ell, t-7})/(Y_{\ell, t-7})$. 
 We say location $\ell$ is a hotspot at time $t$ when $Z_{\ell,t}$ is categorized
 as 'up', meaning the number of newly reported cases over the past 7 days has 
-increased by at least 25% compared to the preceding week. w\When $Z_{\ell,t}$ 
-is categorized as 'down', it suggests that there has been at least a 25% 
-decrease in newly reported cases over the past 7 days. Otherwise, we will 
-consider the trends to be flat. 
+increased by at least 25% compared to the preceding week. 
+
+TODO: add model expression here. (after code works?)
 
 TODO: try logistic regression on two categories and see if it works?
-<!-- 
 
 Preprocessing steps are similar to the previous models with an additional step 
 of categorizing the response variables. 
@@ -388,29 +385,20 @@ of categorizing the response variables.
 We will take a subset of death rate and case rate data from our built-in dataset 
 `case_death_rate_subset`.
 ```{r}
-jhu <- case_death_rate_subset %>%
-      dplyr::filter(time_value >= "2021-06-04", 
-                    time_value <= "2021-12-31",
-      geo_value %in% c("ca","fl","tx","ny","nj")) %>%
-      select(geo_value, time_value, case_rate) %>%
-      mutate(geo_value_factor = as.factor(geo_value)) %>%
-      full_join(behav_ind, by = c("geo_value", "time_value")) %>%
-      as_epi_df()
-
 binner <- function(x) {
-  x <- cut(x, breaks = c(-Inf, -0.25, 0.25, Inf), include.lowest = TRUE)
+  x <- cut(x, breaks = c(-Inf, 0.25, Inf), include.lowest = TRUE)
   # now return the group number
   as.numeric(x)
 }
 
-inc <- c("down", "flat", "up")
+inc <- c("other", "up")
 
 
 r <- epi_recipe(jhu) %>%
-     # add_role(time_value, new_role = "predictor") %>%
+      add_role(time_value, new_role = "predictor") %>%
       step_dummy(geo_value_factor) %>%
       step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
-      step_epi_ahead(case_rate, ahead = 7) %>%
+      step_epi_ahead(case_rate, ahead = 7, role = "predictor") %>%
       step_mutate(pct_diff1 = 
       (lag_0_case_rate - lag_7_case_rate)/lag_7_case_rate) %>%
       step_mutate(pct_diff2 = 
@@ -420,47 +408,25 @@ r <- epi_recipe(jhu) %>%
       step_num2factor(trend, 
                   role = "outcome", 
                   transform = binner, 
-                  levels = inc) %>%
+                  levels = inc)  %>%
       step_rm(case_rate,
               lag_0_case_rate, 
               lag_7_case_rate, 
               lag_14_case_rate, 
-              ahead_7_case_rate) %>%
-      step_naomit(all_predictors(), skip = FALSE) %>%
-      step_naomit(all_outcomes(), skip = FALSE) %>%
-      prep()
+              ahead_7_case_rate) 
+                  
 
-
-new_dat = bake(r, new_data = jhu)
 ```
 
 We will use linear discriminant analysis and take a look at the predictions:
-# ```{r}
-# library(discrim)
-
-# lda = discrim_linear() %>% 
-#   set_engine("MASS") 
-
-# epi_workflow(r, spec= lda) %>%
-#     parsnip::fit(data = jhu)
 
-# wf <- workflow() %>% 
-#   add_recipe(r) %>% 
-#   add_model(lda)
+```{r, eval=FALSE, echo=FALSE}
+# wf <- epi_workflow(r, parsnip::logistic_reg()) %>%
+#     fit(jhu)
 
 # latest <- get_test_data(recipe = r, x = jhu)
-# the_fit  <- fit(wf, data = latest)
-
 # predict(wf, latest)
-
-# using_formula <-
-#   discrim_linear() %>% 
-#   set_engine("MASS") %>%
-#   fit_xy(x = new_dat%>% 
-#   select(-trend,-mask_indicator,-distancing_indicator,-time_value,-geo_value),
-#           y= new_dat$trend)
-#``` 
--->
+``` 
 
 ## Reference
 * McDonald, Daniel J., et al. "Can auxiliary indicators improve COVID-19 

From 63be91964df1335880acf52e0ead24d21e2da734 Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Thu, 25 Aug 2022 18:30:06 -0700
Subject: [PATCH 22/29] classification working

---
 vignettes/preprocessing-and-models.Rmd | 93 +++++++++++++++-----------
 1 file changed, 54 insertions(+), 39 deletions(-)

diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd
index 6f69cda91..82f2b4a92 100644
--- a/vignettes/preprocessing-and-models.Rmd
+++ b/vignettes/preprocessing-and-models.Rmd
@@ -321,7 +321,6 @@ obtain the death counts instead of death rates.
 ```{r}
 f <- frosting() %>%
     layer_predict() %>%
-    # layer_add_target_date() %>% ## NOTE: this gives 2022-01-21 instead of 2022-01-07?
     layer_add_target_date("2022-01-07") %>% # max(jhu$time_value) + 7
     layer_threshold(.pred, lower = 0) %>%
     layer_residual_quantiles(probs = c(0.0275, 0.975), symmetrize = FALSE) %>%
@@ -335,8 +334,20 @@ wf <- epi_workflow(r, parsnip::linear_reg()) %>%
     add_frosting(f)
 
 latest <- get_test_data(recipe = r, x = jhu)
-predict(wf, latest)
-nested_quantiles(predict(wf, latest)$.pred_distn_original)
+p <- predict(wf, latest)
+p
+```
+
+To look at the prediction intervals:
+```{r}
+# method 1
+nested_quantiles(p$.pred_distn_original)
+# method 2
+dstn = p$.pred_distn_original
+q = distributional::parameters(dstn)$q # a list of length nrow(p)
+q
+tau = distributional::parameters(dstn)$tau
+tau
 ```
 
 The point estimate of death counts on Jan 7, 2022 is returned in column 
@@ -355,8 +366,8 @@ Sometimes predictive model for future trends are of interest. In other words,
 the target is to predict if the future will have increasing case rates(up),
 decreasing case rates (down), or flat case rates (flat). Such model are often 
 referred to as hotspot prediction models. We will refer to the model notations 
-from McDonald et al. and illustrate how modeling can be streamlined in our 
-framework with other classifcations. 
+from McDonald et al. but extend the application to predict three categories 
+instead of two. 
 
 Hotspot prediction predicts a categorical variable defined in terms of the 
 relative change of $Y_{\ell, t+a}$ compared to $Y_{\ell, t}$. 
@@ -367,17 +378,18 @@ $$
  Z_{\ell, t}=
     \begin{cases}
       \text{up}, & \text{if}\ Y^{\Delta}_{\ell, t} > 0.25 \\ 
-      \text{other}, & \text{otherwise}
+      \text{down}, & \text{if}\  Y^{\Delta}_{\ell, t} < -0.25\\
+      \text{flat}, & \text{otherwise}
     \end{cases}
 $$
+
 where $Y^{\Delta}_{\ell, t} = (Y_{\ell, t}- Y_{\ell, t-7})/(Y_{\ell, t-7})$. 
 We say location $\ell$ is a hotspot at time $t$ when $Z_{\ell,t}$ is categorized
 as 'up', meaning the number of newly reported cases over the past 7 days has 
-increased by at least 25% compared to the preceding week. 
-
-TODO: add model expression here. (after code works?)
-
-TODO: try logistic regression on two categories and see if it works?
+increased by at least 25% compared to the preceding week. w\When $Z_{\ell,t}$ 
+is categorized as 'down', it suggests that there has been at least a 25% 
+decrease in newly reported cases over the past 7 days. Otherwise, we will 
+consider the trends to be flat. 
 
 Preprocessing steps are similar to the previous models with an additional step 
 of categorizing the response variables. 
@@ -385,50 +397,53 @@ of categorizing the response variables.
 We will take a subset of death rate and case rate data from our built-in dataset 
 `case_death_rate_subset`.
 ```{r}
-binner <- function(x) {
-  x <- cut(x, breaks = c(-Inf, 0.25, Inf), include.lowest = TRUE)
-  # now return the group number
-  as.numeric(x)
-}
-
-inc <- c("other", "up")
-
+jhu <- case_death_rate_subset %>%
+  dplyr::filter(time_value >= "2021-06-04", 
+                time_value <= "2021-12-31",
+  geo_value %in% c("ca","fl","tx","ny","nj")) %>%
+  mutate(geo_value_factor = as.factor(geo_value)) %>%
+  as_epi_df()
 
 r <- epi_recipe(jhu) %>%
       add_role(time_value, new_role = "predictor") %>%
       step_dummy(geo_value_factor) %>%
       step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
       step_epi_ahead(case_rate, ahead = 7, role = "predictor") %>%
-      step_mutate(pct_diff1 = 
-      (lag_0_case_rate - lag_7_case_rate)/lag_7_case_rate) %>%
-      step_mutate(pct_diff2 = 
-      (lag_7_case_rate - lag_14_case_rate)/lag_14_case_rate) %>%
-      step_epi_naomit() %>%
-      step_mutate(trend = pct_diff1) %>%
-      step_num2factor(trend, 
-                  role = "outcome", 
-                  transform = binner, 
-                  levels = inc)  %>%
-      step_rm(case_rate,
+      step_mutate(pct_diff1 =  case_when(lag_7_case_rate == 0 ~ 0, 
+      TRUE ~ (lag_0_case_rate - lag_7_case_rate)/lag_7_case_rate) ) %>%
+      step_mutate(pct_diff2 = case_when(lag_14_case_rate == 0 ~ 0,
+      TRUE ~ (lag_7_case_rate - lag_14_case_rate)/lag_14_case_rate)) %>%
+      step_mutate(trend = case_when(pct_diff1 < -0.25 ~ "down",
+                                    pct_diff1 > 0.25 ~ "up",
+                                    TRUE ~ "flat"), 
+                                    role = "outcome") %>% 
+      step_rm(death_rate,
+              case_rate,
               lag_0_case_rate, 
               lag_7_case_rate, 
               lag_14_case_rate, 
-              ahead_7_case_rate) 
+              ahead_7_case_rate) %>%
+      step_epi_naomit()
                   
-
 ```
 
-We will use linear discriminant analysis and take a look at the predictions:
+We will use a multinomial regression and take a look at the predictions:
 
-```{r, eval=FALSE, echo=FALSE}
-# wf <- epi_workflow(r, parsnip::logistic_reg()) %>%
-#     fit(jhu)
+```{r, warning=FALSE}
+wf <- epi_workflow(r, parsnip::multinom_reg()) %>%
+    fit(jhu)
+
+latest <- get_test_data(recipe = r, x = jhu)
+predict(wf, latest) %>% filter(!is.na(.pred_class))
+```
 
-# latest <- get_test_data(recipe = r, x = jhu)
-# predict(wf, latest)
-``` 
+We can also take a look at the fit:
+```{r}
+wf$fit$fit
+```
 
 ## Reference
+
 * McDonald, Daniel J., et al. "Can auxiliary indicators improve COVID-19 
 forecasting and hotspot prediction?." Proceedings of the National Academy of 
 Sciences 118.51 (2021): e2111453118.

From 14a8a22b13612648083ec9bce03b80c4067d4069 Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Fri, 26 Aug 2022 10:28:09 -0700
Subject: [PATCH 23/29] show how to use formula

---
 vignettes/preprocessing-and-models.Rmd | 43 +++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd
index 82f2b4a92..266f8fbc1 100644
--- a/vignettes/preprocessing-and-models.Rmd
+++ b/vignettes/preprocessing-and-models.Rmd
@@ -391,6 +391,32 @@ is categorized as 'down', it suggests that there has been at least a 25%
 decrease in newly reported cases over the past 7 days. Otherwise, we will 
 consider the trends to be flat. 
 
+The expression of the multinomial regression we will model is as follows:
+
+$$
+\pi_{j}(x) = \text{Pr}(Y = j|x) = \frac{e^{g_j(x)}}{\sum_{k=0}^2 g_j(x) }
+$$
+where $j$ is either down(Y=0), flat(Y=1), or up (Y=2),
+$$g_0(x) = 0$$
+
+\begin{aligned}
+g_1(x)= \text{ln}(\frac{Pr(Y=1|x)}{Pr(Y=0|x)}) = 
+\beta_{10} + \beta_{11}* x_{\text{time_value}} + \delta_10 s_{\text{state_1}} +
+\delta_11 s_{\text{state_2}} + ... \nonumber \\
++ \beta_{12} Y^{\Delta}_{\ell, t} +
+\beta_{13} Y^{\Delta}_{\ell, t-7}
+\end{aligned}
+
+\begin{aligned}
+g_2(x)= \text{ln}(\frac{Pr(Y=2|x)}{Pr(Y=0|x)}) = 
+\beta_{20} + \beta_{21}* x_{\text{time_value}} + \delta_20 s_{\text{state_1}} +
+\delta_21 s_{\text{state_2}} + ... \nonumber \\
++ \beta_{22} Y^{\Delta}_{\ell, t} +
+\beta_{23} Y^{\Delta}_{\ell, t-7}
+\end{aligned}
+
+
+
 Preprocessing steps are similar to the previous models with an additional step 
 of categorizing the response variables. 
 
@@ -427,7 +453,7 @@ r <- epi_recipe(jhu) %>%
                   
 ```
 
-We will use a multinomial regression and take a look at the predictions:
+We will fit the multinomial regression and take a look at the predictions:
 
 ```{r, warning=FALSE}
 wf <- epi_workflow(r, parsnip::multinom_reg()) %>%
@@ -442,6 +468,21 @@ We can also take a look at the fit:
 wf$fit$fit
 ```
 
+One could also use formula in `epi_recipe()` to achieve the same results as 
+above. However, only one of `add_formula()`, `add_recipe()`, or 
+`workflow_variables()` must be specified. For the purpose of using `add_formula`
+over `add_recipes`, we will prep and bake our recipe to return a dataset that 
+will be used for model fitting.
+```{r}
+b <- bake(prep(r,jhu),jhu)
+m_reg <-  parsnip::multinom_reg()
+
+epi_workflow() %>%
+add_formula(trend ~ time_value + geo_value + pct_diff1 + pct_diff2 )%>%
+add_model(m_reg) %>%
+fit(data = b)
+```
+
 ## Reference
 
 * McDonald, Daniel J., et al. "Can auxiliary indicators improve COVID-19 

From b91bc99c34d9b391502a898dc84f5491b60fd254 Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Fri, 26 Aug 2022 13:49:09 -0700
Subject: [PATCH 24/29] addressed all feedbacks

---
 vignettes/more-recipes.Rmd             | 277 -------------------------
 vignettes/preprocessing-and-models.Rmd |  74 ++++++-
 2 files changed, 66 insertions(+), 285 deletions(-)
 delete mode 100644 vignettes/more-recipes.Rmd

diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd
deleted file mode 100644
index dee09fd1d..000000000
--- a/vignettes/more-recipes.Rmd
+++ /dev/null
@@ -1,277 +0,0 @@
----
-title: Apply other steps and models from recipes and parsnip packages
-output: rmarkdown::html_vignette
-vignette: >
-  %\VignetteIndexEntry{Apply other steps and models from recipes and parsnip packages}
-  %\VignetteEngine{knitr::rmarkdown}
-  %\VignetteEncoding{UTF-8}
----
-The `epipredict` package utilizes the `tidymodels` framework, namely 
-[`recipes`](https://recipes.tidymodels.org/) for 
-[dplyr](https://dplyr.tidyverse.org/)-like pipeable sequences 
-of feature engineering and [`parsnip`](https://parsnip.tidymodels.org/) for a 
-unified interface to a range of models. 
-
-`epipredict` has additional customized feature engineering and preprocessing steps 
-tailored to fit the needs of epidemiology forecasting, such as `step_epi_shift()`, 
-`step_population_scaling()`, `step_epi_naomit()`. They can be used along with 
-steps from the `recipes` package for more feature engineering. 
-
-In this vignette, we will demonstrate how to weave `recipes` steps and `parsnip`
-models into the `epipredict` workflow. 
-
-First, let's load the necessary packages:
-
-```{r, warning=FALSE, message =FALSE}
-library(epipredict)
-library(epiprocess)
-library(recipes)
-library(parsnip)
-library(workflows)
-library(poissonreg)
-```
-
-We will use the built-in dataset `case_death_rate_subset` and apply feature 
-engineering steps and fit a few models from `parsnip` to predict future death 
-rates by states 7 days ahead from Dec 31, 2021.
-
-```{r}
-x <- case_death_rate_subset
-x$geo_value <- as.factor(x$geo_value)
-tail(x)
-```
-There are `r dim(x)[1]` rows and `r dim(x)[2]` columns in the data, with dates
-ranging from `r min(x$time_value)` to `r max(x$time_value)` in 
-`r n_distinct(x$geo_value)` states in the U.S.
-
-# Preprocessing and Feature Engineering
-
-In this section, we focus on showing how `step_*()` functions from the `recipes`
-package can also be applied in our `epipredict` workflows. 
-
-First, create a recipe object from the original data and then specify the 
-preprocessing steps. Steps are then sequentially added.
-
-`recipes` package also allows [manual alterations of roles](https://recipes.tidymodels.org/reference/roles.html) 
-in bulk. There are a couple handy functions that can be used together to help us 
-manipulate variable roles easily. 
-
-`update_role()` alters an existing role in the recipe or assigns an initial role 
-to variables that do not yet have a declared role.
-
-`add_role()` adds an additional role to variables that already have a role in 
-the recipe, without overwriting old roles.
-
-`remove_role()` eliminates a single existing role in the recipe.
-
-Here, we will add an additional `predictor` role to both `case_rate` and 
-`death_rate` variables. They currently have roles as `raw`.
-
-```{r create-recipe-object}
-r <- epi_recipe(x) %>% 
-  add_role(- epi_keys(x), new_role = "predictor") 
-```
-
-The function automatically categorizes roles into `geo_value`, `time_value` and 
-`raw` based on variable names, where `raw` includes `case_rate` and `death_rate`. 
-
-We first create a made-up population dataset by states to allow the conversion
-from `case_rate` and `death_rate` into `cases` and  `deaths`. The binary variable
-`social_distancing` indicates whether the state followed social distancing rules.
-A 'throw-away' column with all ones is also created to show that `step_rm()` 
-can be used to remove unwanted columns.
-```{r madeup-population-data}
-pop_data <- data.frame(states = unique(x$geo_value),
-                       inv_pop = 1/seq(1e5, 1e7, 
-                                       length.out = n_distinct(x$geo_value)),
-                       social_distancing = rbinom(n = n_distinct(x$geo_value), 
-                                                  size = 1,
-                                                  prob = 0.5),
-                       throw_away = 1)
-
-head(pop_data)
-```
-
-Next, we will feature engineer `case_rate` and `death_rate` into `cases` and 
-`deaths`and create lags and leads based on them. 
-
-```{r create-new-columns-in-recipe}
-r <- r %>%
-  step_population_scaling(case_rate, death_rate, df = pop_data, 
-                          by = c("geo_value" = "states"),
-                          create_new = FALSE,
-                          df_pop_col = "inv_pop") %>%
-  step_rename(cases = case_rate, deaths = death_rate) %>%
-  step_rm(throw_away) %>%
-  step_epi_lag(cases, lag = c(0, 7, 14)) %>%
-  step_epi_lag(deaths, lag = c(0, 7, 14)) %>%
-  step_epi_ahead(deaths, ahead = 7, role = "outcome") %>%
-  step_epi_naomit() %>%
-  step_rm(cases, deaths)
-
-```
-
-Notice that `step_rename()` was used for renaming of the variables, and 
-`step_rm()` was used to remove redundant columns.
-
-A quick view of what columns exist in the the prepossessed data so far:
-
-```{r peak-data-1}
-names(bake(prep(r, x),x))
-```
-
-
-There are also many functions in the `recipes` package that allow for 
-[function transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations),
-such as log transformations and spline transformations. In our case, we will 
-center the numerical predictors to allow for meaningful interpretation of the 
-intercept. 
-
-```{r}
-r <- r %>% step_center(contains("lag"), role = "predictor")
-```
-
-Since the current outcomes are positive, but the linear regression assumes the 
-outcome to be in full support, we will consider a log-transformation with an 
-offset value of 1 to ensure `log(0)` does not occur. 
-
-```{r}
-r_lin <- r %>% step_log(ahead_7_deaths, role = "outcome", offset = 1)
-```
-
-Other useful functionalities include creating a set of binary dummy variables 
-from a factor variable. As an illustration that will not affect the models, we 
-will show how states in `geo_value` become dummy variables so that each state 
-as a dummy variable has a differential intercept coefficient.
-
-```{r dummy-variable}
-length(unique(x$geo_value))
-
-dummies <- r %>% 
-  step_dummy(geo_value) %>%
-  prep(training = x) 
-
-dummy_data <- bake(dummies, x)
-
-names(dummy_data)
-```
-
-Once the pre-processing steps are done, we can move on to model fitting using 
-the `parsnip` package. 
-
-
-# Model Fitting and Post-processing
-
-Sometimes, the exact values are desired, therefore fitting a linear regression 
-or poisson regression would be a good idea. 
-
-On the other hand, say instead of predicting 7-day ahead death counts, we would 
-like to classify if the future predictions are high or low based on some given threshold. 
-In this case, a logistic regression among other classification models can be used.
-For the purpose of illustration, we define two categories: less than 10K(low) 
-and more than 10K(high). 
-
-We resort to the `recipes` package again to change the numerical outcomes to 
-categorical for the purpose of classification:
-
-```{r, warning = FALSE}
-binner <- function(x) {
-  x <- cut(x, breaks = c(0, 10000, Inf), include.lowest = TRUE)
-  # now return the group number
-  as.numeric(x)
-}
-
-inc <- c("low", "high")
-
-r_categorical <- r %>% 
-  step_num2factor(ahead_7_deaths, 
-                  role = "outcome", 
-                  transform = binner, 
-                  levels = inc)
-
-```
-
-As a sanity check we can see now that the preprocessed data has a categorical 
-outcome in `ahead_7_deaths`. 
-
-```{r, warning FALSE, echo = FALSE}
-glimpse(slice_sample(bake(prep(r_categorical, x),x), n = 6))
-```
-
-
-### Linear Regression
-
-First, we can try fitting a linear regression:
-
-```{r linear-fit-workflow, warning = FALSE}
-wf <- epi_workflow(r_lin, parsnip::linear_reg()) %>%
-    fit(x) 
-
-latest <- get_test_data(recipe = r_lin, x = x)
-```
-
-We take a look at the linear regression fit and the values of the coefficients.
-
-```{r linear-fit, warning = FALSE}
-wf$fit$fit # look at the model fit and coefficients
-
-p <- predict(wf, latest) %>% filter(!is.na(.pred))
-
-p
-```
-
-The predictions are saved in the `.pred` column and represents the log-transformed 
-predicted death counts 7 days from Dec 31. 2021. 
-
-
-### Poisson regression
-
-For count data, we can use poisson regression. Although the counts here are not 
-integer since they were calculated from case rates multiplying population, some 
-extra preprocessing steps can help us round them to integers to allow fitting a 
-poisson regression.
-
-
-```{r poisson-fit}
-r <- r %>% 
-  step_mutate(ahead_7_deaths = round(ahead_7_deaths, 0), 
-                       role = "predictor") %>%
-  step_filter(ahead_7_deaths > 0)
-
-latest <- get_test_data(recipe = r, x = x)
-
-wf <- epi_workflow(r, parsnip::poisson_reg()) %>%
-    fit(x) 
-
-predict(wf, latest) %>% filter(!is.na(.pred))
-```
-
-Let's also take a look at the poisson regression fit:
-```{r, echo =FALSE}
-wf$fit$fit
-```
-
-### Logistic Regression
-
-We use a logistic regression to predict whether 7-day ahead death counts are high
-or low. 
-```{r logistic, warning = FALSE}
-wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>%
-    fit(x) 
-latest <- get_test_data(recipe = r_categorical, x = x)
-p <- predict(wf, latest) %>% filter(!is.na(.pred_class))
-p
-```
-
-The model fit is as follows:
-```{r echo=FALSE}
-wf$fit$fit
-```
-
-# Attribution
-
-This object contains a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API.](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html)
-
-This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University 
-on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020.
-
diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd
index 266f8fbc1..673b3cf9f 100644
--- a/vignettes/preprocessing-and-models.Rmd
+++ b/vignettes/preprocessing-and-models.Rmd
@@ -321,7 +321,7 @@ obtain the death counts instead of death rates.
 ```{r}
 f <- frosting() %>%
     layer_predict() %>%
-    layer_add_target_date("2022-01-07") %>% # max(jhu$time_value) + 7
+    layer_add_target_date("2022-01-07") %>% 
     layer_threshold(.pred, lower = 0) %>%
     layer_residual_quantiles(probs = c(0.0275, 0.975), symmetrize = FALSE) %>%
     layer_naomit(.pred)  %>%
@@ -401,16 +401,16 @@ $$g_0(x) = 0$$
 
 \begin{aligned}
 g_1(x)= \text{ln}(\frac{Pr(Y=1|x)}{Pr(Y=0|x)}) = 
-\beta_{10} + \beta_{11}* x_{\text{time_value}} + \delta_10 s_{\text{state_1}} +
-\delta_11 s_{\text{state_2}} + ... \nonumber \\
+\beta_{10} + \beta_{11}* x_{\text{time_value}} + \delta_{10} s_{\text{state_1}} +
+\delta_{11} s_{\text{state_2}} + ... \nonumber \\
 + \beta_{12} Y^{\Delta}_{\ell, t} +
 \beta_{13} Y^{\Delta}_{\ell, t-7}
 \end{aligned}
 
 \begin{aligned}
 g_2(x)= \text{ln}(\frac{Pr(Y=2|x)}{Pr(Y=0|x)}) = 
-\beta_{20} + \beta_{21}* x_{\text{time_value}} + \delta_20 s_{\text{state_1}} +
-\delta_21 s_{\text{state_2}} + ... \nonumber \\
+\beta_{20} + \beta_{21}* x_{\text{time_value}} + \delta_{20} s_{\text{state_1}} +
+\delta_{21} s_{\text{state_2}} + ... \nonumber \\
 + \beta_{22} Y^{\Delta}_{\ell, t} +
 \beta_{23} Y^{\Delta}_{\ell, t-7}
 \end{aligned}
@@ -478,16 +478,74 @@ b <- bake(prep(r,jhu),jhu)
 m_reg <-  parsnip::multinom_reg()
 
 epi_workflow() %>%
-add_formula(trend ~ time_value + geo_value + pct_diff1 + pct_diff2 )%>%
+add_formula(trend ~ time_value + geo_value + pct_diff1 + pct_diff2) %>%
 add_model(m_reg) %>%
 fit(data = b)
 ```
 
-## Reference
+## Benefits of Lagging and Leading in `epipredict`
 
-* McDonald, Daniel J., et al. "Can auxiliary indicators improve COVID-19 
+The `step_epi_ahead` and `step_epi_lag` functions in the `epipredict` package
+is handy for creating correct lags and leads for future predictions. 
+
+Let's start with a simple dataset and preprocessing:
+```{r}
+ex <- case_death_rate_subset %>%
+  dplyr::filter(time_value >= "2021-12-01", 
+                time_value <= "2021-12-31",
+  geo_value == "ca")
+
+dim(ex)
+```
+
+We want to predict death rates on 2022-01-07, which 7 days ahead from the last 
+available date in our dataset. 
+
+We will compare two methods of trying to create lags and leads:
+```{r}
+##########
+r1 <- epi_recipe(ex) %>%
+      step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
+      step_epi_lag(death_rate, lag = c(0, 7, 14)) %>%
+      step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>%
+      step_epi_naomit() %>%
+      prep()
+
+b1 <- bake(r1, ex)
+dim(b1)
+##########
+r2 <- epi_recipe(ex) %>%
+    step_mutate(lag0case_rate = lag(case_rate, 0),
+                lag7case_rate = lag(case_rate, 7),
+                lag14case_rate = lag(case_rate, 14),
+                lag0death_rate = lag(death_rate, 0),
+                lag7death_rate = lag(death_rate, 7),
+                lag14death_rate = lag(death_rate, 14),
+                ahead7death_rate = lead(death_rate, 7)) %>%
+    step_epi_naomit() %>%
+                prep()
+
+b2 <- bake(r2, ex)
+dim(b2)
+```
+
+Notice the difference in number of rows `b1` and `b2` returns. This is because 
+the recipe that doesn't use `step_epi_ahead` and `step_epi_lag` is missing dates
+compared to the one that used the `epipredict` functions. Therefore the model 
+that is trained is predicting 7 days ahead from `r max(unique(b2$time_value))`
+instead of 7 days ahead from `r max(unique(b1$time_value))` which is what we want.
+```{r}
+unique(b1$time_value)
+unique(b2$time_value)
+```
+
+## References
+
+::: {#refs}
+McDonald, Daniel J., et al. "Can auxiliary indicators improve COVID-19 
 forecasting and hotspot prediction?." Proceedings of the National Academy of 
 Sciences 118.51 (2021): e2111453118.
+:::
 
 ## Attribution
 

From 73805e3010941c8989f3dde03aeec6aa30c75c40 Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Fri, 26 Aug 2022 14:10:47 -0700
Subject: [PATCH 25/29] fix layout

---
 vignettes/preprocessing-and-models.Rmd | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd
index 673b3cf9f..3729a3a9b 100644
--- a/vignettes/preprocessing-and-models.Rmd
+++ b/vignettes/preprocessing-and-models.Rmd
@@ -498,12 +498,11 @@ ex <- case_death_rate_subset %>%
 dim(ex)
 ```
 
-We want to predict death rates on 2022-01-07, which 7 days ahead from the last 
+We want to predict death rates on 2022-01-07, which is 7 days ahead from the last 
 available date in our dataset. 
 
 We will compare two methods of trying to create lags and leads:
 ```{r}
-##########
 r1 <- epi_recipe(ex) %>%
       step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
       step_epi_lag(death_rate, lag = c(0, 7, 14)) %>%
@@ -513,7 +512,8 @@ r1 <- epi_recipe(ex) %>%
 
 b1 <- bake(r1, ex)
 dim(b1)
-##########
+
+
 r2 <- epi_recipe(ex) %>%
     step_mutate(lag0case_rate = lag(case_rate, 0),
                 lag7case_rate = lag(case_rate, 7),
@@ -531,14 +531,23 @@ dim(b2)
 
 Notice the difference in number of rows `b1` and `b2` returns. This is because 
 the recipe that doesn't use `step_epi_ahead` and `step_epi_lag` is missing dates
-compared to the one that used the `epipredict` functions. Therefore the model 
-that is trained is predicting 7 days ahead from `r max(unique(b2$time_value))`
-instead of 7 days ahead from `r max(unique(b1$time_value))` which is what we want.
+compared to the one that used the `epipredict` functions.
 ```{r}
-unique(b1$time_value)
-unique(b2$time_value)
+dates_used_in_training1 <-
+ b1 %>% select(- ahead_7_death_rate) %>% na.omit() %>% select(time_value)
+
+dates_used_in_training1
+
+dates_used_in_training2 <- 
+  b2 %>% select(- ahead7death_rate) %>% na.omit() %>% select(time_value)
+
+dates_used_in_training2
 ```
 
+The model that is trained without the functions is predicting 7 days ahead from 
+`r max(dates_used_in_training2$time_value)`
+instead of 7 days ahead from `r max(dates_used_in_training1$time_value)`.
+
 ## References
 
 ::: {#refs}

From 2355aa31507f52f259f5c0d60e2c9950dbd35df9 Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <daniel@stat.ubc.ca>
Date: Tue, 13 Sep 2022 11:32:40 -0700
Subject: [PATCH 26/29] some simplifications, (may need to revert).

---
 vignettes/preprocessing-and-models.Rmd | 84 +++++++++++++-------------
 1 file changed, 41 insertions(+), 43 deletions(-)

diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd
index 3729a3a9b..4764ee8ac 100644
--- a/vignettes/preprocessing-and-models.Rmd
+++ b/vignettes/preprocessing-and-models.Rmd
@@ -36,7 +36,9 @@ we will use a linear regression to predict death rates. Last but not least, we
 will create a classification model for hotspot predictions. 
 
 ```{r, warning=FALSE, message=FALSE}
-library(covidcast)
+# library(covidcast)
+library(tidyr)
+library(dplyr)
 library(epidatr)
 library(epiprocess)
 library(epipredict)
@@ -62,33 +64,21 @@ for using the `epipredict` package with other existing tidymodel packages.
 ## NOTE: built-in data? or use epiprocess built-in data and save code here
 x <- covidcast(
   data_source = "jhu-csse",
-  signals = "confirmed_incidence_num",
+  signals = "confirmed_incidence_num,deaths_incidence_num",
   time_type = "day",
   geo_type = "state",
   time_values = epirange(20210604, 20211231),
-  geo_values = "ca,fl,tx,ny,nj"
-) %>%
-  fetch_tbl() %>%
-  select(geo_value, time_value, cases = value)
-
-y <- covidcast(
-  data_source = "jhu-csse",
-  signals = "deaths_incidence_num",
-  time_type = "day",
-  geo_type = "state",
-  time_values = epirange(20210604, 20211231),
-  geo_values = "ca,fl,tx,ny,nj"
-) %>%
-  fetch_tbl() %>%
-  select(geo_value, time_value, deaths = value)
+  geo_values = "ca,fl,tx,ny,nj") %>%
+  fetch_tbl() 
 
 case_death_counts_subset <- x %>%
-  full_join(y, by = c("geo_value", "time_value")) %>%
+  pivot_wider(names_from = signal, values_from = value) %>%
+  rename(cases = confirmed_incidence_num, deaths = deaths_incidence_num) %>%
+  select(geo_value, time_value, cases, deaths) %>%
   as_epi_df()
-
 ```
 
-The `case_death_counts_subset` dataset comes from the `covidcast` package,and 
+The `case_death_counts_subset` dataset comes from the `epidatr` package,and 
 contains the number of confirmed cases and deaths from June 4, 2021 to 
 Dec 31, 2021 in some U.S. states. 
 
@@ -115,30 +105,38 @@ data ready for modeling.
 But before diving into them, it will be helpful to understand what `roles` are
 in the `recipes` framework. 
 
-> `recipes` can assign one or more roles to each column in the data. The roles 
-> are not restricted to a predefined set; they can be anything. 
-> For most conventional situations, they are typically “predictor” and/or 
-> “outcome”. Additional roles enable targeted step operations on specific 
-> variables or groups of variables.
->
-> In our case, the role `predictor` is given to explanatory variable that go to the 
-> left-hand side of the model. The role `outcome` is the response variable 
-> that we wish to predict. `geo_value` and `time_value` are predefined roles 
-> that are unique to the `epipredict` package. Since we work with `epi_df` 
-> objects, all datasets should have `geo_value` and `time_value` passed through
-> automatically with these two roles assigned to the appropriate columns.
-> 
-> `recipes` package also allows [manual alterations of roles](https://recipes.tidymodels.org/reference/roles.html) 
-> in bulk. There are a couple handy functions that can be used together to help us 
-> manipulate variable roles easily. 
-> 
->> `update_role()` alters an existing role in the recipe or assigns an initial role 
->> to variables that do not yet have a declared role.
+---
+
+#### Aside on `recipes`
+
+`recipes` can assign one or more roles to each column in the data. The roles 
+are not restricted to a predefined set; they can be anything. 
+For most conventional situations, they are typically “predictor” and/or 
+"outcome". Additional roles enable targeted `step_*()` operations on specific 
+variables or groups of variables.
+
+In our case, the role `predictor` is given to explanatory variable that go to the 
+left-hand side of the model. The role `outcome` is the response variable 
+that we wish to predict. `geo_value` and `time_value` are predefined roles 
+that are unique to the `epipredict` package. Since we work with `epi_df` 
+objects, all datasets should have `geo_value` and `time_value` passed through
+automatically with these two roles assigned to the appropriate columns.
+ 
+The `recipes` package also allows [manual alterations of roles](https://recipes.tidymodels.org/reference/roles.html) 
+in bulk. There are a couple handy functions that can be used together to help us 
+manipulate variable roles easily. 
+
+> `update_role()` alters an existing role in the recipe or assigns an initial role 
+> to variables that do not yet have a declared role.
 > 
->> `add_role()` adds an additional role to variables that already have a role in 
->> the recipe, without overwriting old roles.
+> `add_role()` adds an additional role to variables that already have a role in 
+> the recipe, without overwriting old roles.
 > 
->> `remove_role()` eliminates a single existing role in the recipe.
+> `remove_role()` eliminates a single existing role in the recipe.
+
+#### End aside
+
+---
 
 Notice in the following preprocessing steps, we used `add_role()` on 
 `geo_value_factor` since currently the default role for it is `raw` and 
@@ -168,7 +166,7 @@ modeling and getting the prediction for the 7-day ahead death counts of the
 latest available date in the dataset. 
 
 ```{r}
-latest <- get_test_data(recipe = r, x = case_death_counts_subset)
+latest <- get_test_data(r, case_death_counts_subset)
 
 wf <- epi_workflow(r, parsnip::poisson_reg()) %>% 
       fit(case_death_counts_subset)

From b844be7a40e9b7280cd970702238a3da4ae80f8e Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <daniel@stat.ubc.ca>
Date: Tue, 13 Sep 2022 17:15:25 -0700
Subject: [PATCH 27/29] revise vignette, improve population scaling, add census
 data

---
 DESCRIPTION                              |   2 +-
 R/data.R                                 |  23 ++
 R/layer_population_scaling.R             |  77 +++--
 data-raw/state_census.R                  |  11 +
 data/state_census.rda                    | Bin 0 -> 1274 bytes
 man/create_layer.Rd                      |   4 +-
 man/state_census.Rd                      |  33 ++
 tests/testthat/test-population_scaling.R |  16 +
 vignettes/preprocessing-and-models.Rmd   | 374 ++++++++++++-----------
 9 files changed, 329 insertions(+), 211 deletions(-)
 create mode 100644 data-raw/state_census.R
 create mode 100644 data/state_census.rda
 create mode 100644 man/state_census.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 4ff288b44..9056c3f9a 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -65,5 +65,5 @@ Remotes:
 Config/testthat/edition: 3
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.0
+RoxygenNote: 7.2.1
 LazyData: true
diff --git a/R/data.R b/R/data.R
index 9c93506a3..622d1c7d7 100644
--- a/R/data.R
+++ b/R/data.R
@@ -33,3 +33,26 @@
 #'   June 7 is the average of the underlying data for June 1 through 7,
 #'   inclusive.
 "case_death_rate_subset"
+
+#' State population data
+#'
+#' Data set on state populations, from the 2019 US Census.
+#'
+#' @format Data frame with 57 rows (including one for the United States as a
+#'   whole, plus the District of Columbia, Puerto Rico Commonwealth,
+#'   American Samoa, Guam, the U.S. Virgin Islands, and the Northern Mariana,
+#'   Islands).
+#'
+#' \describe{
+#'   \item{fips}{FIPS code}
+#'   \item{name}{Full name of the state or territory}
+#'   \item{pop}{Estimate of the location's resident population in
+#'      2019.}
+#'   \item{abbr}{Postal abbreviation for the location}
+#' }
+#'
+#' @source United States Census Bureau, at
+#'   \url{https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.pdf},
+#'   \url{https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-puerto-rico-municipios.html},
+#'   and \url{https://www.census.gov/data/tables/2010/dec/2010-island-areas.html}
+"state_census"
diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R
index 8e219a9e4..29a5cd14a 100644
--- a/R/layer_population_scaling.R
+++ b/R/layer_population_scaling.R
@@ -1,35 +1,43 @@
-#' Revert population scaled prediction
+#' Convert per-capita predictions to raw scale
 #'
 #' `layer_population_scaling` creates a specification of a frosting layer
-#' that will add a population scaled column in the data. For example,
-#' load a dataset that contains county population, and join to an `epi_df`
-#' that currently predicts number of new cases by county to obtain case rates.
-#' Although worth noting that there is nothing special about "population".
-#' The function can be used to scale by any variable. Population is simply the
-#' most natural and common use case.
+#' that will "undo" per-capita scaling. Typical usage would
+#' load a dataset that contains state-level population, and use it to convert
+#' predictions made from a rate-scale model to raw scale by multiplying by
+#' the population.
+#' Although, it is worth noting that there is nothing special about "population".
+#' The function can be used to scale by any variable. Population is the
+#' standard use case in the epidemiology forecasting scenario. Any value
+#' passed will *multiply* the selected variables while the `rate_rescaling`
+#' argument is a common *divisor* of the selected variables.
 #'
 #' @param frosting a `frosting` postprocessor. The layer will be added to the
-#'  sequence of operations  for this frosting.
+#'   sequence of operations for this frosting.
 #' @param ... One or more selector functions to scale variables
-#'  for this step. See [selections()] for more details.
-#' @param df a data frame that contains the population data used for scaling.
-#' @param by A character vector of variables to left join by.
+#'   for this step. See [selections()] for more details.
+#' @param df a data frame that contains the population data to be used for
+#'   inverting the existing scaling.
+#' @param by A (possibly named) character vector of variables to join by.
 #'
 #' If `NULL`, the default, the function will perform a natural join, using all
-#' variables in common across the `epi_df` and the user-provided dataset.
-#' If columns in `epi_df` and `df` have the same name (and aren't
-#' included in by), `.df` is added to the one from the user-provided data
+#' variables in common across the `epi_df` produced by the `predict()` call
+#' and the user-provided dataset.
+#' If columns in that `epi_df` and `df` have the same name (and aren't
+#' included in `by`), `.df` is added to the one from the user-provided data
 #' to disambiguate.
 #'
 #' To join by different variables on the `epi_df` and `df`, use a named vector.
-#' For example, by = c("geo_value" = "states") will match `epi_df$geo_value`
+#' For example, `by = c("geo_value" = "states")` will match `epi_df$geo_value`
 #' to `df$states`. To join by multiple variables, use a vector with length > 1.
-#' For example, by = c("geo_value" = "states", "county" = "county") will match
+#' For example, `by = c("geo_value" = "states", "county" = "county")` will match
 #' `epi_df$geo_value` to `df$states` and `epi_df$county` to `df$county`.
 #'
 #' See [dplyr::left_join()] for more details.
 #' @param df_pop_col the name of the column in the data frame `df` that
 #' contains the population data and used for scaling.
+#' @param rate_rescaling Sometimes rates are "per 100K" or "per 1M" rather than
+#'   "per person". Adjustments can be made here. For example, if the original
+#'   rate is "per 100K", then set `rate_rescaling = 1e5` to get counts back.
 #' @param create_new TRUE to create a new column and keep the original column
 #' in the `epi_df`.
 #' @param suffix a character. The suffix added to the column name if
@@ -46,8 +54,7 @@
 #'   dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%
 #'   dplyr::select(geo_value, time_value, cases)
 #'
-#' pop_data = data.frame(states = c("ca", "ny"),
-#'                       value = c(20000, 30000))
+#' pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000))
 #'
 #' r <- epi_recipe(jhu) %>%
 #'   step_population_scaling(df = pop_data,
@@ -71,12 +78,12 @@
 #'   parsnip::fit(jhu) %>%
 #'   add_frosting(f)
 #'
-#' latest <- get_test_data(recipe = r,
-#'                         x = epiprocess::jhu_csse_daily_subset %>%
-#'                           dplyr::filter(time_value > "2021-11-01",
-#'                                         geo_value %in% c("ca", "ny")) %>%
-#'                           dplyr::select(geo_value, time_value, cases))
-#'
+#' latest <- get_test_data(
+#'   recipe = r,
+#'   x = epiprocess::jhu_csse_daily_subset %>%
+#'     dplyr::filter(time_value > "2021-11-01",
+#'                   geo_value %in% c("ca", "ny")) %>%
+#'     dplyr::select(geo_value, time_value, cases))
 #'
 #' predict(wf, latest)
 layer_population_scaling <- function(frosting,
@@ -84,17 +91,26 @@ layer_population_scaling <- function(frosting,
                            df,
                            by = NULL,
                            df_pop_col,
+                           rate_rescaling = 1,
                            create_new = TRUE,
-                           suffix = "_original",
+                           suffix = "_scaled",
                            .flag = TRUE,
                            id = rand_id("population_scaling")) {
 
+  arg_is_scalar(df_pop_col, rate_rescaling, create_new, suffix, .flag, id)
+  arg_is_lgl(create_new, .flag)
+  arg_is_chr(df_pop_col, suffix, id)
+  arg_is_chr(by, allow_null = TRUE)
+  if (rate_rescaling <= 0)
+    cli_stop("`rate_rescaling` should be a positive number")
+
   add_layer(
     frosting,
     layer_population_scaling_new(
       df = df,
       by = by,
       df_pop_col = df_pop_col,
+      rate_rescaling = rate_rescaling,
       terms = dplyr::enquos(...),
       create_new = create_new,
       suffix = suffix,
@@ -105,11 +121,12 @@ layer_population_scaling <- function(frosting,
 }
 
 layer_population_scaling_new <-
-  function(df, by, df_pop_col, terms, create_new, suffix, id) {
+  function(df, by, df_pop_col, rate_rescaling, terms, create_new, suffix, id) {
   layer("population_scaling",
         df = df,
         by = by,
         df_pop_col = df_pop_col,
+        rate_rescaling = rate_rescaling,
         terms = terms,
         create_new = create_new,
         suffix = suffix,
@@ -125,8 +142,9 @@ slather.layer_population_scaling <-
     try_join <- try(dplyr::left_join(components$predictions, object$df,
                               by= object$by),
              silent = TRUE)
-    if (any(grepl("Join columns must be present in data", unlist(try_join)))){
-      stop("columns in `by` selectors of `layer_population_scaling` must be present in data and match")}
+    if (any(grepl("Join columns must be present in data", unlist(try_join)))) {
+      cli_stop(c("columns in `by` selectors of `layer_population_scaling`",
+                 "must be present in data and match"))}
 
     object$df <- object$df %>%
       dplyr::mutate(dplyr::across(where(is.character), tolower))
@@ -136,13 +154,12 @@ slather.layer_population_scaling <-
     col_names <- names(pos)
     suffix = ifelse(object$create_new, object$suffix, "")
 
-
     components$predictions <- dplyr::left_join(components$predictions,
                                                object$df,
                                                by= object$by,
                                                suffix = c("", ".df")) %>%
       dplyr::mutate(dplyr::across(dplyr::all_of(col_names),
-                                  ~.x * !!pop_col ,
+                                  ~.x * !!pop_col / object$rate_rescaling ,
                                   .names = "{.col}{suffix}")) %>%
      dplyr::select(- !!pop_col)
     components
diff --git a/data-raw/state_census.R b/data-raw/state_census.R
new file mode 100644
index 000000000..22dde5a41
--- /dev/null
+++ b/data-raw/state_census.R
@@ -0,0 +1,11 @@
+library(dplyr)
+library(tidyr)
+
+state_census <- covidcast::state_census %>%
+  select(STATE, NAME, POPESTIMATE2019, ABBR) %>%
+  rename(abbr = ABBR, name = NAME, pop = POPESTIMATE2019, fips = STATE) %>%
+  mutate(abbr = tolower(abbr)) %>%
+  as_tibble()
+
+
+usethis::use_data(state_census, overwrite = TRUE)
diff --git a/data/state_census.rda b/data/state_census.rda
new file mode 100644
index 0000000000000000000000000000000000000000..1118db0d01236a12d67aac1d43ce2f87a694d6f0
GIT binary patch
literal 1274
zcmV<W1O@v-T4*^jL0KkKS<LetEC2(_fB*mg{YwAafB*mg|N8&`-}u~PYZOeejH<+t
z1%OZ)WA)Gj$Bs%(bse;DQjHoAX`^U^CPPC;fEYswrZh&HGGsIZKmat*14AL84FgRy
zG}8?bieOU;cnLi;X{36AJsO^;s4^G{k%ZGjOw`aa00Rk;pwQ8#CMJv~CX6E{Ar$g7
z2nnW>O*I=t^i8M%qa#B=00TklVgMQd001;J0MGyc1Q{@cAQ~8hCYmx}BLXsMlS4)z
z#ALuGjRB(|Vj5vI$kRrFlO{|6Bv6Wv)if~x(?AUaKmY&$^nf$~00006O+8H-dVl}`
z4FQ*ISdcj1#rOPL@tgmOWRN0ikg9V-z7#)Oj-Y@oPgvSpD7U#Vt!3Kf=NJyfmF4R@
z=9l;lYtJ^pZsQ9N64lN#tgM}5Cl>?(3jm}fOdtzFKnMZ`fdsZNA!j`)NJv(#CNSDe
zIR;&qcrQF)={Y@1AmIFtVam!Go0t<(1w$yHfkP*KPwZj{zA!+m7<}pXVgf<6nh6M2
z(5W|Z_AT!=c9c;)y*p{8HAzt-a}~L5cYhDM|9TY3BcqB1#aqE1AhJPhWc0LXk)-$_
zPXa~ZOt7RKK};2zK<6zW)v(f)kSIYC6p4O$%;)hb08<co1!_V`i4Yna0R_>aeq@L#
zG$2ieeWZ{Xrhy>0VM+Ts*3lLcqz?l@agKqQbhLiiB*&CQAB^`IH0;9)*n`NMYN!!1
zsef3<0;Cm)hnz$L7$p#*;-y_wrZteA;+>P2Z^Hnt2d{D%-K(*VE;c(XtvjF9gy{;o
zw%cuuw%!f|8yj?0rqBw~zgr7gDW87v4_|A+LLHQc&bmPP2fh4|Swuu4RLK|q!azEC
zAUlvXt9ZaiIxAw)aySlQ-4P~vK(`^xm=R-#%**uUQ-80qkRU{Brg233Vi_t-u%^az
zY|&7RTod99DS-tjkoj_<0@&){lmixxoVIS^VB?M|VcsB7bYv<Z5+OjL)>}90n$-WF
zIY<Wz!*A+-H?<V#*vu1zzynrJFxzbaiPmr~T2K!WU}ULV1ZN#aBCn%6?-FsTOi+X|
zxIzaQgN#kQJ9dSm3_;Ne3yNULoYb|-M2oe>+FIIR$FRP|c%Z7m0_-3QKuz~wrFusq
zaD;RMIZOH)QW-!L5#p^ZMm?B}>ynqAWPBxM$!G$Fh(Q)pB!-yjnQWkvrUJBbf`%dN
z2S9_f@Es7OZFA_6HPbwlz;Bou2;*WgUzyuY0wV%SgPak~DAH(?f-FeIx~4)yAzOir
zSy>wRN^%p$MZ|U%87c`CY=rlOz?wjs=?5lI69ge-1Y|#Hqp~K|DaLnfd}0jH9f4JE
z0E#I>DJn;7T66OdfFQsZBq6@5%B?lx2-3|W5H3@eki<b6Gee52aTEb0FHnbt2`NQ(
zqI4uF6$&60Hqataq4}2OEkKZ&=g@#7cxw`atzq%AopTs21EBOb26WR1)SlrHbj4M)
zW`R_dn^olKMa;MuQX1RXZq*xz2mxd=f{^qk6%xWkv0S!;W;Vc{<#J@AjEPdT2i?9}
zG^nVhLqh!OH?nu4Q^?e?k0y^^syAVy#SX|oS`dr(%r!?Ewc<xeWV=vAxj62~mPmxc
k5aBA2OHT)NNU3QnI_9QKdI`73KqvUSk}1N3hG&@RV68PD9{>OV

literal 0
HcmV?d00001

diff --git a/man/create_layer.Rd b/man/create_layer.Rd
index 7917e8854..81f5e33b0 100644
--- a/man/create_layer.Rd
+++ b/man/create_layer.Rd
@@ -9,9 +9,9 @@ create_layer(name = NULL, open = rlang::is_interactive())
 \arguments{
 \item{name}{Either a name without extension, or \code{NULL} to create the
 paired file based on currently open file in the script editor. If
-the R file is open, \code{use_test()} will create/open the corresponding
+the \verb{R/} file is open, \code{use_test()} will create/open the corresponding
 test file; if the test file is open, \code{use_r()} will create/open the
-corresponding R file.}
+corresponding \verb{R/} file.}
 
 \item{open}{Whether to open the file for interactive editing.}
 }
diff --git a/man/state_census.Rd b/man/state_census.Rd
new file mode 100644
index 000000000..eec13eb53
--- /dev/null
+++ b/man/state_census.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{state_census}
+\alias{state_census}
+\title{State population data}
+\format{
+Data frame with 57 rows (including one for the United States as a
+whole, plus the District of Columbia, Puerto Rico Commonwealth,
+American Samoa, Guam, the U.S. Virgin Islands, and the Northern Mariana,
+Islands).
+
+\describe{
+\item{fips}{FIPS code}
+\item{name}{Full name of the state or territory}
+\item{pop}{Estimate of the location's resident population in
+2019.}
+\item{abbr}{Postal abbreviation for the location}
+}
+}
+\source{
+United States Census Bureau, at
+\url{https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.pdf},
+\url{https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-puerto-rico-municipios.html},
+and \url{https://www.census.gov/data/tables/2010/dec/2010-island-areas.html}
+}
+\usage{
+state_census
+}
+\description{
+Data set on state populations, from the 2019 US Census.
+}
+\keyword{datasets}
diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R
index 2404a7b4c..4a360cd71 100644
--- a/tests/testthat/test-population_scaling.R
+++ b/tests/testthat/test-population_scaling.R
@@ -116,6 +116,22 @@ test_that("Postprocessing workflow works and values correct", {
   expect_equal(nrow(p), 2L)
   expect_equal(ncol(p), 4L)
   expect_equal(p$.pred_original, p$.pred * c(20000, 30000))
+
+  f <- frosting() %>%
+    layer_predict() %>%
+    layer_threshold(.pred) %>%
+    layer_naomit(.pred) %>%
+    layer_population_scaling(.pred, df = pop_data, rate_rescaling = 10000,
+                             by =  c("geo_value" = "states"),
+                             df_pop_col = "value")
+  wf <- epi_workflow(r, parsnip::linear_reg()) %>%
+    fit(jhu) %>%
+    add_frosting(f)
+  expect_silent(p <- predict(wf, latest))
+  expect_equal(nrow(p), 2L)
+  expect_equal(ncol(p), 4L)
+  expect_equal(p$.pred_original, p$.pred * c(2, 3))
+
 })
 
 test_that("Postprocessing to get cases from case rate", {
diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd
index 4764ee8ac..2209cf6ad 100644
--- a/vignettes/preprocessing-and-models.Rmd
+++ b/vignettes/preprocessing-and-models.Rmd
@@ -61,20 +61,28 @@ regression, the textbook example for modeling count data, as an illustration
 for using the `epipredict` package with other existing tidymodel packages. 
 
 ```{r poisson-reg-data}
-## NOTE: built-in data? or use epiprocess built-in data and save code here
 x <- covidcast(
   data_source = "jhu-csse",
-  signals = "confirmed_incidence_num,deaths_incidence_num",
+  signals = "confirmed_incidence_num",
   time_type = "day",
   geo_type = "state",
   time_values = epirange(20210604, 20211231),
   geo_values = "ca,fl,tx,ny,nj") %>%
-  fetch_tbl() 
+  fetch_tbl() %>%
+  select(geo_value, time_value, cases = value)
+
+y <- covidcast(
+  data_source = "jhu-csse",
+  signals = "deaths_incidence_num",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20210604, 20211231),
+  geo_values = "ca,fl,tx,ny,nj") %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, deaths = value)
 
 case_death_counts_subset <- x %>%
-  pivot_wider(names_from = signal, values_from = value) %>%
-  rename(cases = confirmed_incidence_num, deaths = deaths_incidence_num) %>%
-  select(geo_value, time_value, cases, deaths) %>%
+  full_join(y, by = c("geo_value", "time_value")) %>%
   as_epi_df()
 ```
 
@@ -140,29 +148,28 @@ manipulate variable roles easily.
 
 Notice in the following preprocessing steps, we used `add_role()` on 
 `geo_value_factor` since currently the default role for it is `raw` and 
-we would like the dummified variables to automatically be of role `predictor`. 
+we would like the dummified variables to automatically be `predictor`'s. 
 
 ```{r}
 case_death_counts_subset <- case_death_counts_subset %>%
-                            mutate(geo_value_factor = as.factor(geo_value)) %>%
-                            as_epi_df()
+  mutate(geo_value_factor = as.factor(geo_value)) %>%
+  as_epi_df()
 
 epi_recipe(case_death_counts_subset)
-               
+
 r <- epi_recipe(case_death_counts_subset) %>%
-      add_role(geo_value_factor, new_role = "predictor") %>%
-      step_dummy(geo_value_factor) %>%
-      step_mutate(cases = case_when(cases < 0 ~ 0, TRUE ~ cases)) %>%  
-      ## NOTE: why are there negative numbers in here?
-      step_mutate(deaths = case_when(deaths < 0 ~ 0, TRUE ~ deaths)) %>%
-      step_epi_lag(cases, lag = c(0, 7)) %>%
-      step_epi_lag(deaths, lag = c(0, 7)) %>%
-      step_epi_ahead(deaths, ahead = 7, role = "outcome") %>%
-      step_epi_naomit()
+  add_role(geo_value_factor, new_role = "predictor") %>%
+  step_dummy(geo_value_factor) %>%
+  ## Occasionally, data reporting errors / corrections result in negative
+  ## cases / deaths
+  step_mutate(cases = pmax(cases, 0), deaths = pmax(deaths, 0)) %>%  
+  step_epi_lag(cases, deaths, lag = c(0, 7)) %>%
+  step_epi_ahead(deaths, ahead = 7, role = "outcome") %>%
+  step_epi_naomit()
 ```
 
 After specifying the preprocessing steps, we will use the `parsnip` package for
-modeling and getting the prediction for the 7-day ahead death counts of the 
+modeling and getting the prediction for death count, 7 days after the
 latest available date in the dataset. 
 
 ```{r}
@@ -174,9 +181,14 @@ wf <- epi_workflow(r, parsnip::poisson_reg()) %>%
 predict(wf, latest) %>% filter(!is.na(.pred))
 ```
 
+Note that the `time_value` corresponds to the last available date in the 
+training set, **NOT** to the target date of the forecast 
+(`r max(latest$time_value) + 7`).
+
+
 Let's take a look at the fit:
-```{r, echo =FALSE}
-wf$fit$fit
+```{r}
+extract_fit_engine(wf)
 ```
 
 Up to now, we've used the Poisson regression to model count data. Poisson 
@@ -208,9 +220,10 @@ regression, which we will illustrate in the next section.
 
 ## Linear Regression 
 
-For COVID-19, the CDC requires submission of case and death count predictions. 
-However, it is often the case that rate data is easier to model as the outcome 
-as it need not be an integer. We can use a liner regression to predict the death
+For COVID-19, the CDC required submission of case and death count predictions. 
+However, the Delphi Group preferred to train on rate data instead, because it 
+puts different locations on a similar scale. 
+We can use a liner regression to predict the death
 rates and use state population data to scale the rates to counts. We will do so 
 using `layer_population_scaling()` from the `epipredict` package. 
 
@@ -226,106 +239,111 @@ from [COVID-19 Trends and Impact Survey](https://cmu-delphi.github.io/delphi-epi
 
 The survey data shows the estimated percentage of people who wore a mask for 
 most or all of the time while in public in the past 7 days and the estimated 
-percentage of respondents who reported that all or most people they enountered 
-in public in the past 7 days maintained a distance of at least 6 feet. We will 
-convert these percentages into indicator variables as follows:
+percentage of respondents who reported that all or most people they encountered 
+in public in the past 7 days maintained a distance of at least 6 feet. 
 
-State-wise population data is obtained from `covidcast::state_census` and will
-be used in `layer_population_scaling()`. 
+State-wise population data from the 2019 U.S. Census is included in this package
+and will be used in `layer_population_scaling()`. 
 ```{r}
-## NOTE: need to put in package?
-behav_ind_mask <- covidcast(data_source = "fb-survey",
-                        signals = "smoothed_wwearing_mask_7d",
-                        time_type = "day",
-                        geo_type = "state",
-                        time_values = epirange(20210604, 20211231),
-                        geo_values = "ca,fl,tx,ny,nj"
-                        )  %>%
-            fetch_tbl() %>%
-            select(geo_value, time_value, percentage = value) %>%
-            mutate(mask_indicator = case_when(percentage < 50 ~ 0, 
-                                              TRUE ~  1)) %>%
-            select(geo_value, time_value, mask_indicator)  
-
-behav_ind_distancing <- covidcast(data_source = "fb-survey",
-                        signals = "smoothed_wothers_distanced_public",
-                        time_type = "day",
-                        geo_type = "state",
-                        time_values = epirange(20210604, 20211231),
-                        geo_values = "ca,fl,tx,ny,nj"
-                        )  %>%
-            fetch_tbl() %>%
-            select(geo_value, time_value, percentage = value) %>%
-            mutate(distancing_indicator = case_when(percentage < 20 ~ 0, 
-                                              TRUE ~  1)) %>%
-            select(geo_value, time_value, distancing_indicator) 
-
-pop_dat <- covidcast::state_census %>% 
-          select(POPESTIMATE2019, ABBR) %>%
-          mutate(ABBR = tolower(ABBR)) %>%
-          filter(ABBR %in% c("ca","fl","tx","ny","nj")) %>%
-          rename(value = POPESTIMATE2019, geo_value = ABBR)
+behav_ind_mask <- covidcast(
+  data_source = "fb-survey",
+  signals = "smoothed_wwearing_mask_7d",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20210604, 20211231),
+  geo_values = "ca,fl,tx,ny,nj")  %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, masking = value)
+
+behav_ind_distancing <- covidcast(
+  data_source = "fb-survey",
+  signals = "smoothed_wothers_distanced_public",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20210604, 20211231),
+  geo_values = "ca,fl,tx,ny,nj")  %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, distancing = value) 
+
+pop_dat <- state_census %>% select(abbr, pop)
 
 behav_ind <- behav_ind_mask %>%
   full_join(behav_ind_distancing, by = c("geo_value", "time_value")) 
+```
 
-dim(behav_ind)
+Rather than using raw mask-wearing / social-distancing metrics, for the sake
+of illustration, we'll convert both into categorical predictors.
+
+```{r, echo=FALSE, message=FALSE,fig.align='center', fig.width=6, fig.height=4}
+library(ggplot2)
+behav_ind %>%
+  pivot_longer(masking:distancing) %>%
+  ggplot(aes(value, fill = geo_value)) +
+  geom_density(alpha = 0.5) +
+  scale_fill_brewer(palette = "Set1", name = "") +
+  theme_bw() +
+  scale_x_continuous(expand = c(0,0)) +
+  scale_y_continuous(expand = expansion(c(0,.05))) +
+  facet_wrap(~name, scales = "free") +
+  theme(legend.position = "bottom")
 ```
 
-We will take a subset of death rate and case rate data from our built-in dataset 
+We will take a subset of death rate and case rate data from the built-in dataset 
 `case_death_rate_subset`.
+
 ```{r}
 jhu <- case_death_rate_subset %>%
-  dplyr::filter(time_value >= "2021-06-04", 
-                time_value <= "2021-12-31",
-  geo_value %in% c("ca","fl","tx","ny","nj"))
-
-dim(jhu)
+  dplyr::filter(
+    time_value >= "2021-06-04", 
+    time_value <= "2021-12-31",
+    geo_value %in% c("ca","fl","tx","ny","nj"))
 ```
 
 Preprocessing steps will rely on functions from the `epipredict` package as well
 as the `recipes` package:
 
 Additionally, there are also many functions in the `recipes` package that allow for 
-[function transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations),
+[scalar transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations),
 such as log transformations and data centering. In our case, we will 
 center the numerical predictors to allow for meaningful interpretation of the 
 intercept. 
 
 ```{r}
 jhu <- jhu %>%
-      mutate(geo_value_factor = as.factor(geo_value)) %>%
-      full_join(behav_ind, by = c("geo_value", "time_value")) %>%
-      as_epi_df()
+  mutate(geo_value_factor = as.factor(geo_value)) %>%
+  left_join(behav_ind, by = c("geo_value", "time_value")) %>%
+  as_epi_df()
             
 r <- epi_recipe(jhu) %>%
-      add_role(geo_value_factor, new_role = "predictor") %>%
-      step_dummy(geo_value_factor) %>%
-      step_epi_lag(case_rate, lag = c(0, 7)) %>%
-      step_epi_lag(death_rate, lag = c(0, 7)) %>%
-      step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>%
-      step_center(contains("lag"), role = "predictor") %>%
-      step_epi_naomit()
+  add_role(geo_value_factor, new_role = "predictor") %>%
+  step_dummy(geo_value_factor) %>%
+  step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>%
+  step_mutate(masking = cut_number(masking, 5), 
+              distancing = cut_number(distancing, 5)) %>%
+  step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>%
+  step_center(contains("lag"), role = "predictor") %>%
+  step_epi_naomit()
 ```
 
-As a sanity check we can see what columns exist in the preprocessed data now:
-```{r, warning = FALSE, echo = FALSE}
-glimpse(slice_sample(bake(prep(r, jhu),jhu), n = 6))
+As a sanity check we can see what the training data will look like
+```{r, warning = FALSE}
+glimpse(slice_sample(bake(prep(r, jhu), jhu), n = 6))
 ```
 
 Before directly predicting the results, we need to add postprocessing layers to
-obtain the death counts instead of death rates. 
+obtain the death counts instead of death rates. Note that the rates used so
+far are "per 100K people" rather than "per person".
 
 ```{r}
 f <- frosting() %>%
-    layer_predict() %>%
-    layer_add_target_date("2022-01-07") %>% 
-    layer_threshold(.pred, lower = 0) %>%
-    layer_residual_quantiles(probs = c(0.0275, 0.975), symmetrize = FALSE) %>%
-    layer_naomit(.pred)  %>%
-    layer_population_scaling(.pred, .pred_distn, df = pop_dat,
-                             by = c("geo_value"),
-                             df_pop_col = "value")
+  layer_predict() %>%
+  layer_add_target_date("2022-01-07") %>% 
+  layer_threshold(.pred, lower = 0) %>%
+  layer_residual_quantiles(probs = c(0.05, 0.95), symmetrize = FALSE) %>%
+  layer_naomit(.pred) %>%
+  layer_population_scaling(
+    .pred, .pred_distn, df = pop_dat, rate_rescaling = 1e5,
+    by = c("geo_value" = "abbr"), df_pop_col = "pop")
 
 wf <- epi_workflow(r, parsnip::linear_reg()) %>%
     fit(jhu) %>%
@@ -336,36 +354,35 @@ p <- predict(wf, latest)
 p
 ```
 
+The columns marked `*_scaled` have been rescaled to the correct units, in this
+case `deaths` rather than deaths per 100K people (these remain in `.pred`).
+
 To look at the prediction intervals:
 ```{r}
-# method 1
-nested_quantiles(p$.pred_distn_original)
-# method 2
-dstn = p$.pred_distn_original
-q = distributional::parameters(dstn)$q # a list of length nrow(p)
-q
-tau = distributional::parameters(dstn)$tau
-tau
+p %>%
+  select(geo_value, target_date, .pred_scaled, .pred_distn_scaled) %>%
+  mutate(.pred_distn_scaled = nested_quantiles(.pred_distn_scaled)) %>%
+  unnest(.pred_distn_scaled) %>%
+  pivot_wider(names_from = tau, values_from = q)
 ```
 
-The point estimate of death counts on Jan 7, 2022 is returned in column 
-`.pred_original`. The predictions based on residual quantiles is stored in
-`.pred_distn_original` and also scaled to count data. 
 
 Last but not least, let's take a look at the regression fit and check the 
 coefficients:
 ```{r, echo =FALSE}
-wf$fit$fit
+summary(extract_fit_engine(wf))
 ```
 
 ## Classification
 
-Sometimes predictive model for future trends are of interest. In other words, 
-the target is to predict if the future will have increasing case rates(up),
-decreasing case rates (down), or flat case rates (flat). Such model are often 
+Sometimes it is preferable to create a predictive model for surges or upswings
+rather than for raw values. In this case,
+the target is to predict if the future will have increased case rates(up),
+decreased case rates (down), or flat case rates (flat) relative to the current
+level. Such model are often 
 referred to as hotspot prediction models. We will refer to the model notations 
-from McDonald et al. but extend the application to predict three categories 
-instead of two. 
+from [McDonald, Bien, Green, Hu, et al.](#references) but extend the application
+to predict three categories instead of two. 
 
 Hotspot prediction predicts a categorical variable defined in terms of the 
 relative change of $Y_{\ell, t+a}$ compared to $Y_{\ell, t}$. 
@@ -384,7 +401,7 @@ $$
 where $Y^{\Delta}_{\ell, t} = (Y_{\ell, t}- Y_{\ell, t-7})/(Y_{\ell, t-7})$. 
 We say location $\ell$ is a hotspot at time $t$ when $Z_{\ell,t}$ is categorized
 as 'up', meaning the number of newly reported cases over the past 7 days has 
-increased by at least 25% compared to the preceding week. w\When $Z_{\ell,t}$ 
+increased by at least 25% compared to the preceding week. When $Z_{\ell,t}$ 
 is categorized as 'down', it suggests that there has been at least a 25% 
 decrease in newly reported cases over the past 7 days. Otherwise, we will 
 consider the trends to be flat. 
@@ -394,22 +411,22 @@ The expression of the multinomial regression we will model is as follows:
 $$
 \pi_{j}(x) = \text{Pr}(Y = j|x) = \frac{e^{g_j(x)}}{\sum_{k=0}^2 g_j(x) }
 $$
-where $j$ is either down(Y=0), flat(Y=1), or up (Y=2),
+where $j$ is either down (`Y=0`), flat (`Y=1`), or up (`Y=2`),
 $$g_0(x) = 0$$
 
 \begin{aligned}
-g_1(x)= \text{ln}(\frac{Pr(Y=1|x)}{Pr(Y=0|x)}) = 
+g_1(x)= \text{ln}\left(\frac{Pr(Y=1|x)}{Pr(Y=0|x)}\right) &= 
 \beta_{10} + \beta_{11}* x_{\text{time_value}} + \delta_{10} s_{\text{state_1}} +
 \delta_{11} s_{\text{state_2}} + ... \nonumber \\
-+ \beta_{12} Y^{\Delta}_{\ell, t} +
+&\quad + \beta_{12} Y^{\Delta}_{\ell, t} +
 \beta_{13} Y^{\Delta}_{\ell, t-7}
 \end{aligned}
 
 \begin{aligned}
-g_2(x)= \text{ln}(\frac{Pr(Y=2|x)}{Pr(Y=0|x)}) = 
+g_2(x)= \text{ln}\left(\frac{Pr(Y=2|x)}{Pr(Y=0|x)}\right) &= 
 \beta_{20} + \beta_{21}* x_{\text{time_value}} + \delta_{20} s_{\text{state_1}} +
 \delta_{21} s_{\text{state_2}} + ... \nonumber \\
-+ \beta_{22} Y^{\Delta}_{\ell, t} +
+&\quad + \beta_{22} Y^{\Delta}_{\ell, t} +
 \beta_{23} Y^{\Delta}_{\ell, t-7}
 \end{aligned}
 
@@ -429,26 +446,29 @@ jhu <- case_death_rate_subset %>%
   as_epi_df()
 
 r <- epi_recipe(jhu) %>%
-      add_role(time_value, new_role = "predictor") %>%
-      step_dummy(geo_value_factor) %>%
-      step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
-      step_epi_ahead(case_rate, ahead = 7, role = "predictor") %>%
-      step_mutate(pct_diff1 =  case_when(lag_7_case_rate == 0 ~ 0, 
-      TRUE ~ (lag_0_case_rate - lag_7_case_rate)/lag_7_case_rate) ) %>%
-      step_mutate(pct_diff2 = case_when(lag_14_case_rate == 0 ~ 0,
-      TRUE ~ (lag_7_case_rate - lag_14_case_rate)/lag_14_case_rate)) %>%
-      step_mutate(trend = case_when(pct_diff1 < -0.25 ~ "down",
-                                    pct_diff1 > 0.25 ~ "up",
-                                    TRUE ~ "flat"), 
-                                    role = "outcome") %>% 
-      step_rm(death_rate,
-              case_rate,
-              lag_0_case_rate, 
-              lag_7_case_rate, 
-              lag_14_case_rate, 
-              ahead_7_case_rate) %>%
-      step_epi_naomit()
-                  
+  add_role(time_value, new_role = "predictor") %>%
+  step_dummy(geo_value_factor) %>%
+  step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
+  step_epi_ahead(case_rate, ahead = 7, role = "predictor") %>%
+  step_mutate(
+    pct_diff_ahead = case_when(
+      lag_7_case_rate == 0 ~ 0,
+      TRUE ~ (ahead_7_case_rate - lag_0_case_rate) / lag_0_case_rate),
+    pct_diff_wk1 = case_when(
+      lag_7_case_rate == 0 ~ 0, 
+      TRUE ~ (lag_0_case_rate - lag_7_case_rate) / lag_7_case_rate),
+    pct_diff_wk2 = case_when(
+      lag_14_case_rate == 0 ~ 0,
+      TRUE ~ (lag_7_case_rate - lag_14_case_rate) / lag_14_case_rate)) %>%
+  step_mutate(
+    response = case_when(
+      pct_diff_ahead < -0.20 ~ "down",
+      pct_diff_ahead > 0.25 ~ "up",
+      TRUE ~ "flat"), 
+    role = "outcome") %>% 
+  step_rm(death_rate, case_rate, lag_0_case_rate,  lag_7_case_rate, 
+          lag_14_case_rate, ahead_7_case_rate, pct_diff_ahead) %>%
+  step_epi_naomit()
 ```
 
 We will fit the multinomial regression and take a look at the predictions:
@@ -463,22 +483,21 @@ predict(wf, latest) %>% filter(!is.na(.pred_class))
 
 We can also take a look at the fit:
 ```{r}
-wf$fit$fit
+extract_fit_engine(wf)
 ```
 
-One could also use formula in `epi_recipe()` to achieve the same results as 
+One could also use a formula in `epi_recipe()` to achieve the same results as 
 above. However, only one of `add_formula()`, `add_recipe()`, or 
-`workflow_variables()` must be specified. For the purpose of using `add_formula`
-over `add_recipes`, we will prep and bake our recipe to return a dataset that 
-will be used for model fitting.
+`workflow_variables()` must be specified. For the purpose of demonstrating 
+`add_formula` rather than `add_recipe`, we will `prep` and `bake` our recipe to
+return a `data.frame` that could be used for model fitting.
 ```{r}
-b <- bake(prep(r,jhu),jhu)
-m_reg <-  parsnip::multinom_reg()
+b <- bake(prep(r, jhu), jhu)
 
 epi_workflow() %>%
-add_formula(trend ~ time_value + geo_value + pct_diff1 + pct_diff2) %>%
-add_model(m_reg) %>%
-fit(data = b)
+  add_formula(response ~ geo_value + pct_diff_wk1 + pct_diff_wk2) %>%
+  add_model(parsnip::multinom_reg()) %>%
+  fit(data = b)
 ```
 
 ## Benefits of Lagging and Leading in `epipredict`
@@ -491,45 +510,45 @@ Let's start with a simple dataset and preprocessing:
 ex <- case_death_rate_subset %>%
   dplyr::filter(time_value >= "2021-12-01", 
                 time_value <= "2021-12-31",
-  geo_value == "ca")
+                geo_value == "ca")
 
 dim(ex)
 ```
 
-We want to predict death rates on 2022-01-07, which is 7 days ahead from the last 
-available date in our dataset. 
+We want to predict death rates on `2022-01-07`, which is 7 days ahead from the 
+latest available date in our dataset. 
 
 We will compare two methods of trying to create lags and leads:
 ```{r}
-r1 <- epi_recipe(ex) %>%
-      step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
-      step_epi_lag(death_rate, lag = c(0, 7, 14)) %>%
-      step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>%
-      step_epi_naomit() %>%
-      prep()
-
-b1 <- bake(r1, ex)
-dim(b1)
-
-
-r2 <- epi_recipe(ex) %>%
-    step_mutate(lag0case_rate = lag(case_rate, 0),
-                lag7case_rate = lag(case_rate, 7),
-                lag14case_rate = lag(case_rate, 14),
-                lag0death_rate = lag(death_rate, 0),
-                lag7death_rate = lag(death_rate, 7),
-                lag14death_rate = lag(death_rate, 14),
-                ahead7death_rate = lead(death_rate, 7)) %>%
-    step_epi_naomit() %>%
-                prep()
-
-b2 <- bake(r2, ex)
-dim(b2)
+p1 <- epi_recipe(ex) %>%
+  step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
+  step_epi_lag(death_rate, lag = c(0, 7, 14)) %>%
+  step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>%
+  step_epi_naomit() %>%
+  prep()
+
+b1 <- bake(p1, ex)
+b1
+
+
+p2 <- epi_recipe(ex) %>%
+  step_mutate(lag0case_rate = lag(case_rate, 0),
+              lag7case_rate = lag(case_rate, 7),
+              lag14case_rate = lag(case_rate, 14),
+              lag0death_rate = lag(death_rate, 0),
+              lag7death_rate = lag(death_rate, 7),
+              lag14death_rate = lag(death_rate, 14),
+              ahead7death_rate = lead(death_rate, 7)) %>%
+  step_epi_naomit() %>%
+  prep()
+
+b2 <- bake(p2, ex)
+b2
 ```
 
 Notice the difference in number of rows `b1` and `b2` returns. This is because 
-the recipe that doesn't use `step_epi_ahead` and `step_epi_lag` is missing dates
-compared to the one that used the `epipredict` functions.
+the second version, the one that doesn't use `step_epi_ahead` and `step_epi_lag`,
+has omitted dates compared to the one that used the `epipredict` functions.
 ```{r}
 dates_used_in_training1 <-
  b1 %>% select(- ahead_7_death_rate) %>% na.omit() %>% select(time_value)
@@ -548,11 +567,10 @@ instead of 7 days ahead from `r max(dates_used_in_training1$time_value)`.
 
 ## References
 
-::: {#refs}
-McDonald, Daniel J., et al. "Can auxiliary indicators improve COVID-19 
+McDonald, Bien, Green, Hu, et al. "Can auxiliary indicators improve COVID-19 
 forecasting and hotspot prediction?." Proceedings of the National Academy of 
-Sciences 118.51 (2021): e2111453118.
-:::
+Sciences 118.51 (2021): e2111453118. [doi:10.1073/pnas.2111453118](
+https://doi.org/10.1073/pnas.2111453118)
 
 ## Attribution
 

From ed778398c308950b14a9aae6b9072299a648345a Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <daniel@stat.ubc.ca>
Date: Tue, 13 Sep 2022 17:15:46 -0700
Subject: [PATCH 28/29] re document

---
 man/layer_population_scaling.Rd | 57 +++++++++++++++++++--------------
 1 file changed, 33 insertions(+), 24 deletions(-)

diff --git a/man/layer_population_scaling.Rd b/man/layer_population_scaling.Rd
index 9d59e3f08..8ac350870 100644
--- a/man/layer_population_scaling.Rd
+++ b/man/layer_population_scaling.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/layer_population_scaling.R
 \name{layer_population_scaling}
 \alias{layer_population_scaling}
-\title{Revert population scaled prediction}
+\title{Convert per-capita predictions to raw scale}
 \usage{
 layer_population_scaling(
   frosting,
@@ -10,33 +10,36 @@ layer_population_scaling(
   df,
   by = NULL,
   df_pop_col,
+  rate_rescaling = 1,
   create_new = TRUE,
-  suffix = "_original",
+  suffix = "_scaled",
   .flag = TRUE,
   id = rand_id("population_scaling")
 )
 }
 \arguments{
 \item{frosting}{a \code{frosting} postprocessor. The layer will be added to the
-sequence of operations  for this frosting.}
+sequence of operations for this frosting.}
 
 \item{...}{One or more selector functions to scale variables
 for this step. See \code{\link[=selections]{selections()}} for more details.}
 
-\item{df}{a data frame that contains the population data used for scaling.}
+\item{df}{a data frame that contains the population data to be used for
+inverting the existing scaling.}
 
-\item{by}{A character vector of variables to left join by.
+\item{by}{A (possibly named) character vector of variables to join by.
 
 If \code{NULL}, the default, the function will perform a natural join, using all
-variables in common across the \code{epi_df} and the user-provided dataset.
-If columns in \code{epi_df} and \code{df} have the same name (and aren't
-included in by), \code{.df} is added to the one from the user-provided data
+variables in common across the \code{epi_df} produced by the \code{predict()} call
+and the user-provided dataset.
+If columns in that \code{epi_df} and \code{df} have the same name (and aren't
+included in \code{by}), \code{.df} is added to the one from the user-provided data
 to disambiguate.
 
 To join by different variables on the \code{epi_df} and \code{df}, use a named vector.
-For example, by = c("geo_value" = "states") will match \code{epi_df$geo_value}
+For example, \code{by = c("geo_value" = "states")} will match \code{epi_df$geo_value}
 to \code{df$states}. To join by multiple variables, use a vector with length > 1.
-For example, by = c("geo_value" = "states", "county" = "county") will match
+For example, \code{by = c("geo_value" = "states", "county" = "county")} will match
 \code{epi_df$geo_value} to \code{df$states} and \code{epi_df$county} to \code{df$county}.
 
 See \code{\link[dplyr:mutate-joins]{dplyr::left_join()}} for more details.}
@@ -44,6 +47,10 @@ See \code{\link[dplyr:mutate-joins]{dplyr::left_join()}} for more details.}
 \item{df_pop_col}{the name of the column in the data frame \code{df} that
 contains the population data and used for scaling.}
 
+\item{rate_rescaling}{Sometimes rates are "per 100K" or "per 1M" rather than
+"per person". Adjustments can be made here. For example, if the original
+rate is "per 100K", then set \code{rate_rescaling = 1e5} to get counts back.}
+
 \item{create_new}{TRUE to create a new column and keep the original column
 in the \code{epi_df}.}
 
@@ -60,12 +67,15 @@ an updated \code{frosting} postprocessor
 }
 \description{
 \code{layer_population_scaling} creates a specification of a frosting layer
-that will add a population scaled column in the data. For example,
-load a dataset that contains county population, and join to an \code{epi_df}
-that currently predicts number of new cases by county to obtain case rates.
-Although worth noting that there is nothing special about "population".
-The function can be used to scale by any variable. Population is simply the
-most natural and common use case.
+that will "undo" per-capita scaling. Typical usage would
+load a dataset that contains state-level population, and use it to convert
+predictions made from a rate-scale model to raw scale by multiplying by
+the population.
+Although, it is worth noting that there is nothing special about "population".
+The function can be used to scale by any variable. Population is the
+standard use case in the epidemiology forecasting scenario. Any value
+passed will \emph{multiply} the selected variables while the \code{rate_rescaling}
+argument is a common \emph{divisor} of the selected variables.
 }
 \examples{
 library(epiprocess)
@@ -73,8 +83,7 @@ jhu <- epiprocess::jhu_csse_daily_subset \%>\%
   dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\%
   dplyr::select(geo_value, time_value, cases)
 
-pop_data = data.frame(states = c("ca", "ny"),
-                      value = c(20000, 30000))
+pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000))
 
 r <- epi_recipe(jhu) \%>\%
   step_population_scaling(df = pop_data,
@@ -98,12 +107,12 @@ wf <- epi_workflow(r,
   parsnip::fit(jhu) \%>\%
   add_frosting(f)
 
-latest <- get_test_data(recipe = r,
-                        x = epiprocess::jhu_csse_daily_subset \%>\%
-                          dplyr::filter(time_value > "2021-11-01",
-                                        geo_value \%in\% c("ca", "ny")) \%>\%
-                          dplyr::select(geo_value, time_value, cases))
-
+latest <- get_test_data(
+  recipe = r,
+  x = epiprocess::jhu_csse_daily_subset \%>\%
+    dplyr::filter(time_value > "2021-11-01",
+                  geo_value \%in\% c("ca", "ny")) \%>\%
+    dplyr::select(geo_value, time_value, cases))
 
 predict(wf, latest)
 }

From 7be145c7641e769f075b9898c13bf788b1c615d1 Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <daniel@stat.ubc.ca>
Date: Tue, 13 Sep 2022 17:35:13 -0700
Subject: [PATCH 29/29] get tests to pass

---
 R/layer_population_scaling.R             |  2 +-
 R/utils_arg.R                            |  2 +-
 tests/testthat/test-population_scaling.R | 22 ++++++++++------------
 vignettes/preprocessing-and-models.Rmd   |  1 -
 4 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R
index 29a5cd14a..b5bc57ee3 100644
--- a/R/layer_population_scaling.R
+++ b/R/layer_population_scaling.R
@@ -143,7 +143,7 @@ slather.layer_population_scaling <-
                               by= object$by),
              silent = TRUE)
     if (any(grepl("Join columns must be present in data", unlist(try_join)))) {
-      cli_stop(c("columns in `by` selectors of `layer_population_scaling`",
+      cli_stop(c("columns in `by` selectors of `layer_population_scaling` ",
                  "must be present in data and match"))}
 
     object$df <- object$df %>%
diff --git a/R/utils_arg.R b/R/utils_arg.R
index ebac387d5..6f762382a 100644
--- a/R/utils_arg.R
+++ b/R/utils_arg.R
@@ -108,7 +108,7 @@ arg_is_chr = function(..., allow_null = FALSE, allow_na = FALSE, allow_empty = F
   handle_arg_list(
     ...,
     tests = function(name, value) {
-      if (!(is.character(value) | (is.null(value) & !allow_null)))
+      if (!(is.character(value) | (is.null(value) & allow_null)))
         cli_stop("Argument {.val {name}} must be of character type.")
 
       if (any(is.na(value)) & !allow_na)
diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R
index 4a360cd71..5b805b33b 100644
--- a/tests/testthat/test-population_scaling.R
+++ b/tests/testthat/test-population_scaling.R
@@ -115,7 +115,7 @@ test_that("Postprocessing workflow works and values correct", {
   expect_silent(p <- predict(wf, latest))
   expect_equal(nrow(p), 2L)
   expect_equal(ncol(p), 4L)
-  expect_equal(p$.pred_original, p$.pred * c(20000, 30000))
+  expect_equal(p$.pred_scaled, p$.pred * c(20000, 30000))
 
   f <- frosting() %>%
     layer_predict() %>%
@@ -130,7 +130,7 @@ test_that("Postprocessing workflow works and values correct", {
   expect_silent(p <- predict(wf, latest))
   expect_equal(nrow(p), 2L)
   expect_equal(ncol(p), 4L)
-  expect_equal(p$.pred_original, p$.pred * c(2, 3))
+  expect_equal(p$.pred_scaled, p$.pred * c(2, 3))
 
 })
 
@@ -160,8 +160,7 @@ test_that("Postprocessing to get cases from case rate", {
                              by =  c("geo_value" = "states"),
                              df_pop_col = "value")
 
-  wf <- epi_workflow(r,
-                     parsnip::linear_reg()) %>%
+  wf <- epi_workflow(r, parsnip::linear_reg()) %>%
     fit(jhu) %>%
     add_frosting(f)
 
@@ -175,7 +174,7 @@ test_that("Postprocessing to get cases from case rate", {
   expect_silent(p <- predict(wf, latest))
   expect_equal(nrow(p), 2L)
   expect_equal(ncol(p), 4L)
-  expect_equal(p$.pred_original, p$.pred * c(1/20000, 1/30000))
+  expect_equal(p$.pred_scaled, p$.pred * c(1/20000, 1/30000))
 })
 
 
@@ -255,11 +254,11 @@ test_that("expect error if `by` selector does not match", {
                              by =  NULL,
                              df_pop_col = "values")
 
-  expect_error(wf <- epi_workflow(r,
-                                    parsnip::linear_reg()) %>%
-                   fit(jhu) %>%
-                   add_frosting(f),
-               "columns in `by` selectors of `step_population_scaling` must be present in data and match")
+  expect_error(
+    wf <- epi_workflow(r, parsnip::linear_reg()) %>%
+      fit(jhu) %>%
+      add_frosting(f),
+    "columns in `by` selectors of `step_population_scaling` must be present in data and match")
 
   r <- epi_recipe(jhu) %>%
     step_population_scaling(case_rate,
@@ -287,8 +286,7 @@ test_that("expect error if `by` selector does not match", {
                                           geo_value %in% c("ca", "ny")) %>%
                             dplyr::select(geo_value, time_value, case_rate))
 
-  wf <- epi_workflow(r,
-                     parsnip::linear_reg()) %>%
+  wf <- epi_workflow(r, parsnip::linear_reg()) %>%
     fit(jhu) %>%
     add_frosting(f)
 
diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd
index 2209cf6ad..23e110b6c 100644
--- a/vignettes/preprocessing-and-models.Rmd
+++ b/vignettes/preprocessing-and-models.Rmd
@@ -36,7 +36,6 @@ we will use a linear regression to predict death rates. Last but not least, we
 will create a classification model for hotspot predictions. 
 
 ```{r, warning=FALSE, message=FALSE}
-# library(covidcast)
 library(tidyr)
 library(dplyr)
 library(epidatr)