From 41fb8de9dce4ac2b78ccb37e5be52e2b4b0bf125 Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Wed, 27 Jul 2022 22:19:12 -0700 Subject: [PATCH 01/29] update some work --- vignettes/more-recipes.Rmd | 116 +++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 vignettes/more-recipes.Rmd diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd new file mode 100644 index 000000000..4d2a9a9d9 --- /dev/null +++ b/vignettes/more-recipes.Rmd @@ -0,0 +1,116 @@ +--- +title: Apply other steps and models from recipes and parsnip packages +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Apply other steps and models from recipes and parsnip packages} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- +The `epipredict` package utilizes the `tidymodels` framework, namely +[`recipes`](https://recipes.tidymodels.org/) for +[dplyr](https://dplyr.tidyverse.org/)-like pipeable sequences +of feature engineering and [`parsnip`](https://parsnip.tidymodels.org/) for a +unified interface to a range of models. + +`epipredict` has additional customized feature engineering and preprocessing steps +tailored to fit the needs of epidemiology forecasting, such as `step_epi_shift()`, +`step_population_scaling()`, `step_epi_naomit()`. They can be used along with +steps from the `recipes` package for more feature engineering. + +In this vignette, we will demonstrate how to weave `recipes` steps and `parsnip` +models into the `epipredict` workflow. + +```{r, warning=FALSE, message =FALSE} +library(epipredict) +library(epiprocess) +library(recipes) +library(parsnip) +library(workflows) +``` + +We will use the built-in dataset `case_death_rate_subset` and apply feature +engineering steps and fit a few models from `parsnip` to predict future death +rates by states. +```{r} +x <- case_death_rate_subset +head(x) +``` +There are `r dim(x)[1]` rows and `r dim(x)[2]` columns in the data, with dates +ranging from `r min(x$time_value)` to `r max(x$time_value)` in +`r n_distinct(x$geo_value)` states in the U.S. + +# Preprocessing and Feature Engineering + +In this section, we focus on showing how `step_*()` functions from the `recipes` +package can also be applied in our `epipredict` workflows. + +First, create a recipe object from the original data and then specify the +preprocessing steps. Steps are then sequentially added. + +```{r create-recipe-object} +r <- epi_recipe(x) +r +``` + +The function automatically categorizes roles into `geo_value`, `time_value` and +`raw` based on variable names, where `raw` includes `case_rate` and `death_rate`. + +We first create a made-up population dataset by states to allow the conversion +from `case_rate` and `death_rate` into `cases` and `deaths`. The binary variable +`social_distancing` indicates whether the state followed social distancing rules. +A 'throw-away' column with all ones is also created to show that `step_rm()` +can be used to remove unwanted columns. +```{r madeup-population-data} +pop_data <- data.frame(states = unique(x$geo_value), + pop = seq(1e5, 1e7, length.out = n_distinct(x$geo_value)), + social_distancing = rbinom(n = n_distinct(x$geo_value), + size = 1, + prob = 0.5), + throw_away = 1) + +head(pop_data) +``` + +Next, we will feature engineer `case_rate` and `death_rate` into `cases` and +`deaths`and create lags and leads based on them. + +```{r create-new-columns-in-recipe} +r <- r %>% + step_population_scaling(case_rate, death_rate, df = pop_data, + by = c("geo_value" = "states"), + create_new = FALSE, + df_pop_col = "pop") %>% + step_rename(cases = case_rate, deaths = death_rate) %>% + step_rm(throw_away) %>% + step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>% + step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>% + step_epi_ahead(deaths, ahead = 7, role = "outcome") %>% + step_epi_naomit() +``` + +Notice that `step_rename()` was used for renaming of the variables, and `step_rm()` +was used to remove redundant columns. + +A quick view of what the prepossessed data looks like on training data: + +```{r peak-data-1} +slice_sample(bake(prep(r, x),x), n = 6) +``` + + + + + + + + + + + + + + + + + + From 9c6eaebcb3f7cef4356290167ef9b6d9dd93ca2e Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Thu, 28 Jul 2022 08:54:06 -0700 Subject: [PATCH 02/29] switch to inverse population --- vignettes/more-recipes.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd index 4d2a9a9d9..30f48602e 100644 --- a/vignettes/more-recipes.Rmd +++ b/vignettes/more-recipes.Rmd @@ -62,7 +62,7 @@ A 'throw-away' column with all ones is also created to show that `step_rm()` can be used to remove unwanted columns. ```{r madeup-population-data} pop_data <- data.frame(states = unique(x$geo_value), - pop = seq(1e5, 1e7, length.out = n_distinct(x$geo_value)), + inv_pop = 1/seq(1e5, 1e7, length.out = n_distinct(x$geo_value)), social_distancing = rbinom(n = n_distinct(x$geo_value), size = 1, prob = 0.5), @@ -79,7 +79,7 @@ r <- r %>% step_population_scaling(case_rate, death_rate, df = pop_data, by = c("geo_value" = "states"), create_new = FALSE, - df_pop_col = "pop") %>% + df_pop_col = "inv_pop") %>% step_rename(cases = case_rate, deaths = death_rate) %>% step_rm(throw_away) %>% step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>% From 9d7c70d242101d4f7aa2ee1880d94331b17c340f Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Sun, 31 Jul 2022 14:41:18 -0700 Subject: [PATCH 03/29] error: prediction is empty --- man/epi_workflow.Rd | 2 +- vignettes/more-recipes.Rmd | 60 +++++++++++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/man/epi_workflow.Rd b/man/epi_workflow.Rd index bcf0e78aa..f9d753d84 100644 --- a/man/epi_workflow.Rd +++ b/man/epi_workflow.Rd @@ -11,7 +11,7 @@ epi_workflow(preprocessor = NULL, spec = NULL, postprocessor = NULL) \itemize{ \item A formula, passed on to \code{\link[workflows:add_formula]{add_formula()}}. \item A recipe, passed on to \code{\link[workflows:add_recipe]{add_recipe()}}. -\item A \code{\link[workflows:add_variables]{workflow_variables()}} object, passed on to \code{\link[workflows:add_variables]{add_variables()}}. +\item A \code{\link[workflows:workflow_variables]{workflow_variables()}} object, passed on to \code{\link[workflows:add_variables]{add_variables()}}. }} \item{spec}{An optional parsnip model specification to add to the workflow. diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd index 30f48602e..5e2eeee50 100644 --- a/vignettes/more-recipes.Rmd +++ b/vignettes/more-recipes.Rmd @@ -85,7 +85,8 @@ r <- r %>% step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>% step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>% step_epi_ahead(deaths, ahead = 7, role = "outcome") %>% - step_epi_naomit() + step_epi_naomit() + ``` Notice that `step_rename()` was used for renaming of the variables, and `step_rm()` @@ -96,18 +97,75 @@ A quick view of what the prepossessed data looks like on training data: ```{r peak-data-1} slice_sample(bake(prep(r, x),x), n = 6) ``` +There are also many functions in the `recipes` package that allow for +spline transformation or other types of function transformations. + +```{r, warning=FALSE} +r <- r %>% + step_sqrt(contains("lag_")) %>% + step_rm(cases, deaths) + + +slice_sample(bake(prep(r, x),x), n = 6) +``` +Sometimes, the exact counts are desired so fitting a linear regression would be +a good idea. On the other hand, instead of predicting +7-day ahead death counts, we would like to predict if the death counts 7 days +from the latest date fall into some categories. For the purpose of illustration, +we define two categories: less than 10K(low) and more than 10K(high). +We can resort to the `recipes` package again: +```{r, warning = FALSE} +binner <- function(x) { + x <- cut(x, breaks = c(0, 10000, Inf), include.lowest = TRUE) + # now return the group number + as.numeric(x) +} +inc <- c("low", "high") +r_categorical <- r %>% + step_num2factor(ahead_7_deaths, + role = "outcome", + transform = binner, + levels = inc) +slice_sample(bake(prep(r_categorical, x),x), n = 6) +``` +# Model Fitting +Once the pre-processing steps are done, we can move on to model fitting using +the `parsnip` package. +First, we can try fitting a linear regression: + +```{r linear-fit} +wf <- epi_workflow(r, parsnip::linear_reg()) %>% + fit(x) +# the fit +wf$fit +latest <- get_test_data(recipe = r, x = x) +p <- predict(wf, latest) ## ERROR, NOTHING IS PREDICTED +p +``` + + +Logistic Regression +```{r poisson} +wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>% + fit(x) +# the fit +wf$fit +latest <- get_test_data(recipe = r_categorical, x = x) +p <- predict(wf, latest) ## ERROR, NOTHING IS PREDICTED +p +``` From 8db67b2ac248eddc456ae2b43c122e39fc661a6f Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Tue, 2 Aug 2022 10:46:14 -0700 Subject: [PATCH 04/29] just needed to add filter to remove NAs oops --- vignettes/more-recipes.Rmd | 48 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd index 5e2eeee50..7a3c52f2a 100644 --- a/vignettes/more-recipes.Rmd +++ b/vignettes/more-recipes.Rmd @@ -151,7 +151,7 @@ wf <- epi_workflow(r, parsnip::linear_reg()) %>% # the fit wf$fit latest <- get_test_data(recipe = r, x = x) -p <- predict(wf, latest) ## ERROR, NOTHING IS PREDICTED +p <- predict(wf, latest) %>% filter(!is.na(.pred)) p ``` @@ -163,12 +163,56 @@ wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>% # the fit wf$fit latest <- get_test_data(recipe = r_categorical, x = x) -p <- predict(wf, latest) ## ERROR, NOTHING IS PREDICTED +p <- predict(wf, latest) %>% filter(!is.na(.pred_class)) p ``` +## Issues from vignette +1. `step_cut()` did not work +```{r, error=TRUE} +r_test1 <- epi_recipe(x) %>% + step_population_scaling(case_rate, death_rate, df = pop_data, + by = c("geo_value" = "states"), + create_new = FALSE, + df_pop_col = "inv_pop") %>% + step_rename(cases = case_rate, deaths = death_rate) %>% + step_rm(throw_away) %>% + step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>% + step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>% + step_epi_ahead(deaths, ahead = 7, role = "outcome") %>% + step_epi_naomit() %>% + step_cut(ahead_7_deaths, role = "outcome", breaks= c(0, 10000, Inf)) +slice_sample(bake(prep(r_test1, x),x), n = 6) +``` +2. Predictions are not produced, despite different models applied to it. Perhaps +something wrong with the preprocessing steps? +Maybe without `step_population_scaling()`: +```{r} +r_test2 <- epi_recipe(x) %>% + step_epi_lag(case_rate, lag = c(0, 7, 14), role = "predictor") %>% + step_epi_lag(death_rate, lag = c(0, 7, 14), role = "predictor") %>% + step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>% + step_naomit(all_predictors()) %>% + step_naomit(all_outcomes(), skip = TRUE) + +slice_sample(bake(prep(r_test2, x),x), n = 6) + +wf <- epi_workflow(r_test2, parsnip::linear_reg()) %>% + fit(x) +# the fit +wf$fit +latest <- get_test_data(recipe = r_test2, x = x) +# latest <- x %>% +# filter(!is.na(case_rate), !is.na(death_rate)) %>% +# group_by(geo_value) %>% +# slice_tail(n = 15) %>% # have lags 0,...,14, so need 15 for a complete case +# ungroup() +p <- predict(wf, latest) %>% filter(!is.na(.pred)) +p + +``` From 4ea61b6761b55983f02853df3e90bbec86ba428b Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Tue, 2 Aug 2022 12:16:35 -0700 Subject: [PATCH 05/29] first draft of vignette done --- vignettes/more-recipes.Rmd | 174 +++++++++++++++++++++---------------- 1 file changed, 99 insertions(+), 75 deletions(-) diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd index 7a3c52f2a..c1328f464 100644 --- a/vignettes/more-recipes.Rmd +++ b/vignettes/more-recipes.Rmd @@ -20,20 +20,24 @@ steps from the `recipes` package for more feature engineering. In this vignette, we will demonstrate how to weave `recipes` steps and `parsnip` models into the `epipredict` workflow. +First, let's load the necessary packages: + ```{r, warning=FALSE, message =FALSE} library(epipredict) library(epiprocess) library(recipes) library(parsnip) library(workflows) +library(poissonreg) ``` We will use the built-in dataset `case_death_rate_subset` and apply feature engineering steps and fit a few models from `parsnip` to predict future death -rates by states. +rates by states 7 days ahead from Dec 31, 2021. + ```{r} x <- case_death_rate_subset -head(x) +tail(x) ``` There are `r dim(x)[1]` rows and `r dim(x)[2]` columns in the data, with dates ranging from `r min(x$time_value)` to `r max(x$time_value)` in @@ -62,7 +66,8 @@ A 'throw-away' column with all ones is also created to show that `step_rm()` can be used to remove unwanted columns. ```{r madeup-population-data} pop_data <- data.frame(states = unique(x$geo_value), - inv_pop = 1/seq(1e5, 1e7, length.out = n_distinct(x$geo_value)), + inv_pop = 1/seq(1e5, 1e7, + length.out = n_distinct(x$geo_value)), social_distancing = rbinom(n = n_distinct(x$geo_value), size = 1, prob = 0.5), @@ -85,36 +90,31 @@ r <- r %>% step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>% step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>% step_epi_ahead(deaths, ahead = 7, role = "outcome") %>% - step_epi_naomit() + step_epi_naomit() %>% + step_rm(cases, deaths) ``` -Notice that `step_rename()` was used for renaming of the variables, and `step_rm()` -was used to remove redundant columns. +Notice that `step_rename()` was used for renaming of the variables, and +`step_rm()` was used to remove redundant columns. -A quick view of what the prepossessed data looks like on training data: +A quick view of what columns exist in the the prepossessed data so far: ```{r peak-data-1} -slice_sample(bake(prep(r, x),x), n = 6) +names(bake(prep(r, x),x)) ``` -There are also many functions in the `recipes` package that allow for -spline transformation or other types of function transformations. - -```{r, warning=FALSE} -r <- r %>% - step_sqrt(contains("lag_")) %>% - step_rm(cases, deaths) - -slice_sample(bake(prep(r, x),x), n = 6) -``` +There are also many functions in the `recipes` package that allow for +[function transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations), +such as log transformations and spline transformations. +Sometimes, the exact counts are desired, therefore fitting a linear regression +or poisson regression would be a good idea. -Sometimes, the exact counts are desired so fitting a linear regression would be -a good idea. On the other hand, instead of predicting -7-day ahead death counts, we would like to predict if the death counts 7 days -from the latest date fall into some categories. For the purpose of illustration, -we define two categories: less than 10K(low) and more than 10K(high). +On the other hand, say instead of predicting 7-day ahead death counts, we would +like to predict if the counts are high or low based on some given threshold. +For the purpose of illustration, we define two categories: less than 10K(low) +and more than 10K(high). We can resort to the `recipes` package again: @@ -133,86 +133,110 @@ r_categorical <- r %>% transform = binner, levels = inc) -slice_sample(bake(prep(r_categorical, x),x), n = 6) ``` +As a sanity check we can see now that the preprocessed data has a categorical +outcome in `ahead_7_deaths` +```{r, warning FALSE, echo = FALSE} +glimpse(slice_sample(bake(prep(r_categorical, x),x), n = 6)) +``` - -# Model Fitting +# Model Fitting and Post-processing Once the pre-processing steps are done, we can move on to model fitting using the `parsnip` package. +### Linear Regression + First, we can try fitting a linear regression: -```{r linear-fit} +```{r linear-fit-workflow, warning = FALSE} wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(x) -# the fit -wf$fit + latest <- get_test_data(recipe = r, x = x) +``` + +We take a look at the linear regression fit and the values of the coefficients. + +```{r linear-fit, warning = FALSE} +wf$fit$fit # look at the model fit and coefficients + p <- predict(wf, latest) %>% filter(!is.na(.pred)) + p ``` +The predictions are saved in the `.pred` column and represents the squared roots +of predicted death counts 7 days from Dec 31. 2021. -Logistic Regression -```{r poisson} -wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>% - fit(x) -# the fit -wf$fit -latest <- get_test_data(recipe = r_categorical, x = x) -p <- predict(wf, latest) %>% filter(!is.na(.pred_class)) -p +Some post-processing can be done to revert back to death rates + +```{r linear-reg-post-process} +f <- frosting() %>% + layer_predict() %>% + layer_threshold(.pred) %>% + layer_naomit(.pred) %>% + layer_population_scaling(.pred, df = pop_data, + by = c("geo_value" = "states"), + df_pop_col = "inv_pop") + +wf <- epi_workflow(r, parsnip::linear_reg()) %>% + fit(x) %>% + add_frosting(f) + +predict(wf, latest) %>% select( - social_distancing, - throw_away) ``` +### Poisson regression +For count data, we can use poisson regression. Although the counts here are not +integer since they were calculated from case rates multiplying population, some +extra preprocessing steps can help us round them to integers to allow fitting a +poisson regression. -## Issues from vignette -1. `step_cut()` did not work -```{r, error=TRUE} -r_test1 <- epi_recipe(x) %>% - step_population_scaling(case_rate, death_rate, df = pop_data, - by = c("geo_value" = "states"), - create_new = FALSE, - df_pop_col = "inv_pop") %>% - step_rename(cases = case_rate, deaths = death_rate) %>% - step_rm(throw_away) %>% - step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>% - step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>% - step_epi_ahead(deaths, ahead = 7, role = "outcome") %>% - step_epi_naomit() %>% - step_cut(ahead_7_deaths, role = "outcome", breaks= c(0, 10000, Inf)) +We will also apply post-processing steps to the workflow similar to the ones above. + +```{r poisson-fit} +r <- r %>% step_mutate(ahead_7_deaths = round(ahead_7_deaths, 0), + role = "predictor") %>% + step_filter(ahead_7_deaths > 0) -slice_sample(bake(prep(r_test1, x),x), n = 6) +latest <- get_test_data(recipe = r, x = x) + +wf <- epi_workflow(r, parsnip::poisson_reg()) %>% + fit(x) %>% + add_frosting(f) + +predict(wf, latest) %>% select( - social_distancing, - throw_away) +``` + +Let's also take a look at the poisson regression fit: +```{r, echo =FALSE} +wf$fit$fit ``` -2. Predictions are not produced, despite different models applied to it. Perhaps -something wrong with the preprocessing steps? -Maybe without `step_population_scaling()`: -```{r} -r_test2 <- epi_recipe(x) %>% - step_epi_lag(case_rate, lag = c(0, 7, 14), role = "predictor") %>% - step_epi_lag(death_rate, lag = c(0, 7, 14), role = "predictor") %>% - step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>% - step_naomit(all_predictors()) %>% - step_naomit(all_outcomes(), skip = TRUE) -slice_sample(bake(prep(r_test2, x),x), n = 6) +### Logistic Regression -wf <- epi_workflow(r_test2, parsnip::linear_reg()) %>% +We use a logistic regression to predict whether 7-day ahead death counts are high +or low. Post-processing steps do not need to be applied here. +```{r logistic, warning = FALSE} +wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>% fit(x) -# the fit -wf$fit -latest <- get_test_data(recipe = r_test2, x = x) -# latest <- x %>% -# filter(!is.na(case_rate), !is.na(death_rate)) %>% -# group_by(geo_value) %>% -# slice_tail(n = 15) %>% # have lags 0,...,14, so need 15 for a complete case -# ungroup() -p <- predict(wf, latest) %>% filter(!is.na(.pred)) +latest <- get_test_data(recipe = r_categorical, x = x) +p <- predict(wf, latest) %>% filter(!is.na(.pred_class)) p +``` +The model fit is as follows: +```{r echo=FALSE} +wf$fit$fit ``` +# Attribution + +This object contains a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API.](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html) + +This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University +on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. From 4c026f58610e8513f3d59bf8b95c69a9086cd653 Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Tue, 2 Aug 2022 14:49:57 -0700 Subject: [PATCH 06/29] add poissonreg package to suggests --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 7a5cdbcf8..bbb863371 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,6 +44,7 @@ Imports: vctrs, workflows Suggests: + poissonreg, covidcast, data.table, ggplot2, From 4dcd39387faac71302b2119f0b8eb2189ba5ee45 Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Tue, 2 Aug 2022 15:39:30 -0700 Subject: [PATCH 07/29] default `by` should return column used for joining --- tests/testthat/test-population_scaling.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R index b2e32c33a..a0e79110a 100644 --- a/tests/testthat/test-population_scaling.R +++ b/tests/testthat/test-population_scaling.R @@ -177,7 +177,6 @@ test_that("test joining by default columns", { df_pop_col = "values", by = NULL, suffix = "_scaled") %>% - step_rm(geo_value.df) %>% # additional join columns added so need to remove step_epi_lag(case_rate_scaled, lag = c(7, 14)) %>% # cases step_epi_ahead(case_rate_scaled, ahead = 7, role = "outcome") %>% # cases step_naomit(all_predictors()) %>% @@ -185,7 +184,8 @@ test_that("test joining by default columns", { prep <- prep(r, jhu) - expect_silent(b <- bake(prep, jhu)) + expect_message(b <- bake(prep, jhu)) # a message of which column was joined by default + f <- frosting() %>% layer_predict() %>% @@ -207,7 +207,7 @@ test_that("test joining by default columns", { dplyr::select(geo_value, time_value, case_rate)) - expect_silent(p <- predict(wf, latest)) + expect_message(p <- predict(wf, latest)) }) From c828579acdd6b39ed8fcce97ce69c0a00a49a5c7 Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Wed, 27 Jul 2022 22:19:12 -0700 Subject: [PATCH 08/29] update some work --- vignettes/more-recipes.Rmd | 116 +++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 vignettes/more-recipes.Rmd diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd new file mode 100644 index 000000000..4d2a9a9d9 --- /dev/null +++ b/vignettes/more-recipes.Rmd @@ -0,0 +1,116 @@ +--- +title: Apply other steps and models from recipes and parsnip packages +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Apply other steps and models from recipes and parsnip packages} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- +The `epipredict` package utilizes the `tidymodels` framework, namely +[`recipes`](https://recipes.tidymodels.org/) for +[dplyr](https://dplyr.tidyverse.org/)-like pipeable sequences +of feature engineering and [`parsnip`](https://parsnip.tidymodels.org/) for a +unified interface to a range of models. + +`epipredict` has additional customized feature engineering and preprocessing steps +tailored to fit the needs of epidemiology forecasting, such as `step_epi_shift()`, +`step_population_scaling()`, `step_epi_naomit()`. They can be used along with +steps from the `recipes` package for more feature engineering. + +In this vignette, we will demonstrate how to weave `recipes` steps and `parsnip` +models into the `epipredict` workflow. + +```{r, warning=FALSE, message =FALSE} +library(epipredict) +library(epiprocess) +library(recipes) +library(parsnip) +library(workflows) +``` + +We will use the built-in dataset `case_death_rate_subset` and apply feature +engineering steps and fit a few models from `parsnip` to predict future death +rates by states. +```{r} +x <- case_death_rate_subset +head(x) +``` +There are `r dim(x)[1]` rows and `r dim(x)[2]` columns in the data, with dates +ranging from `r min(x$time_value)` to `r max(x$time_value)` in +`r n_distinct(x$geo_value)` states in the U.S. + +# Preprocessing and Feature Engineering + +In this section, we focus on showing how `step_*()` functions from the `recipes` +package can also be applied in our `epipredict` workflows. + +First, create a recipe object from the original data and then specify the +preprocessing steps. Steps are then sequentially added. + +```{r create-recipe-object} +r <- epi_recipe(x) +r +``` + +The function automatically categorizes roles into `geo_value`, `time_value` and +`raw` based on variable names, where `raw` includes `case_rate` and `death_rate`. + +We first create a made-up population dataset by states to allow the conversion +from `case_rate` and `death_rate` into `cases` and `deaths`. The binary variable +`social_distancing` indicates whether the state followed social distancing rules. +A 'throw-away' column with all ones is also created to show that `step_rm()` +can be used to remove unwanted columns. +```{r madeup-population-data} +pop_data <- data.frame(states = unique(x$geo_value), + pop = seq(1e5, 1e7, length.out = n_distinct(x$geo_value)), + social_distancing = rbinom(n = n_distinct(x$geo_value), + size = 1, + prob = 0.5), + throw_away = 1) + +head(pop_data) +``` + +Next, we will feature engineer `case_rate` and `death_rate` into `cases` and +`deaths`and create lags and leads based on them. + +```{r create-new-columns-in-recipe} +r <- r %>% + step_population_scaling(case_rate, death_rate, df = pop_data, + by = c("geo_value" = "states"), + create_new = FALSE, + df_pop_col = "pop") %>% + step_rename(cases = case_rate, deaths = death_rate) %>% + step_rm(throw_away) %>% + step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>% + step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>% + step_epi_ahead(deaths, ahead = 7, role = "outcome") %>% + step_epi_naomit() +``` + +Notice that `step_rename()` was used for renaming of the variables, and `step_rm()` +was used to remove redundant columns. + +A quick view of what the prepossessed data looks like on training data: + +```{r peak-data-1} +slice_sample(bake(prep(r, x),x), n = 6) +``` + + + + + + + + + + + + + + + + + + From 0136eb59819a5bc8543f205d2d54d2f23af6b985 Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Thu, 28 Jul 2022 08:54:06 -0700 Subject: [PATCH 09/29] switch to inverse population --- vignettes/more-recipes.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd index 4d2a9a9d9..30f48602e 100644 --- a/vignettes/more-recipes.Rmd +++ b/vignettes/more-recipes.Rmd @@ -62,7 +62,7 @@ A 'throw-away' column with all ones is also created to show that `step_rm()` can be used to remove unwanted columns. ```{r madeup-population-data} pop_data <- data.frame(states = unique(x$geo_value), - pop = seq(1e5, 1e7, length.out = n_distinct(x$geo_value)), + inv_pop = 1/seq(1e5, 1e7, length.out = n_distinct(x$geo_value)), social_distancing = rbinom(n = n_distinct(x$geo_value), size = 1, prob = 0.5), @@ -79,7 +79,7 @@ r <- r %>% step_population_scaling(case_rate, death_rate, df = pop_data, by = c("geo_value" = "states"), create_new = FALSE, - df_pop_col = "pop") %>% + df_pop_col = "inv_pop") %>% step_rename(cases = case_rate, deaths = death_rate) %>% step_rm(throw_away) %>% step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>% From 0dd9713d0f1357986f964962e9c7d6831e56d965 Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Sun, 31 Jul 2022 14:41:18 -0700 Subject: [PATCH 10/29] error: prediction is empty --- vignettes/more-recipes.Rmd | 60 +++++++++++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd index 30f48602e..5e2eeee50 100644 --- a/vignettes/more-recipes.Rmd +++ b/vignettes/more-recipes.Rmd @@ -85,7 +85,8 @@ r <- r %>% step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>% step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>% step_epi_ahead(deaths, ahead = 7, role = "outcome") %>% - step_epi_naomit() + step_epi_naomit() + ``` Notice that `step_rename()` was used for renaming of the variables, and `step_rm()` @@ -96,18 +97,75 @@ A quick view of what the prepossessed data looks like on training data: ```{r peak-data-1} slice_sample(bake(prep(r, x),x), n = 6) ``` +There are also many functions in the `recipes` package that allow for +spline transformation or other types of function transformations. + +```{r, warning=FALSE} +r <- r %>% + step_sqrt(contains("lag_")) %>% + step_rm(cases, deaths) + + +slice_sample(bake(prep(r, x),x), n = 6) +``` +Sometimes, the exact counts are desired so fitting a linear regression would be +a good idea. On the other hand, instead of predicting +7-day ahead death counts, we would like to predict if the death counts 7 days +from the latest date fall into some categories. For the purpose of illustration, +we define two categories: less than 10K(low) and more than 10K(high). +We can resort to the `recipes` package again: +```{r, warning = FALSE} +binner <- function(x) { + x <- cut(x, breaks = c(0, 10000, Inf), include.lowest = TRUE) + # now return the group number + as.numeric(x) +} +inc <- c("low", "high") +r_categorical <- r %>% + step_num2factor(ahead_7_deaths, + role = "outcome", + transform = binner, + levels = inc) +slice_sample(bake(prep(r_categorical, x),x), n = 6) +``` +# Model Fitting +Once the pre-processing steps are done, we can move on to model fitting using +the `parsnip` package. +First, we can try fitting a linear regression: + +```{r linear-fit} +wf <- epi_workflow(r, parsnip::linear_reg()) %>% + fit(x) +# the fit +wf$fit +latest <- get_test_data(recipe = r, x = x) +p <- predict(wf, latest) ## ERROR, NOTHING IS PREDICTED +p +``` + + +Logistic Regression +```{r poisson} +wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>% + fit(x) +# the fit +wf$fit +latest <- get_test_data(recipe = r_categorical, x = x) +p <- predict(wf, latest) ## ERROR, NOTHING IS PREDICTED +p +``` From 47395a25b6f0226c6312a2f249ac72a304eba661 Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Tue, 2 Aug 2022 10:46:14 -0700 Subject: [PATCH 11/29] just needed to add filter to remove NAs oops --- vignettes/more-recipes.Rmd | 48 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd index 5e2eeee50..7a3c52f2a 100644 --- a/vignettes/more-recipes.Rmd +++ b/vignettes/more-recipes.Rmd @@ -151,7 +151,7 @@ wf <- epi_workflow(r, parsnip::linear_reg()) %>% # the fit wf$fit latest <- get_test_data(recipe = r, x = x) -p <- predict(wf, latest) ## ERROR, NOTHING IS PREDICTED +p <- predict(wf, latest) %>% filter(!is.na(.pred)) p ``` @@ -163,12 +163,56 @@ wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>% # the fit wf$fit latest <- get_test_data(recipe = r_categorical, x = x) -p <- predict(wf, latest) ## ERROR, NOTHING IS PREDICTED +p <- predict(wf, latest) %>% filter(!is.na(.pred_class)) p ``` +## Issues from vignette +1. `step_cut()` did not work +```{r, error=TRUE} +r_test1 <- epi_recipe(x) %>% + step_population_scaling(case_rate, death_rate, df = pop_data, + by = c("geo_value" = "states"), + create_new = FALSE, + df_pop_col = "inv_pop") %>% + step_rename(cases = case_rate, deaths = death_rate) %>% + step_rm(throw_away) %>% + step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>% + step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>% + step_epi_ahead(deaths, ahead = 7, role = "outcome") %>% + step_epi_naomit() %>% + step_cut(ahead_7_deaths, role = "outcome", breaks= c(0, 10000, Inf)) +slice_sample(bake(prep(r_test1, x),x), n = 6) +``` +2. Predictions are not produced, despite different models applied to it. Perhaps +something wrong with the preprocessing steps? +Maybe without `step_population_scaling()`: +```{r} +r_test2 <- epi_recipe(x) %>% + step_epi_lag(case_rate, lag = c(0, 7, 14), role = "predictor") %>% + step_epi_lag(death_rate, lag = c(0, 7, 14), role = "predictor") %>% + step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>% + step_naomit(all_predictors()) %>% + step_naomit(all_outcomes(), skip = TRUE) + +slice_sample(bake(prep(r_test2, x),x), n = 6) + +wf <- epi_workflow(r_test2, parsnip::linear_reg()) %>% + fit(x) +# the fit +wf$fit +latest <- get_test_data(recipe = r_test2, x = x) +# latest <- x %>% +# filter(!is.na(case_rate), !is.na(death_rate)) %>% +# group_by(geo_value) %>% +# slice_tail(n = 15) %>% # have lags 0,...,14, so need 15 for a complete case +# ungroup() +p <- predict(wf, latest) %>% filter(!is.na(.pred)) +p + +``` From 0a835992548039fc059c806462c3fabd26050f48 Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Tue, 2 Aug 2022 12:16:35 -0700 Subject: [PATCH 12/29] first draft of vignette done --- vignettes/more-recipes.Rmd | 174 +++++++++++++++++++++---------------- 1 file changed, 99 insertions(+), 75 deletions(-) diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd index 7a3c52f2a..c1328f464 100644 --- a/vignettes/more-recipes.Rmd +++ b/vignettes/more-recipes.Rmd @@ -20,20 +20,24 @@ steps from the `recipes` package for more feature engineering. In this vignette, we will demonstrate how to weave `recipes` steps and `parsnip` models into the `epipredict` workflow. +First, let's load the necessary packages: + ```{r, warning=FALSE, message =FALSE} library(epipredict) library(epiprocess) library(recipes) library(parsnip) library(workflows) +library(poissonreg) ``` We will use the built-in dataset `case_death_rate_subset` and apply feature engineering steps and fit a few models from `parsnip` to predict future death -rates by states. +rates by states 7 days ahead from Dec 31, 2021. + ```{r} x <- case_death_rate_subset -head(x) +tail(x) ``` There are `r dim(x)[1]` rows and `r dim(x)[2]` columns in the data, with dates ranging from `r min(x$time_value)` to `r max(x$time_value)` in @@ -62,7 +66,8 @@ A 'throw-away' column with all ones is also created to show that `step_rm()` can be used to remove unwanted columns. ```{r madeup-population-data} pop_data <- data.frame(states = unique(x$geo_value), - inv_pop = 1/seq(1e5, 1e7, length.out = n_distinct(x$geo_value)), + inv_pop = 1/seq(1e5, 1e7, + length.out = n_distinct(x$geo_value)), social_distancing = rbinom(n = n_distinct(x$geo_value), size = 1, prob = 0.5), @@ -85,36 +90,31 @@ r <- r %>% step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>% step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>% step_epi_ahead(deaths, ahead = 7, role = "outcome") %>% - step_epi_naomit() + step_epi_naomit() %>% + step_rm(cases, deaths) ``` -Notice that `step_rename()` was used for renaming of the variables, and `step_rm()` -was used to remove redundant columns. +Notice that `step_rename()` was used for renaming of the variables, and +`step_rm()` was used to remove redundant columns. -A quick view of what the prepossessed data looks like on training data: +A quick view of what columns exist in the the prepossessed data so far: ```{r peak-data-1} -slice_sample(bake(prep(r, x),x), n = 6) +names(bake(prep(r, x),x)) ``` -There are also many functions in the `recipes` package that allow for -spline transformation or other types of function transformations. - -```{r, warning=FALSE} -r <- r %>% - step_sqrt(contains("lag_")) %>% - step_rm(cases, deaths) - -slice_sample(bake(prep(r, x),x), n = 6) -``` +There are also many functions in the `recipes` package that allow for +[function transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations), +such as log transformations and spline transformations. +Sometimes, the exact counts are desired, therefore fitting a linear regression +or poisson regression would be a good idea. -Sometimes, the exact counts are desired so fitting a linear regression would be -a good idea. On the other hand, instead of predicting -7-day ahead death counts, we would like to predict if the death counts 7 days -from the latest date fall into some categories. For the purpose of illustration, -we define two categories: less than 10K(low) and more than 10K(high). +On the other hand, say instead of predicting 7-day ahead death counts, we would +like to predict if the counts are high or low based on some given threshold. +For the purpose of illustration, we define two categories: less than 10K(low) +and more than 10K(high). We can resort to the `recipes` package again: @@ -133,86 +133,110 @@ r_categorical <- r %>% transform = binner, levels = inc) -slice_sample(bake(prep(r_categorical, x),x), n = 6) ``` +As a sanity check we can see now that the preprocessed data has a categorical +outcome in `ahead_7_deaths` +```{r, warning FALSE, echo = FALSE} +glimpse(slice_sample(bake(prep(r_categorical, x),x), n = 6)) +``` - -# Model Fitting +# Model Fitting and Post-processing Once the pre-processing steps are done, we can move on to model fitting using the `parsnip` package. +### Linear Regression + First, we can try fitting a linear regression: -```{r linear-fit} +```{r linear-fit-workflow, warning = FALSE} wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(x) -# the fit -wf$fit + latest <- get_test_data(recipe = r, x = x) +``` + +We take a look at the linear regression fit and the values of the coefficients. + +```{r linear-fit, warning = FALSE} +wf$fit$fit # look at the model fit and coefficients + p <- predict(wf, latest) %>% filter(!is.na(.pred)) + p ``` +The predictions are saved in the `.pred` column and represents the squared roots +of predicted death counts 7 days from Dec 31. 2021. -Logistic Regression -```{r poisson} -wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>% - fit(x) -# the fit -wf$fit -latest <- get_test_data(recipe = r_categorical, x = x) -p <- predict(wf, latest) %>% filter(!is.na(.pred_class)) -p +Some post-processing can be done to revert back to death rates + +```{r linear-reg-post-process} +f <- frosting() %>% + layer_predict() %>% + layer_threshold(.pred) %>% + layer_naomit(.pred) %>% + layer_population_scaling(.pred, df = pop_data, + by = c("geo_value" = "states"), + df_pop_col = "inv_pop") + +wf <- epi_workflow(r, parsnip::linear_reg()) %>% + fit(x) %>% + add_frosting(f) + +predict(wf, latest) %>% select( - social_distancing, - throw_away) ``` +### Poisson regression +For count data, we can use poisson regression. Although the counts here are not +integer since they were calculated from case rates multiplying population, some +extra preprocessing steps can help us round them to integers to allow fitting a +poisson regression. -## Issues from vignette -1. `step_cut()` did not work -```{r, error=TRUE} -r_test1 <- epi_recipe(x) %>% - step_population_scaling(case_rate, death_rate, df = pop_data, - by = c("geo_value" = "states"), - create_new = FALSE, - df_pop_col = "inv_pop") %>% - step_rename(cases = case_rate, deaths = death_rate) %>% - step_rm(throw_away) %>% - step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>% - step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>% - step_epi_ahead(deaths, ahead = 7, role = "outcome") %>% - step_epi_naomit() %>% - step_cut(ahead_7_deaths, role = "outcome", breaks= c(0, 10000, Inf)) +We will also apply post-processing steps to the workflow similar to the ones above. + +```{r poisson-fit} +r <- r %>% step_mutate(ahead_7_deaths = round(ahead_7_deaths, 0), + role = "predictor") %>% + step_filter(ahead_7_deaths > 0) -slice_sample(bake(prep(r_test1, x),x), n = 6) +latest <- get_test_data(recipe = r, x = x) + +wf <- epi_workflow(r, parsnip::poisson_reg()) %>% + fit(x) %>% + add_frosting(f) + +predict(wf, latest) %>% select( - social_distancing, - throw_away) +``` + +Let's also take a look at the poisson regression fit: +```{r, echo =FALSE} +wf$fit$fit ``` -2. Predictions are not produced, despite different models applied to it. Perhaps -something wrong with the preprocessing steps? -Maybe without `step_population_scaling()`: -```{r} -r_test2 <- epi_recipe(x) %>% - step_epi_lag(case_rate, lag = c(0, 7, 14), role = "predictor") %>% - step_epi_lag(death_rate, lag = c(0, 7, 14), role = "predictor") %>% - step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>% - step_naomit(all_predictors()) %>% - step_naomit(all_outcomes(), skip = TRUE) -slice_sample(bake(prep(r_test2, x),x), n = 6) +### Logistic Regression -wf <- epi_workflow(r_test2, parsnip::linear_reg()) %>% +We use a logistic regression to predict whether 7-day ahead death counts are high +or low. Post-processing steps do not need to be applied here. +```{r logistic, warning = FALSE} +wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>% fit(x) -# the fit -wf$fit -latest <- get_test_data(recipe = r_test2, x = x) -# latest <- x %>% -# filter(!is.na(case_rate), !is.na(death_rate)) %>% -# group_by(geo_value) %>% -# slice_tail(n = 15) %>% # have lags 0,...,14, so need 15 for a complete case -# ungroup() -p <- predict(wf, latest) %>% filter(!is.na(.pred)) +latest <- get_test_data(recipe = r_categorical, x = x) +p <- predict(wf, latest) %>% filter(!is.na(.pred_class)) p +``` +The model fit is as follows: +```{r echo=FALSE} +wf$fit$fit ``` +# Attribution + +This object contains a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API.](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html) + +This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University +on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. From 854090df3f2b91361e530262d48d8b8270aa140b Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Tue, 2 Aug 2022 14:49:57 -0700 Subject: [PATCH 13/29] add poissonreg package to suggests --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 25f82bfec..7673c6c14 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,6 +31,7 @@ Imports: tidyselect, workflows Suggests: + poissonreg, covidcast, data.table, ggplot2, From b7d81b0f195600d857489753f8d0ff8f6d69778b Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Sun, 14 Aug 2022 23:21:31 -0700 Subject: [PATCH 14/29] updates according to Aug14 suggestions; need to add update_role() and add_role() example --- vignettes/more-recipes.Rmd | 80 +++++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 31 deletions(-) diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd index c1328f464..475350daf 100644 --- a/vignettes/more-recipes.Rmd +++ b/vignettes/more-recipes.Rmd @@ -37,6 +37,7 @@ rates by states 7 days ahead from Dec 31, 2021. ```{r} x <- case_death_rate_subset +x$geo_value <- as.factor(x$geo_value) tail(x) ``` There are `r dim(x)[1]` rows and `r dim(x)[2]` columns in the data, with dates @@ -106,17 +107,55 @@ names(bake(prep(r, x),x)) There are also many functions in the `recipes` package that allow for [function transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations), -such as log transformations and spline transformations. +such as log transformations and spline transformations. In our case, we will +center the numerical predictors to allow for meaningful interpretation of the +intercept. -Sometimes, the exact counts are desired, therefore fitting a linear regression +```{r} +r <- r %>% step_center(contains("lag")) +``` + +Since the current outcomes are positive, but the linear regression assumes the +outcome to be in full support, we will consider a log-transformation. + +```{r} +r_lin <- r %>% step_log(ahead_7_deaths, role = "outcome") +``` + +Other useful functionalities include creating a set of binary dummy variables +from a factor variable. As an illustration that will not affect the models, we +will show how states in `geo_value` become dummy variables so that each state +as a dummy variable has a differential intercept coefficient. + +```{r dummy-variable} +length(unique(x$geo_value)) + +dummies <- r %>% + step_dummy(geo_value) %>% + prep(training = x) + +dummy_data <- bake(dummies, x) + +names(dummy_data) +``` + +Once the pre-processing steps are done, we can move on to model fitting using +the `parsnip` package. + + +# Model Fitting and Post-processing + +Sometimes, the exact values are desired, therefore fitting a linear regression or poisson regression would be a good idea. On the other hand, say instead of predicting 7-day ahead death counts, we would -like to predict if the counts are high or low based on some given threshold. +like to classify if the future predictions are high or low based on some given threshold. +In this case, a logistic regression among other classification models can be used. For the purpose of illustration, we define two categories: less than 10K(low) and more than 10K(high). -We can resort to the `recipes` package again: +We resort to the `recipes` package again to change the numerical outcomes to +categorical for the purpose of classification: ```{r, warning = FALSE} binner <- function(x) { @@ -136,25 +175,22 @@ r_categorical <- r %>% ``` As a sanity check we can see now that the preprocessed data has a categorical -outcome in `ahead_7_deaths` +outcome in `ahead_7_deaths`. + ```{r, warning FALSE, echo = FALSE} glimpse(slice_sample(bake(prep(r_categorical, x),x), n = 6)) ``` -# Model Fitting and Post-processing - -Once the pre-processing steps are done, we can move on to model fitting using -the `parsnip` package. ### Linear Regression First, we can try fitting a linear regression: ```{r linear-fit-workflow, warning = FALSE} -wf <- epi_workflow(r, parsnip::linear_reg()) %>% +wf <- epi_workflow(r_lin, parsnip::linear_reg()) %>% fit(x) -latest <- get_test_data(recipe = r, x = x) +latest <- get_test_data(recipe = r_lin, x = x) ``` We take a look at the linear regression fit and the values of the coefficients. @@ -167,26 +203,9 @@ p <- predict(wf, latest) %>% filter(!is.na(.pred)) p ``` -The predictions are saved in the `.pred` column and represents the squared roots -of predicted death counts 7 days from Dec 31. 2021. - -Some post-processing can be done to revert back to death rates - -```{r linear-reg-post-process} -f <- frosting() %>% - layer_predict() %>% - layer_threshold(.pred) %>% - layer_naomit(.pred) %>% - layer_population_scaling(.pred, df = pop_data, - by = c("geo_value" = "states"), - df_pop_col = "inv_pop") - -wf <- epi_workflow(r, parsnip::linear_reg()) %>% - fit(x) %>% - add_frosting(f) +The predictions are saved in the `.pred` column and represents the log-transformed +predicted death counts 7 days from Dec 31. 2021. -predict(wf, latest) %>% select( - social_distancing, - throw_away) -``` ### Poisson regression @@ -195,7 +214,6 @@ integer since they were calculated from case rates multiplying population, some extra preprocessing steps can help us round them to integers to allow fitting a poisson regression. -We will also apply post-processing steps to the workflow similar to the ones above. ```{r poisson-fit} r <- r %>% step_mutate(ahead_7_deaths = round(ahead_7_deaths, 0), From 10c6aecf0ce270365ae6369f81cb23dc2a9f56eb Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Mon, 15 Aug 2022 10:13:28 -0700 Subject: [PATCH 15/29] fix error; added role manipulation example --- vignettes/more-recipes.Rmd | 41 +++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd index 475350daf..dee09fd1d 100644 --- a/vignettes/more-recipes.Rmd +++ b/vignettes/more-recipes.Rmd @@ -52,9 +52,24 @@ package can also be applied in our `epipredict` workflows. First, create a recipe object from the original data and then specify the preprocessing steps. Steps are then sequentially added. +`recipes` package also allows [manual alterations of roles](https://recipes.tidymodels.org/reference/roles.html) +in bulk. There are a couple handy functions that can be used together to help us +manipulate variable roles easily. + +`update_role()` alters an existing role in the recipe or assigns an initial role +to variables that do not yet have a declared role. + +`add_role()` adds an additional role to variables that already have a role in +the recipe, without overwriting old roles. + +`remove_role()` eliminates a single existing role in the recipe. + +Here, we will add an additional `predictor` role to both `case_rate` and +`death_rate` variables. They currently have roles as `raw`. + ```{r create-recipe-object} -r <- epi_recipe(x) -r +r <- epi_recipe(x) %>% + add_role(- epi_keys(x), new_role = "predictor") ``` The function automatically categorizes roles into `geo_value`, `time_value` and @@ -88,8 +103,8 @@ r <- r %>% df_pop_col = "inv_pop") %>% step_rename(cases = case_rate, deaths = death_rate) %>% step_rm(throw_away) %>% - step_epi_lag(cases, lag = c(0, 7, 14), role = "predictor") %>% - step_epi_lag(deaths, lag = c(0, 7, 14), role = "predictor") %>% + step_epi_lag(cases, lag = c(0, 7, 14)) %>% + step_epi_lag(deaths, lag = c(0, 7, 14)) %>% step_epi_ahead(deaths, ahead = 7, role = "outcome") %>% step_epi_naomit() %>% step_rm(cases, deaths) @@ -105,6 +120,7 @@ A quick view of what columns exist in the the prepossessed data so far: names(bake(prep(r, x),x)) ``` + There are also many functions in the `recipes` package that allow for [function transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations), such as log transformations and spline transformations. In our case, we will @@ -112,14 +128,15 @@ center the numerical predictors to allow for meaningful interpretation of the intercept. ```{r} -r <- r %>% step_center(contains("lag")) +r <- r %>% step_center(contains("lag"), role = "predictor") ``` Since the current outcomes are positive, but the linear regression assumes the -outcome to be in full support, we will consider a log-transformation. +outcome to be in full support, we will consider a log-transformation with an +offset value of 1 to ensure `log(0)` does not occur. ```{r} -r_lin <- r %>% step_log(ahead_7_deaths, role = "outcome") +r_lin <- r %>% step_log(ahead_7_deaths, role = "outcome", offset = 1) ``` Other useful functionalities include creating a set of binary dummy variables @@ -216,17 +233,17 @@ poisson regression. ```{r poisson-fit} -r <- r %>% step_mutate(ahead_7_deaths = round(ahead_7_deaths, 0), +r <- r %>% + step_mutate(ahead_7_deaths = round(ahead_7_deaths, 0), role = "predictor") %>% step_filter(ahead_7_deaths > 0) latest <- get_test_data(recipe = r, x = x) wf <- epi_workflow(r, parsnip::poisson_reg()) %>% - fit(x) %>% - add_frosting(f) + fit(x) -predict(wf, latest) %>% select( - social_distancing, - throw_away) +predict(wf, latest) %>% filter(!is.na(.pred)) ``` Let's also take a look at the poisson regression fit: @@ -237,7 +254,7 @@ wf$fit$fit ### Logistic Regression We use a logistic regression to predict whether 7-day ahead death counts are high -or low. Post-processing steps do not need to be applied here. +or low. ```{r logistic, warning = FALSE} wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>% fit(x) From 00b098205ef36e669657d3fccc01917fcc361226 Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Mon, 22 Aug 2022 21:02:51 -0700 Subject: [PATCH 16/29] poisson added --- vignettes/preprocessing-and-models.Rmd | 260 +++++++++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 vignettes/preprocessing-and-models.Rmd diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd new file mode 100644 index 000000000..b9c07933e --- /dev/null +++ b/vignettes/preprocessing-and-models.Rmd @@ -0,0 +1,260 @@ +--- +title: Examples of Preprocessing and Models +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Examples of Preprocessing and Models} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +## Introduction + +The `epipredict` package utilizes the `tidymodels` framework, namely +[`recipes`](https://recipes.tidymodels.org/) for +[dplyr](https://dplyr.tidyverse.org/)-like pipeable sequences +of feature engineering and [`parsnip`](https://parsnip.tidymodels.org/) for a +unified interface to a range of models. + +`epipredict` has additional customized feature engineering and preprocessing +steps, such as `step_epi_shift()`, `step_population_scaling()`, +`step_epi_naomit()`. They can be used along with +steps from the `recipes` package for more feature engineering. + +In this vignette, we will illustrate some examples of how to use `epipredict` +with `recipes` and `parsnip` for different purposes of epidemiology forecasting. + +The remaining vignette will be split into three sections. The first section, we +will use a Poisson regression to predict death counts. In the second section, +we will use a linear regression to predict death rates. Last but not least, we +will create a classification model for hotspot predictions. + +```{r, warning=FALSE, message=FALSE} +library(tidyverse) +library(covidcast) +library(epidatr) +library(epiprocess) +library(epipredict) +library(recipes) +library(poissonreg) +``` + +## Poisson Regression + +During COVID-19, Center for Disease Control and Prevention (CDC) gathered models +and forecasts to characterize the state of an outbreak and its course. They use +it to inform public health decision makers on potential consequences of +deploying control measures. + +One of the outcomes that the CDC forecasts is [death counts from COVID-19](https://www.cdc.gov/coronavirus/2019-ncov/science/forecasting/forecasting-us.html). +Although there are many state-of-the-art models, we choose to use Poisson +regression, the textbook example for modeling count data, as an illustration +for using the `epipredict` package with other existing tidymodel packages. + +```{r poisson-reg-data} +x <- covidcast( + data_source = "jhu-csse", + signals = "confirmed_incidence_num", + time_type = "day", + geo_type = "state", + time_values = epirange(20210604, 20211231), + geo_values = "ca,fl,tx,ny,nj" +) %>% + fetch_tbl() %>% + select(geo_value, time_value, cases = value) + +y <- covidcast( + data_source = "jhu-csse", + signals = "deaths_incidence_num", + time_type = "day", + geo_type = "state", + time_values = epirange(20210604, 20211231), + geo_values = "ca,fl,tx,ny,nj" +) %>% + fetch_tbl() %>% + select(geo_value, time_value, deaths = value) + +case_death_counts_subset <- x %>% + full_join(y, by = c("geo_value", "time_value")) %>% + as_epi_df() + +``` + +The `case_death_counts_subset` dataset comes from the `covidcast` package,and +contains the number of confirmed cases and deaths from June 4, 2021 to +Dec 31, 2021 in some U.S. states. + +We wish to predict the 7-day ahead death counts with lagged cases and deaths. +Furthermore, we will let each state be a dummy variable. Using differential +intercept coefficients, we can allow for an intercept shift between states. + +The model takes the form of +\begin{aligned} +\text{log}\left( \mu_{t+7} \right) = \beta_0 + \delta_1 s_{\text{state_1}} + +\delta_2 s_{\text{state_2}} + ... + \beta_1 \text{deaths}_{t} + \nonumber \\ +\beta_2 \text{deaths}_{t-7} + \beta_3 \text{cases}_{t} + +\beta_4 \text{cases}_{t-7} +\end{aligned} + + +where $\mu_{t+7} = \mathbb{E}(y_{t+7})$, and $y_{t+7}$ is assumed to follow a +Poisson distribution with mean $\mu_{t+7}$; $s_{\text{state}}$ are dummy +variables for each state and take values of either 0 or 1. + +Preprocessing steps will be performed using the `recipes` functions to get the +data ready for modeling. + +But before diving into them, it will be helpful to understand what `roles` are +in the `recipes` framework. + +> `recipes` can assign one or more roles to each column in the data. The roles +> are not restricted to a predefined set; they can be anything. +> For most conventional situations, they are typically “predictor” and/or +> “outcome”. Additional roles enable targeted step operations on specific +> variables or groups of variables. +> +> In our case, the role `predictor` is given to explanatory variable that go to the +> left-hand side of the model. The role `outcome` is the response variable +> that we wish to predict. `geo_value` and `time_value` are predefined roles +> that are unique to the `epipredict` package. Since we work with `epi_df` +> objects, all datasets should have `geo_value` and `time_value` passed through +> automatically with these two roles assigned to the appropriate columns. +> +> `recipes` package also allows [manual alterations of roles](https://recipes.tidymodels.org/reference/roles.html) +> in bulk. There are a couple handy functions that can be used together to help us +> manipulate variable roles easily. +> +>> `update_role()` alters an existing role in the recipe or assigns an initial role +>> to variables that do not yet have a declared role. +> +>> `add_role()` adds an additional role to variables that already have a role in +>> the recipe, without overwriting old roles. +> +>> `remove_role()` eliminates a single existing role in the recipe. + +Notice in the following preprocessing steps, we used `add_role()` on +`geo_value_factor` since currently the default role for it is `raw` and +we would like the dummified variables to automatically be of role `predictor`. + +```{r} +case_death_counts_subset <- case_death_counts_subset %>% + mutate(geo_value_factor = as.factor(geo_value)) %>% + as_epi_df() + +epi_recipe(case_death_counts_subset) + +r <- epi_recipe(case_death_counts_subset) %>% + add_role(geo_value_factor, new_role = "predictor") %>% + recipes::step_dummy(geo_value_factor) %>% + step_mutate(cases = case_when(cases < 0 ~ 0, TRUE ~ cases)) %>% + ## NOTE: why are there negative numbers in here? + step_mutate(deaths = case_when(deaths < 0 ~ 0, TRUE ~ deaths)) %>% + step_epi_lag(cases, lag = c(0, 7)) %>% + step_epi_lag(deaths, lag = c(0, 7)) %>% + step_epi_ahead(deaths, ahead = 7, role = "outcome") %>% + step_epi_naomit() +``` + +After specifying the preprocessing steps, we will use the `parsnip` package for +modeling and getting the prediction for the 7-day ahead death counts of the +latest available date in the dataset. + +```{r} +latest <- get_test_data(recipe = r, x = case_death_counts_subset) + +wf <- epi_workflow(r, parsnip::poisson_reg()) %>% + fit(case_death_counts_subset) + +predict(wf, latest) %>% filter(!is.na(.pred)) +``` + +Let's take a look at the fit: +```{r, echo =FALSE} +wf$fit$fit +``` + +Up to now, we've used the Poisson regression to model count data. Poisson +regression can also be used to model rate data, such as case rates or death +rates, by incorporating offset terms in the model. + +To model death rates, the Poisson regression would be expressed as: +\begin{aligned} +\text{log}\left( \mu_{t+7} \right) = \text{log(population)} + +\beta_0 + \delta_1 s_{\text{state_1}} + +\delta_2 s_{\text{state_2}} + ... + \beta_1 \text{deaths}_{t} + \nonumber \\ +\beta_2 \text{deaths}_{t-7} + \beta_3 \text{cases}_{t} + +\beta_4 \text{cases}_{t-7} +\end{aligned} +where $\text{log(population)}$ is the log of the state population that was +used to scale the count data on the left-hand side of the equation. + +There are several ways to model rate data given count and population data. +First, in the `parsnip` framework, we can specify the formula in `fit()`. +However, by doing so we lose the ability to use the `recipes` framework to +create new variables since new variables that do not exist in the +original dataset such as lag and leads cannot be called directly in `fit()`. + +Alternatively, `step_population_scaling()` in the `epipredict` package can +perform the population scaling if we provide the population data, and we can +model this using other models such as linear regression, which we will +illustrate in the next section. + + +## Linear Regression +We wish to predict the 7-day ahead death counts with lagged cases and deaths, +along with extra behaviour indicator inputs. Namely, we will use survey data +from [COVID-19 Trends and Impact Survey](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/fb-survey.html#behavior-indicators). + +The survey data shows the estimated percentage of people who wore a mask for +most or all of the time while in public in the past 7 days and the estimated +percentage of respondents who reported that all or most people they enountered +in public in the past 7 days maintained a distance of at least 6 feet. We will +convert these percentages into indicator variables as follows: + +```{r} +behav_ind_mask <- covidcast(data_source = "fb-survey", + signals = "smoothed_wwearing_mask_7d", + time_type = "day", + geo_type = "state", + time_values = epirange(20210604, 20211231), + geo_values = "ca,fl,tx,ny,nj" + ) %>% + fetch_tbl() %>% + select(geo_value, time_value, percentage = value) %>% + mutate(mask_indicator = case_when(percentage < 50 ~ 0, + TRUE ~ 1)) %>% + select(geo_value, time_value, mask_indicator) + +behav_ind_distancing <- covidcast(data_source = "fb-survey", + signals = "smoothed_wothers_distanced_public", + time_type = "day", + geo_type = "state", + time_values = epirange(20210604, 20211231), + geo_values = "ca,fl,tx,ny,nj" + ) %>% + fetch_tbl() %>% + select(geo_value, time_value, percentage = value) %>% + mutate(distancing_indicator = case_when(percentage < 20 ~ 0, + TRUE ~ 1)) %>% + select(geo_value, time_value, distancing_indicator) +behav_ind <- behav_ind_mask %>% + full_join(behav_ind_distancing, by = c("geo_value", "time_value")) %>% + as_epi_df() +``` + +## Classification + +## Reference +* McDonald, Daniel J., et al. "Can auxiliary indicators improve COVID-19 +forecasting and hotspot prediction?." Proceedings of the National Academy of +Sciences 118.51 (2021): e2111453118. + +## Attribution + +This object contains a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API.](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html) + +This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University +on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. From bedcae7fb91ac4eedb572e762403c816da797b3c Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Mon, 22 Aug 2022 22:33:27 -0700 Subject: [PATCH 17/29] save progress on lin reg --- vignettes/preprocessing-and-models.Rmd | 73 ++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 10 deletions(-) diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd index b9c07933e..e6d8bd40e 100644 --- a/vignettes/preprocessing-and-models.Rmd +++ b/vignettes/preprocessing-and-models.Rmd @@ -55,6 +55,7 @@ regression, the textbook example for modeling count data, as an illustration for using the `epipredict` package with other existing tidymodel packages. ```{r poisson-reg-data} +## NOTE: built-in data? or use epiprocess built-in data and save code here x <- covidcast( data_source = "jhu-csse", signals = "confirmed_incidence_num", @@ -144,7 +145,7 @@ case_death_counts_subset <- case_death_counts_subset %>% mutate(geo_value_factor = as.factor(geo_value)) %>% as_epi_df() -epi_recipe(case_death_counts_subset) +epi_recipe(case_death_counts_subset) r <- epi_recipe(case_death_counts_subset) %>% add_role(geo_value_factor, new_role = "predictor") %>% @@ -155,7 +156,7 @@ r <- epi_recipe(case_death_counts_subset) %>% step_epi_lag(cases, lag = c(0, 7)) %>% step_epi_lag(deaths, lag = c(0, 7)) %>% step_epi_ahead(deaths, ahead = 7, role = "outcome") %>% - step_epi_naomit() + step_epi_naomit() ``` After specifying the preprocessing steps, we will use the `parsnip` package for @@ -197,15 +198,28 @@ However, by doing so we lose the ability to use the `recipes` framework to create new variables since new variables that do not exist in the original dataset such as lag and leads cannot be called directly in `fit()`. -Alternatively, `step_population_scaling()` in the `epipredict` package can -perform the population scaling if we provide the population data, and we can -model this using other models such as linear regression, which we will -illustrate in the next section. +Alternatively, `step_population_scaling()` and `layer_population_scaling()` +in the `epipredict` package can perform the population scaling if we provide the +population data, and we can model this using other models such as linear +regression, which we will illustrate in the next section. ## Linear Regression -We wish to predict the 7-day ahead death counts with lagged cases and deaths, -along with extra behaviour indicator inputs. Namely, we will use survey data + +The CDC requires submission of case and death count predictions. However, it is +often the case that rate data is easier to model as the outcome as it need not +be an integer. We can use a liner regression to predict the death rates and +use state population data to scale the rates to counts. We will do so using +`layer_population_scaling()` from the `epipredict` package. + +Additionally, when forecasts are submitted, prediction intervals should be +provided along with the point estimates. This can be obtained via postprocessing +`layer_residual_quantiles()`. Although worth pointing out that +`layer_residual_quantiles()` should be used before population scaling or else +the transformation will make the results uninterpretable. + +We wish to predict the 7-day ahead death counts with lagged case rates and death +rates, along with extra behaviour indicator inputs. Namely, we will use survey data from [COVID-19 Trends and Impact Survey](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/fb-survey.html#behavior-indicators). The survey data shows the estimated percentage of people who wore a mask for @@ -214,7 +228,10 @@ percentage of respondents who reported that all or most people they enountered in public in the past 7 days maintained a distance of at least 6 feet. We will convert these percentages into indicator variables as follows: +State-wise population data is obtained from `covidcast::state_census` and will +be used in `layer_population_scaling()`. ```{r} +## NOTE: need to put in package? behav_ind_mask <- covidcast(data_source = "fb-survey", signals = "smoothed_wwearing_mask_7d", time_type = "day", @@ -240,9 +257,45 @@ behav_ind_distancing <- covidcast(data_source = "fb-survey", mutate(distancing_indicator = case_when(percentage < 20 ~ 0, TRUE ~ 1)) %>% select(geo_value, time_value, distancing_indicator) + +pop_dat <- covidcast::state_census %>% + select(POPESTIMATE2019, ABBR) %>% + mutate(ABBR = tolower(ABBR)) %>% + filter(ABBR %in% c("ca","fl","tx","ny","nj")) + behav_ind <- behav_ind_mask %>% - full_join(behav_ind_distancing, by = c("geo_value", "time_value")) %>% - as_epi_df() + full_join(behav_ind_distancing, by = c("geo_value", "time_value")) + +dim(behav_ind) +``` + +We will take a subset of death rate and case rate data from our built-in dataset +`case_death_rate_subset`. +```{r} +jhu <- case_death_rate_subset %>% + dplyr::filter(time_value >= "2021-06-04", + time_value <= "2021-12-31", + geo_value %in% c("ca","fl","tx","ny","nj")) + +dim(jhu) +``` + +Preprocessing steps will rely on functions from the `epipredict` package as well +as the `recipes` package: + +```{r} +jhu <- jhu %>% + mutate(geo_value_factor = as.factor(geo_value)) %>% + full_join(behav_ind, by = c("geo_value", "time_value")) %>% + as_epi_df() + +r <- epi_recipe(jhu) %>% + add_role(geo_value_factor, new_role = "predictor") %>% + recipes::step_dummy(geo_value_factor) %>% + step_epi_lag(case_rate, lag = c(0, 7)) %>% + step_epi_lag(death_rate, lag = c(0, 7)) %>% + step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>% + step_epi_naomit() ``` ## Classification From 0ef9aca49043b346fac293ff3349867891b1432a Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Tue, 23 Aug 2022 10:25:44 -0700 Subject: [PATCH 18/29] lin reg part done, classifcation wip --- vignettes/preprocessing-and-models.Rmd | 58 ++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd index e6d8bd40e..59ea97a19 100644 --- a/vignettes/preprocessing-and-models.Rmd +++ b/vignettes/preprocessing-and-models.Rmd @@ -33,12 +33,13 @@ we will use a linear regression to predict death rates. Last but not least, we will create a classification model for hotspot predictions. ```{r, warning=FALSE, message=FALSE} -library(tidyverse) library(covidcast) library(epidatr) library(epiprocess) library(epipredict) library(recipes) +library(parsnip) +library(workflows) library(poissonreg) ``` @@ -149,7 +150,7 @@ epi_recipe(case_death_counts_subset) r <- epi_recipe(case_death_counts_subset) %>% add_role(geo_value_factor, new_role = "predictor") %>% - recipes::step_dummy(geo_value_factor) %>% + step_dummy(geo_value_factor) %>% step_mutate(cases = case_when(cases < 0 ~ 0, TRUE ~ cases)) %>% ## NOTE: why are there negative numbers in here? step_mutate(deaths = case_when(deaths < 0 ~ 0, TRUE ~ deaths)) %>% @@ -261,7 +262,8 @@ behav_ind_distancing <- covidcast(data_source = "fb-survey", pop_dat <- covidcast::state_census %>% select(POPESTIMATE2019, ABBR) %>% mutate(ABBR = tolower(ABBR)) %>% - filter(ABBR %in% c("ca","fl","tx","ny","nj")) + filter(ABBR %in% c("ca","fl","tx","ny","nj")) %>% + rename(value = POPESTIMATE2019, geo_value = ABBR) behav_ind <- behav_ind_mask %>% full_join(behav_ind_distancing, by = c("geo_value", "time_value")) @@ -283,6 +285,12 @@ dim(jhu) Preprocessing steps will rely on functions from the `epipredict` package as well as the `recipes` package: +Additionally, there are also many functions in the `recipes` package that allow for +[function transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations), +such as log transformations and data centering. In our case, we will +center the numerical predictors to allow for meaningful interpretation of the +intercept. + ```{r} jhu <- jhu %>% mutate(geo_value_factor = as.factor(geo_value)) %>% @@ -291,13 +299,55 @@ jhu <- jhu %>% r <- epi_recipe(jhu) %>% add_role(geo_value_factor, new_role = "predictor") %>% - recipes::step_dummy(geo_value_factor) %>% + step_dummy(geo_value_factor) %>% step_epi_lag(case_rate, lag = c(0, 7)) %>% step_epi_lag(death_rate, lag = c(0, 7)) %>% step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>% + step_center(contains("lag"), role = "predictor") %>% step_epi_naomit() ``` +As a sanity check we can see what columns exist in the preprocessed data now: +```{r, warning = FALSE, echo = FALSE} +glimpse(slice_sample(bake(prep(r, jhu),jhu), n = 6)) +``` + +Before directly predicting the results, we need to add postprocessing layers to +obtain the death counts instead of death rates. + +```{r} +f <- frosting() %>% + layer_predict() %>% + # layer_add_target_date() %>% ## NOTE: this gives 2022-01-21 instead of 2022-01-07? + layer_add_target_date("2022-01-07") %>% # max(jhu$time_value) + 7 + layer_threshold(.pred, lower = 0) %>% + layer_residual_quantiles(probs = c(0.0275, 0.975), symmetrize = FALSE) %>% + layer_naomit(.pred) %>% + layer_population_scaling(.pred, df = pop_dat, + by = c("geo_value"), + df_pop_col = "value") + ## NOTE: how to scale `.pred_distn` if it is type `dist` + ## NOTE: need to add `dist` handling in `layer_population_scaling`? + +wf <- epi_workflow(r, parsnip::linear_reg()) %>% + fit(jhu) %>% + add_frosting(f) + +latest <- get_test_data(recipe = r, x = jhu) +predict(wf, latest) + +# lapply(nested_quantiles(predict(wf, latest)$.pred_distn), `[[`, 1) +``` + +The point estimate of death counts on Jan 7, 2022 is returned in column +`.pred_original`. + +Last but not least, let's take a look at the regression fit and check the +coefficients: +```{r, echo =FALSE} +wf$fit$fit +``` + ## Classification ## Reference From ab5a288053dde1872d6ceceabdbe16562b78509d Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Tue, 23 Aug 2022 17:08:32 -0700 Subject: [PATCH 19/29] hotspot model not compatible with epi_workflow? --- vignettes/preprocessing-and-models.Rmd | 132 ++++++++++++++++++++++--- 1 file changed, 121 insertions(+), 11 deletions(-) diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd index 59ea97a19..5073a2214 100644 --- a/vignettes/preprocessing-and-models.Rmd +++ b/vignettes/preprocessing-and-models.Rmd @@ -26,6 +26,9 @@ steps from the `recipes` package for more feature engineering. In this vignette, we will illustrate some examples of how to use `epipredict` with `recipes` and `parsnip` for different purposes of epidemiology forecasting. +We will focus on simple basic autoregressive models, in which COVID cases and +deaths in the near future are predicted using a linear combination of cases and +deaths in the near past. The remaining vignette will be split into three sections. The first section, we will use a Poisson regression to predict death counts. In the second section, @@ -207,11 +210,11 @@ regression, which we will illustrate in the next section. ## Linear Regression -The CDC requires submission of case and death count predictions. However, it is -often the case that rate data is easier to model as the outcome as it need not -be an integer. We can use a liner regression to predict the death rates and -use state population data to scale the rates to counts. We will do so using -`layer_population_scaling()` from the `epipredict` package. +For COVID-19, the CDC requires submission of case and death count predictions. +However, it is often the case that rate data is easier to model as the outcome +as it need not be an integer. We can use a liner regression to predict the death +rates and use state population data to scale the rates to counts. We will do so +using `layer_population_scaling()` from the `epipredict` package. Additionally, when forecasts are submitted, prediction intervals should be provided along with the point estimates. This can be obtained via postprocessing @@ -323,11 +326,9 @@ f <- frosting() %>% layer_threshold(.pred, lower = 0) %>% layer_residual_quantiles(probs = c(0.0275, 0.975), symmetrize = FALSE) %>% layer_naomit(.pred) %>% - layer_population_scaling(.pred, df = pop_dat, + layer_population_scaling(.pred, .pred_distn, df = pop_dat, by = c("geo_value"), df_pop_col = "value") - ## NOTE: how to scale `.pred_distn` if it is type `dist` - ## NOTE: need to add `dist` handling in `layer_population_scaling`? wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) %>% @@ -335,12 +336,12 @@ wf <- epi_workflow(r, parsnip::linear_reg()) %>% latest <- get_test_data(recipe = r, x = jhu) predict(wf, latest) - -# lapply(nested_quantiles(predict(wf, latest)$.pred_distn), `[[`, 1) +nested_quantiles(predict(wf, latest)$.pred_distn_original) ``` The point estimate of death counts on Jan 7, 2022 is returned in column -`.pred_original`. +`.pred_original`. The predictions based on residual quantiles is stored in +`.pred_distn_original` and also scaled to count data. Last but not least, let's take a look at the regression fit and check the coefficients: @@ -350,6 +351,115 @@ wf$fit$fit ## Classification +Sometimes predictive model for future trends are of interest. In other words, +the target is to predict if the future will have increasing case rates(up), +decreasing case rates (down), or flat case rates (flat). Such model are often +referred to as hotspot prediction models. We will refer to the model notations +from McDonald et al. and illustrate how modeling can be streamlined in our +framework with other classifcations. + +Hotspot prediction predicts a categorical variable defined in terms of the +relative change of $Y_{\ell, t+a}$ compared to $Y_{\ell, t}$. +Where $Y_{\ell, t}$ denotes the case rates in location $\ell$ at time $t$. +We define the response variables as follows: + +$$ + Z_{\ell, t}= + \begin{cases} + \text{up}, & \text{if}\ Y^{\Delta}_{\ell, t} > 0.25 \\ + \text{down}, & \text{if}\ Y^{\Delta}_{\ell, t} < -0.25\\ + \text{flat}, & \text{otherwise} + \end{cases} +$$ +where $Y^{\Delta}_{\ell, t} = (Y_{\ell, t}- Y_{\ell, t-7})/(Y_{\ell, t-7})$. +We say location $\ell$ is a hotspot at time $t$ when $Z_{\ell,t}$ is categorized +as 'up', meaning the number of newly reported cases over the past 7 days has +increased by at least 25% compared to the preceding week. w\When $Z_{\ell,t}$ +is categorized as 'down', it suggests that there has been at least a 25% +decrease in newly reported cases over the past 7 days. Otherwise, we will +consider the trends to be flat. + + + ## Reference * McDonald, Daniel J., et al. "Can auxiliary indicators improve COVID-19 forecasting and hotspot prediction?." Proceedings of the National Academy of From a38fb56fe783a71a4d507e6fc3c25e50329347f2 Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Tue, 23 Aug 2022 17:18:13 -0700 Subject: [PATCH 20/29] fix minor commenting bug --- vignettes/preprocessing-and-models.Rmd | 28 ++++++++++++++------------ 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd index 5073a2214..9d38d90e0 100644 --- a/vignettes/preprocessing-and-models.Rmd +++ b/vignettes/preprocessing-and-models.Rmd @@ -379,6 +379,7 @@ is categorized as 'down', it suggests that there has been at least a 25% decrease in newly reported cases over the past 7 days. Otherwise, we will consider the trends to be flat. +TODO: try logistic regression on two categories and see if it works? +# using_formula <- +# discrim_linear() %>% +# set_engine("MASS") %>% +# fit_xy(x = new_dat%>% +# select(-trend,-mask_indicator,-distancing_indicator,-time_value,-geo_value), +# y= new_dat$trend) +#``` +--> ## Reference * McDonald, Daniel J., et al. "Can auxiliary indicators improve COVID-19 From ecf075511a235b58582a403374f1431aad410232 Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Wed, 24 Aug 2022 11:07:01 -0700 Subject: [PATCH 21/29] fix error --- vignettes/preprocessing-and-models.Rmd | 64 ++++++-------------------- 1 file changed, 15 insertions(+), 49 deletions(-) diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd index 9d38d90e0..6f69cda91 100644 --- a/vignettes/preprocessing-and-models.Rmd +++ b/vignettes/preprocessing-and-models.Rmd @@ -367,20 +367,17 @@ $$ Z_{\ell, t}= \begin{cases} \text{up}, & \text{if}\ Y^{\Delta}_{\ell, t} > 0.25 \\ - \text{down}, & \text{if}\ Y^{\Delta}_{\ell, t} < -0.25\\ - \text{flat}, & \text{otherwise} + \text{other}, & \text{otherwise} \end{cases} $$ where $Y^{\Delta}_{\ell, t} = (Y_{\ell, t}- Y_{\ell, t-7})/(Y_{\ell, t-7})$. We say location $\ell$ is a hotspot at time $t$ when $Z_{\ell,t}$ is categorized as 'up', meaning the number of newly reported cases over the past 7 days has -increased by at least 25% compared to the preceding week. w\When $Z_{\ell,t}$ -is categorized as 'down', it suggests that there has been at least a 25% -decrease in newly reported cases over the past 7 days. Otherwise, we will -consider the trends to be flat. +increased by at least 25% compared to the preceding week. + +TODO: add model expression here. (after code works?) TODO: try logistic regression on two categories and see if it works? - +``` ## Reference * McDonald, Daniel J., et al. "Can auxiliary indicators improve COVID-19 From 63be91964df1335880acf52e0ead24d21e2da734 Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Thu, 25 Aug 2022 18:30:06 -0700 Subject: [PATCH 22/29] classification working --- vignettes/preprocessing-and-models.Rmd | 93 +++++++++++++++----------- 1 file changed, 54 insertions(+), 39 deletions(-) diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd index 6f69cda91..82f2b4a92 100644 --- a/vignettes/preprocessing-and-models.Rmd +++ b/vignettes/preprocessing-and-models.Rmd @@ -321,7 +321,6 @@ obtain the death counts instead of death rates. ```{r} f <- frosting() %>% layer_predict() %>% - # layer_add_target_date() %>% ## NOTE: this gives 2022-01-21 instead of 2022-01-07? layer_add_target_date("2022-01-07") %>% # max(jhu$time_value) + 7 layer_threshold(.pred, lower = 0) %>% layer_residual_quantiles(probs = c(0.0275, 0.975), symmetrize = FALSE) %>% @@ -335,8 +334,20 @@ wf <- epi_workflow(r, parsnip::linear_reg()) %>% add_frosting(f) latest <- get_test_data(recipe = r, x = jhu) -predict(wf, latest) -nested_quantiles(predict(wf, latest)$.pred_distn_original) +p <- predict(wf, latest) +p +``` + +To look at the prediction intervals: +```{r} +# method 1 +nested_quantiles(p$.pred_distn_original) +# method 2 +dstn = p$.pred_distn_original +q = distributional::parameters(dstn)$q # a list of length nrow(p) +q +tau = distributional::parameters(dstn)$tau +tau ``` The point estimate of death counts on Jan 7, 2022 is returned in column @@ -355,8 +366,8 @@ Sometimes predictive model for future trends are of interest. In other words, the target is to predict if the future will have increasing case rates(up), decreasing case rates (down), or flat case rates (flat). Such model are often referred to as hotspot prediction models. We will refer to the model notations -from McDonald et al. and illustrate how modeling can be streamlined in our -framework with other classifcations. +from McDonald et al. but extend the application to predict three categories +instead of two. Hotspot prediction predicts a categorical variable defined in terms of the relative change of $Y_{\ell, t+a}$ compared to $Y_{\ell, t}$. @@ -367,17 +378,18 @@ $$ Z_{\ell, t}= \begin{cases} \text{up}, & \text{if}\ Y^{\Delta}_{\ell, t} > 0.25 \\ - \text{other}, & \text{otherwise} + \text{down}, & \text{if}\ Y^{\Delta}_{\ell, t} < -0.25\\ + \text{flat}, & \text{otherwise} \end{cases} $$ + where $Y^{\Delta}_{\ell, t} = (Y_{\ell, t}- Y_{\ell, t-7})/(Y_{\ell, t-7})$. We say location $\ell$ is a hotspot at time $t$ when $Z_{\ell,t}$ is categorized as 'up', meaning the number of newly reported cases over the past 7 days has -increased by at least 25% compared to the preceding week. - -TODO: add model expression here. (after code works?) - -TODO: try logistic regression on two categories and see if it works? +increased by at least 25% compared to the preceding week. w\When $Z_{\ell,t}$ +is categorized as 'down', it suggests that there has been at least a 25% +decrease in newly reported cases over the past 7 days. Otherwise, we will +consider the trends to be flat. Preprocessing steps are similar to the previous models with an additional step of categorizing the response variables. @@ -385,50 +397,53 @@ of categorizing the response variables. We will take a subset of death rate and case rate data from our built-in dataset `case_death_rate_subset`. ```{r} -binner <- function(x) { - x <- cut(x, breaks = c(-Inf, 0.25, Inf), include.lowest = TRUE) - # now return the group number - as.numeric(x) -} - -inc <- c("other", "up") - +jhu <- case_death_rate_subset %>% + dplyr::filter(time_value >= "2021-06-04", + time_value <= "2021-12-31", + geo_value %in% c("ca","fl","tx","ny","nj")) %>% + mutate(geo_value_factor = as.factor(geo_value)) %>% + as_epi_df() r <- epi_recipe(jhu) %>% add_role(time_value, new_role = "predictor") %>% step_dummy(geo_value_factor) %>% step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(case_rate, ahead = 7, role = "predictor") %>% - step_mutate(pct_diff1 = - (lag_0_case_rate - lag_7_case_rate)/lag_7_case_rate) %>% - step_mutate(pct_diff2 = - (lag_7_case_rate - lag_14_case_rate)/lag_14_case_rate) %>% - step_epi_naomit() %>% - step_mutate(trend = pct_diff1) %>% - step_num2factor(trend, - role = "outcome", - transform = binner, - levels = inc) %>% - step_rm(case_rate, + step_mutate(pct_diff1 = case_when(lag_7_case_rate == 0 ~ 0, + TRUE ~ (lag_0_case_rate - lag_7_case_rate)/lag_7_case_rate) ) %>% + step_mutate(pct_diff2 = case_when(lag_14_case_rate == 0 ~ 0, + TRUE ~ (lag_7_case_rate - lag_14_case_rate)/lag_14_case_rate)) %>% + step_mutate(trend = case_when(pct_diff1 < -0.25 ~ "down", + pct_diff1 > 0.25 ~ "up", + TRUE ~ "flat"), + role = "outcome") %>% + step_rm(death_rate, + case_rate, lag_0_case_rate, lag_7_case_rate, lag_14_case_rate, - ahead_7_case_rate) + ahead_7_case_rate) %>% + step_epi_naomit() - ``` -We will use linear discriminant analysis and take a look at the predictions: +We will use a multinomial regression and take a look at the predictions: -```{r, eval=FALSE, echo=FALSE} -# wf <- epi_workflow(r, parsnip::logistic_reg()) %>% -# fit(jhu) +```{r, warning=FALSE} +wf <- epi_workflow(r, parsnip::multinom_reg()) %>% + fit(jhu) + +latest <- get_test_data(recipe = r, x = jhu) +predict(wf, latest) %>% filter(!is.na(.pred_class)) +``` -# latest <- get_test_data(recipe = r, x = jhu) -# predict(wf, latest) -``` +We can also take a look at the fit: +```{r} +wf$fit$fit +``` ## Reference + * McDonald, Daniel J., et al. "Can auxiliary indicators improve COVID-19 forecasting and hotspot prediction?." Proceedings of the National Academy of Sciences 118.51 (2021): e2111453118. From 14a8a22b13612648083ec9bce03b80c4067d4069 Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Fri, 26 Aug 2022 10:28:09 -0700 Subject: [PATCH 23/29] show how to use formula --- vignettes/preprocessing-and-models.Rmd | 43 +++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd index 82f2b4a92..266f8fbc1 100644 --- a/vignettes/preprocessing-and-models.Rmd +++ b/vignettes/preprocessing-and-models.Rmd @@ -391,6 +391,32 @@ is categorized as 'down', it suggests that there has been at least a 25% decrease in newly reported cases over the past 7 days. Otherwise, we will consider the trends to be flat. +The expression of the multinomial regression we will model is as follows: + +$$ +\pi_{j}(x) = \text{Pr}(Y = j|x) = \frac{e^{g_j(x)}}{\sum_{k=0}^2 g_j(x) } +$$ +where $j$ is either down(Y=0), flat(Y=1), or up (Y=2), +$$g_0(x) = 0$$ + +\begin{aligned} +g_1(x)= \text{ln}(\frac{Pr(Y=1|x)}{Pr(Y=0|x)}) = +\beta_{10} + \beta_{11}* x_{\text{time_value}} + \delta_10 s_{\text{state_1}} + +\delta_11 s_{\text{state_2}} + ... \nonumber \\ ++ \beta_{12} Y^{\Delta}_{\ell, t} + +\beta_{13} Y^{\Delta}_{\ell, t-7} +\end{aligned} + +\begin{aligned} +g_2(x)= \text{ln}(\frac{Pr(Y=2|x)}{Pr(Y=0|x)}) = +\beta_{20} + \beta_{21}* x_{\text{time_value}} + \delta_20 s_{\text{state_1}} + +\delta_21 s_{\text{state_2}} + ... \nonumber \\ ++ \beta_{22} Y^{\Delta}_{\ell, t} + +\beta_{23} Y^{\Delta}_{\ell, t-7} +\end{aligned} + + + Preprocessing steps are similar to the previous models with an additional step of categorizing the response variables. @@ -427,7 +453,7 @@ r <- epi_recipe(jhu) %>% ``` -We will use a multinomial regression and take a look at the predictions: +We will fit the multinomial regression and take a look at the predictions: ```{r, warning=FALSE} wf <- epi_workflow(r, parsnip::multinom_reg()) %>% @@ -442,6 +468,21 @@ We can also take a look at the fit: wf$fit$fit ``` +One could also use formula in `epi_recipe()` to achieve the same results as +above. However, only one of `add_formula()`, `add_recipe()`, or +`workflow_variables()` must be specified. For the purpose of using `add_formula` +over `add_recipes`, we will prep and bake our recipe to return a dataset that +will be used for model fitting. +```{r} +b <- bake(prep(r,jhu),jhu) +m_reg <- parsnip::multinom_reg() + +epi_workflow() %>% +add_formula(trend ~ time_value + geo_value + pct_diff1 + pct_diff2 )%>% +add_model(m_reg) %>% +fit(data = b) +``` + ## Reference * McDonald, Daniel J., et al. "Can auxiliary indicators improve COVID-19 From b91bc99c34d9b391502a898dc84f5491b60fd254 Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Fri, 26 Aug 2022 13:49:09 -0700 Subject: [PATCH 24/29] addressed all feedbacks --- vignettes/more-recipes.Rmd | 277 ------------------------- vignettes/preprocessing-and-models.Rmd | 74 ++++++- 2 files changed, 66 insertions(+), 285 deletions(-) delete mode 100644 vignettes/more-recipes.Rmd diff --git a/vignettes/more-recipes.Rmd b/vignettes/more-recipes.Rmd deleted file mode 100644 index dee09fd1d..000000000 --- a/vignettes/more-recipes.Rmd +++ /dev/null @@ -1,277 +0,0 @@ ---- -title: Apply other steps and models from recipes and parsnip packages -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Apply other steps and models from recipes and parsnip packages} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- -The `epipredict` package utilizes the `tidymodels` framework, namely -[`recipes`](https://recipes.tidymodels.org/) for -[dplyr](https://dplyr.tidyverse.org/)-like pipeable sequences -of feature engineering and [`parsnip`](https://parsnip.tidymodels.org/) for a -unified interface to a range of models. - -`epipredict` has additional customized feature engineering and preprocessing steps -tailored to fit the needs of epidemiology forecasting, such as `step_epi_shift()`, -`step_population_scaling()`, `step_epi_naomit()`. They can be used along with -steps from the `recipes` package for more feature engineering. - -In this vignette, we will demonstrate how to weave `recipes` steps and `parsnip` -models into the `epipredict` workflow. - -First, let's load the necessary packages: - -```{r, warning=FALSE, message =FALSE} -library(epipredict) -library(epiprocess) -library(recipes) -library(parsnip) -library(workflows) -library(poissonreg) -``` - -We will use the built-in dataset `case_death_rate_subset` and apply feature -engineering steps and fit a few models from `parsnip` to predict future death -rates by states 7 days ahead from Dec 31, 2021. - -```{r} -x <- case_death_rate_subset -x$geo_value <- as.factor(x$geo_value) -tail(x) -``` -There are `r dim(x)[1]` rows and `r dim(x)[2]` columns in the data, with dates -ranging from `r min(x$time_value)` to `r max(x$time_value)` in -`r n_distinct(x$geo_value)` states in the U.S. - -# Preprocessing and Feature Engineering - -In this section, we focus on showing how `step_*()` functions from the `recipes` -package can also be applied in our `epipredict` workflows. - -First, create a recipe object from the original data and then specify the -preprocessing steps. Steps are then sequentially added. - -`recipes` package also allows [manual alterations of roles](https://recipes.tidymodels.org/reference/roles.html) -in bulk. There are a couple handy functions that can be used together to help us -manipulate variable roles easily. - -`update_role()` alters an existing role in the recipe or assigns an initial role -to variables that do not yet have a declared role. - -`add_role()` adds an additional role to variables that already have a role in -the recipe, without overwriting old roles. - -`remove_role()` eliminates a single existing role in the recipe. - -Here, we will add an additional `predictor` role to both `case_rate` and -`death_rate` variables. They currently have roles as `raw`. - -```{r create-recipe-object} -r <- epi_recipe(x) %>% - add_role(- epi_keys(x), new_role = "predictor") -``` - -The function automatically categorizes roles into `geo_value`, `time_value` and -`raw` based on variable names, where `raw` includes `case_rate` and `death_rate`. - -We first create a made-up population dataset by states to allow the conversion -from `case_rate` and `death_rate` into `cases` and `deaths`. The binary variable -`social_distancing` indicates whether the state followed social distancing rules. -A 'throw-away' column with all ones is also created to show that `step_rm()` -can be used to remove unwanted columns. -```{r madeup-population-data} -pop_data <- data.frame(states = unique(x$geo_value), - inv_pop = 1/seq(1e5, 1e7, - length.out = n_distinct(x$geo_value)), - social_distancing = rbinom(n = n_distinct(x$geo_value), - size = 1, - prob = 0.5), - throw_away = 1) - -head(pop_data) -``` - -Next, we will feature engineer `case_rate` and `death_rate` into `cases` and -`deaths`and create lags and leads based on them. - -```{r create-new-columns-in-recipe} -r <- r %>% - step_population_scaling(case_rate, death_rate, df = pop_data, - by = c("geo_value" = "states"), - create_new = FALSE, - df_pop_col = "inv_pop") %>% - step_rename(cases = case_rate, deaths = death_rate) %>% - step_rm(throw_away) %>% - step_epi_lag(cases, lag = c(0, 7, 14)) %>% - step_epi_lag(deaths, lag = c(0, 7, 14)) %>% - step_epi_ahead(deaths, ahead = 7, role = "outcome") %>% - step_epi_naomit() %>% - step_rm(cases, deaths) - -``` - -Notice that `step_rename()` was used for renaming of the variables, and -`step_rm()` was used to remove redundant columns. - -A quick view of what columns exist in the the prepossessed data so far: - -```{r peak-data-1} -names(bake(prep(r, x),x)) -``` - - -There are also many functions in the `recipes` package that allow for -[function transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations), -such as log transformations and spline transformations. In our case, we will -center the numerical predictors to allow for meaningful interpretation of the -intercept. - -```{r} -r <- r %>% step_center(contains("lag"), role = "predictor") -``` - -Since the current outcomes are positive, but the linear regression assumes the -outcome to be in full support, we will consider a log-transformation with an -offset value of 1 to ensure `log(0)` does not occur. - -```{r} -r_lin <- r %>% step_log(ahead_7_deaths, role = "outcome", offset = 1) -``` - -Other useful functionalities include creating a set of binary dummy variables -from a factor variable. As an illustration that will not affect the models, we -will show how states in `geo_value` become dummy variables so that each state -as a dummy variable has a differential intercept coefficient. - -```{r dummy-variable} -length(unique(x$geo_value)) - -dummies <- r %>% - step_dummy(geo_value) %>% - prep(training = x) - -dummy_data <- bake(dummies, x) - -names(dummy_data) -``` - -Once the pre-processing steps are done, we can move on to model fitting using -the `parsnip` package. - - -# Model Fitting and Post-processing - -Sometimes, the exact values are desired, therefore fitting a linear regression -or poisson regression would be a good idea. - -On the other hand, say instead of predicting 7-day ahead death counts, we would -like to classify if the future predictions are high or low based on some given threshold. -In this case, a logistic regression among other classification models can be used. -For the purpose of illustration, we define two categories: less than 10K(low) -and more than 10K(high). - -We resort to the `recipes` package again to change the numerical outcomes to -categorical for the purpose of classification: - -```{r, warning = FALSE} -binner <- function(x) { - x <- cut(x, breaks = c(0, 10000, Inf), include.lowest = TRUE) - # now return the group number - as.numeric(x) -} - -inc <- c("low", "high") - -r_categorical <- r %>% - step_num2factor(ahead_7_deaths, - role = "outcome", - transform = binner, - levels = inc) - -``` - -As a sanity check we can see now that the preprocessed data has a categorical -outcome in `ahead_7_deaths`. - -```{r, warning FALSE, echo = FALSE} -glimpse(slice_sample(bake(prep(r_categorical, x),x), n = 6)) -``` - - -### Linear Regression - -First, we can try fitting a linear regression: - -```{r linear-fit-workflow, warning = FALSE} -wf <- epi_workflow(r_lin, parsnip::linear_reg()) %>% - fit(x) - -latest <- get_test_data(recipe = r_lin, x = x) -``` - -We take a look at the linear regression fit and the values of the coefficients. - -```{r linear-fit, warning = FALSE} -wf$fit$fit # look at the model fit and coefficients - -p <- predict(wf, latest) %>% filter(!is.na(.pred)) - -p -``` - -The predictions are saved in the `.pred` column and represents the log-transformed -predicted death counts 7 days from Dec 31. 2021. - - -### Poisson regression - -For count data, we can use poisson regression. Although the counts here are not -integer since they were calculated from case rates multiplying population, some -extra preprocessing steps can help us round them to integers to allow fitting a -poisson regression. - - -```{r poisson-fit} -r <- r %>% - step_mutate(ahead_7_deaths = round(ahead_7_deaths, 0), - role = "predictor") %>% - step_filter(ahead_7_deaths > 0) - -latest <- get_test_data(recipe = r, x = x) - -wf <- epi_workflow(r, parsnip::poisson_reg()) %>% - fit(x) - -predict(wf, latest) %>% filter(!is.na(.pred)) -``` - -Let's also take a look at the poisson regression fit: -```{r, echo =FALSE} -wf$fit$fit -``` - -### Logistic Regression - -We use a logistic regression to predict whether 7-day ahead death counts are high -or low. -```{r logistic, warning = FALSE} -wf <- epi_workflow(r_categorical, parsnip::logistic_reg()) %>% - fit(x) -latest <- get_test_data(recipe = r_categorical, x = x) -p <- predict(wf, latest) %>% filter(!is.na(.pred_class)) -p -``` - -The model fit is as follows: -```{r echo=FALSE} -wf$fit$fit -``` - -# Attribution - -This object contains a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API.](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html) - -This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University -on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. - diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd index 266f8fbc1..673b3cf9f 100644 --- a/vignettes/preprocessing-and-models.Rmd +++ b/vignettes/preprocessing-and-models.Rmd @@ -321,7 +321,7 @@ obtain the death counts instead of death rates. ```{r} f <- frosting() %>% layer_predict() %>% - layer_add_target_date("2022-01-07") %>% # max(jhu$time_value) + 7 + layer_add_target_date("2022-01-07") %>% layer_threshold(.pred, lower = 0) %>% layer_residual_quantiles(probs = c(0.0275, 0.975), symmetrize = FALSE) %>% layer_naomit(.pred) %>% @@ -401,16 +401,16 @@ $$g_0(x) = 0$$ \begin{aligned} g_1(x)= \text{ln}(\frac{Pr(Y=1|x)}{Pr(Y=0|x)}) = -\beta_{10} + \beta_{11}* x_{\text{time_value}} + \delta_10 s_{\text{state_1}} + -\delta_11 s_{\text{state_2}} + ... \nonumber \\ +\beta_{10} + \beta_{11}* x_{\text{time_value}} + \delta_{10} s_{\text{state_1}} + +\delta_{11} s_{\text{state_2}} + ... \nonumber \\ + \beta_{12} Y^{\Delta}_{\ell, t} + \beta_{13} Y^{\Delta}_{\ell, t-7} \end{aligned} \begin{aligned} g_2(x)= \text{ln}(\frac{Pr(Y=2|x)}{Pr(Y=0|x)}) = -\beta_{20} + \beta_{21}* x_{\text{time_value}} + \delta_20 s_{\text{state_1}} + -\delta_21 s_{\text{state_2}} + ... \nonumber \\ +\beta_{20} + \beta_{21}* x_{\text{time_value}} + \delta_{20} s_{\text{state_1}} + +\delta_{21} s_{\text{state_2}} + ... \nonumber \\ + \beta_{22} Y^{\Delta}_{\ell, t} + \beta_{23} Y^{\Delta}_{\ell, t-7} \end{aligned} @@ -478,16 +478,74 @@ b <- bake(prep(r,jhu),jhu) m_reg <- parsnip::multinom_reg() epi_workflow() %>% -add_formula(trend ~ time_value + geo_value + pct_diff1 + pct_diff2 )%>% +add_formula(trend ~ time_value + geo_value + pct_diff1 + pct_diff2) %>% add_model(m_reg) %>% fit(data = b) ``` -## Reference +## Benefits of Lagging and Leading in `epipredict` -* McDonald, Daniel J., et al. "Can auxiliary indicators improve COVID-19 +The `step_epi_ahead` and `step_epi_lag` functions in the `epipredict` package +is handy for creating correct lags and leads for future predictions. + +Let's start with a simple dataset and preprocessing: +```{r} +ex <- case_death_rate_subset %>% + dplyr::filter(time_value >= "2021-12-01", + time_value <= "2021-12-31", + geo_value == "ca") + +dim(ex) +``` + +We want to predict death rates on 2022-01-07, which 7 days ahead from the last +available date in our dataset. + +We will compare two methods of trying to create lags and leads: +```{r} +########## +r1 <- epi_recipe(ex) %>% + step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% + step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% + step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>% + step_epi_naomit() %>% + prep() + +b1 <- bake(r1, ex) +dim(b1) +########## +r2 <- epi_recipe(ex) %>% + step_mutate(lag0case_rate = lag(case_rate, 0), + lag7case_rate = lag(case_rate, 7), + lag14case_rate = lag(case_rate, 14), + lag0death_rate = lag(death_rate, 0), + lag7death_rate = lag(death_rate, 7), + lag14death_rate = lag(death_rate, 14), + ahead7death_rate = lead(death_rate, 7)) %>% + step_epi_naomit() %>% + prep() + +b2 <- bake(r2, ex) +dim(b2) +``` + +Notice the difference in number of rows `b1` and `b2` returns. This is because +the recipe that doesn't use `step_epi_ahead` and `step_epi_lag` is missing dates +compared to the one that used the `epipredict` functions. Therefore the model +that is trained is predicting 7 days ahead from `r max(unique(b2$time_value))` +instead of 7 days ahead from `r max(unique(b1$time_value))` which is what we want. +```{r} +unique(b1$time_value) +unique(b2$time_value) +``` + +## References + +::: {#refs} +McDonald, Daniel J., et al. "Can auxiliary indicators improve COVID-19 forecasting and hotspot prediction?." Proceedings of the National Academy of Sciences 118.51 (2021): e2111453118. +::: ## Attribution From 73805e3010941c8989f3dde03aeec6aa30c75c40 Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Fri, 26 Aug 2022 14:10:47 -0700 Subject: [PATCH 25/29] fix layout --- vignettes/preprocessing-and-models.Rmd | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd index 673b3cf9f..3729a3a9b 100644 --- a/vignettes/preprocessing-and-models.Rmd +++ b/vignettes/preprocessing-and-models.Rmd @@ -498,12 +498,11 @@ ex <- case_death_rate_subset %>% dim(ex) ``` -We want to predict death rates on 2022-01-07, which 7 days ahead from the last +We want to predict death rates on 2022-01-07, which is 7 days ahead from the last available date in our dataset. We will compare two methods of trying to create lags and leads: ```{r} -########## r1 <- epi_recipe(ex) %>% step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% @@ -513,7 +512,8 @@ r1 <- epi_recipe(ex) %>% b1 <- bake(r1, ex) dim(b1) -########## + + r2 <- epi_recipe(ex) %>% step_mutate(lag0case_rate = lag(case_rate, 0), lag7case_rate = lag(case_rate, 7), @@ -531,14 +531,23 @@ dim(b2) Notice the difference in number of rows `b1` and `b2` returns. This is because the recipe that doesn't use `step_epi_ahead` and `step_epi_lag` is missing dates -compared to the one that used the `epipredict` functions. Therefore the model -that is trained is predicting 7 days ahead from `r max(unique(b2$time_value))` -instead of 7 days ahead from `r max(unique(b1$time_value))` which is what we want. +compared to the one that used the `epipredict` functions. ```{r} -unique(b1$time_value) -unique(b2$time_value) +dates_used_in_training1 <- + b1 %>% select(- ahead_7_death_rate) %>% na.omit() %>% select(time_value) + +dates_used_in_training1 + +dates_used_in_training2 <- + b2 %>% select(- ahead7death_rate) %>% na.omit() %>% select(time_value) + +dates_used_in_training2 ``` +The model that is trained without the functions is predicting 7 days ahead from +`r max(dates_used_in_training2$time_value)` +instead of 7 days ahead from `r max(dates_used_in_training1$time_value)`. + ## References ::: {#refs} From 2355aa31507f52f259f5c0d60e2c9950dbd35df9 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 13 Sep 2022 11:32:40 -0700 Subject: [PATCH 26/29] some simplifications, (may need to revert). --- vignettes/preprocessing-and-models.Rmd | 84 +++++++++++++------------- 1 file changed, 41 insertions(+), 43 deletions(-) diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd index 3729a3a9b..4764ee8ac 100644 --- a/vignettes/preprocessing-and-models.Rmd +++ b/vignettes/preprocessing-and-models.Rmd @@ -36,7 +36,9 @@ we will use a linear regression to predict death rates. Last but not least, we will create a classification model for hotspot predictions. ```{r, warning=FALSE, message=FALSE} -library(covidcast) +# library(covidcast) +library(tidyr) +library(dplyr) library(epidatr) library(epiprocess) library(epipredict) @@ -62,33 +64,21 @@ for using the `epipredict` package with other existing tidymodel packages. ## NOTE: built-in data? or use epiprocess built-in data and save code here x <- covidcast( data_source = "jhu-csse", - signals = "confirmed_incidence_num", + signals = "confirmed_incidence_num,deaths_incidence_num", time_type = "day", geo_type = "state", time_values = epirange(20210604, 20211231), - geo_values = "ca,fl,tx,ny,nj" -) %>% - fetch_tbl() %>% - select(geo_value, time_value, cases = value) - -y <- covidcast( - data_source = "jhu-csse", - signals = "deaths_incidence_num", - time_type = "day", - geo_type = "state", - time_values = epirange(20210604, 20211231), - geo_values = "ca,fl,tx,ny,nj" -) %>% - fetch_tbl() %>% - select(geo_value, time_value, deaths = value) + geo_values = "ca,fl,tx,ny,nj") %>% + fetch_tbl() case_death_counts_subset <- x %>% - full_join(y, by = c("geo_value", "time_value")) %>% + pivot_wider(names_from = signal, values_from = value) %>% + rename(cases = confirmed_incidence_num, deaths = deaths_incidence_num) %>% + select(geo_value, time_value, cases, deaths) %>% as_epi_df() - ``` -The `case_death_counts_subset` dataset comes from the `covidcast` package,and +The `case_death_counts_subset` dataset comes from the `epidatr` package,and contains the number of confirmed cases and deaths from June 4, 2021 to Dec 31, 2021 in some U.S. states. @@ -115,30 +105,38 @@ data ready for modeling. But before diving into them, it will be helpful to understand what `roles` are in the `recipes` framework. -> `recipes` can assign one or more roles to each column in the data. The roles -> are not restricted to a predefined set; they can be anything. -> For most conventional situations, they are typically “predictor” and/or -> “outcome”. Additional roles enable targeted step operations on specific -> variables or groups of variables. -> -> In our case, the role `predictor` is given to explanatory variable that go to the -> left-hand side of the model. The role `outcome` is the response variable -> that we wish to predict. `geo_value` and `time_value` are predefined roles -> that are unique to the `epipredict` package. Since we work with `epi_df` -> objects, all datasets should have `geo_value` and `time_value` passed through -> automatically with these two roles assigned to the appropriate columns. -> -> `recipes` package also allows [manual alterations of roles](https://recipes.tidymodels.org/reference/roles.html) -> in bulk. There are a couple handy functions that can be used together to help us -> manipulate variable roles easily. -> ->> `update_role()` alters an existing role in the recipe or assigns an initial role ->> to variables that do not yet have a declared role. +--- + +#### Aside on `recipes` + +`recipes` can assign one or more roles to each column in the data. The roles +are not restricted to a predefined set; they can be anything. +For most conventional situations, they are typically “predictor” and/or +"outcome". Additional roles enable targeted `step_*()` operations on specific +variables or groups of variables. + +In our case, the role `predictor` is given to explanatory variable that go to the +left-hand side of the model. The role `outcome` is the response variable +that we wish to predict. `geo_value` and `time_value` are predefined roles +that are unique to the `epipredict` package. Since we work with `epi_df` +objects, all datasets should have `geo_value` and `time_value` passed through +automatically with these two roles assigned to the appropriate columns. + +The `recipes` package also allows [manual alterations of roles](https://recipes.tidymodels.org/reference/roles.html) +in bulk. There are a couple handy functions that can be used together to help us +manipulate variable roles easily. + +> `update_role()` alters an existing role in the recipe or assigns an initial role +> to variables that do not yet have a declared role. > ->> `add_role()` adds an additional role to variables that already have a role in ->> the recipe, without overwriting old roles. +> `add_role()` adds an additional role to variables that already have a role in +> the recipe, without overwriting old roles. > ->> `remove_role()` eliminates a single existing role in the recipe. +> `remove_role()` eliminates a single existing role in the recipe. + +#### End aside + +--- Notice in the following preprocessing steps, we used `add_role()` on `geo_value_factor` since currently the default role for it is `raw` and @@ -168,7 +166,7 @@ modeling and getting the prediction for the 7-day ahead death counts of the latest available date in the dataset. ```{r} -latest <- get_test_data(recipe = r, x = case_death_counts_subset) +latest <- get_test_data(r, case_death_counts_subset) wf <- epi_workflow(r, parsnip::poisson_reg()) %>% fit(case_death_counts_subset) From b844be7a40e9b7280cd970702238a3da4ae80f8e Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 13 Sep 2022 17:15:25 -0700 Subject: [PATCH 27/29] revise vignette, improve population scaling, add census data --- DESCRIPTION | 2 +- R/data.R | 23 ++ R/layer_population_scaling.R | 77 +++-- data-raw/state_census.R | 11 + data/state_census.rda | Bin 0 -> 1274 bytes man/create_layer.Rd | 4 +- man/state_census.Rd | 33 ++ tests/testthat/test-population_scaling.R | 16 + vignettes/preprocessing-and-models.Rmd | 374 ++++++++++++----------- 9 files changed, 329 insertions(+), 211 deletions(-) create mode 100644 data-raw/state_census.R create mode 100644 data/state_census.rda create mode 100644 man/state_census.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 4ff288b44..9056c3f9a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -65,5 +65,5 @@ Remotes: Config/testthat/edition: 3 Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.0 +RoxygenNote: 7.2.1 LazyData: true diff --git a/R/data.R b/R/data.R index 9c93506a3..622d1c7d7 100644 --- a/R/data.R +++ b/R/data.R @@ -33,3 +33,26 @@ #' June 7 is the average of the underlying data for June 1 through 7, #' inclusive. "case_death_rate_subset" + +#' State population data +#' +#' Data set on state populations, from the 2019 US Census. +#' +#' @format Data frame with 57 rows (including one for the United States as a +#' whole, plus the District of Columbia, Puerto Rico Commonwealth, +#' American Samoa, Guam, the U.S. Virgin Islands, and the Northern Mariana, +#' Islands). +#' +#' \describe{ +#' \item{fips}{FIPS code} +#' \item{name}{Full name of the state or territory} +#' \item{pop}{Estimate of the location's resident population in +#' 2019.} +#' \item{abbr}{Postal abbreviation for the location} +#' } +#' +#' @source United States Census Bureau, at +#' \url{https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.pdf}, +#' \url{https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-puerto-rico-municipios.html}, +#' and \url{https://www.census.gov/data/tables/2010/dec/2010-island-areas.html} +"state_census" diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R index 8e219a9e4..29a5cd14a 100644 --- a/R/layer_population_scaling.R +++ b/R/layer_population_scaling.R @@ -1,35 +1,43 @@ -#' Revert population scaled prediction +#' Convert per-capita predictions to raw scale #' #' `layer_population_scaling` creates a specification of a frosting layer -#' that will add a population scaled column in the data. For example, -#' load a dataset that contains county population, and join to an `epi_df` -#' that currently predicts number of new cases by county to obtain case rates. -#' Although worth noting that there is nothing special about "population". -#' The function can be used to scale by any variable. Population is simply the -#' most natural and common use case. +#' that will "undo" per-capita scaling. Typical usage would +#' load a dataset that contains state-level population, and use it to convert +#' predictions made from a rate-scale model to raw scale by multiplying by +#' the population. +#' Although, it is worth noting that there is nothing special about "population". +#' The function can be used to scale by any variable. Population is the +#' standard use case in the epidemiology forecasting scenario. Any value +#' passed will *multiply* the selected variables while the `rate_rescaling` +#' argument is a common *divisor* of the selected variables. #' #' @param frosting a `frosting` postprocessor. The layer will be added to the -#' sequence of operations for this frosting. +#' sequence of operations for this frosting. #' @param ... One or more selector functions to scale variables -#' for this step. See [selections()] for more details. -#' @param df a data frame that contains the population data used for scaling. -#' @param by A character vector of variables to left join by. +#' for this step. See [selections()] for more details. +#' @param df a data frame that contains the population data to be used for +#' inverting the existing scaling. +#' @param by A (possibly named) character vector of variables to join by. #' #' If `NULL`, the default, the function will perform a natural join, using all -#' variables in common across the `epi_df` and the user-provided dataset. -#' If columns in `epi_df` and `df` have the same name (and aren't -#' included in by), `.df` is added to the one from the user-provided data +#' variables in common across the `epi_df` produced by the `predict()` call +#' and the user-provided dataset. +#' If columns in that `epi_df` and `df` have the same name (and aren't +#' included in `by`), `.df` is added to the one from the user-provided data #' to disambiguate. #' #' To join by different variables on the `epi_df` and `df`, use a named vector. -#' For example, by = c("geo_value" = "states") will match `epi_df$geo_value` +#' For example, `by = c("geo_value" = "states")` will match `epi_df$geo_value` #' to `df$states`. To join by multiple variables, use a vector with length > 1. -#' For example, by = c("geo_value" = "states", "county" = "county") will match +#' For example, `by = c("geo_value" = "states", "county" = "county")` will match #' `epi_df$geo_value` to `df$states` and `epi_df$county` to `df$county`. #' #' See [dplyr::left_join()] for more details. #' @param df_pop_col the name of the column in the data frame `df` that #' contains the population data and used for scaling. +#' @param rate_rescaling Sometimes rates are "per 100K" or "per 1M" rather than +#' "per person". Adjustments can be made here. For example, if the original +#' rate is "per 100K", then set `rate_rescaling = 1e5` to get counts back. #' @param create_new TRUE to create a new column and keep the original column #' in the `epi_df`. #' @param suffix a character. The suffix added to the column name if @@ -46,8 +54,7 @@ #' dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% #' dplyr::select(geo_value, time_value, cases) #' -#' pop_data = data.frame(states = c("ca", "ny"), -#' value = c(20000, 30000)) +#' pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000)) #' #' r <- epi_recipe(jhu) %>% #' step_population_scaling(df = pop_data, @@ -71,12 +78,12 @@ #' parsnip::fit(jhu) %>% #' add_frosting(f) #' -#' latest <- get_test_data(recipe = r, -#' x = epiprocess::jhu_csse_daily_subset %>% -#' dplyr::filter(time_value > "2021-11-01", -#' geo_value %in% c("ca", "ny")) %>% -#' dplyr::select(geo_value, time_value, cases)) -#' +#' latest <- get_test_data( +#' recipe = r, +#' x = epiprocess::jhu_csse_daily_subset %>% +#' dplyr::filter(time_value > "2021-11-01", +#' geo_value %in% c("ca", "ny")) %>% +#' dplyr::select(geo_value, time_value, cases)) #' #' predict(wf, latest) layer_population_scaling <- function(frosting, @@ -84,17 +91,26 @@ layer_population_scaling <- function(frosting, df, by = NULL, df_pop_col, + rate_rescaling = 1, create_new = TRUE, - suffix = "_original", + suffix = "_scaled", .flag = TRUE, id = rand_id("population_scaling")) { + arg_is_scalar(df_pop_col, rate_rescaling, create_new, suffix, .flag, id) + arg_is_lgl(create_new, .flag) + arg_is_chr(df_pop_col, suffix, id) + arg_is_chr(by, allow_null = TRUE) + if (rate_rescaling <= 0) + cli_stop("`rate_rescaling` should be a positive number") + add_layer( frosting, layer_population_scaling_new( df = df, by = by, df_pop_col = df_pop_col, + rate_rescaling = rate_rescaling, terms = dplyr::enquos(...), create_new = create_new, suffix = suffix, @@ -105,11 +121,12 @@ layer_population_scaling <- function(frosting, } layer_population_scaling_new <- - function(df, by, df_pop_col, terms, create_new, suffix, id) { + function(df, by, df_pop_col, rate_rescaling, terms, create_new, suffix, id) { layer("population_scaling", df = df, by = by, df_pop_col = df_pop_col, + rate_rescaling = rate_rescaling, terms = terms, create_new = create_new, suffix = suffix, @@ -125,8 +142,9 @@ slather.layer_population_scaling <- try_join <- try(dplyr::left_join(components$predictions, object$df, by= object$by), silent = TRUE) - if (any(grepl("Join columns must be present in data", unlist(try_join)))){ - stop("columns in `by` selectors of `layer_population_scaling` must be present in data and match")} + if (any(grepl("Join columns must be present in data", unlist(try_join)))) { + cli_stop(c("columns in `by` selectors of `layer_population_scaling`", + "must be present in data and match"))} object$df <- object$df %>% dplyr::mutate(dplyr::across(where(is.character), tolower)) @@ -136,13 +154,12 @@ slather.layer_population_scaling <- col_names <- names(pos) suffix = ifelse(object$create_new, object$suffix, "") - components$predictions <- dplyr::left_join(components$predictions, object$df, by= object$by, suffix = c("", ".df")) %>% dplyr::mutate(dplyr::across(dplyr::all_of(col_names), - ~.x * !!pop_col , + ~.x * !!pop_col / object$rate_rescaling , .names = "{.col}{suffix}")) %>% dplyr::select(- !!pop_col) components diff --git a/data-raw/state_census.R b/data-raw/state_census.R new file mode 100644 index 000000000..22dde5a41 --- /dev/null +++ b/data-raw/state_census.R @@ -0,0 +1,11 @@ +library(dplyr) +library(tidyr) + +state_census <- covidcast::state_census %>% + select(STATE, NAME, POPESTIMATE2019, ABBR) %>% + rename(abbr = ABBR, name = NAME, pop = POPESTIMATE2019, fips = STATE) %>% + mutate(abbr = tolower(abbr)) %>% + as_tibble() + + +usethis::use_data(state_census, overwrite = TRUE) diff --git a/data/state_census.rda b/data/state_census.rda new file mode 100644 index 0000000000000000000000000000000000000000..1118db0d01236a12d67aac1d43ce2f87a694d6f0 GIT binary patch literal 1274 zcmVO*I=t^i8M%qa#B=00TklVgMQd001;J0MGyc1Q{@cAQ~8hCYmx}BLXsMlS4)z z#ALuGjRB(|Vj5vI$kRrFlO{|6Bv6Wv)if~x(?AUaKmY&$^nf$~00006O+8H-dVl}` z4FQ*ISdcj1#rOPL@tgmOWRN0ikg9V-z7#)Oj-Y@oPgvSpD7U#Vt!3Kf=NJyfmF4R@ z=9l;lYtJ^pZsQ9N64lN#tgM}5Cl>?(3jm}fOdtzFKnMZ`fdsZNA!j`)NJv(#CNSDe zIR;&qcrQF)={Y@1AmIFtVam!Go0t<(1w$yHfkP*KPwZj{zA!+m7<}pXVgf<6nh6M2 z(5W|Z_AT!=c9c;)y*p{8HAzt-a}~L5cYhDM|9TY3BcqB1#aqE1AhJPhWc0LXk)-$_ zPXa~ZOt7RKK};2zK<6zW)v(f)kSIYC6p4O$%;)hb08aeq@L# zG$2ieeWZ{Xrhy>0VM+Ts*3lLcqz?l@agKqQbhLiiB*&CQAB^`IH0;9)*n`NMYN!!1 zsef3<0;Cm)hnz$L7$p#*;-y_wrZteA;+>P2Z^Hnt2d{D%-K(*VE;c(XtvjF9gy{;o zw%cuuw%!f|8yj?0rqBw~zgr7gDW87v4_|A+LLHQc&bmPP2fh4|Swuu4RLK|q!azEC zAUlvXt9ZaiIxAw)aySlQ-4P~vK(`^xm=R-#%**uUQ-80qkRU{Brg233Vi_t-u%^az zY|&7RTod99DS-tjkoj_<0@&){lmixxoVIS^VB?M|VcsB7bYv}90n$-WF zIY+FIIR$FRP|c%Z7m0_-3QKuz~wrFusq zaD;RMIZOH)QW-!L5#p^ZMm?B}>ynqAWPBxM$!G$Fh(Q)pB!-yjnQWkvrUJBbf`%dN z2S9_f@Es7OZFA_6HPbwlz;Bou2;*WgUzyuY0wV%SgPak~DAH(?f-FeIx~4)yAzOir zSy>wRN^%p$MZ|U%87c`CY=rlOz?wjs=?5lI69ge-1Y|#Hqp~K|DaLnfd}0jH9f4JE z0E#I>DJn;7T66OdfFQsZBq6@5%B?lx2-3|W5H3@ekiOV literal 0 HcmV?d00001 diff --git a/man/create_layer.Rd b/man/create_layer.Rd index 7917e8854..81f5e33b0 100644 --- a/man/create_layer.Rd +++ b/man/create_layer.Rd @@ -9,9 +9,9 @@ create_layer(name = NULL, open = rlang::is_interactive()) \arguments{ \item{name}{Either a name without extension, or \code{NULL} to create the paired file based on currently open file in the script editor. If -the R file is open, \code{use_test()} will create/open the corresponding +the \verb{R/} file is open, \code{use_test()} will create/open the corresponding test file; if the test file is open, \code{use_r()} will create/open the -corresponding R file.} +corresponding \verb{R/} file.} \item{open}{Whether to open the file for interactive editing.} } diff --git a/man/state_census.Rd b/man/state_census.Rd new file mode 100644 index 000000000..eec13eb53 --- /dev/null +++ b/man/state_census.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{state_census} +\alias{state_census} +\title{State population data} +\format{ +Data frame with 57 rows (including one for the United States as a +whole, plus the District of Columbia, Puerto Rico Commonwealth, +American Samoa, Guam, the U.S. Virgin Islands, and the Northern Mariana, +Islands). + +\describe{ +\item{fips}{FIPS code} +\item{name}{Full name of the state or territory} +\item{pop}{Estimate of the location's resident population in +2019.} +\item{abbr}{Postal abbreviation for the location} +} +} +\source{ +United States Census Bureau, at +\url{https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.pdf}, +\url{https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-puerto-rico-municipios.html}, +and \url{https://www.census.gov/data/tables/2010/dec/2010-island-areas.html} +} +\usage{ +state_census +} +\description{ +Data set on state populations, from the 2019 US Census. +} +\keyword{datasets} diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R index 2404a7b4c..4a360cd71 100644 --- a/tests/testthat/test-population_scaling.R +++ b/tests/testthat/test-population_scaling.R @@ -116,6 +116,22 @@ test_that("Postprocessing workflow works and values correct", { expect_equal(nrow(p), 2L) expect_equal(ncol(p), 4L) expect_equal(p$.pred_original, p$.pred * c(20000, 30000)) + + f <- frosting() %>% + layer_predict() %>% + layer_threshold(.pred) %>% + layer_naomit(.pred) %>% + layer_population_scaling(.pred, df = pop_data, rate_rescaling = 10000, + by = c("geo_value" = "states"), + df_pop_col = "value") + wf <- epi_workflow(r, parsnip::linear_reg()) %>% + fit(jhu) %>% + add_frosting(f) + expect_silent(p <- predict(wf, latest)) + expect_equal(nrow(p), 2L) + expect_equal(ncol(p), 4L) + expect_equal(p$.pred_original, p$.pred * c(2, 3)) + }) test_that("Postprocessing to get cases from case rate", { diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd index 4764ee8ac..2209cf6ad 100644 --- a/vignettes/preprocessing-and-models.Rmd +++ b/vignettes/preprocessing-and-models.Rmd @@ -61,20 +61,28 @@ regression, the textbook example for modeling count data, as an illustration for using the `epipredict` package with other existing tidymodel packages. ```{r poisson-reg-data} -## NOTE: built-in data? or use epiprocess built-in data and save code here x <- covidcast( data_source = "jhu-csse", - signals = "confirmed_incidence_num,deaths_incidence_num", + signals = "confirmed_incidence_num", time_type = "day", geo_type = "state", time_values = epirange(20210604, 20211231), geo_values = "ca,fl,tx,ny,nj") %>% - fetch_tbl() + fetch_tbl() %>% + select(geo_value, time_value, cases = value) + +y <- covidcast( + data_source = "jhu-csse", + signals = "deaths_incidence_num", + time_type = "day", + geo_type = "state", + time_values = epirange(20210604, 20211231), + geo_values = "ca,fl,tx,ny,nj") %>% + fetch_tbl() %>% + select(geo_value, time_value, deaths = value) case_death_counts_subset <- x %>% - pivot_wider(names_from = signal, values_from = value) %>% - rename(cases = confirmed_incidence_num, deaths = deaths_incidence_num) %>% - select(geo_value, time_value, cases, deaths) %>% + full_join(y, by = c("geo_value", "time_value")) %>% as_epi_df() ``` @@ -140,29 +148,28 @@ manipulate variable roles easily. Notice in the following preprocessing steps, we used `add_role()` on `geo_value_factor` since currently the default role for it is `raw` and -we would like the dummified variables to automatically be of role `predictor`. +we would like the dummified variables to automatically be `predictor`'s. ```{r} case_death_counts_subset <- case_death_counts_subset %>% - mutate(geo_value_factor = as.factor(geo_value)) %>% - as_epi_df() + mutate(geo_value_factor = as.factor(geo_value)) %>% + as_epi_df() epi_recipe(case_death_counts_subset) - + r <- epi_recipe(case_death_counts_subset) %>% - add_role(geo_value_factor, new_role = "predictor") %>% - step_dummy(geo_value_factor) %>% - step_mutate(cases = case_when(cases < 0 ~ 0, TRUE ~ cases)) %>% - ## NOTE: why are there negative numbers in here? - step_mutate(deaths = case_when(deaths < 0 ~ 0, TRUE ~ deaths)) %>% - step_epi_lag(cases, lag = c(0, 7)) %>% - step_epi_lag(deaths, lag = c(0, 7)) %>% - step_epi_ahead(deaths, ahead = 7, role = "outcome") %>% - step_epi_naomit() + add_role(geo_value_factor, new_role = "predictor") %>% + step_dummy(geo_value_factor) %>% + ## Occasionally, data reporting errors / corrections result in negative + ## cases / deaths + step_mutate(cases = pmax(cases, 0), deaths = pmax(deaths, 0)) %>% + step_epi_lag(cases, deaths, lag = c(0, 7)) %>% + step_epi_ahead(deaths, ahead = 7, role = "outcome") %>% + step_epi_naomit() ``` After specifying the preprocessing steps, we will use the `parsnip` package for -modeling and getting the prediction for the 7-day ahead death counts of the +modeling and getting the prediction for death count, 7 days after the latest available date in the dataset. ```{r} @@ -174,9 +181,14 @@ wf <- epi_workflow(r, parsnip::poisson_reg()) %>% predict(wf, latest) %>% filter(!is.na(.pred)) ``` +Note that the `time_value` corresponds to the last available date in the +training set, **NOT** to the target date of the forecast +(`r max(latest$time_value) + 7`). + + Let's take a look at the fit: -```{r, echo =FALSE} -wf$fit$fit +```{r} +extract_fit_engine(wf) ``` Up to now, we've used the Poisson regression to model count data. Poisson @@ -208,9 +220,10 @@ regression, which we will illustrate in the next section. ## Linear Regression -For COVID-19, the CDC requires submission of case and death count predictions. -However, it is often the case that rate data is easier to model as the outcome -as it need not be an integer. We can use a liner regression to predict the death +For COVID-19, the CDC required submission of case and death count predictions. +However, the Delphi Group preferred to train on rate data instead, because it +puts different locations on a similar scale. +We can use a liner regression to predict the death rates and use state population data to scale the rates to counts. We will do so using `layer_population_scaling()` from the `epipredict` package. @@ -226,106 +239,111 @@ from [COVID-19 Trends and Impact Survey](https://cmu-delphi.github.io/delphi-epi The survey data shows the estimated percentage of people who wore a mask for most or all of the time while in public in the past 7 days and the estimated -percentage of respondents who reported that all or most people they enountered -in public in the past 7 days maintained a distance of at least 6 feet. We will -convert these percentages into indicator variables as follows: +percentage of respondents who reported that all or most people they encountered +in public in the past 7 days maintained a distance of at least 6 feet. -State-wise population data is obtained from `covidcast::state_census` and will -be used in `layer_population_scaling()`. +State-wise population data from the 2019 U.S. Census is included in this package +and will be used in `layer_population_scaling()`. ```{r} -## NOTE: need to put in package? -behav_ind_mask <- covidcast(data_source = "fb-survey", - signals = "smoothed_wwearing_mask_7d", - time_type = "day", - geo_type = "state", - time_values = epirange(20210604, 20211231), - geo_values = "ca,fl,tx,ny,nj" - ) %>% - fetch_tbl() %>% - select(geo_value, time_value, percentage = value) %>% - mutate(mask_indicator = case_when(percentage < 50 ~ 0, - TRUE ~ 1)) %>% - select(geo_value, time_value, mask_indicator) - -behav_ind_distancing <- covidcast(data_source = "fb-survey", - signals = "smoothed_wothers_distanced_public", - time_type = "day", - geo_type = "state", - time_values = epirange(20210604, 20211231), - geo_values = "ca,fl,tx,ny,nj" - ) %>% - fetch_tbl() %>% - select(geo_value, time_value, percentage = value) %>% - mutate(distancing_indicator = case_when(percentage < 20 ~ 0, - TRUE ~ 1)) %>% - select(geo_value, time_value, distancing_indicator) - -pop_dat <- covidcast::state_census %>% - select(POPESTIMATE2019, ABBR) %>% - mutate(ABBR = tolower(ABBR)) %>% - filter(ABBR %in% c("ca","fl","tx","ny","nj")) %>% - rename(value = POPESTIMATE2019, geo_value = ABBR) +behav_ind_mask <- covidcast( + data_source = "fb-survey", + signals = "smoothed_wwearing_mask_7d", + time_type = "day", + geo_type = "state", + time_values = epirange(20210604, 20211231), + geo_values = "ca,fl,tx,ny,nj") %>% + fetch_tbl() %>% + select(geo_value, time_value, masking = value) + +behav_ind_distancing <- covidcast( + data_source = "fb-survey", + signals = "smoothed_wothers_distanced_public", + time_type = "day", + geo_type = "state", + time_values = epirange(20210604, 20211231), + geo_values = "ca,fl,tx,ny,nj") %>% + fetch_tbl() %>% + select(geo_value, time_value, distancing = value) + +pop_dat <- state_census %>% select(abbr, pop) behav_ind <- behav_ind_mask %>% full_join(behav_ind_distancing, by = c("geo_value", "time_value")) +``` -dim(behav_ind) +Rather than using raw mask-wearing / social-distancing metrics, for the sake +of illustration, we'll convert both into categorical predictors. + +```{r, echo=FALSE, message=FALSE,fig.align='center', fig.width=6, fig.height=4} +library(ggplot2) +behav_ind %>% + pivot_longer(masking:distancing) %>% + ggplot(aes(value, fill = geo_value)) + + geom_density(alpha = 0.5) + + scale_fill_brewer(palette = "Set1", name = "") + + theme_bw() + + scale_x_continuous(expand = c(0,0)) + + scale_y_continuous(expand = expansion(c(0,.05))) + + facet_wrap(~name, scales = "free") + + theme(legend.position = "bottom") ``` -We will take a subset of death rate and case rate data from our built-in dataset +We will take a subset of death rate and case rate data from the built-in dataset `case_death_rate_subset`. + ```{r} jhu <- case_death_rate_subset %>% - dplyr::filter(time_value >= "2021-06-04", - time_value <= "2021-12-31", - geo_value %in% c("ca","fl","tx","ny","nj")) - -dim(jhu) + dplyr::filter( + time_value >= "2021-06-04", + time_value <= "2021-12-31", + geo_value %in% c("ca","fl","tx","ny","nj")) ``` Preprocessing steps will rely on functions from the `epipredict` package as well as the `recipes` package: Additionally, there are also many functions in the `recipes` package that allow for -[function transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations), +[scalar transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations), such as log transformations and data centering. In our case, we will center the numerical predictors to allow for meaningful interpretation of the intercept. ```{r} jhu <- jhu %>% - mutate(geo_value_factor = as.factor(geo_value)) %>% - full_join(behav_ind, by = c("geo_value", "time_value")) %>% - as_epi_df() + mutate(geo_value_factor = as.factor(geo_value)) %>% + left_join(behav_ind, by = c("geo_value", "time_value")) %>% + as_epi_df() r <- epi_recipe(jhu) %>% - add_role(geo_value_factor, new_role = "predictor") %>% - step_dummy(geo_value_factor) %>% - step_epi_lag(case_rate, lag = c(0, 7)) %>% - step_epi_lag(death_rate, lag = c(0, 7)) %>% - step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>% - step_center(contains("lag"), role = "predictor") %>% - step_epi_naomit() + add_role(geo_value_factor, new_role = "predictor") %>% + step_dummy(geo_value_factor) %>% + step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>% + step_mutate(masking = cut_number(masking, 5), + distancing = cut_number(distancing, 5)) %>% + step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>% + step_center(contains("lag"), role = "predictor") %>% + step_epi_naomit() ``` -As a sanity check we can see what columns exist in the preprocessed data now: -```{r, warning = FALSE, echo = FALSE} -glimpse(slice_sample(bake(prep(r, jhu),jhu), n = 6)) +As a sanity check we can see what the training data will look like +```{r, warning = FALSE} +glimpse(slice_sample(bake(prep(r, jhu), jhu), n = 6)) ``` Before directly predicting the results, we need to add postprocessing layers to -obtain the death counts instead of death rates. +obtain the death counts instead of death rates. Note that the rates used so +far are "per 100K people" rather than "per person". ```{r} f <- frosting() %>% - layer_predict() %>% - layer_add_target_date("2022-01-07") %>% - layer_threshold(.pred, lower = 0) %>% - layer_residual_quantiles(probs = c(0.0275, 0.975), symmetrize = FALSE) %>% - layer_naomit(.pred) %>% - layer_population_scaling(.pred, .pred_distn, df = pop_dat, - by = c("geo_value"), - df_pop_col = "value") + layer_predict() %>% + layer_add_target_date("2022-01-07") %>% + layer_threshold(.pred, lower = 0) %>% + layer_residual_quantiles(probs = c(0.05, 0.95), symmetrize = FALSE) %>% + layer_naomit(.pred) %>% + layer_population_scaling( + .pred, .pred_distn, df = pop_dat, rate_rescaling = 1e5, + by = c("geo_value" = "abbr"), df_pop_col = "pop") wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) %>% @@ -336,36 +354,35 @@ p <- predict(wf, latest) p ``` +The columns marked `*_scaled` have been rescaled to the correct units, in this +case `deaths` rather than deaths per 100K people (these remain in `.pred`). + To look at the prediction intervals: ```{r} -# method 1 -nested_quantiles(p$.pred_distn_original) -# method 2 -dstn = p$.pred_distn_original -q = distributional::parameters(dstn)$q # a list of length nrow(p) -q -tau = distributional::parameters(dstn)$tau -tau +p %>% + select(geo_value, target_date, .pred_scaled, .pred_distn_scaled) %>% + mutate(.pred_distn_scaled = nested_quantiles(.pred_distn_scaled)) %>% + unnest(.pred_distn_scaled) %>% + pivot_wider(names_from = tau, values_from = q) ``` -The point estimate of death counts on Jan 7, 2022 is returned in column -`.pred_original`. The predictions based on residual quantiles is stored in -`.pred_distn_original` and also scaled to count data. Last but not least, let's take a look at the regression fit and check the coefficients: ```{r, echo =FALSE} -wf$fit$fit +summary(extract_fit_engine(wf)) ``` ## Classification -Sometimes predictive model for future trends are of interest. In other words, -the target is to predict if the future will have increasing case rates(up), -decreasing case rates (down), or flat case rates (flat). Such model are often +Sometimes it is preferable to create a predictive model for surges or upswings +rather than for raw values. In this case, +the target is to predict if the future will have increased case rates(up), +decreased case rates (down), or flat case rates (flat) relative to the current +level. Such model are often referred to as hotspot prediction models. We will refer to the model notations -from McDonald et al. but extend the application to predict three categories -instead of two. +from [McDonald, Bien, Green, Hu, et al.](#references) but extend the application +to predict three categories instead of two. Hotspot prediction predicts a categorical variable defined in terms of the relative change of $Y_{\ell, t+a}$ compared to $Y_{\ell, t}$. @@ -384,7 +401,7 @@ $$ where $Y^{\Delta}_{\ell, t} = (Y_{\ell, t}- Y_{\ell, t-7})/(Y_{\ell, t-7})$. We say location $\ell$ is a hotspot at time $t$ when $Z_{\ell,t}$ is categorized as 'up', meaning the number of newly reported cases over the past 7 days has -increased by at least 25% compared to the preceding week. w\When $Z_{\ell,t}$ +increased by at least 25% compared to the preceding week. When $Z_{\ell,t}$ is categorized as 'down', it suggests that there has been at least a 25% decrease in newly reported cases over the past 7 days. Otherwise, we will consider the trends to be flat. @@ -394,22 +411,22 @@ The expression of the multinomial regression we will model is as follows: $$ \pi_{j}(x) = \text{Pr}(Y = j|x) = \frac{e^{g_j(x)}}{\sum_{k=0}^2 g_j(x) } $$ -where $j$ is either down(Y=0), flat(Y=1), or up (Y=2), +where $j$ is either down (`Y=0`), flat (`Y=1`), or up (`Y=2`), $$g_0(x) = 0$$ \begin{aligned} -g_1(x)= \text{ln}(\frac{Pr(Y=1|x)}{Pr(Y=0|x)}) = +g_1(x)= \text{ln}\left(\frac{Pr(Y=1|x)}{Pr(Y=0|x)}\right) &= \beta_{10} + \beta_{11}* x_{\text{time_value}} + \delta_{10} s_{\text{state_1}} + \delta_{11} s_{\text{state_2}} + ... \nonumber \\ -+ \beta_{12} Y^{\Delta}_{\ell, t} + +&\quad + \beta_{12} Y^{\Delta}_{\ell, t} + \beta_{13} Y^{\Delta}_{\ell, t-7} \end{aligned} \begin{aligned} -g_2(x)= \text{ln}(\frac{Pr(Y=2|x)}{Pr(Y=0|x)}) = +g_2(x)= \text{ln}\left(\frac{Pr(Y=2|x)}{Pr(Y=0|x)}\right) &= \beta_{20} + \beta_{21}* x_{\text{time_value}} + \delta_{20} s_{\text{state_1}} + \delta_{21} s_{\text{state_2}} + ... \nonumber \\ -+ \beta_{22} Y^{\Delta}_{\ell, t} + +&\quad + \beta_{22} Y^{\Delta}_{\ell, t} + \beta_{23} Y^{\Delta}_{\ell, t-7} \end{aligned} @@ -429,26 +446,29 @@ jhu <- case_death_rate_subset %>% as_epi_df() r <- epi_recipe(jhu) %>% - add_role(time_value, new_role = "predictor") %>% - step_dummy(geo_value_factor) %>% - step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% - step_epi_ahead(case_rate, ahead = 7, role = "predictor") %>% - step_mutate(pct_diff1 = case_when(lag_7_case_rate == 0 ~ 0, - TRUE ~ (lag_0_case_rate - lag_7_case_rate)/lag_7_case_rate) ) %>% - step_mutate(pct_diff2 = case_when(lag_14_case_rate == 0 ~ 0, - TRUE ~ (lag_7_case_rate - lag_14_case_rate)/lag_14_case_rate)) %>% - step_mutate(trend = case_when(pct_diff1 < -0.25 ~ "down", - pct_diff1 > 0.25 ~ "up", - TRUE ~ "flat"), - role = "outcome") %>% - step_rm(death_rate, - case_rate, - lag_0_case_rate, - lag_7_case_rate, - lag_14_case_rate, - ahead_7_case_rate) %>% - step_epi_naomit() - + add_role(time_value, new_role = "predictor") %>% + step_dummy(geo_value_factor) %>% + step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% + step_epi_ahead(case_rate, ahead = 7, role = "predictor") %>% + step_mutate( + pct_diff_ahead = case_when( + lag_7_case_rate == 0 ~ 0, + TRUE ~ (ahead_7_case_rate - lag_0_case_rate) / lag_0_case_rate), + pct_diff_wk1 = case_when( + lag_7_case_rate == 0 ~ 0, + TRUE ~ (lag_0_case_rate - lag_7_case_rate) / lag_7_case_rate), + pct_diff_wk2 = case_when( + lag_14_case_rate == 0 ~ 0, + TRUE ~ (lag_7_case_rate - lag_14_case_rate) / lag_14_case_rate)) %>% + step_mutate( + response = case_when( + pct_diff_ahead < -0.20 ~ "down", + pct_diff_ahead > 0.25 ~ "up", + TRUE ~ "flat"), + role = "outcome") %>% + step_rm(death_rate, case_rate, lag_0_case_rate, lag_7_case_rate, + lag_14_case_rate, ahead_7_case_rate, pct_diff_ahead) %>% + step_epi_naomit() ``` We will fit the multinomial regression and take a look at the predictions: @@ -463,22 +483,21 @@ predict(wf, latest) %>% filter(!is.na(.pred_class)) We can also take a look at the fit: ```{r} -wf$fit$fit +extract_fit_engine(wf) ``` -One could also use formula in `epi_recipe()` to achieve the same results as +One could also use a formula in `epi_recipe()` to achieve the same results as above. However, only one of `add_formula()`, `add_recipe()`, or -`workflow_variables()` must be specified. For the purpose of using `add_formula` -over `add_recipes`, we will prep and bake our recipe to return a dataset that -will be used for model fitting. +`workflow_variables()` must be specified. For the purpose of demonstrating +`add_formula` rather than `add_recipe`, we will `prep` and `bake` our recipe to +return a `data.frame` that could be used for model fitting. ```{r} -b <- bake(prep(r,jhu),jhu) -m_reg <- parsnip::multinom_reg() +b <- bake(prep(r, jhu), jhu) epi_workflow() %>% -add_formula(trend ~ time_value + geo_value + pct_diff1 + pct_diff2) %>% -add_model(m_reg) %>% -fit(data = b) + add_formula(response ~ geo_value + pct_diff_wk1 + pct_diff_wk2) %>% + add_model(parsnip::multinom_reg()) %>% + fit(data = b) ``` ## Benefits of Lagging and Leading in `epipredict` @@ -491,45 +510,45 @@ Let's start with a simple dataset and preprocessing: ex <- case_death_rate_subset %>% dplyr::filter(time_value >= "2021-12-01", time_value <= "2021-12-31", - geo_value == "ca") + geo_value == "ca") dim(ex) ``` -We want to predict death rates on 2022-01-07, which is 7 days ahead from the last -available date in our dataset. +We want to predict death rates on `2022-01-07`, which is 7 days ahead from the +latest available date in our dataset. We will compare two methods of trying to create lags and leads: ```{r} -r1 <- epi_recipe(ex) %>% - step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% - step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% - step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>% - step_epi_naomit() %>% - prep() - -b1 <- bake(r1, ex) -dim(b1) - - -r2 <- epi_recipe(ex) %>% - step_mutate(lag0case_rate = lag(case_rate, 0), - lag7case_rate = lag(case_rate, 7), - lag14case_rate = lag(case_rate, 14), - lag0death_rate = lag(death_rate, 0), - lag7death_rate = lag(death_rate, 7), - lag14death_rate = lag(death_rate, 14), - ahead7death_rate = lead(death_rate, 7)) %>% - step_epi_naomit() %>% - prep() - -b2 <- bake(r2, ex) -dim(b2) +p1 <- epi_recipe(ex) %>% + step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% + step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% + step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>% + step_epi_naomit() %>% + prep() + +b1 <- bake(p1, ex) +b1 + + +p2 <- epi_recipe(ex) %>% + step_mutate(lag0case_rate = lag(case_rate, 0), + lag7case_rate = lag(case_rate, 7), + lag14case_rate = lag(case_rate, 14), + lag0death_rate = lag(death_rate, 0), + lag7death_rate = lag(death_rate, 7), + lag14death_rate = lag(death_rate, 14), + ahead7death_rate = lead(death_rate, 7)) %>% + step_epi_naomit() %>% + prep() + +b2 <- bake(p2, ex) +b2 ``` Notice the difference in number of rows `b1` and `b2` returns. This is because -the recipe that doesn't use `step_epi_ahead` and `step_epi_lag` is missing dates -compared to the one that used the `epipredict` functions. +the second version, the one that doesn't use `step_epi_ahead` and `step_epi_lag`, +has omitted dates compared to the one that used the `epipredict` functions. ```{r} dates_used_in_training1 <- b1 %>% select(- ahead_7_death_rate) %>% na.omit() %>% select(time_value) @@ -548,11 +567,10 @@ instead of 7 days ahead from `r max(dates_used_in_training1$time_value)`. ## References -::: {#refs} -McDonald, Daniel J., et al. "Can auxiliary indicators improve COVID-19 +McDonald, Bien, Green, Hu, et al. "Can auxiliary indicators improve COVID-19 forecasting and hotspot prediction?." Proceedings of the National Academy of -Sciences 118.51 (2021): e2111453118. -::: +Sciences 118.51 (2021): e2111453118. [doi:10.1073/pnas.2111453118]( +https://doi.org/10.1073/pnas.2111453118) ## Attribution From ed778398c308950b14a9aae6b9072299a648345a Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 13 Sep 2022 17:15:46 -0700 Subject: [PATCH 28/29] re document --- man/layer_population_scaling.Rd | 57 +++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/man/layer_population_scaling.Rd b/man/layer_population_scaling.Rd index 9d59e3f08..8ac350870 100644 --- a/man/layer_population_scaling.Rd +++ b/man/layer_population_scaling.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/layer_population_scaling.R \name{layer_population_scaling} \alias{layer_population_scaling} -\title{Revert population scaled prediction} +\title{Convert per-capita predictions to raw scale} \usage{ layer_population_scaling( frosting, @@ -10,33 +10,36 @@ layer_population_scaling( df, by = NULL, df_pop_col, + rate_rescaling = 1, create_new = TRUE, - suffix = "_original", + suffix = "_scaled", .flag = TRUE, id = rand_id("population_scaling") ) } \arguments{ \item{frosting}{a \code{frosting} postprocessor. The layer will be added to the -sequence of operations for this frosting.} +sequence of operations for this frosting.} \item{...}{One or more selector functions to scale variables for this step. See \code{\link[=selections]{selections()}} for more details.} -\item{df}{a data frame that contains the population data used for scaling.} +\item{df}{a data frame that contains the population data to be used for +inverting the existing scaling.} -\item{by}{A character vector of variables to left join by. +\item{by}{A (possibly named) character vector of variables to join by. If \code{NULL}, the default, the function will perform a natural join, using all -variables in common across the \code{epi_df} and the user-provided dataset. -If columns in \code{epi_df} and \code{df} have the same name (and aren't -included in by), \code{.df} is added to the one from the user-provided data +variables in common across the \code{epi_df} produced by the \code{predict()} call +and the user-provided dataset. +If columns in that \code{epi_df} and \code{df} have the same name (and aren't +included in \code{by}), \code{.df} is added to the one from the user-provided data to disambiguate. To join by different variables on the \code{epi_df} and \code{df}, use a named vector. -For example, by = c("geo_value" = "states") will match \code{epi_df$geo_value} +For example, \code{by = c("geo_value" = "states")} will match \code{epi_df$geo_value} to \code{df$states}. To join by multiple variables, use a vector with length > 1. -For example, by = c("geo_value" = "states", "county" = "county") will match +For example, \code{by = c("geo_value" = "states", "county" = "county")} will match \code{epi_df$geo_value} to \code{df$states} and \code{epi_df$county} to \code{df$county}. See \code{\link[dplyr:mutate-joins]{dplyr::left_join()}} for more details.} @@ -44,6 +47,10 @@ See \code{\link[dplyr:mutate-joins]{dplyr::left_join()}} for more details.} \item{df_pop_col}{the name of the column in the data frame \code{df} that contains the population data and used for scaling.} +\item{rate_rescaling}{Sometimes rates are "per 100K" or "per 1M" rather than +"per person". Adjustments can be made here. For example, if the original +rate is "per 100K", then set \code{rate_rescaling = 1e5} to get counts back.} + \item{create_new}{TRUE to create a new column and keep the original column in the \code{epi_df}.} @@ -60,12 +67,15 @@ an updated \code{frosting} postprocessor } \description{ \code{layer_population_scaling} creates a specification of a frosting layer -that will add a population scaled column in the data. For example, -load a dataset that contains county population, and join to an \code{epi_df} -that currently predicts number of new cases by county to obtain case rates. -Although worth noting that there is nothing special about "population". -The function can be used to scale by any variable. Population is simply the -most natural and common use case. +that will "undo" per-capita scaling. Typical usage would +load a dataset that contains state-level population, and use it to convert +predictions made from a rate-scale model to raw scale by multiplying by +the population. +Although, it is worth noting that there is nothing special about "population". +The function can be used to scale by any variable. Population is the +standard use case in the epidemiology forecasting scenario. Any value +passed will \emph{multiply} the selected variables while the \code{rate_rescaling} +argument is a common \emph{divisor} of the selected variables. } \examples{ library(epiprocess) @@ -73,8 +83,7 @@ jhu <- epiprocess::jhu_csse_daily_subset \%>\% dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\% dplyr::select(geo_value, time_value, cases) -pop_data = data.frame(states = c("ca", "ny"), - value = c(20000, 30000)) +pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000)) r <- epi_recipe(jhu) \%>\% step_population_scaling(df = pop_data, @@ -98,12 +107,12 @@ wf <- epi_workflow(r, parsnip::fit(jhu) \%>\% add_frosting(f) -latest <- get_test_data(recipe = r, - x = epiprocess::jhu_csse_daily_subset \%>\% - dplyr::filter(time_value > "2021-11-01", - geo_value \%in\% c("ca", "ny")) \%>\% - dplyr::select(geo_value, time_value, cases)) - +latest <- get_test_data( + recipe = r, + x = epiprocess::jhu_csse_daily_subset \%>\% + dplyr::filter(time_value > "2021-11-01", + geo_value \%in\% c("ca", "ny")) \%>\% + dplyr::select(geo_value, time_value, cases)) predict(wf, latest) } From 7be145c7641e769f075b9898c13bf788b1c615d1 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 13 Sep 2022 17:35:13 -0700 Subject: [PATCH 29/29] get tests to pass --- R/layer_population_scaling.R | 2 +- R/utils_arg.R | 2 +- tests/testthat/test-population_scaling.R | 22 ++++++++++------------ vignettes/preprocessing-and-models.Rmd | 1 - 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R index 29a5cd14a..b5bc57ee3 100644 --- a/R/layer_population_scaling.R +++ b/R/layer_population_scaling.R @@ -143,7 +143,7 @@ slather.layer_population_scaling <- by= object$by), silent = TRUE) if (any(grepl("Join columns must be present in data", unlist(try_join)))) { - cli_stop(c("columns in `by` selectors of `layer_population_scaling`", + cli_stop(c("columns in `by` selectors of `layer_population_scaling` ", "must be present in data and match"))} object$df <- object$df %>% diff --git a/R/utils_arg.R b/R/utils_arg.R index ebac387d5..6f762382a 100644 --- a/R/utils_arg.R +++ b/R/utils_arg.R @@ -108,7 +108,7 @@ arg_is_chr = function(..., allow_null = FALSE, allow_na = FALSE, allow_empty = F handle_arg_list( ..., tests = function(name, value) { - if (!(is.character(value) | (is.null(value) & !allow_null))) + if (!(is.character(value) | (is.null(value) & allow_null))) cli_stop("Argument {.val {name}} must be of character type.") if (any(is.na(value)) & !allow_na) diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R index 4a360cd71..5b805b33b 100644 --- a/tests/testthat/test-population_scaling.R +++ b/tests/testthat/test-population_scaling.R @@ -115,7 +115,7 @@ test_that("Postprocessing workflow works and values correct", { expect_silent(p <- predict(wf, latest)) expect_equal(nrow(p), 2L) expect_equal(ncol(p), 4L) - expect_equal(p$.pred_original, p$.pred * c(20000, 30000)) + expect_equal(p$.pred_scaled, p$.pred * c(20000, 30000)) f <- frosting() %>% layer_predict() %>% @@ -130,7 +130,7 @@ test_that("Postprocessing workflow works and values correct", { expect_silent(p <- predict(wf, latest)) expect_equal(nrow(p), 2L) expect_equal(ncol(p), 4L) - expect_equal(p$.pred_original, p$.pred * c(2, 3)) + expect_equal(p$.pred_scaled, p$.pred * c(2, 3)) }) @@ -160,8 +160,7 @@ test_that("Postprocessing to get cases from case rate", { by = c("geo_value" = "states"), df_pop_col = "value") - wf <- epi_workflow(r, - parsnip::linear_reg()) %>% + wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) %>% add_frosting(f) @@ -175,7 +174,7 @@ test_that("Postprocessing to get cases from case rate", { expect_silent(p <- predict(wf, latest)) expect_equal(nrow(p), 2L) expect_equal(ncol(p), 4L) - expect_equal(p$.pred_original, p$.pred * c(1/20000, 1/30000)) + expect_equal(p$.pred_scaled, p$.pred * c(1/20000, 1/30000)) }) @@ -255,11 +254,11 @@ test_that("expect error if `by` selector does not match", { by = NULL, df_pop_col = "values") - expect_error(wf <- epi_workflow(r, - parsnip::linear_reg()) %>% - fit(jhu) %>% - add_frosting(f), - "columns in `by` selectors of `step_population_scaling` must be present in data and match") + expect_error( + wf <- epi_workflow(r, parsnip::linear_reg()) %>% + fit(jhu) %>% + add_frosting(f), + "columns in `by` selectors of `step_population_scaling` must be present in data and match") r <- epi_recipe(jhu) %>% step_population_scaling(case_rate, @@ -287,8 +286,7 @@ test_that("expect error if `by` selector does not match", { geo_value %in% c("ca", "ny")) %>% dplyr::select(geo_value, time_value, case_rate)) - wf <- epi_workflow(r, - parsnip::linear_reg()) %>% + wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) %>% add_frosting(f) diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd index 2209cf6ad..23e110b6c 100644 --- a/vignettes/preprocessing-and-models.Rmd +++ b/vignettes/preprocessing-and-models.Rmd @@ -36,7 +36,6 @@ we will use a linear regression to predict death rates. Last but not least, we will create a classification model for hotspot predictions. ```{r, warning=FALSE, message=FALSE} -# library(covidcast) library(tidyr) library(dplyr) library(epidatr)