From c7f7221aa5c841014495aaf952fdfef4be13b6d8 Mon Sep 17 00:00:00 2001 From: Kenneth Tay Date: Tue, 9 Mar 2021 14:55:46 -0800 Subject: [PATCH 01/36] Add documentation examples. Examples added for slide_by_geo(), pct_change() and estimate_deriv(). --- R-packages/modeltools/R/change.R | 13 ++++++++++++ R-packages/modeltools/R/deriv.R | 20 +++++++++++++++++++ R-packages/modeltools/R/slide.R | 11 +++++++++++ R-packages/modeltools/man/estimate_deriv.Rd | 22 +++++++++++++++++++++ R-packages/modeltools/man/pct_change.Rd | 15 ++++++++++++++ R-packages/modeltools/man/slide_by_geo.Rd | 13 ++++++++++++ 6 files changed, 94 insertions(+) diff --git a/R-packages/modeltools/R/change.R b/R-packages/modeltools/R/change.R index 7ad0e4e0..47990266 100644 --- a/R-packages/modeltools/R/change.R +++ b/R-packages/modeltools/R/change.R @@ -19,6 +19,19 @@ #' @return A data frame given by appending a new column to `x` named according #' to the `col_name` argument, containing the percentage change values. #' +#' @examples \dontrun{ +#' df <- covidcast::covidcast_signal("fb-survey", "smoothed_cli", +#' start_day = "2021-01-01", +#' end_day = "2021-01-31", +#' geo_type = "state") +#' +#' # percentage change between back-to-back weeks (default, as n = 14) +#' pct_change(df) +#' +#' # percentage change between back-to-back days +#' pct_change(df, n = 2) +#' } +#' #' @export pct_change = function(x, n = 14, col_name = "pct_change") { # Check if n is odd and if so bump it up one diff --git a/R-packages/modeltools/R/deriv.R b/R-packages/modeltools/R/deriv.R index 73fdbbc6..3d927f01 100644 --- a/R-packages/modeltools/R/deriv.R +++ b/R-packages/modeltools/R/deriv.R @@ -74,6 +74,26 @@ #' #' @return A data frame given by appending a new column to `x` named according #' to the `col_name` argument, containing the estimated derivative values. +#' +#' @examples \dontrun{ +#' df <- covidcast::covidcast_signal("fb-survey", "smoothed_cli", +#' start_day = "2021-01-01", +#' end_day = "2021-02-28", +#' geo_type = "state", +#' geo_values = c("ca", "fl")) +#' +#' # estimate derivative using linear regression and n = 10 days +#' estimate_deriv(df, method = "lin", n = 10) +#' +#' # keep the linear regression fits +#' estimate_deriv(df, method = "lin", n = 10, keep_obj = TRUE) +#' +#' # estimate derivative using smoothing spline with 8 degrees of freedom +#' estimate_deriv(df, method = "ss", n = 28, df = 8) +#' +#' # estimate derivative using trend filtering with 8 degrees of freedom +#' estimate_deriv(df, method = "tf", n = 28, df = 8) +#' } #' #' @export estimate_deriv = function(x, method = c("lin", "ss", "tf"), n = 14, diff --git a/R-packages/modeltools/R/slide.R b/R-packages/modeltools/R/slide.R index ecd48dd4..e3420e6e 100644 --- a/R-packages/modeltools/R/slide.R +++ b/R-packages/modeltools/R/slide.R @@ -29,6 +29,17 @@ #' #' @return A data frame given by appending a new column to `x` named according #' to the `col_name` argument, containing the function values. +#' +#' @examples \dontrun{ +#' df <- covidcast::covidcast_signal("fb-survey", "smoothed_cli", +#' start_day = "2021-01-01", +#' end_day = "2021-01-31", +#' geo_type = "state") +#' +#' # two equivalent ways to compute 7-day trailing averages +#' slide_by_geo(df, slide_fun = ~ Mean(.x$value), n = 7) +#' slide_by_geo(df, slide_fun = function(x, ...) Mean(x$value) , n = 7) +#' } #' #' @importFrom dplyr %>% arrange group_by group_modify mutate ungroup #' @importFrom lubridate days diff --git a/R-packages/modeltools/man/estimate_deriv.Rd b/R-packages/modeltools/man/estimate_deriv.Rd index 635ca542..cb9320d0 100644 --- a/R-packages/modeltools/man/estimate_deriv.Rd +++ b/R-packages/modeltools/man/estimate_deriv.Rd @@ -97,3 +97,25 @@ based on the cross-validation error curve (minimum or 1-standard-error rule, respectively). Default is 8 when \code{cv = FALSE}, and "1se" when \code{cv = TRUE}.} } } +\examples{ +\dontrun{ +df <- covidcast::covidcast_signal("fb-survey", "smoothed_cli", + start_day = "2021-01-01", + end_day = "2021-02-28", + geo_type = "state", + geo_values = c("ca", "fl")) + +# estimate derivative using linear regression and n = 10 days +estimate_deriv(df, method = "lin", n = 10) + +# keep the linear regression fits +estimate_deriv(df, method = "lin", n = 10, keep_obj = TRUE) + +# estimate derivative using smoothing spline with 8 degrees of freedom +estimate_deriv(df, method = "ss", n = 28, df = 8) + +# estimate derivative using trend filtering with 8 degrees of freedom +estimate_deriv(df, method = "tf", n = 28, df = 8) +} + +} diff --git a/R-packages/modeltools/man/pct_change.Rd b/R-packages/modeltools/man/pct_change.Rd index 0163c02c..f22a06e4 100644 --- a/R-packages/modeltools/man/pct_change.Rd +++ b/R-packages/modeltools/man/pct_change.Rd @@ -29,3 +29,18 @@ frame. (When multiple issue dates are present, only the latest issue is considered.) See the \href{https://cmu-delphi.github.io/covidcast/modeltoolsR/articles/pct-change.html}{percentage change vignette} for examples. } +\examples{ +\dontrun{ +df <- covidcast::covidcast_signal("fb-survey", "smoothed_cli", + start_day = "2021-01-01", + end_day = "2021-01-31", + geo_type = "state") + +# percentage change between back-to-back weeks (default, as n = 14) +pct_change(df) + +# percentage change between back-to-back days +pct_change(df, n = 2) +} + +} diff --git a/R-packages/modeltools/man/slide_by_geo.Rd b/R-packages/modeltools/man/slide_by_geo.Rd index 33f2bdb3..8657baeb 100644 --- a/R-packages/modeltools/man/slide_by_geo.Rd +++ b/R-packages/modeltools/man/slide_by_geo.Rd @@ -49,3 +49,16 @@ grouped by \code{geo_value}. (When multiple issue dates are present, only the latest issue is considered.) See the \href{https://cmu-delphi.github.io/covidcast/modeltoolsR/articles/modeltools.html}{getting started guide} for examples. } +\examples{ +\dontrun{ +df <- covidcast::covidcast_signal("fb-survey", "smoothed_cli", + start_day = "2021-01-01", + end_day = "2021-01-31", + geo_type = "state") + +# two equivalent ways to compute 7-day trailing averages +slide_by_geo(df, slide_fun = ~ Mean(.x$value), n = 7) +slide_by_geo(df, slide_fun = function(x, ...) Mean(x$value) , n = 7) +} + +} From 1d2f1d40aeb470856370d59ab84863a8b91a3518 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Tue, 16 Mar 2021 19:58:23 -0400 Subject: [PATCH 02/36] function for testing and training matrices --- R-packages/modeltools/NAMESPACE | 2 + R-packages/modeltools/R/matrix.R | 75 +++++++++++++++++++ .../man/create_train_and_predict_matrices.Rd | 46 ++++++++++++ .../modeltools/tests/testthat/test-matrix.R | 44 +++++++++++ 4 files changed, 167 insertions(+) create mode 100644 R-packages/modeltools/R/matrix.R create mode 100644 R-packages/modeltools/man/create_train_and_predict_matrices.Rd create mode 100644 R-packages/modeltools/tests/testthat/test-matrix.R diff --git a/R-packages/modeltools/NAMESPACE b/R-packages/modeltools/NAMESPACE index 8bbd90e3..9d353a1c 100644 --- a/R-packages/modeltools/NAMESPACE +++ b/R-packages/modeltools/NAMESPACE @@ -7,6 +7,7 @@ export(Median) export(Min) export(Start) export(Sum) +export(create_train_and_predict_matrices) export(estimate_deriv) export(pct_change) export(quantgen_forecaster) @@ -32,5 +33,6 @@ importFrom(stats,coef) importFrom(stats,lsfit) importFrom(stats,predict) importFrom(stats,smooth.spline) +importFrom(tibble,tibble) importFrom(tidyr,drop_na) importFrom(tidyr,pivot_longer) diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R new file mode 100644 index 00000000..25f95afd --- /dev/null +++ b/R-packages/modeltools/R/matrix.R @@ -0,0 +1,75 @@ +#' Create a training and prediction matrices for a given ahead. +#' +#' @param lagged_df Data frame of lagged data. It should have the following columns: +#' \itemize{ +#' \item{`geo_value`}{Strings of geographic locations.} +#' \item{`time_value`}{Dates of training data.} +#' \item{Covariate columns}{Columns with names of the form `value-{days}:{signal}` whose values +#' correspond to `{signal}` `{days}` before `time_value`} +#' \item{Response columns}{Columns with names of the form `value+{n}:{response}` whose values +#' correspond to `{response}` `{n}` incidence period units after `time_value`. Since +#' columns beginning with "value+" are interpretted as the response rather than a +#' covariate, observed data from the same day as `time_value` should be in columns +#' `value-0:{signal}` rather than `value+0:{signal}`} +#' } +#' A data frame in this format can be made using `covidcast::aggregate_signals()`. +#' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast +#' @param training_window_size Size of the local training window in days to use. For example, if +#' `training_window_size = 14`, then to make a 1-day-ahead forecast on December 15, we train on +#' data from December 1 to December 14. +#' +#' @return Named list with entries: +#' \itemize{ +#' \item{`train_x`}{Matrix of training data whose columns correspond to the +#' `value-{days}:{signal}` columns in `lagged_df`. The training data consists of the +#' latest date `d` such that there is an observed response at time `d + ahead` and all data +#' from the `training_window_size` days prior to it.} +#' \item{`train_y`}{Vector of response data from the `value+{ahead}:{response}` column of +#' `lagged_df` corresponding to the rows of `train_x`.} +#' \item{`predict_x`}{Matrix of prediction data in the same format as `train_x`. The +#' prediction data contains the most recent `training_window_size` days.} +#' \item{`predict_geo_values`}{Vector of `geo_values` corresponding to the rows of `predict_x`.} +#' } +#' +#' @importFrom tibble tibble +#' +#' @export +create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_size) { + out <- list() + + train_df <- lagged_df %>% + select(geo_value, time_value, tidyselect::matches("^value(\\+0|-)")) + + # Find the last possible date of training data + response_end_date <- lagged_df %>% + select(time_value, tidyselect::starts_with(sprintf("value+%i:", ahead))) %>% + tidyr::drop_na() %>% + summarize(max(time_value)) %>% + pull() + train_end_date <- min(max(lagged_df$time_value), response_end_date) + + # Training matrices + out$train_x <- train_df %>% + filter(between(time_value, + train_end_date - training_window_size + 1, + train_end_date)) %>% + select(-c(geo_value, time_value)) %>% + as.matrix() + out$train_y <- lagged_df %>% + filter(between(time_value, + train_end_date - training_window_size + 1, + train_end_date)) %>% + select(tidyselect::starts_with(sprintf("value+%i:", ahead))) %>% + pull() + + # Prediction matrices + out$predict_x <- lagged_df %>% + filter(time_value == max(time_value)) %>% + select(tidyselect::matches("^value(\\+0|-)")) %>% + as.matrix() + out$predict_geo_values <- lagged_df %>% + filter(time_value == max(time_value)) %>% + select(geo_value) %>% pull() + + return(out) +} diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd new file mode 100644 index 00000000..9a99dd6c --- /dev/null +++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrix.R +\name{create_train_and_predict_matrices} +\alias{create_train_and_predict_matrices} +\title{Create a training and prediction matrices for a given ahead.} +\usage{ +create_train_and_predict_matrices(lagged_df, ahead, training_window_size) +} +\arguments{ +\item{lagged_df}{Data frame of lagged data. It should have the following columns: +\itemize{ +\item{\code{geo_value}}{Strings of geographic locations.} +\item{\code{time_value}}{Dates of training data.} +\item{Covariate columns}{Columns with names of the form \code{value-{days}:{signal}} whose values +correspond to \code{{signal}} \code{{days}} before \code{time_value}} +\item{Response columns}{Columns with names of the form \code{value+{n}:{response}} whose values +correspond to \code{{response}} \code{{n}} incidence period units after \code{time_value}. Since +columns beginning with "value+" are interpretted as the response rather than a +covariate, observed data from the same day as \code{time_value} should be in columns +\code{value-0:{signal}} rather than \code{value+0:{signal}}} +} +A data frame in this format can be made using \code{covidcast::aggregate_signals()}.} + +\item{ahead}{Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast} + +\item{training_window_size}{Size of the local training window in days to use. For example, if +\code{training_window_size = 14}, then to make a 1-day-ahead forecast on December 15, we train on +data from December 1 to December 14.} +} +\value{ +Named list with entries: +\itemize{ +\item{\code{train_x}}{Matrix of training data whose columns correspond to the +\code{value-{days}:{signal}} columns in \code{lagged_df}. The training data consists of the +latest date \code{d} such that there is an observed response at time \code{d + ahead} and all data +from the \code{training_window_size} days prior to it.} +\item{\code{train_y}}{Vector of response data from the \code{value+{ahead}:{response}} column of +\code{lagged_df} corresponding to the rows of \code{train_x}.} +\item{\code{predict_x}}{Matrix of prediction data in the same format as \code{train_x}. The +prediction data contains the most recent \code{training_window_size} days.} +\item{\code{predict_geo_values}}{Vector of \code{geo_values} corresponding to the rows of \code{predict_x}.} +} +} +\description{ +Create a training and prediction matrices for a given ahead. +} diff --git a/R-packages/modeltools/tests/testthat/test-matrix.R b/R-packages/modeltools/tests/testthat/test-matrix.R new file mode 100644 index 00000000..bc4ced9d --- /dev/null +++ b/R-packages/modeltools/tests/testthat/test-matrix.R @@ -0,0 +1,44 @@ +library(tibble) + +test_that("training and prediction matrices are created", { + df <- tibble( + geo_value = rep(c("az", "wv"), 5), + time_value = rep( + as.Date(c("2021-01-25", "2021-01-26", "2021-01-27", "2021-01-28", "2021-01-29")), + each = 2), + `value-2:signal_1` = seq(-3, 6), + `value-2:signal_2` = seq(7, 16), + `value-1:signal_1` = seq(-1, 8), + `value-1:signal_2` = seq(9, 18), + `value-0:signal_1` = seq(1, 10), + `value-0:signal_2` = seq(11, 20), + `value+1:signal_1` = c(seq(3, 10), rep(NA, 2)), + `value+1:signal_2` = c(seq(13, 20), rep(NA, 2)), + `value+2:signal_1` = c(seq(5, 10), rep(NA, 4)), + `value+2:signal_2` = c(seq(15, 20), rep(NA, 4)) + ) + + out <- create_train_and_predict_matrices(df, 2, 1) + + expect_equal(names(out), c("train_x", "train_y", "predict_x", "predict_geo_values")) + expect_equal(out$train_x, + as.matrix(tibble( + `value-2:signal_1` = c(1, 2), + `value-2:signal_2` = c(11, 12), + `value-1:signal_1` = c(3, 4), + `value-1:signal_2` = c(13, 14), + `value-0:signal_1` = c(5, 6), + `value-0:signal_2` = c(15, 16))) + ) + expect_equal(out$train_y, c(19, 20)) + expect_equal(out$predict_x, + as.matrix(tibble( + `value-2:signal_1` = c(5, 6), + `value-2:signal_2` = c(15, 16), + `value-1:signal_1` = c(7, 8), + `value-1:signal_2` = c(17, 18), + `value-0:signal_1` = c(9, 10), + `value-0:signal_2` = c(19, 20))) + ) + expect_equal(out$predict_geo_values, c("az", "wv")) +}) From 8c9a99082367142b16e65b9f1497f1e0eb47d702 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Wed, 17 Mar 2021 15:00:47 -0400 Subject: [PATCH 03/36] test for slide --- R-packages/modeltools/tests/testthat/test-slide.R | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 R-packages/modeltools/tests/testthat/test-slide.R diff --git a/R-packages/modeltools/tests/testthat/test-slide.R b/R-packages/modeltools/tests/testthat/test-slide.R new file mode 100644 index 00000000..9040dc6b --- /dev/null +++ b/R-packages/modeltools/tests/testthat/test-slide.R @@ -0,0 +1,15 @@ +library(tibble) + +test_that("slide_by_geo works", { + df <- tibble( + geo_value = rep(c("a", "b"), each=5), + time_value = rep(seq.Date(as.Date("2021-01-01"), as.Date("2021-01-05"), "day"), 2), + value = 1:10 + ) + out <- slide_by_geo(df, ~ Sum(.x$value), n = 3, col_name = "summed_value") + expect_equal(names(out), c("geo_value", "time_value", "value", "summed_value")) + expect_equal(out$time_value, df$time_value) + expect_equal(out$geo_value, df$geo_value) + expect_equal(out$value, df$value) + expect_equal(out$summed_value, c(1, 3, 6, 9, 12, 6, 13, 21, 24, 27)) +}) From 75984ddbbbd8152e922635ea8f55ea3e570761e5 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Wed, 17 Mar 2021 16:09:42 -0400 Subject: [PATCH 04/36] add argument to slide_by_geo for shifting --- R-packages/modeltools/R/slide.R | 49 +++++++----- R-packages/modeltools/man/slide_by_geo.Rd | 16 ++-- .../modeltools/tests/testthat/test-slide.R | 76 ++++++++++++++++++- 3 files changed, 116 insertions(+), 25 deletions(-) diff --git a/R-packages/modeltools/R/slide.R b/R-packages/modeltools/R/slide.R index e3420e6e..eb5b240f 100644 --- a/R-packages/modeltools/R/slide.R +++ b/R-packages/modeltools/R/slide.R @@ -1,7 +1,7 @@ #' Slide a function over values in `covidcast_signal` data frame, grouped by #' `geo_value` #' -#' Slides a given function over the values in a `covidcast_signal` data frame, +#' Slides a given function over the values in a `covidcast_signal` data frame, #' grouped by `geo_value`. (When multiple issue dates are present, only the #' latest issue is considered.) See the [getting started #' guide](https://cmu-delphi.github.io/covidcast/modeltoolsR/articles/modeltools.html) @@ -10,15 +10,19 @@ #' @param x The `covidcast_signal` data frame under consideration. #' @param slide_fun Function or formula to slide over the values in `x`, grouped #' by `geo_value`. To "slide" means to apply the function or formula over a -#' trailing window of `n` days of data. If a function, `slide_fun` must take +#' right-aligned window of `n` days of data. If a function, `slide_fun` must take #' `x`, a data frame the same column names as the original data frame; #' followed by any number of named additional arguments; and ending with #' `...`, to capture general additional arguments. If a formula, `slide_fun` #' can operate directly on `.x$value`, `.x$time_value`, etc., as in `~ #' mean(.x$value)` to compute a trailing mean over the last `n` days of data. -#' @param n Size of the local window (in days) to use. For example, if `n = 5`, -#' then to estimate the derivative on November 5, we train the given method on -#' data in between November 1 and November 5. Default is 14. +#' @param n Size of the local window (in days) to use. Default is 14. +#' @param shift Number of days forward from the `time_value` to shift the window. Because the +#' window is right-aligned, if `n = 5` and `shift = 0`, then for a `time_value` of November 5, +#' the function will be applied to values from November 1 to November 5. A positive value shifts +#' the window forwards in time, while negative values shift the window backwards in time. Ex: +#' for a `time_value` of November 5 `shift = 3` captures November 4 through November 8; +#' `shift = -3` captures October 29 through November 2. Default is not to shift. #' @param col_name String indicating the name of the new column that will #' contain the derivative values. Default is "slide_value"; note that setting #' `col_name = "value"` will overwrite the existing "value" column. @@ -44,8 +48,13 @@ #' @importFrom dplyr %>% arrange group_by group_modify mutate ungroup #' @importFrom lubridate days #' @export -slide_by_geo = function(x, slide_fun, n = 14, col_name = "slide_value", - col_type = c("dbl", "int", "lgl", "chr", "list"), ...) { +slide_by_geo <- function(x, + slide_fun, + n = 14, + shift = 0, + col_name = "slide_value", + col_type = c("dbl", "int", "lgl", "chr", "list"), + ...) { # Check we have the minimal columns we need if (!all(c("geo_value", "time_value") %in% colnames(x))) { stop("`x` must have columns 'geo_value' and 'time_value'.") @@ -53,20 +62,22 @@ slide_by_geo = function(x, slide_fun, n = 14, col_name = "slide_value", # x = covidcast:::latest_issue(x) # TODO is this needed? # Which slide_index function? - col_type = match.arg(col_type) - slide_index_zzz = switch(col_type, - "dbl" = slider::slide_index_dbl, - "int" = slider::slide_index_int, - "lgl" = slider::slide_index_lgl, - "chr" = slider::slide_index_chr, - "list" = slider::slide_index) + col_type <- match.arg(col_type) + slide_index_zzz <- switch(col_type, + "dbl" = slider::slide_index_dbl, + "int" = slider::slide_index_int, + "lgl" = slider::slide_index_lgl, + "chr" = slider::slide_index_chr, + "list" = slider::slide_index) # Slide over a single geo value - slide_one_geo = function(.data_group, slide_fun, n, col_name, ...) { - slide_values = slide_index_zzz(.x = .data_group, - .i = .data_group$time_value, - .f = slide_fun, ..., - .before = days(n-1)) + slide_one_geo <- function(.data_group, slide_fun, n, col_name, ...) { + slide_values <- slide_index_zzz(.x = .data_group, + .i = .data_group$time_value, + .f = slide_fun, ..., + .before = days(n - 1 - shift), + .after = days(shift) + ) return(mutate(.data_group, !!col_name := slide_values)) } diff --git a/R-packages/modeltools/man/slide_by_geo.Rd b/R-packages/modeltools/man/slide_by_geo.Rd index 8657baeb..e35cc602 100644 --- a/R-packages/modeltools/man/slide_by_geo.Rd +++ b/R-packages/modeltools/man/slide_by_geo.Rd @@ -9,6 +9,7 @@ slide_by_geo( x, slide_fun, n = 14, + shift = 0, col_name = "slide_value", col_type = c("dbl", "int", "lgl", "chr", "list"), ... @@ -19,15 +20,20 @@ slide_by_geo( \item{slide_fun}{Function or formula to slide over the values in \code{x}, grouped by \code{geo_value}. To "slide" means to apply the function or formula over a -trailing window of \code{n} days of data. If a function, \code{slide_fun} must take +right-aligned window of \code{n} days of data. If a function, \code{slide_fun} must take \code{x}, a data frame the same column names as the original data frame; followed by any number of named additional arguments; and ending with \code{...}, to capture general additional arguments. If a formula, \code{slide_fun} can operate directly on \code{.x$value}, \code{.x$time_value}, etc., as in \code{~ mean(.x$value)} to compute a trailing mean over the last \code{n} days of data.} -\item{n}{Size of the local window (in days) to use. For example, if \code{n = 5}, -then to estimate the derivative on November 5, we train the given method on -data in between November 1 and November 5. Default is 14.} +\item{n}{Size of the local window (in days) to use. Default is 14.} + +\item{shift}{Number of days forward from the \code{time_value} to shift the window. Because the +window is right-aligned, if \code{n = 5} and \code{shift = 0}, then for a \code{time_value} of November 5, +the function will be applied to values from November 1 to November 5. A positive value shifts +the window forwards in time, while negative values shift the window backwards in time. Ex: +for a \code{time_value} of November 5 \code{shift = 3} captures November 4 through November 8; +\code{shift = -3} captures October 29 through November 2. Default is not to shift.} \item{col_name}{String indicating the name of the new column that will contain the derivative values. Default is "slide_value"; note that setting @@ -44,7 +50,7 @@ A data frame given by appending a new column to \code{x} named according to the \code{col_name} argument, containing the function values. } \description{ -Slides a given function over the values in a \code{covidcast_signal} data frame, +Slides a given function over the values in a \code{covidcast_signal} data frame, grouped by \code{geo_value}. (When multiple issue dates are present, only the latest issue is considered.) See the \href{https://cmu-delphi.github.io/covidcast/modeltoolsR/articles/modeltools.html}{getting started guide} for examples. diff --git a/R-packages/modeltools/tests/testthat/test-slide.R b/R-packages/modeltools/tests/testthat/test-slide.R index 9040dc6b..496a6457 100644 --- a/R-packages/modeltools/tests/testthat/test-slide.R +++ b/R-packages/modeltools/tests/testthat/test-slide.R @@ -8,8 +8,82 @@ test_that("slide_by_geo works", { ) out <- slide_by_geo(df, ~ Sum(.x$value), n = 3, col_name = "summed_value") expect_equal(names(out), c("geo_value", "time_value", "value", "summed_value")) - expect_equal(out$time_value, df$time_value) expect_equal(out$geo_value, df$geo_value) + expect_equal(out$time_value, df$time_value) expect_equal(out$value, df$value) expect_equal(out$summed_value, c(1, 3, 6, 9, 12, 6, 13, 21, 24, 27)) }) + + +test_that("slide_by_geo works with interleaved geo_values", { + df <- tibble( + geo_value = rep(c("a", "b"), 5), + time_value = rep(seq.Date(as.Date("2021-01-01"), as.Date("2021-01-05"), "day"), each=2), + # keep the individual rows the same as the previous test so the summed values are identical + value = c(1, 6, 2, 7, 3, 8, 4, 9, 5, 10) + ) + out <- slide_by_geo(df, ~ Sum(.x$value), n = 3, col_name = "summed_value") + expect_equal(names(out), c("geo_value", "time_value", "value", "summed_value")) + expect_equal(out$geo_value, rep(c("a", "b"), each=5)) + expect_equal(out$time_value, + rep(seq.Date(as.Date("2021-01-01"), as.Date("2021-01-05"), "day"), 2)) + expect_equal(out$value, 1:10) + expect_equal(out$summed_value, c(1, 3, 6, 9, 12, 6, 13, 21, 24, 27)) +}) + +test_that("slide_by_geo overwrites column when specified", { + df <- tibble( + geo_value = rep(c("a", "b"), each=5), + time_value = rep(seq.Date(as.Date("2021-01-01"), as.Date("2021-01-05"), "day"), 2), + value = 1:10 + ) + out <- slide_by_geo(df, ~ Sum(.x$value), n = 3, col_name = "value") + expect_equal(names(out), c("geo_value", "time_value", "value")) + expect_equal(out$geo_value, df$geo_value) + expect_equal(out$time_value, df$time_value) + expect_equal(out$value, c(1, 3, 6, 9, 12, 6, 13, 21, 24, 27)) +}) + + +test_that("slide_by_geo works with positive shift", { + df <- tibble( + geo_value = rep(c("a", "b"), each=5), + time_value = rep(seq.Date(as.Date("2021-01-01"), as.Date("2021-01-05"), "day"), 2), + value = 1:10 + ) + out <- slide_by_geo(df, ~ Sum(.x$value), n = 2, shift = 1, col_name = "summed_value") + expect_equal(names(out), c("geo_value", "time_value", "value", "summed_value")) + expect_equal(out$time_value, df$time_value) + expect_equal(out$geo_value, df$geo_value) + expect_equal(out$value, df$value) + expect_equal(out$summed_value, c(3, 5, 7, 9, 5, 13, 15, 17, 19, 10)) +}) + +test_that("slide_by_geo works with negative shift", { + df <- tibble( + geo_value = rep(c("a", "b"), each=5), + time_value = rep(seq.Date(as.Date("2021-01-01"), as.Date("2021-01-05"), "day"), 2), + value = 1:10 + ) + out <- slide_by_geo(df, ~ Sum(.x$value), n = 2, shift = -1, col_name = "summed_value") + expect_equal(names(out), c("geo_value", "time_value", "value", "summed_value")) + expect_equal(out$time_value, df$time_value) + expect_equal(out$geo_value, df$geo_value) + expect_equal(out$value, df$value) + expect_equal(out$summed_value, c(0, 1, 3, 5, 7, 0, 6, 13, 15, 17)) +}) + +test_that("slide_by_geo works with positive shift past current value", { + df <- tibble( + geo_value = rep(c("a", "b"), each=10), + time_value = rep(seq.Date(as.Date("2021-01-01"), as.Date("2021-01-10"), "day"), 2), + value = 1:20 + ) + out <- slide_by_geo(df, ~ Sum(.x$value), n = 2, shift = 3, col_name = "summed_value") + expect_equal(names(out), c("geo_value", "time_value", "value", "summed_value")) + expect_equal(out$time_value, df$time_value) + expect_equal(out$geo_value, df$geo_value) + expect_equal(out$value, df$value) + expect_equal(out$summed_value, c(7, 9, 11, 13, 15, 17, 19, 10, 0, 0, + 27, 29, 31, 33, 35, 37, 39, 20, 0, 0)) +}) From 0307791c46d2161777ebbc2e2a003923da4840e2 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Wed, 17 Mar 2021 17:02:41 -0400 Subject: [PATCH 05/36] add tests for missing time values --- .../modeltools/tests/testthat/test-slide.R | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/R-packages/modeltools/tests/testthat/test-slide.R b/R-packages/modeltools/tests/testthat/test-slide.R index 496a6457..4d344383 100644 --- a/R-packages/modeltools/tests/testthat/test-slide.R +++ b/R-packages/modeltools/tests/testthat/test-slide.R @@ -3,7 +3,7 @@ library(tibble) test_that("slide_by_geo works", { df <- tibble( geo_value = rep(c("a", "b"), each=5), - time_value = rep(seq.Date(as.Date("2021-01-01"), as.Date("2021-01-05"), "day"), 2), + time_value = seq.Date(as.Date("2021-01-01"), as.Date("2021-01-10"), "day"), value = 1:10 ) out <- slide_by_geo(df, ~ Sum(.x$value), n = 3, col_name = "summed_value") @@ -31,6 +31,22 @@ test_that("slide_by_geo works with interleaved geo_values", { expect_equal(out$summed_value, c(1, 3, 6, 9, 12, 6, 13, 21, 24, 27)) }) + +test_that("slide_by_geo works with missing time values", { + df <- tibble( + geo_value = rep(c("a", "b"), each=5), + time_value = seq.Date(as.Date("2021-01-01"), as.Date("2021-01-20"), by = 2), + value = 1:10 + ) + out <- slide_by_geo(df, ~ Sum(.x$value), n = 3, col_name = "summed_value") + expect_equal(names(out), c("geo_value", "time_value", "value", "summed_value")) + expect_equal(out$geo_value, df$geo_value) + expect_equal(out$time_value, df$time_value) + expect_equal(out$value, df$value) + expect_equal(out$summed_value, c(1, 3, 5, 7, 9, 6, 13, 15, 17, 19)) +}) + + test_that("slide_by_geo overwrites column when specified", { df <- tibble( geo_value = rep(c("a", "b"), each=5), From b876289f5306ad27f3d2814fa564330ca0daa68b Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Thu, 18 Mar 2021 07:01:28 -0400 Subject: [PATCH 06/36] add function for response aggregation --- R-packages/modeltools/R/response.R | 53 +++++++++++++++++++ .../modeltools/tests/testthat/test-response.R | 43 +++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 R-packages/modeltools/R/response.R create mode 100644 R-packages/modeltools/tests/testthat/test-response.R diff --git a/R-packages/modeltools/R/response.R b/R-packages/modeltools/R/response.R new file mode 100644 index 00000000..2d02fbb6 --- /dev/null +++ b/R-packages/modeltools/R/response.R @@ -0,0 +1,53 @@ +make_response <- function(signals, + response_name, + forecast_date, + incidence_period, + ahead) { + out_df <- signals + for (a in ahead) { + target_period <- as.list(get_target_period(forecast_date, incidence_period, a)[1, ]) + window_size <- as.numeric(target_period$end - target_period$start) + 1 + out_df <- slide_by_geo(out_df, + ~ Mean(window_size * .x[[paste0("value-0:", response_name)]]), + n = window_size, + shift = as.numeric(target_period$end - forecast_date), + col_name = paste0("value+", a, ":", response_name)) + } + return(out_df) +} + +#' Get the target period for a forecast date, incidence period and ahead +#' +#' +get_target_period <- function(forecast_date, incidence_period, ahead) { + # This function gives the start and end dates of the target period, + # based on the system described in the COVIDHub rules here: + # https://github.com/reichlab/covid19-forecast-hub/blob/master/data-processed/README.md + # + # Inputs: + # forecast_date: can be a vector of dates + # incidence_period: one of "epiweek" or "day" + # ahead: how many epiweeks/days ahead are you forecasting? + forecast_date <- lubridate::ymd(forecast_date) + if (incidence_period == "day") + return(tibble(start = forecast_date + ahead, end = forecast_date + ahead)) + assert_that(incidence_period == "epiweek", + msg="Unsupported `incidence_period`.") + # incidence_period: epiweek + ew_frcst_date <- MMWRweek::MMWRweek(forecast_date) # get epiweek of forecast_dates + sunday_of_ew_frcst_date <- MMWRweek::MMWRweek2Date( + MMWRyear = ew_frcst_date$MMWRyear, + MMWRweek = ew_frcst_date$MMWRweek, + MMWRday = 1) # 1 is Sunday + # From https://github.com/reichlab/covid19-forecast-hub/blob/master/data-processed/README.md: + # "For week-ahead forecasts with forecast_date of Sunday or Monday of EW12, a + # 1 week ahead forecast corresponds to EW12 and should have target_end_date of + # the Saturday of EW12. For week-ahead forecasts with forecast_date of Tuesday + # through Saturday of EW12, a 1 week ahead forecast corresponds to EW13 and + # should have target_end_date of the Saturday of EW13." + week_ahead <- ifelse(lubridate::wday(forecast_date) <= 2, # forecasting on a Sun/Monday + ahead - 1, + ahead) + tibble(start = sunday_of_ew_frcst_date + week_ahead * 7, + end = sunday_of_ew_frcst_date + (week_ahead + 1) * 7 - 1) +} \ No newline at end of file diff --git a/R-packages/modeltools/tests/testthat/test-response.R b/R-packages/modeltools/tests/testthat/test-response.R new file mode 100644 index 00000000..29dfc230 --- /dev/null +++ b/R-packages/modeltools/tests/testthat/test-response.R @@ -0,0 +1,43 @@ +library(tibble) + + +test_that("make_response works for epiweeks", { + df <- tibble( + geo_value = "oh", + time_value = seq.Date(as.Date("2021-01-01"), as.Date("2021-01-31"), "day"), + `value-0:my_response` = 1:31 + ) + out <- make_response(df, "my_response", as.Date("2021-01-30"), "epiweek", 1:2) + + expect_equal(names(out), + c("geo_value", "time_value", "value-0:my_response", + "value+1:my_response", "value+2:my_response")) + expect_equal(out$geo_value, df$geo_value) + expect_equal(out$time_value, df$time_value) + expect_equal(out$`value-0:my_response`, df$`value-0:my_response`) + expect_equal(out$`value+1:my_response`, + c(seq(35, 196, 7), seq(199.5, 217, 3.5), NA)) + expect_equal(out$`value+2:my_response`, + c(seq(84, 196, 7), seq(199.5, 217, 3.5), rep(NA, 8))) +}) + + +test_that("make_response works for days", { + df <- tibble( + geo_value = "oh", + time_value = seq.Date(as.Date("2021-01-01"), as.Date("2021-01-31"), "day"), + `value-0:my_response` = 1:31 + ) + out <- make_response(df, "my_response", as.Date("2021-01-30"), "day", 1:4) + + expect_equal(names(out), + c("geo_value", "time_value", "value-0:my_response", + "value+1:my_response", "value+2:my_response", "value+3:my_response", "value+4:my_response")) + expect_equal(out$geo_value, df$geo_value) + expect_equal(out$time_value, df$time_value) + expect_equal(out$`value-0:my_response`, df$`value-0:my_response`) + expect_equal(out$`value+1:my_response`, c(seq(2, 31), NA)) + expect_equal(out$`value+2:my_response`, c(seq(3, 31), rep(NA, 2))) + expect_equal(out$`value+3:my_response`, c(seq(4, 31), rep(NA, 3))) + expect_equal(out$`value+4:my_response`, c(seq(5, 31), rep(NA, 4))) +}) From 8ac018ae351fb3b1418cccd73c4469e96e83b1d9 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Thu, 18 Mar 2021 09:55:18 -0400 Subject: [PATCH 07/36] add documentation and change function names --- R-packages/modeltools/NAMESPACE | 1 + R-packages/modeltools/R/response.R | 97 ++++++++++++------- .../modeltools/man/add_response_columns.Rd | 43 ++++++++ .../modeltools/man/get_target_period.Rd | 11 +++ .../modeltools/tests/testthat/test-response.R | 9 +- 5 files changed, 121 insertions(+), 40 deletions(-) create mode 100644 R-packages/modeltools/man/add_response_columns.Rd create mode 100644 R-packages/modeltools/man/get_target_period.Rd diff --git a/R-packages/modeltools/NAMESPACE b/R-packages/modeltools/NAMESPACE index 8bbd90e3..53039634 100644 --- a/R-packages/modeltools/NAMESPACE +++ b/R-packages/modeltools/NAMESPACE @@ -7,6 +7,7 @@ export(Median) export(Min) export(Start) export(Sum) +export(add_response_columns) export(estimate_deriv) export(pct_change) export(quantgen_forecaster) diff --git a/R-packages/modeltools/R/response.R b/R-packages/modeltools/R/response.R index 2d02fbb6..78645f9a 100644 --- a/R-packages/modeltools/R/response.R +++ b/R-packages/modeltools/R/response.R @@ -1,8 +1,35 @@ -make_response <- function(signals, - response_name, - forecast_date, - incidence_period, - ahead) { +library(assertthat) + +#' Compute the corresponding observed response values for a set of signals. +#' +#' The response for an observed data point at time `t` is defined as the mean of observed values on +#' days `t + s` and `t + e` multiplied by `e - s`, where `s` and `e` are the number of days between +#' `forecast_date` and the start and end, respectively, of the next full `incidence_period`. We +#' choose to take the mean and multiply by the incidence period length rather than simply sum the +#' observed values in order to avoid treating missing values as 0. +#' +#' @param signals Wide data frame of signals. We expect the following columns: +#' \itemize{ +#' \item{`geo_value`}{Strings of geographic locations.} +#' \item{`time_value`}{Dates of training data.} +#' \item{Covariate columns}{Columns with names of the form `value-{days}:{signal}` whose values +#' correspond to `{signal}` `{days}` before `time_value`} +#' } +#' @param response_name Name of response signal. The value of the response variable on a single +#' day should be located in column `value-0:{response_name}` of `signals`. +#' @param forecast_date Date on which the forecast will be made +#' @param incidence_period Time period over which the response should be summed. +#' @param ahead integer or vector of integer ahead values +#' +#' @return Data frame of signals equal to `signals` with `length(ahead)` additional columns with +#' names `value+{a}:{response_name}` corresponding to the response variable at ahead `a`. +#' +#' @export +add_response_columns <- function(signals, + response_name, + forecast_date, + incidence_period, + ahead) { out_df <- signals for (a in ahead) { target_period <- as.list(get_target_period(forecast_date, incidence_period, a)[1, ]) @@ -20,34 +47,34 @@ make_response <- function(signals, #' #' get_target_period <- function(forecast_date, incidence_period, ahead) { - # This function gives the start and end dates of the target period, - # based on the system described in the COVIDHub rules here: - # https://github.com/reichlab/covid19-forecast-hub/blob/master/data-processed/README.md - # - # Inputs: - # forecast_date: can be a vector of dates - # incidence_period: one of "epiweek" or "day" - # ahead: how many epiweeks/days ahead are you forecasting? - forecast_date <- lubridate::ymd(forecast_date) - if (incidence_period == "day") - return(tibble(start = forecast_date + ahead, end = forecast_date + ahead)) - assert_that(incidence_period == "epiweek", - msg="Unsupported `incidence_period`.") - # incidence_period: epiweek - ew_frcst_date <- MMWRweek::MMWRweek(forecast_date) # get epiweek of forecast_dates - sunday_of_ew_frcst_date <- MMWRweek::MMWRweek2Date( - MMWRyear = ew_frcst_date$MMWRyear, - MMWRweek = ew_frcst_date$MMWRweek, - MMWRday = 1) # 1 is Sunday - # From https://github.com/reichlab/covid19-forecast-hub/blob/master/data-processed/README.md: - # "For week-ahead forecasts with forecast_date of Sunday or Monday of EW12, a - # 1 week ahead forecast corresponds to EW12 and should have target_end_date of - # the Saturday of EW12. For week-ahead forecasts with forecast_date of Tuesday - # through Saturday of EW12, a 1 week ahead forecast corresponds to EW13 and - # should have target_end_date of the Saturday of EW13." - week_ahead <- ifelse(lubridate::wday(forecast_date) <= 2, # forecasting on a Sun/Monday - ahead - 1, - ahead) - tibble(start = sunday_of_ew_frcst_date + week_ahead * 7, - end = sunday_of_ew_frcst_date + (week_ahead + 1) * 7 - 1) + # This function gives the start and end dates of the target period, + # based on the system described in the COVIDHub rules here: + # https://github.com/reichlab/covid19-forecast-hub/blob/master/data-processed/README.md + # + # Inputs: + # forecast_date: can be a vector of dates + # incidence_period: one of "epiweek" or "day" + # ahead: how many epiweeks/days ahead are you forecasting? + forecast_date <- lubridate::ymd(forecast_date) + if (incidence_period == "day") + return(tibble(start = forecast_date + ahead, end = forecast_date + ahead)) + assert_that(incidence_period == "epiweek", + msg="Unsupported `incidence_period`.") + # incidence_period: epiweek + ew_frcst_date <- MMWRweek::MMWRweek(forecast_date) # get epiweek of forecast_dates + sunday_of_ew_frcst_date <- MMWRweek::MMWRweek2Date( + MMWRyear = ew_frcst_date$MMWRyear, + MMWRweek = ew_frcst_date$MMWRweek, + MMWRday = 1) # 1 is Sunday + # From https://github.com/reichlab/covid19-forecast-hub/blob/master/data-processed/README.md: + # "For week-ahead forecasts with forecast_date of Sunday or Monday of EW12, a + # 1 week ahead forecast corresponds to EW12 and should have target_end_date of + # the Saturday of EW12. For week-ahead forecasts with forecast_date of Tuesday + # through Saturday of EW12, a 1 week ahead forecast corresponds to EW13 and + # should have target_end_date of the Saturday of EW13." + week_ahead <- ifelse(lubridate::wday(forecast_date) <= 2, # forecasting on a Sun/Monday + ahead - 1, + ahead) + tibble(start = sunday_of_ew_frcst_date + week_ahead * 7, + end = sunday_of_ew_frcst_date + (week_ahead + 1) * 7 - 1) } \ No newline at end of file diff --git a/R-packages/modeltools/man/add_response_columns.Rd b/R-packages/modeltools/man/add_response_columns.Rd new file mode 100644 index 00000000..6f0c5a24 --- /dev/null +++ b/R-packages/modeltools/man/add_response_columns.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/response.R +\name{add_response_columns} +\alias{add_response_columns} +\title{Compute the corresponding observed response values for a set of signals.} +\usage{ +add_response_columns( + signals, + response_name, + forecast_date, + incidence_period, + ahead +) +} +\arguments{ +\item{signals}{Wide data frame of signals. We expect the following columns: +\itemize{ +\item{\code{geo_value}}{Strings of geographic locations.} +\item{\code{time_value}}{Dates of training data.} +\item{Covariate columns}{Columns with names of the form \code{value-{days}:{signal}} whose values +correspond to \code{{signal}} \code{{days}} before \code{time_value}} +}} + +\item{response_name}{Name of response signal. The value of the response variable on a single +day should be located in column \code{value-0:{response_name}} of \code{signals}.} + +\item{forecast_date}{Date on which the forecast will be made} + +\item{incidence_period}{Time period over which the response should be summed.} + +\item{ahead}{integer or vector of integer ahead values} +} +\value{ +Data frame of signals equal to \code{signals} with \code{length(ahead)} additional columns with +names \code{value+{a}:{response_name}} corresponding to the response variable at ahead \code{a}. +} +\description{ +The response for an observed data point at time \code{t} is defined as the mean of observed values on +days \code{t + s} and \code{t + e} multiplied by \code{e - s}, where \code{s} and \code{e} are the number of days between +\code{forecast_date} and the start and end, respectively, of the next full \code{incidence_period}. We +choose to take the mean and multiply by the incidence period length rather than simply sum the +observed values in order to avoid treating missing values as 0. +} diff --git a/R-packages/modeltools/man/get_target_period.Rd b/R-packages/modeltools/man/get_target_period.Rd new file mode 100644 index 00000000..48f92421 --- /dev/null +++ b/R-packages/modeltools/man/get_target_period.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/response.R +\name{get_target_period} +\alias{get_target_period} +\title{Get the target period for a forecast date, incidence period and ahead} +\usage{ +get_target_period(forecast_date, incidence_period, ahead) +} +\description{ +Get the target period for a forecast date, incidence period and ahead +} diff --git a/R-packages/modeltools/tests/testthat/test-response.R b/R-packages/modeltools/tests/testthat/test-response.R index 29dfc230..83cc8cf5 100644 --- a/R-packages/modeltools/tests/testthat/test-response.R +++ b/R-packages/modeltools/tests/testthat/test-response.R @@ -1,13 +1,12 @@ library(tibble) - -test_that("make_response works for epiweeks", { +test_that("add_response_columns works for epiweeks", { df <- tibble( geo_value = "oh", time_value = seq.Date(as.Date("2021-01-01"), as.Date("2021-01-31"), "day"), `value-0:my_response` = 1:31 ) - out <- make_response(df, "my_response", as.Date("2021-01-30"), "epiweek", 1:2) + out <- add_response_columns(df, "my_response", as.Date("2021-01-30"), "epiweek", 1:2) expect_equal(names(out), c("geo_value", "time_value", "value-0:my_response", @@ -22,13 +21,13 @@ test_that("make_response works for epiweeks", { }) -test_that("make_response works for days", { +test_that("add_response_columns works for days", { df <- tibble( geo_value = "oh", time_value = seq.Date(as.Date("2021-01-01"), as.Date("2021-01-31"), "day"), `value-0:my_response` = 1:31 ) - out <- make_response(df, "my_response", as.Date("2021-01-30"), "day", 1:4) + out <- add_response_columns(df, "my_response", as.Date("2021-01-30"), "day", 1:4) expect_equal(names(out), c("geo_value", "time_value", "value-0:my_response", From 63304024bde0a090bf6b98294ef37f92d59b20c1 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Thu, 18 Mar 2021 10:06:56 -0400 Subject: [PATCH 08/36] add trailing newline --- R-packages/modeltools/R/response.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-packages/modeltools/R/response.R b/R-packages/modeltools/R/response.R index 78645f9a..8da25427 100644 --- a/R-packages/modeltools/R/response.R +++ b/R-packages/modeltools/R/response.R @@ -77,4 +77,4 @@ get_target_period <- function(forecast_date, incidence_period, ahead) { ahead) tibble(start = sunday_of_ew_frcst_date + week_ahead * 7, end = sunday_of_ew_frcst_date + (week_ahead + 1) * 7 - 1) -} \ No newline at end of file +} From 09ecff73f6d1d60dd88ddb550f3663c54155f986 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Thu, 18 Mar 2021 20:27:56 -0400 Subject: [PATCH 09/36] fix comment unclarity --- R-packages/modeltools/R/response.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R-packages/modeltools/R/response.R b/R-packages/modeltools/R/response.R index 8da25427..4f945cfc 100644 --- a/R-packages/modeltools/R/response.R +++ b/R-packages/modeltools/R/response.R @@ -3,10 +3,10 @@ library(assertthat) #' Compute the corresponding observed response values for a set of signals. #' #' The response for an observed data point at time `t` is defined as the mean of observed values on -#' days `t + s` and `t + e` multiplied by `e - s`, where `s` and `e` are the number of days between -#' `forecast_date` and the start and end, respectively, of the next full `incidence_period`. We -#' choose to take the mean and multiply by the incidence period length rather than simply sum the -#' observed values in order to avoid treating missing values as 0. +#' days `t + s` through `t + e` (inclusive) multiplied by `e - s`, where `s` and `e` are the number +#' of days from `forecast_date` and the start and end, respectively, of the next full +#' `incidence_period`. We choose to take the mean and multiply by the incidence period length +#' rather than simply sum the observed values in order to avoid treating missing values as 0. #' #' @param signals Wide data frame of signals. We expect the following columns: #' \itemize{ From d8f67ab2378af10258160be99c55babb8b8969d8 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Fri, 19 Mar 2021 14:39:41 -0400 Subject: [PATCH 10/36] update response documentation slightly --- R-packages/modeltools/R/response.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-packages/modeltools/R/response.R b/R-packages/modeltools/R/response.R index 4f945cfc..6ce3e631 100644 --- a/R-packages/modeltools/R/response.R +++ b/R-packages/modeltools/R/response.R @@ -4,7 +4,7 @@ library(assertthat) #' #' The response for an observed data point at time `t` is defined as the mean of observed values on #' days `t + s` through `t + e` (inclusive) multiplied by `e - s`, where `s` and `e` are the number -#' of days from `forecast_date` and the start and end, respectively, of the next full +#' of days from `forecast_date` to the start and end, respectively, of the next full #' `incidence_period`. We choose to take the mean and multiply by the incidence period length #' rather than simply sum the observed values in order to avoid treating missing values as 0. #' From 06454b9cb7e65ccfb76ea6a4bbf16a1291da9a88 Mon Sep 17 00:00:00 2001 From: Addison Hu Date: Fri, 19 Mar 2021 18:43:22 -0400 Subject: [PATCH 11/36] Update R-packages/modeltools/R/matrix.R --- R-packages/modeltools/R/matrix.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R index 25f95afd..1aee57b5 100644 --- a/R-packages/modeltools/R/matrix.R +++ b/R-packages/modeltools/R/matrix.R @@ -8,7 +8,7 @@ #' correspond to `{signal}` `{days}` before `time_value`} #' \item{Response columns}{Columns with names of the form `value+{n}:{response}` whose values #' correspond to `{response}` `{n}` incidence period units after `time_value`. Since -#' columns beginning with "value+" are interpretted as the response rather than a +#' columns beginning with "value+" are interpreted as the response rather than a #' covariate, observed data from the same day as `time_value` should be in columns #' `value-0:{signal}` rather than `value+0:{signal}`} #' } From 2eb80f5817c59fbd73fee3bd27d98d4f8c7f8069 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Fri, 19 Mar 2021 21:15:10 -0400 Subject: [PATCH 12/36] change naming convention of values and responses --- R-packages/modeltools/R/response.R | 12 ++++---- .../modeltools/man/add_response_columns.Rd | 16 +++++------ .../modeltools/tests/testthat/test-response.R | 28 +++++++++---------- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/R-packages/modeltools/R/response.R b/R-packages/modeltools/R/response.R index 6ce3e631..a5e48048 100644 --- a/R-packages/modeltools/R/response.R +++ b/R-packages/modeltools/R/response.R @@ -12,17 +12,17 @@ library(assertthat) #' \itemize{ #' \item{`geo_value`}{Strings of geographic locations.} #' \item{`time_value`}{Dates of training data.} -#' \item{Covariate columns}{Columns with names of the form `value-{days}:{signal}` whose values -#' correspond to `{signal}` `{days}` before `time_value`} +#' \item{Covariate columns}{Columns with names of the form `value-{days}:{signal}` or +#' `value+0:{signal} whose values correspond to `{signal}` `{days}` before `time_value`} #' } #' @param response_name Name of response signal. The value of the response variable on a single -#' day should be located in column `value-0:{response_name}` of `signals`. +#' day should be located in column `value+0:{response_name}` of `signals`. #' @param forecast_date Date on which the forecast will be made #' @param incidence_period Time period over which the response should be summed. #' @param ahead integer or vector of integer ahead values #' #' @return Data frame of signals equal to `signals` with `length(ahead)` additional columns with -#' names `value+{a}:{response_name}` corresponding to the response variable at ahead `a`. +#' names `response+{a}:{response_name}` corresponding to the response variable at ahead `a`. #' #' @export add_response_columns <- function(signals, @@ -35,10 +35,10 @@ add_response_columns <- function(signals, target_period <- as.list(get_target_period(forecast_date, incidence_period, a)[1, ]) window_size <- as.numeric(target_period$end - target_period$start) + 1 out_df <- slide_by_geo(out_df, - ~ Mean(window_size * .x[[paste0("value-0:", response_name)]]), + ~ Mean(window_size * .x[[paste0("value+0:", response_name)]]), n = window_size, shift = as.numeric(target_period$end - forecast_date), - col_name = paste0("value+", a, ":", response_name)) + col_name = paste0("response+", a, ":", response_name)) } return(out_df) } diff --git a/R-packages/modeltools/man/add_response_columns.Rd b/R-packages/modeltools/man/add_response_columns.Rd index 6f0c5a24..2e53ad48 100644 --- a/R-packages/modeltools/man/add_response_columns.Rd +++ b/R-packages/modeltools/man/add_response_columns.Rd @@ -17,12 +17,12 @@ add_response_columns( \itemize{ \item{\code{geo_value}}{Strings of geographic locations.} \item{\code{time_value}}{Dates of training data.} -\item{Covariate columns}{Columns with names of the form \code{value-{days}:{signal}} whose values -correspond to \code{{signal}} \code{{days}} before \code{time_value}} +\item{Covariate columns}{Columns with names of the form \code{value-{days}:{signal}} or +\verb{value+0:\{signal\} whose values correspond to }{signal}\verb{}{days}\code{before}time_value`} }} \item{response_name}{Name of response signal. The value of the response variable on a single -day should be located in column \code{value-0:{response_name}} of \code{signals}.} +day should be located in column \code{value+0:{response_name}} of \code{signals}.} \item{forecast_date}{Date on which the forecast will be made} @@ -32,12 +32,12 @@ day should be located in column \code{value-0:{response_name}} of \code{signals} } \value{ Data frame of signals equal to \code{signals} with \code{length(ahead)} additional columns with -names \code{value+{a}:{response_name}} corresponding to the response variable at ahead \code{a}. +names \code{response+{a}:{response_name}} corresponding to the response variable at ahead \code{a}. } \description{ The response for an observed data point at time \code{t} is defined as the mean of observed values on -days \code{t + s} and \code{t + e} multiplied by \code{e - s}, where \code{s} and \code{e} are the number of days between -\code{forecast_date} and the start and end, respectively, of the next full \code{incidence_period}. We -choose to take the mean and multiply by the incidence period length rather than simply sum the -observed values in order to avoid treating missing values as 0. +days \code{t + s} through \code{t + e} (inclusive) multiplied by \code{e - s}, where \code{s} and \code{e} are the number +of days from \code{forecast_date} to the start and end, respectively, of the next full +\code{incidence_period}. We choose to take the mean and multiply by the incidence period length +rather than simply sum the observed values in order to avoid treating missing values as 0. } diff --git a/R-packages/modeltools/tests/testthat/test-response.R b/R-packages/modeltools/tests/testthat/test-response.R index 83cc8cf5..393f3b06 100644 --- a/R-packages/modeltools/tests/testthat/test-response.R +++ b/R-packages/modeltools/tests/testthat/test-response.R @@ -4,19 +4,19 @@ test_that("add_response_columns works for epiweeks", { df <- tibble( geo_value = "oh", time_value = seq.Date(as.Date("2021-01-01"), as.Date("2021-01-31"), "day"), - `value-0:my_response` = 1:31 + `value+0:my_response` = 1:31 ) out <- add_response_columns(df, "my_response", as.Date("2021-01-30"), "epiweek", 1:2) expect_equal(names(out), - c("geo_value", "time_value", "value-0:my_response", - "value+1:my_response", "value+2:my_response")) + c("geo_value", "time_value", "value+0:my_response", + "response+1:my_response", "response+2:my_response")) expect_equal(out$geo_value, df$geo_value) expect_equal(out$time_value, df$time_value) - expect_equal(out$`value-0:my_response`, df$`value-0:my_response`) - expect_equal(out$`value+1:my_response`, + expect_equal(out$`value+0:my_response`, df$`value+0:my_response`) + expect_equal(out$`response+1:my_response`, c(seq(35, 196, 7), seq(199.5, 217, 3.5), NA)) - expect_equal(out$`value+2:my_response`, + expect_equal(out$`response+2:my_response`, c(seq(84, 196, 7), seq(199.5, 217, 3.5), rep(NA, 8))) }) @@ -25,18 +25,18 @@ test_that("add_response_columns works for days", { df <- tibble( geo_value = "oh", time_value = seq.Date(as.Date("2021-01-01"), as.Date("2021-01-31"), "day"), - `value-0:my_response` = 1:31 + `value+0:my_response` = 1:31 ) out <- add_response_columns(df, "my_response", as.Date("2021-01-30"), "day", 1:4) expect_equal(names(out), - c("geo_value", "time_value", "value-0:my_response", - "value+1:my_response", "value+2:my_response", "value+3:my_response", "value+4:my_response")) + c("geo_value", "time_value", "value+0:my_response", + "response+1:my_response", "response+2:my_response", "response+3:my_response", "response+4:my_response")) expect_equal(out$geo_value, df$geo_value) expect_equal(out$time_value, df$time_value) - expect_equal(out$`value-0:my_response`, df$`value-0:my_response`) - expect_equal(out$`value+1:my_response`, c(seq(2, 31), NA)) - expect_equal(out$`value+2:my_response`, c(seq(3, 31), rep(NA, 2))) - expect_equal(out$`value+3:my_response`, c(seq(4, 31), rep(NA, 3))) - expect_equal(out$`value+4:my_response`, c(seq(5, 31), rep(NA, 4))) + expect_equal(out$`value+0:my_response`, df$`value+0:my_response`) + expect_equal(out$`response+1:my_response`, c(seq(2, 31), NA)) + expect_equal(out$`response+2:my_response`, c(seq(3, 31), rep(NA, 2))) + expect_equal(out$`response+3:my_response`, c(seq(4, 31), rep(NA, 3))) + expect_equal(out$`response+4:my_response`, c(seq(5, 31), rep(NA, 4))) }) From c6149eaa9b2f92419aaef804ce19c446d77de4c3 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Fri, 19 Mar 2021 21:23:09 -0400 Subject: [PATCH 13/36] update naming conventions for signals vs response --- R-packages/modeltools/R/matrix.R | 17 +++++++--------- .../man/create_train_and_predict_matrices.Rd | 13 +++--------- .../modeltools/tests/testthat/test-matrix.R | 20 +++++++++---------- 3 files changed, 20 insertions(+), 30 deletions(-) diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R index 1aee57b5..3ff2f7f2 100644 --- a/R-packages/modeltools/R/matrix.R +++ b/R-packages/modeltools/R/matrix.R @@ -4,13 +4,10 @@ #' \itemize{ #' \item{`geo_value`}{Strings of geographic locations.} #' \item{`time_value`}{Dates of training data.} -#' \item{Covariate columns}{Columns with names of the form `value-{days}:{signal}` whose values -#' correspond to `{signal}` `{days}` before `time_value`} -#' \item{Response columns}{Columns with names of the form `value+{n}:{response}` whose values -#' correspond to `{response}` `{n}` incidence period units after `time_value`. Since -#' columns beginning with "value+" are interpreted as the response rather than a -#' covariate, observed data from the same day as `time_value` should be in columns -#' `value-0:{signal}` rather than `value+0:{signal}`} +#' \item{Covariate columns}{Columns with names of the form `value-{days}:{signal}` or +#' `value+0:{signal} whose values correspond to `{signal}` `{days}` before `time_value`} +#' \item{Response columns}{Columns with names of the form `response+{n}:{response}` whose values +#' correspond to `{response}` `{n}` incidence period units after `time_value`.} #' } #' A data frame in this format can be made using `covidcast::aggregate_signals()`. #' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast @@ -24,7 +21,7 @@ #' `value-{days}:{signal}` columns in `lagged_df`. The training data consists of the #' latest date `d` such that there is an observed response at time `d + ahead` and all data #' from the `training_window_size` days prior to it.} -#' \item{`train_y`}{Vector of response data from the `value+{ahead}:{response}` column of +#' \item{`train_y`}{Vector of response data from the `response+{ahead}:{response}` column of #' `lagged_df` corresponding to the rows of `train_x`.} #' \item{`predict_x`}{Matrix of prediction data in the same format as `train_x`. The #' prediction data contains the most recent `training_window_size` days.} @@ -42,7 +39,7 @@ create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_ # Find the last possible date of training data response_end_date <- lagged_df %>% - select(time_value, tidyselect::starts_with(sprintf("value+%i:", ahead))) %>% + select(time_value, tidyselect::starts_with(sprintf("response+%i:", ahead))) %>% tidyr::drop_na() %>% summarize(max(time_value)) %>% pull() @@ -59,7 +56,7 @@ create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_ filter(between(time_value, train_end_date - training_window_size + 1, train_end_date)) %>% - select(tidyselect::starts_with(sprintf("value+%i:", ahead))) %>% + select(tidyselect::starts_with(sprintf("response+%i:", ahead))) %>% pull() # Prediction matrices diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd index 9a99dd6c..603e42da 100644 --- a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd +++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd @@ -11,15 +11,8 @@ create_train_and_predict_matrices(lagged_df, ahead, training_window_size) \itemize{ \item{\code{geo_value}}{Strings of geographic locations.} \item{\code{time_value}}{Dates of training data.} -\item{Covariate columns}{Columns with names of the form \code{value-{days}:{signal}} whose values -correspond to \code{{signal}} \code{{days}} before \code{time_value}} -\item{Response columns}{Columns with names of the form \code{value+{n}:{response}} whose values -correspond to \code{{response}} \code{{n}} incidence period units after \code{time_value}. Since -columns beginning with "value+" are interpretted as the response rather than a -covariate, observed data from the same day as \code{time_value} should be in columns -\code{value-0:{signal}} rather than \code{value+0:{signal}}} -} -A data frame in this format can be made using \code{covidcast::aggregate_signals()}.} +\item{Covariate columns}{Columns with names of the form \code{value-{days}:{signal}} or +\verb{value+0:\{signal\} whose values correspond to }{signal}\verb{}{days}\code{before}time_value\verb{\} \\item\{Response columns\}\{Columns with names of the form }response+{n}:{response}\verb{whose values correspond to}{response}\verb{}{n}\verb{incidence period units after}time_value\verb{.\} \} A data frame in this format can be made using }covidcast::aggregate_signals()`.} \item{ahead}{Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast} @@ -34,7 +27,7 @@ Named list with entries: \code{value-{days}:{signal}} columns in \code{lagged_df}. The training data consists of the latest date \code{d} such that there is an observed response at time \code{d + ahead} and all data from the \code{training_window_size} days prior to it.} -\item{\code{train_y}}{Vector of response data from the \code{value+{ahead}:{response}} column of +\item{\code{train_y}}{Vector of response data from the \code{response+{ahead}:{response}} column of \code{lagged_df} corresponding to the rows of \code{train_x}.} \item{\code{predict_x}}{Matrix of prediction data in the same format as \code{train_x}. The prediction data contains the most recent \code{training_window_size} days.} diff --git a/R-packages/modeltools/tests/testthat/test-matrix.R b/R-packages/modeltools/tests/testthat/test-matrix.R index bc4ced9d..3f955659 100644 --- a/R-packages/modeltools/tests/testthat/test-matrix.R +++ b/R-packages/modeltools/tests/testthat/test-matrix.R @@ -10,12 +10,12 @@ test_that("training and prediction matrices are created", { `value-2:signal_2` = seq(7, 16), `value-1:signal_1` = seq(-1, 8), `value-1:signal_2` = seq(9, 18), - `value-0:signal_1` = seq(1, 10), - `value-0:signal_2` = seq(11, 20), - `value+1:signal_1` = c(seq(3, 10), rep(NA, 2)), - `value+1:signal_2` = c(seq(13, 20), rep(NA, 2)), - `value+2:signal_1` = c(seq(5, 10), rep(NA, 4)), - `value+2:signal_2` = c(seq(15, 20), rep(NA, 4)) + `value+0:signal_1` = seq(1, 10), + `value+0:signal_2` = seq(11, 20), + `response+1:signal_1` = c(seq(3, 10), rep(NA, 2)), + `response+1:signal_2` = c(seq(13, 20), rep(NA, 2)), + `response+2:signal_1` = c(seq(5, 10), rep(NA, 4)), + `response+2:signal_2` = c(seq(15, 20), rep(NA, 4)) ) out <- create_train_and_predict_matrices(df, 2, 1) @@ -27,8 +27,8 @@ test_that("training and prediction matrices are created", { `value-2:signal_2` = c(11, 12), `value-1:signal_1` = c(3, 4), `value-1:signal_2` = c(13, 14), - `value-0:signal_1` = c(5, 6), - `value-0:signal_2` = c(15, 16))) + `value+0:signal_1` = c(5, 6), + `value+0:signal_2` = c(15, 16))) ) expect_equal(out$train_y, c(19, 20)) expect_equal(out$predict_x, @@ -37,8 +37,8 @@ test_that("training and prediction matrices are created", { `value-2:signal_2` = c(15, 16), `value-1:signal_1` = c(7, 8), `value-1:signal_2` = c(17, 18), - `value-0:signal_1` = c(9, 10), - `value-0:signal_2` = c(19, 20))) + `value+0:signal_1` = c(9, 10), + `value+0:signal_2` = c(19, 20))) ) expect_equal(out$predict_geo_values, c("az", "wv")) }) From 641602e26e0ac50aff6fd197d4ed49fee132fcef Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Mon, 22 Mar 2021 13:04:42 -0400 Subject: [PATCH 14/36] code example --- R-packages/modeltools/R/response.R | 15 +++++++++++++++ .../modeltools/man/add_response_columns.Rd | 16 ++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/R-packages/modeltools/R/response.R b/R-packages/modeltools/R/response.R index a5e48048..a190cb17 100644 --- a/R-packages/modeltools/R/response.R +++ b/R-packages/modeltools/R/response.R @@ -24,6 +24,21 @@ library(assertthat) #' @return Data frame of signals equal to `signals` with `length(ahead)` additional columns with #' names `response+{a}:{response_name}` corresponding to the response variable at ahead `a`. #' +#' @examples +#' \dontrun{ +#' add_response_columns( +#' tibble( +#' geo_value = "mi" +#' time_value = seq.Date(as.Date("2020-07-01"), as.Date("2020-07-14"), "day"), +#' `value+0:sig` = 1:14 +#' ), +#' "sig", +#' as.Date("2020-07-15"), +#' "epiweek", +#' 1:2 +#' ) +#' } +#' #' @export add_response_columns <- function(signals, response_name, diff --git a/R-packages/modeltools/man/add_response_columns.Rd b/R-packages/modeltools/man/add_response_columns.Rd index 2e53ad48..1f2d8ade 100644 --- a/R-packages/modeltools/man/add_response_columns.Rd +++ b/R-packages/modeltools/man/add_response_columns.Rd @@ -41,3 +41,19 @@ of days from \code{forecast_date} to the start and end, respectively, of the nex \code{incidence_period}. We choose to take the mean and multiply by the incidence period length rather than simply sum the observed values in order to avoid treating missing values as 0. } +\examples{ +\dontrun{ +add_response_columns( + tibble( + geo_value = "mi" + time_value = seq.Date(as.Date("2020-07-01"), as.Date("2020-07-14"), "day"), + `value+0:sig` = 1:14 + ), + "sig", + as.Date("2020-07-15"), + "epiweek", + 1:2 +) +} + +} From f2701ea7ec1801572e40d0384f16c85adfe5d9a1 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Mon, 22 Mar 2021 14:48:12 -0400 Subject: [PATCH 15/36] update documentation --- R-packages/modeltools/R/matrix.R | 26 ++++++++++++++--- .../man/create_train_and_predict_matrices.Rd | 28 +++++++++++++++---- 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R index 3ff2f7f2..902ef49f 100644 --- a/R-packages/modeltools/R/matrix.R +++ b/R-packages/modeltools/R/matrix.R @@ -1,4 +1,4 @@ -#' Create a training and prediction matrices for a given ahead. +#' Create training and testing data matrices and a training response vector for a given ahead. #' #' @param lagged_df Data frame of lagged data. It should have the following columns: #' \itemize{ @@ -9,7 +9,8 @@ #' \item{Response columns}{Columns with names of the form `response+{n}:{response}` whose values #' correspond to `{response}` `{n}` incidence period units after `time_value`.} #' } -#' A data frame in this format can be made using `covidcast::aggregate_signals()`. +#' A data frame in this format can be made using `covidcast::aggregate_signals()` and +#' `modeltools::get_response_columns()`. #' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast #' @param training_window_size Size of the local training window in days to use. For example, if #' `training_window_size = 14`, then to make a 1-day-ahead forecast on December 15, we train on @@ -19,8 +20,8 @@ #' \itemize{ #' \item{`train_x`}{Matrix of training data whose columns correspond to the #' `value-{days}:{signal}` columns in `lagged_df`. The training data consists of the -#' latest date `d` such that there is an observed response at time `d + ahead` and all data -#' from the `training_window_size` days prior to it.} +#' latest date `d` such that there is an observed response at time `d + ahead * +#' incidence_period`, plus all data from the `training_window_size` days prior to it.} #' \item{`train_y`}{Vector of response data from the `response+{ahead}:{response}` column of #' `lagged_df` corresponding to the rows of `train_x`.} #' \item{`predict_x`}{Matrix of prediction data in the same format as `train_x`. The @@ -28,6 +29,23 @@ #' \item{`predict_geo_values`}{Vector of `geo_values` corresponding to the rows of `predict_x`.} #' } #' +#' @examples \dontrun{ +#' create_train_and_predict_matrices( +#' tibble( +#' geo_value = rep(c("az", "wv"), 5), +#' time_value = rep( +#' as.Date(c("2021-01-25", "2021-01-26", "2021-01-27", "2021-01-28", "2021-01-29")), +#' each = 2), +#' `value-2:signal_1` = seq(-3, 6), +#' `value-1:signal_1` = seq(-1, 8), +#' `value+0:signal_1` = seq(1, 10), +#' `response+2:signal_1` = c(seq(5, 10), rep(NA, 4)) +#' ), +#' ahead = 2, +#' training_window_size = 1) +#' ) +#' } +#' #' @importFrom tibble tibble #' #' @export diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd index 603e42da..78676a96 100644 --- a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd +++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/matrix.R \name{create_train_and_predict_matrices} \alias{create_train_and_predict_matrices} -\title{Create a training and prediction matrices for a given ahead.} +\title{Create training and testing data matrices and a training response vector for a given ahead.} \usage{ create_train_and_predict_matrices(lagged_df, ahead, training_window_size) } @@ -12,7 +12,7 @@ create_train_and_predict_matrices(lagged_df, ahead, training_window_size) \item{\code{geo_value}}{Strings of geographic locations.} \item{\code{time_value}}{Dates of training data.} \item{Covariate columns}{Columns with names of the form \code{value-{days}:{signal}} or -\verb{value+0:\{signal\} whose values correspond to }{signal}\verb{}{days}\code{before}time_value\verb{\} \\item\{Response columns\}\{Columns with names of the form }response+{n}:{response}\verb{whose values correspond to}{response}\verb{}{n}\verb{incidence period units after}time_value\verb{.\} \} A data frame in this format can be made using }covidcast::aggregate_signals()`.} +\verb{value+0:\{signal\} whose values correspond to }{signal}\verb{}{days}\code{before}time_value\verb{\} \\item\{Response columns\}\{Columns with names of the form }response+{n}:{response}\verb{whose values correspond to}{response}\verb{}{n}\verb{incidence period units after}time_value\verb{.\} \} A data frame in this format can be made using }covidcast::aggregate_signals()\code{and}modeltools::get_response_columns()`.} \item{ahead}{Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast} @@ -25,8 +25,7 @@ Named list with entries: \itemize{ \item{\code{train_x}}{Matrix of training data whose columns correspond to the \code{value-{days}:{signal}} columns in \code{lagged_df}. The training data consists of the -latest date \code{d} such that there is an observed response at time \code{d + ahead} and all data -from the \code{training_window_size} days prior to it.} +latest date \code{d} such that there is an observed response at time \code{d + ahead * incidence_period}, plus all data from the \code{training_window_size} days prior to it.} \item{\code{train_y}}{Vector of response data from the \code{response+{ahead}:{response}} column of \code{lagged_df} corresponding to the rows of \code{train_x}.} \item{\code{predict_x}}{Matrix of prediction data in the same format as \code{train_x}. The @@ -35,5 +34,24 @@ prediction data contains the most recent \code{training_window_size} days.} } } \description{ -Create a training and prediction matrices for a given ahead. +Create training and testing data matrices and a training response vector for a given ahead. +} +\examples{ +\dontrun{ +create_train_and_predict_matrices( + tibble( + geo_value = rep(c("az", "wv"), 5), + time_value = rep( + as.Date(c("2021-01-25", "2021-01-26", "2021-01-27", "2021-01-28", "2021-01-29")), + each = 2), + `value-2:signal_1` = seq(-3, 6), + `value-1:signal_1` = seq(-1, 8), + `value+0:signal_1` = seq(1, 10), + `response+2:signal_1` = c(seq(5, 10), rep(NA, 4)) + ), + ahead = 2, + training_window_size = 1) +) +} + } From 83d9db23145cb0740ffa7a01552a7ebbe5fed404 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Mon, 22 Mar 2021 15:47:48 -0400 Subject: [PATCH 16/36] brief comment updates --- R-packages/modeltools/R/matrix.R | 8 ++++---- .../modeltools/man/create_train_and_predict_matrices.Rd | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R index 902ef49f..3279f7f2 100644 --- a/R-packages/modeltools/R/matrix.R +++ b/R-packages/modeltools/R/matrix.R @@ -20,8 +20,8 @@ #' \itemize{ #' \item{`train_x`}{Matrix of training data whose columns correspond to the #' `value-{days}:{signal}` columns in `lagged_df`. The training data consists of the -#' latest date `d` such that there is an observed response at time `d + ahead * -#' incidence_period`, plus all data from the `training_window_size` days prior to it.} +#' latest date with an non-null response, plus all data from the `training_window_size` +#' days prior to it.} #' \item{`train_y`}{Vector of response data from the `response+{ahead}:{response}` column of #' `lagged_df` corresponding to the rows of `train_x`.} #' \item{`predict_x`}{Matrix of prediction data in the same format as `train_x`. The @@ -53,7 +53,7 @@ create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_ out <- list() train_df <- lagged_df %>% - select(geo_value, time_value, tidyselect::matches("^value(\\+0|-)")) + select(geo_value, time_value, tidyselect::starts_with("value")) # Find the last possible date of training data response_end_date <- lagged_df %>% @@ -80,7 +80,7 @@ create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_ # Prediction matrices out$predict_x <- lagged_df %>% filter(time_value == max(time_value)) %>% - select(tidyselect::matches("^value(\\+0|-)")) %>% + select(tidyselect::starts_with("value")) %>% as.matrix() out$predict_geo_values <- lagged_df %>% filter(time_value == max(time_value)) %>% diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd index 78676a96..b7889bd7 100644 --- a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd +++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd @@ -25,7 +25,8 @@ Named list with entries: \itemize{ \item{\code{train_x}}{Matrix of training data whose columns correspond to the \code{value-{days}:{signal}} columns in \code{lagged_df}. The training data consists of the -latest date \code{d} such that there is an observed response at time \code{d + ahead * incidence_period}, plus all data from the \code{training_window_size} days prior to it.} +latest date with an non-null response, plus all data from the \code{training_window_size} +days prior to it.} \item{\code{train_y}}{Vector of response data from the \code{response+{ahead}:{response}} column of \code{lagged_df} corresponding to the rows of \code{train_x}.} \item{\code{predict_x}}{Matrix of prediction data in the same format as \code{train_x}. The From ce50871d62695720400b81697a82dec81817d056 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Mon, 22 Mar 2021 16:02:30 -0400 Subject: [PATCH 17/36] add test for multiple aheads --- R-packages/modeltools/NAMESPACE | 1 + R-packages/modeltools/R/matrix.R | 8 ++++++ .../modeltools/tests/testthat/test-matrix.R | 25 +++++++++++++++++-- 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/R-packages/modeltools/NAMESPACE b/R-packages/modeltools/NAMESPACE index 9d353a1c..7786c780 100644 --- a/R-packages/modeltools/NAMESPACE +++ b/R-packages/modeltools/NAMESPACE @@ -13,6 +13,7 @@ export(pct_change) export(quantgen_forecaster) export(quiet) export(slide_by_geo) +importFrom(assertthat,assert_that) importFrom(dplyr,"%>%") importFrom(dplyr,arrange) importFrom(dplyr,between) diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R index 3279f7f2..d7a1e7c9 100644 --- a/R-packages/modeltools/R/matrix.R +++ b/R-packages/modeltools/R/matrix.R @@ -47,11 +47,19 @@ #' } #' #' @importFrom tibble tibble +#' @importFrom assertthat assert_that #' #' @export create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_size) { out <- list() + # make sure the response columns are unique + responses_at_ahead <- lagged_df %>% + select(tidyselect::starts_with(sprintf("response+%i:", ahead))) %>% + ncol() + assert_that(responses_at_ahead == 1, + msg=paste("multiple responses at ahead =",ahead)) + train_df <- lagged_df %>% select(geo_value, time_value, tidyselect::starts_with("value")) diff --git a/R-packages/modeltools/tests/testthat/test-matrix.R b/R-packages/modeltools/tests/testthat/test-matrix.R index 3f955659..2dcf3e5e 100644 --- a/R-packages/modeltools/tests/testthat/test-matrix.R +++ b/R-packages/modeltools/tests/testthat/test-matrix.R @@ -12,9 +12,7 @@ test_that("training and prediction matrices are created", { `value-1:signal_2` = seq(9, 18), `value+0:signal_1` = seq(1, 10), `value+0:signal_2` = seq(11, 20), - `response+1:signal_1` = c(seq(3, 10), rep(NA, 2)), `response+1:signal_2` = c(seq(13, 20), rep(NA, 2)), - `response+2:signal_1` = c(seq(5, 10), rep(NA, 4)), `response+2:signal_2` = c(seq(15, 20), rep(NA, 4)) ) @@ -42,3 +40,26 @@ test_that("training and prediction matrices are created", { ) expect_equal(out$predict_geo_values, c("az", "wv")) }) + +test_that("fails with multiple responses", { + df <- tibble( + geo_value = rep(c("az", "wv"), 5), + time_value = rep( + as.Date(c("2021-01-25", "2021-01-26", "2021-01-27", "2021-01-28", "2021-01-29")), + each = 2), + `value-2:signal_1` = seq(-3, 6), + `value-2:signal_2` = seq(7, 16), + `value-1:signal_1` = seq(-1, 8), + `value-1:signal_2` = seq(9, 18), + `value+0:signal_1` = seq(1, 10), + `value+0:signal_2` = seq(11, 20), + `response+1:signal_1` = c(seq(3, 10), rep(NA, 2)), + `response+1:signal_2` = c(seq(13, 20), rep(NA, 2)), + `response+2:signal_1` = c(seq(5, 10), rep(NA, 4)), + `response+2:signal_2` = c(seq(15, 20), rep(NA, 4)) + ) + expect_error( + create_train_and_predict_matrices(df, 2, 1), + "multiple responses at ahead = 2" + ) +}) From 98bb3475fe4f28daf7846159398de47afe71d972 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Mon, 22 Mar 2021 19:57:54 -0400 Subject: [PATCH 18/36] remove library call --- R-packages/modeltools/NAMESPACE | 1 + R-packages/modeltools/R/response.R | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/R-packages/modeltools/NAMESPACE b/R-packages/modeltools/NAMESPACE index 53039634..10575f99 100644 --- a/R-packages/modeltools/NAMESPACE +++ b/R-packages/modeltools/NAMESPACE @@ -13,6 +13,7 @@ export(pct_change) export(quantgen_forecaster) export(quiet) export(slide_by_geo) +importFrom(assertthat,assert_that) importFrom(dplyr,"%>%") importFrom(dplyr,arrange) importFrom(dplyr,between) diff --git a/R-packages/modeltools/R/response.R b/R-packages/modeltools/R/response.R index a190cb17..eacc9228 100644 --- a/R-packages/modeltools/R/response.R +++ b/R-packages/modeltools/R/response.R @@ -1,5 +1,3 @@ -library(assertthat) - #' Compute the corresponding observed response values for a set of signals. #' #' The response for an observed data point at time `t` is defined as the mean of observed values on @@ -39,6 +37,8 @@ library(assertthat) #' ) #' } #' +#' @importFrom assertthat assert_that +#' #' @export add_response_columns <- function(signals, response_name, From 70a11d445f619edd36c67c436324f3e4bb6bcf73 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Mon, 22 Mar 2021 20:16:47 -0400 Subject: [PATCH 19/36] update description --- R-packages/modeltools/DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R-packages/modeltools/DESCRIPTION b/R-packages/modeltools/DESCRIPTION index ccbff0e8..c3af2159 100755 --- a/R-packages/modeltools/DESCRIPTION +++ b/R-packages/modeltools/DESCRIPTION @@ -19,7 +19,8 @@ RoxygenNote: 7.1.1 Remotes: github::cmu-delphi/covidcast/R-packages/covidcast@main, github::cmu-delphi/covidcast/R-packages/evalcast@main -Imports: +Imports: + assertthat, covidcast, dplyr, evalcast, From be9a5357a2c62425789448db3702baba11e38be4 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Wed, 24 Mar 2021 15:11:22 -0400 Subject: [PATCH 20/36] make example forecaster --- R-packages/modeltools/NAMESPACE | 3 +- R-packages/modeltools/R/example_forecaster.R | 31 ++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 R-packages/modeltools/R/example_forecaster.R diff --git a/R-packages/modeltools/NAMESPACE b/R-packages/modeltools/NAMESPACE index 1f8a03bf..1ed016f6 100644 --- a/R-packages/modeltools/NAMESPACE +++ b/R-packages/modeltools/NAMESPACE @@ -7,9 +7,10 @@ export(Median) export(Min) export(Start) export(Sum) -export(create_train_and_predict_matrices) export(add_response_columns) +export(create_train_and_predict_matrices) export(estimate_deriv) +export(example_forecaster) export(pct_change) export(quantgen_forecaster) export(quiet) diff --git a/R-packages/modeltools/R/example_forecaster.R b/R-packages/modeltools/R/example_forecaster.R new file mode 100644 index 00000000..74c6df55 --- /dev/null +++ b/R-packages/modeltools/R/example_forecaster.R @@ -0,0 +1,31 @@ +#' @export +example_forecaster <- function(df_list, forecast_date) { + ahead <- 1:4 + lags <- c(0, -1, -2, -3, -7, -14) + quantiles <- modeltools::covidhub_probs() + + covariates <- covidcast::aggregate_signals(df_list, dt = lags, format = "wide") + covariates_with_response <- add_response_columns(covariates, + "jhu-csse_confirmed_incidence_num", + forecast_date, + "epiweek", + ahead) + + results <- list() + for (a in ahead) { + mats <- create_train_and_predict_matrices(covariates_with_response, a, 14) + model <- quantgen::quantile_lasso(mats$train_x, mats$train_y, quantiles, lambda = 0) + predictions <- quantgen:::predict.quantile_genlasso(model, mats$predict_x) + print(predictions) + colnames(predictions) <- quantiles + predict_df <- bind_cols(geo_value = mats$predict_geo_values, + predictions) %>% + pivot_longer(cols = -geo_value, + names_to = "quantile", + values_to = "value") %>% + mutate(ahead = a) + + results[[a]] <- predict_df + } + return(bind_rows(results)) +} \ No newline at end of file From fb76ee0fc5ecd8a8e01cbccddd3ebb1b97798067 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Wed, 24 Mar 2021 15:43:38 -0400 Subject: [PATCH 21/36] update example_forecaster documentation --- R-packages/modeltools/R/example_forecaster.R | 16 +++++++++++ .../modeltools/man/example_forecaster.Rd | 27 +++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 R-packages/modeltools/man/example_forecaster.Rd diff --git a/R-packages/modeltools/R/example_forecaster.R b/R-packages/modeltools/R/example_forecaster.R index 74c6df55..b9980abd 100644 --- a/R-packages/modeltools/R/example_forecaster.R +++ b/R-packages/modeltools/R/example_forecaster.R @@ -1,3 +1,19 @@ +#' Simple example forecaster that forecasts confirmed cases. +#' +#' @param df_list List of downloaded covidcast signals +#' @param forecast_date Date from which the forecast should be made. +#' +#' preds <- evalcast::get_predictions(modeltools::example_forecaster, +#' "example", +#' tibble( +#' data_source = "jhu-csse", +#' signal = "confirmed_incidence_num", +#' geo_type = "state", +#' start_date = as.Date("2021-01-01"), +#' ), +#' as.Date("2021-03-01"), +#' "epiweek") +#' #' @export example_forecaster <- function(df_list, forecast_date) { ahead <- 1:4 diff --git a/R-packages/modeltools/man/example_forecaster.Rd b/R-packages/modeltools/man/example_forecaster.Rd new file mode 100644 index 00000000..b743fb09 --- /dev/null +++ b/R-packages/modeltools/man/example_forecaster.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/example_forecaster.R +\name{example_forecaster} +\alias{example_forecaster} +\title{Simple example forecaster that forecasts confirmed cases.} +\usage{ +example_forecaster(df_list, forecast_date) +} +\arguments{ +\item{df_list}{List of downloaded covidcast signals} + +\item{forecast_date}{Date from which the forecast should be made. + +preds <- evalcast::get_predictions(modeltools::example_forecaster, +"example", +tibble( +data_source = "jhu-csse", +signal = "confirmed_incidence_num", +geo_type = "state", +start_date = as.Date("2021-01-01"), +), +as.Date("2021-03-01"), +"epiweek")} +} +\description{ +Simple example forecaster that forecasts confirmed cases. +} From ee2e40b9342a23137c694f75e550cca1ff787dbb Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Wed, 24 Mar 2021 15:44:54 -0400 Subject: [PATCH 22/36] update quantgen forecaster to use new signature --- R-packages/modeltools/R/quantgen.R | 14 +++++++------- R-packages/modeltools/man/quantgen_forecaster.Rd | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/R-packages/modeltools/R/quantgen.R b/R-packages/modeltools/R/quantgen.R index 8364156c..386e1585 100644 --- a/R-packages/modeltools/R/quantgen.R +++ b/R-packages/modeltools/R/quantgen.R @@ -6,7 +6,7 @@ #' vignette](https://cmu-delphi.github.io/covidcast/modeltoolsR/articles/quantgen-forecast.html) #' for examples. #' -#' @param df Data frame of signal values to use for forecasting, of the format +#' @param df_list List of data frames of signal values to use for forecasting, of the format #' that is returned by [covidcast::covidcast_signals()]. #' @param forecast_date Date object or string of the form "YYYY-MM-DD", #' indicating the date on which forecasts will be made. For example, if @@ -109,7 +109,7 @@ #' @importFrom dplyr filter select pull summarize between bind_cols #' @importFrom tidyr pivot_longer #' @export -quantgen_forecaster = function(df, forecast_date, signals, incidence_period, +quantgen_forecaster = function(df_list, forecast_date, signals, incidence_period, ahead, geo_type, n = 4 * ifelse(incidence_period == "day", 7, 1), lags = 0, tau = modeltools::covidhub_probs, @@ -138,8 +138,8 @@ quantgen_forecaster = function(df, forecast_date, signals, incidence_period, if (is.null(inv_trans)) { stop("If `transform` is specified, then `inv_trans` must be as well.") } - for (i in 1:length(df)) { - df[[i]] = df[[i]] %>% mutate(value = transform[[i]](value)) + for (i in 1:length(df_list)) { + df_list[[i]] = df_list[[i]] %>% mutate(value = transform[[i]](value)) } } @@ -147,9 +147,9 @@ quantgen_forecaster = function(df, forecast_date, signals, incidence_period, # shift, for each ahead value, for convenience later dt = lapply(lags, "-") dt[[1]] = c(dt[[1]], ahead) - + # Append shifts, and aggregate into wide format - df_wide = covidcast::aggregate_signals(df, dt = dt, format = "wide") + df_wide = covidcast::aggregate_signals(df_list, dt = dt, format = "wide") # Separate out into feature data frame, featurize if we need to df_features = df_wide %>% @@ -214,7 +214,7 @@ quantgen_forecaster = function(df, forecast_date, signals, incidence_period, train_end_date - n + 1, train_end_date)) %>% select(tidyselect::starts_with(sprintf("value+%i:", a))) %>% pull() - + # Define noncrossing constraints, if we need to if (noncross) { if (noncross_points == "all") x0 = rbind(x, newx) diff --git a/R-packages/modeltools/man/quantgen_forecaster.Rd b/R-packages/modeltools/man/quantgen_forecaster.Rd index 816108c2..b810303d 100644 --- a/R-packages/modeltools/man/quantgen_forecaster.Rd +++ b/R-packages/modeltools/man/quantgen_forecaster.Rd @@ -5,7 +5,7 @@ \title{Simple quantile autoregressive forecaster based on \code{quantgen}} \usage{ quantgen_forecaster( - df, + df_list, forecast_date, signals, incidence_period, @@ -25,7 +25,7 @@ quantgen_forecaster( ) } \arguments{ -\item{df}{Data frame of signal values to use for forecasting, of the format +\item{df_list}{List of data frames of signal values to use for forecasting, of the format that is returned by \code{\link[covidcast:covidcast_signals]{covidcast::covidcast_signals()}}.} \item{forecast_date}{Date object or string of the form "YYYY-MM-DD", From bd2a3f71397171bf83d0065feb97d1e11cc60ae0 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Wed, 24 Mar 2021 15:47:03 -0400 Subject: [PATCH 23/36] cleanup --- R-packages/modeltools/R/example_forecaster.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-packages/modeltools/R/example_forecaster.R b/R-packages/modeltools/R/example_forecaster.R index b9980abd..1dda5c90 100644 --- a/R-packages/modeltools/R/example_forecaster.R +++ b/R-packages/modeltools/R/example_forecaster.R @@ -32,7 +32,7 @@ example_forecaster <- function(df_list, forecast_date) { mats <- create_train_and_predict_matrices(covariates_with_response, a, 14) model <- quantgen::quantile_lasso(mats$train_x, mats$train_y, quantiles, lambda = 0) predictions <- quantgen:::predict.quantile_genlasso(model, mats$predict_x) - print(predictions) + colnames(predictions) <- quantiles predict_df <- bind_cols(geo_value = mats$predict_geo_values, predictions) %>% @@ -44,4 +44,4 @@ example_forecaster <- function(df_list, forecast_date) { results[[a]] <- predict_df } return(bind_rows(results)) -} \ No newline at end of file +} From 75dc1b4dba1fa22e9cfe0b57aa21cef709fb6812 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Thu, 25 Mar 2021 10:58:38 -0400 Subject: [PATCH 24/36] move values to arguments --- R-packages/modeltools/NAMESPACE | 2 ++ R-packages/modeltools/R/example_forecaster.R | 23 +++++++++++++------ .../modeltools/man/example_forecaster.Rd | 22 +++++++++++++++--- 3 files changed, 37 insertions(+), 10 deletions(-) diff --git a/R-packages/modeltools/NAMESPACE b/R-packages/modeltools/NAMESPACE index 1ed016f6..23705baa 100644 --- a/R-packages/modeltools/NAMESPACE +++ b/R-packages/modeltools/NAMESPACE @@ -17,9 +17,11 @@ export(quiet) export(slide_by_geo) importFrom(assertthat,assert_that) importFrom(dplyr,"%>%") +importFrom(dplyr,"bind_cols,") importFrom(dplyr,arrange) importFrom(dplyr,between) importFrom(dplyr,bind_cols) +importFrom(dplyr,bind_rows) importFrom(dplyr,filter) importFrom(dplyr,group_by) importFrom(dplyr,group_modify) diff --git a/R-packages/modeltools/R/example_forecaster.R b/R-packages/modeltools/R/example_forecaster.R index 1dda5c90..596d248f 100644 --- a/R-packages/modeltools/R/example_forecaster.R +++ b/R-packages/modeltools/R/example_forecaster.R @@ -1,7 +1,12 @@ #' Simple example forecaster that forecasts confirmed cases. #' -#' @param df_list List of downloaded covidcast signals +#' @param df_list List of downloaded covidcast signals. #' @param forecast_date Date from which the forecast should be made. +#' @param ahead Vector of epiweeks ahead to predict. +#' @param training_window_size Number of days prior to `forecast_date` to use for training. +#' @param lags Vector of lags or list of vectors of lags to apply to `df_list`. See documentation +#' of `dt` argument to `covidcast::aggregate_signals()` for details. +#' @param quantiles Vector of quantile values at which to predict #' #' preds <- evalcast::get_predictions(modeltools::example_forecaster, #' "example", @@ -14,12 +19,16 @@ #' as.Date("2021-03-01"), #' "epiweek") #' +#' @importFrom dplyr mutate bind_cols, bind_rows +#' @importFrom tidyr pivot_longer +#' #' @export -example_forecaster <- function(df_list, forecast_date) { - ahead <- 1:4 - lags <- c(0, -1, -2, -3, -7, -14) - quantiles <- modeltools::covidhub_probs() - +example_forecaster <- function(df_list, + forecast_date, + ahead = 1:4, + training_window_size = 14, + lags = seq(0, -1*training_window_size, -7), + quantiles = modeltools::covidhub_probs) { covariates <- covidcast::aggregate_signals(df_list, dt = lags, format = "wide") covariates_with_response <- add_response_columns(covariates, "jhu-csse_confirmed_incidence_num", @@ -29,7 +38,7 @@ example_forecaster <- function(df_list, forecast_date) { results <- list() for (a in ahead) { - mats <- create_train_and_predict_matrices(covariates_with_response, a, 14) + mats <- create_train_and_predict_matrices(covariates_with_response, a, training_window_size) model <- quantgen::quantile_lasso(mats$train_x, mats$train_y, quantiles, lambda = 0) predictions <- quantgen:::predict.quantile_genlasso(model, mats$predict_x) diff --git a/R-packages/modeltools/man/example_forecaster.Rd b/R-packages/modeltools/man/example_forecaster.Rd index b743fb09..03143770 100644 --- a/R-packages/modeltools/man/example_forecaster.Rd +++ b/R-packages/modeltools/man/example_forecaster.Rd @@ -4,12 +4,28 @@ \alias{example_forecaster} \title{Simple example forecaster that forecasts confirmed cases.} \usage{ -example_forecaster(df_list, forecast_date) +example_forecaster( + df_list, + forecast_date, + ahead = 1:4, + training_window_size = 14, + lags = seq(0, -1 * training_window_size, -7), + quantiles = modeltools::covidhub_probs +) } \arguments{ -\item{df_list}{List of downloaded covidcast signals} +\item{df_list}{List of downloaded covidcast signals.} -\item{forecast_date}{Date from which the forecast should be made. +\item{forecast_date}{Date from which the forecast should be made.} + +\item{ahead}{Vector of epiweeks ahead to predict.} + +\item{training_window_size}{Number of days prior to \code{forecast_date} to use for training.} + +\item{lags}{Vector of lags or list of vectors of lags to apply to \code{df_list}. See documentation +of \code{dt} argument to \code{covidcast::aggregate_signals()} for details.} + +\item{quantiles}{Vector of quantile values at which to predict preds <- evalcast::get_predictions(modeltools::example_forecaster, "example", From 0927475e300597e5fb072833dfc55d71a74da1c9 Mon Sep 17 00:00:00 2001 From: Mike O'Brien Date: Fri, 26 Mar 2021 08:57:37 -0400 Subject: [PATCH 25/36] remove stray comma --- R-packages/modeltools/NAMESPACE | 1 - R-packages/modeltools/R/example_forecaster.R | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/R-packages/modeltools/NAMESPACE b/R-packages/modeltools/NAMESPACE index 23705baa..62e60157 100644 --- a/R-packages/modeltools/NAMESPACE +++ b/R-packages/modeltools/NAMESPACE @@ -17,7 +17,6 @@ export(quiet) export(slide_by_geo) importFrom(assertthat,assert_that) importFrom(dplyr,"%>%") -importFrom(dplyr,"bind_cols,") importFrom(dplyr,arrange) importFrom(dplyr,between) importFrom(dplyr,bind_cols) diff --git a/R-packages/modeltools/R/example_forecaster.R b/R-packages/modeltools/R/example_forecaster.R index 596d248f..4b6ccc60 100644 --- a/R-packages/modeltools/R/example_forecaster.R +++ b/R-packages/modeltools/R/example_forecaster.R @@ -19,7 +19,7 @@ #' as.Date("2021-03-01"), #' "epiweek") #' -#' @importFrom dplyr mutate bind_cols, bind_rows +#' @importFrom dplyr mutate bind_cols bind_rows #' @importFrom tidyr pivot_longer #' #' @export From 6cd72a7eb4f46694465169b04e4195b9c26528ed Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Fri, 26 Mar 2021 14:54:01 -0700 Subject: [PATCH 26/36] export train_predict_matrices --- R-packages/modeltools/DESCRIPTION | 8 +++++--- R-packages/modeltools/R/matrix.R | 6 ++++-- .../modeltools/man/create_train_and_predict_matrices.Rd | 1 + 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/R-packages/modeltools/DESCRIPTION b/R-packages/modeltools/DESCRIPTION index c3af2159..d9426422 100755 --- a/R-packages/modeltools/DESCRIPTION +++ b/R-packages/modeltools/DESCRIPTION @@ -17,8 +17,9 @@ LazyData: true Roxygen: list(markdown = TRUE) RoxygenNote: 7.1.1 Remotes: - github::cmu-delphi/covidcast/R-packages/covidcast@main, - github::cmu-delphi/covidcast/R-packages/evalcast@main + cmu-delphi/covidcast/R-packages/covidcast@main, + cmu-delphi/covidcast/R-packages/evalcast@main + ryantibs/quantgen/R-package/quantgen@master, Imports: assertthat, covidcast, @@ -27,4 +28,5 @@ Imports: genlasso, slider, tidyselect, - tidyr + tidyr, + quantgen diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R index d7a1e7c9..7070c738 100644 --- a/R-packages/modeltools/R/matrix.R +++ b/R-packages/modeltools/R/matrix.R @@ -27,6 +27,7 @@ #' \item{`predict_x`}{Matrix of prediction data in the same format as `train_x`. The #' prediction data contains the most recent `training_window_size` days.} #' \item{`predict_geo_values`}{Vector of `geo_values` corresponding to the rows of `predict_x`.} +#' \item{`train_end_date`}{latest `time_value` used in the training period} #' } #' #' @examples \dontrun{ @@ -49,7 +50,7 @@ #' @importFrom tibble tibble #' @importFrom assertthat assert_that #' -#' @export +#' @export create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_size) { out <- list() @@ -63,7 +64,7 @@ create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_ train_df <- lagged_df %>% select(geo_value, time_value, tidyselect::starts_with("value")) - # Find the last possible date of training data + # Find the last possible date of training data response_end_date <- lagged_df %>% select(time_value, tidyselect::starts_with(sprintf("response+%i:", ahead))) %>% tidyr::drop_na() %>% @@ -93,6 +94,7 @@ create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_ out$predict_geo_values <- lagged_df %>% filter(time_value == max(time_value)) %>% select(geo_value) %>% pull() + out$train_end_date <- train_end_date return(out) } diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd index b7889bd7..6f552ed3 100644 --- a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd +++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd @@ -32,6 +32,7 @@ days prior to it.} \item{\code{predict_x}}{Matrix of prediction data in the same format as \code{train_x}. The prediction data contains the most recent \code{training_window_size} days.} \item{\code{predict_geo_values}}{Vector of \code{geo_values} corresponding to the rows of \code{predict_x}.} +\item{\code{train_end_date}}{latest \code{time_value} used in the training period} } } \description{ From 08c5b66c8bea7d07bbff2a05c99039ad3305feb8 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Fri, 26 Mar 2021 14:55:15 -0700 Subject: [PATCH 27/36] fix documentation --- R-packages/modeltools/R/matrix.R | 1 - R-packages/modeltools/man/create_train_and_predict_matrices.Rd | 1 - 2 files changed, 2 deletions(-) diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R index 7070c738..77db82fc 100644 --- a/R-packages/modeltools/R/matrix.R +++ b/R-packages/modeltools/R/matrix.R @@ -44,7 +44,6 @@ #' ), #' ahead = 2, #' training_window_size = 1) -#' ) #' } #' #' @importFrom tibble tibble diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd index 6f552ed3..189af173 100644 --- a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd +++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd @@ -53,7 +53,6 @@ create_train_and_predict_matrices( ), ahead = 2, training_window_size = 1) -) } } From da8b866231f00ab6dc7ffd13046345a5e5aff4b3 Mon Sep 17 00:00:00 2001 From: Kenneth Tay Date: Thu, 1 Apr 2021 13:42:42 -0700 Subject: [PATCH 28/36] Fix failing test in test-matrix.R. --- R-packages/modeltools/tests/testthat/test-matrix.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R-packages/modeltools/tests/testthat/test-matrix.R b/R-packages/modeltools/tests/testthat/test-matrix.R index 2dcf3e5e..88c04541 100644 --- a/R-packages/modeltools/tests/testthat/test-matrix.R +++ b/R-packages/modeltools/tests/testthat/test-matrix.R @@ -18,7 +18,8 @@ test_that("training and prediction matrices are created", { out <- create_train_and_predict_matrices(df, 2, 1) - expect_equal(names(out), c("train_x", "train_y", "predict_x", "predict_geo_values")) + expect_equal(names(out), c("train_x", "train_y", "predict_x", + "predict_geo_values", "train_end_date")) expect_equal(out$train_x, as.matrix(tibble( `value-2:signal_1` = c(1, 2), From d907095d617566b86a977b063853767c02501b33 Mon Sep 17 00:00:00 2001 From: Kenneth Tay Date: Thu, 1 Apr 2021 13:53:38 -0700 Subject: [PATCH 29/36] Fix documentation for create_train_and_predict_matrices(). --- R-packages/modeltools/R/matrix.R | 30 +++++++++--------- .../man/create_train_and_predict_matrices.Rd | 31 +++++++++++-------- 2 files changed, 33 insertions(+), 28 deletions(-) diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R index 77db82fc..280ae6b3 100644 --- a/R-packages/modeltools/R/matrix.R +++ b/R-packages/modeltools/R/matrix.R @@ -1,13 +1,13 @@ #' Create training and testing data matrices and a training response vector for a given ahead. #' -#' @param lagged_df Data frame of lagged data. It should have the following columns: +#' @param lagged_df Data frame of lagged data. It should have the following columns: #' \itemize{ -#' \item{`geo_value`}{Strings of geographic locations.} -#' \item{`time_value`}{Dates of training data.} -#' \item{Covariate columns}{Columns with names of the form `value-{days}:{signal}` or -#' `value+0:{signal} whose values correspond to `{signal}` `{days}` before `time_value`} -#' \item{Response columns}{Columns with names of the form `response+{n}:{response}` whose values -#' correspond to `{response}` `{n}` incidence period units after `time_value`.} +#' \item `geo_value`: Strings of geographic locations. +#' \item `time_value`: Dates of training data. +#' \item Covariate columns: Columns with names of the form `value-{days}:{signal}` or +#' `value+0:{signal}` whose values correspond to `{signal}` `{days}` before `time_value`. +#' \item Response columns: Columns with names of the form `response+{n}:{response}` whose values +#' correspond to `{response}` `{n}` incidence period units after `time_value`. #' } #' A data frame in this format can be made using `covidcast::aggregate_signals()` and #' `modeltools::get_response_columns()`. @@ -18,16 +18,16 @@ #' #' @return Named list with entries: #' \itemize{ -#' \item{`train_x`}{Matrix of training data whose columns correspond to the +#' \item `train_x`: Matrix of training data whose columns correspond to the #' `value-{days}:{signal}` columns in `lagged_df`. The training data consists of the #' latest date with an non-null response, plus all data from the `training_window_size` -#' days prior to it.} -#' \item{`train_y`}{Vector of response data from the `response+{ahead}:{response}` column of -#' `lagged_df` corresponding to the rows of `train_x`.} -#' \item{`predict_x`}{Matrix of prediction data in the same format as `train_x`. The -#' prediction data contains the most recent `training_window_size` days.} -#' \item{`predict_geo_values`}{Vector of `geo_values` corresponding to the rows of `predict_x`.} -#' \item{`train_end_date`}{latest `time_value` used in the training period} +#' days prior to it. +#' \item `train_y`: Vector of response data from the `response+{ahead}:{response}` column of +#' `lagged_df` corresponding to the rows of `train_x`. +#' \item `predict_x`: Matrix of prediction data in the same format as `train_x`. The +#' prediction data contains the most recent `training_window_size` days. +#' \item `predict_geo_values`: Vector of `geo_values` corresponding to the rows of `predict_x`. +#' \item `train_end_date`: Latest `time_value` used in the training period. #' } #' #' @examples \dontrun{ diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd index 189af173..1b6b02fe 100644 --- a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd +++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd @@ -7,12 +7,17 @@ create_train_and_predict_matrices(lagged_df, ahead, training_window_size) } \arguments{ -\item{lagged_df}{Data frame of lagged data. It should have the following columns: +\item{lagged_df}{Data frame of lagged data. It should have the following columns: \itemize{ -\item{\code{geo_value}}{Strings of geographic locations.} -\item{\code{time_value}}{Dates of training data.} -\item{Covariate columns}{Columns with names of the form \code{value-{days}:{signal}} or -\verb{value+0:\{signal\} whose values correspond to }{signal}\verb{}{days}\code{before}time_value\verb{\} \\item\{Response columns\}\{Columns with names of the form }response+{n}:{response}\verb{whose values correspond to}{response}\verb{}{n}\verb{incidence period units after}time_value\verb{.\} \} A data frame in this format can be made using }covidcast::aggregate_signals()\code{and}modeltools::get_response_columns()`.} +\item \code{geo_value}: Strings of geographic locations. +\item \code{time_value}: Dates of training data. +\item Covariate columns: Columns with names of the form \code{value-{days}:{signal}} or +\code{value+0:{signal}} whose values correspond to \code{{signal}} \code{{days}} before \code{time_value}. +\item Response columns: Columns with names of the form \code{response+{n}:{response}} whose values +correspond to \code{{response}} \code{{n}} incidence period units after \code{time_value}. +} +A data frame in this format can be made using \code{covidcast::aggregate_signals()} and +\code{modeltools::get_response_columns()}.} \item{ahead}{Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast} @@ -23,16 +28,16 @@ data from December 1 to December 14.} \value{ Named list with entries: \itemize{ -\item{\code{train_x}}{Matrix of training data whose columns correspond to the +\item \code{train_x}: Matrix of training data whose columns correspond to the \code{value-{days}:{signal}} columns in \code{lagged_df}. The training data consists of the latest date with an non-null response, plus all data from the \code{training_window_size} -days prior to it.} -\item{\code{train_y}}{Vector of response data from the \code{response+{ahead}:{response}} column of -\code{lagged_df} corresponding to the rows of \code{train_x}.} -\item{\code{predict_x}}{Matrix of prediction data in the same format as \code{train_x}. The -prediction data contains the most recent \code{training_window_size} days.} -\item{\code{predict_geo_values}}{Vector of \code{geo_values} corresponding to the rows of \code{predict_x}.} -\item{\code{train_end_date}}{latest \code{time_value} used in the training period} +days prior to it. +\item \code{train_y}: Vector of response data from the \code{response+{ahead}:{response}} column of +\code{lagged_df} corresponding to the rows of \code{train_x}. +\item \code{predict_x}: Matrix of prediction data in the same format as \code{train_x}. The +prediction data contains the most recent \code{training_window_size} days. +\item \code{predict_geo_values}: Vector of \code{geo_values} corresponding to the rows of \code{predict_x}. +\item \code{train_end_date}: Latest \code{time_value} used in the training period. } } \description{ From 90c85e67a6ca3ead4857879a6242d0f24eb11a8d Mon Sep 17 00:00:00 2001 From: Kenneth Tay Date: Thu, 1 Apr 2021 22:24:45 -0700 Subject: [PATCH 30/36] Modify create_train_and_predict_matrices() to create matrices for multiple ahead values. --- R-packages/modeltools/R/matrix.R | 110 ++++++++++-------- .../man/create_train_and_predict_matrices.Rd | 10 +- .../modeltools/tests/testthat/test-matrix.R | 29 ++++- 3 files changed, 98 insertions(+), 51 deletions(-) diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R index 280ae6b3..1b2c1fe7 100644 --- a/R-packages/modeltools/R/matrix.R +++ b/R-packages/modeltools/R/matrix.R @@ -1,4 +1,7 @@ -#' Create training and testing data matrices and a training response vector for a given ahead. +#' Create training and test data matrices and a training response for given aheads. +#' +#' Create training and test data matrices and training response for a set of +#' given aheads. Works for both single ahead values and a vector of ahead values. #' #' @param lagged_df Data frame of lagged data. It should have the following columns: #' \itemize{ @@ -11,7 +14,10 @@ #' } #' A data frame in this format can be made using `covidcast::aggregate_signals()` and #' `modeltools::get_response_columns()`. -#' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast +#' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast. +#' Can be a single positive integer or a vector of positive integers. Note that +#' for each `{a}` in `ahead`, the column `response+{a}:{response}` should be +#' present in `lagged_df`. #' @param training_window_size Size of the local training window in days to use. For example, if #' `training_window_size = 14`, then to make a 1-day-ahead forecast on December 15, we train on #' data from December 1 to December 14. @@ -51,49 +57,59 @@ #' #' @export create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_size) { - out <- list() - - # make sure the response columns are unique - responses_at_ahead <- lagged_df %>% - select(tidyselect::starts_with(sprintf("response+%i:", ahead))) %>% - ncol() - assert_that(responses_at_ahead == 1, - msg=paste("multiple responses at ahead =",ahead)) - - train_df <- lagged_df %>% - select(geo_value, time_value, tidyselect::starts_with("value")) - - # Find the last possible date of training data - response_end_date <- lagged_df %>% - select(time_value, tidyselect::starts_with(sprintf("response+%i:", ahead))) %>% - tidyr::drop_na() %>% - summarize(max(time_value)) %>% - pull() - train_end_date <- min(max(lagged_df$time_value), response_end_date) - - # Training matrices - out$train_x <- train_df %>% - filter(between(time_value, - train_end_date - training_window_size + 1, - train_end_date)) %>% - select(-c(geo_value, time_value)) %>% - as.matrix() - out$train_y <- lagged_df %>% - filter(between(time_value, - train_end_date - training_window_size + 1, - train_end_date)) %>% - select(tidyselect::starts_with(sprintf("response+%i:", ahead))) %>% - pull() - - # Prediction matrices - out$predict_x <- lagged_df %>% - filter(time_value == max(time_value)) %>% - select(tidyselect::starts_with("value")) %>% - as.matrix() - out$predict_geo_values <- lagged_df %>% - filter(time_value == max(time_value)) %>% - select(geo_value) %>% pull() - out$train_end_date <- train_end_date - - return(out) + all_out <- list() + + for (a in ahead) { + out <- list() # matrices corresponding to ahead+a + + # make sure the response columns are unique + responses_at_ahead <- lagged_df %>% + select(tidyselect::starts_with(sprintf("response+%i:", a))) %>% + ncol() + assert_that(responses_at_ahead == 1, + msg = paste("multiple responses at ahead =", a)) + + train_df <- lagged_df %>% + select(geo_value, time_value, tidyselect::starts_with("value")) + + # Find the last possible date of training data + response_end_date <- lagged_df %>% + select(time_value, tidyselect::starts_with(sprintf("response+%i:", a))) %>% + tidyr::drop_na() %>% + summarize(max(time_value)) %>% + pull() + train_end_date <- min(max(lagged_df$time_value), response_end_date) + + # Training matrices + out$train_x <- train_df %>% + filter(between(time_value, + train_end_date - training_window_size + 1, + train_end_date)) %>% + select(-c(geo_value, time_value)) %>% + as.matrix() + out$train_y <- lagged_df %>% + filter(between(time_value, + train_end_date - training_window_size + 1, + train_end_date)) %>% + select(tidyselect::starts_with(sprintf("response+%i:", a))) %>% + pull() + + # Prediction matrices + out$predict_x <- lagged_df %>% + filter(time_value == max(time_value)) %>% + select(tidyselect::starts_with("value")) %>% + as.matrix() + out$predict_geo_values <- lagged_df %>% + filter(time_value == max(time_value)) %>% + select(geo_value) %>% pull() + out$train_end_date <- train_end_date + + all_out[[paste0("ahead+", a)]] <- out + } + + if (length(ahead) == 1) { + return(all_out[[1]]) + } else { + return(all_out) + } } diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd index 1b6b02fe..cc0288b2 100644 --- a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd +++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/matrix.R \name{create_train_and_predict_matrices} \alias{create_train_and_predict_matrices} -\title{Create training and testing data matrices and a training response vector for a given ahead.} +\title{Create training and test data matrices and a training response for given aheads.} \usage{ create_train_and_predict_matrices(lagged_df, ahead, training_window_size) } @@ -19,7 +19,10 @@ correspond to \code{{response}} \code{{n}} incidence period units after \code{ti A data frame in this format can be made using \code{covidcast::aggregate_signals()} and \code{modeltools::get_response_columns()}.} -\item{ahead}{Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast} +\item{ahead}{Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast. +Can be a single positive integer or a vector of positive integers. Note that +for each \code{{a}} in \code{ahead}, the column \code{response+{a}:{response}} should be +present in \code{lagged_df}.} \item{training_window_size}{Size of the local training window in days to use. For example, if \code{training_window_size = 14}, then to make a 1-day-ahead forecast on December 15, we train on @@ -41,7 +44,8 @@ prediction data contains the most recent \code{training_window_size} days. } } \description{ -Create training and testing data matrices and a training response vector for a given ahead. +Create training and test data matrices and training response for a set of +given aheads. Works for both single ahead values and a vector of ahead values. } \examples{ \dontrun{ diff --git a/R-packages/modeltools/tests/testthat/test-matrix.R b/R-packages/modeltools/tests/testthat/test-matrix.R index 88c04541..e38d51b0 100644 --- a/R-packages/modeltools/tests/testthat/test-matrix.R +++ b/R-packages/modeltools/tests/testthat/test-matrix.R @@ -16,7 +16,8 @@ test_that("training and prediction matrices are created", { `response+2:signal_2` = c(seq(15, 20), rep(NA, 4)) ) - out <- create_train_and_predict_matrices(df, 2, 1) + out <- create_train_and_predict_matrices(df, ahead = 2, + training_window_size = 1) expect_equal(names(out), c("train_x", "train_y", "predict_x", "predict_geo_values", "train_end_date")) @@ -42,6 +43,32 @@ test_that("training and prediction matrices are created", { expect_equal(out$predict_geo_values, c("az", "wv")) }) +test_that("training and prediction matrices for multiple aheads (separate)", { + df <- tibble( + geo_value = rep(c("az", "wv"), 5), + time_value = rep( + as.Date(c("2021-01-25", "2021-01-26", "2021-01-27", "2021-01-28", "2021-01-29")), + each = 2), + `value-2:signal_1` = seq(-3, 6), + `value-2:signal_2` = seq(7, 16), + `value-1:signal_1` = seq(-1, 8), + `value-1:signal_2` = seq(9, 18), + `value+0:signal_1` = seq(1, 10), + `value+0:signal_2` = seq(11, 20), + `response+1:signal_2` = c(seq(13, 20), rep(NA, 2)), + `response+2:signal_2` = c(seq(15, 20), rep(NA, 4)) + ) + + out <- create_train_and_predict_matrices(df, ahead = c(1, 2), + training_window_size = 1) + + expect_equal(length(out), 2) + expect_equal(out[[1]], create_train_and_predict_matrices(df, ahead = 1, + training_window_size = 1)) + expect_equal(out[[2]], create_train_and_predict_matrices(df, ahead = 2, + training_window_size = 1)) +}) + test_that("fails with multiple responses", { df <- tibble( geo_value = rep(c("az", "wv"), 5), From 32ecee3652c7d788ad680a97c48e2d1239c48ba5 Mon Sep 17 00:00:00 2001 From: Kenneth Tay Date: Thu, 1 Apr 2021 22:55:48 -0700 Subject: [PATCH 31/36] Modify create_train_and_predict_matrices() to be able to return matrix of responses for multiple aheads. --- R-packages/modeltools/R/matrix.R | 107 +++++++++++++----- .../man/create_train_and_predict_matrices.Rd | 19 +++- .../modeltools/tests/testthat/test-matrix.R | 48 ++++++++ 3 files changed, 146 insertions(+), 28 deletions(-) diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R index 1b2c1fe7..0df026be 100644 --- a/R-packages/modeltools/R/matrix.R +++ b/R-packages/modeltools/R/matrix.R @@ -2,6 +2,9 @@ #' #' Create training and test data matrices and training response for a set of #' given aheads. Works for both single ahead values and a vector of ahead values. +#' For multiple ahead values, the function has the ability to return separate +#' data matrices and responses for each ahead, or a single data matrix and +#' response matrix for all aheads at once. #' #' @param lagged_df Data frame of lagged data. It should have the following columns: #' \itemize{ @@ -21,8 +24,10 @@ #' @param training_window_size Size of the local training window in days to use. For example, if #' `training_window_size = 14`, then to make a 1-day-ahead forecast on December 15, we train on #' data from December 1 to December 14. +#' @param aheads_separate If `length(ahead) > 1`, should there be separate +#' data matrices and responses for each ahead? Default is `TRUE`. #' -#' @return Named list with entries: +#' @return For a single ahead value, named list with entries: #' \itemize{ #' \item `train_x`: Matrix of training data whose columns correspond to the #' `value-{days}:{signal}` columns in `lagged_df`. The training data consists of the @@ -35,6 +40,10 @@ #' \item `predict_geo_values`: Vector of `geo_values` corresponding to the rows of `predict_x`. #' \item `train_end_date`: Latest `time_value` used in the training period. #' } +#' For multiple ahead values and `aheads_separate = TRUE`, a list having +#' the same length as `ahead`, with each element being a named list as above. +#' For multiple ahead values and `ahead_separate = FALSE`, a named list as +#' above, except `train_y` is a matrix of responses rather than a vector. #' #' @examples \dontrun{ #' create_train_and_predict_matrices( @@ -56,32 +65,89 @@ #' @importFrom assertthat assert_that #' #' @export -create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_size) { +create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_size, + aheads_separate = TRUE) { all_out <- list() + # prediction matrices are the same for all aheads + predict_x <- lagged_df %>% + filter(time_value == max(time_value)) %>% + select(tidyselect::starts_with("value")) %>% + as.matrix() + predict_geo_values <- lagged_df %>% + filter(time_value == max(time_value)) %>% + select(geo_value) %>% pull() + + # make sure the response columns are unique for (a in ahead) { - out <- list() # matrices corresponding to ahead+a - - # make sure the response columns are unique responses_at_ahead <- lagged_df %>% select(tidyselect::starts_with(sprintf("response+%i:", a))) %>% ncol() assert_that(responses_at_ahead == 1, msg = paste("multiple responses at ahead =", a)) + } + + if (aheads_separate) { + for (a in ahead) { + out <- list() # matrices corresponding to ahead+a + + # Find the last possible date of training data + response_end_date <- lagged_df %>% + select(time_value, tidyselect::starts_with(sprintf("response+%i:", a))) %>% + tidyr::drop_na() %>% + summarize(max(time_value)) %>% + pull() + train_end_date <- min(max(lagged_df$time_value), response_end_date) + + # Training matrices + out$train_x <- lagged_df %>% + select(geo_value, time_value, tidyselect::starts_with("value")) %>% + filter(between(time_value, + train_end_date - training_window_size + 1, + train_end_date)) %>% + select(-c(geo_value, time_value)) %>% + as.matrix() + out$train_y <- lagged_df %>% + filter(between(time_value, + train_end_date - training_window_size + 1, + train_end_date)) %>% + select(tidyselect::starts_with(sprintf("response+%i:", a))) %>% + pull() + + # Add prediction matrices and training end date + out$predict_x <- predict_x + out$predict_geo_values <- predict_geo_values + out$train_end_date <- train_end_date + + all_out[[paste0("ahead+", a)]] <- out + } - train_df <- lagged_df %>% - select(geo_value, time_value, tidyselect::starts_with("value")) + if (length(ahead) == 1) { + return(all_out[[1]]) + } else { + return(all_out) + } + } else { + # We want a single training data matrix and a matrix of + # training responses containing all the aheads + # Need to recompute training_window_size + training_window_size <- training_window_size + max(ahead) - min(ahead) + + out <- list() # Find the last possible date of training data + # (corresponds to the smallest ahead) response_end_date <- lagged_df %>% - select(time_value, tidyselect::starts_with(sprintf("response+%i:", a))) %>% + select(time_value, + tidyselect::starts_with(sprintf("response+%i:", min(ahead)))) %>% tidyr::drop_na() %>% summarize(max(time_value)) %>% pull() train_end_date <- min(max(lagged_df$time_value), response_end_date) # Training matrices - out$train_x <- train_df %>% + out$train_x <- lagged_df %>% + select(geo_value, time_value, tidyselect::starts_with("value")) %>% filter(between(time_value, train_end_date - training_window_size + 1, train_end_date)) %>% @@ -91,25 +157,14 @@ create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_ filter(between(time_value, train_end_date - training_window_size + 1, train_end_date)) %>% - select(tidyselect::starts_with(sprintf("response+%i:", a))) %>% - pull() - - # Prediction matrices - out$predict_x <- lagged_df %>% - filter(time_value == max(time_value)) %>% - select(tidyselect::starts_with("value")) %>% + select(tidyselect::starts_with(paste0("response+", ahead, ":"))) %>% as.matrix() - out$predict_geo_values <- lagged_df %>% - filter(time_value == max(time_value)) %>% - select(geo_value) %>% pull() + + # Add prediction matrices and training end date + out$predict_x <- predict_x + out$predict_geo_values <- predict_geo_values out$train_end_date <- train_end_date - all_out[[paste0("ahead+", a)]] <- out - } - - if (length(ahead) == 1) { - return(all_out[[1]]) - } else { - return(all_out) + return(out) } } diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd index cc0288b2..c9bbad74 100644 --- a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd +++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd @@ -4,7 +4,12 @@ \alias{create_train_and_predict_matrices} \title{Create training and test data matrices and a training response for given aheads.} \usage{ -create_train_and_predict_matrices(lagged_df, ahead, training_window_size) +create_train_and_predict_matrices( + lagged_df, + ahead, + training_window_size, + aheads_separate = TRUE +) } \arguments{ \item{lagged_df}{Data frame of lagged data. It should have the following columns: @@ -27,9 +32,12 @@ present in \code{lagged_df}.} \item{training_window_size}{Size of the local training window in days to use. For example, if \code{training_window_size = 14}, then to make a 1-day-ahead forecast on December 15, we train on data from December 1 to December 14.} + +\item{aheads_separate}{If \code{length(ahead) > 1}, should there be separate +data matrices and responses for each ahead? Default is \code{TRUE}.} } \value{ -Named list with entries: +For a single ahead value, named list with entries: \itemize{ \item \code{train_x}: Matrix of training data whose columns correspond to the \code{value-{days}:{signal}} columns in \code{lagged_df}. The training data consists of the @@ -42,10 +50,17 @@ prediction data contains the most recent \code{training_window_size} days. \item \code{predict_geo_values}: Vector of \code{geo_values} corresponding to the rows of \code{predict_x}. \item \code{train_end_date}: Latest \code{time_value} used in the training period. } +For multiple ahead values and \code{aheads_separate = TRUE}, a list having +the same length as \code{ahead}, with each element being a named list as above. +For multiple ahead values and \code{ahead_separate = FALSE}, a named list as +above, except \code{train_y} is a matrix of responses rather than a vector. } \description{ Create training and test data matrices and training response for a set of given aheads. Works for both single ahead values and a vector of ahead values. +For multiple ahead values, the function has the ability to return separate +data matrices and responses for each ahead, or a single data matrix and +response matrix for all aheads at once. } \examples{ \dontrun{ diff --git a/R-packages/modeltools/tests/testthat/test-matrix.R b/R-packages/modeltools/tests/testthat/test-matrix.R index e38d51b0..9c787555 100644 --- a/R-packages/modeltools/tests/testthat/test-matrix.R +++ b/R-packages/modeltools/tests/testthat/test-matrix.R @@ -69,6 +69,54 @@ test_that("training and prediction matrices for multiple aheads (separate)", { training_window_size = 1)) }) +test_that("training and prediction matrices for multiple aheads (together)", { + df <- tibble( + geo_value = rep(c("az", "wv"), 5), + time_value = rep( + as.Date(c("2021-01-25", "2021-01-26", "2021-01-27", "2021-01-28", "2021-01-29")), + each = 2), + `value-2:signal_1` = seq(-3, 6), + `value-2:signal_2` = seq(7, 16), + `value-1:signal_1` = seq(-1, 8), + `value-1:signal_2` = seq(9, 18), + `value+0:signal_1` = seq(1, 10), + `value+0:signal_2` = seq(11, 20), + `response+1:signal_2` = c(seq(13, 20), rep(NA, 2)), + `response+2:signal_2` = c(seq(15, 20), rep(NA, 4)) + ) + + out <- create_train_and_predict_matrices(df, ahead = c(1, 2), + training_window_size = 1, + aheads_separate = FALSE) + + expect_equal(names(out), c("train_x", "train_y", "predict_x", + "predict_geo_values", "train_end_date")) + expect_equal(out$train_x, + as.matrix(tibble( + `value-2:signal_1` = c(1, 2, 3, 4), + `value-2:signal_2` = c(11, 12, 13, 14), + `value-1:signal_1` = c(3, 4, 5, 6), + `value-1:signal_2` = c(13, 14, 15, 16), + `value+0:signal_1` = c(5, 6, 7, 8), + `value+0:signal_2` = c(15, 16, 17, 18))) + ) + expect_equal(out$train_y, + as.matrix(tibble( + `response+1:signal_2` = c(17, 18, 19, 20), + `response+2:signal_2` = c(19, 20, NA, NA))) + ) + expect_equal(out$predict_x, + as.matrix(tibble( + `value-2:signal_1` = c(5, 6), + `value-2:signal_2` = c(15, 16), + `value-1:signal_1` = c(7, 8), + `value-1:signal_2` = c(17, 18), + `value+0:signal_1` = c(9, 10), + `value+0:signal_2` = c(19, 20))) + ) + expect_equal(out$predict_geo_values, c("az", "wv")) +}) + test_that("fails with multiple responses", { df <- tibble( geo_value = rep(c("az", "wv"), 5), From 091bc6064d25a4c96df52fde47848e6393e0ee25 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 2 Apr 2021 23:45:48 -0700 Subject: [PATCH 32/36] add missing comma --- R-packages/modeltools/DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-packages/modeltools/DESCRIPTION b/R-packages/modeltools/DESCRIPTION index d9426422..deb84f29 100755 --- a/R-packages/modeltools/DESCRIPTION +++ b/R-packages/modeltools/DESCRIPTION @@ -18,7 +18,7 @@ Roxygen: list(markdown = TRUE) RoxygenNote: 7.1.1 Remotes: cmu-delphi/covidcast/R-packages/covidcast@main, - cmu-delphi/covidcast/R-packages/evalcast@main + cmu-delphi/covidcast/R-packages/evalcast@main, ryantibs/quantgen/R-package/quantgen@master, Imports: assertthat, From fe231fee3877fbd935047371e2928765c88b409a Mon Sep 17 00:00:00 2001 From: Kenneth Tay Date: Mon, 5 Apr 2021 13:25:20 -0700 Subject: [PATCH 33/36] Refactor create_train_and_predict_matrices(). --- R-packages/modeltools/DESCRIPTION | 1 + R-packages/modeltools/NAMESPACE | 1 + R-packages/modeltools/R/matrix.R | 218 +++++++++++------- .../man/create_train_and_predict_matrices.Rd | 15 +- .../modeltools/man/create_train_matrices.Rd | 45 ++++ .../modeltools/man/get_train_end_date.Rd | 24 ++ .../modeltools/tests/testthat/test-matrix.R | 19 +- 7 files changed, 227 insertions(+), 96 deletions(-) create mode 100644 R-packages/modeltools/man/create_train_matrices.Rd create mode 100644 R-packages/modeltools/man/get_train_end_date.Rd diff --git a/R-packages/modeltools/DESCRIPTION b/R-packages/modeltools/DESCRIPTION index d9426422..b1584303 100755 --- a/R-packages/modeltools/DESCRIPTION +++ b/R-packages/modeltools/DESCRIPTION @@ -26,6 +26,7 @@ Imports: dplyr, evalcast, genlasso, + purrr, slider, tidyselect, tidyr, diff --git a/R-packages/modeltools/NAMESPACE b/R-packages/modeltools/NAMESPACE index 62e60157..de638773 100644 --- a/R-packages/modeltools/NAMESPACE +++ b/R-packages/modeltools/NAMESPACE @@ -33,6 +33,7 @@ importFrom(genlasso,coef.genlasso) importFrom(genlasso,cv.trendfilter) importFrom(genlasso,trendfilter) importFrom(lubridate,days) +importFrom(purrr,map2) importFrom(stats,coef) importFrom(stats,lsfit) importFrom(stats,predict) diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R index 0df026be..4ba09dda 100644 --- a/R-packages/modeltools/R/matrix.R +++ b/R-packages/modeltools/R/matrix.R @@ -17,15 +17,15 @@ #' } #' A data frame in this format can be made using `covidcast::aggregate_signals()` and #' `modeltools::get_response_columns()`. -#' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast. -#' Can be a single positive integer or a vector of positive integers. Note that -#' for each `{a}` in `ahead`, the column `response+{a}:{response}` should be -#' present in `lagged_df`. +#' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) +#' ahead to forecast. Can be a single positive integer or a vector of +#' positive integers. Note that for each `{a}` in `ahead`, the column +#' `response+{a}:{response}` should be present in `lagged_df`. #' @param training_window_size Size of the local training window in days to use. For example, if #' `training_window_size = 14`, then to make a 1-day-ahead forecast on December 15, we train on #' data from December 1 to December 14. #' @param aheads_separate If `length(ahead) > 1`, should there be separate -#' data matrices and responses for each ahead? Default is `TRUE`. +#' data matrices and responses for each ahead? Default is `TRUE`. #' #' @return For a single ahead value, named list with entries: #' \itemize{ @@ -35,10 +35,15 @@ #' days prior to it. #' \item `train_y`: Vector of response data from the `response+{ahead}:{response}` column of #' `lagged_df` corresponding to the rows of `train_x`. +#' \item `train_geo_values`: Vector of geo values corresponding to the rows +#' of `train_x`. +#' \item `train_time_values`: Vector of time values corresponding to the rows +#' of `train_x`. +#' \item `train_end_date`: Latest `time_value` used in the training period. #' \item `predict_x`: Matrix of prediction data in the same format as `train_x`. The #' prediction data contains the most recent `training_window_size` days. #' \item `predict_geo_values`: Vector of `geo_values` corresponding to the rows of `predict_x`. -#' \item `train_end_date`: Latest `time_value` used in the training period. +#' \item `predict_time_value`: Time value corresponding to `predict_x`. #' } #' For multiple ahead values and `aheads_separate = TRUE`, a list having #' the same length as `ahead`, with each element being a named list as above. @@ -63,21 +68,11 @@ #' #' @importFrom tibble tibble #' @importFrom assertthat assert_that +#' @importFrom purrr map2 #' #' @export create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_size, aheads_separate = TRUE) { - all_out <- list() - - # prediction matrices are the same for all aheads - predict_x <- lagged_df %>% - filter(time_value == max(time_value)) %>% - select(tidyselect::starts_with("value")) %>% - as.matrix() - predict_geo_values <- lagged_df %>% - filter(time_value == max(time_value)) %>% - select(geo_value) %>% pull() - # make sure the response columns are unique for (a in ahead) { responses_at_ahead <- lagged_df %>% @@ -87,84 +82,133 @@ create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_ msg = paste("multiple responses at ahead =", a)) } + # prediction matrices are the same for all aheads + predict_time_value <- max(lagged_df$time_value) + predict_x <- lagged_df %>% + filter(time_value == predict_time_value) %>% + select(tidyselect::starts_with("value")) %>% + as.matrix() + predict_geo_values <- lagged_df %>% + filter(time_value == predict_time_value) %>% + select(geo_value) %>% pull() + + all_out <- list() + if (aheads_separate) { - for (a in ahead) { - out <- list() # matrices corresponding to ahead+a - - # Find the last possible date of training data - response_end_date <- lagged_df %>% - select(time_value, tidyselect::starts_with(sprintf("response+%i:", a))) %>% - tidyr::drop_na() %>% - summarize(max(time_value)) %>% - pull() - train_end_date <- min(max(lagged_df$time_value), response_end_date) - - # Training matrices - out$train_x <- lagged_df %>% - select(geo_value, time_value, tidyselect::starts_with("value")) %>% - filter(between(time_value, - train_end_date - training_window_size + 1, - train_end_date)) %>% - select(-c(geo_value, time_value)) %>% - as.matrix() - out$train_y <- lagged_df %>% - filter(between(time_value, - train_end_date - training_window_size + 1, - train_end_date)) %>% - select(tidyselect::starts_with(sprintf("response+%i:", a))) %>% - pull() - - # Add prediction matrices and training end date - out$predict_x <- predict_x - out$predict_geo_values <- predict_geo_values - out$train_end_date <- train_end_date - - all_out[[paste0("ahead+", a)]] <- out - } + train_end_dates <- Reduce(c, lapply(ahead, + function(a) get_train_end_date(lagged_df, a))) - if (length(ahead) == 1) { - return(all_out[[1]]) - } else { - return(all_out) - } - } else { - # We want a single training data matrix and a matrix of - # training responses containing all the aheads + all_out <- purrr::map2(ahead, train_end_dates, + ~ create_train_matrices(lagged_df, .x, training_window_size, + .y)) + } else { # ahead_separate = FALSE: We want a single training data matrix and + # a matrix of training responses containing all the aheads # Need to recompute training_window_size training_window_size <- training_window_size + max(ahead) - min(ahead) - out <- list() - # Find the last possible date of training data # (corresponds to the smallest ahead) - response_end_date <- lagged_df %>% - select(time_value, - tidyselect::starts_with(sprintf("response+%i:", min(ahead)))) %>% - tidyr::drop_na() %>% - summarize(max(time_value)) %>% - pull() - train_end_date <- min(max(lagged_df$time_value), response_end_date) + train_end_date <- get_train_end_date(lagged_df, ahead) # Training matrices - out$train_x <- lagged_df %>% - select(geo_value, time_value, tidyselect::starts_with("value")) %>% - filter(between(time_value, - train_end_date - training_window_size + 1, - train_end_date)) %>% - select(-c(geo_value, time_value)) %>% - as.matrix() - out$train_y <- lagged_df %>% - filter(between(time_value, - train_end_date - training_window_size + 1, - train_end_date)) %>% - select(tidyselect::starts_with(paste0("response+", ahead, ":"))) %>% - as.matrix() - - # Add prediction matrices and training end date - out$predict_x <- predict_x - out$predict_geo_values <- predict_geo_values - out$train_end_date <- train_end_date - - return(out) + all_out[[1]] <- create_train_matrices(lagged_df, ahead, + training_window_size, + train_end_date) } + + # Add prediction matrices / info + for (i in seq_along(all_out)) { + all_out[[i]]$predict_x <- predict_x + all_out[[i]]$predict_geo_values <- predict_geo_values + all_out[[i]]$predict_time_value <- predict_time_value + } + + if (length(ahead) == 1 || aheads_separate == FALSE) { + return(all_out[[1]]) + } else { + names(all_out) <- paste0("ahead+", ahead) + return(all_out) + } +} + +#' Get last possible date of training data for given aheads. +#' +#' Returns the last possible date of the training data for a given set of +#' aheads. If more than one ahead is given, the date return corresponds to the +#' last possible date corresponding to the smallest ahead value. This is because +#' the smallest ahead value will have the latest possible date. +#' +#' @param lagged_df Data frame of lagged data as in `create_train_and_predict_matrices()`. +#' @param ahead Number of days ahead to forecast. Can be a single positive +#' integer or a vector of positive integers. +#' +#' @return Single date corresponding to the last possible date of the training +#' data. +get_train_end_date <- function(lagged_df, ahead) { + response_end_date <- lagged_df %>% + select(time_value, + tidyselect::starts_with(sprintf("response+%i:", min(ahead)))) %>% + tidyr::drop_na() %>% + summarize(max(time_value)) %>% + pull() + train_end_date <- min(max(lagged_df$time_value), response_end_date) + return(train_end_date) +} + +#' Create training data matrix and a training response for given aheads. +#' +#' Create training and data matrix and training response for a set of +#' given aheads. Works for both single ahead values and a vector of ahead +#' values. However, note that this function works different from +#' `create_train_and_predict_matrices()` for multiple ahead values. If +#' multiple ahead values are supplied, we return a matrix of responses +#' containing all aheads at once. +#' +#' @param lagged_df Data frame of lagged data as in `create_train_and_predict_matrices()`. +#' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) +#' ahead to forecast. Can be a single positive integer or a vector of +#' positive integers. Note that for each `{a}` in `ahead`, the column +#' `response+{a}:{response}` should be present in `lagged_df`. +#' @param n_days Number of days worth of data to pull. +#' @param train_end_date The last date to be included in the training data. +#' +#' @return A named list with entries: +#' \itemize{ +#' \item `train_x`: Matrix of training data whose columns correspond to the +#' `value-{days}:{signal}` columns in `lagged_df`. The training data consists of the +#' latest date with an non-null response, plus all data from the `training_window_size` +#' days prior to it. +#' \item `train_y`: Vector of response data from the `response+{ahead}:{response}` column of +#' `lagged_df` corresponding to the rows of `train_x`. If multiple ahead +#' values are provided, then this is a matrix instead. +#' \item `train_geo_values`: Vector of geo values corresponding to the rows +#' of `train_x`. +#' \item `train_time_values`: Vector of time values corresponding to the rows +#' of `train_x`. +#' \item `train_end_date`: Latest `time_value` used in the training period. +#' } +create_train_matrices <- function(lagged_df, ahead, n_days, + train_end_date) { + train_df <- lagged_df %>% + filter(between(time_value, + train_end_date - n_days + 1, + train_end_date)) + out <- list() + out$train_x <- train_df %>% + select(tidyselect::starts_with("value")) %>% + as.matrix() + + train_y <- train_df %>% + select(tidyselect::starts_with(paste0("response+", ahead, ":"))) + if (length(ahead) == 1) { + out$train_y <- pull(train_y) + } else { + out$train_y <- as.matrix(train_y) + } + + out$train_geo_values <- train_df$geo_value + out$train_time_values <- train_df$time_value + out$train_end_date <- as.Date(train_end_date) + + return(out) } diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd index c9bbad74..cb8c485f 100644 --- a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd +++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd @@ -24,10 +24,10 @@ correspond to \code{{response}} \code{{n}} incidence period units after \code{ti A data frame in this format can be made using \code{covidcast::aggregate_signals()} and \code{modeltools::get_response_columns()}.} -\item{ahead}{Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast. -Can be a single positive integer or a vector of positive integers. Note that -for each \code{{a}} in \code{ahead}, the column \code{response+{a}:{response}} should be -present in \code{lagged_df}.} +\item{ahead}{Number of incidence period units (i.e., epiweeks, days, etc.) +ahead to forecast. Can be a single positive integer or a vector of +positive integers. Note that for each \code{{a}} in \code{ahead}, the column +\code{response+{a}:{response}} should be present in \code{lagged_df}.} \item{training_window_size}{Size of the local training window in days to use. For example, if \code{training_window_size = 14}, then to make a 1-day-ahead forecast on December 15, we train on @@ -45,10 +45,15 @@ latest date with an non-null response, plus all data from the \code{training_win days prior to it. \item \code{train_y}: Vector of response data from the \code{response+{ahead}:{response}} column of \code{lagged_df} corresponding to the rows of \code{train_x}. +\item \code{train_geo_values}: Vector of geo values corresponding to the rows +of \code{train_x}. +\item \code{train_time_values}: Vector of time values corresponding to the rows +of \code{train_x}. +\item \code{train_end_date}: Latest \code{time_value} used in the training period. \item \code{predict_x}: Matrix of prediction data in the same format as \code{train_x}. The prediction data contains the most recent \code{training_window_size} days. \item \code{predict_geo_values}: Vector of \code{geo_values} corresponding to the rows of \code{predict_x}. -\item \code{train_end_date}: Latest \code{time_value} used in the training period. +\item \code{predict_time_value}: Time value corresponding to \code{predict_x}. } For multiple ahead values and \code{aheads_separate = TRUE}, a list having the same length as \code{ahead}, with each element being a named list as above. diff --git a/R-packages/modeltools/man/create_train_matrices.Rd b/R-packages/modeltools/man/create_train_matrices.Rd new file mode 100644 index 00000000..9b27f24c --- /dev/null +++ b/R-packages/modeltools/man/create_train_matrices.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrix.R +\name{create_train_matrices} +\alias{create_train_matrices} +\title{Create training data matrix and a training response for given aheads.} +\usage{ +create_train_matrices(lagged_df, ahead, n_days, train_end_date) +} +\arguments{ +\item{lagged_df}{Data frame of lagged data as in \code{create_train_and_predict_matrices()}.} + +\item{ahead}{Number of incidence period units (i.e., epiweeks, days, etc.) +ahead to forecast. Can be a single positive integer or a vector of +positive integers. Note that for each \code{{a}} in \code{ahead}, the column +\code{response+{a}:{response}} should be present in \code{lagged_df}.} + +\item{n_days}{Number of days worth of data to pull.} + +\item{train_end_date}{The last date to be included in the training data.} +} +\value{ +A named list with entries: +\itemize{ +\item \code{train_x}: Matrix of training data whose columns correspond to the +\code{value-{days}:{signal}} columns in \code{lagged_df}. The training data consists of the +latest date with an non-null response, plus all data from the \code{training_window_size} +days prior to it. +\item \code{train_y}: Vector of response data from the \code{response+{ahead}:{response}} column of +\code{lagged_df} corresponding to the rows of \code{train_x}. If multiple ahead +values are provided, then this is a matrix instead. +\item \code{train_geo_values}: Vector of geo values corresponding to the rows +of \code{train_x}. +\item \code{train_time_values}: Vector of time values corresponding to the rows +of \code{train_x}. +\item \code{train_end_date}: Latest \code{time_value} used in the training period. +} +} +\description{ +Create training and data matrix and training response for a set of +given aheads. Works for both single ahead values and a vector of ahead +values. However, note that this function works different from +\code{create_train_and_predict_matrices()} for multiple ahead values. If +multiple ahead values are supplied, we return a matrix of responses +containing all aheads at once. +} diff --git a/R-packages/modeltools/man/get_train_end_date.Rd b/R-packages/modeltools/man/get_train_end_date.Rd new file mode 100644 index 00000000..8b4c1a9a --- /dev/null +++ b/R-packages/modeltools/man/get_train_end_date.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrix.R +\name{get_train_end_date} +\alias{get_train_end_date} +\title{Get last possible date of training data for given aheads.} +\usage{ +get_train_end_date(lagged_df, ahead) +} +\arguments{ +\item{lagged_df}{Data frame of lagged data as in \code{create_train_and_predict_matrices()}.} + +\item{ahead}{Number of days ahead to forecast. Can be a single positive +integer or a vector of positive integers.} +} +\value{ +Single date corresponding to the last possible date of the training +data. +} +\description{ +Returns the last possible date of the training data for a given set of +aheads. If more than one ahead is given, the date return corresponds to the +last possible date corresponding to the smallest ahead value. This is because +the smallest ahead value will have the latest possible date. +} diff --git a/R-packages/modeltools/tests/testthat/test-matrix.R b/R-packages/modeltools/tests/testthat/test-matrix.R index 9c787555..c04655f4 100644 --- a/R-packages/modeltools/tests/testthat/test-matrix.R +++ b/R-packages/modeltools/tests/testthat/test-matrix.R @@ -19,8 +19,9 @@ test_that("training and prediction matrices are created", { out <- create_train_and_predict_matrices(df, ahead = 2, training_window_size = 1) - expect_equal(names(out), c("train_x", "train_y", "predict_x", - "predict_geo_values", "train_end_date")) + expect_equal(names(out), c("train_x", "train_y", "train_geo_values", + "train_time_values", "train_end_date", + "predict_x", "predict_geo_values", "predict_time_value")) expect_equal(out$train_x, as.matrix(tibble( `value-2:signal_1` = c(1, 2), @@ -31,6 +32,9 @@ test_that("training and prediction matrices are created", { `value+0:signal_2` = c(15, 16))) ) expect_equal(out$train_y, c(19, 20)) + expect_equal(out$train_geo_values, c("az", "wv")) + expect_equal(out$train_time_values, rep(as.Date("2021-01-27"), 2)) + expect_equal(out$train_end_date, as.Date("2021-01-27")) expect_equal(out$predict_x, as.matrix(tibble( `value-2:signal_1` = c(5, 6), @@ -41,6 +45,7 @@ test_that("training and prediction matrices are created", { `value+0:signal_2` = c(19, 20))) ) expect_equal(out$predict_geo_values, c("az", "wv")) + expect_equal(out$predict_time_value, as.Date("2021-01-29")) }) test_that("training and prediction matrices for multiple aheads (separate)", { @@ -89,8 +94,9 @@ test_that("training and prediction matrices for multiple aheads (together)", { training_window_size = 1, aheads_separate = FALSE) - expect_equal(names(out), c("train_x", "train_y", "predict_x", - "predict_geo_values", "train_end_date")) + expect_equal(names(out), c("train_x", "train_y", "train_geo_values", + "train_time_values", "train_end_date", + "predict_x", "predict_geo_values", "predict_time_value")) expect_equal(out$train_x, as.matrix(tibble( `value-2:signal_1` = c(1, 2, 3, 4), @@ -105,6 +111,10 @@ test_that("training and prediction matrices for multiple aheads (together)", { `response+1:signal_2` = c(17, 18, 19, 20), `response+2:signal_2` = c(19, 20, NA, NA))) ) + expect_equal(out$train_geo_values, rep(c("az", "wv"), 2)) + expect_equal(out$train_time_values, + rep(as.Date(c("2021-01-27", "2021-01-28")), each = 2)) + expect_equal(out$train_end_date, as.Date("2021-01-28")) expect_equal(out$predict_x, as.matrix(tibble( `value-2:signal_1` = c(5, 6), @@ -115,6 +125,7 @@ test_that("training and prediction matrices for multiple aheads (together)", { `value+0:signal_2` = c(19, 20))) ) expect_equal(out$predict_geo_values, c("az", "wv")) + expect_equal(out$predict_time_value, as.Date("2021-01-29")) }) test_that("fails with multiple responses", { From 61bac8a2d9340b4789fc39f90f503aeaeaef6991 Mon Sep 17 00:00:00 2001 From: Kenneth Tay Date: Mon, 5 Apr 2021 20:45:08 -0700 Subject: [PATCH 34/36] Address PR #520 comments. --- R-packages/modeltools/NAMESPACE | 2 ++ R-packages/modeltools/R/matrix.R | 12 +++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/R-packages/modeltools/NAMESPACE b/R-packages/modeltools/NAMESPACE index de638773..fbb1f629 100644 --- a/R-packages/modeltools/NAMESPACE +++ b/R-packages/modeltools/NAMESPACE @@ -32,8 +32,10 @@ importFrom(dplyr,ungroup) importFrom(genlasso,coef.genlasso) importFrom(genlasso,cv.trendfilter) importFrom(genlasso,trendfilter) +importFrom(lubridate,as_date) importFrom(lubridate,days) importFrom(purrr,map2) +importFrom(purrr,map_dbl) importFrom(stats,coef) importFrom(stats,lsfit) importFrom(stats,predict) diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R index 4ba09dda..512e7429 100644 --- a/R-packages/modeltools/R/matrix.R +++ b/R-packages/modeltools/R/matrix.R @@ -68,7 +68,8 @@ #' #' @importFrom tibble tibble #' @importFrom assertthat assert_that -#' @importFrom purrr map2 +#' @importFrom lubridate as_date +#' @importFrom purrr map_dbl map2 #' #' @export create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_size, @@ -95,12 +96,13 @@ create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_ all_out <- list() if (aheads_separate) { - train_end_dates <- Reduce(c, lapply(ahead, - function(a) get_train_end_date(lagged_df, a))) + train_end_dates <- ahead %>% + purrr::map_dbl(~ get_train_end_date(lagged_df, .x)) %>% + lubridate::as_date() all_out <- purrr::map2(ahead, train_end_dates, - ~ create_train_matrices(lagged_df, .x, training_window_size, - .y)) + ~ create_train_matrices( + lagged_df, .x, training_window_size, .y)) } else { # ahead_separate = FALSE: We want a single training data matrix and # a matrix of training responses containing all the aheads # Need to recompute training_window_size From f24ceb61bb269873df5da99bcdce397ec99782e6 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Tue, 21 Sep 2021 10:33:31 -0700 Subject: [PATCH 35/36] Update quantgen dependency folder structure --- R-packages/modeltools/DESCRIPTION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-packages/modeltools/DESCRIPTION b/R-packages/modeltools/DESCRIPTION index f1691e9e..ce25041a 100755 --- a/R-packages/modeltools/DESCRIPTION +++ b/R-packages/modeltools/DESCRIPTION @@ -1,6 +1,6 @@ Package: modeltools Type: Package -Title: Tools for Building COVID Forecasters and Related Models +Title: Tools for Building COVID Forecasters and Related Models Version: 0.1.0 Authors@R: c( @@ -19,7 +19,7 @@ RoxygenNote: 7.1.1 Remotes: cmu-delphi/covidcast/R-packages/covidcast@main, cmu-delphi/covidcast/R-packages/evalcast@main, - ryantibs/quantgen/R-package/quantgen@master, + ryantibs/quantgen/quantgen@master, Imports: assertthat, covidcast, From b4ab08789622190571347af63d58890ea562e14d Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 13 Feb 2023 15:08:49 -0800 Subject: [PATCH 36/36] Modeltools: update deps * switch evalcast@main -> evalcast@evalcast * make quantgen dependency explicit * make lubridate, MMWRweek, tibble dependencies explicit --- R-packages/modeltools/DESCRIPTION | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/R-packages/modeltools/DESCRIPTION b/R-packages/modeltools/DESCRIPTION index ce25041a..a130ff63 100755 --- a/R-packages/modeltools/DESCRIPTION +++ b/R-packages/modeltools/DESCRIPTION @@ -15,19 +15,23 @@ License: MIT Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.1.1 +RoxygenNote: 7.2.3 Remotes: cmu-delphi/covidcast/R-packages/covidcast@main, - cmu-delphi/covidcast/R-packages/evalcast@main, - ryantibs/quantgen/quantgen@master, + cmu-delphi/covidcast/R-packages/evalcast@evalcast, + ryantibs/quantgen/quantgen@master Imports: assertthat, covidcast, dplyr, - evalcast, genlasso, + lubridate, + MMWRweek, purrr, + quantgen, slider, + tibble, tidyselect, - tidyr, - quantgen + tidyr +Suggests: + evalcast