From c7f7221aa5c841014495aaf952fdfef4be13b6d8 Mon Sep 17 00:00:00 2001
From: Kenneth Tay <kjytay@gmail.com>
Date: Tue, 9 Mar 2021 14:55:46 -0800
Subject: [PATCH 01/36] Add documentation examples.

Examples added for slide_by_geo(), pct_change() and estimate_deriv().
---
 R-packages/modeltools/R/change.R            | 13 ++++++++++++
 R-packages/modeltools/R/deriv.R             | 20 +++++++++++++++++++
 R-packages/modeltools/R/slide.R             | 11 +++++++++++
 R-packages/modeltools/man/estimate_deriv.Rd | 22 +++++++++++++++++++++
 R-packages/modeltools/man/pct_change.Rd     | 15 ++++++++++++++
 R-packages/modeltools/man/slide_by_geo.Rd   | 13 ++++++++++++
 6 files changed, 94 insertions(+)

diff --git a/R-packages/modeltools/R/change.R b/R-packages/modeltools/R/change.R
index 7ad0e4e0..47990266 100644
--- a/R-packages/modeltools/R/change.R
+++ b/R-packages/modeltools/R/change.R
@@ -19,6 +19,19 @@
 #' @return A data frame given by appending a new column to `x` named according
 #'   to the `col_name` argument, containing the percentage change values. 
 #'
+#' @examples \dontrun{
+#' df <- covidcast::covidcast_signal("fb-survey", "smoothed_cli", 
+#'     start_day = "2021-01-01", 
+#'     end_day = "2021-01-31",
+#'     geo_type = "state")
+#' 
+#' # percentage change between back-to-back weeks (default, as n = 14)
+#' pct_change(df)
+#' 
+#' # percentage change between back-to-back days
+#' pct_change(df, n = 2)
+#' }
+#'
 #' @export
 pct_change = function(x, n = 14, col_name = "pct_change") {
   # Check if n is odd and if so bump it up one
diff --git a/R-packages/modeltools/R/deriv.R b/R-packages/modeltools/R/deriv.R
index 73fdbbc6..3d927f01 100644
--- a/R-packages/modeltools/R/deriv.R
+++ b/R-packages/modeltools/R/deriv.R
@@ -74,6 +74,26 @@
 #' 
 #' @return A data frame given by appending a new column to `x` named according
 #'   to the `col_name` argument, containing the estimated derivative values. 
+#'   
+#' @examples \dontrun{
+#' df <- covidcast::covidcast_signal("fb-survey", "smoothed_cli", 
+#'     start_day = "2021-01-01", 
+#'     end_day = "2021-02-28",
+#'     geo_type = "state",
+#'     geo_values = c("ca", "fl"))
+#' 
+#' # estimate derivative using linear regression and n = 10 days
+#' estimate_deriv(df, method = "lin", n = 10)
+#' 
+#' # keep the linear regression fits
+#' estimate_deriv(df, method = "lin", n = 10, keep_obj = TRUE)
+#' 
+#' # estimate derivative using smoothing spline with 8 degrees of freedom
+#' estimate_deriv(df, method = "ss", n = 28, df = 8)
+#' 
+#' # estimate derivative using trend filtering with 8 degrees of freedom
+#' estimate_deriv(df, method = "tf", n = 28, df = 8)
+#' }
 #'
 #' @export
 estimate_deriv = function(x, method = c("lin", "ss", "tf"), n = 14,
diff --git a/R-packages/modeltools/R/slide.R b/R-packages/modeltools/R/slide.R
index ecd48dd4..e3420e6e 100644
--- a/R-packages/modeltools/R/slide.R
+++ b/R-packages/modeltools/R/slide.R
@@ -29,6 +29,17 @@
 #' 
 #' @return A data frame given by appending a new column to `x` named according
 #'   to the `col_name` argument, containing the function values.
+#'   
+#' @examples \dontrun{
+#' df <- covidcast::covidcast_signal("fb-survey", "smoothed_cli", 
+#'     start_day = "2021-01-01", 
+#'     end_day = "2021-01-31",
+#'     geo_type = "state")
+#'     
+#' # two equivalent ways to compute 7-day trailing averages
+#' slide_by_geo(df, slide_fun = ~ Mean(.x$value), n = 7)
+#' slide_by_geo(df, slide_fun = function(x, ...) Mean(x$value) , n = 7)
+#' }
 #'
 #' @importFrom dplyr %>% arrange group_by group_modify mutate ungroup
 #' @importFrom lubridate days
diff --git a/R-packages/modeltools/man/estimate_deriv.Rd b/R-packages/modeltools/man/estimate_deriv.Rd
index 635ca542..cb9320d0 100644
--- a/R-packages/modeltools/man/estimate_deriv.Rd
+++ b/R-packages/modeltools/man/estimate_deriv.Rd
@@ -97,3 +97,25 @@ based on the cross-validation error curve (minimum or 1-standard-error
 rule, respectively). Default is 8 when \code{cv = FALSE}, and "1se" when \code{cv = TRUE}.}
 }
 }
+\examples{
+\dontrun{
+df <- covidcast::covidcast_signal("fb-survey", "smoothed_cli", 
+    start_day = "2021-01-01", 
+    end_day = "2021-02-28",
+    geo_type = "state",
+    geo_values = c("ca", "fl"))
+
+# estimate derivative using linear regression and n = 10 days
+estimate_deriv(df, method = "lin", n = 10)
+
+# keep the linear regression fits
+estimate_deriv(df, method = "lin", n = 10, keep_obj = TRUE)
+
+# estimate derivative using smoothing spline with 8 degrees of freedom
+estimate_deriv(df, method = "ss", n = 28, df = 8)
+
+# estimate derivative using trend filtering with 8 degrees of freedom
+estimate_deriv(df, method = "tf", n = 28, df = 8)
+}
+
+}
diff --git a/R-packages/modeltools/man/pct_change.Rd b/R-packages/modeltools/man/pct_change.Rd
index 0163c02c..f22a06e4 100644
--- a/R-packages/modeltools/man/pct_change.Rd
+++ b/R-packages/modeltools/man/pct_change.Rd
@@ -29,3 +29,18 @@ frame. (When multiple issue dates are present, only the latest issue is
 considered.)  See the \href{https://cmu-delphi.github.io/covidcast/modeltoolsR/articles/pct-change.html}{percentage change vignette}
 for examples.
 }
+\examples{
+\dontrun{
+df <- covidcast::covidcast_signal("fb-survey", "smoothed_cli", 
+    start_day = "2021-01-01", 
+    end_day = "2021-01-31",
+    geo_type = "state")
+
+# percentage change between back-to-back weeks (default, as n = 14)
+pct_change(df)
+
+# percentage change between back-to-back days
+pct_change(df, n = 2)
+}
+
+}
diff --git a/R-packages/modeltools/man/slide_by_geo.Rd b/R-packages/modeltools/man/slide_by_geo.Rd
index 33f2bdb3..8657baeb 100644
--- a/R-packages/modeltools/man/slide_by_geo.Rd
+++ b/R-packages/modeltools/man/slide_by_geo.Rd
@@ -49,3 +49,16 @@ grouped by \code{geo_value}. (When multiple issue dates are present, only the
 latest issue is considered.) See the \href{https://cmu-delphi.github.io/covidcast/modeltoolsR/articles/modeltools.html}{getting started guide}
 for examples.
 }
+\examples{
+\dontrun{
+df <- covidcast::covidcast_signal("fb-survey", "smoothed_cli", 
+    start_day = "2021-01-01", 
+    end_day = "2021-01-31",
+    geo_type = "state")
+    
+# two equivalent ways to compute 7-day trailing averages
+slide_by_geo(df, slide_fun = ~ Mean(.x$value), n = 7)
+slide_by_geo(df, slide_fun = function(x, ...) Mean(x$value) , n = 7)
+}
+
+}

From 1d2f1d40aeb470856370d59ab84863a8b91a3518 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Tue, 16 Mar 2021 19:58:23 -0400
Subject: [PATCH 02/36] function for testing and training matrices

---
 R-packages/modeltools/NAMESPACE               |  2 +
 R-packages/modeltools/R/matrix.R              | 75 +++++++++++++++++++
 .../man/create_train_and_predict_matrices.Rd  | 46 ++++++++++++
 .../modeltools/tests/testthat/test-matrix.R   | 44 +++++++++++
 4 files changed, 167 insertions(+)
 create mode 100644 R-packages/modeltools/R/matrix.R
 create mode 100644 R-packages/modeltools/man/create_train_and_predict_matrices.Rd
 create mode 100644 R-packages/modeltools/tests/testthat/test-matrix.R

diff --git a/R-packages/modeltools/NAMESPACE b/R-packages/modeltools/NAMESPACE
index 8bbd90e3..9d353a1c 100644
--- a/R-packages/modeltools/NAMESPACE
+++ b/R-packages/modeltools/NAMESPACE
@@ -7,6 +7,7 @@ export(Median)
 export(Min)
 export(Start)
 export(Sum)
+export(create_train_and_predict_matrices)
 export(estimate_deriv)
 export(pct_change)
 export(quantgen_forecaster)
@@ -32,5 +33,6 @@ importFrom(stats,coef)
 importFrom(stats,lsfit)
 importFrom(stats,predict)
 importFrom(stats,smooth.spline)
+importFrom(tibble,tibble)
 importFrom(tidyr,drop_na)
 importFrom(tidyr,pivot_longer)
diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R
new file mode 100644
index 00000000..25f95afd
--- /dev/null
+++ b/R-packages/modeltools/R/matrix.R
@@ -0,0 +1,75 @@
+#' Create a training and prediction matrices for a given ahead.
+#'
+#' @param lagged_df Data frame of lagged data.  It should have the following columns:
+#'     \itemize{
+#'     \item{`geo_value`}{Strings of geographic locations.}
+#'     \item{`time_value`}{Dates of training data.}
+#'     \item{Covariate columns}{Columns with names of the form `value-{days}:{signal}` whose values
+#'         correspond to `{signal}` `{days}` before `time_value`}
+#'     \item{Response columns}{Columns with names of the form `value+{n}:{response}` whose values
+#'         correspond to `{response}` `{n}` incidence period units after `time_value`.  Since
+#'         columns beginning with "value+" are interpretted as the response rather than a
+#'         covariate, observed data from the same day as `time_value` should be in columns
+#'         `value-0:{signal}` rather than `value+0:{signal}`}
+#'     }
+#'     A data frame in this format can be made using `covidcast::aggregate_signals()`.
+#' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast
+#' @param training_window_size Size of the local training window in days to use. For example, if
+#'     `training_window_size = 14`, then to make a 1-day-ahead forecast on December 15, we train on
+#'     data from December 1 to December 14.
+#'
+#' @return Named list with entries:
+#'     \itemize{
+#'     \item{`train_x`}{Matrix of training data whose columns correspond to the
+#'         `value-{days}:{signal}` columns in `lagged_df`.  The training data consists of the
+#'         latest date `d` such that there is an observed response at time `d + ahead` and all data
+#'         from the `training_window_size` days prior to it.}
+#'     \item{`train_y`}{Vector of response data from the `value+{ahead}:{response}` column of
+#'         `lagged_df` corresponding to the rows of `train_x`.}
+#'     \item{`predict_x`}{Matrix of prediction data in the same format as `train_x`.  The
+#'         prediction data contains the most recent `training_window_size` days.}
+#'     \item{`predict_geo_values`}{Vector of `geo_values` corresponding to the rows of `predict_x`.}
+#'     }
+#'
+#' @importFrom tibble tibble
+#'
+#' @export 
+create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_size) {
+    out <- list()
+
+    train_df <- lagged_df %>%
+        select(geo_value, time_value, tidyselect::matches("^value(\\+0|-)"))
+
+    # Find the last possible date of training data   
+    response_end_date <- lagged_df %>%
+        select(time_value, tidyselect::starts_with(sprintf("value+%i:", ahead))) %>%
+        tidyr::drop_na() %>%
+        summarize(max(time_value)) %>%
+        pull()
+    train_end_date <- min(max(lagged_df$time_value), response_end_date)
+
+    # Training matrices
+    out$train_x <- train_df %>%
+        filter(between(time_value,
+                       train_end_date - training_window_size + 1,
+                       train_end_date)) %>%
+        select(-c(geo_value, time_value)) %>%
+        as.matrix()
+    out$train_y <- lagged_df %>%
+        filter(between(time_value,
+                       train_end_date - training_window_size + 1,
+                       train_end_date)) %>%
+        select(tidyselect::starts_with(sprintf("value+%i:", ahead))) %>%
+        pull()
+
+    # Prediction matrices
+    out$predict_x <- lagged_df %>%
+        filter(time_value == max(time_value)) %>%
+        select(tidyselect::matches("^value(\\+0|-)")) %>%
+        as.matrix()
+    out$predict_geo_values <- lagged_df %>%
+        filter(time_value == max(time_value)) %>%
+        select(geo_value) %>% pull()
+
+    return(out)
+}
diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
new file mode 100644
index 00000000..9a99dd6c
--- /dev/null
+++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
@@ -0,0 +1,46 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/matrix.R
+\name{create_train_and_predict_matrices}
+\alias{create_train_and_predict_matrices}
+\title{Create a training and prediction matrices for a given ahead.}
+\usage{
+create_train_and_predict_matrices(lagged_df, ahead, training_window_size)
+}
+\arguments{
+\item{lagged_df}{Data frame of lagged data.  It should have the following columns:
+\itemize{
+\item{\code{geo_value}}{Strings of geographic locations.}
+\item{\code{time_value}}{Dates of training data.}
+\item{Covariate columns}{Columns with names of the form \code{value-{days}:{signal}} whose values
+correspond to \code{{signal}} \code{{days}} before \code{time_value}}
+\item{Response columns}{Columns with names of the form \code{value+{n}:{response}} whose values
+correspond to \code{{response}} \code{{n}} incidence period units after \code{time_value}.  Since
+columns beginning with "value+" are interpretted as the response rather than a
+covariate, observed data from the same day as \code{time_value} should be in columns
+\code{value-0:{signal}} rather than \code{value+0:{signal}}}
+}
+A data frame in this format can be made using \code{covidcast::aggregate_signals()}.}
+
+\item{ahead}{Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast}
+
+\item{training_window_size}{Size of the local training window in days to use. For example, if
+\code{training_window_size = 14}, then to make a 1-day-ahead forecast on December 15, we train on
+data from December 1 to December 14.}
+}
+\value{
+Named list with entries:
+\itemize{
+\item{\code{train_x}}{Matrix of training data whose columns correspond to the
+\code{value-{days}:{signal}} columns in \code{lagged_df}.  The training data consists of the
+latest date \code{d} such that there is an observed response at time \code{d + ahead} and all data
+from the \code{training_window_size} days prior to it.}
+\item{\code{train_y}}{Vector of response data from the \code{value+{ahead}:{response}} column of
+\code{lagged_df} corresponding to the rows of \code{train_x}.}
+\item{\code{predict_x}}{Matrix of prediction data in the same format as \code{train_x}.  The
+prediction data contains the most recent \code{training_window_size} days.}
+\item{\code{predict_geo_values}}{Vector of \code{geo_values} corresponding to the rows of \code{predict_x}.}
+}
+}
+\description{
+Create a training and prediction matrices for a given ahead.
+}
diff --git a/R-packages/modeltools/tests/testthat/test-matrix.R b/R-packages/modeltools/tests/testthat/test-matrix.R
new file mode 100644
index 00000000..bc4ced9d
--- /dev/null
+++ b/R-packages/modeltools/tests/testthat/test-matrix.R
@@ -0,0 +1,44 @@
+library(tibble)
+
+test_that("training and prediction matrices are created", {
+    df <- tibble(
+        geo_value = rep(c("az", "wv"), 5),
+        time_value = rep(
+            as.Date(c("2021-01-25", "2021-01-26", "2021-01-27", "2021-01-28", "2021-01-29")),
+            each = 2),
+        `value-2:signal_1` = seq(-3, 6),
+        `value-2:signal_2` = seq(7, 16),
+        `value-1:signal_1` = seq(-1, 8),
+        `value-1:signal_2` = seq(9, 18),
+        `value-0:signal_1` = seq(1, 10),
+        `value-0:signal_2` = seq(11, 20),
+        `value+1:signal_1` = c(seq(3, 10), rep(NA, 2)),
+        `value+1:signal_2` = c(seq(13, 20), rep(NA, 2)),
+        `value+2:signal_1` = c(seq(5, 10), rep(NA, 4)),
+        `value+2:signal_2` = c(seq(15, 20), rep(NA, 4))
+    )
+
+    out <- create_train_and_predict_matrices(df, 2, 1)
+
+    expect_equal(names(out), c("train_x", "train_y", "predict_x", "predict_geo_values"))
+    expect_equal(out$train_x,
+                 as.matrix(tibble(
+                    `value-2:signal_1` = c(1, 2),
+                    `value-2:signal_2` = c(11, 12),
+                    `value-1:signal_1` = c(3, 4),
+                    `value-1:signal_2` = c(13, 14),
+                    `value-0:signal_1` = c(5, 6),
+                    `value-0:signal_2` = c(15, 16)))
+    )
+    expect_equal(out$train_y, c(19, 20))
+    expect_equal(out$predict_x,
+                 as.matrix(tibble(
+                    `value-2:signal_1` = c(5, 6),
+                    `value-2:signal_2` = c(15, 16),
+                    `value-1:signal_1` = c(7, 8),
+                    `value-1:signal_2` = c(17, 18),
+                    `value-0:signal_1` = c(9, 10),
+                    `value-0:signal_2` = c(19, 20)))
+    )
+    expect_equal(out$predict_geo_values, c("az", "wv"))
+})

From 8c9a99082367142b16e65b9f1497f1e0eb47d702 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Wed, 17 Mar 2021 15:00:47 -0400
Subject: [PATCH 03/36] test for slide

---
 R-packages/modeltools/tests/testthat/test-slide.R | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 R-packages/modeltools/tests/testthat/test-slide.R

diff --git a/R-packages/modeltools/tests/testthat/test-slide.R b/R-packages/modeltools/tests/testthat/test-slide.R
new file mode 100644
index 00000000..9040dc6b
--- /dev/null
+++ b/R-packages/modeltools/tests/testthat/test-slide.R
@@ -0,0 +1,15 @@
+library(tibble)
+
+test_that("slide_by_geo works", {
+    df <- tibble(
+        geo_value = rep(c("a", "b"), each=5),
+        time_value = rep(seq.Date(as.Date("2021-01-01"), as.Date("2021-01-05"), "day"), 2),
+        value = 1:10
+    )
+    out <- slide_by_geo(df, ~ Sum(.x$value), n = 3, col_name = "summed_value")
+    expect_equal(names(out), c("geo_value", "time_value", "value", "summed_value"))
+    expect_equal(out$time_value, df$time_value)
+    expect_equal(out$geo_value, df$geo_value)
+    expect_equal(out$value, df$value)
+    expect_equal(out$summed_value, c(1, 3, 6, 9, 12, 6, 13, 21, 24, 27))
+})

From 75984ddbbbd8152e922635ea8f55ea3e570761e5 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Wed, 17 Mar 2021 16:09:42 -0400
Subject: [PATCH 04/36] add argument to slide_by_geo for shifting

---
 R-packages/modeltools/R/slide.R               | 49 +++++++-----
 R-packages/modeltools/man/slide_by_geo.Rd     | 16 ++--
 .../modeltools/tests/testthat/test-slide.R    | 76 ++++++++++++++++++-
 3 files changed, 116 insertions(+), 25 deletions(-)

diff --git a/R-packages/modeltools/R/slide.R b/R-packages/modeltools/R/slide.R
index e3420e6e..eb5b240f 100644
--- a/R-packages/modeltools/R/slide.R
+++ b/R-packages/modeltools/R/slide.R
@@ -1,7 +1,7 @@
 #' Slide a function over values in `covidcast_signal` data frame, grouped by
 #' `geo_value`  
 #'
-#' Slides a given function over the values in a  `covidcast_signal` data frame,
+#' Slides a given function over the values in a `covidcast_signal` data frame,
 #' grouped by `geo_value`. (When multiple issue dates are present, only the
 #' latest issue is considered.) See the [getting started
 #' guide](https://cmu-delphi.github.io/covidcast/modeltoolsR/articles/modeltools.html)
@@ -10,15 +10,19 @@
 #' @param x The `covidcast_signal` data frame under consideration. 
 #' @param slide_fun Function or formula to slide over the values in `x`, grouped
 #'   by `geo_value`. To "slide" means to apply the function or formula over a
-#'   trailing window of `n` days of data. If a function, `slide_fun` must take
+#'   right-aligned window of `n` days of data. If a function, `slide_fun` must take
 #'   `x`, a data frame the same column names as the original data frame;
 #'   followed by any number of named additional arguments; and ending with
 #'   `...`, to capture general additional arguments. If a formula, `slide_fun`
 #'   can operate directly on `.x$value`, `.x$time_value`, etc., as in `~
 #'   mean(.x$value)` to compute a trailing mean over the last `n` days of data.
-#' @param n Size of the local window (in days) to use. For example, if `n = 5`,
-#'   then to estimate the derivative on November 5, we train the given method on
-#'   data in between November 1 and November 5. Default is 14.
+#' @param n Size of the local window (in days) to use.  Default is 14.
+#' @param shift Number of days forward from the `time_value` to shift the window.  Because the
+#'   window is right-aligned, if `n = 5` and `shift = 0`, then for a `time_value` of November 5,
+#'   the function will be applied to values from November 1 to November 5.  A positive value shifts
+#'   the window forwards in time, while negative values shift the window backwards in time.  Ex:
+#'   for a `time_value` of November 5 `shift = 3` captures November 4 through November 8;
+#'   `shift = -3` captures October 29 through November 2.  Default is not to shift.
 #' @param col_name String indicating the name of the new column that will
 #'   contain the derivative values. Default is "slide_value"; note that setting
 #'   `col_name = "value"` will overwrite the existing "value" column. 
@@ -44,8 +48,13 @@
 #' @importFrom dplyr %>% arrange group_by group_modify mutate ungroup
 #' @importFrom lubridate days
 #' @export
-slide_by_geo = function(x, slide_fun, n = 14, col_name = "slide_value",
-                        col_type = c("dbl", "int", "lgl", "chr", "list"), ...) {
+slide_by_geo <- function(x,
+                         slide_fun,
+                         n = 14,
+                         shift = 0,
+                         col_name = "slide_value",
+                         col_type = c("dbl", "int", "lgl", "chr", "list"),
+                         ...) {
   # Check we have the minimal columns we need
   if (!all(c("geo_value", "time_value") %in% colnames(x))) {
     stop("`x` must have columns 'geo_value' and 'time_value'.")
@@ -53,20 +62,22 @@ slide_by_geo = function(x, slide_fun, n = 14, col_name = "slide_value",
   # x = covidcast:::latest_issue(x) # TODO is this needed?
 
   # Which slide_index function?
-  col_type = match.arg(col_type)
-  slide_index_zzz = switch(col_type,
-                           "dbl" = slider::slide_index_dbl,
-                           "int" = slider::slide_index_int,
-                           "lgl" = slider::slide_index_lgl,
-                           "chr" = slider::slide_index_chr,
-                           "list" = slider::slide_index)
+  col_type <- match.arg(col_type)
+  slide_index_zzz <- switch(col_type,
+                            "dbl" = slider::slide_index_dbl,
+                            "int" = slider::slide_index_int,
+                            "lgl" = slider::slide_index_lgl,
+                            "chr" = slider::slide_index_chr,
+                            "list" = slider::slide_index)
 
   # Slide over a single geo value
-  slide_one_geo = function(.data_group, slide_fun, n, col_name, ...) {
-    slide_values = slide_index_zzz(.x = .data_group,
-                                   .i = .data_group$time_value,
-                                   .f = slide_fun, ..., 
-                                   .before = days(n-1))
+  slide_one_geo <- function(.data_group, slide_fun, n, col_name, ...) {
+    slide_values <- slide_index_zzz(.x = .data_group,
+                                    .i = .data_group$time_value,
+                                    .f = slide_fun, ..., 
+                                    .before = days(n - 1 - shift),
+                                    .after = days(shift)
+                                    )
     return(mutate(.data_group, !!col_name := slide_values))
   }
   
diff --git a/R-packages/modeltools/man/slide_by_geo.Rd b/R-packages/modeltools/man/slide_by_geo.Rd
index 8657baeb..e35cc602 100644
--- a/R-packages/modeltools/man/slide_by_geo.Rd
+++ b/R-packages/modeltools/man/slide_by_geo.Rd
@@ -9,6 +9,7 @@ slide_by_geo(
   x,
   slide_fun,
   n = 14,
+  shift = 0,
   col_name = "slide_value",
   col_type = c("dbl", "int", "lgl", "chr", "list"),
   ...
@@ -19,15 +20,20 @@ slide_by_geo(
 
 \item{slide_fun}{Function or formula to slide over the values in \code{x}, grouped
 by \code{geo_value}. To "slide" means to apply the function or formula over a
-trailing window of \code{n} days of data. If a function, \code{slide_fun} must take
+right-aligned window of \code{n} days of data. If a function, \code{slide_fun} must take
 \code{x}, a data frame the same column names as the original data frame;
 followed by any number of named additional arguments; and ending with
 \code{...}, to capture general additional arguments. If a formula, \code{slide_fun}
 can operate directly on \code{.x$value}, \code{.x$time_value}, etc., as in \code{~ mean(.x$value)} to compute a trailing mean over the last \code{n} days of data.}
 
-\item{n}{Size of the local window (in days) to use. For example, if \code{n = 5},
-then to estimate the derivative on November 5, we train the given method on
-data in between November 1 and November 5. Default is 14.}
+\item{n}{Size of the local window (in days) to use.  Default is 14.}
+
+\item{shift}{Number of days forward from the \code{time_value} to shift the window.  Because the
+window is right-aligned, if \code{n = 5} and \code{shift = 0}, then for a \code{time_value} of November 5,
+the function will be applied to values from November 1 to November 5.  A positive value shifts
+the window forwards in time, while negative values shift the window backwards in time.  Ex:
+for a \code{time_value} of November 5 \code{shift = 3} captures November 4 through November 8;
+\code{shift = -3} captures October 29 through November 2.  Default is not to shift.}
 
 \item{col_name}{String indicating the name of the new column that will
 contain the derivative values. Default is "slide_value"; note that setting
@@ -44,7 +50,7 @@ A data frame given by appending a new column to \code{x} named according
 to the \code{col_name} argument, containing the function values.
 }
 \description{
-Slides a given function over the values in a  \code{covidcast_signal} data frame,
+Slides a given function over the values in a \code{covidcast_signal} data frame,
 grouped by \code{geo_value}. (When multiple issue dates are present, only the
 latest issue is considered.) See the \href{https://cmu-delphi.github.io/covidcast/modeltoolsR/articles/modeltools.html}{getting started guide}
 for examples.
diff --git a/R-packages/modeltools/tests/testthat/test-slide.R b/R-packages/modeltools/tests/testthat/test-slide.R
index 9040dc6b..496a6457 100644
--- a/R-packages/modeltools/tests/testthat/test-slide.R
+++ b/R-packages/modeltools/tests/testthat/test-slide.R
@@ -8,8 +8,82 @@ test_that("slide_by_geo works", {
     )
     out <- slide_by_geo(df, ~ Sum(.x$value), n = 3, col_name = "summed_value")
     expect_equal(names(out), c("geo_value", "time_value", "value", "summed_value"))
-    expect_equal(out$time_value, df$time_value)
     expect_equal(out$geo_value, df$geo_value)
+    expect_equal(out$time_value, df$time_value)
     expect_equal(out$value, df$value)
     expect_equal(out$summed_value, c(1, 3, 6, 9, 12, 6, 13, 21, 24, 27))
 })
+
+
+test_that("slide_by_geo works with interleaved geo_values", {
+    df <- tibble(
+        geo_value = rep(c("a", "b"), 5),
+        time_value = rep(seq.Date(as.Date("2021-01-01"), as.Date("2021-01-05"), "day"), each=2),
+        # keep the individual rows the same as the previous test so the summed values are identical
+        value = c(1, 6, 2, 7, 3, 8, 4, 9, 5, 10)
+    )
+    out <- slide_by_geo(df, ~ Sum(.x$value), n = 3, col_name = "summed_value")
+    expect_equal(names(out), c("geo_value", "time_value", "value", "summed_value"))
+    expect_equal(out$geo_value, rep(c("a", "b"), each=5))
+    expect_equal(out$time_value,
+                 rep(seq.Date(as.Date("2021-01-01"), as.Date("2021-01-05"), "day"), 2))
+    expect_equal(out$value, 1:10)
+    expect_equal(out$summed_value, c(1, 3, 6, 9, 12, 6, 13, 21, 24, 27))
+})
+
+test_that("slide_by_geo overwrites column when specified", {
+    df <- tibble(
+        geo_value = rep(c("a", "b"), each=5),
+        time_value = rep(seq.Date(as.Date("2021-01-01"), as.Date("2021-01-05"), "day"), 2),
+        value = 1:10
+    )
+    out <- slide_by_geo(df, ~ Sum(.x$value), n = 3, col_name = "value")
+    expect_equal(names(out), c("geo_value", "time_value", "value"))
+    expect_equal(out$geo_value, df$geo_value)
+    expect_equal(out$time_value, df$time_value)
+    expect_equal(out$value, c(1, 3, 6, 9, 12, 6, 13, 21, 24, 27))
+})
+
+
+test_that("slide_by_geo works with positive shift", {
+    df <- tibble(
+        geo_value = rep(c("a", "b"), each=5),
+        time_value = rep(seq.Date(as.Date("2021-01-01"), as.Date("2021-01-05"), "day"), 2),
+        value = 1:10
+    )
+    out <- slide_by_geo(df, ~ Sum(.x$value), n = 2, shift = 1, col_name = "summed_value")
+    expect_equal(names(out), c("geo_value", "time_value", "value", "summed_value"))
+    expect_equal(out$time_value, df$time_value)
+    expect_equal(out$geo_value, df$geo_value)
+    expect_equal(out$value, df$value)
+    expect_equal(out$summed_value, c(3, 5, 7, 9, 5, 13, 15, 17, 19, 10))
+})
+
+test_that("slide_by_geo works with negative shift", {
+    df <- tibble(
+        geo_value = rep(c("a", "b"), each=5),
+        time_value = rep(seq.Date(as.Date("2021-01-01"), as.Date("2021-01-05"), "day"), 2),
+        value = 1:10
+    )
+    out <- slide_by_geo(df, ~ Sum(.x$value), n = 2, shift = -1, col_name = "summed_value")
+    expect_equal(names(out), c("geo_value", "time_value", "value", "summed_value"))
+    expect_equal(out$time_value, df$time_value)
+    expect_equal(out$geo_value, df$geo_value)
+    expect_equal(out$value, df$value)
+    expect_equal(out$summed_value, c(0, 1, 3, 5, 7, 0, 6, 13, 15, 17))
+})
+
+test_that("slide_by_geo works with positive shift past current value", {
+    df <- tibble(
+        geo_value = rep(c("a", "b"), each=10),
+        time_value = rep(seq.Date(as.Date("2021-01-01"), as.Date("2021-01-10"), "day"), 2),
+        value = 1:20
+    )
+    out <- slide_by_geo(df, ~ Sum(.x$value), n = 2, shift = 3, col_name = "summed_value")
+    expect_equal(names(out), c("geo_value", "time_value", "value", "summed_value"))
+    expect_equal(out$time_value, df$time_value)
+    expect_equal(out$geo_value, df$geo_value)
+    expect_equal(out$value, df$value)
+    expect_equal(out$summed_value, c(7, 9, 11, 13, 15, 17, 19, 10, 0, 0,
+                                     27, 29, 31, 33, 35, 37, 39, 20, 0, 0))
+})

From 0307791c46d2161777ebbc2e2a003923da4840e2 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Wed, 17 Mar 2021 17:02:41 -0400
Subject: [PATCH 05/36] add tests for missing time values

---
 .../modeltools/tests/testthat/test-slide.R     | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/R-packages/modeltools/tests/testthat/test-slide.R b/R-packages/modeltools/tests/testthat/test-slide.R
index 496a6457..4d344383 100644
--- a/R-packages/modeltools/tests/testthat/test-slide.R
+++ b/R-packages/modeltools/tests/testthat/test-slide.R
@@ -3,7 +3,7 @@ library(tibble)
 test_that("slide_by_geo works", {
     df <- tibble(
         geo_value = rep(c("a", "b"), each=5),
-        time_value = rep(seq.Date(as.Date("2021-01-01"), as.Date("2021-01-05"), "day"), 2),
+        time_value = seq.Date(as.Date("2021-01-01"), as.Date("2021-01-10"), "day"),
         value = 1:10
     )
     out <- slide_by_geo(df, ~ Sum(.x$value), n = 3, col_name = "summed_value")
@@ -31,6 +31,22 @@ test_that("slide_by_geo works with interleaved geo_values", {
     expect_equal(out$summed_value, c(1, 3, 6, 9, 12, 6, 13, 21, 24, 27))
 })
 
+
+test_that("slide_by_geo works with missing time values", {
+    df <- tibble(
+        geo_value = rep(c("a", "b"), each=5),
+        time_value = seq.Date(as.Date("2021-01-01"), as.Date("2021-01-20"), by = 2),
+        value = 1:10
+    )
+    out <- slide_by_geo(df, ~ Sum(.x$value), n = 3, col_name = "summed_value")
+    expect_equal(names(out), c("geo_value", "time_value", "value", "summed_value"))
+    expect_equal(out$geo_value, df$geo_value)
+    expect_equal(out$time_value, df$time_value)
+    expect_equal(out$value, df$value)
+    expect_equal(out$summed_value, c(1, 3, 5, 7, 9, 6, 13, 15, 17, 19))
+})
+
+
 test_that("slide_by_geo overwrites column when specified", {
     df <- tibble(
         geo_value = rep(c("a", "b"), each=5),

From b876289f5306ad27f3d2814fa564330ca0daa68b Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Thu, 18 Mar 2021 07:01:28 -0400
Subject: [PATCH 06/36] add function for response aggregation

---
 R-packages/modeltools/R/response.R            | 53 +++++++++++++++++++
 .../modeltools/tests/testthat/test-response.R | 43 +++++++++++++++
 2 files changed, 96 insertions(+)
 create mode 100644 R-packages/modeltools/R/response.R
 create mode 100644 R-packages/modeltools/tests/testthat/test-response.R

diff --git a/R-packages/modeltools/R/response.R b/R-packages/modeltools/R/response.R
new file mode 100644
index 00000000..2d02fbb6
--- /dev/null
+++ b/R-packages/modeltools/R/response.R
@@ -0,0 +1,53 @@
+make_response <- function(signals,
+                          response_name,
+                          forecast_date,
+                          incidence_period,
+                          ahead) {
+    out_df <- signals
+    for (a in ahead) {
+        target_period <- as.list(get_target_period(forecast_date, incidence_period, a)[1, ])
+        window_size <- as.numeric(target_period$end - target_period$start) + 1
+        out_df <- slide_by_geo(out_df,
+                               ~ Mean(window_size * .x[[paste0("value-0:", response_name)]]),
+                               n = window_size,
+                               shift = as.numeric(target_period$end - forecast_date),
+                               col_name = paste0("value+", a, ":", response_name))
+    }
+    return(out_df)
+}
+
+#' Get the target period for a forecast date, incidence period and ahead
+#'
+#' 
+get_target_period <- function(forecast_date, incidence_period, ahead) {
+  # This function gives the start and end dates of the target period,
+  # based on the system described in the COVIDHub rules here:
+  # https://github.com/reichlab/covid19-forecast-hub/blob/master/data-processed/README.md
+  #
+  # Inputs:
+  #  forecast_date: can be a vector of dates
+  #  incidence_period: one of "epiweek" or "day"
+  #  ahead: how many epiweeks/days ahead are you forecasting?
+  forecast_date <- lubridate::ymd(forecast_date)
+  if (incidence_period == "day")
+    return(tibble(start = forecast_date + ahead, end = forecast_date + ahead))
+  assert_that(incidence_period == "epiweek",
+              msg="Unsupported `incidence_period`.")
+  # incidence_period: epiweek
+  ew_frcst_date <- MMWRweek::MMWRweek(forecast_date) # get epiweek of forecast_dates
+  sunday_of_ew_frcst_date <- MMWRweek::MMWRweek2Date(
+    MMWRyear = ew_frcst_date$MMWRyear,
+    MMWRweek = ew_frcst_date$MMWRweek,
+    MMWRday = 1) # 1 is Sunday
+  # From https://github.com/reichlab/covid19-forecast-hub/blob/master/data-processed/README.md:
+  # "For week-ahead forecasts with forecast_date of Sunday or Monday of EW12, a
+  # 1 week ahead forecast corresponds to EW12 and should have target_end_date of
+  # the Saturday of EW12. For week-ahead forecasts with forecast_date of Tuesday
+  # through Saturday of EW12, a 1 week ahead forecast corresponds to EW13 and
+  # should have target_end_date of the Saturday of EW13."
+  week_ahead <- ifelse(lubridate::wday(forecast_date) <= 2, # forecasting on a Sun/Monday
+                       ahead - 1,
+                       ahead)
+  tibble(start = sunday_of_ew_frcst_date + week_ahead * 7,
+         end = sunday_of_ew_frcst_date + (week_ahead + 1) * 7 - 1)
+}
\ No newline at end of file
diff --git a/R-packages/modeltools/tests/testthat/test-response.R b/R-packages/modeltools/tests/testthat/test-response.R
new file mode 100644
index 00000000..29dfc230
--- /dev/null
+++ b/R-packages/modeltools/tests/testthat/test-response.R
@@ -0,0 +1,43 @@
+library(tibble)
+
+
+test_that("make_response works for epiweeks", {
+    df <- tibble(
+        geo_value = "oh",
+        time_value = seq.Date(as.Date("2021-01-01"), as.Date("2021-01-31"), "day"),
+        `value-0:my_response` = 1:31
+    )
+    out <- make_response(df, "my_response", as.Date("2021-01-30"), "epiweek", 1:2)
+
+    expect_equal(names(out),
+                 c("geo_value", "time_value", "value-0:my_response",
+                   "value+1:my_response", "value+2:my_response"))
+    expect_equal(out$geo_value, df$geo_value)
+    expect_equal(out$time_value, df$time_value)
+    expect_equal(out$`value-0:my_response`, df$`value-0:my_response`)
+    expect_equal(out$`value+1:my_response`,
+                 c(seq(35, 196, 7), seq(199.5, 217, 3.5), NA))
+    expect_equal(out$`value+2:my_response`,
+                 c(seq(84, 196, 7), seq(199.5, 217, 3.5), rep(NA, 8)))
+})
+
+
+test_that("make_response works for days", {
+    df <- tibble(
+        geo_value = "oh",
+        time_value = seq.Date(as.Date("2021-01-01"), as.Date("2021-01-31"), "day"),
+        `value-0:my_response` = 1:31
+    )
+    out <- make_response(df, "my_response", as.Date("2021-01-30"), "day", 1:4)
+
+    expect_equal(names(out),
+                 c("geo_value", "time_value", "value-0:my_response",
+                   "value+1:my_response", "value+2:my_response", "value+3:my_response", "value+4:my_response"))
+    expect_equal(out$geo_value, df$geo_value)
+    expect_equal(out$time_value, df$time_value)
+    expect_equal(out$`value-0:my_response`, df$`value-0:my_response`)
+    expect_equal(out$`value+1:my_response`, c(seq(2, 31), NA))
+    expect_equal(out$`value+2:my_response`, c(seq(3, 31), rep(NA, 2)))
+    expect_equal(out$`value+3:my_response`, c(seq(4, 31), rep(NA, 3)))
+    expect_equal(out$`value+4:my_response`, c(seq(5, 31), rep(NA, 4)))
+})

From 8ac018ae351fb3b1418cccd73c4469e96e83b1d9 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Thu, 18 Mar 2021 09:55:18 -0400
Subject: [PATCH 07/36] add documentation and change function names

---
 R-packages/modeltools/NAMESPACE               |  1 +
 R-packages/modeltools/R/response.R            | 97 ++++++++++++-------
 .../modeltools/man/add_response_columns.Rd    | 43 ++++++++
 .../modeltools/man/get_target_period.Rd       | 11 +++
 .../modeltools/tests/testthat/test-response.R |  9 +-
 5 files changed, 121 insertions(+), 40 deletions(-)
 create mode 100644 R-packages/modeltools/man/add_response_columns.Rd
 create mode 100644 R-packages/modeltools/man/get_target_period.Rd

diff --git a/R-packages/modeltools/NAMESPACE b/R-packages/modeltools/NAMESPACE
index 8bbd90e3..53039634 100644
--- a/R-packages/modeltools/NAMESPACE
+++ b/R-packages/modeltools/NAMESPACE
@@ -7,6 +7,7 @@ export(Median)
 export(Min)
 export(Start)
 export(Sum)
+export(add_response_columns)
 export(estimate_deriv)
 export(pct_change)
 export(quantgen_forecaster)
diff --git a/R-packages/modeltools/R/response.R b/R-packages/modeltools/R/response.R
index 2d02fbb6..78645f9a 100644
--- a/R-packages/modeltools/R/response.R
+++ b/R-packages/modeltools/R/response.R
@@ -1,8 +1,35 @@
-make_response <- function(signals,
-                          response_name,
-                          forecast_date,
-                          incidence_period,
-                          ahead) {
+library(assertthat)
+
+#' Compute the corresponding observed response values for a set of signals.
+#'
+#' The response for an observed data point at time `t` is defined as the mean of observed values on
+#' days `t + s` and `t + e` multiplied by `e - s`, where `s` and `e` are the number of days between
+#' `forecast_date` and the start and end, respectively, of the next full `incidence_period`.  We
+#' choose to take the mean and multiply by the incidence period length rather than simply sum the
+#' observed values in order to avoid treating missing values as 0.
+#'
+#' @param signals Wide data frame of signals.  We expect the following columns:
+#'     \itemize{
+#'     \item{`geo_value`}{Strings of geographic locations.}
+#'     \item{`time_value`}{Dates of training data.}
+#'     \item{Covariate columns}{Columns with names of the form `value-{days}:{signal}` whose values
+#'         correspond to `{signal}` `{days}` before `time_value`}
+#'     }
+#' @param response_name Name of response signal.  The value of the response variable on a single
+#'     day should be located in column `value-0:{response_name}` of `signals`.
+#' @param forecast_date Date on which the forecast will be made
+#' @param incidence_period Time period over which the response should be summed.
+#' @param ahead integer or vector of integer ahead values
+#'
+#' @return Data frame of signals equal to `signals` with `length(ahead)` additional columns with
+#'     names `value+{a}:{response_name}` corresponding to the response variable at ahead `a`. 
+#'
+#' @export
+add_response_columns <- function(signals,
+                                 response_name,
+                                 forecast_date,
+                                 incidence_period,
+                                 ahead) {
     out_df <- signals
     for (a in ahead) {
         target_period <- as.list(get_target_period(forecast_date, incidence_period, a)[1, ])
@@ -20,34 +47,34 @@ make_response <- function(signals,
 #'
 #' 
 get_target_period <- function(forecast_date, incidence_period, ahead) {
-  # This function gives the start and end dates of the target period,
-  # based on the system described in the COVIDHub rules here:
-  # https://github.com/reichlab/covid19-forecast-hub/blob/master/data-processed/README.md
-  #
-  # Inputs:
-  #  forecast_date: can be a vector of dates
-  #  incidence_period: one of "epiweek" or "day"
-  #  ahead: how many epiweeks/days ahead are you forecasting?
-  forecast_date <- lubridate::ymd(forecast_date)
-  if (incidence_period == "day")
-    return(tibble(start = forecast_date + ahead, end = forecast_date + ahead))
-  assert_that(incidence_period == "epiweek",
-              msg="Unsupported `incidence_period`.")
-  # incidence_period: epiweek
-  ew_frcst_date <- MMWRweek::MMWRweek(forecast_date) # get epiweek of forecast_dates
-  sunday_of_ew_frcst_date <- MMWRweek::MMWRweek2Date(
-    MMWRyear = ew_frcst_date$MMWRyear,
-    MMWRweek = ew_frcst_date$MMWRweek,
-    MMWRday = 1) # 1 is Sunday
-  # From https://github.com/reichlab/covid19-forecast-hub/blob/master/data-processed/README.md:
-  # "For week-ahead forecasts with forecast_date of Sunday or Monday of EW12, a
-  # 1 week ahead forecast corresponds to EW12 and should have target_end_date of
-  # the Saturday of EW12. For week-ahead forecasts with forecast_date of Tuesday
-  # through Saturday of EW12, a 1 week ahead forecast corresponds to EW13 and
-  # should have target_end_date of the Saturday of EW13."
-  week_ahead <- ifelse(lubridate::wday(forecast_date) <= 2, # forecasting on a Sun/Monday
-                       ahead - 1,
-                       ahead)
-  tibble(start = sunday_of_ew_frcst_date + week_ahead * 7,
-         end = sunday_of_ew_frcst_date + (week_ahead + 1) * 7 - 1)
+    # This function gives the start and end dates of the target period,
+    # based on the system described in the COVIDHub rules here:
+    # https://github.com/reichlab/covid19-forecast-hub/blob/master/data-processed/README.md
+    #
+    # Inputs:
+    #  forecast_date: can be a vector of dates
+    #  incidence_period: one of "epiweek" or "day"
+    #  ahead: how many epiweeks/days ahead are you forecasting?
+    forecast_date <- lubridate::ymd(forecast_date)
+    if (incidence_period == "day")
+        return(tibble(start = forecast_date + ahead, end = forecast_date + ahead))
+    assert_that(incidence_period == "epiweek",
+                msg="Unsupported `incidence_period`.")
+    # incidence_period: epiweek
+    ew_frcst_date <- MMWRweek::MMWRweek(forecast_date) # get epiweek of forecast_dates
+    sunday_of_ew_frcst_date <- MMWRweek::MMWRweek2Date(
+        MMWRyear = ew_frcst_date$MMWRyear,
+        MMWRweek = ew_frcst_date$MMWRweek,
+        MMWRday = 1) # 1 is Sunday
+    # From https://github.com/reichlab/covid19-forecast-hub/blob/master/data-processed/README.md:
+    # "For week-ahead forecasts with forecast_date of Sunday or Monday of EW12, a
+    # 1 week ahead forecast corresponds to EW12 and should have target_end_date of
+    # the Saturday of EW12. For week-ahead forecasts with forecast_date of Tuesday
+    # through Saturday of EW12, a 1 week ahead forecast corresponds to EW13 and
+    # should have target_end_date of the Saturday of EW13."
+    week_ahead <- ifelse(lubridate::wday(forecast_date) <= 2, # forecasting on a Sun/Monday
+                        ahead - 1,
+                        ahead)
+    tibble(start = sunday_of_ew_frcst_date + week_ahead * 7,
+            end = sunday_of_ew_frcst_date + (week_ahead + 1) * 7 - 1)
 }
\ No newline at end of file
diff --git a/R-packages/modeltools/man/add_response_columns.Rd b/R-packages/modeltools/man/add_response_columns.Rd
new file mode 100644
index 00000000..6f0c5a24
--- /dev/null
+++ b/R-packages/modeltools/man/add_response_columns.Rd
@@ -0,0 +1,43 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/response.R
+\name{add_response_columns}
+\alias{add_response_columns}
+\title{Compute the corresponding observed response values for a set of signals.}
+\usage{
+add_response_columns(
+  signals,
+  response_name,
+  forecast_date,
+  incidence_period,
+  ahead
+)
+}
+\arguments{
+\item{signals}{Wide data frame of signals.  We expect the following columns:
+\itemize{
+\item{\code{geo_value}}{Strings of geographic locations.}
+\item{\code{time_value}}{Dates of training data.}
+\item{Covariate columns}{Columns with names of the form \code{value-{days}:{signal}} whose values
+correspond to \code{{signal}} \code{{days}} before \code{time_value}}
+}}
+
+\item{response_name}{Name of response signal.  The value of the response variable on a single
+day should be located in column \code{value-0:{response_name}} of \code{signals}.}
+
+\item{forecast_date}{Date on which the forecast will be made}
+
+\item{incidence_period}{Time period over which the response should be summed.}
+
+\item{ahead}{integer or vector of integer ahead values}
+}
+\value{
+Data frame of signals equal to \code{signals} with \code{length(ahead)} additional columns with
+names \code{value+{a}:{response_name}} corresponding to the response variable at ahead \code{a}.
+}
+\description{
+The response for an observed data point at time \code{t} is defined as the mean of observed values on
+days \code{t + s} and \code{t + e} multiplied by \code{e - s}, where \code{s} and \code{e} are the number of days between
+\code{forecast_date} and the start and end, respectively, of the next full \code{incidence_period}.  We
+choose to take the mean and multiply by the incidence period length rather than simply sum the
+observed values in order to avoid treating missing values as 0.
+}
diff --git a/R-packages/modeltools/man/get_target_period.Rd b/R-packages/modeltools/man/get_target_period.Rd
new file mode 100644
index 00000000..48f92421
--- /dev/null
+++ b/R-packages/modeltools/man/get_target_period.Rd
@@ -0,0 +1,11 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/response.R
+\name{get_target_period}
+\alias{get_target_period}
+\title{Get the target period for a forecast date, incidence period and ahead}
+\usage{
+get_target_period(forecast_date, incidence_period, ahead)
+}
+\description{
+Get the target period for a forecast date, incidence period and ahead
+}
diff --git a/R-packages/modeltools/tests/testthat/test-response.R b/R-packages/modeltools/tests/testthat/test-response.R
index 29dfc230..83cc8cf5 100644
--- a/R-packages/modeltools/tests/testthat/test-response.R
+++ b/R-packages/modeltools/tests/testthat/test-response.R
@@ -1,13 +1,12 @@
 library(tibble)
 
-
-test_that("make_response works for epiweeks", {
+test_that("add_response_columns works for epiweeks", {
     df <- tibble(
         geo_value = "oh",
         time_value = seq.Date(as.Date("2021-01-01"), as.Date("2021-01-31"), "day"),
         `value-0:my_response` = 1:31
     )
-    out <- make_response(df, "my_response", as.Date("2021-01-30"), "epiweek", 1:2)
+    out <- add_response_columns(df, "my_response", as.Date("2021-01-30"), "epiweek", 1:2)
 
     expect_equal(names(out),
                  c("geo_value", "time_value", "value-0:my_response",
@@ -22,13 +21,13 @@ test_that("make_response works for epiweeks", {
 })
 
 
-test_that("make_response works for days", {
+test_that("add_response_columns works for days", {
     df <- tibble(
         geo_value = "oh",
         time_value = seq.Date(as.Date("2021-01-01"), as.Date("2021-01-31"), "day"),
         `value-0:my_response` = 1:31
     )
-    out <- make_response(df, "my_response", as.Date("2021-01-30"), "day", 1:4)
+    out <- add_response_columns(df, "my_response", as.Date("2021-01-30"), "day", 1:4)
 
     expect_equal(names(out),
                  c("geo_value", "time_value", "value-0:my_response",

From 63304024bde0a090bf6b98294ef37f92d59b20c1 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Thu, 18 Mar 2021 10:06:56 -0400
Subject: [PATCH 08/36] add trailing newline

---
 R-packages/modeltools/R/response.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R-packages/modeltools/R/response.R b/R-packages/modeltools/R/response.R
index 78645f9a..8da25427 100644
--- a/R-packages/modeltools/R/response.R
+++ b/R-packages/modeltools/R/response.R
@@ -77,4 +77,4 @@ get_target_period <- function(forecast_date, incidence_period, ahead) {
                         ahead)
     tibble(start = sunday_of_ew_frcst_date + week_ahead * 7,
             end = sunday_of_ew_frcst_date + (week_ahead + 1) * 7 - 1)
-}
\ No newline at end of file
+}

From 09ecff73f6d1d60dd88ddb550f3663c54155f986 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Thu, 18 Mar 2021 20:27:56 -0400
Subject: [PATCH 09/36] fix comment unclarity

---
 R-packages/modeltools/R/response.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/R-packages/modeltools/R/response.R b/R-packages/modeltools/R/response.R
index 8da25427..4f945cfc 100644
--- a/R-packages/modeltools/R/response.R
+++ b/R-packages/modeltools/R/response.R
@@ -3,10 +3,10 @@ library(assertthat)
 #' Compute the corresponding observed response values for a set of signals.
 #'
 #' The response for an observed data point at time `t` is defined as the mean of observed values on
-#' days `t + s` and `t + e` multiplied by `e - s`, where `s` and `e` are the number of days between
-#' `forecast_date` and the start and end, respectively, of the next full `incidence_period`.  We
-#' choose to take the mean and multiply by the incidence period length rather than simply sum the
-#' observed values in order to avoid treating missing values as 0.
+#' days `t + s` through `t + e` (inclusive) multiplied by `e - s`, where `s` and `e` are the number
+#' of days from `forecast_date` and the start and end, respectively, of the next full
+#' `incidence_period`.  We choose to take the mean and multiply by the incidence period length
+#' rather than simply sum the observed values in order to avoid treating missing values as 0.
 #'
 #' @param signals Wide data frame of signals.  We expect the following columns:
 #'     \itemize{

From d8f67ab2378af10258160be99c55babb8b8969d8 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Fri, 19 Mar 2021 14:39:41 -0400
Subject: [PATCH 10/36] update response documentation slightly

---
 R-packages/modeltools/R/response.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R-packages/modeltools/R/response.R b/R-packages/modeltools/R/response.R
index 4f945cfc..6ce3e631 100644
--- a/R-packages/modeltools/R/response.R
+++ b/R-packages/modeltools/R/response.R
@@ -4,7 +4,7 @@ library(assertthat)
 #'
 #' The response for an observed data point at time `t` is defined as the mean of observed values on
 #' days `t + s` through `t + e` (inclusive) multiplied by `e - s`, where `s` and `e` are the number
-#' of days from `forecast_date` and the start and end, respectively, of the next full
+#' of days from `forecast_date` to the start and end, respectively, of the next full
 #' `incidence_period`.  We choose to take the mean and multiply by the incidence period length
 #' rather than simply sum the observed values in order to avoid treating missing values as 0.
 #'

From 06454b9cb7e65ccfb76ea6a4bbf16a1291da9a88 Mon Sep 17 00:00:00 2001
From: Addison Hu <huisaddison@gmail.com>
Date: Fri, 19 Mar 2021 18:43:22 -0400
Subject: [PATCH 11/36] Update R-packages/modeltools/R/matrix.R

---
 R-packages/modeltools/R/matrix.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R
index 25f95afd..1aee57b5 100644
--- a/R-packages/modeltools/R/matrix.R
+++ b/R-packages/modeltools/R/matrix.R
@@ -8,7 +8,7 @@
 #'         correspond to `{signal}` `{days}` before `time_value`}
 #'     \item{Response columns}{Columns with names of the form `value+{n}:{response}` whose values
 #'         correspond to `{response}` `{n}` incidence period units after `time_value`.  Since
-#'         columns beginning with "value+" are interpretted as the response rather than a
+#'         columns beginning with "value+" are interpreted as the response rather than a
 #'         covariate, observed data from the same day as `time_value` should be in columns
 #'         `value-0:{signal}` rather than `value+0:{signal}`}
 #'     }

From 2eb80f5817c59fbd73fee3bd27d98d4f8c7f8069 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Fri, 19 Mar 2021 21:15:10 -0400
Subject: [PATCH 12/36] change naming convention of values and responses

---
 R-packages/modeltools/R/response.R            | 12 ++++----
 .../modeltools/man/add_response_columns.Rd    | 16 +++++------
 .../modeltools/tests/testthat/test-response.R | 28 +++++++++----------
 3 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/R-packages/modeltools/R/response.R b/R-packages/modeltools/R/response.R
index 6ce3e631..a5e48048 100644
--- a/R-packages/modeltools/R/response.R
+++ b/R-packages/modeltools/R/response.R
@@ -12,17 +12,17 @@ library(assertthat)
 #'     \itemize{
 #'     \item{`geo_value`}{Strings of geographic locations.}
 #'     \item{`time_value`}{Dates of training data.}
-#'     \item{Covariate columns}{Columns with names of the form `value-{days}:{signal}` whose values
-#'         correspond to `{signal}` `{days}` before `time_value`}
+#'     \item{Covariate columns}{Columns with names of the form `value-{days}:{signal}` or
+#'         `value+0:{signal} whose values correspond to `{signal}` `{days}` before `time_value`}
 #'     }
 #' @param response_name Name of response signal.  The value of the response variable on a single
-#'     day should be located in column `value-0:{response_name}` of `signals`.
+#'     day should be located in column `value+0:{response_name}` of `signals`.
 #' @param forecast_date Date on which the forecast will be made
 #' @param incidence_period Time period over which the response should be summed.
 #' @param ahead integer or vector of integer ahead values
 #'
 #' @return Data frame of signals equal to `signals` with `length(ahead)` additional columns with
-#'     names `value+{a}:{response_name}` corresponding to the response variable at ahead `a`. 
+#'     names `response+{a}:{response_name}` corresponding to the response variable at ahead `a`. 
 #'
 #' @export
 add_response_columns <- function(signals,
@@ -35,10 +35,10 @@ add_response_columns <- function(signals,
         target_period <- as.list(get_target_period(forecast_date, incidence_period, a)[1, ])
         window_size <- as.numeric(target_period$end - target_period$start) + 1
         out_df <- slide_by_geo(out_df,
-                               ~ Mean(window_size * .x[[paste0("value-0:", response_name)]]),
+                               ~ Mean(window_size * .x[[paste0("value+0:", response_name)]]),
                                n = window_size,
                                shift = as.numeric(target_period$end - forecast_date),
-                               col_name = paste0("value+", a, ":", response_name))
+                               col_name = paste0("response+", a, ":", response_name))
     }
     return(out_df)
 }
diff --git a/R-packages/modeltools/man/add_response_columns.Rd b/R-packages/modeltools/man/add_response_columns.Rd
index 6f0c5a24..2e53ad48 100644
--- a/R-packages/modeltools/man/add_response_columns.Rd
+++ b/R-packages/modeltools/man/add_response_columns.Rd
@@ -17,12 +17,12 @@ add_response_columns(
 \itemize{
 \item{\code{geo_value}}{Strings of geographic locations.}
 \item{\code{time_value}}{Dates of training data.}
-\item{Covariate columns}{Columns with names of the form \code{value-{days}:{signal}} whose values
-correspond to \code{{signal}} \code{{days}} before \code{time_value}}
+\item{Covariate columns}{Columns with names of the form \code{value-{days}:{signal}} or
+\verb{value+0:\{signal\} whose values correspond to }{signal}\verb{}{days}\code{before}time_value`}
 }}
 
 \item{response_name}{Name of response signal.  The value of the response variable on a single
-day should be located in column \code{value-0:{response_name}} of \code{signals}.}
+day should be located in column \code{value+0:{response_name}} of \code{signals}.}
 
 \item{forecast_date}{Date on which the forecast will be made}
 
@@ -32,12 +32,12 @@ day should be located in column \code{value-0:{response_name}} of \code{signals}
 }
 \value{
 Data frame of signals equal to \code{signals} with \code{length(ahead)} additional columns with
-names \code{value+{a}:{response_name}} corresponding to the response variable at ahead \code{a}.
+names \code{response+{a}:{response_name}} corresponding to the response variable at ahead \code{a}.
 }
 \description{
 The response for an observed data point at time \code{t} is defined as the mean of observed values on
-days \code{t + s} and \code{t + e} multiplied by \code{e - s}, where \code{s} and \code{e} are the number of days between
-\code{forecast_date} and the start and end, respectively, of the next full \code{incidence_period}.  We
-choose to take the mean and multiply by the incidence period length rather than simply sum the
-observed values in order to avoid treating missing values as 0.
+days \code{t + s} through \code{t + e} (inclusive) multiplied by \code{e - s}, where \code{s} and \code{e} are the number
+of days from \code{forecast_date} to the start and end, respectively, of the next full
+\code{incidence_period}.  We choose to take the mean and multiply by the incidence period length
+rather than simply sum the observed values in order to avoid treating missing values as 0.
 }
diff --git a/R-packages/modeltools/tests/testthat/test-response.R b/R-packages/modeltools/tests/testthat/test-response.R
index 83cc8cf5..393f3b06 100644
--- a/R-packages/modeltools/tests/testthat/test-response.R
+++ b/R-packages/modeltools/tests/testthat/test-response.R
@@ -4,19 +4,19 @@ test_that("add_response_columns works for epiweeks", {
     df <- tibble(
         geo_value = "oh",
         time_value = seq.Date(as.Date("2021-01-01"), as.Date("2021-01-31"), "day"),
-        `value-0:my_response` = 1:31
+        `value+0:my_response` = 1:31
     )
     out <- add_response_columns(df, "my_response", as.Date("2021-01-30"), "epiweek", 1:2)
 
     expect_equal(names(out),
-                 c("geo_value", "time_value", "value-0:my_response",
-                   "value+1:my_response", "value+2:my_response"))
+                 c("geo_value", "time_value", "value+0:my_response",
+                   "response+1:my_response", "response+2:my_response"))
     expect_equal(out$geo_value, df$geo_value)
     expect_equal(out$time_value, df$time_value)
-    expect_equal(out$`value-0:my_response`, df$`value-0:my_response`)
-    expect_equal(out$`value+1:my_response`,
+    expect_equal(out$`value+0:my_response`, df$`value+0:my_response`)
+    expect_equal(out$`response+1:my_response`,
                  c(seq(35, 196, 7), seq(199.5, 217, 3.5), NA))
-    expect_equal(out$`value+2:my_response`,
+    expect_equal(out$`response+2:my_response`,
                  c(seq(84, 196, 7), seq(199.5, 217, 3.5), rep(NA, 8)))
 })
 
@@ -25,18 +25,18 @@ test_that("add_response_columns works for days", {
     df <- tibble(
         geo_value = "oh",
         time_value = seq.Date(as.Date("2021-01-01"), as.Date("2021-01-31"), "day"),
-        `value-0:my_response` = 1:31
+        `value+0:my_response` = 1:31
     )
     out <- add_response_columns(df, "my_response", as.Date("2021-01-30"), "day", 1:4)
 
     expect_equal(names(out),
-                 c("geo_value", "time_value", "value-0:my_response",
-                   "value+1:my_response", "value+2:my_response", "value+3:my_response", "value+4:my_response"))
+                 c("geo_value", "time_value", "value+0:my_response",
+                   "response+1:my_response", "response+2:my_response", "response+3:my_response", "response+4:my_response"))
     expect_equal(out$geo_value, df$geo_value)
     expect_equal(out$time_value, df$time_value)
-    expect_equal(out$`value-0:my_response`, df$`value-0:my_response`)
-    expect_equal(out$`value+1:my_response`, c(seq(2, 31), NA))
-    expect_equal(out$`value+2:my_response`, c(seq(3, 31), rep(NA, 2)))
-    expect_equal(out$`value+3:my_response`, c(seq(4, 31), rep(NA, 3)))
-    expect_equal(out$`value+4:my_response`, c(seq(5, 31), rep(NA, 4)))
+    expect_equal(out$`value+0:my_response`, df$`value+0:my_response`)
+    expect_equal(out$`response+1:my_response`, c(seq(2, 31), NA))
+    expect_equal(out$`response+2:my_response`, c(seq(3, 31), rep(NA, 2)))
+    expect_equal(out$`response+3:my_response`, c(seq(4, 31), rep(NA, 3)))
+    expect_equal(out$`response+4:my_response`, c(seq(5, 31), rep(NA, 4)))
 })

From c6149eaa9b2f92419aaef804ce19c446d77de4c3 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Fri, 19 Mar 2021 21:23:09 -0400
Subject: [PATCH 13/36] update naming conventions for signals vs response

---
 R-packages/modeltools/R/matrix.R              | 17 +++++++---------
 .../man/create_train_and_predict_matrices.Rd  | 13 +++---------
 .../modeltools/tests/testthat/test-matrix.R   | 20 +++++++++----------
 3 files changed, 20 insertions(+), 30 deletions(-)

diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R
index 1aee57b5..3ff2f7f2 100644
--- a/R-packages/modeltools/R/matrix.R
+++ b/R-packages/modeltools/R/matrix.R
@@ -4,13 +4,10 @@
 #'     \itemize{
 #'     \item{`geo_value`}{Strings of geographic locations.}
 #'     \item{`time_value`}{Dates of training data.}
-#'     \item{Covariate columns}{Columns with names of the form `value-{days}:{signal}` whose values
-#'         correspond to `{signal}` `{days}` before `time_value`}
-#'     \item{Response columns}{Columns with names of the form `value+{n}:{response}` whose values
-#'         correspond to `{response}` `{n}` incidence period units after `time_value`.  Since
-#'         columns beginning with "value+" are interpreted as the response rather than a
-#'         covariate, observed data from the same day as `time_value` should be in columns
-#'         `value-0:{signal}` rather than `value+0:{signal}`}
+#'     \item{Covariate columns}{Columns with names of the form `value-{days}:{signal}` or
+#'         `value+0:{signal} whose values correspond to `{signal}` `{days}` before `time_value`}
+#'     \item{Response columns}{Columns with names of the form `response+{n}:{response}` whose values
+#'         correspond to `{response}` `{n}` incidence period units after `time_value`.}
 #'     }
 #'     A data frame in this format can be made using `covidcast::aggregate_signals()`.
 #' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast
@@ -24,7 +21,7 @@
 #'         `value-{days}:{signal}` columns in `lagged_df`.  The training data consists of the
 #'         latest date `d` such that there is an observed response at time `d + ahead` and all data
 #'         from the `training_window_size` days prior to it.}
-#'     \item{`train_y`}{Vector of response data from the `value+{ahead}:{response}` column of
+#'     \item{`train_y`}{Vector of response data from the `response+{ahead}:{response}` column of
 #'         `lagged_df` corresponding to the rows of `train_x`.}
 #'     \item{`predict_x`}{Matrix of prediction data in the same format as `train_x`.  The
 #'         prediction data contains the most recent `training_window_size` days.}
@@ -42,7 +39,7 @@ create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_
 
     # Find the last possible date of training data   
     response_end_date <- lagged_df %>%
-        select(time_value, tidyselect::starts_with(sprintf("value+%i:", ahead))) %>%
+        select(time_value, tidyselect::starts_with(sprintf("response+%i:", ahead))) %>%
         tidyr::drop_na() %>%
         summarize(max(time_value)) %>%
         pull()
@@ -59,7 +56,7 @@ create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_
         filter(between(time_value,
                        train_end_date - training_window_size + 1,
                        train_end_date)) %>%
-        select(tidyselect::starts_with(sprintf("value+%i:", ahead))) %>%
+        select(tidyselect::starts_with(sprintf("response+%i:", ahead))) %>%
         pull()
 
     # Prediction matrices
diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
index 9a99dd6c..603e42da 100644
--- a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
+++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
@@ -11,15 +11,8 @@ create_train_and_predict_matrices(lagged_df, ahead, training_window_size)
 \itemize{
 \item{\code{geo_value}}{Strings of geographic locations.}
 \item{\code{time_value}}{Dates of training data.}
-\item{Covariate columns}{Columns with names of the form \code{value-{days}:{signal}} whose values
-correspond to \code{{signal}} \code{{days}} before \code{time_value}}
-\item{Response columns}{Columns with names of the form \code{value+{n}:{response}} whose values
-correspond to \code{{response}} \code{{n}} incidence period units after \code{time_value}.  Since
-columns beginning with "value+" are interpretted as the response rather than a
-covariate, observed data from the same day as \code{time_value} should be in columns
-\code{value-0:{signal}} rather than \code{value+0:{signal}}}
-}
-A data frame in this format can be made using \code{covidcast::aggregate_signals()}.}
+\item{Covariate columns}{Columns with names of the form \code{value-{days}:{signal}} or
+\verb{value+0:\{signal\} whose values correspond to }{signal}\verb{}{days}\code{before}time_value\verb{\} \\item\{Response columns\}\{Columns with names of the form }response+{n}:{response}\verb{whose values correspond to}{response}\verb{}{n}\verb{incidence period units after}time_value\verb{.\} \} A data frame in this format can be made using }covidcast::aggregate_signals()`.}
 
 \item{ahead}{Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast}
 
@@ -34,7 +27,7 @@ Named list with entries:
 \code{value-{days}:{signal}} columns in \code{lagged_df}.  The training data consists of the
 latest date \code{d} such that there is an observed response at time \code{d + ahead} and all data
 from the \code{training_window_size} days prior to it.}
-\item{\code{train_y}}{Vector of response data from the \code{value+{ahead}:{response}} column of
+\item{\code{train_y}}{Vector of response data from the \code{response+{ahead}:{response}} column of
 \code{lagged_df} corresponding to the rows of \code{train_x}.}
 \item{\code{predict_x}}{Matrix of prediction data in the same format as \code{train_x}.  The
 prediction data contains the most recent \code{training_window_size} days.}
diff --git a/R-packages/modeltools/tests/testthat/test-matrix.R b/R-packages/modeltools/tests/testthat/test-matrix.R
index bc4ced9d..3f955659 100644
--- a/R-packages/modeltools/tests/testthat/test-matrix.R
+++ b/R-packages/modeltools/tests/testthat/test-matrix.R
@@ -10,12 +10,12 @@ test_that("training and prediction matrices are created", {
         `value-2:signal_2` = seq(7, 16),
         `value-1:signal_1` = seq(-1, 8),
         `value-1:signal_2` = seq(9, 18),
-        `value-0:signal_1` = seq(1, 10),
-        `value-0:signal_2` = seq(11, 20),
-        `value+1:signal_1` = c(seq(3, 10), rep(NA, 2)),
-        `value+1:signal_2` = c(seq(13, 20), rep(NA, 2)),
-        `value+2:signal_1` = c(seq(5, 10), rep(NA, 4)),
-        `value+2:signal_2` = c(seq(15, 20), rep(NA, 4))
+        `value+0:signal_1` = seq(1, 10),
+        `value+0:signal_2` = seq(11, 20),
+        `response+1:signal_1` = c(seq(3, 10), rep(NA, 2)),
+        `response+1:signal_2` = c(seq(13, 20), rep(NA, 2)),
+        `response+2:signal_1` = c(seq(5, 10), rep(NA, 4)),
+        `response+2:signal_2` = c(seq(15, 20), rep(NA, 4))
     )
 
     out <- create_train_and_predict_matrices(df, 2, 1)
@@ -27,8 +27,8 @@ test_that("training and prediction matrices are created", {
                     `value-2:signal_2` = c(11, 12),
                     `value-1:signal_1` = c(3, 4),
                     `value-1:signal_2` = c(13, 14),
-                    `value-0:signal_1` = c(5, 6),
-                    `value-0:signal_2` = c(15, 16)))
+                    `value+0:signal_1` = c(5, 6),
+                    `value+0:signal_2` = c(15, 16)))
     )
     expect_equal(out$train_y, c(19, 20))
     expect_equal(out$predict_x,
@@ -37,8 +37,8 @@ test_that("training and prediction matrices are created", {
                     `value-2:signal_2` = c(15, 16),
                     `value-1:signal_1` = c(7, 8),
                     `value-1:signal_2` = c(17, 18),
-                    `value-0:signal_1` = c(9, 10),
-                    `value-0:signal_2` = c(19, 20)))
+                    `value+0:signal_1` = c(9, 10),
+                    `value+0:signal_2` = c(19, 20)))
     )
     expect_equal(out$predict_geo_values, c("az", "wv"))
 })

From 641602e26e0ac50aff6fd197d4ed49fee132fcef Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Mon, 22 Mar 2021 13:04:42 -0400
Subject: [PATCH 14/36] code example

---
 R-packages/modeltools/R/response.R               | 15 +++++++++++++++
 .../modeltools/man/add_response_columns.Rd       | 16 ++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/R-packages/modeltools/R/response.R b/R-packages/modeltools/R/response.R
index a5e48048..a190cb17 100644
--- a/R-packages/modeltools/R/response.R
+++ b/R-packages/modeltools/R/response.R
@@ -24,6 +24,21 @@ library(assertthat)
 #' @return Data frame of signals equal to `signals` with `length(ahead)` additional columns with
 #'     names `response+{a}:{response_name}` corresponding to the response variable at ahead `a`. 
 #'
+#' @examples
+#' \dontrun{
+#' add_response_columns(
+#'   tibble(
+#'     geo_value = "mi"
+#'     time_value = seq.Date(as.Date("2020-07-01"), as.Date("2020-07-14"), "day"),
+#'     `value+0:sig` = 1:14
+#'   ),
+#'   "sig",
+#'   as.Date("2020-07-15"),
+#'   "epiweek",
+#'   1:2 
+#' )
+#' }
+#'
 #' @export
 add_response_columns <- function(signals,
                                  response_name,
diff --git a/R-packages/modeltools/man/add_response_columns.Rd b/R-packages/modeltools/man/add_response_columns.Rd
index 2e53ad48..1f2d8ade 100644
--- a/R-packages/modeltools/man/add_response_columns.Rd
+++ b/R-packages/modeltools/man/add_response_columns.Rd
@@ -41,3 +41,19 @@ of days from \code{forecast_date} to the start and end, respectively, of the nex
 \code{incidence_period}.  We choose to take the mean and multiply by the incidence period length
 rather than simply sum the observed values in order to avoid treating missing values as 0.
 }
+\examples{
+\dontrun{
+add_response_columns(
+  tibble(
+    geo_value = "mi"
+    time_value = seq.Date(as.Date("2020-07-01"), as.Date("2020-07-14"), "day"),
+    `value+0:sig` = 1:14
+  ),
+  "sig",
+  as.Date("2020-07-15"),
+  "epiweek",
+  1:2 
+)
+}
+
+}

From f2701ea7ec1801572e40d0384f16c85adfe5d9a1 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Mon, 22 Mar 2021 14:48:12 -0400
Subject: [PATCH 15/36] update documentation

---
 R-packages/modeltools/R/matrix.R              | 26 ++++++++++++++---
 .../man/create_train_and_predict_matrices.Rd  | 28 +++++++++++++++----
 2 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R
index 3ff2f7f2..902ef49f 100644
--- a/R-packages/modeltools/R/matrix.R
+++ b/R-packages/modeltools/R/matrix.R
@@ -1,4 +1,4 @@
-#' Create a training and prediction matrices for a given ahead.
+#' Create training and testing data matrices and a training response vector for a given ahead.
 #'
 #' @param lagged_df Data frame of lagged data.  It should have the following columns:
 #'     \itemize{
@@ -9,7 +9,8 @@
 #'     \item{Response columns}{Columns with names of the form `response+{n}:{response}` whose values
 #'         correspond to `{response}` `{n}` incidence period units after `time_value`.}
 #'     }
-#'     A data frame in this format can be made using `covidcast::aggregate_signals()`.
+#'     A data frame in this format can be made using `covidcast::aggregate_signals()` and
+#'     `modeltools::get_response_columns()`.
 #' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast
 #' @param training_window_size Size of the local training window in days to use. For example, if
 #'     `training_window_size = 14`, then to make a 1-day-ahead forecast on December 15, we train on
@@ -19,8 +20,8 @@
 #'     \itemize{
 #'     \item{`train_x`}{Matrix of training data whose columns correspond to the
 #'         `value-{days}:{signal}` columns in `lagged_df`.  The training data consists of the
-#'         latest date `d` such that there is an observed response at time `d + ahead` and all data
-#'         from the `training_window_size` days prior to it.}
+#'         latest date `d` such that there is an observed response at time `d + ahead *
+#'         incidence_period`, plus all data from the `training_window_size` days prior to it.}
 #'     \item{`train_y`}{Vector of response data from the `response+{ahead}:{response}` column of
 #'         `lagged_df` corresponding to the rows of `train_x`.}
 #'     \item{`predict_x`}{Matrix of prediction data in the same format as `train_x`.  The
@@ -28,6 +29,23 @@
 #'     \item{`predict_geo_values`}{Vector of `geo_values` corresponding to the rows of `predict_x`.}
 #'     }
 #'
+#' @examples \dontrun{
+#' create_train_and_predict_matrices(
+#'   tibble(
+#'     geo_value = rep(c("az", "wv"), 5),
+#'     time_value = rep(
+#'       as.Date(c("2021-01-25", "2021-01-26", "2021-01-27", "2021-01-28", "2021-01-29")),
+#'       each = 2),
+#'     `value-2:signal_1` = seq(-3, 6),
+#'     `value-1:signal_1` = seq(-1, 8),
+#'     `value+0:signal_1` = seq(1, 10),
+#'     `response+2:signal_1` = c(seq(5, 10), rep(NA, 4))
+#'   ),
+#'   ahead = 2,
+#'   training_window_size = 1)
+#' )
+#' }
+#'
 #' @importFrom tibble tibble
 #'
 #' @export 
diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
index 603e42da..78676a96 100644
--- a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
+++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/matrix.R
 \name{create_train_and_predict_matrices}
 \alias{create_train_and_predict_matrices}
-\title{Create a training and prediction matrices for a given ahead.}
+\title{Create training and testing data matrices and a training response vector for a given ahead.}
 \usage{
 create_train_and_predict_matrices(lagged_df, ahead, training_window_size)
 }
@@ -12,7 +12,7 @@ create_train_and_predict_matrices(lagged_df, ahead, training_window_size)
 \item{\code{geo_value}}{Strings of geographic locations.}
 \item{\code{time_value}}{Dates of training data.}
 \item{Covariate columns}{Columns with names of the form \code{value-{days}:{signal}} or
-\verb{value+0:\{signal\} whose values correspond to }{signal}\verb{}{days}\code{before}time_value\verb{\} \\item\{Response columns\}\{Columns with names of the form }response+{n}:{response}\verb{whose values correspond to}{response}\verb{}{n}\verb{incidence period units after}time_value\verb{.\} \} A data frame in this format can be made using }covidcast::aggregate_signals()`.}
+\verb{value+0:\{signal\} whose values correspond to }{signal}\verb{}{days}\code{before}time_value\verb{\} \\item\{Response columns\}\{Columns with names of the form }response+{n}:{response}\verb{whose values correspond to}{response}\verb{}{n}\verb{incidence period units after}time_value\verb{.\} \} A data frame in this format can be made using }covidcast::aggregate_signals()\code{and}modeltools::get_response_columns()`.}
 
 \item{ahead}{Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast}
 
@@ -25,8 +25,7 @@ Named list with entries:
 \itemize{
 \item{\code{train_x}}{Matrix of training data whose columns correspond to the
 \code{value-{days}:{signal}} columns in \code{lagged_df}.  The training data consists of the
-latest date \code{d} such that there is an observed response at time \code{d + ahead} and all data
-from the \code{training_window_size} days prior to it.}
+latest date \code{d} such that there is an observed response at time \code{d + ahead * incidence_period}, plus all data from the \code{training_window_size} days prior to it.}
 \item{\code{train_y}}{Vector of response data from the \code{response+{ahead}:{response}} column of
 \code{lagged_df} corresponding to the rows of \code{train_x}.}
 \item{\code{predict_x}}{Matrix of prediction data in the same format as \code{train_x}.  The
@@ -35,5 +34,24 @@ prediction data contains the most recent \code{training_window_size} days.}
 }
 }
 \description{
-Create a training and prediction matrices for a given ahead.
+Create training and testing data matrices and a training response vector for a given ahead.
+}
+\examples{
+\dontrun{
+create_train_and_predict_matrices(
+  tibble(
+    geo_value = rep(c("az", "wv"), 5),
+    time_value = rep(
+      as.Date(c("2021-01-25", "2021-01-26", "2021-01-27", "2021-01-28", "2021-01-29")),
+      each = 2),
+    `value-2:signal_1` = seq(-3, 6),
+    `value-1:signal_1` = seq(-1, 8),
+    `value+0:signal_1` = seq(1, 10),
+    `response+2:signal_1` = c(seq(5, 10), rep(NA, 4))
+  ),
+  ahead = 2,
+  training_window_size = 1)
+)
+}
+
 }

From 83d9db23145cb0740ffa7a01552a7ebbe5fed404 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Mon, 22 Mar 2021 15:47:48 -0400
Subject: [PATCH 16/36] brief comment updates

---
 R-packages/modeltools/R/matrix.R                          | 8 ++++----
 .../modeltools/man/create_train_and_predict_matrices.Rd   | 3 ++-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R
index 902ef49f..3279f7f2 100644
--- a/R-packages/modeltools/R/matrix.R
+++ b/R-packages/modeltools/R/matrix.R
@@ -20,8 +20,8 @@
 #'     \itemize{
 #'     \item{`train_x`}{Matrix of training data whose columns correspond to the
 #'         `value-{days}:{signal}` columns in `lagged_df`.  The training data consists of the
-#'         latest date `d` such that there is an observed response at time `d + ahead *
-#'         incidence_period`, plus all data from the `training_window_size` days prior to it.}
+#'         latest date with an non-null response, plus all data from the `training_window_size`
+#'         days prior to it.}
 #'     \item{`train_y`}{Vector of response data from the `response+{ahead}:{response}` column of
 #'         `lagged_df` corresponding to the rows of `train_x`.}
 #'     \item{`predict_x`}{Matrix of prediction data in the same format as `train_x`.  The
@@ -53,7 +53,7 @@ create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_
     out <- list()
 
     train_df <- lagged_df %>%
-        select(geo_value, time_value, tidyselect::matches("^value(\\+0|-)"))
+        select(geo_value, time_value, tidyselect::starts_with("value"))
 
     # Find the last possible date of training data   
     response_end_date <- lagged_df %>%
@@ -80,7 +80,7 @@ create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_
     # Prediction matrices
     out$predict_x <- lagged_df %>%
         filter(time_value == max(time_value)) %>%
-        select(tidyselect::matches("^value(\\+0|-)")) %>%
+        select(tidyselect::starts_with("value")) %>%
         as.matrix()
     out$predict_geo_values <- lagged_df %>%
         filter(time_value == max(time_value)) %>%
diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
index 78676a96..b7889bd7 100644
--- a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
+++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
@@ -25,7 +25,8 @@ Named list with entries:
 \itemize{
 \item{\code{train_x}}{Matrix of training data whose columns correspond to the
 \code{value-{days}:{signal}} columns in \code{lagged_df}.  The training data consists of the
-latest date \code{d} such that there is an observed response at time \code{d + ahead * incidence_period}, plus all data from the \code{training_window_size} days prior to it.}
+latest date with an non-null response, plus all data from the \code{training_window_size}
+days prior to it.}
 \item{\code{train_y}}{Vector of response data from the \code{response+{ahead}:{response}} column of
 \code{lagged_df} corresponding to the rows of \code{train_x}.}
 \item{\code{predict_x}}{Matrix of prediction data in the same format as \code{train_x}.  The

From ce50871d62695720400b81697a82dec81817d056 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Mon, 22 Mar 2021 16:02:30 -0400
Subject: [PATCH 17/36] add test for multiple aheads

---
 R-packages/modeltools/NAMESPACE               |  1 +
 R-packages/modeltools/R/matrix.R              |  8 ++++++
 .../modeltools/tests/testthat/test-matrix.R   | 25 +++++++++++++++++--
 3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/R-packages/modeltools/NAMESPACE b/R-packages/modeltools/NAMESPACE
index 9d353a1c..7786c780 100644
--- a/R-packages/modeltools/NAMESPACE
+++ b/R-packages/modeltools/NAMESPACE
@@ -13,6 +13,7 @@ export(pct_change)
 export(quantgen_forecaster)
 export(quiet)
 export(slide_by_geo)
+importFrom(assertthat,assert_that)
 importFrom(dplyr,"%>%")
 importFrom(dplyr,arrange)
 importFrom(dplyr,between)
diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R
index 3279f7f2..d7a1e7c9 100644
--- a/R-packages/modeltools/R/matrix.R
+++ b/R-packages/modeltools/R/matrix.R
@@ -47,11 +47,19 @@
 #' }
 #'
 #' @importFrom tibble tibble
+#' @importFrom assertthat assert_that
 #'
 #' @export 
 create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_size) {
     out <- list()
 
+    # make sure the response columns are unique
+    responses_at_ahead <- lagged_df %>%
+        select(tidyselect::starts_with(sprintf("response+%i:", ahead))) %>%
+        ncol()
+    assert_that(responses_at_ahead == 1,
+                msg=paste("multiple responses at ahead =",ahead))
+
     train_df <- lagged_df %>%
         select(geo_value, time_value, tidyselect::starts_with("value"))
 
diff --git a/R-packages/modeltools/tests/testthat/test-matrix.R b/R-packages/modeltools/tests/testthat/test-matrix.R
index 3f955659..2dcf3e5e 100644
--- a/R-packages/modeltools/tests/testthat/test-matrix.R
+++ b/R-packages/modeltools/tests/testthat/test-matrix.R
@@ -12,9 +12,7 @@ test_that("training and prediction matrices are created", {
         `value-1:signal_2` = seq(9, 18),
         `value+0:signal_1` = seq(1, 10),
         `value+0:signal_2` = seq(11, 20),
-        `response+1:signal_1` = c(seq(3, 10), rep(NA, 2)),
         `response+1:signal_2` = c(seq(13, 20), rep(NA, 2)),
-        `response+2:signal_1` = c(seq(5, 10), rep(NA, 4)),
         `response+2:signal_2` = c(seq(15, 20), rep(NA, 4))
     )
 
@@ -42,3 +40,26 @@ test_that("training and prediction matrices are created", {
     )
     expect_equal(out$predict_geo_values, c("az", "wv"))
 })
+
+test_that("fails with multiple responses", {
+    df <- tibble(
+        geo_value = rep(c("az", "wv"), 5),
+        time_value = rep(
+            as.Date(c("2021-01-25", "2021-01-26", "2021-01-27", "2021-01-28", "2021-01-29")),
+            each = 2),
+        `value-2:signal_1` = seq(-3, 6),
+        `value-2:signal_2` = seq(7, 16),
+        `value-1:signal_1` = seq(-1, 8),
+        `value-1:signal_2` = seq(9, 18),
+        `value+0:signal_1` = seq(1, 10),
+        `value+0:signal_2` = seq(11, 20),
+        `response+1:signal_1` = c(seq(3, 10), rep(NA, 2)),
+        `response+1:signal_2` = c(seq(13, 20), rep(NA, 2)),
+        `response+2:signal_1` = c(seq(5, 10), rep(NA, 4)),
+        `response+2:signal_2` = c(seq(15, 20), rep(NA, 4))
+    )
+    expect_error(
+        create_train_and_predict_matrices(df, 2, 1),
+        "multiple responses at ahead = 2"
+    )
+})

From 98bb3475fe4f28daf7846159398de47afe71d972 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Mon, 22 Mar 2021 19:57:54 -0400
Subject: [PATCH 18/36] remove library call

---
 R-packages/modeltools/NAMESPACE    | 1 +
 R-packages/modeltools/R/response.R | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/R-packages/modeltools/NAMESPACE b/R-packages/modeltools/NAMESPACE
index 53039634..10575f99 100644
--- a/R-packages/modeltools/NAMESPACE
+++ b/R-packages/modeltools/NAMESPACE
@@ -13,6 +13,7 @@ export(pct_change)
 export(quantgen_forecaster)
 export(quiet)
 export(slide_by_geo)
+importFrom(assertthat,assert_that)
 importFrom(dplyr,"%>%")
 importFrom(dplyr,arrange)
 importFrom(dplyr,between)
diff --git a/R-packages/modeltools/R/response.R b/R-packages/modeltools/R/response.R
index a190cb17..eacc9228 100644
--- a/R-packages/modeltools/R/response.R
+++ b/R-packages/modeltools/R/response.R
@@ -1,5 +1,3 @@
-library(assertthat)
-
 #' Compute the corresponding observed response values for a set of signals.
 #'
 #' The response for an observed data point at time `t` is defined as the mean of observed values on
@@ -39,6 +37,8 @@ library(assertthat)
 #' )
 #' }
 #'
+#' @importFrom assertthat assert_that
+#'
 #' @export
 add_response_columns <- function(signals,
                                  response_name,

From 70a11d445f619edd36c67c436324f3e4bb6bcf73 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Mon, 22 Mar 2021 20:16:47 -0400
Subject: [PATCH 19/36] update description

---
 R-packages/modeltools/DESCRIPTION | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R-packages/modeltools/DESCRIPTION b/R-packages/modeltools/DESCRIPTION
index ccbff0e8..c3af2159 100755
--- a/R-packages/modeltools/DESCRIPTION
+++ b/R-packages/modeltools/DESCRIPTION
@@ -19,7 +19,8 @@ RoxygenNote: 7.1.1
 Remotes:
     github::cmu-delphi/covidcast/R-packages/covidcast@main,
     github::cmu-delphi/covidcast/R-packages/evalcast@main
-Imports: 
+Imports:
+    assertthat,
     covidcast,
     dplyr,
     evalcast,

From be9a5357a2c62425789448db3702baba11e38be4 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Wed, 24 Mar 2021 15:11:22 -0400
Subject: [PATCH 20/36] make example forecaster

---
 R-packages/modeltools/NAMESPACE              |  3 +-
 R-packages/modeltools/R/example_forecaster.R | 31 ++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 R-packages/modeltools/R/example_forecaster.R

diff --git a/R-packages/modeltools/NAMESPACE b/R-packages/modeltools/NAMESPACE
index 1f8a03bf..1ed016f6 100644
--- a/R-packages/modeltools/NAMESPACE
+++ b/R-packages/modeltools/NAMESPACE
@@ -7,9 +7,10 @@ export(Median)
 export(Min)
 export(Start)
 export(Sum)
-export(create_train_and_predict_matrices)
 export(add_response_columns)
+export(create_train_and_predict_matrices)
 export(estimate_deriv)
+export(example_forecaster)
 export(pct_change)
 export(quantgen_forecaster)
 export(quiet)
diff --git a/R-packages/modeltools/R/example_forecaster.R b/R-packages/modeltools/R/example_forecaster.R
new file mode 100644
index 00000000..74c6df55
--- /dev/null
+++ b/R-packages/modeltools/R/example_forecaster.R
@@ -0,0 +1,31 @@
+#' @export
+example_forecaster <- function(df_list, forecast_date) {
+    ahead <- 1:4
+    lags <- c(0, -1, -2, -3, -7, -14)
+    quantiles <- modeltools::covidhub_probs()
+
+    covariates <- covidcast::aggregate_signals(df_list, dt = lags, format = "wide")
+    covariates_with_response <- add_response_columns(covariates,
+                                                     "jhu-csse_confirmed_incidence_num",
+                                                     forecast_date,
+                                                     "epiweek",
+                                                     ahead)
+
+    results <- list()
+    for (a in ahead) {
+        mats <- create_train_and_predict_matrices(covariates_with_response, a, 14)
+        model <- quantgen::quantile_lasso(mats$train_x, mats$train_y, quantiles, lambda = 0)
+        predictions <- quantgen:::predict.quantile_genlasso(model, mats$predict_x)
+        print(predictions)
+        colnames(predictions) <- quantiles
+        predict_df <- bind_cols(geo_value = mats$predict_geo_values,
+                                predictions) %>%
+            pivot_longer(cols = -geo_value,
+                         names_to = "quantile",
+                         values_to = "value") %>%
+            mutate(ahead = a)
+
+        results[[a]] <- predict_df
+    }
+    return(bind_rows(results))
+}
\ No newline at end of file

From fb76ee0fc5ecd8a8e01cbccddd3ebb1b97798067 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Wed, 24 Mar 2021 15:43:38 -0400
Subject: [PATCH 21/36] update example_forecaster documentation

---
 R-packages/modeltools/R/example_forecaster.R  | 16 +++++++++++
 .../modeltools/man/example_forecaster.Rd      | 27 +++++++++++++++++++
 2 files changed, 43 insertions(+)
 create mode 100644 R-packages/modeltools/man/example_forecaster.Rd

diff --git a/R-packages/modeltools/R/example_forecaster.R b/R-packages/modeltools/R/example_forecaster.R
index 74c6df55..b9980abd 100644
--- a/R-packages/modeltools/R/example_forecaster.R
+++ b/R-packages/modeltools/R/example_forecaster.R
@@ -1,3 +1,19 @@
+#' Simple example forecaster that forecasts confirmed cases.
+#'
+#' @param df_list List of downloaded covidcast signals
+#' @param forecast_date Date from which the forecast should be made.
+#'
+#' preds <- evalcast::get_predictions(modeltools::example_forecaster,
+#'                                    "example",
+#'                                    tibble(
+#'                                      data_source = "jhu-csse",
+#'                                      signal = "confirmed_incidence_num",
+#'                                      geo_type = "state",
+#'                                      start_date = as.Date("2021-01-01"),
+#'                                    ),
+#'                                    as.Date("2021-03-01"),
+#'                                    "epiweek")
+#'
 #' @export
 example_forecaster <- function(df_list, forecast_date) {
     ahead <- 1:4
diff --git a/R-packages/modeltools/man/example_forecaster.Rd b/R-packages/modeltools/man/example_forecaster.Rd
new file mode 100644
index 00000000..b743fb09
--- /dev/null
+++ b/R-packages/modeltools/man/example_forecaster.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/example_forecaster.R
+\name{example_forecaster}
+\alias{example_forecaster}
+\title{Simple example forecaster that forecasts confirmed cases.}
+\usage{
+example_forecaster(df_list, forecast_date)
+}
+\arguments{
+\item{df_list}{List of downloaded covidcast signals}
+
+\item{forecast_date}{Date from which the forecast should be made.
+
+preds <- evalcast::get_predictions(modeltools::example_forecaster,
+"example",
+tibble(
+data_source = "jhu-csse",
+signal = "confirmed_incidence_num",
+geo_type = "state",
+start_date = as.Date("2021-01-01"),
+),
+as.Date("2021-03-01"),
+"epiweek")}
+}
+\description{
+Simple example forecaster that forecasts confirmed cases.
+}

From ee2e40b9342a23137c694f75e550cca1ff787dbb Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Wed, 24 Mar 2021 15:44:54 -0400
Subject: [PATCH 22/36] update quantgen forecaster to use new signature

---
 R-packages/modeltools/R/quantgen.R               | 14 +++++++-------
 R-packages/modeltools/man/quantgen_forecaster.Rd |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/R-packages/modeltools/R/quantgen.R b/R-packages/modeltools/R/quantgen.R
index 8364156c..386e1585 100644
--- a/R-packages/modeltools/R/quantgen.R
+++ b/R-packages/modeltools/R/quantgen.R
@@ -6,7 +6,7 @@
 #' vignette](https://cmu-delphi.github.io/covidcast/modeltoolsR/articles/quantgen-forecast.html)
 #' for examples.
 #' 
-#' @param df Data frame of signal values to use for forecasting, of the format
+#' @param df_list List of data frames of signal values to use for forecasting, of the format
 #'   that is returned by [covidcast::covidcast_signals()].
 #' @param forecast_date Date object or string of the form "YYYY-MM-DD",
 #'   indicating the date on which forecasts will be made. For example, if
@@ -109,7 +109,7 @@
 #' @importFrom dplyr filter select pull summarize between bind_cols
 #' @importFrom tidyr pivot_longer
 #' @export
-quantgen_forecaster = function(df, forecast_date, signals, incidence_period,
+quantgen_forecaster = function(df_list, forecast_date, signals, incidence_period,
                                ahead, geo_type,
                                n = 4 * ifelse(incidence_period == "day", 7, 1),
                                lags = 0, tau = modeltools::covidhub_probs,
@@ -138,8 +138,8 @@ quantgen_forecaster = function(df, forecast_date, signals, incidence_period,
     if (is.null(inv_trans)) {
       stop("If `transform` is specified, then `inv_trans` must be as well.")
     }
-    for (i in 1:length(df)) {
-      df[[i]] = df[[i]] %>% mutate(value = transform[[i]](value))
+    for (i in 1:length(df_list)) {
+      df_list[[i]] = df_list[[i]] %>% mutate(value = transform[[i]](value))
     }
   }
 
@@ -147,9 +147,9 @@ quantgen_forecaster = function(df, forecast_date, signals, incidence_period,
   # shift, for each ahead value, for convenience later 
   dt = lapply(lags, "-")
   dt[[1]] = c(dt[[1]], ahead)
-    
+
   # Append shifts, and aggregate into wide format
-  df_wide = covidcast::aggregate_signals(df, dt = dt, format = "wide")
+  df_wide = covidcast::aggregate_signals(df_list, dt = dt, format = "wide")
   
   # Separate out into feature data frame, featurize if we need to
   df_features = df_wide %>%
@@ -214,7 +214,7 @@ quantgen_forecaster = function(df, forecast_date, signals, incidence_period,
                      train_end_date - n + 1,
                      train_end_date)) %>%
       select(tidyselect::starts_with(sprintf("value+%i:", a))) %>% pull()
-    
+
     # Define noncrossing constraints, if we need to
     if (noncross) {
       if (noncross_points == "all") x0 = rbind(x, newx)
diff --git a/R-packages/modeltools/man/quantgen_forecaster.Rd b/R-packages/modeltools/man/quantgen_forecaster.Rd
index 816108c2..b810303d 100644
--- a/R-packages/modeltools/man/quantgen_forecaster.Rd
+++ b/R-packages/modeltools/man/quantgen_forecaster.Rd
@@ -5,7 +5,7 @@
 \title{Simple quantile autoregressive forecaster based on \code{quantgen}}
 \usage{
 quantgen_forecaster(
-  df,
+  df_list,
   forecast_date,
   signals,
   incidence_period,
@@ -25,7 +25,7 @@ quantgen_forecaster(
 )
 }
 \arguments{
-\item{df}{Data frame of signal values to use for forecasting, of the format
+\item{df_list}{List of data frames of signal values to use for forecasting, of the format
 that is returned by \code{\link[covidcast:covidcast_signals]{covidcast::covidcast_signals()}}.}
 
 \item{forecast_date}{Date object or string of the form "YYYY-MM-DD",

From bd2a3f71397171bf83d0065feb97d1e11cc60ae0 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Wed, 24 Mar 2021 15:47:03 -0400
Subject: [PATCH 23/36] cleanup

---
 R-packages/modeltools/R/example_forecaster.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R-packages/modeltools/R/example_forecaster.R b/R-packages/modeltools/R/example_forecaster.R
index b9980abd..1dda5c90 100644
--- a/R-packages/modeltools/R/example_forecaster.R
+++ b/R-packages/modeltools/R/example_forecaster.R
@@ -32,7 +32,7 @@ example_forecaster <- function(df_list, forecast_date) {
         mats <- create_train_and_predict_matrices(covariates_with_response, a, 14)
         model <- quantgen::quantile_lasso(mats$train_x, mats$train_y, quantiles, lambda = 0)
         predictions <- quantgen:::predict.quantile_genlasso(model, mats$predict_x)
-        print(predictions)
+
         colnames(predictions) <- quantiles
         predict_df <- bind_cols(geo_value = mats$predict_geo_values,
                                 predictions) %>%
@@ -44,4 +44,4 @@ example_forecaster <- function(df_list, forecast_date) {
         results[[a]] <- predict_df
     }
     return(bind_rows(results))
-}
\ No newline at end of file
+}

From 75dc1b4dba1fa22e9cfe0b57aa21cef709fb6812 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Thu, 25 Mar 2021 10:58:38 -0400
Subject: [PATCH 24/36] move values to arguments

---
 R-packages/modeltools/NAMESPACE               |  2 ++
 R-packages/modeltools/R/example_forecaster.R  | 23 +++++++++++++------
 .../modeltools/man/example_forecaster.Rd      | 22 +++++++++++++++---
 3 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/R-packages/modeltools/NAMESPACE b/R-packages/modeltools/NAMESPACE
index 1ed016f6..23705baa 100644
--- a/R-packages/modeltools/NAMESPACE
+++ b/R-packages/modeltools/NAMESPACE
@@ -17,9 +17,11 @@ export(quiet)
 export(slide_by_geo)
 importFrom(assertthat,assert_that)
 importFrom(dplyr,"%>%")
+importFrom(dplyr,"bind_cols,")
 importFrom(dplyr,arrange)
 importFrom(dplyr,between)
 importFrom(dplyr,bind_cols)
+importFrom(dplyr,bind_rows)
 importFrom(dplyr,filter)
 importFrom(dplyr,group_by)
 importFrom(dplyr,group_modify)
diff --git a/R-packages/modeltools/R/example_forecaster.R b/R-packages/modeltools/R/example_forecaster.R
index 1dda5c90..596d248f 100644
--- a/R-packages/modeltools/R/example_forecaster.R
+++ b/R-packages/modeltools/R/example_forecaster.R
@@ -1,7 +1,12 @@
 #' Simple example forecaster that forecasts confirmed cases.
 #'
-#' @param df_list List of downloaded covidcast signals
+#' @param df_list List of downloaded covidcast signals.
 #' @param forecast_date Date from which the forecast should be made.
+#' @param ahead Vector of epiweeks ahead to predict.
+#' @param training_window_size Number of days prior to `forecast_date` to use for training.
+#' @param lags Vector of lags or list of vectors of lags to apply to `df_list`.  See documentation
+#'     of `dt` argument to `covidcast::aggregate_signals()` for details.
+#' @param quantiles Vector of quantile values at which to predict
 #'
 #' preds <- evalcast::get_predictions(modeltools::example_forecaster,
 #'                                    "example",
@@ -14,12 +19,16 @@
 #'                                    as.Date("2021-03-01"),
 #'                                    "epiweek")
 #'
+#' @importFrom dplyr mutate bind_cols, bind_rows
+#' @importFrom tidyr pivot_longer
+#'
 #' @export
-example_forecaster <- function(df_list, forecast_date) {
-    ahead <- 1:4
-    lags <- c(0, -1, -2, -3, -7, -14)
-    quantiles <- modeltools::covidhub_probs()
-
+example_forecaster <- function(df_list,
+                               forecast_date,
+                               ahead = 1:4,
+                               training_window_size = 14,
+                               lags = seq(0, -1*training_window_size, -7),
+                               quantiles = modeltools::covidhub_probs) {
     covariates <- covidcast::aggregate_signals(df_list, dt = lags, format = "wide")
     covariates_with_response <- add_response_columns(covariates,
                                                      "jhu-csse_confirmed_incidence_num",
@@ -29,7 +38,7 @@ example_forecaster <- function(df_list, forecast_date) {
 
     results <- list()
     for (a in ahead) {
-        mats <- create_train_and_predict_matrices(covariates_with_response, a, 14)
+        mats <- create_train_and_predict_matrices(covariates_with_response, a, training_window_size)
         model <- quantgen::quantile_lasso(mats$train_x, mats$train_y, quantiles, lambda = 0)
         predictions <- quantgen:::predict.quantile_genlasso(model, mats$predict_x)
 
diff --git a/R-packages/modeltools/man/example_forecaster.Rd b/R-packages/modeltools/man/example_forecaster.Rd
index b743fb09..03143770 100644
--- a/R-packages/modeltools/man/example_forecaster.Rd
+++ b/R-packages/modeltools/man/example_forecaster.Rd
@@ -4,12 +4,28 @@
 \alias{example_forecaster}
 \title{Simple example forecaster that forecasts confirmed cases.}
 \usage{
-example_forecaster(df_list, forecast_date)
+example_forecaster(
+  df_list,
+  forecast_date,
+  ahead = 1:4,
+  training_window_size = 14,
+  lags = seq(0, -1 * training_window_size, -7),
+  quantiles = modeltools::covidhub_probs
+)
 }
 \arguments{
-\item{df_list}{List of downloaded covidcast signals}
+\item{df_list}{List of downloaded covidcast signals.}
 
-\item{forecast_date}{Date from which the forecast should be made.
+\item{forecast_date}{Date from which the forecast should be made.}
+
+\item{ahead}{Vector of epiweeks ahead to predict.}
+
+\item{training_window_size}{Number of days prior to \code{forecast_date} to use for training.}
+
+\item{lags}{Vector of lags or list of vectors of lags to apply to \code{df_list}.  See documentation
+of \code{dt} argument to \code{covidcast::aggregate_signals()} for details.}
+
+\item{quantiles}{Vector of quantile values at which to predict
 
 preds <- evalcast::get_predictions(modeltools::example_forecaster,
 "example",

From 0927475e300597e5fb072833dfc55d71a74da1c9 Mon Sep 17 00:00:00 2001
From: Mike O'Brien <mikeob@google.com>
Date: Fri, 26 Mar 2021 08:57:37 -0400
Subject: [PATCH 25/36] remove stray comma

---
 R-packages/modeltools/NAMESPACE              | 1 -
 R-packages/modeltools/R/example_forecaster.R | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/R-packages/modeltools/NAMESPACE b/R-packages/modeltools/NAMESPACE
index 23705baa..62e60157 100644
--- a/R-packages/modeltools/NAMESPACE
+++ b/R-packages/modeltools/NAMESPACE
@@ -17,7 +17,6 @@ export(quiet)
 export(slide_by_geo)
 importFrom(assertthat,assert_that)
 importFrom(dplyr,"%>%")
-importFrom(dplyr,"bind_cols,")
 importFrom(dplyr,arrange)
 importFrom(dplyr,between)
 importFrom(dplyr,bind_cols)
diff --git a/R-packages/modeltools/R/example_forecaster.R b/R-packages/modeltools/R/example_forecaster.R
index 596d248f..4b6ccc60 100644
--- a/R-packages/modeltools/R/example_forecaster.R
+++ b/R-packages/modeltools/R/example_forecaster.R
@@ -19,7 +19,7 @@
 #'                                    as.Date("2021-03-01"),
 #'                                    "epiweek")
 #'
-#' @importFrom dplyr mutate bind_cols, bind_rows
+#' @importFrom dplyr mutate bind_cols bind_rows
 #' @importFrom tidyr pivot_longer
 #'
 #' @export

From 6cd72a7eb4f46694465169b04e4195b9c26528ed Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <dajmcdon@gmail.com>
Date: Fri, 26 Mar 2021 14:54:01 -0700
Subject: [PATCH 26/36] export train_predict_matrices

---
 R-packages/modeltools/DESCRIPTION                         | 8 +++++---
 R-packages/modeltools/R/matrix.R                          | 6 ++++--
 .../modeltools/man/create_train_and_predict_matrices.Rd   | 1 +
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/R-packages/modeltools/DESCRIPTION b/R-packages/modeltools/DESCRIPTION
index c3af2159..d9426422 100755
--- a/R-packages/modeltools/DESCRIPTION
+++ b/R-packages/modeltools/DESCRIPTION
@@ -17,8 +17,9 @@ LazyData: true
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.1.1
 Remotes:
-    github::cmu-delphi/covidcast/R-packages/covidcast@main,
-    github::cmu-delphi/covidcast/R-packages/evalcast@main
+    cmu-delphi/covidcast/R-packages/covidcast@main,
+    cmu-delphi/covidcast/R-packages/evalcast@main
+    ryantibs/quantgen/R-package/quantgen@master,
 Imports:
     assertthat,
     covidcast,
@@ -27,4 +28,5 @@ Imports:
     genlasso,
     slider,
     tidyselect,
-    tidyr
+    tidyr,
+    quantgen
diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R
index d7a1e7c9..7070c738 100644
--- a/R-packages/modeltools/R/matrix.R
+++ b/R-packages/modeltools/R/matrix.R
@@ -27,6 +27,7 @@
 #'     \item{`predict_x`}{Matrix of prediction data in the same format as `train_x`.  The
 #'         prediction data contains the most recent `training_window_size` days.}
 #'     \item{`predict_geo_values`}{Vector of `geo_values` corresponding to the rows of `predict_x`.}
+#'     \item{`train_end_date`}{latest `time_value` used in the training period}
 #'     }
 #'
 #' @examples \dontrun{
@@ -49,7 +50,7 @@
 #' @importFrom tibble tibble
 #' @importFrom assertthat assert_that
 #'
-#' @export 
+#' @export
 create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_size) {
     out <- list()
 
@@ -63,7 +64,7 @@ create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_
     train_df <- lagged_df %>%
         select(geo_value, time_value, tidyselect::starts_with("value"))
 
-    # Find the last possible date of training data   
+    # Find the last possible date of training data
     response_end_date <- lagged_df %>%
         select(time_value, tidyselect::starts_with(sprintf("response+%i:", ahead))) %>%
         tidyr::drop_na() %>%
@@ -93,6 +94,7 @@ create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_
     out$predict_geo_values <- lagged_df %>%
         filter(time_value == max(time_value)) %>%
         select(geo_value) %>% pull()
+    out$train_end_date <- train_end_date
 
     return(out)
 }
diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
index b7889bd7..6f552ed3 100644
--- a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
+++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
@@ -32,6 +32,7 @@ days prior to it.}
 \item{\code{predict_x}}{Matrix of prediction data in the same format as \code{train_x}.  The
 prediction data contains the most recent \code{training_window_size} days.}
 \item{\code{predict_geo_values}}{Vector of \code{geo_values} corresponding to the rows of \code{predict_x}.}
+\item{\code{train_end_date}}{latest \code{time_value} used in the training period}
 }
 }
 \description{

From 08c5b66c8bea7d07bbff2a05c99039ad3305feb8 Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <dajmcdon@gmail.com>
Date: Fri, 26 Mar 2021 14:55:15 -0700
Subject: [PATCH 27/36] fix documentation

---
 R-packages/modeltools/R/matrix.R                               | 1 -
 R-packages/modeltools/man/create_train_and_predict_matrices.Rd | 1 -
 2 files changed, 2 deletions(-)

diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R
index 7070c738..77db82fc 100644
--- a/R-packages/modeltools/R/matrix.R
+++ b/R-packages/modeltools/R/matrix.R
@@ -44,7 +44,6 @@
 #'   ),
 #'   ahead = 2,
 #'   training_window_size = 1)
-#' )
 #' }
 #'
 #' @importFrom tibble tibble
diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
index 6f552ed3..189af173 100644
--- a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
+++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
@@ -53,7 +53,6 @@ create_train_and_predict_matrices(
   ),
   ahead = 2,
   training_window_size = 1)
-)
 }
 
 }

From da8b866231f00ab6dc7ffd13046345a5e5aff4b3 Mon Sep 17 00:00:00 2001
From: Kenneth Tay <kjytay@gmail.com>
Date: Thu, 1 Apr 2021 13:42:42 -0700
Subject: [PATCH 28/36] Fix failing test in test-matrix.R.

---
 R-packages/modeltools/tests/testthat/test-matrix.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R-packages/modeltools/tests/testthat/test-matrix.R b/R-packages/modeltools/tests/testthat/test-matrix.R
index 2dcf3e5e..88c04541 100644
--- a/R-packages/modeltools/tests/testthat/test-matrix.R
+++ b/R-packages/modeltools/tests/testthat/test-matrix.R
@@ -18,7 +18,8 @@ test_that("training and prediction matrices are created", {
 
     out <- create_train_and_predict_matrices(df, 2, 1)
 
-    expect_equal(names(out), c("train_x", "train_y", "predict_x", "predict_geo_values"))
+    expect_equal(names(out), c("train_x", "train_y", "predict_x", 
+                               "predict_geo_values", "train_end_date"))
     expect_equal(out$train_x,
                  as.matrix(tibble(
                     `value-2:signal_1` = c(1, 2),

From d907095d617566b86a977b063853767c02501b33 Mon Sep 17 00:00:00 2001
From: Kenneth Tay <kjytay@gmail.com>
Date: Thu, 1 Apr 2021 13:53:38 -0700
Subject: [PATCH 29/36] Fix documentation for
 create_train_and_predict_matrices().

---
 R-packages/modeltools/R/matrix.R              | 30 +++++++++---------
 .../man/create_train_and_predict_matrices.Rd  | 31 +++++++++++--------
 2 files changed, 33 insertions(+), 28 deletions(-)

diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R
index 77db82fc..280ae6b3 100644
--- a/R-packages/modeltools/R/matrix.R
+++ b/R-packages/modeltools/R/matrix.R
@@ -1,13 +1,13 @@
 #' Create training and testing data matrices and a training response vector for a given ahead.
 #'
-#' @param lagged_df Data frame of lagged data.  It should have the following columns:
+#' @param lagged_df Data frame of lagged data. It should have the following columns:
 #'     \itemize{
-#'     \item{`geo_value`}{Strings of geographic locations.}
-#'     \item{`time_value`}{Dates of training data.}
-#'     \item{Covariate columns}{Columns with names of the form `value-{days}:{signal}` or
-#'         `value+0:{signal} whose values correspond to `{signal}` `{days}` before `time_value`}
-#'     \item{Response columns}{Columns with names of the form `response+{n}:{response}` whose values
-#'         correspond to `{response}` `{n}` incidence period units after `time_value`.}
+#'     \item `geo_value`: Strings of geographic locations.
+#'     \item `time_value`: Dates of training data.
+#'     \item Covariate columns: Columns with names of the form `value-{days}:{signal}` or
+#'         `value+0:{signal}` whose values correspond to `{signal}` `{days}` before `time_value`.
+#'     \item Response columns: Columns with names of the form `response+{n}:{response}` whose values
+#'         correspond to `{response}` `{n}` incidence period units after `time_value`.
 #'     }
 #'     A data frame in this format can be made using `covidcast::aggregate_signals()` and
 #'     `modeltools::get_response_columns()`.
@@ -18,16 +18,16 @@
 #'
 #' @return Named list with entries:
 #'     \itemize{
-#'     \item{`train_x`}{Matrix of training data whose columns correspond to the
+#'     \item `train_x`: Matrix of training data whose columns correspond to the
 #'         `value-{days}:{signal}` columns in `lagged_df`.  The training data consists of the
 #'         latest date with an non-null response, plus all data from the `training_window_size`
-#'         days prior to it.}
-#'     \item{`train_y`}{Vector of response data from the `response+{ahead}:{response}` column of
-#'         `lagged_df` corresponding to the rows of `train_x`.}
-#'     \item{`predict_x`}{Matrix of prediction data in the same format as `train_x`.  The
-#'         prediction data contains the most recent `training_window_size` days.}
-#'     \item{`predict_geo_values`}{Vector of `geo_values` corresponding to the rows of `predict_x`.}
-#'     \item{`train_end_date`}{latest `time_value` used in the training period}
+#'         days prior to it.
+#'     \item `train_y`: Vector of response data from the `response+{ahead}:{response}` column of
+#'         `lagged_df` corresponding to the rows of `train_x`.
+#'     \item `predict_x`: Matrix of prediction data in the same format as `train_x`.  The
+#'         prediction data contains the most recent `training_window_size` days.
+#'     \item `predict_geo_values`: Vector of `geo_values` corresponding to the rows of `predict_x`.
+#'     \item `train_end_date`: Latest `time_value` used in the training period.
 #'     }
 #'
 #' @examples \dontrun{
diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
index 189af173..1b6b02fe 100644
--- a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
+++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
@@ -7,12 +7,17 @@
 create_train_and_predict_matrices(lagged_df, ahead, training_window_size)
 }
 \arguments{
-\item{lagged_df}{Data frame of lagged data.  It should have the following columns:
+\item{lagged_df}{Data frame of lagged data. It should have the following columns:
 \itemize{
-\item{\code{geo_value}}{Strings of geographic locations.}
-\item{\code{time_value}}{Dates of training data.}
-\item{Covariate columns}{Columns with names of the form \code{value-{days}:{signal}} or
-\verb{value+0:\{signal\} whose values correspond to }{signal}\verb{}{days}\code{before}time_value\verb{\} \\item\{Response columns\}\{Columns with names of the form }response+{n}:{response}\verb{whose values correspond to}{response}\verb{}{n}\verb{incidence period units after}time_value\verb{.\} \} A data frame in this format can be made using }covidcast::aggregate_signals()\code{and}modeltools::get_response_columns()`.}
+\item \code{geo_value}: Strings of geographic locations.
+\item \code{time_value}: Dates of training data.
+\item Covariate columns: Columns with names of the form \code{value-{days}:{signal}} or
+\code{value+0:{signal}} whose values correspond to \code{{signal}} \code{{days}} before \code{time_value}.
+\item Response columns: Columns with names of the form \code{response+{n}:{response}} whose values
+correspond to \code{{response}} \code{{n}} incidence period units after \code{time_value}.
+}
+A data frame in this format can be made using \code{covidcast::aggregate_signals()} and
+\code{modeltools::get_response_columns()}.}
 
 \item{ahead}{Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast}
 
@@ -23,16 +28,16 @@ data from December 1 to December 14.}
 \value{
 Named list with entries:
 \itemize{
-\item{\code{train_x}}{Matrix of training data whose columns correspond to the
+\item \code{train_x}: Matrix of training data whose columns correspond to the
 \code{value-{days}:{signal}} columns in \code{lagged_df}.  The training data consists of the
 latest date with an non-null response, plus all data from the \code{training_window_size}
-days prior to it.}
-\item{\code{train_y}}{Vector of response data from the \code{response+{ahead}:{response}} column of
-\code{lagged_df} corresponding to the rows of \code{train_x}.}
-\item{\code{predict_x}}{Matrix of prediction data in the same format as \code{train_x}.  The
-prediction data contains the most recent \code{training_window_size} days.}
-\item{\code{predict_geo_values}}{Vector of \code{geo_values} corresponding to the rows of \code{predict_x}.}
-\item{\code{train_end_date}}{latest \code{time_value} used in the training period}
+days prior to it.
+\item \code{train_y}: Vector of response data from the \code{response+{ahead}:{response}} column of
+\code{lagged_df} corresponding to the rows of \code{train_x}.
+\item \code{predict_x}: Matrix of prediction data in the same format as \code{train_x}.  The
+prediction data contains the most recent \code{training_window_size} days.
+\item \code{predict_geo_values}: Vector of \code{geo_values} corresponding to the rows of \code{predict_x}.
+\item \code{train_end_date}: Latest \code{time_value} used in the training period.
 }
 }
 \description{

From 90c85e67a6ca3ead4857879a6242d0f24eb11a8d Mon Sep 17 00:00:00 2001
From: Kenneth Tay <kjytay@gmail.com>
Date: Thu, 1 Apr 2021 22:24:45 -0700
Subject: [PATCH 30/36] Modify create_train_and_predict_matrices() to create
 matrices for multiple ahead values.

---
 R-packages/modeltools/R/matrix.R              | 110 ++++++++++--------
 .../man/create_train_and_predict_matrices.Rd  |  10 +-
 .../modeltools/tests/testthat/test-matrix.R   |  29 ++++-
 3 files changed, 98 insertions(+), 51 deletions(-)

diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R
index 280ae6b3..1b2c1fe7 100644
--- a/R-packages/modeltools/R/matrix.R
+++ b/R-packages/modeltools/R/matrix.R
@@ -1,4 +1,7 @@
-#' Create training and testing data matrices and a training response vector for a given ahead.
+#' Create training and test data matrices and a training response for given aheads.
+#' 
+#' Create training and test data matrices and training response for a set of
+#' given aheads. Works for both single ahead values and a vector of ahead values.
 #'
 #' @param lagged_df Data frame of lagged data. It should have the following columns:
 #'     \itemize{
@@ -11,7 +14,10 @@
 #'     }
 #'     A data frame in this format can be made using `covidcast::aggregate_signals()` and
 #'     `modeltools::get_response_columns()`.
-#' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast
+#' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast.
+#' Can be a single positive integer or a vector of positive integers. Note that 
+#' for each `{a}` in `ahead`, the column `response+{a}:{response}` should be
+#' present in `lagged_df`.
 #' @param training_window_size Size of the local training window in days to use. For example, if
 #'     `training_window_size = 14`, then to make a 1-day-ahead forecast on December 15, we train on
 #'     data from December 1 to December 14.
@@ -51,49 +57,59 @@
 #'
 #' @export
 create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_size) {
-    out <- list()
-
-    # make sure the response columns are unique
-    responses_at_ahead <- lagged_df %>%
-        select(tidyselect::starts_with(sprintf("response+%i:", ahead))) %>%
-        ncol()
-    assert_that(responses_at_ahead == 1,
-                msg=paste("multiple responses at ahead =",ahead))
-
-    train_df <- lagged_df %>%
-        select(geo_value, time_value, tidyselect::starts_with("value"))
-
-    # Find the last possible date of training data
-    response_end_date <- lagged_df %>%
-        select(time_value, tidyselect::starts_with(sprintf("response+%i:", ahead))) %>%
-        tidyr::drop_na() %>%
-        summarize(max(time_value)) %>%
-        pull()
-    train_end_date <- min(max(lagged_df$time_value), response_end_date)
-
-    # Training matrices
-    out$train_x <- train_df %>%
-        filter(between(time_value,
-                       train_end_date - training_window_size + 1,
-                       train_end_date)) %>%
-        select(-c(geo_value, time_value)) %>%
-        as.matrix()
-    out$train_y <- lagged_df %>%
-        filter(between(time_value,
-                       train_end_date - training_window_size + 1,
-                       train_end_date)) %>%
-        select(tidyselect::starts_with(sprintf("response+%i:", ahead))) %>%
-        pull()
-
-    # Prediction matrices
-    out$predict_x <- lagged_df %>%
-        filter(time_value == max(time_value)) %>%
-        select(tidyselect::starts_with("value")) %>%
-        as.matrix()
-    out$predict_geo_values <- lagged_df %>%
-        filter(time_value == max(time_value)) %>%
-        select(geo_value) %>% pull()
-    out$train_end_date <- train_end_date
-
-    return(out)
+    all_out <- list()
+    
+    for (a in ahead) {
+        out <- list()  # matrices corresponding to ahead+a
+        
+        # make sure the response columns are unique
+        responses_at_ahead <- lagged_df %>%
+            select(tidyselect::starts_with(sprintf("response+%i:", a))) %>%
+            ncol()
+        assert_that(responses_at_ahead == 1,
+                    msg = paste("multiple responses at ahead =", a))
+        
+        train_df <- lagged_df %>%
+            select(geo_value, time_value, tidyselect::starts_with("value"))
+        
+        # Find the last possible date of training data
+        response_end_date <- lagged_df %>%
+            select(time_value, tidyselect::starts_with(sprintf("response+%i:", a))) %>%
+            tidyr::drop_na() %>%
+            summarize(max(time_value)) %>%
+            pull()
+        train_end_date <- min(max(lagged_df$time_value), response_end_date)
+        
+        # Training matrices
+        out$train_x <- train_df %>%
+            filter(between(time_value,
+                           train_end_date - training_window_size + 1,
+                           train_end_date)) %>%
+            select(-c(geo_value, time_value)) %>%
+            as.matrix()
+        out$train_y <- lagged_df %>%
+            filter(between(time_value,
+                           train_end_date - training_window_size + 1,
+                           train_end_date)) %>%
+            select(tidyselect::starts_with(sprintf("response+%i:", a))) %>%
+            pull()
+        
+        # Prediction matrices
+        out$predict_x <- lagged_df %>%
+            filter(time_value == max(time_value)) %>%
+            select(tidyselect::starts_with("value")) %>%
+            as.matrix()
+        out$predict_geo_values <- lagged_df %>%
+            filter(time_value == max(time_value)) %>%
+            select(geo_value) %>% pull()
+        out$train_end_date <- train_end_date
+        
+        all_out[[paste0("ahead+", a)]] <- out
+    }
+    
+    if (length(ahead) == 1) {
+        return(all_out[[1]])
+    } else {
+        return(all_out)
+    }
 }
diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
index 1b6b02fe..cc0288b2 100644
--- a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
+++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/matrix.R
 \name{create_train_and_predict_matrices}
 \alias{create_train_and_predict_matrices}
-\title{Create training and testing data matrices and a training response vector for a given ahead.}
+\title{Create training and test data matrices and a training response for given aheads.}
 \usage{
 create_train_and_predict_matrices(lagged_df, ahead, training_window_size)
 }
@@ -19,7 +19,10 @@ correspond to \code{{response}} \code{{n}} incidence period units after \code{ti
 A data frame in this format can be made using \code{covidcast::aggregate_signals()} and
 \code{modeltools::get_response_columns()}.}
 
-\item{ahead}{Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast}
+\item{ahead}{Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast.
+Can be a single positive integer or a vector of positive integers. Note that
+for each \code{{a}} in \code{ahead}, the column \code{response+{a}:{response}} should be
+present in \code{lagged_df}.}
 
 \item{training_window_size}{Size of the local training window in days to use. For example, if
 \code{training_window_size = 14}, then to make a 1-day-ahead forecast on December 15, we train on
@@ -41,7 +44,8 @@ prediction data contains the most recent \code{training_window_size} days.
 }
 }
 \description{
-Create training and testing data matrices and a training response vector for a given ahead.
+Create training and test data matrices and training response for a set of
+given aheads. Works for both single ahead values and a vector of ahead values.
 }
 \examples{
 \dontrun{
diff --git a/R-packages/modeltools/tests/testthat/test-matrix.R b/R-packages/modeltools/tests/testthat/test-matrix.R
index 88c04541..e38d51b0 100644
--- a/R-packages/modeltools/tests/testthat/test-matrix.R
+++ b/R-packages/modeltools/tests/testthat/test-matrix.R
@@ -16,7 +16,8 @@ test_that("training and prediction matrices are created", {
         `response+2:signal_2` = c(seq(15, 20), rep(NA, 4))
     )
 
-    out <- create_train_and_predict_matrices(df, 2, 1)
+    out <- create_train_and_predict_matrices(df, ahead = 2, 
+                                             training_window_size = 1)
 
     expect_equal(names(out), c("train_x", "train_y", "predict_x", 
                                "predict_geo_values", "train_end_date"))
@@ -42,6 +43,32 @@ test_that("training and prediction matrices are created", {
     expect_equal(out$predict_geo_values, c("az", "wv"))
 })
 
+test_that("training and prediction matrices for multiple aheads (separate)", {
+    df <- tibble(
+        geo_value = rep(c("az", "wv"), 5),
+        time_value = rep(
+            as.Date(c("2021-01-25", "2021-01-26", "2021-01-27", "2021-01-28", "2021-01-29")),
+            each = 2),
+        `value-2:signal_1` = seq(-3, 6),
+        `value-2:signal_2` = seq(7, 16),
+        `value-1:signal_1` = seq(-1, 8),
+        `value-1:signal_2` = seq(9, 18),
+        `value+0:signal_1` = seq(1, 10),
+        `value+0:signal_2` = seq(11, 20),
+        `response+1:signal_2` = c(seq(13, 20), rep(NA, 2)),
+        `response+2:signal_2` = c(seq(15, 20), rep(NA, 4))
+    )
+    
+    out <- create_train_and_predict_matrices(df, ahead = c(1, 2), 
+                                             training_window_size = 1)
+    
+    expect_equal(length(out), 2)
+    expect_equal(out[[1]], create_train_and_predict_matrices(df, ahead = 1, 
+                                                             training_window_size = 1))
+    expect_equal(out[[2]], create_train_and_predict_matrices(df, ahead = 2, 
+                                                             training_window_size = 1))
+})
+
 test_that("fails with multiple responses", {
     df <- tibble(
         geo_value = rep(c("az", "wv"), 5),

From 32ecee3652c7d788ad680a97c48e2d1239c48ba5 Mon Sep 17 00:00:00 2001
From: Kenneth Tay <kjytay@gmail.com>
Date: Thu, 1 Apr 2021 22:55:48 -0700
Subject: [PATCH 31/36] Modify create_train_and_predict_matrices() to be able
 to return matrix of responses for multiple aheads.

---
 R-packages/modeltools/R/matrix.R              | 107 +++++++++++++-----
 .../man/create_train_and_predict_matrices.Rd  |  19 +++-
 .../modeltools/tests/testthat/test-matrix.R   |  48 ++++++++
 3 files changed, 146 insertions(+), 28 deletions(-)

diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R
index 1b2c1fe7..0df026be 100644
--- a/R-packages/modeltools/R/matrix.R
+++ b/R-packages/modeltools/R/matrix.R
@@ -2,6 +2,9 @@
 #' 
 #' Create training and test data matrices and training response for a set of
 #' given aheads. Works for both single ahead values and a vector of ahead values.
+#' For multiple ahead values, the function has the ability to return separate
+#' data matrices and responses for each ahead, or a single data matrix and
+#' response matrix for all aheads at once.
 #'
 #' @param lagged_df Data frame of lagged data. It should have the following columns:
 #'     \itemize{
@@ -21,8 +24,10 @@
 #' @param training_window_size Size of the local training window in days to use. For example, if
 #'     `training_window_size = 14`, then to make a 1-day-ahead forecast on December 15, we train on
 #'     data from December 1 to December 14.
+#' @param aheads_separate If `length(ahead) > 1`, should there be separate
+#' data matrices and responses for each ahead? Default is `TRUE`.
 #'
-#' @return Named list with entries:
+#' @return For a single ahead value, named list with entries:
 #'     \itemize{
 #'     \item `train_x`: Matrix of training data whose columns correspond to the
 #'         `value-{days}:{signal}` columns in `lagged_df`.  The training data consists of the
@@ -35,6 +40,10 @@
 #'     \item `predict_geo_values`: Vector of `geo_values` corresponding to the rows of `predict_x`.
 #'     \item `train_end_date`: Latest `time_value` used in the training period.
 #'     }
+#'     For multiple ahead values and `aheads_separate = TRUE`, a list having
+#'     the same length as `ahead`, with each element being a named list as above.
+#'     For multiple ahead values and `ahead_separate = FALSE`, a named list as
+#'     above, except `train_y` is a matrix of responses rather than a vector.
 #'
 #' @examples \dontrun{
 #' create_train_and_predict_matrices(
@@ -56,32 +65,89 @@
 #' @importFrom assertthat assert_that
 #'
 #' @export
-create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_size) {
+create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_size,
+                                              aheads_separate = TRUE) {
     all_out <- list()
     
+    # prediction matrices are the same for all aheads
+    predict_x <- lagged_df %>%
+        filter(time_value == max(time_value)) %>%
+        select(tidyselect::starts_with("value")) %>%
+        as.matrix()
+    predict_geo_values <- lagged_df %>%
+        filter(time_value == max(time_value)) %>%
+        select(geo_value) %>% pull()
+    
+    # make sure the response columns are unique
     for (a in ahead) {
-        out <- list()  # matrices corresponding to ahead+a
-        
-        # make sure the response columns are unique
         responses_at_ahead <- lagged_df %>%
             select(tidyselect::starts_with(sprintf("response+%i:", a))) %>%
             ncol()
         assert_that(responses_at_ahead == 1,
                     msg = paste("multiple responses at ahead =", a))
+    }
+    
+    if (aheads_separate) {
+        for (a in ahead) {
+            out <- list()  # matrices corresponding to ahead+a
+            
+            # Find the last possible date of training data
+            response_end_date <- lagged_df %>%
+                select(time_value, tidyselect::starts_with(sprintf("response+%i:", a))) %>%
+                tidyr::drop_na() %>%
+                summarize(max(time_value)) %>%
+                pull()
+            train_end_date <- min(max(lagged_df$time_value), response_end_date)
+            
+            # Training matrices
+            out$train_x <- lagged_df %>%
+                select(geo_value, time_value, tidyselect::starts_with("value")) %>%
+                filter(between(time_value,
+                               train_end_date - training_window_size + 1,
+                               train_end_date)) %>%
+                select(-c(geo_value, time_value)) %>%
+                as.matrix()
+            out$train_y <- lagged_df %>%
+                filter(between(time_value,
+                               train_end_date - training_window_size + 1,
+                               train_end_date)) %>%
+                select(tidyselect::starts_with(sprintf("response+%i:", a))) %>%
+                pull()
+            
+            # Add prediction matrices and training end date
+            out$predict_x <- predict_x
+            out$predict_geo_values <- predict_geo_values
+            out$train_end_date <- train_end_date
+            
+            all_out[[paste0("ahead+", a)]] <- out
+        }
         
-        train_df <- lagged_df %>%
-            select(geo_value, time_value, tidyselect::starts_with("value"))
+        if (length(ahead) == 1) {
+            return(all_out[[1]])
+        } else {
+            return(all_out)
+        }
+    } else {
+        # We want a single training data matrix and a matrix of 
+        # training responses containing all the aheads
+        # Need to recompute training_window_size 
+        training_window_size <- training_window_size + max(ahead) - min(ahead)
+        
+        out <- list()
         
         # Find the last possible date of training data
+        # (corresponds to the smallest ahead)
         response_end_date <- lagged_df %>%
-            select(time_value, tidyselect::starts_with(sprintf("response+%i:", a))) %>%
+            select(time_value, 
+                   tidyselect::starts_with(sprintf("response+%i:", min(ahead)))) %>%
             tidyr::drop_na() %>%
             summarize(max(time_value)) %>%
             pull()
         train_end_date <- min(max(lagged_df$time_value), response_end_date)
         
         # Training matrices
-        out$train_x <- train_df %>%
+        out$train_x <- lagged_df %>%
+            select(geo_value, time_value, tidyselect::starts_with("value")) %>%
             filter(between(time_value,
                            train_end_date - training_window_size + 1,
                            train_end_date)) %>%
@@ -91,25 +157,14 @@ create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_
             filter(between(time_value,
                            train_end_date - training_window_size + 1,
                            train_end_date)) %>%
-            select(tidyselect::starts_with(sprintf("response+%i:", a))) %>%
-            pull()
-        
-        # Prediction matrices
-        out$predict_x <- lagged_df %>%
-            filter(time_value == max(time_value)) %>%
-            select(tidyselect::starts_with("value")) %>%
+            select(tidyselect::starts_with(paste0("response+", ahead, ":"))) %>%
             as.matrix()
-        out$predict_geo_values <- lagged_df %>%
-            filter(time_value == max(time_value)) %>%
-            select(geo_value) %>% pull()
+        
+        # Add prediction matrices and training end date
+        out$predict_x <- predict_x
+        out$predict_geo_values <- predict_geo_values
         out$train_end_date <- train_end_date
         
-        all_out[[paste0("ahead+", a)]] <- out
-    }
-    
-    if (length(ahead) == 1) {
-        return(all_out[[1]])
-    } else {
-        return(all_out)
+        return(out)
     }
 }
diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
index cc0288b2..c9bbad74 100644
--- a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
+++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
@@ -4,7 +4,12 @@
 \alias{create_train_and_predict_matrices}
 \title{Create training and test data matrices and a training response for given aheads.}
 \usage{
-create_train_and_predict_matrices(lagged_df, ahead, training_window_size)
+create_train_and_predict_matrices(
+  lagged_df,
+  ahead,
+  training_window_size,
+  aheads_separate = TRUE
+)
 }
 \arguments{
 \item{lagged_df}{Data frame of lagged data. It should have the following columns:
@@ -27,9 +32,12 @@ present in \code{lagged_df}.}
 \item{training_window_size}{Size of the local training window in days to use. For example, if
 \code{training_window_size = 14}, then to make a 1-day-ahead forecast on December 15, we train on
 data from December 1 to December 14.}
+
+\item{aheads_separate}{If \code{length(ahead) > 1}, should there be separate
+data matrices and responses for each ahead? Default is \code{TRUE}.}
 }
 \value{
-Named list with entries:
+For a single ahead value, named list with entries:
 \itemize{
 \item \code{train_x}: Matrix of training data whose columns correspond to the
 \code{value-{days}:{signal}} columns in \code{lagged_df}.  The training data consists of the
@@ -42,10 +50,17 @@ prediction data contains the most recent \code{training_window_size} days.
 \item \code{predict_geo_values}: Vector of \code{geo_values} corresponding to the rows of \code{predict_x}.
 \item \code{train_end_date}: Latest \code{time_value} used in the training period.
 }
+For multiple ahead values and \code{aheads_separate = TRUE}, a list having
+the same length as \code{ahead}, with each element being a named list as above.
+For multiple ahead values and \code{ahead_separate = FALSE}, a named list as
+above, except \code{train_y} is a matrix of responses rather than a vector.
 }
 \description{
 Create training and test data matrices and training response for a set of
 given aheads. Works for both single ahead values and a vector of ahead values.
+For multiple ahead values, the function has the ability to return separate
+data matrices and responses for each ahead, or a single data matrix and
+response matrix for all aheads at once.
 }
 \examples{
 \dontrun{
diff --git a/R-packages/modeltools/tests/testthat/test-matrix.R b/R-packages/modeltools/tests/testthat/test-matrix.R
index e38d51b0..9c787555 100644
--- a/R-packages/modeltools/tests/testthat/test-matrix.R
+++ b/R-packages/modeltools/tests/testthat/test-matrix.R
@@ -69,6 +69,54 @@ test_that("training and prediction matrices for multiple aheads (separate)", {
                                                              training_window_size = 1))
 })
 
+test_that("training and prediction matrices for multiple aheads (together)", {
+    df <- tibble(
+        geo_value = rep(c("az", "wv"), 5),
+        time_value = rep(
+            as.Date(c("2021-01-25", "2021-01-26", "2021-01-27", "2021-01-28", "2021-01-29")),
+            each = 2),
+        `value-2:signal_1` = seq(-3, 6),
+        `value-2:signal_2` = seq(7, 16),
+        `value-1:signal_1` = seq(-1, 8),
+        `value-1:signal_2` = seq(9, 18),
+        `value+0:signal_1` = seq(1, 10),
+        `value+0:signal_2` = seq(11, 20),
+        `response+1:signal_2` = c(seq(13, 20), rep(NA, 2)),
+        `response+2:signal_2` = c(seq(15, 20), rep(NA, 4))
+    )
+    
+    out <- create_train_and_predict_matrices(df, ahead = c(1, 2), 
+                                             training_window_size = 1,
+                                             aheads_separate = FALSE)
+    
+    expect_equal(names(out), c("train_x", "train_y", "predict_x", 
+                               "predict_geo_values", "train_end_date"))
+    expect_equal(out$train_x,
+                 as.matrix(tibble(
+                     `value-2:signal_1` = c(1, 2, 3, 4),
+                     `value-2:signal_2` = c(11, 12, 13, 14),
+                     `value-1:signal_1` = c(3, 4, 5, 6),
+                     `value-1:signal_2` = c(13, 14, 15, 16),
+                     `value+0:signal_1` = c(5, 6, 7, 8),
+                     `value+0:signal_2` = c(15, 16, 17, 18)))
+    )
+    expect_equal(out$train_y,
+                 as.matrix(tibble(
+                     `response+1:signal_2` = c(17, 18, 19, 20),
+                     `response+2:signal_2` = c(19, 20, NA, NA)))
+    )
+    expect_equal(out$predict_x,
+                 as.matrix(tibble(
+                     `value-2:signal_1` = c(5, 6),
+                     `value-2:signal_2` = c(15, 16),
+                     `value-1:signal_1` = c(7, 8),
+                     `value-1:signal_2` = c(17, 18),
+                     `value+0:signal_1` = c(9, 10),
+                     `value+0:signal_2` = c(19, 20)))
+    )
+    expect_equal(out$predict_geo_values, c("az", "wv"))
+})
+
 test_that("fails with multiple responses", {
     df <- tibble(
         geo_value = rep(c("az", "wv"), 5),

From 091bc6064d25a4c96df52fde47848e6393e0ee25 Mon Sep 17 00:00:00 2001
From: Daniel McDonald <dajmcdon@gmail.com>
Date: Fri, 2 Apr 2021 23:45:48 -0700
Subject: [PATCH 32/36] add missing comma

---
 R-packages/modeltools/DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R-packages/modeltools/DESCRIPTION b/R-packages/modeltools/DESCRIPTION
index d9426422..deb84f29 100755
--- a/R-packages/modeltools/DESCRIPTION
+++ b/R-packages/modeltools/DESCRIPTION
@@ -18,7 +18,7 @@ Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.1.1
 Remotes:
     cmu-delphi/covidcast/R-packages/covidcast@main,
-    cmu-delphi/covidcast/R-packages/evalcast@main
+    cmu-delphi/covidcast/R-packages/evalcast@main,
     ryantibs/quantgen/R-package/quantgen@master,
 Imports:
     assertthat,

From fe231fee3877fbd935047371e2928765c88b409a Mon Sep 17 00:00:00 2001
From: Kenneth Tay <kjytay@gmail.com>
Date: Mon, 5 Apr 2021 13:25:20 -0700
Subject: [PATCH 33/36] Refactor create_train_and_predict_matrices().

---
 R-packages/modeltools/DESCRIPTION             |   1 +
 R-packages/modeltools/NAMESPACE               |   1 +
 R-packages/modeltools/R/matrix.R              | 218 +++++++++++-------
 .../man/create_train_and_predict_matrices.Rd  |  15 +-
 .../modeltools/man/create_train_matrices.Rd   |  45 ++++
 .../modeltools/man/get_train_end_date.Rd      |  24 ++
 .../modeltools/tests/testthat/test-matrix.R   |  19 +-
 7 files changed, 227 insertions(+), 96 deletions(-)
 create mode 100644 R-packages/modeltools/man/create_train_matrices.Rd
 create mode 100644 R-packages/modeltools/man/get_train_end_date.Rd

diff --git a/R-packages/modeltools/DESCRIPTION b/R-packages/modeltools/DESCRIPTION
index d9426422..b1584303 100755
--- a/R-packages/modeltools/DESCRIPTION
+++ b/R-packages/modeltools/DESCRIPTION
@@ -26,6 +26,7 @@ Imports:
     dplyr,
     evalcast,
     genlasso,
+    purrr,
     slider,
     tidyselect,
     tidyr,
diff --git a/R-packages/modeltools/NAMESPACE b/R-packages/modeltools/NAMESPACE
index 62e60157..de638773 100644
--- a/R-packages/modeltools/NAMESPACE
+++ b/R-packages/modeltools/NAMESPACE
@@ -33,6 +33,7 @@ importFrom(genlasso,coef.genlasso)
 importFrom(genlasso,cv.trendfilter)
 importFrom(genlasso,trendfilter)
 importFrom(lubridate,days)
+importFrom(purrr,map2)
 importFrom(stats,coef)
 importFrom(stats,lsfit)
 importFrom(stats,predict)
diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R
index 0df026be..4ba09dda 100644
--- a/R-packages/modeltools/R/matrix.R
+++ b/R-packages/modeltools/R/matrix.R
@@ -17,15 +17,15 @@
 #'     }
 #'     A data frame in this format can be made using `covidcast::aggregate_signals()` and
 #'     `modeltools::get_response_columns()`.
-#' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast.
-#' Can be a single positive integer or a vector of positive integers. Note that 
-#' for each `{a}` in `ahead`, the column `response+{a}:{response}` should be
-#' present in `lagged_df`.
+#' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) 
+#'     ahead to forecast. Can be a single positive integer or a vector of 
+#'     positive integers. Note that for each `{a}` in `ahead`, the column 
+#'     `response+{a}:{response}` should be present in `lagged_df`.
 #' @param training_window_size Size of the local training window in days to use. For example, if
 #'     `training_window_size = 14`, then to make a 1-day-ahead forecast on December 15, we train on
 #'     data from December 1 to December 14.
 #' @param aheads_separate If `length(ahead) > 1`, should there be separate
-#' data matrices and responses for each ahead? Default is `TRUE`.
+#'     data matrices and responses for each ahead? Default is `TRUE`.
 #'
 #' @return For a single ahead value, named list with entries:
 #'     \itemize{
@@ -35,10 +35,15 @@
 #'         days prior to it.
 #'     \item `train_y`: Vector of response data from the `response+{ahead}:{response}` column of
 #'         `lagged_df` corresponding to the rows of `train_x`.
+#'     \item `train_geo_values`: Vector of geo values corresponding to the rows
+#'         of `train_x`.
+#'     \item `train_time_values`: Vector of time values corresponding to the rows
+#'         of `train_x`.
+#'     \item `train_end_date`: Latest `time_value` used in the training period.
 #'     \item `predict_x`: Matrix of prediction data in the same format as `train_x`.  The
 #'         prediction data contains the most recent `training_window_size` days.
 #'     \item `predict_geo_values`: Vector of `geo_values` corresponding to the rows of `predict_x`.
-#'     \item `train_end_date`: Latest `time_value` used in the training period.
+#'     \item `predict_time_value`: Time value corresponding to `predict_x`.
 #'     }
 #'     For multiple ahead values and `aheads_separate = TRUE`, a list having
 #'     the same length as `ahead`, with each element being a named list as above.
@@ -63,21 +68,11 @@
 #'
 #' @importFrom tibble tibble
 #' @importFrom assertthat assert_that
+#' @importFrom purrr map2
 #'
 #' @export
 create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_size,
                                               aheads_separate = TRUE) {
-    all_out <- list()
-    
-    # prediction matrices are the same for all aheads
-    predict_x <- lagged_df %>%
-        filter(time_value == max(time_value)) %>%
-        select(tidyselect::starts_with("value")) %>%
-        as.matrix()
-    predict_geo_values <- lagged_df %>%
-        filter(time_value == max(time_value)) %>%
-        select(geo_value) %>% pull()
-    
     # make sure the response columns are unique
     for (a in ahead) {
         responses_at_ahead <- lagged_df %>%
@@ -87,84 +82,133 @@ create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_
                     msg = paste("multiple responses at ahead =", a))
     }
     
+    # prediction matrices are the same for all aheads
+    predict_time_value <- max(lagged_df$time_value)
+    predict_x <- lagged_df %>%
+        filter(time_value == predict_time_value) %>%
+        select(tidyselect::starts_with("value")) %>%
+        as.matrix()
+    predict_geo_values <- lagged_df %>%
+        filter(time_value == predict_time_value) %>%
+        select(geo_value) %>% pull()
+    
+    all_out <- list()
+    
     if (aheads_separate) {
-        for (a in ahead) {
-            out <- list()  # matrices corresponding to ahead+a
-            
-            # Find the last possible date of training data
-            response_end_date <- lagged_df %>%
-                select(time_value, tidyselect::starts_with(sprintf("response+%i:", a))) %>%
-                tidyr::drop_na() %>%
-                summarize(max(time_value)) %>%
-                pull()
-            train_end_date <- min(max(lagged_df$time_value), response_end_date)
-            
-            # Training matrices
-            out$train_x <- lagged_df %>%
-                select(geo_value, time_value, tidyselect::starts_with("value")) %>%
-                filter(between(time_value,
-                               train_end_date - training_window_size + 1,
-                               train_end_date)) %>%
-                select(-c(geo_value, time_value)) %>%
-                as.matrix()
-            out$train_y <- lagged_df %>%
-                filter(between(time_value,
-                               train_end_date - training_window_size + 1,
-                               train_end_date)) %>%
-                select(tidyselect::starts_with(sprintf("response+%i:", a))) %>%
-                pull()
-            
-            # Add prediction matrices and training end date
-            out$predict_x <- predict_x
-            out$predict_geo_values <- predict_geo_values
-            out$train_end_date <- train_end_date
-            
-            all_out[[paste0("ahead+", a)]] <- out
-        }
+        train_end_dates <- Reduce(c, lapply(ahead, 
+                                            function(a) get_train_end_date(lagged_df, a)))
         
-        if (length(ahead) == 1) {
-            return(all_out[[1]])
-        } else {
-            return(all_out)
-        }
-    } else {
-        # We want a single training data matrix and a matrix of 
-        # training responses containing all the aheads
+        all_out <- purrr::map2(ahead, train_end_dates,
+                              ~ create_train_matrices(lagged_df, .x, training_window_size,
+                                                      .y))
+    } else {  # ahead_separate = FALSE: We want a single training data matrix and 
+              # a matrix of training responses containing all the aheads
         # Need to recompute training_window_size 
         training_window_size <- training_window_size + max(ahead) - min(ahead)
         
-        out <- list()
-        
         # Find the last possible date of training data
         # (corresponds to the smallest ahead)
-        response_end_date <- lagged_df %>%
-            select(time_value, 
-                   tidyselect::starts_with(sprintf("response+%i:", min(ahead)))) %>%
-            tidyr::drop_na() %>%
-            summarize(max(time_value)) %>%
-            pull()
-        train_end_date <- min(max(lagged_df$time_value), response_end_date)
+        train_end_date <- get_train_end_date(lagged_df, ahead)
         
         # Training matrices
-        out$train_x <- lagged_df %>%
-            select(geo_value, time_value, tidyselect::starts_with("value")) %>%
-            filter(between(time_value,
-                           train_end_date - training_window_size + 1,
-                           train_end_date)) %>%
-            select(-c(geo_value, time_value)) %>%
-            as.matrix()
-        out$train_y <- lagged_df %>%
-            filter(between(time_value,
-                           train_end_date - training_window_size + 1,
-                           train_end_date)) %>%
-            select(tidyselect::starts_with(paste0("response+", ahead, ":"))) %>%
-            as.matrix()
-        
-        # Add prediction matrices and training end date
-        out$predict_x <- predict_x
-        out$predict_geo_values <- predict_geo_values
-        out$train_end_date <- train_end_date
-        
-        return(out)
+        all_out[[1]] <- create_train_matrices(lagged_df, ahead, 
+                                              training_window_size,
+                                              train_end_date)
     }
+    
+    # Add prediction matrices / info
+    for (i in seq_along(all_out)) {
+        all_out[[i]]$predict_x <- predict_x
+        all_out[[i]]$predict_geo_values <- predict_geo_values
+        all_out[[i]]$predict_time_value <- predict_time_value
+    }
+    
+    if (length(ahead) == 1 || aheads_separate == FALSE) {
+        return(all_out[[1]])
+    } else {
+        names(all_out) <- paste0("ahead+", ahead)
+        return(all_out)
+    }
+}
+
+#' Get last possible date of training data for given aheads.
+#' 
+#' Returns the last possible date of the training data for a given set of
+#' aheads. If more than one ahead is given, the date return corresponds to the
+#' last possible date corresponding to the smallest ahead value. This is because
+#' the smallest ahead value will have the latest possible date.
+#'
+#' @param lagged_df Data frame of lagged data as in `create_train_and_predict_matrices()`.
+#' @param ahead Number of days ahead to forecast. Can be a single positive 
+#'     integer or a vector of positive integers.
+#' 
+#' @return Single date corresponding to the last possible date of the training
+#' data.
+get_train_end_date <- function(lagged_df, ahead) {
+    response_end_date <- lagged_df %>%
+        select(time_value, 
+               tidyselect::starts_with(sprintf("response+%i:", min(ahead)))) %>%
+        tidyr::drop_na() %>%
+        summarize(max(time_value)) %>%
+        pull()
+    train_end_date <- min(max(lagged_df$time_value), response_end_date)
+    return(train_end_date)
+}
+    
+#' Create training data matrix and a training response for given aheads.
+#' 
+#' Create training and data matrix and training response for a set of
+#' given aheads. Works for both single ahead values and a vector of ahead 
+#' values. However, note that this function works different from
+#' `create_train_and_predict_matrices()` for multiple ahead values. If
+#' multiple ahead values are supplied, we return a matrix of responses
+#' containing all aheads at once.
+#'
+#' @param lagged_df Data frame of lagged data as in `create_train_and_predict_matrices()`.
+#' @param ahead Number of incidence period units (i.e., epiweeks, days, etc.) 
+#'     ahead to forecast. Can be a single positive integer or a vector of 
+#'     positive integers. Note that for each `{a}` in `ahead`, the column 
+#'     `response+{a}:{response}` should be present in `lagged_df`.
+#' @param n_days Number of days worth of data to pull.
+#' @param train_end_date The last date to be included in the training data.
+#'
+#' @return A named list with entries:
+#'     \itemize{
+#'     \item `train_x`: Matrix of training data whose columns correspond to the
+#'         `value-{days}:{signal}` columns in `lagged_df`.  The training data consists of the
+#'         latest date with an non-null response, plus all data from the `training_window_size`
+#'         days prior to it.
+#'     \item `train_y`: Vector of response data from the `response+{ahead}:{response}` column of
+#'         `lagged_df` corresponding to the rows of `train_x`. If multiple ahead
+#'         values are provided, then this is a matrix instead.
+#'     \item `train_geo_values`: Vector of geo values corresponding to the rows
+#'         of `train_x`.
+#'     \item `train_time_values`: Vector of time values corresponding to the rows
+#'         of `train_x`.
+#'     \item `train_end_date`: Latest `time_value` used in the training period.
+#'     }
+create_train_matrices <- function(lagged_df, ahead, n_days,
+                                   train_end_date) {
+    train_df <- lagged_df %>%
+        filter(between(time_value,
+                       train_end_date - n_days + 1,
+                       train_end_date))
+    out <- list()
+    out$train_x <- train_df %>%
+        select(tidyselect::starts_with("value")) %>%
+        as.matrix()
+    
+    train_y <- train_df %>%
+        select(tidyselect::starts_with(paste0("response+", ahead, ":")))
+    if (length(ahead) == 1) {
+        out$train_y <- pull(train_y)
+    } else {
+        out$train_y <- as.matrix(train_y)
+    }
+    
+    out$train_geo_values  <- train_df$geo_value
+    out$train_time_values <- train_df$time_value
+    out$train_end_date    <- as.Date(train_end_date)
+    
+    return(out)
 }
diff --git a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
index c9bbad74..cb8c485f 100644
--- a/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
+++ b/R-packages/modeltools/man/create_train_and_predict_matrices.Rd
@@ -24,10 +24,10 @@ correspond to \code{{response}} \code{{n}} incidence period units after \code{ti
 A data frame in this format can be made using \code{covidcast::aggregate_signals()} and
 \code{modeltools::get_response_columns()}.}
 
-\item{ahead}{Number of incidence period units (i.e., epiweeks, days, etc.) ahead to forecast.
-Can be a single positive integer or a vector of positive integers. Note that
-for each \code{{a}} in \code{ahead}, the column \code{response+{a}:{response}} should be
-present in \code{lagged_df}.}
+\item{ahead}{Number of incidence period units (i.e., epiweeks, days, etc.)
+ahead to forecast. Can be a single positive integer or a vector of
+positive integers. Note that for each \code{{a}} in \code{ahead}, the column
+\code{response+{a}:{response}} should be present in \code{lagged_df}.}
 
 \item{training_window_size}{Size of the local training window in days to use. For example, if
 \code{training_window_size = 14}, then to make a 1-day-ahead forecast on December 15, we train on
@@ -45,10 +45,15 @@ latest date with an non-null response, plus all data from the \code{training_win
 days prior to it.
 \item \code{train_y}: Vector of response data from the \code{response+{ahead}:{response}} column of
 \code{lagged_df} corresponding to the rows of \code{train_x}.
+\item \code{train_geo_values}: Vector of geo values corresponding to the rows
+of \code{train_x}.
+\item \code{train_time_values}: Vector of time values corresponding to the rows
+of \code{train_x}.
+\item \code{train_end_date}: Latest \code{time_value} used in the training period.
 \item \code{predict_x}: Matrix of prediction data in the same format as \code{train_x}.  The
 prediction data contains the most recent \code{training_window_size} days.
 \item \code{predict_geo_values}: Vector of \code{geo_values} corresponding to the rows of \code{predict_x}.
-\item \code{train_end_date}: Latest \code{time_value} used in the training period.
+\item \code{predict_time_value}: Time value corresponding to \code{predict_x}.
 }
 For multiple ahead values and \code{aheads_separate = TRUE}, a list having
 the same length as \code{ahead}, with each element being a named list as above.
diff --git a/R-packages/modeltools/man/create_train_matrices.Rd b/R-packages/modeltools/man/create_train_matrices.Rd
new file mode 100644
index 00000000..9b27f24c
--- /dev/null
+++ b/R-packages/modeltools/man/create_train_matrices.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/matrix.R
+\name{create_train_matrices}
+\alias{create_train_matrices}
+\title{Create training data matrix and a training response for given aheads.}
+\usage{
+create_train_matrices(lagged_df, ahead, n_days, train_end_date)
+}
+\arguments{
+\item{lagged_df}{Data frame of lagged data as in \code{create_train_and_predict_matrices()}.}
+
+\item{ahead}{Number of incidence period units (i.e., epiweeks, days, etc.)
+ahead to forecast. Can be a single positive integer or a vector of
+positive integers. Note that for each \code{{a}} in \code{ahead}, the column
+\code{response+{a}:{response}} should be present in \code{lagged_df}.}
+
+\item{n_days}{Number of days worth of data to pull.}
+
+\item{train_end_date}{The last date to be included in the training data.}
+}
+\value{
+A named list with entries:
+\itemize{
+\item \code{train_x}: Matrix of training data whose columns correspond to the
+\code{value-{days}:{signal}} columns in \code{lagged_df}.  The training data consists of the
+latest date with an non-null response, plus all data from the \code{training_window_size}
+days prior to it.
+\item \code{train_y}: Vector of response data from the \code{response+{ahead}:{response}} column of
+\code{lagged_df} corresponding to the rows of \code{train_x}. If multiple ahead
+values are provided, then this is a matrix instead.
+\item \code{train_geo_values}: Vector of geo values corresponding to the rows
+of \code{train_x}.
+\item \code{train_time_values}: Vector of time values corresponding to the rows
+of \code{train_x}.
+\item \code{train_end_date}: Latest \code{time_value} used in the training period.
+}
+}
+\description{
+Create training and data matrix and training response for a set of
+given aheads. Works for both single ahead values and a vector of ahead
+values. However, note that this function works different from
+\code{create_train_and_predict_matrices()} for multiple ahead values. If
+multiple ahead values are supplied, we return a matrix of responses
+containing all aheads at once.
+}
diff --git a/R-packages/modeltools/man/get_train_end_date.Rd b/R-packages/modeltools/man/get_train_end_date.Rd
new file mode 100644
index 00000000..8b4c1a9a
--- /dev/null
+++ b/R-packages/modeltools/man/get_train_end_date.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/matrix.R
+\name{get_train_end_date}
+\alias{get_train_end_date}
+\title{Get last possible date of training data for given aheads.}
+\usage{
+get_train_end_date(lagged_df, ahead)
+}
+\arguments{
+\item{lagged_df}{Data frame of lagged data as in \code{create_train_and_predict_matrices()}.}
+
+\item{ahead}{Number of days ahead to forecast. Can be a single positive
+integer or a vector of positive integers.}
+}
+\value{
+Single date corresponding to the last possible date of the training
+data.
+}
+\description{
+Returns the last possible date of the training data for a given set of
+aheads. If more than one ahead is given, the date return corresponds to the
+last possible date corresponding to the smallest ahead value. This is because
+the smallest ahead value will have the latest possible date.
+}
diff --git a/R-packages/modeltools/tests/testthat/test-matrix.R b/R-packages/modeltools/tests/testthat/test-matrix.R
index 9c787555..c04655f4 100644
--- a/R-packages/modeltools/tests/testthat/test-matrix.R
+++ b/R-packages/modeltools/tests/testthat/test-matrix.R
@@ -19,8 +19,9 @@ test_that("training and prediction matrices are created", {
     out <- create_train_and_predict_matrices(df, ahead = 2, 
                                              training_window_size = 1)
 
-    expect_equal(names(out), c("train_x", "train_y", "predict_x", 
-                               "predict_geo_values", "train_end_date"))
+    expect_equal(names(out), c("train_x", "train_y", "train_geo_values",
+                               "train_time_values", "train_end_date",
+                               "predict_x", "predict_geo_values", "predict_time_value"))
     expect_equal(out$train_x,
                  as.matrix(tibble(
                     `value-2:signal_1` = c(1, 2),
@@ -31,6 +32,9 @@ test_that("training and prediction matrices are created", {
                     `value+0:signal_2` = c(15, 16)))
     )
     expect_equal(out$train_y, c(19, 20))
+    expect_equal(out$train_geo_values, c("az", "wv"))
+    expect_equal(out$train_time_values, rep(as.Date("2021-01-27"), 2))
+    expect_equal(out$train_end_date, as.Date("2021-01-27"))
     expect_equal(out$predict_x,
                  as.matrix(tibble(
                     `value-2:signal_1` = c(5, 6),
@@ -41,6 +45,7 @@ test_that("training and prediction matrices are created", {
                     `value+0:signal_2` = c(19, 20)))
     )
     expect_equal(out$predict_geo_values, c("az", "wv"))
+    expect_equal(out$predict_time_value, as.Date("2021-01-29"))
 })
 
 test_that("training and prediction matrices for multiple aheads (separate)", {
@@ -89,8 +94,9 @@ test_that("training and prediction matrices for multiple aheads (together)", {
                                              training_window_size = 1,
                                              aheads_separate = FALSE)
     
-    expect_equal(names(out), c("train_x", "train_y", "predict_x", 
-                               "predict_geo_values", "train_end_date"))
+    expect_equal(names(out), c("train_x", "train_y", "train_geo_values",
+                               "train_time_values", "train_end_date",
+                               "predict_x", "predict_geo_values", "predict_time_value"))
     expect_equal(out$train_x,
                  as.matrix(tibble(
                      `value-2:signal_1` = c(1, 2, 3, 4),
@@ -105,6 +111,10 @@ test_that("training and prediction matrices for multiple aheads (together)", {
                      `response+1:signal_2` = c(17, 18, 19, 20),
                      `response+2:signal_2` = c(19, 20, NA, NA)))
     )
+    expect_equal(out$train_geo_values, rep(c("az", "wv"), 2))
+    expect_equal(out$train_time_values, 
+                 rep(as.Date(c("2021-01-27", "2021-01-28")), each = 2))
+    expect_equal(out$train_end_date, as.Date("2021-01-28"))
     expect_equal(out$predict_x,
                  as.matrix(tibble(
                      `value-2:signal_1` = c(5, 6),
@@ -115,6 +125,7 @@ test_that("training and prediction matrices for multiple aheads (together)", {
                      `value+0:signal_2` = c(19, 20)))
     )
     expect_equal(out$predict_geo_values, c("az", "wv"))
+    expect_equal(out$predict_time_value, as.Date("2021-01-29"))
 })
 
 test_that("fails with multiple responses", {

From 61bac8a2d9340b4789fc39f90f503aeaeaef6991 Mon Sep 17 00:00:00 2001
From: Kenneth Tay <kjytay@gmail.com>
Date: Mon, 5 Apr 2021 20:45:08 -0700
Subject: [PATCH 34/36] Address PR #520 comments.

---
 R-packages/modeltools/NAMESPACE  |  2 ++
 R-packages/modeltools/R/matrix.R | 12 +++++++-----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/R-packages/modeltools/NAMESPACE b/R-packages/modeltools/NAMESPACE
index de638773..fbb1f629 100644
--- a/R-packages/modeltools/NAMESPACE
+++ b/R-packages/modeltools/NAMESPACE
@@ -32,8 +32,10 @@ importFrom(dplyr,ungroup)
 importFrom(genlasso,coef.genlasso)
 importFrom(genlasso,cv.trendfilter)
 importFrom(genlasso,trendfilter)
+importFrom(lubridate,as_date)
 importFrom(lubridate,days)
 importFrom(purrr,map2)
+importFrom(purrr,map_dbl)
 importFrom(stats,coef)
 importFrom(stats,lsfit)
 importFrom(stats,predict)
diff --git a/R-packages/modeltools/R/matrix.R b/R-packages/modeltools/R/matrix.R
index 4ba09dda..512e7429 100644
--- a/R-packages/modeltools/R/matrix.R
+++ b/R-packages/modeltools/R/matrix.R
@@ -68,7 +68,8 @@
 #'
 #' @importFrom tibble tibble
 #' @importFrom assertthat assert_that
-#' @importFrom purrr map2
+#' @importFrom lubridate as_date
+#' @importFrom purrr map_dbl map2
 #'
 #' @export
 create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_size,
@@ -95,12 +96,13 @@ create_train_and_predict_matrices <- function(lagged_df, ahead, training_window_
     all_out <- list()
     
     if (aheads_separate) {
-        train_end_dates <- Reduce(c, lapply(ahead, 
-                                            function(a) get_train_end_date(lagged_df, a)))
+        train_end_dates <- ahead %>%
+            purrr::map_dbl(~ get_train_end_date(lagged_df, .x)) %>%
+            lubridate::as_date()
         
         all_out <- purrr::map2(ahead, train_end_dates,
-                              ~ create_train_matrices(lagged_df, .x, training_window_size,
-                                                      .y))
+                              ~ create_train_matrices(
+                                  lagged_df, .x, training_window_size, .y))
     } else {  # ahead_separate = FALSE: We want a single training data matrix and 
               # a matrix of training responses containing all the aheads
         # Need to recompute training_window_size 

From f24ceb61bb269873df5da99bcdce397ec99782e6 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dmtryshmtv@gmail.com>
Date: Tue, 21 Sep 2021 10:33:31 -0700
Subject: [PATCH 35/36] Update quantgen dependency folder structure

---
 R-packages/modeltools/DESCRIPTION | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R-packages/modeltools/DESCRIPTION b/R-packages/modeltools/DESCRIPTION
index f1691e9e..ce25041a 100755
--- a/R-packages/modeltools/DESCRIPTION
+++ b/R-packages/modeltools/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: modeltools
 Type: Package
-Title: Tools for Building COVID Forecasters and Related Models 
+Title: Tools for Building COVID Forecasters and Related Models
 Version: 0.1.0
 Authors@R:
 	c(
@@ -19,7 +19,7 @@ RoxygenNote: 7.1.1
 Remotes:
     cmu-delphi/covidcast/R-packages/covidcast@main,
     cmu-delphi/covidcast/R-packages/evalcast@main,
-    ryantibs/quantgen/R-package/quantgen@master,
+    ryantibs/quantgen/quantgen@master,
 Imports:
     assertthat,
     covidcast,

From b4ab08789622190571347af63d58890ea562e14d Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Mon, 13 Feb 2023 15:08:49 -0800
Subject: [PATCH 36/36] Modeltools: update deps * switch evalcast@main ->
 evalcast@evalcast * make quantgen dependency explicit * make lubridate,
 MMWRweek, tibble dependencies explicit

---
 R-packages/modeltools/DESCRIPTION | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/R-packages/modeltools/DESCRIPTION b/R-packages/modeltools/DESCRIPTION
index ce25041a..a130ff63 100755
--- a/R-packages/modeltools/DESCRIPTION
+++ b/R-packages/modeltools/DESCRIPTION
@@ -15,19 +15,23 @@ License: MIT
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.1.1
+RoxygenNote: 7.2.3
 Remotes:
     cmu-delphi/covidcast/R-packages/covidcast@main,
-    cmu-delphi/covidcast/R-packages/evalcast@main,
-    ryantibs/quantgen/quantgen@master,
+    cmu-delphi/covidcast/R-packages/evalcast@evalcast,
+    ryantibs/quantgen/quantgen@master
 Imports:
     assertthat,
     covidcast,
     dplyr,
-    evalcast,
     genlasso,
+    lubridate,
+    MMWRweek,
     purrr,
+    quantgen,
     slider,
+    tibble,
     tidyselect,
-    tidyr,
-    quantgen
+    tidyr
+Suggests:
+    evalcast