Simulate secondary (#591)

* `simulate_secondary()` -> `convolve_and_scale()` * simulate_secondary function * add tests * add news item * load data.table explicitly in example * Apply suggestions from code review Co-authored-by: Sam Abbott <s.e.abbott12@gmail.com> * add correct documentation --------- Co-authored-by: Sam Abbott <s.e.abbott12@gmail.com>
epiforecasts · Mar 4, 2024 · e0a99cf · e0a99cf
1 parent 5e742bd
commit e0a99cf
Show file tree

Hide file tree

Showing 11 changed files with 424 additions and 75 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -23,6 +23,7 @@ export(clean_nowcasts)
 export(construct_output)
 export(convert_to_logmean)
 export(convert_to_logsd)
+export(convolve_and_scale)
 export(copy_results_to_latest)
 export(create_backcalc_data)
 export(create_clean_reported_cases)

diff --git a/NEWS.md b/NEWS.md
@@ -28,6 +28,7 @@
 * Added the possibility of specifying fixed overdispersion. By @sbfnk in #560 and reviewed by @seabbs.
 * The example in `estimate_truncation()` has been simplified. The package now ships with a dataset `example_truncated`, which is used in the `estimate_truncation()` example and tests. The steps for creating the `example_truncated` is stored in `./data-raw/estimate-truncation.R`. By @jamesmbaazam in #584 and reviewed by @seabbs and @sbfnk.
 * Tests have been updated to only set random seeds before snapshot tests involving random number generation, and unset them subsequently. By @sbfnk in #590 and reviewed by @seabbs.
+* A function `simulate_secondary()` was added to simulate from parameters of the `estimate_secondary` model. A function of the same name that was previously based on a reimplementation of that model in R with potentially time-varying scalings and delays has been renamed to `convolve_and_scale()`. By @sbfnk in #591 and reviewed by @seabbs.
 
 ## Model changes
 

diff --git a/R/estimate_secondary.R b/R/estimate_secondary.R
@@ -87,7 +87,7 @@
 #' cases[, meanlog := 1.8][, sdlog := 0.5]
 #'
 #' # Simulate secondary cases
-#' cases <- simulate_secondary(cases, type = "incidence")
+#' cases <- convolve_and_scale(cases, type = "incidence")
 #' #
 #' # fit model to example data specifying a weak prior for fraction reported
 #' # with a secondary case
@@ -113,7 +113,7 @@
 #' cases[, meanlog := 1.6][, sdlog := 0.8]
 #'
 #' # Simulate secondary cases
-#' cases <- simulate_secondary(cases, type = "prevalence")
+#' cases <- convolve_and_scale(cases, type = "prevalence")
 #'
 #' # fit model to example prevalence data
 #' prev <- estimate_secondary(cases[1:100],
@@ -376,7 +376,14 @@ plot.estimate_secondary <- function(x, primary = FALSE,
   return(plot)
 }
 
-#' Simulate a secondary observation
+#' Convolve and scale a time series
+#'
+#' This applies a lognormal convolution with given, potentially time-varying
+#' parameters representing the parameters of the lognormal distribution used for
+#' the convolution and an optional scaling factor. This is akin to the model
+#' used in [estimate_secondary()] and [simulate_secondary()].
+#'
+#' Up to version 1.4.0 this function was called [simulate_secondary()].
 #'
 #' @param data A `<data.frame>` containing the `date` of report and `primary`
 #' cases as a numeric vector.
@@ -417,7 +424,7 @@ plot.estimate_secondary <- function(x, primary = FALSE,
 #' cases[, meanlog := 1.8][, sdlog := 0.5]
 #'
 #' # Simulate secondary cases
-#' cases <- simulate_secondary(cases, type = "incidence")
+#' cases <- convolve_and_scale(cases, type = "incidence")
 #' cases
 #' #### Prevalence data example ####
 #'
@@ -432,9 +439,9 @@ plot.estimate_secondary <- function(x, primary = FALSE,
 #' cases[, meanlog := 1.6][, sdlog := 0.8]
 #'
 #' # Simulate secondary cases
-#' cases <- simulate_secondary(cases, type = "prevalence")
+#' cases <- convolve_and_scale(cases, type = "prevalence")
 #' cases
-simulate_secondary <- function(data, type = "incidence", family = "poisson",
+convolve_and_scale <- function(data, type = "incidence", family = "poisson",
                                delay_max = 30, ...) {
   type <- arg_match(type, values = c("incidence", "prevalence"))
   family <- arg_match(family, values = c("none", "poisson", "negbin"))

diff --git a/R/simulate_secondary.R b/R/simulate_secondary.R
@@ -0,0 +1,152 @@
+#' Simulate secondary observations from primary observations
+#'
+#' Simulations are done from a given trajectory of primary observations by
+#' applying any given delays and observation parameters.
+#'
+#' In order to simulate, all parameters that are specified such as the mean and
+#' standard deviation of delays or observation scaling, must be fixed.
+#' Uncertain parameters are not allowed.
+#'
+#' A function of the same name that was previously based on a reimplementation
+#' of that model in R with potentially time-varying scalings and delays is
+#' available as `convolve_and_scale()
+#' @param primary a data frame of primary reports (column `primary`) by date
+#'   (column `date`). Column `primary` must be numeric and `date` must be in
+#'   date format.  it will be assumed that `primary` is zero on the missing
+#'   days.
+#' @inheritParams simulate_infections
+#' @inheritParams estimate_secondary
+#' @importFrom checkmate assert_data_frame assert_date assert_numeric
+#'   assert_subset
+#' @return A data.table of simulated secondary observations (column `secondary`)
+#'   by date.
+#' @export
+#' @examples
+#' \donttest{
+#'   ## load data.table to manipulate `example_confirmed` below
+#'   library(data.table)
+#'   cases <- as.data.table(example_confirmed)[, primary := confirm]
+#'   sim <- simulate_secondary(
+#'     cases,
+#'     delays = delay_opts(fix_dist(example_reporting_delay)),
+#'     obs = obs_opts(family = "poisson")
+#'   )
+#' }
+simulate_secondary <- function(primary,
+                               day_of_week_effect = NULL,
+                               secondary = secondary_opts(),
+                               delays = delay_opts(),
+                               truncation = trunc_opts(),
+                               obs = obs_opts(),
+                               CrIs = c(0.2, 0.5, 0.9),
+                               backend = "rstan",
+                               ...) {
+  ## deprecated usage
+  assert_data_frame(primary, any.missing = FALSE)
+  assert_subset(c("date", "primary"), colnames(primary))
+  assert_date(primary$date)
+  assert_numeric(primary$primary, lower = 0)
+  assert_numeric(day_of_week_effect, lower = 0, null.ok = TRUE)
+  assert_class(secondary, "secondary_opts")
+  assert_class(delays, "delay_opts")
+  assert_class(truncation, "trunc_opts")
+  assert_class(obs, "obs_opts")
+
+  ## create primary values for all dates modelled
+  all_dates <- data.table(
+    date = seq.Date(min(primary$date), max(primary$date), by = "day")
+  )
+  primary <- merge.data.table(all_dates, primary, by = "date", all.x = TRUE)
+  primary <- primary[, primary := nafill(primary, type = "const", fill = 0)]
+
+  data <- list(
+    n = 1,
+    t = nrow(primary),
+    h = nrow(primary),
+    all_dates = 0,
+    obs = array(integer(0)),
+    primary = array(primary$primary, dim =  c(1, nrow(primary))),
+    seeding_time = 0L
+  )
+
+  data <- c(data, secondary)
+
+  data <- c(data, create_stan_delays(
+    delay = delays,
+    trunc = truncation
+  ))
+
+  if ((length(data$delay_mean_sd) > 0 && any(data$delay_mean_sd > 0)) ||
+      (length(data$delay_sd_sd) > 0 && any(data$delay_sd_sd > 0))) {
+    stop(
+      "Cannot simulate from uncertain parameters. Use the [fix_dist()] ",
+      "function to set the parameters of uncertain distributions either the ",
+      "mean or a randomly sampled value"
+    )
+  }
+  data$delay_mean <- array(
+    data$delay_mean_mean, dim = c(1, length(data$delay_mean_mean))
+  )
+  data$delay_sd <- array(
+    data$delay_sd_mean, dim = c(1, length(data$delay_sd_mean))
+  )
+  data$delay_mean_sd <- NULL
+  data$delay_sd_sd <- NULL
+
+  data <- c(data, create_obs_model(
+    obs, dates = primary$date
+  ))
+
+  if (data$obs_scale_sd > 0) {
+    stop(
+      "Cannot simulate from uncertain observation scaling; use fixed scaling ",
+      "instead."
+    )
+  }
+  if (data$obs_scale) {
+    data$frac_obs <- array(data$obs_scale_mean, dim = c(1, 1))
+  } else {
+    data$frac_obs <- array(dim = c(1, 0))
+  }
+  data$obs_scale_mean <- NULL
+  data$obs_scale_sd <- NULL
+
+  if (obs$family == "negbin") {
+    if (data$phi_sd > 0) {
+      stop(
+        "Cannot simulate from uncertain overdispersion; use fixed ",
+        "overdispersion instead."
+      )
+    }
+    data$rep_phi <- array(data$phi_mean, dim = c(1, 1))
+  } else {
+    data$rep_phi <- array(dim = c(1, 0))
+  }
+  data$phi_mean <- NULL
+  data$phi_sd <- NULL
+
+  ## day of week effect
+  if (is.null(day_of_week_effect)) {
+    day_of_week_effect <- rep(1, data$week_effect)
+  }
+
+  day_of_week_effect <- day_of_week_effect / sum(day_of_week_effect)
+  data$day_of_week_simplex <- array(
+    day_of_week_effect, dim = c(1, data$week_effect)
+  )
+
+  # Create stan arguments
+  stan <- stan_opts(backend = backend, chains = 1, samples = 1, warmup = 1)
+  args <- create_stan_args(
+    stan, data = data, fixed_param = TRUE, model = "simulate_secondary",
+    verbose = FALSE
+  )
+
+  ## simulate
+  sim <- fit_model(args, id = "simulate_secondary")
+
+  secondary <- extract_samples(sim, "sim_secondary")$sim_secondary[1, , ]
+  out <- data.table(date = all_dates, secondary = secondary)
+
+  return(out[])
+}
diff --git a/inst/stan/simulate_secondary.stan b/inst/stan/simulate_secondary.stan
@@ -64,6 +64,20 @@ generated quantities {
     if (week_effect > 1) {
       secondary = day_of_week_effect(secondary, day_of_week, to_vector(day_of_week_simplex[i]));
     }
+
+    // truncate near time cases to observed reports
+    if (trunc_id) {
+      vector[delay_type_max[trunc_id] + 1] trunc_rev_cmf = get_delay_rev_pmf(
+        trunc_id, delay_type_max[trunc_id] + 1, delay_types_p, delay_types_id,
+        delay_types_groups, delay_max, delay_np_pmf,
+        delay_np_pmf_groups, delay_mean[i], delay_sd[i], delay_dist,
+        0, 1, 1
+      );
+      secondary = truncate(
+        secondary, trunc_rev_cmf, 0
+      );
+    }
+
     // simulate secondary reports
     sim_secondary[i] = report_rng(
       tail(secondary, all_dates ? t : h), rep_phi[i], model_type

diff --git a/man/convolve_and_scale.Rd b/man/convolve_and_scale.Rd
diff --git a/man/estimate_secondary.Rd b/man/estimate_secondary.Rd