epiverse-trace · joshwlambert · Oct 19, 2023 · Oct 17, 2023 · Oct 17, 2023 · Oct 17, 2023
diff --git a/CITATION.cff b/CITATION.cff
@@ -144,6 +144,18 @@ references:
     email: xie@yihui.name
     orcid: https://orcid.org/0000-0003-0645-5666
   year: '2023'
+- type: software
+  title: bookdown
+  abstract: 'bookdown: Authoring Books and Technical Documents with R Markdown'
+  notes: Suggests
+  url: https://pkgs.rstudio.com/bookdown/
+  repository: https://CRAN.R-project.org/package=bookdown
+  authors:
+  - family-names: Xie
+    given-names: Yihui
+    email: xie@yihui.name
+    orcid: https://orcid.org/0000-0003-0645-5666
+  year: '2023'
 - type: software
   title: rmarkdown
   abstract: 'rmarkdown: Dynamic Documents for R'

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -24,6 +24,7 @@ Imports:
 Suggests: 
     incidence2,
     knitr,
+    bookdown,
     rmarkdown,
     spelling,
     testthat (>= 3.0.0)

diff --git a/R/sim_linelist.R b/R/sim_linelist.R
@@ -246,8 +246,14 @@ sim_linelist <- function(R, # nolint cyclocomp
   chain$hospitalisation_date <- chain$hosp_rounded + outbreak_start_date
   chain$death_date <- chain$death_rounded + outbreak_start_date
 
+  linelist_cols <- c(
+    "id", "case_type", "gender", "age", "onset_date", "hospitalisation_date",
+    "date_first_contact", "date_last_contact"
+  )
+
   if (add_names) {
     chain <- .add_names(.data = chain)
+    linelist_cols <- append(linelist_cols, "case_name", after = 1)
   }
 
   # add confirmed, probable, suspected case types
@@ -258,10 +264,6 @@ sim_linelist <- function(R, # nolint cyclocomp
     prob = case_type_probs
   )
 
-  linelist_cols <- c(
-    "id", "gender", "age", "onset_date", "hospitalisation_date",
-    "date_first_contact", "date_last_contact"
-  )
 
   # add Ct if confirmed
   if (add_ct) {
@@ -274,7 +276,6 @@ sim_linelist <- function(R, # nolint cyclocomp
     contacts <- create_contacts(.data = chain) # nolint var assigned not used
   }
 
-
   chain <- chain[, linelist_cols]
 
   # return linelist

diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -19,9 +19,12 @@ Linelist
 packagename
 parameterised
 Poisson
+randomNames
 RECON
+resimulate
 SARS
 sim
 st
+stochasticity
 svg
 yaml
diff --git a/tests/testthat/test-sim_linelist.R b/tests/testthat/test-sim_linelist.R
@@ -31,11 +31,11 @@ test_that("sim_list works as expected", {
   )
 
   expect_s3_class(linelist, class = "data.frame")
-  expect_identical(dim(linelist), c(42L, 7L))
+  expect_identical(dim(linelist), c(42L, 9L))
   expect_identical(
     colnames(linelist),
-    c("id", "gender", "age", "onset_date", "hospitalisation_date",
-      "date_first_contact", "date_last_contact")
+    c("id", "case_name", "case_type", "gender", "age", "onset_date",
+      "hospitalisation_date", "date_first_contact", "date_last_contact")
   )
 })
 
@@ -67,11 +67,11 @@ test_that("sim_list works as expected with age-strat rates", {
   )
 
   expect_s3_class(linelist, class = "data.frame")
-  expect_identical(dim(linelist), c(42L, 7L))
+  expect_identical(dim(linelist), c(42L, 9L))
   expect_identical(
     colnames(linelist),
-    c("id", "gender", "age", "onset_date", "hospitalisation_date",
-      "date_first_contact", "date_last_contact")
+    c("id", "case_name", "case_type", "gender", "age", "onset_date",
+      "hospitalisation_date", "date_first_contact", "date_last_contact")
   )
 })
 
@@ -85,12 +85,32 @@ test_that("sim_list works as expected with Ct", {
     add_ct = TRUE
   )
 
+  expect_s3_class(linelist, class = "data.frame")
+  expect_identical(dim(linelist), c(42L, 10L))
+  expect_identical(
+    colnames(linelist),
+    c("id", "case_name", "case_type", "gender", "age", "onset_date",
+      "hospitalisation_date", "date_first_contact", "date_last_contact",
+      "ct_value")
+  )
+})
+
+test_that("sim_list works as expected with anonymous", {
+  set.seed(1)
+  linelist <- sim_linelist(
+    R = 1.1,
+    serial_interval = serial_interval,
+    onset_to_hosp = onset_to_hosp,
+    onset_to_death = onset_to_death,
+    add_names = FALSE
+  )
+
   expect_s3_class(linelist, class = "data.frame")
   expect_identical(dim(linelist), c(42L, 8L))
   expect_identical(
     colnames(linelist),
-    c("id", "gender", "age", "onset_date", "hospitalisation_date",
-      "date_first_contact", "date_last_contact", "ct_value")
+    c("id", "case_type", "gender", "age", "onset_date",
+      "hospitalisation_date", "date_first_contact", "date_last_contact")
   )
 })
 

diff --git a/vignettes/.gitignore b/vignettes/.gitignore
@@ -0,0 +1,2 @@
+*.html
+*.R
diff --git a/vignettes/simulist.Rmd b/vignettes/simulist.Rmd
@@ -0,0 +1,159 @@
+---
+title: "Getting Started with {simulist}"
+output: 
+  bookdown::html_vignette2:
+    code_folding: show
+pkgdown:
+  as_is: true
+vignette: >
+  %\VignetteIndexEntry{Getting Started with {simulist}}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+```
+
+This is an introductory vignette to the {simulist} R package. {simulist} simulates epidemiological linelist data, to provide a row-by-row account of cases in an infectious disease outbreak.
+
+The main function in the {simulist} package is `sim_linelist()`. This functions takes in arguments that control the dynamics of the outbreak, such as the serial interval, and outputs a linelist table (`<data.frame>`) with case information for each infected individual.
+
+For this introduction we will simulate a linelist for a COVID-19 (SARS-CoV-2) outbreak. This will require two R packages: {simulist}, to produce the linelist, and {epiparameter} to provide epidemiological parameters, such as onset-to-death delays.
+
+```{r setup}
+library(simulist)
+library(epiparameter)
+```
+
+First we load in some data that is required for the linelist simulation. Data on epidemiological parameters and distributions is read from the {epiparameter} R package.
+
+```{r read-epidist}
+# get serial interval from {epiparameter} database
+
+serial_interval <- epidist(
+  disease = "COVID-19",
+  epi_dist = "serial interval",
+  prob_distribution = "gamma",
+  prob_distribution_params = c(shape = 1, scale = 1)
+)
+
+# get onset to hospital admission from {epiparameter} database
+onset_to_hosp <- epidist_db(
+  disease = "COVID-19",
+  epi_dist = "onset to hospitalisation",
+  single_epidist = TRUE
+)
+
+# get onset to death from {epiparameter} database
+onset_to_death <- epidist_db(
+  disease = "COVID-19",
+  epi_dist = "onset to death",
+  single_epidist = TRUE
+)
+```
+
+The reproduction number (`R`) is the first argument in `sim_linelist()` and with the serial interval will control the growth rate of the simulated epidemic. Here we set it as 1.1. The minimum requirements to simulate a linelist are the reproduction number (`R`), the serial interval, onset-to-hospitalisation delay and onset-to-death delay.
+
+```{r, sim-linelist}
+linelist <- sim_linelist(
+  R = 1.1,
+  serial_interval = serial_interval,
+  onset_to_hosp = onset_to_hosp,
+  onset_to_death = onset_to_death
+)
+head(linelist)
+```
+
+## Case type 
+
+In the midst of an infectious disease outbreak it may not be possible to confirm every case. A confirmed case is usually done via a diagnostic test which may or may not need to be carried out in the laboratory. There are several reasons why a case may not be confirmed, one example is limited testing capacity, especially in fast growing epidemics. The other categories for cases are: probable and suspected. Probable cases are those that show clinical evidence for the disease but have not, or cannot, be confirmed by a diagnostic test. Suspected cases are those that are possibly infected but do not show clear clinical or epidemiological evidence, nor has a diagnostic test been performed.
+
+The linelist output from the {simulist} simulation contains a column (`case_type`) with the type of case. 
+
+{simulist} can simulate varying probabilities of each case being suspected, probable or confirmed. By default the `sim_linelist()` function uses probabilities of `suspected = 0.2`, `probable = 0.3` and `confirmed = 0.5`. 
+
+```{r, sim-linelist-default-case-type}
+linelist <- sim_linelist(
+  R = 1.1,
+  serial_interval = serial_interval,
+  onset_to_hosp = onset_to_hosp,
+  onset_to_death = onset_to_death
+)
+head(linelist)
+```
+
+To alter these probabilities, supply a named vector to the `sim_linelist()` argument `case_type_probs`. The vector should contain three numbers, with the names `suspected`, `probable` and `confirmed`, with the numbers summing to one. Here we change the values to simulate an outbreak in which the proportion of cases confirmed by laboratory testing is high.
+
+```{r, sim-linelist-mod-case-type}
+linelist <- sim_linelist(
+  R = 1.1,
+  serial_interval = serial_interval,
+  onset_to_hosp = onset_to_hosp,
+  onset_to_death = onset_to_death,
+  case_type_probs = c(suspected = 0.05, probable = 0.05, confirmed = 0.9)
+)
+head(linelist)
+```
+
+It is also possible to set one of these categories to `1`, in which case every case will be of that type. 
+
+The way {simulist} assigns case types is by pasting case types onto existing case data. Thus, it could be viewed that the true underlying data is that all cases in the simulation are confirmed, but that there is a lack of information in some cases. There are no cases in the output linelist that are incorrectly attributed as probable or suspected that have not been infected. That is to say, all individuals in the linelist, whatever their case type, are infected during the outbreak.
+
+## Conditioning simulation on outbreak size
+
+The reproduction number has a strong influence on the size of an outbreak. However, the {simulist} package generates linelist data using a stochastic algorithm, so even when $R < 1$ it can produce a substantial outbreak by chance, or an $R >> 1$ will sometimes not produce a vast epidemic in one simulation (i.e. one replicate) due to the stochasticity. 
+
+When requiring a minimum outbreak size we can specify the `min_chain_size` argument in `sim_linelist()`. By default this is set to 10. This means that the simulation will not return a linelist until the conditioning has been met. In other words, the simulation will resimulate a branching process model until an outbreak infects at least 10 people.
+
+When requiring a linelist that represents a large outbreak, such as the COVID-19 outbreak, setting the `min_chain_size` to a larger number guarantees a linelist of at least that size. Here we simulate a linelist requiring at least 250 cases.
+
+```{r, sim-large-linelist}
+linelist <- sim_linelist(
+  R = 1.1,
+  serial_interval = serial_interval,
+  onset_to_hosp = onset_to_hosp,
+  onset_to_death = onset_to_death,
+  min_chain_size = 250
+)
+head(linelist)
+```
+
+The amount of time the simulation takes can be determined by the reproduction number (`R`) and the minimum outbreak size (`min_chain_size`). If the `min_outbreak_size` is large, for example hundreds or thousands of cases, and the reproduction number is below one, it will take many branching process simulations until finding one that produces a sufficiently large epidemic.
+
+## Anonymous linelist
+
+By default `sim_linelist()` provides the name of each individual in the linelist. If an anonymised linelist is required the `add_names` argument of `sim_linelist()` can be set to `FALSE`. 
+
+```{r sim-anon-linelist}
+linelist <- sim_linelist(
+  R = 1.1,
+  serial_interval = serial_interval,
+  onset_to_hosp = onset_to_hosp,
+  onset_to_death = onset_to_death,
+  add_names = FALSE
+)
+head(linelist)
+```
+
+The names used in the linelist are produced at random by the [{randomNames}](https://CRAN.R-project.org/package=randomNames) R package. Therefore, even when `add_names = TRUE` there is no personal data of real individuals being produced or shared.
+
+## Population age
+
+By default `sim_linelist()` simulates individuals ages assuming a uniform distribution between 1 and 90. To change this age range, a vector of two numbers can be supplied to the `age_range` argument. Here we simulated an outbreak in a population with a population ranging from 5 to 75.
+
+```{r sim-linelist-age-range}
+linelist <- sim_linelist(
+  R = 1.1,
+  serial_interval = serial_interval,
+  onset_to_hosp = onset_to_hosp,
+  onset_to_death = onset_to_death,
+  age_range = c(5, 75)
+)
+head(linelist)
+```
+
+It is currently not possible to simulate with a non-uniform population age structure in {simulist}.