Merge pull request #115 from cmu-delphi/ds/clean

refactor: unpackage the functions in R/ folder, clean repo
cmu-delphi · May 13, 2024 · 54e3006 · 54e3006
2 parents 150d876 + 66a5e98
commit 54e3006
Show file tree

Hide file tree

Showing 46 changed files with 552 additions and 770 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
diff --git a/.github/workflows/R-CMD-check.yaml → .github/workflows/tests.yaml b/.github/workflows/R-CMD-check.yaml → .github/workflows/tests.yaml
@@ -30,19 +30,19 @@ jobs:
     steps:
       - uses: actions/checkout@v3
 
-      - uses: r-lib/actions/setup-pandoc@v2
+      - name: System dependencies
+        run: |
+          sudo apt-get install libudunits2-dev libgdal-dev libgeos-dev libproj-dev libglpk-dev
 
       - uses: r-lib/actions/setup-r@v2
         with:
           r-version: ${{ matrix.config.r }}
           http-user-agent: ${{ matrix.config.http-user-agent }}
           use-public-rspm: true
 
-      - uses: r-lib/actions/setup-r-dependencies@v2
-        with:
-          extra-packages: any::rcmdcheck
-          needs: check
+      - uses: r-lib/actions/setup-renv@v2
 
-      - uses: r-lib/actions/check-r-package@v2
-        with:
-          upload-snapshots: true
+      - name: Run tests
+        run: |
+          make test
+        shell: bash
diff --git a/DESCRIPTION b/DESCRIPTION
diff --git a/LICENSE b/LICENSE
@@ -1,2 +1,2 @@
 YEAR: 2023
-COPYRIGHT HOLDER: epieval authors
+COPYRIGHT HOLDER: exploration-tooling authors
diff --git a/LICENSE.md b/LICENSE.md
@@ -1,6 +1,6 @@
 # MIT License
 
-Copyright (c) 2023 epieval authors
+Copyright (c) 2023 exploration-tooling authors
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/Makefile b/Makefile
@@ -3,7 +3,10 @@ install:
 	Rscript -e "renv::restore()"
 	Rscript -e 'renv::install(".")'
 
-.PHONY: all run run-nohup sync download upload dashboard
+.PHONY: all test test-forecasters run run-nohup sync download upload dashboard
+
+test:
+	Rscript -e "testthat::test_dir('tests/testthat')"
 
 run:
 	Rscript scripts/run.R
@@ -12,16 +15,17 @@ run-nohup:
 	nohup Rscript scripts/run.R &
 
 sync:
-	Rscript scripts/sync.R
+	Rscript -e "source('R/utils.R'); sync_aws()"
 
 pull:
-	Rscript scripts/sync.R download
+	Rscript -e "source('R/utils.R'); sync_aws(direction = 'download')"
+
+download:pull
 
 push:
-	Rscript scripts/sync.R upload
+	Rscript -e "source('R/utils.R'); sync_aws(direction = 'upload')"
+
+upload: push
 
 dashboard:
 	Rscript scripts/dashboard.R
-
-test-forecasters:
-	Rscript scripts/test-forecasters-data.R
diff --git a/R/epieval-package.R b/R/epieval-package.R
diff --git a/R/data_transforms.R → R/forecasters/data_transforms.R b/R/data_transforms.R → R/forecasters/data_transforms.R
@@ -1,7 +1,7 @@
-# various reusable transforms to apply before handing to epipredict
+# Reusable transforms to apply before handing to epipredict
 
-#' extract the non-key, non-smoothed columns from epi_data
-#' @keywords internal
+#' Extract the non-key, non-smoothed columns from epi_data
+#'
 #' @param epi_data the `epi_df`
 #' @param cols vector of column names to use. If `NULL`, fill with all non-key columns
 get_trainable_names <- function(epi_data, cols) {
@@ -13,25 +13,28 @@ get_trainable_names <- function(epi_data, cols) {
   return(cols)
 }
 
-#' just the names which aren't keys for an epi_df
-#' @description
-#' names, but it excludes keys
+#' Just the names which aren't keys for an epi_df
+#'
+#' Names, but it excludes keys
+#'
 #' @param epi_data the epi_df
 get_nonkey_names <- function(epi_data) {
   cols <- names(epi_data)
   cols <- cols[!(cols %in% c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys))]
   return(cols)
 }
 
-#' get a rolling average for the named columns
-#' @description
-#' add column(s) that are the rolling means of the specified columns, as
-#'   implemented by slider. Defaults to the previous 7 days.
-#' Currently only group_by's on the geo_value. Should probably extend to more
-#'   keys if you have them
+#' Get a rolling average for the named columns
+#'
+#' Add column(s) that are the rolling means of the specified columns, as
+#' implemented by slider. Defaults to the previous 7 days. Currently only
+#' group_by's on the geo_value. Should probably extend to more keys if you have
+#' them.
+#'
 #' @param epi_data the dataset
 #' @param width the number of days (or examples, the sliding isn't time-aware) to use
 #' @param cols_to_mean the non-key columns to take the mean over. `NULL` means all
+#'
 #' @importFrom slider slide_dbl
 #' @importFrom epiprocess epi_slide
 #' @export
@@ -46,11 +49,12 @@ rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) {
   return(epi_data)
 }
 
-#' get a rolling standard deviation for the named columns
-#' @description
+#' Get a rolling standard deviation for the named columns
+#'
 #' A rolling standard deviation, based off of a rolling mean. First it
-#'   calculates a rolling mean with width `mean_width`, and then squares the
-#'   difference between that and the actual value, averaged over `sd_width`.
+#' calculates a rolling mean with width `mean_width`, and then squares the
+#' difference between that and the actual value, averaged over `sd_width`.
+#'
 #' @param epi_data the dataset
 #' @param sd_width the number of days (or examples, the sliding isn't
 #'   time-aware) to use for the standard deviation calculation
@@ -59,6 +63,7 @@ rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) {
 #'   (so 14 in the complete default case)
 #' @param cols_to_sd the non-key columns to take the sd over. `NULL` means all
 #' @param keep_mean bool, if `TRUE`, it retains keeps the mean column
+#'
 #' @importFrom epiprocess epi_slide
 #' @export
 rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = NULL, keep_mean = FALSE) {
@@ -82,3 +87,57 @@ rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd =
   }
   result %<>% ungroup()
 }
+
+#' Temporary patch that pulls `NA`'s out of an epi_df
+#'
+#' Just delete rows that have NA's in them. eventually epipredict should
+#' directly handle this so we don't have to
+#'
+#' @param epi_data the epi_df to be fixed
+#' @param outcome the column name containing the target variable
+#' @param extra_sources any other columns used as predictors
+#'
+#' @importFrom tidyr drop_na
+#' @importFrom epiprocess as_epi_df
+#' @export
+clear_lastminute_nas <- function(epi_data, outcome, extra_sources) {
+  meta_data <- attr(epi_data, "metadata")
+  if (extra_sources == c("")) {
+    extra_sources <- character(0L)
+  }
+  epi_data %<>%
+    drop_na(c(!!outcome, !!!extra_sources)) %>%
+    as_epi_df()
+  attr(epi_data, "metadata") <- meta_data
+  return(epi_data)
+}
+
+#' Only extend the ahead
+#'
+#' Instead of filling in new values, this just extends how far into the future
+#' the model is predicting. For example, if the last data is on the 3rd, the
+#' `as_of` is the 5th, and we want an ahead of 4, then this actually sets the
+#' ahead to be 6, since the 9th (the target date) is 6 days after the last day
+#' of data.
+#'
+#' @param epi_data the dataset
+#' @param ahead how many units (depending on the dataset, normally days or
+#' weeks) to predict ahead of the `forecast_date`
+#'
+#' @export
+extend_ahead <- function(epi_data, ahead) {
+  time_values <- epi_data$time_value
+  if (length(time_values) > 0) {
+    as_of <- attributes(epi_data)$metadata$as_of
+    max_time <- max(time_values)
+    if (is.null(as_of)) {
+      as_of <- max_time
+    }
+    effective_ahead <- as.integer(
+      as.Date(as_of) - max_time + ahead
+    )
+  } else {
+    effective_ahead <- Inf
+  }
+  return(list(epi_data, effective_ahead))
+}
diff --git a/R/data_validation.R → R/forecasters/data_validation.R b/R/data_validation.R → R/forecasters/data_validation.R
@@ -1,17 +1,19 @@
-#' helper function for those writing forecasters
-#' @description
-#' a smorgasbord of checks that any epipredict-based forecaster should do:
+#' Helper function for those writing forecasters
+#'
+#' A smorgasbord of checks that any epipredict-based forecaster should do:
 #' 1. check that the args list is created correctly,
 #' 2. rewrite an empty extra sources list from an empty string
 #' 3. validate the outcome and predictors as present,
 #' 4. make sure the trainer is a `regression` model from `parsnip`
 #' 5. adjust the trainer's quantiles based on those in args_list if it's a
 #'    quantile trainer
 #' 6. remake the lags to match the numebr of predictors
+#'
 #' @inheritParams scaled_pop
 #' @param predictors the full list of predictors including the outcome. can
 #'   include empty strings
 #' @param args_list the args list created by [`epipredict::arx_args_list`]
+#'
 #' @export
 sanitize_args_predictors_trainer <- function(epi_data,
                                              outcome,
@@ -40,10 +42,11 @@ sanitize_args_predictors_trainer <- function(epi_data,
   return(list(args_list, predictors, trainer))
 }
 
-#' confirm that there's enough data to run this model
-#' @description
-#' epipredict is a little bit fragile about having enough data to train; we want
-#'   to be able to return a null result rather than error out.
+#' Confirm that there's enough data to run this model
+#'
+#' Epipredict is a little bit fragile about having enough data to train; we want
+#' to be able to return a null result rather than error out.
+#'
 #' @param epi_data the input data
 #' @param ahead the effective ahead; may be infinite if there isn't enough data.
 #' @param args_input the input as supplied to `slide_forecaster`; lags is the
@@ -54,6 +57,7 @@ sanitize_args_predictors_trainer <- function(epi_data,
 #' @param buffer how many training data to insist on having (e.g. if `buffer=1`,
 #'   this trains on one sample; the default is set so that `linear_reg` isn't
 #'   rank deficient)
+#'
 #' @importFrom tidyr drop_na
 #' @export
 confirm_sufficient_data <- function(epi_data, ahead, args_input, outcome, extra_sources, buffer = 9) {

diff --git a/R/ensemble_average.R → R/forecasters/ensemble_average.R b/R/ensemble_average.R → R/forecasters/ensemble_average.R
@@ -1,8 +1,9 @@
-#' an ensemble model that averages each quantile separately
-#' @description
+#' An ensemble model that averages each quantile separately
+#'
 #' The simplest class of ensembing models, it takes in a list of quantile
-#'   forecasts and averages them on a per-quantile basis. By default the average
-#'   used is the median, but it can accept any vectorized function.
+#' forecasts and averages them on a per-quantile basis. By default the average
+#' used is the median, but it can accept any vectorized function.
+#'
 #' @param epi_data unused for this forecaster, but potentially an ensemble may
 #'   want the underlying data.
 #' @param outcome The name of the target variable.
@@ -17,6 +18,7 @@
 #' @param ensemble_args_names an argument purely for use in targets. You
 #'   probably shouldn't worry about it. In a target, it should probably be
 #'   `ensemble_args_names = names(ensemble_args)`
+#'
 #' @importFrom rlang %||%
 #' @export
 ensemble_average <- function(epi_data,
@@ -27,9 +29,7 @@ ensemble_average <- function(epi_data,
                              ensemble_args_names = NULL) {
   # unique parameters must be buried in ensemble_args so that the generic function signature is stable
   # their names are separated for obscure target related reasons
-  if (!is.null(ensemble_args_names)) {
-    names(ensemble_args) <- ensemble_args_names
-  }
+  names(ensemble_args) <- ensemble_args_names %||% names(ensemble_args)
   average_type <- ensemble_args$average_type %||% median
   join_columns <- ensemble_args$join_columns %||% c("geo_value", "forecast_date", "target_end_date", "quantile")
   # begin actual analysis