Skip to content

Commit

Permalink
Merge pull request #115 from cmu-delphi/ds/clean
Browse files Browse the repository at this point in the history
refactor: unpackage the functions in R/ folder, clean repo
  • Loading branch information
dshemetov authored May 13, 2024
2 parents 150d876 + 66a5e98 commit 54e3006
Show file tree
Hide file tree
Showing 46 changed files with 552 additions and 770 deletions.
6 changes: 0 additions & 6 deletions .Rbuildignore

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,19 @@ jobs:
steps:
- uses: actions/checkout@v3

- uses: r-lib/actions/setup-pandoc@v2
- name: System dependencies
run: |
sudo apt-get install libudunits2-dev libgdal-dev libgeos-dev libproj-dev libglpk-dev
- uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.config.r }}
http-user-agent: ${{ matrix.config.http-user-agent }}
use-public-rspm: true

- uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: any::rcmdcheck
needs: check
- uses: r-lib/actions/setup-renv@v2

- uses: r-lib/actions/check-r-package@v2
with:
upload-snapshots: true
- name: Run tests
run: |
make test
shell: bash
53 changes: 0 additions & 53 deletions DESCRIPTION

This file was deleted.

2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
YEAR: 2023
COPYRIGHT HOLDER: epieval authors
COPYRIGHT HOLDER: exploration-tooling authors
2 changes: 1 addition & 1 deletion LICENSE.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# MIT License

Copyright (c) 2023 epieval authors
Copyright (c) 2023 exploration-tooling authors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
18 changes: 11 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@ install:
Rscript -e "renv::restore()"
Rscript -e 'renv::install(".")'

.PHONY: all run run-nohup sync download upload dashboard
.PHONY: all test test-forecasters run run-nohup sync download upload dashboard

test:
Rscript -e "testthat::test_dir('tests/testthat')"

run:
Rscript scripts/run.R
Expand All @@ -12,16 +15,17 @@ run-nohup:
nohup Rscript scripts/run.R &

sync:
Rscript scripts/sync.R
Rscript -e "source('R/utils.R'); sync_aws()"

pull:
Rscript scripts/sync.R download
Rscript -e "source('R/utils.R'); sync_aws(direction = 'download')"

download:pull

push:
Rscript scripts/sync.R upload
Rscript -e "source('R/utils.R'); sync_aws(direction = 'upload')"

upload: push

dashboard:
Rscript scripts/dashboard.R

test-forecasters:
Rscript scripts/test-forecasters-data.R
7 changes: 0 additions & 7 deletions R/epieval-package.R

This file was deleted.

91 changes: 75 additions & 16 deletions R/data_transforms.R → R/forecasters/data_transforms.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# various reusable transforms to apply before handing to epipredict
# Reusable transforms to apply before handing to epipredict

#' extract the non-key, non-smoothed columns from epi_data
#' @keywords internal
#' Extract the non-key, non-smoothed columns from epi_data
#'
#' @param epi_data the `epi_df`
#' @param cols vector of column names to use. If `NULL`, fill with all non-key columns
get_trainable_names <- function(epi_data, cols) {
Expand All @@ -13,25 +13,28 @@ get_trainable_names <- function(epi_data, cols) {
return(cols)
}

#' just the names which aren't keys for an epi_df
#' @description
#' names, but it excludes keys
#' Just the names which aren't keys for an epi_df
#'
#' Names, but it excludes keys
#'
#' @param epi_data the epi_df
get_nonkey_names <- function(epi_data) {
cols <- names(epi_data)
cols <- cols[!(cols %in% c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys))]
return(cols)
}

#' get a rolling average for the named columns
#' @description
#' add column(s) that are the rolling means of the specified columns, as
#' implemented by slider. Defaults to the previous 7 days.
#' Currently only group_by's on the geo_value. Should probably extend to more
#' keys if you have them
#' Get a rolling average for the named columns
#'
#' Add column(s) that are the rolling means of the specified columns, as
#' implemented by slider. Defaults to the previous 7 days. Currently only
#' group_by's on the geo_value. Should probably extend to more keys if you have
#' them.
#'
#' @param epi_data the dataset
#' @param width the number of days (or examples, the sliding isn't time-aware) to use
#' @param cols_to_mean the non-key columns to take the mean over. `NULL` means all
#'
#' @importFrom slider slide_dbl
#' @importFrom epiprocess epi_slide
#' @export
Expand All @@ -46,11 +49,12 @@ rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) {
return(epi_data)
}

#' get a rolling standard deviation for the named columns
#' @description
#' Get a rolling standard deviation for the named columns
#'
#' A rolling standard deviation, based off of a rolling mean. First it
#' calculates a rolling mean with width `mean_width`, and then squares the
#' difference between that and the actual value, averaged over `sd_width`.
#' calculates a rolling mean with width `mean_width`, and then squares the
#' difference between that and the actual value, averaged over `sd_width`.
#'
#' @param epi_data the dataset
#' @param sd_width the number of days (or examples, the sliding isn't
#' time-aware) to use for the standard deviation calculation
Expand All @@ -59,6 +63,7 @@ rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) {
#' (so 14 in the complete default case)
#' @param cols_to_sd the non-key columns to take the sd over. `NULL` means all
#' @param keep_mean bool, if `TRUE`, it retains keeps the mean column
#'
#' @importFrom epiprocess epi_slide
#' @export
rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = NULL, keep_mean = FALSE) {
Expand All @@ -82,3 +87,57 @@ rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd =
}
result %<>% ungroup()
}

#' Temporary patch that pulls `NA`'s out of an epi_df
#'
#' Just delete rows that have NA's in them. eventually epipredict should
#' directly handle this so we don't have to
#'
#' @param epi_data the epi_df to be fixed
#' @param outcome the column name containing the target variable
#' @param extra_sources any other columns used as predictors
#'
#' @importFrom tidyr drop_na
#' @importFrom epiprocess as_epi_df
#' @export
clear_lastminute_nas <- function(epi_data, outcome, extra_sources) {
meta_data <- attr(epi_data, "metadata")
if (extra_sources == c("")) {
extra_sources <- character(0L)
}
epi_data %<>%
drop_na(c(!!outcome, !!!extra_sources)) %>%
as_epi_df()
attr(epi_data, "metadata") <- meta_data
return(epi_data)
}

#' Only extend the ahead
#'
#' Instead of filling in new values, this just extends how far into the future
#' the model is predicting. For example, if the last data is on the 3rd, the
#' `as_of` is the 5th, and we want an ahead of 4, then this actually sets the
#' ahead to be 6, since the 9th (the target date) is 6 days after the last day
#' of data.
#'
#' @param epi_data the dataset
#' @param ahead how many units (depending on the dataset, normally days or
#' weeks) to predict ahead of the `forecast_date`
#'
#' @export
extend_ahead <- function(epi_data, ahead) {
time_values <- epi_data$time_value
if (length(time_values) > 0) {
as_of <- attributes(epi_data)$metadata$as_of
max_time <- max(time_values)
if (is.null(as_of)) {
as_of <- max_time
}
effective_ahead <- as.integer(
as.Date(as_of) - max_time + ahead
)
} else {
effective_ahead <- Inf
}
return(list(epi_data, effective_ahead))
}
18 changes: 11 additions & 7 deletions R/data_validation.R → R/forecasters/data_validation.R
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
#' helper function for those writing forecasters
#' @description
#' a smorgasbord of checks that any epipredict-based forecaster should do:
#' Helper function for those writing forecasters
#'
#' A smorgasbord of checks that any epipredict-based forecaster should do:
#' 1. check that the args list is created correctly,
#' 2. rewrite an empty extra sources list from an empty string
#' 3. validate the outcome and predictors as present,
#' 4. make sure the trainer is a `regression` model from `parsnip`
#' 5. adjust the trainer's quantiles based on those in args_list if it's a
#' quantile trainer
#' 6. remake the lags to match the numebr of predictors
#'
#' @inheritParams scaled_pop
#' @param predictors the full list of predictors including the outcome. can
#' include empty strings
#' @param args_list the args list created by [`epipredict::arx_args_list`]
#'
#' @export
sanitize_args_predictors_trainer <- function(epi_data,
outcome,
Expand Down Expand Up @@ -40,10 +42,11 @@ sanitize_args_predictors_trainer <- function(epi_data,
return(list(args_list, predictors, trainer))
}

#' confirm that there's enough data to run this model
#' @description
#' epipredict is a little bit fragile about having enough data to train; we want
#' to be able to return a null result rather than error out.
#' Confirm that there's enough data to run this model
#'
#' Epipredict is a little bit fragile about having enough data to train; we want
#' to be able to return a null result rather than error out.
#'
#' @param epi_data the input data
#' @param ahead the effective ahead; may be infinite if there isn't enough data.
#' @param args_input the input as supplied to `slide_forecaster`; lags is the
Expand All @@ -54,6 +57,7 @@ sanitize_args_predictors_trainer <- function(epi_data,
#' @param buffer how many training data to insist on having (e.g. if `buffer=1`,
#' this trains on one sample; the default is set so that `linear_reg` isn't
#' rank deficient)
#'
#' @importFrom tidyr drop_na
#' @export
confirm_sufficient_data <- function(epi_data, ahead, args_input, outcome, extra_sources, buffer = 9) {
Expand Down
14 changes: 7 additions & 7 deletions R/ensemble_average.R → R/forecasters/ensemble_average.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#' an ensemble model that averages each quantile separately
#' @description
#' An ensemble model that averages each quantile separately
#'
#' The simplest class of ensembing models, it takes in a list of quantile
#' forecasts and averages them on a per-quantile basis. By default the average
#' used is the median, but it can accept any vectorized function.
#' forecasts and averages them on a per-quantile basis. By default the average
#' used is the median, but it can accept any vectorized function.
#'
#' @param epi_data unused for this forecaster, but potentially an ensemble may
#' want the underlying data.
#' @param outcome The name of the target variable.
Expand All @@ -17,6 +18,7 @@
#' @param ensemble_args_names an argument purely for use in targets. You
#' probably shouldn't worry about it. In a target, it should probably be
#' `ensemble_args_names = names(ensemble_args)`
#'
#' @importFrom rlang %||%
#' @export
ensemble_average <- function(epi_data,
Expand All @@ -27,9 +29,7 @@ ensemble_average <- function(epi_data,
ensemble_args_names = NULL) {
# unique parameters must be buried in ensemble_args so that the generic function signature is stable
# their names are separated for obscure target related reasons
if (!is.null(ensemble_args_names)) {
names(ensemble_args) <- ensemble_args_names
}
names(ensemble_args) <- ensemble_args_names %||% names(ensemble_args)
average_type <- ensemble_args$average_type %||% median
join_columns <- ensemble_args$join_columns %||% c("geo_value", "forecast_date", "target_end_date", "quantile")
# begin actual analysis
Expand Down
Loading

0 comments on commit 54e3006

Please sign in to comment.