From c03221e50747da660f15b71cbdac4212e51cc4bf Mon Sep 17 00:00:00 2001
From: "Logan C. Brooks" <lcbrooks+github@andrew.cmu.edu>
Date: Tue, 17 Dec 2024 16:11:00 -0800
Subject: [PATCH 1/4] Fix age group rate aggregation example

---
 vignettes/epi_df.Rmd | 107 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 94 insertions(+), 13 deletions(-)

diff --git a/vignettes/epi_df.Rmd b/vignettes/epi_df.Rmd
index 15f763389..129856f3e 100644
--- a/vignettes/epi_df.Rmd
+++ b/vignettes/epi_df.Rmd
@@ -209,12 +209,41 @@ Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/flusurv.html)
 
 ```{r}
 library(epidatr)
-flu_data <- pub_flusurv(
+flu_data_api <- pub_flusurv(
   locations = "ca",
-  epiweeks = epirange(201801, 202001),
-) %>%
-  select(location, epiweek, issue, rate_age_0, rate_age_1, rate_age_2, rate_age_3, rate_age_4) %>%
-  tidyr::pivot_longer(cols = starts_with("rate_age_"), names_to = "age_group", values_to = "rate")
+  epiweeks = epirange(201801, 202001)
+)
+```
+
+We're interested in the age-specific rates:
+```{r}
+flu_data <- flu_data_api %>%
+  select(location, epiweek, rate_age_0, rate_age_1, rate_age_2, rate_age_3, rate_age_4) %>%
+  # Turn `rate_age_0`..`rate_age_4` columns into an `age_group` and `rate`
+  # column (with 5x as many rows):
+  tidyr::pivot_longer(
+    cols = starts_with("rate_age_"), names_to = "age_group", values_to = "rate",
+    # When converting column names to entries in `age_group`, remove this prefix:
+    names_prefix = "rate_age_",
+    # And add a better prefix:
+    names_transform = function(age_group) paste0("age_group_", age_group)
+  ) %>%
+  # Improve `age_group` labels a bit more:
+  mutate(
+    age_group = case_match(
+      age_group,
+      "age_group_0" ~ "0--4 yr",
+      "age_group_1" ~ "5--17 yr",
+      "age_group_2" ~ "18--49 yr",
+      "age_group_3" ~ "50--64 yr",
+      "age_group_4" ~ ">= 65 yr",
+      # Make this a factor with appropriate level ordering:
+      .ptype = factor(levels = c("0--4 yr", "5--17 yr", "18--49 yr", "50--64 yr", ">= 65 yr"))
+    )
+  ) %>%
+  # The API currently outputs `epiweek` in Date format (the constituent Sunday);
+  # rename it to remind us that it's not in YYYYww format:
+  rename(time_value = epiweek)
 flu_data
 ```
 
@@ -222,22 +251,75 @@ We can now convert this data to an `epi_df` object and set the `age_group`
 column as an additional group key:
 
 ```{r}
-flu_data <- flu_data %>% as_epi_df(other_keys = "age_group", as_of = as.Date("2024-03-20"))
+flu_data <- flu_data %>% as_epi_df(other_keys = "age_group")
 flu_data
 ```
 
 Note that the `epi_df` object now has an additional key column `age_group`. This
 means that there should only be one row for each combination of `geo_value`,
-`time_value`, and `age_group` in the dataset (this is enforced at construction
+`age_group`, and `time_value` in the dataset (this is enforced at construction
 time).
 
-Now we can aggregate the data by `age_group`, if we want to compute the total:
-
+Now we can aggregate the data by `age_group`, if we want to compute the total.
+Since we are working with rates, we need to attach some population data in order
+to do this aggregation. It's somewhat ambiguous whether FluSurv-NET reporting
+uses either [NCHS](https://www.cdc.gov/nchs/nvss/bridged_race.htm) or
+[Census](https://www.census.gov/programs-surveys/popest/technical-documentation/research/evaluation-estimates/2020-evaluation-estimates/2010s-county-detail.html)
+populations for `time_value`s before 2020 included in reports from 2020 onward,
+but in this case, the two sources agreed exactly. FluSurv-NET also directly
+reports an overall rate, so we can check our work.
 ```{r}
-group_cols <- key_colnames(exclude = "age_group")
-flu_data %>%
-  sum_groups_epi_df("rate", group_cols = group_cols)
+group_cols <- key_colnames(flu_data, exclude = "age_group")
+rate_overall_recalc_edf <-
+  flu_data %>%
+  inner_join(
+    # Population estimates for FluSurv-NET-covered part of CA on 2017-07-01 and
+    # 2018-07-01, extracted and aggregated from "vintage 2020" estimates (actually
+    # released by Census in June 2021 and by NCHS in September 2021), which is the
+    # last available reporting found with population estimates for 2017 and 2018:
+    tribble(
+      ~geo_value, ~age_group,  ~time_value,           ~pop,
+      "CA",       "0--4 yr",   as.Date("2017-07-01"), 203813,
+      "CA",       "5--17 yr",  as.Date("2017-07-01"), 521827,
+      "CA",       "18--49 yr", as.Date("2017-07-01"), 1722399,
+      "CA",       "50--64 yr", as.Date("2017-07-01"), 700090,
+      "CA",       ">= 65 yr",  as.Date("2017-07-01"), 534789,
+      "CA",       "0--4 yr",   as.Date("2018-07-01"), 201265,
+      "CA",       "5--17 yr",  as.Date("2018-07-01"), 520077,
+      "CA",       "18--49 yr", as.Date("2018-07-01"), 1725382,
+      "CA",       "50--64 yr", as.Date("2018-07-01"), 699145,
+      "CA",       ">= 65 yr",  as.Date("2018-07-01"), 551243,
+    ),
+    # Simple population interpolation/extrapolation scheme: last observation
+    # carried forward. Use the estimated population on 2017-07-01 for
+    # time_values 2017-07-01 through 2018-06-30, and the estimated population on
+    # 2018-07-01 for all subsequent time_values:
+    join_by(geo_value, closest(y$time_value <= x$time_value), age_group),
+    suffix = c("", "_for_pop"),
+    relationship = "many-to-one", unmatched = "error"
+  ) %>%
+  select(-time_value_for_pop) %>%
+  group_by(geo_value, time_value) %>%
+  mutate(count = rate * pop / 100e3) %>%
+  ungroup() %>%
+  sum_groups_epi_df(c("count", "pop"), group_cols = group_cols) %>%
+  mutate(rate_overall_recalc = count / pop * 100e3) %>%
+  # match rounding of original data:
+  mutate(rate_overall_recalc = round(rate_overall_recalc, 1)) %>%
+  # compare to published overall rates:
+  inner_join(
+    flu_data_api %>%
+      select(geo_value = location, time_value = epiweek, rate_overall),
+    by = c("geo_value", "time_value"),
+    relationship = "one-to-one", unmatched = "error"
+  )
+# What's our maximum error vs. the official overall estimates?
+max(abs(rate_overall_recalc_edf$rate_overall - rate_overall_recalc_edf$rate_overall_recalc))
 ```
+This small amount of difference is expected, since all the age-specific rates
+were rounded to the first decimal place, and population data might have been
+interpolated and extrapolated a bit differently in the official source, limiting
+our ability to precisely recreate its estimates from an age group breakdown.
 
 ## Detecting and filling time gaps with `complete.epi_df`
 
@@ -515,4 +597,3 @@ Engineering. Copyright Johns Hopkins University 2020.
 API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
 These signals are taken directly from the JHU CSSE [COVID-19 GitHub
 repository](https://github.com/CSSEGISandData/COVID-19) without changes.
-

From b569131305741c12bad05397bd05524852814b21 Mon Sep 17 00:00:00 2001
From: nmdefries <42820733+nmdefries@users.noreply.github.com>
Date: Fri, 10 Jan 2025 19:40:26 -0500
Subject: [PATCH 2/4] switch to summing pop-weighted rates

---
 vignettes/epi_df.Rmd | 75 +++++++++++++++++++++++++++++---------------
 1 file changed, 50 insertions(+), 25 deletions(-)

diff --git a/vignettes/epi_df.Rmd b/vignettes/epi_df.Rmd
index 129856f3e..8644251a5 100644
--- a/vignettes/epi_df.Rmd
+++ b/vignettes/epi_df.Rmd
@@ -268,28 +268,40 @@ uses either [NCHS](https://www.cdc.gov/nchs/nvss/bridged_race.htm) or
 populations for `time_value`s before 2020 included in reports from 2020 onward,
 but in this case, the two sources agreed exactly. FluSurv-NET also directly
 reports an overall rate, so we can check our work.
+```{r}
+# Population estimates for FluSurv-NET-covered part of CA on 2017-07-01 and
+# 2018-07-01, extracted and aggregated from "vintage 2020" estimates (actually
+# released by Census in June 2021 and by NCHS in September 2021), which is the
+# last available reporting found with population estimates for 2017 and 2018:
+pop <- tribble(
+  ~geo_value, ~age_group,  ~time_value,           ~pop,
+  "CA",       "0--4 yr",   as.Date("2017-07-01"), 203813,
+  "CA",       "5--17 yr",  as.Date("2017-07-01"), 521827,
+  "CA",       "18--49 yr", as.Date("2017-07-01"), 1722399,
+  "CA",       "50--64 yr", as.Date("2017-07-01"), 700090,
+  "CA",       ">= 65 yr",  as.Date("2017-07-01"), 534789,
+  "CA",       "0--4 yr",   as.Date("2018-07-01"), 201265,
+  "CA",       "5--17 yr",  as.Date("2018-07-01"), 520077,
+  "CA",       "18--49 yr", as.Date("2018-07-01"), 1725382,
+  "CA",       "50--64 yr", as.Date("2018-07-01"), 699145,
+  "CA",       ">= 65 yr",  as.Date("2018-07-01"), 551243,
+)
+# Calculate fraction of total population in each age group.
+pop <- pop %>% 
+  group_by(geo_value, time_value) %>% 
+  mutate(frac_pop = pop / sum(pop))
+```
+
+After joining population onto the rate data, we can calculate the
+population-weighted rate for each age group, that is, the portion of each
+`age_group`'s rate that it contributes to the overall rate.
+
 ```{r}
 group_cols <- key_colnames(flu_data, exclude = "age_group")
-rate_overall_recalc_edf <-
+fractional_rate_by_age_group <-
   flu_data %>%
   inner_join(
-    # Population estimates for FluSurv-NET-covered part of CA on 2017-07-01 and
-    # 2018-07-01, extracted and aggregated from "vintage 2020" estimates (actually
-    # released by Census in June 2021 and by NCHS in September 2021), which is the
-    # last available reporting found with population estimates for 2017 and 2018:
-    tribble(
-      ~geo_value, ~age_group,  ~time_value,           ~pop,
-      "CA",       "0--4 yr",   as.Date("2017-07-01"), 203813,
-      "CA",       "5--17 yr",  as.Date("2017-07-01"), 521827,
-      "CA",       "18--49 yr", as.Date("2017-07-01"), 1722399,
-      "CA",       "50--64 yr", as.Date("2017-07-01"), 700090,
-      "CA",       ">= 65 yr",  as.Date("2017-07-01"), 534789,
-      "CA",       "0--4 yr",   as.Date("2018-07-01"), 201265,
-      "CA",       "5--17 yr",  as.Date("2018-07-01"), 520077,
-      "CA",       "18--49 yr", as.Date("2018-07-01"), 1725382,
-      "CA",       "50--64 yr", as.Date("2018-07-01"), 699145,
-      "CA",       ">= 65 yr",  as.Date("2018-07-01"), 551243,
-    ),
+    pop,
     # Simple population interpolation/extrapolation scheme: last observation
     # carried forward. Use the estimated population on 2017-07-01 for
     # time_values 2017-07-01 through 2018-06-30, and the estimated population on
@@ -298,14 +310,27 @@ rate_overall_recalc_edf <-
     suffix = c("", "_for_pop"),
     relationship = "many-to-one", unmatched = "error"
   ) %>%
-  select(-time_value_for_pop) %>%
-  group_by(geo_value, time_value) %>%
-  mutate(count = rate * pop / 100e3) %>%
-  ungroup() %>%
-  sum_groups_epi_df(c("count", "pop"), group_cols = group_cols) %>%
-  mutate(rate_overall_recalc = count / pop * 100e3) %>%
+  mutate(frac_rate = rate * frac_pop)
+```
+
+We can then use `sum_groups_epi_df` to sum population-weighted rate across all
+age groups to get the overall rate.
+
+```{r}
+rate_overall_recalc_edf <-
+  fractional_rate_by_age_group %>% 
+  sum_groups_epi_df("frac_rate", group_cols = group_cols) %>%
+  rename(rate_overall_recalc = frac_rate) %>% 
   # match rounding of original data:
-  mutate(rate_overall_recalc = round(rate_overall_recalc, 1)) %>%
+  mutate(rate_overall_recalc = round(rate_overall_recalc, 1))
+rate_overall_recalc_edf
+```
+
+Let's compare our calculated rate to the overall rate reported by FluSurv-NET.
+
+```{r}
+rate_overall_recalc_edf <-
+  rate_overall_recalc_edf %>% 
   # compare to published overall rates:
   inner_join(
     flu_data_api %>%

From ee04efa81996a1097a843821a4d8da6249c7d192 Mon Sep 17 00:00:00 2001
From: nmdefries <nmdefries@users.noreply.github.com>
Date: Sat, 11 Jan 2025 00:45:20 +0000
Subject: [PATCH 3/4] style: styler (GHA)

---
 vignettes/epi_df.Rmd | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/vignettes/epi_df.Rmd b/vignettes/epi_df.Rmd
index 8644251a5..fac678de6 100644
--- a/vignettes/epi_df.Rmd
+++ b/vignettes/epi_df.Rmd
@@ -287,8 +287,8 @@ pop <- tribble(
   "CA",       ">= 65 yr",  as.Date("2018-07-01"), 551243,
 )
 # Calculate fraction of total population in each age group.
-pop <- pop %>% 
-  group_by(geo_value, time_value) %>% 
+pop <- pop %>%
+  group_by(geo_value, time_value) %>%
   mutate(frac_pop = pop / sum(pop))
 ```
 
@@ -318,9 +318,9 @@ age groups to get the overall rate.
 
 ```{r}
 rate_overall_recalc_edf <-
-  fractional_rate_by_age_group %>% 
+  fractional_rate_by_age_group %>%
   sum_groups_epi_df("frac_rate", group_cols = group_cols) %>%
-  rename(rate_overall_recalc = frac_rate) %>% 
+  rename(rate_overall_recalc = frac_rate) %>%
   # match rounding of original data:
   mutate(rate_overall_recalc = round(rate_overall_recalc, 1))
 rate_overall_recalc_edf
@@ -330,7 +330,7 @@ Let's compare our calculated rate to the overall rate reported by FluSurv-NET.
 
 ```{r}
 rate_overall_recalc_edf <-
-  rate_overall_recalc_edf %>% 
+  rate_overall_recalc_edf %>%
   # compare to published overall rates:
   inner_join(
     flu_data_api %>%

From 6e18b32f22cd2bac4b02fe446dd3b9106b2d4ec3 Mon Sep 17 00:00:00 2001
From: "Logan C. Brooks" <lcbrooks+github@andrew.cmu.edu>
Date: Mon, 13 Jan 2025 15:21:09 -0800
Subject: [PATCH 4/4] docs(epi_df.Rmd): frac_rate -> rate_contrib, comments,
 limit grouped outputs

---
 vignettes/epi_df.Rmd | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/vignettes/epi_df.Rmd b/vignettes/epi_df.Rmd
index fac678de6..3976208e6 100644
--- a/vignettes/epi_df.Rmd
+++ b/vignettes/epi_df.Rmd
@@ -119,7 +119,8 @@ edf %>%
   complete(geo_value, time_value = seq.Date(min(time_value), max(time_value), by = "day")) %>%
   arrange_canonical() %>%
   group_by(geo_value) %>%
-  mutate(cases_7sd = slider::slide_dbl(cases, .f = sd, na.rm = TRUE, .before = 7, .after = 0))
+  mutate(cases_7sd = slider::slide_dbl(cases, .f = sd, na.rm = TRUE, .before = 7, .after = 0)) %>%
+  ungroup()
 ```
 
 Furthermore `epi_slide()` allows for selecting `.ref_time_value`, which the
@@ -183,10 +184,8 @@ cases:
 
 ```{r}
 edf %>%
-  group_by(geo_value) %>%
   epi_slide_mean("cases", .window_size = 7, na.rm = TRUE)
 edf %>%
-  group_by(geo_value) %>%
   epi_slide_sum("cases", .window_size = 7, na.rm = TRUE)
 ```
 
@@ -289,7 +288,8 @@ pop <- tribble(
 # Calculate fraction of total population in each age group.
 pop <- pop %>%
   group_by(geo_value, time_value) %>%
-  mutate(frac_pop = pop / sum(pop))
+  mutate(frac_pop = pop / sum(pop)) %>%
+  ungroup()
 ```
 
 After joining population onto the rate data, we can calculate the
@@ -297,7 +297,6 @@ population-weighted rate for each age group, that is, the portion of each
 `age_group`'s rate that it contributes to the overall rate.
 
 ```{r}
-group_cols <- key_colnames(flu_data, exclude = "age_group")
 fractional_rate_by_age_group <-
   flu_data %>%
   inner_join(
@@ -306,11 +305,11 @@ fractional_rate_by_age_group <-
     # carried forward. Use the estimated population on 2017-07-01 for
     # time_values 2017-07-01 through 2018-06-30, and the estimated population on
     # 2018-07-01 for all subsequent time_values:
-    join_by(geo_value, closest(y$time_value <= x$time_value), age_group),
+    join_by(geo_value, age_group, closest(y$time_value <= x$time_value)),
     suffix = c("", "_for_pop"),
     relationship = "many-to-one", unmatched = "error"
   ) %>%
-  mutate(frac_rate = rate * frac_pop)
+  mutate(rate_contrib = rate * frac_pop)
 ```
 
 We can then use `sum_groups_epi_df` to sum population-weighted rate across all
@@ -319,8 +318,8 @@ age groups to get the overall rate.
 ```{r}
 rate_overall_recalc_edf <-
   fractional_rate_by_age_group %>%
-  sum_groups_epi_df("frac_rate", group_cols = group_cols) %>%
-  rename(rate_overall_recalc = frac_rate) %>%
+  sum_groups_epi_df("rate_contrib", group_cols = c("geo_value")) %>%
+  rename(rate_overall_recalc = rate_contrib) %>%
   # match rounding of original data:
   mutate(rate_overall_recalc = round(rate_overall_recalc, 1))
 rate_overall_recalc_edf
@@ -336,6 +335,7 @@ rate_overall_recalc_edf <-
     flu_data_api %>%
       select(geo_value = location, time_value = epiweek, rate_overall),
     by = c("geo_value", "time_value"),
+    # validate that we have exactly the same set of geo_value x time_value combinations:
     relationship = "one-to-one", unmatched = "error"
   )
 # What's our maximum error vs. the official overall estimates?
@@ -370,11 +370,14 @@ edf_missing %>%
 Now let's fill in the missing data with explicit zeros:
 
 ```{r}
-edf_missing %>%
+edf_missing <- edf_missing %>%
   complete(
     time_value = seq.Date(min(time_value), max(time_value), by = "day"),
     fill = list(cases = 0)
   ) %>%
+  ungroup()
+
+edf_missing %>%
   print(n = 12)
 ```