cmu-delphi · brookslogan · Jan 24, 2025 · Dec 18, 2024 · Jan 11, 2025 · Jan 11, 2025
diff --git a/vignettes/epi_df.Rmd b/vignettes/epi_df.Rmd
@@ -119,7 +119,8 @@ edf %>%
   complete(geo_value, time_value = seq.Date(min(time_value), max(time_value), by = "day")) %>%
   arrange_canonical() %>%
   group_by(geo_value) %>%
-  mutate(cases_7sd = slider::slide_dbl(cases, .f = sd, na.rm = TRUE, .before = 7, .after = 0))
+  mutate(cases_7sd = slider::slide_dbl(cases, .f = sd, na.rm = TRUE, .before = 7, .after = 0)) %>%
+  ungroup()
 ```
 
 Furthermore `epi_slide()` allows for selecting `.ref_time_value`, which the
@@ -183,10 +184,8 @@ cases:
 
 ```{r}
 edf %>%
-  group_by(geo_value) %>%
   epi_slide_mean("cases", .window_size = 7, na.rm = TRUE)
 edf %>%
-  group_by(geo_value) %>%
   epi_slide_sum("cases", .window_size = 7, na.rm = TRUE)
 ```
 
@@ -209,36 +208,144 @@ Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/flusurv.html)
 
 ```{r}
 library(epidatr)
-flu_data <- pub_flusurv(
+flu_data_api <- pub_flusurv(
   locations = "ca",
-  epiweeks = epirange(201801, 202001),
-) %>%
-  select(location, epiweek, issue, rate_age_0, rate_age_1, rate_age_2, rate_age_3, rate_age_4) %>%
-  tidyr::pivot_longer(cols = starts_with("rate_age_"), names_to = "age_group", values_to = "rate")
+  epiweeks = epirange(201801, 202001)
+)
+```
+
+We're interested in the age-specific rates:
+```{r}
+flu_data <- flu_data_api %>%
+  select(location, epiweek, rate_age_0, rate_age_1, rate_age_2, rate_age_3, rate_age_4) %>%
+  # Turn `rate_age_0`..`rate_age_4` columns into an `age_group` and `rate`
+  # column (with 5x as many rows):
+  tidyr::pivot_longer(
+    cols = starts_with("rate_age_"), names_to = "age_group", values_to = "rate",
+    # When converting column names to entries in `age_group`, remove this prefix:
+    names_prefix = "rate_age_",
+    # And add a better prefix:
+    names_transform = function(age_group) paste0("age_group_", age_group)
+  ) %>%
+  # Improve `age_group` labels a bit more:
+  mutate(
+    age_group = case_match(
+      age_group,
+      "age_group_0" ~ "0--4 yr",
+      "age_group_1" ~ "5--17 yr",
+      "age_group_2" ~ "18--49 yr",
+      "age_group_3" ~ "50--64 yr",
+      "age_group_4" ~ ">= 65 yr",
+      # Make this a factor with appropriate level ordering:
+      .ptype = factor(levels = c("0--4 yr", "5--17 yr", "18--49 yr", "50--64 yr", ">= 65 yr"))
+    )
+  ) %>%
+  # The API currently outputs `epiweek` in Date format (the constituent Sunday);
+  # rename it to remind us that it's not in YYYYww format:
+  rename(time_value = epiweek)
 flu_data
 ```
 
 We can now convert this data to an `epi_df` object and set the `age_group`
 column as an additional group key:
 
 ```{r}
-flu_data <- flu_data %>% as_epi_df(other_keys = "age_group", as_of = as.Date("2024-03-20"))
+flu_data <- flu_data %>% as_epi_df(other_keys = "age_group")
 flu_data
 ```
 
 Note that the `epi_df` object now has an additional key column `age_group`. This
 means that there should only be one row for each combination of `geo_value`,
-`time_value`, and `age_group` in the dataset (this is enforced at construction
+`age_group`, and `time_value` in the dataset (this is enforced at construction
 time).
 
-Now we can aggregate the data by `age_group`, if we want to compute the total:
+Now we can aggregate the data by `age_group`, if we want to compute the total.
+Since we are working with rates, we need to attach some population data in order
+to do this aggregation. It's somewhat ambiguous whether FluSurv-NET reporting
+uses either [NCHS](https://www.cdc.gov/nchs/nvss/bridged_race.htm) or
+[Census](https://www.census.gov/programs-surveys/popest/technical-documentation/research/evaluation-estimates/2020-evaluation-estimates/2010s-county-detail.html)
+populations for `time_value`s before 2020 included in reports from 2020 onward,
+but in this case, the two sources agreed exactly. FluSurv-NET also directly
+reports an overall rate, so we can check our work.
+```{r}
+# Population estimates for FluSurv-NET-covered part of CA on 2017-07-01 and
+# 2018-07-01, extracted and aggregated from "vintage 2020" estimates (actually
+# released by Census in June 2021 and by NCHS in September 2021), which is the
+# last available reporting found with population estimates for 2017 and 2018:
+pop <- tribble(
+  ~geo_value, ~age_group,  ~time_value,           ~pop,
+  "CA",       "0--4 yr",   as.Date("2017-07-01"), 203813,
+  "CA",       "5--17 yr",  as.Date("2017-07-01"), 521827,
+  "CA",       "18--49 yr", as.Date("2017-07-01"), 1722399,
+  "CA",       "50--64 yr", as.Date("2017-07-01"), 700090,
+  "CA",       ">= 65 yr",  as.Date("2017-07-01"), 534789,
+  "CA",       "0--4 yr",   as.Date("2018-07-01"), 201265,
+  "CA",       "5--17 yr",  as.Date("2018-07-01"), 520077,
+  "CA",       "18--49 yr", as.Date("2018-07-01"), 1725382,
+  "CA",       "50--64 yr", as.Date("2018-07-01"), 699145,
+  "CA",       ">= 65 yr",  as.Date("2018-07-01"), 551243,
+)
+# Calculate fraction of total population in each age group.
+pop <- pop %>%
+  group_by(geo_value, time_value) %>%
+  mutate(frac_pop = pop / sum(pop)) %>%
+  ungroup()
+```
+
+After joining population onto the rate data, we can calculate the
+population-weighted rate for each age group, that is, the portion of each
+`age_group`'s rate that it contributes to the overall rate.
 
 ```{r}
-group_cols <- key_colnames(exclude = "age_group")
-flu_data %>%
-  sum_groups_epi_df("rate", group_cols = group_cols)
+fractional_rate_by_age_group <-
+  flu_data %>%
+  inner_join(
+    pop,
+    # Simple population interpolation/extrapolation scheme: last observation
+    # carried forward. Use the estimated population on 2017-07-01 for
+    # time_values 2017-07-01 through 2018-06-30, and the estimated population on
+    # 2018-07-01 for all subsequent time_values:
+    join_by(geo_value, age_group, closest(y$time_value <= x$time_value)),
+    suffix = c("", "_for_pop"),
+    relationship = "many-to-one", unmatched = "error"
+  ) %>%
+  mutate(rate_contrib = rate * frac_pop)
 ```
 
+We can then use `sum_groups_epi_df` to sum population-weighted rate across all
+age groups to get the overall rate.
+
+```{r}
+rate_overall_recalc_edf <-
+  fractional_rate_by_age_group %>%
+  sum_groups_epi_df("rate_contrib", group_cols = c("geo_value")) %>%
+  rename(rate_overall_recalc = rate_contrib) %>%
+  # match rounding of original data:
+  mutate(rate_overall_recalc = round(rate_overall_recalc, 1))
+rate_overall_recalc_edf
+```
+
+Let's compare our calculated rate to the overall rate reported by FluSurv-NET.
+
+```{r}
+rate_overall_recalc_edf <-
+  rate_overall_recalc_edf %>%
+  # compare to published overall rates:
+  inner_join(
+    flu_data_api %>%
+      select(geo_value = location, time_value = epiweek, rate_overall),
+    by = c("geo_value", "time_value"),
+    # validate that we have exactly the same set of geo_value x time_value combinations:
+    relationship = "one-to-one", unmatched = "error"
+  )
+# What's our maximum error vs. the official overall estimates?
+max(abs(rate_overall_recalc_edf$rate_overall - rate_overall_recalc_edf$rate_overall_recalc))
+```
+This small amount of difference is expected, since all the age-specific rates
+were rounded to the first decimal place, and population data might have been
+interpolated and extrapolated a bit differently in the official source, limiting
+our ability to precisely recreate its estimates from an age group breakdown.
+
 ## Detecting and filling time gaps with `complete.epi_df`
 
 Sometimes you may have missing data in your time series. This can be due to
@@ -263,11 +370,14 @@ edf_missing %>%
 Now let's fill in the missing data with explicit zeros:
 
 ```{r}
-edf_missing %>%
+edf_missing <- edf_missing %>%
   complete(
     time_value = seq.Date(min(time_value), max(time_value), by = "day"),
     fill = list(cases = 0)
   ) %>%
+  ungroup()
+
+edf_missing %>%
   print(n = 12)
 ```
 
@@ -515,4 +625,3 @@ Engineering. Copyright Johns Hopkins University 2020.
 API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
 These signals are taken directly from the JHU CSSE [COVID-19 GitHub
 repository](https://github.com/CSSEGISandData/COVID-19) without changes.
-