From c03221e50747da660f15b71cbdac4212e51cc4bf Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 17 Dec 2024 16:11:00 -0800 Subject: [PATCH 1/4] Fix age group rate aggregation example --- vignettes/epi_df.Rmd | 107 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 94 insertions(+), 13 deletions(-) diff --git a/vignettes/epi_df.Rmd b/vignettes/epi_df.Rmd index 15f763389..129856f3e 100644 --- a/vignettes/epi_df.Rmd +++ b/vignettes/epi_df.Rmd @@ -209,12 +209,41 @@ Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/flusurv.html) ```{r} library(epidatr) -flu_data <- pub_flusurv( +flu_data_api <- pub_flusurv( locations = "ca", - epiweeks = epirange(201801, 202001), -) %>% - select(location, epiweek, issue, rate_age_0, rate_age_1, rate_age_2, rate_age_3, rate_age_4) %>% - tidyr::pivot_longer(cols = starts_with("rate_age_"), names_to = "age_group", values_to = "rate") + epiweeks = epirange(201801, 202001) +) +``` + +We're interested in the age-specific rates: +```{r} +flu_data <- flu_data_api %>% + select(location, epiweek, rate_age_0, rate_age_1, rate_age_2, rate_age_3, rate_age_4) %>% + # Turn `rate_age_0`..`rate_age_4` columns into an `age_group` and `rate` + # column (with 5x as many rows): + tidyr::pivot_longer( + cols = starts_with("rate_age_"), names_to = "age_group", values_to = "rate", + # When converting column names to entries in `age_group`, remove this prefix: + names_prefix = "rate_age_", + # And add a better prefix: + names_transform = function(age_group) paste0("age_group_", age_group) + ) %>% + # Improve `age_group` labels a bit more: + mutate( + age_group = case_match( + age_group, + "age_group_0" ~ "0--4 yr", + "age_group_1" ~ "5--17 yr", + "age_group_2" ~ "18--49 yr", + "age_group_3" ~ "50--64 yr", + "age_group_4" ~ ">= 65 yr", + # Make this a factor with appropriate level ordering: + .ptype = factor(levels = c("0--4 yr", "5--17 yr", "18--49 yr", "50--64 yr", ">= 65 yr")) + ) + ) %>% + # The API currently outputs `epiweek` in Date format (the constituent Sunday); + # rename it to remind us that it's not in YYYYww format: + rename(time_value = epiweek) flu_data ``` @@ -222,22 +251,75 @@ We can now convert this data to an `epi_df` object and set the `age_group` column as an additional group key: ```{r} -flu_data <- flu_data %>% as_epi_df(other_keys = "age_group", as_of = as.Date("2024-03-20")) +flu_data <- flu_data %>% as_epi_df(other_keys = "age_group") flu_data ``` Note that the `epi_df` object now has an additional key column `age_group`. This means that there should only be one row for each combination of `geo_value`, -`time_value`, and `age_group` in the dataset (this is enforced at construction +`age_group`, and `time_value` in the dataset (this is enforced at construction time). -Now we can aggregate the data by `age_group`, if we want to compute the total: - +Now we can aggregate the data by `age_group`, if we want to compute the total. +Since we are working with rates, we need to attach some population data in order +to do this aggregation. It's somewhat ambiguous whether FluSurv-NET reporting +uses either [NCHS](https://www.cdc.gov/nchs/nvss/bridged_race.htm) or +[Census](https://www.census.gov/programs-surveys/popest/technical-documentation/research/evaluation-estimates/2020-evaluation-estimates/2010s-county-detail.html) +populations for `time_value`s before 2020 included in reports from 2020 onward, +but in this case, the two sources agreed exactly. FluSurv-NET also directly +reports an overall rate, so we can check our work. ```{r} -group_cols <- key_colnames(exclude = "age_group") -flu_data %>% - sum_groups_epi_df("rate", group_cols = group_cols) +group_cols <- key_colnames(flu_data, exclude = "age_group") +rate_overall_recalc_edf <- + flu_data %>% + inner_join( + # Population estimates for FluSurv-NET-covered part of CA on 2017-07-01 and + # 2018-07-01, extracted and aggregated from "vintage 2020" estimates (actually + # released by Census in June 2021 and by NCHS in September 2021), which is the + # last available reporting found with population estimates for 2017 and 2018: + tribble( + ~geo_value, ~age_group, ~time_value, ~pop, + "CA", "0--4 yr", as.Date("2017-07-01"), 203813, + "CA", "5--17 yr", as.Date("2017-07-01"), 521827, + "CA", "18--49 yr", as.Date("2017-07-01"), 1722399, + "CA", "50--64 yr", as.Date("2017-07-01"), 700090, + "CA", ">= 65 yr", as.Date("2017-07-01"), 534789, + "CA", "0--4 yr", as.Date("2018-07-01"), 201265, + "CA", "5--17 yr", as.Date("2018-07-01"), 520077, + "CA", "18--49 yr", as.Date("2018-07-01"), 1725382, + "CA", "50--64 yr", as.Date("2018-07-01"), 699145, + "CA", ">= 65 yr", as.Date("2018-07-01"), 551243, + ), + # Simple population interpolation/extrapolation scheme: last observation + # carried forward. Use the estimated population on 2017-07-01 for + # time_values 2017-07-01 through 2018-06-30, and the estimated population on + # 2018-07-01 for all subsequent time_values: + join_by(geo_value, closest(y$time_value <= x$time_value), age_group), + suffix = c("", "_for_pop"), + relationship = "many-to-one", unmatched = "error" + ) %>% + select(-time_value_for_pop) %>% + group_by(geo_value, time_value) %>% + mutate(count = rate * pop / 100e3) %>% + ungroup() %>% + sum_groups_epi_df(c("count", "pop"), group_cols = group_cols) %>% + mutate(rate_overall_recalc = count / pop * 100e3) %>% + # match rounding of original data: + mutate(rate_overall_recalc = round(rate_overall_recalc, 1)) %>% + # compare to published overall rates: + inner_join( + flu_data_api %>% + select(geo_value = location, time_value = epiweek, rate_overall), + by = c("geo_value", "time_value"), + relationship = "one-to-one", unmatched = "error" + ) +# What's our maximum error vs. the official overall estimates? +max(abs(rate_overall_recalc_edf$rate_overall - rate_overall_recalc_edf$rate_overall_recalc)) ``` +This small amount of difference is expected, since all the age-specific rates +were rounded to the first decimal place, and population data might have been +interpolated and extrapolated a bit differently in the official source, limiting +our ability to precisely recreate its estimates from an age group breakdown. ## Detecting and filling time gaps with `complete.epi_df` @@ -515,4 +597,3 @@ Engineering. Copyright Johns Hopkins University 2020. API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. - From b569131305741c12bad05397bd05524852814b21 Mon Sep 17 00:00:00 2001 From: nmdefries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 10 Jan 2025 19:40:26 -0500 Subject: [PATCH 2/4] switch to summing pop-weighted rates --- vignettes/epi_df.Rmd | 75 +++++++++++++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 25 deletions(-) diff --git a/vignettes/epi_df.Rmd b/vignettes/epi_df.Rmd index 129856f3e..8644251a5 100644 --- a/vignettes/epi_df.Rmd +++ b/vignettes/epi_df.Rmd @@ -268,28 +268,40 @@ uses either [NCHS](https://www.cdc.gov/nchs/nvss/bridged_race.htm) or populations for `time_value`s before 2020 included in reports from 2020 onward, but in this case, the two sources agreed exactly. FluSurv-NET also directly reports an overall rate, so we can check our work. +```{r} +# Population estimates for FluSurv-NET-covered part of CA on 2017-07-01 and +# 2018-07-01, extracted and aggregated from "vintage 2020" estimates (actually +# released by Census in June 2021 and by NCHS in September 2021), which is the +# last available reporting found with population estimates for 2017 and 2018: +pop <- tribble( + ~geo_value, ~age_group, ~time_value, ~pop, + "CA", "0--4 yr", as.Date("2017-07-01"), 203813, + "CA", "5--17 yr", as.Date("2017-07-01"), 521827, + "CA", "18--49 yr", as.Date("2017-07-01"), 1722399, + "CA", "50--64 yr", as.Date("2017-07-01"), 700090, + "CA", ">= 65 yr", as.Date("2017-07-01"), 534789, + "CA", "0--4 yr", as.Date("2018-07-01"), 201265, + "CA", "5--17 yr", as.Date("2018-07-01"), 520077, + "CA", "18--49 yr", as.Date("2018-07-01"), 1725382, + "CA", "50--64 yr", as.Date("2018-07-01"), 699145, + "CA", ">= 65 yr", as.Date("2018-07-01"), 551243, +) +# Calculate fraction of total population in each age group. +pop <- pop %>% + group_by(geo_value, time_value) %>% + mutate(frac_pop = pop / sum(pop)) +``` + +After joining population onto the rate data, we can calculate the +population-weighted rate for each age group, that is, the portion of each +`age_group`'s rate that it contributes to the overall rate. + ```{r} group_cols <- key_colnames(flu_data, exclude = "age_group") -rate_overall_recalc_edf <- +fractional_rate_by_age_group <- flu_data %>% inner_join( - # Population estimates for FluSurv-NET-covered part of CA on 2017-07-01 and - # 2018-07-01, extracted and aggregated from "vintage 2020" estimates (actually - # released by Census in June 2021 and by NCHS in September 2021), which is the - # last available reporting found with population estimates for 2017 and 2018: - tribble( - ~geo_value, ~age_group, ~time_value, ~pop, - "CA", "0--4 yr", as.Date("2017-07-01"), 203813, - "CA", "5--17 yr", as.Date("2017-07-01"), 521827, - "CA", "18--49 yr", as.Date("2017-07-01"), 1722399, - "CA", "50--64 yr", as.Date("2017-07-01"), 700090, - "CA", ">= 65 yr", as.Date("2017-07-01"), 534789, - "CA", "0--4 yr", as.Date("2018-07-01"), 201265, - "CA", "5--17 yr", as.Date("2018-07-01"), 520077, - "CA", "18--49 yr", as.Date("2018-07-01"), 1725382, - "CA", "50--64 yr", as.Date("2018-07-01"), 699145, - "CA", ">= 65 yr", as.Date("2018-07-01"), 551243, - ), + pop, # Simple population interpolation/extrapolation scheme: last observation # carried forward. Use the estimated population on 2017-07-01 for # time_values 2017-07-01 through 2018-06-30, and the estimated population on @@ -298,14 +310,27 @@ rate_overall_recalc_edf <- suffix = c("", "_for_pop"), relationship = "many-to-one", unmatched = "error" ) %>% - select(-time_value_for_pop) %>% - group_by(geo_value, time_value) %>% - mutate(count = rate * pop / 100e3) %>% - ungroup() %>% - sum_groups_epi_df(c("count", "pop"), group_cols = group_cols) %>% - mutate(rate_overall_recalc = count / pop * 100e3) %>% + mutate(frac_rate = rate * frac_pop) +``` + +We can then use `sum_groups_epi_df` to sum population-weighted rate across all +age groups to get the overall rate. + +```{r} +rate_overall_recalc_edf <- + fractional_rate_by_age_group %>% + sum_groups_epi_df("frac_rate", group_cols = group_cols) %>% + rename(rate_overall_recalc = frac_rate) %>% # match rounding of original data: - mutate(rate_overall_recalc = round(rate_overall_recalc, 1)) %>% + mutate(rate_overall_recalc = round(rate_overall_recalc, 1)) +rate_overall_recalc_edf +``` + +Let's compare our calculated rate to the overall rate reported by FluSurv-NET. + +```{r} +rate_overall_recalc_edf <- + rate_overall_recalc_edf %>% # compare to published overall rates: inner_join( flu_data_api %>% From ee04efa81996a1097a843821a4d8da6249c7d192 Mon Sep 17 00:00:00 2001 From: nmdefries Date: Sat, 11 Jan 2025 00:45:20 +0000 Subject: [PATCH 3/4] style: styler (GHA) --- vignettes/epi_df.Rmd | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vignettes/epi_df.Rmd b/vignettes/epi_df.Rmd index 8644251a5..fac678de6 100644 --- a/vignettes/epi_df.Rmd +++ b/vignettes/epi_df.Rmd @@ -287,8 +287,8 @@ pop <- tribble( "CA", ">= 65 yr", as.Date("2018-07-01"), 551243, ) # Calculate fraction of total population in each age group. -pop <- pop %>% - group_by(geo_value, time_value) %>% +pop <- pop %>% + group_by(geo_value, time_value) %>% mutate(frac_pop = pop / sum(pop)) ``` @@ -318,9 +318,9 @@ age groups to get the overall rate. ```{r} rate_overall_recalc_edf <- - fractional_rate_by_age_group %>% + fractional_rate_by_age_group %>% sum_groups_epi_df("frac_rate", group_cols = group_cols) %>% - rename(rate_overall_recalc = frac_rate) %>% + rename(rate_overall_recalc = frac_rate) %>% # match rounding of original data: mutate(rate_overall_recalc = round(rate_overall_recalc, 1)) rate_overall_recalc_edf @@ -330,7 +330,7 @@ Let's compare our calculated rate to the overall rate reported by FluSurv-NET. ```{r} rate_overall_recalc_edf <- - rate_overall_recalc_edf %>% + rate_overall_recalc_edf %>% # compare to published overall rates: inner_join( flu_data_api %>% From 6e18b32f22cd2bac4b02fe446dd3b9106b2d4ec3 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 13 Jan 2025 15:21:09 -0800 Subject: [PATCH 4/4] docs(epi_df.Rmd): frac_rate -> rate_contrib, comments, limit grouped outputs --- vignettes/epi_df.Rmd | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/vignettes/epi_df.Rmd b/vignettes/epi_df.Rmd index fac678de6..3976208e6 100644 --- a/vignettes/epi_df.Rmd +++ b/vignettes/epi_df.Rmd @@ -119,7 +119,8 @@ edf %>% complete(geo_value, time_value = seq.Date(min(time_value), max(time_value), by = "day")) %>% arrange_canonical() %>% group_by(geo_value) %>% - mutate(cases_7sd = slider::slide_dbl(cases, .f = sd, na.rm = TRUE, .before = 7, .after = 0)) + mutate(cases_7sd = slider::slide_dbl(cases, .f = sd, na.rm = TRUE, .before = 7, .after = 0)) %>% + ungroup() ``` Furthermore `epi_slide()` allows for selecting `.ref_time_value`, which the @@ -183,10 +184,8 @@ cases: ```{r} edf %>% - group_by(geo_value) %>% epi_slide_mean("cases", .window_size = 7, na.rm = TRUE) edf %>% - group_by(geo_value) %>% epi_slide_sum("cases", .window_size = 7, na.rm = TRUE) ``` @@ -289,7 +288,8 @@ pop <- tribble( # Calculate fraction of total population in each age group. pop <- pop %>% group_by(geo_value, time_value) %>% - mutate(frac_pop = pop / sum(pop)) + mutate(frac_pop = pop / sum(pop)) %>% + ungroup() ``` After joining population onto the rate data, we can calculate the @@ -297,7 +297,6 @@ population-weighted rate for each age group, that is, the portion of each `age_group`'s rate that it contributes to the overall rate. ```{r} -group_cols <- key_colnames(flu_data, exclude = "age_group") fractional_rate_by_age_group <- flu_data %>% inner_join( @@ -306,11 +305,11 @@ fractional_rate_by_age_group <- # carried forward. Use the estimated population on 2017-07-01 for # time_values 2017-07-01 through 2018-06-30, and the estimated population on # 2018-07-01 for all subsequent time_values: - join_by(geo_value, closest(y$time_value <= x$time_value), age_group), + join_by(geo_value, age_group, closest(y$time_value <= x$time_value)), suffix = c("", "_for_pop"), relationship = "many-to-one", unmatched = "error" ) %>% - mutate(frac_rate = rate * frac_pop) + mutate(rate_contrib = rate * frac_pop) ``` We can then use `sum_groups_epi_df` to sum population-weighted rate across all @@ -319,8 +318,8 @@ age groups to get the overall rate. ```{r} rate_overall_recalc_edf <- fractional_rate_by_age_group %>% - sum_groups_epi_df("frac_rate", group_cols = group_cols) %>% - rename(rate_overall_recalc = frac_rate) %>% + sum_groups_epi_df("rate_contrib", group_cols = c("geo_value")) %>% + rename(rate_overall_recalc = rate_contrib) %>% # match rounding of original data: mutate(rate_overall_recalc = round(rate_overall_recalc, 1)) rate_overall_recalc_edf @@ -336,6 +335,7 @@ rate_overall_recalc_edf <- flu_data_api %>% select(geo_value = location, time_value = epiweek, rate_overall), by = c("geo_value", "time_value"), + # validate that we have exactly the same set of geo_value x time_value combinations: relationship = "one-to-one", unmatched = "error" ) # What's our maximum error vs. the official overall estimates? @@ -370,11 +370,14 @@ edf_missing %>% Now let's fill in the missing data with explicit zeros: ```{r} -edf_missing %>% +edf_missing <- edf_missing %>% complete( time_value = seq.Date(min(time_value), max(time_value), by = "day"), fill = list(cases = 0) ) %>% + ungroup() + +edf_missing %>% print(n = 12) ```