Merged
54 commits
a09c72e
V2a "received all doses"
nmdefries Jun 10, 2022
3da6946
Q36 "yes financial threat"
nmdefries Jun 10, 2022
f41aff0
C14a "wear mask + some of the time"
nmdefries Jun 10, 2022
71a985c
H1 people distancing + "some of the time"
nmdefries Jun 10, 2022
320420c
C16/H2 people masked + "some of the time"
nmdefries Jun 10, 2022
081b8c1
H3 friends vaccinated + "some"
nmdefries Jun 10, 2022
a150e3a
C17* historical flu vaccines
nmdefries Jun 10, 2022
facc765
C2 flu vaccine last 12m
nmdefries Jun 10, 2022
f3bd939
mental health + sometimes
nmdefries Jun 10, 2022
29a8baa
C11/C12 historical in contact with COVID-+ person
nmdefries Jun 13, 2022
fe26aa4
all work_outside_home varieties
nmdefries Jun 13, 2022
918fd8e
work with high-risk populations
nmdefries Jun 13, 2022
9a65ac6
reasons why people are not planning to complete the vaccine series
nmdefries Jun 13, 2022
0420df6
reasons why not tested for covid
nmdefries Jun 13, 2022
f09d2dc
B* all followups to unusual symptoms question
nmdefries Jun 14, 2022
0df4d15
D10 work for pay outside home among those working
nmdefries Jun 14, 2022
23758e6
define variables we want to find the mean for
nmdefries Jun 14, 2022
bca27e8
define mean + percentiles calc function
nmdefries Jun 14, 2022
cf2ef61
test mean calculation
nmdefries Jun 15, 2022
7148e84
move new mental health inds to contingency-only func
nmdefries Jun 15, 2022
a6ede6b
move new flu shot inds to contingency-only func
nmdefries Jun 15, 2022
fbb681b
move vaccine incomplete reasons to contingency-only
nmdefries Jun 15, 2022
4b6687e
test some alt masking inds
nmdefries Jun 15, 2022
ad23a3d
overall vaccine hesitancy
nmdefries Jun 16, 2022
1b529a5
unusual symptom followup change meaning
nmdefries Jun 17, 2022
5ef3105
calc more demographics
nmdefries Jun 16, 2022
e29eac7
mean hh member by age from A5_123
nmdefries Jun 21, 2022
6aaec84
mean number of hh members from D3-5
nmdefries Jun 17, 2022
39b3400
unknown gender; child age cuts
nmdefries Jun 16, 2022
aecf0fd
add overall county indicators; remove monthly-only
nmdefries Jun 21, 2022
85564cc
make theme tables
nmdefries Jun 16, 2022
0a981e5
county metadata
nmdefries Jun 17, 2022
dd384ff
swap stringr for stringi
nmdefries Jun 22, 2022
c653918
tests
nmdefries Jun 22, 2022
4785af5
lower overall sample size threshold
nmdefries Jun 23, 2022
0ccfaa8
raise n thres only for county
nmdefries Jun 23, 2022
a51df29
round all sample sizes
nmdefries Jun 23, 2022
1d396fb
tests missing theme tables
nmdefries Jun 23, 2022
fd3f4d2
Merge branch 'ndefries/theme-tables' into ndefries/lower-contingency-…
nmdefries Jun 23, 2022
a98d4bd
filter out gender self-described responses
nmdefries Jun 24, 2022
7885d0b
return empty result if n=1 to avoid svydesign error
nmdefries Jun 24, 2022
c72d452
respect parallel_max_cores
nmdefries Jun 24, 2022
6760ce3
retain field names when adding geo info
nmdefries Jun 24, 2022
ebcb8a4
Update facebook/delphiFacebook/R/responses.R
nmdefries Jun 27, 2022
78c8d4d
combine select and mutate
nmdefries Jun 27, 2022
a558503
High blood pressure code
nmdefries Jul 11, 2022
135efce
name symp_other_unusual
nmdefries Jul 11, 2022
e39c044
make hh_direct_contact out of all respondents
nmdefries Jul 11, 2022
5044a06
ignore idk from E3 school measures
nmdefries Jul 11, 2022
07eceff
ignore B7 in wave 10
nmdefries Jul 13, 2022
f89e7ff
Merge pull request #1647 from cmu-delphi/ndefries/county-tables-metadata
nmdefries Jul 15, 2022
e73ff14
Merge pull request #1646 from cmu-delphi/ndefries/lower-contingency-t…
nmdefries Jul 15, 2022
bf96a5f
Merge pull request #1649 from cmu-delphi/ndefries/gender-freeresponse…
nmdefries Jul 15, 2022
d957a3d
Merge pull request #1645 from cmu-delphi/ndefries/theme-tables
nmdefries Jul 15, 2022
7 changes: 5 additions & 2 deletions facebook/delphiFacebook/DESCRIPTION
@@ -15,18 +15,21 @@ Imports:
rlang,
readr,
dplyr,
+plyr,
tidyr,
+stringi,
jsonlite,
lubridate,
data.table,
tibble,
purrr,
-Rcpp
+Rcpp,
+survey
Suggests:
knitr (>= 1.15),
rmarkdown (>= 1.4),
testthat (>= 1.0.1),
covr (>= 2.2.2)
LinkingTo: Rcpp
-RoxygenNote: 7.1.1
+RoxygenNote: 7.2.0
Encoding: UTF-8
8 changes: 8 additions & 0 deletions facebook/delphiFacebook/NAMESPACE
@@ -69,6 +69,7 @@ importFrom(dplyr,bind_rows)
importFrom(dplyr,case_when)
importFrom(dplyr,coalesce)
importFrom(dplyr,desc)
+importFrom(dplyr,distinct)
importFrom(dplyr,everything)
importFrom(dplyr,filter)
importFrom(dplyr,full_join)
@@ -98,6 +99,7 @@ importFrom(lubridate,ymd)
importFrom(lubridate,ymd_hms)
importFrom(parallel,detectCores)
importFrom(parallel,mclapply)
+importFrom(plyr,round_any)
importFrom(purrr,reduce)
importFrom(readr,col_character)
importFrom(readr,col_integer)
@@ -115,13 +117,19 @@ importFrom(stats,setNames)
importFrom(stats,weighted.mean)
importFrom(stringi,stri_extract)
importFrom(stringi,stri_extract_first)
+importFrom(stringi,stri_pad)
importFrom(stringi,stri_replace)
importFrom(stringi,stri_replace_all)
importFrom(stringi,stri_split)
importFrom(stringi,stri_sub)
importFrom(stringi,stri_trans_tolower)
importFrom(stringi,stri_trim)
+importFrom(survey,oldsvyquantile)
+importFrom(survey,svydesign)
+importFrom(survey,svymean)
+importFrom(survey,svyvar)
importFrom(tibble,add_column)
importFrom(tibble,tribble)
+importFrom(tidyr,drop_na)
importFrom(utils,tail)
useDynLib(delphiFacebook, .registration = TRUE)
4 changes: 2 additions & 2 deletions facebook/delphiFacebook/R/binary.R
@@ -59,8 +59,8 @@ get_binary_indicators <- function() {

# work outside home
# pre-wave 4
-"wip_smoothed_work_outside_home_5d", "weight_unif", "c_work_outside_5d", 6, compute_binary_response, jeffreys_binary,
-"wip_smoothed_wwork_outside_home_5d", "weight", "c_work_outside_5d", 6, compute_binary_response, jeffreys_binary,
+"smoothed_work_outside_home_5d", "weight_unif", "c_work_outside_5d", 6, compute_binary_response, jeffreys_binary,
+"smoothed_wwork_outside_home_5d", "weight", "c_work_outside_5d", 6, compute_binary_response, jeffreys_binary,
# wave 4+, pre-wave 10
"smoothed_work_outside_home_1d", "weight_unif", "a_work_outside_home_1d", 6, compute_binary_response, jeffreys_binary,
"smoothed_wwork_outside_home_1d", "weight", "a_work_outside_home_1d", 6, compute_binary_response, jeffreys_binary,
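The indicator rows above pair `compute_binary_response` with a `jeffreys_binary` adjustment. As a rough illustration only (the package's actual implementation is not part of this diff), a Jeffreys adjustment for a binary proportion shrinks the estimate using a Beta(1/2, 1/2) prior; `jeffreys_proportion` below is a hypothetical name:

```r
# Minimal sketch of a Jeffreys-adjusted binary proportion, assuming the
# standard Beta(1/2, 1/2) prior. Function name and return shape are
# illustrative; the real compute_binary_response / jeffreys_binary code
# is not shown in this diff.
jeffreys_proportion <- function(successes, n) {
  p_hat <- (successes + 0.5) / (n + 1)  # posterior mean under the Jeffreys prior
  se <- sqrt(p_hat * (1 - p_hat) / n)   # standard error of the adjusted estimate
  list(val = 100 * p_hat, se = 100 * se, sample_size = n)
}

jeffreys_proportion(0, 10)  # val > 0 even when no respondent answered "yes"
```

The adjustment keeps estimates and standard errors well-defined for groups where all or none of the respondents answer "yes".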
47 changes: 40 additions & 7 deletions facebook/delphiFacebook/R/contingency_aggregate.R
@@ -26,6 +26,7 @@
#' @import data.table
#' @importFrom dplyr full_join %>% select all_of
#' @importFrom purrr reduce
#' @importFrom tidyr drop_na
#'
#' @export
produce_aggregates <- function(df, aggregations, cw_list, params) {
@@ -65,6 +66,12 @@ produce_aggregates <- function(df, aggregations, cw_list, params) {
geo_level <- agg_groups$geo_level[group_ind]
geo_crosswalk <- cw_list[[geo_level]]

if (geo_level == "county") {
# Raise sample size threshold to 100.
old_thresh <- params$num_filter
params$num_filter <- 100L
}

# Subset aggregations to keep only those grouping by the current agg_group
# and with the current geo_level. `setequal` ignores differences in
# ordering and only looks at unique elements.
@@ -78,24 +85,39 @@
## "effective_sample_size", "represented"), add here.
# If these names change (e.g. `sample_size` to `n`), update
# `contingency-combine.R`.
keep_vars <- c("val", "se", "sample_size", "represented")

keep_vars <- c("val", "se", "sd", "p25", "p50", "p75", "sample_size", "represented")
for (agg_id in names(dfs_out)) {
if (nrow(dfs_out[[agg_id]]) == 0) {
dfs_out[[agg_id]] <- NULL
next
}
agg_metric <- aggregations$name[aggregations$id == agg_id]
map_old_new_names <- keep_vars
names(map_old_new_names) <- paste(keep_vars, agg_metric, sep="_")


tmp_keep_vars <- intersect(keep_vars, names(dfs_out[[agg_id]]))
names(tmp_keep_vars) <- paste(tmp_keep_vars, agg_metric, sep="_")
dfs_out[[agg_id]] <- rename(
dfs_out[[agg_id]][, c(agg_group, keep_vars)], all_of(map_old_new_names))
dfs_out[[agg_id]][, c(agg_group, tmp_keep_vars)], all_of(tmp_keep_vars))
}

if ( length(dfs_out) != 0 ) {
df_out <- dfs_out %>% reduce(full_join, by=agg_group, suff=c("", ""))
write_contingency_tables(df_out, params, geo_level, agg_group)

for (theme in names(THEME_GROUPS)) {
theme_out <- select(df_out, agg_group, contains(THEME_GROUPS[[theme]]))
# Drop any rows that are completely `NA`. Grouping variables are always
# defined, so need to ignore those.
theme_out <- drop_na(theme_out, !!setdiff(names(theme_out), agg_group))

if ( nrow(theme_out) != 0 && ncol(theme_out) != 0 ) {
write_contingency_tables(theme_out, params, geo_level, agg_group, theme)
}
}
}

if (geo_level == "county") {
# Restore old sample size threshold
params$num_filter <- old_thresh
}
}
}
@@ -296,10 +318,14 @@ summarize_aggs <- function(df, crosswalk_data, aggregations, geo_level, params)
rowSums(is.na(dfs_out[[aggregation]][, c("val", "sample_size")])) == 0,
]

# Censor rows with low sample size
dfs_out[[aggregation]] <- apply_privacy_censoring(dfs_out[[aggregation]], params)

## Apply the post-function
# Apply the post-function
dfs_out[[aggregation]] <- post_fn(dfs_out[[aggregation]])

# Round sample sizes
dfs_out[[aggregation]] <- round_n(dfs_out[[aggregation]], params)
}

return(dfs_out)
@@ -366,6 +392,13 @@ summarize_aggregations_group <- function(group_df, aggregations, target_group, g
dfs_out[[aggregation]]$sample_size <- sample_size
dfs_out[[aggregation]]$effective_sample_size <- new_row$effective_sample_size
dfs_out[[aggregation]]$represented <- new_row$represented

if (all(c("sd", "p25", "p50", "p75") %in% names(new_row))) {
dfs_out[[aggregation]]$sd <- new_row$sd
dfs_out[[aggregation]]$p25 <- new_row$p25
dfs_out[[aggregation]]$p50 <- new_row$p50
dfs_out[[aggregation]]$p75 <- new_row$p75
}
}
}

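The theme-table block in this diff selects each theme's columns and then drops rows whose value columns are `NA`, keeping the always-defined grouping columns out of the check. A small standalone sketch with made-up column names, using `all_of()` as the tidyselect equivalent of the `!!` splice in the diff (note that `drop_na` removes a row if *any* of the selected columns is `NA`):

```r
library(tibble)
library(tidyr)

# Hypothetical aggregate output: one grouping column plus two value columns.
agg_group <- "geo_id"
df_out <- tibble(
  geo_id     = c("42003", "42005", "42007"),
  val_masked = c(0.4, NA, 0.6),
  sd_masked  = c(0.1, NA, 0.2)
)

# Drop rows that are NA in the non-grouping columns.
theme_out <- drop_na(df_out, all_of(setdiff(names(df_out), agg_group)))
theme_out  # the all-NA "42005" row is gone
```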
50 changes: 50 additions & 0 deletions facebook/delphiFacebook/R/contingency_calculate.R
@@ -104,3 +104,53 @@ post_convert_count_to_pct <- function(df) {
return(mutate(df,
val = 100 * .data$val / sum(.data$val, na.rm=TRUE)))
}

#' Return numeric response estimates. Val is the mean of a numeric vector.
#'
#' This function takes vectors as input and computes the response values
#' (a point estimate named "val" and 25, 50, and 75th percentiles).
#'
#' @param response a vector of multiple choice responses
#' @param weight a vector of sample weights for inverse probability weighting;
#' invariant up to a scaling factor
#' @param sample_size The sample size to use, which may be a non-integer (as
#' responses from ZIPs that span geographical boundaries are weighted
#' proportionately, and survey weights may also be applied)
#' @param total_represented Number of people represented in sample, which may
#' be a non-integer
#'
#' @return a list of named mean and other descriptive statistics
#'
#' @importFrom survey svydesign svymean svyvar oldsvyquantile
compute_numeric_mean <- function(response, weight, sample_size, total_represented) {
assert(length(response) == length(weight))

if (length(response) == 1) {
# svydesign complains if given only one response, so return an empty df
# instead. This group will be dropped anyway.
return(list(
val = NA_real_,
se = NA_real_,
sd = NA_real_,
p25 = NA_real_,
p50 = NA_real_,
p75 = NA_real_,
sample_size = NA_real_,
effective_sample_size = NA_real_,
represented = NA_real_
))
}

design <- svydesign(id = ~1, weight = ~weight, data = data.frame(response, weight))
return(list(
val = as.data.frame(svymean(~response, na.rm = TRUE, design = design))[,"mean"],
se = NA_real_,
sd = as.data.frame(sqrt(svyvar(~response, na.rm = TRUE, design = design)))[,"variance"],
p25 = as.data.frame(oldsvyquantile(~response, na.rm = TRUE, design = design, quantiles = 0.25))[,"0.25"],
Contributor:

Hmm, why are we using oldsvyquantile instead of svyquantile? I don't know the details of what changed.

Contributor:

Incidentally, you might like srvyr, which provides a tidy interface to the survey package.

Contributor Author:

svyquantile doesn't work for older versions of R, so Esther switched her code to use the old version of the function. Context. Looks like the old one is slower and less flexible.

The approach I implemented here is just a simplified version of Esther's code, since we already have sample size filters elsewhere.

p50 = as.data.frame(oldsvyquantile(~response, na.rm = TRUE, design = design, quantiles = 0.5))[,"0.5"],
p75 = as.data.frame(oldsvyquantile(~response, na.rm = TRUE, design = design, quantiles = 0.75))[,"0.75"],
sample_size = sample_size,
effective_sample_size = sample_size,
represented = total_represented
))
}
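A toy end-to-end example of the survey-package calls that compute_numeric_mean relies on, with made-up responses and weights (svydesign's formal argument is `weights`; the partially matched `weight =` in the diff resolves to the same parameter):

```r
library(survey)

# Made-up numeric responses and inverse-probability weights.
response <- c(1, 2, 2, 3, 4, 5, 5, 6)
weight   <- c(1, 1, 2, 1, 1, 2, 1, 1)

# Unit clusters (id = ~1) with sampling weights.
design <- svydesign(id = ~1, weights = ~weight,
                    data = data.frame(response, weight))

# Weighted mean, extracted the same way compute_numeric_mean does.
val <- as.data.frame(svymean(~response, design = design, na.rm = TRUE))[, "mean"]
val  # sum(weight * response) / sum(weight) = 3.5

# Weighted quantile via the backward-compatible oldsvyquantile.
p50 <- as.data.frame(
  oldsvyquantile(~response, design = design, quantiles = 0.5, na.rm = TRUE)
)[, "0.5"]
```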