cmu-delphi · krivard · Dec 13, 2021 · Dec 8, 2021 · Dec 9, 2021 · Dec 9, 2021
diff --git a/facebook/Makefile b/facebook/Makefile
@@ -12,6 +12,7 @@ PYTHON:=env/bin/python
 QUALTRICS=$(shell $(PYTHON) -m delphi_utils get input_dir)
 WEIGHTS=$(shell $(PYTHON) -m delphi_utils get weights_in_dir)
 CIDS=$(shell $(PYTHON) -m delphi_utils get weights_out_dir)
+CIDS_EXP=$(shell $(PYTHON) -m delphi_utils get experimental_weights_out_dir)
 INDIVIDUAL=$(shell $(PYTHON) -m delphi_utils get individual_dir)
 INDIVIDUAL_RACEETH=$(shell $(PYTHON) -m delphi_utils get individual_raceeth_dir)
 ARCHIVE=$(shell $(PYTHON) -m delphi_utils get archive_dir)
@@ -25,7 +26,9 @@ SFTP_OPTIONS=$(shell $(PYTHON) -m delphi_utils get sftp_options)
 MAX_WEIGHTED=ls -1 $(WEIGHTS) | grep dap | tail -1 | sed 's/_.*//;s/-//g;'
 
 ANTIJOIN:="antijoin.cids.sorted.txt"
+ANTIJOIN_EXP:="antijoin.experimental.cids.sorted.txt"
 CIDS_DEST:="fb-interchange/cmu_respondent_ids"
+CIDS_EXP_DEST:="fb-interchange/cmu_respondent_ww_ids"
 INDIVID_DEST:="fb-public-results/"
 INDIVID_RACEETH_DEST:="protected-race-ethnicity-data/"
 RAW_DEST:="raw"
@@ -59,7 +62,7 @@ tidy: receiving
 	mv scratch/*.tgz tidy/
 
 clean:
-	rm -f $(RECEIVING)/*.csv $(INDIVIDUAL)/*.csv $(INDIVIDUAL_RACEETH)/*.csv  $(CIDS)/*.csv
+	rm -f $(RECEIVING)/*.csv $(INDIVIDUAL)/*.csv $(INDIVIDUAL_RACEETH)/*.csv $(CIDS)/*.csv $(CIDS_EXP)/*.csv
 
 clean-archive:
 	rm -f $(ARCHIVE)/*.Rds
@@ -78,6 +81,9 @@ install: install-python install-R
 $(CIDS):
 	[ -f $(CIDS) ] || mkdir -p $(CIDS)
 
+$(CIDS_EXP):
+	[ -f $(CIDS_EXP) ] || mkdir -p $(CIDS_EXP)
+
 init-qualtrics:
 	grep '"token": "..*"' params.json
 
@@ -133,14 +139,14 @@ dev: delphiFacebook_1.0.tar.gz
 lib:
 	R -e 'roxygen2::roxygenise("delphiFacebook")'
 
-run-R: $(CIDS)
+run-R: $(CIDS) $(CIDS_EXP)
 	rm -rf tmp
 	time Rscript run.R 2>&1 |tee tmp
 	grep "run_facebook completed successfully" tmp
 	grep "scheduled core" tmp ; \
 	[ "$$?" -eq 1 ]
 
-pipeline: scratch init-qualtrics params.json $(WEIGHTS) run-R post-cids post-individual post-individual-raceeth post-done tidy
+pipeline: scratch init-qualtrics params.json $(WEIGHTS) run-R post-cids post-experimental-cids post-individual post-individual-raceeth post-done tidy
 	grep $(TODAY) params.json
 	[ -f $(YESTERDAY) ] && rm $(YESTERDAY) || true
 	touch $@
@@ -184,6 +190,28 @@ post-cids: $(TODAY) $(CIDS)
 	echo "SUCCESS: $(DRY_MESSAGE)Posted `echo $${POST} | wc -w` cid files" >> $(MESSAGES)
 	touch $@
 
+post-experimental-cids: $(TODAY) $(CIDS_EXP)
+	rm -rf tmp
+	touch $(ANTIJOIN_EXP)
+	POST=`find $(CIDS_EXP) -maxdepth 1 -newer $(TODAY) -name "cvid_cids_*.csv"`; \
+	[ -n "$${POST}" ]; \
+	LC_ALL=C find $(CIDS_EXP) -maxdepth 1 -daystart -mtime +0 -name "cvid_cids*.csv" -exec sort -u -o ${ANTIJOIN_EXP} {} +; \
+	BATCH=""; \
+	for f in $${POST}; do \
+	    LC_ALL=C comm -23 <(LC_ALL=C sort $$f) ${ANTIJOIN_EXP} >tmp; \
+	    diff -q tmp $$f || mv $$f $$f.bak; \
+	    mv tmp $$f; \
+	    ncids=`wc -l $$f | awk '{print $$1}'`; \
+	    if [[ $$ncids == "0" ]]; then \
+		echo "ERROR: 0 CIDs reported for $$f"; \
+		exit 73; \
+	    fi; \
+	    BATCH="$${BATCH}put $$f ${CIDS_EXP_DEST}\n"; \
+	done; \
+	$(SFTP_POST); \
+	echo "SUCCESS: $(DRY_MESSAGE)Posted `echo $${POST} | wc -w` experimental cid files" >> $(MESSAGES)
+	touch $@
+
 post-individual: $(TODAY) $(INDIVIDUAL)
 	POST=`find $(INDIVIDUAL) -maxdepth 1 -newer $(TODAY) -name "cvid_responses_*.csv"`; \
 	[ -n "$${POST}" ]; \
@@ -210,10 +238,12 @@ post-individual-raceeth: $(TODAY) $(INDIVIDUAL_RACEETH)
 	echo "SUCCESS: $(DRY_MESSAGE)Posted `echo $${POST} | wc -w` race-ethnicity microresponse files" >> $(MESSAGES)
 	touch $@
 
-post-done: post-cids
+post-done: post-cids post-experimental-cids
 	touch $(YESTERDAY).done
 	BATCH="put $(YESTERDAY).done $(CIDS_DEST)\n"; \
 	$(SFTP_POST)
+	BATCH="put $(YESTERDAY).done $(CIDS_EXP_DEST)\n"; \
+	$(SFTP_POST)
 	echo "SUCCESS: $(DRY_MESSAGE)Posted $(YESTERDAY).done" >> $(MESSAGES)
 
 validate-covidcast:

diff --git a/facebook/delphiFacebook/NAMESPACE b/facebook/delphiFacebook/NAMESPACE
@@ -12,6 +12,7 @@ export(end_of_prev_full_month)
 export(end_of_prev_full_week)
 export(filter_complete_responses)
 export(filter_data_for_aggregation)
+export(filter_module_complete_responses)
 export(filter_responses)
 export(floor_epiweek)
 export(get_filenames_in_range)
@@ -51,6 +52,7 @@ export(update_archive)
 export(update_params)
 export(verify_aggs)
 export(write_cid)
+export(write_cid_experimental_wrapper)
 export(write_contingency_tables)
 export(write_data_api)
 export(write_individual)
@@ -121,4 +123,5 @@ importFrom(stringi,stri_trans_tolower)
 importFrom(stringi,stri_trim)
 importFrom(tibble,add_column)
 importFrom(tibble,tribble)
+importFrom(utils,tail)
 useDynLib(delphiFacebook, .registration = TRUE)
diff --git a/facebook/delphiFacebook/R/contingency_variables.R b/facebook/delphiFacebook/R/contingency_variables.R
@@ -4,70 +4,6 @@
 ## input data is always from only one wave of the survey -- they do not deal
 ## with inputs that have multiple waves mingled in one data frame.
 
-#' Gender
-#'
-#' @param input_data input data frame of raw survey data
-#' @param wave integer indicating survey version
-#' 
-#' @return augmented data frame
-code_gender <- function(input_data, wave) {
-  if ("D1" %in% names(input_data)) {
-    input_data$gender <- case_when(
-      input_data$D1 == 1 ~ "Male",
-      input_data$D1 == 2 ~ "Female",
-      input_data$D1 == 3 ~ "Other",
-      input_data$D1 == 4 ~ "Other",
-      input_data$D1 == 5 ~ NA_character_,
-      TRUE ~ NA_character_
-    )
-  } else {
-    input_data$gender <- NA_character_
-  }
-
-  return(input_data)
-}
-
-#' Age-related fields
-#'
-#' @param input_data input data frame of raw survey data
-#' @param wave integer indicating survey version
-#' 
-#' @return augmented data frame
-code_age <- function(input_data, wave) {
-  if ("D2" %in% names(input_data)) {
-    input_data$agefull <- case_when(
-      input_data$D2 == 1 ~ "18-24",
-      input_data$D2 == 2 ~ "25-34",
-      input_data$D2 == 3 ~ "35-44",
-      input_data$D2 == 4 ~ "45-54",
-      input_data$D2 == 5 ~ "55-64",
-      input_data$D2 == 6 ~ "65-74",
-      input_data$D2 == 7 ~ "75plus",
-      TRUE ~ NA_character_
-    )
-
-    # Condensed age categories
-    input_data$age <- case_when(
-      input_data$D2 == 1 ~ "18-24",
-      input_data$D2 == 2 ~ "25-44",
-      input_data$D2 == 3 ~ "25-44",
-      input_data$D2 == 4 ~ "45-64",
-      input_data$D2 == 5 ~ "45-64",
-      input_data$D2 == 6 ~ "65plus",
-      input_data$D2 == 7 ~ "65plus",
-      TRUE ~ NA_character_
-    )
-
-    input_data$age65plus <- input_data$age == "65plus"
-  } else {
-    input_data$agefull <- NA_character_
-    input_data$age <- NA_character_
-    input_data$age65plus <- NA
-  }
-
-  return(input_data)
-}
-
 #' Occupation
 #'
 #' @param input_data input data frame of raw survey data

diff --git a/facebook/delphiFacebook/R/responses.R b/facebook/delphiFacebook/R/responses.R
@@ -194,6 +194,8 @@ load_response_one <- function(input_filename, params, contingency_run) {
   input_data <- code_schooling(input_data, wave)
   input_data <- code_beliefs(input_data, wave)
   input_data <- code_news_and_info(input_data, wave)
+  input_data <- code_gender(input_data, wave)
+  input_data <- code_age(input_data, wave)
 
   if (!is.null(params$produce_individual_raceeth) && params$produce_individual_raceeth) {
     input_data <- code_race_ethnicity(input_data, wave)
@@ -227,8 +229,6 @@ load_response_one <- function(input_filename, params, contingency_run) {
   if (contingency_run) {
     ## Create additional fields for aggregations.
     # Demographic grouping variables
-    input_data <- code_gender(input_data, wave)
-    input_data <- code_age(input_data, wave)
     input_data <- code_race_ethnicity(input_data, wave)
     input_data <- code_occupation(input_data, wave)
     input_data <- code_education(input_data, wave)
@@ -546,6 +546,8 @@ module_assignment <- function(input_data, wave) {
       input_data$FL_23_DO == "ModuleB" ~ "B",
       TRUE ~ NA_character_
     )
+  } else {
+    input_data$module <- NA_character_
   }
 
   return(input_data)
@@ -752,3 +754,48 @@ filter_complete_responses <- function(data_full, params)
 
   return(data_full)
 }
+
+#' Filter responses to those that are "module-complete". Splits by module assignment
+#'
+#' Inclusion criteria:
+#'
+#' * answered age consent
+#' * CID/token IS NOT missing
+#' * distribution source (ie previews) IS NOT irregular
+#' * start date IS IN range, pacific time
+#' * Date is in [`params$start_date - params$backfill_days`, `end_date`],
+#' inclusive.
+#' * answered minimum of 2 additional questions, where to "answer" a numeric
+#' open-ended question (A2, A2b, B2b, Q40, C10_1_1, C10_2_1, C10_3_1, C10_4_1,
+#' D3, D4, D5) means to provide any number (floats okay) and to "answer" a radio
+#' button question is to provide a selection.
+#' * reached the end of the survey (i.e. sees the "Thank you" message)
+#' * answered age and gender questions
+#'
+#' Most of these criteria are handled by `filter_responses()` and
+#' `filter_complete_responses()` above; this function need only handle the last
+#' two criteria.
+#'
+#' @param data_full data frame of responses
+#' @param params named list of configuration options from `read_params()`,
+#'   containing `start_date`, `backfill_days`, and `end_date`
+#'
+#' @importFrom dplyr filter
+#' @importFrom rlang .data
+#' @export
+filter_module_complete_responses <- function(data_full, params)
+{
+  date_col <- if ("day" %in% names(data_full)) { "day" } else { "Date" }
+  data_full <- rename(data_full, Date = .data$date) %>% 
+    filter_complete_responses(params) %>% 
+    filter(!is.na(.data$age),
+           !is.na(.data$gender),
+           .data$Finished == 1) %>% 
+    select(date_col, .data$token, .data$module)
+
+  data_a <- filter(data_full, .data$module == "A")
+  data_b <- filter(data_full, .data$module == "B")
+
+  return(list(a = data_a, b = data_b))
+}
+
diff --git a/facebook/delphiFacebook/R/run.R b/facebook/delphiFacebook/R/run.R
@@ -42,6 +42,11 @@ run_facebook <- function(params)
   data_full <- join_weights(data_full, params, weights = "full")$df
   msg_df("full data to share with research partners", data_full)
 
+  # create module-complete data used to create CID lists separately for each module
+  data_module_complete <- filter_module_complete_responses(input_data, params)
+  data_module_complete_a <- data_module_complete[["a"]]
+  data_module_complete_b <- data_module_complete[["b"]]
+
   ## Set default number of cores for mclapply to the total available number,
   ## because we are greedy and this will typically run on a server.
   if (params$parallel) {
@@ -60,6 +65,11 @@ run_facebook <- function(params)
   {
     write_cid(data_full, "full", params)
     write_cid(data_agg, "part_a", params)
+
+    write_cid_experimental_wrapper(data_full, "full", params, "")
+    write_cid_experimental_wrapper(data_agg, "part_a", params, "")
+    write_cid_experimental_wrapper(data_module_complete_a, "module_complete", params, "modul_a_")
+    write_cid_experimental_wrapper(data_module_complete_b, "module_complete", params, "modul_b_")
   }
   if ( "archive" %in% params$output )
   {

diff --git a/facebook/delphiFacebook/R/variables.R b/facebook/delphiFacebook/R/variables.R
@@ -1032,3 +1032,68 @@ code_race_ethnicity <- function(input_data, wave) {
 
   return(input_data)
 }
+
+#' Gender
+#'
+#' @param input_data input data frame of raw survey data
+#' @param wave integer indicating survey version
+#' 
+#' @return augmented data frame
+code_gender <- function(input_data, wave) {
+  if ("D1" %in% names(input_data)) {
+    input_data$gender <- case_when(
+      input_data$D1 == 1 ~ "Male",
+      input_data$D1 == 2 ~ "Female",
+      input_data$D1 == 3 ~ "Other",
+      input_data$D1 == 4 ~ "Other",
+      input_data$D1 == 5 ~ NA_character_,
+      TRUE ~ NA_character_
+    )
+  } else {
+    input_data$gender <- NA_character_
+  }
+
+  return(input_data)
+}
+
+#' Age-related fields
+#'
+#' @param input_data input data frame of raw survey data
+#' @param wave integer indicating survey version
+#' 
+#' @return augmented data frame
+code_age <- function(input_data, wave) {
+  if ("D2" %in% names(input_data)) {
+    input_data$agefull <- case_when(
+      input_data$D2 == 1 ~ "18-24",
+      input_data$D2 == 2 ~ "25-34",
+      input_data$D2 == 3 ~ "35-44",
+      input_data$D2 == 4 ~ "45-54",
+      input_data$D2 == 5 ~ "55-64",
+      input_data$D2 == 6 ~ "65-74",
+      input_data$D2 == 7 ~ "75plus",
+      TRUE ~ NA_character_
+    )
+
+    # Condensed age categories
+    input_data$age <- case_when(
+      input_data$D2 == 1 ~ "18-24",
+      input_data$D2 == 2 ~ "25-44",
+      input_data$D2 == 3 ~ "25-44",
+      input_data$D2 == 4 ~ "45-64",
+      input_data$D2 == 5 ~ "45-64",
+      input_data$D2 == 6 ~ "65plus",
+      input_data$D2 == 7 ~ "65plus",
+      TRUE ~ NA_character_
+    )
+
+    input_data$age65plus <- input_data$age == "65plus"
+  } else {
+    input_data$agefull <- NA_character_
+    input_data$age <- NA_character_
+    input_data$age65plus <- NA
+  }
+
+  return(input_data)
+}
+