Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 34 additions & 4 deletions facebook/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ PYTHON:=env/bin/python
QUALTRICS=$(shell $(PYTHON) -m delphi_utils get input_dir)
WEIGHTS=$(shell $(PYTHON) -m delphi_utils get weights_in_dir)
CIDS=$(shell $(PYTHON) -m delphi_utils get weights_out_dir)
CIDS_EXP=$(shell $(PYTHON) -m delphi_utils get experimental_weights_out_dir)
INDIVIDUAL=$(shell $(PYTHON) -m delphi_utils get individual_dir)
INDIVIDUAL_RACEETH=$(shell $(PYTHON) -m delphi_utils get individual_raceeth_dir)
ARCHIVE=$(shell $(PYTHON) -m delphi_utils get archive_dir)
Expand All @@ -25,7 +26,9 @@ SFTP_OPTIONS=$(shell $(PYTHON) -m delphi_utils get sftp_options)
MAX_WEIGHTED=ls -1 $(WEIGHTS) | grep dap | tail -1 | sed 's/_.*//;s/-//g;'

ANTIJOIN:="antijoin.cids.sorted.txt"
ANTIJOIN_EXP:="antijoin.experimental.cids.sorted.txt"
CIDS_DEST:="fb-interchange/cmu_respondent_ids"
CIDS_EXP_DEST:="fb-interchange/cmu_respondent_ww_ids"
INDIVID_DEST:="fb-public-results/"
INDIVID_RACEETH_DEST:="protected-race-ethnicity-data/"
RAW_DEST:="raw"
Expand Down Expand Up @@ -59,7 +62,7 @@ tidy: receiving
mv scratch/*.tgz tidy/

clean:
rm -f $(RECEIVING)/*.csv $(INDIVIDUAL)/*.csv $(INDIVIDUAL_RACEETH)/*.csv $(CIDS)/*.csv
rm -f $(RECEIVING)/*.csv $(INDIVIDUAL)/*.csv $(INDIVIDUAL_RACEETH)/*.csv $(CIDS)/*.csv $(CIDS_EXP)/*.csv

clean-archive:
rm -f $(ARCHIVE)/*.Rds
Expand All @@ -78,6 +81,9 @@ install: install-python install-R
$(CIDS):
[ -f $(CIDS) ] || mkdir -p $(CIDS)

$(CIDS_EXP):
[ -f $(CIDS_EXP) ] || mkdir -p $(CIDS_EXP)

init-qualtrics:
grep '"token": "..*"' params.json

Expand Down Expand Up @@ -133,14 +139,14 @@ dev: delphiFacebook_1.0.tar.gz
lib:
R -e 'roxygen2::roxygenise("delphiFacebook")'

run-R: $(CIDS)
run-R: $(CIDS) $(CIDS_EXP)
rm -rf tmp
time Rscript run.R 2>&1 |tee tmp
grep "run_facebook completed successfully" tmp
grep "scheduled core" tmp ; \
[ "$$?" -eq 1 ]

pipeline: scratch init-qualtrics params.json $(WEIGHTS) run-R post-cids post-individual post-individual-raceeth post-done tidy
pipeline: scratch init-qualtrics params.json $(WEIGHTS) run-R post-cids post-experimental-cids post-individual post-individual-raceeth post-done tidy
grep $(TODAY) params.json
[ -f $(YESTERDAY) ] && rm $(YESTERDAY) || true
touch $@
Expand Down Expand Up @@ -184,6 +190,28 @@ post-cids: $(TODAY) $(CIDS)
echo "SUCCESS: $(DRY_MESSAGE)Posted `echo $${POST} | wc -w` cid files" >> $(MESSAGES)
touch $@

post-experimental-cids: $(TODAY) $(CIDS_EXP)
rm -rf tmp
touch $(ANTIJOIN_EXP)
POST=`find $(CIDS_EXP) -maxdepth 1 -newer $(TODAY) -name "cvid_cids_*.csv"`; \
[ -n "$${POST}" ]; \
LC_ALL=C find $(CIDS_EXP) -maxdepth 1 -daystart -mtime +0 -name "cvid_cids*.csv" -exec sort -u -o ${ANTIJOIN_EXP} {} +; \
BATCH=""; \
for f in $${POST}; do \
LC_ALL=C comm -23 <(LC_ALL=C sort $$f) ${ANTIJOIN_EXP} >tmp; \
diff -q tmp $$f || mv $$f $$f.bak; \
mv tmp $$f; \
ncids=`wc -l $$f | awk '{print $$1}'`; \
if [[ $$ncids == "0" ]]; then \
echo "ERROR: 0 CIDs reported for $$f"; \
exit 73; \
fi; \
BATCH="$${BATCH}put $$f ${CIDS_EXP_DEST}\n"; \
done; \
$(SFTP_POST); \
echo "SUCCESS: $(DRY_MESSAGE)Posted `echo $${POST} | wc -w` experimental cid files" >> $(MESSAGES)
touch $@

post-individual: $(TODAY) $(INDIVIDUAL)
POST=`find $(INDIVIDUAL) -maxdepth 1 -newer $(TODAY) -name "cvid_responses_*.csv"`; \
[ -n "$${POST}" ]; \
Expand All @@ -210,10 +238,12 @@ post-individual-raceeth: $(TODAY) $(INDIVIDUAL_RACEETH)
echo "SUCCESS: $(DRY_MESSAGE)Posted `echo $${POST} | wc -w` race-ethnicity microresponse files" >> $(MESSAGES)
touch $@

post-done: post-cids
post-done: post-cids post-experimental-cids
touch $(YESTERDAY).done
BATCH="put $(YESTERDAY).done $(CIDS_DEST)\n"; \
$(SFTP_POST)
BATCH="put $(YESTERDAY).done $(CIDS_EXP_DEST)\n"; \
$(SFTP_POST)
echo "SUCCESS: $(DRY_MESSAGE)Posted $(YESTERDAY).done" >> $(MESSAGES)

validate-covidcast:
Expand Down
3 changes: 3 additions & 0 deletions facebook/delphiFacebook/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ export(end_of_prev_full_month)
export(end_of_prev_full_week)
export(filter_complete_responses)
export(filter_data_for_aggregation)
export(filter_module_complete_responses)
export(filter_responses)
export(floor_epiweek)
export(get_filenames_in_range)
Expand Down Expand Up @@ -51,6 +52,7 @@ export(update_archive)
export(update_params)
export(verify_aggs)
export(write_cid)
export(write_cid_experimental_wrapper)
export(write_contingency_tables)
export(write_data_api)
export(write_individual)
Expand Down Expand Up @@ -121,4 +123,5 @@ importFrom(stringi,stri_trans_tolower)
importFrom(stringi,stri_trim)
importFrom(tibble,add_column)
importFrom(tibble,tribble)
importFrom(utils,tail)
useDynLib(delphiFacebook, .registration = TRUE)
64 changes: 0 additions & 64 deletions facebook/delphiFacebook/R/contingency_variables.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,70 +4,6 @@
## input data is always from only one wave of the survey -- they do not deal
## with inputs that have multiple waves mingled in one data frame.

#' Gender
#'
#' @param input_data input data frame of raw survey data
#' @param wave integer indicating survey version
#'
#' @return augmented data frame
code_gender <- function(input_data, wave) {
if ("D1" %in% names(input_data)) {
input_data$gender <- case_when(
input_data$D1 == 1 ~ "Male",
input_data$D1 == 2 ~ "Female",
input_data$D1 == 3 ~ "Other",
input_data$D1 == 4 ~ "Other",
input_data$D1 == 5 ~ NA_character_,
TRUE ~ NA_character_
)
} else {
input_data$gender <- NA_character_
}

return(input_data)
}

#' Age-related fields
#'
#' @param input_data input data frame of raw survey data
#' @param wave integer indicating survey version
#'
#' @return augmented data frame
code_age <- function(input_data, wave) {
if ("D2" %in% names(input_data)) {
input_data$agefull <- case_when(
input_data$D2 == 1 ~ "18-24",
input_data$D2 == 2 ~ "25-34",
input_data$D2 == 3 ~ "35-44",
input_data$D2 == 4 ~ "45-54",
input_data$D2 == 5 ~ "55-64",
input_data$D2 == 6 ~ "65-74",
input_data$D2 == 7 ~ "75plus",
TRUE ~ NA_character_
)

# Condensed age categories
input_data$age <- case_when(
input_data$D2 == 1 ~ "18-24",
input_data$D2 == 2 ~ "25-44",
input_data$D2 == 3 ~ "25-44",
input_data$D2 == 4 ~ "45-64",
input_data$D2 == 5 ~ "45-64",
input_data$D2 == 6 ~ "65plus",
input_data$D2 == 7 ~ "65plus",
TRUE ~ NA_character_
)

input_data$age65plus <- input_data$age == "65plus"
} else {
input_data$agefull <- NA_character_
input_data$age <- NA_character_
input_data$age65plus <- NA
}

return(input_data)
}

#' Occupation
#'
#' @param input_data input data frame of raw survey data
Expand Down
51 changes: 49 additions & 2 deletions facebook/delphiFacebook/R/responses.R
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,8 @@ load_response_one <- function(input_filename, params, contingency_run) {
input_data <- code_schooling(input_data, wave)
input_data <- code_beliefs(input_data, wave)
input_data <- code_news_and_info(input_data, wave)
input_data <- code_gender(input_data, wave)
input_data <- code_age(input_data, wave)

if (!is.null(params$produce_individual_raceeth) && params$produce_individual_raceeth) {
input_data <- code_race_ethnicity(input_data, wave)
Expand Down Expand Up @@ -227,8 +229,6 @@ load_response_one <- function(input_filename, params, contingency_run) {
if (contingency_run) {
## Create additional fields for aggregations.
# Demographic grouping variables
input_data <- code_gender(input_data, wave)
input_data <- code_age(input_data, wave)
input_data <- code_race_ethnicity(input_data, wave)
input_data <- code_occupation(input_data, wave)
input_data <- code_education(input_data, wave)
Expand Down Expand Up @@ -546,6 +546,8 @@ module_assignment <- function(input_data, wave) {
input_data$FL_23_DO == "ModuleB" ~ "B",
TRUE ~ NA_character_
)
} else {
input_data$module <- NA_character_
}

return(input_data)
Expand Down Expand Up @@ -752,3 +754,48 @@ filter_complete_responses <- function(data_full, params)

return(data_full)
}

#' Filter responses to those that are "module-complete". Splits by module assignment
#'
#' Inclusion criteria:
#'
#' * answered age consent
#' * CID/token IS NOT missing
#' * distribution source (ie previews) IS NOT irregular
#' * start date IS IN range, pacific time
#' * Date is in [`params$start_date - params$backfill_days`, `end_date`],
#' inclusive.
#' * answered minimum of 2 additional questions, where to "answer" a numeric
#' open-ended question (A2, A2b, B2b, Q40, C10_1_1, C10_2_1, C10_3_1, C10_4_1,
#' D3, D4, D5) means to provide any number (floats okay) and to "answer" a radio
#' button question is to provide a selection.
#' * reached the end of the survey (i.e. sees the "Thank you" message)
#' * answered age and gender questions
#'
#' Most of these criteria are handled by `filter_responses()` and
#' `filter_complete_responses()` above; this function need only handle the last
#' two criteria.
#'
#' @param data_full data frame of responses
#' @param params named list of configuration options from `read_params()`,
#' containing `start_date`, `backfill_days`, and `end_date`
#'
#' @importFrom dplyr filter
#' @importFrom rlang .data
#' @export
filter_module_complete_responses <- function(data_full, params)
{
date_col <- if ("day" %in% names(data_full)) { "day" } else { "Date" }
data_full <- rename(data_full, Date = .data$date) %>%
filter_complete_responses(params) %>%
filter(!is.na(.data$age),
!is.na(.data$gender),
.data$Finished == 1) %>%
select(date_col, .data$token, .data$module)

data_a <- filter(data_full, .data$module == "A")
data_b <- filter(data_full, .data$module == "B")

return(list(a = data_a, b = data_b))
}

10 changes: 10 additions & 0 deletions facebook/delphiFacebook/R/run.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ run_facebook <- function(params)
data_full <- join_weights(data_full, params, weights = "full")$df
msg_df("full data to share with research partners", data_full)

# create module-complete data used to create CID lists separately for each module
data_module_complete <- filter_module_complete_responses(input_data, params)
data_module_complete_a <- data_module_complete[["a"]]
data_module_complete_b <- data_module_complete[["b"]]

## Set default number of cores for mclapply to the total available number,
## because we are greedy and this will typically run on a server.
if (params$parallel) {
Expand All @@ -60,6 +65,11 @@ run_facebook <- function(params)
{
write_cid(data_full, "full", params)
write_cid(data_agg, "part_a", params)

write_cid_experimental_wrapper(data_full, "full", params, "")
write_cid_experimental_wrapper(data_agg, "part_a", params, "")
write_cid_experimental_wrapper(data_module_complete_a, "module_complete", params, "modul_a_")
write_cid_experimental_wrapper(data_module_complete_b, "module_complete", params, "modul_b_")
}
if ( "archive" %in% params$output )
{
Expand Down
65 changes: 65 additions & 0 deletions facebook/delphiFacebook/R/variables.R
Original file line number Diff line number Diff line change
Expand Up @@ -1032,3 +1032,68 @@ code_race_ethnicity <- function(input_data, wave) {

return(input_data)
}

#' Gender
#'
#' @param input_data input data frame of raw survey data
#' @param wave integer indicating survey version
#'
#' @return augmented data frame
code_gender <- function(input_data, wave) {
if ("D1" %in% names(input_data)) {
input_data$gender <- case_when(
input_data$D1 == 1 ~ "Male",
input_data$D1 == 2 ~ "Female",
input_data$D1 == 3 ~ "Other",
input_data$D1 == 4 ~ "Other",
input_data$D1 == 5 ~ NA_character_,
TRUE ~ NA_character_
)
} else {
input_data$gender <- NA_character_
}

return(input_data)
}

#' Age-related fields
#'
#' @param input_data input data frame of raw survey data
#' @param wave integer indicating survey version
#'
#' @return augmented data frame
code_age <- function(input_data, wave) {
if ("D2" %in% names(input_data)) {
input_data$agefull <- case_when(
input_data$D2 == 1 ~ "18-24",
input_data$D2 == 2 ~ "25-34",
input_data$D2 == 3 ~ "35-44",
input_data$D2 == 4 ~ "45-54",
input_data$D2 == 5 ~ "55-64",
input_data$D2 == 6 ~ "65-74",
input_data$D2 == 7 ~ "75plus",
TRUE ~ NA_character_
)

# Condensed age categories
input_data$age <- case_when(
input_data$D2 == 1 ~ "18-24",
input_data$D2 == 2 ~ "25-44",
input_data$D2 == 3 ~ "25-44",
input_data$D2 == 4 ~ "45-64",
input_data$D2 == 5 ~ "45-64",
input_data$D2 == 6 ~ "65plus",
input_data$D2 == 7 ~ "65plus",
TRUE ~ NA_character_
)

input_data$age65plus <- input_data$age == "65plus"
} else {
input_data$agefull <- NA_character_
input_data$age <- NA_character_
input_data$age65plus <- NA
}

return(input_data)
}

Loading