diff --git a/facebook/qsf-tools/combine_translations_eu.R b/facebook/qsf-tools/combine_translations_eu.R
new file mode 100644
index 000000000..8ce09a353
--- /dev/null
+++ b/facebook/qsf-tools/combine_translations_eu.R
@@ -0,0 +1,115 @@
+#!/usr/bin/env Rscript
+
+## Combine a set of EU and non-EU translation files (UMD only), adding in a
+## column indicating whether a given field was included in just the EU version,
+## just the non-EU version, or in both.
+##
+## Usage:
+##
+## Rscript combine_translations_eu.R path/to/eu/translations/dir path/to/noneu/translations/dir path/to/combined/translations/dir
+
+suppressPackageStartupMessages({
+  library(tidyverse)
+  source("qsf-utils.R")
+})
+
+
+#' Stack one wave's EU and non-EU translations and label each row's origin
+#'
+#' Rows are labelled by `PhraseID` prefix: `intro1_eu`/`intro2_eu` -> "EU",
+#' `intro1_noneu`/`intro2_noneu` -> "nonEU", anything else -> "Both".
+#'
+#' @param eu_translation data frame of EU translations with a `PhraseID` column
+#' @param noneu_translation data frame of non-EU translations with a
+#'   `PhraseID` column
+#'
+#' @return the row-bound data frame with an added `eu_noneu` column
+combine_translation_pair <- function(eu_translation,
+                                     noneu_translation) {
+  translation <- bind_rows(eu_translation, noneu_translation) %>%
+    mutate(eu_noneu = case_when(
+      startsWith(PhraseID, "intro1_eu") ~ "EU",
+      startsWith(PhraseID, "intro2_eu") ~ "EU",
+      startsWith(PhraseID, "intro1_noneu") ~ "nonEU",
+      startsWith(PhraseID, "intro2_noneu") ~ "nonEU",
+      TRUE ~ "Both"
+    ))
+  return(translation)
+}
+
+#' Combine EU and non-EU translation CSVs wave by wave
+#'
+#' From the EU files only rows whose `PhraseID` starts with "intro" are kept;
+#' from the non-EU files everything except the `A2_3_Answer`, `A2_2_Answer`,
+#' and `NA_` rows is kept. One combined CSV is written per wave.
+#'
+#' @param path_to_eu_translations directory of EU CSVs; path must contain "_eu_"
+#' @param path_to_noneu_translations directory of non-EU CSVs; path must
+#'   contain "_noneu_"
+#' @param path_to_combined output directory, created if missing
+combine_translations <- function(path_to_eu_translations,
+                                 path_to_noneu_translations,
+                                 path_to_combined) {
+  eu_name_pattern <- "_eu_"
+  if (!grepl(eu_name_pattern, path_to_eu_translations)) {
+    stop(path_to_eu_translations, " does not specify that it is for the EU")
+  }
+  noneu_name_pattern <- "_noneu_"
+  if (!grepl(noneu_name_pattern, path_to_noneu_translations)) {
+    stop(path_to_noneu_translations, " does not specify that it is for the non-EU")
+  }
+
+  eu_files <- list.files(path_to_eu_translations, pattern = "\\.csv$", full.names = TRUE)
+  eu_translations <- list()
+  for (filename in eu_files) {
+    eu_translations[[as.character(get_wave_from_csv(filename))]] <- read_csv(filename, show_col_types = FALSE) %>%
+      filter(startsWith(PhraseID, "intro"))
+  }
+
+  noneu_files <- list.files(path_to_noneu_translations, pattern = "\\.csv$", full.names = TRUE)
+  noneu_translations <- list()
+  for (filename in noneu_files) {
+    noneu_translations[[as.character(get_wave_from_csv(filename))]] <- read_csv(filename, show_col_types = FALSE) %>%
+      # Drop response options for the country + region question, they take up way too much space.
+      filter(
+        !startsWith(PhraseID, "A2_3_Answer"),
+        !startsWith(PhraseID, "A2_2_Answer"),
+        !startsWith(PhraseID, "NA_")
+      )
+  }
+
+  if (!identical(sort(names(eu_translations)), sort(names(noneu_translations)))) {
+    stop("not all waves are available for both EU and non-EU")
+  }
+
+  dir.create(path_to_combined, showWarnings = FALSE, recursive = TRUE)
+  for (wave in names(eu_translations)) {
+    combined <- combine_translation_pair(
+      eu_translations[[wave]],
+      noneu_translations[[wave]]
+    )
+
+    write_excel_csv(
+      combined,
+      file.path(
+        path_to_combined,
+        sprintf("umd_ctis_combined_v%02g_translations.csv", as.numeric(wave))
+      ),
+      quote = "needed")
+  }
+}
+
+
+args <- commandArgs(TRUE)
+
+if (length(args) != 3) {
+  stop("Usage: Rscript combine_translations_eu.R path/to/eu/translations/dir path/to/noneu/translations/dir path/to/combined/translations/dir")
+}
+
+path_to_eu_translations <- args[1]
+path_to_noneu_translations <- args[2]
+path_to_combined <- args[3]
+
+invisible(combine_translations(path_to_eu_translations, path_to_noneu_translations, path_to_combined))
+
+
diff --git a/facebook/qsf-tools/generate-codebook.R b/facebook/qsf-tools/generate-codebook.R
index 26f81948e..17c5111e4 100644
--- a/facebook/qsf-tools/generate-codebook.R
+++ b/facebook/qsf-tools/generate-codebook.R
@@ -274,6 +274,7 @@ process_qsf <- function(path_to_qsf,
 
   # format all qsf content lists into a single tibble
   qdf <- tibble(variable = item_names,
+                qid = qids,
                 question = questions,
                 question_type = qtype,
                 response_options = choices,
@@ -360,6 +361,7 @@ process_qsf <- function(path_to_qsf,
       mutate(new = list(
         tibble(matrix_base_name = variable,
                variable = unlist(matrix_subquestion_field_names),
+               qid = qid,
                question = question,
                matrix_subquestion = unlist(matrix_subquestions),
                question_type = question_type,
@@ -381,6 +383,7 @@ process_qsf <- function(path_to_qsf,
       mutate(new = list(
         tibble(matrix_base_name = variable,
                variable = unlist(matrix_subquestion_field_names),
+               qid = qid,
                question = question,
                matrix_subquestion = unlist(matrix_subquestions),
                question_type = question_type,
@@ -426,6 +429,7 @@ process_qsf <- function(path_to_qsf,
     ) %>%
     select(wave,
            variable,
+           qid,
            matrix_base_name,
            replaces,
            description,
diff --git a/facebook/qsf-tools/qsf-utils.R b/facebook/qsf-tools/qsf-utils.R
index 2891964c1..4ec7280f3 100644
--- a/facebook/qsf-tools/qsf-utils.R
+++ b/facebook/qsf-tools/qsf-utils.R
@@ -71,6 +71,32 @@ get_wave <- function(path_to_qsf) {
   return(wave)
 }
 
+#' Get wave number from CSV filename
+#'
+#' Wave number as provided in the CSV name should be an integer or a float with
+#' one decimal place.
+#'
+#' @param path_to_file path to a CSV file whose name contains the wave number
+#'
+#' @return (mostly) integer wave number
+get_wave_from_csv <- function(path_to_file) {
+  name_pattern <- "(.*[Ww]ave_?)([0-9]+([.][0-9])?)(.*csv.*)"
+  if (!grepl(name_pattern, path_to_file)) {
+    stop(
+      "The CSV filename must include the string 'csv', and the wave number in ",
+      "the format 'Wave_XX', 'WaveXX', 'wave_XX', or 'waveXX' where 'XX' is an ",
+      "integer or float. The wave specification can occur anywhere in the ",
+      "filename but must precede the string 'csv'."
+    )
+  }
+
+  wave <- as.numeric(
+    sub(name_pattern, "\\2", path_to_file)
+  )
+
+  return(wave)
+}
+
 #' Create mapping of QIDs to module name
 #'
 #' @param qsf contents of QSF file in JSON format
diff --git a/facebook/qsf-tools/replace_translation_qids.R b/facebook/qsf-tools/replace_translation_qids.R
new file mode 100644
index 000000000..ab769a4df
--- /dev/null
+++ b/facebook/qsf-tools/replace_translation_qids.R
@@ -0,0 +1,87 @@
+#!/usr/bin/env Rscript
+
+## In translation CSVs, replace the QID in the name column with the human-readable
+## item name (e.g. A1). Export modified translation CSVs in the same format as the
+## original.
+##
+## Usage:
+##
+## Rscript replace_translation_qids.R path/to/translation/directory/or/single/translation/CSV path/to/codebook
+
+suppressPackageStartupMessages({
+  library(tidyverse)
+  library(purrr)
+  library(stringr)
+  source("qsf-utils.R")
+})
+
+
+#' Replace QIDs in one translation CSV or in every CSV in a directory
+#'
+#' @param path_to_translations path to a translation CSV, or to a directory
+#'   whose `.csv` files should all be processed
+#' @param path_to_codebook path to the codebook CSV
+replace_qid_wrapper <- function(path_to_translations, path_to_codebook) {
+  if (dir.exists(path_to_translations)) {
+    # Process all CSVs in directory
+    csvs <- list.files(path_to_translations, pattern = "\\.csv$", full.names = TRUE)
+    for (csv in csvs) {
+      replace_qids(csv, path_to_codebook)
+    }
+  } else if (file.exists(path_to_translations)) {
+    replace_qids(path_to_translations, path_to_codebook)
+  } else {
+    stop(path_to_translations, " is not a valid file or directory")
+  }
+}
+
+#' Replace QIDs in a single translation CSV, overwriting the file in place
+#'
+#' The wave is parsed from the file name and used to select the matching
+#' codebook rows. QIDs with no codebook entry for that wave are replaced with
+#' "NA".
+#'
+#' @param path_to_translation_file path to the translation CSV to rewrite
+#' @param path_to_codebook path to the codebook CSV; must have `qid`,
+#'   `version`, `variable`, and `matrix_base_name` columns
+replace_qids <- function(path_to_translation_file, path_to_codebook) {
+  wave <- get_wave_from_csv(path_to_translation_file)
+  # Load codebook
+  codebook <- read_csv(path_to_codebook, col_types = cols(
+    .default = col_character(),
+    version = col_double()
+  )) %>%
+    filter(!is.na(qid), version == wave)
+
+  # Load translation file
+  translation <- read_csv(path_to_translation_file, show_col_types = FALSE) %>%
+    # Drop survey ID line
+    filter(!startsWith(PhraseID, "SV_"))
+
+  # Use codebook to make a mapping of QID -> item name.
+  var_qid_pairs <- codebook %>% mutate(variable = coalesce(matrix_base_name, variable)) %>% distinct(qid, variable)
+  qid_item_map <- var_qid_pairs %>% pull(variable)
+  names(qid_item_map) <- var_qid_pairs %>% pull(qid)
+
+  # Use QID-name mapping to replace QID in first column; unmapped QIDs become "NA_".
+  ii_qid <- startsWith(translation$PhraseID, "QID")
+  translation[ii_qid,] <- translation[ii_qid,] %>% mutate(
+    PhraseID = str_replace(PhraseID, "(^QID[0-9]*)_", function(match) {
+      paste0(qid_item_map[str_sub(match, 1, -2)], "_")
+    })
+  )
+
+  # Save processed file back to CSV under the same name.
+  write_excel_csv(translation, path_to_translation_file, quote = "needed")
+}
+
+args <- commandArgs(TRUE)
+
+if (length(args) != 2) {
+  stop("Usage: Rscript replace_translation_qids.R path/to/translation/directory/or/single/translation/CSV path/to/codebook")
+}
+
+path_to_translations <- args[1]
+path_to_codebook <- args[2]
+
+invisible(replace_qid_wrapper(path_to_translations, path_to_codebook))