cmu-delphi · krivard · Jun 14, 2022 · Jun 7, 2022 · Jun 7, 2022 · Jun 9, 2022
diff --git a/facebook/qsf-tools/combine_translations_eu.R b/facebook/qsf-tools/combine_translations_eu.R
@@ -0,0 +1,95 @@
+#!/usr/bin/env Rscript
+
+## Combine a set of EU and non-EU translation files (UMD only), adding in a
+## column indicating whether a given field was included in just the EU version,
+## just the non-EU version, or in both.
+##
+## Usage:
+##
+## Rscript combine_translations_eu.R path/to/eu/translations/dir path/to/noneu/translations/dir path/to/combined/translations/dir
+
+suppressPackageStartupMessages({
+  library(tidyverse)
+  source("qsf-utils.R")
+})
+
+
+#' Stack one wave's EU and non-EU translations and label each row's origin.
+#'
+#' Rows are labelled purely by `PhraseID` prefix: IDs starting with `intro1_eu`
+#' or `intro2_eu` become "EU", IDs starting with `intro1_noneu` or
+#' `intro2_noneu` become "nonEU", and every other row becomes "Both".
+#'
+#' @param eu_translation data frame of EU translation rows; rows are
+#'   classified via the `PhraseID` column
+#' @param noneu_translation data frame of non-EU translation rows; rows are
+#'   classified via the `PhraseID` column
+#'
+#' @return the two inputs row-bound (EU rows first) with an added `eu_noneu`
+#'   column
+combine_translation_pair <- function(eu_translation,
+                                     noneu_translation) {
+  stacked <- bind_rows(eu_translation, noneu_translation)
+
+  mutate(
+    stacked,
+    eu_noneu = case_when(
+      startsWith(PhraseID, "intro1_eu") | startsWith(PhraseID, "intro2_eu") ~ "EU",
+      startsWith(PhraseID, "intro1_noneu") | startsWith(PhraseID, "intro2_noneu") ~ "nonEU",
+      TRUE ~ "Both"
+    )
+  )
+}
+
+#' Combine EU and non-EU translation CSVs, one output file per wave.
+#'
+#' Reads every CSV in each input directory, keys it by the wave number parsed
+#' from its filename with `get_wave_from_csv()`, and writes one combined CSV
+#' per wave to `path_to_combined`. Only `intro*` phrases are kept from the EU
+#' files; the non-EU files keep everything except the `A2_3_Answer*`,
+#' `A2_2_Answer*`, and `NA_*` phrases.
+#'
+#' Stops if either input path lacks its `_eu_` / `_noneu_` marker, or if the two
+#' directories do not cover exactly the same set of waves.
+#'
+#' @param path_to_eu_translations directory of EU translation CSVs; the path
+#'   must contain "_eu_"
+#' @param path_to_noneu_translations directory of non-EU translation CSVs; the
+#'   path must contain "_noneu_"
+#' @param path_to_combined output directory, created if it does not exist
+combine_translations <- function(path_to_eu_translations,
+                                 path_to_noneu_translations,
+                                 path_to_combined) {
+  eu_name_pattern <- "_eu_"
+  if (!grepl(eu_name_pattern, path_to_eu_translations)) {
+    # Leading space keeps the path and the message from running together.
+    stop(path_to_eu_translations, " does not specify that it is for the EU")
+  }
+  noneu_name_pattern <- "_noneu_"
+  if (!grepl(noneu_name_pattern, path_to_noneu_translations)) {
+    stop(path_to_noneu_translations, " does not specify that it is for the non-EU")
+  }
+
+  # `pattern` is a regular expression, not a glob: escape the dot so only files
+  # with a real ".csv" extension are picked up.
+  eu_files <- list.files(path_to_eu_translations, pattern = "\\.csv$", full.names = TRUE)
+  eu_translations <- list()
+  for (filename in eu_files) {
+    eu_translations[[as.character(get_wave_from_csv(filename))]] <- read_csv(filename, show_col_types = FALSE) %>%
+      filter(startsWith(PhraseID, "intro"))
+  }
+
+  noneu_files <- list.files(path_to_noneu_translations, pattern = "\\.csv$", full.names = TRUE)
+  noneu_translations <- list()
+  for (filename in noneu_files) {
+    noneu_translations[[as.character(get_wave_from_csv(filename))]] <- read_csv(filename, show_col_types = FALSE) %>%
+      # Drop response options for the country + region question, they take up way too much space.
+      filter(
+        !startsWith(PhraseID, "A2_3_Answer"),
+        !startsWith(PhraseID, "A2_2_Answer"),
+        !startsWith(PhraseID, "NA_")
+      )
+  }
+
+  # Both lists are keyed by wave; require the key sets to match exactly.
+  if (!identical(sort(names(eu_translations)), sort(names(noneu_translations)))) {
+    stop("not all waves are available for both EU and non-EU")
+  }
+
+  # `recursive = TRUE` also creates missing parent directories, so the writes
+  # below do not fail on a nested output path.
+  dir.create(path_to_combined, showWarnings = FALSE, recursive = TRUE)
+  for (wave in names(eu_translations)) {
+    combined <- combine_translation_pair(
+      eu_translations[[wave]],
+      noneu_translations[[wave]]
+    )
+
+    write_excel_csv(
+      combined,
+      file.path(
+        path_to_combined,
+        sprintf("umd_ctis_combined_v%02g_translations.csv", as.numeric(wave))
+      ),
+      quote = "needed")
+  }
+}
+
+
+# Command-line entry point: expects exactly three positional arguments (EU
+# translations dir, non-EU translations dir, output dir).
+args <- commandArgs(trailingOnly = TRUE)
+
+if (length(args) != 3) {
+  stop("Usage: Rscript combine_translations_eu.R path/to/eu/translations/dir path/to/noneu/translations/dir path/to/combined/translations/dir")
+}
+
+path_to_eu_translations <- args[[1]]
+path_to_noneu_translations <- args[[2]]
+path_to_combined <- args[[3]]
+
+# Wrapped in invisible() so the result is not printed when run via Rscript.
+invisible(
+  combine_translations(
+    path_to_eu_translations,
+    path_to_noneu_translations,
+    path_to_combined
+  )
+)
+
+
diff --git a/facebook/qsf-tools/generate-codebook.R b/facebook/qsf-tools/generate-codebook.R
@@ -274,6 +274,7 @@ process_qsf <- function(path_to_qsf,
 
   # format all qsf content lists into a single tibble
   qdf <- tibble(variable = item_names,
+                qid = qids,
                 question = questions,
                 question_type = qtype,
                 response_options = choices,
@@ -360,6 +361,7 @@ process_qsf <- function(path_to_qsf,
     mutate(new = list(	
       tibble(matrix_base_name = variable,
              variable = unlist(matrix_subquestion_field_names),
+             qid = qid,
              question = question,	
              matrix_subquestion = unlist(matrix_subquestions),	
              question_type = question_type,	
@@ -381,6 +383,7 @@ process_qsf <- function(path_to_qsf,
     mutate(new = list(	
       tibble(matrix_base_name = variable,
              variable = unlist(matrix_subquestion_field_names),	
+             qid = qid,
              question = question,	
              matrix_subquestion = unlist(matrix_subquestions),	
              question_type = question_type,	
@@ -426,6 +429,7 @@ process_qsf <- function(path_to_qsf,
     ) %>% 
     select(wave,
            variable,
+           qid,
            matrix_base_name,
            replaces,
            description,

diff --git a/facebook/qsf-tools/qsf-utils.R b/facebook/qsf-tools/qsf-utils.R
@@ -71,6 +71,32 @@ get_wave <- function(path_to_qsf) {
   return(wave)
 }
 
+#' Get wave number from a CSV filename
+#'
+#' The wave number in the filename should be an integer or a float with one
+#' decimal place, written as 'Wave_XX', 'WaveXX', 'wave_XX', or 'waveXX', and
+#' must appear before the string 'csv'.
+#'
+#' @param path_to_file path or filename of the CSV; the wave number is parsed
+#'   from this string, the file itself is not read
+#'
+#' @return numeric wave number (mostly integer-valued); an error is raised if
+#'   the name does not contain a wave number in the expected format
+get_wave_from_csv <- function(path_to_file) {
+  # Require at least one digit after "wave": with `[0-9]*` a name such as
+  # "wave_translations.csv" passed the check and silently produced NA.
+  name_pattern <- "(.*[Ww]ave_?)([0-9]+([.][0-9])?)(.*csv.*)"
+  if (!grepl(name_pattern, path_to_file)) {
+    stop(
+      "The CSV filename must include the string 'csv', and the wave number in ",
+      "the format 'Wave_XX', 'WaveXX', 'wave_XX', or 'waveXX' where 'XX' is an ",
+      "integer or float. The wave specification can occur anywhere in the ",
+      "filename but must precede the string 'csv'."
+    )
+  }
+
+  # The pattern spans the whole string, so replacing it with group 2 leaves
+  # only the wave number.
+  wave <- as.numeric(
+    sub(name_pattern, "\\2", path_to_file)
+  )
+
+  return(wave)
+}
+
 #' Create mapping of QIDs to module name
 #'
 #' @param qsf contents of QSF file in JSON format

diff --git a/facebook/qsf-tools/replace_translation_qids.R b/facebook/qsf-tools/replace_translation_qids.R
@@ -0,0 +1,73 @@
+#!/usr/bin/env Rscript
+
+## In translation CSVs, replace the QID in the name column with the human-readable
+## item name (e.g. A1). Export modified translation CSVs in the same format as the
+## original.
+##
+## Usage:
+##
+## Rscript replace_translation_qids.R path/to/translation/directory/or/single/translation/CSV path/to/codebook
+
+suppressPackageStartupMessages({
+  library(tidyverse)
+  library(purrr)
+  library(stringr)
+  source("qsf-utils.R")
+})
+
+
+#' Replace QIDs in one translation CSV or in every CSV in a directory.
+#'
+#' @param path_to_translations path to a directory of translation CSVs or to a
+#'   single translation CSV; an error is raised if it is neither
+#' @param path_to_codebook path to the codebook CSV, passed to `replace_qids()`
+replace_qid_wrapper <- function(path_to_translations, path_to_codebook) {
+  if (dir.exists(path_to_translations)) {
+    # Process all CSVs in directory. `pattern` is a regular expression, not a
+    # glob, so escape the dot to match only a real ".csv" extension.
+    csvs <- list.files(path_to_translations, pattern = "\\.csv$", full.names = TRUE)
+    for (csv in csvs) {
+      replace_qids(csv, path_to_codebook)
+    }
+  } else if (file.exists(path_to_translations)) {
+    replace_qids(path_to_translations, path_to_codebook)
+  } else {
+    stop(path_to_translations, " is not a valid file or directory")
+  }
+}
+
+#' Replace QIDs in a translation CSV with item names from the codebook.
+#'
+#' Reads the translation file, rewrites the leading `QID<digits>_` of each
+#' `PhraseID` to the mapped item name followed by `_`, and writes the result
+#' back to the same path. The survey ID row (`SV_*`) is dropped.
+#'
+#' @param path_to_translation_file path to one translation CSV; the wave is
+#'   parsed from this path with `get_wave_from_csv()`
+#' @param path_to_codebook path to the codebook CSV; its `qid`, `version`,
+#'   `variable`, and `matrix_base_name` columns are used
+replace_qids <- function(path_to_translation_file, path_to_codebook) {
+  wave <- get_wave_from_csv(path_to_translation_file)
+
+  # Codebook rows for this wave that carry a QID.
+  # NOTE(review): inside filter(), `wave` resolves to a codebook column first
+  # if one exists -- confirm the codebook has no `wave` column.
+  codebook_types <- cols(
+    .default = col_character(),
+    version = col_double()
+  )
+  codebook <- read_csv(path_to_codebook, col_types = codebook_types)
+  codebook <- filter(codebook, !is.na(qid), version == wave)
+
+  # Translation rows, minus the survey ID line.
+  translation <- read_csv(path_to_translation_file, show_col_types = FALSE)
+  translation <- filter(translation, !startsWith(PhraseID, "SV_"))
+
+  # Named vector mapping QID -> item name; rows with a matrix base name map to
+  # that base name instead of the subquestion name.
+  var_qid_pairs <- distinct(
+    mutate(codebook, variable = coalesce(matrix_base_name, variable)),
+    qid, variable
+  )
+  qid_item_map <- setNames(var_qid_pairs$variable, var_qid_pairs$qid)
+
+  # The match includes the trailing underscore; strip it for the lookup and
+  # re-append it to the item name.
+  qid_to_item <- function(match) {
+    paste0(qid_item_map[str_sub(match, 1, -2)], "_")
+  }
+
+  # Only rows whose PhraseID starts with "QID" are rewritten.
+  ii_qid <- startsWith(translation$PhraseID, "QID")
+  translation[ii_qid, ] <- mutate(
+    translation[ii_qid, ],
+    PhraseID = str_replace(PhraseID, "(^QID[0-9]*)_", qid_to_item)
+  )
+
+  # Save processed file back to CSV under the same name.
+  write_excel_csv(translation, path_to_translation_file, quote = "needed")
+}
+
+# Command-line entry point: expects exactly two positional arguments (the
+# translation directory or single CSV, then the codebook path).
+args <- commandArgs(trailingOnly = TRUE)
+
+if (length(args) != 2) {
+  stop("Usage: Rscript replace_translation_qids.R path/to/translation/directory/or/single/translation/CSV path/to/codebook")
+}
+
+path_to_translations <- args[[1]]
+path_to_codebook <- args[[2]]
+
+# Wrapped in invisible() so the result is not printed when run via Rscript.
+invisible(
+  replace_qid_wrapper(path_to_translations, path_to_codebook)
+)