From af7184bb5740c8ec40fc9d12b28e3b01520f992e Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Tue, 7 Jun 2022 15:05:22 -0400
Subject: [PATCH 1/7] initial commit to turn QIDs in translation files into
 item names

---
 facebook/qsf-tools/replace_translation_qids.R | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 facebook/qsf-tools/replace_translation_qids.R

diff --git a/facebook/qsf-tools/replace_translation_qids.R b/facebook/qsf-tools/replace_translation_qids.R
new file mode 100644
index 000000000..5aa3824a5
--- /dev/null
+++ b/facebook/qsf-tools/replace_translation_qids.R
@@ -0,0 +1,71 @@
+#!/usr/bin/env Rscript
+
+## In translation CSVs, replace the QID in the PhraseID column with the
+## human-readable item name (e.g. A1). Each modified CSV is written back to the
+## same path, overwriting the original.
+##
+## Usage:
+##
+## Rscript replace_translation_qids.R path/to/translation/directory/or/single/translation/CSV path/to/codebook
+
+suppressPackageStartupMessages({
+  library(tidyverse)
+  library(purrr)
+  library(stringr)
+  source("qsf-utils.R")
+})
+
+
+## Replace QIDs in a single translation CSV, or in every CSV in a directory.
+replace_qid_wrapper <- function(path_to_translations, path_to_codebook) {
+  if (dir.exists(path_to_translations)) {
+    # `pattern` is a regex, not a glob: escape the dot to match ".csv" only.
+    for (csv in list.files(path_to_translations, pattern = "\\.csv$", full.names = TRUE)) {
+      replace_qids(csv, path_to_codebook)
+    }
+  } else if (file.exists(path_to_translations)) {
+    replace_qids(path_to_translations, path_to_codebook)
+  } else {
+    stop(path_to_translations, " is not a valid file or directory")
+  }
+}
+
+replace_qids <- function(path_to_translation_file, path_to_codebook) {
+  wave <- get_wave_from_csv(path_to_translation_file)
+  # Load codebook
+  codebook <- read_csv(path_to_codebook, col_types = cols(
+    .default = col_character(),
+    version = col_double()
+  )) %>%
+    filter(!is.na(qid), version == wave)
+
+  # Load translation file
+  translation <- read_csv(path_to_translation_file)
+
+  # Use codebook to make a mapping of QID -> item name.
+  var_qid_pairs <- codebook %>% mutate(variable = coalesce(matrix_base_name, variable)) %>% distinct(qid, variable)
+  qid_item_map <- var_qid_pairs %>% pull(variable)
+  names(qid_item_map) <- var_qid_pairs %>% pull(qid)
+  
+  # Use QID-name mapping to replace QID in first column.
+  ii_qid <- startsWith(translation$PhraseID, "QID")
+  translation[ii_qid,] <- translation[ii_qid,] %>% mutate(
+    PhraseID = str_replace(PhraseID, "(^QID[0-9]*)_", function(match) {
+      paste0(qid_item_map[str_sub(match, 1, -2)], "_")
+    })
+  )
+  
+  # Save processed file back to CSV under the same name.
+  write_csv(translation, path_to_translation_file)
+}
+
+args <- commandArgs(TRUE)
+
+if (length(args) != 2) {
+  stop("Usage: Rscript replace_translation_qids.R path/to/translation/directory/or/single/translation/CSV path/to/codebook")
+}
+
+path_to_translations <- args[1]
+path_to_codebook <- args[2]
+
+invisible(replace_qid_wrapper(path_to_translations, path_to_codebook))

From ee3e3626dc6859ee3bd9a04f8fa764ff5586bd88 Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Tue, 7 Jun 2022 18:09:22 -0400
Subject: [PATCH 2/7] report qids in codebook

---
 facebook/qsf-tools/generate-codebook.R |  4 ++++
 facebook/qsf-tools/qsf-utils.R         | 26 ++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/facebook/qsf-tools/generate-codebook.R b/facebook/qsf-tools/generate-codebook.R
index 26f81948e..17c5111e4 100644
--- a/facebook/qsf-tools/generate-codebook.R
+++ b/facebook/qsf-tools/generate-codebook.R
@@ -274,6 +274,7 @@ process_qsf <- function(path_to_qsf,
   
   # format all qsf content lists into a single tibble
   qdf <- tibble(variable = item_names,
+                qid = qids,
                 question = questions,
                 question_type = qtype,
                 response_options = choices,
@@ -360,6 +361,7 @@ process_qsf <- function(path_to_qsf,
     mutate(new = list(	
       tibble(matrix_base_name = variable,
              variable = unlist(matrix_subquestion_field_names),
+             qid = qid,
              question = question,	
              matrix_subquestion = unlist(matrix_subquestions),	
              question_type = question_type,	
@@ -381,6 +383,7 @@ process_qsf <- function(path_to_qsf,
     mutate(new = list(	
       tibble(matrix_base_name = variable,
              variable = unlist(matrix_subquestion_field_names),	
+             qid = qid,
              question = question,	
              matrix_subquestion = unlist(matrix_subquestions),	
              question_type = question_type,	
@@ -426,6 +429,7 @@ process_qsf <- function(path_to_qsf,
     ) %>% 
     select(wave,
            variable,
+           qid,
            matrix_base_name,
            replaces,
            description,
diff --git a/facebook/qsf-tools/qsf-utils.R b/facebook/qsf-tools/qsf-utils.R
index 2891964c1..4ec7280f3 100644
--- a/facebook/qsf-tools/qsf-utils.R
+++ b/facebook/qsf-tools/qsf-utils.R
@@ -71,6 +71,32 @@ get_wave <- function(path_to_qsf) {
   return(wave)
 }
 
+#' Get wave number from translation CSV filename
+#'
+#' The path must contain "wave" or "Wave", optionally followed by "_", then the
+#' wave number (an integer, or a float with one decimal place), and "csv"
+#' somewhere after the wave number.
+#'
+#' @param path_to_file path to a CSV file whose name includes the wave number
+#'
+#' @return numeric wave number
+get_wave_from_csv <- function(path_to_file) {
+  # Require at least one digit (`+`, not `*`) so that a name with no wave
+  # number fails validation instead of silently yielding an NA wave.
+  name_pattern <- "(.*[Ww]ave_?)([0-9]+([.][0-9])?)(.*csv.*)"
+  if (!grepl(name_pattern, path_to_file)) {
+    stop(
+      "The CSV filename must include the string 'csv', and the wave number in ",
+      "the format 'Wave_XX', 'WaveXX', 'wave_XX', or 'waveXX' where 'XX' is an ",
+      "integer or float. The wave specification can occur anywhere in the ",
+      "filename but must precede the string 'csv'."
+    )
+  }
+
+  # The second capture group is the wave number.
+  as.numeric(sub(name_pattern, "\\2", path_to_file))
+}
+
 #' Create mapping of QIDs to module name
 #'
 #' @param qsf contents of QSF file in JSON format

From de29f3add3c3190aa0fc239b4d0ba8151ec806de Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Thu, 9 Jun 2022 16:39:32 -0400
Subject: [PATCH 3/7] remove initial translation file line with survey ID

---
 facebook/qsf-tools/replace_translation_qids.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/facebook/qsf-tools/replace_translation_qids.R b/facebook/qsf-tools/replace_translation_qids.R
index 5aa3824a5..588d8082d 100644
--- a/facebook/qsf-tools/replace_translation_qids.R
+++ b/facebook/qsf-tools/replace_translation_qids.R
@@ -40,7 +40,9 @@ replace_qids <- function(path_to_translation_file, path_to_codebook) {
     filter(!is.na(qid), version == wave)
 
   # Load translation file
-  translation <- read_csv(path_to_translation_file)
+  translation <- read_csv(path_to_translation_file) %>% 
+    # Drop survey ID line
+    filter(!startsWith(PhraseID, "SV_"))  
 
   # Use codebook to make a mapping of QID -> item name.
   var_qid_pairs <- codebook %>% mutate(variable = coalesce(matrix_base_name, variable)) %>% distinct(qid, variable)

From 27f62e0f6990c34d4a471e7628ed357ea20b741b Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Thu, 9 Jun 2022 16:47:02 -0400
Subject: [PATCH 4/7] make csvs excel compatible

---
 facebook/qsf-tools/replace_translation_qids.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/facebook/qsf-tools/replace_translation_qids.R b/facebook/qsf-tools/replace_translation_qids.R
index 588d8082d..f6ed84122 100644
--- a/facebook/qsf-tools/replace_translation_qids.R
+++ b/facebook/qsf-tools/replace_translation_qids.R
@@ -55,10 +55,10 @@ replace_qids <- function(path_to_translation_file, path_to_codebook) {
     PhraseID = str_replace(PhraseID, "(^QID[0-9]*)_", function(match) {
       paste0(qid_item_map[str_sub(match, 1, -2)], "_")
     })
-  )
+  ) 
   
   # Save processed file back to CSV under the same name.
-  write_csv(translation, path_to_translation_file)
+  write_excel_csv(translation, path_to_translation_file)
 }
 
 args <- commandArgs(TRUE)

From 26561d7389e9a1f9481a170df5f0b81680200425 Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Thu, 9 Jun 2022 17:34:28 -0400
Subject: [PATCH 5/7] new script to combine eu translation files

---
 facebook/qsf-tools/combine_translations_eu.R | 87 ++++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 facebook/qsf-tools/combine_translations_eu.R

diff --git a/facebook/qsf-tools/combine_translations_eu.R b/facebook/qsf-tools/combine_translations_eu.R
new file mode 100644
index 000000000..aa8c583a3
--- /dev/null
+++ b/facebook/qsf-tools/combine_translations_eu.R
@@ -0,0 +1,87 @@
+#!/usr/bin/env Rscript
+
+## Combine a set of EU and non-EU translation files (UMD only), adding in a
+## column indicating whether a given field was included in just the EU version,
+## just the non-EU version, or in both.
+##
+## Usage:
+##
+## Rscript combine_translations_eu.R path/to/eu/translations/dir path/to/noneu/translations/dir path/to/combined/translations/dir
+
+suppressPackageStartupMessages({
+  library(tidyverse)
+  source("qsf-utils.R")
+})
+
+
+combine_translation_pair <- function(path_to_eu_translations,
+                                     path_to_noneu_translations) {
+  
+  translation <- translation %>% 
+    # Drop response options for the country + region question, they take up way too much space.
+    filter(!startsWith(PhraseID, "A2_3_Answer")) %>%
+    mutate(eu_noneu = case_when(
+      startsWith(PhraseID, "intro1_eu") ~ "EU",
+      startsWith(PhraseID, "intro2_eu") ~ "EU",
+      startsWith(PhraseID, "intro1_noneu") ~ "nonEU",
+      startsWith(PhraseID, "intro2_noneu") ~ "nonEU",
+      TRUE ~ "Both"
+    )
+    )
+}
+
+combine_translations <- function(path_to_eu_translations,
+                                 path_to_noneu_translations,
+                                 path_to_combined) {
+  eu_name_pattern <- "_eu_"
+  if (!grepl(eu_name_pattern, path_to_eu_translations)) {
+    stop(path_to_eu_translations, " does not specify that it is for the EU")
+  }
+  noneu_name_pattern <- "_noneu_"
+  if (!grepl(noneu_name_pattern, path_to_noneu_translations)) {
+    stop(path_to_noneu_translations, "does not specify that it is for the non-EU")
+  }
+  
+  eu_files <- list.files(path_to_eu_translations, pattern = "*.csv$")
+  eu_translations <- list()
+  for (filename in eu_files) {
+    eu_translations[[get_wave_csv(filename)]] <- read_csv(filename) %>% 
+      filter(!startsWith(PhraseID, "intro"))
+  }
+  
+  noneu_files <- list.files(path_to_noneu_translations, pattern = "*.csv$")
+  noneu_translations <- list()
+  for (filename in noneu_files) {
+    noneu_translations[[get_wave_csv(filename)]] <- read_csv(filename)
+  }
+  
+  if (!identical(sort(names(eu_translations)), sort(names(noneu_translations)))) {
+    stop("not all waves are available for both EU and non-EU")
+  }
+  
+  for (wave in names(eu_translations)) {
+    combined <- combine_translation_pair(
+      eu_translations[[wave]],
+      noneu_translations[[wave]]
+    )
+    write_excel_csv(combined, file.path(
+      path_to_combined,
+      sprintf("umd_ctis_combined_wave%02d_translations.csv", wave)
+    ))
+  }
+}
+
+
+args <- commandArgs(TRUE)
+
+if (length(args) != 3) {
+  stop("Usage: Rscript combine_translations_eu.R path/to/eu/translations/dir path/to/noneu/translations/dir path/to/combined/translations/dir")
+}
+
+path_to_eu_translations <- args[1]
+path_to_noneu_translations <- args[2]
+path_to_combined <- args[3]
+
+invisible(combine_translations(path_to_eu_translations, path_to_noneu_translations, path_to_combined))
+
+

From 3be1ef01794ab177457d525ff45e3a242f716d59 Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Fri, 10 Jun 2022 12:03:14 -0400
Subject: [PATCH 6/7] drop some unnecessary fields + output format changes

---
 facebook/qsf-tools/combine_translations_eu.R  | 42 +++++++++++--------
 facebook/qsf-tools/replace_translation_qids.R |  4 +-
 2 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/facebook/qsf-tools/combine_translations_eu.R b/facebook/qsf-tools/combine_translations_eu.R
index aa8c583a3..14978e011 100644
--- a/facebook/qsf-tools/combine_translations_eu.R
+++ b/facebook/qsf-tools/combine_translations_eu.R
@@ -14,20 +14,17 @@ suppressPackageStartupMessages({
 })
 
 
-combine_translation_pair <- function(path_to_eu_translations,
-                                     path_to_noneu_translations) {
-  
-  translation <- translation %>% 
-    # Drop response options for the country + region question, they take up way too much space.
-    filter(!startsWith(PhraseID, "A2_3_Answer")) %>%
+combine_translation_pair <- function(eu_translation,
+                                     noneu_translation) {
+  translation <- bind_rows(eu_translation, noneu_translation) %>%
     mutate(eu_noneu = case_when(
       startsWith(PhraseID, "intro1_eu") ~ "EU",
       startsWith(PhraseID, "intro2_eu") ~ "EU",
       startsWith(PhraseID, "intro1_noneu") ~ "nonEU",
       startsWith(PhraseID, "intro2_noneu") ~ "nonEU",
       TRUE ~ "Both"
-    )
-    )
+    ))
+  return(translation)
 }
 
 combine_translations <- function(path_to_eu_translations,
@@ -42,32 +39,43 @@ combine_translations <- function(path_to_eu_translations,
     stop(path_to_noneu_translations, "does not specify that it is for the non-EU")
   }
   
-  eu_files <- list.files(path_to_eu_translations, pattern = "*.csv$")
+  eu_files <- list.files(path_to_eu_translations, pattern = "\\.csv$", full.names = TRUE)
   eu_translations <- list()
   for (filename in eu_files) {
-    eu_translations[[get_wave_csv(filename)]] <- read_csv(filename) %>% 
-      filter(!startsWith(PhraseID, "intro"))
+    eu_translations[[as.character(get_wave_from_csv(filename))]] <- read_csv(filename, show_col_types = FALSE) %>% 
+      filter(startsWith(PhraseID, "intro"))
   }
   
-  noneu_files <- list.files(path_to_noneu_translations, pattern = "*.csv$")
+  noneu_files <- list.files(path_to_noneu_translations, pattern = "\\.csv$", full.names = TRUE)
   noneu_translations <- list()
   for (filename in noneu_files) {
-    noneu_translations[[get_wave_csv(filename)]] <- read_csv(filename)
+    noneu_translations[[as.character(get_wave_from_csv(filename))]] <- read_csv(filename, show_col_types = FALSE) %>% 
+      # Drop response options for the country + region question, they take up way too much space.
+      filter(
+        !startsWith(PhraseID, "A2_3_Answer"),
+        !startsWith(PhraseID, "A2_2_Answer"),
+        !startsWith(PhraseID, "NA_")
+      )
   }
   
   if (!identical(sort(names(eu_translations)), sort(names(noneu_translations)))) {
     stop("not all waves are available for both EU and non-EU")
   }
   
+  dir.create(path_to_combined, showWarnings = FALSE, recursive = TRUE)
   for (wave in names(eu_translations)) {
     combined <- combine_translation_pair(
       eu_translations[[wave]],
       noneu_translations[[wave]]
     )
-    write_excel_csv(combined, file.path(
-      path_to_combined,
-      sprintf("umd_ctis_combined_wave%02d_translations.csv", wave)
-    ))
+
+    write_excel_csv(
+      combined,
+      file.path(
+        path_to_combined,
+        sprintf("umd_ctis_combined_wave%02g_translations.csv", as.numeric(wave))
+      ),
+      quote = "needed")
   }
 }
 
diff --git a/facebook/qsf-tools/replace_translation_qids.R b/facebook/qsf-tools/replace_translation_qids.R
index f6ed84122..ab769a4df 100644
--- a/facebook/qsf-tools/replace_translation_qids.R
+++ b/facebook/qsf-tools/replace_translation_qids.R
@@ -40,7 +40,7 @@ replace_qids <- function(path_to_translation_file, path_to_codebook) {
     filter(!is.na(qid), version == wave)
 
   # Load translation file
-  translation <- read_csv(path_to_translation_file) %>% 
+  translation <- read_csv(path_to_translation_file, show_col_types = FALSE) %>% 
     # Drop survey ID line
     filter(!startsWith(PhraseID, "SV_"))  
 
@@ -58,7 +58,7 @@ replace_qids <- function(path_to_translation_file, path_to_codebook) {
   ) 
   
   # Save processed file back to CSV under the same name.
-  write_excel_csv(translation, path_to_translation_file)
+  write_excel_csv(translation, path_to_translation_file, quote = "needed")
 }
 
 args <- commandArgs(TRUE)

From cc8a6ad35f99bfe33f3276e257cb71345ce2e0b4 Mon Sep 17 00:00:00 2001
From: Nat DeFries <42820733+nmdefries@users.noreply.github.com>
Date: Tue, 14 Jun 2022 12:46:36 -0400
Subject: [PATCH 7/7] save name with "v" instead of "wave"

---
 facebook/qsf-tools/combine_translations_eu.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/facebook/qsf-tools/combine_translations_eu.R b/facebook/qsf-tools/combine_translations_eu.R
index 14978e011..8ce09a353 100644
--- a/facebook/qsf-tools/combine_translations_eu.R
+++ b/facebook/qsf-tools/combine_translations_eu.R
@@ -73,7 +73,7 @@ combine_translations <- function(path_to_eu_translations,
       combined,
       file.path(
         path_to_combined,
-        sprintf("umd_ctis_combined_wave%02g_translations.csv", as.numeric(wave))
+        sprintf("umd_ctis_combined_v%02g_translations.csv", as.numeric(wave))
       ),
       quote = "needed")
   }