From af7184bb5740c8ec40fc9d12b28e3b01520f992e Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 7 Jun 2022 15:05:22 -0400 Subject: [PATCH 1/7] initial commit to turn QIDs in translation files into item names --- facebook/qsf-tools/replace_translation_qids.R | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 facebook/qsf-tools/replace_translation_qids.R diff --git a/facebook/qsf-tools/replace_translation_qids.R b/facebook/qsf-tools/replace_translation_qids.R new file mode 100644 index 000000000..5aa3824a5 --- /dev/null +++ b/facebook/qsf-tools/replace_translation_qids.R @@ -0,0 +1,71 @@ +#!/usr/bin/env Rscript + +## In translation CSVs, replace the QID in the name column with the human-readable +## item name (e.g. A1). Export modified translation CSVs in the same format as the +## original. +## +## Usage: +## +## Rscript replace_translation_qids.R path/to/translation/directory/or/single/translation/CSV path/to/codebook + +suppressPackageStartupMessages({ + library(tidyverse) + library(purrr) + library(stringr) + source("qsf-utils.R") +}) + + +replace_qid_wrapper <- function(path_to_translations, path_to_codebook) { + if (dir.exists(path_to_translations)) { + # Process all CSVs in directory + csvs <- list.files(path_to_translations, pattern = "*.csv$", full.names = TRUE) + for (csv in csvs) { + replace_qids(csv, path_to_codebook) + } + } else if (file.exists(path_to_translations)) { + replace_qids(path_to_translations, path_to_codebook) + } else { + stop(path_to_translations, " is not a valid file or directory") + } +} + +replace_qids <- function(path_to_translation_file, path_to_codebook) { + wave <- get_wave_from_csv(path_to_translation_file) + # Load codebook + codebook <- read_csv(path_to_codebook, col_types = cols( + .default = col_character(), + version = col_double() + )) %>% + filter(!is.na(qid), version == wave) + + # Load translation file + translation <- 
read_csv(path_to_translation_file) + + # Use codebook to make a mapping of QID -> item name. + var_qid_pairs <- codebook %>% mutate(variable = coalesce(matrix_base_name, variable)) %>% distinct(qid, variable) + qid_item_map <- var_qid_pairs %>% pull(variable) + names(qid_item_map) <- var_qid_pairs %>% pull(qid) + + # Use QID-name mapping to replace QID in first column. + ii_qid <- startsWith(translation$PhraseID, "QID") + translation[ii_qid,] <- translation[ii_qid,] %>% mutate( + PhraseID = str_replace(PhraseID, "(^QID[0-9]*)_", function(match) { + paste0(qid_item_map[str_sub(match, 1, -2)], "_") + }) + ) + + # Save processed file back to CSV under the same name. + write_csv(translation, path_to_translation_file) +} + +args <- commandArgs(TRUE) + +if (!(length(args) %in% c(2))) { + stop("Usage: Rscript replace_translation_qids.R path/to/translation/directory/or/single/translation/CSV path/to/codebook") +} + +path_to_translations <- args[1] +path_to_codebook <- args[2] + +invisible(replace_qid_wrapper(path_to_translations, path_to_codebook)) From ee3e3626dc6859ee3bd9a04f8fa764ff5586bd88 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 7 Jun 2022 18:09:22 -0400 Subject: [PATCH 2/7] report qids in codebook --- facebook/qsf-tools/generate-codebook.R | 4 ++++ facebook/qsf-tools/qsf-utils.R | 26 ++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/facebook/qsf-tools/generate-codebook.R b/facebook/qsf-tools/generate-codebook.R index 26f81948e..17c5111e4 100644 --- a/facebook/qsf-tools/generate-codebook.R +++ b/facebook/qsf-tools/generate-codebook.R @@ -274,6 +274,7 @@ process_qsf <- function(path_to_qsf, # format all qsf content lists into a single tibble qdf <- tibble(variable = item_names, + qid = qids, question = questions, question_type = qtype, response_options = choices, @@ -360,6 +361,7 @@ process_qsf <- function(path_to_qsf, mutate(new = list( tibble(matrix_base_name = variable, variable = 
unlist(matrix_subquestion_field_names), + qid = qid, question = question, matrix_subquestion = unlist(matrix_subquestions), question_type = question_type, @@ -381,6 +383,7 @@ process_qsf <- function(path_to_qsf, mutate(new = list( tibble(matrix_base_name = variable, variable = unlist(matrix_subquestion_field_names), + qid = qid, question = question, matrix_subquestion = unlist(matrix_subquestions), question_type = question_type, @@ -426,6 +429,7 @@ process_qsf <- function(path_to_qsf, ) %>% select(wave, variable, + qid, matrix_base_name, replaces, description, diff --git a/facebook/qsf-tools/qsf-utils.R b/facebook/qsf-tools/qsf-utils.R index 2891964c1..4ec7280f3 100644 --- a/facebook/qsf-tools/qsf-utils.R +++ b/facebook/qsf-tools/qsf-utils.R @@ -71,6 +71,32 @@ get_wave <- function(path_to_qsf) { return(wave) } +#' Get wave number from CSV filename +#' +#' Wave number as provided in the CSV filename should be an integer or a float with +#' one decimal place. +#' +#' @param path_to_file path to a CSV file whose name includes the wave number +#' +#' @return (mostly) integer wave number +get_wave_from_csv <- function(path_to_file) { + name_pattern <- "(.*[Ww]ave_?)([0-9]*([.][0-9])?)(.*csv.*)" + if (!grepl(name_pattern, path_to_file)) { + stop( + "The CSV filename must include the string 'csv', and the wave number in ", + "the format 'Wave_XX', 'WaveXX', 'wave_XX', or 'waveXX' where 'XX' is an ", + "integer or float. The wave specification can occur anywhere in the ", + "filename but must precede the string 'csv'." 
+ ) + } + + wave <- as.numeric( + sub(name_pattern, "\\2", path_to_file) + ) + + return(wave) +} + #' Create mapping of QIDs to module name #' #' @param qsf contents of QSF file in JSON format From de29f3add3c3190aa0fc239b4d0ba8151ec806de Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 9 Jun 2022 16:39:32 -0400 Subject: [PATCH 3/7] remove initial translation file line with survey ID --- facebook/qsf-tools/replace_translation_qids.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/facebook/qsf-tools/replace_translation_qids.R b/facebook/qsf-tools/replace_translation_qids.R index 5aa3824a5..588d8082d 100644 --- a/facebook/qsf-tools/replace_translation_qids.R +++ b/facebook/qsf-tools/replace_translation_qids.R @@ -40,7 +40,9 @@ replace_qids <- function(path_to_translation_file, path_to_codebook) { filter(!is.na(qid), version == wave) # Load translation file - translation <- read_csv(path_to_translation_file) + translation <- read_csv(path_to_translation_file) %>% + # Drop survey ID line + filter(!startsWith(PhraseID, "SV_")) # Use codebook to make a mapping of QID -> item name. 
var_qid_pairs <- codebook %>% mutate(variable = coalesce(matrix_base_name, variable)) %>% distinct(qid, variable) From 27f62e0f6990c34d4a471e7628ed357ea20b741b Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 9 Jun 2022 16:47:02 -0400 Subject: [PATCH 4/7] make csvs excel compatible --- facebook/qsf-tools/replace_translation_qids.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/facebook/qsf-tools/replace_translation_qids.R b/facebook/qsf-tools/replace_translation_qids.R index 588d8082d..f6ed84122 100644 --- a/facebook/qsf-tools/replace_translation_qids.R +++ b/facebook/qsf-tools/replace_translation_qids.R @@ -55,10 +55,10 @@ replace_qids <- function(path_to_translation_file, path_to_codebook) { PhraseID = str_replace(PhraseID, "(^QID[0-9]*)_", function(match) { paste0(qid_item_map[str_sub(match, 1, -2)], "_") }) - ) + ) # Save processed file back to CSV under the same name. - write_csv(translation, path_to_translation_file) + write_excel_csv(translation, path_to_translation_file) } args <- commandArgs(TRUE) From 26561d7389e9a1f9481a170df5f0b81680200425 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 9 Jun 2022 17:34:28 -0400 Subject: [PATCH 5/7] new script to combine eu translation files --- facebook/qsf-tools/combine_translations_eu.R | 87 ++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 facebook/qsf-tools/combine_translations_eu.R diff --git a/facebook/qsf-tools/combine_translations_eu.R b/facebook/qsf-tools/combine_translations_eu.R new file mode 100644 index 000000000..aa8c583a3 --- /dev/null +++ b/facebook/qsf-tools/combine_translations_eu.R @@ -0,0 +1,87 @@ +#!/usr/bin/env Rscript + +## Combine a set of EU and non-EU translation files (UMD only), adding in a +## column indicating whether a given field was included in just the EU version, +## just the non-EU version, or in both. 
+## +## Usage: +## +## Rscript combine_translations_eu.R path/to/eu/translations/dir path/to/noneu/translations/dir path/to/combined/translations/dir + +suppressPackageStartupMessages({ + library(tidyverse) + source("qsf-utils.R") +}) + + +combine_translation_pair <- function(path_to_eu_translations, + path_to_noneu_translations) { + + translation <- translation %>% + # Drop response options for the country + region question, they take up way too much space. + filter(!startsWith(PhraseID, "A2_3_Answer")) %>% + mutate(eu_noneu = case_when( + startsWith(PhraseID, "intro1_eu") ~ "EU", + startsWith(PhraseID, "intro2_eu") ~ "EU", + startsWith(PhraseID, "intro1_noneu") ~ "nonEU", + startsWith(PhraseID, "intro2_noneu") ~ "nonEU", + TRUE ~ "Both" + ) + ) +} + +combine_translations <- function(path_to_eu_translations, + path_to_noneu_translations, + path_to_combined) { + eu_name_pattern <- "_eu_" + if (!grepl(eu_name_pattern, path_to_eu_translations)) { + stop(path_to_eu_translations, "does not specify that it is for the EU") + } + noneu_name_pattern <- "_noneu_" + if (!grepl(noneu_name_pattern, path_to_noneu_translations)) { + stop(path_to_noneu_translations, "does not specify that it is for the non-EU") + } + + eu_files <- list.files(path_to_eu_translations, pattern = "*.csv$") + eu_translations <- list() + for (filename in eu_files) { + eu_translations[[get_wave_csv(filename)]] <- read_csv(filename) %>% + filter(!startsWith(PhraseID, "intro")) + } + + noneu_files <- list.files(path_to_noneu_translations, pattern = "*.csv$") + noneu_translations <- list() + for (filename in noneu_files) { + noneu_translations[[get_wave_csv(filename)]] <- read_csv(filename) + } + + if (!identical(sort(names(eu_translations)), sort(names(noneu_translations)))) { + stop("not all waves are available for both EU and non-EU") + } + + for (wave in names(eu_translations)) { + combined <- combine_translation_pair( + eu_translations[[wave]], + noneu_translations[[wave]] + ) + 
write_excel_csv(combined, file.path( + path_to_combined, + sprintf("umd_ctis_combined_wave%02d_translations.csv", wave) + )) + } +} + + +args <- commandArgs(TRUE) + +if (length(args) != 3) { + stop("Usage: Rscript combine_translations_eu.R path/to/eu/translations/dir path/to/noneu/translations/dir path/to/combined/translations/dir") +} + +path_to_eu_translations <- args[1] +path_to_noneu_translations <- args[2] +path_to_combined <- args[3] + +invisible(combine_translations(path_to_eu_translations, path_to_noneu_translations, path_to_combined)) + + From 3be1ef01794ab177457d525ff45e3a242f716d59 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 10 Jun 2022 12:03:14 -0400 Subject: [PATCH 6/7] drop some unnecessary fields + output format changes --- facebook/qsf-tools/combine_translations_eu.R | 42 +++++++++++-------- facebook/qsf-tools/replace_translation_qids.R | 4 +- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/facebook/qsf-tools/combine_translations_eu.R b/facebook/qsf-tools/combine_translations_eu.R index aa8c583a3..14978e011 100644 --- a/facebook/qsf-tools/combine_translations_eu.R +++ b/facebook/qsf-tools/combine_translations_eu.R @@ -14,20 +14,17 @@ suppressPackageStartupMessages({ }) -combine_translation_pair <- function(path_to_eu_translations, - path_to_noneu_translations) { - - translation <- translation %>% - # Drop response options for the country + region question, they take up way too much space. 
- filter(!startsWith(PhraseID, "A2_3_Answer")) %>% +combine_translation_pair <- function(eu_translation, + noneu_translation) { + translation <- bind_rows(eu_translation, noneu_translation) %>% mutate(eu_noneu = case_when( startsWith(PhraseID, "intro1_eu") ~ "EU", startsWith(PhraseID, "intro2_eu") ~ "EU", startsWith(PhraseID, "intro1_noneu") ~ "nonEU", startsWith(PhraseID, "intro2_noneu") ~ "nonEU", TRUE ~ "Both" - ) - ) + )) + return(translation) } combine_translations <- function(path_to_eu_translations, @@ -42,32 +39,43 @@ combine_translations <- function(path_to_eu_translations, stop(path_to_noneu_translations, "does not specify that it is for the non-EU") } - eu_files <- list.files(path_to_eu_translations, pattern = "*.csv$") + eu_files <- list.files(path_to_eu_translations, pattern = "*.csv$", full.names = TRUE) eu_translations <- list() for (filename in eu_files) { - eu_translations[[get_wave_csv(filename)]] <- read_csv(filename) %>% - filter(!startsWith(PhraseID, "intro")) + eu_translations[[as.character(get_wave_from_csv(filename))]] <- read_csv(filename, show_col_types = FALSE) %>% + filter(startsWith(PhraseID, "intro")) } - noneu_files <- list.files(path_to_noneu_translations, pattern = "*.csv$") + noneu_files <- list.files(path_to_noneu_translations, pattern = "*.csv$", full.names = TRUE) noneu_translations <- list() for (filename in noneu_files) { - noneu_translations[[get_wave_csv(filename)]] <- read_csv(filename) + noneu_translations[[as.character(get_wave_from_csv(filename))]] <- read_csv(filename, show_col_types = FALSE) %>% + # Drop response options for the country + region question, they take up way too much space. 
+ filter( + !startsWith(PhraseID, "A2_3_Answer"), + !startsWith(PhraseID, "A2_2_Answer"), + !startsWith(PhraseID, "NA_") + ) } if (!identical(sort(names(eu_translations)), sort(names(noneu_translations)))) { stop("not all waves are available for both EU and non-EU") } + dir.create(path_to_combined, showWarnings = FALSE) for (wave in names(eu_translations)) { combined <- combine_translation_pair( eu_translations[[wave]], noneu_translations[[wave]] ) - write_excel_csv(combined, file.path( - path_to_combined, - sprintf("umd_ctis_combined_wave%02d_translations.csv", wave) - )) + + write_excel_csv( + combined, + file.path( + path_to_combined, + sprintf("umd_ctis_combined_wave%02g_translations.csv", as.numeric(wave)) + ), + quote = "needed") } } diff --git a/facebook/qsf-tools/replace_translation_qids.R b/facebook/qsf-tools/replace_translation_qids.R index f6ed84122..ab769a4df 100644 --- a/facebook/qsf-tools/replace_translation_qids.R +++ b/facebook/qsf-tools/replace_translation_qids.R @@ -40,7 +40,7 @@ replace_qids <- function(path_to_translation_file, path_to_codebook) { filter(!is.na(qid), version == wave) # Load translation file - translation <- read_csv(path_to_translation_file) %>% + translation <- read_csv(path_to_translation_file, show_col_types = FALSE) %>% # Drop survey ID line filter(!startsWith(PhraseID, "SV_")) @@ -58,7 +58,7 @@ replace_qids <- function(path_to_translation_file, path_to_codebook) { ) # Save processed file back to CSV under the same name. 
- write_excel_csv(translation, path_to_translation_file) + write_excel_csv(translation, path_to_translation_file, quote = "needed") } args <- commandArgs(TRUE) From cc8a6ad35f99bfe33f3276e257cb71345ce2e0b4 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 14 Jun 2022 12:46:36 -0400 Subject: [PATCH 7/7] save name with "v" instead of "wave" --- facebook/qsf-tools/combine_translations_eu.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/facebook/qsf-tools/combine_translations_eu.R b/facebook/qsf-tools/combine_translations_eu.R index 14978e011..8ce09a353 100644 --- a/facebook/qsf-tools/combine_translations_eu.R +++ b/facebook/qsf-tools/combine_translations_eu.R @@ -73,7 +73,7 @@ combine_translations <- function(path_to_eu_translations, combined, file.path( path_to_combined, - sprintf("umd_ctis_combined_wave%02g_translations.csv", as.numeric(wave)) + sprintf("umd_ctis_combined_v%02g_translations.csv", as.numeric(wave)) ), quote = "needed") }