Merge branch 'main' into testthat-helper-state

easystats · Jun 3, 2024 · 79c22c1 · 79c22c1
2 parents 00e8011 + a7d3c80
commit 79c22c1
Show file tree

Hide file tree

Showing 16 changed files with 584 additions and 173 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.10.0.5
+Version: 0.10.0.6
 Authors@R: c(
     person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),

diff --git a/NEWS.md b/NEWS.md
@@ -23,6 +23,8 @@ CHANGES
   If you recode into a numeric variable, and one of the recode values is `NA`,
   you no longer need to use `NA_real_` for numeric `NA` values.
 
+* Improved documentation for some functions (e.g., `data_to_long()` and `data_restoretype()`).
+
 BUG FIXES
 
 * `data_to_long()` did not work for data frame where columns had attributes

diff --git a/R/data_read.R b/R/data_read.R
@@ -70,7 +70,7 @@
 #' factors, where imported value labels will be set as factor levels. If a
 #' numeric variable has _no_ value labels or less value labels than values, it
 #' is not converted to factor. In this case, value labels are preserved as
-#' `"labels"` attribute. Character vectors are preserved.  Use
+#' `"labels"` attribute. Character vectors are preserved. Use
 #' `convert_factors = FALSE` to remove the automatic conversion of numeric
 #' variables to factors.
 #'
@@ -105,7 +105,7 @@ data_read <- function(path,
     por = .read_spss(path, encoding, convert_factors, verbose, ...),
     dta = .read_stata(path, encoding, convert_factors, verbose, ...),
     sas7bdat = .read_sas(path, path_catalog, encoding, convert_factors, verbose, ...),
-    .read_unknown(path, convert_factors, verbose, ...)
+    .read_unknown(path, file_type, convert_factors, verbose, ...)
   )
 
   # tell user about empty columns
@@ -178,20 +178,18 @@ data_read <- function(path,
         if (is.character(i)) {
           # we need this to drop haven-specific class attributes
           i <- as.character(i)
-        } else {
+        } else if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) {
           # if all values are labelled, we assume factor. Use labels as levels
-          if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) {
-            if (is.numeric(i)) {
-              i <- factor(i, labels = names(value_labels))
-            } else {
-              i <- factor(as.character(i), labels = names(value_labels))
-            }
-            value_labels <- NULL
-            attr(i, "converted_to_factor") <- TRUE
+          if (is.numeric(i)) {
+            i <- factor(i, labels = names(value_labels))
           } else {
-            # else, fall back to numeric
-            i <- as.numeric(i)
+            i <- factor(as.character(i), labels = names(value_labels))
           }
+          value_labels <- NULL
+          attr(i, "converted_to_factor") <- TRUE
+        } else {
+          # else, fall back to numeric
+          i <- as.numeric(i)
         }
 
         # drop unused value labels
@@ -290,12 +288,18 @@ data_read <- function(path,
 }
 
 
-.read_unknown <- function(path, convert_factors, verbose, ...) {
-  insight::check_if_installed("rio", reason = paste0("to read files of type '", .file_ext(path), "'"))
+.read_unknown <- function(path, file_type, convert_factors, verbose, ...) {
+  insight::check_if_installed("rio", reason = paste0("to read files of type '", file_type, "'"))
   if (verbose) {
     insight::format_alert("Reading data...")
   }
-  out <- rio::import(file = path, ...)
+  # set up arguments. for RDS, we set trust = TRUE, to avoid warnings
+  rio_args <- list(file = path)
+  # check if we have RDS, and if so, add trust = TRUE
+  if (file_type == "rds") {
+    rio_args$trust <- TRUE
+  }
+  out <- do.call(rio::import, c(rio_args, list(...)))
 
   # for "unknown" data formats (like .RDS), which still can be imported via
   # "rio::import()", we must check whether we actually have a data frame or
@@ -310,9 +314,8 @@ data_read <- function(path,
         )
       }
       return(out)
-    } else {
-      out <- tmp
     }
+    out <- tmp
   }
 
   .post_process_imported_data(out, convert_factors, verbose)

diff --git a/R/data_restoretype.R b/R/data_restoretype.R
@@ -1,5 +1,6 @@
 #' Restore the type of columns according to a reference data frame
 #'
+#' @param data A data frame for which to restore the column types.
 #' @inheritParams data_to_long
 #' @inheritParams data_rename
 #' @param reference A reference data frame from which to find the correct

diff --git a/R/data_to_long.R b/R/data_to_long.R
@@ -4,65 +4,124 @@
 #' the number of columns. This is a dependency-free base-R equivalent of
 #' `tidyr::pivot_longer()`.
 #'
-#' @param data A data frame to pivot.
-#' @param names_to The name of the new column that will contain the column
-#'   names.
+#' @param data A data frame to convert to long format, so that it has more
+#' rows and fewer columns after the operation.
+#' @param names_to The name of the new column (variable) that will contain the
+#' _names_ from columns in `select` as values, to identify the source of the
+#' values. `names_to` can be a character vector with more than one column name,
+#' in which case `names_sep` or `names_pattern` must be provided in order to
+#' identify which parts of the column names go into newly created columns.
+#' See also 'Examples'.
 #' @param names_prefix A regular expression used to remove matching text from
 #' the start of each variable name.
 #' @param names_sep,names_pattern If `names_to` contains multiple values, this
-#' argument controls how the column name is broken up.
-#' `names_pattern` takes a regular expression containing matching groups, i.e. "()".
-#' @param values_to The name of the new column that will contain the values of
-#'   the pivoted variables.
+#' argument controls how the column name is broken up. `names_pattern` takes a
+#' regular expression containing matching groups, i.e. "()".
+#' @param values_to The name of the new column that will contain the _values_ of
+#' the columns in `select`.
 #' @param values_drop_na If `TRUE`, will drop rows that contain only `NA` in the
-#'   `values_to` column. This effectively converts explicit missing values to
-#'   implicit missing values, and should generally be used only when missing values
-#'   in data were created by its structure.
+#' `values_to` column. This effectively converts explicit missing values to
+#' implicit missing values, and should generally be used only when missing values
+#' in data were created by its structure.
 #' @param rows_to The name of the column that will contain the row names or row
-#'   numbers from the original data. If `NULL`, will be removed.
+#' numbers from the original data. If `NULL`, will be removed.
 #' @param ... Currently not used.
 #' @inheritParams extract_column_names
 #' @param cols Identical to `select`. This argument is here to ensure compatibility
-#'   with `tidyr::pivot_longer()`. If both `select` and `cols` are provided, `cols`
-#'   is used.
+#' with `tidyr::pivot_longer()`. If both `select` and `cols` are provided, `cols`
+#' is used.
+#'
+#' @details
+#' Reshaping data into long format usually means that the input data frame is
+#' in _wide_ format, where multiple measurements taken on the same subject are
+#' stored in multiple columns (variables). The long format stores the same
+#' information in a single column, with each measurement per subject stored in
+#' a separate row. The values of all variables that are not in `select` will
+#' be repeated.
+#'
+#' The necessary information for `data_to_long()` is:
+#'
+#' - The columns that contain the repeated measurements (`select`).
+#' - The name of the newly created column that will contain the names of the
+#'   columns in `select` (`names_to`), to identify the source of the values.
+#'   `names_to` can also be a character vector with more than one column name,
+#'   in which case `names_sep` or `names_pattern` must be provided to specify
+#'   which parts of the column names go into the newly created columns.
+#' - The name of the newly created column that contains the values of the
+#'   columns in `select` (`values_to`).
+#'
+#' In other words: repeated measurements that are spread across several columns
+#' will be gathered into a single column (`values_to`), with the original column
+#' names, which identify the source of the gathered values, stored in one or
+#' more new columns (`names_to`).
 #'
 #' @return If a tibble was provided as input, `reshape_longer()` also returns a
 #' tibble. Otherwise, it returns a data frame.
 #'
 #' @examplesIf requireNamespace("psych") && requireNamespace("tidyr")
-#' wide_data <- data.frame(replicate(5, rnorm(10)))
+#' wide_data <- setNames(
+#'   data.frame(replicate(2, rnorm(8))),
+#'   c("Time1", "Time2")
+#' )
+#' wide_data$ID <- 1:8
+#' wide_data
 #'
-#' # Default behaviour (equivalent to tidyr::pivot_longer(wide_data, cols = 1:5))
+#' # Default behaviour (equivalent to tidyr::pivot_longer(wide_data, cols = 1:3))
+#' # Not very meaningful here: "ID" gets stacked together with "Time1"/"Time2".
 #' data_to_long(wide_data)
 #'
 #' # Customizing the names
-#' data_to_long(wide_data,
-#'   select = c(1, 2),
-#'   names_to = "Column",
-#'   values_to = "Numbers",
-#'   rows_to = "Row"
+#' data_to_long(
+#'   wide_data,
+#'   select = c("Time1", "Time2"),
+#'   names_to = "Timepoint",
+#'   values_to = "Score"
+#' )
+#'
+#' # Reshape multiple columns into long format.
+#' mydat <- data.frame(
+#'   age = c(20, 30, 40),
+#'   sex = c("Female", "Male", "Male"),
+#'   score_t1 = c(30, 35, 32),
+#'   score_t2 = c(33, 34, 37),
+#'   score_t3 = c(36, 35, 38),
+#'   speed_t1 = c(2, 3, 1),
+#'   speed_t2 = c(3, 4, 5),
+#'   speed_t3 = c(1, 8, 6)
+#' )
+#' # The column names are split into two columns: "type" and "time". The
+#' # pattern for splitting column names is provided in `names_pattern`. Values
+#' # of all "score_*" and "speed_*" columns are gathered into a single column
+#' # named "count".
+#' data_to_long(
+#'   mydat,
+#'   select = 3:8,
+#'   names_to = c("type", "time"),
+#'   names_pattern = "(score|speed)_t(\\d+)",
+#'   values_to = "count"
 #' )
 #'
 #' # Full example
 #' # ------------------
 #' data <- psych::bfi # Wide format with one row per participant's personality test
 #'
 #' # Pivot long format
-#' data_to_long(data,
+#' very_long_data <- data_to_long(data,
 #'   select = regex("\\d"), # Select all columns that contain a digit
 #'   names_to = "Item",
 #'   values_to = "Score",
 #'   rows_to = "Participant"
 #' )
+#' head(very_long_data)
 #'
-#' data_to_long(
+#' even_longer_data <- data_to_long(
 #'   tidyr::who,
 #'   select = new_sp_m014:newrel_f65,
 #'   names_to = c("diagnosis", "gender", "age"),
 #'   names_pattern = "new_?(.*)_(.)(.*)",
 #'   values_to = "count"
 #' )
-#'
+#' head(even_longer_data)
 #' @inherit data_rename
 #' @export
 data_to_long <- function(data,
@@ -223,7 +282,11 @@ data_to_long <- function(data,
   # if columns in data frame have attributes (e.g. labelled data), `cbind()`
   # won't work, so we need to remove them. We'll set them back later
   not_stacked[] <- lapply(not_stacked, function(i) {
-    attributes(i) <- NULL
+    # we can't remove *all* attributes, this will convert factors into integers
+    attr(i, "label") <- NULL
+    attr(i, "labels") <- NULL
+    attr(i, "format.spss") <- NULL
+    class(i) <- setdiff(class(i), c("haven_labelled", "vctrs_vctr"))
     i
   })