report_sample allows group_by > 1

easystats · Jun 14, 2023 · ffff602 · ffff602
1 parent 9c01ea2
commit ffff602
Show file tree

Hide file tree

Showing 7 changed files with 76 additions and 25 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: report
 Title: Automated Reporting of Results and Statistical Models
-Version: 0.5.7.5
+Version: 0.5.7.6
 Authors@R:
     c(person(given = "Dominique",
              family = "Makowski",

diff --git a/NEWS.md b/NEWS.md
@@ -12,6 +12,9 @@ Minor changes
 * `report_sample` now works on grouped data frame, using the defined groups as
   values for the `group_by` argument.
 
+* `report_sample` can now summarize data based on more than one grouping variable
+  (i.e. `group_by` is allowed to be longer than 1).
+
 # report 0.5.7
 
 Hotfix for CRAN reverse dependency compatibility.

diff --git a/R/report_sample.R b/R/report_sample.R
@@ -3,8 +3,9 @@
 #' Create sample description table (also referred to as "Table 1").
 #'
 #' @param data A data frame for which descriptive statistics should be created.
-#' @param group_by Character vector, indicating the column for possible grouping
-#'   of the descriptive table.
+#' @param group_by Character vector, indicating the column(s) for possible grouping
+#'   of the descriptive table. Note that weighting (see `weights`) does not work
+#' wirh more than one grouping column.
 #' @param centrality Character, indicates the statistics that should be
 #'   calculated for numeric variables. May be `"mean"` (for mean and
 #'   standard deviation) or `"median"` (for median and median absolute
@@ -120,16 +121,13 @@ report_sample <- function(data,
     group_by <- setdiff(colnames(attributes(data)$groups), ".rows")
   }
 
-  # group_by is not allow to be longer than 1 element
-  if (length(group_by) > 1) {
-    insight::format_error(
-      "`report_sample` only works for one grouping variable.",
-      "Thus, `group_by` must be of length 1, or data frames are only allowed to be grouped by one variable."
-    )
-  }
-
   # grouped by?
-  grouping <- !is.null(group_by) && group_by %in% colnames(data)
+  grouping <- !is.null(group_by) && all(group_by %in% colnames(data))
+
+  # sanity check - weights and grouping
+  if (!is.null(group_by) && length(group_by) > 1 && !is.null(weights)) {
+    insight::format_error("Cannot apply `weights` when grouping is done by more than one variable.")
+  }
 
   # character to factor
   data[] <- lapply(data, function(i) {
@@ -139,9 +137,12 @@ report_sample <- function(data,
     i
   })
 
+  # coerce group_by columns to factor
+  groups <- as.data.frame(lapply(data[group_by], factor))
+
   out <- if (isTRUE(grouping)) {
-    result <- lapply(split(data[variables], factor(data[[group_by]])), function(x) {
-      x[[group_by]] <- NULL
+    result <- lapply(split(data[variables], groups), function(x) {
+      x[group_by] <- NULL
       .generate_descriptive_table(
         x,
         centrality,
@@ -153,21 +154,36 @@ report_sample <- function(data,
         ci_correct
       )
     })
+    # for more than one group, fix column names. we don't want "a.b (n=10)",
+    # but rather ""a, b (n=10)""
+    if (length(group_by) > 1) {
+      old_names <- datawizard::data_unite(
+        unique(groups),
+        new_column = ".old_names",
+        separator = "."
+      )[[".old_names"]]
+      new_names <- datawizard::data_unite(
+        unique(groups),
+        new_column = ".new_names",
+        separator = ", "
+      )[[".new_names"]]
+      result <- datawizard::data_rename(result, pattern = old_names, replacement = new_names)
+    }
     # remember values of first columns
     variable <- result[[1]]["Variable"]
     # number of observation, based on weights
     if (!is.null(weights)) {
       n_obs <- round(as.vector(stats::xtabs(data[[weights]] ~ data[[group_by]])))
     } else {
-      n_obs <- as.vector(table(data[[group_by]]))
+      n_obs <- as.vector(table(data[group_by]))
     }
     # column names for groups
     cn <- sprintf("%s (n=%g)", names(result), n_obs)
     # just extract summary columns
     summaries <- do.call(cbind, lapply(result, function(i) i["Summary"]))
     colnames(summaries) <- cn
     # generate data for total column, but make sure to remove missings
-    total_data <- data[!is.na(data[[group_by]]), unique(c(variables, group_by))]
+    total_data <- data[stats::complete.cases(data[group_by]), unique(c(variables, group_by))]
     # bind all together, including total column
     final <- cbind(
       variable,
@@ -189,9 +205,9 @@ report_sample <- function(data,
     }
     # define total N, based on weights
     if (!is.null(weights)) {
-      total_n <- round(sum(as.vector(table(data[[group_by]]))) * mean(data[[weights]], na.rm = TRUE))
+      total_n <- round(sum(as.vector(table(data[group_by]))) * mean(data[[weights]], na.rm = TRUE))
     } else {
-      total_n <- sum(as.vector(table(data[[group_by]])))
+      total_n <- sum(as.vector(table(data[group_by])))
     }
     # add N to column name
     colnames(final)[ncol(final)] <- sprintf(

diff --git a/man/report-package.Rd b/man/report-package.Rd
diff --git a/man/report_sample.Rd b/man/report_sample.Rd
diff --git a/tests/testthat/_snaps/report_sample.md b/tests/testthat/_snaps/report_sample.md
@@ -0,0 +1,12 @@
+# report_sample, error on more than one grouping variable
+
+    Code
+      out
+    Output
+      # Descriptive Statistics
+      
+      Variable               | setosa, a (n=16) | versicolor, a (n=17) | virginica, a (n=9) | setosa, b (n=15) | versicolor, b (n=17) | virginica, b (n=22) | setosa, c (n=19) | versicolor, c (n=16) | virginica, c (n=19) | Total (n=150)
+      -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+      Mean Sepal.Length (SD) |      5.13 (0.36) |          6.08 (0.48) |        6.59 (0.44) |      4.91 (0.31) |          5.84 (0.54) |         6.77 (0.63) |      4.97 (0.36) |          5.88 (0.53) |         6.38 (0.69) |   5.84 (0.83)
+      Mean Sepal.Width (SD)  |      3.50 (0.42) |          2.76 (0.40) |        3.02 (0.25) |      3.34 (0.25) |          2.81 (0.25) |         3.05 (0.36) |      3.44 (0.43) |          2.74 (0.29) |         2.87 (0.30) |   3.06 (0.44)
+
diff --git a/tests/testthat/test-report_sample.R b/tests/testthat/test-report_sample.R
@@ -32,6 +32,18 @@ test_that("report_sample weights, coorect weighted N", {
       "x [c], % |    33.3 |    33.3 |         33.3"
     )
   )
+
+  d <- data.frame(
+    x = c("a", "a", "a", "a", "b", "b", "b", "b", "c", "c", "c", "c"),
+    g1 = c(1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2),
+    g2 = c(3, 2, 1, 3, 2, 1, 3, 2, 1, 3, 2, 1),
+    w = c(0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2, 1, 1, 1.5, 1.5),
+    stringsAsFactors = FALSE
+  )
+  expect_error(
+    report_sample(d, select = "x", group_by = c("g1", "g2"), weights = "w"),
+    regex = "Cannot apply"
+  )
 })
 
 test_that("report_sample check input", {
@@ -241,9 +253,15 @@ test_that("report_sample grouped data frames", {
 })
 
 test_that("report_sample, error on more than one grouping variable", {
-  data(mtcars)
-  expect_error(
-    report_sample(mtcars, group_by = c("vs", "gear"), select = c("hp", "mpg")),
-    regex = "only works"
-  )
+  data(iris)
+  set.seed(123)
+  iris$grp <- sample(letters[1:3], nrow(iris), TRUE)
+  out <- report_sample(
+    iris,
+    group_by = c("Species", "grp"),
+    select = c("Sepal.Length", "Sepal.Width")
+  )
+  # verified against
+  expected <- aggregate(iris["Sepal.Length"], iris[c("Species", "grp")], mean)
+  expect_snapshot(out)
 })