From ffff602a9f7ffa957438e8da497cec830d710cc8 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 14 Jun 2023 12:21:21 +0200 Subject: [PATCH] report_sample allows group_by > 1 --- DESCRIPTION | 2 +- NEWS.md | 3 ++ R/report_sample.R | 50 +++++++++++++++++--------- man/report-package.Rd | 1 + man/report_sample.Rd | 5 +-- tests/testthat/_snaps/report_sample.md | 12 +++++++ tests/testthat/test-report_sample.R | 28 ++++++++++++--- 7 files changed, 76 insertions(+), 25 deletions(-) create mode 100644 tests/testthat/_snaps/report_sample.md diff --git a/DESCRIPTION b/DESCRIPTION index 9e5f1203..500a293f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: report Title: Automated Reporting of Results and Statistical Models -Version: 0.5.7.5 +Version: 0.5.7.6 Authors@R: c(person(given = "Dominique", family = "Makowski", diff --git a/NEWS.md b/NEWS.md index 55817259..0d0cb2d8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,6 +12,9 @@ Minor changes * `report_sample` now works on grouped data frame, using the defined groups as values for the `group_by` argument. +* `report_sample` can now summarize data based on more than one grouping variable + (i.e. `group_by` is allowed to be longer than 1). + # report 0.5.7 Hotfix for CRAN reverse dependency compatibility. diff --git a/R/report_sample.R b/R/report_sample.R index 4101a7c6..66ea5e34 100644 --- a/R/report_sample.R +++ b/R/report_sample.R @@ -3,8 +3,9 @@ #' Create sample description table (also referred to as "Table 1"). #' #' @param data A data frame for which descriptive statistics should be created. -#' @param group_by Character vector, indicating the column for possible grouping -#' of the descriptive table. +#' @param group_by Character vector, indicating the column(s) for possible grouping +#' of the descriptive table. Note that weighting (see `weights`) does not work +#' wirh more than one grouping column. #' @param centrality Character, indicates the statistics that should be #' calculated for numeric variables. May be `"mean"` (for mean and #' standard deviation) or `"median"` (for median and median absolute @@ -120,16 +121,13 @@ report_sample <- function(data, group_by <- setdiff(colnames(attributes(data)$groups), ".rows") } - # group_by is not allow to be longer than 1 element - if (length(group_by) > 1) { - insight::format_error( - "`report_sample` only works for one grouping variable.", - "Thus, `group_by` must be of length 1, or data frames are only allowed to be grouped by one variable." - ) - } - # grouped by? - grouping <- !is.null(group_by) && group_by %in% colnames(data) + grouping <- !is.null(group_by) && all(group_by %in% colnames(data)) + + # sanity check - weights and grouping + if (!is.null(group_by) && length(group_by) > 1 && !is.null(weights)) { + insight::format_error("Cannot apply `weights` when grouping is done by more than one variable.") + } # character to factor data[] <- lapply(data, function(i) { @@ -139,9 +137,12 @@ report_sample <- function(data, i }) + # coerce group_by columns to factor + groups <- as.data.frame(lapply(data[group_by], factor)) + out <- if (isTRUE(grouping)) { - result <- lapply(split(data[variables], factor(data[[group_by]])), function(x) { - x[[group_by]] <- NULL + result <- lapply(split(data[variables], groups), function(x) { + x[group_by] <- NULL .generate_descriptive_table( x, centrality, @@ -153,13 +154,28 @@ report_sample <- function(data, ci_correct ) }) + # for more than one group, fix column names. we don't want "a.b (n=10)", + # but rather ""a, b (n=10)"" + if (length(group_by) > 1) { + old_names <- datawizard::data_unite( + unique(groups), + new_column = ".old_names", + separator = "." + )[[".old_names"]] + new_names <- datawizard::data_unite( + unique(groups), + new_column = ".new_names", + separator = ", " + )[[".new_names"]] + result <- datawizard::data_rename(result, pattern = old_names, replacement = new_names) + } # remember values of first columns variable <- result[[1]]["Variable"] # number of observation, based on weights if (!is.null(weights)) { n_obs <- round(as.vector(stats::xtabs(data[[weights]] ~ data[[group_by]]))) } else { - n_obs <- as.vector(table(data[[group_by]])) + n_obs <- as.vector(table(data[group_by])) } # column names for groups cn <- sprintf("%s (n=%g)", names(result), n_obs) @@ -167,7 +183,7 @@ report_sample <- function(data, summaries <- do.call(cbind, lapply(result, function(i) i["Summary"])) colnames(summaries) <- cn # generate data for total column, but make sure to remove missings - total_data <- data[!is.na(data[[group_by]]), unique(c(variables, group_by))] + total_data <- data[stats::complete.cases(data[group_by]), unique(c(variables, group_by))] # bind all together, including total column final <- cbind( variable, @@ -189,9 +205,9 @@ report_sample <- function(data, } # define total N, based on weights if (!is.null(weights)) { - total_n <- round(sum(as.vector(table(data[[group_by]]))) * mean(data[[weights]], na.rm = TRUE)) + total_n <- round(sum(as.vector(table(data[group_by]))) * mean(data[[weights]], na.rm = TRUE)) } else { - total_n <- sum(as.vector(table(data[[group_by]]))) + total_n <- sum(as.vector(table(data[group_by]))) } # add N to column name colnames(final)[ncol(final)] <- sprintf( diff --git a/man/report-package.Rd b/man/report-package.Rd index 9bcfb300..68600e2f 100644 --- a/man/report-package.Rd +++ b/man/report-package.Rd @@ -39,6 +39,7 @@ Authors: Other contributors: \itemize{ \item Rudolf Siegel \email{mutlusun@users.noreply.github.com} (\href{https://orcid.org/0000-0002-6021-804X}{ORCID}) [contributor] + \item Camden Bock \email{camden.bock@maine.edu} (\href{https://orcid.org/0000-0002-3907-7748}{ORCID}) [contributor] } } diff --git a/man/report_sample.Rd b/man/report_sample.Rd index bc65733a..a27aae74 100644 --- a/man/report_sample.Rd +++ b/man/report_sample.Rd @@ -23,8 +23,9 @@ report_sample( \arguments{ \item{data}{A data frame for which descriptive statistics should be created.} -\item{group_by}{Character vector, indicating the column for possible grouping -of the descriptive table.} +\item{group_by}{Character vector, indicating the column(s) for possible grouping +of the descriptive table. Note that weighting (see \code{weights}) does not work +wirh more than one grouping column.} \item{centrality}{Character, indicates the statistics that should be calculated for numeric variables. May be \code{"mean"} (for mean and diff --git a/tests/testthat/_snaps/report_sample.md b/tests/testthat/_snaps/report_sample.md new file mode 100644 index 00000000..8b84b72e --- /dev/null +++ b/tests/testthat/_snaps/report_sample.md @@ -0,0 +1,12 @@ +# report_sample, error on more than one grouping variable + + Code + out + Output + # Descriptive Statistics + + Variable | setosa, a (n=16) | versicolor, a (n=17) | virginica, a (n=9) | setosa, b (n=15) | versicolor, b (n=17) | virginica, b (n=22) | setosa, c (n=19) | versicolor, c (n=16) | virginica, c (n=19) | Total (n=150) + ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Mean Sepal.Length (SD) | 5.13 (0.36) | 6.08 (0.48) | 6.59 (0.44) | 4.91 (0.31) | 5.84 (0.54) | 6.77 (0.63) | 4.97 (0.36) | 5.88 (0.53) | 6.38 (0.69) | 5.84 (0.83) + Mean Sepal.Width (SD) | 3.50 (0.42) | 2.76 (0.40) | 3.02 (0.25) | 3.34 (0.25) | 2.81 (0.25) | 3.05 (0.36) | 3.44 (0.43) | 2.74 (0.29) | 2.87 (0.30) | 3.06 (0.44) + diff --git a/tests/testthat/test-report_sample.R b/tests/testthat/test-report_sample.R index 17b7e266..b22952ec 100644 --- a/tests/testthat/test-report_sample.R +++ b/tests/testthat/test-report_sample.R @@ -32,6 +32,18 @@ test_that("report_sample weights, coorect weighted N", { "x [c], % | 33.3 | 33.3 | 33.3" ) ) + + d <- data.frame( + x = c("a", "a", "a", "a", "b", "b", "b", "b", "c", "c", "c", "c"), + g1 = c(1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2), + g2 = c(3, 2, 1, 3, 2, 1, 3, 2, 1, 3, 2, 1), + w = c(0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2, 1, 1, 1.5, 1.5), + stringsAsFactors = FALSE + ) + expect_error( + report_sample(d, select = "x", group_by = c("g1", "g2"), weights = "w"), + regex = "Cannot apply" + ) }) test_that("report_sample check input", { @@ -241,9 +253,15 @@ test_that("report_sample grouped data frames", { }) test_that("report_sample, error on more than one grouping variable", { - data(mtcars) - expect_error( - report_sample(mtcars, group_by = c("vs", "gear"), select = c("hp", "mpg")), - regex = "only works" - ) + data(iris) + set.seed(123) + iris$grp <- sample(letters[1:3], nrow(iris), TRUE) + out <- report_sample( + iris, + group_by = c("Species", "grp"), + select = c("Sepal.Length", "Sepal.Width") + ) + # verified against + expected <- aggregate(iris["Sepal.Length"], iris[c("Species", "grp")], mean) + expect_snapshot(out) })