Skip to content

Commit

Permalink
report_sample allows group_by > 1
Browse files Browse the repository at this point in the history
  • Loading branch information
strengejacke committed Jun 14, 2023
1 parent 9c01ea2 commit ffff602
Show file tree
Hide file tree
Showing 7 changed files with 76 additions and 25 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: report
Title: Automated Reporting of Results and Statistical Models
Version: 0.5.7.5
Version: 0.5.7.6
Authors@R:
c(person(given = "Dominique",
family = "Makowski",
Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ Minor changes
* `report_sample` now works on grouped data frame, using the defined groups as
values for the `group_by` argument.

* `report_sample` can now summarize data based on more than one grouping variable
(i.e. `group_by` is allowed to be longer than 1).

# report 0.5.7

Hotfix for CRAN reverse dependency compatibility.
Expand Down
50 changes: 33 additions & 17 deletions R/report_sample.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
#' Create sample description table (also referred to as "Table 1").
#'
#' @param data A data frame for which descriptive statistics should be created.
#' @param group_by Character vector, indicating the column for possible grouping
#' of the descriptive table.
#' @param group_by Character vector, indicating the column(s) for possible grouping
#' of the descriptive table. Note that weighting (see `weights`) does not work
#' wirh more than one grouping column.
#' @param centrality Character, indicates the statistics that should be
#' calculated for numeric variables. May be `"mean"` (for mean and
#' standard deviation) or `"median"` (for median and median absolute
Expand Down Expand Up @@ -120,16 +121,13 @@ report_sample <- function(data,
group_by <- setdiff(colnames(attributes(data)$groups), ".rows")
}

# group_by is not allow to be longer than 1 element
if (length(group_by) > 1) {
insight::format_error(
"`report_sample` only works for one grouping variable.",
"Thus, `group_by` must be of length 1, or data frames are only allowed to be grouped by one variable."
)
}

# grouped by?
grouping <- !is.null(group_by) && group_by %in% colnames(data)
grouping <- !is.null(group_by) && all(group_by %in% colnames(data))

# sanity check - weights and grouping
if (!is.null(group_by) && length(group_by) > 1 && !is.null(weights)) {
insight::format_error("Cannot apply `weights` when grouping is done by more than one variable.")
}

# character to factor
data[] <- lapply(data, function(i) {
Expand All @@ -139,9 +137,12 @@ report_sample <- function(data,
i
})

# coerce group_by columns to factor
groups <- as.data.frame(lapply(data[group_by], factor))

out <- if (isTRUE(grouping)) {
result <- lapply(split(data[variables], factor(data[[group_by]])), function(x) {
x[[group_by]] <- NULL
result <- lapply(split(data[variables], groups), function(x) {
x[group_by] <- NULL
.generate_descriptive_table(
x,
centrality,
Expand All @@ -153,21 +154,36 @@ report_sample <- function(data,
ci_correct
)
})
# for more than one group, fix column names. we don't want "a.b (n=10)",
# but rather ""a, b (n=10)""
if (length(group_by) > 1) {
old_names <- datawizard::data_unite(
unique(groups),
new_column = ".old_names",
separator = "."
)[[".old_names"]]
new_names <- datawizard::data_unite(
unique(groups),
new_column = ".new_names",
separator = ", "
)[[".new_names"]]
result <- datawizard::data_rename(result, pattern = old_names, replacement = new_names)
}
# remember values of first columns
variable <- result[[1]]["Variable"]
# number of observation, based on weights
if (!is.null(weights)) {
n_obs <- round(as.vector(stats::xtabs(data[[weights]] ~ data[[group_by]])))
} else {
n_obs <- as.vector(table(data[[group_by]]))
n_obs <- as.vector(table(data[group_by]))
}
# column names for groups
cn <- sprintf("%s (n=%g)", names(result), n_obs)
# just extract summary columns
summaries <- do.call(cbind, lapply(result, function(i) i["Summary"]))
colnames(summaries) <- cn
# generate data for total column, but make sure to remove missings
total_data <- data[!is.na(data[[group_by]]), unique(c(variables, group_by))]
total_data <- data[stats::complete.cases(data[group_by]), unique(c(variables, group_by))]
# bind all together, including total column
final <- cbind(
variable,
Expand All @@ -189,9 +205,9 @@ report_sample <- function(data,
}
# define total N, based on weights
if (!is.null(weights)) {
total_n <- round(sum(as.vector(table(data[[group_by]]))) * mean(data[[weights]], na.rm = TRUE))
total_n <- round(sum(as.vector(table(data[group_by]))) * mean(data[[weights]], na.rm = TRUE))
} else {
total_n <- sum(as.vector(table(data[[group_by]])))
total_n <- sum(as.vector(table(data[group_by])))
}
# add N to column name
colnames(final)[ncol(final)] <- sprintf(
Expand Down
1 change: 1 addition & 0 deletions man/report-package.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions man/report_sample.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions tests/testthat/_snaps/report_sample.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# report_sample, error on more than one grouping variable

Code
out
Output
# Descriptive Statistics
Variable | setosa, a (n=16) | versicolor, a (n=17) | virginica, a (n=9) | setosa, b (n=15) | versicolor, b (n=17) | virginica, b (n=22) | setosa, c (n=19) | versicolor, c (n=16) | virginica, c (n=19) | Total (n=150)
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Mean Sepal.Length (SD) | 5.13 (0.36) | 6.08 (0.48) | 6.59 (0.44) | 4.91 (0.31) | 5.84 (0.54) | 6.77 (0.63) | 4.97 (0.36) | 5.88 (0.53) | 6.38 (0.69) | 5.84 (0.83)
Mean Sepal.Width (SD) | 3.50 (0.42) | 2.76 (0.40) | 3.02 (0.25) | 3.34 (0.25) | 2.81 (0.25) | 3.05 (0.36) | 3.44 (0.43) | 2.74 (0.29) | 2.87 (0.30) | 3.06 (0.44)

28 changes: 23 additions & 5 deletions tests/testthat/test-report_sample.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,18 @@ test_that("report_sample weights, coorect weighted N", {
"x [c], % | 33.3 | 33.3 | 33.3"
)
)

d <- data.frame(
x = c("a", "a", "a", "a", "b", "b", "b", "b", "c", "c", "c", "c"),
g1 = c(1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2),
g2 = c(3, 2, 1, 3, 2, 1, 3, 2, 1, 3, 2, 1),
w = c(0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2, 1, 1, 1.5, 1.5),
stringsAsFactors = FALSE
)
expect_error(
report_sample(d, select = "x", group_by = c("g1", "g2"), weights = "w"),
regex = "Cannot apply"
)
})

test_that("report_sample check input", {
Expand Down Expand Up @@ -241,9 +253,15 @@ test_that("report_sample grouped data frames", {
})

test_that("report_sample, error on more than one grouping variable", {
data(mtcars)
expect_error(
report_sample(mtcars, group_by = c("vs", "gear"), select = c("hp", "mpg")),
regex = "only works"
)
data(iris)
set.seed(123)
iris$grp <- sample(letters[1:3], nrow(iris), TRUE)
out <- report_sample(
iris,
group_by = c("Species", "grp"),
select = c("Sepal.Length", "Sepal.Width")
)
# verified against
expected <- aggregate(iris["Sepal.Length"], iris[c("Species", "grp")], mean)
expect_snapshot(out)
})

0 comments on commit ffff602

Please sign in to comment.