Skip to content

Commit

Permalink
tweaks
Browse files Browse the repository at this point in the history
1. change `remove_na` to `show_na` and set default to conditionally show the missing category if any NA are present
2. use the Wilson confidence interval rather than the normal approximation to avoid CIs that escape the [0, 1] bounds
3. namespace ggplot2 functions
  • Loading branch information
bwiernik committed Aug 31, 2023
1 parent 439c2a4 commit 1ca219c
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 39 deletions.
104 changes: 74 additions & 30 deletions R/plot.dw_data_tabulate.R
Original file line number Diff line number Diff line change
@@ -1,29 +1,50 @@
#' Plot tabulated data.
#'
#' @param x Object created by `datawizard::data_tabulate()`.
#' @param value_lab Boolean. Should values and percentages be displayed at the
#' top of each bar.
#' @param remove_na Boolean. Should missing values be dropped?
#' @param na_label The label given to missing values (only when
#' `remove_na = FALSE`).
#' @param error_bar Boolean. Should error bars be displayed?
#' @param label_values Logical. Should values and percentages be displayed at the
#' top of each bar.
#' @param show_na Should missing values be dropped? Can be `"if_any"` (default) to show
#' the missing category only if any missing values are present, `"always"` to
#' always show the missing category, or `"never"` to never show the missing
#' category.
#' @param na_label The label given to missing values when they are shown.
#' @param error_bar Logical. Should error bars be displayed?
#' If `TRUE`, confidence intervals computed using the Wilson method are shown.
#' See Brown et al. (2001) for details.
#' @param ci Confidence Interval (CI) level. Default to `0.95` (⁠95%⁠).
#' @param fill_col Color to use for category columns (default: "#87CEFA").
#' @param color_error_bar Color to use for error bars (default: "#607B8B").
#' @param ... Unused
#'
#' @references
#' Brown, L. D., Cai, T. T., & DasGupta, A. (2001).
#' Interval estimation for a binomial proportion.
#' _Statistical Science, 16_(2), 101–133. \doi{10.1214/ss/1009213286}
#'
#' @rdname plot.dw_data_tabulate
#' @export

plot.dw_data_tabulates <- function(x, value_lab = TRUE, remove_na = FALSE,
na_label = "(Missing)", error_bar = TRUE,
plot.dw_data_tabulates <- function(x, label_values = TRUE,
show_na = c("if_any", "always", "never"),
na_label = "(Missing)",
error_bar = TRUE, ci = .95,
fill_col = "#87CEFA",
color_error_bar = "#607B8B",
...) {
show_na <- match.arg(show_na, choices = c("if_any", "always", "never"))
if (length(x) == 1) {
plot.dw_data_tabulate(
x[[1]], value_lab = value_lab, remove_na = remove_na,
na_label = na_label, error_bar = error_bar
x[[1]], label_values = label_values,
show_na = show_na, na_label = na_label,
error_bar = error_bar, ci = ci,
fill_col = fill_col, color_error_bar = color_error_bar
)
} else {
lapply(x, plot.dw_data_tabulate,
value_lab = value_lab, remove_na = remove_na,
na_label = na_label, error_bar = error_bar
label_values = label_values,
show_na = show_na, na_label = na_label,
error_bar = error_bar, ci = ci,
fill_col = fill_col, color_error_bar = color_error_bar
)
}
}
Expand All @@ -32,12 +53,25 @@ plot.dw_data_tabulates <- function(x, value_lab = TRUE, remove_na = FALSE,
#'
#' @export

plot.dw_data_tabulate <- function(x, value_lab = TRUE, remove_na = FALSE,
na_label = "(Missing)", error_bar = TRUE,
plot.dw_data_tabulate <- function(x, label_values = TRUE,
show_na = c("if_any", "always", "never"),
na_label = "(Missing)",
error_bar = TRUE, ci = .95,
fill_col = "#87CEFA",
color_error_bar = "#607B8B",
...) {
show_na <- match.arg(show_na, choices = c("if_any", "always", "never"))
dat <- as.data.frame(x)

if (isTRUE(remove_na)) {
if (show_na == "if_any") {
if (any(is.na(dat$Value))) {
show_na <- ifelse(dat[is.na(dat$Value), "N"] > 0, "always", "never")
} else {
show_na <- "never"
}
}

if (show_na == "never") {
dat <- dat[!is.na(dat$Value), ]
dat$output <- dat[[which(startsWith(names(dat), "Valid"))]]
} else {
Expand All @@ -54,40 +88,50 @@ plot.dw_data_tabulate <- function(x, value_lab = TRUE, remove_na = FALSE,

if (isTRUE(error_bar)) {
total_n <- sum(dat$N)
rel_frq <- dat$output / 100
ci <- 1.96 * suppressWarnings(sqrt(rel_frq * (1 - rel_frq) / total_n))
dat$upper.ci <- total_n * (rel_frq + ci)
dat$lower.ci <- total_n * (rel_frq - ci)
props <- dat$output / 100
dat <- cbind(dat, CI = ci, .wilson_ci(prop = props, total_n = total_n, ci = ci) * total_n)
dat$label <- paste0(dat$N, " (", round(dat$output, 2), "%)")
} else {
dat$label <- paste0(dat$N, "\n(", round(dat$output, 2), "%)")
}

out <- ggplot(dat, aes(x = .data$Value, y = .data$N)) +
geom_col() +
labs(title = unique(dat$Variable)) +
out <- ggplot2::ggplot(dat) +
ggplot2::aes(x = .data$Value, y = .data$N) +
ggplot2::geom_col(fill = fill_col) +
ggplot2::labs(title = unique(dat$Variable)) +
theme_modern()

if (isTRUE(value_lab)) {
if (isTRUE(label_values)) {
if (isTRUE(error_bar)) {
out <- out +
geom_text(aes(label = .data$label), vjust = -1, hjust = 1.2) +
ylim(c(0, max(dat$N) * 1.5))
ggplot2::geom_text(ggplot2::aes(label = .data$label), vjust = -1, hjust = 1.2) +
ggplot2::coord_cartesian(ylim = c(0, max(dat$CI_high)))
} else {
out <- out +
geom_text(aes(label = .data$label), vjust = -0.5) +
ylim(c(0, max(dat$N) * 1.2))
ggplot2::geom_text(ggplot2::aes(label = .data$label), vjust = -0.5) +
ggplot2::coord_cartesian(ylim = c(0, max(dat$N) * 1.2))
}
}

# add confidence intervals for frequencies
if (isTRUE(error_bar)) {
out <- out +
geom_errorbar(
aes(ymin = .data$lower.ci, ymax = .data$upper.ci),
width = 0.5, color = "darkblue"
ggplot2::geom_linerange(
ggplot2::aes(ymin = .data$CI_low, ymax = .data$CI_high),
color = color_error_bar
)
}

out
}

.wilson_ci <- function(prop, total_n, ci = .95) {
z <- qnorm((1 - ci) / 2, lower.tail = FALSE)
z2 <- z^2
p1 <- prop + 0.5 * z2 / total_n
p2 <- z * sqrt((prop * (1 - prop) + 0.25 * z2 / total_n) / total_n)
p3 <- 1 + z2 / total_n
CI_low <- (p1 - p2) / p3
CI_high <- (p1 + p2) / p3
return(data.frame(CI_low = CI_low, CI_high = CI_high))
}
39 changes: 30 additions & 9 deletions man/plot.dw_data_tabulate.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 1ca219c

Please sign in to comment.