-
-
Notifications
You must be signed in to change notification settings - Fork 14
/
descriptives.R
191 lines (179 loc) · 6.5 KB
/
descriptives.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# distribution_mode ----------------------------------
#' Compute mode for a statistical distribution
#'
#' @param x An atomic vector, a list, or a data frame.
#'
#' @return
#'
#' The value that appears most frequently in the provided data.
#' The returned data structure will be the same as the entered one.
#'
#' @seealso For continuous variables, the
#' **Highest Maximum a Posteriori probability estimate (MAP)** may be
#' a more useful way to estimate the most commonly-observed value
#' than the mode. See [bayestestR::map_estimate()].
#'
#' @examples
#'
#' distribution_mode(c(1, 2, 3, 3, 4, 5))
#' distribution_mode(c(1.5, 2.3, 3.7, 3.7, 4.0, 5))
#'
#' @export
distribution_mode <- function(x) {
# TODO: Add support for weights, trim, binned (method)
uniqv <- unique(x)
tab <- tabulate(match(x, uniqv))
idx <- which.max(tab)
uniqv[idx]
}
#' Compute the coefficient of variation
#'
#' Compute the coefficient of variation (CV, ratio of the standard deviation to
#' the mean, \eqn{\sigma/\mu}) for a set of numeric values.
#'
#' @return The computed coefficient of variation for `x`.
#' @export
#'
#' @examples
#' coef_var(1:10)
#' coef_var(c(1:10, 100), method = "median_mad")
#' coef_var(c(1:10, 100), method = "qcd")
#' coef_var(mu = 10, sigma = 20)
#' coef_var(mu = 10, sigma = 20, method = "unbiased", n = 30)
coef_var <- function(x, ...) {
UseMethod("coef_var")
}
#' @name distribution_cv
#' @rdname coef_var
#' @export
distribution_coef_var <- coef_var
#' @export
coef_var.default <- function(x, verbose = TRUE, ...) {
if (verbose) {
insight::format_warning(
paste0("Can't compute the coefficient of variation objects of class `", class(x)[1], "`.")
)
}
NULL
}
#' @param x A numeric vector of ratio scale (see details), or vector of values than can be coerced to one.
#' @param mu A numeric vector of mean values to use to compute the coefficient
#' of variation. If supplied, `x` is not used to compute the mean.
#' @param sigma A numeric vector of standard deviation values to use to compute the coefficient
#' of variation. If supplied, `x` is not used to compute the SD.
#' @param method Method to use to compute the CV. Can be `"standard"` to compute
#' by dividing the standard deviation by the mean, `"unbiased"` for the
#' unbiased estimator for normally distributed data, or one of two robust
#' alternatives: `"median_mad"` to divide the median by the [stats::mad()],
#' or `"qcd"` (quartile coefficient of dispersion, interquartile range divided
#' by the sum of the quartiles \[twice the midhinge\]: \eqn{(Q_3 - Q_1)/(Q_3 + Q_1)}.
#' @param trim the fraction (0 to 0.5) of values to be trimmed from
#' each end of `x` before the mean and standard deviation (or other measures)
#' are computed. Values of `trim` outside the range of (0 to 0.5) are taken
#' as the nearest endpoint.
#' @param remove_na Logical. Should `NA` values be removed before computing (`TRUE`)
#' or not (`FALSE`, default)?
#' @param na.rm Deprecated. Please use `remove_na` instead.
#' @param n If `method = "unbiased"` and both `mu` and `sigma` are provided (not
#' computed from `x`), what sample size to use to adjust the computed CV
#' for small-sample bias?
#' @param ... Further arguments passed to computation functions.
#'
#' @details
#' CV is only applicable of values taken on a ratio scale: values that have a
#' *fixed* meaningfully defined 0 (which is either the lowest or highest
#' possible value), and that ratios between them are interpretable For example,
#' how many sandwiches have I eaten this week? 0 means "none" and 20 sandwiches
#' is 4 times more than 5 sandwiches. If I were to center the number of
#' sandwiches, it will no longer be on a ratio scale (0 is no "none" it is the
#' mean, and the ratio between 4 and -2 is not meaningful). Scaling a ratio
#' scale still results in a ratio scale. So I can re define "how many half
#' sandwiches did I eat this week ( = sandwiches * 0.5) and 0 would still mean
#' "none", and 20 half-sandwiches is still 4 times more than 5 half-sandwiches.
#'
#' This means that CV is **NOT** invariant to shifting, but it is to scaling:
#' ```{r}
#' sandwiches <- c(0, 4, 15, 0, 0, 5, 2, 7)
#' coef_var(sandwiches)
#'
#' coef_var(sandwiches / 2) # same
#'
#' coef_var(sandwiches + 4) # different! 0 is no longer meaningful!
#' ````
#'
#' @rdname coef_var
#'
#' @export
coef_var.numeric <- function(x, mu = NULL, sigma = NULL,
method = c("standard", "unbiased", "median_mad", "qcd"),
trim = 0, remove_na = FALSE, n = NULL, na.rm = FALSE, ...) {
# TODO: remove deprecated argument later
if (!missing(na.rm)) {
insight::format_warning("Argument `na.rm` is deprecated. Please use `remove_na` instead.")
remove_na <- na.rm
}
# TODO: Support weights
if (!missing(x) && all(c(-1, 1) %in% sign(x))) {
insight::format_error("Coefficient of variation only applicable for ratio scale variables.")
}
method <- match.arg(method, choices = c("standard", "unbiased", "median_mad", "qcd"))
if (is.null(mu) || is.null(sigma)) {
if (isTRUE(remove_na)) {
x <- .drop_na(x)
}
n <- length(x)
x <- .trim_values(x, trim = trim, n = n)
}
if (is.null(mu)) {
mu <- switch(method,
standard = ,
unbiased = mean(x, ...),
median_mad = stats::median(x, ...),
qcd = unname(sum(stats::quantile(x, probs = c(0.25, 0.75), ...)))
)
}
if (is.null(sigma)) {
sigma <- switch(method,
standard = ,
unbiased = stats::sd(x, ...),
median_mad = stats::mad(x, center = mu, ...),
qcd = unname(diff(stats::quantile(x, probs = c(0.25, 0.75), ...)))
)
}
out <- sigma / mu
if (method == "unbiased") {
if (is.null(n)) {
insight::format_error(
"A value for `n` must be provided when `method = \"unbiased\"` and both `mu` and `sigma` are provided."
)
}
# from DescTools::CoefVar
out <- out * (1 - 1 / (4 * (n - 1)) + 1 / n * out^2 + 1 / (2 * (n - 1)^2))
}
return(out)
}
# descriptives helpers
.drop_na <- function(x) {
x[!is.na(x)]
}
.trim_values <- function(x, trim = 0, n = NULL, weights = NULL) {
# TODO: Support weights
if (!is.numeric(trim) || length(trim) != 1L) {
insight::format_error("`trim` must be a single numeric value.")
}
if (is.null(n)) {
n <- length(x)
}
if (trim > 0 && n) {
if (anyNA(x)) {
return(NA_real_)
}
if (trim >= 0.5) {
return(stats::median(x, na.rm = FALSE))
}
lo <- floor(n * trim) + 1
hi <- n + 1 - lo
x <- sort.int(x, partial = unique(c(lo, hi)))[lo:hi]
}
x
}