/
across.R
438 lines (384 loc) · 13.3 KB
/
across.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
#' Apply a function (or functions) across multiple columns
#'
#' @description
#' `across()` makes it easy to apply the same transformation to multiple
#' columns, allowing you to use [select()] semantics inside in "data-masking"
#' functions like [summarise()] and [mutate()]. See `vignette("colwise")` for
#' more details.
#'
#' `if_any()` and `if_all()` apply the same
#' predicate function to a selection of columns and combine the
#' results into a single logical vector.
#'
#' `across()` supersedes the family of "scoped variants" like
#' `summarise_at()`, `summarise_if()`, and `summarise_all()`.
#'
#' @param cols,.cols <[`tidy-select`][dplyr_tidy_select]> Columns to transform.
#' Because `across()` is used within functions like `summarise()` and
#' `mutate()`, you can't select or compute upon grouping variables.
#' @param .fns Functions to apply to each of the selected columns.
#' Possible values are:
#'
#' - `NULL`, to returns the columns untransformed.
#' - A function, e.g. `mean`.
#' - A purrr-style lambda, e.g. `~ mean(.x, na.rm = TRUE)`
#' - A list of functions/lambdas, e.g.
#' `list(mean = mean, n_miss = ~ sum(is.na(.x))`
#'
#' Within these functions you can use [cur_column()] and [cur_group()]
#' to access the current column and grouping keys respectively.
#' @param ... Additional arguments for the function calls in `.fns`.
#' @param .names A glue specification that describes how to name the output
#' columns. This can use `{.col}` to stand for the selected column name, and
#' `{.fn}` to stand for the name of the function being applied. The default
#' (`NULL`) is equivalent to `"{.col}"` for the single function case and
#' `"{.col}_{.fn}"` for the case where a list is used for `.fns`.
#'
#' @returns
#' `across()` returns a tibble with one column for each column in `.cols` and each function in `.fns`.
#'
#' `if_any()` and `if_all()` return a logical vector.
#' @examples
#' # across() -----------------------------------------------------------------
#' # Different ways to select the same set of columns
#' # See <https://tidyselect.r-lib.org/articles/syntax.html> for details
#' iris %>%
#' as_tibble() %>%
#' mutate(across(c(Sepal.Length, Sepal.Width), round))
#' iris %>%
#' as_tibble() %>%
#' mutate(across(c(1, 2), round))
#' iris %>%
#' as_tibble() %>%
#' mutate(across(1:Sepal.Width, round))
#' iris %>%
#' as_tibble() %>%
#' mutate(across(where(is.double) & !c(Petal.Length, Petal.Width), round))
#'
#' # A purrr-style formula
#' iris %>%
#' group_by(Species) %>%
#' summarise(across(starts_with("Sepal"), ~mean(.x, na.rm = TRUE)))
#'
#' # A named list of functions
#' iris %>%
#' group_by(Species) %>%
#' summarise(across(starts_with("Sepal"), list(mean = mean, sd = sd)))
#'
#' # Use the .names argument to control the output names
#' iris %>%
#' group_by(Species) %>%
#' summarise(across(starts_with("Sepal"), mean, .names = "mean_{.col}"))
#' iris %>%
#' group_by(Species) %>%
#' summarise(across(starts_with("Sepal"), list(mean = mean, sd = sd), .names = "{.col}.{.fn}"))
#'
#' # When the list is not named, .fn is replaced by the function's position
#' iris %>%
#' group_by(Species) %>%
#' summarise(across(starts_with("Sepal"), list(mean, sd), .names = "{.col}.fn{.fn}"))
#'
#' # if_any() and if_all() ----------------------------------------------------
#' iris %>%
#' filter(if_any(ends_with("Width"), ~ . > 4))
#' iris %>%
#' filter(if_all(ends_with("Width"), ~ . > 2))
#'
#' @export
#' @seealso [c_across()] for a function that returns a vector
across <- function(.cols = everything(), .fns = NULL, ..., .names = NULL) {
key <- key_deparse(sys.call())
setup <- across_setup({{ .cols }}, fns = .fns, names = .names, key = key, .caller_env = caller_env())
vars <- setup$vars
if (length(vars) == 0L) {
return(new_tibble(list(), nrow = 1L))
}
fns <- setup$fns
names <- setup$names
mask <- peek_mask()
data <- mask$current_cols(vars)
if (is.null(fns)) {
nrow <- length(mask$current_rows())
data <- new_data_frame(data, n = nrow, class = c("tbl_df", "tbl"))
if (is.null(names)) {
return(data)
} else {
return(set_names(data, names))
}
}
n_cols <- length(data)
n_fns <- length(fns)
seq_n_cols <- seq_len(n_cols)
seq_fns <- seq_len(n_fns)
k <- 1L
out <- vector("list", n_cols * n_fns)
# Reset `cur_column()` info on exit
old_var <- context_peek_bare("column")
on.exit(context_poke("column", old_var), add = TRUE)
# Loop in such an order that all functions are applied
# to a single column before moving on to the next column
for (i in seq_n_cols) {
var <- vars[[i]]
col <- data[[i]]
context_poke("column", var)
for (j in seq_fns) {
fn <- fns[[j]]
out[[k]] <- fn(col, ...)
k <- k + 1L
}
}
size <- vec_size_common(!!!out)
out <- vec_recycle_common(!!!out, .size = size)
names(out) <- names
new_data_frame(out, n = size, class = c("tbl_df", "tbl"))
}
#' @rdname across
#' @export
if_any <- function(.cols = everything(), .fns = NULL, ..., .names = NULL) {
df <- across({{ .cols }}, .fns = .fns, ..., .names = .names)
n <- nrow(df)
df <- vec_cast_common(!!!df, .to = logical())
.Call(dplyr_reduce_lgl_or, df, n)
}
#' @rdname across
#' @export
if_all <- function(.cols = everything(), .fns = NULL, ..., .names = NULL) {
df <- across({{ .cols }}, .fns = .fns, ..., .names = .names)
n <- nrow(df)
df <- vec_cast_common(!!!df, .to = logical())
.Call(dplyr_reduce_lgl_and, df, n)
}
#' Combine values from multiple columns
#'
#' @description
#' `c_across()` is designed to work with [rowwise()] to make it easy to
#' perform row-wise aggregations. It has two differences from `c()`:
#'
#' * It uses tidy select semantics so you can easily select multiple variables.
#' See `vignette("rowwise")` for more details.
#'
#' * It uses [vctrs::vec_c()] in order to give safer outputs.
#'
#' @inheritParams across
#' @seealso [across()] for a function that returns a tibble.
#' @export
#' @examples
#' df <- tibble(id = 1:4, w = runif(4), x = runif(4), y = runif(4), z = runif(4))
#' df %>%
#' rowwise() %>%
#' mutate(
#' sum = sum(c_across(w:z)),
#' sd = sd(c_across(w:z))
#' )
c_across <- function(cols = everything()) {
key <- key_deparse(sys.call())
vars <- c_across_setup({{ cols }}, key = key)
mask <- peek_mask("c_across()")
cols <- mask$current_cols(vars)
vec_c(!!!cols, .name_spec = zap())
}
across_glue_mask <- function(.col, .fn, .caller_env) {
glue_mask <- env(.caller_env, .col = .col, .fn = .fn)
# TODO: we can make these bindings louder later
env_bind_active(
glue_mask, col = function() glue_mask$.col, fn = function() glue_mask$.fn
)
glue_mask
}
# TODO: The usage of a cache in `across_setup()` and `c_across_setup()` is a stopgap solution, and
# this idea should not be used anywhere else. This should be replaced by the
# next version of hybrid evaluation, which should offer a way for any function
# to do any required "set up" work (like the `eval_select()` call) a single
# time per top-level call, rather than once per group.
across_setup <- function(cols, fns, names, key, .caller_env) {
mask <- peek_mask("across()")
value <- mask$across_cache_get(key)
if (is.null(value)) {
value <- across_setup_impl({{ cols }},
fns = fns, names = names, .caller_env = .caller_env, mask = mask,
.top_level = FALSE
)
mask$across_cache_add(key, value)
}
value
}
across_setup_impl <- function(cols, fns, names, .caller_env, mask = peek_mask("across()"), .top_level = FALSE) {
cols <- enquo(cols)
if (.top_level) {
# FIXME: this is a little bit hacky to make top_across()
# work, otherwise mask$across_cols() fails when calling
# self$current_cols(across_vars_used)
# it should not affect anything because it is expected that
# across_setup() is only ever called on the first group anyway
# but perhaps it is time to review how across_cols() work
mask$set_current_group(1L)
}
# `across()` is evaluated in a data mask so we need to remove the
# mask layer from the quosure environment (#5460)
cols <- quo_set_env(cols, data_mask_top(quo_get_env(cols), recursive = FALSE, inherit = TRUE))
vars <- tidyselect::eval_select(cols, data = mask$across_cols())
vars <- names(vars)
if (is.null(fns)) {
if (!is.null(names)) {
glue_mask <- across_glue_mask(.caller_env, .col = vars, .fn = "1")
names <- vec_as_names(glue(names, .envir = glue_mask), repair = "check_unique")
}
value <- list(vars = vars, fns = fns, names = names)
return(value)
}
# apply `.names` smart default
if (is.function(fns) || is_formula(fns)) {
names <- names %||% "{.col}"
fns <- list("1" = fns)
} else {
names <- names %||% "{.col}_{.fn}"
}
if (!is.list(fns)) {
abort(c("Problem with `across()` input `.fns`.",
i = "Input `.fns` must be NULL, a function, a formula, or a list of functions/formulas."
))
}
fns <- map(fns, as_function)
# make sure fns has names, use number to replace unnamed
if (is.null(names(fns))) {
names_fns <- seq_along(fns)
} else {
names_fns <- names(fns)
empties <- which(names_fns == "")
if (length(empties)) {
names_fns[empties] <- empties
}
}
glue_mask <- glue_mask <- across_glue_mask(.caller_env,
.col = rep(vars, each = length(fns)),
.fn = rep(names_fns, length(vars))
)
names <- vec_as_names(glue(names, .envir = glue_mask), repair = "check_unique")
list(vars = vars, fns = fns, names = names)
}
# FIXME: This pattern should be encapsulated by rlang
data_mask_top <- function(env, recursive = FALSE, inherit = FALSE) {
while (env_has(env, ".__tidyeval_data_mask__.", inherit = inherit)) {
env <- env_parent(env_get(env, ".top_env", inherit = inherit))
if (!recursive) {
return(env)
}
}
env
}
c_across_setup <- function(cols, key) {
mask <- peek_mask("c_across()")
value <- mask$across_cache_get(key)
if (!is.null(value)) {
return(value)
}
cols <- enquo(cols)
across_cols <- mask$across_cols()
vars <- tidyselect::eval_select(expr(!!cols), across_cols)
value <- names(vars)
mask$across_cache_add(key, value)
value
}
key_deparse <- function(key) {
deparse(key, width.cutoff = 500L, backtick = TRUE, nlines = 1L, control = NULL)
}
# When mutate() or summarise() have an unnamed call to across() at the top level, e.g.
# summarise(across(<...>)) or mutate(across(<...>))
#
# a call to top_across(<...>) is evaluated instead.
# top_across() returns a flattened list of expressions along with some
# information about the "current column" for each expression
# in the "columns" attribute:
#
# For example with: summarise(across(c(x, y), mean, .names = "mean_{.col}")) top_across() will return
# something like:
#
# structure(
# list(mean_x = expr(mean(x)), mean_y = expr(mean(y)))
# columns = c("x", "y")
# )
top_across <- function(.cols = everything(), .fns = NULL, ..., .names = NULL) {
setup <- across_setup_impl(
{{ .cols }},
fns = .fns, names = .names, .caller_env = caller_env(),
.top_level = TRUE
)
vars <- setup$vars
# nothing
if (length(vars) == 0L) {
return(list())
}
fns <- setup$fns
names <- setup$names
# no functions, so just return a list of symbols
if (is.null(fns)) {
expressions <- syms(vars)
names(expressions) <- if (is.null(.names)) vars else names
return(expressions)
}
n_vars <- length(vars)
n_fns <- length(fns)
seq_vars <- seq_len(n_vars)
seq_fns <- seq_len(n_fns)
expressions <- vector(mode = "list", n_vars * n_fns)
columns <- character(n_vars * n_fns)
extra_args <- list(...)
k <- 1L
for (i in seq_vars) {
var <- vars[[i]]
for (j in seq_fns) {
fn <- fns[[j]]
call <- call2(fn, sym(var), !!!extra_args)
expressions[[k]] <- call
columns[[k]] <- var
k <- k + 1L
}
}
names(expressions) <- names
attr(expressions, "columns") <- columns
expressions
}
new_dplyr_quosure <- function(quo, ...) {
attr(quo, "dplyr:::data") <- list2(...)
quo
}
dplyr_quosures <- function(...) {
quosures <- enquos(..., .ignore_empty = "all")
names_given <- names2(quosures)
names_auto <- names(enquos(..., .named = TRUE, .ignore_empty = "all"))
for (i in seq_along(quosures)) {
quosures[[i]] <- new_dplyr_quosure(quosures[[i]],
name_given = names_given[i],
name_auto = names_auto[i],
is_named = names_given[i] != "",
index = i
)
}
quosures
}
expand_quosure <- function(quo) {
quo_data <- attr(quo, "dplyr:::data")
if (quo_is_call(quo, "across", ns = c("", "dplyr")) && !quo_data$is_named) {
# call top_across() instead of across()
quo_env <- quo_get_env(quo)
quo <- new_quosure(node_poke_car(quo_get_expr(quo), top_across), quo_env)
mask <- peek_mask()
expressions <- eval_tidy(quo, mask$get_rlang_mask(), mask$get_caller_env())
names_expressions <- names(expressions)
# process the results of top_across()
quosures <- vector(mode = "list", length(expressions))
for (j in seq_along(expressions)) {
name <- names_expressions[j]
quosures[[j]] <- new_dplyr_quosure(new_quosure(expressions[[j]], quo_env),
name_given = name,
name_auto = name,
is_named = TRUE,
index = c(quo_data$index, j),
column = attr(expressions, "columns")[j]
)
}
} else {
quosures <- list(quo)
}
quosures
}