/
too_far.R
185 lines (169 loc) · 5.56 KB
/
too_far.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#' Set rows of data to `NA` if the lie too far from a reference set of values
#'
#' @param smooth an mgcv smooth object
#' @param input data frame containing the input observations and the columns to
#' be set to `NA`
#' @param reference data frame containing the reference values
#' @param cols character vector of columns whose elements will be set to `NA` if
#' the data lies too far from the reference set
#' @param dist numeric, the distance from the reference set beyond which
#' elements of `input` will be set to `NA`
#'
#' @export
`too_far_to_na` <- function(smooth, input, reference, cols, dist = NULL) {
UseMethod("too_far_to_na")
}
#' @export
#' @importFrom dplyr mutate across
#' @importFrom tidyselect all_of
`too_far_to_na.mgcv.smooth` <- function(smooth, input, reference, cols,
dist = NULL) {
# only for smooths of 2D currently
sm_dim <- smooth_dim(smooth)
if (sm_dim < 2L || sm_dim > 3L) {
return(input)
}
# call too_far to identify the observations too far from reference
input <- .call_too_far(
smooth = smooth, input = input,
reference = reference, cols = cols, dist = dist
)
# return
input
}
#' @export
#' @importFrom dplyr mutate across
#' @importFrom tidyselect all_of
`too_far_to_na.t2.smooth` <- function(smooth, input, reference, cols,
dist = NULL) {
# only for smooths of 2D currently
sm_dim <- smooth_dim(smooth)
if (sm_dim < 2L || sm_dim > 3L) {
return(input)
}
# call too_far to identify the observations too far from reference
input <- .call_too_far(
smooth = smooth, input = input,
reference = reference, cols = cols, dist = dist
)
# return
input
}
#' @export
#' @importFrom dplyr mutate across
#' @importFrom tidyselect all_of
`too_far_to_na.tensor.smooth` <- function(smooth, input, reference, cols,
dist = NULL) {
# only for smooths of 2D currently
sm_dim <- smooth_dim(smooth)
if (sm_dim < 2L || sm_dim > 3L) {
return(input)
}
# call too_far to identify the observations too far from reference
input <- .call_too_far(
smooth = smooth, input = input,
reference = reference, cols = cols, dist = dist
)
# return
input
}
#' Exclude values that lie too far from the support of data
#'
#' Identifies pairs of covariate values that lie too far from the original data.
#' The function is currently a basic wrapper around [mgcv::exclude.too.far()].
#'
#' @param x,y numeric; vector of values of the covariates to compare with
#' the observed data
#' @param ref_1,ref_2 numeric; vectors of covariate values that represent the
#' reference against which `x1 and `x2` are compared
#' @param dist if supplied, a numeric vector of length 1 representing the
#' distance from the data beyond which an observation is excluded. For
#' example, you want to exclude values that lie further from an observation
#' than 10% of the range of the observed data, use `0.1`.
#'
#' @return Returns a logical vector of the same length as `x1`.
#'
#' @importFrom mgcv exclude.too.far
#' @export
`too_far` <- function(x, y, ref_1, ref_2, dist = NULL) {
ind <- if (is.null(dist)) {
# If `NULL` keep everything == vector of TRUEs of correct length
rep(TRUE, length.out = length(x))
} else {
# if `dist` is provided, check it is of the correct kind
if (!is.numeric(dist) || !identical(length(dist), 1L)) {
stop("'dist', if provided, must be a single numeric value.",
call. = FALSE
)
}
# call exclude.too.far
mgcv::exclude.too.far(x, y, ref_1, ref_2, dist = dist)
}
# return
ind
}
#' Sets the elements of vector to `NA`
#'
#' Given a vector `i` indexing the elements of `x`, sets the selected elements
#' of `x` to `NA`.
#'
#' @param x vector of values
#' @param i vector of values used to subset `x`
#'
#' @return Returns `x` with possibly some elements set to `NA`
#'
#' @export
`to_na` <- function(x, i) {
# check x and i are of the same length
if (identical(length(x), length(i))) {
# set the indicated elements of `x` to `NA`
x[i] <- NA
} else {
stop("'x' and 'i' must be the same legnth.")
}
x
}
#' Set up and call `too_far` on the supplied input
#'
#' @param smooth an mgcv smooth object
#' @param input data frame containing the input observations and the columns to
#' be set to `NA`
#' @param reference data frame containing the reference values
#' @param cols character vector of columns whose elements will be set to `NA` if
#' the data lies too far from the reference set
#' @param dist numeric, the distance from the reference set beyond which
#' elements of `input` will be set to `NA`
#'
#' @noRd
#'
#' @keywords internal
`.call_too_far` <- function(smooth, input, reference, cols, dist) {
# Handle the case where `input` is a nested tibble & data are in $data
was_nested <- FALSE
if (!is.null(input[["data"]]) &&
(inherits(input, "tbl_df") && is.list(input[["data"]]))) {
input <- unnest(input, cols = all_of("data"))
was_nested <- TRUE
}
# what variables do we need to work on
sm_vars <- smooth_variable(smooth)
# indicator for observations too far from grid
ind <- too_far(
x = input[[sm_vars[1L]]],
y = input[[sm_vars[2L]]],
ref_1 = reference[[sm_vars[1L]]],
ref_2 = reference[[sm_vars[2L]]],
dist = dist
)
# set the indicated columns to `NA`
input <- mutate(
input,
across(all_of(cols), \(x) to_na(x, i = ind))
)
# nest `input` again if it was nested previously
if (was_nested) {
input <- nest(input, data = !all_of(c(".smooth", ".type", ".by")))
}
# return
input
}