-
-
Notifications
You must be signed in to change notification settings - Fork 22
/
normalize.R
140 lines (107 loc) · 3.3 KB
/
normalize.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#' Normalization
#'
#' Performs a normalization of data, i.e., it scales all numeric variables in the range 0 - 1. This is a special case of \code{\link{change_scale}}.
#'
#' @inheritParams standardize.data.frame
#'
#' @param x Object.
#' @param include_bounds Logical, if \code{TRUE}, return value may include 0
#' and 1. If \code{FALSE}, the return value is compressed, using the formula
#' \code{(x * (n - 1) + 0.5) / n} (\cite{Smithson and Verkuilen 2006}), to
#' avoid zeros and ones in the normalized variables. This can be useful in
#' case of beta-regression, where the response variable is not allowed to
#' include zeros and ones.
#' @param ... Arguments passed to or from other methods.
#'
#' @examples
#' normalize(c(0, 1, 5, -5, -2))
#' normalize(c(0, 1, 5, -5, -2), include_bounds = FALSE)
#'
#' head(normalize(iris))
#' @references Smithson M, Verkuilen J (2006). A Better Lemon Squeezer? Maximum-Likelihood Regression with Beta-Distributed Dependent Variables. Psychological Methods, 11(1), 54–71.
#'
#' @seealso \code{\link{ranktransform}} \code{\link{standardize}} \code{\link{change_scale}}
#' @return A normalized object.
#' @export
normalize <- function(x, ...) {
UseMethod("normalize")
}
#' @rdname normalize
#' @export
normalize.numeric <- function(x, include_bounds = TRUE, verbose = TRUE, ...) {
# Warning if all NaNs
if (all(is.na(x))) {
return(x)
}
# Warning if only one value
if (length(unique(x)) == 1) {
if (is.null(names(x))) {
name <- deparse(substitute(x))
} else {
name <- names(x)
}
if (verbose) {
warning(paste0("Variable `", name, "` contains only one unique value and will not be normalized."))
}
return(x)
}
# Warning if logical vector
if (length(unique(x)) == 2) {
if (is.null(names(x))) {
name <- deparse(substitute(x))
} else {
name <- names(x)
}
if (verbose) {
warning(paste0("Variable `", name, "` contains only two different values. Consider converting it to a factor."))
}
}
out <- as.vector((x - min(x, na.rm = TRUE)) / diff(range(x, na.rm = TRUE), na.rm = TRUE))
if (!include_bounds && (any(out == 0) | any(out == 1))) {
out <- (out * (length(out) - 1) + 0.5) / length(out)
}
out
}
#' @export
normalize.factor <- function(x, ...) {
x
}
#' @rdname normalize
#' @export
normalize.grouped_df <- function(x, select = NULL, exclude = NULL, include_bounds = TRUE, ...) {
info <- attributes(x)
# dplyr >= 0.8.0 returns attribute "indices"
grps <- attr(x, "groups", exact = TRUE)
# dplyr < 0.8.0?
if (is.null(grps)) {
grps <- attr(x, "indices", exact = TRUE)
grps <- lapply(grps, function(x) x + 1)
} else {
grps <- grps[[".rows"]]
}
x <- as.data.frame(x)
for (rows in grps) {
x[rows, ] <- normalize(
x[rows, ],
select = select,
exclude = exclude,
include_bounds = include_bounds,
...
)
}
# set back class, so data frame still works with dplyr
attributes(x) <- info
x
}
#' @rdname normalize
#' @export
normalize.data.frame <- function(x, select = NULL, exclude = NULL, include_bounds = TRUE, ...) {
if (is.null(select)) {
select <- names(x)
}
if (!is.null(exclude)) {
select <- setdiff(select, exclude)
}
x[select] <- lapply(x[select], normalize, include_bounds = include_bounds)
x
}