/
find_distribution.R
67 lines (57 loc) · 2.11 KB
/
find_distribution.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#' Machine learning model trained to classify distributions
#'
#' Model object that \code{find_distribution()} passes to
#' \code{stats::predict()}, together with a one-row data frame of summary
#' features, to label the distribution of a numeric vector.
#'
#' Mean accuracy and Kappa of 0.96.
#'
#' @seealso \code{\link{find_distribution}}
"classify_distribution"
#' Classify the variable's distribution using machine learning
#'
#' This function uses an internal random forest model to classify the
#' variable's distribution. A set of summary features (location, spread,
#' shape and density smoothness) is computed from \code{x} and passed to the
#' model. Missing values in \code{x} are removed before the features are
#' computed.
#'
#' @param x A numeric vector. Must contain at least two non-missing values.
#' @param probabilities Return a dataframe containing the probabilities of
#'   belonging to each distribution type.
#'
#' @return If \code{probabilities = FALSE} (the default), the predicted
#'   distribution as a character vector. If \code{probabilities = TRUE}, the
#'   result of \code{predict(..., type = "prob")}, i.e. the probabilities of
#'   belonging to each distribution type.
#'
#' @examples
#' find_distribution(rnorm(100))
#' find_distribution(rpois(100, lambda = 4))
#' find_distribution(runif(100))
#'
#' @importFrom bayestestR map_estimate
#' @importFrom stats IQR density predict
#' @export
find_distribution <- function(x, probabilities = FALSE) {
  # Validate the input first so the caller gets a clear message instead of a
  # failure from deep inside stats::density().
  if (!is.numeric(x)) {
    stop("`x` must be a numeric vector.", call. = FALSE)
  }

  # Missing values would turn every summary statistic into NA and make the
  # density estimate abort, so drop them up front.
  x <- x[!is.na(x)]
  if (length(x) < 2) {
    stop("`x` must contain at least two non-missing values.", call. = FALSE)
  }

  if (!requireNamespace("caret", quietly = TRUE)) {
    stop("Package `caret` required for distribution classification. Please install it.", call. = FALSE)
  }
  if (!requireNamespace("randomForest", quietly = TRUE)) {
    stop("Package `randomForest` required for distribution classification. Please install it.", call. = FALSE)
  }

  # Normalized kernel density evaluated on 100 points; used by the
  # smoothness features below.
  density_z <- parameters::normalize(stats::density(x, n = 100)$y)

  # Compute each statistic once; several features reuse them (the mode in
  # particular requires its own density estimate).
  x_mean <- mean(x)
  x_sd <- stats::sd(x)
  x_median <- stats::median(x)
  x_mad <- stats::mad(x, constant = 1)
  x_mode <- bayestestR::map_estimate(x)

  # Column names and order are kept exactly as the model expects them.
  features <- data.frame(
    "Mean" = x_mean,
    "SD" = x_sd,
    "Median" = x_median,
    "MAD" = x_mad,
    "Mean_Median_Distance" = x_mean - x_median,
    "Mean_Mode_Distance" = x_mean - x_mode,
    "SD_MAD_Distance" = x_sd - x_mad,
    "Mode" = x_mode,
    "Range" = diff(range(x)) / x_sd,
    "IQR" = stats::IQR(x),
    "Skewness" = skewness(x),
    "Kurtosis" = kurtosis(x),
    "Smoothness_Cor_1" = smoothness(density_z, method = "cor", lag = 1),
    "Smoothness_Diff_1" = smoothness(density_z, method = "diff", lag = 1),
    "Smoothness_Cor_5" = smoothness(density_z, method = "cor", lag = 5),
    "Smoothness_Diff_5" = smoothness(density_z, method = "diff", lag = 5)
  )

  # Predict
  if (probabilities) {
    stats::predict(classify_distribution, features, type = "prob")
  } else {
    as.character(stats::predict(classify_distribution, features))
  }
}