-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit bbd72d3
Showing
34 changed files
with
2,827 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
Package: sgboost | ||
Title: Sparse-Group Boosting | ||
Version: 0.1.0 | ||
Authors@R: | ||
person("Fabian", "Obster", , "fabian.obster@unibw.de", role = c("aut", "cre", "cph"), | ||
comment = c(ORCID = "https://orcid.org/0000-0002-6951-9869")) | ||
Description: Sparse-group boosting to be used in conjunction with the 'mboost' for modeling grouped data. | ||
Applicable to all sparse-group lasso type problems where within-group and between-group sparsity is desired. | ||
Interprets and visualizes individual variables and groups. | ||
Imports: dplyr, mboost, stringr, rlang, tibble, ggplot2, ggforce | ||
License: MIT + file LICENSE | ||
Encoding: UTF-8 | ||
RoxygenNote: 7.3.1 | ||
URL: https://github.com/FabianObster/sgboost | ||
BugReports: https://github.com/FabianObster/sgboost/issues | ||
Suggests: knitr, rmarkdown, testthat (>= 3.0.0) | ||
Config/testthat/edition: 3 | ||
VignetteBuilder: knitr | ||
NeedsCompilation: no | ||
Packaged: 2024-04-10 20:03:03 UTC; fabia | ||
Author: Fabian Obster [aut, cre, cph] (<https://orcid.org/0000-0002-6951-9869>) | ||
Maintainer: Fabian Obster <fabian.obster@unibw.de> | ||
Repository: CRAN | ||
Date/Publication: 2024-04-11 11:20:02 UTC |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
YEAR: 2024 | ||
COPYRIGHT HOLDER: Fabian Obster |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
56f0ca36686a7c582a59e627dc9e40c1 *DESCRIPTION | ||
d07665b844aab6988eff559029775110 *LICENSE | ||
7c9f8ef91ecfbd4bfd208c45781c13fc *NAMESPACE | ||
f1af48ee9bbc36021598ddd07bdcc920 *NEWS.md | ||
695e875b5adbbfca3f328afb4c86fae5 *R/create_formula.R | ||
03b4fb9c6acbd3caaed1a81f6973ac5c *R/get_coef.R | ||
51bae40dab4406aaf94c1b19c62e69e9 *R/get_coef_path.R | ||
198158beb468f550dcef43cd7611743b *R/get_varimp.R | ||
fc9bb18019062538d8d02d3beebf6f08 *R/plot_effects.R | ||
7852bb4d5c4b785affb2608c06fc9cb4 *R/plot_path.R | ||
1ec95825d71626b0a8c6f6c776049c7c *R/plot_varimp.R | ||
e2045b8d93763b5c795187b36693aa5b *README.md | ||
2f57f446d529aef7ce2b242259e42deb *build/vignette.rds | ||
b2f113515771a49cca4dd1b9cfc2965c *inst/doc/sgboost.R | ||
9c199efd206ee9036aaf217bdfa027f4 *inst/doc/sgboost.Rmd | ||
b81de5bfff69858fcb5df97bd3ef675b *inst/doc/sgboost.html | ||
261e432e273ce19056a90b21a133f346 *man/create_formula.Rd | ||
401ced294a2bbf46ffd3cdb136ea2050 *man/figures/README-minimal-ex-1.png | ||
502a5f8dd6ba90060101596366cb7d58 *man/figures/README-pressure-1.png | ||
6769e651f526f127a794b04a366b2ec0 *man/get_coef.Rd | ||
853f1ac1163683f25bb06a7b7f8d1f41 *man/get_coef_path.Rd | ||
3a64186e992887ed138ea391f0f8e882 *man/get_varimp.Rd | ||
6a820a65f02867fd4dcfd69a55965e8d *man/plot_effects.Rd | ||
6d0feb82db2bc3521f903df0c2c0a89b *man/plot_path.Rd | ||
0bc97cf9117fe6d275c1cec3e3258b6b *man/plot_varimp.Rd | ||
f519b2ef1f4fc388424e1b94280ce7a9 *tests/testthat.R | ||
521decc492bf4051aa593af4e23eabbd *tests/testthat/test-create_formula.R | ||
b34f6a2f33b3867605ba4da273e2edb1 *tests/testthat/test-get_coef.R | ||
dc09ceacac1635c72155f082ae879a6c *tests/testthat/test-get_coef_path.R | ||
88c72b6a59066918f7c62c1b0f45f82f *tests/testthat/test-get_varimp.R | ||
f43d7faa48a01479dc6b4ea5557c9d54 *tests/testthat/test-plot_effects.R | ||
910fab3b3b2f68fa982194dc0b159cce *tests/testthat/test-plot_varimp.R | ||
9c199efd206ee9036aaf217bdfa027f4 *vignettes/sgboost.Rmd |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export(create_formula) | ||
export(get_coef) | ||
export(get_coef_path) | ||
export(get_varimp) | ||
export(plot_effects) | ||
export(plot_path) | ||
export(plot_varimp) | ||
import(ggplot2) | ||
importFrom(dplyr,"%>%") | ||
importFrom(dplyr,arrange) | ||
importFrom(dplyr,case_when) | ||
importFrom(dplyr,filter) | ||
importFrom(dplyr,group_by) | ||
importFrom(dplyr,mutate) | ||
importFrom(dplyr,order_by) | ||
importFrom(dplyr,select) | ||
importFrom(dplyr,summarize) | ||
importFrom(dplyr,ungroup) | ||
importFrom(ggforce,geom_circle) | ||
importFrom(mboost,mstop) | ||
importFrom(mboost,varimp) | ||
importFrom(rlang,.data) | ||
importFrom(stats,reorder) | ||
importFrom(stringr,str_detect) | ||
importFrom(stringr,str_replace) | ||
importFrom(tibble,rownames_to_column) | ||
importFrom(tibble,tibble) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# sgboost 0.1.0 | ||
|
||
* Added a 'NEWS.md' file to track changes to the package. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
#' Create a sparse-group boosting formula | ||
#' | ||
#' @description | ||
#' Creates a `mboost` formula that allows to fit a sparse-group boosting model based on | ||
#' boosted Ridge Regression with mixing parameter `alpha`. The formula consists of a | ||
#' group baselearner part with degrees of freedom | ||
#' 1-`alpha` and individual baselearners with degrees of freedom `alpha`. | ||
#' | ||
#' @param alpha Numeric mixing parameter. For alpha = 0 only group baselearners and for | ||
#' alpha = 1 only individual baselearners are defined. | ||
#' @param group_df input data.frame containing variable names with group structure. | ||
#' @param var_name Name of column in group_df containing the variable names | ||
#' to be used as predictors. Default is `"var_name"` | ||
#' @param group_name Name of column in group_df indicating the group structure of the variables. | ||
#' Default is `"group_name`. | ||
#' @param blearner Type of baselearner. Default is `'bols'`. | ||
#' @param outcome_name String indicating the name of dependent variable. Default is `"y"` | ||
#' @param intercept Logical, should intercept be used? | ||
#' @importFrom dplyr select group_by summarize mutate %>% | ||
#' @importFrom rlang .data | ||
#' @return Character containing the formula to be passed to [mboost::mboost()] | ||
#' yielding the sparse-group boosting for a given value mixing parameter `alpha`. | ||
#' @export | ||
#' | ||
#' @examples | ||
#' library(mboost) | ||
#' library(dplyr) | ||
#' set.seed(1) | ||
#' df <- data.frame( | ||
#' x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100), | ||
#' x4 = rnorm(100), x5 = runif(100) | ||
#' ) | ||
#' df <- df %>% | ||
#' mutate_all(function(x) { | ||
#' as.numeric(scale(x)) | ||
#' }) | ||
#' df$y <- df$x1 + df$x4 + df$x5 | ||
#' group_df <- data.frame( | ||
#' group_name = c(1, 1, 1, 2, 2), | ||
#' var_name = c("x1", "x2", "x3", "x4", "x5") | ||
#' ) | ||
#' | ||
#' sgb_formula <- create_formula(alpha = 0.3, group_df = group_df) | ||
#' sgb_model <- mboost(formula = sgb_formula, data = df) | ||
#' summary(sgb_model) | ||
create_formula <- function(alpha = 0.3, group_df = NULL, blearner = "bols", | ||
outcome_name = "y", group_name = "group_name", | ||
var_name = "var_name", intercept = FALSE) { | ||
stopifnot("Mixing parameter alpha must be numeric" = is.numeric(alpha)) | ||
stopifnot("Mixing parameter alpha must between zero and one" = (alpha >= 0 & alpha <= 1)) | ||
stopifnot("group_df must be a data.frame" = is.data.frame(group_df)) | ||
stopifnot( | ||
"group_name and var_name have to be columns of group_df" = | ||
(group_name %in% colnames(group_df) & | ||
var_name %in% colnames(group_df)) | ||
) | ||
if (blearner != "bols") { | ||
warning("passing a baselearner other than bols does not guarantee | ||
that mboost() returns a sparse-group boosting model") | ||
} | ||
var_names <- group_names <- NULL | ||
formula_df <- group_df | ||
formula_df$var_names <- group_df[[var_name]] | ||
formula_df$group_names <- group_df[[group_name]] | ||
formula_group <- formula_df %>% | ||
dplyr::select(var_names, group_names) %>% | ||
dplyr::group_by(.data$group_names) %>% | ||
dplyr::summarize(var_names = paste0(.data$var_names, collapse = " , ")) %>% | ||
dplyr::mutate(term = paste0( | ||
blearner, "(", .data$var_names, ", df = ", | ||
(1 - alpha), ", intercept=", intercept, ")" | ||
)) | ||
formula <- paste0(paste0( | ||
blearner, "(", formula_df$var_names, ", df = ", | ||
alpha, ", intercept=", intercept, ")" | ||
), collapse = "+") | ||
formula_group <- paste0(formula_group$term, collapse = "+") | ||
if (alpha == 0) { | ||
final_formula <- formula_group | ||
} else if (alpha == 1) { | ||
final_formula <- formula | ||
} else { | ||
final_formula <- paste0(formula, " + ", formula_group) | ||
} | ||
final_formula <- paste0(outcome_name, "~", final_formula) | ||
return(formula(final_formula)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#' Aggregated and raw coefficients in a sparse group boosting model | ||
#' | ||
#' @description | ||
#' Computes the aggregated coefficients from group and individual baselearners. | ||
#' Also returns the raw coefficients associated with each baselearner. | ||
#' @details in a sparse group boosting models a variable in a dataset can be selected | ||
#' as an individual variable or as a group. Therefore there can be two associated effect sizes for the | ||
#' same variable. This function aggregates both and returns it in a data.frame. | ||
#' | ||
#' @param sgb_model Model of type `mboost` to compute the coefficients for. | ||
#' @importFrom dplyr filter mutate %>% | ||
#' @importFrom tibble rownames_to_column tibble | ||
#' @importFrom stringr str_replace str_detect | ||
#' | ||
#' @return List of data.frames containing the a data.frame `'$raw'` with the | ||
#' variable and the raw (Regression) coefficients and the data.frame `'$aggregated'` with the | ||
#' aggregated (Regression) coefficients. | ||
#' @export | ||
#' | ||
#' @examples | ||
#' library(mboost) | ||
#' library(dplyr) | ||
#' set.seed(1) | ||
#' df <- data.frame( | ||
#' x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100), | ||
#' x4 = rnorm(100), x5 = runif(100) | ||
#' ) | ||
#' df <- df %>% | ||
#' mutate_all(function(x) { | ||
#' as.numeric(scale(x)) | ||
#' }) | ||
#' df$y <- df$x1 + df$x4 + df$x5 | ||
#' group_df <- data.frame( | ||
#' group_name = c(1, 1, 1, 2, 2), | ||
#' var_name = c("x1", "x2", "x3", "x4", "x5") | ||
#' ) | ||
#' | ||
#' sgb_formula <- create_formula(alpha = 0.3, group_df = group_df) | ||
#' sgb_model <- mboost(formula = sgb_formula, data = df) | ||
#' sgb_coef <- get_coef(sgb_model) | ||
get_coef <- function(sgb_model) { | ||
stopifnot("Model must be of class mboost" = class(sgb_model) == "mboost") | ||
sgb_coef <- sgb_model$coef() | ||
coef_df <- sgb_model$coef() %>% | ||
seq_along() %>% | ||
lapply(function(i) { | ||
as.data.frame(sgb_model$coef()[[i]]) %>% | ||
tibble::rownames_to_column() %>% | ||
mutate(blearner = names(sgb_model$coef())[i]) | ||
}) %>% | ||
dplyr::bind_rows() %>% | ||
tibble() | ||
colnames(coef_df)[1:2] <- c("variable", "effect") | ||
coef_df <- coef_df %>% | ||
mutate( | ||
predictor = str_replace(.data$blearner, ",[^,]*=.*", ""), | ||
predictor = str_replace(.data$predictor, "bols\\(", ""), | ||
predictor = str_replace(.data$predictor, "\\)", ""), | ||
type = dplyr::case_when( | ||
stringr::str_detect(predictor, ",") ~ "group", | ||
T ~ "individual" | ||
) | ||
) %>% | ||
dplyr::arrange(-abs(.data$effect)) %>% | ||
dplyr::filter(.data$effect != 0) | ||
coef_df_aggregate <- coef_df %>% | ||
dplyr::group_by(.data$variable) %>% | ||
dplyr::reframe( | ||
effect = sum(.data$effect), | ||
blearner = paste0(.data$blearner, collapse = "; "), | ||
predictor = paste0(.data$predictor, collapse = "; ") | ||
) %>% | ||
dplyr::arrange(-abs(.data$effect)) %>% | ||
dplyr::filter(.data$effect != 0) | ||
return(list(raw = coef_df, aggregated = coef_df_aggregate)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
#' Path of aggregated and raw coefficients in a sparse-group boosting model | ||
#' | ||
#' @description | ||
#' Computes the aggregated coefficients from group and individual baselearners for each boosting iteration. | ||
#' @details in a sparse-group boosting models a variable in a dataset can be selected | ||
#' as an individual variable or as a group. Therefore there can be two associated effect sizes for the | ||
#' same variable. This function aggregates both and returns it in a data.frame for each boosting iteration | ||
#' | ||
#' @param sgb_model Model of type `mboost` to compute the coefficient path for . | ||
#' @importFrom dplyr mutate %>% | ||
#' | ||
#' @return List of data.frames containing the a data.frame `$raw` with the | ||
#' variable and the raw (Regression) coefficients and the data.frame `$aggregated` with the | ||
#' aggregated (Regression) coefficients. | ||
#' @export | ||
#' @seealso [get_coef()] | ||
#' @examples | ||
#' library(mboost) | ||
#' library(dplyr) | ||
#' set.seed(1) | ||
#' df <- data.frame( | ||
#' x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100), | ||
#' x4 = rnorm(100), x5 = runif(100) | ||
#' ) | ||
#' df <- df %>% | ||
#' mutate_all(function(x) { | ||
#' as.numeric(scale(x)) | ||
#' }) | ||
#' df$y <- df$x1 + df$x4 + df$x5 | ||
#' group_df <- data.frame( | ||
#' group_name = c(1, 1, 1, 2, 2), | ||
#' var_name = c("x1", "x2", "x3", "x4", "x5") | ||
#' ) | ||
#' | ||
#' sgb_formula <- create_formula(alpha = 0.3, group_df = group_df) | ||
#' sgb_model <- mboost(formula = sgb_formula, data = df) | ||
#' sgb_coef_path <- get_coef_path(sgb_model) | ||
get_coef_path <- function(sgb_model) { | ||
stopifnot("Model must be of class mboost" = class(sgb_model) == "mboost") | ||
initial_mstop <- mboost::mstop(sgb_model) | ||
coef_path <- get_coef(sgb_model) %>% | ||
lapply(function(x) { | ||
x %>% mutate(iteration = mboost::mstop(sgb_model)) | ||
}) | ||
for (i in (mboost::mstop(sgb_model) - 1):1) { | ||
coef_path$raw <- coef_path$raw %>% | ||
dplyr::bind_rows(get_coef(sgb_model[i])$raw %>% mutate(iteration = i)) | ||
coef_path$aggregated <- coef_path$aggregated %>% | ||
dplyr::bind_rows(get_coef(sgb_model[i])$aggregated %>% mutate(iteration = i)) | ||
} | ||
mboost::mstop(sgb_model) <- initial_mstop | ||
return(coef_path) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
#' Variable importance of a sparse-group boosting model | ||
#' | ||
#' @description | ||
#' Variable importance is computed as relative reduction of loss-function attributed | ||
#' to each predictor (groups and individual variables). | ||
#' Returns a list of two data.frames. The first contains the variable importance | ||
#' of a sparse-group model in a data.frame for each predictor. | ||
#' The second one contains the aggregated relative importance of all groups vs. individual variables. | ||
#' | ||
#' @param sgb_model Model of type `mboost` to compute the variable importance for. | ||
#' @importFrom dplyr filter mutate %>% | ||
#' @importFrom stringr str_detect | ||
#' @importFrom mboost varimp | ||
#' @importFrom rlang .data | ||
#' | ||
#' @return List of two data.frames. `$raw` contains the name of the variables, group structure and | ||
#' variable importance on both group and individual variable basis. | ||
#' `$group_importance` contains the the aggregated relative importance of all | ||
#' group baselearners and of all individual variables. | ||
#' @export | ||
#' @seealso [mboost::varimp()] which this function uses. | ||
#' | ||
#' @examples | ||
#' library(mboost) | ||
#' library(dplyr) | ||
#' set.seed(1) | ||
#' df <- data.frame( | ||
#' x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100), | ||
#' x4 = rnorm(100), x5 = runif(100) | ||
#' ) | ||
#' df <- df %>% | ||
#' mutate_all(function(x) { | ||
#' as.numeric(scale(x)) | ||
#' }) | ||
#' df$y <- df$x1 + df$x4 + df$x5 | ||
#' group_df <- data.frame( | ||
#' group_name = c(1, 1, 1, 2, 2), | ||
#' var_name = c("x1", "x2", "x3", "x4", "x5") | ||
#' ) | ||
#' | ||
#' sgb_formula <- as.formula(create_formula(alpha = 0.3, group_df = group_df)) | ||
#' sgb_model <- mboost(formula = sgb_formula, data = df) | ||
#' sgb_varimp <- get_varimp(sgb_model) | ||
get_varimp <- function(sgb_model) { | ||
stopifnot("Model must be of class mboost" = class(sgb_model) == "mboost") | ||
sgb_varimp <- mboost::varimp(sgb_model) %>% | ||
as.data.frame() %>% | ||
dplyr::rename("predictor" = "variable") %>% | ||
dplyr::filter(.data$reduction != 0) %>% | ||
dplyr::mutate( | ||
type = dplyr::case_when( | ||
stringr::str_detect(.data$predictor, ",") ~ "group", | ||
T ~ "individual" | ||
), | ||
predictor = as.character(.data$predictor), | ||
blearner = as.character(.data$blearner) | ||
) %>% | ||
dplyr::mutate(relative_importance = .data$reduction / sum(.data$reduction)) %>% | ||
dplyr::group_by(.data$type) %>% | ||
dplyr::ungroup() %>% | ||
dplyr::arrange(-.data$relative_importance) | ||
group_importance <- sgb_varimp %>% | ||
dplyr::group_by(.data$type) %>% | ||
dplyr::summarize(importance = sum(.data$relative_importance)) %>% | ||
dplyr::arrange(-.data$importance) | ||
return(list(varimp = sgb_varimp, group_importance = group_importance)) | ||
} |
Oops, something went wrong.