Skip to content

Commit

Permalink
version 0.1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
Fabian Obster authored and cran-robot committed Apr 12, 2024
0 parents commit bbd72d3
Show file tree
Hide file tree
Showing 34 changed files with 2,827 additions and 0 deletions.
24 changes: 24 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
Package: sgboost
Title: Sparse-Group Boosting
Version: 0.1.0
Authors@R:
person("Fabian", "Obster", , "fabian.obster@unibw.de", role = c("aut", "cre", "cph"),
comment = c(ORCID = "https://orcid.org/0000-0002-6951-9869"))
Description: Sparse-group boosting to be used in conjunction with the 'mboost' package for modeling grouped data.
Applicable to all sparse-group lasso type problems where within-group and between-group sparsity is desired.
Interprets and visualizes individual variables and groups.
Imports: dplyr, mboost, stringr, rlang, tibble, ggplot2, ggforce
License: MIT + file LICENSE
Encoding: UTF-8
RoxygenNote: 7.3.1
URL: https://github.com/FabianObster/sgboost
BugReports: https://github.com/FabianObster/sgboost/issues
Suggests: knitr, rmarkdown, testthat (>= 3.0.0)
Config/testthat/edition: 3
VignetteBuilder: knitr
NeedsCompilation: no
Packaged: 2024-04-10 20:03:03 UTC; fabia
Author: Fabian Obster [aut, cre, cph] (<https://orcid.org/0000-0002-6951-9869>)
Maintainer: Fabian Obster <fabian.obster@unibw.de>
Repository: CRAN
Date/Publication: 2024-04-11 11:20:02 UTC
2 changes: 2 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
YEAR: 2024
COPYRIGHT HOLDER: Fabian Obster
33 changes: 33 additions & 0 deletions MD5
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
56f0ca36686a7c582a59e627dc9e40c1 *DESCRIPTION
d07665b844aab6988eff559029775110 *LICENSE
7c9f8ef91ecfbd4bfd208c45781c13fc *NAMESPACE
f1af48ee9bbc36021598ddd07bdcc920 *NEWS.md
695e875b5adbbfca3f328afb4c86fae5 *R/create_formula.R
03b4fb9c6acbd3caaed1a81f6973ac5c *R/get_coef.R
51bae40dab4406aaf94c1b19c62e69e9 *R/get_coef_path.R
198158beb468f550dcef43cd7611743b *R/get_varimp.R
fc9bb18019062538d8d02d3beebf6f08 *R/plot_effects.R
7852bb4d5c4b785affb2608c06fc9cb4 *R/plot_path.R
1ec95825d71626b0a8c6f6c776049c7c *R/plot_varimp.R
e2045b8d93763b5c795187b36693aa5b *README.md
2f57f446d529aef7ce2b242259e42deb *build/vignette.rds
b2f113515771a49cca4dd1b9cfc2965c *inst/doc/sgboost.R
9c199efd206ee9036aaf217bdfa027f4 *inst/doc/sgboost.Rmd
b81de5bfff69858fcb5df97bd3ef675b *inst/doc/sgboost.html
261e432e273ce19056a90b21a133f346 *man/create_formula.Rd
401ced294a2bbf46ffd3cdb136ea2050 *man/figures/README-minimal-ex-1.png
502a5f8dd6ba90060101596366cb7d58 *man/figures/README-pressure-1.png
6769e651f526f127a794b04a366b2ec0 *man/get_coef.Rd
853f1ac1163683f25bb06a7b7f8d1f41 *man/get_coef_path.Rd
3a64186e992887ed138ea391f0f8e882 *man/get_varimp.Rd
6a820a65f02867fd4dcfd69a55965e8d *man/plot_effects.Rd
6d0feb82db2bc3521f903df0c2c0a89b *man/plot_path.Rd
0bc97cf9117fe6d275c1cec3e3258b6b *man/plot_varimp.Rd
f519b2ef1f4fc388424e1b94280ce7a9 *tests/testthat.R
521decc492bf4051aa593af4e23eabbd *tests/testthat/test-create_formula.R
b34f6a2f33b3867605ba4da273e2edb1 *tests/testthat/test-get_coef.R
dc09ceacac1635c72155f082ae879a6c *tests/testthat/test-get_coef_path.R
88c72b6a59066918f7c62c1b0f45f82f *tests/testthat/test-get_varimp.R
f43d7faa48a01479dc6b4ea5557c9d54 *tests/testthat/test-plot_effects.R
910fab3b3b2f68fa982194dc0b159cce *tests/testthat/test-plot_varimp.R
9c199efd206ee9036aaf217bdfa027f4 *vignettes/sgboost.Rmd
29 changes: 29 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Generated by roxygen2: do not edit by hand

export(create_formula)
export(get_coef)
export(get_coef_path)
export(get_varimp)
export(plot_effects)
export(plot_path)
export(plot_varimp)
import(ggplot2)
importFrom(dplyr,"%>%")
importFrom(dplyr,arrange)
importFrom(dplyr,case_when)
importFrom(dplyr,filter)
importFrom(dplyr,group_by)
importFrom(dplyr,mutate)
importFrom(dplyr,order_by)
importFrom(dplyr,select)
importFrom(dplyr,summarize)
importFrom(dplyr,ungroup)
importFrom(ggforce,geom_circle)
importFrom(mboost,mstop)
importFrom(mboost,varimp)
importFrom(rlang,.data)
importFrom(stats,reorder)
importFrom(stringr,str_detect)
importFrom(stringr,str_replace)
importFrom(tibble,rownames_to_column)
importFrom(tibble,tibble)
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# sgboost 0.1.0

* Added a 'NEWS.md' file to track changes to the package.
87 changes: 87 additions & 0 deletions R/create_formula.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#' Create a sparse-group boosting formula
#'
#' @description
#' Creates a `mboost` formula that allows to fit a sparse-group boosting model based on
#' boosted Ridge Regression with mixing parameter `alpha`. The formula consists of a
#' group baselearner part with degrees of freedom
#' 1-`alpha` and individual baselearners with degrees of freedom `alpha`.
#'
#' @param alpha Numeric mixing parameter. For alpha = 0 only group baselearners and for
#' alpha = 1 only individual baselearners are defined.
#' @param group_df input data.frame containing variable names with group structure.
#' @param var_name Name of column in group_df containing the variable names
#' to be used as predictors. Default is `"var_name"`.
#' @param group_name Name of column in group_df indicating the group structure of the variables.
#' Default is `"group_name"`.
#' @param blearner Type of baselearner. Default is `'bols'`.
#' @param outcome_name String indicating the name of dependent variable. Default is `"y"`.
#' @param intercept Logical, should intercept be used?
#' @importFrom dplyr select group_by summarize mutate %>%
#' @importFrom rlang .data
#' @return Formula (object of class `formula`) to be passed to [mboost::mboost()]
#' yielding the sparse-group boosting for a given value of the mixing parameter `alpha`.
#' @export
#'
#' @examples
#' library(mboost)
#' library(dplyr)
#' set.seed(1)
#' df <- data.frame(
#'   x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100),
#'   x4 = rnorm(100), x5 = runif(100)
#' )
#' df <- df %>%
#'   mutate_all(function(x) {
#'     as.numeric(scale(x))
#'   })
#' df$y <- df$x1 + df$x4 + df$x5
#' group_df <- data.frame(
#'   group_name = c(1, 1, 1, 2, 2),
#'   var_name = c("x1", "x2", "x3", "x4", "x5")
#' )
#'
#' sgb_formula <- create_formula(alpha = 0.3, group_df = group_df)
#' sgb_model <- mboost(formula = sgb_formula, data = df)
#' summary(sgb_model)
create_formula <- function(alpha = 0.3, group_df = NULL, blearner = "bols",
                           outcome_name = "y", group_name = "group_name",
                           var_name = "var_name", intercept = FALSE) {
  # Validate inputs early with named conditions so callers get clear messages.
  stopifnot("Mixing parameter alpha must be numeric" = is.numeric(alpha))
  stopifnot("Mixing parameter alpha must be between zero and one" = (alpha >= 0 & alpha <= 1))
  stopifnot("group_df must be a data.frame" = is.data.frame(group_df))
  stopifnot("intercept must be logical" = is.logical(intercept))
  stopifnot(
    "group_name and var_name have to be columns of group_df" =
      (group_name %in% colnames(group_df) &
        var_name %in% colnames(group_df))
  )
  if (blearner != "bols") {
    warning("passing a baselearner other than bols does not guarantee
            that mboost() returns a sparse-group boosting model")
  }
  # NULL bindings silence R CMD check notes about the bare column names below.
  var_names <- group_names <- NULL
  formula_df <- group_df
  # Copy user-specified columns to fixed internal names.
  formula_df$var_names <- group_df[[var_name]]
  formula_df$group_names <- group_df[[group_name]]
  # Group part: one Ridge baselearner per group with df = 1 - alpha,
  # built by collapsing the member variables into a single bols() call.
  formula_group <- formula_df %>%
    dplyr::select(var_names, group_names) %>%
    dplyr::group_by(.data$group_names) %>%
    dplyr::summarize(var_names = paste0(.data$var_names, collapse = " , ")) %>%
    dplyr::mutate(term = paste0(
      blearner, "(", .data$var_names, ", df = ",
      (1 - alpha), ", intercept=", intercept, ")"
    ))
  # Individual part: one baselearner per variable with df = alpha.
  formula <- paste0(paste0(
    blearner, "(", formula_df$var_names, ", df = ",
    alpha, ", intercept=", intercept, ")"
  ), collapse = "+")
  formula_group <- paste0(formula_group$term, collapse = "+")
  # At the boundaries, drop the part whose degrees of freedom would be zero.
  if (alpha == 0) {
    final_formula <- formula_group
  } else if (alpha == 1) {
    final_formula <- formula
  } else {
    final_formula <- paste0(formula, " + ", formula_group)
  }
  final_formula <- paste0(outcome_name, "~", final_formula)
  return(formula(final_formula))
}
76 changes: 76 additions & 0 deletions R/get_coef.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#' Aggregated and raw coefficients in a sparse group boosting model
#'
#' @description
#' Computes the aggregated coefficients from group and individual baselearners.
#' Also returns the raw coefficients associated with each baselearner.
#' @details In a sparse-group boosting model a variable in a dataset can be selected
#' as an individual variable or as a group. Therefore there can be two associated effect sizes for the
#' same variable. This function aggregates both and returns it in a data.frame.
#'
#' @param sgb_model Model of type `mboost` to compute the coefficients for.
#' @importFrom dplyr filter mutate %>%
#' @importFrom tibble rownames_to_column tibble
#' @importFrom stringr str_replace str_detect
#'
#' @return List of data.frames containing a data.frame `'$raw'` with the
#' variable and the raw (Regression) coefficients and the data.frame `'$aggregated'` with the
#' aggregated (Regression) coefficients.
#' @export
#'
#' @examples
#' library(mboost)
#' library(dplyr)
#' set.seed(1)
#' df <- data.frame(
#'   x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100),
#'   x4 = rnorm(100), x5 = runif(100)
#' )
#' df <- df %>%
#'   mutate_all(function(x) {
#'     as.numeric(scale(x))
#'   })
#' df$y <- df$x1 + df$x4 + df$x5
#' group_df <- data.frame(
#'   group_name = c(1, 1, 1, 2, 2),
#'   var_name = c("x1", "x2", "x3", "x4", "x5")
#' )
#'
#' sgb_formula <- create_formula(alpha = 0.3, group_df = group_df)
#' sgb_model <- mboost(formula = sgb_formula, data = df)
#' sgb_coef <- get_coef(sgb_model)
get_coef <- function(sgb_model) {
  # inherits() also accepts subclasses of mboost (e.g. glmboost objects).
  stopifnot("Model must be of class mboost" = inherits(sgb_model, "mboost"))
  # Extract the coefficient list once and reuse it; $coef() is not free.
  sgb_coef <- sgb_model$coef()
  # One data.frame per baselearner (row names hold the variable names),
  # stacked into a single tibble of raw coefficients.
  coef_df <- sgb_coef %>%
    seq_along() %>%
    lapply(function(i) {
      as.data.frame(sgb_coef[[i]]) %>%
        tibble::rownames_to_column() %>%
        mutate(blearner = names(sgb_coef)[i])
    }) %>%
    dplyr::bind_rows() %>%
    tibble()
  colnames(coef_df)[1:2] <- c("variable", "effect")
  coef_df <- coef_df %>%
    mutate(
      # Strip the "df = ..., intercept = ..." suffix and the bols() wrapper
      # to recover the bare predictor name(s) from the baselearner label.
      predictor = str_replace(.data$blearner, ",[^,]*=.*", ""),
      predictor = str_replace(.data$predictor, "bols\\(", ""),
      predictor = str_replace(.data$predictor, "\\)", ""),
      # A comma in the predictor label means several variables, i.e. a group.
      type = dplyr::case_when(
        stringr::str_detect(predictor, ",") ~ "group",
        TRUE ~ "individual"
      )
    ) %>%
    dplyr::arrange(-abs(.data$effect)) %>%
    dplyr::filter(.data$effect != 0)
  # Aggregate: a variable selected both individually and via its group gets
  # the sum of both effects; the contributing baselearners are concatenated.
  coef_df_aggregate <- coef_df %>%
    dplyr::group_by(.data$variable) %>%
    dplyr::reframe(
      effect = sum(.data$effect),
      blearner = paste0(.data$blearner, collapse = "; "),
      predictor = paste0(.data$predictor, collapse = "; ")
    ) %>%
    dplyr::arrange(-abs(.data$effect)) %>%
    dplyr::filter(.data$effect != 0)
  return(list(raw = coef_df, aggregated = coef_df_aggregate))
}
53 changes: 53 additions & 0 deletions R/get_coef_path.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#' Path of aggregated and raw coefficients in a sparse-group boosting model
#'
#' @description
#' Computes the aggregated coefficients from group and individual baselearners for each boosting iteration.
#' @details In a sparse-group boosting model a variable in a dataset can be selected
#' as an individual variable or as a group. Therefore there can be two associated effect sizes for the
#' same variable. This function aggregates both and returns it in a data.frame for each boosting iteration.
#'
#' @param sgb_model Model of type `mboost` to compute the coefficient path for.
#' @importFrom dplyr mutate %>%
#'
#' @return List of data.frames containing a data.frame `$raw` with the
#' variable and the raw (Regression) coefficients and the data.frame `$aggregated` with the
#' aggregated (Regression) coefficients.
#' @export
#' @seealso [get_coef()]
#' @examples
#' library(mboost)
#' library(dplyr)
#' set.seed(1)
#' df <- data.frame(
#'   x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100),
#'   x4 = rnorm(100), x5 = runif(100)
#' )
#' df <- df %>%
#'   mutate_all(function(x) {
#'     as.numeric(scale(x))
#'   })
#' df$y <- df$x1 + df$x4 + df$x5
#' group_df <- data.frame(
#'   group_name = c(1, 1, 1, 2, 2),
#'   var_name = c("x1", "x2", "x3", "x4", "x5")
#' )
#'
#' sgb_formula <- create_formula(alpha = 0.3, group_df = group_df)
#' sgb_model <- mboost(formula = sgb_formula, data = df)
#' sgb_coef_path <- get_coef_path(sgb_model)
get_coef_path <- function(sgb_model) {
  # inherits() also accepts subclasses of mboost (e.g. glmboost objects).
  stopifnot("Model must be of class mboost" = inherits(sgb_model, "mboost"))
  initial_mstop <- mboost::mstop(sgb_model)
  # Subsetting sgb_model[i] below mutates the model in place (mboost uses
  # reference semantics); restore the original mstop even if an error occurs.
  on.exit(mboost::mstop(sgb_model) <- initial_mstop, add = TRUE)
  # Coefficients at the final iteration.
  coef_path <- get_coef(sgb_model) %>%
    lapply(function(x) {
      x %>% mutate(iteration = initial_mstop)
    })
  # Walk the earlier iterations. rev(seq_len(n - 1)) is empty when
  # initial_mstop == 1, unlike (mstop - 1):1 which would iterate 0:1.
  for (i in rev(seq_len(initial_mstop - 1))) {
    # Compute the coefficients at iteration i once and reuse both components.
    coef_i <- get_coef(sgb_model[i])
    coef_path$raw <- coef_path$raw %>%
      dplyr::bind_rows(coef_i$raw %>% mutate(iteration = i))
    coef_path$aggregated <- coef_path$aggregated %>%
      dplyr::bind_rows(coef_i$aggregated %>% mutate(iteration = i))
  }
  return(coef_path)
}
67 changes: 67 additions & 0 deletions R/get_varimp.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#' Variable importance of a sparse-group boosting model
#'
#' @description
#' Variable importance is computed as relative reduction of loss-function attributed
#' to each predictor (groups and individual variables).
#' Returns a list of two data.frames. The first contains the variable importance
#' of a sparse-group model in a data.frame for each predictor.
#' The second one contains the aggregated relative importance of all groups vs. individual variables.
#'
#' @param sgb_model Model of type `mboost` to compute the variable importance for.
#' @importFrom dplyr filter mutate %>%
#' @importFrom stringr str_detect
#' @importFrom mboost varimp
#' @importFrom rlang .data
#'
#' @return List of two data.frames. `$varimp` contains the name of the variables, group structure and
#' variable importance on both group and individual variable basis.
#' `$group_importance` contains the aggregated relative importance of all
#' group baselearners and of all individual variables.
#' @export
#' @seealso [mboost::varimp()] which this function uses.
#'
#' @examples
#' library(mboost)
#' library(dplyr)
#' set.seed(1)
#' df <- data.frame(
#'   x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100),
#'   x4 = rnorm(100), x5 = runif(100)
#' )
#' df <- df %>%
#'   mutate_all(function(x) {
#'     as.numeric(scale(x))
#'   })
#' df$y <- df$x1 + df$x4 + df$x5
#' group_df <- data.frame(
#'   group_name = c(1, 1, 1, 2, 2),
#'   var_name = c("x1", "x2", "x3", "x4", "x5")
#' )
#'
#' sgb_formula <- as.formula(create_formula(alpha = 0.3, group_df = group_df))
#' sgb_model <- mboost(formula = sgb_formula, data = df)
#' sgb_varimp <- get_varimp(sgb_model)
get_varimp <- function(sgb_model) {
  # inherits() also accepts subclasses of mboost (e.g. glmboost objects).
  stopifnot("Model must be of class mboost" = inherits(sgb_model, "mboost"))
  sgb_varimp <- mboost::varimp(sgb_model) %>%
    as.data.frame() %>%
    dplyr::rename("predictor" = "variable") %>%
    # Keep only baselearners that actually contributed to the loss reduction.
    dplyr::filter(.data$reduction != 0) %>%
    dplyr::mutate(
      # A comma in the predictor label means several variables, i.e. a group.
      type = dplyr::case_when(
        stringr::str_detect(.data$predictor, ",") ~ "group",
        TRUE ~ "individual"
      ),
      predictor = as.character(.data$predictor),
      blearner = as.character(.data$blearner)
    ) %>%
    # Normalize so that the importances of all selected predictors sum to one.
    dplyr::mutate(relative_importance = .data$reduction / sum(.data$reduction)) %>%
    dplyr::arrange(-.data$relative_importance)
  # Share of total importance attributable to groups vs. individual variables.
  group_importance <- sgb_varimp %>%
    dplyr::group_by(.data$type) %>%
    dplyr::summarize(importance = sum(.data$relative_importance)) %>%
    dplyr::arrange(-.data$importance)
  return(list(varimp = sgb_varimp, group_importance = group_importance))
}

0 comments on commit bbd72d3

Please sign in to comment.