Skip to content

Commit

Permalink
version 0.1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
Fabian Obster authored and cran-robot committed Apr 12, 2024
0 parents commit bbd72d3
Show file tree
Hide file tree
Showing 34 changed files with 2,827 additions and 0 deletions.
24 changes: 24 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
Package: sgboost
Title: Sparse-Group Boosting
Version: 0.1.0
Authors@R:
person("Fabian", "Obster", , "fabian.obster@unibw.de", role = c("aut", "cre", "cph"),
comment = c(ORCID = "https://orcid.org/0000-0002-6951-9869"))
Description: Sparse-group boosting to be used in conjunction with the 'mboost' package for modeling grouped data.
Applicable to all sparse-group lasso type problems where within-group and between-group sparsity is desired.
Interprets and visualizes individual variables and groups.
Imports: dplyr, mboost, stringr, rlang, tibble, ggplot2, ggforce
License: MIT + file LICENSE
Encoding: UTF-8
RoxygenNote: 7.3.1
URL: https://github.com/FabianObster/sgboost
BugReports: https://github.com/FabianObster/sgboost/issues
Suggests: knitr, rmarkdown, testthat (>= 3.0.0)
Config/testthat/edition: 3
VignetteBuilder: knitr
NeedsCompilation: no
Packaged: 2024-04-10 20:03:03 UTC; fabia
Author: Fabian Obster [aut, cre, cph] (<https://orcid.org/0000-0002-6951-9869>)
Maintainer: Fabian Obster <fabian.obster@unibw.de>
Repository: CRAN
Date/Publication: 2024-04-11 11:20:02 UTC
2 changes: 2 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
YEAR: 2024
COPYRIGHT HOLDER: Fabian Obster
33 changes: 33 additions & 0 deletions MD5
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
56f0ca36686a7c582a59e627dc9e40c1 *DESCRIPTION
d07665b844aab6988eff559029775110 *LICENSE
7c9f8ef91ecfbd4bfd208c45781c13fc *NAMESPACE
f1af48ee9bbc36021598ddd07bdcc920 *NEWS.md
695e875b5adbbfca3f328afb4c86fae5 *R/create_formula.R
03b4fb9c6acbd3caaed1a81f6973ac5c *R/get_coef.R
51bae40dab4406aaf94c1b19c62e69e9 *R/get_coef_path.R
198158beb468f550dcef43cd7611743b *R/get_varimp.R
fc9bb18019062538d8d02d3beebf6f08 *R/plot_effects.R
7852bb4d5c4b785affb2608c06fc9cb4 *R/plot_path.R
1ec95825d71626b0a8c6f6c776049c7c *R/plot_varimp.R
e2045b8d93763b5c795187b36693aa5b *README.md
2f57f446d529aef7ce2b242259e42deb *build/vignette.rds
b2f113515771a49cca4dd1b9cfc2965c *inst/doc/sgboost.R
9c199efd206ee9036aaf217bdfa027f4 *inst/doc/sgboost.Rmd
b81de5bfff69858fcb5df97bd3ef675b *inst/doc/sgboost.html
261e432e273ce19056a90b21a133f346 *man/create_formula.Rd
401ced294a2bbf46ffd3cdb136ea2050 *man/figures/README-minimal-ex-1.png
502a5f8dd6ba90060101596366cb7d58 *man/figures/README-pressure-1.png
6769e651f526f127a794b04a366b2ec0 *man/get_coef.Rd
853f1ac1163683f25bb06a7b7f8d1f41 *man/get_coef_path.Rd
3a64186e992887ed138ea391f0f8e882 *man/get_varimp.Rd
6a820a65f02867fd4dcfd69a55965e8d *man/plot_effects.Rd
6d0feb82db2bc3521f903df0c2c0a89b *man/plot_path.Rd
0bc97cf9117fe6d275c1cec3e3258b6b *man/plot_varimp.Rd
f519b2ef1f4fc388424e1b94280ce7a9 *tests/testthat.R
521decc492bf4051aa593af4e23eabbd *tests/testthat/test-create_formula.R
b34f6a2f33b3867605ba4da273e2edb1 *tests/testthat/test-get_coef.R
dc09ceacac1635c72155f082ae879a6c *tests/testthat/test-get_coef_path.R
88c72b6a59066918f7c62c1b0f45f82f *tests/testthat/test-get_varimp.R
f43d7faa48a01479dc6b4ea5557c9d54 *tests/testthat/test-plot_effects.R
910fab3b3b2f68fa982194dc0b159cce *tests/testthat/test-plot_varimp.R
9c199efd206ee9036aaf217bdfa027f4 *vignettes/sgboost.Rmd
29 changes: 29 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Generated by roxygen2: do not edit by hand

export(create_formula)
export(get_coef)
export(get_coef_path)
export(get_varimp)
export(plot_effects)
export(plot_path)
export(plot_varimp)
import(ggplot2)
importFrom(dplyr,"%>%")
importFrom(dplyr,arrange)
importFrom(dplyr,case_when)
importFrom(dplyr,filter)
importFrom(dplyr,group_by)
importFrom(dplyr,mutate)
importFrom(dplyr,order_by)
importFrom(dplyr,select)
importFrom(dplyr,summarize)
importFrom(dplyr,ungroup)
importFrom(ggforce,geom_circle)
importFrom(mboost,mstop)
importFrom(mboost,varimp)
importFrom(rlang,.data)
importFrom(stats,reorder)
importFrom(stringr,str_detect)
importFrom(stringr,str_replace)
importFrom(tibble,rownames_to_column)
importFrom(tibble,tibble)
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# sgboost 0.1.0

* Added a 'NEWS.md' file to track changes to the package.
87 changes: 87 additions & 0 deletions R/create_formula.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#' Create a sparse-group boosting formula
#'
#' @description
#' Creates a `mboost` formula that allows to fit a sparse-group boosting model based on
#' boosted Ridge Regression with mixing parameter `alpha`. The formula consists of a
#' group baselearner part with degrees of freedom
#' 1-`alpha` and individual baselearners with degrees of freedom `alpha`.
#'
#' @param alpha Numeric mixing parameter. For alpha = 0 only group baselearners and for
#' alpha = 1 only individual baselearners are defined.
#' @param group_df input data.frame containing variable names with group structure.
#' @param var_name Name of column in group_df containing the variable names
#' to be used as predictors. Default is `"var_name"`.
#' @param group_name Name of column in group_df indicating the group structure of the variables.
#' Default is `"group_name"`.
#' @param blearner Type of baselearner. Default is `'bols'`.
#' @param outcome_name String indicating the name of dependent variable. Default is `"y"`.
#' @param intercept Logical, should intercept be used?
#' @importFrom dplyr select group_by summarize mutate %>%
#' @importFrom rlang .data
#' @return Formula (object of class `formula`) to be passed to [mboost::mboost()]
#' yielding the sparse-group boosting for a given value of the mixing parameter `alpha`.
#' @export
#'
#' @examples
#' library(mboost)
#' library(dplyr)
#' set.seed(1)
#' df <- data.frame(
#'   x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100),
#'   x4 = rnorm(100), x5 = runif(100)
#' )
#' df <- df %>%
#'   mutate_all(function(x) {
#'     as.numeric(scale(x))
#'   })
#' df$y <- df$x1 + df$x4 + df$x5
#' group_df <- data.frame(
#'   group_name = c(1, 1, 1, 2, 2),
#'   var_name = c("x1", "x2", "x3", "x4", "x5")
#' )
#'
#' sgb_formula <- create_formula(alpha = 0.3, group_df = group_df)
#' sgb_model <- mboost(formula = sgb_formula, data = df)
#' summary(sgb_model)
create_formula <- function(alpha = 0.3, group_df = NULL, blearner = "bols",
                           outcome_name = "y", group_name = "group_name",
                           var_name = "var_name", intercept = FALSE) {
  # Validate inputs early with named conditions so callers get clear messages.
  stopifnot("Mixing parameter alpha must be numeric" = is.numeric(alpha))
  stopifnot("Mixing parameter alpha must be between zero and one" = (alpha >= 0 & alpha <= 1))
  stopifnot("group_df must be a data.frame" = is.data.frame(group_df))
  stopifnot("intercept must be logical" = is.logical(intercept))
  stopifnot(
    "group_name and var_name have to be columns of group_df" =
      (group_name %in% colnames(group_df) &
        var_name %in% colnames(group_df))
  )
  if (blearner != "bols") {
    warning("passing a baselearner other than bols does not guarantee
            that mboost() returns a sparse-group boosting model")
  }
  # NULL bindings silence R CMD check notes about the bare column names below.
  var_names <- group_names <- NULL
  formula_df <- group_df
  # Copy user-specified columns to fixed internal names.
  formula_df$var_names <- group_df[[var_name]]
  formula_df$group_names <- group_df[[group_name]]
  # Group part: one Ridge baselearner per group with df = 1 - alpha,
  # built by collapsing the member variables into a single bols() call.
  formula_group <- formula_df %>%
    dplyr::select(var_names, group_names) %>%
    dplyr::group_by(.data$group_names) %>%
    dplyr::summarize(var_names = paste0(.data$var_names, collapse = " , ")) %>%
    dplyr::mutate(term = paste0(
      blearner, "(", .data$var_names, ", df = ",
      (1 - alpha), ", intercept=", intercept, ")"
    ))
  # Individual part: one baselearner per variable with df = alpha.
  formula <- paste0(paste0(
    blearner, "(", formula_df$var_names, ", df = ",
    alpha, ", intercept=", intercept, ")"
  ), collapse = "+")
  formula_group <- paste0(formula_group$term, collapse = "+")
  # At the boundaries, drop the part whose degrees of freedom would be zero.
  if (alpha == 0) {
    final_formula <- formula_group
  } else if (alpha == 1) {
    final_formula <- formula
  } else {
    final_formula <- paste0(formula, " + ", formula_group)
  }
  final_formula <- paste0(outcome_name, "~", final_formula)
  return(formula(final_formula))
}
76 changes: 76 additions & 0 deletions R/get_coef.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#' Aggregated and raw coefficients in a sparse group boosting model
#'
#' @description
#' Computes the aggregated coefficients from group and individual baselearners.
#' Also returns the raw coefficients associated with each baselearner.
#' @details In a sparse-group boosting model a variable in a dataset can be selected
#' as an individual variable or as a group. Therefore there can be two associated effect sizes for the
#' same variable. This function aggregates both and returns it in a data.frame.
#'
#' @param sgb_model Model of type `mboost` to compute the coefficients for.
#' @importFrom dplyr filter mutate %>%
#' @importFrom tibble rownames_to_column tibble
#' @importFrom stringr str_replace str_detect
#'
#' @return List of data.frames containing a data.frame `'$raw'` with the
#' variable and the raw (Regression) coefficients and the data.frame `'$aggregated'` with the
#' aggregated (Regression) coefficients.
#' @export
#'
#' @examples
#' library(mboost)
#' library(dplyr)
#' set.seed(1)
#' df <- data.frame(
#'   x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100),
#'   x4 = rnorm(100), x5 = runif(100)
#' )
#' df <- df %>%
#'   mutate_all(function(x) {
#'     as.numeric(scale(x))
#'   })
#' df$y <- df$x1 + df$x4 + df$x5
#' group_df <- data.frame(
#'   group_name = c(1, 1, 1, 2, 2),
#'   var_name = c("x1", "x2", "x3", "x4", "x5")
#' )
#'
#' sgb_formula <- create_formula(alpha = 0.3, group_df = group_df)
#' sgb_model <- mboost(formula = sgb_formula, data = df)
#' sgb_coef <- get_coef(sgb_model)
get_coef <- function(sgb_model) {
  # inherits() also accepts subclasses of mboost (e.g. glmboost objects).
  stopifnot("Model must be of class mboost" = inherits(sgb_model, "mboost"))
  # Extract the coefficient list once and reuse it; $coef() is not free.
  sgb_coef <- sgb_model$coef()
  # One data.frame per baselearner (row names hold the variable names),
  # stacked into a single tibble of raw coefficients.
  coef_df <- sgb_coef %>%
    seq_along() %>%
    lapply(function(i) {
      as.data.frame(sgb_coef[[i]]) %>%
        tibble::rownames_to_column() %>%
        mutate(blearner = names(sgb_coef)[i])
    }) %>%
    dplyr::bind_rows() %>%
    tibble()
  colnames(coef_df)[1:2] <- c("variable", "effect")
  coef_df <- coef_df %>%
    mutate(
      # Strip the "df = ..., intercept = ..." suffix and the bols() wrapper
      # to recover the bare predictor name(s) from the baselearner label.
      predictor = str_replace(.data$blearner, ",[^,]*=.*", ""),
      predictor = str_replace(.data$predictor, "bols\\(", ""),
      predictor = str_replace(.data$predictor, "\\)", ""),
      # A comma in the predictor label means several variables, i.e. a group.
      type = dplyr::case_when(
        stringr::str_detect(predictor, ",") ~ "group",
        TRUE ~ "individual"
      )
    ) %>%
    dplyr::arrange(-abs(.data$effect)) %>%
    dplyr::filter(.data$effect != 0)
  # Aggregate: a variable selected both individually and via its group gets
  # the sum of both effects; the contributing baselearners are concatenated.
  coef_df_aggregate <- coef_df %>%
    dplyr::group_by(.data$variable) %>%
    dplyr::reframe(
      effect = sum(.data$effect),
      blearner = paste0(.data$blearner, collapse = "; "),
      predictor = paste0(.data$predictor, collapse = "; ")
    ) %>%
    dplyr::arrange(-abs(.data$effect)) %>%
    dplyr::filter(.data$effect != 0)
  return(list(raw = coef_df, aggregated = coef_df_aggregate))
}
53 changes: 53 additions & 0 deletions R/get_coef_path.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#' Path of aggregated and raw coefficients in a sparse-group boosting model
#'
#' @description
#' Computes the aggregated coefficients from group and individual baselearners for each boosting iteration.
#' @details In a sparse-group boosting model a variable in a dataset can be selected
#' as an individual variable or as a group. Therefore there can be two associated effect sizes for the
#' same variable. This function aggregates both and returns it in a data.frame for each boosting iteration.
#'
#' @param sgb_model Model of type `mboost` to compute the coefficient path for.
#' @importFrom dplyr mutate %>%
#'
#' @return List of data.frames containing a data.frame `$raw` with the
#' variable and the raw (Regression) coefficients and the data.frame `$aggregated` with the
#' aggregated (Regression) coefficients.
#' @export
#' @seealso [get_coef()]
#' @examples
#' library(mboost)
#' library(dplyr)
#' set.seed(1)
#' df <- data.frame(
#'   x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100),
#'   x4 = rnorm(100), x5 = runif(100)
#' )
#' df <- df %>%
#'   mutate_all(function(x) {
#'     as.numeric(scale(x))
#'   })
#' df$y <- df$x1 + df$x4 + df$x5
#' group_df <- data.frame(
#'   group_name = c(1, 1, 1, 2, 2),
#'   var_name = c("x1", "x2", "x3", "x4", "x5")
#' )
#'
#' sgb_formula <- create_formula(alpha = 0.3, group_df = group_df)
#' sgb_model <- mboost(formula = sgb_formula, data = df)
#' sgb_coef_path <- get_coef_path(sgb_model)
get_coef_path <- function(sgb_model) {
  # inherits() also accepts subclasses of mboost (e.g. glmboost objects).
  stopifnot("Model must be of class mboost" = inherits(sgb_model, "mboost"))
  initial_mstop <- mboost::mstop(sgb_model)
  # Subsetting sgb_model[i] below mutates the model in place (mboost uses
  # reference semantics); restore the original mstop even if an error occurs.
  on.exit(mboost::mstop(sgb_model) <- initial_mstop, add = TRUE)
  # Coefficients at the final iteration.
  coef_path <- get_coef(sgb_model) %>%
    lapply(function(x) {
      x %>% mutate(iteration = initial_mstop)
    })
  # Walk the earlier iterations. rev(seq_len(n - 1)) is empty when
  # initial_mstop == 1, unlike (mstop - 1):1 which would iterate 0:1.
  for (i in rev(seq_len(initial_mstop - 1))) {
    # Compute the coefficients at iteration i once and reuse both components.
    coef_i <- get_coef(sgb_model[i])
    coef_path$raw <- coef_path$raw %>%
      dplyr::bind_rows(coef_i$raw %>% mutate(iteration = i))
    coef_path$aggregated <- coef_path$aggregated %>%
      dplyr::bind_rows(coef_i$aggregated %>% mutate(iteration = i))
  }
  return(coef_path)
}
67 changes: 67 additions & 0 deletions R/get_varimp.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#' Variable importance of a sparse-group boosting model
#'
#' @description
#' Variable importance is computed as relative reduction of loss-function attributed
#' to each predictor (groups and individual variables).
#' Returns a list of two data.frames. The first contains the variable importance
#' of a sparse-group model in a data.frame for each predictor.
#' The second one contains the aggregated relative importance of all groups vs. individual variables.
#'
#' @param sgb_model Model of type `mboost` to compute the variable importance for.
#' @importFrom dplyr filter mutate %>%
#' @importFrom stringr str_detect
#' @importFrom mboost varimp
#' @importFrom rlang .data
#'
#' @return List of two data.frames. `$varimp` contains the name of the variables, group structure and
#' variable importance on both group and individual variable basis.
#' `$group_importance` contains the aggregated relative importance of all
#' group baselearners and of all individual variables.
#' @export
#' @seealso [mboost::varimp()] which this function uses.
#'
#' @examples
#' library(mboost)
#' library(dplyr)
#' set.seed(1)
#' df <- data.frame(
#'   x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100),
#'   x4 = rnorm(100), x5 = runif(100)
#' )
#' df <- df %>%
#'   mutate_all(function(x) {
#'     as.numeric(scale(x))
#'   })
#' df$y <- df$x1 + df$x4 + df$x5
#' group_df <- data.frame(
#'   group_name = c(1, 1, 1, 2, 2),
#'   var_name = c("x1", "x2", "x3", "x4", "x5")
#' )
#'
#' sgb_formula <- as.formula(create_formula(alpha = 0.3, group_df = group_df))
#' sgb_model <- mboost(formula = sgb_formula, data = df)
#' sgb_varimp <- get_varimp(sgb_model)
get_varimp <- function(sgb_model) {
  # inherits() also accepts subclasses of mboost (e.g. glmboost objects).
  stopifnot("Model must be of class mboost" = inherits(sgb_model, "mboost"))
  sgb_varimp <- mboost::varimp(sgb_model) %>%
    as.data.frame() %>%
    dplyr::rename("predictor" = "variable") %>%
    # Keep only baselearners that actually contributed to the loss reduction.
    dplyr::filter(.data$reduction != 0) %>%
    dplyr::mutate(
      # A comma in the predictor label means several variables, i.e. a group.
      type = dplyr::case_when(
        stringr::str_detect(.data$predictor, ",") ~ "group",
        TRUE ~ "individual"
      ),
      predictor = as.character(.data$predictor),
      blearner = as.character(.data$blearner)
    ) %>%
    # Normalize so that the importances of all selected predictors sum to one.
    dplyr::mutate(relative_importance = .data$reduction / sum(.data$reduction)) %>%
    dplyr::arrange(-.data$relative_importance)
  # Share of total importance attributable to groups vs. individual variables.
  group_importance <- sgb_varimp %>%
    dplyr::group_by(.data$type) %>%
    dplyr::summarize(importance = sum(.data$relative_importance)) %>%
    dplyr::arrange(-.data$importance)
  return(list(varimp = sgb_varimp, group_importance = group_importance))
}

0 comments on commit bbd72d3

Please sign in to comment.