diff --git a/vignettes/summary-statistics.R b/vignettes/summary-statistics.R index 999a255..9f9963d 100644 --- a/vignettes/summary-statistics.R +++ b/vignettes/summary-statistics.R @@ -41,7 +41,6 @@ knitr::opts_chunk$set(collapse = TRUE) #' original numeric values in `cyl`, a `character` version, and a `factor` version. set.seed(42) library(magrittr) -library(dplyr) library(qwraps2) # define the markup language we are working in. @@ -132,24 +131,30 @@ n_perc(mtcars2$cyl %in% c(4, 6)) #' #' Let $\left\{x_1, x_2, x_3, \ldots, x_n \right\}$ be a sample of size $n$ with #' $x_i > 0$ for all $i.$ Then the geometric mean, $\mu_g,$ and geometric standard -#' deviation are in Equation \@ref(eq:geometricmean) and \@ref(eq:geometricsd) -#' respectively. +#' deviation are #' #' $$ #' \begin{equation} -#' (\#eq:geometricmean) -#' \mu_g = \left( \prod_{i = 1}^{n} x_i \right)^{\frac{1}{n}} = b^{ \sum_{i = 1}^{n} \log_{b} x_i } +#' \mu_g = \left( \prod_{i = 1}^{n} x_i \right)^{\frac{1}{n}} = b^{ \frac{1}{n} \sum_{i = +#' 1}^{n} \log_{b} x_i }, #' \end{equation} #' $$ -#' +#' and #' $$ #' \begin{equation} -#' (\#eq:geometricsd) #' \sigma_g = b ^ { #' \sqrt{ \frac{\sum_{i = 1}^{n} \left( \log_{b} \frac{x_i}{\mu_g} #' \right)^2}{n}}} #' \end{equation} #' $$ +#' or, for clarity, +#' $$ +#' \begin{equation} +#' \log_{b} \sigma_g = +#' \sqrt{ \frac{\sum_{i = 1}^{n} \left( \log_{b} \frac{x_i}{\mu_g} +#' \right)^2}{n}} +#' \end{equation} +#' $$ #' #' When looking for the geometric standard deviation in R, the simple #' `exp(sd(log(x)))` is not exactly correct. Note that in @@ -209,16 +214,17 @@ gmean_sd(x) #' The function `summary_table`, along with some `dplyr` functions will do the work #' for us. `summary_table` takes two arguments: #' -#' 1. `.data` a (`grouped_df`) data.frame +#' 1. `x` a (`grouped_df`) data.frame. #' 2. `summaries` a list of summaries. This is a list-of-lists. The outer list #' defines the row groups and the inner lists define the specific summaries.
+#' The default is generated by the `qsummary` function. #' args(summary_table) #' #' Let's build a list-of-lists to pass to the `summaries` argument of #' `summary_table`. The inner lists are named `formula`e defining the wanted -#' summary. These `formula`e are passed through `dplyr::summarize_` to generate +#' summary. These `formula`e are passed through `dplyr::summarize` to generate #' the table. The names are important, as they are used to label row groups and row #' names in the table. The arguments for the functions below use the `.data` #' pronoun for tidy evaluation (see `help(topic = ".data", package = "rlang")`). @@ -260,6 +266,7 @@ by_cyl #' #' To report a table with both the whole sample summary and conditional columns #' together: +#+results = "asis" both <- cbind(whole, by_cyl) both @@ -285,13 +292,17 @@ print(both, #' defined by `qsummary`. The purpose of `qsummary` is to provide the same #' summary for all numeric variables within a data.frame and a single style of #' summary for categorical variables within the data.frame. For example, the -#' default summary for the `mtcars2` data set is -qsummary(mtcars2) +#' default summary for a set of variables from the `mtcars2` data set is +mtcars2 %>% + dplyr::select(.data$mpg, .data$cyl_factor, .data$wt) %>% + qsummary(.) #' #' That default summary is used for a table as follows: -#+label="summary_table_mtcars2_default", result = "asis" -summary_table(mtcars2) +#+label="summary_table_mtcars2_default", results = "asis" +mtcars2 %>% + dplyr::select(.data$mpg, .data$cyl_factor, .data$wt) %>% + summary_table(.) #' #' Now, say we want to only report the minimum and maximum for each of the @@ -301,13 +312,13 @@ summary_table(mtcars2) #' Note that when defining the list of numeric_summaries that the argument place #' holder is the `%s` character.
new_summary <- - qsummary(mtcars2, + mtcars2 %>% + dplyr::select(.data$mpg, .data$cyl_factor, .data$wt) %>% + qsummary(., numeric_summaries = list("Minimum" = "~ min(%s)", "Maximum" = "~ max(%s)"), n_perc_args = list(digits = 1, show_symbol = TRUE, show_denom = "always")) -new_summary - #' #' The resulting table is: #+results = "asis" @@ -315,7 +326,10 @@ summary_table(mtcars2, new_summary) #' #' The summary can easily be used on a grouped `data.frame`. -summary_table(dplyr::group_by(mtcars2, .data$am), new_summary) +#+results = "asis" +mtcars2 %>% + dplyr::group_by(.data$am) %>% + summary_table(., new_summary) #' #' ## Adding P-values to a Summary Table @@ -369,13 +383,6 @@ a[grepl("Forward Gears", a)] %<>% sub("  \\ \\|$", paste(fpval, "|"), #+ results = "asis" cat(a, sep = "\n") -#' -#' ## Closing Note on `summary_table` and `tab_summary`. -#' -#' I encourage you, the end user, to use `summary_table` primarily, and use -#' `tab_summary` as a quick tool for generating a script. It might be best if -#' you use `tab_summary` to generate a template of the `formula`e you will want, -#' copy the template into your script and edit accordingly. #' #' # Session Info print(sessionInfo(), local = FALSE) diff --git a/vignettes/summary-statistics.Rmd b/vignettes/summary-statistics.Rmd index a4bdef2..d93716d 100644 --- a/vignettes/summary-statistics.Rmd +++ b/vignettes/summary-statistics.Rmd @@ -42,7 +42,6 @@ original numeric values in `cyl`, a `character` version, and a `factor` version. ```{r } set.seed(42) library(magrittr) -library(dplyr) library(qwraps2) # define the markup language we are working in. @@ -149,24 +148,30 @@ n_perc(mtcars2$cyl %in% c(4, 6)) Let $\left\{x_1, x_2, x_3, \ldots, x_n \right\}$ be a sample of size $n$ with $x_i > 0$ for all $i.$ Then the geometric mean, $\mu_g,$ and geometric standard -deviation are in Equation \@ref(eq:geometricmean) and \@ref(eq:geometricsd) -respectively. 
+deviation are $$ \begin{equation} - (\#eq:geometricmean) - \mu_g = \left( \prod_{i = 1}^{n} x_i \right)^{\frac{1}{n}} = b^{ \sum_{i = 1}^{n} \log_{b} x_i } + \mu_g = \left( \prod_{i = 1}^{n} x_i \right)^{\frac{1}{n}} = b^{ \frac{1}{n} \sum_{i = + 1}^{n} \log_{b} x_i }, \end{equation} $$ - +and $$ \begin{equation} - (\#eq:geometricsd) \sigma_g = b ^ { \sqrt{ \frac{\sum_{i = 1}^{n} \left( \log_{b} \frac{x_i}{\mu_g} \right)^2}{n}}} \end{equation} $$ +or, for clarity, +$$ +\begin{equation} + \log_{b} \sigma_g = + \sqrt{ \frac{\sum_{i = 1}^{n} \left( \log_{b} \frac{x_i}{\mu_g} + \right)^2}{n}} +\end{equation} +$$ When looking for the geometric standard deviation in R, the simple `exp(sd(log(x)))` is not exactly correct. Note that in @@ -235,9 +240,10 @@ and by number of cylinders. The function `summary_table`, along with some `dplyr` functions will do the work for us. `summary_table` takes two arguments: -1. `.data` a (`grouped_df`) data.frame +1. `x` a (`grouped_df`) data.frame. 2. `summaries` a list of summaries. This is a list-of-lists. The outer list defines the row groups and the inner lists define the specific summaries. + The default is generated by the `qsummary` function. ```{r } @@ -247,7 +253,7 @@ args(summary_table) Let's build a list-of-lists to pass to the `summaries` argument of `summary_table`. The inner lists are named `formula`e defining the wanted -summary. These `formula`e are passed through `dplyr::summarize_` to generate +summary. These `formula`e are passed through `dplyr::summarize` to generate the table. The names are important, as they are used to label row groups and row names in the table. The arguments for the functions below use the `.data` pronoun for tidy evaluation (see `help(topic = ".data", package = "rlang")`).
@@ -298,7 +304,7 @@ by_cyl To report a table with both the whole sample summary and conditional columns together: -```{r } +```{r results = "asis"} both <- cbind(whole, by_cyl) both ``` @@ -328,17 +334,21 @@ By default, calling `summary_table` will use the default summary metrics defined by `qsummary`. The purpose of `qsummary` is to provide the same summary for all numeric variables within a data.frame and a single style of summary for categorical variables within the data.frame. For example, the -default summary for the `mtcars2` data set is +default summary for a set of variables from the `mtcars2` data set is ```{r } -qsummary(mtcars2) +mtcars2 %>% + dplyr::select(.data$mpg, .data$cyl_factor, .data$wt) %>% + qsummary(.) ``` That default summary is used for a table as follows: -```{r label="summary_table_mtcars2_default", result = "asis"} -summary_table(mtcars2) +```{r label="summary_table_mtcars2_default", results = "asis"} +mtcars2 %>% + dplyr::select(.data$mpg, .data$cyl_factor, .data$wt) %>% + summary_table(.) ``` @@ -351,12 +361,12 @@ holder is the `%s` character. ```{r } new_summary <- - qsummary(mtcars2, + mtcars2 %>% + dplyr::select(.data$mpg, .data$cyl_factor, .data$wt) %>% + qsummary(., numeric_summaries = list("Minimum" = "~ min(%s)", "Maximum" = "~ max(%s)"), n_perc_args = list(digits = 1, show_symbol = TRUE, show_denom = "always")) - -new_summary ``` @@ -369,8 +379,10 @@ summary_table(mtcars2, new_summary) The summary can easily be used on a grouped `data.frame`. -```{r } -summary_table(dplyr::group_by(mtcars2, .data$am), new_summary) +```{r results = "asis"} +mtcars2 %>% + dplyr::group_by(.data$am) %>% + summary_table(., new_summary) ``` @@ -437,13 +449,6 @@ cat(a, sep = "\n") ``` -## Closing Note on `summary_table` and `tab_summary`. - -I encourage you, the end user, to use `summary_table` primarily, and use -`tab_summary` as a quick tool for generating a script.
It might be best if -you use `tab_summary` to generate a template of the `formula`e you will want, -copy the template into your script and edit accordingly. - # Session Info ```{r }