Simplified Summarize

fishR-Core-Team · Aug 22, 2016 · 37f319e · 37f319e
1 parent 7665e56
commit 37f319e
Show file tree

Hide file tree

Showing 6 changed files with 160 additions and 451 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: FSA
 Title: Simple Fisheries Stock Assessment Methods
 Version: 0.8.9
-Date: 2016-7-16
+Date: 2016-8-21
 Description: A variety of simple fish stock assessment methods.
     Detailed vignettes are available on the fishR website <http://derekogle.com/fishR/>.
 Authors@R: person("Derek","Ogle",email="derek@derekogle.com",role=c("aut","cre"))

diff --git a/NEWS.md b/NEWS.md
@@ -1,8 +1,9 @@
 # FSA 0.8.9 ongoing
+* `ageComparison()`: Modified.  Removed an internal call to `fact2num()` because of changes to `Summarize()` below.  Should not impact user experience.
 * `diags()`: Added.
 * `gompertzFuns()`: Modified.  Fixed some spacing around the message when `msg=TRUE`.
 * `logisticFuns()`: Modified.  Fixed some spacing around the message when `msg=TRUE`.
-* `Summarize()`: Modified.  Removed all `exclude=` arguments within calls to `table()` to address issues that will be created with forthcoming R v3.4.0 (per notice from CRAN).  Added more examples and tests.
+* `Summarize()`: Modified.  Removed all uses where the main variable was a factor (this functionality was largely unneeded and unused, was inelegant and difficult to maintain).  Removed pass-through to `summary`.  Removed warnings about the RHS variables being converted to factors.  Columns for "levels" of the RHS variables are now returned in their original model (i.e., if the variable was numeric in the original data.frame it is now numeric in the data.frame returned from this function) -- this should reduce need for using `fact2num()` when using the results of this function for variables that were originally numeric.  Added more examples and tests for the numeric data.
 
 # FSA 0.8.8 18-Jul-16
 * `growthFunShow()`: Modified.  Added Pauly et al. (1992) seasonal cessation function.  Added `case=` for use with Schnute model.

diff --git a/R/Summarize.R b/R/Summarize.R
@@ -1,98 +1,69 @@
-#' @title Summary statistics for a numeric or factor variable.
+#' @title Summary statistics for a numeric variable.
 #'
-#' @description Summary statistics for a single numeric or factor variable, possibly separated by the levels of a factor variable.  This function is very similar to \code{\link[base]{summary}} for a numeric variables and \code{\link[base]{table}} for factor variables.
+#' @description Summary statistics for a single numeric variable, possibly separated by the levels of a factor variable or variables.  This function is very similar to \code{\link[base]{summary}} for a numeric variable.
 #'
-#' @details This function is primarily used with formulas.  Five general types of formulae may be used (where \code{quant} and \code{factor} generically represent quantitative/numeric and factor variables, respectively)
+#' @details This function is primarily used with formulas of the following types (where \code{quant} and \code{factor} generically represent quantitative/numeric and factor variables, respectively):
 #' \tabular{ll}{
 #'   Formula \tab Description of Summary \cr
 #'   \code{~quant} \tab Numerical summaries (see below) of \code{quant}.\cr
-#'   \code{~factor} \tab One-way frequency or percentage (see below) table of \code{factor}.\cr
 #'   \code{quant~factor} \tab Summaries of \code{quant} separated by levels in \code{factor}.\cr
 #'   \code{quant~factor1*factor2} \tab Summaries of \code{quant} separated by the combined levels in \code{factor1} and \code{factor2}.\cr
-#'   \code{factor1~factor2} \tab Two-way frequency or percentage table with levels of \code{factor2} as rows and \code{factor1} as columns.\cr
 #' }
 #' 
 #' Numerical summaries include all results from \code{\link[base]{summary}} (min, Q1, mean, median, Q3, and max) and the sample size, valid sample size (sample size minus number of \code{NA}s), and standard deviation (i.e., \code{sd}).  \code{NA} values are removed from the calculations with \code{na.rm=TRUE} (the DEFAULT).  The number of digits in the returned results are controlled with \code{digits=}.
 #' 
-#' Factor variables may be summarized as a frequency (if \code{percent="none"}) or percentages table (the DEFAULT).  For a single factor variable, the percentages table is returned if \code{percent="total"}.  For two factor variables, the percentage table may be returned as a row-, column-, or table-percent table with \code{percent="row"} (the DEFAULT), \code{percent="column"}, and \code{percent="total"}, respectively.   The number of digits in the returned table are controlled with \code{percdigs=}.  A marginal total, either for all margins if \code{percent="none"} or the appropriate margin otherwise, is added to the table if \code{addtotal=TRUE}.  The results for a factor are NOT meant to replace \code{\link[base]{table}} or \code{\link[stats]{xtabs}}.  This functionality is provided to make this function more complete.
-#'
 #' @note Students often need to examine basic statistics of a quantitative variable separated for different levels of a categorical variable.  These results may be obtained with \code{\link[base]{tapply}}, \code{\link[base]{by}}, or \code{\link[stats]{aggregate}} (or with functions in other packages), but the use of these functions is not obvious to newbie students or return results in a format that is not obvious to newbie students.  Thus, the formula method to \code{Summarize} allows newbie students to use a common notation (i.e., formula) to easily compute summary statistics for a quantitative variable separated by the levels of a factor.
 #'
 #' @aliases Summarize Summarize.default Summarize.formula
 #'
-#' @param object A vector of numeric or factor data.
-#' @param digits A numeric that indicates the number of decimals to round the numeric summaries.
-#' @param addtotal A logical that indicates whether totals should be added to tables (\code{=TRUE}, default) or not.  See details.
-#' @param percent A string that indicates the type of percents to compute for tables from factor variables.  See details.
-#' @param percdigs A numeric that indicates the number of decimals to round the percentage summaries.
+#' @param object A vector of numeric data.
+#' @param digits A single numeric that indicates the number of decimals to round the numeric summaries.
 #' @param na.rm A logical that indicates whether numeric missing values (\code{NA}) should be removed (\code{=TRUE}, default) or not.
 #' @param exclude A string that contains the level that should be excluded from a factor variable.
-#' @param data An optional data frame that contains the variables in \code{formula}.
+#' @param data A data.frame that contains the variables in \code{formula}.
 #' @param \dots Not implemented.
 #'
-#' @return A named vector or data frame (when a quantitative variable is separted by one or two factor variables) of summary statistics for numeric data and a matrix of frequencies and, possibly, percentages for factor variables.
+#' @return A named vector or data frame (when a quantitative variable is separted by one or two factor variables) of summary statistics for numeric data.
 #'
 #' @author Derek H. Ogle, \email{derek@@derekogle.com}
 #'
-#' @seealso See \code{\link[base]{summary}}, \code{\link[base]{table}}, and \code{\link[stats]{xtabs}} for related one dimensional functionality.  See \code{\link[base]{tapply}}, \code{\link[doBy]{summaryBy}} in \pkg{doBy}, \code{\link[psych]{describe}} in \pkg{psych}, \code{\link[prettyR]{describe}} in \pkg{prettyR}, and \code{\link[fBasics]{basicStats}} in \pkg{fBasics} for similar \dQuote{by} functionality.
+#' @seealso See \code{\link[base]{summary}} for related one dimensional functionality.  See \code{\link[base]{tapply}}, \code{\link[doBy]{summaryBy}} in \pkg{doBy}, \code{\link[psych]{describe}} in \pkg{psych}, \code{\link[prettyR]{describe}} in \pkg{prettyR}, and \code{\link[fBasics]{basicStats}} in \pkg{fBasics} for similar \dQuote{by} functionality.
 #'
 #' @keywords misc
 #'
 #' @examples
-#' ## Create a numeric vector (with missing values)
+#' ## Create a data.frame of "data"
 #' n <- 102
-#' y <- c(0,0,NA,NA,NA,runif(n-5))
-#' ## Create a factor vector (with missing values)
-#' g1 <- factor(sample(c("A","B","C","NA"),n,replace=TRUE))
-#' ## Create a factor vector with unknowns
-#' g2 <- factor(sample(c("male","female","UNKNOWN"),n,replace=TRUE))
-#' # Put into a data.frame (with some extra variables)
-#' d <- data.frame(dy=y,dg1=g1,dg2=g2,
-#'                 dw=sample(1:3,n,replace=TRUE),
-#'                 dv=sample(1:3,n,replace=TRUE))
+#' d <- data.frame(y=c(0,0,NA,NA,NA,runif(n-5)),
+#'                 w=sample(7:9,n,replace=TRUE),
+#'                 v=sample(1:3,n,replace=TRUE),
+#'                 g1=factor(sample(c("A","B","C",NA),n,replace=TRUE)),
+#'                 g2=factor(sample(c("male","female","UNKNOWN"),n,replace=TRUE)),
+#'                 g3=sample(c("a","b","c","d"),n,replace=TRUE),
+#'                 stringsAsFactors=FALSE)
 #' 
 #' # typical output of summary() for a numeric variable
-#' summary(y)   
+#' summary(d$y)   
 #'
 #' # this function           
-#' Summarize(y,digits=3)
-#' Summarize(~dy,data=d,digits=3)
-#' Summarize(dy~1,data=d,digits=3)
-#'
-#' ## Factor vector (excluding "NA"s in second call)
-#' Summarize(~dg1,data=d)
-#' Summarize(~dg1,data=d,exclude="NA")
-#'
-#' ## Factor vector with UNKNOWNs
-#' Summarize(~dg2,data=d)
-#' Summarize(~dg2,data=d,exclude="UNKNOWN")
+#' Summarize(d$y,digits=3)
+#' Summarize(~y,data=d,digits=3)
+#' Summarize(y~1,data=d,digits=3)
 #'
 #' ## Numeric vector by levels of a factor variable
-#' Summarize(dy~dg1,data=d,digits=3)
-#' Summarize(dy~dg1,data=d,digits=3,exclude="NA")
-#' Summarize(dy~dg2,data=d,digits=3)
-#' Summarize(dy~dg2,data=d,digits=3,exclude="UNKNOWN")
+#' Summarize(y~g1,data=d,digits=3)
+#' Summarize(y~g2,data=d,digits=3)
+#' Summarize(y~g2,data=d,digits=3,exclude="UNKNOWN")
 #'
 #' ## Numeric vector by levels of two factor variables
-#' Summarize(dy~dg1+dg2,data=d,digits=3)
-#' Summarize(dy~dg1+dg2,data=d,digits=3,exclude="NA")
-#' Summarize(dy~dg1+dg2,data=d,digits=3,exclude="UNKNOWN")
-#' Summarize(dy~dg1+dg2,data=d,digits=3,exclude=c("NA","UNKNOWN"))
+#' Summarize(y~g1+g2,data=d,digits=3)
+#' Summarize(y~g1+g2,data=d,digits=3,exclude="UNKNOWN")
 #' 
 #' ## What happens if RHS of formula is not a factor
-#' Summarize(dy~dw,data=d,digits=3)
-#' Summarize(y~dw*dv,data=d,digits=3)
-#'
-#' ## Summarize factor variable by a factor variable
-#' Summarize(dg1~dg2,data=d)
-#' Summarize(dg1~dg2,data=d,exclude="NA")
-#' Summarize(dg1~dg2,data=d,exclude=c("NA","UNKNOWN"))
-#' Summarize(dg1~dg2,data=d,percent="none")
-#' Summarize(dg1~dg2,data=d,percent="column")
-#' Summarize(dg1~dg2,data=d,percent="total")
+#' Summarize(y~w,data=d,digits=3)
 #'
-#' ## Summarizing all variables in a data frame
-#' lapply(as.list(d),Summarize,digits=4)
+#' ## Summarizing multiple variables in a data.frame (must reduce to numerics)
+#' lapply(as.list(d[,1:3]),Summarize,digits=4)
 #'
 #' @rdname Summarize
 #' @export
@@ -103,9 +74,7 @@ Summarize <- function(object, ...) {
 #' @rdname Summarize
 #' @export
 Summarize.default <- function(object,digits=getOption("digits"),
-                              addtotal=TRUE,percent=c("total","none"),percdigs=2,
                               na.rm=TRUE,exclude=NULL,...) {
-  percent <- match.arg(percent)
   ## Do some checking on object type
   if (is.data.frame(object)) stop("'Summarize' does not work with a data.frame.",call.=FALSE)
   if (is.matrix(object)) {
@@ -117,39 +86,27 @@ Summarize.default <- function(object,digits=getOption("digits"),
     }
   }
   ## Start processing
-  if (is.numeric(object)) {
-   # quantitative data
-   iSummarizeQ1(object,digits,na.rm)
-  } else if (is.factor(object)) {
-    # Categorical data
-    iSummarizeC1(object,addtotal,percent,percdigs,exclude)
-  } else summary(object) # A pass-through to the original summary function
+  if (!is.numeric(object)) stop("'Summarize' only works with a numeric variable",call.=FALSE)
+  else iSummarizeQ1(object,digits,na.rm)
 }
 
 #' @rdname Summarize
 #' @export
 Summarize.formula <- function(object,data=NULL,digits=getOption("digits"),
-                              percent=c("row","column","total","none"),percdigs=2,
-                              addtotal=TRUE,na.rm=TRUE,exclude=NULL,...) {
-   percent <- match.arg(percent)
+                              na.rm=TRUE,exclude=NULL,...) {
    ## Handle the formula
    tmp <- iHndlFormula(object,data,expNumR=1)
    ## Start Processing
    if (tmp$vnum==1) {
      ## Only one variable ... send first column of model.frame
      ## to summarize.default
-     Summarize(tmp$mf[,1],digits=digits,percent=ifelse(percent=="none","none","total"),
-               percdigs=percdigs,addtotal=addtotal,na.rm=na.rm,exclude=exclude,...)
+     Summarize(tmp$mf[,1],digits=digits,na.rm=na.rm,exclude=exclude,...)
    } else {
      ## More than one variable
      if (!tmp$metExpNumR) stop("Must have one variable on LHS of formula with more than one variable",call.=FALSE)
      if (tmp$Rclass %in% c("numeric","integer")) {
-       ## Quantitative response
        iSummarizeQf(tmp,digits,na.rm,exclude)
-     } else {
-       ## Categorical response
-       iSummarizeCf(tmp,percent,percdigs,addtotal,exclude)
-     }
+     } else stop("'Summarize' only works with a numeric variable on LHS.",call.=FALSE)
    }
 }
 
@@ -177,39 +134,12 @@ iSummarizeQ1 <- function(object,digits,na.rm) {
   round(res,digits)
 }
 
-##############################################################
-## Internal function for vector of categorical data
-##############################################################
-iSummarizeC1 <- function(object,addtotal,percent,percdigs,exclude) {
-  # exclude values if asked to
-  if (!is.null(exclude)) object <- droplevels(object[object!=exclude])
-  # Summary table
-  res <- table(object)
-  # Add percents to table if asked for
-  if (percent!="none") {
-    # Add percents, forms a 2-col matrix
-    res <- cbind(res,round(prop.table(res)*100,percdigs))
-    colnames(res) <- c("freq","perc")                                          
-    # Adds a valid percent column if any of the rownames are NA
-    if (any(rownames(res)=="NA")) {
-      # reorders rownames so that "NA" is last
-      rn <- rownames(res)
-      rn1 <- c(rn[rn!="NA"],"NA")
-      res <- res[rn1,]
-      vp <- c(round(prop.table(table(object,exclude="NA"))*100,percdigs),0)
-      res <- cbind(res,vp)
-      colnames(res) <- c("freq","perc","validPerc")
-    }
-  }
-  # Adds a total of each column to table
-  if (addtotal) res <- stats::addmargins(res,margin=1,FUN=list(Total=sum),quiet=TRUE)
-  res                                                                            
-}
-
 ##############################################################
 ## Internal function for formula with quantitative response
 ##############################################################
 iSummarizeQf <- function(tmp,digits,na.rm,exclude) {
+  ## Set defaults for whether variables should be unfactored or not
+  unfactor1 <- unfactor2 <- FALSE
   ## Exclude values from factor variables if asked to
   if (!is.null(exclude)) {
     for (i in 1:length(exclude)) {
@@ -219,29 +149,15 @@ iSummarizeQf <- function(tmp,digits,na.rm,exclude) {
   ## Get the response variable
   nv <- tmp$mf[,tmp$Rpos]
   ## Make sure LHS is simple enough
-  if (tmp$Enum>2) stop("With a quantitative response (LHS), the RHS\n must contain only one or two factors.",call.=FALSE)
-  ## Handle the explanatory variables
+  if (tmp$Enum>2) stop("With a quantitative response (LHS), the RHS\n may contain only one or two factors.",call.=FALSE)
+  ## Get results for quant variable by each level of interaction variable.
   if (tmp$Enum==1) { ## Only one explanatory variable
     rv <- tmp$mf[,tmp$Enames]
-    if (tmp$Eclass!="factor") {
-      warning("RHS variable was converted to a factor.",call.=FALSE)
-      rv <- factor(rv)
-    }
-    # Get results for quant variable by each level of a single factor variable
     intres <- tapply(nv,rv,Summarize,na.rm=na.rm)
   } else { ## Two explanatory variables
     rv <- tmp$mf[,tmp$Enames[1]]
-    if (tmp$Eclass[1]!="factor") {
-      warning("First RHS variable was converted to a factor.",call.=FALSE)
-      rv <- factor(rv)
-    }
     cv <- tmp$mf[,tmp$Enames[2]]
-    if (tmp$Eclass[2]!="factor") {
-      warning("Second RHS variable was converted to a factor.",call.=FALSE)
-      cv <- factor(cv)
-    }
     iv <- interaction(rv,cv,sep=":")
-    # Get results for quant variable by each level of interaction variable.
     intres <- tapply(nv,iv,Summarize,na.rm=na.rm)
   }
   # Put together as a matrix
@@ -253,45 +169,24 @@ iSummarizeQf <- function(tmp,digits,na.rm,exclude) {
   # remove attribute names on the level labels
   attr(lvl.lbls,"dimnames") <- NULL
   # put together as a data.frame
-  res <- data.frame(lvl.lbls,res)
+  if (tmp$Enum==1) { ## Only one explanatory variable
+    if (tmp$Eclass!="factor") mode(lvl.lbls) <- tmp$Eclass
+      else lvl.lbls <- as.factor(lvl.lbls)
+    res <- data.frame(lvl.lbls,res,stringsAsFactors=FALSE)
+  } else {
+    lvl.lbls1 <- lvl.lbls[,1]
+    lvl.lbls2 <- lvl.lbls[,2]
+    if (tmp$Eclass[1]!="factor") mode(lvl.lbls1) <- tmp$Eclass[1]
+      else lvl.lbls1 <- as.factor(lvl.lbls1)
+    if (tmp$Eclass[2]!="factor") mode(lvl.lbls2) <- tmp$Eclass[2]
+      else lvl.lbls2 <- as.factor(lvl.lbls2)
+    res <- data.frame(lvl.lbls1,lvl.lbls2,res,stringsAsFactors=FALSE)
+  }
   # make sure colnames of new data.frame make sense
   # should be names of explanatory variables and the colnames
   # from the tapply result
   names(res) <- c(tmp$Enames,res.names)
   # eliminate row names
   rownames(res) <- NULL
   res
-}
-
-##############################################################
-## Internal function for formula with categorical response
-##############################################################
-iSummarizeCf <- function(tmp,percent,percdigs,addtotal,exclude) {
-  ## Exclude values if asked to
-  if (!is.null(exclude)) {
-    for (i in 1:length(exclude)) tmp$mf <- droplevels(tmp$mf[apply(tmp$mf!=exclude[i],1,all),])
-  }
-  ## Get response variable
-  rv <- tmp$mf[,tmp$Rpos]
-  ## Check RHS has only one variable
-  if (tmp$Enum>1) stop("With a categorical response (LHS), the RHS\n must contain only one factor.",call.=FALSE)
-  ## Get explanatory variable, make sure it is a factor
-  ev <- tmp$mf[,tmp$Enames[1]]
-  if (tmp$Eclass!="factor") {
-    warning("RHS variable was converted to a factor.\n",call.=FALSE)
-    ev <- as.factor(ev)
-  }
-  res <- table(ev,rv,dnn=list(tmp$Enames,tmp$Rname))
-  if (percent!="none") {
-    if (percent=="total") mrgn=NULL
-    else mrgn <- c(1,2)[which(percent==c("row","column"))]
-    res <- prop.table(res,margin=mrgn)*100
-  }
-  if (addtotal) {
-    if(percent %in% c("total","none")) mrgn <- 1:2
-    else mrgn <- c(2,1)[which(percent==c("row","column"))]
-    res <- stats::addmargins(res,margin=mrgn,FUN=list(Total=sum),quiet=TRUE)
-  }
-  if (percent!="none") res <- round(res,digits=percdigs)                                                  
-  res
 }
diff --git a/R/ageComparisons.R b/R/ageComparisons.R
@@ -251,7 +251,7 @@ summary.ageBias <- function(object,
 #=============================================================
 iAgeBiasDF <- function(tmp,cname,diff,min.n.CI,sig.level) {
   # Ages of cdata strux
-  x <- fact2num(tmp[,1])
+  x <-tmp[,1]
   # Is it appropriate to compute a CI
   canCI <- tmp$n>=min.n.CI & tmp$sd>0
   # Fill SEs, p-values calcs, and CIs with NA (will leave NA