diff --git a/DESCRIPTION b/DESCRIPTION index 088f326..76b44f8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,20 +1,21 @@ Package: cvTools Type: Package Title: Cross-validation tools for regression models -Version: 0.1.1 -Date: 2011-12-07 +Version: 0.2.0 +Date: 2012-02-13 Author: Andreas Alfons Maintainer: Andreas Alfons Depends: R (>= 2.11.0), lattice, robustbase -Description: Tools that allow developers to write functions - for cross-validation with minimal programming effort - and assist users with model selection. +Imports: lattice, robustbase, stats +Description: Tools that allow developers to write functions for + cross-validation with minimal programming effort and assist + users with model selection. License: GPL (>= 2) LazyLoad: yes -Collate: 'accessors.R' 'aggregate.R' 'bwplot.R' 'cost.R' 'cvExamples.R' - 'cvFit.R' 'cvFolds.R' 'cvReshape.R' 'cvSelect.R' 'cvTool.R' - 'cvTuning.R' 'densityplot.R' 'dotplot.R' 'plot.R' 'print.R' - 'subset.R' 'summary.R' 'utils.R' 'xyplot.R' -Packaged: 2011-12-07 00:15:47 UTC; andi +Collate: 'accessors.R' 'aggregate.R' 'bwplot.R' 'cost.R' 'cvFit.R' + 'cvFolds.R' 'cvReshape.R' 'cvSelect.R' 'cvTool.R' 'cvTuning.R' + 'densityplot.R' 'dotplot.R' 'plot.R' 'print.R' 'repCV.R' + 'selectBest.R' 'subset.R' 'summary.R' 'utils.R' 'xyplot.R' +Packaged: 2012-02-13 12:16:53 UTC; andi Repository: CRAN -Date/Publication: 2011-12-07 17:07:24 +Date/Publication: 2012-02-13 12:46:49 diff --git a/MD5 b/MD5 index b5d2620..e6d1907 100644 --- a/MD5 +++ b/MD5 @@ -1,29 +1,29 @@ -fc040270f59e000d737131e2b62d01db *DESCRIPTION -09f63cf33f60f752d33ada7aa00cef61 *NAMESPACE -19be3e29185ca04d81133a9659f53317 *NEWS -48b2c3810a8fa90471c4fb6b6b788287 *R/accessors.R -c68ea945f1d98b9627184222b4778e7e *R/aggregate.R -e7c1113e14898697bf4df600190488ca *R/bwplot.R -9242c02087bdbf1ef6de9063204d01db *R/cost.R -c8918b91b5a1ebcf3936d735331e6cd2 *R/cvExamples.R -363bb51711414f00f592c27762fa7dfd *R/cvFit.R -955411a4d1383a14960fd7e600e42f18 *R/cvFolds.R 
-5c16b115a94f3f32febef99071c3ed27 *R/cvReshape.R -968bb75ef4547e519a001b753aff8c7a *R/cvSelect.R -2006b0398493afc3b078eb2ffcb15d1c *R/cvTool.R -5502a11380c02e285d7f5909aa0ee639 *R/cvTuning.R -9c0195b959a2bb550f7d4ab02213cb7c *R/densityplot.R -140d36e327ab204880902dea1e0faead *R/dotplot.R -3f47fa5b135279093e3d104a13028481 *R/plot.R -7505ece24c6dbd1c721dbdd7c7fd57c6 *R/print.R -fe6b97561c271e8416ad2f8a9e669aeb *R/subset.R -6450e0cd3117c8c262fb51dff4acd43c *R/summary.R -e4d7f1157a58ac610fb9b64d119dd7df *R/utils.R -930e428a62aa12bd92b3284e0a01ba2d *R/xyplot.R +573d9de879faf2769d8d184ba369d017 *DESCRIPTION +3dfbab2b529557b78684f7246de5ed96 *NAMESPACE +13816a1032489b3be4aa53aa7a7f96b9 *NEWS +b397301b6de28042cd36de56b50ce145 *R/accessors.R +38b3a4b29c70ac2da562c21d00691c3c *R/aggregate.R +0c07168914e5627f96633e5dff9138e3 *R/bwplot.R +5ccfa919b733b028d5dabe8ec847a502 *R/cost.R +edecaa067584204843f2abfc0a124115 *R/cvFit.R +24398fd63f63f6438b805663588163f0 *R/cvFolds.R +91ff198fd41846d46fe00de82f7d686d *R/cvReshape.R +a86e39c97e83a9f282b87fffbb0546ad *R/cvSelect.R +74457ff168a8944717d900a486453546 *R/cvTool.R +5afeae9df0fd300f7704d0576606f236 *R/cvTuning.R +e4fdd61e76c8343e764aa239570bfa19 *R/densityplot.R +8d546cc39ae0b64871ccbaba493ab8c0 *R/dotplot.R +e649978d04434760d0e622c066e8d38a *R/plot.R +dcb0694d7a8837b2912ca5194fc6755e *R/print.R +07f4f56b8fde6df81e7920b4c6294c52 *R/repCV.R +14aea30357b3ca1031c8ab6c45731a03 *R/selectBest.R +f33204bb96e326b4327b0517b943918d *R/subset.R +a47692a042ebfe144e37b3d9fa5678be *R/summary.R +abe4bec9e985980935d52c0ed3965a79 *R/utils.R +c6ecc2da7c715f45ca435e7ef4918067 *R/xyplot.R a7fb93ab8b8c87287ee6ae3532e4e084 *inst/doc/examples/example-accessors.R 96c17ca35116c5919d0a4b920e6faa6b *inst/doc/examples/example-aggregate.R 7e93e37e153121cfdac1595571d540b0 *inst/doc/examples/example-bwplot.R -91fa09a4b5fe431149023f32cd24bdfc *inst/doc/examples/example-cvExamples.R 6ed9c05c84a4f14609db195d00973eae *inst/doc/examples/example-cvFit.R 
1ba5d997426d14332ef3e5b9d2ed0615 *inst/doc/examples/example-cvReshape.R a03e81f822a631f313e955e132a33b66 *inst/doc/examples/example-cvSelect.R @@ -32,24 +32,25 @@ a03e81f822a631f313e955e132a33b66 *inst/doc/examples/example-cvSelect.R c3d7eedf0be5f2bb054397e7cd7fd742 *inst/doc/examples/example-densityplot.R 5bbb84a057d8895cf7b841b409d33cc6 *inst/doc/examples/example-dotplot.R ef335dd66da19918ce14c845a881dce7 *inst/doc/examples/example-plot.R +16d5323935a11f27615ffdbe7887a191 *inst/doc/examples/example-repCV.R 331c3a0cf8fbedb337fb28d8d8447656 *inst/doc/examples/example-subset.R 09ff2ea3c53522657e74e01ffa7096a6 *inst/doc/examples/example-summary.R 0f3e66e0c10f34c66c4dc03aea3bc4ff *inst/doc/examples/example-xyplot.R -be6185c1028975e4e5c3c6fd18af5a5e *man/accessors.Rd -3524598663d42b47033adc6bacd5b0a6 *man/aggregate.cv.Rd -31c631cfd97fa2a05fc7b965e85742c9 *man/bwplot.cv.Rd -91ae69b7a16565e3bbe68ebce06bcefb *man/cost.Rd -b20d201b4d21c2fd98de29c5d7641dcc *man/cvExamples.Rd -076f5f90aeb28478fd60928c401f5313 *man/cvFit.Rd -3f8e6b06af39a887dccf57313e32bbab *man/cvFolds.Rd -ec22fe85e4c369d3bb39b1e47e4e5f08 *man/cvReshape.Rd -36b2534a6791129e4b53ee3f92c99d63 *man/cvSelect.Rd -37fd0a86766164550615de28c1a608c2 *man/cvTool.Rd -662b7d4f9b4d93d4cf52c94e8cb79293 *man/cvTools-package.Rd -8c50fa742d6bb9df66a319661de391a1 *man/cvTuning.Rd -5c318a3eb87222d45b88d44409a00a3f *man/densityplot.cv.Rd -aad5bcb236f1525ec16c49231d3efdfb *man/dotplot.cvSelect.Rd -38708544ed1937f12776aab5b4c70200 *man/plot.cv.Rd -43a068f6a1de41cd04d490d2b5d85aa0 *man/subset.cv.Rd -62bf8cc5e7c8ee3f81d21d7fe1cb0ebd *man/summary.cv.Rd -c7a712546a3b40294f26aae6bf3c2587 *man/xyplot.cvSelect.Rd +be915e96939dffdcd8b6e4b0b18bcca6 *man/accessors.Rd +04a6badc4198b636bb1748bf66abbe3c *man/aggregate.cv.Rd +ac323b29b7a1b77e70469fe7dff488c2 *man/bwplot.cv.Rd +2f1283c243c880f1be5a4c57c3f40e8e *man/cost.Rd +e7b5173b7ba6dd4c31c00aae1702d137 *man/cvFit.Rd +c3455f9c21a42fce405f2b739c8f99c8 *man/cvFolds.Rd 
+d522a615775388e3cd7326fb9bb70ab8 *man/cvReshape.Rd +606429e762e9b2820cc98366d2f8cbf8 *man/cvSelect.Rd +6988e084820a825fde952e4553351958 *man/cvTool.Rd +a037ae4b626e316ba8b3531a52dde51e *man/cvTools-package.Rd +90ca3ce5b98c2db1ce24a93039ec182b *man/cvTuning.Rd +1f1e5620a7f0540c04a0fdb00791b050 *man/densityplot.cv.Rd +8c31e99ae44736c8f8c9e0ffb3f2fcd8 *man/dotplot.cv.Rd +fa9ee4c80126a4530aeef5f01c5afc1b *man/plot.cv.Rd +02cea1c011bbcfb20cf8dd4d01f63eca *man/repCV.Rd +c1215e7f730038ccf0a241fdaf961ce7 *man/subset.cv.Rd +7543745fa1ff706e65d790c5ecd8b9b6 *man/summary.cv.Rd +80a2e79f72aa22aab6c1e3ecf4db8bbf *man/xyplot.cv.Rd diff --git a/NAMESPACE b/NAMESPACE index f49ac40..06c494f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,26 +1,3 @@ -export("cvNames<-") -export("fits<-") -export(cvFit) -export(cvFolds) -export(cvLm) -export(cvLmrob) -export(cvLts) -export(cvNames) -export(cvReshape) -export(cvSelect) -export(cvTool) -export(cvTuning) -export(fits) -export(mape) -export(mspe) -export(ncv) -export(nfits) -export(rmspe) -export(rtmspe) -export(tmspe) -import(lattice) -import(robustbase) -import(stats) S3method("cvNames<-",cv) S3method("cvNames<-",cvSelect) S3method("fits<-",cv) @@ -41,6 +18,7 @@ S3method(cvTuning,"function") S3method(cvTuning,call) S3method(densityplot,cv) S3method(densityplot,cvSelect) +S3method(dotplot,cv) S3method(dotplot,cvSelect) S3method(fits,cv) S3method(fits,cvSelect) @@ -58,10 +36,38 @@ S3method(print,cvTuning) S3method(print,summary.cv) S3method(print,summary.cvSelect) S3method(print,summary.cvTuning) +S3method(repCV,lm) +S3method(repCV,lmrob) +S3method(repCV,lts) S3method(subset,cv) S3method(subset,cvSelect) S3method(summary,cv) S3method(summary,cvSelect) S3method(summary,cvTuning) +S3method(xyplot,cv) S3method(xyplot,cvSelect) S3method(xyplot,cvTuning) +export("cvNames<-") +export("fits<-") +export(cvFit) +export(cvFolds) +export(cvLm) +export(cvLmrob) +export(cvLts) +export(cvNames) +export(cvReshape) +export(cvSelect) +export(cvTool) 
+export(cvTuning) +export(fits) +export(mape) +export(mspe) +export(ncv) +export(nfits) +export(repCV) +export(rmspe) +export(rtmspe) +export(tmspe) +import(lattice) +import(robustbase) +import(stats) diff --git a/NEWS b/NEWS index ca71b04..f37a054 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,25 @@ +Changes in cvTools version 0.2.0 + + + New generic function repCV() for (repeated) K-fold cross-validation. The + example functions cvLm(), cvLmrob() and cvLts() are now aliases for the + respective methods. + + + Bugfix in cvFit(): default names are retained if cvTool() returns a + 1x1 matrix. + + + Bugfix in plot.cvSelect(): default handling of argument 'method' no + longer throws error in case of only one replication. + + + Added functionality to incorporate standard errors. + + + cvSelect() and cvTuning() now allow to specify criterion for selecting + the best model via the argument 'selectBest'. + + + New xyplot() and dotplot() methods for class "cv". + + + Changes in cvTools version 0.1.1 + Added different types of obtaining CV folds. 
+ \ No newline at end of file diff --git a/R/accessors.R b/R/accessors.R index ded2ab4..6db02ba 100644 --- a/R/accessors.R +++ b/R/accessors.R @@ -1,6 +1,6 @@ # ---------------------- # Author: Andreas Alfons -# K.U.Leuven +# KU Leuven # ---------------------- #' Access or set information on cross-validation results @@ -63,7 +63,7 @@ cvNames.cvSelect <- function(x) names(x$cv)[-1] #' @S3method cvNames<- cv "cvNames<-.cv" <- function(x, value) { object <- x - names(object$cv) <- value + names(object$cv) <- names(object$sd) <- value if(!is.null(x$reps)) colnames(object$reps) <- value eval.parent(substitute(x <- object)) } @@ -73,7 +73,7 @@ cvNames.cvSelect <- function(x) names(x$cv)[-1] object <- x names(object$best) <- value value <- c("Fit", value) - names(object$cv) <- value + names(object$cv) <- names(object$sd) <- value if(!is.null(x$reps)) names(object$reps) <- value eval.parent(substitute(x <- object)) } @@ -104,7 +104,7 @@ fits.cvSelect <- function(x) x$cv$Fit "fits<-.cvSelect" <- function(x, value) { object <- x if(is.factor(value)) value <- factor(as.character(value), levels=value) - object$cv$Fit <- value + object$cv$Fit <- object$sd$Fit <- value if(!is.null(reps <- x$reps)) { indices <- match(reps$Fit, x$cv$Fit, nomatch=0) object$reps$Fit <- value[indices] diff --git a/R/aggregate.R b/R/aggregate.R index e06900a..009bc08 100644 --- a/R/aggregate.R +++ b/R/aggregate.R @@ -1,6 +1,6 @@ # ---------------------- # Author: Andreas Alfons -# K.U.Leuven +# KU Leuven # ---------------------- #' Aggregate cross-validation results diff --git a/R/bwplot.R b/R/bwplot.R index d5e3d0f..65702de 100644 --- a/R/bwplot.R +++ b/R/bwplot.R @@ -1,6 +1,6 @@ # ---------------------- # Author: Andreas Alfons -# K.U.Leuven +# KU Leuven # ---------------------- #' Box-and-whisker plots of cross-validation results @@ -46,7 +46,7 @@ bwplot.cv <- function(x, data, select = NULL, ...) 
{ # initializations - if(x$R == 1) stop("density plot is only meaningful for repeated CV") + if(x$R == 1) stop("box plot is only meaningful for repeated CV") # construct data frame in lattice format and call internal function CV <- getLatticeData(x, select) localBwplot(CV, ...) diff --git a/R/cost.R b/R/cost.R index d3ba1da..5398326 100644 --- a/R/cost.R +++ b/R/cost.R @@ -1,6 +1,6 @@ # ---------------------- # Author: Andreas Alfons -# K.U.Leuven +# KU Leuven # ---------------------- #' Prediction loss @@ -18,6 +18,12 @@ #' squared prediction error. A proportion of the largest squared differences #' of the observed and fitted values are thereby trimmed. #' +#' Standard errors can be requested via the \code{sd} argument. Note that +#' standard errors for \code{tmspe} are based on a winsorized standard +#' deviation. Furthermore, standard errors for \code{rmspe} and \code{rtmspe} +#' are computed from the respective standard errors of \code{mspe} and +#' \code{tmspe} via the delta method. +#' #' @rdname cost #' @name cost #' @@ -26,25 +32,48 @@ #' giving the fitted values. #' @param trim a numeric value giving the trimming proportion (the default is #' 0.25). +#' @param sd a logical indicating whether standard errors should be computed +#' as well. +#' +#' @return If standard errors are not requested, a numeric value giving the +#' prediction loss is returned. #' -#' @return A numeric value giving the prediction loss. +#' Otherwise a list is returned, with the first component containing the +#' prediction loss and the second component the corresponding standard error. #' #' @author Andreas Alfons #' +#' @references +#' Tukey, J.W. and McLaughlin, D.H. (1963) Less vulnerable confidence and +#' significance procedures for location based on a single sample: +#' Trimming/winsorization. \emph{Sankhya: The Indian Journal of Statistics, +#' Series A}, \bold{25}(3), 331--352 +#' +#' Oehlert, G.W. (1992) A note on the delta method. 
\emph{The American +#' Statistician}, \bold{46}(1), 27--29. +#' #' @seealso \code{\link{cvFit}}, \code{\link{cvTuning}} #' #' @examples #' # fit an MM-regression model -#' library("robustbase") #' data("coleman") #' fit <- lmrob(Y~., data=coleman) #' -#' # compute the prediction loss +#' # compute the prediction loss from the fitted values +#' # (hence the prediction loss is underestimated in this simple +#' # example since all observations are used to fit the model) #' mspe(coleman$Y, predict(fit)) #' rmspe(coleman$Y, predict(fit)) #' mape(coleman$Y, predict(fit)) -#' tmspe(coleman$Y, predict(fit), trim=0.1) -#' rtmspe(coleman$Y, predict(fit), trim=0.1) +#' tmspe(coleman$Y, predict(fit), trim = 0.1) +#' rtmspe(coleman$Y, predict(fit), trim = 0.1) +#' +#' # include standard error +#' mspe(coleman$Y, predict(fit), sd = TRUE) +#' rmspe(coleman$Y, predict(fit), sd = TRUE) +#' mape(coleman$Y, predict(fit), sd = TRUE) +#' tmspe(coleman$Y, predict(fit), trim = 0.1, sd = TRUE) +#' rtmspe(coleman$Y, predict(fit), trim = 0.1, sd = TRUE) #' #' @keywords utilities @@ -53,45 +82,78 @@ NULL ## mean squared prediction error #' @rdname cost #' @export -mspe <- function(y, yHat) { +mspe <- function(y, yHat, sd = FALSE) { residuals2 <- (y - yHat)^2 # squared residuals if(!is.null(dim(y))) { residuals2 <- rowSums(residuals2) # squared norm in multivariate case } - mean(residuals2) + res <- mean(residuals2) + if(isTRUE(sd)) { + res <- list(mspe=res, sd=sd(residuals2)/sqrt(nobs(residuals2))) + } + res } ## root mean squared prediction error #' @rdname cost #' @export -rmspe <- function(y, yHat) sqrt(mspe(y, yHat)) +rmspe <- function(y, yHat, sd = FALSE) { + res <- mspe(y, yHat, sd=sd) + if(isTRUE(sd)) { + rmspe <- sqrt(res$mspe) + res <- list(rmspe=rmspe, sd=res$sd/(2*rmspe)) + } else res <- sqrt(res) + res +} ## mean absolute prediction error #' @rdname cost #' @export -mape <- function(y, yHat) { +mape <- function(y, yHat, sd = FALSE) { absResiduals <- abs(y - yHat) # absolue 
residuals if(!is.null(dim(y))) { absResiduals <- rowSums(absResiduals) # norm in multivariate case } - mean(absResiduals) + res <- mean(absResiduals) + if(isTRUE(sd)) { + res <- list(mape=res, sd=sd(absResiduals)/sqrt(nobs(absResiduals))) + } + res } ## trimmed mean squared prediction error #' @rdname cost #' @export -tmspe <- function(y, yHat, trim=0.25) { +tmspe <- function(y, yHat, trim = 0.25, sd = FALSE) { n <- nobs(y) h <- n - floor(n*trim) residuals2 <- (y - yHat)^2 # squared residuals if(!is.null(dim(y))) { residuals2 <- rowSums(residuals2) # squared norm in multivariate case } - residuals2 <- sort(residuals2)[1:h] # select h smallest values - mean(residuals2) + residuals2 <- sort(residuals2) # sort squared residuals + res <- mean(residuals2[seq_len(h)]) # mean over h smallest values + if(isTRUE(sd)) { + # standard error of the trimmed mean is based on a winsorized + # standard deviation + alpha <- 1 - trim + if(h < n) { + q <- quantile(residuals2, alpha) # quantile for winsorization + residuals2[(h+1):n] <- q # replace tail with quantile + } + res <- list(tmspe=res, sd=sd(residuals2)/(alpha*sqrt(n))) + } + res } ## root trimmed mean squared prediction error #' @rdname cost #' @export -rtmspe <- function(y, yHat, trim=0.25) sqrt(tmspe(y, yHat, trim=trim)) +rtmspe <- function(y, yHat, trim = 0.25, sd = FALSE) { + res <- tmspe(y, yHat, trim=trim, sd=sd) + if(isTRUE(sd)) { + rtmspe <- sqrt(res$tmspe) + res <- list(rtmspe=rtmspe, sd=res$sd/(2*rtmspe)) + } else res <- sqrt(res) + res +} diff --git a/R/cvFit.R b/R/cvFit.R index 5e69270..c7d8849 100644 --- a/R/cvFit.R +++ b/R/cvFit.R @@ -1,6 +1,6 @@ # ---------------------- # Author: Andreas Alfons -# K.U.Leuven +# KU Leuven # ---------------------- #' Cross-validation for model evaluation @@ -71,8 +71,10 @@ #' fitting function. #' @param cost a cost function measuring prediction loss. 
It should expect #' the observed values of the response to be passed as the first argument and -#' the predicted values as the second argument, and must return a non-negative -#' scalar value. The default is to use the root mean squared prediction error +#' the predicted values as the second argument, and must return either a +#' non-negative scalar value, or a list with the first component containing +#' the prediction error and the second component containing the standard +#' error. The default is to use the root mean squared prediction error #' (see \code{\link{cost}}). #' @param K an integer giving the number of groups into which the data should #' be split (the default is five). Keep in mind that this should be chosen @@ -106,6 +108,8 @@ #' @returnItem cv a numeric vector containing the respective estimated #' prediction errors. For repeated cross-validation, those are average values #' over all replications. +#' @returnItem sd a numeric vector containing the respective estimated +#' standard errors of the prediction loss. #' @returnItem reps a numeric matrix in which each column contains the #' respective estimated prediction errors from all replications. This is #' only returned for repeated cross-validation. 
@@ -220,9 +224,23 @@ cvFit.call <- function(object, data = NULL, x = NULL, y, cost = rmspe, if(R > 1) { reps <- cv cv <- apply(reps, 2, mean) - } else cv <- drop(cv) + sd <- apply(reps, 2, sd) + } else { + if(is.list(cv)) { + sd <- cv[[2]] + cv <- cv[[1]] + } else { + cv <- drop(cv) + if(is.null(names(cv))) { + # drop() removes column name of 1x1 matrix + names(cv) <- defaultCvNames(length(cv)) + } + sd <- rep.int(NA, length(cv)) + names(sd) <- names(cv) + } + } ## construct return object - out <- list(n=folds$n, K=folds$K, R=R, cv=cv) + out <- list(n=folds$n, K=folds$K, R=R, cv=cv, sd=sd) if(R > 1) out$reps <- reps out$seed <- seed out$call <- matchedCall diff --git a/R/cvFolds.R b/R/cvFolds.R index 4e5d301..83471ae 100644 --- a/R/cvFolds.R +++ b/R/cvFolds.R @@ -1,6 +1,6 @@ # ---------------------- # Author: Andreas Alfons -# K.U.Leuven +# KU Leuven # ---------------------- ## split data into blocks for cross-validation as in package 'lars' diff --git a/R/cvReshape.R b/R/cvReshape.R index fd850a2..26c9607 100644 --- a/R/cvReshape.R +++ b/R/cvReshape.R @@ -1,6 +1,6 @@ # ---------------------- # Author: Andreas Alfons -# K.U.Leuven +# KU Leuven # ---------------------- #' Reshape cross-validation results @@ -12,6 +12,20 @@ #' #' @param x an object inheriting from class \code{"cv"} or \code{"cvSelect"} #' that contains cross-validation results. +#' @param selectBest a character string specifying a criterion for selecting +#' the best model. Possible values are \code{"min"} (the default) or +#' \code{"hastie"}. The former selects the model with the smallest prediction +#' error. The latter is useful for nested models or for models with a tuning +#' parameter controlling the complexity of the model (e.g., penalized +#' regression). It selects the most parsimonious model whose prediction error +#' is no larger than \code{sdFactor} standard errors above the prediction error +#' of the best overall model. 
Note that the models are thereby assumed to be +#' ordered from the most parsimonious one to the most complex one. In +#' particular a one-standard-error rule is frequently applied. +#' @param sdFactor a numeric value giving a multiplication factor of the +#' standard error for the selection of the best model. This is ignored if +#' \code{selectBest} is \code{"min"}. +#' @param \dots additional arguments to be passed down. #' #' @returnClass cvSelect #' @returnItem n an integer giving the number of observations. @@ -24,12 +38,23 @@ #' @returnItem cv a data frame containing the estimated prediction errors for #' the models. For repeated cross-validation, those are average values over #' all replications. +#' @returnItem sd a data frame containing the estimated standard errors of the +#' prediction loss for the models. +#' @returnItem selectBest a character string specifying the criterion used for +#' selecting the best model. +#' @returnItem sdFactor a numeric value giving the multiplication factor of +#' the standard error used for the selection of the best model. #' @returnItem reps a data frame containing the estimated prediction errors #' for the models from all replications. This is only returned if repeated -#' cross-validation was performed for at least one of the models. +#' cross-validation was performed. #' #' @author Andreas Alfons #' +#' @references +#' Hastie, T., Tibshirani, R. and Friedman, J. (2009) \emph{The Elements of +#' Statistical Learning: Data Mining, Inference, and Prediction}. Springer, +#' 2nd edition. +#' #' @seealso \code{\link{cvFit}}, \code{\link{cvSelect}}, \code{\link{cvTuning}} #' #' @example inst/doc/examples/example-cvReshape.R @@ -38,11 +63,13 @@ #' #' @export -cvReshape <- function(x) UseMethod("cvReshape") +cvReshape <- function(x, ...) 
UseMethod("cvReshape") -#' @S3method cvReshape cv -#' @S3method cvReshape cvSelect -cvReshape.cv <- cvReshape.cvSelect <- function(x) { +#' @rdname cvReshape +#' @method cvReshape cv +#' @export +cvReshape.cv <- function(x, + selectBest = c("min", "hastie"), sdFactor = 1, ...) { # initializations if(ncv(x) == 0 || isTRUE(nfits(x) == 0)) stop("empty object") cvNames <- cvNames(x) @@ -61,8 +88,15 @@ cvReshape.cv <- cvReshape.cvSelect <- function(x) { } # call cvSelect() to combine the model fits names(objects) <- cvNames + objects$.selectBest <- selectBest + objects$.sdFactor <- sdFactor out <- do.call(cvSelect, objects) if(length(out$K) > 1) out$K <- out$K[1] if(length(out$R) > 1) out$R <- out$R[1] out } + +#' @rdname cvReshape +#' @method cvReshape cvSelect +#' @export +cvReshape.cvSelect <- cvReshape.cv diff --git a/R/cvSelect.R b/R/cvSelect.R index 141d545..38267f5 100644 --- a/R/cvSelect.R +++ b/R/cvSelect.R @@ -1,6 +1,6 @@ # ---------------------- # Author: Andreas Alfons -# K.U.Leuven +# KU Leuven # ---------------------- #' Model selection based on cross-validation @@ -26,15 +26,29 @@ #' results are first transformed with \code{\link{cvReshape}} to have only one #' column. Then the best overall model is selected. #' -#' It should also be noted that the \code{.reshape} argument starts with a dot -#' to avoid conflicts with the argument names used for the objects containing -#' cross-validation results. +#' It should also be noted that the argument names of \code{.reshape}, +#' \code{.selectBest} and \code{.sdFacor} start with a dot to avoid conflicts +#' with the argument names used for the objects containing cross-validation +#' results. #' #' @param \dots objects inheriting from class \code{"cv"} or \code{"cvSelect"} #' that contain cross-validation results. #' @param .reshape a logical indicating whether objects with more than one #' column of cross-validation results should be reshaped to have only one #' column (see \dQuote{Details}). 
+#' @param .selectBest a character string specifying a criterion for selecting +#' the best model. Possible values are \code{"min"} (the default) or +#' \code{"hastie"}. The former selects the model with the smallest prediction +#' error. The latter is useful for nested models or for models with a tuning +#' parameter controlling the complexity of the model (e.g., penalized +#' regression). It selects the most parsimonious model whose prediction error +#' is no larger than \code{.sdFactor} standard errors above the prediction error +#' of the best overall model. Note that the models are thereby assumed to be +#' ordered from the most parsimonious one to the most complex one. In +#' particular a one-standard-error rule is frequently applied. +#' @param .sdFactor a numeric value giving a multiplication factor of the +#' standard error for the selection of the best model. This is ignored if +#' \code{.selectBest} is \code{"min"}. #' #' @aliases print.cvSelect #' @@ -49,6 +63,12 @@ #' @returnItem cv a data frame containing the estimated prediction errors for #' the models. For models for which repeated cross-validation was performed, #' those are average values over all replications. +#' @returnItem sd a data frame containing the estimated standard errors of the +#' prediction loss for the models. +#' @returnItem selectBest a character string specifying the criterion used for +#' selecting the best model. +#' @returnItem sdFactor a numeric value giving the multiplication factor of +#' the standard error used for the selection of the best model. #' @returnItem reps a data frame containing the estimated prediction errors #' from all replications for those models for which repeated cross-validation #' was performed. This is only returned if repeated cross-validation was @@ -62,6 +82,11 @@ #' #' @author Andreas Alfons #' +#' @references +#' Hastie, T., Tibshirani, R. and Friedman, J. 
(2009) \emph{The Elements of +#' Statistical Learning: Data Mining, Inference, and Prediction}. Springer, +#' 2nd edition. +#' #' @seealso \code{\link{cvFit}}, \code{\link{cvTuning}} #' #' @example inst/doc/examples/example-cvSelect.R @@ -70,7 +95,8 @@ #' #' @export -cvSelect <- function(..., .reshape = FALSE) { +cvSelect <- function(..., .reshape = FALSE, .selectBest = c("min", "hastie"), + .sdFactor = 1) { ## initializations objects <- list(...) m <- length(objects) @@ -80,6 +106,7 @@ cvSelect <- function(..., .reshape = FALSE) { if(!all(sapply(objects, inherits, "cv") | isCvSelect)) { stop("all objects must inherit from class \"cv\" or \"cvSelect\"") } + .selectBest <- match.arg(.selectBest) # remove empty objects keep <- sapply(objects, function(x) ncv(x) > 0 && !isTRUE(nfits(x) == 0)) objects <- objects[keep] @@ -117,6 +144,7 @@ cvSelect <- function(..., .reshape = FALSE) { x$R <- rep(x$R, length.out=m) # remove column specifying fit from results x$cv <- x$cv[, -1, drop=FALSE] + x$sd <- x$sd[, -1, drop=FALSE] if(!is.null(x$reps)) x$reps <- x$reps[, -1, drop=FALSE] x }) @@ -129,12 +157,17 @@ cvSelect <- function(..., .reshape = FALSE) { R <- unlist(lapply(objects, function(x) x$R), use.names=FALSE) if(length(unique(R)) > 1) warning("different number of replications") names(K) <- names(R) <- fits - ## combine CV results + ## combine CV results and standard errors cv <- lapply(objects, function(x) { cv <- x$cv # extract CV results if(is.null(dim(cv))) t(cv) else as.matrix(cv) # return matrix }) + sd <- lapply(objects, + function(x) { + sd <- x$sd # extract standard errors + if(is.null(dim(sd))) t(sd) else as.matrix(sd) # return matrix + }) if(m > 1) { # check if names are the same for all objects cvNames <- colnames(cv[[1]]) @@ -143,8 +176,12 @@ cvSelect <- function(..., .reshape = FALSE) { if(adjustNames) cvNames <- defaultCvNames(length(cvNames)) } cv <- do.call("rbind", cv) - if(m > 1 && adjustNames) colnames(cv) <- cvNames + sd <- do.call("rbind", sd) + if(m 
> 1 && adjustNames) { + colnames(cv) <- colnames(sd) <- cvNames + } cv <- data.frame(Fit=factor(fits, levels=fits), cv, row.names=NULL) + sd <- data.frame(Fit=factor(fits, levels=fits), sd, row.names=NULL) ## combine repeated CV results haveReps <- any(i <- sapply(objects, function(x) !is.null(x$reps))) if(haveReps) { @@ -157,9 +194,17 @@ cvSelect <- function(..., .reshape = FALSE) { reps, row.names=NULL) } ## find best model - best <- sapply(cv[, -1, drop=FALSE], which.min) + if(.selectBest == "min") { + .sdFactor <- NA + best <- sapply(cv[, -1, drop=FALSE], selectMin) + } else { + .sdFactor <- rep(.sdFactor, length.out=1) + best <- sapply(names(cv)[-1], + function(j) selectHastie(cv[, j], sd[, j], sdFactor=.sdFactor)) + } ## construct return object - out <- list(n=n, K=K, R=R, best=best, cv=cv) + out <- list(n=n, K=K, R=R, best=best, cv=cv, sd=sd, + selectBest=.selectBest, sdFactor=.sdFactor) if(haveReps) out$reps <- reps class(out) <- "cvSelect" out diff --git a/R/cvTool.R b/R/cvTool.R index b26685f..51d40df 100644 --- a/R/cvTool.R +++ b/R/cvTool.R @@ -1,6 +1,6 @@ # ---------------------- # Author: Andreas Alfons -# K.U.Leuven +# KU Leuven # ---------------------- #' Low-level function for cross-validation @@ -54,8 +54,10 @@ #' @param y a numeric vector or matrix containing the response. #' @param cost a cost function measuring prediction loss. It should expect #' the observed values of the response to be passed as the first argument and -#' the predicted values as the second argument, and must return a non-negative -#' scalar value. The default is to use the root mean squared prediction error +#' the predicted values as the second argument, and must return either a +#' non-negative scalar value, or a list with the first component containing +#' the prediction error and the second component containing the standard +#' error. The default is to use the root mean squared prediction error #' (see \code{\link{cost}}). 
#' @param folds an object of class \code{"cvFolds"} giving the folds of the #' data for cross-validation (as returned by \code{\link{cvFolds}}). @@ -68,8 +70,13 @@ #' @param envir the \code{\link{environment}} in which to evaluate the #' function call for fitting the models (see \code{\link{eval}}). #' -#' @return A numeric matrix in which each column contains the respective -#' estimated prediction errors from all replications. +#' @return If only one replication is requested and the prediction loss +#' function \code{cost} also returns the standard error, a list is returned, +#' with the first component containing the estimated prediction errors and the +#' second component the corresponding estimated standard errors. +#' +#' Otherwise the return value is a numeric matrix in which each column contains +#' the respective estimated prediction errors from all replications. #' #' @author Andreas Alfons #' @@ -99,23 +106,71 @@ cvTool <- function(call, data = NULL, x = NULL, y, cost = rmspe, folds, names=names, predictArgs=predictArgs, envir=envir) ) } + R <- folds$R # perform repeated cross-validation - cv <- sapply(seq_len(folds$R), - function(r) { - subsetList <- getSubsetList(folds, r) - yHat <- eval(cvExpr) # perform cross-validation - # instead of collecting the results from the folds in the original - # order of the observations, the response is re-ordered accordingly - yHat <- combineData(yHat) # combine predictions from the folds - y <- dataSubset(y, unlist(subsetList)) # re-order response - # compute cost function for predicted values - if(is.null(dim(y)) && !is.null(dim(yHat))) { - apply(yHat, 2, - function(yHat) doCall(cost, y, yHat, args=costArgs)) - } else doCall(cost, y, yHat, args=costArgs) - }) - cv <- if(is.null(dim(cv))) as.matrix(cv) else t(cv) - if(is.null(colnames(cv))) colnames(cv) <- defaultCvNames(ncol(cv)) +# cv <- lapply(seq_len(R), +# function(r) { +# subsetList <- getSubsetList(folds, r) +# yHat <- eval(cvExpr) # perform cross-validation +# 
# instead of collecting the results from the folds in the original +# # order of the observations, the response is re-ordered accordingly +# yHat <- combineData(yHat) # combine predictions from the folds +# y <- dataSubset(y, unlist(subsetList)) # re-order response +# # compute cost function for predicted values +# if(is.null(dim(y)) && !is.null(dim(yHat))) { +# apply(yHat, 2, +# function(yHat) doCall(cost, y, yHat, args=costArgs)) +# } else doCall(cost, y, yHat, args=costArgs) +# }) + if(R == 1) { + subsetList <- getSubsetList(folds) + yHat <- eval(cvExpr) # perform cross-validation + # instead of collecting the results from the folds in the original + # order of the observations, the response is re-ordered accordingly + yHat <- combineData(yHat) # combine predictions from the folds + y <- dataSubset(y, unlist(subsetList)) # re-order response + # compute cost function for predicted values + if(is.null(dim(y)) && !is.null(dim(yHat))) { + cv <- apply(yHat, 2, + function(yHat) doCall(cost, y, yHat, args=costArgs)) + if(is.list(cv)) { + cv <- list(sapply(cv, function(x) x[[1]]), + sapply(cv, function(x) x[[2]])) + } + } else cv <- doCall(cost, y, yHat, args=costArgs) + } else { + cv <- sapply(seq_len(R), + function(r) { + subsetList <- getSubsetList(folds, r) + yHat <- eval(cvExpr) # perform cross-validation + # instead of collecting the results from the folds in the original + # order of the observations, the response is re-ordered accordingly + yHat <- combineData(yHat) # combine predictions from the folds + y <- dataSubset(y, unlist(subsetList)) # re-order response + # compute cost function for predicted values + if(is.null(dim(y)) && !is.null(dim(yHat))) { + tmp <- apply(yHat, 2, + function(yHat) doCall(cost, y, yHat, args=costArgs)) + if(is.list(tmp)) tmp <- sapply(tmp, function(x) x[[1]]) + } else { + tmp <- doCall(cost, y, yHat, args=costArgs) + if(is.list(tmp)) tmp <- tmp[[1]] + } + tmp + }) + } +# cv <- if(is.null(dim(cv))) as.matrix(cv) else t(cv) +# 
if(is.null(colnames(cv))) colnames(cv) <- defaultCvNames(ncol(cv)) + if(is.list(cv)) { + cv <- lapply(cv, + function(x) { + if(is.null(names(x))) names(x) <- defaultCvNames(length(x)) + x + }) + } else { + cv <- if(is.null(dim(cv))) as.matrix(cv) else t(cv) + if(is.null(colnames(cv))) colnames(cv) <- defaultCvNames(ncol(cv)) + } cv } diff --git a/R/cvTuning.R b/R/cvTuning.R index d5827cc..8a79d27 100644 --- a/R/cvTuning.R +++ b/R/cvTuning.R @@ -1,6 +1,6 @@ # ---------------------- # Author: Andreas Alfons -# K.U.Leuven +# KU Leuven # ---------------------- #' Cross-validation for tuning parameter selection @@ -73,8 +73,10 @@ #' fitting function. #' @param cost a cost function measuring prediction loss. It should expect #' the observed values of the response to be passed as the first argument and -#' the predicted values as the second argument, and must return a non-negative -#' scalar value. The default is to use the root mean squared prediction error +#' the predicted values as the second argument, and must return either a +#' non-negative scalar value, or a list with the first component containing +#' the prediction error and the second component containing the standard +#' error. The default is to use the root mean squared prediction error #' (see \code{\link{cost}}). #' @param K an integer giving the number of groups into which the data should #' be split (the default is five). Keep in mind that this should be chosen @@ -95,6 +97,19 @@ #' \code{\link[stats]{predict}} method of the fitted models. #' @param costArgs a list of additional arguments to be passed to the #' prediction loss function \code{cost}. +#' @param selectBest a character string specifying a criterion for selecting +#' the best model. Possible values are \code{"min"} (the default) or +#' \code{"hastie"}. The former selects the model with the smallest prediction +#' error. 
The latter is useful for models with a tuning parameter controlling +#' the complexity of the model (e.g., penalized regression). It selects the +#' most parsimonious model whose prediction error is no larger than +#' \code{sdFactor} standard errors above the prediction error of the best +#' overall model. Note that the models are thereby assumed to be ordered +#' from the most parsimonious one to the most complex one. In particular +#' a one-standard-error rule is frequently applied. +#' @param sdFactor a numeric value giving a multiplication factor of the +#' standard error for the selection of the best model. This is ignored if +#' \code{selectBest} is \code{"min"}. #' @param envir the \code{\link{environment}} in which to evaluate the #' function call for fitting the models (see \code{\link{eval}}). #' @param seed optional initial seed for the random number generator (see @@ -117,6 +132,12 @@ #' @returnItem cv a data frame containing the estimated prediction errors for #' all combinations of tuning parameter values. For repeated cross-validation, #' those are average values over all replications. +#' @returnItem sd a data frame containing the estimated standard errors of the +#' prediction loss for all combinations of tuning parameter values. +#' @returnItem selectBest a character string specifying the criterion used for +#' selecting the best model. +#' @returnItem sdFactor a numeric value giving the multiplication factor of +#' the standard error used for the selection of the best model. #' @returnItem reps a data frame containing the estimated prediction errors #' from all replications for all combinations of tuning parameter values. This #' is only returned for repeated cross-validation. @@ -129,6 +150,11 @@ #' #' @author Andreas Alfons #' +#' @references +#' Hastie, T., Tibshirani, R. and Friedman, J. (2009) \emph{The Elements of +#' Statistical Learning: Data Mining, Inference, and Prediction}. Springer, +#' 2nd edition. 
+#' #' @seealso \code{\link{cvTool}}, \code{\link{cvFit}}, \code{\link{cvSelect}}, #' \code{\link{cvFolds}}, \code{\link{cost}} #' @@ -149,6 +175,7 @@ cvTuning.function <- function(object, formula, data = NULL, x = NULL, y, tuning = list(), args = list(), cost = rmspe, K = 5, R = 1, foldType = c("random", "consecutive", "interleaved"), folds = NULL, names = NULL, predictArgs = list(), costArgs = list(), + selectBest = c("min", "hastie"), sdFactor = 1, envir = parent.frame(), seed = NULL, ...) { ## initializations matchedCall <- match.call() @@ -171,7 +198,8 @@ cvTuning.function <- function(object, formula, data = NULL, x = NULL, y, ## call method for unevaluated function calls out <- cvTuning(call, data=data, x=x, y=y, tuning=tuning, cost=cost, K=K, R=R, foldType=foldType, folds=folds, names=names, - predictArgs=predictArgs, costArgs=costArgs, envir=envir, seed=seed) + predictArgs=predictArgs, costArgs=costArgs, selectBest=selectBest, + sdFactor=sdFactor, envir=envir, seed=seed) out$call <- matchedCall out } @@ -185,8 +213,8 @@ cvTuning.call <- function(object, data = NULL, x = NULL, y, tuning = list(), cost = rmspe, K = 5, R = 1, foldType = c("random", "consecutive", "interleaved"), folds = NULL, names = NULL, predictArgs = list(), - costArgs = list(), envir = parent.frame(), - seed = NULL, ...) { + costArgs = list(), selectBest = c("min", "hastie"), + sdFactor = 1, envir = parent.frame(), seed = NULL, ...) 
{ ## initializations matchedCall <- match.call() matchedCall[[1]] <- as.name("cvTuning") @@ -199,6 +227,7 @@ cvTuning.call <- function(object, data = NULL, x = NULL, y, nx <- nobs(data) } if(!isTRUE(n == nx)) stop(sprintf("'%s' must have %d observations", sx, nx)) + selectBest <- match.arg(selectBest) # create all combinations of tuning parameters tuning <- do.call("expand.grid", tuning) nTuning <- nrow(tuning) @@ -238,17 +267,39 @@ cvTuning.call <- function(object, data = NULL, x = NULL, y, predictArgs=predictArgs, costArgs=costArgs, envir=envir) }) } + if(R == 1) { + haveList <- is.list(cv[[1]]) + if(haveList) { + sd <- lapply(cv, function(x) x[[2]]) + cv <- lapply(cv, function(x) x[[1]]) + } + } cv <- do.call("rbind", cv) + if(R == 1) { + if(haveList) { + sd <- do.call("rbind", sd) + } else sd <- matrix(NA, nrow(cv), ncol(cv), dimnames=dimnames(cv)) + sd <- data.frame(Fit=seq_len(nTuning), sd, row.names=NULL) + } cv <- data.frame(Fit=rep(seq_len(nTuning), each=R), cv, row.names=NULL) ## compute average results in case of repeated CV if(R > 1) { reps <- cv cv <- aggregate(reps[, -1, drop=FALSE], reps[, 1, drop=FALSE], mean) + sd <- aggregate(reps[, -1, drop=FALSE], reps[, 1, drop=FALSE], sd) } ## find optimal values of tuning parameters - best <- sapply(cv[, -1, drop=FALSE], which.min) + if(selectBest == "min") { + sdFactor <- NA + best <- sapply(cv[, -1, drop=FALSE], selectMin) + } else { + sdFactor <- rep(sdFactor, length.out=1) + best <- sapply(names(cv)[-1], + function(j) selectHastie(cv[, j], sd[, j], sdFactor=sdFactor)) + } ## construct return object - out <- list(n=folds$n, K=folds$K, R=R, tuning=tuning, best=best, cv=cv) + out <- list(n=folds$n, K=folds$K, R=R, tuning=tuning, best=best, + cv=cv, sd=sd, selectBest=selectBest, sdFactor=sdFactor) if(R > 1) out$reps <- reps out$seed <- seed out$call <- matchedCall diff --git a/R/densityplot.R b/R/densityplot.R index 648cf9b..2481c84 100644 --- a/R/densityplot.R +++ b/R/densityplot.R @@ -1,6 +1,6 @@ # 
---------------------- # Author: Andreas Alfons -# K.U.Leuven +# KU Leuven # ---------------------- #' Kernel density plots of cross-validation results diff --git a/R/dotplot.R b/R/dotplot.R index 0532ef1..2078e89 100644 --- a/R/dotplot.R +++ b/R/dotplot.R @@ -1,6 +1,6 @@ # ---------------------- # Author: Andreas Alfons -# K.U.Leuven +# KU Leuven # ---------------------- #' Dot plots of cross-validation results @@ -11,7 +11,7 @@ #' For objects with multiple columns of repeated cross-validation results, #' conditional dot plots are produced. #' -#' @method dotplot cvSelect +#' @method dotplot cv #' #' @param x an object inheriting from class \code{"cvSelect"} that contains #' cross-validation results. @@ -20,6 +20,9 @@ #' of models for which to plot the cross-validation results. #' @param select a character, integer or logical vector indicating the columns #' of cross-validation results to be plotted. +#' @param sdFactor a numeric value giving the multiplication factor of the +#' standard error for displaying error bars. Error bars can be suppressed by +#' setting this to \code{NA}. #' @param \dots additional arguments to be passed to the \code{"formula"} #' method of \code{\link[lattice:xyplot]{dotplot}}. #' @@ -43,15 +46,29 @@ #' @import lattice #' @export -dotplot.cvSelect <- function(x, data, subset = NULL, select = NULL, ...) { +dotplot.cv <- function(x, data, select = NULL, sdFactor = NA, ...) { + # construct data frame in lattice format and call internal function + tmp <- getLatticeData(x, select, reps=FALSE, sdFactor=sdFactor) + localDotplot(tmp$CV, tmp$lower, tmp$upper, ...) +} + + +#' @rdname dotplot.cv +#' @method dotplot cvSelect +#' @export + +dotplot.cvSelect <- function(x, data, subset = NULL, select = NULL, + sdFactor = x$sdFactor, ...) { # construct data frame in lattice format and call internal function - CV <- getLatticeData(x, subset, select, reps=FALSE, numericAsFactor=TRUE) - localDotplot(CV, ...) 
+ tmp <- getLatticeData(x, subset, select, reps=FALSE, + sdFactor=sdFactor, numericAsFactor=TRUE) + localDotplot(tmp$CV, tmp$lower, tmp$upper, ...) } # internal function for dot plots -localDotplot <- function(CV, horizontal = TRUE, xlab = NULL, ylab = NULL, ..., +localDotplot <- function(CV, lower, upper, horizontal = TRUE, + xlab = NULL, ylab = NULL, ..., # the following arguments are defined so that they aren't supplied twice x, formula, data, groups) { # construct formula for call to xyplot() @@ -69,6 +86,46 @@ localDotplot <- function(CV, horizontal = TRUE, xlab = NULL, ylab = NULL, ..., } conditional <- if("Name" %in% cvNames) "Name" else NULL f <- getFormula(left, right, conditional) - # call dotplot() - dotplot(f, data=CV, horizontal=horizontal, xlab=xlab, ylab=ylab, ...) + # call stripplot() since dotplot() does not pass the 'subscripts' to the + # prepanel function + stripplot(f, data=CV, lower=lower, upper=upper, horizontal=horizontal, + prepanel=prepanelDotplot, panel=panelDotplot, xlab=xlab, ylab=ylab, + ...) +} + +# prepanel function +prepanelDotplot <- function(x, y, lower, upper, horizontal, subscripts, ...) { + tmp <- c(lower[subscripts], if(horizontal) x else y, upper[subscripts]) + tmp <- tmp[is.finite(tmp)] + if(length(tmp) > 0) { + lim <- range(tmp, finite=TRUE) + if(horizontal) list(xlim=lim) else list(ylim=lim) + } else list() +} + +# panel function +panelDotplot <- function(x, y, lower, upper, horizontal, subscripts, + col = trellis.par.get("dot.symbol")$col, angle=90, + length=0.5, unit="lines", ends, type, lty, lwd, ...) 
{ +    # initializations +    dot.line <- trellis.par.get("dot.line") +    box.umbrella <- trellis.par.get("box.umbrella") +    if(missing(lty) || length(lty) == 0) { +        lty <- c(dot.line$lty, box.umbrella$lty) +    } else if(length(lty) == 1) lty = c(lty, box.umbrella$lty) +    if(missing(lwd) || length(lwd) == 0) { +        lwd <- c(dot.line$lwd, box.umbrella$lwd) +    } else if(length(lwd) == 1) lwd = c(lwd, box.umbrella$lwd) +    # create plot +    panel.dotplot(x, y, horizontal=horizontal, subscripts=subscripts, +        col=col, lty=lty[1], lwd=lwd[1], ...) +    if(horizontal) { +        panel.arrows(lower[subscripts], y, upper[subscripts], y, +            angle=angle, length=length, unit=unit, ends="both", +            col=col, lty=lty[2], lwd=lwd[2], ...) +    } else { +        panel.arrows(x, lower[subscripts], x, upper[subscripts], +            angle=angle, length=length, unit=unit, ends="both", +            col=col, lty=lty[2], lwd=lwd[2], ...) +    } } diff --git a/R/plot.R b/R/plot.R index 27e60ff..009b6b2 100644 --- a/R/plot.R +++ b/R/plot.R @@ -1,8 +1,29 @@ # ---------------------- # Author: Andreas Alfons -# K.U.Leuven +# KU Leuven # ---------------------- +# workhorse function +plotCV <- function(x, +        method = c("bwplot", "densityplot", "xyplot", "dotplot"), ...) { +    ## initializations +    if(all(x$R == 1)) { +        choices <- eval(formals(sys.function())[["method"]]) +        if(identical(method, choices)) { +            method <- "xyplot" +        } else method <- match.arg(method, c("xyplot", "dotplot")) +    } else method <- match.arg(method) +    ## call plot function +    if(method == "bwplot") { +        bwplot(x, ...) +    } else if(method == "densityplot") { +        densityplot(x, ...) +    } else if(method == "xyplot") { +        xyplot(x, ...) +    } else dotplot(x, ...) +} + + #' Plot cross-validation results #' #' Plot results from (repeated) \eqn{K}-fold cross-validation. @@ -49,32 +70,37 @@ #' #' @export -plot.cv <- function(x, method = c("bwplot", "densityplot"), ...) { -    ## initializations -    method <- match.arg(method) -    ## call plot function -    if(method == "bwplot") { -        bwplot(x, ...)
- } else densityplot(x, ...) -} +#plot.cv <- function(x, method = c("bwplot", "densityplot"), ...) { +# ## initializations +# method <- match.arg(method) +# ## call plot function +# if(method == "bwplot") { +# bwplot(x, ...) +# } else densityplot(x, ...) +#} +plot.cv <- plotCV #' @rdname plot.cv #' @method plot cvSelect #' @export -plot.cvSelect <- function(x, - method = c("bwplot", "densityplot", "xyplot", "dotplot"), ...) { - ## initializations - if(all(x$R == 1)) { - method <- match.arg(method, c("xyplot", "dotplot")) - } else method <- match.arg(method) - ## call plot function - if(method == "bwplot") { - bwplot(x, ...) - } else if(method == "densityplot") { - densityplot(x, ...) - } else if(method == "xyplot") { - xyplot(x, ...) - } else dotplot(x, ...) -} +#plot.cvSelect <- function(x, +# method = c("bwplot", "densityplot", "xyplot", "dotplot"), ...) { +# ## initializations +# if(all(x$R == 1)) { +# choices <- eval(formals(sys.function())[["method"]]) +# if(identical(method, choices)) { +# method <- "xyplot" +# } else method <- match.arg(method, c("xyplot", "dotplot")) +# } else method <- match.arg(method) +# ## call plot function +# if(method == "bwplot") { +# bwplot(x, ...) +# } else if(method == "densityplot") { +# densityplot(x, ...) +# } else if(method == "xyplot") { +# xyplot(x, ...) +# } else dotplot(x, ...) +#} +plot.cvSelect <- plotCV diff --git a/R/print.R b/R/print.R index 8dc4933..8b21398 100644 --- a/R/print.R +++ b/R/print.R @@ -1,6 +1,6 @@ # ---------------------- # Author: Andreas Alfons -# K.U.Leuven +# KU Leuven # ---------------------- #' @S3method print cvFolds @@ -55,7 +55,7 @@ print.cvSelect <- print.summary.cvSelect <- function(x, best = TRUE, ...) { } else cat(sprintf("\n%d-fold CV results:\n", K)) } else cat("\nCV results:\n") print(x$cv, ...) 
- # print optimal value for tuning parameters if requested + # print optimal model if requested if(isTRUE(best)) { cat("\nBest model:\n") best <- x$best diff --git a/R/cvExamples.R b/R/repCV.R similarity index 77% rename from R/cvExamples.R rename to R/repCV.R index 8fd8257..2ae4486 100644 --- a/R/cvExamples.R +++ b/R/repCV.R @@ -1,6 +1,6 @@ # ---------------------- # Author: Andreas Alfons -# K.U.Leuven +# KU Leuven # ---------------------- #' Cross-validation for linear models @@ -25,20 +25,22 @@ #' and the estimated prediction errors from all replications as well as their #' average are included in the returned object. #' -#' @rdname cvExamples -#' @name cvExamples +#' @aliases cvExamples #' -#' @param object for \code{cvLm}, an object of class \code{"lm"} computed -#' with \code{\link[stats]{lm}}. For \code{cvLmrob}, an object of class -#' \code{"lmrob"} computed with \code{\link[robustbase]{lmrob}}. For -#' \code{cvLts}, an object of class \code{"lts"} computed with -#' \code{\link[robustbase]{ltsReg}}. +#' @param object an object returned from a model fitting function. Methods +#' are implemented for objects of class \code{"lm"} computed with +#' \code{\link[stats]{lm}}, objects of class \code{"lmrob"} computed with +#' \code{\link[robustbase]{lmrob}}, and object of class \code{"lts"} computed +#' with \code{\link[robustbase]{ltsReg}}. #' @param cost a cost function measuring prediction loss. It should expect #' the observed values of the response to be passed as the first argument and -#' the predicted values as the second argument, and must return a non-negative -#' scalar value. The default is to use the root mean squared prediction error -#' for \code{cvLm} and the root trimmed mean squared prediction error for -#' \code{cvLmrob} and \code{cvLts} (see \code{\link{cost}}). 
+#' the predicted values as the second argument, and must return either a +#' non-negative scalar value, or a list with the first component containing +#' the prediction error and the second component containing the standard +#' error. The default is to use the root mean squared prediction error +#' for the \code{"lm"} method and the root trimmed mean squared prediction +#' error for the \code{"lmrob"} and \code{"lts"} methods (see +#' \code{\link{cost}}). #' @param K an integer giving the number of groups into which the data should #' be split (the default is five). Keep in mind that this should be chosen #' such that all groups are of approximately equal size. Setting \code{K} @@ -67,21 +69,27 @@ #' @returnItem K an integer giving the number of folds. #' @returnItem R an integer giving the number of replications. #' @returnItem cv a numeric vector containing the estimated prediction -#' errors. For \code{cvLm} and \code{cvLmrob}, this is a single numeric -#' value. For \code{cvLts}, this contains one value for each of the requested -#' fits. In the case of repeated cross-validation, those are average values -#' over all replications. +#' errors. For the \code{"lm"} and \code{"lmrob"} methods, this is a single +#' numeric value. For the \code{"lts"} method, this contains one value for +#' each of the requested fits. In the case of repeated cross-validation, those +#' are average values over all replications. +#' @returnItem sd a numeric vector containing the estimated standard +#' errors of the prediction loss. For the \code{"lm"} and \code{"lmrob"} +#' methods, this is a single numeric value. For the \code{"lts"} method, this +#' contains one value for each of the requested fits. #' @returnItem reps a numeric matrix containing the estimated prediction -#' errors from all replications. For \code{cvLm} and \code{cvLmrob}, this is a -#' matrix with one column. For \code{cvLts}, this contains one column for each -#' of the requested fits. 
However, this is only returned for repeated -#' cross-validation. +#' errors from all replications. For the \code{"lm"} and \code{"lmrob"} +#' methods, this is a matrix with one column. For the \code{"lts"} method, +#' this contains one column for each of the requested fits. However, this is +#' only returned for repeated cross-validation. #' @returnItem seed the seed of the random number generator before #' cross-validation was performed. #' @returnItem call the matched function call. #' -#' @note Those are simple wrapper functions that extract the data from the -#' fitted model and call \code{\link{cvFit}} to perform cross-validation. +#' @note The \code{repCV} methods are simple wrapper functions that extract the +#' data from the fitted model and call \code{\link{cvFit}} to perform +#' cross-validation. In addition, \code{cvLm}, \code{cvLmrob} and \code{cvLts} +#' are aliases for the respective methods. #' #' @author Andreas Alfons #' @@ -89,19 +97,21 @@ #' \code{\link[stats]{lm}}, \code{\link[robustbase]{lmrob}}, #' \code{\link[robustbase]{ltsReg}} #' -#' @example inst/doc/examples/example-cvExamples.R +#' @example inst/doc/examples/example-repCV.R #' #' @keywords utilities +#' +#' @export repCV -NULL +repCV <- function(object, ...) UseMethod("repCV") -# ---------------------- -#' @rdname cvExamples +## LS regression +#' @rdname repCV +#' @method repCV lm #' @export -## LS regression -cvLm <- function(object, cost=rmspe, K = 5, R = 1, +repCV.lm <- function(object, cost=rmspe, K = 5, R = 1, foldType = c("random", "consecutive", "interleaved"), folds = NULL, seed = NULL, ...) 
{ ## initializations @@ -132,14 +142,14 @@ cvLm <- function(object, cost=rmspe, K = 5, R = 1, out } -# ---------------------- -#' @rdname cvExamples -#' @import robustbase +## MM and SDMD regression +#' @rdname repCV +#' @method repCV lmrob #' @export +#' @import robustbase -## MM and SDMD regression -cvLmrob <- function(object, cost=rtmspe, K = 5, R = 1, +repCV.lmrob <- function(object, cost=rtmspe, K = 5, R = 1, foldType = c("random", "consecutive", "interleaved"), folds = NULL, seed = NULL, ...) { ## initializations @@ -170,14 +180,14 @@ cvLmrob <- function(object, cost=rtmspe, K = 5, R = 1, out } -# ---------------------- -#' @rdname cvExamples -#' @import robustbase +## LTS regression +#' @rdname repCV +#' @method repCV lts #' @export +#' @import robustbase -## LTS regression -cvLts <- function(object, cost = rtmspe, K = 5, R = 1, +repCV.lts <- function(object, cost = rtmspe, K = 5, R = 1, foldType = c("random", "consecutive", "interleaved"), folds = NULL, fit = c("reweighted", "raw", "both"), seed = NULL, ...) { @@ -216,8 +226,29 @@ cvLts <- function(object, cost = rtmspe, K = 5, R = 1, } +# ---------------------- + +## LS regression +#' @rdname repCV +#' @export +cvLm <- repCV.lm + +## MM and SDMD regression +#' @rdname repCV +#' @export +#' @import robustbase +cvLmrob <- repCV.lmrob + +## LTS regression +#' @rdname repCV +#' @export #' @import robustbase +cvLts <- repCV.lts + +# ---------------------- + #' @S3method predict lts +#' @import robustbase # there is no predict() method for "lts" objects in package 'robustbase' predict.lts <- function(object, newdata, diff --git a/R/selectBest.R b/R/selectBest.R new file mode 100644 index 0000000..8c85b5f --- /dev/null +++ b/R/selectBest.R @@ -0,0 +1,73 @@ +# ---------------------- +# Author: Andreas Alfons +# KU Leuven +# ---------------------- + +# Model selection criteria based on prediction loss +# +# Determine the best model based on prediction loss. 
+# +# \code{selectMin} selects the model with the smallest prediction loss. +# +# \code{selectHastie} is useful for nested models or for models where a +# tuning parameter controls the complexity of the model (e.g., penalized +# regression). It selects the most parsimonious model whose prediction error +# is no larger than \code{sdFactor} standard errors above the prediction error +# of the best model. In particular a one-standard-error rule is frequently +# applied. +# +# @rdname selectBest +# @name selectBest +# +# @param x a numeric vector containing the estimated prediction errors for +# the models. +# @param sd a numeric vector containing the estimated standard errors of the +# prediction loss for the models. +# @param sdFactor a numeric value giving the multiplication factor of the +# standard error. +# +# @return An integer giving the index of the best model. +# +# @note These functions are designed to be supplied to \code{\link{cvSelect}} +# or \code{\link{cvTuning}}. +# +# @author Andreas Alfons +# +# @references +# Hastie, T., Tibshirani, R. and Friedman, J. (2009) \emph{The Elements of +# Statistical Learning: Data Mining, Inference, and Prediction}. Springer, +# 2nd edition. 
+# +# @seealso \code{\link{cvSelect}}, \code{\link{cvTuning}} +# +# @keywords utilities +# +# NULL +# +# @rdname selectBest +# @export +selectMin <- function(x) which.min(x) + +# @rdname selectBest +# @export +selectHastie <- function(x, sd, sdFactor = 1) { + i <- which.min(x) + within <- which(x[seq_len(i)] < (x[i] + sdFactor * sd[i])) + if(length(within) == 0) i else within[1] # ensure it works if sd[i] is NA +} + +#selectDiff <- function(x, sd, sdFactor = 1) { +# i <- which.min(x) +# seqI <- seq_len(i) +# out <- which(c(TRUE, (diff(x[seqI]) + sdFactor * sd[seqI[-1]]) < 0)) +# tail(out, 1) +#} +# +#selectRelChange <- function(x, sd, sdFactor = 1, threshold = 0.001) { +# # find models with large enough relative change +# i <- which.min(x) +# xSeqI <- x[seq_len(i)] +# keep <- which((xSeqI - x[i]) / max(xSeqI) > threshold) +# # call selectHastie() for the remaining models +# selectHastie(xSeqI[keep], sd[keep], sdFactor=sdFactor) +#} diff --git a/R/subset.R b/R/subset.R index c44796b..ee25521 100644 --- a/R/subset.R +++ b/R/subset.R @@ -1,6 +1,6 @@ # ---------------------- # Author: Andreas Alfons -# K.U.Leuven +# KU Leuven # ---------------------- #' Subsetting cross-validation results @@ -32,8 +32,8 @@ subset.cv <- function(x, select = NULL, ...) { if(is.null(select)) return(x) - cv <- x$cv - x$cv <- cv[select] + x$cv <- x$cv[select] + x$sd <- x$sd[select] if(!is.null(reps <- x$reps)) x$reps <- reps[, select, drop=FALSE] x } @@ -45,6 +45,7 @@ subset.cv <- function(x, select = NULL, ...) { subset.cvSelect <- function(x, subset = NULL, select = NULL, ...) { cv <- x$cv + sd <- x$sd cvNames <- cvNames(x) reps <- x$reps # extract subset of models @@ -54,6 +55,7 @@ subset.cvSelect <- function(x, subset = NULL, select = NULL, ...) 
{ x$best <- x$best[select] select <- c("Fit", select) # also select column describing models x$cv <- cv[, select, drop=FALSE] + x$sd <- sd[, select, drop=FALSE] if(!is.null(reps)) x$reps <- reps[, select, drop=FALSE] } } else { @@ -64,10 +66,12 @@ subset.cvSelect <- function(x, subset = NULL, select = NULL, ...) { # extract CV results for the models to keep if(is.null(select)) { cv <- cv[subset, , drop=FALSE] + sd <- sd[subset, , drop=FALSE] } else { if(!is.character(select)) select <- cvNames[select] select <- c("Fit", select) # also select column describing models cv <- cv[subset, select, drop=FALSE] + sd <- sd[subset, select, drop=FALSE] } fits <- cv$Fit # models to keep haveFactor <- is.factor(fits) @@ -75,12 +79,22 @@ subset.cvSelect <- function(x, subset = NULL, select = NULL, ...) { # for a factor, unused levels should be dropped and # remaining levels should be in the right order fits <- as.character(fits) - cv$Fit <- factor(fits, levels=fits) + cv$Fit <- sd$Fit <- factor(fits, levels=fits) } x$cv <- cv + x$sd <- sd # find best model among the remaining ones + if(is.null(x$selectBest)) x$selectBest <- "min" + if(is.null(x$sdFactor)) x$sdFactor <- NA if(ncol(cv) > 1) { - x$best <- sapply(cv[, -1, drop=FALSE], which.min) + if(x$selectBest == "min") { + x$best <- sapply(cv[, -1, drop=FALSE], selectMin) + } else { + x$best <- sapply(names(cv)[-1], + function(j) { + selectHastie(cv[, j], sd[, j], sdFactor=x$sdFactor) + }) + } } else x$best <- x$best[integer()] # this ensures empty integer vector # extract the CV replicates for the models to keep if(!is.null(reps)) { diff --git a/R/summary.R b/R/summary.R index 51d78f8..453d370 100644 --- a/R/summary.R +++ b/R/summary.R @@ -1,6 +1,6 @@ # ---------------------- # Author: Andreas Alfons -# K.U.Leuven +# KU Leuven # ---------------------- #' Summarize cross-validation results diff --git a/R/utils.R b/R/utils.R index 0a9bcfa..032f67e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,6 +1,6 @@ # ---------------------- # 
Author: Andreas Alfons -# K.U.Leuven +# KU Leuven # ---------------------- ## utilities for cross-validation functions @@ -45,6 +45,48 @@ doCall <- function(fun, ..., args) { } else do.call(fun, c(list(...), args)) } +## call the supplied function for selecting the respective best model and +## modify the multiplication factor of the standard error as necessary +findBest <- function(fun, x, sd, sdFactor = NULL, returnSdFactor = TRUE) { + formalArgs <- formals(fun) + argNames <- names(formalArgs) + if(length(argNames) == 1) { + # only one argument for the prediction errors, call the supplied + # function directly + best <- sapply(x[, -1, drop=FALSE], fun) + sdFactor <- NA + } else { + # more than one argument: first should expect the prediction errors and + # second the corresponding standard errors + if("sdFactor" %in% argNames) { + # function contains argument for the multiplication factor of the + # standard error + # if this is not is supplied, store the default value of the + # supplied function + # otherwise include the supplied value as extra argument in a list + # (this way it can easily be extended to take additional arguments) + if(is.null(sdFactor)) { + sdFactor <- formalArgs$sdFactor + args <- list() + } else args <- list(sdFactor=sdFactor) + } else { + # function does not contain argument for the multiplication factor + # of the standard error + args <- list() + sdFactor <- NA + } + # call the supplied function via doCall() + best <- sapply(names(x)[-1], + function(j) doCall(fun, x[, j], sd[, j], args=args)) + } + if(returnSdFactor) { + # return list with the first component containing the respective best + # model and the second component containing the multiplication factor + # of the standard error + list(best=best, sdFactor=sdFactor) + } else best +} + # default names defaultCvNames <- function(p) { if(p == 1) { @@ -73,7 +115,6 @@ addIntercept <- function(x, check = FALSE) { } else x } - # remove intercept column from design matrix removeIntercept 
<- function(x, pos) { if(missing(pos)) { @@ -93,103 +134,125 @@ getFormula <- function(left, right, conditional = NULL) { } else as.formula(paste(left, "~", right, "|", conditional)) } + # get data in the correct format for lattice graphics -# TODO: the code can be simplified by using subset() methods getLatticeData <- function(x, ...) UseMethod("getLatticeData") -getLatticeData.cv <- function(x, select = NULL, ...) { - CV <- as.data.frame(x$reps) + +getLatticeData.cv <- function(x, select = NULL, reps = TRUE, + sdFactor = NA, ...) { + # extract subset of models + x <- subset(x, select=select) + if(reps) { + CV <- x$reps + if(is.null(CV)) { + stop("replications not available") + } else CV <- as.data.frame(CV) + } else { + CV <- as.data.frame(t(x$cv)) + } # stack selected results on top of each other - cvNames <- names(CV) cvName <- defaultCvNames(1) - if(isTRUE(cvNames == cvName)) { - # one column of results with default name: - # no column for conditional plots - if(!is.null(select)) { - CV <- CV[, select, drop=FALSE] - if(ncol(CV) == 0) { - # return data frame of NAs if column is not selected - CV[, cvName] <- as.numeric(rep.int(NA, nrow(CV))) - } - } + cvNames <- cvNames(x) + ncv <- ncv(x) + n <- nrow(CV) + if(ncv == 0) { + # return data frame of NAs if column is not selected + CV[, cvName] <- rep.int(as.numeric(NA), n) + if(!reps) SD <- as.numeric(NA) } else { - # include column for conditional plots - if(is.null(select)) { - select <- cvNames - } else if(!is.character(select)) select <- cvNames[select] - n <- nrow(CV) - if(length(select) == 0) { - # return data frame of NAs if no results are selected - CV <- data.frame(as.numeric(rep.int(NA, n))) - names(CV) <- cvName - } else { - CV <- lapply(select, + if(!isTRUE(cvNames == cvName)) { + CV <- lapply(cvNames, function(j) data.frame(Name=rep.int(j, n), CV=CV[, j])) CV <- do.call(rbind, CV) + names(CV) <- c("Name", cvName) } + if(!reps) SD <- unname(x$sd) } # return data - CV + if(reps) { + CV + } else { + 
halflength <- sdFactor * SD + lower <- CV[, cvName] - halflength + upper <- CV[, cvName] + halflength + list(CV=CV, lower=lower, upper=upper) + } } + getLatticeData.cvSelect <- function(x, subset = NULL, select = NULL, - reps = TRUE, numericAsFactor = FALSE, ...) { - CV <- if(reps) x$reps else x$cv + reps = TRUE, sdFactor = x$sdFactor, numericAsFactor = FALSE, ...) { # extract subset of models - if(is.null(subset)) { -# CV$Fit <- CV$Fit[, drop=TRUE] # drop unused factor levels - fits <- fits(x) + x <- subset(x, subset=subset, select=select) + fits <- fits(x) + if(reps) { + CV <- x$reps + if(is.null(CV)) stop("replications not available") } else { - fits <- x$cv[subset, "Fit"] - if(reps) { - CV <- CV[CV$Fit %in% fits, , drop=FALSE] - } else { - CV <- CV[subset, , drop=FALSE] - } + CV <- x$cv + SD <- x$sd } # ensure that models are shown in the correct order and drop unused levels # ensure that correct values are shown for a numeric tuning parameter if(numericAsFactor && is.double(CV$Fit)) { CV$Fit <- factor(shingle(CV$Fit), levels=fits) + if(!reps) SD$Fit <- factor(shingle(SD$Fit), levels=fits) } else if(numericAsFactor || !is.numeric(CV$Fit)) { CV$Fit <- factor(CV$Fit, levels=fits) + if(!reps) SD$Fit <- factor(SD$Fit, levels=fits) } # stack selected results on top of each other - cvNames <- names(CV)[-1] cvName <- defaultCvNames(1) - if(is.null(select)) { - select <- cvNames - } else if(!is.character(select)) select <- cvNames[select] + cvNames <- cvNames(x) + nfits <- nfits(x) + ncv <- ncv(x) n <- nrow(CV) - if(n == 0) { + if(nfits == 0) { # no models selected: no column for grouping - if(isTRUE(cvNames == cvName) || length(select) == 0) { + if(isTRUE(cvNames == cvName) || ncv == 0) { # return data frame without column for conditional plots and one NA CV <- data.frame(as.numeric(NA)) names(CV) <- cvName + if(!reps) SD <- as.numeric(NA) } else { # return data frame with column for conditional plots and NA values - CV <- data.frame(select, as.numeric(rep.int(NA, 
length(select)))) + CV <- data.frame(cvNames, rep.int(as.numeric(NA), ncv)) names(CV) <- c("Name", cvName) + if(!reps) SD <- rep.int(as.numeric(NA), ncv) } } else { # include column for grouping - if(length(select) == 0) { + if(ncv == 0) { # no results selected: no column for conditional plots and NA values CV <- CV[, "Fit", drop=FALSE] - CV[, cvName] <- as.numeric(rep.int(NA, n)) + CV[, cvName] <- rep.int(as.numeric(NA), n) + if(!reps) SD <- rep.int(as.numeric(NA), nfits) } else { # no column for conditional plots if there is only one column of # results with default name - if(!isTRUE(cvNames == cvName)) { + if(isTRUE(cvNames == cvName)) { + if(!reps) SD <- SD[, cvName] + } else { CVFit <- CV[, "Fit", drop=FALSE] - CV <- lapply(select, + CV <- lapply(cvNames, function(j) cbind(CVFit, Name=rep.int(j, n), CV=CV[, j])) CV <- do.call(rbind, CV) + names(CV) <- c("Fit", "Name", cvName) + if(!reps) SD <- unlist(SD[, cvNames], use.names=FALSE) } } } # return data - CV + if(reps) { + CV + } else { + if(is.null(sdFactor)) sdFactor <- NA + halflength <- sdFactor * SD + lower <- CV[, cvName] - halflength + upper <- CV[, cvName] + halflength + list(CV=CV, lower=lower, upper=upper) + } } + getLatticeData.cvTuning <- function(x, ...) { # adjust column specifying the model in case of only one tuning parameter if(ncol(x$tuning) == 1) fits(x) <- x$tuning[, 1] diff --git a/R/xyplot.R b/R/xyplot.R index 91421a2..5ae7563 100644 --- a/R/xyplot.R +++ b/R/xyplot.R @@ -1,6 +1,6 @@ # ---------------------- # Author: Andreas Alfons -# K.U.Leuven +# KU Leuven # ---------------------- #' X-Y plots of cross-validation results @@ -24,7 +24,7 @@ #' \code{type} argument (a full list of accepted values can be found in the #' help file of \code{\link[lattice:panel.xyplot]{panel.xyplot}}). 
#' -#' @method xyplot cvSelect +#' @method xyplot cv #' #' @param x an object inheriting from class \code{"cvSelect"} that contains #' cross-validation results (note that this includes objects of class @@ -34,6 +34,9 @@ #' of models for which to plot the cross-validation results. #' @param select a character, integer or logical vector indicating the columns #' of cross-validation results to be plotted. +#' @param sdFactor a numeric value giving the multiplication factor of the +#' standard error for displaying error bars. Error bars can be suppressed by +#' setting this to \code{NA}. #' @param \dots additional arguments to be passed to the \code{"formula"} #' method of \code{\link[lattice:xyplot]{xyplot}}. #' @@ -57,31 +60,49 @@ #' @import lattice #' @export -xyplot.cvSelect <- function(x, data, subset = NULL, select = NULL, ...) { +xyplot.cv <- function(x, data, select = NULL, sdFactor = NA, ...) { # construct data frame in lattice format and call internal function - CV <- getLatticeData(x, subset, select, reps=FALSE, numericAsFactor=TRUE) - localXyplot(CV, ...) + tmp <- getLatticeData(x, select, reps=FALSE, sdFactor=sdFactor) + localXyplot(tmp$CV, tmp$lower, tmp$upper, ...) } -#' @rdname xyplot.cvSelect +#' @rdname xyplot.cv +#' @method xyplot cvSelect +#' @export + +xyplot.cvSelect <- function(x, data, subset = NULL, select = NULL, + sdFactor = x$sdFactor, ...) { + # construct data frame in lattice format and call internal function + tmp <- getLatticeData(x, subset, select, reps=FALSE, + sdFactor=sdFactor, numericAsFactor=TRUE) + localXyplot(tmp$CV, tmp$lower, tmp$upper, ...) +} + + +#' @rdname xyplot.cv #' @method xyplot cvTuning #' @export -xyplot.cvTuning <- function(x, data, subset = NULL, select = NULL, ...) { +xyplot.cvTuning <- function(x, data, subset = NULL, select = NULL, + sdFactor = x$sdFactor, ...) 
{ # construct data frame in lattice format and call internal function - CV <- getLatticeData(x, subset, select, reps=FALSE) - localXyplot(CV, x$tuning, ...) + tmp <- getLatticeData(x, subset, select, reps=FALSE, sdFactor=sdFactor) + localXyplot(tmp$CV, tmp$lower, tmp$upper, x$tuning, ...) } # internal function for x-y plots -localXyplot <- function(CV, tuning = NULL, type, xlab, ylab = "CV results", ..., +localXyplot <- function(CV, lower, upper, tuning = NULL, type, + xlab, ylab = "CV results", ..., # the following arguments are defined so that they aren't supplied twice x, formula, data, groups) { # construct formula for call to xyplot() cvNames <- names(CV) - if(!("Fit" %in% cvNames)) CV$Fit <- rep.int(NA, nrow(CV)) +# if(!("Fit" %in% cvNames)) CV$Fit <- rep.int(NA, nrow(CV)) + if(!("Fit" %in% cvNames)) { + CV$Fit <- factor(rep.int(defaultFitNames(1), nrow(CV))) + } conditional <- if("Name" %in% cvNames) "Name" else NULL f <- getFormula("CV", "Fit", conditional) # default plot type x-axis label @@ -89,9 +110,41 @@ localXyplot <- function(CV, tuning = NULL, type, xlab, ylab = "CV results", ..., if(missing(type)) type <- "b" if(missing(xlab)) xlab <- names(tuning) } else { - if(missing(type)) type <- "h" + if(missing(type)) type <- c("h", "p") if(missing(xlab)) xlab <- NULL } # call xyplot() - xyplot(f, data=CV, type=type, xlab=xlab, ylab=ylab, ...) + xyplot(f, data=CV, lower=lower, upper=upper, prepanel=prepanelXyplot, + panel=panelXyplot, type=type, xlab=xlab, ylab=ylab, ...) +} + +# prepanel function +prepanelXyplot <- function(x, y, lower, upper, subscripts, ...) { + tmp <- c(lower[subscripts], y, upper[subscripts]) + tmp <- tmp[is.finite(tmp)] + if(length(tmp) > 0) { + lim <- range(tmp, finite=TRUE) + list(ylim=lim) + } else list() +} + +# panel function +panelXyplot <- function(x, y, lower, upper, subscripts, + col = plot.line$col, angle=90, length=0.5, + unit="lines", ends, type, lty, lwd, ...) 
{ + # initializations + plot.line <- trellis.par.get("plot.line") + box.umbrella <- trellis.par.get("box.umbrella") + if(missing(lty) || length(lty) == 0) { + lty <- c(plot.line$lty, box.umbrella$lty) + } else if(length(lty == 1)) lty = c(lty, box.umbrella$lty) + if(missing(lwd) || length(lwd) == 0) { + lwd <- c(plot.line$lwd, box.umbrella$lwd) + } else if(length(lwd == 1)) lwd = c(lwd, box.umbrella$lwd) + # create plot + panel.xyplot(x, y, subscripts=subscripts, type=type, col=col, + lty=lty[1], lwd=lwd[1], ...) + panel.arrows(x, lower[subscripts], x, upper[subscripts], + angle=angle, length=length, unit=unit, ends="both", + col=col, lty=lty[2], lwd=lwd[2], ...) } diff --git a/inst/doc/examples/example-cvExamples.R b/inst/doc/examples/example-repCV.R similarity index 67% rename from inst/doc/examples/example-cvExamples.R rename to inst/doc/examples/example-repCV.R index df4c495..d57cea2 100644 --- a/inst/doc/examples/example-cvExamples.R +++ b/inst/doc/examples/example-repCV.R @@ -7,14 +7,14 @@ folds <- cvFolds(nrow(coleman), K = 5, R = 10) # perform cross-validation for an LS regression model fitLm <- lm(Y ~ ., data = coleman) -cvLm(fitLm, cost = rtmspe, folds = folds, trim = 0.1) +repCV(fitLm, cost = rtmspe, folds = folds, trim = 0.1) # perform cross-validation for an MM regression model fitLmrob <- lmrob(Y ~ ., data = coleman) -cvLmrob(fitLmrob, cost = rtmspe, folds = folds, trim = 0.1) +repCV(fitLmrob, cost = rtmspe, folds = folds, trim = 0.1) # perform cross-validation for an LTS regression model fitLts <- ltsReg(Y ~ ., data = coleman) -cvLts(fitLts, cost = rtmspe, folds = folds, trim = 0.1) -cvLts(fitLts, cost = rtmspe, folds = folds, +repCV(fitLts, cost = rtmspe, folds = folds, trim = 0.1) +repCV(fitLts, cost = rtmspe, folds = folds, fit = "both", trim = 0.1) diff --git a/man/accessors.Rd b/man/accessors.Rd index 4706657..b68256e 100644 --- a/man/accessors.Rd +++ b/man/accessors.Rd @@ -8,17 +8,17 @@ \alias{nfits} \title{Access or set information on 
cross-validation results} \usage{ - cvNames(x) + cvNames(x) - cvNames(x) <- value + cvNames(x) <- value - fits(x) + fits(x) - fits(x) <- value + fits(x) <- value - ncv(x) + ncv(x) - nfits(x) + nfits(x) } \arguments{ \item{x}{an object inheriting from class \code{"cv"} or @@ -62,14 +62,14 @@ folds <- cvFolds(nrow(coleman), K = 5, R = 10) ## compare raw and reweighted LTS estimators for -## 50% and 75% subsets +## 50\% and 75\% subsets -# 50% subsets +# 50\% subsets fitLts50 <- ltsReg(Y ~ ., data = coleman, alpha = 0.5) cvFitLts50 <- cvLts(fitLts50, cost = rtmspe, folds = folds, fit = "both", trim = 0.1) -# 75% subsets +# 75\% subsets fitLts75 <- ltsReg(Y ~ ., data = coleman, alpha = 0.75) cvFitLts75 <- cvLts(fitLts75, cost = rtmspe, folds = folds, fit = "both", trim = 0.1) diff --git a/man/aggregate.cv.Rd b/man/aggregate.cv.Rd index 230c16c..69a4d74 100644 --- a/man/aggregate.cv.Rd +++ b/man/aggregate.cv.Rd @@ -4,12 +4,13 @@ \alias{aggregate.cvTuning} \title{Aggregate cross-validation results} \usage{ - \method{aggregate}{cv} (x, FUN = mean, select = NULL, ...) + \method{aggregate}{cv} (x, FUN = mean, select = NULL, + ...) - \method{aggregate}{cvSelect} (x, FUN = mean, select = - NULL, ...) + \method{aggregate}{cvSelect} (x, FUN = mean, + select = NULL, ...) - \method{aggregate}{cvTuning} (x, ...) + \method{aggregate}{cvTuning} (x, ...) 
} \arguments{ \item{x}{an object inheriting from class \code{"cv"} or @@ -53,14 +54,14 @@ folds <- cvFolds(nrow(coleman), K = 5, R = 10) ## compare raw and reweighted LTS estimators for -## 50% and 75% subsets +## 50\% and 75\% subsets -# 50% subsets +# 50\% subsets fitLts50 <- ltsReg(Y ~ ., data = coleman, alpha = 0.5) cvFitLts50 <- cvLts(fitLts50, cost = rtmspe, folds = folds, fit = "both", trim = 0.1) -# 75% subsets +# 75\% subsets fitLts75 <- ltsReg(Y ~ ., data = coleman, alpha = 0.75) cvFitLts75 <- cvLts(fitLts75, cost = rtmspe, folds = folds, fit = "both", trim = 0.1) @@ -69,14 +70,14 @@ cvFitLts75 <- cvLts(fitLts75, cost = rtmspe, folds = folds, cvFitsLts <- cvSelect("0.5" = cvFitLts50, "0.75" = cvFitLts75) cvFitsLts -# summary of the results with the 50% subsets +# summary of the results with the 50\% subsets aggregate(cvFitLts50, summary) # summary of the combined results aggregate(cvFitsLts, summary) ## evaluate MM regression models tuned for -## 80%, 85%, 90% and 95% efficiency +## 80\%, 85\%, 90\% and 95\% efficiency tuning <- list(tuning.psi=c(3.14, 3.44, 3.88, 4.68)) # set up function call diff --git a/man/bwplot.cv.Rd b/man/bwplot.cv.Rd index e413030..13b99e7 100644 --- a/man/bwplot.cv.Rd +++ b/man/bwplot.cv.Rd @@ -3,10 +3,10 @@ \alias{bwplot.cvSelect} \title{Box-and-whisker plots of cross-validation results} \usage{ - \method{bwplot}{cv} (x, data, select = NULL, ...) + \method{bwplot}{cv} (x, data, select = NULL, ...) - \method{bwplot}{cvSelect} (x, data, subset = NULL, select - = NULL, ...) + \method{bwplot}{cvSelect} (x, data, subset = NULL, + select = NULL, ...) 
} \arguments{ \item{x}{an object inheriting from class \code{"cv"} or @@ -82,14 +82,14 @@ bwplot(cvFits) ## compare raw and reweighted LTS estimators for -## 50% and 75% subsets +## 50\% and 75\% subsets -# 50% subsets +# 50\% subsets fitLts50 <- ltsReg(Y ~ ., data = coleman, alpha = 0.5) cvFitLts50 <- cvLts(fitLts50, cost = rtmspe, folds = folds, fit = "both", trim = 0.1) -# 75% subsets +# 75\% subsets fitLts75 <- ltsReg(Y ~ ., data = coleman, alpha = 0.75) cvFitLts75 <- cvLts(fitLts75, cost = rtmspe, folds = folds, fit = "both", trim = 0.1) diff --git a/man/cost.Rd b/man/cost.Rd index 8d07820..b67fe15 100644 --- a/man/cost.Rd +++ b/man/cost.Rd @@ -1,71 +1,104 @@ -\name{cost} -\alias{cost} -\alias{mape} -\alias{mspe} -\alias{rmspe} -\alias{rtmspe} -\alias{tmspe} -\title{Prediction loss} -\usage{ - mspe(y, yHat) - - rmspe(y, yHat) - - mape(y, yHat) - - tmspe(y, yHat, trim = 0.25) - - rtmspe(y, yHat, trim = 0.25) -} -\arguments{ - \item{y}{a numeric vector or matrix giving the observed - values.} - - \item{yHat}{a numeric vector or matrix of the same - dimensions as \code{y} giving the fitted values.} - - \item{trim}{a numeric value giving the trimming - proportion (the default is 0.25).} -} -\value{ - A numeric value giving the prediction loss. -} -\description{ - Compute the prediction loss of a model. -} -\details{ - \code{mspe} and \code{rmspe} compute the mean squared - prediction error and the root mean squared prediction - error, respectively. In addition, \code{mape} returns - the mean absolute prediction error, which is somewhat - more robust. - - Robust prediction loss based on trimming is implemented - in \code{tmspe} and \code{rtmspe}. To be more precise, - \code{tmspe} computes the trimmed mean squared prediction - error and \code{rtmspe} computes the root trimmed mean - squared prediction error. A proportion of the largest - squared differences of the observed and fitted values are - thereby trimmed. 
-} -\examples{ -# fit an MM-regression model -library("robustbase") -data("coleman") -fit <- lmrob(Y~., data=coleman) - -# compute the prediction loss -mspe(coleman$Y, predict(fit)) -rmspe(coleman$Y, predict(fit)) -mape(coleman$Y, predict(fit)) -tmspe(coleman$Y, predict(fit), trim=0.1) -rtmspe(coleman$Y, predict(fit), trim=0.1) -} -\author{ - Andreas Alfons -} -\seealso{ - \code{\link{cvFit}}, \code{\link{cvTuning}} -} -\keyword{utilities} - +\name{cost} +\alias{cost} +\alias{mape} +\alias{mspe} +\alias{rmspe} +\alias{rtmspe} +\alias{tmspe} +\title{Prediction loss} +\usage{ + mspe(y, yHat, sd = FALSE) + + rmspe(y, yHat, sd = FALSE) + + mape(y, yHat, sd = FALSE) + + tmspe(y, yHat, trim = 0.25, sd = FALSE) + + rtmspe(y, yHat, trim = 0.25, sd = FALSE) +} +\arguments{ + \item{y}{a numeric vector or matrix giving the observed + values.} + + \item{yHat}{a numeric vector or matrix of the same + dimensions as \code{y} giving the fitted values.} + + \item{trim}{a numeric value giving the trimming + proportion (the default is 0.25).} + + \item{sd}{a logical indicating whether standard errors + should be computed as well.} +} +\value{ + If standard errors are not requested, a numeric value + giving the prediction loss is returned. + + Otherwise a list is returned, with the first component + containing the prediction loss and the second component + the corresponding standard error. +} +\description{ + Compute the prediction loss of a model. +} +\details{ + \code{mspe} and \code{rmspe} compute the mean squared + prediction error and the root mean squared prediction + error, respectively. In addition, \code{mape} returns + the mean absolute prediction error, which is somewhat + more robust. + + Robust prediction loss based on trimming is implemented + in \code{tmspe} and \code{rtmspe}. To be more precise, + \code{tmspe} computes the trimmed mean squared prediction + error and \code{rtmspe} computes the root trimmed mean + squared prediction error. 
A proportion of the largest + squared differences of the observed and fitted values are + thereby trimmed. + + Standard errors can be requested via the \code{sd} + argument. Note that standard errors for \code{tmspe} are + based on a winsorized standard deviation. Furthermore, + standard errors for \code{rmspe} and \code{rtmspe} are + computed from the respective standard errors of + \code{mspe} and \code{tmspe} via the delta method. +} +\examples{ +# fit an MM-regression model +data("coleman") +fit <- lmrob(Y~., data=coleman) + +# compute the prediction loss from the fitted values +# (hence the prediction loss is underestimated in this simple +# example since all observations are used to fit the model) +mspe(coleman$Y, predict(fit)) +rmspe(coleman$Y, predict(fit)) +mape(coleman$Y, predict(fit)) +tmspe(coleman$Y, predict(fit), trim = 0.1) +rtmspe(coleman$Y, predict(fit), trim = 0.1) + +# include standard error +mspe(coleman$Y, predict(fit), sd = TRUE) +rmspe(coleman$Y, predict(fit), sd = TRUE) +mape(coleman$Y, predict(fit), sd = TRUE) +tmspe(coleman$Y, predict(fit), trim = 0.1, sd = TRUE) +rtmspe(coleman$Y, predict(fit), trim = 0.1, sd = TRUE) +} +\author{ + Andreas Alfons +} +\references{ + Tukey, J.W. and McLaughlin, D.H. (1963) Less vulnerable + confidence and significance procedures for location based + on a single sample: Trimming/winsorization. + \emph{Sankhya: The Indian Journal of Statistics, Series + A}, \bold{25}(3), 331--352 + + Oehlert, G.W. (1992) A note on the delta method. + \emph{The American Statistician}, \bold{46}(1), 27--29. +} +\seealso{ + \code{\link{cvFit}}, \code{\link{cvTuning}} +} +\keyword{utilities} + diff --git a/man/cvFit.Rd b/man/cvFit.Rd index bedc086..d3640b4 100644 --- a/man/cvFit.Rd +++ b/man/cvFit.Rd @@ -1,227 +1,235 @@ -\name{cvFit} -\alias{cvFit} -\alias{cvFit.call} -\alias{cvFit.default} -\alias{cvFit.function} -\alias{print.cv} -\title{Cross-validation for model evaluation} -\usage{ - cvFit(object, ...) 
- - \method{cvFit}{default} (object, data = NULL, x = NULL, y, - cost = rmspe, K = 5, R = 1, foldType = c("random", - "consecutive", "interleaved"), folds = NULL, names = - NULL, predictArgs = list(), costArgs = list(), envir = - parent.frame(), seed = NULL, ...) - - \method{cvFit}{function} (object, formula, data = NULL, x - = NULL, y, args = list(), cost = rmspe, K = 5, R = 1, - foldType = c("random", "consecutive", "interleaved"), - folds = NULL, names = NULL, predictArgs = list(), - costArgs = list(), envir = parent.frame(), seed = - NULL, ...) - - \method{cvFit}{call} (object, data = NULL, x = NULL, y, - cost = rmspe, K = 5, R = 1, foldType = c("random", - "consecutive", "interleaved"), folds = NULL, names = - NULL, predictArgs = list(), costArgs = list(), envir = - parent.frame(), seed = NULL, ...) -} -\arguments{ - \item{object}{the fitted model for which to estimate the - prediction error, a function for fitting a model, or an - unevaluated function call for fitting a model (see - \code{\link{call}} for the latter). In the case of a - fitted model, the object is required to contain a - component \code{call} that stores the function call used - to fit the model, which is typically the case for objects - returned by model fitting functions.} - - \item{formula}{a \code{\link[stats]{formula}} describing - the model.} - - \item{data}{a data frame containing the variables - required for fitting the models. This is typically used - if the model in the function call is described by a - \code{\link[stats]{formula}}.} - - \item{x}{a numeric matrix containing the predictor - variables. This is typically used if the function call - for fitting the models requires the predictor matrix and - the response to be supplied as separate arguments.} - - \item{y}{a numeric vector or matrix containing the - response.} - - \item{args}{a list of additional arguments to be passed - to the model fitting function.} - - \item{cost}{a cost function measuring prediction loss. 
- It should expect the observed values of the response to - be passed as the first argument and the predicted values - as the second argument, and must return a non-negative - scalar value. The default is to use the root mean - squared prediction error (see \code{\link{cost}}).} - - \item{K}{an integer giving the number of groups into - which the data should be split (the default is five). - Keep in mind that this should be chosen such that all - groups are of approximately equal size. Setting \code{K} - equal to \code{n} yields leave-one-out cross-validation.} - - \item{R}{an integer giving the number of replications for - repeated \eqn{K}-fold cross-validation. This is ignored - for for leave-one-out cross-validation and other - non-random splits of the data.} - - \item{foldType}{a character string specifying the type of - folds to be generated. Possible values are - \code{"random"} (the default), \code{"consecutive"} or - \code{"interleaved"}.} - - \item{folds}{an object of class \code{"cvFolds"} giving - the folds of the data for cross-validation (as returned - by \code{\link{cvFolds}}). 
If supplied, this is - preferred over \code{K} and \code{R}.} - - \item{names}{an optional character vector giving names - for the arguments containing the data to be used in the - function call (see \dQuote{Details}).} - - \item{predictArgs}{a list of additional arguments to be - passed to the \code{\link[stats]{predict}} method of the - fitted models.} - - \item{costArgs}{a list of additional arguments to be - passed to the prediction loss function \code{cost}.} - - \item{envir}{the \code{\link{environment}} in which to - evaluate the function call for fitting the models (see - \code{\link{eval}}).} - - \item{seed}{optional initial seed for the random number - generator (see \code{\link{.Random.seed}}).} - - \item{\dots}{additional arguments to be passed down.} -} -\value{ - An object of class \code{"cv"} with the following - components: - - \item{n}{an integer giving the number of observations.} - - \item{K}{an integer giving the number of folds.} - - \item{R}{an integer giving the number of replications.} - - \item{cv}{a numeric vector containing the respective - estimated prediction errors. For repeated - cross-validation, those are average values over all - replications.} - - \item{reps}{a numeric matrix in which each column - contains the respective estimated prediction errors from - all replications. This is only returned for repeated - cross-validation.} - - \item{seed}{the seed of the random number generator - before cross-validation was performed.} - - \item{call}{the matched function call.} -} -\description{ - Estimate the prediction error of a model via (repeated) - \eqn{K}-fold cross-validation. It is thereby possible to - supply an object returned by a model fitting function, a - model fitting function itself, or an unevaluated function - call to a model fitting function. -} -\details{ - (Repeated) \eqn{K}-fold cross-validation is performed in - the following way. 
The data are first split into \eqn{K} - previously obtained blocks of approximately equal size. - Each of the \eqn{K} data blocks is left out once to fit - the model, and predictions are computed for the - observations in the left-out block with the - \code{\link[stats]{predict}} method of the fitted model. - Thus a prediction is obtained for each observation. - - The response variable and the obtained predictions for - all observations are then passed to the prediction loss - function \code{cost} to estimate the prediction error. - For repeated cross-validation, this process is replicated - and the estimated prediction errors from all replications - as well as their average are included in the returned - object. - - Furthermore, if the response is a vector but the - \code{\link[stats]{predict}} method of the fitted models - returns a matrix, the prediction error is computed for - each column. A typical use case for this behavior would - be if the \code{\link[stats]{predict}} method returns - predictions from an initial model fit and stepwise - improvements thereof. - - If \code{formula} or \code{data} are supplied, all - variables required for fitting the models are added as - one argument to the function call, which is the typical - behavior of model fitting functions with a - \code{\link[stats]{formula}} interface. In this case, - the accepted values for \code{names} depend on the - method. For the \code{function} method, a character - vector of length two should supplied, with the first - element specifying the argument name for the formula and - the second element specifying the argument name for the - data (the default is to use \code{c("formula", "data")}). - Note that names for both arguments should be supplied - even if only one is actually used. For the other - methods, which do not have a \code{formula} argument, a - character string specifying the argument name for the - data should be supplied (the default is to use - \code{"data"}). 
- - If \code{x} is supplied, on the other hand, the predictor - matrix and the response are added as separate arguments - to the function call. In this case, \code{names} should - be a character vector of length two, with the first - element specifying the argument name for the predictor - matrix and the second element specifying the argument - name for the response (the default is to use \code{c("x", - "y")}). It should be noted that the \code{formula} or - \code{data} arguments take precedence over \code{x}. -} -\examples{ -library("robustbase") -data("coleman") - -## via model fit -# fit an MM regression model -fit <- lmrob(Y ~ ., data=coleman) -# perform cross-validation -cvFit(fit, data = coleman, y = coleman$Y, cost = rtmspe, - K = 5, R = 10, costArgs = list(trim = 0.1), seed = 1234) - -## via model fitting function -# perform cross-validation -# note that the response is extracted from 'data' in -# this example and does not have to be supplied -cvFit(lmrob, formula = Y ~ ., data = coleman, cost = rtmspe, - K = 5, R = 10, costArgs = list(trim = 0.1), seed = 1234) - -## via function call -# set up function call -call <- call("lmrob", formula = Y ~ .) -# perform cross-validation -cvFit(call, data = coleman, y = coleman$Y, cost = rtmspe, - K = 5, R = 10, costArgs = list(trim = 0.1), seed = 1234) -} -\author{ - Andreas Alfons -} -\seealso{ - \code{\link{cvTool}}, \code{\link{cvSelect}}, - \code{\link{cvTuning}}, \code{\link{cvFolds}}, - \code{\link{cost}} -} -\keyword{utilities} - +\name{cvFit} +\alias{cvFit} +\alias{cvFit.call} +\alias{cvFit.default} +\alias{cvFit.function} +\alias{print.cv} +\title{Cross-validation for model evaluation} +\usage{ + cvFit(object, ...) + + \method{cvFit}{default} (object, data = NULL, x = NULL, + y, cost = rmspe, K = 5, R = 1, + foldType = c("random", "consecutive", "interleaved"), + folds = NULL, names = NULL, predictArgs = list(), + costArgs = list(), envir = parent.frame(), seed = NULL, + ...) 
+ + \method{cvFit}{function} (object, formula, data = NULL, + x = NULL, y, args = list(), cost = rmspe, K = 5, R = 1, + foldType = c("random", "consecutive", "interleaved"), + folds = NULL, names = NULL, predictArgs = list(), + costArgs = list(), envir = parent.frame(), seed = NULL, + ...) + + \method{cvFit}{call} (object, data = NULL, x = NULL, y, + cost = rmspe, K = 5, R = 1, + foldType = c("random", "consecutive", "interleaved"), + folds = NULL, names = NULL, predictArgs = list(), + costArgs = list(), envir = parent.frame(), seed = NULL, + ...) +} +\arguments{ + \item{object}{the fitted model for which to estimate the + prediction error, a function for fitting a model, or an + unevaluated function call for fitting a model (see + \code{\link{call}} for the latter). In the case of a + fitted model, the object is required to contain a + component \code{call} that stores the function call used + to fit the model, which is typically the case for objects + returned by model fitting functions.} + + \item{formula}{a \code{\link[stats]{formula}} describing + the model.} + + \item{data}{a data frame containing the variables + required for fitting the models. This is typically used + if the model in the function call is described by a + \code{\link[stats]{formula}}.} + + \item{x}{a numeric matrix containing the predictor + variables. This is typically used if the function call + for fitting the models requires the predictor matrix and + the response to be supplied as separate arguments.} + + \item{y}{a numeric vector or matrix containing the + response.} + + \item{args}{a list of additional arguments to be passed + to the model fitting function.} + + \item{cost}{a cost function measuring prediction loss. 
+ It should expect the observed values of the response to + be passed as the first argument and the predicted values + as the second argument, and must return either a + non-negative scalar value, or a list with the first + component containing the prediction error and the second + component containing the standard error. The default is + to use the root mean squared prediction error (see + \code{\link{cost}}).} + + \item{K}{an integer giving the number of groups into + which the data should be split (the default is five). + Keep in mind that this should be chosen such that all + groups are of approximately equal size. Setting \code{K} + equal to \code{n} yields leave-one-out cross-validation.} + + \item{R}{an integer giving the number of replications for + repeated \eqn{K}-fold cross-validation. This is ignored + for for leave-one-out cross-validation and other + non-random splits of the data.} + + \item{foldType}{a character string specifying the type of + folds to be generated. Possible values are + \code{"random"} (the default), \code{"consecutive"} or + \code{"interleaved"}.} + + \item{folds}{an object of class \code{"cvFolds"} giving + the folds of the data for cross-validation (as returned + by \code{\link{cvFolds}}). 
If supplied, this is + preferred over \code{K} and \code{R}.} + + \item{names}{an optional character vector giving names + for the arguments containing the data to be used in the + function call (see \dQuote{Details}).} + + \item{predictArgs}{a list of additional arguments to be + passed to the \code{\link[stats]{predict}} method of the + fitted models.} + + \item{costArgs}{a list of additional arguments to be + passed to the prediction loss function \code{cost}.} + + \item{envir}{the \code{\link{environment}} in which to + evaluate the function call for fitting the models (see + \code{\link{eval}}).} + + \item{seed}{optional initial seed for the random number + generator (see \code{\link{.Random.seed}}).} + + \item{\dots}{additional arguments to be passed down.} +} +\value{ + An object of class \code{"cv"} with the following + components: + + \item{n}{an integer giving the number of observations.} + + \item{K}{an integer giving the number of folds.} + + \item{R}{an integer giving the number of replications.} + + \item{cv}{a numeric vector containing the respective + estimated prediction errors. For repeated + cross-validation, those are average values over all + replications.} + + \item{sd}{a numeric vector containing the respective + estimated standard errors of the prediction loss.} + + \item{reps}{a numeric matrix in which each column + contains the respective estimated prediction errors from + all replications. This is only returned for repeated + cross-validation.} + + \item{seed}{the seed of the random number generator + before cross-validation was performed.} + + \item{call}{the matched function call.} +} +\description{ + Estimate the prediction error of a model via (repeated) + \eqn{K}-fold cross-validation. It is thereby possible to + supply an object returned by a model fitting function, a + model fitting function itself, or an unevaluated function + call to a model fitting function. 
+} +\details{ + (Repeated) \eqn{K}-fold cross-validation is performed in + the following way. The data are first split into \eqn{K} + previously obtained blocks of approximately equal size. + Each of the \eqn{K} data blocks is left out once to fit + the model, and predictions are computed for the + observations in the left-out block with the + \code{\link[stats]{predict}} method of the fitted model. + Thus a prediction is obtained for each observation. + + The response variable and the obtained predictions for + all observations are then passed to the prediction loss + function \code{cost} to estimate the prediction error. + For repeated cross-validation, this process is replicated + and the estimated prediction errors from all replications + as well as their average are included in the returned + object. + + Furthermore, if the response is a vector but the + \code{\link[stats]{predict}} method of the fitted models + returns a matrix, the prediction error is computed for + each column. A typical use case for this behavior would + be if the \code{\link[stats]{predict}} method returns + predictions from an initial model fit and stepwise + improvements thereof. + + If \code{formula} or \code{data} are supplied, all + variables required for fitting the models are added as + one argument to the function call, which is the typical + behavior of model fitting functions with a + \code{\link[stats]{formula}} interface. In this case, + the accepted values for \code{names} depend on the + method. For the \code{function} method, a character + vector of length two should supplied, with the first + element specifying the argument name for the formula and + the second element specifying the argument name for the + data (the default is to use \code{c("formula", "data")}). + Note that names for both arguments should be supplied + even if only one is actually used. 
For the other + methods, which do not have a \code{formula} argument, a + character string specifying the argument name for the + data should be supplied (the default is to use + \code{"data"}). + + If \code{x} is supplied, on the other hand, the predictor + matrix and the response are added as separate arguments + to the function call. In this case, \code{names} should + be a character vector of length two, with the first + element specifying the argument name for the predictor + matrix and the second element specifying the argument + name for the response (the default is to use \code{c("x", + "y")}). It should be noted that the \code{formula} or + \code{data} arguments take precedence over \code{x}. +} +\examples{ +library("robustbase") +data("coleman") + +## via model fit +# fit an MM regression model +fit <- lmrob(Y ~ ., data=coleman) +# perform cross-validation +cvFit(fit, data = coleman, y = coleman$Y, cost = rtmspe, + K = 5, R = 10, costArgs = list(trim = 0.1), seed = 1234) + +## via model fitting function +# perform cross-validation +# note that the response is extracted from 'data' in +# this example and does not have to be supplied +cvFit(lmrob, formula = Y ~ ., data = coleman, cost = rtmspe, + K = 5, R = 10, costArgs = list(trim = 0.1), seed = 1234) + +## via function call +# set up function call +call <- call("lmrob", formula = Y ~ .) 
+# perform cross-validation +cvFit(call, data = coleman, y = coleman$Y, cost = rtmspe, + K = 5, R = 10, costArgs = list(trim = 0.1), seed = 1234) +} +\author{ + Andreas Alfons +} +\seealso{ + \code{\link{cvTool}}, \code{\link{cvSelect}}, + \code{\link{cvTuning}}, \code{\link{cvFolds}}, + \code{\link{cost}} +} +\keyword{utilities} + diff --git a/man/cvFolds.Rd b/man/cvFolds.Rd index 0101770..cbd4bd0 100644 --- a/man/cvFolds.Rd +++ b/man/cvFolds.Rd @@ -3,8 +3,8 @@ \alias{print.cvFolds} \title{Cross-validation folds} \usage{ - cvFolds(n, K = 5, R = 1, type = c("random", "consecutive", - "interleaved")) + cvFolds(n, K = 5, R = 1, + type = c("random", "consecutive", "interleaved")) } \arguments{ \item{n}{an integer giving the number of observations to diff --git a/man/cvReshape.Rd b/man/cvReshape.Rd index 538401a..4e7d499 100644 --- a/man/cvReshape.Rd +++ b/man/cvReshape.Rd @@ -4,12 +4,40 @@ \alias{cvReshape.cvSelect} \title{Reshape cross-validation results} \usage{ - cvReshape(x) + cvReshape(x, ...) + + \method{cvReshape}{cv} (x, + selectBest = c("min", "hastie"), sdFactor = 1, ...) + + \method{cvReshape}{cvSelect} (x, + selectBest = c("min", "hastie"), sdFactor = 1, ...) } \arguments{ \item{x}{an object inheriting from class \code{"cv"} or \code{"cvSelect"} that contains cross-validation results.} + + \item{selectBest}{a character string specifying a + criterion for selecting the best model. Possible values + are \code{"min"} (the default) or \code{"hastie"}. The + former selects the model with the smallest prediction + error. The latter is useful for nested models or for + models with a tuning parameter controlling the complexity + of the model (e.g., penalized regression). It selects + the most parsimonious model whose prediction error is no + larger than \code{sdFactor} standard errors above the + prediction error of the best overall model. Note that + the models are thereby assumed to be ordered from the + most parsimonious one to the most complex one. 
In + particular a one-standard-error rule is frequently + applied.} + + \item{sdFactor}{a numeric value giving a multiplication + factor of the standard error for the selection of the + best model. This is ignored if \code{selectBest} is + \code{"min"}.} + + \item{\dots}{additional arguments to be passed down.} } \value{ An object of class \code{"cvSelect"} with the following @@ -31,10 +59,20 @@ cross-validation, those are average values over all replications.} + \item{sd}{a data frame containing the estimated standard + errors of the prediction loss for the models.} + + \item{selectBest}{a character string specifying the + criterion used for selecting the best model.} + + \item{sdFactor}{a numeric value giving the multiplication + factor of the standard error used for the selection of + the best model.} + \item{reps}{a data frame containing the estimated prediction errors for the models from all replications. This is only returned if repeated cross-validation was - performed for at least one of the models.} + performed.} } \description{ Reshape cross-validation results into an object of class @@ -55,6 +93,11 @@ cvReshape(cvFitLts) \author{ Andreas Alfons } +\references{ + Hastie, T., Tibshirani, R. and Friedman, J. (2009) + \emph{The Elements of Statistical Learning: Data Mining, + Inference, and Prediction}. Springer, 2nd edition. 
+} \seealso{ \code{\link{cvFit}}, \code{\link{cvSelect}}, \code{\link{cvTuning}} diff --git a/man/cvSelect.Rd b/man/cvSelect.Rd index d3f3b41..1d6ae3e 100644 --- a/man/cvSelect.Rd +++ b/man/cvSelect.Rd @@ -3,7 +3,8 @@ \alias{print.cvSelect} \title{Model selection based on cross-validation} \usage{ - cvSelect(..., .reshape = FALSE) + cvSelect(..., .reshape = FALSE, + .selectBest = c("min", "hastie"), .sdFactor = 1) } \arguments{ \item{\dots}{objects inheriting from class \code{"cv"} or @@ -13,6 +14,26 @@ more than one column of cross-validation results should be reshaped to have only one column (see \dQuote{Details}).} + + \item{.selectBest}{a character string specifying a + criterion for selecting the best model. Possible values + are \code{"min"} (the default) or \code{"hastie"}. The + former selects the model with the smallest prediction + error. The latter is useful for nested models or for + models with a tuning parameter controlling the complexity + of the model (e.g., penalized regression). It selects + the most parsimonious model whose prediction error is no + larger than \code{.sdFactor} standard errors above the + prediction error of the best overall model. Note that + the models are thereby assumed to be ordered from the + most parsimonious one to the most complex one. In + particular a one-standard-error rule is frequently + applied.} + + \item{.sdFactor}{a numeric value giving a multiplication + factor of the standard error for the selection of the + best model. 
This is ignored if \code{.selectBest} is + \code{"min"}.} } \value{ An object of class \code{"cvSelect"} with the following @@ -35,6 +56,16 @@ repeated cross-validation was performed, those are average values over all replications.} + \item{sd}{a data frame containing the estimated standard + errors of the prediction loss for the models.} + + \item{selectBest}{a character string specifying the + criterion used for selecting the best model.} + + \item{sdFactor}{a numeric value giving the multiplication + factor of the standard error used for the selection of + the best model.} + \item{reps}{a data frame containing the estimated prediction errors from all replications for those models for which repeated cross-validation was performed. This @@ -70,8 +101,9 @@ \code{\link{cvReshape}} to have only one column. Then the best overall model is selected. - It should also be noted that the \code{.reshape} argument - starts with a dot to avoid conflicts with the argument + It should also be noted that the argument names of + \code{.reshape}, \code{.selectBest} and \code{.sdFactor} + start with a dot to avoid conflicts with the argument names used for the objects containing cross-validation results. } @@ -115,14 +147,14 @@ cvSelect(LS = cvFitLm, MM = cvFitLmrob, LTS = cvFitLts) ## compare raw and reweighted LTS estimators for -## 50% and 75% subsets +## 50\% and 75\% subsets -# 50% subsets +# 50\% subsets fitLts50 <- ltsReg(Y ~ ., data = coleman, alpha = 0.5) cvFitLts50 <- cvLts(fitLts50, cost = rtmspe, folds = folds, fit = "both", trim = 0.1) -# 75% subsets +# 75\% subsets fitLts75 <- ltsReg(Y ~ ., data = coleman, alpha = 0.75) cvFitLts75 <- cvLts(fitLts75, cost = rtmspe, folds = folds, fit = "both", trim = 0.1) @@ -133,6 +165,11 @@ cvSelect("0.5" = cvFitLts50, "0.75" = cvFitLts75) \author{ Andreas Alfons } +\references{ + Hastie, T., Tibshirani, R. and Friedman, J. (2009) + \emph{The Elements of Statistical Learning: Data Mining, + Inference, and Prediction}. 
Springer, 2nd edition. +} \seealso{ \code{\link{cvFit}}, \code{\link{cvTuning}} } diff --git a/man/cvTool.Rd b/man/cvTool.Rd index fe9d357..396a992 100644 --- a/man/cvTool.Rd +++ b/man/cvTool.Rd @@ -1,130 +1,139 @@ -\name{cvTool} -\alias{cvTool} -\title{Low-level function for cross-validation} -\usage{ - cvTool(call, data = NULL, x = NULL, y, cost = rmspe, - folds, names = NULL, predictArgs = list(), costArgs = - list(), envir = parent.frame()) -} -\arguments{ - \item{call}{an unevaluated function call for fitting a - model (see \code{\link{call}}).} - - \item{data}{a data frame containing the variables - required for fitting the models. This is typically used - if the model in the function call is described by a - \code{\link[stats]{formula}}.} - - \item{x}{a numeric matrix containing the predictor - variables. This is typically used if the function call - for fitting the models requires the predictor matrix and - the response to be supplied as separate arguments.} - - \item{y}{a numeric vector or matrix containing the - response.} - - \item{cost}{a cost function measuring prediction loss. - It should expect the observed values of the response to - be passed as the first argument and the predicted values - as the second argument, and must return a non-negative - scalar value. 
The default is to use the root mean - squared prediction error (see \code{\link{cost}}).} - - \item{folds}{an object of class \code{"cvFolds"} giving - the folds of the data for cross-validation (as returned - by \code{\link{cvFolds}}).} - - \item{names}{an optional character vector giving names - for the arguments containing the data to be used in the - function call (see \dQuote{Details}).} - - \item{predictArgs}{a list of additional arguments to be - passed to the \code{\link[stats]{predict}} method of the - fitted models.} - - \item{costArgs}{a list of additional arguments to be - passed to the prediction loss function \code{cost}.} - - \item{envir}{the \code{\link{environment}} in which to - evaluate the function call for fitting the models (see - \code{\link{eval}}).} -} -\value{ - A numeric matrix in which each column contains the - respective estimated prediction errors from all - replications. -} -\description{ - Basic function to estimate the prediction error of a - model via (repeated) \eqn{K}-fold cross-validation. The - model is thereby specified by an unevaluated function - call to a model fitting function. -} -\details{ - (Repeated) \eqn{K}-fold cross-validation is performed in - the following way. The data are first split into \eqn{K} - previously obtained blocks of approximately equal size - (given by \code{folds}). Each of the \eqn{K} data blocks - is left out once to fit the model, and predictions are - computed for the observations in the left-out block with - the \code{\link[stats]{predict}} method of the fitted - model. Thus a prediction is obtained for each - observation. - - The response variable and the obtained predictions for - all observations are then passed to the prediction loss - function \code{cost} to estimate the prediction error. - For repeated cross-validation (as indicated by - \code{folds}), this process is replicated and the - estimated prediction errors from all replications are - returned. 
- - Furthermore, if the response is a vector but the - \code{\link[stats]{predict}} method of the fitted models - returns a matrix, the prediction error is computed for - each column. A typical use case for this behavior would - be if the \code{\link[stats]{predict}} method returns - predictions from an initial model fit and stepwise - improvements thereof. - - If \code{data} is supplied, all variables required for - fitting the models are added as one argument to the - function call, which is the typical behavior of model - fitting functions with a \code{\link[stats]{formula}} - interface. In this case, a character string specifying - the argument name can be passed via \code{names} (the - default is to use \code{"data"}). - - If \code{x} is supplied, on the other hand, the predictor - matrix and the response are added as separate arguments - to the function call. In this case, \code{names} should - be a character vector of length two, with the first - element specifying the argument name for the predictor - matrix and the second element specifying the argument - name for the response (the default is to use \code{c("x", - "y")}). It should be noted that \code{data} takes - precedence over \code{x} if both are supplied. -} -\examples{ -library("robustbase") -data("coleman") -set.seed(1234) # set seed for reproducibility - -# set up function call for an MM regression model -call <- call("lmrob", formula = Y ~ .) 
-# set up folds for cross-validation -folds <- cvFolds(nrow(coleman), K = 5, R = 10) - -# perform cross-validation -cvTool(call, data = coleman, y = coleman$Y, cost = rtmspe, - folds = folds, costArgs = list(trim = 0.1)) -} -\author{ - Andreas Alfons -} -\seealso{ - \code{\link{cvFit}}, \code{\link{cvTuning}}, - \code{\link{cvFolds}}, \code{\link{cost}} -} -\keyword{utilities} - +\name{cvTool} +\alias{cvTool} +\title{Low-level function for cross-validation} +\usage{ + cvTool(call, data = NULL, x = NULL, y, cost = rmspe, + folds, names = NULL, predictArgs = list(), + costArgs = list(), envir = parent.frame()) +} +\arguments{ + \item{call}{an unevaluated function call for fitting a + model (see \code{\link{call}}).} + + \item{data}{a data frame containing the variables + required for fitting the models. This is typically used + if the model in the function call is described by a + \code{\link[stats]{formula}}.} + + \item{x}{a numeric matrix containing the predictor + variables. This is typically used if the function call + for fitting the models requires the predictor matrix and + the response to be supplied as separate arguments.} + + \item{y}{a numeric vector or matrix containing the + response.} + + \item{cost}{a cost function measuring prediction loss. + It should expect the observed values of the response to + be passed as the first argument and the predicted values + as the second argument, and must return either a + non-negative scalar value, or a list with the first + component containing the prediction error and the second + component containing the standard error. 
The default is + to use the root mean squared prediction error (see + \code{\link{cost}}).} + + \item{folds}{an object of class \code{"cvFolds"} giving + the folds of the data for cross-validation (as returned + by \code{\link{cvFolds}}).} + + \item{names}{an optional character vector giving names + for the arguments containing the data to be used in the + function call (see \dQuote{Details}).} + + \item{predictArgs}{a list of additional arguments to be + passed to the \code{\link[stats]{predict}} method of the + fitted models.} + + \item{costArgs}{a list of additional arguments to be + passed to the prediction loss function \code{cost}.} + + \item{envir}{the \code{\link{environment}} in which to + evaluate the function call for fitting the models (see + \code{\link{eval}}).} +} +\value{ + If only one replication is requested and the prediction + loss function \code{cost} also returns the standard + error, a list is returned, with the first component + containing the estimated prediction errors and the second + component the corresponding estimated standard errors. + + Otherwise the return value is a numeric matrix in which + each column contains the respective estimated prediction + errors from all replications. +} +\description{ + Basic function to estimate the prediction error of a + model via (repeated) \eqn{K}-fold cross-validation. The + model is thereby specified by an unevaluated function + call to a model fitting function. +} +\details{ + (Repeated) \eqn{K}-fold cross-validation is performed in + the following way. The data are first split into \eqn{K} + previously obtained blocks of approximately equal size + (given by \code{folds}). Each of the \eqn{K} data blocks + is left out once to fit the model, and predictions are + computed for the observations in the left-out block with + the \code{\link[stats]{predict}} method of the fitted + model. Thus a prediction is obtained for each + observation. 
+ + The response variable and the obtained predictions for + all observations are then passed to the prediction loss + function \code{cost} to estimate the prediction error. + For repeated cross-validation (as indicated by + \code{folds}), this process is replicated and the + estimated prediction errors from all replications are + returned. + + Furthermore, if the response is a vector but the + \code{\link[stats]{predict}} method of the fitted models + returns a matrix, the prediction error is computed for + each column. A typical use case for this behavior would + be if the \code{\link[stats]{predict}} method returns + predictions from an initial model fit and stepwise + improvements thereof. + + If \code{data} is supplied, all variables required for + fitting the models are added as one argument to the + function call, which is the typical behavior of model + fitting functions with a \code{\link[stats]{formula}} + interface. In this case, a character string specifying + the argument name can be passed via \code{names} (the + default is to use \code{"data"}). + + If \code{x} is supplied, on the other hand, the predictor + matrix and the response are added as separate arguments + to the function call. In this case, \code{names} should + be a character vector of length two, with the first + element specifying the argument name for the predictor + matrix and the second element specifying the argument + name for the response (the default is to use \code{c("x", + "y")}). It should be noted that \code{data} takes + precedence over \code{x} if both are supplied. +} +\examples{ +library("robustbase") +data("coleman") +set.seed(1234) # set seed for reproducibility + +# set up function call for an MM regression model +call <- call("lmrob", formula = Y ~ .) 
+# set up folds for cross-validation +folds <- cvFolds(nrow(coleman), K = 5, R = 10) + +# perform cross-validation +cvTool(call, data = coleman, y = coleman$Y, cost = rtmspe, + folds = folds, costArgs = list(trim = 0.1)) +} +\author{ + Andreas Alfons +} +\seealso{ + \code{\link{cvFit}}, \code{\link{cvTuning}}, + \code{\link{cvFolds}}, \code{\link{cost}} +} +\keyword{utilities} + diff --git a/man/cvTools-package.Rd b/man/cvTools-package.Rd index 3124694..0894670 100644 --- a/man/cvTools-package.Rd +++ b/man/cvTools-package.Rd @@ -12,11 +12,16 @@ and assist users with model selection. \tabular{ll}{ Package: \tab cvTools\cr Type: \tab Package\cr -Version: \tab 0.1.1\cr -Date: \tab 2011-12-07\cr +Version: \tab 0.2.0\cr +Date: \tab 2012-02-13\cr Depends: \tab +R (>= 2.11.0), lattice, robustbase\cr +Imports: \tab +lattice, +robustbase, +stats\cr License: \tab GPL (>= 2)\cr LazyLoad: \tab yes\cr } @@ -29,7 +34,6 @@ aggregate.cv Aggregate cross-validation results bwplot.cv Box-and-whisker plots of cross-validation results cost Prediction loss -cvExamples Cross-validation for linear models cvFit Cross-validation for model evaluation cvFolds Cross-validation folds cvReshape Reshape cross-validation results @@ -39,11 +43,12 @@ cvTools-package Cross-validation tools for regression models cvTuning Cross-validation for tuning parameter selection densityplot.cv Kernel density plots of cross-validation results -dotplot.cvSelect Dot plots of cross-validation results +dotplot.cv Dot plots of cross-validation results plot.cv Plot cross-validation results +repCV Cross-validation for linear models subset.cv Subsetting cross-validation results summary.cv Summarize cross-validation results -xyplot.cvSelect X-Y plots of cross-validation results +xyplot.cv X-Y plots of cross-validation results } } \author{ diff --git a/man/cvTuning.Rd b/man/cvTuning.Rd index ad54c3e..1d9b1eb 100644 --- a/man/cvTuning.Rd +++ b/man/cvTuning.Rd @@ -5,20 +5,22 @@ \alias{print.cvTuning} \title{Cross-validation 
for tuning parameter selection} \usage{ - cvTuning(object, ...) - - \method{cvTuning}{function} (object, formula, data = NULL, - x = NULL, y, tuning = list(), args = list(), cost = - rmspe, K = 5, R = 1, foldType = c("random", - "consecutive", "interleaved"), folds = NULL, names = - NULL, predictArgs = list(), costArgs = list(), envir = - parent.frame(), seed = NULL, ...) - - \method{cvTuning}{call} (object, data = NULL, x = NULL, y, - tuning = list(), cost = rmspe, K = 5, R = 1, foldType - = c("random", "consecutive", "interleaved"), folds = - NULL, names = NULL, predictArgs = list(), costArgs = - list(), envir = parent.frame(), seed = NULL, ...) + cvTuning(object, ...) + + \method{cvTuning}{function} (object, formula, + data = NULL, x = NULL, y, tuning = list(), + args = list(), cost = rmspe, K = 5, R = 1, + foldType = c("random", "consecutive", "interleaved"), + folds = NULL, names = NULL, predictArgs = list(), + costArgs = list(), selectBest = c("min", "hastie"), + sdFactor = 1, envir = parent.frame(), seed = NULL, ...) + + \method{cvTuning}{call} (object, data = NULL, x = NULL, + y, tuning = list(), cost = rmspe, K = 5, R = 1, + foldType = c("random", "consecutive", "interleaved"), + folds = NULL, names = NULL, predictArgs = list(), + costArgs = list(), selectBest = c("min", "hastie"), + sdFactor = 1, envir = parent.frame(), seed = NULL, ...) } \arguments{ \item{object}{a function or an unevaluated function call @@ -55,9 +57,12 @@ \item{cost}{a cost function measuring prediction loss. It should expect the observed values of the response to be passed as the first argument and the predicted values - as the second argument, and must return a non-negative - scalar value. The default is to use the root mean - squared prediction error (see \code{\link{cost}}).} + as the second argument, and must return either a + non-negative scalar value, or a list with the first + component containing the prediction error and the second + component containing the standard error. 
The default is + to use the root mean squared prediction error (see + \code{\link{cost}}).} \item{K}{an integer giving the number of groups into which the data should be split (the default is five). @@ -91,6 +96,25 @@ \item{costArgs}{a list of additional arguments to be passed to the prediction loss function \code{cost}.} + \item{selectBest}{a character string specifying a + criterion for selecting the best model. Possible values + are \code{"min"} (the default) or \code{"hastie"}. The + former selects the model with the smallest prediction + error. The latter is useful for models with a tuning + parameter controlling the complexity of the model (e.g., + penalized regression). It selects the most parsimonious + model whose prediction error is no larger than + \code{sdFactor} standard errors above the prediction + error of the best overall model. Note that the models + are thereby assumed to be ordered from the most + parsimonious one to the most complex one. In particular + a one-standard-error rule is frequently applied.} + + \item{sdFactor}{a numeric value giving a multiplication + factor of the standard error for the selection of the + best model. This is ignored if \code{selectBest} is + \code{"min"}.} + \item{envir}{the \code{\link{environment}} in which to evaluate the function call for fitting the models (see \code{\link{eval}}).} @@ -126,6 +150,17 @@ parameter values. 
For repeated cross-validation, those are average values over all replications.} + \item{sd}{a data frame containing the estimated standard + errors of the prediction loss for all combinations of + tuning parameter values.} + + \item{selectBest}{a character string specifying the + criterion used for selecting the best model.} + + \item{sdFactor}{a numeric value giving the multiplication + factor of the standard error used for the selection of + the best model.} + \item{reps}{a data frame containing the estimated prediction errors from all replications for all combinations of tuning parameter values. This is only @@ -206,7 +241,7 @@ library("robustbase") data("coleman") -## evaluate MM regression models tuned for 85% and 95% efficiency +## evaluate MM regression models tuned for 85\% and 95\% efficiency tuning <- list(tuning.psi = c(3.443689, 4.685061)) ## via model fitting function @@ -228,6 +263,11 @@ cvTuning(call, data = coleman, y = coleman$Y, tuning = tuning, \author{ Andreas Alfons } +\references{ + Hastie, T., Tibshirani, R. and Friedman, J. (2009) + \emph{The Elements of Statistical Learning: Data Mining, + Inference, and Prediction}. Springer, 2nd edition. +} \seealso{ \code{\link{cvTool}}, \code{\link{cvFit}}, \code{\link{cvSelect}}, \code{\link{cvFolds}}, diff --git a/man/densityplot.cv.Rd b/man/densityplot.cv.Rd index 7b235a5..12ca1cf 100644 --- a/man/densityplot.cv.Rd +++ b/man/densityplot.cv.Rd @@ -3,10 +3,10 @@ \alias{densityplot.cvSelect} \title{Kernel density plots of cross-validation results} \usage{ - \method{densityplot}{cv} (x, data, select = NULL, ...) + \method{densityplot}{cv} (x, data, select = NULL, ...) - \method{densityplot}{cvSelect} (x, data, subset = NULL, - select = NULL, ...) + \method{densityplot}{cvSelect} (x, data, subset = NULL, + select = NULL, ...) 
} \arguments{ \item{x}{an object inheriting from class \code{"cv"} or @@ -82,14 +82,14 @@ densityplot(cvFits) ## compare raw and reweighted LTS estimators for -## 50% and 75% subsets +## 50\% and 75\% subsets -# 50% subsets +# 50\% subsets fitLts50 <- ltsReg(Y ~ ., data = coleman, alpha = 0.5) cvFitLts50 <- cvLts(fitLts50, cost = rtmspe, folds = folds, fit = "both", trim = 0.1) -# 75% subsets +# 75\% subsets fitLts75 <- ltsReg(Y ~ ., data = coleman, alpha = 0.75) cvFitLts75 <- cvLts(fitLts75, cost = rtmspe, folds = folds, fit = "both", trim = 0.1) diff --git a/man/dotplot.cvSelect.Rd b/man/dotplot.cv.Rd similarity index 83% rename from man/dotplot.cvSelect.Rd rename to man/dotplot.cv.Rd index 3050b54..855b169 100644 --- a/man/dotplot.cvSelect.Rd +++ b/man/dotplot.cv.Rd @@ -1,106 +1,115 @@ -\name{dotplot.cvSelect} -\alias{dotplot.cvSelect} -\title{Dot plots of cross-validation results} -\usage{ - \method{dotplot}{cvSelect} (x, data, subset = NULL, select - = NULL, ...) -} -\arguments{ - \item{x}{an object inheriting from class - \code{"cvSelect"} that contains cross-validation - results.} - - \item{data}{currently ignored.} - - \item{subset}{a character, integer or logical vector - indicating the subset of models for which to plot the - cross-validation results.} - - \item{select}{a character, integer or logical vector - indicating the columns of cross-validation results to be - plotted.} - - \item{\dots}{additional arguments to be passed to the - \code{"formula"} method of - \code{\link[lattice:xyplot]{dotplot}}.} -} -\value{ - An object of class \code{"trellis"} is returned - invisibly. The - \code{\link[lattice:update.trellis]{update}} method can - be used to update components of the object and the - \code{\link[lattice:print.trellis]{print}} method - (usually called by default) will plot it on an - appropriate plotting device. -} -\description{ - Produce dot plots of (average) results from (repeated) - \eqn{K}-fold cross-validation. 
-} -\details{ - For objects with multiple columns of repeated - cross-validation results, conditional dot plots are - produced. -} -\examples{ -library("robustbase") -data("coleman") -set.seed(1234) # set seed for reproducibility - -## set up folds for cross-validation -folds <- cvFolds(nrow(coleman), K = 5, R = 10) - - -## compare LS, MM and LTS regression - -# perform cross-validation for an LS regression model -fitLm <- lm(Y ~ ., data = coleman) -cvFitLm <- cvLm(fitLm, cost = rtmspe, - folds = folds, trim = 0.1) - -# perform cross-validation for an MM regression model -fitLmrob <- lmrob(Y ~ ., data = coleman, k.max = 500) -cvFitLmrob <- cvLmrob(fitLmrob, cost = rtmspe, - folds = folds, trim = 0.1) - -# perform cross-validation for an LTS regression model -fitLts <- ltsReg(Y ~ ., data = coleman) -cvFitLts <- cvLts(fitLts, cost = rtmspe, - folds = folds, trim = 0.1) - -# combine and plot results -cvFits <- cvSelect(LS = cvFitLm, MM = cvFitLmrob, LTS = cvFitLts) -cvFits -dotplot(cvFits) - - -## compare raw and reweighted LTS estimators for -## 50% and 75% subsets - -# 50% subsets -fitLts50 <- ltsReg(Y ~ ., data = coleman, alpha = 0.5) -cvFitLts50 <- cvLts(fitLts50, cost = rtmspe, folds = folds, - fit = "both", trim = 0.1) - -# 75% subsets -fitLts75 <- ltsReg(Y ~ ., data = coleman, alpha = 0.75) -cvFitLts75 <- cvLts(fitLts75, cost = rtmspe, folds = folds, - fit = "both", trim = 0.1) - -# combine and plot results -cvFitsLts <- cvSelect("0.5" = cvFitLts50, "0.75" = cvFitLts75) -cvFitsLts -dotplot(cvFitsLts) -} -\author{ - Andreas Alfons -} -\seealso{ - \code{\link{cvFit}}, \code{\link{cvSelect}}, - \code{\link{cvTuning}}, \code{\link[=plot.cv]{plot}}, - \code{\link[=xyplot.cvSelect]{xyplot}}, - \code{\link[=bwplot.cv]{bwplot}}, - \code{\link[=densityplot.cv]{densityplot}} -} -\keyword{hplot} - +\name{dotplot.cv} +\alias{dotplot.cv} +\alias{dotplot.cvSelect} +\title{Dot plots of cross-validation results} +\usage{ + \method{dotplot}{cv} (x, data, select = NULL, + 
sdFactor = NA, ...) + + \method{dotplot}{cvSelect} (x, data, subset = NULL, + select = NULL, sdFactor = x$sdFactor, ...) +} +\arguments{ + \item{x}{an object inheriting from class + \code{"cvSelect"} that contains cross-validation + results.} + + \item{data}{currently ignored.} + + \item{subset}{a character, integer or logical vector + indicating the subset of models for which to plot the + cross-validation results.} + + \item{select}{a character, integer or logical vector + indicating the columns of cross-validation results to be + plotted.} + + \item{sdFactor}{a numeric value giving the multiplication + factor of the standard error for displaying error bars. + Error bars can be suppressed by setting this to + \code{NA}.} + + \item{\dots}{additional arguments to be passed to the + \code{"formula"} method of + \code{\link[lattice:xyplot]{dotplot}}.} +} +\value{ + An object of class \code{"trellis"} is returned + invisibly. The + \code{\link[lattice:update.trellis]{update}} method can + be used to update components of the object and the + \code{\link[lattice:print.trellis]{print}} method + (usually called by default) will plot it on an + appropriate plotting device. +} +\description{ + Produce dot plots of (average) results from (repeated) + \eqn{K}-fold cross-validation. +} +\details{ + For objects with multiple columns of repeated + cross-validation results, conditional dot plots are + produced. 
+} +\examples{ +library("robustbase") +data("coleman") +set.seed(1234) # set seed for reproducibility + +## set up folds for cross-validation +folds <- cvFolds(nrow(coleman), K = 5, R = 10) + + +## compare LS, MM and LTS regression + +# perform cross-validation for an LS regression model +fitLm <- lm(Y ~ ., data = coleman) +cvFitLm <- cvLm(fitLm, cost = rtmspe, + folds = folds, trim = 0.1) + +# perform cross-validation for an MM regression model +fitLmrob <- lmrob(Y ~ ., data = coleman, k.max = 500) +cvFitLmrob <- cvLmrob(fitLmrob, cost = rtmspe, + folds = folds, trim = 0.1) + +# perform cross-validation for an LTS regression model +fitLts <- ltsReg(Y ~ ., data = coleman) +cvFitLts <- cvLts(fitLts, cost = rtmspe, + folds = folds, trim = 0.1) + +# combine and plot results +cvFits <- cvSelect(LS = cvFitLm, MM = cvFitLmrob, LTS = cvFitLts) +cvFits +dotplot(cvFits) + + +## compare raw and reweighted LTS estimators for +## 50\% and 75\% subsets + +# 50\% subsets +fitLts50 <- ltsReg(Y ~ ., data = coleman, alpha = 0.5) +cvFitLts50 <- cvLts(fitLts50, cost = rtmspe, folds = folds, + fit = "both", trim = 0.1) + +# 75\% subsets +fitLts75 <- ltsReg(Y ~ ., data = coleman, alpha = 0.75) +cvFitLts75 <- cvLts(fitLts75, cost = rtmspe, folds = folds, + fit = "both", trim = 0.1) + +# combine and plot results +cvFitsLts <- cvSelect("0.5" = cvFitLts50, "0.75" = cvFitLts75) +cvFitsLts +dotplot(cvFitsLts) +} +\author{ + Andreas Alfons +} +\seealso{ + \code{\link{cvFit}}, \code{\link{cvSelect}}, + \code{\link{cvTuning}}, \code{\link[=plot.cv]{plot}}, + \code{\link[=xyplot.cvSelect]{xyplot}}, + \code{\link[=bwplot.cv]{bwplot}}, + \code{\link[=densityplot.cv]{densityplot}} +} +\keyword{hplot} + diff --git a/man/plot.cv.Rd b/man/plot.cv.Rd index 31785d6..76a24f1 100644 --- a/man/plot.cv.Rd +++ b/man/plot.cv.Rd @@ -3,11 +3,13 @@ \alias{plot.cvSelect} \title{Plot cross-validation results} \usage{ - \method{plot}{cv} (x, method = c("bwplot", "densityplot"), - ...) 
+ \method{plot}{cv} (x, + method = c("bwplot", "densityplot", "xyplot", "dotplot"), + ...) - \method{plot}{cvSelect} (x, method = c("bwplot", - "densityplot", "xyplot", "dotplot"), ...) + \method{plot}{cvSelect} (x, + method = c("bwplot", "densityplot", "xyplot", "dotplot"), + ...) } \arguments{ \item{x}{an object inheriting from class \code{"cv"} or @@ -93,14 +95,14 @@ plot(cvFits, method = "dot") ## compare raw and reweighted LTS estimators for -## 50% and 75% subsets +## 50\% and 75\% subsets -# 50% subsets +# 50\% subsets fitLts50 <- ltsReg(Y ~ ., data = coleman, alpha = 0.5) cvFitLts50 <- cvLts(fitLts50, cost = rtmspe, folds = folds, fit = "both", trim = 0.1) -# 75% subsets +# 75\% subsets fitLts75 <- ltsReg(Y ~ ., data = coleman, alpha = 0.75) cvFitLts75 <- cvLts(fitLts75, cost = rtmspe, folds = folds, fit = "both", trim = 0.1) diff --git a/man/cvExamples.Rd b/man/repCV.Rd similarity index 55% rename from man/cvExamples.Rd rename to man/repCV.Rd index 8d2b16a..e8abca3 100644 --- a/man/cvExamples.Rd +++ b/man/repCV.Rd @@ -1,166 +1,200 @@ -\name{cvExamples} -\alias{cvExamples} -\alias{cvLm} -\alias{cvLmrob} -\alias{cvLts} -\title{Cross-validation for linear models} -\usage{ - cvLm(object, cost = rmspe, K = 5, R = 1, foldType = - c("random", "consecutive", "interleaved"), folds = - NULL, seed = NULL, ...) - - cvLmrob(object, cost = rtmspe, K = 5, R = 1, foldType = - c("random", "consecutive", "interleaved"), folds = - NULL, seed = NULL, ...) - - cvLts(object, cost = rtmspe, K = 5, R = 1, foldType = - c("random", "consecutive", "interleaved"), folds = - NULL, fit = c("reweighted", "raw", "both"), seed = - NULL, ...) -} -\arguments{ - \item{object}{for \code{cvLm}, an object of class - \code{"lm"} computed with \code{\link[stats]{lm}}. For - \code{cvLmrob}, an object of class \code{"lmrob"} - computed with \code{\link[robustbase]{lmrob}}. 
For - \code{cvLts}, an object of class \code{"lts"} computed - with \code{\link[robustbase]{ltsReg}}.} - - \item{cost}{a cost function measuring prediction loss. - It should expect the observed values of the response to - be passed as the first argument and the predicted values - as the second argument, and must return a non-negative - scalar value. The default is to use the root mean - squared prediction error for \code{cvLm} and the root - trimmed mean squared prediction error for \code{cvLmrob} - and \code{cvLts} (see \code{\link{cost}}).} - - \item{K}{an integer giving the number of groups into - which the data should be split (the default is five). - Keep in mind that this should be chosen such that all - groups are of approximately equal size. Setting \code{K} - equal to \code{n} yields leave-one-out cross-validation.} - - \item{R}{an integer giving the number of replications for - repeated \eqn{K}-fold cross-validation. This is ignored - for for leave-one-out cross-validation and other - non-random splits of the data.} - - \item{foldType}{a character string specifying the type of - folds to be generated. Possible values are - \code{"random"} (the default), \code{"consecutive"} or - \code{"interleaved"}.} - - \item{folds}{an object of class \code{"cvFolds"} giving - the folds of the data for cross-validation (as returned - by \code{\link{cvFolds}}). If supplied, this is - preferred over \code{K} and \code{R}.} - - \item{fit}{a character string specifying for which fit to - estimate the prediction error. 
Possible values are - \code{"reweighted"} (the default) for the prediction - error of the reweighted fit, \code{"raw"} for the - prediction error of the raw fit, or \code{"both"} for the - prediction error of both fits.} - - \item{seed}{optional initial seed for the random number - generator (see \code{\link{.Random.seed}}).} - - \item{\dots}{additional arguments to be passed to the - prediction loss function \code{cost}.} -} -\value{ - An object of class \code{"cv"} with the following - components: - - \item{n}{an integer giving the number of observations.} - - \item{K}{an integer giving the number of folds.} - - \item{R}{an integer giving the number of replications.} - - \item{cv}{a numeric vector containing the estimated - prediction errors. For \code{cvLm} and \code{cvLmrob}, - this is a single numeric value. For \code{cvLts}, this - contains one value for each of the requested fits. In - the case of repeated cross-validation, those are average - values over all replications.} - - \item{reps}{a numeric matrix containing the estimated - prediction errors from all replications. For \code{cvLm} - and \code{cvLmrob}, this is a matrix with one column. - For \code{cvLts}, this contains one column for each of - the requested fits. However, this is only returned for - repeated cross-validation.} - - \item{seed}{the seed of the random number generator - before cross-validation was performed.} - - \item{call}{the matched function call.} -} -\description{ - Estimate the prediction error of a linear model via - (repeated) \eqn{K}-fold cross-validation. - Cross-validation functions are available for least - squares fits computed with \code{\link[stats]{lm}} as - well as for the following robust alternatives: MM-type - models computed with \code{\link[robustbase]{lmrob}} and - least trimmed squares fits computed with - \code{\link[robustbase]{ltsReg}}. -} -\details{ - (Repeated) \eqn{K}-fold cross-validation is performed in - the following way. 
The data are first split into \eqn{K} - previously obtained blocks of approximately equal size. - Each of the \eqn{K} data blocks is left out once to fit - the model, and predictions are computed for the - observations in the left-out block with the - \code{\link[stats]{predict}} method of the fitted model. - Thus a prediction is obtained for each observation. - - The response variable and the obtained predictions for - all observations are then passed to the prediction loss - function \code{cost} to estimate the prediction error. - For repeated cross-validation, this process is replicated - and the estimated prediction errors from all replications - as well as their average are included in the returned - object. -} -\note{ - Those are simple wrapper functions that extract the data - from the fitted model and call \code{\link{cvFit}} to - perform cross-validation. -} -\examples{ -library("robustbase") -data("coleman") -set.seed(1234) # set seed for reproducibility - -# set up folds for cross-validation -folds <- cvFolds(nrow(coleman), K = 5, R = 10) - -# perform cross-validation for an LS regression model -fitLm <- lm(Y ~ ., data = coleman) -cvLm(fitLm, cost = rtmspe, folds = folds, trim = 0.1) - -# perform cross-validation for an MM regression model -fitLmrob <- lmrob(Y ~ ., data = coleman) -cvLmrob(fitLmrob, cost = rtmspe, folds = folds, trim = 0.1) - -# perform cross-validation for an LTS regression model -fitLts <- ltsReg(Y ~ ., data = coleman) -cvLts(fitLts, cost = rtmspe, folds = folds, trim = 0.1) -cvLts(fitLts, cost = rtmspe, folds = folds, - fit = "both", trim = 0.1) -} -\author{ - Andreas Alfons -} -\seealso{ - \code{\link{cvFit}}, \code{\link{cvFolds}}, - \code{\link{cost}}, \code{\link[stats]{lm}}, - \code{\link[robustbase]{lmrob}}, - \code{\link[robustbase]{ltsReg}} -} -\keyword{utilities} - +\name{repCV} +\alias{cvExamples} +\alias{cvLm} +\alias{cvLmrob} +\alias{cvLts} +\alias{repCV} +\alias{repCV.lm} +\alias{repCV.lmrob} +\alias{repCV.lts} 
+\title{Cross-validation for linear models} +\usage{ + repCV(object, ...) + + \method{repCV}{lm} (object, cost = rmspe, K = 5, R = 1, + foldType = c("random", "consecutive", "interleaved"), + folds = NULL, seed = NULL, ...) + + \method{repCV}{lmrob} (object, cost = rtmspe, K = 5, + R = 1, + foldType = c("random", "consecutive", "interleaved"), + folds = NULL, seed = NULL, ...) + + \method{repCV}{lts} (object, cost = rtmspe, K = 5, R = 1, + foldType = c("random", "consecutive", "interleaved"), + folds = NULL, fit = c("reweighted", "raw", "both"), + seed = NULL, ...) + + cvLm(object, cost = rmspe, K = 5, R = 1, + foldType = c("random", "consecutive", "interleaved"), + folds = NULL, seed = NULL, ...) + + cvLmrob(object, cost = rtmspe, K = 5, R = 1, + foldType = c("random", "consecutive", "interleaved"), + folds = NULL, seed = NULL, ...) + + cvLts(object, cost = rtmspe, K = 5, R = 1, + foldType = c("random", "consecutive", "interleaved"), + folds = NULL, fit = c("reweighted", "raw", "both"), + seed = NULL, ...) +} +\arguments{ + \item{object}{an object returned from a model fitting + function. Methods are implemented for objects of class + \code{"lm"} computed with \code{\link[stats]{lm}}, + objects of class \code{"lmrob"} computed with + \code{\link[robustbase]{lmrob}}, and objects of class + \code{"lts"} computed with + \code{\link[robustbase]{ltsReg}}.} + + \item{cost}{a cost function measuring prediction loss. + It should expect the observed values of the response to + be passed as the first argument and the predicted values + as the second argument, and must return either a + non-negative scalar value, or a list with the first + component containing the prediction error and the second + component containing the standard error.
The default is + to use the root mean squared prediction error for the + \code{"lm"} method and the root trimmed mean squared + prediction error for the \code{"lmrob"} and \code{"lts"} + methods (see \code{\link{cost}}).} + + \item{K}{an integer giving the number of groups into + which the data should be split (the default is five). + Keep in mind that this should be chosen such that all + groups are of approximately equal size. Setting \code{K} + equal to \code{n} yields leave-one-out cross-validation.} + + \item{R}{an integer giving the number of replications for + repeated \eqn{K}-fold cross-validation. This is ignored + for leave-one-out cross-validation and other + non-random splits of the data.} + + \item{foldType}{a character string specifying the type of + folds to be generated. Possible values are + \code{"random"} (the default), \code{"consecutive"} or + \code{"interleaved"}.} + + \item{folds}{an object of class \code{"cvFolds"} giving + the folds of the data for cross-validation (as returned + by \code{\link{cvFolds}}). If supplied, this is + preferred over \code{K} and \code{R}.} + + \item{fit}{a character string specifying for which fit to + estimate the prediction error. Possible values are + \code{"reweighted"} (the default) for the prediction + error of the reweighted fit, \code{"raw"} for the + prediction error of the raw fit, or \code{"both"} for the + prediction error of both fits.} + + \item{seed}{optional initial seed for the random number + generator (see \code{\link{.Random.seed}}).} + + \item{\dots}{additional arguments to be passed to the + prediction loss function \code{cost}.} +} +\value{ + An object of class \code{"cv"} with the following + components: + + \item{n}{an integer giving the number of observations.} + + \item{K}{an integer giving the number of folds.} + + \item{R}{an integer giving the number of replications.} + + \item{cv}{a numeric vector containing the estimated + prediction errors.
For the \code{"lm"} and + \code{"lmrob"} methods, this is a single numeric value. + For the \code{"lts"} method, this contains one value for + each of the requested fits. In the case of repeated + cross-validation, those are average values over all + replications.} + + \item{sd}{a numeric vector containing the estimated + standard errors of the prediction loss. For the + \code{"lm"} and \code{"lmrob"} methods, this is a single + numeric value. For the \code{"lts"} method, this + contains one value for each of the requested fits.} + + \item{reps}{a numeric matrix containing the estimated + prediction errors from all replications. For the + \code{"lm"} and \code{"lmrob"} methods, this is a matrix + with one column. For the \code{"lts"} method, this + contains one column for each of the requested fits. + However, this is only returned for repeated + cross-validation.} + + \item{seed}{the seed of the random number generator + before cross-validation was performed.} + + \item{call}{the matched function call.} +} +\description{ + Estimate the prediction error of a linear model via + (repeated) \eqn{K}-fold cross-validation. + Cross-validation functions are available for least + squares fits computed with \code{\link[stats]{lm}} as + well as for the following robust alternatives: MM-type + models computed with \code{\link[robustbase]{lmrob}} and + least trimmed squares fits computed with + \code{\link[robustbase]{ltsReg}}. +} +\details{ + (Repeated) \eqn{K}-fold cross-validation is performed in + the following way. The data are first split into \eqn{K} + previously obtained blocks of approximately equal size. + Each of the \eqn{K} data blocks is left out once to fit + the model, and predictions are computed for the + observations in the left-out block with the + \code{\link[stats]{predict}} method of the fitted model. + Thus a prediction is obtained for each observation. 
+ + The response variable and the obtained predictions for + all observations are then passed to the prediction loss + function \code{cost} to estimate the prediction error. + For repeated cross-validation, this process is replicated + and the estimated prediction errors from all replications + as well as their average are included in the returned + object. +} +\note{ + The \code{repCV} methods are simple wrapper functions + that extract the data from the fitted model and call + \code{\link{cvFit}} to perform cross-validation. In + addition, \code{cvLm}, \code{cvLmrob} and \code{cvLts} + are aliases for the respective methods. +} +\examples{ +library("robustbase") +data("coleman") +set.seed(1234) # set seed for reproducibility + +# set up folds for cross-validation +folds <- cvFolds(nrow(coleman), K = 5, R = 10) + +# perform cross-validation for an LS regression model +fitLm <- lm(Y ~ ., data = coleman) +repCV(fitLm, cost = rtmspe, folds = folds, trim = 0.1) + +# perform cross-validation for an MM regression model +fitLmrob <- lmrob(Y ~ ., data = coleman) +repCV(fitLmrob, cost = rtmspe, folds = folds, trim = 0.1) + +# perform cross-validation for an LTS regression model +fitLts <- ltsReg(Y ~ ., data = coleman) +repCV(fitLts, cost = rtmspe, folds = folds, trim = 0.1) +repCV(fitLts, cost = rtmspe, folds = folds, + fit = "both", trim = 0.1) +} +\author{ + Andreas Alfons +} +\seealso{ + \code{\link{cvFit}}, \code{\link{cvFolds}}, + \code{\link{cost}}, \code{\link[stats]{lm}}, + \code{\link[robustbase]{lmrob}}, + \code{\link[robustbase]{ltsReg}} +} +\keyword{utilities} + diff --git a/man/subset.cv.Rd b/man/subset.cv.Rd index 8567483..d8be0c5 100644 --- a/man/subset.cv.Rd +++ b/man/subset.cv.Rd @@ -3,10 +3,10 @@ \alias{subset.cvSelect} \title{Subsetting cross-validation results} \usage{ - \method{subset}{cv} (x, select = NULL, ...) + \method{subset}{cv} (x, select = NULL, ...) - \method{subset}{cvSelect} (x, subset = NULL, select = - NULL, ...) 
+ \method{subset}{cvSelect} (x, subset = NULL, + select = NULL, ...) } \arguments{ \item{x}{an object inheriting from class \code{"cv"} or @@ -41,14 +41,14 @@ folds <- cvFolds(nrow(coleman), K = 5, R = 10) ## compare raw and reweighted LTS estimators for -## 50% and 75% subsets +## 50\% and 75\% subsets -# 50% subsets +# 50\% subsets fitLts50 <- ltsReg(Y ~ ., data = coleman, alpha = 0.5) cvFitLts50 <- cvLts(fitLts50, cost = rtmspe, folds = folds, fit = "both", trim = 0.1) -# 75% subsets +# 75\% subsets fitLts75 <- ltsReg(Y ~ ., data = coleman, alpha = 0.75) cvFitLts75 <- cvLts(fitLts75, cost = rtmspe, folds = folds, fit = "both", trim = 0.1) @@ -57,7 +57,7 @@ cvFitLts75 <- cvLts(fitLts75, cost = rtmspe, folds = folds, cvFitsLts <- cvSelect("0.5" = cvFitLts50, "0.75" = cvFitLts75) cvFitsLts -# extract reweighted LTS results with 50% subsets +# extract reweighted LTS results with 50\% subsets subset(cvFitLts50, select = "reweighted") subset(cvFitsLts, subset = c(TRUE, FALSE), select = "reweighted") } diff --git a/man/summary.cv.Rd b/man/summary.cv.Rd index 7c1d165..5a0ae13 100644 --- a/man/summary.cv.Rd +++ b/man/summary.cv.Rd @@ -4,11 +4,11 @@ \alias{summary.cvTuning} \title{Summarize cross-validation results} \usage{ - \method{summary}{cv} (object, ...) + \method{summary}{cv} (object, ...) - \method{summary}{cvSelect} (object, ...) + \method{summary}{cvSelect} (object, ...) - \method{summary}{cvTuning} (object, ...) + \method{summary}{cvTuning} (object, ...) 
} \arguments{ \item{object}{an object inheriting from class \code{"cv"} @@ -37,14 +37,14 @@ folds <- cvFolds(nrow(coleman), K = 5, R = 10) ## compare raw and reweighted LTS estimators for -## 50% and 75% subsets +## 50\% and 75\% subsets -# 50% subsets +# 50\% subsets fitLts50 <- ltsReg(Y ~ ., data = coleman, alpha = 0.5) cvFitLts50 <- cvLts(fitLts50, cost = rtmspe, folds = folds, fit = "both", trim = 0.1) -# 75% subsets +# 75\% subsets fitLts75 <- ltsReg(Y ~ ., data = coleman, alpha = 0.75) cvFitLts75 <- cvLts(fitLts75, cost = rtmspe, folds = folds, fit = "both", trim = 0.1) @@ -53,14 +53,14 @@ cvFitLts75 <- cvLts(fitLts75, cost = rtmspe, folds = folds, cvFitsLts <- cvSelect("0.5" = cvFitLts50, "0.75" = cvFitLts75) cvFitsLts -# summary of the results with the 50% subsets +# summary of the results with the 50\% subsets summary(cvFitLts50) # summary of the combined results summary(cvFitsLts) ## evaluate MM regression models tuned for -## 80%, 85%, 90% and 95% efficiency +## 80\%, 85\%, 90\% and 95\% efficiency tuning <- list(tuning.psi=c(3.14, 3.44, 3.88, 4.68)) # set up function call diff --git a/man/xyplot.cvSelect.Rd b/man/xyplot.cv.Rd similarity index 84% rename from man/xyplot.cvSelect.Rd rename to man/xyplot.cv.Rd index d68c6eb..b96587f 100644 --- a/man/xyplot.cvSelect.Rd +++ b/man/xyplot.cv.Rd @@ -1,141 +1,150 @@ -\name{xyplot.cvSelect} -\alias{xyplot.cvSelect} -\alias{xyplot.cvTuning} -\title{X-Y plots of cross-validation results} -\usage{ - \method{xyplot}{cvSelect} (x, data, subset = NULL, select - = NULL, ...) - - \method{xyplot}{cvTuning} (x, data, subset = NULL, select - = NULL, ...) 
-} -\arguments{ - \item{x}{an object inheriting from class - \code{"cvSelect"} that contains cross-validation results - (note that this includes objects of class - \code{"cvTuning"}).} - - \item{data}{currently ignored.} - - \item{subset}{a character, integer or logical vector - indicating the subset of models for which to plot the - cross-validation results.} - - \item{select}{a character, integer or logical vector - indicating the columns of cross-validation results to be - plotted.} - - \item{\dots}{additional arguments to be passed to the - \code{"formula"} method of - \code{\link[lattice:xyplot]{xyplot}}.} -} -\value{ - An object of class \code{"trellis"} is returned - invisibly. The - \code{\link[lattice:update.trellis]{update}} method can - be used to update components of the object and the - \code{\link[lattice:print.trellis]{print}} method - (usually called by default) will plot it on an - appropriate plotting device. -} -\description{ - Plot the (average) results from (repeated) \eqn{K}-fold - cross-validation on the \eqn{y}-axis against the - respective models on the \eqn{x}-axis. -} -\details{ - For objects with multiple columns of repeated - cross-validation results, conditional plots are produced. - - In most situations, the default behavior is to represent - the cross-validation results for each model by a vertical - line segment (i.e., to call the default method of - \code{\link[lattice:xyplot]{xyplot}} with \code{type = - "h"}). However, the behavior is different for objects of - class \code{"cvTuning"} with only one numeric tuning - parameter. In that situation, the cross-validation - results are plotted against the values of the tuning - parameter as a connected line (i.e., by using \code{type - = "b"}). - - The default behavior can of course be overridden by - supplying the \code{type} argument (a full list of - accepted values can be found in the help file of - \code{\link[lattice:panel.xyplot]{panel.xyplot}}). 
-} -\examples{ -library("robustbase") -data("coleman") -set.seed(1234) # set seed for reproducibility - -## set up folds for cross-validation -folds <- cvFolds(nrow(coleman), K = 5, R = 10) - - -## compare LS, MM and LTS regression - -# perform cross-validation for an LS regression model -fitLm <- lm(Y ~ ., data = coleman) -cvFitLm <- cvLm(fitLm, cost = rtmspe, - folds = folds, trim = 0.1) - -# perform cross-validation for an MM regression model -fitLmrob <- lmrob(Y ~ ., data = coleman, k.max = 500) -cvFitLmrob <- cvLmrob(fitLmrob, cost = rtmspe, - folds = folds, trim = 0.1) - -# perform cross-validation for an LTS regression model -fitLts <- ltsReg(Y ~ ., data = coleman) -cvFitLts <- cvLts(fitLts, cost = rtmspe, - folds = folds, trim = 0.1) - -# combine and plot results -cvFits <- cvSelect(LS = cvFitLm, MM = cvFitLmrob, LTS = cvFitLts) -cvFits -xyplot(cvFits) - - -## compare raw and reweighted LTS estimators for -## 50% and 75% subsets - -# 50% subsets -fitLts50 <- ltsReg(Y ~ ., data = coleman, alpha = 0.5) -cvFitLts50 <- cvLts(fitLts50, cost = rtmspe, folds = folds, - fit = "both", trim = 0.1) - -# 75% subsets -fitLts75 <- ltsReg(Y ~ ., data = coleman, alpha = 0.75) -cvFitLts75 <- cvLts(fitLts75, cost = rtmspe, folds = folds, - fit = "both", trim = 0.1) - -# combine and plot results -cvFitsLts <- cvSelect("0.5" = cvFitLts50, "0.75" = cvFitLts75) -cvFitsLts -xyplot(cvFitsLts) - - -## evaluate MM regression models tuned for -## 80%, 85%, 90% and 95% efficiency -tuning <- list(tuning.psi=c(3.14, 3.44, 3.88, 4.68)) - -# perform cross-validation -cvFitsLmrob <- cvTuning(fitLmrob$call, data = coleman, - y = coleman$Y, tuning = tuning, cost = rtmspe, - folds = folds, costArgs = list(trim = 0.1)) -cvFitsLmrob - -# plot results -xyplot(cvFitsLmrob) -} -\author{ - Andreas Alfons -} -\seealso{ - \code{\link{cvFit}}, \code{\link{cvSelect}}, - \code{\link{cvTuning}}, \code{\link[=plot.cv]{plot}}, - \code{\link[=dotplot.cvSelect]{dotplot}}, - \code{\link[=bwplot.cv]{bwplot}}, 
- \code{\link[=densityplot.cv]{densityplot}} -} -\keyword{hplot} - +\name{xyplot.cv} +\alias{xyplot.cv} +\alias{xyplot.cvSelect} +\alias{xyplot.cvTuning} +\title{X-Y plots of cross-validation results} +\usage{ + \method{xyplot}{cv} (x, data, select = NULL, + sdFactor = NA, ...) + + \method{xyplot}{cvSelect} (x, data, subset = NULL, + select = NULL, sdFactor = x$sdFactor, ...) + + \method{xyplot}{cvTuning} (x, data, subset = NULL, + select = NULL, sdFactor = x$sdFactor, ...) +} +\arguments{ + \item{x}{an object inheriting from class + \code{"cvSelect"} that contains cross-validation results + (note that this includes objects of class + \code{"cvTuning"}).} + + \item{data}{currently ignored.} + + \item{subset}{a character, integer or logical vector + indicating the subset of models for which to plot the + cross-validation results.} + + \item{select}{a character, integer or logical vector + indicating the columns of cross-validation results to be + plotted.} + + \item{sdFactor}{a numeric value giving the multiplication + factor of the standard error for displaying error bars. + Error bars can be suppressed by setting this to + \code{NA}.} + + \item{\dots}{additional arguments to be passed to the + \code{"formula"} method of + \code{\link[lattice:xyplot]{xyplot}}.} +} +\value{ + An object of class \code{"trellis"} is returned + invisibly. The + \code{\link[lattice:update.trellis]{update}} method can + be used to update components of the object and the + \code{\link[lattice:print.trellis]{print}} method + (usually called by default) will plot it on an + appropriate plotting device. +} +\description{ + Plot the (average) results from (repeated) \eqn{K}-fold + cross-validation on the \eqn{y}-axis against the + respective models on the \eqn{x}-axis. +} +\details{ + For objects with multiple columns of repeated + cross-validation results, conditional plots are produced. 
+ + In most situations, the default behavior is to represent + the cross-validation results for each model by a vertical + line segment (i.e., to call the default method of + \code{\link[lattice:xyplot]{xyplot}} with \code{type = + "h"}). However, the behavior is different for objects of + class \code{"cvTuning"} with only one numeric tuning + parameter. In that situation, the cross-validation + results are plotted against the values of the tuning + parameter as a connected line (i.e., by using \code{type + = "b"}). + + The default behavior can of course be overridden by + supplying the \code{type} argument (a full list of + accepted values can be found in the help file of + \code{\link[lattice:panel.xyplot]{panel.xyplot}}). +} +\examples{ +library("robustbase") +data("coleman") +set.seed(1234) # set seed for reproducibility + +## set up folds for cross-validation +folds <- cvFolds(nrow(coleman), K = 5, R = 10) + + +## compare LS, MM and LTS regression + +# perform cross-validation for an LS regression model +fitLm <- lm(Y ~ ., data = coleman) +cvFitLm <- cvLm(fitLm, cost = rtmspe, + folds = folds, trim = 0.1) + +# perform cross-validation for an MM regression model +fitLmrob <- lmrob(Y ~ ., data = coleman, k.max = 500) +cvFitLmrob <- cvLmrob(fitLmrob, cost = rtmspe, + folds = folds, trim = 0.1) + +# perform cross-validation for an LTS regression model +fitLts <- ltsReg(Y ~ ., data = coleman) +cvFitLts <- cvLts(fitLts, cost = rtmspe, + folds = folds, trim = 0.1) + +# combine and plot results +cvFits <- cvSelect(LS = cvFitLm, MM = cvFitLmrob, LTS = cvFitLts) +cvFits +xyplot(cvFits) + + +## compare raw and reweighted LTS estimators for +## 50\% and 75\% subsets + +# 50\% subsets +fitLts50 <- ltsReg(Y ~ ., data = coleman, alpha = 0.5) +cvFitLts50 <- cvLts(fitLts50, cost = rtmspe, folds = folds, + fit = "both", trim = 0.1) + +# 75\% subsets +fitLts75 <- ltsReg(Y ~ ., data = coleman, alpha = 0.75) +cvFitLts75 <- cvLts(fitLts75, cost = rtmspe, folds = folds, + fit = 
"both", trim = 0.1) + +# combine and plot results +cvFitsLts <- cvSelect("0.5" = cvFitLts50, "0.75" = cvFitLts75) +cvFitsLts +xyplot(cvFitsLts) + + +## evaluate MM regression models tuned for +## 80\%, 85\%, 90\% and 95\% efficiency +tuning <- list(tuning.psi=c(3.14, 3.44, 3.88, 4.68)) + +# perform cross-validation +cvFitsLmrob <- cvTuning(fitLmrob$call, data = coleman, + y = coleman$Y, tuning = tuning, cost = rtmspe, + folds = folds, costArgs = list(trim = 0.1)) +cvFitsLmrob + +# plot results +xyplot(cvFitsLmrob) +} +\author{ + Andreas Alfons +} +\seealso{ + \code{\link{cvFit}}, \code{\link{cvSelect}}, + \code{\link{cvTuning}}, \code{\link[=plot.cv]{plot}}, + \code{\link[=dotplot.cvSelect]{dotplot}}, + \code{\link[=bwplot.cv]{bwplot}}, + \code{\link[=densityplot.cv]{densityplot}} +} +\keyword{hplot} +