diff --git a/DESCRIPTION b/DESCRIPTION index 7f1daea..4b67f08 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,22 +2,23 @@ Package: vcrpart Type: Package Title: Tree-Based Varying Coefficient Regression for Generalized Linear and Ordinal Mixed Models -Version: 0.2-1 -Date: 2014-09-10 +Version: 0.2-2 +Date: 2014-10-24 Authors@R: c( person("Reto", "Buergin", role = c("aut", "cre", "cph"), email = "rbuergin@gmx.ch"), person("Gilbert", "Ritschard", role = c("ctb", "ths"), email = "gilbert.ritschard@unige.ch")) Maintainer: Reto Buergin -Description: Recursive partitioning algorithm for varying coefficient generalized linear models and ordinal 2-stage linear mixed models. Special features are coefficient-wise partitioning, non-varying coefficients and partitioning of time-varying variables in longitudinal ordinal regression. +Description: Recursive partitioning for varying coefficient generalized linear models and ordinal linear mixed models. Special features are coefficient-wise partitioning, non-varying coefficients and partitioning of time-varying variables in longitudinal regression. License: GPL (>= 2) Depends: R (>= 3.1.0), parallel, partykit Imports: stats, grid, graphics, methods, nlme, rpart, numDeriv, ucminf, zoo, sandwich, strucchange +URL: http://vcrpart.wordpress.com LazyLoad: yes NeedsCompilation: yes -Packaged: 2014-09-09 22:27:25 UTC; rbuergin +Packaged: 2014-11-07 17:28:59 UTC; reto Author: Reto Buergin [aut, cre, cph], Gilbert Ritschard [ctb, ths] Repository: CRAN -Date/Publication: 2014-09-10 00:51:30 +Date/Publication: 2014-11-08 02:13:19 diff --git a/MD5 b/MD5 index 12d888b..3e0d0b9 100644 --- a/MD5 +++ b/MD5 @@ -1,20 +1,21 @@ -1cbd34b3510d90cfb1a6848b44abb82b *DESCRIPTION -9fa1ee1a037664dca6310ec341ced692 *NAMESPACE -c9cf28f3456a5cf3c5d84783d4c6f7bb *NEWS +6d24e99c18c914344785bf01d1b715a6 *DESCRIPTION +dfae833697182c8bb7f3b3572b2413e6 *NAMESPACE +4ad832a72530f1d077cd0296dbcdecb7 *NEWS 8b40e51cf24df0eb99503a7070b84a93 *R/AAA.R -76803808590893cc4202d5c78b2b8ada *R/AllGeneric.R -ed5c8359901f28c78724ea61ac731f3b *R/fvcm.R +b70bc37834d2f3b8c94d9d1c0020c614 *R/AllGeneric.R +91d50efd23854dea04fca307a8c9ce97 *R/fvcm.R 80b84163397dcda539b20d9ab51c56e7 *R/import.R -38166df78924be2eecfa73a0df50500d *R/olmm-methods.R +0801b64fedea58af67c400f8b0e15bd2 *R/olmm-methods.R 7e4d28175b9184bf3a0992401f277072 *R/olmm-utils.R -47d508ee6eda573318986badf2bc07b1 *R/olmm.R +ecdcf046dcbb466c4cf29bc7e9efaf8a *R/olmm.R b6c73716e8ea10810b0ef419e666217c *R/otsplot.R -93374105b3b9b55f260ebe19155d9d8b *R/tvcm-cv.R -5ded623facdc6ee8b637444a5328c3d2 *R/tvcm-methods.R -535373d36b06c8ce69666be8656a398f *R/tvcm-plot.R -556205fc2616156b58239d61f6ae926f *R/tvcm-utils.R -db0d8050384fd821d8f3f31acc56b14c *R/tvcm.R -0ecd155512121e5553df3af06a0a1fc2 *R/utils.R +2af1d39305eedf8fc4128b14307e1317 *R/tvcm-cv.R +32a6d478af156ed579d5909795d5cc98 *R/tvcm-methods.R +d7de761ae5514253233d0540ddd85ec5 *R/tvcm-plot.R +3d45fc591fdde7ffc882b8add1448436 *R/tvcm-utils.R +efa265167aaa866b67a1acd78df5a859 *R/tvcm.R +759f474cb091fa4062bec1eaabfdd8de *R/utils.R +8562fccdfc9dc113b72f8eb8df66bf57 *data/PL.RData a2c9a87cf50549fd5802cbb6a88d281a *data/movie.RData 72890e1f368da6d42d9cf9414c5396e0 *data/poverty.RData ea62be4e5d10df5bc683725a7f7935f6 *data/schizo.RData @@ -23,28 +24,33 @@ ea62be4e5d10df5bc683725a7f7935f6 *data/schizo.RData cb803decf95c39567f648e1e5e0ddb45 *data/vcrpart_2.RData 3ddf5ac69dc37253946b83c5bfd5fca7 *data/vcrpart_3.RData c2c0ec5bc767a17c65b372996eb094bc *inst/CITATION +3bc5b4abcf89156967fbf807a9125ee6 *man/PL.Rd 
02a3675b72ff9f699d45dab0183780b8 *man/fvcm-methods.Rd -1bd0959c572019011ab4c537f9f86919 *man/fvcm.Rd +2ad931e91f7922156d825b32e68df10a *man/fvcm.Rd edb080c605c0dacb43ae037d6199abd4 *man/movie.Rd f66f99d711ddf32948fbf9b39c63b888 *man/olmm-control.Rd -b57f0746e38cad4b16bfe592ec7c60ac *man/olmm-gefp.Rd -6ee7950a48737e3b9ef60d8ba4990989 *man/olmm-methods.Rd +e7cc5a4ab52c84577dd32c4c04219ef1 *man/olmm-gefp.Rd +c0635745a839f49a8ffaa66ffcacdb65 *man/olmm-methods.Rd e58a118afbf3ff05498f9e5bbc73e893 *man/olmm-predict.Rd c914ee5c769aacd190558f0b967f1072 *man/olmm-summary.Rd -fedfc37145b8e3764bef8589ad05496c *man/olmm.Rd +99fd9612846185c8b253368ad62640bc *man/olmm.Rd 913e87ac8d1158061b59644190ef8984 *man/otsplot.Rd -58735e7359b9794d34d5c78eb0c07aa4 *man/poverty.Rd +b5d023f2e25d796fc013667de0bf0c2b *man/poverty.Rd 198b4f1a33689dc49700c02f14e42510 *man/schizo.Rd -061e82789901fc9517c35f3d44daa35c *man/tvcm-control.Rd -2a47a463b75faddfdbaacdd763760b9e *man/tvcm-cv.Rd -33dc39a7f1d534bbd8b26a5a89b05f1e *man/tvcm-methods.Rd -20100f8ce3bbcb0ca761531b0705dee8 *man/tvcm-plot.Rd -1676e4649643b03e5b50cf0b79988d6d *man/tvcm.Rd +830a40f120259f6c1e30cce2f06a224e *man/tvcglm.Rd +fc4d871d97b9315c31c720307d4b481c *man/tvcm-control.Rd +5b0157aea2925e83c7325814864fad14 *man/tvcm-cv.Rd +c2434053aa241bce6f70d969bf385e25 *man/tvcm-methods.Rd +0fa2459d031af80f1b4dbd7d3d1da797 *man/tvcm-plot.Rd +395961700e672d7ba7f7901564e83892 *man/tvcm.Rd +8c644871c09cae7e14ca6ad204a0dee3 *man/tvcolmm.Rd 158d7c7045232f33bd35bf5c39dcedcf *man/vcrpart-demo.Rd a14ef1287c994bf09c716cc21d50517f *man/vcrpart-formula.Rd 1e0af65b11fb043bd18f7982291d076f *src/Makevars -490c20f360ddf1fa209c81bbf51ba1de *src/init.c -04dd4cbf292548702d5590e2475aec35 *src/olmm.c +e0ee3aca34161fb42f8fffa717fc6c3e *src/init.c +62783432fffd5da456a3bb3e61c2be35 *src/olmm.c 1e5f560c59e4ea73b51fa72c057166ec *src/olmm.h -e5dcd67c462566773294a1e5829ef188 *src/utils.c -5d30135987641736d67d08d3c140c86e *src/utils.h +0af72621bd0a3d74afc3a3a0220f92c4 *src/tvcm.c +e8b96835ed09462a3fef4b1f581279cd *src/tvcm.h +79abdf7fd5ae53f7c77a6db8453d3408 *src/utils.c +643b41eb70ee7ca5689f176f7d82b795 *src/utils.h diff --git a/NAMESPACE b/NAMESPACE index a1cca45..cd10bd5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -49,7 +49,9 @@ export( predecor_control, estfun.olmm, gefp.olmm, - tvcm_control, + tvcm_control, + tvcolmm_control, + tvcglm_control, tvcm, tvcolmm, tvcglm, @@ -57,8 +59,13 @@ export( fvcolmm, fvcglm, fvcm_control, + fvcolmm_control, + fvcglm_control, folds_control) +## Exported methods for 'glm' class +S3method(fixef, glm) + ## Exported methods for 'fvcm' class S3method(fitted, fvcm) S3method(predict, fvcm) @@ -109,6 +116,7 @@ S3method(print, otsplot) ## Exported methods for 'tvcm' class S3method(coef, tvcm) S3method(coefficients, tvcm) +S3method(depth, tvcm) S3method(cvloss, tvcm) S3method(plot, cvloss.tvcm) S3method(print, cvloss.tvcm) @@ -135,3 +143,4 @@ S3method(splitpath, tvcm) S3method(print, splitpath.tvcm) S3method(summary, tvcm) S3method(weights, tvcm) +S3method(width, tvcm) diff --git a/NEWS b/NEWS index 1bbc832..af2e11d 100644 --- a/NEWS +++ b/NEWS @@ -1,9 +1,44 @@ +Changes in Version 0.2-2 + + o Added seed option to 'tvcm_control'. + + o The new implementation clearly distinguishes between the two + functions 'tvcolmm' and 'tvcglm' with separate help files. + The general function 'tvcm' is still available. + + o Added convenience functions 'tvcolmm_control' and + 'tvcglm_control'. + + o Improvement for 'panel_coef': Points and lines that extend + beyond the boxes are now suppressed.
+ + o Added variable centering as default for split selection. + + o Redefinition of tuning parameters for 'tvcm'. The main + tuning parameter is now 'cp'. See the help of 'tvcm' and + 'tvcm_control' for details. + + o Added 'nimpute' argument for 'tvcm_control'. + + o Added detail section to the help page of 'tvcm_control'. + + o Removed AIC table from 'print.tvcm' (AIC and BIC do not + seem to be relevant measures for models fitted by 'tvcm'). + + o Added 'PL' data set. + + o Fixed a bug in the numeric estimation of the covariance of + 'olmm' objects. + + o Added 'depth' and 'width' methods. + + Changes in Version 0.2-1 - o First CRAN release. + o First CRAN release. - o 'tvcm' and 'fvcm' allow for multiple 'vc' terms, i.e. - coefficient-specific partitions + o 'tvcm' and 'fvcm' allow for multiple 'vc' terms, i.e. + coefficient-specific partitions o Complete revision of syntaxes, argument names and default parameters. R commands for the former version 0.1-14 are diff --git a/R/AllGeneric.R b/R/AllGeneric.R index 124bf41..7a8d8f5 100644 --- a/R/AllGeneric.R +++ b/R/AllGeneric.R @@ -35,6 +35,8 @@ cvloss <- function(object, ...) UseMethod("cvloss") extract <- function(object, ...) UseMethod("extract") +fixef.glm <- function(object, ...) coef(object) + neglogLik2 <- function(object, ...) UseMethod("neglogLik2") neglogLik2.default <- function(object, ...) diff --git a/R/fvcm.R b/R/fvcm.R index 5c710d2..f7fd477 100644 --- a/R/fvcm.R +++ b/R/fvcm.R @@ -1,7 +1,7 @@ ##' -------------------------------------------------------- # ##' Author: Reto Buergin ##' E-Mail: reto.buergin@unige.ch, rbuergin@gmx.ch -##' Date: 2014-09-07 +##' Date: 2014-10-14 ##' ##' Description: ##' Random forests and bagging for the 'tvcm' algorithm. @@ -22,6 +22,9 @@ ##' - set 'ptry', 'vtry' and 'ntry' automatically (see Hastie) ##' ##' Last modifications: +##' 2014-10-14: found bug in predict.tvcm: now the 'coefi' +##' matrices are ordered by the column names of +##' 'coef'. ##' 2014-09-07: - improvement of predict.tvcm function ##' - treated bugs for 'type = "coef"' ##' - deal with ordinal responses in cases not @@ -33,7 +36,7 @@ ##' 2014-08-05: - changed specification for folds ##' -------------------------------------------------------- # -fvcolmm <- function(..., family = cumulative(), control = fvcm_control()) { +fvcolmm <- function(..., family = cumulative(), control = fvcolmm_control()) { mc <- match.call() mc[[1L]] <- as.name("fvcm") mc$fit <- "olmm" @@ -43,7 +46,17 @@ } -fvcglm <- function(..., family, control = fvcm_control()) { +fvcolmm_control <- function(maxstep = 10, folds = folds_control("subsampling", 5), + ptry = 1, ntry = 1, vtry = 5, alpha = 1.0, ...) { + + mc <- match.call() + mc[[1L]] <- as.name("fvcm_control") + mc$sctest <- TRUE + return(eval.parent(mc)) +} + + +fvcglm <- function(..., family, control = fvcglm_control()) { mc <- match.call() mc[[1L]] <- as.name("fvcm") mc$fit <- "glm" @@ -53,6 +66,15 @@ } +fvcglm_control <- function(maxstep = 10, folds = folds_control("subsampling", 5), + ptry = 1, ntry = 1, vtry = 5, mindev = 0, ...)
{ + + mc <- match.call() + mc[[1L]] <- as.name("fvcm_control") + return(eval.parent(mc)) +} + + fvcm <- function(..., control = fvcm_control()) { mc <- match.call() @@ -110,7 +132,7 @@ fvcm_control <- function(maxstep = 10, folds = folds_control("subsampling", 5), ptry = 1, ntry = 1, vtry = 5, - alpha = 1.0, maxoverstep = Inf, ...) { + alpha = 1.0, mindev = 0.0, ...) { ## modify the 'papply' argument mc <- match.call() @@ -123,7 +145,7 @@ fvcm_control <- function(maxstep = 10, folds = folds_control("subsampling", 5), ## combine the parameters into a list and disable cross validation and pruning call <- list(maxstep = maxstep, folds = folds, ptry = ptry, ntry = ntry, vtry = vtry, - alpha = alpha, maxoverstep = Inf, + alpha = alpha, mindev = mindev, papply = papply, cv = FALSE, prune = FALSE) call <- appendDefArgs(call, list(...)) @@ -346,7 +368,7 @@ predict.fvcm <- function(object, newdata = NULL, coefi <- predict(object, newdata = newdata, type = "coef", ranef = FALSE, na.action = na.pass, ...) if (!is.matrix(coefi)) coefi <- matrix(coefi, nrow = nrow(newdata)) - + ## account for skipped categories if (object$info$fit == "olmm" && ncol(coefi) < ncol(coef)) { subsiCols <- table(mf[folds[, i] > 0, yName]) > 0L @@ -366,6 +388,9 @@ colnames(coefi) <- colnamesi } + ## order columns of coefi + coefi <- coefi[, intersect(colnames(coef), colnames(coefi)), drop = FALSE] + ## index matrix for valid entries subsi <- subs if (oob) subsi[folds[,i] > 0L, ] <- FALSE diff --git a/R/olmm-methods.R b/R/olmm-methods.R index 84b23e3..0f22c9f 100644 --- a/R/olmm-methods.R +++ b/R/olmm-methods.R @@ -1,6 +1,6 @@ ##' -------------------------------------------------------- # ##' Author: Reto Buergin, rbuergin@gmx.ch -##' Date: 2014-09-08 +##' Date: 2014-10-24 ##' ##' Description: ##' methods for olmm objects. @@ -40,6 +40,11 @@ ##' weights: Weights ##' ##' Modifications: +##' 2014-10-24: - improve simulate.olmm +##' - improved 'estfun.olmm' call in 'gefp.olmm' +##' 2014-10-23: - fix bug in predict.olmm +##' 2014-09-22: - (internal) change 'Ninpute' to 'Nimpute' in estfun.olmm +##' 2014-09-20: - use title case in titles ##' 2014-09-08: - partial substitution of 'rep' by 'rep.int' ##' - replace 'do.call' by 'call' in 'resid.olmm' ##' 2013-03-17: changed many methods to S3 methods (as in lme4) @@ -52,6 +57,7 @@ ##' - improve update method ##' - plot methods ##' - estfun.olmm: handle equal zero random effects +##' - anova with a single model ##' -------------------------------------------------------- # anova.olmm <- function(object, ...)
{ @@ -203,9 +209,10 @@ estfun.olmm <- function(x, predecor = FALSE, control = predecor_control(), parm <- seq_along(x$coefficients) # internal variable if (!is.null(nuisance) & is.character(nuisance)) nuisance <- which(names(coef(x)) %in% nuisance) + nuisance <- sort(union(nuisance, which(x$restricted))) parm <- setdiff(parm, nuisance) attr <- list() # default attributes - + scores <- x$score_obs subsImp <- rep.int(FALSE, nrow(scores)) @@ -215,19 +222,19 @@ estfun.olmm <- function(x, predecor = FALSE, control = predecor_control(), if (predecor && any(Ni != Nmax)) { - Ninpute <- Nmax - Ni - subsImp <- c(rep.int(FALSE, x$dims["n"]), rep.int(TRUE, sum(Ninpute))) - sbjImp <- factor(rep.int(names(Ni), Ninpute), names(Ni)) + Nimpute <- Nmax - Ni + subsImp <- c(rep.int(FALSE, x$dims["n"]), rep.int(TRUE, sum(Nimpute))) + sbjImp <- factor(rep.int(names(Ni), Nimpute), names(Ni)) ranef <- ranef(x) ranefImp <- ranef[rownames(ranef) %in% unique(sbjImp),,drop = FALSE] ## get predictors from empirical distribution yName <- all.vars(formula(x))[1L] yLevs <- levels(x$y) - newFrame <- x$frame[rep.int(1L, sum(Ninpute)),,drop=FALSE] - newFrame[, x$subjectName] <- rep.int(names(Ninpute), Ninpute) - newX <- x$X[rep.int(1L, sum(Ninpute)),,drop=FALSE] - newW <- x$W[rep.int(1L, sum(Ninpute)),,drop=FALSE] + newFrame <- x$frame[rep.int(1L, sum(Nimpute)),,drop=FALSE] + newFrame[, x$subjectName] <- rep.int(names(Nimpute), Nimpute) + newX <- x$X[rep.int(1L, sum(Nimpute)),,drop=FALSE] + newW <- x$W[rep.int(1L, sum(Nimpute)),,drop=FALSE] ## add imputations to model x$frame <- rbind(x$frame, newFrame) @@ -239,10 +246,10 @@ estfun.olmm <- function(x, predecor = FALSE, control = predecor_control(), factor(c(as.character(x$subject), newFrame[, x$subjectName]), levels = names(Ni)) x$weights <- x$weights_sbj[as.integer(x$subject)] - x$offset <- rbind(x$offset, matrix(0.0, sum(Ninpute), x$dims["nEta"])) + x$offset <- rbind(x$offset, matrix(0.0, sum(Nimpute), x$dims["nEta"])) x$dims["n"] <- nrow(x$frame) - x$eta <- rbind(x$eta, matrix(0.0, sum(Ninpute), x$dims["nEta"])) - x$score_obs <- rbind(x$score_obs, matrix(0.0, sum(Ninpute), x$dims["nPar"])) + x$eta <- rbind(x$eta, matrix(0.0, sum(Nimpute), x$dims["nEta"])) + x$score_obs <- rbind(x$score_obs, matrix(0.0, sum(Nimpute), x$dims["nPar"])) ## simulate responses if (control$impute) { @@ -250,9 +257,10 @@ estfun.olmm <- function(x, predecor = FALSE, control = predecor_control(), if (control$verbose) cat("\n* impute scores ... 
") ## set seed + if (!is.null(control$seed)) set.seed(control$seed) ## impute predictors - times <- Ninpute[x$subject[!subsImp]] + times <- Nimpute[x$subject[!subsImp]] rows <- unlist(tapply(1:sum(Ni), x$subject[!subsImp], function(x) sample(x, times[x[1L]], replace = TRUE))) x$frame[subsImp,] <- x$frame[rows,,drop=FALSE] x$X[subsImp, ] <- x$X[rows,,drop=FALSE] @@ -268,7 +276,6 @@ estfun.olmm <- function(x, predecor = FALSE, control = predecor_control(), ranef[as.integer(x$subject[subsImp])]) %*% tmatW eta <- etaFixef + etaRanef probs <- x$family$linkinv(eta) - if (!is.null(control$seed)) set.seed(control$seed) x$y[subsImp] <- # simulate responses ordered(apply(probs, 1L, function(x) sample(yLevs, 1L, prob = x)), yLevs) @@ -370,9 +377,13 @@ gefp.olmm <- function(object, scores = NULL, order.by = NULL, subset = NULL, ## extract scores (if scores is not a matrix) if (is.null(scores)) { - estfunCall <- call(name = "estfun.olmm", x = quote(object), predecor = predecor) - dotargs <- list(...)[names(formals(estfun.olmm))] - for (arg in names(dotargs)) estfunCall[arg] <- dotargs[[arg]] + estfunCall <- list(name = as.name("estfun.olmm"), + x = quote(object), + predecor = quote(predecor)) + dotargs <- list(...) + dotargs <- dotargs[intersect(names(formals(estfun.olmm)), names(dotargs))] + estfunCall[names(dotargs)] <- dotargs + mode(estfunCall) <- "call" scores <- try(eval(estfunCall)) } else if (is.function(scores)) { scores <- scores(object) @@ -464,7 +475,7 @@ gefp.olmm <- function(object, scores = NULL, order.by = NULL, subset = NULL, lim.process = "Brownian bridge", type.name = "M-fluctuation test", order.name = deparse(substitute(order.by)), - subset <- rownames(model.frame(object))[subset & subsScores], + subset = rownames(model.frame(object))[subset & subsScores], J12 = NULL) class(rval) <- "gefp" return(rval) @@ -525,7 +536,7 @@ predict.olmm <- function(object, newdata = NULL, if (type == "ranef") return(ranef(object, ...)) if (type == "prob") type <- "response" - formList <- vcrpart_formula(formula(object)) # extract formulas + formList <- vcrpart_formula(formula(object), object$family) # extract formulas offset <- list(...)$offset subset <- list(...)$subset dims <- object$dims @@ -563,13 +574,17 @@ predict.olmm <- function(object, newdata = NULL, ## fixed effects only getTerms <- function(x) attr(terms(x, keep.order = TRUE), "term.labels") terms <- lapply(formList$fe$eta, getTerms) - mfForm <- formula(paste("~", paste(unlist(terms), collapse = "+"))) + if (length(unlist(terms)) > 0L) { + mfForm <- formula(paste("~", paste(unlist(terms), collapse = "+"))) + } else { + mfForm <- ~ 1 + } } mf <- model.frame(object) Terms <- delete.response(terms(mfForm)) xlevels <- .getXlevels(attr(mf, "terms"), mf) xlevels <- xlevels[names(xlevels) %in% all.vars(Terms)] - + xlevels <- xlevels[names(xlevels) != object$subjectName] newdata <- as.data.frame(model.frame(Terms, newdata, na.action = na.action, xlev = xlevels)) @@ -607,7 +622,7 @@ predict.olmm <- function(object, newdata = NULL, rownames(W) <- rownames(newdata) ## check entered random effects - if (any(dim(ranef) != c(nlevels(subject), ncol(W)))) + if (any(dim(ranef) != c(nlevels(subject), nrow(object$ranefCholFac)))) stop("'ranef' matrix has wrong dimensions") if (any(!levels(subject) %in% rownames(ranef))) { @@ -806,6 +821,23 @@ simulate.olmm <- function(object, nsim = 1, seed = NULL, if (!exists(".Random.seed", envir = .GlobalEnv)) runif(1) RNGstate <- .Random.seed dotArgs$type <- "response" + if (is.logical(ranef) && ranef) { + if 
(object$subjectName %in% colnames(newdata)) { + subject <- droplevels(newdata[, object$subjectName]) + ranef <- ranef(object) + if (any(!levels(subject) %in% rownames(ranef))) { + ranef <- matrix(rnorm(nlevels(subject) * ncol(ranef)), + nrow = nlevels(subject), ncol = ncol(ranef), + dimnames = list(levels(subject), colnames(ranef))) + ranef <- ranef %*% t(object$ranefCholFac) + } else { + ranef <- ranef[rownames(ranef) %in% levels(subject),,drop = FALSE] + } + } else { + stop(paste("'newdata' must contain a column '", + object$subjectName, "'", sep = "")) + } + } pred <- predict(object, newdata = newdata, type = "prob", ranef = ranef, ...) FUN <- function(x) sample(levels(object$y), 1, prob = x) rval <- as.data.frame(replicate(nsim, apply(pred, 1L, FUN))) @@ -884,14 +916,14 @@ summary.olmm <- function(object, etalab = c("int", "char", "eta"), VarCorr <- matrix(, 0L, 3L, dimnames = list(c(), c("Variance", "StdDev", ""))) } - + ## title - methTitle <- "Ordinal linear" - if (dims["hasRanef"] > 0L) methTitle <- paste(methTitle, "mixed") - methTitle <- paste(methTitle, "model") + methTitle <- "Ordinal Linear" + if (dims["hasRanef"] > 0L) methTitle <- paste(methTitle, "Mixed") + methTitle <- paste(methTitle, "Model") if (dims["hasRanef"] > 0L) - paste(methTitle, " fit by marginal maximum\n", - "likelihood with Gauss-Hermite quadrature", sep = "") + methTitle <- paste(methTitle, " fit by Marginal Maximum\n", + "Likelihood with Gauss-Hermite Quadrature", sep = "") na.action <- naprint(attr(model.frame(object), "na.action")) na.action <- if (na.action == "") character() else paste("(", na.action, ")", sep = "") diff --git a/R/olmm.R b/R/olmm.R index e4ca180..13b502b 100644 --- a/R/olmm.R +++ b/R/olmm.R @@ -1,6 +1,6 @@ ##' -------------------------------------------------------- # ##' Author: Reto Buergin, rbuergin@gmx.ch -##' Date: 2014-09-08 +##' Date: 2014-09-25 ##' ##' References: ##' ordinal: http://cran.r-project.org/web/packages/ordinal/index.html @@ -13,6 +13,11 @@ ##' ##' ##' Modifications: +##' 2014-09-25: - removed bug for numeric estimation of covariance of +##' 'olmm' objects +##' - define 'score_sbj' and 'score_obs' slot even if +##' 'numGrad = FALSE' (otherwise olmm_update_marg gives error) +##' 2014-09-19: allow 'family' to be of class 'function' ##' 2014-09-08: partial substitution of 'rep' by 'rep.int' ##' 2014-06-17: convert routine to S3 class ##' 2014-05-03: moved several control parameters to 'control' argument @@ -21,9 +26,9 @@ ##' 2014-04-22: implement change in 'form' object ##' 2013-09-15: Free() commands were added in olmm.c ##' 2013-09-07: C implementation for updating the marginal Likelihood -##' and predicting random-effects was stabilized by -##' replacing many alloca's by the R built-in function -##' Calloc, which may slow the estimation +##' and predicting random-effects was stabilized by +##' replacing many alloca's by the R built-in function +##' Calloc, which may slow the estimation ##' 2013-07-27: change 'start' handling and add 'restricted' ##' argument ##' 2013-07-19: correct use of numGrad argument (from now the slots @@ -106,6 +111,18 @@ olmm_control <- function(fit = c("nlminb", "ucminf", "optim"), doFit = TRUE, numGrad = FALSE, numHess = numGrad, nGHQ = 7L, start = NULL, restricted = NULL, verbose = FALSE, ...) 
{ fit <- match.arg(fit) + stopifnot(is.logical(doFit) && length(doFit) == 1) + stopifnot(is.logical(numGrad) && length(numGrad) == 1) + stopifnot(is.logical(numHess) && length(numHess) == 1) + if (!numHess & numGrad) + stop("'numHess' must be TRUE if 'numGrad' is TRUE") + stopifnot(is.numeric(nGHQ) && length(nGHQ) == 1) + if (nGHQ != round(nGHQ)) + warning(paste("'nGHQ' is set to ", round(nGHQ), ".", sep = "")) + nGHQ <- as.integer(round(nGHQ)) + stopifnot(is.null(start) | is.numeric(start)) + stopifnot(is.null(restricted) | is.character(restricted)) + stopifnot(is.logical(verbose) && length(verbose) == 1) rval <- append(list(fit = fit, doFit = doFit, numGrad = numGrad, @@ -137,13 +154,17 @@ olmm <- function(formula, data, family = cumulative(), stopifnot(inherits(control, "olmm_control")) ## link and family - stopifnot(class(family) == "family.olmm") + if (is.character(family)) { + family <- get(family, mode = "function", envir = parent.frame()) + } else if (is.function(family)) { + family <- family() + } + if (!inherits(family, "family.olmm")) { + print(family) + stop("'family' not recognized") + } linkNum <- switch(family$link, logit = 1L, probit = 2L, cauchy = 3L) famNum <- switch(family$family, cumulative = 1L, baseline = 2L, adjacent = 3L) - - ## numerical methods for score and hessian matrix - if (!control$numHess & control$numGrad) - stop("'numHess' must be 'TRUE' if 'numGrad' is 'TRUE'") ## evaluate contrasts con <- lapply(1:ncol(data), function(i) attr(data[, i], "contrasts")) @@ -152,14 +173,9 @@ if (missing(contrasts)) contrasts <- NULL contrasts <- appendDefArgs(contrasts, con) - ## number of Gauss-Hermite quadrature points - if (control$nGHQ != as.integer(round(control$nGHQ))) - warning(paste("'nGHQ' has been set to ", control$nGHQ, ".", sep = "")) - control$nGHQ <- as.integer(round(control$nGHQ)) - ## optimizer control option optim <- olmm_optim_setup(x = control, env = environment()) - control$numGrad <- control$numHess <- is.null(optim$gr) + ## control$numGrad <- control$numHess <- is.null(optim$gr) ## set environment env <- if (!is.null(list(...)$env)) list(...)$env else parent.frame(n = 1L) @@ -325,16 +341,11 @@ olmm <- function(formula, data, family = cumulative(), ll <- c(0.0) ## score function - if (!control$numGrad) { - score_obs <- matrix(0, dims["n"], dims["nPar"]) - rownames(score_obs) <- rownames(X) - colnames(score_obs) <- unlist(parNames) - score_sbj <- matrix(0, dims["N"], dims["nPar"], - dimnames = list(levels(subject), unlist(parNames))) - } else { - score_obs <- matrix(, 0L, 0L) - score_sbj <- matrix(, 0L, 0L) - } + score_obs <- matrix(0, dims["n"], dims["nPar"]) + rownames(score_obs) <- rownames(X) + colnames(score_obs) <- unlist(parNames) + score_sbj <- matrix(0, dims["N"], dims["nPar"], + dimnames = list(levels(subject), unlist(parNames))) score <- rep.int(0, dims["nPar"]) names(score) <- unlist(parNames) @@ -485,8 +496,9 @@ olmm <- function(formula, data, family = cumulative(), object$info[] <- # replace the info slot - hessian(func = object$optim[[2L]], x = object$coefficients, - method.args = list(r = 2, show.details = TRUE), - restricted = object$restricted) + method.args = list(func = + if (dims["numGrad"]) object$optim[[3L]] else NULL), + restricted = rep.int(FALSE, dims["nPar"])) if (dims["verb"] > 0L) cat("OK") }
--------------------------------------------------------- # - oobloss.tvcm <- function(object, newdata = NULL, weights = NULL, fun = NULL, ...) { @@ -190,38 +189,36 @@ tvcm_folds <- function(object, control) { } -cvloss.tvcm <- function(object, folds = folds_control(), - fun = NULL, dfpar = NULL, - direction = c("backward", "forward"), - papply = mclapply, verbose = FALSE, ...) { +cvloss.tvcm <- function(object, folds = folds_control(), ...) { mc <- match.call() + + control <- extract(object, "control") stopifnot(inherits(folds, "folds")) + control$folds <- folds - if (is.null(dfpar)) dfpar <- object$info$control$dfpar - dfsplit <- object$info$control$dfsplit + ## disable verbose output (nothing is printed when evaluating the validation sets) + object$info$control$verbose <- FALSE + + ## deactivate parallelization in single evaluations + object$info$control$papply <- "lapply" + object$info$control$papply.args <- list() - stopifnot(is.numeric(dfpar) && length(dfpar) == 1L) + ## (hidden) type of evaluation ('loss', 'forest' etc.) type <- list(...)$type if (is.null(type)) type <- "loss" + + ## (hidden) whether the original sample should be evaluated original <- list(...)$original if (is.null(original)) original <- FALSE - direction <- match.arg(direction) + + ## get folds foldsMat <- tvcm_folds(object, folds) - stopifnot(is.character(papply) | is.function(papply)) - if (is.function(papply)) { - if ("papply" %in% names(mc)) { - papply <- deparse(mc$papply) - } else { - papply <- deparse(formals(tvcm_control)$papply) - } - } - papplyArgs <- list(...)[names(list(...)) %in% names(formals(papply))] - keeploss <- list(...)$keeploss - if (is.null(keeploss)) keeploss <- object$info$control$keeploss - control <- object$info$control - control$verbose <- FALSE + ## get initial complexity penalty + cp <- control$cp + + ## weights and model frame weights <- weights(object$info$model) mf <- model.frame(object) @@ -229,11 +226,12 @@ cvFun <- function(i) { + ## the return value of 'cvFun' cv <- vector(mode = "list", length = switch(type, loss = 2L, forest = 3L)) - if (verbose) { - if (papply == "lapply" && i > 1L) cat("\n") - if (papply != "lapply") cat("[", i, "]") else cat("* fold", i, "...") + if (control$verbose) { + if (control$papply == "lapply" && i > 1L) cat("\n") + if (control$papply != "lapply") cat("[", i, "]") else cat("* fold", i, "...") } if (i > 0L) { @@ -250,8 +248,8 @@ ibWeights <- weights[ibSubs] oobWeights <- weights[oobSubs] } - object$info$control <- control } else { + object$info$control <- control ibSubs <- NULL ibWeights <- NULL } @@ -271,20 +269,19 @@ run <- 1L while (run > 0L) { - ibTree <- try(prune(ibTree, dfsplit, dfpar, direction, - papply = "lapply", keeploss = keeploss), TRUE) + ibTree <- try(prune(tree = ibTree, cp = cp), TRUE) if (!inherits(ibTree, "try-error")) { ## save the out-of-bag loss and the current tuning parameter oobLoss <- oobloss(ibTree, newdata = mf[oobSubs,,drop = FALSE], - weights = oobWeights, fun = fun) + weights = oobWeights, fun = control$ooblossfun) cv[[1L]] <- - cbind(cv[[1L]], c(dfsplit)) + cbind(cv[[1L]], cp) cv[[2L]] <- - cbind(cv[[2L]], c(control$lossfun(ibTree), oobLoss)) + cbind(cv[[2L]], oobLoss) ## set a new and stronger tuning parameter tab <- ibTree$info$prunepath[[length(ibTree$info$prunepath)]]$tab - if (nrow(tab) > 1L) dfsplit <- min(tab$dfsplit[-1L]) + if (nrow(tab) > 1L) cp
<- min(tab[, "dev"][-1L]) if (nrow(tab) == 1L) run <- 0L } else { @@ -295,7 +292,7 @@ cvloss.tvcm <- function(object, folds = folds_control(), } } else if (type == "forest") { - if (verbose && papply == "lapply") cat("...") + if (control$verbose && control$papply == "lapply") cat("...") ibTreePr <- ibTree cv[[1]] <- ibTree$info$node cv[[2]] <- coef(extract(ibTree, "model")) @@ -307,8 +304,8 @@ cvloss.tvcm <- function(object, folds = folds_control(), cv <- NULL } - if (verbose) { - if (papply != "lapply") { + if (control$verbose) { + if (control$papply != "lapply") { if (is.null(cv)) cat("failed") } else { if (is.null(cv)) cat("failed") else cat(" OK") @@ -318,10 +315,10 @@ cvloss.tvcm <- function(object, folds = folds_control(), return(cv) } - call <- list(name = as.name(papply), - X = quote(seq(ifelse(original, 0, 1), ncol(foldsMat))), + call <- list(name = as.name(control$papply), + X = quote(seq(ifelse(original, 0L, 1L), ncol(foldsMat))), FUN = quote(cvFun)) - call[names(papplyArgs)] <- papplyArgs + call[names(control$papply.args)] <- control$papply.args mode(call) <- "call" cv <- eval(call) @@ -330,7 +327,9 @@ cvloss.tvcm <- function(object, folds = folds_control(), ## extract tree on all data if (original) { tree <- cv[[1L]] + tree$info$control <- control cv <- cv[2:length(cv)] + if (is.null(tree)) stop("partitioning failed.") } else { tree <- NULL } @@ -358,38 +357,17 @@ cvloss.tvcm <- function(object, folds = folds_control(), ## compute results grid <- sort(unique(c(unlist(lapply(cv, function(x) x[[1]][1, ]))))) rval <- list(grid = grid) - - ## column names - if (length(grid) > 1L) { - cn <- c(paste("<=", round(grid[2L], 2)), paste(">", round(grid[-1L], 2))) - } else { - cn <- "0" - } ## make a matrix with the 'loss' for each fold - ## at each dfsplit in 'grid' + ## at each cp in 'grid' cv <- lapply(cv, function(x) { x[[2]][is.nan(x[[2]]) | is.infinite(x[[2]])] <- NA; return(x) }) - ## ib-loss - ibLoss <- t(sapply(cv, getVals, grid = grid, rowsG = 1L, rowsL = 1L)) - if (length(grid) == 1L) ibLoss <- t(ibLoss) - - if (attr(foldsMat, "value") == "weights") { - ibWeights <- foldsMat - } else { - ibWeights <- matrix(rep(weights, ncol(foldsMat)), ncol = ncol(foldsMat)) - ibWeights[foldsMat <= 0] <- 0 - } - ibLoss <- ibLoss / colSums(ibWeights)[!fails] - rownames(ibLoss) <- paste("fold", 1L:length(cv)) - colnames(ibLoss) <- cn - - ## oob-loss - oobLoss <- t(sapply(cv, getVals, grid = grid, rowsG = 1L, rowsL = 2L)) + ## oob-loss (computes the average loss) + oobLoss <- t(sapply(cv, getVals, grid = grid, rowsG = 1L, rowsL = 1L)) if (length(grid) == 1L) oobLoss <- t(oobLoss) oobWeights <- matrix(rep(weights, ncol(foldsMat)), ncol = ncol(foldsMat)) @@ -400,23 +378,27 @@ cvloss.tvcm <- function(object, folds = folds_control(), } oobLoss <- oobLoss / colSums(oobWeights)[!fails] rownames(oobLoss) <- paste("fold", 1L:length(cv)) - colnames(oobLoss) <- cn + colnames(oobLoss) <- if (length(grid) > 1L) { + cn <- c(paste("<=", round(grid[2L], 2)), paste(">", round(grid[-1L], 2))) + } else { + cn <- "0" + } - rval <- append(rval, list(ibloss = ibLoss, oobloss = oobLoss)) + rval <- append(rval, list(oobloss = oobLoss)) meanLoss <- colMeans(rval$oobloss, na.rm = TRUE) - ## dfsplit with minimal loss + ## cp with minimal loss minSubs <- which(meanLoss == min(meanLoss)) if (length(minSubs) > 1L) minSubs <- max(minSubs) - rval$dfsplit.hat <- max(0, mean(c(rval$grid, Inf)[minSubs:(minSubs + 1)])) + rval$cp.hat <- max(0, mean(c(rval$grid, Inf)[minSubs:(minSubs + 1)])) rval$foldsMat <- foldsMat class(rval) <- 
"cvloss.tvcm" } - rval$direction <- direction - rval$call <- deparseCall(getCall(object)) - + rval$call <- getCall(object) + environment(rval$call) <- NULL + if (original) { tree$info$cv <- rval rval <- tree @@ -437,36 +419,35 @@ cvloss.tvcm <- function(object, folds = folds_control(), rval$folds <- foldsMat } - if (verbose) cat("\n") + if (control$verbose) cat("\n") return(rval) } print.cvloss.tvcm <- function(x, ...) { - cat(ifelse(x$direction == "backward", "Backwards", "Forward"), - "cross-validated average loss", "\n\n") - cat("Call: ", x$call, "\n\n") + cat("Cross-Validated Loss", "\n\n") + cat("Call: ", deparseCall(x$call), "\n\n") rval <- colMeans(x$oobloss, na.rm = TRUE) if (length(rval) > 10L) rval <- rval[seq(1L, length(rval), length.out = 10)] print(rval) cat("\n") - cat("dfsplit with minimal loss:", format(x$dfsplit.hat, digits = 3), "\n") + cat("cp with minimal oob-loss:", format(x$cp.hat, digits = 3), "\n") return(invisible(x)) } plot.cvloss.tvcm <- function(x, legend = TRUE, details = TRUE, ...) { - xlab <- "dfsplit" - ylab <- "average loss(dfsplit)" + xlab <- "cp" + ylab <- "average loss(cp)" type <- "s" lpos <- "topleft" - lsubs <- if (details) 1L:3L else 1L - if (x$dfsplit.hat < Inf) lsubs <- c(lsubs, 4L) - lcol <- c("black", "grey80", "black", "black") - llty <- c(1, 1, 3, 2) + lsubs <- if (details) 1L:2L else 1L + if (x$cp.hat < Inf) lsubs <- c(lsubs, 4L) + lcol <- c("black", "grey80", "black") + llty <- c(1, 1, 2) dotList <- list(...) defArgs <- list(type = type, xlab = xlab, ylab = ylab, @@ -477,14 +458,15 @@ plot.cvloss.tvcm <- function(x, legend = TRUE, details = TRUE, ...) { yy2 <- t(cbind(x$oobloss, x$oobloss[,ncol(x$oobloss)])) yy1 <- rowMeans(yy2, na.rm = TRUE) - yy3 <- rowMeans(t(cbind(x$ibloss, x$ibloss[,ncol(x$ibloss)])), na.rm = TRUE) - defArgs$ylim <- range(if (details) c(yy2, yy3) else yy1, na.rm = TRUE) + defArgs$ylim <- range(if (details) yy2 else yy1, na.rm = TRUE) ## set plot arguments call <- list(name = as.name("plot"), x = quote(xx[, 1]), y = quote(yy1)) call <- appendDefArgs(call, list(...)) call <- appendDefArgs(call, defArgs) + type <- call$type + call$type <- "n" llty[1L] <- call$lty lcol[1L] <- call$col mode(call) <- "call" @@ -493,24 +475,26 @@ plot.cvloss.tvcm <- function(x, legend = TRUE, details = TRUE, ...) 
{ eval(call) ## plot details - if (details) { + if (details) matplot(x = xx, yy2, type = type, lty = llty[2L], col = lcol[2L], add = TRUE) - points(x = xx[,1], yy3, type = type, lty = llty[3L], col = lcol[3L]) - } + + ## plot average oob-loss as last + call$name <- as.name("points") + call$type <- type + eval(call) if (legend) { - ltext <- c("average oob estimate", - "foldwise oob estimates", - "in-sample estimate", - "dfsplit with minimal oob-loss") + ltext <- c("average oob-loss", + "foldwise oob-loss", + "cp with minimal oob-loss") legend(lpos, ltext[lsubs], col = lcol[lsubs], lty = llty[lsubs]) } - if (x$dfsplit.hat < Inf) { - subsMin <- max(which(x$grid <= x$dfsplit.hat)) + if (x$cp.hat < Inf) { + subsMin <- max(which(x$grid <= x$cp.hat)) minLoss <- colMeans(x$oobloss, na.rm = TRUE)[subsMin] - segments(x$dfsplit.hat, par()$usr[3], x$dfsplit.hat, minLoss, lty = 2) - axis(1, x$dfsplit.hat, format(x$dfsplit.hat, digits = 3), + segments(x$cp.hat, par()$usr[3], x$cp.hat, minLoss, lty = 2) + axis(1, x$cp.hat, format(x$cp.hat, digits = 3), line = 1, tick = FALSE) } } diff --git a/R/tvcm-methods.R b/R/tvcm-methods.R index 2546801..79eef14 100644 --- a/R/tvcm-methods.R +++ b/R/tvcm-methods.R @@ -1,7 +1,7 @@ ##' -------------------------------------------------------- # ##' Author: Reto Buergin +##' Date: 2014-10-18 ##' E-Mail: reto.buergin@unige.ch, rbuergin@gmx.ch -##' Date: 2014-09-08 ##' ##' Description: ##' S3 methods for tvcm objects @@ -12,6 +12,7 @@ ##' ##' Methods: ##' coef, coefficients: +##' depth: depth of trees ##' extract: ##' fitted: ##' formula: @@ -27,12 +28,21 @@ ##' resid, residuals: extract residuals ##' splitpath: show information of the splitting procedure ##' weights: extract the weights -##' +##' width: width of trees +##' ##' Modifications: +##' 2014-10-18: added 'depth' and 'width' methods +##' 2014-10-14: adapt print.splitpath to new dev-grid structure +##' 2014-10-03: add option 'cv' to 'extract.tvcm' +##' 2014-09-17: prune.tvcm: +##' - 'keepdev' argument in 'prune.tvcm' dropped (too complicated +##' to explain) +##' - add 'control' argument +##' 2014-09-15: 'tab', 'ntab' and 'otab' are now matrices ##' 2014-09-02: added 'prunepath.tvcm' and 'print.prunepath.tvcm' functions ##' 2014-09-02: various modifications in 'prune.tvcm' for accelerations: ##' - 'do.call' was replaced by 'eval' -##' - new option 'keeploss' (reuses information of the previous +##' - new option 'keepdev' (reuses information of the previous ##' pruning step) ##' - new option 'papply' (even though the acceleration is not ##' as efficient as expected) @@ -46,14 +56,20 @@ coef.tvcm <- function(object, ...) tvcm_get_estimates(object, ...) coefficients.tvcm <- coef.tvcm +depth.tvcm <- function(x, root = FALSE, ...) { + rval <- sapply(x$info$node, depth, root = root) + names(rval) <- LETTERS[seq_along(rval)] + return(rval) +} + + extract.tvcm <- function(object, what = c("control", "model", "nodes", "sctest", "p.value", - "lossgrid", "selected", + "devgrid", "cv", "selected", "coef", "sd", "var"), steps = NULL, ...)
{ what <- match.arg(what) - splitpath <- object$info$splitpath if (length(splitpath) > 0 && is.null(steps)) steps <- seq(1L, object$info$nstep) @@ -69,10 +85,17 @@ extract.tvcm <- function(object, what = c("control", "model", rval <- lapply(splitpath[steps], function(x) x$sctest) return(rval) - } else if (what == "lossgrid" && !is.null(splitpath)) { + } else if (what == "devgrid" && !is.null(splitpath)) { - rval <- lapply(splitpath[steps], function(x) x$lossgrid) + rval <- lapply(splitpath[steps], function(x) x$grid) return(rval) + + } else if (what == "cv") { + if (is.null(object$info$cv)) { + warning("no information on cross-validation") + } else { + return(object$info$cv) + } } else if (what == "model") { @@ -83,13 +106,13 @@ extract.tvcm <- function(object, what = c("control", "model", rval <- lapply(object$info$node, function(node) { - if (depth(node) > 0L) { - ids <- setdiff(nodeids(node), nodeids(node, terminal = TRUE)) - rval <- unlist(nodeapply(node, ids, function(node) node$split$varid)) - if (length(rval) > 0L) rval <- unique(colnames(object$data)[rval]) - } else { - rval <- NULL - } + if (depth(node) > 0L) { + ids <- setdiff(nodeids(node), nodeids(node, terminal = TRUE)) + rval <- unlist(nodeapply(node, ids, function(node) node$split$varid)) + if (length(rval) > 0L) rval <- unique(colnames(object$data)[rval]) + } else { + rval <- NULL + } return(rval) }) return(rval) @@ -263,19 +286,9 @@ tvcm_print <- function(x, type = c("print", "summary"), cat(paste(" Tests: alpha = ", format(x$info$control$alpha, ...), if (x$info$control$bonferroni) ", nodewise Bonferroni corrected\n", sep = "")) - - if (type == "summary") { - cat("\nGoodness of fit:\n") - lLik <- logLik(x) - AICtab <- data.frame(AIC = AIC(lLik), - BIC = BIC(lLik), - logLik = as.vector(lLik), - row.names = "") - print(AICtab, ...) - } if (length(coef$re) > 0L) { - cat("\nRandom effects:\n") + cat("\nRandom Effects:\n") VarCorr <- VarCorr(extract(x, "model")) if (x$info$fit == "olmm") { VarCorr <- olmm_rename(VarCorr, yLevs, x$info$family, etalab) @@ -286,7 +299,7 @@ tvcm_print <- function(x, type = c("print", "summary"), } if (length(coef$fe) > 0L) { - cat("\nFixed effects:\n") + cat("\nFixed Effects:\n") if (type == "print") { coefMat <- matrix(coef$fe, 1) colnames(coefMat) <- names(coef$fe) @@ -326,10 +339,11 @@ tvcm_print <- function(x, type = c("print", "summary"), class(terminal_panel) <- "grapcon_generator" - vcLabs <- tvcm_print_vclabs(x) + vcLabs <- tvcm_print_vclabs(x$info$formula) for (pid in seq_along(coef$vc)) { - cat(paste("\nVarying coefficient: ", vcLabs[pid], "\n", sep = "")) + cat(paste("\nVarying Coefficient ", LETTERS[pid], + ": ", vcLabs[pid], "\n", sep = "")) x$node <- x$info$node[[pid]] print.party(x, terminal_panel = terminal_panel, tp_args = list(partid = pid)) @@ -355,32 +369,13 @@ print.tvcm <- function(x, ...) tvcm_print(x, type = "print", ...) -prune.tvcm <- function(tree, dfsplit = NULL, dfpar = NULL, - direction = c("backward", "forward"), - alpha = NULL, maxstep = NULL, terminal = NULL, - papply = mclapply, keeploss = FALSE, original = FALSE, - ...) { +prune.tvcm <- function(tree, cp = NULL, alpha = NULL, maxstep = NULL, + terminal = NULL, original = FALSE, ...) 
{ mc <- match.call() - - ## checking arguments - direction <- match.arg(direction) - if (is.null(dfpar)) dfpar <- tree$info$control$dfpar - stopifnot(is.numeric(dfpar) && length(dfpar) == 1L) - stopifnot(is.character(papply) | is.function(papply)) - if (is.function(papply)) { - if ("papply" %in% names(mc)) { - papply <- deparse(mc$papply) - } else { - papply <- deparse(formals(prune.tvcm)$papply) - } - } - papplyArgs <- list(...)[names(list(...)) %in% names(formals(papply))] - stopifnot((is.logical(keeploss) | is.numeric(keeploss)) && - length(keeploss) == 1L) - keeploss <- as.numeric(keeploss) - - tunepar <- c(!is.null(dfsplit), + + ## checking arguments + tunepar <- c(!is.null(cp), !is.null(alpha), !is.null(maxstep), !is.null(terminal)) @@ -388,10 +383,10 @@ prune.tvcm <- function(tree, dfsplit = NULL, dfpar = NULL, if (sum(tunepar) < 1L) stop("no tuning parameter specified.") if (sum(tunepar) > 1L) stop("only one tuning parameter is allowed.") - tunepar <- c("dfsplit", "alpha", "maxstep", "terminal")[tunepar] + tunepar <- c("cp", "alpha", "maxstep", "terminal")[tunepar] - if (tunepar == "dfsplit") { - stopifnot(is.numeric(dfsplit) && length(dfsplit) == 1L) + if (tunepar == "cp") { + stopifnot(is.numeric(cp) && length(cp) == 1L) } else if (tunepar == "alpha") { if (!tree$info$control$sctest) { @@ -426,252 +421,209 @@ prune.tvcm <- function(tree, dfsplit = NULL, dfpar = NULL, refit <- function(tree) { - ## extract new formula - env <- environment() - formList <- tree$info$formula - vcRoot <- sapply(tree$info$node, width) == 1L - ff <- tvcm_formula(formList, vcRoot, tree$info$family, env) - - ## overwrite node predictors - data <- model.frame(tree) - data[, paste("Node", LETTERS[seq_along(tree$info$node)], sep = "")] <- - tvcm_get_node(tree, tree$data, TRUE, tree$fitted[,"(weights)"], formList) - - ## refit model - call <- list(name = as.name(tree$info$fit), - formula = quote(ff$full), - data = quote(data), - family = quote(tree$info$family), - weights = tree$fitted[,"(weights)"]) - call[names(tree$info$dotargs)] <- tree$info$dotargs - mode(call) <- "call" - tree$info$model <- suppressWarnings(try(eval(call), TRUE)) - - ## check if the refitting failed - if (inherits(tree$info$model, "try-error")) - stop("tree model fitting failed.") - - ## heavy terms - if (inherits(tree$info$model, "glm")) { - attr(attr(tree$info$model$model, "terms"), ".Environment") <- NULL - environment(tree$info$model$formula) <- NULL - attr(tree$info$model$terms, ".Environment") <- NULL - } - - ## update control - tree$info$control <- - tvcm_grow_setcontrol(tree$info$control, tree$info$model, formList, vcRoot) - - return(tree) + ## extract new formula + env <- environment() + formList <- tree$info$formula + vcRoot <- sapply(tree$info$node, width) == 1L + ff <- tvcm_formula(formList, vcRoot, tree$info$family, env) + + ## overwrite node predictors + data <- model.frame(tree) + data[, paste("Node", LETTERS[seq_along(tree$info$node)], sep = "")] <- + tvcm_get_node(tree, tree$data, TRUE, tree$fitted[,"(weights)"], formList) + + ## refit model + call <- list(name = as.name(tree$info$fit), + formula = quote(ff$full), + data = quote(data), + family = quote(tree$info$family), + weights = tree$fitted[,"(weights)"]) + call[names(tree$info$dotargs)] <- tree$info$dotargs + mode(call) <- "call" + tree$info$model <- suppressWarnings(try(eval(call), TRUE)) + + ## check if the refitting failed + if (inherits(tree$info$model, "try-error")) + stop("tree model fitting failed.") + + ## heavy terms + if (inherits(tree$info$model, "glm")) 
{ + attr(attr(tree$info$model$model, "terms"), ".Environment") <- NULL + environment(tree$info$model$formula) <- NULL + attr(tree$info$model$terms, ".Environment") <- NULL + } + + ## update control + tree$info$control <- + tvcm_grow_setcontrol(tree$info$control, tree$info$model, formList, vcRoot) + + return(tree) } - + ## get the original tree if (original && !identical(tree$info$node, tree$info$grownode)) { - tree$node <- tree$info$grownode[[1L]] - tree$info$node <- tree$info$grownode - tree <- refit(tree) + tree$node <- tree$info$grownode[[1L]] + tree$info$node <- tree$info$grownode + tree <- refit(tree) } if (tunepar %in% c("alpha", "maxstep", "terminal")) { - - ## prune the tree structure - node <- tvcm_prune_node(tree, alpha, maxstep, terminal) - ## refit the model if something has changed - if (!identical(node, tree$info$node)) { + ## prune the tree structure + node <- tvcm_prune_node(tree, alpha, maxstep, terminal) + + ## refit the model if something has changed + if (!identical(node, tree$info$node)) { - ## attach nodes - tree$node <- node[[1L]] - tree$info$node <- node - - ## refit the model - tree <- refit(tree) - } + ## attach nodes + tree$node <- node[[1L]] + tree$info$node <- node - } else if (tunepar == "dfsplit") { + ## refit the model + tree <- refit(tree) + } + + } else if (tunepar == "cp") { + + ## prune as long as there remain 'weak' links + run <- 1L; step <- 0L; + cols <- c("part", "node", "loss", "npar", "nsplit", "dev") + evalcols <- c("loss", "npar", "nsplit") + prunepath <- list() - if (direction == "backward") { + while (run > 0) { - ## prune as long as there remain 'weak' links - run <- 1L; step <- 0L; - cols <- c("loss", "npar", "nspl") - prunepath <- list() - keeplosscount <- 0 + run <- 0 ; step <- step + 1L; + ncollapse <- NULL - while (run > 0) { - - run <- 0 ; step <- step + 1L; - ncollapse <- NULL - - ids <- lapply(tree$info$node, function(node) { - setdiff(nodeids(node), nodeids(node, terminal = TRUE)) - }) - - ids00 <- lapply(seq_along(tree$info$node), - function(pid) { - unlist(nodeapply(tree$info$node[[pid]], ids[[pid]], - function(node) node$info$id$original)) - }) - - ## 'call' to evaluate the collapses - prStatCall <- list(name = as.name(papply), X = quote(subs), - FUN = quote(prStat)) - prStatCall[names(papplyArgs)] <- papplyArgs - mode(prStatCall) <- "call" + ids <- lapply(tree$info$node, function(node) { + setdiff(nodeids(node), nodeids(node, terminal = TRUE)) + }) + + ids00 <- lapply(seq_along(tree$info$node), + function(pid) { + unlist(nodeapply(tree$info$node[[pid]], ids[[pid]], + function(node) node$info$id$original)) + }) + + ## 'call' to evaluate the collapses + prStatCall <- list(name = as.name(tree$info$control$papply), + X = quote(subs), + FUN = quote(prStat)) + prStatCall[names(tree$info$control$papply.args)] <- + tree$info$control$papply.args + mode(prStatCall) <- "call" + + ntab <- matrix(, length(unlist(ids)), length(cols)) + colnames(ntab) <- cols + ntab[, "part"] <- rep(seq_along(tree$info$node), sapply(ids, length)) + ntab[, "node"] <- unlist(ids) + ntab[, "loss"] <- rep.int(Inf, length(unlist(ids))) + + if (nrow(ntab) > 0L) { - ntab <- data.frame( - part = rep(seq_along(tree$info$node), sapply(ids, length)), - node = unlist(ids), - loss = rep.int(Inf, length(unlist(ids))), - npar = rep.int(NA, length(unlist(ids))), - nspl = rep.int(NA, length(unlist(ids))), - dfsplit = rep.int(NA, length(unlist(ids)))) - - if (nrow(ntab) > 0L) { + if (step > 1L) { - if (step > 1L) { - - ## search for the parents of the previously selecte node - 
spart <- otab$part[ocollapse] - snode <- otab$node[ocollapse] - root <- 1L - npath <- c(root); opath <- c(root); - while (root != snode) { - nkids <- unlist(nodeapply(tree$info$node[[spart]], root, - function(node) - sapply(node$kids, function(kids) kids$id))) - okids <- unlist(nodeapply(tree$info$node[[spart]], nkids, - function(node) node$info$id$last)) - kidkids <- nodeapply(tree$info$node[[spart]], nkids, nodeids) - kid <- sapply(kidkids, function(x) snode %in% x) - root <- nkids[kid] - if (root != snode) { - npath <- c(npath, nkids[kid]) - opath <- c(opath, okids[kid]) - } + ## search for the parents of the previously selecte node + spart <- otab[ocollapse, "part"] + snode <- otab[ocollapse, "node"] + root <- 1L + npath <- c(root); opath <- c(root); + while (root != snode) { + nkids <- + unlist(nodeapply(tree$info$node[[spart]], root, + function(node) + sapply(node$kids, function(kids) kids$id))) + okids <- unlist(nodeapply(tree$info$node[[spart]], nkids, + function(node) node$info$id$last)) + kidkids <- nodeapply(tree$info$node[[spart]], nkids, nodeids) + kid <- sapply(kidkids, function(x) snode %in% x) + root <- nkids[kid] + if (root != snode) { + npath <- c(npath, nkids[kid]) + opath <- c(opath, okids[kid]) } - - ## insert results of parent models - ntab[ntab$node %in% npath & ntab$part == spart, cols] <- - otab[otab$node %in% opath & otab$part == spart, cols] - - - if (keeplosscount < keeploss) { - - ## approximate the remaining ids from the selected partition - otab$loss <- otab$loss + (otab$loss[ocollapse] - loss0) - otab$npar <- otab$npar - (npar0 - otab$npar[ocollapse]) - otab$nspl <- otab$nspl - (nspl0 - otab$nspl[ocollapse]) - nids <- nodeids(tree$info$node[[spart]]) - oids <- unlist(nodeapply(tree$info$node[[spart]], - nodeids(tree$info$node[[spart]]), - function(node) node$info$id$last)) - subs <- ntab$node %in% nids & - ntab$part %in% spart & is.infinite(ntab$loss) - subs <- nids %in% ntab$node[subs] - nids <- nids[subs] - oids <- oids[subs] - ntab[ntab$node %in% nids & ntab$part == spart, cols] <- - otab[otab$node %in% oids & otab$part == spart, cols] - keeplosscount <- keeplosscount + 1 - - } else keeplosscount <- 0 } - subs <- which(is.infinite(ntab$loss)) - if (length(subs) > 0L) { - - ## reestimate all models which collapse each on inner node - prStat <- function(i) { + ## insert results of parent models + ntab[ntab[, "node"] %in% npath & ntab[, "part"] == spart, evalcols] <- + otab[otab[, "node"] %in% opath & otab[, "part"] == spart, evalcols] + } + } + + loss0 <- tree$info$control$lossfun(tree) + npar0 <- extractAIC(tree)[1L] + nsplit0 <- sum(sapply(tree$info$node, width) - 1L) + + subs <- which(is.infinite(ntab[, "loss"])) + if (length(subs) > 0L) { + + ## re-estimate all models which collapse each on inner node + prStat <- function(i) { + term <- lapply(seq_along(tree$info$node), + function(pid) { + if (pid == ntab[i, "part"]) return(ntab[i, "node"]) + return(NULL) + }) + prTree <- try(prune(tree, terminal = term, papply = lapply), TRUE) + if (!inherits(prTree, "try-error")) + return(c(prTree$info$control$lossfun(prTree), + extractAIC(prTree$info$model)[1L], + sum(sapply(prTree$info$node, width) - 1L))) + return(c(Inf, NA, NA)) + } + stat <- eval(prStatCall) + ntab[subs, evalcols] <- t(sapply(stat, function(x) x)) + } + + if (nrow(ntab) > 0L) { + if (any(ntab[, "loss"] < Inf)) { + + ## minimum dfsplit such that the smaller model improves the fit + ntab[, "dev"] <- + (ntab[, "loss"] - loss0) / + (tvcm_complexity(npar0, tree$info$control$dfpar, + nsplit0, 
tree$info$control$dfsplit) - + tvcm_complexity(ntab[, "npar"], tree$info$control$dfpar, + ntab[, "nsplit"], tree$info$control$dfsplit)) + + ## prune selected inner node + if (any(ntab[, "dev"] <= cp)) { + ncollapse <- which(!is.na(ntab[, "dev"]) & + !is.nan(ntab[, "dev"]) & + ntab[, "dev"] == min(ntab[, "dev"])) + if (length(ncollapse) > 1L) ncollapse <- sample(ncollapse, 1L) + if (length(ncollapse) > 0L) { term <- lapply(seq_along(tree$info$node), function(pid) { - if (pid == ntab$part[i]) return(ntab$node[i]) + if (pid == ntab[ncollapse, "part"]) + return(ntab[ncollapse, "node"]) return(NULL) }) - prTree <- try(prune(tree, terminal = term, papply = lapply), TRUE) - if (!inherits(prTree, "try-error")) - return(c(prTree$info$control$lossfun(prTree), - extractAIC(prTree$info$model)[1L], - sum(sapply(prTree$info$node, width) - 1L))) - return(c(Inf, NA, NA)) + tree <- prune(tree, terminal = term) + run <- 1 } - stat <- eval(prStatCall) - ntab[subs, cols] <- t(sapply(stat, function(x) x)) } + otab <- ntab + ocollapse <- ncollapse - if (any(ntab$loss < Inf)) { - - ## minimum dfsplit such that the smaller model improves the fit - loss0 <- tree$info$control$lossfun(tree) - npar0 <- extractAIC(tree)[1L] - nspl0 <- sum(sapply(tree$info$node, width) - 1L) - ntab$dfsplit <- (ntab$loss + dfpar * ntab$npar - loss0 - dfpar * npar0) / - (nspl0 - ntab$nspl) - - ## prune selected inner node - if (any(ntab$dfsplit <= dfsplit)) { - ncollapse <- which(!is.na(ntab$dfsplit) & - !is.nan(ntab$dfsplit) & - ntab$dfsplit == min(ntab$dfsplit)) - if (length(ncollapse) > 1L) ncollapse <- sample(ncollapse, 1L) - if (length(ncollapse) > 0L) { - term <- lapply(seq_along(tree$info$node), - function(pid) { - if (pid == ntab$part[ncollapse]) - return(ntab$node[ncollapse]) - return(NULL) - }) - tree <- prune(tree, terminal = term, keeploss = TRUE) - run <- 1 - } - } - otab <- ntab - ocollapse <- ncollapse - - } else { - stop("fitting of nested models failed.") - } - } - - ## create a 'data.frame' for the 'prunepath' method - tab <- data.frame( - part = NA, - node = NA, - loss = tree$info$control$lossfun(tree), - npar = extractAIC(tree)[1L], - nspl = sum(sapply(tree$info$node, width) - 1L), - dfsplit = NA, - row.names = "") - if (nrow(ntab) > 0L) { - ntab$node <- unlist(ids00) - rownames(ntab) <- seq_along(rownames(ntab)) - tab <- rbind(tab, ntab) + } else { + stop("fitting of nested models failed.") } - prunepath[[step]] <- list(step = step, tab = tab) } - } - tree$info$prunepath <- prunepath - - } else if (direction == "forward") { - - ## table with information from the latest step - tab <- data.frame( - step = seq(0, tree$info$nstep, 1L), - loss = sapply(tree$info$splitpath, function(x) x$loss), - npar = sapply(tree$info$splitpath, function(x) x$npar), - nspl = sapply(tree$info$splitpath, function(x) x$nspl)) - - ## maximum dfsplit such that the larger model improves the fit - tab$dfsplit <- c(-(diff(tab$loss) + dfpar * diff(tab$npar)) / diff(tab$nspl), 0) - subs <- tab$dfsplit > dfsplit - - if (any(subs)) { - maxstep <- max(tab$step[subs]) - tree <- prune(tree, maxstep = maxstep) - tab <- tab[subs, ] + ## create a 'data.frame' for the 'prunepath' method + + tab <- matrix(c(NA, NA, loss0, npar0, nsplit0, NA), + ncol = length(cols), dimnames = list("", cols)) + if (nrow(ntab) > 0L) { + ntab[, "node"] <- unlist(ids00) + rownames(ntab) <- seq(1L, nrow(ntab), length.out = nrow(ntab)) + tab <- rbind(tab, ntab) + } + prunepath[[step]] <- list(step = step, tab = tab) } - tree$info$prunepath <- tab + tree$info$prunepath <- 
prunepath } ## return pruned model @@ -681,17 +633,22 @@ prune.tvcm <- function(tree, dfsplit = NULL, dfpar = NULL, prunepath.tvcm <- function(tree, steps = 1L, ...) { rval <- tree$info$prunepath[steps] + rval <- lapply(rval, function(x) { + x$tab <- as.data.frame(x$tab) + x$tab$part <- LETTERS[x$tab$part] + return(x) + }) class(rval) <- "prunepath.tvcm" return(rval) } -print.prunepath.tvcm <- function(x, ...) { +print.prunepath.tvcm <- function(x, na.print = "", ...) { for (i in seq_along(x)) { if (!is.null(x[[i]])) { if (i != 1L) cat("\n") cat("Step:", x[[i]]$step, "\n") - print(x[[i]]$tab, ...) + print(as.matrix(x[[i]]$tab), na.print = na.print, quote = FALSE, ...) } } } @@ -720,7 +677,7 @@ splitpath.tvcm <- function(tree, steps = 1L, if (!details) { for (i in seq_along(steps)) { rval[[i]]$sctest <- NULL - rval[[i]]$lossgrid <- NULL + rval[[i]]$grid <- NULL } } class(rval) <- "splitpath.tvcm" @@ -735,33 +692,36 @@ print.splitpath.tvcm <- function(x, ...) { if (is.null(unlist(x[[i]]$varid))) { cat(" (no splitting processed)\n") } else { + cat("\n\nSelected Split:") cat("\nPartition:", LETTERS[x[[i]]$partid]) - cat("\nSplitting variable:", x[[i]]$var) cat("\nNode:", x[[i]]$node) + cat("\nVariable:", x[[i]]$var) cat("\nCutpoint: ") cat(paste("{",paste(x[[i]]$cutpoint, collapse = "}, {"), "}", sep = "")) - cat("\nPenalized loss reduction:", format(x[[i]]$plossred, ...), "\n") } - if (!is.null(x[[i]]$sctest) | !is.null(x[[i]]$lossgrid)) - cat("\nDetails:\n") if (!is.null(x[[i]]$sctest)) { - cat("\nCoefficient constancy tests (p-value):\n") + cat("\nCoefficient Constancy Tests (p-value):\n") for (pid in seq_along(x[[i]]$sctest)) { cat(paste("\nPartition ", LETTERS[pid], ":\n", sep = "")) print(as.data.frame(x[[i]]$sctest[[pid]]), ...) } - } - if (!is.null(x[[i]]$lossgrid)) { - cat("\nLoss-minimizing grid search (loss reduction):\n") - for (pid in seq_along(x[[i]]$lossgrid)) { - for (nid in seq_along(x[[i]]$lossgrid[[pid]])) { - for (vid in seq_along(x[[i]]$lossgrid[[pid]][[nid]])) { - if (length(x[[i]]$lossgrid[[pid]][[nid]][[vid]]) > 0L) { - cat("\nPartition:", names(x[[i]]$lossgrid)[pid], - "Node:", sub("Node", "",names(x[[i]]$lossgrid[[pid]])[nid]), - "Variable:", names(x[[i]]$lossgrid[[pid]][[nid]])[vid], "\n") - print(as.data.frame(x[[i]]$lossgrid[[pid]][[nid]][[vid]]), ...) + } else cat("\n") + + if (!is.null(x[[i]]$grid)) { + cat("\nLoss Reduction Statistics:") + for (pid in seq_along(x[[i]]$grid)) { + for (nid in seq_along(x[[i]]$grid[[pid]])) { + for (vid in seq_along(x[[i]]$grid[[pid]][[nid]])) { + if (is.null(x[[i]]$sctest) | !is.null(x[[i]]$sctest) && + any(x[[i]]$grid[[pid]][[nid]][[vid]][[2L]] > -Inf)) { + cat("\nPartition:", LETTERS[pid], + "Node:", sub("Node", "",names(x[[i]]$grid[[pid]])[nid]), + "Variable:", names(x[[i]]$grid[[pid]][[nid]])[vid], "\n") + print(as.data.frame( + cbind(x[[i]]$grid[[pid]][[nid]][[vid]][[1L]], + "dev" = x[[i]]$grid[[pid]][[nid]][[vid]][[2L]], + "npar" = x[[i]]$grid[[pid]][[nid]][[vid]][[3L]])), ...) } } } @@ -774,3 +734,10 @@ print.splitpath.tvcm <- function(x, ...) { weights.tvcm <- function(object, ...) { weights(extract(object, "model")) } + + +width.tvcm <- function(x, ...) 
{ + rval <- sapply(x$info$node, width) + names(rval) <- LETTERS[seq_along(rval)] + return(rval) +} diff --git a/R/tvcm-plot.R b/R/tvcm-plot.R index 8c3db09..50408a2 100644 --- a/R/tvcm-plot.R +++ b/R/tvcm-plot.R @@ -33,14 +33,13 @@ plot.tvcm <- function(x, type = c("default", "coef", "simple", "partdep", "cv"), - main = NULL, part = NULL, + main, part = NULL, drop_terminal = TRUE, tnex = 1, newpage = TRUE, ask = NULL, pop = TRUE, gp = gpar(), ...) { ## checks type <- match.arg(type) - stopifnot(is.null(main) | is.character(main)) stopifnot(is.logical(drop_terminal) && length(drop_terminal) == 1L) stopifnot(is.numeric(tnex) && length(tnex) == 1L) stopifnot(is.logical(newpage) && length(newpage) == 1L) @@ -49,13 +48,15 @@ plot.tvcm <- function(x, type = c("default", "coef", if (type == "partdep") { - call <- list(as.name("panel_partdep"), object = quote(x), ask = ask, main = main) - call <- append(call, list(...)) - mode(call) <- "call" - eval(call) + if (missing(main)) main <- NULL + call <- list(as.name("panel_partdep"), object = quote(x), ask = ask, main = main) + call <- append(call, list(...)) + mode(call) <- "call" + eval(call) } else if (type == "cv") { + if (missing(main)) main <- NULL if (is.null(x$info$cv)) { warning("no information on cross validation.") } else { @@ -65,29 +66,29 @@ plot.tvcm <- function(x, type = c("default", "coef", } else { ## tree plots - if (is.null(part)) part <- seq_along(x$info$node) if (is.character(part)) { - levs <- LETTERS[seq_along(x$info$node)] - if (any(!part %in% levs)) stop("unknown 'part'.") - part <- which(part %in% levs) + part <- which(LETTERS[seq_along(x$info$node)] %in% part) } else if (is.numeric(part)) { part <- as.integer(part) } else { stop("'part' must be a 'character' or a 'integer'.") } + if (length(part) < 1L) stop("no valid 'part' specified.") ## whether an input is expected before plotting the next tree if (is.null(ask)) ask <- ifelse(length(part) == 1L, FALSE, TRUE) ## repeat the title - if (is.null(main)) { - main <- tvcm_print_vclabs(x) - main <- main[part] + if (missing(main)) { + main <- tvcm_print_vclabs(x$info$formula)[part] + } else if (is.character(main)){ + main <- rep(main, length.out = length(part)) } else { - main <- rep(main, length.out = length(part)) + main <- NULL } + ## terminal panel tp_fun <- switch(type, @@ -99,8 +100,11 @@ plot.tvcm <- function(x, type = c("default", "coef", }, "coef" = panel_coef, "simple" = panel_empty) - tp_args <- + tp_args <- if ("tp_args" %in% names(list(...))) { + list(...)$tp_args + } else { list(...)[names(list(...)) %in% names(formals(tp_fun))[-1]] + } tp_args$part <- part tp_args$gp <- gp @@ -110,17 +114,30 @@ plot.tvcm <- function(x, type = c("default", "coef", "default" = node_inner, "coef" = node_inner, "simple" = node_inner) - ip_args <- + ip_args <- if ("ip_args" %in% names(list(...))) { + list(...)$ip_args + } else { list(...)[names(list(...)) %in% names(formals(ip_fun))[-1]] + } + + ## edge panel + ep_fun <- edge_default + ep_args <- if ("ep_args" %in% names(list(...))) { + list(...)$ep_args + } else { + list(...)[names(list(...)) %in% names(formals(ep_fun))[-1]] + } ## other arguments dotargs <- list(...)[names(list(...)) %in% names(formals(plot.party))[-1]] - + dotargs <- dotargs[setdiff(names(dotargs), c("tp_args", "ip_args", "ep_args"))] + ## prepare call call <- list(name = as.name("plot.party"), x = quote(x), terminal_panel = quote(tp_fun), tp_args = quote(tp_args), inner_panel = quote(ip_fun), ip_args = quote(ip_args), + edge_panel = quote(ep_fun), ep_args = 
quote(ep_args), drop_terminal = quote(drop_terminal), tnex = quote(tnex), newpage = quote(newpage), main = quote(main[pid]), pop = pop, gp = gp) @@ -271,7 +288,7 @@ panel_get_main <- function(object, node, id, nobs) { if (id) rval <- paste(rval, paste(names(object)[id_node(node)], sep = "")) if (id && nobs) rval <- paste(rval, ": ", sep = "") - if (nobs) rval <- paste(rval, "nobs = ", node$info$dims["n"], sep = "") + if (nobs) rval <- paste(rval, "n = ", node$info$dims["n"], sep = "") return(rval) } @@ -310,10 +327,12 @@ panel_coef <- function(object, parm = NULL, rval <- function(node) { pushViewport(viewport()) grid.text("(no split)") - upViewport() + upViewport(2L) } return(rval) } + + ## extract subset of coefficients if (is.null(parm)) parm <- colnames(coef) if (!is.list(parm)) parm <- list(parm) if (is.numeric(unlist(parm))) { @@ -401,6 +420,7 @@ panel_coef <- function(object, parm = NULL, } ## population mean + meanCoef <- NULL if (mean) { mean_gp <- argsToList(mean_gp) @@ -413,10 +433,6 @@ panel_coef <- function(object, parm = NULL, sum(weights(object)) meanCoef <- colSums(coef * matrix(w, nrow(coef), ncol(coef))) meanCoef <- lapply(parm, function(trms) meanCoef[trms, drop = FALSE]) - if (conf.int) { - meanSd <- sqrt(colSums(sd^2 * matrix(w, nrow(coef), ncol(coef))^2)) - meanSd <- lapply(parm, function(trms) sqrt(meanSd[trms, drop = FALSE])) - } } qN <- qnorm(0.975) @@ -459,63 +475,79 @@ panel_coef <- function(object, parm = NULL, unit(plot_gp[[i]]$xlim[2], "native"), unit(0, "native"), gp = gpar(col = "black")) - if (conf.int) { - - grid.segments(unit(1:ncol(coefList[[i]]), "native"), - unit(coefList[[i]][as.character(id_node(node)),] - - qN * sdList[[i]][as.character(id_node(node)),], "native"), - unit(1:ncol(coefList[[i]]), "native"), - unit(coefList[[i]][as.character(id_node(node)),] + - qN * sdList[[i]][as.character(id_node(node)),], "native"), - arrow = arrow(angle = conf.int_gp[[i]]$angle, - length = conf.int_gp[[i]]$length, - ends = conf.int_gp[[i]]$ends, - type = conf.int_gp[[i]]$type), - gp = plot_gp[[i]]$gp) + subs <- coefList[[i]][as.character(id_node(node)),] >= plot_gp[[i]]$ylim[1L] & + coefList[[i]][as.character(id_node(node)),] <= plot_gp[[i]]$ylim[2L] + + subsMean <- NULL + if (mean) { + subsMean <- meanCoef[[i]] >= plot_gp[[i]]$ylim[1L] & + meanCoef[[i]] <= plot_gp[[i]]$ylim[2L] + } + + ## option 'conf.int = TRUE' + if (conf.int) { + + ## crop the lines + nCoef <- length(coefList[[i]][as.character(id_node(node)),]) + endCi <- rep(conf.int_gp[[i]]$ends, length.out = nCoef) + lenCi <- rep(conf.int_gp[[i]]$length, length.out = nCoef) + lwr <- coefList[[i]][as.character(id_node(node)),] - + qN * sdList[[i]][as.character(id_node(node)),] + endCi[lwr < plot_gp[[i]]$ylim[1L]] <- "last" + lwr[lwr < plot_gp[[i]]$ylim[1L]] <- plot_gp[[i]]$ylim[1L] + lwr[lwr > plot_gp[[i]]$ylim[2L]] <- NA + upr <- coefList[[i]][as.character(id_node(node)),] + + qN * sdList[[i]][as.character(id_node(node)),] + endCi[upr > plot_gp[[i]]$ylim[2L] & endCi == "last"] <- "none" + endCi[upr > plot_gp[[i]]$ylim[2L] & endCi == "both"] <- "first" + upr[upr > plot_gp[[i]]$ylim[2L]] <- plot_gp[[i]]$ylim[2] + upr[upr < plot_gp[[i]]$ylim[1L]] <- NA + subsCi <- !is.na(lwr) & !is.na(upr) + lenCi[endCi == "none"] <- 0 + endCi[endCi == "none"] <- "both" - if (FALSE) { - - grid.segments(unit(1:ncol(coefList[[i]]), "native"), - unit(meanCoef[[i]] - qN * meanSd[[i]], "native"), - unit(1:ncol(coefList[[i]]), "native"), - unit(meanCoef[[i]] + qN * meanSd[[i]], "native"), - arrow = arrow(angle = conf.int_gp[[1]]$angle,
- length = conf.int_gp[[i]]$length, - ends = conf.int_gp[[i]]$ends, - type = conf.int_gp[[i]]$type), - gp = mean_gp[[i]]$gp) - - } + ## plot + if (any(subsCi)) + grid.segments(unit(which(subsCi), "native"), + unit(lwr[subsCi], "native"), + unit(which(subsCi), "native"), + unit(upr[subsCi], "native"), + arrow = arrow(angle = conf.int_gp[[i]]$angle, + length = lenCi, + ends = endCi, + type = conf.int_gp[[i]]$type), + gp = plot_gp[[i]]$gp) } + ## option 'type = "p"' if (plot_gp[[i]]$type %in% c("p", "b")) { - if (mean) { - grid.points(unit(1:ncol(coefList[[i]]), "native"), - unit(meanCoef[[i]], "native"), + if (mean && any(subsMean)) + grid.points(unit(which(subsMean), "native"), + unit(meanCoef[[i]][subsMean], "native"), pch = mean_gp[[i]]$pch, gp = mean_gp[[i]]$gp) - } - grid.points(unit(1:ncol(coefList[[i]]), "native"), - unit(coefList[[i]][as.character(id_node(node)),], - "native"), - pch = plot_gp[[i]]$pch, gp = plot_gp[[i]]$gp) + if (any(subs)) + grid.points(unit(which(subs), "native"), + unit(coefList[[i]][as.character(id_node(node)),][subs], + "native"), + pch = plot_gp[[i]]$pch, gp = plot_gp[[i]]$gp) } - + ## option 'type = "l"' if (plot_gp[[i]]$type %in% c("l", "b")) { - - if (mean) { - grid.lines(unit(1:ncol(coefList[[i]]), "native"), - unit(meanCoef[[i]], "native"), - gp = mean_gp[[i]]$gp) - } - grid.lines(unit(1:ncol(coefList[[i]]), "native"), - unit(coefList[[i]][as.character(id_node(node)),], - "native"), - gp = plot_gp[[i]]$gp) + if (mean && any(subsMean)) + grid.lines(unit(which(subsMean), "native"), + unit(meanCoef[[i]][subsMean], "native"), + gp = mean_gp[[i]]$gp) + + if (any(subs)) + grid.lines(unit(which(subs), "native"), + unit(coefList[[i]][as.character(id_node(node)),][subs], + "native"), + gp = plot_gp[[i]]$gp) } if (id_node(node) == min(nodeids(object, terminal = TRUE))) { @@ -558,3 +590,42 @@ panel_empty <- function(object, part = 1L, id = TRUE, nobs = TRUE, ...) { return(rval) } class(panel_empty) <- "grapcon_generator" + +edge_default <- function(obj, digits = 3, abbreviate = FALSE, + justmin = Inf, + just = c("alternate", "increasing", "decreasing", "equal")) { + meta <- obj$data + + justfun <- function(i, split) { + myjust <- if(mean(nchar(split)) > justmin) { + match.arg(just, c("alternate", "increasing", "decreasing", "equal")) + } else { + "equal" + } + k <- length(split) + rval <- switch(myjust, + "equal" = rep.int(0, k), + "alternate" = rep(c(0.5, -0.5), length.out = k), + "increasing" = seq(from = -k/2, to = k/2, by = 1), + "decreasing" = seq(from = k/2, to = -k/2, by = -1) + ) + unit(0.5, "npc") + unit(rval[i], "lines") + } + + ## panel function for simple edge labelling + function(node, i) { + split <- character_split(split_node(node), meta, digits = digits)$levels + y <- justfun(i, split) + split <- split[i] + ## try() because the following won't work for split = "< 10 Euro", for example. 
+ if(any(grep(">", split) > 0) | any(grep("<", split) > 0)) { + tr <- suppressWarnings(try(parse(text = paste("phantom(0)", split)), + silent = TRUE)) + if(!inherits(tr, "try-error")) split <- tr + } + grid.rect(y = y, gp = gpar(fill = "white", col = 0), + width = unit(1, "strwidth", split)) + grid.text(split, y = y, just = "center") + } +} +class(edge_default) <- "grapcon_generator" diff --git a/R/tvcm-utils.R b/R/tvcm-utils.R index e005c01..4650893 100644 --- a/R/tvcm-utils.R +++ b/R/tvcm-utils.R @@ -1,7 +1,7 @@ ##' -------------------------------------------------------- # ##' Author: Reto Buergin ##' E-Mail: reto.buergin@unige.ch, rbuergin@gmx.ch -##' Date: 2014-09-08 +##' Date: 2014-10-14 ##' ##' Description: ##' Workhorse functions for the 'tvcm' function @@ -9,17 +9,20 @@ ##' Overview: ##' ##' Workhorse functions for partitioning: +##' tvcm_complexity: computes the complexity of the model ##' tvcm_grow: main function for growing the trees ##' tvcm_grow_fit: fits the current model ##' tvcm_grow_update: refits the model (with utility function ##' 'glm.doNotFit') +##' tvcm_getNumSplits get cutpoints of numeric variables +##' tvcm_getOrdSplits get cutpoints of ordinal variables +##' tvcm_getNomSplits get cutpoints of nominal variables ##' tvcm_grow_setsplits: get current splits -##' tvcm_setsplits_validcats: validate categorical cuts ##' tvcm_setsplits_sctest: update splits after tests ##' tvcm_setsplits_splitnode: ##' tvcm_setsplits_rselect: randomly select partitions, variables and nodes ##' tvcm_grow_sctest: run coefficient constancy tests -##' tvcm_grow_gridsearch: grid based loss minimization +##' tvcm_grow_gridsearch: compute the dev statistics ##' tvcm_grow_splitnode: split in variable x. ##' tvcm_formula: extract separate formulas for ##' model and partitioning from @@ -46,6 +49,33 @@ ##' tvcm_grow_splitpath: creates a 'splitpath.tvcm' object ##' ##' Last modifications: +##' 2014-11-05: parallelized 'estfun.olmm' call in 'tvcm_grow_sctest' +##' 2014-10-14: - modify dev-grid structure obtained from 'tvcm_grow_gridsearch' +##' each combination of part/node/var has now a list of three +##' elements where the first contains the cuts, the second the +##' the loss reduction and the third the difference in the number +##' of parameters. Modifications concerned: +##' - tvcm_grow_setsplits +##' - tvcm_setsplits_sctest +##' - tvcm_setsplits_rselect +##' - print.splitpath +##' - tvcm_setsplits_splitnode allocates for the splitted +##' node a list structure (before it was a empty list +##' on the node level) +##' - small modifications in getSplits function +##' - deleted 'tvcm_setsplits_validcats' +##' - added 'tvcm_getNumSplits', 'tvcm_getOrdSplits' and +##' 'tvcm_getNomSplits' +##' 2014-09-22: deleted unnecessary 'subs' object in 'tvcm_grow_gridsearch' +##' which I didn't remove when removing the 'keepdev' +##' option +##' 2014-09-17: - delete 'keepdev' argument (also for prune.tvcm) +##' - add function 'tvcm_complexity' +##' 2014-09-15: changed 'dev' labels to 'dev' etc. 
+##' 2014-09-10: - add 'control' argument for 'tvcm_grow_update' +##' to allow the control of variable centering +##' - add variable centering in 'tvcm_grow_update' +##' (which was not implemented for some curious reasons) ##' 2014-09-08: substitute 'rep' function by 'rep.int' or 'rep_len' ##' 2014-09-07: - added 'tvcm_get_vcparm' function ##' - set default values in 'glm.doNotFit' @@ -66,7 +96,7 @@ ##' multiple vc() terms with equal 'by' arguments ##' are present ##' 2014-08-08: correct bug in 'tvcm_grow_setsplits' regarding -##' 'keeploss' +##' 'keepdev' ##' 2014-08-08: add suppressWarnings in tvcm_grow_fit ##' 2014-07-22: the list of splits is now of structure ##' partitions-nodes-variables @@ -85,7 +115,27 @@ ##' 2013-12-02: remove 'tvcm_grow_setupnode' ##' 2013-11-01: modify 'restricted' and 'terms' correctly in ##' 'tvcm_modify_modargs' +##' +##' Bottleneck functions: +##' - tvcm_grow_sctest +##' - tvcm_grow_setsplits +##' - tvcm_grow_gridsearch +##' -------------------------------------------------------- # + ##' -------------------------------------------------------- # +##' Compute the complexity of the model. +##' +##' @param npar the number of coefficients +##' @param dfpar the degree of freedom per parameter +##' @param nsplit the number of splits +##' @param dfsplit the degree of freedom per split +##' +##' @return a scalar giving the complexity of the model +##' -------------------------------------------------------- # + +tvcm_complexity <- function(npar, dfpar, nsplit, dfsplit) + return(dfsplit * nsplit + dfpar * npar) + ##' -------------------------------------------------------- # ##' \code{\link{tvcm_grow_fit}} fits the current node model. @@ -132,24 +182,33 @@ tvcm_grow <- function(object, subset = NULL, weights = NULL) { as.integer(sapply(x, function(x) which(colnames(partData) == x))) }) ## set the root node - nodes <- replicate(nPart, partynode(id = 1L, info = list(dims = nobs(model), depth = 0L))) + nodes <- + replicate(nPart, partynode(id = 1L, info = list(dims = nobs(model), depth = 0L))) names(nodes) <- names(formList$vc) where <- vector("list", length = nPart) partid <- seq(1, nPart, length.out = nPart) spart <- 0 # pseudo value - splits <- vector("list", length = nPart) - + + ## allocate 'splits' + splits <- lapply(seq_along(partid), function(pid) { + lapply(1L, function(nid, pid) { + lapply(seq_along(varid[[pid]]), function(x) { + vector("list", 3) + }) + }, pid = pid) + }) + + ## allocate 'splitpath' splitpath <- list() run <- 1L step <- 0L - noverstep <- 0L while (run > 0L) { step <- step + 1L; nstep <- step; - test <- NULL; loss <- NULL; + test <- NULL; dev <- NULL; ## get current partitions and add them to the model data for (pid in seq_along(nodes)) { @@ -186,7 +245,7 @@ tvcm_grow <- function(object, subset = NULL, weights = NULL) { } ## compute / update splits - splits <- tvcm_grow_setsplits(splits, spart, partid, nodeid, varid, model, + splits <- tvcm_grow_setsplits(splits, partid, nodeid, varid, model, nodes, where, partData, control) ## check if there is at least one admissible split @@ -221,9 +280,10 @@ tvcm_grow <- function(object, subset = NULL, weights = NULL) { if (inherits(test, "try-error")) { run <- 0L stopinfo <- test - + } else { - testAdj <- tvcm_sctest_bonf(test,ifelse(control$bonferroni,"nodewise", "none")) + testAdj <- + tvcm_sctest_bonf(test,ifelse(control$bonferroni,"nodewise", "none")) run <- 1L * (min(c(1.0 + .Machine$double.eps, unlist(testAdj)), na.rm = TRUE) <= control$alpha) } @@ -234,9 +294,10 @@ tvcm_grow <- function(object, 
subset = NULL, weights = NULL) { testAdjPart <- tvcm_sctest_bonf(test,ifelse(control$bonferroni,"partitionwise","none")) minpval <- min(unlist(testAdjPart), na.rm = TRUE) - spart <- which(sapply(testAdjPart, function(x)any(sapply(x,identical,minpval)))) + spart <- + which(sapply(testAdjPart, function(x)any(sapply(x,identical,minpval)))) if (length(spart) > 1L) spart <- sample(spart, 1L) - + ## select variable and node minsubs <- which(sapply(test[[spart]], identical, min(test[[spart]], na.rm = TRUE))) @@ -246,22 +307,22 @@ tvcm_grow <- function(object, subset = NULL, weights = NULL) { ## print results if (control$verbose) { - + ## tests cat("\nCoefficient constancy tests (p-value):\n") for (pid in seq_along(nodes)) { - cat(paste("\nPartition ", LETTERS[pid], ":\n", sep = "")) - print(data.frame(format(testAdj[[pid]], digits = 2L))) - } + cat(paste("\nPartition ", LETTERS[pid], ":\n", sep = "")) + print(data.frame(format(testAdj[[pid]], digits = 2L))) + } ## selections - cat("\nSplitting partition:", names(nodes)[spart]) - cat("\nSplitting variable:", names(partData)[varid[[spart]][svar]]) + cat("\nPartition:", LETTERS[spart]) cat("\nNode:", levels(where[[spart]])[snode]) + cat("\nVariable:", names(partData)[varid[[spart]][svar]], "\n") } - ## set deviance statistic of not to selected nodes to 'Inf' to avoid + ## set dev statistic of not to selected nodes to 'Inf' to avoid ## model evaluations splits <- tvcm_setsplits_sctest(splits, partid, spart, nodeid, snode, varid, svar) @@ -273,100 +334,96 @@ tvcm_grow <- function(object, subset = NULL, weights = NULL) { if (run > 0L) { - ## ------------------------------------------------- # - ## Step 3: search a cutpoint - ## ------------------------------------------------- # - - ## compute the loss of all candidate splits and extract the best split - loss <- try(tvcm_grow_gridsearch(splits, partid, nodeid, varid, - model, nodes, where, partData, - control, mcall, formList, step), silent = TRUE) + ## ------------------------------------------------- # + ## Step 3: search a cutpoint + ## ------------------------------------------------- # - ## handling stops - if (inherits(loss, "try-error")) { - run <- 0L - stopinfo <- loss - nstep <- step - 1L - - } else { - splits <- loss$lossgrid - spart <- loss$partid + ## compute the dev of all candidate splits and extract the best split + dev <- try(tvcm_grow_gridsearch(splits, partid, nodeid, varid, + model, nodes, where, partData, + control, mcall, formList, step), + silent = TRUE) - if (is.null(loss$cut)) { - run <- 0L - stopinfo <- "no split that decreases the loss found" - nstep <- step - 1L - } - - if (run > 0L) { - noverstep <- if (loss$plossred < control$dfsplit) noverstep + 1L else 0L - if (noverstep > control$maxoverstep) { + ## handling stops + if (inherits(dev, "try-error")) { run <- 0L - if (control$maxoverstep > 0L) { - stopinfo <- "'maxoverstep' reached" - } else { - stopinfo <- "minimal penalized loss-reduction reached" + stopinfo <- dev + nstep <- step - 1L + + } else { + splits <- dev$grid + spart <- dev$partid + + if (is.null(dev$cut)) { + run <- 0L + stopinfo <- "found no admissible split" + nstep <- step - 1L } - nstep <- nstep - 1L - } - } - } + if (run > 0L) { + if (dev$pendev < control$mindev) { + run <- 0 + stopinfo <- paste("no split with", + if (control$cp > 0) "penalized", + "loss reduction > mindev") + nstep <- nstep - 1L + } + } + } } ## incorporate the split into 'nodes' if (run > 0L) - nodes <- tvcm_grow_splitnode(nodes, where, loss, partData, - step, weights) + nodes <- 
tvcm_grow_splitnode(nodes, where, dev, partData, + step, weights) if (run > 0L) - splits <- tvcm_setsplits_splitnode(splits, loss$partid, loss$nodeid, - nodeid, loss, model, control) + splits <- tvcm_setsplits_splitnode(splits, dev$partid, dev$nodeid, nodeid) ## update 'splitpath' to make the splitting process traceable if (run >= 0L) splitpath[[step]] <- list(step = step, - loss = control$lossfun(model), + dev = control$lossfun(model), npar = extractAIC(model)[1L], - nspl = step - 1L) + nsplit = step - 1L) if (!inherits(test, "try-error")) splitpath[[step]]$sctest <- test - if (!inherits(loss, "try-error")) { - if (run > 0L) { - splitpath[[step]]$partid <- loss$partid - splitpath[[step]]$nodeid <- loss$nodeid - splitpath[[step]]$varid <- loss$varid - splitpath[[step]]$cutid <- loss$cutid - splitpath[[step]]$plossred <- loss$plossred - } - splitpath[[step]]$lossgrid <- loss$lossgrid - } + if (!inherits(dev, "try-error") && run > 0L) + splitpath[[step]] <- append(splitpath[[step]], dev) ## print the split if (control$verbose) { if (run > 0L) { if (!control$sctest) { - cat("\n\nSplitting partition:", names(nodes)[loss$partid]) - cat("\nNode:", levels(where[[loss$partid]])[loss$nodeid]) - cat("\nVariable:", names(partData)[loss$varid]) + cat("\n\nPartition:", LETTERS[dev$partid]) + cat("\nNode:", levels(where[[dev$partid]])[dev$nodeid]) + cat("\nVariable:", names(partData)[dev$varid]) } else { cat("\n") } cat("\nCutpoint:\n") - print(as.data.frame(matrix(loss$cut, 1L, - dimnames = list(loss$cutid, - names(loss$cut))))) + print(as.data.frame(matrix(dev$cut, 1L, + dimnames = list(dev$cutid, + names(dev$cut))))) cat("Model comparison:\n") - print(data.frame("loss" = c(control$lossfun(model), - control$lossfun(model) - loss$lossred), - "penalized loss reduction" = c("", format(loss$plossred)), - row.names = paste("step", step + c(-1, 0)), - check.names = FALSE)) + print(data.frame( + cbind("loss" = c( + round(control$lossfun(model), 2), + round(control$lossfun(model) - dev$dev, 2L)), + ## if 'cp == 0' + "dev" = if (control$cp == 0) + c("", round(dev$dev, 2L)), + ## if 'cp > 0' + "penalized dev" = if (control$cp > 0) + c("", round(dev$pendev, 2L)), + deparse.level = 2), + row.names = paste("step", step + c(-1, 0)), + check.names = FALSE)) } else { cat("\n\nStopping partitioning.\nMessage:", as.character(stopinfo), "\n") @@ -387,7 +444,7 @@ tvcm_grow <- function(object, subset = NULL, weights = NULL) { } ## prepare the title - title <- c("Tree-based varying-coefficients model") + title <- c("Tree-Based Varying Coefficients Model") ## modify splitpath splitpath <- tvcm_grow_splitpath(splitpath, varid, nodes, partData, control) @@ -526,7 +583,7 @@ tvcm_grow_fit <- function(mcall, doFit = TRUE) { ##' Improve performance for non 'olmm' objects ##' -------------------------------------------------------- # -tvcm_grow_update <- function(object) { +tvcm_grow_update <- function(object, control) { if (inherits(object, "olmm")) { @@ -551,27 +608,32 @@ tvcm_grow_update <- function(object) { olmm_merge_mm(x = model.matrix(termsFeCe, object$frame, conCe), y = model.matrix(termsFeGe, object$frame, conGe), TRUE) - ## extract interaction predictors to be centered - subsCe <- which(rownames(attr(termsFeCe, "factors")) %in% c("Left", "Right")) - if (any(subsCe)) { - subsCe <- - which(colSums(attr(termsFeCe, "factors")[subsCe,,drop = FALSE]) > 0 & - !colnames(attr(termsFeCe, "factors")) %in% c("Left", "Right")) - subsCe <- - which(attr(object$X, "assign") %in% subsCe & attr(object$X, "merge") == 1) - } - subsGe <- 
which(rownames(attr(termsFeGe, "factors")) %in% c("Left", "Right")) - if (any(subsGe)) { - subsGe <- - which(colSums(attr(termsFeGe, "factors")[subsGe,,drop = FALSE]) > 0 & - !colnames(attr(termsFeGe, "factors")) %in% c("Left", "Right")) - subsGe <- - which(attr(object$X, "assign") %in% subsGe & attr(object$X, "merge") == 2) + if (control$center) { + + ## extract interaction predictors to be centered + ## (the ones with 'Left' and 'Right') + cColsCe <- which(rownames(attr(termsFeCe, "factors")) %in% c("Left", "Right")) + if (length(cColsCe) > 0L) { + cColsCe <- + which(colSums(attr(termsFeCe, "factors")[cColsCe,,drop = FALSE]) > 0 & + !colnames(attr(termsFeCe, "factors")) %in% c("Left", "Right")) + cColsCe <- + which(attr(object$X, "assign") %in% cColsCe & attr(object$X, "merge") == 1) + } + cColsGe <- which(rownames(attr(termsFeGe, "factors")) %in% c("Left", "Right")) + if (length(cColsGe) > 0L) { + cColsGe <- + which(colSums(attr(termsFeGe, "factors")[cColsGe,,drop = FALSE]) > 0 & + !colnames(attr(termsFeGe, "factors")) %in% c("Left", "Right")) + cColsGe <- + which(attr(object$X, "assign") %in% cColsGe & attr(object$X, "merge") == 2) + } + + ## center the predictors + object$X[, c(cColsCe, cColsGe)] <- + scale(object$X[, c(cColsCe, cColsGe)], center = TRUE, scale = TRUE) } - ## center the predictors - for (v in c(subsCe, subsGe)) object$X[,v] <- object$X[,v] - mean(object$X[,v]) - ## prepare optimization optim <- object$optim optim[[1L]] <- object$coefficients @@ -599,10 +661,28 @@ tvcm_grow_update <- function(object) { } else { - ## modify components in 'object' - x <- model.matrix(object$formula, model.frame(object)) + ## extract interaction predictors to be centered + ## (the ones with 'Left' and 'Right') + X <- model.matrix(object$formula, model.frame(object)) + + if (control$center) { + + ## get the columns of 'X' to be centered + terms <- terms(object$formula) + cCols <- which(rownames(attr(terms, "factors")) %in% c("Left", "Right")) + if (length(cCols) > 0L) { + cCols <- which(colSums(attr(terms, "factors")[cCols,,drop = FALSE]) > 0 & + !colnames(attr(terms, "factors")) %in% c("Left", "Right")) + cCols <- which(attr(X, "assign") %in% cCols) + } + + ## centering + X[, cCols] <- scale(X[, cCols], center = TRUE, scale = TRUE) + } + + object <- try(suppressWarnings( - glm.fit(x = x, y = object$y, weights = object$prior.weights, + glm.fit(x = X, y = object$y, weights = object$prior.weights, start = object$coefficients, offset = object$offset, family = object$family, control = object$control, intercept = TRUE)), TRUE) @@ -620,6 +700,159 @@ tvcm_grow_update <- function(object) { tvcm_grow_gefp <- gefp.olmm # see 'olmm-methods.R' +##'-------------------------------------------------------- # +##' Computes cutpoints of 'numeric' partitioning variables +##' +##' @param z a numeric vector +##' @param w a numeric vector of weights +##' @param minsize numerical scalar. The minimum node size. +##' @param maxnumsplit integer scalar. The maximum number +##' of cutpoints. +##' +##' @return A matrix with one column and one row for each +##' cutpoint +##' +##' @details Used in 'tvcm_grow_setsplits'. 
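+##'
+##' An illustrative call (hypothetical data, not packaged example code):
+##' z <- runif(100); w <- rep(1, 100)
+##' cuts <- tvcm_getNumSplits(z, w, minsize = 10, maxnumsplit = 9)
+##' ## 'cuts' is then a one-column "cut" matrix of candidate cutpoints
+##' ## with attr(cuts, "type") set to "dev"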
+##'-------------------------------------------------------- # + +tvcm_getNumSplits <- function(z, w, minsize, maxnumsplit) { + + ## order the partitioning variable + ord <- order(z) + z <- z[ord]; w <- w[ord]; + + ## get first and last valid indices + cw <- cumsum(w) + subsL <- max(c(1, which(cw < minsize))) + subsR <- min(c(length(z), which(cw > (cw[length(cw)] - minsize)))) + + if (subsL <= subsR && z[subsL] <= z[subsR]) { + + ## valid splits available + rval <- unique(z[z >= z[subsL] & z < z[subsR]]) + maxz <- z[length(z)] + + ## apply a cutpoint reduction if necessary + if ((length(rval) - 1) > maxnumsplit) { + nq <- maxnumsplit - 1L + rval <- c() + cw <- cw / cw[length(cw)] + while (length(rval) < (maxnumsplit + 1L)) { + nq <- nq + 1L + q <- (1:nq) / (nq + 1L) + rval <- unique(c(sapply(q, function(p) z[max(which(cw <= p))]), maxz)) + } + } + + ## delete largest value + rval <- rval[rval < maxz] + + } else { + + ## no valid splits + rval <- numeric() + } + + ## prepare return value + rval <- matrix(rval, ncol = 1L) + colnames(rval) <- "cut" + attr(rval, "type") <- "dev" + + ## return matrix with cutpoints + return(rval) +} + + +##'-------------------------------------------------------- # +##' Computes cutpoints of 'ordinal' partitioning variables +##' +##' @param z a vector of class 'ordered' +##' @param w a numeric vector of weights +##' @param minsize numerical scalar. The minimum node size. +##' @param maxordsplit integer scalar. The maximum number +##' of cutpoints. +##' +##' @return A matrix with one column for each category and +##' one row for each cutpoint +##' +##' @details Used in 'tvcm_grow_setsplits'. +##'-------------------------------------------------------- # + +tvcm_getOrdSplits <- function(z, w, minsize, maxordsplit) { + + ## get integer cutpoints using 'tvcm_getNumSplits' + nl <- nlevels(z) # all levels + cuts <- tvcm_getNumSplits(as.integer(z), w, minsize, maxordsplit) + zdlev <- 1:nl %in% as.integer(cuts) + + ## create a matrix to apply categorical splits + rval <- diag(nl) + rval[lower.tri(rval)] <- 1L + rval <- rval[zdlev,, drop = FALSE] + + ## prepare return value + colnames(rval) <- levels(z) + attr(rval, "type") <- "dev" + + ## return matrix with cutpoints + return(rval) +} + + +##'-------------------------------------------------------- # +##' Computes cutpoints of 'nominal' partitioning variables +##' +##' @param z a vector of class 'factor' +##' @param w a numeric vector of weights +##' @param minsize numerical scalar. The minimum node size. +##' @param maxnomsplit integer scalar. The maximum number +##' of cutpoints. +##' +##' @return A matrix with one column for each category and +##' one row for each cutpoint +##' +##' @details Used in 'tvcm_grow_setsplits'. +##'-------------------------------------------------------- # + +tvcm_getNomSplits <- function(z, w, minsize, maxnomsplit) { + + zdlev <- 1 * (levels(z) %in% levels(droplevels(z))) + if (sum(zdlev) < maxnomsplit) { + + ## exhaustive search + rval <- .Call("tvcm_nomsplits", + as.integer(zdlev), + PACKAGE = "vcrpart") + type <- "dev" + + } else { + + ## Heuristic reduction of splits: in tvcm_grow_gridsearch, + ## the 'isolated' coefficients of each category are + ## computed. The coefficients are used for ordering + ## the categories and finally the variable is treated + ## as ordinal. 
See tvcm_grow_gridsearch + + rval <- diag(nlevels(z)) + type <- "coef" + } + + ## remove splits according to 'minsize' + sumWTot <- sum(w) + sumWCat <- tapply(w, z, sum) + sumWCat[is.na(sumWCat)] <- 0 + valid <- apply(rval, 1, function(x) { + all(c(sum(sumWCat[x > 0]), sumWTot - sum(sumWCat[x > 0])) > minsize) + }) + rval <- rval[valid,, drop = FALSE] + + ## return matrix with cutpoints + colnames(rval) <- levels(z) + attr(rval, "type") <- type + return(rval) +} + + ##'-------------------------------------------------------- # ##' Computes candidate splits for the current step ##' @@ -649,9 +882,9 @@ tvcm_grow_gefp <- gefp.olmm # see 'olmm-methods.R' ##' @details Used in 'tvcm'. ##'-------------------------------------------------------- # -tvcm_grow_setsplits <- function(splits, spart, partid, - nodeid, varid, model, nodes, - where, partData, control) { +tvcm_grow_setsplits <- function(splits, partid, nodeid, varid, + model, nodes, where, + partData, control) { ## get tree size criteria of current tree(s) width <- sapply(nodes, width) @@ -660,207 +893,71 @@ tvcm_grow_setsplits <- function(splits, spart, partid, info_node(node)$depth })) }) w <- weights(model) - - ##'------------------------------------------------------ # - ##' 'getSplits' extracts splits for each combination of - ##' partition, variable and node. - ##' - ##' @param pid integer. The partition identification number. - ##' @param vid integer. The row of the variable in 'partData'. - ##' @param nid integer. The node identification number. - ##' For example, if the tree has terminal nodes with - ##' identification numbers 2, 4 and 5, the index 1 is used - ##' to get splits in node 2, index 2 for node 4 and so on. - ##' - ##' @return a matrix with one row for each split. - ##'------------------------------------------------------ # getSplits <- function(pid, nid, vid) { - ## get subset + ## prepare data subs <- where[[pid]] == levels(where[[pid]])[nid] - - ## get partitioning variable - z <- partData[, vid] + z <- partData[subs, vid] + w <- w[subs] - ## return 'NULL' if tree classical tree growing parameters exceeded if (width[pid] >= control$maxwidth[pid] | depth[[pid]][nid] >= control$maxdepth[pid] | sum(subs) < 1L | - sum(w[subs]) < 2 * control$minsize[pid]) { - rval <- matrix(, 0, ifelse(is.numeric(z), 3L, nlevels(z) + 2L)) - colnames(rval) <- c(if (is.numeric(z)) "cut" else levels(z), - "lossred", "df") - attr(rval, "type") <- "loss" - attr(rval, "keeplosscount") <- 0 - return(rval) - } - type <- "loss" + sum(w) < 2 * control$minsize[pid]) { + + ## return an empty matrix if control parameters exceeded + rval <- matrix(, 0, ifelse(is.numeric(z), 1L, nlevels(z))) + colnames(rval) <- if (is.numeric(z)) "cut" else levels(z) + attr(rval, "type") <- "dev" - if (is.numeric(z)) { - ## continuous variables - - sz <- z[subs] - - ## get all splits that satisfy the 'minsize' criteria - subsL <- rev(which(cumsum(w[subs][order(sz)]) < control$minsize[pid]))[1L] - subsR <- which(cumsum(w[subs][order(sz)]) > - (sum(w[subs]) - control$minsize[pid]))[1L] - - if (subsL <= subsR && sort(sz)[subsL] < sort(sz)[subsR]) { - sz <- sort(sz) - sz <- sz[sz >= sz[subsL] & sz < sz[subsR]] - rval <- unique(sz) - - ## reduce the number of splits according to 'control$maxevalsplit' - if ((length(rval) - 1) > control$maxnumsplit) { - nq <- control$maxnumsplit - 1L - rval <- c() - while (length(rval) < (control$maxnumsplit + 1L)) { - nq <- nq + 1L - rval <- unique(quantile(sz, c((1:nq) / (nq + 1L)), 1), type = 1L) - } - rval <- rval[-length(rval)] - } - 
rval <- cbind(rval, rep.int(NA, length(rval)), rep.int(NA, length(rval))) - - } else { - rval <- matrix(, 0L, 3L) - } - colnames(rval) <- c("cut", "lossred", "df") - - } else if (is.factor(z)) { # categorical variables - - nl <- nlevels(z) # all levels - nld <- nlevels(droplevels(z[subs])) # observed levels in current node - zdlev <- which(levels(z) %in% levels(droplevels(z[subs]))) - - if (is.ordered(z)) { - ## ordinal variables - - rval <- diag(nl) - rval[lower.tri(rval)] <- 1L - rval <- rval[-nl,, drop = FALSE] - - } else { - ## nominal variables - - if (nld <= control$maxfacsplit) { - - ## exhaustive search - mi <- 2^(nld - 1L) - 1L - rval <- .Call("tvcm_nomsplits", - as.integer(nl), - as.integer(zdlev), as.integer(nld), - as.integer(mi), PACKAGE = "vcrpart") - rval <- matrix(rval, mi, nl, byrow = TRUE) - - } else { - - ## Heuristic reduction of splits: in tvcm_grow_gridsearch, - ## the 'isolated' coefficients of each category are - ## computed. The coefficients are used for ordering - ## the categories and finally the variable is treated - ## as ordinal. See tvcm_grow_gridsearch - rval <- diag(nl) - type <- "coef" - } - } - - ## delete indadmissible splits - valid <- tvcm_setsplits_validcats(rval, z, w, subs, control$minsize[pid]) - rval <- rval[valid,, drop = FALSE] - - ## delete ordinal splits if 'maxordsplit' is exceeded - if (is.ordered(z) && nrow(rval) > control$maxordsplit) { - nCat <- apply(rval, 1, function(x) sum(subs & z %in% levels(z)[x > 0L])) - nCat <- round(c(nCat[1L], diff(nCat))) - zd <- rep.int(1:nrow(rval), nCat) - nq <- control$maxordsplit - 1L; rows <- 1; - while(length(rows) < control$maxordsplit) { - nq <- nq + 1L - rows <- unique(quantile(zd, (1:nq) / (nq + 1L)), type = 1L) - } - rval <- rval[rows,,drop=FALSE] - } - - rval <- cbind(rval, rep.int(NA, nrow(rval)), rep.int(NA, nrow(rval))) - colnames(rval) <- c(levels(z), "lossred", "df") - } else { - - rval <- matrix(, 0L, 3L) - colnames(rval) <- c("cut", "lossred", "df") + + ## compute matrix according to their scale + rval <- switch(class(z)[1L], + numeric = tvcm_getNumSplits(z, w, + control$minsize[pid], + control$maxnumsplit), + integer = tvcm_getNumSplits(z, w, + control$minsize[pid], + control$maxnumsplit), + ordered = tvcm_getOrdSplits(z, w, + control$minsize[pid], + control$maxordsplit), + factor = tvcm_getNomSplits(z, w, + control$minsize[pid], + control$maxnomsplit), + stop("class of variable '", + colnames(partData)[vid], + "' not recognized")) } - attr(rval, "type") <- type - attr(rval, "keeplosscount") <- 0 return(rval) } ## compute splits in all partitions, partitioning variables and nodes - rval <- vector("list", length(partid)) for (pid in seq_along(partid)) { - rval[[pid]] <- vector("list", length(nodeid[[partid[pid]]])) for (nid in seq_along(nodeid[[partid[pid]]])) { - rval[[pid]][[nid]] <- vector("list", length(varid[[partid[pid]]])) for (vid in seq_along(varid[[partid[pid]]])) { split <- splits[[pid]][[nid]][[vid]] - if (is.null(split) | !is.null(split) && attr(split, "type") == "coef" | + if (is.null(split[[1L]]) | + (!is.null(split[[1L]]) && attr(split[[1L]], "type") == "coef") | width[pid] >= control$maxwidth[pid]) { - split <- getSplits(partid[pid], - nodeid[[partid[pid]]][nid], - varid[[partid[pid]]][vid]) + + split[[1L]] <- getSplits(partid[pid], + nodeid[[partid[pid]]][nid], + varid[[partid[pid]]][vid]) + split[[2L]] <- split[[3L]] <- rep(NA, nrow(split[[1L]])) } else { - if (nrow(split) > 0L && - (spart != partid[pid] | - attr(split, "keeplosscount") >= control$keeploss)) { - 
attr(split, "keeplosscount") <- 0 - split[, c("lossred", "df")] <- NA - } else { - attr(split, "keeplosscount") <- attr(split, "keeplosscount") + 1L - } + if (nrow(split[[1]]) > 0L) + split[[2L]][] <- split[[3L]][] <- NA } - rval[[pid]][[nid]][[vid]] <- split + splits[[pid]][[nid]][[vid]] <- split } } } ## return list of updated 'splits' - return(rval) -} - - -##'------------------------------------------------------ # -##' Computes the valid categorical splits defined in a -##' integer matrix with respect to the \code{minsize} -##' criteria -##' -##' @param cp an integer matrix that defines the splits -##' according to the levels in \code{z}. -##' @param z the categorical (nominal or ordered) partitioning -##' variable. -##' @param weights a weights vector -##' @param subs a logical vector that extracts the current -##' node. -##' @param minsize the minimum number of splits in a node -##' -##' @return a logical vector of length equal to the number -##' of rows of \code{cp}. -##'------------------------------------------------------ # - -tvcm_setsplits_validcats <- function(cp, z, weights, subs, minsize) { - - ## delete cuts that do not satisfy 'minsize' - rval <- rep.int(TRUE, nrow(cp)) - for (i in seq_along(rval)) { - Node <- factor(1 * (z[subs] %in% levels(z)[cp[i,]==1L])) - if (nlevels(Node) == 1L | (nlevels(Node) > 1L && - any(tapply(weights[subs], Node, sum) < minsize))) - rval[i] <- FALSE - } - - ## return logical vector - return(rval) + return(splits) } @@ -886,15 +983,15 @@ tvcm_setsplits_validcats <- function(cp, z, weights, subs, minsize) { tvcm_setsplits_sctest <- function(splits, partid, spart, nodeid, snode, varid, svar) { - + ## set loss of not selected parts to -Inf for (pid in seq_along(partid)) for (nid in seq_along(nodeid[[pid]])) for (vid in seq_along(varid[[pid]])) { if (pid == spart & nid == snode & vid == svar) { - splits[[pid]][[nid]][[vid]][, "lossred"] <- NA + splits[[pid]][[nid]][[vid]][[2L]][] <- NA } else { - splits[[pid]][[nid]][[vid]][, "lossred"] <- -Inf + splits[[pid]][[nid]][[vid]][[2L]][] <- -Inf } } ## return updated 'splits' @@ -918,13 +1015,9 @@ tvcm_setsplits_sctest <- function(splits, partid, spart, ##' @return An modified list of splits. ##' ##' @details Used in 'tvcm'. -##' -##' To do: -##' 2014-07-21: find a better rule! (function is not used currently!) 
##'------------------------------------------------------ # -tvcm_setsplits_splitnode <- function(splits, spart, snode, - nodeid, loss, model, control) { +tvcm_setsplits_splitnode <- function(splits, spart, snode, nodeid) { ## expand the splits list lnodes <- nodeid[[spart]][nodeid[[spart]] < snode] @@ -933,6 +1026,9 @@ tvcm_setsplits_splitnode <- function(splits, spart, snode, split <- vector("list", length(split0) + 1L) if (length(lnodes) > 0L) split[lnodes] <- split0[lnodes] if (length(unodes) > 0L) split[unodes + 1L] <- split0[unodes] + nvar <- length(split0[[snode]]) + split[[snode]] <- split[[snode + 1]] <- + replicate(nvar, vector("list", 3L), simplify = FALSE) splits[[spart]] <- split ## return updated 'splits' @@ -997,7 +1093,7 @@ tvcm_setsplits_rselect <- function(splits, partid, nodeid, varid, control) { for (vid in seq_along(varid[[pid]])) if (nrow(splits[[pid]][[nid]][[vid]]) > 0 && !(pid %in% spart & vid %in% svar[[pid]] & nid %in% snode[[pid]])) - splits[[pid]][[nid]][[vid]][, "lossred"] <- -Inf + splits[[pid]][[nid]][[vid]][[2L]][] <- -Inf ## return updated 'splits' return(splits) @@ -1024,7 +1120,7 @@ tvcm_setsplits_rselect <- function(splits, partid, nodeid, varid, control) { ##'------------------------------------------------------ # tvcm_grow_sctest <- function(model, nodes, where, partid, nodeid, varid, - splits, partData, control) { + splits, partData, control) { ## get variable types functional <- sapply(partData, function(x) { @@ -1038,19 +1134,25 @@ tvcm_grow_sctest <- function(model, nodes, where, partid, nodeid, varid, ## prepare list with arguments for 'sctest' rval <- vector("list", length(nodes)) for (pid in seq_along(nodes)) { - dim <- c(nlevels(where[[pid]]), length(varid[[pid]]), control$ninpute) + dim <- c(nlevels(where[[pid]]), length(varid[[pid]]), control$nimpute) dn <- list(paste("Node", LETTERS[pid], levels(where[[pid]]), sep = ""), - colnames(partData)[varid[[pid]]], 1:control$ninpute) + colnames(partData)[varid[[pid]]], 1:control$nimpute) rval[[pid]] <- array(, dim = dim, dimnames = dn) } - ## call 'estfun' - eCall <- list(name = as.name(ifelse(inherits(model, "olmm"),"estfun.olmm", "estfun"))) + ## call 'estfun' (eventually parallelized) + eCall <- + list(name = as.name(ifelse(inherits(model, "olmm"),"estfun.olmm", "estfun"))) eCall$x <- quote(model) - eCall[names(control$estfun)] <- control$estfun + eCall[names(control$estfun.args)] <- control$estfun.args mode(eCall) <- "call" - scores <- replicate(control$ninpute, eval(eCall)) - + pCall <- list(name = as.name(control$papply), + X = quote(seq(1L, control$nimpute, 1L)), + FUN = quote(function(i) eval(eCall))) + pCall[names(control$papply.args)] <- control$papply.args + mode(pCall) <- "call" + scores <- simplify2array(eval(pCall)) + ## set the 'gefp' call (which is called in each iteration below) gCall <- call(name = "tvcm_grow_gefp", object = quote(model), scores = quote(sc), @@ -1073,7 +1175,7 @@ tvcm_grow_sctest <- function(model, nodes, where, partid, nodeid, varid, for (nid in seq_along(nodeid[[pid]])) { # loop over nodes ## check if there is a permitted split - if (length(splits[[pid]][[nid]][[vid]]) > 0L) { + if (nrow(splits[[pid]][[nid]][[vid]][[1L]]) > 0L) { ## observations of the current partition rows <- where[[pid]] == levels(where[[pid]])[nid] @@ -1087,7 +1189,7 @@ tvcm_grow_sctest <- function(model, nodes, where, partid, nodeid, varid, cols <- dimnames(scores)[[2L]][cols] } - for (k in 1:control$ninpute) { + for (k in 1:control$nimpute) { sc <- matrix(scores[,,k,drop=FALSE], 
dim(scores)[1], dim(scores)[2], dimnames = dimnames(scores)[1L:2L]) gefp <- try(eval(gCall), TRUE) @@ -1160,15 +1262,15 @@ tvcm_sctest_bonf <- function(test, type) { ##' @return A nested list with loss matrices. Partitions of nodes ##' are nested in partitions for variables. ##' -##' @details Used in 'tvcm'. 'tvcm_grow_lossRed' is a help +##' @details Used in 'tvcm'. 'tvcm_grow_dev' is a help ##' function of 'tvcm_grow_gridsearch' ##'-------------------------------------------------------- # -tvcm_grow_lossred <- function(cutpoint, type = "loss", - pid, nid, vid, - model, modelNuis, nuisance, - where, partData, - control, loss0, mfName) { +tvcm_grow_dev <- function(cutpoint, type = "dev", + pid, nid, vid, + model, modelNuis, nuisance, + where, partData, + control, loss0, mfName) { ## set node indicator subs <- where[[pid]] == levels(where[[pid]])[nid] @@ -1183,11 +1285,11 @@ tvcm_grow_lossred <- function(cutpoint, type = "loss", parm <- grep("Left", names(coef(model)), value = TRUE) ## fit the 'update' model - model <- tvcm_grow_update(model) + model <- tvcm_grow_update(model, control) - if (type == "loss") { + if (type == "dev") { if (!inherits(model, "try-error")) { - rval <- c(loss0 - control$lossfun(model), + rval <- c((loss0 - control$lossfun(model)), length(coef(model)[grep("Left", names(coef(model)))]) - length(nuisance)) if (is.null(modelNuis)) { @@ -1195,7 +1297,7 @@ tvcm_grow_lossred <- function(cutpoint, type = "loss", } else { modelNuis[[mfName]]$Left <- 1 * (subs & zs) modelNuis[[mfName]]$Right <- 1 * (subs & !zs) - modelNuis <- tvcm_grow_update(modelNuis) + modelNuis <- tvcm_grow_update(modelNuis, control) rval[1L] <- rval[1L] - (loss0 - control$lossfun(modelNuis)) return(rval) } @@ -1243,7 +1345,6 @@ tvcm_grow_gridsearch <- function(splits, partid, nodeid, varid, Right <- Left - 1 for (pid in seq_along(partid)) { - if (length(unlist(splits[[pid]])) > 0L) { mcall$formula <- ff$update[[pid]][[1L]] @@ -1268,24 +1369,27 @@ tvcm_grow_gridsearch <- function(splits, partid, nodeid, varid, for (nid in seq_along(splits[[pid]])) { if (length(unlist(splits[[pid]][[nid]])) > 0L) { for (vid in seq_along(splits[[pid]][[nid]])) { - cp <- splits[[pid]][[nid]][[vid]] + + cp <- splits[[pid]][[nid]][[vid]][[1L]] type <- attr(cp, "type") - subs <- is.na(cp[, "lossred"]) - cp <- cp[, !colnames(cp) %in% c("lossred", "df"), drop = FALSE] - if (any(subs)) { - st <- apply(cp, 1, tvcm_grow_lossred, type = type, + subs <- is.na(splits[[pid]][[nid]][[vid]][[2L]]) + cp <- cp[subs,, drop = FALSE] + if (nrow(cp) > 0L) { + st <- apply(cp, 1, tvcm_grow_dev, type = type, pid = partid[pid], nid = nodeid[[partid[pid]]][nid], vid = varid[[partid[pid]]][vid], model = sModel, modelNuis = sModelN, nuisance = control$nuisance[[pid]], where = where, partData = partData, - control = control, loss0 = loss0, mfName = mfName) + control = control, loss0 = loss0, + mfName = mfName) if (is.matrix(st)) st <- t(st) else st <- matrix(st, ncol = 1L) - if (type == "loss") { - splits[[pid]][[nid]][[vid]][subs, c("lossred", "df")] <- st - + if (type == "dev") { + splits[[pid]][[nid]][[vid]][[2L]][subs] <- st[, 1L] + splits[[pid]][[nid]][[vid]][[3L]][subs] <- st[, 2L] + } else if (type == "coef") { ## if 'z' is a nominal variable with many categories @@ -1301,35 +1405,21 @@ tvcm_grow_gridsearch <- function(splits, partid, nodeid, varid, score <- rep.int(0, nlevels(z)) score[colSums(cp) > 0] <- prcomp(st)$x[,1] - ## define 'z' as ordinal and retrieve the splits - zd <- factor(z, levels = levels(z)[order(score)], ordered = TRUE) - 
nl <- nlevels(zd) - cp <- diag(nl) - cp[lower.tri(cp)] <- 1L - cp <- cp[-nl,, drop = FALSE] - valid <- - tvcm_setsplits_validcats(cp, zd, w, subs, - control$minsize[partid[pid]]) - cp <- cp[valid,, drop = FALSE] + ## define 'z' as ordinal and retrieve the splits + z <- ordered(z, levels = levels(z)[order(score)]) + cp <- tvcm_getOrdSplits(z[subs], w[subs], + control$minsize[partid[pid]], + control$maxordsplit) + ## reorder the columns of 'cp' acc. to the original categories cp <- cp[,order(order(score)), drop = FALSE] - if (nrow(cp) > control$maxordsplit) { - nCat <- - apply(cp, 1, function(x) sum(subs & z %in% levels(z)[x > 0L])) - nCat <- round(c(nCat[1L], diff(nCat))) - zd <- rep.int(1:nrow(cp), nCat) - nq <- control$maxordsplit - 1L; rows <- 1; - while(length(rows) < control$maxordsplit) { - nq <- nq + 1L - rows <- unique(quantile(zd, (1:nq) / (nq + 1L)), type = 1L) - } - cp <- cp[rows,,drop=FALSE] - } + ## ensures that split is newly built + attr(cp, "type") <- "coef" ## compute the loss of the new splits - st <- apply(cp, 1, tvcm_grow_lossred, type = "loss", + st <- apply(cp, 1, tvcm_grow_dev, type = "dev", pid = partid[pid], nid = nodeid[[partid[pid]]][nid], vid = varid[[partid[pid]]][vid], @@ -1337,13 +1427,11 @@ tvcm_grow_gridsearch <- function(splits, partid, nodeid, varid, modelNuis = sModelN, nuisance = control$nuisance[[pid]], where = where, partData = partData, - control = control, loss0 = loss0, mfName = mfName) + control = control, loss0 = loss0, + mfName = mfName) if (is.matrix(st)) st <- t(st) else st <- matrix(st, ncol = 2L) - split <- cbind(cp, st) - colnames(split) <- c(levels(z), "lossred", "df") - attr(split, "type") <- "coef" - splits[[pid]][[nid]][[vid]] <- split + splits[[pid]][[nid]][[vid]] <- list(cp, st[, 1L], st[, 2L]) } } } @@ -1352,42 +1440,50 @@ tvcm_grow_gridsearch <- function(splits, partid, nodeid, varid, } } - ## function that extracts the loss reduction (eventually corrected by the - ## number of predictors) - getPenLossRed <- function(x) { - if (is.list(x)) return(lapply(x, getPenLossRed)) - if (is.matrix(x)) - if (nrow(x) > 0L) - return(x[, "lossred"] - control$dfpar * x[, "df"]) else return(numeric()) - return(x) - } - loss <- getPenLossRed(splits) + ## extracts the penalized loss reduction + pendev <- lapply(splits, function(part) { + ## partition level + lapply(part, function(node) { + ## node level + lapply(node, function(var) { + ## variable level + if (length(var[[2L]]) > 0L) { + return(var[[2L]] - control$cp * + tvcm_complexity(var[[3]], control$dfpar, 1, control$dfsplit)) + } else { + return(numeric()) + } + }) + }) + }) + - ## function that extracts the maximum loss reduction - getMaxPenLossRed <- function(x) { + ## function to extract the maximum loss reduction for each part/node/var + getMaxPenDev <- function(x) { x <- unlist(x) if (length(x) == 0L) return(-Inf) x <- na.omit(x) if (length(x) > 0L) return(max(x)) else return(-Inf) } - maxLossDiff <- max(c(-Inf, na.omit(unlist(loss)))) + ## the maximum loss reduction + maxpendev <- max(c(-Inf, na.omit(unlist(pendev)))) - if (maxLossDiff > -Inf) { + if (maxpendev > -Inf) { ## select the partition, node and variable - spart <- which(sapply(sapply(loss, getMaxPenLossRed), identical, maxLossDiff)) + spart <- which(sapply(sapply(pendev, getMaxPenDev), identical, maxpendev)) if (length(spart) > 1L) spart <- sample(spart, 1L) snode <- - which(sapply(sapply(loss[[spart]], getMaxPenLossRed), identical, maxLossDiff)) + which(sapply(sapply(pendev[[spart]], getMaxPenDev), identical, maxpendev)) if 
(length(snode) > 1L) snode <- sample(snode, 1L) - svar <- which(sapply(sapply(loss[[spart]][[snode]], getMaxPenLossRed), - identical, maxLossDiff)) + svar <- which(sapply(sapply(pendev[[spart]][[snode]], getMaxPenDev), + identical, maxpendev)) if (length(svar) > 1L) svar <- sample(svar, 1L) ## select the cut stat <- splits[[spart]][[snode]][[svar]] - cutid <- which(stat[, "lossred"] == max(stat[, "lossred"])) + cutid <- which(stat[[2L]] == max(stat[[2L]], na.rm = TRUE)) if (length(cutid) > 1L) cutid <- sample(cutid, 1L) if (verbose) cat("OK") @@ -1396,18 +1492,18 @@ tvcm_grow_gridsearch <- function(splits, partid, nodeid, varid, nodeid = nodeid[[partid[spart]]][snode], varid = varid[[partid[spart]]][svar], cutid = cutid, - cut = stat[cutid, !colnames(stat) %in% c("lossred", "df")], - lossred = as.numeric(stat[cutid, "lossred"]), - plossred = maxLossDiff, - df = as.numeric(stat[cutid, "df"]), - lossgrid = splits)) + cut = stat[[1L]][cutid, ], + dev = as.numeric(stat[[2L]][cutid]), + pendev = maxpendev, + npar = as.numeric(stat[[3L]][cutid]), + grid = splits)) } else { if (verbose) cat("failed") return(list(partid = NULL, nodeid = NULL, varid = NULL, - cutid = NULL, cut = NULL, lossred = NULL, - plossred = NULL, lossgrid = splits)) + cutid = NULL, cut = NULL, dev = NULL, + pendev = NULL, grid = splits)) } } @@ -1415,7 +1511,7 @@ tvcm_grow_gridsearch <- function(splits, partid, nodeid, varid, ##'-------------------------------------------------------- # ##' Incorporates a new binary split into an existing -##' tree structire. +##' tree structure. ##' ##' @param nodes an object of class 'partynode'. ##' @param loss a list produced by 'tvcm_grow_gridsearch'. @@ -1427,14 +1523,13 @@ tvcm_grow_gridsearch <- function(splits, partid, nodeid, varid, ##' Used in 'tvcm'. ##'-------------------------------------------------------- # -tvcm_grow_splitnode <- function(nodes, where, loss, partData, step, weights) { +tvcm_grow_splitnode <- function(nodes, where, dev, partData, step, weights) { - pid <- loss$partid - nid <- loss$nodeid - vid <- loss$varid + pid <- dev$partid + nid <- dev$nodeid + vid <- dev$varid nidLab <- nodeids(nodes[[pid]], terminal = TRUE)[nid] - stat <- loss$loss - cut <- loss$cut + cut <- dev$cut x <- partData[, vid] ## collect information for the split @@ -1517,6 +1612,9 @@ tvcm_grow_splitnode <- function(nodes, where, loss, partData, step, weights) { ##' @param family an object of class 'family' or 'family.olmm'. ##' @param env the environment where the output formula ##' is to be evaluated. +##' @param full whether the full formula should be derived +##' @param update whether the formula for the update model +##' should be derived. ##' @return A list of formulas ('root', 'tree' and 'original'). 
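+##'
+##' For orientation, a hypothetical shape of the returned list for a
+##' model with a single vc() term (term names are illustrative only):
+##' list(root = y ~ x, tree = y ~ NodeA:x, original = y ~ vc(z, by = x))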
##' ##' @details Used in \code{\link{predict.fvcm}} and @@ -1791,9 +1889,9 @@ tvcm_grow_setcontrol <- function(control, model, formList, root, parm.only = TRU ## set 'nuisance' slots control$nuisance <- lapply(formList$vc, function(x) x$nuisance) - control$estfun$nuisance <- - unique(c(control$estfun$nuisance, - setdiff(names(coef(model)), unlist(control$parm)))) + control$estfun.args$nuisance <- + unique(c(control$estfun.args$nuisance, + setdiff(names(coef(model)), names(fixef(model))))) return(control) } @@ -1867,7 +1965,7 @@ tvcm_get_terms <- function(names, ids, parm) { parm <- lapply(parm, function(x) { lapply(x, function(x) x[grepl("Node[A-Z]", x)]) - }) + }) if (any(unlist(ids) == 1L)) { getNames <- function(x) { @@ -1990,8 +2088,8 @@ tvcm_get_vcparm <- function(object) { tvcm_get_estimates <- function(object, what = c("coef", "sd", "var"), ...) { what <- match.arg(what) - model <- extract(object, "model") - + model <- object$info$model + rval <- list(fe = numeric(), vc = replicate(length(object$info$node), matrix(,0,0)), re = numeric()) @@ -2004,66 +2102,73 @@ tvcm_get_estimates <- function(object, what = c("coef", "sd", "var"), ...) { var = diag(vcov(model))) ids <- lapply(object$info$node, nodeids, terminal = TRUE) - + formList <- object$info$formula - + + ## the terms for which coefficients exist termsC <- tvcm_get_terms(names(coef(model)), ids, object$info$control$parm) + + ## the terms for which estimates for 'type' exist termsE <- tvcm_get_terms(names(estimates), ids, object$info$control$parm) - + ## restricted coefficients if (any(termsE$type == "fe")) rval$fe <- estimates[termsE$type == "fe"] - + ## random effects if (any(termsE$type == "re")) rval$re <- estimates[termsE$type == "re"] - + ## varying coefficients if (any(termsE$type == "vc")) { - + for (pid in seq_along(object$info$node)) { - - nnodes <- length(ids[[pid]]) # number of nodes ## extract the terms corresponding to the partition vcTermsC <- unique(termsC$terms[termsC$partition == LETTERS[pid]]) vcTermsE <- unique(termsE$terms[termsE$partition == LETTERS[pid]]) - - ## build a matrix to store the coefficients - rval$vc[[pid]] <- matrix(, nnodes, length(vcTermsC)) - rownames(rval$vc[[pid]]) <- ids[[pid]] - colnames(rval$vc[[pid]]) <- vcTermsC - - ## fill the matrix - for (i in seq_along(vcTermsC)) { - subs <- termsE$terms == vcTermsC[i] & termsE$partition == LETTERS[pid] - rval$vc[[pid]][termsE$node[subs], i] <- estimates[subs] - } - - ## add colnames for varying intercepts - subs <- which(colnames(rval$vc[[pid]]) %in% "") - if (length(subs) > 0L) colnames(rval$vc[[pid]])[subs] <- "(Intercept)" - if (ncol(rval$vc[[pid]]) > 0 & inherits(object$info$family, "family.olmm")) { - tmp <- strsplit(colnames(rval$vc[[pid]]), ":") - subs <- sapply(tmp, length) == 1L & - sapply(tmp, function(x) substr(x[1L], 1L, 3L) == "Eta") - colnames(rval$vc[[pid]])[subs] <- - paste(colnames(rval$vc[[pid]])[subs], "(Intercept)", sep = ":") - } - - ## fill the last row if necessary - subs <- is.na(rval$vc[[pid]][nnodes, ]) - if (any(subs)) { + + ## make a matrix only if specific terms exist for the partition + if (length(vcTermsC) > 0L) { - ## compute the coefficients of the omitted node - con <- model$contrasts[[paste("Node", LETTERS[pid], - sep = "")]][nnodes, ] - for (i in which(subs)) { - rval$vc[[pid]][nnodes, i] <- - switch(what, - coef = sum(con * rval$vc[[pid]][-nnodes, i], na.rm = TRUE), - sd = sum(con^2 * rval$vc[[pid]][-nnodes, i], na.rm = TRUE), - var = sum(con^2 * rval$vc[[pid]][-nnodes, i], na.rm = TRUE)) + nnodes <- 
length(ids[[pid]]) # number of nodes + + ## build a matrix to store the coefficients + rval$vc[[pid]] <- matrix(, nnodes, length(vcTermsC)) + rownames(rval$vc[[pid]]) <- ids[[pid]] + colnames(rval$vc[[pid]]) <- vcTermsC + + ## fill the matrix + for (i in seq_along(vcTermsC)) { + subs <- termsE$terms == vcTermsC[i] & termsE$partition == LETTERS[pid] + rval$vc[[pid]][termsE$node[subs], i] <- estimates[subs] + } + + ## add colnames for varying intercepts + subs <- which(colnames(rval$vc[[pid]]) %in% "") + if (length(subs) > 0L) colnames(rval$vc[[pid]])[subs] <- "(Intercept)" + if (ncol(rval$vc[[pid]]) > 0 & inherits(object$info$family, "family.olmm")) { + tmp <- strsplit(colnames(rval$vc[[pid]]), ":") + subs <- sapply(tmp, length) == 1L & + sapply(tmp, function(x) substr(x[1L], 1L, 3L) == "Eta") + colnames(rval$vc[[pid]])[subs] <- + paste(colnames(rval$vc[[pid]])[subs], "(Intercept)", sep = ":") + } + + ## fill the last row if necessary + subs <- is.na(rval$vc[[pid]][nnodes, ]) + if (any(subs)) { + + ## compute the coefficients of the omitted node + con <- model$contrasts[[paste("Node", LETTERS[pid], + sep = "")]][nnodes, ] + for (i in which(subs)) { + rval$vc[[pid]][nnodes, i] <- + switch(what, + coef = sum(con * rval$vc[[pid]][-nnodes, i], na.rm = TRUE), + sd = sum(con^2 * rval$vc[[pid]][-nnodes, i], na.rm = TRUE), + var = sum(con^2 * rval$vc[[pid]][-nnodes, i], na.rm = TRUE)) + } } } } @@ -2072,16 +2177,15 @@ tvcm_get_estimates <- function(object, what = c("coef", "sd", "var"), ...) { getSqrt <- function(x) { if (is.list(x)) { return(lapply(x, getSqrt)) - } else if (!is.null(x)) { - if (length(x) == 0) return(NA) else return(sqrt(x)) + } else if (length(x) > 0) { + return(sqrt(x)) } else { return(x) } } - rval <- lapply(rval, getSqrt) - + rval <- lapply(rval, getSqrt) } -} + } return(rval) } @@ -2097,20 +2201,19 @@ tvcm_get_estimates <- function(object, what = c("coef", "sd", "var"), ...) { ##' @details Used in 'tvcm_print' and 'plot.tvcm'. 
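+##'
+##' The function now takes the formula list rather than the fitted
+##' object (see the changed signature below); a sketch of the adapted
+##' call as used in 'plot.tvcm':
+##' labs <- tvcm_print_vclabs(object$info$formula)
+##' ## e.g. c("vc(z1, ..., by = x1)", "vc(z2, by = x2)")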
##'-------------------------------------------------------- # -tvcm_print_vclabs <- function(object) { +tvcm_print_vclabs <- function(formList) { - formula <- object$info$formula - if (length(formula$vc) == 0) return(NULL) + if (length(formList$vc) == 0) return(NULL) ## conditioning variables - cond <- lapply(formula$vc, function(x) { + cond <- lapply(formList$vc, function(x) { rval <- all.vars(x$cond) if (length(rval) > 2L) rval <- c(rval[1L], "...") return(rval) }) ## 'by' terms - vcLabs <- terms(formula$original, specials = "vc") + vcLabs <- terms(formList$original, specials = "vc") if (length(attr(vcLabs, "specials")$vc) == 0L) return(NULL) vcLabs <- rownames(attr(vcLabs, "factors"))[attr(vcLabs, "specials")$vc] vcLabs <- paste("getBy", vcLabs, sep = "_") @@ -2122,7 +2225,7 @@ tvcm_print_vclabs <- function(object) { by <- sapply(vcLabs, function(x) eval(parse(text = x))) ## collapse the short labels - rval <- rep.int("vc(", length(formula$vc)) + rval <- rep.int("vc(", length(formList$vc)) for (pid in seq_along(rval)) { if (length(cond) > 0L) rval[pid] <- paste(rval[pid], paste(cond[[pid]], collapse = ", "), sep = "") @@ -2162,8 +2265,6 @@ tvcm_prune_node <- function(object, alpha = NULL, maxstep = NULL, terminal = NUL if (all(c(is.null(alpha), is.null(maxstep), is.null(terminal)))) return(object$info$node) - control <- extract(object, "control") - if (!is.null(alpha) && depth(rval[[pid]]) > 0L) { splitpath <- object$info$splitpath p.value <- extract(object, "p.value") @@ -2176,6 +2277,7 @@ tvcm_prune_node <- function(object, alpha = NULL, maxstep = NULL, terminal = NUL for (pid in seq_along(rval)) { ## prune the tree + if (!is.null(maxstep)) rval[[pid]] <- tvcm_prune_maxstep(rval[[pid]], maxstep) @@ -2325,14 +2427,14 @@ tvcm_grow_splitpath <- function(splitpath, varid, nodes, partData, control) { } - ## change the names of the loss grid elements !!! - if (!is.null(splitpath[[step]]$lossgrid)) { - names(splitpath[[step]]$lossgrid) <- LETTERS[seq_along(nodes)] - for (pid in seq_along(splitpath[[step]]$lossgrid)) { - names(splitpath[[step]]$lossgrid[[pid]]) <- + ## change the names of the dev grid elements !!! 
+ if (!is.null(splitpath[[step]]$grid)) { + names(splitpath[[step]]$grid) <- LETTERS[seq_along(nodes)] + for (pid in seq_along(splitpath[[step]]$grid)) { + names(splitpath[[step]]$grid[[pid]]) <- paste("Node", kidids[[pid]], sep = "") - for (nid in seq_along(splitpath[[step]]$lossgrid[[pid]])) { - names(splitpath[[step]]$lossgrid[[pid]][[nid]]) <- + for (nid in seq_along(splitpath[[step]]$grid[[pid]])) { + names(splitpath[[step]]$grid[[pid]][[nid]]) <- colnames(partData)[varid[[pid]]] } } diff --git a/R/tvcm.R b/R/tvcm.R index 355cefd..f42e0db 100644 --- a/R/tvcm.R +++ b/R/tvcm.R @@ -1,19 +1,31 @@ ##' -------------------------------------------------------- # ##' Author: Reto Buergin ##' E-Mail: reto.buergin@unige.ch, rbuergin@gmx.ch -##' Date: 2014-09-06 +##' Date: 2014-10-23 ##' ##' Description: ##' The 'tvcm' function ##' -##' tvcolmm convenience function for 'tvcm' -##' tvcglm convenience function for 'tvcm' -##' tvcm the main fitting function -##' tvcm_control control function for 'tvcm' +##' tvcolmm convenience function for 'tvcm' +##' tvcolmm_control control function for 'tvcolmm' +##' tvcglm convenience function for 'tvcm' +##' tvcglm_control control function for 'tvcglm' +##' tvcm the main fitting function +##' tvcm_control control function for 'tvcm' ##' ##' all functions are documented as *.Rd files ##' ##' Last modifications: +##' 2014-11-05: - set seed at start of 'tvcm' and re-establish old seed +##' at the end +##' 2014-10-23: - improved extraction of fitting arguments (see 'fitargs') +##' - added 'tvcolmm_control' and 'tvcglm_control' to better +##' distinguish between the articles. +##' 2014-09-20: - added argument 'nimpute' to 'tvcm_control' +##' 2014-09-19: - do not call 'cvloss' if no varying coefficients +##' 2014-09-17: - revised the definition of penalization +##' - deleted parameter 'maxoverstep' +##' - added parameters 'mindev', 'cp' ##' 2014-09-08: - resolved a problem with 'offset' ##' - removed the environment from the model, which ##' require a lot of memory @@ -24,7 +36,7 @@ ##' produced by 'vcrpart_formula'. The modification ##' was due to acceleration techniques ('vcrpart_formula' ##' is usually slow!) -##' 2014-08-29: - implement adjustment of loss reduction by number +##' 2014-08-29: - implement adjustment of deviance by number ##' of predictor of coefficient-group ##' 2014-07-31: - set 'sctest = FALSE' as the default ##' - return an error if multiple trees and 'sctest = TRUE' @@ -36,32 +48,68 @@ ##' and 'maxoverstep' ##' 2014-06-26: incorporate new function 'tvcm_grow_setsplits' ##' 2014-06-16: allow coefficient-wise trees +##' +##' To do: +##' - ##' -------------------------------------------------------- # tvcolmm <- function(formula, data, family = cumulative(), weights, subset, offset, na.action, - control = tvcm_control(), ...) { + control = tvcolmm_control(), ...) { mc <- match.call() mc[[1L]] <- as.name("tvcm") if (!"family" %in% names(mc) & (length(mc) < 4L | length(mc) >= 4L && !inherits(eval.parent(mc[[4L]]), "family.olmm"))) - mc$family <- formals(tvcolmm)$family + mc$family <- formals(tvcolmm)$family + if (!"control" %in% names(mc) & + (length(mc) < 9L | + length(mc) >= 9L && !inherits(eval.parent(mc[[4L]]), "tvcm_control"))) + mc$control <- formals(tvcolmm)$control mc$fit <- "olmm" return(eval.parent(mc)) } +tvcolmm_control <- function(alpha = 0.05, bonferroni = TRUE, minsize = 50, + maxnomsplit = 5, maxordsplit = 9, maxnumsplit = 9, + trim = 0.1, estfun.args = list(), nimpute = 5, + seed = NULL, ...)
{ + + mc <- match.call() + mc[[1L]] <- as.name("tvcm_control") + if (!"minsize" %in% names(mc) & length(mc) < 7L) + mc$minsize <- formals(tvcolmm_control)$minsize + mc$sctest <- TRUE + return(eval.parent(mc)) +} + + tvcglm <- function(formula, data, family, weights, subset, offset, na.action, - control = tvcm_control(), ...) { + control = tvcglm_control(), ...) { mc <- match.call() - mc[[1L]] <- as.name("tvcm") + mc[[1L]] <- as.name("tvcm") + if (!"control" %in% names(mc) & + (length(mc) < 9L | + length(mc) >= 9L && !inherits(eval.parent(mc[[4L]]), "tvcm_control"))) + mc$control <- formals(tvcglm)$control mc$fit <- "glm" return(eval.parent(mc)) } +tvcglm_control <- function(minsize = 30, mindev = 2.0, + maxnomsplit = 5, maxordsplit = 9, maxnumsplit = 9, + cv = TRUE, folds = folds_control("kfold", 5), + prune = cv, center = TRUE, ...) { + mc <- match.call() + mc[[1L]] <- as.name("tvcm_control") + return(eval.parent(mc)) +} + + + tvcm <- function(formula, data, fit, family, weights, subset, offset, na.action, control = tvcm_control(), ...) { @@ -73,6 +121,21 @@ tvcm <- function(formula, data, fit, family, if (control$verbose) cat("* checking arguments ... ") stopifnot(inherits(formula, "formula")) stopifnot(inherits(control, "tvcm_control")) + + ## set seed + if (!exists(".Random.seed", envir = .GlobalEnv)) runif(1) + oldSeed <- get(".Random.seed", mode="numeric", envir=globalenv()) + if (!is.null(control$seed)) set.seed(control$seed) + RNGstate <- .Random.seed + + ## check and set 'family' + if (missing(family)) stop("no 'family'.") + if (is.character(family)) { + family <- get(family, mode = "function", envir = parent.frame()) + } else if (is.function(family)) { + family <- family() + } + if (!class(family) %in% c("family", "family.olmm")) stop("'family' not recognized") ## check and set 'fit' if (missing(fit)) { @@ -85,15 +148,6 @@ tvcm <- function(formula, data, fit, family, if (is.function(fit)) fit <- deparse(mc$fit) } if (!fit %in% c("glm", "olmm")) stop("'fit' not recognized.") - - ## check and set 'family' - if (missing(family)) stop("no 'family'.") - if (is.character(family)) { - family <- get(family, mode = "function", envir = parent.frame()) - } else if (is.function(family)) { - family <- family() - } - if (!class(family) %in% c("family", "family.olmm")) stop("'family' not recognized") if (fit != "olmm") control$estfun <- NULL ## set formulas @@ -106,7 +160,8 @@ tvcm <- function(formula, data, fit, family, env <- environment(eval.parent(mc$formula)) formList <- vcrpart_formula(formula, family, env) nPart <- length(formList$vc) - + if (nPart < 1L) control$cv <- FALSE + direct <- any(sapply(formList$vc, function(x) x$direct)) if (length(direct) == 0L) direct <- FALSE control$direct <- direct @@ -140,7 +195,10 @@ tvcm <- function(formula, data, fit, family, data = quote(mf)) mce <- match.call(expand.dots = TRUE) dotargs <- setdiff(names(mce), names(mc)) - dotargs <- intersect(dotargs, names(formals(fit))) + fitargs <- + switch(fit,olmm = union(names(formals(olmm)), names(formals(olmm_control))), + glm = union(names(formals(glm)), names(formals(glm.control))), "") + dotargs <- intersect(fitargs, dotargs) dotargs <- setdiff(dotargs, names(mcall)) dotargs <- list(...)[dotargs] mcall[names(dotargs)] <- dotargs @@ -167,7 +225,8 @@ tvcm <- function(formula, data, fit, family, if (nPart > 1L) stop("coefficient constancy tests can be used only ", "if a single 'vc' term is specified.") - if (!is.null(formList$vc) && (formList$vc[[1L]]$direct & formList$fe$intercept != "none")) + if 
(!is.null(formList$vc) && (formList$vc[[1L]]$direct & + formList$fe$intercept != "none")) stop("if 'sctest = TRUE', searching for intercept is only possible if the ", "global intercept is removed. Use something like 'formula = y ~ -1 + ...'") } @@ -177,8 +236,8 @@ tvcm <- function(formula, data, fit, family, ## set imputation in 'control' if (!inherits(model, "olmm") | inherits(model, "olmm") && - length(unique(table(model$subject))) == 1L) - control$ninpute <- 1L + length(unique(table(model$subject))) == 1L) + control$nimpute <- 1L ## specify which coefficients are considered as 'nuisance' parameters if (control$verbose && control$sctest) @@ -245,7 +304,7 @@ tvcm <- function(formula, data, fit, family, ## call cvloss tree <- eval(cvCall) if (control$verbose) - cat("\nestimated dfsplit =", format(tree$info$cv$dfsplit.hat, digits = 3), "\n") + cat("\nestimated cp =", format(tree$info$cv$cp.hat, digits = 3), "\n") } else { @@ -257,7 +316,7 @@ tvcm <- function(formula, data, fit, family, ## pruning if (control$prune && inherits(tree, "tvcm")) { if (control$verbose) cat("\n* pruning ... ") - tree <- prune(tree, dfsplit = tree$info$cv$dfsplit.hat, papply = control$papply) + tree <- prune(tree, cp = tree$info$cv$cp.hat, papply = control$papply) if (control$verbose) cat("OK") } @@ -265,6 +324,9 @@ tvcm <- function(formula, data, fit, family, cat("\n\nFitted model:\n") print(tree) } + + ## reset seed + assign(".Random.seed", oldSeed, envir=globalenv()) if (control$verbose) cat("* computations finished, return object\n") @@ -272,56 +334,63 @@ tvcm <- function(formula, data, fit, family, return(tree) } - -tvcm_control <- function(lossfun = neglogLik2, - maxstep = Inf, maxwidth = Inf, - minsize = 30, maxdepth = Inf, - dfpar = 2.0, dfsplit = 0.0, - maxoverstep = ifelse(sctest, Inf, 0), +tvcm_control <- function(minsize = 30, mindev = ifelse(sctest, 0.0, 2.0), sctest = FALSE, alpha = 0.05, bonferroni = TRUE, - trim = 0.1, estfun = list(), - maxfacsplit = 5L, maxordsplit = 10, maxnumsplit = 10, + trim = 0.1, estfun.args = list(), nimpute = 5, + maxnomsplit = 5, maxordsplit = 9, maxnumsplit = 9, + maxstep = 1e3, maxwidth = 1e9, maxdepth = 1e9, + lossfun = neglogLik2, ooblossfun = NULL, + cp = 0.0, dfpar = 0.0, dfsplit = 1.0, cv = !sctest, folds = folds_control("kfold", 5), - prune = cv, keeploss = FALSE, papply = mclapply, - verbose = FALSE, ...) { + prune = cv, papply = mclapply, papply.args = list(), + center = TRUE, seed = NULL, verbose = FALSE, ...)
{ mc <- match.call() ## check available arguments - stopifnot(is.function(lossfun)) - stopifnot(is.numeric(maxstep) && length(maxstep) == 1L && maxstep >= 0L) - stopifnot(is.numeric(maxwidth) && all(maxwidth > 0L)) stopifnot(is.null(minsize) | (is.numeric(minsize) && all(minsize > 0))) - stopifnot(is.numeric(maxdepth) && all(maxdepth >= 0)) - stopifnot(is.numeric(dfpar) && length(dfpar) == 1L) - stopifnot(is.numeric(dfsplit) && length(dfsplit) == 1L) - stopifnot(is.numeric(maxoverstep) && length(maxoverstep) == 1L) + stopifnot(is.numeric(mindev) && length(mindev) == 1L) + stopifnot(is.logical(sctest) && length(sctest) == 1L) - if (length(alpha) != 1L) stopifnot(is.numeric(alpha) && length(alpha) == 1L && alpha >= 0.0 && alpha <= 1.0) stopifnot(is.logical(bonferroni) && length(bonferroni) == 1L) + stopifnot(is.numeric(trim) && length(trim) == 1L && trim >= 0.0 & trim < 0.5) - stopifnot(is.list(estfun)) - stopifnot(is.numeric(maxfacsplit) && length(maxfacsplit) == 1L && maxfacsplit > 1L) + stopifnot(is.list(estfun.args)) + stopifnot(is.numeric(nimpute) && length(nimpute) == 1L && nimpute > 0) + nimpute <- max(1.0, round(nimpute)) + + stopifnot(is.numeric(maxnomsplit) && length(maxnomsplit) == 1L && maxnomsplit > 1L) stopifnot(is.numeric(maxordsplit) && length(maxordsplit) == 1L && maxordsplit > 1L) stopifnot(is.numeric(maxnumsplit) && length(maxnumsplit) == 1L && maxnumsplit > 1L) + + stopifnot(is.numeric(maxstep) && length(maxstep) == 1L && maxstep >= 0L) + stopifnot(is.numeric(maxwidth) && all(maxwidth > 0L)) + stopifnot(is.numeric(maxdepth) && all(maxdepth >= 0)) + + stopifnot(is.function(lossfun)) + stopifnot(is.null(ooblossfun) | is.function(ooblossfun)) + + stopifnot(is.numeric(cp) && length(cp) == 1L) + stopifnot(is.numeric(dfpar) && length(dfpar) == 1L) + stopifnot(is.numeric(dfsplit) && length(dfsplit) == 1L) + stopifnot(is.logical(cv) && length(cv) == 1L) stopifnot(inherits(folds, "folds")) + stopifnot(is.logical(prune) && length(prune) == 1L) if (!cv & prune) stop("'prune = TRUE' requires 'cv = TRUE'") - stopifnot((is.logical(keeploss) | is.numeric(keeploss)) && - length(keeploss) == 1L) - keeploss <- as.numeric(keeploss) - if (is.numeric(verbose)) verbose <- as.logical(verbose) + + stopifnot(is.logical(center) && length(center) == 1L) stopifnot(is.logical(verbose) && length(verbose) == 1L) - - ## check hidden arguments - ptry <- ifelse(is.null(list(...)$ptry), Inf, list(...)$ptry) - stopifnot(is.numeric(ptry) && length(ptry) == 1L && ptry > 0) - ntry <- if (is.null(list(...)$ntry)) Inf else list(...)$ntry - stopifnot(is.numeric(ntry) && all(ntry > 0)) - vtry <- if (is.null(list(...)$vtry)) Inf else list(...)$vtry - stopifnot(is.numeric(vtry) && all(vtry > 0)) + ## set the default parameters for 'gefp.estfun' calls + estfun.args <- appendDefArgs(estfun.args, list(predecor = TRUE, + nuisance = NULL, + silent = FALSE)) + if (!is.null(estfun.args$level) && estfun.args$level != "observation") + warning("'level' argument for 'estfun' is set to 'observation'") + estfun.args$level <- "observation" + ## check and set 'papply' stopifnot(is.character(papply) | is.function(papply)) if (is.function(papply)) { @@ -331,51 +400,62 @@ tvcm_control <- function(lossfun = neglogLik2, papply <- deparse(formals(tvcm_control)$papply) } } + stopifnot(is.list(papply.args)) + + ## check hidden arguments + ptry <- ifelse(is.null(list(...)$ptry), Inf, list(...)$ptry) + stopifnot(is.numeric(ptry) && length(ptry) == 1L && ptry > 0) + ntry <- if (is.null(list(...)$ntry)) Inf else list(...)$ntry + 
stopifnot(is.numeric(ntry) && all(ntry > 0)) + vtry <- if (is.null(list(...)$vtry)) Inf else list(...)$vtry + stopifnot(is.numeric(vtry) && all(vtry > 0)) - ## set the default parameters for 'gefp.estfun' calls - estfun <- appendDefArgs(estfun, list(predecor = TRUE, - nuisance = NULL, - silent = FALSE)) - if (!is.null(estfun$level) && estfun$level != "observation") - warning("'level' argument for 'estfun' is set to 'observation'") - estfun$level <- "observation" - ## ensure backward compatibility if ("maxevalsplit" %in% names(list(...))) maxnumsplit <- list(...)$maxevalsplit if ("minbucket" %in% names(list(...))) minsize <- list(...)$minbucket + + functional.factor <- ifelse("functional.factor" %in% names(list(...)), + list(...)$functional.factor, "LMuo") + functional.ordered <- ifelse("functional.ordered" %in% names(list(...)), + list(...)$functional.ordered, "LMuo") + functional.numeric <- ifelse("functional.numeric" %in% names(list(...)), + list(...)$functional.numeric, "supLM") ## create a list of parameters of class 'tvcm_control' return(structure( - appendDefArgs( - list(...), - list(lossfun = lossfun, - maxstep = maxstep, - maxwidth = maxwidth, - minsize = minsize, - maxdepth = maxdepth, - dfpar = dfpar, - dfsplit = dfsplit, - maxoverstep = maxoverstep, - ptry = ptry, - ntry = ntry, - vtry = vtry, - trim = trim, - sctest = sctest, - alpha = alpha, - bonferroni = bonferroni, - estfun = estfun, - maxfacsplit = maxfacsplit, - maxordsplit = maxordsplit, - maxnumsplit = maxnumsplit, - cv = cv, - folds = folds, - prune = prune, - keeploss = keeploss, - papply = papply, - verbose = verbose, - parm = NULL, intercept = NULL, - functional.factor = "LMuo", - functional.ordered = "LMuo", - functional.numeric = "supLM")), + list(minsize = minsize, + mindev = mindev, + sctest = sctest, + alpha = alpha, + bonferroni = bonferroni, + trim = trim, + estfun.args = estfun.args, + nimpute = nimpute, + maxnomsplit = as.integer(maxnomsplit), + maxordsplit = as.integer(maxordsplit), + maxnumsplit = as.integer(maxnumsplit), + maxstep = as.integer(maxstep), + maxwidth = as.integer(maxwidth), + maxdepth = as.integer(maxdepth), + lossfun = lossfun, + ooblossfun = ooblossfun, + cp = cp, + dfpar = dfpar, + dfsplit = dfsplit, + cv = cv, + folds = folds, + prune = prune, + papply = papply, + papply.args = papply.args, + center = center, + verbose = verbose, + ptry = ptry, + ntry = ntry, + vtry = vtry, + parm = NULL, intercept = NULL, + seed = seed, + functional.factor = "LMuo", + functional.ordered = "LMuo", + functional.numeric = "supLM"), class = "tvcm_control")) } diff --git a/R/utils.R b/R/utils.R index 8422923..40cb2b2 100644 --- a/R/utils.R +++ b/R/utils.R @@ -513,7 +513,7 @@ vcrpart_formula <- function(formula, family = cumulative(), types <- c("fe", "vc", "re") terms <- terms(formula, specials = types, keep.order = TRUE) - yName <- rownames(attr(terms(formula), "factors")) + yName <- deparse((formula)[[2L]]) termLabs <- attr(terms, "term.labels") termFac <- attr(terms, "factors") type <- rep.int("NA", length(termLabs)) diff --git a/data/PL.RData b/data/PL.RData new file mode 100644 index 0000000..61a3de0 Binary files /dev/null and b/data/PL.RData differ diff --git a/man/PL.Rd b/man/PL.Rd new file mode 100644 index 0000000..f786124 --- /dev/null +++ b/man/PL.Rd @@ -0,0 +1,66 @@ +\name{PL} + +\alias{PL} + +\docType{data} + +\title{Effect of parental leave policy} + +\description{Data to analyze the effect of the 1990 Austrian parental leave + reform on fertility and postbirth labor market careers.
The data originate + from the Austrian Social Security Database (ASSD) and were prepared by + Lalive and Zweimueller (2009). The sample includes 6'180 women who gave + birth (the first birth recorded in the ASSD data) between June and July + 1990 and who were eligible to benefit from the parental leave program.} + +\usage{data(PL)} + +\format{ + A data frame with 6'180 observations on the following variables + \describe{ + \item{\code{uncb3}}{binary. Additional birth 0-36 months after child birth.} + \item{\code{uncb10}}{binary. Additional birth 0-120 months after child birth.} + \item{\code{uncj3}}{binary. Return-to-work 0-36 months after child birth.} + \item{\code{uncj10}}{numeric. Return-to-work 0-120 months after child birth.} + \item{\code{pbexp10}}{numeric. Employment (months/yr), 37-120 months + after child birth.} + \item{\code{pbinc_tot10}}{numeric. Earnings (EUR/month), 37-120 + months after child birth.} + \item{\code{pbexp3}}{numeric. Employment (months/yr), 0-36 months + after child birth.} + \item{\code{pbinc_tot3}}{numeric. Earnings (EUR/month), 0-36 months + after child birth.} + \item{\code{ikar3}}{numeric. Length of parental leave of the first + year after birth.} + \item{\code{ikar4}}{numeric. Length of parental leave of the second + year after birth.} + \item{\code{july}}{binary treatment variable. Indicates whether the child + under consideration (the first recorded in the ASSD data) was born in June 1990 + or in July 1990.} + \item{\code{bd}}{child's birthday.} + \item{\code{workExp}}{years in employment prior to birth.} + \item{\code{unEmpl}}{years in unemployment prior to birth.} + \item{\code{zeroLabEarn}}{factor. Whether the woman had earnings at birth.} + \item{\code{laborEarnings}}{numeric. Earnings at birth.} + \item{\code{employed}}{factor. Whether the woman was employed in 1989.} + \item{\code{whiteCollar}}{factor. Whether the woman is a white-collar worker.} + \item{\code{wage}}{numeric. Daily 1989 earnings.} + \item{\code{age}}{ordered factor. Age.} + \item{\code{industry}, \code{industry.SL}}{factor. Industry where woman worked.} + \item{\code{region}, \code{region.SL}}{factor. The region where the woman lives.} + } +} + +\details{ + The data are described in Lalive and Zweimueller (2009). +} + +\source{Austrian Social Security Database (ASSD). The data set is also available + from \url{https://sites.google.com/site/rafaellalive/research}} + +\references{Lalive, R. and Zweimueller, J. (2009), How does parental leave affect + fertility and return to work? Evidence from two natural experiments. + \emph{The Quarterly Journal of Economics}. +} + +\keyword{datasets} diff --git a/man/fvcm.Rd b/man/fvcm.Rd index 0abfd49..d0ce5cc 100644 --- a/man/fvcm.Rd +++ b/man/fvcm.Rd @@ -3,7 +3,9 @@ \alias{fvcm} \alias{fvcm_control} \alias{fvcolmm} +\alias{fvcolmm_control} \alias{fvcglm} +\alias{fvcglm_control} \title{Bagging and random forests based on \command{\link{tvcm}}} @@ -16,14 +18,19 @@ fvcm(..., control = fvcm_control()) -fvcolmm(..., family = cumulative(), control = fvcm_control()) - -fvcglm(..., family, control = fvcm_control()) - fvcm_control(maxstep = 10, folds = folds_control("subsampling", 5), ptry = 1, ntry = 1, vtry = 5, - alpha = 1.0, maxoverstep = Inf, ...) + alpha = 1.0, mindev = 0.0, ...) + +fvcolmm(..., family = cumulative(), control = fvcolmm_control()) + +fvcolmm_control(maxstep = 10, folds = folds_control("subsampling", 5), + ptry = 1, ntry = 1, vtry = 5, alpha = 1.0, ...)
+ +fvcglm(..., family, control = fvcglm_control()) +fvcglm_control(maxstep = 10, folds = folds_control("subsampling", 5), + ptry = 1, ntry = 1, vtry = 5, mindev = 0, ...) } \arguments{ @@ -57,7 +64,7 @@ fvcm_control(maxstep = 10, folds = folds_control("subsampling", 5), iteration. If \code{0 < vtry < 1}, \code{ntry} is interpreted as the relative number of variables to select regarding the total number of variables of the corresponding term.} - \item{maxoverstep, alpha}{These two parameters are merely specified to + \item{mindev, alpha}{These two parameters are merely specified to disable the default stopping rules for \command{\link{tvcm}}. See also \command{\link{tvcm_control}} for details.} } diff --git a/man/olmm-gefp.Rd b/man/olmm-gefp.Rd index a6a2264..aa729b2 100644 --- a/man/olmm-gefp.Rd +++ b/man/olmm-gefp.Rd @@ -54,9 +54,10 @@ gefp.olmm(object, scores = NULL, order.by = NULL, subset = NULL, suppressed?} \item{verbose}{logical scalar. Produces messages.} \item{scores}{a function or a matrix. Function to extract the - estimating equations from \code{object}, e.g. - \command{\link{estfun.olmm}}, or a matrix representing the estimating - equations.} + estimating equations from \code{object} or a matrix representing the + estimating equations. If \code{NULL} (default), the + \command{\link{estfun.olmm}} function will be used with + argument \code{predecor} and additional arguments from \code{...}.} \item{order.by}{a numeric or factor vector. The explanatory variable to be used to order the entries in the estimating equations. If set to \code{NULL} (the default) the observations are assumed to be @@ -102,22 +103,15 @@ gefp.olmm(object, scores = NULL, order.by = NULL, subset = NULL, i'} and \eqn{t \neq t'}. The pre-decorrelation approach above is principally limited to - balanced data. If the data of \code{object} are not balanced, a - multiple imputation algorithm is applied to estimate the values that - scores would take if the data were balanced. Specifically, in each - iteration of the imputation, (i) a set of predictors is drawn from the - empirical distribution of the model matrix, (ii) responses are drawn - based on these predictors of (i) and \code{object} - (cf. \command{\link{simulate.olmm}}) and (iii) the predictors of (i) - and the responses of (ii) are added \code{object} and the scores are - recomputed. The obtained sets of scores are averaged and then used to - for transforming the scores instead of the original scores. The - imputation algorithm can be controlled with \code{maxit.impute} (the - number of imputation iterations), \code{abstol.impute} the maximum - permitted quantile of differences between scores of the previous and - the current step and \code{ratio.impute} specifies the quantile to be - taken. - + balanced data. If the data of \code{object} are not balanced, the data + are balanced out by (i) drawing a set of predictors from the empirical + distribution of the model matrix of \code{object}, (ii) drawing + responses based on the predictors of (i) and \code{object} + (cf. \command{\link{simulate.olmm}}) and (iii) adding the predictors of + (i) and the responses of (ii) to \code{object} and recomputing the + scores based on the original coefficients. Note that the column sum + of the returned score matrix is not necessarily zero in these cases. + Given a score matrix produced by \command{\link{estfun.olmm}}, the empirical fluctuation process can be computed by \command{\link{gefp.olmm}}.
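A minimal sketch of this workflow (an assumed usage composed from the signatures documented above; the 'schizo' model call is the one from the examples further below):

  library(vcrpart)
  library(strucchange)
  data(schizo)
  ## fit the mixed model, then extract the empirical fluctuation process
  ## with the (pre-decorrelated) scores ordered by the time variable 'week'
  m <- olmm(imps79o ~ tx + sqrt(week) + re(1|id), schizo)
  fp <- gefp.olmm(m, scores = estfun.olmm, order.by = schizo$week)
  ## coefficient constancy test along 'week', using a trimmed supLM functional
  sctest(fp, functional = supLM(from = 0.1))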
See Zeileis and Hornik diff --git a/man/olmm-methods.Rd b/man/olmm-methods.Rd index 7d05834..9e4b41d 100644 --- a/man/olmm-methods.Rd +++ b/man/olmm-methods.Rd @@ -7,6 +7,7 @@ \alias{deviance.olmm} \alias{formula.olmm} \alias{fixef} +\alias{fixef.glm} \alias{fixef.olmm} \alias{getCall.olmm} \alias{logLik.olmm} @@ -79,8 +80,15 @@ \item{seed}{an object specifying if and how the random number generator should be initialized. See \command{\link{simulate}}} \item{newdata}{a data frame with predictor variables.} - \item{ranef}{whether the simulated responses should be based on - conditional probabilities.} + \item{ranef}{either a logical or a matrix (see + \command{\link{predict.olmm}}). Whether the simulated responses should + be conditional on random effects. If \code{TRUE}, the \code{newdata} + data frame must contain the subject identification + variable. Further, if all subjects in \code{newdata} are in + \code{object}, the simulation will be based on the estimated random + effects as obtained with \command{\link{ranef}}. If any subject in + \code{newdata} is not in \code{object}, the random effects are + simulated.} \item{sigma}{ignored but obligatory argument from original generic.} \item{rdig}{ignored but obligatory argument from original generic.} \item{...}{potential further arguments passed to methods.} @@ -144,18 +152,35 @@ data(schizo) schizo <- schizo[1:181,] schizo$id <- droplevels(schizo$id) +## anova comparison +## ---------------- + ## fit two alternative models for the 'schizo' data model.0 <- olmm(imps79o ~ tx + sqrt(week) + re(1|id), schizo) model.1 <- olmm(imps79o ~ tx + sqrt(week)+tx*sqrt(week)+re(1|id),schizo) - anova(model.0, model.1) + +## simulate responses +## ------------------ + +## simulate responses based on estimated random effects +simulate(model.0, newdata = schizo[1, ], ranef = TRUE, seed = 1) +simulate(model.0, newdata = schizo[1, ], seed = 1, + ranef = ranef(model.0)[schizo[1, "id"],,drop=FALSE]) +## simulate responses based on simulated random effects +newdata <- schizo[1, ] +newdata$id <- factor("123456789") +simulate(model.0, newdata = newdata, ranef = TRUE) + +## other methods +## ------------- + coef(model.1) fixef(model.1) head(model.matrix(model.1, "fe-ge")) head(weights(model.1)) ranefCov(model.1) head(resid(model.1)) -head(simulate(model.1)) terms(model.1, "fe-ge") VarCorr(model.1) head(weights(model.1, "subject")) diff --git a/man/olmm.Rd b/man/olmm.Rd index 951f87c..d81c28b 100644 --- a/man/olmm.Rd +++ b/man/olmm.Rd @@ -6,7 +6,7 @@ \alias{baseline} \alias{olmm} -\title{Fitting ordinal two-stage linear mixed models} +\title{Fitting ordinal and nominal two-stage linear mixed models} \description{ Fits different types of two-stage linear mixed models for longitudinal diff --git a/man/poverty.Rd b/man/poverty.Rd index d47e203..0022bde 100644 --- a/man/poverty.Rd +++ b/man/poverty.Rd @@ -18,7 +18,7 @@ \describe{ \item{\code{Poor}}{binary response variable on whether the person is considered as poor or not. 0 = no and 1 = yes.} - \item{\code{Canton}}{the canton where the person is living. All + \item{\code{Canton}}{the canton where the person lives.
All individuals originate from the canton Wallis.} \item{\code{Gender}}{whether the person is male or female.} \item{\code{AgeGroup}}{the age group to which the person belongs.} diff --git a/man/tvcglm.Rd b/man/tvcglm.Rd new file mode 100644 index 0000000..3d2f1a3 --- /dev/null +++ b/man/tvcglm.Rd @@ -0,0 +1,169 @@ +\name{tvcglm} + +\alias{tvcglm} +\alias{tvcglm_control} + +\title{Coefficient-wise tree-based varying coefficient regression based + on generalized linear models} + +\description{The \command{\link{tvcglm}} function implements the + tree-based varying coefficient regression algorithm for generalized + linear models introduced by Buergin and Ritschard (2014b). The + algorithm approximates varying coefficients by piecewise constant + functions using recursive partitioning, i.e., it estimates the + coefficients of the model separately for strata of the value space of + partitioning variables. The special feature of the algorithm is that it + assigns each varying coefficient its own partition, which enhances the + flexibility of model specification and allows moderator variables to be + selected individually by coefficient. +} + +\usage{ +tvcglm(formula, data, family, + weights, subset, offset, na.action, + control = tvcglm_control(), ...) + +tvcglm_control(minsize = 30, mindev = 2.0, + maxnomsplit = 5, maxordsplit = 9, maxnumsplit = 9, + cv = TRUE, folds = folds_control("kfold", 5), + prune = cv, center = TRUE, ...) +} + +\arguments{ + \item{formula}{a symbolic description of the model to fit, e.g., + + \code{y ~ vc(z1, \ldots, zL, by = x1 + \ldots + xP) + re(1|id)} + + where the \code{vc} terms specify the varying coefficients. Multiple + such \code{vc} terms are allowed. For details, + see \command{\link{olmm}} and \command{\link{vcrpart-formula}}.} + \item{family}{the model family. An object of class + \command{\link{family}}.} + \item{data}{a data frame containing the variables in the model.} + \item{weights}{an optional numeric vector of weights to be used in the + fitting process.} + \item{subset}{an optional logical or integer vector specifying a + subset of \code{'data'} to be used in the fitting process.} + \item{offset}{this can be used to specify an a priori known component + to be included in the linear predictor during fitting.} + \item{na.action}{a function that indicates what should happen if data + contain \code{NA}s. See \command{\link{na.action}}.} + \item{control}{a list with control parameters as returned by + \command{\link{tvcglm_control}}.} + \item{minsize}{numeric (vector). The minimum sum of weights in + terminal nodes.} + \item{mindev}{numeric scalar. The minimum permitted training error + reduction a split must exhibit to be considered for a new split. + The main role of this parameter is to save computing time by early + stopping. It may be set lower when there are few partitioning + variables and higher when there are many.} + \item{maxnomsplit, maxordsplit, maxnumsplit}{integer scalars for split + candidate reduction. See \command{\link{tvcm_control}}.} + \item{cv}{logical scalar. Whether or not the \code{cp} parameter + should be cross-validated. If \code{TRUE}, \command{\link{cvloss}} is + called.} + \item{folds}{a list of parameters to create folds as produced by + \command{\link{folds_control}}. It is used for cross-validation.} + \item{prune}{logical scalar. Whether or not the initial tree should be + pruned by the estimated \code{cp} parameter from + cross-validation. Cannot be \code{TRUE} if \code{cv = FALSE}.} + \item{center}{logical scalar.
Whether the predictor variables of + the update models during the grid search should be centered. Note that + \code{TRUE} will not modify the predictors of the fitted model.} + \item{\ldots}{additional arguments passed to the fitting function + \code{fit} or to \command{\link{tvcm_control}}.} +} + +\details{ + The TVCGLM algorithm uses two stages. The first stage (partitioning) + builds overly fine partitions and the second stage (pruning) + selects the best-sized partitions by collapsing inner nodes. For the + second stage, which is automatically processed, we refer to + \command{\link{tvcm-assessment}}. The partitioning stage iterates the + following steps: + + \enumerate{ + \item Fit the current generalized linear model + + \code{y ~ NodeA:x1 + \ldots + NodeK:xK} + + with \command{\link{glm}}, where \code{NodeK} is a categorical + variable with terminal node labels \code{1}, \ldots for the + \eqn{K}-th varying coefficient. + + \item Search the globally optimal split among the candidate + splits by an exhaustive -2 log-likelihood training error grid search, + cycling through all partitions, nodes and moderator variables. + + \item If the -2 log-likelihood training error reduction of the best + split is smaller than \code{mindev} or there is no candidate split + satisfying the minimum node size \code{minsize}, stop the algorithm. + + \item Else incorporate the best split and repeat the procedure. + } + + The partitioning stage selects, in each iteration, the split that + maximizes the -2 log-likelihood training error reduction, compared to the + current model. The default stopping parameters are \code{minsize = 30} + (a minimum node size of 30) and \code{mindev = 2} (the training error + reduction of the best split must be larger than two to continue). + + The algorithm can be seen as an extension of CART (Breiman et al., + 1984) and PartReg (Wang and Hastie, 2014), with the new feature that + partitioning can be processed coefficient-wise. + } + +\value{An object of class \command{\link{tvcm}} +} + +\references{ + Breiman, L., Friedman, J.H., Olshen, R.A. and Stone, C.J. (1984) + \emph{Classification and Regression Trees}. Wadsworth. + + Wang, J. C., Hastie, T. (2014), Boosted Varying-Coefficient + Regression Models for Product Demand Prediction, \emph{Journal of + Computational and Graphical Statistics}, \bold{23}, 361--382. + + Buergin R. and Ritschard G. (2014b). Coefficient-wise tree-based + varying coefficient regression with vcrpart. Article in progress. +} + +\seealso{\command{\link{tvcm_control}}, \command{\link{tvcm-methods}}, + \command{\link{tvcm-plot}}, + \command{\link{tvcm-assessment}}, \command{\link{glm}}} + +\examples{ +## ------------------------------------------------------------------- # +## Example 1: Moderated effect of education on poverty +## +## The algorithm is used to find out whether the effect of high +## education 'EduHigh' on poverty 'Poor' is moderated by the civil +## status 'CivStat'. We specify two 'vc' terms in the logistic +## regression model for 'Poor': a first that accounts for the direct +## effect of 'CivStat' and a second that accounts for the moderation of +## 'CivStat' on the relation between 'EduHigh' and 'Poor'. We use here +## the 2-stage procedure with a partitioning- and a pruning stage as +## described in Buergin and Ritschard (2014b).
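## As a sketch of step 1 of the partitioning stage (with 'NodeA' and
## 'NodeB' hypothetical factors holding the current terminal node labels
## of the two 'vc' terms), each iteration amounts to a plain 'glm' fit
## with node-wise interactions, roughly:
##   glm(Poor ~ -1 + NodeA + NodeB:EduHigh + NChild,
##       family = binomial(), data = poverty)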
+## ------------------------------------------------------------------- # + +data(poverty) +poverty$EduHigh <- 1 * (poverty$Edu == "high") + +## fit the model +model.Pov <- + tvcglm(Poor ~ -1 + vc(CivStat) + vc(CivStat, by = EduHigh) + NChild, + family = binomial(), data = poverty, subset = 1:200, + control = tvcm_control(verbose = TRUE, papply = lapply, + folds = folds_control(K = 1, type = "subsampling", seed = 7))) + +## diagnosis +plot(model.Pov, "cv") +plot(model.Pov, "coef") +summary(model.Pov) +splitpath(model.Pov, steps = 1:3) +prunepath(model.Pov, steps = 1) +} + +\author{Reto Buergin} + +\keyword{tree} \ No newline at end of file diff --git a/man/tvcm-control.Rd b/man/tvcm-control.Rd index de34e41..8bf6af5 100644 --- a/man/tvcm-control.Rd +++ b/man/tvcm-control.Rd @@ -9,94 +9,68 @@ } \usage{ -tvcm_control(lossfun = neglogLik2, - maxstep = Inf, maxwidth = Inf, - minsize = 30, maxdepth = Inf, - dfpar = 2.0, dfsplit = 0.0, - maxoverstep = ifelse(sctest, Inf, 0), +tvcm_control(minsize = 30, mindev = ifelse(sctest, 0.0, 2.0), sctest = FALSE, alpha = 0.05, bonferroni = TRUE, - trim = 0.1, estfun = list(), - maxfacsplit = 5L, maxordsplit = 10, maxnumsplit = 10, + trim = 0.1, estfun.args = list(), nimpute = 5, + maxnomsplit = 5, maxordsplit = 9, maxnumsplit = 9, + maxstep = 1e3, maxwidth = 1e9, maxdepth = 1e9, + lossfun = neglogLik2, ooblossfun = NULL, + cp = 0.0, dfpar = 0.0, dfsplit = 1.0, cv = !sctest, folds = folds_control("kfold", 5), - prune = cv, keeploss = FALSE, papply = mclapply, - verbose = FALSE, ...) - + prune = cv, papply = mclapply, papply.args = list(), + center = TRUE, seed = NULL, verbose = FALSE, ...) } \arguments{ - \item{lossfun}{a function that extracts a loss measure from a - fitted object, e.g., two times the negative log likelihood - (default).} - \item{maxstep}{integer. The maximum number of iterations i.e. the - total number of splits processed.} - \item{maxwidth}{integer (vector). The maximum width of the tree(s).} - \item{minsize}{numeric. The minimum sum of weights in a terminal - node. The default is the number of varying coefficients times 10. - The parameter specifies also the trimming in parameter coefficient - tests for numeric variables (if \code{sctest = TRUE}).} - \item{maxdepth}{integer (vector). The maximum depth of the tree(s).} - \item{dfpar}{a numeric scalar larger than zero. The per-parameter - penalty to be applied for stopping. See also argument - \code{maxoverstep}.} - \item{dfsplit}{a numeric scalar larger than zero. The per-split - penalty to be applied for stopping. See also argument - \code{maxoverstep}.} - \item{maxoverstep}{integer scalar. The maximum number of consecutive - times the penalized reduction statistic is allowed to be smaller - than \code{dfsplit} before stopping. Specifically, the penalized - loss is computed as the loss (see argument \code{lossfun}) of the - model plus \code{dfpar} times the number of coefficients (as - extracted from \command{\link{extractAIC}}), and the penalized loss - reduction statistic is the difference between the penalized loss of - the current model and the penalized loss of the selected (best) update - model. The argument is disabled if \code{sctest = TRUE}.} + \item{alpha, bonferroni, trim, estfun.args, nimpute}{See + \command{\link{tvcolmm_control}}} + \item{mindev, cv, folds, prune, center}{See + \command{\link{tvcglm_control}}} + \item{minsize}{numeric (vector). The minimum sum of weights in + terminal nodes.} \item{sctest}{logical scalar.
Defines whether coefficient constancy - tests should be used for variable and node selection.} - \item{alpha}{numeric significance threshold between 0 and 1. A node is - splitted when the smallest (possibly Bonferroni-corrected) \eqn{p} - value for any coefficient constancy test in the current step falls - below \code{alpha}.} - \item{bonferroni}{logical. Indicates if and how \eqn{p}-values of - coefficient constancy tests must be Bonferroni - corrected. See details.} - \item{trim}{numeric between 0 and 1. Specifies the trimming parameter - in coefficient constancy tests for continuous partitioning - variables.} - \item{estfun}{list of arguments to be passed to - \command{\link{gefp.olmm}}.} - \item{maxfacsplit}{integer.} - \item{maxordsplit}{integer.} - \item{maxnumsplit}{integer. The maximum number of evaluation for - splits on numeric partitioning variables.} - \item{cv}{logical scalar. Whether or not the \code{dfsplit} parameter - should be cross-validated.} - \item{folds}{a list of parameters to create folds as produced by - \command{\link{folds_control}}.} - \item{prune}{logical scalar. Whether or not the overly large - partitions should be pruned by the estimated \code{dfsplit} parameter - from cross-validation. Note that \code{prune = TRUE} conflicts with - \code{cv = FALSE}} - \item{keeploss}{logical scalar or a numeric equal or larger than - 0. Indicates if and how many times the computed penalized loss - reduction statistics should be reused in the following - iterations. Specifically, the option activates approximating the - penalized loss reduction statistic of a split based on the penalized - loss reduction statistic of the same split in the previous - iteration. Values different from \code{FALSE} or 0 will accelerate the - algorithm at the expense of accuracy. In cases of multiple - \command{\link{vc}} terms, only the statistics corresponding to the - \command{\link{vc}} term which was splitted in the previous iteration - are reused.} - \item{papply}{(parallel) apply function, defaults to + tests should be used for the variable and node selection in each + iteration.} + \item{maxnomsplit}{integer. For nominal partitioning variables with + more than \code{maxnomsplit} categories, the categories are ordered and + treated as ordinal.} + \item{maxordsplit}{integer. The maximum number of splits of ordered + partitioning variables to be evaluated.} + \item{maxnumsplit}{integer. The maximum number of splits of numeric + partitioning variables to be evaluated.} + \item{maxstep}{integer. The maximum number of iterations, i.e., the number + of splits to be processed.} + \item{maxwidth}{integer (vector). The maximum width of the partition(s).} + \item{maxdepth}{integer (vector). The maximum depth of the partition(s).} + \item{lossfun}{a function to extract the training error, typically + two times the negative log-likelihood of the fitted model (see + \command{\link{neglogLik2}}).} + \item{ooblossfun}{a loss function that defines how to compute the + validation error during cross-validation. The function will be + assigned to the \code{fun} argument of \command{\link{oobloss}}.} + \item{cp}{numeric scalar. The penalty to be multiplied with the + complexity of the model during partitioning. The complexity of the + model is defined as the number of coefficients times \code{dfpar} + plus the number of splits times \code{dfsplit}. By default, \code{cp + = 0} (no penalization during partitioning) and \code{dfpar = 0} and + \code{dfsplit = 1} (the complexity is measured as the total number + of splits).
\code{cp} also represents the minimum value evaluated in + cross-validation.} + \item{dfpar}{numeric scalar. The degrees of freedom per model + coefficient. It is used to compute the complexity of the model, see + \code{cp}.} + \item{dfsplit}{a numeric scalar. The degrees of freedom per split. It is + used to compute the complexity of the model, see \code{cp}.} + \item{papply}{(parallel) apply function, defaults to \code{\link[parallel]{mclapply}}. The function will parallelize the partition stage and the evaluation of the cross-validation folds as - well as the final pruning stage.} + well as the final pruning stage.} + \item{papply.args}{a list of arguments to be passed to \code{papply}.} + \item{seed}{an integer specifying which seed should be set at the + beginning.} \item{verbose}{logical. Should information about the fitting process - (such as test statistics, \eqn{p} values, - selected splitting variables and split points) be printed to the - screen?} - \item{\ldots}{further, undocumented arguments to be passed. These can - include arguments for the \code{papply} function.} + be printed to the screen?} + \item{\ldots}{further, undocumented arguments to be passed.} } \value{ @@ -106,7 +80,9 @@ tvcm_control(lossfun = neglogLik2, \author{Reto Buergin} -\seealso{\command{\link{tvcm}}, \command{\link{fvcm}}} +\seealso{\command{\link{tvcolmm_control}}, + \command{\link{tvcglm_control}}, \command{\link{tvcm}}, + \command{\link{fvcm}}} \examples{ tvcm_control() diff --git a/man/tvcm-cv.Rd b/man/tvcm-cv.Rd index b403345..7887ec7 100644 --- a/man/tvcm-cv.Rd +++ b/man/tvcm-cv.Rd @@ -1,6 +1,8 @@ \name{tvcm-assessment} \alias{tvcm-assessment} +\alias{prune} +\alias{prune.tvcm} \alias{folds_control} \alias{cvloss} \alias{cvloss.tvcm} @@ -10,26 +12,25 @@ \alias{oobloss.tvcm} \alias{print.cvloss.tvcm} \alias{plot.cvloss.tvcm} -\alias{prune} -\alias{prune.tvcm} -\title{Model assessment and model selection for \command{\link{tvcm}} objects.} +\title{Model selection utility functions for \command{\link{tvcm}} objects.} \description{ - Out-of-bag loss, cross-validation and pruning for - \command{\link{tvcm}} objects. + Pruning, cross-validation to find the optimal pruning parameter and computing + validation set errors for \command{\link{tvcm}} objects. } \usage{ +\method{prune}{tvcm}(tree, cp = NULL, alpha = NULL, maxstep = NULL, + terminal = NULL, original = FALSE, ...) + folds_control(type = c("kfold", "subsampling", "bootstrap"), K = ifelse(type == "kfold", 5, 30), prob = 0.5, weights = c("case", "freq"), seed = NULL) -\method{cvloss}{tvcm}(object, folds = folds_control(), - fun = NULL, dfpar = NULL, direction = c("backward", "forward"), - papply = mclapply, verbose = FALSE, ...) +\method{cvloss}{tvcm}(object, folds = folds_control(), ...) \method{print}{cvloss.tvcm}(x, ...) @@ -37,11 +38,6 @@ folds_control(type = c("kfold", "subsampling", "bootstrap"), \method{oobloss}{tvcm}(object, newdata = NULL, weights = NULL, fun = NULL, ...) - -\method{prune}{tvcm}(tree, dfsplit = NULL, dfpar = NULL, - direction = c("backward", "forward"), - alpha = NULL, maxstep = NULL, terminal = NULL, - papply = mclapply, keeploss = FALSE, original,...) } \arguments{ @@ -61,34 +57,16 @@ folds_control(type = c("kfold", "subsampling", "bootstrap"), \item{seed}{a numeric scalar that defines the seed.} \item{folds}{a list with control arguments as produced by \command{\link{folds_control}}.} + \item{legend}{logical scalar. Whether a legend should be added.} + \item{details}{logical scalar.
Whether the foldwise validation errors + should be shown.} \item{fun}{the loss function for the validation sets. By default, the (possibly weighted) mean of the deviance residuals as defined by the \command{\link{family}} of the fitted \code{object} is applied.} - \item{dfpar}{a numeric scalar larger than zero. The per parameter penalty - to be applied. If the 2 log-likelihood prediction error is used, - this value is typically set to 2. If \code{NULL}, the value of - \code{dfpar} of the partitioning stage is used, see - \code{tvcm_control()}} - \item{direction}{either \code{"backward"} (the default) or - \code{"forward"}. Indicates the pruning algorithm to be - used. \code{"backward"} applies backward pruning where in each - iteration the inner node that produces the smallest per-node - increase is collapsed. \code{"forward"} applies premature stopping - when no new split would improve the model.} - \item{papply}{(parallel) apply function, defaults to - \code{\link[parallel]{mclapply}}. To run \code{cvloss} sequentially - (i.e. not in parallel), use \command{\link{lapply}}. Special arguments - for that apply function can be assigned as well.} \item{newdata}{a data.frame of out-of-bag data (including the response variable). See also \command{\link{predict.tvcm}}.} - \item{verbose}{logical scalar. If \code{TRUE} verbose output is - generated during the validation.} - \item{legend}{logical scalar. Whether a legend should be added.} - \item{details}{logical scalar. Whether the foldwise validation errors - and the in-sample prediction error should be shown.} - \item{dfsplit}{numeric scalar. The per-split cost \code{dfsplit} with - which the partitions are to be cross-validated. If no \code{dfsplit} - is specified (default), the parameter is ignored for pruning.} + \item{cp}{numeric scalar. The complexity parameter to be cross-validated + or, respectively, the penalty with which the model should be pruned.} \item{alpha}{numeric significance level. Represents the stopping parameter for \command{\link{tvcm}} objects grown with \code{sctest = TRUE}, see \command{\link{tvcm_control}}. A node is @@ -96,119 +74,112 @@ folds_control(type = c("kfold", "subsampling", "bootstrap"), in that node falls below \code{alpha}.} \item{maxstep}{integer. The maximum number of steps of the algorithm.} \item{terminal}{a list of integer vectors with the ids of the nodes - the subnodes of which should be merged. } + to be set to terminal nodes. The length of the list must + equal the number of partitions.} \item{original}{logical scalar. Whether pruning should be based on the trees from partitioning rather than on the current trees.} \item{...}{other arguments to be passed.} } -\details{As described in the help of \command{\link{tvcm}}, TVCM - (combined with \code{sctest = FALSE}) is a two stage procedure that - first grows overly fine partitions and second selects the best-sized - partitions by pruning.
Both steps can be carried out with a single - \command{\link{tvcm}} and several parameters can be specified with - \command{\link{tvcm_control}}. The here presented functions may be - interesting for advanced users who want to process the two stages with - separate calls. - - The \command{\link{prune}} method collapses inner nodes of the overly - large tree fitted with \command{\link{tvcm}} according to the tuning - parameter \code{dfsplit} to minimize the estimated in-sample - prediction error. The in-sample prediction error is, in what follows, - defined as the mean of the in-sample loss plus \code{dfpar} times the - number of coefficients plus \code{dfsplit} times the number of - splits. In the common likelihood setting, the loss is equal - 2 times - the maximum likelihood and \code{dfpar = 2}. The per-split penalty - \code{dfsplit} generally unknown and estimated by using - cross-validation. - - \command{\link{folds_control}} and \command{\link{cvloss}} allow for - estimating \code{dfsplit} by cross-validation. The function - \command{\link{folds_control}} is used to specify the cross-validation - scheme, where a random 5-fold cross-validation scheme is set as the - default. Alternatives are \code{type = "subsampling"} (random draws - without replacement) and \code{type = "bootstrap"} (random draws with - replacement). For 2-stage models (with random-effects) fitted by - \command{\link{olmm}}, the subsets are based on subject-wise i.e. first - stage sampling. For models where weights represent frequencies of - observation units with exactly the same values in all variables (e.g., - data from contingency tables), the option \code{weights = "freq"} - should be used. - - \command{\link{cvloss}} repeatedly fits \command{\link{tvcm}} objects - based on the internally created folds and evaluates mean out-of-bag - loss of the model at different levels of the tuning parameter - \code{dfsplit}. Out-of-bag loss refers here to the prediction error - based on a loss function, which is typically the -2 log-likelihood - error (see the details for \code{oobloss} below). Commonly, - \code{dfsplit} is used for backward pruning (\code{direction = - "backward"}), but it is also possible to cross-validate \code{dfsplit} - for premature stopping (\code{direction = "forward"}, see argument - \code{dfsplit} in - \command{\link{tvcm_control}}). \command{\link{cvloss}} returns an - object for which a \code{print} and a \code{plot} generic is - provided. The proposed estimate for \code{dfsplit} is the one that - minimizes the validated loss and can be extracted from component - \code{dfsplit.min}. +\details{By default, \command{\link{tvcm}} is a two-stage procedure that + first grows overly large trees and then selects the best-sized trees by + pruning. The functions presented here may be interesting for advanced users + who want to process the model selection stage separately. + + In normal practice, the \command{\link{prune}} function is used to collapse + inner nodes of the tree structures by the tuning parameter \code{cp}. The + aim of pruning by \code{cp} is to collapse inner nodes to minimize the + cost-complexity criterion + + \deqn{error(cp) = error(tree) + cp * complexity(tree)} + + where the training error \eqn{error(tree)} is defined by \code{lossfun} + and \eqn{complexity(tree)} is defined as the total number of coefficients times + \code{dfpar} plus the total number of splits times \code{dfsplit}.
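For illustration (hypothetical numbers): with the defaults \code{dfpar = 0} and \code{dfsplit = 1}, a tree with a training error of 480 and 6 splits evaluated at \code{cp = 2} yields \eqn{error(2) = 480 + 2 * 6 = 492}, while a subtree with training error 485 and 4 splits yields \eqn{485 + 2 * 4 = 493}; at this \code{cp} the larger tree would be retained.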
The function + \code{lossfun} and the parameters \code{dfpar} and \code{dfsplit} are defined + by the \code{control} argument of \command{\link{tvcm}}, see also + \command{\link{tvcm_control}}. By default, \eqn{error(tree)} is minus two + times the total likelihood of the model and \eqn{complexity(tree)} the number + of splits. The minimization of \eqn{error(cp)} is implemented by the + following iterative backward-stepwise algorithm: + + \enumerate{ + \item fit all \code{subtree} models that collapse one inner node of the + current \code{tree} model. + \item compute the per-complexity increase in the training error + \deqn{dev = (error(subtree) - error(tree)) / + (complexity(tree) - complexity(subtree))} + for all fitted \code{subtree} models + \item if any \code{dev} < \code{cp} then set as the \code{tree} model + the \code{subtree} that minimizes \code{dev} and repeat steps 1 to 3; + otherwise stop. + } + + The penalty \code{cp} is generally unknown and is estimated adaptively from + the data. \command{\link{cvloss}} implements the cross-validation + method to do this. \command{\link{cvloss}} repeats for each fold the following + steps: + + \enumerate{ + \item fit a new model with \command{\link{tvcm}} based on the training data of + the fold. + \item prune the new model for increasing \code{cp}. Compute for each + \code{cp} the average validation error. + } + + Doing so yields for each fold a sequence of values for \code{cp} and a + sequence of average validation errors. The obtained sequences for \code{cp} + are combined into a fine grid and the average validation error is averaged + correspondingly. From these two sequences we choose the \code{cp} that + minimizes the validation error. Notice that the average validation error + is computed as the total prediction error of the validation set divided + by the sum of validation set weights. See also the argument \code{ooblossfun} in + \command{\link{tvcm_control}} and the function \command{\link{oobloss}}. + + The function \command{\link{folds_control}} is used to specify the + cross-validation scheme, where a random 5-fold cross-validation scheme + is set as the default. Alternatives are \code{type = "subsampling"} + (random draws without replacement) and \code{type = "bootstrap"} (random + draws with replacement). For 2-stage models (with random-effects) + fitted by \command{\link{olmm}}, the subsets are based on subject-wise + i.e. first stage sampling. For models where weights represent frequencies + of observation units (e.g., data from contingency tables), the option + \code{weights = "freq"} should be considered. \command{\link{cvloss}} + returns an object for which \code{print} and \code{plot} generics are + provided. - \command{\link{oobloss}} can be used for estimating the out-of-bag - prediction error for out-of-bag data (the \code{newdata} - argument). By default, the loss is defined as the sum of deviance - residuals, see the return value \code{dev.resids} of - \command{\link{family}} resp. \command{\link{family.olmm}}. Otherwise, - the loss function can be defined manually by the argument \code{fun}, - see the examples below. In general the sum of deviance residual is - equal the -2 log-likelihood. A special case is the gaussian family, - where the deviance residuals are computed as \eqn{\sum_{i=1}^N w_i - (y_i-\mu)^2} that is, the deviance residuals ignore the term - \eqn{\log{2\pi\sigma^2}}.
Therefore, the sum of deviance residuals for - the gaussian model (and possibly others) is not exactly the -2 - log-likelihood prediction error but shifted by a constant. Another - special case are models with random effects. For models based on - \command{\link{olmm}}, the deviance residuals are based on - the marginal predictions (where random effects are integrated out). + \command{\link{oobloss}} can be used to estimate the total prediction error + for validation data (the \code{newdata} argument). By default, the loss is + defined as the sum of deviance residuals, see the return value \code{dev.resids} + of \command{\link{family}} resp. \command{\link{family.olmm}}. Otherwise, the + loss function can be defined manually by the argument \code{fun}, see the + examples below. In general, the sum of the deviance residuals equals the sum of + the -2 log-likelihood errors. A special case is the gaussian family, where + the deviance residuals are computed as \eqn{\sum_{i=1}^N w_i (y_i-\mu)^2}, + that is, the deviance residuals ignore the term \eqn{\log(2\pi\sigma^2)}. + Therefore, the sum of deviance residuals for the gaussian model (and + possibly others) is not exactly the sum of -2 log-likelihood prediction + errors (but shifted by a constant). Models with random effects are another + special case. For models based on \command{\link{olmm}}, the deviance + residuals are retrieved from marginal predictions (where random effects are + integrated out). } -\value{\command{\link{folds_control}} returns a list of parameters for - building a cross-validation scheme. \command{\link{cvloss}} returns an - \code{cvloss.tvcm} object with the following essential components: - - \item{grid}{a list with two matrices \code{dfsplit} and - \code{nsplit}. Specifies the grid of values at which the - cross-validated loss was evaluated.} - \item{loss}{a list with two matrices \code{dfsplit} and - \code{nsplit}. The cross-validated loss for each fold corresponding - to the values in \code{grid}.} - \item{dfsplit.min}{numeric scalar. The tuning parameter which - minimizes the cross-validated loss.} +\value{\command{\link{prune}} returns a \command{\link{tvcm}} object, + \command{\link{folds_control}} returns a list of parameters for building a + cross-validation scheme. \command{\link{cvloss}} returns a \code{cvloss.tvcm} + object with at least the following components: + + \item{grid}{a list with values for \code{cp}.} + \item{oobloss}{a matrix recording the validated loss for each value in + \code{grid} for each fold.} + \item{cp.hat}{numeric scalar.
 }
-\value{\command{\link{folds_control}} returns a list of parameters for
- building a cross-validation scheme. \command{\link{cvloss}} returns an
- \code{cvloss.tvcm} object with the following essential components:
-
- \item{grid}{a list with two matrices \code{dfsplit} and
- \code{nsplit}. Specifies the grid of values at which the
- cross-validated loss was evaluated.}
- \item{loss}{a list with two matrices \code{dfsplit} and
- \code{nsplit}. The cross-validated loss for each fold corresponding
- to the values in \code{grid}.}
- \item{dfsplit.min}{numeric scalar. The tuning parameter which
- minimizes the cross-validated loss.}
+\value{\command{\link{prune}} returns a \command{\link{tvcm}} object,
+ \command{\link{folds_control}} returns a list of parameters for building a
+ cross-validation scheme. \command{\link{cvloss}} returns a
+ \code{cvloss.tvcm} object with at least the following components:
+
+ \item{grid}{a list with values for \code{cp}.}
+ \item{oobloss}{a matrix recording the validated loss for each value in
+ \code{grid} for each fold.}
+ \item{cp.hat}{numeric scalar. The tuning parameter which
+ minimizes the cross-validated error.}
 \item{folds}{the used folds to extract the learning and the validation
 sets.}
- Further, \command{\link{oobloss}} returns the loss of the \code{newdata}
- validation set and \command{\link{prune}} returns a (possibly modified)
- \command{\link{tvcm}} object.
+ \command{\link{oobloss}} returns a scalar representing the total prediction
+ error for \code{newdata}.
 }

\references{

diff --git a/man/tvcm-methods.Rd b/man/tvcm-methods.Rd
index 17c726a..ed7df9d 100644
--- a/man/tvcm-methods.Rd
+++ b/man/tvcm-methods.Rd
@@ -3,6 +3,7 @@
 \alias{tvcm-methods}
 \alias{coef.tvcm}
 \alias{coefficients.tvcm}
+\alias{depth.tvcm}
 \alias{extract}
 \alias{extract.tvcm}
 \alias{formula.tvcm}
@@ -23,6 +24,7 @@
 \alias{splitpath.tvcm}
 \alias{summary.tvcm}
 \alias{weights.tvcm}
+\alias{width.tvcm}

 \title{Methods for \command{\link{tvcm}} objects}

@@ -32,10 +34,12 @@
 \usage{
 \method{coef}{tvcm}(object, ...)

+\method{depth}{tvcm}(x, root = FALSE, ...)
+
 \method{extract}{tvcm}(object, what = c(
   "control", "model", "nodes", "sctest", "p.value",
-  "lossgrid", "selected",
+  "devgrid", "cv", "selected",
   "coef", "sd", "var"),
   steps = NULL, ...)

@@ -53,10 +57,13 @@
 \method{summary}{tvcm}(object, ...)

+\method{width}{tvcm}(x, ...)
 }

 \arguments{
-  \item{object, tree}{an object of class \command{\link{tvcm}}.}
+  \item{object, tree, x}{an object of class \command{\link{tvcm}}.}
+  \item{root}{logical scalar. Should the root node be counted by
+    \code{depth}?}
   \item{steps}{integer vector. The iteration steps from which
     information should be extracted.}
   \item{newdata}{an optional data frame in which to look for variables

@@ -89,6 +96,12 @@
   several information, such as the loss reduction of new splits
   during partitioning or the loss reduction of collapsing an inner
   node when pruning.
+
+  \command{\link{summary}} computes summary statistics of the fitted model,
+  including the estimated coefficients. The varying coefficients are printed
+  in the form of a decision tree. Notice that in cases where there is no
+  split for a varying coefficient, the average coefficient will appear
+  among the fixed effects.
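+
+  For illustration, a brief sketch of the new methods (assuming a fitted
+  \command{\link{tvcm}} object \code{model} as in the examples of
+  \command{\link{tvcm}}; \code{depth} and \code{width} are assumed to
+  follow the \pkg{partykit} conventions, i.e. tree depth and number of
+  terminal nodes):
+
+  \preformatted{
+  depth(model)             # depth of the fitted partitions
+  width(model)             # number of terminal nodes per partition
+  extract(model, "cv")     # cross-validation results, if available
+  summary(model)           # coefficients, incl. the varying coefficients
+  }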
 Further undocumented, available methods are:
 \command{\link{fitted}}, \command{\link{formula}}, \command{\link{getCall}},

diff --git a/man/tvcm-plot.Rd b/man/tvcm-plot.Rd
index 9dd3c8e..b7cd474 100644
--- a/man/tvcm-plot.Rd
+++ b/man/tvcm-plot.Rd
@@ -14,8 +14,7 @@
 \usage{
 \method{plot}{tvcm}(x, type = c("default", "coef",
   "simple", "partdep", "cv"),
-  main = NULL, part = NULL,
-  drop_terminal = TRUE,
+  main, part = NULL, drop_terminal = TRUE,
   tnex = 1, newpage = TRUE, ask = NULL,
   pop = TRUE, gp = gpar(), ...)

@@ -49,8 +48,8 @@ panel_coef(object, parm = NULL,
   \item{pop}{a logical whether the viewport tree should be popped
     before return.}
   \item{gp}{graphical parameters. See \command{\link{gpar}}.}
-  \item{part}{integer scalar. The partition i.e. varying coefficient
-    component to be plotted.}
+  \item{part}{integer or letter. The partition, i.e. the varying
+    coefficient component, to be plotted.}
   \item{parm}{character vector (\command{\link{panel_partdep}} and
     \command{\link{panel_coef}}) or list of character vectors
     (\command{\link{panel_coef}}) with names of model

diff --git a/man/tvcm.Rd b/man/tvcm.Rd
index 12d2dbd..29daf83 100644
--- a/man/tvcm.Rd
+++ b/man/tvcm.Rd
@@ -1,32 +1,20 @@
 \name{tvcm}
 \alias{tvcm}
-\alias{tvcolmm}
-\alias{tvcglm}

 \title{Tree-based varying coefficient regression models}

 \description{
-  TVCM is a tree-based algorithm that aims to estimate varying
-  coefficient regression models. TVCM approximates varying coefficients
-  by piecewise constant functions, i.e., it estimates linear models with
-  stratum specific coefficients. The \command{\link{tvcm}} function
-  implements the partitioning algorithms described in Buergin and
-  Ritschard (2014b) (default) and Buergin and Ritschard (2014a).
+  \command{\link{tvcm}} is the general implementation for tree-based
+  varying coefficient regression. It may be used to combine the two
+  algorithms \command{\link{tvcolmm}} and \command{\link{tvcglm}}.
 }

 \usage{
 tvcm(formula, data, fit, family, weights, subset, offset, na.action,
      control = tvcm_control(), ...)
-
-tvcolmm(formula, data, family = cumulative(),
-        weights, subset, offset, na.action,
-        control = tvcm_control(), ...)
-
-tvcglm(formula, data, family,
-       weights, subset, offset, na.action,
-       control = tvcm_control(), ...)
 }

 \arguments{
@@ -56,60 +44,22 @@
 }

 \details{
-  The TVCM partitioning algorithm works as follows: Starting with
-  \eqn{M_k = 1} stratum (i.e. node) for all \eqn{K} \command{\link{vc}}
-  terms, the algorithm splits in each iteration one of the current
-  \eqn{K \sum_{k=1}^K M_k} nodes into two new nodes. For selecting the
-  \code{vc} term, the node, the variable and the cutpoint in each
-  iteration, there are two procedures available.
-
-  The first and default procedure (cf. Buergin and Ritschard, 2014b) is
-  a two-stage procedure which builds overly fine partitions in the first
-  stage and selects the best-sized partitions by pruning in the second
-  stage. For the second stage, which is automatically processed, we
-  refer to \command{\link{tvcm-assessment}}. The partitioning stage
-  selects, in each iteration, the split that maximizes the penalized
-  loss reduction statistic that compares the current with the one-step
-  ahead model (see also argument \code{lossfun} in
-  \command{\link{tvcm_control}}). By default, the penalized loss
-  reduction statistic is the Akaike Information Criterion (Akaike,
-  1974), but alternatives could be specified by the \code{control}
-  argument. The algorithm is continued to build until the criteria
-  specified in \code{control} are reached. By default, only the
-  \code{minsize} (minimum node size) criteron are specified (andthat the
-  penalized loss reduction is larger than 0). For large data sets with
-  many partitioning variables, this can be slow and, therefore,
-  \command{\link{tvcm_control}} provides further criteria. In
-  particular, you may increase the minimum permitted penalized loss
-  reduction by \code{dfsplit}.
-
-  The second procedure (cf. Buergin and Ritschard, 2014a) is, for
-  technical reasons, restricted to the cases where a single
-  \command{\link{vc}} term is used and non of the moderators
-  (partitioning variables) intersects with predictors. On the other
-  hand, the second procedure avoids the variable selection bias that
-  exhibits the exhaustive search above and is computationally much less
-  burdensome. The procedure selects the split by first choosing the node
-  and the variable with M-fluctuation tests (cf. Zeileis and Hornik,
-  2007) and second choosing the cutpoint by the deviance reduction
-  statistic, as above. To use this option it is necessary to set
-  \code{sctest=TRUE} in \command{\link{tvcm_control}}. The algorithm is
-  stopped as soon as all nodewise Bonferroni corrected p-values of
-  M-fluctuation tests reach a prespecified threshold (e.g., 0.05), see
-  argument \code{alpha} in \command{\link{tvcm_control}} and no pruning
-  is necessary (or optional). Note that, as explained in Buergin and
-  Ritschard (2014a), coefficient constancy tests are adjusted for
-  intra-subject correlation for 2-stage models, see
-  \command{\link{estfun.olmm}}. The procedure is illustrated below in
-  example 2.
-
-  An alternative tree-based algorithm to \command{\link{tvcm}} are the
+  TVCM partitioning works as follows: In each iteration we fit the
+  current model and select a binary split for one of the current
+  terminal nodes. The selection requires four decisions: the \code{vc}
+  term, the node, the variable and the cutpoint in the selected
+  variable. The algorithm starts with \eqn{M_k = 1} node for each of the
+  \eqn{K} \command{\link{vc}} terms and iterates until the criteria
+  defined by \code{control} are reached, see
+  \command{\link{tvcm_control}}. For the specific split selection
+  criteria, see \command{\link{tvcolmm}} and \command{\link{tvcglm}}.
+
+  Alternative tree-based algorithms to \command{\link{tvcm}} are the
 MOB (Zeileis et al., 2008) and the PartReg (Wang and Hastie, 2014)
 algorithms. The MOB algorithm is implemented by the \command{mob}
-  function in the packages \pkg{party} and \pkg{partykit}. For
-  alternative, smoothing splines and kernel regression approaches to
-  varying coefficients, see the packages \pkg{mgcv},
-  \pkg{svcm},\pkg{mboost} or \pkg{np}.
+  function in the packages \pkg{party} and \pkg{partykit}. For smoothing
+  splines and kernel regression approaches to varying coefficients, see
+  the packages \pkg{mgcv}, \pkg{svcm}, \pkg{mboost} or \pkg{np}.

 The \command{\link{tvcm}} function builds on the software
 infrastructure of the \pkg{partykit} package. The authors are grateful

@@ -130,52 +80,38 @@ tvcglm(formula, data, family,
 \code{model}.}
 }

-\references{
-  Zeileis, A., Hothorn, T., and Hornik, K. (2008). Model-Based
-  Recursive Partitioning. \emph{Journal of Computational and Graphical
-  Statistics}, \bold{17}(2), 492--514.
-
-  Zeileis, A., Hornik, K. (2007), Generalized M-Fluctuation Tests for
-  Parameter Instability, \emph{Statistica Neerlandica}, \bold{61},
-  488--508. doi:10.1111/j.1467-9574.2007.00371.x.
-
-  Torsten Hothorn, Achim Zeileis (2014). partykit: A Modular Toolkit
-  for Recursive Partytioning in R. Working Paper 2014-10. Working
-  Papers in Economics and Statistics, Research Platform Empirical and
-  Experimental Economics, Universitaet Innsbruck. URL
-  \url{http://EconPapers.RePEc.org/RePEc:inn:wpaper:2014-10}
-
-  Wang, J. C., Hastie, T. (2013), Boosted Varying-Coefficient
-  Regression Models for Product Demand Prediction, \emph{Journal of
-  Computational and Graphical Statistics}.
-
-  Buergin R. and Ritschard G. (2014a), Tree-based varying coefficient
-  regression for longitudinal ordinal responses. Submitted article.
-
-  Buergin R. and Ritschard G. (2014b), Coefficient-wise tree-based
-  varying coefficient regression with vcrpart. Article in progress.
-
-  Akaike H. (1974), A new look at the statistical model identification,
-  \emph{IEEE Transactions on Automatic Control}, \bold{19}, 716--723.
+\references{
+  Zeileis, A., Hothorn, T., and Hornik, K. (2008). Model-Based
+  Recursive Partitioning. \emph{Journal of Computational and Graphical
+  Statistics}, \bold{17}(2), 492--514.
+
+  Wang, J. C. and Hastie, T. (2014), Boosted Varying-Coefficient
+  Regression Models for Product Demand Prediction, \emph{Journal of
+  Computational and Graphical Statistics}, \bold{23}, 361--382.
+
+  Hothorn, T. and Zeileis, A. (2014). partykit: A Modular Toolkit
+  for Recursive Partytioning in R. Working Paper 2014-10. \emph{Working
+  Papers in Economics and Statistics, Research Platform Empirical and
+  Experimental Economics, Universitaet Innsbruck}. URL
+  \url{http://EconPapers.RePEc.org/RePEc:inn:wpaper:2014-10}
+
+  Buergin, R. and Ritschard, G. (2014a), Tree-based varying coefficient
+  regression for longitudinal ordinal responses. Article in progress.
+
+  Buergin, R. and Ritschard, G. (2014b), Coefficient-wise tree-based
+  varying coefficient regression with vcrpart. Article in progress.
 }

-\seealso{\command{\link{tvcm_control}}, \command{\link{tvcm-methods}},
-  \command{\link{tvcm-plot}}, \command{\link{tvcm-plot}},
-  \command{\link{tvcm-assessment}}}
+\seealso{\command{\link{tvcolmm}}, \command{\link{tvcglm}},
+  \command{\link{tvcm_control}}, \command{\link{tvcm-methods}},
+  \command{\link{tvcm-plot}}, \command{\link{tvcm-assessment}}}

 \examples{
 ## ------------------------------------------------------------------- #
 ## Example 1: Moderated effect of education on poverty
 ##
-## The algorithm is used to find out whether the effect of high
-## education 'EduHigh' on poverty 'Poor' is moderated by the civil
-## status 'CivStat'. We specify two 'vc' terms in the logistic
-## regression model for 'Poor': a first that accounts for the direct
-## effect of 'CivStat' and a second that accounts for the moderation of
-## 'CivStat' on the relation between 'EduHigh' and 'Poor'. We use here
-## the 2-stage procedure with a partitioning- and a pruning stage as
-## described in Buergin and Ritschard (2014b).
+## See the help of 'tvcglm'.
 ## ------------------------------------------------------------------- #

 data(poverty)
@@ -183,10 +119,10 @@ poverty$EduHigh <- 1 * (poverty$Edu == "high")

 ## fit the model
 model.Pov <-
-  tvcglm(Poor ~ -1 + vc(CivStat) + vc(CivStat, by = EduHigh) + NChild,
+  tvcm(Poor ~ -1 + vc(CivStat) + vc(CivStat, by = EduHigh) + NChild,
        family = binomial(), data = poverty, subset = 1:200,
-       control = tvcm_control(verbose = TRUE,
-         folds = folds_control(K = 1, type = "subsampling", seed = 4)))
+       control = tvcm_control(verbose = TRUE,
+         folds = folds_control(K = 1, type = "subsampling", seed = 7)))

 ## diagnosis
 plot(model.Pov, "cv")
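+
+## illustration: prune the fitted model with an explicit penalty; the
+## value 0.3 is arbitrary here, see 'tvcm-assessment' for choosing 'cp'
+## by cross-validation
+model.Pov.pr <- prune(model.Pov, cp = 0.3)
+summary(model.Pov.pr)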
@@ -199,21 +135,18 @@ prunepath(model.Pov, steps = 1)
 ## ------------------------------------------------------------------- #
 ## Example 2: Moderated effect of unemployment
 ##
-## Here we fit a varying coefficient ordinal linear mixed on the
-## synthetic ordinal longitudinal data 'unemp'. The interest is whether
-## the effect of unemployment 'UNEMP' on happiness 'GHQL' is moderated
-## by 'AGE', 'FISIT', 'GENDER' and 'UEREGION'. 'FISIT' is the only true
-## moderator. For the the partitioning we coefficient constancy tests,
-## as described in Buergin and Ritschard (2014a)
+## See the help of 'tvcolmm'.
 ## ------------------------------------------------------------------- #

 data(unemp)

 ## fit the model
 model.UE <-
-  tvcolmm(GHQL ~ -1 +
+  tvcm(GHQL ~ -1 +
        vc(AGE, FISIT, GENDER, UEREGION, by = UNEMP, intercept = TRUE) +
-       re(1|PID), data = unemp, control = tvcm_control(sctest = TRUE))
+       re(1|PID),
+       data = unemp, control = tvcm_control(sctest = TRUE),
+       family = cumulative())

 ## diagnosis (no cross-validation was performed since 'sctest = TRUE')
 plot(model.UE, "coef")

diff --git a/man/tvcolmm.Rd b/man/tvcolmm.Rd
new file mode 100644
index 0000000..9800083
--- /dev/null
+++ b/man/tvcolmm.Rd
@@ -0,0 +1,190 @@
+\name{tvcolmm}
+
+\alias{tvcolmm}
+\alias{tvcolmm_control}
+
+\title{Tree-based varying coefficient regression based on ordinal and
+  nominal two-stage linear mixed models}
+
+\description{The \command{\link{tvcolmm}} function implements the
+  tree-based longitudinal varying coefficient regression algorithm
+  proposed in Buergin and Ritschard (2014a). The algorithm approximates
+  varying fixed coefficients in the cumulative logit mixed model by
+  (multivariate) piecewise constant functions using recursive
+  partitioning, i.e., it estimates the fixed effect component of the
+  model separately for strata of the value space of partitioning
+  variables.
+}
+
+\usage{
+tvcolmm(formula, data, family = cumulative(),
+        weights, subset, offset, na.action,
+        control = tvcolmm_control(), ...)
+
+tvcolmm_control(alpha = 0.05, bonferroni = TRUE, minsize = 50,
+                maxnomsplit = 5, maxordsplit = 9, maxnumsplit = 9,
+                trim = 0.1, estfun.args = list(), nimpute = 5,
+                seed = NULL, ...)
+}
+
+\arguments{
+  \item{formula}{a symbolic description of the model to fit, e.g.,
+
+    \code{y ~ -1 + vc(z1, \ldots, zL, by = x1 + \ldots + xP, intercept =
+    TRUE) + re(1|id)}
+
+    where the \code{vc} term specifies the varying fixed
+    coefficients. Only one such \code{vc} term is allowed with
+    \command{\link{tvcolmm}}. The example formula removes the global
+    intercept and adds a locally varying intercept by setting
+    \code{intercept = TRUE} in the \code{vc} term. For details, see
+    \command{\link{olmm}} and \command{\link{vcrpart-formula}}.}
+  \item{family}{the model family. An object of class
+    \command{\link{family.olmm}}.}
+  \item{data}{a data frame containing the variables in the model.}
+  \item{weights}{an optional numeric vector of weights to be used in the
+    fitting process.}
+  \item{subset}{an optional logical or integer vector specifying a
+    subset of \code{'data'} to be used in the fitting process.}
+  \item{offset}{this can be used to specify an a priori known component
+    to be included in the linear predictor during fitting.}
+  \item{na.action}{a function that indicates what should happen if data
+    contain \code{NA}s. See \command{\link{na.action}}.}
+  \item{control}{a list with control parameters as returned by
+    \command{\link{tvcolmm_control}}.}
+  \item{alpha}{numeric significance threshold between 0 and 1. A node is
+    split when the smallest (possibly Bonferroni-corrected)
+    \eqn{p}-value for any coefficient constancy test in the current step
+    falls below \code{alpha}.}
+  \item{bonferroni}{logical. Indicates whether \eqn{p}-values of
+    coefficient constancy tests should be Bonferroni
+    corrected. See details.}
+  \item{minsize}{numeric scalar. The minimum sum of weights in terminal
+    nodes.}
+  \item{maxnomsplit, maxordsplit, maxnumsplit}{integer scalars for split
+    candidate reduction. See \command{\link{tvcm_control}}.}
+  \item{trim}{numeric between 0 and 1. Specifies the trimming parameter
+    in coefficient constancy tests for continuous partitioning
+    variables. See also the argument \code{from} of function
+    \code{supLM} in package \pkg{strucchange}.}
+  \item{estfun.args}{list of arguments to be passed to
+    \command{\link{gefp.olmm}}. See details.}
+  \item{nimpute}{a positive integer scalar. The number of times
+    coefficient constancy tests should be repeated in each
+    iteration. See details.}
+  \item{seed}{an integer specifying which seed should be set at the
+    beginning.}
+  \item{\ldots}{additional arguments passed to the fitting function
+    \code{fit} or to \command{\link{tvcm_control}}.}
+}
+
+\details{
+  The TVCOLMM algorithm iterates the following steps:
+
+  \enumerate{
+    \item Fit the current mixed model
+
+    \code{y ~ Node:x1 + \ldots + Node:xP + re(1 + w1 + \ldots |id)}
+
+    with \command{\link{olmm}}, where \code{Node} is a categorical
+    variable with terminal node labels \code{1}, \ldots, \code{M}.
+
+    \item Test for the constancy of the fixed effects \code{Node:x1,
+    \ldots} separately for each moderator \code{z1}, \ldots, \code{zL}
+    in each node \code{1}, \ldots, \code{M}. This yields \code{L} times
+    \code{M} (possibly Bonferroni-corrected) \eqn{p}-values for
+    rejecting coefficient constancy.
+
+    \item If the minimum \eqn{p}-value is smaller than \code{alpha},
+    select the node and the variable corresponding to the minimum
+    \eqn{p}-value. Then search for the optimal split among the
+    candidate splits in the selected node and variable by an exhaustive
+    likelihood-maximizing grid search, and incorporate it.
+
+    \item Otherwise, if the minimum \eqn{p}-value exceeds \code{alpha},
+    stop the algorithm and return the current model.
+  }
+
+  The implemented coefficient constancy tests used for node and variable
+  selection (step 2) are based on the M-fluctuation tests of Zeileis and
+  Hornik (2007), using the observation scores of the fitted mixed
+  model. These observation scores can be extracted by
+  \command{\link{estfun.olmm}} for models fitted with
+  \command{\link{olmm}}. To deal with intra-individual correlations
+  between such observation scores, the \command{\link{estfun.olmm}}
+  function decorrelates the observation scores. In cases of unbalanced
+  data, the pre-decorrelation method requires imputation. \code{nimpute}
+  gives the number of times the coefficient constancy tests are repeated
+  in each iteration. The final \eqn{p}-values are then the averages of
+  the repetitions.
+
+  The algorithm combines the splitting technique of Zeileis et
+  al. (2008) with the technique of Hajjem et al. (2011) and Sela and
+  Simonoff (2012) to incorporate regression trees into mixed models.
+
+  Special attention is given to varying intercepts, i.e. the terms that
+  account for the direct effects of the moderators. A common
+  specification is
+
+  \code{y ~ -1 + vc(z1, \ldots, zL, by = x1 + \ldots + xP, intercept =
+  TRUE) + re(1 + w1 + \ldots |id)}
+
+  Doing so replaces the global intercept by local intercepts.
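+
+  For example, a call with explicit control parameters could look as
+  follows (the values are illustrative only; the data and variables are
+  those of the example below):
+
+  \preformatted{
+  tvcolmm(GHQL ~ -1 +
+          vc(AGE, FISIT, GENDER, UEREGION, by = UNEMP, intercept = TRUE) +
+          re(1|PID), data = unemp,
+          control = tvcolmm_control(alpha = 0.01, minsize = 100,
+                                    nimpute = 10))
+  }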
+ }
+
+\value{An object of class \command{\link{tvcm}}.
+}
+
+\references{
+  Zeileis, A., Hothorn, T., and Hornik, K. (2008). Model-Based
+  Recursive Partitioning. \emph{Journal of Computational and Graphical
+  Statistics}, \bold{17}(2), 492--514.
+
+  Zeileis, A., Hornik, K. (2007), Generalized M-Fluctuation Tests for
+  Parameter Instability, \emph{Statistica Neerlandica}, \bold{61},
+  488--508. doi:10.1111/j.1467-9574.2007.00371.x.
+
+  Buergin, R. and Ritschard, G. (2014a), Tree-based varying coefficient
+  regression for longitudinal ordinal responses. Article in progress.
+
+  Sela, R. and Simonoff, J. S. (2012). RE-EM trees: a data mining
+  approach for longitudinal and clustered data, \emph{Machine Learning},
+  \bold{86}, 169--207.
+
+  Hajjem, A., Bellavance, F. and Larocque, D. (2011), Mixed effects
+  regression trees for clustered data, \emph{Statistics & Probability
+  Letters}, \bold{81}, 451--459.
+}
+
+\seealso{\command{\link{tvcm_control}}, \command{\link{tvcm-methods}},
+  \command{\link{tvcm-plot}}, \command{\link{glm}}}
+
+\examples{
+## ------------------------------------------------------------------- #
+## Example 1: Moderated effect of unemployment
+##
+## Here we fit a varying coefficient ordinal linear mixed model on the
+## synthetic ordinal longitudinal data 'unemp'. The interest is whether
+## the effect of unemployment 'UNEMP' on happiness 'GHQL' is moderated
+## by 'AGE', 'FISIT', 'GENDER' and 'UEREGION'. 'FISIT' is the only true
+## moderator. For the partitioning we use coefficient constancy tests,
+## as described in Buergin and Ritschard (2014a).
+## ------------------------------------------------------------------- #
+
+data(unemp)
+
+## fit the model
+model.UE <-
+  tvcolmm(GHQL ~ -1 +
+          vc(AGE, FISIT, GENDER, UEREGION, by = UNEMP, intercept = TRUE) +
+          re(1|PID), data = unemp)
+
+## diagnosis
+plot(model.UE, "coef")
+summary(model.UE)
+splitpath(model.UE, steps = 1, details = TRUE)
+}
+
+\author{Reto Buergin}
+
+\keyword{tree}
\ No newline at end of file

diff --git a/src/init.c b/src/init.c
index b797bd0..47bd24d 100644
--- a/src/init.c
+++ b/src/init.c
@@ -1,5 +1,6 @@
 #include "olmm.h"
 #include "utils.h"
+#include "tvcm.h"
 #include <R_ext/Rdynload.h>

 #define CALLDEF(name, n) {#name, (DL_FUNC) &name, n}

@@ -10,7 +11,7 @@ static R_CallMethodDef CallEntries[] = {
   CALLDEF(olmm_update_u, 1),
   CALLDEF(olmm_pred_marg, 5),
   CALLDEF(vcrpart_duplicate, 1),
-  CALLDEF(tvcm_nomsplits, 4),
+  CALLDEF(tvcm_nomsplits, 1),
   {NULL, NULL, 0}
 };

diff --git a/src/olmm.c b/src/olmm.c
index 633254f..3b68422 100644
--- a/src/olmm.c
+++ b/src/olmm.c
@@ -1,4 +1,5 @@
 #include "olmm.h"
+#include "utils.h"
 #include
 #include
 #include

@@ -103,17 +104,6 @@ double olmm_GLink(double x, int link) {
   }
 }

-SEXP getListElement(SEXP list, const char *str) {
-  SEXP elmt = R_NilValue,
-    names = getAttrib(list, R_NamesSymbol);
-  for (int i = 0; i < length(list); i++)
-    if(strcmp(CHAR(STRING_ELT(names, i)), str) == 0) {
-      elmt = VECTOR_ELT(list, i);
-      break;
-    }
-  return elmt;
-}
-
 /**
  * ---------------------------------------------------------
  * Store a new parameter vector in a olmm object
  * ---------------------------------------------------------
 */

@@ -204,15 +194,22 @@ SEXP olmm_update_marg(SEXP x, SEXP par) {
   R_CheckStack();

   /* numeric valued objects */
-  double *X = X_SLOT(x), *W = W_SLOT(x),
-    *weights_obs = WEIGHTS_SLOT(x), *weights_sbj = WEIGHTSSBJ_SLOT(x),
-    *offset = OFFSET_SLOT(x), *eta = ETA_SLOT(x),
-    *fixef = FIXEF_SLOT(x), *ranefCholFac = RANEFCHOLFAC_SLOT(x),
-    *logLik_sbj = LOGLIKSBJ_SLOT(x), *logLik = LOGLIK_SLOT(x),
-    *score_obs = SCOREOBS_SLOT(x), *score_sbj = SCORESBJ_SLOT(x),
+  double *X = X_SLOT(x),
+    *W = W_SLOT(x),
+    *weights_obs = WEIGHTS_SLOT(x),
+    *weights_sbj = WEIGHTSSBJ_SLOT(x),
+    *offset = OFFSET_SLOT(x),
+    *eta = ETA_SLOT(x),
+    *fixef = FIXEF_SLOT(x),
+    *ranefCholFac = RANEFCHOLFAC_SLOT(x),
+    *logLik_sbj = LOGLIKSBJ_SLOT(x),
+    *logLik = LOGLIK_SLOT(x),
+    *score_obs = SCOREOBS_SLOT(x),
+    *score_sbj = SCORESBJ_SLOT(x),
     *score = SCORE_SLOT(x),
     *info = INFO_SLOT(x),
-    *ghw = GHW_SLOT(x), *ghx = GHX_SLOT(x),
+    *ghw = GHW_SLOT(x),
+    *ghx = GHX_SLOT(x),
     *ranefElMat = RANEFELMAT_SLOT(x);
   R_CheckStack();

diff --git a/src/tvcm.c b/src/tvcm.c
a/src/tvcm.c b/src/tvcm.c new file mode 100644 index 0000000..5a3f63e --- /dev/null +++ b/src/tvcm.c @@ -0,0 +1,43 @@ +#include "tvcm.h" + +/** + * --------------------------------------------------------- + * Computes the split matrix for nominal variables for + * the 'tvcm' algorithm. + * + * @param nLevs total number of categories. + * @param xdlev integer vector levels observed have a + 1 and the other a 0 + * @param nCuts number of possible splits with the categories + * observed in the current node. + * + * @return an integer vector of length nCuts x nLevs to be used + * to construct the split matrix. + * --------------------------------------------------------- + */ + +SEXP tvcm_nomsplits(SEXP zdlev) { + zdlev = PROTECT(coerceVector(zdlev, INTSXP)); + int *rzdlev = INTEGER(zdlev); + int nLevs = length(zdlev), nLevsD = 0, nCuts = 1; + for (int l = 0; l < nLevs; l++) nLevsD += rzdlev[l]; + for (int l = 0; l < (nLevsD - 1); l++) nCuts *= 2; + nCuts = nCuts - 1; + SEXP indz = PROTECT(allocMatrix(INTSXP, nCuts, nLevs)); + int *rindz = INTEGER(indz); + for (int i = 0; i < (nLevs * nCuts); i++) rindz[i] = 0; + int ii = 0; + if (nCuts > 0) { + for (int i = 0; i < nCuts; i++) { + ii = i + 1; + for (int l = 0; l < nLevs; l++) { + if (rzdlev[l] > 0) { + rindz[i + nCuts * l] = ii % 2; + ii = ii / 2; + } + } + } + } + UNPROTECT(2); + return indz; +} diff --git a/src/tvcm.h b/src/tvcm.h new file mode 100644 index 0000000..df84c3c --- /dev/null +++ b/src/tvcm.h @@ -0,0 +1,9 @@ +#ifndef VCRPART_TVCM_H +#define VCRPART_TVCM_H + +#include +#include + +SEXP tvcm_nomsplits(SEXP zdlev); + +#endif diff --git a/src/utils.c b/src/utils.c index 0af1c4c..516a8f6 100644 --- a/src/utils.c +++ b/src/utils.c @@ -19,42 +19,22 @@ SEXP vcrpart_duplicate(SEXP x) { /** * --------------------------------------------------------- - * Computes the split matrix for nominal variables for - * the 'tvcm' alorithm. + * Get list elements * - * @param xlev integer vector, all observed categories. - * @param nl total number of categories. - * @param xdlev integer vector, the levels observed. - * in the current node. - * @param nld number of categories observed in the current - * node. - * @param mi number of possible splits with the categories - * observed in the current node. + * @param list a list. + * @param str a character string. * - * @return an integer vector of length nld * mi to be used - * to construct the split matrix. 
+ * @return a list element
  * ---------------------------------------------------------
  */
-SEXP tvcm_nomsplits(SEXP nl, SEXP xdlev, SEXP nld, SEXP mi) {
-
-  int *xdlev_c = INTEGER(xdlev);
-  const int nl_c = INTEGER(nl)[0],
-    nld_c = INTEGER(nld)[0],
-    mi_c = INTEGER(mi)[0];
-
-  SEXP indx = PROTECT(allocVector(INTSXP, nl_c * mi_c));
-  for (int i = 0; i < (nl_c * mi_c); i++) INTEGER(indx)[i] = 0;
-  int ii = 0;
-  if (mi_c > 0) {
-    for (int i = 0; i < mi_c; i++) {
-      ii = i + 1;
-      for (int l = 0; l < nld_c; l++) {
-        INTEGER(indx)[i * nl_c + xdlev_c[l] - 1] = ii % 2;
-        ii = ii / 2;
-      }
+SEXP getListElement(SEXP list, const char *str) {
+  SEXP elmt = R_NilValue,
+    names = getAttrib(list, R_NamesSymbol);
+  for (int i = 0; i < length(list); i++)
+    if(strcmp(CHAR(STRING_ELT(names, i)), str) == 0) {
+      elmt = VECTOR_ELT(list, i);
+      break;
+    }
-  }
-  UNPROTECT(1);
-  return indx;
+  return elmt;
 }

diff --git a/src/utils.h b/src/utils.h
index 2c2184a..832e326 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -5,6 +5,6 @@
 #include

 SEXP vcrpart_duplicate(SEXP x);
-SEXP tvcm_nomsplits(SEXP nl, SEXP xdlev, SEXP nld, SEXP mi);
+SEXP getListElement(SEXP list, const char *str);

 #endif
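
For reference, the enumeration implemented by tvcm_nomsplits can be mimicked
in plain R. The sketch below ('nomsplits' is a hypothetical helper, not part
of vcrpart) builds the same 0/1 matrix: each row encodes one candidate binary
split of the levels observed in the current node.

## plain-R sketch of the enumeration in src/tvcm.c: row i holds the
## binary representation of i, written into the columns of observed levels
nomsplits <- function(zdlev) {
  nLevs  <- length(zdlev)
  nLevsD <- sum(zdlev)                  # levels observed in the node
  nCuts  <- max(0, 2^(nLevsD - 1) - 1)  # distinct binary partitions
  indz   <- matrix(0L, nrow = nCuts, ncol = nLevs)
  for (i in seq_len(nCuts)) {
    ii <- i
    for (l in seq_len(nLevs)) {
      if (zdlev[l] > 0) {
        indz[i, l] <- ii %% 2
        ii <- ii %/% 2
      }
    }
  }
  indz
}

nomsplits(c(1, 1, 0, 1))   # 3 candidate splits of the 3 observed levels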