diff --git a/.Rbuildignore b/.Rbuildignore index ff134b2..c8cd602 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -9,3 +9,4 @@ docs/ pkgdown/ _pkgdown.yaml ^CRAN-SUBMISSION$ +dev/ diff --git a/DESCRIPTION b/DESCRIPTION index 1beb155..f382535 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ggbiplot Type: Package Title: A Grammar of Graphics Implementation of Biplots -Version: 0.6.3 -Date: 2024-04-24 +Version: 0.6.5 +Date: 2025-09-16 Authors@R: c( person(c("Vincent", "Q."), "Vu", , "vqv@stat.osu.edu", role = c("aut"), comment = c(ORCID = "0000-0002-4689-0497")), @@ -18,7 +18,7 @@ Description: A 'ggplot2' based implementation of biplots, giving a representatio biplot and scree plot methods which can be used with the results of prcomp(), princomp(), FactoMineR::PCA(), ade4::dudi.pca() or MASS::lda() and can be customized using 'ggplot2' techniques. Depends: - R (>= 3.5.0), + R (>= 4.1.0), ggplot2 Imports: scales diff --git a/NEWS.md b/NEWS.md index cba051d..4f88526 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,16 @@ +## Version 0.6.5 + +- Now Depends R (>= 4.1.0) to handle |> +- Added lda() objects to those handled by reflect() +- Illustrate reflection in ggbiplot examples + +## Version 0.6.4 + +- Documented solution to 'scale_color_discrete() produces two legends' #2 +- Fix glitch with axis labels +- `ggbiplot` gains `geom.ind` and `geom.var` arguments for more flexlible handling of the geometries used to display the +observation points and variable labels. + ## Version 0.6.3 - Fix axis label spacing diff --git a/R/ggbiplot.r b/R/ggbiplot.r index 2f2d073..41ca4dd 100644 --- a/R/ggbiplot.r +++ b/R/ggbiplot.r @@ -77,6 +77,21 @@ #' values is assigned, contains the so-called \emph{standard coordinates} and have sum of squared #' values equal to 1.0. #' +#' \bold{Scales and legend} +#' +#' When the `groups` argument is not \code{NULL}, the function uses that value to set the aesthetics for `color`, `fill` and `shape`. +#' If you override the defaults using \code{scale_color_discrete}, etc., you may find that duplicate legends are produced. +#' To avoid this, you need to have the same name for aesthetics to be merged in the legend, for example, +#' \preformatted{ +#' scale_fill_discrete(name = 'Species') +#' scale_color_discrete(name = 'Species') +#' } +#' or, +#' \preformatted{ +#' labs(fill = "Species", color = "Species") +#' } +#' +#' #' @param pcobj an object returned by \code{\link[stats]{prcomp}}, \code{\link[stats]{princomp}}, #' \code{\link[FactoMineR]{PCA}}, \code{\link[ade4]{dudi.pca}}, or \code{\link[MASS]{lda}} #' @param choices Which components to plot? An integer vector of length 2. @@ -97,6 +112,14 @@ #' @param groups Optional factor variable indicating the groups that the observations belong to. #' If provided the points will be colored according to groups and this allows data ellipses also #' to be drawn when \code{ellipse = TRUE}. +#' @param geom.ind a text specifying the geometry to be used for the observations. Allowed +#' values are among \code{c("point", "text")}. Use +#' \code{"point"} (to show only points); \code{"text"} to show only labels; +#' \code{c("point", "text")} to show both. +#' @param geom.var a text specifying the geometry to be used for the variables. Allowed +#' values are among \code{c("arrow", "text")}. Use +#' \code{"arrow"} (to show only vectors); \code{"text"} to show only labels; +#' \code{c("arrow", "text")} to show both. #' @param point.size Size of observation points. #' @param ellipse Logical; draw a normal data ellipse for each group? #' @param ellipse.prob Coverage size of the data ellipse in Normal probability @@ -131,7 +154,7 @@ #' \code{\link[stats]{biplot}} for the original stats package version; #' \code{\link[factoextra]{fviz_pca_biplot}} for the factoextra package version. #' -#' @author Vincent Q. Vu. +#' @author Vincent Q. Vu., Michael Friendly #' @references #' Gabriel, K. R. (1971). The biplot graphical display of matrices with application to principal component analysis. #' \emph{Biometrika}, \bold{58}, 453–467. \doi{10.2307/2334381}. @@ -157,6 +180,22 @@ #' varname.size = 4, #' groups = wine.class, #' ellipse = TRUE, circle = TRUE) +#' +#' # Easier interpretation if the axes are reflected +#' wine.pca <- reflect(wine.pca) +#' # Use direct labels rather than a legend +#' means <- aggregate(cbind(PC1, PC2) ~ wine.class, +#' data = wine.pca$x, FUN = mean) +#' +#' ggbiplot(wine.pca, +#' obs.scale = 1, var.scale = 1, +#' groups = wine.class, +#' varname.size = 4, +#' ellipse = TRUE, +#' circle = TRUE) + +#' geom_label(data = means, aes(x=PC1, y=PC2, label = wine.class)) + +#' theme(legend.position = 'none') +#' #' #' data(iris) #' iris.pca <- prcomp (~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, @@ -181,6 +220,8 @@ ggbiplot <- function(pcobj, var.scale = scale, var.factor = 1, # MF groups = NULL, + geom.ind = "point", + geom.var = c("arrow", "text"), point.size = 1.5, ellipse = FALSE, ellipse.prob = 0.68, @@ -276,9 +317,9 @@ ggbiplot <- function(pcobj, # Change the title labels for the axes if(obs.scale == 0) { - u.axis.labs <- paste('standardized ', axis.title, choices, sep='') + u.axis.labs <- paste0('standardized ', axis.title, choices) } else { - u.axis.labs <- paste(axis.title, choices, sep='') + u.axis.labs <- paste0(axis.title, choices) } # Append the proportion of explained variance to the axis labels @@ -297,11 +338,11 @@ ggbiplot <- function(pcobj, } # Variable Names - if(varname.abbrev) { - df.v$varname <- abbreviate(rownames(v)) - } else { - df.v$varname <- rownames(v) + vn <- rownames(v) + if(is.logical(varname.abbrev) & isTRUE(varname.abbrev)) { + vn <- abbreviate(vn) } + df.v$varname <- vn # Variables for text label placement df.v$angle <- with(df.v, (180/pi) * atan(yvar / xvar)) @@ -314,18 +355,39 @@ ggbiplot <- function(pcobj, coord_equal(clip = clip) # Draw either labels or points - if(!is.null(df.u$labels)) { + # if(!is.null(df.u$labels)) { + # if(!is.null(df.u$groups)) { + # g <- g + geom_text(aes(label = labels, color = groups), + # size = labels.size) + # } else { + # g <- g + geom_text(aes(label = labels), size = labels.size) + # } + # } else { + # if(!is.null(df.u$groups)) { + # g <- g + geom_point(aes(color = groups), alpha = alpha, size = point.size) + # } else { + # g <- g + geom_point(alpha = alpha, size = point.size) + # } + # } + + # Allow to use points and/or labels + if("point" %in% geom.ind) { if(!is.null(df.u$groups)) { - g <- g + geom_text(aes(label = labels, color = groups), - size = labels.size) + g <- g + geom_point(aes(color = groups, shape = groups), + alpha = alpha, size = point.size) } else { - g <- g + geom_text(aes(label = labels), size = labels.size) + g <- g + geom_point(alpha = alpha, size = point.size) } - } else { + } + if("text" %in% geom.ind) { if(!is.null(df.u$groups)) { - g <- g + geom_point(aes(color = groups), alpha = alpha, size = point.size) + # g <- g + geom_text(aes(label = labels, color = groups), + # size = labels.size, vjust = -0.5) + g <- g + geom_text(aes(label = labels), + size = labels.size, vjust = -0.5) } else { - g <- g + geom_point(alpha = alpha, size = point.size) + g <- g + geom_text(aes(label = labels), + size = labels.size, vjust = -0.5) } } @@ -341,6 +403,7 @@ ggbiplot <- function(pcobj, } # Draw directions + if("arrow" %in% geom.var) { arrow_style <- arrow(length = unit(1/2, 'picas'), type="closed", angle=15) g <- g + geom_segment(data = df.v, @@ -348,6 +411,7 @@ ggbiplot <- function(pcobj, arrow = arrow_style, color = varname.color, linewidth = 1.4) # MR: was 1.2 + } } # Overlay a concentration ellipse if there are groups @@ -355,46 +419,7 @@ ggbiplot <- function(pcobj, theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50)) circle <- cbind(cos(theta), sin(theta)) - # ell <- ddply(df.u, 'groups', function(x) { - # if(nrow(x) <= 2) { - # return(NULL) - # } - # sigma <- var(cbind(x$xvar, x$yvar)) - # mu <- c(mean(x$xvar), mean(x$yvar)) - # ed <- sqrt(qchisq(ellipse.prob, df = 2)) - # data.frame(sweep(circle %*% chol(sigma) * ed, 2, mu, FUN = '+'), - # groups = x$groups[1]) - # }) - # names(ell)[1:2] <- c('xvar', 'yvar') - # ell <- - # df.u |> - # group_by(groups) |> - # filter(n() > 2) |> - # summarize( - # sigma = list(var(cbind(xvar, yvar))), - # mu = list(c(mean(xvar), mean(yvar))), - # ed = sqrt(qchisq(ellipse.prob, df = 2)), - # circle_chol = list(circle %*% chol(sigma[[1]]) * ed), - # ell = list(sweep(circle_chol[[1]], 2, mu[[1]], FUN = "+")), - # xvar = map(ell, ~.x[,1]), - # yvar = map(ell, ~.x[,2]), - # .groups = "drop" - # ) |> - # dplyr::select(xvar, yvar, groups) |> - # tidyr::unnest(c(xvar, yvar)) - - # g <- g + geom_path(data = ell, - # aes(color = groups, - # group = groups), - # linewidth = ellipse.linewidth) - # g <- g + geom_polygon(data = ell, - # aes(color = groups, - # fill = groups - # # group = groups - # ), - # alpha = 0.4, # MF: why doesn't this have any effect? - # linewidth = ellipse.linewidth) # Overlay a concentration ellipse if there are groups geom <- if(isTRUE(ellipse.fill)) "polygon" else "path" @@ -418,12 +443,12 @@ ggbiplot <- function(pcobj, } # Label the variable axes - if(var.axes) { + if(var.axes & "text" %in% geom.var) { g <- g + geom_text(data = df.v, aes(label = varname, x = xvar, y = yvar, angle = angle, hjust = hjust), - color = varname.color, size = varname.size) + color = varname.color, size = varname.size, lineheight = 0.75) } # Change the name of the legend for groups # if(!is.null(groups)) { diff --git a/R/reflect.R b/R/reflect.R index b2aad97..897e551 100644 --- a/R/reflect.R +++ b/R/reflect.R @@ -34,6 +34,13 @@ #' #' crime.pca <- reflect(crime.pca) # reflect columns 1:2 #' biplot(crime.pca) +#' +#' iris.lda <- MASS::lda(Species ~ ., data=iris) +#' #reflect the first dimension +#' iris.lda1 <- reflect(iris.lda, columns = 1) +#' # compare predicted scores +#' predict(iris.lda)$x |> head() +#' predict(iris.lda1)$x |> head() reflect <- function(pcobj, columns = 1:2){ @@ -60,11 +67,10 @@ reflect <- function(pcobj, columns = 1:2){ if ("quanti.sup" %in% names(pcobj)) pcobj$quanti.sup$coord[, columns] <- -1 * pcobj$quanti.sup$coord[, columns] } else if(inherits(pcobj, "lda")) { - warning("Can't reflect an 'lda' object") # Why not??? - # u <- predict(pcobj)$x - # check(u, columns) - # pcobj$scaling[, columns] <- -1 * pcobj$scaling[, columns] - # pcobj$x[, columns] <- -1 * pcobj$x[, columns] + # lda objects don't have a scores (x) component. They come from predict() + u <- predict(pcobj)$x + check(u, columns) + pcobj$scaling[, columns] <- -1 * pcobj$scaling[, columns] } else { stop('Expected a object of class "prcomp", "princomp", "PCA", or "lda"') diff --git a/dev/peng-out-test.R b/dev/peng-out-test.R new file mode 100644 index 0000000..104589c --- /dev/null +++ b/dev/peng-out-test.R @@ -0,0 +1,120 @@ +# How to label points selectively in ggbiplot or factoextra::fviz_pac_biplot + +library(heplots) +library(dplyr) +library(ggplot2) +# library(ggbiplot) +library(factoextra) + +data(peng, package="heplots") +source("C:/R/projects/Vis-MLM-book/R/penguin/penguin-colors.R") + +# find potential multivariate outliers +DSQ <- heplots::Mahalanobis(peng[, 3:6]) +noteworthy <- order(DSQ, decreasing = TRUE)[1:3] |> print() + +peng_plot <- peng |> + tibble::rownames_to_column(var = "id") |> + mutate(note = id %in% noteworthy) + + +peng.pca <- prcomp (~ bill_length + bill_depth + flipper_length + body_mass, + data=peng, scale. = TRUE) + +# create vector of labels, blank except for the noteworthy +lab <- 1:nrow(peng) +lab <- ifelse(lab %in% noteworthy, lab, "") + +col <- c("#F37A00, #6A3D9A, #33a02c") # pengion.colors("dark") +# options(ggplot2.discrete.fill = col, +# ggplot2.discrete.color = col) + +#remotes::install_github("friendly/ggbiplot", ref = "geoms") +ggbiplot(peng.pca, + choices = 3:4, + groups = peng$species, + ellipse = TRUE, ellipse.alpha = 0.1, + circle = TRUE, + var.factor = 4.5, + geom.ind = c("point", "text"), + point.size = 2, + labels = lab, labels.size = 6, + varname.size = 5, + clip = "off") + + theme_minimal(base_size = 14) + + theme_penguins("dark") + + theme(legend.direction = 'horizontal', legend.position = 'top') + +# first two dims +ggbiplot(peng.pca, + choices = 1:2, + groups = peng$species, + ellipse = TRUE, ellipse.alpha = 0.1, + circle = TRUE, + var.factor = 1, + geom.ind = c("point", "text"), + point.size = 1, + labels = lab, labels.size = 6, + varname.size = 5, + clip = "off") + + theme_minimal(base_size = 14) + + theme_penguins("dark") + + scale_shape_discrete() + + theme(legend.direction = 'horizontal', legend.position = 'top') + +#------------------------------------ +# adjust variable names to fold at '_' +vn <- rownames(peng.pca$rotation) +vn <- gsub("_", "\n", vn) +rownames(peng.pca$rotation) <- vn + +ggbiplot(peng.pca, # obs.scale = 1, var.scale = 1, + choices = 1:2, + groups = peng$species, + ellipse = TRUE, ellipse.alpha = 0.1, + circle = TRUE, + var.factor = 1, + geom.ind = c("point", "text"), + point.size = 1, + labels = lab, labels.size = 6, + varname.size = 5, + clip = "off") + + theme_minimal(base_size = 14) + + theme_penguins("dark") + + scale_shape_discrete() + + theme(legend.direction = 'horizontal', legend.position = 'top') + + + + +fviz_pca_biplot( + peng.pca, + axes = 3:4, + habillage = peng$species, + addEllipses = TRUE, ellipse.level = 0.68, + palette = peng.colors("dark"), + arrowsize = 1.5, col.var = "black", labelsize=4, + # label = lab +) + + theme(legend.position = "top") + +# from +label <- lab +fviz_pca_biplot( + peng.pca, + axes = 3:4, + habillage = peng$species, + addEllipses = TRUE, + ellipse.level = 0.68, +# label = "none", + # palette = peng.colors("dark"), + arrowsize = 1.5, + col.var = "black", +) + + geom_text( + data = ~ dplyr::bind_cols(., label), + aes(label = label), + vjust = 0, + nudge_y = .05 + ) + + theme(legend.position = "top") diff --git a/examples/wine-robpca.R b/examples/wine-robpca.R index 57b2fa5..d7f2cf7 100644 --- a/examples/wine-robpca.R +++ b/examples/wine-robpca.R @@ -4,6 +4,9 @@ data(wine, package = "ggbiplot") wine.pca <- prcomp(wine, scale=TRUE) +wine.pca <- reflect(wine.pca) + + ggbiplot(wine.pca, obs.scale = 1, var.scale = 1, groups = wine.class, @@ -13,6 +16,20 @@ ggbiplot(wine.pca, labs(fill = "Cultivar", color = "Cultivar") + theme(legend.direction = 'horizontal', legend.position = 'top') +# direct labels +means <- aggregate(cbind(PC1, PC2) ~ wine.class, + data = wine.pca$x, FUN = mean) + +ggbiplot(wine.pca, + obs.scale = 1, var.scale = 1, + groups = wine.class, + varname.size = 4, + ellipse = TRUE, + circle = TRUE) + + geom_label(data = means, aes(x=PC1, y=PC2, label = wine.class)) + + theme(legend.position = 'none') + + wine.rpca <- robpca(wine, k = 2, scale=TRUE) names(wine.rpca) @@ -22,7 +39,8 @@ names(wine.rpca) class(wine.rpca) # list -diagPlot(wine.rpca) +col <- c("red", "green", "blue")[wine.class] +diagPlot(wine.rpca, col = col) wine.rpca$loadings diff --git a/man/ggbiplot.Rd b/man/ggbiplot.Rd index e8c5814..517200d 100644 --- a/man/ggbiplot.Rd +++ b/man/ggbiplot.Rd @@ -13,6 +13,8 @@ ggbiplot( var.scale = scale, var.factor = 1, groups = NULL, + geom.ind = "point", + geom.var = c("arrow", "text"), point.size = 1.5, ellipse = FALSE, ellipse.prob = 0.68, @@ -63,6 +65,16 @@ Mahalanobis distance.} If provided the points will be colored according to groups and this allows data ellipses also to be drawn when \code{ellipse = TRUE}.} +\item{geom.ind}{a text specifying the geometry to be used for the observations. Allowed +values are among \code{c("point", "text")}. Use +\code{"point"} (to show only points); \code{"text"} to show only labels; +\code{c("point", "text")} to show both.} + +\item{geom.var}{a text specifying the geometry to be used for the variables. Allowed +values are among \code{c("arrow", "text")}. Use +\code{"arrow"} (to show only vectors); \code{"text"} to show only labels; +\code{c("arrow", "text")} to show both.} + \item{point.size}{Size of observation points.} \item{ellipse}{Logical; draw a normal data ellipse for each group?} @@ -175,7 +187,21 @@ coordinates are called \emph{principal coordinates} and the sum of squared coord on each dimension equal the corresponding singular value. The other matrix, to which no part of the singular values is assigned, contains the so-called \emph{standard coordinates} and have sum of squared -values equal to 1.0. +values equal to 1.0. + +\bold{Scales and legend} + +When the `groups` argument is not \code{NULL}, the function uses that value to set the aesthetics for `color`, `fill` and `shape`. +If you override the defaults using \code{scale_color_discrete}, etc., you may find that duplicate legends are produced. +To avoid this, you need to have the same name for aesthetics to be merged in the legend, for example, +\preformatted{ +scale_fill_discrete(name = 'Species') +scale_color_discrete(name = 'Species') +} +or, +\preformatted{ +labs(fill = "Species", color = "Species") +} } \examples{ data(wine) @@ -186,6 +212,22 @@ ggbiplot(wine.pca, varname.size = 4, groups = wine.class, ellipse = TRUE, circle = TRUE) + + # Easier interpretation if the axes are reflected +wine.pca <- reflect(wine.pca) + # Use direct labels rather than a legend +means <- aggregate(cbind(PC1, PC2) ~ wine.class, + data = wine.pca$x, FUN = mean) + +ggbiplot(wine.pca, + obs.scale = 1, var.scale = 1, + groups = wine.class, + varname.size = 4, + ellipse = TRUE, + circle = TRUE) + + geom_label(data = means, aes(x=PC1, y=PC2, label = wine.class)) + + theme(legend.position = 'none') + data(iris) iris.pca <- prcomp (~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, @@ -222,5 +264,5 @@ Gabriel, K. R. (1971). The biplot graphical display of matrices with application \code{\link[factoextra]{fviz_pca_biplot}} for the factoextra package version. } \author{ -Vincent Q. Vu. +Vincent Q. Vu., Michael Friendly } diff --git a/man/reflect.Rd b/man/reflect.Rd index b6dd534..bd36ea4 100644 --- a/man/reflect.Rd +++ b/man/reflect.Rd @@ -40,6 +40,13 @@ crime.pca <- crime.pca <- reflect(crime.pca) # reflect columns 1:2 biplot(crime.pca) + + iris.lda <- MASS::lda(Species ~ ., data=iris) + #reflect the first dimension + iris.lda1 <- reflect(iris.lda, columns = 1) + # compare predicted scores + predict(iris.lda)$x |> head() + predict(iris.lda1)$x |> head() } \seealso{ \code{\link[stats]{prcomp}}, \code{\link[stats]{princomp}},