diff --git a/.Rbuildignore b/.Rbuildignore index f714b08..b6368e0 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -4,9 +4,29 @@ ^README\.Rmd$ ^README\.md$ ^README_files$ -^NEWS\.md$ ^revdep$ ^.*\.RData +^ljc\.Rda +^stmmodel\.Rda +^wikiwWordVis\.Rda +^wij +^visscratc\.R +^wikiwordcoords\.Rda +^ng20wij\.Rda +^time\.Rda +^wordcoords\.Rda +^vignettedata/$ +^vignettedata/.$ +^stm\.Rda +^wikiwij\.Rda +^wikiwords\.Rda +^wij$ +^visscratch\.R$ +^ng20wij\.Rda$ +^woordcoords\.Rda$ +^vignettedata$ +^mnistCoords\.wij$ +^unprojectable20ngwij\.Rda$ ^appveyor\.yml$ ^vignettes/largeVis\.pdf$ ^vignettes/largeVis\.md$ @@ -23,7 +43,7 @@ ^data/train\.RData$ ^libs$ ^doc$ -^Rplots\.pdf$ +^Rplots\.pdf$^.*\.RData ^m./.Rda$ ^./.bin$ ^inst/samples.Rda$ @@ -32,3 +52,10 @@ ^f.*\.Rda$ ^vignettes/.*\.Rda$ ^vignettes/.*\.Rda$ +^Examples/.Rmd +^Examples/.html +^faceshighres.png$ +^poliblog/.Rda +^log4j/.spark/.log +^mnist$ +^derby/.log \ No newline at end of file diff --git a/.gitignore b/.gitignore index 5b6a065..77ae9e4 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ .Rhistory .RData .Ruserdata +inst/doc +Examples.html diff --git a/.travis.yml b/.travis.yml index 83dbc8b..2e0d9a9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,56 @@ # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r -language: R +language: r +matrix: + include: + - os: linux + dist: trusty + r: release + - os: osx + osx_image: xcode7.4 + r: release + allow_failures: + - os: osx + osx_image: xcode6.4 + r: release + - os: linux + dist: trusty + r: devel + - os: osx + osx_image: xcode7.4 + r: devel + - os: linux + dist: trusty + r: oldrel + fast_finish: true + sudo: false cache: packages + +r_github_packages: + - jimhester/covr + +r_packages: + - Rcpp + - RcppArmadillo + - devtools + +addons: + apt: + sources: + - ubuntu-toolchain-r-test + packages: + - gcc-4.9 + - g++-4.9 + - gfortran-4.9 + +before_install: | + mkdir ~/.R + cat < ~/.R/Makevars + CXX1X=g++-4.9 + FC=gfortran-4.9 + 
CXX1XSTD=-std=c++11 + + after_success: - - Rscript -e 'covr::codecov()' + - Rscript -e 'covr::codecov(branch="reference")' diff --git a/DESCRIPTION b/DESCRIPTION index eb9d964..0a8182d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: largeVis Type: Package Title: High-Quality Visualizations of Large, High-Dimensional Datasets -Version: 0.1.5 +Version: 0.1.6 Author: Amos B. Elberg Maintainer: Amos Elberg Description: Implements the largeVis algorithm for visualizing very large high-dimensional datasets. Also very fast search for approximate nearest neighbors. @@ -9,25 +9,27 @@ License: GPL-3 LazyData: TRUE RoxygenNote: 5.0.1 Depends: - R (>= 2.10), - Matrix, - RcppProgress (>= 0.2.1), - RcppArmadillo (>= 0.7.100.3.0) + R (>= 3.0.2), + Matrix Imports: parallel, Rcpp (>= 0.12.4), - abind -LinkingTo: Rcpp,RcppProgress,RcppArmadillo + abind, + ggplot2 (>= 0.9.2.1), + dbscan +LinkingTo: Rcpp,RcppProgress (>= 0.2.1),RcppArmadillo (>= 0.7.100.3.0),testthat(>= 1.0.2) Suggests: testthat, covr, knitr, rmarkdown, - ggplot2, wesanderson, - RColorBrewer + RColorBrewer, + dplyr, + magrittr URL: https://github.com/elbamos/largeVis BugReports: https://github.com/elbamos/largeVis/issues NeedsCompilation: yes OS_type: unix, windows BuildVignettes: FALSE VignetteBuilder: knitr +SystemRequirements: C++11 diff --git a/NAMESPACE b/NAMESPACE index bd56178..17473b9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,7 @@ # Generated by roxygen2: do not edit by hand -S3method(buildEdgeMatrix,CsparseMatrix) -S3method(buildEdgeMatrix,TsparseMatrix) -S3method(buildEdgeMatrix,default) +S3method(buildWijMatrix,CsparseMatrix) +S3method(buildWijMatrix,TsparseMatrix) S3method(distance,CsparseMatrix) S3method(distance,TsparseMatrix) S3method(distance,matrix) @@ -10,19 +9,25 @@ S3method(randomProjectionTreeSearch,CsparseMatrix) S3method(randomProjectionTreeSearch,TsparseMatrix) S3method(randomProjectionTreeSearch,matrix) export(buildEdgeMatrix) +export(buildWijMatrix) 
export(distance) +export(ggManifoldMap) +export(largeVis) export(manifoldMap) +export(manifoldMapStretch) export(neighborsToVectors) export(projectKNNs) export(randomProjectionTreeSearch) -export(vis) -importClassesFrom(Matrix,CsparseMatrix) -importClassesFrom(Matrix,TsparseMatrix) +importFrom(Matrix,sparseMatrix) importFrom(Rcpp,sourceCpp) +importFrom(dbscan,opticsXi) +importFrom(dbscan,optics_cut) +importFrom(ggplot2,aes) +importFrom(ggplot2,annotation_raster) +importFrom(ggplot2,geom_blank) +importFrom(ggplot2,ggplot) importFrom(grDevices,as.raster) importFrom(graphics,rasterImage) -importFrom(stats,optimize) -importFrom(stats,rnorm) -importFrom(utils,setTxtProgressBar) -importFrom(utils,txtProgressBar) +importFrom(stats,aggregate) +importFrom(stats,runif) useDynLib(largeVis) diff --git a/NEWS.md b/NEWS.md index 9adc430..db1c3ed 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,47 @@ +### largeVis 0.1.6 + +* Revisions for CRAN release, including verifying correctness by reproducing paper examples, and timing tests/benchmarks + + Tested against the paper authors' wiki-doc and wiki-word datasets + + Tested with up to 2.5m rows, 100m edges (processed in 12 hours). +* Neighbor search: + + Dense search is much, much faster and more efficient + + Tree search for cosine distances uses normalized vectors +* projectKNNs + + Should be 10x faster for small datasets + + Replaced binary search ( O(n log n) ) with the alias algorithm for weighted sampling ( O(1) ) + + Clips and smooths gradients, per discussion with paper authors + + Optimized implementation for alpha == 1 + + Removed option for mixing weights into loss function - doesn't make sense if gradients are being clipped. 
+ + Fixed OpenMP-related bug which caused visualizations to be "fuzzy" +* Vignettes: + + Reuse initialization matrices and neighbors, to make it easier to see the effect of hyperparameters + + Benchmarks now a separate vignette, more detailed + + Examples removed from vignettes and moved to readme + + Added examples of manifold map with color faces using OpenFace vectors +* Sigmas, P_ij matrix, w_ij matrix + + Replaced C++ code entirely with new code based on reference implementation + + Refactored R code into `buildEdgeMatrix()` and `buildWijMatrix()`, which are simpler. +* Visualization + + Color manifold maps work + + Ported Karpathy's function for non-overlapping embeddings (experimental) + + Removed transparency parameter + + Added ggManifoldMap function for adding a manifold map to a ggplot2 plot +* vis + + Whether to return neighbors and sigmas now adjustable parameters, for memory reasons + + Runs gc() periodically +* Data + + Removed most data and extdata that had been included before; this is to reduce size for CRAN submission +* Dependencies & Build + + Many misc changes to simplify dependencies for CRAN + + Re-added ARMA_64BIT_WORD; otherwise, could exceed the limitation on size of an arma sparse matrix with moderately sized datasets (~ 1 M rows, K = 100) + + Now depends on R >= 3.0.2, so RcppProgress and RcppArmadillo could be moved from the Depends section of the DESCRIPTION file + + Will now compile on systems that lack OpenMP (e.g., OS X systems with old versions of xcode). 
+* Correctness and Testing + + Tests are separated by subject + + Additional, more extensive tests with greater code coverage + + Added travis testing against OSX +* Clustering + + Very preliminary support for dbscan and optics added ### largeVis 0.1.5 diff --git a/R/RcppExports.R b/R/RcppExports.R index 9f249ca..03d91ce 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,12 +1,36 @@ # This file was generated by Rcpp::compileAttributes # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -sgd <- function(coords, is, js, ps, ws, gamma, rho, minRho, useWeights, nBatches, M, alpha, verbose) { - .Call('largeVis_sgd', PACKAGE = 'largeVis', coords, is, js, ps, ws, gamma, rho, minRho, useWeights, nBatches, M, alpha, verbose) +dbscan_e <- function(edges, eps, minPts, verbose) { + .Call('largeVis_dbscan_e', PACKAGE = 'largeVis', edges, eps, minPts, verbose) } -searchTrees <- function(threshold, n_trees, K, max_recursion_degree, maxIter, data, distMethod, verbose) { - .Call('largeVis_searchTrees', PACKAGE = 'largeVis', threshold, n_trees, K, max_recursion_degree, maxIter, data, distMethod, verbose) +dbscan_ed <- function(edges, data, eps, minPts, verbose) { + .Call('largeVis_dbscan_ed', PACKAGE = 'largeVis', edges, data, eps, minPts, verbose) +} + +dbscan_nd <- function(neighbors, data, eps, minPts, verbose) { + .Call('largeVis_dbscan_nd', PACKAGE = 'largeVis', neighbors, data, eps, minPts, verbose) +} + +optics_e <- function(edges, eps, minPts, verbose) { + .Call('largeVis_optics_e', PACKAGE = 'largeVis', edges, eps, minPts, verbose) +} + +optics_ed <- function(edges, data, eps, minPts, verbose) { + .Call('largeVis_optics_ed', PACKAGE = 'largeVis', edges, data, eps, minPts, verbose) +} + +optics_nd <- function(neighbors, data, eps, minPts, verbose) { + .Call('largeVis_optics_nd', PACKAGE = 'largeVis', neighbors, data, eps, minPts, verbose) +} + +silhouetteDbscan <- function(edges, sil) { + invisible(.Call('largeVis_silhouetteDbscan', PACKAGE = 'largeVis', edges, 
sil)) +} + +searchTrees <- function(threshold, n_trees, K, maxIter, data, distMethod, verbose) { + .Call('largeVis_searchTrees', PACKAGE = 'largeVis', threshold, n_trees, K, maxIter, data, distMethod, verbose) } fastDistance <- function(is, js, data, distMethod, verbose) { @@ -21,19 +45,19 @@ fastSDistance <- function(is, js, i_locations, j_locations, x, distMethod, verbo .Call('largeVis_fastSDistance', PACKAGE = 'largeVis', is, js, i_locations, j_locations, x, distMethod, verbose) } -distMatrixTowij <- function(is, js, xs, sigmas, N, verbose) { - .Call('largeVis_distMatrixTowij', PACKAGE = 'largeVis', is, js, xs, sigmas, N, verbose) +referenceWij <- function(i, j, d, perplexity) { + .Call('largeVis_referenceWij', PACKAGE = 'largeVis', i, j, d, perplexity) } -sigFunc <- function(sigma, x_i, perplexity) { - .Call('largeVis_sigFunc', PACKAGE = 'largeVis', sigma, x_i, perplexity) +sgd <- function(coords, targets_i, sources_j, ps, weights, gamma, rho, n_samples, M, alpha, verbose) { + .Call('largeVis_sgd', PACKAGE = 'largeVis', coords, targets_i, sources_j, ps, weights, gamma, rho, n_samples, M, alpha, verbose) } -searchTreesCSparse <- function(threshold, n_trees, K, max_recursion_degree, maxIter, i, p, x, distMethod, verbose) { - .Call('largeVis_searchTreesCSparse', PACKAGE = 'largeVis', threshold, n_trees, K, max_recursion_degree, maxIter, i, p, x, distMethod, verbose) +searchTreesCSparse <- function(threshold, n_trees, K, maxIter, i, p, x, distMethod, verbose) { + .Call('largeVis_searchTreesCSparse', PACKAGE = 'largeVis', threshold, n_trees, K, maxIter, i, p, x, distMethod, verbose) } -searchTreesTSparse <- function(threshold, n_trees, K, max_recursion_degree, maxIter, i, j, x, distMethod, verbose) { - .Call('largeVis_searchTreesTSparse', PACKAGE = 'largeVis', threshold, n_trees, K, max_recursion_degree, maxIter, i, j, x, distMethod, verbose) +searchTreesTSparse <- function(threshold, n_trees, K, maxIter, i, j, x, distMethod, verbose) { + 
+ .Call('largeVis_searchTreesTSparse', PACKAGE = 'largeVis', threshold, n_trees, K, maxIter, i, j, x, distMethod, verbose) } diff --git a/R/buildEdgeMatrix.R b/R/buildEdgeMatrix.R index 47e49b3..ee8cd9e 100644 --- a/R/buildEdgeMatrix.R +++ b/R/buildEdgeMatrix.R @@ -1,147 +1,55 @@ -#' Build an edge-weight matrix for the LargeVis algorithm. +#' Build a nearest-neighbor graph weighted by distance. #' -#' @param x A sparseMatrix, either a \code{\link[Matrix]{CsparseMatrix-class}} or \code{\link[Matrix]{TsparseMatrix-class}} -#' @param i Indices of one node of the nearest-neighbor graph. -#' @param j Indices of the other node. -#' @param p Integer vector of pointers to the initial index of elements of each column. See \code{\link[Matrix]{CsparseMatrix-class}}. -#' @param d The distances between the nodes identified in parameters \code{i} and \code{j}. -#' @param perplexity See the paper for discussion. +#' @param data A matrix with a number of columns equal to the number of columns in `x` +#' @param neighbors An adjacency matrix of the type produced by \code{\link{randomProjectionTreeSearch}}. +#' @param distance_method One of "Euclidean" or "Cosine" #' @param verbose Verbosity #' -#' @details Implements the portion of the LargeVis algorithm that converts distances between nearest neighbors to an -#' edge-weight graph. 
-#' -#' @importFrom stats optimize -#' @importFrom utils setTxtProgressBar txtProgressBar -#' -#' @return A list containing: \describe{ -#' \item{"sigmas"}{A vector of \eqn{2 \dot \sigma^2} calculated for each node.} -#' \item{"wij"}{A symmetric, sparse matrix of the weights for each edge between nearest neighbors.} -#' } -#' @importClassesFrom Matrix CsparseMatrix -#' @importClassesFrom Matrix TsparseMatrix -#' @export -buildEdgeMatrix <- function(x, - i, - j, - p, - d, - perplexity, - verbose) UseMethod("buildEdgeMatrix") - - +#' @return A `sparseMatrix` +#' @importFrom Matrix sparseMatrix #' @export -#' @rdname buildEdgeMatrix -buildEdgeMatrix.default <- function(x = NULL, - i, - j, - p = NULL, - d, - perplexity = 50, - verbose = TRUE) { - if (is.null(p)) p <- i2p(i) - N <- max(max(i), max(j)) + 1 - - if (verbose) { - progress <- txtProgressBar(max = N, title = "sigmas") - cat("Estimating sigmas\n") - } - - perplexity <- log2(perplexity) - sigmas <- parallel::mclapply(1:N, FUN = function(idx) { # nocov start - if (verbose) setTxtProgressBar(progress, idx) - x_i <- d[(p[idx] + 1):(p[idx + 1])] - ret <- optimize(f = sigFunc, - x = x_i, - perplexity = perplexity, - interval = c(0, 10000)) - }) # nocov end - sigmas <- sapply(sigmas, `[[`, 1) - - if (verbose) close(progress) - - if (any(is.na(sigmas)) + - any(is.infinite(sigmas)) + - any(is.nan(sigmas)) + - any( (sigmas == 0)) > 0) - stop("An error has propogated into the sigma vector.") - - if (length(sigmas) != N) stop("Wrong sigma count") - - if (! 
requireNamespace("Matrix", quietly = T)) - stop("The Matrix package must be available.") - - if (verbose) cat("Calculating w_{ij}.\n") - - wij <- distMatrixTowij(i, j, d, sigmas, N, verbose) - - if (any(is.na(wij@x)) + - any(is.infinite(wij@x)) + - any(is.nan(wij@x)) + - any( (wij@x == 0)) > 0) - stop(paste("An error has propogated into the w_{ij} vector.", - "This probably means the input data wasn't scaled.")) - - return(list(sigmas = sigmas, wij = wij)) +buildEdgeMatrix <- function(data, + neighbors, + distance_method = "Euclidean", + verbose = TRUE) { + indices <- neighborsToVectors(neighbors) + distances <- distance(indices$i, indices$j, x = data, distance_method, verbose) + mat <- sparseMatrix( + i = indices$i + 1, + j = indices$j + 1, + x = as.vector(distances), + dims = c(ncol(data), ncol(data))) + return(mat) } +#' buildWijMatrix +#' +#' Rescale the weights in an edge matrix to match a given perplexity. +#' +#' @param x A sparse matrix +#' @param perplexity Given perplexity. +#' +#' @return A \code{list} with the following components: \describe{ +#' \item{'dist'}{An [N,K] matrix of the distances to the nearest neighbors.} +#' \item{'id'}{An [N,K] matrix of the node indexes of the nearest neighbors. 
Note that this matrix is 1-indexed, +#' unlike most other matrices in this package.} +#' \item{'k'}{The number of nearest neighbors.} +#' } #' @export -#' @rdname buildEdgeMatrix -buildEdgeMatrix.CsparseMatrix <- function(x, - i = NULL, - j = NULL, - p = NULL, - d = NULL, - perplexity = 50, - verbose = TRUE) { - # Will have x@i, which is quickly varying, and x@p, and x@x - is <- rep(0:(nrow(x) - 1), diff(x@p)) - js <- x@i - ps <- x@p - ds <- x@x - NextMethod("buildEdgeMatrix", - i = is, - j = js, - p = ps, - d = ds, - perplexity = perplexity, - verbose = verbose) -} - -#' @inheritParams buildEdgeMatrix.CsparseMatrix +buildWijMatrix <- function(x, + perplexity = 50) UseMethod("buildWijMatrix") #' @export -#' @rdname buildEdgeMatrix - -buildEdgeMatrix.TsparseMatrix <- function(x, - i = NULL, - j = NULL, - p = NULL, - d = NULL, - perplexity = 50, - verbose = TRUE) { - ps <- i2p(x@i) - is <- x@i - js <- x@j - ds <- x@x - NextMethod("buildEdgeMatrix", - i = is, - j = js, - p = ps, - d = ds, - perplexity = perplexity, - verbose = verbose) +#' @rdname buildWijMatrix +buildWijMatrix.TsparseMatrix <- function(x, + perplexity = 50) { + wij <- referenceWij(x@j, x@i, x@x^2, perplexity) + return(wij) } - -i2p <- function(is) { - N <- max(is) - ps <- rep(NA, N + 1) - diffs <- diff(is) - ps[is[which(diffs > 0)] + 2] <- which(diffs > 0) + 1 - good <- cumsum(!is.na(ps)) - ps <- ps[good + 1] - ps[1] <- 1 - ps <- ps - 1 - ps[length(ps) + 1] <- length(is) - return(ps) +#' @export +#' @rdname buildWijMatrix +buildWijMatrix.CsparseMatrix <- function(x, perplexity = 50) { + is <- rep(0:(ncol(x) - 1), diff(x@p)) + wij <- referenceWij(is, x@i, x@x^2,perplexity) + return(wij) } diff --git a/R/dbscan.R b/R/dbscan.R new file mode 100644 index 0000000..c6e45dc --- /dev/null +++ b/R/dbscan.R @@ -0,0 +1,181 @@ +#' OPTICS +#' +#' An implementation of the OPTICS algorithm. +#' +#' @param data Input data, where examples are columns. 
+#' @param neighbors An adjacency matrix of the type produced by \code{\link{randomProjectionTreeSearch}} +#' @param edges A weighted graph of the type produced by \code{\link{buildEdgeMatrix}}. +#' @param eps See \code{\link[dbscan]{optics}}. +#' @param minPts See \code{\link[dbscan]{optics}}. +#' @param eps_cl See \code{\link[dbscan]{optics}}. +#' @param xi See \code{\link[dbscan]{optics}}. +#' @param verbose Verbosity level. +#' +#' @details This is a preliminary implementation of a variant of the OPTICS algorithm that attempts +#' to leverage the \code{largeVis} nearest-neighbor search. +#' +#' One of \code{neighbors} or \code{edges} must be specified. If \code{edges} is missing, +#' \code{data} must also be given. If \code{data} is given along with either \code{edges} +#' or \code{neighbors}, the algorithm will attempt a more thorough search. +#' +#' @note Support for dbscan and optics is preliminary, and not fully tested for +#' correctness. +#' +#' @note This is not the original OPTICS algorithm. In particular, the neighbor-search strategy in +#' OPTICS is not used, in favor of using a pre-calculated neighbor matrix produced incidentally by +#' `largeVis`. +#' +#' @return An \code{\link[dbscan]{optics}} object. +#' +#' @importFrom dbscan optics_cut opticsXi +optics <- function(data = NULL, + neighbors = NULL, + edges = NULL, + eps, + minPts = nrow(data) + 1, + eps_cl, + xi, + verbose = TRUE) { + if (! is.null(edges) && is.null(data)) + ret <- optics_e(edges = edges, + eps = as.double(eps), minPts = as.integer(minPts), + verbose = verbose) + else if (! 
is.null(edges)) + ret <- optics_ed(edges = edges, data = data, + eps = as.double(eps), minPts = as.integer(minPts), + verbose = verbose) + else + ret <- optics_nd(neighbors = neighbors, data = data, + eps = as.double(eps), minPts = as.integer(minPts), + verbose = verbose) + + ret$minPts <- minPts + ret$eps <- eps + ret$eps_cl <- NA + class(ret) <- "optics" + + if(!missing(eps_cl)) ret <-optics_cut(ret, eps_cl) + if(!missing(xi)) ret <- opticsXi(ret, xi) + + ret +} + +#' dbscan +#' +#' An implementation of the dbscan algorithm. +#' +#' @param data Input data, where examples are columns. +#' @param neighbors An adjacency matrix of the type produced by \code{\link{randomProjectionTreeSearch}} +#' @param edges A weighted graph of the type produced by \code{\link{buildEdgeMatrix}}. +#' @param eps See \code{\link[dbscan]{dbscan}}. +#' @param minPts Minimum size of a cluster. +#' @param partition If \code{TRUE}, attempt to calculate an approximate silhouette so the object returned is also +#' of class \code{\link[cluster]{partition.object}}, for compatibility with the \code{cluster} package. +#' @param verbose Verbosity level. +#' +#' @details This is a preliminary implementation of the DBSCAN algorithm that attempts +#' to leverage the \code{largeVis} nearest-neighbor search. +#' +#' One of \code{neighbors} or \code{edges} must be specified. If \code{edges} is missing, +#' \code{data} must also be given. If \code{data} is given along with either \code{edges} +#' or \code{neighbors}, the algorithm will attempt a more thorough search. +#' +#' @note Support for dbscan and optics is preliminary, and not fully tested for +#' correctness. +#' +#' @note This is not the original DBSCAN algorithm. In particular, the neighbor-search strategy in +#' DBSCAN is not used, in favor of using a pre-calculated neighbor matrix produced incidentally by +#' `largeVis`. +#' +#' @importFrom stats aggregate +#' +#' @return An \code{\link[dbscan]{dbscan}} object. 
+dbscan <- function(data = NULL, + neighbors = NULL, + edges = NULL, + eps, + minPts = nrow(data) + 1, + partition = !missing(edges), + verbose = TRUE) { + + if (! is.null(edges) && is.null(data)) + ret <- dbscan_e(edges = edges, + eps = as.double(eps), minPts = as.integer(minPts), + verbose = verbose) + else if (! is.null(edges)) + ret <- dbscan_ed(edges = edges, data = data, + eps = as.double(eps), minPts = as.integer(minPts), + verbose = verbose) + else + ret <- dbscan_nd(neighbors = neighbors, data = data, + eps = as.double(eps), minPts = as.integer(minPts), + verbose = verbose) + + ret <- structure(list(cluster = ret, eps = eps, minPts = minPts), + class = c("dbscan_fast", "dbscan")) + if (partition) { + ret$call <- sys.call() + sil <- silhouette.dbscan(ret$cluster, edges) + avgs <- aggregate(sil[, 3], by = list(as.vector(sil[, 1])), FUN = "mean", na.rm = TRUE) + ret$silinfo <- list( + widths = sil, + clus.avg.widths = avgs$x, + avg.width = mean(sil[, 3], na.rm = TRUE) + ) + ret$objective <- NA + ret$diss <- NA + class(ret) <- c("dbscan_fast", "dbscan", "partition") + } + ret +} + +silhouette.dbscan <- function(clusters, edges) { + sil <- cbind(clusters, matrix(0, nrow = length(clusters), ncol = 2)) + silhouetteDbscan(edges, sil) + colnames(sil) <- c("cluster", "neighbor", "sil_width") + sil[, 2] <- abs(sil[, 2]) + class(sil) <- "silhouette" + sil +} + +edgeMatrixToKNNS <- function(edges) { + id = apply(edges,MARGIN = 1, FUN = function(x) which(x != 0)) + dist = apply(edges, MARGIN = 1, FUN = function(x) x[x != 0]) + for (i in 1:ncol(id)) { + ord <- order(dist[,i]) + id[,i] <- id[,i][ord] + dist[,i] <- dist[,i][ord] + } + k = nrow(id) + list(dist = t(dist), id = t(id), k = k) +} + + +#' Local Outlier Factor Score +#' +#' @description Calculate the Local Outlier Factor (LOF) score for each data point given knowledge +#' of k-Nearest Neighbors. +#' +#' @param edges An edge matrix of the type produced by \code{\link{buildEdgeMatrix}}. 
+#' +#' @references Based on code in the \code{\link[dbscan]{dbscan}} package. +#' +#' @return A vector of LOF values for each data point. +lof <- function(edges) { + kNNlist <- edgeMatrixToKNNS(edges) + N <- nrow(kNNlist$id) + K <- kNNlist$k + + lrd <- rep(0, N) + for(i in 1:N) { + input <- kNNlist$dist[c(i, kNNlist$id[i, ]) ,] + lrd[i] <- 1 / (sum(apply(input, MARGIN = 1, max)) / K) + } + + lof <- rep(0, N) + for (i in 1:N) lof[i] <- sum(lrd[kNNlist$id[i,]])/K / lrd[i] + + lof[is.nan(lof)] <- NA + + lof +} diff --git a/R/distance.R b/R/distance.R index 966e34f..f919cc0 100644 --- a/R/distance.R +++ b/R/distance.R @@ -11,6 +11,7 @@ #' @param verbose Verbosity. #' #' @return A vector of the distances between the columns in `x` indexed by `i` and `j`. +#' @family lowmem #' @export distance <- function(x, i, diff --git a/R/facevectors.R b/R/facevectors.R new file mode 100644 index 0000000..d25d003 --- /dev/null +++ b/R/facevectors.R @@ -0,0 +1,15 @@ +#' Embedding vectors for faces in the Labelled Faces in the Wild dataset +#' +#' A dataset of OpenFace embeddings for the "Labelled Faces in the Wild" dataset, see \url{http://vis-www.cs.umass.edu/lfw/}. +#' +#' OpenFace is a facial recognition library. The similarity between two OpenFace vectors should correlate with the +#' likelihood that the vectors were generated from images of the same person. For details and discussion, +#' see \url{https://cmusatyalab.github.io/openface/}. The images may be obtained from \url{http://vis-www.cs.umass.edu/lfw/}. +#' +#' @format A data.frame where each row represents an image. The first column is the name of the person in the image, the second column +#' is the name of the image file, and the remaining columns are the columns of the embedding vector for each image as calculated with +#' the OpenFace `batch-represent` function. 
+#' +#' @source \url{http://openface-models.storage.cmusatyalab.org/lfw.nn4.small2.v1/labels.csv} +#' @source \url{http://openface-models.storage.cmusatyalab.org/lfw.nn4.small2.v1/reps.csv} +"facevectors" diff --git a/R/largeVis.R b/R/largeVis.R index a202603..7152d42 100644 --- a/R/largeVis.R +++ b/R/largeVis.R @@ -1,37 +1,28 @@ #' Apply the LargeVis algorithm for visualizing large high-dimensional datasets. #' -#' Implements the \code{vis} -#' -#' Note that this implementation expects the data to be free of \code{NaN}'s, \code{NA}'s, \code{Inf}'s, and duplicate rows. -#' If any of these assumptions are violated, the algorithm will fail. It is also usually a good idea to scale the input data -#' to have unit norm and mean 0. If there are large values in the input matrix, some computations may oveflow. -#' #' @param x A matrix, where the features are rows and the examples are columns. #' @param dim The number of dimensions in the output #' @param K The number of nearest-neighbors to use in computing the kNN graph #' @param n_trees See \code{\link{randomProjectionTreeSearch}}. The default is set at 50, which is the number #' used in the examples in the original paper. #' @param tree_threshold See \code{\link{randomProjectionTreeSearch}}. By default, this is the number of features -#' in the input set, which is the setting used in the examples in the original paper. Note the time and memory requirements: -#' the first pass through the neighborhood exploration phases will involve up to \eqn{N * nTrees * threshold} comparisons. -#' @param max_depth See \code{\link{randomProjectionTreeSearch}} +#' in the input set. #' @param max_iter See \code{\link{randomProjectionTreeSearch}}. #' @param distance_method One of "Euclidean" or "Cosine." See \code{\link{randomProjectionTreeSearch}}. -#' @param perplexity See paper +#' @param perplexity See \code{\link{buildWijMatrix}}. #' @param sgd_batches See \code{\link{projectKNNs}}. #' @param M See \code{\link{projectKNNs}}. 
-#' @param weight_pos_samples See \code{\link{projectKNNs}}. #' @param alpha See \code{\link{projectKNNs}}. #' @param gamma See \code{\link{projectKNNs}}. #' @param rho See \code{\link{projectKNNs}}. -#' @param min_rho \code{\link{projectKNNs}}. +#' @param save_neighbors Whether to include in the output the adjacency matrix of nearest neighbors. #' @param coords A [N,K] matrix of coordinates to use as a starting point -- useful for refining an embedding in stages. #' @param verbose Verbosity -#' @param ... See paper +#' @param ... Additional arguments passed to \code{\link{projectKNNs}}. #' #' @return A `largeVis` object with the following slots: #' \describe{ -#' \item{'knns'}{An [N,K] integer matrix, which is an adjacency list of each vertex' identified nearest neighbors. +#' \item{'knns'}{An [N,K] 0-indexed integer matrix, which is an adjacency list of each vertex' identified nearest neighbors. #' If the algorithm failed to find \code{K} neighbors, the matrix is padded with \code{NA}'s.} #' \item{'wij'}{A sparse [N,N] matrix where each cell represents \eqn{w_{ij}}.} #' \item{'call'}{The call.} @@ -47,10 +38,9 @@ #' dat <- as.matrix(iris[,1:4]) #' dat <- scale(dat) #' dupes = which(duplicated(dat)) -#' dat <- dat[-dupes,] # duplicated data potentially can cause the algorithm to fail +#' dat <- dat[-dupes,] # duplicates can cause the algorithm to fail #' dat <- t(dat) -#' visObject <- vis(dat, max_iter = 20, sgd_batches = 800000, -#' K = 10, gamma = 2, rho = 1, M = 40, alpha = 20,verbose=FALSE) +#' visObject <- largeVis(dat, max_iter = 20, K = 10) #'\dontrun{ #' # mnist #' load("./mnist.Rda") @@ -58,33 +48,30 @@ #' dim(dat) <- c(42000, 28 * 28) #' dat <- (dat / 255) - 0.5 #' dat <- t(dat) -#' coords <- vis(dat, check=FALSE, -#' n_tree = 50, tree_th = 200, -#' K = 50, alpha = 2, max.iter = 4) +#' coords <- largeVis(dat, n_trees = 50, tree_th = 200, K = 50) #' } #' -vis <- function(x, +largeVis <- function(x, dim = 2, K = 40, n_trees = 50, - tree_threshold = max(10, 
nrow(x)), - max_iter = 3, - max_depth = 32, + tree_threshold = max(10, ncol(x)), + max_iter = 1, distance_method = "Euclidean", perplexity = 50, - sgd_batches = ncol(x) * 20000, + sgd_batches = NULL, M = 5, - weight_pos_samples = TRUE, alpha = 1, gamma = 7, rho = 1, - min_rho = 0, coords = NULL, + save_neighbors = TRUE, + verbose = TRUE, ...) { @@ -96,92 +83,57 @@ vis <- function(x, tree_threshold = tree_threshold, K = K, max_iter = max_iter, - max_depth = max_depth, distance_method = distance_method, verbose = verbose) - ############################################# # Clean knns ############################################# - if (verbose[1]) cat("Calculating edge weights...") - neighbor_indices <- neighborsToVectors(knns) - - ####################################################### - # Calculate edge weights for candidate neighbors - ####################################################### - if (verbose) cat("Calculating neighbor distances.\n") - - xs <- distance(x = x, - neighbor_indices$i, - neighbor_indices$j, - distance_method, - verbose)[, 1] - - if (verbose) cat("\n") - - if ( (any(is.na(xs)) + - any(is.infinite(xs)) + - any(is.nan(xs)) + - any(xs == 0)) > 0) - stop("An error leaked into the distance calculation - check for duplicates") - if (any(xs > 27)) { # nocov start - warning(paste( - "The Distances between some neighbors are large enough to cause the calculation of p_{j|i} to overflow.", - "Scaling the distance vector.")) - xs <- scale(xs, center = FALSE) + if (verbose[1]) cat("Calculating edge weights...\n") + edges <- buildEdgeMatrix(data = x, + neighbors = knns, + distance_method = distance_method, + verbose = verbose) + if (! 
save_neighbors) rm(knns) + gc() + if (any(edges@x > 27)) { # nocov start + warning(paste( + "The Distances between some neighbors are large enough to cause the calculation of p_{j|i} to overflow.", + "Scaling the distance vector.")) + edges@x <- scale(edges@x, center = FALSE) } # nocov end + wij <- buildWijMatrix(edges, perplexity) + rm(edges) - ####################################################### - # Get w_{ij} - ####################################################### - - sigwij <- buildEdgeMatrix(i = neighbor_indices$i, - j = neighbor_indices$j, - d = xs, - perplexity = perplexity, - verbose = verbose) - - rm(neighbor_indices) ####################################################### # Estimate embeddings ####################################################### - coords <- projectKNNs(wij = sigwij$wij, + coords <- projectKNNs(wij = wij, dim = dim, sgd_batches = sgd_batches, M = M, - weight_pos_samples = weight_pos_samples, gamma = gamma, verbose = verbose, alpha = alpha, coords = coords, rho = rho, - min_rho = min_rho, ...) 
####################################################### # Cleanup ####################################################### - knns[knns == -1] <- NA returnvalue <- list( knns = t(knns), - wij = sigwij$wij, + wij = wij, call = sys.call(), - coords = coords, - sigmas = sqrt(sigwij$sigmas / 2) + coords = coords ) + if (save_neighbors) { + knns[knns == -1] <- NA + returnvalue$knns <- t(knns) + } + class(returnvalue) <- "largeVis" return(returnvalue) } - -########################################## -# Some helper functions useful for debugging -########################################## - -pji <- function(x_i, sigma) - exp(- (x_i^2) / sigma) / sum(exp(- (x_i^2) / sigma)) -perp <- function(x_i, sigma) - - sum(log2(pji(x_i, sigma))) / length(x_i) -pdiff <- function(x_i, sigma, perplexity) - (perplexity - perp(x_i, sigma))^2 diff --git a/R/projectKNNs.R b/R/projectKNNs.R index 3df7421..bb9e1bd 100644 --- a/R/projectKNNs.R +++ b/R/projectKNNs.R @@ -12,31 +12,21 @@ #' where \eqn{f()} is a probabilistic function relating the distance between two points in the low-dimensional projection space, #' and the probability that they are nearest neighbors. #' -#' There are two available probabilistic functions, \eqn{1 / (1 + \alpha \dot ||x||^2)} and \eqn{1 / (1 + \exp(||x||^2))}. -#' The second function, which the paper authors recommend against, is used if parameter \code{alpha} is set to 0. -#' -#' The \code{weight_pos_samples} parameter controls how to handle edge-weights. The paper authors recommend using a weighted -#' sampling approach to select edges, and treating edge-weight as binary in calculating the objective. This is the default. -#' -#' However, the algorithm for drawing weighted samples runs in \eqn{O(n \log n)}. The alternative approach, which runs in -#' \eqn{O(n)}, is to draw unweighted samples and include \eqn{w_{ij}} in the objective function. 
In addition, the -#' alternative probabalistic function used when \eqn{\alpha == 0} tends to overflow unless edge weights are used. +#' The default probabilistic function is \eqn{1 / (1 + \alpha \dot ||x||^2)}. If \eqn{\alpha} is set to zero, +#' an alternative probabilistic function, \eqn{1 / (1 + \exp(x^2))} will be used instead. #' #' Note that the input matrix should be symmetric. If any columns in the matrix are empty, the function will fail. #' #' @param wij A symmetric sparse matrix of edge weights, in C-compressed format, as created with the \code{Matrix} package. #' @param dim The number of dimensions for the projection space. -#' @param sgd_batches The number of edges to process during SGD; defaults to 20000 * the number of rows in x, as recommended -#' by the paper authors. +#' @param sgd_batches The number of edges to process during SGD. #' @param M The number of negative edges to sample for each positive edge. #' @param gamma The strength of the force pushing non-neighbor nodes apart. -#' @param alpha Hyperparameter used in the default distance function, \eqn{1 / (1 + \alpha \dot ||y_i - y_j||^2)}. If \code{alpha} is 0, the alternative distance -#' function \eqn{1 / 1 + exp(||y_i - y_j||^2)} is used instead. These functions relate the distance between points in the low-dimensional projection to the likelihood -#' that they two points are nearest neighbors. -#' @param weight_pos_samples Whether to sample positive edges according to their edge weights (the default, unless alpha == 0) or take the -#' weights into account when calculating gradient. See also the Details section. +#' @param alpha Hyperparameter used in the default distance function, \eqn{1 / (1 + \alpha \dot ||y_i - y_j||^2)}. The function relates the distance +#' between points in the low-dimensional projection to the likelihood that the two points are nearest neighbors. 
Increasing \eqn{\alpha} tends +#' to push nodes and their neighbors closer together; decreasing \eqn{\alpha} produces a broader distribution. Setting \eqn{\alpha} to zero +#' enables the alternative distance function. \eqn{\alpha} below zero is meaningless. #' @param rho Initial learning rate. -#' @param min_rho Final learning rate. #' @param coords An initialized coordinate matrix. #' @param verbose Verbosity #' @@ -49,23 +39,21 @@ #' coords <- scale(coords) #' plot(coords, xlim = c(-1.5,1.5), ylim = c(-1.5,1.5)) #' } -#' @importFrom stats rnorm +#' @importFrom stats runif #' projectKNNs <- function(wij, # symmetric sparse matrix dim = 2, # dimension of the projection space - sgd_batches = (length(wij@p) -1) * 20000, + sgd_batches = NULL, M = 5, - weight_pos_samples = if (alpha == 0) {FALSE} else {TRUE}, gamma = 7, alpha = 1, rho = 1, coords = NULL, - min_rho = 0, verbose = TRUE) { - if (alpha == 0) warning("The alternative (alpha == 0) distance function is not fully implemented.") - N <- (length(wij@p) -1) + if (alpha < 0) stop("alpha < 0 is meaningless") + N <- (length(wij@p) - 1) js <- rep(0:(N - 1), diff(wij@p)) if (any(is.na(js))) stop("NAs in the index vector.") is <- wij@i @@ -73,20 +61,32 @@ projectKNNs <- function(wij, # symmetric sparse matrix ############################################## # Initialize coordinate matrix ############################################## - if (is.null(coords)) coords <- matrix(rnorm(N * dim), nrow = dim) + if (is.null(coords)) #coords <- matrix(rnorm(N * dim), nrow = dim) + coords <- matrix((runif(N * dim) - 0.5) / dim * 0.0001, nrow = dim) + if (is.null(sgd_batches)) { + if (N < 10000) { + sgd_batches <- 20000 * length(wij@x) + } else if (N < 1000000) { + sgd_batches <- (N - 10000) * 9000 / (1000000 - 10000) + 1000 + sgd_batches <- sgd_batches * 1000000 + } else { + sgd_batches <- 10000 * N + } + } ################################################# # SGD ################################################# if (verbose) 
cat("Estimating embeddings.\n") coords <- sgd(coords, - is = is, - js = js, - ps = wij@p, - ws = wij@x, - gamma = gamma, rho = rho, minRho = min_rho, - useWeights = ! weight_pos_samples, nBatches = sgd_batches, - M = M, alpha = alpha, verbose = verbose) + targets_i = is, + sources_j = js, + ps = wij@p, + weights = wij@x, + alpha = alpha, gamma = gamma, M = M, + rho = rho, + n_samples = sgd_batches, + verbose = verbose) return(coords) } diff --git a/R/projectionTreeSearch.R b/R/projectionTreeSearch.R index 5355354..4f2f4d1 100644 --- a/R/projectionTreeSearch.R +++ b/R/projectionTreeSearch.R @@ -13,18 +13,16 @@ #' @param tree_threshold The threshold for creating a new branch. The paper authors suggest #' using a value equivalent to the number of features in the input set. #' @param max_iter Number of iterations in the neighborhood exploration phase. -#' @param max_depth The maximum level of recursion. #' @param distance_method One of "Euclidean" or "Cosine." #' @param verbose Whether to print verbose logging using the \code{progress} package. #' #' @return A [K, N] matrix of the approximate K nearest neighbors for each vertex. 
#' @export randomProjectionTreeSearch <- function(x, - K = 5, - n_trees = 2, + K = 150, + n_trees = 50, tree_threshold = max(10, nrow(x)), - max_iter = 2, - max_depth = 32, + max_iter = 1, distance_method = "Euclidean", verbose= TRUE) UseMethod("randomProjectionTreeSearch") @@ -32,21 +30,22 @@ randomProjectionTreeSearch <- function(x, #' @export #' @rdname randomProjectionTreeSearch randomProjectionTreeSearch.matrix <- function(x, - K = 5, - n_trees = 2, + K = 150, + n_trees = 50, tree_threshold = max(10, nrow(x)), - max_iter = 2, - max_depth = 32, + max_iter = 1, distance_method = "Euclidean", verbose= TRUE) { - if (verbose) cat("Searching for neighbors.\n") + if (verbose) cat("Searching for neighbors.\n") + + if (distance_method == "Cosine") x <- x / rowSums(x) knns <- searchTrees(threshold = tree_threshold, n_trees = n_trees, - K = K, max_recursion_degree = max_depth, + K = K, maxIter = max_iter, data = x, - distance_method, + distMethod = distance_method, verbose = verbose) if (sum(colSums(knns != -1) == 0) > 0) @@ -64,23 +63,22 @@ randomProjectionTreeSearch.matrix <- function(x, #' @export #' @rdname randomProjectionTreeSearch randomProjectionTreeSearch.CsparseMatrix <- function(x, - K = 5, - n_trees = 2, + K = 150, + n_trees = 50, tree_threshold = max(10, nrow(x)), - max_iter = 2, - max_depth = 32, + max_iter = 1, distance_method = "Euclidean", verbose= TRUE) { if (verbose) cat("Searching for neighbors.\n") knns <- searchTreesCSparse(threshold = tree_threshold, n_trees = n_trees, - K = K, max_recursion_degree = max_depth, + K = K, maxIter = max_iter, i = x@i, p = x@p, x = x@x, - distance_method, + distMethod = distance_method, verbose = verbose) if (sum(colSums(knns != -1) == 0) > 0) @@ -98,23 +96,24 @@ randomProjectionTreeSearch.CsparseMatrix <- function(x, #' @export #' @rdname randomProjectionTreeSearch randomProjectionTreeSearch.TsparseMatrix <- function(x, - K = 5, - n_trees = 2, - tree_threshold = max(10, nrow(x)), - max_iter = 2, - max_depth = 32, - 
distance_method = "Euclidean", + K = 150, + n_trees = 50, + tree_threshold = + max(10, nrow(x)), + max_iter = 1, + distance_method = + "Euclidean", verbose= TRUE) { if (verbose) cat("Searching for neighbors.\n") knns <- searchTreesTSparse(threshold = tree_threshold, n_trees = n_trees, - K = K, max_recursion_degree = max_depth, + K = K, maxIter = max_iter, i = x@i, j = x@j, x = x@x, - distance_method, + distMethod = distance_method, verbose = verbose) if (sum(colSums(knns != -1) == 0) > 0) diff --git a/R/visualize.R b/R/visualize.R index f12e553..1886f08 100644 --- a/R/visualize.R +++ b/R/visualize.R @@ -6,7 +6,6 @@ #' @param n The number of images to sample. #' @param images The images. A 3-D or 4-D array. #' @param scale Proportion to scale the images to. -#' @param transparency Whether to add an alpha channel to greyscale images. #' @param ... Addiitional parameters passed to \code{plot}. #' #' @details The images can be passed in either as a list or a 3- or 4-dimensional array. The first dimension is \code{n}. @@ -14,8 +13,11 @@ #' If the objects in the list are \code{matrix} objects, or the array is 3-dimensional, the images will be treated as #' greyscale. If there is an additional dimension, it must have a length of 3 and be RGB color layers. #' +#' @references Andrej Karpapthy. \href{http://cs.stanford.edu/people/karpathy/cnnembed/}{t-SNE Visualization of CNN Codes.} +#' #' @importFrom grDevices as.raster #' @importFrom graphics rasterImage +#' @seealso \code{\link{ggManifoldMap}} #' @export #' @examples \dontrun{ #' load("mnist.Rda") @@ -37,50 +39,38 @@ #' } manifoldMap <- function(x, - n, - images, - scale = 1, - transparency = FALSE, - ...) { #nocov start + n = nrow(x), + images, + scale = 1, + ...) 
{ #nocov start if (class(x) == "largeVis") x <- t(x$coords) if (ncol(x) != 2) stop("Can only visualize in 2-D.") N <- nrow(x) if (class(images) == "list" && - N != length(images)) stop("Number of images doesn't equal number of points.") - if (N != nrow(images)) stop("Number of images doesn't equal number of points.") + N != length(images)) + stop("Number of images doesn't equal number of points.") + if (N != nrow(images)) + stop("Number of images doesn't equal number of points.") D <- length(dim(images)) - 1 if (! (D == 2 || D == 3)) stop("Wrong number of dimensions.") if (D == 3 && - (dim(x)[3] < 2 || - dim(x)[3] > 4)) stop("Wrong number of color layers.") + (dim(images)[4] < 2 || + dim(images)[4] > 4)) stop("Wrong number of color layers.") selections <- sample(N, n, replace = F) lowerscale <- min(images) upperscale <- max(images) - graphics::plot(x[selections, ], pch = NA, ...) + graphics::plot(x * 1.1, pch = NA, type = 'n', ...) for (i in selections) { if (D == 2) { - image_data <- images[i,, ] + image_data <- images[i, , ] } else { - image_data <- images[i,,, ] + image_data <- images[i, , , ] } image_data <- 1 - ( (image_data - lowerscale) / upperscale) - if (transparency) { - if (length(dim(image_data)) == 2) - image_data <- abind::abind(image_data, - image_data, - image_data, - image_data, - along = 3) - else if (length(dim(image_data)) == 3) { - alpha <- apply(image_data, MARGIN = 3, FUN = sum) - alpha <- alpha / max(alpha) - image_data <- abind::abind(image_data, alpha, along = 3) - } - } image <- grDevices::as.raster(image_data) offsetx <- (nrow(image) * scale) / 2 offsety <- (ncol(image) * scale) / 2 @@ -90,6 +80,162 @@ manifoldMap <- function(x, x[i, 1] + offsetx, x[i, 2] + offsety, interpolate = TRUE - ) + ) + } +} # nocov end + +#' Visualize an embedding by ggplotting with images +#' +#' Identical to \link{manifoldMap}, but adds images to an existing \code{ggplot2} object or creates one. +#' +#' @param ggObject a \code{\link[ggplot2]{ggplot}} object. 
If not provided, a new \code{ggplot} +#' object with \code{\link[ggplot2]{geom_blank}} will be created. +#' @param x A \code{largeVis} object or [N,D] matrix of coordinates. +#' @param n The number of images to sample. +#' @param images The images. A 3-D or 4-D array. +#' @param scale Proportion to scale the images to. +#' @return A \code{ggplot} object. +#' +#' @details See \code{\link{manifoldMap}}. Note that this function can be considerably slower to display than \code{manifoldMap}. +#' It therefore should only be used if other features of \code{ggplot2} are required. +#' +#' If the objects in the list are \code{matrix} objects, or the array is 3-dimensional, the images will be treated as +#' greyscale. If there is an additional dimension, it must have a length of 3 and be RGB color layers. +#' +#' @importFrom grDevices as.raster +#' @importFrom graphics rasterImage +#' @importFrom ggplot2 ggplot +#' @importFrom ggplot2 geom_blank +#' @importFrom ggplot2 annotation_raster +#' @importFrom ggplot2 aes +#' @export +ggManifoldMap <- function(ggObject = NULL, + x, + n = nrow(x), + images, + scale = 1) { #nocov start + if (class(x) == "largeVis") x <- t(x$coords) + if (ncol(x) != 2) stop("Can only visualize in 2-D.") + N <- nrow(x) + if (class(images) == "list" && + N != length(images)) + stop("Number of images doesn't equal number of points.") + if (N != nrow(images)) + stop("Number of images doesn't equal number of points.") + + D <- length(dim(images)) - 1 + + if (! 
(D == 2 || D == 3)) stop("Wrong number of dimensions.") + if (D == 3 && + (dim(images)[4] < 2 || + dim(images)[4] > 4)) stop("Wrong number of color layers.") + + selections <- sample(N, n, replace = F) + lowerscale <- min(images) + upperscale <- max(images) + if (is.null(ggObject)) { + x <- data.frame(x) + colnames(x) <- c("x", "y") + ggObject = ggplot2::`%+%`(ggplot2::ggplot(x, + ggplot2::aes_(x = quote(x), + y = quote(y))), + geom_blank()) } + + for (i in selections) { + if (D == 2) { + image_data <- images[i, , ] + } else { + image_data <- images[i, , , ] + } + image_data <- 1 - ( (image_data - lowerscale) / upperscale) + image <- grDevices::as.raster(image_data) + offsetx <- (nrow(image) * scale) / 2 + offsety <- (ncol(image) * scale) / 2 + + ggObject <- ggplot2::`%+%`(ggObject, ggplot2::annotation_raster( + image, + xmin = x[i, 1] - offsetx, + ymin = x[i, 2] - offsety, + xmax = x[i, 1] + offsetx, + ymax = x[i, 2] + offsety, + interpolate = TRUE + )) + } + return(ggObject) } # nocov end + +#' manifoldMapStretch +#' +#' A manifold map that fills the full extent of the plot. +#' +#' Ported from \url{http://cs.stanford.edu/people/karpathy/cnnembed/}. Each position is filled with its nearest neighbor. +#' +#' @param x A [N,D] matrix of coordinates. +#' @param f A function that, called with the index number of a row of \code{x}, returns an R object representing +#' an image. See the example. +#' @param size_x The width of the requested plot, in pixels. +#' @param size_y The height of the requested plot, in pixels. +#' @param image_size The size to plot each image; each is plotted as a square. +#' @param ... Additional parameters passed to \code{plot}. +#' +#' @note This function is experimental. 
+#' +#' @examples +#' \dontrun{ +#' # Demonstration of f +#' load(system.file("extdata", "faces.Rda", package="largeVis")) +#' +#' imagepaths <- paste("pathtoimages", +#' faceLabels[,1], sub("png", "jpg", faceLabels[,2]), sep = "/") +#' +#' manifoldMapStretch(as.matrix(faceCoords[,1:2]), +#' f = function(x) jpeg::readJPEG(imagePaths[x]), +#' size_x = 5000, size_y = 5000, image_size = 100) +#' } +#' +#' @export +manifoldMapStretch <- function(x, + f, + size_x = 500, + size_y = 500, + image_size = 50, + ...) { #nocov start + + xnum <- size_x / image_size + ynum <- size_y / image_size + + coordsadj <- x - c(min(x[,1]), min(x[,2])) + coordsadj <- coordsadj * c(size_x / max(coordsadj[,1]), + size_y / max(coordsadj[,2])) + + graphics::plot(matrix(c(0, size_x, + size_y, 0), + ncol = 2), + pch = NA, + type = 'n', ...) + + abes <- matrix(c(rep(1:xnum, ynum), + rep(1:ynum, each = xnum)), + ncol = 2) + + for (i in 1:nrow(abes)) { + img_x <- abes[i, 1] + img_y <- abes[i, 2] + xf <- (img_x * image_size) - (image_size/2) + yf <- (img_y * image_size) - (image_size/2) + dd <- apply((coordsadj - c(xf, yf))^2, + MARGIN = 1, + FUN = sum) + selection <- which(dd == min(dd)) + coordsadj[selection,] <- Inf + image <- f(selection) + rasterImage( image, + xleft = xf - (image_size / 2), + ybottom = yf - (image_size / 2), + xright = xf + (image_size / 2), + ytop = yf + (image_size / 2), + interpolate = TRUE + ) + } +} #nocov end diff --git a/R/wiki.R b/R/wiki.R deleted file mode 100644 index 8f59d55..0000000 --- a/R/wiki.R +++ /dev/null @@ -1,8 +0,0 @@ -#' Voting data on wikipedia from inception until January, 2008. -#' -#' @format A symmetric sparse matrix in C-compressed format. Weights for present edges are either 1, -#' indicating that each node case a vote for the other, or 0.5. Nodes with fewer than 5 votes were -#' removed from the dataset. 
-#' -#' @source \url{https://snap.stanford.edu/data/wiki-Vote.html} -"wiki" diff --git a/README.Rmd b/README.Rmd index 7b97905..08a7062 100644 --- a/README.Rmd +++ b/README.Rmd @@ -3,24 +3,55 @@ title: "largeVis" output: github_document bibliography: vignettes/TangLZM16.bib --- +```{r getversion,eval=T,echo=F,warning=F,error=F,message=F} +branch <- system("git branch -v", intern=TRUE) +branch <- branch[grep("\\*", branch)] +poses <- regexpr("(?<=\\*\\s)(\\S+)(?=\\s)", perl = TRUE, branch) +branch <- substr(branch, attr(poses, "capture.start"), attr(poses, "capture.start") + attr(poses, "capture.length")) +branch <- gsub("^\\s+|\\s+$", "", branch) +``` +[![Travis-CI Build Status](https://travis-ci.org/elbamos/largeVis.svg?branch=`r branch`)](https://travis-ci.org/elbamos/largeVis) +[![Coverage Status](https://img.shields.io/codecov/c/github/elbamos/largeVis/`r branch`.svg)](https://codecov.io/gh/elbamos/largeVis/branch/`r branch`) [![https://gitter.im/elbamos/largeVis](https://badges.gitter.im/elbamos/largeVis.svg)](https://gitter.im/elbamos/largeVis?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) +[![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/elbamos/largeVis?branch=`r branch`&svg=true)](https://ci.appveyor.com/project/elbamos/largeVis?branch=`r branch`) -[![Travis-CI Build Status](https://travis-ci.org/elbamos/largeVis.svg?branch=0.1.5)](https://travis-ci.org/elbamos/largeVis) [![Coverage Status](https://img.shields.io/codecov/c/github/elbamos/largeVis/0.1.5.svg)](https://codecov.io/github/elbamos/largeVis?branch=0.1.5)[![https://gitter.im/elbamos/largeVis](https://badges.gitter.im/elbamos/largeVis.svg)](https://gitter.im/elbamos/largeVis?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)[![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/elbamos/largeVis?branch=0.1.5&svg=true)](https://ci.appveyor.com/project/elbamos/largeVis) - -This is an implementation of 
the `largeVis` algorithm described in (https://arxiv.org/abs/1602.00370). It also incorporates code for a very fast algorithm for estimating k-nearest neighbors, and for visualizing a map of the manifold. +This is an implementation of the `largeVis` algorithm described in (https://arxiv.org/abs/1602.00370). It also incorporates a very fast algorithm for estimating k-nearest neighbors, implemented in C++ with `Rcpp` and `OpenMP`, and for visualizing a map of the manifold like [this](http://cs.stanford.edu/people/karpathy/cnnembed/). -The inner loops for nearest-neighbor search and gradient descent are implemented in C++ using `Rcpp` and `RcppArmadillo`. #### Project Status & Caveats -* Support for sparse matrices! -* Now tested with (dense) matrices > 1 Million rows, and sparse matrices with > 10,000 features. -* Memory efficiency and performance are excellent. Memory efficiency can be improved further by using utility functions to perform the algorithm in stages. (Explained in the vignette.) -* Not yet fully tested: - + The alternative distance function ($\alpha = 0$). - + Transparency in the visualization function. - + Multi-color images in the visualization function. -* I am attempting to replicate the paper's results with larger and larger datasets. This takes time because my hardware is not as powerful as the authors'. If you have any to volunteer, please contact me! +* In final testing before submission to CRAN. +* Tested with (dense) matrices > 2.5 Million rows, and sparse matrices with > 10,000 features. +* Performance and memory efficiency are good. +* I have been able to replicate, in the sense of producing characteristically similar visualizations, the results in the original paper. + +### Notes +* On Mac OS X, the Apple compiler does not support OpenMP. Compiling `largeVis` with OpenMP support requires some fiddling. Here are some instructions that may work for you: + - Use `homebrew` to install `llvm` version 3.8 or greater. 
+ - Link it to `/usr/local` with `brew link --force llvm`. + - Add the following line to `~/.R/Makevars`: + ``` + SHLIB_OPENMP_CFLAGS = -fopenmp + ``` + - Add the following to `~/.Renviron`: + ``` + PATH=/usr/local/bin:${PATH} + ``` + +## Examples +```{r echo=F} +vdatapath <- "../largeVisData/vignettedata/" +``` +```{r child='Examples.Rmd',} +``` + +## Benchmarks + +```{r child='vignettes/benchmarks.Rmd',} +``` ## Vignette -```{r child='vignettes/largeVis.Rmd'} +```{r echo=F} +vdatapath <- "../../largeVisData/vignettedata/" +``` +```{r child='vignettes/largeVis.Rmd',} ``` diff --git a/README.md b/README.md index 03ff938..1ae5aa0 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,125 @@ largeVis ================ -[![Travis-CI Build Status](https://travis-ci.org/elbamos/largeVis.svg?branch=0.1.5)](https://travis-ci.org/elbamos/largeVis) [![Coverage Status](https://img.shields.io/codecov/c/github/elbamos/largeVis/0.1.5.svg)](https://codecov.io/github/elbamos/largeVis?branch=0.1.5)[![https://gitter.im/elbamos/largeVis](https://badges.gitter.im/elbamos/largeVis.svg)](https://gitter.im/elbamos/largeVis?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)[![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/elbamos/largeVis?branch=0.1.5&svg=true)](https://ci.appveyor.com/project/elbamos/largeVis) +[![Travis-CI Build Status](https://travis-ci.org/elbamos/largeVis.svg?branch=0.1.6)](https://travis-ci.org/elbamos/largeVis) [![Coverage Status](https://img.shields.io/codecov/c/github/elbamos/largeVis/0.1.6.svg)](https://codecov.io/gh/elbamos/largeVis/branch/0.1.6) [![https://gitter.im/elbamos/largeVis](https://badges.gitter.im/elbamos/largeVis.svg)](https://gitter.im/elbamos/largeVis?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![AppVeyor Build
Status](https://ci.appveyor.com/api/projects/status/github/elbamos/largeVis?branch=0.1.6&svg=true)](https://ci.appveyor.com/project/elbamos/largeVis?branch=0.1.6) -This is an implementation of the `largeVis` algorithm described in (). It also incorporates code for a very fast algorithm for estimating k-nearest neighbors, and for visualizing a map of the manifold. - -The inner loops for nearest-neighbor search and gradient descent are implemented in C++ using `Rcpp` and `RcppArmadillo`. +This is an implementation of the `largeVis` algorithm described in (). It also incorporates a very fast algorithm for estimating k-nearest neighbors, implemented in C++ with `Rcpp` and `OpenMP`, and for visualizing a map of the manifold like [this](http://cs.stanford.edu/people/karpathy/cnnembed/). #### Project Status & Caveats -- Support for sparse matrices! -- Now tested with (dense) matrices > 1 Million rows, and sparse matrices with > 10,000 features. -- Memory efficiency and performance are excellent. Memory efficiency can be improved further by using utility functions to perform the algorithm in stages. (Explained in the vignette.) -- Not yet fully tested: - - The alternative distance function (*α* = 0). - - Transparency in the visualization function. - - Multi-color images in the visualization function. -- I am attempting to replicate the paper's results with larger and larger datasets. This takes time because my hardware is not as powerful as the authors'. If you have any to volunteer, please contact me! +- In final testing before submission to CRAN. +- Tested with (dense) matrices > 2.5 Million rows, and sparse matrices with > 10,000 features. +- Performance and memory efficiency are good. +- I have been able to replicate, in the sense of producing characteristically similar visualizations, the results in the original paper. + +### Notes + +- On Mac OS X, the Apple compiler does not support OpenMP. Compiling `largeVis` with OpenMP support requires some fiddling. 
Here are some instructions that may work for you: + - Use `homebrew` to install `llvm` version 3.8 or greater. + - Link it to `/usr/local` with `brew link --force llvm`. + - Add the following line to `~/.R/Makevars`: + + SHLIB_OPENMP_CFLAGS = -fopenmp + + - Add the following to `~/.Renviron`: + + PATH=/usr/local/bin:${PATH} + +Examples +-------- + +### MNIST + +![](README_files/figure-markdown_github/drawmnist-1.png) + + + +### Wikipedia Terms and Documents + +![](README_files/figure-markdown_github/drawwikiwords-1.png) + +### 20 Newsgroups + +![](README_files/figure-markdown_github/draw20ng-1.png) + +### Openface + +![](README_files/figure-markdown_github/plotFaceVectors-1.png) + +![](README_files/figure-markdown_github/faceImages-1.png) + +A high resolution version is available [here](./faceshighres.png) + +### Visualizing tf-idf and Topic Matrices + +![](README_files/figure-markdown_github/drawtdm-1.png) + +Benchmarks +---------- + +Overview +-------- + +Besides manifold visualization, `largeVis` also includes an extremely efficient approximate nearest-neighbor search that runs in *O*(*n*) time. + +This vignette includes benchmarks and recommendations for adjusting hyperparameters in the neighbor search for best results. + +Hyperparameters +--------------- + +The `randomProjectionTreeSearch` function has three hyperparameters that trade off accuracy and efficiency in the neighbor search: + +1. `n_trees` - In the first phase of the function, the number of random projection trees to create. +2. `tree_threshold` - The maximum number of nodes on a random projection tree leaf. If, after branching, the number of nodes in a branch exceeds this threshold, the branch will be divided again. +3. `max_iter` - The number of iterations for the neighborhood-exploration phase of the algorithm.
+ +Data Collection & Methodology +----------------------------- + +The data in the benchmarks below was obtained by running the `benchmark.R` script, which is installed along with the package, on two machines. + +The aim was to replicate as much as possible the methodology used by Erik Bernhardsson's [ANN Benchmark](https://github.com/erikbern/ann-benchmarks) github. However, `ANN Benchmark` is designed for libraries that are designed to build a neighbor index and then rapidly process queries against the index. The measure used by `ANN Benchmark` is therefore queries-per-second. By contrast, `largeVis` is concerned with getting neighbors for all of the nodes in a finite dataset as quickly as possible. + +Times shown for `RcppAnnoy` include the time to build a searchable index and query neighbors for all rows in the dataset. + +The data used is the 1-million vector, 128-feature [SIFT Dataset](http://corpus-texmex.irisa.fr/), which is the test data used by `ANN Benchmark`. + +Benchmarks were run on several machines. First, benchmarks were run on a workstation and a server with *K* = 100. Benchmarks were then run on an AWS c4.2xlarge instance with *K* = 100 and *K* = 50, to replicate as closely as possible the conditions of `ANN Benchmark`. + +Results that appear to have used virtual memory, in that the completion time was radically discontinuous with other results from the same machine, were discarded. + +I welcome submissions of output from the script from other hardware. + +Comparison With Annoy +--------------------- + +The following chart illustrates performance versus the `Annoy` library, as implemented through the `RcppAnnoy` R package. + +To facilitate comparison with the ANN Benchmark charts, the Y-axis shows the number of vectors processed per second.
+ + + +Approximate Equivalence of Number of Trees and Tree Threshold +------------------------------------------------------------- + +There is an approximate trade-off in memory use between the tree threshold and number of trees. Peak memory consumption during the tree search phase = N \* n\_trees \* threshold. + +The trade-off is not precise because the tree split phase will return fewer nodes per tree than the threshold. On average, it should return about 3/4 of the threshold. + +On the following chart, points that share the same values of n\_trees \* threshold, referred to as `tth`, (and number of neighborhood exploration iterations), are shown as the same series. + +![](README_files/figure-markdown_github/constn-1.png) + +Results that hold nn constant while varying the number of trees and threshold tend to cluster together, however increasing the number of trees (while holding tth constant) tends to improve accuracy and decrease speed. The degree of dispersion increases when a neighborhood exploration iteration is added. + +On the charts below, n\_trees \* threshold is referred to as `tth`. + +Effect of Increasing `tth` vs. `max_iters` +------------------------------------------ + +![](README_files/figure-markdown_github/tree_threshold-1.png) + +A single iteration clearly has substantial impact on accuracy. The marginal benefit of additional iterations declines, but adding a second iteration is a more efficient way to improve accuracy than increasing tth. This is consistent with the recommendation of the paper authors. Vignette -------- @@ -26,12 +129,28 @@ This Vingette provides an overview of the largeVis package. Introduction ------------ -The `largeVis` package offers four functions for visualizing high-dimensional datasets and finding approximate nearest neighbors, based on the `LargeVis` algorithm presented in Tang et al. (2016): +This package provides `LargeVis` visualizations and fast nearest-neighbor search. 
The `LargeVis` algorithm, presented in Tang et al. (2016), creates high-quality low-dimensional representations of large, high-dimensional datasets, similar to [t-SNE](https://lvdmaaten.github.io/tsne/). + +These visualizations are useful for data exploration, for visualizing complex non-linear functions, and especially for visualizing embeddings such as learned vectors for images. + +A limitation of t-SNE is that because the algorithm has complexity order *O*(*n*<sup>2</sup>), it is not feasible for use on even moderately sized datasets. [Barnes-Hut](https://arxiv.org/pdf/1301.3342.pdf), an approximation of t-SNE, has complexity *O*(*n* log *n*) but also quickly becomes infeasible as the size of data grows. `LargeVis` is intended to address the issue by operating in linear *O*(*n*) time. It has been benchmarked at more than 30x faster than Barnes-Hut on datasets of approximately 1-million rows, and scaled linearly as long as there is sufficient RAM. + +In addition, `LargeVis` includes an algorithm for finding approximate k-Nearest Neighbors in *O*(*n*) time. This algorithm turns out to be faster at finding accurate a-NNs than any other method I was able to test. + +The package also includes a function for visualizing image embeddings by plotting images at the locations given by the `LargeVis` algorithm. + +For a detailed description of the algorithm, please see the original paper, Tang et al. (2016). + +Package Overview +---------------- + +The `largeVis` package offers five functions for visualizing high-dimensional datasets and finding approximate nearest neighbors (along with some helper functions): 1. `randomProjectionTreeSearch`, a method for finding approximate nearest neighbors. 2. `projectKNNs`, which takes as input a weighted nearest-neighbor graph and estimates a projection into a low-dimensional space. -3. `vis`, which combines `randomProjectionTreeSearch`, `buildEdgeMatrix`, and `projectKNNs`, along with additional code to implement the `LargeVis` algorithm. -4.
`manifoldMap`, which produces a plot for visualizing embeddings of images. +3. `largeVis`, which implements the entire `LargeVis` algorithm. +4. `manifoldMap` (and companion `ggManifoldMap`), which produce a plot for visualizing embeddings of images. +5. `buildWijMatrix` takes a sparse matrix of the distances between nearest neighbors, and returns one with the edges properly weighted for use in `projectKNNs`. See the [original paper](https://arxiv.org/abs/1602.00370) for a detailed description of the algorithm. @@ -44,29 +163,20 @@ If there are NA's, Infs, or NULLs in the input, `randomProjectionTreeSearch` wil If the numerical range covered by the data is large, this can cause errors in or before the `buildEdgeMatrix` function. This is because the algorithm requires calculating $\\exp(||\\vec{x\_i}, \\vec{x\_j}||^2)$ in the high-dimensional space, which will overflow if the distance between any nearest neighbors exceeds about 26. -If there are duplicates in the input data, while the implementation tries to filter duplicates, it is likely to lead to problems. If the number of duplicates is large, this can cause the random projection tree search to fail. If the number is small, the algorithm may identify a sufficient number of neighbors, but an error may then occur during `buildEdgeMatrix`, or stochastic gradient descent. - -Examples --------- - - - - +Duplicates in the input data are likely to cause issues. If the number of duplicates is large, this can cause the random projection tree search to fail. If the number is small, the algorithm may identify a sufficient number of neighbors, but an error may then occur during `buildEdgeMatrix`, or stochastic gradient descent. Overview of Functions and Hyperparameters ----------------------------------------- ### `randomProjectionTreeSearch` -This function uses a two-phase algorithm to find approximate nearest neighbors.
In the first phase, the algorithm creates `n_trees` binary trees dividing the space into leaves of at most `tree_threshold` nodes. A node's candidate nearest neighbors are the union of all nodes with which it shared a leaf on any of the trees. In the second phase, for each node, the algorithm looks at the candidate nearest neighbors for that node, as well as each of those nodes' candidate nearest neighbors. The logic of the algorithm is that a node's neighbors' neighbors are likely to be the node's own neighbors. In each iteration, the closest `K` candidate neighbors for each node are kept. +This function uses a two-phase algorithm to find approximate nearest neighbors. In the first phase, which is based on [Erik Bernhardsson](http://erikbern.com)'s [Annoy](https://github.com/spotify/annoy) algorithm, `n_trees` trees are formed by recursively dividing the space by hyperplanes until at most `tree_threshold` nodes remain in a branch. A node's candidate nearest neighbors are the union of all nodes with which it shared a leaf on any of the trees. The `largeVis` algorithm adds a second phase, neighborhood exploration, which considers, for each node, whether the candidate neighbors of the node's candidate immediate neighbors are closer. The logic of the algorithm is that a node's neighbors' neighbors are likely to be the node's own neighbors. In each iteration, the closest `K` candidate neighbors for each node are kept. -The authors of Tang et al. (2016) suggest that a single iteration of the second phase is generally sufficient to obtain satisfactory performance. +(Note that this implementation of `largeVis` differs from the approach taken by `Annoy`, in that `Annoy` always uses the number of features as the leaf threshold, where `largeVis` allows this to be an adjustable parameter.) -The chart below illlustrates the trade-off between performance and accuracy for the nearest-neighbor search, using various hyperparameters. 
The data was produced using the `benchmark.R` script in the `inst/` directory. The test data is the 1-million vector, 128-feature [SIFT Dataset](http://corpus-texmex.irisa.fr/), as per Erik Bernhardsson's [ANN Benchmark](https://github.com/erikbern/ann-benchmarks) github. - - +The authors of Tang et al. (2016) suggest that a single iteration of the second phase is generally sufficient to obtain satisfactory performance. -If `randomProjectionTreeSearch` fails to find the desired number of neighbors, usually the best result is obtained by increasing the tree threshold. If `randomProjectionTreeSearch` fails with an error that no neighbors were found for some nodes, and the tree threshold is already reasonable, this may be an indication that duplicates remain in the input data. +See the vignette "ANN Benchmarks" for additional information. ### `projectKNNs` @@ -74,7 +184,11 @@ This function takes as its input a `Matrix::sparseMatrix`, of connections betwee The `LargeVis` algorithm, explained in detail in Tang et al. (2016), estimates the embedding by sampling from the identitied nearest-neighbor connections. For each edge, the algorithm also samples `M` non-nearest neighbor negative samples. `M`, along with *γ* and *α*, control the visualization. *α* controls the desired distance between nearest neighbors. *γ* controls the relative strength of the attractive force between nearest neighbors and repulsive force between non-neighbors. -The following grid illustrates the effect of the *α* and *γ* hyperparameters, using the `wiki` dataset which is included with the package: +The following grid illustrates the effect of the *α* and *γ* hyperparameters: + +``` r +load(system.file(package = "largeVis", "extdata/vignettedata.Rda")) +``` @@ -86,73 +200,113 @@ The algorithm can treat positive edge weights in two different ways. 
The authors The `vis` function combines `randomProjectionTreeSearch` and `projectKNNs`, along with additional logic for calculating edge weights, to implement the complete `LargeVis` algorithm. -The following chart illustrates the effect of the `M` and `K` parameters, using the `iris` dataset. +The following chart illustrates the effect of the `M` and `K` parameters, using the `iris` dataset. Each row re-uses the same set of identified `K` neighbors, and initial coordinates. - + ### `manifoldMap` -The `manifoldMap` function is useful when the examples being clustered are themselves images. Given a coordinate matrix (as generated by `projectKNNs` or `vis`) and an `array` of `N` images, the function samples `n` images and plots them at the coordinates given in the matrix. If the `transparency` parameter is a number between 0 and 1, then the function adds to each image an alpha channel where the value per pixel is proportional to *t**r**a**n**s**p**a**r**e**n**c**y*\* the image content. +The `manifoldMap` function is useful when the examples being clustered are themselves images. Given a coordinate matrix (as generated by `projectKNNs` or `vis`) and an `array` of `N` images, the function samples `n` images and plots them at the coordinates given in the matrix. -The function can plot both color and greyscale images. 
- -The following code will plot 5000 images sampled from the MNIST dataset at positions generated by `vis`: +The following code will generate the visualization shown in the examples: ``` r -if (exists("trainData")) { - dim(trainData) <- c(60000, 28, 28) - manifoldMap(mnistCoords[,1:2], - n = 5000, - scale = 0.003, - transparency = F, - images = trainData, - xlab="", ylab="", - xlim = c(-2, 2), - ylim = c(-2, 2)) -} +dim(trainData) <- c(60000, 28, 28) +aperm(trainData, perm = c(1,3,2), resize = FALSE) +set.seed(1974) +manifoldMap(mnistCoords[,1:2], + n = 5000, + scale = 0.1, + images = trainData, + xlab = "", + ylab = "") ``` - - -The code is disabled by default in this vignette for data size reasons. - Support for Sparse Matrices --------------------------- -`largeVis` supports sparse matrices. Besides facilitating very large datasets, this makes it practicable to visualize term-document-matrices. +`largeVis` supports sparse matrices. Besides facilitating very large datasets, this makes it practicable to visualize term-document-matrices directly, and compare the result with the result of visualizing topic vectors. -For example, the following plot visualizes a tf-idf weighted document-term matrix for a corpus of 5000 political blog entries, as included with the `stm` package. +Visualizing Graphs +------------------ -![](README_files/figure-markdown_github/drawtdm-1.png) +The `largeVis` visualization algorithm can be used to visualize undirected weighted or unweighted acyclic graphs. The included `wiki` dataset is an example. + +The following code illustrates how to import and visualize a graph using the YouTube-communities dataset available [here](https://snap.stanford.edu/data/com-Youtube.html). The data and visualization are not included here for size reasons. 
+ +``` r +youtube <- readr::read_tsv(pathToGraphFile, skip=4, col_names=FALSE) +youtube <- as.matrix(youtube) +youtube <- Matrix::sparseMatrix(i = youtube[, 1], + j = youtube[, 2], + x = rep(1, nrow(youtube)), + dims = c(max(youtube), max(youtube))) +youtube <- youtube + t(youtube) +communities <- readr::read_lines(pathToCommunities) +communities <- lapply(communities, + FUN = function(x) as.numeric(unlist(strsplit(x, "\t")))) +community_assignments <- rep(0, + nrow(youtube)) +for (i in 1:length(communities)) community_assignments[communities[[i]]] <- i + +wij <- buildWijMatrix(youtube) +youTube_coordinates <- projectKNNs(youtube) +youTube_coordinates <- data.frame(scale(t(youTube_coordinates))) +colnames(youTube_coordinates) <- c("x", "y") +youTube_coordinates$community <- factor(community_assignments) +youTube_coordinates$alpha <- factor(ifelse(youTube_coordinates$community == 0, 0.05, 0.2)) +ggplot(youTube_coordinates, aes( x = x, + y = y, + color = community, + alpha = alpha, + size = alpha)) + + geom_point() + + scale_color_manual(values = + c("black", colors_continuous(5000)), + guide = FALSE) + + scale_alpha_manual(values = c(0.005, 0.2), guide = FALSE) + + scale_size_manual(values = c(0.03, 0.15), guide = FALSE) + + scale_x_continuous("", + breaks = NULL, limits = c(-2.5,2.5)) + + scale_y_continuous("", + breaks = NULL, limits = c(-2.5,2.5)) + + ggtitle("YouTube Communities") +``` Distance Methods ---------------- -The original `LargeVis` paper used Euclidean distances exclusively. The `largeVis` package offers a choice among Euclidean and Cosine distance measures. +The original `LargeVis` paper used Euclidean distances exclusively. The `largeVis` package offers a choice between Euclidean and Cosine distance measures. + +The implementation is not optimized for cosine distances. Memory Consumption ------------------ -The algorithm is necessarily memory-intensive for large datasets. 
`neighborsToVectors`, `distance`, and `buildEdgeMatrix` are available as separate functions to facilitate memory-efficient handling of large datasets, because the high-dimensional dataset is not needed after distances have been calculated. In this case, the workflow is: +The algorithm is necessarily memory-intensive for large datasets. + +A simple way to reduce peak memory usage is to turn off the `save_neighbors` parameter when running `vis`. If this is insufficient, the steps of the algorithm can be run separately with the `neighborsToVectors`, `distance`, and `buildEdgeMatrix` functions. In this case, the workflow is: ``` r neighbors <- randomProjectionTreeSearch(largeDataset) -neighborIndices <- neighborsToVectors(neighbors) +edges <- buildEdgeMatrix(data = largeDataset, neighbors = neighbors) rm(neighbors) -distances <- distance(neighborIndices$i, - neighborIndices$j, - largeDataset) -rm(largeDataset) -wij <- buildEdgeMatrix(i = neighborIndices$i, - j = neighborIndices$j, - d = distances) -rm(distances, neighborIndices) -coords <- projectKNNs(wij$wij) +gc() +wij <- buildWijMatrix(edges) +rm(edges) +gc() +coords <- projectKNNs(wij) ``` -In testing, this method reduced peak RAM requirements by more than 70%. +Note that `gc()` is being called explicitly. The reason is that R will not collect garbage while executing the package's C++ functions, which can require substantial temporary RAM. -Bibliography ------------- +Memory requirements during the neighbor search may be managed by reducing `n_trees` and increasing the `tree_threshold`. The decrease in precision is marginal, and may be compensated for by increasing `max_iters`. See the benchmarks vignette for further detail. + +References +---------- + +``` r +save(agcoords, iriscoords, file = "vignettedata/vignettedata.Rda") +``` -Tang, Jian, Jingzhou Liu, Ming Zhang, and Qiaozhu Mei. 2016. “Visualization Large-Scale and High-Dimensional Data.” *CoRR* abs/1602.00370. . 
+Tang, Jian, Jingzhou Liu, Ming Zhang, and Qiaozhu Mei. 2016. “Visualizing Large-Scale and High-Dimensional Data.” In *Proceedings of the 25th International Conference on World Wide Web*, 287–97. International World Wide Web Conferences Steering Committee. diff --git a/README_files/figure-markdown_github/constn-1.png b/README_files/figure-markdown_github/constn-1.png new file mode 100644 index 0000000..29e1745 Binary files /dev/null and b/README_files/figure-markdown_github/constn-1.png differ diff --git a/README_files/figure-markdown_github/draw20ng-1.png b/README_files/figure-markdown_github/draw20ng-1.png index 5e01c80..b01966d 100644 Binary files a/README_files/figure-markdown_github/draw20ng-1.png and b/README_files/figure-markdown_github/draw20ng-1.png differ diff --git a/README_files/figure-markdown_github/drawYouTube-1.png b/README_files/figure-markdown_github/drawYouTube-1.png new file mode 100644 index 0000000..e69de29 diff --git a/README_files/figure-markdown_github/drawhyperparameters-1.png b/README_files/figure-markdown_github/drawhyperparameters-1.png index cb4fe4e..ac8a21a 100644 Binary files a/README_files/figure-markdown_github/drawhyperparameters-1.png and b/README_files/figure-markdown_github/drawhyperparameters-1.png differ diff --git a/README_files/figure-markdown_github/drawiris-1.png b/README_files/figure-markdown_github/drawiris-1.png new file mode 100644 index 0000000..5ae3a83 Binary files /dev/null and b/README_files/figure-markdown_github/drawiris-1.png differ diff --git a/README_files/figure-markdown_github/drawiriscoords-1.png b/README_files/figure-markdown_github/drawiriscoords-1.png index b700744..009522d 100644 Binary files a/README_files/figure-markdown_github/drawiriscoords-1.png and b/README_files/figure-markdown_github/drawiriscoords-1.png differ diff --git a/README_files/figure-markdown_github/drawmanifoldmap-1.png b/README_files/figure-markdown_github/drawmanifoldmap-1.png index a213e09..4e734a4 100644 Binary files 
a/README_files/figure-markdown_github/drawmanifoldmap-1.png and b/README_files/figure-markdown_github/drawmanifoldmap-1.png differ diff --git a/README_files/figure-markdown_github/drawmnist-1.png b/README_files/figure-markdown_github/drawmnist-1.png index c93f1c3..eba8075 100644 Binary files a/README_files/figure-markdown_github/drawmnist-1.png and b/README_files/figure-markdown_github/drawmnist-1.png differ diff --git a/README_files/figure-markdown_github/drawstm-1.png b/README_files/figure-markdown_github/drawstm-1.png new file mode 100644 index 0000000..c6c9b60 Binary files /dev/null and b/README_files/figure-markdown_github/drawstm-1.png differ diff --git a/README_files/figure-markdown_github/drawtdm-1.png b/README_files/figure-markdown_github/drawtdm-1.png index 7407c41..0626d78 100644 Binary files a/README_files/figure-markdown_github/drawtdm-1.png and b/README_files/figure-markdown_github/drawtdm-1.png differ diff --git a/README_files/figure-markdown_github/drawwikidocs-1.png b/README_files/figure-markdown_github/drawwikidocs-1.png new file mode 100644 index 0000000..5adc02c Binary files /dev/null and b/README_files/figure-markdown_github/drawwikidocs-1.png differ diff --git a/README_files/figure-markdown_github/drawwikiwords-1.png b/README_files/figure-markdown_github/drawwikiwords-1.png new file mode 100644 index 0000000..8ed2316 Binary files /dev/null and b/README_files/figure-markdown_github/drawwikiwords-1.png differ diff --git a/README_files/figure-markdown_github/faceImages-1.png b/README_files/figure-markdown_github/faceImages-1.png new file mode 100644 index 0000000..b1db84d Binary files /dev/null and b/README_files/figure-markdown_github/faceImages-1.png differ diff --git a/README_files/figure-markdown_github/iris_mkhyperparams-1.png b/README_files/figure-markdown_github/iris_mkhyperparams-1.png new file mode 100644 index 0000000..b78ce5c Binary files /dev/null and b/README_files/figure-markdown_github/iris_mkhyperparams-1.png differ diff --git 
a/README_files/figure-markdown_github/max_iters-1.png b/README_files/figure-markdown_github/max_iters-1.png new file mode 100644 index 0000000..b831e28 Binary files /dev/null and b/README_files/figure-markdown_github/max_iters-1.png differ diff --git a/README_files/figure-markdown_github/mnistmanifold-1.png b/README_files/figure-markdown_github/mnistmanifold-1.png new file mode 100644 index 0000000..86a5192 Binary files /dev/null and b/README_files/figure-markdown_github/mnistmanifold-1.png differ diff --git a/README_files/figure-markdown_github/n_trees-1.png b/README_files/figure-markdown_github/n_trees-1.png new file mode 100644 index 0000000..e3775ad Binary files /dev/null and b/README_files/figure-markdown_github/n_trees-1.png differ diff --git a/README_files/figure-markdown_github/plotFaceVectors-1.png b/README_files/figure-markdown_github/plotFaceVectors-1.png new file mode 100644 index 0000000..1d66d0b Binary files /dev/null and b/README_files/figure-markdown_github/plotFaceVectors-1.png differ diff --git a/README_files/figure-markdown_github/plotpeformance-1.png b/README_files/figure-markdown_github/plotpeformance-1.png index 45644e6..5e9bb98 100644 Binary files a/README_files/figure-markdown_github/plotpeformance-1.png and b/README_files/figure-markdown_github/plotpeformance-1.png differ diff --git a/README_files/figure-markdown_github/tree_threshold-1.png b/README_files/figure-markdown_github/tree_threshold-1.png new file mode 100644 index 0000000..a6942ee Binary files /dev/null and b/README_files/figure-markdown_github/tree_threshold-1.png differ diff --git a/data/facevectors.rda b/data/facevectors.rda new file mode 100644 index 0000000..89a6f22 Binary files /dev/null and b/data/facevectors.rda differ diff --git a/data/wiki.rda b/data/wiki.rda deleted file mode 100644 index 3099670..0000000 Binary files a/data/wiki.rda and /dev/null differ diff --git a/faceshighres.png b/faceshighres.png new file mode 100644 index 0000000..1b6f68b Binary files /dev/null 
and b/faceshighres.png differ diff --git a/inst/benchmark.R b/inst/benchmark.R index 8606650..96f835a 100644 --- a/inst/benchmark.R +++ b/inst/benchmark.R @@ -1,10 +1,10 @@ benchmark <- function(path, samplepath, - K = 40, - tree_range = c(10, 20), - thresholds = c(10, 20, 30, 40, 50), - iters = c(3), - n = 10) { + K = 40, + tree_range = c(10, 20, 50), + thresholds = c(10, 20, 30, 40, 50), + iters = c(3), + n = 10) { data <- readr::read_delim(path, delim = " ", col_names = F) data <- as.matrix(data) data <- scale(data) @@ -26,48 +26,244 @@ benchmark <- function(path, } data <- t(data) - results <- data.frame(time = numeric(0), - precision = numeric(0), - n_trees = numeric(0), - max_iterations = numeric(0), - tree_threshold = numeric(0)) - for (n_trees in tree_range) { for (max_iters in iters) { for (threshold in thresholds) { print(paste(n_trees, max_iters, threshold)) - time <- system.time( + gc() + start <- Sys.time() knns <- randomProjectionTreeSearch(data, - K, n_trees, threshold, max_iters, - verbose = TRUE) - ) + K, n_trees, threshold, max_iters, + verbose = TRUE) + time <- Sys.time() - start + units(time) <- "mins" precision <- lapply(1:n, - FUN = function(x) - sum(knns[, savedsamples$samples[x]] %in% savedsamples$neighbors[x, ])) + FUN = function(x) + sum(knns[, savedsamples$samples[x]] %in% savedsamples$neighbors[x, ])) + precision <- sum(as.numeric(precision)) / n one_result <- data.frame( - time = time[1] + time[5], - precision = sum(as.numeric(precision)) / n, - n_trees = n_trees, - max_iterations = max_iters, - tree_threshold = threshold) + time = time, + precision = precision, + n_trees = n_trees, + max_iterations = max_iters, + tree_threshold = threshold, + method = "largeVis", + tree_type = "", + searchtype = "", + eps = 0, + K = K, + machine = "aws") print(one_result) readr::write_csv(one_result, path = "results.csv", append = TRUE) + if (precision == K) break } } } - return(results) +} + +benchmarkRANN <- function(path, + samplepath, + K = 40, + 
tree_types = "kd", # c("kd", "bd"), + searchtypes = c("priority", "standard"), + epss = c(0.1, 0.2, 0.5), + n = 10) { + data <- readr::read_delim(path, delim = " ", col_names = F) + data <- as.matrix(data) + data <- scale(data) + cat("Getting actual neighbors...\n") + + if (file.exists(samplepath)) { + load(samplepath) + } else { + samples <- sample(nrow(data), n, replace = F) + + actualneighbors <- RANN::nn2( + data, data[samples, ], k = K, treetype = "kd", + )$nn.idx - 1 + + savedsamples <- list(samples = samples, neighbors = actualneighbors) + save(savedsamples, file = samplepath) + rm(samples) + rm(actualneighbors) + } + library(RANN) + for (tree_type in tree_types) { + for (searchtype in searchtypes) { + for (eps in epss) { + print(paste(tree_type, searchtype, eps)) + start <- Sys.time() + knns <- nn2(data, k = K, query = data[savedsamples$samples, ], + treetype = tree_type, + searchtype = searchtype, + eps = eps)$nn.idx + time <- Sys.time() - start + units(time) <- "mins" + precision <- lapply(1:n, + FUN = function(x) + sum(knns[x, ] %in% (savedsamples$neighbors[x, ] + 1))) + + one_result <- data.frame( + time = time, + precision = sum(as.numeric(precision)) / n, + n_trees = 0, + max_iterations = 0, + tree_threshold = 0, + method = "RANN", + tree_type = tree_type, + searchtype = searchtype, + eps = eps) + print(one_result) + readr::write_csv(one_result, path = "results.csv", append = TRUE) + } + } + } +} + +benchmarkAnnoy <- function(path, + samplepath, + K = 40, + tree_range = c(10, 20, 50, 100), + n = 10, + full = FALSE) { + data <- readr::read_delim(path, delim = " ", col_names = F) + data <- as.matrix(data) + data <- scale(data) + cat("Getting actual neighbors...\n") + + if (file.exists(samplepath)) { + load(samplepath) + } else { + samples <- sample(nrow(data), n, replace = F) + + actualneighbors <- RANN::nn2( + data, data[samples, ], k = K, treetype = "kd", + )$nn.idx - 1 + + savedsamples <- list(samples = samples, neighbors = actualneighbors) + 
save(savedsamples, file = samplepath) + rm(samples) + rm(actualneighbors) + } + library(RcppAnnoy) + for (n_trees in tree_range) { + knns <- list() + print(n_trees) + gc() + start <- Sys.time() + a <- new(AnnoyEuclidean, ncol(data)) + for (i in 1:nrow(data)) a$addItem(i - 1, data[i, ]) + a$build(n_trees) + if (! full) for (i in 1:n) + knns[[i]] <- a$getNNsByItem(item = savedsamples$samples[i] - 1, + size = K) + else for (i in 1:nrow(data)) + knns[[i]] <- a$getNNsByItem(item = i - 1, size = K) + time <- Sys.time() - start + units(time) <- "mins" + + if (full) knns <- knns[savedsamples$samples] + + precision <- lapply(1:n, + FUN = function(x) + sum(knns[[x]] %in% savedsamples$neighbors[x, ])) + + if (full) method <- "RcppAnnoy-Full" + else method <- "RcppAnnoy" + + one_result <- data.frame( + time = time, + precision = sum(as.numeric(precision)) / n, + n_trees = n_trees, + max_iterations = 0, + tree_threshold = 0, + method = method, + tree_type = "", + searchtype = "", + eps = 0, + K = K, + machine = "aws") + print(one_result) + readr::write_csv(one_result, path = "results.csv", append = TRUE) + } } require( largeVis ) -path <- "/mnt/hfsshare/DATASETS/sift/siftknns.txt" +path <- "./siftknns.txt" samplepath <- "./samples.Rda" +Annoyresults <- benchmarkAnnoy(path, + samplepath, + tree_range = c(10, 20, 50, 100, 200, 400), + n = 10000, + K = 50, + full = TRUE) results <- benchmark(path, samplepath, n = 10000, - K = 100, - tree_range = 10, - thresholds = 20, - iters = 2) -print(results) + tree_range = c(10, 20, 50, 100, 200), + thresholds = c(128), + iters = c(1, 0, 2, 3), + K = 50) +results2 <- benchmark(path, + samplepath, + n = 10000, + tree_range = c(10, 20, 50), + thresholds = c(10, 20, 50, 80, 256, 512), + iters = c(1), + K = 50) +results2 <- benchmark(path, + samplepath, + n = 10000, + tree_range = c(10), + thresholds = c(200, 400, 800), + iters = c(0,1,2), + K = 50) +results2 <- benchmark(path, + samplepath, + n = 10000, + tree_range = c(20), + thresholds = 
c(100, 200, 400), + iters = c(0,1,2), + K = 50) +results2 <- benchmark(path, + samplepath, + n = 10000, + tree_range = c(40), + thresholds = c(50, 100, 200), + iters = c(0,1,2), + K = 50) +results2 <- benchmark(path, + samplepath, + n = 10000, + tree_range = c(80), + thresholds = c(25, 50, 100), + iters = c(0,1,2), + K = 50) +# RANNresults <- benchmarkRANN(path, +# samplepath, +# epss = c(.1, .5, 1,2,5), +# n = 10000, +# K = 500) +results2 <- benchmark(path, + samplepath, + n = 10000, + tree_range = c(2), + thresholds = c(50, 100, 250), + iters = c(0,1,2), + K = 50) +results2 <- benchmark(path, + samplepath, + n = 10000, + tree_range = c(4), + thresholds = c(125), + iters = c(0,1,2), + K = 50) +results2 <- benchmark(path, + samplepath, + n = 10000, + tree_range = c(5), + thresholds = c(20, 40, 100), + iters = c(0,1,2), + K = 50) diff --git a/inst/doc/benchmarks.R b/inst/doc/benchmarks.R new file mode 100644 index 0000000..0715ff7 --- /dev/null +++ b/inst/doc/benchmarks.R @@ -0,0 +1,127 @@ +## ----setupbenchmark,eval=T,echo=F,warning=F,error=F,message=F------------ +# Note to reader: Please don't steal the semi-distinctive visual style I spent several minutes creating for myself. 
+require(ggplot2, + quietly = TRUE) +require(RColorBrewer, + quietly = TRUE) +require(wesanderson, + quietly = TRUE) +require(dplyr, quietly = TRUE) +knitr::opts_chunk$set(collapse = TRUE, + comment = "#>", + fig.width = 7, + fig.height = 5) +colors_discrete <- function(x) rep(wes_palette("Darjeeling", + n = min(x, 5)), + 2)[1:x] +colors_divergent_discrete <- function(x) + grDevices::colorRampPalette(RColorBrewer::brewer.pal(x, "Spectral")) +colors_continuous <- function(x) wes_palette(name = "Zissou", + n = x, + type = "continuous") + +nacol <- colors_discrete(4)[4] +theme_set( + theme_bw() %+replace% + theme( + legend.key.size = unit(4, "mm"), + legend.title = element_text(size = rel(0.8), + face = "bold"), + legend.margin = unit(0, "cm"), + legend.position = "bottom", + legend.key.size = unit(0.5, "lines"), + legend.text=element_text(size = unit(8, "points")), + axis.title.y = element_text(angle = 90), + axis.text = element_text(size = rel(0.7)), + plot.margin = unit(c(0, 0.5, 1, 0), "lines"), + axis.title = element_text(size = rel(0.8), + face = "bold"), + title = element_text(size = rel(0.9)) + ) +) + +## ----plotpeformance,echo=F,fig.align='center',warning=FALSE,message=FALSE---- +load(system.file("extdata", "benchmark.Rda", package = "largeVis")) +benchmark %>% + filter(machine != 'Large Server', + machine == 'Workstation' | K == 50) %>% + mutate(facet = precision, + facet = ifelse(facet < 0.95, '', 'Closeup'), + facet = factor(facet)) %>% + ggplot(aes( y = time, + x = precision, + group = series, + fill = series, + shape = series)) + + geom_point(size = 1.5, alpha = 0.7, color = "grey80") + + scale_y_log10(name = "Speed, log (nodes / seconds)") + + scale_x_continuous("Precision", + breaks = c(0, 0.2, 0.4, 0.6, 0.8, 0.925, 0.95, 0.975, 1.0)) + + facet_grid(K + machine ~ facet, scales = "free") + + scale_fill_manual(name = "Method & n. 
iter.", + values = colors_divergent_discrete(nlevels(benchmark$series))(nlevels(benchmark$series))) + + scale_shape_manual(name = "Method & n. iter.", + values = c(21, 21, 21, 21, 23)) + + # guides(color = guide_legend(nrow=3)) + + ggtitle(expression( + atop("Precision-Performance tradeoff, RcppAnnoy and largeVis", + atop(italic("(n = 10000; Upper Right is Better)")) + ) + )) + +## ----constn,echo=F,warning=F--------------------------------------------- +bench <- benchmark %>% + filter(method == 'largeVis', machine == 'Large Server') %>% + mutate(nn = threshold * n_trees) %>% + group_by(max_iters, nn) %>% + filter(n() > 2) %>% + mutate(series = paste(max_iters, ", ", nn, sep = " ")) +bench$facet <- factor(ifelse(bench$n_trees >= 4, "", "n. trees < 10")) +bench %>% + ggplot(aes(y = time, + x = precision, + fill = series, + group = series, + color = factor(n_trees))) + + geom_point(size = 1.5, alpha = 0.8, shape = 21) + + scale_fill_manual("n. iter, tth", values = colors_divergent_discrete(6)(6)) + + scale_color_grey("n. trees", start = 0.8, end = 0 ) + +# guides(color = FALSE) + + # scale_shape(name = "Iterations", solid = FALSE) + + facet_grid(machine ~ .) 
+ + scale_y_log10(name = "Speed, log (nodes / second)", limits = c(1e2,1e5)) + + scale_x_continuous("Precision", + breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1)) + + ggtitle(expression( + atop("Precision-Performance tradeoff, n_trees and tree_threshold", + atop(italic("(100-NN precision, n = 10000; Upper Right is Better)"))))) + +## ----tree_threshold,echo=F----------------------------------------------- +bench <- benchmark %>% + filter(method == 'largeVis', + machine != 'Large Server') %>% + mutate(label = ifelse(threshold == 128, "128", "Other"), + label = factor(label), + facet = precision, + facet = ifelse(facet < 0.85, '', 'Closeup')) +bench$facet <- factor(bench$facet) +bench %>% + arrange(nn) %>% + mutate(max_iters = factor(max_iters)) %>% + ggplot(aes(y = time, + x = precision , + color = max_iters, + group = max_iters)) + +# geom_path(size = 0.5, alpha =0.8, arrow = arrow(length = unit(0.05, "inches"))) + + geom_point(size = 1, alpha = 0.8, shape = 16) + + facet_grid(K + machine ~ facet, scales = 'free') + + scale_y_log10(name = "Speed, log (nodes / second)") + + scale_x_continuous("Precision", + breaks = c(0, 0.2, 0.4, 0.6, 0.8, 0.9, 0.92, 0.94, 0.96, 0.98, 1.0)) + + # scale_shape_discrete(name = "", solid = FALSE) + + # guides(color = FALSE) + + scale_color_manual("n. iter", values = colors_discrete(4)) + + ggtitle(expression( + atop("Precision-Performance tradeoff, effect of increasing tth vs. 
max_iters", + atop(italic("(n = 10000; Upper Right is Better)"))))) + diff --git a/inst/doc/benchmarks.Rmd b/inst/doc/benchmarks.Rmd new file mode 100644 index 0000000..3753fde --- /dev/null +++ b/inst/doc/benchmarks.Rmd @@ -0,0 +1,197 @@ +--- +title: "ANN Benchmarks" +author: "Amos Elberg" +date: '`r Sys.Date()`' +output: + rmarkdown::html_vignette: default +vignette: | + %\VignetteIndexEntry{ANN Benchmarks} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setupbenchmark,eval=T,echo=F,warning=F,error=F,message=F} +# Note to reader: Please don't steal the semi-distinctive visual style I spent several minutes creating for myself. +require(ggplot2, + quietly = TRUE) +require(RColorBrewer, + quietly = TRUE) +require(wesanderson, + quietly = TRUE) +require(dplyr, quietly = TRUE) +knitr::opts_chunk$set(collapse = TRUE, + comment = "#>", + fig.width = 7, + fig.height = 5) +colors_discrete <- function(x) rep(wes_palette("Darjeeling", + n = min(x, 5)), + 2)[1:x] +colors_divergent_discrete <- function(x) + grDevices::colorRampPalette(RColorBrewer::brewer.pal(x, "Spectral")) +colors_continuous <- function(x) wes_palette(name = "Zissou", + n = x, + type = "continuous") + +nacol <- colors_discrete(4)[4] +theme_set( + theme_bw() %+replace% + theme( + legend.key.size = unit(4, "mm"), + legend.title = element_text(size = rel(0.8), + face = "bold"), + legend.margin = unit(0, "cm"), + legend.position = "bottom", + legend.key.size = unit(0.5, "lines"), + legend.text=element_text(size = unit(8, "points")), + axis.title.y = element_text(angle = 90), + axis.text = element_text(size = rel(0.7)), + plot.margin = unit(c(0, 0.5, 1, 0), "lines"), + axis.title = element_text(size = rel(0.8), + face = "bold"), + title = element_text(size = rel(0.9)) + ) +) +``` + +## Overview + +Besides manifold visualization, `largeVis` also includes an extremely efficient approximate nearest-neighbor search that runs in $O(n)$ time. 
+ +This vignette includes benchmarks and recommendations for adjusting hyperparameters in the neighbor search for best results. + +## Hyperparameters + +The `randomProjectionTreeSearch` function has three hyperparameters that trade off accuracy and efficiency in the neighbor search: + +1. `n_trees` - In the first phase of the function, the number of random projection trees to create. +2. `tree_threshold` - The maximum number of nodes on a random projection tree leaf. If, after branching, the number of nodes in a branch exceeds this threshold, the branch will be divided again. +3. `max_iters` - The number of iterations for the neighborhood-exploration phase of the algorithm. + +## Data Collection \& Methodology + +The data in the benchmarks below was obtained by running the `benchmark.R` script, which is installed along with the package, on two machines. + +The aim was to replicate as much as possible the methodology used by Erik Bernhardsson's [ANN Benchmark](https://github.com/erikbern/ann-benchmarks) github. However, `ANN Benchmark` is designed for libraries that are designed to build a neighbor index and then rapidly process queries against the index. The measure used by `ANN Benchmark` is therefore queries-per-second. By contrast, `largeVis` is concerned with getting neighbors for all of the nodes in a finite dataset as quickly as possible. + +Times shown for `RcppAnnoy` include the time to build a searchable index and query neighbors for all rows in the dataset. + +The data used is the 1-million vector, 128-feature [SIFT Dataset](http://corpus-texmex.irisa.fr/), which is the test data used by `ANN Benchmark`. + +Benchmarks were run on several machines. First, benchmarks were run on a workstation and a server with $K = 100$. Benchmarks were then run on an AWS c4.2xlarge instance with $K = 100$ and $K = 50$, to replicate as closely as possible the conditions of `ANN Benchmark`. 
+ +Results that appear to have used virtual memory, in that the completion time was radically discontinuous with other results from the same machine, were discarded. + +I welcome submissions of output from the script from other hardware. + +## Comparison With Annoy + +The following chart illustrates performance versus the `Annoy` library, as implemented through the `RcppAnnoy` R package. + +To facilitate comparison with the ANN Benchmark charts, the Y-axis shows the number of vectors processed per second. + +```{r plotpeformance,echo=F,fig.align='center',warning=FALSE,message=FALSE} +load(system.file("extdata", "benchmark.Rda", package = "largeVis")) +benchmark %>% + filter(machine != 'Large Server', + machine == 'Workstation' | K == 50) %>% + mutate(facet = precision, + facet = ifelse(facet < 0.95, '', 'Closeup'), + facet = factor(facet)) %>% + ggplot(aes( y = time, + x = precision, + group = series, + fill = series, + shape = series)) + + geom_point(size = 1.5, alpha = 0.7, color = "grey80") + + scale_y_log10(name = "Speed, log (nodes / seconds)") + + scale_x_continuous("Precision", + breaks = c(0, 0.2, 0.4, 0.6, 0.8, 0.925, 0.95, 0.975, 1.0)) + + facet_grid(K + machine ~ facet, scales = "free") + + scale_fill_manual(name = "Method & n. iter.", + values = colors_divergent_discrete(nlevels(benchmark$series))(nlevels(benchmark$series))) + + scale_shape_manual(name = "Method & n. iter.", + values = c(21, 21, 21, 21, 23)) + + # guides(color = guide_legend(nrow=3)) + + ggtitle(expression( + atop("Precision-Performance tradeoff, RcppAnnoy and largeVis", + atop(italic("(n = 10000; Upper Right is Better)")) + ) + )) +``` + + +## Approximate Equivalence of Number of Trees and Tree Threshold + +There is an approximate trade-off in memory use between the tree threshold and number of trees. Peak memory consumption during the tree search phase = N * n_trees * threshold. 
+ +The trade-off is not precise because the tree split phase will return fewer nodes per tree than the threshold. On average, it should return about 3/4 of the threshold. + +On the following chart, points that share the same values of n_trees * threshold, referred to as `tth`, (and number of neighborhood exploration iterations), are shown as the same series. + +```{r constn,echo=F,warning=F} +bench <- benchmark %>% + filter(method == 'largeVis', machine == 'Large Server') %>% + mutate(nn = threshold * n_trees) %>% + group_by(max_iters, nn) %>% + filter(n() > 2) %>% + mutate(series = paste(max_iters, ", ", nn, sep = " ")) +bench$facet <- factor(ifelse(bench$n_trees >= 4, "", "n. trees < 10")) +bench %>% + ggplot(aes(y = time, + x = precision, + fill = series, + group = series, + color = factor(n_trees))) + + geom_point(size = 1.5, alpha = 0.8, shape = 21) + + scale_fill_manual("n. iter, tth", values = colors_divergent_discrete(6)(6)) + + scale_color_grey("n. trees", start = 0.8, end = 0 ) + +# guides(color = FALSE) + + # scale_shape(name = "Iterations", solid = FALSE) + + facet_grid(machine ~ .) + + scale_y_log10(name = "Speed, log (nodes / second)", limits = c(1e2,1e5)) + + scale_x_continuous("Precision", + breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1)) + + ggtitle(expression( + atop("Precision-Performance tradeoff, n_trees and tree_threshold", + atop(italic("(100-NN precision, n = 10000; Upper Right is Better)"))))) +``` + +Results that hold nn constant while varying the number of trees and threshold tend to cluster together, however increasing the number of trees (while holding tth constant) tends to improve accuracy and decrease speed. The degree of dispersion increases when a neighborhood exploration iteration is added. + +On the charts below, n_trees * threshold is referred to as `tth`. + +## Effect of Increasing `tth` vs. 
`max_iters` + + +```{r tree_threshold,echo=F} +bench <- benchmark %>% + filter(method == 'largeVis', + machine != 'Large Server') %>% + mutate(label = ifelse(threshold == 128, "128", "Other"), + label = factor(label), + facet = precision, + facet = ifelse(facet < 0.85, '', 'Closeup')) +bench$facet <- factor(bench$facet) +bench %>% + arrange(nn) %>% + mutate(max_iters = factor(max_iters)) %>% + ggplot(aes(y = time, + x = precision , + color = max_iters, + group = max_iters)) + +# geom_path(size = 0.5, alpha =0.8, arrow = arrow(length = unit(0.05, "inches"))) + + geom_point(size = 1, alpha = 0.8, shape = 16) + + facet_grid(K + machine ~ facet, scales = 'free') + + scale_y_log10(name = "Speed, log (nodes / second)") + + scale_x_continuous("Precision", + breaks = c(0, 0.2, 0.4, 0.6, 0.8, 0.9, 0.92, 0.94, 0.96, 0.98, 1.0)) + + # scale_shape_discrete(name = "", solid = FALSE) + + # guides(color = FALSE) + + scale_color_manual("n. iter", values = colors_discrete(4)) + + ggtitle(expression( + atop("Precision-Performance tradeoff, effect of increasing tth vs. max_iters", + atop(italic("(n = 10000; Upper Right is Better)"))))) +``` + +A single iteration clearly has substantial impact on accuracy. The marginal benefit of additional iterations declines, but adding a second iteration is a more efficient way to improve accuracy than increasing tth. This is consistent with the recommendation of the paper authors. + diff --git a/inst/doc/benchmarks.html b/inst/doc/benchmarks.html new file mode 100644 index 0000000..0612823 --- /dev/null +++ b/inst/doc/benchmarks.html @@ -0,0 +1,97 @@ + + + + + + + + + + + + + + + + +ANN Benchmarks + + + + + + + + + + + + + + + +

ANN Benchmarks

+

Amos Elberg

+

2016-08-01

+ + + +
+

Overview

+

Besides manifold visualization, largeVis also includes an extremely efficient approximate nearest-neighbor search that runs in \(O(n)\) time.

+

This vignette includes benchmarks and recommendations for adjusting hyperparameters in the neighbor search for best results.

+
+
+

Hyperparameters

+

The randomProjectionTreeSearch function has three hyperparameters that trade-off accuracy and efficiency in the neighbor search:

+
    +
  1. n_trees - In the first phase of the function, the number of random projection trees to create.
  2. +
  3. tree_threshold - The maximum number of any nodes on a random projection tree leaf. If, after branching, the number of nodes in a branch exceeds this threshold, the branch will be divided again.
  4. +
  5. max_iters - The number of iterations for the neighborhood-exploration phase of the algorithm.
  6. +
+
+
+

Data Collection & Methodology

+

The data in the benchmarks below was obtained by running the benchmark.R script, which is installed along with the package, on two machines.

+

The aim was to replicate as much as possible the methodology used by Erik Bernhardsson’s ANN Benchmark github. However, ANN Benchmark is designed for libraries that are designed to build a neighbor index and then rapidly process queries against the index. The measure used by ANN Benchmark is therefore queries-per-second. By contrast, largeVis is concerned with getting neighbors for all of the nodes in a finite dataset as quickly as possible.

+

Times shown for RcppAnnoy include the time to build a searchable index and query neighbors for all rows in the dataset.

+

The data used is the 1-million vector, 128-feature SIFT Dataset, which is the test data used by ANN Benchmark.

+

Benchmarks were run on several machines. First, benchmarks were run on a workstation and a server with \(K = 100\). Benchmarks were then run on an AWS c4.2xlarge instance with \(K = 100\) and \(K = 50\), to replicate as closely as possible the conditions of ANN Benchmark.

+

Results that appear to have used virtual memory, in that the completion time was radically discontinuous with other results from the same machine, were discarded.

+

I welcome submissions of output from the script from other hardware.

+
+
+

Comparison With Annoy

+

The following chart illustrates performance versus the Annoy library, as implemented through the RcppAnnoy R package.

+

To facilitate comparison with the ANN Benchmark charts, the Y-axis shows the number of vectors processed per second.

+

+
+
+

Approximate Equivalence of Number of Trees and Tree Threshold

+

There is an approximate trade-off in memory use between the tree threshold and number of trees. Peak memory consumption during the tree search phase = N * n_trees * threshold.

+

The trade-off is not precise because the tree split phase will return fewer nodes per tree than the threshold. On average, it should return about 3/4 of the threshold.

+

On the following chart, points that share the same values of n_trees * threshold, referred to as tth, (and number of neighborhood exploration iterations), are shown as the same series.

+

+

Results that hold tth constant while varying the number of trees and threshold tend to cluster together; however, increasing the number of trees (while holding tth constant) tends to improve accuracy and decrease speed. The degree of dispersion increases when a neighborhood exploration iteration is added.

+

On the charts below, n_trees * threshold is referred to as tth.

+
+
+

Effect of Increasing tth vs. max_iters

+

+

A single iteration clearly has substantial impact on accuracy. The marginal benefit of additional iterations declines, but adding a second iteration is a more efficient way to improve accuracy than increasing tth. This is consistent with the recommendation of the paper authors.

+
+ + + + + + + + diff --git a/inst/doc/largeVis.R b/inst/doc/largeVis.R index 3a5c4c3..583a56e 100644 --- a/inst/doc/largeVis.R +++ b/inst/doc/largeVis.R @@ -1,258 +1,208 @@ -## ----setup,eval=T,echo=F,warning=F,error=F,message=F--------------------- -# Note to reader: Please don't steal the semi-distinctive visual style I spent several minutes creating for myself. -library(RColorBrewer,quietly=T) -library(wesanderson,quietly=T) -colors_discrete <- function(x) rep(wes_palette("Darjeeling", n = min(x,5)), - 2)[1:x] -colors_divergent_discrete <- function(x) grDevices::colorRampPalette(RColorBrewer::brewer.pal(x, "Spectral")) -colors_continuous <- function(x) wes_palette(name= "Zissou",n = x, type= "continuous") +## ----setupvignette,eval=T,echo=F,warning=F,error=F,message=F------------- +require(ggplot2, + quietly = TRUE) +require(RColorBrewer, + quietly = TRUE) +require(wesanderson, + quietly = TRUE) +knitr::opts_chunk$set(collapse = TRUE, + comment = "#>", + cache=FALSE) +colors_discrete <- function(x) + rep(wes_palette("Darjeeling", n = min(x, 5)), 2)[1:x] +colors_divergent_discrete <- function(x) + grDevices::colorRampPalette(RColorBrewer::brewer.pal(x, "Spectral")) +colors_continuous <- function(x) wes_palette(name = "Zissou", + n = x, + type = "continuous") nacol <- colors_discrete(4)[4] -require(ggplot2,quietly = T) theme_set( theme_bw() %+replace% theme( - legend.key.size=unit(4,"mm"), - legend.title=element_text(size=rel(0.8), face = "bold"), - legend.margin=unit(0,"cm"), - legend.position="bottom", - legend.key.size=unit(0.5,"lines"), + legend.key.size = unit(4, "mm"), + legend.title = element_text(size = rel(0.8), + face = "bold"), + legend.margin = unit(0, "cm"), + legend.position = "bottom", + legend.key.size = unit(0.5, "lines"), legend.text=element_text(size = unit(8, "points")), - axis.title.y = element_text(angle=90), - axis.text = element_text(size=rel(0.7)), + axis.title.y = element_text(angle = 90), + axis.text = element_text(size = rel(0.7)), 
plot.margin = unit(c(0, 0.5, 1, 0), "lines"), - axis.title=element_text(size=rel(0.8),face="bold"), - title = element_text(size=rel(0.9)) - ) + axis.title = element_text(size = rel(0.8), + face = "bold"), + title = element_text(size = rel(0.9)) + ) ) -require(largeVis) +rebuild <- FALSE + +require(largeVis,quietly = TRUE) + +## ----reload,eval=!rebuild------------------------------------------------ +load(system.file(package = "largeVis", "extdata/vignettedata.Rda")) + +## ----drawhyperparameters,echo=F,fig.width=3.5,fig.height=4,fig.align='center',results='asis',cache=FALSE---- +if (! exists("agcoords")) { + data(wiki) + inputs <- data.frame( + g = rep(c(.5,1,7,14), 5), + a = rep(c(0,.1,1,5,10), each = 4) + ) + wij <- buildWijMatrix(wiki) + set.seed(1974) + initialcoords <- matrix(rnorm(ncol(wij) * 2), nrow = 2) + + agcoords <- do.call(rbind, + lapply(1:nrow(inputs), + FUN = function(x) { + a <- inputs[x, 'a'] + g <- inputs[x, 'g'] + newcoords <- initialcoords + projectKNNs(wij, alpha = a, + gamma = g, + verbose = FALSE, + coords = newcoords) %>% + t() %>% + scale() %>% + data.frame() %>% + set_colnames(c("x", "y")) %>% + mutate(a = a, g = g, degree = colSums(wiki)) + })) +} -## ----MNIST,echo=F,message=F,warning=F,results='hide',eval=F-------------- -# darch::provideMNIST(download=T) -# load("data/train.RData") -# -# mnistCoords <- vis(t(trainData) - 0.5, K = 40, tree_threshold = 700, -# n_trees = 40, max_iter = 2, verbose=F) -# mnistCoords <- mnistCoords$coords -# mnistCoords <- scale(t(mnistCoords)) -# mnistCoords <- data.frame(mnistCoords) -# colnames(mnistCoords) <- c("x", "y") -# labs <- apply(trainLabels, MARGIN=1, FUN=function(x) which(x == 1)) -# mnistCoords$labels <- factor(labs - 1) - -## ----drawmnist,echo=F,warning=F,fig.width=3.5,fig.height=4,fig.align='center',fig.show='hold'---- -load(system.file("extdata", "mnistcoords.Rda", package="largeVis")) -ggplot(mnistCoords, aes(x = x, y = y, color = labels)) + - geom_point(size = 0.1, alpha = 0.3) + - 
scale_x_continuous(name = "", limits = c(-2.5, 2), breaks = NULL) + - scale_y_continuous(name = "", limits = c(-2, 2.5), breaks = NULL) + - scale_color_manual(values = colors_divergent_discrete(10)(10)) + - guides(colour = guide_legend(override.aes = list(size=5))) + - ggtitle("MNIST") - -## ----ldafromldavis,echo=F,eval=F----------------------------------------- -# library(LDAvis) -# data("TwentyNewsgroups") -# theta <- scale(t(TwentyNewsgroups$theta)) -# visObj <- vis(theta, K = 100, n_trees = 20, tree_threshold = 100, -# max_iter = 2) -# -# ngcoords <- scale(t(visObj$coords)) -# ngcoords <- data.frame(ngcoords) -# colnames(ngcoords) <- c("x", "y") -# library(lda) -# data("newsgroup.train.labels") -# ngcoords$label <- factor(newsgroup.train.labels)[-1] - -## ----draw20ng,fig.align='center',echo=F,fig.width=3.5,fig.height=4,eval=T,warning=FALSE,error=FALSE,message=FALSE,fig.show='hold'---- -load(system.file("extdata", "ngcoords.Rda", package="largeVis")) -ggplot(ngcoords, - aes(x = x, y = y, color = label)) + - geom_point(size = 0.4, alpha = 0.5) + - scale_color_manual(values = colors_divergent_discrete(20)(20), - guide=FALSE) + - scale_x_continuous(name = "", limits = c(-2, 2.5), breaks = NULL) + - scale_y_continuous(name = "", limits = c(-2, 2.5), breaks = NULL) + - ggtitle("20 Newsgroups") - -## ----3draw,webgl=TRUE,echo=F,eval=F,results='asis'----------------------- -# # d3coords <- projectKNNs(visObj$wij, dim = 3) -# # d3coords <- data.frame(scale(t(d3coords))) -# # colnames(d3coords) <- c("x", "y", "z") -# # d3coords$label <- factor(newsgroup.train.labels)[-1] -# # library(threejs) -# # rgl::plot3d(x = d3coords[,1], -# # y = d3coords[,2], -# # z = d3coords[,3], -# # main = "20 Newsgroups", -# # type = "p", -# # col = c(newsgroup.train.labels, -# # newsgroup.test.labels)) - -## ----performance,echo=F,eval=F------------------------------------------- -# benchmark <- readr::read_csv(system.file("extdata", "results.csv", package="largeVis")) -# 
colnames(benchmark) <- c("time", -# "precision", -# "n_trees", -# "max_iters", -# "threshold") -# benchmark$series <- factor(paste(benchmark$n_trees, "trees,", -# benchmark$max_iters, "iterations.")) - -## ----plotpeformance,echo=F,fig.width=3.5,fig.height=4,fig.align='center'---- -load(system.file("extdata", "benchmark.Rda", package = "largeVis")) -ggplot(benchmark, aes(x = time, y = precision / 100, - group = series, color = series, - shape = series, - label =threshold)) + - geom_point(size = 1) + geom_line(size = 0.5) + - geom_text(vjust = 1, hjust = -0.1, size = 2.5) + - scale_x_continuous("Time (relative)") + - scale_y_log10("Precision", limits = c(0.1,1), - breaks = c(.1, .25, .5, .8, .9, .99)) + - scale_color_manual(values = colors_divergent_discrete(nlevels(benchmark$series))(nlevels(benchmark$series))) + - guides(color = guide_legend(nrow=3)) + - ggtitle(expression( - atop("Time vs. Precision (K = 1000)", - atop(italic("Labelled by Tree Threshold")) - ) - )) - -## ----wikihyperparameters,echo=F,eval=F----------------------------------- -# data(wiki) -# -# inputs <- data.frame( -# g = rep(c(.5,1,7,14), 4), -# a = rep(c(.1,1,5,10), each = 4) -# ) -# -# agcoords <- do.call(rbind, lapply(1:nrow(inputs), FUN = function(x) { -# a <- inputs[x, 'a'] -# g <- inputs[x, 'g'] -# localcoords <- projectKNNs(wiki, alpha = a, gamma = g,verbose=FALSE) -# localcoords <- data.frame(scale(t(localcoords))) -# colnames(localcoords) <- c("x", "y") -# localcoords$a <- a -# localcoords$g <- g -# localcoords$activity <- log(Matrix::colSums(wiki)) -# localcoords -# })) - -## ----drawhyperparameters,echo=F,fig.width=3.5,fig.height=4,fig.align='center'---- -load(system.file("extdata", "agcoords.Rda", package="largeVis")) ggplot(agcoords, - aes(x = x, y = y, color = activity)) + - geom_point(alpha = 0.2, size = 0.05) + + aes(x = x, + y = y, + color = degree)) + + geom_point(alpha = 0.2, + size = 0.05) + facet_grid(a ~ g, - labeller = label_bquote(alpha == .(a), gamma == .(g)), + 
labeller = label_bquote(alpha == .(a), + gamma == .(g)), scales = 'free') + - scale_x_continuous(breaks=NULL,name="") + - scale_y_continuous(breaks=NULL,name = "") + - scale_color_gradientn(colors = colors_continuous(10), guide=FALSE) + - ggtitle(expression(paste("Effect of", alpha, "vs.", gamma, sep = " "))) + scale_x_continuous(breaks = NULL, + name = "") + + scale_y_continuous(breaks = NULL, + name = "") + + scale_color_gradientn(colors = colors_continuous(10), + guide=FALSE) + + ggtitle(expression(paste("Effect of ", alpha, " vs. ", gamma, sep = " "))) + +## ----drawiris,echo=F,fig.width=4,fig.height=4.5,fig.align='center',results='asis'---- +if (!exists("iriscoords")) { + data(iris) + Ks <- c(5, 10,20,30) + Ms <- c(5, 10, 20) + dat <- iris[,1:4] + dupes <- duplicated(dat) + dat <- dat[-dupes,] + labels <- iris$Species[-dupes] + dat <- as.matrix(dat) + dat <- t(dat) + + set.seed(1974) + coordsinput <- matrix(rnorm(ncol(dat) * 2), nrow = 2) + + iriscoords <- do.call(rbind, lapply(Ks, FUN = function(K) { + neighbors <- randomProjectionTreeSearch(dat, + K = K, + verbose = FALSE) + edges <- buildEdgeMatrix(dat, neighbors, verbose = FALSE) + wij <- buildWijMatrix(edges) + do.call(rbind, lapply(Ms, FUN = function(M) { + coords <- projectKNNs(wij = wij, M = M, + coords = coordsinput, + verbose = TRUE, + sgd_batches = 2000000) + coords <- scale(t(coords)) + coords <- data.frame(coords) + colnames(coords) <- c("x", "y") + coords$K <- K + coords$M <- M + coords$rebuild <- 'no' + coords$Species <- as.integer(labels) + coords + })) + })) + iriscoords$Species <- factor(iriscoords$Species) + levels(iriscoords$Species) <- levels(iris$Species) +} -## ----iris,echo=F,fig.width=5,fig.height=5,eval=F------------------------- -# data(iris) -# Ks <- c(5, 10, 20, 40) -# Ms <- c(1, 5, 10, 20) -# data(iris) -# dat <- iris[,1:4] -# dupes <- duplicated(dat) -# dat <- dat[-dupes,] -# labels <- iris$Species[-dupes] -# dat <- scale(dat) -# dat <- as.matrix(dat) -# dat <- t(dat) -# -# 
inputs <- data.frame( -# K = rep(Ks, length(Ms)), -# M = rep(Ms, each = length(Ks)) -# ) -# iriscoords <- do.call(rbind, lapply(1:nrow(inputs), FUN = function(x) { -# K <- inputs[x, 'K'] -# M <- inputs[x, 'M'] -# visO <- vis(dat, K = K, M = M, verbose=FALSE) -# localcoords <- data.frame(scale(t(visO$coords))) -# colnames(localcoords) <- c("x", "y") -# localcoords$K <- K -# localcoords$M <- M -# localcoords$Species <- as.integer(labels) -# localcoords -# })) -# iriscoords$Species <- factor(iriscoords$Species) -# levels(iriscoords$Species) <- levels(iris$Species) - -## ----drawiriscoords,echo=F,fig.width=4,fig.height=4.5,fig.align='center'---- -load(system.file("extdata", "iriscoords.Rda", package="largeVis")) ggplot(iriscoords, aes(x = x, y = y, - color =Species)) + + color = Species)) + geom_point(size = 0.5) + - scale_x_continuous("", breaks = NULL) + - scale_y_continuous("", breaks = NULL) + - facet_grid(K ~ M, scales = 'free', labeller = label_bquote(K == .(K), M == .(M))) + + scale_x_continuous("", + breaks = NULL) + + scale_y_continuous("", + breaks = NULL) + + facet_grid(K ~ M, + scales = 'free', + labeller = label_bquote(K == .(K), M == .(M))) + scale_color_manual(values = colors_discrete(3)) + ggtitle("Effect of M and K on Iris Dataset") -## ----loadmnistimages,eval=F,echo=F--------------------------------------- -# load("data/train.RData") - -## ----drawmanifoldmap,echo=T,fig.width=8,fig.height=8,message=F,warning=F,fig.align='center'---- -if (exists("trainData")) { - dim(trainData) <- c(60000, 28, 28) - manifoldMap(mnistCoords[,1:2], - n = 5000, - scale = 0.003, - transparency = F, - images = trainData, - xlab="", ylab="", - xlim = c(-2, 2), - ylim = c(-2, 2)) -} - -## ----tdm,echo=F,eval=F--------------------------------------------------- -# library(stm) -# data("poliblog5k") -# p <- c(0, cumsum(as.numeric(lapply(poliblog5k.docs, function(x) ncol(x))))) -# i <- do.call("c", lapply(poliblog5k.docs, function(x) x[1,])) -# p[length(p)] <- length(i) -# j <- 
rep(0:(length(diff(p)) - 1), diff(p)) -# v <- do.call("c", lapply(poliblog5k.docs, function(x) x[2,])) -# poli <- Matrix::sparseMatrix(i = i + 1, j = j + 1, x = v) -# dupes <- duplicated(slam::as.simple_triplet_matrix(Matrix::t(poli))) -# poli <- poli[, ! dupes] -# poli <- poli / log(Matrix::rowSums(poli > 0)) # tf-idf weight -# policoords <- vis(poli, K = 100, n_trees = 20, -# tree_threshold = 100, max_iter = 10, -# M=10,gamma=15, -# distance_method = 'Cosine',verbose=F) -# polidata <- data.frame(scale(t(policoords$coords))) -# colnames(polidata) <- c('x', 'y') -# polidata$rating <- poliblog5k.meta$rating[!dupes] -# polidata$blog <- poliblog5k.meta$blog[!dupes] - -## ----drawtdm,echo=F,fig.height=4,fig.width=7----------------------------- -load(system.file("extdata", "polidata.Rda", package="largeVis")) -ggplot(polidata, aes(x = x, y = y, color = blog)) + - geom_point(size = 0.3, alpha = 0.8) + - scale_color_manual(values = colors_divergent_discrete(6)(6)) + - facet_grid(. ~ rating, scale = 'free') + - scale_x_continuous("", breaks = NULL) + - scale_y_continuous("", breaks = NULL) + - ggtitle("Visualization of a tf-idf Matrix") - -## ----eval=F,echo=T------------------------------------------------------- +## ----echomanifold,echo=T,eval=F------------------------------------------ +# dim(trainData) <- c(60000, 28, 28) +# aperm(trainData, perm = c(1,3,2), resize = FALSE) +# set.seed(1974) +# manifoldMap(mnistCoords[,1:2], +# n = 5000, +# scale = 0.1, +# images = trainData, +# xlab = "", +# ylab = "") + +## ----youtube,eval=F,echo=T----------------------------------------------- +# youtube <- readr::read_tsv(pathToGraphFile, skip=4, col_names=FALSE) +# youtube <- as.matrix(youtube) +# youtube <- Matrix::sparseMatrix(i = youtube[, 1], +# j = youtube[, 2], +# x = rep(1, nrow(youtube)), +# dims = c(max(youtube), max(youtube))) +# youtube <- youtube + t(youtube) +# communities <- readr::read_lines(pathToCommunities) +# communities <- lapply(communities, +# FUN = 
function(x) as.numeric(unlist(strsplit(x, "\t")))) +# community_assignments <- rep(0, +# nrow(youtube)) +# for (i in 1:length(communities)) community_assignments[communities[[i]]] <- i +# +# wij <- buildWijMatrix(youtube) +# youTube_coordinates <- projectKNNs(youtube) +# youTube_coordinates <- data.frame(scale(t(youTube_coordinates))) +# colnames(youTube_coordinates) <- c("x", "y") +# youTube_coordinates$community <- factor(community_assignments) +# youTube_coordinates$alpha <- factor(ifelse(youTube_coordinates$community == 0, 0.05, 0.2)) +# ggplot(youTube_coordinates, aes( x = x, +# y = y, +# color = community, +# alpha = alpha, +# size = alpha)) + +# geom_point() + +# scale_color_manual(values = +# c("black", colors_continuous(5000)), +# guide = FALSE) + +# scale_alpha_manual(values = c(0.005, 0.2), guide = FALSE) + +# scale_size_manual(values = c(0.03, 0.15), guide = FALSE) + +# scale_x_continuous("", +# breaks = NULL, limits = c(-2.5,2.5)) + +# scale_y_continuous("", +# breaks = NULL, limits = c(-2.5,2.5)) + +# ggtitle("YouTube Communities") + +## ----lowmemexample,eval=F,echo=T----------------------------------------- # neighbors <- randomProjectionTreeSearch(largeDataset) -# neighborIndices <- neighborsToVectors(neighbors) +# edges <- buildEdgeMatrix(data = largeDataset, neighbors = neighbors) # rm(neighbors) -# distances <- distance(neighborIndices$i, -# neighborIndices$j, -# largeDataset) -# rm(largeDataset) -# wij <- buildEdgeMatrix(i = neighborIndices$i, -# j = neighborIndices$j, -# d = distances) -# rm(distances, neighborIndices) -# coords <- projectKNNs(wij$wij) +# gc() +# wij <- buildWijMaatrix(edges) +# rm(edges) +# gc() +# coords <- projectKNNs(wij) + +## ----save,eval=rebuild--------------------------------------------------- +# save(agcoords, iriscoords, file = "vignettedata/vignettedata.Rda") diff --git a/inst/doc/largeVis.Rmd b/inst/doc/largeVis.Rmd index cf6ad74..52c3704 100644 --- a/inst/doc/largeVis.Rmd +++ b/inst/doc/largeVis.Rmd @@ -5,54 
+5,79 @@ date: '`r Sys.Date()`' output: rmarkdown::html_vignette: fig_caption: yes - rmarkdown::github_document: - dev: png bibliography: TangLZM16.bib -vignette: > - %\VignetteIndexEntry{largeVis} - %\VignetteEngine{knitr::rmarkdown} +vignette: | + %\VignetteIndexEntry{largeVis} + %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- -```{r setup,eval=T,echo=F,warning=F,error=F,message=F} -# Note to reader: Please don't steal the semi-distinctive visual style I spent several minutes creating for myself. -library(RColorBrewer,quietly=T) -library(wesanderson,quietly=T) -colors_discrete <- function(x) rep(wes_palette("Darjeeling", n = min(x,5)), - 2)[1:x] -colors_divergent_discrete <- function(x) grDevices::colorRampPalette(RColorBrewer::brewer.pal(x, "Spectral")) -colors_continuous <- function(x) wes_palette(name= "Zissou",n = x, type= "continuous") +```{r setupvignette,eval=T,echo=F,warning=F,error=F,message=F} +require(ggplot2, + quietly = TRUE) +require(RColorBrewer, + quietly = TRUE) +require(wesanderson, + quietly = TRUE) +knitr::opts_chunk$set(collapse = TRUE, + comment = "#>", + cache=FALSE) +colors_discrete <- function(x) + rep(wes_palette("Darjeeling", n = min(x, 5)), 2)[1:x] +colors_divergent_discrete <- function(x) + grDevices::colorRampPalette(RColorBrewer::brewer.pal(x, "Spectral")) +colors_continuous <- function(x) wes_palette(name = "Zissou", + n = x, + type = "continuous") nacol <- colors_discrete(4)[4] -require(ggplot2,quietly = T) theme_set( theme_bw() %+replace% theme( - legend.key.size=unit(4,"mm"), - legend.title=element_text(size=rel(0.8), face = "bold"), - legend.margin=unit(0,"cm"), - legend.position="bottom", - legend.key.size=unit(0.5,"lines"), + legend.key.size = unit(4, "mm"), + legend.title = element_text(size = rel(0.8), + face = "bold"), + legend.margin = unit(0, "cm"), + legend.position = "bottom", + legend.key.size = unit(0.5, "lines"), legend.text=element_text(size = unit(8, "points")), - axis.title.y = 
element_text(angle=90), - axis.text = element_text(size=rel(0.7)), + axis.title.y = element_text(angle = 90), + axis.text = element_text(size = rel(0.7)), plot.margin = unit(c(0, 0.5, 1, 0), "lines"), - axis.title=element_text(size=rel(0.8),face="bold"), - title = element_text(size=rel(0.9)) - ) + axis.title = element_text(size = rel(0.8), + face = "bold"), + title = element_text(size = rel(0.9)) + ) ) -require(largeVis) +rebuild <- FALSE + +require(largeVis,quietly = TRUE) ``` This Vingette provides an overview of the largeVis package. ## Introduction -The `largeVis` package offers four functions for visualizing high-dimensional datasets and finding approximate nearest neighbors, based on the `LargeVis` algorithm presented in @TangLZM16: +This package provides `LargeVis` visualizations and fast nearest-neighbor search. The `LargeVis` algorithm, presented in @tang2016visualizing, creates high-quality low-dimensional representaitons of large, high-dimensional datasets, similar to [t-SNE](https://lvdmaaten.github.io/tsne/). + +These visualizations are useful for data exploration, for visualizing complex non-linear functions, and especially for visualizing embeddings such as learned vectors for images. + +A limitation of t-SNE is that because the algorithm has complexity order $O(n^2)$, it is not feasible for use on even moderately sized datasets. [Barnes-Hut](https://arxiv.org/pdf/1301.3342.pdf), an approximation of t-SNE, has complexity $O(n \log n)$ but also quickly becomes infeasible as the size of data grows. `LargeVis` is intended to address the issue by operating in linear $O(n)$ time. It has been benchmarked at more than 30x faster than Barnes-Hut on datasets of approximately 1-million rows, and scaled linearly as long as there is sufficient RAM. + +In addition, `LargeVis` includes an algorithm for finding approximate k-Nearest Neighbors in $O(n)$ time. This algorithm turns out to be faster at finding accurate a-NNs than any other method I was able to test. 
+ +The package also includes a function for visualizing image embeddings by plotting images at the locations given by the `LargeVis` algorithm. + +For a detailed description of the algorithm, please see the original paper, @tang2016visualizing. + +## Package Overview + +The `largeVis` package offers five functions for visualizing high-dimensional datasets and finding approximate nearest neighbors (along with some helper functions): 1. `randomProjectionTreeSearch`, a method for finding approximate nearest neighbors. 2. `projectKNNs`, which takes as input a weighted nearest-neighbor graph and estimates a projection into a low-dimensional space. -3. `vis`, which combines `randomProjectionTreeSearch`, `buildEdgeMatrix`, and `projectKNNs`, along with additional code to implement the `LargeVis` algorithm. -4. `manifoldMap`, which produces a plot for visualizing embeddings of images. +3. `largeVis`, which implements the entire `LargeVis` algorithm. +4. `manifoldMap` (and companion `ggManifoldMap`), which produce a plot for visualizing embeddings of images. +5. `buildWijMatrix` takes a sparse matrix of the distances between nearest neighbors, and returns one with the edges properly weighted for use in `projectKNNs`. See the [original paper](https://arxiv.org/abs/1602.00370) for a detailed description of the algorithm. @@ -64,302 +89,241 @@ If there are NA's, Infs, or NULLs in the input, `randomProjectionTreeSearch` wil If the numerical range covered by the data is large, this can cause errors in or before the `buildEdgeMatrix` function. This is because the algorithm requires calculating $\exp(||\vec{x_i}, \vec{x_j}||^2)$ in the high-dimensional space, which will overflow if the distance between any nearest neighbors exceeds about 26. -If there are duplicates in the input data, while the implementation tries to filter duplicates, it is likely to lead to problems. If the number of duplicates is large, this can cause the random projection tree search to fail. 
If the number is small, the algorithm may identify a sufficient number of neighbors, but an error may then occur during `buildEdgeMatrix`, or stochastic gradient descent. - -## Examples - -```{r MNIST,echo=F,message=F,warning=F,results='hide',eval=F} -darch::provideMNIST(download=T) -load("data/train.RData") - -mnistCoords <- vis(t(trainData) - 0.5, K = 40, tree_threshold = 700, - n_trees = 40, max_iter = 2, verbose=F) -mnistCoords <- mnistCoords$coords -mnistCoords <- scale(t(mnistCoords)) -mnistCoords <- data.frame(mnistCoords) -colnames(mnistCoords) <- c("x", "y") -labs <- apply(trainLabels, MARGIN=1, FUN=function(x) which(x == 1)) -mnistCoords$labels <- factor(labs - 1) -``` - -```{r drawmnist,echo=F,warning=F,fig.width=3.5,fig.height=4,fig.align='center',fig.show='hold'} -load(system.file("extdata", "mnistcoords.Rda", package="largeVis")) -ggplot(mnistCoords, aes(x = x, y = y, color = labels)) + - geom_point(size = 0.1, alpha = 0.3) + - scale_x_continuous(name = "", limits = c(-2.5, 2), breaks = NULL) + - scale_y_continuous(name = "", limits = c(-2, 2.5), breaks = NULL) + - scale_color_manual(values = colors_divergent_discrete(10)(10)) + - guides(colour = guide_legend(override.aes = list(size=5))) + - ggtitle("MNIST") -``` - -```{r ldafromldavis,echo=F,eval=F} -library(LDAvis) -data("TwentyNewsgroups") -theta <- scale(t(TwentyNewsgroups$theta)) -visObj <- vis(theta, K = 100, n_trees = 20, tree_threshold = 100, - max_iter = 2) - -ngcoords <- scale(t(visObj$coords)) -ngcoords <- data.frame(ngcoords) -colnames(ngcoords) <- c("x", "y") -library(lda) -data("newsgroup.train.labels") -ngcoords$label <- factor(newsgroup.train.labels)[-1] -``` -```{r draw20ng,fig.align='center',echo=F,fig.width=3.5,fig.height=4,eval=T,warning=FALSE,error=FALSE,message=FALSE,fig.show='hold'} -load(system.file("extdata", "ngcoords.Rda", package="largeVis")) -ggplot(ngcoords, - aes(x = x, y = y, color = label)) + - geom_point(size = 0.4, alpha = 0.5) + - scale_color_manual(values = 
colors_divergent_discrete(20)(20), - guide=FALSE) + - scale_x_continuous(name = "", limits = c(-2, 2.5), breaks = NULL) + - scale_y_continuous(name = "", limits = c(-2, 2.5), breaks = NULL) + - ggtitle("20 Newsgroups") -``` - -```{r 3draw,webgl=TRUE,echo=F,eval=F,results='asis'} -# d3coords <- projectKNNs(visObj$wij, dim = 3) -# d3coords <- data.frame(scale(t(d3coords))) -# colnames(d3coords) <- c("x", "y", "z") -# d3coords$label <- factor(newsgroup.train.labels)[-1] -# library(threejs) -# rgl::plot3d(x = d3coords[,1], -# y = d3coords[,2], -# z = d3coords[,3], -# main = "20 Newsgroups", -# type = "p", -# col = c(newsgroup.train.labels, -# newsgroup.test.labels)) -``` +Duplicates in the input data are likely to cause issues. If the number of duplicates is large, this can cause the random projection tree search to fail. If the number is small, the algorithm may identify a sufficient number of neighbors, but an error may then occur during `buildEdgeMatrix`, or stochastic gradient descent. ## Overview of Functions and Hyperparameters ### `randomProjectionTreeSearch` -This function uses a two-phase algorithm to find approximate nearest neighbors. In the first phase, the algorithm creates `n_trees` binary trees dividing the space into leaves of at most `tree_threshold` nodes. A node's candidate nearest neighbors are the union of all nodes with which it shared a leaf on any of the trees. In the second phase, for each node, the algorithm looks at the candidate nearest neighbors for that node, as well as each of those nodes' candidate nearest neighbors. The logic of the algorithm is that a node's neighbors' neighbors are likely to be the node's own neighbors. In each iteration, the closest `K` candidate neighbors for each node are kept. +This function uses a two-phase algorithm to find approximate nearest neighbors. 
In the first phase, which is based on [Erik Bernhardsson](http://erikbern.com)'s [Annoy](https://github.com/spotify/annoy) algorithm, `n_trees` trees are formed by recursively dividing the space by hyperplanes until at most `tree_threshold` nodes remain in a branch. A node's candidate nearest neighbors are the union of all nodes with which it shared a leaf on any of the trees. The `largeVis` algorithm adds a second phase, neighborhood exploration, which considers, for each node, whether the candidate neighbors of the node's candidate immediate neighbors are closer. The logic of the algorithm is that a node's neighbors' neighbors are likely to be the node's own neighbors. In each iteration, the closest `K` candidate neighbors for each node are kept. -The authors of @TangLZM16 suggest that a single iteration of the second phase is generally sufficient to obtain satisfactory performance. +(Note that this implementation of `largeVis` differs from the approach taken by `Annoy`, in that `Annoy` always uses the number of features as the leaf threshold, where `largeVis` allows this to be an adjustable parameter.) -The chart below illlustrates the trade-off between performance and accuracy for the nearest-neighbor search, using various hyperparameters. The data was produced using the `benchmark.R` script in the `inst/` directory. The test data is the 1-million vector, 128-feature [SIFT Dataset](http://corpus-texmex.irisa.fr/), as per Erik Bernhardsson's [ANN Benchmark](https://github.com/erikbern/ann-benchmarks) github. +The authors of @tang2016visualizing suggest that a single iteration of the second phase is generally sufficient to obtain satisfactory performance. 
-```{r performance,echo=F,eval=F} -benchmark <- readr::read_csv(system.file("extdata", "results.csv", package="largeVis")) -colnames(benchmark) <- c("time", - "precision", - "n_trees", - "max_iters", - "threshold") -benchmark$series <- factor(paste(benchmark$n_trees, "trees,", - benchmark$max_iters, "iterations.")) -``` -```{r plotpeformance,echo=F,fig.width=3.5,fig.height=4,fig.align='center'} -load(system.file("extdata", "benchmark.Rda", package = "largeVis")) -ggplot(benchmark, aes(x = time, y = precision / 100, - group = series, color = series, - shape = series, - label =threshold)) + - geom_point(size = 1) + geom_line(size = 0.5) + - geom_text(vjust = 1, hjust = -0.1, size = 2.5) + - scale_x_continuous("Time (relative)") + - scale_y_log10("Precision", limits = c(0.1,1), - breaks = c(.1, .25, .5, .8, .9, .99)) + - scale_color_manual(values = colors_divergent_discrete(nlevels(benchmark$series))(nlevels(benchmark$series))) + - guides(color = guide_legend(nrow=3)) + - ggtitle(expression( - atop("Time vs. Precision (K = 1000)", - atop(italic("Labelled by Tree Threshold")) - ) - )) -``` - -If `randomProjectionTreeSearch` fails to find the desired number of neighbors, usually the best result is obtained by increasing the tree threshold. If `randomProjectionTreeSearch` fails with an error that no neighbors were found for some nodes, and the tree threshold is already reasonable, this may be an indication that duplicates remain in the input data. +See the vignette "ANN Benchmarks" for additional information. ### `projectKNNs` This function takes as its input a `Matrix::sparseMatrix`, of connections between nodes. The matrix must be symmetric. A non-zero cell implies that node `i` is a nearest neighbor of node `j`, vice-versa, or both. Non-zero values represent the strength of the connection relative to other nearest neighbors of the two nodes. 
-The `LargeVis` algorithm, explained in detail in @TangLZM16, estimates the embedding by sampling from the identitied nearest-neighbor connections. For each edge, the algorithm also samples `M` non-nearest neighbor negative samples. `M`, along with $\gamma$ and $\alpha$, control the visualization. $\alpha$ controls the desired distance between nearest neighbors. $\gamma$ controls the relative strength of the attractive force between nearest neighbors and repulsive force between non-neighbors. - -The following grid illustrates the effect of the $\alpha$ and $\gamma$ hyperparameters, using the `wiki` dataset which is included with the package: - -```{r wikihyperparameters,echo=F,eval=F} -data(wiki) +The `LargeVis` algorithm, explained in detail in @tang2016visualizing, estimates the embedding by sampling from the identified nearest-neighbor connections. For each edge, the algorithm also samples `M` non-nearest neighbor negative samples. `M`, along with $\gamma$ and $\alpha$, control the visualization. $\alpha$ controls the desired distance between nearest neighbors. $\gamma$ controls the relative strength of the attractive force between nearest neighbors and repulsive force between non-neighbors. 
-inputs <- data.frame( - g = rep(c(.5,1,7,14), 4), - a = rep(c(.1,1,5,10), each = 4) -) +The following grid illustrates the effect of the $\alpha$ and $\gamma$ hyperparameters: -agcoords <- do.call(rbind, lapply(1:nrow(inputs), FUN = function(x) { - a <- inputs[x, 'a'] - g <- inputs[x, 'g'] - localcoords <- projectKNNs(wiki, alpha = a, gamma = g,verbose=FALSE) - localcoords <- data.frame(scale(t(localcoords))) - colnames(localcoords) <- c("x", "y") - localcoords$a <- a - localcoords$g <- g - localcoords$activity <- log(Matrix::colSums(wiki)) - localcoords -})) +```{r reload,eval=!rebuild} +load(system.file(package = "largeVis", "extdata/vignettedata.Rda")) ``` -```{r drawhyperparameters,echo=F,fig.width=3.5,fig.height=4,fig.align='center'} -load(system.file("extdata", "agcoords.Rda", package="largeVis")) +```{r drawhyperparameters,echo=F,fig.width=3.5,fig.height=4,fig.align='center',results='asis',cache=FALSE} +if (! exists("agcoords")) { + data(wiki) + inputs <- data.frame( + g = rep(c(.5,1,7,14), 5), + a = rep(c(0,.1,1,5,10), each = 4) + ) + wij <- buildWijMatrix(wiki) + set.seed(1974) + initialcoords <- matrix(rnorm(ncol(wij) * 2), nrow = 2) + + agcoords <- do.call(rbind, + lapply(1:nrow(inputs), + FUN = function(x) { + a <- inputs[x, 'a'] + g <- inputs[x, 'g'] + newcoords <- initialcoords + projectKNNs(wij, alpha = a, + gamma = g, + verbose = FALSE, + coords = newcoords) %>% + t() %>% + scale() %>% + data.frame() %>% + set_colnames(c("x", "y")) %>% + mutate(a = a, g = g, degree = colSums(wiki)) + })) +} + ggplot(agcoords, - aes(x = x, y = y, color = activity)) + - geom_point(alpha = 0.2, size = 0.05) + + aes(x = x, + y = y, + color = degree)) + + geom_point(alpha = 0.2, + size = 0.05) + facet_grid(a ~ g, - labeller = label_bquote(alpha == .(a), gamma == .(g)), + labeller = label_bquote(alpha == .(a), + gamma == .(g)), scales = 'free') + - scale_x_continuous(breaks=NULL,name="") + - scale_y_continuous(breaks=NULL,name = "") + - scale_color_gradientn(colors = 
colors_continuous(10), guide=FALSE) + - ggtitle(expression(paste("Effect of", alpha, "vs.", gamma, sep = " "))) + scale_x_continuous(breaks = NULL, + name = "") + + scale_y_continuous(breaks = NULL, + name = "") + + scale_color_gradientn(colors = colors_continuous(10), + guide=FALSE) + + ggtitle(expression(paste("Effect of ", alpha, " vs. ", gamma, sep = " "))) ``` The additional hyperparameters $\rho$ and `min-`$\rho$ control the starting and final learning rate for the stochastic gradient descent process. -The algorithm can treat positive edge weights in two different ways. The authors of @TangLZM16 suggest that edge weights should be used to generate a weighted sampling. However, the algorithm for taking a weighted sample runs in $O(n \log n)$. Alternatively, the edge-weights can be applied to the gradients. This is controlled by the `weight_pos_samples` parameter. +The algorithm can treat positive edge weights in two different ways. The authors of @tang2016visualizing suggest that edge weights should be used to generate a weighted sampling. However, the algorithm for taking a weighted sample runs in $O(n \log n)$. Alternatively, the edge-weights can be applied to the gradients. This is controlled by the `weight_pos_samples` parameter. ### `vis` The `vis` function combines `randomProjectionTreeSearch` and `projectKNNs`, along with additional logic for calculating edge weights, to implement the complete `LargeVis` algorithm. -The following chart illustrates the effect of the `M` and `K` parameters, using the `iris` dataset. 
- -```{r iris,echo=F,fig.width=5,fig.height=5,eval=F} -data(iris) -Ks <- c(5, 10, 20, 40) -Ms <- c(1, 5, 10, 20) -data(iris) -dat <- iris[,1:4] -dupes <- duplicated(dat) -dat <- dat[-dupes,] -labels <- iris$Species[-dupes] -dat <- scale(dat) -dat <- as.matrix(dat) -dat <- t(dat) - -inputs <- data.frame( - K = rep(Ks, length(Ms)), - M = rep(Ms, each = length(Ks)) -) -iriscoords <- do.call(rbind, lapply(1:nrow(inputs), FUN = function(x) { - K <- inputs[x, 'K'] - M <- inputs[x, 'M'] - visO <- vis(dat, K = K, M = M, verbose=FALSE) - localcoords <- data.frame(scale(t(visO$coords))) - colnames(localcoords) <- c("x", "y") - localcoords$K <- K - localcoords$M <- M - localcoords$Species <- as.integer(labels) - localcoords +The following chart illustrates the effect of the `M` and `K` parameters, using the `iris` dataset. Each row re-uses the same set of identified `K` neighbors, and initial coordinates. + +```{r drawiris,echo=F,fig.width=4,fig.height=4.5,fig.align='center',results='asis'} +if (!exists("iriscoords")) { + data(iris) + Ks <- c(5, 10,20,30) + Ms <- c(5, 10, 20) + dat <- iris[,1:4] + dupes <- duplicated(dat) + dat <- dat[!dupes,] + labels <- iris$Species[!dupes] + dat <- as.matrix(dat) + dat <- t(dat) + + set.seed(1974) + coordsinput <- matrix(rnorm(ncol(dat) * 2), nrow = 2) + + iriscoords <- do.call(rbind, lapply(Ks, FUN = function(K) { + neighbors <- randomProjectionTreeSearch(dat, + K = K, + verbose = FALSE) + edges <- buildEdgeMatrix(dat, neighbors, verbose = FALSE) + wij <- buildWijMatrix(edges) + do.call(rbind, lapply(Ms, FUN = function(M) { + coords <- projectKNNs(wij = wij, M = M, + coords = coordsinput, + verbose = TRUE, + sgd_batches = 2000000) + coords <- scale(t(coords)) + coords <- data.frame(coords) + colnames(coords) <- c("x", "y") + coords$K <- K + coords$M <- M + coords$rebuild <- 'no' + coords$Species <- as.integer(labels) + coords + })) })) -iriscoords$Species <- factor(iriscoords$Species) -levels(iriscoords$Species) <- levels(iris$Species) 
-``` -```{r drawiriscoords,echo=F,fig.width=4,fig.height=4.5,fig.align='center'} -load(system.file("extdata", "iriscoords.Rda", package="largeVis")) + iriscoords$Species <- factor(iriscoords$Species) + levels(iriscoords$Species) <- levels(iris$Species) +} + ggplot(iriscoords, aes(x = x, y = y, - color =Species)) + + color = Species)) + geom_point(size = 0.5) + - scale_x_continuous("", breaks = NULL) + - scale_y_continuous("", breaks = NULL) + - facet_grid(K ~ M, scales = 'free', labeller = label_bquote(K == .(K), M == .(M))) + + scale_x_continuous("", + breaks = NULL) + + scale_y_continuous("", + breaks = NULL) + + facet_grid(K ~ M, + scales = 'free', + labeller = label_bquote(K == .(K), M == .(M))) + scale_color_manual(values = colors_discrete(3)) + ggtitle("Effect of M and K on Iris Dataset") ``` ### `manifoldMap` -The `manifoldMap` function is useful when the examples being clustered are themselves images. Given a coordinate matrix (as generated by `projectKNNs` or `vis`) and an `array` of `N` images, the function samples `n` images and plots them at the coordinates given in the matrix. If the `transparency` parameter is a number between 0 and 1, then the function adds to each image an alpha channel where the value per pixel is proportional to $transparency *$ the image content. +The `manifoldMap` function is useful when the examples being clustered are themselves images. Given a coordinate matrix (as generated by `projectKNNs` or `vis`) and an `array` of `N` images, the function samples `n` images and plots them at the coordinates given in the matrix. -The function can plot both color and greyscale images. 
+The following code will generate the visualization shown in the examples: -The following code will plot 5000 images sampled from the MNIST dataset at positions generated by `vis`: -```{r loadmnistimages,eval=F,echo=F} -load("data/train.RData") -``` -```{r drawmanifoldmap,echo=T,fig.width=8,fig.height=8,message=F,warning=F,fig.align='center'} -if (exists("trainData")) { - dim(trainData) <- c(60000, 28, 28) - manifoldMap(mnistCoords[,1:2], - n = 5000, - scale = 0.003, - transparency = F, - images = trainData, - xlab="", ylab="", - xlim = c(-2, 2), - ylim = c(-2, 2)) -} +```{r echomanifold,echo=T,eval=F} +dim(trainData) <- c(60000, 28, 28) +trainData <- aperm(trainData, perm = c(1,3,2), resize = FALSE) +set.seed(1974) +manifoldMap(mnistCoords[,1:2], + n = 5000, + scale = 0.1, + images = trainData, + xlab = "", + ylab = "") ``` -The code is disabled by default in this vignette for data size reasons. - ## Support for Sparse Matrices -`largeVis` supports sparse matrices. Besides facilitating very large datasets, this makes it practicable to visualize term-document-matrices. - -For example, the following plot visualizes a tf-idf weighted document-term matrix for a corpus of 5000 political blog entries, as included with the `stm` package. - -```{r tdm,echo=F,eval=F} -library(stm) -data("poliblog5k") -p <- c(0, cumsum(as.numeric(lapply(poliblog5k.docs, function(x) ncol(x))))) -i <- do.call("c", lapply(poliblog5k.docs, function(x) x[1,])) -p[length(p)] <- length(i) -j <- rep(0:(length(diff(p)) - 1), diff(p)) -v <- do.call("c", lapply(poliblog5k.docs, function(x) x[2,])) -poli <- Matrix::sparseMatrix(i = i + 1, j = j + 1, x = v) -dupes <- duplicated(slam::as.simple_triplet_matrix(Matrix::t(poli))) -poli <- poli[, ! 
dupes] -poli <- poli / log(Matrix::rowSums(poli > 0)) # tf-idf weight -policoords <- vis(poli, K = 100, n_trees = 20, - tree_threshold = 100, max_iter = 10, - M=10,gamma=15, - distance_method = 'Cosine',verbose=F) -polidata <- data.frame(scale(t(policoords$coords))) -colnames(polidata) <- c('x', 'y') -polidata$rating <- poliblog5k.meta$rating[!dupes] -polidata$blog <- poliblog5k.meta$blog[!dupes] -``` -```{r drawtdm,echo=F,fig.height=4,fig.width=7} -load(system.file("extdata", "polidata.Rda", package="largeVis")) -ggplot(polidata, aes(x = x, y = y, color = blog)) + - geom_point(size = 0.3, alpha = 0.8) + - scale_color_manual(values = colors_divergent_discrete(6)(6)) + - facet_grid(. ~ rating, scale = 'free') + - scale_x_continuous("", breaks = NULL) + - scale_y_continuous("", breaks = NULL) + - ggtitle("Visualization of a tf-idf Matrix") +`largeVis` supports sparse matrices. Besides facilitating very large datasets, this makes it practicable to visualize term-document-matrices directly, and compare the result with the result of visualizing topic vectors. + +## Visualizing Graphs + +The `largeVis` visualization algorithm can be used to visualize undirected weighted or unweighted acyclic graphs. The included `wiki` dataset is an example. + +The following code illustrates how to import and visualize a graph using the YouTube-communities dataset available [here](https://snap.stanford.edu/data/com-Youtube.html). The data and visualization are not included here for size reasons. 
+ +```{r youtube,eval=F,echo=T} +youtube <- readr::read_tsv(pathToGraphFile, skip=4, col_names=FALSE) +youtube <- as.matrix(youtube) +youtube <- Matrix::sparseMatrix(i = youtube[, 1], + j = youtube[, 2], + x = rep(1, nrow(youtube)), + dims = c(max(youtube), max(youtube))) +youtube <- youtube + t(youtube) +communities <- readr::read_lines(pathToCommunities) +communities <- lapply(communities, + FUN = function(x) as.numeric(unlist(strsplit(x, "\t")))) +community_assignments <- rep(0, + nrow(youtube)) +for (i in seq_along(communities)) community_assignments[communities[[i]]] <- i + +wij <- buildWijMatrix(youtube) +youTube_coordinates <- projectKNNs(wij) +youTube_coordinates <- data.frame(scale(t(youTube_coordinates))) +colnames(youTube_coordinates) <- c("x", "y") +youTube_coordinates$community <- factor(community_assignments) +youTube_coordinates$alpha <- factor(ifelse(youTube_coordinates$community == 0, 0.05, 0.2)) +ggplot(youTube_coordinates, aes( x = x, + y = y, + color = community, + alpha = alpha, + size = alpha)) + + geom_point() + + scale_color_manual(values = + c("black", colors_continuous(5000)), + guide = FALSE) + + scale_alpha_manual(values = c(0.005, 0.2), guide = FALSE) + + scale_size_manual(values = c(0.03, 0.15), guide = FALSE) + + scale_x_continuous("", + breaks = NULL, limits = c(-2.5,2.5)) + + scale_y_continuous("", + breaks = NULL, limits = c(-2.5,2.5)) + + ggtitle("YouTube Communities") ``` ## Distance Methods -The original `LargeVis` paper used Euclidean distances exclusively. The `largeVis` package offers a choice among Euclidean and Cosine distance measures. +The original `LargeVis` paper used Euclidean distances exclusively. The `largeVis` package offers a choice between Euclidean and Cosine distance measures. + +The implementation is not optimized for cosine distances. ## Memory Consumption -The algorithm is necessarily memory-intensive for large datasets. 
`neighborsToVectors`, `distance`, and `buildEdgeMatrix` are available as separate functions to facilitate memory-efficient handling of large datasets, because the high-dimensional dataset is not needed after distances have been calculated. In this case, the workflow is: +The algorithm is necessarily memory-intensive for large datasets. -```{r eval=F,echo=T} +A simple way to reduce peak memory usage is to turn off the `save_neighbors` parameter when running `vis`. If this is insufficient, the steps of the algorithm can be run separately with the `neighborsToVectors`, `distance`, and `buildEdgeMatrix` functions. In this case, the workflow is: + +```{r lowmemexample,eval=F,echo=T} neighbors <- randomProjectionTreeSearch(largeDataset) -neighborIndices <- neighborsToVectors(neighbors) +edges <- buildEdgeMatrix(data = largeDataset, neighbors = neighbors) rm(neighbors) -distances <- distance(neighborIndices$i, - neighborIndices$j, - largeDataset) -rm(largeDataset) -wij <- buildEdgeMatrix(i = neighborIndices$i, - j = neighborIndices$j, - d = distances) -rm(distances, neighborIndices) -coords <- projectKNNs(wij$wij) +gc() +wij <- buildWijMatrix(edges) +rm(edges) +gc() +coords <- projectKNNs(wij) ``` -In testing, this method reduced peak RAM requirements by more than 70%. +Note that `gc()` is being called explicitly. The reason is that R will not collect garbage while executing the package's C++ functions, which can require substantial temporary RAM. + +Memory requirements during the neighbor search may be managed by reducing `n_trees` and increasing the `tree_threshold`. The decrease in precision is marginal, and may be compensated-for by increasing `max_iters`. See the benchmarks vignette for further detail. 
-## Bibliography +## References + +```{r save,eval=rebuild} +save(agcoords, iriscoords, file = "vignettedata/vignettedata.Rda") +``` diff --git a/inst/doc/largeVis.html b/inst/doc/largeVis.html index 88b638f..b532855 100644 --- a/inst/doc/largeVis.html +++ b/inst/doc/largeVis.html @@ -12,7 +12,7 @@ - + largeVis: An Implementation of the LargeVis Algorithm @@ -70,19 +70,29 @@

largeVis: An Implementation of the LargeVis Algorithm

Amos Elberg

-

2016-05-30

+

2016-08-01

This Vingette provides an overview of the largeVis package.

Introduction

-

The largeVis package offers four functions for visualizing high-dimensional datasets and finding approximate nearest neighbors, based on the LargeVis algorithm presented in Tang et al. (2016):

+

This package provides LargeVis visualizations and fast nearest-neighbor search. The LargeVis algorithm, presented in Tang et al. (2016), creates high-quality low-dimensional representations of large, high-dimensional datasets, similar to t-SNE.

+

These visualizations are useful for data exploration, for visualizing complex non-linear functions, and especially for visualizing embeddings such as learned vectors for images.

+

A limitation of t-SNE is that because the algorithm has complexity order \(O(n^2)\), it is not feasible for use on even moderately sized datasets. Barnes-Hut, an approximation of t-SNE, has complexity \(O(n \log n)\) but also quickly becomes infeasible as the size of data grows. LargeVis is intended to address the issue by operating in linear \(O(n)\) time. It has been benchmarked at more than 30x faster than Barnes-Hut on datasets of approximately 1-million rows, and scaled linearly as long as there is sufficient RAM.

+

In addition, LargeVis includes an algorithm for finding approximate k-Nearest Neighbors in \(O(n)\) time. This algorithm turns out to be faster at finding accurate a-NNs than any other method I was able to test.

+

The package also includes a function for visualizing image embeddings by plotting images at the locations given by the LargeVis algorithm.

+

For a detailed description of the algorithm, please see the original paper, Tang et al. (2016).

+
+
+

Package Overview

+

The largeVis package offers five functions for visualizing high-dimensional datasets and finding approximate nearest neighbors (along with some helper functions):

  1. randomProjectionTreeSearch, a method for finding approximate nearest neighbors.
  2. projectKNNs, which takes as input a weighted nearest-neighbor graph and estimates a projection into a low-dimensional space.
  3. -
  4. vis, which combines randomProjectionTreeSearch, buildEdgeMatrix, and projectKNNs, along with additional code to implement the LargeVis algorithm.
  5. -
  6. manifoldMap, which produces a plot for visualizing embeddings of images.
  7. +
  8. largeVis, which implements the entire LargeVis algorithm.
  9. +
  10. manifoldMap (and companion ggManifoldMap), which produce a plot for visualizing embeddings of images.
  11. +
  12. buildWijMatrix takes a sparse matrix of the distances between nearest neighbors, and returns one with the edges properly weighted for use in projectKNNs.

See the original paper for a detailed description of the algorithm.

@@ -91,90 +101,119 @@

Data Preparation

For input to largeVis, data should be scaled, NA’s, Infs and NULL removed, and transposed from the R-standard so that examples are columns and features are rows. Duplicates should be removed as well.

If there are NA’s, Infs, or NULLs in the input, randomProjectionTreeSearch will definitely fail.

If the numerical range covered by the data is large, this can cause errors in or before the buildEdgeMatrix function. This is because the algorithm requires calculating \(\exp(||\vec{x_i}, \vec{x_j}||^2)\) in the high-dimensional space, which will overflow if the distance between any nearest neighbors exceeds about 26.

-

If there are duplicates in the input data, while the implementation tries to filter duplicates, it is likely to lead to problems. If the number of duplicates is large, this can cause the random projection tree search to fail. If the number is small, the algorithm may identify a sufficient number of neighbors, but an error may then occur during buildEdgeMatrix, or stochastic gradient descent.

- -
-

Examples

-

-

+

Duplicates in the input data are likely to cause issues. If the number of duplicates is large, this can cause the random projection tree search to fail. If the number is small, the algorithm may identify a sufficient number of neighbors, but an error may then occur during buildEdgeMatrix, or stochastic gradient descent.

Overview of Functions and Hyperparameters

randomProjectionTreeSearch

-

This function uses a two-phase algorithm to find approximate nearest neighbors. In the first phase, the algorithm creates n_trees binary trees dividing the space into leaves of at most tree_threshold nodes. A node’s candidate nearest neighbors are the union of all nodes with which it shared a leaf on any of the trees. In the second phase, for each node, the algorithm looks at the candidate nearest neighbors for that node, as well as each of those nodes’ candidate nearest neighbors. The logic of the algorithm is that a node’s neighbors’ neighbors are likely to be the node’s own neighbors. In each iteration, the closest K candidate neighbors for each node are kept.

+

This function uses a two-phase algorithm to find approximate nearest neighbors. In the first phase, which is based on Erik Bernhardsson’s Annoy algorithm, n_trees trees are formed by recursively dividing the space by hyperplanes until at most tree_threshold nodes remain in a branch. A node’s candidate nearest neighbors are the union of all nodes with which it shared a leaf on any of the trees. The largeVis algorithm adds a second phase, neighborhood exploration, which considers, for each node, whether the candidate neighbors of the node’s candidate immediate neighbors are closer. The logic of the algorithm is that a node’s neighbors’ neighbors are likely to be the node’s own neighbors. In each iteration, the closest K candidate neighbors for each node are kept.

+

(Note that this implementation of largeVis differs from the approach taken by Annoy, in that Annoy always uses the number of features as the leaf threshold, where largeVis allows this to be an adjustable parameter.)

The authors of Tang et al. (2016) suggest that a single iteration of the second phase is generally sufficient to obtain satisfactory performance.

-

The chart below illlustrates the trade-off between performance and accuracy for the nearest-neighbor search, using various hyperparameters. The data was produced using the benchmark.R script in the inst/ directory. The test data is the 1-million vector, 128-feature SIFT Dataset, as per Erik Bernhardsson’s ANN Benchmark github.

-

-

If randomProjectionTreeSearch fails to find the desired number of neighbors, usually the best result is obtained by increasing the tree threshold. If randomProjectionTreeSearch fails with an error that no neighbors were found for some nodes, and the tree threshold is already reasonable, this may be an indication that duplicates remain in the input data.

+

See the vignette “ANN Benchmarks” for additional information.

projectKNNs

This function takes as its input a Matrix::sparseMatrix, of connections between nodes. The matrix must be symmetric. A non-zero cell implies that node i is a nearest neighbor of node j, vice-versa, or both. Non-zero values represent the strength of the connection relative to other nearest neighbors of the two nodes.

The LargeVis algorithm, explained in detail in Tang et al. (2016), estimates the embedding by sampling from the identitied nearest-neighbor connections. For each edge, the algorithm also samples M non-nearest neighbor negative samples. M, along with \(\gamma\) and \(\alpha\), control the visualization. \(\alpha\) controls the desired distance between nearest neighbors. \(\gamma\) controls the relative strength of the attractive force between nearest neighbors and repulsive force between non-neighbors.

-

The following grid illustrates the effect of the \(\alpha\) and \(\gamma\) hyperparameters, using the wiki dataset which is included with the package:

-

+

The following grid illustrates the effect of the \(\alpha\) and \(\gamma\) hyperparameters:

+
load(system.file(package = "largeVis", "extdata/vignettedata.Rda"))
+

The additional hyperparameters \(\rho\) and min-\(\rho\) control the starting and final learning rate for the stochastic gradient descent process.

The algorithm can treat positive edge weights in two different ways. The authors of Tang et al. (2016) suggest that edge weights should be used to generate a weighted sampling. However, the algorithm for taking a weighted sample runs in \(O(n \log n)\). Alternatively, the edge-weights can be applied to the gradients. This is controlled by the weight_pos_samples parameter.

vis

The vis function combines randomProjectionTreeSearch and projectKNNs, along with additional logic for calculating edge weights, to implement the complete LargeVis algorithm.

-

The following chart illustrates the effect of the M and K parameters, using the iris dataset.

-

+

The following chart illustrates the effect of the M and K parameters, using the iris dataset. Each row re-uses the same set of identified K neighbors, and initial coordinates.

+

manifoldMap

-

The manifoldMap function is useful when the examples being clustered are themselves images. Given a coordinate matrix (as generated by projectKNNs or vis) and an array of N images, the function samples n images and plots them at the coordinates given in the matrix. If the transparency parameter is a number between 0 and 1, then the function adds to each image an alpha channel where the value per pixel is proportional to \(transparency *\) the image content.

-

The function can plot both color and greyscale images.

-

The following code will plot 5000 images sampled from the MNIST dataset at positions generated by vis:

-
if (exists("trainData")) {
-  dim(trainData) <- c(60000, 28, 28)
-  manifoldMap(mnistCoords[,1:2],
-      n = 5000,
-      scale = 0.003,
-      transparency = F,
-      images = trainData,
-      xlab="", ylab="",
-      xlim = c(-2, 2),
-      ylim = c(-2, 2))
-} 
-

-

The code is disabled by default in this vignette for data size reasons.

+

The manifoldMap function is useful when the examples being clustered are themselves images. Given a coordinate matrix (as generated by projectKNNs or vis) and an array of N images, the function samples n images and plots them at the coordinates given in the matrix.

+

The following code will generate the visualization shown in the examples:

+
+dim(trainData) <- c(60000, 28, 28)
+trainData <- aperm(trainData, perm = c(1,3,2), resize = FALSE)
+set.seed(1974)
+manifoldMap(mnistCoords[,1:2],
+    n = 5000,
+    scale = 0.1,
+    images = trainData,
+    xlab = "", 
+    ylab = "")

Support for Sparse Matrices

-

largeVis supports sparse matrices. Besides facilitating very large datasets, this makes it practicable to visualize term-document-matrices.

-

For example, the following plot visualizes a tf-idf weighted document-term matrix for a corpus of 5000 political blog entries, as included with the stm package.

-

+

largeVis supports sparse matrices. Besides facilitating very large datasets, this makes it practicable to visualize term-document-matrices directly, and compare the result with the result of visualizing topic vectors.

+
+
+

Visualizing Graphs

+

The largeVis visualization algorithm can be used to visualize undirected weighted or unweighted acyclic graphs. The included wiki dataset is an example.

+

The following code illustrates how to import and visualize a graph using the YouTube-communities dataset available here. The data and visualization are not included here for size reasons.

+
youtube <- readr::read_tsv(pathToGraphFile, skip=4, col_names=FALSE)
+youtube <- as.matrix(youtube)
+youtube <- Matrix::sparseMatrix(i = youtube[, 1],
+                                j = youtube[, 2],
+                                x = rep(1, nrow(youtube)), 
+                                dims = c(max(youtube), max(youtube)))
+youtube <- youtube + t(youtube)
+communities <- readr::read_lines(pathToCommunities)
+communities <- lapply(communities, 
+                      FUN = function(x) as.numeric(unlist(strsplit(x, "\t"))))
+community_assignments <- rep(0, 
+                             nrow(youtube))
+for (i in 1:length(communities)) community_assignments[communities[[i]]] <- i
+
+wij <- buildWijMatrix(youtube)
+youTube_coordinates <- projectKNNs(wij)
+youTube_coordinates <- data.frame(scale(t(youTube_coordinates)))
+colnames(youTube_coordinates) <- c("x", "y")
+youTube_coordinates$community <- factor(community_assignments)
+youTube_coordinates$alpha <- factor(ifelse(youTube_coordinates$community == 0, 0.05, 0.2))
+ggplot(youTube_coordinates, aes( x = x, 
+                      y = y, 
+                      color = community, 
+                      alpha = alpha, 
+                      size = alpha)) +
+  geom_point() +
+  scale_color_manual(values = 
+                       c("black", colors_continuous(5000)),
+                     guide = FALSE) +
+  scale_alpha_manual(values = c(0.005, 0.2), guide = FALSE) +
+  scale_size_manual(values = c(0.03, 0.15), guide = FALSE) +
+  scale_x_continuous("", 
+                     breaks = NULL, limits = c(-2.5,2.5)) +
+  scale_y_continuous("", 
+                     breaks = NULL, limits = c(-2.5,2.5)) +
+  ggtitle("YouTube Communities")

Distance Methods

-

The original LargeVis paper used Euclidean distances exclusively. The largeVis package offers a choice among Euclidean and Cosine distance measures.

+

The original LargeVis paper used Euclidean distances exclusively. The largeVis package offers a choice between Euclidean and Cosine distance measures.

+

The implementation is not optimized for cosine distances.

Memory Consumption

-

The algorithm is necessarily memory-intensive for large datasets. neighborsToVectors, distance, and buildEdgeMatrix are available as separate functions to facilitate memory-efficient handling of large datasets, because the high-dimensional dataset is not needed after distances have been calculated. In this case, the workflow is:

+

The algorithm is necessarily memory-intensive for large datasets.

+

A simple way to reduce peak memory usage is to turn off the save_neighbors parameter when running largeVis. If this is insufficient, the steps of the algorithm can be run separately with the randomProjectionTreeSearch, buildEdgeMatrix, buildWijMatrix, and projectKNNs functions. In this case, the workflow is:

neighbors <- randomProjectionTreeSearch(largeDataset)
-neighborIndices <- neighborsToVectors(neighbors)
+edges <- buildEdgeMatrix(data = largeDataset, neighbors = neighbors)
 rm(neighbors)
-distances <- distance(neighborIndices$i, 
-                      neighborIndices$j,
-                      largeDataset)
-rm(largeDataset)
-wij <- buildEdgeMatrix(i = neighborIndices$i, 
-                       j = neighborIndices$j, 
-                       d = distances)
-rm(distances, neighborIndices)
-coords <- projectKNNs(wij$wij)
-

In testing, this method reduced peak RAM requirements by more than 70%.

+gc()
+wij <- buildWijMatrix(edges)
+rm(edges)
+gc()
+coords <- projectKNNs(wij)
+

Note that gc() is being called explicitly. The reason is that R will not collect garbage while executing the package’s C++ functions, which can require substantial temporary RAM.

+

Memory requirements during the neighbor search may be managed by reducing n_trees and increasing the tree_threshold. The decrease in precision is marginal, and may be compensated for by increasing max_iters. See the benchmarks vignette for further detail.

-
-

Bibliography

+
+

References

+
save(agcoords, iriscoords, file = "vignettedata/vignettedata.Rda")
-
-

Tang, Jian, Jingzhou Liu, Ming Zhang, and Qiaozhu Mei. 2016. “Visualization Large-Scale and High-Dimensional Data.” CoRR abs/1602.00370. http://arxiv.org/abs/1602.00370.

+
+

Tang, Jian, Jingzhou Liu, Ming Zhang, and Qiaozhu Mei. 2016. “Visualizing Large-Scale and High-Dimensional Data.” In Proceedings of the 25th International Conference on World Wide Web, 287–97. International World Wide Web Conferences Steering Committee.

diff --git a/inst/extdata/agcoords.Rda b/inst/extdata/agcoords.Rda deleted file mode 100644 index 9d3f948..0000000 Binary files a/inst/extdata/agcoords.Rda and /dev/null differ diff --git a/inst/extdata/benchmark.Rda b/inst/extdata/benchmark.Rda index 8279efd..b49a207 100644 Binary files a/inst/extdata/benchmark.Rda and b/inst/extdata/benchmark.Rda differ diff --git a/inst/extdata/d3coords.Rda b/inst/extdata/d3coords.Rda deleted file mode 100644 index 9b797c1..0000000 Binary files a/inst/extdata/d3coords.Rda and /dev/null differ diff --git a/inst/extdata/iriscoords.Rda b/inst/extdata/iriscoords.Rda deleted file mode 100644 index 91338c6..0000000 Binary files a/inst/extdata/iriscoords.Rda and /dev/null differ diff --git a/inst/extdata/mnistcoords.Rda b/inst/extdata/mnistcoords.Rda deleted file mode 100644 index 732c96c..0000000 Binary files a/inst/extdata/mnistcoords.Rda and /dev/null differ diff --git a/inst/extdata/ngcoords.Rda b/inst/extdata/ngcoords.Rda deleted file mode 100644 index fef950d..0000000 Binary files a/inst/extdata/ngcoords.Rda and /dev/null differ diff --git a/inst/extdata/polidata.Rda b/inst/extdata/polidata.Rda deleted file mode 100644 index d940f7e..0000000 Binary files a/inst/extdata/polidata.Rda and /dev/null differ diff --git a/inst/extdata/results.csv b/inst/extdata/results.csv deleted file mode 100644 index 32cf3b0..0000000 --- a/inst/extdata/results.csv +++ /dev/null @@ -1,20 +0,0 @@ -523.392,14.1568,10,1,10 -1004.52,32.7121,10,1,20 -3323.268,64.9899,10,1,50 -9881.42,86.2028,10,1,100 -1106.728,37.8124,10,2,10 -10482.836,99.5142,10,2,50 -16623.816,99.9957,10,2,100 -1113.82,15.7282,20,1,10 -1601.836,37.8568,20,1,20 -3979.824,77.5392,20,1,50 -9759.06,94.0059,20,1,100 -1697.068,32.7476,20,2,10 -4145.32399999999,80.7964,20,2,20 -10173.196,99.0039,20,2,50 -1190.696,14.8735,30,1,10 -1916.828,36.3931,30,1,20 -3922.892,79.9649,30,1,50 -1401.26,26.5446,30,2,10 -3328.328,68.9364,30,2,20 -4670.212,86.4823,10,2,20 diff --git 
a/inst/extdata/vignettedata.Rda b/inst/extdata/vignettedata.Rda new file mode 100644 index 0000000..babd856 Binary files /dev/null and b/inst/extdata/vignettedata.Rda differ diff --git a/inst/extdata/wijstuff.Rda b/inst/extdata/wijstuff.Rda new file mode 100644 index 0000000..f78e918 Binary files /dev/null and b/inst/extdata/wijstuff.Rda differ diff --git a/man/buildEdgeMatrix.Rd b/man/buildEdgeMatrix.Rd index 8f740ea..9d9e624 100644 --- a/man/buildEdgeMatrix.Rd +++ b/man/buildEdgeMatrix.Rd @@ -2,48 +2,24 @@ % Please edit documentation in R/buildEdgeMatrix.R \name{buildEdgeMatrix} \alias{buildEdgeMatrix} -\alias{buildEdgeMatrix.CsparseMatrix} -\alias{buildEdgeMatrix.TsparseMatrix} -\alias{buildEdgeMatrix.default} -\title{Build an edge-weight matrix for the LargeVis algorithm.} +\title{Build an nearest-neighbor graph weighted by distance.} \usage{ -buildEdgeMatrix(x, i, j, p, d, perplexity, verbose) - -\method{buildEdgeMatrix}{default}(x = NULL, i, j, p = NULL, d, - perplexity = 50, verbose = TRUE) - -\method{buildEdgeMatrix}{CsparseMatrix}(x, i = NULL, j = NULL, p = NULL, - d = NULL, perplexity = 50, verbose = TRUE) - -\method{buildEdgeMatrix}{TsparseMatrix}(x, i = NULL, j = NULL, p = NULL, - d = NULL, perplexity = 50, verbose = TRUE) +buildEdgeMatrix(data, neighbors, distance_method = "Euclidean", + verbose = TRUE) } \arguments{ -\item{x}{A sparseMatrix, either a \code{\link[Matrix]{CsparseMatrix-class}} or \code{\link[Matrix]{TsparseMatrix-class}}} - -\item{i}{Indices of one node of the nearest-neighbor graph.} +\item{data}{A matrix with a number of columns equal to the number of columns in `x`} -\item{j}{Indices of the other node.} +\item{neighbors}{An adjacency matrix of the type produced by \code{\link{randomProjectionTreeSearch}}.} -\item{p}{Integer vector of pointers to the initial index of elements of each column. 
See \code{\link[Matrix]{CsparseMatrix-class}}.} - -\item{d}{The distances between the nodes identified in parameters \code{i} and \code{j}.} - -\item{perplexity}{See the paper for discussion.} +\item{distance_method}{One of "Euclidean" or "Cosine"} \item{verbose}{Verbosity} } \value{ -A list containing: \describe{ -\item{"sigmas"}{A vector of \eqn{2 \dot \sigma^2} calculated for each node.} -\item{"wij"}{A symmetric, sparse matrix of the weights for each edge between nearest neighbors.} -} +A `sparseMatrix` } \description{ -Build an edge-weight matrix for the LargeVis algorithm. -} -\details{ -Implements the portion of the LargeVis algorithm that converts distances between nearest neighbors to an -edge-weight graph. +Build an nearest-neighbor graph weighted by distance. } diff --git a/man/buildWijMatrix.Rd b/man/buildWijMatrix.Rd new file mode 100644 index 0000000..8e19141 --- /dev/null +++ b/man/buildWijMatrix.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/buildEdgeMatrix.R +\name{buildWijMatrix} +\alias{buildWijMatrix} +\alias{buildWijMatrix.CsparseMatrix} +\alias{buildWijMatrix.TsparseMatrix} +\title{buildWijMatrix} +\usage{ +buildWijMatrix(x, perplexity = 50) + +\method{buildWijMatrix}{TsparseMatrix}(x, perplexity = 50) + +\method{buildWijMatrix}{CsparseMatrix}(x, perplexity = 50) +} +\arguments{ +\item{x}{A sparse matrix} + +\item{perplexity}{Given perplexity.} +} +\value{ +A \code{list} with the following components: \describe{ + \item{'dist'}{An [N,K] matrix of the distances to the nearest neighbors.} + \item{'id'}{An [N,K] matrix of the node indexes of the neartest neighbors. Note that this matrix is 1-indexed, + unlike most other matrices in this package.} + \item{'k'}{The number of nearest neighbors.} + } +} +\description{ +Rescale the weights in an edge matrix to match a given perplexity. 
+} + diff --git a/man/dbscan.Rd b/man/dbscan.Rd new file mode 100644 index 0000000..2e4c981 --- /dev/null +++ b/man/dbscan.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dbscan.R +\name{dbscan} +\alias{dbscan} +\title{dbscan} +\usage{ +dbscan(data = NULL, neighbors = NULL, edges = NULL, eps, + minPts = nrow(data) + 1, partition = !missing(edges), verbose = TRUE) +} +\arguments{ +\item{data}{Input data, where examples are columns.} + +\item{neighbors}{An adjacency matrix of the type produced by \code{\link{randomProjectionTreeSearch}}} + +\item{edges}{A weighted graph of the type produced by \code{\link{buildEdgeMatrix}}.} + +\item{eps}{See \code{\link[dbscan]{dbscan}}.} + +\item{minPts}{Minimum size of a cluster.'} + +\item{partition}{If \code{TRUE}, attempt to calculate an approximate silhouette so the object returned is also +of class \code{\link[cluster]{partition.object}}, for compatibility with the \code{cluster} package.} + +\item{verbose}{Verbosity level.} +} +\value{ +An \code{\link[dbscan]{dbscan}} object. +} +\description{ +An implementation of the dbscan algorithm. +} +\details{ +This is a preliminary implementation of the OPTICS algorithm that attempts +to leverage the \code{largeVis} nearest-neighbor search. + +One of \code{neighbors} or \code{edges} must be specified. If \code{edges} is missing, +\code{data} must also be given. If \code{data} is given along with either \code{edges} +or \code{neighbors}, the algorithm will attempt a more thorough search. +} +\note{ +Support for dbscan and optics are preliminary, and not fully tested for +correctness. + +This is not the original DBSCAN algorithm. In particular, the neighbor-search strategy in +DBSCAN is not used, in favor of using a pre-calculated neighbor matrix produced incidentally by +`largeVis`. 
+} + diff --git a/man/facevectors.Rd b/man/facevectors.Rd new file mode 100644 index 0000000..d77a805 --- /dev/null +++ b/man/facevectors.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/facevectors.R +\docType{data} +\name{facevectors} +\alias{facevectors} +\title{Embedding vectors for faces in the Labelled Faces in the Wild dataset} +\format{A data.frame where each row represents an image. The first column is the name of the person in the image, the second column +is the name of the image file, and the remaining columns are the columns of the embedding vector for each image as calculated with +the OpenFace `batch-represent` function.} +\source{ +\url{http://openface-models.storage.cmusatyalab.org/lfw.nn4.small2.v1/labels.csv} + +\url{http://openface-models.storage.cmusatyalab.org/lfw.nn4.small2.v1/reps.csv} +} +\usage{ +facevectors +} +\description{ +A dataset of OpenFace embeddings for the "Labelled Faces in the Wild" dataset, see \url{http://vis-www.cs.umass.edu/lfw/}. +} +\details{ +OpenFace is a facial recognition library. The similarity between two OpenFace vectors should correlate with the +likelihood that the vectors were generated from images of the same person. For details and discussion, +see \url{https://cmusatyalab.github.io/openface/}. The images may be obtained from \url{http://vis-www.cs.umass.edu/lfw/}. +} +\keyword{datasets} + diff --git a/man/ggManifoldMap.Rd b/man/ggManifoldMap.Rd new file mode 100644 index 0000000..feef536 --- /dev/null +++ b/man/ggManifoldMap.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/visualize.R +\name{ggManifoldMap} +\alias{ggManifoldMap} +\title{Visualize an embedding by ggplotting with images} +\usage{ +ggManifoldMap(ggObject = NULL, x, n = nrow(x), images, scale = 1) +} +\arguments{ +\item{ggObject}{a \code{\link[ggplot2]{ggplot}} object. 
If not provided, a new \code{ggplot} +object with \code{\link[ggplot2]{geom_blank}} will be created.} + +\item{x}{A \code{largeVis} object or [N,D] matrix of coordinates.} + +\item{n}{The number of images to sample.} + +\item{images}{The images. A 3-D or 4-D array.} + +\item{scale}{Proportion to scale the images to.} +} +\value{ +A \code{ggplot} object. +} +\description{ +Identical to \link{manifoldMap}, but adds images to an existing \code{ggplot2} object or creates one. +} +\details{ +See \code{\link{manifoldMap}}. Note that this function can be considerably slower to display than \code{manifoldMap}. +It therefore should only be used if other features of \code{ggplot2} are required. + +If the objects in the list are \code{matrix} objects, or the array is 3-dimensional, the images will be treated as +greyscale. If there is an additional dimension, it must have a length of 3 and be RGB color layers. +} + diff --git a/man/largeVis.Rd b/man/largeVis.Rd index e1b7b57..c8ae8fe 100644 --- a/man/largeVis.Rd +++ b/man/largeVis.Rd @@ -1,12 +1,67 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/largeVis-package.r +% Please edit documentation in R/largeVis-package.r, R/largeVis.R \docType{package} \name{largeVis} \alias{largeVis} \alias{largeVis-package} \title{largeVis: high-quality visualizations for large, high-dimensionality datasets} +\usage{ +largeVis(x, dim = 2, K = 40, n_trees = 50, tree_threshold = max(10, + ncol(x)), max_iter = 1, distance_method = "Euclidean", perplexity = 50, + sgd_batches = NULL, M = 5, alpha = 1, gamma = 7, rho = 1, + coords = NULL, save_neighbors = TRUE, verbose = TRUE, ...) +} +\arguments{ +\item{x}{A matrix, where the features are rows and the examples are columns.} + +\item{dim}{The number of dimensions in the output} + +\item{K}{The number of nearest-neighbors to use in computing the kNN graph} + +\item{n_trees}{See \code{\link{randomProjectionTreeSearch}}. 
The default is set at 50, which is the number +used in the examples in the original paper.} + +\item{tree_threshold}{See \code{\link{randomProjectionTreeSearch}}. By default, this is the number of features +in the input set.} + +\item{max_iter}{See \code{\link{randomProjectionTreeSearch}}.} + +\item{distance_method}{One of "Euclidean" or "Cosine." See \code{\link{randomProjectionTreeSearch}}.} + +\item{perplexity}{See \code{\link{buildWijMatrix}}.} + +\item{sgd_batches}{See \code{\link{projectKNNs}}.} + +\item{M}{See \code{\link{projectKNNs}}.} + +\item{alpha}{See \code{\link{projectKNNs}}.} + +\item{gamma}{See \code{\link{projectKNNs}}.} + +\item{rho}{See \code{\link{projectKNNs}}.} + +\item{coords}{A [N,K] matrix of coordinates to use as a starting point -- useful for refining an embedding in stages.} + +\item{save_neighbors}{Whether to include in the output the adjacency matrix of nearest neighbors.} + +\item{verbose}{Verbosity} + +\item{...}{Additional arguments passed to \code{\link{projectKNNs}}.} +} +\value{ +A `largeVis` object with the following slots: + \describe{ + \item{'knns'}{An [N,K] 0-indexed integer matrix, which is an adjacency list of each vertex' identified nearest neighbors. + If the algorithm failed to find \code{K} neighbors, the matrix is padded with \code{NA}'s.} + \item{'wij'}{A sparse [N,N] matrix where each cell represents \eqn{w_{ij}}.} + \item{'call'}{The call.} + \item{'coords'}{A [N,D] matrix of the embedding of the dataset in the low-dimensional space.} + } +} \description{ This is an implementation of the \code{largeVis} algorithm by Tang et al. + +Apply the LargeVis algorithm for visualizing large high-dimensional datasets. } \details{ \code{largeVis} estimates a low-dimensional embedding for high-dimensional data, where the distance between vertices @@ -22,8 +77,30 @@ may be appropriate for some datasets if the algorithm has trouble finding K neig nearest neighbor of each of its nodes. 
\item Using stochastic gradient descent, estimate an embedding for each vertex in the low-dimensional space. } +} +\examples{ +# iris +data(iris) +dat <- as.matrix(iris[,1:4]) +dat <- scale(dat) +dupes = which(duplicated(dat)) +dat <- dat[-dupes,] # duplicates can cause the algorithm to fail +dat <- t(dat) +visObject <- largeVis(dat, max_iter = 20, K = 10) +\dontrun{ +# mnist +load("./mnist.Rda") +dat <- mnist$images +dim(dat) <- c(42000, 28 * 28) +dat <- (dat / 255) - 0.5 +dat <- t(dat) +coords <- largeVis(dat, n_trees = 50, tree_th = 200, K = 50) +} + } \references{ +Jian Tang, Jingzhou Liu, Ming Zhang, Qiaozhu Mei. \href{https://arxiv.org/abs/1602.00370}{Visualizing Large-scale and High-dimensional Data.} + Jian Tang, Jingzhou Liu, Ming Zhang, Qiaozhu Mei. \href{https://arxiv.org/abs/1602.00370}{Visualizing Large-scale and High-dimensional Data.} } diff --git a/man/lof.Rd b/man/lof.Rd new file mode 100644 index 0000000..4775cdd --- /dev/null +++ b/man/lof.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dbscan.R +\name{lof} +\alias{lof} +\title{Local Outlier Factor Score} +\usage{ +lof(edges) +} +\arguments{ +\item{edges}{An edge matrix of the type produced by \code{\link{buildEdgeMatrix}}.} +} +\value{ +A vector of LOF values for each data point. +} +\description{ +Calculate the Local Outlier Factor (LOF) score for each data point given knowledge +of k-Nearest Neighbors. +} +\references{ +Based on code in the \code{\link[dbscan]{dbscan}} package. +} + diff --git a/man/manifoldMap.Rd b/man/manifoldMap.Rd index 569106c..119cc6e 100644 --- a/man/manifoldMap.Rd +++ b/man/manifoldMap.Rd @@ -4,7 +4,7 @@ \alias{manifoldMap} \title{Visualize an embedding by plotting with images} \usage{ -manifoldMap(x, n, images, scale = 1, transparency = FALSE, ...) +manifoldMap(x, n = nrow(x), images, scale = 1, ...) 
} \arguments{ \item{x}{A \code{largeVis} object or [N,D] matrix of coordinates.} @@ -15,8 +15,6 @@ manifoldMap(x, n, images, scale = 1, transparency = FALSE, ...) \item{scale}{Proportion to scale the images to.} -\item{transparency}{Whether to add an alpha channel to greyscale images.} - \item{...}{Addiitional parameters passed to \code{plot}.} } \description{ @@ -48,4 +46,10 @@ manifoldMap(coords, mnistimages) } } +\references{ +Andrej Karpapthy. \href{http://cs.stanford.edu/people/karpathy/cnnembed/}{t-SNE Visualization of CNN Codes.} +} +\seealso{ +\code{\link{ggManifoldMap}} +} diff --git a/man/manifoldMapStretch.Rd b/man/manifoldMapStretch.Rd new file mode 100644 index 0000000..8391230 --- /dev/null +++ b/man/manifoldMapStretch.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/visualize.R +\name{manifoldMapStretch} +\alias{manifoldMapStretch} +\title{manifoldMapStretch} +\usage{ +manifoldMapStretch(x, f, size_x = 500, size_y = 500, image_size = 50, ...) +} +\arguments{ +\item{x}{A [N,D] matrix of coordinates.} + +\item{f}{A function that, called with the index number of a row of \code{x}, returns an R object representing +an image. See the example.} + +\item{size_x}{The width of the requested plot, in pixels.} + +\item{size_y}{The height of the requested plot, in pixels.} + +\item{image_size}{The size to plot each image; each is plotted as a square.} + +\item{...}{Additional parameters passed to \code{plot}.} +} +\description{ +A manifold map that fills the full extent of the plot. +} +\details{ +Ported from \url{http://cs.stanford.edu/people/karpathy/cnnembed/}. Each position is filled with its nearest neighbor. +} +\note{ +This function is experimental. 
+} +\examples{ +\dontrun{ +# Demonstration of f +load(system.file("extdata", "faces.Rda", package="largeVis")) + +imagepaths <- paste("pathtoimages", + faceLabels[,1], sub("png", "jpg", faceLabels[,2]), sep = "/") + +manifoldMapStretch(as.matrix(faceCoords[,1:2]), + f = function(x) jpeg::readJPEG(imagepaths[x]), + size_x = 5000, size_y = 5000, image_size = 100) +} + +} + diff --git a/man/optics.Rd b/man/optics.Rd new file mode 100644 index 0000000..0a02190 --- /dev/null +++ b/man/optics.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dbscan.R +\name{optics} +\alias{optics} +\title{OPTICS} +\usage{ +optics(data = NULL, neighbors = NULL, edges = NULL, eps, + minPts = nrow(data) + 1, eps_cl, xi, verbose = TRUE) +} +\arguments{ +\item{data}{Input data, where examples are columns.} + +\item{neighbors}{An adjacency matrix of the type produced by \code{\link{randomProjectionTreeSearch}}} + +\item{edges}{A weighted graph of the type produced by \code{\link{buildEdgeMatrix}}.} + +\item{eps}{See \code{\link[dbscan]{optics}}.} + +\item{minPts}{See \code{\link[dbscan]{optics}}.} + +\item{eps_cl}{See \code{\link[dbscan]{optics}}.} + +\item{xi}{See \code{\link[dbscan]{optics}}.} + +\item{verbose}{Verbosity level.} +} +\value{ +An \code{\link[dbscan]{optics}} object. +} +\description{ +An implementation of the OPTICS algorithm. +} +\details{ +This is a preliminary implementation of a variant of the OPTICS algorithm that attempts +to leverage the \code{largeVis} nearest-neighbor search. + +One of \code{neighbors} or \code{edges} must be specified. If \code{edges} is missing, +\code{data} must also be given. If \code{data} is given along with either \code{edges} +or \code{neighbors}, the algorithm will attempt a more thorough search. +} +\note{ +Support for dbscan and optics are preliminary, and not fully tested for +correctness. + +This is not the original OPTICS algorithm.
In particular, the neighbor-search strategy in +OPTICS is not used, in favor of using a pre-calculated neighbor matrix produced incidentally by +`largeVis`. +} + diff --git a/man/projectKNNs.Rd b/man/projectKNNs.Rd index 29bb6eb..ebcc2bb 100644 --- a/man/projectKNNs.Rd +++ b/man/projectKNNs.Rd @@ -4,36 +4,29 @@ \alias{projectKNNs} \title{Project a distance matrix into a lower-dimensional space.} \usage{ -projectKNNs(wij, dim = 2, sgd_batches = (length(wij@p) - 1) * 20000, - M = 5, weight_pos_samples = if (alpha == 0) { FALSE } else { TRUE - }, gamma = 7, alpha = 1, rho = 1, coords = NULL, min_rho = 0, - verbose = TRUE) +projectKNNs(wij, dim = 2, sgd_batches = NULL, M = 5, gamma = 7, + alpha = 1, rho = 1, coords = NULL, verbose = TRUE) } \arguments{ \item{wij}{A symmetric sparse matrix of edge weights, in C-compressed format, as created with the \code{Matrix} package.} \item{dim}{The number of dimensions for the projection space.} -\item{sgd_batches}{The number of edges to process during SGD; defaults to 20000 * the number of rows in x, as recommended -by the paper authors.} +\item{sgd_batches}{The number of edges to process during SGD.} \item{M}{The number of negative edges to sample for each positive edge.} -\item{weight_pos_samples}{Whether to sample positive edges according to their edge weights (the default, unless alpha == 0) or take the -weights into account when calculating gradient. See also the Details section.} - \item{gamma}{The strength of the force pushing non-neighbor nodes apart.} -\item{alpha}{Hyperparameter used in the default distance function, \eqn{1 / (1 + \alpha \dot ||y_i - y_j||^2)}. If \code{alpha} is 0, the alternative distance -function \eqn{1 / 1 + exp(||y_i - y_j||^2)} is used instead. 
These functions relate the distance between points in the low-dimensional projection to the likelihood -that they two points are nearest neighbors.} +\item{alpha}{Hyperparameter used in the default distance function, \eqn{1 / (1 + \alpha \dot ||y_i - y_j||^2)}. The function relates the distance +between points in the low-dimensional projection to the likelihood that the two points are nearest neighbors. Increasing \eqn{\alpha} tends +to push nodes and their neighbors closer together; decreasing \eqn{\alpha} produces a broader distribution. Setting \eqn{\alpha} to zero +enables the alternative distance function. \eqn{\alpha} below zero is meaningless.} \item{rho}{Initial learning rate.} \item{coords}{An initialized coordinate matrix.} -\item{min_rho}{Final learning rate.} - \item{verbose}{Verbosity} } \value{ @@ -51,15 +44,8 @@ The objective function is: \deqn{ O = \sum_{(i,j)\in E} w_{ij} (\log f(||p(e_{ij where \eqn{f()} is a probabilistic function relating the distance between two points in the low-dimensional projection space, and the probability that they are nearest neighbors. -There are two available probabilistic functions, \eqn{1 / (1 + \alpha \dot ||x||^2)} and \eqn{1 / (1 + \exp(||x||^2))}. -The second function, which the paper authors recommend against, is used if parameter \code{alpha} is set to 0. - -The \code{weight_pos_samples} parameter controls how to handle edge-weights. The paper authors recommend using a weighted -sampling approach to select edges, and treating edge-weight as binary in calculating the objective. This is the default. - -However, the algorithm for drawing weighted samples runs in \eqn{O(n \log n)}. The alternative approach, which runs in -\eqn{O(n)}, is to draw unweighted samples and include \eqn{w_{ij}} in the objective function. In addition, the -alternative probabalistic function used when \eqn{\alpha == 0} tends to overflow unless edge weights are used. 
+The default probabilistic function is \eqn{1 / (1 + \alpha \dot ||x||^2)}. If \eqn{\alpha} is set to zero, +an alternative probabilistic function, \eqn{1 / (1 + \exp(x^2))} will be used instead. Note that the input matrix should be symmetric. If any columns in the matrix are empty, the function will fail. } diff --git a/man/randomProjectionTreeSearch.Rd b/man/randomProjectionTreeSearch.Rd index 57df16d..b540015 100644 --- a/man/randomProjectionTreeSearch.Rd +++ b/man/randomProjectionTreeSearch.Rd @@ -7,20 +7,20 @@ \alias{randomProjectionTreeSearch.matrix} \title{Find approximate k-Nearest Neighbors using random projection tree search.} \usage{ -randomProjectionTreeSearch(x, K = 5, n_trees = 2, tree_threshold = max(10, - nrow(x)), max_iter = 2, max_depth = 32, distance_method = "Euclidean", - verbose = TRUE) +randomProjectionTreeSearch(x, K = 150, n_trees = 50, + tree_threshold = max(10, nrow(x)), max_iter = 1, + distance_method = "Euclidean", verbose = TRUE) -\method{randomProjectionTreeSearch}{matrix}(x, K = 5, n_trees = 2, - tree_threshold = max(10, nrow(x)), max_iter = 2, max_depth = 32, +\method{randomProjectionTreeSearch}{matrix}(x, K = 150, n_trees = 50, + tree_threshold = max(10, nrow(x)), max_iter = 1, distance_method = "Euclidean", verbose = TRUE) -\method{randomProjectionTreeSearch}{CsparseMatrix}(x, K = 5, n_trees = 2, - tree_threshold = max(10, nrow(x)), max_iter = 2, max_depth = 32, +\method{randomProjectionTreeSearch}{CsparseMatrix}(x, K = 150, n_trees = 50, + tree_threshold = max(10, nrow(x)), max_iter = 1, distance_method = "Euclidean", verbose = TRUE) -\method{randomProjectionTreeSearch}{TsparseMatrix}(x, K = 5, n_trees = 2, - tree_threshold = max(10, nrow(x)), max_iter = 2, max_depth = 32, +\method{randomProjectionTreeSearch}{TsparseMatrix}(x, K = 150, n_trees = 50, + tree_threshold = max(10, nrow(x)), max_iter = 1, distance_method = "Euclidean", verbose = TRUE) } \arguments{ @@ -35,8 +35,6 @@ using a value equivalent to the number of features in 
the input set.} \item{max_iter}{Number of iterations in the neighborhood exploration phase.} -\item{max_depth}{The maximum level of recursion.} - \item{distance_method}{One of "Euclidean" or "Cosine."} \item{verbose}{Whether to print verbose logging using the \code{progress} package.} diff --git a/man/vis.Rd b/man/vis.Rd deleted file mode 100644 index 5d8155d..0000000 --- a/man/vis.Rd +++ /dev/null @@ -1,99 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/largeVis.R -\name{vis} -\alias{vis} -\title{Apply the LargeVis algorithm for visualizing large high-dimensional datasets.} -\usage{ -vis(x, dim = 2, K = 40, n_trees = 50, tree_threshold = max(10, nrow(x)), - max_iter = 3, max_depth = 32, distance_method = "Euclidean", - perplexity = 50, sgd_batches = ncol(x) * 20000, M = 5, - weight_pos_samples = TRUE, alpha = 1, gamma = 7, rho = 1, - min_rho = 0, coords = NULL, verbose = TRUE, ...) -} -\arguments{ -\item{x}{A matrix, where the features are rows and the examples are columns.} - -\item{dim}{The number of dimensions in the output} - -\item{K}{The number of nearest-neighbors to use in computing the kNN graph} - -\item{n_trees}{See \code{\link{randomProjectionTreeSearch}}. The default is set at 50, which is the number -used in the examples in the original paper.} - -\item{tree_threshold}{See \code{\link{randomProjectionTreeSearch}}. By default, this is the number of features -in the input set, which is the setting used in the examples in the original paper. Note the time and memory requirements: -the first pass through the neighborhood exploration phases will involve up to \eqn{N * nTrees * threshold} comparisons.} - -\item{max_iter}{See \code{\link{randomProjectionTreeSearch}}.} - -\item{max_depth}{See \code{\link{randomProjectionTreeSearch}}} - -\item{distance_method}{One of "Euclidean" or "Cosine." 
See \code{\link{randomProjectionTreeSearch}}.} - -\item{perplexity}{See paper} - -\item{sgd_batches}{See \code{\link{projectKNNs}}.} - -\item{M}{See \code{\link{projectKNNs}}.} - -\item{weight_pos_samples}{See \code{\link{projectKNNs}}.} - -\item{alpha}{See \code{\link{projectKNNs}}.} - -\item{gamma}{See \code{\link{projectKNNs}}.} - -\item{rho}{See \code{\link{projectKNNs}}.} - -\item{min_rho}{\code{\link{projectKNNs}}.} - -\item{coords}{A [N,K] matrix of coordinates to use as a starting point -- useful for refining an embedding in stages.} - -\item{verbose}{Verbosity} - -\item{...}{See paper} -} -\value{ -A `largeVis` object with the following slots: - \describe{ - \item{'knns'}{An [N,K] integer matrix, which is an adjacency list of each vertex' identified nearest neighbors. - If the algorithm failed to find \code{K} neighbors, the matrix is padded with \code{NA}'s.} - \item{'wij'}{A sparse [N,N] matrix where each cell represents \eqn{w_{ij}}.} - \item{'call'}{The call.} - \item{'coords'}{A [N,D] matrix of the embedding of the dataset in the low-dimensional space.} - } -} -\description{ -Implements the \code{vis} -} -\details{ -Note that this implementation expects the data to be free of \code{NaN}'s, \code{NA}'s, \code{Inf}'s, and duplicate rows. -If any of these assumptions are violated, the algorithm will fail. It is also usually a good idea to scale the input data -to have unit norm and mean 0. If there are large values in the input matrix, some computations may oveflow. 
-} -\examples{ -# iris -data(iris) -dat <- as.matrix(iris[,1:4]) -dat <- scale(dat) -dupes = which(duplicated(dat)) -dat <- dat[-dupes,] # duplicated data potentially can cause the algorithm to fail -dat <- t(dat) -visObject <- vis(dat, max_iter = 20, sgd_batches = 800000, - K = 10, gamma = 2, rho = 1, M = 40, alpha = 20,verbose=FALSE) -\dontrun{ -# mnist -load("./mnist.Rda") -dat <- mnist$images -dim(dat) <- c(42000, 28 * 28) -dat <- (dat / 255) - 0.5 -dat <- t(dat) -coords <- vis(dat, check=FALSE, - n_tree = 50, tree_th = 200, - K = 50, alpha = 2, max.iter = 4) -} - -} -\references{ -Jian Tang, Jingzhou Liu, Ming Zhang, Qiaozhu Mei. \href{https://arxiv.org/abs/1602.00370}{Visualizing Large-scale and High-dimensional Data.} -} - diff --git a/man/wiki.Rd b/man/wiki.Rd deleted file mode 100644 index 642f274..0000000 --- a/man/wiki.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/wiki.R -\docType{data} -\name{wiki} -\alias{wiki} -\title{Voting data on wikipedia from inception until January, 2008.} -\format{A symmetric sparse matrix in C-compressed format. Weights for present edges are either 1, -indicating that each node case a vote for the other, or 0.5. Nodes with fewer than 5 votes were -removed from the dataset.} -\source{ -\url{https://snap.stanford.edu/data/wiki-Vote.html} -} -\usage{ -wiki -} -\description{ -Voting data on wikipedia from inception until January, 2008. 
-} -\keyword{datasets} - diff --git a/src/Makevars b/src/Makevars index ac2f31e..c637c19 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,4 +1,4 @@ PKG_CFLAGS = $(SHLIB_OPENMP_CFLAGS) -PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) -PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(FLIBS) $(LAPACK_LIBS) $(BLAS_LIBS) +PKG_CXXFLAGS = $(SHLIB_OPENMP_CFLAGS) -DARMA_64BIT_WORD CXX_STD=CXX11 diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 1c08ed2..74bed6e 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -6,44 +6,119 @@ using namespace Rcpp; -// sgd -arma::mat sgd(arma::mat coords, const arma::vec& is, const NumericVector js, const NumericVector ps, const NumericVector ws, const double gamma, const double rho, const double minRho, const bool useWeights, const long nBatches, const int M, const double alpha, bool verbose); -RcppExport SEXP largeVis_sgd(SEXP coordsSEXP, SEXP isSEXP, SEXP jsSEXP, SEXP psSEXP, SEXP wsSEXP, SEXP gammaSEXP, SEXP rhoSEXP, SEXP minRhoSEXP, SEXP useWeightsSEXP, SEXP nBatchesSEXP, SEXP MSEXP, SEXP alphaSEXP, SEXP verboseSEXP) { +// dbscan_e +IntegerVector dbscan_e(arma::sp_mat& edges, double eps, int minPts, bool verbose); +RcppExport SEXP largeVis_dbscan_e(SEXP edgesSEXP, SEXP epsSEXP, SEXP minPtsSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; - Rcpp::traits::input_parameter< arma::mat >::type coords(coordsSEXP); - Rcpp::traits::input_parameter< const arma::vec& >::type is(isSEXP); - Rcpp::traits::input_parameter< const NumericVector >::type js(jsSEXP); - Rcpp::traits::input_parameter< const NumericVector >::type ps(psSEXP); - Rcpp::traits::input_parameter< const NumericVector >::type ws(wsSEXP); - Rcpp::traits::input_parameter< const double >::type gamma(gammaSEXP); - Rcpp::traits::input_parameter< const double >::type rho(rhoSEXP); - Rcpp::traits::input_parameter< const double >::type minRho(minRhoSEXP); - 
Rcpp::traits::input_parameter< const bool >::type useWeights(useWeightsSEXP); - Rcpp::traits::input_parameter< const long >::type nBatches(nBatchesSEXP); - Rcpp::traits::input_parameter< const int >::type M(MSEXP); - Rcpp::traits::input_parameter< const double >::type alpha(alphaSEXP); + Rcpp::traits::input_parameter< arma::sp_mat& >::type edges(edgesSEXP); + Rcpp::traits::input_parameter< double >::type eps(epsSEXP); + Rcpp::traits::input_parameter< int >::type minPts(minPtsSEXP); + Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); + __result = Rcpp::wrap(dbscan_e(edges, eps, minPts, verbose)); + return __result; +END_RCPP +} +// dbscan_ed +IntegerVector dbscan_ed(arma::sp_mat& edges, arma::mat& data, double eps, int minPts, bool verbose); +RcppExport SEXP largeVis_dbscan_ed(SEXP edgesSEXP, SEXP dataSEXP, SEXP epsSEXP, SEXP minPtsSEXP, SEXP verboseSEXP) { +BEGIN_RCPP + Rcpp::RObject __result; + Rcpp::RNGScope __rngScope; + Rcpp::traits::input_parameter< arma::sp_mat& >::type edges(edgesSEXP); + Rcpp::traits::input_parameter< arma::mat& >::type data(dataSEXP); + Rcpp::traits::input_parameter< double >::type eps(epsSEXP); + Rcpp::traits::input_parameter< int >::type minPts(minPtsSEXP); + Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); + __result = Rcpp::wrap(dbscan_ed(edges, data, eps, minPts, verbose)); + return __result; +END_RCPP +} +// dbscan_nd +IntegerVector dbscan_nd(arma::imat& neighbors, arma::mat& data, double eps, int minPts, bool verbose); +RcppExport SEXP largeVis_dbscan_nd(SEXP neighborsSEXP, SEXP dataSEXP, SEXP epsSEXP, SEXP minPtsSEXP, SEXP verboseSEXP) { +BEGIN_RCPP + Rcpp::RObject __result; + Rcpp::RNGScope __rngScope; + Rcpp::traits::input_parameter< arma::imat& >::type neighbors(neighborsSEXP); + Rcpp::traits::input_parameter< arma::mat& >::type data(dataSEXP); + Rcpp::traits::input_parameter< double >::type eps(epsSEXP); + Rcpp::traits::input_parameter< int >::type minPts(minPtsSEXP); + 
Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); + __result = Rcpp::wrap(dbscan_nd(neighbors, data, eps, minPts, verbose)); + return __result; +END_RCPP +} +// optics_e +List optics_e(arma::sp_mat& edges, double eps, int minPts, bool verbose); +RcppExport SEXP largeVis_optics_e(SEXP edgesSEXP, SEXP epsSEXP, SEXP minPtsSEXP, SEXP verboseSEXP) { +BEGIN_RCPP + Rcpp::RObject __result; + Rcpp::RNGScope __rngScope; + Rcpp::traits::input_parameter< arma::sp_mat& >::type edges(edgesSEXP); + Rcpp::traits::input_parameter< double >::type eps(epsSEXP); + Rcpp::traits::input_parameter< int >::type minPts(minPtsSEXP); + Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); + __result = Rcpp::wrap(optics_e(edges, eps, minPts, verbose)); + return __result; +END_RCPP +} +// optics_ed +List optics_ed(arma::sp_mat& edges, arma::mat& data, double eps, int minPts, bool verbose); +RcppExport SEXP largeVis_optics_ed(SEXP edgesSEXP, SEXP dataSEXP, SEXP epsSEXP, SEXP minPtsSEXP, SEXP verboseSEXP) { +BEGIN_RCPP + Rcpp::RObject __result; + Rcpp::RNGScope __rngScope; + Rcpp::traits::input_parameter< arma::sp_mat& >::type edges(edgesSEXP); + Rcpp::traits::input_parameter< arma::mat& >::type data(dataSEXP); + Rcpp::traits::input_parameter< double >::type eps(epsSEXP); + Rcpp::traits::input_parameter< int >::type minPts(minPtsSEXP); + Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); + __result = Rcpp::wrap(optics_ed(edges, data, eps, minPts, verbose)); + return __result; +END_RCPP +} +// optics_nd +List optics_nd(arma::imat& neighbors, arma::mat& data, double eps, int minPts, bool verbose); +RcppExport SEXP largeVis_optics_nd(SEXP neighborsSEXP, SEXP dataSEXP, SEXP epsSEXP, SEXP minPtsSEXP, SEXP verboseSEXP) { +BEGIN_RCPP + Rcpp::RObject __result; + Rcpp::RNGScope __rngScope; + Rcpp::traits::input_parameter< arma::imat& >::type neighbors(neighborsSEXP); + Rcpp::traits::input_parameter< arma::mat& >::type data(dataSEXP); + 
Rcpp::traits::input_parameter< double >::type eps(epsSEXP); + Rcpp::traits::input_parameter< int >::type minPts(minPtsSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); - __result = Rcpp::wrap(sgd(coords, is, js, ps, ws, gamma, rho, minRho, useWeights, nBatches, M, alpha, verbose)); + __result = Rcpp::wrap(optics_nd(neighbors, data, eps, minPts, verbose)); return __result; END_RCPP } +// silhouetteDbscan +void silhouetteDbscan(const arma::sp_mat& edges, NumericMatrix sil); +RcppExport SEXP largeVis_silhouetteDbscan(SEXP edgesSEXP, SEXP silSEXP) { +BEGIN_RCPP + Rcpp::RNGScope __rngScope; + Rcpp::traits::input_parameter< const arma::sp_mat& >::type edges(edgesSEXP); + Rcpp::traits::input_parameter< NumericMatrix >::type sil(silSEXP); + silhouetteDbscan(edges, sil); + return R_NilValue; +END_RCPP +} // searchTrees -arma::mat searchTrees(const int& threshold, const int& n_trees, const int& K, const int& max_recursion_degree, const int& maxIter, const arma::mat& data, const std::string& distMethod, bool verbose); -RcppExport SEXP largeVis_searchTrees(SEXP thresholdSEXP, SEXP n_treesSEXP, SEXP KSEXP, SEXP max_recursion_degreeSEXP, SEXP maxIterSEXP, SEXP dataSEXP, SEXP distMethodSEXP, SEXP verboseSEXP) { +arma::imat searchTrees(const int& threshold, const int& n_trees, const int& K, const int& maxIter, const arma::mat& data, const std::string& distMethod, bool verbose); +RcppExport SEXP largeVis_searchTrees(SEXP thresholdSEXP, SEXP n_treesSEXP, SEXP KSEXP, SEXP maxIterSEXP, SEXP dataSEXP, SEXP distMethodSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const int& >::type threshold(thresholdSEXP); Rcpp::traits::input_parameter< const int& >::type n_trees(n_treesSEXP); Rcpp::traits::input_parameter< const int& >::type K(KSEXP); - Rcpp::traits::input_parameter< const int& >::type max_recursion_degree(max_recursion_degreeSEXP); Rcpp::traits::input_parameter< const int& >::type 
maxIter(maxIterSEXP); Rcpp::traits::input_parameter< const arma::mat& >::type data(dataSEXP); Rcpp::traits::input_parameter< const std::string& >::type distMethod(distMethodSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); - __result = Rcpp::wrap(searchTrees(threshold, n_trees, K, max_recursion_degree, maxIter, data, distMethod, verbose)); + __result = Rcpp::wrap(searchTrees(threshold, n_trees, K, maxIter, data, distMethod, verbose)); return __result; END_RCPP } @@ -96,72 +171,76 @@ BEGIN_RCPP return __result; END_RCPP } -// distMatrixTowij -arma::sp_mat distMatrixTowij(const NumericVector is, const NumericVector js, const NumericVector xs, const NumericVector sigmas, const int N, bool verbose); -RcppExport SEXP largeVis_distMatrixTowij(SEXP isSEXP, SEXP jsSEXP, SEXP xsSEXP, SEXP sigmasSEXP, SEXP NSEXP, SEXP verboseSEXP) { +// referenceWij +arma::sp_mat referenceWij(const arma::ivec& i, const arma::ivec& j, arma::vec& d, double perplexity); +RcppExport SEXP largeVis_referenceWij(SEXP iSEXP, SEXP jSEXP, SEXP dSEXP, SEXP perplexitySEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; - Rcpp::traits::input_parameter< const NumericVector >::type is(isSEXP); - Rcpp::traits::input_parameter< const NumericVector >::type js(jsSEXP); - Rcpp::traits::input_parameter< const NumericVector >::type xs(xsSEXP); - Rcpp::traits::input_parameter< const NumericVector >::type sigmas(sigmasSEXP); - Rcpp::traits::input_parameter< const int >::type N(NSEXP); - Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); - __result = Rcpp::wrap(distMatrixTowij(is, js, xs, sigmas, N, verbose)); + Rcpp::traits::input_parameter< const arma::ivec& >::type i(iSEXP); + Rcpp::traits::input_parameter< const arma::ivec& >::type j(jSEXP); + Rcpp::traits::input_parameter< arma::vec& >::type d(dSEXP); + Rcpp::traits::input_parameter< double >::type perplexity(perplexitySEXP); + __result = Rcpp::wrap(referenceWij(i, j, d, perplexity)); return __result; 
END_RCPP } -// sigFunc -double sigFunc(const double sigma, const NumericVector x_i, const double perplexity); -RcppExport SEXP largeVis_sigFunc(SEXP sigmaSEXP, SEXP x_iSEXP, SEXP perplexitySEXP) { +// sgd +arma::mat sgd(arma::mat coords, arma::ivec& targets_i, arma::ivec& sources_j, IntegerVector& ps, NumericVector& weights, const double gamma, const double rho, const long long n_samples, const int M, const double alpha, const bool verbose); +RcppExport SEXP largeVis_sgd(SEXP coordsSEXP, SEXP targets_iSEXP, SEXP sources_jSEXP, SEXP psSEXP, SEXP weightsSEXP, SEXP gammaSEXP, SEXP rhoSEXP, SEXP n_samplesSEXP, SEXP MSEXP, SEXP alphaSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; - Rcpp::traits::input_parameter< const double >::type sigma(sigmaSEXP); - Rcpp::traits::input_parameter< const NumericVector >::type x_i(x_iSEXP); - Rcpp::traits::input_parameter< const double >::type perplexity(perplexitySEXP); - __result = Rcpp::wrap(sigFunc(sigma, x_i, perplexity)); + Rcpp::traits::input_parameter< arma::mat >::type coords(coordsSEXP); + Rcpp::traits::input_parameter< arma::ivec& >::type targets_i(targets_iSEXP); + Rcpp::traits::input_parameter< arma::ivec& >::type sources_j(sources_jSEXP); + Rcpp::traits::input_parameter< IntegerVector& >::type ps(psSEXP); + Rcpp::traits::input_parameter< NumericVector& >::type weights(weightsSEXP); + Rcpp::traits::input_parameter< const double >::type gamma(gammaSEXP); + Rcpp::traits::input_parameter< const double >::type rho(rhoSEXP); + Rcpp::traits::input_parameter< const long long >::type n_samples(n_samplesSEXP); + Rcpp::traits::input_parameter< const int >::type M(MSEXP); + Rcpp::traits::input_parameter< const double >::type alpha(alphaSEXP); + Rcpp::traits::input_parameter< const bool >::type verbose(verboseSEXP); + __result = Rcpp::wrap(sgd(coords, targets_i, sources_j, ps, weights, gamma, rho, n_samples, M, alpha, verbose)); return __result; END_RCPP } // searchTreesCSparse -arma::mat 
searchTreesCSparse(const int& threshold, const int& n_trees, const int& K, const int& max_recursion_degree, const int& maxIter, const arma::uvec& i, const arma::uvec& p, const arma::vec& x, const std::string& distMethod, bool verbose); -RcppExport SEXP largeVis_searchTreesCSparse(SEXP thresholdSEXP, SEXP n_treesSEXP, SEXP KSEXP, SEXP max_recursion_degreeSEXP, SEXP maxIterSEXP, SEXP iSEXP, SEXP pSEXP, SEXP xSEXP, SEXP distMethodSEXP, SEXP verboseSEXP) { +arma::mat searchTreesCSparse(const int& threshold, const int& n_trees, const int& K, const int& maxIter, const arma::uvec& i, const arma::uvec& p, const arma::vec& x, const std::string& distMethod, bool verbose); +RcppExport SEXP largeVis_searchTreesCSparse(SEXP thresholdSEXP, SEXP n_treesSEXP, SEXP KSEXP, SEXP maxIterSEXP, SEXP iSEXP, SEXP pSEXP, SEXP xSEXP, SEXP distMethodSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const int& >::type threshold(thresholdSEXP); Rcpp::traits::input_parameter< const int& >::type n_trees(n_treesSEXP); Rcpp::traits::input_parameter< const int& >::type K(KSEXP); - Rcpp::traits::input_parameter< const int& >::type max_recursion_degree(max_recursion_degreeSEXP); Rcpp::traits::input_parameter< const int& >::type maxIter(maxIterSEXP); Rcpp::traits::input_parameter< const arma::uvec& >::type i(iSEXP); Rcpp::traits::input_parameter< const arma::uvec& >::type p(pSEXP); Rcpp::traits::input_parameter< const arma::vec& >::type x(xSEXP); Rcpp::traits::input_parameter< const std::string& >::type distMethod(distMethodSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); - __result = Rcpp::wrap(searchTreesCSparse(threshold, n_trees, K, max_recursion_degree, maxIter, i, p, x, distMethod, verbose)); + __result = Rcpp::wrap(searchTreesCSparse(threshold, n_trees, K, maxIter, i, p, x, distMethod, verbose)); return __result; END_RCPP } // searchTreesTSparse -arma::mat searchTreesTSparse(const int& threshold, const 
int& n_trees, const int& K, const int& max_recursion_degree, const int& maxIter, const arma::uvec& i, const arma::uvec& j, const arma::vec& x, const std::string& distMethod, bool verbose); -RcppExport SEXP largeVis_searchTreesTSparse(SEXP thresholdSEXP, SEXP n_treesSEXP, SEXP KSEXP, SEXP max_recursion_degreeSEXP, SEXP maxIterSEXP, SEXP iSEXP, SEXP jSEXP, SEXP xSEXP, SEXP distMethodSEXP, SEXP verboseSEXP) { +arma::mat searchTreesTSparse(const int& threshold, const int& n_trees, const int& K, const int& maxIter, const arma::uvec& i, const arma::uvec& j, const arma::vec& x, const std::string& distMethod, bool verbose); +RcppExport SEXP largeVis_searchTreesTSparse(SEXP thresholdSEXP, SEXP n_treesSEXP, SEXP KSEXP, SEXP maxIterSEXP, SEXP iSEXP, SEXP jSEXP, SEXP xSEXP, SEXP distMethodSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< const int& >::type threshold(thresholdSEXP); Rcpp::traits::input_parameter< const int& >::type n_trees(n_treesSEXP); Rcpp::traits::input_parameter< const int& >::type K(KSEXP); - Rcpp::traits::input_parameter< const int& >::type max_recursion_degree(max_recursion_degreeSEXP); Rcpp::traits::input_parameter< const int& >::type maxIter(maxIterSEXP); Rcpp::traits::input_parameter< const arma::uvec& >::type i(iSEXP); Rcpp::traits::input_parameter< const arma::uvec& >::type j(jSEXP); Rcpp::traits::input_parameter< const arma::vec& >::type x(xSEXP); Rcpp::traits::input_parameter< const std::string& >::type distMethod(distMethodSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); - __result = Rcpp::wrap(searchTreesTSparse(threshold, n_trees, K, max_recursion_degree, maxIter, i, j, x, distMethod, verbose)); + __result = Rcpp::wrap(searchTreesTSparse(threshold, n_trees, K, maxIter, i, j, x, distMethod, verbose)); return __result; END_RCPP } diff --git a/src/dbscan.cpp b/src/dbscan.cpp new file mode 100644 index 0000000..26d1c4c --- /dev/null +++ b/src/dbscan.cpp @@ 
-0,0 +1,486 @@ +// [[Rcpp::plugins(openmp)]] +// [[Rcpp::plugins(cpp11)]] +// [[Rcpp::depends(RcppArmadillo)]] +// [[Rcpp::depends(RcppProgress)]] +#include "largeVis.h" +#include + +using namespace Rcpp; +using namespace std; +using namespace arma; + +typedef pair iddist; + +class CompareDist { +public: + bool operator()(iddist n1, iddist n2) { + return n1.second > n2.second; + } +}; + +typedef std::priority_queue, + CompareDist> NNheap; +typedef std::vector NNlist; + +/* + * The code below is based on code which bears the following copyright notice; portions may + * remain subject to it: +####################################################################### +# dbscan - Density Based Clustering of Applications with Noise +# and Related Algorithms +# Copyright (C) 2015 Michael Hahsler + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +class Cluster { +protected: + const long long N; + int K; + const double radius2; + const int minPts; + int D; + arma::mat* data = NULL; + arma::imat* neighbors = NULL; + arma::sp_mat* edges = NULL; + bool hasEdges; + bool hasData; + + std::vector visited; + + Progress progress; + + inline double dist2(const double * x_i, + long long id) const { + double dist = 0; + const double * x_j = data->colptr(id); + for (int i = 0; i != D; i++) dist += (x_i[i] - x_j[i]) * (x_i[i] - x_j[i]); + return dist; + } + + Cluster(arma::imat& neighbors, + double eps, + int minPts, + bool verbose) : N(neighbors.n_cols), + K(neighbors.n_rows), + radius2{eps * eps}, + minPts{minPts}, + neighbors{&neighbors}, + hasEdges(false), + hasData(false), + visited(std::vector(N, false)), + progress(Progress(N, verbose)) {} + + Cluster(arma::sp_mat& edges, + double eps, + int minPts, + bool verbose) : N(edges.n_cols), + radius2{eps * eps}, minPts{minPts}, + edges{&edges}, + hasEdges(true), + hasData(false), + visited(std::vector(N, false)), + progress(Progress(N, verbose)) + { } + + void frNNrecurse(double distStart, + const double * x_i, + long long id, + NNheap& found, + std::set& checked) const { + double dist; + for (auto it = neighbors ->begin_col(id); + it != neighbors->end_col(id) && *it != -1; + it++) { + if (checked.insert(*it).second) { + dist = dist2(x_i, *it); + if (dist < radius2) { + found.push(iddist(*it, dist)); + frNNrecurse(distStart + dist, x_i, *it, found, checked); + } + } + } + } + + void frNNrecurseEdge(double distStart, + const double * x_i, + long long id, + NNheap& found, + std::set& checked) const { + double dist; + for (auto it = edges -> begin_col(id); + it != edges -> end_col(id); + it++) { + if (checked.insert(it.row()).second && distStart + *it < radius2) { + dist = dist2(x_i, it.row()); + if (dist < radius2) { + found.push(iddist(it.row(), dist)); + frNNrecurseEdge(distStart + dist, + x_i, + it.row(), + found, + checked); + } + } + } + } + + NNlist 
fixedRadiusNearestNeighbors(long long id) const { + NNheap found = NNheap(); + std::set checked = std::set(); + + found.push(iddist(id, 0)); + checked.insert(id); + + const double * x_i = (hasData) ? data->colptr(id) : NULL; + if (hasEdges) { + auto end = edges -> end_col(id); + for (auto it = edges -> begin_col(id); it != end; it++) { + if (checked.insert(it.row()).second) { + found.push(iddist(it.row(), *it)); + if (hasData)frNNrecurseEdge(0, x_i, it.row(), found, checked); + } + } + } else for (auto it = neighbors->begin_col(id); + it != neighbors->end_col(id) && *it != -1; + it++) { + frNNrecurse(0, x_i, *it, found, checked); + } + + NNlist ret = NNlist(); + while (! found.empty()) { + ret.push_back(found.top()); + found.pop(); + } + return ret; + } + +public: + void setData(arma::mat& data) { + this -> data = &data; + D = data.n_rows; + hasData = true; + } + virtual IntegerVector run() = 0; +}; + +class DBSCAN : public Cluster { +public: + DBSCAN( arma::imat& neighbors, + double eps, + int minPts, + bool verbose) : Cluster(neighbors, eps, minPts, verbose) {} + + DBSCAN( arma::SpMat& edges, + double eps, + int minPts, + bool verbose) : Cluster(edges, eps, minPts, verbose) {} + + IntegerVector run() { + if (hasData + hasEdges < 1) stop("Need either data or edges."); + std::vector< std::vector > clusters; // vector of vectors == list + NNlist frNN1, frNN2; + + for (long long n = 0; n != N; n++) if (progress.increment()) { + if (visited[n]) continue; + + // start new cluster and expand + frNN1 = fixedRadiusNearestNeighbors(n); + if (frNN1.size() < minPts) continue; + std::vector cluster; + cluster.push_back(n); + visited[n] = true; + + while (!frNN1.empty()) { + long long j = frNN1.back().first; + frNN1.pop_back(); + + if (visited[j]) continue; // point already processed + visited[j] = true; + + frNN2 = fixedRadiusNearestNeighbors(j); + + if (frNN2.size() >= minPts) { // expand neighborhood + copy(frNN2.begin(), frNN2.end(), + back_inserter(frNN1)); + 
cluster.push_back(j); + } + } + clusters.push_back(cluster); + } + IntegerVector ret = IntegerVector(N,0); + for (int c = 0; c < clusters.size(); c++) { + std::vector cluster = clusters[c]; + for (int j=0; j < cluster.size(); j++) ret[cluster[j]] = c + 1 ; + } + return ret; + } +}; + +class OPTICS : public Cluster { +protected: + std::vector orderedPoints, seeds; + std::vector ds; +public: + std::vector reachdist, coredist; + OPTICS( arma::imat& neighbors, + double eps, + int minPts, + bool verbose) : Cluster(neighbors, eps, minPts, verbose), + orderedPoints(std::vector()), + seeds(std::vector()), + ds(std::vector()), + reachdist(std::vector(N, INFINITY)), + coredist(std::vector(N, INFINITY)) { + orderedPoints.reserve(N); + } + + OPTICS( arma::SpMat& edges, + double eps, + int minPts, + bool verbose) : Cluster(edges, eps, minPts, verbose), + orderedPoints(std::vector()), + seeds(std::vector()), + ds(std::vector()), + reachdist(std::vector(N, INFINITY)), + coredist(std::vector(N, INFINITY)) { + orderedPoints.reserve(N); + } + + void update(NNlist& frNeighbors, + long long p) { + + std::vector::iterator pos_seeds; + long double newreachdist; + long long o; + long double o_d; + + while(!frNeighbors.empty()) { + o = frNeighbors.back().first; + o_d = frNeighbors.back().second; + frNeighbors.pop_back(); + + if(visited[o]) continue; + + newreachdist = std::max(coredist[p], o_d); + + if(reachdist[o] == INFINITY) { + reachdist[o] = newreachdist; + seeds.push_back(o); + } else if(newreachdist < reachdist[o]) reachdist[o] = newreachdist; + } + } + + IntegerVector run() { + if (hasData + hasEdges < 1) stop("Need either data or edges."); + NNlist frNeighbors; + for (long long n = 0; n < N; n++) if (progress.increment()) { + if (visited[n]) continue; + + frNeighbors = fixedRadiusNearestNeighbors(n); + visited[n] = true; + + // find core distance + if(frNeighbors.size() >= (size_t) minPts) + coredist[n] = frNeighbors[minPts-1].second; + + orderedPoints.push_back(n); + + if 
(coredist[n] == INFINITY) continue; // core-dist is undefined + + // update + update(frNeighbors, n); + + long long q; + while (!seeds.empty()) { + // get smallest dist (to emulate priority queue). All should have already + // a reachability distance ::iterator q_it = seeds.begin(); + for (std::vector::iterator it = seeds.begin(); + it!=seeds.end(); ++it) if (reachdist[*it] < reachdist[*q_it] || + (reachdist[*it] == reachdist[*q_it] && + *q_it < *it)) q_it = it; + q = *q_it; + seeds.erase(q_it); + + //N2 = regionQueryDist(q, dataPts, kdTree, eps2, approx); + frNeighbors = fixedRadiusNearestNeighbors(q); + visited[q] = true; + + // update core distance + if(frNeighbors.size() >= (size_t) minPts) { + coredist[q] = frNeighbors[minPts-1].second; + } + + orderedPoints.push_back(q); + + if(frNeighbors.size() < (size_t) minPts) continue; // == q has no core dist. + + // update seeds + update(frNeighbors, q); + } + } + return IntegerVector(orderedPoints.begin(), orderedPoints.end())+1; + } +}; + +// [[Rcpp::export]] +IntegerVector dbscan_e(arma::sp_mat& edges, + double eps, + int minPts, + bool verbose) { + DBSCAN db = DBSCAN(edges, eps, minPts, verbose); + return db.run(); +} + +// [[Rcpp::export]] +IntegerVector dbscan_ed(arma::sp_mat& edges, + arma::mat& data, + double eps, + int minPts, + bool verbose) { + DBSCAN db = DBSCAN(edges, eps, minPts, verbose); + db.setData(data); + return db.run(); +} + +// [[Rcpp::export]] +IntegerVector dbscan_nd(arma::imat& neighbors, + arma::mat& data, + double eps, + int minPts, + bool verbose) { + DBSCAN db = DBSCAN(neighbors, eps, minPts, verbose); + db.setData(data); + return db.run(); +} + +List optics_assemble(OPTICS& opt) { + List ret; + IntegerVector vec = opt.run(); + ret["order"] = vec; + ret["reachdist"] = sqrt(NumericVector(opt.reachdist.begin(), opt.reachdist.end())); + ret["coredist"] = sqrt(NumericVector(opt.coredist.begin(), opt.coredist.end())); + + return ret; +} + +// [[Rcpp::export]] +List optics_e(arma::sp_mat& 
edges, + double eps, + int minPts, + bool verbose) { + OPTICS opt = OPTICS(edges, eps, minPts, verbose); + return optics_assemble(opt); +} + +// [[Rcpp::export]] +List optics_ed(arma::sp_mat& edges, + arma::mat& data, + double eps, + int minPts, + bool verbose) { + OPTICS opt = OPTICS(edges, eps, minPts, verbose); + opt.setData(data); + return optics_assemble(opt); +} + +// [[Rcpp::export]] +List optics_nd(arma::imat& neighbors, + arma::mat& data, + double eps, + int minPts, + bool verbose) { + OPTICS opt = OPTICS(neighbors, eps, minPts, verbose); + opt.setData(data); + return optics_assemble(opt); +} + +// [[Rcpp::export]] +void silhouetteDbscan(const arma::sp_mat& edges, + NumericMatrix sil) { + long long N = edges.n_cols; + const NumericVector clusters = sil.column(0); + int K = max(clusters) + 1; // n clusters + long long* counts = new long long[K]; + for (int k = 0; k != K; k++) counts[k] = 0; + double* diC = new double[K * N]; + for (long long kn = 0; kn != K * N; kn++) diC[kn] = 0; + + /* + * Loop through points, accumulating for each cluster the total distance of the point + * to all points in that cluster. Distances to points in the noise cluster are ignored. + * (Distances from points in the noise cluster, are tracked.) + * + * Using a kNN matrix to start, we don't have all N^2 distances. When a distance is missing, + * we use 2 * the distance from that point to its furthest known nearest neighbor. + */ + for (long long n = 0; n != N; n++) { + long long last = 0; + auto end = edges.end_col(n); + int ns_cluster = (int) clusters[n]; + counts[ns_cluster]++; + double* diN = diC + (K * n); + double maxD = edges.col(n).max(); + for (auto it = edges.begin_col(n); + it != end; + it++) { + while (last != it.row()) { + if (clusters[last] != 0 || ns_cluster == 0) { + double d = edges(n, it.row()); + d = (d == 0) ? 
maxD : d; + diN[(int) clusters[last]] += d; + } + last++; + } + if (it.row() == n) continue; + if (clusters[last] != 0 || ns_cluster == 0) diN[(int) clusters[last]] += *it; + } + while (last != N) { + if (clusters[last] != 0 || ns_cluster == 0) diN[(int) clusters[last]] += maxD; + last++; + } + } + + for (long long n = 0; n != N; n++) { + const int ns_cluster = (int) clusters[n]; + bool computeSn = TRUE; + double* diN = diC + (K * n); + + for (long long k = 0; k != K; k++) { + if (k == ns_cluster) { + if (counts[k] == 1) computeSn = FALSE; + else diN[k] = diN[k] / (counts[k] - 1); + } else if (ns_cluster != 0) diN[k] = diN[k] / counts[k]; + } + const double an = diN[ns_cluster]; + /* + * Iterate through clusters to find nearest cluster-neighbor. + * Start by assuming neighbor of cluster 1 is 2, and of all other clusters is 1. + */ + int candidateNeighbor = (ns_cluster == 1) ? 2 : 1; + double bn = diN[candidateNeighbor]; + for (int k = 1; k != K; k++) if (k != ns_cluster && diN[k] < bn) { + bn = diN[k]; + candidateNeighbor = k; + } + sil(n, 1) = candidateNeighbor; + if (counts[ns_cluster] == 1) bn = 0; + else bn = (bn - an) / std::max(bn, an); + sil(n, 2) = bn; + } +} + diff --git a/src/denseneighbors.cpp b/src/denseneighbors.cpp new file mode 100644 index 0000000..8ea7c5c --- /dev/null +++ b/src/denseneighbors.cpp @@ -0,0 +1,301 @@ +// [[Rcpp::plugins(openmp)]] +// [[Rcpp::plugins(cpp11)]] +// [[Rcpp::depends(RcppArmadillo)]] +// [[Rcpp::depends(RcppProgress)]] +#include "largeVis.h" + +using namespace Rcpp; +using namespace std; +using namespace arma; + +/* +* When a leaf node is found, store the identity of the neighbors +* along with each vertex. 
+*/ +void addNeighbors(const arma::ivec& indices, + Neighborhood* heap[], + const int I) { + ivec neighbors = ivec(indices); + Neighborhood tmpStorage = Neighborhood(); + ivec::iterator newEnd = neighbors.end(); +#ifdef _OPENMP +#pragma omp critical +#endif +{ + for (ivec::iterator it = neighbors.begin(); + it != newEnd; + it++) { + tmpStorage.clear(); + tmpStorage.swap(*heap[*it]); + heap[*it] -> reserve(tmpStorage.size() + I); + ivec::iterator newIt = neighbors.begin(); + vector::iterator oldIt = tmpStorage.begin(); + vector::iterator oldEnd = tmpStorage.end(); + int last; + int best = -1; + while (oldIt != oldEnd || newIt != newEnd) { + if (oldIt == oldEnd) best = *newIt++; + else if (newIt == newEnd) best = *oldIt++; + else best = (*newIt < *oldIt) ? *newIt++ : *oldIt++; + if (best == last || best == *it) continue; + heap[*it] -> push_back(best); + last = best; + } + } +} +} + +arma::vec hyperplane(const arma::ivec& indices, + const arma::mat& data, + const int I) { + vec direction = vec(indices.size()); + int x1idx, x2idx; + vec v; + vec m; + do { + const vec selections = randu(2) * (I - 1); + x1idx = indices[selections[0]]; + x2idx = indices[selections[1]]; + if (x1idx == x2idx) x2idx = indices[((int)selections[1] + 1) % indices.size()]; + const vec x2 = data.col(x2idx); + const vec x1 = data.col(x1idx); + // Get hyperplane + m = (x1 + x2) / 2; // Base point of hyperplane + const vec d = x1 - x2; + v = d / as_scalar(norm(d, 2)); // unit vector + } while (x1idx == x2idx); + + for (int i = 0; i < indices.size(); i++) { + const vec X = data.col(indices[i]); + direction[i] = dot((X - m), v); + } + return direction; +} + +/* +* The recursive function for the annoy neighbor search +* algorithm. Partitions space by a random hyperplane, +* and calls itself recursively (twice) on each side. 
+* +* If called with fewer nodes than the threshold, +* +*/ +void searchTree(const int& threshold, + const arma::ivec& indices, + const arma::mat& data, + Neighborhood* heap[], + Progress& progress) { + const int I = indices.size(); + // const int D = data.n_rows; + if (progress.check_abort()) return; + if (I < 2) stop("Tree split failure."); + if (I <= threshold) { + addNeighbors(indices, heap, I); + progress.increment(I); + return; + } + vec direction = hyperplane(indices, data, I); + const double middle = median(direction); + const uvec left = find(direction > middle); + const uvec right = find(direction <= middle); + + if (left.size() >= 2 && right.size() >= 2) { + searchTree(threshold, indices(left), data, heap, progress); + searchTree(threshold, indices(right), data, heap, progress); + } else { // Handles the rare case where the split fails because of equidistant points + searchTree(threshold, indices.subvec(0, indices.size() / 2), data, heap, progress); + searchTree(threshold, indices.subvec(indices.size() / 2, indices.size() - 1), data, heap, progress); + } +}; + +Neighborhood** createNeighborhood(int N) { + Neighborhood** treeNeighborhoods = new Neighborhood*[N]; + for (int i = 0; i < N; i++) { + int seed[] = {i}; + treeNeighborhoods[i] = new vector(seed, seed + sizeof(seed) / sizeof(int)); + } + return treeNeighborhoods; +} + +void copyHeapToMatrix(set* tree, + const int K, + const int i, + arma::imat& knns) { + set::iterator sortIterator = tree -> begin(); + set::iterator end = tree -> end(); + int j = 0; + while (sortIterator != end) knns(j++, i) = *sortIterator++; + if (j == 0) stop("Tree failure."); + while (j < K) knns(j++, i) = -1; +} + +void addDistance(const arma::vec& x_i, + const arma::mat& data, + const int j, + MaxHeap& heap, + const int K, + double (*distanceFunction)(const arma::vec& x_i, const arma::vec& x_j)) { + const double d = distanceFunction(x_i, data.col(j)); + if (d != 0) { + heap.emplace(d, j); + if (heap.size() > K) heap.pop(); + 
} +} + +void heapToSet(MaxHeap& heap, set* set) { + while (! heap.empty()) { + set -> emplace(heap.top().n); + heap.pop(); + } +} + +arma::imat annoy(const int n_trees, + const int threshold, + const arma::mat& data, + const int N, + const int K, + double (*distanceFunction)(const arma::vec& x_i, const arma::vec& x_j), + Progress& p) { + set** treeHolder = new set*[N]; + Neighborhood** treeNeighborhoods = createNeighborhood(N); + const ivec indices = regspace(0, N - 1); + +#ifdef _OPENMP +#pragma omp parallel for shared(treeNeighborhoods) +#endif + for (int t = 0; t < n_trees; t++) if (! p.check_abort()) + searchTree(threshold, + indices, + data, + treeNeighborhoods, + p + ); + if (p.check_abort()) return imat(K, N); + // Reduce size from threshold * n_trees to top K, and sort + MaxHeap thisHeap = MaxHeap(); +#ifdef _OPENMP +#pragma omp parallel for shared(treeHolder, treeNeighborhoods) private(thisHeap) +#endif + for (int i = 0; i < N; i++) if (p.increment()) { + const vec x_i = data.col(i); + vector *neighborhood = treeNeighborhoods[i]; + for (vector::iterator j = neighborhood -> begin(); + j != neighborhood -> end(); + j++) + addDistance(x_i, data, *j, thisHeap, K, distanceFunction); + delete treeNeighborhoods[i]; + treeHolder[i] = new set(); + heapToSet(thisHeap, treeHolder[i]); + } + // Copy sorted neighborhoods into matrix. This is faster than + // sorting in-place. 
+ imat knns = imat(K,N); +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int i = 0; i < N; i++) if (p.increment()) { + copyHeapToMatrix(treeHolder[i], K, i, knns); + delete treeHolder[i]; + } + return knns; +} + +// [[Rcpp::export]] +arma::imat searchTrees(const int& threshold, + const int& n_trees, + const int& K, + const int& maxIter, + const arma::mat& data, + const std::string& distMethod, + bool verbose) { + + const int N = data.n_cols; + + double (*distanceFunction)(const arma::vec& x_i, const arma::vec& x_j); + if (distMethod.compare(std::string("Euclidean")) == 0) distanceFunction = relDist; + else if (distMethod.compare(std::string("Cosine")) == 0) distanceFunction = cosDist; + else distanceFunction = relDist; + + Progress p((N * n_trees) + (2 * N) + (N * maxIter), verbose); + + imat knns; + { + mat dataMat; + if (distMethod.compare(std::string("Cosine")) == 0) dataMat = normalise(data); + else dataMat = data; + knns = annoy(n_trees, + threshold, + dataMat, + N, + K, + distanceFunction, + p); + } + + if (p.check_abort()) return imat(0); + imat old_knns = imat(K,N); + for (int T = 0; T < maxIter; T++) if (! 
p.check_abort()) { + imat tmp = old_knns; + old_knns = knns; + knns = tmp; + MaxHeap thisHeap = MaxHeap(); + set sorter = set(); +#ifdef _OPENMP +#pragma omp parallel for shared(old_knns, knns) private(thisHeap, sorter) +#endif + for (int i = 0; i < N; i++) if (p.increment()) { + const vec x_i = data.col(i); + + PositionVector positions = PositionVector(), ends = PositionVector(); + positions.reserve(K + 1); ends.reserve(K + 1); + + positions.push_back(old_knns.begin_col(i)); + ends.push_back(old_knns.end_col(i)); + + for (imat::col_iterator it = old_knns.begin_col(i); + it != ends[0] && *it != -1; + it++) { + positions.push_back(old_knns.begin_col(*it)); + ends.push_back(old_knns.end_col(*it)); + } + + int lastOne = N + 1; + // This is a K + 1 vector merge sort running in O(K * N) + PositionVector::iterator theEnd = positions.end(); + while (true) { + imat::col_iterator whch = 0; + + for (pair< PositionVector::iterator, + PositionVector::iterator > it(positions.begin(), + ends.begin()); + it.first != theEnd; + it.first++, it.second++) while (*it.first != *it.second) { // For each neighborhood, keep going until + // we find a non-dupe or get to the end + + if (**it.first == -1) advance(*it.first, distance(*it.first, *it.second)); + else if (**it.first == i || **it.first == lastOne) advance(*it.first, 1); + else if (whch == 0 || **it.first < *whch) { + whch = *it.first; + break; + } else break; + } + if (whch == 0) break; + lastOne = *whch; + advance(whch, 1); + + addDistance(x_i, data, lastOne, thisHeap, K, distanceFunction); + } + + sorter.clear(); + heapToSet(thisHeap, &sorter); + + set::iterator sortIterator = sorter.begin(); + int j = 0; + while (sortIterator != sorter.end()) knns(j++, i) = *sortIterator++; + if (j == 0) stop("Neighbor exploration failure."); + while (j < K) knns(j++,i) = -1; + } + } + return knns; +}; + diff --git a/src/distance.cpp b/src/distance.cpp new file mode 100644 index 0000000..366825a --- /dev/null +++ b/src/distance.cpp @@ -0,0 
+1,111 @@ +// [[Rcpp::plugins(openmp)]] +// [[Rcpp::plugins(cpp11)]] +// [[Rcpp::depends(RcppArmadillo)]] +// [[Rcpp::depends(RcppProgress)]] +#include "largeVis.h" + +double relDist(const arma::vec& i, const arma::vec& j) { + const int lim = i.n_elem; + double cnt = 0; + for (int idx = 0; idx < lim; idx++) cnt += ((i[idx] - j[idx]) * (i[idx] - j[idx])); + return cnt; +} +// Vanilla euclidean +double dist(const arma::vec& i, const arma::vec& j) { + return sqrt(relDist(i,j)); +} + +// Vanilla cosine distance calculation +double cosDist(const arma::vec& i, const arma::vec& j) { + int D = i.n_elem; + double pp = 0, qq = 0, pq = 0; + for (int d = 0; d < D; d++) { + pp += (i[d]) * (i[d]); + qq += (j[d]) * (j[d]); + pq += (i[d]) * (j[d]); + } + double ppqq = pp * qq; + if (ppqq > 0) return 2.0 - 2.0 * pq / sqrt(ppqq); + else return 2.0; // cos is 0 +} +// Versions of the distance functions for finding the neighbors +// of sparse matrices. Not optimized. +double sparseDist(const sp_mat& i, const sp_mat& j) { + return as_scalar(sqrt(sum(square(i - j)))); +} +double sparseCosDist(const sp_mat& i, const sp_mat& j) { + return 2.0 - 2.0 * (as_scalar((dot(i,j)) / as_scalar(norm(i,2) * norm(j,2)))); +} +double sparseRelDist(const sp_mat& i, const sp_mat& j) { + return as_scalar(sum(square(i - j))); +} + +/* + * Fast calculation of pairwise distances with the result stored in a pre-allocated vector. 
+ */ +// [[Rcpp::export]] +arma::vec fastDistance(const NumericVector is, + const NumericVector js, + const arma::mat& data, + const std::string& distMethod, + bool verbose) { + + Progress p(is.size(), verbose); + vec xs = vec(is.size()); + double (*distanceFunction)(const arma::vec& x_i, const arma::vec& x_j); + if (distMethod.compare(std::string("Euclidean")) == 0) distanceFunction = dist; + else if (distMethod.compare(std::string("Cosine")) == 0) distanceFunction = cosDist; +#ifdef _OPENMP +#pragma omp parallel for shared (xs) +#endif + for (int i=0; i < is.length(); i++) if (p.increment()) xs[i] = + distanceFunction(data.col(is[i]), data.col(js[i])); + return xs; +}; + +arma::vec fastSparseDistance(const arma::vec& is, + const arma::vec& js, + const sp_mat& data, + const std::string& distMethod, + bool verbose) { + + Progress p(is.size(), verbose); + vec xs = vec(is.size()); + double (*distanceFunction)( + const sp_mat& x_i, + const sp_mat& x_j); + if (distMethod.compare(std::string("Euclidean")) == 0) distanceFunction = sparseDist; + else if (distMethod.compare(std::string("Cosine")) == 0) distanceFunction = sparseCosDist; +#ifdef _OPENMP +#pragma omp parallel for shared (xs) +#endif + for (int i=0; i < is.size(); i++) if (p.increment()) xs[i] = + distanceFunction(data.col(is[i]), data.col(js[i])); + return xs; +}; + +// [[Rcpp::export]] +arma::vec fastCDistance(const arma::vec& is, + const arma::vec& js, + const arma::uvec& i_locations, + const arma::uvec& p_locations, + const arma::vec& x, + const std::string& distMethod, + bool verbose) { + const int N = p_locations.size() - 1; + const sp_mat data = sp_mat(i_locations, p_locations, x, N, N); + return fastSparseDistance(is,js,data,distMethod,verbose); +} + +// [[Rcpp::export]] +arma::vec fastSDistance(const arma::vec& is, + const arma::vec& js, + const arma::uvec& i_locations, + const arma::uvec& j_locations, + const arma::vec& x, + const std::string& distMethod, + bool verbose) { + const umat locations = 
join_cols(i_locations, j_locations); + const sp_mat data = sp_mat(locations, x); + return fastSparseDistance(is,js,data,distMethod,verbose); +} diff --git a/src/edgeweights.cpp b/src/edgeweights.cpp new file mode 100644 index 0000000..11beae5 --- /dev/null +++ b/src/edgeweights.cpp @@ -0,0 +1,152 @@ +// [[Rcpp::plugins(openmp)]] +// [[Rcpp::plugins(cpp11)]] +// [[Rcpp::depends(RcppArmadillo)]] +// [[Rcpp::depends(RcppProgress)]] +#include "largeVis.h" + +using namespace Rcpp; +using namespace std; +using namespace arma; + +class ReferenceEdges { +protected: + // arma::vec sigmas; + const double perplexity; + const long long n_edges; + const int n_vertices; + std::vector edge_from, edge_to, head, next, reverse; + std::vector edge_weight; + +public: + ReferenceEdges(double perplexity, + const arma::ivec& from, + const arma::ivec& to, + const arma::vec& weights) : perplexity{perplexity}, + n_edges(from.size()), + n_vertices(from[(long) n_edges - 1] + 1), + edge_from(std::vector()), + edge_to(std::vector()), + head(std::vector()), + next(std::vector()), + reverse(std::vector()), + edge_weight(std::vector()) { + // sigmas = vec(n_vertices); + long n_edge = 0; + for (int i = 0; i < n_vertices; i++) head.push_back(-1); + for (int x = 0; x < n_vertices; x++) { + while (from[n_edge] == x) { + edge_from.push_back(x); + edge_to.push_back(to[n_edge]); + edge_weight.push_back(weights[n_edge] * weights[n_edge]); + next.push_back(head[x]); + reverse.push_back(-1); + head[x] = n_edge++; + } + } + } + void similarityOne(long id) { + double beta, lo_beta, hi_beta, sum_weight, H, tmp; + long p; + beta = 1; + lo_beta = hi_beta = -1; + + for (int iter = 0; iter < 200; ++iter) { + H = sum_weight = 0; + for (p = head[id]; p >= 0; p = next[p]) { + sum_weight += tmp = exp(-beta * edge_weight[p]); + H += beta * (edge_weight[p] * tmp); + } + H = (H / sum_weight) + log(sum_weight); + if (fabs(H - log(perplexity)) < 1e-5) break; + if (H > log(perplexity)) { + lo_beta = beta; + if (hi_beta < 0) 
beta *= 2; else beta = (beta + hi_beta) / 2; + } else { + hi_beta = beta; + if (lo_beta < 0) beta /= 2; else beta = (lo_beta + beta) / 2; + } + } + for (p = head[id], sum_weight = 0; p >= 0; p = next[p]) { + sum_weight += edge_weight[p] = exp(-beta * edge_weight[p]); + } + for (p = head[id]; p >= 0; p = next[p]){ + edge_weight[p] /= sum_weight; + } + // sigmas[id] = beta; + } + + void searchReverse(int id) { + long long y, p, q; + for (p = head[id]; p >= 0; p = next[p]) { + y = edge_to[p]; + for (q = head[id]; q >= 0; q = next[q]) { + if (edge_to[q] == id) break; + } + reverse[p] = q; + } + } + + void run() { + #pragma omp parallel for + for (int id = 0; id < n_vertices; id++) { + similarityOne(id); + } + #pragma omp parallel for + for (int id = 0; id < n_vertices; id++) { + searchReverse(id); + } + long long n_edge = edge_to.size(); + double sum_weight = 0; + for (int id = 0; id != n_vertices; id++) { + for (long long p = head[id]; p >= 0; p = next[p]) { + long long y = edge_to[p]; + long long q = reverse[p]; + if (q == -1) { + edge_from.push_back(y); + edge_to.push_back(id); + edge_weight.push_back(0); + next.push_back(head[y]); + reverse.push_back(p); + q = reverse[p] = head[y] = n_edge++; + } + if (id > y){ + sum_weight += edge_weight[p] + edge_weight[q]; + edge_weight[p] = edge_weight[q] = (edge_weight[p] + edge_weight[q]) / 2; + } + } + } + } + + arma::sp_mat getWIJ() { + umat locations = umat(2, edge_from.size()); + vec values = vec(edge_weight.size()); + for (long long i = 0; i < edge_from.size(); i++) { + locations(0, i) = edge_from[i]; + locations(1, i) = edge_to[i]; + values[i] = edge_weight[i]; + } + sp_mat wij = sp_mat( + true, // add_values + locations, + values, + n_vertices, n_vertices // n_col and n_row + ); + return wij; + } + + // arma::vec getSigmas() { + // return sigmas; + // } +}; + +// [[Rcpp::export]] +arma::sp_mat referenceWij(const arma::ivec& i, + const arma::ivec& j, + arma::vec& d, + double perplexity) { + ReferenceEdges ref = 
ReferenceEdges(perplexity, i, j, d); + // vec sigmas = ref.getSigmas(); + ref.run(); + return ref.getWIJ(); + // return wij; +} diff --git a/src/gradients.cpp b/src/gradients.cpp new file mode 100644 index 0000000..3584c68 --- /dev/null +++ b/src/gradients.cpp @@ -0,0 +1,123 @@ +// [[Rcpp::plugins(openmp)]] +// [[Rcpp::plugins(cpp11)]] +// [[Rcpp::depends(RcppArmadillo)]] +// [[Rcpp::depends(RcppProgress)]] +#include "largeVis.h" + +/* + * Efficient clamp + */ + +// #ifdef __SSE2__ +// #include +// #include +// double clamp ( double val, double minval, double maxval ){ +// __builtin_ia32_storesd( &val, __builtin_ia32_minsd( __builtin_ia32_maxsd(__builtin_ia32_loadupd(&val), +// __builtin_ia32_loadupd(&minval)), +// __builtin_ia32_loadupd(&maxval))); +// return val; +// } +// #else +// inline double max(double val, double maxval) { +// return (val > maxval) ? maxval : val; +// } +// inline double min(double val, double minval) { +// return (val < minval) ? minval : val; +// } +// double clamp(double val, double cap) { +// return min(max(val, -cap), cap); +// } +// #endif + + + + +// Parent class +void Gradient::positiveGradient(const double* i, + const double* j, + double* holder) const { + const double dist_squared = distAndVector(i, j, holder); + _positiveGradient(dist_squared, holder); +} +void Gradient::negativeGradient(const double* i, + const double* k, + double* holder) const { + const double dist_squared = distAndVector(i, k, holder); + _negativeGradient(dist_squared, holder); +} +// Copies the vector sums into a vector while it computes distance^2 - +// useful in calculating the gradients during SGD +inline double Gradient::distAndVector(const double *x_i, + const double *x_j, + double *output) const { + double cnt = 0; + for (int d = 0; d < D; d++) { + double t = x_i[d] - x_j[d]; + output[d] = t; + cnt += t * t; + } + return cnt; +} + +inline double Gradient::clamp(double val) const { + return fmin(fmax(val, -cap), cap); +} + +inline void 
Gradient::multModify(double *col, const double adj) const { + for (int i = 0; i != D; i++) col[i] = clamp(col[i] * adj); +} + +/* + * Generalized gradient with an alpha parameter + */ + +void AlphaGradient::_positiveGradient(const double dist_squared, + double* holder) const { + const double grad = twoalpha / (1 + alpha * dist_squared); + multModify(holder, grad); +} + +void AlphaGradient::_negativeGradient(const double dist_squared, + double* holder) const { + const double adk = alpha * dist_squared; + const double grad = alphagamma / (dist_squared * (adk + 1)); + multModify(holder, grad); +} + +/* + * Optimized gradient for alpha == 1 + */ +AlphaOneGradient::AlphaOneGradient(const double g, + const int d) : AlphaGradient(1, g, d) { +} + +void AlphaOneGradient::_positiveGradient(const double dist_squared, + double* holder) const { + const double grad = - 2 / (1 + dist_squared); + multModify(holder, grad); +} + +void AlphaOneGradient::_negativeGradient(const double dist_squared, + double* holder) const { + const double grad = alphagamma / (1 + dist_squared) / (0.1 + dist_squared); + multModify(holder, grad); +} + +/* + * Alternative probabilistic function (sigmoid) + */ + + +void ExpGradient::_positiveGradient(const double dist_squared, + double* holder) const { + const double expsq = exp(dist_squared); + const double grad = (dist_squared > 4) ? -1 : + -(expsq / (expsq + 1)); + multModify(holder, grad); +} +void ExpGradient::_negativeGradient(const double dist_squared, + double* holder) const { + const double grad = (dist_squared > gammagamma) ? 
0 : + gamma / (1 + exp(dist_squared)); + multModify(holder, grad); +} diff --git a/src/helpers.cpp b/src/helpers.cpp deleted file mode 100644 index 2b2903e..0000000 --- a/src/helpers.cpp +++ /dev/null @@ -1,20 +0,0 @@ -#include -// [[Rcpp::plugins(openmp)]] -#include -using namespace Rcpp; - -double dist(const arma::vec& i, const arma::vec& j) { - return sqrt(sum(square(i - j))); -} - -double cosDist(const arma::vec& i, const arma::vec& j) { - return 1 - (dot(i,j) / (sqrt(sum(square(i))) * sqrt(sum(square(j))))); -} - -double sparseDist(const arma::sp_mat& i, const arma::sp_mat& j) { - return arma::as_scalar(sqrt(sum(square(i - j)))); -} - -double sparseCosDist(const arma::sp_mat& i, const arma::sp_mat& j) { - return 1 - (arma::as_scalar((dot(i,j)) / arma::as_scalar(arma::norm(i,2) * arma::norm(j,2)))); -} diff --git a/src/helpers.h b/src/helpers.h deleted file mode 100644 index bf5735d..0000000 --- a/src/helpers.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef LARGEVISHELPERS -#define LARGEVISHELPERS -using namespace Rcpp; - -double dist(const arma::vec& i, const arma::vec& j); - -double cosDist(const arma::vec& i, const arma::vec& j); - -double sparseDist(const arma::sp_mat& i, const arma::sp_mat& j); - -double sparseCosDist(const arma::sp_mat& i, const arma::sp_mat& j); - -#endif diff --git a/src/largeVis.cpp b/src/largeVis.cpp index cf211f7..75577b6 100644 --- a/src/largeVis.cpp +++ b/src/largeVis.cpp @@ -1,191 +1,153 @@ -#include // [[Rcpp::plugins(openmp)]] -#include -#include "progress.hpp" -#include -#include -#include -#include -#include -#include +// [[Rcpp::plugins(cpp11)]] +// [[Rcpp::depends(RcppArmadillo)]] +// [[Rcpp::depends(RcppProgress)]] +#include "largeVis.h" + using namespace Rcpp; using namespace std; +using namespace arma; + +class Visualizer { +protected: + const int D; + const int M; + const int M2; + + long long * const targetPointer; + long long * const sourcePointer; + double * const coordsPtr; + const long long n_samples; + + double rho; + 
double rhoIncrement; + + AliasTable negAlias; + AliasTable posAlias; + Gradient* grad; + + IntegerVector ps; + +public: + Visualizer(long long * sourcePtr, + long long * targetPtr, + int D, + double * coordPtr, + int M, + double rho, + long long n_samples) : D{D}, M{M}, M2(M * 2), + targetPointer{targetPtr}, + sourcePointer{sourcePtr}, + coordsPtr{coordPtr}, + n_samples{n_samples}, + rho{rho}, + rhoIncrement(rho / n_samples) { } + + void initAlias(IntegerVector& newps, + const NumericVector& weights) { + ps = newps; + NumericVector pdiffs = pow(diff(newps), 0.75); + negAlias.initialize(pdiffs); + posAlias.initialize(weights); + negAlias.initRandom(); + posAlias.initRandom(); + } -// The Euclidean distance between two vectors + void setGradient(Gradient * newGrad) { + grad = newGrad; + } -inline double dist(arma::vec i, arma::vec j) { - return sum(square(i - j)); -} + void operator()(long long startSampleIdx, int batchSize) { + long long e_ij; + int i, j, k, d, m, shortcircuit, example = 0; + double firstholder[10], secondholder[10]; + double * y_i, * y_j; + long long * searchBegin, * searchEnd; + + double localRho = rho; + while (example++ != batchSize && localRho > 0) { + // * (1 - (startSampleIdx / n_samples)); + e_ij = posAlias(); + j = targetPointer[e_ij]; + i = sourcePointer[e_ij]; + + y_i = coordsPtr + (i * D); + + y_j = coordsPtr + (j * D); + grad -> positiveGradient(y_i, y_j, firstholder); + for (d = 0; d != D; d++) y_j[d] -= firstholder[d] * localRho; + + searchBegin = targetPointer + ps[i]; + searchEnd = targetPointer + ps[i + 1]; + shortcircuit = 0; m = 0; + + while (m != M && shortcircuit != M2) { + k = negAlias(); + shortcircuit++; + // Check that the draw isn't one of i's edges + if (k == i || + k == j || + binary_search( searchBegin, + searchEnd, + k)) continue; + m++; -/* - * Some helper functions useful in debugging. 
- */ -void checkVector(const arma::vec& x, - const std::string& label) { - if (x.has_nan() || x.has_inf()) - Rcout << "\n Failure at " << label; -}; + y_j = coordsPtr + (k * D); + grad -> negativeGradient(y_i, y_j, secondholder); -double objective(const arma::mat& inputs, double gamma, double alpha) { - double objective = log(1 / (1 + (alpha * dist(inputs.col(0), inputs.col(1))))); - for (int i = 2; i < 7; i++) - objective += gamma * log(1 - (1 / (1 + alpha * dist(inputs.col(0), inputs.col(i))))); - return objective; -} - -void checkGrad(const arma::vec& x, - const arma::vec& y, - const arma::vec& grad, - bool together, - const string& label) { - double oldDist = dist(x,y); - double newDist = dist(x + grad, y - grad); - if (together && newDist > oldDist) Rcout << "\nGrad " << label << " yi " << x << " other " << y << " grad " << grad << " moved further apart."; - else if (! together && newDist < oldDist) Rcout << "\nGrad " << label << " yi " << x << " other " << y << " grad " << grad << "moved closer together."; -}; -/* - * The stochastic gradient descent function. Asynchronicity is enabled by openmp. 
- */ + for (d = 0; d != D; d++) y_j[d] -= secondholder[d] * localRho; + for (d = 0; d != D; d++) firstholder[d] += secondholder[d]; + } + for (d = 0; d != D; d++) y_i[d] += firstholder[d] * localRho; + localRho -= rhoIncrement; + } + rho -= (rhoIncrement * batchSize); + } +}; // [[Rcpp::export]] arma::mat sgd(arma::mat coords, - const arma::vec& is, // vary randomly - const NumericVector js, // ordered - const NumericVector ps, // N+1 length vector of indices to start of each row j in vector is - const NumericVector ws, // w{ij} + arma::ivec& targets_i, // vary randomly + arma::ivec& sources_j, // ordered + IntegerVector& ps, // N+1 length vector of indices to start of each row j in vector is + NumericVector& weights, // w{ij} const double gamma, const double rho, - const double minRho, - const bool useWeights, - const long nBatches, + const long long n_samples, const int M, const double alpha, - bool verbose) { - - Progress progress(nBatches, verbose); - - const int D = coords.n_rows; - const int N = ps.size() - 1; - const int E = ws.length(); - // Calculate negative sample weights, d_{i}^0.75. - // Stored as a vector of cumulative sums, normalized, so it can - // be readily searched using binary searches. - arma::vec negativeSampleWeights = pow(diff(ps), 0.75); - const double scale = sum(negativeSampleWeights); - negativeSampleWeights = negativeSampleWeights / scale; - negativeSampleWeights = cumsum(negativeSampleWeights); - - // positive edges for sampling - arma::vec positiveEdgeWeights; - if (! useWeights) { - const double posScale = sum(ws); - positiveEdgeWeights = arma::vec(E); - positiveEdgeWeights[0] = ws[0] / posScale; - for (int idx = 1; idx < E; idx++) - positiveEdgeWeights[idx] = positiveEdgeWeights[idx - 1] + (ws[idx] / posScale); - } - - const int posSampleLength = ((nBatches > 1000000) ? 
1000000 : (int) nBatches); - arma::vec positiveSamples = arma::randu(posSampleLength); - - // Iterate through the edges in the positiveEdges vector -#pragma omp parallel for shared(coords, positiveSamples) schedule(static) - for (long eIdx=0; eIdx < nBatches; eIdx++) { - if (progress.increment()) { - const double posTarget = *(positiveSamples.begin() + (eIdx % posSampleLength)); - int k; - int e_ij; - if (useWeights) { - e_ij = posTarget * (E - 1); - } else { - e_ij = std::distance(positiveEdgeWeights.begin(), - std::upper_bound(positiveEdgeWeights.begin(), - positiveEdgeWeights.end(), - posTarget)); - } - const int i = is[e_ij]; - const int j = js[e_ij]; - - const double localRho = rho - ((rho - minRho) * eIdx / nBatches); - - //if ((arma::randn(1))[0] < 0) swap(i, j); - - const arma::vec y_i = coords.col(i); - const arma::vec y_j = coords.col(j); - - // wij - const double w = (useWeights) ? ws[e_ij] : 1; - - const double dist_ij = dist(y_i, y_j); - - const arma::vec d_dist_ij = (y_i - y_j) / sqrt(dist_ij); - double p_ij; - if (alpha == 0) p_ij = 1 / (1 + exp(dist_ij)); - else p_ij = 1 / (1 + (alpha * dist_ij)); - - arma::vec d_p_ij; - if (alpha == 0) d_p_ij = d_dist_ij * -2 * dist_ij * exp(dist_ij) / pow(1 + exp(dist_ij), 2); - else d_p_ij = d_dist_ij * -2 * dist_ij * alpha / pow(1 + (dist_ij * alpha),2); - - //double o = log(p_ij); - const arma::vec d_j = (1 / p_ij) * d_p_ij; - // alternative: d_i - 2 * alpha * (y_i - y_j) / (alpha * sum(square(y_i - y_j))) - - arma::vec samples = arma::randu(M * 2); - arma::vec::iterator targetIt = samples.begin(); - int sampleIdx = 1; - // The indices of the nodes with edges to i - arma::vec searchVector = is.subvec(ps[i], ps[i + 1] - 1); - arma::vec d_i = d_j; - int m = 0; - while (m < M) { - if (sampleIdx % (M * 2) == 0) samples.randu(); - // binary search implementing weighted sampling - const double target = targetIt[sampleIdx++ % (M * 2)]; - int k; - if (useWeights) k = target * (N - 1); - else k = 
std::distance(negativeSampleWeights.begin(), - std::upper_bound(negativeSampleWeights.begin(), - negativeSampleWeights.end(), - target) - ); - - if (k == i || - k == j || - sum(searchVector == k) > 0) continue; - const arma::vec y_k = coords.col(k); - - const double dist_ik = dist(y_i, y_k); - if (dist_ik == 0) continue; // Duplicates - - const arma::vec d_dist_ik = (y_i - y_k) / sqrt(dist_ik); - - double p_ik; - if (alpha == 0) p_ik = 1 - (1 / (1 + exp(dist_ik))); - else p_ik = 1 - (1 / (1 + (alpha * dist_ik))); - - arma::vec d_p_ik; - if (alpha == 0) d_p_ik = d_dist_ik * 2 * dist_ik * exp(dist_ik) / pow(1 + exp(dist_ik),2); - else d_p_ik = d_dist_ik * 2 * dist_ik * alpha / pow(1 + (alpha * dist_ik),2); - //o += (gamma * log(p_ik)); - - const arma::vec d_k = (gamma / p_ik) * d_p_ik; - // alternative: d_k = 2 * alpha * (y_i - y_k) / (square(1 + (alpha * sum(square(y_i - y_k)))) * (1 - (1 / (alpha * sum(square(y_i - y_k)))))) - - d_i += d_k; - for (int idx = 0; idx < D; idx++) coords(idx,k) -= d_k[idx] * localRho * w; - - m++; - } - - for (int idx = 0; idx < D; idx++) { - coords(idx,j) -= d_j[idx] * w * localRho; - coords(idx,i) += d_i[idx] * w * localRho; - } - - if (eIdx >0 && eIdx % posSampleLength == 0) positiveSamples.randu(); - } + const bool verbose) { + + Progress progress(n_samples, verbose); + int D = coords.n_rows; + if (D > 10) stop("Limit of 10 dimensions for low-dimensional space."); + Visualizer v(sources_j.memptr(), + targets_i.memptr(), + coords.n_rows, + coords.memptr(), + M, + rho, + n_samples); + v.initAlias(ps, weights); + + if (alpha == 0) v.setGradient(new ExpGradient(gamma, D)); + else if (alpha == 1) v.setGradient(new AlphaOneGradient(gamma, D)); + else v.setGradient(new AlphaGradient(alpha, gamma, D)); + + const int batchSize = 8192; + const long long barrier = (n_samples * .95 < n_samples - coords.n_cols) ? 
n_samples * .95 : n_samples - coords.n_cols; + +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif + for (long long eIdx = 0; eIdx < barrier; eIdx += batchSize) if (progress.increment(batchSize)) { + v(eIdx, batchSize); } +#ifdef _OPENMP +#pragma omp barrier +#endif + for (long long eIdx = barrier; eIdx < n_samples; eIdx += batchSize) if (progress.increment(batchSize)) v(eIdx, batchSize); return coords; }; + diff --git a/src/largeVis.h b/src/largeVis.h new file mode 100644 index 0000000..9f7f177 --- /dev/null +++ b/src/largeVis.h @@ -0,0 +1,258 @@ +#ifndef _LARGEVIS +#define _LARGEVIS +#include + +#ifdef _OPENMP +#include +#endif +#include "progress.hpp" +#include +#include +#include +#include +#include +#include + +using namespace Rcpp; +using namespace std; +using namespace arma; + +/* + * Neighbor search + */ +struct HeapObject { + double d; + int n; + HeapObject(double d, int n) : d(d), n(n) {} + bool operator<(const struct HeapObject& other) const { + return d < other.d; + } +}; +typedef priority_queue MaxHeap; +typedef vector< imat::col_iterator > PositionVector; +typedef vector Neighborhood; +Neighborhood** createNeighborhood(int N); +void copyHeapToMatrix(set* tree, + const int K, + const int i, + arma::imat& knns); +void addDistance(const arma::vec& x_i, + const arma::mat& data, + const int j, + MaxHeap& heap, + const int K, + double (*distanceFunction)(const arma::vec& x_i, const arma::vec& x_j)); +void heapToSet(MaxHeap& heap, set* set); +arma::imat annoy(const int n_trees, + const int threshold, + const arma::mat& data, + const int N, + const int K, + double (*distanceFunction)(const arma::vec& x_i, const arma::vec& x_j), + Progress& p); +void addNeighbors(const arma::ivec& indices, + Neighborhood* heap[], + const int I); +/* + * Distance Functions + */ +double dist(const arma::vec& i, const arma::vec& j); +double relDist(const arma::vec& i, const arma::vec& j); +double cosDist(const arma::vec& i, const arma::vec& j); +double 
sparseDist(const sp_mat& i, const sp_mat& j); +double sparseRelDist(const sp_mat& i, const sp_mat& j); +double sparseCosDist(const sp_mat& i, const sp_mat& j); + +// Exported distance functions for high dimensional space +arma::vec fastDistance(const NumericVector is, + const NumericVector js, + const arma::mat& data, + const std::string& distMethod, + bool verbose); +arma::vec fastSparseDistance(const arma::vec& is, + const arma::vec& js, + const sp_mat& data, + const std::string& distMethod, + bool verbose); +arma::vec fastCDistance(const arma::vec& is, + const arma::vec& js, + const arma::uvec& i_locations, + const arma::uvec& p_locations, + const arma::vec& x, + const std::string& distMethod, + bool verbose); +arma::vec fastSDistance(const arma::vec& is, + const arma::vec& js, + const arma::uvec& i_locations, + const arma::uvec& j_locations, + const arma::vec& x, + const std::string& distMethod, + bool verbose); + +/* + * Functions related to the alias algorithm + */ +typedef double realsies; + +template +class AliasTable { +private: + T N = 0; + realsies* probs = NULL; + T* aliases = NULL; + +public: + AliasTable() {} + AliasTable(T N) : N{N} { + probs = new realsies[N]; + aliases = new T[N]; + } + + void initialize(const NumericVector& weights) { + if (N == 0) { + N = weights.size(); + probs = new realsies[N]; + aliases = new T[N]; + } + // AliasTable(const NumericVector& weights) : + // N(weights.size()), + // probs(new realsies[N]), + // aliases(new T[N]) { + const long double sm = sum(weights); + for (T i = 0; i < N; i++) probs[i] = weights[i] * N / sm; + queue small = queue(); + queue large = queue(); + for (T i = 0; i < N; i++) ((probs[i] < 1) ? + small : + large).push(i); + while (! large.empty() & ! small.empty()) { + T big = large.front(); + large.pop(); + T little = small.front(); + small.pop(); + aliases[little] = big; + probs[big] = probs[big] + probs[little] - 1; + (probs[big] < 1 ? small : large).push(big); + } + long double accu = 0; + while (! 
large.empty()) { + accu += 1 - large.front(); + probs[large.front()] = 1; + large.pop(); + } + while (! small.empty()) { + accu += 1 - small.front(); + probs[small.front()] = 1; + small.pop(); + } + if (accu > 1e-5) warning("Numerical instability in alias table " + to_string(accu)); + }; + + T search(realsies random, realsies random2) const { + T candidate = random * N; + return (random2 >= probs[candidate]) ? aliases[candidate] : candidate; + }; + + // const gsl_rng_type *gsl_T = NULL; + // gsl_rng *gsl_r = NULL; + // + // void initRandom() { + // initRandom(314159265); + // } + // void initRandom(long seed) { + // gsl_T = gsl_rng_rand48; + // gsl_r = gsl_rng_alloc(gsl_T); + // gsl_rng_set(gsl_r, seed); + // } + std::uniform_real_distribution rnd; + std::mt19937_64 mt; + + void initRandom(long seed) { + mt = mt19937_64(seed); + rnd = uniform_real_distribution(); + } + void initRandom() { + std::random_device seed; + initRandom(seed()); + } + + T operator()() { + realsies dub1 = rnd(mt); + realsies dub2 = rnd(mt); + return search(dub1, dub2); + // return search(gsl_rng_uniform(gsl_r), gsl_rng_uniform(gsl_r)); + } +}; + +/* + * Gradients + */ +class Gradient { +protected: + const double gamma; + double cap; + const int D; + Gradient(const double g, + const int d) : gamma{g}, cap(5), D{d} {}; + virtual void _positiveGradient(const double dist_squared, + double* holder) const = 0; + virtual void _negativeGradient(const double dist_squared, + double* holder) const = 0; + inline void multModify(double *col, const double adj) const; + inline double clamp(double val) const; + +public: + virtual void positiveGradient(const double* i, + const double* j, + double* holder) const; + virtual void negativeGradient(const double* i, + const double* k, + double* holder) const; + inline double distAndVector(const double *x_i, + const double *x_j, + double *output) const; +}; + +class AlphaGradient: public Gradient { + const double alpha; + const double twoalpha; +protected: + const 
double alphagamma; + virtual void _positiveGradient(const double dist_squared, + double* holder) const; + virtual void _negativeGradient(const double dist_squared, + double* holder) const; +public: + AlphaGradient(const double a, + const double g, + const int d) : Gradient(g, d), + alpha{a}, + twoalpha(alpha * -2), + alphagamma(alpha * gamma * 2) { } ; +}; + +class AlphaOneGradient: public AlphaGradient { +public: + AlphaOneGradient(const double g, + const int d); +protected: + virtual void _positiveGradient(const double dist_squared, + double* holder) const; + virtual void _negativeGradient(const double dist_squared, + double* holder) const; +}; + +class ExpGradient: public Gradient { +public: + const double gammagamma; + ExpGradient(const double g, const int d) : Gradient(g, d), + gammagamma(gamma * gamma) { + cap = gamma; + }; +protected: + virtual void _positiveGradient(const double dist_squared, + double* holder) const; + virtual void _negativeGradient(const double dist_squared, + double* holder) const; +}; +#endif diff --git a/src/neighbors.cpp b/src/neighbors.cpp deleted file mode 100644 index a7e76df..0000000 --- a/src/neighbors.cpp +++ /dev/null @@ -1,242 +0,0 @@ -#include -// [[Rcpp::plugins(openmp)]] -#include "progress.hpp" -#include -#include -#include -#include -#include -#include -#include -#include "helpers.h" -using namespace Rcpp; -using namespace std; - -/* -* Functions for identifying candidate nearest neighbors using random projection trees and neighborhood exploration. 
-*/ - -struct heapObject { - double d; - int n; - - heapObject(double d, int n) : d(d), n(n) {} - - bool operator<(const struct heapObject& other) const { - return d < other.d; - } -}; - -double relDist(const arma::vec& i, const arma::vec& j) { - return sum(square(i - j)); -} - -void searchTree(const int& threshold, - const arma::vec& indices, - const arma::mat& data, - std::vector* >& heap, - const int& iterations, - Progress& progress) { - const int I = indices.size(); - const int D = data.n_rows; - if (progress.check_abort()) return; - if (I < 2) stop("Tree split failure."); - if (I == 2) { - #pragma omp critical - { - heap[indices[0]] -> push_back(indices[1]); - heap[indices[1]] -> push_back(indices[0]); - } - return; - } - if (I < threshold || iterations == 0) { - #pragma omp critical - { - for (int i = 0; i < I; i++) { - heap[indices[i]] -> reserve(I - 1); - for (int j = 0; j < I; j++) if (i != j) heap[indices[i]] -> push_back(indices[j]); - } - } - progress.increment(I); - return; - } - arma::vec direction = arma::vec(indices.size()); - { - int x1idx, x2idx; - arma::vec v; - arma::vec m; - do { - const arma::vec selections = arma::randu(2) * (I - 1); - x1idx = indices[selections[0]]; - x2idx = indices[selections[1]]; - if (x1idx == x2idx) x2idx = indices[((int)selections[1] + 1) % indices.size()]; - const arma::vec x2 = data.col(x2idx); - const arma::vec x1 = data.col(x1idx); - // Get hyperplane - m = (x1 + x2) / 2; // Base point of hyperplane - const arma::vec d = x1 - x2; - v = d / arma::as_scalar(arma::norm(d, 2)); // unit vector - } while (x1idx == x2idx); - - for (int i = 0; i < indices.size(); i++) { - const int I = indices[i]; - const arma::vec X = data.col(I); - direction[i] = dot((X - m), v); - } - } - // Normalize direction - const double middle = arma::median(direction); - - const arma::uvec left = arma::find(direction > middle); - const arma::uvec right = arma::find(direction <= middle); - if (left.size() >= 2 && right.size() >= 2) { - 
searchTree(threshold, indices(left), data, heap, iterations - 1, progress); - searchTree(threshold, indices(right), data, heap, iterations - 1, progress); - } else { - searchTree(threshold, indices.subvec(0, indices.size() / 2), data, heap, iterations - 1, progress); - searchTree(threshold, indices.subvec(indices.size() / 2, indices.size() - 1), data, heap, iterations - 1, progress); - } -}; - - - -// [[Rcpp::export]] -arma::mat searchTrees(const int& threshold, - const int& n_trees, - const int& K, - const int& max_recursion_degree, - const int& maxIter, - const arma::mat& data, - const std::string& distMethod, - bool verbose) { - - const int N = data.n_cols; - - double (*distanceFunction)(const arma::vec& x_i, const arma::vec& x_j); - if (distMethod.compare(std::string("Euclidean")) == 0) distanceFunction = relDist; - else if (distMethod.compare(std::string("Cosine")) == 0) distanceFunction = cosDist; - else distanceFunction = relDist; - - Progress p((N * n_trees) + (N) + (N * maxIter), verbose); - - std::vector* > treeNeighborhoods = std::vector* >(N); - for (int i = 0; i < N; i++) { - int seed[] = {i}; - treeNeighborhoods[i] = new std::vector(seed, seed + sizeof(seed) / sizeof(int)); - } - - { // Artificial scope to destroy indices - arma::vec indices = arma::regspace(0, N - 1); - - #pragma omp parallel for shared(indices,treeNeighborhoods) - for (int t = 0; t < n_trees; t++) if (! p.check_abort()) { - searchTree(threshold, - indices, - data, - treeNeighborhoods, - max_recursion_degree, // maximum permitted level of recursion - p - ); - - if (t > 0 && ! 
p.check_abort()) - #pragma omp critical - { - for (int i = 0; i < N; i++) { - std::vector* neighbors = treeNeighborhoods[i]; - std::sort(neighbors -> begin(), neighbors -> end()); - std::vector::iterator theEnd = std::unique(neighbors -> begin(), neighbors -> end()); - neighbors -> erase(theEnd, neighbors -> end()); - if (neighbors -> size() < 3) stop("Tree failure."); - } - } - } - } - - if (p.check_abort()) return arma::mat(0); - - // Initialize the knn matrix, and reduce the number of candidate neighbors per node - // to K. Otherwise the first neighborhood exploration pass takes N * trees * (threshold + 1), - // instead of (N * K), which is prohibitive of large thresholds. - arma::mat knns = arma::mat(threshold,N); - knns.fill(-1); - #pragma omp parallel for shared(knns) - for (int i = 0; i < N; i++) if (p.increment()){ - const arma::vec x_i = data.col(i); - std::priority_queue maxHeap = std::priority_queue(); - std::vector* stack = treeNeighborhoods[i]; - for (std::vector::iterator it = stack -> begin(); it != stack -> end(); it++) { - const double d = distanceFunction(x_i, data.col(*it)); - maxHeap.push(heapObject(d, *it)); - if (maxHeap.size() > threshold) maxHeap.pop(); - } - int j = 0; - do { - knns(j,i) = maxHeap.top().n; - maxHeap.pop(); - j++; - } while (j < threshold && ! 
maxHeap.empty()); - if (j == 1 && knns(0,i) == -1) stop("Bad neighbor matrix."); - } - if (p.check_abort()) return arma::mat(0); - - for (int T = 0; T < maxIter; T++) { - arma::mat old_knns = knns; - knns = arma::mat(K,N); - knns.fill(-1); - #pragma omp parallel for shared(knns, treeNeighborhoods) - for (int i = 0; i < N; i++) if (p.increment()) { - double d; - - const arma::vec neighborhood = old_knns.col(i); - const arma::vec x_i = data.col(i); - - std::priority_queue heap; - std::vector pastVisitors = *(treeNeighborhoods[i]); - pastVisitors.reserve((K + 1) * K); - // Loop through immediate neighbors of i - for (int jidx = 0; jidx < old_knns.n_rows; jidx++) { - const int j = neighborhood[jidx]; - if (j == -1) break; - if (j == i) continue; // This should never happen - d = distanceFunction(x_i, data.col(j)); - if (d == 0) continue; // duplicate - heap.push(heapObject(d, j)); - if (heap.size() > K) heap.pop(); - - // For each immediate neighbor j, loop through its neighbors - const arma::vec locality = old_knns.col(j); - for (int kidx = 0; kidx < old_knns.n_rows; kidx++) { - const int k = locality[kidx]; - if (k == -1) break; - if (k == i) continue; - // Check if this is a neighbor we've already seen. O(log k) - std::pair::iterator, - std::vector::iterator > firstlast = std::equal_range(pastVisitors.begin(), - pastVisitors.end(), - k); - if (*(firstlast.first) == k) continue; // Found - - if (firstlast.second == pastVisitors.end()) pastVisitors.push_back(k); - else pastVisitors.insert(firstlast.second, k); - - d = distanceFunction(x_i, data.col(k)); - if (d == 0) continue; - if (heap.size() < K) heap.push(heapObject(d,k)); - else if (d < heap.top().d) { - heap.push(heapObject(d, k)); - if (heap.size() > K) heap.pop(); - } - } - } - int j = 0; - while (j < K && ! 
heap.empty()) { - knns(j, i) = heap.top().n; - heap.pop(); - j++; - } - if (j == 0) stop("Failure in neighborhood exploration - this should never happen."); - std::vector(pastVisitors).swap(pastVisitors); // pre-C++11 shrink - } - } - return knns; -}; diff --git a/src/pjicalculation.cpp b/src/pjicalculation.cpp deleted file mode 100644 index 4658927..0000000 --- a/src/pjicalculation.cpp +++ /dev/null @@ -1,142 +0,0 @@ -#include -// [[Rcpp::plugins(openmp)]] -#include "progress.hpp" -#include -#include -#include -#include -#include -#include -#include -#include "helpers.h" -using namespace Rcpp; -using namespace std; - -/* - * Fast calculation of pairwise distances with the result stored in a pre-allocated vector. - */ -// [[Rcpp::export]] -arma::vec fastDistance(const NumericVector is, - const NumericVector js, - const arma::mat& data, - const std::string& distMethod, - bool verbose) { - - Progress p(is.size(), verbose); - arma::vec xs = arma::vec(is.size()); - double (*distanceFunction)(const arma::vec& x_i, const arma::vec& x_j); - if (distMethod.compare(std::string("Euclidean")) == 0) distanceFunction = dist; - else if (distMethod.compare(std::string("Cosine")) == 0) distanceFunction = cosDist; - - #pragma omp parallel for shared (xs) - for (int i=0; i < is.length(); i++) if (p.increment()) xs[i] = - distanceFunction(data.col(is[i]), data.col(js[i])); - return xs; -}; - -arma::vec fastSparseDistance(const arma::vec& is, - const arma::vec& js, - const arma::sp_mat& data, - const std::string& distMethod, - bool verbose) { - - Progress p(is.size(), verbose); - arma::vec xs = arma::vec(is.size()); - double (*distanceFunction)( - const arma::sp_mat& x_i, - const arma::sp_mat& x_j); - if (distMethod.compare(std::string("Euclidean")) == 0) distanceFunction = sparseDist; - else if (distMethod.compare(std::string("Cosine")) == 0) distanceFunction = sparseCosDist; - -#pragma omp parallel for shared (xs) - for (int i=0; i < is.size(); i++) if (p.increment()) xs[i] = - 
distanceFunction(data.col(is[i]), data.col(js[i])); - return xs; -}; - -// [[Rcpp::export]] -arma::vec fastCDistance(const arma::vec& is, - const arma::vec& js, - const arma::uvec& i_locations, - const arma::uvec& p_locations, - const arma::vec& x, - const std::string& distMethod, - bool verbose) { - const int N = p_locations.size() - 1; - const arma::sp_mat data = arma::sp_mat(i_locations, p_locations, x, N, N); - return fastSparseDistance(is,js,data,distMethod,verbose); -} - -// [[Rcpp::export]] -arma::vec fastSDistance(const arma::vec& is, - const arma::vec& js, - const arma::uvec& i_locations, - const arma::uvec& j_locations, - const arma::vec& x, - const std::string& distMethod, - bool verbose) { - const arma::umat locations = arma::join_cols(i_locations, j_locations); - const arma::sp_mat data = arma::sp_mat(locations, x); - return fastSparseDistance(is,js,data,distMethod,verbose); -} - -// Take four vectors (i indices, j indices, edge distances, and sigmas), and calculate -// p(j|i) and then w_{ij}. -// [[Rcpp::export]] -arma::sp_mat distMatrixTowij( - const NumericVector is, - const NumericVector js, - const NumericVector xs, - const NumericVector sigmas, - const int N, - bool verbose -) { - - Progress p(xs.size() * 2, verbose); - arma::vec rowSums = arma::vec(N); - arma::vec pjis = arma::vec(is.length()); - for (int idx=0; idx < N; idx++) rowSums[idx] = 0; - // Compute pji, accumulate rowSums at the same time - #pragma omp parallel for shared(pjis, rowSums) - for (int e=0; e < pjis.size(); e++) if (p.increment()){ - const int i = is[e]; - const double pji = exp(- pow(xs[e], 2)) / sigmas[i]; - pjis[e] = pji; - #pragma omp atomic - rowSums[i] += pji; - } - if (p.check_abort()) return arma::sp_mat(0); - // Now convert p(j|i) to w_{ij} by symmetrizing. - // Loop through the edges, and populate a location matrix and value vector for - // the arma::sp_mat batch insertion constructor. Put all coordinates in the - // lower triangle. 
The constructor will automatically add duplicates. - arma::vec values = arma::vec(pjis.size()); - arma::umat locations = arma::umat(2, pjis.size()); - #pragma omp parallel for shared(locations, values) - for (int e = 0; e < pjis.size(); e++) if (p.increment()) { - int newi = is[e], newj = js[e]; - if (newi < newj) std::swap(newi, newj); - values[e] = ((pjis[e] / rowSums[is[e]]) / (2 * N)); - locations(1,e) = newi; - locations(0,e) = newj; - } - arma::sp_mat wij = arma::sp_mat( - true, // add_values - locations, - values, - N, N // n_col and n_row - ); - wij = wij + wij.t(); - return wij; -}; - - -// [[Rcpp::export]] -double sigFunc(const double sigma, - const NumericVector x_i, - const double perplexity) { - const NumericVector xs = exp(- pow(x_i,2) / sigma); - const NumericVector softxs = xs / sum(xs); - const double p2 = - sum(log(softxs) / log(2)) / xs.length(); - return pow(perplexity - p2, 2); -}; diff --git a/src/sparse.cpp b/src/sparse.cpp new file mode 100644 index 0000000..1ae5f08 --- /dev/null +++ b/src/sparse.cpp @@ -0,0 +1,242 @@ +// [[Rcpp::plugins(openmp)]] +// [[Rcpp::plugins(cpp11)]] +// [[Rcpp::depends(RcppArmadillo)]] +// [[Rcpp::depends(RcppProgress)]] +#include "largeVis.h" + +using namespace Rcpp; +using namespace std; +using namespace arma; + +void searchTree(const int& threshold, + const arma::ivec& indices, + const sp_mat& data, + Neighborhood* heap[], + Progress& progress) { + const int I = indices.size(); + // const int D = data.n_rows; + if (progress.check_abort()) return; + if (I < 2) stop("Tree split failure."); + if (I <= threshold) { + addNeighbors(indices, heap, I); + progress.increment(I); + return; + } + vec direction = vec(indices.size()); + { + int x1idx, x2idx; + sp_mat v; + sp_mat m; + do { + const vec selections = randu(2) * (I - 1); + x1idx = indices[selections[0]]; + x2idx = indices[selections[1]]; + if (x1idx == x2idx) x2idx = indices[((int)selections[1] + 1) % indices.size()]; + const SpSubview x2 = data.col(x2idx); + 
const SpSubview x1 = data.col(x1idx); + // Get hyperplane + m = (x1 + x2) / 2; // Base point of hyperplane + const sp_mat d = x1 - x2; + const double dn = as_scalar(norm(d, 2)); + v = d / dn; // unit vector + } while (x1idx == x2idx); + + for (int i = 0; i < indices.size(); i++) { + const int I = indices[i]; + const SpSubview X = data.col(I); + direction[i] = dot((X - m), v); + } + } + // Normalize direction + const double middle = median(direction); + + const uvec left = find(direction > middle); + const uvec right = find(direction <= middle); + if (left.size() >= 2 && right.size() >= 2) { + searchTree(threshold, indices(left), data, heap, progress); + searchTree(threshold, indices(right), data, heap, progress); + } else { + searchTree(threshold, indices.subvec(0, indices.size() / 2), data, heap, progress); + searchTree(threshold, indices.subvec(indices.size() / 2, indices.size() - 1), data, heap, progress); + } +}; + + +arma::mat searchTreesSparse(const int& threshold, + const int& n_trees, + const int& K, + const int& maxIter, + const sp_mat& data, + const std::string& distMethod, + bool verbose) { + + const int N = data.n_cols; + + double (*distanceFunction)(const sp_mat& x_i, const sp_mat& x_j); + if (distMethod.compare(std::string("Euclidean")) == 0) distanceFunction = sparseRelDist; + else if (distMethod.compare(std::string("Cosine")) == 0) distanceFunction = sparseCosDist; + else distanceFunction = sparseRelDist; + + Progress p((N * n_trees) + (N) + (N * maxIter), verbose); + + Neighborhood** treeNeighborhoods = createNeighborhood(N); + + { // Artificial scope to destroy indices + sp_mat dataMat; + if (distMethod.compare(std::string("Cosine")) == 0) { + dataMat = sp_mat(data); + for (int d = 0; d < dataMat.n_cols; d++) dataMat.col(d) /= norm(dataMat.col(d)); + } else { + dataMat = data; + } + ivec indices = regspace(0, N - 1); +#ifdef _OPENMP +#pragma omp parallel for shared(indices,treeNeighborhoods) +#endif + for (int t = 0; t < n_trees; t++) if (! 
p.check_abort()) { + searchTree(threshold, + indices, + dataMat, + treeNeighborhoods, + p + ); + + if (t > 0 && ! p.check_abort()) +#ifdef _OPENMP +#pragma omp critical +#endif + { + for (int i = 0; i < N; i++) { + vector* neighbors = treeNeighborhoods[i]; + sort(neighbors -> begin(), neighbors -> end()); + vector::iterator theEnd = unique(neighbors -> begin(), neighbors -> end()); + neighbors -> erase(theEnd, neighbors -> end()); + if (neighbors -> size() < 3) stop("Tree failure."); + } + } + } + } + + if (p.check_abort()) return mat(0); + + // Initialize the knn matrix, and reduce the number of candidate neighbors per node + // to K. Otherwise the first neighborhood exploration pass takes N * trees * (threshold + 1), + // instead of (N * K), which is prohibitive of large thresholds. + mat knns = mat(threshold,N); + knns.fill(-1); +#ifdef _OPENMP +#pragma omp parallel for shared(knns) +#endif + for (int i = 0; i < N; i++) if (p.increment()){ + const SpSubview x_i = data.col(i); + priority_queue MaxHeap = priority_queue(); + vector* stack = treeNeighborhoods[i]; + for (vector::iterator it = stack -> begin(); it != stack -> end(); it++) { + const double d = distanceFunction(x_i, data.col(*it)); + MaxHeap.push(HeapObject(d, *it)); + if (MaxHeap.size() > threshold) MaxHeap.pop(); + } + int j = 0; + do { + knns(j,i) = MaxHeap.top().n; + MaxHeap.pop(); + j++; + } while (j < threshold && ! 
MaxHeap.empty()); + if (j == 1 && knns(0,i) == -1) stop("Bad neighbor matrix."); + } + if (p.check_abort()) return mat(0); + + for (int T = 0; T < maxIter; T++) { + mat old_knns = knns; + knns = mat(K,N); + knns.fill(-1); +#ifdef _OPENMP +#pragma omp parallel for shared(knns, treeNeighborhoods) +#endif + for (int i = 0; i < N; i++) if (p.increment()) { + double d; + + const vec neighborhood = old_knns.col(i); + const SpSubview x_i = data.col(i); + + priority_queue heap; + vector pastVisitors = *(treeNeighborhoods[i]); + pastVisitors.reserve((K + 1) * K); + // Loop through immediate neighbors of i + for (int jidx = 0; jidx < old_knns.n_rows; jidx++) { + const int j = neighborhood[jidx]; + if (j == -1) break; + if (j == i) continue; // This should never happen + d = distanceFunction(x_i, data.col(j)); + if (d == 0) continue; // duplicate + heap.push(HeapObject(d, j)); + if (heap.size() > K) heap.pop(); + + // For each immediate neighbor j, loop through its neighbors + const vec locality = old_knns.col(j); + for (int kidx = 0; kidx < old_knns.n_rows; kidx++) { + const int k = locality[kidx]; + if (k == -1) break; + if (k == i) continue; + // Check if this is a neighbor we've already seen. O(log k) + pair::iterator, + vector::iterator > firstlast = equal_range(pastVisitors.begin(), + pastVisitors.end(), + k); + if (*(firstlast.first) == k) continue; // Found + + if (firstlast.second == pastVisitors.end()) pastVisitors.push_back(k); + else pastVisitors.insert(firstlast.second, k); + + d = distanceFunction(x_i, data.col(k)); + if (d == 0) continue; + if (heap.size() < K) heap.push(HeapObject(d,k)); + else if (d < heap.top().d) { + heap.push(HeapObject(d, k)); + if (heap.size() > K) heap.pop(); + } + } + } + int j = 0; + while (j < K && ! 
heap.empty()) { + knns(j, i) = heap.top().n; + heap.pop(); + j++; + } + if (j == 0) stop("Failure in neighborhood exploration - this should never happen."); + vector(pastVisitors).swap(pastVisitors); // pre-C++11 shrink + } + } + return knns; +}; + + +// [[Rcpp::export]] +arma::mat searchTreesCSparse(const int& threshold, + const int& n_trees, + const int& K, + const int& maxIter, + const arma::uvec& i, + const arma::uvec& p, + const arma::vec& x, + const std::string& distMethod, + bool verbose) { + const int N = p.size() -1; + const sp_mat data = sp_mat(i,p,x,N,N); + return searchTreesSparse(threshold,n_trees,K,maxIter,data,distMethod,verbose); +} + +// [[Rcpp::export]] +arma::mat searchTreesTSparse(const int& threshold, + const int& n_trees, + const int& K, + const int& maxIter, + const arma::uvec& i, + const arma::uvec& j, + const arma::vec& x, + const std::string& distMethod, + bool verbose) { + const umat locations = join_cols(i,j); + const sp_mat data = sp_mat(locations,x); + return searchTreesSparse(threshold,n_trees,K,maxIter,data,distMethod,verbose); +} diff --git a/src/sparseneighbors.cpp b/src/sparseneighbors.cpp deleted file mode 100644 index 3a47657..0000000 --- a/src/sparseneighbors.cpp +++ /dev/null @@ -1,272 +0,0 @@ -#include -// [[Rcpp::plugins(openmp)]] -#include "progress.hpp" -#include -#include -#include -#include -#include -#include -#include -#include "helpers.h" -using namespace Rcpp; -using namespace std; - -/* -* Functions for identifying candidate nearest neighbors using random projection trees and neighborhood exploration. 
-*/ - -struct heapObject { - double d; - int n; - - heapObject(double d, int n) : d(d), n(n) {} - - bool operator<(const struct heapObject& other) const { - return d < other.d; - } -}; - -double sparseRelDist(const arma::sp_mat& i, const arma::sp_mat& j) { - return arma::as_scalar(sum(square(i - j))); -} - -void searchTree(const int& threshold, - const arma::vec& indices, - const arma::sp_mat& data, - std::vector* >& heap, - const int& iterations, - Progress& progress) { - const int I = indices.size(); - const int D = data.n_rows; - if (progress.check_abort()) return; - if (I < 2) stop("Tree split failure."); - if (I == 2) { - #pragma omp critical - { - heap[indices[0]] -> push_back(indices[1]); - heap[indices[1]] -> push_back(indices[0]); - } - return; - } - if (I < threshold || iterations == 0) { - #pragma omp critical - { - for (int i = 0; i < I; i++) { - heap[indices[i]] -> reserve(I - 1); - for (int j = 0; j < I; j++) if (i != j) heap[indices[i]] -> push_back(indices[j]); - } - } - progress.increment(I); - return; - } - arma::vec direction = arma::vec(indices.size()); - { - int x1idx, x2idx; - arma::sp_mat v; - arma::sp_mat m; - do { - const arma::vec selections = arma::randu(2) * (I - 1); - x1idx = indices[selections[0]]; - x2idx = indices[selections[1]]; - if (x1idx == x2idx) x2idx = indices[((int)selections[1] + 1) % indices.size()]; - const arma::SpSubview x2 = data.col(x2idx); - const arma::SpSubview x1 = data.col(x1idx); - // Get hyperplane - m = (x1 + x2) / 2; // Base point of hyperplane - const arma::sp_mat d = x1 - x2; - const double dn = arma::as_scalar(arma::norm(d, 2)); - v = d / dn; // unit vector - } while (x1idx == x2idx); - - for (int i = 0; i < indices.size(); i++) { - const int I = indices[i]; - const arma::SpSubview X = data.col(I); - direction[i] = dot((X - m), v); - } - } - // Normalize direction - const double middle = arma::median(direction); - - const arma::uvec left = arma::find(direction > middle); - const arma::uvec right = 
arma::find(direction <= middle); - if (left.size() >= 2 && right.size() >= 2) { - searchTree(threshold, indices(left), data, heap, iterations - 1, progress); - searchTree(threshold, indices(right), data, heap, iterations - 1, progress); - } else { - searchTree(threshold, indices.subvec(0, indices.size() / 2), data, heap, iterations - 1, progress); - searchTree(threshold, indices.subvec(indices.size() / 2, indices.size() - 1), data, heap, iterations - 1, progress); - } -}; - -arma::mat searchTreesSparse(const int& threshold, - const int& n_trees, - const int& K, - const int& max_recursion_degree, - const int& maxIter, - const arma::sp_mat& data, - const std::string& distMethod, - bool verbose) { - - const int N = data.n_cols; - - double (*distanceFunction)(const arma::sp_mat& x_i, const arma::sp_mat& x_j); - if (distMethod.compare(std::string("Euclidean")) == 0) distanceFunction = sparseRelDist; - else if (distMethod.compare(std::string("Cosine")) == 0) distanceFunction = sparseCosDist; - else distanceFunction = sparseRelDist; - - Progress p((N * n_trees) + (N) + (N * maxIter), verbose); - - std::vector* > treeNeighborhoods = std::vector* >(N); - for (int i = 0; i < N; i++) { - int seed[] = {i}; - treeNeighborhoods[i] = new std::vector(seed, seed + sizeof(seed) / sizeof(int)); - } - - { // Artificial scope to destroy indices - arma::vec indices = arma::regspace(0, N - 1); - - #pragma omp parallel for shared(indices,treeNeighborhoods) - for (int t = 0; t < n_trees; t++) if (! p.check_abort()) { - searchTree(threshold, - indices, - data, - treeNeighborhoods, - max_recursion_degree, // maximum permitted level of recursion - p - ); - - if (t > 0 && ! 
p.check_abort()) - #pragma omp critical - { - for (int i = 0; i < N; i++) { - std::vector* neighbors = treeNeighborhoods[i]; - std::sort(neighbors -> begin(), neighbors -> end()); - std::vector::iterator theEnd = std::unique(neighbors -> begin(), neighbors -> end()); - neighbors -> erase(theEnd, neighbors -> end()); - if (neighbors -> size() < 3) stop("Tree failure."); - } - } - } - } - - if (p.check_abort()) return arma::mat(0); - - // Initialize the knn matrix, and reduce the number of candidate neighbors per node - // to K. Otherwise the first neighborhood exploration pass takes N * trees * (threshold + 1), - // instead of (N * K), which is prohibitive of large thresholds. - arma::mat knns = arma::mat(threshold,N); - knns.fill(-1); - #pragma omp parallel for shared(knns) - for (int i = 0; i < N; i++) if (p.increment()){ - const arma::SpSubview x_i = data.col(i); - std::priority_queue maxHeap = std::priority_queue(); - std::vector* stack = treeNeighborhoods[i]; - for (std::vector::iterator it = stack -> begin(); it != stack -> end(); it++) { - const double d = distanceFunction(x_i, data.col(*it)); - maxHeap.push(heapObject(d, *it)); - if (maxHeap.size() > threshold) maxHeap.pop(); - } - int j = 0; - do { - knns(j,i) = maxHeap.top().n; - maxHeap.pop(); - j++; - } while (j < threshold && ! 
maxHeap.empty()); - if (j == 1 && knns(0,i) == -1) stop("Bad neighbor matrix."); - } - if (p.check_abort()) return arma::mat(0); - - for (int T = 0; T < maxIter; T++) { - arma::mat old_knns = knns; - knns = arma::mat(K,N); - knns.fill(-1); - #pragma omp parallel for shared(knns, treeNeighborhoods) - for (int i = 0; i < N; i++) if (p.increment()) { - double d; - - const arma::vec neighborhood = old_knns.col(i); - const arma::SpSubview x_i = data.col(i); - - std::priority_queue heap; - std::vector pastVisitors = *(treeNeighborhoods[i]); - pastVisitors.reserve((K + 1) * K); - // Loop through immediate neighbors of i - for (int jidx = 0; jidx < old_knns.n_rows; jidx++) { - const int j = neighborhood[jidx]; - if (j == -1) break; - if (j == i) continue; // This should never happen - d = distanceFunction(x_i, data.col(j)); - if (d == 0) continue; // duplicate - heap.push(heapObject(d, j)); - if (heap.size() > K) heap.pop(); - - // For each immediate neighbor j, loop through its neighbors - const arma::vec locality = old_knns.col(j); - for (int kidx = 0; kidx < old_knns.n_rows; kidx++) { - const int k = locality[kidx]; - if (k == -1) break; - if (k == i) continue; - // Check if this is a neighbor we've already seen. O(log k) - std::pair::iterator, - std::vector::iterator > firstlast = std::equal_range(pastVisitors.begin(), - pastVisitors.end(), - k); - if (*(firstlast.first) == k) continue; // Found - - if (firstlast.second == pastVisitors.end()) pastVisitors.push_back(k); - else pastVisitors.insert(firstlast.second, k); - - d = distanceFunction(x_i, data.col(k)); - if (d == 0) continue; - if (heap.size() < K) heap.push(heapObject(d,k)); - else if (d < heap.top().d) { - heap.push(heapObject(d, k)); - if (heap.size() > K) heap.pop(); - } - } - } - int j = 0; - while (j < K && ! 
heap.empty()) { - knns(j, i) = heap.top().n; - heap.pop(); - j++; - } - if (j == 0) stop("Failure in neighborhood exploration - this should never happen."); - std::vector(pastVisitors).swap(pastVisitors); // pre-C++11 shrink - } - } - return knns; -}; - -// [[Rcpp::export]] -arma::mat searchTreesCSparse(const int& threshold, - const int& n_trees, - const int& K, - const int& max_recursion_degree, - const int& maxIter, - const arma::uvec& i, - const arma::uvec& p, - const arma::vec& x, - const std::string& distMethod, - bool verbose) { - const int N = p.size() -1; - const arma::sp_mat data = arma::sp_mat(i,p,x,N,N); - return searchTreesSparse(threshold,n_trees,K,max_recursion_degree,maxIter,data,distMethod,verbose); -} - -// [[Rcpp::export]] -arma::mat searchTreesTSparse(const int& threshold, - const int& n_trees, - const int& K, - const int& max_recursion_degree, - const int& maxIter, - const arma::uvec& i, - const arma::uvec& j, - const arma::vec& x, - const std::string& distMethod, - bool verbose) { - const arma::umat locations = arma::join_cols(i,j); - const arma::sp_mat data = arma::sp_mat(locations,x); - return searchTreesSparse(threshold,n_trees,K,max_recursion_degree,maxIter,data,distMethod,verbose); -} diff --git a/tests/testthat/testclusters2.R b/tests/testthat/testclusters2.R new file mode 100644 index 0000000..5c3710a --- /dev/null +++ b/tests/testthat/testclusters2.R @@ -0,0 +1,33 @@ +context("cluster") +library(largeVis) +set.seed(1974) +data(iris) +dat <- as.matrix(iris[, 1:4]) +dat <- scale(dat) +dupes <- which(duplicated(dat)) +dat <- dat[-dupes, ] +dat <- t(dat) +neighbors <- randomProjectionTreeSearch(dat, K = 20, verbose = FALSE) +edges <- buildEdgeMatrix(data = dat, + neighbors = neighbors, + verbose = FALSE) +test_that("optics doesn't crash on iris with neighbors and data", { + expect_silent(optics(neighbors = neighbors, data = dat, eps = 10, minPts = 10, verbose = FALSE)) +}) + +test_that("optics doesn't crash on iris with edges", { + 
expect_silent(optics(edges = edges, eps = 10, minPts = 10, verbose = FALSE)) +}) + +test_that("optics doesn't crash on iris with edges and data", { + expect_silent(optics(edges = edges, data = dat, eps = 10, minPts = 10, verbose = FALSE)) +}) + +test_that("dbscan doesn't crash on iris with edges", { + expect_silent(dbscan(edges = edges, eps = 10, minPts = 10, verbose = FALSE, partition = FALSE)) +}) + +test_that("dbscan doesn't crash on iris with partitions", { + expect_silent(clusters <- dbscan(edges = edges, eps = 10, minPts = 10, + verbose = FALSE, partition = TRUE)) +}) diff --git a/tests/testthat/testdistance.R b/tests/testthat/testdistance.R new file mode 100644 index 0000000..71d670d --- /dev/null +++ b/tests/testthat/testdistance.R @@ -0,0 +1,38 @@ +context("distance") + +set.seed(1974) +test_matrix <- matrix(rnorm(100), nrow = 10) +index_matrix <- matrix(c(rep(0:9, each = 10), rep(0:9, 10)), + ncol = 2, byrow = FALSE) +test_matrix <- t(test_matrix) + +test_that("Euclidean distances are correct", { + + distances <- as.matrix(dist(test_matrix, method = "euclidean")) + new_distances <- distance(as.vector(index_matrix[, 2]), + as.vector(index_matrix[, 1]), + x = test_matrix, + "Euclidean", + verbose = FALSE) + diffs <- as.vector(distances) - new_distances + expect_lt(sum(diffs), 1e-10) +}) + +test_that("Cosine distances are correct", { + set.seed(1974) + cos.sim <- function(x, i, j) { + A <- x[, i] + B <- x[, j] + return( 1 - (sum(A * B) / sqrt(sum(A ^ 2) * sum(B ^ 2)) )) + } + distances <- apply(index_matrix + 1, + MARGIN=1, + FUN = function(x) cos.sim(test_matrix, x[1], x[2])) + new_distances <- distance(as.vector(index_matrix[, 2]), + as.vector(index_matrix[, 1]), + x = test_matrix, + "Cosine", + verbose = FALSE) + diffs <- as.vector(distances) - new_distances + expect_lt(sum(diffs), 1e-10) +}) diff --git a/tests/testthat/testother.R b/tests/testthat/testother.R new file mode 100644 index 0000000..ea0b846 --- /dev/null +++ b/tests/testthat/testother.R @@ 
-0,0 +1,79 @@ +context("wij") + +test_that("wij handles small K", { + set.seed(1974) + data(iris) + dat <- as.matrix(iris[, 1:4]) + dat <- scale(dat) + dupes <- which(duplicated(dat)) + dat <- dat[-dupes, ] + dat <- t(dat) + neighbors <- randomProjectionTreeSearch(dat, K = 5, verbose = FALSE) + edges <- buildEdgeMatrix(dat, neighbors, verbose = FALSE) + expect_silent(wij <- buildWijMatrix(edges)) +}) + +context("vis") +set.seed(1974) +data(iris) +dat <- as.matrix(iris[, 1:4]) +dat <- scale(dat) +dupes <- which(duplicated(dat)) +dat <- dat[-dupes, ] +dat <- t(dat) + +test_that("largeVis works", { + visObject <- largeVis(dat, max_iter = 20, n_trees = 100, + tree_threshold = 50, sgd_batches = 1000, + K = 20, verbose = FALSE) + expect_false(any(is.na(visObject$coords))) + expect_false(any(is.nan(visObject$coords))) + expect_false(any(is.infinite(visObject$coords))) +}) + +test_that("largeVis does not NaN on iris", { + visObject <- largeVis(dat, max_iter = 20, + coords = matrix(rnorm(ncol(dat) * 2), nrow = 2), + K = 20, verbose = FALSE, + sgd_batches = 20000 * 150) + expect_false(any(is.na(visObject$coords))) + expect_false(any(is.nan(visObject$coords))) + expect_false(any(is.infinite(visObject$coords))) +}) + +test_that("largeVis works when alpha == 0", { + visObject <- largeVis(dat, + max_iter = 20, + sgd_batches = 10000, + K = 10, + alpha = 0, + verbose = FALSE) + expect_false(any(is.na(visObject$coords))) + expect_false(any(is.nan(visObject$coords))) + expect_false(any(is.infinite(visObject$coords))) +}) + +test_that("largeVis works with cosine", { + visObject <- largeVis(dat, max_iter = 20, + sgd_batches = 1000, + K = 10, verbose = FALSE, + distance_method = "Cosine") + expect_false(any(is.na(visObject$coords))) + expect_false(any(is.nan(visObject$coords))) + expect_false(any(is.infinite(visObject$coords))) +}) + +test_that("largeVis continues to work as it scales up", { + visObject <- largeVis(dat, max_iter = 20, sgd_batches = 1000, + K = 10, gamma = 0.5, verbose 
= FALSE) + expect_false(any(is.na(visObject$coords))) + expect_false(any(is.nan(visObject$coords))) + expect_false(any(is.infinite(visObject$coords))) + for (i in c(10000, 100000, 1000000, 20000 * length(visObject$wij@x))) { + coords <- projectKNNs(visObject$wij, sgd_batches = i, + verbose = FALSE) + expect_false(any(is.na(coords))) + expect_false(any(is.nan(coords))) + expect_false(any(is.infinite(coords))) + } +}) \ No newline at end of file diff --git a/tests/testthat/tests.R b/tests/testthat/tests.R index aeafb0c..f198ee7 100644 --- a/tests/testthat/tests.R +++ b/tests/testthat/tests.R @@ -1,148 +1,60 @@ -context("largeVis") +context("neighbors") -test_that("Can determine iris neighbors", { +test_that("Trees does not error", { data (iris) set.seed(1974) - RcppArmadillo::armadillo_set_seed(1974) dat <- as.matrix(iris[, 1:4]) dat <- scale(dat) dupes <- which(duplicated(dat)) dat <- dat[-dupes, ] dat <- t(dat) - neighbors <- randomProjectionTreeSearch(dat, - K = 5, - n_trees = 10, - tree_threshold = 20, - max_iter = 10, - verbose = FALSE) - expect_equal(nrow(neighbors), 5) - expect_equal(ncol(neighbors), ncol(dat)) - expect_equal(sum(neighbors == -1), 0) - expect_equal(sum(neighbors[, 1:40] > 50), 0) -}) + expect_silent(neighbors <- randomProjectionTreeSearch(dat, + K = 5, + n_trees = 10, + tree_threshold = 20, + max_iter = 0, + verbose = FALSE)) -test_that("Can determine iris neighbors accurately", { - M <- 5 - set.seed(1974) - RcppArmadillo::armadillo_set_seed(1974) - data (iris) - dat <- as.matrix(iris[, 1:4]) - dat <- scale(dat) - dupes <- which(duplicated(dat)) - dat <- dat[-dupes, ] - d_matrix = as.matrix(dist(dat, method = 'euclidean')) - bests <- apply(d_matrix, MARGIN=1, FUN = function(x) order(x)[1:(M + 1)]) - bests <- bests[-1,] - 1 - dat <- t(dat) - neighbors <- randomProjectionTreeSearch(dat, - K = M, - n_trees = 10, - tree_threshold = 20, - max_iter = 2, - verbose = FALSE) - scores <- lapply(1:ncol(dat), FUN = function(x) sum(neighbors[,x] %in% 
bests[,x])) - score <- sum(as.numeric(scores)) - expect_gt(score, .99 * ncol(dat) * M) }) -test_that("largeVis works", { +test_that("Trees does not error if neighbors are explored once", { + data (iris) set.seed(1974) - RcppArmadillo::armadillo_set_seed(1974) - data(iris) dat <- as.matrix(iris[, 1:4]) dat <- scale(dat) dupes <- which(duplicated(dat)) dat <- dat[-dupes, ] dat <- t(dat) - visObject <- vis(dat, max_iter = 20, sgd_batches = 1000, - K = 10, gamma = 0.5, verbose = FALSE) - expect_equal(sum(any(is.na(visObject$coords)) + - any(is.nan(visObject$coords)) + - any(is.infinite(visObject$coords))), - 0) -}) -test_that("largeVis works without weights", { - set.seed(1974) - RcppArmadillo::armadillo_set_seed(1974) - data(iris) - dat <- as.matrix(iris[, 1:4]) - dat <- scale(dat) - dupes <- which(duplicated(dat)) - dat <- dat[-dupes, ] - dat <- t(dat) - visObject <- vis(dat, - max_iter = 20, - sgd_batches = 1000, - weight_pos_samples = FALSE, - K = 10, - verbose = FALSE) - expect_equal(sum(any(is.na(visObject$coords)) + - any(is.nan(visObject$coords)) + - any(is.infinite(visObject$coords))), - 0) + expect_silent(neighbors <- randomProjectionTreeSearch(dat, + K = 5, + n_trees = 50, + tree_threshold = 20, + max_iter = 1, + verbose = FALSE)) + }) -test_that("largeVis works with cosine", { +test_that("Trees does not error if neighbors are explored more than once", { + data (iris) set.seed(1974) - RcppArmadillo::armadillo_set_seed(1974) - data(iris) dat <- as.matrix(iris[, 1:4]) dat <- scale(dat) dupes <- which(duplicated(dat)) dat <- dat[-dupes, ] dat <- t(dat) - visObject <- vis(dat, max_iter = 20, sgd_batches = 1000, - K = 10, verbose = FALSE, distance_method="Cosine") - expect_equal(sum(any(is.na(visObject$coords)) + - any(is.nan(visObject$coords)) + - any(is.infinite(visObject$coords))), - 0) -}) - -test_that("Euclidean distances are correct", { - set.seed(1974) - RcppArmadillo::armadillo_set_seed(1974) - test_matrix <- matrix(rnorm(100), nrow = 10) - distances <- 
as.matrix(dist(test_matrix, method = "euclidean")) - index_matrix <- matrix(c(rep(0:9, each = 10), rep(0:9, 10)), - ncol = 2, byrow = FALSE) - test_matrix <- t(test_matrix) - new_distances <- distance(as.vector(index_matrix[,2]), - as.vector(index_matrix[,1]), - x = test_matrix, - "Euclidean", - verbose = FALSE) - diffs <- as.vector(distances) - new_distances - expect_lt(sum(diffs), 1e-10) -}) -test_that("Cosine distances are correct", { - set.seed(1974) - RcppArmadillo::armadillo_set_seed(1974) - cos.sim <- function(x, i, j) { - A = x[,i] - B = x[,j] - return( 1 - (sum(A*B)/sqrt(sum(A^2)*sum(B^2)) )) - } - test_matrix <- matrix(rnorm(100), nrow = 10) - index_matrix <- matrix(c(rep(0:9, each = 10), rep(0:9, 10)), - ncol = 2, byrow = FALSE) - distances <- apply(index_matrix + 1, - MARGIN=1, - FUN = function(x) cos.sim(test_matrix, x[1], x[2])) - new_distances <- distance(as.vector(index_matrix[,2]), - as.vector(index_matrix[,1]), - x = test_matrix, - "Cosine", - verbose = FALSE) - diffs <- as.vector(distances) - new_distances - expect_lt(sum(diffs), 1e-10) + expect_silent(neighbors <- randomProjectionTreeSearch(dat, + K = 5, + n_trees = 50, + tree_threshold = 20, + max_iter = 2, + verbose = FALSE)) }) -test_that("buildEdgeMatrix are the same", { +test_that("Can determine iris neighbors", { + data (iris) set.seed(1974) - RcppArmadillo::armadillo_set_seed(1974) dat <- as.matrix(iris[, 1:4]) dat <- scale(dat) dupes <- which(duplicated(dat)) @@ -150,124 +62,210 @@ test_that("buildEdgeMatrix are the same", { dat <- t(dat) neighbors <- randomProjectionTreeSearch(dat, K = 5, - n_trees = 10, - tree_threshold = 20, + n_trees = 20, + tree_threshold = 30, max_iter = 10, verbose = FALSE) - is <- rep(0:(ncol(dat) - 1), each = 5) - js <- as.vector(neighbors) - is <- is[! js == -1] - js <- js[! js == -1] - dupes <- duplicated(data.frame(is, js)) - is <- is[! dupes] - js <- js[! 
dupes] - ord <- order(is) - is <- is[ord] - js <- js[ord] - distances <- as.matrix(dist(t(dat))) - distances <- as.numeric(lapply(1:length(is), FUN = function(x) { - distances[is[x] + 1, js[x] + 1] - })) - - ps <- i2p(is) - sigwij <- buildEdgeMatrix(i = is, - j = js, - p = ps, - d = distances, - verbose = F) - mat <- Matrix::sparseMatrix(i = js, - p = ps, - x = distances, - dims = c(ncol(dat), ncol(dat)), - giveCsparse = TRUE, - index1=FALSE) - sigwij2 <- buildEdgeMatrix(mat, verbose = F) - - score <- sum(sigwij$wij@x != sigwij2$wij@x) - expect_lt(score, 450) - tmat <- Matrix::sparseMatrix(i = is, - j = js, - x = distances, - dims = c(ncol(dat), ncol(dat)), - giveCsparse = FALSE, - index1 = FALSE) - sigwij3 <- buildEdgeMatrix(tmat, verbose = F) - score <- sum(sigwij$wij@x != sigwij3$wij@x) - expect_lt(score, 450) + expect_equal(nrow(neighbors), 5) + expect_equal(ncol(neighbors), ncol(dat)) + expect_lt(sum(neighbors == -1), 20) + expect_equal(sum(neighbors[, 1:40] > 50), 0) }) -test_that("largeVis works when alpha == 0", { +test_that("max threshold is sufficient to find all neighbors", { + M <- 5 set.seed(1974) - RcppArmadillo::armadillo_set_seed(1974) - data(iris) + data (iris) dat <- as.matrix(iris[, 1:4]) dat <- scale(dat) dupes <- which(duplicated(dat)) dat <- dat[-dupes, ] + d_matrix <- as.matrix(dist(dat, method = "euclidean")) + bests <- apply(d_matrix, MARGIN=1, FUN = function(x) order(x)[1:(M + 1)]) + bests <- bests[-1,] - 1 dat <- t(dat) - visObject <- vis(dat, - max_iter = 20, - sgd_batches = 10000, - K = 10, - alpha = 0, - verbose = FALSE, - weight_pos_samples = FALSE) - expect_equal(sum(any(is.na(visObject$coords)) + - any(is.nan(visObject$coords)) + - any(is.infinite(visObject$coords))), - 0) + + neighbors <- randomProjectionTreeSearch(dat, + K = M, + n_trees = 1, + tree_threshold = ncol(dat), + max_iter = 0, + verbose = FALSE) + scores <- lapply(1:ncol(dat), FUN = function(x) sum(neighbors[,x] %in% bests[,x])) + score <- sum(as.numeric(scores)) + 
expect_gte(score, M * ncol(dat) - 1) # Two neighbors are equidistanct }) -test_that("sparseDistances", { +test_that("exploring after max threshold does not reduce accuracy", { M <- 5 set.seed(1974) - RcppArmadillo::armadillo_set_seed(1974) data (iris) dat <- as.matrix(iris[, 1:4]) dat <- scale(dat) dupes <- which(duplicated(dat)) dat <- dat[-dupes, ] - mat <- Matrix::sparseMatrix(i = rep(1:nrow(dat), ncol(dat)), - j = rep(1:ncol(dat), each = nrow(dat)), - x = as.vector(dat)) - d = as.matrix(dist(mat, method = 'euclidean')) - index_matrix <- matrix(c( - rep(0:(nrow(dat) - 1), nrow(dat)), - rep(0:(nrow(dat) - 1), each = nrow(dat)) - ), ncol = 2, byrow = FALSE) - mat <- Matrix::t(mat) - new_distances <- distance(mat, - as.vector(index_matrix[,2]), - as.vector(index_matrix[,1]), - "Euclidean", - verbose = FALSE) - diffs <- as.vector(d) - new_distances - expect_lt(sum(diffs), 1e-10) + d_matrix <- as.matrix(dist(dat, method = "euclidean")) + bests <- apply(d_matrix, MARGIN = 1, FUN = function(x) order(x)[1:(M + 1)]) + bests <- bests[-1, ] - 1 + dat <- t(dat) + + neighbors <- randomProjectionTreeSearch(dat, + K = M, + n_trees = 1, + tree_threshold = ncol(dat), + max_iter = 1, + verbose = FALSE) + scores <- lapply(1:ncol(dat), FUN = function(x) sum(neighbors[, x] %in% bests[, x])) + score <- sum(as.numeric(scores)) + expect_gte(score, (M * ncol(dat)) - 1) + oldscore <- score + + neighbors <- randomProjectionTreeSearch(dat, + K = M, + n_trees = 1, + tree_threshold = ncol(dat), + max_iter = 5, + verbose = FALSE) + scores <- lapply(1:ncol(dat), FUN = function(x) sum(neighbors[, x] %in% bests[, x])) + score <- sum(as.numeric(scores)) + expect_gte(score, oldscore) }) -test_that("Can determine sparse iris neighbors accurately", { +test_that("Can determine iris neighbors accurately", { M <- 5 set.seed(1974) - RcppArmadillo::armadillo_set_seed(1974) data (iris) dat <- as.matrix(iris[, 1:4]) dat <- scale(dat) dupes <- which(duplicated(dat)) dat <- dat[-dupes, ] - mat <- 
Matrix::sparseMatrix(i = rep(1:nrow(dat), ncol(dat)), - j = rep(1:ncol(dat), each = nrow(dat)), - x = as.vector(dat)) - d_matrix = as.matrix(dist(mat, method = 'euclidean')) + d_matrix <- as.matrix(dist(dat, method = "euclidean")) bests <- apply(d_matrix, MARGIN=1, FUN = function(x) order(x)[1:(M + 1)]) - bests <- bests[-1,] - 1 - mat <- Matrix::t(mat) - neighbors <- randomProjectionTreeSearch(mat, + bests <- bests[-1, ] - 1 + dat <- t(dat) + + neighbors <- randomProjectionTreeSearch(dat, K = M, n_trees = 10, - tree_threshold = 20, - max_iter = 2, + tree_threshold = 10, + max_iter = 10, verbose = FALSE) - scores <- lapply(1:nrow(dat), FUN = function(x) sum(neighbors[,x] %in% bests[,x])) + scores <- lapply(1:ncol(dat), + FUN = function(x) sum(neighbors[, x] %in% bests[, x])) score <- sum(as.numeric(scores)) - expect_gt(score, .99 * ncol(dat) * M) + expect_gt(score, (ncol(dat) * M) - 15) +}) + +# test_that("Knows how to converge", { +# M <- 5 +# set.seed(1974) +# RcppArmadillo::armadillo_set_seed(1974) +# data (iris) +# dat <- as.matrix(iris[, 1:4]) +# dat <- scale(dat) +# dupes <- which(duplicated(dat)) +# dat <- dat[-dupes, ] +# d_matrix = as.matrix(dist(dat, method = "euclidean")) +# bests <- apply(d_matrix, MARGIN=1, FUN = function(x) order(x)[1:(M + 1)]) +# bests <- bests[-1,] - 1 +# dat <- t(dat) +# +# neighbors <- randomProjectionTreeSearch(dat, +# K = M, +# n_trees = 10, +# tree_threshold = 10, +# max_iter = 100000, +# verbose = FALSE) +# scores <- lapply(1:ncol(dat), FUN = function(x) sum(neighbors[,x] %in% bests[,x])) +# score <- sum(as.numeric(scores)) +# expect_gt(score, (ncol(dat) * M) - 15) +# }) + + + +test_that("With a bigger dataset, increasing threshold improves result", { + M <- 10 + data (quakes) + dat <- as.matrix(quakes) + dat <- scale(dat) + d_matrix = as.matrix(dist(dat, method = "euclidean")) + bests <- apply(d_matrix, MARGIN = 1, FUN = function(x) order(x)[1:(M + 1)]) + bests <- bests[-1, ] - 1 + dat <- t(dat) + + oldscore <- 0 + + for (t in 
c(10, 30, 60, 90)) { + set.seed(1974) + neighbors <- randomProjectionTreeSearch(dat, + K = M, + n_trees = 10, + tree_threshold = t, + max_iter = 0, + verbose = FALSE) + scores <- lapply(1:ncol(dat), + FUN = function(x) sum(neighbors[, x] %in% bests[, x])) + score <- sum(as.numeric(scores)) / (M * ncol(dat)) + expect_gte(score, oldscore * 0.99) # Allow some gap here to account for randomness + if (score == 1) break; + oldscore <- score + } +}) + +test_that("With a bigger dataset, increasing n_trees improves result", { + M <- 10 + set.seed(1974) + data (quakes) + dat <- as.matrix(quakes) + dat <- scale(dat) + d_matrix = as.matrix(dist(dat, method = "euclidean")) + bests <- apply(d_matrix, MARGIN=1, FUN = function(x) order(x)[1:(M + 1)]) + bests <- bests[-1,] - 1 + dat <- t(dat) + + oldscore <- 0 + + for (t in c(10, 30, 60, 90)) { + neighbors <- randomProjectionTreeSearch(dat, + K = M, + n_trees = t, + tree_threshold = 10, + max_iter = 0, + verbose = FALSE) + scores <- lapply(1:ncol(dat), + FUN = function(x) sum(neighbors[, x] %in% bests[, x])) + score <- sum(as.numeric(scores)) / (M * ncol(dat)) + expect_gte(score, oldscore * 0.99) + if (score == 1) break; + oldscore <- score + } +}) + +test_that("With a bigger dataset, increasing iters improves result", { + M <- 10 + set.seed(1974) + data (quakes) + dat <- as.matrix(quakes) + dat <- scale(dat) + d_matrix = as.matrix(dist(dat, method = "euclidean")) + bests <- apply(d_matrix, MARGIN = 1, FUN = function(x) order(x)[1:(M + 1)]) + bests <- bests[ - 1,] - 1 + dat <- t(dat) + + oldscore <- 0 + + for (t in c(0, 1, 5, 10)) { + neighbors <- randomProjectionTreeSearch(dat, + K = M, + n_trees = 10, + tree_threshold = 10, + max_iter = t, + verbose = FALSE) + scores <- lapply(1:ncol(dat), + FUN = function(x) sum(neighbors[, x] %in% bests[, x])) + score <- sum(as.numeric(scores)) / (M * ncol(dat)) + expect_gte(score, oldscore * 0.99) + if (score == 1) break; + oldscore <- score + } }) diff --git a/tests/testthat/testsparse.R 
b/tests/testthat/testsparse.R new file mode 100644 index 0000000..dc80ab2 --- /dev/null +++ b/tests/testthat/testsparse.R @@ -0,0 +1,91 @@ +context("sparse") + +test_that("buildEdgeMatrix are the same, Euclidean", { + set.seed(1974) + dat <- as.matrix(iris[, 1:4]) + dat <- scale(dat) + dupes <- which(duplicated(dat)) + dat <- dat[-dupes, ] + dat <- t(dat) + neighbors <- randomProjectionTreeSearch(dat, + K = 5, + n_trees = 10, + tree_threshold = 20, + max_iter = 10, + verbose = FALSE) + edges1 <- buildEdgeMatrix(data = dat, neighbors = neighbors, verbose = FALSE) + edges2 <- buildEdgeMatrix(data = Matrix(dat, sparse = TRUE), neighbors = neighbors, verbose = FALSE) + score <- sum(edges1@x - edges2@x) + expect_lt(score, 1) +}) + +test_that("buildEdgeMatrix are the same, Cosine", { + set.seed(1974) + dat <- as.matrix(iris[, 1:4]) + dat <- scale(dat) + dupes <- which(duplicated(dat)) + dat <- dat[-dupes, ] + dat <- t(dat) + neighbors <- randomProjectionTreeSearch(dat, + K = 5, + n_trees = 10, + tree_threshold = 20, + max_iter = 10, + verbose = FALSE) + edges1 <- buildEdgeMatrix(data = dat, neighbors = neighbors, verbose = FALSE, distance_method = "Cosine") + edges2 <- buildEdgeMatrix(data = Matrix(dat, sparse = TRUE), neighbors = neighbors, verbose = FALSE, distance_method = "Cosine") + score <- sum(edges1@x - edges2@x) + expect_lt(score, 1) +}) + +test_that("sparseDistances", { + M <- 5 + set.seed(1974) + data (iris) + dat <- as.matrix(iris[, 1:4]) + dat <- scale(dat) + dupes <- which(duplicated(dat)) + dat <- dat[-dupes, ] + mat <- Matrix::sparseMatrix(i = rep(1:nrow(dat), ncol(dat)), + j = rep(1:ncol(dat), each = nrow(dat)), + x = as.vector(dat)) + d = as.matrix(dist(mat, method = "euclidean")) + index_matrix <- matrix(c( + rep(0:(nrow(dat) - 1), nrow(dat)), + rep(0:(nrow(dat) - 1), each = nrow(dat)) + ), ncol = 2, byrow = FALSE) + mat <- Matrix::t(mat) + new_distances <- distance(mat, + as.vector(index_matrix[, 2]), + as.vector(index_matrix[, 1]), + "Euclidean", + 
verbose = FALSE) + diffs <- as.vector(d) - new_distances + expect_lt(sum(diffs), 1e-10) +}) + +test_that("Can determine sparse iris neighbors accurately", { + M <- 5 + set.seed(1974) + data (iris) + dat <- as.matrix(iris[, 1:4]) + dat <- scale(dat) + dupes <- which(duplicated(dat)) + dat <- dat[-dupes, ] + mat <- Matrix::sparseMatrix(i = rep(1:nrow(dat), ncol(dat)), + j = rep(1:ncol(dat), each = nrow(dat)), + x = as.vector(dat)) + d_matrix <- as.matrix(dist(mat, method = "euclidean")) + bests <- apply(d_matrix, MARGIN = 1, FUN = function(x) order(x)[1:(M + 1)]) + bests <- bests[-1,] - 1 + mat <- Matrix::t(mat) + neighbors <- randomProjectionTreeSearch(mat, + K = M, + n_trees = 10, + max_iter = 2, + tree_threshold = 20, + verbose = FALSE) + scores <- lapply(1:nrow(dat), FUN = function(x) sum(neighbors[, x] %in% bests[, x])) + score <- sum(as.numeric(scores)) + expect_gt(score, .99 * ncol(dat) * M) +}) diff --git a/vignettedata/ngcoords.Rda b/vignettedata/ngcoords.Rda new file mode 100644 index 0000000..5c79b2c Binary files /dev/null and b/vignettedata/ngcoords.Rda differ diff --git a/vignettes/TangLZM16.bib b/vignettes/TangLZM16.bib index c49c4f5..ab8ee9f 100644 --- a/vignettes/TangLZM16.bib +++ b/vignettes/TangLZM16.bib @@ -1,15 +1,8 @@ -@article{TangLZM16, - author = {Jian Tang and - Jingzhou Liu and - Ming Zhang and - Qiaozhu Mei}, - title = {Visualization Large-scale and High-dimensional Data}, - journal = {CoRR}, - volume = {abs/1602.00370}, - year = {2016}, - url = {http://arxiv.org/abs/1602.00370}, - timestamp = {Tue, 01 Mar 2016 17:47:25 +0100}, - biburl = {http://dblp.uni-trier.de/rec/bib/journals/corr/TangLZM16}, - bibsource = {dblp computer science bibliography, http://dblp.org} +@inproceedings{tang2016visualizing, + title={Visualizing Large-scale and High-dimensional Data}, + author={Tang, Jian and Liu, Jingzhou and Zhang, Ming and Mei, Qiaozhu}, + booktitle={Proceedings of the 25th International Conference on World Wide Web}, + pages={287--297}, + 
year={2016}, + organization={International World Wide Web Conferences Steering Committee} } - diff --git a/vignettes/benchmarks.Rmd b/vignettes/benchmarks.Rmd new file mode 100644 index 0000000..f41cf57 --- /dev/null +++ b/vignettes/benchmarks.Rmd @@ -0,0 +1,198 @@ +--- +title: "ANN Benchmarks" +author: "Amos Elberg" +date: '`r Sys.Date()`' +output: + rmarkdown::html_vignette: default +vignette: | + %\VignetteIndexEntry{ANN Benchmarks} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setupbenchmark,eval=T,echo=F,warning=F,error=F,message=F} +# Note to reader: Please don't steal the semi-distinctive visual style I spent several minutes creating for myself. +require(ggplot2, + quietly = TRUE) +require(RColorBrewer, + quietly = TRUE) +require(wesanderson, + quietly = TRUE) +require(dplyr, quietly = TRUE) +require(magrittr, quietly = TRUE) +knitr::opts_chunk$set(collapse = TRUE, + comment = "#>", + fig.width = 7, + fig.height = 5) +colors_discrete <- function(x) rep(wes_palette("Darjeeling", + n = min(x, 5)), + 2)[1:x] +colors_divergent_discrete <- function(x) + grDevices::colorRampPalette(RColorBrewer::brewer.pal(x, "Spectral")) +colors_continuous <- function(x) wes_palette(name = "Zissou", + n = x, + type = "continuous") + +nacol <- colors_discrete(4)[4] +theme_set( + theme_bw() %+replace% + theme( + legend.key.size = unit(4, "mm"), + legend.title = element_text(size = rel(0.8), + face = "bold"), + legend.margin = unit(0, "cm"), + legend.position = "bottom", + legend.key.size = unit(0.5, "lines"), + legend.text=element_text(size = unit(8, "points")), + axis.title.y = element_text(angle = 90), + axis.text = element_text(size = rel(0.7)), + plot.margin = unit(c(0, 0.5, 1, 0), "lines"), + axis.title = element_text(size = rel(0.8), + face = "bold"), + title = element_text(size = rel(0.9)) + ) +) +``` + +## Overview + +Besides manifold visualization, `largeVis` also includes an extremely efficient approximate nearest-neighbor search that runs 
in $O(n)$ time. + +This vignette includes benchmarks and recommendations for adjusting hyperparameters in the neighbor search for best results. + +## Hyperparameters + +The `randomProjectionTreeSearch` function has three hyperparameters that trade-off accuracy and efficiency in the neighbor search: + +1. `n_trees` - In the first phase of the function, the number of random projection trees to create. +2. `tree_threshold` - The maximum number of any nodes on a random projection tree leaf. If, after branching, the number of nodes in a branch exceeds this threshold, the branch will be divided again. +3. `max_iters` - The number of iterations for the neighborhood-exploration phase of the algorithm. + +## Data Collection \& Methodology + +The data in the benchmarks below was obtained by running the `benchmark.R` script, which is installed along with the package, on two machines. + +The aim was to replicate as much as possible the methodology used by Erik Bernhardsson's [ANN Benchmark](https://github.com/erikbern/ann-benchmarks) github. However, `ANN Benchmark` is designed for libraries that are designed to build a neighbor index and then rapidly process queries against the index. The measure used by `ANN Benchmark` is therefore queries-per-second. By contrast, `largeVis` is concerned with getting neighbors for all of the nodes in a finite dataset as quickly as possible. + +Times shown for `RcppAnnoy` include the time to build a searchable index and query neighbors for all rows in the dataset. + +The data used is the 1-million vector, 128-feature [SIFT Dataset](http://corpus-texmex.irisa.fr/), which is the test data used by `ANN Benchmark`. + +Benchmarks were run on several machines. First, benchmarks were run on a workstation and a server with $K = 100$. Benchmarks were then run on an AWS c4.2xlarge instance with $K = 100$ and $K = 50$, to replicate as closely as possible the conditions of `ANN Benchmark`. 
+ +Results that appear to have used virtual memory, in that the completion time was radically discontinuous with other results from the same machine, were discarded. + +I welcome submissions of output from the script from other hardware. + +## Comparison With Annoy + +The following chart illustrates performance versus the `Annoy` library, as implemented through the `RcppAnnoy` R package. + +To facilitate comparison with the ANN Benchmark charts, the Y-axis shows the number of vectors processed per second. + +```{r plotpeformance,echo=F,fig.align='center',warning=FALSE,message=FALSE} +load(system.file("extdata", "benchmark.Rda", package = "largeVis")) +benchmark %>% + filter(machine != 'Large Server', + machine == 'Workstation' | K == 50) %>% + mutate(facet = precision, + facet = ifelse(facet < 0.95, '', 'Closeup'), + facet = factor(facet)) %>% + ggplot(aes( y = time, + x = precision, + group = series, + fill = series, + shape = series)) + + geom_point(size = 1.5, alpha = 0.7, color = "grey80") + + scale_y_log10(name = "Speed, log (nodes / seconds)") + + scale_x_continuous("Precision", + breaks = c(0, 0.2, 0.4, 0.6, 0.8, 0.925, 0.95, 0.975, 1.0)) + + facet_grid(K + machine ~ facet, scales = "free") + + scale_fill_manual(name = "Method & n. iter.", + values = colors_divergent_discrete(nlevels(benchmark$series))(nlevels(benchmark$series))) + + scale_shape_manual(name = "Method & n. iter.", + values = c(21, 21, 21, 21, 23)) + + # guides(color = guide_legend(nrow=3)) + + ggtitle(expression( + atop("Precision-Performance tradeoff, RcppAnnoy and largeVis", + atop(italic("(n = 10000; Upper Right is Better)")) + ) + )) +``` + + +## Approximate Equivalence of Number of Trees and Tree Threshold + +There is an approximate trade-off in memory use between the tree threshold and number of trees. Peak memory consumption during the tree search phase = N * n_trees * threshold. 
+ +The trade-off is not precise because the tree split phase will return fewer nodes per tree than the threshold. On average, it should return about 3/4 of the threshold. + +On the following chart, points that share the same values of n_trees * threshold, referred to as `tth`, (and number of neighborhood exploration iterations), are shown as the same series. + +```{r constn,echo=F,warning=F} +bench <- benchmark %>% + filter(method == 'largeVis', machine == 'Large Server') %>% + mutate(nn = threshold * n_trees) %>% + group_by(max_iters, nn) %>% + filter(n() > 2) %>% + mutate(series = paste(max_iters, ", ", nn, sep = " ")) +bench$facet <- factor(ifelse(bench$n_trees >= 4, "", "n. trees < 10")) +bench %>% + ggplot(aes(y = time, + x = precision, + fill = series, + group = series, + color = factor(n_trees))) + + geom_point(size = 1.5, alpha = 0.8, shape = 21) + + scale_fill_manual("n. iter, tth", values = colors_divergent_discrete(6)(6)) + + scale_color_grey("n. trees", start = 0.8, end = 0 ) + +# guides(color = FALSE) + + # scale_shape(name = "Iterations", solid = FALSE) + + facet_grid(machine ~ .) + + scale_y_log10(name = "Speed, log (nodes / second)", limits = c(1e2,1e5)) + + scale_x_continuous("Precision", + breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1)) + + ggtitle(expression( + atop("Precision-Performance tradeoff, n_trees and tree_threshold", + atop(italic("(100-NN precision, n = 10000; Upper Right is Better)"))))) +``` + +Results that hold nn constant while varying the number of trees and threshold tend to cluster together, however increasing the number of trees (while holding tth constant) tends to improve accuracy and decrease speed. The degree of dispersion increases when a neighborhood exploration iteration is added. + +On the charts below, n_trees * threshold is referred to as `tth`. + +## Effect of Increasing `tth` vs. 
`max_iters` + + +```{r tree_threshold,echo=F} +bench <- benchmark %>% + filter(method == 'largeVis', + machine != 'Large Server') %>% + mutate(label = ifelse(threshold == 128, "128", "Other"), + label = factor(label), + facet = precision, + facet = ifelse(facet < 0.85, '', 'Closeup')) +bench$facet <- factor(bench$facet) +bench %>% + arrange(nn) %>% + mutate(max_iters = factor(max_iters)) %>% + ggplot(aes(y = time, + x = precision , + color = max_iters, + group = max_iters)) + +# geom_path(size = 0.5, alpha =0.8, arrow = arrow(length = unit(0.05, "inches"))) + + geom_point(size = 1, alpha = 0.8, shape = 16) + + facet_grid(K + machine ~ facet, scales = 'free') + + scale_y_log10(name = "Speed, log (nodes / second)") + + scale_x_continuous("Precision", + breaks = c(0, 0.2, 0.4, 0.6, 0.8, 0.9, 0.92, 0.94, 0.96, 0.98, 1.0)) + + # scale_shape_discrete(name = "", solid = FALSE) + + # guides(color = FALSE) + + scale_color_manual("n. iter", values = colors_discrete(4)) + + ggtitle(expression( + atop("Precision-Performance tradeoff, effect of increasing tth vs. max_iters", + atop(italic("(n = 10000; Upper Right is Better)"))))) +``` + +A single iteration clearly has substantial impact on accuracy. The marginal benefit of additional iterations declines, but adding a second iteration is a more efficient way to improve accuracy than increasing tth. This is consistent with the recommendation of the paper authors. 
+ diff --git a/vignettes/largeVis.Rmd b/vignettes/largeVis.Rmd index afb75da..321c137 100644 --- a/vignettes/largeVis.Rmd +++ b/vignettes/largeVis.Rmd @@ -6,51 +6,78 @@ output: rmarkdown::html_vignette: fig_caption: yes bibliography: TangLZM16.bib -vignette: > - %\VignetteIndexEntry{largeVis} - %\VignetteEngine{knitr::rmarkdown} +vignette: | + %\VignetteIndexEntry{largeVis} + %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- -```{r setup,eval=T,echo=F,warning=F,error=F,message=F} -# Note to reader: Please don't steal the semi-distinctive visual style I spent several minutes creating for myself. -library(RColorBrewer,quietly=T) -library(wesanderson,quietly=T) -colors_discrete <- function(x) rep(wes_palette("Darjeeling", n = min(x,5)), - 2)[1:x] -colors_divergent_discrete <- function(x) grDevices::colorRampPalette(RColorBrewer::brewer.pal(x, "Spectral")) -colors_continuous <- function(x) wes_palette(name= "Zissou",n = x, type= "continuous") +```{r setupvignette,eval=T,echo=F,warning=F,error=F,message=F} +require(ggplot2, + quietly = TRUE) +require(RColorBrewer, + quietly = TRUE) +require(wesanderson, + quietly = TRUE) +knitr::opts_chunk$set(collapse = TRUE, + comment = "#>", + cache=FALSE) +colors_discrete <- function(x) + rep(wes_palette("Darjeeling", n = min(x, 5)), 2)[1:x] +colors_divergent_discrete <- function(x) + grDevices::colorRampPalette(RColorBrewer::brewer.pal(x, "Spectral")) +colors_continuous <- function(x) wes_palette(name = "Zissou", + n = x, + type = "continuous") nacol <- colors_discrete(4)[4] -require(ggplot2,quietly = T) theme_set( theme_bw() %+replace% theme( - legend.key.size=unit(4,"mm"), - legend.title=element_text(size=rel(0.8), face = "bold"), - legend.margin=unit(0,"cm"), - legend.position="bottom", - legend.key.size=unit(0.5,"lines"), + legend.key.size = unit(4, "mm"), + legend.title = element_text(size = rel(0.8), + face = "bold"), + legend.margin = unit(0, "cm"), + legend.position = "bottom", + legend.key.size = unit(0.5, 
"lines"), legend.text=element_text(size = unit(8, "points")), - axis.title.y = element_text(angle=90), - axis.text = element_text(size=rel(0.7)), + axis.title.y = element_text(angle = 90), + axis.text = element_text(size = rel(0.7)), plot.margin = unit(c(0, 0.5, 1, 0), "lines"), - axis.title=element_text(size=rel(0.8),face="bold"), - title = element_text(size=rel(0.9)) - ) + axis.title = element_text(size = rel(0.8), + face = "bold"), + title = element_text(size = rel(0.9)) + ) ) -require(largeVis) +rebuild <- FALSE + +require(largeVis,quietly = TRUE) ``` This Vingette provides an overview of the largeVis package. ## Introduction -The `largeVis` package offers four functions for visualizing high-dimensional datasets and finding approximate nearest neighbors, based on the `LargeVis` algorithm presented in @TangLZM16: +This package provides `LargeVis` visualizations and fast nearest-neighbor search. The `LargeVis` algorithm, presented in @tang2016visualizing, creates high-quality low-dimensional representaitons of large, high-dimensional datasets, similar to [t-SNE](https://lvdmaaten.github.io/tsne/). + +These visualizations are useful for data exploration, for visualizing complex non-linear functions, and especially for visualizing embeddings such as learned vectors for images. + +A limitation of t-SNE is that because the algorithm has complexity order $O(n^2)$, it is not feasible for use on even moderately sized datasets. [Barnes-Hut](https://arxiv.org/pdf/1301.3342.pdf), an approximation of t-SNE, has complexity $O(n \log n)$ but also quickly becomes infeasible as the size of data grows. `LargeVis` is intended to address the issue by operating in linear $O(n)$ time. It has been benchmarked at more than 30x faster than Barnes-Hut on datasets of approximately 1-million rows, and scaled linearly as long as there is sufficient RAM. + +In addition, `LargeVis` includes an algorithm for finding approximate k-Nearest Neighbors in $O(n)$ time. 
This algorithm turns out to be faster at finding accurate a-NNs than any other method I was able to test. + +The package also includes a function for visualizing image embeddings by plotting images at the locations given by the `LargeVis` algorithm. + +For a detailed description of the algorithm, please see the original paper, @tang2016visualizing. + +## Package Overview + +The `largeVis` package offers five functions for visualizing high-dimensional datasets and finding approximate nearest neighbors (along with some helper functions): 1. `randomProjectionTreeSearch`, a method for finding approximate nearest neighbors. 2. `projectKNNs`, which takes as input a weighted nearest-neighbor graph and estimates a projection into a low-dimensional space. -3. `vis`, which combines `randomProjectionTreeSearch`, `buildEdgeMatrix`, and `projectKNNs`, along with additional code to implement the `LargeVis` algorithm. -4. `manifoldMap`, which produces a plot for visualizing embeddings of images. +3. `largeVis`, which implements the entire `LargeVis` algorithm. +4. `manifoldMap` (and companion `ggManifoldMap`), which produce a plot for visualizing embeddings of images. +5. `buildWijMatrix` takes a sparse matrix of the distances between nearest neighbors, and returns one with the edges properly weighted for use in `projectKNNs`. See the [original paper](https://arxiv.org/abs/1602.00370) for a detailed description of the algorithm. @@ -62,302 +89,241 @@ If there are NA's, Infs, or NULLs in the input, `randomProjectionTreeSearch` wil 
If the number of duplicates is large, this can cause the random projection tree search to fail. If the number is small, the algorithm may identify a sufficient number of neighbors, but an error may then occur during `buildEdgeMatrix`, or stochastic gradient descent. - -## Examples - -```{r MNIST,echo=F,message=F,warning=F,results='hide',eval=F} -darch::provideMNIST(download=T) -load("data/train.RData") - -mnistCoords <- vis(t(trainData) - 0.5, K = 40, tree_threshold = 700, - n_trees = 40, max_iter = 2, verbose=F) -mnistCoords <- mnistCoords$coords -mnistCoords <- scale(t(mnistCoords)) -mnistCoords <- data.frame(mnistCoords) -colnames(mnistCoords) <- c("x", "y") -labs <- apply(trainLabels, MARGIN=1, FUN=function(x) which(x == 1)) -mnistCoords$labels <- factor(labs - 1) -``` - -```{r drawmnist,echo=F,warning=F,fig.width=3.5,fig.height=4,fig.align='center',fig.show='hold'} -load(system.file("extdata", "mnistcoords.Rda", package="largeVis")) -ggplot(mnistCoords, aes(x = x, y = y, color = labels)) + - geom_point(size = 0.1, alpha = 0.3) + - scale_x_continuous(name = "", limits = c(-2.5, 2), breaks = NULL) + - scale_y_continuous(name = "", limits = c(-2, 2.5), breaks = NULL) + - scale_color_manual(values = colors_divergent_discrete(10)(10)) + - guides(colour = guide_legend(override.aes = list(size=5))) + - ggtitle("MNIST") -``` - -```{r ldafromldavis,echo=F,eval=F} -library(LDAvis) -data("TwentyNewsgroups") -theta <- scale(t(TwentyNewsgroups$theta)) -visObj <- vis(theta, K = 100, n_trees = 20, tree_threshold = 100, - max_iter = 2) - -ngcoords <- scale(t(visObj$coords)) -ngcoords <- data.frame(ngcoords) -colnames(ngcoords) <- c("x", "y") -library(lda) -data("newsgroup.train.labels") -ngcoords$label <- factor(newsgroup.train.labels)[-1] -``` -```{r draw20ng,fig.align='center',echo=F,fig.width=3.5,fig.height=4,eval=T,warning=FALSE,error=FALSE,message=FALSE,fig.show='hold'} -load(system.file("extdata", "ngcoords.Rda", package="largeVis")) -ggplot(ngcoords, - aes(x = x, y = 
y, color = label)) + - geom_point(size = 0.4, alpha = 0.5) + - scale_color_manual(values = colors_divergent_discrete(20)(20), - guide=FALSE) + - scale_x_continuous(name = "", limits = c(-2, 2.5), breaks = NULL) + - scale_y_continuous(name = "", limits = c(-2, 2.5), breaks = NULL) + - ggtitle("20 Newsgroups") -``` - -```{r 3draw,webgl=TRUE,echo=F,eval=F,results='asis'} -# d3coords <- projectKNNs(visObj$wij, dim = 3) -# d3coords <- data.frame(scale(t(d3coords))) -# colnames(d3coords) <- c("x", "y", "z") -# d3coords$label <- factor(newsgroup.train.labels)[-1] -# library(threejs) -# rgl::plot3d(x = d3coords[,1], -# y = d3coords[,2], -# z = d3coords[,3], -# main = "20 Newsgroups", -# type = "p", -# col = c(newsgroup.train.labels, -# newsgroup.test.labels)) -``` +Duplicates in the input data are likely to cause issues. If the number of duplicates is large, this can cause the random projection tree search to fail. If the number is small, the algorithm may identify a sufficient number of neighbors, but an error may then occur during `buildEdgeMatrix`, or stochastic gradient descent. ## Overview of Functions and Hyperparameters ### `randomProjectionTreeSearch` -This function uses a two-phase algorithm to find approximate nearest neighbors. In the first phase, the algorithm creates `n_trees` binary trees dividing the space into leaves of at most `tree_threshold` nodes. A node's candidate nearest neighbors are the union of all nodes with which it shared a leaf on any of the trees. In the second phase, for each node, the algorithm looks at the candidate nearest neighbors for that node, as well as each of those nodes' candidate nearest neighbors. The logic of the algorithm is that a node's neighbors' neighbors are likely to be the node's own neighbors. In each iteration, the closest `K` candidate neighbors for each node are kept. +This function uses a two-phase algorithm to find approximate nearest neighbors. 
In the first phase, which is based on [Erik Bernhardsson](http://erikbern.com)'s [Annoy](https://github.com/spotify/annoy) algorithm, `n_trees` trees are formed by recursively dividing the space by hyperplanes until at most `tree_threshold` nodes remain in a branch. A node's candidate nearest neighbors are the union of all nodes with which it shared a leaf on any of the trees. The `largeVis` algorithm adds a second phase, neighborhood exploration, which considers, for each node, whether the candidate neighbors of the node's candidate immediate neighbors are closer. The logic of the algorithm is that a node's neighbors' neighbors are likely to be the node's own neighbors. In each iteration, the closest `K` candidate neighbors for each node are kept. -The authors of @TangLZM16 suggest that a single iteration of the second phase is generally sufficient to obtain satisfactory performance. +(Note that this implementation of `largeVis` differs from the approach taken by `Annoy`, in that `Annoy` always uses the number of features as the leaf threshold, where `largeVis` allows this to be an adjustable parameter.) -The chart below illlustrates the trade-off between performance and accuracy for the nearest-neighbor search, using various hyperparameters. The data was produced using the `benchmark.R` script in the `inst/` directory. The test data is the 1-million vector, 128-feature [SIFT Dataset](http://corpus-texmex.irisa.fr/), as per Erik Bernhardsson's [ANN Benchmark](https://github.com/erikbern/ann-benchmarks) github. +The authors of @tang2016visualizing suggest that a single iteration of the second phase is generally sufficient to obtain satisfactory performance. 
-```{r performance,echo=F,eval=F} -benchmark <- readr::read_csv(system.file("extdata", "results.csv", package="largeVis")) -colnames(benchmark) <- c("time", - "precision", - "n_trees", - "max_iters", - "threshold") -benchmark$series <- factor(paste(benchmark$n_trees, "trees,", - benchmark$max_iters, "iterations.")) -``` -```{r plotpeformance,echo=F,fig.width=3.5,fig.height=4,fig.align='center'} -load(system.file("extdata", "benchmark.Rda", package = "largeVis")) -ggplot(benchmark, aes(x = time, y = precision / 100, - group = series, color = series, - shape = series, - label =threshold)) + - geom_point(size = 1) + geom_line(size = 0.5) + - geom_text(vjust = 1, hjust = -0.1, size = 2.5) + - scale_x_continuous("Time (relative)") + - scale_y_log10("Precision", limits = c(0.1,1), - breaks = c(.1, .25, .5, .8, .9, .99)) + - scale_color_manual(values = colors_divergent_discrete(nlevels(benchmark$series))(nlevels(benchmark$series))) + - guides(color = guide_legend(nrow=3)) + - ggtitle(expression( - atop("Time vs. Precision (K = 1000)", - atop(italic("Labelled by Tree Threshold")) - ) - )) -``` - -If `randomProjectionTreeSearch` fails to find the desired number of neighbors, usually the best result is obtained by increasing the tree threshold. If `randomProjectionTreeSearch` fails with an error that no neighbors were found for some nodes, and the tree threshold is already reasonable, this may be an indication that duplicates remain in the input data. +See the vignette "ANN Benchmarks" for additional information. ### `projectKNNs` This function takes as its input a `Matrix::sparseMatrix`, of connections between nodes. The matrix must be symmetric. A non-zero cell implies that node `i` is a nearest neighbor of node `j`, vice-versa, or both. Non-zero values represent the strength of the connection relative to other nearest neighbors of the two nodes. 
-The `LargeVis` algorithm, explained in detail in @TangLZM16, estimates the embedding by sampling from the identitied nearest-neighbor connections. For each edge, the algorithm also samples `M` non-nearest neighbor negative samples. `M`, along with $\gamma$ and $\alpha$, control the visualization. $\alpha$ controls the desired distance between nearest neighbors. $\gamma$ controls the relative strength of the attractive force between nearest neighbors and repulsive force between non-neighbors. - -The following grid illustrates the effect of the $\alpha$ and $\gamma$ hyperparameters, using the `wiki` dataset which is included with the package: - -```{r wikihyperparameters,echo=F,eval=F} -data(wiki) +The `LargeVis` algorithm, explained in detail in @tang2016visualizing, estimates the embedding by sampling from the identified nearest-neighbor connections. For each edge, the algorithm also samples `M` non-nearest neighbor negative samples. `M`, along with $\gamma$ and $\alpha$, control the visualization. $\alpha$ controls the desired distance between nearest neighbors. $\gamma$ controls the relative strength of the attractive force between nearest neighbors and repulsive force between non-neighbors. 
-inputs <- data.frame( - g = rep(c(.5,1,7,14), 4), - a = rep(c(.1,1,5,10), each = 4) -) +The following grid illustrates the effect of the $\alpha$ and $\gamma$ hyperparameters: -agcoords <- do.call(rbind, lapply(1:nrow(inputs), FUN = function(x) { - a <- inputs[x, 'a'] - g <- inputs[x, 'g'] - localcoords <- projectKNNs(wiki, alpha = a, gamma = g,verbose=FALSE) - localcoords <- data.frame(scale(t(localcoords))) - colnames(localcoords) <- c("x", "y") - localcoords$a <- a - localcoords$g <- g - localcoords$activity <- log(Matrix::colSums(wiki)) - localcoords -})) +```{r reload,eval=!rebuild} +load(system.file(package = "largeVis", "extdata/vignettedata.Rda")) ``` -```{r drawhyperparameters,echo=F,fig.width=3.5,fig.height=4,fig.align='center'} -load(system.file("extdata", "agcoords.Rda", package="largeVis")) +```{r drawhyperparameters,echo=F,fig.width=3.5,fig.height=4,fig.align='center',results='asis',cache=FALSE} +if (! exists("agcoords") && rebuild) { + data(wiki) + inputs <- data.frame( + g = rep(c(.5,1,7,14), 5), + a = rep(c(0,.1,1,5,10), each = 4) + ) + wij <- buildWijMatrix(wiki, perplexity = 50) + set.seed(1974) + initialcoords <- matrix(rnorm(ncol(wij) * 2), nrow = 2) + + agcoords <- do.call(rbind, + lapply(1:nrow(inputs), + FUN = function(x) { + a <- inputs[x, 'a'] + g <- inputs[x, 'g'] + newcoords <- initialcoords + projectKNNs(wij, alpha = a, + gamma = g, + verbose = FALSE, + coords = newcoords) %>% + t() %>% + scale() %>% + data.frame() %>% + set_colnames(c("x", "y")) %>% + mutate(a = a, g = g, degree = colSums(wiki)) + })) +} + ggplot(agcoords, - aes(x = x, y = y, color = activity)) + - geom_point(alpha = 0.2, size = 0.05) + + aes(x = x, + y = y, + color = degree)) + + geom_point(alpha = 0.2, + size = 0.05) + facet_grid(a ~ g, - labeller = label_bquote(alpha == .(a), gamma == .(g)), + labeller = label_bquote(alpha == .(a), + gamma == .(g)), scales = 'free') + - scale_x_continuous(breaks=NULL,name="") + - scale_y_continuous(breaks=NULL,name = "") + - 
scale_color_gradientn(colors = colors_continuous(10), guide=FALSE) + - ggtitle(expression(paste("Effect of", alpha, "vs.", gamma, sep = " "))) + scale_x_continuous(breaks = NULL, + name = "") + + scale_y_continuous(breaks = NULL, + name = "") + + scale_color_gradientn(colors = colors_continuous(10), + guide=FALSE) + + ggtitle(expression(paste("Effect of ", alpha, " vs. ", gamma, sep = " "))) ``` The additional hyperparameters $\rho$ and `min-`$\rho$ control the starting and final learning rate for the stochastic gradient descent process. -The algorithm can treat positive edge weights in two different ways. The authors of @TangLZM16 suggest that edge weights should be used to generate a weighted sampling. However, the algorithm for taking a weighted sample runs in $O(n \log n)$. Alternatively, the edge-weights can be applied to the gradients. This is controlled by the `weight_pos_samples` parameter. +The algorithm can treat positive edge weights in two different ways. The authors of @tang2016visualizing suggest that edge weights should be used to generate a weighted sampling. However, the algorithm for taking a weighted sample runs in $O(n \log n)$. Alternatively, the edge-weights can be applied to the gradients. This is controlled by the `weight_pos_samples` parameter. ### `vis` The `vis` function combines `randomProjectionTreeSearch` and `projectKNNs`, along with additional logic for calculating edge weights, to implement the complete `LargeVis` algorithm. -The following chart illustrates the effect of the `M` and `K` parameters, using the `iris` dataset. 
- -```{r iris,echo=F,fig.width=5,fig.height=5,eval=F} -data(iris) -Ks <- c(5, 10, 20, 40) -Ms <- c(1, 5, 10, 20) -data(iris) -dat <- iris[,1:4] -dupes <- duplicated(dat) -dat <- dat[-dupes,] -labels <- iris$Species[-dupes] -dat <- scale(dat) -dat <- as.matrix(dat) -dat <- t(dat) - -inputs <- data.frame( - K = rep(Ks, length(Ms)), - M = rep(Ms, each = length(Ks)) -) -iriscoords <- do.call(rbind, lapply(1:nrow(inputs), FUN = function(x) { - K <- inputs[x, 'K'] - M <- inputs[x, 'M'] - visO <- vis(dat, K = K, M = M, verbose=FALSE) - localcoords <- data.frame(scale(t(visO$coords))) - colnames(localcoords) <- c("x", "y") - localcoords$K <- K - localcoords$M <- M - localcoords$Species <- as.integer(labels) - localcoords +The following chart illustrates the effect of the `M` and `K` parameters, using the `iris` dataset. Each row re-uses the same set of identified `K` neighbors, and initial coordinates. + +```{r drawiris,echo=F,fig.width=4,fig.height=4.5,fig.align='center',results='asis'} +if (!exists("iriscoords")) { + data(iris) + Ks <- c(5, 10,20,30) + Ms <- c(5, 10, 20) + dat <- iris[,1:4] + dupes <- duplicated(dat) + dat <- dat[!dupes,] + labels <- iris$Species[!dupes] + dat <- as.matrix(dat) + dat <- t(dat) + + set.seed(1974) + coordsinput <- matrix(rnorm(ncol(dat) * 2), nrow = 2) + + iriscoords <- do.call(rbind, lapply(Ks, FUN = function(K) { + neighbors <- randomProjectionTreeSearch(dat, + K = K, + verbose = FALSE) + edges <- buildEdgeMatrix(dat, neighbors, verbose = FALSE) + wij <- buildWijMatrix(edges) + do.call(rbind, lapply(Ms, FUN = function(M) { + coords <- projectKNNs(wij = wij, M = M, + coords = coordsinput, + verbose = TRUE, + sgd_batches = 2000000) + coords <- scale(t(coords)) + coords <- data.frame(coords) + colnames(coords) <- c("x", "y") + coords$K <- K + coords$M <- M + coords$rebuild <- 'no' + coords$Species <- as.integer(labels) + coords + })) })) -iriscoords$Species <- factor(iriscoords$Species) -levels(iriscoords$Species) <- levels(iris$Species) 
-``` -```{r drawiriscoords,echo=F,fig.width=4,fig.height=4.5,fig.align='center'} -load(system.file("extdata", "iriscoords.Rda", package="largeVis")) + iriscoords$Species <- factor(iriscoords$Species) + levels(iriscoords$Species) <- levels(iris$Species) +} + ggplot(iriscoords, aes(x = x, y = y, - color =Species)) + + color = Species)) + geom_point(size = 0.5) + - scale_x_continuous("", breaks = NULL) + - scale_y_continuous("", breaks = NULL) + - facet_grid(K ~ M, scales = 'free', labeller = label_bquote(K == .(K), M == .(M))) + + scale_x_continuous("", + breaks = NULL) + + scale_y_continuous("", + breaks = NULL) + + facet_grid(K ~ M, + scales = 'free', + labeller = label_bquote(K == .(K), M == .(M))) + scale_color_manual(values = colors_discrete(3)) + ggtitle("Effect of M and K on Iris Dataset") ``` ### `manifoldMap` -The `manifoldMap` function is useful when the examples being clustered are themselves images. Given a coordinate matrix (as generated by `projectKNNs` or `vis`) and an `array` of `N` images, the function samples `n` images and plots them at the coordinates given in the matrix. If the `transparency` parameter is a number between 0 and 1, then the function adds to each image an alpha channel where the value per pixel is proportional to $transparency *$ the image content. +The `manifoldMap` function is useful when the examples being clustered are themselves images. Given a coordinate matrix (as generated by `projectKNNs` or `vis`) and an `array` of `N` images, the function samples `n` images and plots them at the coordinates given in the matrix. -The function can plot both color and greyscale images. 
+The following code will generate the visualization shown in the examples: -The following code will plot 5000 images sampled from the MNIST dataset at positions generated by `vis`: -```{r loadmnistimages,eval=F,echo=F} -load("data/train.RData") -``` -```{r drawmanifoldmap,echo=T,fig.width=8,fig.height=8,message=F,warning=F,fig.align='center'} -if (exists("trainData")) { - dim(trainData) <- c(60000, 28, 28) - manifoldMap(mnistCoords[,1:2], - n = 5000, - scale = 0.003, - transparency = F, - images = trainData, - xlab="", ylab="", - xlim = c(-2, 2), - ylim = c(-2, 2)) -} +```{r echomanifold,echo=T,eval=F} +dim(trainData) <- c(60000, 28, 28) +trainData <- aperm(trainData, perm = c(1,3,2), resize = FALSE) +set.seed(1974) +manifoldMap(mnistCoords[,1:2], + n = 5000, + scale = 0.1, + images = trainData, + xlab = "", + ylab = "") ``` -The code is disabled by default in this vignette for data size reasons. - ## Support for Sparse Matrices -`largeVis` supports sparse matrices. Besides facilitating very large datasets, this makes it practicable to visualize term-document-matrices. - -For example, the following plot visualizes a tf-idf weighted document-term matrix for a corpus of 5000 political blog entries, as included with the `stm` package. - -```{r tdm,echo=F,eval=F} -library(stm) -data("poliblog5k") -p <- c(0, cumsum(as.numeric(lapply(poliblog5k.docs, function(x) ncol(x))))) -i <- do.call("c", lapply(poliblog5k.docs, function(x) x[1,])) -p[length(p)] <- length(i) -j <- rep(0:(length(diff(p)) - 1), diff(p)) -v <- do.call("c", lapply(poliblog5k.docs, function(x) x[2,])) -poli <- Matrix::sparseMatrix(i = i + 1, j = j + 1, x = v) -dupes <- duplicated(slam::as.simple_triplet_matrix(Matrix::t(poli))) -poli <- poli[, ! 
dupes] -poli <- poli / log(Matrix::rowSums(poli > 0)) # tf-idf weight -policoords <- vis(poli, K = 100, n_trees = 20, - tree_threshold = 100, max_iter = 10, - M=10,gamma=15, - distance_method = 'Cosine',verbose=F) -polidata <- data.frame(scale(t(policoords$coords))) -colnames(polidata) <- c('x', 'y') -polidata$rating <- poliblog5k.meta$rating[!dupes] -polidata$blog <- poliblog5k.meta$blog[!dupes] -``` -```{r drawtdm,echo=F,fig.height=4,fig.width=7} -load(system.file("extdata", "polidata.Rda", package="largeVis")) -ggplot(polidata, aes(x = x, y = y, color = blog)) + - geom_point(size = 0.3, alpha = 0.8) + - scale_color_manual(values = colors_divergent_discrete(6)(6)) + - facet_grid(. ~ rating, scale = 'free') + - scale_x_continuous("", breaks = NULL) + - scale_y_continuous("", breaks = NULL) + - ggtitle("Visualization of a tf-idf Matrix") +`largeVis` supports sparse matrices. Besides facilitating very large datasets, this makes it practicable to visualize term-document-matrices directly, and compare the result with the result of visualizing topic vectors. + +## Visualizing Graphs + +The `largeVis` visualization algorithm can be used to visualize undirected weighted or unweighted acyclic graphs. The included `wiki` dataset is an example. + +The following code illustrates how to import and visualize a graph using the YouTube-communities dataset available [here](https://snap.stanford.edu/data/com-Youtube.html). The data and visualization are not included here for size reasons. 
+ +```{r youtube,eval=F,echo=T} +youtube <- readr::read_tsv(pathToGraphFile, skip=4, col_names=FALSE) +youtube <- as.matrix(youtube) +youtube <- Matrix::sparseMatrix(i = youtube[, 1], + j = youtube[, 2], + x = rep(1, nrow(youtube)), + dims = c(max(youtube), max(youtube))) +youtube <- youtube + t(youtube) +communities <- readr::read_lines(pathToCommunities) +communities <- lapply(communities, + FUN = function(x) as.numeric(unlist(strsplit(x, "\t")))) +community_assignments <- rep(0, + nrow(youtube)) +for (i in seq_along(communities)) community_assignments[communities[[i]]] <- i + +wij <- buildWijMatrix(youtube) +youTube_coordinates <- projectKNNs(wij) +youTube_coordinates <- data.frame(scale(t(youTube_coordinates))) +colnames(youTube_coordinates) <- c("x", "y") +youTube_coordinates$community <- factor(community_assignments) +youTube_coordinates$alpha <- factor(ifelse(youTube_coordinates$community == 0, 0.05, 0.2)) +ggplot(youTube_coordinates, aes( x = x, + y = y, + color = community, + alpha = alpha, + size = alpha)) + + geom_point() + + scale_color_manual(values = + c("black", colors_continuous(5000)), + guide = FALSE) + + scale_alpha_manual(values = c(0.005, 0.2), guide = FALSE) + + scale_size_manual(values = c(0.03, 0.15), guide = FALSE) + + scale_x_continuous("", + breaks = NULL, limits = c(-2.5,2.5)) + + scale_y_continuous("", + breaks = NULL, limits = c(-2.5,2.5)) + + ggtitle("YouTube Communities") ``` ## Distance Methods -The original `LargeVis` paper used Euclidean distances exclusively. The `largeVis` package offers a choice among Euclidean and Cosine distance measures. +The original `LargeVis` paper used Euclidean distances exclusively. The `largeVis` package offers a choice between Euclidean and Cosine distance measures. + +The implementation is not optimized for cosine distances. ## Memory Consumption -The algorithm is necessarily memory-intensive for large datasets. 
`neighborsToVectors`, `distance`, and `buildEdgeMatrix` are available as separate functions to facilitate memory-efficient handling of large datasets, because the high-dimensional dataset is not needed after distances have been calculated. In this case, the workflow is: +The algorithm is necessarily memory-intensive for large datasets. -```{r eval=F,echo=T} +A simple way to reduce peak memory usage is to turn off the `save_neighbors` parameter when running `vis`. If this is insufficient, the steps of the algorithm can be run separately with the `neighborsToVectors`, `distance`, and `buildEdgeMatrix` functions. In this case, the workflow is: + +```{r lowmemexample,eval=F,echo=T} neighbors <- randomProjectionTreeSearch(largeDataset) -neighborIndices <- neighborsToVectors(neighbors) +edges <- buildEdgeMatrix(data = largeDataset, neighbors = neighbors) rm(neighbors) -distances <- distance(neighborIndices$i, - neighborIndices$j, - largeDataset) -rm(largeDataset) -wij <- buildEdgeMatrix(i = neighborIndices$i, - j = neighborIndices$j, - d = distances) -rm(distances, neighborIndices) -coords <- projectKNNs(wij$wij) +gc() +wij <- buildWijMatrix(edges) +rm(edges) +gc() +coords <- projectKNNs(wij) ``` -In testing, this method reduced peak RAM requirements by more than 70%. +Note that `gc()` is being called explicitly. The reason is that R will not collect garbage while executing the package's C++ functions, which can require substantial temporary RAM. + +Memory requirements during the neighbor search may be managed by reducing `n_trees` and increasing the `tree_threshold`. The decrease in precision is marginal, and may be compensated for by increasing `max_iter`. See the benchmarks vignette for further detail. -## Bibliography +## References + +```{r save,eval=rebuild} +save(agcoords, iriscoords, file = "vignettedata/vignettedata.Rda") +```