diff --git a/.Rbuildignore b/.Rbuildignore index b6368e0..fb619e9 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -11,10 +11,10 @@ ^wikiwWordVis\.Rda ^wij ^visscratc\.R -^wikiwordcoords\.Rda -^ng20wij\.Rda -^time\.Rda -^wordcoords\.Rda +^wikiwordcoords\.Rda$ +^ng20wij\.Rda$ +^time\.Rda$ +^wordcoords\.Rda$ ^vignettedata/$ ^vignettedata/.$ ^stm\.Rda @@ -58,4 +58,10 @@ ^poliblog/.Rda ^log4j/.spark/.log ^mnist$ -^derby/.log \ No newline at end of file +^derby/.log +^Examples/.Rmd$ +^test/.R$ +^vignettedatamnistcoords/.Rda$ +^vignettedatangcoords/.Rda$ +^wordcoordsweightingbyp/.Rda$ +^words/.png$ \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 2e0d9a9..f7ac8ce 100644 --- a/.travis.yml +++ b/.travis.yml @@ -53,4 +53,4 @@ before_install: | after_success: - - Rscript -e 'covr::codecov(branch="reference")' + - Rscript -e 'covr::codecov()' diff --git a/DESCRIPTION b/DESCRIPTION index 0a8182d..7b311f8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: largeVis Type: Package Title: High-Quality Visualizations of Large, High-Dimensional Datasets -Version: 0.1.6 +Version: 0.1.7 Author: Amos B. Elberg Maintainer: Amos Elberg Description: Implements the largeVis algorithm for visualizing very large high-dimensional datasets. Also very fast search for approximate nearest neighbors. @@ -18,7 +18,8 @@ Imports: ggplot2 (>= 0.9.2.1), dbscan LinkingTo: Rcpp,RcppProgress (>= 0.2.1),RcppArmadillo (>= 0.7.100.3.0),testthat(>= 1.0.2) -Suggests: testthat, +Suggests: + testthat, covr, knitr, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index 17473b9..1b19a8e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,6 +13,7 @@ export(buildWijMatrix) export(distance) export(ggManifoldMap) export(largeVis) +export(lof) export(manifoldMap) export(manifoldMapStretch) export(neighborsToVectors) diff --git a/NEWS.md b/NEWS.md index db1c3ed..48e444f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,47 +1,68 @@ +### largeVis 0.1.7 +* Bug fixes + + Largely reduced the "fuzzies" +* API Improvements + + Allow the seed to be set for projectKNNs and randomProjectionTreeSearch + + If a seed is given, multi-threading is disabled during sgd and the annoy phase of the neighbor search. These + phases of the algorithm would otherwise be non-deterministic. Note that the performance impact is substantial. + + Verbosity now defaults to the R system option + + The neighbor matrix returned by randomProjectionTreeSearch is now sorted by distance +* Testing + + Improved testing for cosine similarity + + Many tests are improved by ability to set seed +* Clustering + + LOF search now tested and exported. +* Refactorings & Improvements + + Refactored neighbor search to unify code for sparse and dense neighbors, substantially improving sparse performance + + Now using managed pointers in many places + ### largeVis 0.1.6 * Revisions for CRAN release, including verifying correctness by reproducing paper examples, and timing tests/benchmarks - + Tested against the paper authors' wiki-doc and wiki-word datasets - + Tested with up to 2.5m rows, 100m edges (processed in 12 hours). + + Tested against the paper authors' wiki-doc and wiki-word datasets + + Tested with up to 2.5m rows, 100m edges (processed in 12 hours). * Neighbor search: - + Dense search is much, much faster and more efficient - + Tree search for cosine distances uses normalized vectors + + Dense search is much, much faster and more efficient + + Tree search for cosine distances uses normalized vectors * projectKNNs - + Should be 10x faster for small datasets - + Replaced binary search ( O(n log n) ) with the alias algorithm for weighted sampling ( O(1) ) - + Clips and smooths gradients, per discussion with paper authors - + Optimized implementation for alpha == 1 - + Removed option for mixing weights into loss function - doesn't make sense if gradients are being clipped. - + Fixed OpenMP-related bug which caused visualizations to be "fuzzy" + + Should be 10x faster for small datasets + + Replaced binary search ( O(n log n) ) with the alias algorithm for weighted sampling ( O(1) ) + + Clips and smooths gradients, per discussion with paper authors + + Optimized implementation for alpha == 1 + + Removed option for mixing weights into loss function - doesn't make sense if gradients are being clipped. + + Fixed OpenMP-related bug which caused visualizations to be "fuzzy" + + Switched to the STL random number generator, allowing the user to set a seed for reproducible results. * Vignettes: - + Reuse initialization matrices and neighbors, to make it easier to see the effect of hyperparameters - + Benchmarks now a separate vignette, more detailed - + Examples removed from vignettes and moved to readme - + Added examples of manifold map with color faces using OpenFace vectors + + Reuse initialization matrices and neighbors, to make it easier to see the effect of hyperparameters + + Benchmarks now a separate vignette, more detailed + + Examples removed from vignettes and moved to readme + + Added examples of manifold map with color faces using OpenFace vectors * Sigms, P_ij matrix, w_ij matrix - + Replaced C++ code entirely with new code based on reference implementation - + Refactored R code into `buildEdgeMatrix()` and `buildWijMatrix()`, which are simpler. + + Replaced C++ code entirely with new code based on reference implementation + + Refactored R code into `buildEdgeMatrix()` and `buildWijMatrix()`, which are simpler. * Visualization - + Color manifold maps work - + Ported Karpathy's function for non-overlapping embeddings (experimental) - + Removed transparency parameter - + Added ggManifoldMap function for adding a manifold map to a ggplot2 plot -* vis - + Whether to return neighbors and sigmas now adjustable parameters, for memory reasons - + Runs gc() periodically + + Color manifold maps work + + Ported Karpathy's function for non-overlapping embeddings (experimental) + + Removed transparency parameter + + Added ggManifoldMap function for adding a manifold map to a ggplot2 plot +* largeVis + + vis function renamed largeVis + + Whether to return neighbors now an adjustable parameter, for memory reasons + + No longer return sigmas under any circumstance + + Runs gc() periodically * Data - + Removed most data and extdata that had been included before; this is to reduce size for CRAN submission + + Removed most data and extdata that had been included before; this is to reduce size for CRAN submission * Dependencies & Build - + Many misc changes to simplify dependencies for CRAN - + Re-added ARMA_64BIT_WORD; otherwise, could exceed the limitation on size of an arma sparse matrix with moderately sized datasets (~ 1 M rows, K = 100) - + Now depends on R >= 3.0.2, so RcppProgress and RcppArmadillo could be moved from the Depends section of the DESCRIPTION file - + Will now compile on systems that lack OpenMP (e.g., OS X systems with old versions of xcode). + + Many misc changes to simplify dependencies for CRAN + + Re-added ARMA_64BIT_WORD; otherwise, could exceed the limitation on size of an arma sparse matrix with moderately sized datasets (~ 1 M rows, K = 100) + + Now depends on R >= 3.0.2, so RcppProgress and RcppArmadillo could be moved from the Depends section of the DESCRIPTION file + + Will now compile on systems that lack OpenMP (e.g., OS X systems with old versions of xcode). * Correctness and Testing - + Tests are separated by subject - + Additional, more extensive tests with greater code coverage - + Added travis testing against OSX + + Tests are separated by subject + + Additional, more extensive tests with greater code coverage + + Added travis testing against OSX * Clustering - + Very preliminary support for dbscan and optics added + + Very preliminary support for dbscan and optics added, however these functions have not been exported. ### largeVis 0.1.5 diff --git a/R/RcppExports.R b/R/RcppExports.R index 03d91ce..0f01c4d 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -29,8 +29,8 @@ silhouetteDbscan <- function(edges, sil) { invisible(.Call('largeVis_silhouetteDbscan', PACKAGE = 'largeVis', edges, sil)) } -searchTrees <- function(threshold, n_trees, K, maxIter, data, distMethod, verbose) { - .Call('largeVis_searchTrees', PACKAGE = 'largeVis', threshold, n_trees, K, maxIter, data, distMethod, verbose) +searchTrees <- function(threshold, n_trees, K, maxIter, data, distMethod, seed, verbose) { + .Call('largeVis_searchTrees', PACKAGE = 'largeVis', threshold, n_trees, K, maxIter, data, distMethod, seed, verbose) } fastDistance <- function(is, js, data, distMethod, verbose) { @@ -49,15 +49,15 @@ referenceWij <- function(i, j, d, perplexity) { .Call('largeVis_referenceWij', PACKAGE = 'largeVis', i, j, d, perplexity) } -sgd <- function(coords, targets_i, sources_j, ps, weights, gamma, rho, n_samples, M, alpha, verbose) { - .Call('largeVis_sgd', PACKAGE = 'largeVis', coords, targets_i, sources_j, ps, weights, gamma, rho, n_samples, M, alpha, verbose) +sgd <- function(coords, targets_i, sources_j, ps, weights, gamma, rho, n_samples, M, alpha, seed, verbose) { + .Call('largeVis_sgd', PACKAGE = 'largeVis', coords, targets_i, sources_j, ps, weights, gamma, rho, n_samples, M, alpha, seed, verbose) } -searchTreesCSparse <- function(threshold, n_trees, K, maxIter, i, p, x, distMethod, verbose) { - .Call('largeVis_searchTreesCSparse', PACKAGE = 'largeVis', threshold, n_trees, K, maxIter, i, p, x, distMethod, verbose) +searchTreesCSparse <- function(threshold, n_trees, K, maxIter, i, p, x, distMethod, seed, verbose) { + .Call('largeVis_searchTreesCSparse', PACKAGE = 'largeVis', threshold, n_trees, K, maxIter, i, p, x, distMethod, seed, verbose) } -searchTreesTSparse <- function(threshold, n_trees, K, maxIter, i, j, x, distMethod, verbose) { - .Call('largeVis_searchTreesTSparse', PACKAGE = 'largeVis', threshold, n_trees, K, maxIter, i, j, x, distMethod, verbose) +searchTreesTSparse <- function(threshold, n_trees, K, maxIter, i, j, x, distMethod, seed, verbose) { + .Call('largeVis_searchTreesTSparse', PACKAGE = 'largeVis', threshold, n_trees, K, maxIter, i, j, x, distMethod, seed, verbose) } diff --git a/R/buildEdgeMatrix.R b/R/buildEdgeMatrix.R index ee8cd9e..6561703 100644 --- a/R/buildEdgeMatrix.R +++ b/R/buildEdgeMatrix.R @@ -12,7 +12,7 @@ buildEdgeMatrix <- function(data, neighbors, distance_method = "Euclidean", - verbose = TRUE) { + verbose = options("verbose")) { indices <- neighborsToVectors(neighbors) distances <- distance(indices$i, indices$j, x = data, distance_method, verbose) mat <- sparseMatrix( diff --git a/R/dbscan.R b/R/dbscan.R index c6e45dc..34e3a93 100644 --- a/R/dbscan.R +++ b/R/dbscan.R @@ -35,7 +35,7 @@ optics <- function(data = NULL, minPts = nrow(data) + 1, eps_cl, xi, - verbose = TRUE) { + verbose = getOption("verbose", TRUE)) { if (! is.null(edges) && is.null(data)) ret <- optics_e(edges = edges, eps = as.double(eps), minPts = as.integer(minPts), @@ -96,7 +96,7 @@ dbscan <- function(data = NULL, eps, minPts = nrow(data) + 1, partition = !missing(edges), - verbose = TRUE) { + verbose = getOption("verbose", TRUE)) { if (! is.null(edges) && is.null(data)) ret <- dbscan_e(edges = edges, @@ -150,8 +150,28 @@ edgeMatrixToKNNS <- function(edges) { list(dist = t(dist), id = t(id), k = k) } +# The source code for function lof is based on code that bore this license: +####################################################################### +# dbscan - Density Based Clustering of Applications with Noise +# and Related Algorithms +# Copyright (C) 2015 Michael Hahsler -#' Local Outlier Factor Score +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + +#' @title Local Outlier Factor Score #' #' @description Calculate the Local Outlier Factor (LOF) score for each data point given knowledge #' of k-Nearest Neighbors. @@ -161,21 +181,26 @@ edgeMatrixToKNNS <- function(edges) { #' @references Based on code in the \code{\link[dbscan]{dbscan}} package. #' #' @return A vector of LOF values for each data point. +#' @export lof <- function(edges) { kNNlist <- edgeMatrixToKNNS(edges) N <- nrow(kNNlist$id) K <- kNNlist$k + # lrd <- rep(0, N) lrd <- rep(0, N) - for(i in 1:N) { - input <- kNNlist$dist[c(i, kNNlist$id[i, ]) ,] - lrd[i] <- 1 / (sum(apply(input, MARGIN = 1, max)) / K) - } + # for(i in 1:N) { + # input <- kNNlist$dist[c(i, kNNlist$id[i, ]) ,] + # lrd[i] <- 1 / (sum(apply(input, MARGIN = 1, max)) / K) + # } + for(i in 1:N) lrd[i] <- 1/(sum(apply( + cbind(kNNlist$dist[kNNlist$id[i,], K], kNNlist$dist[i,]), + 1, max)) / K) - lof <- rep(0, N) - for (i in 1:N) lof[i] <- sum(lrd[kNNlist$id[i,]])/K / lrd[i] + ret <- rep(0, N) + for (i in 1:N) ret[i] <- sum(lrd[kNNlist$id[i,]])/K / lrd[i] - lof[is.nan(lof)] <- NA + ret[is.nan(ret)] <- NA - lof + ret } diff --git a/R/distance.R b/R/distance.R index f919cc0..5d1cceb 100644 --- a/R/distance.R +++ b/R/distance.R @@ -25,7 +25,7 @@ distance.matrix <- function(x, i, j, distance_method = "Euclidean", - verbose = TRUE) { + verbose = getOption("verbose", TRUE)) { return (fastDistance(i, j, x, @@ -39,7 +39,7 @@ distance.CsparseMatrix <- function(x, i, j, distance_method = "Euclidean", - verbose = TRUE) { + verbose = getOption("verbose", TRUE)) { return(fastCDistance(i, j, x@i, @@ -56,7 +56,7 @@ distance.TsparseMatrix <- function( i, j, distance_method="Euclidean", - verbose=TRUE) { + verbose = getOption("verbose", TRUE)) { return(fastSDistance(i, j, x@i, diff --git a/R/largeVis.R b/R/largeVis.R index 7152d42..5d08404 100644 --- a/R/largeVis.R +++ b/R/largeVis.R @@ -10,13 +10,7 @@ #' @param max_iter See \code{\link{randomProjectionTreeSearch}}. #' @param distance_method One of "Euclidean" or "Cosine." See \code{\link{randomProjectionTreeSearch}}. #' @param perplexity See \code{\link{buildWijMatrix}}. -#' @param sgd_batches See \code{\link{projectKNNs}}. -#' @param M See \code{\link{projectKNNs}}. -#' @param alpha See \code{\link{projectKNNs}}. -#' @param gamma See \code{\link{projectKNNs}}. -#' @param rho See \code{\link{projectKNNs}}. #' @param save_neighbors Whether to include in the output the adjacency matrix of nearest neighbors. -#' @param coords A [N,K] matrix of coordinates to use as a starting point -- useful for refining an embedding in stages. #' @param verbose Verbosity #' @param ... Additional arguments passed to \code{\link{projectKNNs}}. #' @@ -53,26 +47,18 @@ #' largeVis <- function(x, dim = 2, - K = 40, + K = 50, n_trees = 50, tree_threshold = max(10, ncol(x)), max_iter = 1, distance_method = "Euclidean", - perplexity = 50, - - sgd_batches = NULL, - M = 5, - alpha = 1, - gamma = 7, - rho = 1, - - coords = NULL, + perplexity = max(50, K / 3), save_neighbors = TRUE, - verbose = TRUE, + verbose = getOption("verbose", TRUE), ...) { ############################################# @@ -109,13 +95,7 @@ largeVis <- function(x, ####################################################### coords <- projectKNNs(wij = wij, dim = dim, - sgd_batches = sgd_batches, - M = M, - gamma = gamma, verbose = verbose, - alpha = alpha, - coords = coords, - rho = rho, ...) ####################################################### diff --git a/R/projectKNNs.R b/R/projectKNNs.R index bb9e1bd..963f676 100644 --- a/R/projectKNNs.R +++ b/R/projectKNNs.R @@ -28,8 +28,15 @@ #' enables the alternative distance function. \eqn{\alpha} below zero is meaningless. #' @param rho Initial learning rate. #' @param coords An initialized coordinate matrix. +#' @param seed Random seed to be passed to the C++ functions; sampled from hardware entropy pool if \code{NULL} (the default). +#' Note that if the seed is not \code{NULL} (the default), the maximum number of threads will be set to 1 in phases of the algorithm +#' that would otherwise be non-deterministic. #' @param verbose Verbosity #' +#' @note If specified, \code{seed} is passed to the C++ and used to initialize the random number generator. This will not, however, be +#' sufficient to ensure reproducible results, because the initial coordinate matrix is generated using the \code{R} random number generator. +#' To ensure reproducibility, call \code{\link[base]{set.seed}} before calling this function, or pass it a pre-allocated coordinate matrix. +#' #' @return A dense [N,D] matrix of the coordinates projecting the w_ij matrix into the lower-dimensional space. #' @export #' @examples @@ -50,7 +57,8 @@ projectKNNs <- function(wij, # symmetric sparse matrix alpha = 1, rho = 1, coords = NULL, - verbose = TRUE) { + seed = NULL, + verbose = getOption("verbose", TRUE)) { if (alpha < 0) stop("alpha < 0 is meaningless") N <- (length(wij@p) - 1) @@ -86,6 +94,7 @@ projectKNNs <- function(wij, # symmetric sparse matrix alpha = alpha, gamma = gamma, M = M, rho = rho, n_samples = sgd_batches, + seed, verbose = verbose) return(coords) diff --git a/R/projectionTreeSearch.R b/R/projectionTreeSearch.R index 4f2f4d1..33ea2db 100644 --- a/R/projectionTreeSearch.R +++ b/R/projectionTreeSearch.R @@ -14,6 +14,8 @@ #' using a value equivalent to the number of features in the input set. #' @param max_iter Number of iterations in the neighborhood exploration phase. #' @param distance_method One of "Euclidean" or "Cosine." +#' @param seed Random seed passed to the C++ functions. If \code{seed} is not \code{NULL} (the default), +#' the maximum number of threads will be set to 1 in phases that would be non-determinstic otherwise. #' @param verbose Whether to print verbose logging using the \code{progress} package. #' #' @return A [K, N] matrix of the approximate K nearest neighbors for each vertex. @@ -24,7 +26,8 @@ randomProjectionTreeSearch <- function(x, tree_threshold = max(10, nrow(x)), max_iter = 1, distance_method = "Euclidean", - verbose= TRUE) + seed = NULL, + verbose = getOption("verbose", TRUE)) UseMethod("randomProjectionTreeSearch") #' @export @@ -35,7 +38,8 @@ randomProjectionTreeSearch.matrix <- function(x, tree_threshold = max(10, nrow(x)), max_iter = 1, distance_method = "Euclidean", - verbose= TRUE) { + seed = NULL, + verbose = getOption("verbose", TRUE)) { if (verbose) cat("Searching for neighbors.\n") if (distance_method == "Cosine") x <- x / rowSums(x) @@ -46,6 +50,7 @@ randomProjectionTreeSearch.matrix <- function(x, maxIter = max_iter, data = x, distMethod = distance_method, + seed = seed, verbose = verbose) if (sum(colSums(knns != -1) == 0) > 0) @@ -68,7 +73,8 @@ randomProjectionTreeSearch.CsparseMatrix <- function(x, tree_threshold = max(10, nrow(x)), max_iter = 1, distance_method = "Euclidean", - verbose= TRUE) { + seed = NULL, + verbose = getOption("verbose", TRUE)) { if (verbose) cat("Searching for neighbors.\n") knns <- searchTreesCSparse(threshold = tree_threshold, @@ -79,6 +85,7 @@ randomProjectionTreeSearch.CsparseMatrix <- function(x, p = x@p, x = x@x, distMethod = distance_method, + seed = seed, verbose = verbose) if (sum(colSums(knns != -1) == 0) > 0) @@ -103,7 +110,8 @@ randomProjectionTreeSearch.TsparseMatrix <- function(x, max_iter = 1, distance_method = "Euclidean", - verbose= TRUE) { + seed = NULL, + verbose = getOption("verbose", TRUE)) { if (verbose) cat("Searching for neighbors.\n") knns <- searchTreesTSparse(threshold = tree_threshold, @@ -114,6 +122,7 @@ randomProjectionTreeSearch.TsparseMatrix <- function(x, j = x@j, x = x@x, distMethod = distance_method, + seed = seed, verbose = verbose) if (sum(colSums(knns != -1) == 0) > 0) diff --git a/README.md b/README.md index 1ae5aa0..443c760 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ largeVis ================ -[![Travis-CI Build Status](https://travis-ci.org/elbamos/largeVis.svg?branch=0.1.6)](https://travis-ci.org/elbamos/largeVis) [![Coverage Status](https://img.shields.io/codecov/c/github/elbamos/largeVis/0.1.6.svg)](https://codecov.io/gh/elbamos/largeVis/branch/0.1.6) [![https://gitter.im/elbamos/largeVis](https://badges.gitter.im/elbamos/largeVis.svg)](https://gitter.im/elbamos/largeVis?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/elbamos/largeVis?branch=0.1.6&svg=true)](https://ci.appveyor.com/project/elbamos/largeVis?branch=0.1.6) +[![Travis-CI Build Status](https://travis-ci.org/elbamos/largeVis.svg?branch=0.1.7)](https://travis-ci.org/elbamos/largeVis) [![Coverage Status](https://img.shields.io/codecov/c/github/elbamos/largeVis/0.1.7.svg)](https://codecov.io/gh/elbamos/largeVis/branch/0.1.7) [![https://gitter.im/elbamos/largeVis](https://badges.gitter.im/elbamos/largeVis.svg)](https://gitter.im/elbamos/largeVis?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/elbamos/largeVis?branch=0.1.7&svg=true)](https://ci.appveyor.com/project/elbamos/largeVis?branch=0.1.7) This is an implementation of the `largeVis` algorithm described in (). It also incorporates a very fast algorithm for estimating k-nearest neighbors, implemented in C++ with `Rcpp` and `OpenMP`, and for visualizing a map of the manifold like [this](http://cs.stanford.edu/people/karpathy/cnnembed/). diff --git a/man/buildEdgeMatrix.Rd b/man/buildEdgeMatrix.Rd index 9d9e624..af3cf15 100644 --- a/man/buildEdgeMatrix.Rd +++ b/man/buildEdgeMatrix.Rd @@ -5,7 +5,7 @@ \title{Build an nearest-neighbor graph weighted by distance.} \usage{ buildEdgeMatrix(data, neighbors, distance_method = "Euclidean", - verbose = TRUE) + verbose = options("verbose")) } \arguments{ \item{data}{A matrix with a number of columns equal to the number of columns in `x`} diff --git a/man/dbscan.Rd b/man/dbscan.Rd index 2e4c981..53da5ed 100644 --- a/man/dbscan.Rd +++ b/man/dbscan.Rd @@ -5,7 +5,8 @@ \title{dbscan} \usage{ dbscan(data = NULL, neighbors = NULL, edges = NULL, eps, - minPts = nrow(data) + 1, partition = !missing(edges), verbose = TRUE) + minPts = nrow(data) + 1, partition = !missing(edges), + verbose = getOption("verbose", TRUE)) } \arguments{ \item{data}{Input data, where examples are columns.} diff --git a/man/distance.Rd b/man/distance.Rd index a629e34..9a42f7f 100644 --- a/man/distance.Rd +++ b/man/distance.Rd @@ -10,13 +10,13 @@ distance(x, i, j, distance_method, verbose) \method{distance}{matrix}(x, i, j, distance_method = "Euclidean", - verbose = TRUE) + verbose = getOption("verbose", TRUE)) \method{distance}{CsparseMatrix}(x, i, j, distance_method = "Euclidean", - verbose = TRUE) + verbose = getOption("verbose", TRUE)) \method{distance}{TsparseMatrix}(x, i, j, distance_method = "Euclidean", - verbose = TRUE) + verbose = getOption("verbose", TRUE)) } \arguments{ \item{x}{A (potentially sparse) matrix, where examples are columns and features are rows.} diff --git a/man/largeVis.Rd b/man/largeVis.Rd index c8ae8fe..5640aac 100644 --- a/man/largeVis.Rd +++ b/man/largeVis.Rd @@ -6,10 +6,10 @@ \alias{largeVis-package} \title{largeVis: high-quality visualizations for large, high-dimensionality datasets} \usage{ -largeVis(x, dim = 2, K = 40, n_trees = 50, tree_threshold = max(10, - ncol(x)), max_iter = 1, distance_method = "Euclidean", perplexity = 50, - sgd_batches = NULL, M = 5, alpha = 1, gamma = 7, rho = 1, - coords = NULL, save_neighbors = TRUE, verbose = TRUE, ...) +largeVis(x, dim = 2, K = 50, n_trees = 50, tree_threshold = max(10, + ncol(x)), max_iter = 1, distance_method = "Euclidean", + perplexity = max(50, K/3), save_neighbors = TRUE, + verbose = getOption("verbose", TRUE), ...) } \arguments{ \item{x}{A matrix, where the features are rows and the examples are columns.} @@ -30,18 +30,6 @@ in the input set.} \item{perplexity}{See \code{\link{buildWijMatrix}}.} -\item{sgd_batches}{See \code{\link{projectKNNs}}.} - -\item{M}{See \code{\link{projectKNNs}}.} - -\item{alpha}{See \code{\link{projectKNNs}}.} - -\item{gamma}{See \code{\link{projectKNNs}}.} - -\item{rho}{See \code{\link{projectKNNs}}.} - -\item{coords}{A [N,K] matrix of coordinates to use as a starting point -- useful for refining an embedding in stages.} - \item{save_neighbors}{Whether to include in the output the adjacency matrix of nearest neighbors.} \item{verbose}{Verbosity} diff --git a/man/optics.Rd b/man/optics.Rd index 0a02190..51eed0f 100644 --- a/man/optics.Rd +++ b/man/optics.Rd @@ -5,7 +5,7 @@ \title{OPTICS} \usage{ optics(data = NULL, neighbors = NULL, edges = NULL, eps, - minPts = nrow(data) + 1, eps_cl, xi, verbose = TRUE) + minPts = nrow(data) + 1, eps_cl, xi, verbose = getOption("verbose", TRUE)) } \arguments{ \item{data}{Input data, where examples are columns.} diff --git a/man/projectKNNs.Rd b/man/projectKNNs.Rd index ebcc2bb..22c4ffd 100644 --- a/man/projectKNNs.Rd +++ b/man/projectKNNs.Rd @@ -5,7 +5,8 @@ \title{Project a distance matrix into a lower-dimensional space.} \usage{ projectKNNs(wij, dim = 2, sgd_batches = NULL, M = 5, gamma = 7, - alpha = 1, rho = 1, coords = NULL, verbose = TRUE) + alpha = 1, rho = 1, coords = NULL, seed = NULL, + verbose = getOption("verbose", TRUE)) } \arguments{ \item{wij}{A symmetric sparse matrix of edge weights, in C-compressed format, as created with the \code{Matrix} package.} @@ -27,6 +28,10 @@ enables the alternative distance function. \eqn{\alpha} below zero is meaningles \item{coords}{An initialized coordinate matrix.} +\item{seed}{Random seed to be passed to the C++ functions; sampled from hardware entropy pool if \code{NULL} (the default). +Note that if the seed is not \code{NULL} (the default), the maximum number of threads will be set to 1 in phases of the algorithm +that would otherwise be non-deterministic.} + \item{verbose}{Verbosity} } \value{ @@ -49,6 +54,11 @@ an alternative probabilistic function, \eqn{1 / (1 + \exp(x^2))} will be used in Note that the input matrix should be symmetric. If any columns in the matrix are empty, the function will fail. } +\note{ +If specified, \code{seed} is passed to the C++ and used to initialize the random number generator. This will not, however, be +sufficient to ensure reproducible results, because the initial coordinate matrix is generated using the \code{R} random number generator. +To ensure reproducibility, call \code{\link[base]{set.seed}} before calling this function, or pass it a pre-allocated coordinate matrix. +} \examples{ \dontrun{ data(wiki) diff --git a/man/randomProjectionTreeSearch.Rd b/man/randomProjectionTreeSearch.Rd index b540015..6de1f86 100644 --- a/man/randomProjectionTreeSearch.Rd +++ b/man/randomProjectionTreeSearch.Rd @@ -9,19 +9,23 @@ \usage{ randomProjectionTreeSearch(x, K = 150, n_trees = 50, tree_threshold = max(10, nrow(x)), max_iter = 1, - distance_method = "Euclidean", verbose = TRUE) + distance_method = "Euclidean", seed = NULL, + verbose = getOption("verbose", TRUE)) \method{randomProjectionTreeSearch}{matrix}(x, K = 150, n_trees = 50, tree_threshold = max(10, nrow(x)), max_iter = 1, - distance_method = "Euclidean", verbose = TRUE) + distance_method = "Euclidean", seed = NULL, + verbose = getOption("verbose", TRUE)) \method{randomProjectionTreeSearch}{CsparseMatrix}(x, K = 150, n_trees = 50, tree_threshold = max(10, nrow(x)), max_iter = 1, - distance_method = "Euclidean", verbose = TRUE) + distance_method = "Euclidean", seed = NULL, + verbose = getOption("verbose", TRUE)) \method{randomProjectionTreeSearch}{TsparseMatrix}(x, K = 150, n_trees = 50, tree_threshold = max(10, nrow(x)), max_iter = 1, - distance_method = "Euclidean", verbose = TRUE) + distance_method = "Euclidean", seed = NULL, + verbose = getOption("verbose", TRUE)) } \arguments{ \item{x}{A (potentially sparse) matrix, where examples are columnns and features are rows.} @@ -37,6 +41,9 @@ using a value equivalent to the number of features in the input set.} \item{distance_method}{One of "Euclidean" or "Cosine."} +\item{seed}{Random seed passed to the C++ functions. If \code{seed} is not \code{NULL} (the default), +the maximum number of threads will be set to 1 in phases that would be non-determinstic otherwise.} + \item{verbose}{Whether to print verbose logging using the \code{progress} package.} } \value{ diff --git a/src/Makevars b/src/Makevars index c637c19..59376bc 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,4 +1,4 @@ PKG_CFLAGS = $(SHLIB_OPENMP_CFLAGS) -PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(FLIBS) $(LAPACK_LIBS) $(BLAS_LIBS) -PKG_CXXFLAGS = $(SHLIB_OPENMP_CFLAGS) -DARMA_64BIT_WORD -CXX_STD=CXX11 +PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_OPENMP_CFLAGS) $(FLIBS) $(LAPACK_LIBS) $(BLAS_LIBS) +PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -DARMA_64BIT_WORD +CXX_STD=CXX11 \ No newline at end of file diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 74bed6e..2ca7b38 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -106,8 +106,8 @@ BEGIN_RCPP END_RCPP } // searchTrees -arma::imat searchTrees(const int& threshold, const int& n_trees, const int& K, const int& maxIter, const arma::mat& data, const std::string& distMethod, bool verbose); -RcppExport SEXP largeVis_searchTrees(SEXP thresholdSEXP, SEXP n_treesSEXP, SEXP KSEXP, SEXP maxIterSEXP, SEXP dataSEXP, SEXP distMethodSEXP, SEXP verboseSEXP) { +arma::imat searchTrees(const int& threshold, const int& n_trees, const int& K, const int& maxIter, const arma::mat& data, const std::string& distMethod, Rcpp::Nullable< Rcpp::NumericVector> seed, bool verbose); +RcppExport SEXP largeVis_searchTrees(SEXP thresholdSEXP, SEXP n_treesSEXP, SEXP KSEXP, SEXP maxIterSEXP, SEXP dataSEXP, SEXP distMethodSEXP, SEXP seedSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; @@ -117,8 +117,9 @@ BEGIN_RCPP Rcpp::traits::input_parameter< const int& >::type maxIter(maxIterSEXP); Rcpp::traits::input_parameter< const arma::mat& >::type data(dataSEXP); Rcpp::traits::input_parameter< const std::string& >::type distMethod(distMethodSEXP); + Rcpp::traits::input_parameter< Rcpp::Nullable< Rcpp::NumericVector> >::type seed(seedSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); - __result = Rcpp::wrap(searchTrees(threshold, n_trees, K, maxIter, data, distMethod, verbose)); + __result = Rcpp::wrap(searchTrees(threshold, n_trees, K, maxIter, data, distMethod, seed, verbose)); return __result; END_RCPP } @@ -186,29 +187,30 @@ BEGIN_RCPP END_RCPP } // sgd -arma::mat sgd(arma::mat coords, arma::ivec& targets_i, arma::ivec& sources_j, IntegerVector& ps, NumericVector& weights, const double gamma, const double rho, const long long n_samples, const int M, const double alpha, const bool verbose); -RcppExport SEXP largeVis_sgd(SEXP coordsSEXP, SEXP targets_iSEXP, SEXP sources_jSEXP, SEXP psSEXP, SEXP weightsSEXP, SEXP gammaSEXP, SEXP rhoSEXP, SEXP n_samplesSEXP, SEXP MSEXP, SEXP alphaSEXP, SEXP verboseSEXP) { +arma::mat sgd(arma::mat coords, arma::ivec& targets_i, arma::ivec& sources_j, arma::ivec& ps, arma::vec& weights, const double gamma, const double rho, const long long n_samples, const int M, const double alpha, const Rcpp::Nullable seed, const bool verbose); +RcppExport SEXP largeVis_sgd(SEXP coordsSEXP, SEXP targets_iSEXP, SEXP sources_jSEXP, SEXP psSEXP, SEXP weightsSEXP, SEXP gammaSEXP, SEXP rhoSEXP, SEXP n_samplesSEXP, SEXP MSEXP, SEXP alphaSEXP, SEXP seedSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< arma::mat >::type coords(coordsSEXP); Rcpp::traits::input_parameter< arma::ivec& >::type targets_i(targets_iSEXP); Rcpp::traits::input_parameter< arma::ivec& >::type sources_j(sources_jSEXP); - Rcpp::traits::input_parameter< IntegerVector& >::type ps(psSEXP); - Rcpp::traits::input_parameter< NumericVector& >::type weights(weightsSEXP); + Rcpp::traits::input_parameter< arma::ivec& >::type ps(psSEXP); + Rcpp::traits::input_parameter< arma::vec& >::type weights(weightsSEXP); Rcpp::traits::input_parameter< const double >::type gamma(gammaSEXP); Rcpp::traits::input_parameter< const double >::type rho(rhoSEXP); Rcpp::traits::input_parameter< const long long >::type n_samples(n_samplesSEXP); Rcpp::traits::input_parameter< const int >::type M(MSEXP); Rcpp::traits::input_parameter< const double >::type alpha(alphaSEXP); + Rcpp::traits::input_parameter< const Rcpp::Nullable >::type seed(seedSEXP); Rcpp::traits::input_parameter< const bool >::type verbose(verboseSEXP); - __result = Rcpp::wrap(sgd(coords, targets_i, sources_j, ps, weights, gamma, rho, n_samples, M, alpha, verbose)); + __result = Rcpp::wrap(sgd(coords, targets_i, sources_j, ps, weights, gamma, rho, n_samples, M, alpha, seed, verbose)); return __result; END_RCPP } // searchTreesCSparse -arma::mat searchTreesCSparse(const int& threshold, const int& n_trees, const int& K, const int& maxIter, const arma::uvec& i, const arma::uvec& p, const arma::vec& x, const std::string& distMethod, bool verbose); -RcppExport SEXP largeVis_searchTreesCSparse(SEXP thresholdSEXP, SEXP n_treesSEXP, SEXP KSEXP, SEXP maxIterSEXP, SEXP iSEXP, SEXP pSEXP, SEXP xSEXP, SEXP distMethodSEXP, SEXP verboseSEXP) { +arma::imat searchTreesCSparse(const int& threshold, const int& n_trees, const int& K, const int& maxIter, const arma::uvec& i, const arma::uvec& p, const arma::vec& x, const std::string& distMethod, Rcpp::Nullable< Rcpp::NumericVector> seed, bool verbose); +RcppExport SEXP largeVis_searchTreesCSparse(SEXP thresholdSEXP, SEXP n_treesSEXP, SEXP KSEXP, SEXP maxIterSEXP, SEXP iSEXP, SEXP pSEXP, SEXP xSEXP, SEXP distMethodSEXP, SEXP seedSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; @@ -220,14 +222,15 @@ BEGIN_RCPP Rcpp::traits::input_parameter< const arma::uvec& >::type p(pSEXP); Rcpp::traits::input_parameter< const arma::vec& >::type x(xSEXP); Rcpp::traits::input_parameter< const std::string& >::type distMethod(distMethodSEXP); + Rcpp::traits::input_parameter< Rcpp::Nullable< Rcpp::NumericVector> >::type seed(seedSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); - __result = Rcpp::wrap(searchTreesCSparse(threshold, n_trees, K, maxIter, i, p, x, distMethod, verbose)); + __result = Rcpp::wrap(searchTreesCSparse(threshold, n_trees, K, maxIter, i, p, x, distMethod, seed, verbose)); return __result; END_RCPP } // searchTreesTSparse -arma::mat searchTreesTSparse(const int& threshold, const int& n_trees, const int& K, const int& maxIter, const arma::uvec& i, const arma::uvec& j, const arma::vec& x, const std::string& distMethod, bool verbose); -RcppExport SEXP largeVis_searchTreesTSparse(SEXP thresholdSEXP, SEXP n_treesSEXP, SEXP KSEXP, SEXP maxIterSEXP, SEXP iSEXP, SEXP jSEXP, SEXP xSEXP, SEXP distMethodSEXP, SEXP verboseSEXP) { +arma::imat searchTreesTSparse(const int& threshold, const int& n_trees, const int& K, const int& maxIter, const arma::uvec& i, const arma::uvec& j, const arma::vec& x, const std::string& distMethod, Rcpp::Nullable< Rcpp::NumericVector> seed, bool verbose); +RcppExport SEXP largeVis_searchTreesTSparse(SEXP thresholdSEXP, SEXP n_treesSEXP, SEXP KSEXP, SEXP maxIterSEXP, SEXP iSEXP, SEXP jSEXP, SEXP xSEXP, SEXP distMethodSEXP, SEXP seedSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; @@ -239,8 +242,9 @@ BEGIN_RCPP Rcpp::traits::input_parameter< const arma::uvec& >::type j(jSEXP); Rcpp::traits::input_parameter< const arma::vec& >::type x(xSEXP); Rcpp::traits::input_parameter< const std::string& >::type distMethod(distMethodSEXP); + Rcpp::traits::input_parameter< Rcpp::Nullable< Rcpp::NumericVector> >::type seed(seedSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); - __result = Rcpp::wrap(searchTreesTSparse(threshold, n_trees, K, maxIter, i, j, x, distMethod, verbose)); + __result = Rcpp::wrap(searchTreesTSparse(threshold, n_trees, K, maxIter, i, j, x, distMethod, seed, verbose)); return __result; END_RCPP } diff --git a/src/denseneighbors.cpp b/src/denseneighbors.cpp index 8ea7c5c..18de30e 100644 --- a/src/denseneighbors.cpp +++ b/src/denseneighbors.cpp @@ -2,203 +2,58 @@ // [[Rcpp::plugins(cpp11)]] // [[Rcpp::depends(RcppArmadillo)]] // [[Rcpp::depends(RcppProgress)]] -#include "largeVis.h" +#include "neighbors.h" using namespace Rcpp; using namespace std; using namespace arma; -/* -* When a leaf node is found, store the identity of the neighbors -* along with each vertex. -*/ -void addNeighbors(const arma::ivec& indices, - Neighborhood* heap[], - const int I) { - ivec neighbors = ivec(indices); - Neighborhood tmpStorage = Neighborhood(); - ivec::iterator newEnd = neighbors.end(); -#ifdef _OPENMP -#pragma omp critical -#endif -{ - for (ivec::iterator it = neighbors.begin(); - it != newEnd; - it++) { - tmpStorage.clear(); - tmpStorage.swap(*heap[*it]); - heap[*it] -> reserve(tmpStorage.size() + I); - ivec::iterator newIt = neighbors.begin(); - vector::iterator oldIt = tmpStorage.begin(); - vector::iterator oldEnd = tmpStorage.end(); - int last; - int best = -1; - while (oldIt != oldEnd || newIt != newEnd) { - if (oldIt == oldEnd) best = *newIt++; - else if (newIt == newEnd) best = *oldIt++; - else best = (*newIt < *oldIt) ? *newIt++ : *oldIt++; - if (best == last || best == *it) continue; - heap[*it] -> push_back(best); - last = best; - } - } -} -} - -arma::vec hyperplane(const arma::ivec& indices, - const arma::mat& data, - const int I) { - vec direction = vec(indices.size()); - int x1idx, x2idx; - vec v; - vec m; - do { - const vec selections = randu(2) * (I - 1); - x1idx = indices[selections[0]]; - x2idx = indices[selections[1]]; - if (x1idx == x2idx) x2idx = indices[((int)selections[1] + 1) % indices.size()]; - const vec x2 = data.col(x2idx); - const vec x1 = data.col(x1idx); - // Get hyperplane - m = (x1 + x2) / 2; // Base point of hyperplane - const vec d = x1 - x2; - v = d / as_scalar(norm(d, 2)); // unit vector - } while (x1idx == x2idx); - - for (int i = 0; i < indices.size(); i++) { - const vec X = data.col(indices[i]); - direction[i] = dot((X - m), v); - } - return direction; -} - -/* -* The recursive function for the annoy neighbor search -* algorithm. Partitions space by a random hyperplane, -* and calls itself recursively (twice) on each side. -* -* If called with fewer nodes than the threshold, -* -*/ -void searchTree(const int& threshold, - const arma::ivec& indices, - const arma::mat& data, - Neighborhood* heap[], - Progress& progress) { - const int I = indices.size(); - // const int D = data.n_rows; - if (progress.check_abort()) return; - if (I < 2) stop("Tree split failure."); - if (I <= threshold) { - addNeighbors(indices, heap, I); - progress.increment(I); - return; - } - vec direction = hyperplane(indices, data, I); - const double middle = median(direction); - const uvec left = find(direction > middle); - const uvec right = find(direction <= middle); - - if (left.size() >= 2 && right.size() >= 2) { - searchTree(threshold, indices(left), data, heap, progress); - searchTree(threshold, indices(right), data, heap, progress); - } else { // Handles the rare case where the split fails because of equidistant points - searchTree(threshold, indices.subvec(0, indices.size() / 2), data, heap, progress); - searchTree(threshold, indices.subvec(indices.size() / 2, indices.size() - 1), data, heap, progress); - } +class EuclideanAdder : public DistanceAdder { +protected: + virtual distancetype distanceFunction(const arma::vec& x_i, const arma::vec& x_j) { + return relDist(x_i, x_j); + } +public: + EuclideanAdder(const arma::mat& data, const kidxtype K) : DistanceAdder(data, K) {} }; -Neighborhood** createNeighborhood(int N) { - Neighborhood** treeNeighborhoods = new Neighborhood*[N]; - for (int i = 0; i < N; i++) { - int seed[] = {i}; - treeNeighborhoods[i] = new vector(seed, seed + sizeof(seed) / sizeof(int)); - } - return treeNeighborhoods; -} - -void copyHeapToMatrix(set* tree, - const int K, - const int i, - arma::imat& knns) { - set::iterator sortIterator = tree -> begin(); - set::iterator end = tree -> end(); - int j = 0; - while (sortIterator != end) knns(j++, i) = *sortIterator++; - if (j == 0) stop("Tree failure."); - while (j < K) knns(j++, i) = -1; -} - -void addDistance(const arma::vec& x_i, - const arma::mat& data, - const int j, - MaxHeap& heap, - const int K, - double (*distanceFunction)(const arma::vec& x_i, const arma::vec& x_j)) { - const double d = distanceFunction(x_i, data.col(j)); - if (d != 0) { - heap.emplace(d, j); - if (heap.size() > K) heap.pop(); - } -} - -void heapToSet(MaxHeap& heap, set* set) { - while (! heap.empty()) { - set -> emplace(heap.top().n); - heap.pop(); - } -} - -arma::imat annoy(const int n_trees, - const int threshold, - const arma::mat& data, - const int N, - const int K, - double (*distanceFunction)(const arma::vec& x_i, const arma::vec& x_j), - Progress& p) { - set** treeHolder = new set*[N]; - Neighborhood** treeNeighborhoods = createNeighborhood(N); - const ivec indices = regspace(0, N - 1); +class CosineAdder : public DistanceAdder { +protected: + virtual distancetype distanceFunction(const arma::vec& x_i, const arma::vec& x_j) { + return cosDist(x_i, x_j); + } +public: + CosineAdder(const arma::mat& data, const kidxtype K) : DistanceAdder(data, K) {} +}; -#ifdef _OPENMP -#pragma omp parallel for shared(treeNeighborhoods) -#endif - for (int t = 0; t < n_trees; t++) if (! p.check_abort()) - searchTree(threshold, - indices, - data, - treeNeighborhoods, - p - ); - if (p.check_abort()) return imat(K, N); - // Reduce size from threshold * n_trees to top K, and sort - MaxHeap thisHeap = MaxHeap(); -#ifdef _OPENMP -#pragma omp parallel for shared(treeHolder, treeNeighborhoods) private(thisHeap) -#endif - for (int i = 0; i < N; i++) if (p.increment()) { - const vec x_i = data.col(i); - vector *neighborhood = treeNeighborhoods[i]; - for (vector::iterator j = neighborhood -> begin(); - j != neighborhood -> end(); - j++) - addDistance(x_i, data, *j, thisHeap, K, distanceFunction); - delete treeNeighborhoods[i]; - treeHolder[i] = new set(); - heapToSet(thisHeap, treeHolder[i]); - } - // Copy sorted neighborhoods into matrix. This is faster than - // sorting in-place. - imat knns = imat(K,N); -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int i = 0; i < N; i++) if (p.increment()) { - copyHeapToMatrix(treeHolder[i], K, i, knns); - delete treeHolder[i]; - } - return knns; -} +class DenseAnnoySearch : public AnnoySearch { +protected: + virtual arma::vec hyperplane(const arma::ivec& indices) { + vec direction = vec(indices.size()); + vertexidxtype x1idx, x2idx; + vec v; + vec m; + do { + x1idx = indices[sample(indices.n_elem)]; + x2idx = indices[sample(indices.n_elem)]; + if (x1idx == x2idx) x2idx = indices[sample(indices.n_elem)]; + const vec x2 = data.col(x2idx); + const vec x1 = data.col(x1idx); + // Get hyperplane + m = (x1 + x2) / 2; // Base point of hyperplane + const vec d = x1 - x2; + v = d / as_scalar(norm(d, 2)); // unit vector + } while (x1idx == x2idx); + + for (vertexidxtype i = 0; i < indices.n_elem; i++) { + const vec X = data.col(indices[i]); + direction[i] = dot((X - m), v); + } + return direction; + } +public: + DenseAnnoySearch(const arma::mat& data, Progress& p) : AnnoySearch(data, p) {} +}; // [[Rcpp::export]] arma::imat searchTrees(const int& threshold, @@ -207,95 +62,22 @@ arma::imat searchTrees(const int& threshold, const int& maxIter, const arma::mat& data, const std::string& distMethod, + Rcpp::Nullable< Rcpp::NumericVector> seed, bool verbose) { - - const int N = data.n_cols; - - double (*distanceFunction)(const arma::vec& x_i, const arma::vec& x_j); - if (distMethod.compare(std::string("Euclidean")) == 0) distanceFunction = relDist; - else if (distMethod.compare(std::string("Cosine")) == 0) distanceFunction = cosDist; - else distanceFunction = relDist; + const vertexidxtype N = data.n_cols; + std::shared_ptr< DistanceAdder > adder; + if (distMethod.compare(std::string("Cosine")) == 0) adder = std::shared_ptr< DistanceAdder >(new CosineAdder(data, K)); + else adder = std::shared_ptr< DistanceAdder >(new EuclideanAdder(data, K)); Progress p((N * n_trees) + (2 * N) + (N * maxIter), verbose); - imat knns; - { - mat dataMat; - if (distMethod.compare(std::string("Cosine")) == 0) dataMat = normalise(data); - else dataMat = data; - knns = annoy(n_trees, - threshold, - dataMat, - N, - K, - distanceFunction, - p); - } - - if (p.check_abort()) return imat(0); - imat old_knns = imat(K,N); - for (int T = 0; T < maxIter; T++) if (! p.check_abort()) { - imat tmp = old_knns; - old_knns = knns; - knns = tmp; - MaxHeap thisHeap = MaxHeap(); - set sorter = set(); -#ifdef _OPENMP -#pragma omp parallel for shared(old_knns, knns) private(thisHeap, sorter) -#endif - for (int i = 0; i < N; i++) if (p.increment()) { - const vec x_i = data.col(i); - - PositionVector positions = PositionVector(), ends = PositionVector(); - positions.reserve(K + 1); ends.reserve(K + 1); - - positions.push_back(old_knns.begin_col(i)); - ends.push_back(old_knns.end_col(i)); - - for (imat::col_iterator it = old_knns.begin_col(i); - it != ends[0] && *it != -1; - it++) { - positions.push_back(old_knns.begin_col(*it)); - ends.push_back(old_knns.end_col(*it)); - } - - int lastOne = N + 1; - // This is a K + 1 vector merge sort running in O(K * N) - PositionVector::iterator theEnd = positions.end(); - while (true) { - imat::col_iterator whch = 0; - - for (pair< PositionVector::iterator, - PositionVector::iterator > it(positions.begin(), - ends.begin()); - it.first != theEnd; - it.first++, it.second++) while (*it.first != *it.second) { // For each neighborhood, keep going until - // we find a non-dupe or get to the end - - if (**it.first == -1) advance(*it.first, distance(*it.first, *it.second)); - else if (**it.first == i || **it.first == lastOne) advance(*it.first, 1); - else if (whch == 0 || **it.first < *whch) { - whch = *it.first; - break; - } else break; - } - if (whch == 0) break; - lastOne = *whch; - advance(whch, 1); - - addDistance(x_i, data, lastOne, thisHeap, K, distanceFunction); - } - - sorter.clear(); - heapToSet(thisHeap, &sorter); - - set::iterator sortIterator = sorter.begin(); - int j = 0; - while (sortIterator != sorter.end()) knns(j++, i) = *sortIterator++; - if (j == 0) stop("Neighbor exploration failure."); - while (j < K) knns(j++,i) = -1; - } - } - return knns; -}; - + mat dataMat; + if (distMethod.compare(std::string("Cosine")) == 0) dataMat = normalise(data, 2, 0); + else dataMat = data; + DenseAnnoySearch annoy = DenseAnnoySearch(dataMat, p); + annoy.setSeed(seed); + annoy.trees(n_trees, threshold); + annoy.reduce(K, adder); + annoy.convertToMatrix(K); + return (maxIter == 0) ? annoy.getMatrix(adder) : annoy.exploreNeighborhood(maxIter, adder); +} \ No newline at end of file diff --git a/src/distance.cpp b/src/distance.cpp index 366825a..08fc015 100644 --- a/src/distance.cpp +++ b/src/distance.cpp @@ -4,39 +4,39 @@ // [[Rcpp::depends(RcppProgress)]] #include "largeVis.h" -double relDist(const arma::vec& i, const arma::vec& j) { - const int lim = i.n_elem; - double cnt = 0; - for (int idx = 0; idx < lim; idx++) cnt += ((i[idx] - j[idx]) * (i[idx] - j[idx])); +distancetype relDist(const arma::vec& i, const arma::vec& j) { + const dimidxtype D = i.n_elem; + distancetype cnt = 0; + for (dimidxtype idx = 0; idx < D; idx++) cnt += ((i[idx] - j[idx]) * (i[idx] - j[idx])); return cnt; } // Vanilla euclidean -double dist(const arma::vec& i, const arma::vec& j) { +distancetype dist(const arma::vec& i, const arma::vec& j) { return sqrt(relDist(i,j)); } // Vanilla cosine distance calculation -double cosDist(const arma::vec& i, const arma::vec& j) { - int D = i.n_elem; - double pp = 0, qq = 0, pq = 0; - for (int d = 0; d < D; d++) { +distancetype cosDist(const arma::vec& i, const arma::vec& j) { + const dimidxtype D = i.n_elem; + distancetype pp = 0, qq = 0, pq = 0; + for (dimidxtype d = 0; d < D; d++) { pp += (i[d]) * (i[d]); qq += (j[d]) * (j[d]); pq += (i[d]) * (j[d]); } - double ppqq = pp * qq; + distancetype ppqq = pp * qq; if (ppqq > 0) return 2.0 - 2.0 * pq / sqrt(ppqq); else return 2.0; // cos is 0 } // Versions of the distance functions for finding the neighbors // of sparse matrices. Not optimized. -double sparseDist(const sp_mat& i, const sp_mat& j) { +distancetype sparseDist(const sp_mat& i, const sp_mat& j) { return as_scalar(sqrt(sum(square(i - j)))); } -double sparseCosDist(const sp_mat& i, const sp_mat& j) { +distancetype sparseCosDist(const sp_mat& i, const sp_mat& j) { return 2.0 - 2.0 * (as_scalar((dot(i,j)) / as_scalar(norm(i,2) * norm(j,2)))); } -double sparseRelDist(const sp_mat& i, const sp_mat& j) { +distancetype sparseRelDist(const sp_mat& i, const sp_mat& j) { return as_scalar(sum(square(i - j))); } @@ -52,13 +52,13 @@ arma::vec fastDistance(const NumericVector is, Progress p(is.size(), verbose); vec xs = vec(is.size()); - double (*distanceFunction)(const arma::vec& x_i, const arma::vec& x_j); + distancetype (*distanceFunction)(const arma::vec& x_i, const arma::vec& x_j); if (distMethod.compare(std::string("Euclidean")) == 0) distanceFunction = dist; else if (distMethod.compare(std::string("Cosine")) == 0) distanceFunction = cosDist; #ifdef _OPENMP #pragma omp parallel for shared (xs) #endif - for (int i=0; i < is.length(); i++) if (p.increment()) xs[i] = + for (vertexidxtype i=0; i < is.length(); i++) if (p.increment()) xs[i] = distanceFunction(data.col(is[i]), data.col(js[i])); return xs; }; @@ -71,7 +71,7 @@ arma::vec fastSparseDistance(const arma::vec& is, Progress p(is.size(), verbose); vec xs = vec(is.size()); - double (*distanceFunction)( + distancetype (*distanceFunction)( const sp_mat& x_i, const sp_mat& x_j); if (distMethod.compare(std::string("Euclidean")) == 0) distanceFunction = sparseDist; @@ -79,7 +79,7 @@ arma::vec fastSparseDistance(const arma::vec& is, #ifdef _OPENMP #pragma omp parallel for shared (xs) #endif - for (int i=0; i < is.size(); i++) if (p.increment()) xs[i] = + for (vertexidxtype i=0; i < is.n_elem; i++) if (p.increment()) xs[i] = distanceFunction(data.col(is[i]), data.col(js[i])); return xs; }; @@ -92,7 +92,7 @@ arma::vec fastCDistance(const arma::vec& is, const arma::vec& x, const std::string& distMethod, bool verbose) { - const int N = p_locations.size() - 1; + const vertexidxtype N = p_locations.n_elem - 1; const sp_mat data = sp_mat(i_locations, p_locations, x, N, N); return fastSparseDistance(is,js,data,distMethod,verbose); } diff --git a/src/edgeweights.cpp b/src/edgeweights.cpp index 11beae5..f00df4e 100644 --- a/src/edgeweights.cpp +++ b/src/edgeweights.cpp @@ -12,10 +12,11 @@ class ReferenceEdges { protected: // arma::vec sigmas; const double perplexity; - const long long n_edges; - const int n_vertices; - std::vector edge_from, edge_to, head, next, reverse; - std::vector edge_weight; + const edgeidxtype n_edges; + const vertexidxtype n_vertices; + std::vector< vertexidxtype > edge_from, edge_to; + std::vector< edgeidxtype> head, next, reverse; + std::vector< double > edge_weight; public: ReferenceEdges(double perplexity, @@ -24,16 +25,16 @@ class ReferenceEdges { const arma::vec& weights) : perplexity{perplexity}, n_edges(from.size()), n_vertices(from[(long) n_edges - 1] + 1), - edge_from(std::vector()), - edge_to(std::vector()), - head(std::vector()), - next(std::vector()), - reverse(std::vector()), + edge_from(std::vector< vertexidxtype >()), + edge_to(std::vector< vertexidxtype >()), + head(std::vector< edgeidxtype >()), + next(std::vector< edgeidxtype >()), + reverse(std::vector< edgeidxtype >()), edge_weight(std::vector()) { // sigmas = vec(n_vertices); - long n_edge = 0; - for (int i = 0; i < n_vertices; i++) head.push_back(-1); - for (int x = 0; x < n_vertices; x++) { + edgeidxtype n_edge = 0; + for (vertexidxtype i = 0; i < n_vertices; i++) head.push_back(-1); + for (vertexidxtype x = 0; x < n_vertices; x++) { while (from[n_edge] == x) { edge_from.push_back(x); edge_to.push_back(to[n_edge]); @@ -44,9 +45,10 @@ class ReferenceEdges { } } } - void similarityOne(long id) { + + void similarityOne(vertexidxtype id) { double beta, lo_beta, hi_beta, sum_weight, H, tmp; - long p; + vertexidxtype p; beta = 1; lo_beta = hi_beta = -1; @@ -75,8 +77,9 @@ class ReferenceEdges { // sigmas[id] = beta; } - void searchReverse(int id) { - long long y, p, q; + void searchReverse(vertexidxtype id) { + vertexidxtype y; + edgeidxtype p, q; for (p = head[id]; p >= 0; p = next[p]) { y = edge_to[p]; for (q = head[id]; q >= 0; q = next[q]) { @@ -88,19 +91,19 @@ class ReferenceEdges { void run() { #pragma omp parallel for - for (int id = 0; id < n_vertices; id++) { + for (vertexidxtype id = 0; id < n_vertices; id++) { similarityOne(id); } #pragma omp parallel for - for (int id = 0; id < n_vertices; id++) { + for (vertexidxtype id = 0; id < n_vertices; id++) { searchReverse(id); } - long long n_edge = edge_to.size(); + edgeidxtype n_edge = edge_to.size(); double sum_weight = 0; - for (int id = 0; id != n_vertices; id++) { - for (long long p = head[id]; p >= 0; p = next[p]) { - long long y = edge_to[p]; - long long q = reverse[p]; + for (vertexidxtype id = 0; id != n_vertices; id++) { + for (edgeidxtype p = head[id]; p >= 0; p = next[p]) { + vertexidxtype y = edge_to[p]; + edgeidxtype q = reverse[p]; if (q == -1) { edge_from.push_back(y); edge_to.push_back(id); @@ -120,7 +123,7 @@ class ReferenceEdges { arma::sp_mat getWIJ() { umat locations = umat(2, edge_from.size()); vec values = vec(edge_weight.size()); - for (long long i = 0; i < edge_from.size(); i++) { + for (edgeidxtype i = 0; i < edge_from.size(); i++) { locations(0, i) = edge_from[i]; locations(1, i) = edge_to[i]; values[i] = edge_weight[i]; diff --git a/src/gradients.cpp b/src/gradients.cpp index 3584c68..ecdda71 100644 --- a/src/gradients.cpp +++ b/src/gradients.cpp @@ -33,23 +33,23 @@ // Parent class -void Gradient::positiveGradient(const double* i, - const double* j, - double* holder) const { +void Gradient::positiveGradient(const coordinatetype* i, + const coordinatetype* j, + coordinatetype* holder) const { const double dist_squared = distAndVector(i, j, holder); _positiveGradient(dist_squared, holder); } -void Gradient::negativeGradient(const double* i, - const double* k, - double* holder) const { +void Gradient::negativeGradient(const coordinatetype* i, + const coordinatetype* k, + coordinatetype* holder) const { const double dist_squared = distAndVector(i, k, holder); _negativeGradient(dist_squared, holder); } // Copies the vector sums into a vector while it computes distance^2 - // useful in calculating the gradients during SGD -inline double Gradient::distAndVector(const double *x_i, - const double *x_j, - double *output) const { +inline double Gradient::distAndVector(const coordinatetype *x_i, + const coordinatetype *x_j, + coordinatetype *output) const { double cnt = 0; for (int d = 0; d < D; d++) { double t = x_i[d] - x_j[d]; @@ -59,47 +59,47 @@ inline double Gradient::distAndVector(const double *x_i, return cnt; } -inline double Gradient::clamp(double val) const { +inline coordinatetype Gradient::clamp(coordinatetype val) const { return fmin(fmax(val, -cap), cap); } -inline void Gradient::multModify(double *col, const double adj) const { - for (int i = 0; i != D; i++) col[i] = clamp(col[i] * adj); +inline void Gradient::multModify(coordinatetype *col, const coordinatetype adj) const { + for (dimidxtype i = 0; i != D; i++) col[i] = clamp(col[i] * adj); } /* * Generalized gradient with an alpha parameter */ -void AlphaGradient::_positiveGradient(const double dist_squared, - double* holder) const { - const double grad = twoalpha / (1 + alpha * dist_squared); +void AlphaGradient::_positiveGradient(const distancetype dist_squared, + coordinatetype* holder) const { + const distancetype grad = twoalpha / (1 + alpha * dist_squared); multModify(holder, grad); } -void AlphaGradient::_negativeGradient(const double dist_squared, - double* holder) const { - const double adk = alpha * dist_squared; - const double grad = alphagamma / (dist_squared * (adk + 1)); +void AlphaGradient::_negativeGradient(const distancetype dist_squared, + coordinatetype* holder) const { + const distancetype adk = alpha * dist_squared; + const distancetype grad = alphagamma / (dist_squared * (adk + 1)); multModify(holder, grad); } /* * Optimized gradient for alpha == 1 */ -AlphaOneGradient::AlphaOneGradient(const double g, - const int d) : AlphaGradient(1, g, d) { +AlphaOneGradient::AlphaOneGradient(const distancetype g, + const dimidxtype d) : AlphaGradient(1, g, d) { } -void AlphaOneGradient::_positiveGradient(const double dist_squared, - double* holder) const { - const double grad = - 2 / (1 + dist_squared); +void AlphaOneGradient::_positiveGradient(const distancetype dist_squared, + coordinatetype* holder) const { + const distancetype grad = - 2 / (1 + dist_squared); multModify(holder, grad); } -void AlphaOneGradient::_negativeGradient(const double dist_squared, - double* holder) const { - const double grad = alphagamma / (1 + dist_squared) / (0.1 + dist_squared); +void AlphaOneGradient::_negativeGradient(const distancetype dist_squared, + coordinatetype* holder) const { + const distancetype grad = alphagamma / (1 + dist_squared) / (0.1 + dist_squared); multModify(holder, grad); } @@ -108,16 +108,16 @@ void AlphaOneGradient::_negativeGradient(const double dist_squared, */ -void ExpGradient::_positiveGradient(const double dist_squared, - double* holder) const { - const double expsq = exp(dist_squared); - const double grad = (dist_squared > 4) ? -1 : +void ExpGradient::_positiveGradient(const distancetype dist_squared, + coordinatetype* holder) const { + const distancetype expsq = exp(dist_squared); + const distancetype grad = (dist_squared > 4) ? -1 : -(expsq / (expsq + 1)); multModify(holder, grad); } -void ExpGradient::_negativeGradient(const double dist_squared, - double* holder) const { - const double grad = (dist_squared > gammagamma) ? 0 : +void ExpGradient::_negativeGradient(const distancetype dist_squared, + coordinatetype* holder) const { + const distancetype grad = (dist_squared > gammagamma) ? 0 : gamma / (1 + exp(dist_squared)); multModify(holder, grad); } diff --git a/src/largeVis.cpp b/src/largeVis.cpp index 75577b6..39d6a53 100644 --- a/src/largeVis.cpp +++ b/src/largeVis.cpp @@ -10,61 +10,96 @@ using namespace arma; class Visualizer { protected: - const int D; + const dimidxtype D; const int M; const int M2; - long long * const targetPointer; - long long * const sourcePointer; - double * const coordsPtr; - const long long n_samples; + vertexidxtype * const targetPointer; + vertexidxtype * const sourcePointer; + coordinatetype * const coordsPtr; + const iterationtype n_samples; - double rho; - double rhoIncrement; + distancetype rho; + distancetype rhoIncrement; - AliasTable negAlias; - AliasTable posAlias; - Gradient* grad; + AliasTable< vertexidxtype > negAlias; + AliasTable< edgeidxtype > posAlias; + std::shared_ptr< Gradient > grad; - IntegerVector ps; + vertexidxtype* ps; + + int storedThreads = 0; public: - Visualizer(long long * sourcePtr, - long long * targetPtr, - int D, - double * coordPtr, + Visualizer(vertexidxtype * sourcePtr, + vertexidxtype * targetPtr, + dimidxtype D, + coordinatetype * coordPtr, int M, - double rho, - long long n_samples) : D{D}, M{M}, M2(M * 2), + distancetype rho, + iterationtype n_samples) : D{D}, M{M}, M2(M * 2), targetPointer{targetPtr}, sourcePointer{sourcePtr}, coordsPtr{coordPtr}, n_samples{n_samples}, rho{rho}, - rhoIncrement(rho / n_samples) { } - - void initAlias(IntegerVector& newps, - const NumericVector& weights) { - ps = newps; - NumericVector pdiffs = pow(diff(newps), 0.75); - negAlias.initialize(pdiffs); - posAlias.initialize(weights); - negAlias.initRandom(); - posAlias.initRandom(); - } + rhoIncrement((rho - 0.0001) / n_samples) { } +#ifdef _OPENMP + ~Visualizer() { + if (storedThreads > 0) omp_set_num_threads(storedThreads); + } +#endif - void setGradient(Gradient * newGrad) { - grad = newGrad; + void initAlias(arma::ivec& newps, + const arma::vec& weights, + const arma::ivec& targets, + Rcpp::Nullable seed) { + vertexidxtype N = newps.n_elem - 1; + ps = newps.memptr(); + distancetype* negweights = new distancetype[N]; + for (vertexidxtype n = 0; n < N; n++) negweights[n] = 0; + for (vertexidxtype p = 0; p < N; p++) { + for (edgeidxtype e = newps[p]; + e != newps[p + 1]; + e++) { + //negweights[targets[e]] += weights[e]; + negweights[p] += weights[e]; + } + } + distancetype sum_weight = 0; + for (vertexidxtype n = 0; n < N; n++) sum_weight += negweights[n] = pow(negweights[n], 0.75); + negAlias.initialize(negweights, N); + posAlias.initialize(weights.memptr(), weights.n_elem); + + if (seed.isNotNull()) { +#ifdef _OPENMP + storedThreads = omp_get_max_threads(); + omp_set_num_threads(1); +#endif + long long innerSeed = Rcpp::NumericVector(seed)[0]; + innerSeed = negAlias.initRandom(innerSeed); + posAlias.initRandom(innerSeed); + } else { + negAlias.initRandom(); + posAlias.initRandom(); + } + } + + void setGradient(double alpha, double gamma, dimidxtype D) { + if (alpha == 0) grad = std::shared_ptr< Gradient > (new ExpGradient(gamma, D)); + else if (alpha == 1) grad = std::shared_ptr< Gradient > (new AlphaOneGradient(gamma, D)); + else grad = std::shared_ptr< Gradient > (new AlphaGradient(alpha, gamma, D)); } - void operator()(long long startSampleIdx, int batchSize) { - long long e_ij; - int i, j, k, d, m, shortcircuit, example = 0; - double firstholder[10], secondholder[10]; - double * y_i, * y_j; - long long * searchBegin, * searchEnd; + void operator()(iterationtype startSampleIdx, int batchSize) { + edgeidxtype e_ij; + int m, shortcircuit, example = 0; + vertexidxtype i, j, k; + dimidxtype d; + vertexidxtype * searchBegin, * searchEnd; + coordinatetype firstholder[10], secondholder[10], * y_i, * y_j; - double localRho = rho; + distancetype localRho = rho; while (example++ != batchSize && localRho > 0) { // * (1 - (startSampleIdx / n_samples)); e_ij = posAlias(); @@ -72,15 +107,15 @@ class Visualizer { i = sourcePointer[e_ij]; y_i = coordsPtr + (i * D); - y_j = coordsPtr + (j * D); + grad -> positiveGradient(y_i, y_j, firstholder); + for (d = 0; d != D; d++) y_j[d] -= firstholder[d] * localRho; searchBegin = targetPointer + ps[i]; searchEnd = targetPointer + ps[i + 1]; shortcircuit = 0; m = 0; - while (m != M && shortcircuit != M2) { k = negAlias(); shortcircuit++; @@ -95,7 +130,6 @@ class Visualizer { y_j = coordsPtr + (k * D); grad -> negativeGradient(y_i, y_j, secondholder); - for (d = 0; d != D; d++) y_j[d] -= secondholder[d] * localRho; for (d = 0; d != D; d++) firstholder[d] += secondholder[d]; } @@ -103,6 +137,14 @@ class Visualizer { localRho -= rhoIncrement; } rho -= (rhoIncrement * batchSize); +//#ifdef _OPENMP +//#pragma omp critical +//#endif/ +// {/ +// actualIterationCount += batchSize; +// rho = initialRho * (1 - actualIterationCount / n_samples + 1.0); + // rho -= (rhoIncrement * batchSize);/ +// } } }; @@ -110,44 +152,40 @@ class Visualizer { arma::mat sgd(arma::mat coords, arma::ivec& targets_i, // vary randomly arma::ivec& sources_j, // ordered - IntegerVector& ps, // N+1 length vector of indices to start of each row j in vector is - NumericVector& weights, // w{ij} + arma::ivec& ps, // N+1 length vector of indices to start of each row j in vector is + arma::vec& weights, // w{ij} const double gamma, const double rho, const long long n_samples, const int M, const double alpha, + const Rcpp::Nullable seed, const bool verbose) { - Progress progress(n_samples, verbose); - int D = coords.n_rows; + dimidxtype D = coords.n_rows; if (D > 10) stop("Limit of 10 dimensions for low-dimensional space."); Visualizer v(sources_j.memptr(), targets_i.memptr(), coords.n_rows, - coords.memptr(), + (coordinatetype*) coords.memptr(), M, - rho, - n_samples); - v.initAlias(ps, weights); - - if (alpha == 0) v.setGradient(new ExpGradient(gamma, D)); - else if (alpha == 1) v.setGradient(new AlphaOneGradient(gamma, D)); - else v.setGradient(new AlphaGradient(alpha, gamma, D)); - + (distancetype) rho, + (iterationtype) n_samples); + v.initAlias(ps, weights, targets_i, seed); + v.setGradient(alpha, gamma, D); const int batchSize = 8192; - const long long barrier = (n_samples * .95 < n_samples - coords.n_cols) ? n_samples * .95 : n_samples - coords.n_cols; + const iterationtype barrier = (n_samples * .95 < n_samples - coords.n_cols) ? n_samples * .95 : n_samples - coords.n_cols; #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (long long eIdx = 0; eIdx < barrier; eIdx += batchSize) if (progress.increment(batchSize)) { + for (iterationtype eIdx = 0; eIdx < barrier; eIdx += batchSize) if (progress.increment(batchSize)) { v(eIdx, batchSize); } #ifdef _OPENMP #pragma omp barrier #endif - for (long long eIdx = barrier; eIdx < n_samples; eIdx += batchSize) if (progress.increment(batchSize)) v(eIdx, batchSize); + for (iterationtype eIdx = barrier; eIdx < n_samples; eIdx += batchSize) if (progress.increment(batchSize)) v(eIdx, batchSize); return coords; }; diff --git a/src/largeVis.h b/src/largeVis.h index 9f7f177..e781269 100644 --- a/src/largeVis.h +++ b/src/largeVis.h @@ -12,56 +12,31 @@ #include #include #include +#include using namespace Rcpp; using namespace std; using namespace arma; - /* - * Neighbor search + * Global types */ -struct HeapObject { - double d; - int n; - HeapObject(double d, int n) : d(d), n(n) {} - bool operator<(const struct HeapObject& other) const { - return d < other.d; - } -}; -typedef priority_queue MaxHeap; -typedef vector< imat::col_iterator > PositionVector; -typedef vector Neighborhood; -Neighborhood** createNeighborhood(int N); -void copyHeapToMatrix(set* tree, - const int K, - const int i, - arma::imat& knns); -void addDistance(const arma::vec& x_i, - const arma::mat& data, - const int j, - MaxHeap& heap, - const int K, - double (*distanceFunction)(const arma::vec& x_i, const arma::vec& x_j)); -void heapToSet(MaxHeap& heap, set* set); -arma::imat annoy(const int n_trees, - const int threshold, - const arma::mat& data, - const int N, - const int K, - double (*distanceFunction)(const arma::vec& x_i, const arma::vec& x_j), - Progress& p); -void addNeighbors(const arma::ivec& indices, - Neighborhood* heap[], - const int I); +typedef double distancetype; +typedef double coordinatetype; +typedef long long vertexidxtype; +typedef long long edgeidxtype; +typedef long long iterationtype; +typedef int dimidxtype; +typedef int kidxtype; + /* * Distance Functions */ -double dist(const arma::vec& i, const arma::vec& j); -double relDist(const arma::vec& i, const arma::vec& j); -double cosDist(const arma::vec& i, const arma::vec& j); -double sparseDist(const sp_mat& i, const sp_mat& j); -double sparseRelDist(const sp_mat& i, const sp_mat& j); -double sparseCosDist(const sp_mat& i, const sp_mat& j); +distancetype dist(const arma::vec& i, const arma::vec& j); +distancetype relDist(const arma::vec& i, const arma::vec& j); +distancetype cosDist(const arma::vec& i, const arma::vec& j); +distancetype sparseDist(const sp_mat& i, const sp_mat& j); +distancetype sparseRelDist(const sp_mat& i, const sp_mat& j); +distancetype sparseCosDist(const sp_mat& i, const sp_mat& j); // Exported distance functions for high dimensional space arma::vec fastDistance(const NumericVector is, @@ -92,34 +67,27 @@ arma::vec fastSDistance(const arma::vec& is, /* * Functions related to the alias algorithm */ -typedef double realsies; template class AliasTable { private: - T N = 0; - realsies* probs = NULL; - T* aliases = NULL; + std::unique_ptr< coordinatetype[] > probs; + std::unique_ptr< T[] > aliases; + std::uniform_real_distribution< coordinatetype > rnd = std::uniform_real_distribution< coordinatetype >(); + std::mt19937_64 mt; + T N; public: - AliasTable() {} - AliasTable(T N) : N{N} { - probs = new realsies[N]; - aliases = new T[N]; + AliasTable() { } - void initialize(const NumericVector& weights) { - if (N == 0) { - N = weights.size(); - probs = new realsies[N]; - aliases = new T[N]; - } - // AliasTable(const NumericVector& weights) : - // N(weights.size()), - // probs(new realsies[N]), - // aliases(new T[N]) { - const long double sm = sum(weights); - for (T i = 0; i < N; i++) probs[i] = weights[i] * N / sm; + void initialize(const distancetype* weights, T N) { + this -> N = N; + probs = std::unique_ptr< coordinatetype[] >( new coordinatetype[N] ); + aliases = std::unique_ptr< T[] >(new T[N]); + distancetype sm = 0; + for (T i = 0; i != N; i++) sm += weights[i]; + for (T i = 0; i != N; i++) probs[i] = weights[i] * N / sm; queue small = queue(); queue large = queue(); for (T i = 0; i < N; i++) ((probs[i] < 1) ? @@ -148,39 +116,24 @@ class AliasTable { if (accu > 1e-5) warning("Numerical instability in alias table " + to_string(accu)); }; - T search(realsies random, realsies random2) const { + T search(coordinatetype random, coordinatetype random2) const { T candidate = random * N; return (random2 >= probs[candidate]) ? aliases[candidate] : candidate; }; - // const gsl_rng_type *gsl_T = NULL; - // gsl_rng *gsl_r = NULL; - // - // void initRandom() { - // initRandom(314159265); - // } - // void initRandom(long seed) { - // gsl_T = gsl_rng_rand48; - // gsl_r = gsl_rng_alloc(gsl_T); - // gsl_rng_set(gsl_r, seed); - // } - std::uniform_real_distribution rnd; - std::mt19937_64 mt; - - void initRandom(long seed) { + long long initRandom(long long seed) { mt = mt19937_64(seed); - rnd = uniform_real_distribution(); + return mt(); } void initRandom() { std::random_device seed; initRandom(seed()); } - + coordinatetype getRand() { + return rnd(mt); + } T operator()() { - realsies dub1 = rnd(mt); - realsies dub2 = rnd(mt); - return search(dub1, dub2); - // return search(gsl_rng_uniform(gsl_r), gsl_rng_uniform(gsl_r)); + return search(rnd(mt), rnd(mt)); } }; @@ -189,43 +142,43 @@ class AliasTable { */ class Gradient { protected: - const double gamma; - double cap; - const int D; - Gradient(const double g, - const int d) : gamma{g}, cap(5), D{d} {}; - virtual void _positiveGradient(const double dist_squared, - double* holder) const = 0; - virtual void _negativeGradient(const double dist_squared, - double* holder) const = 0; - inline void multModify(double *col, const double adj) const; - inline double clamp(double val) const; + const distancetype gamma; + distancetype cap; + const dimidxtype D; + Gradient(const distancetype g, + const dimidxtype d) : gamma{g}, cap(5), D{d} {}; + virtual void _positiveGradient(const distancetype dist_squared, + coordinatetype* holder) const = 0; + virtual void _negativeGradient(const distancetype dist_squared, + coordinatetype* holder) const = 0; + inline void multModify(coordinatetype *col, coordinatetype adj) const; + inline coordinatetype clamp(coordinatetype val) const; public: - virtual void positiveGradient(const double* i, - const double* j, - double* holder) const; - virtual void negativeGradient(const double* i, - const double* k, - double* holder) const; - inline double distAndVector(const double *x_i, - const double *x_j, - double *output) const; + virtual void positiveGradient(const coordinatetype* i, + const coordinatetype* j, + coordinatetype* holder) const; + virtual void negativeGradient(const coordinatetype* i, + const coordinatetype* k, + coordinatetype* holder) const; + inline distancetype distAndVector(const coordinatetype *x_i, + const coordinatetype *x_j, + coordinatetype *output) const; }; class AlphaGradient: public Gradient { - const double alpha; - const double twoalpha; + const coordinatetype alpha; + const coordinatetype twoalpha; protected: - const double alphagamma; + const coordinatetype alphagamma; virtual void _positiveGradient(const double dist_squared, - double* holder) const; + coordinatetype* holder) const; virtual void _negativeGradient(const double dist_squared, - double* holder) const; + coordinatetype* holder) const; public: - AlphaGradient(const double a, - const double g, - const int d) : Gradient(g, d), + AlphaGradient(const distancetype a, + const distancetype g, + const dimidxtype D) : Gradient(g, D), alpha{a}, twoalpha(alpha * -2), alphagamma(alpha * gamma * 2) { } ; @@ -233,26 +186,26 @@ class AlphaGradient: public Gradient { class AlphaOneGradient: public AlphaGradient { public: - AlphaOneGradient(const double g, - const int d); + AlphaOneGradient(const distancetype g, + const dimidxtype D); protected: - virtual void _positiveGradient(const double dist_squared, - double* holder) const; - virtual void _negativeGradient(const double dist_squared, - double* holder) const; + virtual void _positiveGradient(const distancetype dist_squared, + coordinatetype* holder) const; + virtual void _negativeGradient(const distancetype dist_squared, + coordinatetype* holder) const; }; class ExpGradient: public Gradient { public: - const double gammagamma; - ExpGradient(const double g, const int d) : Gradient(g, d), + const coordinatetype gammagamma; + ExpGradient(const distancetype g, const dimidxtype d) : Gradient(g, d), gammagamma(gamma * gamma) { cap = gamma; }; protected: - virtual void _positiveGradient(const double dist_squared, - double* holder) const; - virtual void _negativeGradient(const double dist_squared, - double* holder) const; + virtual void _positiveGradient(const distancetype dist_squared, + coordinatetype* holder) const; + virtual void _negativeGradient(const distancetype dist_squared, + coordinatetype* holder) const; }; #endif diff --git a/src/neighbors.h b/src/neighbors.h new file mode 100644 index 0000000..c0d4fd6 --- /dev/null +++ b/src/neighbors.h @@ -0,0 +1,310 @@ +#ifndef _LARGEVISNEIGHBORS +#define _LARGEVISNEIGHBORS +#include "largeVis.h" + +using namespace Rcpp; +using namespace std; +using namespace arma; + +struct HeapObject { + distancetype d; + vertexidxtype n; + HeapObject(distancetype d, vertexidxtype n) : d(d), n(n) {} + bool operator<(const struct HeapObject& other) const { + return d < other.d; + } +}; +typedef priority_queue MaxHeap; +typedef vector< imat::col_iterator > PositionVector; +typedef vector< vertexidxtype > Neighborhood; + +template +class DistanceAdder { +protected: + const M& data; + const kidxtype K; + virtual double distanceFunction(const V& x_i, const V& x_j) = 0; + DistanceAdder(const M& data, + const kidxtype K) : data{data}, K{K} {} +public: + void add(MaxHeap& thisHeap, + const V& x_i, + const vertexidxtype j) { + const distancetype d = distanceFunction(x_i, data.col(j)); + thisHeap.emplace(d, j); + if (thisHeap.size() > K) thisHeap.pop(); + } +}; + +template +class AnnoySearch { +protected: + const M& data; + const vertexidxtype N; + Progress& p; + std::unique_ptr< Neighborhood[] > treeNeighborhoods; + std::unique_ptr< set< vertexidxtype >[] > treeHolder; + imat knns; + + int threshold = 0; + int storedThreads = 0; + + std::uniform_real_distribution rnd; + std::mt19937_64 mt; + + virtual arma::vec hyperplane(const arma::ivec& indices) = 0; + + void addNeighbors(const arma::ivec& indices) { + ivec neighbors = ivec(indices); + Neighborhood tmpStorage = Neighborhood(); + ivec::iterator newEnd = neighbors.end(); +#ifdef _OPENMP +#pragma omp critical +#endif + { + for (ivec::iterator it = neighbors.begin(); + it != newEnd; + it++) { + tmpStorage.clear(); + tmpStorage.swap(treeNeighborhoods[*it]); + treeNeighborhoods[*it].reserve(tmpStorage.size() + indices.n_elem); + ivec::iterator newIt = neighbors.begin(); + vector< vertexidxtype >::iterator oldIt = tmpStorage.begin(); + vector< vertexidxtype >::iterator oldEnd = tmpStorage.end(); + vertexidxtype last; + vertexidxtype best = -1; + while (oldIt != oldEnd || newIt != newEnd) { + if (oldIt == oldEnd) best = *newIt++; + else if (newIt == newEnd) best = *oldIt++; + else best = (*newIt < *oldIt) ? *newIt++ : *oldIt++; + if (best == last || best == *it) continue; + treeNeighborhoods[*it].push_back(best); + last = best; + } + } + } + } + + void recurse(const arma::ivec& indices) { + const vertexidxtype I = indices.n_elem; + // const int D = data.n_rows; + if (p.check_abort()) return; + if (I < 2) stop("Tree split failure."); + if (I <= threshold) { + addNeighbors(indices); + p.increment(I); + return; + } + + vec direction = hyperplane(indices); + const distancetype middle = median(direction); + const uvec left = find(direction > middle); + const uvec right = find(direction <= middle); + + if (left.size() >= 2 && right.size() >= 2) { + recurse(indices(left)); + recurse(indices(right)); + } else { // Handles the rare case where the split fails because of equidistant points + recurse(indices.subvec(0, indices.size() / 2)); + recurse(indices.subvec(indices.size() / 2, indices.size() - 1)); + } + }; + + inline long sample(long i) { + return (long) (rnd(mt) * (i - 1)); + } + + inline void copyHeapToMatrix(set< vertexidxtype >& tree, + const kidxtype K, + const vertexidxtype i) { + set< vertexidxtype >::iterator sortIterator = tree.begin(); + set< vertexidxtype >::iterator end = tree.end(); + vertexidxtype j = 0; + while (sortIterator != end) knns(j++, i) = *sortIterator++; + if (j == 0) stop("Tree failure."); + while (j < K) knns(j++, i) = -1; + } + + inline void heapToSet(MaxHeap& thisHeap, + set< vertexidxtype >& theSet) const { + while (! thisHeap.empty()) { + theSet.emplace(thisHeap.top().n); + thisHeap.pop(); + } + } + +public: + AnnoySearch(const M& data, Progress& p) : data{data}, N(data.n_cols), p{p} { } + + void setSeed(Rcpp::Nullable< Rcpp::NumericVector > seed) { + long innerSeed; + if (seed.isNotNull()) { +#ifdef _OPENMP + storedThreads = omp_get_max_threads(); + omp_set_num_threads(1); +#endif + innerSeed = NumericVector(seed)[0]; + } else { + std::random_device hardseed; + innerSeed = hardseed(); + } + mt = mt19937_64(innerSeed); + } + + void trees(const int n_trees, const int newThreshold) { + threshold = newThreshold; + treeNeighborhoods = std::unique_ptr< Neighborhood[] >(new Neighborhood[N]); + for (vertexidxtype i = 0; i < N; i++) { + treeNeighborhoods[i].push_back(i); + } + const ivec indices = regspace(0, data.n_cols - 1); +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int t = 0; t < n_trees; t++) if (! p.check_abort()) { + recurse(indices); + } +#ifdef _OPENMP + if (storedThreads > 0) omp_set_num_threads(storedThreads); +#endif + } + + void reduce(const kidxtype K, + std::shared_ptr< DistanceAdder > adder) { + treeHolder = std::unique_ptr< set[] >(new set< vertexidxtype >[N]); +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (vertexidxtype i = 0; i < N; i++) if (p.increment()) { + const V x_i = data.col(i); + MaxHeap thisHeap = MaxHeap(); + vector< vertexidxtype > neighborhood = treeNeighborhoods[i]; + for (vector< vertexidxtype >::iterator j = neighborhood.begin(); + j != neighborhood.end(); + j++) + adder -> add(thisHeap, x_i, *j); + treeNeighborhoods[i].clear(); + treeHolder[i] = set< vertexidxtype >(); + heapToSet(thisHeap, treeHolder[i]); + } + } + + void convertToMatrix(const kidxtype K) { + knns = imat(K,N); +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (vertexidxtype i = 0; i < N; i++) if (p.increment()) { + copyHeapToMatrix(treeHolder[i], K, i); + treeHolder[i].clear(); + } + } + + /* + * Re-sort by distance. + */ + arma::imat getMatrix(std::shared_ptr< DistanceAdder > adder) { + const kidxtype K = knns.n_rows; +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (vertexidxtype i = 0; i < N; i++) if (p.increment()) { + const V x_i = data.col(i); + MaxHeap thisHeap = MaxHeap(); + for (imat::col_iterator it = knns.begin_col(i); + it != knns.end_col(i); + it++) { + adder -> add( thisHeap, x_i, *it); + } + vertexidxtype j = K - 1; + while (j >= thisHeap.size()) knns(j--, i) = -1; + while (! thisHeap.empty()) { + knns(j--, i) = thisHeap.top().n; + thisHeap.pop(); + } + } + return knns; + } + + arma::imat exploreNeighborhood(const int maxIter, + std::shared_ptr< DistanceAdder > adder) { + const kidxtype K = knns.n_rows; + imat old_knns = imat(K,N); + for (int T = 0; T < maxIter; T++) if (! p.check_abort()) { + imat tmp = old_knns; + old_knns = knns; + knns = tmp; + MaxHeap thisHeap = MaxHeap(); + set< vertexidxtype > sorter = set< vertexidxtype >(); +#ifdef _OPENMP +#pragma omp parallel for shared(old_knns) private(thisHeap, sorter) +#endif + for (vertexidxtype i = 0; i < N; i++) if (p.increment()) { + const V x_i = data.col(i); + + PositionVector positions = PositionVector(), ends = PositionVector(); + positions.reserve(K + 1); ends.reserve(K + 1); + + positions.push_back(old_knns.begin_col(i)); + ends.push_back(old_knns.end_col(i)); + + for (imat::col_iterator it = old_knns.begin_col(i); + it != ends[0] && *it != -1; + it++) { + positions.push_back(old_knns.begin_col(*it)); + ends.push_back(old_knns.end_col(*it)); + } + + vertexidxtype lastOne = N + 1; + // This is a K + 1 vector merge sort running in O(K * N) + PositionVector::iterator theEnd = positions.end(); + while (true) { + imat::col_iterator whch = 0; + + for (pair< PositionVector::iterator, + PositionVector::iterator > it(positions.begin(), + ends.begin()); + it.first != theEnd; + it.first++, it.second++) while (*it.first != *it.second) { // For each neighborhood, keep going until + // we find a non-dupe or get to the end + + if (**it.first == -1) advance(*it.first, distance(*it.first, *it.second)); + else if (**it.first == i || **it.first == lastOne) advance(*it.first, 1); + else if (whch == 0 || **it.first < *whch) { + whch = *it.first; + break; + } else break; + } + if (whch == 0) break; + lastOne = *whch; + advance(whch, 1); + + adder ->add( thisHeap, x_i, lastOne); + } + /* + * Before the last iteration, we keep the matrix sorted by vertexid, which makes the merge above + * more efficient. In the last iteration, sort by distance. + */ + if (T != maxIter - 1) { + sorter.clear(); + heapToSet(thisHeap, sorter); + + set< vertexidxtype >::iterator sortIterator = sorter.begin(); + vertexidxtype j = 0; + while (sortIterator != sorter.end()) knns(j++, i) = *sortIterator++; + if (j == 0) stop("Neighbor exploration failure."); + while (j < K) knns(j++,i) = -1; + } else { + vertexidxtype j = K - 1; + while (j >= thisHeap.size()) knns(j--, i) = -1; + while (! thisHeap.empty()) { + knns(j--, i) = thisHeap.top().n; + thisHeap.pop(); + } + } + } + } + return knns; + } +}; +#endif \ No newline at end of file diff --git a/src/sparse.cpp b/src/sparse.cpp index 1ae5f08..fa68896 100644 --- a/src/sparse.cpp +++ b/src/sparse.cpp @@ -2,217 +2,97 @@ // [[Rcpp::plugins(cpp11)]] // [[Rcpp::depends(RcppArmadillo)]] // [[Rcpp::depends(RcppProgress)]] -#include "largeVis.h" +#include "neighbors.h" using namespace Rcpp; using namespace std; using namespace arma; -void searchTree(const int& threshold, - const arma::ivec& indices, - const sp_mat& data, - Neighborhood* heap[], - Progress& progress) { - const int I = indices.size(); - // const int D = data.n_rows; - if (progress.check_abort()) return; - if (I < 2) stop("Tree split failure."); - if (I <= threshold) { - addNeighbors(indices, heap, I); - progress.increment(I); - return; - } - vec direction = vec(indices.size()); - { - int x1idx, x2idx; - sp_mat v; - sp_mat m; - do { - const vec selections = randu(2) * (I - 1); - x1idx = indices[selections[0]]; - x2idx = indices[selections[1]]; - if (x1idx == x2idx) x2idx = indices[((int)selections[1] + 1) % indices.size()]; - const SpSubview x2 = data.col(x2idx); - const SpSubview x1 = data.col(x1idx); - // Get hyperplane - m = (x1 + x2) / 2; // Base point of hyperplane - const sp_mat d = x1 - x2; - const double dn = as_scalar(norm(d, 2)); - v = d / dn; // unit vector - } while (x1idx == x2idx); - - for (int i = 0; i < indices.size(); i++) { - const int I = indices[i]; - const SpSubview X = data.col(I); - direction[i] = dot((X - m), v); - } - } - // Normalize direction - const double middle = median(direction); +class EuclideanSparseAdder : public DistanceAdder { +protected: + virtual distancetype distanceFunction(const arma::sp_mat& x_i, const arma::sp_mat& x_j) { + return sparseRelDist(x_i, x_j); + } +public: + EuclideanSparseAdder(const arma::sp_mat& data, const kidxtype K) : DistanceAdder(data, K) {} +}; - const uvec left = find(direction > middle); - const uvec right = find(direction <= middle); - if (left.size() >= 2 && right.size() >= 2) { - searchTree(threshold, indices(left), data, heap, progress); - searchTree(threshold, indices(right), data, heap, progress); - } else { - searchTree(threshold, indices.subvec(0, indices.size() / 2), data, heap, progress); - searchTree(threshold, indices.subvec(indices.size() / 2, indices.size() - 1), data, heap, progress); - } +class CosineSparseAdder : public DistanceAdder { +protected: + virtual distancetype distanceFunction(const arma::sp_mat& x_i, const arma::sp_mat& x_j) { + return sparseCosDist(x_i, x_j); + } +public: + CosineSparseAdder(const arma::sp_mat& data, const kidxtype K) : DistanceAdder(data, K) {} }; +class SparseAnnoySearch : public AnnoySearch { +protected: + virtual arma::vec hyperplane(const arma::ivec& indices) { + const vertexidxtype I = indices.n_elem; + vec direction = vec(indices.size()); + { + vertexidxtype x1idx, x2idx; + sp_mat v; + sp_mat m; + do { + x1idx = indices[sample(I)]; + x2idx = indices[sample(I)]; + if (x1idx == x2idx) x2idx = indices[sample(I)]; + const sp_mat x2 = data.col(x2idx); + const sp_mat x1 = data.col(x1idx); + // Get hyperplane + m = (x1 + x2) / 2; // Base point of hyperplane + const sp_mat d = x1 - x2; + const distancetype dn = as_scalar(norm(d, 2)); + v = d / dn; // unit vector + } while (x1idx == x2idx); + + for (vertexidxtype i = 0; i < indices.size(); i++) { + const vertexidxtype I = indices[i]; + const sp_mat X = data.col(I); + direction[i] = dot((X - m), v); + } + } + return direction; + } +public: + SparseAnnoySearch(const arma::sp_mat& data, Progress& p) : AnnoySearch(data, p) {} +}; -arma::mat searchTreesSparse(const int& threshold, +arma::imat searchTreesSparse(const int& threshold, const int& n_trees, - const int& K, + const kidxtype& K, const int& maxIter, const sp_mat& data, const std::string& distMethod, + Rcpp::Nullable< Rcpp::NumericVector> seed, bool verbose) { - - const int N = data.n_cols; - - double (*distanceFunction)(const sp_mat& x_i, const sp_mat& x_j); - if (distMethod.compare(std::string("Euclidean")) == 0) distanceFunction = sparseRelDist; - else if (distMethod.compare(std::string("Cosine")) == 0) distanceFunction = sparseCosDist; - else distanceFunction = sparseRelDist; - - Progress p((N * n_trees) + (N) + (N * maxIter), verbose); - - Neighborhood** treeNeighborhoods = createNeighborhood(N); - - { // Artificial scope to destroy indices - sp_mat dataMat; - if (distMethod.compare(std::string("Cosine")) == 0) { - dataMat = sp_mat(data); - for (int d = 0; d < dataMat.n_cols; d++) dataMat.col(d) /= norm(dataMat.col(d)); - } else { - dataMat = data; - } - ivec indices = regspace(0, N - 1); -#ifdef _OPENMP -#pragma omp parallel for shared(indices,treeNeighborhoods) -#endif - for (int t = 0; t < n_trees; t++) if (! p.check_abort()) { - searchTree(threshold, - indices, - dataMat, - treeNeighborhoods, - p - ); - - if (t > 0 && ! p.check_abort()) -#ifdef _OPENMP -#pragma omp critical -#endif - { - for (int i = 0; i < N; i++) { - vector* neighbors = treeNeighborhoods[i]; - sort(neighbors -> begin(), neighbors -> end()); - vector::iterator theEnd = unique(neighbors -> begin(), neighbors -> end()); - neighbors -> erase(theEnd, neighbors -> end()); - if (neighbors -> size() < 3) stop("Tree failure."); - } - } - } - } - - if (p.check_abort()) return mat(0); - - // Initialize the knn matrix, and reduce the number of candidate neighbors per node - // to K. Otherwise the first neighborhood exploration pass takes N * trees * (threshold + 1), - // instead of (N * K), which is prohibitive of large thresholds. - mat knns = mat(threshold,N); - knns.fill(-1); -#ifdef _OPENMP -#pragma omp parallel for shared(knns) -#endif - for (int i = 0; i < N; i++) if (p.increment()){ - const SpSubview x_i = data.col(i); - priority_queue MaxHeap = priority_queue(); - vector* stack = treeNeighborhoods[i]; - for (vector::iterator it = stack -> begin(); it != stack -> end(); it++) { - const double d = distanceFunction(x_i, data.col(*it)); - MaxHeap.push(HeapObject(d, *it)); - if (MaxHeap.size() > threshold) MaxHeap.pop(); - } - int j = 0; - do { - knns(j,i) = MaxHeap.top().n; - MaxHeap.pop(); - j++; - } while (j < threshold && ! MaxHeap.empty()); - if (j == 1 && knns(0,i) == -1) stop("Bad neighbor matrix."); - } - if (p.check_abort()) return mat(0); - - for (int T = 0; T < maxIter; T++) { - mat old_knns = knns; - knns = mat(K,N); - knns.fill(-1); -#ifdef _OPENMP -#pragma omp parallel for shared(knns, treeNeighborhoods) -#endif - for (int i = 0; i < N; i++) if (p.increment()) { - double d; - - const vec neighborhood = old_knns.col(i); - const SpSubview x_i = data.col(i); - - priority_queue heap; - vector pastVisitors = *(treeNeighborhoods[i]); - pastVisitors.reserve((K + 1) * K); - // Loop through immediate neighbors of i - for (int jidx = 0; jidx < old_knns.n_rows; jidx++) { - const int j = neighborhood[jidx]; - if (j == -1) break; - if (j == i) continue; // This should never happen - d = distanceFunction(x_i, data.col(j)); - if (d == 0) continue; // duplicate - heap.push(HeapObject(d, j)); - if (heap.size() > K) heap.pop(); - - // For each immediate neighbor j, loop through its neighbors - const vec locality = old_knns.col(j); - for (int kidx = 0; kidx < old_knns.n_rows; kidx++) { - const int k = locality[kidx]; - if (k == -1) break; - if (k == i) continue; - // Check if this is a neighbor we've already seen. O(log k) - pair::iterator, - vector::iterator > firstlast = equal_range(pastVisitors.begin(), - pastVisitors.end(), - k); - if (*(firstlast.first) == k) continue; // Found - - if (firstlast.second == pastVisitors.end()) pastVisitors.push_back(k); - else pastVisitors.insert(firstlast.second, k); - - d = distanceFunction(x_i, data.col(k)); - if (d == 0) continue; - if (heap.size() < K) heap.push(HeapObject(d,k)); - else if (d < heap.top().d) { - heap.push(HeapObject(d, k)); - if (heap.size() > K) heap.pop(); - } - } - } - int j = 0; - while (j < K && ! heap.empty()) { - knns(j, i) = heap.top().n; - heap.pop(); - j++; - } - if (j == 0) stop("Failure in neighborhood exploration - this should never happen."); - vector(pastVisitors).swap(pastVisitors); // pre-C++11 shrink - } - } - return knns; -}; - + const vertexidxtype N = data.n_cols; + + std::shared_ptr< DistanceAdder > adder; + if (distMethod.compare(std::string("Cosine")) == 0) adder = std::shared_ptr< DistanceAdder >(new CosineSparseAdder(data, K)); + else adder = std::shared_ptr< DistanceAdder >(new EuclideanSparseAdder(data, K)); + + Progress p((N * n_trees) + (2 * N) + (N * maxIter), verbose); + + sp_mat dataMat; + if (distMethod.compare(std::string("Cosine")) == 0) { + dataMat = sp_mat(data); + for (vertexidxtype d = 0; d < dataMat.n_cols; d++) dataMat.col(d) /= norm(dataMat.col(d)); + } else { + dataMat = data; + } + SparseAnnoySearch annoy = SparseAnnoySearch(dataMat, p); + annoy.setSeed(seed); + annoy.trees(n_trees, threshold); + annoy.reduce(K, adder); + annoy.convertToMatrix(K); + return (maxIter == 0) ? annoy.getMatrix(adder) : annoy.exploreNeighborhood(maxIter, adder); +} // [[Rcpp::export]] -arma::mat searchTreesCSparse(const int& threshold, +arma::imat searchTreesCSparse(const int& threshold, const int& n_trees, const int& K, const int& maxIter, @@ -220,14 +100,15 @@ arma::mat searchTreesCSparse(const int& threshold, const arma::uvec& p, const arma::vec& x, const std::string& distMethod, + Rcpp::Nullable< Rcpp::NumericVector> seed, bool verbose) { - const int N = p.size() -1; + const vertexidxtype N = p.size() -1; const sp_mat data = sp_mat(i,p,x,N,N); - return searchTreesSparse(threshold,n_trees,K,maxIter,data,distMethod,verbose); + return searchTreesSparse(threshold,n_trees,K,maxIter,data,distMethod,seed,verbose); } // [[Rcpp::export]] -arma::mat searchTreesTSparse(const int& threshold, +arma::imat searchTreesTSparse(const int& threshold, const int& n_trees, const int& K, const int& maxIter, @@ -235,8 +116,9 @@ arma::mat searchTreesTSparse(const int& threshold, const arma::uvec& j, const arma::vec& x, const std::string& distMethod, + Rcpp::Nullable< Rcpp::NumericVector> seed, bool verbose) { const umat locations = join_cols(i,j); const sp_mat data = sp_mat(locations,x); - return searchTreesSparse(threshold,n_trees,K,maxIter,data,distMethod,verbose); + return searchTreesSparse(threshold,n_trees,K,maxIter,data,distMethod,seed,verbose); } diff --git a/tests/testthat/testclusters2.R b/tests/testthat/testclusters2.R index 5c3710a..536b43a 100644 --- a/tests/testthat/testclusters2.R +++ b/tests/testthat/testclusters2.R @@ -1,5 +1,5 @@ context("cluster") -library(largeVis) +library(dbscan, quietly = TRUE) set.seed(1974) data(iris) dat <- as.matrix(iris[, 1:4]) @@ -7,27 +7,43 @@ dat <- scale(dat) dupes <- which(duplicated(dat)) dat <- dat[-dupes, ] dat <- t(dat) -neighbors <- randomProjectionTreeSearch(dat, K = 20, verbose = FALSE) +K <- 20 +neighbors <- randomProjectionTreeSearch(dat, K = K, verbose = FALSE) edges <- buildEdgeMatrix(data = dat, neighbors = neighbors, verbose = FALSE) test_that("optics doesn't crash on iris with neighbors and data", { - expect_silent(optics(neighbors = neighbors, data = dat, eps = 10, minPts = 10, verbose = FALSE)) + expect_silent(largeVis:::optics(neighbors = neighbors, data = dat, eps = 10, minPts = 10, verbose = FALSE)) }) test_that("optics doesn't crash on iris with edges", { - expect_silent(optics(edges = edges, eps = 10, minPts = 10, verbose = FALSE)) + expect_silent(largeVis:::optics(edges = edges, eps = 10, minPts = 10, verbose = FALSE)) }) test_that("optics doesn't crash on iris with edges and data", { - expect_silent(optics(edges = edges, data = dat, eps = 10, minPts = 10, verbose = FALSE)) + expect_silent(largeVis:::optics(edges = edges, data = dat, eps = 10, minPts = 10, verbose = FALSE)) }) test_that("dbscan doesn't crash on iris with edges", { - expect_silent(dbscan(edges = edges, eps = 10, minPts = 10, verbose = FALSE, partition = FALSE)) + expect_silent(largeVis:::dbscan(edges = edges, eps = 10, minPts = 10, verbose = FALSE, partition = FALSE)) }) test_that("dbscan doesn't crash on iris with partitions", { - expect_silent(clusters <- dbscan(edges = edges, eps = 10, minPts = 10, + expect_silent(clusters <- largeVis:::dbscan(edges = edges, eps = 10, minPts = 10, verbose = FALSE, partition = TRUE)) }) + +test_that(paste("LOF is consistent", K), { + truelof <- dbscan::lof(t(dat), k = K) + ourlof <- largeVis:::lof(edges) + expect_lt(sum(truelof - ourlof)^2 / ncol(dat), 0.4) +}) + +test_that("LOF is consistent 10", { + edges <- buildEdgeMatrix(data = dat, + neighbors = neighbors[1:10,], + verbose = FALSE) + truelof <- dbscan::lof(t(dat), k = 10) + ourlof <- largeVis:::lof(edges) + expect_lt(sum(truelof - ourlof)^2 / ncol(dat), 0.4) +}) \ No newline at end of file diff --git a/tests/testthat/testother.R b/tests/testthat/testother.R deleted file mode 100644 index ea0b846..0000000 --- a/tests/testthat/testother.R +++ /dev/null @@ -1,79 +0,0 @@ -context("wij") - -test_that("wij handles small K", { - set.seed(1974) - data(iris) - dat <- as.matrix(iris[, 1:4]) - dat <- scale(dat) - dupes <- which(duplicated(dat)) - dat <- dat[-dupes, ] - dat <- t(dat) - neighbors <- randomProjectionTreeSearch(dat, K = 5, verbose = FALSE) - edges <- buildEdgeMatrix(dat, neighbors, verbose = FALSE) - expect_silent(wij <- buildWijMatrix(edges)) -}) - -context("vis") -set.seed(1974) -data(iris) -dat <- as.matrix(iris[, 1:4]) -dat <- scale(dat) -dupes <- which(duplicated(dat)) -dat <- dat[-dupes, ] -dat <- t(dat) - -test_that("largeVis works", { - visObject <- largeVis(dat, max_iter = 20, n_trees = 100, - tree_threshold = 50, sgd_batches = 1000, - K = 20, verbose = FALSE) - expect_false(any(is.na(visObject$coords))) - expect_false(any(is.nan(visObject$coords))) - expect_false(any(is.infinite(visObject$coords))) -}) - -test_that("largeVis does not NaN on iris", { - visObject <- largeVis(dat, max_iter = 20, - coords = matrix(rnorm(ncol(dat) * 2), nrow = 2), - K = 20, verbose = FALSE, - sgd_batches = 20000 * 150) - expect_false(any(is.na(visObject$coords))) - expect_false(any(is.nan(visObject$coords))) - expect_false(any(is.infinite(visObject$coords))) -}) - -test_that("largeVis works when alpha == 0", { - visObject <- largeVis(dat, - max_iter = 20, - sgd_batches = 10000, - K = 10, - alpha = 0, - verbose = FALSE) - expect_false(any(is.na(visObject$coords))) - expect_false(any(is.nan(visObject$coords))) - expect_false(any(is.infinite(visObject$coords))) -}) - -test_that("largeVis works with cosine", { - visObject <- largeVis(dat, max_iter = 20, - sgd_batches = 1000, - K = 10, verbose = FALSE, - distance_method = "Cosine") - expect_false(any(is.na(visObject$coords))) - expect_false(any(is.nan(visObject$coords))) - expect_false(any(is.infinite(visObject$coords))) -}) - -test_that("largeVis continues to work as it scales up", { - visObject <- largeVis(dat, max_iter = 20, sgd_batches = 1000, - K = 10, gamma = 0.5, verbose = FALSE) - expect_false(any(is.na(visObject$coords))) - expect_false(any(is.nan(visObject$coords))) - expect_false(any(is.infinite(visObject$coords))) - for (i in c(10000, 100000, 1000000, 20000 * length(visObject$wij@x))) { - coords <- projectKNNs(visObject$wij, sgd_batches = i, - verbose = FALSE) - expect_false(any(is.na(coords))) - expect_false(any(is.nan(coords))) - expect_false(any(is.infinite(coords))) - } -}) \ No newline at end of file diff --git a/tests/testthat/tests.R b/tests/testthat/tests.R index f198ee7..954d8e9 100644 --- a/tests/testthat/tests.R +++ b/tests/testthat/tests.R @@ -1,13 +1,14 @@ context("neighbors") +data (iris) +set.seed(1974) +dat <- as.matrix(iris[, 1:4]) +dat <- scale(dat) +dupes <- which(duplicated(dat)) +dat <- dat[-dupes, ] +dat <- t(dat) + test_that("Trees does not error", { - data (iris) - set.seed(1974) - dat <- as.matrix(iris[, 1:4]) - dat <- scale(dat) - dupes <- which(duplicated(dat)) - dat <- dat[-dupes, ] - dat <- t(dat) expect_silent(neighbors <- randomProjectionTreeSearch(dat, K = 5, n_trees = 10, @@ -18,14 +19,6 @@ test_that("Trees does not error", { }) test_that("Trees does not error if neighbors are explored once", { - data (iris) - set.seed(1974) - dat <- as.matrix(iris[, 1:4]) - dat <- scale(dat) - dupes <- which(duplicated(dat)) - dat <- dat[-dupes, ] - dat <- t(dat) - expect_silent(neighbors <- randomProjectionTreeSearch(dat, K = 5, n_trees = 50, @@ -36,14 +29,6 @@ test_that("Trees does not error if neighbors are explored once", { }) test_that("Trees does not error if neighbors are explored more than once", { - data (iris) - set.seed(1974) - dat <- as.matrix(iris[, 1:4]) - dat <- scale(dat) - dupes <- which(duplicated(dat)) - dat <- dat[-dupes, ] - dat <- t(dat) - expect_silent(neighbors <- randomProjectionTreeSearch(dat, K = 5, n_trees = 50, @@ -53,13 +38,6 @@ test_that("Trees does not error if neighbors are explored more than once", { }) test_that("Can determine iris neighbors", { - data (iris) - set.seed(1974) - dat <- as.matrix(iris[, 1:4]) - dat <- scale(dat) - dupes <- which(duplicated(dat)) - dat <- dat[-dupes, ] - dat <- t(dat) neighbors <- randomProjectionTreeSearch(dat, K = 5, n_trees = 20, @@ -74,16 +52,9 @@ test_that("Can determine iris neighbors", { test_that("max threshold is sufficient to find all neighbors", { M <- 5 - set.seed(1974) - data (iris) - dat <- as.matrix(iris[, 1:4]) - dat <- scale(dat) - dupes <- which(duplicated(dat)) - dat <- dat[-dupes, ] - d_matrix <- as.matrix(dist(dat, method = "euclidean")) + d_matrix <- as.matrix(dist(t(dat), method = "euclidean")) bests <- apply(d_matrix, MARGIN=1, FUN = function(x) order(x)[1:(M + 1)]) bests <- bests[-1,] - 1 - dat <- t(dat) neighbors <- randomProjectionTreeSearch(dat, K = M, @@ -98,16 +69,9 @@ test_that("max threshold is sufficient to find all neighbors", { test_that("exploring after max threshold does not reduce accuracy", { M <- 5 - set.seed(1974) - data (iris) - dat <- as.matrix(iris[, 1:4]) - dat <- scale(dat) - dupes <- which(duplicated(dat)) - dat <- dat[-dupes, ] - d_matrix <- as.matrix(dist(dat, method = "euclidean")) + d_matrix <- as.matrix(dist(t(dat), method = "euclidean")) bests <- apply(d_matrix, MARGIN = 1, FUN = function(x) order(x)[1:(M + 1)]) bests <- bests[-1, ] - 1 - dat <- t(dat) neighbors <- randomProjectionTreeSearch(dat, K = M, @@ -131,141 +95,86 @@ test_that("exploring after max threshold does not reduce accuracy", { expect_gte(score, oldscore) }) -test_that("Can determine iris neighbors accurately", { +test_that("Can determine iris neighbors accurately, Euclidean", { M <- 5 - set.seed(1974) - data (iris) - dat <- as.matrix(iris[, 1:4]) - dat <- scale(dat) - dupes <- which(duplicated(dat)) - dat <- dat[-dupes, ] - d_matrix <- as.matrix(dist(dat, method = "euclidean")) - bests <- apply(d_matrix, MARGIN=1, FUN = function(x) order(x)[1:(M + 1)]) + d_matrix <- as.matrix(dist(t(dat), method = "euclidean")) + bests <- apply(d_matrix, MARGIN = 1, FUN = function(x) order(x)[1:(M + 1)]) bests <- bests[-1, ] - 1 - dat <- t(dat) neighbors <- randomProjectionTreeSearch(dat, K = M, - n_trees = 10, - tree_threshold = 10, - max_iter = 10, - verbose = FALSE) - scores <- lapply(1:ncol(dat), - FUN = function(x) sum(neighbors[, x] %in% bests[, x])) - score <- sum(as.numeric(scores)) - expect_gt(score, (ncol(dat) * M) - 15) + n_trees = 20, + tree_threshold = 30, + max_iter = 12, + verbose = FALSE, + seed = 1974) + expect_lte(sum(neighbors != bests, na.rm = TRUE), 5) }) -# test_that("Knows how to converge", { -# M <- 5 -# set.seed(1974) -# RcppArmadillo::armadillo_set_seed(1974) -# data (iris) -# dat <- as.matrix(iris[, 1:4]) -# dat <- scale(dat) -# dupes <- which(duplicated(dat)) -# dat <- dat[-dupes, ] -# d_matrix = as.matrix(dist(dat, method = "euclidean")) -# bests <- apply(d_matrix, MARGIN=1, FUN = function(x) order(x)[1:(M + 1)]) -# bests <- bests[-1,] - 1 -# dat <- t(dat) -# -# neighbors <- randomProjectionTreeSearch(dat, -# K = M, -# n_trees = 10, -# tree_threshold = 10, -# max_iter = 100000, -# verbose = FALSE) -# scores <- lapply(1:ncol(dat), FUN = function(x) sum(neighbors[,x] %in% bests[,x])) -# score <- sum(as.numeric(scores)) -# expect_gt(score, (ncol(dat) * M) - 15) -# }) - - +M <- 10 +data (quakes) +dat <- as.matrix(quakes) +quakes <- scale(dat) +d_matrix = as.matrix(dist(quakes, method = "euclidean")) test_that("With a bigger dataset, increasing threshold improves result", { - M <- 10 - data (quakes) - dat <- as.matrix(quakes) - dat <- scale(dat) - d_matrix = as.matrix(dist(dat, method = "euclidean")) bests <- apply(d_matrix, MARGIN = 1, FUN = function(x) order(x)[1:(M + 1)]) bests <- bests[-1, ] - 1 - dat <- t(dat) - oldscore <- 0 + oldscore <- nrow(quakes) * M - for (t in c(10, 30, 60, 90)) { + for (t in c(10, 40, 80, 160)) { set.seed(1974) - neighbors <- randomProjectionTreeSearch(dat, + neighbors <- randomProjectionTreeSearch(t(quakes), K = M, - n_trees = 10, + n_trees = 20, tree_threshold = t, max_iter = 0, - verbose = FALSE) - scores <- lapply(1:ncol(dat), - FUN = function(x) sum(neighbors[, x] %in% bests[, x])) - score <- sum(as.numeric(scores)) / (M * ncol(dat)) - expect_gte(score, oldscore * 0.99) # Allow some gap here to account for randomness - if (score == 1) break; + verbose = FALSE, + seed = 1974) + score <- sum(neighbors != bests, na.rm = TRUE) + expect_lte(score, oldscore, label = t) oldscore <- score } }) test_that("With a bigger dataset, increasing n_trees improves result", { - M <- 10 - set.seed(1974) - data (quakes) - dat <- as.matrix(quakes) - dat <- scale(dat) - d_matrix = as.matrix(dist(dat, method = "euclidean")) bests <- apply(d_matrix, MARGIN=1, FUN = function(x) order(x)[1:(M + 1)]) bests <- bests[-1,] - 1 - dat <- t(dat) - oldscore <- 0 + oldscore <- nrow(quakes) * M - for (t in c(10, 30, 60, 90)) { - neighbors <- randomProjectionTreeSearch(dat, + for (t in c(5, 20, 40, 90)) { + neighbors <- randomProjectionTreeSearch(t(quakes), K = M, n_trees = t, tree_threshold = 10, max_iter = 0, - verbose = FALSE) - scores <- lapply(1:ncol(dat), - FUN = function(x) sum(neighbors[, x] %in% bests[, x])) - score <- sum(as.numeric(scores)) / (M * ncol(dat)) - expect_gte(score, oldscore * 0.99) - if (score == 1) break; + verbose = FALSE, + seed = 1974) + score <- sum(neighbors != bests, na.rm = TRUE) + expect_lte(score, oldscore, label = t) oldscore <- score } }) test_that("With a bigger dataset, increasing iters improves result", { M <- 10 - set.seed(1974) - data (quakes) - dat <- as.matrix(quakes) - dat <- scale(dat) - d_matrix = as.matrix(dist(dat, method = "euclidean")) bests <- apply(d_matrix, MARGIN = 1, FUN = function(x) order(x)[1:(M + 1)]) bests <- bests[ - 1,] - 1 - dat <- t(dat) - oldscore <- 0 + oldscore <- nrow(quakes) * M - for (t in c(0, 1, 5, 10)) { - neighbors <- randomProjectionTreeSearch(dat, + for (t in c(0, 5, 20, 40)) { + neighbors <- randomProjectionTreeSearch(t(quakes), K = M, - n_trees = 10, + n_trees = 5, tree_threshold = 10, max_iter = t, - verbose = FALSE) - scores <- lapply(1:ncol(dat), - FUN = function(x) sum(neighbors[, x] %in% bests[, x])) - score <- sum(as.numeric(scores)) / (M * ncol(dat)) - expect_gte(score, oldscore * 0.99) - if (score == 1) break; + verbose = FALSE, + seed = 1974) + score <- max(0, sum(neighbors != bests, na.rm = TRUE)) + expect_lte(score, oldscore, label = t) oldscore <- score } }) diff --git a/tests/testthat/testsparse.R b/tests/testthat/testsparse.R index dc80ab2..3ba40da 100644 --- a/tests/testthat/testsparse.R +++ b/tests/testthat/testsparse.R @@ -1,12 +1,12 @@ context("sparse") +set.seed(1974) +dat <- as.matrix(iris[, 1:4]) +dat <- scale(dat) +dupes <- which(duplicated(dat)) +dat <- dat[-dupes, ] +dat <- t(dat) test_that("buildEdgeMatrix are the same, Euclidean", { - set.seed(1974) - dat <- as.matrix(iris[, 1:4]) - dat <- scale(dat) - dupes <- which(duplicated(dat)) - dat <- dat[-dupes, ] - dat <- t(dat) neighbors <- randomProjectionTreeSearch(dat, K = 5, n_trees = 10, @@ -14,18 +14,12 @@ test_that("buildEdgeMatrix are the same, Euclidean", { max_iter = 10, verbose = FALSE) edges1 <- buildEdgeMatrix(data = dat, neighbors = neighbors, verbose = FALSE) - edges2 <- buildEdgeMatrix(data = Matrix(dat, sparse = TRUE), neighbors = neighbors, verbose = FALSE) + edges2 <- buildEdgeMatrix(data = Matrix::Matrix(dat, sparse = TRUE), neighbors = neighbors, verbose = FALSE) score <- sum(edges1@x - edges2@x) expect_lt(score, 1) }) test_that("buildEdgeMatrix are the same, Cosine", { - set.seed(1974) - dat <- as.matrix(iris[, 1:4]) - dat <- scale(dat) - dupes <- which(duplicated(dat)) - dat <- dat[-dupes, ] - dat <- t(dat) neighbors <- randomProjectionTreeSearch(dat, K = 5, n_trees = 10, @@ -33,28 +27,21 @@ test_that("buildEdgeMatrix are the same, Cosine", { max_iter = 10, verbose = FALSE) edges1 <- buildEdgeMatrix(data = dat, neighbors = neighbors, verbose = FALSE, distance_method = "Cosine") - edges2 <- buildEdgeMatrix(data = Matrix(dat, sparse = TRUE), neighbors = neighbors, verbose = FALSE, distance_method = "Cosine") + edges2 <- buildEdgeMatrix(data = Matrix::Matrix(dat, sparse = TRUE), neighbors = neighbors, verbose = FALSE, distance_method = "Cosine") score <- sum(edges1@x - edges2@x) expect_lt(score, 1) }) test_that("sparseDistances", { M <- 5 - set.seed(1974) - data (iris) - dat <- as.matrix(iris[, 1:4]) - dat <- scale(dat) - dupes <- which(duplicated(dat)) - dat <- dat[-dupes, ] mat <- Matrix::sparseMatrix(i = rep(1:nrow(dat), ncol(dat)), j = rep(1:ncol(dat), each = nrow(dat)), x = as.vector(dat)) - d = as.matrix(dist(mat, method = "euclidean")) + d = as.matrix(dist(t(as.matrix(mat)), method = "euclidean")) index_matrix <- matrix(c( - rep(0:(nrow(dat) - 1), nrow(dat)), - rep(0:(nrow(dat) - 1), each = nrow(dat)) + rep(0:(ncol(dat) - 1), ncol(dat)), + rep(0:(ncol(dat) - 1), each = ncol(dat)) ), ncol = 2, byrow = FALSE) - mat <- Matrix::t(mat) new_distances <- distance(mat, as.vector(index_matrix[, 2]), as.vector(index_matrix[, 1]), @@ -66,26 +53,17 @@ test_that("sparseDistances", { test_that("Can determine sparse iris neighbors accurately", { M <- 5 - set.seed(1974) - data (iris) - dat <- as.matrix(iris[, 1:4]) - dat <- scale(dat) - dupes <- which(duplicated(dat)) - dat <- dat[-dupes, ] mat <- Matrix::sparseMatrix(i = rep(1:nrow(dat), ncol(dat)), j = rep(1:ncol(dat), each = nrow(dat)), x = as.vector(dat)) - d_matrix <- as.matrix(dist(mat, method = "euclidean")) + d_matrix <- as.matrix(dist(t(as.matrix(mat)), method = "euclidean")) bests <- apply(d_matrix, MARGIN = 1, FUN = function(x) order(x)[1:(M + 1)]) bests <- bests[-1,] - 1 - mat <- Matrix::t(mat) neighbors <- randomProjectionTreeSearch(mat, K = M, n_trees = 10, max_iter = 2, tree_threshold = 20, verbose = FALSE) - scores <- lapply(1:nrow(dat), FUN = function(x) sum(neighbors[, x] %in% bests[, x])) - score <- sum(as.numeric(scores)) - expect_gt(score, .99 * ncol(dat) * M) + expect_lte(sum(neighbors - bests, na.rm = TRUE), 5) }) diff --git a/tests/testthat/testvis.R b/tests/testthat/testvis.R new file mode 100644 index 0000000..e956a7b --- /dev/null +++ b/tests/testthat/testvis.R @@ -0,0 +1,64 @@ +context("vis") +set.seed(1974) +data(iris) +dat <- as.matrix(iris[, 1:4]) +dat <- scale(dat) +dupes <- which(duplicated(dat)) +dat <- dat[-dupes, ] +dat <- t(dat) + +test_that("largeVis works", { + visObject <- largeVis(dat, max_iter = 20, n_trees = 100, + tree_threshold = 50, sgd_batches = 1000, + K = 20, verbose = FALSE) + expect_false(any(is.na(visObject$coords))) + expect_false(any(is.nan(visObject$coords))) + expect_false(any(is.infinite(visObject$coords))) +}) + +test_that("largeVis does not NaN on iris", { + visObject <- largeVis(dat, max_iter = 20, + coords = matrix(rnorm(ncol(dat) * 2), nrow = 2), + K = 20, verbose = FALSE, + sgd_batches = 20000 * 150) + expect_false(any(is.na(visObject$coords))) + expect_false(any(is.nan(visObject$coords))) + expect_false(any(is.infinite(visObject$coords))) +}) + +test_that("largeVis works when alpha == 0", { + visObject <- largeVis(dat, + max_iter = 20, + sgd_batches = 10000, + K = 10, + alpha = 0, + verbose = FALSE) + expect_false(any(is.na(visObject$coords))) + expect_false(any(is.nan(visObject$coords))) + expect_false(any(is.infinite(visObject$coords))) +}) + +test_that("largeVis works with cosine", { + visObject <- largeVis(dat, max_iter = 20, + sgd_batches = 1000, + K = 10, verbose = FALSE, + distance_method = "Cosine") + expect_false(any(is.na(visObject$coords))) + expect_false(any(is.nan(visObject$coords))) + expect_false(any(is.infinite(visObject$coords))) +}) + +test_that("largeVis continues to work as it scales up", { + visObject <- largeVis(dat, max_iter = 20, sgd_batches = 1000, + K = 10, gamma = 0.5, verbose = FALSE) + expect_false(any(is.na(visObject$coords))) + expect_false(any(is.nan(visObject$coords))) + expect_false(any(is.infinite(visObject$coords))) + for (i in c(10000, 100000, 1000000, 20000 * length(visObject$wij@x))) { + coords <- projectKNNs(visObject$wij, sgd_batches = i, + verbose = FALSE) + expect_false(any(is.na(coords))) + expect_false(any(is.nan(coords))) + expect_false(any(is.infinite(coords))) + } +}) \ No newline at end of file diff --git a/vignettes/benchmarks.Rmd b/vignettes/benchmarks.Rmd index f41cf57..0fd16bf 100644 --- a/vignettes/benchmarks.Rmd +++ b/vignettes/benchmarks.Rmd @@ -12,14 +12,14 @@ vignette: | ```{r setupbenchmark,eval=T,echo=F,warning=F,error=F,message=F} # Note to reader: Please don't steal the semi-distinctive visual style I spent several minutes creating for myself. -require(ggplot2, +library(ggplot2, quietly = TRUE) -require(RColorBrewer, +library(RColorBrewer, quietly = TRUE) -require(wesanderson, +library(wesanderson, quietly = TRUE) -require(dplyr, quietly = TRUE) -require(magrittr, quietly = TRUE) +library(dplyr, quietly = TRUE) +library(magrittr, quietly = TRUE) knitr::opts_chunk$set(collapse = TRUE, comment = "#>", fig.width = 7, diff --git a/vignettes/largeVis.Rmd b/vignettes/largeVis.Rmd index 321c137..eeb1f9d 100644 --- a/vignettes/largeVis.Rmd +++ b/vignettes/largeVis.Rmd @@ -324,6 +324,6 @@ Memory requirements during the neighbor search may be managed by reducing `n_tre ## References -```{r save,eval=rebuild} +```{r save,eval=rebuild,echo=F} save(agcoords, iriscoords, file = "vignettedata/vignettedata.Rda") ```