Skip to content

Commit

Permalink
Sparse matrices. Appveyor.
Browse files Browse the repository at this point in the history
  • Loading branch information
elbamos committed May 29, 2016
1 parent 68a308b commit 0296733
Show file tree
Hide file tree
Showing 57 changed files with 1,160 additions and 336 deletions.
16 changes: 16 additions & 0 deletions .Rbuildignore
Expand Up @@ -5,3 +5,19 @@
^\./README_files$
^revdep$
^.*\.RData
^appveyor\.yml$
^\./inst/doc$
^\./vignettes/.\.pdf$
^\./vignettes/.\.md$
^\./vignettes/.\.html$
^\./vignettes/largeVis_files$
^\./vignettes/largeVis_cache$
^\./vignettes/cache$
^\./vignettes/data$
^\./libs$
^\./doc$
^\./Rplots\.pdf$
^\./.Rda$
^\./.bin$
^\./.\.R$
^\./.\.txt$
12 changes: 8 additions & 4 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Package: largeVis
Type: Package
Title: High-Quality Visualizations of Large, High-Dimensional Datasets
Version: 0.1.4
Version: 0.1.5
Author: Amos B. Elberg
Maintainer: Amos Elberg <amos.elberg@gmail.com>
Description: Implements the largeVis algorithm for visualizing very large high-dimensional datasets. Also very fast search for approximate nearest neighbors.
Expand All @@ -15,20 +15,24 @@ Imports:
parallel,
Rcpp (>= 0.12.4),
RcppProgress (>= 0.2.1),
RcppArmadillo (>= 0.6.700.6.0),
RcppArmadillo (>= 0.7.100.3.0),
abind
LinkingTo: Rcpp,RcppProgress,RcppArmadillo
Suggests: testthat,
covr,
knitr,
rmarkdown,
ggplot2,
readr,
wesanderson,
RColorBrewer,
FNN
RANN,
stm,
darch,
slam
URL: https://github.com/elbamos/largeVis
BugReports: https://github.com/elbamos/largeVis/issues
NeedsCompilation: yes
OS_type: unix
OS_type: unix, windows
BuildVignettes: FALSE
VignetteBuilder: knitr
8 changes: 8 additions & 0 deletions NAMESPACE
Expand Up @@ -3,8 +3,16 @@
S3method(buildEdgeMatrix,CsparseMatrix)
S3method(buildEdgeMatrix,TsparseMatrix)
S3method(buildEdgeMatrix,default)
S3method(distance,CsparseMatrix)
S3method(distance,TsparseMatrix)
S3method(distance,matrix)
S3method(randomProjectionTreeSearch,CsparseMatrix)
S3method(randomProjectionTreeSearch,TsparseMatrix)
S3method(randomProjectionTreeSearch,matrix)
export(buildEdgeMatrix)
export(distance)
export(manifoldMap)
export(neighborsToVectors)
export(projectKNNs)
export(randomProjectionTreeSearch)
export(vis)
Expand Down
11 changes: 10 additions & 1 deletion NEWS.md
@@ -1,5 +1,14 @@


### largeVis 0.1.5

* Handles substantially larger datasets
* Support for sparse matrices (for *much* larger datasets)
* Added better error reporting for tree search
* Handle situation in tree search where nodes are equidistant from the hyperplane
* Broke out several components into separate functions, which makes a more memory-efficient mode of operation possible
* Removed some unnecessary checking when processing neighbor graph
* C++11 and RcppArmadillo 0.7.100.3.0 are now required (this was necessary for support for larger datasets)
* Added appveyor to check Windows compatibility

### largeVis 0.1.4

Expand Down
20 changes: 18 additions & 2 deletions R/RcppExports.R
Expand Up @@ -9,8 +9,16 @@ searchTrees <- function(threshold, n_trees, K, max_recursion_degree, maxIter, da
.Call('largeVis_searchTrees', PACKAGE = 'largeVis', threshold, n_trees, K, max_recursion_degree, maxIter, data, distMethod, verbose)
}

distance <- function(is, js, data, distMethod, verbose) {
.Call('largeVis_distance', PACKAGE = 'largeVis', is, js, data, distMethod, verbose)
fastDistance <- function(is, js, data, distMethod, verbose) {
.Call('largeVis_fastDistance', PACKAGE = 'largeVis', is, js, data, distMethod, verbose)
}

# Thin R-side wrapper: forwards its arguments, unchanged and in order, to the
# compiled routine registered as 'largeVis_fastCDistance' in the largeVis package.
# The column-compressed method of distance() calls it with the matrix's
# i, p and x slots as i_locations, p_locations and x.
# NOTE(review): this looks like Rcpp-generated glue; if so, hand edits here
# will be overwritten on regeneration -- confirm before relying on them.
fastCDistance <- function(is, js, i_locations, p_locations, x, distMethod, verbose) {
.Call('largeVis_fastCDistance', PACKAGE = 'largeVis', is, js, i_locations, p_locations, x, distMethod, verbose)
}

# Thin R-side wrapper: forwards its arguments, unchanged and in order, to the
# compiled routine registered as 'largeVis_fastSDistance' in the largeVis package.
# The triplet-matrix method of distance() is its caller in this package.
# NOTE(review): this looks like Rcpp-generated glue; if so, hand edits here
# will be overwritten on regeneration -- confirm before relying on them.
fastSDistance <- function(is, js, i_locations, j_locations, x, distMethod, verbose) {
.Call('largeVis_fastSDistance', PACKAGE = 'largeVis', is, js, i_locations, j_locations, x, distMethod, verbose)
}

distMatrixTowij <- function(is, js, xs, sigmas, N, verbose) {
Expand All @@ -21,3 +29,11 @@ sigFunc <- function(sigma, x_i, perplexity) {
.Call('largeVis_sigFunc', PACKAGE = 'largeVis', sigma, x_i, perplexity)
}

# Thin R-side wrapper: forwards its arguments, unchanged and in order, to the
# compiled routine registered as 'largeVis_searchTreesCSparse' in the largeVis package.
# randomProjectionTreeSearch.CsparseMatrix calls it with the matrix's
# i, p and x slots as the arguments of the same names.
# NOTE(review): this looks like Rcpp-generated glue; if so, hand edits here
# will be overwritten on regeneration -- confirm before relying on them.
searchTreesCSparse <- function(threshold, n_trees, K, max_recursion_degree, maxIter, i, p, x, distMethod, verbose) {
.Call('largeVis_searchTreesCSparse', PACKAGE = 'largeVis', threshold, n_trees, K, max_recursion_degree, maxIter, i, p, x, distMethod, verbose)
}

# Thin R-side wrapper: forwards its arguments, unchanged and in order, to the
# compiled routine registered as 'largeVis_searchTreesTSparse' in the largeVis package.
# randomProjectionTreeSearch.TsparseMatrix calls it with the matrix's
# i, j and x slots as the arguments of the same names.
# NOTE(review): this looks like Rcpp-generated glue; if so, hand edits here
# will be overwritten on regeneration -- confirm before relying on them.
searchTreesTSparse <- function(threshold, n_trees, K, max_recursion_degree, maxIter, i, j, x, distMethod, verbose) {
.Call('largeVis_searchTreesTSparse', PACKAGE = 'largeVis', threshold, n_trees, K, max_recursion_degree, maxIter, i, j, x, distMethod, verbose)
}

3 changes: 2 additions & 1 deletion R/buildEdgeMatrix.R
Expand Up @@ -26,10 +26,11 @@ buildEdgeMatrix <- function(x,i,j,p,d,perplexity,verbose) UseMethod("buildEdgeMa
buildEdgeMatrix.default <- function(x = NULL,
i,
j,
p,
p = NULL,
d,
perplexity = 50,
verbose = TRUE) {
if (is.null(p)) p <- i2p(i)
N <- max(max(i), max(j)) + 1

if (verbose) {
Expand Down
66 changes: 66 additions & 0 deletions R/distance.R
@@ -0,0 +1,66 @@
#' Pairwise Euclidean or angular distances between selected columns
#'
#' S3 generic wrapping a C++ function that calculates pairwise distances in a
#' memory- and CPU-efficient manner.
#'
#' The columns of `x` to compare are identified by `i` and `j`; the Euclidean
#' or angular distances between them are calculated and returned.
#'
#' @param x A (potentially sparse) matrix, where examples are columns and features are rows.
#' @param i 0-indexed vector of column indices.
#' @param j 0-indexed vector of column indices.
#' @param distance_method Either "Euclidean" or "Cosine".
#' @param verbose Verbosity.
#'
#' @return A vector of the distances between the columns in `x` indexed by `i` and `j`.
#' @export
distance <- function(x, i, j, distance_method, verbose) {
  UseMethod("distance")
}

#' @export
#' @rdname distance
distance.matrix <- function(x,
                            i,
                            j,
                            distance_method = "Euclidean",
                            verbose = TRUE) {
  # Dense input: the matrix itself is handed to the compiled routine.
  fastDistance(is = i,
               js = j,
               data = x,
               distMethod = distance_method,
               verbose = verbose)
}

#' @export
#' @rdname distance
distance.CsparseMatrix <- function(x,
                                   i,
                                   j,
                                   distance_method = "Euclidean",
                                   verbose = TRUE) {
  # Column-compressed input: pass the i, p and x slots of the matrix
  # rather than the S4 object itself.
  fastCDistance(is = i,
                js = j,
                i_locations = x@i,
                p_locations = x@p,
                x = x@x,
                distMethod = distance_method,
                verbose = verbose)
}

#' @export
#' @rdname distance
distance.TsparseMatrix <- function(x,
                                   i,
                                   j,
                                   distance_method = "Euclidean",
                                   verbose = TRUE) {
  # Triplet input: pass the i, j and x slots of the matrix. The values live in
  # the `x` slot; a triplet matrix has no `p` slot (that belongs to the
  # column-compressed form), so reading `x@p` here would fail.
  fastSDistance(is = i,
                js = j,
                i_locations = x@i,
                j_locations = x@j,
                x = x@x,
                distMethod = distance_method,
                verbose = verbose)
}

#' Convert a k-NN graph into a pair of 0-indexed index vectors.
#'
#' Each column of `x` holds the neighbors found for one vertex. Every retained
#' entry of `x` is returned in `j`, paired in `i` with the 0-based number of the
#' column it came from. Entries equal to `-1` are dropped from both vectors.
#'
#' The nodes in `j` are the nearest neighbors of the nodes in `i`, not the
#' other way round: with `i = c(0,0,0,1,1,1)` and `j = c(1,2,3,2,3,4)`, nodes
#' `1, 2 & 3` are nearest neighbors of node 0, but node 0 is not a nearest
#' neighbor of node 1.
#'
#' @param x A `[K,N]` matrix of indices of the nearest neighbors of each vertex. 0-indexed.
#'
#' @return A list with fields:
#' \describe{
#' \item{i}{The slowly-varying indices of x}
#' \item{j}{The quickly-varying indices of x}
#' }
#' @export
neighborsToVectors <- function(x) {
  n_neighbors <- nrow(x)
  n_vertices <- ncol(x)
  # Column-major flattening keeps the entries of one vertex together.
  targets <- as.vector(x)
  sources <- rep(0:(n_vertices - 1), each = n_neighbors)
  # Positions holding -1 are removed from both vectors.
  found <- !(targets == -1)
  list(i = sources[found], j = targets[found])
}
34 changes: 5 additions & 29 deletions R/largeVis.R
Expand Up @@ -9,7 +9,6 @@
#' @param x A matrix, where the features are rows and the examples are columns.
#' @param dim The number of dimensions in the output
#' @param K The number of nearest-neighbors to use in computing the kNN graph
#' @param check.assumptions Whether to check the input matrix for duplicates, \code{NA}`s, etc.
#' @param n_trees See \code{\link{randomProjectionTreeSearch}}. The default is set at 50, which is the number
#' used in the examples in the original paper.
#' @param tree_threshold See \code{\link{randomProjectionTreeSearch}}. By default, this is the number of features
Expand Down Expand Up @@ -71,8 +70,6 @@ vis <- function(x,
dim = 2,
K = 40,

check.assumptions = TRUE,

n_trees = 50,
tree_threshold = max(10, nrow(x)),
max_iter = 3,
Expand All @@ -95,13 +92,6 @@ vis <- function(x,
...) {
N <- ncol(x)

if (check.assumptions) {
if ( (any(is.na(x)) +
any(is.infinite(x)) +
any(is.nan(x))) > 0)
stop("Missing values present in input matrix.")
}

#############################################
# Search for kNearestNeighbors
#############################################
Expand All @@ -118,26 +108,14 @@ vis <- function(x,
# Clean knns
#############################################
if (verbose[1]) cat("Calculating edge weights...")
# These vectors are analogous to the components of a sparse matrix,
# but both triple and C-compressed forms are created.
# The i and j vectors are 0-indexed while p is 1-indexed.
is <- rep(0:(N - 1), each = K)
js <- as.vector(knns)
is <- is[! js == -1]
js <- js[! js == -1]
dupes <- duplicated(data.frame(is, js))
is <- is[! dupes]
js <- js[! dupes]
ord <- order(is)
is <- is[ord]
js <- js[ord]
neighborIndices <- neighborsToVectors(knns)

#######################################################
# Calculate edge weights for candidate neighbors
#######################################################
if (verbose) cat("Calculating neighbor distances.\n")

xs <- distance(is, js, x, distance_method,verbose)[, 1]
xs <- distance(x = x, neighborIndices$i, neighborIndices$j, distance_method,verbose)[, 1]

if (verbose) cat("\n")

Expand All @@ -154,15 +132,13 @@ vis <- function(x,
# Get w_{ij}
#######################################################

ps <- i2p(is)
sigwij <- buildEdgeMatrix(i = is,
j = js,
p = ps,
sigwij <- buildEdgeMatrix(i = neighborIndices$i,
j = neighborIndices$j,
d = xs,
perplexity = perplexity,
verbose = verbose)


rm(neighborIndices)
#######################################################
# Estimate embeddings
#######################################################
Expand Down
1 change: 1 addition & 0 deletions R/projectKNNs.R
Expand Up @@ -67,6 +67,7 @@ projectKNNs <- function(wij, # symmetric sparse matrix
if (alpha == 0) warning("The alternative (alpha == 0) distance function is not fully implemented.")
N <- (length(wij@p) -1)
js <- rep(0:(N - 1), diff(wij@p))
if (any(is.na(js))) stop("NAs in the index vector.")
is <- wij@i

##############################################
Expand Down
86 changes: 81 additions & 5 deletions R/projectionTreeSearch.R
Expand Up @@ -7,7 +7,7 @@
#' distinct partitionable clusters, try increasing the \code{tree_threshold} to increase the number
#' of returned neighbors.
#'
#' @param x A matrix.
#' @param x A (potentially sparse) matrix, where examples are columns and features are rows.
#' @param K How many nearest neighbors to seek for each node.
#' @param n_trees The number of trees to build.
#' @param tree_threshold The threshold for creating a new branch. The paper authors suggest
Expand All @@ -19,10 +19,18 @@
#'
#' @return A [K, N] matrix of the approximate K nearest neighbors for each vertex.
#' @export
#'
#' @examples
#'
randomProjectionTreeSearch <- function(x,
                                       K = 5,
                                       n_trees = 2,
                                       tree_threshold = max(10, nrow(x)),
                                       max_iter = 2,
                                       max_depth = 32,
                                       distance_method = "Euclidean",
                                       verbose = TRUE) {
  # S3 generic: the method is chosen by the class of `x`.
  UseMethod("randomProjectionTreeSearch")
}

#' @export
#' @rdname randomProjectionTreeSearch
randomProjectionTreeSearch.matrix <- function(x,
K = 5,
n_trees = 2,
tree_threshold = max(10, nrow(x)),
Expand All @@ -40,11 +48,79 @@ randomProjectionTreeSearch <- function(x,
distance_method,
verbose = verbose)

if (sum(colSums(knns != -1) == 0) + sum(is.na(knns)) + sum(is.nan(knns)) > 0)
if (sum(colSums(knns != -1) == 0) > 0)
stop ("After neighbor search, no candidates for some nodes.")
if (sum(is.na(knns)) + sum(is.nan(knns)) > 0)
stop ("NAs or nans in neighbor graph.")
if (verbose[1] && sum(knns == -1) > 0)
warning ("Wanted to find", nrow(knns) * ncol(knns), " neighbors, but only found",
((nrow(knns) * ncol(knns)) - sum(knns == -1)))

return(knns)
}

#' @export
#' @rdname randomProjectionTreeSearch
randomProjectionTreeSearch.CsparseMatrix <- function(x,
                                                     K = 5,
                                                     n_trees = 2,
                                                     tree_threshold = max(10, nrow(x)),
                                                     max_iter = 2,
                                                     max_depth = 32,
                                                     distance_method = "Euclidean",
                                                     verbose = TRUE) {
  if (verbose[1]) cat("Searching for neighbors.\n")

  # Column-compressed input: pass the i, p and x slots of the matrix to the
  # compiled search rather than the S4 object itself.
  knns <- searchTreesCSparse(threshold = tree_threshold,
                             n_trees = n_trees,
                             K = K,
                             max_recursion_degree = max_depth,
                             maxIter = max_iter,
                             i = x@i,
                             p = x@p,
                             x = x@x,
                             distMethod = distance_method,
                             verbose = verbose)

  # Missing values must be ruled out first: the comparisons below yield NA when
  # any entry is NA, and `if (NA)` would abort with an unrelated error before
  # the intended message could be raised. is.na() is also TRUE for NaN.
  if (anyNA(knns))
    stop("NAs or nans in neighbor graph.")

  # -1 marks an unfilled neighbor slot; a column consisting only of -1 means
  # that node received no candidates at all.
  if (any(colSums(knns != -1) == 0))
    stop("After neighbor search, no candidates for some nodes.")

  n_wanted <- nrow(knns) * ncol(knns)
  n_missing <- sum(knns == -1)
  if (verbose[1] && n_missing > 0)
    warning("Wanted to find ", n_wanted, " neighbors, but only found ",
            n_wanted - n_missing)

  knns
}

#' @export
#' @rdname randomProjectionTreeSearch
randomProjectionTreeSearch.TsparseMatrix <- function(x,
                                                     K = 5,
                                                     n_trees = 2,
                                                     tree_threshold = max(10, nrow(x)),
                                                     max_iter = 2,
                                                     max_depth = 32,
                                                     distance_method = "Euclidean",
                                                     verbose = TRUE) {
  if (verbose[1]) cat("Searching for neighbors.\n")

  # Triplet input: pass the i, j and x slots of the matrix to the compiled
  # search rather than the S4 object itself.
  knns <- searchTreesTSparse(threshold = tree_threshold,
                             n_trees = n_trees,
                             K = K,
                             max_recursion_degree = max_depth,
                             maxIter = max_iter,
                             i = x@i,
                             j = x@j,
                             x = x@x,
                             distMethod = distance_method,
                             verbose = verbose)

  # Missing values must be ruled out first: the comparisons below yield NA when
  # any entry is NA, and `if (NA)` would abort with an unrelated error before
  # the intended message could be raised. is.na() is also TRUE for NaN.
  if (anyNA(knns))
    stop("NAs or nans in neighbor graph.")

  # -1 marks an unfilled neighbor slot; a column consisting only of -1 means
  # that node received no candidates at all.
  if (any(colSums(knns != -1) == 0))
    stop("After neighbor search, no candidates for some nodes.")

  n_wanted <- nrow(knns) * ncol(knns)
  n_missing <- sum(knns == -1)
  if (verbose[1] && n_missing > 0)
    warning("Wanted to find ", n_wanted, " neighbors, but only found ",
            n_wanted - n_missing)

  knns
}

0 comments on commit 0296733

Please sign in to comment.