From d4d854de64bd00f64b9250d4650e2b1093fc3311 Mon Sep 17 00:00:00 2001
From: Sanjay Mohan
Date: Wed, 13 May 2020 17:06:56 -0700
Subject: [PATCH] add transform data functionality (#26)

Co-authored-by: Sanjay Mohan <>
---
 README.md           |  27 +++++++-
 src/UMAP.jl         |   2 +-
 src/embeddings.jl   |  63 ++++++++++++-----
 src/umap_.jl        | 161 +++++++++++++++++++++++++++++++++++---------
 src/utils.jl        |  64 +++++++++++++++---
 test/umap_tests.jl  | 107 +++++++++++++++++++++++++----
 test/utils_tests.jl |  11 +++
 7 files changed, 359 insertions(+), 76 deletions(-)

diff --git a/README.md b/README.md
index d988436..5e3a893 100644
--- a/README.md
+++ b/README.md
@@ -26,13 +26,38 @@ UMAP can use a precomputed distance matrix instead of finding the nearest neighb
 embedding = umap(distances, n_components; metric=:precomputed)
 ```
 
+## Fitting a UMAP model to a dataset and transforming new data
+
+### Constructing a model
+To construct a model to use for embedding new data, use the constructor:
+```jl
+model = UMAP_(X, n_components; <kwargs>)
+```
+where the constructor takes the same keyword arguments (kwargs) as `umap`. The returned object has the following fields:
+```jl
+model.graph # The graph of fuzzy simplicial set membership strengths of each point in the dataset
+model.embedding # The embedding of the dataset
+model.data # A reference to the original dataset
+model.knns # A matrix of indices of nearest neighbors of points in the dataset,
+           # as determined on the original manifold (may be approximate)
+model.dists # The distances of the neighbors indicated by model.knns
+```
+
+### Embedding new data
+To transform new data into the existing embedding of a UMAP model, use the `transform` function:
+```jl
+Q_embedding = transform(model, Q; <kwargs>)
+```
+where `Q` is a matrix of new query data to embed into the existing embedding, and `model` is the object obtained from the `UMAP_` call above. `Q` must come from a space of the same dimensionality as `model.data` (i.e. `X` in the `UMAP_` call above).
+
+The remaining keyword arguments (kwargs) are the same as for the functions above.
+
 ## Implementation Details
 There are two main steps involved in UMAP: building a weighted graph with edges connecting points to their nearest neighbors, and optimizing the low-dimensional embedding of that graph. The first step is accomplished either by an exact kNN search (for datasets with `< 4096` points) or by the approximate kNN search algorithm, [NNDescent](https://github.com/dillondaudert/NearestNeighborDescent.jl). This step is also usually the most costly. The low-dimensional embedding is initialized (by default) with the eigenvectors of the normalized Laplacian of the kNN graph. These are found using ARPACK (via [Arpack.jl](https://github.com/JuliaLinearAlgebra/Arpack.jl)).
 
 ## Current Limitations
 
-- **No transform**: Only one-time embeddings are possible at the moment. That is to say, it isn't possible to "fit" UMAP to a dataset and then use it to "transform" new data
 - **Input data types**: Only data points that are represented by vectors of numbers (passed in as a matrix) are valid inputs. This is mostly due to a lack of support for other formats in [NNDescent](https://github.com/dillondaudert/NearestNeighborDescent.jl). Support for e.g.
string datasets is possible in the future - **Sequential**: This implementation does not take advantage of any parallelism diff --git a/src/UMAP.jl b/src/UMAP.jl index 2e48109..933eb9a 100644 --- a/src/UMAP.jl +++ b/src/UMAP.jl @@ -11,6 +11,6 @@ include("utils.jl") include("embeddings.jl") include("umap_.jl") -export umap, UMAP_ +export umap, UMAP_, transform end # module diff --git a/src/embeddings.jl b/src/embeddings.jl index db0cdf3..ea86fed 100644 --- a/src/embeddings.jl +++ b/src/embeddings.jl @@ -19,6 +19,21 @@ function initialize_embedding(graph::AbstractMatrix{T}, n_components, ::Val{:ran return [20 .* rand(T, n_components) .- 10 for _ in 1:size(graph, 1)] end +""" + initialize_embedding(graph::AbstractMatrix{<:Real}, ref_embedding::AbstractMatrix{T<:AbstractFloat}) -> embedding + +Initialize an embedding of points corresponding to the columns of the `graph`, by taking weighted means of +the columns of `ref_embedding`, where weights are values from the rows of the `graph`. + +The resulting embedding will have shape `(size(ref_embedding, 1), size(graph, 2))`, where `size(ref_embedding, 1)` +is the number of components (dimensions) of the `reference embedding`, and `size(graph, 2)` is the number of +samples in the resulting embedding. Its elements will have type T. +""" +function initialize_embedding(graph::AbstractMatrix{<:Real}, ref_embedding::AbstractMatrix{T})::Vector{Vector{T}} where {T<:AbstractFloat} + embed = (ref_embedding * graph) ./ (sum(graph, dims=1) .+ eps(T)) + return Vector{T}[eachcol(embed)...] +end + """ spectral_layout(graph, embed_dim) -> embedding @@ -46,20 +61,28 @@ function spectral_layout(graph::SparseMatrixCSC{T}, end """ - optimize_embedding(graph, embedding, n_epochs, initial_alpha, min_dist, spread, gamma, neg_sample_rate) -> embedding + optimize_embedding(graph, query_embedding, ref_embedding, n_epochs, initial_alpha, min_dist, spread, gamma, neg_sample_rate, _a=nothing, _b=nothing; move_ref=false) -> embedding Optimize an embedding by minimizing the fuzzy set cross entropy between the high and low dimensional simplicial sets using stochastic gradient descent. +Optimize "query" samples with respect to "reference" samples. # Arguments - `graph`: a sparse matrix of shape (n_samples, n_samples) -- `embedding`: a vector of length (n_samples,) of vectors representing the embedded data points +- `query_embedding`: a vector of length (n_samples,) of vectors representing the embedded data points to be optimized ("query" samples) +- `ref_embedding`: a vector of length (n_samples,) of vectors representing the embedded data points to optimize against ("reference" samples) - `n_epochs`: the number of training epochs for optimization - `initial_alpha`: the initial learning rate - `gamma`: the repulsive strength of negative samples -- `neg_sample_rate::Integer`: the number of negative samples per positive sample +- `neg_sample_rate`: the number of negative samples per positive sample +- `_a`: this controls the embedding. If the actual argument is `nothing`, this is determined automatically by `min_dist` and `spread`. +- `_b`: this controls the embedding. If the actual argument is `nothing`, this is determined automatically by `min_dist` and `spread`. + +# Keyword Arguments +- `move_ref::Bool = false`: if true, also improve the embeddings in `ref_embedding`, else fix them and only improve embeddings in `query_embedding`. 
""" function optimize_embedding(graph, - embedding, + query_embedding, + ref_embedding, n_epochs, initial_alpha, min_dist, @@ -67,8 +90,10 @@ function optimize_embedding(graph, gamma, neg_sample_rate, _a=nothing, - _b=nothing) + _b=nothing; + move_ref::Bool=false) a, b = fit_ab(min_dist, spread, _a, _b) + self_reference = query_embedding === ref_embedding alpha = initial_alpha for e in 1:n_epochs @@ -77,34 +102,38 @@ function optimize_embedding(graph, j = rowvals(graph)[ind] p = nonzeros(graph)[ind] if rand() <= p - sdist = evaluate(SqEuclidean(), embedding[i], embedding[j]) + sdist = evaluate(SqEuclidean(), query_embedding[i], ref_embedding[j]) if sdist > 0 delta = (-2 * a * b * sdist^(b-1))/(1 + a*sdist^b) else delta = 0 end - @simd for d in eachindex(embedding[i]) - grad = clamp(delta * (embedding[i][d] - embedding[j][d]), -4, 4) - embedding[i][d] += alpha * grad - embedding[j][d] -= alpha * grad + @simd for d in eachindex(query_embedding[i]) + grad = clamp(delta * (query_embedding[i][d] - ref_embedding[j][d]), -4, 4) + query_embedding[i][d] += alpha * grad + if move_ref + ref_embedding[j][d] -= alpha * grad + end end for _ in 1:neg_sample_rate - k = rand(1:size(graph, 2)) - i != k || continue - sdist = evaluate(SqEuclidean(), embedding[i], embedding[k]) + k = rand(eachindex(ref_embedding)) + if i == k && self_reference + continue + end + sdist = evaluate(SqEuclidean(), query_embedding[i], ref_embedding[k]) if sdist > 0 delta = (2 * gamma * b) / ((1//1000 + sdist)*(1 + a*sdist^b)) else delta = 0 end - @simd for d in eachindex(embedding[i]) + @simd for d in eachindex(query_embedding[i]) if delta > 0 - grad = clamp(delta * (embedding[i][d] - embedding[k][d]), -4, 4) + grad = clamp(delta * (query_embedding[i][d] - ref_embedding[k][d]), -4, 4) else grad = 4 end - embedding[i][d] += alpha * grad + query_embedding[i][d] += alpha * grad end end @@ -114,5 +143,5 @@ function optimize_embedding(graph, alpha = initial_alpha*(1 - e//n_epochs) end - return embedding + return query_embedding end diff --git a/src/umap_.jl b/src/umap_.jl index 72e5cd1..be745da 100644 --- a/src/umap_.jl +++ b/src/umap_.jl @@ -1,22 +1,32 @@ # an implementation of Uniform Manifold Approximation and Projection # for Dimension Reduction, L. McInnes, J. Healy, J. Melville, 2018. 
-# NOTE: unused for now -struct UMAP_{S <: Real, M <: AbstractMatrix{S}, N <: AbstractMatrix{S}} +struct UMAP_{S <: Real, M <: AbstractMatrix{S}, N <: AbstractMatrix{S}, D<:AbstractMatrix{S}, K<:AbstractMatrix{<:Integer}, I<:AbstractMatrix{S}} graph::M embedding::N + data::D + knns::K + dists::I - function UMAP_{S, M, N}(graph, embedding) where {S<:Real, - M<:AbstractMatrix{S}, - N<:AbstractMatrix{S}} + function UMAP_{S, M, N, D, K, I}(graph, embedding, data, knns, dists) where {S<:Real, + M<:AbstractMatrix{S}, + N<:AbstractMatrix{S}, + D<:AbstractMatrix{S}, + K<:AbstractMatrix{<:Integer}, + I<:AbstractMatrix{S}} issymmetric(graph) || isapprox(graph, graph') || error("UMAP_ constructor expected graph to be a symmetric matrix") - new(graph, embedding) + size(knns) == size(dists) || error("UMAP_ constructor expected knns and dists to have equal size") + new(graph, embedding, data, knns, dists) end end -function UMAP_(graph::M, embedding::N) where {S <: Real, - M <: AbstractMatrix{S}, - N <: AbstractMatrix{S}} - return UMAP_{S, M, N}(graph, embedding) + +function UMAP_(graph::M, embedding::N, data::D, knns::K, dists::I) where {S<:Real, + M<:AbstractMatrix{S}, + N<:AbstractMatrix{S}, + D<:AbstractMatrix{S}, + K<:AbstractMatrix{<:Integer}, + I<:AbstractMatrix{S}} + return UMAP_{S, M, N, D, K, I}(graph, embedding, data, knns, dists) end const SMOOTH_K_TOLERANCE = 1e-5 @@ -28,6 +38,27 @@ const SMOOTH_K_TOLERANCE = 1e-5 Embed the data `X` into a `n_components`-dimensional space. `n_neighbors` controls how many neighbors to consider as locally connected. +See `UMAP_` for a description of keyword arguments. +""" +function umap(args...; kwargs...) + # this is just a convenience function for now + return UMAP_(args...; kwargs...).embedding +end + +""" + UMAP_(X::AbstractMatrix[, n_components=2]; ) -> UMAP_ object + +Create a model representing the embedding of data `X` into `n_components`-dimensional space. +The returned model has the following fields: + +- `graph`: the graph representing the fuzzy simplicial set of the manifold of `X`. +- `embedding`: the `n-component`-dimensional embedding of the data `X`. +- `data`: a reference to the input data `X`. +- `knns`: a matrix of indices of `X` representing each point's nearest neighbors according to `metric`. + `knns[j, i]` is the index of point i's jth nearest neighbor. +- `dists`: the respective distances of the above neighbors. + `dists[j, i]` is the distance of point i's jth nearest neighbor. + # Keyword Arguments - `n_neighbors::Integer = 15`: the number of neighbors to consider as locally connected. Larger values capture more global structure in the data, while small values capture more local structure. - `metric::{SemiMetric, Symbol} = Euclidean()`: the metric to calculate distance in the input space. It is also possible to pass `metric = :precomputed` to treat `X` like a precomputed distance matrix. @@ -43,12 +74,6 @@ how many neighbors to consider as locally connected. - `a = nothing`: this controls the embedding. By default, this is determined automatically by `min_dist` and `spread`. - `b = nothing`: this controls the embedding. By default, this is determined automatically by `min_dist` and `spread`. """ -function umap(args...; kwargs...) 
- # this is just a convenience function for now - return UMAP_(args...; kwargs...).embedding -end - - function UMAP_(X::AbstractMatrix{S}, n_components::Integer = 2; n_neighbors::Integer = 15, @@ -75,42 +100,114 @@ function UMAP_(X::AbstractMatrix{S}, local_connectivity > 0 || throw(ArgumentError("local_connectivity must be greater than 0")) - # main algorithm - graph = fuzzy_simplicial_set(X, n_neighbors, metric, local_connectivity, set_operation_ratio) + knns, dists = knn_search(X, n_neighbors, metric) + graph = fuzzy_simplicial_set(knns, dists, n_neighbors, size(X, 2), local_connectivity, set_operation_ratio) embedding = initialize_embedding(graph, n_components, Val(init)) - embedding = optimize_embedding(graph, embedding, n_epochs, learning_rate, min_dist, spread, repulsion_strength, neg_sample_rate) + embedding = optimize_embedding(graph, embedding, embedding, n_epochs, learning_rate, min_dist, spread, repulsion_strength, neg_sample_rate, move_ref=true) # TODO: if target variable y is passed, then construct target graph # in the same manner and do a fuzzy simpl set intersection - return UMAP_(graph, hcat(embedding...)) + return UMAP_(graph, hcat(embedding...), X, knns, dists) end """ - fuzzy_simplicial_set(X, n_neighbors, metric, local_connectivity, set_op_ratio) -> graph::SparseMatrixCSC + transform(model::UMAP_, Q::AbstractMatrix; ) -> embedding + +Use the given model to embed new points into an existing embedding. `Q` is a matrix of some number of points (columns) +in the same space as `model.data`. The returned embedding is the embedding of these points in n-dimensional space, where +n is the dimensionality of `model.embedding`. This embedding is created by finding neighbors of `Q` in `model.embedding` +and optimizing cross entropy according to membership strengths according to these neighbors. -Construct the local fuzzy simplicial sets of each point in `X` by -finding the approximate nearest `n_neighbors`, normalizing the distances +# Keyword Arguments +- `n_neighbors::Integer = 15`: the number of neighbors to consider as locally connected. Larger values capture more global structure in the data, while small values capture more local structure. +- `metric::{SemiMetric, Symbol} = Euclidean()`: the metric to calculate distance in the input space. It is also possible to pass `metric = :precomputed` to treat `X` like a precomputed distance matrix. +- `n_epochs::Integer = 300`: the number of training epochs for embedding optimization +- `learning_rate::Real = 1`: the initial learning rate during optimization +- `init::Symbol = :spectral`: how to initialize the output embedding; valid options are `:spectral` and `:random` +- `min_dist::Real = 0.1`: the minimum spacing of points in the output embedding +- `spread::Real = 1`: the effective scale of embedded points. Determines how clustered embedded points are in combination with `min_dist`. +- `set_operation_ratio::Real = 1`: interpolates between fuzzy set union and fuzzy set intersection when constructing the UMAP graph (global fuzzy simplicial set). The value of this parameter should be between 1.0 and 0.0: 1.0 indicates pure fuzzy union, while 0.0 indicates pure fuzzy intersection. +- `local_connectivity::Integer = 1`: the number of nearest neighbors that should be assumed to be locally connected. The higher this value, the more connected the manifold becomes. This should not be set higher than the intrinsic dimension of the manifold. +- `repulsion_strength::Real = 1`: the weighting of negative samples during the optimization process. 
+- `neg_sample_rate::Integer = 5`: the number of negative samples to select for each positive sample. Higher values will increase computational cost but result in slightly more accuracy. +- `a = nothing`: this controls the embedding. By default, this is determined automatically by `min_dist` and `spread`. +- `b = nothing`: this controls the embedding. By default, this is determined automatically by `min_dist` and `spread`. +""" +function transform(model::UMAP_, + Q::AbstractMatrix{S}; + n_neighbors::Integer = 15, + metric::Union{SemiMetric, Symbol} = Euclidean(), + n_epochs::Integer = 100, + learning_rate::Real = 1, + min_dist::Real = 1//10, + spread::Real = 1, + set_operation_ratio::Real = 1, + local_connectivity::Integer = 1, + repulsion_strength::Real = 1, + neg_sample_rate::Integer = 5, + a::Union{Real, Nothing} = nothing, + b::Union{Real, Nothing} = nothing + ) where {S<:Real} + # argument checking + size(Q, 2) > n_neighbors > 0 || throw(ArgumentError("size(Q, 2) must be greater than n_neighbors and n_neighbors must be greater than 0")) + learning_rate > 0 || throw(ArgumentError("learning_rate must be greater than 0")) + min_dist > 0 || throw(ArgumentError("min_dist must be greater than 0")) + 0 ≤ set_operation_ratio ≤ 1 || throw(ArgumentError("set_operation_ratio must lie in [0, 1]")) + local_connectivity > 0 || throw(ArgumentError("local_connectivity must be greater than 0")) + size(model.data, 2) == size(model.embedding, 2) || throw(ArgumentError("model.data must have same number of columns as model.embedding")) + size(model.data, 1) == size(Q, 1) || throw(ArgumentError("size(model.data, 1) must equal size(Q, 1)")) + + + n_epochs = max(0, n_epochs) + # main algorithm + knns, dists = knn_search(model.data, Q, n_neighbors, metric, model.knns, model.dists) + graph = fuzzy_simplicial_set(knns, dists, n_neighbors, size(model.data, 2), local_connectivity, set_operation_ratio, false) + + embedding = initialize_embedding(graph, model.embedding) + ref_embedding = collect(eachcol(model.embedding)) + embedding = optimize_embedding(graph, embedding, ref_embedding, n_epochs, learning_rate, min_dist, spread, repulsion_strength, neg_sample_rate, a, b, move_ref=false) + + return reduce(hcat, embedding) +end + + +""" + fuzzy_simplicial_set(knns, dists, n_neighbors, n_points, local_connectivity, set_op_ratio, apply_fuzzy_combine=true) -> membership_graph::SparseMatrixCSC, + +Construct the local fuzzy simplicial sets of each point represented by its distances +to its `n_neighbors` nearest neighbors, stored in `knns` and `dists`, normalizing the distances on the manifolds, and converting the metric space to a simplicial set. +`n_points` indicates the total number of points of the original data, while `knns` contains +indices of some subset of those points (ie some subset of 1:`n_points`). If `knns` represents +neighbors of the elements of some set with itself, then `knns` should have `n_points` number of +columns. Otherwise, these two values may be inequivalent. +If `apply_fuzzy_combine` is true, use intersections and unions to combine +fuzzy sets of neighbors (default true). + +The returned graph will have size (`n_points`, size(knns, 2)). 
""" -function fuzzy_simplicial_set(X, +function fuzzy_simplicial_set(knns, + dists, n_neighbors, - metric, + n_points, local_connectivity, - set_operation_ratio) - - knns, dists = knn_search(X, n_neighbors, metric) + set_operation_ratio, + apply_fuzzy_combine=true) σs, ρs = smooth_knn_dists(dists, n_neighbors, local_connectivity) rows, cols, vals = compute_membership_strengths(knns, dists, σs, ρs) - fs_set = sparse(rows, cols, vals, size(knns, 2), size(knns, 2)) - - res = combine_fuzzy_sets(fs_set, set_operation_ratio) + fs_set = sparse(rows, cols, vals, n_points, size(knns, 2)) - return dropzeros(res) + if apply_fuzzy_combine + res = combine_fuzzy_sets(fs_set, set_operation_ratio) + return dropzeros(res) + else + return dropzeros(fs_set) + end end """ diff --git a/src/utils.jl b/src/utils.jl index e888a04..cd0cd65 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -22,22 +22,27 @@ function fit_ab(min_dist, spread, ::Nothing, ::Nothing) return a, b end -knn_search(dist_mat, k, metric::Symbol) = knn_search(dist_mat, k, Val(metric)) -""" - knn_search(dist_mat, k, :precomputed) -> knns, dists +knn_search(X::AbstractMatrix, k, metric::Symbol) = knn_search(X, k, Val(metric)) -Find the `k` nearest neighbors of each point in a precomputed distance -matrix. -""" -knn_search(dist_mat, k, ::Val{:precomputed}) = _knn_from_dists(dist_mat, k) +# treat given matrix `X` as distance matrix +knn_search(X::AbstractMatrix, k, ::Val{:precomputed}) = _knn_from_dists(X, k) """ knn_search(X, k, metric) -> knns, dists -Find the `k` nearest neighbors of each point in `X` by `metric`. +Find the `k` nearest neighbors of each point. + +`metric` may be of type: +- ::Symbol - `knn_search` is dispatched to one of the following based on the evaluation of `metric`: +- ::Val(:precomputed) - computes neighbors from `X` treated as a precomputed distance matrix. +- ::SemiMetric - computes neighbors from `X` treated as samples, using the given metric. + +# Returns +- `knns`: `knns[j, i]` is the index of node i's jth nearest neighbor. +- `dists`: `dists[j, i]` is the distance of node i's jth nearest neighbor. """ -function knn_search(X, +function knn_search(X::AbstractMatrix, k, metric::SemiMetric) if size(X, 2) < 4096 @@ -69,8 +74,45 @@ function knn_search(X::AbstractMatrix{S}, return knn_matrices(knngraph) end -function _knn_from_dists(dist_mat::AbstractMatrix{S}, k) where {S <: Real} - knns_ = [partialsortperm(view(dist_mat, :, i), 2:k+1) for i in 1:size(dist_mat, 1)] +""" + knn_search(X, Q, k, metric, knns, dists) -> knns, dists + +Given a matrix `X` and a matrix `Q`, use the given metric to compute the `k` nearest neighbors out of the +columns of `X` from the queries (columns in `Q`). +If the matrices are large, reconstruct the approximate nearest neighbors graph of `X` using the given `knns` and `dists`, +representing indices and distances of pairwise neighbors of `X`, and use this to search for approximate nearest +neighbors of `Q`. +If the matrices are small, search for exact nearest neighbors of `Q` by computing all pairwise distances with `X`. + +`metric` may be of type: +- ::Symbol - `knn_search` is dispatched to one of the following based on the evaluation of `metric`: +- ::Val(:precomputed) - computes neighbors from `X` treated as a precomputed distance matrix. +- ::SemiMetric - computes neighbors from `X` treated as samples, using the given metric. + +# Returns +- `knns`: `knns[j, i]` is the index of node i's jth nearest neighbor. +- `dists`: `dists[j, i]` is the distance of node i's jth nearest neighbor. 
+""" +function knn_search(X::AbstractMatrix, + Q::AbstractMatrix, + k::Integer, + metric::SemiMetric, + knns::AbstractMatrix{<:Integer}, + dists::AbstractMatrix{<:Real}) + if size(X, 2) < 4096 + return _knn_from_dists(pairwise(metric, X, Q, dims=2), k, ignore_diagonal=false) + else + knngraph = HeapKNNGraph(collect(eachcol(X)), metric, knns, dists) + return search(knngraph, collect(eachcol(Q)), k; max_candidates=8*k) + end +end + + +function _knn_from_dists(dist_mat::AbstractMatrix{S}, k::Integer; ignore_diagonal=true) where {S <: Real} + # Ignore diagonal 0 elements (which will be smallest) when distance matrix represents pairwise distances of the same set + # If dist_mat represents distances between two different sets, diagonal elements be nontrivial + range = (1:k) .+ ignore_diagonal + knns_ = [partialsortperm(view(dist_mat, :, i), range) for i in 1:size(dist_mat, 2)] dists_ = [dist_mat[:, i][knns_[i]] for i in eachindex(knns_)] knns = hcat(knns_...)::Matrix{Int} dists = hcat(dists_...)::Matrix{S} diff --git a/test/umap_tests.jl b/test/umap_tests.jl index 036e9dd..bbd647a 100644 --- a/test/umap_tests.jl +++ b/test/umap_tests.jl @@ -18,26 +18,31 @@ @test umap_ isa UMAP_{Float64} @test size(umap_.graph) == (100, 100) @test size(umap_.embedding) == (2, 100) + @test umap_.data === data data = rand(Float32, 5, 100) @test UMAP_(data; init=:random) isa UMAP_{Float32} end @testset "fuzzy_simpl_set" begin - data = rand(20, 500) - k = 5 - umap_graph = fuzzy_simplicial_set(data, k, Euclidean(), 1, 1.) + # knns = rand(1:50, 5, 50) + # dists = rand(5, 50) + knns = [2 3 2; 3 1 1] + dists = [1.5 .5 .5; 2. 1.5 2.] + k = 3 + umap_graph = fuzzy_simplicial_set(knns, dists, k, 3, 1, 1.) @test issymmetric(umap_graph) @test all(0. .<= umap_graph .<= 1.) - data = rand(Float32, 20, 500) - umap_graph = fuzzy_simplicial_set(data, k, Euclidean(), 1, 1.f0) + @test size(umap_graph) == (3, 3) + + dists = convert.(Float32, dists) + umap_graph = fuzzy_simplicial_set(knns, dists, k, 3, 1, 1.f0) @test issymmetric(umap_graph) @test eltype(umap_graph) == Float32 - data = 2 .* rand(20, 1000) .- 1 - umap_graph = fuzzy_simplicial_set(data, k, CosineDist(), 1, 1.) - @test issymmetric(umap_graph) + umap_graph = fuzzy_simplicial_set(knns, dists, k, 200, 1, 1., false) @test all(0. .<= umap_graph .<= 1.) + @test size(umap_graph) == (200, 3) end @testset "smooth_knn_dists" begin @@ -87,18 +92,31 @@ end @testset "optimize_embedding" begin - Random.seed!(0) - A = sprand(10000, 10000, 0.001) - B = dropzeros(A + A' - A .* A') - layout = initialize_embedding(B, 5, Val(:random)) + graph1 = sparse(Symmetric(sprand(6,6,0.4))) + graph2 = sparse(sprand(5,3,0.4)) + graph3 = sparse(sprand(3,5,0.4)) + embedding = Vector{Float64}[[3, 1], [4, 5], [2, 3], [1, 7], [6, 3], [2, 6]] + n_epochs = 1 initial_alpha = 1. min_dist = 1. spread = 1. gamma = 1. 
neg_sample_rate = 5 - embedding = optimize_embedding(B, layout, n_epochs, initial_alpha, min_dist, spread, gamma, neg_sample_rate) - @test embedding isa Array{Array{Float64, 1}, 1} + for graph in [graph1, graph2, graph3] + ref_embedding = collect(eachcol(rand(2, size(graph, 1)))) + old_ref_embedding = deepcopy(ref_embedding) + query_embedding = rand(2, size(graph, 2)) + query_embedding = [query_embedding[:, i] for i in 1:size(query_embedding, 2)] + res_embedding = optimize_embedding(graph, query_embedding, ref_embedding, n_epochs, initial_alpha, + min_dist, spread, gamma, neg_sample_rate, move_ref=false) + @test res_embedding isa Array{Array{Float64, 1}, 1} + @test length(res_embedding) == length(query_embedding) + for i in 1:length(res_embedding) + @test length(res_embedding[i]) == length(query_embedding[i]) + end + @test isapprox(old_ref_embedding, ref_embedding, atol=1e-4) + end end @testset "spectral_layout" begin @@ -112,4 +130,65 @@ @inferred spectral_layout(convert(SparseMatrixCSC{Float32}, B), 5) end + @testset "initialize_embedding" begin + graph = [5 0 1 1; + 2 4 1 1; + 3 6 8 8] ./10 + ref_embedding = Float64[1 2 0; + 0 2 -1] + actual = [[9, 1], [8, 2], [3, -6], [3, -6]] ./10 + + embedding = initialize_embedding(graph, ref_embedding) + @test embedding isa AbstractVector{<:AbstractVector{Float64}} + @test length(embedding) == length(actual) + for i in 1:length(embedding) + @test length(embedding[i]) == length(actual[i]) + end + @test isapprox(embedding, actual, atol=1e-8) + + graph = Float16.(graph[:, [1,2]]) + graph[:, end] .= 0 + ref_embedding = Float16[1 2 0; + 0 2 -1] + actual = Vector{Float16}[[9, 1], [0, 0]] ./10 + + embedding = initialize_embedding(graph, ref_embedding) + @test embedding isa AbstractVector{<:AbstractVector{Float16}} + @test length(embedding) == length(actual) + for i in 1:length(embedding) + @test length(embedding[i]) == length(actual[i]) + end + @test isapprox(embedding, actual, atol=1e-2) + end + + @testset "umap transform" begin + @testset "argument validation tests" begin + data = rand(5, 10) + model = UMAP_(data, 2, n_neighbors=2, n_epochs=1) + query = rand(5, 8) + @test_throws ArgumentError transform(model, rand(6, 8); n_neighbors=3) # query size error + @test_throws ArgumentError transform(model, query; n_neighbors=0) # n_neighbors error + @test_throws ArgumentError transform(model, query; n_neighbors=15) # n_neighbors error + @test_throws ArgumentError transform(model, query; n_neighbors=1, min_dist = 0.) 
# min_dist error + + model = UMAP_(model.graph, model.embedding, rand(6, 10), model.knns, model.dists) + @test_throws ArgumentError transform(model, query; n_neighbors=3) # data size error + model = UMAP_(model.graph, model.embedding, rand(5, 9), model.knns, model.dists) + @test_throws ArgumentError transform(model, query; n_neighbors=3) # data size error + end + + @testset "transform test" begin + data = rand(5, 30) + model = UMAP_(data, 2, n_neighbors=2, n_epochs=1) + embedding = transform(model, rand(5, 10), n_epochs=5, n_neighbors=5) + @test size(embedding) == (2, 10) + @test typeof(embedding) == typeof(model.embedding) + + data = rand(Float32, 5, 30) + model = UMAP_(data, 2, n_neighbors=2, n_epochs=1) + embedding = @inferred transform(model, rand(5, 50), n_epochs=5, n_neighbors=5) + @test size(embedding) == (2, 50) + @test typeof(embedding) == typeof(model.embedding) + end + end end diff --git a/test/utils_tests.jl b/test/utils_tests.jl index ebf6730..19515fa 100644 --- a/test/utils_tests.jl +++ b/test/utils_tests.jl @@ -25,6 +25,17 @@ end + @testset "query data tests" begin + X = [0.0 -2.0 0.0; 0.0 -1.0 2.0] + Q = [-2 0; 0 -1] + true_knns = [2 1; 1 2] + true_dists = [1.0 1.0; 2.0 2.0] + knns = [3 1 1; 2 3 2] + dists = [2.0 2.236 2.0; 2.236 3.606 3.606] + ret_knns, ret_dists = knn_search(X, Q, 2, Euclidean(), knns, dists) + @test ret_knns == true_knns + @test isapprox(ret_dists, true_dists, atol=1e-4) + end end @testset "combine_fuzzy_sets tests" begin
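
For reference, a minimal end-to-end sketch of the fit/transform workflow this patch enables (illustrative only, not part of the diff; the data sizes and keyword values below are arbitrary placeholders, following the README additions above):
```jl
using UMAP

# Fit a model on a training set: columns are samples, rows are features.
X = rand(20, 1_000)
model = UMAP_(X, 2; n_neighbors=15, min_dist=0.1)   # model.embedding is a 2×1000 matrix

# Embed new query points (same feature dimension as X) into the existing embedding.
Q = rand(20, 100)
Q_embedding = transform(model, Q; n_neighbors=15)   # returns a 2×100 matrix
```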