From e8632eff36ba26ab0a9e4ec86f98bf3dc723c6bf Mon Sep 17 00:00:00 2001 From: Dillon Daudert Date: Fri, 21 Jun 2019 14:52:07 -0400 Subject: [PATCH 1/9] Rename fit_phi -> fit_ab --- src/umap_.jl | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/umap_.jl b/src/umap_.jl index 7738936..7bb76ae 100644 --- a/src/umap_.jl +++ b/src/umap_.jl @@ -88,16 +88,16 @@ function fuzzy_simplicial_set(X, metric, local_connectivity, set_operation_ratio) - + knns, dists = knn_search(X, n_neighbors, metric) σs, ρs = smooth_knn_dists(dists, n_neighbors, local_connectivity) rows, cols, vals = compute_membership_strengths(knns, dists, σs, ρs) fs_set = sparse(rows, cols, vals, size(knns, 2), size(knns, 2)) - + res = combine_fuzzy_sets(fs_set, set_operation_ratio) - + return dropzeros(res) end @@ -108,15 +108,15 @@ Compute the distances to the nearest neighbors for a continuous value `k`. Retur the approximated distances to the kth nearest neighbor (`knn_dists`) and the nearest neighbor (nn_dists) from each point. """ -function smooth_knn_dists(knn_dists::AbstractMatrix{S}, - k::Integer, +function smooth_knn_dists(knn_dists::AbstractMatrix{S}, + k::Integer, local_connectivity::Real; niter::Integer=64, bandwidth::Real=1, ktol = 1e-5) where {S <: Real} - + nonzero_dists(dists) = @view dists[dists .> 0.] - ρs = zeros(S, size(knn_dists, 2)) + ρs = zeros(S, size(knn_dists, 2)) σs = Array{S}(undef, size(knn_dists, 2)) for i in 1:size(knn_dists, 2) nz_dists = nonzero_dists(knn_dists[:, i]) @@ -126,17 +126,17 @@ function smooth_knn_dists(knn_dists::AbstractMatrix{S}, if index > 0 ρs[i] = nz_dists[index] if interpolation > SMOOTH_K_TOLERANCE - ρs[i] += interpolation * (nz_dists[index+1] - nz_dists[index]) + ρs[i] += interpolation * (nz_dists[index+1] - nz_dists[index]) end else - ρs[i] = interpolation * nz_dists[1] + ρs[i] = interpolation * nz_dists[1] end elseif length(nz_dists) > 0 - ρs[i] = maximum(nz_dists) + ρs[i] = maximum(nz_dists) end @inbounds σs[i] = smooth_knn_dist(knn_dists[:, i], ρs[i], k, local_connectivity, bandwidth, niter, ktol) end - + return ρs, σs end @@ -219,7 +219,7 @@ Optimize an embedding by minimizing the fuzzy set cross entropy between the high - `embedding`: a dense matrix of shape (n_components, n_samples) - `n_epochs`: the number of training epochs for optimization - `initial_alpha`: the initial learning rate -- `gamma`: the repulsive strength of negative samples +- `gamma`: the repulsive strength of negative samples - `neg_sample_rate::Integer`: the number of negative samples per positive sample """ function optimize_embedding(graph, @@ -230,7 +230,7 @@ function optimize_embedding(graph, spread, gamma, neg_sample_rate) - a, b = fit_ϕ(min_dist, spread) + a, b = fit_ab(min_dist, spread) alpha = initial_alpha for e in 1:n_epochs @@ -283,12 +283,12 @@ function optimize_embedding(graph, end """ - fit_ϕ(min_dist, spread) -> a, b + fit_ab(min_dist, spread) -> a, b Find a smooth approximation to the membership function of points embedded in ℜᵈ. This fits a smooth curve that approximates an exponential decay offset by `min_dist`. """ -function fit_ϕ(min_dist, spread) +function fit_ab(min_dist, spread) ψ(d) = d >= min_dist ? exp(-(d - min_dist)/spread) : 1. xs = LinRange(0., spread*3, 300) ys = map(ψ, xs) From 85183b07f529eac6a42fa940d46c4dd89d65ea8a Mon Sep 17 00:00:00 2001 From: Dillon Daudert Date: Fri, 21 Jun 2019 15:10:17 -0400 Subject: [PATCH 2/9] Move fit_ab to utils.jl; allow passing a, b as user args --- src/umap_.jl | 22 ++++----------------- src/utils.jl | 47 +++++++++++++++++++++++++++++++++------------ test/runtests.jl | 4 ++-- test/utils_tests.jl | 18 ++++++++++------- 4 files changed, 52 insertions(+), 39 deletions(-) diff --git a/src/umap_.jl b/src/umap_.jl index 7bb76ae..ef79464 100644 --- a/src/umap_.jl +++ b/src/umap_.jl @@ -229,8 +229,10 @@ function optimize_embedding(graph, min_dist, spread, gamma, - neg_sample_rate) - a, b = fit_ab(min_dist, spread) + neg_sample_rate, + _a=nothing, + _b=nothing) + a, b = fit_ab(min_dist, spread, _a, _b) alpha = initial_alpha for e in 1:n_epochs @@ -282,22 +284,6 @@ function optimize_embedding(graph, return embedding end -""" - fit_ab(min_dist, spread) -> a, b - -Find a smooth approximation to the membership function of points embedded in ℜᵈ. -This fits a smooth curve that approximates an exponential decay offset by `min_dist`. -""" -function fit_ab(min_dist, spread) - ψ(d) = d >= min_dist ? exp(-(d - min_dist)/spread) : 1. - xs = LinRange(0., spread*3, 300) - ys = map(ψ, xs) - @. curve(x, p) = (1. + p[1]*x^(2*p[2]))^(-1) - result = curve_fit(curve, xs, ys, [1., 1.], lower=[0., -Inf]) - a, b = result.param - return a, b -end - """ spectral_layout(graph, embed_dim) -> embedding diff --git a/src/utils.jl b/src/utils.jl index 5b3266e..637f414 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,10 +1,33 @@ +#= +Utilities used by UMAP.jl +=# + + +@inline fit_ab(_, __, a, b) = a, b + +""" + fit_ab(min_dist, spread, _a, _b) -> a, b + +Find a smooth approximation to the membership function of points embedded in ℜᵈ. +This fits a smooth curve that approximates an exponential decay offset by `min_dist`, +returning the parameters `(a, b)`. +""" +function fit_ab(min_dist, spread, ::Nothing, ::Nothing) + ψ(d) = d >= min_dist ? exp(-(d - min_dist)/spread) : 1. + xs = LinRange(0., spread*3, 300) + ys = map(ψ, xs) + @. curve(x, p) = (1. + p[1]*x^(2*p[2]))^(-1) + result = curve_fit(curve, xs, ys, [1., 1.], lower=[0., -Inf]) + a, b = result.param + return a, b +end knn_search(dist_mat, k, metric::Symbol) = knn_search(dist_mat, k, Val(metric)) """ knn_search(dist_mat, k, :precomputed) -> knns, dists -Find the `k` nearest neighbors of each point in a precomputed distance +Find the `k` nearest neighbors of each point in a precomputed distance matrix. """ knn_search(dist_mat, k, ::Val{:precomputed}) = _knn_from_dists(dist_mat, k) @@ -26,8 +49,8 @@ end # compute all pairwise distances # return the nearest k to each point v, other than v itself -function knn_search(X::AbstractMatrix{S}, - k, +function knn_search(X::AbstractMatrix{S}, + k, metric, ::Val{:pairwise}) where {S <: Real} num_points = size(X, 2) @@ -38,14 +61,14 @@ function knn_search(X::AbstractMatrix{S}, end # find the approximate k nearest neighbors using NNDescent -function knn_search(X::AbstractMatrix{S}, - k, - metric, +function knn_search(X::AbstractMatrix{S}, + k, + metric, ::Val{:approximate}) where {S <: Real} knngraph = DescentGraph(X, k, metric) - return knngraph.indices, knngraph.distances + return knngraph.indices, knngraph.distances end - + function _knn_from_dists(dist_mat::AbstractMatrix{S}, k) where {S <: Real} knns_ = [partialsortperm(view(dist_mat, :, i), 2:k+1) for i in 1:size(dist_mat, 1)] dists_ = [dist_mat[:, i][knns_[i]] for i in eachindex(knns_)] @@ -54,11 +77,11 @@ function _knn_from_dists(dist_mat::AbstractMatrix{S}, k) where {S <: Real} return knns, dists end - -# combine local fuzzy simplicial sets -@inline function combine_fuzzy_sets(fs_set::AbstractMatrix{T}, + +# combine local fuzzy simplicial sets +@inline function combine_fuzzy_sets(fs_set::AbstractMatrix{T}, set_op_ratio) where {T} - return set_op_ratio .* fuzzy_set_union(fs_set) .+ + return set_op_ratio .* fuzzy_set_union(fs_set) .+ (one(T) - set_op_ratio) .* fuzzy_set_intersection(fs_set) end diff --git a/test/runtests.jl b/test/runtests.jl index c266629..0acc8dc 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,8 +3,8 @@ using Distances: Euclidean, CosineDist using SparseArrays using LinearAlgebra using UMAP -using UMAP: fuzzy_simplicial_set, compute_membership_strengths, smooth_knn_dists, smooth_knn_dist, spectral_layout, optimize_embedding, knn_search, combine_fuzzy_sets +using UMAP: fuzzy_simplicial_set, compute_membership_strengths, smooth_knn_dists, smooth_knn_dist, spectral_layout, optimize_embedding, knn_search, combine_fuzzy_sets, fit_ab include("utils_tests.jl") -include("umap_tests.jl") \ No newline at end of file +include("umap_tests.jl") diff --git a/test/utils_tests.jl b/test/utils_tests.jl index e9b7fe6..ebf6730 100644 --- a/test/utils_tests.jl +++ b/test/utils_tests.jl @@ -10,7 +10,7 @@ @test knns == true_knns @test dists == true_dists end - + @testset "precomputed tests" begin dist_mat = [0. 2. 1.; 2. 0. 3.; @@ -22,19 +22,23 @@ knns, dists = knn_search(dist_mat, 2, :precomputed) @test knns == true_knns @test dists == true_dists - + end - + end - + @testset "combine_fuzzy_sets tests" begin A = [1.0 0.1; 0.4 1.0] union_res = [1.0 0.46; 0.46 1.0] res = combine_fuzzy_sets(A, 1.0) @test isapprox(res, union_res) inter_res = [1.0 0.04; 0.04 1.0] - res = combine_fuzzy_sets(A, 0.0) + res = combine_fuzzy_sets(A, 0.0) @test isapprox(res, inter_res) - + end -end \ No newline at end of file + + @testset "fit_ab tests" begin + @test all((1, 2) .== fit_ab(-1, 0, 1, 2)) + end +end From c0ba1578398893a6e107946ee92c9dbe99b8e4ad Mon Sep 17 00:00:00 2001 From: Dillon Daudert Date: Fri, 21 Jun 2019 15:19:23 -0400 Subject: [PATCH 3/9] Remove superfluous ktol param; change type of k to Real in smooth_k_dist --- src/umap_.jl | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/umap_.jl b/src/umap_.jl index ef79464..92e67fb 100644 --- a/src/umap_.jl +++ b/src/umap_.jl @@ -109,11 +109,10 @@ the approximated distances to the kth nearest neighbor (`knn_dists`) and the nearest neighbor (nn_dists) from each point. """ function smooth_knn_dists(knn_dists::AbstractMatrix{S}, - k::Integer, - local_connectivity::Real; + k::Real, + local_connectivity::Integer; niter::Integer=64, - bandwidth::Real=1, - ktol = 1e-5) where {S <: Real} + bandwidth::Real=1) where {S <: Real} nonzero_dists(dists) = @view dists[dists .> 0.] ρs = zeros(S, size(knn_dists, 2)) @@ -134,19 +133,19 @@ function smooth_knn_dists(knn_dists::AbstractMatrix{S}, elseif length(nz_dists) > 0 ρs[i] = maximum(nz_dists) end - @inbounds σs[i] = smooth_knn_dist(knn_dists[:, i], ρs[i], k, local_connectivity, bandwidth, niter, ktol) + @inbounds σs[i] = smooth_knn_dist(knn_dists[:, i], ρs[i], k, bandwidth, niter) end return ρs, σs end # calculate sigma for an individual point -@fastmath function smooth_knn_dist(dists::AbstractVector, ρ, k, local_connectivity, bandwidth, niter, ktol) +@fastmath function smooth_knn_dist(dists::AbstractVector, ρ, k, bandwidth, niter) target = log2(k)*bandwidth lo, mid, hi = 0., 1., Inf for n in 1:niter psum = sum(exp.(-max.(dists .- ρ, 0.)./mid)) - if abs(psum - target) < ktol + if abs(psum - target) < SMOOTH_K_TOLERANCE break end if psum > target From 770d8f31e4e181efe9399bb1a9068657ef5d1759 Mon Sep 17 00:00:00 2001 From: Dillon Daudert Date: Fri, 21 Jun 2019 15:25:26 -0400 Subject: [PATCH 4/9] Update smooth_knn_tests; local_connectivity::Real --- src/umap_.jl | 2 +- test/runtests.jl | 2 +- test/umap_tests.jl | 13 ++++++------- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/umap_.jl b/src/umap_.jl index 92e67fb..4823edc 100644 --- a/src/umap_.jl +++ b/src/umap_.jl @@ -110,7 +110,7 @@ and the nearest neighbor (nn_dists) from each point. """ function smooth_knn_dists(knn_dists::AbstractMatrix{S}, k::Real, - local_connectivity::Integer; + local_connectivity::Real; niter::Integer=64, bandwidth::Real=1) where {S <: Real} diff --git a/test/runtests.jl b/test/runtests.jl index 0acc8dc..e56bda6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,7 +3,7 @@ using Distances: Euclidean, CosineDist using SparseArrays using LinearAlgebra using UMAP -using UMAP: fuzzy_simplicial_set, compute_membership_strengths, smooth_knn_dists, smooth_knn_dist, spectral_layout, optimize_embedding, knn_search, combine_fuzzy_sets, fit_ab +using UMAP: fuzzy_simplicial_set, compute_membership_strengths, smooth_knn_dists, smooth_knn_dist, spectral_layout, optimize_embedding, knn_search, combine_fuzzy_sets, fit_ab, SMOOTH_K_TOLERANCE include("utils_tests.jl") diff --git a/test/umap_tests.jl b/test/umap_tests.jl index 45d5b82..a530cf5 100644 --- a/test/umap_tests.jl +++ b/test/umap_tests.jl @@ -35,7 +35,7 @@ umap_graph = fuzzy_simplicial_set(data, k, Euclidean(), 1, 1.f0) @test issymmetric(umap_graph) @test eltype(umap_graph) == Float32 - + data = 2 .* rand(20, 1000) .- 1 umap_graph = fuzzy_simplicial_set(data, k, CosineDist(), 1, 1.) @test issymmetric(umap_graph) @@ -49,10 +49,9 @@ local_connectivity = 1 bandwidth = 1. niter = 64 - ktol = 1e-5 - sigma = smooth_knn_dist(dists, rho, k, local_connectivity, bandwidth, niter, ktol) + sigma = smooth_knn_dist(dists, rho, k, bandwidth, niter) psum(ds, r, s) = sum(exp.(-max.(ds .- r, 0.) ./ s)) - @test psum(dists, rho, sigma) - log2(k)*bandwidth < ktol + @test psum(dists, rho, sigma) - log2(k)*bandwidth < SMOOTH_K_TOLERANCE knn_dists = [0. 0. 0.; 1. 2. 3.; @@ -63,14 +62,14 @@ rhos, sigmas = smooth_knn_dists(knn_dists, k, local_connectivity) @test rhos == [1., 2., 3.] diffs = [psum(knn_dists[:,i], rhos[i], sigmas[i]) for i in 1:3] .- log2(6) - @test all(diffs .< 1e-5) - + @test all(diffs .< SMOOTH_K_TOLERANCE) + knn_dists = [0. 0. 0.; 0. 1. 2.; 0. 2. 3.] rhos, sigmas = smooth_knn_dists(knn_dists, 2, 1) @test rhos == [0., 1., 2.] - + rhos, sigmas = smooth_knn_dists(knn_dists, 2, 1.5) @test rhos == [0., 1.5, 2.5] end From 832999586d0e9eb389729a29bef0739e97e2a06d Mon Sep 17 00:00:00 2001 From: Dillon Daudert Date: Fri, 21 Jun 2019 15:31:00 -0400 Subject: [PATCH 5/9] Add some more simple arg checking --- src/umap_.jl | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/umap_.jl b/src/umap_.jl index 4823edc..8753e31 100644 --- a/src/umap_.jl +++ b/src/umap_.jl @@ -61,8 +61,13 @@ function UMAP_(X::AbstractMatrix{S}, # argument checking size(X, 2) > n_neighbors > 0|| throw(ArgumentError("size(X, 2) must be greater than n_neighbors and n_neighbors must be greater than 0")) size(X, 1) > n_components > 1 || throw(ArgumentError("size(X, 1) must be greater than n_components and n_components must be greater than 1")) - min_dist > 0. || throw(ArgumentError("min_dist must be greater than 0")) - #n_epochs > 0 || throw(ArgumentError("n_epochs must be greater than 1")) + n_epochs > 0 || throw(ArgumentError("n_epochs must be greater than 0")) + learning_rate > 0 || throw(ArgumentError("learning_rate must be greater than 0")) + min_dist > 0 || throw(ArgumentError("min_dist must be greater than 0")) + 0 ≤ set_operation_ratio ≤ 1 || throw(ArgumentError("set_operation_ratio must lie in [0, 1]")) + local_connectivity > 0 || throw(ArgumentError("local_connectivity must be greater than 0")) + + # main algorithm graph = fuzzy_simplicial_set(X, n_neighbors, metric, local_connectivity, set_operation_ratio) From 758b94eaa47e00dd0df44a92c63b83b8d58f607d Mon Sep 17 00:00:00 2001 From: Dillon Daudert Date: Fri, 21 Jun 2019 15:40:21 -0400 Subject: [PATCH 6/9] Use integers/rationals instead of Float64 for literals in optimize_emb --- src/umap_.jl | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/umap_.jl b/src/umap_.jl index 8753e31..7775dd4 100644 --- a/src/umap_.jl +++ b/src/umap_.jl @@ -66,7 +66,7 @@ function UMAP_(X::AbstractMatrix{S}, min_dist > 0 || throw(ArgumentError("min_dist must be greater than 0")) 0 ≤ set_operation_ratio ≤ 1 || throw(ArgumentError("set_operation_ratio must lie in [0, 1]")) local_connectivity > 0 || throw(ArgumentError("local_connectivity must be greater than 0")) - + # main algorithm @@ -247,13 +247,13 @@ function optimize_embedding(graph, p = nonzeros(graph)[ind] if rand() <= p @views sdist = evaluate(SqEuclidean(), embedding[:, i], embedding[:, j]) - if sdist > 0. - delta = (-2. * a * b * sdist^(b-1))/(1. + a*sdist^b) + if sdist > 0 + delta = (-2 * a * b * sdist^(b-1))/(1 + a*sdist^b) else - delta = 0. + delta = 0 end @simd for d in 1:size(embedding, 1) - grad = clamp(delta * (embedding[d,i] - embedding[d,j]), -4., 4.) + grad = clamp(delta * (embedding[d,i] - embedding[d,j]), -4, 4) embedding[d,i] += alpha * grad embedding[d,j] -= alpha * grad end @@ -263,17 +263,17 @@ function optimize_embedding(graph, @views sdist = evaluate(SqEuclidean(), embedding[:, i], embedding[:, k]) if sdist > 0 - delta = (2. * gamma * b) / ((0.001 + sdist)*(1. + a*sdist^b)) + delta = (2 * gamma * b) / ((1//1000 + sdist)*(1 + a*sdist^b)) elseif i == k continue else - delta = 0. + delta = 0 end @simd for d in 1:size(embedding, 1) - if delta > 0. - grad = clamp(delta * (embedding[d, i] - embedding[d, k]), -4., 4.) + if delta > 0 + grad = clamp(delta * (embedding[d, i] - embedding[d, k]), -4, 4) else - grad = 4. + grad = 4 end embedding[d, i] += alpha * grad end @@ -282,7 +282,7 @@ function optimize_embedding(graph, end end end - alpha = initial_alpha*(1. - e/n_epochs) + alpha = initial_alpha*(1 - e//n_epochs) end return embedding From 7a3f11adb93ffeffe3d8b1d8ef88a7c53b2a068d Mon Sep 17 00:00:00 2001 From: Dillon Daudert Date: Fri, 21 Jun 2019 16:08:36 -0400 Subject: [PATCH 7/9] Test on Julia 1.2 --- .travis.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index aa98c40..b0460f9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,13 +1,14 @@ language: julia -os: +os: - linux julia: - 1.0 - 1.1 + - 1.2 - nightly -script: +script: - julia -e 'import Pkg; Pkg.build(); Pkg.test(; coverage=true)' codecov: true From 84b2192899c40648a6573a82a1257a93a711afcc Mon Sep 17 00:00:00 2001 From: Dillon Daudert Date: Fri, 5 Jul 2019 17:32:07 -0400 Subject: [PATCH 8/9] Add parameters for graph and embedding to make UMAP type stable --- src/umap_.jl | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/umap_.jl b/src/umap_.jl index 7775dd4..8d24819 100644 --- a/src/umap_.jl +++ b/src/umap_.jl @@ -2,15 +2,22 @@ # for Dimension Reduction, L. McInnes, J. Healy, J. Melville, 2018. # NOTE: unused for now -struct UMAP_{S} - graph::AbstractMatrix{S} - embedding::AbstractMatrix{S} +struct UMAP_{S <: Real, M <: AbstractMatrix{S}, N <: AbstractMatrix{S}} + graph::M + embedding::N - function UMAP_(graph::AbstractMatrix{S}, embedding::AbstractMatrix{S}) where {S<:Real} + function UMAP_{S, M, N}(graph, embedding) where {S<:Real, + M<:AbstractMatrix{S}, + N<:AbstractMatrix{S}} issymmetric(graph) || isapprox(graph, graph') || error("UMAP_ constructor expected graph to be a symmetric matrix") - new{S}(graph, embedding) + new(graph, embedding) end end +function UMAP_(graph::M, embedding::N) where {S <: Real, + M <: AbstractMatrix{S}, + N <: AbstractMatrix{S}} + return UMAP_{S, M, N}(graph, embedding) +end const SMOOTH_K_TOLERANCE = 1e-5 From a19dbc39b56d04ac705205f52943fd14c4291ac8 Mon Sep 17 00:00:00 2001 From: Dillon Daudert Date: Fri, 5 Jul 2019 17:43:11 -0400 Subject: [PATCH 9/9] Bump to version v0.1.5 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 1277d11..9b58305 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "UMAP" uuid = "c4f8c510-2410-5be4-91d7-4fbaeb39457e" authors = ["Dillon Daudert "] -version = "0.1.4" +version = "0.1.5" [deps] Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97"