Skip to content

Commit

Permalink
Merge a19dbc3 into 53f05f2
Browse files Browse the repository at this point in the history
  • Loading branch information
dillondaudert committed Jul 5, 2019
2 parents 53f05f2 + a19dbc3 commit 1339f3f
Show file tree
Hide file tree
Showing 7 changed files with 107 additions and 83 deletions.
5 changes: 3 additions & 2 deletions .travis.yml
@@ -1,13 +1,14 @@
language: julia
os:
os:
- linux

julia:
- 1.0
- 1.1
- 1.2
- nightly

script:
script:
- julia -e 'import Pkg; Pkg.build(); Pkg.test(; coverage=true)'

codecov: true
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "UMAP"
uuid = "c4f8c510-2410-5be4-91d7-4fbaeb39457e"
authors = ["Dillon Daudert <dillongdaudert@gmail.com>"]
version = "0.1.4"
version = "0.1.5"

[deps]
Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97"
Expand Down
101 changes: 49 additions & 52 deletions src/umap_.jl
Expand Up @@ -2,15 +2,22 @@
# for Dimension Reduction, L. McInnes, J. Healy, J. Melville, 2018.

# NOTE: unused for now
struct UMAP_{S}
graph::AbstractMatrix{S}
embedding::AbstractMatrix{S}
struct UMAP_{S <: Real, M <: AbstractMatrix{S}, N <: AbstractMatrix{S}}
graph::M
embedding::N

function UMAP_(graph::AbstractMatrix{S}, embedding::AbstractMatrix{S}) where {S<:Real}
function UMAP_{S, M, N}(graph, embedding) where {S<:Real,
M<:AbstractMatrix{S},
N<:AbstractMatrix{S}}
issymmetric(graph) || isapprox(graph, graph') || error("UMAP_ constructor expected graph to be a symmetric matrix")
new{S}(graph, embedding)
new(graph, embedding)
end
end
function UMAP_(graph::M, embedding::N) where {S <: Real,
M <: AbstractMatrix{S},
N <: AbstractMatrix{S}}
return UMAP_{S, M, N}(graph, embedding)
end

const SMOOTH_K_TOLERANCE = 1e-5

Expand Down Expand Up @@ -61,8 +68,13 @@ function UMAP_(X::AbstractMatrix{S},
# argument checking
size(X, 2) > n_neighbors > 0|| throw(ArgumentError("size(X, 2) must be greater than n_neighbors and n_neighbors must be greater than 0"))
size(X, 1) > n_components > 1 || throw(ArgumentError("size(X, 1) must be greater than n_components and n_components must be greater than 1"))
min_dist > 0. || throw(ArgumentError("min_dist must be greater than 0"))
#n_epochs > 0 || throw(ArgumentError("n_epochs must be greater than 1"))
n_epochs > 0 || throw(ArgumentError("n_epochs must be greater than 0"))
learning_rate > 0 || throw(ArgumentError("learning_rate must be greater than 0"))
min_dist > 0 || throw(ArgumentError("min_dist must be greater than 0"))
0 set_operation_ratio 1 || throw(ArgumentError("set_operation_ratio must lie in [0, 1]"))
local_connectivity > 0 || throw(ArgumentError("local_connectivity must be greater than 0"))



# main algorithm
graph = fuzzy_simplicial_set(X, n_neighbors, metric, local_connectivity, set_operation_ratio)
Expand All @@ -88,16 +100,16 @@ function fuzzy_simplicial_set(X,
metric,
local_connectivity,
set_operation_ratio)

knns, dists = knn_search(X, n_neighbors, metric)

σs, ρs = smooth_knn_dists(dists, n_neighbors, local_connectivity)

rows, cols, vals = compute_membership_strengths(knns, dists, σs, ρs)
fs_set = sparse(rows, cols, vals, size(knns, 2), size(knns, 2))

res = combine_fuzzy_sets(fs_set, set_operation_ratio)

return dropzeros(res)
end

Expand All @@ -108,15 +120,14 @@ Compute the distances to the nearest neighbors for a continuous value `k`. Retur
the approximated distances to the kth nearest neighbor (`knn_dists`)
and the nearest neighbor (nn_dists) from each point.
"""
function smooth_knn_dists(knn_dists::AbstractMatrix{S},
k::Integer,
function smooth_knn_dists(knn_dists::AbstractMatrix{S},
k::Real,
local_connectivity::Real;
niter::Integer=64,
bandwidth::Real=1,
ktol = 1e-5) where {S <: Real}

bandwidth::Real=1) where {S <: Real}

nonzero_dists(dists) = @view dists[dists .> 0.]
ρs = zeros(S, size(knn_dists, 2))
ρs = zeros(S, size(knn_dists, 2))
σs = Array{S}(undef, size(knn_dists, 2))
for i in 1:size(knn_dists, 2)
nz_dists = nonzero_dists(knn_dists[:, i])
Expand All @@ -126,27 +137,27 @@ function smooth_knn_dists(knn_dists::AbstractMatrix{S},
if index > 0
ρs[i] = nz_dists[index]
if interpolation > SMOOTH_K_TOLERANCE
ρs[i] += interpolation * (nz_dists[index+1] - nz_dists[index])
ρs[i] += interpolation * (nz_dists[index+1] - nz_dists[index])
end
else
ρs[i] = interpolation * nz_dists[1]
ρs[i] = interpolation * nz_dists[1]
end
elseif length(nz_dists) > 0
ρs[i] = maximum(nz_dists)
ρs[i] = maximum(nz_dists)
end
@inbounds σs[i] = smooth_knn_dist(knn_dists[:, i], ρs[i], k, local_connectivity, bandwidth, niter, ktol)
@inbounds σs[i] = smooth_knn_dist(knn_dists[:, i], ρs[i], k, bandwidth, niter)
end

return ρs, σs
end

# calculate sigma for an individual point
@fastmath function smooth_knn_dist(dists::AbstractVector, ρ, k, local_connectivity, bandwidth, niter, ktol)
@fastmath function smooth_knn_dist(dists::AbstractVector, ρ, k, bandwidth, niter)
target = log2(k)*bandwidth
lo, mid, hi = 0., 1., Inf
for n in 1:niter
psum = sum(exp.(-max.(dists .- ρ, 0.)./mid))
if abs(psum - target) < ktol
if abs(psum - target) < SMOOTH_K_TOLERANCE
break
end
if psum > target
Expand Down Expand Up @@ -219,7 +230,7 @@ Optimize an embedding by minimizing the fuzzy set cross entropy between the high
- `embedding`: a dense matrix of shape (n_components, n_samples)
- `n_epochs`: the number of training epochs for optimization
- `initial_alpha`: the initial learning rate
- `gamma`: the repulsive strength of negative samples
- `gamma`: the repulsive strength of negative samples
- `neg_sample_rate::Integer`: the number of negative samples per positive sample
"""
function optimize_embedding(graph,
Expand All @@ -229,8 +240,10 @@ function optimize_embedding(graph,
min_dist,
spread,
gamma,
neg_sample_rate)
a, b = fit_ϕ(min_dist, spread)
neg_sample_rate,
_a=nothing,
_b=nothing)
a, b = fit_ab(min_dist, spread, _a, _b)

alpha = initial_alpha
for e in 1:n_epochs
Expand All @@ -241,13 +254,13 @@ function optimize_embedding(graph,
p = nonzeros(graph)[ind]
if rand() <= p
@views sdist = evaluate(SqEuclidean(), embedding[:, i], embedding[:, j])
if sdist > 0.
delta = (-2. * a * b * sdist^(b-1))/(1. + a*sdist^b)
if sdist > 0
delta = (-2 * a * b * sdist^(b-1))/(1 + a*sdist^b)
else
delta = 0.
delta = 0
end
@simd for d in 1:size(embedding, 1)
grad = clamp(delta * (embedding[d,i] - embedding[d,j]), -4., 4.)
grad = clamp(delta * (embedding[d,i] - embedding[d,j]), -4, 4)
embedding[d,i] += alpha * grad
embedding[d,j] -= alpha * grad
end
Expand All @@ -257,17 +270,17 @@ function optimize_embedding(graph,
@views sdist = evaluate(SqEuclidean(),
embedding[:, i], embedding[:, k])
if sdist > 0
delta = (2. * gamma * b) / ((0.001 + sdist)*(1. + a*sdist^b))
delta = (2 * gamma * b) / ((1//1000 + sdist)*(1 + a*sdist^b))
elseif i == k
continue
else
delta = 0.
delta = 0
end
@simd for d in 1:size(embedding, 1)
if delta > 0.
grad = clamp(delta * (embedding[d, i] - embedding[d, k]), -4., 4.)
if delta > 0
grad = clamp(delta * (embedding[d, i] - embedding[d, k]), -4, 4)
else
grad = 4.
grad = 4
end
embedding[d, i] += alpha * grad
end
Expand All @@ -276,28 +289,12 @@ function optimize_embedding(graph,
end
end
end
alpha = initial_alpha*(1. - e/n_epochs)
alpha = initial_alpha*(1 - e//n_epochs)
end

return embedding
end

"""
fit_ϕ(min_dist, spread) -> a, b
Find a smooth approximation to the membership function of points embedded in ℜᵈ.
This fits a smooth curve that approximates an exponential decay offset by `min_dist`.
"""
function fit_ϕ(min_dist, spread)
ψ(d) = d >= min_dist ? exp(-(d - min_dist)/spread) : 1.
xs = LinRange(0., spread*3, 300)
ys = map(ψ, xs)
@. curve(x, p) = (1. + p[1]*x^(2*p[2]))^(-1)
result = curve_fit(curve, xs, ys, [1., 1.], lower=[0., -Inf])
a, b = result.param
return a, b
end

"""
spectral_layout(graph, embed_dim) -> embedding
Expand Down
47 changes: 35 additions & 12 deletions src/utils.jl
@@ -1,10 +1,33 @@
#=
Utilities used by UMAP.jl
=#


@inline fit_ab(_, __, a, b) = a, b

"""
fit_ab(min_dist, spread, _a, _b) -> a, b
Find a smooth approximation to the membership function of points embedded in ℜᵈ.
This fits a smooth curve that approximates an exponential decay offset by `min_dist`,
returning the parameters `(a, b)`.
"""
function fit_ab(min_dist, spread, ::Nothing, ::Nothing)
ψ(d) = d >= min_dist ? exp(-(d - min_dist)/spread) : 1.
xs = LinRange(0., spread*3, 300)
ys = map(ψ, xs)
@. curve(x, p) = (1. + p[1]*x^(2*p[2]))^(-1)
result = curve_fit(curve, xs, ys, [1., 1.], lower=[0., -Inf])
a, b = result.param
return a, b
end

knn_search(dist_mat, k, metric::Symbol) = knn_search(dist_mat, k, Val(metric))

"""
knn_search(dist_mat, k, :precomputed) -> knns, dists
Find the `k` nearest neighbors of each point in a precomputed distance
Find the `k` nearest neighbors of each point in a precomputed distance
matrix.
"""
knn_search(dist_mat, k, ::Val{:precomputed}) = _knn_from_dists(dist_mat, k)
Expand All @@ -26,8 +49,8 @@ end

# compute all pairwise distances
# return the nearest k to each point v, other than v itself
function knn_search(X::AbstractMatrix{S},
k,
function knn_search(X::AbstractMatrix{S},
k,
metric,
::Val{:pairwise}) where {S <: Real}
num_points = size(X, 2)
Expand All @@ -38,14 +61,14 @@ function knn_search(X::AbstractMatrix{S},
end

# find the approximate k nearest neighbors using NNDescent
function knn_search(X::AbstractMatrix{S},
k,
metric,
function knn_search(X::AbstractMatrix{S},
k,
metric,
::Val{:approximate}) where {S <: Real}
knngraph = DescentGraph(X, k, metric)
return knngraph.indices, knngraph.distances
return knngraph.indices, knngraph.distances
end

function _knn_from_dists(dist_mat::AbstractMatrix{S}, k) where {S <: Real}
knns_ = [partialsortperm(view(dist_mat, :, i), 2:k+1) for i in 1:size(dist_mat, 1)]
dists_ = [dist_mat[:, i][knns_[i]] for i in eachindex(knns_)]
Expand All @@ -54,11 +77,11 @@ function _knn_from_dists(dist_mat::AbstractMatrix{S}, k) where {S <: Real}
return knns, dists
end

# combine local fuzzy simplicial sets
@inline function combine_fuzzy_sets(fs_set::AbstractMatrix{T},

# combine local fuzzy simplicial sets
@inline function combine_fuzzy_sets(fs_set::AbstractMatrix{T},
set_op_ratio) where {T}
return set_op_ratio .* fuzzy_set_union(fs_set) .+
return set_op_ratio .* fuzzy_set_union(fs_set) .+
(one(T) - set_op_ratio) .* fuzzy_set_intersection(fs_set)
end

Expand Down
4 changes: 2 additions & 2 deletions test/runtests.jl
Expand Up @@ -3,8 +3,8 @@ using Distances: Euclidean, CosineDist
using SparseArrays
using LinearAlgebra
using UMAP
using UMAP: fuzzy_simplicial_set, compute_membership_strengths, smooth_knn_dists, smooth_knn_dist, spectral_layout, optimize_embedding, knn_search, combine_fuzzy_sets
using UMAP: fuzzy_simplicial_set, compute_membership_strengths, smooth_knn_dists, smooth_knn_dist, spectral_layout, optimize_embedding, knn_search, combine_fuzzy_sets, fit_ab, SMOOTH_K_TOLERANCE


include("utils_tests.jl")
include("umap_tests.jl")
include("umap_tests.jl")
13 changes: 6 additions & 7 deletions test/umap_tests.jl
Expand Up @@ -35,7 +35,7 @@
umap_graph = fuzzy_simplicial_set(data, k, Euclidean(), 1, 1.f0)
@test issymmetric(umap_graph)
@test eltype(umap_graph) == Float32

data = 2 .* rand(20, 1000) .- 1
umap_graph = fuzzy_simplicial_set(data, k, CosineDist(), 1, 1.)
@test issymmetric(umap_graph)
Expand All @@ -49,10 +49,9 @@
local_connectivity = 1
bandwidth = 1.
niter = 64
ktol = 1e-5
sigma = smooth_knn_dist(dists, rho, k, local_connectivity, bandwidth, niter, ktol)
sigma = smooth_knn_dist(dists, rho, k, bandwidth, niter)
psum(ds, r, s) = sum(exp.(-max.(ds .- r, 0.) ./ s))
@test psum(dists, rho, sigma) - log2(k)*bandwidth < ktol
@test psum(dists, rho, sigma) - log2(k)*bandwidth < SMOOTH_K_TOLERANCE

knn_dists = [0. 0. 0.;
1. 2. 3.;
Expand All @@ -63,14 +62,14 @@
rhos, sigmas = smooth_knn_dists(knn_dists, k, local_connectivity)
@test rhos == [1., 2., 3.]
diffs = [psum(knn_dists[:,i], rhos[i], sigmas[i]) for i in 1:3] .- log2(6)
@test all(diffs .< 1e-5)
@test all(diffs .< SMOOTH_K_TOLERANCE)

knn_dists = [0. 0. 0.;
0. 1. 2.;
0. 2. 3.]
rhos, sigmas = smooth_knn_dists(knn_dists, 2, 1)
@test rhos == [0., 1., 2.]

rhos, sigmas = smooth_knn_dists(knn_dists, 2, 1.5)
@test rhos == [0., 1.5, 2.5]
end
Expand Down

0 comments on commit 1339f3f

Please sign in to comment.