# Manifold learning with continuous-nearest-neighbor distances

In [None]:
import copy

import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d.axes3d as p3
import numpy as np
import scipy.sparse
import scipy.sparse.linalg
import scipy.spatial
import scipy.spatial.distance
import sklearn.manifold as manifold
from sklearn.datasets import make_swiss_roll, make_s_curve
from sklearn.decomposition import PCA

# NOTE: make sure "path/to/datafold" is in sys.path or PYTHONPATH if not installed
import datafold.dynfold as dfold
import datafold.pcfold as pfold
from datafold.utils.plot import plot_pairwise_eigenvector

# Seed used for the notebook's random number generator, so runs are reproducible.
random_state = 1

## Generate a Gaussian distribution on the line

This causes problems for kernels that use the standard (Euclidean) distance, because the sampling density vanishes towards the edges of the distribution.

In [None]:
# Sample size and a seeded generator; every random draw in this cell comes from
# this one generator, so the order of the calls below determines the data.
nr_samples = 25000
rng = np.random.default_rng(random_state)

# Only a random subset of the points is plotted; idx_plot holds the indices of
# that subset and is reused by the plotting cells further down.
nr_samples_plot = 1000
idx_plot= rng.permutation(nr_samples)[0:nr_samples_plot]

# Generate the point cloud.
# NOTE(review): the uniform sample on the next line is overwritten immediately
# by the standard-normal sample. It still consumes draws from `rng`, so deleting
# it would change the values of X produced below.
X = rng.uniform(-1,1,size=(nr_samples,))
X = rng.standard_normal(nr_samples).reshape((-1,))
# Independent copy of the coordinates, used as the color value in scatter plots.
X_color = X.copy()

# Histogram of the samples (50 bins, normalized so the bars form a density).
fig = plt.figure(figsize=(4, 4))
ax = fig.add_subplot(111)
ax.hist(X, 50, density=True) 
ax.set_xlabel("x"); ax.set_ylabel("density");
ax.set_title("points distributed according to a standard normal on the real line");

## Test standard DMAPs
With the standard Euclidean distance, the squared exponential kernel does not produce the correct eigenfunctions.

In [None]:
# Wrap the samples as a single-column point cloud (one row per sample).
X_pcm = pfold.PCManifold(X.reshape(-1,1))
# Presumably estimates the kernel scale and distance cut-off; the cell below
# reads them back as X_pcm.kernel.epsilon and X_pcm.cut_off.
X_pcm.optimize_parameters()

def optimize_parameters(data, k_neighbors_min=25, random_state=1):
    """Estimate a distance cut-off from nearest-neighbor distances.

    Up to 100 rows of ``data`` are drawn at random. For each drawn row, the
    distances to all rows of ``data`` are ranked in ascending order and the
    value at rank ``k_neighbors_min`` is taken (rank 0 is the row's zero
    distance to itself). The largest of these values is returned.

    Parameters
    ----------
    data
        Two-dimensional array of shape ``(n_points, n_features)``.
    k_neighbors_min
        Rank of the neighbor distance to look up; must index a column of the
        ``(n_subsample, n_points)`` distance matrix.
    random_state
        Seed for the generator that selects the subsample.

    Returns
    -------
    float
        Maximum over the subsample of the rank-``k_neighbors_min`` distance.

    Raises
    ------
    ValueError
        If ``k_neighbors_min`` is not a valid rank for ``n_points`` points.
    """
    n_points = data.shape[0]
    if not -n_points <= k_neighbors_min < n_points:
        raise ValueError(
            f"k_neighbors_min={k_neighbors_min} is out of range for "
            f"{n_points} points"
        )

    rng = np.random.default_rng(random_state)
    x_subsample = data[rng.permutation(n_points)[0:100], :]
    dists = scipy.spatial.distance.cdist(x_subsample, data)

    # Only the value at one rank per row is needed, so a partial selection
    # replaces the full sort and the per-row Python loop.
    k_neighbors_dist = np.partition(dists, k_neighbors_min, axis=1)[:, k_neighbors_min]
    return np.max(k_neighbors_dist)
    
#X_pcm.cut_off = optimize_parameters(X_pcm, k_neighbors_min=5)
#X_pcm.kernel.epsilon = 0

In [None]:
# Show the kernel scale and cut-off currently stored on the point cloud.
print(f'epsilon={X_pcm.kernel.epsilon}, cut-off={X_pcm.cut_off}')

# Standard diffusion maps on the Euclidean point cloud, requesting 7 eigenpairs.
dmap = dfold.DiffusionMaps(
    epsilon=X_pcm.kernel.epsilon,
    cut_off=X_pcm.cut_off,
    n_eigenpairs=7,
).fit(X_pcm)
evecs = dmap.eigenvectors_
evals = dmap.eigenvalues_

# Pairwise eigenvector plot for the plotting subset, colored by x coordinate.
plot_pairwise_eigenvector(
    eigenvectors=evecs[idx_plot, :],
    n=1,
    fig_params=dict(figsize=[6, 6]),
    scatter_params=dict(cmap=plt.cm.Spectral, c=X_color[idx_plot]),
)

# ckNN distances

With the continuous k-nearest-neighbor (ckNN) kernel, the eigenfunctions should be consistent.

In [None]:
from datafold.pcfold.kernels import ContinuousNNKernel

# Fresh single-column point cloud of the same samples (rebinds X_pcm).
X_pcm = pfold.PCManifold(X.reshape(-1,1))


def cknn_dmaps(X, n_eigenpairs=10, k_neighbor=10, delta=1/4., kern_tol=1e-12, use_cknn_kernel=False):
    """Compute eigenpairs of a normalized kernel built on a ckNN-style graph.

    The function builds a sparse matrix ``dist`` (via datafold's
    ``ContinuousNNKernel`` or via a hand-rolled k-nearest-neighbor scaling),
    turns its stored entries into kernel values ``exp(-d**2)``, applies two
    rounds of symmetric normalization and returns the leading eigenpairs.

    Parameters
    ----------
    X
        Point cloud; it is wrapped in a ``pfold.PCManifold``.
    n_eigenpairs
        Number of eigenpairs requested from ``scipy.sparse.linalg.eigsh``.
    k_neighbor
        Number of neighbors used for the local distance scale.
    delta
        Threshold of the ckNN graph: passed to ``ContinuousNNKernel`` or, in
        the manual branch, applied to the rescaled distances.
    kern_tol
        Currently unused; kept so existing calls remain valid.
    use_cknn_kernel
        If true, use ``ContinuousNNKernel``; otherwise build the graph manually.

    Returns
    -------
    tuple
        ``(eigenvectors, eigenvalues)`` sorted by decreasing absolute
        eigenvalue. The eigenvectors are rescaled row-wise by the inverse
        square-root column sums of the last normalization step.
    """
    X_ = pfold.PCManifold(X)
    X_.optimize_parameters()

    if use_cknn_kernel:
        cknn_kernel = ContinuousNNKernel(k_neighbor=k_neighbor, delta=delta)
        # NOTE(review): assumes the kernel call returns a pair whose first
        # element is matrix-like -- confirm against the installed datafold.
        dist_factors, _ = cknn_kernel(
            X_,
            dist_cut_off=X_.cut_off,
            dist_backend='rdist',
            dist_backend_kwargs={"kmin": 1 + k_neighbor},
        )

        # np.float was removed in NumPy 1.24; it was an alias of the builtin
        # float, so np.float64 yields the same dtype.
        dist = scipy.sparse.csr_matrix(dist_factors, dtype=np.float64)
    else:
        # Use the largest distance among each point's k_neighbor nearest
        # neighbors as the cut-off of the distance matrix.
        dist, indices = scipy.spatial.cKDTree(X_).query(X_, k=k_neighbor)
        X_.cut_off = np.max(dist)
        print("max cutoff: ", X_.cut_off)
        dist = X_.compute_distance_matrix()

        # Reciprocal of the k_neighbor-th smallest stored distance in each row.
        row_k_dists = np.array([1/np.sort(dist.getrow(i).data)[:k_neighbor][-1] for i in range(dist.shape[0])])
        row_normalizer = np.sqrt(row_k_dists)

        # Rescale d(x, y) by 1 / sqrt(d_k(x) * d_k(y)), then keep only the
        # pairs below delta as unweighted edges (value 1).
        dist = scipy.sparse.diags(row_normalizer) @ dist @ scipy.sparse.diags(row_normalizer)
        dist[dist >= delta] = 0
        dist[dist > 0] = 1
        dist.eliminate_zeros()

    print(f'sparsity distance: {int(dist.nnz/dist.shape[0])} elements per row on average')

    # Kernel values on the stored entries; the diagonal is then forced to 1.
    kern = dist.copy()
    kern.data = np.exp(-kern.data**2)
    kern = kern - scipy.sparse.diags(np.array(kern.diagonal()).ravel()) + scipy.sparse.identity(kern.shape[0])

    # First symmetric normalization with inverse square-root column sums.
    kn_ = 1/np.array(np.sqrt(kern.sum(axis=0))).ravel()
    kern = scipy.sparse.diags(kn_) @ kern @ scipy.sparse.diags(kn_)

    # Second symmetric normalization; this kn_ also rescales the eigenvectors.
    kn_ = 1/np.array(np.sqrt(kern.sum(axis=0))).ravel()
    kern = scipy.sparse.diags(kn_) @ kern @ scipy.sparse.diags(kn_)

    kern = scipy.sparse.csr_matrix(kern)
    kern.eliminate_zeros()
    print(f'sparsity kernel:   {int(kern.nnz/kern.shape[0])} elements per row on average')

    # Symmetric eigensolver; reorder by decreasing absolute eigenvalue.
    eigenvalues_, eigenvectors_ = scipy.sparse.linalg.eigsh(kern, k=n_eigenpairs)
    idx = np.argsort(np.abs(eigenvalues_))[::-1]
    eigenvalues_ = eigenvalues_[idx]
    eigenvectors_ = eigenvectors_[:, idx]

    return scipy.sparse.diags(kn_) @ eigenvectors_, eigenvalues_


# Eigenpairs of the Gaussian samples via the ContinuousNNKernel branch
# (use_cknn_kernel=True), with 15 neighbors and delta = 2.
evecs_cknn, evals_cknn = cknn_dmaps(X_pcm, n_eigenpairs=7, k_neighbor=15, delta=2., use_cknn_kernel=True)

In [None]:
# Pairwise eigenvector plot of the ckNN result for the plotting subset.
plot_pairwise_eigenvector(
    eigenvectors=evecs_cknn[idx_plot, :],
    n=1,
    fig_params=dict(figsize=[6, 6]),
    scatter_params=dict(cmap=plt.cm.Spectral, c=X_color[idx_plot]),
)

# Eigenvectors 0 to 3 plotted against the sample coordinate, one panel each.
fig, ax = plt.subplots(1, 4, figsize=(12, 3), sharey=True)
for k, panel in enumerate(ax):
    panel.scatter(X_pcm[idx_plot, 0], evecs_cknn[idx_plot, k])