In [None]:
# S-curve hyperparameter estimation example (the data below comes from make_s_curve)

# import all necessary packages
import numpy as np
import scipy.interpolate
from skopt import gp_minimize


import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d.axes3d as p3

from sklearn.datasets import make_swiss_roll, make_s_curve
from sklearn.decomposition import PCA
import sklearn.manifold as manifold

from tutorial_progress import TutorialBar

# NOTE: make sure "path/to/datafold" is in sys.path or PYTHONPATH if not installed
import datafold.dynfold as dfold
import datafold.pcfold as pfold

# Fix the global NumPy seed so the dataset and the plotting subset are reproducible.
np.random.seed(5)

nr_samples = 1000

# generating the nonlinear dataset (noise-free S-curve from scikit-learn)
nonlinear_data, color_nonlinear = make_s_curve(nr_samples, noise=0)

# Random subset of sample indices reused by the scatter plots. The slice caps
# the subset at 2000 points; with nr_samples = 1000 every sample is kept.
plot_idx = np.random.permutation(nr_samples)[:2000]

# 3D scatter of the selected points, coloured by the value returned alongside the data.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(
    *nonlinear_data[plot_idx].T,
    c=color_nonlinear[plot_idx],
    cmap=plt.cm.Spectral,
)
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.set_zlabel("z")
ax.set_title("nonlinear data point cloud")
ax.view_init(elev=10, azim=70)

In [None]:
# Construct the PCManifold, optimize hyperparameters, compute diffusion maps.
# `pcm` wraps the point cloud generated above and is read as a global by
# `loss_dmaps` and by the bounds of the hyperparameter search below.

pcm = pfold.PCManifold(nonlinear_data)
# NOTE(review): presumably this estimates `pcm.kernel.epsilon` and `pcm.cut_off`,
# which are used below as the centre of the search box - confirm against datafold docs.
pcm.optimize_parameters()

def loss_dmaps(params):
    """Reconstruction loss of a diffusion-map embedding for one hyperparameter pair.

    A ``DiffusionMaps`` model with 10 eigenpairs is fitted on the global ``pcm``
    and a linear least-squares map from its eigenvectors back to ``pcm`` is
    computed. The loss is the total squared error of that reconstruction.

    Parameters
    ----------
    params : sequence of two floats
        ``params[0]`` is ``log(epsilon)`` (it is exponentiated here),
        ``params[1]`` is the cut-off passed to ``DiffusionMaps``.

    Returns
    -------
    float
        Sum of squared residuals ``||evecs @ c - pcm||^2`` over all columns.
    """
    eps = np.exp(params[0])
    cut_off = params[1]

    dmap = dfold.DiffusionMaps(epsilon=eps, cut_off=cut_off, n_eigenpairs=10)
    dmap = dmap.fit(pcm)
    evecs = dmap.eigenvectors_

    # one possibility to evaluate the "quality" of a DMAP embedding
    # is to try to map back to the original space (here, pcm)
    target = np.asarray(pcm)
    coeffs, _, _, _ = np.linalg.lstsq(evecs, target, rcond=1e-10)

    # Compute the residual explicitly: the residual array returned by lstsq is
    # EMPTY when `evecs` is rank-deficient at this rcond, which would make the
    # loss 0.0 and let the optimizer mistake a degenerate embedding for a
    # perfect one.
    residual = evecs @ coeffs - target
    return float(np.sum(np.abs(residual) ** 2))

n_iters_dmap = 25
# Search box: log(epsilon) within a factor 5 and the cut-off within a factor 2
# of the values currently stored on `pcm`.
bounds_dmaps = np.array([[np.log(pcm.kernel.epsilon/5), np.log(pcm.kernel.epsilon*5)],
                         [pcm.cut_off/2, pcm.cut_off*2]])

import warnings

bar = TutorialBar(n_iters_dmap)
# The minimization throws some unnecessary warnings. Silence them only while
# it runs: the context manager restores the previous warning filters on exit,
# even if gp_minimize raises.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    res = gp_minimize(loss_dmaps,           # the function to minimize
                      bounds_dmaps,         # the bounds on each dimension of x
                      acq_func="EI",        # the acquisition function
                      n_calls=n_iters_dmap, # the number of evaluations of the loss
                      n_random_starts=5,    # the number of random initialization points
                      random_state=1234,    # the random seed
                      callback=lambda state: bar.update()) # the progress bar

# Stack the evaluated points into an (n_calls, 2) array and the losses into an
# (n_calls, 1) column. np.vstack is used because np.row_stack is a deprecated alias of it.
xp, yp = np.vstack(res.x_iters), np.vstack(res.func_vals)

# Pick the evaluated point with the smallest loss.
ind_best = np.argmin(yp)
xp_best = xp[ind_best, :]

# The first search dimension is log(epsilon); map it back to epsilon.
opt_epsilon = np.exp(xp_best[0])
opt_cutoff = xp_best[1]

print(f'Previously: epsilon={pcm.kernel.epsilon}, cut-off={pcm.cut_off}')
print(f'Optimal: epsilon={opt_epsilon}, cut-off={opt_cutoff}')

# after the optimal parameters have been found, we can construct DMAP
dmap = dfold.DiffusionMaps(epsilon=opt_epsilon, cut_off=opt_cutoff, n_eigenpairs=10)
dmap = dmap.fit(pcm)
evecs, evals = dmap.eigenvectors_, dmap.eigenvalues_

In [None]:
# Five side-by-side panels: eigenvector 1 on the x-axis against
# eigenvectors 2..6 on the y-axis, coloured like the original point cloud.
fig, ax = plt.subplots(1, 5, figsize=(12, 3), sharey=True)
for k, panel in enumerate(ax):
    panel.scatter(
        evecs[plot_idx, 1],
        evecs[plot_idx, 2 + k],
        s=5,
        c=color_nonlinear[plot_idx],
        cmap='viridis',
    )
    panel.set_xlabel(r"$\phi_1$")
    panel.set_title(r"$\phi_%g$" % (k + 2))
fig.tight_layout()

In [None]:
# now sample the parameters on a grid to be able to compare the hyperparameter results
# (5 x 5 regular grid spanning the same box the optimizer searched)
eps_space = np.linspace(*bounds_dmaps[0], 5)
cutoff_space = np.linspace(*bounds_dmaps[1], 5)
epss, cutoffs = np.meshgrid(eps_space, cutoff_space)
loss_dmaps_all = np.zeros(epss.shape)

# Evaluate the loss at every grid node, row by row, ticking the progress bar each time.
bar = TutorialBar(np.prod(epss.shape))
for k1, k2 in np.ndindex(*epss.shape):
    loss_dmaps_all[k1, k2] = loss_dmaps([epss[k1, k2], cutoffs[k1, k2]])
    bar.update()

In [None]:
# create cubic interpolation to get a smoother picture
# Scattered (log-epsilon, cut-off) coordinates of the coarse grid nodes.
points = np.stack([epss.ravel(), cutoffs.ravel()], axis=1)

# Finer 50 x 50 grid over the same parameter box.
eps_space2 = np.linspace(*bounds_dmaps[0], 50)
cutoff_space2 = np.linspace(*bounds_dmaps[1], 50)
epss2, cutoffs2 = np.meshgrid(eps_space2, cutoff_space2)

# Interpolate the negative log-loss from the coarse nodes onto the fine grid.
grid_loss = scipy.interpolate.griddata(
    points,
    -np.log(loss_dmaps_all.ravel()),
    (epss2, cutoffs2),
    method='cubic',
)

In [None]:
# plot the results
# Colour the sampled points by iteration order (later evaluations get larger
# values). A previous assignment of `c` based on the loss was overwritten
# before use and has been removed.
c = np.arange(0, xp.shape[0])/2 + 1

fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: interpolated negative log-loss with the optimizer's samples on top.
ax[0].imshow(grid_loss,
             extent=[np.min(eps_space), np.max(eps_space), np.min(cutoff_space), np.max(cutoff_space)],
             origin='lower', cmap='viridis')
# Marker size also grows with the iteration index.
ax[0].scatter(*xp.T, s=np.arange(0, xp.shape[0]) + 10, c=c, cmap='bwr')
# Green cross marks the best evaluated point.
ax[0].plot(*xp_best.T, 'gx')
ax[0].set_title('sampling in parameter space')
ax[0].set_xlabel(r'$\log(\epsilon)$')
ax[0].set_ylabel(r'cut-off')
# Rescale so the two parameter ranges fill a square panel.
ax[0].set_aspect((np.max(eps_space) - np.min(eps_space)) / (np.max(cutoff_space) - np.min(cutoff_space)))

# Right panel: log of the loss at each optimizer iteration, best one marked.
ax[1].plot(np.log(yp))
ax[1].plot(ind_best, np.log(yp[ind_best]), 'gx')
ax[1].set_xlabel('iteration')
ax[1].set_ylabel(r'$\log$(error)')
ax[1].set_title('error over iterations');