# In-class notebook: 2024-02-12

In this notebook, we will look at a few different ways to reduce the dimensionality of the data. We demonstrate this on a set of SDSS galaxy spectra. 

This notebook is intended to support Chapter 7 of the textbook; the material is taken from the following astroML notebook and script:
* https://github.com/astroML/astroML-notebooks/blob/main/chapter7/astroml_chapter7_Dimensionality_Reduction.ipynb
* https://github.com/astroML/astroML_figures/blob/main/book_figures/chapter7/fig_PCA_LLE.py

In [None]:
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.patches import Ellipse
from matplotlib import ticker

In [None]:
# let's first create some fake data: 100 draws of a 3-element data vector
# NOTE: np.random.normal draws every entry independently, so this first data
# set has no built-in correlation between the 3 elements
# you can rerun this step and get different patterns (no random seed is set)
X = np.random.normal(size=(100, 3))

In [None]:
# X is a 100 x 3 matrix we wish to decompose
# think 100 realizations of a data vector with 3 elements
# (X is transposed for display, so the image has 3 rows and 100 columns)
plt.imshow(X.T)

## We first perform PCA on this using the SVD method 

In [None]:
# first approach: do the SVD decomposition on X, i.e. X = U @ diag(Sdiag) @ VT
# full_matrices=False requests the reduced ("economy") factorization; for a
# 100 x 3 input that gives U of shape (100, 3), Sdiag of shape (3,) and
# VT of shape (3, 3)
U, Sdiag, VT = np.linalg.svd(X, full_matrices=False)

In [None]:
# Sdiag holds the singular values of X; note that it decreases monotonically
# (np.linalg.svd returns them sorted in descending order).
# Strictly speaking these are not eigenvalues: the eigenvalues of X^T X are
# the *squares* of the singular values (Sdiag**2), so the axis is labelled
# "singular value".
print(Sdiag)
plt.plot(Sdiag)
plt.ylabel("singular value")
plt.xlabel("index")

In [None]:
# these are the eigenvectors (principal directions); X has 3 columns, so there
# are exactly 3 of them
# np.linalg.svd returns the eigenvectors as the ROWS of VT, so the i-th one is
# VT[i] (indexing VT.T[i] would instead pick the i-th element of every
# eigenvector)
for i in range(3):
    plt.plot(VT[i], label=str(i))
plt.legend()

In [None]:
# this is the "coefficients" when projecting onto the new axis
# (in normalized form: since X = U @ diag(Sdiag) @ VT and the rows of VT are
# orthonormal, the projection X @ VT.T equals U @ diag(Sdiag))
plt.imshow(U.T)
# trailing expression: the notebook displays the shape of U
U.shape

In [None]:
# Sanity check: rebuild the X matrix from its three SVD factors and show the
# fractional residual (rebuilt - X) / X, which should be at round-off level
X_rebuilt = U.dot(np.diag(Sdiag)).dot(VT)
plt.imshow((X_rebuilt - X) / X)
plt.colorbar()

In [None]:
# Low-rank reconstructions of X, side by side: panel number n_keep (0, 1, 2)
# keeps only the n_keep largest singular values and zeroes all the others
# before multiplying the SVD factors back together
plt.figure(figsize=(4, 10))
for n_keep in range(3):
    kept = Sdiag.copy()
    kept[n_keep:] = 0
    E1 = np.diag(kept)
    plt.subplot(1, 3, n_keep + 1)
    plt.imshow(U.dot(E1).dot(VT))
    plt.tick_params(left=False, bottom=False,
                    labelleft=False, labelbottom=False)
    # same colour range in every panel so they can be compared by eye
    plt.clim(-2.5, 2.5)

In [None]:
# Second approach: eigen-decompose X^T X directly.
# (X^T X is the covariance matrix of X only up to mean subtraction and a
# constant prefactor, both of which are skipped here.)
CX = X.T.dot(X)

# eigh is the eigen-solver for symmetric matrices: it returns the eigenvalues
# and a matrix R whose columns are the eigenvectors (the rotation)
CYdiag, R = np.linalg.eigh(CX)

In [None]:
# display the eigenvalues; np.linalg.eigh returns them in ascending order
CYdiag

In [None]:
# Consistency check between the two approaches: the eigenvalues of X^T X
# (flipped into descending order) should lie on top of the squared singular
# values from the SVD (dashed)
plt.plot(np.flip(CYdiag))
plt.plot(np.square(Sdiag), linestyle='--')
plt.ylabel("eigen value")
plt.xlabel("index")

In [None]:
# element-wise ratio of the SVD eigenvectors to the eigh eigenvectors:
# VT is reversed because the SVD orders components by decreasing singular
# value while eigh uses ascending eigenvalues, and transposed because eigh
# stores the eigenvectors as columns of R. Each column of the ratio should be
# all +1 or all -1, since an eigenvector is only defined up to an overall sign.
VT[::-1].T/R

## Now with a slightly more complicated case where the data is correlated 

In [None]:
# Draw a fresh set of 3-dimensional fake data and embed it in 10 dimensions.
# Every 10-element data vector is a linear mix of the same 3 random numbers,
# so the data only has 3 intrinsic dimensions and its columns are correlated.
latent = np.random.normal(size=(100, 3))   # intrinsic 3-d samples
R0 = np.random.random((3, 10))             # random 3 -> 10 mixing matrix
X = latent.dot(R0)

In [None]:
# X is now a 100 x 10 matrix we wish to decompose
# think 100 realizations of a data vector with 10 elements
# (X is transposed for display, so the image has 10 rows and 100 columns)
plt.imshow(X.T)

In [None]:
# we do the same thing: SVD of X, and eigen-decomposition of X^T X
U, Sdiag, VT = np.linalg.svd(X, full_matrices=False)
CX = np.dot(X.T, X)
CYdiag, R = np.linalg.eigh(CX)

# Sdiag**2 (squared singular values) and CYdiag (eigenvalues of X^T X, reversed
# into descending order) are the same quantity -- the eigenvalues -- so the
# axis is labelled "eigen value" rather than "eigen value^2"
plt.plot(Sdiag**2)
plt.plot(CYdiag[::-1], ls='--')
plt.ylabel("eigen value")
plt.xlabel("index")
plt.yscale("linear")
# check what happens when we switch to log?
# note that only the first 3 are meaningful!

In [None]:
# Low-rank reconstructions of the 100 x 10 matrix, side by side: panel number
# n_keep (0 ... 9) keeps only the n_keep largest singular values and zeroes
# all the others before multiplying the SVD factors back together
plt.figure(figsize=(10, 10))
for n_keep in range(10):
    kept = Sdiag.copy()
    kept[n_keep:] = 0
    E1 = np.diag(kept)
    plt.subplot(1, 10, n_keep + 1)
    plt.imshow(U.dot(E1).dot(VT))
    plt.tick_params(left=False, bottom=False,
                    labelleft=False, labelbottom=False)
    # same colour range in every panel so they can be compared by eye
    plt.clim(-3, 3)

## Ok now we can use the "blackbox" PCA routine in sklearn

You can read more here: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html, including the different SVD solvers one can choose from.

In [None]:
from sklearn.decomposition import PCA

# Ask sklearn for a PCA decomposition that keeps the leading 4 PCs, using the
# exact (full SVD) solver; fit() returns the fitted estimator itself
pca = PCA(n_components=4, svd_solver='full').fit(X)
# coordinates of every data vector along those 4 PCs
comp = pca.transform(X)

In [None]:
# pull the fitted quantities out of the sklearn PCA object
mean = pca.mean_ # mean of the data vectors (sklearn subtracts it before decomposing)
components = pca.components_ # first 4 components, one component per row
var = pca.explained_variance_ # first 4 eigen values (variance explained by each component)

In [None]:
# The mean stored by sklearn (solid) should lie on top of the per-element
# mean computed directly from X (dashed)
plt.plot(mean)
plt.plot(X.mean(axis=0), linestyle='--')

In [None]:
# compare sklearn's eigenvalues with the squared singular values from the SVD
print(var)
print(Sdiag**2)
# note that in previous operations we ignored the prefactors!
# this is resulting in the factor of ~100 here
# (sklearn's explained_variance_ is normalized by n_samples - 1, i.e. 99 for
#  100 data vectors, and is computed after subtracting the mean, so the ratio
#  is close to, but not exactly, 99)

In [None]:
# Overlay the sklearn components (solid black) on the SVD eigenvectors (dashed).
# Absolute values are plotted because the overall sign of an eigenvector is
# arbitrary and need not agree between the two computations.
for idx, (pc_sklearn, pc_svd) in enumerate(zip(components[:4], VT[:4])):
    plt.plot(np.abs(pc_sklearn), color='k')
    plt.plot(np.abs(pc_svd), label=str(idx), ls='--')
plt.legend()

## Now let's try this on a set of SDSS spectra

### We randomly select 15 spectra to look at

SDSS internally did a PCA reconstruction already, so we are plotting the raw and reconstructed spectra here.

In [None]:
from astroML.datasets import sdss_corrected_spectra

#----------------------------------------------------------------------
# Load the SDSS spectra and rebuild them from the pre-computed PCA
data = sdss_corrected_spectra.fetch_sdss_corrected_spectra()
spectra_raw = data['spectra']                                    # as stored in the data set
spectra_corr = sdss_corrected_spectra.reconstruct_spectra(data)  # PCA-reconstructed
wavelengths = sdss_corrected_spectra.compute_wavelengths(data)

#------------------------------------------------------------
# Pick nrows * ncols = 15 spectra at random.  The seed is fixed so the same
# selection is drawn on every run; indices are drawn independently, so
# repeats are possible in principle.
nrows, ncols = 5, 3
np.random.seed(5)
n_spectra = spectra_corr.shape[0]
ind = np.random.randint(n_spectra, size=nrows * ncols)
spec_sample_raw, spec_sample_corr = spectra_raw[ind], spectra_corr[ind]

In [None]:
# Plot the selected spectra on an nrows x ncols grid: the raw spectrum in
# black with the PCA-reconstructed spectrum over-plotted in blue.
fig = plt.figure(figsize=(10, 8))

fig.subplots_adjust(left=0.05, right=0.95, wspace=0.05,
                    bottom=0.1, top=0.95, hspace=0.05)

for i in range(ncols):
    for j in range(nrows):
        k = ncols * j + i  # index of the spectrum shown in row j, column i
        ax = fig.add_subplot(nrows, ncols, k + 1)
        ax.plot(wavelengths, spec_sample_raw[k], '-k', lw=1)
        # Give the colour exactly once, as a keyword: also encoding a colour
        # in the format string (e.g. '-k') makes matplotlib warn that the
        # colour is redundantly defined.
        ax.plot(wavelengths, spec_sample_corr[k], '-', lw=1, c='blue')
        ax.set_xlim(3100, 7999)

        # no y tick labels anywhere; x tick labels only on the bottom row
        ax.yaxis.set_major_formatter(plt.NullFormatter())
        ax.xaxis.set_major_locator(plt.MultipleLocator(1000))
        if j < nrows - 1:
            ax.xaxis.set_major_formatter(plt.NullFormatter())
        else:
            ax.set_xlabel(r'wavelength $(\AA)$')

        # pad the automatic y range by 5% at both ends
        ylim = ax.get_ylim()
        dy = 0.05 * (ylim[1] - ylim[0])
        ax.set_ylim(ylim[0] - dy, ylim[1] + dy)

### Next we compute the PCA components

See this file for how we do this with masked data: https://github.com/astroML/astroML/blob/main/examples/datasets/compute_sdss_pca.py

In [None]:
#------------------------------------------------------------
# Eigenvalues, eigenvectors and mean spectrum for the PCA reconstruction
#
# The eigenvalues could be computed with sklearn, as in the commented code:

#from sklearn.decomposition import PCA
#pca = PCA()
#pca.fit(spectra_corr)
#evals = pca.explained_variance_ratio_
#evals_cs = evals.cumsum()

#  but because the spectra have been reconstructed from masked values, that
#  is not exactly correct in this case: we use the values computed in the
#  file compute_sdss_pca.py instead
evals = np.square(data['evals'])
evals_cs = np.cumsum(evals)
evals_cs = evals_cs / evals_cs[-1]   # cumulative sum rescaled to end at 1
evecs = data['evecs']
spec_mean = spectra_corr.mean(axis=0)

#------------------------------------------------------------
# Expansion coefficients of one particular spectrum: project the
# mean-subtracted spectrum onto every eigenvector (the rows of evecs)
spec = spectra_corr[1]
coeff = evecs.dot(spec - spec_mean)

#------------------------------------------------------------
# Show the reconstruction improving as more components are added
fig = plt.figure(figsize=(5, 5))
fig.subplots_adjust(hspace=0, top=0.95, bottom=0.1, left=0.12, right=0.93)

n_used = [0, 4, 8, 20]
for i, n in enumerate(n_used):
    ax = fig.add_subplot(len(n_used), 1, i + 1)

    # gray: the actual spectrum; black: mean plus the first n components
    ax.plot(wavelengths, spec, '-', c='gray')
    approx = spec_mean + coeff[:n].dot(evecs[:n])
    ax.plot(wavelengths, approx, '-k')

    # only the bottom panel keeps its x tick labels
    if i < 3:
        ax.xaxis.set_major_formatter(plt.NullFormatter())

    ax.set_ylim(-2, 21)
    ax.set_ylabel('flux')

    # annotation: number of components used and the fraction of the total
    # variance they account for
    if n == 0:
        text = "mean"
    else:
        if n == 1:
            header = "mean + 1 component\n"
        else:
            header = "mean + %i components\n" % n
        text = header + (r"$(\sigma^2_{tot} = %.2f)$" % evals_cs[n - 1])

    ax.text(0.02, 0.93, text, ha='left', va='top', transform=ax.transAxes)

fig.axes[-1].set_xlabel(r'${\rm wavelength\ (\AA)}$')

In [None]:
# Two stacked panels sharing a logarithmic x axis (the eigenvalue number)
fig = plt.figure(figsize=(10, 7.5))
fig.subplots_adjust(hspace=0.05, bottom=0.12)

# top panel: the individual eigenvalues on log-log axes
ax_top = fig.add_subplot(2, 1, 1, xscale='log', yscale='log')
ax_top.grid()
ax_top.plot(evals, color='k')
ax_top.set_ylabel('Normalized Eigenvalues')
ax_top.xaxis.set_major_formatter(plt.NullFormatter())
ax_top.set_ylim(5E-4, 100)

# bottom panel: the normalized cumulative sum of the eigenvalues
ax_bottom = fig.add_subplot(2, 1, 2, xscale='log')
ax_bottom.grid()
ax_bottom.semilogx(evals_cs, c='k')
ax_bottom.set_xlabel('Eigenvalue Number')
ax_bottom.set_ylabel('Cumulative Eigenvalues')
ax_bottom.set_ylim(0.65, 1.00)

## Now compare all dimensionality reduction methods: PCA, NMF and ICA

In [None]:
from sklearn.decomposition import NMF, FastICA, PCA

# load the SDSS spectra again for this section
data = sdss_corrected_spectra.fetch_sdss_corrected_spectra()
# spectra rebuilt from the pre-computed PCA; compute_PCA_ICA_NMF reads this
# module-level array
spectra = sdss_corrected_spectra.reconstruct_spectra(data)
wavelengths = sdss_corrected_spectra.compute_wavelengths(data)

#----------------------------------------------------------------------
# Compute PCA, ICA, and NMF components
def compute_PCA_ICA_NMF(n_components=5):
    """Decompose the module-level ``spectra`` array with PCA, ICA and NMF.

    Parameters
    ----------
    n_components : int, default 5
        Number of rows returned for each method.  For PCA and ICA the first
        row is the mean spectrum, followed by ``n_components - 1`` fitted
        components; NMF is fitted with ``n_components`` components directly.

    Returns
    -------
    pca_comp, ica_comp, nmf_comp : tuple of ndarray
        One array per method, each with ``n_components`` rows.

    Notes
    -----
    ``spectra`` is read from the enclosing module and is left unmodified:
    the non-negative version needed by NMF is built as a copy, so calling
    this function repeatedly always fits PCA and ICA on the same data.
    """
    spec_mean = spectra.mean(0)

    # PCA: use randomized PCA for speed
    pca = PCA(n_components - 1, random_state=0, svd_solver='randomized')
    pca.fit(spectra)
    pca_comp = np.vstack([spec_mean,
                          pca.components_])

    # ICA treats sequential observations as related.  Because of this, we need
    # to fit with the transpose of the spectra
    ica = FastICA(n_components - 1, random_state=0)
    ica.fit(spectra.T)
    ica_comp = np.vstack([spec_mean,
                          ica.transform(spectra.T).T])

    # NMF requires all elements of the input to be non-negative.  Clip the
    # negative fluxes to zero in a copy rather than in place, so the shared
    # module-level array is not altered as a side effect.
    spectra_nonneg = np.clip(spectra, 0, None)
    nmf = NMF(n_components, random_state=0)
    nmf.fit(spectra_nonneg)
    nmf_comp = nmf.components_

    return pca_comp, ica_comp, nmf_comp

# number of rows (panels) per method in the comparison
n_components = 5
# tuple (pca_comp, ica_comp, nmf_comp), in that order
decompositions = compute_PCA_ICA_NMF(n_components)


In [None]:
#----------------------------------------------------------------------
# Plot the results: one column per method, one row per component
fig = plt.figure(figsize=(10, 8))
fig.subplots_adjust(left=0.05, right=0.95, wspace=0.05,
                    bottom=0.1, top=0.95, hspace=0.05)

titles = ['PCA components', 'ICA components', 'NMF components']
xlim = [3000, 8000]

for i, comp in enumerate(decompositions):
    # the PCA and ICA arrays carry the mean spectrum in row 0; NMF does not
    has_mean_row = titles[i].startswith(('PCA', 'ICA'))

    for j in range(n_components):
        ax = fig.add_subplot(n_components, 3, 3 * j + 1 + i)

        # no y tick labels anywhere; x tick labels only on the bottom row
        ax.yaxis.set_major_formatter(plt.NullFormatter())
        ax.xaxis.set_major_locator(plt.MultipleLocator(1000))
        if j == n_components - 1:
            ax.xaxis.set_major_locator(
                plt.FixedLocator(list(range(3000, 7999, 1000))))
            ax.set_xlabel(r'wavelength ${\rm (\AA)}$')
        else:
            ax.xaxis.set_major_formatter(plt.NullFormatter())

        ax.plot(wavelengths, comp[j], '-k', lw=1)

        # plot zero line
        ax.plot(xlim, [0, 0], '-', c='gray', lw=1)

        if j == 0:
            ax.set_title(titles[i])

        # panel label: NMF rows are numbered from 1, PCA/ICA rows start
        # with the mean and then count components from 1
        if not has_mean_row:
            label = 'component %i' % (j + 1)
        elif j == 0:
            label = 'mean'
        else:
            label = 'component %i' % j

        ax.text(0.03, 0.94, label, transform=ax.transAxes,
                ha='left', va='top')

        # shrink the tick marks
        for tick_line in [*ax.get_xticklines(), *ax.get_yticklines()]:
            tick_line.set_markersize(2)

        # adjust y limits: 5% padding below, 20% above (room for the label)
        y_lo, y_hi = ax.get_ylim()
        dy = 0.05 * (y_hi - y_lo)

        ax.set_ylim(y_lo - dy, y_hi + 4 * dy)
        ax.set_xlim(xlim)



## Look at one more complicated example for LLE (locally linear embedding)

In [None]:
from sklearn import manifold, neighbors

from astroML.datasets import sdss_corrected_spectra
from astroML.datasets import fetch_sdss_corrected_spectra
from astroML.plotting.tools import discretize_cmap
from astroML.utils.decorators import pickle_results

#------------------------------------------------------------
# Colour-map configuration shared by the scatter plots
# class names, indexed by the integer class code
cdict = [
    'unknown',
    'star',
    'absorption galaxy',
    'galaxy',
    'emission galaxy',
    'narrow-line QSO',
    'broad-line QSO',
]
# class codes that get a tick on the colour bar (2 ... 6)
cticks = list(range(2, 7))
# colour limits extend half a unit beyond the first and last tick
clim = (cticks[0] - 0.5, cticks[-1] + 0.5)
# one discrete colour per ticked class
cmap = discretize_cmap(plt.cm.jet, len(cticks))
# tick formatter: round the tick value to the nearest code and look up its name
formatter = plt.FuncFormatter(lambda tick, *_: cdict[int(np.round(tick))])

#------------------------------------------------------------
# Fetch the data; PCA coefficients have been pre-computed
data = fetch_sdss_corrected_spectra()
coeffs_PCA = data['coeffs']  # pre-computed PCA coefficients, plotted as-is
# colour value for each PCA point -- presumably the spectral-class code that
# `cdict` translates into names (verify against the astroML data set docs)
c_PCA = data['lineindex_cln']
# reconstructed spectra; read as a module-level input by compute_spec_LLE
spec = sdss_corrected_spectra.reconstruct_spectra(data)
# same column as c_PCA; compute_spec_LLE filters it alongside the embedding
color = data['lineindex_cln']


In [None]:
#------------------------------------------------------------
# Compute the LLE projection; save the results
@pickle_results("spec_LLE.pkl")
def compute_spec_LLE(n_neighbors=10, out_dim=3):
    """Embed the module-level ``spec`` array with modified LLE and trim outliers.

    Parameters
    ----------
    n_neighbors : int, default 10
        Neighbourhood size, used both for the embedding and for the
        outlier test below.
    out_dim : int, default 3
        Number of dimensions of the embedding.

    Returns
    -------
    (embedding, colors) : tuple
        The rows of the embedding that pass the outlier cut, and the matching
        entries of the module-level ``color`` array.

    Notes
    -----
    A point is dropped when the distance to the last of its ``n_neighbors``
    tree matches exceeds the mean of that distance by more than 0.25
    standard deviations.  The ``pickle_results`` decorator presumably caches
    the return value in ``spec_LLE.pkl`` -- see the astroML documentation.
    """
    # Compute the LLE projection
    embedder = manifold.LocallyLinearEmbedding(
        n_neighbors=n_neighbors,
        n_components=out_dim,
        method='modified',
        eigen_solver='dense',
    )
    Y_LLE = embedder.fit_transform(spec)
    print(" - finished LLE projection")

    # remove outliers for the plot
    tree = neighbors.BallTree(Y_LLE)
    dist, _ = tree.query(Y_LLE, n_neighbors)
    farthest = dist[:, -1]
    excess = farthest - farthest.mean()
    is_outlier = excess > 0.25 * np.std(excess)
    print(" - removing {0} outliers for plot".format(is_outlier.sum()))

    keep = ~is_outlier
    return Y_LLE[keep], color[keep]

# 10 neighbours, 3 output dimensions; returns the outlier-trimmed embedding
# together with the matching entries of `color`
coeffs_LLE, c_LLE = compute_spec_LLE(10, 3)


In [None]:
#----------------------------------------------------------------------
# Plot the results: one figure for the PCA coefficients and one for the LLE
# coefficients, each showing the three pairwise projections of the first
# three coefficients, coloured by class
for (c, coeffs, xlim) in zip([c_PCA, c_LLE],
                             [coeffs_PCA, coeffs_LLE],
                             [(-1.2, 1.0), (-0.01, 0.014)]):
    fig = plt.figure(figsize=(5, 3.75))
    fig.subplots_adjust(hspace=0.05, wspace=0.05)

    # axes for colorbar
    cax = plt.axes([0.525, 0.525, 0.02, 0.35])

    # Create scatter-plots: (subplot position, x column, y column) per panel
    scatter_kwargs = {'s': 4, 'lw': 0, 'edgecolors': 'none',
                      'c': c, 'cmap': cmap}
    panels = []
    for position, ix, iy in [(221, 0, 1), (223, 0, 2), (224, 1, 2)]:
        panel = plt.subplot(position)
        points = panel.scatter(coeffs[:, ix], coeffs[:, iy], **scatter_kwargs)
        points.set_clim(clim)
        panels.append((panel, points))
    (ax1, im1), (ax2, im2), (ax3, im3) = panels

    # axis labels only on the outer edges of the triangle of panels
    ax1.set_ylabel('$c_2$')
    ax2.set_xlabel('$c_1$')
    ax2.set_ylabel('$c_3$')
    ax3.set_xlabel('$c_2$')

    fig.colorbar(im3, ax=ax3, cax=cax,
                 ticks=cticks,
                 format=formatter)

    # hide the tick labels on the inner edges
    ax1.xaxis.set_major_formatter(plt.NullFormatter())
    ax3.yaxis.set_major_formatter(plt.NullFormatter())

    # the two left-hand panels both plot c_1 on x and get the same x range
    for ax in (ax1, ax2):
        ax.set_xlim(xlim)

    for ax in (ax1, ax2, ax3):
        ax.xaxis.set_major_locator(plt.MaxNLocator(5))
        ax.yaxis.set_major_locator(plt.MaxNLocator(5))