In [None]:
# PCA
# Principal Component Analysis (PCA) is by far the most popular dimensionality reduction algorithm.
# First it identifies the hyperplane that lies closest to the data, and then it projects the data onto it

# So how can you find the principal components of a training set? Luckily, there is a standard matrix
# factorization technique called Singular Value Decomposition (SVD) that can decompose the training set
# matrix X into the dot product of three matrices U · Σ · Vᵀ, where Vᵀ contains all the principal components
# that we are looking for.

# Obtain every principal component of the training set with NumPy's svd()
# function, then pull out the first two PCs. np.linalg.svd() hands back the
# right-singular-vector matrix already transposed, so each *row* of V is one
# principal component.
X_centered = X - X.mean(axis=0)
U, s, V = np.linalg.svd(X_centered)
c1, c2 = V[0], V[1]
# Project the training set onto the plane spanned by those two principal
# components:
W2 = V[:2].T
X2D = X_centered @ W2

In [None]:
# Using Scikit-Learn
# Scikit-Learn’s PCA class implements PCA using SVD decomposition just like we did before. The
# following code applies PCA to reduce the dimensionality of the dataset down to two dimensions (note
# that it automatically takes care of centering the data):
from sklearn.decomposition import PCA

# Keep two output dimensions; fit the transformer and project X in one call.
pca = PCA(n_components=2)
X2D = pca.fit_transform(X)
# After fitting the PCA transformer to the dataset, you can access the principal components using the
# components_ attribute (note that it contains the PCs as horizontal vectors, so, for example, the first
# principal component is equal to pca.components_.T[:, 0]).

In [None]:
# Another very useful piece of information is the explained variance ratio of each principal component,
# available via the explained_variance_ratio_ attribute of the fitted PCA object: entry i is the
# proportion of the dataset's variance that lies along the i-th principal component.
# Left as the cell's last expression so that the notebook displays the array.
pca.explained_variance_ratio_

In [None]:
# Instead of arbitrarily choosing the number of dimensions to reduce down to, it is generally preferable to
# choose the number of dimensions that add up to a sufficiently large portion of the variance (e.g., 95%).
# Unless, of course, you are reducing dimensionality for data visualization — in that case you will
# generally want to reduce the dimensionality down to 2 or 3.

In [None]:
# Fit PCA with every dimension kept, then find the smallest number of
# dimensions whose cumulative explained variance reaches 95% of the
# training set's variance:
pca = PCA().fit(X_train)
cumsum = pca.explained_variance_ratio_.cumsum()
d = (cumsum >= 0.95).argmax() + 1  # argmax gives the first True index (0-based), hence the +1
# You could now rerun PCA with n_components=d, but there is a much better
# option: pass n_components as a float between 0.0 and 1.0 to state the ratio
# of variance you wish to preserve instead of a number of components:
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)

In [None]:
# Incremental PCA


# One problem with the preceding implementation of PCA is that it requires the whole training set to fit in
# memory in order for the SVD algorithm to run. Fortunately, Incremental PCA (IPCA) algorithms have
# been developed: you can split the training set into mini-batches and feed an IPCA algorithm one mini-
# batch at a time. This is useful for large training sets, and also to apply PCA online (i.e., on the fly, as new
# instances arrive).

from sklearn.decomposition import IncrementalPCA

n_batches = 100
inc_pca = IncrementalPCA(n_components=154)
# Split the training set into n_batches mini-batches and feed them to the
# IPCA model one at a time through partial_fit().
for X_batch in np.array_split(X_train, n_batches):
    # The loop body must be indented; at column 0 the cell fails to parse.
    inc_pca.partial_fit(X_batch)


# Alternative: NumPy's memmap class lets you manipulate a large array stored
# in a binary file on disk as if it were entirely in memory; it loads only the
# data it needs, when it needs it.
# Because IncrementalPCA works on a small part of the array at any given time,
# memory usage stays under control, which makes it possible to call the usual
# fit() method:
X_mm = np.memmap(
    filename,
    dtype="float32",
    mode="readonly",
    shape=(m, n),
)
batch_size = m // n_batches
inc_pca = IncrementalPCA(batch_size=batch_size, n_components=154)
inc_pca.fit(X_mm)

In [None]:
# Randomized PCA


# Scikit-Learn offers yet another option to perform PCA, called Randomized PCA. This is a stochastic
# algorithm that quickly finds an approximation of the first d principal components. Its computational
# complexity is O(m × d²) + O(d³), instead of O(m × n²) + O(n³), so it is dramatically faster than the
# previous algorithms when d is much smaller than n.
# The randomized solver is stochastic, so pin random_state: rerunning the
# notebook then reproduces the same approximation instead of a slightly
# different one on every run.
rnd_pca = PCA(n_components=154, svd_solver="randomized", random_state=42)
X_reduced = rnd_pca.fit_transform(X_train)

In [None]:
# KERNEL PCA
# It turns out that the kernel trick (best known from Support Vector Machines) can also be applied to PCA,
# making it possible to perform complex nonlinear projections for dimensionality reduction. This is called
# Kernel PCA (kPCA). It is often good at preserving clusters of instances after projection, or sometimes
# even unrolling datasets that lie close to a twisted manifold.


from sklearn.decomposition import KernelPCA

# Reduce X to two dimensions through an RBF kernel.
rbf_pca = KernelPCA(kernel="rbf", gamma=0.04, n_components=2)
X_reduced = rbf_pca.fit_transform(X)

In [None]:
# How to select a kernel
# One way to make this decision is to compare the reconstruction pre-image error obtained with each
# candidate kernel and keep the one that yields the lowest error.
# It can be done this way:

# With fit_inverse_transform=True, Scikit-Learn also learns the inverse mapping
# while fitting, which makes inverse_transform() available for computing
# pre-images of the reduced instances:
rbf_pca = KernelPCA(n_components=2, kernel="rbf", gamma=0.0433,
                    fit_inverse_transform=True)
X_reduced = rbf_pca.fit_transform(X)
X_preimage = rbf_pca.inverse_transform(X_reduced)

# You can then compute the reconstruction pre-image error:
from sklearn.metrics import mean_squared_error

mean_squared_error(X, X_preimage)

In [None]:
# LLE

# Locally Linear Embedding (LLE) is another very powerful nonlinear dimensionality reduction
# (NLDR) technique. It is a Manifold Learning technique that does not rely on projections like the previous
# algorithms. In a nutshell, LLE works by first measuring how each training instance linearly relates to its
# closest neighbors (c.n.), and then looking for a low-dimensional representation of the training set where
# these local relationships are best preserved (more details shortly). This makes it particularly good at
# unrolling twisted manifolds, especially when there is not too much noise.

from sklearn.manifold import LocallyLinearEmbedding

# Embed X in two dimensions, relating each instance to its 10 closest neighbors.
lle = LocallyLinearEmbedding(n_neighbors=10, n_components=2)
X_reduced = lle.fit_transform(X)
