# Geometrical Methods in Machine Learning
## Seminar 4: Kernel PCA

In [None]:
from __future__ import print_function

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import numpy as np
from sklearn.datasets import load_digits, fetch_olivetti_faces, make_moons, make_circles, make_swiss_roll
from sklearn.metrics.pairwise import linear_kernel, rbf_kernel, sigmoid_kernel, cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler

## 0. Moons

In [None]:
# generate the two-moons toy dataset and keep a standardized copy next to the raw one
X, y = make_moons(random_state=42, noise=0.01, n_samples=500)
X_std = StandardScaler().fit(X).transform(X)

In [None]:
# scatter the standardized moons, one colour per class label
plt.figure(figsize=(8, 5))
for label, shade in ((0, 'red'), (1, 'blue')):
    members = X_std[y == label]
    plt.scatter(members[:, 0], members[:, 1], color=shade, alpha=0.25)
plt.xlabel('X')
plt.ylabel('Y')
plt.title('A nonlinear 2D dataset')
plt.grid(linestyle="dotted")
plt.show()

In [None]:
# ordinary PCA on the raw moons: one model keeps both components, the other only the first
pca, pca_1d = PCA(n_components=2), PCA(n_components=1)

X_pca = pca.fit_transform(X)
X_pca_1d = pca_1d.fit_transform(X)

In [None]:
# Three panels: the raw data, its 2-component PCA embedding, and the projection onto PC1.
plt.figure(figsize=(16,4.5))

plt.subplot(131)
plt.title('Original data')
plt.xlabel('X')
plt.ylabel('Y')
plt.grid(linestyle="dotted")
plt.scatter(X[y==0, 0], X[y==0, 1], color='red', alpha=0.25)
plt.scatter(X[y==1, 0], X[y==1, 1], color='blue', alpha=0.25)

plt.subplot(132)
plt.title('PCA')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.grid(linestyle="dotted")
plt.scatter(X_pca[y==0, 0], X_pca[y==0, 1], color='red', alpha=0.25)
plt.scatter(X_pca[y==1, 0], X_pca[y==1, 1], color='blue', alpha=0.25)

plt.subplot(133)
plt.title('Projection to PC1')
plt.xlabel('PC1')
plt.grid(linestyle="dotted")
# The 1D projection is drawn on the horizontal line y = 0. The zero vector is sized from
# the class mask instead of `shape[0]/2`: true division produces a float, which np.zeros
# rejects as a dimension on Python 3, and "half the samples" only holds for balanced classes.
plt.scatter(X_pca_1d[y==0, 0], np.zeros(np.count_nonzero(y==0)), color='red', alpha=0.25)
plt.scatter(X_pca_1d[y==1, 0], np.zeros(np.count_nonzero(y==1)), color='blue', alpha=0.25)

plt.show()

## 1. Kernels

A kernel is any function of the form:

$$K(\mathbf{x}, \mathbf{x}') = \langle \phi(\mathbf{x}), \phi(\mathbf{x}') \rangle$$

where $\phi$ is a function that projects vectors $\mathbf{x}$ and $\mathbf{x}'$ into a new vector space. The kernel function computes the inner-product between two projected vectors.

### 1.1. Linear kernel

A _linear kernel_ is given by $K(\mathbf{x}, \mathbf{x}') = \mathbf{x}^T\mathbf{x}'$, i.e. the inner product in the original vector space. Consider an $n \times m$ dataset $\mathbf{X} = \{ \mathbf{x}_1, \dots, \mathbf{x}_n\}$ with $n$ observations and $m$ features. The _Gram matrix_ $\mathbf{G} = \mathbf{X} \mathbf{X}^T$ is the $n \times n$ matrix of pairwise inner products; in this notation the covariance matrix $\mathbf{S}$ of centered data is given, up to a constant factor, by $\mathbf{X}^T\mathbf{X}$.

#### Exercise

Compute the matrix of pairwise linear kernels for a dataset $\mathbf{X}$ and check that it equals the Gram matrix. You can use `linear_kernel` from `sklearn.metrics.pairwise`.

In [None]:
# your code here
# (exercise: build the pairwise linear-kernel matrix of X and compare it with X X^T)

KernelPCA with a linear kernel is equivalent to standard PCA.

In [None]:
# apply KernelPCA with linear kernel
# Exercise: fill in the four right-hand sides below.
# The plotting cell that follows reads columns 0 and 1 of X_kpca and column 0 of X_kpca_1d.
kpca = 
X_kpca = 

kpca_1d = 
X_kpca_1d = 

In [None]:
# Three panels: the raw data, the 2-component linear-kernel embedding, and its first component.
plt.figure(figsize=(16,4.5))

plt.subplot(131)
plt.title('Original data')
plt.xlabel('X')
plt.ylabel('Y')
plt.grid(linestyle="dotted")
plt.scatter(X[y==0, 0], X[y==0, 1], color='red', alpha=0.25)
plt.scatter(X[y==1, 0], X[y==1, 1], color='blue', alpha=0.25)

plt.subplot(132)
plt.title('Kernel PCA /w linear kernel')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.grid(linestyle="dotted")
plt.scatter(X_kpca[y==0, 0], X_kpca[y==0, 1], color='red', alpha=0.25)
plt.scatter(X_kpca[y==1, 0], X_kpca[y==1, 1], color='blue', alpha=0.25)

plt.subplot(133)
plt.title('Projection to PC1')
plt.xlabel('PC1')
plt.grid(linestyle="dotted")
# The 1D projection is drawn on the horizontal line y = 0. The zero vector is sized from
# the class mask instead of `shape[0]/2`: true division produces a float, which np.zeros
# rejects as a dimension on Python 3, and "half the samples" only holds for balanced classes.
plt.scatter(X_kpca_1d[y==0, 0], np.zeros(np.count_nonzero(y==0)), color='red', alpha=0.25)
plt.scatter(X_kpca_1d[y==1, 0], np.zeros(np.count_nonzero(y==1)), color='blue', alpha=0.25)

plt.show()

### 1.2. Gaussian kernel

A radial basis function is a real-valued function whose value depends only on the distance from some other point $\mathbf{x}'$, so that $\phi (\mathbf {x} ,\mathbf {x'} )= \phi (\|\mathbf {x} -\mathbf {x'} \|)$.

Radial basis kernel is given by:

$$K(\mathbf{x}, \mathbf{x}') = \exp \left(- \gamma \| \mathbf{x} -\mathbf{x}' \|^2\right)$$

If $\gamma = \frac{1}{2 \sigma^2}$, it is known as the Gaussian kernel:

$$K(\mathbf{x}, \mathbf{x}') = \exp \left(- \frac{\| \mathbf{x} -\mathbf{x}' \|^2 }{2 \sigma^2}\right)$$

#### Exercise

Compute the matrix of pairwise RBF kernels for a dataset $\mathbf{X}$. Implement the Gaussian kernel and check that its values equal the scikit-learn solution. Check the kernel values for different $\gamma$ and $\sigma$. You can use `rbf_kernel` from `sklearn.metrics.pairwise`.

In [None]:
# compute K, matrix of pairwise RBF kernels for dataset X
# Exercise: fill in the right-hand side below.
K = 

In [None]:
# implement Gaussian kernel
# Exercise: complete the definition (parameters and body) so that it evaluates
# exp(-||x - x'||^2 / (2 * sigma^2)), the Gaussian kernel formula from the text above.
def gaussian_kernel

#### Exercise

Apply KernelPCA with Gaussian kernel with different values of $\gamma$ to Moons dataset.

In [None]:
# apply KernelPCA with Gaussian kernel
# Exercise: fill in the four right-hand sides below and try several values of gamma.
# The plotting cell that follows reads columns 0 and 1 of X_kpca and column 0 of X_kpca_1d.
kpca = 
X_kpca = 

kpca_1d = 
X_kpca_1d = 

In [None]:
# Three panels: the raw data, the 2-component Gaussian-kernel embedding, and its first component.
plt.figure(figsize=(16,4.5))

plt.subplot(131)
plt.title('Original data')
plt.xlabel('X')
plt.ylabel('Y')
plt.grid(linestyle="dotted")
plt.scatter(X[y==0, 0], X[y==0, 1], color='red', alpha=0.25)
plt.scatter(X[y==1, 0], X[y==1, 1], color='blue', alpha=0.25)

plt.subplot(132)
plt.title('Kernel PCA /w Gaussian kernel')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.grid(linestyle="dotted")
plt.scatter(X_kpca[y==0, 0], X_kpca[y==0, 1], color='red', alpha=0.25)
plt.scatter(X_kpca[y==1, 0], X_kpca[y==1, 1], color='blue', alpha=0.25)

plt.subplot(133)
plt.title('Projection to PC1')
plt.xlabel('PC1')
plt.grid(linestyle="dotted")
# The 1D projection is drawn on the horizontal line y = 0. The zero vector is sized from
# the class mask instead of `shape[0]/2`: true division produces a float, which np.zeros
# rejects as a dimension on Python 3, and "half the samples" only holds for balanced classes.
plt.scatter(X_kpca_1d[y==0, 0], np.zeros(np.count_nonzero(y==0)), color='red', alpha=0.25)
plt.scatter(X_kpca_1d[y==1, 0], np.zeros(np.count_nonzero(y==1)), color='blue', alpha=0.25)

plt.show()

### 1.3 Polynomial kernel

The polynomial kernel is defined as:

$$K(\mathbf{x}, \mathbf{x'}) = (\gamma \mathbf{x}^\top \mathbf{x}' + c)^d$$

where $\mathbf{x}, \mathbf{x'}$ are the input vectors, $\gamma$, $c$ are scalars and $d$ is the kernel degree.

#### Exercise

Apply KernelPCA with polynomial kernel to Moons dataset.

In [None]:
# apply KernelPCA with polynomial kernel
# Exercise: fill in the four right-hand sides below.
# The plotting cell that follows reads columns 0 and 1 of X_kpca and column 0 of X_kpca_1d.
kpca = 
X_kpca = 

kpca_1d = 
X_kpca_1d = 

In [None]:
# Three panels: the raw data, the 2-component polynomial-kernel embedding, and its first component.
plt.figure(figsize=(16,4.5))

plt.subplot(131)
plt.title('Original data')
plt.xlabel('X')
plt.ylabel('Y')
plt.grid(linestyle="dotted")
plt.scatter(X[y==0, 0], X[y==0, 1], color='red', alpha=0.25)
plt.scatter(X[y==1, 0], X[y==1, 1], color='blue', alpha=0.25)

plt.subplot(132)
plt.title('Kernel PCA /w polynomial kernel')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.grid(linestyle="dotted")
plt.scatter(X_kpca[y==0, 0], X_kpca[y==0, 1], color='red', alpha=0.25)
plt.scatter(X_kpca[y==1, 0], X_kpca[y==1, 1], color='blue', alpha=0.25)

plt.subplot(133)
plt.title('Projection to PC1')
plt.xlabel('PC1')
plt.grid(linestyle="dotted")
# The 1D projection is drawn on the horizontal line y = 0. The zero vector is sized from
# the class mask instead of `shape[0]/2`: true division produces a float, which np.zeros
# rejects as a dimension on Python 3, and "half the samples" only holds for balanced classes.
plt.scatter(X_kpca_1d[y==0, 0], np.zeros(np.count_nonzero(y==0)), color='red', alpha=0.25)
plt.scatter(X_kpca_1d[y==1, 0], np.zeros(np.count_nonzero(y==1)), color='blue', alpha=0.25)

plt.show()

### 1.4. Cosine similarity

Cosine similarity is the normalized dot product between vectors $\mathbf{x}, \mathbf{x}'$ which is the cosine of the angle between the points denoted by the vectors:

$$K(\mathbf{x}, \mathbf{x}') = \frac{\mathbf{x}^\top \mathbf{x}'}{\|\mathbf{x}\| \|\mathbf{x}'\|}$$

#### Exercise

Apply Kernel PCA with cosine similarity as precomputed kernel to Moons dataset.

In [None]:
# compute K, matrix of pairwise cosine similarities of dataset X
# Exercise: fill in the right-hand side below.
K = 

In [None]:
# apply KernelPCA with cosine similarity as precomputed kernel
kpca = 
X_kpca =  d

kpca_1d = 
X_kpca_1d = 

In [None]:
# Three panels: the raw data, the 2-component cosine-similarity embedding, and its first component.
plt.figure(figsize=(16,4.5))

plt.subplot(131)
plt.title('Original data')
plt.xlabel('X')
plt.ylabel('Y')
plt.grid(linestyle="dotted")
plt.scatter(X[y==0, 0], X[y==0, 1], color='red', alpha=0.25)
plt.scatter(X[y==1, 0], X[y==1, 1], color='blue', alpha=0.25)

plt.subplot(132)
plt.title('Kernel PCA /w cosine similarity kernel')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.grid(linestyle="dotted")
plt.scatter(X_kpca[y==0, 0], X_kpca[y==0, 1], color='red', alpha=0.25)
plt.scatter(X_kpca[y==1, 0], X_kpca[y==1, 1], color='blue', alpha=0.25)

plt.subplot(133)
plt.title('Projection to PC1')
plt.xlabel('PC1')
plt.grid(linestyle="dotted")
# The 1D projection is drawn on the horizontal line y = 0. The zero vector is sized from
# the class mask instead of `shape[0]/2`: true division produces a float, which np.zeros
# rejects as a dimension on Python 3, and "half the samples" only holds for balanced classes.
plt.scatter(X_kpca_1d[y==0, 0], np.zeros(np.count_nonzero(y==0)), color='red', alpha=0.25)
plt.scatter(X_kpca_1d[y==1, 0], np.zeros(np.count_nonzero(y==1)), color='blue', alpha=0.25)

plt.show()

## 2. Circles

In [None]:
# generate the concentric-circles dataset; from here on X and y refer to it instead of the moons
X, y = make_circles(n_samples=1000, noise=0.1, factor=0.2, random_state=123)

In [None]:
# scatter the circles dataset, one colour per class label
plt.figure(figsize=(6, 6))

for label, shade in ((0, 'red'), (1, 'blue')):
    members = X[y == label]
    plt.scatter(members[:, 0], members[:, 1], color=shade, alpha=0.25)

plt.xlabel('X')
plt.ylabel('Y')
plt.title('A nonlinear 2D dataset')
plt.grid(linestyle="dotted")

plt.show()

#### Exercise

Apply Kernel PCA with Gaussian kernel and find the range of the parameter $\gamma$ for which the data becomes linearly separable.

In [None]:
# apply KernelPCA with Gaussian kernel
# Exercise: fill in the four right-hand sides below and vary gamma.
# The plotting cell that follows reads columns 0 and 1 of X_kpca and column 0 of X_kpca_1d.
kpca =
X_kpca = 

kpca_1d = 
X_kpca_1d = 

In [None]:
# Three panels: the circles data, the 2-component Gaussian-kernel embedding, and its first component.
plt.figure(figsize=(16,4.5))

plt.subplot(131)
plt.title('Original data')
plt.xlabel('X')
plt.ylabel('Y')
plt.grid(linestyle="dotted")
plt.scatter(X[y==0, 0], X[y==0, 1], color='red', alpha=0.25)
plt.scatter(X[y==1, 0], X[y==1, 1], color='blue', alpha=0.25)

plt.subplot(132)
plt.title('Kernel PCA /w Gaussian kernel')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.grid(linestyle="dotted")
plt.scatter(X_kpca[y==0, 0], X_kpca[y==0, 1], color='red', alpha=0.25)
plt.scatter(X_kpca[y==1, 0], X_kpca[y==1, 1], color='blue', alpha=0.25)

plt.subplot(133)
plt.title('Projection to PC1')
plt.xlabel('PC1')
plt.grid(linestyle="dotted")
# The 1D projection is drawn on the horizontal line y = 0. The zero vector is sized from
# the class mask instead of `shape[0]/2`: true division produces a float, which np.zeros
# rejects as a dimension on Python 3, and "half the samples" only holds for balanced classes.
plt.scatter(X_kpca_1d[y==0, 0], np.zeros(np.count_nonzero(y==0)), color='red', alpha=0.25)
plt.scatter(X_kpca_1d[y==1, 0], np.zeros(np.count_nonzero(y==1)), color='blue', alpha=0.25)

plt.show()

## 3. Swiss roll

In [None]:
# sample a Swiss roll and show it as a 3D scatter, coloured by the generator's second output
X, color = make_swiss_roll(random_state=123, n_samples=1000)

fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(1, 1, 1, projection='3d')
# the three columns of X are passed as the x, y and z coordinates
ax.scatter(*X.T, c=color, cmap="Reds")
plt.title('Swiss Roll in 3D')
plt.show()

#### Exercise

Apply Kernel PCA with different kernels and kernel parameters to Swiss roll dataset. Compare results.

In [None]:
# Exercise: fill in the four right-hand sides below with the kernel and parameters under test.
# The plotting cell that follows reads columns 0 and 1 of X_kpca.
kpca = 
X_kpca = 

kpca_1d = 
X_kpca_1d = 

In [None]:
# side by side: columns 0 and 2 of the roll versus the first two kernel PCA components
plt.figure(figsize=(12, 5.75))

# left panel: the original points, coloured with `color`
plt.subplot(1, 2, 1)
plt.scatter(X[:, 0], X[:, 2], c=color, cmap="Reds", alpha=0.25)
plt.gca().set(title='Original data', xlabel='X', ylabel='Z')
plt.grid(linestyle="dotted")

# right panel: the embedding, same colouring
plt.subplot(1, 2, 2)
plt.scatter(X_kpca[:, 0], X_kpca[:, 1], c=color, cmap="Reds", alpha=0.25)
plt.gca().set(title='Kernel PCA /w Gaussian kernel', xlabel='PC1', ylabel='PC2')
plt.grid(linestyle="dotted")

plt.show()