In [1]:
#!/usr/bin/env python
#
import numpy as np
from tools import *
from sklearn.metrics.pairwise import *
#

This code tests Mercer's Theorem.<br>
Given<br>
1. $K$ as the kernel matrix for $n$ samples.<br>
2. $\{e_1, e_2, ..., e_m \}$ as its eigenfunctions <br>
3. $\{v_1, v_2, ..., v_m \}$ as its eigenvectors<br>
4. $\{\sigma_1, \sigma_2, ..., \sigma_m \}$ as its eigenvalues <br>
5. $\mathcal{X} \subset \mathbb{R}^d$ as the input space<br>
6. $k: \mathcal{X} \times \mathcal{X} \to \mathbb{R}$ as a symmetric continuous kernel function.<br>
then Mercer's Thm states that<br>
<br>
$$k(x, y) = \sum_{i=1}^\infty e_i(x) e_i(y)$$<br>
      $$= \begin{bmatrix} e_1(x) & e_2(x) & ... & e_m(x) \end{bmatrix} \begin{bmatrix} e_1(y)  \\ e_2(y) \\ ... \\ e_m(y) \\ \end{bmatrix}      $$<br>
<br>
This implies that using the eigenfunctions as the basis gives a much smaller set of basis functions for the feature map.<br>
Here, the eigenfunction is defined as <br>
$$e_i = \frac{1}{\sqrt{\sigma_i}} \Phi^{\top} v_i$$<br>
<br>
Therefore, using the eigenfunctions as the feature map, we have<br>
$$\Phi = \begin{bmatrix} e_1(x) & e_2(x) & ... \end{bmatrix} = \begin{bmatrix} \Phi e_1 & \Phi e_2 & ... \end{bmatrix} = \begin{bmatrix} \frac{1}{\sqrt{\sigma_1}} \Phi \Phi^{\top} v_1 &  \frac{1}{\sqrt{\sigma_2}} \Phi \Phi^{\top} v_2 & ... \end{bmatrix}  = \begin{bmatrix} \frac{1}{\sqrt{\sigma_1}} K v_1 &  \frac{1}{\sqrt{\sigma_2}} K v_2 & ... \end{bmatrix} = K \begin{bmatrix} v_1 & v_2 & ... \end{bmatrix} \Sigma = K V \Sigma.$$<br>
<br>
Where <br>
$$ \Sigma = \begin{bmatrix} \frac{1}{\sqrt{\sigma_1}} & 0 & 0 & ... \\  0 & \frac{1}{\sqrt{\sigma_2}} & 0 & ...  \\  ... & ... & ... \end{bmatrix}$$<br>
<br>
<div class="alert alert-block alert-info"><br>
<b>Tip:</b> <br>
In this experiment, we are going to <br>
a. generate 10 random samples <br>
b. From these samples, we will directly compute the kernel matrix $K$                                                                           <br>
c. After computing $K$, we are going to use Mercer's theorem to generate $\Phi$ with the eigenfunctions <br>
d. If Mercer's theorem is correct, then the feature map $\Phi$ generated using the eigenfunctions should give us the condition that $\Phi \Phi^{\top} = K$<br>
</div>

In [2]:
# Generate kernel matrix
#
# Draw a small random 2-D sample and build its Gaussian (RBF) kernel matrix.
# A seeded generator keeps the numbers identical on every "Restart & Run All".
γ = 0.5             # gamma parameter handed to rbf_kernel
n_samples = 10      # number of random points (rows of X, size of K)
n_components = 9    # number of leading eigenpairs kept for the feature map
rng = np.random.default_rng(0)
X = rng.standard_normal((n_samples, 2))
K = rbf_kernel(X, gamma=γ)
#
# Generate feature maps via the eigenfunctions
# NOTE(review): this presumes eigh_sort returns the eigenvalues D (and matching
# eigenvector columns V) ordered so that the first n_components entries are the
# ones to keep, and that those eigenvalues are strictly positive -- confirm in tools.
D, V = eigh_sort(K)
Σ = np.diag(1/np.sqrt(D[:n_components]))   # diag(1/sqrt(σ_i)) for the kept eigenvalues
V = V[:, :n_components]                    # keep the matching eigenvector columns

# Feature map from the eigenfunctions: Φ = K V Σ, then rebuild the kernel as Φ Φᵀ
Φ = K.dot(V).dot(Σ)
K2 = Φ.dot(Φ.T)

<br>
Remember that since this is a Gaussian Kernel, the feature map should be $\Phi \in \mathbb{R}^{n \times \infty}$, <br>
but through Mercer's theorem, we are able to obtain $\Phi \in \mathbb{R}^{n \times 9}$. <br>
This is much smaller and easier to deal with. <br>
<br>
Lastly, when we print out the kernel matrices $K$ and $K_2$, notice<br>
that they are approximately the same. <br>


In [3]:
# Show the directly computed kernel matrix K followed by its reconstruction K2,
# with one blank line between the two, so they can be compared by eye.
print(K, K2, sep=' \n\n')
#

[[1.     0.204  0.6798 0.465  0.3538 0.5469 0.7401 0.4797 0.0218 0.2024]
 [0.204  1.     0.3028 0.7858 0.4692 0.7631 0.2694 0.6968 0.1031 0.9396]
 [0.6798 0.3028 1.     0.6959 0.1795 0.508  0.3391 0.3589 0.0063 0.2267]
 [0.465  0.7858 0.6959 1.     0.4026 0.8479 0.3942 0.6876 0.0386 0.6638]
 [0.3538 0.4692 0.1795 0.4026 1.     0.7387 0.7386 0.8897 0.4035 0.638 ]
 [0.5469 0.7631 0.508  0.8479 0.7387 1.     0.6662 0.9566 0.125  0.7887]
 [0.7401 0.2694 0.3391 0.3942 0.7386 0.6662 1.     0.7256 0.1286 0.339 ]
 [0.4797 0.6968 0.3589 0.6876 0.8897 0.9566 0.7256 1.     0.2148 0.799 ]
 [0.0218 0.1031 0.0063 0.0386 0.4035 0.125  0.1286 0.2148 1.     0.2051]
 [0.2024 0.9396 0.2267 0.6638 0.638  0.7887 0.339  0.799  0.2051 1.    ]] 

[[1.     0.204  0.6798 0.465  0.3538 0.5469 0.7401 0.4796 0.0218 0.2024]
 [0.204  1.     0.3028 0.7858 0.4692 0.7631 0.2694 0.6968 0.1031 0.9396]
 [0.6798 0.3028 1.     0.6959 0.1795 0.508  0.3391 0.3589 0.0063 0.2267]
 [0.465  0.7858 0.6959 1.     0.4026 0.848  0.39