In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

from algorithms import my_pca, my_kpca

# 1 Kernel PCA

You can use external libraries for linear algebra operations but you are expected to write your
own algorithms.

## 1.1 Exercise 1

Use the `kPCA_data_2024.txt` and `kPCA_labels_2024.txt` uploaded in the Datasets folder. \
The first file contains the variables describing the data, while the second one contains the labels
of the classes associated to it.

1. Visualize and explore the data, how many dimensions it has? If you plot the first two dimensions what you see?
2. Apply your own implementation of PCA to the dataset and plot the eigenvalue spectrum.
3. Project the data in the first two principal components and color by class.
4. Implement your own version of Kernel PCA.
5. Apply Kernel PCA to the dataset. Test both a Gaussian kernel with width $\sigma \in [0.05, 2.0]$ and
a polynomial kernel varying the value of $\delta \in \mathbb{N}$.
6. Plot the transformed data in 2d and 3d for the different kernels.

---

**Notes** \
Use the second version of the polynomial kernel introduced in class (the one with ”1 + . . .” ).


In [2]:
data = pd.read_table("./Datasets/kPCA_data_2024.txt", sep=' ' )
labels = pd.read_table("./Datasets/kPCA_labels_2024.txt")
data.head(10)

Unnamed: 0,4.844007303819926336e-02,-6.057277237340876752e-01,-8.064553150260835856e-01,-8.903855782271943853e-02,-1.737847493837963719e-01,2.076300575834306272e+00,3.136215346121585812e-01,-6.046344739416281699e-02,5.887963774585065063e-02,2.217365288367019460e-01,1.393039464461545140e-01,3.023198473065739911e-02,7.705470842646063023e-02,9.108284679853901888e-01,-5.462696605763599396e-03,-7.489078223921445954e-01,-4.726946847934275220e-01,-3.725548172923394685e-01,2.654501526956469792e-01,4.954478209400864364e-01
0,-0.295462,-0.022271,-0.156778,-0.186206,1.065956,-0.109448,0.660369,-0.337726,1.062067,-0.201562,0.512837,-1.07472,0.49996,-0.328338,-0.046867,-0.322455,-0.268781,-0.601436,1.261508,-0.97603
1,0.907188,-0.641413,0.376778,-0.0595,0.579425,-0.771757,0.400979,0.09662,0.000697,1.569824,-0.628336,0.094161,0.959419,0.047134,-0.527203,-0.117238,-0.699726,0.442404,0.395835,-0.600845
2,0.780795,0.290714,0.536952,0.029201,-0.072487,0.435681,-0.570775,0.804292,-0.429733,-1.330242,0.408977,0.434869,0.020344,-0.692591,-0.448373,0.550437,0.655933,0.739303,-0.415398,-0.784031
3,0.441323,-0.433679,0.067781,-0.621546,0.419723,-0.957915,0.338285,0.014053,-0.010859,0.756259,-0.514746,-0.027585,0.959982,0.001654,-0.548821,-0.119099,-0.802109,1.094712,-1.142502,0.702591
4,-0.065164,-0.040561,-0.669958,-0.040982,-0.699678,-0.325913,-0.128257,0.097608,0.686576,1.120796,-0.020707,0.011008,-0.267033,-0.944402,0.905137,-1.111122,-0.213749,1.222356,0.202653,-0.307991
5,0.056646,0.410302,0.056863,-0.323346,-0.261563,1.724,0.264337,0.119102,-0.454987,1.114121,0.9384,-0.772304,-0.388,1.100194,0.081941,0.235535,-0.466863,-0.224178,-0.000573,0.09664
6,-0.736735,-0.148512,-0.075388,0.572474,0.675302,0.642757,-0.311563,-0.224402,0.622451,0.200635,-1.189875,0.048083,0.031853,0.103544,-0.372252,1.17977,-0.44619,0.742163,-0.646883,0.107995
7,-0.481976,-0.025659,0.15084,0.813668,0.171317,0.663545,0.501602,0.258359,0.597251,0.329638,-1.222861,-0.457942,-0.365414,1.002387,-0.727236,1.126941,-0.217852,0.79847,0.201119,-0.024589
8,-0.607208,-0.522803,0.308843,0.358578,0.836118,-0.107347,0.073622,0.398357,-0.216316,0.694409,0.899109,-0.444841,0.200321,0.315959,-0.316035,0.724769,0.450792,1.453086,-0.416107,-1.342363
9,-0.32559,0.25075,-0.154506,0.47207,-0.108509,0.198018,-1.281368,-0.663286,-0.734061,-0.8435,0.357441,-0.329136,-0.115886,0.182965,0.490656,0.280728,0.544659,0.034985,0.47055,0.474415


In [3]:
data.shape

(6631, 20)

In [4]:
px.scatter(x=data.iloc[:, 2], y=data.iloc[:, 1], color=labels.iloc[:, 0]).show()

They is a complete mess here! So probably PCA will fail... No reason to see a linear relationship between data...

In [5]:
data_pca, dimension, eigenvectors = my_pca(data, dim=2)
px.scatter(data_pca.iloc[:, 0], data_pca.iloc[:, 1], color=labels.iloc[:, 0], title=f"Data with {dimension}d PCA").show()

In [6]:
data_pca, dimension, eigenvectors = my_pca(data, dim=3)
fig = px.scatter_3d(x=data_pca.iloc[:, 0], y=data_pca.iloc[:, 1], z=data_pca.iloc[:, 2], color=labels.iloc[:, 0],  title=f"Data with {dimension}d PCA")
fig.update_traces(marker=dict(size=1)).show()

The data are still a mess and are not linearly dividable, so we need more advanced tools.

In [7]:
data_pca, dimension, eigenvectors = my_kpca(data, dim=3, kernel_type='rbf', kernel_param=0.5)

0
1
2
3
4
5
6


In [8]:
data_pca = pd.DataFrame(data_pca)
fig = px.scatter_3d(x=data_pca.iloc[:, 0], y=data_pca.iloc[:, 1], z=data_pca.iloc[:, 2], color=labels.iloc[:, 0],  title=f"Data with {dimension}d PCA")
fig.update_traces(marker=dict(size=1)).show()