In [1]:
# Load Libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd

from sklearn.model_selection import train_test_split

# PCA & Kernel-PCA

As we saw in previous notebooks, PCA and kernel-PCA can be used to perform dimensionality reduction and get a new and smaller dataset with two possible goals:

- Decrease computational cost


- Have fewer irrelevant variables or noise.

## Load Data

In [2]:
# Requires the Google Colab runtime: asks the user to upload a file.
# The result is indexed by file name in the next cell (uploaded['iris.csv'])
# and wrapped in io.BytesIO there, so the values are presumably raw bytes.
from google.colab import files
uploaded = files.upload()

Saving iris.csv to iris.csv


In [8]:
import io

# The uploaded file content is wrapped in an in-memory buffer so that
# pandas can parse it like a file; fields are separated by "|".
dat = pd.read_csv(
    io.BytesIO(uploaded['iris.csv']),
    sep = "|",
)

# Preview the first rows
dat.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


Split the data into training and validation sets (70% / 30%).

In [9]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_val = train_test_split(
    dat,
    test_size=0.3,
    random_state=1,
)


## PCA

PCA transforms the original variables into a new set of variables that are linearly uncorrelated with each other; each new variable is a linear combination of the original ones.


These variables are called **principal components**.


The first components are the ones that retain the most information from the original dataset, i.e., they represent the directions with the most variance or "movement" between points.

![imagen.png](attachment:imagen.png)

In [12]:
# PCA estimator from scikit-learn
from sklearn.decomposition import PCA

# Build the model and fit it on the training rows only;
# the 'Species' label column is dropped because it is not a feature.
pca = PCA().fit(X_train.drop(columns = 'Species'))

# Project every row of the full dataset onto the fitted components
dat_pca = pd.DataFrame(pca.transform(dat.drop(columns = 'Species')))
dat_pca

Unnamed: 0,0,1,2,3
0,-2.643440,0.403419,-0.017038,-0.016677
1,-2.683715,-0.096924,-0.191215,-0.104410
2,-2.855943,-0.056869,0.037132,-0.023230
3,-2.715234,-0.230288,0.050643,0.076491
4,-2.686849,0.414629,0.100375,0.048163
...,...,...,...,...
145,1.983507,0.183663,0.155847,-0.432826
146,1.554138,-0.371823,-0.134768,-0.252217
147,1.801422,0.084119,0.107727,-0.141930
148,1.943930,0.129766,0.697438,-0.042687


Let's select the number of components based on the cumulative ratio of variance explained.

In [13]:
# Running total of the per-component explained-variance ratios:
# entry k is the share of variance captured by the first k+1 components.
variance_explained = pca.explained_variance_ratio_.cumsum()
variance_explained

array([0.92935669, 0.97636704, 0.99561696, 1.        ])

In [14]:
# Keep the smallest number of leading components whose cumulative
# explained-variance ratio is strictly above the threshold.
threshold = 0.99
n_components = np.flatnonzero(variance_explained > threshold).min() + 1

# Leading columns of the projected data
dat_new = dat_pca.iloc[:, :n_components]
dat_new

Unnamed: 0,0,1,2
0,-2.643440,0.403419,-0.017038
1,-2.683715,-0.096924,-0.191215
2,-2.855943,-0.056869,0.037132
3,-2.715234,-0.230288,0.050643
4,-2.686849,0.414629,0.100375
...,...,...,...
145,1.983507,0.183663,0.155847
146,1.554138,-0.371823,-0.134768
147,1.801422,0.084119,0.107727
148,1.943930,0.129766,0.697438


## Kernel-PCA

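Kernel-PCA follows the same intuition as PCA, but this time the components are not restricted to be linear combinations of the original variables. Because the components are computed in the feature space induced by the kernel, there can be more of them than original variables, which means that you can actually perform dimensionality augmentation in addition to dimensionality reduction using kernel-PCA.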

You can choose between these kernels (do they sound familiar to you?):

- linear


- poly


- rbf


- sigmoid


- cosine


- precomputed


In [15]:
from sklearn.decomposition import KernelPCA
# IPython-only help syntax (not valid plain Python): displays the KernelPCA documentation.
?KernelPCA

In [18]:
# Settings for this experiment (threshold is set here but not used in this cell)
threshold = 0.8
kernel = 'rbf'

# Build a 4-component kernel-PCA model and fit it on the training rows only;
# the 'Species' label column is dropped because it is not a feature.
pca = KernelPCA(n_components = 4, kernel = kernel).fit(
    X_train.drop(columns = 'Species')
)

# Project every row of the full dataset onto the fitted components
dat_pca = pd.DataFrame(pca.transform(dat.drop(columns = 'Species')))

dat_pca

Unnamed: 0,0,1,2,3
0,0.805597,0.052245,-0.097526,0.105948
1,0.793666,0.036090,-0.054282,-0.129304
2,0.802957,0.065609,-0.086720,-0.123636
3,0.787260,0.038130,-0.049692,-0.181275
4,0.804090,0.060144,-0.104187,0.108546
...,...,...,...,...
145,-0.516104,0.376079,-0.129164,-0.026794
146,-0.546584,0.031292,-0.212206,-0.029105
147,-0.561752,0.270082,-0.198791,-0.013761
148,-0.492682,0.356247,-0.177992,-0.201493


## Define Custom Function

In [None]:
def pca_function(X, X_train, method = 'pca', threshold = 0.8, n_components = None,
                 kernel = 'rbf'):
    """Project ``X`` onto the leading (kernel-)PCA components fitted on ``X_train``.

    The 'Species' column is dropped from both frames before fitting and
    transforming, for both methods.

    Parameters
    ----------
    X : pandas.DataFrame
        Data to transform; must contain a 'Species' column.
    X_train : pandas.DataFrame
        Data used to fit the model; must contain a 'Species' column.
    method : {'pca', 'kernel-pca'}
        Which decomposition to use.
    threshold : float
        Only used when ``n_components`` is None. The smallest number of
        leading components whose cumulative share of variance (PCA) or of
        eigenvalue mass (kernel-PCA) is strictly greater than ``threshold``
        is kept. If no prefix exceeds the threshold, all components are kept.
    n_components : int or None
        Number of leading components to return. When given, ``threshold``
        is ignored.
    kernel : str
        Kernel passed to ``KernelPCA``; ignored when ``method == 'pca'``.

    Returns
    -------
    pandas.DataFrame
        The first ``n_components`` columns of the transformed ``X``. The
        frame is built from the transformed array, so it carries a fresh
        default index rather than the index of ``X``.

    Raises
    ------
    ValueError
        If ``method`` is not 'pca' or 'kernel-pca'.
    """
    if method not in ('pca', 'kernel-pca'):
        raise ValueError("method must be 'pca' or 'kernel-pca', got %r" % (method,))

    # The label column is not a feature: drop it for fitting AND projecting,
    # so both methods are trained on exactly the columns they later transform.
    train_features = X_train.drop('Species', axis = 1)
    features = X.drop('Species', axis = 1)

    # 3) Define model
    if method == 'pca':
        pca = PCA()
    else:
        pca = KernelPCA(kernel = kernel)

    # 4) Train model
    pca.fit(train_features)

    # 5) Make "predictions"
    dat_pca = pd.DataFrame(pca.transform(features))

    if n_components is None:
        if method == 'pca':
            variance_explained = np.cumsum(pca.explained_variance_ratio_)
        else:
            # Newer scikit-learn exposes the kernel eigenvalues as
            # `eigenvalues_`; older releases only had `lambdas_`.
            eigenvalues = getattr(pca, 'eigenvalues_', None)
            if eigenvalues is None:
                eigenvalues = pca.lambdas_
            variance_explained = np.cumsum(eigenvalues) / np.sum(eigenvalues)

        above = np.flatnonzero(variance_explained > threshold)
        if above.size:
            n_components = int(above[0]) + 1
        else:
            # Threshold is never exceeded (e.g. threshold >= 1): keep everything
            n_components = len(variance_explained)

    return dat_pca.iloc[:, 0:n_components]


In [None]:
# All defaults: plain PCA, component count chosen from the default threshold (0.8)
dat_new = pca_function(X = dat, X_train = X_train)
dat_new

Unnamed: 0,0
0,-31.289498
1,115.456596
2,-7.607357
3,28.256540
4,5.927158
...,...
995,-31.844249
996,-32.035543
997,-31.385145
998,-31.366016


In [None]:
# Plain PCA with a stricter cumulative-variance threshold of 0.99
dat_new = pca_function(X = dat, X_train = X_train, threshold = 0.99)
dat_new

Unnamed: 0,0,1
0,-31.289498,-25.929007
1,115.456596,14.885620
2,-7.607357,-6.472422
3,28.256540,-16.788257
4,5.927158,-21.216214
...,...,...
995,-31.844249,3.065687
996,-32.035543,13.063857
997,-31.385145,-20.929922
998,-31.366016,-21.929739


In [None]:
# Plain PCA with the number of components fixed to 2 (the threshold is not used)
dat_new = pca_function(X = dat, X_train = X_train, n_components = 2)
dat_new

Unnamed: 0,0,1
0,-31.289498,-25.929007
1,115.456596,14.885620
2,-7.607357,-6.472422
3,28.256540,-16.788257
4,5.927158,-21.216214
...,...,...
995,-31.844249,3.065687
996,-32.035543,13.063857
997,-31.385145,-20.929922
998,-31.366016,-21.929739


In [None]:
# Kernel-PCA with a linear kernel; component count chosen from the default threshold
dat_new = pca_function(
    X = dat,
    X_train = X_train,
    method = 'kernel-pca',
    kernel = 'linear',
)
dat_new

Unnamed: 0,0
0,-31.289498
1,115.456596
2,-7.607357
3,28.256540
4,5.927158
...,...
995,-31.844249
996,-32.035543
997,-31.385145
998,-31.366016


In [None]:
# Kernel-PCA with the default 'rbf' kernel, keeping the first 10 components
dat_new = pca_function(
    X = dat,
    X_train = X_train,
    method = 'kernel-pca',
    n_components = 10,
)
dat_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.109489,0.413084,0.390414,0.597827,-0.031412,-0.005823,-0.029699,-0.013380,0.023543,-0.098665
1,-0.090998,-0.035135,-0.031743,0.000304,-0.030444,-0.006538,-0.000169,0.006669,-0.024806,-0.012002
2,-0.090381,-0.034842,-0.031353,0.000298,-0.029861,-0.006405,-0.000165,0.006510,-0.024168,-0.011690
3,-0.090178,-0.034743,-0.031216,0.000296,-0.029638,-0.006353,-0.000163,0.006444,-0.023897,-0.011556
4,-0.092207,-0.035719,-0.032547,0.000316,-0.031736,-0.006836,-0.000178,0.007044,-0.026343,-0.012759
...,...,...,...,...,...,...,...,...,...,...
995,-0.144164,-0.064716,-0.098745,0.035569,0.687031,-0.428464,-0.042839,0.038877,0.058148,0.031873
996,-0.119881,-0.050240,-0.058377,0.001374,-0.116392,0.003465,0.024123,0.550067,0.264132,0.122805
997,0.665170,-0.386472,0.335481,0.157687,0.030836,0.007694,0.018901,0.004041,-0.003912,0.091196
998,0.830973,-0.073046,-0.271427,0.205002,-0.016981,-0.003137,-0.015634,-0.007891,0.016247,-0.042754
