In [2]:
# %load ../helpers/header.py
# Name of the per-topic sub-folder; it is joined into IMAGES_PATH further down.
SUB_DIR = "DimenRed"
# The project root is read from the ML_PATH environment variable
# (expected to be exported from .profile).
# NOTE(review): os.environ.get returns None when ML_PATH is unset, in which
# case the os.path.join call below raises TypeError -- confirm ML_PATH is set.
import os, sys
PROJECT_ROOT_DIR=os.environ.get('ML_PATH')
# Extend the import search path before the helper modules are imported
# (presumably MPL_header lives in <project root>/helpers -- verify).
sys.path.append(os.path.join(PROJECT_ROOT_DIR, "helpers")) # add helper modules to path

# MPL 
import MPL_header #load common MPL imports (from helpers)
import matplotlib.pyplot as plt
%matplotlib inline 
# %matplotlib widget 

# NumPy with a fixed global seed (so np.random draws repeat from run to run), then pandas
import numpy as np
np.random.seed(12345)
import pandas as pd

# Where to save the figures: <PROJECT_ROOT_DIR>/fig/<SUB_DIR>
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "fig", SUB_DIR)
# IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "fig")
# Create the output directory up front; exist_ok=True means no error is
# raised if it already exists (e.g. when the cell is re-run).
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current pyplot figure to IMAGES_PATH.

    The file is named ``<fig_id>.<fig_extension>``.

    Args:
        fig_id: Base file name (without extension); also echoed to stdout.
        tight_layout: When true, ``plt.tight_layout()`` is called before saving.
        fig_extension: File extension, also passed to ``savefig`` as ``format``.
        resolution: Passed to ``savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    # Layout adjustment has to happen before the figure is written out.
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

## The more dimensions the training set has, the greater the risk of overfitting it. One solution to the curse of dimensionality could be to increase the size of the training set to reach a sufficient density of training instances, but the number of instances required grows exponentially with the number of dimensions. There are two main approaches to reducing dimensionality: projection and Manifold Learning. Both rely on the observation that training instances usually lie within (or close to) a much lower-dimensional subspace of the high-dimensional space. Often, the task at hand (e.g., classification or regression) will be simpler if expressed in the lower-dimensional space of the manifold. Reducing the dimensionality of your training set before training a model will usually speed up training, but it may not always lead to a better or simpler solution; it all depends on the dataset.

# PCA: select the hyperplane that preserves the largest variance (it keeps the most information and minimises the mean squared distance between the data and its projection)
## Finds the principal axis of maximum variance, then the next axis - orthogonal to the previous ones - that accounts for the largest remaining variance, and so on (up to D axes, the number of dimensions).
## !N.B. PCA (outside of sklearn) needs centered data as input for the SVD

`from sklearn.decomposition import PCA`  
`pca = PCA(n_components = 2)`    
`X2D = pca.fit_transform(X)`

## `pca.explained_variance_ratio_` gives the proportion of the dataset's variance that lies along each principal component

## Choosing D to explain e.g. 95% of variance
`pca = PCA()`  
`pca.fit(X_train)`  
`cumsum = np.cumsum(pca.explained_variance_ratio_)`  
`d = np.argmax(cumsum >= 0.95) + 1`  
## or simply
`pca = PCA(n_components=0.95)`  
`X_reduced = pca.fit_transform(X_train)`  

## also nice to plot cumsum vs. d - looking for a shoulder

# reconstruction error = mean squared distance between the original data and the reconstructed data
`pca = PCA(n_components = 154)`  
`X_reduced = pca.fit_transform(X_train)`  
`X_recovered = pca.inverse_transform(X_reduced)`  
$X_{rec} = X_{d\text{-proj}} W_{d}^{T}$

## Incremental PCA: feeds the algorithm one mini-batch at a time, for large datasets that can't fit into memory (`from sklearn.decomposition import IncrementalPCA`). Alternatively, use `np.memmap` to manipulate a large array stored in a binary file on disk as if it were entirely in memory.

# Kernel PCA

### N.B. kernel trick: a linear decision boundary in the high-dimensional feature space corresponds to a complex nonlinear decision boundary in the original space.

`from sklearn.decomposition import KernelPCA`  
`rbf_pca = KernelPCA(n_components = 2, kernel="rbf", gamma=0.04)`  
`X_reduced = rbf_pca.fit_transform(X)`  

## You can use grid search to select the kernel and hyperparameters that lead to the best performance on a downstream task (e.g. classification), i.e. the unsupervised kPCA step is tuned with a supervised objective
`from sklearn.model_selection import GridSearchCV `  
`from sklearn.linear_model import LogisticRegression `  
`from sklearn.pipeline import Pipeline`  
`clf = Pipeline([  ("kpca", KernelPCA(n_components=2)),  ("log_reg", LogisticRegression()) ])`  
`param_grid = [{ "kpca__gamma": np.linspace(0.03, 0.05, 10), "kpca__kernel": ["rbf", "sigmoid"] }]`  
`grid_search = GridSearchCV(clf, param_grid, cv=3)`   
`grid_search.fit(X, y)`  
` print(grid_search.best_params_)`

## or entirely unsupervised: select the kernel and hyperparameters that give the lowest reconstruction error. N.B. see the finer point on true vs. pre-image error! Use `fit_inverse_transform=True` and `inverse_transform()` for KernelPCA.

# Locally Linear Embedding (LLE) - preserving closest neighbours' relations
## LLE is another powerful nonlinear dimensionality reduction (NLDR) technique. It is a Manifold Learning technique that does not rely on projections

`from sklearn.manifold import LocallyLinearEmbedding`  
`lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10)`  
`X_reduced = lle.fit_transform(X)`

# Other options: Random Projections, Multidimensional Scaling (MDS), Isomap, t-SNE...
