# Principal Component Analysis

Principal component analysis is a fast and flexible unsupervised method for dimensionality reduction in data. 
Its behavior is easiest to visualize by looking at a two-dimensional dataset. 

Consider the following 200 points:

In [2]:

import numpy as np
from sklearn import datasets

import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn import datasets
import os, sys, plotly.graph_objects as go
# Put the parent directory on the import search path (only once), presumably
# so that the project-local `erudition` package imported next can be resolved
# when the notebook is run from its own folder.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
from erudition.learning.helpers.plots.plotly_render import render, scatter


In [32]:
# Reproducible toy dataset: 200 correlated 2-D points, built by pushing
# standard-normal samples through a random 2x2 linear map (fixed seed).
rng = np.random.RandomState(1)
X = np.dot(rng.rand(2, 2), rng.randn(2, 200)).T

s = scatter(X[:, 0], X[:, 1], '', mode='markers', size=8, color='yellow')
fig = go.Figure(data=s)

# Derive the principal axes from the data itself instead of pasting numbers
# copied from a later cell's output (the pasted second axis had its x and y
# components swapped, so it was not orthogonal to the first axis).
# The eigenvectors of the sample covariance matrix are the principal
# directions and the eigenvalues are the variances along them; np.cov uses
# the same (n - 1) normalisation as scikit-learn's PCA.explained_variance_.
center = X.mean(axis=0)
eigvals, eigvecs = np.linalg.eigh(np.cov(X, rowvar=False))

# eigh returns eigenvalues in ascending order with eigenvectors as columns;
# reverse both so the axis with the largest variance is drawn first.
for variance, direction in zip(eigvals[::-1], eigvecs[:, ::-1].T):
    # Each axis is drawn from the data mean out to three standard deviations.
    # The sign of an eigenvector is arbitrary, so only the orientation of the
    # drawn segment is meaningful, not which side of the mean it points to.
    tip = center + direction * 3 * np.sqrt(variance)
    fig.add_shape(
        go.layout.Shape(
            type="line",
            x0=float(center[0]),
            y0=float(center[1]),
            x1=float(tip[0]),
            y1=float(tip[1]),
            line=dict(
                color="pink",
                width=3
            )
        ))

render(fig, '', width=600, height=600)


Here, rather than attempting to predict the y values from the x values, the unsupervised learning problem attempts to learn about the relationship between the x and y values.
In principal component analysis, this relationship is quantified by finding a list of the principal axes in the data, and using those axes to describe the dataset. 
Using Scikit-Learn's PCA estimator, we can compute this as follows:

In [33]:
from sklearn.decomposition import PCA

# Fit a two-component PCA on the 2-D point cloud X.
pca = PCA(n_components=2)
pca.fit(X)

# For every principal component, print the component direction scaled to
# three standard deviations (sqrt of the explained variance), followed by the
# fitted data mean, framed by '>-----' / '<-----' marker lines.
for variance, axis in zip(pca.explained_variance_, pca.components_):
    scaled_axis = axis * 3 * np.sqrt(variance)
    print('>-----', scaled_axis, pca.mean_, '<-----', sep='\n')

>-----
[-2.47419458 -0.86089761]
[ 0.03351168 -0.00408072]
<-----
>-----
[-0.1340136   0.38515116]
[ 0.03351168 -0.00408072]
<-----


In [39]:
# Dimensionality reduction: keep only the first principal component, so each
# 2-D point in X is summarised by a single coordinate.
pca = PCA(n_components=1)
pca.fit(X)

X_pca = pca.transform(X)

print('original shape: ', X.shape)
print('transformed shape: ', X_pca.shape)

# Map the 1-D representation back into the original 2-D space so it can be
# plotted next to the raw data; the information along the discarded
# component is lost in the round trip.
X_new = pca.inverse_transform(X_pca)

# Overlay the raw points and their reconstruction (legend label typo fixed:
# 'Tranformed' -> 'Transformed').
s = scatter(X[:, 0], X[:, 1], 'Original', mode='markers', size=8, color='yellow', opacity=0.5)
s_pca = scatter(X_new[:, 0], X_new[:, 1], 'Transformed', mode='markers', size=8, color='pink', opacity=0.5)
fig = go.Figure(data=[s, s_pca])

render(fig, '')

original shape:  (200, 2)
transformed shape:  (200, 1)
