# PCA tutorial

In [77]:
# General imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn.decomposition import PCA

### Read in data

In [78]:
# This data is known as the UCI ML Breast Cancer Wisconsin Diagnostic datset
(data,target) = datasets.load_breast_cancer(return_X_y=True)

In [79]:
data = pd.DataFrame(data)
target = pd.Series(target)

In [6]:
data.shape

(569, 30)

### Standardize data

In [80]:
data_std = data.subtract(data.mean()).divide(data.std())

### Get covariance matrix

In [81]:
data_cov = data_std.cov()

In [83]:
data_cov.shape

(30, 30)

### Do eigenvalue decomposition on covariance matrix

In [84]:
eig = np.linalg.eig(data_cov.values)

#### Get eigenvalues

In [85]:
eigvalues = eig[0]

#### Get eigenvectors

In [86]:
# Each column is an eigenvector. Each element of the eigenvector can be called a "loading"
eigenvectors = eig[1]

In [101]:
eigenvectors.shape

(30, 30)

### Transform data

In [87]:
transformed = data_std.values.dot(eigenvectors)

In [88]:
transformed.shape

(569, 30)

### Scikit learn version

In [91]:
pca = PCA()
sklearn_transform = pca.fit_transform(data_std)

In [92]:
sklearn_transform.shape

(569, 30)

### See if transformed datasets from both approaches are equal

In [93]:
pd.DataFrame(transformed).equals(pd.DataFrame(sklearn_transform))

False

In [96]:
np.isclose(transformed, sklearn_transform)

array([[ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False],
       ...,
       [ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False]])

In [97]:
transformed

array([[ 9.18475521,  1.94687003, -1.12217877, ..., -0.06878939,
         0.34526111, -0.09643004],
       [ 2.38570263, -3.76485906, -0.52882737, ...,  0.0944953 ,
        -0.11403274,  0.07725949],
       [ 5.72885549, -1.07422859, -0.55126254, ...,  0.06025601,
        -0.20435242, -0.31079325],
       ...,
       [ 1.25507494, -1.90062436,  0.56223582, ..., -0.06329679,
        -0.08134834,  0.0365601 ],
       [10.36567336,  1.67054021, -1.87537919, ...,  0.04244096,
         0.17306409, -0.14052386],
       [-5.4704299 , -0.67004722,  1.4891328 , ...,  0.24302371,
         0.16311055, -0.27443823]])

In [99]:
sklearn_transform[0]

array([ 9.18475521,  1.94687003, -1.12217877,  3.63053641, -1.19405948,
        1.41018364,  2.15747152, -0.39805698, -0.15698023, -0.87663054,
        0.26272429, -0.8582593 ,  0.10329677, -0.6901968 , -0.60126408,
        0.74446075, -0.2652374 , -0.54907956, -0.1336499 ,  0.34526111,
        0.09643004,  0.06878939,  0.08444429, -0.17510221, -0.15088729,
       -0.20132631, -0.25236294, -0.03388464,  0.04560759, -0.04712774])

In [100]:
transformed[0]

array([ 9.18475521,  1.94687003, -1.12217877,  3.63053641,  1.19405948,
        1.41018364, -2.15747152,  0.39805698, -0.15698023, -0.87663054,
        0.26272429, -0.8582593 ,  0.10329677,  0.6901968 , -0.60126408,
        0.74446075, -0.2652374 , -0.54907956, -0.1336499 , -0.04712774,
        0.04560759,  0.03388464, -0.25236294,  0.20132631, -0.15088729,
        0.17510221, -0.08444429, -0.06878939,  0.34526111, -0.09643004])

In [104]:
# Should be all 0's on the diagonal
pd.DataFrame(transformed).cov()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,13.28161,2.601987e-15,-2.23921e-15,-1.601223e-15,3.027312e-15,-6.504969e-16,-3.37758e-16,2.745847e-15,5.6293e-16,1.103968e-15,...,3.050181e-15,-4.284523e-16,-1.509356e-15,1.488637e-15,8.670685e-16,-8.381402e-16,-1.788866e-15,2.564459e-16,-3.455765e-16,4.096879e-16
1,2.601987e-15,5.691355,2.25172e-16,-1.200917e-15,-2.501911e-17,0.0,7.88102e-16,1.907707e-16,-3.002293e-16,7.568281e-16,...,-1.134753e-15,5.80717e-16,5.879491e-16,-1.188408e-15,-2.55664e-16,-1.813886e-16,4.784905e-16,2.294233e-16,8.608138e-16,-5.551115e-16
2,-2.23921e-15,2.25172e-16,2.817949,-2.752102e-16,5.754395e-16,5.316561e-16,1.426089e-15,3.064841e-16,3.502675e-16,-1.352596e-16,...,-4.55426e-17,-3.45381e-16,-8.503566e-16,2.478456e-16,-4.984276e-18,-9.851275000000001e-17,2.838105e-16,-5.293106e-16,-3.775345e-16,4.190701e-16
3,-1.601223e-15,-1.200917e-15,-2.752102e-16,1.98064,-2.054694e-15,8.537771e-16,1.163389e-15,-2.384634e-16,-6.848981e-16,3.236847e-16,...,9.435918e-17,-9.968552e-18,-9.401713000000001e-17,-3.4205820000000006e-17,-1.078949e-16,3.291577e-16,-2.236083e-16,2.869379e-16,-2.824423e-17,3.596497e-17
4,3.027312e-15,-2.501911e-17,5.754395e-16,-2.054694e-15,1.648731,9.444714e-16,6.848981e-16,1.032038e-15,-8.037389e-16,1.376051e-16,...,2.2575840000000002e-17,1.955717e-16,9.460351000000001e-17,9.929459000000001e-17,-3.424491e-16,-2.009347e-16,-2.60746e-16,2.02303e-16,-8.092119000000001e-17,3.252484e-16
5,-6.504969e-16,0.0,5.316561e-16,8.537771e-16,9.444714e-16,1.207357,-3.37758e-16,1.813886e-16,-7.974841e-17,5.144555e-16,...,-4.109584e-17,2.218491e-17,-2.8928350000000004e-17,4.456529e-16,-9.577628e-17,-2.478456e-16,1.27832e-16,-1.591792e-16,-2.1500800000000002e-17,1.266592e-16
6,-3.37758e-16,7.88102e-16,1.426089e-15,1.163389e-15,6.848981e-16,-3.37758e-16,0.6752201,-1.000764e-16,0.0,3.768504e-16,...,3.4321870000000005e-17,2.697373e-17,2.280795e-16,2.847878e-16,4.769268e-17,-2.971019e-17,-9.372393e-17,-3.850597e-17,-1.219682e-16,1.305685e-16
7,2.745847e-15,1.907707e-16,3.064841e-16,-2.384634e-16,1.032038e-15,1.813886e-16,-1.000764e-16,0.4766171,-2.126624e-16,-4.3001600000000004e-17,...,-6.723886e-17,5.497363e-17,-4.945184e-17,1.646521e-16,1.386801e-16,9.069428000000001e-17,-1.07504e-16,-7.877111e-17,-6.645701e-18,-2.306449e-16
8,5.6293e-16,-3.002293e-16,3.502675e-16,-6.848981e-16,-8.037389e-16,-7.974841e-17,0.0,-2.126624e-16,0.4168948,-2.001529e-16,...,-1.220567e-17,2.834196e-17,-1.983937e-16,-9.421259000000001e-17,3.9483280000000004e-17,-3.643408e-16,1.12586e-16,3.47922e-16,-1.278687e-16,1.2705020000000001e-17
9,1.103968e-15,7.568281e-16,-1.352596e-16,3.236847e-16,1.376051e-16,5.144555e-16,3.768504e-16,-4.3001600000000004e-17,-2.001529e-16,0.3506935,...,-2.069452e-16,2.492627e-16,1.980028e-16,1.821704e-16,-7.271179e-17,1.407325e-17,-4.3001600000000004e-17,-2.345542e-18,-3.831051e-17,2.369974e-16
