# Principal Component Analysis from scratch using numpy  

Unsupervised learning method used for dimensionality reduction of a dataset.  
  
Goal: 
 - Find a set of dimensions (orthogonal, linearly independent) 
 - Ranked according to variance of data along them 
 - Select the most relevant ones, i.e. those that minimize the projection error
 - Projected points should have maximum variance

Covariance matrix: $Cov(X, Y) = \dfrac{1}{n} \sum_{i=1}^{n} (X_i - \bar X)(Y_i - \bar Y)^T$

Calculate:
 - Eigenvectors of $Cov(X, X)$, which point in the directions of maximum variance  
 - Eigenvalues, which measure the variance along (and hence the importance of) each eigenvector
 
$A \tilde v = \lambda \tilde v$, where $A = Cov(X, X)$, $\tilde v$ is an eigenvector and $\lambda$ is its eigenvalue.

In [None]:
import numpy as np

In [None]:
class PCA:
  """Principal Component Analysis via eigendecomposition of the covariance matrix.

  After ``fit`` the instance exposes:
    mean        -- per-feature mean of the training data, shape (n_features,)
    components  -- principal axes as rows, sorted by decreasing explained
                   variance, shape (nComponents, n_features)
  """

  def __init__(self, nComponents):
    """Store the number of principal components to keep."""
    self.nComponents = nComponents

  def fit(self, X):
    """Learn the feature means and the top ``nComponents`` principal axes of X.

    X is an array-like of shape (n_samples, n_features). It is never
    modified; centering is done on a float copy.
    """
    # Out-of-place subtraction: the caller's array stays untouched and
    # integer input is promoted to float instead of raising a casting error.
    X = np.asarray(X, dtype=float)
    self.mean = np.mean(X, axis=0)
    centered = X - self.mean

    # np.cov expects variables in rows, hence the transpose. For a single
    # feature it returns a 0-d array, so force a 2-D matrix.
    cov = np.atleast_2d(np.cov(centered.T))

    # The covariance matrix is symmetric, so use eigh: it returns
    # (eigenvalues, eigenvectors) with real values, eigenvectors as columns.
    eigenvalues, eigenvectors = np.linalg.eigh(cov)
    eigenvectors = eigenvectors.T  # one eigenvector per row

    # Order the axes from largest to smallest eigenvalue (variance).
    idxs = np.argsort(eigenvalues)[::-1]
    eigenvectors = eigenvectors[idxs]

    self.components = eigenvectors[:self.nComponents]

  def transform(self, X):
    """Project X onto the fitted principal axes.

    Returns an array of shape (n_samples, nComponents). X is not modified.
    """
    centered = np.asarray(X, dtype=float) - self.mean
    return np.dot(centered, self.components.T)

In [None]:
from sklearn import datasets

# Load the iris dataset and pull out its feature matrix and target labels.
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Fit a two-component PCA on the features, then project the same samples.
pca = PCA(nComponents=2)
pca.fit(X)
XProjection = pca.transform(X)

# Report the array shapes before and after the projection.
print("Shape of X", X.shape)
print("Shape of transformed X", XProjection.shape)

In [None]:
from matplotlib import pyplot as plt

# First and second columns of the projection, one per plot axis.
x1 = XProjection[:, 0]
x2 = XProjection[:, 1]

# Scatter the projected samples, coloured by the values in y.
plt.scatter(
  x1,
  x2,
  c=y,
  alpha=0.8,
  cmap=plt.get_cmap("viridis"),
)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()