# Introduction
In my previous notebook `2. Text Processing` I tokenized and stemmed the bodies and subjects of 6041 emails from the Spamassassin corpus. 

Eventually I will represent emails as one-hot vectors, with every vector representing a word in the vocabulary. Currently this would lead to a 98401-dimensional space.

In this notebook I will implement Principal Component Analysis, PCA, and use it for dimensionality reduction on my data. 

# Implementing PCA
I will start of by implementing PCA using numpy.

In [1]:
import pandas as pd
import numpy as np

In [31]:
X = np.array([[1, 2], [1,3], [1,5]])

In [45]:
X

array([[1, 2],
       [1, 3],
       [1, 5]])

In [33]:
X.mean(axis=0)

array([ 1.        ,  3.33333333])

In [34]:
# Implementation using numpys single value decomposition
class PCA_svd():
    def transform(self, X, dims):
        _, _, V = np.linalg.svd(X - X.mean(axis=0))
        V_dims = V[:dims]
        return X.dot(V_dims.T)
        

In [173]:
# Implementations using correlation matrix
class PCA():
    def fit(self, X):
        X_cov = np.cov(X.T)
        eig_vals, eig_vecs = np.linalg.eig(X_cov)
        sort_index = eig_vals.argsort()[::-1]
        self.eig_vecs = eig_vecs[sort_index]
        eig_vals = eig_vals[sort_index]
        self.cum_information = np.cumsum(eig_vals)/eig_vals.sum()
        
    def transform(self, X, dim):
        # If dim is set to a ratio, then find out how many dimensions are required to keep that ratio of information
        if 0 <= dim < 1:
            dim = np.argmax(self.cum_information > dim)+1
        return X.dot(self.eig_vecs[:dim].T)
        
        
        

In [174]:
from sklearn.decomposition import PCA as sklearnPCA

In [175]:
pca = PCA()

In [196]:
sk_pca = sklearnPCA()

In [183]:
sk_pca.fit_transform(X) + 3.33

array([[ 1.99666667],
       [ 2.99666667],
       [ 4.99666667]])

In [178]:
X

array([[1, 2],
       [1, 3],
       [1, 5]])

In [179]:
pca.fit(X)

In [180]:
pca.transform(X, .1)

array([[ 2.],
       [ 3.],
       [ 5.]])

In [184]:
pca_svd = PCA_svd()

In [186]:
pca_svd.transform(X, 1)

array([[ 2.],
       [ 3.],
       [ 5.]])

In [187]:
from numpy.random import rand

In [188]:
Y = rand(5, 5)

In [190]:
pca.fit(Y)

In [191]:
pca.cum_information

array([ 0.42237217,  0.76239816,  0.96264199,  1.        ,  1.        ])

In [193]:
pca.transform(Y, 3)

array([[-0.27139445,  0.41614238, -0.4032952 ],
       [-0.07894981,  0.37867925, -0.16529788],
       [ 0.17594156,  0.16944231, -0.51640172],
       [-0.38000649,  0.52747682, -0.3632502 ],
       [ 0.01046005,  0.4687805 , -0.24592952]])

In [197]:
sk_pca.fit(Y)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [199]:
np.cumsum(sk_pca.explained_variance_ratio_)

array([ 0.42237217,  0.76239816,  0.96264199,  1.        ,  1.        ])

In [203]:
sk_pca.transform(Y)

array([[ -9.23306768e-02,  -9.35520580e-02,  -1.30655896e-01,
          1.50028545e-01,  -3.74643787e-17],
       [ -3.50335692e-01,   1.08582635e-01,  -1.97401277e-01,
         -8.93759605e-02,   9.10290635e-18],
       [  1.68169579e-01,   4.34315758e-01,   1.44639856e-01,
          1.67358191e-02,   6.48580886e-17],
       [  4.45585812e-01,  -1.95425967e-01,  -1.20263696e-01,
         -5.43464918e-02,  -1.67884746e-17],
       [ -1.71089022e-01,  -2.53920368e-01,   3.03681014e-01,
         -2.30419119e-02,   1.00179900e-16]])

In [205]:
sk_pca.get_covariance()

array([[ 0.06746722, -0.01244519,  0.00485667,  0.01361586,  0.01228046],
       [-0.01244519,  0.05759889, -0.00502272,  0.00630198, -0.0138196 ],
       [ 0.00485667, -0.00502272,  0.00466973, -0.01254707,  0.01023402],
       [ 0.01361586,  0.00630198, -0.01254707,  0.04929082, -0.03691549],
       [ 0.01228046, -0.0138196 ,  0.01023402, -0.03691549,  0.05024934]])

In [207]:
np.cov(Y.T)

array([[ 0.06746722, -0.01244519,  0.00485667,  0.01361586,  0.01228046],
       [-0.01244519,  0.05759889, -0.00502272,  0.00630198, -0.0138196 ],
       [ 0.00485667, -0.00502272,  0.00466973, -0.01254707,  0.01023402],
       [ 0.01361586,  0.00630198, -0.01254707,  0.04929082, -0.03691549],
       [ 0.01228046, -0.0138196 ,  0.01023402, -0.03691549,  0.05024934]])