# PCA using NumPy

In [1]:
import numpy as np

In [6]:
def generate_synthetic_data(n_samples, n_features, seed=None):
    """Draw standard-normal samples and scale every row to unit L2 norm.

    Parameters
    ----------
    n_samples : int
        Number of rows (observations) to generate.
    n_features : int
        Number of columns (features) per observation.
    seed : int or None, optional
        When given, samples are drawn from a dedicated
        ``np.random.default_rng(seed)`` generator, so repeated calls with
        the same seed return identical data. When ``None`` (the default),
        NumPy's global random state is used via ``np.random.randn``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, n_features)`` whose rows each have
        Euclidean norm 1.
    """
    if seed is None:
        # Global-state path: keeps honouring any np.random.seed(...) set by the caller.
        X = np.random.randn(n_samples, n_features)
    else:
        X = np.random.default_rng(seed).standard_normal((n_samples, n_features))

    # keepdims leaves the norms as a column vector so the division
    # broadcasts row by row.
    return X / np.linalg.norm(X, axis=1, keepdims=True)

In [10]:
def pca(data, num_components=None):
    """Project ``data`` onto its leading principal components.

    Parameters
    ----------
    data : array_like, shape (n_samples, n_features)
        Observations in rows, features in columns. The input is left
        untouched; centring happens on an internal float copy.
    num_components : int or None, optional
        Number of leading components to keep. ``None`` keeps all of them.

    Returns
    -------
    projected_data : numpy.ndarray, shape (n_samples, num_components)
        The centred data expressed in the basis of the selected components.
    explained_variance : numpy.ndarray, shape (n_features,)
        Fraction of the total variance carried by each component, sorted in
        descending order. Note that this covers *all* components, not only
        the ``num_components`` that were selected.
    """
    # Centre the data without touching the caller's array. An in-place
    # ``data -= mean`` would overwrite the input and raises a casting error
    # when the input has an integer dtype.
    data = np.asarray(data, dtype=float)
    centered = data - np.mean(data, axis=0)

    # Covariance matrix of the features. np.cov returns a 0-d array for a
    # single feature, so force it back to 2-D for the eigen-decomposition.
    covariance = np.atleast_2d(np.cov(centered, rowvar=False))

    # The covariance matrix is symmetric, so use eigh: it returns real
    # eigenvalues and orthonormal eigenvectors, whereas the general-purpose
    # eig may produce complex output from round-off.
    eigenvalues, eigenvectors = np.linalg.eigh(covariance)

    # Order eigenpairs by decreasing eigenvalue (eigenvectors are columns).
    order = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[order]
    eigenvectors = eigenvectors[:, order]

    # Keep the leading components; a None stop keeps every column.
    components = eigenvectors[:, :num_components]

    # Project the centred data onto the selected components.
    projected_data = np.dot(centered, components)

    # Share of the total variance carried by each component.
    explained_variance = eigenvalues / np.sum(eigenvalues)

    return projected_data, explained_variance

In [11]:
# Build a 1000 x 5 synthetic data set and reduce it to three components.
X = generate_synthetic_data(n_samples=1000, n_features=5)
projected_data, explained_variance = pca(data=X, num_components=3)

# Report the shapes before and after projection, then the variance ratios.
for _label, _value in (
    ("Original shape of data", X.shape),
    ('Project shape of data', projected_data.shape),
    ("Explained variances", explained_variance),
):
    print(_label, _value)

Original shape of data (1000, 5)
Project shape of data (1000, 3)
Explained variances [0.22200045 0.2096647  0.20288186 0.19098192 0.17447106]


In [12]:
# Bare last expression: the notebook renders the projected array as this cell's output.
projected_data

array([[-0.41230283,  0.49106111, -0.73215907],
       [ 0.82192289,  0.2432551 ,  0.2227165 ],
       [-0.89520504, -0.19664945, -0.33253233],
       ...,
       [-0.55113156, -0.07947803, -0.62984474],
       [-0.30431149, -0.14061403,  0.6798042 ],
       [-0.21028067,  0.05371506, -0.69618273]])

## PCA using scikit-learn

In [13]:
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

In [16]:
# Fit scikit-learn's PCA on the same data as a cross-check of the NumPy version.
num_components = 3

# The estimator is bound to `pca_model`, not `pca`: reusing `pca` would
# overwrite the `pca` function defined earlier in this notebook, and
# re-running the cell that calls it would then fail.
pca_model = PCA(n_components=num_components)
# NOTE(review): the misspelled name `preojected_data_sklearn` is kept in case
# later cells reference it; rename once that is confirmed.
preojected_data_sklearn = pca_model.fit_transform(X)

print("Original shape of data", X.shape)
print('Project shape of data', preojected_data_sklearn.shape)

# Variance ratio of each retained component, as reported by scikit-learn.
explained_var_ratio = pca_model.explained_variance_ratio_
print("Explained variances", explained_var_ratio)

Original shape of data (1000, 5)
Project shape of data (1000, 3)
Explained variances [0.22200045 0.2096647  0.20288186]
