<a href="https://colab.research.google.com/github/gachet/ad-1-24/blob/main/pca/Build_PCA_(Principal_Component_Analysis)_from_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import pandas and numpy library
import pandas as pd
import numpy as np

In [None]:
# Load the Iris dataset from IRIS.csv (path is relative to the working directory)
iris = pd.read_csv('IRIS.csv')
# Display the loaded DataFrame
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [None]:
# Drop the label column (Y, `species`) so only the feature columns remain

iris_features = iris.drop(columns=['species'])

# Sanity check: (number of samples, number of features)
iris_features.shape

(150, 4)

In [None]:
# Display the feature columns that remain after dropping `species`
iris_features

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


# Standardize Data

In [None]:
#standardize dataset
# Work on a float ndarray view of the features. np.asarray accepts both a
# DataFrame and an ndarray, so re-running this cell does not fail even though
# `iris_features` is an ndarray after the first run.
features = np.asarray(iris_features, dtype=float)

# z-score every column: subtract the column mean, divide by the column's
# sample standard deviation (ddof=1)
iris_features = (features - features.mean(axis=0)) / features.std(axis=0, ddof=1)

iris_features

array([[-8.97673879e-01,  1.02861128e+00, -1.33679402e+00,
        -1.30859282e+00],
       [-1.13920048e+00, -1.24540379e-01, -1.33679402e+00,
        -1.30859282e+00],
       [-1.38072709e+00,  3.36720285e-01, -1.39346985e+00,
        -1.30859282e+00],
       [-1.50149039e+00,  1.06089953e-01, -1.28011819e+00,
        -1.30859282e+00],
       [-1.01843718e+00,  1.25924161e+00, -1.33679402e+00,
        -1.30859282e+00],
       [-5.35383973e-01,  1.95113261e+00, -1.16676652e+00,
        -1.04652483e+00],
       [-1.50149039e+00,  7.97980949e-01, -1.33679402e+00,
        -1.17755883e+00],
       [-1.01843718e+00,  7.97980949e-01, -1.28011819e+00,
        -1.30859282e+00],
       [-1.74301699e+00, -3.55170711e-01, -1.33679402e+00,
        -1.30859282e+00],
       [-1.13920048e+00,  1.06089953e-01, -1.28011819e+00,
        -1.43962681e+00],
       [-5.35383973e-01,  1.48987194e+00, -1.28011819e+00,
        -1.30859282e+00],
       [-1.25996379e+00,  7.97980949e-01, -1.22344235e+00,
      

# Compute Covariance Matrix

In [None]:
def covariance_matrix(X, n_rows):
  """Covariance matrix of the first ``n_rows`` rows of ``X``.

  Computes ``C = (1 / n_rows) * sum_i x_i x_i^T`` (see the formula for C in the
  theory section) as a single matrix product. The mean is not subtracted here
  because it was already subtracted when the data was standardized, so ``X``
  must be centered by the caller.

  Args:
    X: 2-D array-like, one sample per row and one feature per column.
    n_rows: number of leading rows of ``X`` to use; also the divisor.

  Returns:
    Square ndarray with one row and one column per feature.

  Raises:
    ValueError: if ``n_rows`` is not between 1 and the number of rows of ``X``.
  """
  X = np.asarray(X)
  if not 0 < n_rows <= X.shape[0]:
    raise ValueError(f"n_rows must be between 1 and {X.shape[0]}, got {n_rows}")
  rows = X[:n_rows]
  # rows.T @ rows equals the sum of the outer products x_i x_i^T over the rows
  return rows.T @ rows / n_rows

# use the array's own row count as n_rows so every sample is included
covariance_matrix(iris_features, iris_features.shape[0])

array([[ 0.99333333, -0.10864012,  0.86594246,  0.81250061],
       [-0.10864012,  0.99333333, -0.41771266, -0.35416713],
       [ 0.86594246, -0.41771266,  0.99333333,  0.95633872],
       [ 0.81250061, -0.35416713,  0.95633872,  0.99333333]])

# Compute Eigenvalues and Eigenvectors of the Covariance Matrix

In [None]:
def eigen_vectors(X, n_rows):
  """Eigen-decompose the covariance matrix of ``X``.

  The covariance matrix is built with ``covariance_matrix(X, n_rows)`` and
  handed to ``np.linalg.eig``; whatever ``np.linalg.eig`` returns
  (eigenvalues first, eigenvectors second) is passed back unchanged.
  """
  return np.linalg.eig(covariance_matrix(X, n_rows))

# the function returns two arrays - the first are the eigen values and the second the eigen vectors
# NOTE: this rebinds the name `eigen_vectors` from the function to the eigenvector
# array; re-run the `def` in this cell before calling the function again.
# n_rows is taken from the array itself so every sample is included.
eigen_values, eigen_vectors = eigen_vectors(iris_features, iris_features.shape[0])

In [None]:
# eigenvalues of the covariance matrix, in the order np.linalg.eig returned them
eigen_values

array([2.89141263, 0.91507946, 0.14637092, 0.02047032])

In [None]:
# eigenvector array from np.linalg.eig: column i is the eigenvector paired with eigen_values[i]
eigen_vectors

array([[ 0.52237162, -0.37231836, -0.72101681,  0.26199559],
       [-0.26335492, -0.92555649,  0.24203288, -0.12413481],
       [ 0.58125401, -0.02109478,  0.14089226, -0.80115427],
       [ 0.56561105, -0.06541577,  0.6338014 ,  0.52354627]])

# Sort Eigenvalues

In [None]:
# argsort gives the indices that order the eigenvalues ascending; reversing with
# [::-1] yields the indices from the greatest eigenvalue to the smallest

sort = np.argsort(eigen_values)[::-1]

sort

array([0, 1, 2, 3])

In [None]:
# reorder the eigenvector COLUMNS with `sort`, so column k corresponds to the
# k-th largest eigenvalue

principal_components = eigen_vectors[:,sort]

principal_components

array([[ 0.52237162, -0.37231836, -0.72101681,  0.26199559],
       [-0.26335492, -0.92555649,  0.24203288, -0.12413481],
       [ 0.58125401, -0.02109478,  0.14089226, -0.80115427],
       [ 0.56561105, -0.06541577,  0.6338014 ,  0.52354627]])

# Create a PCA Function

In [None]:
def PCA_from_Scratch(X, n_rows, n_components):
  """Return the top ``n_components`` principal axes of ``X``.

  ``X`` is expected to be centered already; the mean is not subtracted here.

  Args:
    X: 2-D array-like, one sample per row and one feature per column.
    n_rows: number of leading rows of ``X`` to use; also the covariance divisor.
    n_components: number of principal axes to keep.

  Returns:
    ndarray of shape (n_components, n_features). Each ROW is one principal
    axis, ordered by decreasing eigenvalue, so ``X.dot(result.T)`` projects
    ``X`` onto the retained axes.

  Raises:
    ValueError: if ``n_rows`` is not between 1 and the number of rows of ``X``.
  """
  X = np.asarray(X)
  if not 0 < n_rows <= X.shape[0]:
    raise ValueError(f"n_rows must be between 1 and {X.shape[0]}, got {n_rows}")
  rows = X[:n_rows]
  # Sum of the per-sample outer products x_i x_i^T, written as one matrix product.
  C = rows.T @ rows / n_rows
  eigen_values, eigen_vectors = np.linalg.eig(C)
  sort = np.argsort(eigen_values)[::-1]
  # np.linalg.eig stores each eigenvector as a COLUMN, so the leading components
  # are selected by column index (slicing rows would mix all eigenvectors).
  principal_components = eigen_vectors[:, sort[:n_components]]
  # Transpose so that each returned row is one component.
  return principal_components.T


# keep the two leading components; n_rows is taken from the array itself so
# every sample is included
principal_components_2 = PCA_from_Scratch(iris_features, iris_features.shape[0], 2)

principal_components_2






array([[ 0.52237162, -0.37231836, -0.72101681,  0.26199559],
       [-0.26335492, -0.92555649,  0.24203288, -0.12413481]])

# Project the Dataset

In [None]:

def transform(X,principal_components):
  """Project ``X`` onto the given components.

  A copy of ``X`` is multiplied by the transpose of ``principal_components``
  and the product is returned; ``X`` itself is not modified.
  """
  projected = X.copy().dot(principal_components.T)
  return projected


# Project the standardized iris data (four feature columns) onto the two
# components in `principal_components_2`, giving two columns per sample

X_transform = transform(iris_features,principal_components_2)

X_transform

array([[-0.23088481, -0.8767372 ],
       [ 0.07228808,  0.25417702],
       [-0.18475015, -0.12285598],
       [-0.24369402,  0.14984386],
       [-0.37983604, -1.058395  ],
       [-0.43903851, -1.81737315],
       [-0.42610319, -0.52052364],
       [-0.24896446, -0.61775478],
       [-0.15726062,  0.62665647],
       [-0.08877438,  0.07069892],
       [-0.25423489, -1.38535342],
       [-0.41599533, -0.54043014],
       [-0.02512557,  0.30224651],
       [-0.21794949,  0.42011231],
       [-0.13691264, -2.19410431],
       [-0.59739962, -3.00752582],
       [-0.27558159, -1.87224281],
       [-0.19655448, -0.89300308],
       [-0.19825096, -1.6830567 ],
       [-0.49502244, -1.51966987],
       [-0.07835963, -0.71753438],
       [-0.3748242 , -1.32247435],
       [-0.46871241, -0.98605022],
       [-0.0787507 , -0.4574598 ],
       [-0.53858802, -0.4992779 ],
       [ 0.05364295,  0.24980824],
       [-0.22116803, -0.63656912],
       [-0.20866572, -0.8948234 ],
       [-0.08193358,

# PCA using Scikit Learn Library

In [None]:
from sklearn.decomposition import PCA



In [None]:
# Initialize scikit-learn PCA, keeping all 4 components so the result can be
# compared with the full set of hand-derived eigenvectors
pca = PCA(n_components=4)

# Fit scikit-learn PCA on the same standardized array used in the cells above
pca.fit(iris_features)

In [None]:
# scikit-learn documents `components_` as shape (n_components, n_features),
# i.e. one component per row; transpose it so each component is a column,
# the same layout as `principal_components`

SKL_PCA_components = pca.components_.T

SKL_PCA_components


array([[ 0.52237162,  0.37231836, -0.72101681, -0.26199559],
       [-0.26335492,  0.92555649,  0.24203288,  0.12413481],
       [ 0.58125401,  0.02109478,  0.14089226,  0.80115427],
       [ 0.56561105,  0.06541577,  0.6338014 , -0.52354627]])

In [None]:
# The scikit-learn components match the hand-derived ones up to the sign of each
# column (in the outputs shown, the 2nd and 4th columns are negated). The sign of
# an eigenvector is arbitrary, so both describe the same principal axes.
principal_components

array([[ 0.52237162, -0.37231836, -0.72101681,  0.26199559],
       [-0.26335492, -0.92555649,  0.24203288, -0.12413481],
       [ 0.58125401, -0.02109478,  0.14089226, -0.80115427],
       [ 0.56561105, -0.06541577,  0.6338014 ,  0.52354627]])