<a href="https://colab.research.google.com/github/gachet/ad-1-24/blob/main/pca/TRANSFORM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
# Original dataset X: 4 observations (rows) by 3 features (columns).
X = np.array([[11, 2, 25], [12, 20, 31], [5, 6, 7], [200, 10, 22]])

# Covariance matrix of the features. rowvar=False tells numpy.cov that each
# column of X is a variable and each row is an observation.
covX = np.cov(X, rowvar=False)

# Eigen-decomposition of covX. A covariance matrix is symmetric, so eigh is
# the appropriate routine: it returns real eigenvalues and orthonormal
# eigenvectors. The eigenvector that belongs to eigenvals[i] is the COLUMN
# eigenvecs[:, i], not the row eigenvecs[i].
eigenvals, eigenvecs = np.linalg.eigh(covX)

# Pair each eigenvalue with its eigenvector. Iterating over eigenvecs.T walks
# the columns; zipping eigenvecs directly would pair eigenvalues with rows,
# which are not eigenvectors.
pair = [(val, vec) for val, vec in zip(eigenvals, eigenvecs.T)]

# Sort the pairs by eigenvalue magnitude, largest first.
sortedPair = [
    [val, vec]
    for val, vec in sorted(pair, key=lambda p: abs(p[0]), reverse=True)
]

# Projection matrix: the sorted eigenvectors placed side by side as columns,
# so that X times W projects each observation onto the principal axes in
# descending order of eigenvalue.
W = np.column_stack([vec for _, vec in sortedPair])

def findTopPrincipalComponentNumber(eigenvals: list, threshold: float = 0.80) -> "tuple[int, float]":
    """Return how many leading components are needed to reach ``threshold``.

    The eigenvalues are sorted in descending order and each one's share of
    the eigenvalue total is accumulated until the running share is at least
    ``threshold``.

    Args:
        eigenvals: Eigenvalues in any order. A list or any other iterable of
            real numbers (for example a 1-D NumPy array) is accepted.
        threshold: Target cumulative share of the eigenvalue total, given as
            a fraction (0.95 means 95%).

    Returns:
        A tuple ``(k, covered)``: ``k`` is the number of largest eigenvalues
        that were accumulated and ``covered`` is their cumulative share as a
        float. If the threshold is never reached, every eigenvalue is counted
        and the final cumulative share is returned. Empty input yields
        ``(0, 0.0)``.

    Raises:
        ValueError: If the eigenvalues do not sum to a positive number, in
            which case the shares are undefined.
    """
    # Materialise once so one-shot iterables can be both summed and sorted.
    values = list(eigenvals)
    if not values:
        return (0, 0.0)

    total = sum(values)
    # "not total > 0" also rejects NaN, which would otherwise propagate silently.
    if not total > 0:
        raise ValueError("eigenvalues must sum to a positive number")

    covered = 0.0
    count = 0
    for count, value in enumerate(sorted(values, reverse=True), start=1):
        covered += value / total
        if covered >= threshold:
            break
    return (count, float(covered))
# Number of leading components needed to keep at least 95% of the total
# eigenvalue mass; coveredPercentage is the share actually reached, expressed
# as a fraction (not multiplied by 100).
bestK, coveredPercentage = findTopPrincipalComponentNumber(eigenvals, 0.95)

# PCA-projected dataset: X times W.
# NOTE: X is used as-is here (it is not mean-centered before projecting).
Z = np.matmul(X, W)

# Keep the first bestK columns of Z to form the dimension-reduced dataset
# pcaZ (a list of rows), matching the 0.95 threshold requested above.
pcaZ = Z[:, :bestK].tolist()

# An exact reconstruction needs the full Z, not the truncated pcaZ.
# Since Z = X times W, it follows that X = Z times inverse(W).
restoredX = np.matmul(Z, np.linalg.inv(W))

# Restored dataset; it should match the original raw dataset X up to
# floating-point rounding.
restoredX

'''
array([[ 11.,   2.,  25.],
       [ 12.,  20.,  31.],
       [  5.,   6.,   7.],
       [200.,  10.,  22.]])
'''

# Original raw dataset, shown for comparison with restoredX.
X

'''
array([[ 11,   2,  25],
       [ 12,  20,  31],
       [  5,   6,   7],
       [200,  10,  22]])
'''

'\narray([[ 11,   2,  25],\n       [ 12,  20,  31],\n       [  5,   6,   7],\n       [200,  10,  22]])\n'