In [1]:
import numpy as np

In [92]:
# Raw dataset X: one observation per row, one feature per column (4 x 3).
X = np.array([
    [11, 2, 25],
    [12, 20, 31],
    [5, 6, 7],
    [200, 10, 22],
])

In [166]:
# Covariance matrix of the features of X.
# np.cov treats each ROW as a variable by default; X stores one observation
# per row, so rowvar=False is required to get the feature-by-feature matrix.
# https://stackoverflow.com/questions/43174624/numpy-covariance-matrix-numpy-cov

covX = np.cov(X, rowvar=False)

In [167]:
covX

array([[9098.        ,   42.        ,   79.        ],
       [  42.        ,   59.66666667,   41.5       ],
       [  79.        ,   41.5       ,  104.25      ]])

In [168]:
# Eigen-decomposition of the covariance matrix.
# np.linalg.eig returns the eigenvectors as the COLUMNS of eigenvecs:
# eigenvecs[:, i] is the eigenvector belonging to eigenvals[i].
eigenvals, eigenvecs = np.linalg.eig(covX)

In [169]:
eigenvals

array([9098.89241304,  128.17622396,   34.84802966])

In [170]:
eigenvecs

array([[ 9.99950260e-01,  9.96128016e-03, -5.00975875e-04],
       [ 4.68660548e-03, -5.13610200e-01, -8.58010838e-01],
       [ 8.80419265e-03, -8.57965812e-01,  5.13631338e-01]])

In [171]:
# Pair each eigenvalue with an entry taken from eigenvecs.
# NOTE(review): zip() walks eigenvecs row by row, whereas np.linalg.eig
# returns the eigenvectors as the COLUMNS of its second result. Each
# eigenvalue is therefore paired with a matrix row, not with its own
# eigenvector (that pairing would be zip(eigenvals, eigenvecs.T)).
# Left unchanged here because later cells may depend on the current row
# layout -- TODO confirm and fix together with those consumers.
pair = [(i[0], i[1]) for i in zip(eigenvals, eigenvecs)]
# Sort the pairs by absolute eigenvalue, largest first; each entry becomes a
# two-element list [eigenvalue, row].
sortedPair=[[i[0], i[1]] for i in sorted(pair, key=lambda x: abs(x[0]), reverse=True)]
        
        




In [172]:
sortedPair

[[9098.892413038524,
  array([ 9.99950260e-01,  9.96128016e-03, -5.00975875e-04])],
 [128.17622396429107, array([ 0.00468661, -0.5136102 , -0.85801084])],
 [34.84802966385698, array([ 0.00880419, -0.85796581,  0.51363134])]]

In [173]:
# Build the projection matrix W straight from the eigen-decomposition.
# np.linalg.eig returns eigenvectors as the COLUMNS of eigenvecs (column i
# belongs to eigenvals[i]), so the columns -- not the rows -- are reordered by
# descending absolute eigenvalue. The columns of W are then the principal
# axes, which is the layout that the projection X @ W requires.
# A stable sort keeps tied eigenvalues in their original relative order.
sortedOrder = np.argsort(-np.abs(eigenvals), kind="stable")
W = eigenvecs[:, sortedOrder]

In [174]:
W

array([[ 9.99950260e-01,  9.96128016e-03, -5.00975875e-04],
       [ 4.68660548e-03, -5.13610200e-01, -8.58010838e-01],
       [ 8.80419265e-03, -8.57965812e-01,  5.13631338e-01]])

In [175]:
def findTopPrincipalComponentNumber(eigenvals: "list[float] | np.ndarray", threshold: float = 0.80) -> "tuple[int, float]":
    '''Return the smallest number of top components whose share reaches ``threshold``.

    The eigenvalues are sorted in descending order and each one is divided by
    the sum of all eigenvalues; these ratios are accumulated until the running
    total is at least ``threshold``.

    Parameters
    ----------
    eigenvals : re-iterable collection of numbers (list or 1-D array)
        Eigenvalues of the covariance matrix, in any order.
    threshold : float, default 0.80
        Fraction of the eigenvalue sum that must be covered.

    Returns
    -------
    (k, covered) : tuple of (int, float)
        ``k`` is the number of largest eigenvalues that were accumulated and
        ``covered`` the cumulative fraction they represent. If the threshold
        is never reached, ``k`` is the total number of eigenvalues.

    Raises
    ------
    ValueError
        If ``eigenvals`` is empty or its values sum to zero, because the
        ratios are undefined in both cases.
    '''
    # Largest eigenvalue first, so components are taken in order of importance.
    orderedVals = sorted(eigenvals, reverse=True)
    if not orderedVals:
        raise ValueError("eigenvals must contain at least one eigenvalue")

    total = sum(eigenvals)
    if total == 0:
        raise ValueError("eigenvalues sum to zero; the covered fraction is undefined")

    cumulativePercent = 0.0
    componentCount = 0
    for componentCount, val in enumerate(orderedVals, start=1):
        cumulativePercent += val / total
        if cumulativePercent >= threshold:
            break
    return (componentCount, float(cumulativePercent))

In [176]:
# Smallest number of components covering at least 95% of the eigenvalue sum,
# together with the fraction actually covered.
bestK, coveredPercentage = findTopPrincipalComponentNumber(eigenvals, threshold=0.95)

In [177]:
bestK

1

In [178]:
coveredPercentage

0.982398432258102

In [179]:
W

array([[ 9.99950260e-01,  9.96128016e-03, -5.00975875e-04],
       [ 4.68660548e-03, -5.13610200e-01, -8.58010838e-01],
       [ 8.80419265e-03, -8.57965812e-01,  5.13631338e-01]])

In [180]:
X

array([[ 11,   2,  25],
       [ 12,  20,  31],
       [  5,   6,   7],
       [200,  10,  22]])

In [181]:
# Project the dataset onto the eigenvector basis: Z = X times W.
# NOTE(review): the raw X is projected without subtracting the column means;
# textbook PCA projects mean-centred data -- confirm this is intended.
Z = X @ W



In [182]:
Z

array([[ 11.22893088, -22.36679163,  11.11925103],
       [ 12.3660652 , -36.74960882,  -1.243657  ],
       [  5.08950028,  -9.03761549,  -1.55515054],
       [200.23061024, -22.01909384,   2.61958588]])

In [183]:
# Keep the first bestK columns of Z to form the dimension-reduced dataset pcaZ.
# For this data bestK is 1 and the retained share (coveredPercentage) is ~98%.
# Slicing on bestK instead of a hard-coded column count keeps this cell in
# sync with the threshold chosen above; .tolist() preserves the original
# list-of-lists result.
pcaZ = Z[:, :bestK].tolist()
pcaZ

[[11.22893088422844],
 [12.366065198478557],
 [5.089500280055335],
 [200.23061023781398]]

In [184]:
# Reconstruct the original data from the projection.
# Since Z = X times W, it follows that X = Z times inverse(W).
# Exact recovery needs the full Z: pcaZ has dropped columns, so it no longer
# carries all the information required to get X back exactly.
restoredX = Z @ np.linalg.inv(W)

In [185]:
#Restored dataset; it should match the original raw dataset X (up to floating-point rounding)
restoredX

array([[ 11.,   2.,  25.],
       [ 12.,  20.,  31.],
       [  5.,   6.,   7.],
       [200.,  10.,  22.]])

In [186]:
#Original raw dataset
X

array([[ 11,   2,  25],
       [ 12,  20,  31],
       [  5,   6,   7],
       [200,  10,  22]])