In [1]:
import numpy as np
from numpy.linalg import eig
from sklearn.metrics import mean_squared_error

In [2]:
def pca_reconstruction(cov, C, variance_explained=1):
    eig_values, eig_vectors = eig(cov)
    k = len(eig_values)
    if variance_explained != 1:
        trace = sum(eig_values)
        var_explained = 0
        for i, val in enumerate(eig_values):
            var_explained += val/trace
#             print(f'{i} - {var_explained}')
            if var_explained >= variance_explained:
                k = i
    Y = eig_vectors[:,0:k].T.dot(C.T)
    C_estimate = eig_vectors[:,0:k].dot(Y)
    return C_estimate

In [3]:
## generate 3 dimension data set of 10 points
np.random.seed(1)
rng = np.random.RandomState(1999)
A = rng.randn(10, 3) + 2
print('A')
print(A)
M = np.mean(A, axis=0)
C = A - M
cov = np.cov(C.T)
C_estimate = pca_reconstruction(cov, C, variance_explained=0.9025)
A_estimate = C_estimate.T + M
print('A_estimate')
print(A_estimate)
print(f'mse {mean_squared_error(A, A_estimate)}')

A
[[1.68251986 2.69206233 0.71562236]
 [2.39334583 2.21180288 1.58854318]
 [2.94547341 2.90643341 1.45884194]
 [3.20463807 1.92962428 2.70887691]
 [0.96984499 1.03195449 3.3538902 ]
 [2.80426695 1.64460456 1.19536682]
 [2.2470435  1.8595206  1.9274096 ]
 [0.68802366 0.16304126 1.75841876]
 [1.16916187 0.6220643  1.90347192]
 [2.09439562 1.49333031 2.62422045]]
A_estimate
[[2.23893731 2.70507408 1.79826347]
 [2.44874966 2.21309849 1.69634439]
 [2.93485582 2.90618512 1.43818292]
 [2.63649479 1.91633832 1.60342043]
 [0.6107843  1.0235579  2.65525311]
 [2.93642103 1.64769497 1.45250369]
 [2.19698175 1.85834991 1.83000268]
 [1.04066331 0.1712877  2.44456219]
 [1.36007201 0.62652871 2.27493252]
 [1.79475379 1.48632322 2.04119674]]
mse 0.16514016595783557


In [4]:
mse = (np.square(A - A_estimate)).mean(axis=1)
mse

array([4.93960488e-01, 4.89745438e-03, 1.79863365e-04, 5.14999110e-01,
       2.05696288e-01, 2.78645395e-02, 3.99855285e-03, 1.98405179e-01,
       5.81498629e-02, 1.43250322e-01])

## Online Covariance

**Assuming A is a window, let's estimate its covariance using incremental technique and then compute the PCA reconstruction**

In [5]:
from ipynb.fs.full.OnlineStatistics import OnlineStatistics
M = np.mean(A, axis=0)
C = A - M

## online technique to update covariance
ostat = OnlineStatistics(A.shape[1])
ostat.update(A)

C_estimate = pca_reconstruction(ostat.cov, C, variance_explained=0.9025)
A_estimate_online = C_estimate.T + M
print(f'mse {mean_squared_error(A, A_estimate_online)}')

mse 0.16514016595783557


**Another way to estimate covariance of A online and then compute the PCA reconstruction**

In [9]:
from ipynb.fs.full.OnlineStatistics import OnlineStatistics
M = np.mean(A, axis=0)
C = A - M
ostat = OnlineStatistics(A.shape[1])
for x in A:
    ostat.update(x.reshape((1,-1)))
C_estimate = pca_reconstruction(ostat.cov, C, variance_explained=0.9025)
A_estimate_online = C_estimate.T + M
print(f'mse {mean_squared_error(A, A_estimate_online)}')

mse 2.120063682781469e-31


In [7]:
print(f'A_estimate_online \n{A_estimate_online}')
print(f'A_estimate \n{A_estimate}')

A_estimate_online 
[[2.23893731 2.70507408 1.79826347]
 [2.44874966 2.21309849 1.69634439]
 [2.93485582 2.90618512 1.43818292]
 [2.63649479 1.91633832 1.60342043]
 [0.6107843  1.0235579  2.65525311]
 [2.93642103 1.64769497 1.45250369]
 [2.19698175 1.85834991 1.83000268]
 [1.04066331 0.1712877  2.44456219]
 [1.36007201 0.62652871 2.27493252]
 [1.79475379 1.48632322 2.04119674]]
A_estimate 
[[2.23893731 2.70507408 1.79826347]
 [2.44874966 2.21309849 1.69634439]
 [2.93485582 2.90618512 1.43818292]
 [2.63649479 1.91633832 1.60342043]
 [0.6107843  1.0235579  2.65525311]
 [2.93642103 1.64769497 1.45250369]
 [2.19698175 1.85834991 1.83000268]
 [1.04066331 0.1712877  2.44456219]
 [1.36007201 0.62652871 2.27493252]
 [1.79475379 1.48632322 2.04119674]]
