In [1]:
import numpy as np
import time
from numpy.linalg import eig
from sklearn.metrics import mean_squared_error
from ipynb.fs.full.OnlineStatistics import OnlineStatistics

In [2]:
def pca_reconstruction(cov, C, variance_explained=1, profiling=False):
    """Project centred data onto the leading principal components and back.

    Parameters
    ----------
    cov : array_like, shape (d, d)
        Covariance matrix of the data. It is decomposed with a symmetric
        eigen-solver, so it is assumed to be symmetric.
    C : array_like, shape (n, d)
        Mean-centred observations, one observation per row.
    variance_explained : float, default 1
        Fraction of the total variance (sum of eigenvalues) that the kept
        components must reach. With the default of 1 all components are
        kept and the threshold search is skipped.
    profiling : bool, default False
        When true, the elapsed wall-clock time in seconds is returned too.

    Returns
    -------
    C_estimate : ndarray, shape (d, n)
        Reconstruction of ``C`` from the kept components. Note that it is
        transposed relative to ``C``.
    duration : float
        Only when ``profiling`` is true: elapsed seconds (non-negative).
    """
    if profiling:
        start = time.perf_counter()

    # eigh is the solver for symmetric matrices: it returns real values in
    # ascending order, so reverse both outputs to get the largest-variance
    # directions first.
    eig_values, eig_vectors = np.linalg.eigh(cov)
    eig_values = eig_values[::-1]
    eig_vectors = eig_vectors[:, ::-1]

    k = len(eig_values)
    if variance_explained != 1:
        trace = eig_values.sum()
        # A non-positive trace means there is no variance to apportion;
        # keep every component in that case.
        if trace > 0:
            cumulative = np.cumsum(eig_values) / trace
            reached = np.nonzero(cumulative >= variance_explained)[0]
            if reached.size:
                # First index reaching the target, converted to a count.
                k = int(reached[0]) + 1

    basis = eig_vectors[:, :k]
    Y = basis.T.dot(C.T)
    C_estimate = basis.dot(Y)

    if profiling:
        duration = time.perf_counter() - start
        return C_estimate, duration

    return C_estimate

In [3]:
def reconstruct_pca(A, variance_explained=1, profiling=False):
    """Reconstruct ``A`` through PCA using its batch mean and covariance.

    The column means of ``A`` are subtracted, the covariance of the centred
    data is computed with ``np.cov``, and both are handed to
    ``pca_reconstruction``. The means are added back to the result.

    Parameters
    ----------
    A : ndarray, shape (n, d)
        Observations, one per row.
    variance_explained : float, default 1
        Forwarded unchanged to ``pca_reconstruction``.
    profiling : bool, default False
        When true, the elapsed wall-clock time in seconds is returned too.

    Returns
    -------
    A_estimate : ndarray
        Reconstruction with the column means restored.
    duration : float
        Only when ``profiling`` is true: elapsed seconds (non-negative).
    """
    if profiling:
        start = time.perf_counter()

    M = np.mean(A, axis=0)
    C = A - M
    # np.cov returns a 0-d array when there is a single feature; promote it
    # so the eigen-decomposition always receives a square 2-D matrix.
    cov = np.atleast_2d(np.cov(C.T))
    C_estimate = pca_reconstruction(cov, C, variance_explained)
    A_estimate = C_estimate.T + M

    if profiling:
        # End minus start, so the reported duration is positive.
        duration = time.perf_counter() - start
        return A_estimate, duration

    return A_estimate

## Online Covariance

**Assuming A is a window, let's estimate its covariance using an incremental technique and then compute the PCA reconstruction**

In [4]:
def reconstruct_pca_with_online_stat(A, variance_explained=1, profiling=False):
    """Reconstruct ``A`` through PCA using incrementally estimated statistics.

    Each row of ``A`` is fed one at a time to an ``OnlineStatistics``
    instance; its ``mean`` and ``cov`` attributes are then used to centre
    the data and to drive ``pca_reconstruction``.

    Parameters
    ----------
    A : ndarray, shape (n, d)
        Observations, one per row.
    variance_explained : float, default 1
        Forwarded unchanged to ``pca_reconstruction``.
    profiling : bool, default False
        When true, the elapsed wall-clock time in seconds is returned too.

    Returns
    -------
    A_estimate : ndarray
        Reconstruction with the online mean added back.
    duration : float
        Only when ``profiling`` is true: elapsed seconds (non-negative).
    """
    if profiling:
        start = time.perf_counter()

    ostat = OnlineStatistics(A.shape[1])
    for x in A:
        # Each observation is passed as a (1, d) row.
        ostat.update(x.reshape((1, -1)))

    # NOTE(review): assumes ostat.mean broadcasts against the rows of A
    # (e.g. shape (d,) or (1, d)) — confirm against OnlineStatistics.
    C = A - ostat.mean
    C_estimate = pca_reconstruction(ostat.cov, C, variance_explained)
    A_estimate = C_estimate.T + ostat.mean

    if profiling:
        # End minus start, so the reported duration is positive.
        duration = time.perf_counter() - start
        return A_estimate, duration
    return A_estimate

In [5]:
# from ipynb.fs.full.OnlineStatistics import OnlineStatistics
# np.random.seed(1)
# rng = np.random.RandomState(1999)
# A = rng.randn(10,3) + 2
# M = np.mean(A, axis=0)
# C = A - M

# ## online technique to update covariance
# ostat = OnlineStatistics(A.shape[1])
# ostat.update(A)

# C_estimate = pca_reconstruction(ostat.cov, C, variance_explained=0.9025)
# A_estimate_online = C_estimate.T + M
# print(f'mse {mean_squared_error(A, A_estimate_online)}')

**Another way to estimate the covariance of A online and then compute the PCA reconstruction**

In [6]:
# from ipynb.fs.full.OnlineStatistics import OnlineStatistics
# M = np.mean(A, axis=0)
# C = A - M
# ostat = OnlineStatistics(A.shape[1])
# for x in A:
#     ostat.update(x.reshape((1,-1)))
# C_estimate = pca_reconstruction(ostat.cov, C, variance_explained=0.9025)
# A_estimate_online = C_estimate.T + M
# print(f'mse {mean_squared_error(A, A_estimate_online)}')

In [7]:
# from ipynb.fs.full.OnlineStatistics import OnlineStatistics
# ostat = OnlineStatistics(A.shape[1])
# for x in A:
#     ostat.update(x.reshape((1,-1)))
# C = A - ostat.mean
# C_estimate = pca_reconstruction(ostat.cov, C, variance_explained=0.9025)
# A_estimate_online = C_estimate.T + ostat.mean
# print(f'mse {mean_squared_error(A, A_estimate_online)}')

In [8]:
# A_estimate_online = reconstruct_pca_with_online_stat(A, variance_explained=0.9025, profiling=False)
# print(f'mse {mean_squared_error(A, A_estimate_online)}')

For each individual technique, study the impact of the hyperparameters, and then compare the techniques on different datasets (different matrix sizes).