In [1]:
import numpy as np
import pandas as pd
import time
from ipynb.fs.full.OnlineStatistics import OnlineStatistics
from ipynb.fs.full.PCA import reconstruct_pca
from ipynb.fs.full.PCA import reconstruct_pca_with_online_stat
from ipynb.fs.full.IncrementalPCA import reconstruct_incremental_pca
from ipynb.fs.full.SPIRIT import SPIRIT
from sklearn.decomposition import IncrementalPCA
from sklearn.metrics import mean_squared_error
import logging
logging.basicConfig(format='%(message)s', level=logging.INFO)

In [2]:
def generate_random_data(nobjects=10, ndim=3, seed=1999):
    """Generate a reproducible synthetic data matrix.

    Samples are drawn from a standard normal distribution with a private
    ``numpy.random.RandomState`` and shifted by +2.

    Parameters
    ----------
    nobjects : int, default 10
        Number of rows of the returned matrix.
    ndim : int, default 3
        Number of columns of the returned matrix.
    seed : int, default 1999
        Seed of the private generator. The default reproduces the matrix
        produced before this parameter existed.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(nobjects, ndim)``.
    """
    # NOTE(review): this reseeds the *global* NumPy RNG on every call even
    # though the data below comes from a private generator. It is kept only so
    # that code running afterwards sees the same global RNG state as before;
    # confirm whether anything depends on it and drop it if not.
    np.random.seed(1)
    rng = np.random.RandomState(seed)
    return rng.randn(nobjects, ndim) + 2

In [3]:
def build_estimation_func_dictionary(dataset, nobjects, ndims,
                                     mse_pca, mse_pca_online, mse_inc_pca, mse_spirit,
                                     time_pca, time_pca_online, time_inc_pca, time_spirit):
    """Bundle benchmark columns into a single dictionary.

    Every argument is stored unchanged under a fixed key. ``nobjects`` is
    stored as ``'total_data'`` and ``ndims`` as ``'dimension'``; all other
    keys match their argument names. Keys are inserted in signature order.

    Returns
    -------
    dict
        Mapping from column name to the corresponding argument.
    """
    return {
        'dataset': dataset,
        'total_data': nobjects,
        'dimension': ndims,
        'mse_pca': mse_pca,
        'mse_pca_online': mse_pca_online,
        'mse_inc_pca': mse_inc_pca,
        'mse_spirit': mse_spirit,
        'time_pca': time_pca,
        'time_pca_online': time_pca_online,
        'time_inc_pca': time_inc_pca,
        'time_spirit': time_spirit,
    }

In [4]:
def run_spirit(A):
    """Stream the rows of ``A`` through SPIRIT and score the reconstruction.

    Each row is passed to ``SPIRIT`` in order; the per-row estimates it
    returns are stacked into a matrix that is compared with ``A`` using
    ``mean_squared_error``.

    Parameters
    ----------
    A : numpy.ndarray
        2-D data matrix; rows are fed to SPIRIT one at a time.

    Returns
    -------
    tuple
        ``(total_time_spirit, mse_spirit)``: the sum of the per-call timings
        reported by ``SPIRIT(..., profiling=True)`` and the mean squared error
        between ``A`` and the stacked estimates.

    Raises
    ------
    ValueError
        If ``A`` has no rows, since there is nothing to reconstruct.
    """
    if len(A) == 0:
        raise ValueError('run_spirit requires at least one row of data')

    n_dims = A.shape[1]
    # initialize SPIRIT parameters
    W = np.identity(n_dims)
    k = 1
    E = 0
    E_est = 0
    t = 1
    d = 0.01 * np.ones(n_dims)
    fE = 0.95
    FE = 0.98
    lamda = 1

    # Collect estimates in a list and stack once at the end: stacking inside
    # the loop re-copies every previous row on each iteration (quadratic), and
    # leaves a single-row input as the raw estimate instead of a 2-D matrix.
    estimates = []
    total_time_spirit = 0
    for xt in A:
        # NOTE(review): t, E and E_est are passed with the same values on every
        # call; only W, d and k are carried over from SPIRIT's return value.
        # Confirm against SPIRIT's contract that this is intended.
        W, d, k, xt_estimate, time_spirit = SPIRIT(xt, t, E, E_est, W, d, k, lamda, fE, FE, profiling=True)
        estimates.append(xt_estimate)
        total_time_spirit = total_time_spirit + time_spirit

    A_spirit = np.vstack(estimates)
    mse_spirit = mean_squared_error(A, A_spirit)
    return total_time_spirit, mse_spirit

In [5]:
def run_pca(A, variance_explained=0.9025):
    """Reconstruct ``A`` with ``reconstruct_pca`` and score the result.

    Parameters
    ----------
    A : array-like
        Data matrix handed to ``reconstruct_pca``.
    variance_explained : float, default 0.9025
        Forwarded unchanged to ``reconstruct_pca``.

    Returns
    -------
    tuple
        ``(time_pca, mse_pca)``: the timing reported by ``reconstruct_pca``
        when called with ``profiling=True`` and the mean squared error between
        ``A`` and its reconstruction.
    """
    reconstruction, elapsed = reconstruct_pca(A, variance_explained, profiling=True)
    return elapsed, mean_squared_error(A, reconstruction)

In [6]:
def run_pca_with_online_stat(A, variance_explained=0.9025):
    """Reconstruct ``A`` with ``reconstruct_pca_with_online_stat`` and score it.

    Parameters
    ----------
    A : array-like
        Data matrix handed to ``reconstruct_pca_with_online_stat``.
    variance_explained : float, default 0.9025
        Forwarded unchanged to ``reconstruct_pca_with_online_stat``.

    Returns
    -------
    tuple
        ``(time_pca_online, mse_pca_online)``: the timing reported by the
        reconstruction call (made with ``profiling=True``) and the mean
        squared error between ``A`` and its reconstruction.
    """
    reconstruction, elapsed = reconstruct_pca_with_online_stat(
        A, variance_explained, profiling=True
    )
    return elapsed, mean_squared_error(A, reconstruction)

In [7]:
def run_incremental_pca(A, variance_explained=0.9025):
    """Reconstruct ``A`` with scikit-learn's ``IncrementalPCA`` and score it.

    The time spent constructing the ``IncrementalPCA`` object is measured
    with ``time.perf_counter`` and added to the timing reported by
    ``reconstruct_incremental_pca``.

    Parameters
    ----------
    A : array-like
        Data matrix handed to ``reconstruct_incremental_pca``.
    variance_explained : float, default 0.9025
        Forwarded unchanged to ``reconstruct_incremental_pca``.

    Returns
    -------
    tuple
        ``(time_inc_pca, mse_inc_pca)``: construction time plus reported
        reconstruction time, and the mean squared error between ``A`` and
        its reconstruction.
    """
    # Time only the estimator construction here; the helper reports the rest.
    started = time.perf_counter()
    estimator = IncrementalPCA()
    setup_duration = time.perf_counter() - started

    reconstruction, elapsed = reconstruct_incremental_pca(
        A, estimator, variance_explained, profiling=True
    )
    elapsed += setup_duration
    return elapsed, mean_squared_error(A, reconstruction)

In [8]:
def build_experiment_dataframe(filenames, time='miliseconds', variance_explained=0.9025):
    """Benchmark the four reconstruction methods on CSV datasets.

    For every file the ``label`` column is dropped, the remaining values are
    passed to ``run_pca``, ``run_pca_with_online_stat``,
    ``run_incremental_pca`` and ``run_spirit``, and one result row is
    recorded.

    Parameters
    ----------
    filenames : iterable of str
        Paths of CSV files. Each file must contain a ``label`` column.
    time : {'miliseconds', 'milliseconds', 'seconds'}, default 'miliseconds'
        Unit of the reported ``time_*`` columns. The historical spelling
        ``'miliseconds'`` stays the default; timings returned by the runners
        are multiplied by 1000 for it and left unscaled for ``'seconds'``.
        Note that this parameter shadows the ``time`` module inside the
        function.
    variance_explained : float, default 0.9025
        Forwarded to the three PCA runners (``run_spirit`` does not take it).

    Returns
    -------
    pandas.DataFrame
        One row per file with columns ``dataset``, ``total_data``,
        ``dimension``, ``mse_*`` and ``time_*``.

    Raises
    ------
    ValueError
        If ``time`` is not one of the supported unit names.
    """
    import os  # local import so this cell does not depend on the imports cell

    # Resolve the unit before any work is done. Previously ``multiplier`` was
    # only bound for 'miliseconds', so any other value failed with an
    # UnboundLocalError after the first dataset had already been processed.
    if time in ('miliseconds', 'milliseconds'):
        multiplier = 1000
    elif time == 'seconds':
        multiplier = 1
    else:
        raise ValueError(
            f"Unsupported time unit {time!r}; expected 'miliseconds' or 'seconds'"
        )

    columns = ['dataset', 'total_data', 'dimension',
               'mse_pca', 'mse_pca_online', 'mse_inc_pca', 'mse_spirit',
               'time_pca', 'time_pca_online', 'time_inc_pca', 'time_spirit']
    records = []

    for filename in filenames:
        logging.info(f'Running {filename}')
        # Dataset name = file name without directory and extension.
        ds_name = os.path.splitext(os.path.basename(filename))[0]
        raw = pd.read_csv(filename)
        data = raw.drop(['label'], axis=1).values

        # The two large datasets are capped at their first 18000 rows.
        if 'covtype' in filename or 'kddcup1999' in filename:
            data = data[0:18000,:]

        t_pca, m_pca = run_pca(data, variance_explained)
        t_pca_online, m_pca_online = run_pca_with_online_stat(data, variance_explained)
        t_inc_pca, m_inc_pca = run_incremental_pca(data, variance_explained)
        t_spirit, m_spirit = run_spirit(data)

        records.append({
            'dataset': ds_name,
            'total_data': data.shape[0],
            'dimension': data.shape[1],
            'mse_pca': m_pca,
            'mse_pca_online': m_pca_online,
            'mse_inc_pca': m_inc_pca,
            'mse_spirit': m_spirit,
            'time_pca': t_pca * multiplier,
            'time_pca_online': t_pca_online * multiplier,
            'time_inc_pca': t_inc_pca * multiplier,
            'time_spirit': t_spirit * multiplier,
        })

    # Passing ``columns`` keeps the column set and order even with no files.
    return pd.DataFrame(records, columns=columns)

In [9]:
def build_experiment_synthetic_data(metadata, time='miliseconds', variance_explained=0.9025):
    """Benchmark the four reconstruction methods on synthetic matrices.

    For every ``(nobjects, ndim)`` entry a matrix is created with
    ``generate_random_data`` and passed to ``run_pca``,
    ``run_pca_with_online_stat``, ``run_incremental_pca`` and ``run_spirit``;
    one result row is recorded per entry.

    Parameters
    ----------
    metadata : iterable of (int, int)
        Shapes to benchmark; element 0 is the row count and element 1 the
        column count given to ``generate_random_data``.
    time : {'miliseconds', 'milliseconds', 'seconds'}, default 'miliseconds'
        Unit of the reported ``time_*`` columns. The historical spelling
        ``'miliseconds'`` stays the default; timings returned by the runners
        are multiplied by 1000 for it and left unscaled for ``'seconds'``.
        Note that this parameter shadows the ``time`` module inside the
        function.
    variance_explained : float, default 0.9025
        Forwarded to the three PCA runners (``run_spirit`` does not take it).

    Returns
    -------
    pandas.DataFrame
        One row per entry with columns ``dataset`` (the shape rendered as
        ``'(rows, cols)'``), ``total_data``, ``dimension``, ``mse_*`` and
        ``time_*``.

    Raises
    ------
    ValueError
        If ``time`` is not one of the supported unit names.
    """
    # Resolve the unit before any work is done. Previously ``multiplier`` was
    # only bound for 'miliseconds', so any other value failed with an
    # UnboundLocalError after the first benchmark had already run.
    if time in ('miliseconds', 'milliseconds'):
        multiplier = 1000
    elif time == 'seconds':
        multiplier = 1
    else:
        raise ValueError(
            f"Unsupported time unit {time!r}; expected 'miliseconds' or 'seconds'"
        )

    columns = ['dataset', 'total_data', 'dimension',
               'mse_pca', 'mse_pca_online', 'mse_inc_pca', 'mse_spirit',
               'time_pca', 'time_pca_online', 'time_inc_pca', 'time_spirit']
    records = []

    for info in metadata:
        logging.info(f'Running {info}')
        title = f'({info[0]}, {info[1]})'
        data = generate_random_data(nobjects=info[0], ndim=info[1])

        t_pca, m_pca = run_pca(data, variance_explained)
        t_pca_online, m_pca_online = run_pca_with_online_stat(data, variance_explained)
        t_inc_pca, m_inc_pca = run_incremental_pca(data, variance_explained)
        t_spirit, m_spirit = run_spirit(data)

        records.append({
            'dataset': title,
            'total_data': data.shape[0],
            'dimension': data.shape[1],
            'mse_pca': m_pca,
            'mse_pca_online': m_pca_online,
            'mse_inc_pca': m_inc_pca,
            'mse_spirit': m_spirit,
            'time_pca': t_pca * multiplier,
            'time_pca_online': t_pca_online * multiplier,
            'time_inc_pca': t_inc_pca * multiplier,
            'time_spirit': t_spirit * multiplier,
        })

    # Passing ``columns`` keeps the column set and order even with no entries.
    return pd.DataFrame(records, columns=columns)

## Test on synthetic dataset

In [10]:
# Synthetic benchmark grid: each entry is (number of rows, number of columns)
# and is turned into a matrix by generate_random_data inside
# build_experiment_synthetic_data.
metadata = ((10,3),
            (1000,3),
            (1000,20),
            (1000,100),
            (10000,3),
            (10000,20),
            (10000,100)
            )
# Run every method on every shape with variance_explained=0.9025 and write the
# resulting table to a pickle file next to the notebook.
df_synthetic = build_experiment_synthetic_data(metadata, variance_explained=0.9025)
df_synthetic.to_pickle('./estimation_synthetic.pkl')

Running (10, 3)
Running (1000, 3)
Running (1000, 20)
Running (1000, 100)
Running (10000, 3)
Running (10000, 20)
Running (10000, 100)


In [11]:
# Show the table rounded to two decimals; with the default time='miliseconds'
# the time_* columns hold the runners' timings multiplied by 1000.
df_synthetic.round(decimals=2)

Unnamed: 0,dataset,total_data,dimension,mse_pca,mse_pca_online,mse_inc_pca,mse_spirit,time_pca,time_pca_online,time_inc_pca,time_spirit
0,"(10, 3)",10,3,0.17,0.17,0.05,0.23,4.69,1.98,2.76,6.41
1,"(1000, 3)",1000,3,0.32,0.32,0.32,0.35,4.2,161.7,8.42,543.11
2,"(1000, 20)",1000,20,0.05,0.05,0.04,0.2,19.87,147.87,16.16,1710.18
3,"(1000, 100)",1000,100,0.01,0.01,0.0,0.21,41.12,499.0,37.98,4455.85
4,"(10000, 3)",10000,3,0.33,0.33,0.32,0.35,6.45,1107.33,9.06,5537.34
5,"(10000, 20)",10000,20,0.05,0.05,0.05,0.2,14.98,1311.6,24.14,17905.94
6,"(10000, 100)",10000,100,0.01,0.01,0.01,0.17,85.72,4095.45,137.06,90879.59


In [12]:
# Same synthetic grid, rerun with variance_explained=0.5; results go to a
# separate pickle file.
df_synthetic2 = build_experiment_synthetic_data(metadata, variance_explained=0.5)
df_synthetic2.to_pickle('./estimation_synthetic2.pkl')

Running (10, 3)
Running (1000, 3)
Running (1000, 20)
Running (1000, 100)
Running (10000, 3)
Running (10000, 20)
Running (10000, 100)


In [13]:
# Show the variance_explained=0.5 results rounded to two decimals.
df_synthetic2.round(decimals=2)

Unnamed: 0,dataset,total_data,dimension,mse_pca,mse_pca_online,mse_inc_pca,mse_spirit,time_pca,time_pca_online,time_inc_pca,time_spirit
0,"(10, 3)",10,3,0.17,0.17,0.05,0.23,0.94,2.18,2.23,5.16
1,"(1000, 3)",1000,3,0.32,0.32,0.32,0.35,2.74,144.73,3.1,445.43
2,"(1000, 20)",1000,20,0.05,0.05,0.04,0.2,3.79,145.49,5.5,1601.81
3,"(1000, 100)",1000,100,0.01,0.01,0.0,0.21,42.38,485.68,22.89,4057.19
4,"(10000, 3)",10000,3,0.33,0.33,0.32,0.35,4.83,848.65,8.21,5064.95
5,"(10000, 20)",10000,20,0.05,0.05,0.05,0.2,17.97,1276.76,59.19,18842.25
6,"(10000, 100)",10000,100,0.01,0.01,0.01,0.17,110.73,4269.2,137.3,97585.34


In [14]:
# Same synthetic grid, rerun with variance_explained=1; results go to a
# separate pickle file.
df_synthetic3 = build_experiment_synthetic_data(metadata, variance_explained=1)
df_synthetic3.to_pickle('./estimation_synthetic3.pkl')

Running (10, 3)
Running (1000, 3)
Running (1000, 20)
Running (1000, 100)
Running (10000, 3)
Running (10000, 20)
Running (10000, 100)


In [15]:
# Show the variance_explained=1 results rounded to two decimals.
df_synthetic3.round(decimals=2)

Unnamed: 0,dataset,total_data,dimension,mse_pca,mse_pca_online,mse_inc_pca,mse_spirit,time_pca,time_pca_online,time_inc_pca,time_spirit
0,"(10, 3)",10,3,0.0,0.0,0.0,0.23,1.08,2.41,4.66,8.63
1,"(1000, 3)",1000,3,0.0,0.0,0.0,0.35,1.23,149.54,3.61,569.71
2,"(1000, 20)",1000,20,0.0,0.0,0.0,0.2,6.19,172.85,5.78,1651.99
3,"(1000, 100)",1000,100,0.0,0.0,0.0,0.21,28.54,547.81,22.82,4772.35
4,"(10000, 3)",10000,3,0.0,0.0,0.0,0.35,4.04,1005.22,8.14,5898.83
5,"(10000, 20)",10000,20,0.0,0.0,0.0,0.2,12.7,1482.71,28.07,18401.14
6,"(10000, 100)",10000,100,0.0,0.0,0.0,0.17,96.92,5267.56,151.77,134141.09


## Test on outlier detection dataset

In [16]:
# CSV datasets to benchmark, given as paths relative to this notebook.
# build_experiment_dataframe drops a 'label' column from each file and only
# uses the first 18000 rows of the covtype and kddcup1999 files.
filenames = ('../../data/xpacs/arrythmiadata.csv',
             '../../data/xpacs/breastdata.csv',
             '../../data/xpacs/digits2data.csv',
             '../../data/xpacs/digitsdata.csv',
             '../../data/xpacs/winedata.csv',
             '../../data/xpacs/yeastdata.csv',
             '../../data/forestcover/covtype.csv',
             '../../data/kddcup1999/kddcup1999_10percent_number_only.csv'
            )
# Run every method on every file with variance_explained=0.9025 and write the
# resulting table to a pickle file next to the notebook.
df = build_experiment_dataframe(filenames, variance_explained=0.9025)
df.to_pickle('./estimation_1.pkl')

Running ../../data/xpacs/arrythmiadata.csv
Running ../../data/xpacs/breastdata.csv
Running ../../data/xpacs/digits2data.csv
Running ../../data/xpacs/digitsdata.csv
Running ../../data/xpacs/winedata.csv
Running ../../data/xpacs/yeastdata.csv
Running ../../data/forestcover/covtype.csv
Running ../../data/kddcup1999/kddcup1999_10percent_number_only.csv


In [17]:
# Show the table rounded to two decimals; with the default time='miliseconds'
# the time_* columns hold the runners' timings multiplied by 1000.
df.round(decimals=2)

Unnamed: 0,dataset,total_data,dimension,mse_pca,mse_pca_online,mse_inc_pca,mse_spirit,time_pca,time_pca_online,time_inc_pca,time_spirit
0,arrythmiadata,332,172,0.0,0.0,0.0,0.01,63.67,410.81,33.13,685.81
1,breastdata,683,9,0.0,0.0,0.0,0.0,2.69,94.76,4.09,696.93
2,digits2data,1266,16,0.0,0.0,0.0,0.02,6.14,199.69,5.08,1232.73
3,digitsdata,1371,16,0.0,0.0,0.0,0.02,3.92,254.8,65.65,798.89
4,winedata,95,13,0.0,0.0,0.0,0.01,0.95,19.89,7.8,94.77
5,yeastdata,592,8,0.0,0.0,0.0,0.01,1.58,83.69,3.22,325.14
6,covtype,18000,55,0.0,0.0,0.0,87468.55,97.45,6012.34,159.91,15956.3
7,kddcup1999_10percent_number_only,18000,35,0.0,0.0,0.0,1245613.82,44.7,4329.54,85.85,14042.41


In [18]:
# Same files, rerun with variance_explained=0.5; results go to a separate
# pickle file.
df2 = build_experiment_dataframe(filenames, variance_explained=0.5)
df2.to_pickle('./estimation_2.pkl')

Running ../../data/xpacs/arrythmiadata.csv
Running ../../data/xpacs/breastdata.csv
Running ../../data/xpacs/digits2data.csv
Running ../../data/xpacs/digitsdata.csv
Running ../../data/xpacs/winedata.csv
Running ../../data/xpacs/yeastdata.csv
Running ../../data/forestcover/covtype.csv
Running ../../data/kddcup1999/kddcup1999_10percent_number_only.csv


In [20]:
# Show the variance_explained=0.5 results rounded to two decimals.
df2.round(decimals=2)

Unnamed: 0,dataset,total_data,dimension,mse_pca,mse_pca_online,mse_inc_pca,mse_spirit,time_pca,time_pca_online,time_inc_pca,time_spirit
0,arrythmiadata,332,172,0.0,0.0,0.0,0.01,57.65,449.03,28.96,421.48
1,breastdata,683,9,0.0,0.0,0.0,0.0,2.11,94.49,3.25,629.99
2,digits2data,1266,16,0.0,0.0,0.0,0.02,2.45,189.79,4.21,1023.82
3,digitsdata,1371,16,0.0,0.0,0.0,0.02,2.5,194.63,4.87,903.71
4,winedata,95,13,0.0,0.0,0.0,0.01,1.15,15.39,2.96,71.83
5,yeastdata,592,8,0.0,0.0,0.0,0.01,1.63,83.06,3.21,345.43
6,covtype,18000,55,0.0,0.0,0.0,87468.55,98.95,5259.82,148.74,10170.53
7,kddcup1999_10percent_number_only,18000,35,0.0,0.0,0.0,1245613.82,37.55,3034.5,93.81,8586.65
