In [1]:
import os
import sys
import numpy as np
import h5py
import matplotlib.pyplot as plt
import pandas as pd
import pickle
from sklearn.decomposition import PCA
import seaborn as sns
import datetime
from scipy.stats import sem
import matplotlib.cm as cm
import pathlib
import traceback
import gc
import configs

from data.ValidationDataset import NoisyDataset

In [2]:
# Arguments selecting which activations to analyse. Both values are
# interpolated into the HDF5 dataset name read by get_data
# ('conv{conv_idx}_{t}_activations') and into the output filenames.
conv_idx = 1  # layer index; values above 3 make get_data read 'conv{conv_idx}_W_{t}_activations'
t = 0  # second index in the dataset name -- presumably a timestep; TODO confirm

In [3]:
# Network whose activations are analysed, and the root of the shared storage.
netname = 'pnet'
engram_dir = '/mnt/smb/locker/abbott-locker/hcnn/'

# Per-network input (raw activations) and output (PCA results) directories.
# Both keep a trailing slash because other cells append names to them by
# plain string concatenation.
activations_dir = engram_dir + '3_activations/' + netname + '/'
pca_activations_dir = engram_dir + '4_activations_pca/' + netname + '/'

In [4]:
# Background-noise types and signal-to-noise ratios. get_data reads one
# directory per (bg, snr) pair, named '{bg}_snr{int(snr)}'.
bg_types = ['pinkNoise', 'AudScene', 'Babble8Spkr']
snr_types = [-9.0, -6.0, -3.0, 0.0, 3.0]  # presumably in dB -- TODO confirm

In [5]:
def get_data(sample_size=None, transform=None):
    """Load flattened activations for every (background, SNR) condition.

    For each ``bg`` in ``bg_types`` and ``snr`` in ``snr_types`` the directory
    ``{activations_dir}{bg}_snr{int(snr)}/`` is scanned and, for every file in
    it, the dataset ``conv{conv_idx}_{t}_activations`` (or
    ``conv{conv_idx}_W_{t}_activations`` when ``conv_idx > 3``) is read.
    Each sample is flattened to a 1-D feature vector, and all samples are
    returned in a single random order.

    NOTE(review): the previous version only used the *last* file listed in
    each directory (the loading code sat outside the per-file loop) and reused
    stale data when a directory was empty. Every file is now processed; with
    one file per directory the result is unchanged.

    Parameters
    ----------
    sample_size : int or None
        If given, this many samples are drawn from each file with
        ``np.random.choice`` (i.e. with replacement, so duplicates are
        possible). If None, every sample is kept.
    transform : object, optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    X : np.ndarray, shape (n_samples, n_features)
        Flattened activations.
    bgs : np.ndarray
        Background type of each row of ``X``.
    snrs : np.ndarray
        SNR of each row of ``X``.
    dset_idxs : np.ndarray
        Index of each row within the file it was read from.

    Raises
    ------
    ValueError
        If no activations were found at all.
    """
    X_chunks = []
    bgs = []
    snrs = []
    dset_idxs = []
    n_total_data = 0
    n_sampled_data = 0

    # The dataset name depends only on module-level settings, so build it once.
    if conv_idx > 3:
        dset_name = f'conv{conv_idx}_W_{t}_activations'
    else:
        dset_name = f'conv{conv_idx}_{t}_activations'

    for bg in bg_types:
        for snr in snr_types:
            activ_dir = f'{activations_dir}{bg}_snr{int(snr)}/'
            # Sorted so the load order (and hence the sampling) does not
            # depend on the filesystem's arbitrary listing order.
            results_files = sorted(os.listdir(activ_dir))
            if not results_files:
                print(f'Warning: no activation files in {activ_dir}')
            for results_file in results_files:
                results_filepath = f'{activ_dir}{results_file}'
                # The context manager closes the HDF5 handle as soon as the
                # data has been copied into memory.
                with h5py.File(results_filepath, 'r') as results:
                    activ = np.array(results[dset_name])
                n_data = activ.shape[0]
                n_total_data += n_data
                if sample_size is not None:
                    sample_idxs = np.random.choice(n_data, size=sample_size)
                    activ = activ[sample_idxs]
                    _dset_idxs = list(sample_idxs)
                else:
                    _dset_idxs = list(range(n_data))
                new_n_data = activ.shape[0]
                n_sampled_data += new_n_data
                # Keep one 2-D chunk per file instead of a Python list of
                # rows; the chunks are concatenated once at the end.
                X_chunks.append(activ.reshape((new_n_data, -1)))
                snrs.extend([snr]*new_n_data)
                bgs.extend([bg]*new_n_data)
                dset_idxs.extend(_dset_idxs)
                del activ

    if not X_chunks:
        raise ValueError(f'No activations found under {activations_dir}')

    X = np.concatenate(X_chunks, axis=0)
    del X_chunks
    gc.collect()

    # Shuffle all rows with one shared permutation so labels stay aligned.
    idxs = np.arange(X.shape[0])
    np.random.shuffle(idxs)

    X = X[idxs]
    bgs = np.array(bgs)[idxs]
    snrs = np.array(snrs)[idxs]
    dset_idxs = np.array(dset_idxs)[idxs]

    print(f'Sampled {n_sampled_data}/{n_total_data} data')
    print(f'with {X.shape[1]} features')

    return X, bgs, snrs, dset_idxs

In [6]:
def get_cpu_usage():
    """Print system RAM usage (despite the name, no CPU statistic is read).

    Runs ``free -t --giga`` and parses the last line of its output, taking
    the three integer fields after the row label as total, used and free
    memory in gigabytes. Prints ``RAM <used> GB, <pct>% used`` and returns
    nothing.
    """
    # Use the pipe as a context manager so it is closed deterministically.
    with os.popen('free -t --giga') as pipe:
        totals_line = pipe.readlines()[-1]
    total_memory, used_memory, _free_memory = map(
        int, totals_line.split()[1:])

    # Memory usage; a total below 1 GB is reported as 0 by --giga, so guard
    # against dividing by zero.
    if total_memory:
        p_used = round((used_memory/total_memory) * 100, 2)
    else:
        p_used = 0.0
    print(f"RAM {used_memory} GB, {p_used}% used")

# Determine n_components via random sampling

In [7]:
# Number of activations to draw per condition; passed to get_data as
# `sample_size` by the repeated-sampling loop.
sample_size = 100

In [8]:
# NOTE(review): not referenced by any of the visible cells, which pass
# literal n_components values to PCA -- confirm whether this is still needed.
n_components = 1500

In [37]:
# Repeat the sample-and-fit experiment five times to see how stable the
# number of components selected by PCA(n_components=0.8) is across random
# subsamples.
for repeat_idx in range(5):
    print('Retrieving data')
    # Only the activations are needed here; the label arrays are discarded.
    X = get_data(sample_size=sample_size)[0]
    print('Fitting Model:')
    pca = PCA(n_components=0.8)
    pca.fit(X)
    print(pca.n_components_)
    get_cpu_usage()
    # Release the large arrays before the next repetition.
    del X, pca
    gc.collect()

Retrieving data
Sampled 1500/8279 data
with 707520 features
Fitting Model:
317
RAM memory % used: 10.56
Retrieving data
Sampled 1500/8279 data
with 707520 features
Fitting Model:
311
RAM memory % used: 10.56
Retrieving data
Sampled 1500/8279 data
with 707520 features
Fitting Model:
310
RAM memory % used: 10.56
Retrieving data
Sampled 1500/8279 data
with 707520 features
Fitting Model:
315
RAM memory % used: 10.56
Retrieving data
Sampled 1500/8279 data
with 707520 features
Fitting Model:
315
RAM memory % used: 10.57


# Fit PCA with a fixed n_components on a small sample

In [11]:
# Draw 250 activations per condition and fit a PCA with a fixed 315
# components, then report how much variance those components explain.
X, bgs, snrs, dset_idxs = get_data(sample_size=250)
print('Fitting Model:')
pca = PCA(n_components=315)
X_pca = pca.fit_transform(X)
explained_total = pca.explained_variance_ratio_.sum()
print(explained_total)
get_cpu_usage()

Sampled 3750/8279 data
with 707520 features
Fitting Model:
0.7493509
RAM 19 GB, 9.95% used


# Extract n_components from all data

In [7]:
# Load every activation (no subsampling) and let PCA choose the number of
# components from the 0.9 explained-variance target.
X, bgs, snrs, dset_idxs = get_data()
print('Fitting Model:')
pca = PCA(n_components=0.9)
X_pca = pca.fit_transform(X)
print(pca.n_components_)
explained_total = pca.explained_variance_ratio_.sum()
print(explained_total)
get_cpu_usage()

Sampled 8279/8279 data
with 707520 features
Fitting Model:
1334
0.9000441
RAM 69 GB, 27.27% used


# Save components and PCA model

In [8]:
# Create the output directory (and any missing parents); exist_ok=True makes
# this a no-op when the directory already exists.
os.makedirs(pca_activations_dir, exist_ok=True)

In [14]:
# Persist the fitted PCA estimator, named after the layer and `t` it was
# fitted on.
pca_filename = f'PCAmodel_conv{conv_idx}_t{t}'
pca_path = pca_activations_dir + pca_filename + '.p'
with open(pca_path, 'wb') as pca_file:
    pickle.dump(pca, pca_file)

In [33]:
# Save the projected activations together with their per-row labels.
data_filename = f'data_conv{conv_idx}_t{t}'
# Mode 'x' creates the file and fails if it already exists, so earlier
# results are never silently overwritten.
with h5py.File(f'{pca_activations_dir}{data_filename}.hdf5', 'x') as f_out:
    data_dict = {}
    data_dict['X_pca'] = f_out.create_dataset('X_pca', data=X_pca)
    # Store the background labels as fixed-width byte strings. dtype='S' lets
    # NumPy size the field to the longest label; the previous hard-coded
    # 'S10' silently truncated the 11-character 'Babble8Spkr'.
    bgs_ascii = np.array(
        [n.encode("ascii", "ignore") for n in bgs], dtype='S')
    data_dict['bgs'] = f_out.create_dataset('bgs', data=bgs_ascii)
    data_dict['snrs'] = f_out.create_dataset('snrs', data=snrs)
    data_dict['dset_idxs'] = f_out.create_dataset('dset_idxs', data=dset_idxs)