# PCA

### In this notebook we tackle the problem of sparsity in the feature representations of the different modes (chromatin accessibility, gene expression, surface protein levels). 

As noted in https://www.kaggle.com/code/leohash/complete-eda-of-mmscel-integration-data/notebook, DNA data has between 1-30k of its 229k features nonzero, RNA data has 2-8k of its ~28k features nonzero, and protein data, although sparse, has only a small number of features, so this notebook doesn't worry about it :)

We will apply PCA (and maybe some other techniques) to investigate whether we can usefully hop into a lower-dimensional, densely-populated representation for each of the two remaining modes (DNA and RNA).

In [1]:
# imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from datasets import SparseDataset, H5Dataset

In [None]:
# Construct the SparseDataset for 'train' (presumably the training split --
# confirm against datasets.py).
d = SparseDataset('train')

In [None]:
# -----------------------------------------------------------

In [None]:
from sklearn.decomposition import IncrementalPCA
import tqdm

class PCA():
    """
    Batched PCA over data stored in an .h5 file.

    Wraps sklearn's IncrementalPCA so the data can be streamed through
    `h5Loader` one batch at a time instead of being loaded all at once.
    """

    def __init__(self, n_components: int):
        """
        n_components: number of principal components to keep.
        """
        self.n_components = n_components
        self.ipca = IncrementalPCA(n_components, whiten=False)

    def fit(self, filename: str, batch_size=1000, limit=1e9):
        """
        Fits PCA to data stored in the .h5 file given.
        Uses incremental PCA and batching for certain memory reasons.

        filename: path of the .h5 file handed to h5Loader.
        batch_size: requested rows per batch; raised to n_components if smaller.
        limit: maximum number of batches to consume. Shuffling is only
            requested from the loader when this is left at its default.

        Calling fit again continues from the current state, because every
        batch goes through IncrementalPCA.partial_fit.
        """
        # IncrementalPCA cannot fit more components than a batch has samples.
        batch_size = max(batch_size, self.n_components)
        loader = h5Loader(filename, batch_size, shuffle=limit == 1e9)

        print('Fitting with a batch size of {}!'.format(batch_size))
        for i, batch in enumerate(tqdm.tqdm(loader)):
            if i == limit:
                break
            self.ipca.partial_fit(batch)

    def evaluate(self, filename: str, batch_size=1000, limit=1e9):
        """
        Evaluates reconstruction error of PCA fit with data stored in the .h5 file given.
        Uses batching for certain memory reasons.

        filename: path of the .h5 file handed to h5Loader.
        batch_size: requested rows per batch; raised to n_components if smaller.
        limit: maximum number of batches to consume. Shuffling is only
            requested from the loader when this is left at its default.

        Returns the mean, over all rows actually processed, of the L2 norm
        between each row and its reconstruction.

        Raises ZeroDivisionError if no batch was processed.
        """
        batch_size = max(batch_size, self.n_components)
        loader = h5Loader(filename, batch_size, shuffle=limit == 1e9)

        print('Evaluating with a batch size of {}!'.format(batch_size))
        total_err = 0.
        n_samples = 0
        for i, batch in enumerate(tqdm.tqdm(loader)):
            if i == limit:
                break
            reduced = self.ipca.transform(batch)
            b_hat = self.ipca.inverse_transform(reduced)
            # One reconstruction-error norm per row of the batch.
            diffs = np.linalg.norm(b_hat - batch, axis=-1)
            total_err += diffs.sum()
            # Count the rows really seen: the last batch may be shorter than
            # batch_size, so batch_size * n_batches would overstate the total.
            n_samples += diffs.size
        return total_err / n_samples

# Da Work

## DNA (Chromatin Accessibility)

In [None]:
# Fit a single PCA on the multiome inputs, feeding it the training file
# first and the test file second.
train_f, test_f = FP_MULTIOME_TRAIN_INPUTS, FP_MULTIOME_TEST_INPUTS
max_n_components = 20000

p_dna = PCA(n_components=max_n_components)
p_dna.fit(train_f)
p_dna.fit(test_f)

## RNA (Gene Expression)

In [None]:
# Fit a 10000-component PCA on the CITE training inputs, then compute its
# reconstruction error on that same file.
train_f = FP_CITE_TRAIN_INPUTS

p10k_rna = PCA(n_components=10000)
p10k_rna.fit(train_f)
p10k_rna.evaluate(train_f)

# random test bs

In [None]:
import h5sparse as h5
import scipy.sparse as ss
import sys
import time

# Import sklearn's PCA under an alias so it does not overwrite the PCA
# wrapper class defined earlier in this notebook.
from sklearn.decomposition import PCA as SklearnPCA, TruncatedSVD

# Read the first 5000 rows, closing the file handle as soon as the slice has
# been read.
with h5.File(FP_MULTIOME_TEST_INPUTS) as h5f:
    normalboi = h5f['test_multi_inputs']['block0_values'][:5000]

print(sys.getsizeof(normalboi), normalboi.__class__)
sparseboi = ss.csr_array(normalboi)
# NOTE: this only measures the `.data` array of the CSR structure; the
# `.indices` and `.indptr` arrays are not included in the printed size.
print(sys.getsizeof(sparseboi.data), sparseboi.__class__)

# p1 and p2 are created but not used further in this cell.
p1 = SklearnPCA(5000)
p2 = SklearnPCA(5000)
t = TruncatedSVD(5000)


# Time fitting the truncated SVD on the sparse representation.
s = time.perf_counter()
t.fit(sparseboi)
print(time.perf_counter() - s)

In [None]:
def evaluate(filename: str, batch_size=1000, limit=1e9):
    """
    Evaluates reconstruction error of the module-level TruncatedSVD `t`
    on data stored in the .h5 file given.
    Uses batching for certain memory reasons.

    filename: path of the .h5 file handed to h5Loader.
    batch_size: requested rows per batch; raised to t.n_components if smaller.
    limit: maximum number of batches to consume. Shuffling is only
        requested from the loader when this is left at its default.

    Returns the mean, over all rows actually processed, of the L2 norm
    between each row and its reconstruction.

    Raises ZeroDivisionError if no batch was processed.
    """
    # Follow the component count of the fitted model rather than a literal.
    batch_size = max(batch_size, t.n_components)
    loader = h5Loader(filename, batch_size, shuffle=limit == 1e9)

    print('Evaluating with a batch size of {}!'.format(batch_size))
    total_err = 0.
    n_samples = 0
    for i, batch in enumerate(tqdm.tqdm(loader)):
        if i == limit:
            break
        reduced = t.transform(batch)
        b_hat = t.inverse_transform(reduced)
        # One reconstruction-error norm per row of the batch.
        diffs = np.linalg.norm(b_hat - batch, axis=-1)
        total_err += diffs.sum()
        # Count the rows really seen: the last batch may be shorter than
        # batch_size, so batch_size * n_batches would overstate the total.
        n_samples += diffs.size
    return total_err / n_samples
# Run the evaluation on the multiome test inputs with the default batch size
# and limit.
evaluate(FP_MULTIOME_TEST_INPUTS)