# PCA

### In this notebook we tackle the problem of sparsity in the feature representations of the different modes (chromatin accessibility, gene expression, surface protein levels). 

As noted in https://www.kaggle.com/code/leohash/complete-eda-of-mmscel-integration-data/notebook, the DNA data has only 1-30k of its 229k features nonzero per cell, and the RNA data has 2-8k of its ~28k features nonzero. The protein data has a small number of features and is sparse, so this notebook does not need to deal with it :)

We will apply PCA (and maybe some other techniques) to investigate whether we can usefully move to a lower-dimensional, densely-populated representation for each of the two remaining modes (DNA and RNA).

In [None]:
# imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tqdm

from datasets import SparseDataset, H5Dataset

from sklearn.decomposition import PCA, SparsePCA, IncrementalPCA

In [None]:
# -----------------------------------------------------------

# Da Work

## DNA (Chromatin Accessibility)

In [2]:
# find max variance columns
#
# One streaming pass over the 'multi' inputs that accumulates, per feature column,
# the sum, the sum of squares and the count of non-zero entries, then derives the
# per-column (population) variance from those running totals.
from datasets import H5Dataset, SparseDataset
import numpy as np
import tqdm
import torch

# Number of feature columns in the 'multi' inputs; sizes the accumulators below.
n_features = 228942

sums = np.zeros(n_features)
squared_sums = np.zeros(n_features)
num_nonzero = np.zeros(n_features)
# Rows actually yielded by the loader. Counted rather than hard-coded so the
# variance stays correct if the loader ever yields a different number of rows.
n_rows = 0

d = SparseDataset('all', 'multi')
s = d.get_dataloader(512)
for x, day, y in tqdm.tqdm(s):
    sums += x.sum(dim=0).numpy()
    squared_sums += torch.square(x).sum(dim=0).numpy()
    num_nonzero += (x != 0).sum(dim=0).numpy()
    n_rows += x.shape[0]

assert n_rows > 0, "dataloader yielded no rows; cannot compute variances"

# Var[x] = E[x^2] - (E[x])^2, evaluated independently for every column.
variances = squared_sums / n_rows - np.square(sums / n_rows)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 207/207 [03:56<00:00,  1.14s/it]


In [23]:
# Column indices ordered from the largest statistic to the smallest:
# var_idxs by per-column variance, nz_idxs by per-column non-zero count.
var_idxs, nz_idxs = (
    np.argsort(column_stat)[::-1] for column_stat in (variances, num_nonzero)
)

In [31]:
# Combine the two orderings into a single score per column: a column's score is
# its position in var_idxs plus its position in nz_idxs, so a low score means the
# column sits near the front of both orderings.
n_cols = len(var_idxs)
positions = np.arange(n_cols)

ranks = np.zeros(n_cols)
# Both index arrays are argsort results (permutations), so every column index
# occurs exactly once and the fancy-indexed += updates each column exactly once.
ranks[var_idxs] += positions
ranks[nz_idxs] += positions

# Columns sorted by ascending combined score.
best_idxs = np.argsort(ranks)

In [33]:
# Write the three column orderings to disk as .npy files.
for out_path, ordering in (
    ('data/multi_best_idxs.npy', best_idxs),
    ('data/multi_var_idxs.npy', var_idxs),
    ('data/multi_nz_idxs.npy', nz_idxs),
):
    np.save(out_path, ordering)

## RNA (Gene Expression)

In [None]:
import pandas as pd

# Only the column labels of these frames are used below, so a 1000-row slice of
# each file is read instead of the whole table.
multi_df = pd.read_hdf('data/train_multi_targets.h5', start=1000, stop=2000)
cite_df = pd.read_hdf('data/train_cite_inputs.h5', start=1000, stop=2000)

multi_keys = list(multi_df.keys())
# Keep only the part of each CITE column label before the first underscore so
# that it can be compared against the multiome column labels.
cite_keys = [key.split('_')[0] for key in cite_df.keys()]

# Map each label to its FIRST position in cite_keys. This gives the same answer
# as cite_keys.index(label) but in O(1) per lookup instead of a list scan.
cite_position = {}
for pos, key in enumerate(cite_keys):
    cite_position.setdefault(key, pos)

# Parallel lists: multi_idxs[k] and cite_idxs[k] are the column positions of the
# same label in the multiome targets and the CITE inputs respectively.
multi_idxs = []
cite_idxs = []
for i, key in enumerate(multi_keys):
    pos = cite_position.get(key)
    if pos is not None:
        multi_idxs.append(i)
        cite_idxs.append(pos)

In [None]:
# multi_rna = np.asarray(H5Dataset('all', 'multi').targets_h5)
# cite_rna = np.asarray(H5Dataset('all', 'cite').inputs_h5)
# multi_shared = multi_rna[:, multi_idxs]
# cite_shared = cite_rna[:, cite_idxs]
# # shared = np.concatenate((multi_shared, cite_shared), axis=0)

In [None]:
batch_size=5120

multi_shared_loader = H5Dataset('all', 'multi').get_dataloader(batch_size)
cite_shared_loader = H5Dataset('all', 'cite').get_dataloader(batch_size)

p = IncrementalPCA(n_components=5000, batch_size=batch_size)

# Fit the PCA incrementally on the binarised (zero / non-zero) multiome RNA
# targets, restricted to the columns shared with the CITE inputs.
for (x, day), y in tqdm.tqdm(multi_shared_loader):
    rna = y.numpy()
    rna = rna[:, multi_idxs]
    # A batch with fewer rows than n_components (typically the final, short
    # batch) is rejected by some scikit-learn versions, so it is skipped.
    if rna.shape[0] < p.n_components:
        continue
    # partial_fit updates the running estimate; fit() would discard everything
    # learned so far and refit from scratch on this single batch.
    p.partial_fit((rna != 0).astype(float))

In [None]:
# for (x, day), y in tqdm.tqdm(multi_shared_loader):
#     rna = y.numpy()
#     rna = (rna[:, multi_idxs] != 0).astype(float)
#     t = p.transform(rna)
#     r = p.inverse_transform(t)
#     print(r[0, :20])
#     print(rna[0, :20])
#     break

In [None]:
# Continue fitting the same PCA on the binarised (zero / non-zero) CITE RNA
# inputs, restricted to the columns shared with the multiome targets.
for (x, day), y in tqdm.tqdm(cite_shared_loader):
    rna = x.numpy()
    rna = rna[:, cite_idxs]
    # A batch with fewer rows than n_components (typically the final, short
    # batch) is rejected by some scikit-learn versions, so it is skipped.
    if rna.shape[0] < p.n_components:
        continue
    # partial_fit updates the existing model; fit() would reset it and throw
    # away everything learned from earlier batches.
    p.partial_fit((rna != 0).astype(float))

In [None]:
# Spot-check the fitted PCA: for the first batch of each loader, project the
# binarised shared-gene matrix into PCA space and back, then print the first 20
# reconstructed values of row 0 followed by the first 20 original values.
for loader, shared_cols, rna_is_target in (
    (multi_shared_loader, multi_idxs, True),   # multiome: RNA is the target y
    (cite_shared_loader, cite_idxs, False),    # CITE: RNA is the input x
):
    for (x, day), y in tqdm.tqdm(loader):
        rna = (y if rna_is_target else x).numpy()
        rna = (rna[:, shared_cols] != 0).astype(float)
        t = p.transform(rna)
        r = p.inverse_transform(t)
        print(r[0, :20])
        print(rna[0, :20])
        # Only the first batch is inspected.
        break

In [None]:
import pickle

# Load a previously pickled object (presumably a fitted PCA model, going by the
# file name) from disk into p2.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load files
# that you created yourself or otherwise trust.
with open('data/pca.pkl', 'rb') as f:
    p2 = pickle.load(f)