# PCA

### In this notebook we tackle the problem of sparsity in the feature representations of the different modes (chromatin accessibility, gene expression, surface protein levels). 

As noted in https://www.kaggle.com/code/leohash/complete-eda-of-mmscel-integration-data/notebook, DNA data has between 1-30k of the 229k features being nonzero, RNA data has 2-8k of the ~28k features as nonzero, and protein data has a small number of features and is sparse, which means this notebook don't care :)

We will apply PCA (and maybe some other techniques) to investigate whether we can usefully hop into a lower-dimensional, densely-populated representation for either of these two modes.

In [2]:
# imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tqdm

from datasets import SparseDataset, H5Dataset

from sklearn.decomposition import PCA, SparsePCA, IncrementalPCA

In [None]:
# -----------------------------------------------------------

# Da Work

## DNA (Chromatin Accessibility)

In [None]:
train_f = FP_MULTIOME_TRAIN_INPUTS
test_f = FP_MULTIOME_TEST_INPUTS

max_n_components = 20000

p_dna = PCA(max_n_components)
p_dna.fit(train_f)
p_dna.fit(test_f)

## RNA (Gene Expression)

In [3]:
import pandas as pd

multi_df = pd.read_hdf('data/train_multi_targets.h5', start=1000, stop=2000)
cite_df = pd.read_hdf('data/train_cite_inputs.h5', start=1000, stop=2000)

multi_keys = list(multi_df.keys())
cite_keys = list(cite_df.keys())

for i in range(len(cite_keys)):
    cite_keys[i] = cite_keys[i].split('_')[0]

multi_idxs = []
cite_idxs = []
for i, s in enumerate(multi_keys):
    if s in cite_keys:
        multi_idxs.append(i)
        cite_idxs.append(cite_keys.index(s))

In [None]:
# multi_rna = np.asarray(H5Dataset('all', 'multi').targets_h5)
# cite_rna = np.asarray(H5Dataset('all', 'cite').inputs_h5)
# multi_shared = multi_rna[:, multi_idxs]
# cite_shared = cite_rna[:, cite_idxs]
# # shared = np.concatenate((multi_shared, cite_shared), axis=0)

In [10]:
batch_size=5120

multi_shared_loader = H5Dataset('all', 'multi').get_dataloader(batch_size)
cite_shared_loader = H5Dataset('all', 'cite').get_dataloader(batch_size)

p = IncrementalPCA(5000, batch_size=batch_size)

for (x, day), y in tqdm.tqdm(multi_shared_loader):
    rna = y.numpy()
    rna = rna[:, multi_idxs]
    p.fit((rna != 0).astype(float))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [40:30<00:00, 121.51s/it]


In [12]:
# for (x, day), y in tqdm.tqdm(multi_shared_loader):
#     rna = y.numpy()
#     rna = (rna[:, multi_idxs] != 0).astype(float)
#     t = p.transform(rna)
#     r = p.inverse_transform(t)
#     print(r[0, :20])
#     print(rna[0, :20])
#     break

  0%|                                                                                                                                                                        | 0/20 [00:39<?, ?it/s]

[ 0.06714004  0.05487646 -0.07046997  0.17677267  0.00583439  0.08139133
  0.51982441  0.68326941 -0.0283741  -0.27715162  0.58717835  0.0560826
 -0.11472318 -0.09067327  0.12867208  0.09864197  0.58704133  0.03540484
  0.83939874  0.1652116 ]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0.]





In [13]:
for (x, day), y in tqdm.tqdm(cite_shared_loader):
    rna = x.numpy()
    rna = rna[:, cite_idxs]
    p.fit((rna != 0).astype(float))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [22:24<00:00, 103.44s/it]


In [15]:
for (x, day), y in tqdm.tqdm(multi_shared_loader):
    rna = y.numpy()
    rna = (rna[:, multi_idxs] != 0).astype(float)
    t = p.transform(rna)
    r = p.inverse_transform(t)
    print(r[0, :20])
    print(rna[0, :20])
    break
for (x, day), y in tqdm.tqdm(cite_shared_loader):
    rna = x.numpy()
    rna = (rna[:, cite_idxs] != 0).astype(float)
    t = p.transform(rna)
    r = p.inverse_transform(t)
    print(r[0, :20])
    print(rna[0, :20])
    break

  0%|                                                                                                                                                                        | 0/20 [00:38<?, ?it/s]


[-0.21263749  0.07339686 -0.0508241   0.18521511 -0.01095684  0.10056191
 -0.10150701  0.2873682  -0.17385937 -0.1198185   0.6742844   0.15951751
  0.19980298 -0.0106914   0.22917121 -0.0055773   0.10293863 -0.15654958
  0.02219364 -0.04935069]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


  0%|                                                                                                                                                                        | 0/13 [00:24<?, ?it/s]

[ 0.10195501 -0.03635179  0.06681627 -0.03937548 -0.01161978 -0.08236591
  0.08501981  0.97663378  0.23727503  1.22752942  0.83175774  0.53139381
  0.44288236  0.02910783 -0.08121857  0.03334935  0.66305163 -0.06371141
  0.51174748  0.3771991 ]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0.]





In [19]:
p2 = IncrementalPCA(5120)
with open('data/pca.pkl', 'rb') 
p2 = pickle.load(f)