# Incremental PCA

The notebook demonstrates the use of [scikit-learn IncrementalPCA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html#sklearn.decomposition.IncrementalPCA) to perform PCA on Census data.

Approach:

* Use a SOMA query to define the cells to be embedded,
* From these cells, select N top genes using the `experimental.pp.highly_variable_genes` method,
* Incrementally train over the selected cells and the N top genes,
* Compute components, and annotate the `obs` dataframe.

Depending on the number of cells and genes selected, this can be a resource-intensive computation. It is known to complete successfully when trained on the top 5000 genes for all cells in the human and mouse Census data, but requires a large host. For example, the full human PCA has been successfully demonstrated on an AWS EC2 c6id.32xlarge instance.

In [10]:
import cellxgene_census
import numpy as np
import tiledbsoma as soma
from cellxgene_census.experimental import X_sparse_iter
from cellxgene_census.experimental.pp import highly_variable_genes
from sklearn.decomposition import IncrementalPCA


# ---------------------------------------------------------------------------
# Configuration - the dataset and computational parameters.
# ---------------------------------------------------------------------------

# Which Census version is used.
census_version = "latest"

# Which organism: mus_musculus or homo_sapiens.
experiment_name = "mus_musculus"

# The subset of cells (both train and embed). Set to None if all cells.
obs_value_filter = "tissue_general == 'heart'"

# Number of components to keep in the final result.
n_components = 30

# Number of genes to use as analysis input.
n_top_genes = 3000

In [11]:
def _dense_row_batches(query, row_stride, batch_size):
    """Yield dense row batches of the query's X matrix.

    X is read in sparse chunks of up to ``row_stride`` rows via ``X_sparse_iter``;
    each chunk is then sliced into pieces of at most ``batch_size`` rows and
    densified with ``.toarray()`` one piece at a time, so only one dense piece
    is materialised per step.
    """
    for _, chunk in X_sparse_iter(query, row_stride=row_stride):
        for start in range(0, chunk.shape[0], batch_size):
            yield chunk[start : start + batch_size, :].toarray()


def _min_sized_batches(batches, min_rows):
    """Re-yield ``batches`` so that no yielded batch has fewer than ``min_rows`` rows.

    A batch shorter than ``min_rows`` is concatenated (row-wise) with its
    neighbour instead of being yielded on its own. When every incoming batch
    already has at least ``min_rows`` rows, the batches are passed through
    unchanged and in the same order. Only if the *total* number of rows is
    below ``min_rows`` is a short batch yielded (there is nothing to merge it with).
    """
    pending = None
    for batch in batches:
        if pending is None:
            pending = batch
        elif pending.shape[0] < min_rows or batch.shape[0] < min_rows:
            # One side is too short to be fitted alone: glue them together.
            pending = np.concatenate([pending, batch])
        else:
            yield pending
            pending = batch
    if pending is not None:
        yield pending


with cellxgene_census.open_soma(census_version=census_version) as census:
    exp = census["census_data"][experiment_name]

    # Pass 1: pick the top `n_top_genes` highly variable genes among the selected cells.
    with exp.axis_query(
        measurement_name="RNA",
        obs_query=soma.AxisQuery(value_filter=obs_value_filter),
    ) as query:
        print(f"{query.n_obs} cells selected")
        print("Begining HVG calculation")
        hvgs = highly_variable_genes(query, n_top_genes=n_top_genes)
        var_soma_joinids = hvgs[hvgs.highly_variable].index.to_numpy()
        del hvgs  # only the selected joinids are needed from here on
        print("Finished HVG calculation")

    # Pass 2: same cells, restricted to the selected genes.
    with exp.axis_query(
        measurement_name="RNA",
        obs_query=soma.AxisQuery(value_filter=obs_value_filter),
        var_query=soma.AxisQuery(coords=(var_soma_joinids,)),
    ) as query:
        print("Start training")
        pca = IncrementalPCA(n_components=n_components)
        training_chunk_size = 2000  # rows per dense batch handed to partial_fit
        training_row_stride = 2**17  # rows per sparse chunk read from X
        # IncrementalPCA.partial_fit can reject a batch that has fewer rows than
        # n_components, which the trailing slice of a chunk may have. Merge such
        # short batches with a neighbour rather than fitting them alone.
        training_batches = _min_sized_batches(
            _dense_row_batches(query, training_row_stride, training_chunk_size),
            min_rows=n_components,
        )
        for training_chunk in training_batches:
            pca.partial_fit(training_chunk)
        print("End training")

        # One output row per selected cell, indexed by soma_joinid, with one
        # zero-initialised column per principal component.
        obs = query.obs(column_names=["soma_joinid"]).concat().to_pandas().set_index("soma_joinid")
        pca_columns = [f"X_pca_{c}" for c in range(n_components)]
        for colname in pca_columns:
            obs[colname] = np.zeros((len(obs),), dtype=np.float32)

        print("Start transform")
        # NOTE(review): the columns are preallocated as float32; if pca.transform
        # returns a wider dtype pandas may upcast them on assignment - confirm the
        # dtype wanted in the final frame.
        for (obs_join_ids, _), chunk in X_sparse_iter(query):
            # Write all components for this chunk's cells in a single assignment.
            obs.loc[obs_join_ids, pca_columns] = pca.transform(chunk.toarray())
        print("Complete")

obs

The "latest" release is currently 2023-06-20. Specify 'census_version="2023-06-20"' in future calls to open_soma() to ensure data consistency.


36564 cells selected
Begining HVG calculation
Finished HVG calculation
Start training
End training
Start transform
Complete


Unnamed: 0_level_0,tissue_general,X_pca_0,X_pca_1,X_pca_2,X_pca_3,X_pca_4,X_pca_5,X_pca_6,X_pca_7,X_pca_8,...,X_pca_20,X_pca_21,X_pca_22,X_pca_23,X_pca_24,X_pca_25,X_pca_26,X_pca_27,X_pca_28,X_pca_29
soma_joinid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3095361,heart,-5939.924616,-683.212243,-1018.494393,-938.109461,-723.603865,-349.299128,1252.384492,-1097.098897,-904.277142,...,354.649033,210.514319,-204.664050,60.400237,-286.188883,-120.819543,-45.565343,69.815948,39.614116,-19.950722
3095370,heart,-5931.001245,-687.428403,-1003.767999,-903.132504,-719.998011,-312.157419,1243.583096,-1093.206478,-895.446555,...,346.286419,209.511578,-200.358087,60.643966,-272.080179,-119.591779,-49.537458,71.157729,37.907657,17.635504
3095380,heart,-5891.521663,-717.565051,-607.777060,-820.863670,-698.926769,-358.779445,1237.778903,-1056.736349,-851.969458,...,362.640358,247.158346,-194.671093,102.920985,-289.540284,-121.231404,-60.409553,86.420251,42.559263,-8.047986
3095390,heart,-5868.478091,-731.216139,-1009.235700,-905.582552,-727.405603,-281.349989,843.866254,-557.627050,-1050.950655,...,242.980296,168.093151,-62.745206,99.043938,426.744830,-510.560497,-378.656012,120.614550,54.233555,317.179114
3095401,heart,-5950.154810,-680.357143,-955.184903,-973.048567,-724.860929,-344.129434,1246.819625,-1091.254654,-890.084668,...,342.561601,188.250719,-202.109816,48.451701,-284.054903,-121.803647,-46.637247,67.746066,37.148081,-23.618912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3792056,heart,-6070.793019,-576.932947,-777.651998,-1010.133155,-721.661520,-337.543492,1237.298761,-1113.007633,-903.869106,...,348.695222,213.762276,-207.319539,56.181413,-283.702904,-121.566124,-37.949715,63.273040,37.377956,-34.417096
3792057,heart,-6070.292962,-577.806380,-533.410646,-984.353145,-710.431879,-336.139309,1240.201049,-1114.883580,-903.657119,...,350.349235,214.449739,-208.534325,49.449681,-280.792881,-119.868979,-34.932957,61.788899,38.705193,-40.058812
3792058,heart,-6070.159234,-576.818516,-1006.043546,-1033.531924,-732.057846,-337.263470,1235.408180,-1109.466385,-902.539810,...,347.007402,212.573611,-206.802934,57.615153,-283.547158,-121.804596,-36.891031,63.560573,36.485322,-29.753061
3792059,heart,-6069.944311,-575.833019,-1006.205928,-1034.535673,-732.117965,-337.770122,1234.948315,-1110.188153,-897.923802,...,344.864954,208.188224,-206.919327,53.891558,-284.967386,-122.585593,-38.911101,63.575307,36.902151,-32.426673
