# Incremental PCA

The notebook demonstrates the use of [scikit-learn IncrementalPCA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html#sklearn.decomposition.IncrementalPCA) to perform PCA on Census data.

Approach:

* Use a SOMA query to define the cells to be embedded,
* From these cells, select N top genes using the `experimental.pp.highly_variable_genes` method,
* Incrementally train over the selected cells and the N top genes,
* Compute components, and annotate the `obs` dataframe.

Depending on the number of cells and genes selected, this can be a resource-intensive computation. It is known to complete successfully when trained on the top 5000 genes for all cells in the human and mouse Census data, but requires a large host. For example, the full human PCA has been successfully demonstrated on an AWS EC2 c6id.32xlarge instance.

In [1]:
import cellxgene_census
import numpy as np
import tiledbsoma as soma
from cellxgene_census.experimental.util import X_sparse_iter
from cellxgene_census.experimental.pp import highly_variable_genes
from sklearn.decomposition import IncrementalPCA


# ---------------------------------------------------------------------------
# Configuration - the dataset and computational parameters.
# Everything a reader is expected to tune lives in this cell.
# ---------------------------------------------------------------------------

# --- Dataset selection ---
# Census release to open.
census_version = "latest"
# Organism experiment to read: "mus_musculus" or "homo_sapiens".
experiment_name = "mus_musculus"
# Value filter choosing the cells used for BOTH training and embedding.
# Set to None to use all cells.
obs_value_filter = "tissue_general == 'heart'"

# --- Computational parameters ---
# Number of highly variable genes fed into the PCA.
n_top_genes = 3000
# Number of principal components kept in the final result.
n_components = 30

In [3]:
# End-to-end pipeline: pick the highly variable genes for the selected cells,
# train an IncrementalPCA on those cells/genes in bounded-size batches, then
# project every selected cell and store the components as columns of `obs`.
with cellxgene_census.open_soma(census_version=census_version) as census:
    exp = census["census_data"][experiment_name]

    # Pass 1: choose the genes. Only the joinids of the selected genes are kept;
    # the full HVG table is dropped as soon as they have been extracted.
    with exp.axis_query(
        measurement_name="RNA",
        obs_query=soma.AxisQuery(value_filter=obs_value_filter),
    ) as query:
        print(f"{query.n_obs} cells selected")
        print("Beginning HVG calculation")
        hvgs = highly_variable_genes(query, n_top_genes=n_top_genes)
        var_soma_joinids = hvgs[hvgs.highly_variable].index.to_numpy()
        del hvgs
        print("Finished HVG calculation")

    # Pass 2: same cells, restricted to the selected genes.
    with exp.axis_query(
        measurement_name="RNA",
        obs_query=soma.AxisQuery(value_filter=obs_value_filter),
        var_query=soma.AxisQuery(coords=(var_soma_joinids,)),
    ) as query:
        print("Start training")
        pca = IncrementalPCA(n_components=n_components)
        # Rows densified per partial_fit call; bounds the size of each dense batch.
        training_chunk_size = 2000
        for _, chunk in X_sparse_iter(query, stride=2**17):
            n_rows = chunk.shape[0]
            starts = list(range(0, n_rows, training_chunk_size))
            # scikit-learn's partial_fit can reject a batch with fewer rows than
            # n_components (on the first call, and on every call in older
            # releases). Fold a too-short trailing slice into the preceding batch
            # instead of fitting it on its own.
            if len(starts) > 1 and n_rows - starts[-1] < n_components:
                starts.pop()
            stops = starts[1:] + [n_rows]
            for start, stop in zip(starts, stops):
                pca.partial_fit(chunk[start:stop, :].toarray())
        print("End training")

        pca_colnames = [f"X_pca_{c}" for c in range(n_components)]
        obs = query.obs(column_names=["soma_joinid"]).concat().to_pandas().set_index("soma_joinid")
        # Pre-allocate the result columns as float64 so the transform output is
        # stored losslessly without pandas having to upcast the column on
        # assignment (silent upcasting is deprecated in recent pandas).
        for colname in pca_colnames:
            obs[colname] = np.zeros((len(obs),), dtype=np.float64)

        print("Start transform")
        for (obs_join_ids, _), chunk in X_sparse_iter(query):
            # One block write per chunk: rows are matched on soma_joinid, and the
            # transform's columns map positionally onto pca_colnames.
            obs.loc[obs_join_ids, pca_colnames] = pca.transform(chunk.toarray())
        print("Complete")

obs

The "latest" release is currently 2023-06-28. Specify 'census_version="2023-06-28"' in future calls to open_soma() to ensure data consistency.


36564 cells selected
Beginning HVG calculation
Finished HVG calculation
Start training
End training
Start transform
Complete


Unnamed: 0_level_0,tissue_general,X_pca_0,X_pca_1,X_pca_2,X_pca_3,X_pca_4,X_pca_5,X_pca_6,X_pca_7,X_pca_8,...,X_pca_20,X_pca_21,X_pca_22,X_pca_23,X_pca_24,X_pca_25,X_pca_26,X_pca_27,X_pca_28,X_pca_29
soma_joinid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3095361,heart,-5939.924615,-683.212275,-1018.494281,-938.109799,-723.603279,-349.299284,1252.382706,-1097.100917,-904.277427,...,354.648672,210.515004,-204.663918,60.400421,-286.188903,-120.819550,-45.565330,69.815616,39.614296,-19.950906
3095370,heart,-5931.001244,-687.428434,-1003.767889,-903.132840,-719.997444,-312.157573,1243.581321,-1093.208485,-895.446839,...,346.286061,209.512250,-200.357956,60.644144,-272.080191,-119.591777,-49.537452,71.157352,37.907837,17.635257
3095380,heart,-5891.521664,-717.565068,-607.776972,-820.863973,-698.926246,-358.779596,1237.777258,-1056.738368,-851.969745,...,362.640003,247.159011,-194.670891,102.921152,-289.540281,-121.231375,-60.409565,86.419813,42.559516,-8.048174
3095390,heart,-5868.478091,-731.216164,-1009.235602,-905.582908,-727.405069,-281.350112,843.865317,-557.628346,-1050.950832,...,242.980111,168.093599,-62.744946,99.043908,426.744667,-510.560444,-378.656225,120.614270,54.233329,317.178646
3095401,heart,-5950.154809,-680.357173,-955.184791,-973.048902,-724.860325,-344.129590,1246.817861,-1091.256673,-890.084953,...,342.561275,188.251382,-202.109691,48.451889,-284.054934,-121.803659,-46.637227,67.745758,37.148254,-23.619059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3792056,heart,-6070.793018,-576.932975,-777.651884,-1010.133481,-721.660890,-337.543649,1237.296950,-1113.009631,-903.869393,...,348.694855,213.762954,-207.319404,56.181598,-283.702928,-121.566131,-37.949695,63.272752,37.378116,-34.417257
3792057,heart,-6070.292962,-577.806405,-533.410534,-984.353452,-710.431255,-336.139465,1240.199226,-1114.885577,-903.657406,...,350.348863,214.450417,-208.534189,49.449862,-280.792904,-119.868983,-34.932936,61.788630,38.705339,-40.058989
3792058,heart,-6070.159232,-576.818548,-1006.043430,-1033.532268,-732.057211,-337.263627,1235.406379,-1109.468384,-902.540096,...,347.007038,212.574288,-206.802801,57.615339,-283.547180,-121.804603,-36.891014,63.560276,36.485488,-29.753226
3792059,heart,-6069.944310,-575.833051,-1006.205813,-1034.536017,-732.117329,-337.770280,1234.946514,-1110.190152,-897.924088,...,344.864596,208.188897,-206.919198,53.891745,-284.967410,-122.585601,-38.911083,63.575010,36.902317,-32.426830
