In [None]:
import cellxgene_census
import tiledbsoma as soma
import pandas as pd
import numpy as np

In [None]:
# TileDB settings used below to open the test build directly from its S3 location.
DEFAULT_TILEDB_CONFIGURATION = {
    # https://docs.tiledb.com/main/how-to/configuration#configuration-parameters
    # Both read buffers are set to 1 GiB (1 * 1024**3 bytes).
    "py.init_buffer_bytes": 1 * 1024**3,
    "soma.init_buffer_bytes": 1 * 1024**3,
    # NOTE(review): "false" leaves S3 request signing enabled, i.e. requests are NOT
    # anonymous. Presumably intentional because the test-build bucket requires
    # credentials; anonymous access would need "true". Confirm before reuse.
    "vfs.s3.no_sign_request": "false",
    "vfs.s3.region": "us-west-2",
}
# Context carrying the configuration above; it is only passed to the test-build open call.
ctx = soma.options.SOMATileDBContext().replace(tiledb_config=DEFAULT_TILEDB_CONFIGURATION)
# Baseline for the comparison. "latest" is resolved when the cell runs, so the baseline may
# differ between runs; the recorded summary output shows build date 2024-03-18.
census_old = cellxgene_census.open_soma(census_version="latest")
# Candidate under QC: the schema-five test build, opened by explicit URI.
census_new = cellxgene_census.open_soma(uri="s3://bruce-tmp/census-schema-five-prod-QC-build/soma/", context=ctx)

## QC results

Light QC on the test build shows no issues.

Checks
- Verifying fidelity in data additions, in particular changes to `obs` and addition to `census["census_info"]["organisms"]`.
- Checking new assays added based on updated list of assays, and verifying the addition of expected datasets.
- Validating the presence of data that was previously missing, now included thanks to the fixed filter for multi-species datasets.
- Validating the calculation of the normalized layer for SMART-like technologies.

## Checking census["census_info"]

In [None]:
# Summary table of the baseline Census.
(
    census_old["census_info"]["summary"]
    .read()
    .concat()
    .to_pandas()
)

Unnamed: 0,soma_joinid,label,value
0,0,census_schema_version,1.3.0
1,1,census_build_date,2024-03-18
2,2,dataset_schema_version,4.0.0
3,3,total_cell_count,111070547
4,4,unique_cell_count,57996085
5,5,number_donors_homo_sapiens,16622
6,6,number_donors_mus_musculus,4146


In [None]:
# Summary table of the test build, to compare against the baseline above.
(
    census_new["census_info"]["summary"]
    .read()
    .concat()
    .to_pandas()
)

Unnamed: 0,soma_joinid,label,value
0,0,census_schema_version,2.0.0
1,1,census_build_date,test-build
2,2,dataset_schema_version,5.0.0
3,3,total_cell_count,114199045
4,4,unique_cell_count,59554288
5,5,number_donors_homo_sapiens,17050
6,6,number_donors_mus_musculus,4146


In [None]:
# Organisms table of the test build.
(
    census_new["census_info"]["organisms"]
    .read()
    .concat()
    .to_pandas()
)

Unnamed: 0,soma_joinid,organism_ontology_term_id,organism_label,organism
0,0,NCBITaxon:9606,Homo sapiens,homo_sapiens
1,1,NCBITaxon:10090,Mus musculus,mus_musculus


In [None]:
# Datasets table of the baseline Census.
(
    census_old["census_info"]["datasets"]
    .read()
    .concat()
    .to_pandas()
)

Unnamed: 0,soma_joinid,citation,collection_id,collection_name,collection_doi,dataset_id,dataset_version_id,dataset_title,dataset_h5ad_path,dataset_total_cell_count
0,0,Publication: https://doi.org/10.1002/hep4.1854...,44531dd9-1388-4416-a117-af0a99de2294,"Single-Cell, Single-Nucleus, and Spatial RNA S...",10.1002/hep4.1854,0895c838-e550-48a3-a777-dbcd35d30272,90c609bd-1439-4cbf-935f-201ecadf0297,Healthy human liver: B cells,0895c838-e550-48a3-a777-dbcd35d30272.h5ad,146
1,1,Publication: https://doi.org/10.1126/sciimmuno...,3a2af25b-2338-4266-aad3-aa8d07473f50,Single-cell analysis of human B cell maturatio...,10.1126/sciimmunol.abe6291,00ff600e-6e2e-4d76-846f-0eec4f0ae417,eefe1452-cf76-40ce-aeba-6c82395edfd8,Human tonsil nonlymphoid cells scRNA,00ff600e-6e2e-4d76-846f-0eec4f0ae417.h5ad,363
2,2,Publication: https://doi.org/10.1038/s41593-02...,180bff9c-c8a5-4539-b13b-ddbc00d643e6,Molecular characterization of selectively vuln...,10.1038/s41593-020-00764-7,bdacc907-7c26-419f-8808-969eab3ca2e8,a835c57c-3805-4c01-b22f-9bcfe1794d10,Molecular characterization of selectively vuln...,bdacc907-7c26-419f-8808-969eab3ca2e8.h5ad,3799
3,3,Publication: https://doi.org/10.1038/s41467-02...,bf325905-5e8e-42e3-933d-9a9053e9af80,Single-cell Atlas of common variable immunodef...,10.1038/s41467-022-29450-x,a5d95a42-0137-496f-8a60-101e17f263c8,dc8026ac-b646-43d6-b714-a2a895ca9acd,Steady-state B cells - scRNA-seq,a5d95a42-0137-496f-8a60-101e17f263c8.h5ad,1324
4,4,Publication: https://doi.org/10.1038/s41590-02...,93eebe82-d8c3-41bc-a906-63b5b5f24a9d,Single-cell proteo-genomic reference maps of t...,10.1038/s41590-021-01059-0,d3566d6a-a455-4a15-980f-45eb29114cab,6cb0bf3a-9e54-4c86-89b9-160869556e77,blood and bone marrow from a healthy young donor,d3566d6a-a455-4a15-980f-45eb29114cab.h5ad,15502
...,...,...,...,...,...,...,...,...,...,...
758,758,Publication: https://doi.org/10.1038/s41586-02...,45d5d2c3-bc28-4814-aed6-0bb6f0e11c82,A single-cell transcriptional timelapse of mou...,10.1038/s41586-024-07069-w,0bce33ed-455c-4e12-93f8-b7b04a2de4a1,23445882-1c14-4a99-a1a3-16c2e373a099,Whole dataset: Normalized subset 2,0bce33ed-455c-4e12-93f8-b7b04a2de4a1.h5ad,2863559
759,759,Publication: https://doi.org/10.1101/2023.05.0...,1ca90a2d-2943-483d-b678-b809bf464c30,SEA-AD: Seattle Alzheimer’s Disease Brain Cell...,10.1101/2023.05.08.539485,c2876b1b-06d8-4d96-a56b-5304f815b99a,18011876-b61b-4995-9f51-934cda890e82,Whole Taxonomy - MTG: Seattle Alzheimer's Dise...,c2876b1b-06d8-4d96-a56b-5304f815b99a.h5ad,1226855
760,760,Publication: https://doi.org/10.1101/2023.05.0...,1ca90a2d-2943-483d-b678-b809bf464c30,SEA-AD: Seattle Alzheimer’s Disease Brain Cell...,10.1101/2023.05.08.539485,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3,788e2ad7-dca6-415a-b0f4-54bc968ff8db,Whole Taxonomy - DLPFC: Seattle Alzheimer's Di...,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3.h5ad,1309414
761,761,Publication: https://doi.org/10.1038/s41586-02...,45d5d2c3-bc28-4814-aed6-0bb6f0e11c82,A single-cell transcriptional timelapse of mou...,10.1038/s41586-024-07069-w,dcfa2614-7ca7-4d82-814c-350626eccb26,53ff3a93-c576-4b91-9215-122585dad207,Major cell cluster: Mesoderm,dcfa2614-7ca7-4d82-814c-350626eccb26.h5ad,3267338


In [None]:
# Datasets table of the test build.
(
    census_new["census_info"]["datasets"]
    .read()
    .concat()
    .to_pandas()
)

Unnamed: 0,soma_joinid,citation,collection_id,collection_name,collection_doi,dataset_id,dataset_version_id,dataset_title,dataset_h5ad_path,dataset_total_cell_count
0,0,Publication: https://doi.org/10.1002/hep4.1854...,44531dd9-1388-4416-a117-af0a99de2294,"Single-Cell, Single-Nucleus, and Spatial RNA S...",10.1002/hep4.1854,0895c838-e550-48a3-a777-dbcd35d30272,fb76c95f-0391-4fac-9fb9-082ce2430b59,Healthy human liver: B cells,0895c838-e550-48a3-a777-dbcd35d30272.h5ad,146
1,1,Publication: https://doi.org/10.1126/sciimmuno...,3a2af25b-2338-4266-aad3-aa8d07473f50,Single-cell analysis of human B cell maturatio...,10.1126/sciimmunol.abe6291,00ff600e-6e2e-4d76-846f-0eec4f0ae417,b6737a5e-9069-4dd6-9a57-92e17a746df9,Human tonsil nonlymphoid cells scRNA,00ff600e-6e2e-4d76-846f-0eec4f0ae417.h5ad,363
2,2,Publication: https://doi.org/10.1038/s41593-02...,180bff9c-c8a5-4539-b13b-ddbc00d643e6,Molecular characterization of selectively vuln...,10.1038/s41593-020-00764-7,bdacc907-7c26-419f-8808-969eab3ca2e8,0e02290f-b992-450b-8a19-554f73cd7f09,Molecular characterization of selectively vuln...,bdacc907-7c26-419f-8808-969eab3ca2e8.h5ad,3799
3,3,Publication: https://doi.org/10.1038/s41467-02...,bf325905-5e8e-42e3-933d-9a9053e9af80,Single-cell Atlas of common variable immunodef...,10.1038/s41467-022-29450-x,a5d95a42-0137-496f-8a60-101e17f263c8,40832710-d7b1-43fb-b2c2-1cd2255bc3ac,Steady-state B cells - scRNA-seq,a5d95a42-0137-496f-8a60-101e17f263c8.h5ad,1324
4,4,Publication: https://doi.org/10.1038/s41590-02...,93eebe82-d8c3-41bc-a906-63b5b5f24a9d,Single-cell proteo-genomic reference maps of t...,10.1038/s41590-021-01059-0,d3566d6a-a455-4a15-980f-45eb29114cab,eb6c070c-ff67-4c1f-8d4d-65f9fe2119ee,blood and bone marrow from a healthy young donor,d3566d6a-a455-4a15-980f-45eb29114cab.h5ad,15502
...,...,...,...,...,...,...,...,...,...,...
791,791,Publication: https://doi.org/10.1038/s41586-02...,45d5d2c3-bc28-4814-aed6-0bb6f0e11c82,A single-cell transcriptional timelapse of mou...,10.1038/s41586-024-07069-w,0bce33ed-455c-4e12-93f8-b7b04a2de4a1,ffeb40f8-d4b9-45c4-95cc-5e2674452ef8,Whole dataset: Normalized subset 2,0bce33ed-455c-4e12-93f8-b7b04a2de4a1.h5ad,2863559
792,792,Publication: https://doi.org/10.1101/2023.05.0...,1ca90a2d-2943-483d-b678-b809bf464c30,SEA-AD: Seattle Alzheimer’s Disease Brain Cell...,10.1101/2023.05.08.539485,c2876b1b-06d8-4d96-a56b-5304f815b99a,77dab54a-f2a8-42fc-8c1b-3fda90622ac7,Whole Taxonomy - MTG: Seattle Alzheimer's Dise...,c2876b1b-06d8-4d96-a56b-5304f815b99a.h5ad,1226855
793,793,Publication: https://doi.org/10.1101/2023.05.0...,1ca90a2d-2943-483d-b678-b809bf464c30,SEA-AD: Seattle Alzheimer’s Disease Brain Cell...,10.1101/2023.05.08.539485,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3,b0cbf861-edd3-4add-a09a-c8698ed0cedf,Whole Taxonomy - DLPFC: Seattle Alzheimer's Di...,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3.h5ad,1309414
794,794,Publication: https://doi.org/10.1038/s41586-02...,45d5d2c3-bc28-4814-aed6-0bb6f0e11c82,A single-cell transcriptional timelapse of mou...,10.1038/s41586-024-07069-w,dcfa2614-7ca7-4d82-814c-350626eccb26,4ef3a829-b36e-413f-9a32-56f5a91b1041,Major cell cluster: Mesoderm,dcfa2614-7ca7-4d82-814c-350626eccb26.h5ad,3267338


## Missing dataset due to incorrect filter of multi-species data

In [None]:
# Lift the column-width limit so dataset titles are displayed in full.
pd.set_option("display.max_colwidth", None)
# Baseline: datasets whose title contains "Individual".
datasets = (
    census_old["census_info"]["datasets"]
    .read(column_names=["dataset_id", "dataset_title"])
    .concat()
    .to_pandas()
)
datasets.loc[datasets["dataset_title"].str.contains("Individual")]

Unnamed: 0,dataset_id,dataset_title
116,ae5341b8-60fb-4fac-86db-86e49ee66287,Individual Single-Cell RNA-seq PBMC Data from Guo et al.
248,055ca631-6ffb-40de-815e-b931e10718c0,Individual Single-Cell RNA-seq PBMC Data from Wilk et al.
411,4c4cd77c-8fee-4836-9145-16562a8782fe,Individual Single-Cell RNA-seq PBMC Data from Lee et al.
455,59b69042-47c2-47fd-ad03-d21beb99818f,Individual Single-Cell RNA-seq PBMC Data from Arunachalam et al.


In [None]:
# Test build: same title filter as applied to the baseline.
datasets = (
    census_new["census_info"]["datasets"]
    .read(column_names=["dataset_id", "dataset_title"])
    .concat()
    .to_pandas()
)
datasets.loc[datasets["dataset_title"].str.contains("Individual")]

Unnamed: 0,dataset_id,dataset_title
139,ae5341b8-60fb-4fac-86db-86e49ee66287,Individual Single-Cell RNA-seq PBMC Data from Guo et al.
276,055ca631-6ffb-40de-815e-b931e10718c0,Individual Single-Cell RNA-seq PBMC Data from Wilk et al.
440,4c4cd77c-8fee-4836-9145-16562a8782fe,Individual Single-Cell RNA-seq PBMC Data from Lee et al.
484,59b69042-47c2-47fd-ad03-d21beb99818f,Individual Single-Cell RNA-seq PBMC Data from Arunachalam et al.
542,5e717147-0f75-4de1-8bd2-6fda01b8d75f,Individual Single-Cell RNA-seq PBMC Data from Schulte-Schrepping et al.


**PASS: the new Census has the previously missing dataset.**

## Checking list of all assays and verifying new ones

In [None]:
# Number of human cells per assay label in the baseline Census.
assays_old = (
    census_old["census_data"]["homo_sapiens"]
    .obs.read(column_names=["assay"])
    .concat()
    .to_pandas()
    .value_counts()
    .reset_index()
)

In [None]:
# Number of human cells per assay label in the test build.
assays_new = (
    census_new["census_data"]["homo_sapiens"]
    .obs.read(column_names=["assay"])
    .concat()
    .to_pandas()
    .value_counts()
    .reset_index()
)

In [None]:
# Side-by-side assay counts: the left frame is the baseline, the right frame is the test build.
# The outer join keeps assays that appear in only one of the two builds (shown as NaN on the other side).
pd.merge(assays_old, assays_new, how="outer", on="assay")

Unnamed: 0,assay,count_x,count_y
0,10x 3' transcription profiling,811422.0,819131
1,10x 3' v1,121394.0,121394
2,10x 3' v2,14281666.0,14811657
3,10x 3' v3,36599802.0,38401688
4,10x 5' transcription profiling,1282251.0,1282251
5,10x 5' v1,6450367.0,6450367
6,10x 5' v2,3381738.0,3381738
7,BD Rhapsody Targeted mRNA,96145.0,96145
8,BD Rhapsody Whole Transcriptome Analysis,177276.0,177276
9,CEL-seq2,5244.0,7370


## Normalized layer for SMART-like data

In [None]:
# List of accepted full-gene (SMART-like) assays; the file is read without a header row
# and its two columns are named here.
smart_like = pd.read_csv(
    "https://raw.githubusercontent.com/chanzuckerberg/cellxgene-census/bkmartinjr/993-schema-five/docs/census_accepted_assays_full_gene.csv",
    header=None,
    names=["assay_ontology_term_id", "assay"],
)

In [None]:
# Right join: one row per listed SMART-like assay, with its cell count in the test build
# (NaN when the assay has no cells there).
assays_new.merge(smart_like, how="right", on="assay")

Unnamed: 0,assay,count,assay_ontology_term_id
0,FL-cDNA,,EFO:0003755
1,full length single cell RNA sequencing,,EFO:0008441
2,FRISCR,,EFO:0008747
3,Hi-SCL,,EFO:0008763
4,MATQ-seq,,EFO:0008797
5,Quartz-seq,,EFO:0008877
6,Smart-seq,,EFO:0008930
7,Smart-seq2,190792.0,EFO:0008931
8,SUPeR-seq,,EFO:0008956
9,full length single nucleus RNA sequencing,,EFO:0009810


In [None]:
# Take up to 1000 cells from each of the two Smart-seq assays queried below.
# NOTE: despite its name, `smartseq1_ids` holds the Smart-seq2 cells.
human_obs = census_new["census_data"]["homo_sapiens"].obs
smartseq1_ids = (
    human_obs.read(column_names=["soma_joinid"], value_filter="assay == 'Smart-seq2'")
    .concat()
    .to_pandas()
)
smartseq4_ids = (
    human_obs.read(column_names=["soma_joinid"], value_filter="assay == 'Smart-seq v4'")
    .concat()
    .to_pandas()
)

smartseq = [
    *smartseq1_ids["soma_joinid"].tolist()[:1000],
    *smartseq4_ids["soma_joinid"].tolist()[:1000],
]

# Fetch only the sampled cells, requesting the precomputed "normalized" layer as well.
adata = cellxgene_census.get_anndata(
    census_new,
    organism="homo_sapiens",
    obs_coords=smartseq,
    X_layers=["normalized"],
)



In [None]:
# Display the precomputed normalized layer; its repr reports shape, dtype and sparse format.
adata.layers["normalized"]

<2000x60528 sparse matrix of type '<class 'numpy.float32'>'
	with 10132245 stored elements in Compressed Sparse Row format>

In [None]:
# Distinct assay labels among the fetched cells (first occurrence of each is kept).
adata.obs["assay"].drop_duplicates()

0         Smart-seq2
1000    Smart-seq v4
Name: assay, dtype: object

In [None]:
# Smallest stored value of the normalized layer. The vectorised ndarray method avoids
# iterating the data array element by element in Python, as the builtin min() would.
adata.layers["normalized"].data.min()

2.1492724e-08

In [None]:
# Largest stored value of the normalized layer. The vectorised ndarray method avoids
# iterating the data array element by element in Python, as the builtin max() would.
adata.layers["normalized"].data.max()

0.580307

In [None]:
# Smallest per-row total of the normalized layer (sum over axis 1, then min across rows).
adata.layers["normalized"].sum(axis=1).min()

0.99999154

In [None]:
# Largest per-row total of the normalized layer (sum over axis 1, then max across rows).
adata.layers["normalized"].sum(axis=1).max()

1.000007

In [None]:
# Recompute the normalization locally so it can be compared with the precomputed layer.
# Start from a copy of X so the original matrix is left untouched.
# NOTE(review): presumably X holds raw counts here — confirm.
adata.layers["normalized_local"] = adata.X.copy()
# Scale every column by the reciprocal of its feature_length (broadcast as a single row).
adata.layers["normalized_local"] = adata.layers["normalized_local"].multiply(1 / adata.var["feature_length"].values[None,:])
# Scale every row by the reciprocal of its own total, computed after the length scaling above.
adata.layers["normalized_local"] = adata.layers["normalized_local"].multiply( 1 / adata.layers["normalized_local"].sum(1).A)

In [None]:
# Stored values of the locally recomputed layer, for a visual comparison with the precomputed one.
adata.layers["normalized_local"].data

array([6.70588850e-04, 5.30853111e-04, 6.05805790e-05, ...,
       1.72411111e-04, 1.03791338e-04, 3.12075306e-06])

In [None]:
# Stored values of the precomputed normalized layer.
adata.layers["normalized"].data

array([6.70582056e-04, 5.30853868e-04, 6.05806708e-05, ...,
       1.72410160e-04, 1.03792176e-04, 3.12074553e-06], dtype=float32)

In [None]:
# Count how many stored values differ between the precomputed layer and the locally
# recomputed one after rounding both to r decimals, for r = 10 down to 0.
pre_calc = adata.layers["normalized"].data.copy()
fly_calc = adata.layers["normalized_local"].data.copy()
# The comparison is element by element, so both arrays must list the same entries.
# NOTE(review): this also assumes both layers store their entries in the same order — confirm.
assert pre_calc.shape == fly_calc.shape, "layers store a different number of values"
for r in reversed(range(11)):
    # np.round returns a new array, so the inputs do not need to be copied on every pass.
    differs = np.round(pre_calc, r).astype(np.float32) != np.round(fly_calc, r).astype(np.float32)
    n_differing = np.count_nonzero(differs)
    # This is a fraction in [0, 1], not a percentage.
    fraction_differing = n_differing / fly_calc.shape[0]
    print("Decimal points: ", r, " values differing: n = ", n_differing, ", fraction = ", fraction_differing)

Decimal points:  10  values differing: n =  6853148 , fracion =  0.6763701430433235
Decimal points:  9  values differing: n =  3399477 , fracion =  0.33551073824211713
Decimal points:  8  values differing: n =  742827 , fracion =  0.07331316998355251
Decimal points:  7  values differing: n =  100107 , fracion =  0.009880041392603515
Decimal points:  6  values differing: n =  10993 , fracion =  0.001084952051593699
Decimal points:  5  values differing: n =  1087 , fracion =  0.0001072812589904804
Decimal points:  4  values differing: n =  110 , fracion =  1.0856429152670509e-05
Decimal points:  3  values differing: n =  9 , fracion =  8.882532943094053e-07
Decimal points:  2  values differing: n =  1 , fracion =  9.869481047882281e-08
Decimal points:  1  values differing: n =  0 , fracion =  0.0
Decimal points:  0  values differing: n =  0 , fracion =  0.0


## Checking schema 5 changes

###  **Breaking change**. Updated the requirements for `cell_type` to annotate `"unknown"` as the label when the `cell_type_ontology_term_id` value is  `"unknown"`.

In [None]:
# Cell counts per cell_type label in each build, joined on the label so that labels
# present in only one build show NaN on the other side.
cell_type_old = (
    census_old["census_data"]["homo_sapiens"]
    .obs.read(column_names=["cell_type"])
    .concat()
    .to_pandas()
    .value_counts()
    .reset_index()
)
cell_type_new = (
    census_new["census_data"]["homo_sapiens"]
    .obs.read(column_names=["cell_type"])
    .concat()
    .to_pandas()
    .value_counts()
    .reset_index()
)
merged = cell_type_old.merge(cell_type_new, how="outer", on="cell_type")

In [None]:
# Rows for the "native cell" label.
merged[merged["cell_type"].eq("native cell")]

Unnamed: 0,cell_type,count_x,count_y
492,native cell,1491279.0,


In [None]:
# Rows for the "unknown" label.
merged[merged["cell_type"].eq("unknown")]

Unnamed: 0,cell_type,count_x,count_y
681,unknown,,1570991.0


**PASS**

### **Breaking change**. Updated the requirements for `cell_type_ontology_term_id` to replace `"CL:0000003"` for *native cell* with `"unknown"` to indicate that the cell type is unknown.


In [None]:
# Same comparison as for the labels, this time on cell_type_ontology_term_id.
cell_type_old = (
    census_old["census_data"]["homo_sapiens"]
    .obs.read(column_names=["cell_type_ontology_term_id"])
    .concat()
    .to_pandas()
    .value_counts()
    .reset_index()
)
cell_type_new = (
    census_new["census_data"]["homo_sapiens"]
    .obs.read(column_names=["cell_type_ontology_term_id"])
    .concat()
    .to_pandas()
    .value_counts()
    .reset_index()
)
merged = cell_type_old.merge(cell_type_new, how="outer", on="cell_type_ontology_term_id")

In [None]:
# Rows for the term id CL:0000003.
merged[merged["cell_type_ontology_term_id"].eq("CL:0000003")]

Unnamed: 0,cell_type_ontology_term_id,count_x,count_y
1,CL:0000003,1491279.0,


In [None]:
# Rows for the term id "unknown".
merged[merged["cell_type_ontology_term_id"].eq("unknown")]

Unnamed: 0,cell_type_ontology_term_id,count_x,count_y
689,unknown,,1570991.0


**PASS**