In [0]:
import cellxgene_census
import numpy as np
import numba as nb
import scanpy as sc
import json
import scipy
from scipy import stats
import pandas as pd
from collections import defaultdict
from random import choices

## Get all human cell metadata in a dataframe

In [0]:
# Open a pinned Census release so repeated runs see the same data.
census = cellxgene_census.open_soma(census_version="2023-10-18")

# The obs (per-cell metadata) columns kept for this analysis.
columns_needed = ['assay', 'tissue', 'disease', 'cell_type_ontology_term_id', 'n_measured_vars', 'is_primary_data', 'nnz', 'soma_joinid']

# Request only the needed columns from the reader rather than loading every
# obs column into pandas and discarding most of them afterwards.
human_obs_df = (
    census['census_data']['homo_sapiens']['obs']
    .read(column_names=columns_needed)
    .concat()
    .to_pandas()
)
# Re-select so the column order is guaranteed to match columns_needed.
human_obs_df = human_obs_df[columns_needed]

## Filter the metadata to define which slice of the data to retrieve
- Only tissues where we have cell types overlapping with HuBMAP (applied per tissue in the loop further down)
- Min 500 expressed genes per cell
- From an accepted assay
- Healthy cells only (for marker genes)
- Is primary data

In [0]:
# Load the tissue -> cell-type list mapping; it is iterated with .items() when
# counting expressed genes. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open('/dbfs/FileStore/users/sbell/2023-08_norm_characterization/data_tidy/overlapping_categories.json', 'r') as categories_file:
    overlapping_categories = json.load(categories_file)

# Assays whose cells are kept by the metadata filter.
accepted_assays = ['sci-RNA-seq', 'Seq-Well', 'Drop-seq', 'CEL-seq2', 
            "10x 3' v1", "10x 5' v1", "10x 3' v2", "10x 5' v2", "10x 3' v3", 
            "10x 3' transcription profiling", "10x 5' transcription profiling", "10x technology"]

# Minimum value of the obs 'nnz' column (genes with count > 0) for a cell to be kept.
NUM_MIN_EXPRESSED_GENES = 500

In [0]:
# One boolean mask per inclusion criterion; a cell is kept only if all hold.
from_accepted_assay = human_obs_df['assay'].isin(accepted_assays)
is_healthy = human_obs_df['disease'].eq('normal')
is_primary = human_obs_df['is_primary_data'].eq(True)
# cell is expressing (count > 0) at least N genes
has_enough_genes = human_obs_df['nnz'].ge(NUM_MIN_EXPRESSED_GENES)

filtered_human_obs_df = human_obs_df.loc[
    from_accepted_assay & is_healthy & is_primary & has_enough_genes
]

In [0]:
def subsample_join_ids(join_ids, max_n=100000, seed=None):
    """Cap a collection of join ids at ``max_n`` distinct entries.

    Parameters
    ----------
    join_ids : sequence or array
        Ids to subsample; anything supporting ``len()`` and iteration.
    max_n : int
        Maximum number of ids to return.
    seed : int or None
        When given, the draw uses a private ``random.Random(seed)`` and is
        reproducible. When None, the module-level ``random`` state is used.

    Returns
    -------
    ``join_ids`` itself (unchanged) when it holds at most ``max_n`` ids;
    otherwise a list of ``max_n`` ids drawn WITHOUT replacement, so no id
    appears more often in the result than it does in the input.
    """
    import random  # local import: the file header only imports `choices`

    if len(join_ids) <= max_n:
        return join_ids

    print('found %d ids'%len(join_ids))
    rng = random if seed is None else random.Random(seed)
    # random.sample needs a real sequence, so arrays are materialised as a list.
    return rng.sample(list(join_ids), k=max_n)

def get_n_expressed_genes(join_ids):
    """Count the genes with non-zero total expression across the given cells.

    ``join_ids`` are passed as ``obs_coords`` to ``cellxgene_census.get_anndata``
    against the module-level ``census`` handle. An empty ``join_ids`` returns 0
    without querying the census.
    """
    if not len(join_ids):
        return 0

    # Expression matrix for the requested cells: [n_cells x n_genes].
    cells_by_genes = cellxgene_census.get_anndata(census, "Homo sapiens", obs_coords=join_ids)

    # Column sums give total expression per gene across the selected cells.
    per_gene_totals = np.asarray(cells_by_genes.X.sum(axis=0))

    # Flatten the mask to [True False True ...]; summing it counts the True entries.
    is_expressed = np.squeeze(per_gene_totals > 0)
    return is_expressed.sum()

In [0]:
# Result layout: {tissue: {cell type ontology term id: expressed-gene count as str}}
n_expressed_genes = defaultdict(dict)

for tissue, cls in overlapping_categories.items():
    print(tissue)
    # Restrict the already-filtered metadata to the current tissue.
    tissue_df = filtered_human_obs_df[filtered_human_obs_df['tissue'].eq(tissue)]
    for cl in cls:
        # Cells of this cell type within the tissue, identified by join id.
        cl_df = tissue_df[tissue_df['cell_type_ontology_term_id'].eq(cl)]
        cl_joinids = cl_df['soma_joinid'].values
        # Stored as a string so the value is JSON-serialisable when dumped.
        n_genes = get_n_expressed_genes(cl_joinids)
        n_expressed_genes[tissue][cl] = str(n_genes)

lung
6573
13841
2602
1201
11010
55539
7675
finished for tissue  lung
pancreas
0
finished for tissue  pancreas
placenta
8816
0
3829
finished for tissue  placenta
bone marrow
413
2847
676
1614
509
1014
7459
518
0
6017
200
0
146
7932
1906
2039
343
789
1084
497
581
1344
2293
17114
10089
1383
2221
3667
0
4590
629
6684
finished for tissue  bone marrow
large intestine
476
134
114
finished for tissue  large intestine
blood
8474
974
29
19392
955
28612
13938
72801
138293
166358
7678
64126
4046
finished for tissue  blood
brain
0
0
0
0
0
0
0
0
0
0
0
0
0
finished for tissue  brain
immune system
0
0
0
0
finished for tissue  immune system
spinal cord
5819
finished for tissue  spinal cord
respiratory system
0
0
0
0
0
0
finished for tissue  respiratory system
heart
42
11319
0
0
finished for tissue  heart
lymph node
3
132
0
108
144
2762
1179
2070
1282
3
308
94
0
finished for tissue  lymph node
small intestine
6456
5332
8726
finished for tissue  small intestine
kidney
0
5
74
926
233
0
0
3522
0
1502
158
5

In [0]:
# Write the per-tissue / per-cell-type counts to disk. The context manager
# flushes and closes the file deterministically instead of relying on
# garbage collection of an anonymous handle.
with open('/dbfs/FileStore/users/sbell/2023-08_norm_characterization/data_tidy/n_expressed_genes.json', 'w') as out_file:
    json.dump(n_expressed_genes, out_file)