In [1]:
import scanpy as sc
import cupy as cp
import os
import time
import rapids_singlecell as rsc

import warnings

import numpy as np
import rmm
from rmm.allocators.cupy import rmm_cupy_allocator

# Re-initialise the RMM memory manager and make CuPy allocate through it.
rmm.reinitialize(
    managed_memory=False,  # set to True to use managed (unified) memory, which is what allows oversubscription
    pool_allocator=False,  # default is False
    devices=0,  # GPU device IDs to register. By default registers only GPU 0.
)
cp.cuda.set_allocator(rmm_cupy_allocator)
import zarr
from collections import OrderedDict
from scipy.sparse import csr_matrix
from leidenalg import ModularityVertexPartition
import pandas as pd
import scipy.stats as stats
from statsmodels.stats.multitest import multipletests
from scipy.sparse import csr_matrix
import scipy
import anndata
from collections import OrderedDict

In [2]:
# Load the cell-by-feature count matrix (10x HDF5 file) into an AnnData object.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
adata = sc.read_10x_h5("/data/kanferg/Sptial_Omics/playGround/Data/Xenium/output_temp/cell_feature_matrix.h5")
# Disambiguate duplicated feature names so var_names can serve as a unique index.
adata.var_names_make_unique()

In [3]:
# Move the AnnData matrix to the GPU so the rapids_singlecell routines below can operate on it.
rsc.get.anndata_to_GPU(adata)
# Flag genes whose name starts with "mt-" in a boolean var column named "MT".
# NOTE(review): the lower-case "mt-" prefix presumably targets mouse gene symbols — confirm for this panel.
rsc.pp.flag_gene_family(adata, gene_family_name="MT", gene_family_prefix="mt-")
# Per-cell / per-gene QC summaries, including the share of counts coming from the "MT" genes.
rsc.pp.calculate_qc_metrics(adata, qc_vars=["MT"])

In [4]:
def open_zarr(path: str) -> zarr.Group:
    """Open the Zarr store at ``path`` and return its root group.

    A path ending in ``.zip`` is opened read-only as a ``zarr.ZipStore``;
    any other path is opened as a ``zarr.DirectoryStore``.
    """
    if path.endswith(".zip"):
        store = zarr.ZipStore(path, mode="r")
    else:
        store = zarr.DirectoryStore(path)
    return zarr.group(store=store)
# Zipped Zarr archive holding the per-cell summary table.
path = "/data/kanferg/Sptial_Omics/playGround/Data/Xenium/output_temp/cells.zarr.zip"
root = open_zarr(path)
# The labels of the cell_summary columns are kept in that array's attributes.
column_names = root['cell_summary'].attrs['column_names']
def build_obs(andata,root,column_names):
    """Copy the ``cell_summary`` table into ``andata.obs`` and set spatial coordinates.

    Parameters
    ----------
    andata
        Object exposing ``obs`` (a DataFrame) and ``obsm`` (a mapping); it is
        modified in place.
    root
        Mapping-like container; ``root["cell_summary"]`` must convert to a 2-D
        array with one column per entry of ``column_names``.
    column_names
        Labels for the columns of ``cell_summary``. Must include
        ``"cell_centroid_x"`` and ``"cell_centroid_y"``.

    Returns
    -------
    The same ``andata`` object, with one new ``obs`` column per label and
    ``obsm["spatial"]`` holding the (x, y) cell centroids.
    """
    # Materialise the table once instead of once per column.
    summary = np.array(root["cell_summary"])
    for col_idx, name in enumerate(column_names):
        andata.obs[str(name)] = summary[:, col_idx]
    # Store the coordinates on the object that was passed in, not on a
    # module-level variable that merely happens to point at the same data.
    andata.obsm["spatial"] = andata.obs[["cell_centroid_x", "cell_centroid_y"]].values
    return andata
# build_obs returns the object it was given, so `andata` and `adata` name the same AnnData.
andata = build_obs(adata,root,column_names)
andata.var_names_make_unique()
# Re-store the spatial coordinates as a float64 NumPy array.
andata.obsm['spatial'] = np.array(andata.obsm['spatial'], dtype=np.float64)

In [5]:
# Cell-level filtering is currently switched off:
#rsc.pp.filter_cells(andata, min_count=10)
# Gene filter with a minimum count of 10.
rsc.pp.filter_genes(andata, min_count=10)
# Record the gene names that survive filtering under uns['config'].
andata.uns['config'] = OrderedDict(secondary_var_names=andata.var_names)
# Snapshot the matrix as it is now, before normalisation, in a separate layer.
andata.layers['counts'] = andata.X.copy()

filtered out 1 genes based on n_cells_by_counts


In [6]:
# Standard preprocessing: total-count normalisation, log1p transform,
# highly-variable-gene annotation, then a 15-component PCA with a fixed seed.
# The pre-normalisation matrix was saved in layers['counts'] beforehand.
rsc.pp.normalize_total(andata)
rsc.pp.log1p(andata)
rsc.pp.highly_variable_genes(andata)
rsc.pp.pca(andata, n_comps=15,random_state=1337)
# Display the object summary to confirm the new var/uns/obsm/varm entries.
andata

AnnData object with n_obs × n_vars = 708983 × 4623
    obs: 'n_genes_by_counts', 'total_counts', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'cell_centroid_x', 'cell_centroid_y', 'cell_area', 'nucleus_centroid_x', 'nucleus_centroid_y', 'nucleus_area', 'z_level', 'nucleus_count'
    var: 'gene_ids', 'feature_types', 'genome', 'MT', 'n_cells_by_counts', 'total_counts', 'mean_counts', 'pct_dropout_by_counts', 'log1p_total_counts', 'log1p_mean_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'config', 'log1p', 'hvg', 'pca'
    obsm: 'spatial', 'X_pca'
    varm: 'PCs'
    layers: 'counts'

In [7]:
# Neighbour graph (45 neighbours) built on the 15 components stored in obsm['X_pca'].
rsc.pp.neighbors(andata,n_pcs=15,use_rep='X_pca',n_neighbors=45)
# Leiden clustering at resolution 1 with a fixed seed; labels are written to obs['cluster'].
rsc.tl.leiden(andata,random_state=1337,resolution=1,key_added='cluster') 

In [8]:
# Display the object summary: expect 'cluster' in obs and the graphs in obsp.
andata

AnnData object with n_obs × n_vars = 708983 × 4623
    obs: 'n_genes_by_counts', 'total_counts', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'cell_centroid_x', 'cell_centroid_y', 'cell_area', 'nucleus_centroid_x', 'nucleus_centroid_y', 'nucleus_area', 'z_level', 'nucleus_count', 'cluster'
    var: 'gene_ids', 'feature_types', 'genome', 'MT', 'n_cells_by_counts', 'total_counts', 'mean_counts', 'pct_dropout_by_counts', 'log1p_total_counts', 'log1p_mean_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'config', 'log1p', 'hvg', 'pca', 'neighbors', 'leiden'
    obsm: 'spatial', 'X_pca'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'distances', 'connectivities'

In [9]:
 # Rank genes for every cluster in obs['cluster'] with the logistic-regression method;
 # results are stored under uns['rank_genes_groups'].
 # NOTE(review): this cell starts with a stray leading space — harmless in IPython, an IndentationError in a plain script.
 rsc.tl.rank_genes_groups_logreg(andata, groupby="cluster",groups='all')

In [10]:
# Inspect the raw ranking result (a dict holding 'params' plus record arrays).
andata.uns['rank_genes_groups']

{'params': {'groupby': 'cluster',
  'method': 'logreg',
  'reference': 'rest',
  'use_raw': None},
 'scores': rec.array([(  8.840481 ,  8.998595 ,   3.6403823, 12.043716 ,  8.588599 ,   9.622591 ,  6.1811914,   8.368974 ,  12.05603  ,   3.0648348, 11.529636 ,  9.485098 , 10.24213  , 10.987517 , 10.695103 ,  8.9446335,   2.9888496, 12.260597 ,  13.903993,  12.782721 ,  13.3322115,  13.112552 , 13.004545 ,  0.90236187,   7.246296 , 14.591028 , 12.855499 ,  11.715151 ,  8.462426 , 12.962172 ,  14.218709 ,   8.369181 ),
            (  4.5090456,  1.9773902,   2.281189 ,  7.27484  ,  5.5127244,   8.562887 ,  5.0854   ,   2.5742912,   2.6838362,   1.7455299, 10.023757 ,  5.988313 ,  8.097414 ,  8.411418 ,  6.9229074,  6.1424656,   1.8160405,  8.368665 ,   9.687332,   4.8087406,   3.0125847,   4.602379 , 10.118833 ,  0.8562227 ,   2.7302253,  8.212052 , 11.383274 ,   7.020916 ,  6.352079 ,  6.598752 ,   3.7590384,   8.221044 ),
            (  3.830634 ,  1.8410678,   1.9288886,  4.3064427,  5

In [11]:
# 'scores' is a 1-D record array: one record per rank, one field per cluster.
andata.uns['rank_genes_groups']['scores'].shape

(4623,)

In [12]:
# View the record array as a table (rows = ranks, columns = clusters).
pd.DataFrame(andata.uns['rank_genes_groups']['scores'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,8.840481,8.998595,3.640382,12.043716,8.588599,9.622591,6.181191,8.368974,12.056030,3.064835,...,13.004545,0.902362,7.246296,14.591028,12.855499,11.715151,8.462426,12.962172,14.218709,8.369181
1,4.509046,1.977390,2.281189,7.274840,5.512724,8.562887,5.085400,2.574291,2.683836,1.745530,...,10.118833,0.856223,2.730225,8.212052,11.383274,7.020916,6.352079,6.598752,3.759038,8.221044
2,3.830634,1.841068,1.928889,4.306443,5.140582,2.359240,4.097266,2.352560,1.485751,1.625350,...,5.864505,0.814046,1.752750,5.913621,5.240136,4.531563,2.285594,6.296628,2.516961,4.328784
3,2.185688,1.500845,1.646121,3.626902,5.005806,1.196436,3.976357,1.414575,1.242658,1.547322,...,4.297292,0.810805,1.564221,3.757536,4.547781,4.110286,1.623824,3.889179,1.710169,2.762456
4,2.121602,1.433345,1.629904,3.311748,4.067332,1.078112,3.933193,1.394857,1.164666,1.444406,...,3.034476,0.729893,1.315363,2.608125,3.606301,2.683715,1.584882,3.527210,1.660909,1.847149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4618,-1.371665,-5.251132,-3.907550,-1.552548,-1.871593,-6.900544,-1.583661,-3.999816,-3.606766,-6.849926,...,-1.130708,-2.542313,-3.616260,-1.088822,-1.106057,-1.393750,-1.123562,-1.414029,-1.587298,-1.562002
4619,-5.034382,-5.293637,-6.360641,-1.742919,-1.903968,-8.109837,-1.660345,-6.474800,-7.085868,-7.327381,...,-1.145682,-2.544186,-4.582023,-1.157092,-1.144707,-1.433599,-1.220483,-1.501960,-9.945375,-1.605339
4620,-9.430278,-6.014729,-9.039707,-1.926757,-1.919408,-9.570837,-1.685159,-9.334538,-7.405418,-7.721484,...,-1.172772,-2.599244,-5.320379,-1.211867,-1.203527,-1.526406,-1.327791,-1.540129,-11.247831,-1.812670
4621,-12.783133,-6.310889,-10.736596,-2.002764,-2.053692,-9.855143,-1.893420,-11.112469,-9.033220,-10.429877,...,-1.254990,-2.760943,-9.158525,-1.373873,-1.861463,-4.554735,-4.085975,-1.590048,-12.676720,-11.770976


In [13]:
# List the cluster labels; they are strings, so the sorted order is lexicographic ('1', '10', '11', ...).
np.unique(andata.obs['cluster'])

array(['0', '1', '10', '11', '12', '13', '14', '15', '16', '17', '18',
       '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28',
       '29', '3', '30', '31', '4', '5', '6', '7', '8', '9'], dtype=object)

In [14]:
# Check which result arrays the ranking produced (no p-values are included at this point).
andata.uns['rank_genes_groups'].keys()

dict_keys(['params', 'scores', 'names'])

In [15]:
# Confirm the 'counts' layer is still attached.
andata.layers

Layers with keys: counts

In [16]:
# Build the scores table via from_records; it renders the same table as the plain pd.DataFrame constructor.
pd.DataFrame.from_records(andata.uns['rank_genes_groups']['scores'])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,8.840481,8.998595,3.640382,12.043716,8.588599,9.622591,6.181191,8.368974,12.056030,3.064835,...,13.004545,0.902362,7.246296,14.591028,12.855499,11.715151,8.462426,12.962172,14.218709,8.369181
1,4.509046,1.977390,2.281189,7.274840,5.512724,8.562887,5.085400,2.574291,2.683836,1.745530,...,10.118833,0.856223,2.730225,8.212052,11.383274,7.020916,6.352079,6.598752,3.759038,8.221044
2,3.830634,1.841068,1.928889,4.306443,5.140582,2.359240,4.097266,2.352560,1.485751,1.625350,...,5.864505,0.814046,1.752750,5.913621,5.240136,4.531563,2.285594,6.296628,2.516961,4.328784
3,2.185688,1.500845,1.646121,3.626902,5.005806,1.196436,3.976357,1.414575,1.242658,1.547322,...,4.297292,0.810805,1.564221,3.757536,4.547781,4.110286,1.623824,3.889179,1.710169,2.762456
4,2.121602,1.433345,1.629904,3.311748,4.067332,1.078112,3.933193,1.394857,1.164666,1.444406,...,3.034476,0.729893,1.315363,2.608125,3.606301,2.683715,1.584882,3.527210,1.660909,1.847149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4618,-1.371665,-5.251132,-3.907550,-1.552548,-1.871593,-6.900544,-1.583661,-3.999816,-3.606766,-6.849926,...,-1.130708,-2.542313,-3.616260,-1.088822,-1.106057,-1.393750,-1.123562,-1.414029,-1.587298,-1.562002
4619,-5.034382,-5.293637,-6.360641,-1.742919,-1.903968,-8.109837,-1.660345,-6.474800,-7.085868,-7.327381,...,-1.145682,-2.544186,-4.582023,-1.157092,-1.144707,-1.433599,-1.220483,-1.501960,-9.945375,-1.605339
4620,-9.430278,-6.014729,-9.039707,-1.926757,-1.919408,-9.570837,-1.685159,-9.334538,-7.405418,-7.721484,...,-1.172772,-2.599244,-5.320379,-1.211867,-1.203527,-1.526406,-1.327791,-1.540129,-11.247831,-1.812670
4621,-12.783133,-6.310889,-10.736596,-2.002764,-2.053692,-9.855143,-1.893420,-11.112469,-9.033220,-10.429877,...,-1.254990,-2.760943,-9.158525,-1.373873,-1.861463,-4.554735,-4.085975,-1.590048,-12.676720,-11.770976


In [17]:
# Display the object summary: 'rank_genes_groups' should now be listed under uns.
andata

AnnData object with n_obs × n_vars = 708983 × 4623
    obs: 'n_genes_by_counts', 'total_counts', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'cell_centroid_x', 'cell_centroid_y', 'cell_area', 'nucleus_centroid_x', 'nucleus_centroid_y', 'nucleus_area', 'z_level', 'nucleus_count', 'cluster'
    var: 'gene_ids', 'feature_types', 'genome', 'MT', 'n_cells_by_counts', 'total_counts', 'mean_counts', 'pct_dropout_by_counts', 'log1p_total_counts', 'log1p_mean_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'config', 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'rank_genes_groups'
    obsm: 'spatial', 'X_pca'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'distances', 'connectivities'

In [18]:
def z_to_p(z):
    """Return the two-sided p-value of ``z`` under a standard normal distribution.

    Accepts a scalar or a NumPy array (evaluated element-wise). The survival
    function is used instead of ``1 - cdf`` so that a large ``|z|`` yields a
    tiny positive p-value rather than rounding to exactly 0.
    """
    return 2 * stats.norm.sf(abs(z))
# NOTE(review): these scores come from the 'logreg' ranking method and are presumably
# model coefficients rather than z-statistics — confirm the normal approximation is valid.
scores_df = pd.DataFrame.from_records(andata.uns['rank_genes_groups']['scores'])
# One vectorised call over the whole matrix instead of a Python call per element
# (DataFrame.applymap is also deprecated in recent pandas releases).
p_values = pd.DataFrame(
    z_to_p(scores_df.to_numpy(dtype=np.float64)),
    index=scores_df.index,
    columns=scores_df.columns,
)
# Benjamini-Hochberg correction applied jointly across every gene/cluster pair.
p_values_flat = p_values.values.flatten()
_, pvals_corrected, _, _ = multipletests(p_values_flat, alpha=0.05, method='fdr_bh')
pvals_corrected_df = pd.DataFrame(pvals_corrected.reshape(p_values.shape), columns=p_values.columns, index=p_values.index)


  p_values = pd.DataFrame.from_records(andata.uns['rank_genes_groups']['scores']).applymap(z_to_p)


In [19]:
# Re-display the score table for comparison with the derived p-value tables.
pd.DataFrame.from_records(andata.uns['rank_genes_groups']['scores'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,8.840481,8.998595,3.640382,12.043716,8.588599,9.622591,6.181191,8.368974,12.056030,3.064835,...,13.004545,0.902362,7.246296,14.591028,12.855499,11.715151,8.462426,12.962172,14.218709,8.369181
1,4.509046,1.977390,2.281189,7.274840,5.512724,8.562887,5.085400,2.574291,2.683836,1.745530,...,10.118833,0.856223,2.730225,8.212052,11.383274,7.020916,6.352079,6.598752,3.759038,8.221044
2,3.830634,1.841068,1.928889,4.306443,5.140582,2.359240,4.097266,2.352560,1.485751,1.625350,...,5.864505,0.814046,1.752750,5.913621,5.240136,4.531563,2.285594,6.296628,2.516961,4.328784
3,2.185688,1.500845,1.646121,3.626902,5.005806,1.196436,3.976357,1.414575,1.242658,1.547322,...,4.297292,0.810805,1.564221,3.757536,4.547781,4.110286,1.623824,3.889179,1.710169,2.762456
4,2.121602,1.433345,1.629904,3.311748,4.067332,1.078112,3.933193,1.394857,1.164666,1.444406,...,3.034476,0.729893,1.315363,2.608125,3.606301,2.683715,1.584882,3.527210,1.660909,1.847149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4618,-1.371665,-5.251132,-3.907550,-1.552548,-1.871593,-6.900544,-1.583661,-3.999816,-3.606766,-6.849926,...,-1.130708,-2.542313,-3.616260,-1.088822,-1.106057,-1.393750,-1.123562,-1.414029,-1.587298,-1.562002
4619,-5.034382,-5.293637,-6.360641,-1.742919,-1.903968,-8.109837,-1.660345,-6.474800,-7.085868,-7.327381,...,-1.145682,-2.544186,-4.582023,-1.157092,-1.144707,-1.433599,-1.220483,-1.501960,-9.945375,-1.605339
4620,-9.430278,-6.014729,-9.039707,-1.926757,-1.919408,-9.570837,-1.685159,-9.334538,-7.405418,-7.721484,...,-1.172772,-2.599244,-5.320379,-1.211867,-1.203527,-1.526406,-1.327791,-1.540129,-11.247831,-1.812670
4621,-12.783133,-6.310889,-10.736596,-2.002764,-2.053692,-9.855143,-1.893420,-11.112469,-9.033220,-10.429877,...,-1.254990,-2.760943,-9.158525,-1.373873,-1.861463,-4.554735,-4.085975,-1.590048,-12.676720,-11.770976


In [20]:
# Sanity check: the corrected table should be (n_ranks, n_clusters).
pvals_corrected_df.to_numpy().shape

(4623, 32)

In [21]:
# Sanity check: the uncorrected table must have the same shape as the corrected one.
p_values.to_numpy().shape

(4623, 32)

In [22]:
# Attach both p-value tables (as plain 2-D arrays) to the ranking results.
andata.uns['rank_genes_groups'].update({
    'p_values': p_values.to_numpy(),
    'pvals_corrected': pvals_corrected_df.to_numpy(),
})

In [23]:
# Display the object summary before exporting.
andata

AnnData object with n_obs × n_vars = 708983 × 4623
    obs: 'n_genes_by_counts', 'total_counts', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'cell_centroid_x', 'cell_centroid_y', 'cell_area', 'nucleus_centroid_x', 'nucleus_centroid_y', 'nucleus_area', 'z_level', 'nucleus_count', 'cluster'
    var: 'gene_ids', 'feature_types', 'genome', 'MT', 'n_cells_by_counts', 'total_counts', 'mean_counts', 'pct_dropout_by_counts', 'log1p_total_counts', 'log1p_mean_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'config', 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'rank_genes_groups'
    obsm: 'spatial', 'X_pca'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'distances', 'connectivities'

In [24]:
# Output directory for the exported .h5ad files.
# NOTE(review): absolute, machine-specific path — consider defining it once in a config cell near the imports.
pathout = "/data/kanferg/Sptial_Omics/SpatialOmicsToolkit/out_1"

In [25]:
from collections import OrderedDict
import anndata

In [27]:
# Work on a copy so the export experiments below do not modify `andata`.
testandata = andata.copy()
testandata

AnnData object with n_obs × n_vars = 708983 × 4623
    obs: 'n_genes_by_counts', 'total_counts', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'cell_centroid_x', 'cell_centroid_y', 'cell_area', 'nucleus_centroid_x', 'nucleus_centroid_y', 'nucleus_area', 'z_level', 'nucleus_count', 'cluster'
    var: 'gene_ids', 'feature_types', 'genome', 'MT', 'n_cells_by_counts', 'total_counts', 'mean_counts', 'pct_dropout_by_counts', 'log1p_total_counts', 'log1p_mean_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'config', 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'rank_genes_groups'
    obsm: 'spatial', 'X_pca'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'distances', 'connectivities'

In [32]:
import scanpy as sc
import numpy as np
import scipy.sparse
from collections import OrderedDict

def convert_nested_structures(data):
    """Recursively normalise nested containers into plain Python structures.

    * sparse matrices are returned untouched,
    * any ``dict`` (``OrderedDict`` included) becomes a plain ``dict`` whose
      values have been converted,
    * a ``list`` becomes a new list of converted items,
    * a ``pandas.Index`` becomes a list of its values,
    * everything else is returned unchanged.
    """
    # Checked first so a sparse matrix is never expanded or walked into.
    if isinstance(data, scipy.sparse.spmatrix):
        return data
    # OrderedDict is a dict subclass, so a single check covers both.
    if isinstance(data, dict):
        converted = {}
        for name, value in data.items():
            converted[name] = convert_nested_structures(value)
        return converted
    if isinstance(data, list):
        return list(map(convert_nested_structures, data))
    if isinstance(data, pd.Index):
        return data.tolist()
    return data

def traverse_and_convert(adata, out_path=None):
    """Normalise nested containers in ``adata`` and write it to an ``.h5ad`` file.

    Every entry of ``adata.uns``, ``adata.varm`` and ``adata.obsp`` is passed
    through ``convert_nested_structures`` and stored back in place, then the
    object is written to disk.

    Parameters
    ----------
    adata
        AnnData object to convert; it is modified in place.
    out_path
        Destination file. When omitted, the file ``test.h5ad`` inside the
        module-level ``pathout`` directory is used, as before.
    """
    for mapping in (adata.uns, adata.varm, adata.obsp):
        # Snapshot the keys so reassigning entries cannot disturb the iteration.
        for key in list(mapping.keys()):
            mapping[key] = convert_nested_structures(mapping[key])

    if out_path is None:
        out_path = os.path.join(pathout, 'test.h5ad')
    adata.write_h5ad(out_path)

In [33]:
# Convert the copy and write it to test.h5ad inside `pathout`.
traverse_and_convert(testandata)

In [34]:
# Round-trip check: read the file back.
# NOTE(review): this rebinds `adata`, which until now named the same object as `andata`; a distinct name would be clearer.
adata = sc.read_h5ad(os.path.join(pathout,'test.h5ad'))

In [39]:
# The corrected p-value table should survive the round trip with its (n_ranks, n_clusters) shape.
adata.uns['rank_genes_groups']['pvals_corrected'].shape

(4623, 32)

In [73]:
print("prepare for writing")
# Sparse matrices are kept sparse: the graphs in obsp are n_obs x n_obs, so a
# dense copy needs memory quadratic in the number of cells. Only sparse formats
# other than CSR/CSC are converted (to CSR) before writing.
# obsp/obsm hold array-like values, so no dict handling is needed for them.
for mapping in (andata.obsp, andata.obsm):
    for key in list(mapping.keys()):
        matrix = mapping[key]
        if isinstance(matrix, scipy.sparse.spmatrix) and matrix.format not in ("csr", "csc"):
            mapping[key] = matrix.tocsr()
# uns can nest OrderedDicts and pandas Index objects at any depth, so reuse the
# recursive converter instead of only handling the top level.
for key in list(andata.uns.keys()):
    andata.uns[key] = convert_nested_structures(andata.uns[key])
andata.write(os.path.join(pathout,'test.h5ad'))

prepare for writing


In [74]:
# NOTE(review): the previous cell already writes `andata` to this same path; this second write looks redundant — confirm and drop one.
andata.write_h5ad(os.path.join(pathout,'test.h5ad'))

In [69]:
# NOTE(review): `key` is whatever an earlier loop left behind, and this cell's execution
# count (69) precedes the cells shown above it — it will not reproduce under Restart & Run All.
# Name the obsp entry to delete explicitly, or remove this cell.
del andata.obsp[key]