In [1]:
import numpy as np
import geopandas as gpd
import pandas as pd

import scanpy as sc
import squidpy as sq
import voyagerpy as vp
import seaborn as sns
import os
import pickle
from matplotlib.pyplot import imread
from collections import OrderedDict
import json

from matplotlib import pyplot as plt

In [2]:
from cellphonedb.src.core.methods import cpdb_analysis_method

In [3]:
plt.rcParams['figure.dpi'] = 150
plt.rcParams['font.family'] = ['serif']
plt.rcParams['font.size'] = 12
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [4]:
path_016 = "/data/kanferg/Sptial_Omics/playGround/Data/Visium_HD_Mouse_Brain_square_example/square_016um"
andata016_ = sc.read_visium(path=path_016)
andata016_
andata016_.var_names_make_unique()
andata016_.obsm['spatial'] = np.array(andata016_.obsm['spatial'], dtype=np.float64)


sc.pp.filter_cells(andata016_, min_counts=1000)

sc.pp.filter_cells(andata016_, min_genes=1000)

sc.pp.filter_genes(andata016_, min_counts=1000)

sc.pp.filter_genes(andata016_, max_counts=6281)

andata016_.obsm['spatial'] = np.array(andata016_.obsm['spatial'], dtype=np.float64)
andata016_.uns['spatial']['img'] = andata016_.uns['spatial']['Visium_HD_Mouse_Brain'].pop("images")
andata016_.uns['spatial']['scale'] = andata016_.uns['spatial']['Visium_HD_Mouse_Brain'].pop("scalefactors")
andata016_.uns['spatial']['metadata'] = andata016_.uns['spatial']['Visium_HD_Mouse_Brain'].pop("metadata")
andata016_.uns['spatial'].pop("Visium_HD_Mouse_Brain")

# change order of images
images = andata016_.uns['spatial'].pop('img')
images_hires = {'lowres':images['lowres'],'hires':images['hires']}
andata016_.uns['spatial']['img'] = images_hires


is_mt = andata016_.var_names.str.startswith('mt')
vp.utils.add_per_cell_qcmetrics(andata016_, subsets={'mito': is_mt})


spot_diameter_fullres = andata016_.uns['spatial']['scale'].pop('spot_diameter_fullres')
andata016_.uns['spatial']['scale']['spot_diameter_fullres'] = {'pxl_col_in_fullres':spot_diameter_fullres,'pxl_row_in_fullres':spot_diameter_fullres}
# insted of vp.spatial.get_visium_spots(andata016_, with_radius=False) I have done:
#scale = andata016_.uns['spatial']['scale']['tissue_lowres_scalef']
scale = 1
scale_dict = andata016_.uns["spatial"].get("scale", {})
spot_diam = scale_dict.get("spot_diameter_fullres")
visium_spots = gpd.GeoSeries.from_xy(andata016_.obsm['spatial'][:,0], andata016_.obsm['spatial'][:,1]).scale(scale, scale, origin=(0, 0))
_ = vp.spatial.set_geometry(andata016_, geom="spot_poly", values=visium_spots)
andata016_.uns['config'] = OrderedDict()
andata016_.uns["config"]["secondary_var_names"] = andata016_.var_names
pathout = "/data/kanferg/Sptial_Omics/VoyagerPy_fork/voyagerpy/out"

qc_features = ["sum", "detected", "subsets_mito_percent"]
andata016_.uns['config'] = OrderedDict()
andata016_.uns["config"]["secondary_var_names"] = andata016_.var_names

  positions = pd.read_csv(files["tissue_positions_file"], header=None)


In [5]:
# The original count data
andata016_.layers['counts'] = andata016_.X.copy()
# Log-normalize the adata.X matrix
vp.utils.log_norm_counts(andata016_, inplace=True)
andata016_.layers['logcounts'] = andata016_.X.copy()


gene_var = vp.utils.model_gene_var(andata016_.layers['logcounts'], gene_names=andata016_.var_names)
hvgs = vp.utils.get_top_hvgs(gene_var)

andata016_.var['highly_variable'] = False
andata016_.var.loc[hvgs, 'highly_variable'] = True

andata016_.X = vp.utils.scale(andata016_.X, center=True)
sc.tl.pca(andata016_, use_highly_variable=True, n_comps=30, random_state=1337)
andata016_.X = andata016_.layers['logcounts'].copy()

from leidenalg import ModularityVertexPartition
sc.pp.neighbors(
    andata016_,
    n_pcs=9,
    use_rep='X_pca',
    method='gauss',
    n_neighbors=80
)
sc.tl.leiden(
    andata016_,
    random_state=29,
    resolution=None,
    key_added='cluster',
    partition_type=ModularityVertexPartition
)

In [6]:
andata016_

AnnData object with n_obs × n_vars = 21445 × 6350
    obs: 'in_tissue', 'array_row', 'array_col', 'n_counts', 'n_genes', 'sum', 'detected', 'subsets_mito_sum', 'subsets_mito_detected', 'subsets_mito_percent', 'cluster'
    var: 'gene_ids', 'feature_types', 'genome', 'n_counts', 'highly_variable'
    uns: 'spatial', 'config', 'pca', 'neighbors', 'leiden'
    obsm: 'spatial', 'geometry', 'X_pca'
    varm: 'PCs'
    layers: 'counts', 'logcounts'
    obsp: 'distances', 'connectivities'

In [14]:
andata016_.var_names

Index(['Xkr4', 'Lypla1', 'Rgs20', 'Rb1cc1', 'Pcmtd1', 'Sntg1', 'Rrs1',
       'Vcpip1', 'Sgk3', 'Cops5',
       ...
       'Rab9', 'Prps2', 'Frmpd4', 'Msl3', 'Hccs', 'Kdm5d', 'Eif2s3y', 'Uty',
       'Ddx3y', 'Vamp7'],
      dtype='object', length=6350)

In [8]:
mouse_gene_ids = andata016_.var['gene_ids'].to_list()
mouse_gene_ids

['ENSMUSG00000051951',
 'ENSMUSG00000025903',
 'ENSMUSG00000002459',
 'ENSMUSG00000025907',
 'ENSMUSG00000051285',
 'ENSMUSG00000025909',
 'ENSMUSG00000061024',
 'ENSMUSG00000045210',
 'ENSMUSG00000025915',
 'ENSMUSG00000025917',
 'ENSMUSG00000056763',
 'ENSMUSG00000048960',
 'ENSMUSG00000005886',
 'ENSMUSG00000025935',
 'ENSMUSG00000025937',
 'ENSMUSG00000025925',
 'ENSMUSG00000079658',
 'ENSMUSG00000025940',
 'ENSMUSG00000042686',
 'ENSMUSG00000025777',
 'ENSMUSG00000025931',
 'ENSMUSG00000025933',
 'ENSMUSG00000028033',
 'ENSMUSG00000026155',
 'ENSMUSG00000026154',
 'ENSMUSG00000026153',
 'ENSMUSG00000026141',
 'ENSMUSG00000033569',
 'ENSMUSG00000048874',
 'ENSMUSG00000026058',
 'ENSMUSG00000004768',
 'ENSMUSG00000026127',
 'ENSMUSG00000045174',
 'ENSMUSG00000037470',
 'ENSMUSG00000010453',
 'ENSMUSG00000001143',
 'ENSMUSG00000001138',
 'ENSMUSG00000026116',
 'ENSMUSG00000026112',
 'ENSMUSG00000026110',
 'ENSMUSG00000060771',
 'ENSMUSG00000058407',
 'ENSMUSG00000026083',
 'ENSMUSG00

In [9]:
from pyorthomap import findOrthologsMmHs

In [10]:
from pybiomart import Server
server = Server(host='http://www.ensembl.org')
server.marts['ENSEMBL_MART_ENSEMBL'].list_datasets()
from pyorthomap import findOrthologsMmHs

human_genes_df = findOrthologsMmHs(from_filters = 'link_ensembl_gene_id', from_values = mouse_gene_ids).map()
human_genes_df

  0%|          | 0/22 [00:00<?, ?it/s]

Unnamed: 0,hgnc_symbol,human_ensembl_gene_id,mouse_ensembl_gene_id,external_gene_name
0,SCYL3,ENSG00000000457,ENSMUSG00000026584,Scyl3
1,ALS2,ENSG00000003393,ENSMUSG00000026024,Als2
2,CLK1,ENSG00000013441,ENSMUSG00000026034,Clk1
3,RB1CC1,ENSG00000023287,ENSMUSG00000025907,Rb1cc1
4,PREX2,ENSG00000046889,ENSMUSG00000048960,Prex2
...,...,...,...,...
45,WNK3,ENSG00000196632,ENSMUSG00000041245,Wnk3
46,NBDY,ENSG00000204272,ENSMUSG00000086316,Nbdy
47,,,ENSMUSG00000031370,Zrsr2
48,,,ENSMUSG00000069049,Eif2s3y


In [17]:
mouse_to_human_dict = dict(zip(human_genes_df['mouse_ensembl_gene_id'].values, human_genes_df['human_ensembl_gene_id'].values))
mouse_to_human_dict

{'ENSMUSG00000026584': 'ENSG00000000457',
 'ENSMUSG00000026024': 'ENSG00000003393',
 'ENSMUSG00000026034': 'ENSG00000013441',
 'ENSMUSG00000025907': 'ENSG00000023287',
 'ENSMUSG00000048960': 'ENSG00000046889',
 'ENSMUSG00000006005': 'ENSG00000047410',
 'ENSMUSG00000026020': 'ENSG00000055044',
 'ENSMUSG00000026600': 'ENSG00000057252',
 'ENSMUSG00000034220': 'ENSG00000063660',
 'ENSMUSG00000026098': 'ENSG00000064933',
 'ENSMUSG00000026311': 'ENSG00000065802',
 'ENSMUSG00000025935': 'ENSG00000067167',
 'ENSMUSG00000026313': 'ENSG00000068024',
 'ENSMUSG00000026434': 'ENSG00000069275',
 'ENSMUSG00000066877': 'ENSG00000071051',
 'ENSMUSG00000026074': 'ENSG00000071054',
 'ENSMUSG00000026110': 'ENSG00000071073',
 'ENSMUSG00000026207': 'ENSG00000072195',
 'ENSMUSG00000064302': 'ENSG00000074054',
 'ENSMUSG00000070565': 'ENSG00000075391',
 'ENSMUSG00000026116': 'ENSG00000075568',
 'ENSMUSG00000070738': 'ENSG00000077044',
 'ENSMUSG00000073557': 'ENSG00000077157',
 'ENSMUSG00000026187': 'ENSG000000

In [15]:
human_genes_df['human_ensembl_gene_id'].values

array(['ENSG00000000457', 'ENSG00000003393', 'ENSG00000013441', ..., nan,
       nan, nan], dtype=object)

In [28]:
andata016Filter = andata016_.copy()
andata016Filter

AnnData object with n_obs × n_vars = 21445 × 6350
    obs: 'in_tissue', 'array_row', 'array_col', 'n_counts', 'n_genes', 'sum', 'detected', 'subsets_mito_sum', 'subsets_mito_detected', 'subsets_mito_percent', 'cluster'
    var: 'gene_ids', 'feature_types', 'genome', 'n_counts', 'highly_variable'
    uns: 'spatial', 'config', 'pca', 'neighbors', 'leiden'
    obsm: 'spatial', 'geometry', 'X_pca'
    varm: 'PCs'
    layers: 'counts', 'logcounts'
    obsp: 'distances', 'connectivities'

In [29]:
def replace_and_filter(mouse_id):
    return mouse_to_human_dict.get(mouse_id, None)
# Apply the replacement function
andata016Filter.var['human_gene_ids'] = andata016Filter.var['gene_ids'].apply(replace_gene_id)

In [30]:
# Create a boolean mask to identify valid entries
valid_entries = andata016Filter.var['human_gene_ids'].notna()
valid_entries

Xkr4        True
Lypla1      True
Rgs20       True
Rb1cc1      True
Pcmtd1      True
           ...  
Kdm5d       True
Eif2s3y    False
Uty         True
Ddx3y       True
Vamp7      False
Name: human_gene_ids, Length: 6350, dtype: bool

In [32]:
andata016Filter = andata016Filter[:,valid_entries]
andata016Filter

View of AnnData object with n_obs × n_vars = 21445 × 6238
    obs: 'in_tissue', 'array_row', 'array_col', 'n_counts', 'n_genes', 'sum', 'detected', 'subsets_mito_sum', 'subsets_mito_detected', 'subsets_mito_percent', 'cluster'
    var: 'gene_ids', 'feature_types', 'genome', 'n_counts', 'highly_variable', 'human_gene_ids'
    uns: 'spatial', 'config', 'pca', 'neighbors', 'leiden'
    obsm: 'spatial', 'geometry', 'X_pca'
    varm: 'PCs'
    layers: 'counts', 'logcounts'
    obsp: 'distances', 'connectivities'

In [33]:
# Drop the original mouse gene_ids column and rename the new column
andata016Filter.var['gene_ids'] = andata016Filter.var['human_gene_ids']
andata016Filter.var['gene_ids']

Xkr4      ENSG00000206579
Lypla1    ENSG00000120992
Rgs20     ENSG00000147509
Rb1cc1    ENSG00000023287
Pcmtd1    ENSG00000168300
               ...       
Msl3      ENSG00000005302
Hccs      ENSG00000004961
Kdm5d     ENSG00000012817
Uty       ENSG00000183878
Ddx3y     ENSG00000067048
Name: gene_ids, Length: 6238, dtype: object

In [34]:
andata016_ = andata016Filter.copy()

In [38]:
andata016_.var['gene_ids'].values

array(['ENSG00000206579', 'ENSG00000120992', 'ENSG00000147509', ...,
       'ENSG00000012817', 'ENSG00000183878', 'ENSG00000067048'],
      dtype=object)

In [40]:
metadata = andata016_.obs[['cluster']].copy()
metadata['Cell'] = metadata.index
metadata.rename(columns={'cluster': 'cell_type'}, inplace=True)
metadata = metadata[['Cell', 'cell_type']]
#counts = pd.DataFrame(andata016_.X.T.todense(), index=andata016_.var['gene_ids'].values, columns=andata016_.obs_names)
counts = pd.DataFrame(andata016_.X.T.todense(), index=andata016_.var['gene_ids'].values, columns=andata016_.obs_names)

In [41]:
counts

Unnamed: 0,s_016um_00342_00082-1,s_016um_00156_00322-1,s_016um_00342_00054-1,s_016um_00036_00081-1,s_016um_00225_00065-1,s_016um_00052_00149-1,s_016um_00156_00102-1,s_016um_00109_00312-1,s_016um_00284_00131-1,s_016um_00204_00145-1,...,s_016um_00124_00306-1,s_016um_00308_00250-1,s_016um_00335_00304-1,s_016um_00046_00224-1,s_016um_00302_00195-1,s_016um_00128_00159-1,s_016um_00288_00288-1,s_016um_00039_00175-1,s_016um_00037_00193-1,s_016um_00144_00329-1
ENSG00000206579,0.000000,0.000000,0.000000,1.257069,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.994712,0.0,0.000000,0.000000,0.000000
ENSG00000120992,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,1.044829,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000
ENSG00000147509,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,1.044829,0.000000,0.000000,...,0.000000,0.000000,0.699779,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000
ENSG00000023287,0.000000,1.853601,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,1.120497,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000
ENSG00000168300,0.000000,0.000000,0.541553,0.000000,0.0,0.0,1.258512,0.000000,0.812963,1.127325,...,0.000000,0.000000,0.000000,1.309783,0.0,0.000000,0.0,0.000000,1.806128,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000005302,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.994712,0.0,0.000000,0.000000,0.000000
ENSG00000004961,0.000000,0.000000,0.934386,0.000000,0.0,0.0,0.000000,0.000000,0.000000,1.127325,...,0.000000,0.000000,0.000000,0.000000,0.0,0.994712,0.0,1.423303,0.000000,0.000000
ENSG00000012817,0.000000,0.000000,0.541553,0.000000,0.0,0.0,1.258512,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.994712,0.0,0.000000,0.000000,1.374395
ENSG00000183878,0.000000,0.000000,0.541553,0.000000,0.0,0.0,0.000000,1.044829,0.000000,0.000000,...,0.000000,1.288141,0.000000,1.309783,0.0,0.000000,0.0,1.423303,0.000000,0.000000


In [42]:
metadata

Unnamed: 0,Cell,cell_type
s_016um_00342_00082-1,s_016um_00342_00082-1,11
s_016um_00156_00322-1,s_016um_00156_00322-1,3
s_016um_00342_00054-1,s_016um_00342_00054-1,11
s_016um_00036_00081-1,s_016um_00036_00081-1,1
s_016um_00225_00065-1,s_016um_00225_00065-1,10
...,...,...
s_016um_00128_00159-1,s_016um_00128_00159-1,7
s_016um_00288_00288-1,s_016um_00288_00288-1,4
s_016um_00039_00175-1,s_016um_00039_00175-1,3
s_016um_00037_00193-1,s_016um_00037_00193-1,1


In [14]:
import pickle
import os

In [43]:
pathout = '/data/kanferg/Sptial_Omics/SpatialOmicsToolkit/out_2'

# save pickle
with open(os.path.join(pathout,'test_meta.pickle'), 'wb') as f:
    pickle.dump(metadata, f)
with open(os.path.join(pathout,'test_counts.pickle'), 'wb') as f:
    pickle.dump(counts, f)
# save text
# metadata.to_csv(os.path.join(pathout, 'test_meta.txt'), sep='\t', index=False)

# # Saving the counts DataFrame to a .txt file
# counts.to_csv(os.path.join(pathout, 'test_counts.txt'), sep='\t', index=False)

In [44]:
cpdb_file_path = "/data/kanferg/cellphonedb/NatureProtocols2024_case_studies/v5.0.0/cellphonedb.zip"
meta_file_path = "/data/kanferg/Sptial_Omics/SpatialOmicsToolkit/out_2/test_meta.pickle"
counts_file_path = "/data/kanferg/Sptial_Omics/SpatialOmicsToolkit/out_2/test_counts.pickle"
out_path = "/data/kanferg/Sptial_Omics/SpatialOmicsToolkit/cpdb_out"

In [45]:
from cellphonedb.src.core.methods import cpdb_statistical_analysis_method

In [46]:
cpdb_results = cpdb_statistical_analysis_method.call(
         cpdb_file_path = cpdb_file_path,
         meta_file_path = meta_file_path,
         counts_file_path = counts_file_path,
         counts_data = 'ensembl',
         output_path = out_path)

Reading user files...
The following user files were loaded successfully:
/data/kanferg/Sptial_Omics/SpatialOmicsToolkit/out_2/test_counts.pickle
/data/kanferg/Sptial_Omics/SpatialOmicsToolkit/out_2/test_meta.pickle
[ ][CORE][16/07/24-10:16:11][INFO] [Cluster Statistical Analysis] Threshold:0.1 Iterations:1000 Debug-seed:-1 Threads:4 Precision:3
[ ][CORE][16/07/24-10:16:12][INFO] Running Real Analysis
[ ][CORE][16/07/24-10:16:12][INFO] Running Statistical Analysis


100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:37<00:00, 10.23it/s]


[ ][CORE][16/07/24-10:17:50][INFO] Building Pvalues result
[ ][CORE][16/07/24-10:17:50][INFO] Building results
Saved deconvoluted to /data/kanferg/Sptial_Omics/SpatialOmicsToolkit/cpdb_out/statistical_analysis_deconvoluted_07_16_2024_101750.txt
Saved deconvoluted_percents to /data/kanferg/Sptial_Omics/SpatialOmicsToolkit/cpdb_out/statistical_analysis_deconvoluted_percents_07_16_2024_101750.txt
Saved means to /data/kanferg/Sptial_Omics/SpatialOmicsToolkit/cpdb_out/statistical_analysis_means_07_16_2024_101750.txt
Saved pvalues to /data/kanferg/Sptial_Omics/SpatialOmicsToolkit/cpdb_out/statistical_analysis_pvalues_07_16_2024_101750.txt
Saved significant_means to /data/kanferg/Sptial_Omics/SpatialOmicsToolkit/cpdb_out/statistical_analysis_significant_means_07_16_2024_101750.txt
