In this notebook, I will demonstrate an example using melanoma data and assign gene information from the 5k panel.


```bash
source myconda 
mamba activate stlearn-env  
jupyter notebook --ip localhost --port $PORT1 --no-browser 
```

In [2]:
import cupy as cp
import cupyx
import scanpy as sc
import numpy as np
import pandas as pd
#from cupyx.scipy.sparse import csr_matrix
import os
from PIL import Image
from sklearn.linear_model import LinearRegression
import pickle
import pickle
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy.sparse import csr_matrix
import pickle

In [3]:
xenium_5k_pan_file_csv = "/data/kanferg/Sptial_Omics/projects/NguyenLab/spatialomicstoolkit/metadata_5kpan/XeniumPrimeMouse5Kpan_tissue_pathways_metadata.csv"
xenium_file = "/data/kanferg/Sptial_Omics/projects/NguyenLab/spatialomicstoolkit/data_out/andata_save_batch_51.h5ad"
pathout = "/data/kanferg/Sptial_Omics/SpatialOmicsToolkit/out_5"

In [4]:
pan_5k = pd.read_csv(xenium_5k_pan_file_csv)
pan_5k.head(3)

Unnamed: 0,gene_name,gene_id,num_codewords,num_probesets,protein_name,location,cell_type,cellchat_pathway
0,A1cf,ENSMUSG00000052595,2,2,APOBEC1 complementation factor (APOBEC1-stimul...,Nucleus,,
1,A2m,ENSMUSG00000030111,2,2,Alpha-2-macroglobulin-P (Alpha-2-macroglobulin),Secreted,glioblast;Bergmann glial cell,
2,Aatf,ENSMUSG00000018697,2,2,Protein AATF (Apoptosis-antagonizing transcrip...,Nucleus,,


In [5]:
# remove all the cell_type is na
pan_5k["anoonat"] = pd.notna(pan_5k['cell_type']).values
pan_5k_filter_na = pan_5k.loc[pan_5k["anoonat"],:]
len(pan_5k_filter_na)

2124

In [6]:
from tqdm import tqdm
df_pan5k = pd.DataFrame({"gene":[],"ctype":[]})
for index, row in tqdm(pan_5k_filter_na.iterrows()):
    cell_type_compressed = pd.unique(row["cell_type"])
    cell_type_uncompressed = cell_type_compressed[0].split(';')
    test_temp = pd.DataFrame({"gene":[row["gene_name"]]*len(cell_type_uncompressed),"ctype":cell_type_uncompressed})
    df_pan5k = pd.concat([df_pan5k,test_temp])

2124it [00:01, 1286.09it/s]


In [7]:
ctyp_curr = pd.unique(df_pan5k['ctype'])[0]
curr_gene = df_pan5k.loc[df_pan5k['ctype']==ctyp_curr,'gene'].to_list()


In [8]:
marker_genes_5kp = {}
for ctyp_curr in tqdm(pd.unique(df_pan5k['ctype'])):
    marker_genes_5kp[ctyp_curr] = list(set(df_pan5k.loc[df_pan5k['ctype']==ctyp_curr,'gene'].to_list()))

100%|██████████████████████████████████████████████████████████████████████████| 310/310 [00:00<00:00, 1028.29it/s]


In [9]:
andata = sc.read_h5ad(xenium_file)
sc.pp.normalize_total(andata)
sc.pp.log1p(andata)
sc.pp.scale(andata, max_value=10)
sc.pp.pca(andata, n_comps=30,random_state=1337)



In [10]:
sc.pp.neighbors(andata, n_pcs=15, use_rep='X_pca', n_neighbors=25)
sc.tl.leiden(andata, random_state=1337, resolution=0.5, key_added='cluster')

2024-12-03 12:35:04.583873: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
marker_genes_5kp.keys()

310

In [11]:
sc.pl.dotplot(andata, marker_genes_5kp, groupby="cluster")

  dot_ax.scatter(x, y, **kwds)


ValueError: Image size of 342992x519 pixels is too large. It must be less than 2^16 in each direction.

<Figure size 342992x520 with 5 Axes>

In [69]:
andata.var

Unnamed: 0,gene_ids,feature_types,genome,mean,std
A1cf,ENSMUSG00000052595,Gene Expression,Unknown,0.000164,0.013814
A2m,ENSMUSG00000030111,Gene Expression,Unknown,0.001768,0.043983
Aatf,ENSMUSG00000018697,Gene Expression,Unknown,0.100744,0.254069
Abca1,ENSMUSG00000015243,Gene Expression,Unknown,0.091430,0.254412
Abca13,ENSMUSG00000004668,Gene Expression,Unknown,0.002636,0.049034
...,...,...,...,...,...
Zswim9,ENSMUSG00000070814,Gene Expression,Unknown,0.007130,0.066430
Zup1,ENSMUSG00000039531,Gene Expression,Unknown,0.049494,0.190927
Zyx,ENSMUSG00000029860,Gene Expression,Unknown,0.213926,0.411522
Zzef1,ENSMUSG00000055670,Gene Expression,Unknown,0.070107,0.222990


Using xenium provided annotation list was not helpful. So let’s try a different strategy. 
## Melanoma infiltration of stromal and immune cells - Human Cell Atlas

In [47]:
path_mtx = "/data/kanferg/Sptial_Omics/projects/NguyenLab/data/reference_scrnaseq/human_cell_atlas/E-EHCA-2-quantification-filtered-files/E-EHCA-2.expression_tpm.mtx"
path_obs = "/data/kanferg/Sptial_Omics/projects/NguyenLab/data/reference_scrnaseq/human_cell_atlas/E-EHCA-2-normalised-files/E-EHCA-2.aggregated_filtered_normalised_counts.mtx_rows"
path_var = "/data/kanferg/Sptial_Omics/projects/NguyenLab/data/reference_scrnaseq/human_cell_atlas/E-EHCA-2-normalised-files/E-EHCA-2.aggregated_filtered_normalised_counts.mtx_cols"
base_path = "/data/kanferg/Sptial_Omics/projects/NguyenLab/data/reference_scrnaseq/human_cell_atlas"
path_celltype = base_path+"/ExpDesign-E-EHCA-2.tsv"

In [48]:
# celltype matrix
mtx_ctype = pd.read_csv(path_celltype, sep='\t')
len(mtx_ctype)

6638

In [49]:
# Load the matrix
rdata = sc.read_mtx(path_mtx)
rdata

AnnData object with n_obs × n_vars = 26760 × 5914

In [50]:
rdata.var_names

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '5904', '5905', '5906', '5907', '5908', '5909', '5910', '5911', '5912',
       '5913'],
      dtype='object', length=5914)

In [44]:
cell_obs = pd.read_csv(path_var, sep='\t', header=None)
cell_obs

Unnamed: 0,0
0,21784_6_10
1,21784_6_100
2,21784_6_101
3,21784_6_102
4,21784_6_103
...,...
5909,22467_8_95
5910,22467_8_96
5911,22467_8_97
5912,22467_8_98


In [None]:
adata.obs = 

https://github.com/Teichlab/celltypist

In [11]:
import celltypist
import celltypist as ct
from celltypist import models

In [4]:
#Download all the available models.
models.download_models()
#Update all models by re-downloading the latest versions if you think they may be outdated.
models.download_models(force_update = True)

📂 Storing models in /home/kanferg/.celltypist/data/models
⏩ Skipping [1/50]: Immune_All_Low.pkl (file exists)
⏩ Skipping [2/50]: Immune_All_High.pkl (file exists)
💾 Downloading model [3/50]: Adult_COVID19_PBMC.pkl
⏩ Skipping [4/50]: Adult_CynomolgusMacaque_Hippocampus.pkl (file exists)
⏩ Skipping [5/50]: Adult_Human_PancreaticIslet.pkl (file exists)
⏩ Skipping [6/50]: Adult_Human_Skin.pkl (file exists)
⏩ Skipping [7/50]: Adult_Mouse_Gut.pkl (file exists)
⏩ Skipping [8/50]: Adult_Mouse_OlfactoryBulb.pkl (file exists)
⏩ Skipping [9/50]: Adult_Pig_Hippocampus.pkl (file exists)
⏩ Skipping [10/50]: Adult_RhesusMacaque_Hippocampus.pkl (file exists)
⏩ Skipping [11/50]: Autopsy_COVID19_Lung.pkl (file exists)
⏩ Skipping [12/50]: COVID19_HumanChallenge_Blood.pkl (file exists)
⏩ Skipping [13/50]: COVID19_Immune_Landscape.pkl (file exists)
⏩ Skipping [14/50]: Cells_Adult_Breast.pkl (file exists)
⏩ Skipping [15/50]: Cells_Fetal_Lung.pkl (file exists)
⏩ Skipping [16/50]: Cells_Human_Tonsil.pkl (file

In [6]:
model = models.Model.load()

In [8]:
import cupy as cp
import cupyx
import scanpy as sc
import numpy as np
import pandas as pd
from cupyx.scipy.sparse import csr_matrix
import os
from PIL import Image
from sklearn.linear_model import LinearRegression
import pickle
import pickle
import esda
import pandas as pd
import geopandas as gpd
from geopandas import GeoDataFrame
import libpysal as lps
from libpysal.weights import W
import numpy as np
import matplotlib.pyplot as plt
from shapely.geometry import Point
import seaborn as sns 
from scipy.sparse import csr_matrix
import pickle
path = "/data/kanferg/Sptial_Omics/playGround/Data/Xenium/output_temp"
pathout = "/data/kanferg/Sptial_Omics/SpatialOmicsToolkit/out_1"
FilePrefix = "_072824" 
grid = sc.read_h5ad(os.path.join(pathout, "grid_save.h5ad"))
file_path = os.path.join(pathout, "grid_uns_mtracies.pkl")

if os.path.getsize(file_path) > 0:
    with open(file_path, 'rb') as buff:
        grid_uns_mtracies = pickle.load(buff)
else:
    print("File is empty. Cannot load data.")
    grid_uns_mtracies = None
grid.uns = {}
grid.uns['cluster'] = pd.read_csv(os.path.join(pathout, "cluster.csv"))
grid.uns['grid_counts'] = grid_uns_mtracies['grid_counts']
grid.uns['grid_xedges'] = grid_uns_mtracies['grid_xedges']
grid.uns['grid_yedges'] = grid_uns_mtracies['grid_yedges']
grid.uns['lrfeatures'] = pd.read_csv(os.path.join(pathout, "lrfeatures.csv"))
grid.uns['lr_summary'] = pd.read_csv(os.path.join(pathout, "lr_summary.csv"))
sparse_matrix = grid.X
row_sums = sparse_matrix.sum(axis=1)
grid.obs['n_counts'] = np.array(row_sums).flatten()

In [13]:
grid.layers['counts'] = grid.X.copy()
sc.pp.normalize_total(grid)
sc.pp.log1p(grid)
grid.layers['log'] = grid.X.copy()

In [17]:
predictions = ct.annotate(grid, model=model, majority_voting=False, over_clustering="cluster")
# convert back to anndata||
adata = predictions.to_adata()

🔬 Input data has 35389 cells and 1500 genes
🔗 Matching reference genes in the model
🧬 775 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


In [18]:
adata.obs['majority_voting']

grid_112      Endothelial cells
grid_114      Endothelial cells
grid_115      Endothelial cells
grid_116      Endothelial cells
grid_122      Endothelial cells
                    ...        
grid_62417    Endothelial cells
grid_62420          Fibroblasts
grid_62422          Fibroblasts
grid_62423          Fibroblasts
grid_62444          Fibroblasts
Name: majority_voting, Length: 35389, dtype: category
Categories (3, object): ['Endothelial cells', 'Fibroblasts', 'Mast cells']