In [1]:
import warnings
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", RuntimeWarning)

In [2]:
import scanpy as sc
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scvi

In [None]:
import torch
print("CUDA available:", torch.cuda.is_available())

In [3]:
import celltypist
from celltypist import models

Data from [czi](https://cellxgene.cziscience.com/datasets) collection of breast cancer 6 samples.

In [4]:
ref_name = "/data/kanferg/Sptial_Omics/playGround/Data/Breast_Cancer/ref/ccdb972d-6655-43ae-9ad8-f895bd893d8a.h5ad"
adata_ref_init = sc.read_h5ad(ref_name)

In [5]:
pd.unique(adata_ref_init.obs['tissue'])

['breast', 'liver', 'brain', 'chest wall', 'skin epidermis', 'axilla', 'bone spine']
Categories (7, object): ['breast', 'brain', 'skin epidermis', 'liver', 'axilla', 'bone spine', 'chest wall']

In [6]:
# select all the breast samples
adata_ref_tisueRM = adata_ref_init[adata_ref_init.obs['tissue']=='breast',:].copy()
# select all the protein_coding
adata_ref = adata_ref_tisueRM[:,adata_ref_tisueRM.var['feature_type']=='protein_coding'].copy()
adata_ref

AnnData object with n_obs × n_vars = 34164 × 18626
    obs: 'condition', 'replicate', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'RNA_snn_res.0.8', 'seurat_clusters', 'labels_score', 'Order', 'Lane', 'Index', 'cancer', 'reference', 'flowcell', 'min_umis', 'min_genes', 'percent_mito', 'expected_cells', 'total_droplets', 'z_dim', 'z_layers', 'channel_id', 'labels_cl_unif_per_channel', 'filt_median_genes', 'filt_median_umi', 'pass', 'ccpm_id', 'htapp', 'sequenced', 'stage_at_diagnosis', 'metastatic_presentation', 'biopsy_days_after_metastasis', 'ER_primary', 'ER_biopsy', 'PR_primary', 'PR_biopsy', 'HER2_primary', 'HER2_biopsy', 'receptors_primary', 'receptors_biopsy', 'site_biopsy', 'histology_breast', 'histology_biopsy', 'sampleid', 'cnv_cors', 'cnv_cors_max', 'cnv_score', 'cnv_ref_score', 'cnv_score_norm', 'cnv_score_norm_norm', 'cnv_condition', 'cnv_score_norm_norm2', 'pam50_Basal_single', 'pam50_Her2_single', 'pam50_LumA_single', 'pam50_LumB_single', 'pam50_Normal_single', 'pam50_m

In [7]:
pd.unique(adata_ref.var['feature_name'])

['SAMD11', 'NOC2L', 'KLHL17', 'PLEKHN1', 'PERM1', ..., 'ENSG00000278633.1', 'ENSG00000276345.1', 'ENSG00000275063.1', 'ENSG00000271254.7', 'ENSG00000268674.2']
Length: 18626
Categories (18626, object): ['A1BG', 'A1CF', 'A2M', 'A2ML1', ..., 'ZYG11B', 'ZYX', 'ZZEF1', 'ZZZ3']

In [8]:
pd.unique(adata_ref.obs['cell_type'])

['malignant cell', 'blood vessel endothelial cell', 'adipocyte', 'fibroblast', 'T cell', ..., 'macrophage', 'blood vessel smooth muscle cell', 'mature NK T cell', 'endothelial cell of hepatic sinusoid', 'chondrocyte']
Length: 11
Categories (11, object): ['fibroblast', 'blood vessel endothelial cell', 'T cell', 'adipocyte', ..., 'mature NK T cell', 'malignant cell', 'blood vessel smooth muscle cell', 'endothelial cell of hepatic sinusoid']

In [9]:
pd.unique(adata_ref.obs['Lane'])

['5', '4', '3']
Categories (3, object): ['3', '4', '5']

In [10]:
pd.unique(adata_ref.obs['tissue'])

['breast']
Categories (1, object): ['breast']

In [11]:
adata_ref.obs["cell_type"]

cellid
HTAPP-225-SMP-6756-TST-channel1_TGCATCCAGTTGCTGT-1                   malignant cell
HTAPP-225-SMP-6756-TST-channel1_TGCAGTATCCTGCCAT-1    blood vessel endothelial cell
HTAPP-225-SMP-6756-TST-channel1_TCACGCTAGCATGTTC-1                   malignant cell
HTAPP-225-SMP-6756-TST-channel1_GGAGGATTCCCTCAAC-1                   malignant cell
HTAPP-225-SMP-6756-TST-channel1_ATCACAGTCTTGGTCC-1                        adipocyte
                                                                  ...              
HTAPP-806-SMP-6789-TST-channel2_ATGCCTCTCTAGGCCG-1                       macrophage
HTAPP-806-SMP-6789-TST-channel2_TGAACGTCAGCTACTA-1                       macrophage
HTAPP-806-SMP-6789-TST-channel2_GGGACCTAGGTAATCA-1                           T cell
HTAPP-806-SMP-6789-TST-channel2_AGACACTCACCTGTCT-1                           T cell
HTAPP-806-SMP-6789-TST-channel2_AATGAAGTCCGAGTGC-1                           T cell
Name: cell_type, Length: 34164, dtype: category
Categories (11, objec

In [12]:
adata_ref.var.head(3)

Unnamed: 0,n_cells,mt,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,feature_is_filtered,feature_name,feature_reference,feature_biotype,feature_length,feature_type
ENSG00000187634,19705,False,19705,0.047509,95.281856,19842.0,False,SAMD11,NCBITaxon:9606,gene,1731,protein_coding
ENSG00000188976,50223,False,50223,0.12355,87.974658,51600.0,False,NOC2L,NCBITaxon:9606,gene,1244,protein_coding
ENSG00000187961,8648,False,8648,0.020752,97.929332,8667.0,False,KLHL17,NCBITaxon:9606,gene,934,protein_coding


In [21]:
adata_ref.var.index = adata_ref.var["feature_name"].astype(str).values

In [26]:
adata_ref.var["n_cells"]

SAMD11               19705
NOC2L                50223
KLHL17                8648
PLEKHN1              12560
PERM1                 1881
                     ...  
ENSG00000278633.1       29
ENSG00000276345.1      701
ENSG00000275063.1       11
ENSG00000271254.7     5541
ENSG00000268674.2       10
Name: n_cells, Length: 18626, dtype: int64

In [27]:
from anndata import AnnData
adata = AnnData(adata_ref.X, obs={"CellType": adata_ref.obs["cell_type"].values,"nCount_RNA":adata_ref.obs["nCount_RNA"].values} , var ={"n_cells":adata_ref.var["n_cells"].values, "feature_name":adata_ref.var["feature_name"].astype(str).values} )
adata

AnnData object with n_obs × n_vars = 34164 × 18626
    obs: 'CellType', 'nCount_RNA'
    var: 'n_cells', 'feature_name'

In [30]:
# convertig to none catagorical data: adata.var['feature_name'].categories.tolist()
adata.var.index = adata.var["feature_name"].values

In [35]:
adata.obs.groupby('CellType').size()

CellType
fibroblast                               2797
blood vessel endothelial cell            2816
T cell                                   1274
adipocyte                                 595
chondrocyte                                 1
macrophage                                639
plasma cell                                31
mature NK T cell                           25
malignant cell                          25685
blood vessel smooth muscle cell           300
endothelial cell of hepatic sinusoid        1
dtype: int64

In [36]:
len(adata)

34164

In [37]:
remove_cell = ['endothelial cell of hepatic sinusoid','chondrocyte']
adata = adata[~adata.obs['CellType'].isin(remove_cell), :]
adata.obs.groupby('CellType').size()

CellType
fibroblast                          2797
blood vessel endothelial cell       2816
T cell                              1274
adipocyte                            595
macrophage                           639
plasma cell                           31
mature NK T cell                      25
malignant cell                     25685
blood vessel smooth muscle cell      300
dtype: int64

In [38]:
np.sort(np.array(adata.X.sum(axis=1)))

array([[4524.5474],
       [5261.9346],
       [4531.005 ],
       ...,
       [1107.055 ],
       [1050.8928],
       [ 991.8363]], dtype=float32)

In [47]:
def plot_dist(andata,column,ax,type = 'obs', bins = 'auto',title = '',xlab = '',ylab =''):
    '''
    You can replace 'auto' with any other method (e.g., 'fd', 'doane', 'scott', 'rice', 'sturges', or 'sqrt')
    '''
    palette1 = sns.color_palette("colorblind",9)
    if type == 'obs':
        arr = andata.obs[column].values
    else:
        arr = andata.var[column].values
    bin_edges = np.histogram_bin_edges(arr, bins='auto')
    # Calculate bin edges using NumPy's 'auto' method
    # Calculate bin width
    bin_width = bin_edges[1] - bin_edges[0]
    set_image_para()
    sns.histplot(arr, binwidth=bin_width,palette=palette1,ax = ax, kde=True)
    ax.set_ylabel(ylab)
    ax.set_xlabel(xlab)
    ax.set_title(title)

In [39]:
#andata.var_names_make_unique()
adata.layers["count"] = adata.X
sc.pp.normalize_total(adata, target_sum = 1e4)
sc.pp.log1p(adata)

In [40]:
rdata = adata.copy()
rdata = rdata[~rdata.obs.CellType.isna()]

In [41]:
np.sort(np.array(rdata.X.sum(axis=1)))

array([[5554.1914 ],
       [5980.73   ],
       [5470.9233 ],
       ...,
       [1131.4521 ],
       [1081.2574 ],
       [1011.51495]], dtype=float32)

In [42]:
rdata

View of AnnData object with n_obs × n_vars = 34162 × 18626
    obs: 'CellType', 'nCount_RNA'
    var: 'n_cells', 'feature_name'
    uns: 'log1p'
    layers: 'count'

In [43]:
rdata.var_names

Index(['SAMD11', 'NOC2L', 'KLHL17', 'PLEKHN1', 'PERM1', 'HES4', 'ISG15',
       'AGRN', 'RNF223', 'C1orf159',
       ...
       'ENSG00000276017.1', 'ENSG00000278817.1', 'ENSG00000277196.4',
       'ENSG00000277630.4', 'ENSG00000278384.1', 'ENSG00000278633.1',
       'ENSG00000276345.1', 'ENSG00000275063.1', 'ENSG00000271254.7',
       'ENSG00000268674.2'],
      dtype='object', length=18626)

In [44]:
ref_model = celltypist.train(rdata, labels = 'CellType', n_jobs = 22,
                            use_SGD = False,
                            feature_selection = True, top_genes = 300)

🍳 Preparing data before training
✂️ 665 non-expressed genes are filtered out
🔬 Input data has 34162 cells and 17961 genes
⚖️ Scaling input data
🏋️ Training data using SGD logistic regression
🔎 Selecting features
🧬 1870 features are selected
🏋️ Starting the second round of training
🏋️ Training data using logistic regression
✅ Model training done!


In [45]:
ref_path = "/data/kanferg/Sptial_Omics/playGround/Data/Breast_Cancer/ref/"
ref_model.write(ref_path + 'ref.pkl')

In [46]:
model_low = models.Model.load(model="Immune_All_Low.pkl")

In [83]:
pathout = "/data/kanferg/Sptial_Omics/SpatialOmicsToolkit/out_4"
andata = sc.read_h5ad(os.path.join(pathout, "adata_concat_BreastCancer_harmony.h5ad"))

In [84]:
andata

AnnData object with n_obs × n_vars = 293960 × 1094
    obs: 'in_tissue', 'array_row', 'array_col', 'n_genes_by_counts', 'total_counts', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'cluster', 'batch', 'cluster_harmony'
    var: 'gene_ids', 'feature_types', 'genome', 'MT', 'highly_variable', 'n_cells_by_counts-0', 'total_counts-0', 'mean_counts-0', 'pct_dropout_by_counts-0', 'log1p_total_counts-0', 'log1p_mean_counts-0', 'highly_variable_rank-0', 'means-0', 'variances-0', 'variances_norm-0', 'mean-0', 'std-0', 'n_cells_by_counts-1', 'total_counts-1', 'mean_counts-1', 'pct_dropout_by_counts-1', 'log1p_total_counts-1', 'log1p_mean_counts-1', 'highly_variable_rank-1', 'means-1', 'variances-1', 'variances_norm-1', 'mean-1', 'std-1'
    uns: 'before', 'harmony', 'leiden', 'pca', 'umap'
    obsm: 'X_before_umap', 'X_pca', 'X_pca_before', 'X_umap', 'harmony_umap', 'spatial'
    varm: 'PCs'
    layers: 'counts', 'log'
    obsp: 'b

In [48]:
andata.X = andata.layers['counts']
sc.pp.normalize_total(andata, target_sum = 1e4)
sc.pp.log1p(andata)

In [49]:
predictions = celltypist.annotate(andata, model=model_low, majority_voting=False)

🔬 Input data has 293960 cells and 1094 genes
🔗 Matching reference genes in the model
🧬 687 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


In [50]:
ref_model

CellTypist model with 9 cell types and 1870 features
    date: 2024-11-12 16:08:27.219132
    cell types: T cell, adipocyte, ..., plasma cell
    features: MXRA8, MORN1, ..., MT-ND3

In [51]:
predictions_adata = predictions.to_adata()
andata.obs["low_label"] = predictions_adata.obs.loc[andata.obs.index, "predicted_labels"]
andata.obs["low_score"] = predictions_adata.obs.loc[andata.obs.index, "conf_score"]

In [53]:
predictions = celltypist.annotate(andata, model=ref_model, majority_voting=False)
predictions_adata = predictions.to_adata()
andata.obs["ref_label"] = predictions_adata.obs.loc[andata.obs.index, "predicted_labels"]
andata.obs["ref_score"] = predictions_adata.obs.loc[andata.obs.index, "conf_score"]

🔬 Input data has 293960 cells and 1094 genes
🔗 Matching reference genes in the model
🧬 452 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


In [56]:
predictions = andata.obs[['low_label', 'low_score', 'ref_label', 'ref_score']]

predictions

Unnamed: 0,low_label,low_score,ref_label,ref_score
s_016um_00107_00066-1-0,Double-positive thymocytes,0.011057,malignant cell,0.882547
s_016um_00126_00213-1-0,Follicular helper T cells,0.043939,blood vessel endothelial cell,0.445729
s_016um_00329_00125-1-0,CD16- NK cells,0.050305,malignant cell,0.430702
s_016um_00184_00346-1-0,Epithelial cells,0.012153,malignant cell,0.881227
s_016um_00258_00092-1-0,Fibroblasts,0.984018,macrophage,0.999984
...,...,...,...,...
s_016um_00001_00157-1-1,Regulatory T cells,0.011739,malignant cell,0.970702
s_016um_00404_00257-1-1,Fibroblasts,0.981071,fibroblast,1.000000
s_016um_00412_00400-1-1,Double-positive thymocytes,0.017417,malignant cell,0.988538
s_016um_00361_00190-1-1,Fibroblasts,0.071051,fibroblast,0.067396


<center><H1>scVI

In [65]:
# needs row count data and celltype in ref dataset and query data set. also batch
rdata.obs['batch'] = 'ref'
rdata.X = rdata.layers['count']

# load breastCancer data
pathout = "/data/kanferg/Sptial_Omics/SpatialOmicsToolkit/out_4"
andata_bc = sc.read_h5ad(os.path.join(pathout, "adata_concat_BreastCancer_harmony.h5ad"))
andata_bc.obs['CellType'] = 'Unknown'
andata_bc.X = andata_bc.layers['counts']

In [78]:
andata = sc.concat((andata_bc,rdata))
scvi.model.SCVI.setup_anndata(andata, batch_key='batch')
vae = scvi.model.SCVI(andata,use_gpu=True)
vae.train()

Trainer will use only 1 of 3 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=3)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Training:   0%|          | 0/24 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=24` reached.


In [79]:
lvae = scvi.model.SCANVI.from_scvi_model(vae, adata = andata, unlabeled_category = 'Unknown',
                                        labels_key = 'CellType')
lvae.train(max_epochs=20, n_samples_per_label=100)

[34mINFO    [0m Training for [1;36m20[0m epochs.                                                                                   


Trainer will use only 1 of 3 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=3)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Training:   0%|          | 0/20 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=20` reached.


In [80]:
andata.obs['predicted'] = lvae.predict(andata)

In [81]:
andata.obs['transfer_score'] = lvae.predict(soft = True).max(axis = 1)

In [82]:
andata

AnnData object with n_obs × n_vars = 328122 × 1066
    obs: 'batch', 'CellType', '_scvi_batch', '_scvi_labels', 'predicted', 'transfer_score'
    uns: '_scvi_uuid', '_scvi_manager_uuid'

In [None]:
andata_