In [1]:
import scanpy as sc
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import celltypist
from celltypist import models
import anndata as ad
import warnings
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", RuntimeWarning)

In [6]:
tm_droplet = sc.read_h5ad('/data/kanferg/Sptial_Omics/projects/NguyenLab/data/czi/TM_droplet.h5ad')
tm_facs = sc.read_h5ad('/data/kanferg/Sptial_Omics/projects/NguyenLab/data/czi/TM_facs.h5ad')

In [7]:
tm_droplet = tm_droplet[tm_droplet.obs.tissue=="Liver"].copy()
tm_facs = tm_facs[tm_facs.obs.tissue=="Liver"].copy()

gene_len = pd.read_csv(
    "https://raw.githubusercontent.com/chenlingantelope/HarmonizationSCANVI/master/data/gene_len.txt",
    delimiter=" ",
    header=None,
    index_col=0,
)

In [9]:
tm_droplet.obs['CellType'] = tm_droplet.obs['cell_ontology_class']
tm_facs.obs['CellType'] = tm_facs.obs['cell_ontology_class']
gene_len = gene_len.reindex(tm_facs.var.index).dropna()
tm_facs = tm_facs[:, gene_len.index]
assert (tm_facs.var.index == gene_len.index).sum() == tm_facs.shape[1]
tm_facs.X = tm_facs.X / gene_len[1].values * np.median(gene_len[1].values)
# round to integer
tm_facs.X = np.rint(tm_facs.X)

In [14]:
adata = ad.concat([tm_droplet, tm_facs],join='inner', merge='same')

In [15]:
adata

AnnData object with n_obs × n_vars = 10153 × 18244
    obs: 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_genes', 'sex', 'subtissue', 'tissue', 'CellType'

In [16]:
adata.layers["counts"] = adata.X.copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [17]:
sc.pp.filter_genes(adata, min_cells = 50)
adata

AnnData object with n_obs × n_vars = 10153 × 13185
    obs: 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_genes', 'sex', 'subtissue', 'tissue', 'CellType'
    var: 'n_cells'
    uns: 'log1p'
    layers: 'counts'

In [18]:
ref_model = celltypist.train(adata, labels = 'CellType', n_jobs = 22,
                            use_SGD = False,
                            feature_selection = True, top_genes = 300)

🍳 Preparing data before training
🔬 Input data has 10153 cells and 13185 genes
⚖️ Scaling input data
🏋️ Training data using SGD logistic regression
🔎 Selecting features
🧬 3303 features are selected
🏋️ Starting the second round of training
🏋️ Training data using logistic regression
✅ Model training done!


In [29]:
# del adata

In [21]:
pathout = "/data/kanferg/Sptial_Omics/projects/NatalieLab/liver_cancer/spatialomicstoolkit/out_1"
andata = sc.read_h5ad(os.path.join(pathout, "andata_filter_logNorm_hvg_spatialleiden.h5ad"))

In [22]:
andata.X = andata.layers['counts']
sc.pp.normalize_total(andata, target_sum=1e4)
sc.pp.log1p(andata)

In [24]:
def predict_cells(adata):
    predictions = celltypist.annotate(adata, model=ref_model, majority_voting=False)
    predictions_adata = predictions.to_adata()
    adata.obs["ref_label"] = predictions_adata.obs.loc[adata.obs.index, "predicted_labels"]
    adata.obs["ref_score"] = predictions_adata.obs.loc[adata.obs.index, "conf_score"]
    return adata.obs

In [25]:
predictions = predict_cells(andata)

🔬 Input data has 447727 cells and 5000 genes
🔗 Matching reference genes in the model
🧬 878 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


In [26]:
predictions.head(3)

Unnamed: 0,n_genes_by_counts,total_counts,log1p_n_genes_by_counts,log1p_total_counts,total_counts_MT,pct_counts_MT,log1p_total_counts_MT,cluster,spatialleiden,predicted_labels,conf_score,ref_label,ref_score
s_008um_00269_00526-1,499,995.0,6.214608,6.903747,0.0,0.0,0.0,8,1,hepatocyte,0.978664,hepatocyte,0.978664
s_008um_00484_00168-1,262,412.0,5.572154,6.023448,0.0,0.0,0.0,3,8,hepatocyte,0.993176,hepatocyte,0.993176
s_008um_00547_00611-1,359,744.0,5.886104,6.613384,0.0,0.0,0.0,4,3,hepatocyte,0.998166,hepatocyte,0.998166


In [27]:
andata.obs['ref_label']

s_008um_00269_00526-1    hepatocyte
s_008um_00484_00168-1    hepatocyte
s_008um_00547_00611-1    hepatocyte
s_008um_00693_00628-1    hepatocyte
s_008um_00260_00253-1    hepatocyte
                            ...    
s_008um_00610_00321-1    hepatocyte
s_008um_00565_00596-1    hepatocyte
s_008um_00307_00022-1    hepatocyte
s_008um_00172_00448-1    hepatocyte
s_008um_00247_00283-1    hepatocyte
Name: ref_label, Length: 447727, dtype: category
Categories (5, object): ['Kupffer cell', 'NK cell', 'endothelial cell of hepatic sinusoid', 'hepatocyte', 'myeloid leukocyte']

In [28]:
# andata.obs = andata.obs.merge(right = predictions, left_index=True, right_index=True)
# andata.obs 

In [33]:
import sys
for cluster, group in andata.obs.groupby('spatialleiden'):
    # Clear the console
    os.system('cls' if os.name == 'nt' else 'clear')
    print(f"Cluster: {cluster}")
    print(f"Labels: {group['ref_label'].value_counts()}")
    print(f"Mode: {group['ref_label'].mode()}")
    # Wait for user response
    
    input("\nPress Enter to continue to the next cluster...")

[H[2JCluster: 0
Labels: ref_label
hepatocyte                              76557
endothelial cell of hepatic sinusoid      108
Kupffer cell                                4
NK cell                                     1
myeloid leukocyte                           0
Name: count, dtype: int64
Mode: 0    hepatocyte
Name: ref_label, dtype: category
Categories (5, object): ['Kupffer cell', 'NK cell', 'endothelial cell of hepatic sinusoid', 'hepatocyte', 'myeloid leukocyte']

Press Enter to continue to the next cluster...
[H[2JCluster: 1
Labels: ref_label
hepatocyte                              64462
Kupffer cell                              250
endothelial cell of hepatic sinusoid       26
NK cell                                     1
myeloid leukocyte                           0
Name: count, dtype: int64
Mode: 0    hepatocyte
Name: ref_label, dtype: category
Categories (5, object): ['Kupffer cell', 'NK cell', 'endothelial cell of hepatic sinusoid', 'hepatocyte', 'myeloid leukocyte']

Pre