In [1]:
import scanpy as sc
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np

In [10]:
import warnings
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", RuntimeWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
import numba
from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning
warnings.simplefilter("ignore", category=NumbaDeprecationWarning)
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

In [11]:
import celltypist
from celltypist import models

In [2]:
models.get_all_models()

🔎 No available models. Downloading...
📜 Retrieving model list from server https://celltypist.cog.sanger.ac.uk/models/models.json
📚 Total models in list: 54
📂 Storing models in /root/.celltypist/data/models
💾 Downloading model [1/54]: Immune_All_Low.pkl
💾 Downloading model [2/54]: Immune_All_High.pkl
💾 Downloading model [3/54]: Adult_COVID19_PBMC.pkl
💾 Downloading model [4/54]: Adult_CynomolgusMacaque_Hippocampus.pkl
💾 Downloading model [5/54]: Adult_Human_MTG.pkl
💾 Downloading model [6/54]: Adult_Human_PancreaticIslet.pkl
💾 Downloading model [7/54]: Adult_Human_PrefrontalCortex.pkl
💾 Downloading model [8/54]: Adult_Human_Skin.pkl
💾 Downloading model [9/54]: Adult_Human_Vascular.pkl
💾 Downloading model [10/54]: Adult_Mouse_Gut.pkl
💾 Downloading model [11/54]: Adult_Mouse_OlfactoryBulb.pkl
💾 Downloading model [12/54]: Adult_Pig_Hippocampus.pkl
💾 Downloading model [13/54]: Adult_RhesusMacaque_Hippocampus.pkl
💾 Downloading model [14/54]: Autopsy_COVID19_Lung.pkl
💾 Downloading model [15/54]

['Mouse_Postnatal_DentateGyrus.pkl',
 'Developing_Human_Thymus.pkl',
 'Adult_Human_PrefrontalCortex.pkl',
 'Human_Developmental_Retina.pkl',
 'Fetal_Human_Pancreas.pkl',
 'Adult_Human_PancreaticIslet.pkl',
 'Human_Embryonic_YolkSac.pkl',
 'Cells_Adult_Breast.pkl',
 'Mouse_Whole_Brain.pkl',
 'Fetal_Human_Pituitary.pkl',
 'Human_Colorectal_Cancer.pkl',
 'Developing_Human_Brain.pkl',
 'Adult_Mouse_OlfactoryBulb.pkl',
 'Adult_CynomolgusMacaque_Hippocampus.pkl',
 'Human_Longitudinal_Hippocampus.pkl',
 'Adult_Human_MTG.pkl',
 'Adult_Human_Skin.pkl',
 'Mouse_Dentate_Gyrus.pkl',
 'Developing_Mouse_Brain.pkl',
 'Mouse_Isocortex_Hippocampus.pkl',
 'Adult_Mouse_Gut.pkl',
 'Cells_Lung_Airway.pkl',
 'Cells_Fetal_Lung.pkl',
 'Cells_Human_Tonsil.pkl',
 'Fetal_Human_Retina.pkl',
 'Immune_All_Low.pkl',
 'Fetal_Human_Skin.pkl',
 'Adult_COVID19_PBMC.pkl',
 'Healthy_COVID19_PBMC.pkl',
 'Developing_Human_Gonads.pkl',
 'Developing_Mouse_Hippocampus.pkl',
 'Nuclei_Lung_Airway.pkl',
 'Human_PF_Lung.pkl',
 'De

In [7]:
filename = os.path.basename('../../../oscb/user_storage/Benchmarks/facs-Bladder_1751302627486/QC/results/313b1738828fdf0d5157af2b12a71be6/facs_Bladder_MAGIC_imputation.h5ad')

In [2]:
os.path.dirname('../../../oscb/user_storage/Benchmarks/facs-Bladder_1751302627486/QC/results/313b1738828fdf0d5157af2b12a71be6/facs_Bladder_MAGIC_imputation.h5ad')

'../../../oscb/user_storage/Benchmarks/facs-Bladder_1751302627486/QC/results/313b1738828fdf0d5157af2b12a71be6'

In [8]:
filename.split(".")[0]

'facs_Bladder_MAGIC_imputation'

In [34]:
def run_celltypist(adata, model_name, refs = [], labels = None, species = 'mouse'):
    model = celltypist.Model.load(model_name)
    if species == 'mouse' and "Mouse" not in model_name:
        model.convert()
    adata = reset_x_to_raw(adata)

    sc.pp.filter_genes(adata, min_cells = 10)
    sc.pp.normalize_total(adata, target_sum=1e4) #not recommended for typical pp
    sc.pp.log1p(adata)
    
    if type(adata.X) != np.ndarray:
        adata.X = adata.X.toarray()
    
    predictions = celltypist.annotate(adata, model=model, majority_voting=True)
    predictions_adata = predictions.to_adata()
    adata.obs["celltypist_label"] = predictions_adata.obs.loc[adata.obs.index, "predicted_labels"]
    adata.obs["celltypist_score"] = predictions_adata.obs.loc[adata.obs.index, "conf_score"]

    if len(refs) > 0 and labels is not None:
        for input in refs:
            try:
                name = os.path.basename(input).split(".")[0]
                ref_ad = sc.read_h5ad(input)
                ref_ad = reset_x_to_raw(ref_ad)
                sc.pp.filter_genes(ref_ad, min_cells = 10)
                sc.pp.normalize_total(ref_ad, target_sum = 1e4) #Note this is only for cell annotation, recommended by authors but not best
                sc.pp.log1p(ref_ad)

                ref_ad = ref_ad[~ref_ad.obs[labels].isna()]
                ref_model = celltypist.train(ref_ad, labels = labels, n_jobs = 4, use_SGD = False, feature_selection = True, top_genes = 300)
                ref_predictions = celltypist.annotate(adata, model=ref_model, majority_voting=False)
                ref_predictions_adata = ref_predictions.to_adata()
                adata.obs["ref_"+name+"_label"] = ref_predictions_adata.obs.loc[adata.obs.index, "predicted_labels"]
                adata.obs["ref_"+name+"_score"] = ref_predictions_adata.obs.loc[adata.obs.index, "conf_score"]
            except Exception as e:
                print(e)
                continue

    return adata

In [15]:
def reset_x_to_raw(adata, min_genes=200):
        if is_normalized(adata.X, min_genes) and not check_nonnegative_integers(adata.X):
            if "raw_counts" in adata.layers.keys():
                adata.layers["normalized_X"] = adata.X.copy()
                adata.X = adata.layers['raw_counts'].copy()
            elif adata.raw.X is not None:
                adata.layers["normalized_X"] = adata.X.copy()
                adata.X = adata.raw.X.copy()
            else:
                raise ValueError("Raw counts are not available.")
        
        return adata

In [28]:
from typing import Optional, Union
import scipy.sparse as sp_sparse
from scipy.sparse import csr_matrix
import h5py
from anndata._core.sparse_dataset import SparseDataset
import jax
import jax.numpy as jnp

def is_normalized(expression_matrix, min_genes=200):
    if (not isinstance(expression_matrix, np.ndarray)):
        expression_matrix = expression_matrix.toarray()

    if np.min(expression_matrix) < 0 or np.max(expression_matrix) < min_genes:
        return True
    else:
        return False
        

def check_nonnegative_integers(
    data: Union[pd.DataFrame, np.ndarray, sp_sparse.spmatrix, h5py.Dataset],
    n_to_check: int = 20,
):
    """Approximately checks values of data to ensure it is count data."""
    # for backed anndata
    if isinstance(data, h5py.Dataset) or isinstance(data, SparseDataset):
        data = data[:100]

    if isinstance(data, np.ndarray):
        data = data
    elif issubclass(type(data), sp_sparse.spmatrix):
        data = data.data
    elif isinstance(data, pd.DataFrame):
        data = data.to_numpy()
    else:
        raise TypeError("data type not understood")

    ret = True
    if len(data) != 0:
        inds = np.random.choice(len(data), size=(n_to_check,))
        check = jax.device_put(data.flat[inds], device=jax.devices("cpu")[0])
        negative, non_integer = _is_not_count_val(check)
        ret = not (negative or non_integer)
    return ret

In [44]:
adata = sc.read_h5ad('../../../oscb/user_storage/Benchmarks/facs-Bladder_1751302627486/QC/results/313b1738828fdf0d5157af2b12a71be6/facs_Bladder_MAGIC_imputation.h5ad')

In [37]:
adata = run_celltypist(adata, model_name='Adult_Mouse_Gut.pkl', refs=['../../../oscb/user_storage/Benchmarks/facs-Bladder_1751302627486/QC/results/313b1738828fdf0d5157af2b12a71be6/facs_Bladder_MAGIC_imputation.h5ad'], labels='cell_ontology_class')

🔬 Input data has 1378 cells and 1929 genes
🔗 Matching reference genes in the model
🧬 852 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 5
🗳️ Majority voting the predictions
✅ Majority voting done!
🍳 Preparing data before training
🔬 Input data has 1378 cells and 1929 genes
⚖️ Scaling input data
🏋️ Training data using SGD logistic regression
🔎 Selecting features
🧬 300 features are selected
🏋️ Starting the second round of training
🏋️ Training data using logistic regression
✅ Model training done!
🔬 Input data has 1378 cells and 1929 genes
🔗 Matching reference genes in the model
🧬 300 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


In [38]:
adata.obs

Unnamed: 0,orig.ident,n_counts,n_genes,nReads,plate.barcode,mouse.id,tissue,subtissue,FACS.selection,mouse.sex,...,MAGIC_leiden,MAGIC_louvain,predicted_labels,over_clustering,majority_voting,conf_score,celltypist_label,celltypist_score,ref_facs_Bladder_MAGIC_imputation_label,ref_facs_Bladder_MAGIC_imputation_score
A1.B000610.3_56_F.1.1,Bladder,119565.0,364,610727.0,B000610,3_56_F,Bladder,,Multiple,F,...,11,7,bladder cell,39,Fibroblast,0.999537,Fibroblast,0.352776,bladder cell,0.999537
A1.B002764.3_38_F.1.1,Bladder,3184.0,90,320035.0,B002764,3_38_F,Bladder,,Multiple,F,...,24,3,bladder urothelial cell,68,Fibroblast,0.981892,B cell,0.100343,bladder urothelial cell,0.981892
A1.B002771.3_39_F.1.1,Bladder,197586.0,489,1044981.0,B002771,3_39_F,Bladder,,Multiple,F,...,14,4,bladder cell,27,Fibroblast,0.999969,TA,0.980678,bladder cell,0.999969
A1.D041914.3_8_M.1.1,Bladder,70714.0,405,447232.0,D041914,3_8_M,Bladder,,Multiple,M,...,1,5,bladder cell,43,Fibroblast,0.999982,Fibroblast,0.190072,bladder cell,0.999982
A1.D042253.3_9_M.1.1,Bladder,51411.0,528,330249.0,D042253,3_9_M,Bladder,,Multiple,M,...,6,6,bladder cell,58,Fibroblast,0.999872,TA,0.936617,bladder cell,0.999872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P8.MAA000487.3_10_M.1.1,Bladder,269238.0,637,1668602.0,MAA000487,3_10_M,Bladder,,Multiple,M,...,7,15,bladder urothelial cell,3,Enterocyte.2,0.999947,Enterocyte.2,0.288735,bladder urothelial cell,0.999947
P9.B000610.3_56_F.1.1,Bladder,121666.0,522,865291.0,B000610,3_56_F,Bladder,,Multiple,F,...,3,0,bladder urothelial cell,59,TA,0.999834,TA,0.287399,bladder urothelial cell,0.999834
P9.B002771.3_39_F.1.1,Bladder,187199.0,556,1056676.0,B002771,3_39_F,Bladder,,Multiple,F,...,21,1,bladder urothelial cell,31,TA,0.999980,TA,0.626303,bladder urothelial cell,0.999980
P9.D042253.3_9_M.1.1,Bladder,109706.0,612,493818.0,D042253,3_9_M,Bladder,,Multiple,M,...,1,5,bladder cell,43,Fibroblast,0.999937,Fibroblast,0.514495,bladder cell,0.999937


In [50]:
def scvi_transfer(adata, refs = [], labels = None):
    import scvi
    adata = reset_x_to_raw(adata)
    adata.obs['CellType'] = 'Unknown'
    adata.obs['Batch'] = 'Unknown'

    adatas = [sc.read_h5ad(input) for input in refs]
    dater = sc.concat(adatas, join='outer')
    sc.pp.filter_genes(dater, min_cells = 10)
    dater = dater[~dater.obs[labels].isna()]
    dater = reset_x_to_raw(dater)
    dater.obs['CellType'] = dater.obs[labels]
    dater.obs['Batch'] = 'reference'
    dater = sc.concat((adata, dater))
    dater.obs['Sample'] = dater.obs.index

    sc.pp.highly_variable_genes(dater, flavor = 'seurat_v3', n_top_genes=3000, batch_key="Batch", subset = True)
    scvi.model.SCVI.setup_anndata(dater, batch_key='Batch', categorical_covariate_keys = ['Sample'])
    vae = scvi.model.SCVI(dater)
    vae.train(max_epochs = 400, early_stopping = True)

    lvae = scvi.model.SCANVI.from_scvi_model(vae, adata = dater, unlabeled_category = 'Unknown',
                                        labels_key = 'CellType')

    lvae.train(max_epochs=20, n_samples_per_label=100)

    dater.obs['scVI_predicted'] = lvae.predict(dater)
    dater.obs['scVI_transfer_score'] = lvae.predict(soft = True).max(axis = 1)
    dater = dater[dater.obs.Batch == 'Unknown']

    adata.obs = adata.obs.merge(right = dater.obs[['scVI_predicted', 'scVI_transfer_score']], left_index=True, right_index=True)
    adata.obs = adata.obs.drop('CellType', axis=1) 

    return adata

@jax.jit
def _is_not_count_val(data: jnp.ndarray):
    negative = jnp.any(data < 0)
    non_integer = jnp.any(data % 1 != 0)

    return negative, non_integer

In [51]:
adata = scvi_transfer(adata, refs=['../../../oscb/user_storage/Benchmarks/facs-Bladder_1751302627486/QC/results/313b1738828fdf0d5157af2b12a71be6/facs_Bladder_MAGIC_imputation.h5ad'], labels='cell_ontology_class')

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 165/400:  41%|████████████████████████████████████████████▉                                                                | 165/400 [01:31<02:10,  1.81it/s, loss=3.31e+03, v_num=1]
Monitored metric elbo_validation did not improve in the last 45 records. Best score: 3599.561. Signaling Trainer to stop.
[34mINFO    [0m Training for [1;36m20[0m epochs.                                                                                   


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 20/20: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.03s/it, loss=3.3e+03, v_num=1]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 20/20: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.03s/it, loss=3.3e+03, v_num=1]


In [52]:
adata.obs

Unnamed: 0,orig.ident,n_counts,n_genes,nReads,plate.barcode,mouse.id,tissue,subtissue,FACS.selection,mouse.sex,...,seurat_clusters,doublet_score,doublet_class,leiden,louvain,MAGIC_leiden,MAGIC_louvain,Batch,scVI_predicted,scVI_transfer_score
A1.B000610.3_56_F.1.1,Bladder,119565.0,364,610727.0,B000610,3_56_F,Bladder,,Multiple,F,...,0,0.060606,Singlet,1,5,11,7,Unknown,bladder cell,0.997460
A1.B002764.3_38_F.1.1,Bladder,3184.0,90,320035.0,B002764,3_38_F,Bladder,,Multiple,F,...,4,0.090909,Singlet,3,2,24,3,Unknown,bladder urothelial cell,0.882903
A1.B002771.3_39_F.1.1,Bladder,197586.0,489,1044981.0,B002771,3_39_F,Bladder,,Multiple,F,...,0,0.066667,Singlet,1,5,14,4,Unknown,bladder cell,0.998243
A1.D041914.3_8_M.1.1,Bladder,70714.0,405,447232.0,D041914,3_8_M,Bladder,,Multiple,M,...,0,0.030303,Singlet,0,5,1,5,Unknown,bladder cell,0.997618
A1.D042253.3_9_M.1.1,Bladder,51411.0,528,330249.0,D042253,3_9_M,Bladder,,Multiple,M,...,0,0.115152,Singlet,1,0,6,6,Unknown,bladder cell,0.997451
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P8.MAA000487.3_10_M.1.1,Bladder,269238.0,637,1668602.0,MAA000487,3_10_M,Bladder,,Multiple,M,...,1,0.121212,Singlet,2,1,7,15,Unknown,bladder urothelial cell,0.993759
P9.B000610.3_56_F.1.1,Bladder,121666.0,522,865291.0,B000610,3_56_F,Bladder,,Multiple,F,...,4,0.139394,Singlet,3,2,3,0,Unknown,bladder urothelial cell,0.988937
P9.B002771.3_39_F.1.1,Bladder,187199.0,556,1056676.0,B002771,3_39_F,Bladder,,Multiple,F,...,1,0.151515,Singlet,2,3,21,1,Unknown,bladder urothelial cell,0.995391
P9.D042253.3_9_M.1.1,Bladder,109706.0,612,493818.0,D042253,3_9_M,Bladder,,Multiple,M,...,3,0.072727,Singlet,0,4,1,5,Unknown,bladder cell,0.996994
