In [1]:
# Seed for reproducibility
import torch
import numpy as np
import pandas as pd
import scanpy as sc
from typing import Tuple

import sys
sys.path.append('..')
from tools.formating.formating import *
from tools.utils.utils import run_dimension_reduction
from tools.evaluation.monitor import *

# scVI imports
import scvi
print(scvi.__version__)
from scvi.model.utils import mde
import pymde

# Pin the NumPy and PyTorch RNGs to one shared seed so repeated runs are comparable.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# scanpy verbosity levels: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 0

  warn(
Global seed set to 0


0.20.3


In [2]:
# Location of the .h5ad file to cluster. The variable is called `input_path`
# (not `input`) so the built-in input() is not shadowed for the rest of the
# kernel session.
input_path = "/ps/ai-ready/data/Clustering/Tabula_Sapiens/TS_Skin_test.h5ad"

# load_anndata is a project helper (presumably from tools.formating — confirm);
# the trailing bare expression displays the AnnData summary.
adata = load_anndata(input_path)
adata

/ps/ai-ready/data/Clustering/Tabula_Sapiens/TS_Skin_test.h5ad


AnnData object with n_obs × n_vars = 725 × 3853
    obs: 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'doublet_scores', 'predicted_doublets'
    var: 'gene_symbol', 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'n_cells', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: '_scvi', '_training_mode', 'cell_ontology_class_colors', 'dendrogram_cell_type_tissue', 'dendrogram_comput

In [3]:
# Create the resource monitor; a later cell calls monitor.stop() to collect
# the time, CPU and memory samples.
# NOTE(review): the argument 1 is presumably the sampling interval in seconds
# (the recorded timestamps are ~1 s apart) — confirm against the Monitor class.
monitor = Monitor(1)

In [4]:
# scVI models raw (non-normalized) counts, so register the "raw_counts" layer
# as the model input rather than whatever is stored in adata.X.
scvi.model.SCVI.setup_anndata(
    adata,
    layer="raw_counts"
)

No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)


In [5]:
# Build an scVI model on the registered AnnData, leaving every hyperparameter
# at the library default; the bare `model` expression displays its summary.
model = scvi.model.SCVI(adata)
model



In [6]:
# Train with the library's default settings. In the recorded run this meant
# 400 epochs on CPU (no GPU was available), taking roughly one minute.
model.train()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


Epoch 400/400: 100%|█████████████████████████████████████████████████████████| 400/400 [01:05<00:00,  6.48it/s, loss=1.28e+03, v_num=1]

`Trainer.fit` stopped: `max_epochs=400` reached.


Epoch 400/400: 100%|█████████████████████████████████████████████████████████| 400/400 [01:05<00:00,  6.15it/s, loss=1.28e+03, v_num=1]


In [7]:
# Extract the per-cell latent embedding from the trained model and store it
# under obsm["X_scVI"] so later cells can use it as the representation.
latent = model.get_latent_representation()
adata.obsm["X_scVI"] = latent

In [8]:
# Decode scVI-normalized expression once and reuse the result.
# Previously the decoder was run twice: once at library_size=1e4 (result never
# used) and once at library_size=10e4 (= 1e5) for the stored layer. The two
# scales disagreed, so a single named constant now drives both.
# NOTE(review): 1e4 is kept because it is the value the named `denoised`
# result already used; if the layer was really meant to be scaled to 1e5,
# change SCVI_LIBRARY_SIZE here.
SCVI_LIBRARY_SIZE = 1e4

denoised = model.get_normalized_expression(adata, library_size=SCVI_LIBRARY_SIZE)
adata.layers["scvi_normalized"] = denoised

In [9]:
# Sanity check that the "raw_counts" layer holds count-like values.
# Note: .toarray() converts the whole layer to a dense array just for display.
adata.layers["raw_counts"].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]], dtype=float32)

In [10]:
# Number of latent dimensions in the stored scVI embedding.
adata.obsm["X_scVI"].shape[1]

10

In [11]:
# Run the project's dimension-reduction helper on the scVI embedding.
# n_pcs is derived from the embedding width instead of a hard-coded 10, so it
# stays correct if the model's latent size is ever changed.
# NOTE(review): the obs table shown later contains leiden_X_scVI /
# louvain_X_scVI columns, presumably added by this helper — confirm.
scvi_latent_dim = adata.obsm["X_scVI"].shape[1]
adata = run_dimension_reduction(
    adata,
    use_rep="X_scVI",
    n_pcs=scvi_latent_dim,
    n_neighbors=15,
)

In [12]:
# Compute an MDE layout of the scVI latent space with scvi's `mde` helper and
# store it under obsm["X_mde"] (presumably a low-dimensional embedding meant
# for visualization — confirm).
adata.obsm["X_mde"] = mde(adata.obsm["X_scVI"])

In [13]:
# Stop the monitor and unpack what it collected: sample timestamps, CPU usage
# and memory usage (three parallel lists, displayed in the next cells).
time_points, cpu_usage, mem_usage = monitor.stop()

In [14]:
# Sample timestamps; the recorded values look like Unix epoch seconds spaced
# about one second apart.
time_points

[1700543559.2054741,
 1700543560.209833,
 1700543561.2118258,
 1700543562.213993,
 1700543563.2188718,
 1700543564.2208657,
 1700543565.2235122,
 1700543566.2254753,
 1700543567.2276356,
 1700543568.2297757,
 1700543569.231599,
 1700543570.235101,
 1700543571.2377546,
 1700543572.2397528,
 1700543573.2438443,
 1700543574.2472901,
 1700543575.2494252,
 1700543576.2514162,
 1700543577.2535086,
 1700543578.255871,
 1700543579.2591426,
 1700543580.261378,
 1700543581.2634895,
 1700543582.2656667,
 1700543583.267905,
 1700543584.2712798,
 1700543585.2737305,
 1700543586.2759404,
 1700543587.2786517,
 1700543588.2813435,
 1700543589.2845485,
 1700543590.288331,
 1700543591.2899723,
 1700543592.2969956,
 1700543593.300046,
 1700543594.3025682,
 1700543595.3048747,
 1700543596.3070018,
 1700543597.309115,
 1700543598.31188,
 1700543599.314225,
 1700543600.3163292,
 1700543601.3184187,
 1700543602.3204885,
 1700543603.3226953,
 1700543604.3248866,
 1700543605.327333,
 1700543606.3295157,
 17005

In [15]:
# CPU usage per sample (presumably percent — confirm against the Monitor class).
cpu_usage

[0.0,
 29.8,
 42.6,
 43.0,
 43.3,
 43.2,
 41.8,
 42.4,
 41.7,
 41.2,
 41.4,
 40.9,
 41.4,
 41.3,
 41.8,
 41.4,
 41.5,
 41.0,
 40.8,
 41.5,
 41.9,
 40.7,
 41.7,
 42.4,
 42.3,
 41.5,
 41.7,
 38.9,
 40.0,
 41.5,
 41.7,
 41.0,
 42.0,
 40.7,
 40.7,
 41.1,
 40.7,
 40.4,
 40.9,
 40.8,
 40.4,
 40.3,
 40.8,
 40.8,
 42.2,
 40.7,
 41.0,
 41.4,
 40.8,
 40.8,
 40.6,
 40.1,
 41.5,
 40.9,
 41.7,
 39.9,
 40.1,
 40.8,
 40.8,
 41.0,
 41.2,
 41.1,
 40.9,
 40.0,
 40.7,
 40.8,
 23.9,
 65.0,
 74.6,
 83.1,
 91.6,
 12.3,
 1.9,
 1.8,
 1.8,
 1.8,
 1.8,
 1.8,
 1.8,
 1.8,
 14.0,
 58.5,
 50.0,
 20.5,
 1.8,
 20.0,
 70.8,
 75.5]

In [16]:
# Memory usage per sample (presumably percent — confirm against the Monitor class).
mem_usage

[3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7,
 3.7]

In [17]:
# Inspect the per-cell annotations; at this point they include the
# leiden_X_scVI and louvain_X_scVI cluster columns used for scoring below.
adata.obs

Unnamed: 0_level_0,organ_tissue,method,donor,anatomical_information,n_counts_UMIs,n_genes,cell_ontology_class,free_annotation,manually_annotated,compartment,...,log1p_total_counts_hb,pct_counts_hb,outlier,mt_outlier,doublet_scores,predicted_doublets,_scvi_batch,_scvi_labels,leiden_X_scVI,louvain_X_scVI
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGTTATGGA_TSP10_Skin_NA_10X_1_1,Skin,10X,TSP10,,2695.0,890,"cd8-positive, alpha-beta memory t cell",CD8-positive memory T cell,True,immune,...,0.693147,0.036900,False,False,0.050685,False,0,0,2,1
AACAAGACAGCAGGAT_TSP10_Skin_NA_10X_1_1,Skin,10X,TSP10,,3207.0,1033,"cd4-positive, alpha-beta memory t cell",CD4-positive memory T cell,True,immune,...,0.693147,0.030998,False,False,0.070626,False,0,0,2,1
AAGATAGAGCCATATC_TSP10_Skin_NA_10X_1_1,Skin,10X,TSP10,,3564.0,1148,"naive thymus-derived cd8-positive, alpha-beta ...",Naive CD8-positive T cell,True,immune,...,0.000000,0.000000,False,False,0.060601,False,0,0,4,2
AAGGTAACAATCCTAG_TSP10_Skin_NA_10X_1_1,Skin,10X,TSP10,,4133.0,1187,t cell,alpha-beta T cell,True,immune,...,0.693147,0.024027,False,False,0.024758,False,0,0,2,1
AAGTCGTAGCAGTAAT_TSP10_Skin_NA_10X_1_1,Skin,10X,TSP10,,3731.0,1737,stromal cell,stromal cells,True,stromal,...,0.000000,0.000000,False,False,0.357937,True,0,0,7,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGATGGAGTGGCGAT_TSP14_Skin_Chest_10X_1_1,Skin,10X,TSP14,Chest,4114.0,1490,t cell,alpha-beta T cell,True,immune,...,0.000000,0.000000,False,False,0.058805,False,0,0,9,8
TTGCCTGGTTACCCTC_TSP14_Skin_Chest_10X_1_1,Skin,10X,TSP14,Chest,16208.0,3578,stromal cell,stromal cells,True,stromal,...,0.000000,0.000000,False,False,0.027185,False,0,0,3,0
TTGTGGACACAGTGTT_TSP14_Skin_Chest_10X_1_1,Skin,10X,TSP14,Chest,25516.0,4128,endothelial cell,endothelial cells,True,endothelial,...,1.791759,0.019158,False,False,0.043766,False,0,0,1,3
TTGTTGTAGTATAGAC_TSP14_Skin_Chest_10X_1_1,Skin,10X,TSP14,Chest,10031.0,2639,stromal cell,stromal cells,True,stromal,...,0.693147,0.009888,False,False,0.040084,False,0,0,3,0


In [19]:
from sklearn.metrics.cluster import adjusted_rand_score

# Adjusted Rand index between the annotated cell types and the Leiden
# clusters; the trailing expression displays the value.
ari = adjusted_rand_score(
    adata.obs["cell_ontology_class"],
    adata.obs["leiden_X_scVI"],
)
ari

0.43429292819059256

In [21]:
# Number of distinct annotated cell types, for comparison with the clustering.
adata.obs["cell_ontology_class"].nunique()

22

In [27]:
# from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score as ARI
from sklearn.metrics import normalized_mutual_info_score as NMI
from sklearn.metrics import silhouette_score
# from sklearn.mixture import GaussianMixture as GMM

def clustering_scores(labels, labels_pred, embedding):
    """Compute, print and return three clustering quality metrics.

    Args:
        labels: Reference label for each sample.
        labels_pred: Predicted cluster label for each sample.
        embedding: Per-sample coordinates used for the silhouette score.

    Returns:
        Tuple ``(silhouette, nmi, ari)``. The silhouette is computed from
        ``embedding`` grouped by ``labels`` (the reference labels, not the
        predictions); NMI and ARI compare ``labels`` with ``labels_pred``.
    """
    report_template = "Clustering Scores:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f"
    # Tuple order matches both the report placeholders and the return contract.
    scores = (
        silhouette_score(embedding, labels),
        NMI(labels, labels_pred),
        ARI(labels, labels_pred),
    )
    print(report_template % scores)
    return scores

In [29]:
# Score the Leiden clusters against the annotated cell types.
# NOTE(review): the silhouette is evaluated on the MDE layout (X_mde) rather
# than on the scVI latent space (X_scVI) that the cluster column is named
# after — confirm this is intended, since a visualization layout can distort
# distances.
asw_score, nmi_score, ari_score = clustering_scores(adata.obs["cell_ontology_class"], adata.obs["leiden_X_scVI"], adata.obsm["X_mde"])

Clustering Scores:
Silhouette: 0.0631
NMI: 0.6572
ARI: 0.4343
