In [1]:
# Notebook setup: third-party imports, project-local tool imports, and global
# seeding / verbosity configuration (the seeds are set at the end of this cell).
import torch
import numpy as np
import pandas as pd
import scanpy as sc
from typing import Tuple

# Make the parent directory importable so the project's `tools` package resolves.
import sys
sys.path.append('..')
# NOTE(review): wildcard imports hide where names come from; `load_anndata` and
# `Monitor` used below presumably originate here — confirm and import them explicitly.
from tools.formating.formating import *
from tools.utils.reduction import run_dimension_reduction
from tools.evaluation.monitor import *

# scVI imports
import scvi
print(scvi.__version__)
from scvi.model.utils import mde
import pymde

# Seed torch and NumPy for reproducibility, and silence scanpy below error level.
torch.manual_seed(0)
np.random.seed(0)
sc.settings.verbosity = 0  # verbosity: errors (0), warnings (1), info (2), hints (3)

  warn(
Global seed set to 0


0.20.3


In [2]:
# Location of the .h5ad file to benchmark on.
# Bound to `input_path` rather than `input` so the built-in input() is not
# shadowed for the remainder of the kernel session.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable data directory.
input_path = "/ps/ai-ready/data/Clustering/Tabula_Sapiens/TS_Skin_test.h5ad"
adata = load_anndata(input_path)
adata

/ps/ai-ready/data/Clustering/Tabula_Sapiens/TS_Skin_test.h5ad


AnnData object with n_obs × n_vars = 725 × 3853
    obs: 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'doublet_scores', 'predicted_doublets'
    var: 'gene_symbol', 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'n_cells', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: '_scvi', '_training_mode', 'cell_ontology_class_colors', 'dendrogram_cell_type_tissue', 'dendrogram_comput

In [3]:
# Create the resource monitor; its stop() call later in the notebook returns the
# time, CPU, memory and GPU-memory sample series.
# NOTE(review): the argument 1 is presumably the sampling interval in seconds
# (the recorded time_points are ~1 s apart) — confirm against Monitor's definition.
monitor = Monitor(1)

In [4]:
# Register `adata` with scVI and tell it to read its input from the
# "raw_counts" layer (scVI models counts, not normalized expression).
scvi.model.SCVI.setup_anndata(adata, layer="raw_counts")

No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)


In [5]:
# Build an scVI model on `adata` using the library's default hyperparameters
# (no keyword arguments are passed); the bare `model` expression displays it.
model = scvi.model.SCVI(adata)
model



In [6]:
# Fit the model with the library's default training settings (no arguments given).
# In the recorded run this was 400 epochs on CPU, taking roughly one minute.
model.train()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


Epoch 400/400: 100%|█████████████████████████████████████████████████████████████████| 400/400 [01:03<00:00,  6.59it/s, loss=1.28e+03, v_num=1]

`Trainer.fit` stopped: `max_epochs=400` reached.


Epoch 400/400: 100%|█████████████████████████████████████████████████████████████████| 400/400 [01:03<00:00,  6.25it/s, loss=1.28e+03, v_num=1]


In [7]:
# Pull the per-cell latent representation out of the trained model and store it
# in adata.obsm under "X_scVI", the key the later cells read.
latent = model.get_latent_representation()
adata.obsm["X_scVI"] = latent

In [8]:
# Decode scVI-normalized expression once and reuse the result.
# The previous version called get_normalized_expression twice with different
# scales (1e4 for `denoised`, 10e4 == 1e5 for the stored layer), so the two
# results disagreed by 10x and the expensive call ran twice.
# NOTE(review): 1e4 is kept as the single scale; switch LIBRARY_SIZE if the
# 1e5 scaling of the stored layer was intentional.
LIBRARY_SIZE = 1e4
denoised = model.get_normalized_expression(adata, library_size=LIBRARY_SIZE)
adata.layers["scvi_normalized"] = denoised

In [9]:
# Display the "raw_counts" layer as a dense array for a quick visual check.
# NOTE(review): toarray() materializes the whole matrix just for display.
adata.layers["raw_counts"].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]], dtype=float32)

In [10]:
# Width (number of columns) of the scVI latent embedding; 10 in the recorded run.
adata.obsm["X_scVI"].shape[1]

10

In [11]:
# Run the project's dimension-reduction helper on the scVI latent space.
# n_pcs=10 equals the latent width reported by the previous cell.
# NOTE(review): presumably this also performs the clustering that produces the
# "leiden_X_scVI" / "louvain_X_scVI" obs columns used later — confirm in
# tools.utils.reduction.
adata = run_dimension_reduction(adata, use_rep='X_scVI', n_pcs=10, n_neighbors=15)

In [12]:
# Embed the scVI latent space with scvi's `mde` helper and keep the result in
# adata.obsm["X_mde"].
adata.obsm["X_mde"] = mde(adata.obsm["X_scVI"])

In [13]:
# Stop the monitor and unpack the four sample series it returns
# (timestamps, CPU, memory, GPU memory).
time_points, cpu_usage, mem_usage, gpu_mem_usage = monitor.stop()

In [14]:
# Sample timestamps; the recorded values are Unix epoch seconds roughly 1 s apart.
# NOTE(review): this prints the entire list — a slice would keep the output short.
time_points

[1701624491.8286164,
 1701624492.8329494,
 1701624493.833886,
 1701624494.8361275,
 1701624495.8378978,
 1701624496.8408492,
 1701624497.8435674,
 1701624498.8456545,
 1701624499.8489292,
 1701624500.8515346,
 1701624501.853912,
 1701624502.8563924,
 1701624503.8632188,
 1701624504.8652425,
 1701624505.867336,
 1701624506.8700318,
 1701624507.8728025,
 1701624508.8750372,
 1701624509.878413,
 1701624510.8806481,
 1701624511.8837578,
 1701624512.887051,
 1701624513.8893747,
 1701624514.8916342,
 1701624515.8938978,
 1701624516.895957,
 1701624517.8981538,
 1701624518.9001918,
 1701624519.9033468,
 1701624520.9055579,
 1701624521.9078102,
 1701624522.9111528,
 1701624523.913342,
 1701624524.9155886,
 1701624525.91788,
 1701624526.9197366,
 1701624527.921897,
 1701624528.9241438,
 1701624529.9263089,
 1701624530.9297833,
 1701624531.932529,
 1701624532.9342127,
 1701624533.9371543,
 1701624534.9395375,
 1701624535.941685,
 1701624536.945624,
 1701624537.9477863,
 1701624538.9503613,
 1701

In [15]:
# Sample indices 0..N-1, one per monitor time point.
x = list(range(len(time_points)))
x

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85]

In [16]:
# CPU usage samples collected by the monitor, one per time point.
# NOTE(review): values look like percentages — confirm the unit in Monitor.
cpu_usage

[14.3,
 30.1,
 43.5,
 44.9,
 44.8,
 43.5,
 42.8,
 42.2,
 41.8,
 41.1,
 40.5,
 41.5,
 40.3,
 41.6,
 41.1,
 40.5,
 41.3,
 42.4,
 41.7,
 40.9,
 42.0,
 41.8,
 42.3,
 41.3,
 41.6,
 42.4,
 41.7,
 41.5,
 41.2,
 41.2,
 41.4,
 42.4,
 41.9,
 42.0,
 42.1,
 40.5,
 41.5,
 42.0,
 41.6,
 41.6,
 42.5,
 41.4,
 40.6,
 41.8,
 42.3,
 41.6,
 40.7,
 41.8,
 40.0,
 41.2,
 41.9,
 40.8,
 41.7,
 41.6,
 42.5,
 41.0,
 41.3,
 40.4,
 41.8,
 41.4,
 41.7,
 41.6,
 40.4,
 41.1,
 40.8,
 21.9,
 62.5,
 76.4,
 87.1,
 31.2,
 1.8,
 1.8,
 1.9,
 1.8,
 1.8,
 1.8,
 1.9,
 1.9,
 7.7,
 58.2,
 49.7,
 17.3,
 1.8,
 25.1,
 77.3,
 81.2]

In [17]:
# Memory usage samples collected by the monitor, one per time point.
# NOTE(review): values look like percentages — confirm the unit in Monitor.
mem_usage

[5.8,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9,
 5.9]

In [18]:
# gpu_mem_usage

In [19]:
# True when the GPU-memory samples sum to zero, i.e. no GPU memory use was
# recorded (consistent with the CPU-only training log above).
sum(gpu_mem_usage) == 0

True

In [20]:
# Per-cell metadata table; in the recorded output it now carries the
# "leiden_X_scVI" and "louvain_X_scVI" cluster columns alongside the annotations.
adata.obs

Unnamed: 0_level_0,organ_tissue,method,donor,anatomical_information,n_counts_UMIs,n_genes,cell_ontology_class,free_annotation,manually_annotated,compartment,...,log1p_total_counts_hb,pct_counts_hb,outlier,mt_outlier,doublet_scores,predicted_doublets,_scvi_batch,_scvi_labels,leiden_X_scVI,louvain_X_scVI
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGTTATGGA_TSP10_Skin_NA_10X_1_1,Skin,10X,TSP10,,2695.0,890,"cd8-positive, alpha-beta memory t cell",CD8-positive memory T cell,True,immune,...,0.693147,0.036900,False,False,0.050685,False,0,0,2,1
AACAAGACAGCAGGAT_TSP10_Skin_NA_10X_1_1,Skin,10X,TSP10,,3207.0,1033,"cd4-positive, alpha-beta memory t cell",CD4-positive memory T cell,True,immune,...,0.693147,0.030998,False,False,0.070626,False,0,0,2,1
AAGATAGAGCCATATC_TSP10_Skin_NA_10X_1_1,Skin,10X,TSP10,,3564.0,1148,"naive thymus-derived cd8-positive, alpha-beta ...",Naive CD8-positive T cell,True,immune,...,0.000000,0.000000,False,False,0.060601,False,0,0,4,2
AAGGTAACAATCCTAG_TSP10_Skin_NA_10X_1_1,Skin,10X,TSP10,,4133.0,1187,t cell,alpha-beta T cell,True,immune,...,0.693147,0.024027,False,False,0.024758,False,0,0,2,1
AAGTCGTAGCAGTAAT_TSP10_Skin_NA_10X_1_1,Skin,10X,TSP10,,3731.0,1737,stromal cell,stromal cells,True,stromal,...,0.000000,0.000000,False,False,0.357937,True,0,0,7,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGATGGAGTGGCGAT_TSP14_Skin_Chest_10X_1_1,Skin,10X,TSP14,Chest,4114.0,1490,t cell,alpha-beta T cell,True,immune,...,0.000000,0.000000,False,False,0.058805,False,0,0,9,8
TTGCCTGGTTACCCTC_TSP14_Skin_Chest_10X_1_1,Skin,10X,TSP14,Chest,16208.0,3578,stromal cell,stromal cells,True,stromal,...,0.000000,0.000000,False,False,0.027185,False,0,0,3,0
TTGTGGACACAGTGTT_TSP14_Skin_Chest_10X_1_1,Skin,10X,TSP14,Chest,25516.0,4128,endothelial cell,endothelial cells,True,endothelial,...,1.791759,0.019158,False,False,0.043766,False,0,0,1,3
TTGTTGTAGTATAGAC_TSP14_Skin_Chest_10X_1_1,Skin,10X,TSP14,Chest,10031.0,2639,stromal cell,stromal cells,True,stromal,...,0.693147,0.009888,False,False,0.040084,False,0,0,3,0


In [21]:
from sklearn.metrics.cluster import adjusted_rand_score
# Adjusted Rand index between the annotated cell types and the Leiden clusters.
ari = adjusted_rand_score(
    labels_true=adata.obs["cell_ontology_class"],
    labels_pred=adata.obs["leiden_X_scVI"],
)
ari

0.43429292819059256

In [22]:
# Number of distinct values in the "cell_ontology_class" annotation
# (22 in the recorded run).
adata.obs["cell_ontology_class"].nunique()

22

In [23]:
# from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score as ARI
from sklearn.metrics import normalized_mutual_info_score as NMI
from sklearn.metrics import silhouette_score
# from sklearn.mixture import GaussianMixture as GMM

def clustering_scores(labels, labels_pred, embedding):
    """Compute, print and return three clustering-quality scores.

    Args:
        labels: Reference label per observation.
        labels_pred: Predicted cluster label per observation.
        embedding: Per-observation coordinates on which the silhouette
            score is evaluated, grouped by ``labels``.

    Returns:
        Tuple ``(silhouette, nmi, ari)``: silhouette of ``embedding`` under
        ``labels``, then NMI and ARI between ``labels`` and ``labels_pred``.
    """
    report_template = "Clustering Scores:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f"
    # Evaluated in the same order as the report: silhouette, NMI, ARI.
    scores = (
        silhouette_score(embedding, labels),
        NMI(labels, labels_pred),
        ARI(labels, labels_pred),
    )
    print(report_template % scores)
    return scores

In [24]:
# Score the Leiden clusters against the annotated cell types.
# NOTE(review): the silhouette is evaluated on the MDE embedding ("X_mde"),
# not on the scVI latent space ("X_scVI") — confirm this is intended.
asw_score, nmi_score, ari_score = clustering_scores(
    labels=adata.obs["cell_ontology_class"],
    labels_pred=adata.obs["leiden_X_scVI"],
    embedding=adata.obsm["X_mde"],
)

Clustering Scores:
Silhouette: 0.2075
NMI: 0.6572
ARI: 0.4343


In [25]:
from datetime import datetime

# Scratch check: convert a hard-coded Unix timestamp into a naive datetime in
# the machine's local timezone.
# NOTE(review): the literal is not one of this run's time_points values;
# consider removing this cell or converting time_points[0] instead.
datetime.fromtimestamp(1701102855.5850654)

datetime.datetime(2023, 11, 27, 16, 34, 15, 585065)

In [26]:
# Named series for the resource-usage chart, keyed by the label to display.
y = {
    "CPU": cpu_usage,
    "Memory": mem_usage,
    "GPU": gpu_mem_usage,
}

In [27]:
# Quick check of the container type of `y` (a plain dict) before plotting.
type(y)

dict

In [28]:
from tools.visualization.plot import *

# Build the resource-usage line-chart spec from the monitor samples.
line = plot_line(time_points, y)

# Save the spec's string form to line.txt in the working directory.
with open("line.txt", "w") as out_file:
    out_file.write(str(line))

line

{'data': [{'type': 'scatter',
   'x': [0,
    1,
    2,
    3,
    4,
    5,
    6,
    7,
    8,
    9,
    10,
    11,
    12,
    13,
    14,
    15,
    16,
    17,
    18,
    19,
    20,
    21,
    22,
    23,
    24,
    25,
    26,
    27,
    28,
    29,
    30,
    31,
    32,
    33,
    34,
    35,
    36,
    37,
    38,
    39,
    40,
    41,
    42,
    43,
    44,
    45,
    46,
    47,
    48,
    49,
    50,
    51,
    52,
    53,
    54,
    55,
    56,
    57,
    58,
    59,
    60,
    61,
    62,
    63,
    64,
    65,
    66,
    67,
    68,
    69,
    70,
    71,
    72,
    73,
    74,
    75,
    76,
    77,
    78,
    79,
    80,
    81,
    82,
    83,
    84,
    85],
   'y': [14.3,
    30.1,
    43.5,
    44.9,
    44.8,
    43.5,
    42.8,
    42.2,
    41.8,
    41.1,
    40.5,
    41.5,
    40.3,
    41.6,
    41.1,
    40.5,
    41.3,
    42.4,
    41.7,
    40.9,
    42.0,
    41.8,
    42.3,
    41.3,
    41.6,
    42.4,
    41.7,
    41.5,
 

In [29]:
# Metric names for the bar chart's x axis, and the matching scVI scores in the
# same order (ARI, silhouette, NMI).
x = ["ARI", "Silhouette", "NMI"]
y1 = {"scVI": [ari_score, asw_score, nmi_score]}

In [30]:
from tools.visualization.plot import *

# Build the benchmark bar-chart spec from the metric names and scores.
bar = plot_bar(x, y1)

# Save the spec's string form to bar.txt in the working directory.
with open("bar.txt", "w") as out_file:
    out_file.write(str(bar))

bar

{'data': [{'type': 'bar',
   'x': ['ARI', 'Silhouette', 'NMI'],
   'y': [0.43429292819059256, 0.20750034, 0.6572156935259896],
   'text': [0.43429292819059256, 0.20750034, 0.6572156935259896],
   'textposition': 'auto',
   'name': 'scVI',
   'marker': {'opacity': 0.5}}],
 'layout': {'title': 'Benchmarks',
  'xaxis': {'tickangle': 0},
  'margin': {'r': 50, 'l': 50, 't': 50, 'b': 50},
  'barmode': 'group',
  'hovermode': 'closest',
  'transition': {'duration': 100},
  'autosize': True,
  'width': 1000,
  'height': 750}}