## This is a much bigger dataset than the 3000 PBMC sample, as it contains about 50,000 T-cells. We run the visualizer using sparse data as input, and the performance is still quite good.

In [1]:
# Enable IPython's GUI event-loop integration for macOS (this magic is macOS-specific).
# NOTE(review): presumably required so the viewer window can run alongside the kernel — confirm.
%gui osx
# Load the py5 IPython extension.
%load_ext py5

In [2]:
import pandas as pd
import numpy as np
from scipy.io import mmread
import scipy.sparse as sparse
from sciviewer import SCIViewer
import scanpy as sc

In [3]:
# Load the autoreload extension and use mode 2, which re-imports edited modules
# automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [5]:
# Fetch the example .h5ad file into ../data using shell escapes.
print('DOWNLOADING AND EXTRACTING EXAMPLE DATA')
# -p: do not fail if the directory already exists
! mkdir -p ../data
# NOTE(review): the saved output of this cell shows a different Dropbox share id
# (srzk17uvnhhhsgi) than the one in this command — confirm that this link is current.
! wget https://www.dropbox.com/s/d41kndoe16bi1i1/Tcells50k.h5ad -O ../data/Tcells50k.h5ad

DOWNLOADING AND EXTRACTING EXAMPLE DATA
--2022-09-05 18:52:05--  https://www.dropbox.com/s/srzk17uvnhhhsgi/Tcells50k.h5ad
Resolving www.dropbox.com (www.dropbox.com)... 162.125.4.18
Connecting to www.dropbox.com (www.dropbox.com)|162.125.4.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/srzk17uvnhhhsgi/Tcells50k.h5ad [following]
--2022-09-05 18:52:05--  https://www.dropbox.com/s/raw/srzk17uvnhhhsgi/Tcells50k.h5ad
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc6cdaa5f5ef9c75c533646abfdb.dl.dropboxusercontent.com/cd/0/inline/BsanPLzQni5mlNCoMW0W796sutO-ioq3AJMofbTUYe6ijhWE1qqqf93VJsSLkVsbQF5BzIIAnT9902PUr3Uy__Y0q_KNhoSPjojm_lU7iyROdjkieQmvCoMk1di4aaLrJk3T6KM6qPvBL0bGs1erGWaaMvFBwMXPdEThgSkyFJN29w/file# [following]
--2022-09-05 18:52:06--  https://uc6cdaa5f5ef9c75c533646abfdb.dl.dropboxusercontent.com/cd/0/inline/BsanPLzQni5mlNCoMW0W796sutO-ioq3AJMofbTUYe6ijhWE1qqqf93VJsSLkV

## Load the data as a Scanpy AnnData 

In [6]:
# Read the downloaded .h5ad file with scanpy; `data` is used by every later cell.
data = sc.read('../data/Tcells50k.h5ad')

In [7]:
# Display the expression matrix repr to check its shape and sparse storage format.
data.X

<52899x12563 sparse matrix of type '<class 'numpy.float32'>'
	with 28209469 stored elements in Compressed Sparse Row format>

#### The data is stored in compressed sparse row (CSR) format rather than compressed sparse column (CSC) format, so we need to convert it

In [8]:
# Re-store the expression matrix in compressed sparse column (CSC) layout.
data.X = data.X.tocsc()

#### Now you can pass the AnnData object to sciviewer, and by default it will use the obsm['X_umap'] attribute for the 2D embedding and the .X attribute for the expression data

In [9]:
# The AnnData object is passed on its own: no separate gene names, cell names or
# embedding are supplied here. Per the notebook text, sciviewer reads the embedding
# from obsm['X_umap'] and the expression values from .X by default.

svobj = SCIViewer(data)
# Start the interactive session; the log lines in the saved output come from this call.
svobj.explore_data()

Create renderer
Start thread
Finish thread
Setting up...
0.5217690467834473 seconds to select and project cells
Selected 6870 cells
Calculating correlations...
0.552480936050415 seconds to calculate correlations. Sparsity:  True


In [10]:
## This attribute gets updated in real time whenever cells are selected;
## show its first rows
svobj.selected_cells.head()

Unnamed: 0,index,cell_name,projection
0,0,AAACATACACCCAA-1,0.916526
1,7,AAACATTGCTTCGC-1,0.051023
2,15,AAACCGTGTCACCC-1,0.862504
3,18,AAACGCACGGTACT-1,0.116701
4,26,AAACGGCTGCAGAG-1,0.516208


In [11]:
## This gets updated in real time when cells are selected in directional mode;
## sort ascending by the P column and show the first five rows
svobj.results_proj_correlation.sort_values(by='P').head()

Unnamed: 0,R,P
DUSP2,0.449746,0.0
GZMH,-0.787988,0.0
RPL13,0.59575,0.0
RPS2,0.474982,0.0
IL7R,0.444177,0.0


0.8255929946899414 seconds to select and project cells
Selected 13160 cells
Calculating differential expression...
0.00922703742980957 seconds to calculate genesums. Sparsity:  True
0.07808995246887207 seconds to calculate squared genesums. Sparsity:  True
0.7021546363830566 seconds to calculate differential expression. Sparsity:  True
Selected gene RPL11
Min/max expression level for gene RPL11 0.0 5.7148066


In [12]:
## This gets updated in real time when cells are selected in differential mode;
## sort ascending by the P column and show the first five rows
svobj.results_diffexpr.sort_values(by='P').head()

Unnamed: 0,T,P
PMAIP1,-15.761498,0.0
SLC1A7,-16.943316,0.0
RPLP0,15.777176,0.0
HLA-C,-31.591606,0.0
HLA-B,-37.330349,0.0


## You can re-run the analysis using dense data and see how much slower it is

In [13]:
# Materialize the sparse matrix as a dense DataFrame (rows = cells, columns = genes)
# for the dense-input comparison run.
# toarray() yields a plain ndarray; todense() would return the discouraged np.matrix type.
# Memory: the 52899 x 12563 float32 matrix shown earlier needs roughly 2.7 GB when dense.
expr_dense = pd.DataFrame(data.X.toarray(), columns=data.var.index, index=data.obs.index)

In [14]:
# Peek at the top-left 5x5 corner of the dense expression table.
expr_dense.head(5).iloc[:, :5]

Unnamed: 0,LINC00115,FAM41C,NOC2L,KLHL17,PLEKHN1
AAACATACACCCAA-1,0.0,0.0,0.0,0.0,0.0
AAACATACCCCTCA-1,0.0,0.0,0.0,0.0,0.0
AAACATACCGGAGA-1,0.0,0.0,0.0,0.0,0.0
AAACATACTCTTCA-1,0.0,0.0,0.0,0.0,0.0
AAACATACTGGATC-1,0.0,0.0,0.0,0.0,0.0


In [15]:
# Pull the embedding stored under 'X_umap' out of the AnnData object so it can be
# handed to the viewer as a separate argument.
umap = data.obsm['X_umap']

EXPORTING DATA...
BYE


2022-09-05 18:54:20.343 python[55569:1345364] NewtNSView::dealloc: softLock still hold @ dealloc!


#### Here we pass the expression data as a pandas DataFrame and the UMAP embedding as a NumPy ndarray

In [16]:
# Build a second viewer from the dense DataFrame plus the separately extracted embedding,
# to compare timings against the sparse run.
svobj_dense = SCIViewer(expr_dense, umap)
svobj_dense.explore_data()

Create renderer
Start thread
Finish thread
Setting up...
0.5447080135345459 seconds to select and project cells
Selected 7000 cells
Calculating correlations...
2.945772171020508 seconds to calculate correlations. Sparsity:  False
EXPORTING DATA...
BYE


2022-09-05 18:54:56.581 python[55569:1345364] NewtNSView::dealloc: softLock still hold @ dealloc!
