## This is a much bigger dataset than the 3000 PBMC sample, as it contains 50,000 T-cells. We run the visualizer using sparse data as input, and the performance is still quite good.

In [1]:
import pandas as pd
import numpy as np
from scipy.io import mmread
import scipy.sparse as sparse
from sciviewer import SCIViewer
import scanpy as sc

In [2]:
# Load the py5 IPython extension.
%load_ext py5
# Enable IPython GUI event-loop integration for the "osx" (macOS) backend.
# NOTE(review): this is platform-specific — on other operating systems a
# different %gui backend is presumably required; confirm before running elsewhere.
%gui osx
# Automatically reload edited modules before each cell is executed.
%load_ext autoreload
%autoreload 2

In [3]:
rm -R ../data

rm: ../data: No such file or directory


In [4]:
print('DOWNLOADING AND EXTRACTING EXAMPLE DATA')
! mkdir -p ../data
! wget https://www.dropbox.com/s/srzk17uvnhhhsgi/Tcells50k.h5ad -O ../data/Tcells50k.h5ad

DOWNLOADING AND EXTRACTING EXAMPLE DATA
--2021-07-07 21:14:27--  https://www.dropbox.com/s/srzk17uvnhhhsgi/Tcells50k.h5ad
Resolving www.dropbox.com (www.dropbox.com)... 162.125.4.18
Connecting to www.dropbox.com (www.dropbox.com)|162.125.4.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/srzk17uvnhhhsgi/Tcells50k.h5ad [following]
--2021-07-07 21:14:28--  https://www.dropbox.com/s/raw/srzk17uvnhhhsgi/Tcells50k.h5ad
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc1090318c001010e03dabddb7b0.dl.dropboxusercontent.com/cd/0/inline/BR40GTc-hmPis8J01cI5D6BRQNhCW0L18FHXmJ26E5IKFW04-vlc41trjcGKbbSf_q116rx-IVPCQGB62i8MaojjHXOSxooearPxZCdAnthFqZW5hMrqiR8WqCttfWD3_tm_56gO8ayIj0tF4y1salBX/file# [following]
--2021-07-07 21:14:28--  https://uc1090318c001010e03dabddb7b0.dl.dropboxusercontent.com/cd/0/inline/BR40GTc-hmPis8J01cI5D6BRQNhCW0L18FHXmJ26E5IKFW04-vlc41trjcGKbbSf_q116rx-

## Load the data as a Scanpy AnnData 

In [3]:
# Read the .h5ad file from ../data into the variable `data`.
data = sc.read('../data/Tcells50k.h5ad')

In [4]:
# Display the expression matrix repr to check its shape, dtype and sparse layout.
data.X

<52899x12563 sparse matrix of type '<class 'numpy.float32'>'
	with 28209469 stored elements in Compressed Sparse Row format>

#### The data is in compressed sparse row (CSR) format rather than compressed sparse column (CSC) format, so we need to convert it

In [5]:
# Convert the expression matrix to compressed sparse column (CSC) layout.
# Use the public `scipy.sparse.csc_matrix` name: the `scipy.sparse.csc`
# submodule path is a deprecated namespace in recent SciPy releases.
data.X = sparse.csc_matrix(data.X)

#### Now you can pass the AnnData object to sciviewer, and by default it will use the obsm['X_umap'] attribute for the 2D embedding and the .X attribute for the expression data

In [11]:
# The whole AnnData object is passed in, so no separate embedding, gene names
# or cell names are supplied in this call.
# NOTE(review): gene_names / cell_names arguments are presumably only needed
# when passing a bare sparse matrix — confirm against the sciviewer docs.

svobj = SCIViewer(data)
# Start the interactive exploration session; selection and timing logs are
# printed below the cell.
svobj.explore_data()

1.7327361106872559 seconds to select and project cells
Selected 12642 cells
Calculating correlations...
1.662114143371582 seconds to calculate correlations. Sparsity:  True
Selected gene LTB
Min/max expression level for gene LTB 0.0 5.9866385
Selected gene NKG7
Min/max expression level for gene NKG7 0.0 5.463012
Selected gene RPL13
Min/max expression level for gene RPL13 2.1021698 6.324769
0.4980888366699219 seconds to select and project cells
Selected 588 cells
Calculating differential expression...
0.019134044647216797 seconds to calculate genesums. Sparsity:  True
0.3047330379486084 seconds to calculate squared genesums. Sparsity:  True
2.226783037185669 seconds to calculate differential expression. Sparsity:  True
Selected gene S100A4
Min/max expression level for gene S100A4 0.0 5.2891912
Selected gene S100B
Min/max expression level for gene S100B 0.0 4.525306
Selected gene RPL13
Min/max expression level for gene RPL13 2.1021698 6.324769
EXPORTING DATA...
BYE


In [12]:
## This attribute gets updated in real time whenever cells are selected

svobj.selected_cells.head()

Unnamed: 0,index,cell_name,projection
0,10,AAACATTGTCTTCA-1,0.101913
1,34,AAACTTGAATCGTG-1,0.177484
2,52,AAAGAGACGTTACG-1,0.008189
3,106,AAATCTGACTACGA-1,0.712988
4,153,AACAGCACCACTTT-1,0.442798


In [13]:
## This gets updated in real time when cells are selected in directional mode.
## Rows are ordered by ascending P value and only the first five are shown.
## NOTE: the top rows below all report P == 0.0, so their relative order is not informative.
svobj.results_proj_correlation.sort_values(by='P').head()

Unnamed: 0,R,P
GZMH,-0.82383,0.0
FGFBP2,-0.70718,0.0
RPL10,0.434397,0.0
IL7R,0.473644,0.0
CD27,0.363631,0.0


In [14]:
## This gets updated in real time when cells are selected in differential mode.
## Rows are ordered by ascending P value and only the first five are shown.
svobj.results_diffexpr.sort_values(by='P').head()

Unnamed: 0,T,P
KLRC1,-37.101429,1.978595e-297
S100A4,66.929306,6.983141e-297
S100B,-41.172668,6.404913999999999e-284
GPR56,-35.789577,3.641851e-277
TTC38,-33.040241,5.972819999999999e-237


## You can re-run the analysis using dense data and see how much slower it is

In [6]:
# Materialise the expression matrix as a dense, labelled DataFrame
# (rows indexed by data.obs, columns by data.var).
# toarray() yields a plain ndarray, whereas todense() yields the discouraged
# np.matrix type.
# NOTE: for the 52899 x 12563 float32 matrix shown earlier this needs roughly
# 2.7 GB of RAM.
expr_dense = pd.DataFrame(
    data.X.toarray(),
    index=data.obs.index,
    columns=data.var.index,
)

In [7]:
# Preview the top-left 5 x 5 corner of the dense table.
expr_dense.iloc[:5, :5]

Unnamed: 0,LINC00115,FAM41C,NOC2L,KLHL17,PLEKHN1
AAACATACACCCAA-1,0.0,0.0,0.0,0.0,0.0
AAACATACCCCTCA-1,0.0,0.0,0.0,0.0,0.0
AAACATACCGGAGA-1,0.0,0.0,0.0,0.0,0.0
AAACATACTCTTCA-1,0.0,0.0,0.0,0.0,0.0
AAACATACTGGATC-1,0.0,0.0,0.0,0.0,0.0


In [8]:
# Pull the array stored under the 'X_umap' key of data.obsm.
umap = data.obsm['X_umap']

#### Here we pass the expression data as a Pandas DataFrame and the umap as a Numpy ndarray

In [10]:
# Build a second viewer from the dense DataFrame plus the embedding array.
svobj_dense = SCIViewer(expr_dense, umap)
# Start the interactive session; the timings printed below can be compared
# with those of the sparse run.
svobj_dense.explore_data()

1.6459579467773438 seconds to select and project cells
Selected 12823 cells
Calculating correlations...
20.05166006088257 seconds to calculate correlations. Sparsity:  False
Selected gene LTB
Min/max expression level for gene LTB 0.0 5.9866385
Selected gene NKG7
Min/max expression level for gene NKG7 0.0 5.463012
0.5081827640533447 seconds to select and project cells
Selected 567 cells
Calculating differential expression...
3.976001024246216 seconds to calculate genesums. Sparsity:  False
8.784842014312744 seconds to calculate squared genesums. Sparsity:  False
15.340595960617065 seconds to calculate differential expression. Sparsity:  False
Selected gene S100A4
Min/max expression level for gene S100A4 0.0 5.2891912
Selected gene KLRC1
Min/max expression level for gene KLRC1 0.0 4.1433806
EXPORTING DATA...
BYE
