## This is a much bigger dataset than the 3,000-cell PBMC sample: it contains roughly 50,000 T cells. We run the visualizer using sparse data as input, and the performance is still quite good.

In [1]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import scipy.sparse as sparse
from scipy.io import mmread

from sciviewer import SCIViewer

In [2]:
%load_ext py5
%gui osx
%load_ext autoreload
%autoreload 2

In [3]:
print('DOWNLOADING AND EXTRACTING EXAMPLE DATA')
! mkdir -p ../data
! wget https://www.dropbox.com/s/n3dbhu03845qwmb/Tcell50K_expression_log2TP10K_20210409.barcodes.tsv -O ../data/Tcell50K_expression_log2TP10K_20210409.barcodes.tsv
! wget https://www.dropbox.com/s/8df9uc5yzxv8zob/Tcell50K_expression_log2TP10K_20210409.genes.tsv -O ../data/Tcell50K_expression_log2TP10K_20210409.genes.tsv
! wget https://www.dropbox.com/s/w0iklq4uifsyljn/Tcell50K_expression_log2TP10K_20210409.umap.tsv -O ../data/Tcell50K_expression_log2TP10K_20210409.umap.tsv
! wget https://www.dropbox.com/s/zewzv04mrfoggix/Tcell50K_expression_log2TP10K_20210409.mtx -O ../data/Tcell50K_expression_log2TP10K_20210409.mtx

DOWNLOADING AND EXTRACTING EXAMPLE DATA
--2021-04-20 19:31:56--  https://www.dropbox.com/s/n3dbhu03845qwmb/Tcell50K_expression_log2TP10K_20210409.barcodes.tsv
Resolving www.dropbox.com (www.dropbox.com)... 162.125.4.18
Connecting to www.dropbox.com (www.dropbox.com)|162.125.4.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/n3dbhu03845qwmb/Tcell50K_expression_log2TP10K_20210409.barcodes.tsv [following]
--2021-04-20 19:31:57--  https://www.dropbox.com/s/raw/n3dbhu03845qwmb/Tcell50K_expression_log2TP10K_20210409.barcodes.tsv
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc887d3c6dcb205554d72ced7a55.dl.dropboxusercontent.com/cd/0/inline/BNB9Md2Z56v4Pth4UTTjEhmhCukp7M0HCAqMURw4PUcr4AF1vy_i8eSE7mfeXoAV4QZL_aRl0YtGtMUG676Q3RCVECEH5hcgPC6HBJlN7l7v-jRDzzHB6DZHxZwV0WvH-HV9CMBhWZVm2c32HJjJQZhR/file# [following]
--2021-04-20 19:31:57--  https://uc887d3c6dcb205554d72ced7a55

In [4]:
print("LOADING UMAP DATA...")

# Tab-separated table of UMAP coordinates; the first column is used as the row index.
umap = pd.read_table(
    '../data/Tcell50K_expression_log2TP10K_20210409.umap.tsv',
    index_col=0,
)
umap.head()

LOADING UMAP DATA...


Unnamed: 0,UMAP_1,UMAP_2
AAACATACACCCAA-1,3.975246,10.370767
AAACATACCCCTCA-1,9.388674,1.431675
AAACATACCGGAGA-1,12.206055,11.943375
AAACATACTCTTCA-1,15.312049,-2.373958
AAACATACTGGATC-1,10.571509,-6.149192


In [5]:
print("LOADING GENE EXPRESSION DATA...")

# Read the Matrix Market file and store the result in Compressed Sparse Column format.
expr_sparse = sparse.csc_matrix(
    mmread('../data/Tcell50K_expression_log2TP10K_20210409.mtx')
)
expr_sparse

LOADING GENE EXPRESSION DATA...


<52899x12563 sparse matrix of type '<class 'numpy.float64'>'
	with 28209469 stored elements in Compressed Sparse Column format>

In [6]:
# Gene and barcode labels come from the first column of two header-less, tab-separated files.
genes = list(
    pd.read_table('../data/Tcell50K_expression_log2TP10K_20210409.genes.tsv', header=None)[0]
)
barcodes = list(
    pd.read_table('../data/Tcell50K_expression_log2TP10K_20210409.barcodes.tsv', header=None)[0]
)

In [7]:
# The sparse matrix carries no row or column labels, so the gene and cell names
# are handed over separately through the gene_names and cell_names arguments.

svobj = SCIViewer(
    umap,
    expr_sparse,
    gene_names=genes,
    cell_names=barcodes,
)
svobj.explore_data()

1.7960572242736816 seconds to select and project cells
Selected 12282 cells
Calculating differential expression...
0.07000613212585449 seconds to calculate genesums. Sparsity:  True
0.7171468734741211 seconds to calculate squared genesums. Sparsity:  True
2.4086992740631104 seconds to calculate differential expression. Sparsity:  True
Selected gene NKG7
Min/max expression level for gene NKG7 0.0 5.463
Selected gene GZMA
Min/max expression level for gene GZMA 0.0 4.8103
Selected gene CCL5
Min/max expression level for gene CCL5 0.0 5.743


In [8]:
## The selected_cells attribute gets updated in real time whenever cells are selected.
## Only the first rows are shown here.

svobj.selected_cells.head()

Unnamed: 0,index,cell_name,projection
0,0,AAACATACACCCAA-1,0.850499
1,7,AAACATTGCTTCGC-1,0.141245
2,12,AAACCGTGCCCAAA-1,0.635838
3,13,AAACCGTGCGATAC-1,0.859967
4,15,AAACCGTGTCACCC-1,0.808978


In [11]:
## results_proj_correlation gets updated in real time when cells are selected in directional mode.
## Sorting by the 'P' column (ascending) lists the rows with the smallest P values first.
svobj.results_proj_correlation.sort_values(by='P').head()

Unnamed: 0,R,P
FOS,0.43886,5.278833000000001e-255
DUSP1,0.428832,2.464365e-242
S100A4,0.410756,1.4854000000000001e-220
LTB,0.390538,9.784411e-198
MALAT1,-0.346522,2.5755180000000004e-153


EXPORTING DATA...
BYE


In [10]:
## results_diffexpr gets updated in real time when cells are selected in differential mode.
## Sorting by the 'P' column (ascending) lists the rows with the smallest P values first.
## NOTE(review): in the recorded output every displayed P value is exactly 0.0, so the order
## among those rows is not informative -- consider ranking by the magnitude of 'T' instead.
svobj.results_diffexpr.sort_values(by='P').head()

Unnamed: 0,T,P
JUNB,-56.351577,0.0
RPS8,-57.105766,0.0
COTL1,-46.026753,0.0
PIK3IP1,-53.734194,0.0
KLRD1,43.096151,0.0


0.7509219646453857 seconds to select and project cells
Selected 2912 cells
Calculating correlations...
2.000452995300293 seconds to calculate correlations. Sparsity:  True
Selected gene GNLY
Min/max expression level for gene GNLY 0.0 6.0092
Selected gene CD8B
Min/max expression level for gene CD8B 0.0 4.8472
0.6916131973266602 seconds to select and project cells
Selected 2378 cells
Calculating correlations...
1.4387142658233643 seconds to calculate correlations. Sparsity:  True
Selected gene FOS
Min/max expression level for gene FOS 0.0 4.8013
1.1787519454956055 seconds to select and project cells
Selected 5441 cells
Calculating correlations...
1.5225191116333008 seconds to calculate correlations. Sparsity:  True
Selected gene FOS
Min/max expression level for gene FOS 0.0 4.8013


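## You can re-run the analysis using dense data and see how much slower it is. Note that the dense copy of this matrix (52,899 × 12,563 float64 values) needs roughly 5 GB of memory.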

In [12]:
# Densify the sparse matrix into a labelled DataFrame (rows = barcodes, columns = genes).
# .toarray() yields a plain ndarray, whereas .todense() returns the discouraged np.matrix type.
# NOTE: the dense float64 copy of the 52899 x 12563 matrix takes roughly 5 GB of memory.
expr_dense = pd.DataFrame(expr_sparse.toarray(), columns=genes, index=barcodes)

In [13]:
# Show the top-left 5 x 5 corner of the dense frame to check its row and column labels.
expr_dense.iloc[:5, :5]

Unnamed: 0,LINC00115,FAM41C,NOC2L,KLHL17,PLEKHN1
AAACATACACCCAA-1,0.0,0.0,0.0,0.0,0.0
AAACATACCCCTCA-1,0.0,0.0,0.0,0.0,0.0
AAACATACCGGAGA-1,0.0,0.0,0.0,0.0,0.0
AAACATACTCTTCA-1,0.0,0.0,0.0,0.0,0.0
AAACATACTGGATC-1,0.0,0.0,0.0,0.0,0.0


In [14]:
# (number of rows, number of columns) of the dense frame.
expr_dense.shape

(52899, 12563)

In [16]:
# Same viewer, this time fed the dense DataFrame instead of the sparse matrix.
# No gene_names/cell_names are passed here; presumably SCIViewer takes them from the
# DataFrame's column and index labels -- TODO confirm against the sciviewer documentation.
svobj_dense = SCIViewer(umap, expr_dense)
svobj_dense.explore_data()

0.9585070610046387 seconds to select and project cells
Selected 5328 cells
Calculating differential expression...
8.030466079711914 seconds to calculate genesums. Sparsity:  False
26.470013856887817 seconds to calculate squared genesums. Sparsity:  False


  remainder_stds = np.sqrt((self.gene_sqsum - selected_stds - (remainder_N*remainder_means**2)) / (remainder_N -1))


63.93354320526123 seconds to calculate differential expression. Sparsity:  False
Selected gene NKG7
Min/max expression level for gene NKG7 0.0 5.463
Selected gene GNLY
Min/max expression level for gene GNLY 0.0 6.0092
Selected gene CCL5
Min/max expression level for gene CCL5 0.0 5.743
0.7948040962219238 seconds to select and project cells
Selected 2194 cells
Calculating correlations...


  rs = np.dot(DP, DO) / np.sqrt(np.sum(DO ** 2, 0) * np.sum(DP ** 2))


13.287477016448975 seconds to calculate correlations. Sparsity:  False
