In [1]:
# Load the IPython extension provided by the py5 package.
%load_ext py5
# Integrate the macOS (Cocoa) GUI event loop with the kernel.
# NOTE(review): `osx` is macOS-specific — on other platforms a different
# %gui backend is presumably required; confirm before running elsewhere.
%gui osx

In [2]:
import pandas as pd
import numpy as np
from sciviewer import SCIViewer
import scanpy as sc

In [3]:
print('DOWNLOADING AND EXTRACTING EXAMPLE DATA')
! mkdir -p ../data
! wget https://www.dropbox.com/s/gx9y42m4knyi1cl/pbmc3k_umap_20210420.tsv -O ../data/pbmc3k_umap_20210420.tsv
! wget https://www.dropbox.com/s/afnc6cbedvsez75/pbmc3k_expression_log2TP10K_20210420.tsv -O ../data/pbmc3k_expression_log2TP10K_20210420.tsv
! wget https://www.dropbox.com/s/kmryxbttn7e0wh3/pbmc3k_20210420.h5ad -O ../data/pbmc3k_20210420.h5ad
! ls ../data

DOWNLOADING AND EXTRACTING EXAMPLE DATA
--2021-07-07 11:25:02--  https://www.dropbox.com/s/gx9y42m4knyi1cl/pbmc3k_umap_20210420.tsv
Resolving www.dropbox.com (www.dropbox.com)... 162.125.4.18
Connecting to www.dropbox.com (www.dropbox.com)|162.125.4.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/gx9y42m4knyi1cl/pbmc3k_umap_20210420.tsv [following]
--2021-07-07 11:25:02--  https://www.dropbox.com/s/raw/gx9y42m4knyi1cl/pbmc3k_umap_20210420.tsv
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc4c613f673844ff25f2c70df6bb.dl.dropboxusercontent.com/cd/0/inline/BR2gVH7sif-3yvyOw0uCL57Kk769Ys_POK-Md_SSIXk89gE70OMnmzfs1apjsS6kChuYqgQPSepa0bxkqpwP2m9njljXNqo6bnER9YMI8uviVAWVqThBLA93pFsa-d7xLEbc2ViEViLNQYgp3myLuEBr/file# [following]
--2021-07-07 11:25:03--  https://uc4c613f673844ff25f2c70df6bb.dl.dropboxusercontent.com/cd/0/inline/BR2gVH7sif-3yvyOw0uCL57Kk769Ys_POK-Md_SSIX

## Here we illustrate how to run Sciviewer with several potential input types.
- [passing in the 2D embedding data and the gene expression data as Pandas DataFrames](#dataframe)
- [passing a Scanpy AnnData object with dense data](#denseadata)
- [passing a Scanpy AnnData object with sparse data](#sparseadata)
- [passing a sparse csc_matrix](#sparse_matrix)


<a id="dataframe"></a>

## Dense Pandas DataFrame

Perhaps the simplest way to pass the data is with Pandas DataFrames. The cell names are learned from the rows and the gene names are learned from the columns. The ordering of the cells in the 2D embedding and in the gene expression matrix is assumed to match.

In [4]:
# Read the tab-separated 2D UMAP coordinates; the first column of the file
# becomes the row index. Preview the first rows.
print("LOADING UMAP DATA...")

umap = pd.read_csv(
    '../data/pbmc3k_umap_20210420.tsv',
    index_col=0,
    sep='\t',
)
umap.head()

LOADING UMAP DATA...


Unnamed: 0,UMAP_1,UMAP_2
AAACATACAACCAC-1,9.728817,4.212151
AAACATTGAGCTAC-1,3.799765,10.181845
AAACATTGATCAGC-1,7.023628,4.829623
AAACCGTGCTTCCG-1,-0.298523,2.024061
AAACCGTGTATGCG-1,8.018503,-0.300426


In [5]:
# Read the tab-separated expression table; the first column of the file
# becomes the row index. Preview only the top-left 5x5 corner, since the
# full table is wide.
print("LOADING GENE EXPRESSION DATA...")

expr = pd.read_csv(
    '../data/pbmc3k_expression_log2TP10K_20210420.tsv',
    index_col=0,
    sep='\t',
)
expr.iloc[:5, :5]

LOADING GENE EXPRESSION DATA...


Unnamed: 0,AL627309.1,AP006222.2,RP11-206L10.2,RP11-206L10.9,LINC00115
AAACATACAACCAC-1,0.0,0.0,0.0,0.0,0.0
AAACATTGAGCTAC-1,0.0,0.0,0.0,0.0,0.0
AAACATTGATCAGC-1,0.0,0.0,0.0,0.0,0.0
AAACCGTGCTTCCG-1,0.0,0.0,0.0,0.0,0.0
AAACCGTGTATGCG-1,0.0,0.0,0.0,0.0,0.0


In [6]:
# Build the viewer from the expression DataFrame and the UMAP DataFrame
# (expression first, embedding second), then open the interactive viewer.
# NOTE(review): the log lines in this cell's output appear to be printed as
# cells and genes are selected in the viewer — confirm against sciviewer docs.
svobj = SCIViewer(expr, umap)
svobj.explore_data()

0.18897628784179688 seconds to select and project cells
Selected 1613 cells
Calculating correlations...
3.5531578063964844 seconds to calculate correlations. Sparsity:  False
Selected gene RPL32
Min/max expression level for gene RPL32 0.0 5.7993245
Selected gene RPS12
Min/max expression level for gene RPS12 0.0 6.1948423
Selected gene CST7
Min/max expression level for gene CST7 0.0 5.537982


#### The class attributes below are populated in real time, so they can be accessed while the interactive viewer is running

In [7]:
## This attribute gets updated in real time whenever cells are selected.
## Preview the first rows of the current selection.

svobj.selected_cells.head()

Unnamed: 0,index,cell_name,projection
0,0,AAACATACAACCAC-1,0.585163
1,2,AAACATTGATCAGC-1,0.631541
2,4,AAACCGTGTATGCG-1,0.134188
3,5,AAACGCACTGGTAC-1,0.597343
4,6,AAACGCTGACCAGT-1,0.688772


In [8]:
## This gets updated in real time when cells are selected in directional mode
## Sort ascending by the P column so the smallest P values are listed first.
svobj.results_proj_correlation.sort_values(by='P').head()

Unnamed: 0,R,P
CST7,-0.783907,0.0
NKG7,-0.847512,0.0
GZMA,-0.743711,4.643779e-284
GZMB,-0.740909,8.113502999999999e-281
PRF1,-0.724881,4.794203e-263


0.06251382827758789 seconds to select and project cells
Selected 340 cells
Calculating differential expression...
0.03123188018798828 seconds to calculate genesums. Sparsity:  False
0.3236961364746094 seconds to calculate squared genesums. Sparsity:  False


  remainder_stds = np.sqrt((self.gene_sqsum - selected_stds - (remainder_N*remainder_means**2)) / (remainder_N -1))


1.406182050704956 seconds to calculate differential expression. Sparsity:  False
Selected gene CD74
Min/max expression level for gene CD74 0.0 6.048452


In [9]:
## This gets updated in real time when cells are selected in differential mode
## Sort ascending by the P column so the smallest P values are listed first.
svobj.results_diffexpr.sort_values(by='P').head()

Unnamed: 0,T,P
CD74,79.808614,0.0
HLA-DRA,74.388502,0.0
HLA-DPB1,53.818944,3.453053e-271
HLA-DRB1,49.025235,3.0421489999999997e-242
CD3D,-36.686168,2.985059e-206


EXPORTING DATA...
BYE


<a id="denseadata"></a>

## AnnData with dense input matrix
Alternatively, the data can be passed as a Scanpy AnnData object. By default, the expression data is read from the `.X` attribute and the UMAP data is read from `.obsm` with the key `'X_umap'`. However, alternative embeddings can be provided with the `embedding_name` argument, and the expression data can be read from the `.raw.X` attribute instead by setting the `use_raw` argument to `True`.

In [3]:
# Read the PBMC3k dataset from the .h5ad file and display the summary of
# the resulting AnnData object.
data = sc.read('../data/pbmc3k_20210420.h5ad')
data

AnnData object with n_obs × n_vars = 2638 × 1838
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
    var: 'gene_ids', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'hvg', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

#### Here the data in data.X is dense and the data in data.raw.X is sparse. We first illustrate the method with the dense data and subsequently with the sparse data

In [14]:
# Pass the AnnData object directly. `embedding_name` names the key in
# data.obsm to use as the 2D embedding; with use_raw=False the expression
# values are read from data.X (dense in this dataset) rather than data.raw.X.
svobj = SCIViewer(data, embedding_name='X_umap', use_raw=False)
svobj.explore_data()

0.18799591064453125 seconds to select and project cells
Selected 1613 cells
Calculating correlations...
0.34326720237731934 seconds to calculate correlations. Sparsity:  False
Selected gene LTB
Min/max expression level for gene LTB -1.7345115 2.5181878
0.03163003921508789 seconds to select and project cells
Selected 33 cells
Calculating differential expression...
0.006718158721923828 seconds to calculate genesums. Sparsity:  False
0.02289104461669922 seconds to calculate squared genesums. Sparsity:  False
0.36951398849487305 seconds to calculate differential expression. Sparsity:  False
EXPORTING DATA...
BYE


<a id="sparseadata"></a>

## AnnData with sparse input matrix


In [15]:
# Inspect the raw expression matrix to see which sparse storage format it uses.
data.raw.X

<2638x13714 sparse matrix of type '<class 'numpy.float32'>'
	with 2238732 stored elements in Compressed Sparse Row format>

#### This data is sparse but is in compressed sparse row format (csr_matrix) rather than csc_matrix, so we need to convert it to csc_matrix format

In [4]:
import scipy.sparse as sp

In [5]:
# Convert the raw CSR matrix to compressed sparse column (CSC) format.
# Use the public `scipy.sparse.csc_matrix` constructor: the `scipy.sparse.csc`
# submodule is a deprecated private namespace in recent SciPy releases.
z = sp.csc_matrix(data.raw.X)

In [6]:
# Rebuild data.raw around the CSC matrix `z`, reusing the gene annotations
# of the existing raw data (data.raw.var) and the cell annotations (data.obs).
# Note that this overwrites data.raw in place.
data.raw = sc.AnnData(z, var=data.raw.var, obs=data.obs)

In [7]:
# Confirm that the raw matrix is now stored in Compressed Sparse Column format.
data.raw.X

<2638x13714 sparse matrix of type '<class 'numpy.float32'>'
	with 2238732 stored elements in Compressed Sparse Column format>

#### Now that data.raw.X is in csc_matrix format, it can be passed to sciviewer

In [22]:
# Same call as for the dense case, but with use_raw=True so the expression
# values are read from data.raw.X (the sparse CSC matrix) instead of data.X.
svobj = SCIViewer(data, embedding_name='X_umap', use_raw=True)
svobj.explore_data()

0.09190821647644043 seconds to select and project cells
Selected 615 cells
Calculating correlations...
1.2728462219238281 seconds to calculate correlations. Sparsity:  True
Selected gene FCGR3A
Min/max expression level for gene FCGR3A 0.0 4.1987143
0.02237415313720703 seconds to select and project cells
Selected 13 cells
Calculating differential expression...
0.002691984176635742 seconds to calculate genesums. Sparsity:  True
0.018991947174072266 seconds to calculate squared genesums. Sparsity:  True
1.1586229801177979 seconds to calculate differential expression. Sparsity:  True
Selected gene GPX1
Min/max expression level for gene GPX1 0.0 5.552269
Selected gene PF4
Min/max expression level for gene PF4 0.0 5.5560412
EXPORTING DATA...
BYE


<a id="sparse_matrix"></a>

## Passing a sparse matrix directly

We can also pass a sparse matrix directly. Since a bare matrix carries no information about the gene and cell names, we pass those as separate arguments.

In [8]:
# Wrap the UMAP coordinates stored on the AnnData object in a DataFrame
# whose index is taken from data.obs, then preview the first rows.
umap = pd.DataFrame(
    data.obsm['X_umap'],
    columns=['UMAP_1', 'UMAP_2'],
    index=data.obs.index,
)
umap.head()

Unnamed: 0,UMAP_1,UMAP_2
AAACATACAACCAC-1,9.728817,4.212151
AAACATTGAGCTAC-1,3.799765,10.181845
AAACATTGATCAGC-1,7.023628,4.829623
AAACCGTGCTTCCG-1,-0.298523,2.024061
AAACCGTGTATGCG-1,8.018503,-0.300426


In [9]:
# Cell names as a plain Python list, in the same order as the rows of data.obs.
cell_names = data.obs.index.tolist()
cell_names[:5]

['AAACATACAACCAC-1',
 'AAACATTGAGCTAC-1',
 'AAACATTGATCAGC-1',
 'AAACCGTGCTTCCG-1',
 'AAACCGTGTATGCG-1']

In [10]:
# Gene names as a plain Python list, in the same order as the rows of data.raw.var.
gene_names = data.raw.var.index.tolist()
gene_names[:5]

['AL627309.1', 'AP006222.2', 'RP11-206L10.2', 'RP11-206L10.9', 'LINC00115']

In [11]:
# Take the raw expression matrix to pass to the viewer directly.
# NOTE(review): this relies on the conversion cells in the previous section
# having already replaced data.raw with a CSC matrix; on a freshly loaded
# `data`, data.raw.X is in CSR format. It also rebinds `expr`, which earlier
# in the notebook held the dense expression DataFrame.
expr = data.raw.X

In [12]:
# Pass the sparse matrix and the UMAP DataFrame, supplying the gene and cell
# names explicitly through the `gene_names` and `cell_names` arguments.
svobj = SCIViewer(expr, umap, gene_names=gene_names, cell_names=cell_names)
svobj.explore_data()

0.08341598510742188 seconds to select and project cells
Selected 626 cells
Calculating correlations...
1.2140448093414307 seconds to calculate correlations. Sparsity:  True
Selected gene FCGR3A
Min/max expression level for gene FCGR3A 0.0 4.1987143
0.039552927017211914 seconds to select and project cells
Selected 14 cells
Calculating differential expression...
0.004708290100097656 seconds to calculate genesums. Sparsity:  True
0.03143000602722168 seconds to calculate squared genesums. Sparsity:  True
1.161715030670166 seconds to calculate differential expression. Sparsity:  True
Selected gene GNLY
Min/max expression level for gene GNLY 0.0 6.19235
EXPORTING DATA...
BYE
