In [1]:
## This extension is needed only if running Sciviewer on a Mac OSX computer
%gui osx

## This extension is needed for any Jupyter notebook running Sciviewer
%load_ext py5

In [2]:
import pandas as pd
import numpy as np
import scanpy as sc

## Reload edited modules automatically before each cell is executed
%load_ext autoreload
%autoreload 2
import sys
## Put the relative ../../sciviewer directory first on the import path so that the
## import below is resolved from there (presumably a local checkout -- confirm)
sys.path.insert(0, '../../sciviewer')
from sciviewer import SCIViewer

## Here we illustrate how to run Sciviewer with several potential input types.
- [passing in the 2D embedding data and the gene expression data as Pandas DataFrames](#dataframe)
- [passing a Scanpy AnnData object with dense data](#denseadata)
- [passing a Scanpy AnnData object with sparse data](#sparseadata)
- [passing a sparse csc_matrix](#sparse_matrix)


<a id="dataframe"></a>

## Dense Pandas DataFrame

A simple way to pass the data is with Pandas DataFrames. The cell names are learned from the rows and the gene names are learned from the columns. The ordering of the cells in the 2D embedding and in the gene expression matrix is assumed to match.

In [3]:
## Read the tab-separated 2D embedding (first column becomes the row index) and preview it
print("LOADING UMAP DATA...")

umap = pd.read_csv(
    '../../data/pbmc3k_umap_20210420.tsv',
    sep='\t',
    index_col=0,
)
umap.head()

LOADING UMAP DATA...


Unnamed: 0,UMAP_1,UMAP_2
AAACATACAACCAC-1,9.728817,4.212151
AAACATTGAGCTAC-1,3.799765,10.181845
AAACATTGATCAGC-1,7.023628,4.829623
AAACCGTGCTTCCG-1,-0.298523,2.024061
AAACCGTGTATGCG-1,8.018503,-0.300426


In [5]:
## Read the tab-separated expression table (first column becomes the row index)
## and preview its top-left 5x5 corner
print("LOADING GENE EXPRESSION DATA...")

expr = pd.read_csv(
    '../../data/pbmc3k_expression_log2TP10K_20210420.tsv',
    sep='\t',
    index_col=0,
)
expr.iloc[:5, :5]

LOADING GENE EXPRESSION DATA...


Unnamed: 0,AL627309.1,AP006222.2,RP11-206L10.2,RP11-206L10.9,LINC00115
AAACATACAACCAC-1,0.0,0.0,0.0,0.0,0.0
AAACATTGAGCTAC-1,0.0,0.0,0.0,0.0,0.0
AAACATTGATCAGC-1,0.0,0.0,0.0,0.0,0.0
AAACCGTGCTTCCG-1,0.0,0.0,0.0,0.0,0.0
AAACCGTGTATGCG-1,0.0,0.0,0.0,0.0,0.0


In [6]:
## Build the viewer from the expression DataFrame and the 2D embedding DataFrame,
## then open the interactive session
svobj = SCIViewer(expr, umap)
svobj.explore_data()

Create renderer
Start thread
Finish thread
Setting up...
0.17818689346313477 seconds to select and project cells
Selected 1599 cells
Calculating correlations...
3.857682943344116 seconds to calculate correlations. Sparsity:  False
Selected gene RPL32
Min/max expression level for gene RPL32 0.0 5.7993245
Selected gene RPS12
Min/max expression level for gene RPS12 0.0 6.1948423
Selected gene RPL13
Min/max expression level for gene RPL13 0.0 6.4361143
Selected gene RPS27
Min/max expression level for gene RPS27 0.0 5.374052
0.05393576622009277 seconds to select and project cells
Selected 341 cells
Calculating differential expression...
0.04273104667663574 seconds to calculate genesums. Sparsity:  False
0.4628751277923584 seconds to calculate squared genesums. Sparsity:  False


  remainder_stds = np.sqrt(rem_val)


1.5150940418243408 seconds to calculate differential expression. Sparsity:  False
Selected gene CD74
Min/max expression level for gene CD74 0.0 6.048452
Selected gene CD3D
Min/max expression level for gene CD3D 0.0 5.092196
Selected gene IL32
Min/max expression level for gene IL32 0.0 5.5627327
0.07646989822387695 seconds to select and project cells
Selected 609 cells
Calculating correlations...
1.4149577617645264 seconds to calculate correlations. Sparsity:  False
Selected gene FCGR3A
Min/max expression level for gene FCGR3A 0.0 4.1987143
0.024641036987304688 seconds to select and project cells
Selected 37 cells
Calculating differential expression...
0.676246166229248 seconds to calculate differential expression. Sparsity:  False
Selected gene HLA-DRA
Min/max expression level for gene HLA-DRA 0.0 5.969516
EXPORTING DATA...
BYE


2021-08-02 13:33:14.197 python[11651:460073] NewtNSView::dealloc: softLock still hold @ dealloc!


In [7]:
## Correlation results (columns R and P), smallest p-value first.
## This gets updated in real time when cells are selected in directional mode
svobj.results_proj_correlation.sort_values(by='P', ascending=True).head()

Unnamed: 0,R,P
S100A8,-0.804243,2.668235e-139
FCGR3A,0.803007,1.465832e-138
LYZ,-0.786903,2.237323e-129
S100A9,-0.774451,8.319297e-123
RHOC,0.666671,1.617039e-79


#### The class attributes below get populated in real time and thus can be accessed while the interactive viewer is running

In [8]:
## This attribute gets updated in real time whenever cells are selected

svobj.selected_cells.head()

Unnamed: 0,index,cell_name,projection
0,99,AAGATTACCGCCTT-1,0.374208
1,109,AAGCCATGAACTGC-1,0.564893
2,161,AATGCGTGGACGGA-1,0.219624
3,175,AATTACGAATTCCT-1,0.281582
4,251,ACCCGTTGCTTCTA-1,0.845314


In [9]:
## Correlation results sorted by p-value (sort_values is ascending by default, so smallest P first).
## This gets updated in real time when cells are selected in directional mode
svobj.results_proj_correlation.sort_values(by='P').head()

Unnamed: 0,R,P
S100A8,-0.804243,2.668235e-139
FCGR3A,0.803007,1.465832e-138
LYZ,-0.786903,2.237323e-129
S100A9,-0.774451,8.319297e-123
RHOC,0.666671,1.617039e-79


In [10]:
## Genes with the most negative correlation (R) first
svobj.results_proj_correlation.sort_values(by='R', ascending=True).head()

Unnamed: 0,R,P
S100A8,-0.804243,2.668235e-139
LYZ,-0.786903,2.237323e-129
S100A9,-0.774451,8.319297e-123
LGALS2,-0.591377,1.026797e-58
GPX1,-0.57329,1.6872420000000002e-54


In [11]:
## Differential expression results (columns T and P), smallest p-value first.
## This gets updated in real time when cells are selected in differential mode
svobj.results_diffexpr.sort_values(by='P').head()

Unnamed: 0,T,P
CD3G,-26.461915,7.723687e-137
GZMA,-24.247722,2.8893610000000002e-117
PRKCQ-AS1,-22.57954,3.111066e-103
LEF1,-22.301159,5.891293000000001e-101
PBXIP1,-22.035322,8.454019e-99


<a id="denseadata"></a>

## AnnData with dense input matrix
Alternatively, the data can be passed as a Scanpy AnnData object. By default, the expression data is read from the `.X` attribute and the 2D embedding is read from `.obsm` with the key 'X_umap'. However, an alternative embedding can be selected with the `embedding_name` argument, and the expression data can instead be read from the `.raw.X` attribute by setting the `use_raw` argument to True.

In [12]:
## Load the AnnData object from the .h5ad file and display its summary
data = sc.read('../../data/pbmc3k_20210420.h5ad')
data

AnnData object with n_obs × n_vars = 2638 × 1838
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
    var: 'gene_ids', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'hvg', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

#### Here the data in data.X is dense and the data in data.raw.X is sparse. We first illustrate the method with the dense data and subsequently with the sparse data.

In [13]:
## use_raw=False: expression is taken from data.X (dense here);
## the embedding is looked up in data.obsm under 'X_umap'
svobj = SCIViewer(data, embedding_name='X_umap', use_raw=False)
svobj.explore_data()

Create renderer
Start thread
Finish thread
Setting up...
0.0865478515625 seconds to select and project cells
Selected 618 cells
Calculating correlations...
0.11456418037414551 seconds to calculate correlations. Sparsity:  False
Selected gene FCGR3A
Min/max expression level for gene FCGR3A -1.7557994 3.9911184
Selected gene ABI3
Min/max expression level for gene ABI3 -1.9131678 7.0717273
0.026217937469482422 seconds to select and project cells
Selected 37 cells
Calculating differential expression...
0.002048969268798828 seconds to calculate genesums. Sparsity:  False
0.02713799476623535 seconds to calculate squared genesums. Sparsity:  False
0.10609889030456543 seconds to calculate differential expression. Sparsity:  False
Selected gene HLA-DPA1
Min/max expression level for gene HLA-DPA1 -1.6818804 3.275891
Selected gene HLA-DPB1
Min/max expression level for gene HLA-DPB1 -1.586307 3.6341147
0.05413198471069336 seconds to select and project cells
Selected 341 cells
Calculating correlati

In [23]:
## Genes ranked from the most positive correlation (R) downwards;
## sort_values places rows whose R is NaN at the end
svobj.results_proj_correlation.sort_values(by='R', ascending=False)

Unnamed: 0,R,P
DNASE1L3,0.443783,6.898800e-18
TCL1A,0.412212,2.024419e-15
HLA-DRB1,0.354575,1.541069e-11
CD72,0.312071,3.877771e-09
HLA-DQA1,0.285201,8.371317e-08
...,...,...
KCNE1,,
BRWD1,,
SUMO3,,
S100B,,


<a id="sparseadata"></a>

## AnnData with sparse input matrix


In [24]:
## Inspect the storage format of the raw expression matrix
data.raw.X

<2638x13714 sparse matrix of type '<class 'numpy.float32'>'
	with 2238732 stored elements in Compressed Sparse Row format>

#### This data is sparse but is in compressed sparse row format (csr_matrix) rather than csc_matrix, so we need to convert it to csc_matrix format

In [25]:
import scipy.sparse as sp

In [26]:
## Convert the raw matrix to Compressed Sparse Column format using the public
## scipy.sparse constructor; the `scipy.sparse.csc` submodule is a deprecated
## alias and should not be used to reach csc_matrix
z = sp.csc_matrix(data.raw.X)

In [27]:
## Replace data.raw with a new AnnData built around the CSC matrix, reusing the
## existing raw gene annotations (var) and the cell annotations (obs)
data.raw = sc.AnnData(z, var=data.raw.var, obs=data.obs)

In [28]:
## Display the raw matrix again to check its storage format
data.raw.X

<2638x13714 sparse matrix of type '<class 'numpy.float32'>'
	with 2238732 stored elements in Compressed Sparse Column format>

#### Now that data.raw.X is in csc_matrix format, it can be passed to sciviewer

In [29]:
## use_raw=True: expression is taken from data.raw.X (sparse) instead of data.X
svobj = SCIViewer(data, embedding_name='X_umap', use_raw=True)
svobj.explore_data()

Create renderer
Start thread
Finish thread
Setting up...
0.022098779678344727 seconds to select and project cells
Selected 15 cells
Calculating differential expression...
0.0058460235595703125 seconds to calculate genesums. Sparsity:  True
0.11003518104553223 seconds to calculate squared genesums. Sparsity:  True
0.7141640186309814 seconds to calculate differential expression. Sparsity:  True
Selected gene GPX1
Min/max expression level for gene GPX1 0.0 5.552269
Selected gene NRGN
Min/max expression level for gene NRGN 0.0 4.4768424
0.05425095558166504 seconds to select and project cells
Selected 340 cells
Calculating differential expression...
0.6401078701019287 seconds to calculate differential expression. Sparsity:  True
Selected gene HLA-DRA
Min/max expression level for gene HLA-DRA 0.0 5.969516
Selected gene HLA-DPB1
Min/max expression level for gene HLA-DPB1 0.0 6.1795025
Selected gene GIMAP7
Min/max expression level for gene GIMAP7 0.0 6.9069724
Selected gene CD3D
Min/max expres

2021-08-02 13:36:35.473 python[11651:460073] NewtNSView::dealloc: softLock still hold @ dealloc!


<a id="sparse_matrix"></a>

## Passing a sparse matrix directly

We can also pass a sparse matrix directly. Since a bare sparse matrix carries no gene or cell names, we pass those as separate arguments.

In [20]:
## Wrap the embedding stored under 'X_umap' in a DataFrame indexed like data.obs
umap = pd.DataFrame(
    data.obsm['X_umap'],
    index=data.obs.index,
    columns=['UMAP_1', 'UMAP_2'],
)
umap.head()

Unnamed: 0,UMAP_1,UMAP_2
AAACATACAACCAC-1,9.728817,4.212151
AAACATTGAGCTAC-1,3.799765,10.181845
AAACATTGATCAGC-1,7.023628,4.829623
AAACCGTGCTTCCG-1,-0.298523,2.024061
AAACCGTGTATGCG-1,8.018503,-0.300426


In [21]:
## Cell names as a plain list, in the row order of data.obs
cell_names = data.obs.index.tolist()
cell_names[:5]

['AAACATACAACCAC-1',
 'AAACATTGAGCTAC-1',
 'AAACATTGATCAGC-1',
 'AAACCGTGCTTCCG-1',
 'AAACCGTGTATGCG-1']

In [22]:
## Gene names as a plain list, in the row order of data.raw.var
gene_names = data.raw.var.index.tolist()
gene_names[:5]

['AL627309.1', 'AP006222.2', 'RP11-206L10.2', 'RP11-206L10.9', 'LINC00115']

In [23]:
## Use the raw sparse matrix itself as the expression input.
## Note: this rebinds `expr`, which held the expression DataFrame earlier in the notebook
expr = data.raw.X

In [24]:
## Display the matrix to check its shape and storage format
expr

<2638x13714 sparse matrix of type '<class 'numpy.float32'>'
	with 2238732 stored elements in Compressed Sparse Column format>

In [25]:
## The sparse matrix has no labels of its own, so gene and cell names are supplied explicitly
svobj = SCIViewer(expr, umap, gene_names=gene_names, cell_names=cell_names)
svobj.explore_data()

0.24500608444213867 seconds to select and project cells
Selected 1613 cells
Calculating correlations...
0.7577908039093018 seconds to calculate correlations. Sparsity:  True
Selected gene RPL32
Min/max expression level for gene RPL32 0.0 5.7993245
0.07483386993408203 seconds to select and project cells
Selected 341 cells
Calculating differential expression...
0.003180980682373047 seconds to calculate genesums. Sparsity:  True
0.023106098175048828 seconds to calculate squared genesums. Sparsity:  True
0.5869648456573486 seconds to calculate differential expression. Sparsity:  True
Selected gene CD74
Min/max expression level for gene CD74 0.0 6.048452
Selected gene HLA-DRA
Min/max expression level for gene HLA-DRA 0.0 5.969516
Selected gene HLA-DPB1
Min/max expression level for gene HLA-DPB1 0.0 6.1795025
EXPORTING DATA...
BYE


2021-07-19 09:48:59.795 python[51388:608890] NewtNSView::dealloc: softLock still hold @ dealloc!
