In [1]:
## Enable IPython's macOS GUI event-loop integration.
## This is needed only if running Sciviewer on a Mac OSX computer
%gui osx

## Load the py5 IPython extension.
## This extension is needed for any Jupyter notebook running Sciviewer
%load_ext py5

In [2]:
import pandas as pd
import numpy as np
import scanpy as sc

## Reload edited modules automatically before each cell is executed
%load_ext autoreload
%autoreload 2
import sys
## Put the relative '../../sciviewer' directory first on the module search
## path so the import below is looked up there before any other location
sys.path.insert(0, '../../sciviewer')
from sciviewer import SCIViewer

## Here we illustrate how to run Sciviewer with several potential input types.
- [Passing the 2D embedding data and the gene expression data as Pandas DataFrames](#dataframe)
- [Passing a Scanpy AnnData object with dense data](#denseadata)
- [Passing a Scanpy AnnData object with sparse data](#sparseadata)
- [Passing a sparse csc_matrix](#sparse_matrix)


<a id="dataframe"></a>

## Dense Pandas DataFrame

A simple way to pass the data is with Pandas DataFrames. The cell names are learned from the rows and the gene names are learned from the columns. The ordering of the cells in the 2D embedding and in the gene expression matrix is assumed to match.

In [3]:
print("LOADING UMAP DATA...")

## Tab-separated table; the first column is used as the row index (cell names)
umap = pd.read_csv(
    '../../data/pbmc3k_umap_20210420.tsv',
    sep='\t',
    index_col=0,
)
umap.head()

LOADING UMAP DATA...


Unnamed: 0,UMAP_1,UMAP_2
AAACATACAACCAC-1,9.728817,4.212151
AAACATTGAGCTAC-1,3.799765,10.181845
AAACATTGATCAGC-1,7.023628,4.829623
AAACCGTGCTTCCG-1,-0.298523,2.024061
AAACCGTGTATGCG-1,8.018503,-0.300426


In [4]:
print("LOADING GENE EXPRESSION DATA...")

## Tab-separated table; the first column is used as the row index (cell names)
expr = pd.read_csv(
    '../../data/pbmc3k_expression_log2TP10K_20210420.tsv',
    sep='\t',
    index_col=0,
)
## Preview only the top-left 5 x 5 corner of the table
expr.iloc[:5, :5]

LOADING GENE EXPRESSION DATA...


Unnamed: 0,AL627309.1,AP006222.2,RP11-206L10.2,RP11-206L10.9,LINC00115
AAACATACAACCAC-1,0.0,0.0,0.0,0.0,0.0
AAACATTGAGCTAC-1,0.0,0.0,0.0,0.0,0.0
AAACATTGATCAGC-1,0.0,0.0,0.0,0.0,0.0
AAACCGTGCTTCCG-1,0.0,0.0,0.0,0.0,0.0
AAACCGTGTATGCG-1,0.0,0.0,0.0,0.0,0.0


In [6]:
## Build the viewer from the two DataFrames (expression first, embedding
## second) and open the interactive session
svobj = SCIViewer(expr, umap)
svobj.explore_data()

Create renderer
Start thread
Finish thread
Setting up...
Setting up...
0.017865896224975586 seconds to select and project cells
Selected 160 cells
Calculating correlations...
0.4768638610839844 seconds to calculate correlations. Sparsity:  False
Selected gene FCER1A
Min/max expression level for gene FCER1A 0.0 4.3631954


In [9]:
## This gets updated in real time when cells are selected in directional mode.
## Sorting by the 'P' column in ascending order lists the smallest P values first
svobj.results_proj_correlation.sort_values(by='P', ascending=True).head()

Unnamed: 0,R,P
TCL1A,-0.427525,2.807571e-11
AIM2,0.33999,2.073196e-07
CD27,0.335986,2.933639e-07
RP5-887A10.1,0.326839,6.366942e-07
FCER2,-0.320034,1.114949e-06


#### The class attributes below get populated in real time and thus can be accessed when the interactive viewer is running

In [10]:
## This attribute gets updated in real time whenever cells are selected
svobj.selected_cells.head()

Unnamed: 0,index,cell_name,projection
0,20,AAAGTTTGGGGTGA-1,0.602639
1,23,AAATCAACAATGCC-1,0.738593
2,31,AAATCCCTGCTATG-1,0.331177
3,55,AACCGATGGTCATG-1,0.364138
4,75,AACGTGTGGCGGAA-1,0.744749


In [11]:
## This gets updated in real time when cells are selected in directional mode.
## sort_values is ascending by default, so the smallest P values come first
svobj.results_proj_correlation.sort_values(by='P').head()

Unnamed: 0,R,P
TCL1A,-0.427525,2.807571e-11
AIM2,0.33999,2.073196e-07
CD27,0.335986,2.933639e-07
RP5-887A10.1,0.326839,6.366942e-07
FCER2,-0.320034,1.114949e-06


In [12]:
## Sort by the 'R' column in ascending order, so the most negative values come first
svobj.results_proj_correlation.sort_values(by='R', ascending=True).head()

Unnamed: 0,R,P
TCL1A,-0.427525,2.807571e-11
FCER2,-0.320034,1.114949e-06
SNX29P2,-0.284348,1.693367e-05
CD72,-0.276116,3.016605e-05
CD74,-0.264308,6.691133e-05


In [14]:
## This gets updated in real time when cells are selected in differential mode.
## sort_values is ascending by default, so the smallest P values come first
svobj.results_diffexpr.sort_values(by='P').head()

Unnamed: 0,T,P
FTH1,61.822418,0.0
CST3,81.813329,0.0
FTL,77.185153,0.0
TYROBP,75.13137,0.0
S100A4,45.770895,2.47347e-306


<a id="denseadata"></a>

## AnnData with dense input matrix
Alternatively, the data can be passed as a Scanpy AnnData object. By default, the expression data is read from the `.X` attribute and the UMAP data is read from `.obsm` with the key `'X_umap'`. However, alternative embeddings can be provided with the `embedding_name` argument, and the expression data can be read from the `.raw.X` attribute by setting the `use_raw` argument to `True`.

In [16]:
## Read the AnnData object from the .h5ad file and display its summary
data = sc.read('../../data/pbmc3k_20210420.h5ad')
data

AnnData object with n_obs × n_vars = 2638 × 1838
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
    var: 'gene_ids', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'hvg', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

#### Here the data in data.X is dense and the data in data.raw.X is sparse. We first illustrate the method with the dense data and subsequently with the sparse data

In [17]:
## use_raw=False takes the expression values from data.X; the embedding is
## looked up in data.obsm under the 'X_umap' key
svobj = SCIViewer(data, embedding_name='X_umap', use_raw=False)
svobj.explore_data()

Create renderer
Start thread
Finish thread
Setting up...
0.03204011917114258 seconds to select and project cells
Selected 475 cells
Calculating correlations...
0.05255699157714844 seconds to calculate correlations. Sparsity:  False


In [18]:
## Sort by the 'R' column in descending order; no .head() here, so the whole
## table is displayed.
## NOTE(review): the last rows of the saved output have empty R/P values —
## consider previewing with .head() or dropping those rows; confirm intent
svobj.results_proj_correlation.sort_values(by='R', ascending=False)

Unnamed: 0,R,P
LTB,0.555617,7.890236e-40
MAL,0.342726,1.544537e-14
TRABD2A,0.258347,1.110187e-08
LDLRAP1,0.229241,4.408213e-07
SRSF5,0.210494,3.701575e-06
...,...,...
BACE2,,
SIK1,,
ICOSLG,,
SLC19A1,,


<a id="sparseadata"></a>

## AnnData with sparse input matrix


In [19]:
## Inspect the storage format of the raw expression matrix
data.raw.X

<2638x13714 sparse matrix of type '<class 'numpy.float32'>'
	with 2238732 stored elements in Compressed Sparse Row format>

#### This data is sparse but is in compressed sparse row format (csr_matrix) rather than csc_matrix, so we need to convert it to csc_matrix format

In [20]:
import scipy.sparse as sp

In [22]:
## Convert the raw expression matrix to compressed sparse column format
z = sp.csc_matrix(data.raw.X)

In [23]:
## Replace data.raw with an AnnData built from the converted matrix, reusing
## the existing raw var table and the obs table
data.raw = sc.AnnData(z, var=data.raw.var, obs=data.obs)

In [24]:
## Check the storage format of data.raw.X after the replacement
data.raw.X

<2638x13714 sparse matrix of type '<class 'numpy.float32'>'
	with 2238732 stored elements in Compressed Sparse Column format>

#### Now that data.raw.X is in csc_matrix format, it can be passed to sciviewer

In [26]:
## use_raw=True makes Sciviewer read the expression values from data.raw.X
svobj = SCIViewer(data, embedding_name='X_umap', use_raw=True)
svobj.explore_data()

Create renderer
Start thread
Finish thread
Setting up...


<a id="sparse_matrix"></a>

## Passing a sparse matrix directly

We can also pass a sparse matrix directly. Since no information is provided about the gene and cell names, we pass those as separate arguments.

In [27]:
## Wrap the stored UMAP coordinates in a DataFrame indexed by cell name
umap = pd.DataFrame(
    data.obsm['X_umap'],
    columns=['UMAP_1', 'UMAP_2'],
    index=data.obs.index,
)
umap.head()

Unnamed: 0,UMAP_1,UMAP_2
AAACATACAACCAC-1,9.728817,4.212151
AAACATTGAGCTAC-1,3.799765,10.181845
AAACATTGATCAGC-1,7.023628,4.829623
AAACCGTGCTTCCG-1,-0.298523,2.024061
AAACCGTGTATGCG-1,8.018503,-0.300426


In [28]:
## Cell names as a plain Python list, taken from the obs index
cell_names = data.obs.index.tolist()
cell_names[:5]

['AAACATACAACCAC-1',
 'AAACATTGAGCTAC-1',
 'AAACATTGATCAGC-1',
 'AAACCGTGCTTCCG-1',
 'AAACCGTGTATGCG-1']

In [29]:
## Gene names as a plain Python list, taken from the index of the raw var table
gene_names = data.raw.var.index.tolist()
gene_names[:5]

['AL627309.1', 'AP006222.2', 'RP11-206L10.2', 'RP11-206L10.9', 'LINC00115']

In [30]:
## Use the sparse raw matrix as the expression input.
## NOTE: this rebinds `expr`, which held the dense DataFrame in the first example
expr = data.raw.X

In [31]:
## Display the matrix summary to confirm it is in compressed sparse column format
expr

<2638x13714 sparse matrix of type '<class 'numpy.float32'>'
	with 2238732 stored elements in Compressed Sparse Column format>

In [32]:
## A bare sparse matrix carries no labels, so gene and cell names are passed explicitly
svobj = SCIViewer(expr, umap, gene_names=gene_names, cell_names=cell_names)
svobj.explore_data()

Create renderer
Start thread
Finish thread
Setting up...
EXPORTING DATA...
BYE


2022-09-05 18:35:00.043 python[48357:1303094] NewtNSView::dealloc: softLock still hold @ dealloc!
