In [1]:
import anndata
import holoviews as hv
hv.extension('bokeh')
import numpy as np
import pandas as pd

import scplot as sp


Read in data. The data consists of 3K PBMCs from a healthy donor from 10x Genomics.

In [2]:
# Load the 3K PBMC dataset from an .h5ad file.
# `anndata.read` is a deprecated alias of `read_h5ad` and is no longer
# available in recent anndata releases, so call `read_h5ad` directly.
adata = anndata.read_h5ad('3K_PBMC.h5ad')

In [3]:
# List the keys of the per-cell (observation) annotations; these are the
# QC metrics and cluster labels used by the plots below.
adata.obs_keys()

['n_genes', 'percent_mito', 'n_counts', 'louvain', 'leiden']

Violin plot of QC metrics

In [4]:
# Violin plots of the three QC metrics across all cells.
sp.violin(
    adata,
    ['n_genes', 'n_counts', 'percent_mito'],
    cols=3,
)

Scatter plot matrix of QC metrics. You can optionally color the plot by cluster assignment.

In [5]:
# Scatter plot matrix of the QC metrics, colored by louvain cluster.
sp.scatter_matrix(
    adata,
    ['n_genes', 'n_counts', 'percent_mito'],
    color='louvain',
)

Violin plot of QC metrics by cluster assignment

In [6]:
# Violin plots of the QC metrics, grouped by louvain cluster.
sp.violin(
    adata,
    ['n_genes', 'n_counts', 'percent_mito'],
    by='louvain',
    cols=2,
    width=450,
    height=400,
)

Violin plot of gene expression by cluster

In [7]:
# Violin plots of CST3 and NKG7 expression, grouped by louvain cluster.
sp.violin(
    adata,
    ['CST3', 'NKG7'],
    by='louvain',
    cols=2,
    width=450,
    height=400,
)

Embedding of gene expression and cluster assignments

In [8]:
# UMAP embedding, one plot per key: three genes plus the louvain clusters.
sp.embedding(
    adata,
    basis='umap',
    keys=['CST3', 'NKG7', 'PPBP', 'louvain'],
)

We can display the labels directly on the plot

In [9]:
# UMAP embedding of the louvain clusters with the cluster labels drawn
# directly on the plot.
sp.embedding(
    adata,
    basis='umap',
    keys=['louvain'],
    labels_on_data=True,
    padding=(0.4, 0.05),
    width=500,
)

Compare louvain and leiden clusterings

In [10]:
# Stacked count plot comparing the louvain and leiden cluster assignments.
sp.count_plot(adata, 'louvain', 'leiden', stacked=True)

Heatmap of mean gene expression

In [11]:
# Marker genes shown in the heatmap (and reused by the dot plot cell below).
marker_genes = [
    'IL7R', 'CD79A', 'MS4A1', 'CD8A', 'CD8B', 'LYZ', 'CD14',
    'LGALS3', 'S100A8', 'GNLY', 'NKG7', 'KLRB1',
    'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP',
]
# Heatmap of mean expression of each marker gene per louvain cluster.
sp.heatmap(adata, keys=marker_genes, by='louvain')

Dotplot of gene expression

In [12]:
# Dot plot of the marker genes per louvain cluster; `marker_genes` is
# defined in the heatmap cell above.
sp.dotplot(adata, keys=marker_genes, by='louvain')

Scatter plot of FCGR3A versus MS4A7, colored by expression of CD14

In [13]:
# Gene-vs-gene scatter plot: FCGR3A on x, MS4A7 on y, colored by CD14.
sp.scatter(
    adata,
    x='FCGR3A',
    y='MS4A7',
    color='CD14',
)

Use the box select tool to select cells. After selection is complete, you can get the selected range.

In [14]:
# Keep a reference to the plot so the box-selection bounds can be read back
# from it in the next cell.
p = sp.embedding(adata, basis='umap', keys=['CST3'])
# Bare last expression so the notebook renders the plot.
p

In [15]:
# UMAP coordinates of the plotted cells, taken from the plot's data frame.
x = p.df['X_umap1']
y = p.df['X_umap2']

# Bounds of the box selection made on the plot at grid position [0, 0].
# May be None (presumably when no selection has been made yet).
bounds = sp.get_bounds(p[0, 0])
if bounds is not None:
    print(bounds)
    # bounds is indexed as (x_min, y_min, x_max, y_max); both ends inclusive.
    x_in_range = (x >= bounds[0]) & (x <= bounds[2])
    y_in_range = (y >= bounds[1]) & (y <= bounds[3])
    selection_adata = adata[x_in_range & y_in_range]

In [16]:
# Optional: uncomment the next line to save the plot as a PNG file.
# hv.save(p, 'test.png')

Duplicate cells to create a dataset with 5 million cells. We include only 3 genes to conserve memory. You can also open a large AnnData file in `backed` mode to load data on demand.

In [17]:
# Build a reduced AnnData that keeps only three genes from the raw matrix
# (to conserve memory) together with a copy of the cell annotations.
genes_to_include = ['CST3', 'NKG7', 'PPBP']
upsampled_adata = anndata.AnnData(
    X=adata.raw[:, genes_to_include].X,
    obs=adata.obs.copy(),
    var=pd.DataFrame(index=genes_to_include),
)
# Carry over the UMAP coordinates so the embedding can still be plotted.
upsampled_adata.obsm['X_umap'] = adata.obsm['X_umap']
# Repeat every cell 2000 times to simulate a very large dataset.
upsampled_adata = upsampled_adata[
    np.repeat(np.arange(upsampled_adata.shape[0]), 2000)
]
# Bare last expression: show the resulting number of cells.
"{:,} cells".format(upsampled_adata.shape[0])

'5,276,000 cells'

In [18]:
# Embedding of the upsampled dataset. Keep nbins set: the call will crash
# without nbins.
sp.embedding(
    upsampled_adata,
    basis='umap',
    keys=['CST3'],
    nbins=200,
)

In [19]:
# Gene-vs-gene scatter plot on the upsampled dataset. Keep nbins set: the
# call will crash without nbins.
sp.scatter(
    upsampled_adata,
    x='CST3',
    y='NKG7',
    color='PPBP',
    nbins=200,
)