In [None]:
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "anywidget>=0.9.0",
#     "jupyter-scatter-scsketch>=0.21.0",
#     "llvmlite>=0.44.0",
#     "numpy>=1.26.0",
#     "pandas>=2.0.0",
#     "scanpy>=1.9.0",
#     "scipy>=1.11.0",
#     "ipywidgets>=8.0.0",
#     "matplotlib>=3.7.0",
#     "requests>=2.31.0",
#     "watchfiles>=0.20.0",
# ]
#
# [tool.uv.sources]
# scsketch = { path = ".", editable = true }
# ///

# scSketch

> **Quick Start**: `uvx scsketch demo` (no cloning required!)  
> Or from this repo: `uvx juv run demo.ipynb`

scSketch provides a custom UI for [Jupyter-Scatter](https://jupyter-scatter.dev) that implements the [directional analysis from Colubri et al.'s Sciviewer](https://doi.org/10.1093/bioinformatics/btab689). Sciviewer's directional analysis helps you interpret patterns in embedding visualizations by identifying genes that vary locally along any user-specified direction.

For this demo, we're using the PBMC 3K single-cell RNA-seq dataset (2,638 peripheral blood mononuclear cells after filtering), already preprocessed and embedded with UMAP.

In [1]:
# Development convenience: load IPython's autoreload extension and use mode 2,
# so imported modules are reloaded before each cell runs and edits to local
# source files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Set ANYWIDGET_HMR=1 in the kernel's environment.
# NOTE(review): presumably this enables anywidget's hot module replacement so
# widget front-end edits reload live during development — confirm against the
# anywidget docs. Keep this comment on its own line: text after the value on the
# `%env` line would become part of the value.
%env ANYWIDGET_HMR=1

env: ANYWIDGET_HMR=1


## Load Data

Load your single-cell data using scanpy and prepare it for visualization.

In [5]:
import urllib.request
import os.path

# Setup data
# Remote location of the demo dataset and the local path it is cached at.
data_url = "https://www.dropbox.com/scl/fi/ih2laraxptbaeqfgqfax9/pbmc3k_20210420.h5ad?rlkey=fdtrijj9rh971uwvaa5x4qaq9&st=o0g4sf07&dl=1"
data_file = "data/pbmc3k_20210420.h5ad"

# Download only when no cached copy exists, so re-running the cell is cheap.
if not os.path.exists(data_file):
    # Derive the target directory from `data_file` and tolerate it already
    # existing (e.g. after an interrupted earlier run); a plain `os.mkdir`
    # would raise FileExistsError in that case.
    data_dir = os.path.dirname(data_file)
    if data_dir:
        os.makedirs(data_dir, exist_ok=True)

    print("Downloading PBMC 3K demo...")
    # Download under a temporary name and rename only on success, so an
    # interrupted transfer never leaves a truncated file at `data_file` that
    # the existence check above would later mistake for a complete download.
    partial_file = data_file + ".part"
    urllib.request.urlretrieve(data_url, partial_file)
    os.replace(partial_file, data_file)
    print("Download complete.")

Downloading PBMC 3K demo...
Download complete.


In [6]:
import scanpy as sc

# Read the downloaded .h5ad file from `data_file` with scanpy.
adata = sc.read(data_file)
# Bare last expression: show the AnnData summary (dimensions and available
# obs/var/obsm fields) as the cell output.
adata

AnnData object with n_obs × n_vars = 2638 × 1838
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
    var: 'gene_ids', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'hvg', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

## Prepare Data for scSketch

Extract UMAP coordinates, metadata, and gene expression data into a single DataFrame.

In [10]:
import pandas as pd

# 2-D UMAP embedding, one row per cell, exposed as the "x" and "y" columns
umap_df = pd.DataFrame(
    adata.obsm["X_umap"],
    index=adata.obs_names,
    columns=["x", "y"],
)

# Per-cell metadata columns to carry along
metadata_cols = ["n_genes", "total_counts_mt"]
metadata_df = adata.obs[metadata_cols]

# Gene expression as a dense table; matrices that expose `toarray`
# (e.g. sparse ones) are converted first, anything else is used as is
if hasattr(adata.X, "toarray"):
    expression_values = adata.X.toarray()
else:
    expression_values = adata.X
gene_exp_df = pd.DataFrame(
    expression_values,
    index=adata.obs_names,
    columns=adata.var_names,
)

# Columns handed to scSketch as categorical; they are stored as strings.
# NOTE(review): both are numeric QC metrics in `adata.obs`, so every distinct
# value becomes its own category — confirm this is intended.
categorical_cols = ["n_genes", "total_counts_mt"]

# Place coordinates, metadata and genes side by side, keep only the first
# occurrence of any repeated column name, then cast the categorical columns
# to strings in a single pass.
df = (
    pd.concat([umap_df, metadata_df, gene_exp_df], axis="columns")
    .loc[:, lambda frame: ~frame.columns.duplicated()]
    .astype({name: str for name in categorical_cols})
)

df.head()

Unnamed: 0,x,y,n_genes,total_counts_mt,TNFRSF4,CPSF3L,ATAD3C,C1orf86,RER1,TNFRSF25,...,DSCR3,BRWD1,BACE2,SIK1,C21orf33,ICOSLG,SUMO3,SLC19A1,S100B,PRMT2
AAACATACAACCAC-1,9.728817,4.212151,781,73.0,-0.17147,-0.280812,-0.046677,-0.475169,-0.544024,4.928497,...,-0.22657,-0.236269,-0.102943,-0.222116,-0.312401,-0.121678,-0.521229,-0.098269,-0.209095,-0.531203
AAACATTGAGCTAC-1,3.799765,10.181845,1352,186.0,-0.214582,-0.372653,-0.054804,-0.683391,0.633951,-0.334837,...,-0.317531,2.568868,0.007155,-0.445372,1.629285,-0.058662,-0.857163,-0.266844,-0.313146,-0.596654
AAACATTGATCAGC-1,7.023628,4.829623,1131,28.0,-0.376888,-0.295085,-0.057527,-0.520972,1.332648,-0.309362,...,-0.302938,-0.239801,-0.071774,-0.297857,-0.41092,-0.070431,-0.59072,-0.158656,-0.170876,1.379
AAACCGTGCTTCCG-1,-0.298523,2.024061,960,46.0,-0.285241,-0.281735,-0.052227,-0.484929,1.57268,-0.271825,...,-0.262978,-0.231807,-0.093818,-0.24777,2.552079,-0.097402,1.631684,-0.119462,-0.17912,-0.505669
AAACCGTGTATGCG-1,8.018503,-0.300426,522,12.0,-0.256484,-0.220394,-0.0468,-0.345859,-0.333409,-0.208122,...,-0.202237,-0.176766,-0.16735,-0.098665,-0.275836,-0.139482,-0.310095,-0.006877,-0.109614,-0.461946


## Launch scSketch

Create and display the scSketch widget.

In [13]:
from scsketch import ScSketch

# Build the scSketch widget from the combined per-cell table (`df`) prepared in
# the previous section.
sketch = ScSketch(
    data=df,
    # The columns that were cast to strings when `df` was assembled.
    categorical_columns=categorical_cols,
    # NOTE(review): left disabled — `adata.obs` for this dataset has no
    # "seurat_clusters" column to color by.
    # color_by_default="seurat_clusters",
    height=720,  # presumably pixels — TODO confirm against the ScSketch docs
    background_color="#111111",  # near-black background
)

# Display the widget as this cell's output.
sketch.show()

VBox(children=(GridBox(children=(VBox(children=(VBox(children=(HBox(children=(VBox(children=(<jscatter.widgets…

## How to Use scSketch

1. **Select points**: Use the rectangle or lasso tool to select cells in the embedding
2. **Add selection**: Click the `+` button to save your selection
3. **Run analysis**: Click `Compute Directional Search` to identify genes varying along the selected direction
4. **Explore results**: Click on genes to see their Reactome pathways, and click pathways to view diagrams
5. **Color by genes**: Use the dropdown to color the embedding by specific genes or metadata

The directional analysis lists each gene with its Pearson correlation coefficient (R) and p-value (p), indicating which genes are most strongly upregulated or downregulated along the selected direction.