In [1]:
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "anywidget>=0.9.0",
#     "jupyter-scatter>=0.21.0",
#     "llvmlite>=0.44.0",
#     "numpy>=1.26.0",
#     "pandas>=2.0.0",
#     "scanpy>=1.9.0",
#     "scipy>=1.11.0",
#     "ipywidgets>=8.0.0",
#     "matplotlib>=3.7.0",
#     "requests>=2.31.0",
#     "watchfiles>=0.20.0",
# ]
#
# [tool.uv.sources]
# scsketch = { path = ".", editable = true }
# ///

# Welcome to scSketch

This notebook shows the basic scSketch workflow in a Jupyter notebook.

**Quick start**
- No clone: `uvx scsketch demo`
- From this repo: `uvx juv run demo.ipynb`

You need an `AnnData` object with a 2D embedding in `adata.obsm["X_umap"]` and expression in `adata.X`.
This demo covers (1) Directional Analysis (brush) and (2) Differential Expression (freeform lasso).


## Load Data

Load your single-cell data using scanpy and prepare it for visualization.

In [2]:
# Dataset download (shows a widget progress bar while downloading)

import os.path
import urllib.request

import ipywidgets as ipyw
from IPython.display import display

# Pick a dataset by setting `data_file` (local path) and `data_url` (download source).
# If `data_file` is not found locally, the code further down in this cell downloads it
# from `data_url`; set `data_url` to an empty string to require a local copy instead.
# Smaller alternative (PBMC3k) — uncomment these two lines and comment out the pair below:
# data_file = "pbmc3k.h5ad"
# data_url = "https://raw.githubusercontent.com/chanzuckerberg/cellxgene/main/example-dataset/pbmc3k.h5ad"

# Active choice. NOTE(review): the recorded output of the read cell shows
# 1,206,761 cells for this file, so expect a large download and memory footprint.
data_file = "pbmc12M.h5ad"
data_url = "https://datasets.cellxgene.cziscience.com/c3d1a5e6-780b-4fe9-a39b-1864f927e87b.h5ad"

# Progress UI for the download; not part of the scSketch workflow itself.
# `status` carries an HTML status line and `bar` is a 0-100 progress bar.
# Only `status` is shown initially; the download branch later adds `bar` to `ui`.
status = ipyw.HTML("")
bar = ipyw.IntProgress(value=0, min=0, max=100, description="Download", layout=ipyw.Layout(width="100%"))
ui = ipyw.VBox([status])
display(ui)

def _format_bytes(n: int) -> str:
    n = float(n)
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if n < 1024 or unit == "TB":
            return f"{int(n)} {unit}" if unit == "B" else f"{n:.1f} {unit}"
        n /= 1024
    return f"{n:.1f} TB"

# Resolve the dataset: reuse a local copy when present, otherwise download it.
if os.path.exists(data_file):
    size = os.path.getsize(data_file)
    status.value = f"<b>Dataset:</b> Found <code>{data_file}</code> ({_format_bytes(size)})."
elif not data_url:
    status.value = f"<b>Dataset:</b> Missing <code>{data_file}</code>. Set <code>data_url</code> to download."
else:
    # Reveal the progress bar only when a download actually happens.
    ui.children = [status, bar]
    status.value = f"<b>Dataset:</b> Downloading <code>{data_file}</code>…"

    def _hook(blocknum: int, blocksize: int, totalsize: int) -> None:
        """Progress callback for ``urlretrieve``: update the bar and status text.

        ``blocknum * blocksize`` can overshoot ``totalsize`` on the last block,
        so both the percentage and the displayed byte count are clamped.
        When the total size is unknown (not positive), only the running byte
        count is shown.
        """
        downloaded = blocknum * blocksize
        if totalsize and totalsize > 0:
            pct = int(min(100, downloaded * 100 / totalsize))
            bar.value = pct
            status.value = (
                f"<b>Dataset:</b> Downloading <code>{data_file}</code> — {pct}% "
                f"({_format_bytes(min(downloaded, totalsize))} / {_format_bytes(totalsize)})"
            )
        else:
            status.value = f"<b>Dataset:</b> Downloading <code>{data_file}</code> — {_format_bytes(downloaded)}"

    # Download into a sibling ".part" file and move it into place only once the
    # transfer has finished. Writing straight to `data_file` would leave a
    # truncated file behind after an interrupted download, and the existence
    # check above would then treat it as a complete dataset on the next run.
    partial_file = f"{data_file}.part"
    try:
        urllib.request.urlretrieve(data_url, partial_file, reporthook=_hook)
        os.replace(partial_file, data_file)
    except BaseException:
        # Also covers a kernel interrupt (KeyboardInterrupt); always re-raised.
        if os.path.exists(partial_file):
            os.remove(partial_file)
        status.value = f"<b>Dataset:</b> Download of <code>{data_file}</code> failed."
        raise
    bar.value = 100
    size = os.path.getsize(data_file)
    status.value = f"<b>Dataset:</b> Downloaded <code>{data_file}</code> ({_format_bytes(size)})."


VBox(children=(HTML(value=''),))

In [3]:
# Load the dataset selected above into an AnnData object using scanpy.
# `data_file` is the path set in the dataset download cell.
import scanpy as sc

adata = sc.read(data_file)
# Bare last expression: displays the AnnData summary (dimensions and available fields).
adata

AnnData object with n_obs × n_vars = 1206761 × 32357
    obs: 'original_barcodes', 'cell_name', 'batch_id', 'pool_id', 'chip_id', 'well_id', 'n_genes', 'n_reads', 'n_umis', 'total_counts_mito', 'pct_counts_mito', 'doublet_score', 'predicted_AIFI_L1', 'AIFI_L1_score', 'AIFI_L1', 'predicted_AIFI_L2', 'AIFI_L2_score', 'AIFI_L2', 'predicted_AIFI_L3', 'AIFI_L3_score', 'AIFI_L3', 'sample.sampleKitGuid', 'cohort.cohortGuid', 'subject.subjectGuid', 'subject.cmv', 'subject.bmi', 'subject.race', 'subject.ethnicity', 'subject.birthYear', 'subject.ageAtFirstDraw', 'sample.visitName', 'sample.drawYear', 'sample.subjectAgeAtDraw', 'specimen.specimenGuid', 'pipeline.fileGuid', 'subject.ageGroup', 'vaccine_year', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'tissue_type', 'suspension_type', 'assay_ontology_term_id', 'sex_ontology_term_id', 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'is_primary_data', 'cell_type'

## Launch scSketch

Create and display the scSketch widget.

In [6]:
from scsketch import ScSketch

# Candidate annotation columns, listed in order of preference; keep the ones
# that actually exist in this dataset.
preferred = ["louvain", "leiden", "cell_type", "seurat_clusters"]
metadata_cols = [name for name in preferred if name in adata.obs.columns]

# None of the preferred columns exist: fall back to the first obs column, if any.
if len(adata.obs.columns) > 0 and not metadata_cols:
    metadata_cols = [str(adata.obs.columns[0])]

# Color by the first chosen column, or by "x" when there is no metadata at all.
if metadata_cols:
    color_by_default = metadata_cols[0]
else:
    color_by_default = "x"

# Build the widget (an empty column list is passed as None) and render it.
sketch = ScSketch(
    adata=adata,
    metadata_cols=metadata_cols or None,
    color_by_default=color_by_default,
    height=720,
    background_color="#111111",
)
sketch.show()


VBox(children=(GridBox(children=(VBox(children=(VBox(children=(HBox(children=(VBox(children=(Button(icon='arro…

In [10]:
# Diagnostic: show the container type and dimensions of the expression matrix.
type(adata.X), adata.X.shape

(scipy.sparse._csr.csr_matrix, (1206761, 32357))

In [11]:
import scipy.sparse as sp

# Diagnostic: is the expression matrix stored in CSR format?
# `isspmatrix_csr` only recognises the legacy `csr_matrix` class and reports
# False for the newer `csr_array`; checking `issparse` plus the `format`
# attribute covers both, and a dense matrix short-circuits to False.
sp.issparse(adata.X) and adata.X.format == "csr"

True

In [9]:
# Benchmark (development aid, not part of the demo workflow): run
# benchmarks/perf_retest.py restricted to the "1.2m" dataset and write the
# results to benchmarks/perf_results.csv. Paths are relative to the working directory.
!uv run python benchmarks/perf_retest.py --datasets 1.2m --out benchmarks/perf_results.csv

Wrote 3 rows to benchmarks/perf_results.csv


In [10]:
# Benchmark: same script without a --datasets filter (presumably the script's
# default dataset set — confirm in perf_retest.py). Writes to the same CSV path
# as the other benchmark cells.
!uv run python benchmarks/perf_retest.py --out benchmarks/perf_results.csv


This is where adjacency matrices should go now.
  return AnnData(**{

This is where adjacency matrices should go now.
  return AnnData(**{
Wrote 9 rows to benchmarks/perf_results.csv


In [12]:
# Benchmark with an exported selection instead of synthetic ones (--no-synthetic).
# Prerequisite: benchmarks/selection_1_sel1.npz must already exist; the script
# raises FileNotFoundError when that file is missing.
!uv run python benchmarks/perf_retest.py --datasets 1.2m --selection-npz benchmarks/selection_1_sel1.npz --no-synthetic --out benchmarks/perf_results.csv

Traceback (most recent call last):
  File "/Users/askartemirbek/anywidgets_practice/scsketch/benchmarks/perf_retest.py", line 452, in <module>
    raise SystemExit(main())
                     ^^^^^^
  File "/Users/askartemirbek/anywidgets_practice/scsketch/benchmarks/perf_retest.py", line 357, in main
    exported.append(load_exported_selection(Path(p)))
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/askartemirbek/anywidgets_practice/scsketch/benchmarks/perf_retest.py", line 80, in load_exported_selection
    with np.load(path, allow_pickle=False) as z:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/askartemirbek/anywidgets_practice/scsketch/.venv/lib/python3.12/site-packages/numpy/lib/npyio.py", line 427, in load
    fid = stack.enter_context(open(os_fspath(file), "rb"))
                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: 'benchmarks/selection_1_sel1.npz'


## Directional analysis (Brush)

![](docs/assets/dir_analysis_demo.gif)

1. Set **Lasso Type** to **Brush**.
2. Set **Brush Size** to desired size.
3. Brush-select cells along a direction of interest.
4. Click **+** to save the selection.
5. Click **Compute Directional Search**.
6. Click a gene to view expression vs projection along the direction.
7. Use the Reactome pathway table/diagram to interpret hits.

The results table reports Pearson correlation (`R`) and p-value (`p`) for genes associated with the sketched direction.


## Differential expression (Freeform lasso)

![](docs/assets/diff_exp_analysis_demo.gif)

1. Set **Lasso Type** to **Freeform**.
2. Lasso-select a region of cells.
3. Click **Compute DE** to compare selected vs background cells.
4. Click a gene to view the selected vs background expression distribution plot.

The results table reports Welch’s t-statistic (`T`) and p-value (`p`).


## Programmatic access

Directional results for saved selections can be retrieved as a DataFrame:


In [10]:
# Retrieve the directional results for a saved selection by its name.
# Presumably requires that "Selection 1" was saved and its directional search
# computed in the widget first — confirm against the scSketch API.
sketch.get_genes("Selection 1")

Unnamed: 0,gene,correlation,p-value
94,LTB,0.579027,1.181663e-45
296,PVALB,0.387962,3.152135e-19
52,PROK2,0.387906,3.192615e-19
249,ITGA2B,0.387900,3.197076e-19
55,HGD,0.387899,3.197716e-19
...,...,...,...
205,GZMB,-0.754913,2.255080e-92
153,PRF1,-0.757022,3.593394e-93
77,GZMA,-0.780884,8.319740e-103
263,CST7,-0.784403,2.490001e-104


<details>
<summary>Optional: display gene symbols instead of IDs</summary>

scSketch displays genes using `adata.var_names`. If your dataset stores gene symbols in `adata.var["gene_symbols"]`, you can make a visualization-only copy:

```python
adata_view = adata.copy()
if "gene_symbols" in adata_view.var:
    adata_view.var["ensembl_id"] = adata_view.var_names
    adata_view.var_names = adata_view.var["gene_symbols"].astype(str)
    adata_view.var_names_make_unique()

sketch = ScSketch(
    adata=adata_view,
    metadata_cols=metadata_cols if metadata_cols else None,
    color_by_default=color_by_default,
    height=720,
    background_color="#111111",
)
sketch.show()
```

</details>
