In [1]:
# Core scverse libraries
import anndata as ad
import polars as pl

from collections.abc import Iterable

# Data retrieval
import pooch
import scanpy as sc
from lets_plot import *

LetsPlot.setup_html()

from typing import Literal

## todo 

extend `violin` to AnnData.var (genes)    
configure `violins`   
modulize the plots   

In [2]:
data = sc.read("data/pbmc3k.h5ad")  # raw data
data.obs_names_make_unique()
data.obs

  utils.warn_names_duplicates("obs")


Unnamed: 0,sample
AAACCCAAGGATGGCT-1,s1d1
AAACCCAAGGCCTAGA-1,s1d1
AAACCCAAGTGAGTGC-1,s1d1
AAACCCACAAGAGGCT-1,s1d1
AAACCCACATCGTGGC-1,s1d1
...,...
TTTGTTGAGAGTCTGG-1,s1d3
TTTGTTGCAGACAATA-1,s1d3
TTTGTTGCATGTTACG-1,s1d3
TTTGTTGGTAGTCACT-1,s1d3


In [3]:
# mitochondrial genes, "MT-" for human, "Mt-" for mouse
data.var["mt"] = data.var_names.str.startswith("MT-")
# ribosomal genes
data.var["ribo"] = data.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes
data.var["hb"] = data.var_names.str.contains("^HB[^(P)]")

In [4]:
sc.pp.calculate_qc_metrics(data, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True)

In [20]:
_THEME_VIOLIN = (
    theme_classic()
    + theme(
        text=element_text(size=12, family="Arial", color="#3f3f3f"),
        title=element_text(size=14, family="Arial", color="#3f3f3f"),
    )
    + scale_fill_viridis()
    + ggsize(400, 400)
)

In [21]:
def violin(
    data: sc.AnnData,
    key: str,
    *,
    violin_fill: str = "#FF00FF",
    violin_color: str = "#2f2f2f",
    point_color: str = "#1f1f1f",
    point_alpha: float = 0.7,
    point_size: float = 0.5,
    trim: bool = False,
    show_tooltips: bool = True,
    show_points: bool = True,
    add_tooltips: list[str] | tuple[str] | Iterable[str] = None,
    custom_tooltips: list[str] | tuple[str] | Iterable[str] = None,
):
    # check if data is an AnnData object
    if not isinstance(data, sc.AnnData):
        msg = "data must be an AnnData object"
        raise TypeError(msg)
    else:
        frame = pl.from_pandas(data.obs, include_index=True).rename({"None": "CellID"})
    # check if key is in the columns
    if key not in frame.columns:
        msg = f"key must be a column in the AnnData object, but {key} is not in the columns"
        raise KeyError(msg)

    # handle tooltips
    base_tooltips = ["CellID", key]
    if not show_tooltips:
        tooltips = "none"  # for letsplot, this removes the tooltips
    else:
        if isinstance(custom_tooltips, Iterable):
            tooltips = list(custom_tooltips)
        elif isinstance(add_tooltips, Iterable):
            tooltips = base_tooltips + list(add_tooltips)
        else:
            tooltips = base_tooltips

    vln = (
        ggplot(data=frame)
        + geom_violin(
            data=frame,
            mapping=aes(y=key),
            fill=violin_fill,
            color=violin_color,
            trim=trim,
            tooltips=layer_tooltips([key]),
        )
        + _THEME_VIOLIN
    )
    if show_points:
        print(tooltips)
        vln += geom_jitter(
            data=frame,
            mapping=aes(y=key),
            color=point_color,
            alpha=point_alpha,
            size=point_size,
            tooltips=layer_tooltips(tooltips),
        )

    return vln

In [22]:
violin(data=data, key="pct_counts_mt", show_points=False)

In [23]:
def violins(data, keys: list, interactive=False, multi_panel=True, **kwargs):
    if multi_panel:
        plots = list()
        for key in keys:
            plots.append(violin(data, key=key, **kwargs))

        vlns = gggrid(plots)
    else:
        frame = pl.from_pandas(data.obs[keys], include_index=True).rename({"None": "CellID"})
        frame = frame.unpivot(index="CellID", variable_name="observations", value_name="value")
        vlns = (
            ggplot(data=frame)
            + geom_violin(aes(x="observations", y="value", fill="observations"))
            + _THEME_VIOLIN
        )

    if interactive:
        return vlns + ggtb()
    else:
        return vlns

In [24]:
data.obs.columns

Index(['sample', 'n_genes_by_counts', 'log1p_n_genes_by_counts',
       'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes',
       'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes',
       'pct_counts_in_top_500_genes', 'total_counts_mt',
       'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb',
       'log1p_total_counts_hb', 'pct_counts_hb'],
      dtype='object')

In [25]:
violins(
    data=data,
    keys=["n_genes_by_counts", "total_counts", "pct_counts_mt"],
)

['CellID', 'n_genes_by_counts']
['CellID', 'total_counts']
['CellID', 'pct_counts_mt']


In [None]:
(
    violins(
        data=data, keys=["n_genes_by_counts", "total_counts", "pct_counts_mt"], multi_panel=False
    )
    + scale_fill_viridis()
    + scale_y_log10()
    + theme(axis_text_x=element_blank())
    + ggsize(800, 600)
)

In [12]:
keys = ["n_genes_by_counts", "total_counts", "pct_counts_mt"]
frame = pl.from_pandas(data.obs[keys], include_index=True).rename({"None": "CellID"})
frame = frame.unpivot(index="CellID", variable_name="observations", value_name="value")

In [13]:
frame

CellID,observations,value
str,str,f64
"""AAACCCAAGGATGGCT-1""","""n_genes_by_counts""",2103.0
"""AAACCCAAGGCCTAGA-1""","""n_genes_by_counts""",3916.0
"""AAACCCAAGTGAGTGC-1""","""n_genes_by_counts""",683.0
"""AAACCCACAAGAGGCT-1""","""n_genes_by_counts""",4330.0
"""AAACCCACATCGTGGC-1""","""n_genes_by_counts""",325.0
…,…,…
"""TTTGTTGAGAGTCTGG-1""","""pct_counts_mt""",45.853661
"""TTTGTTGCAGACAATA-1""","""pct_counts_mt""",6.536541
"""TTTGTTGCATGTTACG-1""","""pct_counts_mt""",3.757332
"""TTTGTTGGTAGTCACT-1""","""pct_counts_mt""",20.487106


In [None]:
(
    ggplot(frame)
    + violin(aes(x="observation", y="value", color="observation"))
    + theme(axis_text_x=element_text(angle=90, hjust=1))
)

TypeError: violin() missing 1 required positional argument: 'key'

In [None]:
frame

CellID,observations,value
str,str,f64
"""AAACCCAAGGATGGCT-1""","""n_genes_by_counts""",2103.0
"""AAACCCAAGGCCTAGA-1""","""n_genes_by_counts""",3916.0
"""AAACCCAAGTGAGTGC-1""","""n_genes_by_counts""",683.0
"""AAACCCACAAGAGGCT-1""","""n_genes_by_counts""",4330.0
"""AAACCCACATCGTGGC-1""","""n_genes_by_counts""",325.0
…,…,…
"""TTTGTTGAGAGTCTGG-1""","""pct_counts_mt""",45.853661
"""TTTGTTGCAGACAATA-1""","""pct_counts_mt""",6.536541
"""TTTGTTGCATGTTACG-1""","""pct_counts_mt""",3.757332
"""TTTGTTGGTAGTCACT-1""","""pct_counts_mt""",20.487106


In [32]:
element_blank()

{'blank': True}