In [1]:
# | echo: false
# | output: false
# | warning: false

from datetime import datetime
from os import path
from os.path import join
from typing import List, Union

import decoupler as dc
import gseapy as gs
import numpy as np
import pandas as pd
import panel as pn
import scanpy as sc
import tomlkit

# Register Panel's "tabulator" extension for this session; pn.widgets.Tabulator
# is used further down to display the differential-expression tables.
pn.extension("tabulator")

In [5]:
# | echo: true
# | output: false
# | warning: false

# User-editable parameters for this report.

# Name of the `adata.obs` column holding the cell grouping. It is passed as
# `groupby` to the differential-expression and activity-ranking calls below.
CLUSTERING_COL: str = "leiden_1.0"
# NOTE(review): spelled "CONDIITION" (double I) and not referenced in the
# visible cells — confirm nothing sets it by this name before renaming.
CONDIITION: Union[str, None] = None
# Forwarded as `mask_var` to sc.tl.rank_genes_groups; None applies no mask.
SUBSET: Union[str, List, None] = None

# Check options from gseapy.show_resouces()
# NOTE(review): the helper name above looks misspelled — confirm which gseapy
# function lists the available libraries. Not referenced in the visible cells.
ENRICHMENT_DBS: list[str] = []

In [6]:
# | echo: false
# | output: false
# | warning: false

## Pipeline parameters
# TOML documents are UTF-8 by specification, so decode explicitly instead of
# relying on the platform's default text encoding.
with open("../config.toml", "r", encoding="utf-8") as f:
    config = tomlkit.parse(f.read())

In [7]:
# | output: false
# | warning: false


# Directories
ROOT_DIR = config["basic"]["ANALYSIS_DIR"]
DIR_SAVE = path.join(ROOT_DIR, config["basic"]["DIR_SAVE"])
DIR_samples = config["basic"]["DIR_SAMPLES"]

# Basic information
ORGANISM: str = config["basic"]["ORGANISM"]
NORMALIZATION_METHOD = config["normalization"]["NORMALIZATION_METHOD"]

# Resolve the grouping column: the notebook parameter wins; when it is left
# empty, fall back to the pipeline configuration.
if not CLUSTERING_COL:
    CLUSTERING_COL = config.get("clustering", {}).get("CLUSTERING_COL", "")

# Checked separately (not as an `elif` of the test above) so that an empty or
# missing config value is actually reported instead of silently passing through.
if not CLUSTERING_COL:
    raise ValueError(
        "No clustering column found. please provide a key for cell grouping"
    )

In [8]:
# | echo: false
# | output: false
# | warning: false

# Read the saved AnnData object, then make a copy of the configured
# normalization layer the active expression matrix.
adata_file = path.join(DIR_SAVE, "adata.h5ad")
adata = sc.read_h5ad(adata_file)
normalized_layer = adata.layers[NORMALIZATION_METHOD]
adata.X = normalized_layer.copy()

In [None]:
# | echo: false
# | output: false
# | warning: false

# Settings for the per-group differential-expression test.
de_options = {
    "groupby": CLUSTERING_COL,
    "mask_var": SUBSET,
    "layer": NORMALIZATION_METHOD,
    "method": "wilcoxon",
    "pts": True,
}

# Rank genes for every group, then run scanpy's filter step on the result.
sc.tl.rank_genes_groups(adata, **de_options)
sc.tl.filter_rank_genes_groups(adata)

In [None]:
# | echo: false
# | output: false
# | warning: false

# One ranked-genes table per group, ordered by descending log fold change.
cell_types = adata.obs[CLUSTERING_COL].unique().tolist()
de_df = {
    group: sc.get.rank_genes_groups_df(adata, group=group).sort_values(
        by="logfoldchanges", ascending=False
    )
    for group in cell_types
}

In [None]:
# | echo: false
# | output: false
# | warning: false

# Write every per-group table to its own sheet of a date-stamped workbook.
excel_path = join(DIR_SAVE, datetime.now().strftime("%Y%m%d") + "_DE_results.xlsx")

# The context manager saves and closes the workbook even when a sheet fails to
# write, so the file handle is never left open.
with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
    for df_name, df in de_df.items():
        # Sheet names must be strings; group labels may not be.
        df.to_excel(writer, sheet_name=str(df_name))

In [None]:
# | echo: false
# | output: false
# | warning: false

# Rebuild the per-group tables with `pval_cutoff=0.05` applied, again ordered
# by descending log fold change.
cell_types = adata.obs[CLUSTERING_COL].unique().tolist()
de_df = {
    group: sc.get.rank_genes_groups_df(
        adata, group=group, pval_cutoff=0.05
    ).sort_values(by="logfoldchanges", ascending=False)
    for group in cell_types
}

In [None]:
# | echo: false
# | warning: false

# One read-only, filterable, paginated table per group, shown as tabs.
# Iterating over items() avoids the unused enumerate index and the second
# dictionary lookup per group.
widget_ls = [
    (
        str(group),  # tab titles must be strings
        pn.widgets.Tabulator(
            table,
            header_filters=True,
            pagination="local",
            show_index=False,
            sortable=True,
            disabled=True,
        ),
    )
    for group, table in de_df.items()
]

pn.Tabs(*widget_ls)

# Differentially expressed genes

## Rank plot

In [None]:
# | echo: false
# | warning: false

# Plot the gene rankings stored on `adata` by the sc.tl.rank_genes_groups call
# earlier in the notebook.
sc.pl.rank_genes_groups(adata)

## Dot plot

In [None]:
# | echo: false
# | warning: false

# Dot-plot view of the ranked-genes result stored on `adata`, using scanpy's
# default settings.
sc.pl.rank_genes_groups_dotplot(adata)

## Heatmap

In [None]:
# | echo: false
# | warning: false

# Heatmap view of the ranked-genes result stored on `adata`, using scanpy's
# default settings.
sc.pl.rank_genes_groups_heatmap(adata)

In [None]:
# | echo: false
# | warning: false

# Matrix-plot view of the ranked-genes result stored on `adata`, using scanpy's
# default settings.
sc.pl.rank_genes_groups_matrixplot(adata)

# Pathway enrichment (PROGENy analysis)

In [None]:
# Fetch the PROGENy network for the configured organism.
progeny = dc.get_progeny(top=500, organism=ORGANISM)

# Fit the multivariate linear model on `adata` against that network.
mlm_options = {
    "source": "source",
    "target": "target",
    "weight": "weight",
    "verbose": True,
    "use_raw": False,
}
dc.run_mlm(mat=adata, net=progeny, **mlm_options)

# Pull the estimates stored under "mlm_estimate" into their own object.
acts = dc.get_acts(adata, obsm_key="mlm_estimate")

# Per-group summary of every source, with a dendrogram.
matrixplot_options = {
    "dendrogram": True,
    "standard_scale": "var",
    "colorbar_title": "Z-scaled scores",
    "cmap": "viridis",
}
sc.pl.matrixplot(
    acts, var_names=acts.var_names, groupby=CLUSTERING_COL, **matrixplot_options
)

# Transcription Factor Activity Inference (CollecTRI)

In [None]:
# Fetch the CollecTRI network for the configured organism.
net = dc.get_collectri(organism=ORGANISM, split_complexes=False)

# Fit the univariate linear model on `adata` against that network.
dc.run_ulm(
    mat=adata,
    net=net,
    source="source",
    target="target",
    weight="weight",
    verbose=True,
    use_raw=False,
)

# Pull the estimates stored under "ulm_estimate" into their own object.
acts = dc.get_acts(adata, obsm_key="ulm_estimate")

# Rank sources for each group against all remaining cells.
df: pd.DataFrame = dc.rank_sources_groups(
    acts, groupby=CLUSTERING_COL, reference="rest", method="t-test_overestim_var"
)

# Keep the first `n_markers` rows of each group, in the order returned above,
# and collect their names per group. `.to_dict()` yields a plain dict, so the
# variable is annotated as a mapping rather than a DataFrame.
n_markers = 3
source_markers: dict[str, list[str]] = (
    df.groupby("group")
    .head(n_markers)
    .groupby("group")["names"]
    .apply(list)
    .to_dict()
)

# Keyword arguments mirror the PROGENy matrixplot call for consistency.
sc.pl.matrixplot(
    acts,
    var_names=source_markers,
    groupby=CLUSTERING_COL,
    dendrogram=True,
    standard_scale="var",
    colorbar_title="Z-scaled scores",
    cmap="viridis",
)


# Over-representation analysis

In [11]:
# Download the MSigDB resource and list the collections it contains.
msigdb = dc.get_resource("MSigDB")
msigdb["collection"].unique().tolist()

['chemical_and_genetic_perturbations',
 'immunesigdb',
 'mirna_targets_mirdb',
 'go_molecular_function',
 'tf_targets_gtrf',
 'tf_targets_legacy',
 'oncogenic_signatures',
 'cell_type_signatures',
 'vaccine_response',
 'go_biological_process',
 'cancer_gene_neighborhoods',
 'cancer_modules',
 'go_cellular_component',
 'wikipathways',
 'reactome_pathways',
 'hallmark',
 'mirna_targets_legacy',
 'biocarta_pathways',
 'positional',
 'human_phenotype_ontology',
 'pid_pathways',
 'kegg_pathways']

In [15]:
# TODO: The ORA method is very slow, check other implementations

# Restrict MSigDB to the GO biological-process collection and drop repeated
# (geneset, genesymbol) pairs.
go = msigdb[msigdb.collection == "go_biological_process"]
go = go[~go.duplicated(["geneset", "genesymbol"])]

dc.run_ora(
    mat=adata,
    net=go,
    source="geneset",
    target="genesymbol",
    verbose=True,
    use_raw=False,
)

# Pull the estimates stored under "ora_estimate" into their own object.
acts = dc.get_acts(adata, obsm_key="ora_estimate")

# Replace every non-finite score with the largest finite score so the ranking
# and plot below only see finite values.
acts_v = acts.X.ravel()
max_e = np.nanmax(acts_v[np.isfinite(acts_v)])
acts.X[~np.isfinite(acts.X)] = max_e

# Group by the configured clustering column, not a hard-coded "louvain" key, so
# the ranking matches the grouping used by the matrixplot below and by the rest
# of the notebook.
df = dc.rank_sources_groups(
    acts, groupby=CLUSTERING_COL, reference="rest", method="t-test_overestim_var"
)

# Keep the first `n_markers` rows of each group, in the order returned above.
n_markers = 5
source_markers = (
    df.groupby("group")
    .head(n_markers)
    .groupby("group")["names"]
    .apply(list)
    .to_dict()
)

sc.pl.matrixplot(
    acts,
    var_names=source_markers,
    groupby=CLUSTERING_COL,
    dendrogram=True,
    standard_scale="var",
    colorbar_title="Z-scaled scores",
    cmap="viridis",
)


10811 features of mat are empty, they will be removed.
Running ora on mat with 12327 samples and 25790 targets for 7322 sources.


  0%|          | 0/12327 [00:00<?, ?it/s]