In [1]:
# | echo: false
# | output: false
# | warning: false

from datetime import datetime
from os import path
from os.path import join
from typing import List, Union

import decoupler as dc
import gseapy as gs
import pandas as pd
import panel as pn
import scanpy as sc
import tomlkit

# Load Panel's "tabulator" extension; the DE result tables further down are
# rendered with pn.widgets.Tabulator.
pn.extension("tabulator")

In [6]:
import pooch

# Download manager for the example files published under the figshare DOI below.
# Fetched files are stored in the OS cache directory that pooch resolves for
# "scverse_tutorials".
EXAMPLE_DATA = pooch.create(
    path=pooch.os_cache("scverse_tutorials"),
    base_url="doi:10.6084/m9.figshare.22716739.v1/",
)
# Fill the registry from the DOI record so files can be requested by name via
# EXAMPLE_DATA.fetch(<file name>).
EXAMPLE_DATA.load_registry_from_doi()


In [9]:
# Example 10x count matrices to load, keyed by sample ID. The ID becomes the
# per-cell label stored in `adata.obs["sample"]` after concatenation.
samples = {
    "s1d1": "s1d1_filtered_feature_bc_matrix.h5",
    "s1d3": "s1d3_filtered_feature_bc_matrix.h5",
}
adatas = {}

for sample_id, filename in samples.items():
    # The fetched file location must NOT be bound to the name `path`: that name
    # is the `os.path` module imported at the top of the notebook and is used
    # later as `path.join(...)`. Rebinding it to a string breaks those cells
    # when the notebook is run top to bottom on a fresh kernel.
    h5_file = EXAMPLE_DATA.fetch(filename)
    sample_adata = sc.read_10x_h5(h5_file)
    # De-duplicate variable names per sample before the samples are combined.
    sample_adata.var_names_make_unique()
    adatas[sample_id] = sample_adata

# Stack the samples; the dict keys are written to obs["sample"].
adata = sc.concat(adatas, label="sample")
# Observation names are made unique after concatenation.
adata.obs_names_make_unique()
print(adata.obs["sample"].value_counts())
adata


Downloading file 's1d1_filtered_feature_bc_matrix.h5' from 'doi:10.6084/m9.figshare.22716739.v1/s1d1_filtered_feature_bc_matrix.h5' to '/home/mmabrouk/.cache/scverse_tutorials'.
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
Downloading file 's1d3_filtered_feature_bc_matrix.h5' from 'doi:10.6084/m9.figshare.22716739.v1/s1d3_filtered_feature_bc_matrix.h5' to '/home/mmabrouk/.cache/scverse_tutorials'.


sample
s1d1    8785
s1d3    8340
Name: count, dtype: int64


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 17125 × 36601
    obs: 'sample'

In [2]:
# | echo: true
# | output: false
# | warning: false

# Column of `adata.obs` used to group cells for the differential-expression
# test. Set to "" to fall back to the value stored in config.toml.
CLUSTERING_COL: str = "scTAB_annotation_majority_voting"
# NOTE(review): spelled with a double "I" (presumably meant CONDITION). It is not
# read by any cell shown here; the name is left unchanged in case parameter
# injection from outside the notebook relies on it.
CONDIITION: Union[str, None] = None
# Passed as `mask_var` to sc.tl.rank_genes_groups below.
# Presumably None means "no restriction" — TODO confirm.
SUBSET: Union[str, List, None] = None

# Enrichment database names to use (not referenced by the cells shown here).
# Check the available options with e.g. gseapy.get_library_name()
# (TODO confirm against the installed gseapy version).
ENRICHMENT_DBS: list[str] = []

In [3]:
# | echo: false
# | output: false
# | warning: false

## Pipeline parameters
# TOML documents are UTF-8 by specification, so decode explicitly instead of
# relying on the platform's default text encoding (which is not UTF-8 everywhere).
with open("../config.toml", "r", encoding="utf-8") as f:
    config = tomlkit.parse(f.read())

In [4]:
# | output: false
# | warning: false


# Directories (read from the "basic" table of config.toml)
ROOT_DIR = config["basic"]["ANALYSIS_DIR"]
# Output directory relative to ROOT_DIR: adata.h5ad is read from here and the
# DE workbook is written here.
DIR_SAVE = path.join(ROOT_DIR, config["basic"]["DIR_SAVE"])
DIR_samples = config["basic"]["DIR_SAMPLES"]

# Basic information
ORGANISM: str = config["basic"]["ORGANISM"]
# Key into `adata.layers`: that layer is copied into `adata.X` and passed as
# `layer=` to the ranking step below.
NORMALIZATION_METHOD = config["normalization"]["NORMALIZATION_METHOD"]

# Resolve the grouping column: a value set in the parameters cell wins,
# otherwise fall back to the one stored in config.toml.
if not CLUSTERING_COL:
    CLUSTERING_COL = config["clustering"]["CLUSTERING_COL"]
# This has to be a second `if`, not an `elif` on the same condition: the check
# must run AFTER the fallback so that an empty value in both places is reported
# instead of silently reaching the ranking step.
if not CLUSTERING_COL:
    raise ValueError(
        "No clustering column found. please provide a key for cell grouping"
    )

In [5]:
# | echo: false
# | output: false
# | warning: false

# Load the saved AnnData object from the output directory.
# NOTE(review): this rebinds `adata`, replacing the object built from the
# example 10x files earlier in the notebook.
adata = sc.read_h5ad(path.join(DIR_SAVE, "adata.h5ad"))
# Make the configured layer the active expression matrix; `.copy()` keeps
# `adata.X` independent of the stored layer.
adata.X = adata.layers[NORMALIZATION_METHOD].copy()

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = '/home/mohamed/Documents/Bioinformatics/GPM/analysis/scRNAseq_pipeline/save/adata.h5ad', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [None]:
# | echo: false
# | output: false
# | warning: false

# Rank genes for every group in `adata.obs[CLUSTERING_COL]` using the Wilcoxon
# method on the configured layer.
sc.tl.rank_genes_groups(
    adata,
    groupby=CLUSTERING_COL,
    mask_var=SUBSET,
    layer=NORMALIZATION_METHOD,
    method="wilcoxon",
    pts=True,  # presumably also records the fraction of expressing cells per group — confirm
)
# NOTE(review): the cells below call the getters/plots without a `key`
# argument, so they presumably read the unfiltered ranking rather than the
# output of this filter step — confirm that is intended.
sc.tl.filter_rank_genes_groups(adata)

In [None]:
# | echo: false
# | output: false
# | warning: false

# Collect one DE table per group, ordered from the highest to the lowest
# log fold change.
cell_types = adata.obs[CLUSTERING_COL].unique().tolist()
de_df = {
    group_name: sc.get.rank_genes_groups_df(adata, group=group_name).sort_values(
        by="logfoldchanges", ascending=False
    )
    for group_name in cell_types
}

In [None]:
# | echo: false
# | output: false
# | warning: false

# Export the per-group DE tables to a date-stamped workbook in DIR_SAVE,
# one sheet per group.
excel_path = join(
    DIR_SAVE, datetime.strftime(datetime.now(), "%Y%m%d") + "_DE_results.xlsx"
)

# Excel sheet titles may not contain these characters and are limited to
# 31 characters; group labels (e.g. long cell-type names) can violate both.
invalid_sheet_chars = set("[]:*?/\\")
max_sheet_len = 31

# The context manager finalizes the workbook and releases the file handle even
# if writing one of the sheets raises.
with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
    used_sheet_names = set()
    for df_name, df in de_df.items():
        cleaned = "".join(
            "_" if ch in invalid_sheet_chars else ch for ch in str(df_name)
        )
        sheet_name = cleaned[:max_sheet_len] or "sheet"
        # Truncation/cleaning can make two labels collide; sheet titles are
        # compared case-insensitively, so disambiguate with a numeric suffix.
        counter = 1
        while sheet_name.lower() in used_sheet_names:
            suffix = f"_{counter}"
            sheet_name = cleaned[: max_sheet_len - len(suffix)] + suffix
            counter += 1
        used_sheet_names.add(sheet_name.lower())
        df.to_excel(writer, sheet_name=sheet_name)

In [None]:
# | echo: false
# | output: false
# | warning: false

# Rebuild the per-group DE tables, this time passing pval_cutoff=0.05 to the
# getter, again ordered from the highest to the lowest log fold change.
# These tables replace the ones held in `de_df` and feed the interactive view.
cell_types = adata.obs[CLUSTERING_COL].unique().tolist()
de_df = {
    group_name: sc.get.rank_genes_groups_df(
        adata, group=group_name, pval_cutoff=0.05
    ).sort_values(by="logfoldchanges", ascending=False)
    for group_name in cell_types
}

In [None]:
# | echo: false
# | warning: false

# Build one (tab title, table widget) pair per group. Iterating over the dict
# items directly avoids the unused enumerate index and the repeated key lookup.
widget_ls = [
    (
        key,
        pn.widgets.Tabulator(
            table,
            header_filters=True,
            pagination="local",
            show_index=False,
            sortable=True,
            disabled=True,  # presumably renders the table read-only — confirm
        ),
    )
    for key, table in de_df.items()
]

# Last expression of the cell: displays the tables as tabs, one per group.
pn.Tabs(*widget_ls, dynamic=True)

In [None]:
# | echo: false
# | warning: false

# Overview plot of the gene ranking stored on `adata` by sc.tl.rank_genes_groups.
sc.pl.rank_genes_groups(adata)

In [None]:
# | echo: false
# | warning: false

# Dot plot of the ranked genes, using the ranking stored on `adata`.
sc.pl.rank_genes_groups_dotplot(adata)

In [None]:
# | echo: false
# | warning: false

# Heatmap of the ranked genes, using the ranking stored on `adata`.
sc.pl.rank_genes_groups_heatmap(adata)

In [None]:
# | echo: false
# | warning: false

# Matrix plot of the ranked genes, using the ranking stored on `adata`.
sc.pl.rank_genes_groups_matrixplot(adata)

In [None]:
# | echo: false
# | warning: false

# Stacked violin plot of the ranked genes, using the ranking stored on `adata`.
sc.pl.rank_genes_groups_stacked_violin(adata)