In [None]:
import gseapy as gp
import numpy as np
import pandas as pd
import scanpy as sc
from scipy import sparse

# Notebook-wide output settings.
# NOTE(review): the blanket "ignore" filter below suppresses *every* Python
# warning for the rest of the session (deprecations and numerical warnings
# included) — consider narrowing it to specific categories/modules.
import warnings
warnings.filterwarnings("ignore")
# Set scanpy's logging verbosity to 0 to keep cell output quiet.
sc.settings.verbosity = 0

ModuleNotFoundError: No module named 'gseapy'

In [None]:
# ==================== ssGSEA ==================== #

def _ssgsea_result_to_cell_by_pathway(res2d: pd.DataFrame) -> pd.DataFrame:
    """Convert a gseapy ssGSEA result table to a cell-by-pathway score frame.

    Two layouts are handled:
    * long format with ``Name`` (sample) and ``Term`` (gene set) columns plus
      ``ES``/``NES`` value columns — pivoted to wide form (``NES`` preferred);
    * wide pathway-by-sample format — simply transposed.
    """
    if {"Name", "Term"}.issubset(res2d.columns):
        value_col = "NES" if "NES" in res2d.columns else "ES"
        wide = res2d.pivot(index="Name", columns="Term", values=value_col)
        # Long tables may carry scores as object dtype; store real floats.
        wide = wide.astype(float)
        wide.index.name = None
        wide.columns.name = None
        return wide
    return res2d.T


def ssGSEA_from_cellxgene_npz(
    npz_path: str,
    cell_ids: list,
    gene_ids: list,
    gmt_path: str,
    out_path: str,
    chunk_size: int = 2000,
    min_geneset_size: int = 5,
    max_geneset_size: int = 5000,
    do_log1p: bool = True,
    do_cpm: bool = True,
) -> pd.DataFrame:
    """Score every cell of a sparse cell-by-gene matrix with ssGSEA.

    The matrix is processed in row chunks so that only ``chunk_size`` cells are
    densified at a time. Each chunk is optionally CPM-normalised and
    log1p-transformed, handed to ``gp.ssgsea`` as a genes-by-cells DataFrame,
    and the per-chunk scores are stacked into one cell-by-pathway table that
    is written to ``out_path`` as Parquet and returned.

    Parameters
    ----------
    npz_path : path of a SciPy sparse ``.npz`` file holding the cell-by-gene matrix.
    cell_ids : row labels, one per matrix row, in matrix row order.
    gene_ids : column labels, one per matrix column, in matrix column order.
    gmt_path : path of the GMT file defining the gene sets.
    out_path : destination Parquet file for the score table.
    chunk_size : number of cells scored per ``gp.ssgsea`` call (must be >= 1).
    min_geneset_size, max_geneset_size : forwarded to ``gp.ssgsea`` as
        ``min_size`` / ``max_size``.
    do_log1p : apply ``log1p`` to the (optionally CPM-scaled) values.
    do_cpm : scale each cell to counts-per-million before scoring.

    Returns
    -------
    pandas.DataFrame indexed by ``cell_ids`` (original order) with one column
    per scored gene set.

    Raises
    ------
    AssertionError : the matrix shape does not equal ``(len(cell_ids), len(gene_ids))``.
    ValueError : ``chunk_size`` is smaller than 1, or the matrix has no rows.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    # load cell by gene matrix
    X = sparse.load_npz(npz_path).tocsr()
    assert X.shape == (len(cell_ids), len(gene_ids)), (
        f"matrix shape {X.shape} does not match "
        f"(len(cell_ids), len(gene_ids)) = ({len(cell_ids)}, {len(gene_ids)})"
    )
    n_cells = X.shape[0]
    if n_cells == 0:
        # Without this guard the failure surfaces later as an opaque
        # "No objects to concatenate" error from pd.concat.
        raise ValueError(f"{npz_path} contains no cells; nothing to score")

    # parse GMT into dict: {pathway: [genes]}
    # Newer gseapy releases expose `read_gmt` in place of `gsea_gmt_parser`;
    # keep using the original parser when it is available.
    parse_gmt = getattr(gp.parser, "gsea_gmt_parser", None)
    if parse_gmt is None:
        parse_gmt = gp.parser.read_gmt
    gene_sets = parse_gmt(gmt_path)

    chunk_scores = []
    for start in range(0, n_cells, chunk_size):
        end = min(start + chunk_size, n_cells)
        chunk_cells = cell_ids[start:end]
        # Slicing + astype yields a fresh matrix, so in-place edits below
        # never touch X.
        Xb = X[start:end, :].astype(np.float32)

        # optional: CPM + log1p to reduce ties (many zeros) and depth effects
        if do_cpm:
            libsize = np.asarray(Xb.sum(axis=1)).ravel()
            libsize[libsize == 0] = 1.0  # avoid dividing by zero for empty cells
            Xb = Xb.multiply(1e6 / libsize[:, None]).tocsr()
        if do_log1p:
            # log1p(0) == 0, so only the stored non-zeros need transforming.
            Xb.data = np.log1p(Xb.data)

        # gseapy wants genes by samples (DataFrame)
        expr = pd.DataFrame(Xb.toarray().T, index=gene_ids, columns=chunk_cells)

        res = gp.ssgsea(
            data=expr,
            gene_sets=gene_sets,
            sample_norm_method="rank",
            min_size=min_geneset_size,
            max_size=max_geneset_size,
            outdir=None,
            verbose=False,
            processes=1,
        )

        # Normalise whichever layout gseapy returned to sample by pathway.
        chunk_scores.append(_ssgsea_result_to_cell_by_pathway(res.res2d))

    scores_df = pd.concat(chunk_scores, axis=0)
    scores_df = scores_df.reindex(list(cell_ids))  # preserve original order

    scores_df.to_parquet(out_path)
    return scores_df

In [None]:
# ==================== Main operations ==================== #

# Datasets to process, keyed by folder name.
# NOTE(review): "cell_type_label" presumably records whether a dataset has
# cell-type annotations, but the flag is never read below — every dataset is
# filtered on obs["cell_type_merged"]. Confirm that this is intended.
settings = {
    "Xenium_5K_BC": {"cell_type_label": True},
    "Xenium_5K_OC": {"cell_type_label": True},
    "Xenium_5K_CC": {"cell_type_label": True},
    "Xenium_5K_LC": {"cell_type_label": False},
    "Xenium_5K_Prostate": {"cell_type_label": False},
    "Xenium_5K_Skin": {"cell_type_label": False},
}

for data in settings:

    print(f"========== Processing {data}... ==========")

    # Input / output locations for this dataset.
    data_dir = f"../../data/{data}/"
    utils_dir = "../../data/utils/"
    output_dir = f"../../output/{data}/"
    gmt_path = utils_dir + "hallmark_pathways.gmt"

    # Load the annotated data and keep only the malignant cells.
    adata = sc.read_h5ad(data_dir + "intermediate_data/adata.h5ad")
    is_malignant = adata.obs["cell_type_merged"] == "Malignant cell"
    adata_tumor = adata[is_malignant].copy()

    # Row / column labels for the expression matrices. These must follow the
    # same order that was used when the .npz matrices were built.
    cell_ids = list(adata_tumor.obs["cell_id"])
    gene_ids = list(adata_tumor.var.index)

    # Arguments common to the nuclear and cytoplasmic runs.
    shared_args = {
        "cell_ids": cell_ids,
        "gene_ids": gene_ids,
        "gmt_path": gmt_path,
    }

    # Hallmark ssGSEA on the nuclear compartment.
    nuc_scores = ssGSEA_from_cellxgene_npz(
        npz_path=data_dir + "processed_data/nuclear_expression_matrix.npz",
        out_path=output_dir + "ssgsea_hallmark_nuclear.parquet",
        **shared_args,
    )

    # Hallmark ssGSEA on the cytoplasmic compartment.
    cyto_scores = ssGSEA_from_cellxgene_npz(
        npz_path=data_dir + "processed_data/cytoplasmic_expression_matrix.npz",
        out_path=output_dir + "ssgsea_hallmark_cytoplasmic.parquet",
        **shared_args,
    )

Shape of the nuclear expression matrix: (102180, 5101)
Shape of the cytoplasmic expression matrix: (102180, 5101)
Shape of the nuclear expression matrix: (160250, 5101)
Shape of the cytoplasmic expression matrix: (160250, 5101)
Shape of the nuclear expression matrix: (221355, 5101)
Shape of the cytoplasmic expression matrix: (221355, 5101)
Shape of the nuclear expression matrix: (44624, 5001)
Shape of the cytoplasmic expression matrix: (44624, 5001)
Shape of the nuclear expression matrix: (95429, 5006)
Shape of the cytoplasmic expression matrix: (95429, 5006)
Shape of the nuclear expression matrix: (49126, 5006)
Shape of the cytoplasmic expression matrix: (49126, 5006)
