In [4]:
import gseapy as gp
import numpy as np
import pandas as pd
import scanpy as sc
from scipy import sparse

import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")
# Set scanpy's logging verbosity to level 0.
sc.settings.verbosity = 0

In [None]:
# ==================== ssGSEA functions ==================== #

# Read GMT file into dict: {pathway: [genes]}
def read_gmt(gmt_path: str) -> dict:
    """Parse a tab-separated GMT file into ``{gene_set_name: [gene, ...]}``.

    Each non-blank line is split on tabs: field 0 is the gene-set name,
    field 1 is ignored, and every remaining field is a gene. Surrounding
    whitespace is stripped from the name and from each gene, and fields that
    are empty or whitespace-only are dropped, so padded tokens such as
    ``"TP53 "`` do not silently fail to match the expression matrix.

    Lines with fewer than three tab-separated fields are skipped. If the same
    name appears on several lines, the last one wins.

    Parameters
    ----------
    gmt_path : str
        Path of the GMT file to read.

    Returns
    -------
    dict
        Mapping from gene-set name to its list of genes, in file order.
    """
    gene_sets = {}
    with open(gmt_path, "r") as f:
        for line in f:
            if not line.strip():
                continue
            parts = line.rstrip("\n").split("\t")
            if len(parts) < 3:
                continue
            gs_name = parts[0].strip()
            # Strip first, then filter: a field holding only spaces is not a gene.
            stripped = (g.strip() for g in parts[2:])
            gene_sets[gs_name] = [g for g in stripped if g]
    return gene_sets

# Convert gseapy ssGSEA res (res.res2d, long format) to scores matrix (sample by pathway)
def res2d_to_scores(res, score_col = "NES"):
    """Pivot a long-format ssGSEA result table into a sample-by-pathway matrix.

    Parameters
    ----------
    res : object
        Any object exposing a ``res2d`` DataFrame with one row per
        (sample, term) pair. The sample and term columns are looked up
        case-insensitively as "name" and "term".
    score_col : str, default "NES"
        Column holding the score to pivot, matched case-insensitively.

    Returns
    -------
    pandas.DataFrame
        Rows are samples (index named "cell_id"), columns are terms, values
        are the scores as a numeric dtype.

    Raises
    ------
    ValueError
        If no column matches ``score_col``, if a score cannot be parsed as a
        number, or (from ``DataFrame.pivot``) if a (sample, term) pair occurs
        more than once.
    """
    # Work on a copy so the caller's result table is never modified.
    df = res.res2d.copy()

    col_map = {c.lower(): c for c in df.columns}
    name_col = col_map.get("name", "Name")
    term_col = col_map.get("term", "Term")

    wanted = score_col.upper()
    score_col_actual = next((c for c in df.columns if c.upper() == wanted), None)
    if score_col_actual is None:
        raise ValueError(f"Score column {score_col} not found. Available: {list(df.columns)}")

    # The score column may be stored as object dtype; without this cast the
    # pivoted matrix stays object-dtyped, which makes downstream arithmetic
    # slow and on-disk serialisation type-ambiguous.
    df[score_col_actual] = pd.to_numeric(df[score_col_actual])

    scores = df.pivot(index=name_col, columns=term_col, values=score_col_actual)
    scores.index.name = "cell_id"
    return scores

# ssGSEA from cell by gene matrix (npz format)
def ssGSEA_from_cellxgene_npz(npz_path: str, cell_ids: list, gene_ids: list, gmt_path: str, out_path: str, chunk_size: int = 2000, min_geneset_size: int = 5, max_geneset_size: int = 5000, do_log1p: bool = True, do_cpm: bool = True):
    """Run ssGSEA on a sparse cell-by-gene matrix in chunks and save the scores.

    The matrix is loaded from ``npz_path``, processed ``chunk_size`` cells at a
    time (optionally CPM-scaled and log1p-transformed), scored with
    ``gp.ssgsea`` against the gene sets in ``gmt_path``, and the per-chunk
    "NES" matrices are concatenated and written to ``out_path`` as parquet.

    Parameters
    ----------
    npz_path : str
        Path to a scipy sparse ``.npz`` file holding the cell-by-gene matrix.
    cell_ids : list
        Row labels of the matrix; must be unique and match the row count.
    gene_ids : list
        Column labels of the matrix; must match the column count.
    gmt_path : str
        Path to the GMT gene-set file.
    out_path : str
        Destination parquet file for the final score matrix.
    chunk_size : int, default 2000
        Number of cells densified and scored per ``gp.ssgsea`` call.
    min_geneset_size, max_geneset_size : int
        Passed to ``gp.ssgsea`` as ``min_size`` / ``max_size``.
    do_log1p : bool, default True
        Apply ``log1p`` to the stored (non-zero) values of each chunk.
    do_cpm : bool, default True
        Scale each cell to a library size of 1e6 before the log transform.

    Returns
    -------
    pandas.DataFrame
        Cell-by-pathway score matrix, rows in the order of ``cell_ids``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive, the matrix shape does not match the
        ID lists, the matrix has no cells, or ``cell_ids`` contains duplicates.
    """
    # Validate cheap preconditions before any expensive scoring starts.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # load cell by gene matrix
    X = sparse.load_npz(npz_path).tocsr()
    if X.shape != (len(cell_ids), len(gene_ids)):
        raise ValueError(f"Shape mismatch: X {X.shape} vs {(len(cell_ids), len(gene_ids))}")
    if X.shape[0] == 0:
        raise ValueError("Expression matrix has no cells; nothing to score.")
    # Duplicates would only surface after all chunks were scored (or as an
    # opaque pivot error mid-run), so reject them up front.
    if not pd.Index(cell_ids).is_unique:
        raise ValueError("Duplicate cell IDs in cell_ids.")

    # parse GMT into dict: {pathway: [genes]}
    gene_sets = read_gmt(gmt_path)

    all_scores = []

    for start in range(0, X.shape[0], chunk_size):

        end = min(start + chunk_size, X.shape[0])
        chunk_ids = cell_ids[start:end]
        Xb = X[start:end, :].astype(np.float32)

        # optional: CPM + log1p to reduce ties (many zeros) and depth effects
        if do_cpm:
            libsize = np.asarray(Xb.sum(axis=1)).ravel()
            # Cells with no counts keep all-zero rows instead of dividing by zero.
            libsize[libsize == 0] = 1.0
            Xb = Xb.multiply(1e6 / libsize[:, None])
        if do_log1p:
            # Defensive copy so the in-place transform can never touch X.
            Xb = Xb.copy()
            # log1p(0) == 0, so transforming only the stored values is sufficient.
            Xb.data = np.log1p(Xb.data)

        # gseapy wants genes by samples (DataFrame)
        expr = pd.DataFrame(
            Xb.toarray().T,
            index=gene_ids,
            columns=chunk_ids,
        )

        res = gp.ssgsea(
            data=expr,
            gene_sets=gene_sets,
            sample_norm_method="rank",
            min_size=min_geneset_size,
            max_size=max_geneset_size,
            outdir=None,
            verbose=False,
            processes=1,
        )

        # res.res2d is long format (one row per sample/term pair); pivot it to
        # sample by pathway and restore the input cell order for this chunk.
        scores = res2d_to_scores(res, score_col = "NES")
        scores = scores.reindex(chunk_ids)
        all_scores.append(scores)

    # Every chunk is reindexed to its slice of the (unique) cell_ids, so the
    # concatenated index is unique by construction.
    scores_df = pd.concat(all_scores, axis=0)

    scores_df.to_parquet(out_path)
    return scores_df

In [None]:
# ==================== Main operations ==================== #

# Datasets to process, keyed by the folder name under ../../data/.
# NOTE(review): the "cell_type_label" flag is not read anywhere in this cell —
# confirm whether a later cell uses it.
settings = {"Xenium_5K_BC": {"cell_type_label": True},
            "Xenium_5K_OC": {"cell_type_label": True},
            "Xenium_5K_CC": {"cell_type_label": True},
            "Xenium_5K_LC": {"cell_type_label": False},
            "Xenium_5K_Prostate": {"cell_type_label": False},
            "Xenium_5K_Skin": {"cell_type_label": False}}

# For every dataset: load the AnnData object, keep malignant cells, verify that
# the saved ID arrays match it, then score the nuclear and cytoplasmic matrices.
for data in settings.keys():
    
    print(f"========== Processing {data}... ==========")
    
    # paths
    data_dir = f"../../data/{data}/"
    utils_dir = "../../data/utils/"
    # NOTE(review): output_dir is assigned but not used in this cell.
    output_dir = f"../../output/{data}/"
    
    # Read data
    adata = sc.read_h5ad(data_dir + "intermediate_data/adata.h5ad")
    # Keep only cells whose merged cell-type label is "Malignant cell".
    adata_tumor = adata[adata.obs["cell_type_merged"] == "Malignant cell"].copy()
    
    # check cell and gene IDs
    cell_ids = list(adata_tumor.obs["cell_id"])
    gene_ids = list(adata_tumor.var.index)
    
    # NOTE(review): allow_pickle=True unpickles object arrays, which can execute
    # arbitrary code; only safe for files produced by this project.
    cell_ids_npz = np.load(data_dir + "processed_data/cell_ids.npy", allow_pickle = True).tolist()
    gene_ids_npz = np.load(data_dir + "processed_data/gene_ids.npy", allow_pickle = True).tolist()
    
    # List equality checks both content and order, so any reordering fails here.
    if cell_ids_npz != cell_ids:
        raise ValueError("Cell ID order mismatch between NPZ and current adata_tumor!")

    if gene_ids_npz != gene_ids:
        raise ValueError("Gene order mismatch between NPZ and current adata_tumor!")
    
    # run ssGSEA in nuclear and cytoplasmic
    gmt_path = utils_dir + "hallmark_pathways.gmt"
    
    # nuc_scores / cyto_scores are rebound on every iteration; after the loop
    # they hold only the most recently scored dataset. The per-dataset results
    # are persisted to the parquet files given as out_path.
    nuc_scores = ssGSEA_from_cellxgene_npz(
        npz_path = data_dir + "processed_data/nuclear_expression_matrix.npz",
        cell_ids = cell_ids,
        gene_ids = gene_ids,
        gmt_path = gmt_path,
        out_path = data_dir + "processed_data/ssgsea_hallmark_nuclear.parquet",
    )

    cyto_scores = ssGSEA_from_cellxgene_npz(
        npz_path = data_dir + "processed_data/cytoplasmic_expression_matrix.npz",
        cell_ids = cell_ids,
        gene_ids = gene_ids,
        gmt_path = gmt_path,
        out_path = data_dir + "processed_data/ssgsea_hallmark_cytoplasmic.parquet",
    )

check
DEBUG chunk 0 - 2000
  scores.index head: ['aaaaaohf-1', 'aaaabkoj-1', 'aaaafefl-1', 'aaaahfjm-1', 'aaaahjao-1']
  expected head   : ['aaaaaohf-1', 'aaaabkoj-1', 'aaaafefl-1', 'aaaahfjm-1', 'aaaahjao-1']
  dup in scores.index? False
  index dtype: <class 'pandas.core.indexes.base.Index'>
DEBUG chunk 2000 - 4000
  scores.index head: ['abbcfmjl-1', 'abbcgpoi-1', 'abbcjlkj-1', 'abbcklof-1', 'abbcknin-1']
  expected head   : ['abbcfmjl-1', 'abbcgpoi-1', 'abbcjlkj-1', 'abbcklof-1', 'abbcknin-1']
  dup in scores.index? False
  index dtype: <class 'pandas.core.indexes.base.Index'>
check2


KeyboardInterrupt: 

In [15]:
# Column-wise mean of nuc_scores: one average score per pathway column.
# NOTE(review): nuc_scores is rebound for each dataset in the processing loop,
# so this reflects whichever dataset was scored last — confirm which is intended.
aaa = nuc_scores.mean(axis = 0)

In [16]:
# Show the per-pathway means from highest to lowest.
aaa.sort_values(ascending = False)

Term
HALLMARK_MYC_TARGETS_V2                       0.262318
HALLMARK_DNA_REPAIR                           0.176404
HALLMARK_MYC_TARGETS_V1                       0.165905
HALLMARK_ESTROGEN_RESPONSE_EARLY              0.139861
HALLMARK_HEDGEHOG_SIGNALING                   0.126865
HALLMARK_ESTROGEN_RESPONSE_LATE                0.11975
HALLMARK_E2F_TARGETS                          0.107964
HALLMARK_NOTCH_SIGNALING                      0.101733
HALLMARK_TGF_BETA_SIGNALING                   0.101345
HALLMARK_MTORC1_SIGNALING                      0.10067
HALLMARK_ANGIOGENESIS                         0.099909
HALLMARK_PI3K_AKT_MTOR_SIGNALING              0.083231
HALLMARK_ANDROGEN_RESPONSE                    0.079449
HALLMARK_INTERFERON_ALPHA_RESPONSE            0.064469
HALLMARK_REACTIVE_OXYGEN_SPECIES_PATHWAY      0.042605
HALLMARK_MITOTIC_SPINDLE                       0.04061
HALLMARK_INTERFERON_GAMMA_RESPONSE            0.030739
HALLMARK_UV_RESPONSE_DN                        0.01975
HALLM