In [7]:
import matplotlib.colors as clr
import matplotlib.pyplot as plt
import gseapy as gp
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
from matplotlib.patches import Patch
from scipy import sparse
from scipy.stats import wilcoxon
from statsmodels.stats.multitest import multipletests

import warnings
warnings.filterwarnings("ignore")
sc.settings.verbosity = 0

In [8]:
# Color
color_cts = clr.LinearSegmentedColormap.from_list("magma", ["#000003", "#3B0F6F", "#8C2980", "#F66E5B", "#FD9F6C", "#FBFCBF"], N=256)

In [9]:
# ==================== ssGSEA functions ==================== #

# Read GMT file into dict: {pathway: [genes]}
def read_gmt(gmt_path: str) -> dict:
    gene_sets = {}
    with open(gmt_path, "r") as f:
        for line in f:
            if not line.strip():
                continue
            parts = line.rstrip("\n").split("\t")
            if len(parts) < 3:
                continue
            gs_name = parts[0]
            genes = [g for g in parts[2:] if g]
            gene_sets[gs_name] = genes
    return gene_sets

# Convert gseapy ssGSEA res (res.res2d, long format) to scores matrix (sample by pathway)
def res2d_to_scores(res, score_col = "NES"):
    
    df = res.res2d.copy()

    col_map = {c.lower(): c for c in df.columns}
    name_col = col_map.get("name", "Name")
    term_col = col_map.get("term", "Term")

    score_col_actual = None
    for c in df.columns:
        if c.upper() == score_col.upper():
            score_col_actual = c
            break
    if score_col_actual is None:
        raise ValueError(f"Score column {score_col} not found. Available: {list(df.columns)}")

    scores = df.pivot(index=name_col, columns=term_col, values=score_col_actual)
    scores.index.name = "cell_id"
    return scores

# ssGSEA from cell by gene matrix (npz format)
def ssGSEA_from_cellxgene_npz(npz_path: str, cell_ids: list, gene_ids: list, gmt_path: str, out_path: str, chunk_size: int = 2000, min_geneset_size: int = 5, max_geneset_size: int = 5000, do_log1p: bool = True, do_cpm: bool = True):
    
    # load cell by gene matrix
    X = sparse.load_npz(npz_path).tocsr()
    if X.shape != (len(cell_ids), len(gene_ids)):
        raise ValueError(f"Shape mismatch: X {X.shape} vs {(len(cell_ids), len(gene_ids))}")

    # parse GMT into dict: {pathway: [genes]}
    gene_sets = read_gmt(gmt_path)

    all_scores = []

    for start in range(0, X.shape[0], chunk_size):
        
        end = min(start + chunk_size, X.shape[0])
        Xb = X[start:end, :].astype(np.float32)

        # optional: CPM + log1p to reduce ties (many zeros) and depth effects
        if do_cpm:
            libsize = np.asarray(Xb.sum(axis=1)).ravel()
            libsize[libsize == 0] = 1.0
            Xb = Xb.multiply(1e6 / libsize[:, None])
        if do_log1p:
            Xb = Xb.copy()
            Xb.data = np.log1p(Xb.data)

        # gseapy wants genes by samples (DataFrame)
        expr = pd.DataFrame(
            Xb.toarray().T,
            index=gene_ids,
            columns=cell_ids[start:end],
        )

        res = gp.ssgsea(
            data=expr,
            gene_sets=gene_sets,
            sample_norm_method="rank",
            min_size=min_geneset_size,
            max_size=max_geneset_size,
            outdir=None,
            verbose=False,
            processes=1,
        )

        # res.res2d: pathway by sample
        scores = res2d_to_scores(res, score_col = "NES")
        scores = scores.reindex(cell_ids[start:end])
        all_scores.append(scores)
    
    scores_df = pd.concat(all_scores, axis=0)
    if not scores_df.index.is_unique:
        raise ValueError("Duplicate cell IDs in final scores_df index.")

    scores_df.to_parquet(out_path)
    return scores_df

In [10]:
# ==================== Main operations ==================== #

settings = {"Xenium_5K_BC": {"cell_type_label": True},
            "Xenium_5K_OC": {"cell_type_label": True},
            "Xenium_5K_CC": {"cell_type_label": True},
            "Xenium_5K_LC": {"cell_type_label": False},
            "Xenium_5K_Prostate": {"cell_type_label": False},
            "Xenium_5K_Skin": {"cell_type_label": False}}

# for data in settings.keys():
for data in ["Xenium_5K_BC"]:
    
    print(f"========== Processing {data}... ==========")
    
    # paths
    data_dir = f"../../data/{data}/"
    utils_dir = "../../data/_utils/"
    output_dir = f"../../output/{data}/"
    
    # read data
    adata = sc.read_h5ad(data_dir + "intermediate_data/adata.h5ad")
    adata_tumor = adata[adata.obs["cell_type_merged"] == "Malignant cell"].copy()
    
    # determine plot size
    x_range = adata.obs["global_x"].max() - adata.obs["global_x"].min()
    y_range = adata.obs["global_y"].max() - adata.obs["global_y"].min()
    short_edge = min(x_range, y_range)

    scale = 5 / short_edge
    plot_figsize = (int(x_range * scale), int(y_range * scale))
    print(f"Plot size: {plot_figsize}")
    
    # check cell and gene IDs
    cell_ids = list(adata_tumor.obs["cell_id"])
    gene_ids = list(adata_tumor.var.index)
    
    cell_ids_npz = np.load(data_dir + "processed_data/cell_ids.npy", allow_pickle = True).tolist()
    gene_ids_npz = np.load(data_dir + "processed_data/gene_ids.npy", allow_pickle = True).tolist()
    
    if cell_ids_npz != cell_ids:
        raise ValueError("Cell ID order mismatch between NPZ and current adata_tumor!")

    if gene_ids_npz != gene_ids:
        raise ValueError("Gene order mismatch between NPZ and current adata_tumor!")
    
    # run ssGSEA in nuclear and cytoplasmic
    # gmt_path = utils_dir + "hallmark_pathways_filtered.gmt"
    gmt_path = utils_dir + "all_pathways_filtered.gmt"
    
    for segment in ["nuclear", "cytoplasmic"]:
        
        print(f"Running ssGSEA for {segment} compartment...")
        
        # pathway scores
        scores = ssGSEA_from_cellxgene_npz(npz_path = data_dir + f"processed_data/{segment}_expression_matrix.npz",
                                           cell_ids = cell_ids,
                                           gene_ids = gene_ids,
                                           gmt_path = gmt_path,
                                           out_path = data_dir + f"processed_data/ssgsea_hallmark_{segment}.parquet")
        
        # long format
        scores_long = scores.reset_index().melt(id_vars="cell_id", var_name="Pathway", value_name="NES")
        order = scores_long.groupby("Pathway")["NES"].median().sort_values(ascending=False).index
        order_labels = [" ".join(s.capitalize() for s in i.split("_")[1:]) for i in order]
        
        # statistical tests
        stats = []
        for pathway, df in scores_long.groupby("Pathway"):
            nes = df["NES"].dropna().to_numpy(dtype=float)
            if np.allclose(nes, 0):
                pval = 1.0
                stat = 0.0
            else:
                stat, pval = wilcoxon(nes, alternative="two-sided")
            stats.append({"Pathway": pathway, "median": np.median(nes), "pval": pval})
        stats_df = pd.DataFrame(stats)
        stats_df["qval"] = multipletests(stats_df["pval"], method="fdr_bh")[1]
        
        # determine significance
        alpha = 0.05
        stats_df["significance"] = "nonsignificant"
        stats_df.loc[(stats_df["qval"] < alpha) & (stats_df["median"] > 0), "significance"] = "positive"
        stats_df.loc[(stats_df["qval"] < alpha) & (stats_df["median"] < 0), "significance"] = "negative"
        
        scores_long = scores_long.merge(stats_df[["Pathway", "significance"]], on="Pathway", how="left")
        palette = {row.Pathway: "#d73027" if row["significance"] == "positive" else "#4575b4" if row["significance"] == "negative" else "lightgray" for _, row in stats_df.iterrows()}
        legend_handles = [Patch(facecolor="#d73027", edgecolor="black", label="Positive median NES"),
                          Patch(facecolor="#4575b4", edgecolor="black", label="Negative median NES"),
                          Patch(facecolor="lightgray", edgecolor="black", label="Not significant")]

        # boxplot of all pathways
        plt.figure(figsize=(25, 6))
        ax = sns.boxplot(data=scores_long, x="Pathway", y="NES", order=order, showfliers=False, palette=palette)
        ax.axhline(0, color="black", linestyle="--", linewidth=0.8)
        ax.set_xticklabels(order_labels, rotation=45, ha="right", fontsize=8)
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.legend(handles=legend_handles, loc="upper right", frameon=True, fontsize=12)
        plt.savefig(output_dir + f"ssgsea_hallmark_{segment.lower()}.jpeg", dpi = 300, bbox_inches = "tight")
        plt.close()
        
        # plot top pathways
        n_top = 5

        scores_mean = scores.mean(axis = 0).sort_values(ascending = False)
        top_scores = scores_mean.head(n_top)
        
        for pathway in top_scores.index:
            
            # add pathway to adata_tumor.obs
            pathway_label = f"{pathway}"
            adata_tumor.obs[pathway_label] = scores[pathway].values
            
            # plot pathway score
            sc.set_figure_params(figsize = plot_figsize)
            ax = sc.pl.scatter(adata_tumor, x="global_x", y="global_y", color=pathway_label, color_map=color_cts, size=1, show=False)
            ax.grid(False)
            ax.set_xticks([])
            ax.set_yticks([])
            ax.set_xlabel("")
            ax.set_ylabel("")
            ax.set_title("")
            for spine in ax.spines.values():
                spine.set_visible(False)
            plt.savefig(output_dir + f"{segment}_{pathway_label}.jpeg", dpi = 300, bbox_inches = "tight")
            plt.close()

Plot size: (5, 7)
Running ssGSEA for nuclear compartment...
Running ssGSEA for cytoplasmic compartment...


In [6]:
scores

Term,HALLMARK_ADIPOGENESIS,HALLMARK_ALLOGRAFT_REJECTION,HALLMARK_APICAL_JUNCTION,HALLMARK_APOPTOSIS,HALLMARK_COAGULATION,HALLMARK_COMPLEMENT,HALLMARK_E2F_TARGETS,HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION,HALLMARK_ESTROGEN_RESPONSE_EARLY,HALLMARK_ESTROGEN_RESPONSE_LATE,...,HALLMARK_MTORC1_SIGNALING,HALLMARK_MYC_TARGETS_V1,HALLMARK_MYOGENESIS,HALLMARK_P53_PATHWAY,HALLMARK_PI3K_AKT_MTOR_SIGNALING,HALLMARK_SPERMATOGENESIS,HALLMARK_TNFA_SIGNALING_VIA_NFKB,HALLMARK_UV_RESPONSE_DN,HALLMARK_UV_RESPONSE_UP,HALLMARK_XENOBIOTIC_METABOLISM
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aaaaaohf-1,-0.147863,-0.239232,-0.079289,-0.168468,0.029393,-0.082293,0.162613,-0.052421,0.173272,0.199052,...,0.19664,0.302738,-0.062248,-0.017958,0.117657,-0.321251,0.003354,-0.018084,-0.182755,-0.190275
aaaabkoj-1,-0.036917,-0.265009,-0.04458,-0.156976,-0.014752,-0.108202,0.145965,-0.000812,0.263282,0.237542,...,0.248623,0.395962,-0.072297,-0.005209,0.087987,-0.338244,0.066723,-0.026897,-0.127739,-0.248678
aaaafefl-1,-0.024504,-0.262003,-0.06484,-0.203082,-0.049267,-0.191146,0.256428,-0.068413,0.148,0.187329,...,0.119897,0.187543,-0.068141,-0.06579,0.165157,-0.227175,-0.010294,-0.042604,-0.107217,-0.235041
aaaahfjm-1,-0.191998,-0.244824,-0.117753,-0.169099,-0.001342,-0.219645,0.245856,-0.03382,0.161746,0.156331,...,0.137514,0.360009,-0.111224,-0.05455,0.237034,-0.313198,0.010237,0.022995,-0.070191,-0.19777
aaaahjao-1,-0.158978,-0.251916,-0.126593,-0.24124,-0.027745,-0.220915,0.10565,-0.029581,0.129358,0.133743,...,0.099666,0.200951,-0.097537,-0.059612,0.030013,-0.290994,0.008057,-0.003634,-0.165283,-0.273071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ogicnekl-1,-0.23356,-0.331405,-0.157554,-0.354135,-0.022658,-0.276489,0.137023,-0.024965,0.067085,0.041907,...,0.102543,0.171402,-0.114481,-0.064218,0.043426,-0.368745,-0.007424,0.008919,-0.26778,-0.36511
ognonagn-1,-0.23356,-0.331405,-0.157554,-0.354135,-0.022658,-0.276489,0.137023,-0.024965,0.067085,0.041907,...,0.102543,0.171402,-0.114481,-0.064218,0.043426,-0.368745,-0.007424,0.008919,-0.26778,-0.36511
ogpdggle-1,-0.131531,-0.350216,-0.121166,-0.290292,0.003205,-0.203329,0.154775,-0.032557,0.136753,0.157499,...,0.101273,0.250453,-0.106358,-0.057689,0.019742,-0.347093,0.000783,0.052545,-0.240726,-0.383168
ohfodlmg-1,-0.23356,-0.331405,-0.157554,-0.354135,-0.022658,-0.276489,0.137023,-0.024965,0.067085,0.041907,...,0.102543,0.171402,-0.114481,-0.064218,0.043426,-0.368745,-0.007424,0.008919,-0.26778,-0.36511
