In [125]:
import anndata
import numpy as np
import os
import pandas as pd
import scanpy as sc
import scipy.sparse as sp
import shutil
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests

import warnings
warnings.filterwarnings("ignore")
sc.settings.verbosity = 0

In [131]:
# Specify data, setting, and paths
settings = {"Xenium_5K_BC": {"coords": ["global_x", "global_y"], "figsize": (5, 8)},
            "Xenium_5K_OC": {"coords": ["global_y", "global_x"], "figsize": (5, 7)}}

data = "Xenium_5K_OC"
plot_figsize = settings[data]["figsize"]
plot_coords = settings[data]["coords"]

data_dir = f"../../data/{data}/"
utils_dir = "../../data/utils/"
output_dir = f"../../output/{data}/"

DE_genes_dir = output_dir + "DE_genes/"
if os.path.exists(DE_genes_dir):
    shutil.rmtree(DE_genes_dir)
    os.makedirs(DE_genes_dir)
else:
    os.makedirs(DE_genes_dir)

In [132]:
# Read data
genes = pd.read_csv(data_dir + "processed_data/genes.csv")
genes = list(genes.iloc[:, 0])

adata_tumor = sc.read_h5ad(data_dir + "processed_data/adata_tumor.h5ad")
granule_adata = sc.read_h5ad(output_dir + "granule_adata.h5ad")

In [133]:
# Nuclei and cytoplasm expression
X_nuclei = sp.load_npz(data_dir + "processed_data/nuclear_expression_matrix.npz")
adata_tumor_nuclei = anndata.AnnData(X = X_nuclei, obs = adata_tumor.obs.copy(), var = adata_tumor.var.copy())
sc.pp.normalize_total(adata_tumor_nuclei, target_sum = 1e4)
sc.pp.log1p(adata_tumor_nuclei)

X_cyto = sp.load_npz(data_dir + "processed_data/cytoplasmic_expression_matrix.npz")
adata_tumor_cyto = anndata.AnnData(X = X_cyto, obs = adata_tumor.obs.copy(), var = adata_tumor.var.copy())
adata_tumor_cyto_raw = adata_tumor_cyto.copy()
sc.pp.normalize_total(adata_tumor_cyto, target_sum = 1e4)
sc.pp.log1p(adata_tumor_cyto)

In [134]:
# Nuclei and cytoplasm marker genes
DE_settings = {"nuclei": {"data": adata_tumor_nuclei},
               "cyto": {"data": adata_tumor_cyto}}
keys = [i for i in adata_tumor.obs.columns if i.endswith("_subtype")]

for compartment in DE_settings.keys():
    
    adata_test = DE_settings[compartment]["data"]
    
    for key in keys:
        
        sc.tl.rank_genes_groups(adata_test, groupby = key, method = "wilcoxon")
        
        names = pd.DataFrame(adata_test.uns["rank_genes_groups"]["names"])
        logfc = pd.DataFrame(adata_test.uns["rank_genes_groups"]["logfoldchanges"])
        pvals = pd.DataFrame(adata_test.uns["rank_genes_groups"]["pvals"])
        pvals_adj = pd.DataFrame(adata_test.uns["rank_genes_groups"]["pvals_adj"])
        
        for i in adata_test.obs[key].cat.categories:
            df = {"names": names[i], "logfc": logfc[i], "pvals": pvals[i], "pvals_adj": pvals_adj[i]}
            df = pd.DataFrame(df)
            df = df[df["logfc"] >= 0]
            df = df[df["pvals"] <= 0.05]
            df = df.sort_values(by=["logfc"], ascending=False)
            df.to_csv(DE_genes_dir + f"{compartment}_{key}_{i}.csv", index=False)

        print(f"{compartment} {key} DE analysis done.")

nuclei hypoxia_subtype DE analysis done.
nuclei heat_shock_subtype DE analysis done.
nuclei immune_cell_proximity_subtype DE analysis done.
nuclei bcell_proximity_subtype DE analysis done.
nuclei tcell_proximity_subtype DE analysis done.
nuclei tcell_attack_subtype DE analysis done.
nuclei tcell_attack_weighted_subtype DE analysis done.
nuclei mechanical_subtype DE analysis done.
cyto hypoxia_subtype DE analysis done.
cyto heat_shock_subtype DE analysis done.
cyto immune_cell_proximity_subtype DE analysis done.
cyto bcell_proximity_subtype DE analysis done.
cyto tcell_proximity_subtype DE analysis done.
cyto tcell_attack_subtype DE analysis done.
cyto tcell_attack_weighted_subtype DE analysis done.
cyto mechanical_subtype DE analysis done.


In [135]:
# SG high count genes
sg_counts = np.array(granule_adata.X.sum(axis=0)).flatten()
gene_names = granule_adata.var_names

df_sg = pd.DataFrame({"gene": gene_names, "sg_counts": sg_counts}).sort_values("sg_counts", ascending=False)
df_sg.to_csv(DE_genes_dir + "SG_high_count_genes.csv", index=False)

In [76]:
# Concatenate cytoplasm and granule expression
adata_cyto_granule = adata_tumor_cyto_raw.concatenate(granule_adata, batch_key = "modality", batch_categories = ["cyto", "granule"])
sc.pp.normalize_total(adata_cyto_granule, target_sum = 1e4)
sc.pp.log1p(adata_cyto_granule)
adata_cyto_granule

AnnData object with n_obs × n_vars = 151942 × 5101
    obs: 'cell_id', 'global_x', 'global_y', 'transcript_counts', 'control_probe_counts', 'genomic_control_counts', 'control_codeword_counts', 'unassigned_codeword_counts', 'deprecated_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'nucleus_count', 'segmentation_method', 'cell_type', 'cell_type_merged', 'hypoxia', 'hypoxia_clipped', 'hypoxia_subtype', 'heat_shock', 'heat_shock_clipped', 'heat_shock_subtype', 'immune_cell_proximity_neighbor_counts', 'log_immune_cell_proximity_neighbor_counts', 'immune_cell_proximity_subtype', 'bcell_proximity_neighbor_counts', 'log_bcell_proximity_neighbor_counts', 'bcell_proximity_subtype', 'tcell_proximity_neighbor_counts', 'log_tcell_proximity_neighbor_counts', 'tcell_proximity_subtype', 'tcell_attack_neighbor_counts', 'log_tcell_attack_neighbor_counts', 'tcell_attack_weighted', 'tcell_attack_subtype', 'tcell_attack_weighted_subtype', 'mechanical', 'mechanical_clipped', 'mechanical_sub

In [88]:
# DE analysis for cytoplasm + granule
sc.tl.rank_genes_groups(adata_cyto_granule, groupby = "modality", method = "t-test")
        
names = pd.DataFrame(adata_cyto_granule.uns["rank_genes_groups"]["names"])
logfc = pd.DataFrame(adata_cyto_granule.uns["rank_genes_groups"]["logfoldchanges"])
pvals = pd.DataFrame(adata_cyto_granule.uns["rank_genes_groups"]["pvals"])
pvals_adj = pd.DataFrame(adata_cyto_granule.uns["rank_genes_groups"]["pvals_adj"])

for i in adata_cyto_granule.obs["modality"].cat.categories:
    df = {"names": names[i], "logfc": logfc[i], "pvals": pvals[i], "pvals_adj": pvals_adj[i]}
    df = pd.DataFrame(df)
    df = df[df["logfc"] >= 0]
    df = df[df["pvals"] <= 0.05]
    df = df.sort_values(by=["logfc"], ascending=False)
    df.to_csv(DE_genes_dir + f"{i}.csv", index=False)

print("cyto_granule modality DE analysis done.")

cyto_granule modality DE analysis done.


In [86]:
adata_cyto_granule.X.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 3.84293228, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [92]:
import numpy as np
import pandas as pd
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests


In [93]:
# Total SG counts per gene
sg_counts = np.array(granule_adata.X.sum(axis=0)).flatten()

# Total cytoplasm counts per gene
cyto_counts = np.array(adata_tumor_cyto_raw.X.sum(axis=0)).flatten()

gene_names = granule_adata.var_names


In [94]:
total_sg = sg_counts.sum()
total_cyto = cyto_counts.sum()

sg_frac = sg_counts / total_sg
cyto_frac = cyto_counts / total_cyto


In [95]:
# Avoid divide-by-zero
eps = 1e-9

log2_enrichment = np.log2((sg_frac + eps) / (cyto_frac + eps))


In [98]:
pvals = []

for sg, cy in zip(sg_counts, cyto_counts):
    table = np.array([
        [sg, cy],
        [total_sg - sg, total_cyto - cy]
    ])
    
    # Fisher test (alternative="greater" tests SG enrichment)
    _, p = fisher_exact(table, alternative="greater")
    pvals.append(p)

pvals = np.array(pvals)


In [99]:
_, pvals_adj, _, _ = multipletests(pvals, method='fdr_bh')


In [102]:
df = pd.DataFrame({
    "gene": gene_names,
    "sg_counts": sg_counts,
    "cyto_counts": cyto_counts,
    "sg_fraction": sg_frac,
    "cyto_fraction": cyto_frac,
    "log2_enrichment": log2_enrichment,
    "pval": pvals,
    "pval_adj": pvals_adj
})

# Filter to genes enriched in SGs (positive enrichment)
df_enriched = df[(df["log2_enrichment"] > 0) & (df["pval_adj"] <= 0.05)]

# Sort by strongest enrichment
df_enriched = df_enriched.sort_values("log2_enrichment", ascending=False)


In [103]:
df_enriched

Unnamed: 0,gene,sg_counts,cyto_counts,sg_fraction,cyto_fraction,log2_enrichment,pval,pval_adj
3411,PLCE1,14.0,51,0.000027,0.000003,3.211823,4.063478e-09,1.741832e-07
841,CHD5,21.0,77,0.000041,0.000004,3.202573,6.484270e-13,2.876197e-11
807,CENPF,1441.0,5655,0.002792,0.000325,3.104873,0.000000e+00,0.000000e+00
3769,RCOR1,635.0,2685,0.001230,0.000154,2.997232,1.739195e-311,2.274778e-309
802,CELSR2,4171.0,17640,0.008081,0.001012,2.996947,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...
4893,UTRN,35.0,298,0.000068,0.000017,1.987385,7.434807e-11,3.241449e-09
1533,FAT1,29.0,253,0.000056,0.000015,1.952247,4.863652e-09,2.067457e-07
1536,FAT4,11.0,115,0.000021,0.000007,1.691124,1.090848e-03,4.347201e-02
3085,NR3C2,11.0,116,0.000021,0.000007,1.678635,1.163646e-03,4.596724e-02


In [104]:
sg_markers_df = pd.read_excel(utils_dir + "SG_markers.xlsx")
sg_markers_df = sg_markers_df.sort_values(by = "Fraction of RNA molecules in SGs", ascending = False)

thr = 0.4
sg_marker_genes = sg_markers_df[sg_markers_df["Fraction of RNA molecules in SGs"] > thr]["gene"].to_list()
overlap_genes = [i for i in sg_marker_genes if i in genes]

In [106]:
len(overlap_genes)

130

In [109]:
len(list(set(df_enriched["gene"]).intersection(set(overlap_genes))))

129

In [110]:
sg_counts = np.array(granule_adata.X.sum(axis=0)).flatten()
gene_names = granule_adata.var_names

df_sg = pd.DataFrame({
    "gene": gene_names,
    "sg_counts": sg_counts
}).sort_values("sg_counts", ascending=False)

In [114]:
sg_counts = np.array(granule_adata.X.sum(axis=0)).flatten()
cyto_counts = np.array(adata_tumor_cyto_raw.X.sum(axis=0)).flatten()

sg_specificity = sg_counts / (cyto_counts + 1)

df_spec = pd.DataFrame({
    "gene": granule_adata.var_names,
    "sg_counts": sg_counts,
    "cyto_counts": cyto_counts,
    "sg_specificity": sg_specificity
}).sort_values("sg_specificity", ascending=False)


In [116]:
len(list(set(df_spec["gene"].head(130)).intersection(set(overlap_genes))))

128

In [112]:
"RAB11FIP1" in overlap_genes

True

In [117]:
granule_adata

AnnData object with n_obs × n_vars = 49762 × 5101
    obs: 'global_x', 'global_y', 'global_z', 'layer_z', 'sphere_r', 'size', 'comp', 'in_nucleus', 'gene', 'cell_id', 'nearest_cell_type', 'granule_id'
    var: 'genes'
    uns: 'nearest_cell_type_colors'

In [118]:
adata_tumor_cyto

AnnData object with n_obs × n_vars = 102180 × 5101
    obs: 'cell_id', 'global_x', 'global_y', 'transcript_counts', 'control_probe_counts', 'genomic_control_counts', 'control_codeword_counts', 'unassigned_codeword_counts', 'deprecated_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'nucleus_count', 'segmentation_method', 'cell_type', 'cell_type_merged', 'hypoxia', 'hypoxia_clipped', 'hypoxia_subtype', 'heat_shock', 'heat_shock_clipped', 'heat_shock_subtype', 'immune_cell_proximity_neighbor_counts', 'log_immune_cell_proximity_neighbor_counts', 'immune_cell_proximity_subtype', 'bcell_proximity_neighbor_counts', 'log_bcell_proximity_neighbor_counts', 'bcell_proximity_subtype', 'tcell_proximity_neighbor_counts', 'log_tcell_proximity_neighbor_counts', 'tcell_proximity_subtype', 'tcell_attack_neighbor_counts', 'log_tcell_attack_neighbor_counts', 'tcell_attack_weighted', 'tcell_attack_subtype', 'tcell_attack_weighted_subtype', 'mechanical', 'mechanical_clipped', 'mechanical_sub