In [None]:
import os
import re 
from pathlib import Path

import pandas as pd
import numpy as np
import scanpy as sc
import scanpy.external as sce
import anndata as ad
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
import matplotlib as mpl
from statannotations.Annotator import Annotator

# Work from the T-cell analysis directory so that the relative output paths used
# further down (e.g. "new_cell_types/...") resolve inside it.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this exists.
os.chdir('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/analysis/alla_Tcells')

# Font type 42 (TrueType) keeps text as editable text objects in exported PDFs.
mpl.rcParams['pdf.fonttype'] = 42 #make text editable in pdf
# 'none' writes SVG text as text elements instead of converting glyphs to paths.
mpl.rcParams['svg.fonttype'] = 'none'

In [None]:
# Load the merged AnnData object that the rest of the notebook subsets and plots.
merged = ad.read_h5ad('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/analysis/merged.h5ad')


In [None]:
# Display the per-cell metadata; the cells below read Sample, Collection, UPN and ct from it.
merged.obs

In [None]:
# Plot the proportion of each cell type per sample, grouped by collection timepoint.
order_timepoints = ["NBM", "NDMM", "PT"]
timepoint_palette = {"NBM": "forestgreen", "NDMM": "deeppink", "PT": "plum"}
out_pdf = "new_cell_types/cell_type_proportions/celltype_boxplots_allcells_Mann-Whitney.pdf"
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(out_pdf), exist_ok=True)

# Extract relevant columns
df = merged.obs[['Sample','Collection', 'UPN', 'ct']].copy()

# Count cells per Sample and cell type (only combinations that actually occur)
counts = df.groupby(['Sample','Collection', 'UPN', 'ct'], observed=True).size().reset_index(name='count')

# A sample with no cells of a given type must contribute 0 %, not be silently
# left out of that panel's boxplot and test. Build the full Sample x ct grid and
# fill the missing counts with 0 (same approach as the gated T-cell plot below).
sample_meta = df[['Sample', 'Collection', 'UPN']].drop_duplicates(subset=['Sample'])
all_combos = (
    pd.MultiIndex.from_product(
        [df['Sample'].dropna().unique(), df['ct'].dropna().unique()],
        names=['Sample', 'ct'],
    ).to_frame(index=False)
)
counts = (
    all_combos
    .merge(counts[['Sample', 'ct', 'count']], on=['Sample', 'ct'], how='left')
    .merge(sample_meta, on='Sample', how='left')
    .fillna({'count': 0})
)

# Total cells per Sample, broadcast back onto every row of that sample
totals = counts.groupby('Sample', observed=True)['count'].transform('sum')

# Percentage of the sample's cells that belong to each cell type
counts['proportion'] = counts['count'] * 100 / totals

sns.set(style="whitegrid", font_scale=1.2)
order_celltypes = sorted(counts["ct"].unique())  # or custom order

g = sns.catplot(
    data=counts,
    x="Collection",
    y="proportion",
    hue="Collection",
    col="ct",
    kind="box",
    order=order_timepoints,
    col_order=order_celltypes,
    palette=timepoint_palette,
    dodge=False,
    sharey=False,
    col_wrap=3,
    height=4,
    aspect=0.9,
    showfliers=False
)

for cell_type, ax in g.axes_dict.items():
    sub = counts[counts["ct"] == cell_type]

    # Individual samples on top of the boxplots
    sns.stripplot(
        data=sub,
        x="Collection",
        y="proportion",
        hue="Collection",
        order=order_timepoints,
        palette=timepoint_palette,
        dodge=False,
        size=5,
        ax=ax,
        linewidth=0.5,
        edgecolor="black",
        legend=False
    )
    ax.set_xlabel("")  # cleaner look
    ax.set_ylabel("% of all cells")
    ax.set_title(cell_type)
    ax.set_ylim(0, None)  # adjust y range

    # Connect the NDMM and PT samples of the same patient. A line is only drawn
    # when the patient has BOTH timepoints; two samples of one timepoint are not
    # a longitudinal pair and would otherwise produce a vertical line.
    for _, grp in sub.groupby("UPN", observed=True):
        paired = grp[grp["Collection"].isin(["NDMM", "PT"])]
        if not {"NDMM", "PT"} <= set(paired["Collection"]):
            continue
        xpos = [order_timepoints.index(tp) for tp in paired["Collection"]]
        paired = paired.assign(_xpos=xpos).sort_values("_xpos", kind="stable")
        ax.plot(
            paired["_xpos"],
            paired["proportion"],
            color="gray",
            alpha=0.5,
            linewidth=1
        )

    # --- Add significance tests ---
    pairs = [("NBM", "NDMM"), ("NDMM", "PT")]  # specify which groups to compare
    annotator = Annotator(ax, pairs, data=sub, x="Collection", y="proportion", order=order_timepoints)
    annotator.configure(
        test="Mann-Whitney",          # non-parametric, unpaired
        text_format="full",           # print the p-value on the bracket
        comparisons_correction='BH',  # Benjamini-Hochberg across the pairs of this panel
        verbose=False
    )
    # The return value (axes, annotations) is not needed; do not rebind the loop's `ax`.
    annotator.apply_and_annotate()

# Clean up legend and layout
g.set_titles("{col_name}")
g.figure.subplots_adjust(top=0.9)
g.figure.suptitle("Cell-type proportions by Timepoint", fontsize=16)
g.add_legend(title="Timepoint")
g.figure.savefig(out_pdf, dpi=300, bbox_inches="tight")

plt.show()



In [None]:
# Subset to cells annotated as 'CD8 T' or 'CD4 T' in `ct`; .copy() detaches the subset from `merged`.
Tmerged = merged[merged.obs['ct'].isin(['CD8 T', 'CD4 T'])].copy()

In [None]:
# Select 500 highly variable genes, computed per batch with `Panel` as the batch key.
sc.pp.highly_variable_genes(Tmerged, n_top_genes=500, batch_key="Panel")

In [None]:
# Un-flag genes that were highly variable in fewer than two batches, so only genes
# variable in at least two Panel batches stay marked.
Tmerged.var.loc[Tmerged.var['highly_variable_nbatches'] < 2, 'highly_variable'] = False

In [None]:
# Display the genes that remain flagged as highly variable.
Tmerged.var[Tmerged.var['highly_variable']]

In [None]:
# Run PCA (30 components) restricted to the genes flagged in var['highly_variable'].
sc.tl.pca(Tmerged, mask_var="highly_variable", n_comps=30)

In [None]:
# Plot the variance ratio of the 30 PCs on a log scale.
sc.pl.pca_variance_ratio(Tmerged, n_pcs=30, log=True)

In [None]:
# Scatter PC pairs (1-2, 3-4, 5-6), coloured by Sample in the first three panels
# and by Panel in the last three, laid out three panels per row.
sc.pl.pca(
    Tmerged,
    color=["Sample"] * 3 + ["Panel"] * 3,
    dimensions=[(0, 1), (2, 3), (4, 5)] * 2,
    ncols=3,
    size=2,
)

In [None]:
# kNN graph on the first 30 PCs with 20 neighbours per cell (required by UMAP),
# followed by the UMAP embedding itself.
sc.pp.neighbors(Tmerged, n_pcs=30, n_neighbors=20)
sc.tl.umap(Tmerged)

In [None]:
# UMAP coloured by Panel; a small point size keeps points from overlapping.
sc.pl.umap(Tmerged, color="Panel", size=2)


In [None]:
# UMAP coloured by four marker genes (CD8A, CD4, TRAC, CD3E), two panels per row.
sc.pl.umap(Tmerged, color=["CD8A", "CD4", "TRAC", "CD3E"], legend_loc="on data", frameon=False, ncols=2)

In [None]:
# Leiden clustering (igraph backend, 2 iterations) at resolution 0.18;
# the result is stored in Tmerged.obs['leiden'].
sc.tl.leiden(Tmerged, flavor="igraph", n_iterations=2, resolution=0.18)


In [None]:
# UMAP coloured by Leiden cluster.
sc.pl.umap(Tmerged, color=["leiden"])

In [None]:
# Display per-cell metadata, now including the `leiden` column.
Tmerged.obs

In [None]:
# Keep only cells from Leiden clusters '0' and '2'.
# NOTE(review): cluster ids come from the clustering run above; re-check them if its
# parameters or inputs change — the reason for choosing 0 and 2 is not recorded here.
Tmerged_clean = Tmerged[Tmerged.obs['leiden'].isin(['0','2'])].copy()

In [None]:
# Repeat the embedding pipeline on the cleaned subset.
# 1) 500 highly variable genes per Panel batch
sc.pp.highly_variable_genes(Tmerged_clean, batch_key="Panel", n_top_genes=500)
# 2) drop the flag for genes that are variable in fewer than two batches
Tmerged_clean.var.loc[
    Tmerged_clean.var['highly_variable_nbatches'] < 2,
    'highly_variable',
] = False
# 3) PCA (30 components) restricted to the flagged genes
sc.tl.pca(Tmerged_clean, n_comps=30, mask_var="highly_variable")
# 4) kNN graph (20 neighbours, 30 PCs), which UMAP requires
sc.pp.neighbors(Tmerged_clean, n_pcs=30, n_neighbors=20)
# 5) UMAP embedding
sc.tl.umap(Tmerged_clean)



In [None]:
# UMAP of the cleaned subset coloured by Panel; a small point size keeps points from overlapping.
sc.pl.umap(Tmerged_clean, color="Panel", size=2)

# Same embedding coloured by four marker genes, two panels per row.
sc.pl.umap(Tmerged_clean, color=["CD8A", "CD4", "TRAC", "CD3E"], legend_loc="on data", frameon=False, ncols=2)

In [None]:
# Re-cluster the cleaned subset at resolution 0.6 (overwrites obs['leiden'])
# and show the clusters labelled directly on the UMAP.
sc.tl.leiden(Tmerged_clean, flavor="igraph", n_iterations=2, resolution=0.6)
sc.pl.umap(Tmerged_clean, color=["leiden"], legend_loc='on data' )

In [None]:
# Number of cells per Leiden cluster.
Tmerged_clean.obs.groupby("leiden").size().reset_index(name='count')

In [None]:
# Rank marker genes for each Leiden cluster with the Wilcoxon method.
sc.tl.rank_genes_groups(Tmerged_clean, groupby="leiden", method="wilcoxon")

In [None]:
# Compute the cluster dendrogram, then show the top 5 ranked genes per cluster as a
# dot plot with each gene scaled across clusters (standard_scale="var").
sc.tl.dendrogram(Tmerged_clean,groupby="leiden")
sc.pl.rank_genes_groups_dotplot(Tmerged_clean, groupby="leiden", standard_scale="var", n_genes=5)


In [None]:
# Checkpoint the cleaned, re-clustered T-cell object (path is relative to the working directory).
Tmerged_clean.write('Tcell_merged_clean_new_celltypes.h5ad')

In [None]:
# Violin plots of CD8A and CD4 expression per Leiden cluster.
sc.pl.violin(Tmerged_clean, 'CD8A', groupby='leiden', multi_panel=True)
sc.pl.violin(Tmerged_clean, 'CD4', groupby='leiden', multi_panel=True)

In [None]:
import numpy as np
import pandas as pd
import anndata as ad

def gated_Tcell_types_plus(
    adata,
    thr=0.3,
    k=2,
    k_exh=None,
    k_treg=None,
    k_tcm=None,
    k_gzmk=None,
    k_nk_core=None,
    k_cyto=3,
    reduce_k_if_missing=True,
    layer=None,  # specify layer name if data not in adata.X
):
    """Assign gated T/NK subtype labels to ``adata.obs['cell_type_gated']``.

    A gene counts as expressed in a cell when its value is strictly greater
    than ``thr``. A marker set is "on" when at least ``k`` of its genes are
    expressed. Only the marker columns are pulled out of the expression
    matrix, so a sparse matrix is never densified as a whole.

    Labels are written in the order below; a later label overwrites an
    earlier one for the same cell:

    ``other_T`` (default) < ``CD4T`` < ``CD8T`` < ``CD4Tcm`` < ``CD8Teff`` <
    ``CD8Tcm`` < ``CD8Texh`` < ``CD4Treg`` < ``NK``

    Every T-cell label requires at least one TCR/CD3 gene to be expressed;
    ``NK`` requires the NK core set to be on and no TCR/CD3 gene expressed.

    Parameters
    ----------
    adata
        AnnData-like object. Read: ``X`` (or ``layers[layer]``) and
        ``var_names``. Written: ``obs['cell_type_gated']``.
    thr
        Expression cutoff (strict ``>``).
    k
        Default number of expressed markers required per marker set.
    k_exh, k_treg, k_tcm, k_gzmk, k_nk_core
        Per-set overrides of ``k`` for the exhaustion, Treg, central-memory,
        effector and NK-core sets. ``None`` falls back to ``k``. Values are
        cast to ``int`` and clamped to a minimum of 1.
    k_cyto
        Accepted for backward compatibility. It fed a CD56bright/CD56dim NK
        split whose labels were never assigned, so it has no effect.
    reduce_k_if_missing
        When True, the required count for a set is capped at the number of
        its genes that exist in ``var_names``.
    layer
        Name of the layer to read instead of ``adata.X`` (any falsy value
        means ``adata.X``).

    Returns
    -------
    The same ``adata`` object, modified in place.
    """
    # ---- marker sets ----
    tcr_set = ['CD247', 'TRAC', 'CD3E', 'CD3D', 'CD3G']
    cd8_line_set = ['CD8A', 'CD8B']
    cd4_gate_set = ['CD4']  # CD4 lineage is gated on CD4 alone

    exh_set = ['PDCD1', 'LAG3', 'TIGIT', 'CTLA4', 'HAVCR2', 'TOX', 'CXCL13']
    treg_set = ['FOXP3', 'IL2RA', 'CTLA4', 'IKZF2', 'TIGIT', 'CCR8', 'TNFRSF18']
    tcm_set = ['CCR7', 'SELL', 'S1PR1']
    eff_set = ['GZMK', 'GZMA', 'GZMB', 'KLRD1', 'NKG7', 'FGFBP2', 'CCL5']
    nk_core_set = ['KLRD1', 'PRF1', 'GZMB', 'FCGR3A', 'GNLY', 'NCAM1']

    used_sets = (
        tcr_set, cd8_line_set, cd4_gate_set,
        exh_set, treg_set, tcm_set, eff_set, nk_core_set,
    )

    # ---- expression matrix, restricted to marker columns ----
    source = adata.layers[layer] if layer else adata.X
    n_cells = source.shape[0]

    # Column of the first occurrence of every gene name (one pass, O(1) lookups after).
    first_col = {}
    for pos, name in enumerate(adata.var_names):
        first_col.setdefault(name, pos)

    # Markers that exist in this panel, each given a slot in the small dense block.
    slot = {}
    for gene_list in used_sets:
        for gene in gene_list:
            if gene in first_col and gene not in slot:
                slot[gene] = len(slot)

    if slot:
        block = source[:, [first_col[gene] for gene in slot]]
        if not isinstance(block, np.ndarray):
            block = block.toarray()  # sparse -> dense, marker columns only
        expressed = np.asarray(block) > thr
    else:
        expressed = np.zeros((n_cells, 0), dtype=bool)

    # ---- helpers ----
    def getk(override, fallback):
        """Resolve a per-set k: override if given, else fallback; int, at least 1."""
        chosen = fallback if override is None else override
        chosen = int(chosen) if chosen is not None else 1
        return max(chosen, 1)

    def k_of_n(gene_list, k_use):
        """True for cells expressing at least k_use of the panel genes in gene_list."""
        cols = [slot[gene] for gene in gene_list if gene in slot]
        if not cols:
            return np.zeros(n_cells, dtype=bool)
        if reduce_k_if_missing:
            k_use = min(k_use, len(cols))
        return expressed[:, cols].sum(axis=1) >= k_use

    def any_expr(gene_list):
        """True for cells expressing at least one gene of gene_list."""
        return k_of_n(gene_list, 1)

    # ---- resolve k ----
    K_EXH = getk(k_exh, k)
    K_TREG = getk(k_treg, k)
    K_TCM = getk(k_tcm, k)
    K_GZMK = getk(k_gzmk, k)
    K_NKCORE = getk(k_nk_core, k)

    # ---- base lineages ----
    t_backbone = any_expr(tcr_set)
    cd8_line = any_expr(cd8_line_set)
    cd4_line = any_expr(cd4_gate_set)

    # ---- T-cell gates ----
    cd8T = t_backbone & cd8_line
    cd4T = t_backbone & cd4_line
    cd8exh = cd8T & k_of_n(exh_set, K_EXH)
    cd4treg = cd4T & k_of_n(treg_set, K_TREG)
    cd4Tcm = cd4T & k_of_n(tcm_set, K_TCM) & ~cd4treg
    cd8Teff = cd8T & k_of_n(eff_set, K_GZMK) & ~cd8exh
    cd8Tcm = cd8T & k_of_n(tcm_set, K_TCM) & ~cd8exh

    # ---- NK lineage: NK core signature without any TCR/CD3 expression ----
    nk_line = k_of_n(nk_core_set, K_NKCORE) & ~t_backbone

    # ---- label assignment (later entries win) ----
    labels = np.full(n_cells, 'other_T', dtype=object)
    for mask, label in (
        (cd4T, 'CD4T'),
        (cd8T, 'CD8T'),
        (cd4Tcm, 'CD4Tcm'),
        (cd8Teff, 'CD8Teff'),
        (cd8Tcm, 'CD8Tcm'),
        (cd8exh, 'CD8Texh'),
        (cd4treg, 'CD4Treg'),
        (nk_line, 'NK'),
    ):
        labels[mask] = label

    adata.obs['cell_type_gated'] = labels
    return adata

In [None]:
# Gate every cell with the default thresholds; the function writes
# obs['cell_type_gated'] and returns the same object.
Tmerged_clean = gated_Tcell_types_plus(adata = Tmerged_clean)

# Number of cells per gated label.
print(Tmerged_clean.obs['cell_type_gated'].value_counts())

In [None]:
# NOTE(review): `gate_order` is defined but not used anywhere in this cell, and it lists
# "CD4Teff", a label the gating function does not assign — confirm whether it was
# meant to order the groups in the plots below.
gate_order = ["CD8Texh", "CD4Treg","CD4Teff", "CD4Tcm", "CD8Teff", "CD8Tcm", "CD8T", "CD4T", 'NK', 'other_T']
# Marker genes grouped by the population they indicate; the groups become labelled
# brackets over the dot plot columns. CTLA4 and KLRD1 appear in two groups each.
celltyping_markers = {
    "Tcell": ['CD247','TRAC', 'CD3E','CD3D'],
    "CD8":['CD8A'],
    "CD4": ['CD4', 'CCR7', 'SELL'],
    "Treg": ['FOXP3','IL2RA','CTLA4', 'TNFRSF9'],
    "CTL": ['PRF1','GZMA', 'GZMB','GZMK', 'KLRD1'], 
    "Texh":['PDCD1','LAG3','TIGIT','CTLA4','HAVCR2'],
    "NK": ['KLRD1','FCGR3A', 'GNLY', 'NCAM1']
}
# Dot plot of the markers per gated label, each gene scaled across groups.
# show=False keeps the figure open so it can be saved with plt.savefig below.
sc.pl.dotplot(
    Tmerged_clean,
    var_names=celltyping_markers,
    groupby='cell_type_gated', 
    #layer='raw_intensity',
    standard_scale='var',  
    show=False
)
plt.savefig("new_cell_types/Tcell_type_gated_dotplot2.pdf", bbox_inches="tight")

In [None]:
# Marker genes grouped by the population they indicate.
# NOTE(review): this dict repeats the one defined in the dot plot cell above verbatim.
celltyping_markers = {
    "Tcell": ['CD247','TRAC', 'CD3E','CD3D'],
    "CD8":['CD8A'],
    "CD4": ['CD4', 'CCR7', 'SELL'],
    "Treg": ['FOXP3','IL2RA','CTLA4', 'TNFRSF9'],
    "CTL": ['PRF1','GZMA', 'GZMB','GZMK', 'KLRD1'], 
    "Texh":['PDCD1','LAG3','TIGIT','CTLA4','HAVCR2'],
    "NK": ['KLRD1','FCGR3A', 'GNLY', 'NCAM1']
}


# Matrix plot (heatmap of per-group values) of the same markers per gated label,
# each gene scaled across groups, without a dendrogram.
# show=False keeps the figure open so it can be saved with plt.savefig below.
sc.pl.matrixplot(
    Tmerged_clean, 
    var_names=celltyping_markers, 
    groupby="cell_type_gated",
    dendrogram=False,              
    standard_scale='var',           
    figsize=(10, 3),
    show=False
)

plt.savefig("new_cell_types/Tcell_type_gated_heatmap2.pdf", bbox_inches="tight")

In [None]:
# UMAP coloured by gated label, with labels drawn on the embedding.
sc.pl.umap(Tmerged_clean, color=["cell_type_gated"], legend_loc='on data')

In [None]:
# Checkpoint the gated object; the radial-neighbourhood section below reloads this file.
Tmerged_clean.write('Tcell_merged_clean_new_celltypes_gated.h5ad')

In [None]:
# Plot the proportion of each gated T-cell type per sample, grouped by collection timepoint.
order_timepoints = ["NBM", "NDMM", "PT"]
timepoint_palette = {"NBM": "forestgreen", "NDMM": "deeppink", "PT": "plum"}
out_pdf = "new_cell_types/cell_type_proportions/Tcell_celltype2_boxplots_Mann-Whitney.pdf"
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(out_pdf), exist_ok=True)

# Extract relevant columns; NK-gated cells are excluded from the T-cell denominators
df = Tmerged_clean.obs[['Sample','Collection', 'UPN', 'cell_type_gated']].copy()
df = df[(df["cell_type_gated"] != "NK")]

# Count cells per Sample and gated type (only combinations that actually occur)
counts = df.groupby(['Sample','Collection', 'UPN', 'cell_type_gated'], observed=True).size().reset_index(name='count')
sample_meta = (
    df[['Sample', 'Collection', 'UPN']].drop_duplicates(subset=['Sample'])
)

# Every Sample x gated-type combination, so that absent types count as 0 %
all_combos = (
    pd.MultiIndex.from_product([df['Sample'].unique(),df['cell_type_gated'].unique()],
        names=['Sample', 'cell_type_gated']).to_frame(index=False)
)

# merge and fill missing counts with 0
full_counts = (
    all_combos
    .merge(counts[['Sample','cell_type_gated', 'count']], on=['Sample','cell_type_gated'], how='left')
    .merge(sample_meta, on='Sample', how='left')
    .fillna({'count': 0})
)

# Total (non-NK) cells per Sample, broadcast back onto every row of that sample
totals = full_counts.groupby('Sample', observed=True)['count'].transform('sum')

# Percentage of the sample's cells that carry each gated label
full_counts['proportion'] = full_counts['count'] * 100 / totals

sns.set(style="whitegrid", font_scale=1.2)
order_celltypes = sorted(full_counts["cell_type_gated"].unique())  # or custom order

g = sns.catplot(
    data=full_counts,
    x="Collection",
    y="proportion",
    hue="Collection",
    col="cell_type_gated",
    kind="box",
    order=order_timepoints,
    col_order=order_celltypes,
    palette=timepoint_palette,
    dodge=False,
    sharey=False,
    col_wrap=3,
    height=4,
    aspect=0.9,
    showfliers=False
)

for cell_type, ax in g.axes_dict.items():
    sub = full_counts[full_counts["cell_type_gated"] == cell_type]

    # Individual samples on top of the boxplots (all drawn in black)
    sns.stripplot(
        data=sub,
        x="Collection",
        y="proportion",
        hue="Collection",
        order=order_timepoints,
        palette="dark:black",
        dodge=False,
        size=2,
        ax=ax,
        linewidth=0.5,
        edgecolor="black",
        legend=False
    )
    ax.set_xlabel("")  # cleaner look
    ax.set_ylabel("% of T cells")
    ax.set_title(cell_type)
    ax.set_ylim(-1, None)  # start slightly below 0 so points at 0 % stay visible

    # Connect the NDMM and PT samples of the same patient. A line is only drawn
    # when the patient has BOTH timepoints; two samples of one timepoint are not
    # a longitudinal pair and would otherwise produce a vertical line.
    for _, grp in sub.groupby("UPN", observed=True):
        paired = grp[grp["Collection"].isin(["NDMM", "PT"])]
        if not {"NDMM", "PT"} <= set(paired["Collection"]):
            continue
        xpos = [order_timepoints.index(tp) for tp in paired["Collection"]]
        paired = paired.assign(_xpos=xpos).sort_values("_xpos", kind="stable")
        ax.plot(
            paired["_xpos"],
            paired["proportion"],
            color="gray",
            alpha=0.5,
            linewidth=1
        )

    # --- Add significance tests ---
    pairs = [("NBM", "NDMM"), ("NDMM", "PT")]  # specify which groups to compare
    annotator = Annotator(ax, pairs, data=sub, x="Collection", y="proportion",
                          order=order_timepoints)
    annotator.configure(
        test="Mann-Whitney",              # non-parametric, unpaired
        text_format="simple",             # compact p-value text on the bracket
        comparisons_correction='fdr_bh',  # Benjamini-Hochberg across the pairs of this panel
        verbose=False
    )
    annotator.apply_and_annotate()

# Clean up legend and layout
g.set_titles("{col_name}")
g.figure.subplots_adjust(top=0.9)
g.figure.suptitle("Cell-type proportions by Timepoint", fontsize=16)
g.add_legend(title="Timepoint")
g.figure.savefig(out_pdf, dpi=300, bbox_inches="tight")

plt.show()



In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import fdrcorrection

# Mann-Whitney U tests on the per-sample proportions in `full_counts`, one test per
# gated cell type and timepoint pair, followed by Benjamini-Hochberg FDR correction
# across ALL tests in the table (not per cell type).
pairs = [("NBM", "NDMM"), ("NDMM", "PT")]

results = []

# observed=True: skip category levels that have no rows when the key is categorical
for cell_type, sub in full_counts.groupby("cell_type_gated", observed=True):

    for cond1, cond2 in pairs:

        x = sub.loc[sub["Collection"] == cond1, "proportion"]
        y = sub.loc[sub["Collection"] == cond2, "proportion"]

        # If either group is empty, skip
        if len(x) == 0 or len(y) == 0:
            continue

        # Mann–Whitney U test (unpaired Wilcoxon)
        stat, pval = mannwhitneyu(x, y, alternative="two-sided")

        results.append({
            "cell_type_gated": cell_type,
            "comparison": f"{cond2} vs {cond1}",
            "cond1": cond1,
            "cond2": cond2,
            "n_cond1": len(x),
            "n_cond2": len(y),
            "stat": stat,
            "p_raw": pval,
        })

# Convert to DataFrame; naming the columns keeps the schema intact even with no rows
res_df = pd.DataFrame(
    results,
    columns=["cell_type_gated", "comparison", "cond1", "cond2",
             "n_cond1", "n_cond2", "stat", "p_raw"],
)

# Apply FDR correction across all tests (skipped when nothing could be tested)
if res_df.empty:
    res_df["p_fdr"] = pd.Series(dtype=float)
    res_df["significant_fdr_0.05"] = pd.Series(dtype=bool)
else:
    reject, p_fdr = fdrcorrection(res_df["p_raw"], alpha=0.05, method="indep")
    res_df["p_fdr"] = p_fdr
    res_df["significant_fdr_0.05"] = reject

# Save to CSV
out_path = "new_cell_types/cell_type_proportions/Tcell_celltype2_boxplots_Mann-Whitney_FDR_results.csv"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
res_df.to_csv(out_path, index=False)

print("Saved results to:", out_path)
print(res_df)

# Adding radial neighborhood information

In [None]:
# Reload the gated T-cell object from the checkpoint written earlier, so this section
# can be run without repeating the clustering and gating cells.
Tmerged_clean = ad.read_h5ad('Tcell_merged_clean_new_celltypes_gated.h5ad')

In [None]:
# Load the object that holds the radial-neighbourhood assignment; only obs['rn'] is used below.
radial_neighborhoods = ad.read_h5ad('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/analysis/radial_neighborhoods/Output/merged_RN.h5ad')




In [None]:
# Keep just the per-cell `rn` column (a Series indexed like radial_neighborhoods.obs).
radial_neighborhoods_tb = radial_neighborhoods.obs["rn"]

In [None]:
# IPython magic: delete the variable and IPython's own references to it,
# now that the `rn` column has been extracted.
%xdel radial_neighborhoods

In [None]:
# Attach each cell's radial-neighbourhood (RN) label to the T-cell object.
# Index-aligned assignment replaces the reset_index/merge/set_index round trip:
#   * it is idempotent — re-running the cell overwrites `rn` instead of producing
#     rn_x / rn_y columns and a KeyError on the next line;
#   * it does not depend on the obs index being unnamed, and leaves its name untouched;
#   * cells without an RN entry get NaN, as with the former left merge.
Tmerged_clean.obs["rn"] = radial_neighborhoods_tb.reindex(Tmerged_clean.obs_names)

# NOTE: `merged.obs` is not annotated here. Code that reads merged.obs['rn'] needs the
# same assignment applied to `merged`, unless merged.h5ad already stores that column.

# RN labels present on the T cells; NaN is dropped first because sorting NaN
# together with strings raises a TypeError.
unique_values = np.sort(Tmerged_clean.obs["rn"].dropna().unique())
unique_values

In [None]:
# Display per-cell metadata, now including the `rn` column.
Tmerged_clean.obs

In [None]:
# Plot, per radial neighbourhood (RN), the share of each gated T-cell type among a
# sample's T cells, split by collection timepoint. Only Sample/RN/type combinations
# that actually occur are plotted (no zero filling in this figure).
timepoint_palette = {"NBM": "forestgreen", "NDMM": "deeppink", "PT": "plum"}
out_pdf = "new_cell_types/cell_type_proportions_rad_neighborhoods/Tcell_celltype_boxplots_by_rad_neigh_Mann-Whitney.pdf"
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(out_pdf), exist_ok=True)

# Extract relevant columns; drop NK-gated cells and cells without an assigned RN
df = Tmerged_clean.obs[['Sample','rn','Collection', 'UPN', 'cell_type_gated']].copy()
df = df[(df["cell_type_gated"] != "NK")]
df = df[(df["rn"] != "Unassigned")]

# Count cells per Sample, RN and gated type
full_counts = df.groupby(['Sample','rn','Collection', 'UPN', 'cell_type_gated'], observed=True).size().reset_index(name='count')

# Total counted cells per Sample (across all RNs and types), broadcast onto every row
totals = full_counts.groupby('Sample', observed=True)['count'].transform('sum')

# Percentage of the sample's counted cells in each RN x type combination
full_counts['proportion'] = full_counts['count'] * 100 / totals

sns.set(style="whitegrid", font_scale=1.2)

# Fixed x-axis order RN1..RN13 (lexical sorting would put RN10 before RN2)
unique_values = [f"RN{i}" for i in range(1, 14)]

order_celltypes = sorted(full_counts["cell_type_gated"].unique())  # or custom order

g = sns.catplot(
    data=full_counts,
    x="rn",
    y="proportion",
    hue="Collection",
    col="cell_type_gated",
    kind="box",
    order=unique_values,
    col_order=order_celltypes,
    palette=timepoint_palette,
    dodge=True,
    sharey=False,
    col_wrap=3,
    height=4,
    aspect=0.9,
    showfliers=False
)

# Add individual data points on top of boxplots
for cell_type, ax in g.axes_dict.items():
    sub = full_counts[full_counts["cell_type_gated"] == cell_type]
    sns.stripplot(
        data=sub,
        x="rn",
        y="proportion",
        hue="Collection",
        order=unique_values,
        palette=timepoint_palette,
        dodge=True,
        size=2,
        ax=ax,
        linewidth=0.5,
        edgecolor="black",
        legend=False
    )
    ax.set_xlabel("")  # cleaner look
    ax.set_ylabel("% of T cells")
    ax.set_title(cell_type)
    ax.set_ylim(-0.03, None)  # adjust y range

# Clean up legend and layout
g.set_titles("{col_name}")
g.figure.subplots_adjust(top=0.9)
g.figure.suptitle("Cell-type proportions by radial neighborhoods", fontsize=16)
g.figure.legend(title="Timepoint", loc="upper right", bbox_to_anchor=(1.05, 1), frameon=False)
plt.draw()
for ax in g.axes.flatten():
    ax.tick_params(axis="x", labelrotation=90)
g.figure.savefig(out_pdf, dpi=300, bbox_inches="tight")

plt.show()



In [None]:
# One figure per gated T-cell type: proportion per radial neighbourhood (RN), split by
# collection timepoint, with Sample x RN x type combinations that never occur counted as 0.
timepoint_palette = {"NBM": "forestgreen", "NDMM": "deeppink", "PT": "plum"}
out_dir = "new_cell_types/cell_type_proportions_rad_neighborhoods"
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(out_dir, exist_ok=True)

# Extract relevant columns; drop NK-gated cells and cells without an assigned RN
df = Tmerged_clean.obs[['Sample','rn','Collection', 'UPN', 'cell_type_gated']].copy()
df = df[(df["cell_type_gated"] != "NK")]
df = df[(df["rn"] != "Unassigned")]

# Count cells per Sample, RN and gated type (only combinations that actually occur)
counts = df.groupby(['Sample','rn','Collection', 'UPN', 'cell_type_gated'], observed=True).size().reset_index(name='count')
sample_meta = (
    df[['Sample','Collection', 'UPN']].drop_duplicates(subset=['Sample'])
)

# Full Sample x RN x type grid
all_combos = (
    pd.MultiIndex.from_product([df['Sample'].unique(),df['rn'].unique(), df['cell_type_gated'].unique()],
        names=['Sample','rn', 'cell_type_gated']).to_frame(index=False)
)

# merge and fill missing counts with 0
full_counts = (
    all_combos
    .merge(counts[['Sample','rn', 'cell_type_gated', 'count']], on=['Sample','rn','cell_type_gated'], how='left')
    .merge(sample_meta, on='Sample', how='left')
    .fillna({'count': 0})
)

# Total counted cells per Sample (across all RNs and types), broadcast onto every row
totals = full_counts.groupby('Sample', observed=True)['count'].transform('sum')

# Percentage of the sample's counted cells in each RN x type combination
full_counts['proportion'] = full_counts['count'] * 100 / totals

sns.set(style="whitegrid", font_scale=1.2)

rn_order_global = [f"RN{i}" for i in range(1, 14)]
order_celltypes = sorted(full_counts["cell_type_gated"].unique())  # or custom order
order_collections = ["NBM", "NDMM", "PT"]
base_pairs = [("NBM", "NDMM"), ("NDMM", "PT")]

for cell_type in order_celltypes:
    sub = full_counts[full_counts["cell_type_gated"] == cell_type].copy()

    # Keep only RN values that exist for this cell type, in the desired order
    rn_present = set(sub["rn"].dropna().unique())
    rn_order = [rn for rn in rn_order_global if rn in rn_present]
    if not rn_order:
        print(f"Skipping {cell_type}: no RN values found")
        continue

    fig, ax = plt.subplots(figsize=(8, 5))

    # Boxplot
    sns.boxplot(
        data=sub,
        x="rn",
        y="proportion",
        hue="Collection",
        order=rn_order,
        hue_order=order_collections,
        palette=timepoint_palette,
        showfliers=False,
        ax=ax,
    )

    # Stripplot on top (all points black)
    sns.stripplot(
        data=sub,
        x="rn",
        y="proportion",
        hue="Collection",
        order=rn_order,
        hue_order=order_collections,
        dodge=True,
        palette={"NBM": "black", "NDMM": "black", "PT": "black"},
        size=2,
        linewidth=0,
        edgecolor="black",
        alpha=1,
        ax=ax,
        legend=False,  # avoid duplicate legend
    )

    ax.set_xlabel("")
    ax.set_ylabel("% of T cells")
    ax.set_title(cell_type)

    # Within each RN compare NBM vs NDMM and NDMM vs PT, but only where both
    # timepoints have data. Pair format: ((rn, group1), (rn, group2)).
    pairs = []
    for rn in rn_order:
        collections_here = set(sub.loc[sub["rn"] == rn, "Collection"])
        for c1, c2 in base_pairs:
            if c1 in collections_here and c2 in collections_here:
                pairs.append(((rn, c1), (rn, c2)))

    if pairs:
        annotator = Annotator(
            ax,
            pairs=pairs,
            data=sub,
            x="rn",
            y="proportion",
            hue="Collection",
            order=rn_order,
            hue_order=order_collections,
        )
        # No multiple-testing correction is configured here: the brackets show
        # uncorrected p-values, and non-significant ones are hidden.
        annotator.configure(
            test="Mann-Whitney",
            text_format="simple",
            loc="inside",
            hide_non_significant=True,
            show_test_name=False,
            verbose=False,
        )
        annotator.apply_and_annotate()

    # legend outside, limited to the first three entries
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(
        handles[:3],
        labels[:3],
        title="Collection",
        loc="upper right",
        bbox_to_anchor=(1.25, 1),
        frameon=False,
    )

    # Clip the y-axis at the 99.9th percentile so extreme points do not flatten the boxes
    upper_bound = sub["proportion"].quantile(0.999)
    ax.set_ylim(-0.03, upper_bound)

    fig.tight_layout()

    # Save
    safe_name = cell_type.replace(" ", "_").replace("/", "_")
    outfile = f"new_cell_types/cell_type_proportions_rad_neighborhoods/Tcell_{safe_name}_celltype_boxplots_by_rad_neigh_with0_counts_Mann-Whitney.pdf"
    ax.tick_params(axis="x", labelrotation=90)
    fig.savefig(outfile, dpi=300, bbox_inches="tight")
    plt.close(fig)  # free the figure; many are created in this loop
    print(f"Saved {outfile}")
    


In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import fdrcorrection

# Mann-Whitney U tests on the per-sample proportions in `full_counts`, one test per
# (gated cell type, RN) and timepoint pair, followed by Benjamini-Hochberg FDR
# correction across ALL tests in the table.
pairs = [("NBM", "NDMM"), ("NDMM", "PT")]

results = []

# observed=True: skip category combinations that have no rows when keys are categorical
for (cell_type, rn), sub in full_counts.groupby(["cell_type_gated", "rn"], observed=True):

    for cond1, cond2 in pairs:

        x = sub.loc[sub["Collection"] == cond1, "proportion"]
        y = sub.loc[sub["Collection"] == cond2, "proportion"]

        # If either group is empty, skip
        if len(x) == 0 or len(y) == 0:
            continue

        # Mann–Whitney U test (unpaired Wilcoxon)
        stat, pval = mannwhitneyu(x, y, alternative="two-sided")

        results.append({
            "cell_type_gated": cell_type,
            "rn": rn,
            "comparison": f"{cond2} vs {cond1}",
            "cond1": cond1,
            "cond2": cond2,
            "n_cond1": len(x),
            "n_cond2": len(y),
            "stat": stat,
            "p_raw": pval,
        })

# Convert to DataFrame; naming the columns keeps the schema intact even with no rows
res_df = pd.DataFrame(
    results,
    columns=["cell_type_gated", "rn", "comparison", "cond1", "cond2",
             "n_cond1", "n_cond2", "stat", "p_raw"],
)

# Apply FDR correction across all tests (skipped when nothing could be tested)
if res_df.empty:
    res_df["p_fdr"] = pd.Series(dtype=float)
    res_df["significant_fdr_0.05"] = pd.Series(dtype=bool)
else:
    reject, p_fdr = fdrcorrection(res_df["p_raw"], alpha=0.05, method="indep")
    res_df["p_fdr"] = p_fdr
    res_df["significant_fdr_0.05"] = reject

# Save to CSV
out_path = "new_cell_types/cell_type_proportions_rad_neighborhoods/Tcell_celltype_boxplots_by_rad_neigh_with0_counts_Mann-Whitney_FDR_results.csv"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
res_df.to_csv(out_path, index=False)

print("Saved results to:", out_path)
print(res_df)

In [None]:
# Plot, per radial neighbourhood (RN), the share of each cell type among all cells of a
# sample, split by collection timepoint.
timepoint_palette = {"NBM": "forestgreen", "NDMM": "deeppink", "PT": "plum"}
out_pdf = "new_cell_types/cell_type_proportions_rad_neighborhoods/All_cell_celltype_boxplots_by_rad_neigh_by_timepoint_Mann-Whitney.pdf"
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(out_pdf), exist_ok=True)

# `merged.obs` only has an `rn` column if it was stored in merged.h5ad or added in an
# earlier session. Attach it here when missing so the cell works on a fresh kernel.
# Assignment is index-aligned; cells without an RN entry get NaN.
if "rn" not in merged.obs.columns:
    merged.obs["rn"] = radial_neighborhoods_tb.reindex(merged.obs_names)

# Extract relevant columns; drop cells without an assigned RN
df = merged.obs[['Sample','rn','Collection', 'UPN', 'ct']].copy()
df = df[(df["rn"] != "Unassigned")]

# Count cells per Sample, RN and cell type
full_counts = df.groupby(['Sample','rn','Collection', 'UPN', 'ct'], observed=True).size().reset_index(name='count')

# Total counted cells per Sample (across all RNs and types), broadcast onto every row
totals = full_counts.groupby('Sample', observed=True)['count'].transform('sum')

# Percentage of the sample's counted cells in each RN x type combination
full_counts['proportion'] = full_counts['count'] * 100 / totals

sns.set(style="whitegrid", font_scale=1.2)

# Fixed x-axis order RN1..RN13 (lexical sorting would put RN10 before RN2)
unique_values = [f"RN{i}" for i in range(1, 14)]

order_celltypes = sorted(full_counts["ct"].unique())  # or custom order

g = sns.catplot(
    data=full_counts,
    x="rn",
    y="proportion",
    hue="Collection",
    col="ct",
    kind="box",
    order=unique_values,
    col_order=order_celltypes,
    palette=timepoint_palette,
    dodge=True,
    sharey=False,
    col_wrap=3,
    height=4,
    aspect=0.9,
    showfliers=False
)

# Add individual data points on top of boxplots
for cell_type, ax in g.axes_dict.items():
    sub = full_counts[full_counts["ct"] == cell_type]
    sns.stripplot(
        data=sub,
        x="rn",
        y="proportion",
        hue="Collection",
        order=unique_values,
        palette=timepoint_palette,
        dodge=True,
        size=2,
        ax=ax,
        linewidth=0.5,
        edgecolor="black",
        legend=False
    )
    ax.set_xlabel("")  # cleaner look
    ax.set_ylabel("% of all cells")
    ax.set_title(cell_type)
    ax.set_ylim(-0.03, None)  # adjust y range

# Clean up legend and layout
g.set_titles("{col_name}")
g.figure.subplots_adjust(top=0.9)
g.figure.suptitle("Cell-type proportions by radial neighborhoods", fontsize=16)
g.figure.legend(title="Timepoint", loc="upper right", bbox_to_anchor=(1.05, 1), frameon=False)
plt.draw()
for ax in g.axes.flatten():
    ax.tick_params(axis="x", labelrotation=90)
g.figure.savefig(out_pdf, dpi=300, bbox_inches="tight")

plt.show()

