In [None]:
import os
import re 
from pathlib import Path

import pandas as pd
import numpy as np
import scanpy as sc
import scanpy.external as sce
import anndata as ad
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
import matplotlib as mpl
from statannotations.Annotator import Annotator

# Every relative path used later in the notebook is resolved against this directory.
os.chdir('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/CODEX/Run_08272025/analysis/')

# Font settings for vector exports: Type 42 fonts in PDFs and un-outlined text
# in SVGs, so figure text stays editable.
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'svg.fonttype': 'none',
})


In [None]:

# Load the two per-slide .h5ad files (slide 1 first, slide 2 second).
slide_paths = (
    '/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/CODEX/Run_08272025/analysis/alla_updates/change_rules_order_and_thresholds1/slide1.h5ad',
    '/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/CODEX/Run_08272025/analysis/alla_updates/change_rules_order_and_thresholds1/slide2.h5ad',
)
s1, s2 = (ad.read_h5ad(slide_path) for slide_path in slide_paths)




In [None]:
# Stack both slides along the cell axis. Passing a mapping makes its keys the
# concat keys, so each obs name gets a '-Slide1' / '-Slide2' suffix
# (index_unique='-'); join='outer' keeps the union of variables.
merged = ad.concat({'Slide1': s1, 'Slide2': s2}, join='outer', index_unique='-')



In [None]:
# --- Fix sample swap ---------------------------------------------------------
# The WU007 "NDMM" and "PT" section labels are exchanged here.
# The untouched labels are backed up exactly once in `section_id_old`, and the
# corrected labels are always derived from that backup. This keeps the cell
# idempotent: re-running it can neither overwrite the backup with already
# corrected ids nor swap the two samples back.
if "section_id_old" not in merged.obs.columns:
    merged.obs["section_id_old"] = merged.obs["section_id"]

original_ids = merged.obs["section_id_old"]
merged.obs["section_id"] = np.select(
    [
        original_ids.str.contains("WU007_NDMM", regex=False),
        original_ids.str.contains("WU007_PT", regex=False),
    ],
    [
        # literal (non-regex) replacement of the label on each matching id
        original_ids.str.replace("WU007_NDMM", "WU007_PT", regex=False),
        original_ids.str.replace("WU007_PT", "WU007_NDMM", regex=False),
    ],
    default=original_ids,   # every other section keeps its id
)

# --- Derive Timepoint / Patient_ID from the corrected section id -------------
# ids containing "WU": field 0 is the patient, field 1 the timepoint
# ids containing "SN": timepoint "Normal", the whole id is used as patient id
# anything else:       field 0 is the timepoint, field 1 the patient
section_ids = merged.obs["section_id"]
id_fields = section_ids.str.split("_")
is_wu = section_ids.str.contains("WU", regex=False)
is_sn = section_ids.str.contains("SN", regex=False)

merged.obs["Timepoint"] = np.select(
    [is_wu, is_sn],
    [id_fields.str[1], "Normal"],
    default=id_fields.str[0],
)
merged.obs["Patient_ID"] = np.select(
    [is_wu, is_sn],
    [id_fields.str[0], section_ids],
    default=id_fields.str[1],
)

# One row per section: lets the swap and the parsed metadata be checked by eye.
merged.obs[["section_id_old", "section_id", "Timepoint", "Patient_ID"]].drop_duplicates()

In [None]:
def apply_manual_gates_by_section(
    adata,
    rules,
    thresholds,                      # absolute thresholds dict keyed by marker
    section_col="section_id",
    out_col="manual_gate",
    layer=None,                      # e.g., "raw_intensity"; if None, pull from obs/X
    cell_totals=None,                # per-cell totals (optional if mean provided)
    mean_total_by_sec=None           # mapping/Series: section_id -> mean total
):
    """
    Assign one gate label per cell from marker-positivity rules.

    A cell is positive for a marker when its value is >= a per-cell threshold:

       per_cell_threshold(marker) = thresholds[marker] / mean_total_by_sec[section_of_cell]

    Cells whose section has a missing or non-positive mean total get an
    infinite threshold, so they are never positive for any marker.

    Parameters
    ----------
    adata
        AnnData-like object; `.obs`, `.var_names`, `.n_obs`, `.uns` and, depending
        on where markers live, `.layers` / `.X` are used.
    rules
        Iterable of dicts with a "name", an optional "priority" (default 0) and any
        of the marker groups "all" (every marker positive), "any" (at least one
        positive) and "not_any" (none positive). A group may be a dict keyed by
        marker or any iterable of marker names; values stored in a group are
        ignored, the cut-offs always come from `thresholds`.
        Several rules may share one name.
    thresholds
        Dict of absolute thresholds keyed by marker.
    section_col
        `.obs` column holding the section id of each cell.
    out_col
        `.obs` column that receives the resulting labels.
    layer
        If given and the marker is in `var_names`, values are read from
        `adata.layers[layer]`; otherwise from `.obs`, then from `.X`.
    cell_totals
        Per-cell totals in cell order (1-D array-like, Series, or an (n, 1)
        column such as the result of a sparse `.sum(axis=1)`). Only used when
        `mean_total_by_sec` is not given; values are taken positionally.
    mean_total_by_sec
        Series or mapping from section id to the mean per-cell total.

    Returns
    -------
    pandas.Series
        `adata.obs[out_col]`, an ordered categorical whose categories are the rule
        names by descending priority followed by "Unassigned".

    Raises
    ------
    ValueError
        If neither `mean_total_by_sec` nor `cell_totals` is given, or
        `cell_totals` does not have one entry per cell.
    KeyError
        If a marker used by a rule cannot be found, or has no entry in `thresholds`.

    Notes
    -----
    Higher priority wins when several rules match a cell; between rules of equal
    priority the one listed first in `rules` wins. Also writes the scaling inputs
    to `adata.uns` for reproducibility.
    """
    n = adata.n_obs

    # --- mean total per section ---
    if mean_total_by_sec is None:
        if cell_totals is None:
            raise ValueError("Provide either mean_total_by_sec or cell_totals to compute it.")
        # Flatten to a plain 1-D array first: this accepts the (n, 1) matrix a
        # sparse row-sum returns and prevents pandas from re-aligning a Series
        # on its own index (which would silently yield NaN totals).
        totals = np.asarray(cell_totals, dtype=float).ravel()
        if totals.shape[0] != n:
            raise ValueError(
                f"cell_totals has {totals.shape[0]} entries but adata has {n} cells."
            )
        mean_total_by_sec = (
            pd.Series(totals, index=adata.obs[section_col].to_numpy())
            .groupby(level=0)
            .mean()
        )
    elif not isinstance(mean_total_by_sec, pd.Series):
        # allow a plain mapping {section_id: mean_total}
        mean_total_by_sec = pd.Series(mean_total_by_sec)

    # per-cell scale (mean total of the cell's section), aligned to cells
    sec_ids = adata.obs[section_col].values
    scale = mean_total_by_sec.reindex(sec_ids).to_numpy(dtype=float)

    # zero / negative / missing scale -> NaN, which becomes an infinite threshold below
    safe_scale = np.where(scale > 0, scale, np.nan)

    # --- helper to pull a marker's intensity vector (1D) ---
    def get_marker_values(marker):
        # Prefer a matrix layer if given and marker is a var
        if layer is not None and marker in adata.var_names:
            M = adata.layers[layer]
            j = adata.var_names.get_loc(marker)
            col = M[:, j]
            return (col.toarray().ravel() if sparse.issparse(col) else np.asarray(col).ravel())
        # If marker stored as a column in .obs (e.g., *_intensity)
        if marker in adata.obs.columns:
            return adata.obs[marker].to_numpy()
        # Fall back to X if marker is a var
        if marker in adata.var_names:
            col = adata[:, marker].X
            return (col.toarray().ravel() if sparse.issparse(col) else np.asarray(col).ravel())
        raise KeyError(f"Marker '{marker}' not found in layer '{layer}', .obs, or .X/.var_names.")

    # --- collect every marker mentioned by any rule ---
    needed_markers = set()
    for r in rules:
        for key in ("all", "any", "not_any"):
            if key in r:
                needed_markers.update(r[key])

    missing = sorted(str(m) for m in needed_markers if m not in thresholds)
    if missing:
        raise KeyError(f"No threshold provided for marker(s): {missing}")

    # effective per-cell threshold: abs_t / safe_scale, +inf where the scale is NaN
    def per_cell_threshold(abs_t):
        t = np.asarray(abs_t, dtype=float)  # scalar expected
        return np.divide(
            t,
            safe_scale,
            out=np.full_like(safe_scale, np.inf, dtype=float),
            where=~np.isnan(safe_scale),
        )

    # Positivity of each needed marker, computed once and shared by all rules.
    positive = {
        m: np.asarray(get_marker_values(m) >= per_cell_threshold(thresholds[m]), dtype=bool)
        for m in needed_markers
    }

    # --- evaluate rules with priority ---
    labels = np.array(["Unassigned"] * n, dtype=object)
    curr_prio = np.full(n, -np.inf, dtype=float)

    # descending priority; sorted() is stable, so ties keep their order in `rules`
    rules_sorted = sorted(rules, key=lambda r: r.get("priority", 0), reverse=True)

    for r in rules_sorted:
        mask = np.ones(n, dtype=bool)

        if "all" in r:
            for m in r["all"]:
                mask &= positive[m]

        if "any" in r:
            any_mask = np.zeros(n, dtype=bool)
            for m in r["any"]:
                any_mask |= positive[m]
            mask &= any_mask

        if "not_any" in r:
            excluded = np.zeros(n, dtype=bool)
            for m in r["not_any"]:
                excluded |= positive[m]
            mask &= ~excluded

        # only label cells not already claimed by a rule of equal or higher priority
        pr = float(r.get("priority", 0))
        to_set = mask & (pr > curr_prio)
        labels[to_set] = r["name"]
        curr_prio[to_set] = pr

    # --- write to adata ---
    # Categories follow priority (highest first) then "Unassigned". Names are
    # de-duplicated because pd.Categorical rejects repeated categories, which
    # would otherwise break rule sets where several rules map to one label.
    cats = list(dict.fromkeys(r["name"] for r in rules_sorted))
    if "Unassigned" not in cats:
        cats.append("Unassigned")
    adata.obs[out_col] = pd.Categorical(labels, categories=cats, ordered=True)
    # keep the scaling used (for reproducibility)
    adata.uns["mean_total_by_section"] = mean_total_by_sec.to_dict()
    adata.uns["absolute_thresholds"] = dict(thresholds)
    adata.uns["gating_section_col"] = section_col
    adata.uns["gating_layer_used"] = layer

    return adata.obs[out_col]

In [None]:
# Manual absolute thresholds per marker: the lowest intensity at which a
# positive signal is seen on the slide (updated values).
thresholds = dict([
    ('CD45_intensity', 20),
    ('CD11b_intensity', 20),
    ('CD14_intensity', 7),
    ('CD163_intensity', 5),
    ('CD68_intensity', 50),
    ('CD3e_intensity', 30),
    ('CD4_intensity', 46),
    ('CD8_intensity', 5),
    ('CD79a_intensity', 20),
    ('CD20_intensity', 5),
    ('FOXP3_intensity', 80),
    ('TACI_intensity', 10),
    ('CD138_intensity', 10),
    ('MPO_intensity', 20),
    ('CD38_intensity', 10),
    ('Vimentin_intensity', 120),
    ('CD34_intensity', 9),
])

In [None]:
# Gating strategy: one rule per cell type. Each rule lists the markers that
# must all be positive ("all"), of which at least one must be positive ("any"),
# and none of which may be positive ("not_any"). "priority" decides which rule
# wins when a cell matches several (higher number wins).

def _cutoffs(*markers):
    """Return {marker: thresholds[marker]} for the given markers, in the given order."""
    return {marker: thresholds[marker] for marker in markers}


# Myeloid markers excluded from the plasma, T and B cell gates.
_myeloid_exclusions = ("CD11b_intensity", "CD14_intensity", "CD163_intensity", "CD68_intensity")

rules = [
    dict(
        name="Plasma Cell",
        priority=9,
        any=_cutoffs("CD138_intensity", "CD38_intensity"),
        not_any=_cutoffs("CD3e_intensity", "CD20_intensity", *_myeloid_exclusions),
    ),
    dict(
        name="T Cell",
        priority=7,
        all=_cutoffs("CD3e_intensity"),
        # CD3e is already required by "all", so this "any" group is always
        # satisfied whenever "all" is.
        any=_cutoffs("CD3e_intensity", "CD4_intensity", "CD8_intensity", "FOXP3_intensity"),
        not_any=_cutoffs("CD20_intensity", "CD138_intensity", *_myeloid_exclusions),
    ),
    dict(
        name="B Cell",
        priority=5,
        any=_cutoffs("CD20_intensity", "CD79a_intensity"),
        not_any=_cutoffs("CD3e_intensity", "CD138_intensity", *_myeloid_exclusions),
    ),
    dict(
        name="Myeloid",
        priority=4,
        # No "not_any" exclusions are applied to this gate (an earlier
        # CD20 / CD3e / CD138 exclusion is disabled).
        any=_cutoffs("CD11b_intensity", "CD14_intensity", "CD68_intensity", "MPO_intensity"),
    ),
    dict(
        name="Progenitor Cell",
        priority=3,
        all=_cutoffs("CD34_intensity"),
        not_any=_cutoffs("Vimentin_intensity"),
    ),
    dict(
        name="Stroma",
        priority=2,
        any=_cutoffs("Vimentin_intensity"),
        not_any=_cutoffs("CD45_intensity"),
    ),
    dict(
        name="Unk. CD45+",
        priority=1,
        any=_cutoffs("CD45_intensity"),
    ),
]

In [None]:
# Scaling factor for the gates: for every section, the mean over its cells of
# the per-cell total raw intensity (sum across all markers).
X = merged.layers["raw_intensity"]
row_sums = X.sum(axis=1)
print(row_sums.min(), row_sums.max())

print(sparse.issparse(X)) # check if sparse
# Flatten the row sums to a plain 1-D array (a sparse row-sum is not 1-D)
# so they can be paired with the per-cell section labels.
cell_totals = np.asarray(row_sums).ravel()
sec = merged.obs["section_id"]
totals_by_cell = pd.Series(cell_totals, index=sec)
mean_total_by_sec = totals_by_cell.groupby(level=0, observed=True).mean()
mean_total_by_sec_dict = mean_total_by_sec.to_dict()
sids = np.array(sec)


In [None]:
# Apply the gates. The section means computed above are passed directly, so
# `cell_totals` is not needed. The function also writes the labels to
# merged.obs["manual_gate"].
gated = apply_manual_gates_by_section(
    adata=merged,
    rules=rules,
    thresholds=thresholds,
    section_col="section_id",
    out_col="manual_gate",
    mean_total_by_sec=mean_total_by_sec
)

print(gated.value_counts())

# Category order used for all downstream plots.
gate_order = ["Unassigned", "Unk. CD45+","Progenitor Cell", "Myeloid", "Plasma Cell", "T Cell", "B Cell", "Stroma"]

# pd.Categorical silently converts any value that is not listed in `categories`
# to NaN, so a rule name missing from gate_order would drop those cells from
# every plot without warning. Fail loudly instead.
unlisted_gates = set(gated.dropna().astype(str)) - set(gate_order)
if unlisted_gates:
    raise ValueError(f"Gate labels missing from gate_order: {sorted(unlisted_gates)}")

merged.obs["manual_gate"] = pd.Categorical(
    merged.obs["manual_gate"],
    categories=gate_order,
    ordered=True
)

In [None]:
gate_order = ["Unassigned", "Unk. CD45+","Progenitor Cell", "Myeloid", "Plasma Cell", "T Cell", "B Cell", "Stroma"]
# Plot the proportion of each cell type per section, grouped by timepoint.
df = merged.obs[['section_id','Timepoint', 'Patient_ID', 'manual_gate']].copy()

# Exclude cells without a gate; they count neither in the numerator nor in the denominator.
# NOTE(review): the y-axis label below says "% of all cells" although
# Unassigned cells are removed here - confirm the intended wording.
df_noUnk = df[(df["manual_gate"] != "Unassigned")]

# Cells per section and gate
counts = df_noUnk.groupby(['section_id','Timepoint', 'Patient_ID', 'manual_gate'], observed=True).size().reset_index(name='count')
sample_meta = (
    df[['section_id','Timepoint', 'Patient_ID',]].drop_duplicates(subset=['section_id'])
)

# Every (section, gate) pair, so gates absent from a section show up as 0 %.
# Built from the filtered frame so excluded gates do not get all-zero rows.
all_combos = (
    pd.MultiIndex.from_product([df['section_id'].unique(), df_noUnk['manual_gate'].unique()],
        names=['section_id', 'manual_gate']).to_frame(index=False)
)

# merge and fill missing counts with 0
full_counts = (
    all_combos
    .merge(counts[['section_id','manual_gate', 'count']], on=['section_id','manual_gate'], how='left')
    .merge(sample_meta, on='section_id', how='left')
    .fillna({'count': 0})
)

# Total (kept) cells per section, aligned to the rows of full_counts
totals = full_counts.groupby('section_id', observed=True)['count'].transform('sum')

# Percentage of the section's kept cells
full_counts['proportion'] = full_counts['count'] * 100 / totals

sns.set(style="whitegrid", font_scale=1.2)
order_timepoints = ["Normal", "NDMM", "PT"]
order_celltypes = sorted(counts["manual_gate"].unique())  # alphabetical panel order
timepoint_palette = {"Normal": "forestgreen", "NDMM": "deeppink", "PT": "plum"}
paired_timepoints = ["NDMM", "PT"]   # timepoints joined by a per-patient line

g = sns.catplot(
    data=full_counts,
    x="Timepoint",
    y="proportion",
    hue="Timepoint",
    col="manual_gate",
    kind="box",
    order=order_timepoints,
    col_order=order_celltypes,
    palette=timepoint_palette,
    dodge=False,
    sharey=False,
    col_wrap=3,
    height=4,
    aspect=0.9,
    showfliers=False
)

for cell_type, ax in g.axes_dict.items():
    sub = full_counts[full_counts["manual_gate"] == cell_type]

    # individual sections on top of the boxes
    sns.stripplot(
        data=sub,
        x="Timepoint",
        y="proportion",
        hue="Timepoint",
        order=order_timepoints,
        palette=timepoint_palette,
        dodge=False,
        size=5,
        ax=ax,
        linewidth=0.5,
        edgecolor="black",
        legend=False
    )
    ax.set_xlabel("")  # cleaner look
    ax.set_ylabel("% of all cells")
    ax.set_title(cell_type)
    ax.set_ylim(0, None)  # y axis starts at 0

    # Connect a patient's NDMM and PT sections. Only the NDMM/PT rows are used
    # and both timepoints must be present: two sections of the same timepoint
    # are not a pair, and rows from any other timepoint must not reach
    # order_timepoints.index().
    for _, grp in sub.groupby("Patient_ID"):
        paired = grp[grp["Timepoint"].isin(paired_timepoints)]
        if set(paired["Timepoint"]) != set(paired_timepoints):
            continue
        paired = paired.sort_values("Timepoint", key=lambda x: x.map({"NDMM": 1, "PT": 2}))
        ax.plot(
            [order_timepoints.index(tp) for tp in paired["Timepoint"]],
            paired["proportion"],
            color="gray",
            alpha=0.5,
            linewidth=1
        )

    # --- significance tests ---
    pairs = [("Normal", "NDMM"), ("NDMM", "PT")]  # groups to compare
    annotator = Annotator(ax, pairs, data=sub, x="Timepoint", y="proportion", order=order_timepoints)
    annotator.configure(
        test="Mann-Whitney",        # non-parametric rank test (Wilcoxon rank-sum), unpaired
        text_format="simple",       # print the p-value rather than significance stars
        verbose=False
    )
    annotator.apply_and_annotate()

# Clean up legend and layout
g.set_titles("{col_name}")
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("Cell-type proportions by Timepoint", fontsize=16)
g.add_legend(title="Timepoint")

# Create the output folder if needed so a fresh run does not fail on savefig.
boxplot_pdf = Path("alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots/celltype_boxplots_Wilcoxon.pdf")
boxplot_pdf.parent.mkdir(parents=True, exist_ok=True)
g.fig.savefig(boxplot_pdf, dpi=300, bbox_inches="tight")

plt.show()



In [None]:
# Plot the proportion of each cell type per section, grouped by timepoint,
# with plasma cells removed from the denominator.
# NOTE(review): apart from the filter, y-label and output file this repeats the
# all-cells proportion plot; a shared helper would remove the duplication.
df = merged.obs[['section_id','Timepoint', 'Patient_ID', 'manual_gate']].copy()

# Exclude ungated cells and plasma cells from both numerator and denominator.
df_noUnk = df[(df["manual_gate"] != "Unassigned") & (df["manual_gate"] != "Plasma Cell")]

# Cells per section and gate
counts = df_noUnk.groupby(['section_id','Timepoint', 'Patient_ID', 'manual_gate'], observed=True).size().reset_index(name='count')
sample_meta = (
    df[['section_id','Timepoint', 'Patient_ID',]].drop_duplicates(subset=['section_id'])
)

# Every (section, gate) pair, so gates absent from a section show up as 0 %.
# Built from the filtered frame so excluded gates do not get all-zero rows.
all_combos = (
    pd.MultiIndex.from_product([df['section_id'].unique(), df_noUnk['manual_gate'].unique()],
        names=['section_id', 'manual_gate']).to_frame(index=False)
)

# merge and fill missing counts with 0
full_counts = (
    all_combos
    .merge(counts[['section_id','manual_gate', 'count']], on=['section_id','manual_gate'], how='left')
    .merge(sample_meta, on='section_id', how='left')
    .fillna({'count': 0})
)

# Total (kept) cells per section, aligned to the rows of full_counts
totals = full_counts.groupby('section_id', observed=True)['count'].transform('sum')

# Percentage of the section's kept cells
full_counts['proportion'] = full_counts['count'] * 100 / totals

sns.set(style="whitegrid", font_scale=1.2)
order_timepoints = ["Normal", "NDMM", "PT"]
order_celltypes = sorted(counts["manual_gate"].unique())  # alphabetical panel order
timepoint_palette = {"Normal": "forestgreen", "NDMM": "deeppink", "PT": "plum"}
paired_timepoints = ["NDMM", "PT"]   # timepoints joined by a per-patient line

g = sns.catplot(
    data=full_counts,
    x="Timepoint",
    y="proportion",
    hue="Timepoint",
    col="manual_gate",
    kind="box",
    order=order_timepoints,
    col_order=order_celltypes,
    palette=timepoint_palette,
    dodge=False,
    sharey=False,
    col_wrap=3,
    height=4,
    aspect=0.9,
    showfliers=False
)

for cell_type, ax in g.axes_dict.items():
    sub = full_counts[full_counts["manual_gate"] == cell_type]

    # individual sections on top of the boxes
    sns.stripplot(
        data=sub,
        x="Timepoint",
        y="proportion",
        hue="Timepoint",
        order=order_timepoints,
        palette=timepoint_palette,
        dodge=False,
        size=5,
        ax=ax,
        linewidth=0.5,
        edgecolor="black",
        legend=False
    )
    ax.set_xlabel("")  # cleaner look
    ax.set_ylabel("% of non-plasma cells")
    ax.set_title(cell_type)
    ax.set_ylim(0, None)  # y axis starts at 0

    # Connect a patient's NDMM and PT sections. Only the NDMM/PT rows are used
    # and both timepoints must be present: two sections of the same timepoint
    # are not a pair, and rows from any other timepoint must not reach
    # order_timepoints.index().
    for _, grp in sub.groupby("Patient_ID"):
        paired = grp[grp["Timepoint"].isin(paired_timepoints)]
        if set(paired["Timepoint"]) != set(paired_timepoints):
            continue
        paired = paired.sort_values("Timepoint", key=lambda x: x.map({"NDMM": 1, "PT": 2}))
        ax.plot(
            [order_timepoints.index(tp) for tp in paired["Timepoint"]],
            paired["proportion"],
            color="gray",
            alpha=0.5,
            linewidth=1
        )

    # --- significance tests ---
    pairs = [("Normal", "NDMM"), ("NDMM", "PT")]  # groups to compare
    annotator = Annotator(ax, pairs, data=sub, x="Timepoint", y="proportion", order=order_timepoints)
    annotator.configure(
        test="Mann-Whitney",        # non-parametric rank test (Wilcoxon rank-sum), unpaired
        text_format="simple",       # print the p-value rather than significance stars
        verbose=False
    )
    annotator.apply_and_annotate()

# Clean up legend and layout
g.set_titles("{col_name}")
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("Cell-type proportions by Timepoint", fontsize=16)
g.add_legend(title="Timepoint")

# Create the output folder if needed so a fresh run does not fail on savefig.
boxplot_pdf = Path("alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots/celltype_boxplots_nonplasma_Wilcoxon.pdf")
boxplot_pdf.parent.mkdir(parents=True, exist_ok=True)
g.fig.savefig(boxplot_pdf, dpi=300, bbox_inches="tight")

plt.show()



In [None]:
gate_order = ["Unassigned", "Unk. CD45+","Progenitor Cell", "Myeloid", "Plasma Cell", "T Cell", "B Cell", "Stroma"]

# Marker panel for the dotplots, grouped under a short lineage label.
celltyping_markers = {
    "Imm": ['CD45_intensity'],
    "Prog": ['CD34_intensity', 'cKit_intensity'],
    "Mye": ['CD11b_intensity', 'CD14_intensity', 'CD68_intensity', 'MPO_intensity'],
    "PC": ['CD138_intensity', 'CD38_intensity'],
    "T": ['CD3e_intensity', 'CD4_intensity', 'CD8_intensity', 'Granzyme-B_intensity'],
    "B": ['CD79a_intensity', 'CD20_intensity', 'CD27_intensity'],
    "Stroma": ['Vimentin_intensity'],
}

# Marker values per gate, each marker scaled across the gates
# (standard_scale='var'). No layer is passed, so scanpy's default data source
# is used. show=False leaves the figure open so it can be saved below.
dotplot_options = {"groupby": 'manual_gate', "standard_scale": 'var', "show": False}
sc.pl.dotplot(merged, var_names=celltyping_markers, **dotplot_options)
plt.savefig("alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots/merged_manual_gate_dotplot.pdf", bbox_inches="tight")

In [None]:
# Independent copy of the data without the two non-specific gates.
unknown_gates = ['Unassigned', 'Unk. CD45+']
has_specific_gate = ~merged.obs['manual_gate'].isin(unknown_gates)
merged_noUnk = merged[has_specific_gate].copy()

In [None]:
# Same marker dotplot as for all cells, restricted to the object without
# 'Unassigned' / 'Unk. CD45+' cells. show=False leaves the figure open for saving.
noUnk_dotplot_options = {"groupby": 'manual_gate', "standard_scale": 'var', "show": False}
sc.pl.dotplot(merged_noUnk, var_names=celltyping_markers, **noUnk_dotplot_options)
plt.savefig("alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots/merged_manual_gate_dotplot_noUnk.pdf", bbox_inches="tight")

In [None]:
# Sorted list of the distinct section ids; the per-section loops below iterate over it.
sids = list(set(merged.obs['section_id']))
sids.sort()
sids

In [None]:
# Per-section marker dotplots (one PDF page per section), for all cells and
# for the object without unknown gates.
from matplotlib.backends.backend_pdf import PdfPages


def _save_per_sample_dotplots(adata, section_ids, markers, pdf_path):
    """Write one marker dotplot per section of `adata` as pages of a PDF.

    Parameters
    ----------
    adata : AnnData with `section_id` and `manual_gate` columns in `.obs`.
    section_ids : iterable of section ids to plot, in page order.
    markers : marker selection passed to `sc.pl.dotplot` as `var_names`.
    pdf_path : output PDF path; its parent folder is created if missing.

    Sections with no cells in `adata` are reported and skipped.
    """
    pdf_path = Path(pdf_path)
    pdf_path.parent.mkdir(parents=True, exist_ok=True)

    with PdfPages(pdf_path) as pdf:
        for s in section_ids:
            print(s)
            sobj = adata[adata.obs['section_id'] == s].copy()
            if sobj.n_obs == 0:
                # e.g. every cell of the section was filtered out upstream
                print(f"  no cells for {s}; page skipped")
                continue

            dp = sc.pl.dotplot(
                sobj,
                var_names=markers,
                groupby="manual_gate",
                standard_scale="var",
                return_fig=True,
                show=False,
                title=s
            )

            # make_figure() renders onto dp.fig; its return value is not a
            # figure, so pass dp.fig explicitly instead of relying on
            # savefig's "current figure" fallback.
            dp.make_figure()
            pdf.savefig(dp.fig, bbox_inches="tight")
            plt.close(dp.fig)

    print(f"Saved: {pdf_path}")


pdf_out = "alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots/merged_per_sample_marker_dotplots.pdf"
_save_per_sample_dotplots(merged, sids, celltyping_markers, pdf_out)

pdf_out = "alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots/merged_per_sample_marker_dotplots_noUnk.pdf"
_save_per_sample_dotplots(merged_noUnk, sids, celltyping_markers, pdf_out)

In [None]:
# Save the gated object as .h5ad and its per-cell metadata (with the cell index) as CSV.
merged.write_h5ad('alla_updates/change_rules_order_and_thresholds2_fix_swap//merged.h5ad')
merged.obs.to_csv('alla_updates/change_rules_order_and_thresholds2_fix_swap//merged_metadata.csv')

In [None]:
# Fixed colour per gate, used by the spatial scatterplots and the stacked barplot.
gate_colors = {
    "Unassigned": "#000000",
    "Unk. CD45+": "#fcf338",
    "B Cell": "#032cfc",
    "Plasma Cell": "#ffbafd",
    "T Cell": "#fc0000",
    "Myeloid": "#1dfc00",
    "Progenitor Cell": "#a2d6f9",
    "Stroma": "#ff9e1b"
}

# Store the colours in .uns["manual_gate_colors"], listed in the same order as
# each object's own manual_gate categories.
for gated_adata in (merged, merged_noUnk):
    gate_categories = gated_adata.obs["manual_gate"].cat.categories
    gated_adata.uns["manual_gate_colors"] = [gate_colors[gate] for gate in gate_categories]

In [None]:
output_dir = Path("alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots")

from matplotlib.backends.backend_pdf import PdfPages

# One page per section: all cells on the left, cells with a specific gate
# (taken from merged_noUnk) on the right.
with PdfPages(output_dir / "merged_scatterplots_allCells_vs_noUnk.pdf") as pdf:
    for sid in sids:
        f = merged[merged.obs["section_id"] == sid].copy()
        f2 = merged_noUnk[merged_noUnk.obs["section_id"] == sid].copy()

        fig, axes = plt.subplots(1, 2, figsize=(10, 10), constrained_layout=True)

        panels = (
            (axes[0], f, "all cells"),
            (axes[1], f2, "no unknown or multiplet"),
        )
        for panel_ax, section_cells, panel_label in panels:
            sc.pl.scatter(
                section_cells,
                x="row",
                y="col",
                color="manual_gate",
                ax=panel_ax,
                legend_loc="none",
                show=False,
                size=2,
            )
            panel_ax.set_title(f"{sid} — {panel_label}")
            panel_ax.set_aspect("equal")
            panel_ax.invert_yaxis()
            # rasterise the point clouds so the PDF stays small
            for coll in panel_ax.collections:
                coll.set_rasterized(True)

        # give the right panel the left panel's extents for a fair side-by-side comparison
        axes[1].set_xlim(axes[0].get_xlim())
        axes[1].set_ylim(axes[0].get_ylim())

        pdf.savefig(fig, bbox_inches="tight")
        plt.close(fig)

print("Saved: merged_scatterplots_allCells_vs_noUnk.pdf")


In [None]:
# Display the cell-type panel order as last assigned by a proportion-boxplot cell above.
order_celltypes

In [None]:
# Stacked bar chart of the gate composition of every section (Unassigned cells excluded).
# Relies on `df` (per-cell section / timepoint / patient / gate table) from an earlier cell.
df_noUnk = df[df["manual_gate"] != "Unassigned"]

# Cells per section and gate
counts = (
    df_noUnk
    .groupby(['section_id', 'Timepoint', 'Patient_ID', 'manual_gate'], observed=True)
    .size()
    .reset_index(name='count')
)

# Section totals aligned to the rows of `counts`, then percentage per gate
totals = counts.groupby('section_id', observed=True)['count'].transform('sum')
counts['proportion'] = counts['count'] * 100 / totals

plt.figure(figsize=(8, 6))
# A weighted histogram over section_id, stacked by gate, gives one stacked bar per section.
bar_ax = sns.histplot(
    data=counts,
    x='section_id',
    weights='proportion',
    hue='manual_gate',
    hue_order=gate_order,
    multiple='stack',
    palette=gate_colors,
    legend=True
)
bar_ax.set_ylabel('Proportion of Cells')
bar_ax.set_xlabel('Section ID')
bar_ax.set_title('Proportion of Manual Gates per Section')
plt.xticks(rotation=90, ha='right')
plt.tight_layout()
plt.savefig("alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots/Barplot_cell_types_per_sample.pdf", bbox_inches="tight")
plt.show()

In [None]:
from scipy import sparse

def plot_marker_by_celltype(merged, marker, cell_type, save_path=None, test_type="t-test_ind",
                            gate_col="manual_gate"):
    """Box-plot the per-section mean of one marker within one gated cell type.

    Cells whose ``merged.obs[gate_col]`` equals ``cell_type`` are selected, the
    ``marker`` column of ``merged.X`` is averaged per ``section_id``, and the
    section means are shown as a boxplot over Timepoint (Normal / NDMM / PT).
    Sections that share a Patient_ID are joined by a line, and the
    Normal-vs-NDMM and NDMM-vs-PT comparisons are annotated via statannotations.

    Timepoint and Patient_ID are parsed from ``section_id``:
    ids containing "WU" are split on "_" into (patient, timepoint); ids
    containing "SN" are labelled "Normal" and act as their own patient; any
    other id is split on "_" into (timepoint, patient).

    Parameters
    ----------
    merged : AnnData-like
        Object exposing ``obs`` (with ``section_id`` and ``gate_col``),
        ``var_names`` and ``X``.
    marker : str
        Entry of ``merged.var_names`` to plot.
    cell_type : str
        Value of ``gate_col`` selecting the cells.
    save_path : str or None
        If truthy, the figure is saved there (dpi=300).
    test_type : str
        Forwarded to ``Annotator.configure(test=...)``.
    gate_col : str
        Column of ``merged.obs`` holding the gate labels.

    Returns
    -------
    matplotlib Axes holding the plot.

    Raises
    ------
    KeyError
        If ``marker`` is not in ``merged.var_names``.
    ValueError
        If no cell carries the requested ``cell_type``.
    """
    # Boolean mask of the cells gated as `cell_type`.
    mask = (merged.obs[gate_col] == cell_type).to_numpy()
    if not mask.any():
        raise ValueError(f"No cells with {gate_col} == {cell_type!r}; nothing to plot.")

    col_idx = merged.var_names.get_loc(marker)

    # Marker values of the selected cells, densified when X is sparse.
    vals = merged.X[mask, col_idx]
    marker_vals = vals.toarray().ravel() if sparse.issparse(vals) else np.asarray(vals).ravel()

    per_cell = pd.DataFrame({
        'section_id': merged.obs.loc[mask, 'section_id'].to_numpy(),
        'Marker_intensity': marker_vals,
    })

    # One row per section: mean marker intensity of the selected cells.
    avg_df = per_cell.groupby('section_id', observed=True)['Marker_intensity'].mean().reset_index()

    # Derive Timepoint / Patient_ID from the section id (see docstring for the rules).
    sid = avg_df["section_id"]
    is_wu = sid.str.contains("WU")
    is_sn = sid.str.contains("SN")
    chunks = sid.str.split("_")
    avg_df["Timepoint"] = np.select(
        [is_wu, is_sn],
        [chunks.str[1], "Normal"],   # WU: second chunk; SN: fixed label
        default=chunks.str[0],        # otherwise: first chunk
    )
    avg_df["Patient_ID"] = np.select(
        [is_wu, is_sn],
        [chunks.str[0], sid],         # WU: first chunk; SN: the whole section id
        default=chunks.str[1],        # otherwise: second chunk
    )

    sns.set(style="whitegrid", font_scale=1.3)

    # Consistent colors and left-to-right order of the timepoints.
    palette = {"Normal": "forestgreen", "NDMM": "deeppink", "PT": "plum"}
    order = ["Normal", "NDMM", "PT"]

    fig, ax = plt.subplots(figsize=(7, 5))

    # --- Boxplot ---
    sns.boxplot(
        data=avg_df,
        x="Timepoint",
        y="Marker_intensity",
        order=order,
        palette=palette,
        showfliers=False,
        linewidth=1.2,
        ax=ax,
    )

    # --- Add individual points connected by patient ---
    # Boxes sit at x = 0, 1, 2 in `order`; plot the points at those explicit
    # positions instead of passing strings, whose placement would depend on the
    # axis' category mapping rather than on `order`.
    position = {tp: i for i, tp in enumerate(order)}
    shown = avg_df[avg_df["Timepoint"].isin(order)]
    for _, patient_rows in shown.groupby("Patient_ID"):
        patient_rows = patient_rows.sort_values("Timepoint", key=lambda s: s.map(position))
        ax.plot(
            patient_rows["Timepoint"].map(position).to_numpy(),
            patient_rows["Marker_intensity"].to_numpy(),
            marker="o",
            color="black",
            linewidth=1,
            markersize=6,
            alpha=0.9,
        )

    # --- Add significance tests ---
    pairs = [("Normal", "NDMM"), ("NDMM", "PT")]  # groups to compare
    annotator = Annotator(ax, pairs, data=avg_df, x="Timepoint", y="Marker_intensity", order=order)
    annotator.configure(
        test=test_type,          # e.g. "t-test_ind" or "Mann-Whitney"
        text_format="simple",    # show the p-value as text
        verbose=False,
    )
    annotator.apply_and_annotate()

    # --- Customize axes ---
    ax.set_xlabel("Timepoint", fontsize=13)
    ax.set_ylabel(marker, fontsize=13)
    ax.set_title(f"{marker.replace('_intensity', '')} in {cell_type}s", fontsize=15)
    fig.tight_layout()

    if save_path:
        fig.savefig(save_path, dpi=300, bbox_inches="tight")
        print(f"✅ Saved to {save_path}")

    plt.show()

    return ax

In [None]:
# Mann-Whitney boxplots of selected markers within B cells.
# Each entry is (marker in merged.var_names, output file name). The output
# directory is written once so no path can pick up a stray "//" separator.
bcell_plot_dir = "alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots"
bcell_plots = [
    ('Ki67_intensity', "Ki67_b-cell_boxplot_Mann-Whitney.pdf"),
    ('CD20_intensity', "CD20_b-cell_boxplot_Mann-Whitney.pdf"),
    ('CD79a_intensity', "CD79a_b-cell_boxplot_Mann-Whitney.pdf"),
    ('Phospho IKKa/b_intensity', "Phospho_IKKa.b-cell_boxplot_Mann-Whitney.pdf"),
    ('NFKB p65_intensity', "NFKB_p65.b-cell_boxplot_Mann-Whitney.pdf"),
    ('IkBa_intensity', "IkBa.b-cell_boxplot_Mann-Whitney.pdf"),
    ('PD1_intensity', "PD1.b-cell_boxplot_Mann-Whitney.pdf"),
]

for bcell_marker, bcell_file in bcell_plots:
    plot_marker_by_celltype(
        merged,
        marker=bcell_marker,
        cell_type='B Cell',
        test_type='Mann-Whitney',
        save_path=f"{bcell_plot_dir}/{bcell_file}",
    )


In [None]:
# Mann-Whitney boxplots of selected markers within plasma cells.
# Pairs are (marker prefix, file-name prefix); the Ki67 file is spelled "ki67".
plasma_plot_dir = "alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots"
for marker_prefix, file_prefix in [("CD138", "CD138"), ("CD38", "CD38"), ("Ki67", "ki67")]:
    plot_marker_by_celltype(
        merged,
        marker=f"{marker_prefix}_intensity",
        cell_type='Plasma Cell',
        test_type='Mann-Whitney',
        save_path=f"{plasma_plot_dir}/{file_prefix}_PlasmaCell_boxplot_Mann-Whitney.pdf",
    )


In [None]:
# Mann-Whitney boxplots of selected markers within myeloid cells.
myeloid_plot_dir = "alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots"
for marker_prefix in ("BAFF", "PDL1", "TGFB1", "HIF1A"):
    plot_marker_by_celltype(
        merged,
        marker=f"{marker_prefix}_intensity",
        cell_type='Myeloid',
        test_type='Mann-Whitney',
        save_path=f"{myeloid_plot_dir}/{marker_prefix}_Myeloid_boxplot_Mann-Whitney.pdf",
    )



In [None]:
# Mann-Whitney boxplots of checkpoint markers within T cells.
tcell_plot_dir = "alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots"
for marker_prefix in ("PD1", "TIM3", "CTLA4"):
    plot_marker_by_celltype(
        merged,
        marker=f"{marker_prefix}_intensity",
        cell_type='T Cell',
        test_type='Mann-Whitney',
        save_path=f"{tcell_plot_dir}/{marker_prefix}_Tcell_boxplot_Mann-Whitney.pdf",
    )



In [None]:
# Mann-Whitney boxplots of further markers within T cells.
tcell_plot_dir = "alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots"
for marker_prefix in ("Granzyme-B", "CD8", "CD4", "IFNG"):
    plot_marker_by_celltype(
        merged,
        marker=f"{marker_prefix}_intensity",
        cell_type='T Cell',
        test_type='Mann-Whitney',
        save_path=f"{tcell_plot_dir}/{marker_prefix}_Tcell_boxplot_Mann-Whitney.pdf",
    )


# introduce CD4 and CD8 T-cells

In [None]:
def _thr(*markers):
    """Return a fresh ``{marker: thresholds[marker]}`` dict for the given markers.

    Building every condition through this helper ties each marker to its own
    threshold, so a marker can no longer be paired with another marker's cutoff.
    """
    return {m: thresholds[m] for m in markers}


# Markers that must be negative for a cell to be called any kind of T cell.
_T_EXCLUDED = ("CD20_intensity", "CD138_intensity", "CD11b_intensity",
               "CD14_intensity", "CD163_intensity", "CD68_intensity")

# Gating rules with separate CD4 / CD8 T-cell populations.
# Each rule carries a "name", a numeric "priority" and up to three conditions
# ("all", "any", "not_any") mapping markers to thresholds.
# NOTE(review): higher priority presumably wins when several rules match —
# confirm against apply_manual_gates_by_section.
rules_t = [
    {
        "name": "Plasma Cell",
        "priority": 9,
        "any": _thr("CD138_intensity", "CD38_intensity"),
        "not_any": _thr("CD3e_intensity", "CD20_intensity", "CD11b_intensity",
                        "CD14_intensity", "CD163_intensity", "CD68_intensity"),
    },
    {
        "name": "CD8 T Cell",
        "priority": 8,
        "all": _thr("CD3e_intensity", "CD8_intensity"),
        "not_any": _thr(*_T_EXCLUDED),
    },
    {
        "name": "CD4 T Cell",
        "priority": 7,
        # CD4 is gated against its own threshold (previously the CD8 threshold was used).
        "all": _thr("CD3e_intensity", "CD4_intensity"),
        "not_any": _thr(*_T_EXCLUDED),
    },
    {
        "name": "T Cell",
        "priority": 6,
        "all": _thr("CD3e_intensity"),
        "any": _thr("CD3e_intensity", "CD4_intensity", "CD8_intensity", "FOXP3_intensity"),
        "not_any": _thr(*_T_EXCLUDED),
    },
    {
        "name": "B Cell",
        "priority": 5,
        "any": _thr("CD20_intensity", "CD79a_intensity"),
        "not_any": _thr("CD3e_intensity", "CD138_intensity", "CD11b_intensity",
                        "CD14_intensity", "CD163_intensity", "CD68_intensity"),
    },
    {
        "name": "Myeloid",
        "priority": 4,
        # No exclusion markers for this rule.
        "any": _thr("CD11b_intensity", "CD14_intensity", "CD68_intensity", "MPO_intensity"),
    },
    {
        "name": "Progenitor Cell",
        "priority": 3,
        "all": _thr("CD34_intensity"),
        "not_any": _thr("Vimentin_intensity"),
    },
    {
        "name": "Stroma",
        "priority": 2,
        "any": _thr("Vimentin_intensity"),
        "not_any": _thr("CD45_intensity"),
    },
    {
        "name": "Unk. CD45+",
        "priority": 1,
        "any": _thr("CD45_intensity"),
    },
]


# Run the CD4/CD8-aware rule set; the helper is defined elsewhere in the notebook.
gated = apply_manual_gates_by_section(
    adata=merged,
    rules=rules_t,
    thresholds=thresholds,
    section_col="section_id",
    out_col="manual_gate_withT",
    mean_total_by_sec=mean_total_by_sec,   # passed instead of cell_totals
)

# Number of cells per assigned label.
print(gated.value_counts())

# Category order used for the gate column.
gate_order = [
    "Unassigned", "Unk. CD45+", 'Progenitor Cell', "Myeloid", "Plasma Cell",
    "CD8 T Cell", "CD4 T Cell", "T Cell", "B Cell", 'Stroma',
]

# Store the labels on the object, then convert the column to an ordered categorical.
merged.obs['manual_gate_withT'] = gated
merged.obs["manual_gate_withT"] = pd.Categorical(
    merged.obs["manual_gate_withT"], categories=gate_order, ordered=True
)

In [None]:
gate_order = [
    "Unassigned", "Unk. CD45+", "Progenitor Cell", "Myeloid", "Plasma Cell",
    "CD4 T Cell", "CD8 T Cell", "B Cell", "Stroma",
]

# Marker panel shown in the dot plot, grouped by lineage label.
celltyping_markers = dict(
    Imm=['CD45_intensity'],
    Prog=['CD34_intensity', 'cKit_intensity'],
    Mye=['CD11b_intensity', 'CD14_intensity', 'CD68_intensity', 'MPO_intensity'],
    PC=['CD138_intensity', 'CD38_intensity'],
    T=['CD3e_intensity', 'CD4_intensity', 'CD8_intensity', 'Granzyme-B_intensity', 'CD45RO_intensity'],
    B=['CD79a_intensity', 'CD20_intensity', 'CD27_intensity'],
    Stroma=['Vimentin_intensity'],
)

# Dot plot of the panel per gate; each marker is scaled across groups (standard_scale='var').
dotplot_path = "alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots/merged_manual_gate_dotplot_CD4_CD8.pdf"
sc.pl.dotplot(merged, var_names=celltyping_markers, groupby='manual_gate_withT',
              standard_scale='var', show=False)
plt.savefig(dotplot_path, bbox_inches="tight")

In [None]:
# Show the current working directory; the relative output paths used in this
# notebook are resolved against it.
os.getcwd()

In [None]:
# Plot the proportion of each cell type per section, grouped by timepoint.
df = merged.obs[['section_id', "Timepoint", "Patient_ID", 'manual_gate_withT']].copy()

# Drop only the cells that received no gate; "Unk. CD45+" cells are kept.
df_noUnk = df[df["manual_gate_withT"] != "Unassigned"]

# Cells per section and cell type (sample annotations carried along as group keys).
counts = (
    df_noUnk
    .groupby(['section_id', 'Timepoint', 'Patient_ID', 'manual_gate_withT'], observed=True)
    .size()
    .reset_index(name='count')
)

# One metadata row per section.
sample_meta = df[['section_id', 'Timepoint', 'Patient_ID']].drop_duplicates(subset=['section_id'])

# Every section x cell-type pair, so that a cell type absent from a section shows up as 0.
all_combos = pd.MultiIndex.from_product(
    [df['section_id'].unique(), df['manual_gate_withT'].unique()],
    names=['section_id', 'manual_gate_withT'],
).to_frame(index=False)

# Attach the observed counts, then the sample annotations; missing counts become 0.
full_counts = all_combos.merge(
    counts[['section_id', 'manual_gate_withT', 'count']],
    on=['section_id', 'manual_gate_withT'],
    how='left',
)
full_counts = full_counts.merge(sample_meta, on='section_id', how='left')
full_counts = full_counts.fillna({'count': 0})

# Per-row total of the section the row belongs to, and the resulting percentage.
totals = full_counts.groupby('section_id', observed=True)['count'].transform('sum')
full_counts['proportion'] = full_counts['count'] * 100 / totals

sns.set(style="whitegrid", font_scale=1.2)
order_timepoints = ["Normal", "NDMM", "PT"]
order_celltypes = sorted(counts["manual_gate_withT"].unique())  # alphabetical panel order
timepoint_palette = {"Normal": "forestgreen", "NDMM": "deeppink", "PT": "plum"}
pairs = [("Normal", "NDMM"), ("NDMM", "PT")]  # timepoint pairs to test

# One box-plot panel per cell type.
g = sns.catplot(
    data=full_counts,
    x="Timepoint",
    y="proportion",
    hue="Timepoint",
    col="manual_gate_withT",
    kind="box",
    order=order_timepoints,
    col_order=order_celltypes,
    palette=timepoint_palette,
    dodge=False,
    sharey=False,
    col_wrap=3,
    height=4,
    aspect=0.9,
    showfliers=False,
)

for cell_type, ax in g.axes_dict.items():
    sub = full_counts[full_counts["manual_gate_withT"] == cell_type]

    # Individual sections on top of the boxes.
    sns.stripplot(
        data=sub,
        x="Timepoint",
        y="proportion",
        hue="Timepoint",
        order=order_timepoints,
        palette=timepoint_palette,
        dodge=False,
        size=5,
        ax=ax,
        linewidth=0.5,
        edgecolor="black",
        legend=False,
    )
    ax.set_xlabel("")
    ax.set_ylabel("% of all cells")
    ax.set_title(cell_type)
    ax.set_ylim(0, None)

    # Join the NDMM and PT sections of a patient that has at least two of them.
    for _, grp in sub.groupby("Patient_ID"):
        if grp["Timepoint"].isin(["NDMM", "PT"]).sum() < 2:
            continue
        grp_sorted = grp.sort_values("Timepoint", key=lambda s: s.map({"NDMM": 1, "PT": 2}))
        x_positions = [order_timepoints.index(tp) for tp in grp_sorted["Timepoint"]]
        ax.plot(x_positions, grp_sorted["proportion"], color="gray", alpha=0.5, linewidth=1)

    # Significance annotation for the two timepoint comparisons.
    annotator = Annotator(ax, pairs, data=sub, x="Timepoint", y="proportion", order=order_timepoints)
    annotator.configure(test="Mann-Whitney", text_format="simple", verbose=False)
    annotator.apply_and_annotate()

# Titles, legend, layout, export.
g.set_titles("{col_name}")
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("Cell-type proportions by Timepoint", fontsize=16)
g.add_legend(title="Timepoint")
g.fig.savefig("alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots/celltype_boxplots_manual_gate_withT_Mann-Whitney.pdf", dpi=300, bbox_inches="tight")

plt.show()



In [None]:
# Plot the share of each T-cell subtype among the T cells of every section,
# grouped by timepoint.
df = merged.obs[['section_id', "Timepoint", "Patient_ID", 'manual_gate_withT']].copy()

# Keep T-lineage gates only (the name `df_noUnk` is kept for the cells below).
df_noUnk = df[df["manual_gate_withT"].isin(["T Cell", "CD4 T Cell", "CD8 T Cell"])]

# T cells per section and subtype (sample annotations carried along as group keys).
counts = df_noUnk.groupby(['section_id', 'Timepoint', 'Patient_ID', 'manual_gate_withT'], observed=True).size().reset_index(name='count')

# One metadata row per section.
sample_meta = (
    df[['section_id', 'Timepoint', 'Patient_ID']].drop_duplicates(subset=['section_id'])
)

# Section x subtype grid built from the T-cell subset only. Building it from
# `df` would add all-zero rows for every non-T gate (and NaN proportions for
# sections without T cells), which then leak into any statistics computed
# from `full_counts`.
all_combos = (
    pd.MultiIndex.from_product([df_noUnk['section_id'].unique(), df_noUnk['manual_gate_withT'].unique()],
        names=['section_id', 'manual_gate_withT']).to_frame(index=False)
)

# Merge in the counts and annotations; subtypes absent from a section get 0.
full_counts = (
    all_combos
    .merge(counts[['section_id', 'manual_gate_withT', 'count']], on=['section_id', 'manual_gate_withT'], how='left')
    .merge(sample_meta, on='section_id', how='left')
    .fillna({'count': 0})
)

# T cells per section (aligned to the rows) and the percentage per subtype.
totals = full_counts.groupby('section_id', observed=True)['count'].transform('sum')
full_counts['proportion'] = full_counts['count'] * 100 / totals

sns.set(style="whitegrid", font_scale=1.2)
order_timepoints = ["Normal", "NDMM", "PT"]
order_celltypes = sorted(counts["manual_gate_withT"].unique())  # alphabetical panel order
timepoint_palette = {"Normal": "forestgreen", "NDMM": "deeppink", "PT": "plum"}
pairs = [("Normal", "NDMM"), ("NDMM", "PT")]  # timepoint pairs to test

# One box-plot panel per T-cell subtype.
g = sns.catplot(
    data=full_counts,
    x="Timepoint",
    y="proportion",
    hue="Timepoint",
    col="manual_gate_withT",
    kind="box",
    order=order_timepoints,
    col_order=order_celltypes,
    palette=timepoint_palette,
    dodge=False,
    sharey=False,
    col_wrap=3,
    height=4,
    aspect=0.9,
    showfliers=False
)

for cell_type, ax in g.axes_dict.items():
    sub = full_counts[full_counts["manual_gate_withT"] == cell_type]

    # Individual sections on top of the boxes.
    sns.stripplot(
        data=sub,
        x="Timepoint",
        y="proportion",
        hue="Timepoint",
        order=order_timepoints,
        palette=timepoint_palette,
        dodge=False,
        size=5,
        ax=ax,
        linewidth=0.5,
        edgecolor="black",
        legend=False
    )
    ax.set_xlabel("")
    ax.set_ylabel("% of T cells")
    ax.set_title(cell_type)
    ax.set_ylim(0, None)

    # Join the NDMM and PT sections of a patient that has at least two of them.
    for _, grp in sub.groupby("Patient_ID"):
        if grp["Timepoint"].isin(["NDMM", "PT"]).sum() >= 2:
            grp_sorted = grp.sort_values("Timepoint", key=lambda x: x.map({"NDMM": 1, "PT": 2}))
            ax.plot(
                [order_timepoints.index(tp) for tp in grp_sorted["Timepoint"]],
                grp_sorted["proportion"],
                color="gray",
                alpha=0.5,
                linewidth=1
            )

    # Significance annotation for the two timepoint comparisons.
    annotator = Annotator(ax, pairs, data=sub, x="Timepoint", y="proportion", order=order_timepoints)
    annotator.configure(
        test="Mann-Whitney",
        text_format="simple",   # show the p-value as text
        verbose=False
    )
    annotator.apply_and_annotate()

# Titles, legend, layout, export.
g.set_titles("{col_name}")
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("Cell-type proportions by Timepoint", fontsize=16)
g.add_legend(title="Timepoint")
g.fig.savefig("alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots/Tcelltype_boxplots_manual_gate_withT_Mann-Whitney.pdf", dpi=300, bbox_inches="tight")

plt.show()



In [None]:
from scipy import sparse

def plot_marker_by_celltype_withT(merged, marker, cell_type, save_path=None, test_type="t-test_ind",
                                  gate_col="manual_gate_withT"):
    """Box-plot the per-section mean of one marker within one gated cell type.

    Cells whose ``merged.obs[gate_col]`` equals ``cell_type`` are selected, the
    ``marker`` column of ``merged.X`` is averaged per ``section_id``, and the
    section means are shown as a boxplot over Timepoint (Normal / NDMM / PT).
    Sections that share a Patient_ID are joined by a line, and the
    Normal-vs-NDMM and NDMM-vs-PT comparisons are annotated via statannotations.

    Timepoint and Patient_ID are parsed from ``section_id``:
    ids containing "WU" are split on "_" into (patient, timepoint); ids
    containing "SN" are labelled "Normal" and act as their own patient; any
    other id is split on "_" into (timepoint, patient).

    Parameters
    ----------
    merged : AnnData-like
        Object exposing ``obs`` (with ``section_id`` and ``gate_col``),
        ``var_names`` and ``X``.
    marker : str
        Entry of ``merged.var_names`` to plot.
    cell_type : str
        Value of ``gate_col`` selecting the cells.
    save_path : str or None
        If truthy, the figure is saved there (dpi=300).
    test_type : str
        Forwarded to ``Annotator.configure(test=...)``.
    gate_col : str
        Column of ``merged.obs`` holding the gate labels; defaults to the
        CD4/CD8-aware gate column.

    Returns
    -------
    matplotlib Axes holding the plot.

    Raises
    ------
    KeyError
        If ``marker`` is not in ``merged.var_names``.
    ValueError
        If no cell carries the requested ``cell_type``.
    """
    # Boolean mask of the cells gated as `cell_type`.
    mask = (merged.obs[gate_col] == cell_type).to_numpy()
    if not mask.any():
        raise ValueError(f"No cells with {gate_col} == {cell_type!r}; nothing to plot.")

    col_idx = merged.var_names.get_loc(marker)

    # Marker values of the selected cells, densified when X is sparse.
    vals = merged.X[mask, col_idx]
    marker_vals = vals.toarray().ravel() if sparse.issparse(vals) else np.asarray(vals).ravel()

    per_cell = pd.DataFrame({
        'section_id': merged.obs.loc[mask, 'section_id'].to_numpy(),
        'Marker_intensity': marker_vals,
    })

    # One row per section: mean marker intensity of the selected cells.
    avg_df = per_cell.groupby('section_id', observed=True)['Marker_intensity'].mean().reset_index()

    # Derive Timepoint / Patient_ID from the section id (see docstring for the rules).
    sid = avg_df["section_id"]
    is_wu = sid.str.contains("WU")
    is_sn = sid.str.contains("SN")
    chunks = sid.str.split("_")
    avg_df["Timepoint"] = np.select(
        [is_wu, is_sn],
        [chunks.str[1], "Normal"],   # WU: second chunk; SN: fixed label
        default=chunks.str[0],        # otherwise: first chunk
    )
    avg_df["Patient_ID"] = np.select(
        [is_wu, is_sn],
        [chunks.str[0], sid],         # WU: first chunk; SN: the whole section id
        default=chunks.str[1],        # otherwise: second chunk
    )

    sns.set(style="whitegrid", font_scale=1.3)

    # Consistent colors and left-to-right order of the timepoints.
    palette = {"Normal": "forestgreen", "NDMM": "deeppink", "PT": "plum"}
    order = ["Normal", "NDMM", "PT"]

    fig, ax = plt.subplots(figsize=(7, 5))

    # --- Boxplot ---
    sns.boxplot(
        data=avg_df,
        x="Timepoint",
        y="Marker_intensity",
        order=order,
        palette=palette,
        showfliers=False,
        linewidth=1.2,
        ax=ax,
    )

    # --- Add individual points connected by patient ---
    # Boxes sit at x = 0, 1, 2 in `order`; plot the points at those explicit
    # positions instead of passing strings, whose placement would depend on the
    # axis' category mapping rather than on `order`.
    position = {tp: i for i, tp in enumerate(order)}
    shown = avg_df[avg_df["Timepoint"].isin(order)]
    for _, patient_rows in shown.groupby("Patient_ID"):
        patient_rows = patient_rows.sort_values("Timepoint", key=lambda s: s.map(position))
        ax.plot(
            patient_rows["Timepoint"].map(position).to_numpy(),
            patient_rows["Marker_intensity"].to_numpy(),
            marker="o",
            color="black",
            linewidth=1,
            markersize=6,
            alpha=0.9,
        )

    # --- Add significance tests ---
    pairs = [("Normal", "NDMM"), ("NDMM", "PT")]  # groups to compare
    annotator = Annotator(ax, pairs, data=avg_df, x="Timepoint", y="Marker_intensity", order=order)
    annotator.configure(
        test=test_type,          # e.g. "t-test_ind" or "Mann-Whitney"
        text_format="simple",    # show the p-value as text
        verbose=False,
    )
    annotator.apply_and_annotate()

    # --- Customize axes ---
    ax.set_xlabel("Timepoint", fontsize=13)
    ax.set_ylabel(marker, fontsize=13)
    ax.set_title(f"{marker.replace('_intensity', '')} in {cell_type}s", fontsize=15)
    fig.tight_layout()

    if save_path:
        fig.savefig(save_path, dpi=300, bbox_inches="tight")
        print(f"✅ Saved to {save_path}")

    plt.show()

    return ax

In [None]:
# Mann-Whitney boxplots of selected markers within CD8 T cells.
cd8_plot_dir = "alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots"
for marker_prefix in ("Granzyme-B", "PD1", "IFNG"):
    plot_marker_by_celltype_withT(
        merged,
        marker=f"{marker_prefix}_intensity",
        cell_type='CD8 T Cell',
        test_type='Mann-Whitney',
        save_path=f"{cd8_plot_dir}/{marker_prefix}_CD8Tcell_boxplot_Mann-Whitney.pdf",
    )






In [None]:
# Mann-Whitney boxplots of selected markers within CD4 T cells.
cd4_plot_dir = "alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots"
for marker_prefix in ("CTLA4", "FOXP3", "CD4", "IFNG"):
    plot_marker_by_celltype_withT(
        merged,
        marker=f"{marker_prefix}_intensity",
        cell_type='CD4 T Cell',
        test_type='Mann-Whitney',
        save_path=f"{cd4_plot_dir}/{marker_prefix}_CD4Tcell_boxplot_Mann-Whitney.pdf",
    )




In [None]:
# Persist the annotated object and a CSV copy of its per-cell metadata.
merged_out_dir = 'alla_updates/change_rules_order_and_thresholds2_fix_swap'
merged.write(f'{merged_out_dir}/merged.h5ad')
merged.obs.to_csv(f'{merged_out_dir}/merged_metadata.csv', index=True)

In [None]:
# Reload the merged object from the .h5ad file written by the save cell above,
# replacing the in-memory `merged`.
merged = ad.read_h5ad('alla_updates/change_rules_order_and_thresholds2_fix_swap/merged.h5ad')

In [None]:
# Gate labels in display order.
gate_order = [
    "Unassigned", "Unk. CD45+", "Progenitor Cell", "Myeloid", "Plasma Cell",
    "CD4 T Cell", "CD8 T Cell", "B Cell", "Stroma",
]

# Marker panel for the cell-typing plots, grouped by lineage label.
celltyping_markers = dict(
    Imm=['CD45_intensity'],
    Prog=['CD34_intensity', 'cKit_intensity'],
    Mye=['CD11b_intensity', 'CD14_intensity', 'CD68_intensity', 'MPO_intensity'],
    PC=['CD138_intensity', 'CD38_intensity'],
    T=['CD3e_intensity', 'CD4_intensity', 'CD8_intensity', 'Granzyme-B_intensity', 'CD45RO_intensity'],
    B=['CD79a_intensity', 'CD20_intensity', 'CD27_intensity'],
    Stroma=['Vimentin_intensity'],
)


In [None]:
# Heatmap-style matrix plot of the marker panel per gate; each marker is scaled
# across groups (standard_scale='var'), without a dendrogram.
matrixplot_path = "alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots/merged_manual_gate_dotplot_CD4_CD8_heatmap.pdf"
sc.pl.matrixplot(
    merged,
    var_names=celltyping_markers,
    groupby="manual_gate_withT",
    dendrogram=False,
    standard_scale='var',
    figsize=(8, 3.7),
    show=False,
)
plt.savefig(matrixplot_path, bbox_inches="tight")





In [None]:
# Plot the share of each T-cell subtype among the T cells of every section,
# grouped by timepoint (figure is shown, not saved).
df = merged.obs[['section_id', "Timepoint", "Patient_ID", 'manual_gate_withT']].copy()

# Keep T-lineage gates only.
t_gates = ["T Cell", "CD4 T Cell", "CD8 T Cell"]
df_noUnk = df[df["manual_gate_withT"].isin(t_gates)]

# T cells per section and subtype (sample annotations carried along as group keys).
counts = (
    df_noUnk
    .groupby(['section_id', 'Timepoint', 'Patient_ID', 'manual_gate_withT'], observed=True)
    .size()
    .reset_index(name='count')
)

# One metadata row per section.
sample_meta = df[['section_id', 'Timepoint', 'Patient_ID']].drop_duplicates(subset=['section_id'])

# Section x subtype grid over the T-cell subset, so absent subtypes appear as 0.
all_combos = pd.MultiIndex.from_product(
    [df_noUnk['section_id'].unique(), df_noUnk['manual_gate_withT'].unique()],
    names=['section_id', 'manual_gate_withT'],
).to_frame(index=False)

# Attach the observed counts, then the sample annotations; missing counts become 0.
full_counts = all_combos.merge(
    counts[['section_id', 'manual_gate_withT', 'count']],
    on=['section_id', 'manual_gate_withT'],
    how='left',
)
full_counts = full_counts.merge(sample_meta, on='section_id', how='left')
full_counts = full_counts.fillna({'count': 0})

# T cells per section (aligned to the rows) and the percentage per subtype.
totals = full_counts.groupby('section_id', observed=True)['count'].transform('sum')
full_counts['proportion'] = full_counts['count'] * 100 / totals

sns.set(style="whitegrid", font_scale=1.2)
order_timepoints = ["Normal", "NDMM", "PT"]
order_celltypes = sorted(counts["manual_gate_withT"].unique())  # alphabetical panel order
timepoint_palette = {"Normal": "forestgreen", "NDMM": "deeppink", "PT": "plum"}
pairs = [("Normal", "NDMM"), ("NDMM", "PT")]  # timepoint pairs to test

# One box-plot panel per T-cell subtype.
g = sns.catplot(
    data=full_counts,
    x="Timepoint",
    y="proportion",
    hue="Timepoint",
    col="manual_gate_withT",
    kind="box",
    order=order_timepoints,
    col_order=order_celltypes,
    palette=timepoint_palette,
    dodge=False,
    sharey=False,
    col_wrap=3,
    height=4,
    aspect=0.9,
    showfliers=False,
)

for cell_type, ax in g.axes_dict.items():
    sub = full_counts[full_counts["manual_gate_withT"] == cell_type]

    # Individual sections on top of the boxes.
    sns.stripplot(
        data=sub,
        x="Timepoint",
        y="proportion",
        hue="Timepoint",
        order=order_timepoints,
        palette=timepoint_palette,
        dodge=False,
        size=5,
        ax=ax,
        linewidth=0.5,
        edgecolor="black",
        legend=False,
    )
    ax.set_xlabel("")
    ax.set_ylabel("% of T cells")
    ax.set_title(cell_type)
    ax.set_ylim(0, None)

    # Join the NDMM and PT sections of a patient that has at least two of them.
    for _, grp in sub.groupby("Patient_ID"):
        if grp["Timepoint"].isin(["NDMM", "PT"]).sum() < 2:
            continue
        grp_sorted = grp.sort_values("Timepoint", key=lambda s: s.map({"NDMM": 1, "PT": 2}))
        x_positions = [order_timepoints.index(tp) for tp in grp_sorted["Timepoint"]]
        ax.plot(x_positions, grp_sorted["proportion"], color="gray", alpha=0.5, linewidth=1)

    # Significance annotation for the two timepoint comparisons.
    annotator = Annotator(ax, pairs, data=sub, x="Timepoint", y="proportion", order=order_timepoints)
    annotator.configure(test="Mann-Whitney", text_format="simple", verbose=False)
    annotator.apply_and_annotate()

# Titles, legend, layout.
g.set_titles("{col_name}")
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("Cell-type proportions by Timepoint", fontsize=16)
g.add_legend(title="Timepoint")

plt.show()



In [None]:
# Display the `full_counts` table as this cell's output.
full_counts

In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import fdrcorrection

# Mann-Whitney U tests on the per-section proportions in `full_counts`, one
# test per (cell type, timepoint pair), followed by FDR correction over all
# tests that could be run.
pairs = [("Normal", "NDMM"), ("NDMM", "PT")]

results = []

# observed=True: iterate only over the cell types actually present in the table.
for cell_type, sub in full_counts.groupby("manual_gate_withT", observed=True):

    for cond1, cond2 in pairs:

        x = sub.loc[sub["Timepoint"] == cond1, "proportion"]
        y = sub.loc[sub["Timepoint"] == cond2, "proportion"]

        # A comparison needs at least one section in each group.
        if len(x) == 0 or len(y) == 0:
            continue

        # Two-sided Mann-Whitney U test (unpaired).
        stat, pval = mannwhitneyu(x, y, alternative="two-sided")

        results.append({
            "cell_type": cell_type,
            "comparison": f"{cond2} vs {cond1}",
            "cond1": cond1,
            "cond2": cond2,
            "n_cond1": len(x),
            "n_cond2": len(y),
            "stat": stat,
            "p_raw": pval,
        })

# Without any testable comparison there is no "p_raw" column to correct;
# fail with a clear message instead of an opaque KeyError.
if not results:
    raise ValueError("No timepoint comparison had sections in both groups; nothing to test.")

res_df = pd.DataFrame(results)

# FDR correction across all tests (method="indep").
reject, p_fdr = fdrcorrection(res_df["p_raw"], alpha=0.05, method="indep")
res_df["p_fdr"] = p_fdr
res_df["significant_fdr_0.05"] = reject

# Save to CSV
out_path = "alla_updates/change_rules_order_and_thresholds2_fix_swap/merged_plots/Tcelltype_boxplots_manual_gate_withT_Mann-Whitney_FDR_results.csv"
res_df.to_csv(out_path, index=False)

print("Saved results to:", out_path)
print(res_df)