In [None]:
import os
from pathlib import Path
from typing import Annotated

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc
import seaborn as sns
import tifffile

from sklearn.cluster import KMeans
from skimage.color import label2rgb
from sklearn.neighbors import radius_neighbors_graph
from sklearn.neighbors import NearestNeighbors

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
from scipy.stats import wilcoxon
from statsmodels.stats.multitest import multipletests
from scipy.stats import entropy, chi2_contingency
from matplotlib.backends.backend_pdf import PdfPages
from statannotations.Annotator import Annotator

from scipy import sparse

# Keep text as real text (not outlines) in exported vector figures.
plt.rcParams.update({
    'svg.fonttype': 'none',
    'pdf.fonttype': 42,  # make text editable in pdf
})

# Figures are written under Plots/; the directory is created on first run.
plot_dir = Path('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/analysis/radial_neighborhoods/Plots')
plot_dir.mkdir(parents=True, exist_ok=True)

# Work from the analysis directory; the trailing expression shows it as the cell output.
os.chdir('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/analysis/radial_neighborhoods/')
os.getcwd()

In [None]:
# Load the merged AnnData object; the rest of the notebook reads its per-cell table `merged.obs`.
merged = sc.read_h5ad("/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/analysis/radial_neighborhoods/Output/merged_RN.h5ad")

In [None]:
#merged.obs.to_csv('Output/merged_RN_metadata.tsv', sep="\t", index=True)

In [None]:
# Lookup from DI_Sample label to Collection, built from the per-cell rows of merged.obs
# (each label occurs on many rows; the dict keeps a single value per label).
di_sample_to_collection = merged.obs.set_index('DI_Sample')['Collection'].to_dict()

In [None]:
# Fixed hex colour for every radial neighborhood (RN) label found in the "rn" column.
# The trailing comments are the author's shorthand annotation of each neighborhood.
neighborhood_colors = {
    "RN1":  "#079450",  # later granulo
    "RN2":  "#9e9e9e", # erythroid
    "RN3":  "#ff42ca", # PC
    "RN4": "#00ff1e", # early granulo/mye
    "RN5": "#241717", # MKC
    "RN6":  '#00ba9e', # other myelo (cDC, ba/eo/ma, low confidence)
    "RN7":  "#00f7ff",  # early B and myelo
    "RN8":  "#b50d0d",  # cytotoxic T NK
    "RN9":  "#de9835",  # endothelial
    "RN10": "#c6db02",  # HSPC
    "RN11": "#7875ff",  # lymphoid
    "RN12": "#fabc02",  # fibro/osteo
    "RN13": "#735b2e",  # pericyte
    'Unassigned': '#FFFFFF'
}

# Colour per Collection value; passed as the hue palette of the Collection boxplots below.
timecols = {"NBM": "#0C7515", "NDMM": "#E619B9", "PT": "#CF99C3"}

In [None]:
# Display the per-cell metadata table as the cell output for a quick visual check.
merged.obs

In [None]:
# Write one CSV per sample with the columns cell_id, group (RN label) and
# color (hex value looked up in neighborhood_colors).
# NOTE(review): assumes merged.obs already carries an "Original_Barcode" column; the
# statement that derived it from obs_names was commented out in this cell — confirm.
sids = sorted(set(merged.obs['Sample']))

outdir = Path('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/analysis/radial_neighborhoods/Output/annotations_rn')
# to_csv does not create missing parent directories, so make sure the target exists.
outdir.mkdir(parents=True, exist_ok=True)

for samp in sids:
    sample_annot = (
        merged.obs.loc[merged.obs["Sample"] == samp, ["Original_Barcode", "rn"]]
        .rename(columns={"Original_Barcode": "cell_id", "rn": "group"})
    )
    sample_annot['color'] = sample_annot['group'].map(neighborhood_colors)
    sample_annot.to_csv(outdir / f"{samp}_rn.csv", index=False)

In [None]:
def remove_outliers_iqr_grouped(df, group_col, col='frac', k=1.5):
    """Drop rows whose ``col`` value is an IQR outlier within its own group.

    For each group defined by ``group_col`` the 25th and 75th percentiles of
    ``col`` are computed, and rows outside ``[q1 - k*iqr, q3 + k*iqr]``
    (bounds inclusive) are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``group_col`` and ``col``.
    group_col : label or list of labels
        Column(s) defining the groups.
    col : label, default 'frac'
        Numeric column the fences are computed on.
    k : float, default 1.5
        Fence multiplier applied to the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        The surviving rows in their original order, with every column of
        ``df`` (including ``group_col``) retained. Rows whose ``col`` value
        is missing never pass the comparison and are dropped.
    """
    if df.empty:
        return df.copy()

    # Broadcast per-group quartiles back onto the rows instead of using
    # groupby.apply on the whole frame: apply-with-grouping-columns is
    # deprecated in recent pandas and would eventually drop `group_col`.
    per_group = df.groupby(group_col, observed=True)[col]
    q1 = per_group.transform(lambda values: values.quantile(0.25))
    q3 = per_group.transform(lambda values: values.quantile(0.75))
    iqr = q3 - q1

    keep = df[col].between(q1 - k * iqr, q3 + k * iqr)
    # Positional mask so duplicate index labels cannot cause misalignment.
    return df[keep.to_numpy()]

def remove_outliers_iqr(df, col='frac', k=1.5):
    """Return the rows of ``df`` whose ``col`` value lies inside the IQR fences.

    The fences are ``q1 - k*iqr`` and ``q3 + k*iqr`` (both inclusive), with the
    quartiles taken over the whole ``col`` column, i.e. pooled across any
    groups present in ``df``.
    """
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    inside = df[col].between(q1 - k * iqr, q3 + k * iqr)
    return df[inside]

def p_to_stars(p):
    """Translate a p-value into a star label.

    Returns "****" below 0.0001, "***" below 0.001, "**" below 0.01,
    "*" below 0.05 and "ns" otherwise (all thresholds are strict).
    """
    thresholds = (
        (0.0001, "****"),
        (0.001, "***"),
        (0.01, "**"),
        (0.05, "*"),
    )
    for cutoff, stars in thresholds:
        if p < cutoff:
            return stars
    return "ns"   # not significant
# Hue order of the Collection groups; read as a module-level name by plot_fraction_boxplot.
collection_order= ['NBM', 'NDMM', 'PT']

def plot_fraction_boxplot(df_allpts, title, pdf, min_width=4, drop_outliers=True):
    """Draw per-RN fraction boxplots split by Collection and add the page to ``pdf``.

    Parameters
    ----------
    df_allpts : pandas.DataFrame
        Table with the columns "rn", "frac" and "Collection".
    title : str
        Axes title.
    pdf : object with a ``savefig(fig, ...)`` method
        The notebook passes an open ``PdfPages``; one page is appended.
    min_width : number, default 4
        Lower bound for the figure width; otherwise the width is the number
        of distinct "rn" values minus one.
    drop_outliers : bool, default True
        When True (the previous, hard-wired behaviour) the plotted rows are
        first filtered with ``remove_outliers_iqr``, whose fences are computed
        on the pooled "frac" column of the whole table. Pass False to plot
        every row.

    Notes
    -----
    The Mann-Whitney tests (Benjamini-Hochberg corrected) are computed by
    statannotations from ``df_allpts``, i.e. from the unfiltered table, even
    when the drawn boxes use the filtered rows.
    Relies on the module-level names ``collection_order``, ``timecols`` and
    ``remove_outliers_iqr``.
    """
    width = len(df_allpts['rn'].unique())
    fig, ax = plt.subplots(figsize=(max(min_width, width - 1), 5))

    df = remove_outliers_iqr(df_allpts) if drop_outliers else df_allpts

    sns.boxplot(
        data=df,
        x="rn", y="frac",
        hue="Collection",
        hue_order=collection_order,
        palette=timecols,
        fliersize=0, linewidth=1, ax=ax
    )

    sns.stripplot(
        data=df,
        x="rn", y="frac",
        hue="Collection",
        hue_order=collection_order,
        dodge=True, alpha=1, size=2,
        palette="dark:black", ax=ax
    )

    # The strip layer adds a second set of Collection entries; keep the first set only.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[:len(collection_order)],
              labels[:len(collection_order)],
              title="Collection",
              bbox_to_anchor=(1.05, 1), loc="upper left", frameon=False)

    # Significance pairs: only compare Collections that actually have values for
    # this RN in the table handed to the Annotator, so that a missing group
    # cannot make the annotation step fail.
    tested = df_allpts.dropna(subset=["frac"])
    combos = [("NBM", "NDMM"), ("NDMM", "PT"), ("NBM", "PT")]
    pairs = []
    for rn in df["rn"].unique():
        present = set(tested.loc[tested["rn"] == rn, "Collection"].unique())
        pairs.extend(
            ((rn, first), (rn, second))
            for first, second in combos
            if first in present and second in present
        )

    if pairs:
        annotator = Annotator(
            ax, pairs, data=df_allpts, x="rn", y="frac",
            hue="Collection", hue_order=collection_order
        )
        annotator.configure(
            test="Mann-Whitney", text_format="star", loc="inside",
            comparisons_correction="BH", hide_non_significant=True, verbose=0
        )
        annotator.apply_and_annotate()

    ax.set_ylabel("Fraction of cells")
    ax.set_xlabel("RN")
    ax.set_title(title)
    # Rotate on this Axes (not via pyplot's current-axes state) and before the
    # layout pass so the rotated labels are accounted for.
    ax.tick_params(axis="x", labelrotation=90)
    sns.despine(ax=ax)
    fig.tight_layout()
    pdf.savefig(fig, bbox_inches="tight")
    plt.close(fig)

In [None]:
# UPNs that appear in more than one Collection.
collections_per_upn = merged.obs.groupby("UPN", observed=True)["Collection"].nunique()
multi_upns = collections_per_upn[collections_per_upn > 1].index.tolist()

print(f"UPNs with multiple collections: {multi_upns}")

# Colour per UPN; the trailing comments record the author's clinical notes for each patient.
upn_palette = {
    "WU007": "#fa37b9",  # MRD+
    "WU025": "#fa37b9",  # MRD+
    "WU030": "#fa37b9",  # MRD+
    "WU043": "#000000",  # PC burden decreased after transplant; MRD-
    "WU050": "#fa37b9",  # PC burden decreased after transplant; MRD+
    "WU066": "#fa37b9",  # PC burden decreased after transplant; MRD+
    "WU068": "#c4c4c4",  # PC burden decreased after transplant; MRD unk
    "WU107": "#fa37b9",  # PC burden decreased after transplant; MRD+
}

In [None]:
obs = merged.obs.copy()

collection_order = ["NBM", "NDMM", "PT"]

# Per-(UPN, Collection) fraction of cells assigned to each RN.
# observed=False keeps every RN category, so an RN with no cells in a sample
# gets an explicit count of 0 instead of a missing row.
counts = (
    obs.groupby(["UPN", "Collection", "rn"], observed=False)
        .size()
        .reset_index(name="n_cells")
)

totals = (
    counts.groupby(["UPN", "Collection"], observed=True)["n_cells"]
          .sum()
          .reset_index(name="total_cells")
)

frac_df = counts.merge(totals, on=["UPN", "Collection"])

# With categorical keys, observed=False also emits UPN x Collection combinations
# that never occur. They have total_cells == 0 and would only contribute
# 0/0 = NaN fractions, so drop them before dividing.
frac_df = frac_df[frac_df["total_cells"] > 0].reset_index(drop=True)
frac_df["frac"] = frac_df["n_cells"] / frac_df["total_cells"]


# Order the RNs by their median fraction in PT samples (most abundant first).
pt = frac_df[frac_df['Collection'] == 'PT']
median_pt = (
    pt.groupby('rn', observed=False)['frac']
        .median()
        .sort_values(ascending=False)
)
rn_order = median_pt.index.tolist()


frac_df["rn"] = pd.Categorical(
    frac_df["rn"],
    categories=rn_order,
    ordered=True
)

median_pt




In [None]:
# Display the per-UPN / Collection / RN fraction table as the cell output.
frac_df

In [None]:
# Split the fraction table into four panels of RNs. After filtering, unused
# categories are removed so that only the selected RNs remain as categories.
high1 = (
    frac_df.loc[frac_df['rn'].isin(['RN7', 'RN1', 'RN2'])]
    .assign(rn=lambda sub: sub['rn'].cat.remove_unused_categories())
)
high2 = (
    frac_df.loc[frac_df['rn'].isin(['RN9', 'RN6', 'RN4', 'RN5', 'RN10'])]
    .assign(rn=lambda sub: sub['rn'].cat.remove_unused_categories())
)
low = (
    frac_df.loc[frac_df['rn'].isin(['RN13', 'RN12','RN11'])]
    .assign(rn=lambda sub: sub['rn'].cat.remove_unused_categories())
)
pc = (
    frac_df.loc[frac_df['rn'].isin(['RN3', 'RN8'])]
    .assign(rn=lambda sub: sub['rn'].cat.remove_unused_categories())
)

# One PDF page per panel, in this order.
pdf_out = "RN_abundance_boxplots.pdf"
panels = [
    (high1, "High-abundance Radial Neighborhoods", 4),
    (high2, "High-abundance Radial Neighborhoods", 5),
    (low, "Low-abundance Radial Neighborhoods", 4),
    (pc, "PC-enriched Radial Neighborhoods", 3),
]
with PdfPages(plot_dir / pdf_out) as pdf:
    for panel_df, panel_title, panel_min_width in panels:
        plot_fraction_boxplot(panel_df, panel_title, pdf, min_width=panel_min_width)

In [None]:
# Same panels as a three-page PDF (all high-abundance RNs on one page).
# NOTE(review): the output name says "inclOutliers", but plot_fraction_boxplot is
# called exactly as in the previous cell and, as defined in this notebook, filters
# its input with remove_outliers_iqr before plotting — confirm whether outliers
# were meant to be kept here.
# NOTE(review): RN2 is in none of the three subsets below, although the previous
# cell plots it — confirm this omission is intended.
high = (
    frac_df.loc[frac_df['rn'].isin(['RN7', 'RN1', 'RN9', 'RN6', 'RN4', 'RN5', 'RN10'])]
    .assign(rn=lambda sub: sub['rn'].cat.remove_unused_categories())
)
low = (
    frac_df.loc[frac_df['rn'].isin(['RN13', 'RN12','RN11'])]
    .assign(rn=lambda sub: sub['rn'].cat.remove_unused_categories())
)
pc = (
    frac_df.loc[frac_df['rn'].isin(['RN3', 'RN8'])]
    .assign(rn=lambda sub: sub['rn'].cat.remove_unused_categories())
)

pdf_out = "RN_abundance_boxplots_inclOutliers.pdf"
panels = [
    (high, "High-abundance Radial Neighborhoods", {}),
    (low, "Low-abundance Radial Neighborhoods", {"min_width": 4}),
    (pc, "PC-enriched Radial Neighborhoods", {"min_width": 3}),
]
with PdfPages(plot_dir / pdf_out) as pdf:
    for panel_df, panel_title, panel_kwargs in panels:
        plot_fraction_boxplot(panel_df, panel_title, pdf, **panel_kwargs)

In [None]:
df = merged.obs.copy()

# Collection pairs compared within each RN.
combos = [("NBM", "NDMM"), ("NDMM", "PT"), ("NBM", "PT")]

# One page per cell type: for each UPN/Collection, the share of that cell type's
# cells falling in each RN, drawn as boxplots split by Collection.
# Uses rn_order and collection_order defined in earlier cells.
with PdfPages(plot_dir / "ct_proportions_per_rn_boxplots.pdf") as pdf:
    for celltype in df['ct'].unique():
        b = df[df['ct'] == celltype].copy()

        counts = (
            b.groupby(['UPN', 'Collection', 'rn'], observed=False)
              .size()
              .reset_index(name='n')
        )

        totals = (
            counts.groupby(['UPN', 'Collection'], observed=True)['n']
                   .sum()
                   .reset_index(name='total')
        )

        counts = counts.merge(totals, on=['UPN', 'Collection'])
        # UPN/Collection combinations without any cell of this type have
        # total == 0, so their prop is NaN.
        counts['prop'] = counts['n'] / counts['total']

        fig, ax = plt.subplots(figsize=(8,4))
        # Draw with the same x order and hue order that the Annotator is given
        # below; otherwise the significance brackets are positioned for a
        # different box layout than the one on the axes.
        sns.boxplot(
            data=counts,
            x='rn',
            y='prop',
            order=rn_order,
            hue_order=collection_order,
            palette=timecols,
            fliersize=0,
            hue='Collection',
            ax=ax
        )
        sns.stripplot(
            data=counts,
            x='rn',
            y='prop',
            hue='Collection',
            order=rn_order,
            hue_order=collection_order,
            dodge=True,
            palette='dark:black',
            alpha=1,
            size=3,
            linewidth=0,
            ax=ax
        )

        # Only test Collection pairs that both have at least one non-missing
        # proportion for this RN.
        with_values = counts.dropna(subset=["prop"])
        pairs = []
        for rn in rn_order:
            available = set(with_values.loc[with_values["rn"] == rn, "Collection"].unique())
            for c1, c2 in combos:
                if {c1, c2}.issubset(available):
                    pairs.append(((rn, c1), (rn, c2)))

        if pairs:
            annotator = Annotator(
                ax,
                pairs,
                data=counts,
                x="rn",
                y="prop",
                hue="Collection",
                order=rn_order,
                hue_order=collection_order,
            )
            annotator.configure(
                test='Mann-Whitney',
                text_format='star',
                loc='inside',
                comparisons_correction='BH',
                hide_non_significant=True,
                verbose=1
            )
            annotator.apply_and_annotate()

        sns.despine(ax=ax)
        ax.set_ylabel(f"Proportion of {celltype}")
        ax.set_title(f"Radial neighborhood membership of {celltype} (per UPN)")

        # Replace the legend with an empty one (box and strip layers both add entries).
        ax.legend([], [], frameon=False)

        pdf.savefig(fig, bbox_inches="tight")
        plt.close(fig)


In [None]:
# For every cell type, record the RN with the highest mean proportion of that
# cell type across PT rows (None when the cell type has no PT row).
# Reads `df`, the copy of merged.obs created in the preceding cell.
rn_best_by_ct = {}

for celltype in df["ct"].unique():
    # isolate this cell type
    b = df[df["ct"] == celltype].copy()

    # per-UPN/Collection proportion of this cell type's cells in each RN
    counts = (
        b.groupby(["UPN", "Collection", "rn"], observed=False)
          .size()
          .reset_index(name="n")
    )
    # observed=True matches the other cells in this notebook and avoids relying
    # on the changing pandas default for categorical keys.
    totals = (
        counts.groupby(["UPN", "Collection"], observed=True)["n"]
              .sum()
              .reset_index(name="total")
    )
    counts = counts.merge(totals, on=["UPN", "Collection"])
    counts["prop"] = counts["n"] / counts["total"]

    # pivot to wide table: rows = UPN/Collection, columns = RN
    plotdf = counts.pivot_table(
        index=["UPN", "Collection"],
        columns="rn",
        values="prop",
        fill_value=0,
    ).reset_index()

    rn_cols = [c for c in plotdf.columns
               if c not in ["UPN", "Collection"]]

    pt_only = plotdf[plotdf["Collection"] == "PT"][rn_cols]

    # RN with the highest average proportion in PT
    rn_best_by_ct[celltype] = pt_only.mean().idxmax() if len(pt_only) > 0 else None

rn_best_by_ct

In [None]:
collection_order = ["NBM", "NDMM", "PT"]

# One DI_Sample label per (UPN, Collection). It does not depend on the cell
# type, so it is built once instead of on every loop iteration.
# NOTE(review): if a UPN has more than one DI_Sample within a Collection,
# drop_duplicates keeps the first one encountered — confirm this is acceptable.
sample_map = (
    df[["UPN", "Collection", "DI_Sample"]]
    .drop_duplicates(subset=["UPN", "Collection"])
    .set_index(["UPN", "Collection"])["DI_Sample"]
)

# One page per cell type: stacked bars of RN membership for each UPN/Collection.
# Reads `df` and `rn_best_by_ct` created in the preceding cells.
with PdfPages(plot_dir / "ct_rn_distribution_stackedbar.pdf") as pdf:
    for celltype in df["ct"].unique():
        b = df[df["ct"] == celltype].copy()

        # compute proportions
        counts = (
            b.groupby(["UPN", "Collection", "rn"], observed=False)
              .size()
              .reset_index(name="n")
        )

        totals = (
            counts.groupby(["UPN", "Collection"], observed=True)["n"]
                  .sum()
                  .reset_index(name="total")
        )

        counts = counts.merge(totals, on=["UPN", "Collection"])
        counts["prop"] = counts["n"] / counts["total"]

        # pivot to wide
        plotdf = counts.pivot_table(
            index=["UPN", "Collection"],
            columns="rn",
            values="prop",
            fill_value=0,
        ).reset_index()

        # Sort by Collection, then (descending) by the RN that dominates this
        # cell type in PT. That RN can be None or absent from the table, in
        # which case only the Collection order is applied.
        plotdf["Collection"] = pd.Categorical(plotdf["Collection"], categories=collection_order, ordered=True)
        rn_best = rn_best_by_ct.get(celltype)
        sort_cols, sort_ascending = ["Collection"], [True]
        if rn_best is not None and rn_best in plotdf.columns:
            sort_cols.append(rn_best)
            sort_ascending.append(False)
        plotdf = plotdf.sort_values(sort_cols, ascending=sort_ascending)

        # add DI_Sample as a label column (fall back to "<UPN>_<Collection>")
        plotdf["DI_Sample"] = [
            sample_map.loc[(u, c)] if (u, c) in sample_map.index else f"{u}_{c}"
            for u, c in zip(plotdf["UPN"], plotdf["Collection"])
        ]

        plotdf["DI_Sample"] = plotdf["DI_Sample"].astype(str)

        # stacked barplot
        fig, ax = plt.subplots(figsize=(10, 4))

        # get the RN columns in order
        rn_cols = [c for c in plotdf.columns if c not in ["UPN", "Collection", "DI_Sample"]]

        # Position bars by row number so that repeated labels cannot collapse
        # onto one x position; the labels are attached as tick labels below.
        positions = np.arange(len(plotdf))
        bottom = np.zeros(len(plotdf))
        for rn in rn_cols:
            heights = plotdf[rn].to_numpy()
            ax.bar(
                positions,
                heights,
                bottom=bottom,
                color=neighborhood_colors.get(rn),  # None -> matplotlib default colour
                label=rn,
                edgecolor="none",
                width=1.0,
            )
            bottom = bottom + heights

        ax.set_ylabel(f"Proportion of {celltype} cells")
        ax.set_xlabel("")
        ax.set_title(f"Radial neighborhood membership of {celltype} (per UPN)")
        ax.set_xticks(positions)
        ax.set_xticklabels(plotdf["DI_Sample"], rotation=90, ha="center")

        # draw separators between collections
        coll_changes = plotdf["Collection"].ne(plotdf["Collection"].shift()).to_numpy()
        for boundary in np.where(coll_changes)[0][1:]:  # skip first
            ax.axvline(boundary - 0.5, color="black", lw=0.6, alpha=0.4)

        # legend
        ax.legend(
            title="Radial neighborhood",
            bbox_to_anchor=(1.05, 1),
            loc="upper left",
            frameon=False,
        )

        sns.despine(ax=ax)
        fig.tight_layout()
        pdf.savefig(fig, bbox_inches="tight")
        plt.close(fig)

In [None]:
# look at high PC% with different RN composition

hs = ['S16-26183A1U1', 'S14-18864-A1U1', 'S24-19138A1U1', 'S14-11561A1U1', 'S19-25371-A1U1', 'S15-15758-A1U1', 'S17-32736-A1U1']
obs = merged.obs.copy()
# Sample id -> DI_Sample label used as the plot title.
di_dict = obs.set_index('Sample')['DI_Sample'].to_dict()

# One page per selected sample: cell centroids coloured by RN.
with PdfPages(plot_dir / "high_PC_RN_scatterplots.pdf") as pdf:
    for sid in hs:
        di = di_dict[sid]
        subset = obs.loc[obs['Sample'] == sid].copy()

        fig, ax = plt.subplots(figsize=(8, 8))
        sns.scatterplot(
            data=subset,
            x='x_centroid',
            y='y_centroid',
            hue='rn',
            palette=neighborhood_colors,
            s=0.5,
            legend=False,
            ax=ax,
        )
        ax.axis('equal')
        ax.set_title(di)
        # Rasterize the point layer so the PDF does not store every cell as a vector object.
        ax.collections[0].set_rasterized(True)
        fig.tight_layout()
        pdf.savefig(fig, bbox_inches='tight')
        plt.close(fig)

In [None]:
# Fixed hex colour per cell-type label found in the "ct" column; used for the
# cell-type composition pies below.
ct_palette = {
    "HSPC": "#d6e376",
    "Erythroid": "#cfcfcf" ,
    "Megakaryocyte": "#8f8f8f",
    "GMP": "#88cf46",
    "Late Myeloid": "#4ab300",
    "Neutrophil": "#95ad74",
    "Ba/Eo/Ma": "#618038",
    "cDC": "#3bff8c",
    "Monocyte": "#3dd49f",
    "Macrophage": "#03ab70" ,
    "pDC": "#a5c3c4",
    "CD4 T": "#ff8400",
    "CD8 T": "#ff0000",
    "NK": "#9302d1",
    "Early B": "#7cb2e6" ,
    "Mature B": "#045eb5",
    "PC": "#ffbafd",
    "MSC": "#cfc10a",
    "Fibro/Osteo": "#ba9e00",
    "Adipocyte": "#ffe600",
    "Endothelial": "#cc7e7e",
    "vSMC/Pericyte": "#ad4b8e",
    "Low Confidence": "#FFFFFF"
}

In [None]:
# Cells per (Sample, RN); observed=True lists only combinations that occur.
rn_counts = obs.groupby(["Sample", "rn"], observed=True).size().reset_index(name="count")

# One pie per selected sample showing its RN composition (with percentages).
with PdfPages(plot_dir / "high_PC_RN_pies.pdf") as pdf:
    for sid in hs:
        di = di_dict[sid]
        sub = rn_counts.loc[rn_counts['Sample'] == sid].copy()
        colors = [neighborhood_colors[str(label)] for label in sub["rn"]]

        fig, ax = plt.subplots(figsize=(4, 4))
        ax.pie(
            sub["count"],
            labels=sub["rn"],
            colors=colors,
            autopct="%.1f%%",
            startangle=90,
        )
        ax.set_title(f"RN composition for {di}")
        fig.tight_layout()
        pdf.savefig(fig, bbox_inches="tight")
        plt.close(fig)

# Cells per (Sample, cell type).
ct_counts = obs.groupby(["Sample", "ct"], observed=True).size().reset_index(name="count")

# One pie per selected sample showing its cell-type composition (no percentage labels).
with PdfPages(plot_dir / "high_PC_CT_pies.pdf") as pdf:
    for sid in hs:
        di = di_dict[sid]
        sub = ct_counts.loc[ct_counts['Sample'] == sid].copy()
        colors = [ct_palette[str(label)] for label in sub["ct"]]

        fig, ax = plt.subplots(figsize=(4, 4))
        ax.pie(
            sub["count"],
            labels=sub["ct"],
            colors=colors,
            startangle=90,
        )
        ax.set_title(f"CT composition for {di}")
        fig.tight_layout()
        pdf.savefig(fig, bbox_inches="tight")
        plt.close(fig)


In [None]:
# look at MRD+ PT with different RN composition

hs = ['SP110U1-A1-S1Fp1U1', 'SP113U1-A1-S1Fp1U1', 'SP114U1-A1-S1Fp1U1', 'SP118U1-A1-S1Fp1U1Q1', 'SP120U1-A1-S1Fp1U1', 'SP126U1-A1-S1Fp1U1']
obs = merged.obs.copy()
# Sample id -> DI_Sample label used as the plot title.
di_dict = obs.set_index('Sample')['DI_Sample'].to_dict()

# One page per selected sample: cell centroids coloured by RN.
with PdfPages(plot_dir / "MRDpos_PT_RN_scatterplots.pdf") as pdf:
    for sid in hs:
        di = di_dict[sid]
        subset = obs.loc[obs['Sample'] == sid].copy()

        fig, ax = plt.subplots(figsize=(8, 8))
        sns.scatterplot(
            data=subset,
            x='x_centroid',
            y='y_centroid',
            hue='rn',
            palette=neighborhood_colors,
            s=1,
            legend=False,
            ax=ax,
        )
        ax.axis('equal')
        ax.set_title(di)
        # Rasterize the point layer so the PDF does not store every cell as a vector object.
        ax.collections[0].set_rasterized(True)
        fig.tight_layout()
        pdf.savefig(fig, bbox_inches='tight')
        plt.close(fig)

In [None]:
# Same MRD+ PT samples as the scatterplot cell above; reads `obs` and `di_dict` from it.
hs = ['SP110U1-A1-S1Fp1U1', 'SP113U1-A1-S1Fp1U1', 'SP114U1-A1-S1Fp1U1', 'SP118U1-A1-S1Fp1U1Q1', 'SP120U1-A1-S1Fp1U1', 'SP126U1-A1-S1Fp1U1']

# Cells per (Sample, RN); observed=True lists only combinations that occur.
rn_counts = obs.groupby(["Sample", "rn"], observed=True).size().reset_index(name="count")

# One pie per selected sample showing its RN composition (with percentages).
with PdfPages(plot_dir / "MRDpos_PT_RN_pies.pdf") as pdf:
    for sid in hs:
        di = di_dict[sid]
        sub = rn_counts.loc[rn_counts['Sample'] == sid].copy()
        colors = [neighborhood_colors[str(label)] for label in sub["rn"]]

        fig, ax = plt.subplots(figsize=(4, 4))
        ax.pie(
            sub["count"],
            labels=sub["rn"],
            colors=colors,
            autopct="%.1f%%",
            startangle=90,
        )
        ax.set_title(f"RN composition for {di}")
        fig.tight_layout()
        pdf.savefig(fig, bbox_inches="tight")
        plt.close(fig)

# Cells per (Sample, cell type).
ct_counts = obs.groupby(["Sample", "ct"], observed=True).size().reset_index(name="count")

# One pie per selected sample showing its cell-type composition (no percentage labels).
with PdfPages(plot_dir / "MRDpos_PT_CT_pies.pdf") as pdf:
    for sid in hs:
        di = di_dict[sid]
        sub = ct_counts.loc[ct_counts['Sample'] == sid].copy()
        colors = [ct_palette[str(label)] for label in sub["ct"]]

        fig, ax = plt.subplots(figsize=(4, 4))
        ax.pie(
            sub["count"],
            labels=sub["ct"],
            colors=colors,
            startangle=90,
        )
        ax.set_title(f"CT composition for {di}")
        fig.tight_layout()
        pdf.savefig(fig, bbox_inches="tight")
        plt.close(fig)
