In [None]:
import os
from pathlib import Path
from typing import Annotated

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc
import seaborn as sns
import tifffile

from sklearn.cluster import KMeans
from skimage.color import label2rgb
from sklearn.neighbors import radius_neighbors_graph
from sklearn.neighbors import NearestNeighbors

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
from scipy.stats import wilcoxon
from statsmodels.stats.multitest import multipletests
from scipy.stats import entropy, chi2_contingency
from matplotlib.backends.backend_pdf import PdfPages
from statannotations.Annotator import Annotator

from scipy import sparse

# Cohort timepoints in the order they should be displayed, and the colour
# used for each of them in figures.
collection_order = ["NBM", "NDMM", "PT"]
timecols = {
    "NBM": "#0C7515",
    "NDMM": "#E619B9",
    "PT": "#CF99C3",
}

# Font handling for exported vector figures.
plt.rcParams.update({
    'svg.fonttype': 'none',
    'pdf.fonttype': 42,  # make text editable in pdf
})

# All relative output paths written by this notebook resolve against this
# working directory.
analysis_dir = '/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/analysis/distance_analysis'
os.chdir(analysis_dir)
os.getcwd()  # not displayed: only a cell's last expression is echoed

# Merged AnnData object read from the radial-neighbourhood analysis output.
merged_h5ad_path = "/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/analysis/radial_neighborhoods/Output/merged_RN.h5ad"
merged = sc.read_h5ad(merged_h5ad_path)

In [None]:
def remove_outliers_iqr(df, col="median_dist", k=1.5):
    """Keep only the rows of ``df`` whose ``col`` value lies inside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` (both inclusive),
    where Q1/Q3 are the 25th/75th percentiles of ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter.
    col : str
        Name of the numeric column the fences are computed on.
    k : float
        Fence multiplier applied to the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` within the fences; rows where ``col`` is NaN
        fail both comparisons and are therefore dropped.
    """
    values = df[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    margin = k * (q3 - q1)
    within_fences = values.between(q1 - margin, q3 + margin)
    return df[within_fences]


In [None]:
# Nearest-neighbour distances between every ORDERED pair of cell types,
# computed separately for each sample x timepoint.
adata = merged[merged.obs['ct'] != 'Low Confidence'].copy()
sample_key = "DI_Sample"
time_key   = "Collection"
ct_key     = "ct"
x_key      = "x_centroid"
y_key      = "y_centroid"

# combinations_with_replacement is no longer used in this cell; the import is
# retained in case other cells rely on it.
from itertools import combinations_with_replacement, product
from sklearn.neighbors import KDTree

obs = adata.obs
results = []

# loop over each sample × timepoint combination that actually occurs
group_cols = [sample_key, time_key]
for (sample, tp), sub_idx in obs.groupby(group_cols, observed=True).groups.items():
    sub = obs.loc[sub_idx]

    # skip if too few cells
    if sub.shape[0] < 2:
        continue

    # cell types present in this sample × timepoint
    cts_here = sub[ct_key].dropna().unique().tolist()

    # Extract coordinates and build one KD-tree per cell type up front so each
    # tree is reused for every source type instead of being rebuilt per pair.
    coords_by_ct = {
        ct: sub.loc[sub[ct_key] == ct, [x_key, y_key]].to_numpy(dtype=float)
        for ct in cts_here
    }
    tree_by_ct = {ct: KDTree(xy, leaf_size=40) for ct, xy in coords_by_ct.items()}

    # Nearest-neighbour distance is not symmetric (A→B != B→A), so visit every
    # ordered pair. Unordered pairs would record an arbitrary direction that
    # depends on the order in which cell types appear in each sample.
    for ct1, ct2 in product(cts_here, repeat=2):
        source_coords = coords_by_ct[ct1]
        n1 = int(source_coords.shape[0])
        n2 = int(coords_by_ct[ct2].shape[0])

        record = {
            sample_key: sample,
            time_key: tp,
            "ct_source": ct1,
            "ct_target": ct2,
            "median_dist": np.nan,
            "mean_dist": np.nan,
            "min_dist": np.nan,
            "n_source_cells": n1,
            "n_target_cells": n2,
        }

        if ct1 != ct2:
            # distance from each source cell to its nearest target cell
            dists, _ = tree_by_ct[ct2].query(source_coords, k=1)
            dists = dists.ravel()
        elif n1 >= 2:
            # Self-pair: every cell is in the tree it is queried against, so
            # the first neighbour is the cell itself at distance 0. Take the
            # second neighbour to get the nearest OTHER cell of the same type.
            dists, _ = tree_by_ct[ct2].query(source_coords, k=2)
            dists = dists[:, 1]
        else:
            # a single cell of this type has no same-type neighbour; keep NaN
            dists = None

        if dists is not None:
            record.update(
                median_dist=float(np.median(dists)),
                mean_dist=float(dists.mean()),
                min_dist=float(dists.min()),
            )

        results.append(record)

pairwise_dist_df = pd.DataFrame(results)
pairwise_dist_df

In [None]:
# Prepare the table behind the median-distance heatmaps.
# drop NaN distances (pairs for which no distance could be computed)
df = pairwise_dist_df.dropna(subset=["median_dist"]).copy()

# remove outlier samples separately for each (timepoint, source, target).
# The groups are iterated explicitly instead of using groupby.apply: apply()
# on a function that needs the grouping columns is deprecated in recent
# pandas, and once the grouping columns are excluded the groupby below would
# no longer find them in df_clean.
outlier_group_cols = [time_key, "ct_source", "ct_target"]
kept_per_group = [
    remove_outliers_iqr(grp, col="median_dist", k=1.5)
    for _, grp in df.groupby(outlier_group_cols, observed=True)
]
# pd.concat refuses an empty list, so fall back to an empty frame with the
# same columns when there is nothing to filter.
df_clean = pd.concat(kept_per_group) if kept_per_group else df.iloc[0:0]

# average median distance across remaining samples
avg_df = (
    df_clean
    .groupby([time_key, "ct_source", "ct_target"], observed=True)
    .agg(avg_median_dist=("median_dist", "mean"),
         n_samples=("median_dist", "size"))
    .reset_index()
)
avg_df

In [None]:
# make a symmetric version by averaging A→B and B→A, then draw one clustered
# heatmap per timepoint into a multi-page PDF
from scipy.cluster.hierarchy import linkage

sym = avg_df.copy()

# create an unordered pair key: sort the two labels within each row so that
# (A, B) and (B, A) end up with the same (ct_a, ct_b)
ordered_pairs = np.sort(
    sym[["ct_source", "ct_target"]].astype(str).to_numpy(dtype=object),
    axis=1,
)
sym["ct_a"] = ordered_pairs[:, 0]
sym["ct_b"] = ordered_pairs[:, 1]

sym_df = (
    sym.groupby([time_key, "ct_a", "ct_b"], observed=True)
       .agg(avg_median_dist=("avg_median_dist", "mean"))
       .reset_index()
)

all_cts = sorted(
    pd.concat([avg_df["ct_source"], avg_df["ct_target"]]).unique()
)

# CTs present in symmetric table
sym_cts = sorted(
    pd.concat([sym_df["ct_a"], sym_df["ct_b"]]).unique()
)

# Observed range across all timepoints (not used for the colour scale, which
# is fixed to 0-200 below).
global_min = sym_df["avg_median_dist"].min()
global_max = sym_df["avg_median_dist"].max()

pdf_path_sym = "ct_distance_heatmaps_symmetric_by_timepoint.pdf"

with PdfPages(pdf_path_sym) as pdf:
    for tp, sub in sym_df.groupby(time_key, observed=True):
        # Only cell types observed at this timepoint: a type that is absent
        # would contribute an all-NaN row/column to the matrix.
        tp_cts = sorted(pd.concat([sub["ct_a"], sub["ct_b"]]).unique())
        if len(tp_cts) < 2:
            print(f"Skipping {tp}: fewer than two cell types to cluster")
            continue

        # build a full symmetric matrix
        mat = pd.DataFrame(np.nan, index=tp_cts, columns=tp_cts, dtype=float)
        for a, b, d in zip(sub["ct_a"], sub["ct_b"], sub["avg_median_dist"]):
            mat.loc[a, b] = d
            mat.loc[b, a] = d

        # Hierarchical clustering rejects non-finite values. Pairs without a
        # measurement are filled with the largest observed distance for the
        # clustering only and are masked (left blank) in the drawn heatmap.
        missing = mat.isna()
        filled = mat.fillna(np.nanmax(mat.to_numpy()))
        # same method/metric as clustermap's defaults; the matrix is
        # symmetric, so one linkage serves both rows and columns
        link = linkage(filled.to_numpy(), method="average", metric="euclidean")

        # clustermap is figure-level: it creates its own figure, so the size
        # is passed here rather than through a separate plt.figure() call
        # (which would only leave an empty figure open).
        grid = sns.clustermap(
            filled,
            mask=missing,
            row_linkage=link,
            col_linkage=link,
            figsize=(8, 6),
            annot=False,
            cmap="coolwarm_r",
            cbar_kws={"label": "Avg median distance (µm, symmetric)"},
            vmin=0, vmax=200
        )
        grid.fig.suptitle(f"{tp}: symmetric CT–CT distance", y=1.02)

        # save and close the clustermap's own figure explicitly
        pdf.savefig(grid.fig, bbox_inches="tight")
        plt.close(grid.fig)


In [None]:
# Difference in symmetric CT–CT distance between PT and NDMM, drawn as a
# clustered heatmap
from scipy.cluster.hierarchy import linkage

sub = sym_df[sym_df[time_key].isin(["NDMM", "PT"])].copy()

# pivot to have columns NDMM and PT for each ct_a, ct_b pair
wide = (
    sub.pivot_table(
        index=["ct_a", "ct_b"],
        columns=time_key,
        values="avg_median_dist"
    )
    # guarantee both columns exist even if one timepoint has no rows at all
    .reindex(columns=["NDMM", "PT"])
    # keep only pairs that have both NDMM and PT
    .dropna(subset=["NDMM", "PT"])
    .reset_index()
)

# difference: PT - NDMM (positive = farther in PT)
wide["diff_PT_minus_NDMM"] = wide["PT"] - wide["NDMM"]

# all CTs present
sym_cts = sorted(
    pd.concat([wide["ct_a"], wide["ct_b"]]).unique()
)
if len(sym_cts) < 2:
    raise ValueError(
        "Need at least two cell types with both NDMM and PT distances "
        "to draw the difference heatmap"
    )

# build symmetric matrix of differences
diff_mat = pd.DataFrame(
    np.nan,
    index=sym_cts,
    columns=sym_cts,
    dtype=float
)

for a, b, d in zip(wide["ct_a"], wide["ct_b"], wide["diff_PT_minus_NDMM"]):
    diff_mat.loc[a, b] = d
    diff_mat.loc[b, a] = d   # enforce symmetry

# Largest absolute change (not used for the colour scale, which is fixed to
# ±100 below).
max_abs = np.nanmax(np.abs(diff_mat.values))

# Hierarchical clustering rejects non-finite values. Pairs without a value at
# both timepoints are treated as "no change" (0) for the clustering only and
# are masked (left blank) in the drawn heatmap.
missing = diff_mat.isna()
filled = diff_mat.fillna(0.0)
# same method/metric as clustermap's defaults; the matrix is symmetric, so one
# linkage serves both rows and columns
link = linkage(filled.to_numpy(), method="average", metric="euclidean")

pdf_path = "CT_distance_change_PT_minus_NDMM.pdf"
with PdfPages(pdf_path) as pdf:
    # clustermap is figure-level: it creates its own figure, so the size is
    # passed here rather than through a separate plt.figure() call.
    grid = sns.clustermap(
        filled,
        mask=missing,
        row_linkage=link,
        col_linkage=link,
        figsize=(8, 6),
        cmap="coolwarm_r",
        center=0,
        vmin=-100,
        vmax=100,
        cbar_kws={"label": "Δ distance (PT − NDMM, µm)"}
    )
    grid.fig.suptitle("Change in symmetric CT–CT distance (PT − NDMM)", y=1.02)
    # save and close the clustermap's own figure explicitly
    pdf.savefig(grid.fig, bbox_inches="tight")
    plt.close(grid.fig)

print("Saved:", pdf_path)

In [None]:
# For every UPN × Collection, distance from each ligand cell to its nearest
# receptor cell of each receptor type.
from sklearn.neighbors import KDTree

group_keys = ["UPN", "Collection"]
receptor_types = ["Early B", "Mature B", "PC"]

results = []

# NOTE(review): `mye` and `bpc` are not defined anywhere in the visible part
# of this notebook — confirm they are created before this cell so that
# Restart & Run All works.
lig_adata=mye.copy()
rec_adata=bpc.copy()

lig_obs = lig_adata.obs
rec_obs = rec_adata.obs

# all UPN+Collection combos seen in either ligands or receptors
lig_groups = set(zip(lig_obs["UPN"], lig_obs["Collection"]))
rec_groups = set(zip(rec_obs["UPN"], rec_obs["Collection"]))
all_groups = sorted(lig_groups.union(rec_groups))

for upn, coll in all_groups:
    lig_mask = (lig_obs["UPN"] == upn) & (lig_obs["Collection"] == coll)
    n_lig = int(lig_mask.sum())

    lig_coords = None
    if n_lig > 0:
        lig_coords = np.c_[
            lig_obs.loc[lig_mask, "x_centroid"].to_numpy(),
            lig_obs.loc[lig_mask, "y_centroid"].to_numpy()
        ]

    # receptor cells of this sample, shared by all receptor types below
    rec_in_group = (rec_obs["UPN"] == upn) & (rec_obs["Collection"] == coll)

    for rtype in receptor_types:
        rec_mask = rec_in_group & (rec_obs["ct"] == rtype)
        # Always count the receptors, even when the sample has no ligand
        # cells, so n_receptor_cells reports the real number instead of 0.
        n_rec = int(rec_mask.sum())

        record = {
            "UPN": upn,
            "Collection": coll,
            "receptor_type": rtype,
            "min_dist": np.nan,
            "mean_dist": np.nan,
            "median_dist": np.nan,
            "n_ligand_cells": n_lig,
            "n_receptor_cells": n_rec,
        }

        # distances are only defined when both sides have at least one cell
        if n_lig > 0 and n_rec > 0:
            rec_coords = np.c_[
                rec_obs.loc[rec_mask, "x_centroid"].to_numpy(),
                rec_obs.loc[rec_mask, "y_centroid"].to_numpy()
            ]

            # KDTree of this receptor type
            tree = KDTree(rec_coords, leaf_size=40)
            # nearest receptor of this type for each ligand cell
            dists, _ = tree.query(lig_coords, k=1)
            dists = dists.ravel()

            record.update(
                min_dist=float(dists.min()),
                mean_dist=float(dists.mean()),
                median_dist=float(np.median(dists)),
            )

        results.append(record)

# explicit columns keep the schema stable even if no group produced a row
dist_df = pd.DataFrame(
    results,
    columns=[
        "UPN", "Collection", "receptor_type",
        "min_dist", "mean_dist", "median_dist",
        "n_ligand_cells", "n_receptor_cells",
    ],
)