In [1]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import squidpy as sq
from scipy import sparse

import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from scipy.spatial import cKDTree

import seaborn as sns

In [None]:
# Read the processed integration object from disk; every later cell works on this `adata`.
adata = sc.read_h5ad("../data/RREAE_5k_raw_integration_processed.h5ad")

In [None]:
def lesions_by_dbscan_and_proximity_exclude_ependymal(
    adata,
    coord_x="global_x",
    coord_y="global_y",
    per_sample_col="sample_id",
    celltype_col="cell_type",            # column with cell type labels
    ependymal_label="Ependymal",         # how ependymal cells are labeled
    eps_um=27,                           # microns for DBSCAN neighborhood
    min_samples=25,                      # minimum cells in cluster
    lesion_col="lesion_density_call",
    distance_col="lesion_distance_um",
    binned_col="lesion_distance_bin",
    bins=(0, 10, 25, 50, 100, 200, 500, np.inf),
    labels=("0–10µm","10–25µm","25–50µm","50–100µm","100–200µm","200–500µm",">500µm"),
    micron_per_unit=1.0,                 # microns per coordinate unit
    ependymal_max_frac=0.8               # clusters above this ependymal fraction are not lesions
):
    """
    Call lesions by per-sample DBSCAN density clustering and measure distance to them.

    For every value of ``per_sample_col`` the cells of that sample are clustered
    with DBSCAN on their 2D coordinates. Each DBSCAN cluster whose fraction of
    ``ependymal_label`` cells is strictly greater than ``ependymal_max_frac`` is
    discarded; cells of the remaining clusters are called ``"lesion"``. Every
    cell of a sample with at least one lesion cell then gets its Euclidean
    distance to the nearest lesion cell of the same sample (0 for lesion cells).

    Coordinates are converted to microns once (``coords * micron_per_unit``), so
    ``eps_um``, the stored distances and ``bins`` are all expressed in microns.
    With the default ``micron_per_unit=1.0`` the coordinates are used as given.

    Parameters
    ----------
    adata
        Object exposing ``.obs`` (a pandas DataFrame) and ``.n_obs``; modified in place.
    coord_x, coord_y : str
        ``obs`` columns holding the cell coordinates.
    per_sample_col : str
        ``obs`` column identifying the sample; clustering never crosses samples.
    celltype_col : str
        ``obs`` column with cell type labels.
    ependymal_label : str
        Value of ``celltype_col`` that marks ependymal cells.
    eps_um : float
        DBSCAN neighbourhood radius, in microns.
    min_samples : int
        DBSCAN ``min_samples``.
    lesion_col : str
        Output column, ``"lesion"`` or ``"non_lesion"``.
    distance_col : str
        Output column with the distance to the nearest lesion cell in microns;
        NaN for samples in which no lesion was called.
    binned_col : str or None
        Output column with ``distance_col`` cut into ``bins``; skipped when None.
    bins, labels
        Bin edges (left-closed intervals) and their labels, passed to ``pandas.cut``.
    micron_per_unit : float
        Microns per coordinate unit; must be positive.
    ependymal_max_frac : float
        Largest ependymal fraction a cluster may have and still count as a lesion.

    Returns
    -------
    The same ``adata`` object, with the output columns written to ``adata.obs``.

    Raises
    ------
    ValueError
        If ``micron_per_unit`` is not a positive number.
    """
    if not micron_per_unit > 0:
        raise ValueError(f"micron_per_unit must be positive, got {micron_per_unit!r}")

    obs = adata.obs
    n_cells = adata.n_obs

    # Read everything needed as plain arrays up front. All bookkeeping below is
    # positional, which avoids repeated label-based .loc lookups and stays
    # correct even when obs names are not unique.
    sample_ids = obs[per_sample_col]
    coords_um_all = obs[[coord_x, coord_y]].to_numpy(dtype=float) * micron_per_unit
    is_ependymal_all = (obs[celltype_col] == ependymal_label).to_numpy(dtype=bool)

    lesion_call = np.full(n_cells, "non_lesion", dtype=object)
    distance_um = np.full(n_cells, np.nan, dtype=float)

    for sample in sample_ids.unique():
        sample_pos = np.flatnonzero((sample_ids == sample).to_numpy())
        if sample_pos.size == 0:
            continue  # e.g. a missing sample id never equals itself

        coords_um = coords_um_all[sample_pos]
        sample_is_ependymal = is_ependymal_all[sample_pos]

        # Coordinates are already in microns, so eps is used unscaled.
        cluster_labels = DBSCAN(
            eps=float(eps_um),
            min_samples=min_samples
        ).fit(coords_um).labels_  # -1 means noise

        # Keep every real cluster that is not dominated by ependymal cells.
        lesion_clusters = [
            cluster_id
            for cluster_id in np.unique(cluster_labels)
            if cluster_id != -1
            and sample_is_ependymal[cluster_labels == cluster_id].mean() <= ependymal_max_frac
        ]

        in_lesion = np.isin(cluster_labels, lesion_clusters)
        if not in_lesion.any():
            continue  # no lesion in this sample: calls stay "non_lesion", distances NaN

        lesion_call[sample_pos[in_lesion]] = "lesion"

        # Distance (microns) from every cell of the sample to its nearest lesion cell.
        nearest_um, _ = cKDTree(coords_um[in_lesion]).query(coords_um, k=1)
        distance_um[sample_pos] = nearest_um

    # Write through adata.obs (not the cached reference) so that a view-backed
    # object is updated the same way the per-column assignments always were.
    adata.obs[lesion_col] = lesion_call
    adata.obs[distance_col] = distance_um
    if binned_col is not None:
        # NaN distances fall in no bin and therefore stay missing.
        adata.obs[binned_col] = pd.cut(
            distance_um, bins=bins, labels=labels, include_lowest=True, right=False
        )

    return adata

In [None]:
# Run the lesion caller on this dataset. The function writes its result columns
# into adata.obs and hands the same object back.
adata = lesions_by_dbscan_and_proximity_exclude_ependymal(
    adata,
    # DBSCAN density settings (the function defaults are eps_um=27, min_samples=25)
    min_samples=30,
    eps_um=26,
    # obs columns to read
    per_sample_col="sample_id",
    coord_y="y_centroid",
    coord_x="x_centroid",
    # ependymal-dominated clusters are not called as lesions
    ependymal_label="Ependymal",
    celltype_col="sub_type_III",
)

In [None]:
# --------- 0) Merge subtypes into parent labels ---------
# Keys are labels found in `sub_type_III`, values are the parent label they
# collapse into. Labels that do not appear as a key are carried over unchanged.
merge_map = {
    # Astrocytes
    "DA-Astrocyte": "Astrocyte",
    "Astrocyte": "Astrocyte",

    # Oligodendrocytes
    "DA-Oligodendrocyte": "Oligodendrocyte",
    "Oligodendrocyte": "Oligodendrocyte",

    # OPCs
    "OPC (cycling)": "OPC",
    "OPC": "OPC",

    # Microglia
    "Foamy Microglia": "Microglia",
    "Microglia (cycling)": "Microglia",
    "Microglia (homeostatic)": "Microglia",
    "Microglia (intermediate)": "Microglia",
    "Microglia": "Microglia"
}

celltype_col = "sub_type_III"
subtype_labels = adata.obs[celltype_col]

# Map values explicitly instead of calling Series.replace: replace() on a
# categorical column is deprecated (pandas >= 2.2) and is slated to stop
# dropping the merged-away categories, which would leave empty subtype
# categories behind in `celltype_merged`.
merged_labels = subtype_labels.astype(object).map(
    lambda label: merge_map.get(label, label)
)

if isinstance(subtype_labels.dtype, pd.CategoricalDtype):
    # Keep the column categorical, with exactly the merged labels as categories
    # (in order of first appearance among the original categories).
    merged_categories = list(
        dict.fromkeys(merge_map.get(cat, cat) for cat in subtype_labels.cat.categories)
    )
    merged_labels = pd.Series(
        pd.Categorical(merged_labels, categories=merged_categories),
        index=subtype_labels.index,
    )

adata.obs["celltype_merged"] = merged_labels

In [None]:
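# Saving is intentionally disabled: this path is the same file that is read at the
# top of the notebook, so uncommenting the line overwrites the input in place.
#adata.write('../data/RREAE_5k_raw_integration_processed.h5ad')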