In [None]:
import os
import re 
from pathlib import Path

import pandas as pd
import numpy as np
import scanpy as sc
import scanpy.external as sce
import anndata as ad
import matplotlib.pyplot as plt
import seaborn as sns

# Run everything relative to the project's analysis directory.
os.chdir('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/analysis/')
os.getcwd()

import matplotlib as mpl
# Font settings so text stays editable in exported PDF / SVG figures.
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'svg.fonttype': 'none',
})

In [None]:
# Load the merged dataset from 'merged.h5ad' (path is relative to the working directory).
merged = sc.read_h5ad('merged.h5ad')

In [None]:
# Bare expression: the notebook renders the object's repr as this cell's output.
merged

In [None]:
# Gating strategy
# ---------------
# If at least some threshold (e.g. 10%) of a cell's total counts comes from the
# gene set of one cell type (say "A") while no other cell type's gene set reaches
# that threshold, the cell is assigned cell type "A".
#
# Gene lists are written as plain lists without repeats and without stray
# whitespace, so the dictionary is deterministic and every name can be looked up
# directly. A gene may still appear under more than one cell type.
cell_type_markers = {
    "HSPC": ["CD34", "AVP", "SPINK2", "SMIM24", "KIT", "GATA2"],
    "Erythro": ["PCNA", "MYC", "CENPF", "PARP1", "GATA1", "AHSP", "ALAS2", "GYPA", "GYPB", "HEMGN", "SLC4A1"],
    "MKC": ["PF4", "PLEK"],
    "Granulo": ["FUT4", "MPO", "ELANE", "KIT", "CAMP", "RETN", "GATA2", "LTF", "MMP9", "AQP9", "IL1R2", "S100A12", "ITGAM", "MS4A2", "CPA3", "HPGDS"],
    "cDC": ["CLEC10A", "CD1C", "FCER1A", "CD1A", "CD1E", "GPR183", "HLA-DQB2", "CD83", "CD86", "CCR7", "LAMP3", "CSF2RA", "LILRB2", "LILRB4", "SLAMF1"],
    "Mc/Mp": ["CD14", "VCAN", "FCN1", "FCGR1A", "FCGR3A", "CD68", "AIF1", "LILRB2", "LILRB4", "MRC1", "CD163", "LYVE1", "VSIG4", "MARCO", "CD5L"],
    "pDC": ["IRF8", "RUNX2", "LILRA4", "IL3RA", "FCER1A"],
    "T": ["CD3D", "CD3E", "TRAC", "CD2", "CD4", "CD8A", "CD247", "IL7R", "FOXP3"],
    "NK": ["NKG7", "GNLY", "PRF1", "KLRB1", "KLRC1", "KLRD1", "FGFBP2", "SLAMF7"],
    "B": ["CD19", "MS4A1", "CD79A", "BANK1", "TCL1A", "VPREB1", "SOX4", "PAX5", "CCR7", "SELL", "PLCG2"],
    "PC": ["MZB1", "SLAMF7", "TNFRSF17", "TENT5C", "PRDM1", "CD27"],
    "MSC": ["LEPR", "KITLG", "CXCL12", "THY1", "PDGFRA", "PDGFRB", "DPT", "OGN", "DPP4", "BMP4"],
    "Fibro/Osteo": ["ACTA2", "VCAN", "COL5A2", "FBLN1", "MFAP5", "RUNX2", "BGLAP", "SPP1", "SEMA3A"],
    "Adipo": ["APOD", "FABP4", "ADIPOQ", "PPARG", "LPL", "PLIN4", "MEDAG", "MEST", "STEAP4", "PEBP4"],
    "Endo/Pericyte": ["PECAM1", "VWF", "EGFL7", "CAV1", "CAVIN1", "CLEC14A", "KDR", "ENG", "FLT4", "CNN1", "MYH11", "ECSCR", "RAMP2", "TMEM100"],
}

In [None]:
def subset_row_fraction(adata, marker_list, layer=None):
    """
    Per-cell fraction of the total signal that comes from a set of marker genes.

    For each cell (row) this returns ``sum(X[row, genes]) / sum(X[row, :])``.

    Parameters
    ----------
    adata
        AnnData-like object. Only ``X`` (or ``layers[layer]``), ``var_names``
        and ``n_obs`` are read; nothing is written.
    marker_list
        Iterable of gene names. Surrounding whitespace is stripped and a name
        that occurs more than once is counted only once. Names that are not in
        ``adata.var_names`` are reported on stdout and ignored.
    layer
        Key into ``adata.layers``; ``None`` (default) uses ``adata.X``.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per cell. Cells whose row total is not
        positive get 0.0 instead of a division error.
    """
    # choose matrix
    X = adata.layers[layer] if layer is not None else adata.X
    var_names = adata.var_names

    # Normalise the names and drop repeats (keeping first-seen order) so that a
    # gene listed twice does not contribute twice to the numerator.
    genes = list(dict.fromkeys(g.strip() for g in marker_list))
    present = [g for g in genes if g in var_names]
    missing = [g for g in genes if g not in var_names]
    idx = [var_names.get_loc(g) for g in present]

    if sparse.issparse(X):
        Xc = X.tocsc()  # fast column slicing
        numer = np.asarray(Xc[:, idx].sum(axis=1)).ravel() if idx else np.zeros(adata.n_obs)
        denom = np.asarray(Xc.sum(axis=1)).ravel()
    else:
        Xn = np.asarray(X)
        numer = Xn[:, idx].sum(axis=1) if idx else np.zeros(adata.n_obs)
        denom = Xn.sum(axis=1)

    # Divide only where the row total is positive; other entries keep the 0.0
    # they were initialised with.
    frac = np.divide(numer, denom, out=np.zeros_like(numer, dtype=float), where=denom > 0)

    # warning for missing genes
    if missing:
        print(f"[subset_row_fraction] Missing genes (ignored): {', '.join(missing)}")

    return frac


from scipy import sparse

def assign_cell_types_simple(
    adata,
    markers: dict,
    threshold: float = 0.10,
    layer: str | None = None,
    result_key: str = "annot",
    unknown_label: str = "unknown",
    none_label: str = "unassigned",
):
    """
    Threshold-gate every cell against a dictionary of marker gene sets.

    The marker fraction (marker counts / total counts of the cell) is computed
    per cell type with ``subset_row_fraction``. A cell is then labelled:

      * with the cell type, when exactly one fraction is >= ``threshold``;
      * ``unknown_label``, when two or more fractions are >= ``threshold``;
      * ``none_label``, when no fraction reaches ``threshold``.

    The labels are stored as a categorical column in ``adata.obs[result_key]``
    (the only thing written to ``adata``), and that column is returned.
    """
    # one fraction vector per cell type, assembled into a cells x types table
    per_type_fraction = {}
    for cell_type, gene_list in markers.items():
        per_type_fraction[cell_type] = subset_row_fraction(adata, gene_list, layer=layer)
    fractions = pd.DataFrame(per_type_fraction, index=adata.obs_names)

    passes = fractions >= threshold
    hits_per_cell = passes.sum(axis=1)
    multi_hit = hits_per_cell >= 2
    single_hit = hits_per_cell == 1

    # start from "nothing passed", then overwrite the multi- and single-hit cells
    labels = pd.Series(none_label, index=fractions.index, dtype=object)
    labels[multi_hit] = unknown_label
    # with exactly one True in the row, idxmax yields the name of that column
    labels[single_hit] = passes[single_hit].idxmax(axis=1)

    adata.obs[result_key] = pd.Categorical(labels)
    return adata.obs[result_key]


In [None]:
# determine optimal threshold for minimizing unknown calls

# The sweep writes its labels into a copy so that `merged.obs` is left untouched.
adata= merged.copy()
# Sweep 1%..15% in 1% steps. The grid is built from integers and divided once:
# np.arange with a float step accumulates rounding error (e.g. 0.06999999999999999),
# which can shift the `>=` comparison for cells sitting exactly on a percent
# boundary and need not match a literal such as 0.13 used for the final call.
thresholds = np.arange(1, 16) / 100
multiplet_counts = []
unassigned_counts = []
unk_counts = []

for th in thresholds:
    assign_cell_types_simple(
        adata,
        markers=cell_type_markers,
        threshold=th,
        layer="counts",        # or None to use adata.X
        result_key="annot",
        unknown_label="multiplet",
        none_label="unassigned",
    )
    multiplet_counts.append((adata.obs["annot"] == "multiplet").sum())
    unassigned_counts.append((adata.obs["annot"] == "unassigned").sum())
    unk_counts.append(((adata.obs["annot"] == "multiplet") | (adata.obs["annot"] == "unassigned")).sum())

# One row per threshold: raw counts and the same numbers as a fraction of all cells.
summary = pd.DataFrame({
    "threshold": thresholds,
    "threshold_pct": (thresholds * 100).round(0).astype(int),
    "multiplet_count": multiplet_counts,
    "unassigned_count": unassigned_counts,
    "total_unknown": unk_counts,
    "multiplet_frac": np.array(multiplet_counts) / adata.n_obs,
    "unassigned_frac": np.array(unassigned_counts) / adata.n_obs,
    "unknown_frac": np.array(unk_counts) / adata.n_obs
})

In [None]:
print(summary)

# Line plot of the sweep. The plotted columns are the *_frac columns, i.e.
# fractions of all cells, so the y-axis is labelled as a fraction, and each
# line gets a legend entry so the three colours can be told apart.
plt.figure()
plt.plot(summary["threshold_pct"], summary["multiplet_frac"], marker="o", color="red", label="multiplet")
plt.plot(summary["threshold_pct"], summary["unassigned_frac"], marker="o", color="blue", label="unassigned")
plt.plot(summary["threshold_pct"], summary["unknown_frac"], marker="o", color="black", label="multiplet + unassigned")
plt.xlabel("Threshold (%)")
plt.ylabel("Fraction of cells")
plt.title("Unknown vs. threshold")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Final gating on the real object, using the threshold chosen from the sweep.
gating_options = {
    "markers": cell_type_markers,
    "threshold": 0.13,
    "layer": "counts",
    "result_key": "annot",
    "unknown_label": "Multiplet",
    "none_label": "Unknown",
}
assign_cell_types_simple(merged, **gating_options)
merged.obs['annot'].value_counts()

In [None]:
# Display order of the labels: gated cell types first (dictionary order),
# then the two fallback labels.
cat_order = [*cell_type_markers, "Unknown", "Multiplet"]
cat_order

In [None]:
# Reduced marker panel, grouped by lineage; passed as `var_names` to the dot plots.
ctmarkers_simple = {
    "HSPC": ["CD34", "AVP", "SPINK2", "SMIM24", "KIT", "GATA2"],
    "Erythroid": ["GATA1", "AHSP", "ALAS2", "HEMGN", "SLC4A1"],
    "MKC": ["PF4", "PLEK"],
    "Granulo": ["MPO", "CAMP", "LTF", "MMP9", "S100A12"],
    "cDC": ["CLEC10A", "CD1C", "CD1A", "CD1E", "GPR183"],
    "Mc/Mp": ["CD14", "FCN1", "FCGR1A", "FCGR3A", "CD68", "AIF1", "MRC1", "CD163", "VSIG4"],
    "pDC": ["IRF8", "RUNX2", "LILRA4", "IL3RA", "GZMB"],
    "T Cell": ["CD3D", "CD3E", "TRAC", "CD2", "CD4", "CD8A", "CD247", "IL7R", "FOXP3"],
    "NK Cell": ["NKG7", "GNLY", "GZMA", "GZMK", "PRF1", "KLRB1", "KLRC1", "KLRD1"],
    "B Cell": ["VPREB1", "SOX4", "PAX5", "CD19", "MS4A1", "CD79A"],
    "PC": ["MZB1", "SLAMF7", "TNFRSF17", "TENT5C", "PRDM1"],
    "MSC": ["LEPR", "KITLG", "CXCL12", "THY1"],
    "Fibro.": ["PDGFRA", "PDGFRB", "ACTA2", "COL5A2", "FBLN1"],
    "Osteo": ["BGLAP", "SPP1"],
    "Adipo": ["FABP4", "ADIPOQ", "PPARG"],
    "Endo": ["PECAM1", "VWF", "EGFL7", "CLEC14A", "KDR", "ENG", "FLT4", "ACTA2", "CNN1", "MYH11"],
}

In [None]:
# Marker dot plot over all cells, grouped by annotation, written to PDF.
dotplot_options = dict(
    var_names=ctmarkers_simple,
    groupby="annot",
    categories_order=cat_order,   # explicit order
    standard_scale="var",
    figsize=(22,4),
    show=False,
)
sc.pl.dotplot(merged, **dotplot_options)

# show=False leaves the figure open, so pyplot saves it as the current figure
plt.savefig("celltype_markers_merged_manual_gating_thr0.13.pdf", bbox_inches="tight")

In [None]:
# Colour for every annotation label; used by the per-sample breakdown plots.
ct_palette = {
    "HSPC": "#d6e376",
    "Erythro": "#a6a6a6",
    "MKC": "#000000",
    "Granulo": "#95ad74",
    "cDC": "#3bff8c",
    "Mc/Mp": "#1aff00",
    "pDC": "#a5c3c4",
    "T": "#fc0000",
    "B": "#032cfc",
    "PC": "#ffbafd",
    "NK": "#9302d1",
    "MSC": "#cfc10a",
    "Fibro/Osteo": "#918043",
    "Adipo": "#f5f067",
    "Endo/Pericyte": "#ab8476",
    "Unknown": "#e8e8e8",
    "Multiplet": "#FFFFFF",
}

In [None]:
# Turn the grouping columns into ordered categoricals with an explicit level order.
category_orders = {
    'annot': cat_order,
    'Collection': ['NBM', 'NDMM', 'PT'],
    'Panel': ['W7JCJE_hMulti', 'BYGXJ6_hMulti'],
}
for obs_column, level_order in category_orders.items():
    merged.obs[obs_column] = pd.Categorical(
        merged.obs[obs_column], categories=level_order, ordered=True
    )

In [None]:
# Stacked bar chart of annotation proportions per sample, written to PDF.

# Work on a copy of the per-cell metadata so the re-ordering below stays local.
obs = merged.obs.copy()
# Stable sort (mergesort) by Panel, then Collection.
obs = obs.sort_values(
    by=["Panel", "Collection"],
    ascending=[True, True], kind="mergesort"
)
# Sample IDs in order of first appearance after the sort; this is the bar order.
cats = pd.unique(obs["DI_Sample"])
obs["DI_Sample"] = pd.Categorical(obs["DI_Sample"], categories=cats, ordered=True)

# Number of cells per (sample, annotation), rows put into the sample order above.
counts = pd.crosstab(obs['DI_Sample'], obs['annot']).astype(int)
counts = counts.loc[cats]

# Row-normalise to proportions; fillna(0.0) replaces the NaN produced by a zero row total.
props = counts.div(counts.sum(axis=1), axis=0).fillna(0.0)
fig, ax = plt.subplots(figsize=(12, 6))
props.plot(
    kind="bar",
    stacked=True,
    ax=ax,
    width=0.98, 
    # one colour per annotation column; black for labels missing from ct_palette
    color=[ct_palette.get(c, '#000000') for c in props.columns],
)
ax.set_xlabel("sample")
ax.set_ylabel("proportion")
ax.margins(x=0)             # remove left/right x-axis padding
# Legend is anchored just left of the axes (its right edge at x = -0.02 in axes coordinates).
ax.legend(loc="center right", bbox_to_anchor=(-0.02, 0.5), frameon=False, title="cell type")
# NOTE(review): tight_layout() below presumably recomputes the subplot margins,
# which would make this left=0.25 adjustment ineffective — confirm.
fig.subplots_adjust(left=0.25)
plt.xticks(rotation=90, ha="right")
plt.tight_layout()
fig.savefig("annots_per_sample_proportion_plot.pdf", bbox_inches="tight")  # save to PDF
# Close the figure after saving; it is not displayed from this cell.
plt.close(fig)

In [None]:
# Unique sample IDs in sorted order.
sids = sorted(set(merged.obs['Sample']))

# Store one colour per annotation category, in category order, under
# uns["annot_colors"]. A category missing from ct_palette raises KeyError.
annot_categories = merged.obs["annot"].cat.categories
merged.uns["annot_colors"] = list(map(ct_palette.__getitem__, annot_categories))

In [None]:
from matplotlib.backends.backend_pdf import PdfPages

# One PDF page per sample: cell centroids coloured by annotation, shown once with
# all cells and once without the Unknown / Multiplet calls.
with PdfPages("annots_per_sample_scatterplots.pdf") as pdf:
    for sid in sids:
        # all cells of this sample, with the colour of every annotation category
        f = merged[merged.obs["Sample"] == sid].copy()
        f.obs["annot"] = f.obs["annot"].astype("category")
        cats_all = list(f.obs["annot"].cat.categories)
        pal_all = [ct_palette.get(c, "#bbbbbb") for c in cats_all]

        # same sample restricted to confidently typed cells; unused categories
        # are dropped so the palette lines up with what is left
        f2 = f[(f.obs["annot"] != "Unknown") & (f.obs["annot"] != "Multiplet")].copy()
        f2.obs["annot"] = f2.obs["annot"].astype("category").cat.remove_unused_categories()
        cats_no = list(f2.obs["annot"].cat.categories)
        pal_no = [ct_palette.get(c, "#bbbbbb") for c in cats_no]

        fig, axes = plt.subplots(1, 2, figsize=(10, 10), constrained_layout=True)

        # (axis, data, palette, title) for the left and right panel, drawn in that order
        panels = [
            (axes[0], f, pal_all, f"{sid} — all cells"),
            (axes[1], f2, pal_no, f"{sid} — no unknown or multiplet"),
        ]
        for panel_ax, panel_data, panel_palette, panel_title in panels:
            sc.pl.scatter(
                panel_data,
                x="x_centroid",
                y="y_centroid",
                color="annot",
                palette=panel_palette,
                ax=panel_ax,
                legend_loc="none",
                show=False,
                size=2,
            )
            panel_ax.set_title(panel_title)
            panel_ax.set_aspect("equal")
            panel_ax.invert_yaxis()
            # rasterise the point collections of this panel
            for coll in panel_ax.collections:
                coll.set_rasterized(True)

        # lock both panels to the same extents for fair side-by-side comparison
        axes[1].set_xlim(axes[0].get_xlim())
        axes[1].set_ylim(axes[0].get_ylim())

        pdf.savefig(fig, bbox_inches="tight")
        plt.close(fig)


In [None]:
# Save the annotated object.
# NOTE(review): "merged.h5ad" is the same relative path this notebook loads from,
# so (assuming the working directory is unchanged) the input file is overwritten.
merged.write("merged.h5ad")

In [None]:
# check markers for every sample: one dot-plot page per sample in a single PDF
all_samples = merged.obs['Sample'].unique()
pdf_out = "per_sample_marker_dotplots.pdf"
with PdfPages(pdf_out) as pdf:
    for s in all_samples:
        print(s)
        sobj = merged[merged.obs['Sample'] == s].copy()

        dp = sc.pl.dotplot(
            sobj,
            var_names=ctmarkers_simple,
            groupby="annot",
            categories_order=cat_order,     # keep if valid for this sample
            standard_scale="var",
            return_fig=True,
            show=False,
            title=s
        )

        # make_figure() renders the plot onto dp.fig. Its return value is not
        # used: passing a None figure to pdf.savefig would silently save
        # whatever figure is currently active, so the rendered figure is taken
        # from the plot object and passed explicitly.
        dp.make_figure()
        fig = dp.fig
        pdf.savefig(fig, bbox_inches="tight")
        plt.close(fig)

print(f"Saved: {pdf_out}")

In [None]:
# Revised marker panel (replaces the earlier `ctmarkers_simple`) for the
# simplified overview dot plot.
ctmarkers_simple = {
    "HSPC": ["CD34", "AVP", "SPINK2", "SMIM24", "KIT", "GATA2"],
    "Erythroid": ["GATA1", "AHSP", "ALAS2", "HEMGN", "SLC4A1"],
    "MKC": ["PF4", "PLEK"],
    "Granulo.": ["MPO", "ELANE", "CAMP", "LTF", "MMP9", "S100A12", "CPA3"],
    "cDC": ["CLEC10A", "CD1C", "CD1A", "CD1E", "GPR183"],
    "Mc/Mp": ["CD14", "FCN1", "FCGR1A", "FCGR3A", "CD68", "AIF1", "MRC1", "CD163", "VSIG4"],
    "pDC": ["IRF8", "RUNX2", "LILRA4", "IL3RA", "GZMB"],
    "T Cell": ["CD3D", "CD3E", "TRAC", "CD2", "CD4", "CD8A", "CD247", "IL7R", "FOXP3"],
    "NK Cell": ["NKG7", "GNLY", "GZMA", "GZMK", "PRF1", "KLRB1", "KLRC1", "KLRD1"],
    "B Cell": ["VPREB1", "SOX4", "PAX5", "CD19", "MS4A1", "CD79A"],
    "PC": ["MZB1", "SLAMF7", "TNFRSF17", "TENT5C", "PRDM1"],
    "MSC": ["LEPR", "KITLG", "CXCL12", "THY1"],
    "Fibro.": ["PDGFRA", "PDGFRB", "ACTA2", "COL5A2", "FBLN1"],
    "Osteo.": ["BGLAP", "SPP1"],
    "Adipo.": ["FABP4", "ADIPOQ", "PPARG"],
    "Endo.": ["PECAM1", "VWF", "EGFL7", "CLEC14A", "KDR", "ENG", "FLT4", "MYH11"],
}

simplified_dotplot_options = dict(
    var_names=ctmarkers_simple,
    groupby="annot",
    standard_scale="var",
    figsize=(22,4),
    show=False,
)
sc.pl.dotplot(merged, **simplified_dotplot_options)

# show=False leaves the figure open, so pyplot saves it as the current figure
plt.savefig("celltype_markers_merged_manual_gating_thr0.13_simplified.pdf", bbox_inches="tight")