In [None]:
import os
import re
from pathlib import Path
from typing import Annotated

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scanpy.external as sce
import scanpy as sc
import seaborn as sns
import tifffile
import anndata as ad

from sklearn.cluster import KMeans
from skimage.color import label2rgb
from sklearn.neighbors import radius_neighbors_graph
from sklearn.neighbors import NearestNeighbors

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
from scipy.stats import entropy, chi2_contingency

from scipy import sparse
from scipy.stats import mannwhitneyu
import itertools

# Export text as real text objects (not outlined glyph paths) so labels stay
# editable when the saved SVG/PDF figures are opened in a vector editor.
plt.rcParams['svg.fonttype'] = 'none'
plt.rcParams['pdf.fonttype'] = 42 #make text editable in pdf

In [None]:

# Work from the analysis folder: figures saved below with relative file names
# are written here.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
os.chdir('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/CODEX/harshita_analysis/')
os.getcwd()

# Loading and exploration

In [None]:
# Per-slide AnnData objects.
# NOTE(review): s1 / s2 are not referenced again in the part of the notebook visible
# here; the analysis below works from the merged object — confirm they are still needed.
s1 = ad.read_h5ad('/diskmnt/Users2/harshita/CODEX/slide1.h5ad')
s2 = ad.read_h5ad('/diskmnt/Users2/harshita/CODEX/slide2.h5ad')

In [None]:
merged = ad.read_h5ad('/diskmnt/Users2/harshita/CODEX/merged.h5ad')

In [None]:
merged

In [None]:
merged.obs

In [None]:
merged.obs.describe(include='all')

In [None]:
print(merged.obs['manual_gate'].value_counts(dropna=False))

In [None]:
print(merged.obs['manual_gate_withT'].value_counts(dropna=False))

In [None]:
print(merged.obs['Timepoint'].value_counts(dropna=False))

In [None]:
print(merged.obs['section_id'].value_counts(dropna=False))

In [None]:
merged.var

In [None]:
print("Layers:", merged.layers.keys())


In [None]:
merged.X

In [None]:
# Snapshot the matrix currently in X under a named layer so it stays retrievable
# even if X is overwritten later (subsets taken from `merged` inherit this layer).
# NOTE(review): the layer name assumes X holds section-normalized intensities at load time — confirm.
merged.layers['section_normalized_intensity'] = merged.X.copy()

In [None]:
# Marker columns (names as they appear in merged.var) to inspect in the histograms below.
markers = ['CD11b_intensity', 'CD14_intensity', 'CD68_intensity', 'CD163_intensity',
    'CD45_intensity', 'CD44_intensity', 'HLA-DR_intensity', 'MPO_intensity',
    'Bcl-2_intensity', 'IKKa_intensity', 'NFKB p65_intensity', 'Phospho IKKa/b_intensity',
    'Ki67_intensity', 'TGFB1_intensity', 'PDL1_intensity', 'BAFF_intensity']

In [None]:
# Per-marker intensity histograms over ALL cells in `merged`; no cell-type filter
# has been applied at this point (the myeloid subset is created further down).
merged_df = merged.to_df()[markers]

for marker in markers:
    plt.figure(figsize=(6,4))
    sns.histplot(merged_df[marker], bins=100, kde=True, color='skyblue')
    # Title corrected: the data plotted here is every cell, not the myeloid subset.
    plt.title(f"{marker} distribution in all cells")
    plt.xlabel("Intensity")
    plt.ylabel("Cell count")
    plt.tight_layout()
    plt.show()

# Exploring expression in myeloid cells

In [None]:
# Keep only the cells whose manual gate (T-cell aware version) is 'Myeloid'.
# .copy() turns the sliced view into an independent AnnData that can be modified safely.
myeloid = merged[merged.obs['manual_gate_withT'] == 'Myeloid'].copy()

In [None]:
myeloid

CD11b: cell surface receptor protein expressed on certain immune cells like monocytes, macrophages, granulocytes, and NK cells

CD14: protein involved in the innate immune system that acts as a receptor for certain bacterial components, such as lipopolysaccharide (LPS). It is expressed on the surface of immune cells like monocytes and macrophages, and a soluble form is also found in the blood

CD68: protein highly expressed by human monocytes and tissue macrophages, often used as a marker to identify these immune cells

CD163: protein found on the surface of cells, particularly monocytes and macrophages, that acts as a scavenger receptor to bind and remove harmful hemoglobin-haptoglobin complexes from the blood

CD45: protein, also known as the leukocyte common antigen, that is found on the surface of all nucleated hematopoietic cells except mature erythrocytes and platelets

CD44:  cell surface glycoprotein that plays a crucial role in cell-cell adhesion, migration, and signaling. It is expressed on various cell types, including epithelial, endothelial, and immune cells. 

HLA-DR: protein in the human leukocyte antigen (HLA) system, a type of MHC class II molecule that presents antigens to T cells

MPO: enzyme found primarily in neutrophils, a type of white blood cell

BCL-2: anti-apoptotic protein that promotes cell survival by inhibiting programmed cell death

IKKa: Part of the IKK complex, regulates inflammatory gene expression and survival signaling

NFKB p65: Nuclear factor kappa-light-chain-enhancer of activated B cells, p65 subunit

Phospho IKKa/b: central regulator of NF-κB signaling, which controls inflammation, survival, and immune responses.

Ki67: protein that is expressed in cells that are actively dividing (proliferating)

TGFB1: the gene for Transforming Growth Factor Beta 1, a protein that regulates cell growth, proliferation, differentiation, and apoptosis

PDL1: protein expressed on the surface of cancer cells and immune cells

BAFF: protein that regulates B-cell survival, maturation, and differentiation

In [None]:
# Identical to the marker list defined near the top of the notebook; used by the
# myeloid-only histograms in the next cell.
markers = ['CD11b_intensity', 'CD14_intensity', 'CD68_intensity', 'CD163_intensity',
    'CD45_intensity', 'CD44_intensity', 'HLA-DR_intensity', 'MPO_intensity',
    'Bcl-2_intensity', 'IKKa_intensity', 'NFKB p65_intensity', 'Phospho IKKa/b_intensity',
    'Ki67_intensity', 'TGFB1_intensity', 'PDL1_intensity', 'BAFF_intensity']

In [None]:
# One histogram (with KDE overlay) per marker, restricted to the myeloid subset.
myeloid_df = myeloid.to_df()[markers]

for marker in markers:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(myeloid_df[marker], bins=100, kde=True, color='skyblue', ax=ax)
    ax.set_title(f"{marker} distribution in Myeloid cells")
    ax.set_xlabel("Intensity")
    ax.set_ylabel("Cell count")
    fig.tight_layout()
    plt.show()

In [None]:
# 20-component PCA on the current myeloid.X, then a 30-nearest-neighbor graph
# built on those 20 PCs.
sc.tl.pca(myeloid, n_comps=20) 
sc.pp.neighbors(myeloid, n_neighbors=30, n_pcs=20)

In [None]:
# NOTE(review): no sc.tl.umap / sc.tl.leiden call on `myeloid` precedes this cell in
# the notebook, so this plot relies on an X_umap embedding and a 'leiden' column
# already being present in the loaded object — confirm it runs on a fresh kernel.
sc.pl.umap(myeloid, color=["leiden"])

In [None]:
# Re-binds the notebook-wide `markers` name to a shorter panel for the UMAP overlays;
# later cells that read `markers` see this list until it is reassigned again.
markers = ['CD11b_intensity', 'CD14_intensity', 'CD68_intensity', 'CD163_intensity',
            'MPO_intensity', 'HLA-DR_intensity', 'cKit_intensity']
sc.pl.umap(myeloid, color=markers, cmap='viridis', ncols=2, size=5, show=True)

In [None]:
sc.pl.umap(myeloid, color='CD14_intensity', cmap='viridis', ncols=2, size=5, show=True)

In [None]:
sc.pl.umap(myeloid, color='CD68_intensity', cmap='viridis', ncols=2, size=5, show=True)

In [None]:
# Replace X with the matrix stored in the 'raw_intensity' layer. From here on,
# everything that reads myeloid.X (scaling, PCA, to_df()) works from these values,
# not from the matrix that was in X when the object was loaded.
myeloid.X = myeloid.layers['raw_intensity'].copy()


In [None]:
# Standardize each marker in place (zero-centered), so X can contain negative values
# after this cell.
sc.pp.scale(myeloid, zero_center=True)


In [None]:
sc.tl.pca(myeloid, n_comps=20) 

In [None]:
sc.pp.neighbors(myeloid, n_neighbors=30, n_pcs=20)

In [None]:
sc.tl.umap(myeloid)

In [None]:
# Marker overlays on the UMAP recomputed from the scaled raw intensities.
# `markers` is re-bound here (now including HIF1A) and is read again by later cells.
markers = ['CD11b_intensity', 'CD14_intensity', 'CD68_intensity', 'CD163_intensity',
            'MPO_intensity', 'HLA-DR_intensity', 'cKit_intensity', 'HIF1A_intensity']
sc.pl.umap(myeloid, color=markers, cmap='viridis', ncols=2, size=5, show=True)

In [None]:
sc.tl.leiden(myeloid, resolution=0.5, flavor="igraph", n_iterations=2)

In [None]:
sc.pl.umap(myeloid, color='leiden', cmap='viridis')

In [None]:
# Sweep a few coarse Leiden resolutions; each run is stored in its own obs column
# ('leiden_0.1', 'leiden_0.2', ...) so the partitions can be compared side by side.
resolutions = [0.1, 0.2, 0.3, 0.4]

for r in resolutions:
    key = f'leiden_{r}'
    print(f'Running Leiden at resolution={r}...')
    sc.tl.leiden(myeloid, resolution=r, flavor='igraph', n_iterations=2, key_added=key)

In [None]:
sc.pl.umap(myeloid, color=[f'leiden_{r}' for r in resolutions], ncols=2, size=5)

In [None]:
sc.pl.umap(myeloid, color = "section_id")

In [None]:
myeloid.write('/diskmnt/Users2/harshita/CODEX/myeloid.h5ad')

In [None]:
myeloid = sc.read_h5ad('/diskmnt/Users2/harshita/CODEX/myeloid.h5ad')

In [None]:
myeloid

In [None]:
# Hierarchical dendrogram over the resolution-0.2 Leiden clusters, then a per-cell
# heatmap of every marker grouped by cluster (each marker rescaled via standard_scale="var").
sc.tl.dendrogram(myeloid, groupby="leiden_0.2")
sc.pl.heatmap(myeloid, myeloid.var_names, groupby="leiden_0.2", dendrogram=True, standard_scale = "var")

In [None]:
sc.pl.heatmap(myeloid, myeloid.var_names, groupby="leiden_0.2", dendrogram=True, standard_scale = "obs")

In [None]:
sc.pl.matrixplot(myeloid, myeloid.var_names, groupby="leiden_0.2", dendrogram = True, standard_scale= "group")

In [None]:
# Rank markers per Leiden (resolution 0.2) cluster with a Wilcoxon test; results are
# stored under the 'rank_genes_0.2' key so they do not overwrite any other ranking.
sc.tl.rank_genes_groups(
    myeloid,
    groupby="leiden_0.2", 
    method="wilcoxon",         
    key_added="rank_genes_0.2"
)
# Dot plot of the top 5 ranked markers for each cluster.
sc.pl.rank_genes_groups_dotplot(
    myeloid,
    key="rank_genes_0.2",
    n_genes=5
)

In [None]:
sc.get.rank_genes_groups_df(myeloid, key="rank_genes_0.2", group=None)

In [None]:
# Lift pandas display limits so the full ranking table prints without truncation.
for option_name in ('display.max_rows', 'display.max_columns',
                    'display.width', 'display.max_colwidth'):
    pd.set_option(option_name, None)

# Top 5 markers per cluster, ordered by descending Wilcoxon score within each group.
deg_df = sc.get.rank_genes_groups_df(myeloid, key="rank_genes_0.2", group=None)
ranked = deg_df.sort_values(["group", "scores"], ascending=[True, False])
top_genes_df = ranked.groupby("group").head(5)
print(top_genes_df)
top_genes_df.groupby("group")["names"].apply(list).to_dict()

0 -> M2 like TAM (CCR6, CD163, low Granzyme-B, CD45RO negative) 

1 -> proliferating APC (Ki67 positive)

2 -> granulocytic-MDSC, inflammatory neutrophils (high HIF1A)

3 -> NFkB activated TAM (PDL-1+, Phospho-IKKa/b+, CD14+, CD68+)

4 -> myeloid mesenchymal APC (vimentin positive)

5 -> pro-inflammatory myeloid cells (TACI+, Bcl-2+)

7 -> monocyte standard (CD68+)

8 -> T cells (CD8+, CD3e+, low MPO)

9 -> Activated antigen-presenting myeloid cells (very high HLA-DR+)

6,10,11,12 -> too small

macrophages: 0, 3, 7

neutrophils/MDSCs: 2

APCs: 1, 4, 9

pro-inflammatory cells: 5

In [None]:
# Cast the cluster labels to int so they match the integer keys of the mapping in
# the next cell. This overwrites the column in place, so 'leiden_0.2' is a plain
# integer column (no longer categorical) from here on.
myeloid.obs['leiden_0.2'] = myeloid.obs['leiden_0.2'].astype(int)

In [None]:
# Collapse Leiden (resolution 0.2) clusters into broad myeloid classes, following the
# annotation notes above. Cluster ids that are not keys of this dict are mapped to NaN
# by Series.map, i.e. those cells end up without a clustering-based subtype.
cluster_to_label = {
    0: "Macrophages", 3: "Macrophages", 7: "Macrophages",
    2: "Neutrophils/MDSCs",
    1: "APCs", 4: "APCs", 9: "APCs",
    5: "Pro-inflammatory cells"}

myeloid.obs['myeloid_subtypes_clustering'] = myeloid.obs['leiden_0.2'].map(cluster_to_label)

In [None]:
myeloid.obs['myeloid_subtypes_clustering'].value_counts(dropna=False)

In [None]:
sc.pl.umap(myeloid, color = "myeloid_subtypes_clustering")

In [None]:
sc.pl.matrixplot(myeloid, myeloid.var_names, groupby="myeloid_subtypes_clustering", dendrogram = True, standard_scale= "group")

In [None]:
# Histograms of the current myeloid.X for whichever `markers` list was assigned most
# recently (the name is re-bound several times above, so the panel shown here depends
# on execution order).
myeloid_df = myeloid.to_df()[markers]

for marker in markers:
    plt.figure(figsize=(6,4))
    sns.histplot(myeloid_df[marker], bins=100, kde=True, color='skyblue')
    plt.title(f"{marker} distribution in Myeloid cells")
    plt.xlabel("Intensity")
    plt.ylabel("Cell count")
    plt.tight_layout()
    plt.show()

In [None]:
myeloid

1. high MPO and CD11b positive -> Neutrophils
2. CD68 positive and CD14 positive -> Macrophages
3. high HLA-DR -> APCs
4. high CD8, CD3e -> T cells
5. else pro-inflammatory myeloids

In [None]:
# Percentile table (one row per marker) used to pick cut-offs for the rule-based tree.
markers = ["MPO_intensity","CD11b_intensity","CD68_intensity","CD14_intensity",
           "HLA-DR_intensity","CD8_intensity","CD3e_intensity"]
percentiles = [0, 25, 50, 62.5, 75, 90, 95, 99]

df = pd.DataFrame(myeloid.X, columns=myeloid.var_names)
df_markers = df[markers]

# quantile() expects fractions in [0, 1]; transpose so markers become rows.
summary = df_markers.quantile(q=[p / 100.0 for p in percentiles]).T
summary.columns = [f"{p}th percentile" for p in percentiles]

summary

In [None]:
# Rule-based subtype assignment on the current myeloid.X (the cut-offs below include
# negative values, i.e. they are on the scaled-intensity scale).
# The rules are evaluated in order and the first match wins, exactly like the former
# if/elif chain; np.select applies them to all cells at once instead of looping over
# rows with iterrows(), which is very slow on hundreds of thousands of cells.
df = pd.DataFrame(myeloid.X, columns=myeloid.var_names)
df.index = myeloid.obs_names

is_neutrophil = (df["MPO_intensity"] > 0.52) & (df["CD11b_intensity"] > -0.17)
is_macrophage = (df["CD68_intensity"] > 0.09) & (df["CD14_intensity"] > 0.12)
is_apc = df["HLA-DR_intensity"] > 0.19
is_t_cell = (df["CD8_intensity"] > -0.11) & (df["CD3e_intensity"] > 0.36)

cell_type = pd.Series(
    np.select(
        [is_neutrophil.to_numpy(), is_macrophage.to_numpy(),
         is_apc.to_numpy(), is_t_cell.to_numpy()],
        ["Neutrophils", "Macrophages", "APCs", "T cells"],
        default="Pro-inflammatory myeloids",  # cells matching none of the rules
    ),
    index=myeloid.obs_names,
)
myeloid.obs["myeloid_subtype_tree"] = cell_type

In [None]:
print(myeloid.obs["myeloid_subtype_tree"].value_counts())

In [None]:
# Row-normalized contingency table: for each rule-based subtype, the share of its
# cells that falls into each cluster-based subtype.
ct = pd.crosstab(
    myeloid.obs["myeloid_subtype_tree"],
    myeloid.obs["myeloid_subtypes_clustering"],
    normalize="index",
)

fig, ax = plt.subplots()
sns.heatmap(ct, annot=True, fmt=".2f", cmap="viridis",
            cbar_kws={'label': 'Proportion'}, ax=ax)
ax.set_title("Agreement Between Rule-based and Cluster-based Myeloid Subtypes", fontsize=12)
ax.set_xlabel("Cluster-based subtype")
ax.set_ylabel("Rule-based subtype")
fig.tight_layout()
plt.show()

# Selecting specific intensities by histogram gating

In [None]:
# Tabulate a 100-bin histogram for every feature in merged.var. All features share one
# set of bin edges spanning the global min/max of the matrix, so bins are comparable
# across markers.
merged_df = merged.to_df()  # all features in merged.var
bins = np.linspace(merged_df.min().min(), merged_df.max().max(), 101)  # 100 bins

hist_data_merged = {}

for feature, values in merged_df.items():
    counts, bin_edges = np.histogram(values, bins=bins)
    hist_data_merged[feature] = pd.DataFrame({
        "bin_start": bin_edges[:-1],
        "bin_end": bin_edges[1:],
        "count": counts,
    })

In [None]:
# Same histogram table as for `merged`, computed on the current myeloid.X. The bin
# edges come from the myeloid matrix's own global min/max.
myeloid_df = myeloid.to_df()
bins = np.linspace(myeloid_df.min().min(), myeloid_df.max().max(), 101)  # 100 bins

hist_data_myeloid = {}

for feature, values in myeloid_df.items():
    counts, bin_edges = np.histogram(values, bins=bins)
    hist_data_myeloid[feature] = pd.DataFrame({
        "bin_start": bin_edges[:-1],
        "bin_end": bin_edges[1:],
        "count": counts,
    })

In [None]:
hist_data_merged

In [None]:
hist_data_myeloid

In [None]:
# Attach the section label as plain strings so seaborn treats it as a discrete hue.
# (The former first assignment of the raw values was overwritten immediately by this
# one and has been removed.)
merged_df["section_id"] = merged.obs["section_id"].astype(str).values

# Stacked per-section histograms for whichever `markers` list is currently bound.
for marker in markers:
    plt.figure(figsize=(6, 4))
    sns.histplot(data=merged_df, x=marker, bins=100,
        hue="section_id", multiple="stack", kde=False, stat="count")
    plt.title(f"{marker} distribution by section ID")
    plt.xlabel("Intensity")
    plt.ylabel("Cell count")
    plt.legend(title="Section ID", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

In [None]:
# Attach the section label as plain strings so seaborn treats it as a discrete hue.
# (The former first assignment of the raw values was overwritten immediately by this
# one and has been removed.)
myeloid_df["section_id"] = myeloid.obs["section_id"].astype(str).values

# Stacked per-section histograms for whichever `markers` list is currently bound.
for marker in markers:
    plt.figure(figsize=(6, 4))
    sns.histplot(data=myeloid_df, x=marker, bins=100,
        hue="section_id", multiple="stack", kde=False, stat="count")
    plt.title(f"{marker} distribution by section ID")
    plt.xlabel("Intensity")
    plt.ylabel("Cell count")
    plt.legend(title="Section ID", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

In [None]:
# These options are global: every DataFrame displayed after this cell is printed in
# full, which matters for the large tables further down.
pd.set_option('display.max_rows', None)      # Show all rows
pd.set_option('display.max_columns', None)   # Show all columns
pd.set_option('display.max_colwidth', None)  # Show full column contents (no truncation)
pd.set_option('display.expand_frame_repr', False)

# Full 100-bin histogram table for HLA-DR over all cells (used to read off gating cut-offs).
hist_data_merged["HLA-DR_intensity"]

In [None]:
hist_data_myeloid["HLA-DR_intensity"]

In [None]:
#myeloid.X = myeloid.layers['section_normalized_intensity'].copy()

In [None]:
# Histogram-based gating of myeloid subtypes.
#
# The cut-offs below are bin edges read off the 100-bin histogram tables (bins are
# 0.006411 wide and start at 0.000000), i.e. they live on the section-normalized
# intensity scale. Earlier in this notebook myeloid.X is replaced by z-scored raw
# intensities, so gating on X only gives the intended result if X was manually reset
# to the normalized layer first (see the commented-out line in the previous cell).
# Reading the layer explicitly makes the gate independent of what X currently holds.
# NOTE(review): confirm that 'section_normalized_intensity' is the scale the
# thresholds were chosen on.
GATING_LAYER = 'section_normalized_intensity'
df = myeloid_df = myeloid.to_df(layer=GATING_LAYER)
df.index = myeloid.obs_names

# Rules are applied in priority order (first match wins), mirroring the former
# if/elif chain; evaluated column-wise instead of looping over rows.
is_granulocyte = df["MPO_intensity"] >= 0.166692
is_high_apc = df["HLA-DR_intensity"] >= 0.006411
is_monocyte = (df["CD68_intensity"] > 0.121813) | (df["CD14_intensity"] > 0.032056)
# Earlier variants (additionally requiring CD11b for granulocytes, and a separate
# CD8/CD3e "T cells" rule) were disabled by the author and are not applied.

cell_type = pd.Series(
    np.select(
        [is_granulocyte.to_numpy(), is_high_apc.to_numpy(), is_monocyte.to_numpy()],
        ["Granulocytes", "High APCs", "Monocytes"],
        default="Other myeloids",  # cells matching none of the rules
    ),
    index=myeloid.obs_names,
)
myeloid.obs["myeloid_subtype_hist_gating"] = cell_type

In [None]:
myeloid.obs['myeloid_subtype_hist_gating'].value_counts(dropna=False)

In [None]:
myeloid.obs['myeloid_subtype_hist_gating'].value_counts(normalize=True, dropna=False) * 100

In [None]:
# Cell counts per gated subtype (rows) and timepoint (columns).
subtype_by_timepoint = pd.crosstab(
    myeloid.obs['myeloid_subtype_hist_gating'],
    myeloid.obs['Timepoint'],
)
timepoint_totals = subtype_by_timepoint.sum(axis=0)

print(subtype_by_timepoint)

# Column-wise normalization: each timepoint's subtypes sum to 100 %.
print("\nPercentages within each Timepoint:")
print(subtype_by_timepoint.div(timepoint_totals, axis=1) * 100)

In [None]:
# NOTE(review): this cell repeats the preceding cell verbatim (same crosstab, same
# printed percentages); one of the two can be dropped.
subtype_by_timepoint = pd.crosstab(myeloid.obs['myeloid_subtype_hist_gating'], myeloid.obs['Timepoint'])

print(subtype_by_timepoint)

print("\nPercentages within each Timepoint:")
print(subtype_by_timepoint.div(subtype_by_timepoint.sum(axis=0), axis=1) * 100)

In [None]:
# Share of each gated subtype within every timepoint (columns sum to 100 %).
percentages = subtype_by_timepoint.div(subtype_by_timepoint.sum(axis=0), axis=1) * 100

# DataFrame.plot opens its own figure unless an Axes is passed, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind and its size is ignored.
# Create the Axes explicitly and hand it to pandas.
fig, ax = plt.subplots(figsize=(10, 6))
percentages.T.plot(kind='bar', ax=ax)  # transposed: one bar group per timepoint
ax.set_ylabel('Percentage (%)')
ax.set_title('Percentage of Myeloid Subtypes Across Timepoints')
ax.tick_params(axis='x', labelrotation=0)
ax.legend(title='Subtype', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [None]:
# Grouped bar chart of raw cell counts: one bar group per subtype, one bar per timepoint.
# The Axes is created explicitly and passed to pandas; calling plt.figure() first and
# then DataFrame.plot() without `ax=` leaves an empty figure and ignores the figsize.
fig, ax = plt.subplots(figsize=(8, 5))
subtype_by_timepoint.plot(kind='bar', ax=ax)
ax.set_title("Myeloid Subtypes by Timepoint")
ax.set_xlabel("Myeloid Subtype")
ax.set_ylabel("Cell Count")
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Same data as the count chart, expressed as the percentage of each timepoint's cells.
# The Axes is created explicitly and passed to pandas; calling plt.figure() first and
# then DataFrame.plot() without `ax=` leaves an empty figure and ignores the figsize.
fig, ax = plt.subplots(figsize=(8, 5))
percentages.plot(kind='bar', ax=ax)
ax.set_title("Myeloid Subtypes by Timepoint (Percentage)")
ax.set_xlabel("Myeloid Subtype")
ax.set_ylabel("Percentage of Cells (%)")
ax.tick_params(axis='x', labelrotation=45)
ax.legend(title="Timepoint")
fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Gating cut-offs, drawn as dashed vertical lines over the all-cell histograms
# (stacked by section). Dict order defines the plotting order.
thresholds = {"MPO_intensity": 0.166692, "HLA-DR_intensity": 0.006411,
              "CD68_intensity": 0.121813, "CD14_intensity": 0.032056,}
markers = list(thresholds)
for marker in markers:
    cutoff = thresholds[marker]
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(data=merged_df, x=marker, bins=100, hue="section_id", multiple="stack",
        kde=False, stat="count", ax=ax)
    ax.axvline(cutoff, linestyle='--', linewidth=1.5, label=f"Threshold = {cutoff}")
    ax.set_title(f"{marker} distribution in all cells by section ID ")
    ax.set_xlabel("Intensity")
    ax.set_ylabel("Cell count")
    fig.tight_layout()
    # An empty legend replaces the (long) per-section legend.
    ax.legend([],[], frameon=False)
    plt.show()

In [None]:
# Cell counts per gated subtype (rows) and tissue section (columns).
subtype_by_section = pd.crosstab(
    myeloid.obs['myeloid_subtype_hist_gating'],
    myeloid.obs['section_id'],
)
section_totals = subtype_by_section.sum(axis=0)

print(subtype_by_section)

# Column-wise normalization: each section's subtypes sum to 100 %.
print("\nPercentages within each Section ID:")
print(subtype_by_section.div(section_totals, axis=1) * 100)


In [None]:
# Grouped bar chart: one bar group per subtype (the crosstab's row index), one bar
# per section (its columns). A single explicit Axes is used; the former
# plt.figure(...) followed by DataFrame.plot(figsize=...) produced a second, empty figure.
fig, ax = plt.subplots(figsize=(14, 6))
subtype_by_section.plot(kind='bar', stacked=False, ax=ax)

# The legend entries are the crosstab columns, i.e. section IDs (previously mis-titled
# 'Myeloid Subtype').
ax.legend(title='Section ID', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8)
ax.set_title("Myeloid Subtypes by Section ID")
ax.set_xlabel("Myeloid Subtype")
ax.set_ylabel("Cell Count")
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

- high MPO and CD11b positive -> Neutrophils
- CD68 positive and CD14 positive -> Macrophages
- high HLA-DR -> APCs
- high CD8, CD3e -> T cells
- else pro-inflammatory myeloids


MPO: 
0.000000	0.006411	171022
0.038467	0.044879	40607
0.160281	0.166692	22590
0.211571	0.217982	23315
0.333384	0.339795	27784

CD11b: this is the threshold at which the values are pretty much the same, more than 0.044879
7	0.044879	0.051290	19894

HLA-DR:
0.019234	0.025645	27472
0.025645	0.032056	1048

CD68:
0.032056	0.038467	137232
0.032056	0.038467	90615

CD14: more than 0.019234
0.012822	0.019234	225848
0.012822	0.019234	224503

In [None]:
sc.pl.matrixplot(myeloid, myeloid.var_names, groupby="myeloid_subtype_hist_gating", standard_scale= "group")

In [None]:
sc.pl.matrixplot(myeloid, myeloid.var_names, groupby="myeloid_subtype_hist_gating", standard_scale= None)

In [None]:
# Marker panel for the subtype-level matrix/dot plots below: lineage markers first,
# followed by signaling, checkpoint and proliferation markers.
myeloid_markers = ["CD11b_intensity", "CD14_intensity", "CD68_intensity", "CD163_intensity", "MPO_intensity",
                   "HLA-DR_intensity", "BAFF_intensity", "Bcl-2_intensity", "CTLA4_intensity", "IFNG_intensity",
                   "IKKa_intensity", "IkBa_intensity", "NFKB p65_intensity", "Phospho IKKa/b_intensity",
                   "TGFB1_intensity", "TIM3_intensity", "PDL1_intensity", "TACI_intensity",
                   "HIF1A_intensity", "Vimentin_intensity", "Ki67_intensity"]

In [None]:
sc.pl.matrixplot(myeloid, var_names=myeloid_markers, groupby="myeloid_subtype_hist_gating", standard_scale="group")

In [None]:
sc.pl.matrixplot(myeloid, var_names=myeloid_markers, groupby="myeloid_subtype_hist_gating", standard_scale=None)

In [None]:
sc.pl.dotplot(myeloid, var_names=myeloid_markers, groupby="myeloid_subtype_hist_gating", standard_scale="var")

In [None]:
sc.pl.dotplot(myeloid, var_names=myeloid_markers, groupby="myeloid_subtype_hist_gating", standard_scale="group")

In [None]:
# Independent copy of the cells gated as "Granulocytes", plus a checkpoint /
# suppression marker panel.
# NOTE(review): neither `granulocytes` nor this `markers` list is used again in the
# visible part of the notebook; `markers` also re-binds the notebook-wide name.
granulocytes = myeloid[myeloid.obs["myeloid_subtype_hist_gating"] == "Granulocytes"].copy()
markers = [
    "PD1_intensity", "PDL1_intensity", "CTLA4_intensity", 
    "TIM3_intensity", "TGFB1_intensity", "FOXP3_intensity", 
    "CD163_intensity", "CD14_intensity", "Bcl-2_intensity"
]

## Combining with merged

In [None]:
# Start from the manual gate as plain strings (so new labels can be written without
# categorical-level errors), then overwrite the myeloid cells with their refined
# histogram-gated subtype. Assignment is aligned on cell barcode (obs index).
# (The former `.copy()` assignment was overwritten immediately and has been removed.)
merged.obs['combined_label_myeloid'] = merged.obs['manual_gate_withT'].astype(str)
merged.obs.loc[myeloid.obs.index, 'combined_label_myeloid'] = myeloid.obs['myeloid_subtype_hist_gating'].astype(str)

In [None]:
myeloid.obs['myeloid_subtype_hist_gating'].value_counts()

In [None]:
merged.obs['manual_gate_withT'].value_counts()

In [None]:
merged.obs['combined_label_myeloid'].value_counts()

In [None]:
merged.write('/diskmnt/Users2/harshita/CODEX/merged_myeloid.h5ad')

In [None]:
merged = sc.read_h5ad('/diskmnt/Users2/harshita/CODEX/merged_myeloid.h5ad')

In [None]:
merged.obs['combined_label_myeloid'].value_counts()

In [None]:
# Row order for the annotated dot plot: unassigned/unknown first, then progenitors,
# the four refined myeloid subtypes, T-cell classes, B/plasma cells and stroma.
gate_order = [
    "Unassigned", "Unk. CD45+", 
    "Progenitor Cell", 
    "Granulocytes", "High APCs", "Monocytes", "Other myeloids",
    "CD4 T Cell", "CD8 T Cell", "T Cell",
    "B Cell", "Plasma Cell",
    "Stroma"]

# Lineage-defining markers grouped by compartment; the dict keys become the group
# brackets drawn above the dot plot columns.
celltyping_markers = {
    "Imm": ['CD45_intensity'],
    "Prog": ['CD34_intensity', 'cKit_intensity'], 
    "Mye": ['MPO_intensity','CD11b_intensity', 'HLA-DR_intensity', 'CD163_intensity', 'CD14_intensity',  'CD68_intensity'],
    "T": ['CD3e_intensity', 'CD4_intensity', 'CD8_intensity', 'Granzyme-B_intensity'], #'CD45RO_intensity', 'CD44_intensity'],
    "B": ['CD79a_intensity', 'CD20_intensity', 'CD27_intensity'],
     "PC": ['CD138_intensity', 'CD38_intensity'],
    "Strom":['Vimentin_intensity']
}

In [None]:
# Dot plot of lineage markers across every combined cell-type label, in the curated
# row order. return_fig=True yields the plot object so it can be written to PDF.
dotplot_kwargs = dict(
    var_names=celltyping_markers,
    groupby="combined_label_myeloid",
    categories_order=gate_order,
    standard_scale="var",
    show=False,
    return_fig=True,
)
fig = sc.pl.dotplot(merged, **dotplot_kwargs)
fig.savefig("myeloid_annotated_dotplot.pdf", bbox_inches="tight")


In [None]:
celltyping_markers['Mye']

In [None]:
# Myeloid-only dot plot: restricting to the four refined subtypes means the per-marker
# scaling (standard_scale="var") is computed among myeloid cells only.
myeloid_labels = ["Granulocytes", "High APCs", "Monocytes", "Other myeloids"]
is_myeloid = merged.obs['combined_label_myeloid'].isin(myeloid_labels)
mye = merged[is_myeloid].copy()

fig = sc.pl.dotplot(mye, var_names=celltyping_markers['Mye'],
                    groupby="combined_label_myeloid", standard_scale="var",
                    show=False, return_fig=True)
fig.savefig("myeloidOnly_annotated_dotplot.pdf", bbox_inches="tight")


In [None]:
# T-cell-only dot plot: per-marker scaling is computed among the three T-cell classes.
tcell_labels = ["CD4 T Cell", "CD8 T Cell", "T Cell"]
is_tcell = merged.obs['combined_label_myeloid'].isin(tcell_labels)
t = merged[is_tcell].copy()

fig = sc.pl.dotplot(t, var_names=celltyping_markers['T'],
                    groupby="combined_label_myeloid", standard_scale="var",
                    show=False, return_fig=True)
fig.savefig("TcellOnly_annotated_dotplot.pdf", bbox_inches="tight")


In [None]:
# B cells only, grouped by timepoint in clinical order (Normal -> NDMM -> PT).
is_bcell = merged.obs['combined_label_myeloid'].isin(["B Cell"])
b = merged[is_bcell].copy()

collection_order = ["Normal", "NDMM", "PT"] 
# An ordered categorical fixes the row order of the dot plot.
b.obs["Timepoint"] = pd.Categorical(b.obs["Timepoint"], categories=collection_order, ordered=True)

bcell_state_markers = [ "Ki67_intensity", "Bcl-2_intensity", "TACI_intensity"]
fig = sc.pl.dotplot(b, var_names=bcell_state_markers, groupby=["Timepoint"],
                    standard_scale="var", show=False, return_fig=True)
fig.savefig("BOnly_annotated_dotplot.pdf", bbox_inches="tight")

In [None]:
# compare CT abundance across timepoints
collection_order = ["Normal", "NDMM", "PT"] 
timecols = {"Normal": "#0C7515", "NDMM": "#E619B9", "PT": "#CF99C3"} 

obs = merged.obs 

# Cells per (patient, timepoint, cell type). observed=True keeps only combinations
# that actually occur.
# NOTE(review): a patient/timepoint with zero B cells therefore has no row below and
# is left out of the per-timepoint mean rather than contributing a 0 — confirm intended.
counts = (
    obs.groupby(["Patient_ID", "Timepoint", "combined_label_myeloid"], observed=True)
       .size()
       .reset_index(name="n")
)

# Total cells per (patient, timepoint): the denominator of the fraction.
totals = (
    counts.groupby(["Patient_ID", "Timepoint"], observed=True)["n"]
          .sum()
          .reset_index(name="total_n")
)

freq = counts.merge(totals, on=["Patient_ID", "Timepoint"])
freq["frac"] = freq["n"] / freq["total_n"]


bcell = freq[freq["combined_label_myeloid"] == "B Cell"].copy()
bcell["Timepoint"] = pd.Categorical(
    bcell["Timepoint"],
    categories=collection_order,
    ordered=True
)

# Compute mean fractions per Timepoint (NaN for a timepoint with no B-cell rows)
mean_lines = (
    bcell.groupby("Timepoint", observed=True)["frac"]
         .mean()
         .reindex(collection_order)
)

# plot
fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(
    data=bcell,
    x="Patient_ID",
    y="frac",
    hue="Timepoint",
    hue_order=collection_order,
    palette=[timecols[t] for t in collection_order],
    dodge=True,
    ax=ax
)

# Add horizontal dotted lines for each timepoint's mean.
# After the reindex above every timepoint is in the index, so test the value itself:
# a timepoint without data has a NaN mean and must not get a line or a legend entry.
for tp in collection_order:
    tp_mean = mean_lines.loc[tp]
    if pd.notna(tp_mean):
        ax.axhline(
            tp_mean,
            linestyle="--",
            linewidth=1.2,
            color=timecols[tp],
            alpha=0.8,
            label=f"{tp} mean"
        )

# De-duplicate legend entries (bars and mean lines can repeat labels).
handles, labels = ax.get_legend_handles_labels()
uniq = dict(zip(labels, handles))
ax.legend(uniq.values(), uniq.keys(), bbox_to_anchor=(1.05, 1), loc="upper left")

ax.set_ylabel("Fraction B Cell")
ax.set_xlabel("Patient")
ax.set_title("Fraction of B Cells by Patient and Timepoint")

ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
#plt.show()

# bbox_inches="tight" (as used for the other saved figures) keeps the legend placed
# outside the axes inside the saved page.
fig.savefig("B_frac_perPatient_barplot.pdf", bbox_inches="tight")

In [None]:
merged.obs

# Neighborhood analysis

In [None]:
merged

In [None]:
merged.obs['Timepoint']

In [None]:
merged.obs['Patient_ID']

In [None]:
df = merged.obs.copy()

In [None]:
# Lookup table Patient_ID -> Timepoint, built from the first row seen for each patient.
# NOTE(review): drop_duplicates("Patient_ID") keeps a single timepoint per patient; if
# a Patient_ID can appear at more than one timepoint, the others are silently dropped —
# verify that Patient_ID is unique per sample.
base = (df[["Patient_ID", "Timepoint"]]
          .drop_duplicates("Patient_ID"))
Patient_ID_to_Timepoint = base.set_index("Patient_ID")["Timepoint"].to_dict()

In [None]:
# Radial neighborhood composition: for every cell, count how many cells of each
# annotation lie within `radius` of it, searching within the cell's own sample only.
sample_col = "Patient_ID"
annot_col  = "combined_label_myeloid"
xcol, ycol = "row", "col"
radius = 100.0  # same units as the row/col coordinates — TODO confirm (pixels vs. microns)
df[annot_col] = df[annot_col].astype("category")

per_cell_frames = []
totals_rows = []

# Fixed, global category list so every sample produces the same count columns.
all_annots = df[annot_col].cat.categories
ann_idx = pd.Index(all_annots)

for s, g in df.groupby(sample_col, sort=False, observed=True):
    print(s)
    g = g.copy()

    # per-sample totals for annot
    vc = g[annot_col].value_counts().reindex(ann_idx, fill_value=0)
    totals_rows.append(pd.Series({"Sample": s, **vc.to_dict(), "total_cells": len(g)}))

    # neighbor search
    # The query points are the same points the index was fitted on, so each cell's own
    # index is presumably part of its neighbor list (distance 0) and is counted in
    # total_neighbors — NOTE(review): confirm self-inclusion is intended.
    X = g[[xcol, ycol]].to_numpy(dtype=np.float32)
    nn = NearestNeighbors(radius=radius, metric="euclidean", algorithm="kd_tree")
    nn.fit(X)
    neigh_ind = nn.radius_neighbors(X, radius=radius, return_distance=False)  # list of arrays

    # per-annot neighbor counts via bincount
    # `codes` are integer positions into all_annots, in the same row order as X, so
    # codes[inds] gives the annotation code of each neighbor.
    # NOTE(review): a missing annotation would get code -1, which np.bincount rejects.
    codes = pd.Categorical(g[annot_col], categories=all_annots, ordered=True).codes
    K = len(all_annots)
    n = len(g)
    counts_mat = np.zeros((n, K), dtype=np.int32)

    for i, inds in enumerate(neigh_ind):
        if inds.size:
            counts_mat[i] = np.bincount(codes[inds], minlength=K)

    total_n_neighbors = counts_mat.sum(axis=1).astype(np.int32)

    # assemble per-cell output for this sample (retain x/y centroids)
    counts_df = pd.DataFrame(counts_mat, index=g.index, columns=all_annots)
    df_out = pd.concat([pd.DataFrame({sample_col: s}, index=g.index), g[[xcol, ycol]].copy(),counts_df], axis=1)
    df_out["total_neighbors"] = total_n_neighbors
    per_cell_frames.append(df_out)

# One row per cell (indexed by barcode) and one row per sample, respectively.
per_cell_df = pd.concat(per_cell_frames, axis=0)
per_sample_totals = pd.DataFrame(totals_rows).set_index("Sample")

In [None]:
per_cell_df

In [None]:
sns.displot(per_cell_df['total_neighbors'])

In [None]:
thresh = 10 #using a threshold for 10 cells
sns.displot(per_cell_df['total_neighbors'])
# axvline spans the full height of the axes whatever the histogram's scale is; the
# former plt.vlines(..., ymax=35000) hard-coded the height, which either stops short
# of the tallest bar or stretches the y-axis when the data change.
plt.axvline(thresh, color='red')

In [None]:
# Keep only cells with at least `thresh` cells inside their search radius, so
# neighborhood fractions are not computed from a handful of neighbors.
per_cell_df_filtered = per_cell_df[per_cell_df['total_neighbors']>=thresh].copy()
per_cell_df_filtered
# Disabled multiplet handling, kept for reference:
#per_cell_df_filtered['total_singlet_neighbors'] = per_cell_df_filtered['total_neighbors'].astype(int) - per_cell_df_filtered['Multiplet'].astype(int)
#per_cell_df_filtered_no_multiplet = per_cell_df_filtered.drop(['Multiplet'], axis=1)
#per_cell_df_filtered_no_multiplet

In [None]:
# Convert per-cell neighbor counts into per-cell neighbor fractions.
countsdf = per_cell_df_filtered.copy()

id_cols = ["Patient_ID", "row", "col", "total_neighbors"]
filtered_annots = ['Granulocytes', 'Other myeloids', 'Plasma Cell', 'Unassigned', 'High APCs', 'Unk. CD45+',
                   'Stroma', 'Monocytes', 'B Cell', 'CD8 T Cell', 'CD4 T Cell', 'T Cell', 'Progenitor Cell']

# Zero totals become NaN so the division cannot produce inf; such rows end up as 0.0.
denom = countsdf["total_neighbors"].replace(0, np.nan)

# fractions table: keep identifiers/coords, replace counts with fractions
fractions = countsdf[id_cols].copy()
fractions[filtered_annots] = countsdf[filtered_annots].div(denom, axis=0).fillna(0.0)
fractions

In [None]:
countsdf

In [None]:
# we want to find the optimal k for k-means clustering that maximizes:
# 1. sample representation in each cluster, which we measure with shannon diversity index,
# 2. "uniqueness" of each cluster, which we measure with average distance between cluster centroids
# 3. Collection (Normal, NDMM, PT) separation, which we measure using per-cluster entropy (lower is better)

# NOTE: `df` is re-bound here; it no longer refers to the copy of merged.obs made earlier.
df = fractions[['Patient_ID']].copy()
df['Timepoint'] = df['Patient_ID'].map(Patient_ID_to_Timepoint)

# Feature matrix: the per-annotation neighbor fractions only (columns named after an
# annotation category), standardized per column before clustering.
X = fractions[[c for c in fractions.columns if c in all_annots]].fillna(0).values
Xs = StandardScaler().fit_transform(X)

ks = range(2, 16)
rows = []

for k in ks:
    print(f"evaluation k = {k}")
    km = KMeans(n_clusters=k, random_state=42, n_init="auto")
    labels = km.fit_predict(Xs)

    tmp = pd.DataFrame({
        'Patient_ID': df['Patient_ID'].values,
        'Timepoint': df['Timepoint'].values,
        'cluster': labels
    })

    # shannon: entropy of the patient composition of each cluster, averaged over clusters
    sh = []
    for _, g in tmp.groupby('cluster'):
        counts = g['Patient_ID'].value_counts().values.astype(float)
        p = counts / counts.sum()
        sh.append(entropy(p))              # natural log base is fine
    mean_shannon = float(np.mean(sh)) if sh else 0.0

    # uniqueness: mean pairwise distance between centroids (upper triangle, no diagonal)
    pdists = euclidean_distances(km.cluster_centers_)
    centroid_sep = float(pdists[np.triu_indices(k, 1)].mean()) if k > 1 else 0.0

    # separate collection: entropy of the timepoint mix per cluster; negated so that
    # purer clusters (lower entropy) give a higher score
    collection_entropies = []
    for cid, group in tmp.groupby('cluster'):
        counts = group['Timepoint'].value_counts()
        p = counts / counts.sum()
        collection_entropies.append(entropy(p, base=np.e))
    collection_sep = -np.mean(collection_entropies)

    # composite: plain (unweighted) mean of the three terms.
    # NOTE(review): the terms are on different scales (entropies in nats vs. a distance in
    # standardized feature space) and are not normalized, so one term may dominate — confirm.
    composite = (mean_shannon + centroid_sep + collection_sep) / 3.0

    rows.append({
        'k': k,
        'mean_shannon': mean_shannon,
        'centroid_sep': centroid_sep,
        'collection_sep': collection_sep,
        'composite': composite
    })

# Highest composite score first; its k is used for the final clustering.
res = pd.DataFrame(rows).sort_values('composite', ascending=False)
best_k = int(res.iloc[0]['k'])

# Final fit and attach labels (same seed and settings as in the sweep)
final = KMeans(n_clusters=best_k, random_state=42, n_init="auto")
fractions['cluster'] = final.fit_predict(Xs)

print(res)
print("Chosen k =", best_k)

In [None]:
fractions['cluster'].value_counts()

In [None]:
# Hand-curated display names (and display order) for the k-means clusters.
# The keys assume the final clustering produced exactly the ids 0..11.
name_map = {
    8: 'RN1', 
    10: 'RN2', 
    3: 'RN3', 
    1: 'RN4', 
    4: 'RN5', 
    6: 'RN6',
    5: 'RN7',
    7: 'RN8',
    2: 'RN9',
    9: 'RN10',
    0: 'RN11',
    11: 'RN12'
}

# k is chosen dynamically, so guard against a clustering whose ids are not all
# covered by the map; otherwise Series.map would silently turn those cells into NaN
# and they would disappear from every downstream count.
unmapped = sorted(set(fractions["cluster"].unique()) - set(name_map))
if unmapped:
    raise ValueError(
        f"Clusters {unmapped} have no entry in name_map; "
        "update name_map to match the chosen number of clusters."
    )

fractions["cluster_name"] = fractions["cluster"].map(name_map)
order = list(name_map.values())
print(order)
# Ordered categorical so plots and tables follow RN1..RN12 rather than alphabetical order.
fractions["cluster_name"] = pd.Categorical(fractions["cluster_name"], categories=order, ordered=True)
fractions

In [None]:
rn_counts = fractions['cluster_name'].value_counts().reindex(order)
print(rn_counts)

In [None]:
# Copy the neighborhood name onto the counts table; the assignment is aligned on the
# shared cell index of `fractions` and `countsdf`.
countsdf['radial_neighborhood'] = fractions['cluster_name']
countsdf

In [None]:
rn_counts = countsdf['radial_neighborhood'].value_counts().reindex(order)
print(rn_counts)

In [None]:
# Bar chart of the number of cells assigned to each radial neighborhood.
df_2 = rn_counts.reset_index()
df_2.columns = ['Radial Neighborhood', 'Count']

fig, ax = plt.subplots(figsize=(10, 5))

cats = df_2['Radial Neighborhood'].unique()
x = np.arange(len(cats))
counts_by_rn = df_2.set_index('Radial Neighborhood')['Count']

# width=1.0 makes adjacent bars touch.
ax.bar(x, counts_by_rn.loc[cats], width=1.0, color='navy')

ax.set_xticks(x)
ax.set_xticklabels(cats, rotation=45, ha='right')

ax.set_ylabel("Cell Count")
ax.set_title("Cell Counts per Radial Neighborhood")
fig.tight_layout()

fig.savefig("rn_counts_fullwidth.pdf", bbox_inches="tight")
plt.show()

In [None]:
# Label of the center cell itself: look up each row's barcode (the index of countsdf)
# in merged.obs. Barcodes absent from merged.obs would map to NaN.
barcode_to_annot = merged.obs["combined_label_myeloid"].to_dict()
countsdf = countsdf.copy()
countsdf["combined_label_myeloid"] = countsdf.index.map(barcode_to_annot)
countsdf

In [None]:
# Number of center cells per (radial neighborhood, cell type) pair.
tally = (countsdf.groupby(["radial_neighborhood", "combined_label_myeloid"], observed=True)
    .size()
    .reset_index(name="n_cells")
)
# Wide table with neighborhoods as rows and cell types as columns (after the final
# transpose); combinations that never occur are filled with 0.
grouped = (
    tally.pivot(index="combined_label_myeloid", columns="radial_neighborhood", values="n_cells")
         .fillna(0)
).T
grouped

In [None]:
sns.clustermap(grouped.transpose(), cmap='viridis', col_cluster=False)


In [None]:
sns.clustermap(grouped.transpose(), cmap='viridis', standard_scale=0, col_cluster=False)

In [None]:
sns.clustermap(grouped.transpose(), cmap='viridis', standard_scale=1, col_cluster=False)

In [None]:
# Enrichment of every cell type in every radial neighborhood, from the
# neighborhood x cell-type count table `grouped`.
clusters = grouped.index
cell_types = grouped.columns

odds_matrix = pd.DataFrame(np.nan, index=clusters, columns=cell_types)
pval_matrix = pd.DataFrame(np.nan, index=clusters, columns=cell_types)


from scipy.stats import fisher_exact, hypergeom
from statsmodels.stats.multitest import multipletests

N = grouped.values.sum()         # total cells
row_totals = grouped.sum(axis=1) # cluster totals
col_totals = grouped.sum(axis=0) # cell-type totals

for rn in clusters:
    for ct in cell_types:

        # 2x2 contingency table for (this neighborhood) x (this cell type)
        a = grouped.loc[rn, ct]               # in RN & of type CT
        b = row_totals[rn] - a               # in RN & not CT
        c = col_totals[ct] - a               # not RN & type CT
        d = N - (a + b + c)                  # not RN & not CT
        tbl = np.array([[a, b], [c, d]])        
        # Fisher (enrichment): only the odds ratio is kept; it can be 0 or inf when a
        # cell of the table is empty.
        odds, fisher_p = fisher_exact(tbl, alternative='greater')
        odds_matrix.loc[rn, ct] = odds
        # Hypergeometric (over-representation): P(X >= a) = sf(a - 1).
        # This holds for a == 0 as well (sf(-1) == 1). The previous special case used
        # sf(0) = P(X >= 1) for a == 0, which could report an enrichment p-value below 1
        # for a cell type that is completely absent from the neighborhood.
        m = col_totals[ct]
        k = row_totals[rn]
        pval_matrix.loc[rn, ct] = hypergeom.sf(a - 1, N, m, k)
# Stabilize zeros (p-values that underflowed to exactly 0)
pvals = pval_matrix.replace(0, 1e-300).values.flatten()
# BH adjust across all neighborhood x cell-type tests at once
adj = multipletests(pvals, method='fdr_bh')[1]
# Back to DataFrame
pval_adj = pd.DataFrame(
    adj.reshape(pval_matrix.shape),
    index=clusters, columns=cell_types
)


In [None]:
# log2 odds ratio per (neighborhood, cell type). Odds of exactly 0 are masked to NaN
# before the log, and every NaN is then reported as 0.
# NOTE(review): an infinite odds ratio would pass through as +inf and make that
# column's mean/std non-finite - confirm none occur in odds_matrix.
nonzero_odds = odds_matrix.replace(0, np.nan)
logOR = np.log2(nonzero_odds).fillna(0)

# Standardize each column: subtract the column mean, divide by the column std.
scaled = (logOR - logOR.mean()) / logOR.std()


def p_to_star(p):
    """Return '.' when p is below 0.05, otherwise an empty string."""
    return '.' if p < 0.05 else ''


# Significance marker for every adjusted p-value, same layout as pval_adj.
ann = pval_adj.applymap(p_to_star)

In [None]:
# Inspect the unscaled log2 odds ratios.
logOR

In [None]:
# The clustermap is used to obtain a hierarchical ordering of the rows of scaled.T;
# columns keep their existing order.
g = sns.clustermap(scaled.T, cmap='coolwarm', row_cluster=True, col_cluster=False)
row_order = g.dendrogram_row.reordered_ind

# Apply that row ordering to both the values and their significance markers.
scaled_reordered = scaled.transpose().iloc[row_order]
ann_reordered = ann.transpose().iloc[row_order]

# Annotated heatmap; fmt='' is needed because the annotations are strings.
plt.figure(figsize=(12, 8))
heatmap_kwargs = dict(
    cmap='coolwarm',
    center=0,  # vmin=-1, vmax=1,
    annot=ann_reordered,
    fmt='',
    square=True,
    linewidths=0,
    cbar_kws={'label': 'Scaled log2(OR)'},
)
ax = sns.heatmap(scaled_reordered, **heatmap_kwargs)
plt.tight_layout()
plt.savefig("codex_radialneighborhood_k12_enrichment_heatmap.pdf", bbox_inches="tight")
plt.close()  # closes the heatmap figure (the current one) after saving

In [None]:
# Count heatmap with rows arranged by row_order (the dendrogram order taken from the
# enrichment clustermap); clustering is disabled on both axes so that order is kept.
grouped_reordered = grouped.transpose().iloc[row_order, :]
sns.clustermap(grouped_reordered, cmap='viridis', standard_scale=0, col_cluster=False, row_cluster = False)

## Radial neighborhood plots

In [None]:
# Attach the Timepoint value from merged.obs to countsdf, looked up by index label;
# labels that are not keys of the dict map to NaN.
barcode_to_time = merged.obs["Timepoint"].to_dict()

countsdf["Timepoint"] = countsdf.index.map(barcode_to_time)
countsdf

In [None]:
# Radial-neighborhood breakdown for a single patient.
# NOTE(review): the rows are selected with Patient_ID '20799' while the labels say
# "P143" - confirm these refer to the same patient.
p143 = countsdf[countsdf['Patient_ID'] == '20799'].copy()

# Cells per neighborhood for this patient, as a two-column table.
rn_counts = (
    p143["radial_neighborhood"]
    .value_counts()
    .rename_axis("radial_neighborhood")
    .reset_index(name="count")
)
# Convert counts to the fraction of this patient's counted cells.
rn_counts["prop"] = rn_counts["count"].div(rn_counts["count"].sum())

plt.figure(figsize=(6,4))
sns.barplot(data=rn_counts, x="radial_neighborhood", y="prop", color="navy")
plt.xticks(rotation=45, ha="right")
plt.ylabel("Fraction of Cells")
plt.title("Radial Neighborhood Composition — P143")
plt.tight_layout()
plt.show()



In [None]:
from matplotlib.backends.backend_pdf import PdfPages
from statannotations.Annotator import Annotator

# Radial-neighborhood proportions per (Patient_ID, Timepoint), compared between
# timepoints. Writes one PDF page per neighborhood: box + strip plot, gray lines
# joining the same patient across timepoints, and Mann-Whitney annotations.

# `timecols` is consumed by the boxplots below, but in the visible part of the
# notebook it is only assigned in a later cell, so a fresh top-to-bottom run would
# stop here with a NameError. Fall back to that same palette when it is not defined.
if "timecols" not in globals():
    timecols = {"Normal": "#0C7515", "NDMM": "#E619B9", "PT": "#CF99C3"}

obs = countsdf.copy()

# Count cells per Patient_ID × Timepoint × RN. With categorical grouping columns,
# observed=False also emits combinations that never occur, with n = 0.
counts = (
    obs.groupby(["Patient_ID", "Timepoint", "radial_neighborhood"], observed=False)
       .size()
       .reset_index(name="n")
)

# Total cells per Patient_ID × Timepoint
totals = (
    counts.groupby(["Patient_ID", "Timepoint"], observed=True)["n"]
          .sum()
          .reset_index(name="total_n")
)

# Proportion of the sample's cells in each neighborhood (NaN where total_n is 0).
df = counts.merge(totals, on=["Patient_ID", "Timepoint"])
df["prop"] = df["n"] / df["total_n"]

# Fixed plotting order; Timepoint values outside this list become NaN.
collection_order = ["Normal", "NDMM", "PT"]
df["Timepoint"] = pd.Categorical(df["Timepoint"], categories=collection_order, ordered=True)

# Every pairwise timepoint comparison is tested on each page.
pairs = [("Normal","NDMM"), ("NDMM","PT"), ("Normal","PT")]

with PdfPages("RN_boxplots_mannwhitney.pdf") as pdf:
    for rn in sorted(df["radial_neighborhood"].unique()):
        sub = df[df["radial_neighborhood"] == rn]

        plt.figure(figsize=(2, 4))
        ax = sns.boxplot(
            data=sub,
            x="Timepoint",
            y="prop",
            order=collection_order,
            palette=timecols,
            fliersize=0   # outliers are already drawn by the strip plot
        )
        sns.stripplot(
            data=sub,
            x="Timepoint",
            y="prop",
            order=collection_order,
            color="black",
            alpha=0.8,
            size=3
        )

        # Join the points of each patient that has values at two or more timepoints.
        for upn, g in sub.groupby("Patient_ID"):
            g = g.set_index("Timepoint").reindex(collection_order)   # ensure correct order
            if g["prop"].notna().sum() >= 2:
                # x positions are 0,1,2 for Timepoint
                xs = [i for i, t in enumerate(collection_order) if pd.notna(g.loc[t, "prop"])]
                ys = [g.loc[t, "prop"] for t in collection_order if pd.notna(g.loc[t, "prop"])]
                ax.plot(xs, ys, color="gray", linewidth=1, alpha=0.5, zorder=1)

        annot = Annotator(
            ax,
            pairs,
            data=sub,
            x="Timepoint",
            y="prop",
            order=collection_order
        )
        # Mann-Whitney per pair, BH-corrected across the pairs on this page;
        # non-significant comparisons are not drawn.
        annot.configure(
            test="Mann-Whitney",
            text_format="star",
            loc="inside",
            comparisons_correction="BH",
            hide_non_significant=True
        )
        annot.apply_and_annotate()

        plt.ylabel(f"{rn} proportion")
        plt.title(f"{rn}")
        plt.tight_layout()

        pdf.savefig()
        plt.close()   # release the figure before the next page

In [None]:
from matplotlib.backends.backend_pdf import PdfPages
from statannotations.Annotator import Annotator

# Cell-type proportions per (Patient_ID, Timepoint) with plasma cells excluded,
# compared between timepoints. Writes one PDF page per cell type: box + strip plot,
# gray lines joining the same patient across timepoints, and Mann-Whitney annotations.

# `timecols` is consumed by the boxplots below, but in the visible part of the
# notebook it is only assigned in a later cell, so a fresh top-to-bottom run would
# stop here with a NameError. Fall back to that same palette when it is not defined.
if "timecols" not in globals():
    timecols = {"Normal": "#0C7515", "NDMM": "#E619B9", "PT": "#CF99C3"}

# Use merged.obs.copy() here instead to include every cell type.
obs = merged.obs[merged.obs['combined_label_myeloid'] != 'Plasma Cell'].copy()

# If the label column is categorical, the filtered-out 'Plasma Cell' level is still
# registered, and observed=False below would bring it back as an all-zero group with
# its own PDF page. Drop unused levels so the exclusion actually holds.
if isinstance(obs['combined_label_myeloid'].dtype, pd.CategoricalDtype):
    obs['combined_label_myeloid'] = obs['combined_label_myeloid'].cat.remove_unused_categories()

# Count cells per Patient_ID × Timepoint × cell type. With categorical grouping
# columns, observed=False also emits combinations that never occur, with n = 0.
counts = (
    obs.groupby(["Patient_ID", "Timepoint", "combined_label_myeloid"], observed=False)
       .size()
       .reset_index(name="n")
)

# Total (non-plasma) cells per Patient_ID × Timepoint
totals = (
    counts.groupby(["Patient_ID", "Timepoint"], observed=True)["n"]
          .sum()
          .reset_index(name="total_n")
)

# Proportion of the sample's cells in each cell type (NaN where total_n is 0).
df = counts.merge(totals, on=["Patient_ID", "Timepoint"])
df["prop"] = df["n"] / df["total_n"]

# Fixed plotting order; Timepoint values outside this list become NaN.
collection_order = ["Normal", "NDMM", "PT"]
df["Timepoint"] = pd.Categorical(df["Timepoint"], categories=collection_order, ordered=True)

# Every pairwise timepoint comparison is tested on each page.
pairs = [("Normal","NDMM"), ("NDMM","PT"), ("Normal","PT")]

with PdfPages("celltype_boxplots_exclPC_mannwhitney.pdf") as pdf:
    for cell_type in sorted(df["combined_label_myeloid"].unique()):
        sub = df[df["combined_label_myeloid"] == cell_type]

        plt.figure(figsize=(2, 4))
        ax = sns.boxplot(
            data=sub,
            x="Timepoint",
            y="prop",
            order=collection_order,
            palette=timecols,
            fliersize=0   # outliers are already drawn by the strip plot
        )
        sns.stripplot(
            data=sub,
            x="Timepoint",
            y="prop",
            order=collection_order,
            color="black",
            alpha=0.8,
            size=3
        )

        # Join the points of each patient that has values at two or more timepoints.
        for upn, g in sub.groupby("Patient_ID"):
            g = g.set_index("Timepoint").reindex(collection_order)   # ensure correct order
            if g["prop"].notna().sum() >= 2:
                # x positions are 0,1,2 for Timepoint
                xs = [i for i, t in enumerate(collection_order) if pd.notna(g.loc[t, "prop"])]
                ys = [g.loc[t, "prop"] for t in collection_order if pd.notna(g.loc[t, "prop"])]
                ax.plot(xs, ys, color="gray", linewidth=1, alpha=0.5, zorder=1)

        annot = Annotator(
            ax,
            pairs,
            data=sub,
            x="Timepoint",
            y="prop",
            order=collection_order
        )
        # Mann-Whitney per pair, BH-corrected across the pairs on this page;
        # non-significant comparisons are not drawn.
        annot.configure(
            test="Mann-Whitney",
            text_format="star",
            loc="inside",
            comparisons_correction="BH",
            hide_non_significant=True
        )
        annot.apply_and_annotate()

        plt.ylabel(f"{cell_type} proportion")
        plt.title(f"{cell_type}")
        plt.tight_layout()

        pdf.savefig()
        plt.close()   # release the figure before the next page

In [None]:
# Labels treated as myeloid, and the color assigned to each timepoint.
myeloid_types = ["Granulocytes", "Other myeloids", "High APCs", "Monocytes"]
timecols = {"Normal": "#0C7515", "NDMM": "#E619B9", "PT": "#CF99C3"}

# Keep only the cells whose label is one of the myeloid types.
is_myeloid = merged.obs['combined_label_myeloid'].isin(myeloid_types)
myeloid_df = merged.obs[is_myeloid].copy()

# Cell count for every (section, timepoint, subtype) group.
group_sizes = myeloid_df.groupby(["section_id", "Timepoint", "combined_label_myeloid"]).size()
percent_df = group_sizes.reset_index(name="count")

# Per-row total of myeloid cells in the same (section, timepoint), then percentage.
totals = percent_df.groupby(["section_id", "Timepoint"])["count"].transform("sum")
percent_df["percent"] = percent_df["count"] / totals * 100

# Wide layout: one row per (section, timepoint), one column per subtype.
plot_df = percent_df.pivot_table(
    index=["section_id", "Timepoint"],
    columns="combined_label_myeloid",
    values="percent",
    fill_value=0,
).reset_index()

# Arrange sections Normal -> NDMM -> PT.
timepoint_rank = {"Normal": 0, "NDMM": 1, "PT": 2}
plot_df = plot_df.sort_values("Timepoint", key=lambda col: col.map(timepoint_rank))

# Stacked bars: `bottom` accumulates the height already drawn for every section.
fig, ax = plt.subplots(figsize=(12, 6))
bottom = np.zeros(len(plot_df))

for subtype in myeloid_types:
    subtype_percent = plot_df[subtype]
    ax.bar(plot_df["section_id"], subtype_percent, bottom=bottom, label=subtype)
    bottom += subtype_percent.values

ax.set_ylabel("Percent of Myeloid Cells per Section")
ax.set_xlabel("Section ID (Grouped by Timepoint)")
ax.set_title("Distribution of Myeloid Subtypes Across Sections and Timepoints")
ax.set_xticks(range(len(plot_df)))
ax.set_xticklabels(plot_df["section_id"], rotation=90, ha='center')
ax.legend(title="Myeloid Subtype", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# List each distinct (section_id, Timepoint) pair, sorted by Timepoint.
merged.obs[['section_id', 'Timepoint']].drop_duplicates().sort_values('Timepoint')

In [None]:
# Display the per-cell metadata table.
merged.obs

In [None]:
# Export the per-cell metadata table to CSV, keeping the index as the first column.
merged.obs.to_csv("/diskmnt/Users2/harshita/CODEX/merged_myeloid_obs.csv", index=True)