In [None]:
# update the merged object by removing doublets detected by lineage subcluster annotation 
# and add subset labels

import anndata
import scanpy as sc
import pandas as pd
import glob
import skimage
import re
import scanpy.external as sce
import matplotlib.pyplot as plt
from pathlib import Path

# Export settings so that figure text stays editable in vector outputs.
plt.rcParams.update({
    'svg.fonttype': 'none',
    'pdf.fonttype': 42,  # make text editable in pdf
})


import os
# Every relative path used in the cells below is resolved against this directory.
os.chdir("/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/revision/merge/no_harmony")

In [None]:
# Print the working directory first: the file below is opened with a relative path.
print(os.getcwd())
adata = sc.read_h5ad(filename='combined_noDoublets.h5ad')

In [None]:
# One annotated AnnData per lineage label found in the merged object,
# read from <base_dir>/<lineage>/annotated.h5ad and keyed by that label.
base_dir = "/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/revision/merge/no_harmony"
lin_objs = {
    lineage: sc.read_h5ad(f"{base_dir}/{lineage}/annotated.h5ad")
    for lineage in adata.obs['lin'].unique()
}

In [None]:
# Gather per-cell annotations from every lineage-level object.
# Only barcodes that appear in one of these objects end up in `keep_barcodes`,
# which is what later restricts the merged object to the retained cells.
subset_mapping = {} # key is cell barcode, value is subset
lin_mapping = {} # key is cell barcode, value is updated lineage
color_mapping = {} # key is subset name, value is the colour stored for it
keep_barcodes = []

for lin, lin_adata in lin_objs.items():
    lin_obs = lin_adata.obs

    subset_mapping.update(zip(lin_obs.index, lin_obs['subset']))

    # The previous version read obs['subset'] here as well, which made
    # lin_mapping an exact copy of subset_mapping. Read the lineage column
    # instead; fall back to the old behaviour if the object has no 'lin' column.
    # NOTE(review): 'lin' is the lineage column used on the merged object —
    # confirm it is also the column holding the *updated* lineage in these files.
    lineage_col = 'lin' if 'lin' in lin_obs.columns else 'subset'
    lin_mapping.update(zip(lin_obs.index, lin_obs[lineage_col]))

    # Colours are stored positionally, one per category of obs['subset'].
    if 'subset_colors' in lin_adata.uns:
        color_mapping.update(
            zip(lin_obs['subset'].cat.categories, lin_adata.uns['subset_colors'])
        )

    keep_barcodes += lin_obs.index.to_list()

len(subset_mapping)

In [None]:
# Restrict the merged object to the barcodes collected from the lineage objects;
# .copy() turns the view into an independent object that can be modified below.
obj = adata[keep_barcodes, :].copy()
obj.shape

In [None]:
# Set the colours for 'PC' and 'MKC' by hand (overrides any value collected above).
color_mapping.update({
    'PC': '#ffbafd',
    'MKC': '#000000',
})
color_mapping

In [None]:
# Look up the subset label of every cell by barcode and store it as a categorical column.
subset_labels = [subset_mapping.get(barcode) for barcode in obj.obs.index]
obj.obs['subset'] = pd.Series(subset_labels, index=obj.obs.index).astype('category')
obj.obs['subset'].value_counts()

In [None]:
# Order in which the subset labels are arranged when the column is made an
# ordered categorical.
subset_order = [
    "MSC", "HSPC", "Early Ery",
    "Late Ery", "MKC", "Neutrophil",
    "CD14 Mc", "CD16 Mc / TAM", "cDC",
    "pDC", "CLP", "CD4T",
    "CD8T", "T Stim/Exh", "gdT/NK",
    "Pro/Pre B", "Immature B", "Transitional B",
    "Naive B", "Memory B", "PC",
]

# Labels present in the data but absent from subset_order (expected: empty set)
subs = obj.obs['subset'].unique().tolist()
set(subs).difference(subset_order)

In [None]:
# Validate before mutating anything: pd.Categorical silently replaces every
# value that is not listed in `categories` with NaN, so a label missing from
# subset_order would otherwise lose its annotation without any warning.
unlisted_subsets = set(obj.obs['subset'].dropna().unique()) - set(subset_order)
if unlisted_subsets:
    raise ValueError(
        f"subset labels missing from subset_order: {sorted(unlisted_subsets)}"
    )

# Every category needs a colour; report all missing ones at once instead of
# failing on the first lookup after obs has already been changed.
uncoloured_subsets = [name for name in subset_order if name not in color_mapping]
if uncoloured_subsets:
    raise KeyError(f"no colour in color_mapping for: {uncoloured_subsets}")

obj.obs['subset'] = pd.Categorical(
    obj.obs['subset'], categories=subset_order, ordered=True
)

# Colours are stored positionally, one per category, in category order.
subset_colors = [
    color_mapping[cell_type] for cell_type in obj.obs['subset'].cat.categories
]
obj.uns['subset_colors'] = subset_colors

In [None]:
# UMAP coloured by subset, exported as PDF
fig, ax = plt.subplots(figsize=(6, 6))
sc.pl.umap(obj, color='subset', size=3, show=False, ax=ax)
ax.set_aspect('equal')
# Rasterize the artists held in ax.collections before saving.
for artist in ax.collections:
    artist.set_rasterized(True)
fig.savefig("subset_UMAP_cleaned.pdf", bbox_inches="tight")

In [None]:
# Persist the cleaned, annotated object (relative to the working directory).
obj.write_h5ad('combined_cleaned.h5ad')

In [None]:
# Reload the saved object so the cells below can also be run from this point.
obj = sc.read_h5ad(filename='combined_cleaned.h5ad')

In [None]:
# Marker genes for the dot plot; each key labels the group of genes listed under it.
subset_markers = {
    "MSC": ["LEPR", "KITLG", "CXCL12"],
    "HSPC": ["SPINK2", "AVP", "CD34"],
    "Ery": ["GYPA", "AHSP", "HBB"],
    "MKC": ["PF4", "PPBP"],
    "Neutrophil": ["ELANE", "MPO", "AZU1"],
    "CD14 Mc": ["CD14", "FCGR1A", "ITGAM"],
    "CD16 Mc / TAM": ["FCGR3A", "MS4A7", "SIGLEC10"],
    "cDC": ["CLEC10A", "CD1C", "FCER1A"],
    "pDC": ["GZMB", "IL3RA"],
    "CLP": ["FLT3"],
    "abT": ["TRAC", "CD3D", "TCF7", "CD8A"],
    "Texh": ["PDCD1", "CTLA4"],
    "gdT": ["TRGC1", "TRDC"],
    "NK": ["NCAM1", "KLRB1", "KLRC1"],
    "Early B": ["DNTT", "VPREB1", "IGHM"],
    "Late B": ["IGHD", "CD19", "MS4A1"],
    "PC": ["SDC1", "TNFRSF17", "SLAMF7"],
}

# Rows are grouped by obs['subset']; no explicit categories_order is passed.
# standard_scale="var" rescales each gene across the groups.
sc.pl.dotplot(
    obj,
    var_names=subset_markers,
    groupby="subset",
    standard_scale="var",
    figsize=(14, 5.5),
    show=False,
)

# show=False leaves the figure open, so the current figure is the dot plot.
plt.gcf().savefig("subset_markers_dotplot_cleaned.pdf", bbox_inches="tight")

            

In [None]:
# Hex colour for each lineage label
lin_palette = dict(
    PC='#ffbafd',
    B='#032cfc',
    T_NK='#fc0000',
    Mye='#1eba0d',
    pDC='#a5c3c4',
    Ery='#d6d6d6',
    MKC='#000000',
)

# Store the colours on the AnnData object, one per category of obs['lin'],
# in category order.
lin_colors = []
for lineage in obj.obs['lin'].cat.categories:
    lin_colors.append(lin_palette[lineage])
obj.uns['lin_colors'] = lin_colors

In [None]:
# UMAP coloured by lineage, exported as PDF
fig, ax = plt.subplots(figsize=(6, 6))
sc.pl.umap(obj, color='lin', size=3, show=False, ax=ax)
ax.set_aspect('equal')
# Rasterize the artists held in ax.collections before saving.
for artist in ax.collections:
    artist.set_rasterized(True)
fig.savefig("lin_UMAP_cleaned.pdf", bbox_inches="tight")

In [None]:
import seaborn as sns

# Number of cells carrying each subset label
counts = obj.obs['subset'].value_counts()

bar_fig = plt.figure(figsize=(8, 4))
sns.barplot(x=counts.index, y=counts.values)

# Vertical tick labels keep the subset names from overlapping.
plt.xticks(rotation=90)
bar_fig.tight_layout()

bar_fig.savefig("nCells_per_subset_barplot.pdf", dpi=300, bbox_inches='tight')
# Close the figure after saving; the PDF is the output of this cell.
plt.close(bar_fig)


In [None]:
# Largest value in the main matrix X — a quick look at the scale of the stored values.
obj.X.max()

In [None]:
# Same check for the 'normalized' layer, for comparison with the value from X.
obj.layers['normalized'].max()

In [None]:
# Shape of the per-cell annotation table: (rows, columns) of obj.obs.
obj.obs.shape