In [None]:
# subcluster each lineage and annotate
import anndata
import scanpy as sc
import pandas as pd
import glob
import skimage
import re
import scanpy.external as sce
import matplotlib.pyplot as plt
from pathlib import Path

# Font handling for vector exports: leave SVG text as text and embed
# TrueType (type 42) fonts in PDFs so labels stay editable.
plt.rcParams.update({
    'svg.fonttype': 'none',
    'pdf.fonttype': 42,
})

import os
# Lineage handled by this notebook; it selects the per-lineage working directory.
lin = 'pDC'
workdir = f"/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/revision/merge/no_harmony/{lin}"

# All relative paths used below (inputs and saved figures) resolve against this directory.
os.chdir(workdir)

# Echo the directory actually in effect as a sanity check.
print(os.getcwd())

# read in HGNC Ig locus genes
ig_genes_table = pd.read_csv(
    "/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/analysis/resources/IgGenes_HGNC_geneNamesOrg.txt",
    sep="\t",
    header=0
)
# Kept as a list because later cells pass it to `.isin(...)`.
ig_genes = ig_genes_table['Approved symbol'].tolist()

# filter genes for looking at DEGs: only protein coding, and no mitochondrial
# (MT-), ribosomal (RPL/RPS), IG or hemoglobin genes
protein_coding_genes_table = pd.read_csv("/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/analysis/resources/HGNC_protein_coding_gene.txt", sep="\t")
protein_coding_genes = protein_coding_genes_table['symbol'].tolist()

hemo_genes_table = pd.read_csv("/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/analysis/resources/hemoglobin_genes.txt", sep="\t")
hb_genes = hemo_genes_table['Gene'].tolist()

# One set lookup per gene instead of scanning two lists for every symbol.
excluded_genes = set(ig_genes) | set(hb_genes)
# Symbols starting with these prefixes (ribosomal / mitochondrial) are dropped.
excluded_prefix_re = re.compile(r"^(RPL|RPS|MT-)")

# Order of `protein_coding_genes` is preserved in the filtered list.
protein_coding_filtered = [
    g for g in protein_coding_genes
    if g not in excluded_genes and not excluded_prefix_re.match(g)
]

# Load the AnnData object for this lineage from the working directory;
# the bare expression below displays its summary as the cell output.
adata = sc.read_h5ad(filename='split.h5ad')
adata

In [None]:
# Pick highly variable genes on a copy that lacks the Ig genes, so Ig genes
# can never be selected, then transfer the selection to the full object.
non_ig_mask = ~adata.var_names.isin(ig_genes)
adata_hvg_calc = adata[:, non_ig_mask].copy()

sc.pp.highly_variable_genes(adata_hvg_calc, n_top_genes=2000, batch_key='Sample')

# Names of the genes flagged as highly variable in the Ig-free copy.
hvg_names = adata_hvg_calc.var_names[adata_hvg_calc.var['highly_variable']]

# Boolean flag on the full object: True only for the selected genes; every
# other gene (including all Ig genes) is False.
adata.var['highly_variable'] = adata.var_names.isin(hvg_names)

# PCA followed by Harmony integration across samples.
sc.tl.pca(adata)
sce.pp.harmony_integrate(adata, key="Sample", max_iter_harmony=20)

# Replace the stored PCA embedding with the Harmony-adjusted one, so anything
# that reads 'X_pca' from here on sees the integrated coordinates.
# NOTE(review): the unintegrated PCA is no longer available after this line.
adata.obsm['X_pca'] = adata.obsm['X_pca_harmony']

# Neighbor graph and UMAP, both with default settings.
sc.pp.neighbors(adata)
sc.tl.umap(adata)

# Determine resolution for clustering: run Leiden at several resolutions and
# plot the resulting labels side by side on the UMAP.
leiden_resolutions = [0.1, 0.3, 0.5, 1.0]

# obs keys are derived from the resolution list with one format string, so the
# clustering loop and the plotting loop cannot drift out of sync.
resolutions = [f"leiden_res_{res:4.2f}" for res in leiden_resolutions]

for res, key in zip(leiden_resolutions, resolutions):
    sc.tl.leiden(adata, key_added=key, resolution=res, flavor="igraph")

# One panel per resolution (4 inches wide each); squeeze=False keeps `axes`
# an array even if only a single resolution is listed.
fig, axes = plt.subplots(
    1, len(resolutions), figsize=(4 * len(resolutions), 5), squeeze=False
)
axes = axes.flatten()  # flatten to 1D for easy iteration

for ax, key in zip(axes, resolutions):
    sc.pl.umap(adata, color=key, ax=ax, legend_loc='on data', show=False)
    ax.set_aspect('equal')
    # Rasterize the scatter layers so the PDF stays small while text remains vector.
    for coll in ax.collections:
        coll.set_rasterized(True)

plt.tight_layout()
fig.savefig("leiden_umaps.pdf")

In [None]:
# Marker genes per lineage label, used to inspect the clusters in a dot plot.
# Built from ordered (label, genes) pairs; the order is the display order.
dev_markers = dict([
    ("pDC", ["GZMB", "IL3RA", "COBLL1", "TCF4", "IRF8", "FLT3", "CD38", "CD34", "IL7R"]),
    ('MMPC', ['SLAMF7', 'MZB1', 'TNFRSF17', 'SDC1', 'CCND1', 'MYC']),
    ("Cycl", ["TUBA1B", "HMGB2", "MKI67"]),
    ('Ery', ['HBA1', 'HBB', 'HBD', 'HBG1', 'HBZ']),
    ('T', ['TRAC', 'CD3D', 'CD3E', 'CD4']),
    ('B', ['CD19', 'CD79A', 'MS4A1']),
    ('Mye', ['LYZ', 'S100A8', 'S100A9']),
    ('MKC', ['PF4', 'PPBP', 'THPO']),
])

In [None]:
# Dot plot of the lineage marker panel per resolution-0.10 Leiden cluster,
# read from the 'counts' layer and scaled per gene (standard_scale="var").
sc.pl.dotplot(
    adata,
    var_names=dev_markers,
    groupby="leiden_res_0.10",
    layer='counts',
    standard_scale="var",
)

In [None]:
# Manual annotation of the resolution-0.10 Leiden clusters. Clusters labelled
# "Low_Quality" are removed below.
cluster2celltype = {
    "0": "pDC",
    "1": 'pDC',
    "2": "CLP",
    "3": "Low_Quality", # Mye doublet
    "4": "Low_Quality", # Ery doublet
    "5": "Low_Quality" #PC doublet
}

cluster_ids = adata.obs["leiden_res_0.10"].astype(str)
subset_labels = cluster_ids.map(cluster2celltype)

# A cluster missing from the mapping would become NaN and then pass the
# `!= 'Low_Quality'` filter unnoticed, so fail loudly instead.
unmapped_clusters = sorted(set(cluster_ids[subset_labels.isna()]))
if unmapped_clusters:
    raise ValueError(
        f"Clusters without an annotation in cluster2celltype: {unmapped_clusters}"
    )

# Explicit categorical dtype: the many-to-one mapping above is not guaranteed
# to yield one, and later cells rely on `.cat.categories`.
adata.obs["subset"] = subset_labels.astype("category")

adata_cleaned = adata[adata.obs['subset'] != 'Low_Quality'].copy()
# Drop the now-empty "Low_Quality" category so it cannot show up in legends
# or palette lookups.
adata_cleaned.obs["subset"] = adata_cleaned.obs["subset"].cat.remove_unused_categories()

In [None]:
# Display order of the annotated subsets in the dot plot.
subset_order = ["CLP", "pDC"]

# Reduced marker panel: a few genes per annotated subset.
markers_simple = {
    "CLP": ["SPINK2", "FLT3", "CD38", "IRF8"],
    "pDC": ["GZMB", "IL3RA", "COBLL1", "TCF4", 'MZB1', 'CD4'],
}

# Dot plot on the cleaned object, read from the 'counts' layer and scaled per
# gene. show=False leaves the figure open so it can be saved right after.
sc.pl.dotplot(
    adata_cleaned,
    markers_simple,
    groupby="subset",
    categories_order=subset_order,   # explicit order
    layer='counts',
    standard_scale="var",
    figsize=(6, 1),
    show=False,
)

plt.savefig("subset_markers_dotplot.pdf", bbox_inches="tight")

In [None]:
# Collapse the annotated subsets into lineage labels stored in obs["lin"].
lin_rename = {
    "CLP": "HSPC",
    "pDC": 'pDC',
}
# Map subset names through `lin_rename`. The cluster-id mapping
# (`cluster2celltype`) is keyed by "0".."5", so using it here would turn
# every value into NaN.
adata_cleaned.obs["lin"] = adata_cleaned.obs["subset"].map(lin_rename)

In [None]:
# Fixed colour for each annotated subset.
palette = {"CLP": '#d1e5e6', 'pDC': '#a5c3c4'}

# Colours are stored in category order; a category missing from `palette`
# raises KeyError.
subset_categories = adata_cleaned.obs['subset'].cat.categories
adata_cleaned.uns['subset_colors'] = list(map(palette.__getitem__, subset_categories))

# UMAP coloured by subset with labels drawn on the data; saved to PDF.
fig, ax = plt.subplots(figsize=(5, 5))
sc.pl.umap(
    adata_cleaned,
    color='subset',
    size=5,
    legend_loc='on data',
    ax=ax,
    show=False,
)
ax.set_aspect('equal')
# Rasterize the point layers only; text stays vector in the PDF.
for artist in ax.collections:
    artist.set_rasterized(True)
plt.savefig("subset_UMAP.pdf", bbox_inches="tight")

In [None]:
# Keep the current X matrix under an explicit 'normalized' layer.
# NOTE(review): assumes X holds normalized expression at this point - confirm.
# Stored as a copy so later in-place changes to X cannot alter the layer.
adata_cleaned.layers['normalized'] = adata_cleaned.X.copy()

# Write the annotated, cleaned object to the working directory.
adata_cleaned.write('annotated.h5ad')