In [None]:
# subcluster each lineage and annotate
import anndata
import scanpy as sc
import pandas as pd
import glob
import skimage
import re
import scanpy.external as sce
import matplotlib.pyplot as plt
from pathlib import Path

# Figure-export settings so that text in saved vector figures stays editable:
# SVG keeps text as text, and PDF uses font type 42.
plt.rcParams.update({
    'svg.fonttype': 'none',
    'pdf.fonttype': 42,
})

import os
# Lineage handled by this run; it selects the working directory below.
#-----------
lin = 'B'
#-----------

# Switch into the lineage directory so the relative file names used by later
# cells (input .h5ad, saved PDFs, output .h5ad) resolve there.
lineage_dir = f"/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/revision/merge/no_harmony/{lin}"
os.chdir(lineage_dir)

# Show the directory that is now current as a sanity check.
print(os.getcwd())

# Gene-symbol resources used below for HVG selection and DEG filtering.
# Each file is read as a tab-separated table whose first row is the header.
ig_genes_path = "/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/analysis/resources/IgGenes_HGNC_geneNamesOrg.txt"
protein_coding_path = "/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/analysis/resources/HGNC_protein_coding_gene.txt"
hemoglobin_path = "/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/analysis/resources/hemoglobin_genes.txt"

# HGNC Ig locus genes (symbols are in the 'Approved symbol' column)
ig_genes_table = pd.read_csv(ig_genes_path, sep="\t", header=0)
ig_genes = ig_genes_table['Approved symbol'].tolist()

# HGNC protein-coding genes (symbols are in the 'symbol' column); this is the
# starting gene universe for the DEG filter
protein_coding_genes_table = pd.read_csv(protein_coding_path, sep="\t", header=0)
protein_coding_genes = protein_coding_genes_table['symbol'].tolist()

# Hemoglobin genes (symbols are in the 'Gene' column)
hemo_genes_table = pd.read_csv(hemoglobin_path, sep="\t", header=0)
hb_genes = hemo_genes_table['Gene'].tolist()

# Genes eligible for DEG analysis: protein-coding symbols that are neither Ig
# nor hemoglobin genes and whose name does not start with RPL, RPS or MT-
# (ribosomal / mitochondrial prefixes).
#
# The exclusion lists are merged into one set up front so that each candidate
# symbol costs a single hash lookup instead of a scan through two Python lists.
# The prefix pattern is compiled once for the same reason. Order and any
# duplicates in protein_coding_genes are preserved.
excluded_symbols = set(ig_genes) | set(hb_genes)
excluded_prefix = re.compile(r"^(RPL|RPS|MT-)")

protein_coding_filtered = [
    g for g in protein_coding_genes
    if g not in excluded_symbols and not excluded_prefix.match(g)
]

In [None]:
# Load the input object; the relative path is resolved against the current
# working directory.
split_path = 'split.h5ad'
adata = sc.read_h5ad(split_path)

In [None]:
# Choose highly variable genes on a copy that has the Ig genes removed, then
# record the selection on the full object. Ig genes stay in `adata` but are
# never flagged as highly variable.
non_ig_mask = ~adata.var_names.isin(ig_genes)
adata_hvg_calc = adata[:, non_ig_mask].copy()

sc.pp.highly_variable_genes(adata_hvg_calc, batch_key='Sample', n_top_genes=2000)

# Names of the genes flagged in the Ig-free copy
hvg_names = adata_hvg_calc.var_names[adata_hvg_calc.var['highly_variable']]

# Boolean flag on the full object: True for the selected genes, False for
# every other gene (including all Ig genes)
adata.var['highly_variable'] = adata.var_names.isin(hvg_names)

# PCA, then Harmony integration using the "Sample" column as the batch key.
sc.tl.pca(adata)
sce.pp.harmony_integrate(adata, key="Sample")

# Replace the stored PCA embedding with the Harmony-adjusted one, so the
# neighbors/UMAP calls below (which name no representation) work from it.
# NOTE(review): this relies on sc.pp.neighbors defaulting to obsm['X_pca'] - confirm.
harmony_embedding = adata.obsm['X_pca_harmony']
adata.obsm['X_pca'] = harmony_embedding

sc.pp.neighbors(adata)
sc.tl.umap(adata)

# Run Leiden clustering at several resolutions so they can be compared before
# one is chosen; each result is stored in obs under its own
# "leiden_res_<resolution>" key (two decimals, e.g. leiden_res_0.50).
leiden_resolutions = (0.1, 0.3, 0.5, 1.0)
for res in leiden_resolutions:
    leiden_key = f"leiden_res_{res:4.2f}"
    sc.tl.leiden(adata, resolution=res, key_added=leiden_key, flavor="igraph")

In [None]:
# One UMAP panel per Leiden resolution, laid out in a single row and saved
# together as one PDF.
resolutions = [
    "leiden_res_0.10",
    "leiden_res_0.30",
    "leiden_res_0.50",
    "leiden_res_1.00",
]
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(16,5))
axes = axes.flatten()  # 1D sequence of axes to pair with the resolution keys

for ax, leiden_key in zip(axes, resolutions):
    sc.pl.umap(adata, color=leiden_key, legend_loc='on data', ax=ax, show=False)
    ax.set_aspect('equal')
    # Rasterize the drawn collections so the point cloud is not stored as
    # individual vector objects in the PDF
    for artist in ax.collections:
        artist.set_rasterized(True)

plt.tight_layout()
plt.savefig("leiden_umaps.pdf")

In [None]:
# Marker genes per cell population, used to annotate the Leiden clusters in
# the dotplots below. Keys are the group labels shown above the dotplot columns.
# Each gene appears once within its own group: the duplicated "TUBA1B" entry in
# "Cycl" was removed because it only drew the same column twice. A gene may
# still appear in more than one group (e.g. EBF1 in both CLP and ProB).
dev_markers = {
    "Cycl": ["TUBA1B", "STMN1", "HMGB2", "MKI67"],
    "HSPC": ['SPINK2', 'AVP', 'CD34'],
    "CLP": ['FLT3', 'EBF1', 'TCF7', 'LEF1'],
    "ProB": ['RAG1', 'RAG2', 'DNTT', 'KIT', 'CD99', 'PAX5', 'EBF1', 'FOXO1'],
    "PreB": ['VPREB1', 'IGLL1', 'CD79A', 'CD79B'],
    "ImmB": ['CD19', 'SOX4', 'NEIL1', 'ACSM3'],
    "TransB": ['CD24', 'CD38', 'MME'],
    "NvB": ['MS4A1', 'TNFRSF13C', 'CD52', 'BANK1', 'IGHM', 'IGHD', 'TCL1A', 'SELL', 'PLPP5', 'CD83', 'IFI44L', 'IFITM1', 'PARP14'],
    "MemB": ["CD27", "IGHG1", "IGHG2", "IGHG3", "IGHG4", "IGHA1", "IGHA2", 'CD44'],
    "PC": ['SDC1', 'FCRL5', 'TNFRSF17', 'SLAMF7'],
    "T": ['TRAC', 'CD3E', 'CD3D'],
    "Mye": ["LYZ", "CD14", "ITGAM"],
    "MSC": ["LEPR", "CXCL12", "KITLG"],
}

In [None]:
# Marker dotplot for every resolution-0.50 Leiden cluster, with each gene
# scaled across clusters (standard_scale="var").
sc.pl.dotplot(
    adata,
    var_names=dev_markers,
    groupby="leiden_res_0.50",
    standard_scale="var",
)

In [None]:
# Cluster 6 (resolution 0.50) is a mixed B/T doublet population, so it is
# excluded from the cleaned object.
not_doublet = adata.obs['leiden_res_0.50'] != '6'
adata_cleaned = adata[not_doublet].copy()

# Shapes before and after the removal
print(adata.shape, adata_cleaned.shape)

In [None]:
# Same marker dotplot as before, now on the object without the doublet cluster.
sc.pl.dotplot(
    adata_cleaned,
    var_names=dev_markers,
    groupby="leiden_res_0.50",
    standard_scale="var",
)

In [None]:
# Restrict the cleaned object to the DEG-eligible genes: protein-coding only,
# with Ig, hemoglobin, mitochondrial and ribosomal genes removed.
deg_gene_mask = adata_cleaned.var_names.isin(protein_coding_filtered)
adata_prot = adata_cleaned[:, deg_gene_mask].copy()

# Rank marker genes per resolution-0.50 cluster (Wilcoxon) among those genes only.
# use_raw=False is explicit on purpose: by default scanpy tests on adata.raw
# whenever it is present, and .raw is not subset by the gene filter above, so
# the excluded genes would otherwise be ranked anyway. When .raw is absent this
# is identical to the default.
sc.tl.rank_genes_groups(
    adata_prot,
    groupby="leiden_res_0.50",
    method="wilcoxon",
    use_raw=False,
)

# Plot the top 5 ranked genes per cluster
sc.pl.rank_genes_groups_dotplot(
    adata_prot,
    groupby="leiden_res_0.50",
    standard_scale="var",
    n_genes=5
)

In [None]:
# Assign a developmental subset label to each resolution-0.50 Leiden cluster.
# Cluster "6" has no entry: it was removed as a B/T doublet.
#
# Earlier, finer-grained labelling kept for reference (not used):
#   "0": "Late Nv B",  "1": "Late Nv B",  "2": "Early Nv B",  "3": "Memory B",
#   "4": "Transitional B",  "5": "Late Pre B",  "7": "Early Pre B",  "8": "Pro/Pre B"

# Current labelling, written as label -> clusters that carry it
clusters_by_subset = {
    "Naive B": ("0", "1"),
    "Transitional B": ("2", "4"),
    "Memory B": ("3",),
    "Immature B": ("5",),
    "Pro/Pre B": ("7", "8"),
}

# Invert into the cluster -> label lookup, ordered by cluster id
subset_assignment = dict(sorted(
    (cluster, label)
    for label, clusters in clusters_by_subset.items()
    for cluster in clusters
))

In [None]:
# Translate each cell's resolution-0.50 Leiden cluster into its subset label.
cluster_labels = adata_cleaned.obs["leiden_res_0.50"]

# Fail loudly if any cluster present in the data has no entry in
# subset_assignment; .map() would otherwise leave those cells as NaN silently.
unassigned_clusters = sorted(set(cluster_labels.astype(str)) - set(subset_assignment))
if unassigned_clusters:
    raise ValueError(
        f"Clusters without a subset label in subset_assignment: {unassigned_clusters}"
    )

# Several clusters share one label, and pandas returns a non-categorical result
# when a categorical column is mapped many-to-one. Store the labels as a
# categorical explicitly so later use of `.cat.categories` does not depend on a
# plotting call having converted the column as a side effect.
adata_cleaned.obs["subset"] = cluster_labels.map(subset_assignment).astype("category")

In [None]:
# Inputs for the publication dotplot: a reduced marker panel per B-cell stage
# and the order in which the annotated subsets are drawn.

# Marker panel; keys are the group labels for the gene columns.
# Groups that are commented out are kept for reference and are not plotted.
b_markers_simple = {
    "Cycling": ["TUBA1B", "HMGB2", "MKI67"],
    #"HSPC": ["SPINK2", "AVP", "CD34"],
    "Pro B": ["CD99", "EBF1", "SOX4", "FOXO1", "PAX5", "RAG1", "RAG2", "DNTT", "CD19"],
    "Pre B": ["CD79A", "CD79B", "VPREB1", "IGLL1", "TCL1A"],
    "Trans. B": ["CD24", "CD38", "MME", "IGHM"],
    "Naive B": ["IGHD", "MS4A1", "TNFRSF13C", "SELL", "PLPP5", "CD83", "PARP14", "CD52", "BANK1"],
    "Memory B": ["CD27", "IGHG1", "IGHG2", "IGHA1", "IGHA2", "FCRL5"],
    #"PC": ["PRDM1", "SDC1", "FCRL5", "TNFRSF17", "SLAMF7"],
    #"T": ["TRAC", "CD3E", "CD3D"],
}

# Earlier, finer-grained subset order kept for reference (not used):
#   "Pro/Pre B", "Early Pre B", "Late Pre B", "Transitional B",
#   "Early Nv B", "Late Nv B", "Memory B"

# Current subset order for the dotplot rows
subset_order = [
    "Pro/Pre B",
    "Immature B",
    "Transitional B",
    "Naive B",
    "Memory B",
]
# Draw the marker dotplot grouped by annotated subset and save it as a PDF.
# Values come from the 'counts' layer and each gene is scaled across subsets
# (standard_scale="var"); rows follow subset_order.
dotplot_options = dict(
    var_names=b_markers_simple,
    groupby="subset",
    categories_order=subset_order,   # explicit row order
    layer='counts',
    standard_scale="var",
    figsize=(12,1.5),
    show=False,                      # do not show here; the figure is saved below
)
sc.pl.dotplot(adata_cleaned, **dotplot_options)

plt.savefig("subset_markers_dotplot.pdf", bbox_inches="tight")

In [None]:
# Colour assigned to each annotated subset (hex codes).
#
# Earlier palette for the finer-grained labels, kept for reference (not used):
#   "Pro/Pre B": '#adede7',  "Early Pre B": '#74d8e3',  "Late Pre B": '#17d4ff',
#   "Transitional B": '#a2c6eb',  "Early Nv B": '#5fa5ed',  "Late Nv B": '#3e7cf0',
#   "Memory B": '#032563'
b_palette = dict([
    ("Pro/Pre B", '#adede7'),
    ("Immature B", '#17d4ff'),
    ("Transitional B", '#a2c6eb'),
    ("Naive B", '#5fa5ed'),
    ("Memory B", '#032563'),
])
# Store the subset colours in the same order as the categories of obs['subset'],
# one palette entry per category.
subset_categories = adata_cleaned.obs['subset'].cat.categories
adata_cleaned.uns['subset_colors'] = [b_palette[label] for label in subset_categories]

# UMAP coloured by subset with labels drawn on the data, saved to PDF
fig, ax = plt.subplots(figsize=(5,5))
sc.pl.umap(
    adata_cleaned,
    color='subset',
    size=3,
    legend_loc='on data',
    ax=ax,
    show=False,
)
ax.set_aspect('equal')
# Rasterize the drawn collections so the point cloud is not stored as
# individual vector objects in the PDF
for artist in ax.collections:
    artist.set_rasterized(True)
plt.savefig("subset_UMAP.pdf", bbox_inches="tight")


In [None]:
# Expose the current X matrix under the 'normalized' layer name, then write the
# annotated object to the working directory.
annotated_path = "annotated.h5ad"
adata_cleaned.layers['normalized'] = adata_cleaned.X
adata_cleaned.write(annotated_path)