In [None]:
import anndata
import scanpy as sc
import pandas as pd
import glob
import skimage
import re
import scanpy.external as sce
import os

import matplotlib.pyplot as plt

# Export text as real text objects so labels stay editable in vector figures.
plt.rcParams.update({
    'svg.fonttype': 'none',
    'pdf.fonttype': 42,  # make text editable in pdf
})

# All relative output paths used below resolve against this directory.
WORK_DIR = "/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/revision/merge/no_harmony"
os.chdir(WORK_DIR)

# Echo the working directory as a sanity check.
print(os.getcwd())


In [None]:
# Load every per-sample .h5ad file and collect the objects for merging.
h5_directory = "/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/revision/individual_samples/h5ad/"
# NOTE(review): glob returns paths in arbitrary order; wrap in sorted() if a
# reproducible cell order in the merged object is required.
h5_files = glob.glob(h5_directory + "*.h5ad")


def sample_id_from_path(file_path):
    """Return the sample ID encoded in an ``.h5ad`` file name.

    The ID is the file's base name with the ``.h5ad`` suffix stripped, e.g.
    ``"/data/SN010.h5ad"`` -> ``"SN010"``. Paths without any directory
    component are handled as well.
    """
    return os.path.basename(file_path).removesuffix(".h5ad")


adata_list = []
for file_path in h5_files:
    sampleid = sample_id_from_path(file_path)
    adata = sc.read_h5ad(file_path)
    # Prefix each cell ID with its sample ID so IDs stay unique after merging.
    adata.obs_names = [sampleid + str(s) for s in adata.obs_names]
    adata_list.append(adata)

In [None]:
# Stack all per-sample objects along axis 0 into one merged AnnData.
# NOTE(review): no `join` is passed, so sc.concat's default applies — confirm
# that genes missing from some samples are handled as intended.
adata_combined = sc.concat(adata_list, axis=0, merge="unique")

In [None]:
# Show the largest value stored in X of the merged object.
# NOTE(review): presumably a check that X still holds raw counts — confirm.
adata_combined.X.max()

In [None]:
# QC scatter on the merged object: x = nFeature_RNA, y = nCount_RNA, colored by percent.mt.
sc.pl.scatter(adata_combined, "nFeature_RNA","nCount_RNA", color="percent.mt")

In [None]:
# Checkpoint the merged object into the working directory. At this point the
# 'counts' layer has not been added yet and doublets are still included.
adata_combined.write("combined.h5ad")

In [None]:
# Tally how many cells carry each value of the 'predicted_doublet' flag.
adata_combined.obs['predicted_doublet'].value_counts()

In [None]:
# Keep an untouched copy of X in a 'counts' layer before X is transformed.
adata_combined.layers["counts"] = adata_combined.X.copy()

# Keep only cells whose 'predicted_doublet' flag equals False.
is_singlet = adata_combined.obs["predicted_doublet"] == False
adata_combined_noDoublets = adata_combined[is_singlet].copy()

# Shapes before and after doublet removal.
print(adata_combined.shape, adata_combined_noDoublets.shape)

In [None]:
# Normalizing to median total counts (no target_sum is passed); genes flagged
# as highly expressed are left out of the size-factor computation.
sc.pp.normalize_total(adata_combined_noDoublets, exclude_highly_expressed=True)
# Logarithmize the data (log1p applied to X in place)
sc.pp.log1p(adata_combined_noDoublets)

In [None]:
# HGNC table of immunoglobulin locus genes (tab-separated, header in first row).
IG_GENES_TABLE_PATH = "/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/analysis/resources/IgGenes_HGNC_geneNamesOrg.txt"
ig_genes_table = pd.read_csv(IG_GENES_TABLE_PATH, sep="\t", header=0)

# Gene symbols to leave out of variable-gene selection.
ig_genes = ig_genes_table["Approved symbol"].tolist()

In [None]:
# Pick highly variable genes on a copy that has the Ig genes removed, so Ig
# genes can never be selected, then carry the flags back to the full object.
non_ig_mask = ~adata_combined_noDoublets.var_names.isin(ig_genes)
adata_hvg_calc = adata_combined_noDoublets[:, non_ig_mask].copy()

sc.pp.highly_variable_genes(adata_hvg_calc, n_top_genes=3000, batch_key="Sample")

# Names of the genes flagged in the Ig-free subset.
hvg_names = adata_hvg_calc.var_names[adata_hvg_calc.var["highly_variable"]]

# In the full object a gene is an HVG only if it was flagged in the subset;
# every other gene (including all Ig genes) is False.
adata_combined_noDoublets.var["highly_variable"] = adata_combined_noDoublets.var_names.isin(hvg_names)

In [None]:
# Per-gene statistics for the genes flagged as highly variable.
is_hvg = adata_hvg_calc.var["highly_variable"]
hvg_df = adata_hvg_calc.var.loc[is_hvg]
hvg_df

In [None]:
# The 50 highly variable genes with the largest normalized dispersion.
top50_hvg = hvg_df.sort_values(by="dispersions_norm", ascending=False).iloc[:50]

print(top50_hvg)

In [None]:
# PCA with scanpy defaults.
# NOTE(review): presumably restricted to the genes flagged in
# var['highly_variable'] above — confirm for the installed scanpy version.
sc.tl.pca(adata_combined_noDoublets)

# Variance explained by the first 50 PCs, on a log scale.
sc.pl.pca_variance_ratio(adata_combined_noDoublets, n_pcs=50, log=True)

In [None]:
# One entry per panel: (obs column used for color, pair of PC indices to plot).
pca_panels = [
    ("Collection", (0, 1)),
    ("Collection", (2, 3)),
    ("percent.mt", (0, 1)),
    ("percent.mt", (2, 3)),
]

sc.pl.pca(
    adata_combined_noDoublets,
    color=[column for column, _ in pca_panels],
    dimensions=[pcs for _, pcs in pca_panels],
    ncols=2,
    size=2,
)

In [None]:
# Build the neighbor graph, compute the UMAP embedding and cluster with Leiden
# (resolution 0.5; all other settings left at their defaults), then checkpoint
# the object into the working directory.
sc.pp.neighbors(adata_combined_noDoublets)
sc.tl.umap(adata_combined_noDoublets)
sc.tl.leiden(adata_combined_noDoublets, resolution=0.5)
adata_combined_noDoublets.write("combined_noDoublets.h5ad")

# note 10/7/2025 the above code was run in background

In [None]:
# Re-run Leiden with the igraph backend (2 iterations, resolution 0.5).
# NOTE(review): no key_added is given, so this presumably replaces the 'leiden'
# labels from the previous cell, while the checkpoint written there still holds
# the earlier labels — confirm which clustering the annotation below refers to.
sc.tl.leiden(adata_combined_noDoublets, flavor='igraph', n_iterations=2, resolution=0.5)
# note 10/7/2025 the above code was run in background

In [None]:
# UMAP panels colored by cluster, collection and the per-cell QC metrics.
sc.pl.umap(
    adata_combined_noDoublets,
    color=["leiden", "Collection", "nCount_RNA", "percent.mt", "nFeature_RNA"],
    # increase horizontal space between panels
    wspace=0.2
)

In [None]:
# Marker-gene panel, grouped by candidate cell population, used to visualize
# the Leiden clusters and divide them into lineages.
marker_genes = dict(
    PC=["TNFRSF17", "SLAMF7", "SDC1"],
    B=["RAG1", "VPREB1", "CD19", "MS4A1", "CD79A", "IGHM", "IGHD", "IGHA1", "IGHG1"],
    CD4_T=["CD3D", "CD3E", "CD4", "IL7R", "TCF7", "FOXP3"],
    CD8_T=["CD8A", "CD8B", "NKG7"],
    NK=["FCGR3A", "GNLY", "KLRC1", "KLRD1"],
    CD14=["CD14", "S100A9", "SELL"],
    CD16=["MS4A7", "TNFRSF1B", "LYN"],
    cDC=["FCER1A", "CD1C", "CLEC9A"],
    NP=["MPO", "AZU1", "ELANE"],  # neutrophils
    pDC=["GZMB", "IL3RA", "COBLL1", "TCF4"],
    MKC=["PF4", "PPBP", "THPO"],  # megakaryocytes
    Ery=["HBB", "HBA1", "HBA2", "GATA1", "KLF1", "CA3"],  # erythroid
    HSPC=["SPINK2", "AVP", "CD34"],  # progenitor
    MSC=["CXCL12", "LEPR", "KITLG"],
)

In [None]:
# Marker panel per Leiden cluster, with expression scaled per gene (standard_scale="var").
sc.pl.dotplot(adata_combined_noDoublets, marker_genes, groupby="leiden", standard_scale="var")

In [None]:
# Assign a lineage to each Leiden cluster based on the marker-gene dotplot.
# NOTE(review): several trailing comments below do not match the label on the
# same line (e.g. cluster 10 is labelled PC but commented "mostly CD16"); they
# are carried over unchanged and may be stale — confirm against the dotplot.
cluster2celltype = {
    "0": "Mye",  # CD16
    "1": "Mye",  # CD14
    "2": "Mye",  # HSPCs, granulo
    "3": "Low_Quality",
    "4": "Mye",  # CD14
    "5": "Mye",  # cDC, granulo
    "6": "pDC",
    "7": "Ery",
    "8": "Low_Quality",  # PC, T, monocyte
    "9": "PC",
    "10": "PC",  # mostly CD16
    "11": "B",
    "12": "Ery",
    "13": "T_NK",  # mix of HSPC and granulo
    "14": "T_NK",  # mixed low markers
    "15": "T_NK",  # CD8
    "16": "T_NK",
    "17": "B",  # low transcripts for all cell typing markers
    "18": "Ery",  # T/NK and myeloid doublets
    "19": "B",
    "20": "T_NK",  # mostly NK
    "21": "Low_Quality",  # T, monocyte, ery
    "22": "PC",  # PC and Ery doublets
    "23": "MKC",
    "24": "PC",
    "25": "Low_Quality",  # low for everything
    "26": "PC",  # high B markers
}

lin_by_cell = adata_combined_noDoublets.obs["leiden"].map(cluster2celltype)

# Series.map yields NaN for any cluster id missing from the dict. Such cells
# would pass the later `!= "Low_Quality"` filter without a lineage, so fail
# loudly here (e.g. after re-clustering produced a different number of clusters).
unmapped_clusters = sorted(
    set(adata_combined_noDoublets.obs["leiden"][lin_by_cell.isna()].astype(str))
)
if unmapped_clusters:
    raise KeyError(f"Leiden clusters without a lineage label: {unmapped_clusters}")

adata_combined_noDoublets.obs["lin"] = lin_by_cell

sc.pl.umap(adata_combined_noDoublets, color="lin")

In [None]:
# Number of cells in each Leiden cluster.
adata_combined_noDoublets.obs['leiden'].value_counts()

In [None]:
# Drop every cell whose lineage label is "Low_Quality", then overwrite the
# checkpoint in the working directory with the filtered object.
keep_cells = adata_combined_noDoublets.obs["lin"] != "Low_Quality"
adata_combined_noDoublets = adata_combined_noDoublets[keep_cells].copy()
adata_combined_noDoublets.write("combined_noDoublets.h5ad")

In [None]:
# Lineage labels still present after the low-quality filter.
adata_combined_noDoublets.obs['lin'].unique()

In [None]:
# Display order of the lineages (HSPC is currently left out).
lin_order = ["PC", "B", "T_NK", "Mye", "pDC", "Ery", "MKC"]  # "HSPC" disabled

# Recast 'lin' as an ordered categorical that follows lin_order.
lin_dtype = pd.CategoricalDtype(categories=lin_order, ordered=True)
adata_combined_noDoublets.obs["lin"] = adata_combined_noDoublets.obs["lin"].astype(lin_dtype)

In [None]:
# Fixed hex color per lineage so every figure uses the same palette.
lin_palette = dict(
    PC="#ffbafd",
    B="#032cfc",
    T_NK="#fc0000",
    Mye="#1eba0d",
    pDC="#a5c3c4",
    Ery="#dce6ca",
    # HSPC="#85f2f0",
    MKC="#ebca10",
)

# Assign colors to the AnnData object, listed in the category order of 'lin'.
lin_categories = adata_combined_noDoublets.obs["lin"].cat.categories
adata_combined_noDoublets.uns["lin_colors"] = [lin_palette[name] for name in lin_categories]

In [None]:
# Lineage UMAP with a 1:1 coordinate ratio, exported as PDF.
umap_fig, umap_ax = plt.subplots(figsize=(5, 5))
sc.pl.umap(adata_combined_noDoublets, color="lin", ax=umap_ax, show=False)
umap_ax.set_aspect("equal")

# Rasterize the collections drawn on the axes.
for artist in umap_ax.collections:
    artist.set_rasterized(True)

plt.savefig("lineage_UMAP_merged_noDoublets.pdf", bbox_inches="tight")

In [None]:
# Reduced marker panel for the lineage-level figure. The dict keys are used as
# group labels in the dotplot, so they are written as display names.
marker_genes_simple = {
    "Plasma Cell": ["TNFRSF17", "SLAMF7", "SDC1"],
    "B Cell": ["VPREB1", "CD19", "MS4A1", "CD79A"],
    "CD4 T Cell": ["CD3D", "CD3E", "CD4", "IL7R", "TCF7"],
    "CD8 T Cell": ["CD8A", "CD8B", "NKG7"],
    "NK Cell": ["FCGR3A", "GNLY", "KLRD1"],
    "CD14 Mono.": ["CD14", "S100A9", "SELL"],
    "CD16 Mono.": ["MS4A7", "TNFRSF1B", "LYN"],
    # "cDC": ["FCER1A", "CD1C", "CLEC9A"],
    "Granulo.": ["MPO", "AZU1", "ELANE"],  # neutrophils
    # "HSPC": ["SPINK2", "AVP", "CD34"],  # progenitor
    "pDC": ["GZMB", "IL3RA", "TCF4"],
    "Erythro.": ["HBB", "HBA1", "HBA2", "GATA1", "KLF1"],  # erythroid
    # Label corrected from the misspelled "Megkaryo.", which would appear in the figure.
    "Megakaryo.": ["PF4", "PPBP"],  # megakaryocytes
}

In [None]:
# Lineage-level marker dotplot, drawn without showing so it can be saved.
lineage_dotplot_kwargs = dict(
    var_names=marker_genes_simple,
    groupby="lin",
    categories_order=lin_order,  # explicit order
    standard_scale="var",
    figsize=(11, 2),
    show=False,
)
sc.pl.dotplot(adata_combined_noDoublets, **lineage_dotplot_kwargs)

plt.savefig("lineage_markers_dotplot.pdf", bbox_inches="tight")

In [None]:
# Largest value in X of the filtered object (X was normalized and log1p-transformed above).
adata_combined_noDoublets.X.max()

In [None]:
# Largest value in the 'counts' layer, which was copied from X before normalization.
adata_combined_noDoublets.layers['counts'].max()

In [None]:
# Number of cells per lineage.
adata_combined_noDoublets.obs['lin'].value_counts()

In [None]:
# ----------------------------
# now we split lineages for lineage-level subclustering
# Reload the checkpoint from disk and write one file per lineage.
# NOTE(review): the checkpoint was last written right after the low-quality
# filter, i.e. before obs['lin'] was made an ordered categorical and before
# uns['lin_colors'] was set, so this reload presumably drops both — confirm
# that is intended for the figures made further down.
adata_combined_noDoublets = sc.read_h5ad('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/revision/merge/no_harmony/combined_noDoublets.h5ad')

for lineage in adata_combined_noDoublets.obs["lin"].cat.categories:
    subset = adata_combined_noDoublets[adata_combined_noDoublets.obs["lin"] == lineage].copy()
    # Create the per-lineage output folder on first use; writing into a
    # directory that does not exist fails.
    os.makedirs(str(lineage), exist_ok=True)
    subset.write(f"{lineage}/split.h5ad")

In [None]:
# Keep the current X as its own 'normalized' layer. Copy it (as is done for
# the 'counts' layer) so a later in-place change to X cannot silently alter
# the layer through a shared reference.
adata_combined_noDoublets.layers['normalized'] = adata_combined_noDoublets.X.copy()
adata_combined_noDoublets.write("combined_noDoublets.h5ad")

In [None]:
# Subset to the NBM samples only (UMAP requested by Reyka); cells are selected
# by matching the 'UPN' column against the IDs below.
nbm_samples = ['SN010', 'SN011', 'SN019', 'SN055', 'SN056', 'SN066', 'SN077', 'SN089']
is_nbm = adata_combined_noDoublets.obs['UPN'].isin(nbm_samples)
nbms = adata_combined_noDoublets[is_nbm].copy()

In [None]:
# Lineage UMAP of the NBM subset with a 1:1 coordinate ratio, exported as PDF.
nbm_fig, nbm_ax = plt.subplots(figsize=(5, 5))
sc.pl.umap(nbms, color="lin", ax=nbm_ax, show=False, size=10)
nbm_ax.set_aspect("equal")

# Rasterize the collections drawn on the axes.
for artist in nbm_ax.collections:
    artist.set_rasterized(True)

plt.savefig("lineage_UMAP_merged_noDoublets_SenNetNBM.pdf", bbox_inches="tight")

In [None]:
# Display the summary repr of the NBM subset.
nbms

In [None]:
#------------------
# try with harmony
#------------------
# Start again from the checkpoint on disk.
NO_HARMONY_CHECKPOINT = "/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/revision/merge/no_harmony/combined_noDoublets.h5ad"
adata_combined_noDoublets = sc.read_h5ad(NO_HARMONY_CHECKPOINT)

# Integrate on a copy, using "Sample" as the batch key, so the object without
# harmony stays available.
hmerged = adata_combined_noDoublets.copy()
sce.pp.harmony_integrate(hmerged, "Sample")

# Overwrite X_pca with the harmony-adjusted embedding so that the neighbor
# graph and UMAP below are built from it.
hmerged.obsm["X_pca"] = hmerged.obsm["X_pca_harmony"]
sc.pp.neighbors(hmerged)
sc.tl.umap(hmerged)

In [None]:
# UMAP of the harmony-integrated object, colored by lineage.
sc.pl.umap(hmerged, color='lin')

In [None]:
# Save the harmony-integrated object next to the no_harmony results.
# NOTE(review): assumes the target 'harmony' directory already exists — confirm.
hmerged.write('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/revision/merge/harmony/combined_noDoublets.h5ad')

In [None]:
# Compare PC marker expression between collections (timepoints), using only
# the cells labelled as the PC lineage.
pc_markers = ["SDC1", "CD38", "TNFRSF17", "SLAMF7", "FCRL5", "GPRC5D"]
is_pc = adata_combined_noDoublets.obs["lin"] == "PC"
pc = adata_combined_noDoublets[is_pc].copy()

sc.pl.dotplot(pc, var_names=pc_markers, groupby="Collection", standard_scale="var")

In [None]:
# Same PC-marker dotplot as the previous cell, but read from the 'counts'
# layer instead of X.
sc.pl.dotplot(
    pc,
    var_names=pc_markers,
    layer='counts',
    groupby="Collection",
    standard_scale="var"
)

In [None]:
# Largest value in X of the object reloaded from the checkpoint.
adata_combined_noDoublets.X.max()