In [None]:
# subcluster each lineage and annotate
import anndata
import scanpy as sc
import pandas as pd
import glob
import skimage
import re
import scanpy.external as sce
import matplotlib.pyplot as plt
from pathlib import Path

# Font handling for vector exports (fonttype 42 keeps PDF text editable)
plt.rcParams.update({
    'svg.fonttype': 'none',
    'pdf.fonttype': 42,
})

import os

# Lineage handled by this notebook; it selects the per-lineage directory below
lin = 'T_NK'
work_dir = f"/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/revision/merge/no_harmony/{lin}"
os.chdir(work_dir)

# Echo the directory that all relative reads/writes will now resolve against
print(os.getcwd())

# Gene lists used to restrict marker/DEG inspection. All three resources are
# tab-separated tables with a header row.

# HGNC Ig locus genes
ig_genes_table = pd.read_csv(
    "/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/analysis/resources/IgGenes_HGNC_geneNamesOrg.txt",
    sep="\t",
    header=0
)
ig_genes = ig_genes_table['Approved symbol'].tolist()

# Protein-coding symbols: the candidate pool for DEG inspection
protein_coding_genes_table = pd.read_csv("/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/analysis/resources/HGNC_protein_coding_gene.txt", sep="\t")
protein_coding_genes = protein_coding_genes_table['symbol'].tolist()

# Hemoglobin genes
hemo_genes_table = pd.read_csv("/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/analysis/resources/hemoglobin_genes.txt", sep="\t")
hb_genes = hemo_genes_table['Gene'].tolist()

# Keep protein-coding genes that are not Ig or hemoglobin genes and whose
# symbol does not start with RPL, RPS or MT- (ribosomal / mitochondrial).
# Membership is checked against one set so the filter is a single pass over
# the protein-coding list instead of two list scans per gene.
excluded_genes = set(ig_genes) | set(hb_genes)
excluded_prefixes = ("RPL", "RPS", "MT-")
protein_coding_filtered = [
    g for g in protein_coding_genes
    if g not in excluded_genes and not g.startswith(excluded_prefixes)
]

In [None]:
# Load the input AnnData; the relative path resolves against the working
# directory set with os.chdir in the setup cell.
adata = sc.read_h5ad("split.h5ad")

In [None]:
# Show the object's summary repr as the cell output
adata

In [None]:
# Rank highly variable genes on a copy that has the Ig genes removed, so they
# cannot be selected.
adata_hvg_calc = adata[:, ~adata.var_names.isin(ig_genes)].copy()
sc.pp.highly_variable_genes(adata_hvg_calc, n_top_genes=2000, batch_key='Sample')

# Transfer the HVG calls to the full object: start from all-False, then flag
# the genes selected on the Ig-free copy (Ig genes therefore stay False).
hvg_names = adata_hvg_calc.var_names[adata_hvg_calc.var['highly_variable']]
adata.var['highly_variable'] = False
adata.var.loc[hvg_names, 'highly_variable'] = True

# NOTE(review): presumably sc.tl.pca restricts itself to var['highly_variable']
# by default - confirm for the installed scanpy version.
sc.tl.pca(adata)
sce.pp.harmony_integrate(adata, "Sample")

# Overwrite the stored PCA with the Harmony-adjusted embedding.
# NOTE(review): presumably so that sc.pp.neighbors, called without use_rep,
# builds the graph on the corrected coordinates - confirm.
adata.obsm['X_pca'] = adata.obsm['X_pca_harmony']
sc.pp.neighbors(adata)
sc.tl.umap(adata)

# Cluster at several resolutions; each result is stored in obs under
# "leiden_res_<resolution with two decimals>".
for res in [0.1, 0.3, 0.5, 1.0]:
    leiden_key = f"leiden_res_{res:4.2f}"
    sc.tl.leiden(adata, resolution=res, key_added=leiden_key, flavor="igraph")

In [None]:
# One UMAP panel per Leiden resolution, written to a single PDF
resolutions = ["leiden_res_0.10", "leiden_res_0.30", "leiden_res_0.50", "leiden_res_1.00"]
fig, axes = plt.subplots(1, 4, figsize=(16, 5))
axes = axes.flatten()  # 1-D array of panels for simple pairing with the keys

for ax, res in zip(axes, resolutions):
    sc.pl.umap(adata, color=res, legend_loc='on data', show=False, ax=ax)
    ax.set_aspect('equal')
    # Rasterize the drawn collections in each panel for the PDF export
    for coll in ax.collections:
        coll.set_rasterized(True)

fig.tight_layout()
fig.savefig("leiden_umaps.pdf")

In [None]:
# Add one finer clustering (resolution 2.0), stored as obs["leiden_res_2.00"]
res = 2.0
sc.tl.leiden(
    adata,
    resolution=res,
    key_added=f"leiden_res_{res:4.2f}",
    flavor="igraph",
)

In [None]:
# UMAP colored by the resolution-2.0 Leiden clusters
sc.pl.umap(adata, color='leiden_res_2.00')    

In [None]:
# UMAP colored by the obs column 'nCount_RNA'
sc.pl.umap(adata, color='nCount_RNA')   

In [None]:
# Cell counts per resolution-1.0 cluster (rows) and 'Collection' value (columns)
pd.crosstab(adata.obs['leiden_res_1.00'], adata.obs['Collection'])

In [None]:
# Total number of cells per 'Collection' value
adata.obs['Collection'].value_counts()

In [None]:
# Hand-entered ratios, printed one per line.
# NOTE(review): the numbers look like they were copied from the crosstab /
# value_counts outputs above (cluster count over Collection total) - confirm,
# and consider computing them from the tables instead.
for numerator, denominator in (
    (1907, 35949),
    (6957, 142289),
    (9226, 157610),
    (628, 6269),
):
    print(numerator / denominator)

In [None]:
# Marker panel for annotating clusters: group label -> gene symbols.
# Simplified list; markers that aren't expressed highly were removed.
dev_markers = dict([
    ("Cycling", ["TUBA1B", "HMGB2", "MKI67", "SLAMF6", "PDCD1"]),
    ("Prog.", ["SOX4", "BCL11B", "DDX17", "NFAT5", "MEF2C"]),
    ("TCR", ["TRGC1", "TRDC", "TRAC", "CD3D", "CD4", "CD8A"]),
    ("Co-stim", ["CD27", "CD28", "CD69", "ICOS", "CD40LG"]),
    (
        "Chemok",
        ["SELL", "S1PR1", "S1PR5", "CD44", "LTB"]                  # high on circulating
        + ["CCR7", "CXCR4"]                                        # naive
        + ["CXCR3", "CCR5", "CCL3", "CCL4", "CCL5", "XCL1"],       # Teff
    ),
    ("Cytok", ["IFNG", "TNF"]),
    ("Cytox", ["GZMA", "GZMB", "GZMK", "PRF1"]),
    (
        "TF",
        ["TCF7", "LEF1", "FOXP1"]                                  # stem-like, naive, quiescent
        + ["EOMES"]                                                # memory
        + ["TBX21", "PRDM1", "IRF4", "BATF", "ZEB2"],              # effector
    ),
    ("Inhibit", ["FAS", "CTLA4", "TIGIT", "LAG3", "HAVCR2", "TGFB1", "FOXP3"]),
    ("CD56dim", ["NCAM1", "FCGR3A", "NKG7", "GNLY", "SPON2", "CST7"]),
    ("CD56bri", ["KLRB1", "KLRC1", "IL2RA", "ZBTB16"]),
    ("Mye", ["LYZ", "CD14", "ITGAM"]),
    ("PC", ["SDC1", "MZB1", "TNFRSF17"]),
    ("B", ["CD19", "MS4A1", "CD79A", "VPREB1"]),
    ("ery", ["HBB", "HBD", "AHSP"]),
    ("MKC", ["LEPR", "KITLG", "CXCL12"]),
])

In [None]:
# Marker dotplot grouped by the resolution-1.0 clusters (standard_scale="var")
sc.pl.dotplot(adata, dev_markers, groupby="leiden_res_1.00", standard_scale="var")

In [None]:
# Same marker dotplot, grouped by the resolution-2.0 clusters
sc.pl.dotplot(adata, dev_markers, groupby="leiden_res_2.00", standard_scale="var")

In [None]:
# Collapse the resolution-1.0 Leiden clusters into coarse subsets.
# The trailing comment on each entry keeps the finer label considered for it.
subset_assignment = {
    "0": "CD4T", #"CD4T Nv/Rest", 
    "1": "CD4T", #"CD4T Eff/Act", 
    "2": "T Stim/Exh", 
    "3": "CD8T", #"CD8T Cytotoxic", 
    "4": "CD4T", # "CD4T Nv/Rest",
    "5": "Low_Quality", # possibly thymocytes but mixed markers, enriched in relapse but unclear if biological or low quality
    "6": "gdT/NK", #"gdT/CD56dim NK",
    "7": "CD8T", #"CD8T Cytotoxic",
    "8": "CD8T", #"CD8T Cytokine",
    "9":  "gdT/NK", #"gdT/CD56bri NK",
    "10": "Low_Quality" 
}

# Work on plain string labels so the many-to-one mapping is unambiguous.
cluster_labels = adata.obs["leiden_res_1.00"].astype(str)

# Fail early if the clustering produced a label with no assignment: such cells
# would otherwise get NaN and pass the later "!= 'Low_Quality'" filter.
unmapped_clusters = sorted(set(cluster_labels) - set(subset_assignment))
if unmapped_clusters:
    raise ValueError(
        f"leiden_res_1.00 clusters without a subset assignment: {unmapped_clusters}"
    )

# Store as categorical explicitly; later cells use the .cat accessor on this
# column, and a many-to-one map on a categorical does not return a categorical.
adata.obs["subset"] = cluster_labels.map(subset_assignment).astype("category")

In [None]:
# Restrict to the filtered protein-coding genes (no Ig, hemoglobin,
# mitochondrial or ribosomal genes) before ranking markers.
keep_gene = adata.var_names.isin(protein_coding_filtered)
adata_prot = adata[:, keep_gene].copy()

# Wilcoxon ranking per resolution-1.0 cluster, computed on the restricted object only
sc.tl.rank_genes_groups(adata_prot, method="wilcoxon", groupby="leiden_res_1.00")

# Dotplot of the top 5 ranked genes per cluster
sc.pl.rank_genes_groups_dotplot(
    adata_prot, n_genes=5, groupby="leiden_res_1.00", standard_scale="var"
)

In [None]:
# Drop the cells labelled 'Low_Quality' and keep an independent copy of the rest
keep_cell = adata.obs['subset'] != 'Low_Quality'
adata_cleaned = adata[keep_cell].copy()

In [None]:
# Full marker panel for the presentation dotplot: group label -> gene symbols.
# Note: the plotting cell further down redefines `markers_simple` with a
# trimmed panel before it is used.
markers_simple = dict([
    # ("Progenitor", ['BCL11B', 'DDX17', 'NFAT5']),
    ("TCRab", ["TRAC", "CD3D", "CD4", "CD8A"]),
    ("Co-stim", ["CD27", "CD28", "CD69", "ICOS", "CD40LG"]),
    ("Quiescence TF", ["TCF7", "LEF1", "FOXP1"]),  # stem-like, naive, quiescent
    (
        "Memory / Effector TF",
        ["EOMES"]                                            # memory
        + ["TBX21", "PRDM1", "IRF4", "BATF", "ZEB2"],        # effector
    ),
    (
        "Chemokine",
        ["CCR7", "CXCR4"]                                    # naive
        + ["CXCR3", "CCR5", "CCL3", "CCL4", "CCL5", "XCL1"], # Teff
    ),
    ("Cytokine", ["IFNG", "TNF", "LTB"]),
    ("Cytotoxic", ["GZMK", "GZMA", "GZMB", "GZMH", "PRF1"]),
    ("Cycling", ["TUBA1B", "HMGB2", "MKI67"]),
    (
        "Checkpoint / Regulatory",
        ["SLAMF6", "PDCD1", "FAS", "CTLA4", "TIGIT", "LAG3", "HAVCR2", "TGFB1", "FOXP3"],
    ),
    ("TCRgd", ["TRGC1", "TRDC"]),
    ("CD56bri", ["NCAM1", "KLRB1", "KLRC1", "IL2RA", "ZBTB16"]),
    ("CD56dim", ["FCGR3A", "NKG7", "GNLY", "SPON2", "CST7"]),
])
    

In [None]:
"""
subset_order  = [
    #"Early T",
    "CD4T Nv/Rest",
    "CD4T Eff/Act", 
    "CD8T Cytokine",
    "CD8T Cytotoxic",
    "T Stim/Exh",
    "gdT/CD56bri NK",
    "gdT/CD56dim NK"
]
"""
# Display order of the coarse subsets (passed as categories_order to the dotplot)
subset_order = ["CD4T", "CD8T", "T Stim/Exh", "gdT/NK"]

In [None]:
# Trimmed marker panel used for the saved figure. Compared with the full
# panel, the Progenitor, Quiescence TF, Memory / Effector TF, Chemokine,
# Cytokine and Cycling groups are left out, and FOXP3, IL2RA and ZBTB16 are
# dropped from the groups that remain.
markers_simple = {
    "TCRab": ["TRAC", "CD3D", "CD4", "CD8A"],
    "Co-stim": ["CD27", "CD28", "CD69", "ICOS", "CD40LG"],
    "Cytotoxic": ["GZMK", "GZMA", "GZMB", "GZMH", "PRF1"],
    "Checkpoint / Regulatory": [
        "SLAMF6", "PDCD1", "FAS", "CTLA4", "TIGIT", "LAG3", "HAVCR2", "TGFB1",
    ],
    "TCRgd": ["TRGC1", "TRDC"],
    "CD56bri": ["NCAM1", "KLRB1", "KLRC1"],
    "CD56dim": ["FCGR3A", "NKG7", "GNLY", "SPON2", "CST7"],
}

# NOTE(review): values are read from the 'counts' layer rather than X -
# confirm that this is the intended input for a per-gene scaled dotplot.
dotplot_settings = dict(
    var_names=markers_simple,
    groupby="subset",
    categories_order=subset_order,  # explicit row order
    layer='counts',
    standard_scale="var",
    figsize=(10, 1.2),
    show=False,  # keep the figure open so it can be saved below
)
sc.pl.dotplot(adata_cleaned, **dotplot_settings)

plt.savefig("subset_markers_dotplot.pdf", bbox_inches="tight")

In [None]:
# Number of retained cells per subset label
adata_cleaned.obs['subset'].value_counts()

In [None]:
# Assign colors to each subset
"""
t_palette = {
    "CD4T Nv/Rest": "#eb9449",
    "CD4T Eff/Act": "#c45d04", 
    "CD8T Cytokine": "#e37d7d",
    "CD8T Cytotoxic": "#f00e0e",
    "T Stim/Exh": "#8c3a3a",
    "gdT/CD56bri NK": "#ba6ee0",
    "gdT/CD56dim NK": "#7207a8"  
}
"""
t_palette = {
    "CD4T": "#eb9449",
    "CD8T": "#f00e0e",
    "T Stim/Exh": "#8c3a3a",
    "gdT/NK": "#ba6ee0"
}

# Make the column categorical and drop unused levels here, rather than
# relying on an earlier cell having converted it: the .cat accessor needs a
# categorical, and the palette only covers labels that are still present
# (e.g. there is no entry for 'Low_Quality').
subset_labels = adata_cleaned.obs['subset'].astype('category').cat.remove_unused_categories()
adata_cleaned.obs['subset'] = subset_labels

# One color per category, in category order, under uns['subset_colors']
adata_cleaned.uns['subset_colors'] = [t_palette[c] for c in subset_labels.cat.categories]

# print subcluster labeled umap
# save to pdf
fig, ax = plt.subplots(figsize=(5,5))
sc.pl.umap(adata_cleaned, color='subset', size=3, ax=ax, legend_loc='on data', show=False)
ax.set_aspect('equal')
# Rasterize the drawn collections for the PDF export
for coll in ax.collections:
    coll.set_rasterized(True)
plt.savefig("subset_UMAP.pdf", bbox_inches="tight")

In [None]:
# Snapshot the current X into a 'normalized' layer. A copy is stored so the
# layer does not alias X: any later in-place change to X would otherwise
# change the layer as well.
adata_cleaned.layers['normalized'] = adata_cleaned.X.copy()

In [None]:
# Cell counts per value of the obs column 'lin' (rows) and assigned subset (columns)
pd.crosstab(adata_cleaned.obs['lin'], adata_cleaned.obs['subset'])

In [None]:
# Write the annotated, low-quality-filtered object into the current working directory
adata_cleaned.write('annotated.h5ad')