## Utility Functions

In [None]:
# Make cluster by cluster plots
def plot_by_cluster(adata, clust_key, size=60, legend_loc=None, **kwargs):
    """Plot one UMAP panel per cluster, each highlighting a single cluster.

    Works on ``adata.copy()``: for every category of ``adata.obs[clust_key]``
    a membership column named after the cluster is written to the copy's
    ``obs`` (as a categorical), together with a two-entry palette in ``uns``
    made of light grey followed by that cluster's colour taken from
    ``adata.uns[clust_key + '_colors']``. All writes go to the copy.

    Parameters
    ----------
    adata
        Annotated data object providing ``obs``, ``uns`` and ``copy()``
        (presumably an ``AnnData`` -- confirm against the caller).
    clust_key
        Name of the categorical ``obs`` column holding the cluster labels.
        A palette must already exist under ``clust_key + '_colors'`` in ``uns``.
    size
        Point size forwarded to ``sc.pl.umap``.
    legend_loc
        Legend location forwarded to ``sc.pl.umap``.
    **kwargs
        Any further keyword arguments are passed through to ``sc.pl.umap``.
    """
    highlighted = adata.copy()
    cluster_labels = adata.obs[clust_key].cat.categories
    palette_key = clust_key + '_colors'

    for position, clust in enumerate(cluster_labels):
        membership = adata.obs[clust_key].isin([clust])
        highlighted.obs[clust] = membership.astype('category')
        # Grey for the first category, the cluster's own colour for the second.
        highlighted.uns[clust + '_colors'] = ['#d3d3d3', adata.uns[palette_key][position]]

    # The groups to draw are read from the LAST cluster's membership column:
    # every category except the first (presumably just ``True`` -- verify).
    shown_groups = highlighted.obs[clust].cat.categories[1:].values
    sc.pl.umap(
        highlighted,
        groups=shown_groups,
        color=cluster_labels.tolist(),
        size=size,
        legend_loc=legend_loc,
        **kwargs
    )

In [None]:
# Map annotations to abbreviated names
def annotation_abbr(annotations):
    """Return a new list with each annotation shortened to its abbreviation.

    The substitutions are plain substring replacements applied in a fixed
    order; the order matters (e.g. the NKT phrase must be handled before the
    shorter NK phrase, and the generic ' cell' suffix is stripped late).
    The input iterable is not modified.
    """
    # (long form, short form) pairs, applied top to bottom.
    substitutions = (
        ('Natural killer T (NKT) cell', 'NKT'),
        ('Hematopoietic stem cell', 'HSC'),
        ('Mesenchymal stem cell', 'MSC'),
        ('Natural killer cell', 'NK'),
        ('Embryonic stem cell', 'ESC'),
        ('Regulatory T (Treg) cell', 'Treg'),
        (' (Th1)', ''),
        ('T helper', 'Th'),
        (' cell', ''),
        ('-', 'UND'),
    )

    def shorten(label):
        for long_form, short_form in substitutions:
            label = label.replace(long_form, short_form)
        return label

    return [shorten(label) for label in annotations]

In [None]:
def names_make_unique(annotations):
    """Disambiguate repeated names in ``annotations`` in place and return it.

    A name that occurs ``k > 1`` times is rewritten at each of its positions
    as ``'(<name>)_<n>'``, where ``n`` counts down from ``k`` at the first
    occurrence to ``1`` at the last. Names that occur once are left as is.
    The list passed in is mutated and is also the return value.
    """
    # Group the positions of every distinct name, in order of appearance.
    positions_by_name = {}
    for position, label in enumerate(annotations):
        positions_by_name.setdefault(label, []).append(position)

    # Rewrite only the names that are shared by several positions.
    for label, positions in positions_by_name.items():
        occurrences = len(positions)
        if occurrences <= 1:
            continue
        for offset, position in enumerate(positions):
            annotations[position] = '(' + label + ')' + '_' + str(occurrences - offset)

    return annotations

## Automated Annotation

#### We use SCSA to help annotate the clusters. SCSA annotation is based on a few manually curated databases. Instructions can be found at https://github.com/bioinfo-ibms-pumc/SCSA.

In [None]:
# %%bash

# cd /SCSA
# python3 SCSA.py -d whole.db -i /scratch/umap_cluster.csv -s scanpy -E -f1.5 -p 0.01 -o /scratch/umap_annotation_result -m txt 

In [None]:
# Raw SCSA output for the UMAP clustering, copied and pasted from the tool's result.
# One row per cluster. Element 0 is the cluster id (as a string) and element 2 is the
# predicted cell type; these are the two fields read by the extraction code below.
# NOTE(review): the remaining fields look like a confidence flag ('Good' / '?'),
# the score(s) and a score ratio, with '|' separating the top two candidates when
# the call is ambiguous -- inferred from the values, confirm against the SCSA docs.
# Rows are in string-sorted cluster order ('0', '1', '10', ...), not numeric order.
umap_annotation_output=[['0', 'Good', 'B cell', 13.5039603520991, 8.176323164417516],
['1', '?', 'Stem cell|Epithelial cell', '7.405988552514153|6.94665636004819', 1.0661227745635566],
['10', 'Good', 'LGR5+ stem cell', 0.4515311265969228, '-'],
['11', '?', 'Natural killer cell|Natural killer T (NKT) cell', '6.220128888075735|4.716953887305404', 1.3186749407951135],
['12', 'Good', 'Follicular helper (Tfh) T cell', 2.0395464663838925, 5.262837321664994],
['13', '?', 'Monocyte|Macrophage', '7.775149253244101|7.300776271963796', 1.0649756907497598],
['14', '?', 'B cell|Monocyte', '8.877604414513305|8.019341042816869', 1.1070241765643831],
['15', '?', 'Regulatory T (Treg) cell|T cell', '9.553171878515663|5.830713537800617', 1.6384224360504567],
['16', 'Good', 'Natural killer T (NKT) cell', 12.762263672005355, 2.590420909203688],
['17', '?', 'B cell|Natural killer T (NKT) cell', '10.019943935689373|5.592625739275372', 1.7916349855707743],
['18', '?', 'Mesenchymal stem cell|Astrocyte', '8.72163411682708|7.206265265084938', 1.210284911254135],
['19', 'Good', 'Endothelial cell', 12.66732020203273, 2.06757190587737],
['2', '?', 'Natural killer T (NKT) cell|T cell', '6.437041806536354|5.3027762478767535', 1.2139003242148418],
['20', '?', 'Mesenchymal stem cell|Astrocyte', '8.567137790087983|7.208846232954828', 1.1884201040277156],
['21', '?', 'B cell|Natural killer T (NKT) cell', '9.181853435561|7.291855328450933', 1.259193034142598],
['22', '?', 'Mast cell|Hematopoietic stem cell', '7.264891439672104|6.3042483839426815', 1.152380267594824],
['3', '?', 'Natural killer T (NKT) cell|T helper cell', '5.089999937175556|3.7732027268885635', 1.3489866051731714],
['4', '?', 'Natural killer cell|T cell', '6.672744460783595|6.42786041204717', 1.0380972879058576],
['5', '?', 'CD8+ T cell|Natural killer T (NKT) cell', 'nan|nan', 'nan'],
['6', '?', 'Microglial cell|T helper2 (Th2) cell', '5.6674005917775165|2.8762809288805062', 1.970391881707938],
['7', '?', 'T cell|Natural killer T (NKT) cell', '5.939816411907659|5.161757383418162', 1.1507352962750566],
['8', '?', 'T cell|Natural killer cell', '8.23750857698224|6.404606034860954', 1.2861850568394997],
['9', '?', 'T cell|Natural killer cell', '6.717320760876801|5.587167462130855', 1.2022766108955902]]


# Extract the predicted cell type of every cluster into a list positioned by
# numeric cluster id (the rows themselves are in string-sorted order).
umap_cluster_names = [None] * len(umap_annotation_output)
for record in umap_annotation_output:
    # record[0] is the cluster id as a string, record[2] the predicted cell type.
    umap_cluster_names[int(record[0])] = record[2]

In [None]:
# Abbreviate the SCSA labels, then make repeated labels distinct so that every
# cluster ends up with its own category name.
umap_cluster_names = names_make_unique(annotation_abbr(umap_cluster_names))

# Print (cluster index, label) pairs to review which labels need extra attention.
for indexed_label in enumerate(umap_cluster_names):
    print(indexed_label)

# Relabel the 'leiden' categories with the new names and draw the annotated UMAP.
combined_umap.rename_categories('leiden', umap_cluster_names)
fig, ax = plt.subplots(figsize=(12, 12))
sc.pl.umap(
    combined_umap,
    color='leiden',
    legend_loc='on data',
    frameon=False,
    ax=ax,
)