In [None]:
#In an anaconda environment that contains scimap, use csv files that are MCQuant outputs, as input for scimap
#Use glob function to load all csv files, and then convert them to adata
import scimap as sm
import anndata as ad
import pandas as pd
import glob
import sys
import os
import scanpy as sc
import seaborn as sns; sns.set(color_codes=True)
# Directory holding the per-image MCQuant quantification tables (*.csv).
# `path_to_quantification_outputs` is a placeholder and must be defined before running this cell.
path = path_to_quantification_outputs

# glob returns matches in arbitrary, filesystem-dependent order. Sorting makes the order in
# which the images are loaded (and therefore the row order of `adata`) the same on every run.
# NOTE(review): row order may influence the numbering of the k-means clusters produced further
# down, so re-check the cluster renaming if the numbering differs from a previous run.
files = sorted(glob.glob(os.path.join(path, '*.csv')))
if not files:
    # Fail here with a clear message instead of deep inside scimap.
    raise FileNotFoundError(f"No .csv quantification files found in: {path}")

filepath = files  # alias of `files`, kept so existing references to `filepath` keep working
# Convert the list of MCQuant tables into a single AnnData object.
adata = sm.pp.mcmicro_to_scimap (filepath)

In [None]:
# Turn on IPython's Qt event-loop integration.
# NOTE(review): presumably required for the napari windows opened by the ROI-annotation cells further down — confirm.
%gui qt

In [None]:
# Derive subject and condition annotations from the image IDs.
# Image IDs are expected to start with 'H<number>' or 'P<number>'; that prefix is the subject
# identification (matching Table S1 of the manuscript).
import re  # not used by the statements below; kept in case other cells rely on `re` being imported

# Create the 'SubjectID' column.
# expand=False makes str.extract return a Series (one value per cell) rather than a
# one-column DataFrame, which is the natural thing to assign to a single obs column.
adata.obs['SubjectID'] = adata.obs['imageid'].str.extract(r'^(H\d+|P\d+)', expand=False)

# Create the 'ConditionID' column from the first letter of 'imageid'.
adata.obs['ConditionID'] = adata.obs['imageid'].str[0].map({'H': 'Healthy', 'P': 'Periodontitis'})

# Image IDs that do not follow the naming scheme end up as NaN in the new columns.
# Report them so they are not silently carried into downstream grouping.
_unannotated = adata.obs['SubjectID'].isna() | adata.obs['ConditionID'].isna()
if _unannotated.any():
    print(
        "WARNING: could not derive SubjectID/ConditionID for imageid(s):",
        sorted(adata.obs.loc[_unannotated, 'imageid'].astype(str).unique()),
    )

In [None]:
# IBEX clustering starts with a supervised phenotyping step.
# The phenotyping workflow CSV read here was created according to the scimap instructions.
phenotype_csv = '/data/vasileiosionat2/IBEX_FINAL/Scimap/phenotypes_FINAL.csv'
phenotype = pd.read_csv(phenotype_csv)
# Display the phenotyping table as the cell output.
phenotype

In [None]:
# Rescale the marker expression data.
# No manual gates are supplied (gate=None): gating is unsupervised (GMM).
adata = sm.pp.rescale(
    adata,
    gate=None,
)

In [None]:
# Assign a phenotype to every cell, using the rescaled data and the supervised
# phenotyping strategy loaded into `phenotype`. The result is stored under the 'phenotype' label.
adata = sm.tl.phenotype_cells(
    adata,
    phenotype=phenotype,
    label='phenotype',
)

In [None]:
# Matrix plot of protein marker expression for each phenotype (all markers in adata.var).
# The phenotype matrixplot is available on github...
sc.pl.matrixplot(
    adata,
    var_names=adata.var.index,
    groupby='phenotype',
    dendrogram=False,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
)

In [None]:
# Initial k-means sub-clustering of the IBEX phenotypes with the scimap clustering tool.
# The number of clusters (k) for each phenotype was chosen by visually inspecting the clusters
# against marker expression in the napari scimap tool, and from the protein expression per cluster.

# Epithelial phenotype: the only step that reads its groups from the 'phenotype' column and the
# only one restricted to a marker subset. The steps below read from the 'kmeans' column.
epithelial_markers = ['CD138', 'CK19', 'CK5', 'PanCK','S100a8-9']
adata = sm.tl.cluster(
    adata,
    k=4,
    method='kmeans',
    sub_cluster=True,
    subset_genes=epithelial_markers,
    sub_cluster_column='phenotype',
    sub_cluster_group='epithelial',
    use_raw=False,
)

# Remaining phenotypes as (group name, k), sub-clustered one after another in this order.
kmeans_steps = [
    ('fibroblast', 4),     # fibroblast phenotype
    ('APCs-immune', 4),    # APC phenotype
    ('T cell', 6),         # T cell phenotype
    ('plasma cells', 4),   # plasma cell phenotype
    ('neutrophils', 4),    # neutrophil phenotype
    ('immune-other', 3),   # immune-other phenotype
    ('SMC', 3),            # SMC phenotype
    ('endothelial', 3),    # endothelial phenotype
    ('mast cells', 2),     # mast cell phenotype
]
for group_name, n_clusters in kmeans_steps:
    adata = sm.tl.cluster(
        adata,
        k=n_clusters,
        method='kmeans',
        sub_cluster=True,
        sub_cluster_column='kmeans',
        sub_cluster_group=group_name,
        use_raw=False,
    )

In [None]:
# Second round of sub-clustering: split selected k-means clusters in two (k=2) on a small set
# of markers to resolve the subsets more accurately.
# Each step is (cluster(s) to split, markers used). The order matters: the last step splits
# 'neutrophils-0-1', a cluster that is only created by the step before it.
refinement_steps = [
    # epithelial clusters (plus 'immune-other-1') on Ki67, to uncover proliferating epithelial cells
    (['epithelial-0','epithelial-1','epithelial-2','epithelial-3','immune-other-1'], ['Ki67']),
    # immune-other-0 on CD138
    ('immune-other-0', ['CD138']),
    # plasma cells-1 on CD3
    ('plasma cells-1', ['CD3']),
    # T cell-3 on CD138
    ('T cell-3', ['CD138']),
    # neutrophil clusters 0 and 1 on CD31, Thy-1 and aSMA
    (['neutrophils-0','neutrophils-1'], ['CD31','Thy-1','aSMA']),
    # neutrophils-0-1 on Thy-1
    (['neutrophils-0-1'], ['Thy-1']),
]
for target_clusters, split_markers in refinement_steps:
    adata = sm.tl.cluster(
        adata,
        k=2,
        method='kmeans',
        sub_cluster=True,
        subset_genes=split_markers,
        sub_cluster_column='kmeans',
        sub_cluster_group=target_clusters,
        use_raw=False,
    )

In [None]:
# Matrix plot of marker expression for the raw k-means outputs, before any renaming/merging.
# The kmeans matrixplot may be found in the github...
sc.pl.matrixplot(
    adata,
    var_names=adata.var.index,
    groupby='kmeans',
    dendrogram=False,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
)

In [None]:
# Rename and merge the k-means outputs into annotated clusters, according to their marker
# expression and visual inspection of each cluster.
# Keys are the final cluster names written to adata.obs['cluster']; values are the raw
# 'kmeans' labels that are merged into that cluster.
cluster_to_kmeans = {
    'mAPC': ['APCs-immune-1','APCs-immune-3'],  # myeloid APC cluster
    'mAPC.perivasc': ['APCs-immune-0'],  # perivascular myeloid APCs
    'T.mAPC.mix': ['APCs-immune-2'],  # T cell and mAPC mixture
    'SMC.HLA-DR(-)': ['SMC-0','SMC-2'],  # HLA-DR (-) smooth muscle cell
    'SMC.HLA-DR(+)': ['SMC-1'],  # HLA-DR (+) smooth muscle cell
    'Tcyt.B.mix': ['T cell-5'],  # T cytotoxic and B cell mixture
    'T.h': ['T cell-2'],  # CD4 (+) T cells (T helper)
    'Pl.T.mix': ['T cell-3-0', 'plasma cells-1-0', 'plasma cells-1-1'],  # Plasma cell and T cell mixture
    'B': ['b cell'],  # B cells
    'Th.B.mix': ['T cell-1'],  # T helper and B cell mixture
    'T.cyt': ['T cell-4'],  # T cytotoxic cells
    'T.IE': ['T cell-0'],  # Intraepithelial T cells
    'VEC.aSMA(-)': ['endothelial-2'],  # Vascular endothelial cells without proximity to SMCs, indicative of smaller vessels (eg capillaries)
    'VEC.HLA-DR(+)': ['endothelial-1'],  # HLA-DR(+) vascular endothelial cells
    'VEC.HLA-DR(-)': ['endothelial-0'],  # HLA-DR(-) vascular endothelial cells
    'Ep.or.b-pb': ['epithelial-0-0','epithelial-1-1','immune-other-1-0','likely-SMC'],  # Basal-parabasal oral epithelial cells
    'Ep.or.sp': ['epithelial-3-0'],  # Spinous oral epithelial cells
    'Ep.TAE-Krt': ['epithelial-2-0'],  # Tooth-associated epithelium and oral keratin epithelial cells
    'Ep.prol': ['epithelial-0-1','epithelial-1-0','immune-other-1-1','epithelial-3-1', 'epithelial-2-1'],  # Proliferating epithelial cells
    'Fib.Thy1(+)': ['fibroblast-2'],  # Thy-1(+) fibroblasts
    'ECM.Vim(+)': ['fibroblast-0'],  # Extracellular matrix (lack of nuclei), Vimentin positive
    'Fib.Thy1(-)': ['fibroblast-1'],  # Thy-1(-) fibroblasts
    'Fib.HLA-DR(+)': ['fibroblast-3'],  # HLA-DR(+) fibroblasts
    'Plasma': ['plasma cells-2','plasma cells-3','immune-other-0-0'],  # Plasma cells
    'En.Im.mix': ['immune-other-0-1','plasma cells-0', 'T cell-3-1'],  # Endothelial-Immune mixture
    'Thy1.CD45.cell': ['immune-other-2'],  # Thy-1(+) immune cells
    'Lang': ['likely-APCs-immune'],  # Langerhans cells
    'Unclear': ['likely-endothelial'],  # Unclear protein expression distribution
    'Mast': ['mast cells-0','mast cells-1'],  # Mast cell
    'Neut.CT': ['neutrophils-1-1','neutrophils-2'],  # Connective tissue neutrophils, S100a8-9(-)
    'Neut.S100a8-9': ['neutrophils-0-0','neutrophils-0-1-0'],  # Connective tissue neutrophils, S100a8-9(+)
    'Neut.IE': ['neutrophils-3'],  # Intraepithelial neutrophils
    'Neut.BV': ['Unknown','neutrophils-0-1-1','neutrophils-1-0'],  # Blood vessel - associated neutrophils
}

# Every cell starts with the placeholder 'test'; cells whose 'kmeans' label appears in the
# mapping are then overwritten with the corresponding cluster name (same order as listed).
adata.obs['cluster'] = 'test'
for cluster_name, kmeans_labels in cluster_to_kmeans.items():
    adata.obs.loc[adata.obs['kmeans'].isin(kmeans_labels), 'cluster'] = cluster_name

# Any 'kmeans' label missing from the mapping leaves its cells on the placeholder. Report this
# instead of letting a 'test' cluster flow silently into the plots and the saved object.
unassigned_kmeans = adata.obs.loc[adata.obs['cluster'] == 'test', 'kmeans'].astype(str).unique()
if len(unassigned_kmeans) > 0:
    print(
        "WARNING: cells with these 'kmeans' labels were not assigned to any cluster "
        "and keep the placeholder 'test':",
        sorted(unassigned_kmeans),
    )

In [None]:
# Matrix plot of protein expression for each merged/renamed cluster.
# The cluster matrixplot may be found in the github...
sc.pl.matrixplot(
    adata,
    var_names=adata.var.index,
    groupby='cluster',
    dendrogram=False,
    use_raw=False,
    cmap="vlag",
    standard_scale='var',
)

In [None]:
# Duplicate the 'cluster' column into a new column named 'Lvl5'.
# 'Lvl5' will hold the most fine-grained cell annotations: the cells below refine it with the
# pathologist (spatial) annotations.
# The copy is cast to categorical explicitly because a later cell uses the `.cat` accessor on
# 'Lvl5'; 'cluster' is created from plain strings, so this must not depend on some earlier step
# having converted it to a categorical column.
adata.obs['Lvl5'] = adata.obs['cluster'].astype('category')

In [None]:
# Pathologist annotations to facilitate epithelial characterization.
# Each sample is opened in turn in napari; select the superficial layers of the oral/surface
# epithelium. The annotation is stored under the 'Surface_Epi' label.

# Options shared by every call. overwrite=False is passed so that each image adds to the
# 'Surface_Epi' annotation rather than replacing it (presumably keeping ROIs of earlier images — confirm).
surface_epi_roi_options = {
    'imageid': 'imageid',
    'overlay': None,
    'overlay_category': None,
    'markers': None,
    'channel_names': 'default',
    'x_coordinate': 'X_centroid',
    'y_coordinate': 'Y_centroid',
    'overwrite': False,
    'label': 'Surface_Epi',
}

image_ids = adata.obs['imageid'].unique()
for img_id in image_ids:
    # Placeholders: must point to the processed image and the segmentation mask.
    # NOTE(review): as written they do not depend on img_id — presumably they should resolve to
    # the files of the image currently being annotated; confirm.
    image_path = path_to_processed_images
    mask_path = path_to_segmentation_masks

    adata = sm.pl.addROI_image(
        image_path,
        adata,
        subset=img_id,
        seg_mask=mask_path,
        **surface_epi_roi_options,
    )

In [None]:
# Rename epithelial clusters in 'Lvl5' based on the pathologist 'Surface_Epi' annotations.
# The relabelling is done on a local categorical copy and written back with one plain column
# assignment. This replaces `adata.obs['Lvl5'].replace(..., inplace=True)`, a chained in-place
# update that is not guaranteed to reach adata.obs (it does not under pandas Copy-on-Write).
lvl5 = adata.obs['Lvl5'].astype('category')
in_surface_epi = adata.obs['Surface_Epi'] == 'Surface_Epi'

# 'Ep.TAE-Krt' cells inside the "Surface_Epi" region become keratin epithelial cells
# ('Ep.Or.k'); the remaining 'Ep.TAE-Krt' cells are renamed to 'Ep.TA'.
# The new category is added only when missing, so re-running this cell does not raise.
if 'Ep.Or.k' not in lvl5.cat.categories:
    lvl5 = lvl5.cat.add_categories(['Ep.Or.k'])
keratin_condition = (lvl5 == 'Ep.TAE-Krt') & in_surface_epi
lvl5.loc[keratin_condition] = 'Ep.Or.k'
# With a dict, rename_categories ignores keys that are not existing categories.
lvl5 = lvl5.cat.rename_categories({'Ep.TAE-Krt': 'Ep.TA'})

# Cells previously called basal/parabasal ('Ep.or.b-pb') that lie inside "Surface_Epi"
# (superficial epithelial layers) are moved into the spinous epithelial cluster ('Ep.or.sp').
spinous_condition = (lvl5 == 'Ep.or.b-pb') & in_surface_epi
lvl5.loc[spinous_condition] = 'Ep.or.sp'

adata.obs['Lvl5'] = lvl5

In [None]:
# Pathologist annotations to further clean the epithelial characterization.
# Each sample is opened in turn in napari; select the connective tissue (stored under the
# 'CT' label). Cells inside the connective tissue that were previously identified as epithelial
# are then merged into the "Unclear" cluster.

image_ids = adata.obs['imageid'].unique()

for img_id in image_ids:
    # Placeholders: must point to the processed image and the segmentation mask.
    # NOTE(review): as written they do not depend on img_id — presumably they should resolve to
    # the files of the image currently being annotated; confirm.
    image_path = path_to_processed_images
    mask_path = path_to_segmentation_masks

    adata = sm.pl.addROI_image(
        image_path, 
        adata, 
        subset=img_id,
        imageid='imageid',
        overlay=None,
        overlay_category=None,
        markers=None,
        channel_names='default',
        x_coordinate='X_centroid', 
        y_coordinate='Y_centroid', 
        seg_mask=mask_path,
        overwrite=False,
        label='CT'
    )

# Epithelial-labelled cells that fall inside the annotated connective tissue.
unclear_condition = (
    adata.obs['Lvl5'].isin(['Ep.or.b-pb', 'Ep.or.sp', 'Ep.prol', 'Ep.TA']) &
    (adata.obs['CT'] == 'CT')
)

# A categorical column only accepts values that are already among its categories; make sure
# 'Unclear' is one of them so the assignment below cannot fail when no cell carries it yet.
if isinstance(adata.obs['Lvl5'].dtype, pd.CategoricalDtype) and 'Unclear' not in adata.obs['Lvl5'].cat.categories:
    adata.obs['Lvl5'] = adata.obs['Lvl5'].cat.add_categories(['Unclear'])

# Update the 'Lvl5' column where the condition is met
adata.obs.loc[unclear_condition, 'Lvl5'] = 'Unclear'

In [None]:
# Re-cluster the 'Plasma' cells on Ki67 (k=2) to uncover plasmablasts.
# The fine-grained annotations live in the 'Lvl5' column, so the sub-clustering both reads its
# groups from 'Lvl5' and writes its result back to 'Lvl5' (label='Lvl5'). No column named
# 'spatial_cluster' is created anywhere in this notebook.
adata = sm.tl.cluster(
    adata, 
    k=2,  
    method='kmeans',  
    sub_cluster=True, 
    subset_genes=['Ki67'],  
    sub_cluster_column='Lvl5',  
    sub_cluster_group='Plasma',  
    use_raw=False,  
    label= 'Lvl5'
)

# Map the two sub-clusters to their final names: plasma cells and plasmablasts (PB).
# NOTE(review): k-means cluster numbering is arbitrary — verify that 'Plasma-1' is the
# Ki67-high group before relying on the 'PB' label.
replacement_dict = {
    'Plasma-0': 'Plasma',
    'Plasma-1': 'PB',
}
adata.obs['Lvl5'] = adata.obs['Lvl5'].replace(replacement_dict)
# Show the final set of 'Lvl5' annotations as a sanity check.
print(adata.obs['Lvl5'].unique())


In [None]:
# Write the annotated AnnData object to disk in h5ad format.
# `your_adata_directory` is a placeholder for the output path and must be defined beforehand.
output_h5ad = your_adata_directory
adata.write_h5ad(output_h5ad)