In [None]:
import os
import numpy as np
import pandas as pd
import phenograph
import scanpy as sc
from utils import *
import anndata

## data preprocessing and annotation

### load the preprocessed dataset and apply the precomputed sketch (subsample) indices

In [None]:
# --- configuration: which dataset to load and which sketch size to apply ---
data_directory = 'data'
sample_id = 'ACTG5248'
sketch_size = '2500'
timepoint_label = 'day'

# --- load the preprocessed AnnData and the saved subsample indices ---
adata_path = os.path.join(data_directory, 'adata', f'{sample_id}_preprocessed.h5ad')
subsample_path = os.path.join(data_directory, 'kh_output', f'{sample_id}_kh_subsample_{sketch_size}.npy')

adata = sc.read_h5ad(adata_path)
subsample_idx = np.load(subsample_path)

# --- KH subsampling ---
# subsample_idx[i] is applied positionally (via .iloc) to the cells of the i-th
# fcs file, where files are taken in the category order of obs['fcs_file'].
fcs_files = list(adata.obs['fcs_file'].cat.categories)
num_sample_sets = len(fcs_files)

per_file_subsamples = []
for i, fcs_file in enumerate(fcs_files):
    fcs_data = adata[adata.obs['fcs_file'] == fcs_file]
    kept_obs_names = fcs_data.obs.iloc[subsample_idx[i]].index
    per_file_subsamples.append(fcs_data[kept_obs_names])

# stack the per-file subsamples back into a single AnnData
adata_subsample_ = anndata.concat(per_file_subsamples)

# --- marker panels ---
functional_markers = [
    'PD-1', 'Bcl-2', 'Ki-67', 'CCR5', 'HLA-DR', 'CD57', 'CD38', 'CD127',
]

phenotypic_markers = [
    'CD45', 'CD19', 'CD3', 'CD4', 'CD8',
    'CD14', 'CD56', 'CD16', 'CD11c', 'TCRgd',
    'CD45RA', 'CCR10', 'CD28', 'CD27', 'CD95',
    'CCR7', 'CD99', 'CD127', 'CD31', 'FoxP3',
    'CCR4', 'CCR6', 'CXCR5', 'CXCR3', 'CD25',
]

# CD31 is dropped from the phenotypic panel for the 'CHI' dataset
if sample_id == 'CHI':
    phenotypic_markers.remove('CD31')

# --- build the dataframe handed to phenograph ---
# expression values (columns = var_names, index = obs_names) plus three annotation columns
df = pd.DataFrame(
    adata_subsample_.X,
    index = adata_subsample_.obs_names,
    columns = adata_subsample_.var_names,
)
for df_column, obs_column in (('sample_id', 'fcs_file'), ('subject', 'patient_id'), ('timepoint', 'timepoint')):
    df[df_column] = adata_subsample_.obs[obs_column].copy()

### perform clustering and metaclustering

In [None]:
# Resolution passed to both clustering passes; it is also embedded in the obs column names below.
resolution = 1.25

# Two passes using the helpers pulled in by `from utils import *`:
# the first call yields the frame consumed by the second, and the result carries
# the 'cluster_label' and 'meta_label' columns read further down.
# NOTE(review): no random seed is set here — confirm whether the helpers are
# deterministic, since the annotation step maps names onto metacluster ids.
data, _, _ = phenograph_clustering(
    data = df,
    features = phenotypic_markers,
    k = 30,
    resolution_parameter = resolution,
    verbose = True,
    print_performance_metrics = False,
)
data, _, _ = phenograph_metaclustering(
    data = data,
    features = phenotypic_markers,
    k = 30,
    resolution_parameter = resolution,
    min_cluster_size = 3,
    verbose = True,
    print_performance_metrics = True,
)

# Everything except the annotation columns becomes the expression matrix.
annotation_columns = ['sample_id', 'cluster_label', 'meta_label', 'subject', 'timepoint']
adata_subsample_X = data.drop(columns = annotation_columns)

adata_subsample = anndata.AnnData(adata_subsample_X)

# Copy the annotations into obs as plain strings (obs column -> source column in `data`).
obs_sources = {
    f'phenograph_clusters_res{resolution}': 'cluster_label',
    f'phenograph_metaclusters_res{resolution}': 'meta_label',
    'fcs_file': 'sample_id',
    'patient_id': 'subject',
    'timepoint': 'timepoint',
}
for obs_column, data_column in obs_sources.items():
    adata_subsample.obs[obs_column] = data[data_column].values.astype('str')

### annotate metaclusters

In [None]:
# Annotate each metacluster id with a cell-type name and store the result as an
# ordered categorical obs column.
metacluster_key = f'phenograph_metaclusters_res{resolution}'
labeled_key = f'{metacluster_key}_labeled'

#update naming dictionary below to label clusters
# metacluster_names: one name per metacluster id, listed in the order of the
#                    sorted id categories (see note below).
# cluster_order:     the same names, in the order the categories should be stored in.
if sample_id == 'ACTG5248':
    metacluster_names = [
        'CD4+ Tcm 1', 'CD4+ Tcm 2', 'CD8+ Tcm 1', 'CD8+ Tcm 2', 'naive CD4+', 'naive CD8+',
        'CD8+ Tcm 3', 'CD8+ Tem 1', 'c15', 'NK cells', r'$\gamma\delta$', 'CD4+ Tem 1',
        'CD4+ Tem 2', 'CD8+ Tem 2', 'CD8+ Tte 1', 'CD8+ Tte 2', 'CD8+ Tte 3', 'B cells',
        'memory B cells', 'nonclassical monocytes', 'c10', 'intermediate monocytes',
        'classical monocytes',
    ]
    cluster_order = [
        'naive CD4+', 'naive CD8+', 'CD4+ Tcm 1', 'CD4+ Tcm 2', 'CD8+ Tcm 1', 'CD8+ Tcm 2',
        'CD8+ Tcm 3', 'CD4+ Tem 1', 'CD4+ Tem 2', 'CD8+ Tem 1', 'CD8+ Tem 2', 'CD8+ Tte 1',
        'CD8+ Tte 2', 'CD8+ Tte 3', r'$\gamma\delta$', 'NK cells', 'c10', 'c15', 'B cells',
        'memory B cells', 'classical monocytes', 'intermediate monocytes',
        'nonclassical monocytes',
    ]
else:
    metacluster_names = [
        'CD8+ Tcm', 'naive CD8+', 'naive CD4+', 'CD4+ Tem', 'CD4+ Tcm', 'B cells',
        'nonclassical monocytes', 'NK cells', 'CD8+ Tte', r'$\gamma\delta$',
        'intermediate monocytes', 'CD8+ Tem',
    ]
    cluster_order = [
        'naive CD4+', 'naive CD8+', 'CD4+ Tcm', 'CD8+ Tcm', 'CD4+ Tem', 'CD8+ Tem',
        'CD8+ Tte', r'$\gamma\delta$', 'NK cells', 'B cells', 'intermediate monocytes',
        'nonclassical monocytes',
    ]

# Unique metacluster ids. pd.Categorical sorts its categories, and the ids were
# stored as strings, so the order is lexicographic ('10' sorts before '2').
# A Categorical exposes `.categories` directly; `.cat` exists only on Series.
metacluster_ids = pd.Categorical(adata_subsample.obs[metacluster_key]).categories

# zip() would silently truncate on a length mismatch and leave cells unlabeled.
if len(metacluster_ids) != len(metacluster_names):
    raise ValueError(
        f'found {len(metacluster_ids)} metaclusters in {metacluster_key!r} but '
        f'{len(metacluster_names)} names were provided; update the naming list'
    )
# Every name must appear in cluster_order, otherwise it would become NaN below.
if set(metacluster_names) != set(cluster_order):
    raise ValueError('cluster_order must contain exactly the names used in the naming list')

naming_dict = dict(zip(metacluster_ids, metacluster_names))

# Map from the raw metacluster column (the labeled column does not exist yet).
metaclusters_labeled = pd.Series(adata_subsample.obs[metacluster_key].values).map(naming_dict)

# Build the categorical with the desired category order in one step; a freshly
# assigned object-dtype column has no `.cat` accessor to reorder afterwards.
adata_subsample.obs[labeled_key] = pd.Categorical(metaclusters_labeled.values, categories = cluster_order)

### save subsampled adata object with annotated metaclusters

In [None]:
# Write the subsampled AnnData, now carrying cluster assignments and annotations,
# next to the preprocessed input; the file name records the dataset and resolution.
output_path = os.path.join(data_directory, 'adata', f'{sample_id}_subsample_res{resolution}.h5ad')
adata_subsample.write(output_path)