In [None]:
from utils import tissue
from utils import func
from utils import visualization
from utils import compare

In [None]:
import os
from multiprocessing import Process

import anndata
import numpy as np

#### Load scRNA-seq and gene sets

In [None]:
# Change into the data directory (IPython magic); the relative file names used below resolve against it.
%cd data/
# Read the annotated single-cell dataset from disk.
immune_cell = anndata.read_h5ad('all_cells.h5ad')
# Show the number of rows of the expression matrix (presumably one row per cell).
immune_cell.X.shape[0]

In [None]:
### The gene sets are in a random order
from gsea_api.molecular_signatures_db import GeneSets

# Parse the ImmuneSigDB collection, then flatten it into two parallel lists:
# the gene-set names and the member genes of every gene set.
immune_gene_set = GeneSets.from_gmt('c7.immunesigdb.v2023.1.Hs.symbols.gmt')
immune_gs_name = [gs.name for gs in immune_gene_set.gene_sets]
immune_gene_set = [list(gs.genes) for gs in immune_gene_set.gene_sets]
print(f"ImmuneSigDB database has {len(immune_gene_set)} gene sets.")

In [None]:
### The gene sets are in a random order
from gsea_api.molecular_signatures_db import GeneSets

# Parse the GO cellular-component collection, then flatten it into two parallel
# lists: the gene-set names and the member genes of every gene set.
GO_CC_gene_set = GeneSets.from_gmt('c5.go.cc.v2023.1.Hs.symbols.gmt')
GO_CC_gs_name = [gs.name for gs in GO_CC_gene_set.gene_sets]
GO_CC_gene_set = [list(gs.genes) for gs in GO_CC_gene_set.gene_sets]
print(f"GO_CC database has {len(GO_CC_gene_set )} gene sets.")

In [None]:
# Sorted list of the distinct values found in the 'Organ' annotation column.
tissue_labels = np.unique(immune_cell.obs['Organ']).tolist()
# Value of 'Manually_curated_celltype' that the rest of the notebook filters on.
curated_cell_label = "Classical monocytes"

In [None]:
# Report, for every tissue, how many cells carry the curated cell-type label.
is_curated_type = immune_cell.obs['Manually_curated_celltype'] == curated_cell_label
for tis in tissue_labels:
    in_tissue = immune_cell.obs['Organ'] == tis
    n_cells = immune_cell[in_tissue & is_curated_type].X.shape[0]
    print(f"The number of {curated_cell_label} in tissue {tis} is {n_cells}")

In [None]:
# Human-readable organ names, kept for reference only (not used by the code below).
# NOTE(review): presumably listed in the same order as the sorted organ codes — confirm.
tissue_annotation = [
    "Blood", "Bone Marrow", "Caecum", "Duodenum", "Ileum", "Jejunum EPI", "Jejunum LP", "Liver",
    "Lung-draining lymph nodes", "Lungs", "Mesenteric lymph nodes", "Omentum", "Sigmoid colon",
    "Skeletal muscle", "Spleen", "Transverse colon", "Thymus",
]
### select dataset of BLD, BMA, LIV, LNG, SPL (classical monocytes)
# Rebuild the full label list from the data instead of slicing `tissue_labels`
# in place: the in-place version indexes the already-reduced 5-element list
# (IndexError) when this cell is executed a second time.
all_tissue_labels = np.unique(immune_cell.obs['Organ']).tolist()
selected_positions = [0, 1, 7, 9, 14]
tissue_labels = [all_tissue_labels[pos] for pos in selected_positions]

# Cell counts of the curated type in each selected tissue.
for tis in tissue_labels:
    tissue_mask = (immune_cell.obs['Organ'] == tis) & (immune_cell.obs['Manually_curated_celltype'] == curated_cell_label)
    print(f"The number of {curated_cell_label} in tissue {tis} is {immune_cell[tissue_mask].X.shape[0]}")

# Two groups of tissues; each group is later handled by its own worker process.
share_labels = [tissue_labels[:2], tissue_labels[2:5]]

In [None]:
# Split the selected tissues into two groups (first two, remaining three) and display them.
share_labels = [tissue_labels[:2],tissue_labels[2:5]]
share_labels

#### Instantiate tissue-gene set objects

In [None]:
# --- cell sampling ---
sample_size = 1200
seed = 111

# --- clustering / similarity ---
method = "kmeans"
clustering_metric = "AMI"
num_k = 10

# A budget of 60 cores is divided evenly between the tissue groups in `share_labels`.
num_core = 60 // len(share_labels)

# Settings bundle handed to the tissue object's k-means step.
hyperparameter = {
    'num_core': num_core,
    'para_kmeans': {'n_clusters': num_k, 'random_state': 0, 'n_init': 10},
}

def target_fun(tissue_labels):
    """Run the GO-CC / immunesig pipeline for every tissue in ``tissue_labels``.

    For each tissue label the function:
      1. builds a ``tissue.tissue`` object from the cells of that organ whose
         curated cell type equals ``curated_cell_label``, with the GO-CC gene sets;
      2. samples cells (``sample_size``, ``seed``);
      3. runs ``call_kmeans`` on "GO-CC";
      4. adds the "immunesig" collection and runs ``call_kmeans`` on it;
      5. runs ``call_proj`` between "GO-CC" and "immunesig";
      6. releases the scRNA-seq data and stores the object with ``np.save``
         as ``{tis}_{curated_cell_label}_GO_CC_immunesig.npy``.

    All settings other than the tissue list are read from module-level
    variables defined in earlier cells. Nothing is returned.
    """
    # The cell-type mask does not depend on the tissue, so build it once.
    is_curated_type = immune_cell.obs['Manually_curated_celltype'] == curated_cell_label

    for tis in tissue_labels:
        in_tissue = immune_cell.obs['Organ'] == tis
        tissue_obj = tissue.tissue(
            name=tis,
            processed=True,
            scRNAseq=immune_cell[in_tissue & is_curated_type],
            gene_set_collection_name="GO-CC",
            gene_set_gene_symbols=GO_CC_gene_set,
            gene_set_names=GO_CC_gs_name,
        )
        print(f'Tissue {tis} has been instantiated.')

        tissue_obj.cell_sampling(size=sample_size, seed=seed)

        # Clustering-based analysis of the GO-CC gene sets.
        tissue_obj.call_kmeans(
            gene_set_collection_name="GO-CC",
            clustering_metric=clustering_metric,
            para_Kmeans=hyperparameter,
        )

        # Second collection: immunesig.
        tissue_obj.add_gene_set(
            gene_set_collection_name="immunesig",
            gene_set_gene_symbols=immune_gene_set,
            gene_set_names=immune_gs_name,
        )
        print(f'"immunesig" has been added to {tis}.')
        tissue_obj.call_kmeans(
            gene_set_collection_name="immunesig",
            clustering_metric=clustering_metric,
            para_Kmeans=hyperparameter,
        )

        # Similarity between gene sets in GO-CC and immunesig.
        tissue_obj.call_proj(
            gene_set_collection_name_1="GO-CC",
            gene_set_collection_name_2="immunesig",
            method=method,
            clustering_metric=clustering_metric,
            num_cores=num_core,
        )

        # Drop the expression data before the object is written to disk.
        tissue_obj.scRNAseq_release()

        # Persist the tissue object.
        np.save(f'{tis}_{curated_cell_label}_GO_CC_immunesig.npy', tissue_obj)

        print(f'The pipeline for {tis} finishes!')
    
if __name__ == "__main__":
    # Launch one worker process per tissue group. Iterating over `share_labels`
    # (instead of a hard-coded range(2)) keeps the number of workers consistent
    # with `num_core`, which is derived from len(share_labels).
    processes = []
    for tissue_group in share_labels:
        proc = Process(target=target_fun, args=(tissue_group,))
        proc.start()
        processes.append(proc)

    # Wait for every worker and surface failures: a crashed worker would
    # otherwise go unnoticed until its output file is missing at load time.
    for proc in processes:
        proc.join()
        if proc.exitcode != 0:
            print(f'Worker process {proc.pid} exited with code {proc.exitcode}; '
                  'its tissue objects may not have been saved.')

$~$

$~$

$~$

#### Examples

In [None]:
# The data-loading cell near the top of the notebook already changes into data/.
# Only change directory when a data/ sub-directory is visible from the current
# working directory, so running the notebook top to bottom does not try to
# enter data/data/.
if os.path.isdir('data'):
    os.chdir('data')

curated_cell_label = "Classical monocytes"
tissue_ns = ['BLD', 'BMA','LIV', 'LNG', 'SPL']
for name in tissue_ns:
    # Each saved tissue object is bound to a module-level variable named after
    # its tissue code (e.g. `BMA`), which later cells reference directly.
    # NOTE: allow_pickle=True unpickles arbitrary Python objects — only load
    # .npy files that were produced by the pipeline in this notebook.
    globals()[name] = np.load(f'{name}_{curated_cell_label}_GO_CC_immunesig.npy', allow_pickle = True).tolist()

In [None]:
# Tissue code used by the example cells below to filter cells by 'Organ'.
tis = 'BMA'

In [None]:
### Add scRNA-seq data to the tissue object
# Build the cell mask on its own lines: the previous form had a backslash
# followed by more code on the same physical line, which is a SyntaxError.
example_cell_mask = (
    (immune_cell.obs['Organ'] == tis)
    & (immune_cell.obs['Manually_curated_celltype'] == curated_cell_label)
)
BMA.add_scRNAseq(immune_cell[example_cell_mask])
### sample
# Same sampling settings as in the pipeline configuration cell.
sample_size = 1200
seed = 111
BMA.cell_sampling(size = sample_size, seed = seed)
# NOTE(review): the meaning of element 2 of `sampled_data` is defined by
# tissue.cell_sampling — confirm against utils/tissue.
sampled_data = BMA.sampled_data[2]

In [None]:
# Pull the stored GO-CC results out of the BMA tissue object.
go_cc = BMA.gene_set['GO-CC']
similarity_matrix = go_cc.sim_mat['kmeans']
gene_set_names = go_cc.gs_names
filter_index = go_cc.filter_index['kmeans']
cluster_label = go_cc.cluster_labels['kmeans']
gene_set_genes = go_cc.gene_set

# Compute the Jaccard similarity, then read it back through the tissue object
# (looked up afresh rather than through `go_cc`, mirroring the original access).
BMA.call_Jaccard(gene_set_collection_name = 'GO-CC', num_core = 60)
Jaccard_similarity_matrix = BMA.gene_set['GO-CC'].sim_mat['Jaccard']

In [None]:
# Subtract the identity so every diagonal entry is reduced by one before
# thresholding (presumably to keep self-similarity out of the top quantile).
n_gene_sets = similarity_matrix.shape[0]
similarity_minus_identity = similarity_matrix - np.eye(n_gene_sets)

# Entries at or above the 0.99999 quantile.
high_threshold = np.quantile(similarity_minus_identity, 0.99999)
where_high = np.where(similarity_minus_identity >= high_threshold)

# Entries that are zero or negative after the subtraction.
where_low = np.where(similarity_minus_identity <= 0)

In [None]:
### high
# Keep each (row, column) hit once, as [row, column] with row < column.
where = [
    [row, col]
    for row, col in zip(where_high[0], where_high[1])
    if row < col
]

In [None]:
### case 1
i = 3
# Compare the two gene sets of pair `i`; the returned values feed the plotting cell.
(UMAP_cor_E, UMAP_cor_F, S, RV_coef, mantel_coef,
 Jaccard_coef, U_E, U_F) = compare.gene_set_similarity_revealed_by_clustering_structure_UMAP_illustration(
    E_loc=where[i][0],
    F_loc=where[i][1],
    gs_names=gene_set_names,
    gs_genes=gene_set_genes,
    filter_index=filter_index,
    scRNAseq_sample=sampled_data,
    Jaccard_similarity_matrix=Jaccard_similarity_matrix,
)

In [None]:
### case 2
i = 4
# Same comparison as case 1 for a different pair; overwrites the case-1 results.
(UMAP_cor_E, UMAP_cor_F, S, RV_coef, mantel_coef,
 Jaccard_coef, U_E, U_F) = compare.gene_set_similarity_revealed_by_clustering_structure_UMAP_illustration(
    E_loc=where[i][0],
    F_loc=where[i][1],
    gs_names=gene_set_names,
    gs_genes=gene_set_genes,
    filter_index=filter_index,
    scRNAseq_sample=sampled_data,
    Jaccard_similarity_matrix=Jaccard_similarity_matrix,
)

In [None]:
# Stacked 3D UMAP view of the most recently compared high-similarity pair.
# The title uses `S`, the score returned by the comparison cell; the previous
# `H` is only assigned much further down, so a fresh top-to-bottom run raised
# NameError here.
fig = visualization.vis_3D_umap(
    UMAP_cor_E, UMAP_cor_F,
    roof_label=gene_set_names[filter_index[where[i][0]]],
    floor_label=gene_set_names[filter_index[where[i][1]]],
    font_size=30,
    title=f'Gene sets with a high similarity, S = {S:.2f}',
    title_size=32,
    line_alpha=0.1,
    colorbar="tab10",
    floor=0,
    roof=5,
    roof_theta=0,
    z_tick_pad=320,
    view_para={'elev': 30., 'azim': -60},
    figsize=(8, 16),
)

In [None]:
# Write the high-similarity figure to disk.
fig.savefig(
    'S_high.png',
    bbox_inches='tight',
    dpi=300,
)

$~$

In [None]:
### low
# Keep each (row, column) hit once, as [row, column] with row < column.
where = [
    [row, col]
    for row, col in zip(where_low[0], where_low[1])
    if row < col
]

In [None]:
### case 3: with low similarity score (non-zero)
i = 0
# Compare the first pair taken from the low-similarity list.
(UMAP_cor_E, UMAP_cor_F, S, RV_coef, mantel_coef,
 Jaccard_coef, U_E, U_F) = compare.gene_set_similarity_revealed_by_clustering_structure_UMAP_illustration(
    E_loc=where[i][0],
    F_loc=where[i][1],
    gs_names=gene_set_names,
    gs_genes=gene_set_genes,
    filter_index=filter_index,
    scRNAseq_sample=sampled_data,
    Jaccard_similarity_matrix=Jaccard_similarity_matrix,
)

In [None]:
# Stacked 3D UMAP view of the low-similarity pair; the title shows |S|.
fig = visualization.vis_3D_umap(
    UMAP_cor_E, UMAP_cor_F,
    roof_label=gene_set_names[filter_index[where[i][0]]],
    floor_label=gene_set_names[filter_index[where[i][1]]],
    font_size=30,
    title=f'Gene sets with a low similarity, S = {np.abs(S):.2f}.',
    title_size=32,
    line_alpha=0.1,
    colorbar="tab10",
    floor=1,
    roof=5,
    roof_theta=15,
    z_tick_pad=250,
    view_para={'elev': 30., 'azim': -60},
    figsize=(8, 16),
)

In [None]:
# Write the low-similarity figure to disk.
fig.savefig(
    'S_min.png',
    bbox_inches='tight',
    dpi=300,
)

$~$

In [None]:
# Locate entries whose identity-subtracted similarity lies in [0.3, 0.31].
# The result is np.where's (row_indices, column_indices) tuple.
intermediate_base = similarity_matrix - np.eye(similarity_matrix.shape[0])
in_intermediate_band = (intermediate_base >= 0.3) & (intermediate_base <= 0.31)
where_intermediate = np.where(in_intermediate_band)

In [None]:
# Index of the intermediate-similarity example used by the following cell.
i = 1

In [None]:
# `where_intermediate` is np.where's (row_indices, column_indices) tuple, so
# `where_intermediate[i][0]` / `[i][1]` would pick the first two *column* indices
# rather than one gene-set pair. Build explicit [row, column] pairs (row < column),
# exactly as done for the high/low examples, and select pair `i` from them.
intermediate_pairs = [
    [row, col]
    for row, col in zip(where_intermediate[0], where_intermediate[1])
    if row < col
]
(UMAP_cor_E, UMAP_cor_F, H, RV_coef, mantel_coef,
 Jaccard_coef, U_E, U_F) = compare.gene_set_similarity_revealed_by_clustering_structure_UMAP_illustration(
    E_loc=intermediate_pairs[i][0],
    F_loc=intermediate_pairs[i][1],
    gs_names=gene_set_names,
    gs_genes=gene_set_genes,
    filter_index=filter_index,
    scRNAseq_sample=sampled_data,
    Jaccard_similarity_matrix=Jaccard_similarity_matrix,
)

In [None]:
# Label-free, title-free rendering of the intermediate example.
fig = visualization.vis_3D_umap(
    UMAP_cor_E, UMAP_cor_F,
    floor_label='',
    roof_label='',
    font_size=30,
    title='',
    title_size=32,
    line_alpha=0.1,
    colorbar="tab10",
    floor=1,
    roof=5,
    roof_theta=180,
    z_tick_pad=250,
    view_para={'elev': 30., 'azim': -45},
    figsize=(8, 16),
)

In [None]:
# Write the graphical-abstract figure to disk.
fig.savefig(
    'graphical_abstract.png',
    bbox_inches='tight',
    dpi=300,
)

$~$