In [None]:
import scanpy as sc
from scGene.scGeneFit.functions import *
%matplotlib inline
import numpy as np
import glob
from sklearn.metrics import adjusted_rand_score
import statistics as stats
from tqdm.notebook import tqdm
import pandas as pd
import os
np.random.seed(0)

In [None]:
# Root folder; the combined sketch AnnData file is read from here.
path = '/storage/adult_brain_genes/sketches_gene_choice_scripts'
# Glob pattern for the scGIST* run folders under the root; each is searched for a markers.csv.
scgist_base_dirs = path + '/scGIST*'

In [None]:
# One markers.csv per scGIST run folder. glob does not guarantee any particular
# order, so sort the matches to process (and display) the runs in the same
# order on every execution.
fls = sorted(glob.glob(os.path.join(scgist_base_dirs, 'markers.csv')))
# Full dataset; each run's marker genes are subset from this object later on.
current_ad = sc.read(os.path.join(path, 'combined_sketches.h5ad'))

In [None]:

def _knn_majority_labels(subset_adata, label_key, graph_key='knn5_real'):
    """Predict each cell's label as the most common label among its graph neighbours.

    Parameters
    ----------
    subset_adata
        Object whose ``obsp[f'{graph_key}_distances']`` matrix already exists
        and whose ``obs`` has a ``label_key`` column.
    label_key
        Name of the ``obs`` column holding the reference labels.
    graph_key
        The ``key_added`` value that was used when the neighbour graph was built.

    Returns
    -------
    list
        One predicted label per cell, in ``obs`` row order. ``''`` is used for
        cells where ``statistics.mode`` cannot produce a result (e.g. a cell
        with no stored neighbours).
    """
    distances = subset_adata.obsp[f'{graph_key}_distances']
    # A plain list makes the neighbour lookups below strictly positional.
    # Indexing the Series itself with an integer relies on pandas' positional
    # fallback for non-integer indexes, which is deprecated.
    labels = subset_adata.obs[label_key].tolist()

    predicted = []
    for cell in range(len(labels)):
        # Column positions of the non-zero entries in this cell's row,
        # i.e. the cells it is connected to in the distance graph.
        neighbor_index = distances[cell].nonzero()[1]
        try:
            predicted.append(stats.mode([labels[n] for n in neighbor_index]))
        except stats.StatisticsError:
            # Only "no mode available" is tolerated; any other error is a real
            # problem and must surface instead of silently zeroing the scores.
            predicted.append('')
    return predicted


def calc_metrics(subset_adata, pather, num_genes):
    """Score how well the neighbour graph preserves each annotation level and plot it.

    For each of the ``obs`` columns ``'celltype'``, ``'Clusters'`` and
    ``'Subclusters'``:

    1. every cell is assigned the majority label of its neighbours in the
       ``'knn5_real'`` graph (stored in ``obs['new_cluster']``),
    2. ARI and AMI between the true and the majority-vote labels are computed
       and printed,
    3. a UMAP coloured by that column, with both scores in the title, is saved
       to ``pather`` as ``Celltype.png``, ``Cluster.png`` or ``Subcluster.png``.

    Parameters
    ----------
    subset_adata
        AnnData to evaluate. It is modified in place: the ``'knn5_real'``
        neighbour graph is added and ``obs['new_cluster']`` is overwritten
        (after the call it holds the ``'Subclusters'`` predictions). A UMAP
        embedding must already be present for the plots.
    pather
        Directory the PNG files are written to.
    num_genes
        Number of genes, used only in the plot titles.

    Returns
    -------
    dict
        ``{obs_column: {'ari': float, 'ami': float}}`` for the three levels.
    """
    # Function-scope import: the notebook's import cell does not provide this name.
    from sklearn.metrics.cluster import adjusted_mutual_info_score

    # (obs column, name used in the plot title and in the output file name)
    label_levels = (
        ('celltype', 'Celltype'),
        ('Clusters', 'Cluster'),
        ('Subclusters', 'Subcluster'),
    )

    # The graph does not depend on the annotation level, so build it once.
    # NOTE(review): n_neighbors=6 presumably yields 5 neighbours besides the
    # cell itself (hence the 'knn5' key) - confirm against the scanpy docs.
    sc.pp.neighbors(subset_adata, n_neighbors=6, knn=True, key_added='knn5_real')
    sc.set_figure_params(fontsize=10, dpi=300, dpi_save=400)

    results = {}
    for label_key, display_name in label_levels:
        subset_adata.obs['new_cluster'] = _knn_majority_labels(subset_adata, label_key)

        # Get the predicted and actual observation columns
        predicted = subset_adata.obs['new_cluster']
        obs = subset_adata.obs[label_key]

        # Compute the ARI between the columns
        ari = adjusted_rand_score(obs, predicted)
        print(f"The ARI between the 'obs' and 'predicted' columns is {ari:.3f}")

        # calculate AMI
        ami_score = adjusted_mutual_info_score(obs, predicted)
        print(f"AMI score: {ami_score:.4f}")

        # Colour by the level being scored, so the saved figure always matches
        # its title and file name (this includes the Subcluster plot).
        fig = sc.pl.umap(
            subset_adata,
            color=label_key,
            return_fig=True,
            show=False,
            title=f'{num_genes} genes, {display_name} ARI = {ari:.3f}, {display_name} AMI = {ami_score:.3f}',
            legend_loc=None,
        )
        fig.set_size_inches(8, 6)
        fig.savefig(os.path.join(pather, f'{display_name}.png'))

        results[label_key] = {'ari': ari, 'ami': ami_score}

    return results

In [None]:

# Evaluate every scGIST run: embed the cells using only that run's marker
# genes, show the UMAP inline, then score and save the plots next to markers.csv.
for fl in fls:
    # Marker genes are stored in the column named '0' of markers.csv.
    scGIST_markers = pd.read_csv(fl, index_col=0)['0'].tolist()
    # Slicing an AnnData returns a view of current_ad; take a real copy so the
    # in-place scanpy steps below write to an independent object.
    subset_adata = current_ad[:, current_ad.var.index.isin(scGIST_markers)].copy()
    sc.tl.pca(subset_adata)
    sc.pp.neighbors(subset_adata)
    sc.tl.umap(subset_adata)
    # Kept inside the loop on purpose: calc_metrics changes the figure
    # parameters, so they are reset before each inline plot.
    sc.set_figure_params(dpi=300)
    sc.pl.umap(subset_adata, color='celltype')

    # Report the number of marker genes actually present in the data rather
    # than a hard-coded panel size, so the plot titles stay truthful.
    calc_metrics(subset_adata, os.path.dirname(fl), subset_adata.n_vars)