In [None]:
import scanpy as sc
from scGene.scGeneFit.functions import *
%matplotlib inline
import numpy as np
import glob
from sklearn.metrics import adjusted_rand_score
import statistics as stats
from tqdm.notebook import tqdm
import os
np.random.seed(0)

In [None]:
# Base directory for this notebook: combined_sketches.h5ad is read from here and
# the per-run scGeneFit_* output folders are created underneath it.
path = '/storage/adult_brain_genes/sketches_gene_choice_scripts'

In [None]:
# Load the combined sketches dataset (combined_sketches.h5ad) from the base directory.
current_ad = sc.read(os.path.join(path, 'combined_sketches.h5ad'))

In [None]:
from sklearn.neighbors import NearestCentroid
# Nearest-centroid classifier with default parameters, intended as the `clf`
# argument of performance() below. Neither is called in the cells visible here.
clf=NearestCentroid()

def performance(X_train, y_train, X_test, y_test, clf):
    """Train `clf` on the training split and report its score on the test split.

    Args:
        X_train: Training features, passed straight to ``clf.fit``.
        y_train: Training labels, passed straight to ``clf.fit``.
        X_test: Held-out features, passed straight to ``clf.score``.
        y_test: Held-out labels, passed straight to ``clf.score``.
        clf: Any object exposing ``fit(X, y)`` and ``score(X, y)``. It is
            fitted in place.

    Returns:
        Whatever ``clf.score(X_test, y_test)`` returns.
    """
    clf.fit(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    return test_score

In [None]:
# Flag the top 15000 highly variable genes; the next line reads the resulting
# `highly_variable` column of .var to restrict the dataset to those genes.
# NOTE(review): scanpy's default HVG flavor expects log-normalised values —
# confirm combined_sketches.h5ad is stored that way.
sc.pp.highly_variable_genes(current_ad, n_top_genes=15000)
# Slice of the loaded dataset restricted to the flagged genes (all cells kept).
current_ad_sub=current_ad[:, current_ad.var.highly_variable]
# Dense numpy copy of the expression matrix, later passed to get_markers.
# Assumes .X is a sparse matrix (it must provide .todense()) — TODO confirm.
data = np.array(current_ad_sub.X.todense())

In [None]:

def _knn_majority_labels(distances, labels):
    """Predict one label per row as the most common label among its neighbours.

    Args:
        distances: Row-indexable sparse matrix with one row per cell; the
            columns holding nonzero entries in row ``i`` are treated as the
            neighbours of cell ``i``.
        labels: Positionally indexable array (e.g. ``Series.to_numpy()``) with
            one label per cell, in the same order as the rows of ``distances``.

    Returns:
        A list with one predicted label per row. Rows for which
        ``statistics.mode`` cannot produce a value (for example no neighbours)
        get the empty string, matching the original fallback.
    """
    predicted = []
    for row in range(distances.shape[0]):
        neighbor_index = distances[row].nonzero()[1]
        try:
            predicted.append(stats.mode(labels[neighbor_index].tolist()))
        except stats.StatisticsError:
            # Only the "no usable mode" case is tolerated; anything else is a
            # real error and should surface instead of silently becoming ''.
            predicted.append('')
    return predicted


def calc_metrics(subset_adata, pather, num_genes):
    """Score how well the kNN graph preserves each annotation level and save UMAPs.

    For each of the obs columns ``celltype``, ``Clusters`` and ``Subclusters``:
    every cell is re-labelled with the majority label of its neighbours in a
    kNN graph (``n_neighbors=6``, stored under the key ``knn5_real``), the
    ARI and AMI between the original and re-labelled columns are printed, and
    a UMAP coloured by that column is written to ``pather`` as
    ``Celltype.png``, ``Cluster.png`` or ``Subcluster.png``.

    Args:
        subset_adata: Annotated dataset restricted to the selected marker
            genes. It must carry the three obs columns above; the caller in
            this notebook runs ``sc.tl.umap`` on it before calling this.
        pather: Existing directory the three PNG files are written to.
        num_genes: Number of marker genes, used only in the figure titles.

    Side effects:
        Adds the ``knn5_real`` neighbour graph and an obs column
        ``new_cluster`` to ``subset_adata`` (after the call it holds the
        Subclusters prediction), and changes scanpy's global figure params.
    """
    # Kept as a function-scope import, as in the original cell: AMI is not
    # part of the notebook's top-level imports.
    from sklearn.metrics.cluster import adjusted_mutual_info_score

    # The neighbour graph does not depend on which annotation is being scored,
    # so it is built once instead of once per annotation level.
    sc.pp.neighbors(subset_adata, n_neighbors=6, knn=True, key_added='knn5_real')
    distances = subset_adata.obsp['knn5_real_distances']

    sc.set_figure_params(fontsize=10, dpi=300, dpi_save=400)

    # (obs column, label used in the title, output file name)
    levels = (
        ('celltype', 'Celltype', 'Celltype.png'),
        ('Clusters', 'Cluster', 'Cluster.png'),
        ('Subclusters', 'Subcluster', 'Subcluster.png'),
    )

    for column, label, filename in levels:
        # to_numpy() gives positional access; indexing the Series with an
        # integer would rely on deprecated positional fallback for a
        # label-based index.
        subset_adata.obs['new_cluster'] = _knn_majority_labels(
            distances, subset_adata.obs[column].to_numpy()
        )

        predicted = subset_adata.obs['new_cluster']
        obs = subset_adata.obs[column]

        ari = adjusted_rand_score(obs, predicted)
        print(f"The ARI between the 'obs' and 'predicted' columns is {ari:.3f}")

        ami_score = adjusted_mutual_info_score(obs, predicted)
        print(f"AMI score: {ami_score:.4f}")

        # Colour by the column being scored. The original Subcluster figure
        # was coloured by 'Clusters', contradicting its own title.
        fig = sc.pl.umap(
            subset_adata,
            color=column,
            return_fig=True,
            show=False,
            title=f'{num_genes} genes, {label} ARI = {ari:.3f}, {label} AMI = {ami_score:.3f}',
            legend_loc=None,
        )
        fig.set_size_inches(8, 6)
        fig.savefig(os.path.join(pather, filename))

In [None]:
from sklearn.metrics import adjusted_rand_score
import scanpy as sc
import statistics as stats
import pandas as pd

# Redundancy passed to get_markers for each annotation level. The original
# elif branches used `==` (a comparison) instead of `=`, so only the
# 'celltype' value was ever assigned and 'Clusters' silently reused 0.35.
REDUNDANCY_BY_RESOLUTION = {
    'celltype': 0.35,
    'Clusters': 0.1,
    'Subclusters': 0.05,
}

for num_genes in [500, 1000]:
    for resolution in ['celltype', 'Clusters']:
        # One output folder per (marker count, annotation level) combination.
        pather = os.path.join(path, f'scGeneFit_{num_genes}_genes_{resolution}')
        os.makedirs(pather, exist_ok=True)

        # `names`, `N` and `d` are not used in this cell; they are retained in
        # case later cells read these notebook globals.
        names = current_ad_sub.obs[resolution].tolist()
        labels = current_ad_sub.obs[resolution].tolist()
        N, d = data.shape

        num_markers = num_genes
        method = 'centers'
        # A KeyError here means a new resolution was added to the loop without
        # a matching redundancy entry.
        redundancy = REDUNDANCY_BY_RESOLUTION[resolution]

        markers = get_markers(data, np.array(labels), num_markers, method=method, redundancy=redundancy)

        # Work on an independent copy: pca/neighbors/umap and calc_metrics all
        # write into the object, which should not happen through a view of
        # current_ad_sub.
        subset_adata = current_ad_sub[:, markers].copy()
        pd.DataFrame(subset_adata.var.index.tolist()).to_csv(os.path.join(pather, 'markers.csv'))
        sc.tl.pca(subset_adata)
        sc.pp.neighbors(subset_adata)
        sc.tl.umap(subset_adata)
        sc.set_figure_params(dpi=300)
        sc.pl.umap(subset_adata, color=resolution)

        calc_metrics(subset_adata, pather, num_genes)