In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import os
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri

In [2]:
# Empty score table. Columns are named '<metric>_<clustering algorithm>',
# grouped by metric: ARI_*, then AMI_*, then Homogeneity_*.
df_metrics = pd.DataFrame(
    columns=[
        metric + '_' + algorithm
        for metric in ('ARI', 'AMI', 'Homogeneity')
        for algorithm in ('Louvain', 'kmeans', 'HC')
    ]
)

In [3]:
# All inputs and outputs of this notebook live under `workdir`.
workdir = './output/'
path_fm = os.path.join(workdir,'feature_matrices/')   # feature matrices are read from here
path_clusters = os.path.join(workdir,'clusters/')     # per-method cluster assignments
path_metrics = os.path.join(workdir,'metrics/')       # clustering score table

# Create the output directories directly from Python instead of shelling out to
# `mkdir -p`: this is portable, needs no shell quoting of the path, and raises
# on failure instead of returning an exit status that is easy to ignore.
os.makedirs(path_clusters, exist_ok=True)
os.makedirs(path_metrics, exist_ok=True)

0

In [4]:
# Tab-separated annotation table; its first column becomes the index.
metadata = pd.read_csv('./input/metadata.tsv', sep='\t', index_col=0)
# Number of distinct values in the 'label' column; used as the target cluster count.
num_clusters = np.unique(metadata['label']).size

In [5]:
# Keep only the entries of path_fm whose names begin with 'FM'.
files = [entry for entry in os.listdir(path_fm) if entry[:2] == 'FM']
len(files)

17

In [6]:
# Show the selected file names (order is whatever os.listdir returned).
files

['FM_Control_Eryclean.rds',
 'FM_BROCKMAN_Eryclean.rds',
 'FM_Cusanovich2018_Eryclean.rds',
 'FM_cisTopic_Eryclean.rds',
 'FM_chromVAR_Eryclean_kmers.rds',
 'FM_chromVAR_Eryclean_motifs.rds',
 'FM_chromVAR_Eryclean_kmers_pca.rds',
 'FM_chromVAR_Eryclean_motifs_pca.rds',
 'FM_GeneScoring_Eryclean.rds',
 'FM_GeneScoring_Eryclean_pca.rds',
 'FM_Cicero_Eryclean.rds',
 'FM_Cicero_Eryclean_pca.rds',
 'FM_SnapATAC_Eryclean.rds',
 'FM_Scasat_Eryclean.rds',
 'FM_scABC_Eryclean.rds',
 'FM_SCRAT_Eryclean.rds',
 'FM_SCRAT_Eryclean_pca.rds']

In [7]:
def getNClusters(adata,n_cluster,range_min=0,range_max=3,max_steps=20):
    """Bisect the Louvain resolution until `n_cluster` clusters are obtained.

    At every step `sc.tl.louvain` is run on `adata` at the midpoint of the
    current resolution interval and the number of distinct values in
    `adata.obs['louvain']` is compared with `n_cluster`; the interval is then
    halved towards the side expected to move the count closer to the target.

    Parameters
    ----------
    adata
        Object passed to `sc.tl.louvain`; its `obs['louvain']` entry is read
        after every run and reflects the most recent run on return.
    n_cluster
        Desired number of clusters.
    range_min, range_max
        Bounds of the resolution interval that is searched.
    max_steps
        Maximum number of Louvain runs; must be at least 1.

    Returns
    -------
    tuple
        `(resolution, adata)` for the last resolution tried. If the target was
        not reached within `max_steps` runs, a message is printed and the
        clustering from the final run is returned (previously this path
        returned None, unlike the success path).

    Raises
    ------
    ValueError
        If `max_steps` is smaller than 1, since no clustering would be run.
    """
    if max_steps < 1:
        raise ValueError('max_steps must be at least 1')

    this_step = 0
    this_min = float(range_min)
    this_max = float(range_max)
    while this_step < max_steps:
        print('step ' + str(this_step))
        # Midpoint of the interval still under consideration.
        this_resolution = this_min + ((this_max-this_min)/2)
        sc.tl.louvain(adata,resolution=this_resolution)
        this_clusters = adata.obs['louvain'].nunique()

        print('got ' + str(this_clusters) + ' at resolution ' + str(this_resolution))

        if this_clusters > n_cluster:
            # Too many clusters: continue in the lower half.
            this_max = this_resolution
        elif this_clusters < n_cluster:
            # Too few clusters: continue in the upper half.
            this_min = this_resolution
        else:
            return this_resolution, adata
        this_step += 1

    # The cluster count need not change monotonically with the resolution, so
    # the bisection can run out of steps; fall back to the last solution.
    print('Cannot find the number of clusters')
    print('Clustering solution from last iteration is used:' + str(this_clusters) + ' at resolution ' + str(this_resolution))
    return this_resolution, adata

In [8]:
# The rpy2 <-> pandas conversion and the handle on R's readRDS do not depend on
# the file being processed, so set them up once instead of on every iteration.
pandas2ri.activate()
readRDS = robjects.r['readRDS']

# (adata.obs column holding the labels, suffix used in the df_metrics column names)
clusterings = [('louvain', 'Louvain'), ('kmeans', 'kmeans'), ('hc', 'HC')]
# (prefix used in the df_metrics column names, scoring function, extra keyword arguments)
scorers = [
    ('ARI', adjusted_rand_score, {}),                                        # adjusted Rand index
    ('AMI', adjusted_mutual_info_score, {'average_method': 'arithmetic'}),   # adjusted mutual information
    ('Homogeneity', homogeneity_score, {}),                                  # homogeneity
]

for file in files:
    # File names look like FM_<method>_<dataset>[_<variant>...].rds; any variant
    # parts are appended to the method name (e.g. 'chromVAR_kmers_pca').
    file_split = file.split('_')
    method = file_split[1]
    dataset = file_split[2].split('.')[0]  # not used in this cell; kept in case other cells read it
    if len(file_split) > 3:
        method = method + '_' + '_'.join(file_split[3:]).split('.')[0]
    print(method)

    # Load the R object and convert it to a pandas DataFrame.
    df_rds = readRDS(os.path.join(path_fm,file))
    fm_mat = pandas2ri.ri2py(robjects.r['data.frame'](robjects.r['as.matrix'](df_rds)))
    # NOTE(review): assumes the matrix columns are in the same order as the rows
    # of `metadata`; nothing here checks that - confirm against the producer.
    fm_mat.columns = metadata.index

    # Transpose so that the metadata index becomes the observation index.
    adata = sc.AnnData(fm_mat.T)
    adata.var_names_make_unique()
    adata.obs = metadata.loc[adata.obs.index,]
    # Create the row for this method; the scores are filled in below.
    df_metrics.loc[method,] = ""

    # Louvain: search for a resolution that yields num_clusters clusters.
    sc.pp.neighbors(adata, n_neighbors=15,use_rep='X')
    getNClusters(adata,n_cluster=num_clusters)
    # k-means
    kmeans = KMeans(n_clusters=num_clusters, random_state=2019).fit(adata.X)
    adata.obs['kmeans'] = pd.Series(kmeans.labels_,index=adata.obs.index).astype('category')
    # hierarchical clustering
    hc = AgglomerativeClustering(n_clusters=num_clusters).fit(adata.X)
    adata.obs['hc'] = pd.Series(hc.labels_,index=adata.obs.index).astype('category')

    # Clustering metrics: score every clustering against the 'label' column.
    for prefix, score, extra_kwargs in scorers:
        metric_columns = [prefix + '_' + suffix for _, suffix in clusterings]
        df_metrics.loc[method, metric_columns] = [
            score(adata.obs['label'], adata.obs[obs_key], **extra_kwargs)
            for obs_key, _ in clusterings
        ]

    # Save the three cluster assignments for this method.
    adata.obs[['louvain','kmeans','hc']].to_csv(os.path.join(path_clusters ,method + '_clusters.tsv'),sep='\t')

Control


  res = PandasDataFrame.from_items(items)


step 0
got 10 at resolution 1.5
step 1
got 13 at resolution 2.25
step 2
got 11 at resolution 1.875
step 3
got 12 at resolution 2.0625
BROCKMAN


  res = PandasDataFrame.from_items(items)


step 0
got 11 at resolution 1.5
step 1
got 13 at resolution 2.25
step 2
got 12 at resolution 1.875
Cusanovich2018


  res = PandasDataFrame.from_items(items)


step 0
got 10 at resolution 1.5
step 1
got 13 at resolution 2.25
step 2
got 11 at resolution 1.875
step 3
got 12 at resolution 2.0625
cisTopic


  res = PandasDataFrame.from_items(items)


step 0
got 10 at resolution 1.5
step 1
got 10 at resolution 2.25
step 2
got 12 at resolution 2.625
chromVAR_kmers


  res = PandasDataFrame.from_items(items)


step 0
got 7 at resolution 1.5
step 1
got 9 at resolution 2.25
step 2
got 10 at resolution 2.625
step 3
got 11 at resolution 2.8125
step 4
got 11 at resolution 2.90625
step 5
got 11 at resolution 2.953125
step 6
got 12 at resolution 2.9765625
chromVAR_motifs


  res = PandasDataFrame.from_items(items)


step 0
got 7 at resolution 1.5
step 1
got 11 at resolution 2.25
step 2
got 13 at resolution 2.625
step 3
got 11 at resolution 2.4375
step 4
got 12 at resolution 2.53125
chromVAR_kmers_pca


  res = PandasDataFrame.from_items(items)


step 0
got 8 at resolution 1.5
step 1
got 9 at resolution 2.25
step 2
got 9 at resolution 2.625
step 3
got 9 at resolution 2.8125
step 4
got 10 at resolution 2.90625
step 5
got 9 at resolution 2.953125
step 6
got 9 at resolution 2.9765625
step 7
got 9 at resolution 2.98828125
step 8
got 9 at resolution 2.994140625
step 9
got 9 at resolution 2.9970703125
step 10
got 9 at resolution 2.99853515625
step 11
got 9 at resolution 2.999267578125
step 12
got 9 at resolution 2.9996337890625
step 13
got 9 at resolution 2.99981689453125
step 14
got 9 at resolution 2.999908447265625
step 15
got 9 at resolution 2.9999542236328125
step 16
got 9 at resolution 2.9999771118164062
step 17
got 9 at resolution 2.999988555908203
step 18
got 9 at resolution 2.9999942779541016
step 19
got 9 at resolution 2.999997138977051
Cannot find the number of clusters
Clustering solution from last iteration is used:9 at resolution 2.999997138977051
chromVAR_motifs_pca


  res = PandasDataFrame.from_items(items)


step 0
got 7 at resolution 1.5
step 1
got 11 at resolution 2.25
step 2
got 15 at resolution 2.625
step 3
got 13 at resolution 2.4375
step 4
got 13 at resolution 2.34375
step 5
got 12 at resolution 2.296875
GeneScoring


  res = PandasDataFrame.from_items(items)


step 0
got 33 at resolution 1.5
step 1
got 2 at resolution 0.75
step 2
got 14 at resolution 1.125
step 3
got 5 at resolution 0.9375
step 4
got 11 at resolution 1.03125
step 5
got 9 at resolution 1.078125
step 6
got 13 at resolution 1.1015625
step 7
got 10 at resolution 1.08984375
step 8
got 14 at resolution 1.095703125
step 9
got 13 at resolution 1.0927734375
step 10
got 12 at resolution 1.09130859375
GeneScoring_pca


  res = PandasDataFrame.from_items(items)


step 0
got 10 at resolution 1.5
step 1
got 15 at resolution 2.25
step 2
got 11 at resolution 1.875
step 3
got 13 at resolution 2.0625
step 4
got 12 at resolution 1.96875
Cicero


  res = PandasDataFrame.from_items(items)


step 0
got 35 at resolution 1.5
step 1
got 1 at resolution 0.75
step 2
got 18 at resolution 1.125
step 3
got 6 at resolution 0.9375
step 4
got 12 at resolution 1.03125
Cicero_pca


  res = PandasDataFrame.from_items(items)


step 0
got 11 at resolution 1.5
step 1
got 18 at resolution 2.25
step 2
got 16 at resolution 1.875
step 3
got 15 at resolution 1.6875
step 4
got 12 at resolution 1.59375
SnapATAC


  res = PandasDataFrame.from_items(items)


step 0
got 12 at resolution 1.5
Scasat


  res = PandasDataFrame.from_items(items)


step 0
got 10 at resolution 1.5
step 1
got 12 at resolution 2.25
scABC


  res = PandasDataFrame.from_items(items)


step 0
got 6 at resolution 1.5
step 1
got 10 at resolution 2.25
step 2
got 16 at resolution 2.625
step 3
got 12 at resolution 2.4375
SCRAT


  res = PandasDataFrame.from_items(items)


step 0
got 9 at resolution 1.5
step 1
got 11 at resolution 2.25
step 2
got 13 at resolution 2.625
step 3
got 12 at resolution 2.4375
SCRAT_pca


  res = PandasDataFrame.from_items(items)


step 0
got 10 at resolution 1.5
step 1
got 12 at resolution 2.25


In [9]:
# Write the score table as CSV into the metrics directory.
scores_csv = path_metrics + 'clustering_scores.csv'
df_metrics.to_csv(scores_csv)

In [10]:
# Display the score table (one row per method, one column per metric/algorithm pair).
df_metrics

Unnamed: 0,ARI_Louvain,ARI_kmeans,ARI_HC,AMI_Louvain,AMI_kmeans,AMI_HC,Homogeneity_Louvain,Homogeneity_kmeans,Homogeneity_HC
Control,0.750941,0.699898,0.696635,0.854198,0.820996,0.823917,0.855354,0.824273,0.822788
BROCKMAN,0.853937,0.710585,0.715,0.912684,0.841918,0.856994,0.911527,0.837232,0.847617
Cusanovich2018,0.833878,0.705488,0.728346,0.91051,0.857965,0.882825,0.912229,0.836235,0.838571
cisTopic,0.859505,0.870247,0.862204,0.925826,0.924597,0.91813,0.924268,0.926108,0.91906
chromVAR_kmers,0.631362,0.683125,0.633939,0.779703,0.811993,0.771349,0.774795,0.809344,0.77325
chromVAR_motifs,0.41833,0.37396,0.349499,0.61227,0.586485,0.558589,0.61134,0.591831,0.559489
chromVAR_kmers_pca,0.694709,0.702174,0.670427,0.844425,0.818812,0.794689,0.790521,0.822046,0.789706
chromVAR_motifs_pca,0.396611,0.351722,0.366967,0.587496,0.583859,0.564477,0.590708,0.588734,0.562235
GeneScoring,0.0127545,0.281442,0.280855,0.0212587,0.501921,0.468168,0.0415435,0.432987,0.44614
GeneScoring_pca,0.295875,0.316226,0.278184,0.504467,0.560827,0.516213,0.508565,0.551597,0.50641
