In [22]:
import sys
sys.path.append("..") # this adds to path parent directory in order to import utils file
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import random
from tqdm import tqdm
import numpy as np
from sklearn import metrics
from IPython.display import clear_output, Image, display
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.mixture import GaussianMixture
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.decomposition import PCA
import hyperopt
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample
from sklearn.neighbors import kneighbors_graph
import igraph as ig
import louvain
import time
import pickle
import umap
## Import custom utils
import gmm_utils
import hyperopt_utils

In [23]:
# Load the autoreload extension and explicitly re-import the two local helper
# modules so that edits to gmm_utils.py / hyperopt_utils.py are picked up
# without restarting the kernel.
# NOTE(review): no `%autoreload 2` is issued in the visible cells, so the
# explicit importlib.reload calls below are presumably what does the actual
# reloading — confirm, and consider moving `import importlib` to the import cell.
%load_ext autoreload
import importlib
importlib.reload(gmm_utils)
importlib.reload(hyperopt_utils)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
elbowAnalysis
silhouetteAnalyis


<module 'hyperopt_utils' from '../hyperopt_utils.py'>

## Hyperopt utils

In [24]:
# Baseline: one gmm_utils.runLouvain call on brainCIDR with hand-picked
# settings, before any hyperparameter search is run.
dataset = 'brainCIDR'
df, truth = gmm_utils.loadData(dataset)
print(df.shape, truth.shape, truth.clusters.unique())

# Same keys, values and insertion order as building the dict key by key.
params = {
    'dataset': 'brainCIDR',
    'minCellsPerGene': 0,
    'minGeneDispersion': 0,
    'log': True,                     # True or False
    'scaler': 'standardScaleCells',
    'pca_comp': 10,                  # author's note: range (3, 300)
    'doUmap': True,
    'umap_comp': 3,
    'nb_neighbors': 10,              # author's note: 3 - 15
}
gmm_utils.runLouvain(params);

(420, 22085) (420, 2) [3 4 1 6 2 7 5 0]


({'dataset': 'brainCIDR',
  'doUmap': True,
  'log': True,
  'minCellsPerGene': 0,
  'minGeneDispersion': 0,
  'nb_neighbors': 10,
  'pca_comp': 10,
  'randIndex': 0.3896711401407734,
  'scaler': 'standardScaleCells',
  'umap_comp': 3},
 array([ 3,  1, 10, 10,  3,  1,  3, 10, 10,  3, 10,  3,  8,  3,  1,  3,  1,
         1,  4,  1,  4,  4,  3,  3,  4,  3,  3,  3,  3,  4, 10,  3,  3,  3,
         1, 10,  3,  3, 10, 10,  8, 10,  4,  1,  3,  3,  3,  4,  1,  1,  4,
         3,  3,  4,  1,  4,  4,  3,  1,  3,  4,  1, 10,  1, 10, 10,  1,  4,
         1,  4,  3,  4, 10,  1,  4,  4, 10,  1,  4, 10,  3,  4,  4,  1,  3,
         4,  1,  4,  1, 10,  2,  2,  0, 13, 13, 11, 16, 11, 11,  0,  0, 16,
         6, 13, 16,  0,  7,  0,  6, 13,  7, 16,  0, 15,  6,  0,  0, 13, 17,
        17, 13, 12, 12, 13, 17,  7, 17,  5,  0, 12,  5,  7, 17,  7, 13, 17,
         8,  8,  1,  8,  8,  8,  8,  8,  8,  1,  8,  8,  8,  8,  1,  8,  8,
         8,  1,  8,  1,  8,  1,  1,  1,  8,  4,  1,  4,  1,  4,  1, 10,  1,
   

In [25]:

def runHyperopt(filename, space, max_evals = 2, restart = False, evals_per_epoch = 10):
    """Run a TPE search over ``space`` in batches, checkpointing trials to disk.

    Each sampled point is passed to ``gmm_utils.runLouvain``; the loss that is
    minimised is the negated ``randIndex`` entry of the returned result dict.
    After every batch the trials object is pickled to ``filename`` so that an
    interrupted search can be resumed from the last checkpoint.

    Parameters
    ----------
    filename : str
        Path of the pickle file used to load and checkpoint the trials.
    space : dict
        hyperopt search space; sampled points are forwarded unchanged to
        ``gmm_utils.runLouvain``.
    max_evals : int
        Upper bound on the total number of trials. The final batch is
        truncated so that this bound is never exceeded.
    restart : bool
        Forwarded to ``hyperopt_utils.getTrials``.
    evals_per_epoch : int
        Number of evaluations between two checkpoints (must be >= 1).

    Returns
    -------
    The trials object obtained from ``hyperopt_utils.getTrials``, updated in
    place by ``fmin``.

    Raises
    ------
    ValueError
        If ``evals_per_epoch`` is smaller than 1.
    """
    if evals_per_epoch < 1:
        raise ValueError('evals_per_epoch must be >= 1')

    def evaluateLouvain(args):
        """Objective function: negated rand index of one Louvain run."""
        try:
            resultDict, _ = gmm_utils.runLouvain(args)
        except Exception as err:
            # A failing configuration must not abort the whole search, but
            # KeyboardInterrupt / SystemExit still propagate so that the
            # notebook can be interrupted. The error is printed, not hidden.
            print(f'>> Failed: {err!r}')
            return { 'status' : hyperopt.STATUS_FAIL}

        print(f'>> Result: {resultDict["randIndex"]}')
        return {
            'loss' : -resultDict['randIndex'],
            'status' : STATUS_OK,
            'eval_time' : time.time(),
        }

    trials = hyperopt_utils.getTrials(filename, restart = restart)
    for e in range(len(trials), max_evals, evals_per_epoch):
        # fmin's max_evals is the total size the trials object should reach,
        # so cap it: otherwise the last batch overshoots the requested budget.
        fmin(evaluateLouvain,
             space,
             algo = tpe.suggest,
             max_evals = min(e + evals_per_epoch, max_evals),
             trials = trials)
        print('Index ', e)
        # Checkpoint after every batch; the context manager closes the handle.
        with open(filename, 'wb') as fh:
            pickle.dump(trials, fh)
    return trials

## Brain CIDR

In [29]:
# Search space handed to hyperopt for the brainCIDR dataset. Integer-valued
# settings are drawn with quniform (step 1) and cast through scope.int.
dataset = 'brainCIDR'
space = {
    'dataset': dataset,
    'minCellsPerGene': scope.int(hp.quniform('minCellsPerGene', 0, 5, 1)),
    'minGeneDispersion': hp.uniform('minGeneDispersion', 0, 1.5),
    'log': hp.choice('log', [True, False]),
    'scaler': hp.choice(
        'scaler',
        ['none', 'standardScaleGenes', 'standardScaleCells', 'robustScaleGenes', 'robustScaleCells'],
    ),
    'pca_comp': scope.int(hp.quniform('pca_comp', 5, 300, 1)),
    'doUmap': hp.choice('doUmap', [True, False]),
    'umap_comp': scope.int(hp.quniform('umap_comp', 2, 5, 1)),
    'nb_neighbors': scope.int(hp.quniform('nb_neighbors', 6, 30, 1)),
}

# Pickle file in which the trials for this dataset are checkpointed.
filename = f'{dataset}_louvain_trials.pkl'

In [30]:
# Fetch the trials object associated with `filename` without forcing a restart.
# NOTE(review): presumably this loads previously pickled trials when the file
# exists and creates a fresh object otherwise — confirm in hyperopt_utils.getTrials.
trials=hyperopt_utils.getTrials(filename ,restart = False )

Creating new trials...


In [None]:
# Run the search for brainCIDR with a budget of 500 evaluations.
# runHyperopt obtains its own trials via getTrials(filename, restart=True) and
# returns them, so the `trials` assigned by the previous cell is replaced here.
# NOTE(review): restart=True presumably discards any saved trials — confirm this is intended.
trials = runHyperopt(filename, space, max_evals = 500, restart = True);

Creating new trials...
>> Result: 0.3823085687972271
>> Result: 0.26569598332185074
>> Result: 0.42681363257444055
>> Result: 0.41937512893833306
>> Result: 0.5682008392900804
>> Result: 0.35089113970218055


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.6435556351100351
>> Result: 0.3233628764992152
>> Result: 0.45479620648895835
>> Result: 0.527020448578572
Index  0
>> Result: 0.44317862756972254
>> Result: 0.39861367397288233
>> Result: 0.3892968675720729
>> Result: 0.37831073005945337
>> Result: 0.4961602750698821


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.7596723111604312
>> Result: 0.5483240288797667
>> Result: 0.34335539098249956


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.5086045909774787
>> Result: 0.3890820198734051
Index  10


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.6344212618029101


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.6983109438612145


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.7012275715185803


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.7189609460278771


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.6972647834302852


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.676737089526265
>> Result: 0.648476921354741


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.7221909444707502
>> Result: 0.550911150634164


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.6150012753912408
Index  20


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.5362954117394108


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.68477249434332
>> Result: 0.4826934430976177
>> Result: 0.6010526546557816


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.6834006949993392


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.5212688739614642
>> Result: 0.4440049332290721
>> Result: 0.6013424906468426


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.7246914179406098


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.7268063371751615
Index  30


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.6751047562493977


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.7003167958971882


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.650408129666742
>> Result: 0.36656523379605854


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.6249971110683609
>> Result: 0.4982147821762358


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.7061152510024388
>> Result: 0.41564152058048637


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.44335498455267275
>> Result: 0.3343167211708708
Index  40
>> Result: 0.400445782594324


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.7579557813818834
>> Result: 0.7535573945030459


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.4657092140259908
>> Result: 0.5716164153090045
>> Result: 0.45579640752711104


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.42895525416885555
>> Result: 0.5561800447890313
>> Result: 0.4722103022514341
>> Result: 0.6935120456289303
Index  50


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.47351558085713386


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.5490597992742366
>> Result: 0.5853219546839146


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


>> Result: 0.7308323089666864
>> Result: 0.3891154409580249
>> Result: 0.7643200271042186
>> Result: 0.7493201642890226
>> Result: 0.6602662794678176
>> Result: 0.570075995461746
>> Result: 0.5440569319691174
Index  60
>> Result: 0.4407932202106022
>> Result: 0.7836092125332547
>> Result: 0.6715425064973602
>> Result: 0.676404312700071
>> Result: 0.7890512703035873
>> Result: 0.5518334532818321
>> Result: 0.6695364164156907
>> Result: 0.6879806555422511
>> Result: 0.770109012292583
>> Result: 0.7801644359143579
Index  70
>> Result: 0.8413495159018819
>> Result: 0.67367144760814
>> Result: 0.7876663083099568
>> Result: 0.46523660525485255
>> Result: 0.5174730646445744
>> Result: 0.7919127987354188
>> Result: 0.5680880528552832
>> Result: 0.5303652658573142
>> Result: 0.7512775297348602
>> Result: 0.7901441510891695
Index  80
>> Result: 0.7680808878055183
>> Result: 0.7588220005031866
>> Result: 0.4267262887300326
>> Result: 0.7368367805963439
>> Result: 0.6645374004605826
>> Result: 0.4

In [None]:
# Convert the trials into a summary frame and show the five rows with the
# smallest 'result' value.
# NOTE(review): assumes 'result' holds the minimised loss (negated rand index),
# so ascending order lists the best runs first — verify in hyperopt_utils.getResultsAsDf.
summaryDf = hyperopt_utils.getResultsAsDf(trials, space)
summaryDf.sort_values(by='result', ascending =True).head()

In [None]:
# Persist the summary frame next to the notebook, keyed by dataset name.
summaryDf.to_pickle(f'{dataset}_louvain_df.pkl')

In [None]:
# Reload the saved summary from disk (so this cell works without re-running
# the search) and hand it to gmm_utils.plotBestPrediction for this dataset.
# Only unpickle files produced by this notebook: pickle can execute code on load.
summaryDf = pd.read_pickle(f'{dataset}_louvain_df.pkl')
gmm_utils.plotBestPrediction(summaryDf, dataset)

## PancreaticIsletCIDR

In [None]:
# Switch the working dataset to pancreaticIsletCIDR; `dataset`, `df` and
# `truth` from the previous section are overwritten.
dataset= 'pancreaticIsletCIDR'
df, truth = gmm_utils.loadData(dataset)
# Sanity check: data shape, label shape and the distinct cluster labels.
print(df.shape, truth.shape, truth.clusters.unique())

In [None]:

# Search space for the dataset currently held in `dataset`. Integer-valued
# settings are drawn with quniform (step 1) and cast through scope.int.
space = {
    'dataset': dataset,
    'minCellsPerGene': scope.int(hp.quniform('minCellsPerGene', 0, 5, 1)),
    'minGeneDispersion': hp.uniform('minGeneDispersion', 0, 1.5),
    'log': hp.choice('log', [True, False]),
    'scaler': hp.choice(
        'scaler',
        ['none', 'standardScaleGenes', 'standardScaleCells', 'robustScaleGenes', 'robustScaleCells'],
    ),
    'pca_comp': scope.int(hp.quniform('pca_comp', 5, 300, 1)),
    'doUmap': hp.choice('doUmap', [True, False]),
    'umap_comp': scope.int(hp.quniform('umap_comp', 2, 5, 1)),
    'nb_neighbors': scope.int(hp.quniform('nb_neighbors', 6, 30, 1)),
}

# Pickle file in which the trials for this dataset are checkpointed.
filename = f'{dataset}_louvain_trials.pkl'

In [None]:
# Fetch the trials object associated with `filename` without forcing a restart.
# NOTE(review): presumably loads previously pickled trials when available — confirm in hyperopt_utils.getTrials.
trials=hyperopt_utils.getTrials(filename ,restart = False )

In [None]:
# Run the search for the current dataset with a budget of 500 evaluations.
# runHyperopt fetches its own trials with restart=True and returns them, so the
# `trials` assigned by the previous cell is replaced here.
trials = runHyperopt(filename, space, max_evals = 500, restart = True);

In [None]:
# Convert the trials into a summary frame and show the five rows with the
# smallest 'result' value.
# NOTE(review): assumes 'result' is the minimised loss, so ascending order
# puts the best runs first — verify in hyperopt_utils.getResultsAsDf.
summaryDf = hyperopt_utils.getResultsAsDf(trials, space)
summaryDf.sort_values(by='result', ascending =True).head()

In [None]:
# Persist the summary frame next to the notebook, keyed by dataset name.
summaryDf.to_pickle(f'{dataset}_louvain_df.pkl')

In [None]:
# Reload the saved summary from disk and pass it to
# gmm_utils.plotBestPrediction, here with pca_comp=4.
# NOTE(review): the brainCIDR section calls this without pca_comp — confirm the difference is intended.
summaryDf = pd.read_pickle(f'{dataset}_louvain_df.pkl')
gmm_utils.plotBestPrediction(summaryDf, dataset, pca_comp = 4)

## Deng

In [None]:
# Switch the working dataset to deng; `dataset`, `df` and `truth` from the
# previous section are overwritten.
dataset= 'deng'
df, truth = gmm_utils.loadData(dataset)
# NOTE(review): umap2D is not read by any later cell visible in this notebook —
# confirm whether this call is still needed.
umap2D = gmm_utils.getUmap(df, pca_comp = 10)
# Sanity check: data shape, label shape and the distinct cluster labels.
print(df.shape, truth.shape, truth.clusters.unique())

In [None]:

# Search space for the dataset currently held in `dataset`. Integer-valued
# settings are drawn with quniform (step 1) and cast through scope.int.
space = {
    'dataset': dataset,
    'minCellsPerGene': scope.int(hp.quniform('minCellsPerGene', 0, 5, 1)),
    'minGeneDispersion': hp.uniform('minGeneDispersion', 0, 1.5),
    'log': hp.choice('log', [True, False]),
    'scaler': hp.choice(
        'scaler',
        ['none', 'standardScaleGenes', 'standardScaleCells', 'robustScaleGenes', 'robustScaleCells'],
    ),
    'pca_comp': scope.int(hp.quniform('pca_comp', 5, 300, 1)),
    'doUmap': hp.choice('doUmap', [True, False]),
    'umap_comp': scope.int(hp.quniform('umap_comp', 2, 5, 1)),
    'nb_neighbors': scope.int(hp.quniform('nb_neighbors', 6, 30, 1)),
}

# Pickle file in which the trials for this dataset are checkpointed.
filename = f'{dataset}_louvain_trials.pkl'

In [None]:
# Fetch the trials object associated with `filename` without forcing a restart.
# NOTE(review): presumably loads previously pickled trials when available — confirm in hyperopt_utils.getTrials.
trials=hyperopt_utils.getTrials(filename ,restart = False )

In [None]:
# Run the search for the current dataset with a budget of 500 evaluations.
# runHyperopt fetches its own trials with restart=True and returns them, so the
# `trials` assigned by the previous cell is replaced here.
trials = runHyperopt(filename, space, max_evals = 500, restart = True);

In [None]:
# Convert the trials into a summary frame and show the five rows with the
# smallest 'result' value.
# NOTE(review): assumes 'result' is the minimised loss, so ascending order
# puts the best runs first — verify in hyperopt_utils.getResultsAsDf.
summaryDf = hyperopt_utils.getResultsAsDf(trials, space)
summaryDf.sort_values(by='result', ascending =True).head()

In [None]:
# Persist the summary frame next to the notebook, keyed by dataset name.
summaryDf.to_pickle(f'{dataset}_louvain_df.pkl')

In [None]:
# Reload the saved summary from disk and pass it to
# gmm_utils.plotBestPrediction, here with pca_comp=4.
summaryDf = pd.read_pickle(f'{dataset}_louvain_df.pkl')
gmm_utils.plotBestPrediction(summaryDf, dataset, pca_comp = 4)

In [None]:
# FIXME(review): this cell cannot run from a fresh kernel as written.
# `optimalNbClustersGMM` and `pc` are not defined or imported in any visible
# cell, and the `params` dict built earlier has no 'min_clusters' or
# 'max_clusters' keys. It looks like a leftover from another notebook —
# confirm, then either restore the missing definitions or delete the cell.
bestBic, bestAic, bestSil = optimalNbClustersGMM(pc, params['min_clusters'], params['max_clusters'], 2)

In [None]:
# Display the three values unpacked from optimalNbClustersGMM in the previous cell.
bestBic, bestAic, bestSil

In [None]:
# Fit a full-covariance Gaussian mixture for each candidate component count
# and print the adjusted Rand score of its predictions against the labels.
# FIXME(review): `n_clusters` and `pc` are not defined in any visible cell, so
# this loop depends on hidden kernel state.
for n_clust in n_clusters:
    # random_state=0 fixes the mixture's random initialisation between runs.
    model = GaussianMixture(n_clust, covariance_type ='full', random_state = 0).fit(pc)
    clusters = model.predict(pc)
    score = adjusted_rand_score(truth.clusters.tolist(), clusters)
    print(f"For {n_clust} clusters, score : {score}")

## Evaluation

In [None]:
# Fit a single 8-component, full-covariance Gaussian mixture on `pc` and
# display its adjusted Rand score against the ground-truth labels.
# FIXME(review): `pc` is not defined in any visible cell.
# NOTE(review): 8 equals the number of distinct brainCIDR labels printed
# earlier, but when cells run in order `truth` holds the deng labels here —
# confirm which dataset this is meant to evaluate.
model = GaussianMixture(8, covariance_type ='full', random_state = 0).fit(pc)
clusters = model.predict(pc)
score = adjusted_rand_score(truth.clusters.tolist(), clusters)
score

In [None]:
# Shape of the per-label counts, i.e. how many distinct cluster labels `truth` contains.
truth.clusters.value_counts().shape

In [None]:
# Reload brainCIDR and call gmm_utils.run once with hand-picked settings and a
# fixed nb_clusters of 8.
dataset = 'brainCIDR'
df, truth = gmm_utils.loadData(dataset)
umap2D = gmm_utils.getUmap(df, pca_comp=10)
print(df.shape, truth.shape, truth.clusters.unique())

# Same keys, values and insertion order as building the dict key by key.
params = {
    'dataset': 'brainCIDR',
    'minCellsPerGene': 0,
    'minGeneDispersion': 0,
    'log': True,                     # True or False
    'scaler': 'standardScaleCells',
    'pca_comp': 10,                  # author's note: range (3, 300)
    'nb_clusters': 8,                # author's note: 3 - 15
}
gmm_utils.run(params);