In [1]:
import sys
sys.path.append("..") # this adds to path parent directory in order to import utils file
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import random
from tqdm import tqdm
import numpy as np
from sklearn import metrics
from IPython.display import clear_output, Image, display
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.mixture import GaussianMixture
from sklearn.metrics.cluster import adjusted_rand_score

from sklearn.decomposition import PCA
import hyperopt
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample
from sklearn.neighbors import kneighbors_graph
import igraph as ig
import louvain
import time
import pickle
import umap
import os
import traceback
## Import custom utils
import utils
import gmm_utils
import hyperopt_utils

elbowAnalysis
silhouetteAnalyis
elbowAnalysis
silhouetteAnalyis


In [2]:
# Load the autoreload extension. Note: no `%autoreload` mode is selected in this cell;
# the project helper modules are reloaded explicitly below.
%load_ext autoreload
import importlib
# Re-import the local helper modules so edits made to them on disk are picked up.
importlib.reload(gmm_utils)
importlib.reload(utils)
importlib.reload(hyperopt_utils)

elbowAnalysis
silhouetteAnalyis
elbowAnalysis
silhouetteAnalyis


<module 'hyperopt_utils' from '..\\hyperopt_utils.py'>

In [3]:
# Check run louvain method: load one dataset and define a single fixed configuration.
dataset = 'brainCIDR'
df, truth = utils.loadData(dataset)
y = truth.clusters

params = {
    'dataset': 'brainCIDR',
    'minCellsPerGene': 0,
    'minGeneDispersion': 0,
    'log': True,  # True, False
    'scaler': 'standardScaleCells',
    'pca_comp': 10,  # range (3, 300)
    'doUmap': False,  # True, False
    'umap_comp': 3,
    'nb_neighbors': 10,  # 3 - 15
}

In [4]:
# Emit one `self.<name> = <name>` line per parameter key (boilerplate for a constructor).
for k in params:
    print(f'self.{k} = {k}')

self.dataset = dataset
self.minCellsPerGene = minCellsPerGene
self.minGeneDispersion = minGeneDispersion
self.log = log
self.scaler = scaler
self.pca_comp = pca_comp
self.doUmap = doUmap
self.umap_comp = umap_comp
self.nb_neighbors = nb_neighbors


In [8]:
from sklearn.base import BaseEstimator

class LouvainModel(BaseEstimator):
    """Scikit-learn style wrapper: gene filtering -> scaling -> PCA -> kNN/Louvain clustering.

    Inputs are expected to be pandas DataFrames with one column per gene
    (the filters aggregate along axis 0 and select by ``df.columns``).

    Parameters
    ----------
    minCellsPerGene : int
        Keep a gene only if it has a value > 0 in at least this many rows.
    minGeneDispersion : float
        Keep a gene only if var/mean of its log1p values is at least this value.
    log : bool
        Apply ``np.log1p`` to the filtered matrix before scaling.
    scaler : str
        One of 'none', 'standardScaleGenes', 'standardScaleCells',
        'robustScaleGenes', 'robustScaleCells'.
    pca_comp : int
        Number of PCA components.
    doUmap : bool
        If True, pass the PCA output through ``gmm_utils.getUmap`` before clustering.
    umap_comp : int
        Value forwarded as ``ncomp`` to ``gmm_utils.getUmap``.
    nb_neighbors : int
        Value forwarded as ``neighbors`` to ``gmm_utils.cluster_knn_louvain``.
    """

    def __init__(self,
                 minCellsPerGene=0,
                 minGeneDispersion=0,
                 log=True,
                 scaler='standardScaleCells',
                 pca_comp=10,
                 doUmap=False,
                 umap_comp=3,
                 nb_neighbors=10):
        self.minCellsPerGene = minCellsPerGene
        self.minGeneDispersion = minGeneDispersion
        self.log = log
        self.scaler = scaler
        self.pca_comp = pca_comp
        self.doUmap = doUmap
        self.umap_comp = umap_comp
        self.nb_neighbors = nb_neighbors

    def _selectGenes(self, df):
        """Return the column labels of `df` that pass both gene filters."""
        # Filter 1: number of rows in which the gene is expressed (value > 0).
        expressedIn = (df > 0).sum(axis=0)
        df = df[df.columns[(expressedIn >= self.minCellsPerGene).values]]

        # Filter 2: dispersion (variance / mean) of the log1p values; drops genes
        # that are expressed almost equally everywhere.
        logDf = np.log1p(df)
        meanExpr = logDf.mean(axis=0)
        meanExpr[meanExpr == 0] = 1e-10  # avoid division by zero
        dispersion = logDf.var(axis=0) / meanExpr
        return df.columns[(dispersion >= self.minGeneDispersion).values]

    def preprocess(self, df):
        """Filter genes, optionally log-transform, and scale `df`.

        After `fit` has run, the gene set learned there (``self.genes_``) is
        reused so that the output always has the columns PCA was fitted on.
        Before `fit`, the gene set is derived from `df` itself.

        Returns
        -------
        numpy.ndarray
            The scaled matrix.

        Raises
        ------
        ValueError
            If ``self.scaler`` is not one of the supported names.
        """
        genes = getattr(self, 'genes_', None)
        if genes is None:
            genes = self._selectGenes(df)
        df = df[genes]

        if self.log:
            df = np.log1p(df)

        # Scaling; the "Cells" variants scale along the other axis by transposing.
        if self.scaler == 'none':
            return df.values
        if self.scaler == 'standardScaleGenes':
            return StandardScaler().fit_transform(df)
        if self.scaler == 'standardScaleCells':
            return StandardScaler().fit_transform(df.T).T
        if self.scaler == 'robustScaleGenes':
            return RobustScaler().fit_transform(df)
        if self.scaler == 'robustScaleCells':
            return RobustScaler().fit_transform(df.T).T
        raise ValueError(f'Unknown scaler: {self.scaler!r}')

    def fit(self, X, y=None):
        """Learn the gene selection and the PCA projection from `X`; `y` is ignored.

        Returns
        -------
        LouvainModel
            ``self``, following the scikit-learn estimator convention.
        """
        # Re-learn the gene set from X so a refit never reuses a stale selection.
        self.genes_ = self._selectGenes(X)
        scaled = self.preprocess(X)
        # PCA reduction
        self.pca = PCA(n_components=self.pca_comp)
        self.pca.fit(scaled)
        return self

    def predict(self, X):
        """Project `X` with the fitted PCA (optionally UMAP) and return Louvain cluster labels."""
        scaled = self.preprocess(X)
        data = self.pca.transform(scaled)
        if self.doUmap:
            data = gmm_utils.getUmap(data, ncomp = self.umap_comp)
        clusters = gmm_utils.cluster_knn_louvain(data, neighbors = self.nb_neighbors)
        return clusters

    def fitPredict(self, X):
        """Fit on `X` and return the cluster labels for the same `X`."""
        self.fit(X, None)
        return self.predict(X)

    def scorer(self,clf, X, y_true):
        """Scoring callable: adjusted Rand index between `y_true` and ``clf.predict(X)``."""
        print(f'scorer {X.shape}, {y_true.shape}')
        from sklearn.metrics.cluster import adjusted_rand_score
        y_pred = clf.predict(X)
        return adjusted_rand_score(y_true, y_pred)
        
# Pass the settings by keyword: positional arguments would bind the dataset name to
# `minCellsPerGene` and the whole dict to `minGeneDispersion`.
# 'dataset' is not a constructor argument, so it is left out.
model = LouvainModel(**{k: v for k, v in params.items() if k != 'dataset'})
model.fitPredict(df)

self.nb_neighbors = 10
fit (420, 22085)
predict (420, 22085), bla


array([ 3,  5,  5,  8,  8,  5,  8,  5,  3,  3,  8,  5,  0,  5,  9,  5,  5,
        5,  8,  5,  3,  8,  8,  8,  8,  5,  8,  3,  3,  3,  3,  5,  8,  3,
        9,  8,  8,  5,  5,  5,  2,  8,  3,  5,  3,  5,  5,  8,  8,  0,  8,
        5,  5,  3,  9,  5,  3,  8,  9,  3,  8,  0,  3,  5,  3,  3,  5,  3,
        9,  8,  8,  5,  8,  0,  3,  5,  8,  9,  8,  8,  3,  9,  9,  2,  2,
        3,  9,  8,  0,  8, 10, 10,  2,  6, 10,  1,  9,  4,  0,  9,  5,  2,
        4, 10,  2,  2,  2,  9,  4,  6,  7,  7,  2,  4,  4,  9,  2,  6,  0,
        0,  1,  0,  0,  6,  0,  1,  6,  0,  0,  1,  0,  0,  6,  0,  6,  6,
        2,  2,  9,  2,  8,  7,  9,  9,  2,  3,  2,  2,  3,  7,  9,  2,  2,
        5,  2,  5,  5,  2,  9,  2,  8,  5,  3,  5,  3,  5,  5,  5,  3,  2,
        3,  8,  5,  3,  8,  9,  3,  9,  5,  5,  4,  4,  1,  0,  7,  1,  0,
        1,  1,  0,  0,  0,  1,  1,  1,  0,  2,  0,  0,  0, 10,  6,  3,  6,
        5,  3, 10,  6,  4, 10,  6,  1,  6,  6, 10,  6,  3,  3,  3,  1,  3,
        1,  1,  2,  3,  1

In [9]:
from sklearn.model_selection import cross_val_score
# Two-fold cross-validation scored with the adjusted Rand index.
# The scoring callable defined on LouvainModel is `scorer`; as a bound method it is
# called as scorer(estimator, X, y).
model = LouvainModel(nb_neighbors=8)
cross_val_score(model, df, y,
                scoring = model.scorer,
                cv = 2)

self.nb_neighbors = 8
self.nb_neighbors = 8
fit (210, 22085)
scorer (210, 22085), (210,)
predict (210, 22085), bla
self.nb_neighbors = 8
fit (210, 22085)
scorer (210, 22085), (210,)
predict (210, 22085), bla


array([0.15086086, 0.29539015])

In [None]:
def runLouvain(params):
    """Run the Louvain clustering pipeline for one parameter configuration.

    Parameters
    ----------
    params : dict
        Must contain 'dataset', 'minCellsPerGene', 'minGeneDispersion', 'log',
        'scaler', 'pca_comp' and 'nb_neighbors'. An optional 'random_state' is
        forwarded to the train/test split (unseeded when absent).
        'doUmap' and 'umap_comp' are not used by this function.

    Returns
    -------
    tuple
        ``(params, clusters)``: the input dict, unchanged, and the value returned
        by ``gmm_utils.cluster_knn_louvain`` for all rows of the dataset.

    Raises
    ------
    ValueError
        If ``params['scaler']`` is not one of the supported scaler names.
    """
    # Imported here because the notebook's import cell does not provide it.
    from sklearn.model_selection import train_test_split

    df, truth = utils.loadData(params['dataset'])
    y = truth.clusters.tolist()
    # Stratified 80/20 split on row indices; only the train indices are used below.
    idxTrain, idxTest, _, _ = train_test_split(
        np.arange(df.shape[0]), y, stratify = y, test_size = 0.2,
        random_state = params.get('random_state'))
    print(len(idxTrain), len(idxTest))

    # Preprocessing remove genes which don't appear in at least minCellsPerGene cells
    expressedIn = (df > 0).sum(axis = 0)
    df = df[df.columns[(expressedIn >= params['minCellsPerGene']).values]]

    # Remove genes which have a very low variance as they are expressed equally in all cells
    logDf = np.log1p(df)
    nonZeroMean = logDf.mean(axis = 0)
    nonZeroMean[nonZeroMean == 0] = 1e-10  # avoid division by zero
    dispersion = logDf.var(axis = 0) / nonZeroMean
    df = df[df.columns[(dispersion >= params['minGeneDispersion']).values]]
    del logDf, nonZeroMean, dispersion

    if params['log']:
        df = np.log1p(df)

    # scaling; the "Cells" variants scale along the other axis by transposing
    scalerName = params['scaler']
    if scalerName == 'none':
        scaledDf = df.values
    elif scalerName == 'standardScaleGenes':
        scaledDf = StandardScaler().fit_transform(df)
    elif scalerName == 'standardScaleCells':
        scaledDf = StandardScaler().fit_transform(df.T).T
    elif scalerName == 'robustScaleGenes':
        scaledDf = RobustScaler().fit_transform(df)
    elif scalerName == 'robustScaleCells':
        scaledDf = RobustScaler().fit_transform(df.T).T
    else:
        raise ValueError(f'Unknown scaler: {scalerName!r}')

    # PCA reduction: fitted on the training rows only, then applied to every row.
    pca = PCA(n_components=params['pca_comp'])
    pca.fit(scaledDf[idxTrain])
    data = pca.transform(scaledDf)

    clusters = gmm_utils.cluster_knn_louvain(data, neighbors = params['nb_neighbors'])
    return params, clusters

In [None]:

print(df.shape, truth.shape, truth.clusters.unique())
# Single fixed configuration used to smoke-test runLouvain.
params = {
    'dataset': 'brainCIDR',
    'minCellsPerGene': 0,
    'minGeneDispersion': 0,
    'log': True,  # True, False
    'scaler': 'standardScaleCells',
    'pca_comp': 10,  # range (3, 300)
    'doUmap': True,  # True, False
    'umap_comp': 3,
    'nb_neighbors': 10,  # 3 - 15
}
xx, clusters = runLouvain(params)
xx

## Hyperopt utils

In [None]:
# Method tag interpolated into the trials/results pickle file names built in the cells below.
method = 'louvain'

In [None]:

def runHyperopt(trialsFile, resultsFile, space, max_evals = 2, restart = False):
    """Run a TPE search over `space`, appending every evaluation to `resultsFile`.

    Parameters
    ----------
    trialsFile : str
        Path handed to ``hyperopt_utils.getTrials`` to obtain the ``Trials`` object.
    resultsFile : str
        Pickle file holding a DataFrame with one row per successful evaluation;
        created on the first evaluation and extended afterwards.
    space : dict
        hyperopt search space; each sample is passed to ``gmm_utils.runLouvain``.
    max_evals : int
        Total number of evaluations to reach, counting those already in the trials.
    restart : bool
        Forwarded to ``hyperopt_utils.getTrials``.

    Returns
    -------
    The trials object after the search.
    """
    # Define function to optimise
    def evaluateLouvain(args):
        """Objective: negative `_rand_index` of one Louvain run; failures are reported to hyperopt."""
        try:
            resultDict, _ = gmm_utils.runLouvain(args)
            newDf = pd.DataFrame.from_dict(resultDict, orient='index').T
            if os.path.isfile(resultsFile):
                # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent.
                results = pd.concat([pd.read_pickle(resultsFile), newDf])
            else:
                results = newDf
            results.to_pickle(resultsFile)
        except Exception:
            # Catch Exception (not a bare except) so Ctrl-C can still interrupt a long search.
            traceback.print_exc(file=sys.stdout)
            return { 'status' : hyperopt.STATUS_FAIL}

        print(f'>> Result: {resultDict["_rand_index"]}')
        ret = {
            'loss' : -resultDict['_rand_index']
            ,'status' : STATUS_OK
            ,'eval_time' : time.time()
        }
        return ret

    trials = hyperopt_utils.getTrials(trialsFile ,restart = restart )
    evals_per_epoch = 10
    # Run in epochs of `evals_per_epoch`, resuming after the evaluations already in `trials`.
    for e in range(len(trials), max_evals, evals_per_epoch):
        fmin(evaluateLouvain
             ,space
             ,algo=tpe.suggest
             # Cap the target so the last epoch does not run past max_evals.
             ,max_evals= min(e + evals_per_epoch, max_evals)
             ,trials=trials)
        print('Index ', e)
        # NOTE(review): trials are not written back to trialsFile here (the pickle.dump
        # call was disabled); confirm whether hyperopt_utils persists them elsewhere.
    return trials

## Brain CIDR

In [None]:
dataset = 'brainCIDR'
# Search space sampled by hyperopt; integer-valued parameters are wrapped in scope.int.
space = {
    'dataset': dataset,
    'minCellsPerGene': scope.int(hp.quniform('minCellsPerGene', 0, 5, 1)),
    'minGeneDispersion': hp.uniform('minGeneDispersion', 0, 1.5),
    'log': hp.choice('log', [True, False]),
    'scaler': hp.choice(
        'scaler',
        ['none', 'standardScaleGenes', 'standardScaleCells', 'robustScaleGenes', 'robustScaleCells'],
    ),
    'pca_comp': scope.int(hp.quniform('pca_comp', 5, 300, 1)),
    'doUmap': hp.choice('doUmap', [True, False]),
    'umap_comp': scope.int(hp.quniform('umap_comp', 2, 5, 1)),
    'nb_neighbors': scope.int(hp.quniform('nb_neighbors', 6, 30, 1)),
}

# Per-dataset, per-method pickle files for the hyperopt trials and the result table.
trialsFile = f'data/{dataset}_{method}_trials.pkl'
resultsFile = f'data/{dataset}_{method}_results.pkl'

In [None]:
# Run the search for this dataset; `restart = True` is forwarded to hyperopt_utils.getTrials.
trials = runHyperopt(trialsFile, resultsFile, space, max_evals = 500, restart = True);

In [None]:
# Load the result table written during the search and show the three rows with the highest `_rand_index`.
resultsDf = pd.read_pickle(resultsFile)
display(resultsDf.sort_values(by='_rand_index', ascending = False).head(3))
# NOTE(review): other result cells call utils.plotCorrelation - confirm gmm_utils also exposes it.
gmm_utils.plotCorrelation(resultsDf, name=dataset)

## PancreaticIsletCIDR

In [None]:
# Select the dataset, load it, and derive the per-dataset trials/results file names.
dataset= 'pancreaticIsletCIDR'
# NOTE(review): other cells load data with utils.loadData - confirm gmm_utils.loadData exists and is equivalent.
df, truth = gmm_utils.loadData(dataset)
print(df.shape, truth.shape, truth.clusters.unique())
trialsFile = f'data/{dataset}_{method}_trials.pkl'
resultsFile = f'data/{dataset}_{method}_results.pkl'

In [None]:

# Search space sampled by hyperopt; integer-valued parameters are wrapped in scope.int.
space = {
    'dataset': dataset,
    'minCellsPerGene': scope.int(hp.quniform('minCellsPerGene', 0, 5, 1)),
    'minGeneDispersion': hp.uniform('minGeneDispersion', 0, 1.5),
    'log': hp.choice('log', [True, False]),
    'scaler': hp.choice(
        'scaler',
        ['none', 'standardScaleGenes', 'standardScaleCells', 'robustScaleGenes', 'robustScaleCells'],
    ),
    'pca_comp': scope.int(hp.quniform('pca_comp', 5, 300, 1)),
    'doUmap': hp.choice('doUmap', [True, False]),
    'umap_comp': scope.int(hp.quniform('umap_comp', 2, 5, 1)),
    'nb_neighbors': scope.int(hp.quniform('nb_neighbors', 6, 30, 1)),
}



In [None]:
# Load the trials stored for this dataset. `filename` is not defined in this notebook;
# the trials path is held in `trialsFile`.
trials = hyperopt_utils.getTrials(trialsFile, restart = False)

In [None]:
# Run the search for this dataset; `restart = True` is forwarded to hyperopt_utils.getTrials.
trials = runHyperopt(trialsFile, resultsFile, space, max_evals = 500, restart = True);

In [None]:

# Load the result table written during the search and show the three rows with the highest `_rand_index`.
resultsDf = pd.read_pickle(resultsFile)
display(resultsDf.sort_values(by='_rand_index', ascending = False).head(3))
# NOTE(review): other result cells call utils.plotCorrelation - confirm gmm_utils also exposes it.
gmm_utils.plotCorrelation(resultsDf, name=dataset)

## Deng

In [None]:
# Select the dataset, load it, and derive the per-dataset trials/results file names.
dataset= 'deng'
# NOTE(review): other cells load data with utils.loadData - confirm gmm_utils.loadData exists and is equivalent.
df, truth = gmm_utils.loadData(dataset)
print(df.shape, truth.shape, truth.clusters.unique())
trialsFile = f'data/{dataset}_{method}_trials.pkl'
resultsFile = f'data/{dataset}_{method}_results.pkl'

In [None]:
# Search space sampled by hyperopt; here `pca_comp` is capped at 200.
space = {
    'dataset': dataset,
    'minCellsPerGene': scope.int(hp.quniform('minCellsPerGene', 0, 5, 1)),
    'minGeneDispersion': hp.uniform('minGeneDispersion', 0, 1.5),
    'log': hp.choice('log', [True, False]),
    'scaler': hp.choice(
        'scaler',
        ['none', 'standardScaleGenes', 'standardScaleCells', 'robustScaleGenes', 'robustScaleCells'],
    ),
    'pca_comp': scope.int(hp.quniform('pca_comp', 5, 200, 1)),
    'doUmap': hp.choice('doUmap', [True, False]),
    'umap_comp': scope.int(hp.quniform('umap_comp', 2, 5, 1)),
    'nb_neighbors': scope.int(hp.quniform('nb_neighbors', 6, 30, 1)),
}
# Run the search for this dataset.
trials = runHyperopt(trialsFile, resultsFile, space, max_evals = 500, restart = True);

In [None]:
# Load the result table written during the search and show the three rows with the highest `_rand_index`.
resultsDf = pd.read_pickle(resultsFile)
display(resultsDf.sort_values(by='_rand_index', ascending = False).head(3))
utils.plotCorrelation(resultsDf, name=dataset)

## sce10x_qc

In [None]:
# Select the dataset, load it, and derive the per-dataset trials/results file names.
dataset= 'sce10x_qc'
df, truth = utils.loadData(dataset)
print(df.shape, truth.shape, truth.clusters.unique())
trialsFile = f'data/{dataset}_{method}_trials.pkl'
resultsFile = f'data/{dataset}_{method}_results.pkl'

In [None]:
# Search space sampled by hyperopt; here `pca_comp` is capped at 200.
space = {
    'dataset': dataset,
    'minCellsPerGene': scope.int(hp.quniform('minCellsPerGene', 0, 5, 1)),
    'minGeneDispersion': hp.uniform('minGeneDispersion', 0, 1.5),
    'log': hp.choice('log', [True, False]),
    'scaler': hp.choice(
        'scaler',
        ['none', 'standardScaleGenes', 'standardScaleCells', 'robustScaleGenes', 'robustScaleCells'],
    ),
    'pca_comp': scope.int(hp.quniform('pca_comp', 5, 200, 1)),
    'doUmap': hp.choice('doUmap', [True, False]),
    'umap_comp': scope.int(hp.quniform('umap_comp', 2, 5, 1)),
    'nb_neighbors': scope.int(hp.quniform('nb_neighbors', 6, 30, 1)),
}
# Run the search for this dataset.
trials = runHyperopt(trialsFile, resultsFile, space, max_evals = 500, restart = True);

In [None]:
# Load the result table written during the search and show the three rows with the highest `_rand_index`.
resultsDf = pd.read_pickle(resultsFile)
display(resultsDf.sort_values(by='_rand_index', ascending = False).head(3))
utils.plotCorrelation(resultsDf, name=dataset)

## sce2_qc

In [None]:
# Select the dataset, load it, and derive the per-dataset trials/results file names.
dataset= 'sce2_qc'
df, truth = utils.loadData(dataset)
print(df.shape, truth.shape, truth.clusters.unique())
trialsFile = f'data/{dataset}_{method}_trials.pkl'
resultsFile = f'data/{dataset}_{method}_results.pkl'

In [None]:
# Search space sampled by hyperopt; here `pca_comp` is capped at 200.
space = {
    'dataset': dataset,
    'minCellsPerGene': scope.int(hp.quniform('minCellsPerGene', 0, 5, 1)),
    'minGeneDispersion': hp.uniform('minGeneDispersion', 0, 1.5),
    'log': hp.choice('log', [True, False]),
    'scaler': hp.choice(
        'scaler',
        ['none', 'standardScaleGenes', 'standardScaleCells', 'robustScaleGenes', 'robustScaleCells'],
    ),
    'pca_comp': scope.int(hp.quniform('pca_comp', 5, 200, 1)),
    'doUmap': hp.choice('doUmap', [True, False]),
    'umap_comp': scope.int(hp.quniform('umap_comp', 2, 5, 1)),
    'nb_neighbors': scope.int(hp.quniform('nb_neighbors', 6, 30, 1)),
}
# Run the search for this dataset.
trials = runHyperopt(trialsFile, resultsFile, space, max_evals = 500, restart = True);

In [None]:
# Load the result table written during the search and show the three rows with the highest `_rand_index`.
resultsDf = pd.read_pickle(resultsFile)
display(resultsDf.sort_values(by='_rand_index', ascending = False).head(3))
utils.plotCorrelation(resultsDf, name=dataset)

## sce8_qc

In [None]:
# Select the dataset, load it, and derive the per-dataset trials/results file names.
dataset= 'sce8_qc'
df, truth = utils.loadData(dataset)
print(df.shape, truth.shape, truth.clusters.unique())
trialsFile = f'data/{dataset}_{method}_trials.pkl'
resultsFile = f'data/{dataset}_{method}_results.pkl'

In [None]:
# Search space sampled by hyperopt; here `pca_comp` is capped at 200.
space = {
    'dataset': dataset,
    'minCellsPerGene': scope.int(hp.quniform('minCellsPerGene', 0, 5, 1)),
    'minGeneDispersion': hp.uniform('minGeneDispersion', 0, 1.5),
    'log': hp.choice('log', [True, False]),
    'scaler': hp.choice(
        'scaler',
        ['none', 'standardScaleGenes', 'standardScaleCells', 'robustScaleGenes', 'robustScaleCells'],
    ),
    'pca_comp': scope.int(hp.quniform('pca_comp', 5, 200, 1)),
    'doUmap': hp.choice('doUmap', [True, False]),
    'umap_comp': scope.int(hp.quniform('umap_comp', 2, 5, 1)),
    'nb_neighbors': scope.int(hp.quniform('nb_neighbors', 6, 30, 1)),
}
# Run the search for this dataset.
trials = runHyperopt(trialsFile, resultsFile, space, max_evals = 500, restart = True);

In [None]:
# Load the result table written during the search and show the three rows with the highest `_rand_index`.
resultsDf = pd.read_pickle(resultsFile)
display(resultsDf.sort_values(by='_rand_index', ascending = False).head(3))
utils.plotCorrelation(resultsDf, name=dataset)

In [None]:
# NOTE(review): `optimalNbClustersGMM`, `pc` and the 'min_clusters'/'max_clusters' keys of
# `params` are not defined anywhere in the visible notebook; this cell looks like a leftover
# from a GMM analysis and will fail on a fresh kernel unless they come from elsewhere - confirm.
bestBic, bestAic, bestSil = optimalNbClustersGMM(pc, params['min_clusters'], params['max_clusters'], 2)

In [None]:
# Display the three values computed in the previous cell.
bestBic, bestAic, bestSil

In [None]:
# For each candidate component count, fit a full-covariance Gaussian mixture on `pc` and
# report the adjusted Rand index of its labels against `truth.clusters`.
# NOTE(review): `n_clusters` and `pc` are not defined in the visible notebook - confirm where they come from.
for n_clust in n_clusters:
    model = GaussianMixture(n_clust, covariance_type ='full', random_state = 0).fit(pc)
    clusters = model.predict(pc)
    score = adjusted_rand_score(truth.clusters.tolist(), clusters)
    print(f"For {n_clust} clusters, score : {score}")

## Evaluation

In [None]:
# Fit an 8-component full-covariance Gaussian mixture on `pc` and display the adjusted Rand
# index of its labels against `truth.clusters`.
# NOTE(review): `pc` is not defined in the visible notebook - confirm where it comes from.
model = GaussianMixture(8, covariance_type ='full', random_state = 0).fit(pc)
clusters = model.predict(pc)
score = adjusted_rand_score(truth.clusters.tolist(), clusters)
score

In [None]:
# Shape of the value-count table of `truth.clusters`, i.e. how many distinct labels it contains.
truth.clusters.value_counts().shape

In [None]:
dataset = 'brainCIDR'
# NOTE(review): other cells load data with utils.loadData - confirm gmm_utils.loadData exists and is equivalent.
df, truth = gmm_utils.loadData(dataset)
# NOTE(review): LouvainModel calls getUmap with `ncomp=`; confirm `pca_comp` is also an accepted keyword.
umap2D = gmm_utils.getUmap(df, pca_comp = 10)
print(df.shape, truth.shape, truth.clusters.unique())
# Single fixed configuration handed to gmm_utils.run.
params = {
    'dataset': 'brainCIDR',
    'minCellsPerGene': 0,
    'minGeneDispersion': 0,
    'log': True,  # True, False
    'scaler': 'standardScaleCells',
    'pca_comp': 10,  # range (3, 300)
    'nb_clusters': 8,  # 3 - 15
}
gmm_utils.run(params);