In [None]:
import sys
sys.path.append("..") # this adds to path parent directory in order to import utils file
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import random
from tqdm import tqdm
import numpy as np
from sklearn import metrics
from IPython.display import clear_output, Image, display
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.mixture import GaussianMixture
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.decomposition import PCA
import hyperopt
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample
import time
import pickle
import umap
import sys, traceback, os

## Import custom utils
import utils
import gmm_utils
import hyperopt_utils

In [None]:
%load_ext autoreload
import importlib
importlib.reload(gmm_utils)
importlib.reload(hyperopt_utils)
importlib.reload(utils)

## Hyperopt utils

In [None]:
# Method tag; later cells interpolate it into the trials/results file names.
method = 'gmm'
def runHyperopt(trialsFile, resultsFile, space, max_evals = 2, restart = False):
    """Search the GMM pipeline's hyper-parameters with hyperopt's TPE algorithm.

    Every successful evaluation is appended as one row to the DataFrame pickled
    at ``resultsFile``. The search is driven in batches of 10 evaluations.

    Parameters
    ----------
    trialsFile : str
        Path forwarded to ``hyperopt_utils.getTrials`` to obtain the trials object.
        NOTE(review): the trials are not written back to this file here (the
        ``pickle.dump`` call below is disabled) - confirm whether that is intended.
    resultsFile : str
        Pickle file holding the results DataFrame; created on the first
        successful evaluation and extended afterwards.
    space : dict
        hyperopt search space; each sampled point is passed to ``gmm_utils.runGMM``.
    max_evals : int
        Upper bound on the total number of trials, counting those already
        present in the trials object.
    restart : bool
        Forwarded unchanged to ``hyperopt_utils.getTrials``.

    Returns
    -------
    The trials object after the search has finished.
    """
    def evaluateGMM(args):
        """Objective for ``fmin``: run one configuration, log it, return its loss."""
        try:
            resultDict, _ = gmm_utils.runGMM(args)
            newDf = pd.DataFrame.from_dict(resultDict, orient='index').T
            if os.path.isfile(resultsFile):
                # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
                results = pd.concat([pd.read_pickle(resultsFile), newDf])
            else:
                results = newDf
            results.to_pickle(resultsFile)
        except Exception:
            # Report the failure to hyperopt instead of aborting the whole search.
            # KeyboardInterrupt is deliberately not caught so the run can be stopped.
            traceback.print_exc(file=sys.stdout)
            return { 'status' : hyperopt.STATUS_FAIL}

        print(f'>> Result: {resultDict["_rand_index"]}')
        return {
            # hyperopt minimises the loss, so the score is negated.
            'loss': -resultDict['_rand_index'],
            'status': STATUS_OK,
            'eval_time': time.time(),
        }

    trials = hyperopt_utils.getTrials(trialsFile ,restart = restart )
    evals_per_epoch = 10
    for e in range(len(trials), max_evals, evals_per_epoch):
        # fmin's max_evals is the total trial count to reach; clamp the last
        # batch so the search never runs past the requested max_evals.
        fmin(evaluateGMM
             ,space
             ,algo=tpe.suggest
             ,max_evals=min(e + evals_per_epoch, max_evals)
             ,trials=trials)
        print('Index ', e)
#         pickle.dump(trials, open(trialsFile, 'wb'))
    return trials

## Brain CIDR

In [None]:
# Brain CIDR: output locations and the hyperopt search space.
dataset = 'brainCIDR'
trialsFile = f'data/{dataset}_gmm_trials.pkl'
resultsFile = f'data/{dataset}_gmm_results.pkl'

# Each sampled point of this space is handed to gmm_utils.runGMM by runHyperopt.
space = {
    'dataset': dataset,
    # quniform values are quantised to step 1 and cast to int.
    'minCellsPerGene': scope.int(hp.quniform('minCellsPerGene', 0, 5, 1)),
    'minGeneDispersion': hp.uniform('minGeneDispersion', 0, 1.5),
    'log': hp.choice('log', [True,False]),
    'scaler': hp.choice(
        'scaler',
        ['none','standardScaleGenes', 'standardScaleCells', 'robustScaleGenes', 'robustScaleCells'],
    ),
    'pca_comp': scope.int(hp.quniform('pca_comp', 3, 300,1)),
    'nb_clusters': scope.int(hp.quniform('nb_clusters', 3, 15, 1)),
}



In [None]:
# Smoke testing
# params = {
#     'dataset' : 'brainCIDR'
#     ,'minCellsPerGene':2
#     ,'minGeneDispersion':0.5
#     ,'log' : True
#     ,'scaler' : 'standardScaleCells'
#     ,'pca_comp' : 10
#     ,'nb_clusters' : 10
# }

# params = {'dataset': 'brainCIDR', 
#           'log': False, 
#           'minCellsPerGene': 2, 
#           'minGeneDispersion': 1.3643019957416587, 
#           'nb_clusters': 9, 
#           'pca_comp': 246, 
#           'scaler': 'none'}
# gmm_utils.runGMM(params)

In [None]:
# Launch the search for the current dataset with a budget of 500 trials.
trials = runHyperopt(
    trialsFile,
    resultsFile,
    space,
    max_evals=500,
    restart=True,
)

In [None]:
# Disabled: gmm_utils.plotBestPrediction(summaryDf, dataset)
# Reload the logged evaluations, show the three best by `_rand_index`,
# then draw the correlation plot for this dataset.
resultsDf = pd.read_pickle(resultsFile)
display(
    resultsDf
    .sort_values(by='_rand_index', ascending=False)
    .head(3)
)
gmm_utils.plotCorrelation(resultsDf, name=dataset)

## PancreaticIsletCIDR

In [None]:
# Pancreatic islet dataset: pick the output files, then load data and ground truth.
dataset = 'pancreaticIsletCIDR'
trialsFile = f'data/{dataset}_gmm_trials.pkl'
resultsFile = f'data/{dataset}_gmm_results.pkl'

df, truth = gmm_utils.loadData(dataset)
# Sanity check: both shapes and the distinct values of truth.clusters.
print(df.shape, truth.shape, truth.clusters.unique())

In [None]:

# Search space for the current dataset; sampled points go to gmm_utils.runGMM via runHyperopt.
space = {
    'dataset': dataset,
    # quniform values are quantised to step 1 and cast to int.
    'minCellsPerGene': scope.int(hp.quniform('minCellsPerGene', 0, 5, 1)),
    'minGeneDispersion': hp.uniform('minGeneDispersion', 0, 1.5),
    'log': hp.choice('log', [True,False]),
    'scaler': hp.choice(
        'scaler',
        ['none','standardScaleGenes', 'standardScaleCells', 'robustScaleGenes', 'robustScaleCells'],
    ),
    'pca_comp': scope.int(hp.quniform('pca_comp', 3, 300,1)),
    'nb_clusters': scope.int(hp.quniform('nb_clusters', 3, 15, 1)),
}



In [None]:
# Launch the search for the current dataset with a budget of 500 trials.
trials = runHyperopt(
    trialsFile,
    resultsFile,
    space,
    max_evals=500,
    restart=True,
)

In [None]:
# Reload the logged evaluations, show the three best by `_rand_index`,
# then draw the correlation plot for this dataset.
resultsDf = pd.read_pickle(resultsFile)
display(
    resultsDf
    .sort_values(by='_rand_index', ascending=False)
    .head(3)
)
gmm_utils.plotCorrelation(resultsDf, name=dataset)

## Deng

In [None]:
# Deng dataset: pick the output files, then load data and ground truth.
dataset = 'deng'
trialsFile = f'data/{dataset}_gmm_trials.pkl'
resultsFile = f'data/{dataset}_gmm_results.pkl'

df, truth = gmm_utils.loadData(dataset)
# Disabled: umap2D = gmm_utils.getUmap(df, pca_comp = 10)
# Sanity check: both shapes and the distinct values of truth.clusters.
print(df.shape, truth.shape, truth.clusters.unique())

In [None]:

# Search space for the current dataset; sampled points go to gmm_utils.runGMM via runHyperopt.
space = {
    'dataset': dataset,
    # quniform values are quantised to step 1 and cast to int.
    'minCellsPerGene': scope.int(hp.quniform('minCellsPerGene', 0, 5, 1)),
    'minGeneDispersion': hp.uniform('minGeneDispersion', 0, 1.5),
    'log': hp.choice('log', [True,False]),
    'scaler': hp.choice(
        'scaler',
        ['none','standardScaleGenes', 'standardScaleCells', 'robustScaleGenes', 'robustScaleCells'],
    ),
    'pca_comp': scope.int(hp.quniform('pca_comp', 3, 300,1)),
    'nb_clusters': scope.int(hp.quniform('nb_clusters', 3, 15, 1)),
}



In [None]:
# Launch the search for the current dataset with a budget of 500 trials.
trials = runHyperopt(
    trialsFile,
    resultsFile,
    space,
    max_evals=500,
    restart=True,
)

In [None]:
# Reload the logged evaluations, show the three best by `_rand_index`,
# then draw the correlation plot for this dataset.
resultsDf = pd.read_pickle(resultsFile)
display(
    resultsDf
    .sort_values(by='_rand_index', ascending=False)
    .head(3)
)
gmm_utils.plotCorrelation(resultsDf, name=dataset)

## sce10x_qc

In [None]:
# sce10x_qc dataset: pick the output files (tagged with `method`), then load data and ground truth.
dataset = 'sce10x_qc'
trialsFile = f'data/{dataset}_{method}_trials.pkl'
resultsFile = f'data/{dataset}_{method}_results.pkl'

df, truth = utils.loadData(dataset)
# Sanity check: both shapes and the distinct values of truth.clusters.
print(df.shape, truth.shape, truth.clusters.unique())

In [None]:
# Search space for the current dataset; sampled points go to gmm_utils.runGMM via runHyperopt.
space = {
    'dataset': dataset,
    # quniform values are quantised to step 1 and cast to int.
    'minCellsPerGene': scope.int(hp.quniform('minCellsPerGene', 0, 5, 1)),
    'minGeneDispersion': hp.uniform('minGeneDispersion', 0, 1.5),
    'log': hp.choice('log', [True,False]),
    'scaler': hp.choice(
        'scaler',
        ['none','standardScaleGenes', 'standardScaleCells', 'robustScaleGenes', 'robustScaleCells'],
    ),
    'pca_comp': scope.int(hp.quniform('pca_comp', 3, 300,1)),
    'nb_clusters': scope.int(hp.quniform('nb_clusters', 3, 15, 1)),
}

# Launch the search with a budget of 500 trials.
trials = runHyperopt(
    trialsFile,
    resultsFile,
    space,
    max_evals=500,
    restart=True,
)

## sce2_qc

In [None]:
# sce2_qc dataset: pick the output files (tagged with `method`), then load data and ground truth.
dataset = 'sce2_qc'
trialsFile = f'data/{dataset}_{method}_trials.pkl'
resultsFile = f'data/{dataset}_{method}_results.pkl'

df, truth = utils.loadData(dataset)
# Sanity check: both shapes and the distinct values of truth.clusters.
print(df.shape, truth.shape, truth.clusters.unique())

In [None]:
# Search space for the current dataset; sampled points go to gmm_utils.runGMM via runHyperopt.
space = {
    'dataset': dataset,
    # quniform values are quantised to step 1 and cast to int.
    'minCellsPerGene': scope.int(hp.quniform('minCellsPerGene', 0, 5, 1)),
    'minGeneDispersion': hp.uniform('minGeneDispersion', 0, 1.5),
    'log': hp.choice('log', [True,False]),
    'scaler': hp.choice(
        'scaler',
        ['none','standardScaleGenes', 'standardScaleCells', 'robustScaleGenes', 'robustScaleCells'],
    ),
    'pca_comp': scope.int(hp.quniform('pca_comp', 3, 300,1)),
    'nb_clusters': scope.int(hp.quniform('nb_clusters', 3, 15, 1)),
}

# Launch the search with a budget of 500 trials.
trials = runHyperopt(
    trialsFile,
    resultsFile,
    space,
    max_evals=500,
    restart=True,
)

## sce8_qc

In [None]:
# sce8_qc dataset: pick the output files (tagged with `method`), then load data and ground truth.
dataset = 'sce8_qc'
trialsFile = f'data/{dataset}_{method}_trials.pkl'
resultsFile = f'data/{dataset}_{method}_results.pkl'

df, truth = utils.loadData(dataset)
# Sanity check: both shapes and the distinct values of truth.clusters.
print(df.shape, truth.shape, truth.clusters.unique())

In [None]:
# Search space for the current dataset; sampled points go to gmm_utils.runGMM via runHyperopt.
space = {
    'dataset': dataset,
    # quniform values are quantised to step 1 and cast to int.
    'minCellsPerGene': scope.int(hp.quniform('minCellsPerGene', 0, 5, 1)),
    'minGeneDispersion': hp.uniform('minGeneDispersion', 0, 1.5),
    'log': hp.choice('log', [True,False]),
    'scaler': hp.choice(
        'scaler',
        ['none','standardScaleGenes', 'standardScaleCells', 'robustScaleGenes', 'robustScaleCells'],
    ),
    'pca_comp': scope.int(hp.quniform('pca_comp', 3, 300,1)),
    'nb_clusters': scope.int(hp.quniform('nb_clusters', 3, 15, 1)),
}

# Launch the search with a budget of 500 trials.
trials = runHyperopt(
    trialsFile,
    resultsFile,
    space,
    max_evals=500,
    restart=True,
)

In [None]:
# NOTE(review): `optimalNbClustersGMM`, `pc` and the 'min_clusters'/'max_clusters'
# entries of `params` are not defined in any earlier cell visible here; this cell
# presumably relies on leftover kernel state - confirm before a fresh Run All.
bestBic, bestAic, bestSil = optimalNbClustersGMM(pc, params['min_clusters'], params['max_clusters'], 2)

In [None]:
# Display the three values unpacked from optimalNbClustersGMM as the cell output.
bestBic, bestAic, bestSil

In [None]:
# For each candidate cluster count, fit a full-covariance GaussianMixture on `pc`
# (fixed random_state for repeatability) and print the adjusted Rand score of its
# predictions against truth.clusters.
# NOTE(review): `n_clusters` and `pc` are not defined in the cells visible here - confirm their source.
for n_clust in n_clusters:
    model = GaussianMixture(n_clust, covariance_type ='full', random_state = 0).fit(pc)
    clusters = model.predict(pc)
    score = adjusted_rand_score(truth.clusters.tolist(), clusters)
    print(f"For {n_clust} clusters, score : {score}")

## Evaluation

In [None]:
# Fit one full-covariance GaussianMixture with 8 components on `pc` and show the
# adjusted Rand score of its predictions against truth.clusters.
# NOTE(review): `pc` is not defined in the cells visible here - confirm its source.
model = GaussianMixture(8, covariance_type ='full', random_state = 0).fit(pc)
clusters = model.predict(pc)
score = adjusted_rand_score(truth.clusters.tolist(), clusters)
score

In [None]:
# Shape of the per-label counts; presumably a one-element tuple holding the number
# of distinct labels, assuming truth.clusters is a pandas Series - TODO confirm.
truth.clusters.value_counts().shape

In [None]:
# Manual single run on Brain CIDR with hand-picked parameters.
dataset = 'brainCIDR'
df, truth = gmm_utils.loadData(dataset)
umap2D = gmm_utils.getUmap(df, pca_comp = 10)
print(df.shape, truth.shape, truth.clusters.unique())

params = {
    'dataset': 'brainCIDR',
    'minCellsPerGene': 0,
    'minGeneDispersion': 0,
    'log': True,                      # True, False
    'scaler': 'standardScaleCells',
    'pca_comp': 10,                   # range (3, 300)
    'nb_clusters': 8,                 # 3 - 15
}
# NOTE(review): the hyperopt objective in this notebook calls gmm_utils.runGMM;
# confirm that gmm_utils.run exists and is the intended entry point.
gmm_utils.run(params);