# Clustering 3k PBMCs following a Seurat Tutorial

This started out (July 2017) as a demonstration that Scanpy can reproduce most of Seurat's ([Satija *et al.*, 2015](https://doi.org/10.1038/nbt.3192)) clustering tutorial ([link](http://satijalab.org/seurat/pbmc3k_tutorial.html)), which we gratefully acknowledge. In the meantime, we have added and removed several pieces.

The data consist of *3k PBMCs from a Healthy Donor* and are freely available from 10x Genomics ([here](http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz) from this [webpage](https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/pbmc3k)).

In [1]:
import numpy as np
import pandas as pd
import scanpy.api as sc
from scipy import sparse, io
from collections import Counter
import os.path
from sklearn.metrics.cluster import adjusted_rand_score
import matplotlib.pyplot as plt
import pickle
import os
import datetime
import hyperopt
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import time
# import utils
import seurat_utils
import hyperopt_utils
# Switch matplotlib to interactive mode and display any open figures.
plt.ion()
plt.show()
# Scanpy figure resolution (dots per inch).
sc.settings.set_figure_params(dpi=80)
# Keep scanpy quiet: only errors are reported at level 0.
sc.settings.verbosity = 0  # verbosity: errors (0), warnings (1), info (2), hints (3)
# sc.logging.print_versions()


  from ._conv import register_converters as _register_converters


scanpy==1.3.3 anndata==0.6.13 numpy==1.14.2 scipy==1.0.0 pandas==0.22.0 scikit-learn==0.19.1 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


In [2]:
# Load the IPython autoreload extension.
# NOTE(review): no `%autoreload` mode is set in this cell, so the helper
# modules are reloaded explicitly below.
%load_ext autoreload
import importlib
# Re-execute the project helper modules so that edits made to them on disk
# are picked up without restarting the kernel.
importlib.reload(seurat_utils)
importlib.reload(hyperopt_utils)

scanpy==1.3.3 anndata==0.6.13 numpy==1.14.2 scipy==1.0.0 pandas==0.22.0 scikit-learn==0.19.1 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


<module 'hyperopt_utils' from '/home/mada/ml/single-cell-sota/seurat/hyperopt_utils.py'>

# Input datasets

In [5]:

# Hyperopt search space for the Seurat-style pipeline.
# Keys starting with 'b' are on/off switches: when a switch is sampled as
# False, evaluateSeurat overwrites the matching numeric parameter with -1
# (presumably read by seurat_utils as "step disabled" -- TODO confirm).
space = {
    # Input dataset (fixed, not tuned).
    'load_inputDataset': 'brainCIDR',

    # Cell filter: minimum number of genes.
    'bpreprocess_min_genes': hp.choice('bpreprocess_min_genes', [True, False]),
    'preprocess_min_genes': hp.quniform('preprocess_min_genes', 0, 500, 10),

    # Gene filter: minimum number of cells.
    'bpreprocess_min_cells': hp.choice('bpreprocess_min_cells', [True, False]),
    'preprocess_min_cells': hp.quniform('preprocess_min_cells', 0, 20, 1),

    # Threshold on total features.
    'bpreprocess_teta_total_features': hp.choice(
        'bpreprocess_teta_total_features', [True, False]
    ),
    'preprocess_teta_total_features': hp.quniform(
        'preprocess_teta_total_features', 5000, 18000, 1000
    ),

    # Per-cell normalisation on/off.
    'preprocess_normalize_per_cell': hp.choice(
        'l_preprocess_normalize_per_cell', [True, False]
    ),

    # Gene selection by mean expression.
    'bpreprocess_filter_min_mean': hp.choice(
        'bpreprocess_filter_min_mean', [True, False]
    ),
    'preprocess_filter_min_mean': hp.uniform(
        'preprocess_filter_min_mean', 0, 0.5
    ),  # 0.0125
    'preprocess_filter_max_mean': hp.uniform('preprocess_filter_max_mean', 0, 6),

    # Gene selection by dispersion.
    'bpreprocess_filter_min_disp': hp.choice(
        'bpreprocess_filter_min_disp', [True, False]
    ),
    'preprocess_filter_min_disp': hp.uniform(
        'preprocess_filter_min_disp', 0, 1
    ),  # 0.5

    # Regression, scaling and clustering parameters.
    'preprocess_regress_out': hp.choice(
        'l_preprocess_regress_out', ['none', 'total_counts']
    ),
    'preprocess_scale': hp.choice('l_preprocess_scale', [-1, 5, 10, 15, 20]),  # 10
    'cluster_n_neighbors': hp.quniform('cluster_n_neighbors', 1, 20, 1),  # 10
    'cluster_n_pcs': hp.quniform('cluster_n_pcs', 20, 50, 1),  # 40

    # Fixed settings: plotting off, quiet preprocessing.
    'cluster_plot_pca': False,
    'preprocess_plot_preprocessing': False,
    'evaluate_plot_results': False,
    'preprocess_verbosity': 0,
}

# Pickle file in which the hyperopt trials are persisted.
# NOTE(review): the name mentions 'deng' while the dataset above is
# 'brainCIDR' -- confirm this is the intended trials file.
filename = 'deng_trials.pkl'

In [4]:
 
# trials=hyperopt_utils.getTrials(filename ,restart = False )

Reload trials size :20


In [None]:

def runSeurat(filename, space, max_evals = 2):
    """Run (or resume) a hyperopt TPE search over the Seurat-style pipeline.

    Trials are obtained from ``hyperopt_utils.getTrials(filename, restart=False)``
    and the search proceeds in batches of up to 10 evaluations. After every
    batch the trials object is pickled to ``filename`` so that an interrupted
    run loses at most one batch.

    Parameters
    ----------
    filename : str
        Path of the pickle file used to persist the trials.
    space : dict
        hyperopt search space. Every sampled point must provide the keys read
        in ``evaluateSeurat`` below (``cluster_*``, ``bpreprocess_*`` and
        ``preprocess_*``).
    max_evals : int
        Upper bound on the total number of evaluations, counting the trials
        that were already present when the function was called.

    Returns
    -------
    The trials object, including any evaluations added by this call.
    """
    # (switch, parameter) pairs: a falsy switch overrides the parameter with -1.
    switch_to_param = (
        ('bpreprocess_min_genes', 'preprocess_min_genes'),
        ('bpreprocess_min_cells', 'preprocess_min_cells'),
        ('bpreprocess_teta_total_features', 'preprocess_teta_total_features'),
        ('bpreprocess_filter_min_mean', 'preprocess_filter_min_mean'),
        ('bpreprocess_filter_min_disp', 'preprocess_filter_min_disp'),
    )

    # Define function to optimise
    def evaluateSeurat(args):
        """Evaluate one sampled point; the loss is the negated Rand index."""
        # hp.quniform yields floats; these two are cast to int before the run.
        args['cluster_n_neighbors'] = int(args['cluster_n_neighbors'])
        args['cluster_n_pcs'] = int(args['cluster_n_pcs'])

        for switch, param in switch_to_param:
            if not args[switch]:
                args[param] = -1

        if args['preprocess_teta_total_features'] is not None:
            args['preprocess_teta_total_features'] = int(args['preprocess_teta_total_features'])

        try:
            resultDict = seurat_utils.run(args)
        except Exception as err:
            # A failing configuration is recorded as a failed trial, but
            # KeyboardInterrupt/SystemExit are no longer swallowed and the
            # reason for the failure is reported.
            print(f'>> Trial failed: {err!r}')
            return {'status': hyperopt.STATUS_FAIL}

        print(f'>> Result: {resultDict["randIndex"]}')
        return {
            # hyperopt minimises, so negate the score.
            'loss': -resultDict['randIndex'],
            'status': STATUS_OK,
            'eval_time': time.time(),
        }

    trials = hyperopt_utils.getTrials(filename, restart=False)
    evals_per_epoch = 10
    for done in range(len(trials), max_evals, evals_per_epoch):
        # fmin's max_evals is the *total* trial count to reach; cap it so the
        # last batch never runs past the caller's budget.
        fmin(evaluateSeurat,
             space,
             algo=tpe.suggest,
             max_evals=min(done + evals_per_epoch, max_evals),
             trials=trials)

        # Checkpoint after every batch; the context manager closes the file.
        with open(filename, 'wb') as fh:
            pickle.dump(trials, fh)
    return trials

# Run (or resume from `filename`) the search, up to 20 evaluations in total.
trials = runSeurat(filename, space, max_evals = 20)

Creating new trials...
... reading from cache file ./cache/input-brainCIDR-matrix.h5ad
Loading dataset brainCIDR with 22085 genes and 420 cells
computing neighbors
    using 'X_pca' with n_pcs = 46
    finished (0:00:03.01) --> added to `.uns['neighbors']`
    'distances', weighted adjacency matrix
    'connectivities', weighted adjacency matrix
computing UMAP
    finished (0:00:01.69) --> added
    'X_umap', UMAP coordinates (adata.obsm)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished (0:00:00.01) --> found 11 clusters and added
    'louvain', the cluster labels (adata.obs, categorical)
Rand_index 0.5498210318686071
>> Result: 0.5498210318686071
... reading from cache file ./cache/input-brainCIDR-matrix.h5ad
Loading dataset brainCIDR with 22085 genes and 420 cells
computing neighbors
    using 'X_pca' with n_pcs = 33
    finished (0:00:00.17) --> added to `.uns['neighbors']`
    'distances', weighted adjacency matrix
    'connectivities', weight

  n_chunks = np.ceil(adata.X.shape[1] / len_chunk).astype(int)


computing neighbors
    using 'X_pca' with n_pcs = 48
    finished (0:00:00.13) --> added to `.uns['neighbors']`
    'distances', weighted adjacency matrix
    'connectivities', weighted adjacency matrix
computing UMAP
    finished (0:00:00.90) --> added
    'X_umap', UMAP coordinates (adata.obsm)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished (0:00:00.14) --> found 10 clusters and added
    'louvain', the cluster labels (adata.obs, categorical)
Rand_index 0.4807822750465332
>> Result: 0.4807822750465332
... reading from cache file ./cache/input-brainCIDR-matrix.h5ad
Loading dataset brainCIDR with 22085 genes and 420 cells
computing neighbors
    using 'X_pca' with n_pcs = 44
    finished (0:00:00.15) --> added to `.uns['neighbors']`
    'distances', weighted adjacency matrix
    'connectivities', weighted adjacency matrix
computing UMAP
    finished (0:00:01.04) --> added
    'X_umap', UMAP coordinates (adata.obsm)
running Louvain clustering
  

  return np.sum(resid_dev * freq_weights * var_weights / scale)


Loading dataset brainCIDR with 22085 genes and 420 cells
filtered out 6107 genes that are detected in less than 9.0 cells
regressing out ['total_counts']
    sparse input is densified and may lead to high memory use
    finished (0:00:07.14)
computing neighbors
    using 'X_pca' with n_pcs = 34
    finished (0:00:00.09) --> added to `.uns['neighbors']`
    'distances', weighted adjacency matrix
    'connectivities', weighted adjacency matrix
computing UMAP
    finished (0:00:00.64) --> added
    'X_umap', UMAP coordinates (adata.obsm)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished (0:00:00.01) --> found 12 clusters and added
    'louvain', the cluster labels (adata.obs, categorical)
Rand_index 0.4900825217823536
>> Result: 0.4900825217823536
... reading from cache file ./cache/input-brainCIDR-matrix.h5ad
Loading dataset brainCIDR with 22085 genes and 420 cells
computing neighbors
    using 'X_pca' with n_pcs = 39
    finished (0:00:00.15) --> ad

  return np.sum(resid_dev * freq_weights * var_weights / scale)


computing neighbors
    using 'X_pca' with n_pcs = 25
    finished (0:00:00.15) --> added to `.uns['neighbors']`
    'distances', weighted adjacency matrix
    'connectivities', weighted adjacency matrix
computing UMAP
    finished (0:00:01.00) --> added
    'X_umap', UMAP coordinates (adata.obsm)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished (0:00:00.02) --> found 8 clusters and added
    'louvain', the cluster labels (adata.obs, categorical)
Rand_index 0.5730766273394569
>> Result: 0.5730766273394569
... reading from cache file ./cache/input-brainCIDR-matrix.h5ad
Loading dataset brainCIDR with 22085 genes and 420 cells
filtered out 6107 genes that are detected in less than 9.0 cells
regressing out ['total_counts']
    sparse input is densified and may lead to high memory use
    finished (0:00:33.09)
computing neighbors
    using 'X_pca' with n_pcs = 27
    finished (0:00:00.13) --> added to `.uns['neighbors']`
    'distances', weighted adjac

  return np.sum(resid_dev * freq_weights * var_weights / scale)


... reading from cache file ./cache/input-brainCIDR-matrix.h5ad
Loading dataset brainCIDR with 22085 genes and 420 cells
filtered out 7388 genes that are detected in less than 14.0 cells
regressing out ['total_counts']
    sparse input is densified and may lead to high memory use
    finished (0:00:29.73)
computing neighbors
    using 'X_pca' with n_pcs = 46
    finished (0:00:00.06) --> added to `.uns['neighbors']`
    'distances', weighted adjacency matrix
    'connectivities', weighted adjacency matrix
computing UMAP
    finished (0:00:00.27) --> added
    'X_umap', UMAP coordinates (adata.obsm)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished (0:00:00.00) --> found 59 clusters and added
    'louvain', the cluster labels (adata.obs, categorical)
Rand_index 0.16742385848803593
>> Result: 0.16742385848803593
... reading from cache file ./cache/input-brainCIDR-matrix.h5ad
Loading dataset brainCIDR with 22085 genes and 420 cells
filtered out 5755 g

In [None]:
# Gather the trials into a table via the project helper (presumably a pandas
# DataFrame -- confirm in hyperopt_utils) and preview the first rows.
df = hyperopt_utils.getResultsAsDf(trials, space)
df.head()

Best result: 

In [None]:
# NOTE(review): the loss reported to hyperopt is -randIndex. If the 'result'
# column holds the Rand index itself rather than the loss, the best run is the
# maximum, not the minimum -- confirm against hyperopt_utils.getResultsAsDf.
df['result'].min()