# Clustering 3k PBMCs following a Seurat Tutorial

This started out (July 2017) as a demonstration that Scanpy can reproduce most of Seurat's ([Satija *et al.*, 2015](https://doi.org/10.1038/nbt.3192)) clustering tutorial ([link](http://satijalab.org/seurat/pbmc3k_tutorial.html)), which we gratefully acknowledge. In the meantime, we have added and removed several pieces.

The data consist of *3k PBMCs from a Healthy Donor* and are freely available from 10x Genomics ([here](http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz) from this [webpage](https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/pbmc3k)).

In [55]:
import numpy as np
import pandas as pd
import scanpy.api as sc
from scipy import sparse, io
from collections import Counter
import os.path
from sklearn.metrics.cluster import adjusted_rand_score
import matplotlib.pyplot as plt
import pickle
import os
import datetime
import hyperopt
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import time
# import utils
import seurat_utils
import hyperopt_utils
# Turn on matplotlib's interactive mode for the rest of the notebook.
plt.ion()
plt.show()
# Figure resolution used by scanpy's plotting functions.
sc.settings.set_figure_params(dpi=80)
sc.settings.verbosity = 0  # verbosity: errors (0), warnings (1), info (2), hints (3)
# sc.logging.print_versions()


scanpy==1.3.3 anndata==0.6.13 numpy==1.14.2 scipy==1.0.0 pandas==0.22.0 scikit-learn==0.19.1 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


In [54]:
# Load the autoreload extension, then explicitly re-import the two local
# helper modules so that edits to their source files are picked up.
# NOTE(review): no `%autoreload` mode is selected in this cell, so the refresh
# relies on the explicit importlib.reload calls below — confirm that is intended.
%load_ext autoreload
import importlib
importlib.reload(seurat_utils)
importlib.reload(hyperopt_utils)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
scanpy==1.3.3 anndata==0.6.13 numpy==1.14.2 scipy==1.0.0 pandas==0.22.0 scikit-learn==0.19.1 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


<module 'hyperopt_utils' from '/home/mada/ml/single-cell-sota/seurat/hyperopt_utils.py'>

# Input datasets

In [5]:

# Search space handed to hyperopt. Every `b<name>` entry is an on/off switch
# for the matching `<name>` entry: the objective function replaces `<name>`
# with -1 when its switch is False.
space = {
    # Dataset to load (fixed, not searched).
    'load_inputDataset': 'brainCIDR',

    # Minimum-genes / minimum-cells filters.
    'bpreprocess_min_genes': hp.choice('bpreprocess_min_genes', [True, False]),
    'preprocess_min_genes': hp.quniform('preprocess_min_genes', 0, 500, 10),
    'bpreprocess_min_cells': hp.choice('bpreprocess_min_cells', [True, False]),
    'preprocess_min_cells': hp.quniform('preprocess_min_cells', 0, 20, 1),

    # Total-features threshold.
    'bpreprocess_teta_total_features': hp.choice('bpreprocess_teta_total_features', [True, False]),
    'preprocess_teta_total_features': hp.quniform('preprocess_teta_total_features', 5000, 18000, 1000),

    # Per-cell normalisation on/off.
    'preprocess_normalize_per_cell': hp.choice('l_preprocess_normalize_per_cell', [True, False]),

    # Gene filter on mean expression (baseline min_mean: 0.0125).
    'bpreprocess_filter_min_mean': hp.choice('bpreprocess_filter_min_mean', [True, False]),
    'preprocess_filter_min_mean': hp.uniform('preprocess_filter_min_mean', 0, 0.5),
    'preprocess_filter_max_mean': hp.uniform('preprocess_filter_max_mean', 0, 6),

    # Gene filter on dispersion (baseline min_disp: 0.5).
    'bpreprocess_filter_min_disp': hp.choice('bpreprocess_filter_min_disp', [True, False]),
    'preprocess_filter_min_disp': hp.uniform('preprocess_filter_min_disp', 0, 1),

    # Regression and scaling (baseline scale: 10).
    'preprocess_regress_out': hp.choice('l_preprocess_regress_out', ['none', 'total_counts']),
    'preprocess_scale': hp.choice('l_preprocess_scale', [-1, 5, 10, 15, 20]),

    # Clustering (baseline: 10 neighbours, 40 PCs).
    'cluster_n_neighbors': hp.quniform('cluster_n_neighbors', 1, 20, 1),
    'cluster_n_pcs': hp.quniform('cluster_n_pcs', 20, 50, 1),

    # Plotting and logging stay off during the search.
    'cluster_plot_pca': False,
    'preprocess_plot_preprocessing': False,
    'evaluate_plot_results': False,
    'preprocess_verbosity': 0,
}

In [6]:
# Pickle file used to persist the hyperopt trials between runs.
# NOTE(review): the file is named after 'deng' while `space` selects the
# 'brainCIDR' dataset — confirm the name is intended so trials from different
# datasets are not mixed.
filename = 'deng_trials.pkl'   
trials=hyperopt_utils.getTrials(filename ,restart = False )

Creating new trials...


In [8]:

def runSeurat(filename, space, max_evals = 2):
    """Search the pipeline parameters in ``space`` with hyperopt's TPE algorithm.

    The search runs in epochs of at most 10 evaluations. After every epoch the
    trials object is pickled to ``filename`` so an interrupted search loses at
    most one epoch of work.

    Parameters
    ----------
    filename : str
        Path of the pickle file. It is passed to ``hyperopt_utils.getTrials``
        to obtain the trials object and is overwritten after every epoch.
    space : dict
        hyperopt search space. It must provide ``cluster_n_neighbors``,
        ``cluster_n_pcs`` and, for each of the five switchable preprocessing
        parameters, both the value key and its ``b``-prefixed boolean switch.
    max_evals : int
        Total number of trials wanted, counting those already stored in the
        trials object. The search never runs past this number.

    Returns
    -------
    The trials object after the last epoch.
    """
    # Parameters that can be switched off; the switch key is 'b' + name.
    switchable = (
        'preprocess_min_genes',
        'preprocess_min_cells',
        'preprocess_teta_total_features',
        'preprocess_filter_min_mean',
        'preprocess_filter_min_disp',
    )

    def evaluateSeurat(args):
        """Objective: run the pipeline once and return the negated Rand index."""
        # quniform samples floats; these two are cast to integers here.
        args['cluster_n_neighbors'] = int(args['cluster_n_neighbors'])
        args['cluster_n_pcs'] = int(args['cluster_n_pcs'])

        # A disabled switch is signalled downstream by the value -1.
        for name in switchable:
            if not args['b' + name]:
                args[name] = -1

        if args['preprocess_teta_total_features'] is not None:
            args['preprocess_teta_total_features'] = int(args['preprocess_teta_total_features'])

        try:
            resultDict = seurat_utils.run(args)
        except Exception as err:
            # Only ordinary errors mark the trial as failed; KeyboardInterrupt
            # and SystemExit propagate so the search can be stopped.
            print(f'>> Trial failed: {err!r}')
            return { 'status' : hyperopt.STATUS_FAIL}

        print(f'>> Result: {resultDict["randIndex"]}')
        # hyperopt minimises the loss, so the score is negated.
        return {
            'loss' : -resultDict['randIndex']
            ,'status' : STATUS_OK
            ,'eval_time' : time.time()
        }

    trials = hyperopt_utils.getTrials(filename ,restart = False )
    evals_per_epoch = 10
    for done in range(len(trials), max_evals, evals_per_epoch):
        # fmin's max_evals is the total size of `trials`; cap it so the last
        # epoch does not overshoot the requested budget.
        fmin(evaluateSeurat
             ,space
             ,algo=tpe.suggest
             ,max_evals=min(done + evals_per_epoch, max_evals)
             ,trials=trials)

        # Persist progress; the context manager closes the file handle.
        with open(filename, 'wb') as fh:
            pickle.dump(trials, fh)
    return trials

# Launch the hyperparameter search with max_evals = 20; the trials are
# pickled to `filename` as the search progresses.
trials = runSeurat(filename, space, max_evals = 20)

Creating new trials...
... reading from cache file ./cache/input-brainCIDR-matrix.h5ad
Loading dataset brainCIDR with 22085 genes and 420 cells
filtered out 5755 genes that are detected in less than 8.0 cells
computing neighbors
    using 'X_pca' with n_pcs = 24
    finished (0:00:03.11) --> added to `.uns['neighbors']`
    'distances', weighted adjacency matrix
    'connectivities', weighted adjacency matrix
computing UMAP
    finished (0:00:01.81) --> added
    'X_umap', UMAP coordinates (adata.obsm)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished (0:00:00.05) --> found 12 clusters and added
    'louvain', the cluster labels (adata.obs, categorical)
Rand_index 0.48955034236836736
>> Result: 0.48955034236836736
... reading from cache file ./cache/input-brainCIDR-matrix.h5ad
Loading dataset brainCIDR with 22085 genes and 420 cells
filtered out 6722 genes that are detected in less than 11.0 cells
computing neighbors
    using 'X_pca' with n_pcs = 

  return np.sum(resid_dev * freq_weights * var_weights / scale)


... reading from cache file ./cache/input-brainCIDR-matrix.h5ad
Loading dataset brainCIDR with 22085 genes and 420 cells
filtered out 7388 genes that are detected in less than 14.0 cells
computing neighbors
    using 'X_pca' with n_pcs = 31
    finished (0:00:00.09) --> added to `.uns['neighbors']`
    'distances', weighted adjacency matrix
    'connectivities', weighted adjacency matrix
computing UMAP
    finished (0:00:00.64) --> added
    'X_umap', UMAP coordinates (adata.obsm)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished (0:00:00.01) --> found 15 clusters and added
    'louvain', the cluster labels (adata.obs, categorical)
Rand_index 0.5142372442867872
>> Result: 0.5142372442867872
... reading from cache file ./cache/input-brainCIDR-matrix.h5ad
Loading dataset brainCIDR with 22085 genes and 420 cells
computing neighbors
    using 'X_pca' with n_pcs = 38
    finished (0:00:00.18) --> added to `.uns['neighbors']`
    'distances', weighted ad

  X /= scale
  if max_value is not None: X[X > max_value] = max_value


computing neighbors
    using 'X_pca' with n_pcs = 50
    finished (0:00:00.11) --> added to `.uns['neighbors']`
    'distances', weighted adjacency matrix
    'connectivities', weighted adjacency matrix
computing UMAP
    finished (0:00:00.75) --> added
    'X_umap', UMAP coordinates (adata.obsm)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished (0:00:00.01) --> found 9 clusters and added
    'louvain', the cluster labels (adata.obs, categorical)
Rand_index 0.5436065512638625
>> Result: 0.5436065512638625
... reading from cache file ./cache/input-brainCIDR-matrix.h5ad
Loading dataset brainCIDR with 22085 genes and 420 cells
filtered out 6962 genes that are detected in less than 12.0 cells
computing neighbors
    using 'X_pca' with n_pcs = 39
    finished (0:00:00.13) --> added to `.uns['neighbors']`
    'distances', weighted adjacency matrix
    'connectivities', weighted adjacency matrix
computing UMAP
    finished (0:00:00.87) --> added
    'X_u

In [53]:
# Tabulate the sampled parameters and the result of every trial.
# NOTE(review): `getResultsAsDf` is neither defined nor imported in the visible
# notebook — presumably it lives in hyperopt_utils; confirm before re-running.
df = getResultsAsDf(trials, space)

Unnamed: 0,load_inputDataset,bpreprocess_min_genes,preprocess_min_genes,bpreprocess_min_cells,preprocess_min_cells,bpreprocess_teta_total_features,preprocess_teta_total_features,preprocess_normalize_per_cell,bpreprocess_filter_min_mean,preprocess_filter_min_mean,...,preprocess_filter_min_disp,preprocess_regress_out,preprocess_scale,cluster_n_neighbors,cluster_n_pcs,cluster_plot_pca,preprocess_plot_preprocessing,evaluate_plot_results,preprocess_verbosity,result
0,brainCIDR,True,190.0,True,8.0,False,5000.0,True,True,0.484101,...,0.793795,none,5,7.0,24.0,False,False,False,0,-0.48955
1,brainCIDR,False,350.0,True,11.0,False,15000.0,True,False,0.48236,...,0.935533,none,20,4.0,28.0,False,False,False,0,-0.44734
2,brainCIDR,False,100.0,True,10.0,True,16000.0,False,False,0.099535,...,0.481102,none,-1,14.0,28.0,False,False,False,0,-0.514611
3,brainCIDR,False,250.0,True,4.0,True,18000.0,False,True,0.09016,...,0.476458,none,-1,4.0,47.0,False,False,False,0,-0.463424
4,brainCIDR,True,60.0,True,6.0,True,14000.0,False,False,0.19339,...,0.865988,total_counts,15,15.0,40.0,False,False,False,0,-0.602003
5,brainCIDR,False,400.0,True,13.0,False,7000.0,True,True,0.36302,...,0.760052,none,5,6.0,29.0,False,False,False,0,-0.451341
6,brainCIDR,False,210.0,False,2.0,True,18000.0,True,True,0.36287,...,0.111937,none,5,9.0,32.0,False,False,False,0,-0.521424
7,brainCIDR,True,150.0,False,4.0,False,18000.0,True,True,0.283363,...,0.171807,none,5,19.0,29.0,False,False,False,0,-0.613811
8,brainCIDR,True,430.0,True,3.0,True,11000.0,False,True,0.337434,...,0.253619,total_counts,20,15.0,29.0,False,False,False,0,-0.553284
9,brainCIDR,True,80.0,False,1.0,False,14000.0,False,False,0.149749,...,0.903608,total_counts,10,7.0,42.0,False,False,False,0,


In [None]:
# Baseline parameter set with plotting switched off.
# NOTE(review): `inputDatasets` is not defined in the visible notebook — confirm its origin.
params = {
    'load_inputDataset': inputDatasets[0],
    'preprocess_min_genes': 200,
    'preprocess_min_cells': 3,
    # Define on the dataset
    'preprocess_teta_total_features': None,
    'preprocess_normalize_per_cell': True,
    'preprocess_filter_min_mean': 0.0125,
    'preprocess_filter_max_mean': 3,
    'preprocess_filter_min_disp': 0.5,
    'preprocess_regress_out': None,
    'preprocess_scale': 10,

    'cluster_n_neighbors': 10,
    'cluster_n_pcs': 40,
    'cluster_plot_pca': False,

    'preprocess_plot_preprocessing': False,
    'evaluate_plot_results': False,
}


In [None]:
# evaluate(params)
# NOTE(review): `evaluate` is not defined in the visible notebook (other cells
# call seurat_utils.run) — confirm this helper still exists.
print(evaluate(params))

In [None]:
# Fixed parameter set for the 'deng' dataset, run once through the pipeline.
# NOTE(review): preprocess_filter_max_mean (0.01) is below
# preprocess_filter_min_mean (0.1) — confirm this is intended.
params = dict(
    cluster_n_neighbors=8,
    cluster_n_pcs=37,
    cluster_plot_pca=False,
    evaluate_plot_results=False,
    load_inputDataset='deng',
    preprocess_filter_max_mean=0.01,
    preprocess_filter_min_disp=0.2843934859859411,
    preprocess_filter_min_mean=0.1,
    preprocess_min_cells=None,
    preprocess_min_genes=None,
    preprocess_normalize_per_cell=True,
    preprocess_plot_preprocessing=False,
    preprocess_regress_out=None,
    preprocess_scale=20,
    preprocess_teta_total_features=None,
    preprocess_verbosity=0,
)
resultDict = seurat_utils.run(params)


## No total counts filter/ regression

In [None]:
# Variant: no total-features threshold and no regression.
# NOTE(review): `inputDatasets` and `results` are not defined in the visible notebook.
params = {
    'load_inputDataset': inputDatasets[0],
    'preprocess_min_genes': 200,
    'preprocess_min_cells': 3,
    # Define on the dataset
    'preprocess_teta_total_features': None,
    'preprocess_normalize_per_cell': True,
    'preprocess_filter_min_mean': 0.0125,
    'preprocess_filter_max_mean': 3,
    'preprocess_filter_min_disp': 0.5,
    'preprocess_regress_out': None,
    'preprocess_scale': 10,

    'cluster_n_neighbors': 10,
    'cluster_n_pcs': 40,
    'cluster_plot_pca': False,

    'evaluate_plot_results': True,
}
resultDict = seurat_utils.run(params)
results.append(resultDict)

## No gene filtering based on dispersion

In [None]:
# Variant: mean/dispersion gene filters and regression set to None.
# NOTE(review): `inputDatasets` and `results` are not defined in the visible notebook.
params = {
    'load_inputDataset': inputDatasets[0],
    'preprocess_min_genes': 200,
    'preprocess_min_cells': 3,
    # Define on the dataset
    'preprocess_teta_total_features': 14000,
    'preprocess_normalize_per_cell': True,
    'preprocess_filter_min_mean': None,
    'preprocess_filter_max_mean': None,
    'preprocess_filter_min_disp': None,
    'preprocess_regress_out': None,
    'preprocess_scale': 10,

    'cluster_n_neighbors': 10,
    'cluster_n_pcs': 40,
    'cluster_plot_pca': False,

    'evaluate_plot_results': True,
}
resultDict = seurat_utils.run(params)
results.append(resultDict)

## No scale

In [None]:
# Variant: scaling set to None, everything else as in the full pipeline.
# NOTE(review): `inputDatasets` is not defined in the visible notebook.
params = {
    'load_inputDataset': inputDatasets[0],
    'preprocess_min_genes': 200,
    'preprocess_min_cells': 3,
    # Define on the dataset
    'preprocess_teta_total_features': 14000,
    'preprocess_normalize_per_cell': True,
    'preprocess_filter_min_mean': 0.0125,
    'preprocess_filter_max_mean': 3,
    'preprocess_filter_min_disp': 0.5,
    'preprocess_regress_out': 'total_counts',
    'preprocess_scale': None,

    'cluster_n_neighbors': 10,
    'cluster_n_pcs': 40,
    'cluster_plot_pca': False,

    'evaluate_plot_results': True,
}
resultDict = seurat_utils.run(params)


In [None]:
# Keep this run for the summary table built from `results` at the end.
# NOTE(review): `results` is never initialised in the visible notebook.
results.append(resultDict)

## No min genes/cells filter

In [None]:
# Variant: every filter, regression and scaling set to None; per-cell
# normalisation stays on.
# NOTE(review): `inputDatasets` is not defined in the visible notebook.
params = {
    'load_inputDataset': inputDatasets[0],
    'preprocess_min_genes': None,
    'preprocess_min_cells': None,
    # Define on the dataset
    'preprocess_teta_total_features': None,
    'preprocess_normalize_per_cell': True,
    'preprocess_filter_min_mean': None,
    'preprocess_filter_max_mean': None,
    'preprocess_filter_min_disp': None,
    'preprocess_regress_out': None,
    'preprocess_scale': None,

    'cluster_n_neighbors': 10,
    'cluster_n_pcs': 40,
    'cluster_plot_pca': False,

    'evaluate_plot_results': True,
}
resultDict = seurat_utils.run(params)


In [None]:
# Keep this run for the summary table built from `results` at the end.
# NOTE(review): `results` is never initialised in the visible notebook.
results.append(resultDict)

## No preprocessing

In [None]:
# Variant: all preprocessing options set to None and per-cell normalisation off.
# NOTE(review): `inputDatasets` is not defined in the visible notebook.
params = {
    'load_inputDataset': inputDatasets[0],
    'preprocess_min_genes': None,
    'preprocess_min_cells': None,
    # Define on the dataset
    'preprocess_teta_total_features': None,
    'preprocess_normalize_per_cell': False,
    'preprocess_filter_min_mean': None,
    'preprocess_filter_max_mean': None,
    'preprocess_filter_min_disp': None,
    'preprocess_regress_out': None,
    'preprocess_scale': None,

    'cluster_n_neighbors': 10,
    'cluster_n_pcs': 40,
    'cluster_plot_pca': False,

    'evaluate_plot_results': True,
}
resultDict = seurat_utils.run(params)


In [None]:
# Keep this run for the summary table built from `results` at the end.
# NOTE(review): `results` is never initialised in the visible notebook.
results.append(resultDict)

## Normalization only, fewer clusters

In [None]:
# Variant: normalisation only, with 8 neighbours and 30 PCs.
# NOTE(review): `inputDatasets` and `results` are not defined in the visible notebook.
params = {
    'load_inputDataset': inputDatasets[0],
    'preprocess_min_genes': None,
    'preprocess_min_cells': None,
    # Define on the dataset
    'preprocess_teta_total_features': None,
    'preprocess_normalize_per_cell': True,
    'preprocess_filter_min_mean': None,
    'preprocess_filter_max_mean': None,
    'preprocess_filter_min_disp': None,
    'preprocess_regress_out': None,
    'preprocess_scale': None,

    'cluster_n_neighbors': 8,
    'cluster_n_pcs': 30,
    'cluster_plot_pca': False,

    'evaluate_plot_results': True,
}
resultDict = seurat_utils.run(params)
results.append(resultDict)

## Normalization only, more clusters

In [None]:
# Variant: normalisation only, with 15 neighbours and 50 PCs.
# NOTE(review): `inputDatasets` and `results` are not defined in the visible notebook.
params = {
    'load_inputDataset': inputDatasets[0],
    'preprocess_min_genes': None,
    'preprocess_min_cells': None,
    # Define on the dataset
    'preprocess_teta_total_features': None,
    'preprocess_normalize_per_cell': True,
    'preprocess_filter_min_mean': None,
    'preprocess_filter_max_mean': None,
    'preprocess_filter_min_disp': None,
    'preprocess_regress_out': None,
    'preprocess_scale': None,

    'cluster_n_neighbors': 15,
    'cluster_n_pcs': 50,
    'cluster_plot_pca': False,

    'evaluate_plot_results': True,
}
resultDict = seurat_utils.run(params)
results.append(resultDict)

## Filter less cells

In [None]:
# Variant: looser thresholds (min_cells 2, min_mean 0.01, max_mean 4, scale 15).
# NOTE(review): `inputDatasets` and `results` are not defined in the visible notebook.
params = {
    'load_inputDataset': inputDatasets[0],
    'preprocess_min_genes': 200,
    'preprocess_min_cells': 2,
    # Define on the dataset
    'preprocess_teta_total_features': 14000,
    'preprocess_normalize_per_cell': True,
    'preprocess_filter_min_mean': 0.01,
    'preprocess_filter_max_mean': 4,
    'preprocess_filter_min_disp': 0.5,
    'preprocess_regress_out': 'total_counts',
    'preprocess_scale': 15,

    'cluster_n_neighbors': 10,
    'cluster_n_pcs': 40,
    'cluster_plot_pca': False,

    'evaluate_plot_results': True,
}
resultDict = seurat_utils.run(params)
results.append(resultDict)

In [None]:
# Full pipeline: all filters, total_counts regression and scaling to 10.
# NOTE(review): `inputDatasets` and `results` are not defined in the visible notebook.
params = {
    'load_inputDataset': inputDatasets[0],
    'preprocess_min_genes': 200,
    'preprocess_min_cells': 3,
    # Define on the dataset
    'preprocess_teta_total_features': 14000,
    'preprocess_normalize_per_cell': True,
    'preprocess_filter_min_mean': 0.0125,
    'preprocess_filter_max_mean': 3,
    'preprocess_filter_min_disp': 0.5,
    'preprocess_regress_out': 'total_counts',
    'preprocess_scale': 10,

    'cluster_n_neighbors': 10,
    'cluster_n_pcs': 40,
    'cluster_plot_pca': False,

    'evaluate_plot_results': True,
}
resultDict = seurat_utils.run(params)
results.append(resultDict)

## Render all combinations

In [None]:
# Show the collected result dictionaries as one table, one row per run.
pd.DataFrame( results)