Commented core funcs, adding (singleR runtime warning, KNN caching, tooltips)

euxhenh committed Nov 4, 2021
1 parent 9833deb commit 8226fe1
Showing 17 changed files with 664 additions and 180 deletions.
107 changes: 85 additions & 22 deletions controller/cellar/core/_cluster.py
@@ -2,21 +2,45 @@
import leidenalg as la
import numpy as np
from app import logger
from scipy.sparse import csr_matrix, issparse
from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralClustering
from sklearn_extra.cluster import KMedoids

from ..utils.exceptions import UserError
from ..utils.validation import _validate_clu_n_clusters
from ._cluster_multiple import cluster_multiple
from ._evaluation import get_eval_obj
from ._neighbors import faiss_knn, knn_auto


def _has_neighbors(adata, params, key='neighs'):
"""
    Checks whether adata already contains a cached neighbors graph computed
    with the same parameters, i.e., whether params equals the dict stored
    under adata.uns[key] and a sparse graph exists under adata.obsp[key].
    Parameters
    __________
    adata: anndata.AnnData object
    params: dict
        Dictionary of neighbor parameters.
    key: str
        Key to look for neighbor params under adata.uns.
"""
if key in adata.uns:
old_params = adata.uns[key]
if isinstance(old_params, dict) and isinstance(params, dict):
if params == old_params and key in adata.obsp:
if issparse(adata.obsp[key]):
return True
return False
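
A minimal usage sketch of the cache check (hypothetical adata; the config
keys mirror the neigh_config dict built in cl_Leiden below):

    # Hypothetical sketch, not part of the commit.
    config = {'x_to_use': 'x_emb', 'n_neighbors': 15,
              'mode': 'connectivity', 'graph_method': 'auto'}
    if _has_neighbors(adata, config, key='neighs'):
        # Reuse the cached sparse graph instead of recomputing KNN
        sources, targets = adata.obsp['neighs'].nonzero()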


def cl_Leiden(
adata, key='labels', x_to_use='x_emb', clear_annotations=True,
partition_type='RBConfigurationVertexPartition', directed=True,
graph_method='auto', n_neighbors=15, use_weights=False, **kwargs):
graph_method='auto', n_neighbors=15, use_cached_neigh=True,
use_weights=False, neigh_key='neighs', **kwargs):
"""
    Runs the Leiden clustering algorithm:
https://www.nature.com/articles/s41598-019-41695-z
@@ -26,7 +50,7 @@ def cl_Leiden(
adata: anndata.AnnData object
key: str
        Name of the key to use for labels under adata.obs.
x_to_use: str
        Key name in adata.obsm to use as feature vectors. If set to 'x',
        will use adata.X.
@@ -53,50 +77,89 @@
to compute approximate nearest neighbors.
n_neighbors: int
Number of nearest neighbors to find using the method specified above.
use_cached_neigh: bool
        If True, check whether adata already contains neighbors computed
        with the same parameters as the current ones, and reuse them
        instead of recomputing.
use_weights: bool
Set to True to construct a weighted graph (with weights being the
distances between points).
neigh_key: str
        Key under which to store the neighbors graph in adata.obsp and
        its parameters in adata.uns.
**kwargs: dict
Any additional arguments will be passed to la.find_partition
"""
    config = {}
    neigh_config = {}

    if clear_annotations and 'annotations' in adata.obs:
        _ = adata.obs.pop('annotations')

# Using adata.X to perform clustering can be very slow
neigh_config['x_to_use'] = x_to_use
if x_to_use == 'x':
x_to_use = adata.X
if x_to_use.shape[1] > 500:
logger.warn(
"Data is very high-dimensional. This process can " +
"be slow and may yield suboptimal results. Running " +
"dimensionality reduction first is highly recommended.")
else:
if x_to_use not in adata.obsm:
raise UserError("No embeddings found. Please " +
"run dimensionality reduction first.")
x_to_use = adata.obsm[x_to_use]

neigh_config['n_neighbors'] = n_neighbors
neigh_config['mode'] = 'connectivity'
neigh_config['graph_method'] = graph_method
# Check if adata contains neighbors and the configuration is the same
if use_cached_neigh and _has_neighbors(adata, neigh_config, neigh_key):
logger.info("Using cached neighbors.")
sources, targets = adata.obsp[neigh_key].nonzero()
else:
# (Re)Compute nearest neighbors
if n_neighbors >= x_to_use.shape[0]:
n_neighbors = x_to_use.shape[0] // 2 + 1
            logger.warn(
                "Number of neighbors is greater than or equal to " +
                f"the dataset size. Using {n_neighbors} neighbors instead.")
sources, targets, weights = knn_auto(
x_to_use, n_neighbors=n_neighbors,
mode='connectivity', method=graph_method)
adata.uns[neigh_key] = neigh_config
adata.obsp[neigh_key] = csr_matrix(
(weights, (sources, targets)), shape=(x_to_use.shape[0],) * 2)

# weights = 1 / weights
# TODO: construct weighted graph
# kwargs['weights'] = weights if use_weights else None
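    # If weighted clustering is enabled one day, something like the following
    # could run after the graph below is built (hypothetical, untested sketch;
    # note that weights is undefined on the cached-neighbors branch):
    # gg.es['weight'] = 1 / weights
    # part = la.find_partition(gg, getattr(la, partition_type),
    #                          weights=gg.es['weight'], **kwargs)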

# Construct graph
gg = ig.Graph(directed=directed)
config['directed'] = directed
gg.add_vertices(x_to_use.shape[0])
# This step can be slow if number of datapoints is really large
# Add edges as list of pairs (unweighted)
    gg.add_edges(list(zip(sources, targets)))

# Empty input boxes are parsed as empty strings
for k in kwargs:
if kwargs[k] == '':
kwargs[k] = None
# These two partitions do not use resolution
if partition_type in [
'ModularityVertexPartition', 'SurpriseVertexPartition']:
if 'resolution_parameter' in kwargs:
_ = kwargs.pop('resolution_parameter')
if not hasattr(la, partition_type):
raise UserError(f"No partition type '{partition_type}' found.")
# Perform the clustering
part = la.find_partition(gg, getattr(la, partition_type), **kwargs)

config['partition_type'] = partition_type
# Merge the two dictionaries to also include kwargs passed to leidenalg
config = {**config, **kwargs}
config['method'] = 'Leiden'
adata.obs[key] = np.array(part.membership, dtype=int)
adata.uns[key] = config
logger.info(f"Graph modularity: {gg.modularity(part.membership)}")


def _get_wrapper(x, obj_def, n_clusters=np.array([2, 4, 8, 16]),
91 changes: 79 additions & 12 deletions controller/cellar/core/_dim_reduction.py
@@ -1,7 +1,10 @@
import gc

import anndata2ri
import numpy as np
import rpy2.robjects as ro
from anndata._core.sparse_dataset import SparseDataset
from app import logger
from controller.cellar.utils.exceptions import InvalidArgument
from pydiffmap import diffusion_map as dm
from rpy2.robjects import numpy2ri, r
@@ -23,7 +26,31 @@


def get_func(func_name):
"""
Use a functional to generate a function for each dimensionality reduction
method since they all have the same interface.
"""
method = func_name[3:] # Parse the method name

def _func(adata, key, x_to_use, **kwargs):
"""
Reduces the dimensionality of the data using the 'func_name' method.
Parameters
__________
adata: anndata.AnnData object
key: str
Key to store the reduced data under adata.obsm
x_to_use: str
Can be 'x' or 'x_emb'. If set to 'x', will use adata.X
to reduce the data. Otherwise will use adata.obsm['x_emb'].
We need the latter when this function is called to find 2D
embeddings.
kwargs: dict
Any additional arguments passed to the constructor of func_name.
"""
# Empty input boxes are parsed as empty strings
for k in kwargs:
if kwargs[k] == '':
kwargs[k] = None
@@ -35,29 +62,36 @@ def _func(adata, key, x_to_use, **kwargs):
if isinstance(adata.X, SparseDataset) or issparse(adata.X):
if func_name not in ['cl_TruncatedSVD', 'cl_UMAP']:
raise InvalidArgument(
"Sparse data is not supported using the selected" +
" method. Please choose TruncatedSVD or UMAP.")
"Sparse data is not supported using the selected "
"reduction method. "
"Please choose TruncatedSVD or UMAP.")
if adata.isbacked:
x_to_use = x_to_use.to_memory()
else:
x_to_use = adata.obsm['x_emb']

        # Diffusion maps use a different parameter name for the number of components
comp_key = 'n_evecs' if func_name == 'cl_Diffmap' else 'n_components'
# If no number of components was found in kwargs, assume this
# method was run for visualizing the data and set n_components to 2.
if comp_key not in kwargs:
kwargs[comp_key] = 2

mins = min(x_to_use.shape[0], x_to_use.shape[1])
if kwargs[comp_key] >= mins:
raise InvalidArgument(
"Number of components is higher than " +
"Number of components is higher than or equal to " +
f"min(samples, features) = {mins}.")

fitter = func_map[func_name](**kwargs)
adata.obsm[key] = fitter.fit_transform(x_to_use)
adata.uns[key] = kwargs.copy()
adata.uns[key]['method'] = method

return _func


# Assign every method to a global function
for func_name in func_map.keys():
globals()[func_name] = get_func(func_name)
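
A sketch of calling one of the generated functions (hypothetical; assumes
'cl_PCA' is among the keys of func_map and maps to sklearn's PCA):

    # Hypothetical sketch, not part of the commit.
    import anndata
    import numpy as np

    adata = anndata.AnnData(np.random.rand(50, 30).astype(np.float32))
    cl_PCA(adata, key='x_emb', x_to_use='x', n_components=10)
    print(adata.obsm['x_emb'].shape)  # (50, 10)
    print(adata.uns['x_emb'])         # {'n_components': 10, 'method': 'PCA'}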

@@ -69,12 +103,40 @@ def _correct_bin_names(bin_names):
return bin_names


def cl_cisTopic(adata, key, x_to_use, topics=40, iterations=150, **kwargs):
"""
In Cellar, cisTopic is meant to be used with scATAC-seq data.
https://www.nature.com/articles/s41592-019-0367-1
This method uses LDA to infer cis regulatory topics. We use it here
solely as a "dimensionality reduction" method where the topics
found can serve as components. Since cisTopic is only available for R,
we rely on rpy2 to call R functions.
Parameters
__________
adata: anndata.AnnData object
key: str
Key to store the reduced data under adata.obsm
x_to_use: str
Ignored. Present only for consistency.
topics: int
Number of topics to consider. Will run cisTopic for
topics - 5, topics, and topics + 5 and select the best one.
iterations: int
Number of iterations.
kwargs: dict
Ignored. Present only for consistency.
"""
topics = int(topics)
topic_list = [topics, topics + 5, topics - 5]
# Unfortunately, with most R functions we cannot use backed mode
# so we have to load adata into memory. This can potentially lead to
# memory issues if multiple adatas are found in memory at the same time.
# Also transpose matrix since cisTopic accepts data in (bin, cell) format.
mat = adata.to_memory().X.T.copy()

# If mat is sparse, then we convert mat to an R friendly sparse format
if issparse(mat):
mat = mat.tocoo()
r_Matrix = importr("Matrix")
@@ -85,22 +147,19 @@ def cl_cisTopic(
dims=ro.IntVector(mat.shape))
else:
mat = numpy2ri.py2rpy(mat)

# Set row and column names
var_names = _correct_bin_names(adata.var_names.to_numpy())
mat = r("`rownames<-`")(mat, ro.vectors.StrVector(var_names))
mat = r("`colnames<-`")(mat, ro.vectors.StrVector(adata.obs.index))

cisTopic = importr('cisTopic')

    logger.info('Creating cisTopic object.')
    cc = cisTopic.createcisTopicObject(mat)

    logger.info('Running LDA Models. This may take a while...')
cc = cisTopic.runWarpLDAModels(
cc,
topic=numpy2ri.py2rpy(topic_list),
        nCores=2,  # Careful with this, since each run duplicates the data
iterations=iterations,
addModels=False,
returnType='selectedModel')
@@ -110,6 +169,14 @@ def cl_cisTopic(
cellassign = np.array(cellassign).T.copy().astype(np.float32)

adata.obsm[key] = cellassign
adata.uns[key] = {
'method': 'cisTopic',
'topics': topics,
'iterations': iterations
}
# Clean mat
del mat
gc.collect()
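
A sketch of the expected call (hypothetical AnnData holding binarized
scATAC-seq counts; requires the R packages Matrix and cisTopic):

    # Hypothetical sketch, not part of the commit.
    cl_cisTopic(adata_atac, key='x_emb', x_to_use='x', topics=40)
    print(adata_atac.obsm['x_emb'].shape)  # (n_cells, n_selected_topics)
    print(adata_atac.uns['x_emb'])  # {'method': 'cisTopic', 'topics': 40, ...}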


def clear_x_emb_dependends(adata):
