Commented core funcs, adding (singleR runtime warning, KNN caching, tooltips)

euxhenh committed Nov 4, 2021
1 parent 9833deb commit 8226fe1
Showing 17 changed files with 664 additions and 180 deletions.
107 changes: 85 additions & 22 deletions controller/cellar/core/_cluster.py
@@ -2,21 +2,45 @@
import leidenalg as la
import numpy as np
from app import logger
from scipy.sparse import csr_matrix, issparse
from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralClustering
from sklearn_extra.cluster import KMedoids

from ..utils.exceptions import UserError
from ..utils.validation import _validate_clu_n_clusters
from ._cluster_multiple import cluster_multiple
from ._evaluation import get_eval_obj
from ._neighbors import faiss_knn, knn_auto


def _has_neighbors(adata, params, key='neighs'):
"""
    Checks whether adata already contains a cached neighbors graph computed
    with the same parameters, i.e., whether params equals the dict stored
    under adata.uns[key] and a sparse graph exists under adata.obsp[key].
    Parameters
    __________
    adata: anndata.AnnData object
    params: dict
        Dictionary of neighbor parameters.
    key: str
        Key to look for neighbor params under adata.uns.
"""
if key in adata.uns:
old_params = adata.uns[key]
if isinstance(old_params, dict) and isinstance(params, dict):
if params == old_params and key in adata.obsp:
if issparse(adata.obsp[key]):
return True
return False
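
A minimal usage sketch of the cache check (hypothetical adata; the config
keys mirror the neigh_config dict built in cl_Leiden below):

    # Hypothetical sketch, not part of the commit.
    config = {'x_to_use': 'x_emb', 'n_neighbors': 15,
              'mode': 'connectivity', 'graph_method': 'auto'}
    if _has_neighbors(adata, config, key='neighs'):
        # Reuse the cached sparse graph instead of recomputing KNN
        sources, targets = adata.obsp['neighs'].nonzero()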


def cl_Leiden(
adata, key='labels', x_to_use='x_emb', clear_annotations=True,
partition_type='RBConfigurationVertexPartition', directed=True,
graph_method='auto', n_neighbors=15, use_weights=False, **kwargs):
graph_method='auto', n_neighbors=15, use_cached_neigh=True,
use_weights=False, neigh_key='neighs', **kwargs):
"""
    Runs the Leiden clustering algorithm:
https://www.nature.com/articles/s41598-019-41695-z
@@ -26,7 +50,7 @@ def cl_Leiden(
adata: anndata.AnnData object
key: str
        Name of the key to use for labels under adata.obs.
x_to_use: str
        Key name in adata.obsm to use as feature vectors. If set to 'x',
        will use adata.X.
@@ -53,50 +77,89 @@
to compute approximate nearest neighbors.
n_neighbors: int
Number of nearest neighbors to find using the method specified above.
use_cached_neigh: bool
        If True, check whether adata already contains neighbors computed
        with the same parameters as the current ones, and reuse them
        instead of recomputing.
use_weights: bool
Set to True to construct a weighted graph (with weights being the
distances between points).
neigh_key: str
        Key under which to store the neighbors graph in adata.obsp and
        its parameters in adata.uns.
**kwargs: dict
Any additional arguments will be passed to la.find_partition
"""
    config = {}
    neigh_config = {}

    if clear_annotations and 'annotations' in adata.obs:
        _ = adata.obs.pop('annotations')

# Using adata.X to perform clustering can be very slow
neigh_config['x_to_use'] = x_to_use
if x_to_use == 'x':
x_to_use = adata.X
if x_to_use.shape[1] > 500:
logger.warn(
"Data is very high-dimensional. This process can " +
"be slow and may yield suboptimal results. Running " +
"dimensionality reduction first is highly recommended.")
else:
if x_to_use not in adata.obsm:
raise UserError("No embeddings found. Please " +
"run dimensionality reduction first.")
x_to_use = adata.obsm[x_to_use]

neigh_config['n_neighbors'] = n_neighbors
neigh_config['mode'] = 'connectivity'
neigh_config['graph_method'] = graph_method
# Check if adata contains neighbors and the configuration is the same
if use_cached_neigh and _has_neighbors(adata, neigh_config, neigh_key):
logger.info("Using cached neighbors.")
sources, targets = adata.obsp[neigh_key].nonzero()
else:
# (Re)Compute nearest neighbors
if n_neighbors >= x_to_use.shape[0]:
n_neighbors = x_to_use.shape[0] // 2 + 1
            logger.warn(
                "Number of neighbors is greater than or equal to " +
                f"the dataset size. Using {n_neighbors} neighbors instead.")
sources, targets, weights = knn_auto(
x_to_use, n_neighbors=n_neighbors,
mode='connectivity', method=graph_method)
adata.uns[neigh_key] = neigh_config
adata.obsp[neigh_key] = csr_matrix(
(weights, (sources, targets)), shape=(x_to_use.shape[0],) * 2)

# weights = 1 / weights
# TODO: construct weighted graph
# kwargs['weights'] = weights if use_weights else None
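    # If weighted clustering is enabled one day, something like the following
    # could run after the graph below is built (hypothetical, untested sketch;
    # note that weights is undefined on the cached-neighbors branch):
    # gg.es['weight'] = 1 / weights
    # part = la.find_partition(gg, getattr(la, partition_type),
    #                          weights=gg.es['weight'], **kwargs)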

# Construct graph
gg = ig.Graph(directed=directed)
config['directed'] = directed
gg.add_vertices(x_to_use.shape[0])
# This step can be slow if number of datapoints is really large
# Add edges as list of pairs (unweighted)
    gg.add_edges(list(zip(sources, targets)))

# Empty input boxes are parsed as empty strings
for k in kwargs:
if kwargs[k] == '':
kwargs[k] = None
# These two partitions do not use resolution
if partition_type in [
'ModularityVertexPartition', 'SurpriseVertexPartition']:
if 'resolution_parameter' in kwargs:
_ = kwargs.pop('resolution_parameter')
if not hasattr(la, partition_type):
raise UserError(f"No partition type '{partition_type}' found.")
# Perform the clustering
part = la.find_partition(gg, getattr(la, partition_type), **kwargs)

config['partition_type'] = partition_type
# Merge the two dictionaries to also include kwargs passed to leidenalg
config = {**config, **kwargs}
config['method'] = 'Leiden'
adata.obs[key] = np.array(part.membership, dtype=int)
adata.uns[key] = config
logger.info(f"Graph modularity: {gg.modularity(part.membership)}")


def _get_wrapper(x, obj_def, n_clusters=np.array([2, 4, 8, 16]),
91 changes: 79 additions & 12 deletions controller/cellar/core/_dim_reduction.py
@@ -1,7 +1,10 @@
import gc

import anndata2ri
import numpy as np
import rpy2.robjects as ro
from anndata._core.sparse_dataset import SparseDataset
from app import logger
from controller.cellar.utils.exceptions import InvalidArgument
from pydiffmap import diffusion_map as dm
from rpy2.robjects import numpy2ri, r
@@ -23,7 +26,31 @@


def get_func(func_name):
"""
Use a functional to generate a function for each dimensionality reduction
method since they all have the same interface.
"""
method = func_name[3:] # Parse the method name

def _func(adata, key, x_to_use, **kwargs):
"""
Reduces the dimensionality of the data using the 'func_name' method.
Parameters
__________
adata: anndata.AnnData object
key: str
Key to store the reduced data under adata.obsm
x_to_use: str
Can be 'x' or 'x_emb'. If set to 'x', will use adata.X
to reduce the data. Otherwise will use adata.obsm['x_emb'].
We need the latter when this function is called to find 2D
embeddings.
kwargs: dict
Any additional arguments passed to the constructor of func_name.
"""
# Empty input boxes are parsed as empty strings
for k in kwargs:
if kwargs[k] == '':
kwargs[k] = None
@@ -35,29 +62,36 @@ def _func(adata, key, x_to_use, **kwargs):
if isinstance(adata.X, SparseDataset) or issparse(adata.X):
if func_name not in ['cl_TruncatedSVD', 'cl_UMAP']:
raise InvalidArgument(
"Sparse data is not supported using the selected" +
" method. Please choose TruncatedSVD or UMAP.")
"Sparse data is not supported using the selected "
"reduction method. "
"Please choose TruncatedSVD or UMAP.")
if adata.isbacked:
x_to_use = x_to_use.to_memory()
else:
x_to_use = adata.obsm['x_emb']

        # Diffusion maps use a different parameter name for the number of components
comp_key = 'n_evecs' if func_name == 'cl_Diffmap' else 'n_components'
# If no number of components was found in kwargs, assume this
# method was run for visualizing the data and set n_components to 2.
if comp_key not in kwargs:
kwargs[comp_key] = 2

mins = min(x_to_use.shape[0], x_to_use.shape[1])
if kwargs[comp_key] >= mins:
raise InvalidArgument(
"Number of components is higher than " +
"Number of components is higher than or equal to " +
f"min(samples, features) = {mins}.")

fitter = func_map[func_name](**kwargs)
adata.obsm[key] = fitter.fit_transform(x_to_use)
adata.uns[key] = kwargs.copy()
adata.uns[key]['method'] = method

return _func


# Assign every method to a global function
for func_name in func_map.keys():
globals()[func_name] = get_func(func_name)
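
A sketch of calling one of the generated functions (hypothetical; assumes
'cl_PCA' is among the keys of func_map and maps to sklearn's PCA):

    # Hypothetical sketch, not part of the commit.
    import anndata
    import numpy as np

    adata = anndata.AnnData(np.random.rand(50, 30).astype(np.float32))
    cl_PCA(adata, key='x_emb', x_to_use='x', n_components=10)
    print(adata.obsm['x_emb'].shape)  # (50, 10)
    print(adata.uns['x_emb'])         # {'n_components': 10, 'method': 'PCA'}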

@@ -69,12 +103,40 @@ def _correct_bin_names(bin_names):
return bin_names


def cl_cisTopic(adata, key, x_to_use, topics=40, iterations=150, **kwargs):
"""
In Cellar, cisTopic is meant to be used with scATAC-seq data.
https://www.nature.com/articles/s41592-019-0367-1
This method uses LDA to infer cis regulatory topics. We use it here
solely as a "dimensionality reduction" method where the topics
found can serve as components. Since cisTopic is only available for R,
we rely on rpy2 to call R functions.
Parameters
__________
adata: anndata.AnnData object
key: str
Key to store the reduced data under adata.obsm
x_to_use: str
Ignored. Present only for consistency.
topics: int
Number of topics to consider. Will run cisTopic for
topics - 5, topics, and topics + 5 and select the best one.
iterations: int
Number of iterations.
kwargs: dict
Ignored. Present only for consistency.
"""
topics = int(topics)
topic_list = [topics, topics + 5, topics - 5]
# Unfortunately, with most R functions we cannot use backed mode
# so we have to load adata into memory. This can potentially lead to
# memory issues if multiple adatas are found in memory at the same time.
# Also transpose matrix since cisTopic accepts data in (bin, cell) format.
mat = adata.to_memory().X.T.copy()

# If mat is sparse, then we convert mat to an R friendly sparse format
if issparse(mat):
mat = mat.tocoo()
r_Matrix = importr("Matrix")
@@ -85,22 +147,19 @@ def cl_cisTopic(
dims=ro.IntVector(mat.shape))
else:
mat = numpy2ri.py2rpy(mat)

# Set row and column names
var_names = _correct_bin_names(adata.var_names.to_numpy())
mat = r("`rownames<-`")(mat, ro.vectors.StrVector(var_names))
mat = r("`colnames<-`")(mat, ro.vectors.StrVector(adata.obs.index))

cisTopic = importr('cisTopic')

    logger.info('Creating cisTopic object.')
    cc = cisTopic.createcisTopicObject(mat)

    logger.info('Running LDA Models. This may take a while...')
cc = cisTopic.runWarpLDAModels(
cc,
topic=numpy2ri.py2rpy(topic_list),
        nCores=2,  # Careful with this, since each run duplicates the data
iterations=iterations,
addModels=False,
returnType='selectedModel')
@@ -110,6 +169,14 @@ def cl_cisTopic(
cellassign = np.array(cellassign).T.copy().astype(np.float32)

adata.obsm[key] = cellassign
adata.uns[key] = {
'method': 'cisTopic',
'topics': topics,
'iterations': iterations
}
# Clean mat
del mat
gc.collect()
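
A sketch of the expected call (hypothetical AnnData holding binarized
scATAC-seq counts; requires the R packages Matrix and cisTopic):

    # Hypothetical sketch, not part of the commit.
    cl_cisTopic(adata_atac, key='x_emb', x_to_use='x', topics=40)
    print(adata_atac.obsm['x_emb'].shape)  # (n_cells, n_selected_topics)
    print(adata_atac.uns['x_emb'])  # {'method': 'cisTopic', 'topics': 40, ...}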


def clear_x_emb_dependends(adata):
