Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[python] Start move to {obs/var} specific arguments for get_anndata #1149

Merged
merged 12 commits into from
Jun 28, 2024
30 changes: 26 additions & 4 deletions api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"""

from typing import Literal, Optional, Sequence
from warnings import warn

import anndata
import pandas as pd
Expand Down Expand Up @@ -38,6 +39,8 @@ def get_anndata(
column_names: Optional[soma.AxisColumnNames] = None,
obs_embeddings: Optional[Sequence[str]] = (),
var_embeddings: Optional[Sequence[str]] = (),
obs_column_names: Optional[Sequence[str]] = None,
var_column_names: Optional[Sequence[str]] = None,
) -> anndata.AnnData:
"""Convenience wrapper around :class:`tiledbsoma.Experiment` query, to build and execute a query,
and return it as an :class:`anndata.AnnData` object.
Expand Down Expand Up @@ -65,8 +68,6 @@ def get_anndata(
var_coords:
Coordinates for the ``var`` axis, which is indexed by the ``soma_joinid`` value.
May be an ``int``, a list of ``int``, or a slice. The default, ``None``, selects all.
column_names:
Columns to fetch for ``obs`` and ``var`` dataframes.
obsm_layers:
Additional obsm layers to read and return in the ``obsm`` slot.
obsp_layers:
Expand All @@ -83,6 +84,10 @@ def get_anndata(
Additional embeddings to be returned as part of the ``varm`` slot.
Use :func:`get_all_available_embeddings` to retrieve available embeddings
for this Census version and organism.
obs_column_names:
Columns to fetch for ``obs`` dataframe.
var_column_names:
Columns to fetch for ``var`` dataframe.

Returns:
An :class:`anndata.AnnData` object containing the census slice.
Expand All @@ -93,7 +98,7 @@ def get_anndata(
Examples:
>>> get_anndata(census, "Mus musculus", obs_value_filter="tissue_general in ['brain', 'lung']")

>>> get_anndata(census, "Homo sapiens", column_names={"obs": ["tissue"]})
>>> get_anndata(census, "Homo sapiens", obs_column_names=["tissue"])

>>> get_anndata(census, "Homo sapiens", obs_coords=slice(0, 1000))
"""
Expand All @@ -107,14 +112,31 @@ def get_anndata(
if varm_layers and var_embeddings and set(varm_layers) & set(var_embeddings):
raise ValueError("Cannot request both `varm_layers` and `var_embeddings` for the same embedding name")

# Backwards compat for old column_names argument
if column_names is not None:
if obs_column_names is not None or var_column_names is not None:
raise ValueError(
"Both the deprecated 'column_names' argument and it's replacements were used. Please use 'obs_column_names' and 'var_column_names' only."
ivirshup marked this conversation as resolved.
Show resolved Hide resolved
)
else:
warn(
"The argument `column_names` is deprecated and will be removed in a future release. Please use `obs_column_names` and `var_column_names` instead.",
FutureWarning,
stacklevel=2,
)
if "obs" in column_names:
obs_column_names = column_names["obs"]
if "var" in column_names:
var_column_names = column_names["var"]

with exp.axis_query(
measurement_name,
obs_query=soma.AxisQuery(value_filter=obs_value_filter, coords=obs_coords),
var_query=soma.AxisQuery(value_filter=var_value_filter, coords=var_coords),
) as query:
adata = query.to_anndata(
X_name=X_name,
column_names=column_names,
column_names={"obs": obs_column_names, "var": var_column_names},
X_layers=X_layers,
obsm_layers=obsm_layers,
varm_layers=varm_layers,
Expand Down
78 changes: 68 additions & 10 deletions api/python/cellxgene_census/tests/test_get_anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,14 @@ def test_get_anndata_value_filter(census: soma.Collection) -> None:
organism="Mus musculus",
obs_value_filter="tissue_general == 'vasculature'",
var_value_filter="feature_name in ['Gm53058', '0610010K14Rik']",
column_names={
"obs": [
"soma_joinid",
"cell_type",
"tissue",
"tissue_general",
"assay",
],
"var": ["soma_joinid", "feature_id", "feature_name", "feature_length"],
},
obs_column_names=[
"soma_joinid",
"cell_type",
"tissue",
"tissue_general",
"assay",
],
var_column_names=["soma_joinid", "feature_id", "feature_name", "feature_length"],
)

assert ad is not None
Expand Down Expand Up @@ -253,6 +251,66 @@ def test_get_anndata_obsm_layers_and_add_obs_embedding_fails(lts_census: soma.Co
)


@pytest.mark.live_corpus
def test_deprecated_column_api(census: soma.Collection) -> None:
    """Check the deprecated ``column_names`` argument against its replacements.

    Three behaviours are verified:

    * ``obs_column_names`` / ``var_column_names`` work without ``column_names``.
    * ``column_names`` still works, emits a ``FutureWarning``, and yields the
      same ``obs`` and ``var`` dataframes as the new arguments.
    * Combining ``column_names`` with either replacement raises ``ValueError``.

    See: https://github.com/chanzuckerberg/cellxgene-census/issues/1035
    """
    obs_columns = [
        "soma_joinid",
        "cell_type",
        "tissue",
        "tissue_general",
        "assay",
    ]
    var_columns = ["soma_joinid", "feature_id", "feature_name", "feature_length"]

    # Every call below issues the same query; only the column-selection
    # arguments differ, so the shared part is spelled out exactly once.
    query_kwargs = {
        "organism": "Mus musculus",
        "obs_value_filter": "tissue_general == 'vasculature'",
        "var_value_filter": "feature_name in ['Gm53058', '0610010K14Rik']",
    }

    # New-style, per-axis arguments.
    ad_curr = cellxgene_census.get_anndata(
        census,
        obs_column_names=list(obs_columns),
        var_column_names=list(var_columns),
        **query_kwargs,
    )

    # Old-style dict argument: must still work, but warn about the deprecation.
    with pytest.warns(FutureWarning, match=r"`column_names` is deprecated"):
        ad_prev = cellxgene_census.get_anndata(
            census,
            column_names={"obs": list(obs_columns), "var": list(var_columns)},
            **query_kwargs,
        )

    # Mixing both styles is ambiguous and must be rejected. The pattern mirrors
    # the library's error message verbatim, so it has to stay in sync with it.
    with pytest.raises(
        ValueError, match=r"Both the deprecated 'column_names' argument and it's replacements were used."
    ):
        cellxgene_census.get_anndata(
            census,
            obs_column_names=[
                "soma_joinid",
                "cell_type",
            ],
            column_names={
                "obs": [
                    "soma_joinid",
                    "cell_type",
                ],
            },
            **query_kwargs,
        )

    # Both spellings must select identical obs/var metadata.
    pd.testing.assert_frame_equal(ad_curr.obs, ad_prev.obs)
    pd.testing.assert_frame_equal(ad_curr.var, ad_prev.var)


def _map_to_get_anndata_args(query: Dict[str, Any], axis: Literal["obs", "var"]) -> Dict[str, Any]:
"""Helper to map arguments of get_obs/ get_var to get_anndata."""
result = {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@
" organism=\"homo_sapiens\",\n",
" measurement_name=\"RNA\",\n",
" obs_value_filter=\"tissue_general == 'central nervous system'\",\n",
" column_names={\"obs\": [\"cell_type\"]},\n",
" obs_column_names=[\"cell_type\"],\n",
" obs_embeddings=emb_names,\n",
")\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@
" measurement_name=\"RNA\",\n",
" obs_value_filter=\"tissue == 'cardiac atrium'\",\n",
" var_value_filter=\"feature_name == 'MYBPC3'\",\n",
" column_names={\"obs\": [\"dataset_id\", \"cell_type\"]},\n",
" obs_column_names=[\"dataset_id\", \"cell_type\"],\n",
")\n",
"\n",
"# Get a citation string for the slice\n",
Expand Down
2 changes: 1 addition & 1 deletion api/python/notebooks/api_demo/census_embedding.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@
" organism=EXPERIMENT_NAME,\n",
" measurement_name=MEASUREMENT_NAME,\n",
" obs_value_filter=\"tissue_general == 'central nervous system'\",\n",
" column_names={\"obs\": [\"cell_type\", \"soma_joinid\"]},\n",
" obs_column_names=[\"cell_type\", \"soma_joinid\"],\n",
" obs_embeddings=[\"scgpt\"],\n",
" )"
]
Expand Down
4 changes: 2 additions & 2 deletions api/python/notebooks/api_demo/census_query_extract.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
"\n",
"The method returns an `anndata.AnnData` object. It takes as input a census object and the string for an organism; for both cell and gene metadata, filters and column selection can be specified as described above, using the following arguments:\n",
"\n",
"- `column_names` — a dictionary with two keys `obs` and `var` whose values are lists of strings indicating the columns to select for cell and gene metadata respectively.\n",
"- `obs_column_names` and `var_column_names` — a pair of arguments whose values are lists of strings indicating the columns to select for cell (`obs`) and gene (`var`) metadata respectively.\n",
"- `obs_value_filter` — Python expression with selection conditions to fetch **cells** meeting the given criteria. For full details see [tiledb.QueryCondition](https://tiledb-inc-tiledb.readthedocs-hosted.com/projects/tiledb-py/en/stable/python-api.html#query-condition).\n",
"- `var_value_filter` — Python expression with selection conditions to fetch **genes** meeting the given criteria. The syntax is the same as above; for full details see [tiledb.QueryCondition](https://tiledb-inc-tiledb.readthedocs-hosted.com/projects/tiledb-py/en/stable/python-api.html#query-condition).\n",
"\n",
Expand Down Expand Up @@ -95,7 +95,7 @@
" organism=\"Homo sapiens\",\n",
" var_value_filter=\"feature_id in ['ENSG00000161798', 'ENSG00000188229']\",\n",
" obs_value_filter=\"cell_type == 'B cell' and tissue_general == 'lung' and disease == 'COVID-19' and is_primary_data == True\",\n",
" column_names={\"obs\": [\"sex\"]},\n",
" obs_column_names=[\"sex\"],\n",
")"
]
},
Expand Down
8 changes: 4 additions & 4 deletions tools/census_contrib_qc/embeddings_qc_2023-12-15.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,7 @@
" organism=\"homo_sapiens\",\n",
" measurement_name=\"RNA\",\n",
" obs_value_filter=\"tissue_general == 'central nervous system' and is_primary_data == True\",\n",
" column_names={\"obs\": [\"cell_type\", \"assay\", \"soma_joinid\"]},\n",
" obs_column_names=[\"cell_type\", \"assay\", \"soma_joinid\"],\n",
" obsm_layers=maintained_embs_human,\n",
")"
]
Expand Down Expand Up @@ -663,7 +663,7 @@
" organism=\"homo_sapiens\",\n",
" measurement_name=\"RNA\",\n",
" obs_value_filter=\"tissue_general == 'pancreas' and is_primary_data == True\",\n",
" column_names={\"obs\": [\"cell_type\", \"assay\", \"soma_joinid\", \"dataset_id\", \"is_primary_data\"]},\n",
" obs_column_names=[\"cell_type\", \"assay\", \"soma_joinid\", \"dataset_id\", \"is_primary_data\"],\n",
" obsm_layers=maintained_embs_human,\n",
")"
]
Expand Down Expand Up @@ -862,7 +862,7 @@
" organism=\"mus_musculus\",\n",
" measurement_name=\"RNA\",\n",
" obs_value_filter=\"tissue_general == 'heart' and is_primary_data == True\",\n",
" column_names={\"obs\": [\"cell_type\", \"assay\", \"soma_joinid\"]},\n",
" obs_column_names=[\"cell_type\", \"assay\", \"soma_joinid\"],\n",
" obsm_layers=maintained_embs_mouse,\n",
")"
]
Expand Down Expand Up @@ -1014,7 +1014,7 @@
" organism=\"mus_musculus\",\n",
" measurement_name=\"RNA\",\n",
" obs_value_filter=\"tissue_general == 'pancreas'\",\n",
" column_names={\"obs\": [\"cell_type\", \"assay\", \"soma_joinid\"]},\n",
" obs_column_names=[\"cell_type\", \"assay\", \"soma_joinid\"],\n",
" obsm_layers=maintained_embs_mouse,\n",
")"
]
Expand Down
Loading