Merge branch 'main' into ebezzi/pytorch-loaders-context

chanzuckerberg · May 24, 2024 · 6ec4496 · 6ec4496
2 parents 97aba3a + 36d98e8
commit 6ec4496
Show file tree

Hide file tree

Showing 26 changed files with 734 additions and 489 deletions.
diff --git a/.github/workflows/lts-compat-check.yml b/.github/workflows/lts-compat-check.yml
@@ -17,16 +17,14 @@ jobs:
         census-build-version:  # Add additional LTS releases as they occur
           - "latest"
           - "stable"
+          - "2023-12-15"
           - "2023-07-25"
           - "2023-05-15"
         py-pkg-version:
-          - "~=1.0.0"
-          - "~=1.1.0"
-          - "~=1.2.0"
-          - "~=1.3.0"
-          - "~=1.4.0"
-          - "~=1.5.0"
-          - "~=1.6.0"
+          - "~=1.10.0"
+          - "~=1.11.0"
+          - "~=1.12.0"
+          - "~=1.13.0"
           - "head-of-main"
 
     runs-on: ${{matrix.os}}
@@ -39,17 +37,17 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
 
-      - name: Install dependencies (including experimental)
+      - name: Install dependencies
         run: |
           python -m pip install -U pip setuptools wheel
-          pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
+          GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
 
-          if [ {{matrix.cellxgene-census-version}} != "head-of-main" ]; then
-            pip install -e ./api/python/cellxgene_census/[experimental]
+          if [ "${{ matrix.py-pkg-version }}" = "head-of-main" ]; then  # leading '$' is required for Actions to expand the matrix value
+            pip install -e ./api/python/cellxgene_census/
           else
-            pip install -U cellxgene_census[experimental]==${{ matrix.py-pkg-version }}
+            pip install -U "cellxgene_census${{ matrix.py-pkg-version }}"
           fi
 
       - name: Test with pytest (API, main tests)
         run: |
-          PYTHONPATH=. pytest -v -rP -m lts_compat_check ./api/python/cellxgene_census/tests/ --census_version ${{ matrix.census-build-version }}
+          PYTHONPATH=. pytest -v -rP -m lts_compat_check ./api/python/cellxgene_census/tests/test_lts_compat.py --census_version ${{ matrix.census-build-version }}
diff --git a/.github/workflows/py-dependency-check.yml b/.github/workflows/py-dependency-check.yml
@@ -23,6 +23,9 @@ jobs:
       matrix:
         os: [single-cell-8c64g-runner, macos-latest]
         python-version: ["3.8", "3.9", "3.10", "3.11"]
+        exclude:
+          - os: macos-latest
+            python-version: "3.8"
 
     runs-on: ${{matrix.os}}
 
@@ -60,7 +63,7 @@ jobs:
         run: |
           python -m pip install -U pip setuptools wheel
           pip install --use-pep517 accumulation-tree # Geneformer dependency needs --use-pep517 for Cython
-          pip install -U -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
+          GIT_CLONE_PROTECTION_ACTIVE=false pip install -U -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
           pip install -U cellxgene-census[experimental]
 
           # dump pip config for logs

diff --git a/.github/workflows/py-unittests.yml b/.github/workflows/py-unittests.yml
@@ -7,12 +7,21 @@ on:
   push:
     branches: [main]
 
+# If a new commit is pushed, cancel the jobs from previous commits.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   unit_tests_python_api:
     strategy:
+      fail-fast: false  # Don't stop the workflow if one of the jobs fails
       matrix:
         os: [single-cell-8c64g-runner, macos-latest]
         python-version: ["3.8", "3.9", "3.10", "3.11"]
+        exclude:
+          - os: macos-latest
+            python-version: "3.8"
 
     runs-on: ${{matrix.os}}
 
@@ -30,8 +39,10 @@ jobs:
         run: |
           python -m pip install -U pip setuptools wheel
           pip install --use-pep517 accumulation-tree # Geneformer dependency needs --use-pep517 for Cython
-          pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
+          GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
           pip install -e './api/python/cellxgene_census/[experimental]'
+      - name: Report Dependency Versions
+        run: pip list
       - name: Test with pytest (API, main tests)
         run: |
           PYTHONPATH=. coverage run --parallel-mode -m pytest -v -rP --durations=20 ./api/python/cellxgene_census/tests/

diff --git a/.github/workflows/r-dependency-check.yml b/.github/workflows/r-dependency-check.yml
@@ -27,10 +27,10 @@ jobs:
       - name: install packages (macOS)
         if: matrix.os == 'macos-latest'
         run: Rscript -e 'install.packages(c("igraph"), type="binary")'
-      - name: install cellxgene.census and dependencies (Linux)
+      - name: install cellxgene.census and dependencies
         # This should follow our user-facing instructions to install cellxgene.census.
         run: |
-            Rscript -e 'install.packages(c("cellxgene.census", "Seurat", "BiocManager", "testthat"), repos=c("https://chanzuckerberg.r-universe.dev", "https://cloud.r-project.org"), type="source")'
+            Rscript -e 'install.packages(c("cellxgene.census", "Seurat", "BiocManager", "testthat"), repos=c("https://chanzuckerberg.r-universe.dev", "https://cloud.r-project.org"))'
             Rscript -e 'BiocManager::install("SingleCellExperiment")'
       - name: run unit tests
         # [re-]fetch the cellxgene.census source package which includes the unit test code to run

diff --git a/api/python/cellxgene_census/README.md b/api/python/cellxgene_census/README.md
@@ -7,3 +7,58 @@ The `cellxgene_census` package provides an API to facilitate the use of the CZ C
 For more help, please file an issue on the repo, or contact us at <soma@chanzuckerberg.com>.
 
 If you believe you have found a security issue, we would appreciate notification. Please send email to <security@chanzuckerberg.com>.
+
+## Development Environment Setup
+
+- Create a virtual environment using `venv` or `conda`
+- `cd` to the root of this repository
+- `pip install -e api/python/cellxgene_census`
+- To install dependencies needed to work on the [experimental](./src/cellxgene_census/experimental/) portion of the API:
+  `pip install -e 'api/python/cellxgene_census[experimental]'`.
+- `pip install jupyterlab`
+- **Test it!** Either open up a new `jupyter` notebook or the `python` interpreter and run this code:
+
+```python
+import cellxgene_census
+
+with cellxgene_census.open_soma() as census:
+
+    # Reads SOMADataFrame as a slice
+    cell_metadata = census["census_data"]["homo_sapiens"].obs.read(
+        value_filter = "sex == 'female' and cell_type in ['microglial cell', 'neuron']",
+        column_names = ["assay", "cell_type", "tissue", "tissue_general", "suspension_type", "disease"]
+    )
+
+    # Concatenates results to pyarrow.Table
+    cell_metadata = cell_metadata.concat()
+
+    # Converts to pandas.DataFrame
+    cell_metadata = cell_metadata.to_pandas()
+
+    print(cell_metadata)
+```
+
+The output is a `pandas.DataFrame` with over 600K cells meeting our query criteria and the selected columns:
+
+```python
+
+The "stable" release is currently 2023-12-15. Specify 'census_version="2023-12-15"' in future calls to open_soma() to ensure data consistency.
+
+                assay        cell_type                 tissue tissue_general suspension_type disease     sex
+0        Smart-seq v4  microglial cell  middle temporal gyrus          brain         nucleus  normal  female
+1        Smart-seq v4  microglial cell  middle temporal gyrus          brain         nucleus  normal  female
+2        Smart-seq v4  microglial cell  middle temporal gyrus          brain         nucleus  normal  female
+3        Smart-seq v4  microglial cell  middle temporal gyrus          brain         nucleus  normal  female
+4        Smart-seq v4  microglial cell  middle temporal gyrus          brain         nucleus  normal  female
+...               ...              ...                    ...            ...             ...     ...     ...
+607636  microwell-seq           neuron          adrenal gland  adrenal gland            cell  normal  female
+607637  microwell-seq           neuron          adrenal gland  adrenal gland            cell  normal  female
+607638  microwell-seq           neuron          adrenal gland  adrenal gland            cell  normal  female
+607639  microwell-seq           neuron          adrenal gland  adrenal gland            cell  normal  female
+607640  microwell-seq           neuron          adrenal gland  adrenal gland            cell  normal  female
+
+[607641 rows x 7 columns]
+
+```
+
+- Learn more about the Census API by going through the tutorials in the [notebooks](../notebooks/)
diff --git a/api/python/cellxgene_census/pyproject.toml b/api/python/cellxgene_census/pyproject.toml
@@ -41,7 +41,7 @@ dependencies= [
 
 [project.optional-dependencies]
 experimental = [
-    "torch~=2.0",
+    "torch~=2.2.0",
     "torchdata~=0.7",
     "scikit-learn~=1.0",
     "scikit-misc>=0.2",  # scikit-misc 0.3 dropped Python 3.8 support

diff --git a/api/python/cellxgene_census/src/cellxgene_census/__init__.py b/api/python/cellxgene_census/src/cellxgene_census/__init__.py
@@ -21,7 +21,7 @@
 
 from importlib import metadata
 
-from ._get_anndata import get_anndata
+from ._get_anndata import get_anndata, get_obs, get_var
 from ._open import (
     download_source_h5ad,
     get_default_soma_context,
@@ -44,6 +44,8 @@
 __all__ = [
     "download_source_h5ad",
     "get_anndata",
+    "get_obs",
+    "get_var",
     "get_census_version_description",
     "get_census_version_directory",
     "get_census_mirror_directory",

diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py
@@ -7,9 +7,10 @@
 Methods to retrieve slices of the census as AnnData objects.
 """
 
-from typing import Optional, Sequence
+from typing import Literal, Optional, Sequence
 
 import anndata
+import pandas as pd
 import tiledbsoma as soma
 from somacore.options import SparseDFCoord
 
@@ -146,3 +147,90 @@ def get_anndata(
                     adata.varm[emb] = embedding
 
         return adata
+
+
+def _get_axis_metadata(
+    census: soma.Collection,
+    axis: Literal["obs", "var"],
+    organism: str,
+    *,
+    value_filter: Optional[str] = None,
+    coords: Optional[SparseDFCoord] = slice(None),
+    column_names: Optional[Sequence[str]] = None,
+) -> pd.DataFrame:
+    exp = _get_experiment(census, organism)
+    coords = (slice(None),) if coords is None else (coords,)
+    if axis == "obs":
+        df = exp.obs
+    elif axis == "var":
+        df = exp.ms["RNA"].var
+    else:
+        raise ValueError(f"axis should be either 'obs' or 'var', but '{axis}' was passed")
+    result: pd.DataFrame = (
+        df.read(coords=coords, column_names=column_names, value_filter=value_filter).concat().to_pandas()
+    )
+    return result
+
+
+def get_obs(
+    census: soma.Collection,
+    organism: str,
+    *,
+    value_filter: Optional[str] = None,
+    coords: Optional[SparseDFCoord] = slice(None),
+    column_names: Optional[Sequence[str]] = None,
+) -> pd.DataFrame:
+    """Get the observation metadata for a query on the census.
+
+    Args:
+        census:
+            The census object, usually returned by :func:`open_soma`.
+        organism:
+            The organism to query, usually one of ``"Homo sapiens"`` or ``"Mus musculus"``
+        value_filter:
+            Value filter for the ``obs`` metadata. Value is a filter query written in the
+            SOMA ``value_filter`` syntax.
+        coords:
+            Coordinates for the ``obs`` axis, which is indexed by the ``soma_joinid`` value.
+            May be an ``int``, a list of ``int``, or a slice. The default, ``slice(None)``, selects all; ``None`` is also accepted and selects all.
+        column_names:
+            Columns to fetch.
+
+    Returns:
+        A :class:`pandas.DataFrame` object containing metadata for the queried slice.
+    """
+    return _get_axis_metadata(
+        census, "obs", organism, value_filter=value_filter, coords=coords, column_names=column_names
+    )
+
+
+def get_var(
+    census: soma.Collection,
+    organism: str,
+    *,
+    value_filter: Optional[str] = None,
+    coords: Optional[SparseDFCoord] = slice(None),
+    column_names: Optional[Sequence[str]] = None,
+) -> pd.DataFrame:
+    """Get the variable metadata for a query on the census.
+
+    Args:
+        census:
+            The census object, usually returned by :func:`open_soma`.
+        organism:
+            The organism to query, usually one of ``"Homo sapiens"`` or ``"Mus musculus"``
+        value_filter:
+            Value filter for the ``var`` metadata. Value is a filter query written in the
+            SOMA ``value_filter`` syntax.
+        coords:
+            Coordinates for the ``var`` axis, which is indexed by the ``soma_joinid`` value.
+            May be an ``int``, a list of ``int``, or a slice. The default, ``slice(None)``, selects all; ``None`` is also accepted and selects all.
+        column_names:
+            Columns to fetch.
+
+    Returns:
+        A :class:`pandas.DataFrame` object containing metadata for the queried slice.
+    """
+    return _get_axis_metadata(
+        census, "var", organism, value_filter=value_filter, coords=coords, column_names=column_names
+    )
diff --git a/api/python/cellxgene_census/tests/conftest.py b/api/python/cellxgene_census/tests/conftest.py
@@ -1,8 +1,6 @@
 import pytest
 import tiledbsoma as soma
 
-from cellxgene_census import get_default_soma_context
-
 TEST_MARKERS_SKIPPED_BY_DEFAULT = ["expensive", "experimental"]
 
 
@@ -48,4 +46,23 @@ def pytest_configure(config: pytest.Config) -> None:
 @pytest.fixture
 def small_mem_context() -> soma.SOMATileDBContext:
     """used to keep memory usage smaller for GHA runners."""
+    from cellxgene_census import get_default_soma_context
+
     return get_default_soma_context(tiledb_config={"soma.init_buffer_bytes": 32 * 1024**2})
+
+
+# Fixtures for census objects
+
+
+@pytest.fixture(scope="session")
+def census() -> soma.Collection:
+    import cellxgene_census
+
+    return cellxgene_census.open_soma(census_version="latest")
+
+
+@pytest.fixture(scope="session")
+def lts_census() -> soma.Collection:
+    import cellxgene_census
+
+    return cellxgene_census.open_soma(census_version="stable")
diff --git a/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py b/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py
@@ -12,11 +12,11 @@
 from somacore import AxisQuery
 from tiledbsoma import Experiment, _factory
 from tiledbsoma._collection import CollectionBase
-from torch.utils.data._utils.worker import WorkerInfo
 
 # conditionally import torch, as it will not be available in all test environments
 try:
     from torch import Tensor, float32
+    from torch.utils.data._utils.worker import WorkerInfo
 
     from cellxgene_census.experimental.ml.pytorch import (
         ExperimentDataPipe,
@@ -583,17 +583,15 @@ def test_experiment_dataloader__multiprocess_dense_matrix__ok() -> None:
 
 
 @pytest.mark.experimental
-@patch("cellxgene_census.experimental.ml.pytorch.ExperimentDataPipe")
-def test_experiment_dataloader__unsupported_params__fails(
-    dummy_exp_data_pipe: ExperimentDataPipe,
-) -> None:
-    with pytest.raises(ValueError):
-        experiment_dataloader(dummy_exp_data_pipe, shuffle=True)
-    with pytest.raises(ValueError):
-        experiment_dataloader(dummy_exp_data_pipe, batch_size=3)
-    with pytest.raises(ValueError):
-        experiment_dataloader(dummy_exp_data_pipe, batch_sampler=[])
-    with pytest.raises(ValueError):
-        experiment_dataloader(dummy_exp_data_pipe, sampler=[])
-    with pytest.raises(ValueError):
-        experiment_dataloader(dummy_exp_data_pipe, collate_fn=lambda x: x)
+def test_experiment_dataloader__unsupported_params__fails() -> None:
+    with patch("cellxgene_census.experimental.ml.pytorch.ExperimentDataPipe") as dummy_exp_data_pipe:
+        with pytest.raises(ValueError):
+            experiment_dataloader(dummy_exp_data_pipe, shuffle=True)
+        with pytest.raises(ValueError):
+            experiment_dataloader(dummy_exp_data_pipe, batch_size=3)
+        with pytest.raises(ValueError):
+            experiment_dataloader(dummy_exp_data_pipe, batch_sampler=[])
+        with pytest.raises(ValueError):
+            experiment_dataloader(dummy_exp_data_pipe, sampler=[])
+        with pytest.raises(ValueError):
+            experiment_dataloader(dummy_exp_data_pipe, collate_fn=lambda x: x)