Skip to content

Commit

Permalink
Merge branch 'main' into ebezzi/pytorch-loaders-context
Browse files Browse the repository at this point in the history
  • Loading branch information
ebezzi committed May 24, 2024
2 parents 97aba3a + 36d98e8 commit 6ec4496
Show file tree
Hide file tree
Showing 26 changed files with 734 additions and 489 deletions.
24 changes: 11 additions & 13 deletions .github/workflows/lts-compat-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,14 @@ jobs:
census-build-version: # Add additional LTS releases as they occur
- "latest"
- "stable"
- "2023-12-15"
- "2023-07-25"
- "2023-05-15"
py-pkg-version:
- "~=1.0.0"
- "~=1.1.0"
- "~=1.2.0"
- "~=1.3.0"
- "~=1.4.0"
- "~=1.5.0"
- "~=1.6.0"
- "~=1.10.0"
- "~=1.11.0"
- "~=1.12.0"
- "~=1.13.0"
- "head-of-main"

runs-on: ${{matrix.os}}
Expand All @@ -39,17 +37,17 @@ jobs:
with:
python-version: ${{ matrix.python-version }}

- name: Install dependencies (including experimental)
- name: Install dependencies
run: |
python -m pip install -U pip setuptools wheel
pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
if [ {{matrix.cellxgene-census-version}} != "head-of-main" ]; then
pip install -e ./api/python/cellxgene_census/[experimental]
# The matrix value must be interpolated with the Actions expression syntax (dollar sign
# followed by double braces). A bare double-brace form reaches the shell as literal text,
# never equals "head-of-main", and sends every matrix leg down the PyPI install branch.
if [ "${{ matrix.py-pkg-version }}" = "head-of-main" ]; then
pip install -e ./api/python/cellxgene_census/
else
pip install -U cellxgene_census[experimental]==${{ matrix.py-pkg-version }}
pip install -U cellxgene_census${{ matrix.py-pkg-version }}
fi
- name: Test with pytest (API, main tests)
run: |
PYTHONPATH=. pytest -v -rP -m lts_compat_check ./api/python/cellxgene_census/tests/ --census_version ${{ matrix.census-build-version }}
PYTHONPATH=. pytest -v -rP -m lts_compat_check ./api/python/cellxgene_census/tests/test_lts_compat.py --census_version ${{ matrix.census-build-version }}
5 changes: 4 additions & 1 deletion .github/workflows/py-dependency-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ jobs:
matrix:
os: [single-cell-8c64g-runner, macos-latest]
python-version: ["3.8", "3.9", "3.10", "3.11"]
exclude:
- os: macos-latest
python-version: "3.8"

runs-on: ${{matrix.os}}

Expand Down Expand Up @@ -60,7 +63,7 @@ jobs:
run: |
python -m pip install -U pip setuptools wheel
pip install --use-pep517 accumulation-tree # Geneformer dependency needs --use-pep517 for Cython
pip install -U -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
GIT_CLONE_PROTECTION_ACTIVE=false pip install -U -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
pip install -U cellxgene-census[experimental]
# dump pip config for logs
Expand Down
13 changes: 12 additions & 1 deletion .github/workflows/py-unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,21 @@ on:
push:
branches: [main]

# If a new commit is pushed, cancel the jobs from previous commits.
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
unit_tests_python_api:
strategy:
fail-fast: false # Don't stop the workflow if one of the jobs fails
matrix:
os: [single-cell-8c64g-runner, macos-latest]
python-version: ["3.8", "3.9", "3.10", "3.11"]
exclude:
- os: macos-latest
python-version: "3.8"

runs-on: ${{matrix.os}}

Expand All @@ -30,8 +39,10 @@ jobs:
run: |
python -m pip install -U pip setuptools wheel
pip install --use-pep517 accumulation-tree # Geneformer dependency needs --use-pep517 for Cython
pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
pip install -e './api/python/cellxgene_census/[experimental]'
- name: Report Dependency Versions
run: pip list
- name: Test with pytest (API, main tests)
run: |
PYTHONPATH=. coverage run --parallel-mode -m pytest -v -rP --durations=20 ./api/python/cellxgene_census/tests/
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/r-dependency-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@ jobs:
- name: install packages (macOS)
if: matrix.os == 'macos-latest'
run: Rscript -e 'install.packages(c("igraph"), type="binary")'
- name: install cellxgene.census and dependencies (Linux)
- name: install cellxgene.census and dependencies
# This should follow our user-facing instructions to install cellxgene.census.
run: |
Rscript -e 'install.packages(c("cellxgene.census", "Seurat", "BiocManager", "testthat"), repos=c("https://chanzuckerberg.r-universe.dev", "https://cloud.r-project.org"), type="source")'
Rscript -e 'install.packages(c("cellxgene.census", "Seurat", "BiocManager", "testthat"), repos=c("https://chanzuckerberg.r-universe.dev", "https://cloud.r-project.org"))'
Rscript -e 'BiocManager::install("SingleCellExperiment")'
- name: run unit tests
# [re-]fetch the cellxgene.census source package which includes the unit test code to run
Expand Down
55 changes: 55 additions & 0 deletions api/python/cellxgene_census/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,58 @@ The `cellxgene_census` package provides an API to facilitate the use of the CZ C
For more help, please file an issue on the repo, or contact us at <soma@chanzuckerberg.com>.

If you believe you have found a security issue, we would appreciate notification. Please send email to <security@chanzuckerberg.com>.

## Development Environment Setup

- Create a virtual environment using `venv` or `conda`
- `cd` to the root of this repository
- `pip install -e api/python/cellxgene_census`
- To install dependencies needed to work on the [experimental](./src/cellxgene_census/experimental/) portion of the API:
`pip install -e 'api/python/cellxgene_census[experimental]'`.
- `pip install jupyterlab`
- **Test it!** Either open up a new `jupyter` notebook or the `python` interpreter and run this code:

```python
import cellxgene_census

with cellxgene_census.open_soma() as census:

# Reads SOMADataFrame as a slice
cell_metadata = census["census_data"]["homo_sapiens"].obs.read(
value_filter = "sex == 'female' and cell_type in ['microglial cell', 'neuron']",
column_names = ["assay", "cell_type", "tissue", "tissue_general", "suspension_type", "disease"]
)

# Concatenates results to pyarrow.Table
cell_metadata = cell_metadata.concat()

# Converts to pandas.DataFrame
cell_metadata = cell_metadata.to_pandas()

print(cell_metadata)
```

The output is a `pandas.DataFrame` with over 600K cells meeting our query criteria and the selected columns:

```python

The "stable" release is currently 2023-12-15. Specify 'census_version="2023-12-15"' in future calls to open_soma() to ensure data consistency.

assay cell_type tissue tissue_general suspension_type disease sex
0 Smart-seq v4 microglial cell middle temporal gyrus brain nucleus normal female
1 Smart-seq v4 microglial cell middle temporal gyrus brain nucleus normal female
2 Smart-seq v4 microglial cell middle temporal gyrus brain nucleus normal female
3 Smart-seq v4 microglial cell middle temporal gyrus brain nucleus normal female
4 Smart-seq v4 microglial cell middle temporal gyrus brain nucleus normal female
... ... ... ... ... ... ... ...
607636 microwell-seq neuron adrenal gland adrenal gland cell normal female
607637 microwell-seq neuron adrenal gland adrenal gland cell normal female
607638 microwell-seq neuron adrenal gland adrenal gland cell normal female
607639 microwell-seq neuron adrenal gland adrenal gland cell normal female
607640 microwell-seq neuron adrenal gland adrenal gland cell normal female

[607641 rows x 7 columns]

```

- Learn more about the Census API by going through the tutorials in the [notebooks](../notebooks/)
2 changes: 1 addition & 1 deletion api/python/cellxgene_census/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ dependencies= [

[project.optional-dependencies]
experimental = [
"torch~=2.0",
"torch~=2.2.0",
"torchdata~=0.7",
"scikit-learn~=1.0",
"scikit-misc>=0.2", # scikit-misc 0.3 dropped Python 3.8 support
Expand Down
4 changes: 3 additions & 1 deletion api/python/cellxgene_census/src/cellxgene_census/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

from importlib import metadata

from ._get_anndata import get_anndata
from ._get_anndata import get_anndata, get_obs, get_var
from ._open import (
download_source_h5ad,
get_default_soma_context,
Expand All @@ -44,6 +44,8 @@
__all__ = [
"download_source_h5ad",
"get_anndata",
"get_obs",
"get_var",
"get_census_version_description",
"get_census_version_directory",
"get_census_mirror_directory",
Expand Down
90 changes: 89 additions & 1 deletion api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
Methods to retrieve slices of the census as AnnData objects.
"""

from typing import Optional, Sequence
from typing import Literal, Optional, Sequence

import anndata
import pandas as pd
import tiledbsoma as soma
from somacore.options import SparseDFCoord

Expand Down Expand Up @@ -146,3 +147,90 @@ def get_anndata(
adata.varm[emb] = embedding

return adata


def _get_axis_metadata(
    census: soma.Collection,
    axis: Literal["obs", "var"],
    organism: str,
    *,
    value_filter: Optional[str] = None,
    coords: Optional[SparseDFCoord] = slice(None),
    column_names: Optional[Sequence[str]] = None,
) -> pd.DataFrame:
    """Read one axis dataframe (``obs`` or ``var``) of an experiment into pandas.

    Shared implementation behind :func:`get_obs` and :func:`get_var`.

    Args:
        census:
            The census collection handed to ``_get_experiment``.
        axis:
            ``"obs"`` reads ``exp.obs``; ``"var"`` reads ``exp.ms["RNA"].var``.
        organism:
            The organism name handed to ``_get_experiment``.
        value_filter:
            Optional filter expression forwarded to the dataframe ``read`` call.
        coords:
            Coordinate selector for the axis. ``None`` is treated the same as
            ``slice(None)``, i.e. no restriction.
        column_names:
            Optional subset of columns forwarded to the dataframe ``read`` call.

    Returns:
        The concatenated read result converted to a :class:`pandas.DataFrame`.

    Raises:
        ValueError: if ``axis`` is neither ``"obs"`` nor ``"var"``.
    """
    experiment = _get_experiment(census, organism)

    # ``read`` is given a tuple of selectors, so wrap the single axis selector in a 1-tuple.
    if coords is None:
        selection = (slice(None),)
    else:
        selection = (coords,)

    if axis == "obs":
        axis_df = experiment.obs
    elif axis == "var":
        axis_df = experiment.ms["RNA"].var
    else:
        raise ValueError(f"axis should be either 'obs' or 'var', but '{axis}' was passed")

    table = axis_df.read(coords=selection, column_names=column_names, value_filter=value_filter).concat()
    result: pd.DataFrame = table.to_pandas()
    return result


def get_obs(
    census: soma.Collection,
    organism: str,
    *,
    value_filter: Optional[str] = None,
    coords: Optional[SparseDFCoord] = slice(None),
    column_names: Optional[Sequence[str]] = None,
) -> pd.DataFrame:
    """Get the observation metadata for a query on the census.

    Args:
        census:
            The census object, usually returned by :func:`open_soma`.
        organism:
            The organism to query, usually one of ``"Homo sapiens"`` or ``"Mus musculus"``.
        value_filter:
            Value filter for the ``obs`` metadata. Value is a filter query written in the
            SOMA ``value_filter`` syntax.
        coords:
            Coordinates for the ``obs`` axis, which is indexed by the ``soma_joinid`` value.
            May be an ``int``, a list of ``int``, or a slice. The default, ``slice(None)``,
            selects all; passing ``None`` has the same effect.
        column_names:
            Columns to fetch.

    Returns:
        A :class:`pandas.DataFrame` object containing metadata for the queried slice.
    """
    query = {
        "value_filter": value_filter,
        "coords": coords,
        "column_names": column_names,
    }
    return _get_axis_metadata(census, "obs", organism, **query)


def get_var(
    census: soma.Collection,
    organism: str,
    *,
    value_filter: Optional[str] = None,
    coords: Optional[SparseDFCoord] = slice(None),
    column_names: Optional[Sequence[str]] = None,
) -> pd.DataFrame:
    """Get the variable metadata for a query on the census.

    Args:
        census:
            The census object, usually returned by :func:`open_soma`.
        organism:
            The organism to query, usually one of ``"Homo sapiens"`` or ``"Mus musculus"``.
        value_filter:
            Value filter for the ``var`` metadata. Value is a filter query written in the
            SOMA ``value_filter`` syntax.
        coords:
            Coordinates for the ``var`` axis, which is indexed by the ``soma_joinid`` value.
            May be an ``int``, a list of ``int``, or a slice. The default, ``slice(None)``,
            selects all; passing ``None`` has the same effect.
        column_names:
            Columns to fetch.

    Returns:
        A :class:`pandas.DataFrame` object containing metadata for the queried slice.
    """
    query = {
        "value_filter": value_filter,
        "coords": coords,
        "column_names": column_names,
    }
    return _get_axis_metadata(census, "var", organism, **query)
21 changes: 19 additions & 2 deletions api/python/cellxgene_census/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from typing import Iterator

import pytest
import tiledbsoma as soma

from cellxgene_census import get_default_soma_context

TEST_MARKERS_SKIPPED_BY_DEFAULT = ["expensive", "experimental"]


Expand Down Expand Up @@ -48,4 +46,23 @@ def pytest_configure(config: pytest.Config) -> None:
@pytest.fixture
def small_mem_context() -> soma.SOMATileDBContext:
    """Build a SOMA context with a 32 MiB initial buffer, to keep memory usage smaller for GHA runners."""
    # Imported inside the fixture rather than at module level, so that importing
    # conftest does not itself require cellxgene_census.
    from cellxgene_census import get_default_soma_context

    init_buffer_bytes = 32 * 1024**2
    tiledb_config = {"soma.init_buffer_bytes": init_buffer_bytes}
    return get_default_soma_context(tiledb_config=tiledb_config)


# Fixtures for census objects


@pytest.fixture(scope="session")
def census() -> Iterator[soma.Collection]:
    """Yield the ``latest`` census, opened once per test session and closed at teardown.

    Yields:
        The collection returned by ``cellxgene_census.open_soma(census_version="latest")``.
    """
    import cellxgene_census

    # open_soma() is usable as a context manager; yielding from inside the ``with``
    # closes the handle after the last test instead of leaving it open.
    with cellxgene_census.open_soma(census_version="latest") as latest_census:
        yield latest_census


@pytest.fixture(scope="session")
def lts_census() -> Iterator[soma.Collection]:
    """Yield the ``stable`` census, opened once per test session and closed at teardown.

    Yields:
        The collection returned by ``cellxgene_census.open_soma(census_version="stable")``.
    """
    import cellxgene_census

    # open_soma() is usable as a context manager; yielding from inside the ``with``
    # closes the handle after the last test instead of leaving it open.
    with cellxgene_census.open_soma(census_version="stable") as stable_census:
        yield stable_census
28 changes: 13 additions & 15 deletions api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@
from somacore import AxisQuery
from tiledbsoma import Experiment, _factory
from tiledbsoma._collection import CollectionBase
from torch.utils.data._utils.worker import WorkerInfo

# conditionally import torch, as it will not be available in all test environments
try:
from torch import Tensor, float32
from torch.utils.data._utils.worker import WorkerInfo

from cellxgene_census.experimental.ml.pytorch import (
ExperimentDataPipe,
Expand Down Expand Up @@ -583,17 +583,15 @@ def test_experiment_dataloader__multiprocess_dense_matrix__ok() -> None:


@pytest.mark.experimental
@patch("cellxgene_census.experimental.ml.pytorch.ExperimentDataPipe")
def test_experiment_dataloader__unsupported_params__fails(
dummy_exp_data_pipe: ExperimentDataPipe,
) -> None:
with pytest.raises(ValueError):
experiment_dataloader(dummy_exp_data_pipe, shuffle=True)
with pytest.raises(ValueError):
experiment_dataloader(dummy_exp_data_pipe, batch_size=3)
with pytest.raises(ValueError):
experiment_dataloader(dummy_exp_data_pipe, batch_sampler=[])
with pytest.raises(ValueError):
experiment_dataloader(dummy_exp_data_pipe, sampler=[])
with pytest.raises(ValueError):
experiment_dataloader(dummy_exp_data_pipe, collate_fn=lambda x: x)
def test_experiment_dataloader__unsupported_params__fails() -> None:
    """Each of these DataLoader keyword arguments must make ``experiment_dataloader`` raise ``ValueError``."""
    rejected_kwargs = (
        {"shuffle": True},
        {"batch_size": 3},
        {"batch_sampler": []},
        {"sampler": []},
        {"collate_fn": lambda x: x},
    )
    # Patch inside the test body (not as a decorator) so the patch target is only
    # resolved when the test actually runs.
    with patch("cellxgene_census.experimental.ml.pytorch.ExperimentDataPipe") as dummy_exp_data_pipe:
        for kwargs in rejected_kwargs:
            with pytest.raises(ValueError):
                experiment_dataloader(dummy_exp_data_pipe, **kwargs)
Loading

0 comments on commit 6ec4496

Please sign in to comment.