Add URL data locators to launch sub-command #920

Merged 7 commits on Sep 15, 2019

20 changes: 20 additions & 0 deletions docs/faq.md
@@ -100,3 +100,23 @@
#### I'm following the developer instructions and get an error about "missing files and directories" when trying to build the client

This is likely because you do not have node and npm installed; we recommend using [nvm](https://github.com/creationix/nvm) if you're new to these tools.

# Data access

#### Can I use an _s3:_ or _gs:_ URL with `cellxgene launch`?

cellxgene currently uses the [fsspec](https://filesystem-spec.readthedocs.io/en/latest/index.html) package to
access remote data. Direct support for S3 and GCS is not enabled by default, but optional packages may be
installed.

If you wish to directly access S3 or GCS, install one or both of the following packages using `pip`:

- [s3fs](https://s3fs.readthedocs.io/en/latest/) for S3 support
- [gcsfs](https://gcsfs.readthedocs.io/en/latest/) for GCS support

For example:

```
pip install s3fs
cellxgene launch s3://mybucket.s3-us-west-2.amazonaws.com/mydata.h5ad
```
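
For context, fsspec picks the backend implementation from the URI scheme at runtime, which is why the extra packages are needed. Below is a minimal sketch of that resolution, assuming `s3fs` is installed and using a hypothetical bucket and key:

```
import fsspec

# The URI scheme ("s3", "gcs"/"gs", "https", ...) selects the filesystem
# implementation; "s3" needs the optional s3fs package, "gcs"/"gs" needs gcsfs.
# An unknown or uninstalled backend raises an error at this point.
fs = fsspec.filesystem("s3", anon=True)  # anonymous access to a public bucket

# hypothetical bucket and key
if fs.exists("mybucket/mydata.h5ad"):
    print(fs.size("mybucket/mydata.h5ad"))
```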
6 changes: 6 additions & 0 deletions docs/getting-started.md
@@ -21,6 +21,12 @@
cellxgene launch pbmc3k.h5ad --open
```

You can also specify a URL directly as the data source; the data will be downloaded during launch:

```
cellxgene launch https://raw.githubusercontent.com/chanzuckerberg/cellxgene/master/example-dataset/pbmc3k.h5ad
```

On Mac OS and Ubuntu, you should see your web browser open with the following

<img width="450" src="https://raw.githubusercontent.com/chanzuckerberg/cellxgene/master/docs/cellxgene-opening-screenshot.png" pad="50px">
2 changes: 1 addition & 1 deletion server/app/driver/driver.py
@@ -46,7 +46,7 @@ def features(self):
return features

@abstractmethod
def _load_data(self, data):
def _load_data(self, data_locator):
pass

@abstractmethod
20 changes: 12 additions & 8 deletions server/app/scanpy_engine/scanpy_engine.py
@@ -177,11 +177,16 @@ def _create_schema(self):
}
self.schema["layout"]["obs"].append(layout_schema)

def _load_data(self, data):
# as of AnnData 0.6.19, backed mode performs initial load fast, but at the
# cost of significantly slower access to X data.
def _load_data(self, data_locator):
try:
self.data = anndata.read_h5ad(data)
# there is no guarantee data_locator indicates a local file. The AnnData
# API will only consume local file objects. If we get a non-local object,
# make a copy in tmp, and delete it after we load into memory.
with data_locator.local_handle() as lh:
# as of AnnData 0.6.19, backed mode performs initial load fast, but at the
# cost of significantly slower access to X data.
self.data = anndata.read_h5ad(lh)

except ValueError:
raise ScanpyFileError(
"File must be in the .h5ad format. Please read "
@@ -191,12 +196,11 @@ def _load_data(self, data):
"information."
)
except MemoryError:
raise ScanpyFileError("Error while loading file: out of memory, file is too large"
" for memory available")
raise ScanpyFileError("Out of memory - file is too large for available memory.")
except Exception as e:
raise ScanpyFileError(
f"Error while loading file: {e}, File must be in the .h5ad format, please check "
f"that your input and try again."
f"{e} - file not found or is inaccessible. File must be an .h5ad object. "
f"Please check your input and try again."
)

@requires_data
84 changes: 84 additions & 0 deletions server/app/util/data_locator.py
@@ -0,0 +1,84 @@
import os
import tempfile
import fsspec


class DataLocator():
"""
DataLocator is a simple wrapper around fsspec functionality, and provides a
set of functions to encapsulate a data location (URI or path), interrogate
metadata about the object at that location (size, existence, etc.) and
access the underlying data.

https://filesystem-spec.readthedocs.io/en/latest/index.html

Example:
dl = DataLocator("/tmp/foo.h5ad")
if dl.exists():
print(dl.size())
with dl.open() as f:
thecontents = f.read()

DataLocator will accept a URI or native path. Error handling is as defined
in fsspec.

"""

def __init__(self, uri_or_path):
self.uri_or_path = uri_or_path
self.protocol, self.path = DataLocator._get_protocol_and_path(uri_or_path)
# work-around for LocalFileSystem not treating file: and None as the same scheme/protocol
self.cname = self.path if self.protocol == 'file' else self.uri_or_path
# will throw RuntimeError if the protocol is unsupported
self.fs = fsspec.filesystem(self.protocol)

@staticmethod
def _get_protocol_and_path(uri_or_path):
if "://" in uri_or_path:
protocol, path = uri_or_path.split("://", 1)
# windows!!! Ignore single letter drive identifiers,
# eg, G:\foo.txt
if len(protocol) > 1:
return protocol, path
return None, uri_or_path

def exists(self):
return self.fs.exists(self.cname)

def size(self):
return self.fs.size(self.cname)

def isfile(self):
return self.fs.isfile(self.cname)

def open(self, *args):
return self.fs.open(self.uri_or_path, *args)

def islocal(self):
return self.protocol is None or self.protocol == 'file'

def local_handle(self):
if self.islocal():
return LocalFilePath(self.path)

# if not local, create a tmp file system object to contain the data,
# and clean it up when done.
with self.open() as src, tempfile.NamedTemporaryFile(prefix="cellxgene_", delete=False) as tmp:
tmp.write(src.read())
tmp.close()
src.close()
tmp_path = tmp.name
return LocalFilePath(tmp_path, delete=True)


class LocalFilePath():
def __init__(self, tmp_path, delete=False):
self.tmp_path = tmp_path
self.delete = delete

def __enter__(self):
return self.tmp_path

def __exit__(self, *args):
if self.delete:
os.unlink(self.tmp_path)
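
A minimal usage sketch of the new `DataLocator`, exercising `local_handle()` against the same public URL used in the tests below; it assumes network access and that fsspec's HTTP backend dependencies are available:

```
from server.app.util.data_locator import DataLocator

url = "https://raw.githubusercontent.com/chanzuckerberg/cellxgene/master/example-dataset/pbmc3k.h5ad"

dl = DataLocator(url)
print(dl.islocal())   # False: the protocol is "https", not a local path
print(dl.exists())    # True if the object is reachable

# For remote data, local_handle() copies the object into a NamedTemporaryFile
# and yields its path, deleting the copy on exit; for local paths it yields
# the original path and deletes nothing.
with dl.local_handle() as path:
    with open(path, "rb") as f:
        magic = f.read(4)  # .h5ad files are HDF5, so this should be b"\x89HDF"
```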
35 changes: 26 additions & 9 deletions server/cli/launch.py
@@ -2,7 +2,7 @@
import functools
import logging
from os import devnull
from os.path import splitext, basename, getsize
from os.path import splitext, basename
import sys
import warnings
import webbrowser
@@ -13,6 +13,7 @@
from server.app.util.errors import ScanpyFileError
from server.app.util.utils import custom_format_warning
from server.utils.utils import find_available_port, is_port_available
from server.app.util.data_locator import DataLocator

# anything bigger than this will generate a special message
BIG_FILE_SIZE_THRESHOLD = 100 * 2 ** 20 # 100MB
@@ -63,7 +64,7 @@ def parse_engine_args(layout, obs_names, var_names, max_category_items, diffexp_lfc_cutoff):


@click.command()
@click.argument("data", metavar="<data file>", type=click.Path(exists=True, file_okay=True, dir_okay=False))
@click.argument("data", nargs=1, metavar="<data file>", required=True)
@click.option(
"--verbose",
"-v",
@@ -117,16 +118,32 @@ def launch(

> cellxgene launch example_dataset/pbmc3k.h5ad --title pbmc3k

> cellxgene launch <your data file> --title <your title>"""
> cellxgene launch <your data file> --title <your title>

> cellxgene launch <url>"""

e_args = parse_engine_args(layout, obs_names, var_names, max_category_items, diffexp_lfc_cutoff)
try:
data_locator = DataLocator(data)
except RuntimeError as re:
raise click.ClickException(f"Unable to access data at {data}. {str(re)}")

# Startup message
click.echo("[cellxgene] Starting the CLI...")

# Argument checking
name, extension = splitext(data)
if extension != ".h5ad":
raise click.FileError(basename(data), hint="file type must be .h5ad")
if data_locator.islocal():
# if data locator is local, apply file system conventions and other "cheap"
# validation checks. If a URI, defer until we actually fetch the data and
# try to read it. Many of these tests don't make sense for URIs (eg, extension-
# based typing).
if not data_locator.exists():
raise click.FileError(data, hint="file does not exist")
if not data_locator.isfile():
raise click.FileError(data, hint="data is not a file")
name, extension = splitext(data)
if extension != ".h5ad":
raise click.FileError(basename(data), hint="file type must be .h5ad")

if debug:
verbose = True
@@ -177,18 +194,18 @@
log = logging.getLogger("werkzeug")
log.setLevel(logging.ERROR)

file_size = getsize(data)
file_size = data_locator.size() if data_locator.islocal() else 0

# if a big file, let the user know it may take a while to load.
if file_size > BIG_FILE_SIZE_THRESHOLD:
click.echo(f"[cellxgene] Loading data from {basename(data)}, this may take awhile...")
click.echo(f"[cellxgene] Loading data from {basename(data)}, this may take a while...")
else:
click.echo(f"[cellxgene] Loading data from {basename(data)}.")

from server.app.scanpy_engine.scanpy_engine import ScanpyEngine

try:
server.attach_data(ScanpyEngine(data, e_args), title=title)
server.attach_data(ScanpyEngine(data_locator, e_args), title=title)
except ScanpyFileError as e:
raise click.ClickException(f"{e}")

1 change: 1 addition & 0 deletions server/requirements.txt
@@ -6,6 +6,7 @@ Flask-Compress>=1.4.0
Flask-Cors>=3.0.6
Flask-RESTful>=0.3.6
flatbuffers>=1.10.0
fsspec>=0.4.4
numpy>=1.15.2
pandas>=0.23.1
scipy>=1.1.0
5 changes: 3 additions & 2 deletions server/test/test_nan_scanpy_engine.py
@@ -7,6 +7,7 @@

from server.app.scanpy_engine.scanpy_engine import ScanpyEngine
from server.app.util.errors import FilterError
from server.app.util.data_locator import DataLocator


class NaNTest(unittest.TestCase):
@@ -20,12 +21,12 @@ def setUp(self):
}
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=UserWarning)
self.data = ScanpyEngine("server/test/test_datasets/nan.h5ad", self.args)
self.data = ScanpyEngine(DataLocator("server/test/test_datasets/nan.h5ad"), self.args)
self.data._create_schema()

def test_load(self):
with self.assertWarns(UserWarning):
ScanpyEngine("server/test/test_datasets/nan.h5ad", self.args)
ScanpyEngine(DataLocator("server/test/test_datasets/nan.h5ad"), self.args)

def test_init(self):
self.assertEqual(self.data.cell_count, 100)
3 changes: 2 additions & 1 deletion server/test/test_scanpy_engine.py
@@ -10,6 +10,7 @@

from server.app.scanpy_engine.scanpy_engine import ScanpyEngine
from server.app.util.errors import FilterError
from server.app.util.data_locator import DataLocator


class EngineTest(unittest.TestCase):
@@ -22,7 +23,7 @@ def setUp(self):
"var_names": None,
"diffexp_lfc_cutoff": 0.01,
}
self.data = ScanpyEngine("example-dataset/pbmc3k.h5ad", args)
self.data = ScanpyEngine(DataLocator("example-dataset/pbmc3k.h5ad"), args)

def test_init(self):
self.assertEqual(self.data.cell_count, 2638)
3 changes: 2 additions & 1 deletion server/test/test_scanpy_engine_csc.py
@@ -10,6 +10,7 @@

from server.app.scanpy_engine.scanpy_engine import ScanpyEngine
from server.app.util.errors import FilterError
from server.app.util.data_locator import DataLocator


class EngineTest(unittest.TestCase):
@@ -21,7 +22,7 @@ def setUp(self):
"var_names": None,
"diffexp_lfc_cutoff": 0.01,
}
self.data = ScanpyEngine("server/test/test_datasets/pbmc3k-CSC-gz.h5ad", args)
self.data = ScanpyEngine(DataLocator("server/test/test_datasets/pbmc3k-CSC-gz.h5ad"), args)

def test_init(self):
self.assertEqual(self.data.cell_count, 2638)
3 changes: 2 additions & 1 deletion server/test/test_scanpy_engine_csr.py
@@ -10,6 +10,7 @@

from server.app.scanpy_engine.scanpy_engine import ScanpyEngine
from server.app.util.errors import FilterError
from server.app.util.data_locator import DataLocator


class EngineTest(unittest.TestCase):
@@ -21,7 +22,7 @@ def setUp(self):
"var_names": None,
"diffexp_lfc_cutoff": 0.01,
}
self.data = ScanpyEngine("server/test/test_datasets/pbmc3k-CSR-gz.h5ad", args)
self.data = ScanpyEngine(DataLocator("server/test/test_datasets/pbmc3k-CSR-gz.h5ad"), args)

def test_init(self):
self.assertEqual(self.data.cell_count, 2638)
44 changes: 41 additions & 3 deletions server/test/test_scanpy_engine_data_load.py
@@ -3,11 +3,15 @@

from server.app.scanpy_engine.scanpy_engine import ScanpyEngine
from server.app.util.errors import DriverError
from server.app.util.data_locator import DataLocator


class DataLoadEngineTest(unittest.TestCase):
"""
Test file loading, including deferred loading/update.
"""
def setUp(self):
self.data_file = "example-dataset/pbmc3k.h5ad"
self.data_file = DataLocator("example-dataset/pbmc3k.h5ad")
self.data = ScanpyEngine()

def test_init(self):
@@ -45,5 +49,39 @@ def test_diffexp_topN(self):
result = json.loads(self.data.diffexp_topN(f1["filter"], f2["filter"], 20))
self.assertEqual(len(result), 20)

if __name__ == "__main__":
unittest.main()

class DataLocatorEngineTest(unittest.TestCase):
"""
Test various types of data locators we expect to consume
"""
def setUp(self):
self.args = {
"layout": ["umap"],
"max_category_items": 100,
"obs_names": None,
"var_names": None,
"diffexp_lfc_cutoff": 0.01,
}

def stdAsserts(self, data):
""" run these each time we load the data """
self.assertIsNotNone(data)
self.assertEqual(data.cell_count, 2638)
self.assertEqual(data.gene_count, 1838)

def test_posix_file(self):
locator = DataLocator("example-dataset/pbmc3k.h5ad")
data = ScanpyEngine(locator, self.args)
self.stdAsserts(data)

def test_url_https(self):
url = "https://raw.githubusercontent.com/chanzuckerberg/cellxgene/master/example-dataset/pbmc3k.h5ad"
locator = DataLocator(url)
data = ScanpyEngine(locator, self.args)
self.stdAsserts(data)

def test_url_http(self):
url = "http://raw.githubusercontent.com/chanzuckerberg/cellxgene/master/example-dataset/pbmc3k.h5ad"
locator = DataLocator(url)
data = ScanpyEngine(locator, self.args)
self.stdAsserts(data)