Add URL data locators to launch sub-command #920

Merged 7 commits on Sep 15, 2019

20 changes: 20 additions & 0 deletions docs/faq.md
@@ -100,3 +100,23 @@
#### I'm following the developer instructions and get an error about "missing files and directories" when trying to build the client

This is likely because you do not have node and npm installed; we recommend using [nvm](https://github.com/creationix/nvm) if you're new to these tools.

# Data access

#### Can I use an _s3:_ or _gs:_ URL with `cellxgene launch`?

cellxgene currently uses the [fsspec](https://filesystem-spec.readthedocs.io/en/latest/index.html) package to
access remote data. Direct support for S3 and GCS is not enabled by default, but optional packages may be
installed.

If you wish to directly access S3 or GCS, install one or both of the following packages using `pip`:

- [s3fs](https://s3fs.readthedocs.io/en/latest/) for S3 support
- [gcsfs](https://gcsfs.readthedocs.io/en/latest/) for GCS support

For example:

```
pip install s3fs
cellxgene launch s3://mybucket.s3-us-west-2.amazonaws.com/mydata.h5ad
```
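
For context, fsspec picks the backend implementation from the URI scheme at runtime, which is why the extra packages are needed. Below is a minimal sketch of that resolution, assuming `s3fs` is installed and using a hypothetical bucket and key:

```
import fsspec

# The URI scheme ("s3", "gcs"/"gs", "https", ...) selects the filesystem
# implementation; "s3" needs the optional s3fs package, "gcs"/"gs" needs gcsfs.
# An unknown or uninstalled backend raises an error at this point.
fs = fsspec.filesystem("s3", anon=True)  # anonymous access to a public bucket

# hypothetical bucket and key
if fs.exists("mybucket/mydata.h5ad"):
    print(fs.size("mybucket/mydata.h5ad"))
```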
6 changes: 6 additions & 0 deletions docs/getting-started.md
@@ -21,6 +21,12 @@
cellxgene launch pbmc3k.h5ad --open
```

You can also specify a URL directly as the data source; the data will be downloaded during launch:

```
cellxgene launch https://raw.githubusercontent.com/chanzuckerberg/cellxgene/master/example-dataset/pbmc3k.h5ad
```

On Mac OS and Ubuntu, you should see your web browser open with the following

<img width="450" src="https://raw.githubusercontent.com/chanzuckerberg/cellxgene/master/docs/cellxgene-opening-screenshot.png" pad="50px">
2 changes: 1 addition & 1 deletion server/app/driver/driver.py
@@ -46,7 +46,7 @@ def features(self):
return features

@abstractmethod
def _load_data(self, data):
def _load_data(self, data_locator):
pass

@abstractmethod
20 changes: 12 additions & 8 deletions server/app/scanpy_engine/scanpy_engine.py
@@ -177,11 +177,16 @@ def _create_schema(self):
}
self.schema["layout"]["obs"].append(layout_schema)

def _load_data(self, data):
# as of AnnData 0.6.19, backed mode performs initial load fast, but at the
# cost of significantly slower access to X data.
def _load_data(self, data_locator):
try:
self.data = anndata.read_h5ad(data)
# there is no guarantee data_locator indicates a local file. The AnnData
# API will only consume local file objects. If we get a non-local object,
# make a copy in tmp, and delete it after we load into memory.
with data_locator.local_handle() as lh:
# as of AnnData 0.6.19, backed mode performs initial load fast, but at the
# cost of significantly slower access to X data.
self.data = anndata.read_h5ad(lh)

except ValueError:
raise ScanpyFileError(
"File must be in the .h5ad format. Please read "
@@ -191,12 +196,11 @@ def _load_data(self, data):
"information."
)
except MemoryError:
raise ScanpyFileError("Error while loading file: out of memory, file is too large"
" for memory available")
raise ScanpyFileError("Out of memory - file is too large for available memory.")
except Exception as e:
raise ScanpyFileError(
f"Error while loading file: {e}, File must be in the .h5ad format, please check "
f"that your input and try again."
f"{e} - file not found or is inaccessible. File must be an .h5ad object. "
f"Please check your input and try again."
)

@requires_data
84 changes: 84 additions & 0 deletions server/app/util/data_locator.py
@@ -0,0 +1,84 @@
import os
import tempfile
import fsspec


class DataLocator():
"""
DataLocator is a simple wrapper around fsspec functionality, and provides a
set of functions to encapsulate a data location (URI or path), interrogate
metadata about the object at that location (size, existence, etc.) and
access the underlying data.

https://filesystem-spec.readthedocs.io/en/latest/index.html

Example:
dl = DataLocator("/tmp/foo.h5ad")
if dl.exists():
print(dl.size())
with dl.open() as f:
thecontents = f.read()

DataLocator will accept a URI or native path. Error handling is as defined
in fsspec.

"""

def __init__(self, uri_or_path):
self.uri_or_path = uri_or_path
self.protocol, self.path = DataLocator._get_protocol_and_path(uri_or_path)
# work-around for LocalFileSystem not treating file: and None as the same scheme/protocol
self.cname = self.path if self.protocol == 'file' else self.uri_or_path
# will throw RuntimeError if the protocol is unsupported
self.fs = fsspec.filesystem(self.protocol)

@staticmethod
def _get_protocol_and_path(uri_or_path):
if "://" in uri_or_path:
protocol, path = uri_or_path.split("://", 1)
# windows!!! Ignore single letter drive identifiers,
# eg, G:\foo.txt
if len(protocol) > 1:
return protocol, path
return None, uri_or_path

def exists(self):
return self.fs.exists(self.cname)

def size(self):
return self.fs.size(self.cname)

def isfile(self):
return self.fs.isfile(self.cname)

def open(self, *args):
return self.fs.open(self.uri_or_path, *args)

def islocal(self):
return self.protocol is None or self.protocol == 'file'

def local_handle(self):
if self.islocal():
return LocalFilePath(self.path)

# if not local, create a tmp file system object to contain the data,
# and clean it up when done.
with self.open() as src, tempfile.NamedTemporaryFile(prefix="cellxgene_", delete=False) as tmp:
tmp.write(src.read())
tmp.close()
src.close()
tmp_path = tmp.name
return LocalFilePath(tmp_path, delete=True)


class LocalFilePath():
def __init__(self, tmp_path, delete=False):
self.tmp_path = tmp_path
self.delete = delete

def __enter__(self):
return self.tmp_path

def __exit__(self, *args):
if self.delete:
os.unlink(self.tmp_path)
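
A minimal usage sketch of the new `DataLocator`, exercising `local_handle()` against the same public URL used in the tests below; it assumes network access and that fsspec's HTTP backend dependencies are available:

```
from server.app.util.data_locator import DataLocator

url = "https://raw.githubusercontent.com/chanzuckerberg/cellxgene/master/example-dataset/pbmc3k.h5ad"

dl = DataLocator(url)
print(dl.islocal())   # False: the protocol is "https", not a local path
print(dl.exists())    # True if the object is reachable

# For remote data, local_handle() copies the object into a NamedTemporaryFile
# and yields its path, deleting the copy on exit; for local paths it yields
# the original path and deletes nothing.
with dl.local_handle() as path:
    with open(path, "rb") as f:
        magic = f.read(4)  # .h5ad files are HDF5, so this should be b"\x89HDF"
```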
35 changes: 26 additions & 9 deletions server/cli/launch.py
@@ -2,7 +2,7 @@
import functools
import logging
from os import devnull
from os.path import splitext, basename, getsize
from os.path import splitext, basename
import sys
import warnings
import webbrowser
@@ -13,6 +13,7 @@
from server.app.util.errors import ScanpyFileError
from server.app.util.utils import custom_format_warning
from server.utils.utils import find_available_port, is_port_available
from server.app.util.data_locator import DataLocator

# anything bigger than this will generate a special message
BIG_FILE_SIZE_THRESHOLD = 100 * 2 ** 20 # 100MB
@@ -63,7 +64,7 @@ def parse_engine_args(layout, obs_names, var_names, max_category_items, diffexp_lfc_cutoff):


@click.command()
@click.argument("data", metavar="<data file>", type=click.Path(exists=True, file_okay=True, dir_okay=False))
@click.argument("data", nargs=1, metavar="<data file>", required=True)
@click.option(
"--verbose",
"-v",
@@ -117,16 +118,32 @@ def launch(

> cellxgene launch example_dataset/pbmc3k.h5ad --title pbmc3k

> cellxgene launch <your data file> --title <your title>"""
> cellxgene launch <your data file> --title <your title>

> cellxgene launch <url>"""

e_args = parse_engine_args(layout, obs_names, var_names, max_category_items, diffexp_lfc_cutoff)
try:
data_locator = DataLocator(data)
except RuntimeError as re:
raise click.ClickException(f"Unable to access data at {data}. {str(re)}")

# Startup message
click.echo("[cellxgene] Starting the CLI...")

# Argument checking
name, extension = splitext(data)
if extension != ".h5ad":
raise click.FileError(basename(data), hint="file type must be .h5ad")
if data_locator.islocal():
# if data locator is local, apply file system conventions and other "cheap"
# validation checks. If a URI, defer until we actually fetch the data and
# try to read it. Many of these tests don't make sense for URIs (eg, extension-
# based typing).
if not data_locator.exists():
raise click.FileError(data, hint="file does not exist")
if not data_locator.isfile():
raise click.FileError(data, hint="data is not a file")
name, extension = splitext(data)
if extension != ".h5ad":
raise click.FileError(basename(data), hint="file type must be .h5ad")

if debug:
verbose = True
@@ -177,18 +194,18 @@
log = logging.getLogger("werkzeug")
log.setLevel(logging.ERROR)

file_size = getsize(data)
file_size = data_locator.size() if data_locator.islocal() else 0

# if a big file, let the user know it may take a while to load.
if file_size > BIG_FILE_SIZE_THRESHOLD:
click.echo(f"[cellxgene] Loading data from {basename(data)}, this may take awhile...")
click.echo(f"[cellxgene] Loading data from {basename(data)}, this may take a while...")
else:
click.echo(f"[cellxgene] Loading data from {basename(data)}.")

from server.app.scanpy_engine.scanpy_engine import ScanpyEngine

try:
server.attach_data(ScanpyEngine(data, e_args), title=title)
server.attach_data(ScanpyEngine(data_locator, e_args), title=title)
except ScanpyFileError as e:
raise click.ClickException(f"{e}")

1 change: 1 addition & 0 deletions server/requirements.txt
@@ -6,6 +6,7 @@ Flask-Compress>=1.4.0
Flask-Cors>=3.0.6
Flask-RESTful>=0.3.6
flatbuffers>=1.10.0
fsspec>=0.4.4
numpy>=1.15.2
pandas>=0.23.1
scipy>=1.1.0
5 changes: 3 additions & 2 deletions server/test/test_nan_scanpy_engine.py
@@ -7,6 +7,7 @@

from server.app.scanpy_engine.scanpy_engine import ScanpyEngine
from server.app.util.errors import FilterError
from server.app.util.data_locator import DataLocator


class NaNTest(unittest.TestCase):
@@ -20,12 +21,12 @@ def setUp(self):
}
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=UserWarning)
self.data = ScanpyEngine("server/test/test_datasets/nan.h5ad", self.args)
self.data = ScanpyEngine(DataLocator("server/test/test_datasets/nan.h5ad"), self.args)
self.data._create_schema()

def test_load(self):
with self.assertWarns(UserWarning):
ScanpyEngine("server/test/test_datasets/nan.h5ad", self.args)
ScanpyEngine(DataLocator("server/test/test_datasets/nan.h5ad"), self.args)

def test_init(self):
self.assertEqual(self.data.cell_count, 100)
3 changes: 2 additions & 1 deletion server/test/test_scanpy_engine.py
@@ -10,6 +10,7 @@

from server.app.scanpy_engine.scanpy_engine import ScanpyEngine
from server.app.util.errors import FilterError
from server.app.util.data_locator import DataLocator


class EngineTest(unittest.TestCase):
@@ -22,7 +23,7 @@ def setUp(self):
"var_names": None,
"diffexp_lfc_cutoff": 0.01,
}
self.data = ScanpyEngine("example-dataset/pbmc3k.h5ad", args)
self.data = ScanpyEngine(DataLocator("example-dataset/pbmc3k.h5ad"), args)

def test_init(self):
self.assertEqual(self.data.cell_count, 2638)
3 changes: 2 additions & 1 deletion server/test/test_scanpy_engine_csc.py
@@ -10,6 +10,7 @@

from server.app.scanpy_engine.scanpy_engine import ScanpyEngine
from server.app.util.errors import FilterError
from server.app.util.data_locator import DataLocator


class EngineTest(unittest.TestCase):
@@ -21,7 +22,7 @@ def setUp(self):
"var_names": None,
"diffexp_lfc_cutoff": 0.01,
}
self.data = ScanpyEngine("server/test/test_datasets/pbmc3k-CSC-gz.h5ad", args)
self.data = ScanpyEngine(DataLocator("server/test/test_datasets/pbmc3k-CSC-gz.h5ad"), args)

def test_init(self):
self.assertEqual(self.data.cell_count, 2638)
3 changes: 2 additions & 1 deletion server/test/test_scanpy_engine_csr.py
@@ -10,6 +10,7 @@

from server.app.scanpy_engine.scanpy_engine import ScanpyEngine
from server.app.util.errors import FilterError
from server.app.util.data_locator import DataLocator


class EngineTest(unittest.TestCase):
@@ -21,7 +22,7 @@ def setUp(self):
"var_names": None,
"diffexp_lfc_cutoff": 0.01,
}
self.data = ScanpyEngine("server/test/test_datasets/pbmc3k-CSR-gz.h5ad", args)
self.data = ScanpyEngine(DataLocator("server/test/test_datasets/pbmc3k-CSR-gz.h5ad"), args)

def test_init(self):
self.assertEqual(self.data.cell_count, 2638)
44 changes: 41 additions & 3 deletions server/test/test_scanpy_engine_data_load.py
@@ -3,11 +3,15 @@

from server.app.scanpy_engine.scanpy_engine import ScanpyEngine
from server.app.util.errors import DriverError
from server.app.util.data_locator import DataLocator


class DataLoadEngineTest(unittest.TestCase):
"""
Test file loading, including deferred loading/update.
"""
def setUp(self):
self.data_file = "example-dataset/pbmc3k.h5ad"
self.data_file = DataLocator("example-dataset/pbmc3k.h5ad")
self.data = ScanpyEngine()

def test_init(self):
@@ -45,5 +49,39 @@ def test_diffexp_topN(self):
result = json.loads(self.data.diffexp_topN(f1["filter"], f2["filter"], 20))
self.assertEqual(len(result), 20)

if __name__ == "__main__":
unittest.main()

class DataLocatorEngineTest(unittest.TestCase):
"""
Test various types of data locators we expect to consume
"""
def setUp(self):
self.args = {
"layout": ["umap"],
"max_category_items": 100,
"obs_names": None,
"var_names": None,
"diffexp_lfc_cutoff": 0.01,
}

def stdAsserts(self, data):
""" run these each time we load the data """
self.assertIsNotNone(data)
self.assertEqual(data.cell_count, 2638)
self.assertEqual(data.gene_count, 1838)

def test_posix_file(self):
locator = DataLocator("example-dataset/pbmc3k.h5ad")
data = ScanpyEngine(locator, self.args)
self.stdAsserts(data)

def test_url_https(self):
url = "https://raw.githubusercontent.com/chanzuckerberg/cellxgene/master/example-dataset/pbmc3k.h5ad"
locator = DataLocator(url)
data = ScanpyEngine(locator, self.args)
self.stdAsserts(data)

def test_url_http(self):
url = "http://raw.githubusercontent.com/chanzuckerberg/cellxgene/master/example-dataset/pbmc3k.h5ad"
locator = DataLocator(url)
data = ScanpyEngine(locator, self.args)
self.stdAsserts(data)