-
Notifications
You must be signed in to change notification settings - Fork 39
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
273 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
"""Interfaces for data storage backend.""" | ||
|
||
class Cursor:
    """Interface that abstracts the cursor object returned from databases."""

    def __init__(self):
        pass

    def count(self):
        """
        Return the number of items in this cursor.

        :return: int
        :raises NotImplementedError: must be overridden by a subclass.
        """
        # Bug fix: the original `raise NotImplemented()` raised a TypeError
        # ('NotImplementedType' object is not callable) because
        # NotImplemented is a constant, not an exception class.
        raise NotImplementedError()

    def __iter__(self):
        """Iterate over the items of this cursor.

        :raises NotImplementedError: must be overridden by a subclass.
        """
        raise NotImplementedError()
|
||
|
||
class DataStorage:
    """
    Interface for data backends.

    Defines the API for various data stores --- databases, file stores,
    etc. --- that sacred supports.
    """

    def __init__(self):
        pass

    def get_run(self, run_id):
        """
        Return the run associated with the id.

        :param run_id: id of the run to fetch.
        :raises NotImplementedError: must be overridden by a subclass.
        """
        # Bug fix: `raise NotImplemented()` raised a TypeError because
        # NotImplemented is a constant, not an exception class.
        raise NotImplementedError()

    def get_runs(self, sort_by=None, sort_direction=None,
                 start=0, limit=None, query=None):
        """
        Return all runs that match the query.

        :param query: filter specification; effective default is
            ``{"type": "and", "filters": []}`` (i.e. match everything).
        :raises NotImplementedError: must be overridden by a subclass.
        """
        # Bug fix: the original used a mutable dict as the default value
        # for `query`, which is shared between all calls. Use None as the
        # sentinel and let implementations substitute the empty query.
        raise NotImplementedError()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
"""Implements backend storage interface for sacred's file store.""" | ||
|
||
import datetime | ||
import os | ||
import json | ||
|
||
from sacredboard.app.data.datastorage import Cursor, DataStorage | ||
|
||
CONFIG_JSON = "config.json" | ||
RUN_JSON = "run.json" | ||
INFO_JSON = "info.json" | ||
|
||
|
||
def _path_to_file(basepath, run_id, file_name): | ||
return os.path.join(basepath, str(run_id), file_name) | ||
|
||
|
||
def _path_to_config(basepath, run_id):
    """Return the path to the run's config.json file."""
    return _path_to_file(basepath, run_id, CONFIG_JSON)
|
||
|
||
def _path_to_info(basepath, run_id):
    """Return the path to the run's info.json file."""
    return _path_to_file(basepath, run_id, INFO_JSON)
|
||
|
||
def _path_to_run(basepath, run_id):
    """Return the path to the run's run.json file."""
    # Consistency fix: the sibling helpers (_path_to_config, _path_to_info)
    # all route through _path_to_file; this one hand-rolled os.path.join.
    return _path_to_file(basepath, run_id, RUN_JSON)
|
||
|
||
def _read_json(path_to_json): | ||
with open(path_to_json) as f: | ||
return json.load(f) | ||
|
||
|
||
def _create_run(run_id, runjson, configjson, infojson): | ||
runjson["_id"] = run_id | ||
runjson["config"] = configjson | ||
runjson["info"] = infojson | ||
|
||
# TODO probably want a smarter way of detecting | ||
# which values have type "time." | ||
for k in ["start_time", "stop_time", "heartbeat"]: | ||
runjson[k] = datetime.datetime.strptime(runjson[k], | ||
'%Y-%m-%dT%H:%M:%S.%f') | ||
return runjson | ||
|
||
|
||
class FileStoreCursor(Cursor):
    """Implements the cursor for file stores."""

    def __init__(self, count, iterable):
        """
        :param count: number of runs this cursor reports via count().
        :param iterable: iterable of run dicts to iterate over.
        """
        # Fix: the original skipped the base-class initializer; always
        # chain to super() in a subclass __init__.
        super().__init__()
        self.iterable = iterable
        self._count = count

    def count(self):
        """
        Return the number of runs in this query.

        :return: int
        """
        return self._count

    def __iter__(self):
        """Iterate over the runs supplied at construction time."""
        return iter(self.iterable)
|
||
|
||
class FileStorage(DataStorage):
    """Object to interface with one of sacred's file stores."""

    # Directory entries in the file store that are not runs.
    _NOT_A_RUN = frozenset({"_sources"})

    def __init__(self, path_to_dir):
        """
        :param path_to_dir: path to the file store ("~" is expanded).
        """
        super().__init__()
        self.path_to_dir = os.path.expanduser(path_to_dir)

    def get_run(self, run_id):
        """
        Return the run associated with a particular `run_id`.

        :param run_id: id of the run (the name of its directory).
        :return: dict
        :raises FileNotFoundError: when one of the run's JSON files
            is missing.
        """
        config = _read_json(_path_to_config(self.path_to_dir, run_id))
        run = _read_json(_path_to_run(self.path_to_dir, run_id))
        info = _read_json(_path_to_info(self.path_to_dir, run_id))
        return _create_run(run_id, run, config, info)

    def get_runs(self, sort_by=None, sort_direction=None,
                 start=0, limit=None, query=None):
        """
        Return all runs in the file store.

        If a run is corrupt --- e.g. missing files --- it is skipped.

        :param sort_by: NotImplemented
        :param sort_direction: NotImplemented
        :param start: NotImplemented
        :param limit: NotImplemented
        :param query: NotImplemented; effective default is
            {"type": "and", "filters": []}. (Fix: the original used a
            mutable dict as the default argument, shared between calls.)
        :return: FileStoreCursor
        """
        # Fix: exclude non-run entries (e.g. "_sources") up front so that
        # the reported count agrees with what the iterator yields; the
        # original counted every directory entry including the blacklist.
        run_ids = [run_id for run_id in os.listdir(self.path_to_dir)
                   if run_id not in self._NOT_A_RUN]

        def run_iterator():
            for run_id in run_ids:
                try:
                    yield self.get_run(run_id)
                except FileNotFoundError:
                    # An incomplete experiment is a corrupt experiment.
                    # Skip it for now.
                    pass

        # NOTE(review): count may still overstate the number of yielded
        # runs when corrupt runs are skipped by the iterator.
        count = len(run_ids)
        return FileStoreCursor(count, run_iterator())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
"""Tests for the file storage backend.""" | ||
|
||
# coding=utf-8 | ||
import bson | ||
import pytest | ||
import json | ||
import tempfile | ||
import os | ||
|
||
from sacredboard.app.data.filestorage import FileStorage | ||
|
||
|
||
def create_tmp_datastore():
    """
    Create a temporary directory emulating sacred's file store layout.

    Rather than mocking the file system, this actually creates some temporary
    files that emulate the file store system in Sacred. Unfortunately, Sacred
    and Sacredboard are completely decoupled, which makes it impossible to
    ensure that this standard is upheld throughout the sacred system.

    :return: path to the temporary file store directory.
    """
    config = {"length": None, "n_input": 255, "batch_size": None,
              "dataset_path": "./german-nouns.hdf5", "validation_ds": "validation",
              "log_dir": "./log/rnn500_dropout0.5_lrate1e-4_minibatch_1000steps",
              "seed": 144363069, "dropout_keep_probability": 0.5,
              "max_character_ord": 255, "training_ds": "training", "num_classes": 3,
              "training_steps": 1000, "learning_rate": 0.0001, "hidden_size": 500}

    run = {"status": "COMPLETED",
           "_id": "57f9efb2e4b8490d19d7c30e",
           "resources": [],
           "host": {"os": "Linux",
                    "os_info": "Linux-3.16.0-38-generic-x86_64-with-LinuxMint-17.2-rafaela",
                    "cpu": "Intel(R) Core(TM) i3 CPU M 370 @ 2.40GHz",
                    "python_version": "3.4.3",
                    "python_compiler": "GCC 4.8.4",
                    "cpu_count": 4,
                    "hostname": "ntbacer"},
           "experiment": {"doc": None, "sources": [[
               "/home/martin/mnt/noun-classification/train_model.py",
               "86aaa9b81d6e32a181598ed78bb1d7a1"]],
               "dependencies": [["h5py", "2.6.0"],
                                ["numpy", "1.11.2"],
                                ["sacred", "0.6.10"]],
               "name": "German nouns"},
           "result": 2403.52, "artifacts": [], "comment": "",
           # N.B. time formatting is different between mongodb and file store.
           "start_time": "2017-06-02T07:13:05.305845",
           "stop_time": "2017-06-02T07:14:02.455460",
           "heartbeat": "2017-06-02T07:14:02.452597",
           "captured_out": "Output: \n"}
    info = {"info": "present"}

    experiment_dir = tempfile.mkdtemp()
    run_dir = os.path.join(experiment_dir, "42")  # experiment number 42
    os.mkdir(run_dir)

    # Write each payload to its conventionally-named JSON file.
    for file_name, payload in (("config.json", config),
                               ("run.json", run),
                               ("info.json", info)):
        with open(os.path.join(run_dir, file_name), 'w') as json_file:
            json.dump(payload, json_file)

    return experiment_dir
|
||
@pytest.fixture
def tmpfilestore() -> FileStorage:
    """Fixture that prepares a file store in /tmp for dependency injection."""
    # Fix: the original local was named `dir`, shadowing the builtin.
    store_path = create_tmp_datastore()
    return FileStorage(store_path)
|
||
def test_get_run(tmpfilestore: FileStorage):
    """Tests the get_run function."""
    run42 = tmpfilestore.get_run(42)

    expected_keys = ("info", "resources", "host", "experiment", "result",
                     "artifacts", "comment", "start_time", "stop_time",
                     "heartbeat", "captured_out", "config")
    for expected_key in expected_keys:
        assert expected_key in run42
|
||
def test_get_runs(tmpfilestore: FileStorage):
    """Tests the get_runs function."""
    runs = list(tmpfilestore.get_runs())

    # The fixture store contains exactly one (well-formed) run.
    assert len(runs) == 1

    only_run = runs[0]
    expected_keys = ("info", "resources", "host", "experiment", "result",
                     "artifacts", "comment", "start_time", "stop_time",
                     "heartbeat", "captured_out", "config")
    for expected_key in expected_keys:
        assert expected_key in only_run