From 5ecdcf4015f828c80b662788cb3d7207bb21a855 Mon Sep 17 00:00:00 2001 From: Gideon Dresdner Date: Sat, 27 May 2017 19:51:32 +0200 Subject: [PATCH 1/6] abstract data interface Abstract the interface for data storage from anything mongodb specific, thereby allowing for developers to more easily incorporate other backends such as file storage and tinydb. Basic implementation for file storage with corresponding unit tests. --- sacredboard/app/data/datastorage.py | 20 ++++++ sacredboard/app/data/filestorage.py | 64 +++++++++++++++++ sacredboard/app/data/mongodb.py | 17 ++++- sacredboard/bootstrap.py | 18 +++-- sacredboard/tests/data/test_filestorage.py | 81 ++++++++++++++++++++++ 5 files changed, 194 insertions(+), 6 deletions(-) create mode 100644 sacredboard/app/data/datastorage.py create mode 100644 sacredboard/app/data/filestorage.py mode change 100644 => 100755 sacredboard/bootstrap.py create mode 100644 sacredboard/tests/data/test_filestorage.py diff --git a/sacredboard/app/data/datastorage.py b/sacredboard/app/data/datastorage.py new file mode 100644 index 0000000..a88c498 --- /dev/null +++ b/sacredboard/app/data/datastorage.py @@ -0,0 +1,20 @@ +class Cursor: + def __init__(self): + pass + + def count(self): + raise NotImplemented() + + def __iter__(self): + raise NotImplemented() + +class DataStorage: + def __init__(self): + pass + + def get_run(self, run_id): + raise NotImplemented() + + def get_runs(self, sort_by=None, sort_direction=None, + start=0, limit=None, query={"type": "and", "filters": []}): + raise NotImplemented() diff --git a/sacredboard/app/data/filestorage.py b/sacredboard/app/data/filestorage.py new file mode 100644 index 0000000..35ea62f --- /dev/null +++ b/sacredboard/app/data/filestorage.py @@ -0,0 +1,64 @@ +import datetime +import os +import json + +from sacredboard.app.data.datastorage import Cursor, DataStorage + +config_json = "config.json" +run_json = "run.json" + +def _path_to_config(basepath, run_id): + return os.path.join(basepath, str(run_id), config_json) + +def _path_to_run(basepath, run_id): + return os.path.join(basepath, str(run_id), run_json) + +def _read_json(path_to_json): + with open(path_to_json) as f: + return json.load(f) + +def _create_run(runjson, configjson): + runjson["config"] = configjson + + # TODO probably want a smarter way of detecting which values have type "time." + for k in ["start_time", "stop_time", "heartbeat"]: + runjson[k] = datetime.datetime.strptime(runjson[k], '%Y-%m-%dT%H:%M:%S.%f') + + return runjson + +class FileStoreCursor(Cursor): + def __init__(self, count, iterable): + self.iterable = iterable + self._count = count + + def count(self): + return self._count + + def __iter__(self): + return iter(self.iterable) + +class FileStorage(DataStorage): + def __init__(self, path_to_dir): + super().__init__() + self.path_to_dir = os.path.expanduser(path_to_dir) + + def get_run(self, run_id): + config = _read_json(_path_to_config(self.path_to_dir, run_id)) + run = _read_json(_path_to_run(self.path_to_dir, run_id)) + return _create_run(run, config) + + def get_runs(self, sort_by=None, sort_direction=None, + start=0, limit=None, query={"type": "and", "filters": []}): + + all_run_ids = os.listdir(self.path_to_dir) + + def run_iterator(): + blacklist = set(["_sources"]) + for id in all_run_ids: + if id in blacklist: + continue + + yield self.get_run(id) + + count = len(all_run_ids) + return FileStoreCursor(count, run_iterator()) diff --git a/sacredboard/app/data/mongodb.py b/sacredboard/app/data/mongodb.py index 530b76a..995ebbf 100644 --- a/sacredboard/app/data/mongodb.py +++ b/sacredboard/app/data/mongodb.py @@ -3,8 +3,19 @@ import bson import pymongo +from sacredboard.app.data.datastorage import Cursor, DataStorage -class PyMongoDataAccess: +class MongoDbCursor(Cursor): + def __init__(self, mongodb_cursor): + self.mongodb_cursor = mongodb_cursor + + def count(self): + return self.mongodb_cursor.count() + + def __iter__(self): + return self.mongodb_cursor + +class PyMongoDataAccess(DataStorage): """Access records in MongoDB.""" RUNNING_DEAD_RUN_CLAUSE = { @@ -19,6 +30,7 @@ def __init__(self, uri, database_name, collection_name): Better use the static methods build_data_access or build_data_access_with_uri """ + super().__init__() self._uri = uri self._db_name = database_name self._client = None @@ -72,7 +84,8 @@ def get_runs(self, sort_by=None, sort_direction=None, cursor = cursor.skip(start) if limit is not None: cursor = cursor.limit(limit) - return cursor + + return MongoDbCursor(cursor) def get_run(self, run_id): try: diff --git a/sacredboard/bootstrap.py b/sacredboard/bootstrap.py old mode 100644 new mode 100755 index d0d8b6c..ccf06e0 --- a/sacredboard/bootstrap.py +++ b/sacredboard/bootstrap.py @@ -12,6 +12,7 @@ from gevent.pywsgi import WSGIServer from sacredboard.app.config import jinja_filters +from sacredboard.app.data.filestorage import FileStorage from sacredboard.app.data.mongodb import PyMongoDataAccess from sacredboard.app.webapi import routes @@ -35,13 +36,15 @@ "You might need it if you use a custom collection name " "or Sacred v0.6 (which used default.runs). " "Default: runs") +@click.option("-F", default="", + help="Path to directory containing experiments.") @click.option("--no-browser", is_flag=True, default=False, help="Do not open web browser automatically.") @click.option("--debug", is_flag=True, default=False, help="Run the application in Flask debug mode " "(for development).") @click.version_option() -def run(debug, no_browser, m, mu, mc): +def run(debug, no_browser, m, mu, mc, f): """ Sacredboard. @@ -76,12 +79,20 @@ def run(debug, no_browser, m, mu, mc): Note: MongoDB must be listening on localhost. """ - add_mongo_config(app, m, mu, mc) + + if m: + add_mongo_config(app, m, mu, mc) + app.config["data"].connect() + elif f: + app.config["data"] = FileStorage(f) + else: + print("Must specify either a mongodb instance or a path to a file storage.") + app.config['DEBUG'] = debug app.debug = debug jinja_filters.setup_filters(app) routes.setup_routes(app) - app.config["data"].connect() + if debug: app.run(host="0.0.0.0", debug=True) else: @@ -98,7 +109,6 @@ def run(debug, no_browser, m, mu, mc): http_server.serve_forever() break - def add_mongo_config(app, simple_connection_string, mongo_uri, collection_name): """ diff --git a/sacredboard/tests/data/test_filestorage.py b/sacredboard/tests/data/test_filestorage.py new file mode 100644 index 0000000..dda165c --- /dev/null +++ b/sacredboard/tests/data/test_filestorage.py @@ -0,0 +1,81 @@ +# coding=utf-8 +import bson +import pytest +import json +import tempfile +import os + +from sacredboard.app.data.filestorage import FileStorage + + +def create_tmp_datastore(): + ''' + Rather than mocking the file system, this actually creates some temporary files that emulate the file store system + in Sacred. Unfortunately, Sacred and Sacredboard are completely decoupled, which makes it impossible to ensure that + this standard is upheld throughout the sacred system. + ''' + config = {"length": None, "n_input": 255, "batch_size": None, + "dataset_path": "./german-nouns.hdf5", "validation_ds": "validation", + "log_dir": "./log/rnn500_dropout0.5_lrate1e-4_minibatch_1000steps", + "seed": 144363069, "dropout_keep_probability": 0.5, + "max_character_ord": 255, "training_ds": "training", "num_classes": 3, + "training_steps": 1000, "learning_rate": 0.0001, "hidden_size": 500} + + run = {"status": "COMPLETED", + "_id": "57f9efb2e4b8490d19d7c30e", + "info": {}, "resources": [], + "host": {"os": "Linux", + "os_info": "Linux-3.16.0-38-generic-x86_64-with-LinuxMint-17.2-rafaela", + "cpu": "Intel(R) Core(TM) i3 CPU M 370 @ 2.40GHz", + "python_version": "3.4.3", + "python_compiler": "GCC 4.8.4", + "cpu_count": 4, + "hostname": "ntbacer"}, + "experiment": {"doc": None, "sources": [[ + "/home/martin/mnt/noun-classification/train_model.py", + "86aaa9b81d6e32a181598ed78bb1d7a1"]], + "dependencies": [["h5py", "2.6.0"], + ["numpy", "1.11.2"], + ["sacred", "0.6.10"]], + "name": "German nouns"}, + "result": 2403.52, "artifacts": [], "comment": "", + # N.B. time formatting is different between mongodb and file store. + "start_time": "2017-06-02T07:13:05.305845", + "stop_time": "2017-06-02T07:14:02.455460", + "heartbeat": "2017-06-02T07:14:02.452597", + "captured_out": "Output: \n"} + + experiment_dir = tempfile.mkdtemp() + experiment42 = os.path.join(experiment_dir, "42") # experiment number 42 + os.mkdir(experiment42) + + with open(os.path.join(experiment42, "config.json"), 'w') as config_file: + json.dump(config, config_file) + + with open(os.path.join(experiment42, "run.json"), 'w') as run_file: + json.dump(run, run_file) + + return experiment_dir + +@pytest.fixture +def tmpfilestore() -> FileStorage: + dir = create_tmp_datastore() + return FileStorage(dir) + +def test_get_run(tmpfilestore : FileStorage): + run42 = tmpfilestore.get_run(42) + + for key in ["info", "resources", "host", "experiment", "result", "artifacts", "comment", "start_time", "stop_time", + "heartbeat", "captured_out", "config"]: + assert key in run42 + +def test_get_runs(tmpfilestore : FileStorage): + runs = tmpfilestore.get_runs() + runs = list(runs) + + assert 1 == len(runs) + + run = runs[0] + for key in ["info", "resources", "host", "experiment", "result", "artifacts", "comment", "start_time", "stop_time", + "heartbeat", "captured_out", "config"]: + assert key in run From ecc2caa96f7901a0196d06781bc14bfb480aab3e Mon Sep 17 00:00:00 2001 From: Gideon Dresdner Date: Tue, 6 Jun 2017 08:25:50 +0200 Subject: [PATCH 2/6] condition on (`m` or `mu`), not just `m` --- sacredboard/bootstrap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sacredboard/bootstrap.py b/sacredboard/bootstrap.py index ccf06e0..ce947be 100755 --- a/sacredboard/bootstrap.py +++ b/sacredboard/bootstrap.py @@ -80,7 +80,7 @@ def run(debug, no_browser, m, mu, mc, f): """ - if m: + if m or mu: add_mongo_config(app, m, mu, mc) app.config["data"].connect() elif f: From 2e3cd4df61c155f7d4f7030f909f498af12d1956 Mon Sep 17 00:00:00 2001 From: Gideon Dresdner Date: Tue, 6 Jun 2017 14:41:47 +0200 Subject: [PATCH 3/6] changes recommended by flake8 linter --- sacredboard/app/data/datastorage.py | 1 + sacredboard/app/data/filestorage.py | 13 ++++++++++--- sacredboard/app/data/mongodb.py | 21 ++++++++++++--------- sacredboard/bootstrap.py | 4 +++- 4 files changed, 26 insertions(+), 13 deletions(-) mode change 100644 => 100755 sacredboard/app/data/mongodb.py diff --git a/sacredboard/app/data/datastorage.py b/sacredboard/app/data/datastorage.py index a88c498..c8590e3 100644 --- a/sacredboard/app/data/datastorage.py +++ b/sacredboard/app/data/datastorage.py @@ -8,6 +8,7 @@ def count(self): def __iter__(self): raise NotImplemented() + class DataStorage: def __init__(self): pass diff --git a/sacredboard/app/data/filestorage.py b/sacredboard/app/data/filestorage.py index 35ea62f..699fa15 100644 --- a/sacredboard/app/data/filestorage.py +++ b/sacredboard/app/data/filestorage.py @@ -7,25 +7,31 @@ config_json = "config.json" run_json = "run.json" + def _path_to_config(basepath, run_id): return os.path.join(basepath, str(run_id), config_json) + def _path_to_run(basepath, run_id): return os.path.join(basepath, str(run_id), run_json) + def _read_json(path_to_json): with open(path_to_json) as f: return json.load(f) + def _create_run(runjson, configjson): runjson["config"] = configjson - # TODO probably want a smarter way of detecting which values have type "time." + # TODO probably want a smarter way of detecting + # which values have type "time." for k in ["start_time", "stop_time", "heartbeat"]: - runjson[k] = datetime.datetime.strptime(runjson[k], '%Y-%m-%dT%H:%M:%S.%f') - + runjson[k] = datetime.datetime.strptime(runjson[k], + '%Y-%m-%dT%H:%M:%S.%f') return runjson + class FileStoreCursor(Cursor): def __init__(self, count, iterable): self.iterable = iterable @@ -37,6 +43,7 @@ def count(self): def __iter__(self): return iter(self.iterable) + class FileStorage(DataStorage): def __init__(self, path_to_dir): super().__init__() diff --git a/sacredboard/app/data/mongodb.py b/sacredboard/app/data/mongodb.py old mode 100644 new mode 100755 index 995ebbf..97fe78c --- a/sacredboard/app/data/mongodb.py +++ b/sacredboard/app/data/mongodb.py @@ -5,6 +5,7 @@ from sacredboard.app.data.datastorage import Cursor, DataStorage + class MongoDbCursor(Cursor): def __init__(self, mongodb_cursor): self.mongodb_cursor = mongodb_cursor @@ -15,6 +16,7 @@ def count(self): def __iter__(self): return self.mongodb_cursor + class PyMongoDataAccess(DataStorage): """Access records in MongoDB.""" @@ -26,7 +28,7 @@ class PyMongoDataAccess(DataStorage): def __init__(self, uri, database_name, collection_name): """ Set up MongoDB access layer, don't connect yet. - + Better use the static methods build_data_access or build_data_access_with_uri """ @@ -50,20 +52,20 @@ def get_runs(self, sort_by=None, sort_direction=None, start=0, limit=None, query={"type": "and", "filters": []}): """ Return multiple runs (default all). - + The query format (optional): {"type": "and", "filters": [ {"field": "host.hostname", "operator": "==", "value": "ntbacer"}, - {"type": "or", "filters": [ + {"type": "or", "filters": [ {"field": "result", "operator": "==", "value": 2403.52}, {"field": "host.python_version", "operator": "==", "value":"3.5.2"} ]}]} - + The parameter is built from clauses. Each clause is either conjunctive (``"and"``), disjunctive (``"or"``), or a *terminal clause*. Each of the the earlier two types must specify - the ``"filters`` array of other clauses to be joined together + the ``"filters`` array of other clauses to be joined together by that logical connective (and/or). A terminal clause does not specifies its type, instead, @@ -72,9 +74,10 @@ def get_runs(self, sort_by=None, sort_direction=None, using dot notation to access nested documents), the ``operator`` (one of ``"=="``, ``"!="``, ``"<"``, ``"<="``, ``">``, ``">="``, and ``"regex"`` for regular expressions). - The ``value`` field contains the value to be compared with (either a string or a number). - Notice that for the ``status`` field, the ``RUNNING`` and ``DEAD`` runs - are compared by :func:`~PyMongoDataAccess.RUNNING_DEAD_RUN_CLAUSE` and + The ``value`` field contains the value to be compared with (either a + string or a number). Notice that for the ``status`` field, the + ``RUNNING`` and ``DEAD`` runs are compared by + :func:`~PyMongoDataAccess.RUNNING_DEAD_RUN_CLAUSE` and :func:`~PyMongoDataAccess.RUNNING_NOT_DEAD_CLAUSE` """ mongo_query = self._to_mongo_query(query) @@ -119,7 +122,7 @@ def _to_mongo_query(query): Takes a query in format {"type": "and", "filters": [ {"field": "host.hostname", "operator": "==", "value": "ntbacer"}, - {"type": "or", "filters": [ + {"type": "or", "filters": [ {"field": "result", "operator": "==", "value": 2403.52}, {"field": "host.python_version", "operator": "==", "value":"3.5.2"} ]}]} diff --git a/sacredboard/bootstrap.py b/sacredboard/bootstrap.py index ce947be..40989ec 100755 --- a/sacredboard/bootstrap.py +++ b/sacredboard/bootstrap.py @@ -86,7 +86,8 @@ def run(debug, no_browser, m, mu, mc, f): elif f: app.config["data"] = FileStorage(f) else: - print("Must specify either a mongodb instance or a path to a file storage.") + print("Must specify either a mongodb instance or \ + a path to a file storage.") app.config['DEBUG'] = debug app.debug = debug @@ -109,6 +110,7 @@ def run(debug, no_browser, m, mu, mc, f): http_server.serve_forever() break + def add_mongo_config(app, simple_connection_string, mongo_uri, collection_name): """ From d442878e829ad79611b6245bf665c1b0528856dd Mon Sep 17 00:00:00 2001 From: Martin Chovanec Date: Tue, 6 Jun 2017 16:12:43 +0200 Subject: [PATCH 4/6] Fix bootstrap to take the -F arg into account. --- sacredboard/bootstrap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sacredboard/bootstrap.py b/sacredboard/bootstrap.py index 40989ec..1d2ece4 100755 --- a/sacredboard/bootstrap.py +++ b/sacredboard/bootstrap.py @@ -80,7 +80,7 @@ def run(debug, no_browser, m, mu, mc, f): """ - if m or mu: + if m or mu != (None, None): add_mongo_config(app, m, mu, mc) app.config["data"].connect() elif f: From c8f3c88d88071372863161b4cbad6f6878b37b56 Mon Sep 17 00:00:00 2001 From: Martin Chovanec Date: Tue, 6 Jun 2017 16:13:09 +0200 Subject: [PATCH 5/6] Read experiment "Info" from the info.json file. --- sacredboard/app/data/filestorage.py | 24 ++++++++++++++++------ sacredboard/tests/data/test_filestorage.py | 4 ++++ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/sacredboard/app/data/filestorage.py b/sacredboard/app/data/filestorage.py index 699fa15..62119c1 100644 --- a/sacredboard/app/data/filestorage.py +++ b/sacredboard/app/data/filestorage.py @@ -4,16 +4,25 @@ from sacredboard.app.data.datastorage import Cursor, DataStorage -config_json = "config.json" -run_json = "run.json" +CONFIG_JSON = "config.json" +RUN_JSON = "run.json" +INFO_JSON = "info.json" + + +def _path_to_file(basepath, run_id, file_name): + return os.path.join(basepath, str(run_id), file_name) def _path_to_config(basepath, run_id): - return os.path.join(basepath, str(run_id), config_json) + return _path_to_file(basepath, str(run_id), CONFIG_JSON) + + +def _path_to_info(basepath, run_id): + return _path_to_file(basepath, str(run_id), INFO_JSON) def _path_to_run(basepath, run_id): - return os.path.join(basepath, str(run_id), run_json) + return os.path.join(basepath, str(run_id), RUN_JSON) def _read_json(path_to_json): @@ -21,8 +30,10 @@ def _read_json(path_to_json): return json.load(f) -def _create_run(runjson, configjson): +def _create_run(run_id, runjson, configjson, infojson): + runjson["_id"] = run_id runjson["config"] = configjson + runjson["info"] = infojson # TODO probably want a smarter way of detecting # which values have type "time." @@ -52,7 +63,8 @@ def __init__(self, path_to_dir): def get_run(self, run_id): config = _read_json(_path_to_config(self.path_to_dir, run_id)) run = _read_json(_path_to_run(self.path_to_dir, run_id)) - return _create_run(run, config) + info = _read_json(_path_to_info(self.path_to_dir, run_id)) + return _create_run(run_id, run, config, info) def get_runs(self, sort_by=None, sort_direction=None, start=0, limit=None, query={"type": "and", "filters": []}): diff --git a/sacredboard/tests/data/test_filestorage.py b/sacredboard/tests/data/test_filestorage.py index dda165c..4cd0d60 100644 --- a/sacredboard/tests/data/test_filestorage.py +++ b/sacredboard/tests/data/test_filestorage.py @@ -44,6 +44,7 @@ def create_tmp_datastore(): "stop_time": "2017-06-02T07:14:02.455460", "heartbeat": "2017-06-02T07:14:02.452597", "captured_out": "Output: \n"} + info = {"info": "present"} experiment_dir = tempfile.mkdtemp() experiment42 = os.path.join(experiment_dir, "42") # experiment number 42 @@ -55,6 +56,9 @@ def create_tmp_datastore(): with open(os.path.join(experiment42, "run.json"), 'w') as run_file: json.dump(run, run_file) + with open(os.path.join(experiment42, "info.json"), 'w') as info_file: + json.dump(info, info_file) + return experiment_dir @pytest.fixture From ec5633978f3a2cbfc9223c72e511a53ddd73084d Mon Sep 17 00:00:00 2001 From: Martin Chovanec Date: Tue, 6 Jun 2017 19:42:38 +0200 Subject: [PATCH 6/6] Make filestorage tests fail when info not loaded The run "info" is normally stored in an external file. But it was not so in the test run. --- sacredboard/tests/data/test_filestorage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sacredboard/tests/data/test_filestorage.py b/sacredboard/tests/data/test_filestorage.py index 4cd0d60..dcc6748 100644 --- a/sacredboard/tests/data/test_filestorage.py +++ b/sacredboard/tests/data/test_filestorage.py @@ -23,7 +23,7 @@ def create_tmp_datastore(): run = {"status": "COMPLETED", "_id": "57f9efb2e4b8490d19d7c30e", - "info": {}, "resources": [], + "resources": [], "host": {"os": "Linux", "os_info": "Linux-3.16.0-38-generic-x86_64-with-LinuxMint-17.2-rafaela", "cpu": "Intel(R) Core(TM) i3 CPU M 370 @ 2.40GHz",