Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

abstract data interface #43

Merged
merged 10 commits into from
Jun 25, 2017
33 changes: 33 additions & 0 deletions sacredboard/app/data/datastorage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""Interfaces for data storage backend."""

class Cursor:
    """Interface that abstracts the cursor object returned from databases."""

    def __init__(self):
        """Initialize the cursor; the interface itself keeps no state."""
        pass

    def count(self):
        """
        Return the number of items in this cursor.

        :raises NotImplementedError: always; subclasses must override this.
        """
        # NotImplemented is a comparison sentinel and is not callable;
        # NotImplementedError is the exception meant for abstract methods.
        raise NotImplementedError()

    def __iter__(self):
        """
        Return an iterator over the items in this cursor.

        :raises NotImplementedError: always; subclasses must override this.
        """
        raise NotImplementedError()


class DataStorage:
    """
    Interface for data backends.

    Defines the API for various data stores --- databases, file stores,
    etc. --- that sacred supports.
    """

    def __init__(self):
        """Initialize the storage; the interface itself keeps no state."""
        pass

    def get_run(self, run_id):
        """
        Return the run associated with the id.

        :param run_id: identifier of the run to fetch.
        :raises NotImplementedError: always; subclasses must override this.
        """
        # NotImplemented is a comparison sentinel and is not callable;
        # NotImplementedError is the exception meant for abstract methods.
        raise NotImplementedError()

    def get_runs(self, sort_by=None, sort_direction=None,
                 start=0, limit=None, query={"type": "and", "filters": []}):
        """
        Return all runs that match the query.

        :param sort_by: name of the field to sort by.
        :param sort_direction: direction of the sort.
        :param start: number of leading runs to skip.
        :param limit: maximum number of runs to return, None for no limit.
        :param query: filter description; the default has an empty list of
            filters. The default dict is shared between calls, so
            implementations must not mutate it.
        :raises NotImplementedError: always; subclasses must override this.
        """
        raise NotImplementedError()
113 changes: 113 additions & 0 deletions sacredboard/app/data/filestorage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""Implements backend storage interface for sacred's file store."""

import datetime
import os
import json

from sacredboard.app.data.datastorage import Cursor, DataStorage

# Names of the JSON files that are read from each run's directory.
CONFIG_JSON = "config.json"
RUN_JSON = "run.json"
INFO_JSON = "info.json"


def _path_to_file(basepath, run_id, file_name):
return os.path.join(basepath, str(run_id), file_name)


def _path_to_config(basepath, run_id):
    """Return the path of the ``CONFIG_JSON`` file of run ``run_id``."""
    run_dir_name = str(run_id)
    return _path_to_file(basepath, run_dir_name, CONFIG_JSON)


def _path_to_info(basepath, run_id):
    """Return the path of the ``INFO_JSON`` file of run ``run_id``."""
    run_dir_name = str(run_id)
    return _path_to_file(basepath, run_dir_name, INFO_JSON)


def _path_to_run(basepath, run_id):
    """Return the path of the ``RUN_JSON`` file of run ``run_id``."""
    # _path_to_file converts run_id to str itself, so the result is the same
    # as joining the three components directly.
    return _path_to_file(basepath, run_id, RUN_JSON)


def _read_json(path_to_json):
with open(path_to_json) as f:
return json.load(f)


def _create_run(run_id, runjson, configjson, infojson):
runjson["_id"] = run_id
runjson["config"] = configjson
runjson["info"] = infojson

# TODO probably want a smarter way of detecting
# which values have type "time."
for k in ["start_time", "stop_time", "heartbeat"]:
runjson[k] = datetime.datetime.strptime(runjson[k],
'%Y-%m-%dT%H:%M:%S.%f')
return runjson


class FileStoreCursor(Cursor):
    """Cursor over the runs read from a file store."""

    def __init__(self, count, iterable):
        """
        Wrap an iterable of runs together with its size.

        :param count: value reported by ``count()``.
        :param iterable: iterable that yields the runs.
        """
        self._count = count
        self.iterable = iterable

    def count(self):
        """
        Return the count that was supplied when the cursor was created.

        :return: int
        """
        return self._count

    def __iter__(self):
        """Return an iterator over the wrapped iterable."""
        return iter(self.iterable)


class FileStorage(DataStorage):
    """Object to interface with one of sacred's file stores."""

    # Entries of the store directory that are known not to be runs.
    _NON_RUN_ENTRIES = frozenset(["_sources"])

    def __init__(self, path_to_dir):
        """
        Create a storage backed by the directory ``path_to_dir``.

        :param path_to_dir: directory containing one sub-directory per run;
            a leading ``~`` is expanded to the user's home directory.
        """
        super().__init__()
        self.path_to_dir = os.path.expanduser(path_to_dir)

    def get_run(self, run_id):
        """
        Return the run associated with a particular `run_id`.

        :param run_id: name of the run's directory inside the file store.
        :return: dict
        :raises FileNotFoundError: if one of the run's JSON files is missing.
        """
        config = _read_json(_path_to_config(self.path_to_dir, run_id))
        run = _read_json(_path_to_run(self.path_to_dir, run_id))
        info = _read_json(_path_to_info(self.path_to_dir, run_id))
        return _create_run(run_id, run, config, info)

    def get_runs(self, sort_by=None, sort_direction=None,
                 start=0, limit=None, query={"type": "and", "filters": []}):
        """
        Return all runs in the file store.

        If a run is corrupt --- e.g. missing files --- it is skipped. The
        runs are loaded eagerly so that the cursor's count matches exactly
        what iterating over it yields, and so that the cursor can be
        iterated more than once.

        :param sort_by: NotImplemented (currently ignored)
        :param sort_direction: NotImplemented (currently ignored)
        :param start: NotImplemented (currently ignored)
        :param limit: NotImplemented (currently ignored)
        :param query: NotImplemented (currently ignored, never mutated)
        :return: FileStoreCursor
        """
        runs = []
        for run_id in os.listdir(self.path_to_dir):
            if run_id in self._NON_RUN_ENTRIES:
                continue
            try:
                runs.append(self.get_run(run_id))
            except (FileNotFoundError, NotADirectoryError):
                # An incomplete experiment is a corrupt experiment, and a
                # stray regular file is not an experiment at all.
                # Skip both for now.
                continue

        return FileStoreCursor(len(runs), runs)
23 changes: 21 additions & 2 deletions sacredboard/app/data/mongodb.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,24 @@
import bson
import pymongo

from sacredboard.app.data.datastorage import Cursor, DataStorage

class PyMongoDataAccess:

class MongoDbCursor(Cursor):
    """Implements Cursor for mongodb."""

    def __init__(self, mongodb_cursor):
        """
        Wrap a cursor returned by the database driver.

        :param mongodb_cursor: the underlying cursor; it must be iterable
            and provide a ``count()`` method.
        """
        self.mongodb_cursor = mongodb_cursor

    def count(self):
        """Return the number of items in this cursor."""
        return self.mongodb_cursor.count()

    def __iter__(self):
        """Return an iterator over the items of the underlying cursor."""
        # __iter__ must return an iterator. iter() hands back the wrapped
        # cursor unchanged when it already is one, and otherwise asks it
        # for a proper iterator.
        return iter(self.mongodb_cursor)


class PyMongoDataAccess(DataStorage):
"""Access records in MongoDB."""

RUNNING_DEAD_RUN_CLAUSE = {
Expand All @@ -19,6 +35,7 @@ def __init__(self, uri, database_name, collection_name):
Better use the static methods build_data_access
or build_data_access_with_uri
"""
super().__init__()
self._uri = uri
self._db_name = database_name
self._client = None
Expand Down Expand Up @@ -73,7 +90,8 @@ def get_runs(self, sort_by=None, sort_direction=None,
cursor = cursor.skip(start)
if limit is not None:
cursor = cursor.limit(limit)
return cursor

return MongoDbCursor(cursor)

def get_run(self, run_id):
"""
Expand Down Expand Up @@ -103,6 +121,7 @@ def _apply_sort(cursor, sort_by, sort_direction):
:param sort_by: The field name to sort by.
:param sort_direction: The direction to sort, "asc" or "desc".
:return:

"""
if sort_direction is not None and sort_direction.lower() == "desc":
sort = pymongo.DESCENDING
Expand Down
17 changes: 14 additions & 3 deletions sacredboard/bootstrap.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from gevent.pywsgi import WSGIServer

from sacredboard.app.config import jinja_filters
from sacredboard.app.data.filestorage import FileStorage
from sacredboard.app.data.mongodb import PyMongoDataAccess
from sacredboard.app.webapi import routes

Expand All @@ -35,13 +36,15 @@
"You might need it if you use a custom collection name "
"or Sacred v0.6 (which used default.runs). "
"Default: runs")
@click.option("-F", default="",
help="Path to directory containing experiments.")
@click.option("--no-browser", is_flag=True, default=False,
help="Do not open web browser automatically.")
@click.option("--debug", is_flag=True, default=False,
help="Run the application in Flask debug mode "
"(for development).")
@click.version_option()
def run(debug, no_browser, m, mu, mc):
def run(debug, no_browser, m, mu, mc, f):
"""
Sacredboard.

Expand Down Expand Up @@ -76,12 +79,20 @@ def run(debug, no_browser, m, mu, mc):
Note: MongoDB must be listening on localhost.

"""
add_mongo_config(app, m, mu, mc)
if m or mu != (None, None):
add_mongo_config(app, m, mu, mc)
app.config["data"].connect()
elif f:
app.config["data"] = FileStorage(f)
else:
print("Must specify either a mongodb instance or \
a path to a file storage.")

app.config['DEBUG'] = debug
app.debug = debug
jinja_filters.setup_filters(app)
routes.setup_routes(app)
app.config["data"].connect()

if debug:
app.run(host="0.0.0.0", debug=True)
else:
Expand Down
92 changes: 92 additions & 0 deletions sacredboard/tests/data/test_filestorage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""Tests for the file storage backend."""

# coding=utf-8
import bson
import pytest
import json
import tempfile
import os

from sacredboard.app.data.filestorage import FileStorage


def create_tmp_datastore():
    """
    Create a temporary file store that contains a single run, number 42.

    Rather than mocking the file system, this actually creates some
    temporary files that emulate the file store system in Sacred.
    Unfortunately, Sacred and Sacredboard are completely decoupled, which
    makes it impossible to ensure that this standard is upheld throughout
    the sacred system.

    The directory is not removed afterwards.

    :return: path of the temporary directory that holds the run (str)
    """
    config = {"length": None, "n_input": 255, "batch_size": None,
              "dataset_path": "./german-nouns.hdf5", "validation_ds": "validation",
              "log_dir": "./log/rnn500_dropout0.5_lrate1e-4_minibatch_1000steps",
              "seed": 144363069, "dropout_keep_probability": 0.5,
              "max_character_ord": 255, "training_ds": "training", "num_classes": 3,
              "training_steps": 1000, "learning_rate": 0.0001, "hidden_size": 500}

    run = {"status": "COMPLETED",
           "_id": "57f9efb2e4b8490d19d7c30e",
           "resources": [],
           "host": {"os": "Linux",
                    "os_info": "Linux-3.16.0-38-generic-x86_64-with-LinuxMint-17.2-rafaela",
                    "cpu": "Intel(R) Core(TM) i3 CPU M 370 @ 2.40GHz",
                    "python_version": "3.4.3",
                    "python_compiler": "GCC 4.8.4",
                    "cpu_count": 4,
                    "hostname": "ntbacer"},
           "experiment": {"doc": None, "sources": [[
               "/home/martin/mnt/noun-classification/train_model.py",
               "86aaa9b81d6e32a181598ed78bb1d7a1"]],
               "dependencies": [["h5py", "2.6.0"],
                                ["numpy", "1.11.2"],
                                ["sacred", "0.6.10"]],
               "name": "German nouns"},
           "result": 2403.52, "artifacts": [], "comment": "",
           # N.B. time formatting is different between mongodb and file store.
           "start_time": "2017-06-02T07:13:05.305845",
           "stop_time": "2017-06-02T07:14:02.455460",
           "heartbeat": "2017-06-02T07:14:02.452597",
           "captured_out": "Output: \n"}
    info = {"info": "present"}

    experiment_dir = tempfile.mkdtemp()
    experiment42 = os.path.join(experiment_dir, "42")  # experiment number 42
    os.mkdir(experiment42)

    # One JSON file per part of the run, all inside the run's directory.
    for file_name, content in (("config.json", config),
                               ("run.json", run),
                               ("info.json", info)):
        with open(os.path.join(experiment42, file_name), 'w') as json_file:
            json.dump(content, json_file)

    return experiment_dir

@pytest.fixture
def tmpfilestore() -> FileStorage:
    """Fixture that provides a FileStorage backed by a fresh temporary store."""
    return FileStorage(create_tmp_datastore())

def test_get_run(tmpfilestore : FileStorage):
    """Check that the run loaded by id contains every expected key."""
    expected_keys = (
        "info", "resources", "host", "experiment", "result", "artifacts",
        "comment", "start_time", "stop_time", "heartbeat", "captured_out",
        "config",
    )
    run42 = tmpfilestore.get_run(42)

    for expected in expected_keys:
        assert expected in run42

def test_get_runs(tmpfilestore : FileStorage):
    """Check that listing the store yields its single run with every expected key."""
    expected_keys = (
        "info", "resources", "host", "experiment", "result", "artifacts",
        "comment", "start_time", "stop_time", "heartbeat", "captured_out",
        "config",
    )
    runs = list(tmpfilestore.get_runs())

    assert 1 == len(runs)

    only_run = runs[0]
    for expected in expected_keys:
        assert expected in only_run