Skip to content

Commit

Permalink
Add configuration file
Browse files Browse the repository at this point in the history
  • Loading branch information
danijar committed May 16, 2016
1 parent 6d66ce8 commit 40626fa
Show file tree
Hide file tree
Showing 7 changed files with 43 additions and 7 deletions.
20 changes: 16 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,22 @@ dataset = glove(dataset, columns=['data'])
Caching
-------

By default, datasets will be cached inside `~/.dataset/sets/`. To save even
more time, use the `@sets.disk_cache(basename, directory, method=False)`
decorator and apply it to your whole pipeline. It hashes function arguments in
order to determine if a cache is valid.
By default, datasets will be cached inside `~/.dataset/sets/`. You can change
this directory by setting the `directory` key in the configuration file. To
save even more time, use the `@sets.disk_cache(basename, directory,
method=False)` decorator and apply it to your whole pipeline. It hashes the
function arguments to determine whether a cache is valid.

Configuration
-------------

The configuration is a YAML file named `.setsrc`. Sets looks for this file in
the current working directory, in the user's home directory, and at the path
given by the `SETS_CONFIG` environment variable; the first file found is used.

```yaml
directory: ~/.dataset/sets
```
Contributions
-------------
Expand Down
3 changes: 2 additions & 1 deletion sets/core/step.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,11 @@ def download(cls, url, filename=None):
return utility.download(url, cls.directory(), filename)

@classmethod
def directory(cls, prefix='~/.dataset/sets'):
def directory(cls, prefix=None):
"""
Path that should be used for caching. Different for all subclasses.
"""
prefix = prefix or utility.read_config().directory
name = cls.__name__.lower()
directory = os.path.expanduser(os.path.join(prefix, name))
utility.ensure_directory(directory)
Expand Down
5 changes: 5 additions & 0 deletions sets/data/schema.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
type: dict
mapping:
directory:
type: str
default: ~/.dataset/sets
14 changes: 14 additions & 0 deletions sets/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,21 @@
import errno
import shutil
from urllib.request import urlopen
import definitions

def read_config(schema='data/schema.yaml', name='sets'):
    """
    Load the run-control configuration, applying schema defaults.

    Candidate locations are checked in order: the current working
    directory, the user's home directory, and the path named by the
    SETS_CONFIG environment variable. The first existing file is
    parsed; if none exists, an empty YAML document is parsed so the
    schema's default values take effect.
    """
    filename = '.{}rc'.format(name)
    candidates = [
        os.path.join(os.curdir, filename),
        os.path.expanduser(os.path.join('~', filename)),
        os.environ.get('{}_CONFIG'.format(name.upper())),
    ]
    # Schema ships alongside this module; resolve it relative to the package.
    schema = os.path.join(os.path.dirname(__file__), schema)
    parser = definitions.Parser(schema)
    found = next(
        (path for path in candidates if path and os.path.isfile(path)),
        None)
    if found is not None:
        return parser(found)
    return parser('{}')

def disk_cache(basename, directory, method=False):
"""
Expand Down
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
'requests',
'nltk',
'lxml',
'definitions',
]


Expand Down Expand Up @@ -99,13 +100,14 @@ def finalize_options(self):
if __name__ == '__main__':
setuptools.setup(
name='sets',
version='0.3.1',
version='0.3.2',
description='Read datasets in a standard way.',
url='http://github.com/danijar/sets',
author='Danijar Hafner',
author_email='mail@danijar.com',
license='MIT',
packages=['sets', 'sets.core', 'sets.dataset', 'sets.process'],
package_data={'sets': ['data/schema.yaml']},
setup_requires=SETUP_REQUIRES,
install_requires=INSTALL_REQUIRES,
cmdclass={
Expand Down
3 changes: 3 additions & 0 deletions test/test_dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import sets
import pytest


@pytest.mark.skip
def test_semeval():
dataset = sets.SemEvalRelation()
dataset = sets.Tokenize()(dataset)
Expand All @@ -12,6 +14,7 @@ def test_semeval():
dataset, columns=('data', 'word_distance'))


@pytest.mark.skip
def test_ocr():
dataset = sets.Ocr()
dataset = sets.OneHot(dataset.target, depth=2)(dataset, columns=['target'])
Expand Down
1 change: 0 additions & 1 deletion test/test_utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ def dataset():

def test_concat(dataset):
dataset['other'] = [[1], [2], [3]]
print(dataset)
result = sets.Concat(1, 'data')(dataset, columns=('data', 'other'))
assert 'other' not in result.columns
assert (result.target == dataset.target).all()
Expand Down

0 comments on commit 40626fa

Please sign in to comment.