Skip to content

Commit

Permalink
Add configuration file
Browse files Browse the repository at this point in the history
  • Loading branch information
danijar committed May 16, 2016
1 parent 6d66ce8 commit 40626fa
Show file tree
Hide file tree
Showing 7 changed files with 43 additions and 7 deletions.
20 changes: 16 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,22 @@ dataset = glove(dataset, columns=['data'])
Caching
-------

By default, datasets will be cached inside `~/.dataset/sets/`. To save even
more time, use the `@sets.disk_cache(basename, directory, method=False)`
decorator and apply it to your whole pipeline. It hashes function arguments in
order to determine if a cache is valid.
By default, datasets will be cached inside `~/.dataset/sets/`. You can change
this directory by setting the `directory` key in the configuration file. To
save even more time, use the `@sets.disk_cache(basename, directory,
method=False)` decorator and apply it to your whole pipeline. It hashes the
function arguments to determine whether a cache is valid.

Configuration
-------------

The configuration is a YAML file named `.setsrc`. Sets looks for this file in
the current working directory, in the user's home directory, and at the path
given by the `SETS_CONFIG` environment variable; the first file found is used.

```yaml
directory: ~/.dataset/sets
```
Contributions
-------------
Expand Down
3 changes: 2 additions & 1 deletion sets/core/step.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,11 @@ def download(cls, url, filename=None):
return utility.download(url, cls.directory(), filename)

@classmethod
def directory(cls, prefix='~/.dataset/sets'):
def directory(cls, prefix=None):
"""
Path that should be used for caching. Different for all subclasses.
"""
prefix = prefix or utility.read_config().directory
name = cls.__name__.lower()
directory = os.path.expanduser(os.path.join(prefix, name))
utility.ensure_directory(directory)
Expand Down
5 changes: 5 additions & 0 deletions sets/data/schema.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
type: dict
mapping:
directory:
type: str
default: ~/.dataset/sets
14 changes: 14 additions & 0 deletions sets/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,21 @@
import errno
import shutil
from urllib.request import urlopen
import definitions

def read_config(schema='data/schema.yaml', name='sets'):
    """
    Load the run-control configuration, applying schema defaults.

    Candidate locations are checked in order: the current working
    directory, the user's home directory, and the path named by the
    SETS_CONFIG environment variable. The first existing file is
    parsed; if none exists, an empty YAML document is parsed so the
    schema's default values take effect.
    """
    filename = '.{}rc'.format(name)
    candidates = [
        os.path.join(os.curdir, filename),
        os.path.expanduser(os.path.join('~', filename)),
        os.environ.get('{}_CONFIG'.format(name.upper())),
    ]
    # Schema ships alongside this module; resolve it relative to the package.
    schema = os.path.join(os.path.dirname(__file__), schema)
    parser = definitions.Parser(schema)
    found = next(
        (path for path in candidates if path and os.path.isfile(path)),
        None)
    if found is not None:
        return parser(found)
    return parser('{}')

def disk_cache(basename, directory, method=False):
"""
Expand Down
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
'requests',
'nltk',
'lxml',
'definitions',
]


Expand Down Expand Up @@ -99,13 +100,14 @@ def finalize_options(self):
if __name__ == '__main__':
setuptools.setup(
name='sets',
version='0.3.1',
version='0.3.2',
description='Read datasets in a standard way.',
url='http://github.com/danijar/sets',
author='Danijar Hafner',
author_email='mail@danijar.com',
license='MIT',
packages=['sets', 'sets.core', 'sets.dataset', 'sets.process'],
package_data={'sets': ['data/schema.yaml']},
setup_requires=SETUP_REQUIRES,
install_requires=INSTALL_REQUIRES,
cmdclass={
Expand Down
3 changes: 3 additions & 0 deletions test/test_dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import sets
import pytest


@pytest.mark.skip
def test_semeval():
dataset = sets.SemEvalRelation()
dataset = sets.Tokenize()(dataset)
Expand All @@ -12,6 +14,7 @@ def test_semeval():
dataset, columns=('data', 'word_distance'))


@pytest.mark.skip
def test_ocr():
dataset = sets.Ocr()
dataset = sets.OneHot(dataset.target, depth=2)(dataset, columns=['target'])
Expand Down
1 change: 0 additions & 1 deletion test/test_utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ def dataset():

def test_concat(dataset):
dataset['other'] = [[1], [2], [3]]
print(dataset)
result = sets.Concat(1, 'data')(dataset, columns=('data', 'other'))
assert 'other' not in result.columns
assert (result.target == dataset.target).all()
Expand Down

0 comments on commit 40626fa

Please sign in to comment.