From 21ddd1fa2781cdd9870fcb90acaf4e7c4b98428e Mon Sep 17 00:00:00 2001 From: Chris Turner Date: Sat, 19 Aug 2017 11:42:41 -0500 Subject: [PATCH 1/8] implement dj.Bucket class to handle S3 external storage operations. Currently not hooked into actual dj code; awaiting base external file logic 1st. S3 functionality implemented via 'boto3' package; unit tests are currently MOCKED using 'moto' S3 mock library due to difficulties w/r/t credential mgmt. --- datajoint/__init__.py | 7 ++- datajoint/external.py | 132 ++++++++++++++++++++++++++++++++++++++++++ datajoint/settings.py | 7 ++- requirements.txt | 1 + test_requirements.txt | 1 + tests/test_s3.py | 125 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 269 insertions(+), 4 deletions(-) create mode 100644 datajoint/external.py create mode 100644 tests/test_s3.py diff --git a/datajoint/__init__.py b/datajoint/__init__.py index 0f5276205..0068e9d4c 100644 --- a/datajoint/__init__.py +++ b/datajoint/__init__.py @@ -53,8 +53,10 @@ class DataJointError(Exception): # override login credentials with environment variables mapping = {k: v for k, v in zip( - ('database.host', 'database.user', 'database.password'), - map(os.getenv, ('DJ_HOST', 'DJ_USER', 'DJ_PASS'))) + ('database.host', 'database.user', 'database.password', + 'external.aws_access_key_id', 'external.aws_secret_access_key',), + map(os.getenv, ('DJ_HOST', 'DJ_USER', 'DJ_PASS', + 'DJ_AWS_ACCESS_KEY_ID', 'DJ_AWS_SECRET_ACCESS_KEY',))) if v is not None} for k in mapping: config.add_history('Updated login credentials from %s' % k) @@ -64,6 +66,7 @@ class DataJointError(Exception): # ------------- flatten import hierarchy ------------------------- from .connection import conn, Connection +from .external import bucket, Bucket from .base_relation import FreeRelation, BaseRelation from .user_relations import Manual, Lookup, Imported, Computed, Part from .relational_operand import Not, AndList, OrList, U diff --git a/datajoint/external.py b/datajoint/external.py new file mode 100644 index 000000000..65ccd543e --- /dev/null +++ b/datajoint/external.py @@ -0,0 +1,132 @@ +""" +This module contains logic related to external file storage +""" + +import logging +from getpass import getpass + +import boto3 +from botocore.exceptions import ClientError + +from . import config +from . import DataJointError + +logger = logging.getLogger(__name__) + + +def bucket(aws_access_key_id=None, aws_secret_access_key=None, reset=False): + """ + Returns a boto3 AWS session object to be shared by multiple modules. + If the connection is not yet established or reset=True, a new + connection is set up. If connection information is not provided, + it is taken from config which takes the information from + dj_local_conf.json. If the password is not specified in that file + datajoint prompts for the password. + + :param aws_access_key_id: AWS Access Key ID + :param aws_secret_access_key: AWS Secret Key + :param reset: whether the connection should be reset or not + """ + if not hasattr(bucket, 'bucket') or reset: + aws_access_key_id = aws_access_key_id \ + if aws_access_key_id is not None \ + else config['external.aws_access_key_id'] + + aws_secret_access_key = aws_secret_access_key \ + if aws_secret_access_key is not None \ + else config['external.aws_secret_access_key'] + + if aws_access_key_id is None: # pragma: no cover + aws_access_key_id = input("Please enter AWS Access Key ID: ") + + if aws_secret_access_key is None: # pragma: no cover + aws_secret_access_key = getpass( + "Please enter AWS Secret Access Key: " + ) + + bucket.bucket = Bucket(aws_access_key_id, aws_secret_access_key) + return bucket.bucket + + +class Bucket: + """ + A dj.Bucket object manages a connection to an AWS S3 Bucket. + + Currently, basic CRUD operations are supported; of note permissions and + object versioning are not currently supported. + + Most of the parameters below should be set in the local configuration file. + + :param aws_access_key_id: AWS Access Key ID + :param aws_secret_access_key: AWS Secret Key + """ + + def __init__(self, aws_access_key_id, aws_secret_access_key): + self._session = boto3.Session( + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key + ) + self._s3 = None + try: + self._bucket = config['external.location'].split("s3://")[1] + except (AttributeError, IndexError, KeyError) as e: + raise DataJointError('external.location not properly configured: ' + + str(config['external.location'])) from None + + def connect(self): + if self._s3 is None: + self._s3 = self._session.resource('s3') + + def stat(self, rpath=None): + """ + check if a file exists in the bucket + """ + try: + self.connect() + self._s3.Object(self._bucket, rpath).load() + except ClientError as e: + if e.response['Error']['Code'] == "404": + return False + else: + raise DataJointError('error checking remote file') + + return True + + def put(self, lpath=None, rpath=None): + """ + Upload a file + """ + try: + self._s3.Object(self._bucket, rpath).upload_file(lpath) + except: + raise DataJointError('Error uploading file') + + return True + + def get(self, rpath=None, lpath=None): + """ + Retrieve a file + """ + try: + self._s3.Object(self._bucket, rpath).download_file(lpath) + except Exception as e: + raise DataJointError('file download error') + + return True + + def delete(self, rpath): + ''' + Delete a single remote object. + Note: will return True even if object doesn't exist; + for explicit verification combine with a .stat() call. + ''' + self.connect() + r = self._s3.Object(self._bucket, rpath).delete() + try: + if r['ResponseMetadata']['HTTPStatusCode'] == 204: + return True + else: + # XXX: if/when does this occur? - s3 returns ok if no file... + return False + except: + raise DataJointError('error deleting file: ' + str(rpath)) diff --git a/datajoint/settings.py b/datajoint/settings.py index b7bc1a4e9..13dee01f5 100644 --- a/datajoint/settings.py +++ b/datajoint/settings.py @@ -46,7 +46,10 @@ 'safemode': True, 'display.limit': 7, 'display.width': 14, - 'display.show_tuple_count': True + 'display.show_tuple_count': True, + 'external.aws_access_key_id': None, + 'external.aws_secret_access_key': None, + 'external.location' : None }) logger = logging.getLogger(__name__) @@ -176,4 +179,4 @@ def __setitem__(self, key, value): if validators[key](value): self._conf[key] = value else: - raise DataJointError(u'Validator for {0:s} did not pass'.format(key, )) \ No newline at end of file + raise DataJointError(u'Validator for {0:s} did not pass'.format(key, )) diff --git a/requirements.txt b/requirements.txt index 88a246fe7..065ec613d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ pyparsing ipython networkx pydotplus +boto3 diff --git a/test_requirements.txt b/test_requirements.txt index 12e08e267..ff4f0271e 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -1,2 +1,3 @@ matplotlib pygraphviz +moto diff --git a/tests/test_s3.py b/tests/test_s3.py new file mode 100644 index 000000000..ec400936c --- /dev/null +++ b/tests/test_s3.py @@ -0,0 +1,125 @@ + +""" +Test of dj.Bucket() using moto *MOCKED* S3 library +Using real s3 could incur cost, requires appropriate credentials managment; +but probably should be done at some point once best methodology is determined. +""" + +import os +from unittest import TestCase + +import boto3 +from moto import mock_s3 + +import datajoint as dj + +# Verify moto is itself functional +# BEGIN via Moto Docs + + +class MotoTest: + ''' + Simple example to verify moto is itself working + ''' + + def __init__(self, name, value): + self.name = name + self.value = value + + def save(self): + s3 = boto3.client('s3', region_name='us-east-1') + s3.put_object(Bucket='mybucket', Key=self.name, Body=self.value) + + +@mock_s3 +def test_moto_test(): + # Create Bucket so that test can run + conn = boto3.resource('s3', region_name='us-east-1') + conn.create_bucket(Bucket='mybucket') + + model_instance = MotoTest('steve', 'is awesome') + model_instance.save() + + body = conn.Object('mybucket', 'steve').get()['Body'].read().decode() + + assert body == 'is awesome' + +# END via Moto Docs + + +@mock_s3 +def test_dj_bucket_factory(): + ''' + Test *part of* the dj.bucket() singleton/factory function. + The user-interactive portion is not tested. + ''' + try: + b = dj.Bucket(None, None) + except dj.DataJointError: # no dj.config['external.location'] + pass + + # monkey patch dj.bucket.bucket to use mocked implementation + dj.config['external.location'] = 's3://djtest.datajoint.io' + b = dj.Bucket(None, None) + dj.bucket.bucket = b + + assert dj.bucket() == b + + +@mock_s3 +class DjBucketTest(TestCase): + + def setUp(self): + dj.config['external.location'] = 's3://djtest.datajoint.io' + b = dj.Bucket(None, None) + dj.bucket.bucket = b + + # create moto's virtual bucket + b.connect() # note: implicit test of b.connect(), which is trivial + b._s3.create_bucket(Bucket='djtest.datajoint.io') + self._bucket = b + + # todo: + # - appropriate remote filename (e.g. mkstemp()) + # - appropriate local temp filename (e.g. mkstemp()) + self._lfile = __file__ + self._rfile = 'DjBucketTest-TEMP_NO_EDIT_WILL_ZAP.py' + self._lfile_cpy = self._rfile + + self._zaptmpfile() + + def tearDown(self): + self._zaptmpfile() + + def _zaptmpfile(self): + try: + os.remove(self._lfile_cpy) + except FileNotFoundError: + pass + + def test_bucket_methods(self): + ''' + Test dj.Bucket.(put,state,get,delete,)() + Currently done in one test to simplify interdependencies. + ''' + + # ensure no initial files + assert self._bucket.delete(self._rfile) is True + assert self._bucket.stat(self._rfile) is False + assert os.path.exists(self._lfile_cpy) is False + + # test put + assert self._bucket.put(self._lfile, self._rfile) is True + + # test stat + assert self._bucket.stat(self._rfile) is True + + # test get + assert self._bucket.get(self._rfile, self._lfile_cpy) is True + assert os.path.exists(self._lfile_cpy) is True + + # test delete + assert self._bucket.delete(self._rfile) is True + + # verify delete + assert self._bucket.stat(self._rfile) is False From 126d9fcb1aca46f8f6ae916220451e0b4da1c93c Mon Sep 17 00:00:00 2001 From: Chris Turner Date: Mon, 21 Aug 2017 11:51:13 -0500 Subject: [PATCH 2/8] .travis.yml: boto setting copied from moto cfg --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index bc5182fae..99df831b6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,7 @@ before_install: - mysql -e "create user 'datajoint'@'%' identified by 'datajoint'; GRANT ALL PRIVILEGES ON \`djtest\_%\`.* TO 'datajoint'@'%';" -uroot - mysql -e "create user 'djview'@'%' identified by 'djview'; GRANT SELECT ON \`djtest\_%\`.* TO 'djview'@'%';" -uroot install: + - travis_retry pip install boto==2.45.0 - travis_wait 30 pip install -r requirements.txt - travis_wait 30 pip install -r test_requirements.txt - pip install nose nose-cov python-coveralls From b2cae86ed63ce7b4016ce9ac9354b01b8a133cb4 Mon Sep 17 00:00:00 2001 From: Chris Turner Date: Mon, 21 Aug 2017 12:15:45 -0500 Subject: [PATCH 3/8] Hack BOTO_CONFIG to work in travis. Current environment mis-pulls in a /usr/share/google.* copy of a file, which is not python 3x compatible, and so builds fail. See also: https://github.com/GoogleCloudPlatform/compute-image-packages/pull/213 which outlines ubuntu cloud images are not updated. --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 99df831b6..dca3293af 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,7 @@ sudo: required language: python env: - - DJ_TEST_HOST="127.0.0.1" DJ_TEST_USER="datajoint" DJ_TEST_PASSWORD="datajoint" DJ_HOST="127.0.0.1" DJ_USER="datajoint" DJ_PASS="datajoint" + - DJ_TEST_HOST="127.0.0.1" DJ_TEST_USER="datajoint" DJ_TEST_PASSWORD="datajoint" DJ_HOST="127.0.0.1" DJ_USER="datajoint" DJ_PASS="datajoint" BOTO_CONFIG="/tmp/bogusvalue" python: - "3.4" - "3.5" From 9acfbd6d200d9dda91ffd1348c11a650f5576ded Mon Sep 17 00:00:00 2001 From: Chris Turner Date: Mon, 21 Aug 2017 12:21:50 -0500 Subject: [PATCH 4/8] Revert ".travis.yml: boto setting copied from moto cfg" This reverts commit 126d9fcb1aca46f8f6ae916220451e0b4da1c93c. --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index dca3293af..301bf007d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,6 @@ before_install: - mysql -e "create user 'datajoint'@'%' identified by 'datajoint'; GRANT ALL PRIVILEGES ON \`djtest\_%\`.* TO 'datajoint'@'%';" -uroot - mysql -e "create user 'djview'@'%' identified by 'djview'; GRANT SELECT ON \`djtest\_%\`.* TO 'djview'@'%';" -uroot install: - - travis_retry pip install boto==2.45.0 - travis_wait 30 pip install -r requirements.txt - travis_wait 30 pip install -r test_requirements.txt - pip install nose nose-cov python-coveralls From 0772a4194733dff032a02e7f1130c9ef3f314939 Mon Sep 17 00:00:00 2001 From: Chris Turner Date: Thu, 19 Oct 2017 22:20:42 -0500 Subject: [PATCH 5/8] external.py: misc updates based on review of pr #358. --- datajoint/external.py | 45 +++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index 65ccd543e..0e9898570 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -79,38 +79,49 @@ def connect(self): def stat(self, rpath=None): """ - check if a file exists in the bucket + Check if a file exists in the bucket. + + :param rpath: remote path within bucket """ try: self.connect() self._s3.Object(self._bucket, rpath).load() except ClientError as e: - if e.response['Error']['Code'] == "404": - return False - else: - raise DataJointError('error checking remote file') + if e.response['Error']['Code'] != "404": + raise DataJointError('Error checking remote file', str(rpath), + '(', str(e), ')') + + return False return True def put(self, lpath=None, rpath=None): """ - Upload a file + Upload a file to the bucket. + + :param rpath: remote path within bucket + :param lpath: local path """ try: self._s3.Object(self._bucket, rpath).upload_file(lpath) - except: - raise DataJointError('Error uploading file') + except Exception as e: + raise DataJointError('Error uploading file', str(lpath), + 'to', str(rpath), '(', str(e), ')') return True def get(self, rpath=None, lpath=None): """ - Retrieve a file + Retrieve a file from the bucket. + + :param rpath: remote path within bucket + :param lpath: local path """ try: self._s3.Object(self._bucket, rpath).download_file(lpath) except Exception as e: - raise DataJointError('file download error') + raise DataJointError('Error downloading file', str(rpath), + 'to', str(lpath), '(', str(e), ')') return True @@ -119,14 +130,14 @@ def delete(self, rpath): Delete a single remote object. Note: will return True even if object doesn't exist; for explicit verification combine with a .stat() call. + + :param rpath: remote path within bucket ''' self.connect() r = self._s3.Object(self._bucket, rpath).delete() try: - if r['ResponseMetadata']['HTTPStatusCode'] == 204: - return True - else: - # XXX: if/when does this occur? - s3 returns ok if no file... - return False - except: - raise DataJointError('error deleting file: ' + str(rpath)) + # XXX: if/when does 'False' occur? - s3 returns ok if no file... + return r['ResponseMetadata']['HTTPStatusCode'] == 204 + except Exception as e: + raise DataJointError('error deleting file ' + str(rpath), + '(', str(e), ')') From 80c3b26baf507d5bc05e678cf8eab89465af2e1a Mon Sep 17 00:00:00 2001 From: Chris Turner Date: Thu, 19 Oct 2017 22:29:52 -0500 Subject: [PATCH 6/8] external.py: consistently call connect in methods & do operations w/i try blocks --- datajoint/external.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index 0e9898570..81609de62 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -103,6 +103,7 @@ def put(self, lpath=None, rpath=None): :param lpath: local path """ try: + self.connect() self._s3.Object(self._bucket, rpath).upload_file(lpath) except Exception as e: raise DataJointError('Error uploading file', str(lpath), @@ -118,6 +119,7 @@ def get(self, rpath=None, lpath=None): :param lpath: local path """ try: + self.connect() self._s3.Object(self._bucket, rpath).download_file(lpath) except Exception as e: raise DataJointError('Error downloading file', str(rpath), @@ -133,9 +135,9 @@ def delete(self, rpath): :param rpath: remote path within bucket ''' - self.connect() - r = self._s3.Object(self._bucket, rpath).delete() try: + self.connect() + r = self._s3.Object(self._bucket, rpath).delete() # XXX: if/when does 'False' occur? - s3 returns ok if no file... return r['ResponseMetadata']['HTTPStatusCode'] == 204 except Exception as e: From 30ade48a6ba0f098f03dbed1faa9dcee437fdc4d Mon Sep 17 00:00:00 2001 From: Chris Turner Date: Thu, 19 Oct 2017 23:12:39 -0500 Subject: [PATCH 7/8] external.py: flip exception constructor strings to ''.format() --- datajoint/external.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index 81609de62..fe12f3d14 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -70,8 +70,10 @@ def __init__(self, aws_access_key_id, aws_secret_access_key): try: self._bucket = config['external.location'].split("s3://")[1] except (AttributeError, IndexError, KeyError) as e: - raise DataJointError('external.location not properly configured: ' - + str(config['external.location'])) from None + raise DataJointError( + 'external.location not properly configured: {l}'.format( + l=config['external.location']) + ) from None def connect(self): if self._s3 is None: @@ -88,9 +90,9 @@ def stat(self, rpath=None): self._s3.Object(self._bucket, rpath).load() except ClientError as e: if e.response['Error']['Code'] != "404": - raise DataJointError('Error checking remote file', str(rpath), - '(', str(e), ')') - + raise DataJointError( + 'Error checking remote file {r} ({e})'.format(r=rpath, e=e) + ) return False return True @@ -106,8 +108,10 @@ def put(self, lpath=None, rpath=None): self.connect() self._s3.Object(self._bucket, rpath).upload_file(lpath) except Exception as e: - raise DataJointError('Error uploading file', str(lpath), - 'to', str(rpath), '(', str(e), ')') + raise DataJointError( + 'Error uploading file {l} to {r} ({e})'.format( + l=lpath, r=rpath, e=e) + ) return True @@ -122,8 +126,10 @@ def get(self, rpath=None, lpath=None): self.connect() self._s3.Object(self._bucket, rpath).download_file(lpath) except Exception as e: - raise DataJointError('Error downloading file', str(rpath), - 'to', str(lpath), '(', str(e), ')') + raise DataJointError( + 'Error downloading file {r} to {l} ({e})'.format( + r=rpath, l=lpath, e=e) + ) return True @@ -141,5 +147,6 @@ def delete(self, rpath): # XXX: if/when does 'False' occur? - s3 returns ok if no file... return r['ResponseMetadata']['HTTPStatusCode'] == 204 except Exception as e: - raise DataJointError('error deleting file ' + str(rpath), - '(', str(e), ')') + raise DataJointError( + 'error deleting file {r} ({e})'.format(r=rpath, e=e) + ) From c8a476373b3c2daf23fa722240d34b80f90b7ecd Mon Sep 17 00:00:00 2001 From: Chris Turner Date: Fri, 20 Oct 2017 15:28:48 -0500 Subject: [PATCH 8/8] external.py: rename existing external.py to s3.py --- datajoint/__init__.py | 2 +- datajoint/{external.py => s3.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename datajoint/{external.py => s3.py} (100%) diff --git a/datajoint/__init__.py b/datajoint/__init__.py index 0068e9d4c..a513be61f 100644 --- a/datajoint/__init__.py +++ b/datajoint/__init__.py @@ -66,7 +66,7 @@ class DataJointError(Exception): # ------------- flatten import hierarchy ------------------------- from .connection import conn, Connection -from .external import bucket, Bucket +from .s3 import bucket, Bucket from .base_relation import FreeRelation, BaseRelation from .user_relations import Manual, Lookup, Imported, Computed, Part from .relational_operand import Not, AndList, OrList, U diff --git a/datajoint/external.py b/datajoint/s3.py similarity index 100% rename from datajoint/external.py rename to datajoint/s3.py