diff --git a/.travis.yml b/.travis.yml
index 6c4ff72..6c77c6c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,6 +1,9 @@
 sudo: required
 
+services:
+  - elasticsearch
+
 dist: trusty
@@ -21,5 +24,11 @@ install:
 script:
   - make test
 
+before_script:
+  - moto_server &
+  - sleep 30
+  - curl localhost:9200
+  - curl localhost:5000
+
 after_success:
   - coveralls
diff --git a/datapackage_pipelines_assembler/generator.py b/datapackage_pipelines_assembler/generator.py
index b2e1cd7..8bfbdf2 100644
--- a/datapackage_pipelines_assembler/generator.py
+++ b/datapackage_pipelines_assembler/generator.py
@@ -56,7 +56,9 @@ def s3_path(*parts):
     else:
         path = '/'.join(str(p) for p in parts)
     bucket = os.environ['PKGSTORE_BUCKET']
-    return 'https://{}/{}'.format(bucket, path)
+    # Handle other S3-compatible servers as well (for testing)
+    protocol = os.environ.get('S3_ENDPOINT_URL') or 'https://'
+    return '{}{}/{}'.format(protocol, bucket, path)
 
 
 class Generator(GeneratorBase):
diff --git a/setup.py b/setup.py
index 3389a8c..46b699c 100755
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@ def read(*paths):
 INSTALL_REQUIRES = [
     'datapackage-pipelines',
     'datapackage-pipelines-elasticsearch>=0.0.3',
-    'datapackage-pipelines-aws>=0.0.8',
+    'datapackage-pipelines-aws>=0.0.9',
     'psycopg2',
     'tweepy',
     'facebook-sdk',
@@ -29,6 +29,9 @@ def read(*paths):
 TESTS_REQUIRE = [
     'pylama',
     'tox',
+    'moto',
+    'boto3',
+    'google-compute-engine'
 ]
 README = read('README.md')
 VERSION = read(PACKAGE, 'VERSION')
diff --git a/tests/data/sample_birthdays.csv b/tests/data/sample_birthdays.csv
new file mode 100644
index 0000000..e22488e
--- /dev/null
+++ b/tests/data/sample_birthdays.csv
@@ -0,0 +1,21 @@
+date,first_name,last_name
+2016-10-15,Shaylynn,Eallis
+2017-01-18,Patricia,Eefting
+2017-03-01,Karrah,Couser
+2017-03-17,Rhetta,Price
+2016-12-23,Alexandros,Farrand
+2017-05-01,Ado,Matejic
+2016-10-28,Keene,Tonna
+2017-01-31,Helena,Aiskovitch
+2017-02-11,Leigh,Butner
+2017-01-19,Perle,Work
+2016-11-16,Delora,Pavolillo
+2017-09-21,Marshall,Leall
+2017-04-28,Olwen,Mullin
+2016-12-27,Nerta,Enrique
+2016-12-07,Ashlie,Bracey
+2017-05-18,Dode,Ritmeier
+2016-10-16,Agace,Kew
+2017-04-08,Beckie,Dove
+2017-01-20,Filippa,McPolin
+2017-03-19,Madison,Sheekey
diff --git a/tests/data/sample_birthdays.xlsx b/tests/data/sample_birthdays.xlsx
new file mode 100644
index 0000000..4c5d646
Binary files /dev/null and b/tests/data/sample_birthdays.xlsx differ
diff --git a/tests/data/sample_birthdays_invalid.csv b/tests/data/sample_birthdays_invalid.csv
new file mode 100644
index 0000000..fec398d
--- /dev/null
+++ b/tests/data/sample_birthdays_invalid.csv
@@ -0,0 +1,24 @@
+First three rows need to be removed
+headers need to be reset
+and dates need to be normalized
+DATE,FIRST NAME ,LAST NAME
+2016-10-15,Shaylynn,Eallis
+2017-1-18,Patricia,Eefting
+2017-3-1,Karrah,Couser
+2017-3-17,Rhetta,Price
+2016-12-23,Alexandros,Farrand
+2017-5-1,Ado,Matejic
+2016-10-28,Keene,Tonna
+2017-1-31,Helena,Aiskovitch
+2017-2-11,Leigh,Butner
+2017-1-19,Perle,Work
+2016-11-16,Delora,Pavolillo
+2017-9-21,Marshall,Leall
+2017-4-28,Olwen,Mullin
+2016-12-27,Nerta,Enrique
+2016-12-7,Ashlie,Bracey
+2017-5-18,Dode,Ritmeier
+2016-10-16,Agace,Kew
+2017-4-8,Beckie,Dove
+2017-1-20,Filippa,McPolin
+2017-3-19,Madison,Sheekey
diff --git a/tests/data/sample_emails.csv b/tests/data/sample_emails.csv
new file mode 100644
index 0000000..a1d43f2
--- /dev/null
+++ b/tests/data/sample_emails.csv
@@ -0,0 +1,21 @@
+id,email
+1,cjozsika0@github.io
+2,smegainey1@twitter.com
+3,eyesson2@mail.ru
+4,aigoe3@usa.gov
+5,tkalinowsky4@tamu.edu
+6,eprime5@paypal.com
+7,fwinchcum6@drupal.org
+8,rrivilis7@nationalgeographic.com
+9,mbrisley8@creativecommons.org
+10,dmacavddy9@stumbleupon.com
+11,lbromwicha@hostgator.com
+12,kvargab@fotki.com
+13,hlintsc@ning.com
+14,hravenscraftd@nhs.uk
+15,dtrencharde@chicagotribune.com
+16,zkurtisf@ucla.edu
+17,aindeg@wordpress.com
+18,nadaneth@eepurl.com
+19,kwerneri@msn.com
+20,nmeardonj@springer.com
diff --git a/tests/inputs/excel/assembler.source-spec.yaml b/tests/inputs/excel/assembler.source-spec.yaml
new file mode 100644
index 0000000..b6fdc2b
--- /dev/null
+++ b/tests/inputs/excel/assembler.source-spec.yaml
@@ -0,0 +1,16 @@
+meta:
+  dataset: excel
+  findability: published
+  owner: datahub
+  ownerid: datahub
+  version: 1
+inputs:
+- kind: datapackage
+  parameters:
+    resource-mapping:
+      birthdays: ../../data/sample_birthdays.xlsx
+  url: datapackage.json
+outputs:
+- kind: zip
+  parameters:
+    out-file: 'excel.zip'
diff --git a/tests/inputs/excel/datapackage.json b/tests/inputs/excel/datapackage.json
new file mode 100644
index 0000000..4f59641
--- /dev/null
+++ b/tests/inputs/excel/datapackage.json
@@ -0,0 +1,27 @@
+{
+  "name": "excel",
+  "resources": [
+    {
+      "name": "birthdays",
+      "path": "data/birthdays.xlsx",
+      "format": "xlsx",
+      "schema": {
+        "fields": [
+          {
+            "name": "date",
+            "type": "date"
+          },
+          {
+            "name": "first_name",
+            "type": "string"
+          },
+          {
+            "name": "last_name",
+            "type": "string"
+          }
+        ],
+        "primaryKey": "date"
+      }
+    }
+  ]
+}
diff --git a/tests/inputs/multiple_files/assembler.source-spec.yaml b/tests/inputs/multiple_files/assembler.source-spec.yaml
new file mode 100644
index 0000000..8224672
--- /dev/null
+++ b/tests/inputs/multiple_files/assembler.source-spec.yaml
@@ -0,0 +1,17 @@
+meta:
+  dataset: multiple-files
+  findability: published
+  owner: datahub
+  ownerid: datahub
+  version: 1
+inputs:
+- kind: datapackage
+  parameters:
+    resource-mapping:
+      birthdays: ../../data/sample_birthdays.csv
+      emails: ../../data/sample_emails.csv
+  url: datapackage.json
+outputs:
+- kind: zip
+  parameters:
+    out-file: 'multiple-files.zip'
diff --git a/tests/inputs/multiple_files/datapackage.json b/tests/inputs/multiple_files/datapackage.json
new file mode 100644
index 0000000..465a000
--- /dev/null
+++ b/tests/inputs/multiple_files/datapackage.json
@@ -0,0 +1,44 @@
+{
+  "name": "multiple-files",
+  "resources": [
+    {
+      "name": "birthdays",
+      "path": "data/birthdays.csv",
+      "format": "csv",
+      "schema": {
+        "fields": [
+          {
+            "name": "date",
+            "type": "date"
+          },
+          {
+            "name": "first_name",
+            "type": "string"
+          },
+          {
+            "name": "last_name",
+            "type": "string"
+          }
+        ],
+        "primaryKey": "date"
+      }
+    },
+    {
+      "name": "emails",
+      "path": "data/emails.csv",
+      "format": "csv",
+      "schema": {
+        "fields": [
+          {
+            "name": "id",
+            "type": "number"
+          },
+          {
+            "name": "email",
+            "type": "string"
+          }
+        ]
+      }
+    }
+  ]
+}
diff --git a/tests/inputs/needs_processing/assembler.source-spec.yaml b/tests/inputs/needs_processing/assembler.source-spec.yaml
new file mode 100644
index 0000000..258c15b
--- /dev/null
+++ b/tests/inputs/needs_processing/assembler.source-spec.yaml
@@ -0,0 +1,26 @@
+meta:
+  dataset: single-file-processed
+  findability: published
+  owner: datahub
+  ownerid: datahub
+  version: 1
+inputs:
+- kind: datapackage
+  parameters:
+    resource-mapping:
+      birthdays: ../../data/sample_birthdays_invalid.csv
+  url: datapackage.json
+processing:
+  -
+    input: birthdays
+    tabulator:
+      skip_rows: 4
+      headers:
+        - date
+        - first_name
+        - last_name
+    output: birthdays
+outputs:
+- kind: zip
+  parameters:
+    out-file: 'single-file-processed.zip'
diff --git a/tests/inputs/needs_processing/datapackage.json b/tests/inputs/needs_processing/datapackage.json
new file mode 100644
index 0000000..c81f48e
--- /dev/null
+++ b/tests/inputs/needs_processing/datapackage.json
@@ -0,0 +1,27 @@
+{
+  "name": "single-file-processed",
+  "resources": [
+    {
+      "name": "birthdays",
+      "path": "data/birthdays.csv",
+      "format": "csv",
+      "schema": {
+        "fields": [
+          {
+            "name": "date",
+            "type": "date"
+          },
+          {
+            "name": "first_name",
+            "type": "string"
+          },
+          {
+            "name": "last_name",
+            "type": "string"
+          }
+        ],
+        "primaryKey": "date"
+      }
+    }
+  ]
+}
diff --git a/tests/inputs/single_file/assembler.source-spec.yaml b/tests/inputs/single_file/assembler.source-spec.yaml
new file mode 100644
index 0000000..2b07836
--- /dev/null
+++ b/tests/inputs/single_file/assembler.source-spec.yaml
@@ -0,0 +1,16 @@
+meta:
+  dataset: single-file
+  findability: published
+  owner: datahub
+  ownerid: datahub
+  version: 1
+inputs:
+- kind: datapackage
+  parameters:
+    resource-mapping:
+      birthdays: ../../data/sample_birthdays.csv
+  url: datapackage.json
+outputs:
+- kind: zip
+  parameters:
+    out-file: 'single-file.zip'
diff --git a/tests/inputs/single_file/datapackage.json b/tests/inputs/single_file/datapackage.json
new file mode 100644
index 0000000..1a0a5b9
--- /dev/null
+++ b/tests/inputs/single_file/datapackage.json
@@ -0,0 +1,27 @@
+{
+  "name": "single-file",
+  "resources": [
+    {
+      "name": "birthdays",
+      "path": "data/birthdays.csv",
+      "format": "csv",
+      "schema": {
+        "fields": [
+          {
+            "name": "date",
+            "type": "date"
+          },
+          {
+            "name": "first_name",
+            "type": "string"
+          },
+          {
+            "name": "last_name",
+            "type": "string"
+          }
+        ],
+        "primaryKey": "date"
+      }
+    }
+  ]
+}
diff --git a/tests/outputs/csv/sample_birthdays.csv b/tests/outputs/csv/sample_birthdays.csv
new file mode 100644
index 0000000..e22488e
--- /dev/null
+++ b/tests/outputs/csv/sample_birthdays.csv
@@ -0,0 +1,21 @@
+date,first_name,last_name
+2016-10-15,Shaylynn,Eallis
+2017-01-18,Patricia,Eefting
+2017-03-01,Karrah,Couser
+2017-03-17,Rhetta,Price
+2016-12-23,Alexandros,Farrand
+2017-05-01,Ado,Matejic
+2016-10-28,Keene,Tonna
+2017-01-31,Helena,Aiskovitch
+2017-02-11,Leigh,Butner
+2017-01-19,Perle,Work
+2016-11-16,Delora,Pavolillo
+2017-09-21,Marshall,Leall
+2017-04-28,Olwen,Mullin
+2016-12-27,Nerta,Enrique
+2016-12-07,Ashlie,Bracey
+2017-05-18,Dode,Ritmeier
+2016-10-16,Agace,Kew
+2017-04-08,Beckie,Dove
+2017-01-20,Filippa,McPolin
+2017-03-19,Madison,Sheekey
diff --git a/tests/outputs/csv/sample_birthdays_invalid.csv b/tests/outputs/csv/sample_birthdays_invalid.csv
new file mode 100644
index 0000000..fec398d
--- /dev/null
+++ b/tests/outputs/csv/sample_birthdays_invalid.csv
@@ -0,0 +1,24 @@
+First three rows need to be removed
+headers need to be reset
+and dates need to be normalized
+DATE,FIRST NAME ,LAST NAME
+2016-10-15,Shaylynn,Eallis
+2017-1-18,Patricia,Eefting
+2017-3-1,Karrah,Couser
+2017-3-17,Rhetta,Price
+2016-12-23,Alexandros,Farrand
+2017-5-1,Ado,Matejic
+2016-10-28,Keene,Tonna
+2017-1-31,Helena,Aiskovitch
+2017-2-11,Leigh,Butner
+2017-1-19,Perle,Work
+2016-11-16,Delora,Pavolillo
+2017-9-21,Marshall,Leall
+2017-4-28,Olwen,Mullin
+2016-12-27,Nerta,Enrique
+2016-12-7,Ashlie,Bracey
+2017-5-18,Dode,Ritmeier
+2016-10-16,Agace,Kew
+2017-4-8,Beckie,Dove
+2017-1-20,Filippa,McPolin
+2017-3-19,Madison,Sheekey
diff --git a/tests/outputs/csv/sample_emails.csv b/tests/outputs/csv/sample_emails.csv
new file mode 100644
index 0000000..a1d43f2
--- /dev/null
+++ b/tests/outputs/csv/sample_emails.csv
@@ -0,0 +1,21 @@
+id,email
+1,cjozsika0@github.io
+2,smegainey1@twitter.com
+3,eyesson2@mail.ru
+4,aigoe3@usa.gov
+5,tkalinowsky4@tamu.edu
+6,eprime5@paypal.com
+7,fwinchcum6@drupal.org
+8,rrivilis7@nationalgeographic.com
+9,mbrisley8@creativecommons.org
+10,dmacavddy9@stumbleupon.com
+11,lbromwicha@hostgator.com
+12,kvargab@fotki.com
+13,hlintsc@ning.com
+14,hravenscraftd@nhs.uk
+15,dtrencharde@chicagotribune.com
+16,zkurtisf@ucla.edu
+17,aindeg@wordpress.com
+18,nadaneth@eepurl.com
+19,kwerneri@msn.com
+20,nmeardonj@springer.com
diff --git a/tests/outputs/excel/sample_birthdays.xlsx b/tests/outputs/excel/sample_birthdays.xlsx
new file mode 100644
index 0000000..4c5d646
Binary files /dev/null and b/tests/outputs/excel/sample_birthdays.xlsx differ
diff --git a/tests/outputs/json/sample_birthdays.json b/tests/outputs/json/sample_birthdays.json
new file mode 100644
index 0000000..786ff0c
--- /dev/null
+++ b/tests/outputs/json/sample_birthdays.json
@@ -0,0 +1,102 @@
+[
+  {
+    "date": "2016-10-15",
+    "first_name": "Shaylynn",
+    "last_name": "Eallis"
+  },
+  {
+    "date": "2017-01-18",
+    "first_name": "Patricia",
+    "last_name": "Eefting"
+  },
+  {
+    "date": "2017-03-01",
+    "first_name": "Karrah",
+    "last_name": "Couser"
+  },
+  {
+    "date": "2017-03-17",
+    "first_name": "Rhetta",
+    "last_name": "Price"
+  },
+  {
+    "date": "2016-12-23",
+    "first_name": "Alexandros",
+    "last_name": "Farrand"
+  },
+  {
+    "date": "2017-05-01",
+    "first_name": "Ado",
+    "last_name": "Matejic"
+  },
+  {
+    "date": "2016-10-28",
+    "first_name": "Keene",
+    "last_name": "Tonna"
+  },
+  {
+    "date": "2017-01-31",
+    "first_name": "Helena",
+    "last_name": "Aiskovitch"
+  },
+  {
+    "date": "2017-02-11",
+    "first_name": "Leigh",
+    "last_name": "Butner"
+  },
+  {
+    "date": "2017-01-19",
+    "first_name": "Perle",
+    "last_name": "Work"
+  },
+  {
+    "date": "2016-11-16",
+    "first_name": "Delora",
+    "last_name": "Pavolillo"
+  },
+  {
+    "date": "2017-09-21",
+    "first_name": "Marshall",
+    "last_name": "Leall"
+  },
+  {
+    "date": "2017-04-28",
+    "first_name": "Olwen",
+    "last_name": "Mullin"
+  },
+  {
+    "date": "2016-12-27",
+    "first_name": "Nerta",
+    "last_name": "Enrique"
+  },
+  {
+    "date": "2016-12-07",
+    "first_name": "Ashlie",
+    "last_name": "Bracey"
+  },
+  {
+    "date": "2017-05-18",
+    "first_name": "Dode",
+    "last_name": "Ritmeier"
+  },
+  {
+    "date": "2016-10-16",
+    "first_name": "Agace",
+    "last_name": "Kew"
+  },
+  {
+    "date": "2017-04-08",
+    "first_name": "Beckie",
+    "last_name": "Dove"
+  },
+  {
+    "date": "2017-01-20",
+    "first_name": "Filippa",
+    "last_name": "McPolin"
+  },
+  {
+    "date": "2017-03-19",
+    "first_name": "Madison",
+    "last_name": "Sheekey"
+  }
+]
diff --git a/tests/outputs/json/sample_emails.json b/tests/outputs/json/sample_emails.json
new file mode 100644
index 0000000..17eaacd
--- /dev/null
+++ b/tests/outputs/json/sample_emails.json
@@ -0,0 +1,82 @@
+[
+  {
+    "id": 1,
+    "email": "cjozsika0@github.io"
+  },
+  {
+    "id": 2,
+    "email": "smegainey1@twitter.com"
+  },
+  {
+    "id": 3,
+    "email": "eyesson2@mail.ru"
+  },
+  {
+    "id": 4,
+    "email": "aigoe3@usa.gov"
+  },
+  {
+    "id": 5,
+    "email": "tkalinowsky4@tamu.edu"
+  },
+  {
+    "id": 6,
+    "email": "eprime5@paypal.com"
+  },
+  {
+    "id": 7,
+    "email": "fwinchcum6@drupal.org"
+  },
+  {
+    "id": 8,
+    "email": "rrivilis7@nationalgeographic.com"
+  },
+  {
+    "id": 9,
+    "email": "mbrisley8@creativecommons.org"
+  },
+  {
+    "id": 10,
+    "email": "dmacavddy9@stumbleupon.com"
+  },
+  {
+    "id": 11,
+    "email": "lbromwicha@hostgator.com"
+  },
+  {
+    "id": 12,
"kvargab@fotki.com" + }, + { + "id": 13, + "email": "hlintsc@ning.com" + }, + { + "id": 14, + "email": "hravenscraftd@nhs.uk" + }, + { + "id": 15, + "email": "dtrencharde@chicagotribune.com" + }, + { + "id": 16, + "email": "zkurtisf@ucla.edu" + }, + { + "id": 17, + "email": "aindeg@wordpress.com" + }, + { + "id": 18, + "email": "nadaneth@eepurl.com" + }, + { + "id": 19, + "email": "kwerneri@msn.com" + }, + { + "id": 20, + "email": "nmeardonj@springer.com" + } +] diff --git a/tests/outputs/zip/excel.zip b/tests/outputs/zip/excel.zip new file mode 100644 index 0000000..7f16d9d Binary files /dev/null and b/tests/outputs/zip/excel.zip differ diff --git a/tests/outputs/zip/multiple-files.zip b/tests/outputs/zip/multiple-files.zip new file mode 100644 index 0000000..a3e8560 Binary files /dev/null and b/tests/outputs/zip/multiple-files.zip differ diff --git a/tests/outputs/zip/single-file.zip b/tests/outputs/zip/single-file.zip new file mode 100644 index 0000000..99df91d Binary files /dev/null and b/tests/outputs/zip/single-file.zip differ diff --git a/tests/test_flow.py b/tests/test_flow.py new file mode 100644 index 0000000..5fa2353 --- /dev/null +++ b/tests/test_flow.py @@ -0,0 +1,347 @@ +import json +import os +import requests +import subprocess +import time +import unittest + +import boto3 +from elasticsearch import Elasticsearch, NotFoundError + +ES_SERVER = os.environ['DPP_ELASTICSEARCH'] = 'http://localhost:9200' +S3_SERVER = os.environ['S3_ENDPOINT_URL'] = 'http://localhost:5000/' +os.environ['PKGSTORE_BUCKET'] = 'testing.datahub.io' +os.environ['AWS_ACCESS_KEY_ID'] = 'foo' +os.environ['AWS_SECRET_ACCESS_KEY'] = 'bar' + +def run_factory(dir='.'): + os.chdir(dir) + subprocess.call(['dpp', 'run', 'dirty']) + os.remove('.dpp.db') + +class TestFlow(unittest.TestCase): + + @classmethod + def setup_class(self): + es = Elasticsearch(hosts=[ES_SERVER]) + es.indices.delete(index='datahub', ignore=[400, 404]) + es.indices.delete(index='events', ignore=[400, 404]) + s3 = boto3.resource( + service_name='s3', + endpoint_url=S3_SERVER, + ) + self.bucket_name = os.environ['PKGSTORE_BUCKET'] + self.bucket = s3.Bucket(self.bucket_name) + + + def test_single_file(self): + run_factory(os.path.join(os.path.dirname( + os.path.realpath(__file__)), 'inputs/single_file')) + + res = requests.get( + '{}{}/datahub/single-file/latest/datapackage.json'.format(S3_SERVER, self.bucket_name)) + + res = requests.get( + '{}{}/datahub/single-file:birthdays/data/birthdays.csv'.format(S3_SERVER, self.bucket_name)) + exp_csv = open('../../outputs/csv/sample_birthdays.csv').read() + self.assertEqual(res.status_code, 200) + self.assertEqual(exp_csv, res.text) + + res = requests.get( + '{}{}/datahub/single-file:birthdays_csv/data/birthdays_csv.csv'.format(S3_SERVER, self.bucket_name)) + self.assertEqual(res.status_code, 200) + self.assertEqual(exp_csv.replace('\n', '\r\n'), res.text) + + + res = requests.get( + '{}{}/datahub/single-file:birthdays_json/data/birthdays_json.json'.format(S3_SERVER, self.bucket_name)) + self.assertEqual(res.status_code, 200) + exp_json = json.load(open('../../outputs/json/sample_birthdays.json')) + self.assertListEqual(exp_json, res.json()) + + res = requests.get( + '{}{}/datahub/single-file:single-file_zip/data/single-file.zip'.format(S3_SERVER, self.bucket_name)) + self.assertEqual(res.status_code, 200) + # TODO: compare zip files + + # Elasticsearch + res = requests.get('http://localhost:9200/datahub/_search') + self.assertEqual(res.status_code, 200) + + meta = res.json() + hits = [hit['_source'] 
+        hits = [hit['_source'] for hit in meta['hits']['hits']
+                if hit['_source']['datapackage']['name'] == 'single-file']
+        self.assertEqual(len(hits), 1)
+
+        datahub = hits[0]['datahub']
+        datapackage = hits[0]['datapackage']
+        self.assertEqual(datahub['findability'], 'published')
+        self.assertEqual(datahub['owner'], 'datahub')
+        self.assertEqual(datahub['stats']['rowcount'], 20)
+        self.assertEqual(len(datapackage['resources']), 4)
+
+        res = requests.get('http://localhost:9200/events/_search')
+        self.assertEqual(res.status_code, 200)
+
+        events = res.json()
+        hits = [hit['_source'] for hit in events['hits']['hits']
+                if hit['_source']['dataset'] == 'single-file']
+        self.assertEqual(len(hits), 1)
+
+        event = hits[0]
+        self.assertEqual(event['dataset'], 'single-file')
+        self.assertEqual(event['event_action'], 'finished')
+        self.assertEqual(event['event_entity'], 'flow')
+        self.assertEqual(event['owner'], 'datahub')
+        self.assertEqual(event['status'], 'OK')
+
+    def test_multiple_file(self):
+        run_factory(os.path.join(os.path.dirname(
+            os.path.realpath(__file__)), 'inputs/multiple_files'))
+
+        res = requests.get(
+            '{}{}/datahub/multiple-files/latest/datapackage.json'.format(S3_SERVER, self.bucket_name))
+
+        res = requests.get(
+            '{}{}/datahub/multiple-files:birthdays/data/birthdays.csv'.format(S3_SERVER, self.bucket_name))
+        exp_csv = open('../../outputs/csv/sample_birthdays.csv').read()
+        self.assertEqual(res.status_code, 200)
+        self.assertEqual(exp_csv, res.text)
+
+        res = requests.get(
+            '{}{}/datahub/multiple-files:birthdays_csv/data/birthdays_csv.csv'.format(S3_SERVER, self.bucket_name))
+        self.assertEqual(res.status_code, 200)
+        self.assertEqual(exp_csv.replace('\n', '\r\n'), res.text)
+
+        res = requests.get(
+            '{}{}/datahub/multiple-files:birthdays_json/data/birthdays_json.json'.format(S3_SERVER, self.bucket_name))
+        self.assertEqual(res.status_code, 200)
+        exp_json = json.load(open('../../outputs/json/sample_birthdays.json'))
+        self.assertListEqual(exp_json, res.json())
+
+        res = requests.get(
+            '{}{}/datahub/multiple-files:emails/data/emails.csv'.format(S3_SERVER, self.bucket_name))
+        exp_csv = open('../../outputs/csv/sample_emails.csv').read()
+        self.assertEqual(res.status_code, 200)
+        self.assertEqual(exp_csv, res.text)
+
+        res = requests.get(
+            '{}{}/datahub/multiple-files:emails_csv/data/emails_csv.csv'.format(S3_SERVER, self.bucket_name))
+        self.assertEqual(res.status_code, 200)
+        self.assertEqual(exp_csv.replace('\n', '\r\n'), res.text)
+
+        res = requests.get(
+            '{}{}/datahub/multiple-files:emails_json/data/emails_json.json'.format(S3_SERVER, self.bucket_name))
+        self.assertEqual(res.status_code, 200)
+        exp_json = json.load(open('../../outputs/json/sample_emails.json'))
+        self.assertListEqual(exp_json, res.json())
+
+        res = requests.get(
+            '{}{}/datahub/multiple-files:multiple-files_zip/data/multiple-files.zip'.format(S3_SERVER, self.bucket_name))
+        self.assertEqual(res.status_code, 200)
+
+        # Elasticsearch
+        res = requests.get('http://localhost:9200/datahub/_search')
+        self.assertEqual(res.status_code, 200)
+
+        meta = res.json()
+        hits = [hit['_source'] for hit in meta['hits']['hits']
+                if hit['_source']['datapackage']['name'] == 'multiple-files']
+        self.assertEqual(len(hits), 1)
+
+        datahub = hits[0]['datahub']
+        datapackage = hits[0]['datapackage']
+        self.assertEqual(datahub['findability'], 'published')
+        self.assertEqual(datahub['owner'], 'datahub')
+        self.assertEqual(datahub['stats']['rowcount'], 40)
+        self.assertEqual(len(datapackage['resources']), 7)
+
+        res = requests.get('http://localhost:9200/events/_search')
+        self.assertEqual(res.status_code, 200)
+
+        events = res.json()
+        hits = [hit['_source'] for hit in events['hits']['hits']
+                if hit['_source']['dataset'] == 'multiple-files']
+        self.assertEqual(len(hits), 1)
+
+        event = hits[0]
+        self.assertEqual(event['event_action'], 'finished')
+        self.assertEqual(event['event_entity'], 'flow')
+        self.assertEqual(event['owner'], 'datahub')
+        self.assertEqual(event['status'], 'OK')
+
+    def test_excel_file(self):
+        run_factory(os.path.join(os.path.dirname(
+            os.path.realpath(__file__)), 'inputs/excel'))
+
+        res = requests.get(
+            '{}{}/datahub/excel/latest/datapackage.json'.format(S3_SERVER, self.bucket_name))
+
+        res = requests.get(
+            '{}{}/datahub/excel:birthdays/data/birthdays.xlsx'.format(S3_SERVER, self.bucket_name))
+        self.assertEqual(res.status_code, 200)
+
+        res = requests.get(
+            '{}{}/datahub/excel:birthdays_csv/data/birthdays_csv.csv'.format(S3_SERVER, self.bucket_name))
+        exp_csv = open('../../outputs/csv/sample_birthdays.csv').read()
+        self.assertEqual(res.status_code, 200)
+        self.assertEqual(exp_csv.replace('\n', '\r\n'), res.text)
+
+        res = requests.get(
+            '{}{}/datahub/excel:birthdays_json/data/birthdays_json.json'.format(S3_SERVER, self.bucket_name))
+        self.assertEqual(res.status_code, 200)
+        exp_json = json.load(open('../../outputs/json/sample_birthdays.json'))
+        self.assertListEqual(exp_json, res.json())
+
+        res = requests.get(
+            '{}{}/datahub/excel:excel_zip/data/excel.zip'.format(S3_SERVER, self.bucket_name))
+        self.assertEqual(res.status_code, 200)
+
+        # Elasticsearch
+        res = requests.get('http://localhost:9200/datahub/_search')
+        self.assertEqual(res.status_code, 200)
+
+        meta = res.json()
+        hits = [hit['_source'] for hit in meta['hits']['hits']
+                if hit['_source']['datapackage']['name'] == 'excel']
+        self.assertEqual(len(hits), 1)
+
+        datahub = hits[0]['datahub']
+        datapackage = hits[0]['datapackage']
+        self.assertEqual(datahub['findability'], 'published')
+        self.assertEqual(datahub['owner'], 'datahub')
+        self.assertEqual(datahub['stats']['rowcount'], 20)
+        self.assertEqual(len(datapackage['resources']), 4)
+
+        res = requests.get('http://localhost:9200/events/_search')
+        self.assertEqual(res.status_code, 200)
+
+        events = res.json()
+        hits = [hit['_source'] for hit in events['hits']['hits']
+                if hit['_source']['dataset'] == 'excel']
+        self.assertEqual(len(hits), 1)
+
+        event = hits[0]
+        self.assertEqual(event['event_action'], 'finished')
+        self.assertEqual(event['event_entity'], 'flow')
+        self.assertEqual(event['owner'], 'datahub')
+        self.assertEqual(event['status'], 'OK')
+
+    def test_needs_processing(self):
+        run_factory(os.path.join(os.path.dirname(
+            os.path.realpath(__file__)), 'inputs/needs_processing'))
+
+        res = requests.get(
+            '{}{}/datahub/single-file-processed/latest/datapackage.json'.format(S3_SERVER, self.bucket_name))
+
+        res = requests.get(
+            '{}{}/datahub/single-file-processed:birthdays/data/birthdays.csv'.format(S3_SERVER, self.bucket_name))
+        exp_csv = open('../../outputs/csv/sample_birthdays_invalid.csv').read()
+        self.assertEqual(res.status_code, 200)
+        self.assertEqual(exp_csv, res.text)
+
+        res = requests.get(
+            '{}{}/datahub/single-file-processed:birthdays_csv/data/birthdays_csv.csv'.format(S3_SERVER, self.bucket_name))
+        exp_csv = open('../../outputs/csv/sample_birthdays.csv').read()
+        self.assertEqual(res.status_code, 200)
+        self.assertEqual(exp_csv.replace('\n', '\r\n'), res.text)
+
+        res = requests.get(
+            '{}{}/datahub/single-file-processed:birthdays_json/data/birthdays_json.json'.format(S3_SERVER, self.bucket_name))
+        self.assertEqual(res.status_code, 200)
+        exp_json = json.load(open('../../outputs/json/sample_birthdays.json'))
+        self.assertListEqual(exp_json, res.json())
+
+        res = requests.get(
+            '{}{}/datahub/single-file-processed:single-file-processed_zip/data/single-file-processed.zip'.format(S3_SERVER, self.bucket_name))
+        self.assertEqual(res.status_code, 200)
+
+        # Elasticsearch
+        res = requests.get('http://localhost:9200/datahub/_search')
+        self.assertEqual(res.status_code, 200)
+
+        meta = res.json()
+        hits = [hit['_source'] for hit in meta['hits']['hits']
+                if hit['_source']['datapackage']['name'] == 'single-file-processed']
+        self.assertEqual(len(hits), 1)
+
+        datahub = hits[0]['datahub']
+        datapackage = hits[0]['datapackage']
+        self.assertEqual(datahub['findability'], 'published')
+        self.assertEqual(datahub['owner'], 'datahub')
+        self.assertEqual(datahub['stats']['rowcount'], 20)
+        self.assertEqual(len(datapackage['resources']), 4)
+
+        res = requests.get('http://localhost:9200/events/_search')
+        self.assertEqual(res.status_code, 200)
+
+        events = res.json()
+        hits = [hit['_source'] for hit in events['hits']['hits']
+                if hit['_source']['dataset'] == 'single-file-processed']
+        self.assertEqual(len(hits), 1)
+
+        event = hits[0]
+        self.assertEqual(event['event_action'], 'finished')
+        self.assertEqual(event['event_entity'], 'flow')
+        self.assertEqual(event['owner'], 'datahub')
+        self.assertEqual(event['status'], 'OK')
+
+    def test_elasticsearch_saves_multiple_datasets_and_events(self):
+        # Make sure ES is empty
+        es = Elasticsearch(hosts=[ES_SERVER])
+        es.indices.delete(index='datahub', ignore=[400, 404])
+        es.indices.delete(index='events', ignore=[400, 404])
+
+        # Run flow
+        run_factory(os.path.join(os.path.dirname(
+            os.path.realpath(__file__)), 'inputs/single_file'))
+        res = requests.get('http://localhost:9200/datahub/_search')
+        meta = res.json()
+        res = requests.get('http://localhost:9200/events/_search')
+        events = res.json()
+        self.assertEqual(meta['hits']['total'], 1)
+        self.assertEqual(events['hits']['total'], 1)
+
+        # Second flow
+        run_factory(os.path.join(os.path.dirname(
+            os.path.realpath(__file__)), 'inputs/multiple_files'))
+        res = requests.get('http://localhost:9200/datahub/_search')
+        meta = res.json()
+        res = requests.get('http://localhost:9200/events/_search')
+        events = res.json()
+        self.assertEqual(meta['hits']['total'], 2)
+        self.assertEqual(events['hits']['total'], 2)
+
+        # Third flow
+        run_factory(os.path.join(os.path.dirname(
+            os.path.realpath(__file__)), 'inputs/excel'))
+        res = requests.get('http://localhost:9200/datahub/_search')
+        meta = res.json()
+        res = requests.get('http://localhost:9200/events/_search')
+        events = res.json()
+        self.assertEqual(meta['hits']['total'], 3)
+        self.assertEqual(events['hits']['total'], 3)
+
+        # Clear again so we don't interfere with other tests
+        es.indices.delete(index='datahub', ignore=[400, 404])
+        es.indices.delete(index='events', ignore=[400, 404])
+
+    ## TODO: run flow, update metadata, run again
+    # def test_quick_succession_local(self):
+    #     start_time = time.time()
+    #     run_factory(os.path.join(os.path.dirname(
+    #         os.path.realpath(__file__)), 'inputs/local/needs_processing'))
+    #     time_elapsed_first_run = time.time() - start_time
+    #     start_time = time.time()
+    #     run_factory(os.path.join(os.path.dirname(
+    #         os.path.realpath(__file__)), 'inputs/local/needs_processing'))
+    #     elapsed_time_second_run = time.time() - start_time
+    #     self.assertTrue(time_elapsed_first_run > elapsed_time_second_run)
+
+    @classmethod
+    def teardown_class(self):
+        for obj in self.bucket.objects.all():
+            obj.delete()
+        self.bucket.delete()
diff --git a/tests/test_main.py b/tests/test_processors.py
similarity index 100%
rename from tests/test_main.py
rename to tests/test_processors.py
diff --git a/tox.ini b/tox.ini
index 986da50..2d05fec 100755
--- a/tox.ini
+++ b/tox.ini
@@ -6,11 +6,13 @@ envlist=
 
 [testenv]
 deps=
+    google-compute-engine
     mock
-    requests-mock
     pytest
+    elasticsearch
     pytest-cov
     coverage
+    boto3
     -rtest_requirements.txt
 passenv=
     CI