diff --git a/ga4gh/refget/loader/cli/methods/load.py b/ga4gh/refget/loader/cli/methods/load.py index 96c14bf..0f3fd3a 100644 --- a/ga4gh/refget/loader/cli/methods/load.py +++ b/ga4gh/refget/loader/cli/methods/load.py @@ -11,6 +11,7 @@ import click import json +import sys from ga4gh.refget.loader.config.constants import Status, JsonObjectType from ga4gh.refget.loader.refget_loader import RefgetLoader from ga4gh.refget.loader.validator import Validator @@ -70,3 +71,4 @@ def validate_single_config(objtype, fp): except Exception as e: print(e) + sys.exit(1) diff --git a/ga4gh/refget/loader/config/constants.py b/ga4gh/refget/loader/config/constants.py index 6ea9dc3..396c599 100644 --- a/ga4gh/refget/loader/config/constants.py +++ b/ga4gh/refget/loader/config/constants.py @@ -26,3 +26,14 @@ class JsonObjectType(object): SOURCE = 1 DESTINATION = 2 RUNTIME = 3 + +class MIMETypes(object): + """MIME string constants related to Content-Type headers in refget spec + + Attributes: + """ + + SEQUENCE_REFGET = "text/vnd.ga4gh.refget.v1.0.0+plain" + SEQUENCE_GENERIC = "text/plain" + METADATA_REFGET = "application/vnd.ga4gh.refget.v1.0.0+json" + METADATA_GENERIC = "application/json" diff --git a/ga4gh/refget/loader/destinations/aws/s3/aws_s3_uploader.py b/ga4gh/refget/loader/destinations/aws/s3/aws_s3_uploader.py index 9d75633..96270cb 100644 --- a/ga4gh/refget/loader/destinations/aws/s3/aws_s3_uploader.py +++ b/ga4gh/refget/loader/destinations/aws/s3/aws_s3_uploader.py @@ -12,6 +12,7 @@ import os import subprocess +from ga4gh.refget.loader.config.constants import MIMETypes from ga4gh.refget.loader.destinations.uploader import Uploader from ga4gh.refget.loader.jobset_status import JobsetStatus @@ -34,12 +35,16 @@ def __init__(self, manifest): super(AwsS3Uploader, self).__init__(manifest) + self.sequence_content_type = MIMETypes.SEQUENCE_GENERIC + self.metadata_content_type = MIMETypes.METADATA_REFGET + # generic template for putting a seq on s3 self.put_object_template = \ "aws s3api put-object " \ + "--bucket {bucket_name} " \ + "--key {s3_path} " \ + "--acl public-read " \ + + "--content-type {content_type}" \ + "{aws_profile} " # template for putting a seq by its primary id @@ -105,9 +110,15 @@ def __upload_manifest_entry(self, line): # upload both sequence and metadata according to the primary checksum ecs.append(self.__upload_file_primary_checksum( - s3_path=seq_primary_path, file_path=seq)) + s3_path=seq_primary_path, + file_path=seq, + content_type=self.sequence_content_type + )) ecs.append(self.__upload_file_primary_checksum( - s3_path=metadata_primary_path, file_path=metadata)) + s3_path=metadata_primary_path, + file_path=metadata, + content_type=self.metadata_content_type + )) # for each secondary id in the manifest entry, upload empty sequence # and metadata files, which will redirect to the primary entry @@ -115,10 +126,14 @@ def __upload_manifest_entry(self, line): seq_secondary_path = "sequence/" + secondary_id metadata_secondary_path = "metadata/json/" + secondary_id + ".json" ecs.append(self.__upload_file_secondary_checksum( - s3_path=seq_secondary_path, redirect="/"+seq_primary_path)) + s3_path=seq_secondary_path, + redirect="/"+seq_primary_path, + content_type=self.sequence_content_type + )) ecs.append(self.__upload_file_secondary_checksum( s3_path=metadata_secondary_path, - redirect="/"+metadata_primary_path + redirect="/"+metadata_primary_path, + content_type=self.metadata_content_type )) return ecs diff --git a/tests/data/json/ConfigEndToEndEnaAssemblyAwsS3.json b/tests/data/json/ConfigEndToEndEnaAssemblyAwsS3.json new file mode 100644 index 0000000..fe33e97 --- /dev/null +++ b/tests/data/json/ConfigEndToEndEnaAssemblyAwsS3.json @@ -0,0 +1,14 @@ +{ + "source": { + "type": "ena_assembly", + "ena_refget_processor_script": "/path/to/script.pl", + "processing_dir": "tests/data/output", + "start_date": "2019-08-02", + "number_of_days": 1 + }, + "destination": { + "type": "aws_s3", + "bucket_name": "ga4gh-refget" + }, + "environment": "local" +} \ No newline at end of file diff --git a/tests/data/txt/AssemblyScannerResults_2019-08-01.txt b/tests/data/txt/AssemblyScannerResults_2019-08-01.txt index eeda627..5175097 100644 --- a/tests/data/txt/AssemblyScannerResults_2019-08-01.txt +++ b/tests/data/txt/AssemblyScannerResults_2019-08-01.txt @@ -1,6 +1,6 @@ Accession URL GCA_003957565.2 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/rr/RRCB01.dat.gz -GCA_004027225.1 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/rx/RXXE01.dat.gz +GCA_004027225.2 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/rx/RXXE02.dat.gz GCA_007273535.1 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/ri/RIBP01.dat.gz GCA_007364275.2 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/st/STFV01.dat.gz GCA_007644095.1 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/si/SIDA01.dat.gz diff --git a/tests/data/txt/AssemblyScannerResults_2019-08-02.txt b/tests/data/txt/AssemblyScannerResults_2019-08-02.txt index 2c3a7de..ceedf4a 100644 --- a/tests/data/txt/AssemblyScannerResults_2019-08-02.txt +++ b/tests/data/txt/AssemblyScannerResults_2019-08-02.txt @@ -3,7 +3,7 @@ GCA_000238955.5 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/ag/AGTA05.dat.g GCA_002192655.2 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/nd/NDFZ01.dat.gz GCA_007827025.1 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/sp/SPQE01.dat.gz GCA_007827045.1 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/rp/RPFW01.dat.gz -GCA_007827085.1 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/va/VALE01.dat.gz +GCA_007827085.2 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/va/VALE02.dat.gz GCA_007827205.1 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/vi/VIRZ01.dat.gz GCA_007827215.1 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/vi/VISA01.dat.gz GCA_007827235.1 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/vi/VISB01.dat.gz diff --git a/tests/test_integration/test_load/test_load.py b/tests/test_integration/test_load/test_load.py new file mode 100644 index 0000000..0fe1ad1 --- /dev/null +++ b/tests/test_integration/test_load/test_load.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- +"""End-to-end testing of the "load" cli operation""" + +import click +import os +from click.testing import CliRunner +from ga4gh.refget.loader.cli.methods.load import load + +data_dir = os.path.join("tests", "data") +json_dir = os.path.join(data_dir, "json") +out_dir = os.path.join("output", "end") + +cases = [ + { + "config_file": os.path.join( + data_dir, + "ConfigEnaAssemblyAwsS3.json" + ) + } + +] + +def test_load(): + + for c in cases: + runner = CliRunner() + args = ["-c", c["config_file"]] + result = runner.invoke(load, args) + assert result.exit_code == 1 diff --git a/tests/test_unit/test_ga4gh/test_refget/test_loader/test_sources/test_ena/test_assembly/test_utils/test_assembly_scanner.py b/tests/test_unit/test_ga4gh/test_refget/test_loader/test_sources/test_ena/test_assembly/test_utils/test_assembly_scanner.py index ca08b2a..35cd156 100644 --- a/tests/test_unit/test_ga4gh/test_refget/test_loader/test_sources/test_ena/test_assembly/test_utils/test_assembly_scanner.py +++ b/tests/test_unit/test_ga4gh/test_refget/test_loader/test_sources/test_ena/test_assembly/test_utils/test_assembly_scanner.py @@ -56,6 +56,6 @@ def test_assembly_scanner(): temp_fh.write("") temp_fh.close() scanner.generate_accession_list(c["filepath"]) - observed_accessions = open(c["filepath"], "r").read() - expected_accessions = open(c["exp_filepath"], "r").read() + observed_accessions = "\n".join(open(c["filepath"], "r").read().split("\n")[:10]) + expected_accessions = "\n".join(open(c["exp_filepath"], "r").read().split("\n")[:10]) assert observed_accessions == expected_accessions