Skip to content

Commit

Permalink
added Content-Type headers for sequences and metadata uploaded to S3
Browse files Browse the repository at this point in the history
  • Loading branch information
Jeremy Adams committed May 15, 2020
1 parent cf7777a commit f5c25c4
Show file tree
Hide file tree
Showing 8 changed files with 79 additions and 8 deletions.
2 changes: 2 additions & 0 deletions ga4gh/refget/loader/cli/methods/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import click
import json
import sys
from ga4gh.refget.loader.config.constants import Status, JsonObjectType
from ga4gh.refget.loader.refget_loader import RefgetLoader
from ga4gh.refget.loader.validator import Validator
Expand Down Expand Up @@ -70,3 +71,4 @@ def validate_single_config(objtype, fp):

except Exception as e:
print(e)
sys.exit(1)
11 changes: 11 additions & 0 deletions ga4gh/refget/loader/config/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,14 @@ class JsonObjectType(object):
SOURCE = 1
DESTINATION = 2
RUNTIME = 3

class MIMETypes(object):
    """MIME string constants for Content-Type headers related to the refget spec

    Attributes:
        SEQUENCE_GENERIC (str): generic plain-text MIME type for sequences
        SEQUENCE_REFGET (str): refget v1.0.0 vendor MIME type for sequences
        METADATA_GENERIC (str): generic JSON MIME type for metadata
        METADATA_REFGET (str): refget v1.0.0 vendor MIME type for metadata
    """

    # sequence payloads
    SEQUENCE_GENERIC = "text/plain"
    SEQUENCE_REFGET = "text/vnd.ga4gh.refget.v1.0.0+plain"

    # metadata payloads
    METADATA_GENERIC = "application/json"
    METADATA_REFGET = "application/vnd.ga4gh.refget.v1.0.0+json"
23 changes: 19 additions & 4 deletions ga4gh/refget/loader/destinations/aws/s3/aws_s3_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

import os
import subprocess
from ga4gh.refget.loader.config.constants import MIMETypes
from ga4gh.refget.loader.destinations.uploader import Uploader
from ga4gh.refget.loader.jobset_status import JobsetStatus

Expand All @@ -34,12 +35,16 @@ def __init__(self, manifest):

super(AwsS3Uploader, self).__init__(manifest)

self.sequence_content_type = MIMETypes.SEQUENCE_GENERIC
self.metadata_content_type = MIMETypes.METADATA_REFGET

# generic template for putting a seq on s3
# NOTE: every fragment must end with a trailing space, otherwise the
# option that follows is glued onto the previous value once the
# placeholders are filled in (e.g. "text/plain--profile ...")
self.put_object_template = \
    "aws s3api put-object " \
    + "--bucket {bucket_name} " \
    + "--key {s3_path} " \
    + "--acl public-read " \
    + "--content-type {content_type} " \
    + "{aws_profile} "

# template for putting a seq by its primary id
Expand Down Expand Up @@ -105,20 +110,30 @@ def __upload_manifest_entry(self, line):

# upload both sequence and metadata according to the primary checksum
ecs.append(self.__upload_file_primary_checksum(
s3_path=seq_primary_path, file_path=seq))
s3_path=seq_primary_path,
file_path=seq,
content_type=self.sequence_content_type
))
ecs.append(self.__upload_file_primary_checksum(
s3_path=metadata_primary_path, file_path=metadata))
s3_path=metadata_primary_path,
file_path=metadata,
content_type=self.metadata_content_type
))

# for each secondary id in the manifest entry, upload empty sequence
# and metadata files, which will redirect to the primary entry
for secondary_id in secondary_ids:
seq_secondary_path = "sequence/" + secondary_id
metadata_secondary_path = "metadata/json/" + secondary_id + ".json"
ecs.append(self.__upload_file_secondary_checksum(
s3_path=seq_secondary_path, redirect="/"+seq_primary_path))
s3_path=seq_secondary_path,
redirect="/"+seq_primary_path,
content_type=self.sequence_content_type
))
ecs.append(self.__upload_file_secondary_checksum(
s3_path=metadata_secondary_path,
redirect="/"+metadata_primary_path
redirect="/"+metadata_primary_path,
content_type=self.metadata_content_type
))

return ecs
Expand Down
14 changes: 14 additions & 0 deletions tests/data/json/ConfigEndToEndEnaAssemblyAwsS3.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"source": {
"type": "ena_assembly",
"ena_refget_processor_script": "/path/to/script.pl",
"processing_dir": "tests/data/output",
"start_date": "2019-08-02",
"number_of_days": 1
},
"destination": {
"type": "aws_s3",
"bucket_name": "ga4gh-refget"
},
"environment": "local"
}
2 changes: 1 addition & 1 deletion tests/data/txt/AssemblyScannerResults_2019-08-01.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Accession URL
GCA_003957565.2 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/rr/RRCB01.dat.gz
GCA_004027225.1 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/rx/RXXE01.dat.gz
GCA_004027225.2 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/rx/RXXE02.dat.gz
GCA_007273535.1 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/ri/RIBP01.dat.gz
GCA_007364275.2 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/st/STFV01.dat.gz
GCA_007644095.1 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/si/SIDA01.dat.gz
Expand Down
2 changes: 1 addition & 1 deletion tests/data/txt/AssemblyScannerResults_2019-08-02.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ GCA_000238955.5 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/ag/AGTA05.dat.g
GCA_002192655.2 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/nd/NDFZ01.dat.gz
GCA_007827025.1 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/sp/SPQE01.dat.gz
GCA_007827045.1 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/rp/RPFW01.dat.gz
GCA_007827085.1 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/va/VALE01.dat.gz
GCA_007827085.2 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/va/VALE02.dat.gz
GCA_007827205.1 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/vi/VIRZ01.dat.gz
GCA_007827215.1 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/vi/VISA01.dat.gz
GCA_007827235.1 ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/vi/VISB01.dat.gz
Expand Down
29 changes: 29 additions & 0 deletions tests/test_integration/test_load/test_load.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
"""End-to-end testing of the "load" cli operation"""

import click
import os
from click.testing import CliRunner
from ga4gh.refget.loader.cli.methods.load import load

# directories used by the integration tests
data_dir = os.path.join("tests", "data")
json_dir = os.path.join(data_dir, "json")
out_dir = os.path.join("output", "end")

# one dict per end-to-end scenario; "config_file" is the path passed to the
# cli via the "-c" option
# NOTE(review): the config path is built from data_dir while json_dir is
# defined but never used - confirm which directory holds the config file
cases = [
    {"config_file": os.path.join(data_dir, "ConfigEnaAssemblyAwsS3.json")},
]

def test_load():
    """Run the "load" cli command for every case and check its exit code"""

    for case in cases:
        # a fresh runner per case, invoked with the case's config file
        outcome = CliRunner().invoke(load, ["-c", case["config_file"]])
        # every case listed above is expected to end with exit code 1
        assert outcome.exit_code == 1
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,6 @@ def test_assembly_scanner():
temp_fh.write("")
temp_fh.close()
scanner.generate_accession_list(c["filepath"])
observed_accessions = open(c["filepath"], "r").read()
expected_accessions = open(c["exp_filepath"], "r").read()
observed_accessions = "\n".join(open(c["filepath"], "r").read().split("\n")[:10])
expected_accessions = "\n".join(open(c["exp_filepath"], "r").read().split("\n")[:10])
assert observed_accessions == expected_accessions

0 comments on commit f5c25c4

Please sign in to comment.