diff --git a/deploy/ini_files/any.ini b/deploy/ini_files/any.ini index efc0c59ff7..74562fb6e7 100644 --- a/deploy/ini_files/any.ini +++ b/deploy/ini_files/any.ini @@ -5,6 +5,7 @@ file_upload_bucket = elasticbeanstalk-${S3_BUCKET_ENV}-files file_wfout_bucket = elasticbeanstalk-${S3_BUCKET_ENV}-wfoutput blob_bucket = elasticbeanstalk-${S3_BUCKET_ENV}-blobs system_bucket = elasticbeanstalk-${S3_BUCKET_ENV}-system +metadata_bundles_bucket = elasticbeanstalk-${S3_BUCKET_ENV}-metadata-bundles sentry_dsn = ${SENTRY_DSN} # blob_store_profile_name = encoded-4dn-files accession_factory = encoded.server_defaults.enc_accession diff --git a/deploy/ini_files/cgap.ini b/deploy/ini_files/cgap.ini index 29878d8b29..ff20206941 100644 --- a/deploy/ini_files/cgap.ini +++ b/deploy/ini_files/cgap.ini @@ -5,6 +5,7 @@ file_upload_bucket = elasticbeanstalk-fourfront-cgap-files file_wfout_bucket = elasticbeanstalk-fourfront-cgap-wfoutput blob_bucket = elasticbeanstalk-fourfront-cgap-blobs system_bucket = elasticbeanstalk-fourfront-cgap-system +metadata_bundles_bucket = elasticbeanstalk-fourfront-cgap-metadata-bundles sentry_dsn = ${SENTRY_DSN} # blob_store_profile_name = encoded-4dn-files accession_factory = encoded.server_defaults.enc_accession diff --git a/deploy/ini_files/cgapdev.ini b/deploy/ini_files/cgapdev.ini index 5afadaf23e..2129f22e26 100644 --- a/deploy/ini_files/cgapdev.ini +++ b/deploy/ini_files/cgapdev.ini @@ -5,6 +5,7 @@ file_upload_bucket = elasticbeanstalk-fourfront-cgapdev-files file_wfout_bucket = elasticbeanstalk-fourfront-cgapdev-wfoutput blob_bucket = elasticbeanstalk-fourfront-cgapdev-blobs system_bucket = elasticbeanstalk-fourfront-cgapdev-system +metadata_bundles_bucket = elasticbeanstalk-fourfront-cgapdev-metadata-bundles sentry_dsn = ${SENTRY_DSN} # blob_store_profile_name = encoded-4dn-files accession_factory = encoded.server_defaults.enc_accession diff --git a/deploy/ini_files/cgaptest.ini b/deploy/ini_files/cgaptest.ini index 73a9ecf9d1..51f420e5f3 100644 --- a/deploy/ini_files/cgaptest.ini +++ b/deploy/ini_files/cgaptest.ini @@ -5,6 +5,7 @@ file_upload_bucket = elasticbeanstalk-fourfront-cgaptest-files file_wfout_bucket = elasticbeanstalk-fourfront-cgaptest-wfoutput blob_bucket = elasticbeanstalk-fourfront-cgaptest-blobs system_bucket = elasticbeanstalk-fourfront-cgaptest-system +metadata_bundles_bucket = elasticbeanstalk-fourfront-cgaptest-metadata-bundles sentry_dsn = ${SENTRY_DSN} # blob_store_profile_name = encoded-4dn-files accession_factory = encoded.server_defaults.enc_accession diff --git a/deploy/ini_files/cgapwolf.ini b/deploy/ini_files/cgapwolf.ini index 87f710b683..c24cce541e 100644 --- a/deploy/ini_files/cgapwolf.ini +++ b/deploy/ini_files/cgapwolf.ini @@ -5,6 +5,7 @@ file_upload_bucket = elasticbeanstalk-fourfront-cgapwolf-files file_wfout_bucket = elasticbeanstalk-fourfront-cgapwolf-wfoutput blob_bucket = elasticbeanstalk-fourfront-cgapwolf-blobs system_bucket = elasticbeanstalk-fourfront-cgapwolf-system +metadata_bundles_bucket = elasticbeanstalk-fourfront-cgapwolf-metadata-bundles sentry_dsn = ${SENTRY_DSN} # blob_store_profile_name = encoded-4dn-files accession_factory = encoded.server_defaults.enc_accession diff --git a/development.ini b/development.ini index 076b60bf2c..db10817702 100644 --- a/development.ini +++ b/development.ini @@ -7,6 +7,7 @@ use = config:base.ini#app sqlalchemy.url = postgresql://postgres@localhost:5441/postgres?host=/tmp/snovault/pgdata blob_bucket = encoded-4dn-blobs +metadata_bundles_bucket = 
elasticbeanstalk-fourfront-cgaplocal-dev-metadata-bundles load_test_only = true create_tables = true testing = true diff --git a/poetry.lock b/poetry.lock index def899ca5a..8a0dd1b13f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1881,7 +1881,6 @@ version = "0.12.0" [[package]] category = "main" description = "Backport of pathlib-compatible object wrapper for zip files" -marker = "python_version < \"3.8\"" name = "zipp" optional = false python-versions = ">=3.6" @@ -1940,7 +1939,8 @@ transaction = ">=1.6.0" test = ["zope.testing"] [metadata] -content-hash = "e2ce709a6ff2c14e6f1d93e6cf2bd3074af7db59a91b88b18504d60ad4dc5abe" +content-hash = "b1621942e471803164c2c493acc159eb1793d107b8e68cec574c094ab6f754ba" +lock-version = "1.0" python-versions = ">=3.6.1,<3.7" [metadata.files] @@ -1994,37 +1994,37 @@ certifi = [ {file = "certifi-2020.6.20.tar.gz", hash = "sha256:5930595817496dd21bb8dc35dad090f1c2cd0adfaf21204bf6732ca5d8ee34d3"}, ] cffi = [ - {file = "cffi-1.14.3-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:485d029815771b9fe4fa7e1c304352fe57df6939afe835dfd0182c7c13d5e92e"}, + {file = "cffi-1.14.3-2-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:3eeeb0405fd145e714f7633a5173318bd88d8bbfc3dd0a5751f8c4f70ae629bc"}, + {file = "cffi-1.14.3-2-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:cb763ceceae04803adcc4e2d80d611ef201c73da32d8f2722e9d0ab0c7f10768"}, + {file = "cffi-1.14.3-2-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:44f60519595eaca110f248e5017363d751b12782a6f2bd6a7041cba275215f5d"}, + {file = "cffi-1.14.3-2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c53af463f4a40de78c58b8b2710ade243c81cbca641e34debf3396a9640d6ec1"}, + {file = "cffi-1.14.3-2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:33c6cdc071ba5cd6d96769c8969a0531be2d08c2628a0143a10a7dcffa9719ca"}, + {file = "cffi-1.14.3-2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c11579638288e53fc94ad60022ff1b67865363e730ee41ad5e6f0a17188b327a"}, {file = "cffi-1.14.3-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:3cb3e1b9ec43256c4e0f8d2837267a70b0e1ca8c4f456685508ae6106b1f504c"}, {file = "cffi-1.14.3-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:f0620511387790860b249b9241c2f13c3a80e21a73e0b861a2df24e9d6f56730"}, {file = "cffi-1.14.3-cp27-cp27m-win32.whl", hash = "sha256:005f2bfe11b6745d726dbb07ace4d53f057de66e336ff92d61b8c7e9c8f4777d"}, {file = "cffi-1.14.3-cp27-cp27m-win_amd64.whl", hash = "sha256:2f9674623ca39c9ebe38afa3da402e9326c245f0f5ceff0623dccdac15023e05"}, {file = "cffi-1.14.3-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:09e96138280241bd355cd585148dec04dbbedb4f46128f340d696eaafc82dd7b"}, {file = "cffi-1.14.3-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:3363e77a6176afb8823b6e06db78c46dbc4c7813b00a41300a4873b6ba63b171"}, - {file = "cffi-1.14.3-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:52bf29af05344c95136df71716bb60508bbd217691697b4307dcae681612db9f"}, {file = "cffi-1.14.3-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:0ef488305fdce2580c8b2708f22d7785ae222d9825d3094ab073e22e93dfe51f"}, {file = "cffi-1.14.3-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:0b1ad452cc824665ddc682400b62c9e4f5b64736a2ba99110712fdee5f2505c4"}, {file = "cffi-1.14.3-cp35-cp35m-win32.whl", hash = "sha256:85ba797e1de5b48aa5a8427b6ba62cf69607c18c5d4eb747604b7302f1ec382d"}, {file = "cffi-1.14.3-cp35-cp35m-win_amd64.whl", hash = "sha256:e66399cf0fc07de4dce4f588fc25bfe84a6d1285cc544e67987d22663393926d"}, - {file = "cffi-1.14.3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = 
"sha256:c687778dda01832555e0af205375d649fa47afeaeeb50a201711f9a9573323b8"}, {file = "cffi-1.14.3-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:15f351bed09897fbda218e4db5a3d5c06328862f6198d4fb385f3e14e19decb3"}, {file = "cffi-1.14.3-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:4d7c26bfc1ea9f92084a1d75e11999e97b62d63128bcc90c3624d07813c52808"}, {file = "cffi-1.14.3-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:23e5d2040367322824605bc29ae8ee9175200b92cb5483ac7d466927a9b3d537"}, {file = "cffi-1.14.3-cp36-cp36m-win32.whl", hash = "sha256:a624fae282e81ad2e4871bdb767e2c914d0539708c0f078b5b355258293c98b0"}, {file = "cffi-1.14.3-cp36-cp36m-win_amd64.whl", hash = "sha256:de31b5164d44ef4943db155b3e8e17929707cac1e5bd2f363e67a56e3af4af6e"}, - {file = "cffi-1.14.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:03d3d238cc6c636a01cf55b9b2e1b6531a7f2f4103fabb5a744231582e68ecc7"}, {file = "cffi-1.14.3-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:f92cdecb618e5fa4658aeb97d5eb3d2f47aa94ac6477c6daf0f306c5a3b9e6b1"}, {file = "cffi-1.14.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:22399ff4870fb4c7ef19fff6eeb20a8bbf15571913c181c78cb361024d574579"}, {file = "cffi-1.14.3-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:f4eae045e6ab2bb54ca279733fe4eb85f1effda392666308250714e01907f394"}, {file = "cffi-1.14.3-cp37-cp37m-win32.whl", hash = "sha256:b0358e6fefc74a16f745afa366acc89f979040e0cbc4eec55ab26ad1f6a9bfbc"}, {file = "cffi-1.14.3-cp37-cp37m-win_amd64.whl", hash = "sha256:6642f15ad963b5092d65aed022d033c77763515fdc07095208f15d3563003869"}, - {file = "cffi-1.14.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c2a33558fdbee3df370399fe1712d72464ce39c66436270f3664c03f94971aff"}, {file = "cffi-1.14.3-cp38-cp38-manylinux1_i686.whl", hash = "sha256:2791f68edc5749024b4722500e86303a10d342527e1e3bcac47f35fbd25b764e"}, {file = "cffi-1.14.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:529c4ed2e10437c205f38f3691a68be66c39197d01062618c55f74294a4a4828"}, {file = "cffi-1.14.3-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:8f0f1e499e4000c4c347a124fa6a27d37608ced4fe9f7d45070563b7c4c370c9"}, {file = "cffi-1.14.3-cp38-cp38-win32.whl", hash = "sha256:3b8eaf915ddc0709779889c472e553f0d3e8b7bdf62dab764c8921b09bf94522"}, {file = "cffi-1.14.3-cp38-cp38-win_amd64.whl", hash = "sha256:bbd2f4dfee1079f76943767fce837ade3087b578aeb9f69aec7857d5bf25db15"}, - {file = "cffi-1.14.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5d9a7dc7cf8b1101af2602fe238911bcc1ac36d239e0a577831f5dac993856e9"}, {file = "cffi-1.14.3-cp39-cp39-manylinux1_i686.whl", hash = "sha256:cc75f58cdaf043fe6a7a6c04b3b5a0e694c6a9e24050967747251fb80d7bce0d"}, {file = "cffi-1.14.3-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:bf39a9e19ce7298f1bd6a9758fa99707e9e5b1ebe5e90f2c3913a47bc548747c"}, {file = "cffi-1.14.3-cp39-cp39-win32.whl", hash = "sha256:d80998ed59176e8cba74028762fbd9b9153b9afc71ea118e63bbf5d4d0f9552b"}, @@ -2474,7 +2474,6 @@ pycryptodome = [ {file = "pycryptodome-3.9.8-cp38-cp38-win_amd64.whl", hash = "sha256:55eb61aca2c883db770999f50d091ff7c14016f2769ad7bca3d9b75d1d7c1b68"}, {file = "pycryptodome-3.9.8-cp39-cp39-manylinux1_i686.whl", hash = "sha256:39ef9fb52d6ec7728fce1f1693cb99d60ce302aeebd59bcedea70ca3203fda60"}, {file = "pycryptodome-3.9.8-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:de6e1cd75677423ff64712c337521e62e3a7a4fc84caabbd93207752e831a85a"}, - {file = "pycryptodome-3.9.8.tar.gz", hash = "sha256:0e24171cf01021bc5dc17d6a9d4f33a048f09d62cc3f62541e95ef104588bda4"}, ] pyflakes = [ {file = 
"pyflakes-2.2.0-py2.py3-none-any.whl", hash = "sha256:0d94e0e05a19e57a99444b6ddcf9a6eb2e5c68d3ca1e98e90707af8152c90a92"}, diff --git a/pyproject.toml b/pyproject.toml index e9c3a7501f..40212d2685 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] # Note: Various modules refer to this system as "encoded", not "cgap-portal". name = "encoded" -version = "3.0.0" +version = "3.1.0" description = "Clinical Genomics Analysis Platform" authors = ["4DN-DCIC Team "] license = "MIT" @@ -115,11 +115,11 @@ xlwt = "1.2.0" "zope.deprecation" = "4.4.0" "zope.interface" = "4.6.0" "zope.sqlalchemy" = "1.3" -sentry-sdk = "^0.16.4" +sentry-sdk = "^0.16.5" [tool.poetry.dev-dependencies] # PyCharm says boto3-stubs contains useful type hints -boto3-stubs = ">=1.14.37.0" +boto3-stubs = ">=1.14.55.2" coverage = ">=5.2" codacy-coverage = ">=1.3.11" coveralls = ">=2.1.1" @@ -179,6 +179,9 @@ profiler = "encoded.commands.profiler:main" purge-item-type = "encoded.commands.purge_item_type:main" run-upgrade-on-inserts = "encoded.commands.run_upgrader_on_inserts:main" spreadsheet-to-json = "encoded.commands.spreadsheet_to_json:main" +submission-test = "encoded.commands.submission_test:main" +# Use the same-named script in SubmitCGAP instead. +# submit-metadata-bundle = "encoded.commands.submit_metadata_bundle:main" update-inserts-from-server = "encoded.commands.update_inserts_from_server:main" verify-item = "encoded.commands.verify_item:main" diff --git a/src/encoded/authentication.py b/src/encoded/authentication.py index 4e0166f5c1..3bf387d237 100644 --- a/src/encoded/authentication.py +++ b/src/encoded/authentication.py @@ -260,27 +260,6 @@ def get_token_info(self, token, request): return None -# def get_jwt(request): -# token = None -# try: -# # ensure this is a jwt token not basic auth: -# auth_type = request.headers['Authorization'][:6] -# if auth_type.strip().lower() == 'bearer': -# token = request.headers['Authorization'][7:] -# except (ValueError, TypeError, KeyError): -# pass -# -# if not token and request.method in ('GET', 'HEAD'): -# # Only grab this if is a GET request, not a transactional request to help mitigate CSRF attacks. -# # See: https://en.wikipedia.org/wiki/Cross-site_request_forgery#Cookie-to-header_token -# # The way our JS grabs and sticks JWT into Authorization header is somewhat analogous to above approach. -# # TODO: Ensure our `Access-Control-Allow-Origin` response headers are appropriate (more for CGAP). -# # TODO: Get a security audit done. 
-# token = request.cookies.get('jwtToken') -# -# return token - - def get_jwt(request): token = None diff --git a/src/encoded/commands/create_mapping_on_deploy.py b/src/encoded/commands/create_mapping_on_deploy.py index 9d43e42a92..1d41031f95 100644 --- a/src/encoded/commands/create_mapping_on_deploy.py +++ b/src/encoded/commands/create_mapping_on_deploy.py @@ -62,7 +62,8 @@ 'EvidenceDisPheno', 'Page', 'GeneAnnotationField', - 'HiglassViewConfig' + 'HiglassViewConfig', + 'IngestionSubmission', ] ENV_HOTSEAT = 'fourfront-cgaphot' diff --git a/src/encoded/commands/submission_test.py b/src/encoded/commands/submission_test.py new file mode 100644 index 0000000000..ac3461fbb3 --- /dev/null +++ b/src/encoded/commands/submission_test.py @@ -0,0 +1,94 @@ +import io +import json +import os + +from dcicutils.misc_utils import VirtualApp +from pyramid.paster import get_app +from ..submit import digest_xls, xls_to_json, validate_all_items, post_and_patch_all_items +from ..tests.data import DBMI_INSTITUTION, TEST_PROJECT, METADATA_BUNDLE_PATH + + +TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "submission_test_data") + +TEST_FILE_TO_VALIDATE = os.path.join(TEST_DATA_DIR, "to_validate.json") +TEST_FILE_TO_PATCH = os.path.join(TEST_DATA_DIR, "to_patch.json") +TEST_FILE_TO_POST = os.path.join(TEST_DATA_DIR, "to_post.json") +TEST_FILE_POST_OUTPUT = os.path.join(TEST_DATA_DIR, "post_output.json") +TEST_FILE_PATCH_OUTPUT = os.path.join(TEST_DATA_DIR, "patch_output.json") +TEST_FILE_TO_UPLOAD = os.path.join(TEST_DATA_DIR, "to_upload.json") + +with io.open(TEST_FILE_TO_VALIDATE, 'r') as fp: + TEST_DATA_TO_VALIDATE = json.load(fp) + +with io.open(TEST_FILE_TO_POST, 'r') as fp: + TEST_DATA_TO_POST = json.load(fp) + +with io.open(TEST_FILE_TO_PATCH, 'r') as fp: + TEST_DATA_TO_PATCH = json.load(fp) + +with io.open(TEST_FILE_POST_OUTPUT, 'r') as fp: + TEST_DATA_POST_OUTPUT = json.load(fp) + +with io.open(TEST_FILE_PATCH_OUTPUT, 'r') as fp: + TEST_DATA_PATCH_OUTPUT = json.load(fp) + +with io.open(TEST_FILE_TO_UPLOAD, 'r') as fp: + TEST_DATA_TO_UPLOAD = json.load(fp) + + +def main(): + """ + This does a simple test of the data pipeline used for metadata bundle submission. + + This does not test the submission endpoints, but DOES call server endpoints to + inquire about current state and based on that state to decide to post or patch the data. + (As such, the exact action of this script is dependent on the state of the db when it is run.) + + The server called will be the one defined by the development environment. + As such, that means we won't be using this test as part of unit tests, + though perhaps variations of this will be adopted for that purpose. 
+ """ + + app = get_app('development.ini', 'app') + environ = {'HTTP_ACCEPT': 'application/json', 'REMOTE_USER': 'TEST'} + virtualapp = VirtualApp(app, environ) + proj = virtualapp.get(TEST_PROJECT).json + inst = virtualapp.get(DBMI_INSTITUTION).json + rows = digest_xls(METADATA_BUNDLE_PATH) + json_data, passing = xls_to_json(rows, proj, inst) + print('JSON data (to validate):', json.dumps(json_data)) + assert json_data == TEST_DATA_TO_VALIDATE + print('JSON data to validate matches contents of %s' % TEST_FILE_TO_VALIDATE) + final_json, validation_log, passing = validate_all_items(virtualapp, json_data) + print('Validation Log:\n'.join(validation_log)) + print("Passing (after validation):", passing) + print("Final JSON (to post, after validation):", json.dumps(final_json, indent=4)) + if final_json == TEST_DATA_TO_PATCH: + # NOTE: There are more possible intermediate states than just "it's all been done" and "none has been done", + # but this simple script does not anticipate those and will just fail if one of those other states is in play. + # -kmp 8-Sep-2020 + posting = False + print("JSON data has already been posted. Light patching is expected.") + print("(To test posting at this point would require wiping the database or adjusting numerous items.)") + print('JSON data to patch matches contents of %s' % TEST_FILE_TO_PATCH) + else: + posting = True + assert final_json == TEST_DATA_TO_POST + print('JSON data to post matches contents of %s' % TEST_FILE_TO_POST) + output, passing, files = post_and_patch_all_items(virtualapp, final_json) + print('Post Output:\n', '\n'.join(output)) + print('Passing (after post and patch):', passing) + if posting: + assert output == TEST_DATA_POST_OUTPUT + print('JSON data to post matches contents of %s' % TEST_FILE_POST_OUTPUT) + else: + assert output == TEST_DATA_PATCH_OUTPUT + print('JSON data to post matches contents of %s' % TEST_FILE_PATCH_OUTPUT) + print('Files:', json.dumps(files, indent=4)) + assert files == TEST_DATA_TO_UPLOAD + print('JSON data to upload matches contents of %s' % TEST_FILE_TO_UPLOAD) + print("SUCCESS! 
All done.") + + +if __name__ == '__main__': + main() diff --git a/src/encoded/commands/submission_test_data/patch_output.json b/src/encoded/commands/submission_test_data/patch_output.json new file mode 100644 index 0000000000..cf82f36b19 --- /dev/null +++ b/src/encoded/commands/submission_test_data/patch_output.json @@ -0,0 +1,3 @@ +[ + "file_fastq: 4 items patched successfully; 0 items not patched" +] diff --git a/src/encoded/commands/submission_test_data/post_output.json b/src/encoded/commands/submission_test_data/post_output.json new file mode 100644 index 0000000000..2e420ebac7 --- /dev/null +++ b/src/encoded/commands/submission_test_data/post_output.json @@ -0,0 +1,23 @@ +[ + "Success - sample 3464467 posted", + "Success - sample 3464468 posted", + "Success - sample 3464469 posted", + "Success - individual 456 posted", + "Success - individual 789 posted", + "Success - individual 123 posted", + "Success - family 333 posted", + "file_fastq: 4 items posted successfully; 0 items not posted", + "sample: 3 items posted successfully; 0 items not posted", + "individual: 3 items posted successfully; 0 items not posted", + "family: 1 items posted successfully; 0 items not posted", + "sample_processing: 1 items posted successfully; 0 items not posted", + "report: 1 items posted successfully; 0 items not posted", + "case: 3 items posted successfully; 0 items not posted", + "file_fastq: 4 items patched successfully; 0 items not patched", + "sample: 3 items patched successfully; 0 items not patched", + "individual: 3 items patched successfully; 0 items not patched", + "family: 1 items patched successfully; 0 items not patched", + "sample_processing: 1 items patched successfully; 0 items not patched", + "report: 1 items patched successfully; 0 items not patched", + "case: 3 items patched successfully; 0 items not patched" +] diff --git a/src/encoded/commands/submission_test_data/to_patch.json b/src/encoded/commands/submission_test_data/to_patch.json new file mode 100644 index 0000000000..2fde587615 --- /dev/null +++ b/src/encoded/commands/submission_test_data/to_patch.json @@ -0,0 +1,47 @@ +{ + "post": {}, + "patch": { + "file_fastq": { + "/files-fastq/GAPFINDNNXMD/": { + "filename": "f1.fastq.gz", + "status": "uploading" + }, + "/files-fastq/GAPFI8NPRCLA/": { + "filename": "f2.fastq.gz", + "status": "uploading" + }, + "/files-fastq/GAPFIHLOD3W5/": { + "filename": "f3.fastq.gz", + "status": "uploading" + }, + "/files-fastq/GAPFI558AY9P/": { + "filename": "f4.fastq.gz", + "status": "uploading" + } + }, + "sample": {}, + "individual": {}, + "family": {}, + "sample_processing": {}, + "report": {}, + "case": {} + }, + "aliases": { + "hms-dbmi:f1.fastq.gz": "/files-fastq/GAPFINDNNXMD/", + "hms-dbmi:f2.fastq.gz": "/files-fastq/GAPFI8NPRCLA/", + "hms-dbmi:f3.fastq.gz": "/files-fastq/GAPFIHLOD3W5/", + "hms-dbmi:f4.fastq.gz": "/files-fastq/GAPFI558AY9P/", + "hms-dbmi:sample-3464467": "/samples/GAPSAOZBTKEV/", + "hms-dbmi:sample-3464468": "/samples/GAPSALGBXFHS/", + "hms-dbmi:sample-3464469": "/samples/GAPSA4YIRPGC/", + "hms-dbmi:individual-456": "/individuals/GAPIDXBYJESJ/", + "hms-dbmi:individual-789": "/individuals/GAPID16TYLTK/", + "hms-dbmi:individual-123": "/individuals/GAPIDF5WPST7/", + "hms-dbmi:family-456": "/families/GAPFA5SLA4GB/", + "hms-dbmi:analysis-55432": "/sample-processings/43a03760-4e11-41ae-81d9-a408acd29f16/", + "hms-dbmi:report-55432-3464467": "/reports/ea7490cb-d480-4ecf-b913-467fc24d7294/", + "hms-dbmi:case-55432-3464467": "/cases/GAPCA12VO85Q/", + "hms-dbmi:case-55432-3464468": 
"/cases/GAPCAWSV5574/", + "hms-dbmi:case-55432-3464469": "/cases/GAPCA2SA3Z84/" + } +} diff --git a/src/encoded/commands/submission_test_data/to_post.json b/src/encoded/commands/submission_test_data/to_post.json new file mode 100644 index 0000000000..85ea88c5d6 --- /dev/null +++ b/src/encoded/commands/submission_test_data/to_post.json @@ -0,0 +1,259 @@ +{ + "post": { + "file_fastq": [ + { + "aliases": [ + "hms-dbmi:f1.fastq.gz" + ], + "row": 4, + "file_format": "/file-formats/fastq/", + "file_type": "reads", + "filename": "f1.fastq.gz", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/", + "status": "uploading" + }, + { + "aliases": [ + "hms-dbmi:f2.fastq.gz" + ], + "row": 4, + "file_format": "/file-formats/fastq/", + "file_type": "reads", + "filename": "f2.fastq.gz", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/", + "status": "uploading" + }, + { + "aliases": [ + "hms-dbmi:f3.fastq.gz" + ], + "row": 5, + "file_format": "/file-formats/fastq/", + "file_type": "reads", + "filename": "f3.fastq.gz", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/", + "status": "uploading" + }, + { + "aliases": [ + "hms-dbmi:f4.fastq.gz" + ], + "row": 6, + "file_format": "/file-formats/fastq/", + "file_type": "reads", + "filename": "f4.fastq.gz", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/", + "status": "uploading" + } + ], + "sample": [ + { + "aliases": [ + "hms-dbmi:sample-3464467" + ], + "workup_type": "WGS", + "specimen_type": "Peripheral Blood", + "date_transported": "2020-11-02", + "indication": "Ovarian cancer", + "sent_by": "tr44", + "physician_id": "11946744", + "specimen_collection_date": "2020-11-02", + "specimen_accession": "3464467", + "transported_by": "Fedex", + "sequencing_lab": "Broad", + "requisition_type": "GED or BioBank", + "date_requisition_received": "2020-02-02", + "row": 4, + "other_specimen_ids": [ + { + "id": "N/A", + "id_type": "hms-dbmi" + } + ], + "requisition_acceptance": { + "accepted_rejected": "Rejected", + "rejection_reason": "missing DOB" + }, + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + }, + { + "aliases": [ + "hms-dbmi:sample-3464468" + ], + "workup_type": "WGS", + "specimen_type": "Peripheral Blood", + "date_transported": "2020-11-02", + "indication": "Ovarian cancer", + "sent_by": "tr44", + "physician_id": "11946744", + "specimen_collection_date": "2020-11-02", + "specimen_accession": "3464468", + "transported_by": "Fedex", + "sequencing_lab": "Broad", + "requisition_type": "GED or BioBank", + "date_requisition_received": "2020-02-02", + "row": 5, + "requisition_acceptance": { + "accepted_rejected": "Rejected", + "rejection_reason": "missing DOB" + }, + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + }, + { + "aliases": [ + "hms-dbmi:sample-3464469" + ], + "workup_type": "WGS", + "specimen_type": "Peripheral Blood", + "date_transported": "2020-11-02", + "indication": "Ovarian cancer", + "sent_by": "tr44", + "physician_id": "11946744", + "specimen_collection_date": "2020-11-02", + "specimen_accession": "3464469", + "transported_by": "Fedex", + "sequencing_lab": "Broad", + "requisition_type": "GED or BioBank", + "date_requisition_received": "2020-02-02", + "row": 6, + "requisition_acceptance": { + "accepted_rejected": 
"Rejected", + "rejection_reason": "missing DOB" + }, + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + } + ], + "individual": [ + { + "aliases": [ + "hms-dbmi:individual-456" + ], + "individual_id": "456", + "sex": "M", + "birth_year": 1991, + "row": 4, + "samples": [ + "hms-dbmi:sample-3464467" + ], + "mother": "hms-dbmi:individual-123", + "father": "hms-dbmi:individual-789", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + }, + { + "aliases": [ + "hms-dbmi:individual-789" + ], + "individual_id": "789", + "sex": "M", + "birth_year": 1961, + "row": 5, + "samples": [ + "hms-dbmi:sample-3464468" + ], + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + }, + { + "aliases": [ + "hms-dbmi:individual-123" + ], + "individual_id": "123", + "sex": "F", + "birth_year": 1945, + "row": 6, + "samples": [ + "hms-dbmi:sample-3464469" + ], + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + } + ], + "family": [ + { + "aliases": [ + "hms-dbmi:family-456" + ], + "family_id": "333", + "members": [ + "hms-dbmi:individual-456", + "hms-dbmi:individual-789", + "hms-dbmi:individual-123" + ], + "row": 4, + "proband": "hms-dbmi:individual-456", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + } + ], + "sample_processing": [ + { + "aliases": [ + "hms-dbmi:analysis-55432" + ], + "samples": [ + "hms-dbmi:sample-3464467", + "hms-dbmi:sample-3464468", + "hms-dbmi:sample-3464469" + ], + "families": [ + "hms-dbmi:family-456" + ], + "analysis_type": "WGS-Trio", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + } + ], + "report": [ + { + "aliases": [ + "hms-dbmi:report-55432-3464467" + ], + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + } + ], + "case": [ + { + "aliases": [ + "hms-dbmi:case-55432-3464467" + ], + "sample_processing": "hms-dbmi:analysis-55432", + "family": "hms-dbmi:family-456", + "individual": "hms-dbmi:individual-456", + "report": "hms-dbmi:report-55432-3464467", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + }, + { + "aliases": [ + "hms-dbmi:case-55432-3464468" + ], + "sample_processing": "hms-dbmi:analysis-55432", + "family": "hms-dbmi:family-456", + "individual": "hms-dbmi:individual-789", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + }, + { + "aliases": [ + "hms-dbmi:case-55432-3464469" + ], + "sample_processing": "hms-dbmi:analysis-55432", + "family": "hms-dbmi:family-456", + "individual": "hms-dbmi:individual-123", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + } + ] + }, + "patch": {}, + "aliases": {} +} diff --git a/src/encoded/commands/submission_test_data/to_upload.json b/src/encoded/commands/submission_test_data/to_upload.json new file mode 100644 index 0000000000..44245a5b28 --- /dev/null +++ b/src/encoded/commands/submission_test_data/to_upload.json @@ -0,0 +1,18 @@ +[ + { + "uuid": "48ee5042-7498-491d-b599-bcf5afcdbfa7", + "filename": "f1.fastq.gz" + }, + { + "uuid": "eca0f76c-06d8-4953-828e-0bd8998639dc", + "filename": "f2.fastq.gz" + }, + { + "uuid": "9f576938-bd3e-4eef-ae7c-13cd06bdc3fe", + "filename": 
"f3.fastq.gz" + }, + { + "uuid": "eedfa412-a797-48a8-863f-639a33ba2814", + "filename": "f4.fastq.gz" + } +] diff --git a/src/encoded/commands/submission_test_data/to_validate.json b/src/encoded/commands/submission_test_data/to_validate.json new file mode 100644 index 0000000000..1ed5af1e95 --- /dev/null +++ b/src/encoded/commands/submission_test_data/to_validate.json @@ -0,0 +1,253 @@ +{ + "individual": { + "hms-dbmi:individual-456": { + "aliases": [ + "hms-dbmi:individual-456" + ], + "individual_id": "456", + "sex": "M", + "birth_year": 1991, + "row": 4, + "samples": [ + "hms-dbmi:sample-3464467" + ], + "mother": "hms-dbmi:individual-123", + "father": "hms-dbmi:individual-789", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + }, + "hms-dbmi:individual-789": { + "aliases": [ + "hms-dbmi:individual-789" + ], + "individual_id": "789", + "sex": "M", + "birth_year": 1961, + "row": 5, + "samples": [ + "hms-dbmi:sample-3464468" + ], + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + }, + "hms-dbmi:individual-123": { + "aliases": [ + "hms-dbmi:individual-123" + ], + "individual_id": "123", + "sex": "F", + "birth_year": 1945, + "row": 6, + "samples": [ + "hms-dbmi:sample-3464469" + ], + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + } + }, + "family": { + "hms-dbmi:family-456": { + "aliases": [ + "hms-dbmi:family-456" + ], + "family_id": "333", + "members": [ + "hms-dbmi:individual-456", + "hms-dbmi:individual-789", + "hms-dbmi:individual-123" + ], + "row": 4, + "proband": "hms-dbmi:individual-456", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + } + }, + "sample": { + "hms-dbmi:sample-3464467": { + "aliases": [ + "hms-dbmi:sample-3464467" + ], + "workup_type": "WGS", + "specimen_type": "Peripheral Blood", + "date_transported": "2020-11-02", + "indication": "Ovarian cancer", + "sent_by": "tr44", + "physician_id": "11946744", + "specimen_collection_date": "2020-11-02", + "specimen_accession": "3464467", + "transported_by": "Fedex", + "sequencing_lab": "Broad", + "requisition_type": "GED or BioBank", + "date_requisition_received": "2020-02-02", + "row": 4, + "other_specimen_ids": [ + { + "id": "N/A", + "id_type": "hms-dbmi" + } + ], + "requisition_acceptance": { + "accepted_rejected": "Rejected", + "rejection_reason": "missing DOB" + }, + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + }, + "hms-dbmi:sample-3464468": { + "aliases": [ + "hms-dbmi:sample-3464468" + ], + "workup_type": "WGS", + "specimen_type": "Peripheral Blood", + "date_transported": "2020-11-02", + "indication": "Ovarian cancer", + "sent_by": "tr44", + "physician_id": "11946744", + "specimen_collection_date": "2020-11-02", + "specimen_accession": "3464468", + "transported_by": "Fedex", + "sequencing_lab": "Broad", + "requisition_type": "GED or BioBank", + "date_requisition_received": "2020-02-02", + "row": 5, + "requisition_acceptance": { + "accepted_rejected": "Rejected", + "rejection_reason": "missing DOB" + }, + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + }, + "hms-dbmi:sample-3464469": { + "aliases": [ + "hms-dbmi:sample-3464469" + ], + "workup_type": "WGS", + "specimen_type": "Peripheral Blood", + "date_transported": "2020-11-02", + "indication": "Ovarian cancer", + 
"sent_by": "tr44", + "physician_id": "11946744", + "specimen_collection_date": "2020-11-02", + "specimen_accession": "3464469", + "transported_by": "Fedex", + "sequencing_lab": "Broad", + "requisition_type": "GED or BioBank", + "date_requisition_received": "2020-02-02", + "row": 6, + "requisition_acceptance": { + "accepted_rejected": "Rejected", + "rejection_reason": "missing DOB" + }, + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + } + }, + "sample_processing": { + "hms-dbmi:analysis-55432": { + "aliases": [ + "hms-dbmi:analysis-55432" + ], + "samples": [ + "hms-dbmi:sample-3464467", + "hms-dbmi:sample-3464468", + "hms-dbmi:sample-3464469" + ], + "families": [ + "hms-dbmi:family-456" + ], + "analysis_type": "WGS-Trio", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + } + }, + "file_fastq": { + "hms-dbmi:f1.fastq.gz": { + "aliases": [ + "hms-dbmi:f1.fastq.gz" + ], + "row": 4, + "file_format": "/file-formats/fastq/", + "file_type": "reads", + "filename": "f1.fastq.gz", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + }, + "hms-dbmi:f2.fastq.gz": { + "aliases": [ + "hms-dbmi:f2.fastq.gz" + ], + "row": 4, + "file_format": "/file-formats/fastq/", + "file_type": "reads", + "filename": "f2.fastq.gz", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + }, + "hms-dbmi:f3.fastq.gz": { + "aliases": [ + "hms-dbmi:f3.fastq.gz" + ], + "row": 5, + "file_format": "/file-formats/fastq/", + "file_type": "reads", + "filename": "f3.fastq.gz", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + }, + "hms-dbmi:f4.fastq.gz": { + "aliases": [ + "hms-dbmi:f4.fastq.gz" + ], + "row": 6, + "file_format": "/file-formats/fastq/", + "file_type": "reads", + "filename": "f4.fastq.gz", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + } + }, + "file_processed": {}, + "case": { + "hms-dbmi:case-55432-3464467": { + "aliases": [ + "hms-dbmi:case-55432-3464467" + ], + "sample_processing": "hms-dbmi:analysis-55432", + "family": "hms-dbmi:family-456", + "individual": "hms-dbmi:individual-456", + "report": "hms-dbmi:report-55432-3464467", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + }, + "hms-dbmi:case-55432-3464468": { + "aliases": [ + "hms-dbmi:case-55432-3464468" + ], + "sample_processing": "hms-dbmi:analysis-55432", + "family": "hms-dbmi:family-456", + "individual": "hms-dbmi:individual-789", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + }, + "hms-dbmi:case-55432-3464469": { + "aliases": [ + "hms-dbmi:case-55432-3464469" + ], + "sample_processing": "hms-dbmi:analysis-55432", + "family": "hms-dbmi:family-456", + "individual": "hms-dbmi:individual-123", + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + } + }, + "report": { + "hms-dbmi:report-55432-3464467": { + "aliases": [ + "hms-dbmi:report-55432-3464467" + ], + "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/", + "institution": "/institutions/hms-dbmi/" + } + }, + "errors": [] +} diff --git a/src/encoded/ingestion/__init__.py b/src/encoded/ingestion/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git 
a/src/encoded/ingestion/common.py b/src/encoded/ingestion/common.py new file mode 100644 index 0000000000..9e7b1017dd --- /dev/null +++ b/src/encoded/ingestion/common.py @@ -0,0 +1,119 @@ +""" +common.py - tools common to various parts of ingestion +""" + +from .exceptions import MissingParameter, BadParameter + + +def metadata_bundles_bucket(registry): + return registry.settings.get('metadata_bundles_bucket') + + +# ================================================== + +# IMPLEMENTATION NOTE: +# +# We have middleware that overrides various details about content type that are declared in the view_config. +# It used to work by having a wired set of exceptions, but this facility allows us to do it in a more data-driven +# way. Really I think we should just rely on the information in the view_config, but I didn't have time to explore +# why we are not using that. +# +# See validate_request_tween_factory in renderers.py for where this is used. This declaration info is here +# rather than there to simplify the load order dependencies. +# +# -kmp 1-Sep-2020 + +CONTENT_TYPE_SPECIAL_CASES = { + 'application/x-www-form-urlencoded': [ + # Single legacy special case to allow us to POST to metadata TSV requests via form submission. + # All other special case values should be added using register_path_content_type. + '/metadata/' + ] +} + + +def register_path_content_type(*, path, content_type): + """ + Registers that endpoints that begin with the specified path use the indicated content_type. + + This is part of an inelegant workaround for an issue in renderers.py that maybe we can make go away in the future. + See the 'implementation note' in ingestion/common.py for more details. + """ + exceptions = CONTENT_TYPE_SPECIAL_CASES.get(content_type, None) + if exceptions is None: + CONTENT_TYPE_SPECIAL_CASES[content_type] = exceptions = [] + if path not in exceptions: + exceptions.append(path) + + +def content_type_allowed(request): + """ + Returns True if the current request allows the requested content type. + + This is part of an inelegant workaround for an issue in renderers.py that maybe we can make go away in the future. + See the 'implementation note' in ingestion/common.py for more details. + """ + if request.content_type == "application/json": + # For better or worse, we always allow this. + return True + + exceptions = CONTENT_TYPE_SPECIAL_CASES.get(request.content_type) + + if exceptions: + for prefix in exceptions: + if request.path.startswith(prefix): + return True + + return False + +# ================================================== + +_NO_DEFAULT = object() + + +def get_parameter(parameter_block, parameter_name, as_type=None, default=_NO_DEFAULT, update=False): + """ + Returns the value of a given parameter from a dictionary of parameter values. + + If the parameter is not in the dictionary, the default will be returned if one is given. + If the parameter is not present but there is no default, an error of type MissingParameter will be raised. + + Args: + parameter_block (dict): a dictionary whose keys are parameter names and whose values are parameter values + parameter_name (str): the name of a parameter + as_type: if supplied, a type coercion to perform on the result + default (object): a default value to be used if the parameter_name is not present. 
+ update (bool): if as_type is applied, whether to update the parameter_block + """ + + if isinstance(parameter_block, dict): + if parameter_name in parameter_block: + parameter_value = parameter_block[parameter_name] + result = parameter_value + if as_type: + if isinstance(as_type, type) and isinstance(result, as_type): + return result + elif as_type is bool: + lower_value = str(result).lower() + if lower_value == "true": + result = True + elif lower_value in ("false", "none", "null", ""): + result = False + else: + raise BadParameter(parameter_name=parameter_name, parameter_value=parameter_value, + extra_detail=("Expected a string representing a boolean, such as" + " 'true' for True, or 'false' or the empty string for False.")) + else: + result = as_type(result) + elif default is _NO_DEFAULT: + raise MissingParameter(parameter_name=parameter_name) + else: + result = default + + if update: + parameter_block[parameter_name] = result + + return result + + else: + raise TypeError("Expected parameter_block to be a dict: %s", parameter_block) diff --git a/src/encoded/ingestion/exceptions.py b/src/encoded/ingestion/exceptions.py new file mode 100644 index 0000000000..ed5972a976 --- /dev/null +++ b/src/encoded/ingestion/exceptions.py @@ -0,0 +1,41 @@ +""" +Exception definitions for ingestion +""" + +from pyramid.httpexceptions import HTTPBadRequest, HTTPServerError + + +class SubmissionFailure(HTTPServerError): + pass + + +class UndefinedIngestionProcessorType(Exception): + + def __init__(self, processor_type): + self.ingestion_type_name = processor_type + super().__init__("No ingestion processor type %r is defined." % processor_type) + + +class MissingParameter(HTTPBadRequest): + + def __init__(self, parameter_name): + self.parameter_name = parameter_name + super().__init__(detail="Missing parameter: %s" % parameter_name) + + +class BadParameter(HTTPBadRequest): + + def __init__(self, parameter_name, parameter_value, extra_detail=None): + self.parameter_name = parameter_name + self.parameter_value = parameter_value + self.extra_detail = extra_detail + suffix = " " + extra_detail if extra_detail else "" + super().__init__(detail="The value of the %s parameter, %r, is invalid.%s" + % (parameter_name, parameter_value, suffix)) + + +class UnspecifiedFormParameter(HTTPBadRequest): + + def __init__(self, parameter_name): + self.parameter_name = parameter_name + super().__init__(detail="A form parameter was not filled out: %s" % parameter_name) diff --git a/src/encoded/ingestion/processors.py b/src/encoded/ingestion/processors.py new file mode 100644 index 0000000000..01d7f6431e --- /dev/null +++ b/src/encoded/ingestion/processors.py @@ -0,0 +1,113 @@ +import boto3 +import json +import traceback + +from ..ingestion.common import get_parameter +from ..util import debuglog, s3_output_stream, create_empty_s3_file +from ..submit import submit_metadata_bundle +from .exceptions import UndefinedIngestionProcessorType +from ..types.ingestion import SubmissionFolio + + +INGESTION_UPLOADERS = {} + + +def ingestion_processor(processor_type): + """ + @ingestion_uploader() is a decorator that declares the upload handler for an ingestion type. 
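For orientation, a minimal sketch (not part of the diff) of how the get_parameter helper defined above behaves, assuming the encoded package is importable; the parameter values are made up:

from encoded.ingestion.common import get_parameter

parameters = {'project': '/projects/hms-dbmi-test/', 'validate_only': 'true'}

project = get_parameter(parameters, 'project')          # plain lookup, returns the stored string
validate_only = get_parameter(parameters, 'validate_only',
                              as_type=bool, default=False, update=True)
# The string 'true' is coerced to True, and because update=True the coerced
# value is written back into the parameters dict.
institution = get_parameter(parameters, 'institution', default=None)
# A missing key with a default returns the default; with no default it would
# raise MissingParameter (an HTTPBadRequest subclass from ingestion/exceptions.py).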
+ """ + + def ingestion_type_decorator(fn): + INGESTION_UPLOADERS[processor_type] = fn + return fn + + return ingestion_type_decorator + + +def get_ingestion_processor(processor_type): + handler = INGESTION_UPLOADERS.get(processor_type, None) + if not handler: + raise UndefinedIngestionProcessorType(processor_type) + return handler + + +def _show_report_lines(lines, fp, default="Nothing to report."): + for line in lines or ([default] if default else []): + print(line, file=fp) + + +@ingestion_processor('data_bundle') +def handle_data_bundle(submission: SubmissionFolio): + + # We originally called it 'data_bundle' and we retained that as OK in the schema + # to not upset anyone testing with the old name, but this is not the name to use + # any more, so reject new submissions of this kind. -kmp 27-Aug-2020 + + with submission.processing_context(submission): + + raise RuntimeError("handle_data_bundle was called (for ingestion_type=%s). This is always an error." + " The ingestion_type 'data_bundle' was renamed to 'metadata_bundle'" + " prior to the initial release. Your submission program probably needs to be updated." + % submission.ingestion_type) + + +@ingestion_processor('metadata_bundle') +def handle_metadata_bundle(submission: SubmissionFolio): + + with submission.processing_context(submission) as resolution: + + s3_client = submission.s3_client + submission_id = submission.submission_id + + institution = get_parameter(submission.parameters, 'institution') + project = get_parameter(submission.parameters, 'project') + validate_only = get_parameter(submission.parameters, 'validate_only', as_type=bool, default=False) + + # if isinstance(institution, str): + # institution = submission.vapp.get(institution).json + # if isinstance(project, str): + # project = submission.vapp.get(project).json + + bundle_result = submit_metadata_bundle(s3_client=s3_client, + bucket=submission.bucket, + key=submission.object_name, + project=project, + institution=institution, + vapp=submission.vapp, + validate_only=validate_only) + + debuglog(submission_id, "bundle_result:", json.dumps(bundle_result, indent=2)) + + resolution["validation_report_key"] = validation_report_key = "%s/validation-report.txt" % submission_id + resolution["submission_key"] = submission_key = "%s/submission.json" % submission_id + resolution["submission_response_key"] = submission_response_key = "%s/submission-response.txt" % submission_id + resolution["upload_info_key"] = upload_info_key = "%s/upload_info.txt" % submission_id + + def note_additional_datum(key, bundle_key=None): + submission.other_details['additional_data'] = additional_data = ( + submission.other_details.get('additional_data', {}) + ) + additional_data[key] = bundle_result[bundle_key or key] + + with s3_output_stream(s3_client, bucket=submission.bucket, key=validation_report_key) as fp: + _show_report_lines(bundle_result['validation_output'], fp) + note_additional_datum('validation_output') + + # Next several files are created only if relevant. 
+ + if bundle_result['result']: + with s3_output_stream(s3_client, bucket=submission.bucket, key=submission_key) as fp: + print(json.dumps(bundle_result['result'], indent=2), file=fp) + submission.other_details['result'] = bundle_result['result'] + + if bundle_result['post_output']: + with s3_output_stream(s3_client, bucket=submission.bucket, key=submission_response_key) as fp: + _show_report_lines(bundle_result['post_output'], fp) + note_additional_datum('post_output') + + if bundle_result['upload_info']: + with s3_output_stream(s3_client, bucket=submission.bucket, key=upload_info_key) as fp: + print(json.dumps(bundle_result['upload_info'], indent=2), file=fp) + note_additional_datum('upload_info') + + submission.outcome = "success" if bundle_result['success'] else "failure" diff --git a/src/encoded/ingestion_listener.py b/src/encoded/ingestion_listener.py index a9764918e5..905231cb6b 100644 --- a/src/encoded/ingestion_listener.py +++ b/src/encoded/ingestion_listener.py @@ -1,29 +1,41 @@ -import os -import boto3 -import time -import socket import argparse -import structlog +import atexit +import boto3 +import botocore.exceptions +import cgi import datetime +import elasticsearch +import io import json -import atexit -import threading -import signal +import os import psycopg2 +import requests # XXX: C4-211 should not be needed but is // KMP needs this, too, until subrequest posts work +import signal +import socket +import structlog +import threading +import time import webtest -import elasticsearch -import requests # XXX: C4-211 should not be needed but is -from vcf import Reader + +from dcicutils.env_utils import is_stg_or_prd_env +from dcicutils.misc_utils import VirtualApp, ignored, check_true from pyramid import paster -from dcicutils.misc_utils import VirtualApp -from pyramid.view import view_config from pyramid.httpexceptions import HTTPNotFound, HTTPMovedPermanently from pyramid.request import Request +from pyramid.response import Response +from pyramid.view import view_config from snovault.util import debug_log -from .util import resolve_file_path, gunzip_content +from vcf import Reader from .commands.ingest_vcf import VCFParser -from .types.variant import build_variant_display_title, ANNOTATION_ID_SEP +from .ingestion.common import register_path_content_type, metadata_bundles_bucket, get_parameter +from .ingestion.exceptions import UnspecifiedFormParameter, SubmissionFailure +from .ingestion.processors import get_ingestion_processor from .inheritance_mode import InheritanceMode +from .types.ingestion import SubmissionFolio +from .types.variant import build_variant_display_title, ANNOTATION_ID_SEP +from .util import ( + resolve_file_path, gunzip_content, debuglog, get_trusted_email, beanstalk_env_from_request, full_class_name, +) log = structlog.getLogger(__name__) @@ -39,14 +51,144 @@ def includeme(config): config.add_route('queue_ingestion', '/queue_ingestion') config.add_route('ingestion_status', '/ingestion_status') + config.add_route('prompt_for_ingestion', '/prompt_for_ingestion') + config.add_route('submit_for_ingestion', '/submit_for_ingestion') config.registry[INGESTION_QUEUE] = IngestionQueueManager(config.registry) config.scan(__name__) +# This endpoint is intended only for debugging. Use the command line tool. 
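For illustration (not part of the diff), how the processor registry in ingestion/processors.py above is meant to be extended and looked up via the ingestion_processor decorator; the 'gene_list' ingestion type and its handler are hypothetical:

from encoded.ingestion.processors import ingestion_processor, get_ingestion_processor
from encoded.types.ingestion import SubmissionFolio


@ingestion_processor('gene_list')                  # hypothetical ingestion type
def handle_gene_list(submission: SubmissionFolio):
    with submission.processing_context(submission):
        # Parse the submitted file from submission.bucket / submission.object_name,
        # post items via submission.vapp, etc.
        submission.outcome = "success"


handler = get_ingestion_processor('gene_list')     # returns handle_gene_list
# An unknown type raises UndefinedIngestionProcessorType.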
+@view_config(route_name='prompt_for_ingestion', request_method='GET') +@debug_log +def prompt_for_ingestion(context, request): + ignored(context, request) + return Response(PROMPT_FOR_INGESTION) + + +register_path_content_type(path='/submit_for_ingestion', content_type='multipart/form-data') + + +@view_config(route_name='submit_for_ingestion', request_method='POST', + # Apparently adding this 'accept' causes discrimination on incoming requests not to find this method. + # We do want this type, and instead we check the request to make sure we got it, but we omit it here + # for practical reasons. -kmp 10-Sep-2020 + # accept='multipart/form-data', + permission='add') +@debug_log +def submit_for_ingestion(context, request): + ignored(context) + + check_true(request.content_type == 'multipart/form-data', # even though we can't declare we accept this + "Expected request to have content_type 'multipart/form-data'.", error_class=RuntimeError) + + bs_env = beanstalk_env_from_request(request) + bundles_bucket = metadata_bundles_bucket(request.registry) + ingestion_type = request.POST['ingestion_type'] + datafile = request.POST['datafile'] + if not isinstance(datafile, cgi.FieldStorage): + # e.g., specifically it might be b'' when no file is selected, + # but IMPORTANTLY, cgi.FieldStorage has no predefined boolean value, + # so we can't just ask to check 'not datafile'. Sigh. -kmp 5-Aug-2020 + raise UnspecifiedFormParameter('datafile') + filename = datafile.filename + override_name = request.POST.get('override_name', None) + parameters = dict(request.POST) + parameters['datafile'] = filename + institution = get_parameter(parameters, 'institution') + project = get_parameter(parameters, 'project') + # Other parameters, like validate_only, will ride in on parameters via the manifest on s3 + + submission_id = SubmissionFolio.create_item(request, + ingestion_type=ingestion_type, + institution=institution, + project=project) + + # ``input_file`` contains the actual file data which needs to be + # stored somewhere. + + input_file_stream = request.POST['datafile'].file + input_file_stream.seek(0) + + # NOTE: Some reference information about uploading files to s3 is here: + # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html + + # submission.set_item_detail(object_name=manifest['object_name'], parameters=manifest['parameters'], + # institution=institution, project=project) + + # submission_id = str(uuid.uuid4()) + _, ext = os.path.splitext(filename) + object_name = "{id}/datafile{ext}".format(id=submission_id, ext=ext) + manifest_name = "{id}/manifest.json".format(id=submission_id) + + s3_client = boto3.client('s3') + + upload_time = datetime.datetime.utcnow().isoformat() + success = True + message = "Uploaded successfully." + + try: + s3_client.upload_fileobj(input_file_stream, Bucket=bundles_bucket, Key=object_name) + + except botocore.exceptions.ClientError as e: + + log.error(e) + + success = False + message = "{error_type}: {error_message}".format(error_type=full_class_name(e), error_message=str(e)) + + # This manifest will be stored in the manifest.json file on on s3 AND will be returned from this endpoint call. 
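A sketch (not part of the diff) of how a client might call this endpoint once deployed; the host, credentials, and spreadsheet path are placeholders, and requests encodes data plus files as multipart/form-data, which is what the check above requires:

import requests

with open("metadata_bundle.xlsx", "rb") as f:                      # hypothetical spreadsheet
    response = requests.post(
        "https://cgap.example.org/submit_for_ingestion",           # hypothetical host
        data={
            "ingestion_type": "metadata_bundle",
            "institution": "/institutions/hms-dbmi/",
            "project": "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/",
            "validate_only": "false",
        },
        files={"datafile": f},
        auth=("MY_KEY_ID", "MY_KEY_SECRET"),   # placeholder; use whatever credentials grant the 'add' permission
    )
manifest = response.json()   # the manifest_content assembled just below
print(manifest["submission_id"], manifest["success"], manifest["message"])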
+ manifest_content = { + "filename": filename, + "object_name": object_name, + "submission_id": submission_id, + "submission_uri": SubmissionFolio.make_submission_uri(submission_id), + "beanstalk_env_is_prd": is_stg_or_prd_env(bs_env), + "beanstalk_env": bs_env, + "bucket": bundles_bucket, + "authenticated_userid": request.authenticated_userid, + "email": get_trusted_email(request, context="Submission", raise_errors=False), + "success": success, + "message": message, + "upload_time": upload_time, + "parameters": parameters, + } + + manifest_content_formatted = json.dumps(manifest_content, indent=2) + + if success: + + try: + with io.BytesIO(manifest_content_formatted.encode('utf-8')) as fp: + s3_client.upload_fileobj(fp, Bucket=bundles_bucket, Key=manifest_name) + + except botocore.exceptions.ClientError as e: + + log.error(e) + + message = ("{error_type} (while uploading metadata): {error_message}" + .format(error_type=full_class_name(e), error_message=str(e))) + + raise SubmissionFailure(message) + + queue_manager = get_queue_manager(request, override_name=override_name) + _, failed = queue_manager.add_uuids([submission_id], ingestion_type=ingestion_type) + + if failed: + # If there's a failure, failed will be a list of one problem description since we only submitted one thing. + raise SubmissionFailure(failed[0]) + + if not success: + + raise SubmissionFailure(message) + + return manifest_content + + @view_config(route_name='ingestion_status', request_method='GET', permission='index') @debug_log def ingestion_status(context, request): """ Status route, essentially identical to indexing_status. """ + ignored(context) queue_manager = request.registry[INGESTION_QUEUE] n_waiting, n_inflight = queue_manager.get_counts() return { @@ -105,8 +247,13 @@ def queue_ingestion(context, request): """ Queues uuids as part of the request body for ingestion. Can batch as many as desired in a single request. 
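For reference (not part of the diff), the JSON body this endpoint reads; the uuid is hypothetical:

queue_ingestion_body = {
    "uuids": ["00000000-aaaa-4bbb-8ccc-dddddddddddd"],   # hypothetical item uuids (VCF files, by default)
    "override_name": None,                               # optional: target a non-default SQS queue
}
# The response dict assembled in enqueue_uuids_for_request below carries
# 'notification', 'number_queued', and 'detail'.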
""" + ignored(context) uuids = request.json.get('uuids', []) override_name = request.json.get('override_name', None) + return enqueue_uuids_for_request(request, uuids, override_name=override_name) + + +def enqueue_uuids_for_request(request, uuids, *, ingestion_type='vcf', override_name=None): response = { 'notification': 'Failure', 'number_queued': 0, @@ -114,21 +261,26 @@ def queue_ingestion(context, request): } if uuids is []: return response - queue_manager = (request.registry[INGESTION_QUEUE] - if not override_name - else IngestionQueueManager(request.registry, override_name=override_name)) + queue_manager = get_queue_manager(request, override_name=override_name) _, failed = queue_manager.add_uuids(uuids) if not failed: response['notification'] = 'Success' response['number_queued'] = len(uuids) response['detail'] = 'Successfully queued the following uuids: %s' % uuids - patch_vcf_file_status(request, uuids) # extra state management - may not be accurate, hard to get right + if ingestion_type == 'vcf': + patch_vcf_file_status(request, uuids) # extra state management - may not be accurate, hard to get right else: response['number_queued'] = len(uuids) - len(failed) response['detail'] = 'Some uuids failed: %s' % failed return response +def get_queue_manager(request, *, override_name): + return (request.registry[INGESTION_QUEUE] + if not override_name + else IngestionQueueManager(request.registry, override_name=override_name)) + + class IngestionQueueManager: """ Similar to QueueManager in snovault in that in manages SQS queues, but that code is not generic @@ -151,7 +303,7 @@ def __init__(self, registry, override_name=None): 'region_name': 'us-east-1' } self.client = boto3.client('sqs', **kwargs) - self.queue_name = self.env_name + self.BUCKET_EXTENSION if not override_name else override_name + self.queue_name = override_name or (self.env_name + self.BUCKET_EXTENSION) self.queue_attrs = { self.queue_name: { 'DelaySeconds': '1', # messages initially invisible for 1 sec @@ -170,7 +322,7 @@ def _initialize(self): Attributes=self.queue_attrs[self.queue_name] ) queue_url = response['QueueUrl'] - except self.client.exceptions.QueueNameExists as e: + except self.client.exceptions.QueueNameExists: queue_url = self._get_queue_url(self.queue_name) except Exception as e: log.error('Error while attempting to create queue: %s' % e) @@ -261,19 +413,22 @@ def delete_messages(self, messages): failed.extend(response.get('Failed', [])) return failed - def add_uuids(self, uuids): - """ Takes a list of string uuids (presumed to be VCF files) and adds them to - the ingestion queue. + def add_uuids(self, uuids, ingestion_type='vcf'): + """ Takes a list of string uuids and adds them to the ingestion queue. + If ingestion_type is not specified, it defaults to 'vcf'. :precondition: uuids are all of type FileProcessed :param uuids: uuids to be added to the queue. 
+ :param ingestion_type: the ingestion type of the uuids (default 'vcf' for legacy reasons) :returns: 2-tuple: uuids queued, failed messages (if any) """ curr_time = datetime.datetime.utcnow().isoformat() msgs = [] for uuid in uuids: current_msg = { - 'uuid': uuid, 'timestamp': curr_time + 'ingestion_type': ingestion_type, + 'uuid': uuid, + 'timestamp': curr_time } msgs.append(current_msg) failed = self._send_messages(msgs) @@ -332,10 +487,8 @@ def __init__(self, vapp, _queue_manager=None, _update_status=None): # Get queue_manager registry = None - if isinstance(self.vapp, webtest.TestApp): # if in testing + if isinstance(self.vapp, (webtest.TestApp, VirtualApp)): # TestApp in testing or VirtualApp in production registry = self.vapp.app.registry - elif isinstance(self.vapp, VirtualApp): # if in production - registry = self.vapp.wrapped_app.app.registry elif _queue_manager is None: # if we got here, we cannot succeed in starting raise Exception('Bad arguments given to IngestionListener: %s, %s, %s' % (self.vapp, _queue_manager, _update_status)) @@ -374,8 +527,10 @@ def delete_messages(self, messages): """ failed = self.queue_manager.delete_messages(messages) while True: + debuglog("Trying to delete messages") tries = 3 if failed: + debuglog("Failed to delete messages") if tries > 0: failed = self.queue_manager.delete_messages(failed) # try again tries -= 1 @@ -383,6 +538,7 @@ def delete_messages(self, messages): log.error('Failed to delete messages from SQS: %s' % failed) break else: + debuglog("Deleted messages") break @staticmethod @@ -517,15 +673,52 @@ def run(self): delete processed messages """ log.info('Ingestion listener successfully online.') + + debuglog("Ingestion listener started.") + + messages = [] # This'll get a better value below in each loop iteration. This is just a declaration of intent. + + def discard(msg): + self.delete_messages([msg]) + # Assuming we didn't get an error trying to remove it, + # it should also get removed from our to-do list. + messages.remove(msg) + while self.should_remain_online(): + + debuglog("About to get messages.") + messages = self.get_messages() # wait here + debuglog("Got", len(messages), "messages.") + # ingest each VCF file for message in messages: + + debuglog("Message:", message) + body = json.loads(message['Body']) uuid = body['uuid'] + ingestion_type = body.get('ingestion_type', 'vcf') # Older protocol doesn't yet know to expect this log.info('Ingesting uuid %s' % uuid) + if ingestion_type != 'vcf': + # Let's minimally disrupt things for now. We can refactor this later + # to make all the parts work the same -kmp + submission = SubmissionFolio(vapp=self.vapp, ingestion_type=ingestion_type, submission_id=uuid) + handler = get_ingestion_processor(ingestion_type) + try: + debuglog("HANDLING:", uuid) + handler(submission) + debuglog("HANDLED:", uuid) + except Exception as e: + log.error(e) + # If we suceeded, we don't need to do it again, and if we failed we don't need to fail again. + discard(message) + continue + + debuglog("Did NOT process", uuid, "as", ingestion_type) + # locate file meta data try: file_meta = self.vapp.get('/' + uuid).follow().json @@ -567,13 +760,21 @@ def run(self): log.error(msg) self.update_status(msg=msg) + discard(message) + + # This is just fallback cleanup in case messages weren't cleaned up within the loop. + # In normal operation, they will be. self.delete_messages(messages) def run(vapp=None, _queue_manager=None, _update_status=None): """ Entry-point for the ingestion listener for waitress. 
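For orientation (not part of the diff), the shape of the SQS message body that add_uuids produces and the listener loop above consumes; the uuid is hypothetical:

import datetime
import json

message_body = json.dumps({
    'ingestion_type': 'metadata_bundle',              # older messages omit this; the loop defaults to 'vcf'
    'uuid': '00000000-eeee-4fff-8000-111111111111',   # hypothetical uuid
    'timestamp': datetime.datetime.utcnow().isoformat(),
})
# IngestionListener.run() does json.loads(message['Body']), reads 'uuid',
# and dispatches on body.get('ingestion_type', 'vcf').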
""" ingestion_listener = IngestionListener(vapp, _queue_manager=_queue_manager, _update_status=_update_status) - ingestion_listener.run() + try: + ingestion_listener.run() + except Exception as e: + debuglog(str(e)) + raise class ErrorHandlingThread(threading.Thread): @@ -582,10 +783,10 @@ class ErrorHandlingThread(threading.Thread): def run(self): # interval = self._kwargs.get('interval', DEFAULT_INTERVAL) interval = 60 # DB polling can and should be slower - update_status = self._kwargs['_update_status'] + update_status = self._kwargs['_update_status'] # noQA - uses private instance variables of parent class while True: try: - self._target(*self._args, **self._kwargs) + self._target(*self._args, **self._kwargs) # noQA - uses private instance variables of parent class except (psycopg2.OperationalError, elasticsearch.exceptions.ConnectionError) as e: # Handle database restart log.warning('Database not there, maybe starting up: %r', e) @@ -666,6 +867,7 @@ def status_app(environ, start_response): """ Allows you to get the status of the ingestion "manager". This will be much more useful once multi-processing is thrown at ingestion. """ + ignored(environ) status = '200 OK' response_headers = [('Content-type', 'application/json')] start_response(status, response_headers) @@ -677,7 +879,7 @@ def status_app(environ, start_response): # Command Application (for waitress) def main(): """ Entry point for the local deployment. """ - parser = argparse.ArgumentParser( + parser = argparse.ArgumentParser( # noqa - PyCharm wrongly thinks the formatter_class is specified wrong here. description='Listen for VCF File uuids to ingest', epilog=EPILOG, formatter_class=argparse.RawDescriptionHelpFormatter @@ -698,5 +900,87 @@ def main(): return run(vapp) +PROMPT_TEMPLATE = """ + + + + Submit for Ingestion + + + + +

+    Submit for Ingestion
+    <!-- Remaining form markup elided: a table of submission input fields, a "Special Options:" section, and a Submit button. -->
+ + +""" + +PROMPT_FOR_INGESTION = PROMPT_TEMPLATE.replace("", "/submit_for_ingestion").replace("", "#eeddee") +PROMPT_FOR_SUBREQUEST = PROMPT_TEMPLATE.replace("", "/submit_subrequest").replace("", "#ddeedd") + if __name__ == '__main__': main() diff --git a/src/encoded/renderers.py b/src/encoded/renderers.py index b8a32f94c6..3d07fbadd8 100644 --- a/src/encoded/renderers.py +++ b/src/encoded/renderers.py @@ -11,21 +11,18 @@ HTTPMovedPermanently, HTTPPreconditionFailed, HTTPUnauthorized, - HTTPForbidden, HTTPUnsupportedMediaType, HTTPNotAcceptable, HTTPServerError ) from pyramid.response import Response -from pyramid.security import forget from pyramid.settings import asbool from pyramid.threadlocal import manager from pyramid.traversal import split_path_info, _join_path_tuple -from snovault.validation import CSRFTokenError -from subprocess_middleware.tween import SubprocessTween from subprocess_middleware.worker import TransformWorker from urllib.parse import urlencode from webob.cookies import Cookie +from .ingestion.common import content_type_allowed log = logging.getLogger(__name__) @@ -107,15 +104,13 @@ def validate_request_tween(request): # Includes page text/html requests. return handler(request) - elif request.content_type != 'application/json': - if request.content_type == 'application/x-www-form-urlencoded' and request.path[0:10] == '/metadata/': - # Special case to allow us to POST to metadata TSV requests via form submission - return handler(request) + elif content_type_allowed(request): + return handler(request) + + else: detail = "Request content type %s is not 'application/json'" % request.content_type raise HTTPUnsupportedMediaType(detail) - return handler(request) - return validate_request_tween diff --git a/src/encoded/root.py b/src/encoded/root.py index 44e397eefa..4f3673ba1b 100644 --- a/src/encoded/root.py +++ b/src/encoded/root.py @@ -106,6 +106,7 @@ def health_page_view(request): "indexer": settings.get("indexer"), "index_server": settings.get("index_server"), "load_data": settings.get('load_test_data'), + "metadata_bundles_bucket": settings.get('metadata_bundles_bucket'), "namespace": settings.get('indexer.namespace'), "processed_file_bucket": settings.get('file_wfout_bucket'), 'project_version': settings.get('encoded_version'), diff --git a/src/encoded/schemas/ingestion_submission.json b/src/encoded/schemas/ingestion_submission.json new file mode 100644 index 0000000000..c485db653d --- /dev/null +++ b/src/encoded/schemas/ingestion_submission.json @@ -0,0 +1,129 @@ +{ + "title": "Ingestion Submission", + "description": "Schema for metadata related to ingestion requests submitted to CGAP.", + "id": "/profiles/ingestion_submission.json", + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "object", + "required": [ + "ingestion_type", + "project", + "institution" + ], + "additionalProperties": false, + "identifyingProperties": ["uuid", "aliases"], + "mixinProperties": [ + { "$ref": "mixins.json#/schema_version" }, + { "$ref": "mixins.json#/aliases" }, + { "$ref": "mixins.json#/uuid" }, + { "$ref": "mixins.json#/documents" }, + { "$ref": "mixins.json#/attribution" }, + { "$ref": "mixins.json#/status" }, + { "$ref": "mixins.json#/submitted" }, + { "$ref": "mixins.json#/modified" }, + { "$ref": "mixins.json#/static_embeds" } + ], + "mixinFacets" : [ + { "$ref": "mixins.json#/facets_common" } + ], + "properties": { + "schema_version": { + "default": "1" + }, + "object_bucket": { + "title": "Object Bucket", + "description": "The name of the S3 bucket in which 
the 'object_name' resides.", + "type": "string" + }, + "object_name": { + "title": "Object Name", + "description": "The name of the S3 object corresponding to the submitted document.", + "type": "string" + }, + "ingestion_type": { + "title": "Ingestion Type", + "description": "The type of processing requested for this submission.", + "type": "string", + "enum": [ + "data_bundle", + "metadata_bundle", + "vcf" + ] + }, + "submission_id": { + "title": "Submission ID", + "description": "The name of a folder in the S3 bucket that contains all artifacts related to this submission.", + "type": "string" + }, + "parameters": { + "title": "Parameters", + "description": "A record of explicitly offered form parameters in the submission request.", + "type": "object", + "additionalProperties": true, + "properties": {} + }, + "processing_status": { + "title": "Processing Status", + "description": "A structured description of what has happened so far as the submission is processed.", + "type": "object", + "additionalProperties": false, + "properties": { + "state": { + "title": "State", + "description": "A state machine description of how processing is progressing (submitted, processed, or done).", + "type": "string", + "enum": [ + "submitted", + "processing", + "done" + ], + "default": "submitted" + }, + "outcome": { + "title": "Outcome", + "description": "A token describing the nature of the final outcome, if any. Options are unknown, success, failure, or error.", + "type": "string", + "enum": [ + "unknown", + "success", + "failure", + "error" + ], + "default": "unknown" + }, + "progress": { + "title": "Progress", + "description": "An adjectival word or phrase assessing progress, such as 'started', 'awaiting prerequisites', '88% done', or 'unavailable'.", + "type": "string", + "default": "unavailable" + } + } + }, + "result": { + "title": "Result", + "description": "An object representing a result if processing ran to completion, whether the outcome was success or failure.", + "type": "object", + "additionalProperties": true, + "properties": {}, + "default": {} + }, + "errors": { + "title": "Errors", + "description": "A list of error messages if processing was aborted before results were obtained.", + "type": "array", + "items": { + "title": "Error Message", + "description": "One of possibly several reasons that processing was not completed.", + "type": "string" + }, + "default": [] + }, + "additional_data": { + "title": "Additional Data", + "description": "Additional structured information resulting from processing, the nature of which may vary by ingestion_type and other factors.", + "type": "object", + "additionalItems": true, + "properties": {}, + "default": {} + } + } +} diff --git a/src/encoded/schemas/sample.json b/src/encoded/schemas/sample.json index 4bd1023bf0..76ec69be3b 100644 --- a/src/encoded/schemas/sample.json +++ b/src/encoded/schemas/sample.json @@ -54,6 +54,13 @@ "lookup": 30, "description": "Clinical or research consent/protocol" }, + "research_protocol_name": { + "title": "Research Protocol Name", + "type": "string", + "label": "requisition", + "lookup": 31, + "description": "Consent Protocol Name for Research Requisition" + }, "date_requisition_received": { "title": "Date Requisition Received", "type": "string", @@ -131,6 +138,12 @@ "lookup": 113, "description": "If requisition was rejected, the corrective action noted/taken" }, + "action_taken_by": { + "title": "Action Taken By", + "type": "string", + "lookup": 114, + "description": "Name or ID of person who took the corrective 
action" + }, "date_sent": { "title": "Date Correction Sent", "type": "string", @@ -203,6 +216,13 @@ "type": "string", "lookup": 140 }, + "specimen_storage_location": { + "title": "Specimen Storage Location", + "description": "Location of specimen storage", + "label": "specimen", + "type": "string", + "lookup": 144 + }, "specimen_accession": { "title": "Specimen Accession", "description": "Accession of specimen from sequencing lab", @@ -247,6 +267,13 @@ "lookup": 160, "description": "ID of person who sent the specimen" }, + "sequencing_lab": { + "title": "Sequencing Lab", + "description": "Location performing sequencing on sample", + "type": "string", + "label": "test", + "lookup": 189 + }, "date_received": { "title": "Date Received in Sequencing Lab", "type": "string", diff --git a/src/encoded/static/components/item-pages/HealthView.js b/src/encoded/static/components/item-pages/HealthView.js index 8cedf892f5..f73603563e 100644 --- a/src/encoded/static/components/item-pages/HealthView.js +++ b/src/encoded/static/components/item-pages/HealthView.js @@ -36,8 +36,8 @@ export default class HealthView extends React.PureComponent { 'href' : PropTypes.string }; - static defaultProps = { - "excludedKeys" : [ ...ItemDetailList.Detail.defaultProps.excludedKeys, 'content' ], + static defaultProps = { + "excludedKeys" : [ ...ItemDetailList.Detail.defaultProps.excludedKeys, 'content' ], "keyTitleDescriptionMapConfig" : { 'aggregations' : { title : 'Aggregations', @@ -74,6 +74,10 @@ export default class HealthView extends React.PureComponent { title : "Foursight", description : "URI of corresponding Foursight page." }, + 'metadata_bundles_bucket' : { + title: "MetaData Bundles Bucket", + description : "Name of S3 bucket used for metadata bundles." + }, 'indexer' : { title : "Indexer", description : "Whether this server processes indexing requests at all." @@ -119,15 +123,15 @@ export default class HealthView extends React.PureComponent { description : "Software version of dcicutils being used." }, }, - "keyTitleDescriptionMapCounts" : { - 'db_es_total' : { - title : "DB and ES Counts", - description : "Total counts of items in database and elasticsearch." - }, - 'db_es_compare' : { - title : "DB and ES Counts by Type", - description : "Counts of items in database and elasticsearch for each doc_type index." - } + "keyTitleDescriptionMapCounts" : { + 'db_es_total' : { + title : "DB and ES Counts", + description : "Total counts of items in database and elasticsearch." + }, + 'db_es_compare' : { + title : "DB and ES Counts by Type", + description : "Counts of items in database and elasticsearch for each doc_type index." 
+ } } }; diff --git a/src/encoded/submit.py b/src/encoded/submit.py new file mode 100644 index 0000000000..70487321f1 --- /dev/null +++ b/src/encoded/submit.py @@ -0,0 +1,880 @@ +from copy import deepcopy +import csv +import datetime +import json +import xlrd + +from dcicutils.qa_utils import ignored +from dcicutils.misc_utils import VirtualAppError +from webtest import AppError +from .util import s3_local_file, debuglog + + +GENERIC_FIELD_MAPPING = { # for spreadsheet column names that are different from schema property names + 'individual': {}, + 'family': {}, + 'sample': { + 'date collected': 'specimen_collection_date', + 'location stored': 'specimen_storage_location', + 'specimen id': 'specimen_accession', + 'transport method': 'transported_by', + 'sequencing ref lab': 'sequencing_lab', + "date rec'd at ref lab": 'date_received', + 'specimen accepted by ref lab': 'specimen_accepted', + 'sample id by ref lab': 'sequence_id', + 'req type': 'requisition_type', + "date req rec'd": 'date_requisition_received', + 'physician/provider': 'ordering_physician', + 'test requested': 'workup_type' + }, + 'requisition': { + 'req accepted y/n': 'accepted_rejected', + 'reason rejected': 'rejection_reason', + 'corrective action taken': 'corrective_action', + 'corrective action taken by': 'action_taken_by', + 'correction notes': 'notes' + } +} + + +ABBREVS = { + 'male': 'M', + 'female': 'F', + 'unknown': 'U', + 'yes': 'Y', + 'no': 'N', + 'p': 'proband', + 'mth': 'mother', + 'fth': 'father', + 'sf': 'sibling' +} + + +POST_ORDER = [ + 'file_fastq', 'file_processed', 'sample', 'individual', + 'family', 'sample_processing', 'report', 'case' +] + + +LINKTO_FIELDS = [ # linkTo properties that we will want to patch in second-round + 'samples', 'members', 'mother', 'father', 'proband', 'report', + 'individual', 'sample_processing', 'families', 'files' +] + + +ID_SOURCES = ['UDN'] + + +def submit_metadata_bundle(*, s3_client, bucket, key, project, institution, vapp, # <- Required keyword arguments + validate_only=False): # <-- Optional keyword arguments (with defaults) + """ + Handles processing of a submitted workbook. + + Args: + s3_client: a boto3 s3 client object + bucket: the name of the s3 bucket that contains the data to be processed + key: the name of a key within the given bucket that contains the data to be processed + project: a project identifier + institution: an institution identifier + vapp: a VirtualApp object + validate_only: a bool. If True, only do validation, not posting; otherwise (if False), do posting, too. + """ + with s3_local_file(s3_client, bucket=bucket, key=key) as filename: + project_json = vapp.get(project).json + institution_json = vapp.get(institution).json + results = { + 'success': False, + 'validation_output': [], + 'result': {}, + 'post_output': [], + 'upload_info': [] + } + if filename.endswith('.xls') or filename.endswith('.xlsx'): + rows = digest_xls(filename) + elif filename.endswith('.csv') or filename.endswith('.tsv'): + delim = ',' if filename.endswith('csv') else '\t' + rows = digest_csv(filename, delim=delim) + else: + msg = ('Metadata bundle must be a file of type .xls, .xlsx, .csv, or .tsv.' 
+ 'Please submit a file of the proper type.') + results['validation_output'].append(msg) + return results + json_data, json_success = xls_to_json(rows, project=project_json, institution=institution_json) + if not json_success: + results['validation_output'] = json_data['errors'] + return results + processing_result, validation_log_lines, validate_success = validate_all_items(vapp, json_data) + results['result'] = processing_result + results['validation_output'] = validation_log_lines + if not validate_success: + return results + results['success'] = validate_success + if validate_only: + return results + result_lines, post_success, upload_info = post_and_patch_all_items(vapp, json_data_final=processing_result) + results['post_output'] = result_lines + results['success'] = post_success + results['upload_info'] = upload_info + return results + + +def map_fields(row, metadata_dict, addl_fields, item_type): + """ + function for grabbing metadata from spreadsheet row (in dictionary form) based on + mapping column headers to schema properties. + + Args: + row - dictionary of format {column name1: value1, column name 2: value 2} + metadata_dict - the dictionary (json) to be filled with metadata parsed in this function. + Can be empty. + addl_fields - list of fields not present in GENERIC_FIELD_MAPPING. These fields will appear + in the output dictionary as keys, with spaces replaced with underscores. E.g., a field + 'individual id' will appear in the output dict as 'individual_id'. + item_type - the key in GENERIC_FIELD_MAPPING to look at for column name to schema property mappings. + + Example usage: + output = map_fields(row_dict, {}, ['individual_id', 'sex', 'age', 'birth_year'], 'individual') + + """ + for field in addl_fields: + metadata_dict[field] = use_abbrev(row.get(field.replace('_', ' '))) + for map_field in GENERIC_FIELD_MAPPING[item_type]: + if map_field in row: + metadata_dict[GENERIC_FIELD_MAPPING[item_type][map_field]] = use_abbrev(row.get(map_field)) + return metadata_dict + + +def use_abbrev(value): + if value and value.lower() in ABBREVS: + return ABBREVS[value.lower()] + else: + return value + + +def get_column_name(row, columns): + """ + For cases where there is a variation on a particular column name. + Final column in list must be the default name. + """ + for col in columns: + if row.get(col): + return col + return columns[-1] + + +def digest_xls(xls_data): + book = xlrd.open_workbook(xls_data) + sheet, = book.sheets() + return row_generator(sheet) + + +def digest_csv(input_data, delim=','): + with open(input_data) as csvfile: + rows = list(csv.reader(csvfile, delimiter=delim)) + for row in rows: + yield row + + +def xls_to_json(row, project, institution): + """ + Converts excel file (or csv/tsv table) to json for submission. + + Args: + row - generator yielding rows of spreadsheet + project - dict (json) of project metadata submitter is submitting for + institution - dict (json) of institution metadata that submitter is submitting for + + Output: + 1. items - dictionary of db items the submitter wants to submit, of the format + {itemtype1: [{alias1: {metadata}, {alias2: {metadata}], itemtype2: [...], ...} + Also has an extra key 'errors' whose value is a list of errors found during processing, + to be combined with validation errors later in submission processing. + 2. boolean indicating whether submission can move to next phase or not. False will be + returned if there are major errors in spreadsheet preventing rows from being + processed properly. 
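+
+    Illustrative example of the returned items (aliases and values invented for this sketch):
+        {'individual': {'myproject:individual-456': {'individual_id': '456', 'sex': 'M', ...}},
+         'sample': {'myproject:sample-3464467': {'workup_type': 'WGS', ...}},
+         ...,
+         'errors': ['Row 5 - missing required field(s) specimen id. This row cannot be processed.']}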
+ + Basically, this function parses the column headers of the spreadsheet, turns each row into + a dictionary of {column header: cell value} pairs, then gathers the metadata it can find for each + db item type in each row. Minor spreadsheet errors are added to the output dictionary. + """ + keys = {} # In case there are no rows, so key doesn't get assigned below + header = False + counter = 0 + # debuglog("top_header:", top_header) # Temporary instrumentation for debugging to go away soon. -kmp 25-Jul-2020 + while True: + try: + keys = next(row) + keys = [key.lower().strip().rstrip('*: ') for key in keys] + counter += 1 + if 'individual id' in keys: + header = True + break + except StopIteration: + break + if not header: + msg = 'Column headers not detected in spreadsheet! "Individual ID*" column must be present in header.' + return {'errors': [msg]}, False + # debuglog("keys:", keys) # Temporary instrumentation for debugging to go away soon. -kmp 25-Jul-2020 + rows = [] + required = ['individual id', 'relation to proband', 'report required', 'analysis id', 'specimen id'] + missing = [col for col in required if col not in keys] + if missing: + msg = 'Column(s) "{}" not found in spreadsheet! Spreadsheet cannot be processed.'.format('", "'.join(missing)) + return {'errors': [msg]}, False + + for values in row: + r = [val for val in values] + if 'y/n' in ''.join(r).lower() or ''.join(r) == '': # skip comments/description/blank row if present + counter += 1 + continue + row_dict = {keys[i]: item for i, item in enumerate(r)} + rows.append(row_dict) + + items = { + 'individual': {}, 'family': {}, 'sample': {}, 'sample_processing': {}, + 'file_fastq': {}, 'file_processed': {}, 'case': {}, 'report': {}, + 'reports': [], 'errors': [] + } + file_errors = [] + family_dict = init_families(rows) + a_types = get_analysis_types(rows) + case_names = {} + for i, row in enumerate(rows): + debuglog("row:", repr(row)) # Temporary instrumentation for debugging to go away soon. -kmp 25-Jul-2020 + row_num = i + counter + 1 + missing_required = [col for col in required if col not in row or not row[col]] + if missing_required: + items['errors'].append( + 'Row {} - missing required field(s) {}. This row cannot be processed.' + ''.format(row_num, ', '.join(missing_required)) + ) + indiv_alias = '{}:individual-{}'.format(project['name'], row['individual id']) + if not family_dict.get(row['analysis id']): + msg = ('Row {} - Proband for this analysis could not be found. 
' + 'This row cannot be processed.'.format(i)) + items['errors'].append(msg) + continue + fam_alias = '{}:{}'.format(project['name'], family_dict[row['analysis id']]) + # create items for Individual + items = extract_individual_metadata(row_num, row, items, indiv_alias, institution['name']) + # create/edit items for Family + items = extract_family_metadata(row_num, row, items, indiv_alias, fam_alias) + # create item for Sample if there is a specimen + if row.get('specimen id'): + samp_alias = '{}:sample-{}'.format(project['name'], row['specimen id']) + if row.get('run no.'): + samp_alias = samp_alias + '-' + row['run no.'] + analysis_alias = '{}:analysis-{}'.format(project['name'], row['analysis id']) + items = extract_sample_metadata(row_num, row, items, indiv_alias, samp_alias, analysis_alias, + fam_alias, project['name'], a_types, case_names) + if row.get('files'): + file_items = extract_file_metadata(row_num, row['files'].split(','), project['name']) + file_errors.extend(file_items['errors']) + items['file_fastq'].update(file_items['file_fastq']) + items['file_processed'].update(file_items['file_processed']) + items = add_relations(items) + items = create_case_item_metadata(items, project['name'], case_names, family_dict) + # removed unused fields, add project and institution + for val1 in items.values(): + if isinstance(val1, dict): + for val2 in val1.values(): + remove_keys = [k for k, v in val2.items() if not v] + for key in remove_keys: + del val2[key] + val2['project'] = project['@id'] + val2['institution'] = institution['@id'] + items['errors'].extend(file_errors) + items['errors'] = list(set(items['errors'])) + return items, True # most errors passed to next step in order to combine with validation errors + + +def init_families(rows): + """ + Initializes metadata dicts for 'family' items. Requires multiple rows so must be done separately from + row-by-row parsing. + """ + proband_rows = [row for row in rows if row.get('relation to proband').lower() == 'proband'] + fams = {row.get('analysis id'): 'family-{}'.format(row.get('individual id')) for row in proband_rows} + return fams + + +def get_analysis_types(rows): + """ + 'analysis_type' is a property of sample_processing items, denoting the workup type (WGS, WES, etc) + as well as describing the grouping (Trio, Quad, etc). This info needs to be extracted from the spreadsheet + separately from most of the metadata since it depends info extracted from more than one row. 
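+
+    Illustrative examples (rows invented for this sketch): three rows sharing analysis id '1111'
+    with relations proband/mother/father and workup type 'WGS' yield {'1111': 'WGS-Trio'};
+    a lone proband row with 'WES' yields 'WES'; any other grouping gets a '-Group' suffix;
+    mismatched or missing workup types yield None for that analysis id.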
+ """ + analysis_relations = {} + analysis_types = {} + for row in rows: + analysis_relations.setdefault(row.get('analysis id'), [[], []]) + analysis_relations[row.get('analysis id')][0].append(row.get('relation to proband', '').lower()) + workup_col = get_column_name(row, ['test requested', 'workup type']) + analysis_relations[row.get('analysis id')][1].append(row.get(workup_col, '').upper()) + for k, v in analysis_relations.items(): + workup = list(set(v[1])) + if len(workup) == 1 and '' not in workup: + if len(v[0]) == 1: + analysis_types[k] = v[1][0] + elif sorted(v[0]) == ['father', 'mother', 'proband']: + analysis_types[k] = v[1][0] + '-Trio' + else: + analysis_types[k] = v[1][0] + '-Group' + else: + analysis_types[k] = None + return analysis_types + + +def extract_individual_metadata(idx, row, items, indiv_alias, inst_name): + """ + Extracts 'individual' item metadata from each row + """ + new_items = items.copy() + info = {'aliases': [indiv_alias]} + info = map_fields(row, info, ['individual_id', 'sex', 'age', 'birth_year'], 'individual') + other_id_col = get_column_name(row, ['other id', 'other individual id']) + if row.get(other_id_col): + other_id = {'id': row[other_id_col], 'id_source': inst_name} + if row.get('other individual id type'): + other_id['id_source'] = row['other individual id source'] + else: + for id_source in ID_SOURCES: + if row[other_id_col].upper().startswith(id_source): + other_id['id_source'] = id_source + info['institutional_id'] = other_id + for col in ['age', 'birth_year']: + if info.get(col) and isinstance(info[col], str) and info[col].isnumeric(): + info[col] = int(info[col]) + if indiv_alias not in new_items['individual']: + new_items['individual'][indiv_alias] = {k: v for k, v in info.items() if v} + new_items['individual'][indiv_alias]['row'] = idx + else: + for key in info: + if key not in new_items['individual'][indiv_alias]: + new_items['individual'][indiv_alias][key] = info[key] + return new_items + + +def extract_family_metadata(idx, row, items, indiv_alias, fam_alias): + """ + Extracts 'family' item metadata from each row + """ + new_items = items.copy() + info = { + 'aliases': [fam_alias], + 'family_id': row.get('family id'), + 'members': [indiv_alias], + 'row': idx + } + if not info['family_id']: + info['family_id'] = fam_alias[fam_alias.index(':') + 1:] + if fam_alias not in new_items['family']: + new_items['family'][fam_alias] = info + if indiv_alias not in new_items['family'][fam_alias]['members']: + new_items['family'][fam_alias]['members'].append(indiv_alias) + valid_relations = ['proband', 'mother', 'father', 'brother', 'sister', 'sibling'] + relation_found = False + for relation in valid_relations: + if row.get('relation to proband', '').lower().startswith(relation): + if relation not in new_items['family'][fam_alias]: + new_items['family'][fam_alias][relation] = indiv_alias + relation_found = True + break + if not relation_found: + msg = 'Row {} - Invalid relation "{}" for individual {} - Relation should be one of: {}'.format( + idx, row.get('relation to proband'), row.get('individual id'), ', '.join(valid_relations) + ) + new_items['errors'].append(msg) + return new_items + + +def extract_sample_metadata(idx, row, items, indiv_alias, samp_alias, analysis_alias, + fam_alias, proj_name, analysis_type_dict, case_name_dict): + """ + Extracts 'sample' item metadata from each row + """ + new_items = items.copy() + info = {'aliases': [samp_alias], 'files': []} # TODO: implement creation of file db items + fields = [ + 'workup_type', 
'specimen_type', 'dna_concentration', 'date_transported', 'indication', + 'specimen_notes', 'research_protocol_name', 'sent_by', 'physician_id' + ] + info = map_fields(row, info, fields, 'sample') + info['row'] = idx + if info.get('specimen_accepted', '').lower() == 'y': + info['specimen_accepted'] = 'Yes' + elif info.get('specimen_accepted', '').lower() == 'n': + info['specimen_accepted'] = 'No' + if row.get('second specimen id'): + other_id = {'id': row['second specimen id'], 'id_type': proj_name} # add proj info? + if row.get('second specimen id type'): + other_id['id_type'] = row['second specimen id type'] + info['other_specimen_ids'] = [other_id] + req_info = map_fields(row, {}, ['date sent', 'date completed'], 'requisition') + if req_info.get('accepted_rejected', '').lower() in ['y', 'n']: + if req_info['accepted_rejected'].lower() == 'y': + req_info['accepted_rejected'] = 'Accepted' + else: + req_info['accepted_rejected'] = "Rejected" + info['requisition_acceptance'] = {k: v for k, v in req_info.items() if v} + new_items['sample'][samp_alias] = {k: v for k, v in info.items() if v} + if indiv_alias in new_items['individual']: + new_items['individual'][indiv_alias]['samples'] = [samp_alias] + new_sp_item = { + 'aliases': [analysis_alias], + 'samples': [], + 'families': [] + } + if row.get('analysis id') in analysis_type_dict: + new_sp_item['analysis_type'] = analysis_type_dict[row.get('analysis id')] + if not analysis_type_dict[row.get('analysis id')]: + msg = ('Row {} - Samples with analysis ID {} contain mis-matched or invalid workup type values. ' + 'Sample cannot be processed.'.format(idx, row.get('analysis id'))) + new_items['errors'].append(msg) + case_col = get_column_name(row, ['unique analysis id', 'optional case id (unique in all rows)']) + if row.get(case_col): + case_name_dict['{}-{}'.format(row.get('analysis id'), row.get('specimen id'))] = row[case_col] + new_items['sample_processing'].setdefault(analysis_alias, new_sp_item) + new_items['sample_processing'][analysis_alias]['samples'].append(samp_alias) + if row.get('report required').lower().startswith('y'): + new_items['reports'].append(samp_alias) + if fam_alias not in new_items['sample_processing'][analysis_alias]['families']: + new_items['sample_processing'][analysis_alias]['families'].append(fam_alias) + return new_items + + +def extract_file_metadata(idx, filenames, proj_name): + """ + Extracts 'file' item metadata from each row + """ + valid_extensions = { + '.fastq.gz': ('fastq', 'reads'), + '.fq.gz': ('fastq', 'reads'), + '.cram': ('cram', 'alignments'), + '.vcf.gz': ('vcf_gz', 'raw VCF') + } + files = {'file_fastq': {}, 'file_processed': {}, 'errors': []} + for filename in filenames: + extension = [ext for ext in valid_extensions if filename.endswith(ext)] + if not extension: + if [ext for ext in ['.fastq', '.fq', '.vcf'] if filename.endswith(ext)]: + files['errors'].append('File must be compressed - please gzip file {}'.format(filename)) + else: + files['errors'].append('File extension on {} not supported - expecting one of: ' + '.fastq.gz, .fq.gz, .cram, .vcf.gz'.format(filename)) + continue + file_alias = '{}:{}'.format(proj_name, filename.strip().split('/')[-1]) + fmt = valid_extensions[extension[0]][0] + file_info = { + 'aliases': [file_alias], + 'row': idx, + 'file_format': '/file-formats/{}/'.format(fmt), + 'file_type': valid_extensions[extension[0]][1], + 'filename': filename.strip() + } + if fmt == 'fastq': + files['file_fastq'][file_alias] = file_info + else: + files['file_processed'][file_alias] = 
file_info + return files + + +def create_case_item_metadata(items, proj_name, case_name_dict, family_dict): + """ + Creation of case metadata, which can only be done after all rows are processed + so that sample_processing metadata exists. + """ + new_items = items.copy() + for k, v in items['sample_processing'].items(): + analysis_id = k[k.index('analysis-')+9:] + for sample in v['samples']: + case_id = '{}-{}'.format(analysis_id, items['sample'][sample]['specimen_accession']) + name = False + if case_id in case_name_dict: + name = True + case_id = case_name_dict[case_id] + case_alias = '{}:case-{}'.format(proj_name, case_id) + try: + indiv = [ikey for ikey, ival in items['individual'].items() if sample in ival.get('samples', [])][0] + except IndexError: + indiv = '' + case_info = { + 'aliases': [case_alias], + 'sample_processing': k, + 'family': '{}:{}'.format(proj_name, family_dict.get(analysis_id)), + 'individual': indiv + } + if name: + case_info['case_id'] = case_id + if sample in items['reports']: + report_alias = case_alias.replace('case', 'report') + new_items['report'][report_alias] = {'aliases': [report_alias]} + report_info = {'aliases': [report_alias]} + if indiv: + report_info['description'] = 'Analysis Report for Individual ID {} (Analysis {})'.format( + items['individual'][indiv]['individual_id'], analysis_id + ) + else: + report_info['description'] = 'Analysis Report for Case ID {}'.format(case_id) + case_info['report'] = report_alias + new_items['case'][case_alias] = case_info + del new_items['reports'] + return new_items + + +def add_relations(items): + """ + This function adds relations info to 'individual' metadata for proband. + This is done separately from row by row processing because information needed from spreadsheet + is on multiple rows. + """ + new_items = items.copy() + for alias, fam in items['family'].items(): + parents = False + for relation in ['mother', 'father']: + if fam.get(relation): + if fam.get('proband'): + new_items['individual'][fam['proband']][relation] = fam[relation] + parents = True + del new_items['family'][alias][relation] + for relation in ['brother', 'sister', 'sibling']: + if fam.get(relation): + if parents: + for parent in ['mother', 'father']: + if new_items['individual'][fam['proband']].get(parent): + new_items['individual'][fam[relation]][parent] = ( + new_items['individual'][fam['proband']][parent] + ) + del new_items['family'][alias][relation] + return new_items + + +def compare_with_db(virtualapp, alias): + try: # check if already in db + result = virtualapp.get('/' + alias + '/?frame=object') + if result.status_code == 301: + msg = json.loads(result.body).get('message', '') + result = virtualapp.get(msg[msg.index('/'):msg.index(';')]) + except Exception as e: # if not in db + if 'HTTPNotFound' in str(e): + return None + else: + return result.json + + +def validate_item(virtualapp, item, method, itemtype, aliases, atid=None): + data = deepcopy(item) + if data.get('filename'): + del data['filename'] + if method == 'post': + try: + validation = virtualapp.post_json('/{}/?check_only=true'.format(itemtype), data) + ignored(validation) # should it be? why did we assign it? -kmp 18-Sep-2020 + except (AppError, VirtualAppError) as e: + return parse_exception(e, aliases) + else: + return + elif method == 'patch': + try: + validation = virtualapp.patch_json(atid + '?check_only=true', data, status=200) + ignored(validation) # should it be? why did we assign it? 
-kmp 18-Sep-2020 + except (AppError, VirtualAppError) as e: + return parse_exception(e, aliases) + else: + return + else: + raise ValueError("Unrecognized method -- must be 'post' or 'patch'") + + +def parse_exception(e, aliases): + """ff_utils functions raise an exception when the expected code is not returned. + This response is a pre-formatted text, and this function will get the resonse json + out of it. [Adapted from Submit4DN]""" + try: + # try parsing the exception + if isinstance(e, VirtualAppError): + text = e.raw_exception.args[0] + else: + text = e.args[0] + resp_text = text[text.index('{'):-1] + resp_dict = json.loads(resp_text.replace('\\"', "\'").replace('\\', '')) + except Exception: # pragma: no cover + raise e + if resp_dict.get('description') == 'Failed validation': + keep = [] + resp_list = [error['name'] + ' - ' + error['description'] for error in resp_dict['errors']] + for error in resp_list: + # if error is caused by linkTo to item not submitted yet but in aliases list, + # remove that error + if 'not found' in error and error.split("'")[1] in aliases: + continue + else: + error = error.lstrip('Schema: ') + if error.index('- ') > 0: + field_name = error[:error.index(' - ')] + field = None + if field_name in GENERIC_FIELD_MAPPING['sample'].values(): + field = [key for key, val in GENERIC_FIELD_MAPPING['sample'].items() if val == field_name][0] + elif field_name == 'requisition_acceptance.accepted_rejected': + field = 'Req Accepted Y\\N' + error = map_enum_options(field_name, error) + if not field: + field = field_name.replace('_', ' ') + + error = 'field: ' + error.replace(field_name, field) + keep.append(error) + elif 'Additional properties are not allowed' in error: + keep.append(error[2:]) + return keep + else: + raise e + + +def map_enum_options(fieldname, error_message): + if fieldname == 'requisition_acceptance.accepted_rejected': + error_message = error_message.replace("['Accepted', 'Rejected']", "['Y', 'N']") + elif fieldname == 'specimen_accepted': + error_message = error_message.replace("['Yes', 'No']", "['Y', 'N']") + return error_message + + +def compare_fields(profile, aliases, json_item, db_item): + to_patch = {} + for field in json_item: + if field == 'filename': + if (db_item.get('status') in ['uploading', 'upload failed', 'to be uploaded by workflow'] + or json_item['filename'].split('/')[-1] != db_item.get('filename')): + to_patch['filename'] = json_item['filename'] + to_patch['status'] = 'uploading' + continue + # if not an array, patch field gets overwritten (if different from db) + if profile['properties'][field]['type'] != 'array': + val = json_item[field] + if profile['properties'][field]['type'] == 'string' and val in aliases: + val = aliases[val] + if val != db_item.get(field): + to_patch[field] = val + else: + # if array, patch field vals get added to what's in db + if field != 'aliases' and profile['properties'][field].get('items', {}).get('linkTo'): + val = [aliases[v] if v in aliases else v for v in json_item[field]] + else: + val = [v for v in json_item[field]] + if all(v in db_item.get(field, []) for v in val): + continue + new_val = [item for item in db_item.get(field, [])] + new_val.extend(val) + try: + to_patch[field] = list(set(new_val)) + except TypeError: # above doesn't handle list of dictionaries + to_patch[field] = [dict(t) for t in {tuple(d.items()) for d in new_val}] + return to_patch + + +def validate_all_items(virtualapp, json_data): + """ + Function that: + 1. looks up each item in json + 2. 
if item in db, will validate and patch any different metadata + 3. if item not in db, will post item + """ + output = [] + if list(json_data.keys()) == ['errors']: + output.append('Errors found in spreadsheet columns. Please fix spreadsheet before submitting.') + return {}, output, False + alias_dict = {} + errors = json_data['errors'] + all_aliases = [k for itype in json_data for k in json_data[itype]] + json_data_final = {'post': {}, 'patch': {}} + validation_results = {} + for itemtype in POST_ORDER: # don't pre-validate case and report + db_results = {} + if itemtype in json_data: + profile = virtualapp.get('/profiles/{}.json'.format(itemtype)).json + validation_results[itemtype] = {'validated': 0, 'errors': 0} + for alias in json_data[itemtype]: + # first collect all atids before comparing and validating items + db_result = compare_with_db(virtualapp, alias) + if db_result: + alias_dict[alias] = db_result['@id'] + db_results[alias] = db_result + for alias in json_data[itemtype]: + data = json_data[itemtype][alias].copy() + row = data.get('row') + if row: + del data['row'] + fname = json_data[itemtype][alias].get('filename') + if not db_results.get(alias): + error = validate_item(virtualapp, data, 'post', itemtype, all_aliases) + if error: # check an report presence of validation errors + if itemtype not in ['case', 'report']: + for e in error: + if row: + errors.append('Row {} - Error found: {}'.format(row, e)) + else: + errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) + validation_results[itemtype]['errors'] += 1 + else: + if fname: + if fname in ''.join(json_data['errors']): + validation_results[itemtype]['errors'] += 1 + else: + json_data[itemtype][alias]['status'] = 'uploading' + json_data_final['post'].setdefault(itemtype, []) + json_data_final['post'][itemtype].append(json_data[itemtype][alias]) + validation_results[itemtype]['validated'] += 1 + else: + # patch if item exists in db + patch_data = compare_fields(profile, alias_dict, data, db_results[alias]) + if itemtype in ['file_fastq', 'file_processed']: + if 'filename' in patch_data: + patch_data['status'] = 'uploading' + error = validate_item(virtualapp, patch_data, 'patch', itemtype, + all_aliases, atid=db_results[alias]['@id']) + if error: # report validation errors + if itemtype not in ['case', 'report']: + for e in error: + if row: + errors.append('Row {} {} - Error found: {}'.format(row, itemtype, e)) + else: + errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) + validation_results[itemtype]['errors'] += 1 + elif fname and fname in ''.join(json_data['errors']): + validation_results[itemtype]['errors'] += 1 + else: # patch + json_data_final['patch'].setdefault(itemtype, {}) + if patch_data: + json_data_final['patch'][itemtype][db_results[alias]['@id']] = patch_data + elif itemtype not in ['case', 'report', 'sample_processing', 'file_fastq']: + item_name = alias[alias.index(':')+1:] + if item_name.startswith(itemtype + '-'): + item_name = item_name[item_name.index('-') + 1:] + if itemtype == 'family': + item_name = 'family for ' + item_name + else: + item_name = itemtype + ' ' + item_name + output.append('{} - Item already in database, no changes needed'.format(item_name)) + # record response + validation_results[itemtype]['validated'] += 1 + output.extend([error for error in errors]) + for itemtype in validation_results: + output.append('{} items: {} validated; {} errors'.format( + itemtype, validation_results[itemtype]['validated'], validation_results[itemtype]['errors'] + )) + if 
errors: + output.append('Errors found in items. Please fix spreadsheet before submitting.') + return {}, output, False + else: + json_data_final['aliases'] = alias_dict + output.append('All items validated.') + return json_data_final, output, True + + +def post_and_patch_all_items(virtualapp, json_data_final): + output = [] + files = [] + if not json_data_final: + return output, 'not run', [] + item_names = {'individual': 'individual_id', 'family': 'family_id', 'sample': 'specimen_accession'} + final_status = {} + no_errors = True + if json_data_final.get('post'): + for k, v in json_data_final['post'].items(): + final_status[k] = {'posted': 0, 'not posted': 0, 'patched': 0, 'not patched': 0} + for item in v: + patch_info = {} + row = item.get('row') + if row: + del item['row'] + fname = item.get('filename') + if fname: + del item['filename'] + for field in LINKTO_FIELDS: + if field in item: + patch_info[field] = item[field] + del item[field] + try: + response = virtualapp.post_json('/' + k, item, status=201) + if response.json['status'] == 'success': + final_status[k]['posted'] += 1 + atid = response.json['@graph'][0]['@id'] + json_data_final['aliases'][item['aliases'][0]] = atid + json_data_final['patch'].setdefault(k, {}) + json_data_final['patch'][k][atid] = patch_info + if k in item_names: + output.append('Success - {} {} posted'.format(k, item[item_names[k]])) + if fname and item.get('status') == 'uploading': + files.append({ + 'uuid': response.json['@graph'][0]['uuid'], + 'filename': fname + }) + else: + final_status[k]['not posted'] += 1 + no_errors = False + except Exception as e: + final_status[k]['not posted'] += 1 + output.append(str(e)) + no_errors = False + for itype in final_status: + if final_status[itype]['posted'] > 0 or final_status[itype]['not posted'] > 0: + output.append('{}: {} items posted successfully; {} items not posted'.format( + itype, final_status[itype]['posted'], final_status[itype]['not posted'] + )) + for k, v in json_data_final['patch'].items(): + final_status.setdefault(k, {'patched': 0, 'not patched': 0}) + for item_id, patch_data in v.items(): + fname = patch_data.get('filename') + if fname: + del patch_data['filename'] + try: + response = virtualapp.patch_json('/' + item_id, patch_data, status=200) + if response.json['status'] == 'success': + final_status[k]['patched'] += 1 + if fname and patch_data.get('status') == 'uploading': + files.append({ + 'uuid': response.json['@graph'][0]['uuid'], + 'filename': fname + }) + else: + final_status[k]['not patched'] += 1 + no_errors = False + except Exception as e: + final_status[k]['not patched'] += 1 + output.append(str(e)) + no_errors = False + if final_status[k]['patched'] > 0 or final_status[k]['not patched'] > 0: + output.append('{}: {} items patched successfully; {} items not patched'.format( + k, final_status[k]['patched'], final_status[k]['not patched'] + )) + return output, no_errors, files + + +def cell_value(cell, datemode): + """Get cell value from excel. 
[From Submit4DN]""" + # This should be always returning text format + ctype = cell.ctype + value = cell.value + if ctype == xlrd.XL_CELL_ERROR: # pragma: no cover + raise ValueError(repr(cell), 'cell error') + elif ctype == xlrd.XL_CELL_BOOLEAN: + return str(value).upper().strip() + elif ctype == xlrd.XL_CELL_NUMBER: + if value.is_integer(): + value = int(value) + return str(value).strip() + elif ctype == xlrd.XL_CELL_DATE: + value = xlrd.xldate_as_tuple(value, datemode) + if value[3:] == (0, 0, 0): + return datetime.date(*value[:3]).isoformat() + else: # pragma: no cover + return datetime.datetime(*value).isoformat() + elif ctype in (xlrd.XL_CELL_TEXT, xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK): + return value.strip() + raise ValueError(repr(cell), 'unknown cell type') # pragma: no cover + + +def row_generator(sheet): + """Generator that gets rows from excel sheet [From Submit4DN]""" + datemode = sheet.book.datemode + for index in range(sheet.nrows): + yield [cell_value(cell, datemode) for cell in sheet.row(index)] diff --git a/src/encoded/tests/conftest_settings.py b/src/encoded/tests/conftest_settings.py index 971e3260f1..063044e832 100644 --- a/src/encoded/tests/conftest_settings.py +++ b/src/encoded/tests/conftest_settings.py @@ -30,6 +30,7 @@ 'file_upload_bucket': 'test-wfout-bucket', 'file_wfout_bucket': 'test-wfout-bucket', 'file_upload_profile_name': 'test-profile', + 'metadata_bundles_bucket': 'elasticbeanstalk-fourfront-cgaplocal-test-metadata-bundles', } diff --git a/src/encoded/tests/data/__init__.py b/src/encoded/tests/data/__init__.py new file mode 100644 index 0000000000..7718980631 --- /dev/null +++ b/src/encoded/tests/data/__init__.py @@ -0,0 +1,19 @@ +# Declarations of constants used in the inserts so they can be used more abstractly in testing. + +import os + +# Available in master_inserts, workbook_inserts, demo-inserts + +DBMI_INSTITUTION = '/institutions/hms-dbmi/' + +# Available in master_inserts, workbook_inserts + +TEST_PROJECT = "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/" + +# This gives variable names to refer to various files in this hierarchy. 
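+# For example, tests import them rather than hard-coding paths:
+#     from .data import TEST_PROJECT, DBMI_INSTITUTION, METADATA_BUNDLE_PATH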
+ +TEST_DATA_DIR = os.path.dirname(__file__) + +METADATA_BUNDLE_PATH = os.path.join(TEST_DATA_DIR, 'documents/cgap_submit_test.xlsx') +METADATA_BUNDLE_PATH_WITH_ERRORS = os.path.join(TEST_DATA_DIR, 'documents/cgap_submit_test_with_errors.xlsx') + diff --git a/src/encoded/tests/data/documents/cgap_submit_test.xlsx b/src/encoded/tests/data/documents/cgap_submit_test.xlsx new file mode 100644 index 0000000000..53dfe53be8 Binary files /dev/null and b/src/encoded/tests/data/documents/cgap_submit_test.xlsx differ diff --git a/src/encoded/tests/data/documents/cgap_submit_test_with_errors.xlsx b/src/encoded/tests/data/documents/cgap_submit_test_with_errors.xlsx new file mode 100644 index 0000000000..f6e736b89d Binary files /dev/null and b/src/encoded/tests/data/documents/cgap_submit_test_with_errors.xlsx differ diff --git a/src/encoded/tests/datafixtures.py b/src/encoded/tests/datafixtures.py index d33e1186e6..5f2077e9cd 100644 --- a/src/encoded/tests/datafixtures.py +++ b/src/encoded/tests/datafixtures.py @@ -16,6 +16,7 @@ 'software', 'static_section', 'tracking_item', 'workflow_mapping', 'workflow_run_awsem', 'workflow_run', 'annotation_field', 'variant_sample', 'variant', 'gene_annotation_field', 'gene', 'higlass_view_config', + 'ingestion_submission', ] diff --git a/src/encoded/tests/test_ingestion_common.py b/src/encoded/tests/test_ingestion_common.py new file mode 100644 index 0000000000..c8b19b55ae --- /dev/null +++ b/src/encoded/tests/test_ingestion_common.py @@ -0,0 +1,72 @@ +import pytest + +from ..ingestion.common import get_parameter +from ..ingestion.exceptions import MissingParameter, BadParameter + + +def test_get_parameter(): + + parameters = { + "foo": "bar", + "enabled": "true", + "alpha": "42", + "beta": 42, + } + parameters_original = parameters.copy() + + assert get_parameter(parameters, "foo") == "bar" + assert get_parameter(parameters, "enabled") == "true" + assert get_parameter(parameters, "alpha") == "42" + assert get_parameter(parameters, "beta") == 42 + + with pytest.raises(MissingParameter): + get_parameter(parameters, "gamma") + + with pytest.raises(BadParameter): + get_parameter(parameters, "beta", as_type=bool) + + assert get_parameter(parameters, "gamma", default=17) == 17 + assert get_parameter(parameters, "gamma", default=17, as_type=str) == 17 + + assert get_parameter(parameters, "beta", as_type=str) == "42" + + def force_title(x): + return str(x).title() + assert get_parameter(parameters, "alpha", as_type=force_title, default="stuff") == "42" + assert get_parameter(parameters, "beta", as_type=force_title, default="stuff") == "42" + assert get_parameter(parameters, "gamma", as_type=force_title, default="stuff") == "stuff" + assert get_parameter(parameters, "foo", as_type=force_title, default="stuff") == "Bar" + + assert get_parameter(parameters, "foo", as_type=str) == "bar" + assert get_parameter(parameters, "enabled", as_type=bool) == True + assert get_parameter(parameters, "alpha", as_type=int) == 42 + assert get_parameter(parameters, "beta", as_type=int) == 42 + + assert parameters == parameters_original # No side effects before this point. No uses of update=True yet. 
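+    # (update=True is expected to write the looked-up value, after defaulting or type coercion,
+    #  back into the parameters dict so that later lookups see it; the assertions below
+    #  depend on that side effect.)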
+ + assert get_parameter(parameters, "gamma", default=17, update=True) == 17 + assert get_parameter(parameters, "gamma") == 17 # update don previous line + + bool_tests = { + "truth1": "TrUe", + "truth2": True, + "falsity1": "", + "falsity2": "faLSE", + "falsity3": "NONE", + "falsity4": "NuLL", + "falsity5": None, + } + + for key in bool_tests: + if key.startswith("t"): + assert get_parameter(bool_tests, key, as_type=bool) == True + elif key.startswith("f"): + assert get_parameter(bool_tests, key, as_type=bool) == False + + for key in bool_tests: + if key.startswith("t"): + assert get_parameter(bool_tests, key, as_type=bool, update=True) == True + elif key.startswith("f"): + assert get_parameter(bool_tests, key, as_type=bool, update=True) == False + + assert bool_tests == { k: k.startswith('t') for k in bool_tests.keys() } diff --git a/src/encoded/tests/test_ingestion_metadata_bundle.py b/src/encoded/tests/test_ingestion_metadata_bundle.py new file mode 100644 index 0000000000..3439df5938 --- /dev/null +++ b/src/encoded/tests/test_ingestion_metadata_bundle.py @@ -0,0 +1,294 @@ +import boto3 +import botocore.exceptions +import datetime as datetime_module +import io +import json +import os +import pytz +import webtest + +from dcicutils import qa_utils +from dcicutils.qa_utils import ignored, ControlledTime, MockFileSystem +from dcicutils.lang_utils import n_of +from unittest import mock +from .data import TEST_PROJECT, DBMI_INSTITUTION, METADATA_BUNDLE_PATH +from .. import ingestion_listener as ingestion_listener_module +from ..types import ingestion as ingestion_module + + +SUBMIT_FOR_INGESTION = "/submit_for_ingestion" + + +def expect_unreachable_in_mock(function_name): + def fn(*args, **kwargs): + ignored(args, kwargs) + raise AssertionError("The function %s should not have been called. Its caller should have been mocked." + % function_name) + return fn + + +def constantly(value): + def fn(*args, **kwargs): + ignored(args, kwargs) + return value + return fn + + +class FakeGuid: + + def __init__(self): + self.counter = 0 + + def fake_guid(self): + self.counter += 1 + return self.format_fake_guid(self.counter) + + @classmethod + def format_fake_guid(cls, n): + digits = str(n).rjust(10, '0') + return "%s-%s-%s" % (digits[0:3], digits[3:7], digits[7:10]) + + +class MockQueueManager: + + def __init__(self, expected_ingestion_type): + self.expected_ingestion_type = expected_ingestion_type + self.uuids = [] + + def add_uuids(self, uuids, ingestion_type): + assert ingestion_type == self.expected_ingestion_type + self.uuids += uuids + return uuids, [] + + +class MockSubmissionFolioClass: + + EXPECTED_INGESTION_TYPE = 'metadata_bundle' + EXPECTED_INSTITUTION = DBMI_INSTITUTION + EXPECTED_PROJECT = TEST_PROJECT + + def __init__(self): + self.guid_factory = FakeGuid() + self.items_created = [] + + def create_item(self, request, ingestion_type, institution, project): + # This is ordinarily a class method, but an instance of this class will be used as a class stand-in + # so this is an instance method. 
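+        # (Using an instance as the stand-in also lets each fake guid be recorded in
+        #  self.items_created, so the test can assert on exactly which items were created.)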
+ ignored(request) + assert ingestion_type == self.EXPECTED_INGESTION_TYPE + assert institution == self.EXPECTED_INSTITUTION + assert project == self.EXPECTED_PROJECT + guid = self.guid_factory.fake_guid() + self.items_created.append(guid) + return guid + + @classmethod + def make_submission_uri(cls, submission_id): + return "/ingestion-submissions/" + submission_id + + +class MockBotoS3Client: + + def __init__(self): + self.s3_files = MockFileSystem() + + def upload_fileobj(self, input_file_stream, Bucket, Key): # noqa - Uppercase argument names are chosen by AWS + data = input_file_stream.read() + print("Uploading %s (%s) to bucket %s key %s" + % (input_file_stream, n_of(len(data), "byte"), Bucket, Key)) + with self.s3_files.open(os.path.join(Bucket, Key), 'wb') as fp: + fp.write(data) + + +def test_submit_for_ingestion_anon_rejected(anontestapp): + + post_files = [("datafile", METADATA_BUNDLE_PATH)] + + post_data = { + 'ingestion_type': 'metadata_bundle', + 'institution': DBMI_INSTITUTION, + 'project': TEST_PROJECT, + 'validate_only': True, + } + + response = anontestapp.post_json( + SUBMIT_FOR_INGESTION, + post_data, + upload_files=post_files, + content_type='multipart/form-data', + status=403 # Forbidden + ) + + assert response.status_code == 403 + + +def file_contents(filename, binary=False): + with io.open(filename, 'rb' if binary else 'r') as fp: + return fp.read() + + +def check_submit_for_ingestion_authorized(testapp, mocked_s3_client, expected_status=200): + + class ControlledTimeWithFix(ControlledTime): + + def just_utcnow(self): + return self.just_now().astimezone(pytz.UTC).replace(tzinfo=None) + + dt = ControlledTimeWithFix() + + ingestion_type = 'metadata_bundle' + + mocked_queue_manager = MockQueueManager(expected_ingestion_type='metadata_bundle') + + post_files = [("datafile", METADATA_BUNDLE_PATH)] + + post_data = { + 'ingestion_type': ingestion_type, + 'institution': DBMI_INSTITUTION, + 'project': TEST_PROJECT, + 'validate_only': True, + } + + fake_tester_email = "test@cgap.hms.harvard.edu" + + def mocked_get_trusted_email(request, context, raise_errors): + assert context is "Submission" + assert raise_errors is False + if request.remote_user == 'TEST': + return fake_tester_email + else: + return None + + test_pseudoenv = "fourfront-cgaplocal-test" + + with mock.patch.object(ingestion_listener_module, "get_trusted_email", mocked_get_trusted_email): + with mock.patch.object(datetime_module, "datetime", dt): + with mock.patch.object(ingestion_listener_module, "beanstalk_env_from_request", + return_value=test_pseudoenv): + with mock.patch.object(qa_utils, "FILE_SYSTEM_VERBOSE", False): # This should be a parameter but isn't + mock_submission_folio_class = MockSubmissionFolioClass() + with mock.patch.object(ingestion_listener_module, "SubmissionFolio", mock_submission_folio_class): + with mock.patch.object(boto3, "client", constantly(mocked_s3_client)): + with mock.patch.object(ingestion_listener_module, "get_queue_manager", + constantly(mocked_queue_manager)): + with mock.patch.object(ingestion_module, "subrequest_item_creation", + expect_unreachable_in_mock("subrequest_item_creation")): + + response = testapp.post(SUBMIT_FOR_INGESTION, post_data, upload_files=post_files, + content_type='multipart/form-data', status=expected_status) + + assert response.status_code == expected_status, ( + "Expected response status %s but got %s." 
+ % (expected_status, response.status_code) + ) + + # The FakeGuid facility makes ids sequentially, so we can predict we'll get + # one guid added to our mock queue. This test doesn't test the queue processing, + # only that something ends up passed off to thq queue. + expected_guid = '000-0000-001' + + assert mocked_queue_manager.uuids == [expected_guid] + + assert mock_submission_folio_class.items_created == [expected_guid] + + s3_file_system = mocked_s3_client.s3_files.files + + expected_bucket = "elasticbeanstalk-fourfront-cgaplocal-test-metadata-bundles" + + datafile_short_name = "datafile.xlsx" + manifest_short_name = "manifest.json" + + datafile_key = os.path.join(expected_guid, datafile_short_name) + manifest_key = os.path.join(expected_guid, manifest_short_name) + + datafile_name = os.path.join(expected_bucket, datafile_key) + manifest_name = os.path.join(expected_bucket, manifest_key) + + assert set(s3_file_system.keys()) == {datafile_name, manifest_name} + + assert s3_file_system[datafile_name] == file_contents(METADATA_BUNDLE_PATH, + binary=True) + + assert json.loads(s3_file_system[manifest_name].decode('utf-8')) == { + "filename": METADATA_BUNDLE_PATH, + "object_name": datafile_key, + "submission_id": expected_guid, + "submission_uri": "/ingestion-submissions/000-0000-001", + "beanstalk_env_is_prd": False, + "beanstalk_env": test_pseudoenv, + "bucket": expected_bucket, + "authenticated_userid": "remoteuser.TEST", + "email": fake_tester_email, + "success": True, + "message": "Uploaded successfully.", + + "upload_time": dt.just_utcnow().isoformat(), + "parameters": { + "ingestion_type": ingestion_type, + "institution": DBMI_INSTITUTION, + "project": TEST_PROJECT, + "validate_only": "True", + "datafile": METADATA_BUNDLE_PATH, + }, + } + + # Make sure we report success from the endpoint + assert response.status_code == 200 + + +# This runs the standard test pretty much as expected. 
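+# That is, it uses the plain MockBotoS3Client, so both uploads to S3 succeed and the
+# endpoint is expected to return 200.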
+def test_submit_for_ingestion_authorized(testapp):
+
+    check_submit_for_ingestion_authorized(testapp, MockBotoS3Client())
+
+
+# The next couple of tests are small variations in which the first or second interaction with S3 fails
+
+class MockBuggyBotoS3Client(MockBotoS3Client):
+
+    def __init__(self, allowed_ok=0):
+        self.counter = 0
+        self.allowed_ok = allowed_ok
+        super().__init__()
+
+    def upload_fileobj(self, input_file_stream, Bucket, Key):  # noqa - AWS decided args were uppercase
+        self.counter += 1
+        if self.counter <= self.allowed_ok:
+            return super().upload_fileobj(input_file_stream, Bucket=Bucket, Key=Key)
+        else:
+            raise botocore.exceptions.ClientError({'Error': {'Code': 400, 'Message': "Simulated error."}},
+                                                  'upload_fileobj')
+
+
+def test_submit_for_ingestion_authorized_but_failed_first_s3_interaction(testapp):
+
+    try:
+        check_submit_for_ingestion_authorized(testapp, MockBuggyBotoS3Client(), expected_status=400)
+    except webtest.AppError as e:
+        assert str(e) == ('Bad response: 500 Internal Server Error (not 400)\n'
+                          'b\'{"@type": ["SubmissionFailure", "Error"],'
+                          ' "status": "error",'
+                          ' "code": 500,'
+                          ' "title": "Internal Server Error",'
+                          ' "description": "",'
+                          ' "detail": "botocore.exceptions.ClientError:'
+                          ' An error occurred (400) when calling the upload_fileobj operation: Simulated error."}\'')
+    else:
+        raise AssertionError("An expected webtest.AppError was not raised.")
+
+
+def test_submit_for_ingestion_authorized_but_failed_second_s3_interaction(testapp):
+
+    try:
+        check_submit_for_ingestion_authorized(testapp, MockBuggyBotoS3Client(allowed_ok=1), expected_status=400)
+    except webtest.AppError as e:
+        assert str(e) == ('Bad response: 500 Internal Server Error (not 400)\n'
+                          'b\'{"@type": ["SubmissionFailure", "Error"],'
+                          ' "status": "error",'
+                          ' "code": 500,'
+                          ' "title": "Internal Server Error",'
+                          ' "description": "",'
+                          ' "detail": "botocore.exceptions.ClientError (while uploading metadata):'
+                          ' An error occurred (400) when calling the upload_fileobj operation: Simulated error."}\'')
+    else:
+        raise AssertionError("An expected webtest.AppError was not raised.")
diff --git a/src/encoded/tests/test_submit.py b/src/encoded/tests/test_submit.py
new file mode 100644
index 0000000000..226a22386d
--- /dev/null
+++ b/src/encoded/tests/test_submit.py
@@ -0,0 +1,434 @@
+import pytest
+import xlrd
+
+from copy import deepcopy
+from unittest import mock
+from .. import submit
+from ..submit import (
+    compare_fields,
+    digest_xls,
+    init_families,
+    extract_family_metadata,
+    extract_file_metadata,
+    extract_individual_metadata,
+    extract_sample_metadata,
+    get_analysis_types,
+    map_fields,
+    parse_exception,
+    row_generator,
+    validate_all_items,
+    validate_item,
+    xls_to_json,
+)
+
+
+@pytest.fixture
+def row_dict():
+    return {
+        'individual id': '456',
+        'family id': '333',
+        'sex': 'M',
+        'relation to proband': 'proband',
+        'report required': 'Y',
+        'specimen id': '3464467',
+        'specimen type': 'blood',
+        'workup type': 'WGS'
+    }
+
+
+@pytest.fixture
+def xls_list():
+    book = xlrd.open_workbook('src/encoded/tests/data/documents/cgap_submit_test.xlsx')
+    sheet, = book.sheets()
+    row = row_generator(sheet)
+    return list(row)
+
+
+@pytest.fixture
+def empty_items():
+    return {
+        'individual': {}, 'family': {}, 'file_fastq': {},
+        'file_processed': {}, 'sample': {}, 'sample_processing': {},
+        'case': {}, 'report': {}, 'reports': [], 'errors': []
+    }
+
+
+@pytest.fixture
+def submission_info():
+    return {
+        'family': {'test-proj:fam1': {
+            'members': ['test-proj:indiv1'],
+            'proband': 'test-proj:indiv1'
+        }},
+        'individual': {'test-proj:indiv1': {'samples': ['test-proj:samp1']}},
+        'sample': {'test-proj:samp1': {'workup_type': 'WGS'}},
+        'sample_processing': {},
+        'errors': []
+    }
+
+
+@pytest.fixture
+def submission_info2(submission_info):
+    submission_info['family']['test-proj:fam1']['members'].append('test-proj:indiv2')
+    submission_info['individual']['test-proj:indiv2'] = {'samples': ['test-proj:samp2']}
+    submission_info['sample']['test-proj:samp2'] = {'workup_type': 'WGS'}
+    return submission_info
+
+
+@pytest.fixture
+def submission_info3(submission_info2):
+    info = submission_info2.copy()
+    info['family']['test-proj:fam1']['members'].append('test-proj:indiv3')
+    info['family']['test-proj:fam1']['mother'] = 'test-proj:indiv2'
+    # submission_info['family']['test-proj:fam1']['father'] = 'test-proj:indiv3'
+    info['individual']['test-proj:indiv3'] = {'samples': ['test-proj:samp3']}
+    info['sample']['test-proj:samp3'] = {'workup_type': 'WGS'}
+    return info
+
+
+@pytest.fixture
+def sample_info():
+    return {
+        'workup type': 'WES',
+        'specimen id': '9034',
+        'date collected': '2020-01-06'
+    }
+
+
+@pytest.fixture
+def example_rows():
+    return [
+        {'individual id': '456', 'analysis id': '1111', 'relation to proband': 'proband', 'workup type': 'WGS'},
+        {'individual id': '123', 'analysis id': '1111', 'relation to proband': 'mother', 'workup type': 'WGS'},
+        {'individual id': '789', 'analysis id': '1111', 'relation to proband': 'father', 'workup type': 'WGS'},
+        {'individual id': '456', 'analysis id': '2222', 'relation to proband': 'proband', 'workup type': 'WGS'},
+        {'individual id': '555', 'analysis id': '3333', 'relation to proband': 'proband', 'workup type': 'WES'},
+        {'individual id': '546', 'analysis id': '3333', 'relation to proband': 'mother', 'workup type': 'WES'}
+    ]
+
+
+@pytest.fixture
+def new_family(child, mother, father):
+    return {
+        "title": "Smith family",
+        "proband": child['@id'],
+        "members": [
+            child['@id'],
+            mother['@id'],
+            father['@id']
+        ]
+    }
+
+
+@pytest.fixture
+def aunt(testapp, project, institution):
+    item = {
+        "accession": "GAPIDAUNT001",
+        "age": 35,
+        "age_units": "year",
+        'project': project['@id'],
+        'institution': institution['@id'],
+        "sex": "F"
+    }
+    return testapp.post_json('/individual', item).json['@graph'][0]
+
+
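+# The fixtures above supply canned spreadsheet rows and partially built submission dictionaries;
+# the tests below exercise the individual parsing and validation helpers imported from submit.py.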
+def test_map_fields(sample_info):
+    result = map_fields(sample_info, {}, ['workup_type'], 'sample')
+    assert result['workup_type'] == 'WES'
+    assert result['specimen_accession'] == '9034'
+    assert result['specimen_collection_date'] == '2020-01-06'
+    assert not result.get('sequencing_lab')
+
+
+def test_init_families(example_rows):
+    fams = init_families(example_rows)
+    assert sorted(list(fams.keys())) == ['1111', '2222', '3333']
+    assert fams['1111'] == 'family-456'
+    assert fams['2222'] == 'family-456'
+    assert fams['3333'] == 'family-555'
+
+
+def test_get_analysis_types(example_rows):
+    a_types = get_analysis_types(example_rows)
+    assert a_types['1111'] == 'WGS-Trio'
+    assert a_types['2222'] == 'WGS'
+    assert a_types['3333'] == 'WES-Group'
+    example_rows[1]['workup type'] = 'WES'
+    new_a_types = get_analysis_types(example_rows)
+    assert new_a_types['1111'] is None
+
+
+def test_extract_individual_metadata_new(row_dict, empty_items):
+    items_out = extract_individual_metadata(1, row_dict, empty_items, 'test-proj:indiv1', 'hms-dbmi')
+    assert items_out['individual']['test-proj:indiv1']['aliases'] == ['test-proj:indiv1']
+    assert items_out['individual']['test-proj:indiv1']['individual_id'] == '456'
+
+
+def test_extract_individual_metadata_old(row_dict, empty_items):
+    items = empty_items.copy()
+    items['individual'] = {'test-proj:indiv1': {
+        'individual_id': '456',
+        'age': 46,
+        'aliases': ['test-proj:indiv1']
+    }}
+    items_out = extract_individual_metadata(1, row_dict, items, 'test-proj:indiv1', 'hms-dbmi')
+    assert len(items['individual']) == len(items_out['individual'])
+    assert 'sex' in items_out['individual']['test-proj:indiv1']
+    assert 'age' in items_out['individual']['test-proj:indiv1']
+
+
+def test_extract_individual_metadata_nums(row_dict, empty_items):
+    items2 = deepcopy(empty_items)
+    row_dict['age'] = '33'
+    row_dict['birth year'] = '1988'
+    items_out_nums = extract_individual_metadata(1, row_dict, empty_items, 'test-proj:indiv1', 'hms-dbmi')
+    assert not items_out_nums['errors']
+    assert isinstance(items_out_nums['individual']['test-proj:indiv1']['age'], int)
+    assert isinstance(items_out_nums['individual']['test-proj:indiv1']['birth_year'], int)
+    # text values for age and birth year should be passed on without errors to eventually fail validation
+    row_dict['age'] = 'abc'
+    row_dict['birth year'] = 'def'
+    items_out_text = extract_individual_metadata(1, row_dict, items2, 'test-proj:indiv1', 'hms-dbmi')
+    assert not items_out_text['errors']
+    assert isinstance(items_out_text['individual']['test-proj:indiv1']['age'], str)
+    assert isinstance(items_out_text['individual']['test-proj:indiv1']['birth_year'], str)
+
+
+def test_extract_family_metadata_new(row_dict, empty_items):
+    items_out = extract_family_metadata(1, row_dict, empty_items, 'test-proj:indiv1', 'test-proj:fam1')
+    assert items_out['family']['test-proj:fam1']['members'] == ['test-proj:indiv1']
+    assert items_out['family']['test-proj:fam1']['proband'] == 'test-proj:indiv1'
+
+
+def test_extract_family_metadata_old(row_dict, empty_items):
+    items = empty_items.copy()
+    items['family'] = {'test-proj:fam1': {
+        'aliases': ['test-proj:fam1'],
+        'family_id': '333',
+        'members': ['test-proj:indiv2'],
+        'mother': 'test-proj:indiv2'
+    }}
+    items_out = extract_family_metadata(1, row_dict, items, 'test-proj:indiv1', 'test-proj:fam1')
+    assert items_out['family']['test-proj:fam1']['members'] == ['test-proj:indiv2', 'test-proj:indiv1']
+    assert items_out['family']['test-proj:fam1']['proband'] == 'test-proj:indiv1'
+    assert items_out['family']['test-proj:fam1']['mother'] == 'test-proj:indiv2'
+
+
+def test_extract_family_metadata_invalid_relation(row_dict, empty_items):
+    row_dict['relation to proband'] = 'grandmother'
+    items_out = extract_family_metadata(1, row_dict, empty_items, 'test-proj:indiv1', 'test-proj:fam1')
+    assert 'Row 1 - Invalid relation' in items_out['errors'][0]
+
+
+def test_extract_sample_metadata_sp(row_dict, empty_items):
+    items = empty_items.copy()
+    items['individual'] = {'test-proj:indiv1': {}}
+    row_dict['req accepted y/n'] = 'Yes'
+    row_dict['specimen accepted by ref lab'] = "n"
+    items_out = extract_sample_metadata(
+        1, row_dict, items, 'test-proj:indiv1', 'test-proj:samp1',
+        'test-proj:sp1', 'test-proj:fam1', 'test-proj', {}, {}
+    )
+    print(items_out['sample']['test-proj:samp1'])
+    assert items_out['sample']['test-proj:samp1']['specimen_accession'] == row_dict['specimen id']
+    assert items_out['sample']['test-proj:samp1']['specimen_accepted'] == 'No'
+    assert items_out['sample']['test-proj:samp1']['requisition_acceptance']['accepted_rejected'] == 'Accepted'
+    assert items_out['sample_processing']['test-proj:sp1']['samples'] == ['test-proj:samp1']
+    assert items_out['individual']['test-proj:indiv1']['samples'] == ['test-proj:samp1']
+
+
+def test_extract_file_metadata_valid():
+    results = extract_file_metadata(1, ['f1.fastq.gz', 'f2.cram', 'f3.vcf.gz'], 'test-proj')
+    assert 'test-proj:f1.fastq.gz' in results['file_fastq']
+    assert results['file_fastq']['test-proj:f1.fastq.gz']['file_format'] == '/file-formats/fastq/'
+    assert results['file_fastq']['test-proj:f1.fastq.gz']['file_type'] == 'reads'
+    assert 'test-proj:f2.cram' in results['file_processed']
+    assert 'test-proj:f3.vcf.gz' in results['file_processed']
+    assert not results['errors']
+
+
+def test_extract_file_metadata_uncompressed():
+    results = extract_file_metadata(1, ['f1.fastq', 'f2.cram', 'f3.vcf'], 'test-proj')
+    assert not results['file_fastq']
+    assert 'test-proj:f2.cram' in results['file_processed']
+    assert 'test-proj:f3.vcf' not in results['file_processed']
+    assert len(results['errors']) == 2
+    assert all('File must be compressed' in error for error in results['errors'])
+
+
+def test_extract_file_metadata_invalid():
+    results = extract_file_metadata(1, ['f3.gvcf.gz'], 'test-proj')
+    assert all(not results[key] for key in ['file_fastq', 'file_processed'])
+    assert results['errors'] == [
+        'File extension on f3.gvcf.gz not supported - '
+        'expecting one of: .fastq.gz, .fq.gz, .cram, .vcf.gz'
+    ]
+
+
+def test_xls_to_json(project, institution):
+    rows = digest_xls('src/encoded/tests/data/documents/cgap_submit_test.xlsx')
+    json_out, success = xls_to_json(rows, project, institution)
+    assert len(json_out['family']) == 1
+    assert 'encode-project:family-456' in json_out['family']
+    assert len(json_out['individual']) == 3
+    assert all(['encode-project:individual-' + x in json_out['individual'] for x in ['123', '456', '789']])
+
+
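+# The remaining xls_to_json tests mutate rows read from the example workbook to trigger specific
+# error-handling paths (missing headers, missing required columns or values, bad workup types).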
+def test_xls_to_json_no_header(project, institution, xls_list):
+    no_top_header = iter(xls_list[1:])  # top header missing should work ok (e.g. 'Patient Information', etc)
+    no_main_header = iter([xls_list[0]] + xls_list[2:])  # main header missing should cause a caught error
+    no_comments = iter(xls_list[0:2] + xls_list[3:])
+    json_out, success = xls_to_json(no_top_header, project, institution)
+    assert success
+    json_out, success = xls_to_json(no_main_header, project, institution)
+    assert not success
+    json_out, success = xls_to_json(no_comments, project, institution)
+    assert success
+
+
+def test_xls_to_json_missing_req_col(project, institution, xls_list):
+    # test error is caught when a required column is missing from the Excel file
+    idx = xls_list[1].index('Specimen ID')
+    rows = (row[0:idx] + row[idx+1:] for row in xls_list)
+    json_out, success = xls_to_json(rows, project, institution)
+    assert not success
+
+
+def test_xls_to_json_missing_req_val(project, institution, xls_list):
+    # test error is caught when a required column is present but value is missing in a row
+    idx = xls_list[1].index('Specimen ID')
+    xls_list[4] = xls_list[4][0:idx] + [''] + xls_list[4][idx+1:]
+    rows = iter(xls_list)
+    json_out, success = xls_to_json(rows, project, institution)
+    assert json_out['errors']
+    assert success
+
+
+def test_xls_to_json_invalid_workup(project, institution, xls_list):
+    # invalid workup type is caught as an error
+    idx = xls_list[1].index('Workup Type')
+    xls_list[4] = xls_list[4][0:idx] + ['Other'] + xls_list[4][idx+1:]
+    rows = iter(xls_list)
+    json_out, success = xls_to_json(rows, project, institution)
+    assert json_out['errors']
+    print(json_out['errors'])
+    assert success
+    assert ('Row 5 - Samples with analysis ID 55432 contain mis-matched '
+            'or invalid workup type values.') in ''.join(json_out['errors'])
+
+
+def test_xls_to_json_mixed_workup(project, institution, xls_list):
+    # mixed workup types per analysis are caught as an error
+    idx = xls_list[1].index('Workup Type')
+    xls_list[3] = xls_list[3][0:idx] + ['WES'] + xls_list[3][idx+1:]
+    one_row = xls_list[:4]
+    rows = iter(xls_list)
+    json_out, success = xls_to_json(rows, project, institution)
+    assert json_out['errors']
+    assert success
+    assert ('Row 5 - Samples with analysis ID 55432 contain mis-matched '
+            'or invalid workup type values.') in ''.join(json_out['errors'])
+    single_row = iter(one_row)
+    one_json_out, one_success = xls_to_json(single_row, project, institution)
+    assert not one_json_out['errors']
+
+
+def test_parse_exception_invalid_alias(testapp, a_case):
+    a_case['invalid_field'] = 'value'
+    a_case['project'] = '/projects/invalid-project/'
+    errors = []
+    try:
+        testapp.post_json('/case', a_case)
+    except Exception as e:
+        errors = parse_exception(e, ['/projects/other-project/'])
+    assert len(errors) == 2
+    assert 'Additional properties are not allowed' in ''.join(errors)
+    assert 'not found' in ''.join(errors)
+
+
+def test_parse_exception_with_alias(testapp, a_case):
+    a_case['project'] = '/projects/invalid-project/'
+    errors = None
+    try:
+        testapp.post_json('/case', a_case)
+    except Exception as e:
+        errors = parse_exception(e, ['/projects/invalid-project/'])
+    assert errors == []
+
+
+def test_compare_fields_same(testapp, fam, new_family):
+    profile = testapp.get('/profiles/family.json').json
+    result = compare_fields(profile, [], new_family, fam)
+    assert not result
+
+
+def test_compare_fields_different(testapp, aunt, fam, new_family):
+    new_family['members'].append(aunt['@id'])
+    new_family['title'] = 'Smythe family'
+    profile = testapp.get('/profiles/family.json').json
+    result = compare_fields(profile, [], new_family, fam)
+    assert len(result) == 2
+    assert 'title' in result
+    assert len(result['members']) == len(fam['members']) + 1
+
+
+def test_validate_item_post_valid(testapp, a_case):
+    result = validate_item(testapp, a_case, 'post', 'case', [])
+    assert not result
+
+
+def test_validate_item_post_invalid(testapp, a_case):
+    a_case['project'] = '/projects/invalid-project/'
+    result = validate_item(testapp, a_case, 'post', 'case', [])
+    assert 'not found' in result[0]
+
+
+def test_validate_item_post_invalid_yn(testapp, sample_info):
+    sample_info['req accepted y/n'] = 'not sure'
+    sample_info['specimen accepted by ref lab'] = "I don't know"
+    sample_item = map_fields(sample_info, {}, ['workup_type'], 'sample')
+    req_info = map_fields(sample_info, {}, ['date sent', 'date completed'], 'requisition')
+    sample_item['requisition_acceptance'] = req_info
+    result = validate_item(testapp, sample_item, 'post', 'sample', [])
+    assert len(result) == 2
+    assert all("is not one of ['Y', 'N']" in error for error in result)
+
+
+def test_validate_item_patch_valid(testapp, mother, grandpa):
+    patch_dict = {'mother': mother['aliases'][0]}
+    result = validate_item(testapp, patch_dict, 'patch', 'individual', [], atid=grandpa['@id'])
+    assert not result
+
+
+def test_validate_item_patch_invalid(testapp, grandpa):
+    patch_dict = {'mother': 'non-existant-alias'}
+    result = validate_item(testapp, patch_dict, 'patch', 'individual', [], atid=grandpa['@id'])
+    assert 'not found' in result[0]
+
+
+def test_validate_item_patch_alias(testapp, grandpa):
+    patch_dict = {'mother': 'existing-alias'}
+    result = validate_item(testapp, patch_dict, 'patch', 'individual', ['existing-alias'], atid=grandpa['@id'])
+    assert not result
+
+
+def test_validate_all_items_errors(testapp, mother, empty_items):
+    new_individual = {
+        'aliases': ['test-proj:new-individual-alias'],
+        'individual_id': '1234',
+        'sex': 'F',
+        'mother': mother['aliases'][0],
+        'project': 'test-proj:invalid-project-alias',
+        'institution': 'test-proj:invalid-institution-alias'
+    }
+    items = empty_items
+    items['individual']['new-individual-alias'] = new_individual
+    data_out, result, success = validate_all_items(testapp, items)
+    assert not data_out
+    assert not success
+    assert len(result) > 1
+    errors = ' '.join(result)
+    assert "'test-proj:invalid-project-alias' not found" in errors
+    assert "'test-proj:invalid-institution-alias' not found" in errors
+    assert mother['aliases'][0] not in errors
diff --git a/src/encoded/tests/test_submit_data_bundle.py b/src/encoded/tests/test_submit_data_bundle.py
new file mode 100644
index 0000000000..1fc683b00a
--- /dev/null
+++ b/src/encoded/tests/test_submit_data_bundle.py
@@ -0,0 +1,3 @@
+# To be written
+
+
diff --git a/src/encoded/tests/test_util.py b/src/encoded/tests/test_util.py
index d531cf8bae..2b1429aa75 100644
--- a/src/encoded/tests/test_util.py
+++ b/src/encoded/tests/test_util.py
@@ -1,6 +1,7 @@
 import datetime
 import io
 import os
+import pyramid.httpexceptions
 import pytest
 import tempfile
 
@@ -8,10 +9,17 @@
 from dcicutils.qa_utils import ControlledTime, ignored
 from ..util import (
     debuglog, deduplicate_list, gunzip_content, resolve_file_path, ENCODED_ROOT_DIR, get_trusted_email,
+    full_class_name,
 )
 from .. import util as util_module
 
 
+def test_full_class_name():
+
+    assert full_class_name(3) == 'int'
+    assert full_class_name(pyramid.httpexceptions.HTTPClientError("Oops")) == "pyramid.httpexceptions.HTTPClientError"
+
+
 def test_deduplicate_list():
 
     def sort_somehow(seq):
@@ -236,3 +244,4 @@ def log_content():
 def test_get_trusted_email():
     # TODO: This needs unit testing.
     ignored(get_trusted_email)
+
diff --git a/src/encoded/types/family.py b/src/encoded/types/family.py
index ecf37805fb..3307699985 100644
--- a/src/encoded/types/family.py
+++ b/src/encoded/types/family.py
@@ -11,9 +11,6 @@
     calculated_property,
     collection,
     load_schema,
-    CONNECTION,
-    COLLECTIONS,
-    display_title_schema
 )
 from snovault.util import debug_log
 from webtest import TestApp
diff --git a/src/encoded/types/ingestion.py b/src/encoded/types/ingestion.py
new file mode 100644
index 0000000000..1f4d24f157
--- /dev/null
+++ b/src/encoded/types/ingestion.py
@@ -0,0 +1,180 @@
+"""
+Collection for objects related to ingestion submissions.
+"""
+
+import boto3
+import contextlib
+import json
+import logging
+import re
+import traceback
+
+from dcicutils.misc_utils import ignored, check_true
+from snovault import collection, load_schema
+from pyramid.request import Request
+from pyramid.security import Allow, Deny, Everyone
+from .base import (
+    Item,
+    # TODO: Maybe collect all these permission styles into a single file, give them symbolic names,
+    # and permit only the symbolic names to be used in each situation so we can curate a full inventory of modes.
+    # -kmp 26-Jul-2020
+    ALLOW_SUBMITTER_ADD,
+)
+from .institution import (
+    ONLY_ADMIN_VIEW,
+)
+from ..util import (
+    debuglog, subrequest_item_creation, beanstalk_env_from_registry, create_empty_s3_file, s3_output_stream
+)
+from ..ingestion.common import metadata_bundles_bucket, get_parameter
+
+ALLOW_SUBMITTER_VIEW = (
+    # TODO: There is an issue here where we want a logged-in user remotely only to be able to view this,
+    # but if we are proxying for them internally we want to be able to view OR edit.
+    # There is never a reason for a user outside the system to update this status. -kmp 26-Jul-2020
+    []  # Special additional permissions might go here.
+    + ALLOW_SUBMITTER_ADD  # Is this right? See note above.
+    + ONLY_ADMIN_VIEW  # Slightly misleading name. Allows admins to edit, too, actually. But only they can view.
+)
+
+
+class SubmissionFolio:
+
+    INGESTION_SUBMISSION_URI = '/IngestionSubmission'
+
+    def __init__(self, *, vapp, ingestion_type, submission_id, log=None):
+        self.vapp = vapp
+        self.ingestion_type = ingestion_type
+        self.log = log or logging
+        self.bs_env = beanstalk_env_from_registry(vapp.app.registry)
+        self.bucket = metadata_bundles_bucket(vapp.app.registry)
+        self.s3_client = boto3.client('s3')
+        self.other_details = {}
+        self.outcome = 'unknown'
+        self.submission_id = submission_id
+        # These next two are initialized later by s3 lookup, and the result is cached here.
+        # In particular, the values will be made available in time for the body of 'with folio.processing_context(...)'.
+        # Setting them to None here makes PyCharm and other code analysis tools happier in knowing
+        # that accesses to these instance variables are legit. -kmp 27-Aug-2020
+        self.object_name = None
+        self.parameters = None
+
+    def __str__(self):
+        return "<SubmissionFolio %s %s>" % (self.ingestion_type, self.submission_id)
+
+    @classmethod
+    def make_submission_uri(cls, submission_id):
+        return "/ingestion-submissions/" + submission_id
+
+    @property
+    def submission_uri(self):
+        return self.make_submission_uri(self.submission_id)
+
+    SUBMISSION_PATTERN = re.compile(r'^/ingestion-submissions/([0-9a-fA-F-]+)/?$')
+
+    @classmethod
+    def create_item(cls, request, institution, project, ingestion_type):
+        json_body = {
+            "ingestion_type": ingestion_type,
+            "institution": institution,
+            "project": project,
+            "processing_status": {
+                "state": "submitted"
+            }
+        }
+        guid = None
+        item_url, res_json = None, None
+        try:
+            res_json = subrequest_item_creation(request=request, item_type='IngestionSubmission', json_body=json_body)
+            [item_url] = res_json['@graph']
+            matched = cls.SUBMISSION_PATTERN.match(item_url)
+            if matched:
+                guid = matched.group(1)
+        except Exception as e:
+            logging.error("%s: %s" % (e.__class__.__name__, e))
+            pass
+        check_true(guid, "Guid was not extracted from %s in %s" % (item_url, json.dumps(res_json)))
+        return guid
+
+    def patch_item(self, **kwargs):
+        res = self.vapp.patch_json(self.submission_uri, kwargs)
+        [item] = res.json['@graph']
+        debuglog(json.dumps(item))
+
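+    # processing_context() brackets the actual ingestion work for a submission: it loads the manifest
+    # from S3, marks the item as "processing", yields a resolution dict for the caller to annotate,
+    # and on exit records either the outcome or the error details (traceback, resolution.json) in S3.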
+    @contextlib.contextmanager
+    def processing_context(self, submission):
+
+        submission.log.info("Processing {submission_id} as {ingestion_type}."
+                            .format(submission_id=submission.submission_id, ingestion_type=submission.ingestion_type))
+
+        submission_id = submission.submission_id
+        manifest_key = "%s/manifest.json" % submission_id
+        response = submission.s3_client.get_object(Bucket=submission.bucket, Key=manifest_key)
+        manifest = json.load(response['Body'])
+
+        self.object_name = object_name = manifest['object_name']
+        self.parameters = parameters = manifest['parameters']
+
+        debuglog(submission_id, "object_name:", object_name)
+        debuglog(submission_id, "parameters:", parameters)
+
+        started_key = "%s/started.txt" % submission_id
+        create_empty_s3_file(submission.s3_client, bucket=submission.bucket, key=started_key)
+
+        # PyCharm thinks this is unused. -kmp 26-Jul-2020
+        # data_stream = submission.s3_client.get_object(Bucket=submission.bucket, Key="%s/manifest.json" % submission_id)['Body']
+
+        resolution = {
+            "data_key": object_name,
+            "manifest_key": manifest_key,
+            "started_key": started_key,
+        }
+
+        try:
+            submission.patch_item(submission_id=submission_id,
+                                  object_name=object_name,
+                                  parameters=parameters,
+                                  processing_status={"state": "processing"})
+
+            yield resolution
+
+            submission.patch_item(processing_status={"state": "done", "outcome": submission.outcome, "progress": "complete"},
+                                  **submission.other_details)
+
+        except Exception as e:
+
+            resolution["traceback_key"] = traceback_key = "%s/traceback.txt" % submission_id
+            with s3_output_stream(submission.s3_client, bucket=submission.bucket, key=traceback_key) as fp:
+                traceback.print_exc(file=fp)
+
+            resolution["error_type"] = e.__class__.__name__
+            resolution["error_message"] = str(e)
+
+            submission.patch_item(
+                errors=["%s: %s" % (e.__class__.__name__, e)],
+                processing_status={
+                    "state": "done",
+                    "outcome": "error",
+                    "progress": "incomplete"
+                })
+
+        with s3_output_stream(submission.s3_client,
+                              bucket=submission.bucket,
+                              key="%s/resolution.json" % submission_id) as fp:
+            print(json.dumps(resolution, indent=2), file=fp)
+
+
+@collection(
+    name='ingestion-submissions',
+    acl=ALLOW_SUBMITTER_VIEW,
+    unique_key='object_name',
+    properties={
+        'title': 'Ingestion Submissions',
+        'description': 'List of Ingestion Submissions',
+    })
+class IngestionSubmission(Item):
+    """The IngestionSubmission class that holds info on requests to ingest data."""
+
+    item_type = 'ingestion_submission'
+    schema = load_schema('encoded:schemas/ingestion_submission.json')
+    # embedded_list = [...] + Item.embedded_list
diff --git a/src/encoded/util.py b/src/encoded/util.py
index a0fa43abab..297b1a44b1 100644
--- a/src/encoded/util.py
+++ b/src/encoded/util.py
@@ -191,8 +191,8 @@ def s3_local_file(s3_client, bucket: str, key: str):
         bucket: an S3 bucket name
         key: the name of a key within the given S3 bucket
     """
-
-    tempfile_name = tempfile.mktemp()
+    ext = os.path.splitext(key)[-1]
+    tempfile_name = tempfile.mktemp() + ext
     try:
         s3_client.download_file(Bucket=bucket, Key=key, Filename=tempfile_name)
         yield tempfile_name
@@ -238,6 +238,16 @@ def create_empty_s3_file(s3_client, bucket: str, key: str):
     s3_client.upload_file(empty_file, Bucket=bucket, Key=key)
 
 
+def full_class_name(object):
+    # Source: https://stackoverflow.com/questions/2020014/get-fully-qualified-class-name-of-an-object-in-python
+
+    module = object.__class__.__module__
+    if module is None or module == str.__class__.__module__:
+        return object.__class__.__name__  # Avoid reporting __builtin__
+    else:
+        return module + '.' + object.__class__.__name__
+
+
 def get_trusted_email(request, context=None, raise_errors=True):
     """
     Get an email address on behalf of which we can issue other requests.
@@ -269,3 +279,11 @@
         if raise_errors:
             raise
     return None
+
+
+def beanstalk_env_from_request(request):
+    return beanstalk_env_from_registry(request.registry)
+
+
+def beanstalk_env_from_registry(registry):
+    return registry.settings.get('env.name')
diff --git a/test.ini b/test.ini
index bd08ff729c..c05ebddf3b 100644
--- a/test.ini
+++ b/test.ini
@@ -3,6 +3,7 @@ use = config:base.ini#app
 session.secret = superlegitrealsecret
 file_upload_bucket = elasticbeanstalk-encoded-4dn-files
 blob_bucket = elasticbeanstalk-encoded-4dn-blobs
+metadata_bundles_bucket = elasticbeanstalk-fourfront-cgaplocal-test-metadata-bundles
 #blob_store_profile_name = encoded-4dn-files
 accession_factory = encoded.server_defaults.test_accession
 elasticsearch.server = 172.31.49.128:9872