From fb329f51192088fcae222524c0bc17387c3a09cb Mon Sep 17 00:00:00 2001 From: Sarah Date: Fri, 29 May 2020 13:57:09 -0400 Subject: [PATCH 001/125] added some submit functions - WIP --- src/encoded/submit.py | 222 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 src/encoded/submit.py diff --git a/src/encoded/submit.py b/src/encoded/submit.py new file mode 100644 index 0000000000..aed3e862ac --- /dev/null +++ b/src/encoded/submit.py @@ -0,0 +1,222 @@ +from pyramid.paster import get_app +from pyramid.response import Response +from pyramid.view import view_config +from snovault.util import debug_log +from webtest import TestApp +import datetime +import xlrd + + +BGM_FIELD_MAPPING = { + 'bcgg-id': 'patient id', + 'bcgg-f-id': 'family id', + "date req rec'd": 'date requisition received' +} + + +POST_ORDER = ['Sample', 'SampleProcessing', 'Individual', 'Family'] + + +SECOND_ROUND = {} + + +@view_config(route_name='submit_data', request_method='POST', permission='add') +@debug_log +def submit_data(context, request): + ''' + usage notes here later + ''' + config_uri = request.json.get('config_uri', 'production.ini') + patch_only = request.json.get('patch_only', False) + post_only = request.json.get('post_only', False) + app = get_app(config_uri, 'app') + environ = {'HTTP_ACCEPT': 'application/json', 'REMOTE_USER': 'TEST'} + testapp = TestApp(app, environ) + # expected response + request.response.status = 200 + result = { + 'status': 'success', + '@type': ['result'], + } + + raise NotImplementedError + + +def xls_to_json(xls_data, project, institution): + book = xlrd.open_workbook(xls_data) + sheet, = book.sheets() + row = row_generator(sheet) + top_header = next(row) + keys = next(row) + next(row) + rows = [] + counter = 0 + for values in row: + r = [val for val in values] + row_dict = {keys[i].lower(): item for i, item in enumerate(r)} + rows.append(row_dict) + + items = {'Individual': {}, 'Family': {}, 'Sample': {}, 'SampleProcessing': {}} + specimen_ids = {} + for row in rows: + indiv_alias = 'test-project:individual-{}'.format(row['patient id']) + fam_alias = 'test-project:family-{}'.format(row['family id']) + sp_alias = 'test-project:sampleproc-{}'.format(row['specimen id']) + # create items for Individual + if row['patient id'] not in items['Individual']: + items['Individual'][indiv_alias] = { + 'aliases': [indiv_alias], + 'individual_id': row['patient id'], + 'sex': row['sex'], + 'age': row['age'], + 'birth_year': row['birth year'] + } + # create/edit items for Family + if fam_alias not in items['Family']: + items['Family'][fam_alias] = { + 'aliases': [fam_alias], + 'family_id': row['family id'], + 'members': [indiv_alias] + } + else: + items['Family'][fam_alias]['members'].append(indiv_alias) + if row.get('relation to proband', '').lower() in ['proband', 'mother', 'father']: + items['Family'][fam_alias][row['relation to proband'].lower()] = indiv_alias + # create item for Sample if there is a specimen + if row['specimen id']: + samp_alias = 'test-project:sample-{}'.format(row['specimen id']) + if row['specimen id'] in specimen_ids: + samp_alias = samp_alias + '-' + specimen_ids[row['specimen id']] + specimen_ids[row['specimen id']] += 1 + else: + specimen_ids[row['specimen id']] = 1 + items['Sample'][samp_alias] = { + 'aliases': [samp_alias], + 'workup_type': row['workup type'], + 'specimen_type': row['specimen type'], + 'specimen_collection_date': row['date collected'], + 'specimen_collection_location': row['location collected'], + 
'specimen_accession': row['specimen id'], + 'date_transported': row['date transported'], + 'transported_by': row['transport method'], + 'sent_by': row['sent by'], + 'date_received': row["date rec'd at ref lab"], + 'specimen_accepted': row['specimen accepted by ref lab'], + 'dna_concentration': row['dna concentration'], + 'specimen_notes': row['specimen notes'], + 'files': [] + } + items['Individual'][indiv_alias]['samples'] = [samp_alias] + # create SampleProcessing item for that one sample if needed + if row['report required'].lower() in ['yes', 'y']: + items['SampleProcessing'][sp_alias] = { + 'aliases': [sp_alias], + 'analysis_type': row['workup type'], + 'samples': [samp_alias] + } + else: + print('WARNING: No specimen id present for patient {},' + ' sample will not be created.'.format(row['patient id'])) + # create SampleProcessing item for trio/group if needed + for v in items['Family'].values(): + if 'members' in v and len(v['members']) > 1: + # create sample_processing item + alias = 'test-project:{}-sampleproc'.format(v['family_id']) + samples = [items['Individual'][indiv].get('samples', [None])[0] for indiv in v['members']] + samples = [s for s in samples if s] + if len (samples) > 1: + sp = { + 'aliases': [alias], + 'samples': samples + } + analysis_type = items['Sample'][items['Individual'][v['proband']]['samples'][0]]['workup_type'] + if sorted(v['members']) == sorted([v['proband'], v['mother'], v['father']]): + sp['analysis_type'] = analysis_type + '-Trio' + else: + sp['analysis_type'] = analysis_type + '-Group' + items['SampleProcessing'][alias] = sp + # removed unused fields, add project and institution + for val1 in items.values(): + for val2 in val1.values(): + remove_keys = [k for k, v in val2.items() if not v] + for key in remove_keys: + del val2[key] + val2['project'] = project['@id'] + val2['institution'] = institution['@id'] + + return items + + +def check_against_db(): + alias_dict = {} + links = ['samples', 'members', 'mother', 'father', 'proband'] + for itemtype in POST_ORDER: + profile = testapp.get('/profiles/{}.json'.format(itemtype)) + for alias in results[itemtype]: + try: + # check if already in db + result = testapp.get(alias + '/?frame=object') + except Exception as e: + # post if not in db + if 'HTTPNotFound' in str(e): + validation = testapp.post_json(results[itemtype][alias], itemtype + '/?checkonly=True', status=201) + if validation: # modify to check for lack of validation errors + response = testapp.post_json(results[itemtype][alias], status=201) + # do something to record response + else: + # do something to report validation errors + pass + else: + # patch if item exists in db + alias_dict[alias] = result['@id'] + to_patch = {} + for field in results[itemtype][alias]: + if field in links: + # look up atids of links + if profile['properties'][field]['type'] != 'array': + for i, item in enumerate(results[itemtype][alias][field]): + if item in alias_dict: + results[itemtype][alias][field][i] = alias_dict[item] + elif profile['properties'][field]['type'] == 'string': + if item in alias_dict: + results[itemtype][alias][field] = alias_dict[item] + # if not an array, patch field gets overwritten (if different from db) + if profile['properties'][field]['type'] != 'array': + if results[itemtype][alias][field] != result.get(field): + to_patch[field] = results[itemtype][alias][field] + else: + # if array, patch field vals get added to what's in db + if sorted(results[itemtype][alias][field]) != sorted(result.get(field, [])): + val = result.get(field, []) + 
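                                # merge: start from what the database already holds for this field,
                                # extend with the newly submitted values, then de-duplicate with set() below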
val.extend(results[itemtype][alias][field]) + to_patch[field] = list(set(val)) + + +def cell_value(cell, datemode): + """Get cell value from excel.""" + # This should be always returning text format + ctype = cell.ctype + value = cell.value + if ctype == xlrd.XL_CELL_ERROR: # pragma: no cover + raise ValueError(repr(cell), 'cell error') + elif ctype == xlrd.XL_CELL_BOOLEAN: + return str(value).upper().strip() + elif ctype == xlrd.XL_CELL_NUMBER: + if value.is_integer(): + value = int(value) + return str(value).strip() + elif ctype == xlrd.XL_CELL_DATE: + value = xlrd.xldate_as_tuple(value, datemode) + if value[3:] == (0, 0, 0): + return datetime.date(*value[:3]).isoformat() + else: # pragma: no cover + return datetime.datetime(*value).isoformat() + elif ctype in (xlrd.XL_CELL_TEXT, xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK): + return value.strip() + raise ValueError(repr(cell), 'unknown cell type') # pragma: no cover + + +def row_generator(sheet): + datemode = sheet.book.datemode + for index in range(sheet.nrows): + yield [cell_value(cell, datemode) for cell in sheet.row(index)] From eb9a0224baf7eb4cdd109cfd87cb61f22bce10f4 Mon Sep 17 00:00:00 2001 From: Sarah Date: Fri, 29 May 2020 17:05:15 -0400 Subject: [PATCH 002/125] added some comments and edits to submit.py --- src/encoded/submit.py | 74 ++++++++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 22 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index aed3e862ac..79b7309d5e 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -14,12 +14,13 @@ } -POST_ORDER = ['Sample', 'SampleProcessing', 'Individual', 'Family'] +POST_ORDER = ['sample', 'sample_processing', 'individual', 'family'] SECOND_ROUND = {} +# This is a placeholder for a submission endpoint modified from loadxl @view_config(route_name='submit_data', request_method='POST', permission='add') @debug_log def submit_data(context, request): @@ -43,6 +44,10 @@ def submit_data(context, request): def xls_to_json(xls_data, project, institution): + ''' + Converts excel file to json for submission. + Functional but expect future changes. 
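    Illustrative usage (the workbook path and the project/institution dicts
    below are invented placeholders, not real records):

        project = {'name': 'test-project', '@id': '/projects/test-project/'}
        institution = {'name': 'test-inst', '@id': '/institutions/test-inst/'}
        items = xls_to_json('submission.xlsx', project, institution)
        # items is keyed by item type: 'individual', 'family', 'sample', 'sample_processing'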
+ ''' book = xlrd.open_workbook(xls_data) sheet, = book.sheets() row = row_generator(sheet) @@ -56,15 +61,15 @@ def xls_to_json(xls_data, project, institution): row_dict = {keys[i].lower(): item for i, item in enumerate(r)} rows.append(row_dict) - items = {'Individual': {}, 'Family': {}, 'Sample': {}, 'SampleProcessing': {}} + items = {'individual': {}, 'family': {}, 'sample': {}, 'sample_processing': {}} specimen_ids = {} for row in rows: indiv_alias = 'test-project:individual-{}'.format(row['patient id']) fam_alias = 'test-project:family-{}'.format(row['family id']) sp_alias = 'test-project:sampleproc-{}'.format(row['specimen id']) # create items for Individual - if row['patient id'] not in items['Individual']: - items['Individual'][indiv_alias] = { + if row['patient id'] not in items['individual']: + items['individual'][indiv_alias] = { 'aliases': [indiv_alias], 'individual_id': row['patient id'], 'sex': row['sex'], @@ -72,16 +77,16 @@ def xls_to_json(xls_data, project, institution): 'birth_year': row['birth year'] } # create/edit items for Family - if fam_alias not in items['Family']: + if fam_alias not in items['family']: items['Family'][fam_alias] = { 'aliases': [fam_alias], 'family_id': row['family id'], 'members': [indiv_alias] } else: - items['Family'][fam_alias]['members'].append(indiv_alias) + items['family'][fam_alias]['members'].append(indiv_alias) if row.get('relation to proband', '').lower() in ['proband', 'mother', 'father']: - items['Family'][fam_alias][row['relation to proband'].lower()] = indiv_alias + items['family'][fam_alias][row['relation to proband'].lower()] = indiv_alias # create item for Sample if there is a specimen if row['specimen id']: samp_alias = 'test-project:sample-{}'.format(row['specimen id']) @@ -90,7 +95,7 @@ def xls_to_json(xls_data, project, institution): specimen_ids[row['specimen id']] += 1 else: specimen_ids[row['specimen id']] = 1 - items['Sample'][samp_alias] = { + items['sample'][samp_alias] = { 'aliases': [samp_alias], 'workup_type': row['workup type'], 'specimen_type': row['specimen type'], @@ -106,10 +111,10 @@ def xls_to_json(xls_data, project, institution): 'specimen_notes': row['specimen notes'], 'files': [] } - items['Individual'][indiv_alias]['samples'] = [samp_alias] + items['individual'][indiv_alias]['samples'] = [samp_alias] # create SampleProcessing item for that one sample if needed if row['report required'].lower() in ['yes', 'y']: - items['SampleProcessing'][sp_alias] = { + items['sample_processing'][sp_alias] = { 'aliases': [sp_alias], 'analysis_type': row['workup type'], 'samples': [samp_alias] @@ -118,24 +123,24 @@ def xls_to_json(xls_data, project, institution): print('WARNING: No specimen id present for patient {},' ' sample will not be created.'.format(row['patient id'])) # create SampleProcessing item for trio/group if needed - for v in items['Family'].values(): + for v in items['family'].values(): if 'members' in v and len(v['members']) > 1: # create sample_processing item alias = 'test-project:{}-sampleproc'.format(v['family_id']) - samples = [items['Individual'][indiv].get('samples', [None])[0] for indiv in v['members']] + samples = [items['individual'][indiv].get('samples', [None])[0] for indiv in v['members']] samples = [s for s in samples if s] if len (samples) > 1: sp = { 'aliases': [alias], 'samples': samples } - analysis_type = items['Sample'][items['Individual'][v['proband']]['samples'][0]]['workup_type'] + analysis_type = items['sample'][items['individual'][v['proband']]['samples'][0]]['workup_type'] if 
sorted(v['members']) == sorted([v['proband'], v['mother'], v['father']]): sp['analysis_type'] = analysis_type + '-Trio' else: sp['analysis_type'] = analysis_type + '-Group' - items['SampleProcessing'][alias] = sp - # removed unused fields, add project and institution + items['sample_processing'][alias] = sp + # removed unused fields, add project and institution for val1 in items.values(): for val2 in val1.values(): remove_keys = [k for k, v in val2.items() if not v] @@ -147,7 +152,15 @@ def xls_to_json(xls_data, project, institution): return items -def check_against_db(): +def validate_and_post(testapp, json_data, dryrun=False): + ''' + Still in progress, not necessarily functional yet. NOT YET TESTED. + + Function that: + 1. looks up each item in json + 2. if item in db, will validate and patch any different metadata + 3. if item not in db, will post item + ''' alias_dict = {} links = ['samples', 'members', 'mother', 'father', 'proband'] for itemtype in POST_ORDER: @@ -159,13 +172,13 @@ def check_against_db(): except Exception as e: # post if not in db if 'HTTPNotFound' in str(e): - validation = testapp.post_json(results[itemtype][alias], itemtype + '/?checkonly=True', status=201) - if validation: # modify to check for lack of validation errors - response = testapp.post_json(results[itemtype][alias], status=201) - # do something to record response - else: + validation = testapp.post_json('/{}/?checkonly=True'.format(itemtype), results[itemtype][alias], status=201) + if not validation: # modify to check for presence of validation errors # do something to report validation errors pass + elif not dryrun: # post + response = testapp.post_json('/' + itemtype, results[itemtype][alias], status=201) + # do something to record response else: # patch if item exists in db alias_dict[alias] = result['@id'] @@ -190,10 +203,26 @@ def check_against_db(): val = result.get(field, []) val.extend(results[itemtype][alias][field]) to_patch[field] = list(set(val)) + validation = testapp.patch_json(result['@id'] + '/?checkonly=True', to_patch, status=200) + if not validation: # modify to check for presence of validation errors + # do something to report validation errors + pass + elif not dryrun: # patch + response = testapp.patch_json('/' + itemtype, results[itemtype][alias], status=201) + # do something to record response + + +# This was just to see if i could post something using testapp in the python command line, currently works. +def test_function(): + app = get_app('development.ini', 'app') + environ = {'HTTP_ACCEPT': 'application/json', 'REMOTE_USER': 'TEST'} + testapp = TestApp(app, environ) + response = testapp.post_json('/project', {'name': 'test', 'title': 'Test'}, status=201) + print(response) def cell_value(cell, datemode): - """Get cell value from excel.""" + """Get cell value from excel. 
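    Dates are returned in ISO format and whole-number cells come back as plain integer strings.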
[From Submit4DN]""" # This should be always returning text format ctype = cell.ctype value = cell.value @@ -217,6 +246,7 @@ def cell_value(cell, datemode): def row_generator(sheet): + '''Generator that gets rows from excel sheet [From Submit4DN]''' datemode = sheet.book.datemode for index in range(sheet.nrows): yield [cell_value(cell, datemode) for cell in sheet.row(index)] From a6fe747c13b08c3a044f539c928f867a4b92e62c Mon Sep 17 00:00:00 2001 From: Sarah Date: Thu, 11 Jun 2020 13:29:39 -0400 Subject: [PATCH 003/125] Major changes to submit.py Code broken down into smaller functions Functions not all finished currently Significant refactoring --- src/encoded/submit.py | 298 +++++++++++++++++++++++++++++------------- 1 file changed, 210 insertions(+), 88 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index 79b7309d5e..29a5c2fed9 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -2,8 +2,13 @@ from pyramid.response import Response from pyramid.view import view_config from snovault.util import debug_log -from webtest import TestApp +# from webtest import TestApp +from dcicutils.misc_utils import VirtualApp +from dcicutils import ff_utils +from webtest.app import AppError +import ast import datetime +import json import xlrd @@ -32,7 +37,7 @@ def submit_data(context, request): post_only = request.json.get('post_only', False) app = get_app(config_uri, 'app') environ = {'HTTP_ACCEPT': 'application/json', 'REMOTE_USER': 'TEST'} - testapp = TestApp(app, environ) + virtualapp = VirtualApp(app, environ) # expected response request.response.status = 200 result = { @@ -64,95 +69,190 @@ def xls_to_json(xls_data, project, institution): items = {'individual': {}, 'family': {}, 'sample': {}, 'sample_processing': {}} specimen_ids = {} for row in rows: - indiv_alias = 'test-project:individual-{}'.format(row['patient id']) - fam_alias = 'test-project:family-{}'.format(row['family id']) - sp_alias = 'test-project:sampleproc-{}'.format(row['specimen id']) + indiv_alias = '{}:individual-{}'.format(project['name'], row['patient id']) + fam_alias = '{}:family-{}'.format(project['name'], row['family id']) + sp_alias = '{}:sampleproc-{}'.format(project['name'], row['specimen id']) # create items for Individual - if row['patient id'] not in items['individual']: - items['individual'][indiv_alias] = { - 'aliases': [indiv_alias], - 'individual_id': row['patient id'], - 'sex': row['sex'], - 'age': row['age'], - 'birth_year': row['birth year'] - } + items = fetch_individual_metadata(row, items, indiv_alias) # create/edit items for Family - if fam_alias not in items['family']: - items['Family'][fam_alias] = { - 'aliases': [fam_alias], - 'family_id': row['family id'], - 'members': [indiv_alias] - } - else: - items['family'][fam_alias]['members'].append(indiv_alias) - if row.get('relation to proband', '').lower() in ['proband', 'mother', 'father']: - items['family'][fam_alias][row['relation to proband'].lower()] = indiv_alias + items = fetch_family_metadata(row, items, indiv_alias, fam_alias) # create item for Sample if there is a specimen if row['specimen id']: - samp_alias = 'test-project:sample-{}'.format(row['specimen id']) + samp_alias = '{}:sample-{}'.format(project['name'], row['specimen id']) if row['specimen id'] in specimen_ids: samp_alias = samp_alias + '-' + specimen_ids[row['specimen id']] specimen_ids[row['specimen id']] += 1 else: specimen_ids[row['specimen id']] = 1 - items['sample'][samp_alias] = { - 'aliases': [samp_alias], - 'workup_type': row['workup type'], - 
'specimen_type': row['specimen type'], - 'specimen_collection_date': row['date collected'], - 'specimen_collection_location': row['location collected'], - 'specimen_accession': row['specimen id'], - 'date_transported': row['date transported'], - 'transported_by': row['transport method'], - 'sent_by': row['sent by'], - 'date_received': row["date rec'd at ref lab"], - 'specimen_accepted': row['specimen accepted by ref lab'], - 'dna_concentration': row['dna concentration'], - 'specimen_notes': row['specimen notes'], - 'files': [] - } - items['individual'][indiv_alias]['samples'] = [samp_alias] - # create SampleProcessing item for that one sample if needed - if row['report required'].lower() in ['yes', 'y']: - items['sample_processing'][sp_alias] = { - 'aliases': [sp_alias], - 'analysis_type': row['workup type'], - 'samples': [samp_alias] - } + items = fetch_sample_metadata(row, items, indiv_alias, samp_alias, sp_alias) else: print('WARNING: No specimen id present for patient {},' ' sample will not be created.'.format(row['patient id'])) # create SampleProcessing item for trio/group if needed - for v in items['family'].values(): + items = create_sample_processing_groups(items, sp_alias) + # removed unused fields, add project and institution + for val1 in items.values(): + for val2 in val1.values(): + remove_keys = [k for k, v in val2.items() if not v] + for key in remove_keys: + del val2[key] + val2['project'] = project['@id'] + val2['institution'] = institution['@id'] + + return items + + +def fetch_individual_metadata(row, items, indiv_alias): + new_items = items.copy() + info = { + 'aliases': [indiv_alias], + 'individual_id': row['patient id'], + 'sex': row.get('sex'), + 'age': row.get('age'), + 'birth_year': row.get('birth year') + } + if indiv_alias not in new_items['individual']: + new_items['individual'][indiv_alias] = {k: v for k, v in info.items() if v} + else: + for key in info: + if key not in new_items['individual'][indiv_alias]: + new_items['individual'][indiv_alias][key] = info[key] + return new_items + + +def fetch_family_metadata(row, items, indiv_alias, fam_alias): + new_items = items.copy() + info = { + 'aliases': [fam_alias], + 'family_id': row['family id'], + 'members': [indiv_alias] + } + if row.get('relation to proband', '').lower() in ['proband', 'mother', 'father']: + info[row['relation to proband'].lower()] = indiv_alias + if fam_alias not in new_items['family']: + new_items['family'][fam_alias] = info + else: + if indiv_alias not in new_items['family'][fam_alias]['members']: + new_items['family'][fam_alias]['members'].append(indiv_alias) + if row.get('relation to proband', '').lower() not in new_items['family'][fam_alias]: + new_items['family'][fam_alias][row['relation to proband'].lower()] = indiv_alias + return new_items + + +def fetch_sample_metadata(row, items, indiv_alias, samp_alias, sp_alias): + new_items = items.copy() + info = { + 'aliases': [samp_alias], + 'workup_type': row.get('workup type'), + 'specimen_type': row.get('specimen type'), + 'specimen_collection_date': row.get('date collected'), + 'specimen_collection_location': row.get('location collected'), + 'specimen_accession': row['specimen id'], + 'date_transported': row.get('date transported'), + 'transported_by': row.get('transport method'), + 'sent_by': row.get('sent by'), + 'date_received': row.get("date rec'd at ref lab"), + 'specimen_accepted': row.get('specimen accepted by ref lab'), + 'dna_concentration': row.get('dna concentration'), + 'specimen_notes': row.get('specimen notes') + } + 
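# Aside on the filtering step just below: empty spreadsheet cells come through
# as '' (or None), so each metadata dict is stripped of falsy values before it
# is stored.  A minimal, self-contained sketch of the same idea -- the helper
# name and the values are made up for illustration, not existing code:
def drop_empty_fields(metadata):
    """Return a copy of `metadata` without empty/None/other falsy values."""
    return {key: value for key, value in metadata.items() if value}

# drop_empty_fields({'workup_type': 'WGS', 'specimen_notes': ''})
# -> {'workup_type': 'WGS'}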
new_items['sample'][samp_alias] = {k: v for k, v in info.items() if v} + if indiv_alias in new_items['individual']: + new_items['individual'][indiv_alias]['samples'] = [samp_alias] + # create SampleProcessing item for that one sample if needed + if row['report required'].lower() in ['yes', 'y']: + new_items['sample_processing'][sp_alias] = { + 'aliases': [sp_alias], + 'analysis_type': row['workup type'], + 'samples': [samp_alias] + } + return new_items + + +def create_sample_processing_groups(items, sp_alias): + new_items = items.copy() + for v in new_items['family'].values(): if 'members' in v and len(v['members']) > 1: # create sample_processing item - alias = 'test-project:{}-sampleproc'.format(v['family_id']) samples = [items['individual'][indiv].get('samples', [None])[0] for indiv in v['members']] samples = [s for s in samples if s] if len (samples) > 1: sp = { - 'aliases': [alias], + 'aliases': [sp_alias], 'samples': samples } analysis_type = items['sample'][items['individual'][v['proband']]['samples'][0]]['workup_type'] - if sorted(v['members']) == sorted([v['proband'], v['mother'], v['father']]): + if all([relation in v for relation in ['proband', 'mother', 'father']]) and sorted( + v['members']) == sorted([v['proband'], v['mother'], v['father']] + ): sp['analysis_type'] = analysis_type + '-Trio' else: sp['analysis_type'] = analysis_type + '-Group' - items['sample_processing'][alias] = sp - # removed unused fields, add project and institution - for val1 in items.values(): - for val2 in val1.values(): - remove_keys = [k for k, v in val2.items() if not v] - for key in remove_keys: - del val2[key] - val2['project'] = project['@id'] - val2['institution'] = institution['@id'] + new_items['sample_processing'][sp_alias] = sp + return new_items + +# NOT YET TESTED +def compare_with_db(alias, virtualapp): + try: # check if already in db + # result = virtualapp.get(alias + '/?frame=object') + # result = virtualapp.get('/search/?type=Item&aliases={}'.format(alias)) + result = virtualapp.get('/search/?type=Item&age=33') + print(result) + except Exception as e: # if not in db + print(e) + if 'HTTPNotFound' in str(e): + return None + else: + return result.json + + +def validate_item(virtualapp, item, method, itemtype, atid=None): + if method == 'post': + #import pdb; pdb.set_trace() + try: + validation = virtualapp.post_json('/{}/?checkonly=True'.format(itemtype), item) + except AppError as e: + print('exception') + return parse_exception(e) + else: + return + elif method == 'patch': + try: + validation = virtualapp.patch_json(atid + '?checkonly=True', item, status=200) + except Exception as e: + return parse_exception(e) + else: + return + else: + raise ValueError("Unrecognized method -- must be 'post' or 'patch'") + + +def parse_exception(e): + """ff_utils functions raise an exception when the expected code is not returned. + This response is a pre-formatted text, and this function will get the resonse json + out of it. 
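    Returns a list of the error descriptions found in the parsed response.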
[Adapted from Submit4DN]""" + try: + # try parsing the exception + text = e.args[0] + resp_text = text[text.index('{'):-1] + resp_text = json.loads(resp_text.replace('\\', '')) + resp_list = [error['description'] for error in resp_text['errors']] + return resp_list + # if not re-raise + except: # pragma: no cover + raise e + + +def patch_item_data(): + pass - return items +def post_item_data(): + pass -def validate_and_post(testapp, json_data, dryrun=False): + +# NOT FINISHED +def validate_and_post(virtualapp, json_data, dryrun=False): ''' Still in progress, not necessarily functional yet. NOT YET TESTED. @@ -163,22 +263,19 @@ def validate_and_post(testapp, json_data, dryrun=False): ''' alias_dict = {} links = ['samples', 'members', 'mother', 'father', 'proband'] + errors = [] + json_data_final = {'post': {}, 'patch': {}} for itemtype in POST_ORDER: - profile = testapp.get('/profiles/{}.json'.format(itemtype)) + profile = virtualapp.get('/profiles/{}.json'.format(itemtype)) for alias in results[itemtype]: - try: - # check if already in db - result = testapp.get(alias + '/?frame=object') - except Exception as e: - # post if not in db - if 'HTTPNotFound' in str(e): - validation = testapp.post_json('/{}/?checkonly=True'.format(itemtype), results[itemtype][alias], status=201) - if not validation: # modify to check for presence of validation errors - # do something to report validation errors - pass - elif not dryrun: # post - response = testapp.post_json('/' + itemtype, results[itemtype][alias], status=201) - # do something to record response + result = compare_with_db(alias) + if not result: + error = validate_item(results[itemtype][alias], 'post', itemtype) + if error: # modify to check for presence of validation errors + # do something to report validation errors + errors.append(error) + else: + json_data_final['post'].setdefault(itemtype, default=[]).append(results[itemtype][alias]) else: # patch if item exists in db alias_dict[alias] = result['@id'] @@ -203,22 +300,47 @@ def validate_and_post(testapp, json_data, dryrun=False): val = result.get(field, []) val.extend(results[itemtype][alias][field]) to_patch[field] = list(set(val)) - validation = testapp.patch_json(result['@id'] + '/?checkonly=True', to_patch, status=200) - if not validation: # modify to check for presence of validation errors + error = validate_item(to_patch, 'post', itemtype, atid=result['@id']) + if error: # modify to check for presence of validation errors # do something to report validation errors - pass - elif not dryrun: # patch - response = testapp.patch_json('/' + itemtype, results[itemtype][alias], status=201) + errors.append(error) + else: # patch + json_data_final['patch'][result['@id']] = to_patch # do something to record response + if errors: + return errors + output = [] + item_names = {'individual': 'individual_id', 'family': 'family_id', 'sample': 'specimen_id'} + if json_data_final['post']: + for k, v in json_data_final['post'].items(): + # also create Case and Report items for each SampleProcessing item created + for item in v: + for field in links: + if field in item: + json_data_final['patch'][item['aliases'][0]] = item[field] + del item[field] + try: + response = virtualapp.post_json('/' + k, item, status=201) + aliasdict[item['aliases'][0]] = response.json['@graph'][0]['@id'] + if response.json['status'] == 'success' and k in item_names: + output.append('Success - {} {} posted'.format(k, item[item_names[k]])) + except Exception: + pass + for k, v in json_data_final['patch'].items(): + atid = k if 
k.startswith('/') else aliasdict[k] + try: + response = testapp.patch_json(atid, v, status=200) + except Exception: + pass # This was just to see if i could post something using testapp in the python command line, currently works. -def test_function(): - app = get_app('development.ini', 'app') - environ = {'HTTP_ACCEPT': 'application/json', 'REMOTE_USER': 'TEST'} - testapp = TestApp(app, environ) - response = testapp.post_json('/project', {'name': 'test', 'title': 'Test'}, status=201) - print(response) +# def test_function(): +# app = get_app('development.ini', 'app') +# environ = {'HTTP_ACCEPT': 'application/json', 'REMOTE_USER': 'TEST'} +# testapp = TestApp(app, environ) +# response = testapp.post_json('/project', {'name': 'test', 'title': 'Test'}, status=201) +# print(response) def cell_value(cell, datemode): From ee5fd51b63e3c49db951337eceb7a11424011476 Mon Sep 17 00:00:00 2001 From: Sarah Date: Thu, 11 Jun 2020 13:31:19 -0400 Subject: [PATCH 004/125] unit tests written for some submit.py functions --- src/encoded/tests/datafixtures.py | 1 + src/encoded/tests/test_submit.py | 162 ++++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+) create mode 100644 src/encoded/tests/test_submit.py diff --git a/src/encoded/tests/datafixtures.py b/src/encoded/tests/datafixtures.py index 62daca1f15..f12fef3781 100644 --- a/src/encoded/tests/datafixtures.py +++ b/src/encoded/tests/datafixtures.py @@ -171,6 +171,7 @@ def grandpa(testapp, project, institution): @pytest.fixture def mother(testapp, project, institution, grandpa, female_individual): item = { + "aliases": ["test-project:indiv-003389"], "age": 33, "age_units": "year", 'project': project['@id'], diff --git a/src/encoded/tests/test_submit.py b/src/encoded/tests/test_submit.py new file mode 100644 index 0000000000..ce332d06da --- /dev/null +++ b/src/encoded/tests/test_submit.py @@ -0,0 +1,162 @@ +import pytest +from encoded.submit import * +import json +# from pyramid.paster import get_app +# from dcicutils.misc_utils import VirtualApp + + +@pytest.fixture +def row_dict(): + return { + 'patient id': '456', + 'family id': '333', + 'sex': 'M', + 'relation to proband': 'proband', + 'report required': 'Y', + 'specimen id': '3464467', + 'specimen type': 'blood', + 'workup type': 'WGS' + } + + +@pytest.fixture +def empty_items(): + return {'individual': {}, 'family': {}, 'sample': {}, 'sample_processing': {}} + + +@pytest.fixture +def submission_info(): + return { + 'family': {'test-proj:fam1': { + 'members': ['test-proj:indiv1'], + 'proband': 'test-proj:indiv1' + }}, + 'individual': {'test-proj:indiv1': {'samples': ['test-proj:samp1']}}, + 'sample': {'test-proj:samp1': {'workup_type': 'WGS'}}, + 'sample_processing': {} + } + + +@pytest.fixture +def submission_info2(submission_info): + submission_info['family']['test-proj:fam1']['members'].append('test-proj:indiv2') + submission_info['individual']['test-proj:indiv2'] = {'samples': ['test-proj:samp2']} + submission_info['sample']['test-proj:samp2'] = {'workup_type': 'WGS'} + return submission_info + + +@pytest.fixture +def submission_info3(submission_info2): + info = submission_info2.copy() + info['family']['test-proj:fam1']['members'].append('test-proj:indiv3') + info['family']['test-proj:fam1']['mother'] = 'test-proj:indiv2' + # submission_info['family']['test-proj:fam1']['father'] = 'test-proj:indiv3' + info['individual']['test-proj:indiv3'] = {'samples': ['test-proj:samp3']} + info['sample']['test-proj:samp3'] = {'workup_type': 'WGS'} + return info + + +def 
test_fetch_individual_metadata_new(row_dict, empty_items): + items_out = fetch_individual_metadata(row_dict, empty_items, 'test-proj:indiv1') + assert items_out['individual']['test-proj:indiv1']['aliases'] == ['test-proj:indiv1'] + assert items_out['individual']['test-proj:indiv1']['individual_id'] == '456' + + +def test_fetch_individual_metadata_old(row_dict): + items = empty_items.copy() + items['individual'] = {'test-proj:indiv1': { + 'individual_id': '456', + 'age': 46, + 'aliases': ['test-proj:indiv1'] + }} + items_out = fetch_individual_metadata(row_dict, items, 'test-proj:indiv1') + assert len(items['individual']) == len(items_out['individual']) + assert 'sex' in items_out['individual']['test-proj:indiv1'] + assert 'age' in items_out['individual']['test-proj:indiv1'] + + +def test_fetch_family_metadata_new(row_dict, empty_items): + items_out = fetch_family_metadata(row_dict, empty_items, 'test-proj:indiv1', 'test-proj:fam1') + assert items_out['family']['test-proj:fam1']['members'] == ['test-proj:indiv1'] + assert items_out['family']['test-proj:fam1']['proband'] == 'test-proj:indiv1' + + +def test_fetch_family_metadata_old(row_dict): + items = empty_items.copy() + items['family'] = {'test-proj:fam1': { + 'aliases': ['test-proj:fam1'], + 'family_id': '333', + 'members': ['test-proj:indiv2'], + 'mother': 'test-proj:indiv2' + }} + items_out = fetch_family_metadata(row_dict, items, 'test-proj:indiv1', 'test-proj:fam1') + assert items_out['family']['test-proj:fam1']['members'] == ['test-proj:indiv2', 'test-proj:indiv1'] + assert items_out['family']['test-proj:fam1']['proband'] == 'test-proj:indiv1' + assert items_out['family']['test-proj:fam1']['mother'] == 'test-proj:indiv2' + + +def test_fetch_sample_metadata_sp(row_dict): + items = empty_items.copy() + items['individual'] = {'test-proj:indiv1': {}} + items_out = fetch_sample_metadata(row_dict, items, 'test-proj:indiv1', 'test-proj:samp1', 'test-proj:sp1') + assert items_out['sample']['test-proj:samp1']['specimen_accession'] == row_dict['specimen id'] + assert items_out['sample_processing']['test-proj:sp1']['samples'] == ['test-proj:samp1'] + assert items_out['individual']['test-proj:indiv1']['samples'] == ['test-proj:samp1'] + + +def test_fetch_sample_metadata_no_sp(row_dict): + items = empty_items.copy() + items['individual'] = {'test-proj:indiv1': {}} + row_dict['report required'] = 'N' + items_out = fetch_sample_metadata(row_dict, items, 'test-proj:indiv1', 'test-proj:samp1', 'test-proj:sp1') + assert items_out['sample']['test-proj:samp1']['specimen_accession'] == row_dict['specimen id'] + assert not items_out['sample_processing'] + + +def test_create_sample_processing_groups_grp(submission_info2): + items_out = create_sample_processing_groups(submission_info2, 'test-proj:sp-multi') + assert items_out['sample_processing']['test-proj:sp-multi']['analysis_type'] == 'WGS-Group' + assert len(items_out['sample_processing']['test-proj:sp-multi']['samples']) == 2 + + +def test_create_sample_processing_groups_one(submission_info): + items_out = create_sample_processing_groups(submission_info, 'test-proj:sp-single') + assert not items_out['sample_processing'] + + +def test_create_sample_processing_groups_trio(submission_info3): + items_out = create_sample_processing_groups(submission_info3, 'test-proj:sp-multi') + assert items_out['sample_processing']['test-proj:sp-multi']['analysis_type'] == 'WGS-Group' + submission_info3['family']['test-proj:fam1']['father'] = 'test-proj:indiv3' + items_out = 
create_sample_processing_groups(submission_info3, 'test-proj:sp-multi') + assert items_out['sample_processing']['test-proj:sp-multi']['analysis_type'] == 'WGS-Trio' + + +def test_xls_to_json(project, institution): + json_out = xls_to_json('src/encoded/tests/data/documents/cgap_submit_test.xlsx', project, institution) + assert len(json_out['family']) == 1 + assert len(json_out['individual']) == 3 + assert all(['encode-project:individual-' + x in json_out['individual'] for x in ['123', '456', '789']]) + + +def test_validate_item_post_valid(testapp, a_case): + result = validate_item(testapp, a_case, 'post', 'case') + assert not result + + +def test_validate_item_post_invalid(testapp, a_case): + a_case['project'] = '/projects/invalid-project/' + result = validate_item(testapp, a_case, 'post', 'case') + assert 'not found' in result[0] + + +def test_validate_item_patch_valid(testapp, mother, grandpa): + patch_dict = {'mother': mother['aliases'][0]} + result = validate_item(testapp, patch_dict, 'patch', 'individual', atid=grandpa['@id']) + assert not result + + +def test_validate_item_patch_invalid(testapp, grandpa): + patch_dict = {'mother': 'non-existant-alias'} + result = validate_item(testapp, patch_dict, 'patch', 'individual', atid=grandpa['@id']) + assert 'not found' in result[0] From 8c90f7db37694262595c207514626f55173cf6f5 Mon Sep 17 00:00:00 2001 From: Sarah Date: Thu, 11 Jun 2020 13:31:55 -0400 Subject: [PATCH 005/125] test file added for testing submit functions --- .../tests/data/documents/cgap_submit_test.xlsx | Bin 0 -> 12955 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/encoded/tests/data/documents/cgap_submit_test.xlsx diff --git a/src/encoded/tests/data/documents/cgap_submit_test.xlsx b/src/encoded/tests/data/documents/cgap_submit_test.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..0d65ab327411da00aa287eec8b4637569b70ec6d GIT binary patch literal 12955 zcmeHuWmFv5x_0A@dvJGmmju_~9^Bm>f(C6YxH|-Q4-(vhyGw8l?r@ozbMDM!&bQY0 z|DNi#s(M$|`*iJHukHOltsnydi2;BH009612|$P|V9XW_0Dy!70MG$I@DCz(w$3KD z&Uz{y_9jj`jP5qp#JP~*)Y$;=*Zcpw{U08IGL=EwE@qTA^@F#f?P_5;Atfb1{1*HV zj~}T_6;-Q&ofL}JV;?9l__50uDK&-P{jg@d*OVvMhyM~;?sz8f zB$*`^kYA=2SsMf5$i_xIJ%ygChmmbsG-R|SeIccI!%+vvw%d6ww=Y~8`c{OZjdrI5 z{BHhGV6aOQT!zL4+mPEmP>BIodR4u(RgslVjl43=i9nSx{w9(JqgtW4?w&~MU}zE} zTc_F_hgXE?okbZ~kqs*m0$VUFN;M0{IkA}^G!xlM%TN<5T=$~8xBj67c1y;OiUt?G zN4|iGmpE3 zm@|*IPe7+&6pIzWYRfU|CB9QR_#W)pr5YF3PppS{q^N0b?&$PIK$4pQ*5?Y|D+EY? zgd-70^ZDxt{JYX6q?L?ZA)@ufGbFs{S37%ofdDA{%}nc6naM6+bx-#7D*UUN>N%QN zJ25f-w*Rkr{vWoXy?lIu59V9 zAU)BoPx+wiMlMDmfQHBAgaHyz=x9d=ho#5xNU2Wf&4RH5^qCmkJGaynI3an3q@tJ2 zlfmqkqs%U>mQLdYL$5kBT({+S|4I3%JfN2MBcaN}*Un)ab!Xe-t>`WXoAVK6y6YI2{oAOolbi-Nr zVTh(<>70J}^3-%|uzzOl^-cd%Wr(Z0(C{Gu03Zwifc$#LtIGUUS;|xu?b5kXeBSOq z2X4C=ra>i&kzmr}>EvHyTuokAJ4U!wgD2O%cg-t$dFr#CuN#!9I1aC`?)QE+-NAc! 
z$`CBp)l9C?D5rruwCwZIoP5K?w5Ho)BoULYU~Z8ek`A4x>{bBNwxgn?|7!5i-A9@% z*EPm=X(5F608V%MZB(h%hv;M_0zUIjA5PRhs!SR|n_1p0~TgVyd5KXJnlLHCo9YJzDrK z$Mg(1>wz>K`ViQwR@y9UHp;L#w{>d{SH4rcs%7qwA(4!$C0xD=e-CD$u?DS8n&+i^ zWzIt`bmFGFUVzczXAht=K_2wqMc^JNG>%x3O#eO>qgy0oJV^^*xQRVle=+}&WmqvL z&}l=T*?+5#XLHm6beXsMZqpZqX}5>0yN7RMSqsFj9bM$N0kL~~$$wUfb2u*cvwxXe zY%Rk#=Svg3+qc_>*^As&&X}s9w+>nl#o0vX;>vuY9rCY&-=Tl^t^W^l3`N|Y}75)cH=vRba zX#syCfKdMT=l``*^hb~WTrc{a?D;F;uT}Ox0V7d=2mH0>{wu(*b%H+u5ODtJ)Bjct zeii+7q3ci4V8UNJ^Jh1HMfrOQ_a_DbaQ+%L|B=xBD*kJj{SzRP_VPfRrp8Rawf+ZZ_Y+D0 literal 0 HcmV?d00001 From 26c70c1abb2d6cbb22e45a169bc38f3bd6537eec Mon Sep 17 00:00:00 2001 From: Sarah Date: Thu, 11 Jun 2020 16:55:50 -0400 Subject: [PATCH 006/125] edits to validate_and_post in submit.py --- src/encoded/submit.py | 116 ++++++++++++++++++++++-------------------- 1 file changed, 61 insertions(+), 55 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index 29a5c2fed9..47603ea17b 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -3,7 +3,7 @@ from pyramid.view import view_config from snovault.util import debug_log # from webtest import TestApp -from dcicutils.misc_utils import VirtualApp +from dcicutils.misc_utils import VirtualApp, VirtualAppError from dcicutils import ff_utils from webtest.app import AppError import ast @@ -108,9 +108,11 @@ def fetch_individual_metadata(row, items, indiv_alias): 'aliases': [indiv_alias], 'individual_id': row['patient id'], 'sex': row.get('sex'), - 'age': row.get('age'), - 'birth_year': row.get('birth year') + # 'age': int(row.get('age')), + # 'birth_year': int(row.get('birth year')) } + info['age'] = int(row['age']) if row.get('age') else None + info['birth_year'] = int(row['birth year']) if row.get('birth year') else None if indiv_alias not in new_items['individual']: new_items['individual'][indiv_alias] = {k: v for k, v in info.items() if v} else: @@ -127,15 +129,15 @@ def fetch_family_metadata(row, items, indiv_alias, fam_alias): 'family_id': row['family id'], 'members': [indiv_alias] } - if row.get('relation to proband', '').lower() in ['proband', 'mother', 'father']: - info[row['relation to proband'].lower()] = indiv_alias + if row.get('relation to proband', '').lower() == 'proband': + info['proband'] = indiv_alias if fam_alias not in new_items['family']: new_items['family'][fam_alias] = info else: if indiv_alias not in new_items['family'][fam_alias]['members']: new_items['family'][fam_alias]['members'].append(indiv_alias) - if row.get('relation to proband', '').lower() not in new_items['family'][fam_alias]: - new_items['family'][fam_alias][row['relation to proband'].lower()] = indiv_alias + if row.get('relation to proband', '').lower() == 'proband' and 'proband' not in new_items['family'][fam_alias]: + new_items['family'][fam_alias]['proband'] = indiv_alias return new_items @@ -191,35 +193,33 @@ def create_sample_processing_groups(items, sp_alias): new_items['sample_processing'][sp_alias] = sp return new_items + # NOT YET TESTED def compare_with_db(alias, virtualapp): try: # check if already in db - # result = virtualapp.get(alias + '/?frame=object') - # result = virtualapp.get('/search/?type=Item&aliases={}'.format(alias)) - result = virtualapp.get('/search/?type=Item&age=33') - print(result) + result = virtualapp.get(alias + '/?frame=object') except Exception as e: # if not in db - print(e) + # print(e) if 'HTTPNotFound' in str(e): return None else: return result.json +# TODO : Handle validation of not-yet-submitted-aliases in fields def 
validate_item(virtualapp, item, method, itemtype, atid=None): if method == 'post': #import pdb; pdb.set_trace() try: - validation = virtualapp.post_json('/{}/?checkonly=True'.format(itemtype), item) - except AppError as e: - print('exception') + validation = virtualapp.post_json('/{}/?check_only=true'.format(itemtype), item) + except (AppError, VirtualAppError) as e: return parse_exception(e) else: return elif method == 'patch': try: - validation = virtualapp.patch_json(atid + '?checkonly=True', item, status=200) - except Exception as e: + validation = virtualapp.patch_json(atid + '?check_only=true', item, status=200) + except (AppError, VirtualAppError) as e: return parse_exception(e) else: return @@ -267,71 +267,77 @@ def validate_and_post(virtualapp, json_data, dryrun=False): json_data_final = {'post': {}, 'patch': {}} for itemtype in POST_ORDER: profile = virtualapp.get('/profiles/{}.json'.format(itemtype)) - for alias in results[itemtype]: - result = compare_with_db(alias) + for alias in json_data[itemtype]: + # TODO : format fields (e.g. int, list, etc.) + result = compare_with_db(virtualapp, alias) if not result: - error = validate_item(results[itemtype][alias], 'post', itemtype) + error = validate_item(virtualapp, json_data[itemtype][alias], 'post', itemtype) if error: # modify to check for presence of validation errors # do something to report validation errors - errors.append(error) + for e in error: + errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) else: - json_data_final['post'].setdefault(itemtype, default=[]).append(results[itemtype][alias]) + json_data_final['post'].setdefault(itemtype, []) + json_data_final['post'][itemtype].append(json_data[itemtype][alias]) else: # patch if item exists in db alias_dict[alias] = result['@id'] to_patch = {} - for field in results[itemtype][alias]: + for field in json_data[itemtype][alias]: if field in links: # look up atids of links if profile['properties'][field]['type'] != 'array': - for i, item in enumerate(results[itemtype][alias][field]): + for i, item in enumerate(json_data[itemtype][alias][field]): if item in alias_dict: - results[itemtype][alias][field][i] = alias_dict[item] + json_data[itemtype][alias][field][i] = alias_dict[item] elif profile['properties'][field]['type'] == 'string': if item in alias_dict: - results[itemtype][alias][field] = alias_dict[item] + json_data[itemtype][alias][field] = alias_dict[item] # if not an array, patch field gets overwritten (if different from db) if profile['properties'][field]['type'] != 'array': - if results[itemtype][alias][field] != result.get(field): - to_patch[field] = results[itemtype][alias][field] + if json_data[itemtype][alias][field] != result.get(field): + to_patch[field] = json_data[itemtype][alias][field] else: # if array, patch field vals get added to what's in db - if sorted(results[itemtype][alias][field]) != sorted(result.get(field, [])): + if sorted(json_data[itemtype][alias][field]) != sorted(result.get(field, [])): val = result.get(field, []) - val.extend(results[itemtype][alias][field]) + val.extend(json_data[itemtype][alias][field]) to_patch[field] = list(set(val)) - error = validate_item(to_patch, 'post', itemtype, atid=result['@id']) + error = validate_item(virtualapp, to_patch, 'post', itemtype, atid=result['@id']) if error: # modify to check for presence of validation errors # do something to report validation errors - errors.append(error) + for e in error: + errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) else: # patch 
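                    # validation passed -- queue the computed diff under the item's @id for the later patch step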
json_data_final['patch'][result['@id']] = to_patch # do something to record response if errors: return errors - output = [] - item_names = {'individual': 'individual_id', 'family': 'family_id', 'sample': 'specimen_id'} - if json_data_final['post']: - for k, v in json_data_final['post'].items(): - # also create Case and Report items for each SampleProcessing item created - for item in v: - for field in links: - if field in item: - json_data_final['patch'][item['aliases'][0]] = item[field] - del item[field] - try: - response = virtualapp.post_json('/' + k, item, status=201) - aliasdict[item['aliases'][0]] = response.json['@graph'][0]['@id'] - if response.json['status'] == 'success' and k in item_names: - output.append('Success - {} {} posted'.format(k, item[item_names[k]])) - except Exception: - pass - for k, v in json_data_final['patch'].items(): - atid = k if k.startswith('/') else aliasdict[k] - try: - response = testapp.patch_json(atid, v, status=200) - except Exception: - pass + else: + return 'All items validated' + # output = [] + # item_names = {'individual': 'individual_id', 'family': 'family_id', 'sample': 'specimen_id'} + # if json_data_final['post']: + # for k, v in json_data_final['post'].items(): + # # also create Case and Report items for each SampleProcessing item created + # for item in v: + # for field in links: + # if field in item: + # json_data_final['patch'][item['aliases'][0]] = item[field] + # del item[field] + # try: + # response = virtualapp.post_json('/' + k, item, status=201) + # aliasdict[item['aliases'][0]] = response.json['@graph'][0]['@id'] + # if response.json['status'] == 'success' and k in item_names: + # output.append('Success - {} {} posted'.format(k, item[item_names[k]])) + # except Exception: + # pass + # for k, v in json_data_final['patch'].items(): + # atid = k if k.startswith('/') else aliasdict[k] + # try: + # response = testapp.patch_json(atid, v, status=200) + # except Exception: + # pass # This was just to see if i could post something using testapp in the python command line, currently works. From 1f8cddfc0beb06a93633d77365d5479173ad495d Mon Sep 17 00:00:00 2001 From: Sarah Date: Sun, 14 Jun 2020 21:13:17 -0400 Subject: [PATCH 007/125] Added alias exceptions to validation error handling --- src/encoded/submit.py | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index 47603ea17b..bc2ba93e4e 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -207,27 +207,27 @@ def compare_with_db(alias, virtualapp): # TODO : Handle validation of not-yet-submitted-aliases in fields -def validate_item(virtualapp, item, method, itemtype, atid=None): +def validate_item(virtualapp, item, method, itemtype, aliases, atid=None): if method == 'post': #import pdb; pdb.set_trace() try: validation = virtualapp.post_json('/{}/?check_only=true'.format(itemtype), item) except (AppError, VirtualAppError) as e: - return parse_exception(e) + return parse_exception(e, aliases) else: return elif method == 'patch': try: validation = virtualapp.patch_json(atid + '?check_only=true', item, status=200) except (AppError, VirtualAppError) as e: - return parse_exception(e) + return parse_exception(e, aliases) else: return else: raise ValueError("Unrecognized method -- must be 'post' or 'patch'") -def parse_exception(e): +def parse_exception(e, aliases): """ff_utils functions raise an exception when the expected code is not returned. 
This response is a pre-formatted text, and this function will get the resonse json out of it. [Adapted from Submit4DN]""" @@ -235,8 +235,14 @@ def parse_exception(e): # try parsing the exception text = e.args[0] resp_text = text[text.index('{'):-1] - resp_text = json.loads(resp_text.replace('\\', '')) - resp_list = [error['description'] for error in resp_text['errors']] + resp_dict = json.loads(resp_text.replace('\\', '')) + if resp_dict.get('description') == 'Failed validation': + resp_list = [error['description'] for error in resp_dict['errors']] + for error in resp_list: + # if error is caused by linkTo to item not submitted yet but in aliases list, + # remove that error + if 'not found' in error and error.split("'")[1] in aliases: + resp_list.remove(error) return resp_list # if not re-raise except: # pragma: no cover @@ -260,10 +266,23 @@ def validate_and_post(virtualapp, json_data, dryrun=False): 1. looks up each item in json 2. if item in db, will validate and patch any different metadata 3. if item not in db, will post item + + Current status: + Still testing validation/data organization parts - patch/post part hasn't been fully + written or tested and need to add code to create Case/Report items. + + More notes: + Case and Report items to be created at end. We don't want them in the validation report, since + they are not part of the user's spreadsheet and validation error messages would be too confusing. + We only want to create these when we are sure no validation issues in other items exist. + Spreadsheet has no Case ID, but if there is an "analysis ID" then we can create a Case ID from this + (perhaps analysis ID + indiv ID + label indicating group/trio vs solo) + Report ID can be same as case ID but with "report" appended (?) ''' alias_dict = {} links = ['samples', 'members', 'mother', 'father', 'proband'] errors = [] + all_aliases = [k for itype in json_data for k in itype] json_data_final = {'post': {}, 'patch': {}} for itemtype in POST_ORDER: profile = virtualapp.get('/profiles/{}.json'.format(itemtype)) @@ -271,7 +290,7 @@ def validate_and_post(virtualapp, json_data, dryrun=False): # TODO : format fields (e.g. int, list, etc.) 
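# One possible shape for the "format fields" TODO above (illustrative only --
# the helper name and the simple type handling are assumptions, not existing
# code): coerce the string values coming out of the spreadsheet toward the
# types declared in the schema profile before validating.
def coerce_to_schema_type(value, property_schema):
    """Best-effort cast of a submitted string to the schema-declared type."""
    prop_type = property_schema.get('type')
    if prop_type == 'integer':
        return int(value)
    if prop_type == 'number':
        return float(value)
    if prop_type == 'array' and not isinstance(value, list):
        return [value]
    return value

# e.g. coerce_to_schema_type('33', {'type': 'integer'}) -> 33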
result = compare_with_db(virtualapp, alias) if not result: - error = validate_item(virtualapp, json_data[itemtype][alias], 'post', itemtype) + error = validate_item(virtualapp, json_data[itemtype][alias], 'post', itemtype, all_aliases) if error: # modify to check for presence of validation errors # do something to report validation errors for e in error: @@ -303,7 +322,7 @@ def validate_and_post(virtualapp, json_data, dryrun=False): val = result.get(field, []) val.extend(json_data[itemtype][alias][field]) to_patch[field] = list(set(val)) - error = validate_item(virtualapp, to_patch, 'post', itemtype, atid=result['@id']) + error = validate_item(virtualapp, to_patch, 'post', itemtype, all_aliases, atid=result['@id']) if error: # modify to check for presence of validation errors # do something to report validation errors for e in error: @@ -315,6 +334,7 @@ def validate_and_post(virtualapp, json_data, dryrun=False): return errors else: return 'All items validated' + # TODO : create case and report items here - skip validation part because they are not part of user's spreadsheet # output = [] # item_names = {'individual': 'individual_id', 'family': 'family_id', 'sample': 'specimen_id'} # if json_data_final['post']: From e51f5e5fa90969a88f90e60983a7305a4f2d3a70 Mon Sep 17 00:00:00 2001 From: Sarah Date: Mon, 15 Jun 2020 11:47:38 -0400 Subject: [PATCH 008/125] code added for creation of case and report items in submit.py --- src/encoded/submit.py | 107 ++++++++++++++++++++++++++++-------------- 1 file changed, 73 insertions(+), 34 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index bc2ba93e4e..7f777596cc 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -19,7 +19,7 @@ } -POST_ORDER = ['sample', 'sample_processing', 'individual', 'family'] +POST_ORDER = ['sample', 'sample_processing', 'individual', 'family', 'report', 'case'] SECOND_ROUND = {} @@ -66,7 +66,10 @@ def xls_to_json(xls_data, project, institution): row_dict = {keys[i].lower(): item for i, item in enumerate(r)} rows.append(row_dict) - items = {'individual': {}, 'family': {}, 'sample': {}, 'sample_processing': {}} + items = { + 'individual': {}, 'family': {}, 'sample': {}, 'sample_processing': {}, + 'case': {}, 'report': {} + } specimen_ids = {} for row in rows: indiv_alias = '{}:individual-{}'.format(project['name'], row['patient id']) @@ -84,12 +87,14 @@ def xls_to_json(xls_data, project, institution): specimen_ids[row['specimen id']] += 1 else: specimen_ids[row['specimen id']] = 1 - items = fetch_sample_metadata(row, items, indiv_alias, samp_alias, sp_alias) + analysis_alias = '{}:analysis-{}'.format(project['name'], row['analysis id']) + items = fetch_sample_metadata(row, items, indiv_alias, samp_alias, sp_alias, analysis_alias, fam_alias) else: print('WARNING: No specimen id present for patient {},' ' sample will not be created.'.format(row['patient id'])) # create SampleProcessing item for trio/group if needed - items = create_sample_processing_groups(items, sp_alias) + # items = create_sample_processing_groups(items, sp_alias) + items = create_case_items(items, project['name']) # removed unused fields, add project and institution for val1 in items.values(): for val2 in val1.values(): @@ -108,8 +113,6 @@ def fetch_individual_metadata(row, items, indiv_alias): 'aliases': [indiv_alias], 'individual_id': row['patient id'], 'sex': row.get('sex'), - # 'age': int(row.get('age')), - # 'birth_year': int(row.get('birth year')) } info['age'] = int(row['age']) if row.get('age') else None 
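# Small hedged helper (the name and exact behaviour are illustrative, not part of the
# patch) generalizing the guarded int() casts here: blank spreadsheet cells become None,
# and numeric cells that xlrd hands back as floats (e.g. 34.0) still coerce cleanly.
def to_int_or_none(raw):
    """Coerce a spreadsheet cell value to int, treating '' and None as missing."""
    if raw in ('', None):
        return None
    return int(float(raw))

# to_int_or_none('34') == 34; to_int_or_none(34.0) == 34; to_int_or_none('') is None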
info['birth_year'] = int(row['birth year']) if row.get('birth year') else None @@ -141,7 +144,7 @@ def fetch_family_metadata(row, items, indiv_alias, fam_alias): return new_items -def fetch_sample_metadata(row, items, indiv_alias, samp_alias, sp_alias): +def fetch_sample_metadata(row, items, indiv_alias, samp_alias, sp_alias, analysis_alias, fam_alias): new_items = items.copy() info = { 'aliases': [samp_alias], @@ -162,12 +165,50 @@ def fetch_sample_metadata(row, items, indiv_alias, samp_alias, sp_alias): if indiv_alias in new_items['individual']: new_items['individual'][indiv_alias]['samples'] = [samp_alias] # create SampleProcessing item for that one sample if needed - if row['report required'].lower() in ['yes', 'y']: - new_items['sample_processing'][sp_alias] = { - 'aliases': [sp_alias], - 'analysis_type': row['workup type'], - 'samples': [samp_alias] - } + # if row['report required'].lower() in ['yes', 'y']: + # new_items['sample_processing'][sp_alias] = { + # 'aliases': [sp_alias], + # 'analysis_type': row['workup type'], + # 'samples': [samp_alias] + # } + new_sp_item = { + # not trivial to add analysis_type here, turn into calculated property + 'aliases': [analysis_alias], + 'samples': [], + 'families': [] + } + new_items['sample_processing'].setdefault(analysis_alias, new_sp_item) + new_items['sample_processing'][analysis_alias]['samples'].append(samp_alias) + if fam_alias not in new_items['sample_processing'][analysis_alias]['families']: + new_items['sample_processing'][analysis_alias]['families'].append(fam_alias) + return new_items + + +def create_case_items(items, proj_name): + new_items = items.copy() + for k, v in items['sample_processing'].items(): + analysis_id = k[k.index('analysis-')+9:] + for sample in v['samples']: + case_id = '{}-{}'.format(analysis_id, items['sample'][sample]['specimen_accession']) + if len(v['samples']) == 1: + case_id += '-single' + elif len(v['samples']) > 1: + case_id += '-group' + case_alias = '{}:case-{}'.format(proj_name, case_id) + indiv = [ikey for ikey, ival in items['individual'].items() if sample in ival.get('samples', [])][0] + report_alias = case_alias.replace('case', 'report') + new_items['report'][report_alias] = { + 'aliases': [report_alias], + 'description': 'Analysis Report for Individual ID {}'.format(items['individual'][indiv]['individual_id']) + } + case_info = { + 'aliases': [case_alias], + 'case_id': case_id, + 'sample_processing': k, + 'individual': indiv, + 'report': report_alias + } + new_items['case'][case_alias] = case_info return new_items @@ -233,19 +274,26 @@ def parse_exception(e, aliases): out of it. 
[Adapted from Submit4DN]""" try: # try parsing the exception - text = e.args[0] + if isinstance(e, VirtualAppError): + text = e.raw_exception + else: + text = e.args[0] resp_text = text[text.index('{'):-1] resp_dict = json.loads(resp_text.replace('\\', '')) - if resp_dict.get('description') == 'Failed validation': - resp_list = [error['description'] for error in resp_dict['errors']] - for error in resp_list: + except Exception: # pragma: no cover + raise e + if resp_dict.get('description') == 'Failed validation': + keep = [] + resp_list = [error['description'] for error in resp_dict['errors']] + for error in resp_list: # if error is caused by linkTo to item not submitted yet but in aliases list, # remove that error - if 'not found' in error and error.split("'")[1] in aliases: - resp_list.remove(error) - return resp_list - # if not re-raise - except: # pragma: no cover + if 'not found' in error and error.split("'")[1] in aliases: + continue + else: + keep.append(error) + return keep + else: raise e @@ -269,22 +317,14 @@ def validate_and_post(virtualapp, json_data, dryrun=False): Current status: Still testing validation/data organization parts - patch/post part hasn't been fully - written or tested and need to add code to create Case/Report items. - - More notes: - Case and Report items to be created at end. We don't want them in the validation report, since - they are not part of the user's spreadsheet and validation error messages would be too confusing. - We only want to create these when we are sure no validation issues in other items exist. - Spreadsheet has no Case ID, but if there is an "analysis ID" then we can create a Case ID from this - (perhaps analysis ID + indiv ID + label indicating group/trio vs solo) - Report ID can be same as case ID but with "report" appended (?) + written or tested. ''' alias_dict = {} links = ['samples', 'members', 'mother', 'father', 'proband'] errors = [] - all_aliases = [k for itype in json_data for k in itype] + all_aliases = [k for itype in json_data for k in json_data[itype]] json_data_final = {'post': {}, 'patch': {}} - for itemtype in POST_ORDER: + for itemtype in POST_ORDER[:4]: # don't pre-validate case and report profile = virtualapp.get('/profiles/{}.json'.format(itemtype)) for alias in json_data[itemtype]: # TODO : format fields (e.g. int, list, etc.) 
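# Illustrative sketch only (the sample values below are invented): how the
# create_case_items logic added above derives case and report identifiers when the
# spreadsheet carries no explicit Case ID -- the analysis ID is taken from the
# sample_processing alias, joined with the specimen accession, suffixed with -single
# or -group, and the report alias reuses the case alias with 'case' swapped for 'report'.
def build_case_ids(sp_alias, specimen_accession, n_samples, proj_name='test-proj'):
    analysis_id = sp_alias[sp_alias.index('analysis-') + len('analysis-'):]
    case_id = '{}-{}'.format(analysis_id, specimen_accession)
    if n_samples == 1:
        case_id += '-single'
    elif n_samples > 1:
        case_id += '-group'
    case_alias = '{}:case-{}'.format(proj_name, case_id)
    return case_alias, case_alias.replace('case', 'report')

# build_case_ids('test-proj:analysis-55432', '3464467', 3)
# -> ('test-proj:case-55432-3464467-group', 'test-proj:report-55432-3464467-group')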
@@ -334,7 +374,6 @@ def validate_and_post(virtualapp, json_data, dryrun=False): return errors else: return 'All items validated' - # TODO : create case and report items here - skip validation part because they are not part of user's spreadsheet # output = [] # item_names = {'individual': 'individual_id', 'family': 'family_id', 'sample': 'specimen_id'} # if json_data_final['post']: From f72f33fbf9addf496e25e2d7a52a453fef3a5d57 Mon Sep 17 00:00:00 2001 From: Sarah Date: Mon, 15 Jun 2020 11:48:13 -0400 Subject: [PATCH 009/125] cgap_submit_test.xlsx modified for more testing --- .../data/documents/cgap_submit_test.xlsx | Bin 12955 -> 13221 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/src/encoded/tests/data/documents/cgap_submit_test.xlsx b/src/encoded/tests/data/documents/cgap_submit_test.xlsx index 0d65ab327411da00aa287eec8b4637569b70ec6d..0e3409a68f19e8f22e4379178b33a7c180d15e45 100644 GIT binary patch delta 6480 zcmaKRWl-GBvo#tPcLIw$!JXaUBtWp>?(T~QLXaPU#U;3FkPxy!aF@m1oghgF1b4Ul zJoi@Jf8F=P+f`FFbNbAend<4D)7iEq_TTZb0ZtoK7#%ta3L(}%0|f=;y$heOo2Rpt zo0~K5dsi2PuBqD!16g2x-Ba*;U(^8~t8cJWLj7;EYIE}ypUC@I6zb!!nv`@8I*)$M z?#y?1(b#}IC>`Sa4`A7+79RjgBCUZe3G3N_>{ItOe6=2hb%R0{Ja4V(2a+jx4Q zxbF-5(K@YLeRJ#FEdi>GvzG~U!4__yITXIaygEd|?d%FZv~UpZ^-(yT*3)V!Wn(PC zc5HGwp<@(sLr(71lX#ZMuMf^2$i^a?(Qg~Ha%rc)ClpQL*{vP8aM;V>Cf#~ZLUay= zxWRF;3eo$9XN|tx3gYPMXn;Z?_5hIEb?)>{0lTQetd%*K&40S;-adu@(ir4ZWqmAK z97*X{6+l3UP(4jYB%$k_;`RTPE)qzXFG$4;xk`69`Dh%e9=}kYwthJ+-?k@p(r;hh zU$!+*UgB9ZdO{ZdfeXmy4{ZFai?63(on__enQ&GjTVit7;iJ#6NlZtT-hunyKzuN(;FZr%ouJ!3$$XTyv z4;PbY8}3V2nyM&~5IlKqm0w+6D$0kD7y0G=N#?~P<_kmgk?FYf#Au95Gi#llc!b}@ zmN+I=?1|d)t9Tid%&#Sj@c_7hr}z~#^G98`&Hk3hxSbMMU0=4v;w)HUiselVIsUkV zFz${GxOC)((7m4}bF$5ME*UK09nWso$)`qt1FPfDx+o7QJ$aKK^zq1Cxac6lcQN08 zJ>|!;{pvO2vgx;m*%g`rVYDxF{Xbbs`F{}fJL|UKaKMG3qk#!d+5oCG#n?T|Y=nK< z?}1uOo}}UoEN&cjS>fz{q)Si($g0JNN4uW_VSpDq$d?jMN>PR{=@e0(fy-DSO`njC zMzixsMi&y7ICO8e)n@Zba9{$n3R5TaC_GF^p@)pQ(0Ga(z1%#ckDU;W$@j3kr4Y-N z(_)fq*w6!e4E$vO24$r->`lS-JNoS(UVJ@# z>wD38iw0lkrZ%n~LQ$)S-!abZIvh`o?X2aa*Ssy0(=j@kvgq+Swmm;nf8|X3v$94p zbh>%VD{ZN5(`Yep^?Nrku9a;B2@v@hcTrW*9C1SDsKp)Kz@ z1KfXBPaaIkXNL}gOl-s2%HpYy#U%vAZ?a*cYAKssq5C1p-lj0bXh|gOP?SU!svS1E zA`y1k?>9d01<%!O{+NF z%3(kyJA-AOsM2`uQo?a7$>Kc=9gr;1XpFxa3cCM><2ga~Rx1 z!%;;LNIM&Gp}{7sEBXjyL{C;8$jI>|zDPoGR09|1@;{9;KjWb2@Q@$| zNNa`8x-)W7Ng1K&^@dI2Wa#fyth01pTh`nA$g>e+-S^B~l-@*hRhBep*HjNIfzOcu zad8(tt~#xGd|~Whtehk}cq``ku&du%GH=3cOjPnrNxa4MPH7JqXlT#yL~Dbe0LjWv zAqG!%f{Fd9-k+>u%C8=!I5&)4>E_iu+nOIcnIf$kCAY>-bx{XEA+&j2-k^kNr@j47S-qD==6NsGx{`;Y*%_RB= z5}cYNbGE`nK@n$0LBao@eCy@y=VI&iZ?>H^GIsj~Aqz5he0-JSn}q=qBUUI$R;2<1 zVF@TJ{3S<9?Qwis405xEasmj;Aiy&&;xsHi57~3S_aJ zmP8tj;Z110}(6JwV*Pyn4H zDFS_wq$6S_eQVQ5BoD8g*(!C>m><8D$MEK(<{4tP6#~DtshXf1;obaI8)bjnQ4v(r zOmvt>`uZxmfmJ>bJ05e^;ZPq?>-v3ef?Zy$-W3U-j#Fu#_Ajs$hBveGe+q08sM$-u zKnq|G=TE#1-qoM-@1I(BY`HK8-cYh?4_RFnNQl-Vj-Jy2i zhTXC-Sc?WngOR1%iJglvs>>)Js|rK_33-ZSb!6PQ z9BJ^0Nv=Cif@nEg{esL|#C|r;S#?cuR*PR27`gGTDp1OByLo%H#cltm-wIAUJ{HeZ zA_ZCHs1U9d5ed5LPA#6<6Fj?#1Ghxip?75PQ?J+{Gts53VY0a#+7RyyWyadzVp&ni zBG^Txf91DFoNg1<#$qEtTZUfSh+k(gd0}81%*~lzC?fkqRiTO7=7(%@SV^yPwXLki z4|3)b^`jS=eZ+6gDe(&{Ro(vJyiSlwD?*j%)2MjbbT|po$1!qxN1nD<5$n9yz9}|Z z9~f#P3-|}3ly)~L(oQ=EX6t8C>7>k{KC3saNo!5F40Wa@x!~RhaIPe9Tf1||yLm;V zDZg{)INYw{urGMCDkcrez0Yj-J-@W3Qscjx;|-_SY6vmoag~F&S;^!Iy$wQDqSCUB z7O5w~#XR#k(Jk!=yg8Jy`x<#AeqO%Sp@3RCR@e*}p}owS+|Cr5NRS_ILk)!`f2}Dk zmB?^U=jHWY2?{<1>YS`&?s@d6>+KJh-esR7wMnm|vcwZ$mEz)`a-%*lmAe)`M3w37 zLRWAJ99j?xZCaAXe>`o^E9+vROck`ej^7YZj5n7|L5BW+^w~pFD$Y5F&(#f6(^4}| 
zW{s%`T+&A7Jt4ElyrDbFj|3qwual5C%RDYisau#$>If?4hC(lLONesn@E+-H_Lz3} zyJqr{k$W~38$_kY6?{93LHe9;ihkAsF2uQgYQp*bSDlnkZ9#Q=drF<5>CCs{Qu+hh z+_0Qfz?o@vnI*#4$8_7aQ@oN~_&sLW3AaI^m}>#?wx{XRVP`8T(n}h~B1k2yY4&&$ z1~^RR5O)1-#70Gs`{mncE5nzoL7qJ(ytT=#2sb2X1kLmAn&C^q<&nGatpJECy6)n_ zS*_cguvXg&Kwgo#u8l<5t%{SP4iJZjAdAFreO{oY%KQF4EEh~*mpI9uC{?6O20bXt zX!*p*#39I8U{`-^DwkFg<;S0>{R1;u8ODUjTg@ezayae$0^yN6(DQY82aSv=QDHKA z%p1E7Oc95wV9%mT)Q}lrSazXa2g;5heOK5T7f?1x$lh`igOyOaD)>FG-f(Sz z*C@)c#z|uxc-&%5%99jc`OXeIQasaKmE7inpy9*YlMrS~hedj^R3ucoo*ERoph#E^ z4YjZ)LO+hCF@&PGFDk%>V`&7-VW9=9pwLw&AX~dMIx>x>Tr0GI1{8Wk4^)Y#DQ6EY zhz5qn(gKlTH02nfH0S8jX`muEw)$gYNW4y=8=HW7#8AbwFd4;XnIF2gIoP|+EA}`G z{r1+_3w0E7o}VwA)19SvJ=Fc8-b#dpKP-E@`iejKsh};s>CJLnmCdJ6 zrA*7>N8mw@vh7&Xsyz^R*#|c;{JXCfU%O1Q=st9r=m}o1+Bb<=5Sc?9m%q6@uk5~1 z`$+=G=j8PU^TJH6R!V}vs>`)M&4kWZFzInUvxD0~ddf5IJM5X2to8#-YI?f!k;BEv zxYgfeTB+nMJ7R=-Hs6L0f)f6;6Wn$sctUz+t$2}G(=8tZoWv})Ll2DfUioGf#>Puk zwgO93Vx%hFfhBAaQk8tb@CXYMpr;1yyhw!R5~(&H6Rao25mfSCMgK*q3MwlRv65Ya zl*&7F?O3`*V#1IVGhoTCG)*ZcD(%DjRT-$&yHY>ykpDCO|NRMh_{m}z`t;+lft#;Z zd+gZoBz3PMS=fA`sud&bG9Tf>!?X(qj|e z)SJ`*>0|ONVH3j2(%gb6&nVVB|$baAqsbRwCP#;sp}uU{@Mfo#M*29pHYo8 z{&KC$TM*UT8oTl}815ZKSZq{VzO}SZ>zK=%o7D7sJ80~4u7fU7-`BZc?(J53p!rB{ z(@N5EJo64+Rso@kVdD`)Y&_7;%iYqhW4kM|yZx&HZ>)==jyiPN7yrPy#Ru#>ra*aF ze72HP_JB3Ps;FBJbKZ>Q$M&MpGS+ZEu0~u+i$$i* zjI>JSuZp6X;x__EF+mn*5>y4nbr!G$7LFJ6a$cwAukPmh-qT6CofrJHZ+->>HzlMh zcps6}15G|uy-mXxREVJO->3)RfV;dla&H%JlGlTuwXRdDFDLne!ol~ZhIoxfVmTLI zl!O?uWqgz*aBd?Np0057TL*trG;BpQ2C2@;JfVas^`^U>Hy)N4q2`~Zm{+M)t7VZH zrI7M`H4LIirQ)v4IC43@ekS;LqeA~+8~mjSpOqN}X8vY?hjefYqgQtHaS_wnu!{wpcnv)j`&{uh~z1RPg@zBTMMgL1MdkaOOFdYkEuqDwR zDl;}H&?O(@sS47!#h`D-f)A}+=^@#XEg|hQWMJlg%)Angy#-)RZWYT=T3z9L!3+Go zL6dA)vIq>yZo>OKsvk@ugZTx07~KPRsjps?WQK_KmfXlmbq0VwtSJRT8c{0pKA(&7 zls+{TXo*g0cLeU)hz^}JVz>-8`P=NReA#d=cIu7Uxg5R?ynbAbqfpU1i$z0f_p=K1 z*^IVoE1r&oS67W*1e4e&5yr?;jY}nWPWL8E$Sj;-~2mG z+z)y2lB*c8ZzMDh?&d-1zaULX?3Eax|E2HEF4BrdX3ueS2-)q!(HUta54r&SqB9NaB)13R?y8j1l`^)73 From ee73402951cbf15c67d901b03178291c18a0e948 Mon Sep 17 00:00:00 2001 From: Sarah Date: Mon, 15 Jun 2020 17:07:34 -0400 Subject: [PATCH 010/125] reorganized code for validation and posting in submit.py --- src/encoded/submit.py | 134 +++++++++++++++++++++++++++++------------- 1 file changed, 94 insertions(+), 40 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index 7f777596cc..b2894868f9 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -22,7 +22,10 @@ POST_ORDER = ['sample', 'sample_processing', 'individual', 'family', 'report', 'case'] -SECOND_ROUND = {} +LINKS = [ + 'samples', 'members', 'mother', 'father', 'proband', 'report', + 'individual', 'sample_processing', 'families' +] # This is a placeholder for a submission endpoint modified from loadxl @@ -306,7 +309,7 @@ def post_item_data(): # NOT FINISHED -def validate_and_post(virtualapp, json_data, dryrun=False): +def validate_all_items(virtualapp, json_data): ''' Still in progress, not necessarily functional yet. NOT YET TESTED. @@ -320,12 +323,14 @@ def validate_and_post(virtualapp, json_data, dryrun=False): written or tested. 
''' alias_dict = {} - links = ['samples', 'members', 'mother', 'father', 'proband'] errors = [] all_aliases = [k for itype in json_data for k in json_data[itype]] json_data_final = {'post': {}, 'patch': {}} + validation_results = {} for itemtype in POST_ORDER[:4]: # don't pre-validate case and report - profile = virtualapp.get('/profiles/{}.json'.format(itemtype)) + if itemtype in json_data: + profile = virtualapp.get('/profiles/{}.json'.format(itemtype)) + validation_results[itemtype] = {'validated': 0, 'errors': 0} for alias in json_data[itemtype]: # TODO : format fields (e.g. int, list, etc.) result = compare_with_db(virtualapp, alias) @@ -335,23 +340,25 @@ def validate_and_post(virtualapp, json_data, dryrun=False): # do something to report validation errors for e in error: errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) + validation_results[itemtype]['errors'] += 1 else: json_data_final['post'].setdefault(itemtype, []) json_data_final['post'][itemtype].append(json_data[itemtype][alias]) + validation_results[itemtype]['validated'] += 1 else: # patch if item exists in db alias_dict[alias] = result['@id'] to_patch = {} for field in json_data[itemtype][alias]: - if field in links: + # if field in links: # look up atids of links - if profile['properties'][field]['type'] != 'array': - for i, item in enumerate(json_data[itemtype][alias][field]): - if item in alias_dict: - json_data[itemtype][alias][field][i] = alias_dict[item] - elif profile['properties'][field]['type'] == 'string': - if item in alias_dict: - json_data[itemtype][alias][field] = alias_dict[item] + # if profile['properties'][field]['type'] != 'array': + # for i, item in enumerate(json_data[itemtype][alias][field]): + # if item in alias_dict: + # json_data[itemtype][alias][field][i] = alias_dict[item] + # elif profile['properties'][field]['type'] == 'string': + # if item in alias_dict: + # json_data[itemtype][alias][field] = alias_dict[item] # if not an array, patch field gets overwritten (if different from db) if profile['properties'][field]['type'] != 'array': if json_data[itemtype][alias][field] != result.get(field): @@ -363,40 +370,87 @@ def validate_and_post(virtualapp, json_data, dryrun=False): val.extend(json_data[itemtype][alias][field]) to_patch[field] = list(set(val)) error = validate_item(virtualapp, to_patch, 'post', itemtype, all_aliases, atid=result['@id']) - if error: # modify to check for presence of validation errors - # do something to report validation errors + if error: # do something to report validation errors for e in error: errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) + validation_results[itemtype]['errors'] += 1 else: # patch - json_data_final['patch'][result['@id']] = to_patch + json_data_final['patch'].setdefault(itemtype, {}) + json_data_final['patch'][itemtype][result['@id']] = to_patch # do something to record response + validation_results[itemtype]['validated'] += 1 + output = [error for error in errors] + for itemtype in validation_results: + output.append('{} items: {} validated; {} errors'.format( + itemtype, validation_results[itemtype]['validated'], validation_results[itemtype]['errors'] + )) if errors: - return errors + output.append('Validation errors found in items. 
Please fix spreadsheet before submitting.') + return ({}, output) else: - return 'All items validated' - # output = [] - # item_names = {'individual': 'individual_id', 'family': 'family_id', 'sample': 'specimen_id'} - # if json_data_final['post']: - # for k, v in json_data_final['post'].items(): - # # also create Case and Report items for each SampleProcessing item created - # for item in v: - # for field in links: - # if field in item: - # json_data_final['patch'][item['aliases'][0]] = item[field] - # del item[field] - # try: - # response = virtualapp.post_json('/' + k, item, status=201) - # aliasdict[item['aliases'][0]] = response.json['@graph'][0]['@id'] - # if response.json['status'] == 'success' and k in item_names: - # output.append('Success - {} {} posted'.format(k, item[item_names[k]])) - # except Exception: - # pass - # for k, v in json_data_final['patch'].items(): - # atid = k if k.startswith('/') else aliasdict[k] - # try: - # response = testapp.patch_json(atid, v, status=200) - # except Exception: - # pass + json_data_final['post']['case'] = list(json_data['case'].values()) + json_data_final['post']['report'] = list(json_data['report'].values()) + json_data_final['aliases'] = alias_dict + output.append('All items validated.') + return (json_data_final, output) + + +def post_and_patch_all_items(virtualapp, json_data_final): + output = [] + item_names = {'individual': 'individual_id', 'family': 'family_id', 'sample': 'specimen_accession'} + final_status = {} + if json_data_final['post']: + for k, v in json_data_final['post'].items(): + final_status[k] = {'posted': 0, 'not posted': 0, 'patched': 0, 'not patched': 0} + for item in v: + patch_info = {} + for field in LINKS: + if field in item: + patch_info[field] = item[field] + del item[field] + # return json_data_final + try: + response = virtualapp.post_json('/' + k, item, status=201) + # aliasdict[item['aliases'][0]] = response.json['@graph'][0]['@id'] + if response.json['status'] == 'success': + final_status[k]['posted'] += 1 + atid = response.json['@graph'][0]['@id'] + json_data_final['aliases'][item['aliases'][0]] = atid + json_data_final['patch'].setdefault(k, {}) + json_data_final['patch'][k][atid] = patch_info + if k in item_names: + output.append('Success - {} {} posted'.format(k, item[item_names[k]])) + else: + final_status[k]['not posted'] += 1 + except Exception as e: + final_status[k]['not posted'] += 1 + output.append(e) + for itype in final_status: + if final_status[itype]['posted'] > 0 or final_status[itype]['not posted'] > 0: + output.append('{}: {} items posted successfully; {} items not posted'.format( + itype, final_status[itype]['posted'], final_status[itype]['not posted'] + )) + for k, v in json_data_final['patch'].items(): + final_status.setdefault(k, {'patched': 0, 'not patched': 0}) + for item_id, patch_data in v.items(): + # atid = k if k.startswith('/') else aliasdict[k] + try: + response = virtualapp.patch_json('/' + item_id, patch_data, status=200) + if response.json['status'] == 'success': + # if k in item_names: + # output.append('Success - {} {} patched'.format(k, patch_data[item_names[k]])) + final_status[k]['patched'] += 1 + else: + final_status[k]['not patched'] += 1 + except Exception as e: + final_status[k]['not patched'] += 1 + output.append(e) + if final_status[k]['patched'] > 0 or final_status[k]['not patched'] > 0: + output.append('{}: {} items patched successfully; {} items not patched'.format( + itype, final_status[k]['patched'], final_status[k]['not patched'] + )) + return output + # 
This was just to see if i could post something using testapp in the python command line, currently works. From 812340adeff0de598c701652f12989bd72800602 Mon Sep 17 00:00:00 2001 From: Sarah Date: Mon, 15 Jun 2020 17:08:00 -0400 Subject: [PATCH 011/125] a couple submission unit tests added --- src/encoded/tests/test_submit.py | 34 ++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/src/encoded/tests/test_submit.py b/src/encoded/tests/test_submit.py index ce332d06da..4bf8515564 100644 --- a/src/encoded/tests/test_submit.py +++ b/src/encoded/tests/test_submit.py @@ -140,23 +140,49 @@ def test_xls_to_json(project, institution): def test_validate_item_post_valid(testapp, a_case): - result = validate_item(testapp, a_case, 'post', 'case') + result = validate_item(testapp, a_case, 'post', 'case', []) assert not result def test_validate_item_post_invalid(testapp, a_case): a_case['project'] = '/projects/invalid-project/' - result = validate_item(testapp, a_case, 'post', 'case') + result = validate_item(testapp, a_case, 'post', 'case', []) assert 'not found' in result[0] def test_validate_item_patch_valid(testapp, mother, grandpa): patch_dict = {'mother': mother['aliases'][0]} - result = validate_item(testapp, patch_dict, 'patch', 'individual', atid=grandpa['@id']) + result = validate_item(testapp, patch_dict, 'patch', 'individual', [], atid=grandpa['@id']) assert not result def test_validate_item_patch_invalid(testapp, grandpa): patch_dict = {'mother': 'non-existant-alias'} - result = validate_item(testapp, patch_dict, 'patch', 'individual', atid=grandpa['@id']) + result = validate_item(testapp, patch_dict, 'patch', 'individual', [], atid=grandpa['@id']) assert 'not found' in result[0] + + +def test_validate_item_patch_alias(testapp, grandpa): + patch_dict = {'mother': 'existing-alias'} + result = validate_item(testapp, patch_dict, 'patch', 'individual', ['existing-alias'], atid=grandpa['@id']) + assert not result + + +def test_validate_all_items_errors(testapp, mother, empty_items): + new_individual = { + 'aliases': ['test-proj:new-individual-alias'], + 'individual_id': '1234', + 'sex': 'F', + 'mother': mother['aliases'][0], + 'project': 'test-proj:invalid-project-alias', + 'institution': 'test-proj:invalid-institution-alias' + } + items = empty_items + items['individual']['new-individual-alias'] = new_individual + data_out, result = validate_all_items(testapp, items) + assert not data_out + assert len(result) > 1 + errors = ' '.join(result) + assert "'test-proj:invalid-project-alias' not found" in errors + assert "'test-proj:invalid-institution-alias' not found" in errors + assert mother['aliases'][0] not in errors From 5daf28c1f61875c807bd124778f72be24dfc3e4c Mon Sep 17 00:00:00 2001 From: Sarah Date: Mon, 22 Jun 2020 16:02:52 -0400 Subject: [PATCH 012/125] script to test submission code so far added --- src/encoded/commands/submission_test.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 src/encoded/commands/submission_test.py diff --git a/src/encoded/commands/submission_test.py b/src/encoded/commands/submission_test.py new file mode 100644 index 0000000000..aef1cc1a05 --- /dev/null +++ b/src/encoded/commands/submission_test.py @@ -0,0 +1,22 @@ +from pyramid.paster import get_app +from encoded.submit import * +from dcicutils.misc_utils import VirtualApp +import json + + +def main(): + app = get_app('development.ini', 'app') + environ = {'HTTP_ACCEPT': 'application/json', 'REMOTE_USER': 'TEST'} + virtualapp = 
VirtualApp(app, environ) + proj = virtualapp.get('/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/').json + inst = virtualapp.get('/institutions/hms-dbmi/').json + json_data = xls_to_json('src/encoded/tests/data/documents/cgap_submit_test.xlsx', proj, inst) + final_json, validation_log = validate_all_items(virtualapp, json_data) + print(validation_log) + print(json.dumps(final_json, indent=4)) + result = post_and_patch_all_items(virtualapp, final_json) + print(result) + + +if __name__ == '__main__': + main() From 41515cefe0b73f2280381b29fd0049a40a2f5ffe Mon Sep 17 00:00:00 2001 From: Sarah Date: Mon, 22 Jun 2020 16:03:18 -0400 Subject: [PATCH 013/125] src/encoded/commands/submission_test.py added to scripts in pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index cbeecf50e3..c4a020b10c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -191,6 +191,7 @@ profiler = "encoded.commands.profiler:main" purge-item-type = "encoded.commands.purge_item_type:main" run-upgrade-on-inserts = "encoded.commands.run_upgrader_on_inserts:main" spreadsheet-to-json = "encoded.commands.spreadsheet_to_json:main" +submission-test = "encoded.commands.submission_test:main" update-inserts-from-server = "encoded.commands.update_inserts_from_server:main" verify-item = "encoded.commands.verify_item:main" From 432f08730ae6da27c842f7fa09fcfb41ebd8d629 Mon Sep 17 00:00:00 2001 From: Sarah Date: Mon, 22 Jun 2020 16:03:47 -0400 Subject: [PATCH 014/125] VirtualAppError handling modified to adapt to changes in dcicutils --- src/encoded/submit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index b2894868f9..4c96580143 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -278,7 +278,7 @@ def parse_exception(e, aliases): try: # try parsing the exception if isinstance(e, VirtualAppError): - text = e.raw_exception + text = e.raw_exception.args[0] else: text = e.args[0] resp_text = text[text.index('{'):-1] From 273af15e18b648fae1b14024875b95dedbf94ffc Mon Sep 17 00:00:00 2001 From: Sarah Date: Mon, 22 Jun 2020 17:08:40 -0400 Subject: [PATCH 015/125] bug fixes so that compare_with_db in submit.py works properly with alias instead of atid --- src/encoded/submit.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index 4c96580143..c6636a1618 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -239,9 +239,12 @@ def create_sample_processing_groups(items, sp_alias): # NOT YET TESTED -def compare_with_db(alias, virtualapp): +def compare_with_db(virtualapp, alias): try: # check if already in db - result = virtualapp.get(alias + '/?frame=object') + result = virtualapp.get('/' + alias + '/?frame=object') + if result.status_code == 301: + msg = json.loads(result.body).get('message', '') + result = virtualapp.get(msg[msg.index('/'):msg.index(';')]) except Exception as e: # if not in db # print(e) if 'HTTPNotFound' in str(e): @@ -327,9 +330,9 @@ def validate_all_items(virtualapp, json_data): all_aliases = [k for itype in json_data for k in json_data[itype]] json_data_final = {'post': {}, 'patch': {}} validation_results = {} - for itemtype in POST_ORDER[:4]: # don't pre-validate case and report + for itemtype in POST_ORDER: # don't pre-validate case and report if itemtype in json_data: - profile = virtualapp.get('/profiles/{}.json'.format(itemtype)) + profile = 
virtualapp.get('/profiles/{}.json'.format(itemtype)).json validation_results[itemtype] = {'validated': 0, 'errors': 0} for alias in json_data[itemtype]: # TODO : format fields (e.g. int, list, etc.) @@ -338,9 +341,10 @@ def validate_all_items(virtualapp, json_data): error = validate_item(virtualapp, json_data[itemtype][alias], 'post', itemtype, all_aliases) if error: # modify to check for presence of validation errors # do something to report validation errors - for e in error: - errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) - validation_results[itemtype]['errors'] += 1 + if itemtype not in ['case', 'report']: + for e in error: + errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) + validation_results[itemtype]['errors'] += 1 else: json_data_final['post'].setdefault(itemtype, []) json_data_final['post'][itemtype].append(json_data[itemtype][alias]) @@ -369,11 +373,12 @@ def validate_all_items(virtualapp, json_data): val = result.get(field, []) val.extend(json_data[itemtype][alias][field]) to_patch[field] = list(set(val)) - error = validate_item(virtualapp, to_patch, 'post', itemtype, all_aliases, atid=result['@id']) + error = validate_item(virtualapp, to_patch, 'patch', itemtype, all_aliases, atid=result['@id']) if error: # do something to report validation errors - for e in error: - errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) - validation_results[itemtype]['errors'] += 1 + if itemtype not in ['case', 'report']: + for e in error: + errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) + validation_results[itemtype]['errors'] += 1 else: # patch json_data_final['patch'].setdefault(itemtype, {}) json_data_final['patch'][itemtype][result['@id']] = to_patch @@ -388,8 +393,8 @@ def validate_all_items(virtualapp, json_data): output.append('Validation errors found in items. 
Please fix spreadsheet before submitting.') return ({}, output) else: - json_data_final['post']['case'] = list(json_data['case'].values()) - json_data_final['post']['report'] = list(json_data['report'].values()) + # json_data_final['post']['case'] = list(json_data['case'].values()) + # json_data_final['post']['report'] = list(json_data['report'].values()) json_data_final['aliases'] = alias_dict output.append('All items validated.') return (json_data_final, output) @@ -447,7 +452,7 @@ def post_and_patch_all_items(virtualapp, json_data_final): output.append(e) if final_status[k]['patched'] > 0 or final_status[k]['not patched'] > 0: output.append('{}: {} items patched successfully; {} items not patched'.format( - itype, final_status[k]['patched'], final_status[k]['not patched'] + k, final_status[k]['patched'], final_status[k]['not patched'] )) return output From 1f9667f685262404b2d5c1f282d23495acc20416 Mon Sep 17 00:00:00 2001 From: Sarah Date: Thu, 25 Jun 2020 10:54:51 -0400 Subject: [PATCH 016/125] patch logic for submit.py extracted to new compare_fields function --- src/encoded/submit.py | 86 +++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 52 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index c6636a1618..b75dcf870f 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -19,7 +19,7 @@ } -POST_ORDER = ['sample', 'sample_processing', 'individual', 'family', 'report', 'case'] +POST_ORDER = ['sample', 'individual', 'family', 'sample_processing', 'report', 'case'] LINKS = [ @@ -238,7 +238,6 @@ def create_sample_processing_groups(items, sp_alias): return new_items -# NOT YET TESTED def compare_with_db(virtualapp, alias): try: # check if already in db result = virtualapp.get('/' + alias + '/?frame=object') @@ -246,17 +245,14 @@ def compare_with_db(virtualapp, alias): msg = json.loads(result.body).get('message', '') result = virtualapp.get(msg[msg.index('/'):msg.index(';')]) except Exception as e: # if not in db - # print(e) if 'HTTPNotFound' in str(e): return None else: return result.json -# TODO : Handle validation of not-yet-submitted-aliases in fields def validate_item(virtualapp, item, method, itemtype, aliases, atid=None): if method == 'post': - #import pdb; pdb.set_trace() try: validation = virtualapp.post_json('/{}/?check_only=true'.format(itemtype), item) except (AppError, VirtualAppError) as e: @@ -303,15 +299,30 @@ def parse_exception(e, aliases): raise e -def patch_item_data(): - pass - - -def post_item_data(): - pass +def compare_fields(profile, aliases, json_item, db_item): + to_patch = {} + for field in json_item: + # if not an array, patch field gets overwritten (if different from db) + if profile['properties'][field]['type'] != 'array': + val = json_item[field] + if isinstance(val, str): + if val in aliases: + val = aliases[val] + if val != db_item.get(field): + to_patch[field] = val + else: + # if array, patch field vals get added to what's in db + if field != 'aliases': + val = [aliases[v] if v in aliases else v for v in json_item[field]] + else: + val = [v for v in json_item[field]] + if sorted(val) != sorted(db_item.get(field, [])): + new_val = db_item.get(field, []) + new_val.extend(val) + to_patch[field] = list(set(new_val)) + return to_patch -# NOT FINISHED def validate_all_items(virtualapp, json_data): ''' Still in progress, not necessarily functional yet. NOT YET TESTED. 
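# Worked toy example (the schema fragment, aliases and @ids below are invented) of what
# the compare_fields helper added above is expected to produce: scalar fields that differ
# from the db item are overwritten, array fields are merged with the db values, and
# aliases of items already in the db are first swapped for their @ids.
toy_profile = {'properties': {'sex': {'type': 'string'}, 'samples': {'type': 'array'}}}
toy_aliases = {'test-proj:sample-3464467': '/samples/GAPAB1234/'}
spreadsheet_item = {'sex': 'F', 'samples': ['test-proj:sample-3464467']}
db_item = {'sex': 'U', 'samples': ['/samples/GAPXY9999/']}
# compare_fields(toy_profile, toy_aliases, spreadsheet_item, db_item) should give
# {'sex': 'F', 'samples': ['/samples/GAPAB1234/', '/samples/GAPXY9999/']}
# (the order of the 'samples' list is not guaranteed because of the set() round trip).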
@@ -330,6 +341,7 @@ def validate_all_items(virtualapp, json_data): all_aliases = [k for itype in json_data for k in json_data[itype]] json_data_final = {'post': {}, 'patch': {}} validation_results = {} + output = [] for itemtype in POST_ORDER: # don't pre-validate case and report if itemtype in json_data: profile = virtualapp.get('/profiles/{}.json'.format(itemtype)).json @@ -352,28 +364,8 @@ def validate_all_items(virtualapp, json_data): else: # patch if item exists in db alias_dict[alias] = result['@id'] - to_patch = {} - for field in json_data[itemtype][alias]: - # if field in links: - # look up atids of links - # if profile['properties'][field]['type'] != 'array': - # for i, item in enumerate(json_data[itemtype][alias][field]): - # if item in alias_dict: - # json_data[itemtype][alias][field][i] = alias_dict[item] - # elif profile['properties'][field]['type'] == 'string': - # if item in alias_dict: - # json_data[itemtype][alias][field] = alias_dict[item] - # if not an array, patch field gets overwritten (if different from db) - if profile['properties'][field]['type'] != 'array': - if json_data[itemtype][alias][field] != result.get(field): - to_patch[field] = json_data[itemtype][alias][field] - else: - # if array, patch field vals get added to what's in db - if sorted(json_data[itemtype][alias][field]) != sorted(result.get(field, [])): - val = result.get(field, []) - val.extend(json_data[itemtype][alias][field]) - to_patch[field] = list(set(val)) - error = validate_item(virtualapp, to_patch, 'patch', itemtype, all_aliases, atid=result['@id']) + patch_data = compare_fields(profile, alias_dict, json_data[itemtype][alias], result) + error = validate_item(virtualapp, patch_data, 'patch', itemtype, all_aliases, atid=result['@id']) if error: # do something to report validation errors if itemtype not in ['case', 'report']: for e in error: @@ -381,10 +373,13 @@ def validate_all_items(virtualapp, json_data): validation_results[itemtype]['errors'] += 1 else: # patch json_data_final['patch'].setdefault(itemtype, {}) - json_data_final['patch'][itemtype][result['@id']] = to_patch + if patch_data: + json_data_final['patch'][itemtype][result['@id']] = patch_data + else: + output.append('{} {} - Item already in database, no changes needed'.format(itemtype, alias)) # do something to record response validation_results[itemtype]['validated'] += 1 - output = [error for error in errors] + output.extend([error for error in errors]) for itemtype in validation_results: output.append('{} items: {} validated; {} errors'.format( itemtype, validation_results[itemtype]['validated'], validation_results[itemtype]['errors'] @@ -393,8 +388,6 @@ def validate_all_items(virtualapp, json_data): output.append('Validation errors found in items. 
Please fix spreadsheet before submitting.') return ({}, output) else: - # json_data_final['post']['case'] = list(json_data['case'].values()) - # json_data_final['post']['report'] = list(json_data['report'].values()) json_data_final['aliases'] = alias_dict output.append('All items validated.') return (json_data_final, output) @@ -402,9 +395,11 @@ def validate_all_items(virtualapp, json_data): def post_and_patch_all_items(virtualapp, json_data_final): output = [] + if not json_data_final: + return output item_names = {'individual': 'individual_id', 'family': 'family_id', 'sample': 'specimen_accession'} final_status = {} - if json_data_final['post']: + if json_data_final.get('post'): for k, v in json_data_final['post'].items(): final_status[k] = {'posted': 0, 'not posted': 0, 'patched': 0, 'not patched': 0} for item in v: @@ -413,10 +408,8 @@ def post_and_patch_all_items(virtualapp, json_data_final): if field in item: patch_info[field] = item[field] del item[field] - # return json_data_final try: response = virtualapp.post_json('/' + k, item, status=201) - # aliasdict[item['aliases'][0]] = response.json['@graph'][0]['@id'] if response.json['status'] == 'success': final_status[k]['posted'] += 1 atid = response.json['@graph'][0]['@id'] @@ -438,7 +431,6 @@ def post_and_patch_all_items(virtualapp, json_data_final): for k, v in json_data_final['patch'].items(): final_status.setdefault(k, {'patched': 0, 'not patched': 0}) for item_id, patch_data in v.items(): - # atid = k if k.startswith('/') else aliasdict[k] try: response = virtualapp.patch_json('/' + item_id, patch_data, status=200) if response.json['status'] == 'success': @@ -457,16 +449,6 @@ def post_and_patch_all_items(virtualapp, json_data_final): return output - -# This was just to see if i could post something using testapp in the python command line, currently works. -# def test_function(): -# app = get_app('development.ini', 'app') -# environ = {'HTTP_ACCEPT': 'application/json', 'REMOTE_USER': 'TEST'} -# testapp = TestApp(app, environ) -# response = testapp.post_json('/project', {'name': 'test', 'title': 'Test'}, status=201) -# print(response) - - def cell_value(cell, datemode): """Get cell value from excel. 
[From Submit4DN]""" # This should be always returning text format From 5edf74738a1a70591226475cb2c9ea036d69fa19 Mon Sep 17 00:00:00 2001 From: Sarah Date: Thu, 25 Jun 2020 10:55:27 -0400 Subject: [PATCH 017/125] print statements restructured for submission-test command --- src/encoded/commands/submission_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/encoded/commands/submission_test.py b/src/encoded/commands/submission_test.py index aef1cc1a05..07eb772985 100644 --- a/src/encoded/commands/submission_test.py +++ b/src/encoded/commands/submission_test.py @@ -12,10 +12,10 @@ def main(): inst = virtualapp.get('/institutions/hms-dbmi/').json json_data = xls_to_json('src/encoded/tests/data/documents/cgap_submit_test.xlsx', proj, inst) final_json, validation_log = validate_all_items(virtualapp, json_data) - print(validation_log) + print('\n'.join(validation_log)) print(json.dumps(final_json, indent=4)) result = post_and_patch_all_items(virtualapp, final_json) - print(result) + print('\n'.join(result)) if __name__ == '__main__': From f6145ea50eeb08eaeb39280138393a635efba52b Mon Sep 17 00:00:00 2001 From: Sarah Date: Fri, 26 Jun 2020 15:05:19 -0400 Subject: [PATCH 018/125] only generate reports when required in submit.py --- src/encoded/submit.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index b75dcf870f..87d2ce4e2a 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -84,6 +84,7 @@ def xls_to_json(xls_data, project, institution): items = fetch_family_metadata(row, items, indiv_alias, fam_alias) # create item for Sample if there is a specimen if row['specimen id']: + items['reports'] = [] samp_alias = '{}:sample-{}'.format(project['name'], row['specimen id']) if row['specimen id'] in specimen_ids: samp_alias = samp_alias + '-' + specimen_ids[row['specimen id']] @@ -182,6 +183,8 @@ def fetch_sample_metadata(row, items, indiv_alias, samp_alias, sp_alias, analysi } new_items['sample_processing'].setdefault(analysis_alias, new_sp_item) new_items['sample_processing'][analysis_alias]['samples'].append(samp_alias) + if row.get('report required').lower().startswith('y'): + new_items['reports'].append(samp_alias) if fam_alias not in new_items['sample_processing'][analysis_alias]['families']: new_items['sample_processing'][analysis_alias]['families'].append(fam_alias) return new_items @@ -199,19 +202,21 @@ def create_case_items(items, proj_name): case_id += '-group' case_alias = '{}:case-{}'.format(proj_name, case_id) indiv = [ikey for ikey, ival in items['individual'].items() if sample in ival.get('samples', [])][0] - report_alias = case_alias.replace('case', 'report') - new_items['report'][report_alias] = { - 'aliases': [report_alias], - 'description': 'Analysis Report for Individual ID {}'.format(items['individual'][indiv]['individual_id']) - } case_info = { 'aliases': [case_alias], 'case_id': case_id, 'sample_processing': k, - 'individual': indiv, - 'report': report_alias + 'individual': indiv } + if sample in items['reports']: + report_alias = case_alias.replace('case', 'report') + new_items['report'][report_alias] = { + 'aliases': [report_alias], + 'description': 'Analysis Report for Individual ID {}'.format(items['individual'][indiv]['individual_id']) + } + case_info['report'] = report_alias new_items['case'][case_alias] = case_info + del new_items['reports'] return new_items @@ -317,6 +322,8 @@ def compare_fields(profile, aliases, json_item, db_item): else: val 
= [v for v in json_item[field]] if sorted(val) != sorted(db_item.get(field, [])): + if len(val) == 1 and val not in db_item.get(field, []): + continue new_val = db_item.get(field, []) new_val.extend(val) to_patch[field] = list(set(new_val)) From 6dcf2464de25f2e9220f5937574a410bb8788074 Mon Sep 17 00:00:00 2001 From: Sarah Date: Fri, 26 Jun 2020 16:24:18 -0400 Subject: [PATCH 019/125] edit to code for handling report items in submit.py --- src/encoded/submit.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index 87d2ce4e2a..d8a2cc3137 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -71,7 +71,7 @@ def xls_to_json(xls_data, project, institution): items = { 'individual': {}, 'family': {}, 'sample': {}, 'sample_processing': {}, - 'case': {}, 'report': {} + 'case': {}, 'report': {}, 'reports': [] } specimen_ids = {} for row in rows: @@ -84,7 +84,6 @@ def xls_to_json(xls_data, project, institution): items = fetch_family_metadata(row, items, indiv_alias, fam_alias) # create item for Sample if there is a specimen if row['specimen id']: - items['reports'] = [] samp_alias = '{}:sample-{}'.format(project['name'], row['specimen id']) if row['specimen id'] in specimen_ids: samp_alias = samp_alias + '-' + specimen_ids[row['specimen id']] @@ -184,6 +183,7 @@ def fetch_sample_metadata(row, items, indiv_alias, samp_alias, sp_alias, analysi new_items['sample_processing'].setdefault(analysis_alias, new_sp_item) new_items['sample_processing'][analysis_alias]['samples'].append(samp_alias) if row.get('report required').lower().startswith('y'): + print('report') new_items['reports'].append(samp_alias) if fam_alias not in new_items['sample_processing'][analysis_alias]['families']: new_items['sample_processing'][analysis_alias]['families'].append(fam_alias) @@ -209,6 +209,7 @@ def create_case_items(items, proj_name): 'individual': indiv } if sample in items['reports']: + print('2') report_alias = case_alias.replace('case', 'report') new_items['report'][report_alias] = { 'aliases': [report_alias], From 847e01f145163616be71360a18441aa89caa14c2 Mon Sep 17 00:00:00 2001 From: Sarah Date: Thu, 9 Jul 2020 14:23:47 -0400 Subject: [PATCH 020/125] family alias based on proband id rather than family id in submit.py --- src/encoded/submit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index d8a2cc3137..d4c06134f4 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -76,7 +76,7 @@ def xls_to_json(xls_data, project, institution): specimen_ids = {} for row in rows: indiv_alias = '{}:individual-{}'.format(project['name'], row['patient id']) - fam_alias = '{}:family-{}'.format(project['name'], row['family id']) + fam_alias = '{}:family-{}'.format(project['name'], row['patient id']) sp_alias = '{}:sampleproc-{}'.format(project['name'], row['specimen id']) # create items for Individual items = fetch_individual_metadata(row, items, indiv_alias) From c7523c7bceba37baaababbb3753ab61348effb3b Mon Sep 17 00:00:00 2001 From: Sarah Date: Thu, 9 Jul 2020 16:45:53 -0400 Subject: [PATCH 021/125] added more metadata fields to digest for submit.py --- src/encoded/submit.py | 49 +++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index d4c06134f4..6e9415a266 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -66,7 +66,7 @@ def xls_to_json(xls_data, project, 
institution): counter = 0 for values in row: r = [val for val in values] - row_dict = {keys[i].lower(): item for i, item in enumerate(r)} + row_dict = {keys[i].lower().rstrip('*'): item for i, item in enumerate(r)} rows.append(row_dict) items = { @@ -154,26 +154,40 @@ def fetch_sample_metadata(row, items, indiv_alias, samp_alias, sp_alias, analysi 'workup_type': row.get('workup type'), 'specimen_type': row.get('specimen type'), 'specimen_collection_date': row.get('date collected'), - 'specimen_collection_location': row.get('location collected'), + # change collection location to stored location? + 'specimen_collection_location': row.get('location stored'), 'specimen_accession': row['specimen id'], + # second specimen id 'date_transported': row.get('date transported'), 'transported_by': row.get('transport method'), 'sent_by': row.get('sent by'), + # sequencing ref lab 'date_received': row.get("date rec'd at ref lab"), 'specimen_accepted': row.get('specimen accepted by ref lab'), + # sample ID by ref lab 'dna_concentration': row.get('dna concentration'), - 'specimen_notes': row.get('specimen notes') + 'specimen_notes': row.get('specimen notes'), + 'files': [], # TODO: implement creation of file db items + 'requisition_type': row.get('req type'), + # research protocol name + 'date_requisition_received': row.get("date req rec'd"), + 'ordering_physician': row.get('physician/provider'), + 'physician_id': row.get('physician id'), + 'indication': row.get('indication') } + req_info = { + 'accepted_rejected': row.get('req accepted y/n'), + 'rejection_reason': row.get('reason rejected'), + 'corrective_action': row.get('corrective action taken'), + # corrective action taken by + 'date_sent': row.get('date sent'), + 'date_completed': row.get('date completed'), + 'notes': row.get('correction notes') + } + info['requisition_acceptance'] = {k, v for k, v in req_info.items() if v} new_items['sample'][samp_alias] = {k: v for k, v in info.items() if v} if indiv_alias in new_items['individual']: new_items['individual'][indiv_alias]['samples'] = [samp_alias] - # create SampleProcessing item for that one sample if needed - # if row['report required'].lower() in ['yes', 'y']: - # new_items['sample_processing'][sp_alias] = { - # 'aliases': [sp_alias], - # 'analysis_type': row['workup type'], - # 'samples': [samp_alias] - # } new_sp_item = { # not trivial to add analysis_type here, turn into calculated property 'aliases': [analysis_alias], @@ -190,6 +204,20 @@ def fetch_sample_metadata(row, items, indiv_alias, samp_alias, sp_alias, analysi return new_items +# TODO: finish implementing this function +def fetch_file_metadata(filenames): + files = [] + for filename in filenames: + file_info = { + 'aliases': [], + 'file_format': '', + 'file_type': '', + 'filename': '' + } + files.append(file_info) + raise NotImplementedError + + def create_case_items(items, proj_name): new_items = items.copy() for k, v in items['sample_processing'].items(): @@ -209,7 +237,6 @@ def create_case_items(items, proj_name): 'individual': indiv } if sample in items['reports']: - print('2') report_alias = case_alias.replace('case', 'report') new_items['report'][report_alias] = { 'aliases': [report_alias], From 7d49c47a3485c3935fc50491df41c2c211ef9c8c Mon Sep 17 00:00:00 2001 From: Sarah Date: Fri, 10 Jul 2020 10:54:52 -0400 Subject: [PATCH 022/125] added a few more fields to sample from accessioning worksheet --- src/encoded/schemas/sample.json | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git 
a/src/encoded/schemas/sample.json b/src/encoded/schemas/sample.json index 4bd1023bf0..76ec69be3b 100644 --- a/src/encoded/schemas/sample.json +++ b/src/encoded/schemas/sample.json @@ -54,6 +54,13 @@ "lookup": 30, "description": "Clinical or research consent/protocol" }, + "research_protocol_name": { + "title": "Research Protocol Name", + "type": "string", + "label": "requisition", + "lookup": 31, + "description": "Consent Protocol Name for Research Requisition" + }, "date_requisition_received": { "title": "Date Requisition Received", "type": "string", @@ -131,6 +138,12 @@ "lookup": 113, "description": "If requisition was rejected, the corrective action noted/taken" }, + "action_taken_by": { + "title": "Action Taken By", + "type": "string", + "lookup": 114, + "description": "Name or ID of person who took the corrective action" + }, "date_sent": { "title": "Date Correction Sent", "type": "string", @@ -203,6 +216,13 @@ "type": "string", "lookup": 140 }, + "specimen_storage_location": { + "title": "Specimen Storage Location", + "description": "Location of specimen storage", + "label": "specimen", + "type": "string", + "lookup": 144 + }, "specimen_accession": { "title": "Specimen Accession", "description": "Accession of specimen from sequencing lab", @@ -247,6 +267,13 @@ "lookup": 160, "description": "ID of person who sent the specimen" }, + "sequencing_lab": { + "title": "Sequencing Lab", + "description": "Location performing sequencing on sample", + "type": "string", + "label": "test", + "lookup": 189 + }, "date_received": { "title": "Date Received in Sequencing Lab", "type": "string", From be81d70f07d97032238c43af8f2686c01c299807 Mon Sep 17 00:00:00 2001 From: Sarah Date: Fri, 10 Jul 2020 10:55:21 -0400 Subject: [PATCH 023/125] more accessioning fields digested by submit.py --- src/encoded/submit.py | 67 ++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 32 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index 6e9415a266..aefcfca689 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -77,7 +77,7 @@ def xls_to_json(xls_data, project, institution): for row in rows: indiv_alias = '{}:individual-{}'.format(project['name'], row['patient id']) fam_alias = '{}:family-{}'.format(project['name'], row['patient id']) - sp_alias = '{}:sampleproc-{}'.format(project['name'], row['specimen id']) + # sp_alias = '{}:sampleproc-{}'.format(project['name'], row['specimen id']) # create items for Individual items = fetch_individual_metadata(row, items, indiv_alias) # create/edit items for Family @@ -91,7 +91,8 @@ def xls_to_json(xls_data, project, institution): else: specimen_ids[row['specimen id']] = 1 analysis_alias = '{}:analysis-{}'.format(project['name'], row['analysis id']) - items = fetch_sample_metadata(row, items, indiv_alias, samp_alias, sp_alias, analysis_alias, fam_alias) + items = fetch_sample_metadata(row, items, indiv_alias, samp_alias, + analysis_alias, fam_alias, project['name']) else: print('WARNING: No specimen id present for patient {},' ' sample will not be created.'.format(row['patient id'])) @@ -147,39 +148,42 @@ def fetch_family_metadata(row, items, indiv_alias, fam_alias): return new_items -def fetch_sample_metadata(row, items, indiv_alias, samp_alias, sp_alias, analysis_alias, fam_alias): +def fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, fam_alias, proj_name): new_items = items.copy() info = { 'aliases': [samp_alias], 'workup_type': row.get('workup type'), 'specimen_type': row.get('specimen 
type'), 'specimen_collection_date': row.get('date collected'), - # change collection location to stored location? - 'specimen_collection_location': row.get('location stored'), + 'specimen_storage_location': row.get('location stored'), 'specimen_accession': row['specimen id'], - # second specimen id 'date_transported': row.get('date transported'), 'transported_by': row.get('transport method'), 'sent_by': row.get('sent by'), - # sequencing ref lab + 'sequencing_lab': row.get('sequencing ref lab'), 'date_received': row.get("date rec'd at ref lab"), 'specimen_accepted': row.get('specimen accepted by ref lab'), - # sample ID by ref lab + 'sequence_id': row.get('sample id by ref lab'), 'dna_concentration': row.get('dna concentration'), 'specimen_notes': row.get('specimen notes'), 'files': [], # TODO: implement creation of file db items 'requisition_type': row.get('req type'), - # research protocol name + 'research_protocol_name': row.get('research protocol name') 'date_requisition_received': row.get("date req rec'd"), 'ordering_physician': row.get('physician/provider'), 'physician_id': row.get('physician id'), 'indication': row.get('indication') } + if row.get('second specimen id'): + other_id = {'id': row['second specimen id'], 'id_type': proj_name} # add proj info? + if row.get('second specimen id type'): + other_id['id_type'] = row['second specimen id type'] + info['other_specimen_ids'] = [other_id] req_info = { 'accepted_rejected': row.get('req accepted y/n'), 'rejection_reason': row.get('reason rejected'), 'corrective_action': row.get('corrective action taken'), - # corrective action taken by + 'action_taken_by': row.get('corrective action taken by') 'date_sent': row.get('date sent'), 'date_completed': row.get('date completed'), 'notes': row.get('correction notes') @@ -197,7 +201,6 @@ def fetch_sample_metadata(row, items, indiv_alias, samp_alias, sp_alias, analysi new_items['sample_processing'].setdefault(analysis_alias, new_sp_item) new_items['sample_processing'][analysis_alias]['samples'].append(samp_alias) if row.get('report required').lower().startswith('y'): - print('report') new_items['reports'].append(samp_alias) if fam_alias not in new_items['sample_processing'][analysis_alias]['families']: new_items['sample_processing'][analysis_alias]['families'].append(fam_alias) @@ -248,27 +251,27 @@ def create_case_items(items, proj_name): return new_items -def create_sample_processing_groups(items, sp_alias): - new_items = items.copy() - for v in new_items['family'].values(): - if 'members' in v and len(v['members']) > 1: - # create sample_processing item - samples = [items['individual'][indiv].get('samples', [None])[0] for indiv in v['members']] - samples = [s for s in samples if s] - if len (samples) > 1: - sp = { - 'aliases': [sp_alias], - 'samples': samples - } - analysis_type = items['sample'][items['individual'][v['proband']]['samples'][0]]['workup_type'] - if all([relation in v for relation in ['proband', 'mother', 'father']]) and sorted( - v['members']) == sorted([v['proband'], v['mother'], v['father']] - ): - sp['analysis_type'] = analysis_type + '-Trio' - else: - sp['analysis_type'] = analysis_type + '-Group' - new_items['sample_processing'][sp_alias] = sp - return new_items +# def create_sample_processing_groups(items, sp_alias): +# new_items = items.copy() +# for v in new_items['family'].values(): +# if 'members' in v and len(v['members']) > 1: +# # create sample_processing item +# samples = [items['individual'][indiv].get('samples', [None])[0] for indiv in v['members']] +# samples 
= [s for s in samples if s] +# if len (samples) > 1: +# sp = { +# 'aliases': [sp_alias], +# 'samples': samples +# } +# analysis_type = items['sample'][items['individual'][v['proband']]['samples'][0]]['workup_type'] +# if all([relation in v for relation in ['proband', 'mother', 'father']]) and sorted( +# v['members']) == sorted([v['proband'], v['mother'], v['father']] +# ): +# sp['analysis_type'] = analysis_type + '-Trio' +# else: +# sp['analysis_type'] = analysis_type + '-Group' +# new_items['sample_processing'][sp_alias] = sp +# return new_items def compare_with_db(virtualapp, alias): From 9dced9b926c126f6423de506e8c685ef54c1cc75 Mon Sep 17 00:00:00 2001 From: Sarah Date: Fri, 10 Jul 2020 11:05:32 -0400 Subject: [PATCH 024/125] digestion of second individual id in submit.py --- src/encoded/submit.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index aefcfca689..de3953e9b7 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -79,7 +79,7 @@ def xls_to_json(xls_data, project, institution): fam_alias = '{}:family-{}'.format(project['name'], row['patient id']) # sp_alias = '{}:sampleproc-{}'.format(project['name'], row['specimen id']) # create items for Individual - items = fetch_individual_metadata(row, items, indiv_alias) + items = fetch_individual_metadata(row, items, indiv_alias, institution['name']) # create/edit items for Family items = fetch_family_metadata(row, items, indiv_alias, fam_alias) # create item for Sample if there is a specimen @@ -111,13 +111,18 @@ def xls_to_json(xls_data, project, institution): return items -def fetch_individual_metadata(row, items, indiv_alias): +def fetch_individual_metadata(row, items, indiv_alias, inst_name): new_items = items.copy() info = { 'aliases': [indiv_alias], 'individual_id': row['patient id'], 'sex': row.get('sex'), } + if row.get('other individual id'): + other_id = {'id': row['other individual id'], 'id_source': inst_name} + if row.get('other individual id type'): + other_id['id_source'] = row['other individual id source'] + info['institutional_id'] = other_id info['age'] = int(row['age']) if row.get('age') else None info['birth_year'] = int(row['birth year']) if row.get('birth year') else None if indiv_alias not in new_items['individual']: From b6dd846e6791e45510a722f5cfbe041d30f379d8 Mon Sep 17 00:00:00 2001 From: Sarah Date: Fri, 10 Jul 2020 16:45:42 -0400 Subject: [PATCH 025/125] added mapping of spreadsheet to cgap fields in submit.py --- src/encoded/submit.py | 93 +++++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 44 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index de3953e9b7..99fba865c7 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -12,12 +12,37 @@ import xlrd -BGM_FIELD_MAPPING = { - 'bcgg-id': 'patient id', - 'bcgg-f-id': 'family id', - "date req rec'd": 'date requisition received' +GENERIC_FIELD_MAPPING = { + 'individual': {'patient id': 'individual_id'}, + 'family': {}, + 'sample': { + 'date collected': 'specimen_collection_date', + 'location stored': 'specimen_storage_location', + 'specimen id': 'specimen_accession', + 'transport method': 'transported_by', + 'sequencing ref lab': 'sequencing_lab', + "date rec'd at ref lab": 'date_received', + 'specimen accepted by ref lab': 'specimen_accepted', + 'sample id by ref lab': 'sequence_id', + 'req type': 'requisition_type', + "date req rec'd": 'date_requisition_received', + 'physician/provider': 'ordering_physician' + }, 
+ 'requisition': { + 'req accepted y/n': 'accepted_rejected', + 'reason rejected': 'rejection_reason', + 'corrective action taken': 'corrective_action', + 'corrective action taken by': 'action_taken_by', + 'correction notes': 'notes' + } } +# BGM_FIELD_MAPPING = { +# 'bcgg-id': 'patient id', +# 'bcgg-f-id': 'family id', +# "date req rec'd": 'date requisition received' +# } + POST_ORDER = ['sample', 'individual', 'family', 'sample_processing', 'report', 'case'] @@ -51,6 +76,15 @@ def submit_data(context, request): raise NotImplementedError +def map_fields(row, metadata_dict, addl_fields, item_type): + for map_field in GENERIC_FIELD_MAPPING[item_type]: + if map_field in row: + metadata_dict[GENERIC_FIELD_MAPPING[item_type][map_field]] = row.get(map_field) + for field in addl_fields: + metadata_dict[field.replace('_', ' ')] = row.get(field) + return metadata_dict + + def xls_to_json(xls_data, project, institution): ''' Converts excel file to json for submission. @@ -113,18 +147,15 @@ def xls_to_json(xls_data, project, institution): def fetch_individual_metadata(row, items, indiv_alias, inst_name): new_items = items.copy() - info = { - 'aliases': [indiv_alias], - 'individual_id': row['patient id'], - 'sex': row.get('sex'), - } + info = {'aliases': [indiv_alias]} + info = map_fields(row, info, ['sex', 'age', 'birth_year'], 'individual') if row.get('other individual id'): other_id = {'id': row['other individual id'], 'id_source': inst_name} if row.get('other individual id type'): other_id['id_source'] = row['other individual id source'] info['institutional_id'] = other_id - info['age'] = int(row['age']) if row.get('age') else None - info['birth_year'] = int(row['birth year']) if row.get('birth year') else None + info['age'] = int(info['age']) if info.get('age') else None + info['birth_year'] = int(info['birth year']) if info.get('birth year') else None if indiv_alias not in new_items['individual']: new_items['individual'][indiv_alias] = {k: v for k, v in info.items() if v} else: @@ -155,44 +186,18 @@ def fetch_family_metadata(row, items, indiv_alias, fam_alias): def fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, fam_alias, proj_name): new_items = items.copy() - info = { - 'aliases': [samp_alias], - 'workup_type': row.get('workup type'), - 'specimen_type': row.get('specimen type'), - 'specimen_collection_date': row.get('date collected'), - 'specimen_storage_location': row.get('location stored'), - 'specimen_accession': row['specimen id'], - 'date_transported': row.get('date transported'), - 'transported_by': row.get('transport method'), - 'sent_by': row.get('sent by'), - 'sequencing_lab': row.get('sequencing ref lab'), - 'date_received': row.get("date rec'd at ref lab"), - 'specimen_accepted': row.get('specimen accepted by ref lab'), - 'sequence_id': row.get('sample id by ref lab'), - 'dna_concentration': row.get('dna concentration'), - 'specimen_notes': row.get('specimen notes'), - 'files': [], # TODO: implement creation of file db items - 'requisition_type': row.get('req type'), - 'research_protocol_name': row.get('research protocol name') - 'date_requisition_received': row.get("date req rec'd"), - 'ordering_physician': row.get('physician/provider'), - 'physician_id': row.get('physician id'), - 'indication': row.get('indication') - } + info = {'aliases': [samp_alias], 'files': []} # TODO: implement creation of file db items + fields = [ + 'workup_type', 'specimen_type', 'dna_concentration', 'date_transported', + 'specimen_notes', 'research_protocol_name', 'sent_by', 
'physician_id', 'indication' + ] + info = map_fields(row, info, fields, 'sample') if row.get('second specimen id'): other_id = {'id': row['second specimen id'], 'id_type': proj_name} # add proj info? if row.get('second specimen id type'): other_id['id_type'] = row['second specimen id type'] info['other_specimen_ids'] = [other_id] - req_info = { - 'accepted_rejected': row.get('req accepted y/n'), - 'rejection_reason': row.get('reason rejected'), - 'corrective_action': row.get('corrective action taken'), - 'action_taken_by': row.get('corrective action taken by') - 'date_sent': row.get('date sent'), - 'date_completed': row.get('date completed'), - 'notes': row.get('correction notes') - } + req_info = map_fields(row, {}, ['date sent', 'date completed'], 'requisition') info['requisition_acceptance'] = {k, v for k, v in req_info.items() if v} new_items['sample'][samp_alias] = {k: v for k, v in info.items() if v} if indiv_alias in new_items['individual']: From 40b8015821ae3e44cce486b5cea7fa0a201ea0a7 Mon Sep 17 00:00:00 2001 From: Sarah Date: Mon, 13 Jul 2020 16:19:05 -0400 Subject: [PATCH 026/125] edits to submit.py to fix testing bugs --- src/encoded/submit.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index 99fba865c7..2449b68411 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -13,7 +13,7 @@ GENERIC_FIELD_MAPPING = { - 'individual': {'patient id': 'individual_id'}, + 'individual': {}, 'family': {}, 'sample': { 'date collected': 'specimen_collection_date', @@ -81,7 +81,7 @@ def map_fields(row, metadata_dict, addl_fields, item_type): if map_field in row: metadata_dict[GENERIC_FIELD_MAPPING[item_type][map_field]] = row.get(map_field) for field in addl_fields: - metadata_dict[field.replace('_', ' ')] = row.get(field) + metadata_dict[field] = row.get(field.replace('_', ' ')) return metadata_dict @@ -109,8 +109,8 @@ def xls_to_json(xls_data, project, institution): } specimen_ids = {} for row in rows: - indiv_alias = '{}:individual-{}'.format(project['name'], row['patient id']) - fam_alias = '{}:family-{}'.format(project['name'], row['patient id']) + indiv_alias = '{}:individual-{}'.format(project['name'], row['individual id']) + fam_alias = '{}:family-{}'.format(project['name'], row['individual id']) # sp_alias = '{}:sampleproc-{}'.format(project['name'], row['specimen id']) # create items for Individual items = fetch_individual_metadata(row, items, indiv_alias, institution['name']) @@ -129,7 +129,7 @@ def xls_to_json(xls_data, project, institution): analysis_alias, fam_alias, project['name']) else: print('WARNING: No specimen id present for patient {},' - ' sample will not be created.'.format(row['patient id'])) + ' sample will not be created.'.format(row['individual id'])) # create SampleProcessing item for trio/group if needed # items = create_sample_processing_groups(items, sp_alias) items = create_case_items(items, project['name']) @@ -148,7 +148,7 @@ def xls_to_json(xls_data, project, institution): def fetch_individual_metadata(row, items, indiv_alias, inst_name): new_items = items.copy() info = {'aliases': [indiv_alias]} - info = map_fields(row, info, ['sex', 'age', 'birth_year'], 'individual') + info = map_fields(row, info, ['individual_id', 'sex', 'age', 'birth_year'], 'individual') if row.get('other individual id'): other_id = {'id': row['other individual id'], 'id_source': inst_name} if row.get('other individual id type'): @@ -192,13 +192,22 @@ def fetch_sample_metadata(row, 
items, indiv_alias, samp_alias, analysis_alias, f 'specimen_notes', 'research_protocol_name', 'sent_by', 'physician_id', 'indication' ] info = map_fields(row, info, fields, 'sample') + if info['specimen_accepted'].lower() == 'y': + info['specimen_accepted'] = 'Yes' + elif info['specimen_accepted'].lower() == 'n': + info['specimen_accepted'] = 'No' if row.get('second specimen id'): other_id = {'id': row['second specimen id'], 'id_type': proj_name} # add proj info? if row.get('second specimen id type'): other_id['id_type'] = row['second specimen id type'] info['other_specimen_ids'] = [other_id] req_info = map_fields(row, {}, ['date sent', 'date completed'], 'requisition') - info['requisition_acceptance'] = {k, v for k, v in req_info.items() if v} + if req_info['accepted_rejected'].lower() in ['yes', 'no', 'y', 'n']: + if req_info['accepted_rejected'].lower().startswith('y'): + req_info['accepted_rejected'] = 'Accepted' + else: + req_info['accepted_rejected'] = "Rejected" + info['requisition_acceptance'] = {k: v for k, v in req_info.items() if v} new_items['sample'][samp_alias] = {k: v for k, v in info.items() if v} if indiv_alias in new_items['individual']: new_items['individual'][indiv_alias]['samples'] = [samp_alias] @@ -245,7 +254,7 @@ def create_case_items(items, proj_name): indiv = [ikey for ikey, ival in items['individual'].items() if sample in ival.get('samples', [])][0] case_info = { 'aliases': [case_alias], - 'case_id': case_id, + # 'case_id': case_id, 'sample_processing': k, 'individual': indiv } From 3df3aa5844833a1d8690b0a73e5d64ebe8082a61 Mon Sep 17 00:00:00 2001 From: Sarah Date: Mon, 13 Jul 2020 16:45:26 -0400 Subject: [PATCH 027/125] edits to tests for submit.py --- src/encoded/submit.py | 8 +-- .../data/documents/cgap_submit_test.xlsx | Bin 13221 -> 13213 bytes src/encoded/tests/test_submit.py | 64 ++++++++---------- 3 files changed, 34 insertions(+), 38 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index 2449b68411..f85161eef0 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -110,7 +110,7 @@ def xls_to_json(xls_data, project, institution): specimen_ids = {} for row in rows: indiv_alias = '{}:individual-{}'.format(project['name'], row['individual id']) - fam_alias = '{}:family-{}'.format(project['name'], row['individual id']) + fam_alias = '{}:family-{}'.format(project['name'], row['family id']) # sp_alias = '{}:sampleproc-{}'.format(project['name'], row['specimen id']) # create items for Individual items = fetch_individual_metadata(row, items, indiv_alias, institution['name']) @@ -192,9 +192,9 @@ def fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, f 'specimen_notes', 'research_protocol_name', 'sent_by', 'physician_id', 'indication' ] info = map_fields(row, info, fields, 'sample') - if info['specimen_accepted'].lower() == 'y': + if info.get('specimen_accepted', '').lower() == 'y': info['specimen_accepted'] = 'Yes' - elif info['specimen_accepted'].lower() == 'n': + elif info.get('specimen_accepted', '').lower() == 'n': info['specimen_accepted'] = 'No' if row.get('second specimen id'): other_id = {'id': row['second specimen id'], 'id_type': proj_name} # add proj info? 
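The two hunks on either side of this point harden fetch_sample_metadata against partially filled rows: direct dictionary lookups become .get(..., '') so a missing column no longer blows up, and free-text yes/no answers are normalized to the controlled values the schema expects ('Yes'/'No' for specimen_accepted, 'Accepted'/'Rejected' for requisition_acceptance). A minimal standalone sketch of that normalization pattern, assuming only the stdlib (the helper name normalize_yes_no is illustrative and not part of submit.py):

    def normalize_yes_no(value, yes='Yes', no='No'):
        # Spreadsheet cells may contain 'y', 'Y', 'yes', 'n', 'No', or be missing/blank.
        value = str(value or '').strip().lower()
        if value.startswith('y'):
            return yes
        if value.startswith('n'):
            return no
        return None  # leave unrecognized answers unset rather than guessing

    # normalize_yes_no(row.get('specimen accepted by ref lab'))              -> 'Yes' / 'No' / None
    # normalize_yes_no(row.get('req accepted y/n'), 'Accepted', 'Rejected')  -> 'Accepted' / 'Rejected' / None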
@@ -202,7 +202,7 @@ def fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, f
             other_id['id_type'] = row['second specimen id type']
         info['other_specimen_ids'] = [other_id]
     req_info = map_fields(row, {}, ['date sent', 'date completed'], 'requisition')
-    if req_info['accepted_rejected'].lower() in ['yes', 'no', 'y', 'n']:
+    if req_info.get('accepted_rejected', '').lower() in ['yes', 'no', 'y', 'n']:
         if req_info['accepted_rejected'].lower().startswith('y'):
             req_info['accepted_rejected'] = 'Accepted'
         else:
diff --git a/src/encoded/tests/data/documents/cgap_submit_test.xlsx b/src/encoded/tests/data/documents/cgap_submit_test.xlsx
index 0e3409a68f19e8f22e4379178b33a7c180d15e45..de1a55de5fd1af955829d96e04f80ff8a679b648 100644
GIT binary patch
[delta 4960 and delta 4967 base85-encoded binary payloads omitted]

From: Sarah
Date: Tue, 14 Jul 2020 15:21:34 -0400
Subject: [PATCH 028/125] more tests for submit.py functions

---
 src/encoded/tests/test_submit.py | 123 
+++++++++++++++++++++++++------ 1 file changed, 102 insertions(+), 21 deletions(-) diff --git a/src/encoded/tests/test_submit.py b/src/encoded/tests/test_submit.py index f4b14cde0a..2f4a0c06c7 100644 --- a/src/encoded/tests/test_submit.py +++ b/src/encoded/tests/test_submit.py @@ -1,8 +1,6 @@ import pytest from encoded.submit import * import json -# from pyramid.paster import get_app -# from dcicutils.misc_utils import VirtualApp @pytest.fixture @@ -59,6 +57,69 @@ def submission_info3(submission_info2): return info +@pytest.fixture +def sample_info(): + return { + 'workup type': 'WES', + 'specimen id': '9034', + 'date collected': '2020-01-06' + } + + +@pytest.fixture +def example_rows(): + return [ + {'individual id': '456', 'analysis id': '1111', 'relation to proband': 'proband'}, + {'individual id': '123', 'analysis id': '1111', 'relation to proband': 'mother'}, + {'individual id': '789', 'analysis id': '1111', 'relation to proband': 'father'}, + {'individual id': '456', 'analysis id': '2222', 'relation to proband': 'proband'}, + {'individual id': '555', 'analysis id': '3333', 'relation to proband': 'proband'}, + {'individual id': '546', 'analysis id': '3333', 'relation to proband': 'mother'} + ] + + +@pytest.fixture +def new_family(child, mother, father): + return { + "title": "Smith family", + "proband": child['@id'], + "members": [ + child['@id'], + mother['@id'], + father['@id'] + ] + } + + +@pytest.fixture +def aunt(testapp, project, institution): + item = { + "accession": "GAPIDAUNT001", + "age": 35, + "age_units": "year", + 'project': project['@id'], + 'institution': institution['@id'], + "sex": "F" + } + return testapp.post_json('/individual', item).json['@graph'][0] + + +def test_map_fields(sample_info): + result = map_fields(sample_info, {}, ['workup_type'], 'sample') + assert result['workup_type'] == 'WES' + assert result['specimen_accession'] == '9034' + assert result['specimen_collection_date'] == '2020-01-06' + assert not result.get('sequencing_lab') + + +def test_create_families(example_rows): + fams = create_families(example_rows) + assert sorted(list(fams.keys())) == ['1111', '2222', '3333'] + assert fams['1111'] == 'family-456' + assert fams['2222'] == 'family-456' + assert fams['3333'] == 'family-555' + + def test_fetch_individual_metadata_new(row_dict, empty_items): items_out = fetch_individual_metadata(row_dict, empty_items, 'test-proj:indiv1', 'hms-dbmi') assert items_out['individual']['test-proj:indiv1']['aliases'] == ['test-proj:indiv1'] @@ -109,32 +170,52 @@ def test_fetch_sample_metadata_sp(row_dict, empty_items): assert items_out['individual']['test-proj:indiv1']['samples'] == ['test-proj:samp1'] -# def test_create_sample_processing_groups_grp(submission_info2): -# items_out = create_sample_processing_groups(submission_info2, 'test-proj:sp-multi') -# assert items_out['sample_processing']['test-proj:sp-multi']['analysis_type'] == 'WGS-Group' -# assert len(items_out['sample_processing']['test-proj:sp-multi']['samples']) == 2 -# -# -# def test_create_sample_processing_groups_one(submission_info): -# items_out = create_sample_processing_groups(submission_info, 'test-proj:sp-single') -# assert not items_out['sample_processing'] -# -# -# def test_create_sample_processing_groups_trio(submission_info3): -# items_out = create_sample_processing_groups(submission_info3, 'test-proj:sp-multi') -# assert items_out['sample_processing']['test-proj:sp-multi']['analysis_type'] == 'WGS-Group' -# submission_info3['family']['test-proj:fam1']['father'] = 'test-proj:indiv3' -# 
items_out = create_sample_processing_groups(submission_info3, 'test-proj:sp-multi') -# assert items_out['sample_processing']['test-proj:sp-multi']['analysis_type'] == 'WGS-Trio' - - def test_xls_to_json(project, institution): json_out = xls_to_json('src/encoded/tests/data/documents/cgap_submit_test.xlsx', project, institution) assert len(json_out['family']) == 1 + assert 'encode-project:family-456' in json_out['family'] assert len(json_out['individual']) == 3 assert all(['encode-project:individual-' + x in json_out['individual'] for x in ['123', '456', '789']]) +def test_parse_exception_invalid_alias(testapp, a_case): + a_case['invalid_field'] = 'value' + a_case['project'] = '/projects/invalid-project/' + try: + testapp.post_json('/case', a_case) + except Exception as e: + errors = parse_exception(e, ['/projects/other-project/']) + assert len(errors) == 2 + assert 'Additional properties are not allowed' in ''.join(errors) + assert 'not found' in ''.join(errors) + + +def test_parse_exception_with_alias(testapp, a_case): + a_case['project'] = '/projects/invalid-project/' + errors = None + try: + testapp.post_json('/case', a_case) + except Exception as e: + errors = parse_exception(e, ['/projects/invalid-project/']) + assert errors == [] + + +def test_compare_fields_same(testapp, fam, new_family): + profile = testapp.get('/profiles/family.json').json + result = compare_fields(profile, [], new_family, fam) + assert not result + + +def test_compare_fields_different(testapp, aunt, fam, new_family): + new_family['members'].append(aunt['@id']) + new_family['title'] = 'Smythe family' + profile = testapp.get('/profiles/family.json').json + result = compare_fields(profile, [], new_family, fam) + assert len(result) == 2 + assert 'title' in result + assert len(result['members']) == len(fam['members']) + 1 + + def test_validate_item_post_valid(testapp, a_case): result = validate_item(testapp, a_case, 'post', 'case', []) assert not result From 9f1a820691bea34ccde6f0bd78530008ba79a63e Mon Sep 17 00:00:00 2001 From: Sarah Date: Tue, 14 Jul 2020 15:22:30 -0400 Subject: [PATCH 029/125] edits to submit.py for minor bugs found via unit tests --- src/encoded/submit.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index f85161eef0..3ec4daae74 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -108,9 +108,10 @@ def xls_to_json(xls_data, project, institution): 'case': {}, 'report': {}, 'reports': [] } specimen_ids = {} + family_dict = create_families(rows) for row in rows: indiv_alias = '{}:individual-{}'.format(project['name'], row['individual id']) - fam_alias = '{}:family-{}'.format(project['name'], row['family id']) + fam_alias = '{}:{}'.format(project['name'], family_dict[row['analysis id']]) # sp_alias = '{}:sampleproc-{}'.format(project['name'], row['specimen id']) # create items for Individual items = fetch_individual_metadata(row, items, indiv_alias, institution['name']) @@ -145,6 +146,12 @@ def xls_to_json(xls_data, project, institution): return items +def create_families(rows): + proband_rows = [row for row in rows if row.get('relation to proband').lower() == 'proband'] + fams = {row.get('analysis id'): 'family-{}'.format(row.get('individual id')) for row in proband_rows} + return fams + + def fetch_individual_metadata(row, items, indiv_alias, inst_name): new_items = items.copy() info = {'aliases': [indiv_alias]} @@ -372,9 +379,11 @@ def compare_fields(profile, aliases, json_item, db_item): else: val = [v 
for v in json_item[field]] if sorted(val) != sorted(db_item.get(field, [])): - if len(val) == 1 and val not in db_item.get(field, []): + # if len(val) == 1 and val not in db_item.get(field, []): + # continue + if all(v in db_item.get(field, []) for v in val): continue - new_val = db_item.get(field, []) + new_val = [item for item in db_item.get(field, [])] new_val.extend(val) to_patch[field] = list(set(new_val)) return to_patch From 18330aa56b63eb580590776b4c40c4bbf162ee5f Mon Sep 17 00:00:00 2001 From: Sarah Date: Tue, 14 Jul 2020 16:05:31 -0400 Subject: [PATCH 030/125] parsing of analysis type for sample_processing in submit.py --- src/encoded/submit.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index 3ec4daae74..8a6558288d 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -109,6 +109,7 @@ def xls_to_json(xls_data, project, institution): } specimen_ids = {} family_dict = create_families(rows) + a_types = get_analysis_types(rows) for row in rows: indiv_alias = '{}:individual-{}'.format(project['name'], row['individual id']) fam_alias = '{}:{}'.format(project['name'], family_dict[row['analysis id']]) @@ -126,8 +127,8 @@ def xls_to_json(xls_data, project, institution): else: specimen_ids[row['specimen id']] = 1 analysis_alias = '{}:analysis-{}'.format(project['name'], row['analysis id']) - items = fetch_sample_metadata(row, items, indiv_alias, samp_alias, - analysis_alias, fam_alias, project['name']) + items = fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, + fam_alias, project['name'], a_types) else: print('WARNING: No specimen id present for patient {},' ' sample will not be created.'.format(row['individual id'])) @@ -152,6 +153,26 @@ def create_families(rows): return fams +def get_analysis_types(rows): + analysis_relations = {} + analysis_types = {} + for row in rows: + analysis_relations.setdefault(row.get('analysis id'), [[], []]) + analysis_relations[row.get('analysis id')][0].append(row.get('relation to proband', '').lower()) + analysis_relations[row.get('analysis id')][1].append(row.get('workup type', '').upper()) + for k, v in analysis_relations.items(): + if len(list(set(v[1]))) == 1: + if len(v[0]) == 1: + analysis_types[k] = v[1][0] + elif sorted(v[0]) == ['father', 'mother', 'proband']: + analysis_types[k] = v[1][0] + '-Trio' + else: + analysis_types[k] = v[1][0] + '-Group' + else: + analysis_types[k] = None + return analysis_types + + def fetch_individual_metadata(row, items, indiv_alias, inst_name): new_items = items.copy() info = {'aliases': [indiv_alias]} @@ -191,7 +212,7 @@ def fetch_family_metadata(row, items, indiv_alias, fam_alias): return new_items -def fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, fam_alias, proj_name): +def fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, fam_alias, proj_name, analysis_type_dict): new_items = items.copy() info = {'aliases': [samp_alias], 'files': []} # TODO: implement creation of file db items fields = [ @@ -224,6 +245,8 @@ def fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, f 'samples': [], 'families': [] } + if row.get('analysis id') in analysis_type_dict: + new_sp_item['analysis_type'] = analysis_type_dict[row.get('analysis id')] new_items['sample_processing'].setdefault(analysis_alias, new_sp_item) new_items['sample_processing'][analysis_alias]['samples'].append(samp_alias) if row.get('report 
required').lower().startswith('y'): From a4cffbd8722b63db844e35391ce1ca469eca99d0 Mon Sep 17 00:00:00 2001 From: Sarah Date: Tue, 14 Jul 2020 16:05:59 -0400 Subject: [PATCH 031/125] test for analysis type parsing in submit.py --- src/encoded/tests/test_submit.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/encoded/tests/test_submit.py b/src/encoded/tests/test_submit.py index 2f4a0c06c7..e51931f4b0 100644 --- a/src/encoded/tests/test_submit.py +++ b/src/encoded/tests/test_submit.py @@ -69,12 +69,12 @@ def sample_info(): @pytest.fixture def example_rows(): return [ - {'individual id': '456', 'analysis id': '1111', 'relation to proband': 'proband'}, - {'individual id': '123', 'analysis id': '1111', 'relation to proband': 'mother'}, - {'individual id': '789', 'analysis id': '1111', 'relation to proband': 'father'}, - {'individual id': '456', 'analysis id': '2222', 'relation to proband': 'proband'}, - {'individual id': '555', 'analysis id': '3333', 'relation to proband': 'proband'}, - {'individual id': '546', 'analysis id': '3333', 'relation to proband': 'mother'} + {'individual id': '456', 'analysis id': '1111', 'relation to proband': 'proband', 'workup type': 'WGS'}, + {'individual id': '123', 'analysis id': '1111', 'relation to proband': 'mother', 'workup type': 'WGS'}, + {'individual id': '789', 'analysis id': '1111', 'relation to proband': 'father', 'workup type': 'WGS'}, + {'individual id': '456', 'analysis id': '2222', 'relation to proband': 'proband', 'workup type': 'WGS'}, + {'individual id': '555', 'analysis id': '3333', 'relation to proband': 'proband', 'workup type': 'WES'}, + {'individual id': '546', 'analysis id': '3333', 'relation to proband': 'mother', 'workup type': 'WES'} ] @@ -120,6 +120,16 @@ def test_create_families(example_rows): assert fams['3333'] == 'family-555' +def test_get_analysis_types(example_rows): + a_types = get_analysis_types(example_rows) + assert a_types['1111'] == 'WGS-Trio' + assert a_types['2222'] == 'WGS' + assert a_types['3333'] == 'WES-Group' + example_rows[1]['workup type'] = 'WES' + new_a_types = get_analysis_types(example_rows) + assert new_a_types['1111'] is None + + def test_fetch_individual_metadata_new(row_dict, empty_items): items_out = fetch_individual_metadata(row_dict, empty_items, 'test-proj:indiv1', 'hms-dbmi') assert items_out['individual']['test-proj:indiv1']['aliases'] == ['test-proj:indiv1'] From 24abbec3b7469417f9b2ee0e2558380580f3bcf5 Mon Sep 17 00:00:00 2001 From: Sarah Date: Wed, 15 Jul 2020 14:17:49 -0400 Subject: [PATCH 032/125] mother and father relations on individual added in submit.py --- src/encoded/submit.py | 95 ++++++++++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 38 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index 3ec4daae74..cb44b49afd 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -109,6 +109,7 @@ def xls_to_json(xls_data, project, institution): } specimen_ids = {} family_dict = create_families(rows) + a_types = get_analysis_types(rows) for row in rows: indiv_alias = '{}:individual-{}'.format(project['name'], row['individual id']) fam_alias = '{}:{}'.format(project['name'], family_dict[row['analysis id']]) @@ -126,13 +127,14 @@ def xls_to_json(xls_data, project, institution): else: specimen_ids[row['specimen id']] = 1 analysis_alias = '{}:analysis-{}'.format(project['name'], row['analysis id']) - items = fetch_sample_metadata(row, items, indiv_alias, samp_alias, - analysis_alias, fam_alias, 
project['name']) + items = fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, + fam_alias, project['name'], a_types) else: print('WARNING: No specimen id present for patient {},' ' sample will not be created.'.format(row['individual id'])) # create SampleProcessing item for trio/group if needed # items = create_sample_processing_groups(items, sp_alias) + items = add_relations(items) items = create_case_items(items, project['name']) # removed unused fields, add project and institution for val1 in items.values(): @@ -152,6 +154,26 @@ def create_families(rows): return fams +def get_analysis_types(rows): + analysis_relations = {} + analysis_types = {} + for row in rows: + analysis_relations.setdefault(row.get('analysis id'), [[], []]) + analysis_relations[row.get('analysis id')][0].append(row.get('relation to proband', '').lower()) + analysis_relations[row.get('analysis id')][1].append(row.get('workup type', '').upper()) + for k, v in analysis_relations.items(): + if len(list(set(v[1]))) == 1: + if len(v[0]) == 1: + analysis_types[k] = v[1][0] + elif sorted(v[0]) == ['father', 'mother', 'proband']: + analysis_types[k] = v[1][0] + '-Trio' + else: + analysis_types[k] = v[1][0] + '-Group' + else: + analysis_types[k] = None + return analysis_types + + def fetch_individual_metadata(row, items, indiv_alias, inst_name): new_items = items.copy() info = {'aliases': [indiv_alias]} @@ -179,19 +201,17 @@ def fetch_family_metadata(row, items, indiv_alias, fam_alias): 'family_id': row['family id'], 'members': [indiv_alias] } - if row.get('relation to proband', '').lower() == 'proband': - info['proband'] = indiv_alias if fam_alias not in new_items['family']: new_items['family'][fam_alias] = info - else: - if indiv_alias not in new_items['family'][fam_alias]['members']: - new_items['family'][fam_alias]['members'].append(indiv_alias) - if row.get('relation to proband', '').lower() == 'proband' and 'proband' not in new_items['family'][fam_alias]: - new_items['family'][fam_alias]['proband'] = indiv_alias + if indiv_alias not in new_items['family'][fam_alias]['members']: + new_items['family'][fam_alias]['members'].append(indiv_alias) + for relation in ['proband', 'mother', 'father', 'brother', 'sister', 'sibling']: + if row.get('relation to proband', '').lower() == relation and relation not in new_items['family'][fam_alias]: + new_items['family'][fam_alias][relation] = indiv_alias return new_items -def fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, fam_alias, proj_name): +def fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, fam_alias, proj_name, analysis_type_dict): new_items = items.copy() info = {'aliases': [samp_alias], 'files': []} # TODO: implement creation of file db items fields = [ @@ -224,6 +244,8 @@ def fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, f 'samples': [], 'families': [] } + if row.get('analysis id') in analysis_type_dict: + new_sp_item['analysis_type'] = analysis_type_dict[row.get('analysis id')] new_items['sample_processing'].setdefault(analysis_alias, new_sp_item) new_items['sample_processing'][analysis_alias]['samples'].append(samp_alias) if row.get('report required').lower().startswith('y'): @@ -277,27 +299,24 @@ def create_case_items(items, proj_name): return new_items -# def create_sample_processing_groups(items, sp_alias): -# new_items = items.copy() -# for v in new_items['family'].values(): -# if 'members' in v and len(v['members']) > 1: -# # create sample_processing item -# 
samples = [items['individual'][indiv].get('samples', [None])[0] for indiv in v['members']] -# samples = [s for s in samples if s] -# if len (samples) > 1: -# sp = { -# 'aliases': [sp_alias], -# 'samples': samples -# } -# analysis_type = items['sample'][items['individual'][v['proband']]['samples'][0]]['workup_type'] -# if all([relation in v for relation in ['proband', 'mother', 'father']]) and sorted( -# v['members']) == sorted([v['proband'], v['mother'], v['father']] -# ): -# sp['analysis_type'] = analysis_type + '-Trio' -# else: -# sp['analysis_type'] = analysis_type + '-Group' -# new_items['sample_processing'][sp_alias] = sp -# return new_items +def add_relations(items): + new_items = items.copy() + for alias, fam in items['family'].items(): + parents = False + for relation in ['mother', 'father']: + if fam.get(relation): + if fam.get('proband'): + new_items['individual'][fam['proband']][relation] = fam[relation] + parents = True + del new_items['family'][alias][relation] + for relation in ['brother', 'sister', 'sibling']: + if fam.get(relation): + if parents: + for parent in ['mother', 'father']: + if new_items['individual'][fam['proband']].get(parent): + new_items['individual'][fam[relation]][parent] = new_items['individual'][fam['proband']][parent] + del new_items['family'][alias][relation] + return new_items def compare_with_db(virtualapp, alias): @@ -374,18 +393,18 @@ def compare_fields(profile, aliases, json_item, db_item): to_patch[field] = val else: # if array, patch field vals get added to what's in db - if field != 'aliases': + if field != 'aliases' and profile['properties'][field].get('items', {}).get('linkTo'): val = [aliases[v] if v in aliases else v for v in json_item[field]] else: val = [v for v in json_item[field]] - if sorted(val) != sorted(db_item.get(field, [])): + # if sorted(val) != sorted(db_item.get(field, [])): # if len(val) == 1 and val not in db_item.get(field, []): # continue - if all(v in db_item.get(field, []) for v in val): - continue - new_val = [item for item in db_item.get(field, [])] - new_val.extend(val) - to_patch[field] = list(set(new_val)) + if all(v in db_item.get(field, []) for v in val): + continue + new_val = [item for item in db_item.get(field, [])] + new_val.extend(val) + to_patch[field] = list(set(new_val)) return to_patch From 381f61e9a45dfe2ddbc04f25e4f008bc4f735944 Mon Sep 17 00:00:00 2001 From: Sarah Date: Thu, 16 Jul 2020 17:10:11 -0400 Subject: [PATCH 033/125] addition of file metadata in submit.py to allow creation of File items --- src/encoded/submit.py | 103 +++++++++++++++++++++++-------- src/encoded/tests/test_submit.py | 3 +- 2 files changed, 78 insertions(+), 28 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index cb44b49afd..84e9d8ae78 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -44,7 +44,10 @@ # } -POST_ORDER = ['sample', 'individual', 'family', 'sample_processing', 'report', 'case'] +POST_ORDER = [ + 'file_fastq', 'file_processed', 'sample', 'individual', + 'family', 'sample_processing', 'report', 'case' +] LINKS = [ @@ -105,8 +108,10 @@ def xls_to_json(xls_data, project, institution): items = { 'individual': {}, 'family': {}, 'sample': {}, 'sample_processing': {}, - 'case': {}, 'report': {}, 'reports': [] + 'file_fastq': {}, 'file_processed': {}, 'case': {}, 'report': {}, + 'reports': [] } + file_errors = [] specimen_ids = {} family_dict = create_families(rows) a_types = get_analysis_types(rows) @@ -119,7 +124,7 @@ def xls_to_json(xls_data, project, institution): # create/edit 
items for Family items = fetch_family_metadata(row, items, indiv_alias, fam_alias) # create item for Sample if there is a specimen - if row['specimen id']: + if row.get('specimen id'): samp_alias = '{}:sample-{}'.format(project['name'], row['specimen id']) if row['specimen id'] in specimen_ids: samp_alias = samp_alias + '-' + specimen_ids[row['specimen id']] @@ -129,6 +134,11 @@ def xls_to_json(xls_data, project, institution): analysis_alias = '{}:analysis-{}'.format(project['name'], row['analysis id']) items = fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, fam_alias, project['name'], a_types) + if row.get('files'): + file_items = fetch_file_metadata(row['files'].split(','), project['name']) + file_errors.extend(file_items['errors']) + items['file_fastq'].update(file_items['file_fastq']) + items['file_processed'].update(file_items['file_processed']) else: print('WARNING: No specimen id present for patient {},' ' sample will not be created.'.format(row['individual id'])) @@ -144,7 +154,7 @@ def xls_to_json(xls_data, project, institution): del val2[key] val2['project'] = project['@id'] val2['institution'] = institution['@id'] - + items['file_errors'] = file_errors return items @@ -256,17 +266,36 @@ def fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, f # TODO: finish implementing this function -def fetch_file_metadata(filenames): - files = [] +def fetch_file_metadata(filenames, proj_name): + valid_extensions = { + '.fastq.gz': ('fastq', 'reads'), + '.fq.gz': ('fastq', 'reads'), + '.cram': ('cram', 'alignments'), + '.vcf.gz': ('vcf_gz', 'raw VCF') + } + files = {'file_fastq': {}, 'file_processed': {}, 'errors': []} for filename in filenames: + extension = [ext for ext in valid_extensions if filename.endswith(ext)] + if not extension: + if [ext for ext in ['.fastq', '.fq', '.vcf'] if filename.endswith(ext)]: + files['errors'].append('File must be compressed - please gzip file {}'.format(filename)) + else: + files['errors'].append('File extension on {} not supported - expecting one of: ' + '.fastq.gz, .fq.gz, .cram, .vcf.gz'.format(filename)) + continue + file_alias = '{}:{}'.format(proj_name, filename.lstrip(' ')) + fmt = valid_extensions[extension[0]][0] file_info = { - 'aliases': [], - 'file_format': '', - 'file_type': '', - 'filename': '' + 'aliases': [file_alias], + 'file_format': '/file-formats/{}/'.format(fmt), + 'file_type': valid_extensions[extension[0]][1], + 'filename': filename # causes problems without functional file upload } - files.append(file_info) - raise NotImplementedError + if fmt == 'fastq': + files['file_fastq'][file_alias] = file_info + else: + files['file_processed'][file_alias] = file_info + return files def create_case_items(items, proj_name): @@ -386,9 +415,8 @@ def compare_fields(profile, aliases, json_item, db_item): # if not an array, patch field gets overwritten (if different from db) if profile['properties'][field]['type'] != 'array': val = json_item[field] - if isinstance(val, str): - if val in aliases: - val = aliases[val] + if profile['properties'][field]['type'] == 'string' and val in aliases: + val = aliases[val] if val != db_item.get(field): to_patch[field] = val else: @@ -404,7 +432,10 @@ def compare_fields(profile, aliases, json_item, db_item): continue new_val = [item for item in db_item.get(field, [])] new_val.extend(val) - to_patch[field] = list(set(new_val)) + try: + to_patch[field] = list(set(new_val)) + except TypeError: # above doesn't handle list of dictionaries + to_patch[field] = [dict(t) 
for t in {tuple(d.items()) for d in new_val}] return to_patch @@ -422,7 +453,7 @@ def validate_all_items(virtualapp, json_data): written or tested. ''' alias_dict = {} - errors = [] + errors = json_data['file_errors'] all_aliases = [k for itype in json_data for k in json_data[itype]] json_data_final = {'post': {}, 'patch': {}} validation_results = {} @@ -431,10 +462,17 @@ def validate_all_items(virtualapp, json_data): if itemtype in json_data: profile = virtualapp.get('/profiles/{}.json'.format(itemtype)).json validation_results[itemtype] = {'validated': 0, 'errors': 0} + db_results = {} for alias in json_data[itemtype]: - # TODO : format fields (e.g. int, list, etc.) - result = compare_with_db(virtualapp, alias) - if not result: + # first collect all atids before comparing and validating items + db_result = compare_with_db(virtualapp, alias) + if db_result: + alias_dict[alias] = db_result['@id'] + db_results[alias] = db_result + for alias in json_data[itemtype]: + if 'filename' in json_data[itemtype][alias]: # until we have functional file upload + del json_data[itemtype][alias]['filename'] + if not db_results.get(alias): error = validate_item(virtualapp, json_data[itemtype][alias], 'post', itemtype, all_aliases) if error: # modify to check for presence of validation errors # do something to report validation errors @@ -442,25 +480,32 @@ def validate_all_items(virtualapp, json_data): for e in error: errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) validation_results[itemtype]['errors'] += 1 + elif json_data[itemtype][alias].get('filename') and \ + json_data[itemtype][alias]['filename'] in ''.join(json_data['file_errors']): + validation_results[itemtype]['errors'] += 1 else: json_data_final['post'].setdefault(itemtype, []) json_data_final['post'][itemtype].append(json_data[itemtype][alias]) validation_results[itemtype]['validated'] += 1 else: # patch if item exists in db - alias_dict[alias] = result['@id'] - patch_data = compare_fields(profile, alias_dict, json_data[itemtype][alias], result) - error = validate_item(virtualapp, patch_data, 'patch', itemtype, all_aliases, atid=result['@id']) + # alias_dict[alias] = results[alias]['@id'] + patch_data = compare_fields(profile, alias_dict, json_data[itemtype][alias], db_results[alias]) + error = validate_item(virtualapp, patch_data, 'patch', itemtype, + all_aliases, atid=db_results[alias]['@id']) if error: # do something to report validation errors if itemtype not in ['case', 'report']: for e in error: errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) validation_results[itemtype]['errors'] += 1 + elif json_data[itemtype][alias].get('filename') and \ + json_data[itemtype][alias]['filename'] in ''.join(json_data['file_errors']): + validation_results[itemtype]['errors'] += 1 else: # patch json_data_final['patch'].setdefault(itemtype, {}) if patch_data: - json_data_final['patch'][itemtype][result['@id']] = patch_data - else: + json_data_final['patch'][itemtype][db_results[alias]['@id']] = patch_data + elif itemtype not in ['case', 'report']: output.append('{} {} - Item already in database, no changes needed'.format(itemtype, alias)) # do something to record response validation_results[itemtype]['validated'] += 1 @@ -489,6 +534,8 @@ def post_and_patch_all_items(virtualapp, json_data_final): final_status[k] = {'posted': 0, 'not posted': 0, 'patched': 0, 'not patched': 0} for item in v: patch_info = {} + # if 'filename' in item: # until we have functional file upload + # del item['filename'] for field in LINKS: if 
field in item: patch_info[field] = item[field] @@ -507,7 +554,7 @@ def post_and_patch_all_items(virtualapp, json_data_final): final_status[k]['not posted'] += 1 except Exception as e: final_status[k]['not posted'] += 1 - output.append(e) + output.append(str(e)) for itype in final_status: if final_status[itype]['posted'] > 0 or final_status[itype]['not posted'] > 0: output.append('{}: {} items posted successfully; {} items not posted'.format( @@ -516,6 +563,8 @@ def post_and_patch_all_items(virtualapp, json_data_final): for k, v in json_data_final['patch'].items(): final_status.setdefault(k, {'patched': 0, 'not patched': 0}) for item_id, patch_data in v.items(): + # if 'filename' in patch_data: # until we have functional file upload + # del patch_data['filename'] try: response = virtualapp.patch_json('/' + item_id, patch_data, status=200) if response.json['status'] == 'success': @@ -526,7 +575,7 @@ def post_and_patch_all_items(virtualapp, json_data_final): final_status[k]['not patched'] += 1 except Exception as e: final_status[k]['not patched'] += 1 - output.append(e) + output.append(str(e)) if final_status[k]['patched'] > 0 or final_status[k]['not patched'] > 0: output.append('{}: {} items patched successfully; {} items not patched'.format( k, final_status[k]['patched'], final_status[k]['not patched'] diff --git a/src/encoded/tests/test_submit.py b/src/encoded/tests/test_submit.py index e51931f4b0..141ac7bf4a 100644 --- a/src/encoded/tests/test_submit.py +++ b/src/encoded/tests/test_submit.py @@ -173,7 +173,8 @@ def test_fetch_sample_metadata_sp(row_dict, empty_items): items = empty_items.copy() items['individual'] = {'test-proj:indiv1': {}} items_out = fetch_sample_metadata( - row_dict, items, 'test-proj:indiv1', 'test-proj:samp1', 'test-proj:sp1', 'test-proj:fam1', 'test-proj' + row_dict, items, 'test-proj:indiv1', 'test-proj:samp1', + 'test-proj:sp1', 'test-proj:fam1', 'test-proj', {} ) assert items_out['sample']['test-proj:samp1']['specimen_accession'] == row_dict['specimen id'] assert items_out['sample_processing']['test-proj:sp1']['samples'] == ['test-proj:samp1'] From 498ce003bacc3a2e264e75a227d000bac6f4592f Mon Sep 17 00:00:00 2001 From: Sarah Date: Fri, 17 Jul 2020 11:01:57 -0400 Subject: [PATCH 034/125] added tests for fetch_file_metadata in submit.py --- src/encoded/tests/test_submit.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/encoded/tests/test_submit.py b/src/encoded/tests/test_submit.py index 141ac7bf4a..194a60d963 100644 --- a/src/encoded/tests/test_submit.py +++ b/src/encoded/tests/test_submit.py @@ -181,6 +181,34 @@ def test_fetch_sample_metadata_sp(row_dict, empty_items): assert items_out['individual']['test-proj:indiv1']['samples'] == ['test-proj:samp1'] +def test_fetch_file_metadata_valid(): + results = fetch_file_metadata(['f1.fastq.gz', 'f2.cram', 'f3.vcf.gz'], 'test-proj') + assert 'test-proj:f1.fastq.gz' in results['file_fastq'] + assert results['file_fastq']['test-proj:f1.fastq.gz']['file_format'] == '/file-formats/fastq/' + assert results['file_fastq']['test-proj:f1.fastq.gz']['file_type'] == 'reads' + assert 'test-proj:f2.cram' in results['file_processed'] + assert 'test-proj:f3.vcf.gz' in results['file_processed'] + assert not results['errors'] + + +def test_fetch_file_metadata_uncompressed(): + results = fetch_file_metadata(['f1.fastq', 'f2.cram', 'f3.vcf'], 'test-proj') + assert not results['file_fastq'] + assert 'test-proj:f2.cram' in results['file_processed'] + assert 'test-proj:f3.vcf' not in 
results['file_processed'] + assert len(results['errors']) == 2 + assert all('File must be compressed' in error for error in results['errors']) + + +def test_fetch_file_metadata_invalid(): + results = fetch_file_metadata(['f3.gvcf.gz'], 'test-proj') + assert all(not results[key] for key in ['file_fastq', 'file_processed']) + assert results['errors'] == [ + 'File extension on f3.gvcf.gz not supported - ' + 'expecting one of: .fastq.gz, .fq.gz, .cram, .vcf.gz' + ] + + def test_xls_to_json(project, institution): json_out = xls_to_json('src/encoded/tests/data/documents/cgap_submit_test.xlsx', project, institution) assert len(json_out['family']) == 1 From 94800dc2d6cec490ac9f0974401f1015ce830d41 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Tue, 21 Jul 2020 13:39:10 -0400 Subject: [PATCH 035/125] Keep minor version ahead of master. --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 66aed2d77e..ff832caa9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] # Note: Various modules refer to this system as "encoded", not "cgap-portal". name = "encoded" -version = "2.3.0" +version = "2.4.0" description = "Clinical Genomics Analysis Platform" authors = ["4DN-DCIC Team "] license = "MIT" From e8daaebb38a4e5e6b245356f91d116f3b16b0440 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Sun, 26 Jul 2020 05:43:49 -0400 Subject: [PATCH 036/125] WIP: This is just barely working on the rosy case without permissions or logging or progress info etc. --- Makefile | 6 + pyproject.toml | 2 +- src/encoded/common.py | 92 +++++++ src/encoded/dev_servers.py | 63 +++-- src/encoded/ingestion_engines.py | 109 ++++++++ src/encoded/ingestion_listener.py | 249 ++++++++++++++++-- src/encoded/renderers.py | 11 +- src/encoded/root.py | 4 +- src/encoded/submit.py | 70 ++++- .../cgap_submit_test_with_errors.xlsx | Bin 0 -> 13220 bytes .../tests/test_generate_item_from_owl.py | 2 +- src/encoded/util.py | 109 +++++++- 12 files changed, 660 insertions(+), 57 deletions(-) create mode 100644 src/encoded/common.py create mode 100644 src/encoded/ingestion_engines.py create mode 100644 src/encoded/tests/data/documents/cgap_submit_test_with_errors.xlsx diff --git a/Makefile b/Makefile index 2af9cca922..e8efbc2f6c 100644 --- a/Makefile +++ b/Makefile @@ -62,6 +62,12 @@ download-genes: # grabs latest gene list from the below link, unzips and drops i deploy1: # starts postgres/ES locally and loads inserts @SNOVAULT_DB_TEST_PORT=`grep 'sqlalchemy[.]url =' development.ini | sed -E 's|.*:([0-9]+)/.*|\1|'` dev-servers development.ini --app-name app --clear --init --load +deploy1a: # starts postgres/ES locally and loads inserts + @SNOVAULT_DB_TEST_PORT=`grep 'sqlalchemy[.]url =' development.ini | sed -E 's|.*:([0-9]+)/.*|\1|'` dev-servers development.ini --app-name app --clear --init --load --no_ingest + +deploy1b: # starts postgres/ES locally and loads inserts + @echo "Starting ingestion listener. Press ^C to exit." 
&& SNOVAULT_DB_TEST_PORT=`grep 'sqlalchemy[.]url =' development.ini | sed -E 's|.*:([0-9]+)/.*|\1|'` poetry run ingestion-listener development.ini --app-name app
+
 deploy2: # spins up waittress to serve the application
 	@SNOVAULT_DB_TEST_PORT=`grep 'sqlalchemy[.]url =' development.ini | sed -E 's|.*:([0-9]+)/.*|\1|'` pserve development.ini
diff --git a/pyproject.toml b/pyproject.toml
index ff832caa9d..c251dffd51 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [tool.poetry]
 # Note: Various modules refer to this system as "encoded", not "cgap-portal".
 name = "encoded"
-version = "2.4.0"
+version = "2.3.1.1b0" # Preparing for minor version bump (2.4.0 probably)
 description = "Clinical Genomics Analysis Platform"
 authors = ["4DN-DCIC Team "]
 license = "MIT"
diff --git a/src/encoded/common.py b/src/encoded/common.py
new file mode 100644
index 0000000000..c5e06786f4
--- /dev/null
+++ b/src/encoded/common.py
@@ -0,0 +1,92 @@
+"""
+common.py - tools common to various parts of ingestion
+"""
+
+import codecs
+import contextlib
+import io
+import os
+import tempfile
+
+
+
+
+DATA_BUNDLE_BUCKET = 'cgap-data-bundles'
+
+
+class SubmissionFailure(Exception):
+    pass
+
+
+CONTENT_TYPE_SPECIAL_CASES = {
+    'application/x-www-form-urlencoded': [
+        # Special case to allow us to POST to metadata TSV requests via form submission
+        '/metadata/'
+    ]
+}
+
+
+def register_path_content_type(*, path, content_type):
+    """
+    Registers that endpoints that begin with the specified path allow the given content type.
+    """
+    exceptions = CONTENT_TYPE_SPECIAL_CASES.get(content_type, None)
+    if exceptions is None:
+        CONTENT_TYPE_SPECIAL_CASES[content_type] = exceptions = []
+    if path not in exceptions:
+        exceptions.append(path)
+
+
+def content_type_allowed(request):
+    """
+    Returns True if the current request allows the requested content type.
+    """
+    if request.content_type == "application/json":
+        # For better or worse, we always allow this.
+        return True
+
+    exceptions = CONTENT_TYPE_SPECIAL_CASES.get(request.content_type)
+
+    if exceptions:
+        for prefix in exceptions:
+            if request.path.startswith(prefix):
+                return True
+
+    return False
+
+
+class MissingParameter(Exception):
+
+    def __init__(self, parameter_name):
+        self.parameter_name = parameter_name
+        super().__init__("Missing parameter: %s" % parameter_name)
+
+
+_NO_DEFAULT = object()
+
+
+def get_parameter(parameter_block, parameter_name, default=_NO_DEFAULT):
+    """
+    Returns the value of a given parameter from a dictionary of parameter values.
+
+    If the parameter is not in the dictionary, the default will be returned if one is given.
+    If the parameter is not present but there is no default, an error of type MissingParameter will be raised.
+
+    Args:
+        parameter_block dict: a dictionary whose keys are parameter names and whose values are parameter values
+        parameter_name str: the name of a parameter
+        default object: a default value to be used if the parameter_name is not present.
+ """ + + if isinstance(parameter_block, dict): + if parameter_name in parameter_block: + return parameter_block[parameter_name] + elif default is _NO_DEFAULT: + raise MissingParameter(parameter_name=parameter_name) + else: + return default + else: + raise TypeError("Expected parameter_block to be a dict: %s", parameter_block) + + diff --git a/src/encoded/dev_servers.py b/src/encoded/dev_servers.py index eeddc2c7bb..b1f26b2e1c 100644 --- a/src/encoded/dev_servers.py +++ b/src/encoded/dev_servers.py @@ -49,12 +49,16 @@ def nginx_server_process(prefix='', echo=False): return process -def ingestion_listener_process(config_uri, app_name, echo=True): - """ Uses Popen to start up the ingestion-listener. """ - args = [ +def ingestion_listener_compute_command(config_uri, app_name): + return [ 'poetry', 'run', 'ingestion-listener', config_uri, '--app-name', app_name ] + +def ingestion_listener_process(config_uri, app_name, echo=True): + """ Uses Popen to start up the ingestion-listener. """ + args = ingestion_listener_compute_command(config_uri, app_name) + process = subprocess.Popen( args, close_fds=True, @@ -79,34 +83,58 @@ def main(): parser.add_argument('--init', action="store_true", help="Init database") parser.add_argument('--load', action="store_true", help="Load test set") parser.add_argument('--datadir', default='/tmp/snovault', help="path to datadir") + parser.add_argument('--no_ingest', action="store_true", default=False, help="Don't start the ingestion process.") + parser.add_argument('--ingest_only', action="store_true", default=False, help="Only start the ingestion engine.") args = parser.parse_args() + run(app_name=args.app_name, config_uri=args.config_uri, datadir=args.datadir, + clear=args.clear, init=args.init, load=args.load, no_ingest=args.no_ingest, ingest_only=args.ingest_only) + +def run(app_name, config_uri, datadir, clear=False, init=False, load=False, no_ingest=False, ingest_only=False): + + if ingest_only: + clear = False + init = False + load = False + logging.basicConfig(format='') # Loading app will have configured from config file. Reconfigure here: logging.getLogger('encoded').setLevel(logging.INFO) # get the config and see if we want to connect to non-local servers - config = get_appsettings(args.config_uri, args.app_name) + # TODO: This variable seems to not get used? -kmp 25-Jul-2020 + config = get_appsettings(config_uri, app_name) - datadir = os.path.abspath(args.datadir) + datadir = os.path.abspath(datadir) pgdata = os.path.join(datadir, 'pgdata') esdata = os.path.join(datadir, 'esdata') ### comment out from HERE... - if args.clear: + if clear: for dirname in [pgdata, esdata]: if os.path.exists(dirname): shutil.rmtree(dirname) - if args.init: + if init: postgresql_fixture.initdb(pgdata, echo=True) ### ... 
to HERE to disable recreation of test db ### may have to `rm /tmp/snovault/pgdata/postmaster.pid` - postgres = postgresql_fixture.server_process(pgdata, echo=True) - elasticsearch = elasticsearch_fixture.server_process(esdata, echo=True) - nginx = nginx_server_process(echo=True) - ingestion_listener = ingestion_listener_process(args.config_uri, args.app_name) - processes = [postgres, elasticsearch, nginx, ingestion_listener] - + if ingest_only: + print("Do this instead: ", + "SNOVAULT_DB_TEST_PORT=" + os.environ["SNOVAULT_DB_TEST_PORT"], + " ".join(ingestion_listener_compute_command(config_uri, app_name))) + return + + processes = [] + if not ingest_only: + postgres = postgresql_fixture.server_process(pgdata, echo=True) + processes.append(postgres) + elasticsearch = elasticsearch_fixture.server_process(esdata, echo=True) + processes.append(elasticsearch) + nginx = nginx_server_process(echo=True) + processes.append(nginx) + if not no_ingest: + ingestion_listener = ingestion_listener_process(config_uri, app_name) + processes.append(ingestion_listener) @atexit.register def cleanup_process(): @@ -121,15 +149,18 @@ def cleanup_process(): pass process.wait() + if init: + app = get_app(config_uri, app_name) + else: + app = None - app = get_app(args.config_uri, args.app_name) # clear queues and initialize indices before loading data. No indexing yet. # this is needed for items with properties stored in ES - if args.init: + if init: create_mapping.run(app, skip_indexing=True, purge_queue=False) - if args.init and args.load: + if init and load: load_test_data = app.registry.settings.get('load_test_data') load_test_data = DottedNameResolver().resolve(load_test_data) load_res = load_test_data(app) diff --git a/src/encoded/ingestion_engines.py b/src/encoded/ingestion_engines.py new file mode 100644 index 0000000000..05a5f0f82e --- /dev/null +++ b/src/encoded/ingestion_engines.py @@ -0,0 +1,109 @@ +import boto3 +import json +import traceback + +from .common import DATA_BUNDLE_BUCKET, get_parameter +from .util import debuglog, s3_output_stream, create_empty_s3_file +from .submit import submit_data_bundle + + +INGESTION_UPLOADERS = {} + + +def ingestion_processor(processor_type): + """ + @ingestion_uploader() is a decorator that declares the upload handler for an ingestion type. + """ + + def ingestion_type_decorator(fn): + INGESTION_UPLOADERS[processor_type] = fn + return fn + + return ingestion_type_decorator + + +class UndefinedIngestionProcessorType(Exception): + + def __init__(self, processor_type): + self.ingestion_type_name = processor_type + super().__init__("No ingestion processor type %r is defined." 
% processor_type) + + +def get_ingestion_processor(processor_type): + handler = INGESTION_UPLOADERS.get(processor_type, None) + if not handler: + raise UndefinedIngestionProcessorType(processor_type) + return handler + + +def _show_report_lines(lines, fp, default="Nothing to report."): + for line in lines or ([default] if default else []): + print(line, file=fp) + + +@ingestion_processor('data_bundle') +def handle_data_bundle(*, uuid, ingestion_type, vapp, log): + + log.info("Processing {uuid} as {ingestion_type}.".format(uuid=uuid, ingestion_type=ingestion_type)) + + if ingestion_type != 'data_bundle': + raise RuntimeError("handle_data_bundle only works for ingestion_type data_bundle.") + + s3_client = boto3.client('s3') + manifest_key = "%s/manifest.json" % uuid + response = s3_client.get_object(Bucket=DATA_BUNDLE_BUCKET, Key=manifest_key) + manifest = json.load(response['Body']) + + data_key = manifest['object_name'] + parameters = manifest['parameters'] + institution = get_parameter(parameters, 'institution') + project = get_parameter(parameters, 'project') + + debuglog(uuid, "data_key:", data_key) + debuglog(uuid, "parameters:", parameters) + + started_key = "%s/started.txt" % uuid + create_empty_s3_file(s3_client, bucket=DATA_BUNDLE_BUCKET, key=started_key) + + # PyCharm thinks this is unused. -kmp 26-Jul-2020 + # data_stream = s3_client.get_object(Bucket=DATA_BUNDLE_BUCKET, Key="%s/manifest.json" % uuid)['Body'] + + resolution = { + "data_key": data_key, + "manifest_key": manifest_key, + "started_key": started_key, + } + + try: + + validation_log_lines, final_json, result_lines = submit_data_bundle(s3_client=s3_client, + bucket=DATA_BUNDLE_BUCKET, + key=data_key, + project=project, + institution=institution, + vapp=vapp) + + resolution["validation_report_key"] = validation_report_key = "%s/validation-report.txt" % uuid + resolution["submission_key"] = submission_key = "%s/submission.json" % uuid + resolution["submission_response_key"] = submission_response_key = "%s/submission-response.txt" % uuid + + with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key=validation_report_key) as fp: + _show_report_lines(validation_log_lines, fp) + + with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key=submission_key) as fp: + print(json.dumps(final_json, indent=2), file=fp) + + with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key=submission_response_key) as fp: + _show_report_lines(result_lines, fp) + + except Exception as e: + + resolution["traceback_key"] = traceback_key = "%s/traceback.json" % uuid + with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key=traceback_key) as fp: + traceback.print_exc(file=fp) + + resolution["error_type"] = e.__class__.__name__ + resolution["error_message"] = str(e) + + with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key="%s/resolution.json" % uuid) as fp: + print(json.dumps(resolution, indent=2), file=fp) diff --git a/src/encoded/ingestion_listener.py b/src/encoded/ingestion_listener.py index bd737911ad..98c472d6d9 100644 --- a/src/encoded/ingestion_listener.py +++ b/src/encoded/ingestion_listener.py @@ -1,25 +1,32 @@ -import os -import boto3 -import time -import socket import argparse -import structlog +import atexit +import boto3 +import botocore.exceptions import datetime +import elasticsearch +import io import json -import atexit -import threading -import signal +import os import psycopg2 -import webtest -import elasticsearch import requests # XXX: C4-211 should not be needed but is -from vcf import Reader 
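Aside, not part of the diff: the @ingestion_processor decorator defined in ingestion_engines.py above is a plain dictionary-backed registry, and get_ingestion_processor is just the corresponding lookup. A minimal, self-contained sketch of the pattern; the 'noop' processor name is hypothetical and exists only for illustration:

    INGESTION_UPLOADERS = {}

    def ingestion_processor(processor_type):
        def ingestion_type_decorator(fn):
            INGESTION_UPLOADERS[processor_type] = fn
            return fn
        return ingestion_type_decorator

    @ingestion_processor('noop')
    def handle_noop(*, uuid, ingestion_type, vapp, log):
        # A do-nothing handler registered under the hypothetical 'noop' type.
        return uuid

    # get_ingestion_processor amounts to this lookup, plus raising
    # UndefinedIngestionProcessorType when the name is unknown.
    handler = INGESTION_UPLOADERS['noop']
    assert handler(uuid='1234', ingestion_type='noop', vapp=None, log=None) == '1234'

New ingestion types are added by defining one more decorated handler; that is how handle_data_bundle registers itself for 'data_bundle' and how the listener's run() loop later finds it.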
+import signal +import socket +import structlog +import threading +import time +import uuid +import webtest + +from dcicutils.misc_utils import VirtualApp, ignored from pyramid import paster -from dcicutils.misc_utils import VirtualApp +from pyramid.response import Response from pyramid.view import view_config from snovault.util import debug_log -from .util import resolve_file_path, gunzip_content +from vcf import Reader from .commands.ingest_vcf import VCFParser +from .common import register_path_content_type, DATA_BUNDLE_BUCKET, SubmissionFailure +from .ingestion_engines import get_ingestion_processor +from .util import resolve_file_path, gunzip_content, debuglog log = structlog.getLogger(__name__) @@ -32,10 +39,99 @@ def includeme(config): config.add_route('queue_ingestion', '/queue_ingestion') config.add_route('ingestion_status', '/ingestion_status') + config.add_route('prompt_for_ingestion', '/prompt_for_ingestion') + config.add_route('submit_for_ingestion', '/submit_for_ingestion') config.registry[INGESTION_QUEUE] = IngestionQueueManager(config.registry) config.scan(__name__) +@view_config(route_name='prompt_for_ingestion', request_method='GET') +@debug_log +def prompt_for_ingestion(context, request): + ignored(context, request) + return Response(PROMPT_FOR_INGESTION) + + +register_path_content_type(path='/submit_for_ingestion', content_type='multipart/form-data') +@view_config(route_name='submit_for_ingestion', request_method='POST', accept='multipart/form-data') +@debug_log +def submit_for_ingestion(context, request): + + ignored(context) + + ingestion_type = request.POST['ingestion_type'] + filename = request.POST['datafile'].filename + override_name = request.POST.get('override_name', None) + parameters = dict(request.POST) + parameters['datafile'] = filename + + # ``input_file`` contains the actual file data which needs to be + # stored somewhere. + + input_file_stream = request.POST['datafile'].file + input_file_stream.seek(0) + + # NOTE: Some reference information about uploading files to s3 is here: + # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html + + upload_id = str(uuid.uuid4()) + _, ext = os.path.splitext(filename) + object_name = "{id}/datafile{ext}".format(id=upload_id, ext=ext) + manifest_name = "{id}/manifest.json".format(id=upload_id) + + s3_client = boto3.client('s3') + + upload_time = datetime.datetime.utcnow().isoformat() + success = True + message = "Uploaded successfully." 
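Aside, not part of the diff: combined with handle_data_bundle in ingestion_engines.py above, every submission ends up under a single uuid prefix in the cgap-data-bundles bucket. A rough sketch of that layout, with a hypothetical uuid prefix and assuming an .xlsx upload:

    # <uuid>/datafile.xlsx            the uploaded workbook (extension follows the uploaded filename)
    # <uuid>/manifest.json            upload metadata plus the submitted form parameters
    # <uuid>/started.txt              empty marker written when processing begins
    # <uuid>/validation-report.txt    validation log lines
    # <uuid>/submission.json          the final JSON that was posted/patched
    # <uuid>/submission-response.txt  result lines from post_and_patch_all_items
    # <uuid>/traceback.json           written only if an exception occurs
    # <uuid>/resolution.json          records which of the above keys were produced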
+ + try: + s3_client.upload_fileobj(input_file_stream, Bucket=DATA_BUNDLE_BUCKET, Key=object_name) + + except botocore.exceptions.ClientError as e: + + log.error(e) + + success = False + message = "{error_type}: {error_message}".format(error_type=type(e), error_message=str(e)) + + result = { + "filename": filename, + "object_name": object_name, + "bucket": DATA_BUNDLE_BUCKET, + "success": success, + "message": message, + "upload_time": upload_time, + "parameters": parameters + } + + pretty_result = json.dumps(result, indent=2) + + if success: + + try: + with io.BytesIO(pretty_result.encode('utf-8')) as fp: + s3_client.upload_fileobj(fp, Bucket=DATA_BUNDLE_BUCKET, Key=manifest_name) + + except botocore.exceptions.ClientError as e: + + log.error(e) + + message = "{error_type} (while uploading metadata): {error_message}".format(error_type=type(e), + error_message=str(e)) + + raise SubmissionFailure(message) + + queue_manager = get_queue_manager(request, override_name=override_name) + _, failed = queue_manager.add_uuids([upload_id], ingestion_type=ingestion_type) + + if failed: + # If there's a failure, failed will be a list of one problem description since we only submitted one thing. + raise SubmissionFailure(failed[0]) + + return result + + @view_config(route_name='ingestion_status', request_method='GET', permission='index') @debug_log def ingestion_status(context, request): @@ -66,6 +162,10 @@ def queue_ingestion(context, request): """ uuids = request.json.get('uuids', []) override_name = request.json.get('override_name', None) + return enqueue_uuids_for_request(request, uuids, override_name=override_name) + + +def enqueue_uuids_for_request(request, uuids, *, ingestion_type='vcf', override_name=None): response = { 'notification': 'Failure', 'number_queued': 0, @@ -73,8 +173,7 @@ def queue_ingestion(context, request): } if uuids is []: return response - queue_manager = request.registry[INGESTION_QUEUE] if not override_name \ - else IngestionQueueManager(request.registry, override_name=override_name) + queue_manager = get_queue_manager(request, override_name=override_name) _, failed = queue_manager.add_uuids(uuids) if not failed: response['notification'] = 'Success' @@ -87,6 +186,12 @@ def queue_ingestion(context, request): return response +def get_queue_manager(request, *, override_name): + return (request.registry[INGESTION_QUEUE] + if not override_name + else IngestionQueueManager(request.registry, override_name=override_name)) + + class IngestionQueueManager: """ Similar to QueueManager in snovault in that in manages SQS queues, but that code is not generic @@ -219,9 +324,9 @@ def delete_messages(self, messages): failed.extend(response.get('Failed', [])) return failed - def add_uuids(self, uuids): - """ Takes a list of string uuids (presumed to be VCF files) and adds them to - the ingestion queue. + def add_uuids(self, uuids, ingestion_type='vcf'): + """ Takes a list of string uuids and adds them to the ingestion queue. + If ingestion_type is not specified, it defaults to 'vcf'. :precondition: uuids are all of type FileProcessed :param uuids: uuids to be added to the queue. 
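Aside, not part of the diff: a client can exercise the new /submit_for_ingestion endpoint with an ordinary multipart POST. A hedged sketch using the requests library; the host, credentials, and project/institution identifiers are placeholders, not values taken from this patch:

    import requests

    with open('cgap_submit_test.xlsx', 'rb') as fp:
        resp = requests.post(
            'http://localhost:8000/submit_for_ingestion',
            auth=('access-key-id', 'access-key-secret'),   # placeholder credentials
            data={
                'ingestion_type': 'data_bundle',
                'project': '/projects/hms-dbmi/',          # placeholder identifiers
                'institution': '/institutions/hms-dbmi/',
            },
            files={'datafile': fp},                        # sent as multipart/form-data
        )
    print(resp.json()['message'])                          # "Uploaded successfully." on the happy path

Whatever extra form fields are sent (here project and institution) travel along in the manifest's parameters and are read back out by handle_data_bundle via get_parameter.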
@@ -231,7 +336,9 @@ def add_uuids(self, uuids): msgs = [] for uuid in uuids: current_msg = { - 'uuid': uuid, 'timestamp': curr_time + 'ingestion_type': ingestion_type, + 'uuid': uuid, + 'timestamp': curr_time } msgs.append(current_msg) failed = self._send_messages(msgs) @@ -330,8 +437,10 @@ def delete_messages(self, messages): """ failed = self.queue_manager.delete_messages(messages) while True: + debuglog("Trying to delete messages") tries = 3 if failed: + debuglog("Failed to delete messages") if tries > 0: failed = self.queue_manager.delete_messages(failed) # try again tries -= 1 @@ -339,6 +448,7 @@ def delete_messages(self, messages): log.error('Failed to delete messages from SQS: %s' % failed) break else: + debuglog("Deleted messages") break def build_variant_link(self, variant): @@ -415,15 +525,37 @@ def run(self): delete processed messages """ log.info('Ingestion listener successfully online.') + + debuglog("Ingestion listener started.") + while self.should_remain_online(): + + debuglog("About to get messages.") + messages = self.get_messages() # wait here + debuglog("Got", len(messages), "messages.") + # ingest each VCF file for message in messages: + + debuglog("Message:", message) + body = json.loads(message['Body']) uuid = body['uuid'] + ingestion_type = body['ingestion_type'] log.info('Ingesting uuid %s' % uuid) + if ingestion_type != 'vcf': + # Let's minimally disrupt things for now. We can refactor this later + # to make all the parts work the same -kmp + handler = get_ingestion_processor(ingestion_type) + handler(uuid=uuid, ingestion_type=ingestion_type, vapp=self.vapp, log=log) + print("HANDLED", uuid) + continue + + debuglog("Did NOT process", uuid, "as", ingestion_type) + # locate file meta data try: file_meta = self.vapp.get('/' + uuid).follow().json @@ -471,7 +603,11 @@ def run(self): def run(vapp=None, _queue_manager=None, _update_status=None): """ Entry-point for the ingestion listener for waitress. """ ingestion_listener = IngestionListener(vapp, _queue_manager=_queue_manager, _update_status=_update_status) - ingestion_listener.run() + try: + ingestion_listener.run() + except Exception as e: + debuglog(str(e)) + raise class ErrorHandlingThread(threading.Thread): @@ -595,6 +731,79 @@ def main(): vapp = VirtualApp(app, config) return run(vapp) +PROMPT_FOR_INGESTION = """ + + + + Submit for Ingestion + + + + +

Submit for Ingestion
      [form markup lost in extraction: an HTML table of labeled inputs posting to /submit_for_ingestion (the ingestion type, the datafile to upload, and related parameters), followed by a Submit button]
+ + +""" + + if __name__ == '__main__': main() diff --git a/src/encoded/renderers.py b/src/encoded/renderers.py index a1ecbc0b9e..5cb832faf0 100644 --- a/src/encoded/renderers.py +++ b/src/encoded/renderers.py @@ -26,6 +26,7 @@ from subprocess_middleware.worker import TransformWorker from urllib.parse import urlencode from webob.cookies import Cookie +from .common import content_type_allowed log = logging.getLogger(__name__) @@ -107,15 +108,13 @@ def validate_request_tween(request): # Includes page text/html requests. return handler(request) - elif request.content_type != 'application/json': - if request.content_type == 'application/x-www-form-urlencoded' and request.path[0:10] == '/metadata/': - # Special case to allow us to POST to metadata TSV requests via form submission - return handler(request) + elif content_type_allowed(request): + return handler(request) + + else: detail = "Request content type %s is not 'application/json'" % request.content_type raise HTTPUnsupportedMediaType(detail) - return handler(request) - return validate_request_tween diff --git a/src/encoded/root.py b/src/encoded/root.py index a6ca512016..44e397eefa 100644 --- a/src/encoded/root.py +++ b/src/encoded/root.py @@ -99,12 +99,12 @@ def health_page_view(request): "beanstalk_env": env_name, "blob_bucket": settings.get('blob_bucket'), "database": settings.get('sqlalchemy.url').split('@')[1], # don't show user /password - "display_title": "Fourfront Status and Foursight Monitoring", + "display_title": "CGAP Status and Foursight Monitoring", "elasticsearch": settings.get('elasticsearch.server'), "file_upload_bucket": settings.get('file_upload_bucket'), "foursight": foursight_url, "indexer": settings.get("indexer"), - "index_server": settings.get("index_xerver"), + "index_server": settings.get("index_server"), "load_data": settings.get('load_test_data'), "namespace": settings.get('indexer.namespace'), "processed_file_bucket": settings.get('file_wfout_bucket'), diff --git a/src/encoded/submit.py b/src/encoded/submit.py index 84e9d8ae78..17d2104450 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -1,16 +1,17 @@ -from pyramid.paster import get_app -from pyramid.response import Response -from pyramid.view import view_config -from snovault.util import debug_log -# from webtest import TestApp -from dcicutils.misc_utils import VirtualApp, VirtualAppError -from dcicutils import ff_utils -from webtest.app import AppError import ast import datetime import json import xlrd +from dcicutils.misc_utils import VirtualApp, VirtualAppError +from dcicutils import ff_utils +from pyramid.paster import get_app +from pyramid.response import Response +from snovault.util import debug_log +from pyramid.view import view_config +from webtest.app import AppError +from .util import s3_local_file, debuglog + GENERIC_FIELD_MAPPING = { 'individual': {}, @@ -56,7 +57,17 @@ ] -# This is a placeholder for a submission endpoint modified from loadxl + +# This "/submit_data" endpoint is a placeholder for a submission endpoint modified from loadxl. +# +# NOTES FROM KMP (25-Jul-2020): +# +# This will be done differently soon as part of the "/submit_for_ingestion" endpoint that +# will be in ingestion_listener.py. That endpoint will need an "?ingestion type=data_bundle" +# as query parameter. That "data_bundle" ingestion type will defined in ingestion_engines.py. +# The new entry point here that will be needed is submit_data_bundle, and then this temporary +# "/submit_data" endpoint can presumably go away.. 
-kmp 25-Jul-2020 + @view_config(route_name='submit_data', request_method='POST', permission='add') @debug_log def submit_data(context, request): @@ -78,6 +89,27 @@ def submit_data(context, request): raise NotImplementedError +# This endpoint will soon be the primary entry point. Please keep it working as-is and do not remove it. +# -kmp 25-Jul-2020 +def submit_data_bundle(*, s3_client, bucket, key, project, institution, vapp): # All keyword arguments, all required. + """ + Handles processing of a submitted workbook. + + Args: + data_stream: an open stream to xls workbook data + project: a project identifier + institution: an institution identifier + vapp: a VirtualApp object + log: a logging object capable of .info, .warning, .error, or .debug messages + """ + with s3_local_file(s3_client, bucket=bucket, key=key) as file: + project_json = vapp.get(project).json + institution_json = vapp.get(institution).json + json_data = xls_to_json(file, project=project_json, institution=institution_json) + final_json, validation_log_lines = validate_all_items(vapp, json_data) + result_lines = post_and_patch_all_items(vapp, final_json) + return validation_log_lines, final_json, result_lines + def map_fields(row, metadata_dict, addl_fields, item_type): for map_field in GENERIC_FIELD_MAPPING[item_type]: @@ -97,8 +129,11 @@ def xls_to_json(xls_data, project, institution): sheet, = book.sheets() row = row_generator(sheet) top_header = next(row) + debuglog("top_header:", top_header) # Temporary instrumentation for debugging to go away soon. -kmp 25-Jul-2020 keys = next(row) - next(row) + debuglog("keys:", keys) # Temporary instrumentation for debugging to go away soon. -kmp 25-Jul-2020 + descriptions = next(row) + debuglog("descriptions:", descriptions) # Temporary instrumentation for debugging to go away soon. -kmp 25-Jul-2020 rows = [] counter = 0 for values in row: @@ -116,6 +151,7 @@ def xls_to_json(xls_data, project, institution): family_dict = create_families(rows) a_types = get_analysis_types(rows) for row in rows: + debuglog("row:", repr(row)) # Temporary instrumentation for debugging to go away soon. -kmp 25-Jul-2020 indiv_alias = '{}:individual-{}'.format(project['name'], row['individual id']) fam_alias = '{}:{}'.format(project['name'], family_dict[row['analysis id']]) # sp_alias = '{}:sampleproc-{}'.format(project['name'], row['specimen id']) @@ -463,12 +499,21 @@ def validate_all_items(virtualapp, json_data): profile = virtualapp.get('/profiles/{}.json'.format(itemtype)).json validation_results[itemtype] = {'validated': 0, 'errors': 0} db_results = {} + # TODO: json_data[itemtype] but item_type might not be in json_data according to previous "if" statement. + # Maybe we want "for alias in json_data.get(item_type, {}):" here? + # Alternatively, maybe give "json_data.get(item_type, {})" a variable name so that it can be referred + # to more concisely in the several places below that it's needed. + # -kmp 25-Jul-2020 for alias in json_data[itemtype]: # first collect all atids before comparing and validating items db_result = compare_with_db(virtualapp, alias) if db_result: alias_dict[alias] = db_result['@id'] + # TODO: db_results is only conditionally assigned in the prevous "if". + # Perhaps the db_results = {} above should be moved up outside the "if"? + # Are we supposed to have a new dictionary on each iteration? -kmp 25-Jul-2020 db_results[alias] = db_result + # TODO: Likewise this should probably loop over json_data.get(itemtype, {}). 
-kmp 25-Jul-2020 for alias in json_data[itemtype]: if 'filename' in json_data[itemtype][alias]: # until we have functional file upload del json_data[itemtype][alias]['filename'] @@ -480,6 +525,10 @@ def validate_all_items(virtualapp, json_data): for e in error: errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) validation_results[itemtype]['errors'] += 1 + # TODO: If itemtype might not be in json_data (and conditionals above suggest that's so), + # then json_data[item_type][alias] seems suspect. It does work to do + # json_data.get(item_type, {}).get(alias, {}).get('filename') but I would put that + # quantity in a variable rather than compute it twice in a row. -kmp 25-Jul-2020 elif json_data[itemtype][alias].get('filename') and \ json_data[itemtype][alias]['filename'] in ''.join(json_data['file_errors']): validation_results[itemtype]['errors'] += 1 @@ -490,6 +539,7 @@ def validate_all_items(virtualapp, json_data): else: # patch if item exists in db # alias_dict[alias] = results[alias]['@id'] + # TODO: profile is only conditionally assigned in an "if" above. -kmp 25-Jul-2020 patch_data = compare_fields(profile, alias_dict, json_data[itemtype][alias], db_results[alias]) error = validate_item(virtualapp, patch_data, 'patch', itemtype, all_aliases, atid=db_results[alias]['@id']) diff --git a/src/encoded/tests/data/documents/cgap_submit_test_with_errors.xlsx b/src/encoded/tests/data/documents/cgap_submit_test_with_errors.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..f6e736b89d8ed45de316089168463ce8570ca109 GIT binary patch literal 13220 zcmeHuRd5|kvh5aGTWB%MVrFKGSr#)hGc%)w7Be$5vt%(?7BgGS%unasnR{kD=e>CU z_x6tHj_TdHqPu!!W>w{CSxGQ(6u<`n5C8xW0QlMbC#^vM0B{HZ02u%T)ey9?b~Lhf z)KPS^HFD6Rb+xj@&jkl1&jx_LU;n?`|KJ`dR~)wPrAO#cIph`UR1V7tDJ=!!wBl$G z??WcL#;OuG?qZTZdyyH-Db)abNaQRhHAt_xF)Efw)dcvqEomRsWQcWPiXtoQFL++W zvV{Hf%atSRV!~OnG2qV5K1?@2N;NO()0z{$5t5*>)I&4v^;}9H2$Y5L3X*hC{wf81 zSUBPt?o|Voq;SH}XLk*dr@@j~Q)zFPV`Ne$u1a&jRicf*i=;rQk!`7e#1lUp`H7OP zRbz_DDTv2sRt{2Z#fS&X6byw>!+>&$Z{qubj%c-Yq?r-AZ^_kD_ed0@HDkN7(Mji- z%YP~+LMdBjw$`P^d?y;_D#hXIn0ct6B8+B4-RM~Q4(inv!^rc-hC2TW-EVRqKa(Nm z!mT3^=n#xxwhCBlJwd$2ai{>@hq|~{W|LdOrgMIQ}9=$yFi%cIqZ19=bbI8!$>}m{xfTRn*SQ~+&x3Boh$A-u}V(ir} zG8_a&tN<`kpAPTmk>yqH$ipFm+b!nua3mBiqDGgB;G{=8CrCUt z+Z0g=H*)8WXv(tY;vC80bs~}J3!xf>Nm>;wDC7d1U^Jc-KlOnxYHRxUl_0YM3MUo8 z)lF>KhjEi>KJ$r%2XMTh98$+WQ&5K-^i38kJ%=m_ZXU3d6ihkHs`WGNISJi#3@y8_ zgwnc@Up#4~(}&~zOY`Gu-;wp!&*&&WHWO@ru{pGuD4PiQnh= zq^tKHX?=GREC3M1)spVN{KUn^-a_BT#^R4`_D`Pyc~5EYDF3s!a``Wk{qG?i<~4-g zCCwQXanX^E@L1{SBXGE$VwseH$@_X8o3d3)bxx8F%qH0FY<$r9js<=T4Ed&qs`x7s zxI41t85e}j_|-TVQ2(UjlfNhgGSczkQP~L$e5wO-%jei3>P!^&U-#rBm?3$EghJOX zKZBXgC+M9R%^iN=j=Xbbw0_6;F;L;S!oR+$2|ty>c=xSEaQXH_mcJX`)DIB1D$;C_SXQ)Jy=9k^(nnc#^-WWpeSLzL&agiE)GE2-ySWj4mVSlH?&D-? 
z?no7>)Jj9om4`8`yf5Fd4C>Ra4T&i-chR`hU~S}gmjAtqR=bmZ-J-5-$~^>--2hRU zeq=9DUToNRl5%PYlNiYg=R(h{q0dF+W=2J!r4SeMGoD$q;XX7bZBKW9Tm-6RCfCOB zC<ZUw|kqhun=G}0Abvv0ie3^XSV9^YEAVr3QZRgUO#)T#`wtEA@yO!7xhcU*6l zoN2Ee89mQpsa&sV8QUpr_VBlBw!*I>OLvR%91+i2UD_{FN*PL9oXX@m)X-c(1v`>>za)32 z-Iacz%0ouev$u=AO5iV8Iq!6y$KOHxgjRUKEcGmbFi`INvwmeb$QpP5yr;@%cEz`& zn3)`>BT2`@h!_QJAI38%F+6>Jf$EfUCatx5Cl6w87#`?2!qQH%ii@bR29d!UHBV6^ z5|fi^B#sl<9UIFfJrZ(RvjYitRRnPbN%^ZcB$&@K>N)zXYXEA6{-Jn=$^@ZZhjyK> z@$ETmWb0wcNvE7**HFc*wo%AqwD5Id3q?yvwtT6sgf;xR=??>tgQW8u_r zc+uO@WIuLdqZvm8D?|iZ3){dBS%aWuu>eE4o7v{zc#0K39MGm01EsfpD|5k|f9Ff?N{>=_-~d1gEC7J=U%upEs%LLxsNiUCW^Ll|NA!(Qv5>*$KzZr>`5IE@ zc~DtTPhFwBN|`UwD=5~UDhq;_k&+gXgy~U@yY&3@X_7Cyu?R*?=-vr$Z^gkTIIC+O zm8Z5wzAT_;z0CtmKjBEAbhc;X=;mT#5JZhkk$|$|UbkA0>z@1Zr75Q&%2s7Dfd4@A ziel7K+n(mQ(ppu|q~r??&MQmV7;CVwo}{sg%Hr$V8m}5nK9+3d{7BE~=i|x*Pt~h> zjRzMSdMtGgtk$Uls>L4%oAeHOdcWpdZ#aBDCbkP^SrXZ^Lx(4h6MYn6vaYvd-VSMC z)|9=UxIT82l9U@Y2cxRj|1QFgQy!!q4qdkNLthX^F9rj?iVgKxuA~f9o6z)UMOS2> z0V}sAikg-Q_H!WD-rC4T8mM`MHUQD5$Cd;qo2cl(jl=VSwkx?pmH9Dij>WX)ZKGFbi5CvWs^#tmh{04{9(UzMKwhdadpL-nB1-uI;9B;>Ll_i zy@HIxR!e9&%iMt7ia}Ea~Ttx%6f)*QUfS6pZ@OXaG%ykeD&7Ba$ z@0E#KBb5RUU6TT5S76Ft90fw4--*j)O$0hndLp zBIL`Iqa1DWqU0Xh#Km)|8FsIg_`a;$BpG4C$=TkR5*7#(qXi+I<0ZvI(33nv7SUH4 zxX?5{R7TqVPN;PC(@v@sW>U~PkTI(sHTt!mPtO-yzkDO>@Q~K9C4R9diU`2bh^?{K zG29k&&BlduguzT!NDdSP&1A)Xwx#tp7J)j8WG0py|zaKK@M;*gA$@(P7 zDxsU@YEWtQjym05WYcV;+-tc~4z|u7QqF0_U;fYrpOiV3H5m*uF)*GmJePnmJ$3aO|v4Ld%`>!iB^2$$~x(fN8TWbcuP`zMDKqr zN(f`F%;!SUQ0#DVkpo#bb?CB=-5wtwex_^X=*4q>u)fD|&ZG3iDxj4o_MUx>eOmqz z+cr2!WMk^k{uO#O+9C|9{fO~mKs5|{JM9E>s+$&eE^Hjx^Na1Z((WwPM@G+CjF-Pn z&Qr4}7W(h&5?n+80Ng(UildvQ(VvlIMqR^Ziyg&V$Lv)w-X#?r$c89VV3q&PSd^-P z>ZA+`Ue!r|CmDXs{O#!z7ZgKK3?PWHJ%)W^?@ZHkq9$W=UP&mDZ49|rf70>Pa=P2D zyO223?bkY;cb*AW7~cho9GbzT=63%x=i}udJ)*_etUU}EF#N54rab}b4~Yrj4mdeG zQ8c!&G6Dlc#2bl|<%T^Bsic<$q3U0jPgM~KYf=bdbF<;a0`HIPr+}*UeEw2E68(y5 zDI(|(>&6V*rUHJ4BW8KssvNj$%*xz&LJg-8$lrYH0)w{TZE4VgiM!hzpOn`H`2E0# ziU>F=jo6wDbuN&|8tqE0{FVh_w_%l+n z<)=rs9&=y|8GW=@GehlB(haC5Sup*#N8dx0DUX4vC9~EW9We|Ng<<{MY;M-N@-yCU znO><{NaD;b)XZLHQ!2;EYR4a(J8RJiDUbsB`1bxR$8E-K>u1im4@b9;6)?yPz^56n z!%tj^v6!%&dY;f1pQDj@WQ~`YvE*@}WeLdYTAtA@2~bA@eUsU-q@hx4FyuITuGp^G z2xRyq&ZQVaVlqWh>QSg+$P)u6e^5P9!haPoQprxKD?d(yzN)FsPi=Ke#p>B`s`C1* zw%4+^+G=yW>9!2r0S8Gl^$iWBa*PQ^9{~wW>Yxr*`zfu(b7P?iw-~Ih39erC$Bn9yFEwRS4h#Q^ zlt?qR;R=6TP(i;$jS;`>3Mx^7%o#<>cSLC&47i+1DVtqrp%}izJP_{hvK4PTrWgJy z(CX$+sEJ1v5f(=sJ8WYOUICi?J|13x!rn$v+@BU+X(}n0it*DR9~!i36WfyX0xWQm zZm5r;@3>*wdNU?^X<2wNs;SdWA6KEt<{e2yV}}G>Q#xF(Zw)ZN(BI9{2BZCI^w*}b z5(sS9=gVM{_68BhlsEdy(trR1abjPOJ|NPu-*No`>vcd~5~Ev@6Sx9?@Wxk30djWTwF$?2j~+I5ct@_kq;SB!Nf z7guyfm^)#qRnALTvGO6uG7P+FYq?0fJdeh%z0q}LeFO(#1um^wS@|2m!!4n=n#ZmR znSXr6HRZ^)j9NlsLh_}iu?0_OhdXCka(l$5N_nYM*{TjVTgQ`MiSb)W^J~@*-x?Gl zzA}!WPB`Xn<5eb4QmY15uZYJNygGRX zm7d(wRkPF2^yY=-@)RxOT{l%X*dfvuv?{WW%gefQ+joC4b+-KFX_NOvGK8tqK^D)r z+dFFd6ER9>@J5O;(-$6etIU$s{8o8F5UUK2uEm-&twHO1nq{+kuHyN9Cw1#SmyBNq zMLgd2UMkzCnfE+_RvglgMnS}9@Ze$eBzC75EnNE*8;G_Zd_bzEKqb!`#z(xr8{KhP zF-rEl2VsHtN%MO={^v}|!PLme(cvHS>^~a}|2@kNjPCi`OOGOa2lguH>Jc{=1S_b( zM%b*h2d4kH^kF$E<__h>qYlTYdw17%a+fvjiMs^b82%geOe<*6jsd!9^|{9qO&5O+ z`{4=-6b*Nb-ZX7zDwwpiMA%@ADk@A(kR0&~MxS^8VoelONzCV8i{fE!^urI~*gwik z&8;qVAnWO*8_uYAyi@`>dF0hzkE6WsxUKWLCd4i$1I0OFbU){caSREcT>9$sAGzFS zT6n>O7g~*tUfn|dT@P5+={;jX0|2o!0072+^uS-wp02(uzdVBE#jCW3r|BIXNt+l6 zvrte&5Q7kF3p1QX52{&1V|K>ArBLt@Wt`(}_zsrTjEjxKZM+t!p~>X}-tjt?rI#v6(%4CA^YqgKize79CHiJ>mn^ePx+D~@H-6~^H%-ir0H+FdMxl?n8# zOBdAoWwnU`vX5odGS=wyXFA%+W1L;~_k$rmU`$d$;olOk)G|>Qy%qz8UN}uSURGbN 
zc~cg}>|Zs4E_%H)Uz3kOS#zG}3wGcvsk@-i+8I?10}gga24$|=UQXX$bX}N}1E<-z zGO;?K@(@jrO)4VS8+g<2XG*dL;N(?j20} zYr2f=+!AW#*D(h1w_ES|s5Q1Hle9W(143QJk%i3Udyl>HKopC4z-cSt+<1p3b%G zUd==g;`+v0ot0t28$JEdXHYU7*f70?z5bF}_#Ia9quFh{--KQwjV|B4jS0v6EyOHo zSLxeZ|HZSmp+5SqO$U}-6O#w!h^3i)FP{^T+yfQV_{lSLa{$3T$_dN%!V|%i8vFA6 zSaW0R0l&32EjRhV)$nsQpcw^Vtq#rfF>8$Im_MOWL!1dLYfQKL<%HA;k9G_+umk_x z1yn}TsEwG9M~TL>nO-jUBn~)!VtcV9e`r{S1Bzt0ObC8!+?=NtY_*5yt9Vqi*}Kx| zg$RmZ=EbGi-UL(EPjHoNTXyQ|+d?(m|OxMvjHix-8s<+@7)ToKujWY|5GZV93Q* zMGgyf>>Q77^D^hWUs6u?CY(?EI+><74RC$Buzj@%1VGNC!RS_w-hNOJh!T*Mb2fEa z5XO}7!9&;+$25U*pbdrbtr5uEh;wG$h59ZXQ$MfXQcDY`e+1<~9uoEQL>e;jONZGJ zz)uQ=Jt@^D5U@Zzt~arx+44jjid4<2in_t9rSw6z(!qKn?+&gGL^x zpN%k=DWK{Jt#?BaxLbsvoW=)T_JyFFHK=+CrPo{m`2TFES&@`g7&au7H^$B?*DgsY zYYdxH{drQVp^Pj+qHIJljjwPCj+tS$GW)Y7tMj1W72dkt^}#~RZeVpEXC_+ei7;dt z9;Kzhr^todtu1h?iT)nF47)?sYi1_vgj*8S;cAc8jbKW$WZuWJY z^w4ptC@tb&>8c!Jg37ZPR;73ZY4Hm)`po8O{SE>}jbez%1`*0uqFZ5kYRkl*pjMYY zMICU8bTty$Az?KNV=*T557atlh;SphH)bGD`$QBLhz=m?*fP^_K;{TT`3DmNm!k1a zS^cvcfc_$cz@=dP`G$C)e+F zCnxmJ)(84u;see7@Ks&?brYZn%}a&8Bovn*u_)yhO3WyRrBcc%ko<8Hb#A6<(A-E} zH6S^7mfJk_RFQ{d+5y_M(@DCdvml-QL~gCMx87yL05ZtRm5hbN^y@{ZNb2B}LImh~&w@Do0yS?l`-$NM z4oX(Vo+x?<+|4ixibmRdA1~)myro|T1@wM|t&CW#v{!Pto^KiV^E2{1*{G)1d6jD6 zhvpF%v7CPRiaA}8LNm?=88prnMM9<@61Bh{ni?^f!alH-mUe;Qgt;yeSwJ7t}Eh)uxx!iIva9vQA^^e6=Yt`#C)p=VAM}HfHdjuVlU>X zb~_q||De|mJhK2@9Az-Kpfd?S2O(oG%?<7eH$15U8hhAa41 zjqK@ot6JK5VtUyquG1F$V!_$&Kpj$fhzcjvdKpZ)z+#j^@P&vHA6w$k8{-%qpL!?fou zn;WdX+8E5g0`63cSnh>!|5_p98n~($W&M!zH!6L z7)5OmH;)SiQ^m!;a&#jZcLT#R%Is6DT}jPZcP!pDhtV6WHq&_v#Y@|F$@Dye-6vCl z^mZ6|Lj`^raJPoxy*F!f%e?*RxFRm(S}>M(Ohwq_+nP$1vcJdWfua6i)gB@a{MNVX+d*>s@H{O@tE*ZNBq}6kZ*rm)9pb==#+l1I}G6eA-(u>MD1v5WMxG6=l;)O^s(A_I5rzf z7y2VFtb@xl%XT!$+Qy`1+$ynQMm$bKu4zA`UAWvvhf9?ie2I~O9^#nxNyE=DR>Q!-1+WJp%bY|!agJq5wC}*+Z_VAt z?0)N{5rbmAL|JHqIJg08$!Jss@)a0FvG>~k`m!Egj08l-a0wDV9wqHEn+)Fs;3CM^ zRZ43}a%#5wY6U`r_T^7;36?JPG8wo&ZF&5>BS%8=Hhmy@*+XxlLz_}2725rvCTN}g z#@HgI>!~{BaCy*U{u3VxM8)6+gvi&AyNf<06xz`D_#&jlOxw6K&b1 zYYSN+ar@?A+Mot=mzE_|!`}TUPN#mGCzQ|&Ee!68~w{9(?BGfU?fuAW3cH(ARXjUe6CN z*bZjME_(aBo*zb&y5250Z>qOeQ0Y3oE-w$_(AT|g_GF@QmzHsGlg|mlYR)S+4bDiz z_59$UEPs!w{Caa0E&+PS+d7F+QyT8N`kszhoa zRtzGXFfMA&uufMR^zT(!rH=qe6u*El_&???J;GmY{@T1RvZ0-`-&7+$lmi) zp{qtP0=KM9Fm!jGhjB})I_&xsp(-GBnnciEmqebnUQj=DwATnfaOrY<&8n)e*kYE) z(~wB-{XXcuLrZoCMonquj^#Oo7+7#S!wq*B?R|PgNh|4yh1|bVTs<042TK0p47tJL z?}CQihJX5`)z2C$X}|xoB^-n48;|osq%60?l2q(azc(p7f$%q%0C9BB=tV9e9zh)0 zT85P-sK(wGW5Q;L9rM(9vd~AP3EZ0Yh?EXIbVvW{aHA#e#)XFQFeWa}tCEDJHVg&F zhye_brq2$lKlGSJ_gbb@2z3*1k0r{fh9;D5(se-i5@LaP#5?7QiT?io?=Up81kcky<@rzz$k_UU8Wy2Otnfodj6?r5i;TDDZ) zHg$u#I%_;9o#2?B=9u`mO}FC+au)vFdci4l7YRR5mMn1WKIDCMJ*0^|z=S(9v+7t4tgSx|`ws1GiTVIgZvODD%7kW z6;IMvuF;Xx(k3nCt*Iq25U~%g6q=EONfL$o>O5y_6*EWP`-<;5`UCmmG=`p(F}9o& zhMW$(0E)Irp!Q;l>Xf7El)Vym5-xszFGhj2a(QG{b%B>hi%fAmdiuJObtwSm@@ zkQqjq7X21%d#1z4kBAn+gk;Trq@)Tz`njwh8q9%Bltj!H56Jr%ee7QBifmnn_sx2aHR`&#;a9k-l#5vR8UFEY9VjB}3-F<(`cjtVj=%r`v1 z{&1YZFRdE&+wOzQmhOn@eEZj*9Pv>^oZ+(0xygo^y)0psV^cCzco;H^sXaMIwA{(1 zV7DDIt893)BMu+Tewh@0r#zf-j5`S7+JP&(6r8E0EUU`8VD;h!+LM!B*M;fQAx%-lrg66#7BK3k#YYxveG_l z4C&&>?QZMCUL?OSD`1mEME>{_dnle&H+&^dEfToi&Ot`==1pMaYOf;oF@GA(Ibea-Vb4Cv#P;)+SBb_n(-jS@Ng-eG4zJb z$E{AyMNhjkMEs1WhLLPFD`)c{;|3mb@DAy`AIEc?LE04>XK?PT)l)d!=N&|F@JqAS zvPW#t4fahMKywl!?>x0)$OH+l;wJ2u>=%401KnK;FT+m2oq1d1+xwE$f0q9Dt*Ukt z{#eXQ7GiokNk$804pL2_C;I*jSrFQ)3cJ1ZF{MLd; z9=s^BoY!ePCHP@G{aIb9cM-+ck|(pSNrOVeaASwiG9?!22NUECv5$SxT235_;7hP>!F=D^7d94jCJ#D3@kGsCaRdD zCkpv;j4yz5ZtydqPXXVR%UYyNCTN!CckWH0D}I%%shiqmh$dsH3skJZJc8({u7A)Z 
z%=6H`G36lUKXuXGC_w4*wFOWaeH`}NgJmBoG>ljlOCOtz(JtmU{7DH@xP>v%aJAsY zFe(=l;IOGn@3%9+u{B{Aex0|*x8;LCx7Sb9*Uz=Nq8`qy8C~qV3AT5C&3#dYdGtg0 zd~k(bcs;`>r>GgmrTe~d?kabUHKuxm*G~PZBpdHiM1d>09Q)j(e2p0oyN`oghRr5% zK76Zm=#8t)?8bZb*B=^{|D>{23}L6yJC!}&$C-%#N@W8Zd!zs5@_V2CV+)Mck$GQ{ z{=WX@8D27LQT;vE{XxiDcELZr`Mk7Q| zzjxIEiW8APsH`STNg{>%7}+Kbbh&{!7WRuYF-)5K=|=QtXW!N8RbBpl>122HTOLm?r^{0%b023BD( z1ROyS5^@tew!(GuV616Ihizm9gy>%#JORGFpXTZR9TFmLiPz z5C{Fr>H0R8JO3ex9d^5io3x%cJKz=b0yvbqlC)0p^x^NgIk_QP-}}x@YE%G#?EN*o zE39naYG^~D~?O7$YSs?)tu}X!lSG>%H>np?Zd{EtSY7C zJ4^fwBxY*1V{nuQMhwRY2^IL?V|^A&qSd+uX>}&}4r!aWSID%Gm=m!fa(}wpwm!+j z!FCtp$x7(S{}mkbgv!3%f0SE5v!A1^RDT1kF!E>m?KRm7`cfLwOuMESCD`NM0%g)c z(+HLd48fv(<$+@26RM;^^QBShB^yo}uFhNB@V@6M?KVzT!|dVHNnhSu#%2COI$iUp z4b-45L`wRHN@KSh2Y*kl`~2$g`zXPg3wX}ASWYZWHrar%CevEwJWSQ?a>QxFRYOIS zN+z2hO(`r>l(ja7*HY`%)Q;Qaa7?HzFLKF`Q~bbDSkUQ!I1#<{yDRdrO-}6%D@RaQ zy+l}-a4bQS{M-*q!}Q`Oh6t}wubr#rsQU-OJS|TLz3+1u{c@ox{b! zg1Whjk8|Z0pD0~4q0254YE&LLD~c|K&tIXQ#%P@vz!XgmYS)XcH*zP*`##+%zye>< z4{Yf&5tm5^Ka1?#Jp^*fIMt@JVCMRg<3opn{>@=5xBI<1@{bS&0!s5P1p50g4gTqo z|2+Q9cL%bP{|@l)8t8u-{&6gNPvQTfi2mL1-*ut?YB=^z+5fvr^mm-!#e#n!y}f@- z@tbt;cjMpHQGXdrzU!sljsHU{^*h4va)`eW(h>d(;lITZznlJEy!*?P0`0%={Qp(T z`yKH2TG(HJROr70{#6wL*eLduNfPWvu{bddSJdy(dzmMpCxBmCE x`LEXBss6?KpGow0^WP)eUjTZvf8PE7Pn?sLgm~v5008^`BYmf_Dg7U<{{vXgk|6*9 literal 0 HcmV?d00001 diff --git a/src/encoded/tests/test_generate_item_from_owl.py b/src/encoded/tests/test_generate_item_from_owl.py index 3df658f7b7..45d3b5400e 100644 --- a/src/encoded/tests/test_generate_item_from_owl.py +++ b/src/encoded/tests/test_generate_item_from_owl.py @@ -10,7 +10,7 @@ from unittest import mock from ..commands import generate_items_from_owl as gifo from ..commands.owltools import Owler -from ..util import MockFileSystem +from dcicutils.qa_utils import MockFileSystem pytestmark = [pytest.mark.setone, pytest.mark.working] diff --git a/src/encoded/util.py b/src/encoded/util.py index 5a2906683c..815e3752b0 100644 --- a/src/encoded/util.py +++ b/src/encoded/util.py @@ -1,6 +1,10 @@ +import contextlib +import datetime +import gzip import io import os -import gzip +import tempfile + from io import BytesIO @@ -92,3 +96,106 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.file_system.files[file] = text return MockFileWriter(file_system=file_system, file=file) + + +DEBUGLOG_ENABLED = os.environ.get('DEBUGLOG_ENABLED', "FALSE").lower() == "true" + + +def debuglog(*args): + """ + As the name implies, this is a low-tech logging facility for temporary debugging info. + Prints info to a file in user's home directory. + + The debuglog facility allows simple debugging for temporary debugging of disparate parts of the system. + It takes arguments like print or one of the logging operations and outputs to ~/DEBUGLOG-yyyymmdd.txt. + Each line in the log is timestamped. + """ + if DEBUGLOG_ENABLED: + nowstr = str(datetime.datetime.now()) + dateid = nowstr[:10].replace('-', '') + with io.open(os.path.expanduser("~/DEBUGLOG-%s.txt" % dateid), "a+") as fp: + print(nowstr, *args, file=fp) + + +# These next few could be in dcicutils.s3_utils as part of s3Utils, but details of interfaces would have to change. +# For now, for expedience, they can live here and we can refactor later. -kmp 25-Jul-2020 + +@contextlib.contextmanager +def s3_output_stream(s3_client, bucket, key): + """ + This context manager allows one to write: + + with s3_output_stream(s3_client, bucket, key) as fp: + print("foo", file=fp) + + to do output to an s3 bucket. 
+
+    In fact, an intermediate local file is involved, so this function yields a file pointer (fp) to a
+    temporary local file that is open for write. That fp should be used to supply content to the file
+    during the dynamic scope of the context manager. Once the context manager's body executes, the
+    file will be closed, its contents will be copied to s3, and finally the temporary local file will
+    be deleted.
+
+    Args:
+        s3_client: a client object that results from a boto3.client('s3', ...) call.
+        bucket str: an S3 bucket name
+        key str: the name of a key within the given S3 bucket
+    """
+
+    tempfile_name = tempfile.mktemp()
+    try:
+        with io.open(tempfile_name, 'w') as fp:
+            yield fp
+        s3_client.upload_file(Filename=tempfile_name, Bucket=bucket, Key=key)
+    finally:
+        try:
+            os.remove(tempfile_name)
+        except Exception:
+            pass
+
+
+
+@contextlib.contextmanager
+def s3_local_file(s3_client, bucket, key):
+    """
+    This context manager allows one to write:
+
+        with s3_local_file(s3_client, bucket, key) as file:
+            with io.open(file, 'r') as fp:
+                dictionary = json.load(fp)
+
+    to do input from an s3 bucket.
+    """
+
+    tempfile_name = tempfile.mktemp()
+    try:
+        s3_client.download_file(Bucket=bucket, Key=key, Filename=tempfile_name)
+        yield tempfile_name
+    finally:
+        try:
+            os.remove(tempfile_name)
+        except Exception:
+            pass
+
+
+@contextlib.contextmanager
+def s3_input_stream(s3_client, bucket, key, mode='r'):
+    """
+    This context manager allows one to write:
+
+        with s3_input_stream(s3_client, bucket, key) as fp:
+            dictionary = json.load(fp)
+
+    to do input from an s3 bucket.
+
+    In fact, an intermediate local file is created, copied, and deleted.
+    """
+
+    with s3_local_file(s3_client, bucket, key) as file:
+        with io.open(file, mode=mode) as fp:
+            yield fp
+
+
+def create_empty_s3_file(s3_client, bucket, key):
+    empty_file = "/dev/null"
+    s3_client.upload_file(empty_file, Bucket=bucket, Key=key)

From f71f2380f5b9ba3e59b0dbafcfe5c91840631299 Mon Sep 17 00:00:00 2001
From: Kent Pitman
Date: Sun, 26 Jul 2020 07:38:33 -0400
Subject: [PATCH 037/125] Update pyproject.toml for latest versions of snovault and utils.
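Aside, not part of the diffs above: the debuglog() calls that PATCH 036 sprinkles through ingestion_listener.py and submit.py are silent unless the DEBUGLOG_ENABLED environment variable is set to "true", which is easy to miss when running the new listener. A minimal sketch of turning it on; the shell line is only one way to export the variable, and the import path assumes the installed package name "encoded":

    # $ DEBUGLOG_ENABLED=true make deploy1b    # start the ingestion listener with debug logging enabled
    from encoded.util import debuglog

    debuglog("ingestion listener heartbeat")   # appends a timestamped line to ~/DEBUGLOG-YYYYMMDD.txt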
--- poetry.lock | 156 +++++++++++++++++++++++++------------------------ pyproject.toml | 4 +- 2 files changed, 81 insertions(+), 79 deletions(-) diff --git a/poetry.lock b/poetry.lock index 74056ecd4a..0fdaeac11c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -194,7 +194,7 @@ description = "Code coverage measurement for Python" name = "coverage" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" -version = "5.2" +version = "5.2.1" [package.extras] toml = ["toml"] @@ -221,17 +221,18 @@ description = "cryptography is a package which provides cryptographic recipes an name = "cryptography" optional = false python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*" -version = "2.9.2" +version = "3.0" [package.dependencies] cffi = ">=1.8,<1.11.3 || >1.11.3" six = ">=1.4.1" [package.extras] -docs = ["sphinx (>=1.6.5,<1.8.0 || >1.8.0)", "sphinx-rtd-theme"] +docs = ["sphinx (>=1.6.5,<1.8.0 || >1.8.0,<3.1.0 || >3.1.0,<3.1.1 || >3.1.1)", "sphinx-rtd-theme"] docstest = ["doc8", "pyenchant (>=1.6.11)", "twine (>=1.12.0)", "sphinxcontrib-spelling (>=4.0.1)"] idna = ["idna (>=2.1)"] -pep8test = ["flake8", "flake8-import-order", "pep8-naming"] +pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] +ssh = ["bcrypt (>=3.1.5)"] test = ["pytest (>=3.6.0,<3.9.0 || >3.9.0,<3.9.1 || >3.9.1,<3.9.2 || >3.9.2)", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,<3.79.2 || >3.79.2)"] [[package]] @@ -251,15 +252,15 @@ description = "Storage support for 4DN Data Portals." name = "dcicsnovault" optional = false python-versions = ">=3.6,<3.7" -version = "3.1.4" +version = "3.1.8" [package.dependencies] MarkupSafe = ">=0.23,<1" -Pillow = ">=3.1.1,<4.0.0" +Pillow = "3.1.1" PyBrowserID = ">=0.10.0,<0.11.0" PyYAML = ">=5.1,<5.3" SPARQLWrapper = ">=1.7.6,<2.0.0" -SQLAlchemy = ">=1.2.16,<2.0.0" +SQLAlchemy = "1.3.16" WSGIProxy2 = "0.4.2" WebOb = ">=1.8.5,<2.0.0" WebTest = ">=2.0.21,<3.0.0" @@ -267,7 +268,7 @@ aws_requests_auth = ">=0.4.1,<0.5.0" awscli = ">=1.15.42,<2.0.0" "backports.statistics" = "0.1.0" boto3 = ">=1.7.42,<2.0.0" -dcicutils = ">=0.25.0,<1" +dcicutils = ">=0.34.0,<1" elasticsearch_dsl = ">=5.3.0,<6.0.0" future = ">=0.15.2,<0.16.0" futures = ">=3.1.1,<4.0.0" @@ -302,7 +303,7 @@ venusian = ">=1.2.0,<2.0.0" xlrd = ">=1.0.0,<2.0.0" "zope.deprecation" = ">=4.4.0,<5.0.0" "zope.interface" = ">=4.6.0,<5.0.0" -"zope.sqlalchemy" = ">=1.2,<2.0" +"zope.sqlalchemy" = "1.3" [[package]] category = "main" @@ -310,7 +311,7 @@ description = "Utility package for interacting with the 4DN Data Portal and othe name = "dcicutils" optional = false python-versions = ">=3.4,<3.8" -version = "0.32.2" +version = "0.35.1" [package.dependencies] aws-requests-auth = ">=0.4.2,<1" @@ -495,7 +496,7 @@ description = "Python Git Library" name = "gitpython" optional = false python-versions = ">=3.4" -version = "3.1.3" +version = "3.1.7" [package.dependencies] gitdb = ">=4.0.1,<5" @@ -1409,7 +1410,7 @@ description = "Fast, Extensible Progress Meter" name = "tqdm" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*" -version = "4.47.0" +version = "4.48.0" [package.extras] dev = ["py-make (>=0.1.0)", "twine", "argopt", "pydoc-markdown"] @@ -1452,7 +1453,7 @@ description = "HTTP library with thread-safe connection pooling, file post, and name = "urllib3" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" -version = "1.25.9" +version = "1.25.10" [package.extras] brotli = ["brotlipy (>=0.6.0)"] @@ -1654,7 +1655,8 @@ transaction = ">=1.6.0" 
test = ["zope.testing"] [metadata] -content-hash = "f480337cad8012fe1536aef687bdbce1f60139b9b9887feeed77fc59be1db485" +content-hash = "7b0f42c553a61ed14b8ad181a6ba4b1c2e6d860cdaa10e01a43ab055a014549f" +lock-version = "1.0" python-versions = ">=3.6,<3.7" [metadata.files] @@ -1749,76 +1751,76 @@ colorama = [ {file = "colorama-0.3.3.tar.gz", hash = "sha256:eb21f2ba718fbf357afdfdf6f641ab393901c7ca8d9f37edd0bee4806ffa269c"}, ] coverage = [ - {file = "coverage-5.2-cp27-cp27m-macosx_10_13_intel.whl", hash = "sha256:d9ad0a988ae20face62520785ec3595a5e64f35a21762a57d115dae0b8fb894a"}, - {file = "coverage-5.2-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:4bb385a747e6ae8a65290b3df60d6c8a692a5599dc66c9fa3520e667886f2e10"}, - {file = "coverage-5.2-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:9702e2cb1c6dec01fb8e1a64c015817c0800a6eca287552c47a5ee0ebddccf62"}, - {file = "coverage-5.2-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:42fa45a29f1059eda4d3c7b509589cc0343cd6bbf083d6118216830cd1a51613"}, - {file = "coverage-5.2-cp27-cp27m-win32.whl", hash = "sha256:41d88736c42f4a22c494c32cc48a05828236e37c991bd9760f8923415e3169e4"}, - {file = "coverage-5.2-cp27-cp27m-win_amd64.whl", hash = "sha256:bbb387811f7a18bdc61a2ea3d102be0c7e239b0db9c83be7bfa50f095db5b92a"}, - {file = "coverage-5.2-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:3740b796015b889e46c260ff18b84683fa2e30f0f75a171fb10d2bf9fb91fc70"}, - {file = "coverage-5.2-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ebf2431b2d457ae5217f3a1179533c456f3272ded16f8ed0b32961a6d90e38ee"}, - {file = "coverage-5.2-cp35-cp35m-macosx_10_13_x86_64.whl", hash = "sha256:d54d7ea74cc00482a2410d63bf10aa34ebe1c49ac50779652106c867f9986d6b"}, - {file = "coverage-5.2-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:87bdc8135b8ee739840eee19b184804e5d57f518578ffc797f5afa2c3c297913"}, - {file = "coverage-5.2-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:ed9a21502e9223f563e071759f769c3d6a2e1ba5328c31e86830368e8d78bc9c"}, - {file = "coverage-5.2-cp35-cp35m-win32.whl", hash = "sha256:509294f3e76d3f26b35083973fbc952e01e1727656d979b11182f273f08aa80b"}, - {file = "coverage-5.2-cp35-cp35m-win_amd64.whl", hash = "sha256:ca63dae130a2e788f2b249200f01d7fa240f24da0596501d387a50e57aa7075e"}, - {file = "coverage-5.2-cp36-cp36m-macosx_10_13_x86_64.whl", hash = "sha256:5c74c5b6045969b07c9fb36b665c9cac84d6c174a809fc1b21bdc06c7836d9a0"}, - {file = "coverage-5.2-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:c32aa13cc3fe86b0f744dfe35a7f879ee33ac0a560684fef0f3e1580352b818f"}, - {file = "coverage-5.2-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:1e58fca3d9ec1a423f1b7f2aa34af4f733cbfa9020c8fe39ca451b6071237405"}, - {file = "coverage-5.2-cp36-cp36m-win32.whl", hash = "sha256:3b2c34690f613525672697910894b60d15800ac7e779fbd0fccf532486c1ba40"}, - {file = "coverage-5.2-cp36-cp36m-win_amd64.whl", hash = "sha256:a4d511012beb967a39580ba7d2549edf1e6865a33e5fe51e4dce550522b3ac0e"}, - {file = "coverage-5.2-cp37-cp37m-macosx_10_13_x86_64.whl", hash = "sha256:32ecee61a43be509b91a526819717d5e5650e009a8d5eda8631a59c721d5f3b6"}, - {file = "coverage-5.2-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:6f91b4492c5cde83bfe462f5b2b997cdf96a138f7c58b1140f05de5751623cf1"}, - {file = "coverage-5.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:bfcc811883699ed49afc58b1ed9f80428a18eb9166422bce3c31a53dba00fd1d"}, - {file = "coverage-5.2-cp37-cp37m-win32.whl", hash = "sha256:60a3d36297b65c7f78329b80120f72947140f45b5c7a017ea730f9112b40f2ec"}, - {file = "coverage-5.2-cp37-cp37m-win_amd64.whl", hash = 
"sha256:12eaccd86d9a373aea59869bc9cfa0ab6ba8b1477752110cb4c10d165474f703"}, - {file = "coverage-5.2-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:d82db1b9a92cb5c67661ca6616bdca6ff931deceebb98eecbd328812dab52032"}, - {file = "coverage-5.2-cp38-cp38-manylinux1_i686.whl", hash = "sha256:214eb2110217f2636a9329bc766507ab71a3a06a8ea30cdeebb47c24dce5972d"}, - {file = "coverage-5.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:8a3decd12e7934d0254939e2bf434bf04a5890c5bf91a982685021786a08087e"}, - {file = "coverage-5.2-cp38-cp38-win32.whl", hash = "sha256:1dcebae667b73fd4aa69237e6afb39abc2f27520f2358590c1b13dd90e32abe7"}, - {file = "coverage-5.2-cp38-cp38-win_amd64.whl", hash = "sha256:f50632ef2d749f541ca8e6c07c9928a37f87505ce3a9f20c8446ad310f1aa87b"}, - {file = "coverage-5.2-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:7403675df5e27745571aba1c957c7da2dacb537c21e14007ec3a417bf31f7f3d"}, - {file = "coverage-5.2-cp39-cp39-manylinux1_i686.whl", hash = "sha256:0fc4e0d91350d6f43ef6a61f64a48e917637e1dcfcba4b4b7d543c628ef82c2d"}, - {file = "coverage-5.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:25fe74b5b2f1b4abb11e103bb7984daca8f8292683957d0738cd692f6a7cc64c"}, - {file = "coverage-5.2-cp39-cp39-win32.whl", hash = "sha256:d67599521dff98ec8c34cd9652cbcfe16ed076a2209625fca9dc7419b6370e5c"}, - {file = "coverage-5.2-cp39-cp39-win_amd64.whl", hash = "sha256:10f2a618a6e75adf64329f828a6a5b40244c1c50f5ef4ce4109e904e69c71bd2"}, - {file = "coverage-5.2.tar.gz", hash = "sha256:1874bdc943654ba46d28f179c1846f5710eda3aeb265ff029e0ac2b52daae404"}, + {file = "coverage-5.2.1-cp27-cp27m-macosx_10_13_intel.whl", hash = "sha256:40f70f81be4d34f8d491e55936904db5c527b0711b2a46513641a5729783c2e4"}, + {file = "coverage-5.2.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:675192fca634f0df69af3493a48224f211f8db4e84452b08d5fcebb9167adb01"}, + {file = "coverage-5.2.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:2fcc8b58953d74d199a1a4d633df8146f0ac36c4e720b4a1997e9b6327af43a8"}, + {file = "coverage-5.2.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:64c4f340338c68c463f1b56e3f2f0423f7b17ba6c3febae80b81f0e093077f59"}, + {file = "coverage-5.2.1-cp27-cp27m-win32.whl", hash = "sha256:52f185ffd3291196dc1aae506b42e178a592b0b60a8610b108e6ad892cfc1bb3"}, + {file = "coverage-5.2.1-cp27-cp27m-win_amd64.whl", hash = "sha256:30bc103587e0d3df9e52cd9da1dd915265a22fad0b72afe54daf840c984b564f"}, + {file = "coverage-5.2.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:9ea749fd447ce7fb1ac71f7616371f04054d969d412d37611716721931e36efd"}, + {file = "coverage-5.2.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ce7866f29d3025b5b34c2e944e66ebef0d92e4a4f2463f7266daa03a1332a651"}, + {file = "coverage-5.2.1-cp35-cp35m-macosx_10_13_x86_64.whl", hash = "sha256:4869ab1c1ed33953bb2433ce7b894a28d724b7aa76c19b11e2878034a4e4680b"}, + {file = "coverage-5.2.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:a3ee9c793ffefe2944d3a2bd928a0e436cd0ac2d9e3723152d6fd5398838ce7d"}, + {file = "coverage-5.2.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:28f42dc5172ebdc32622a2c3f7ead1b836cdbf253569ae5673f499e35db0bac3"}, + {file = "coverage-5.2.1-cp35-cp35m-win32.whl", hash = "sha256:e26c993bd4b220429d4ec8c1468eca445a4064a61c74ca08da7429af9bc53bb0"}, + {file = "coverage-5.2.1-cp35-cp35m-win_amd64.whl", hash = "sha256:4186fc95c9febeab5681bc3248553d5ec8c2999b8424d4fc3a39c9cba5796962"}, + {file = "coverage-5.2.1-cp36-cp36m-macosx_10_13_x86_64.whl", hash = "sha256:b360d8fd88d2bad01cb953d81fd2edd4be539df7bfec41e8753fe9f4456a5082"}, + {file = 
"coverage-5.2.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:1adb6be0dcef0cf9434619d3b892772fdb48e793300f9d762e480e043bd8e716"}, + {file = "coverage-5.2.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:098a703d913be6fbd146a8c50cc76513d726b022d170e5e98dc56d958fd592fb"}, + {file = "coverage-5.2.1-cp36-cp36m-win32.whl", hash = "sha256:962c44070c281d86398aeb8f64e1bf37816a4dfc6f4c0f114756b14fc575621d"}, + {file = "coverage-5.2.1-cp36-cp36m-win_amd64.whl", hash = "sha256:b1ed2bdb27b4c9fc87058a1cb751c4df8752002143ed393899edb82b131e0546"}, + {file = "coverage-5.2.1-cp37-cp37m-macosx_10_13_x86_64.whl", hash = "sha256:c890728a93fffd0407d7d37c1e6083ff3f9f211c83b4316fae3778417eab9811"}, + {file = "coverage-5.2.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:538f2fd5eb64366f37c97fdb3077d665fa946d2b6d95447622292f38407f9258"}, + {file = "coverage-5.2.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:27ca5a2bc04d68f0776f2cdcb8bbd508bbe430a7bf9c02315cd05fb1d86d0034"}, + {file = "coverage-5.2.1-cp37-cp37m-win32.whl", hash = "sha256:aab75d99f3f2874733946a7648ce87a50019eb90baef931698f96b76b6769a46"}, + {file = "coverage-5.2.1-cp37-cp37m-win_amd64.whl", hash = "sha256:c2ff24df02a125b7b346c4c9078c8936da06964cc2d276292c357d64378158f8"}, + {file = "coverage-5.2.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:304fbe451698373dc6653772c72c5d5e883a4aadaf20343592a7abb2e643dae0"}, + {file = "coverage-5.2.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:c96472b8ca5dc135fb0aa62f79b033f02aa434fb03a8b190600a5ae4102df1fd"}, + {file = "coverage-5.2.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:8505e614c983834239f865da2dd336dcf9d72776b951d5dfa5ac36b987726e1b"}, + {file = "coverage-5.2.1-cp38-cp38-win32.whl", hash = "sha256:700997b77cfab016533b3e7dbc03b71d33ee4df1d79f2463a318ca0263fc29dd"}, + {file = "coverage-5.2.1-cp38-cp38-win_amd64.whl", hash = "sha256:46794c815e56f1431c66d81943fa90721bb858375fb36e5903697d5eef88627d"}, + {file = "coverage-5.2.1-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:16042dc7f8e632e0dcd5206a5095ebd18cb1d005f4c89694f7f8aafd96dd43a3"}, + {file = "coverage-5.2.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:c1bbb628ed5192124889b51204de27c575b3ffc05a5a91307e7640eff1d48da4"}, + {file = "coverage-5.2.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:4f6428b55d2916a69f8d6453e48a505c07b2245653b0aa9f0dee38785939f5e4"}, + {file = "coverage-5.2.1-cp39-cp39-win32.whl", hash = "sha256:9e536783a5acee79a9b308be97d3952b662748c4037b6a24cbb339dc7ed8eb89"}, + {file = "coverage-5.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:b8f58c7db64d8f27078cbf2a4391af6aa4e4767cc08b37555c4ae064b8558d9b"}, + {file = "coverage-5.2.1.tar.gz", hash = "sha256:a34cb28e0747ea15e82d13e14de606747e9e484fb28d63c999483f5d5188e89b"}, ] coveralls = [ {file = "coveralls-2.1.1-py2.py3-none-any.whl", hash = "sha256:3726d35c0f93a28631a003880e2aa6cc93c401d62bc6919c5cb497217ba30c55"}, {file = "coveralls-2.1.1.tar.gz", hash = "sha256:afe359cd5b350e1b3895372bda32af8f0260638c7c4a31a5c0f15aa6a96f40d9"}, ] cryptography = [ - {file = "cryptography-2.9.2-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:daf54a4b07d67ad437ff239c8a4080cfd1cc7213df57d33c97de7b4738048d5e"}, - {file = "cryptography-2.9.2-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:3b3eba865ea2754738616f87292b7f29448aec342a7c720956f8083d252bf28b"}, - {file = "cryptography-2.9.2-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:c447cf087cf2dbddc1add6987bbe2f767ed5317adb2d08af940db517dd704365"}, - {file = "cryptography-2.9.2-cp27-cp27m-win32.whl", hash = 
"sha256:f118a95c7480f5be0df8afeb9a11bd199aa20afab7a96bcf20409b411a3a85f0"}, - {file = "cryptography-2.9.2-cp27-cp27m-win_amd64.whl", hash = "sha256:c4fd17d92e9d55b84707f4fd09992081ba872d1a0c610c109c18e062e06a2e55"}, - {file = "cryptography-2.9.2-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:d0d5aeaedd29be304848f1c5059074a740fa9f6f26b84c5b63e8b29e73dfc270"}, - {file = "cryptography-2.9.2-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:1e4014639d3d73fbc5ceff206049c5a9a849cefd106a49fa7aaaa25cc0ce35cf"}, - {file = "cryptography-2.9.2-cp35-abi3-macosx_10_9_x86_64.whl", hash = "sha256:96c080ae7118c10fcbe6229ab43eb8b090fccd31a09ef55f83f690d1ef619a1d"}, - {file = "cryptography-2.9.2-cp35-abi3-manylinux1_x86_64.whl", hash = "sha256:e993468c859d084d5579e2ebee101de8f5a27ce8e2159959b6673b418fd8c785"}, - {file = "cryptography-2.9.2-cp35-abi3-manylinux2010_x86_64.whl", hash = "sha256:88c881dd5a147e08d1bdcf2315c04972381d026cdb803325c03fe2b4a8ed858b"}, - {file = "cryptography-2.9.2-cp35-cp35m-win32.whl", hash = "sha256:651448cd2e3a6bc2bb76c3663785133c40d5e1a8c1a9c5429e4354201c6024ae"}, - {file = "cryptography-2.9.2-cp35-cp35m-win_amd64.whl", hash = "sha256:726086c17f94747cedbee6efa77e99ae170caebeb1116353c6cf0ab67ea6829b"}, - {file = "cryptography-2.9.2-cp36-cp36m-win32.whl", hash = "sha256:091d31c42f444c6f519485ed528d8b451d1a0c7bf30e8ca583a0cac44b8a0df6"}, - {file = "cryptography-2.9.2-cp36-cp36m-win_amd64.whl", hash = "sha256:bb1f0281887d89617b4c68e8db9a2c42b9efebf2702a3c5bf70599421a8623e3"}, - {file = "cryptography-2.9.2-cp37-cp37m-win32.whl", hash = "sha256:18452582a3c85b96014b45686af264563e3e5d99d226589f057ace56196ec78b"}, - {file = "cryptography-2.9.2-cp37-cp37m-win_amd64.whl", hash = "sha256:22e91636a51170df0ae4dcbd250d318fd28c9f491c4e50b625a49964b24fe46e"}, - {file = "cryptography-2.9.2-cp38-cp38-win32.whl", hash = "sha256:844a76bc04472e5135b909da6aed84360f522ff5dfa47f93e3dd2a0b84a89fa0"}, - {file = "cryptography-2.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:1dfa985f62b137909496e7fc182dac687206d8d089dd03eaeb28ae16eec8e7d5"}, - {file = "cryptography-2.9.2.tar.gz", hash = "sha256:a0c30272fb4ddda5f5ffc1089d7405b7a71b0b0f51993cb4e5dbb4590b2fc229"}, + {file = "cryptography-3.0-cp27-cp27m-macosx_10_10_x86_64.whl", hash = "sha256:ab49edd5bea8d8b39a44b3db618e4783ef84c19c8b47286bf05dfdb3efb01c83"}, + {file = "cryptography-3.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:124af7255ffc8e964d9ff26971b3a6153e1a8a220b9a685dc407976ecb27a06a"}, + {file = "cryptography-3.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:51e40123083d2f946794f9fe4adeeee2922b581fa3602128ce85ff813d85b81f"}, + {file = "cryptography-3.0-cp27-cp27m-win32.whl", hash = "sha256:dea0ba7fe6f9461d244679efa968d215ea1f989b9c1957d7f10c21e5c7c09ad6"}, + {file = "cryptography-3.0-cp27-cp27m-win_amd64.whl", hash = "sha256:8ecf9400d0893836ff41b6f977a33972145a855b6efeb605b49ee273c5e6469f"}, + {file = "cryptography-3.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:0c608ff4d4adad9e39b5057de43657515c7da1ccb1807c3a27d4cf31fc923b4b"}, + {file = "cryptography-3.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:bec7568c6970b865f2bcebbe84d547c52bb2abadf74cefce396ba07571109c67"}, + {file = "cryptography-3.0-cp35-abi3-macosx_10_10_x86_64.whl", hash = "sha256:0cbfed8ea74631fe4de00630f4bb592dad564d57f73150d6f6796a24e76c76cd"}, + {file = "cryptography-3.0-cp35-abi3-manylinux1_x86_64.whl", hash = "sha256:a09fd9c1cca9a46b6ad4bea0a1f86ab1de3c0c932364dbcf9a6c2a5eeb44fa77"}, + {file = "cryptography-3.0-cp35-abi3-manylinux2010_x86_64.whl", 
hash = "sha256:ce82cc06588e5cbc2a7df3c8a9c778f2cb722f56835a23a68b5a7264726bb00c"}, + {file = "cryptography-3.0-cp35-cp35m-win32.whl", hash = "sha256:9367d00e14dee8d02134c6c9524bb4bd39d4c162456343d07191e2a0b5ec8b3b"}, + {file = "cryptography-3.0-cp35-cp35m-win_amd64.whl", hash = "sha256:384d7c681b1ab904fff3400a6909261cae1d0939cc483a68bdedab282fb89a07"}, + {file = "cryptography-3.0-cp36-cp36m-win32.whl", hash = "sha256:4d355f2aee4a29063c10164b032d9fa8a82e2c30768737a2fd56d256146ad559"}, + {file = "cryptography-3.0-cp36-cp36m-win_amd64.whl", hash = "sha256:45741f5499150593178fc98d2c1a9c6722df88b99c821ad6ae298eff0ba1ae71"}, + {file = "cryptography-3.0-cp37-cp37m-win32.whl", hash = "sha256:8ecef21ac982aa78309bb6f092d1677812927e8b5ef204a10c326fc29f1367e2"}, + {file = "cryptography-3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:4b9303507254ccb1181d1803a2080a798910ba89b1a3c9f53639885c90f7a756"}, + {file = "cryptography-3.0-cp38-cp38-win32.whl", hash = "sha256:8713ddb888119b0d2a1462357d5946b8911be01ddbf31451e1d07eaa5077a261"}, + {file = "cryptography-3.0-cp38-cp38-win_amd64.whl", hash = "sha256:bea0b0468f89cdea625bb3f692cd7a4222d80a6bdafd6fb923963f2b9da0e15f"}, + {file = "cryptography-3.0.tar.gz", hash = "sha256:8e924dbc025206e97756e8903039662aa58aa9ba357d8e1d8fc29e3092322053"}, ] dcicpyvcf = [ {file = "dcicpyvcf-1.0.0.tar.gz", hash = "sha256:c5bf8d585002ab3b95d13a47803376b456b931865e4189c38a18cca47b108449"}, ] dcicsnovault = [ - {file = "dcicsnovault-3.1.4-py3-none-any.whl", hash = "sha256:3ad78b95255f4a409fb7e29d1933fee113b03c2a6abf65e62b821a0e6ac1666e"}, - {file = "dcicsnovault-3.1.4.tar.gz", hash = "sha256:5efc2ea37d0fc78411817925d63c383e322680f264629c2060533a50721a9bd4"}, + {file = "dcicsnovault-3.1.8-py3-none-any.whl", hash = "sha256:928fc529d769208356cfdb4aade6bf39bdb31db2cc4d35899536d505a89704ee"}, + {file = "dcicsnovault-3.1.8.tar.gz", hash = "sha256:0127c6dde6eef7271cf07a63fa141f596c25b9109c5fccb40892fbe0e4e0b464"}, ] dcicutils = [ - {file = "dcicutils-0.32.2-py3-none-any.whl", hash = "sha256:7403d422a12160162a9691aff2af04f6f37869fc40252f2f61cd92a10076a4b2"}, - {file = "dcicutils-0.32.2.tar.gz", hash = "sha256:888feae7870294fe12979fbe567b653874273d149bcc31ecef5817300b01c0f1"}, + {file = "dcicutils-0.35.1-py3-none-any.whl", hash = "sha256:5f6cd17fb1c78adfaca9f2e23dedbe4a8a3d8c0dee2dc73cdd7f2dd832debc5d"}, + {file = "dcicutils-0.35.1.tar.gz", hash = "sha256:7924f83de55673b580d02242a4bcc3440c290f1139a862f951ef00958a24fb71"}, ] docker = [ {file = "docker-4.2.2-py2.py3-none-any.whl", hash = "sha256:03a46400c4080cb6f7aa997f881ddd84fef855499ece219d75fbdb53289c17ab"}, @@ -1872,8 +1874,8 @@ gitdb = [ {file = "gitdb-4.0.5.tar.gz", hash = "sha256:c9e1f2d0db7ddb9a704c2a0217be31214e91a4fe1dea1efad19ae42ba0c285c9"}, ] gitpython = [ - {file = "GitPython-3.1.3-py3-none-any.whl", hash = "sha256:ef1d60b01b5ce0040ad3ec20bc64f783362d41fa0822a2742d3586e1f49bb8ac"}, - {file = "GitPython-3.1.3.tar.gz", hash = "sha256:e107af4d873daed64648b4f4beb89f89f0cfbe3ef558fc7821ed2331c2f8da1a"}, + {file = "GitPython-3.1.7-py3-none-any.whl", hash = "sha256:fa3b92da728a457dd75d62bb5f3eb2816d99a7fe6c67398e260637a40e3fafb5"}, + {file = "GitPython-3.1.7.tar.gz", hash = "sha256:2db287d71a284e22e5c2846042d0602465c7434d910406990d5b74df4afb0858"}, ] html5lib = [ {file = "html5lib-0.9999999.tar.gz", hash = "sha256:2612a191a8d5842bfa057e41ba50bbb9dcb722419d2408c78cff4758d0754868"}, @@ -2379,8 +2381,8 @@ toml = [ {file = "toml-0.10.1.tar.gz", hash = "sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f"}, ] tqdm = [ - 
{file = "tqdm-4.47.0-py2.py3-none-any.whl", hash = "sha256:7810e627bcf9d983a99d9ff8a0c09674400fd2927eddabeadf153c14a2ec8656"}, - {file = "tqdm-4.47.0.tar.gz", hash = "sha256:63ef7a6d3eb39f80d6b36e4867566b3d8e5f1fe3d6cb50c5e9ede2b3198ba7b7"}, + {file = "tqdm-4.48.0-py2.py3-none-any.whl", hash = "sha256:fcb7cb5b729b60a27f300b15c1ffd4744f080fb483b88f31dc8654b082cc8ea5"}, + {file = "tqdm-4.48.0.tar.gz", hash = "sha256:6baa75a88582b1db6d34ce4690da5501d2a1cb65c34664840a456b2c9f794d29"}, ] transaction = [ {file = "transaction-2.4.0-py2.py3-none-any.whl", hash = "sha256:b96a5e9aaa73f905759bc9ccf0021bf4864c01ac36666e0d28395e871f6d584a"}, @@ -2394,8 +2396,8 @@ uptime = [ {file = "uptime-3.0.1.tar.gz", hash = "sha256:7c300254775b807ce46e3dcbcda30aa3b9a204b9c57a7ac1e79ee6dbe3942973"}, ] urllib3 = [ - {file = "urllib3-1.25.9-py2.py3-none-any.whl", hash = "sha256:88206b0eb87e6d677d424843ac5209e3fb9d0190d0ee169599165ec25e9d9115"}, - {file = "urllib3-1.25.9.tar.gz", hash = "sha256:3018294ebefce6572a474f0604c2021e33b3fd8006ecd11d62107a5d2a963527"}, + {file = "urllib3-1.25.10-py2.py3-none-any.whl", hash = "sha256:e7983572181f5e1522d9c98453462384ee92a0be7fac5f1413a1e35c56cc0461"}, + {file = "urllib3-1.25.10.tar.gz", hash = "sha256:91056c15fa70756691db97756772bb1eb9678fa585d9184f24534b100dc60f4a"}, ] venusian = [ {file = "venusian-1.2.0-py2.py3-none-any.whl", hash = "sha256:2f2d077a1eedc3fda40425f65687c8c494da7e83d7c23bc2c4d1a40eb3ca5b6d"}, diff --git a/pyproject.toml b/pyproject.toml index c251dffd51..83d2d9ff01 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,8 +42,8 @@ certifi = ">=2020.4.5.2" chardet = "3.0.4" colorama = "0.3.3" dcicpyvcf = "1.0.0" -dcicsnovault = ">=3.1.4,<4" -dcicutils = ">=0.31.1,<1" +dcicsnovault = ">=3.1.8,<4" +dcicutils = ">=0.35.1,<1" docutils = "0.12" elasticsearch = "5.5.3" elasticsearch-dsl = "^5.4.0" From ce9eb26cd1e37cad1d7d244c87be39f0609a63f7 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Sun, 26 Jul 2020 20:54:50 -0400 Subject: [PATCH 038/125] WIP: Attempts to use a IngestionSubmission type for progress info. 
--- src/encoded/ingestion_engines.py | 35 + src/encoded/ingestion_listener.py | 24 +- src/encoded/schemas/ingestion_submission.json | 87 +++ src/encoded/submit.py | 8 +- src/encoded/submit.py.SAVE | 664 ++++++++++++++++++ src/encoded/types/ingestion.py | 42 ++ 6 files changed, 851 insertions(+), 9 deletions(-) create mode 100644 src/encoded/schemas/ingestion_submission.json create mode 100644 src/encoded/submit.py.SAVE create mode 100644 src/encoded/types/ingestion.py diff --git a/src/encoded/ingestion_engines.py b/src/encoded/ingestion_engines.py index 05a5f0f82e..4a8abf6b07 100644 --- a/src/encoded/ingestion_engines.py +++ b/src/encoded/ingestion_engines.py @@ -76,6 +76,23 @@ def handle_data_bundle(*, uuid, ingestion_type, vapp, log): try: + if isinstance(institution, str): + institution = vapp.get(institution).json + if isinstance(project, str): + project = vapp.get(project).json + + vapp.patch_json("/ingestion-submission", { + "object_name": manifest['object_name'], + "ingestion_type": ingestion_type, + "submission_id": uuid, + "parameters": manifest['parameters'], + "institution": institution, + "project": project, + "processing_status": { + "state": "processing", + } + }) + validation_log_lines, final_json, result_lines = submit_data_bundle(s3_client=s3_client, bucket=DATA_BUNDLE_BUCKET, key=data_key, @@ -96,6 +113,15 @@ def handle_data_bundle(*, uuid, ingestion_type, vapp, log): with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key=submission_response_key) as fp: _show_report_lines(result_lines, fp) + vapp.patch_json("ingestion-submission", { + "submission_id": uuid, + "progress": { + "state": "done", + "outcome": "failure" if validation_log_lines else "success", + "progress": "complete", + }, + }) + except Exception as e: resolution["traceback_key"] = traceback_key = "%s/traceback.json" % uuid @@ -105,5 +131,14 @@ def handle_data_bundle(*, uuid, ingestion_type, vapp, log): resolution["error_type"] = e.__class__.__name__ resolution["error_message"] = str(e) + vapp.patch_json("ingestion-submission", { + "submission_id": uuid, + "progress": { + "state": "done", + "outcome": "error", + "progress": "incomplete", + }, + }) + with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key="%s/resolution.json" % uuid) as fp: print(json.dumps(resolution, indent=2), file=fp) diff --git a/src/encoded/ingestion_listener.py b/src/encoded/ingestion_listener.py index 98c472d6d9..f69766c41f 100644 --- a/src/encoded/ingestion_listener.py +++ b/src/encoded/ingestion_listener.py @@ -74,10 +74,10 @@ def submit_for_ingestion(context, request): # NOTE: Some reference information about uploading files to s3 is here: # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html - upload_id = str(uuid.uuid4()) + submission_id = str(uuid.uuid4()) _, ext = os.path.splitext(filename) - object_name = "{id}/datafile{ext}".format(id=upload_id, ext=ext) - manifest_name = "{id}/manifest.json".format(id=upload_id) + object_name = "{id}/datafile{ext}".format(id=submission_id, ext=ext) + manifest_name = "{id}/manifest.json".format(id=submission_id) s3_client = boto3.client('s3') @@ -98,6 +98,7 @@ def submit_for_ingestion(context, request): result = { "filename": filename, "object_name": object_name, + "submission_id": submission_id, "bucket": DATA_BUNDLE_BUCKET, "success": success, "message": message, @@ -123,7 +124,7 @@ def submit_for_ingestion(context, request): raise SubmissionFailure(message) queue_manager = get_queue_manager(request, override_name=override_name) - _, failed = 
queue_manager.add_uuids([upload_id], ingestion_type=ingestion_type) + _, failed = queue_manager.add_uuids([submission_id], ingestion_type=ingestion_type) if failed: # If there's a failure, failed will be a list of one problem description since we only submitted one thing. @@ -214,7 +215,7 @@ def __init__(self, registry, override_name=None): 'region_name': 'us-east-1' } self.client = boto3.client('sqs', **kwargs) - self.queue_name = self.env_name + self.BUCKET_EXTENSION if not override_name else override_name + self.queue_name = override_name or (self.env_name + self.BUCKET_EXTENSION) self.queue_attrs = { self.queue_name: { 'DelaySeconds': '1', # messages initially invisible for 1 sec @@ -549,9 +550,17 @@ def run(self): if ingestion_type != 'vcf': # Let's minimally disrupt things for now. We can refactor this later # to make all the parts work the same -kmp + self.vapp.post_json("/ingestion-submission", { + "ingestion_type": ingestion_type, + "submission_id": uuid, + }) handler = get_ingestion_processor(ingestion_type) handler(uuid=uuid, ingestion_type=ingestion_type, vapp=self.vapp, log=log) - print("HANDLED", uuid) + # TODO: If we delete messages at the end of each loop, I think we'll here need to do this, + # since we're bypassing bottom of lop with the 'continue': + # self.delete_messages([message]) + # messages.remove(message) + debuglog("HANDLED", uuid) continue debuglog("Did NOT process", uuid, "as", ingestion_type) @@ -597,6 +606,9 @@ def run(self): log.error(msg) self.update_status(msg=msg) + # TODO: I worry waiting to delete multiple messages means that if there's an error + # we'll have things that were completed not get deleted. Should delete one per iteration? + # -kmp 26-Jul-2020 self.delete_messages(messages) diff --git a/src/encoded/schemas/ingestion_submission.json b/src/encoded/schemas/ingestion_submission.json new file mode 100644 index 0000000000..1de4a11dbe --- /dev/null +++ b/src/encoded/schemas/ingestion_submission.json @@ -0,0 +1,87 @@ +{ + "title": "Ingestion Submission", + "description": "Schema for metadata related to ingestion requests submitted to CGAP.", + "id": "/profiles/ingestion_submission.json", + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "object", + "required": [ + "ingestion_type", + "submission_id" + ], + "additionalProperties": false, + "identifyingProperties": ["uuid", "aliases"], + "mixinProperties": [ + { "$ref": "mixins.json#/schema_version" }, + { "$ref": "mixins.json#/aliases" }, + { "$ref": "mixins.json#/uuid" }, + { "$ref": "mixins.json#/documents" }, + { "$ref": "mixins.json#/attribution" }, + { "$ref": "mixins.json#/status" }, + { "$ref": "mixins.json#/submitted" }, + { "$ref": "mixins.json#/modified" }, + { "$ref": "mixins.json#/static_embeds" } + ], + "mixinFacets" : [ + { "$ref": "mixins.json#/facets_common" } + ], + "properties": { + "schema_version": { + "default": "3" + }, + "object_name": { + "title": "Object Name", + "type": "string" + }, + "ingestion_type": { + "title": "Ingestion Type", + "type": "string", + "enum": [ + "data_bundle", + "vcf" + ] + }, + "submission_id": { + "title": "Submission ID", + "type": "string" + }, + "parameters": { + "title": "Parameters", + "type": "object", + "additionalProperties": true, + "properties": {} + }, + "processing_status": { + "title": "Processing Status", + "type": "object", + "additionalProperties": false, + "properties": { + "state": { + "title": "State", + "type": "string", + "enum": [ + "submitted", + "processing", + "done" + ], + "default": "submitted" + }, + 
"outcome": { + "title": "Outcome", + "type": "string", + "enum": [ + "unknown", + "success", + "failure", + "error" + ], + "default": "unknown" + }, + "progress": { + "title": "Progress", + "type": "string", + "default": "unavailable" + } + } + } + } +} diff --git a/src/encoded/submit.py b/src/encoded/submit.py index 17d2104450..76d850e3a3 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -103,9 +103,11 @@ def submit_data_bundle(*, s3_client, bucket, key, project, institution, vapp): log: a logging object capable of .info, .warning, .error, or .debug messages """ with s3_local_file(s3_client, bucket=bucket, key=key) as file: - project_json = vapp.get(project).json - institution_json = vapp.get(institution).json - json_data = xls_to_json(file, project=project_json, institution=institution_json) + assert isinstance(project, dict) + assert isinstance(institution, dict) + # project_json = vapp.get(project).json + # institution_json = vapp.get(institution).json + json_data = xls_to_json(file, project=project, institution=institution) final_json, validation_log_lines = validate_all_items(vapp, json_data) result_lines = post_and_patch_all_items(vapp, final_json) return validation_log_lines, final_json, result_lines diff --git a/src/encoded/submit.py.SAVE b/src/encoded/submit.py.SAVE new file mode 100644 index 0000000000..67b2338524 --- /dev/null +++ b/src/encoded/submit.py.SAVE @@ -0,0 +1,664 @@ +import ast +import datetime +import json +import xlrd + +from dcicutils.misc_utils import VirtualApp, VirtualAppError +from dcicutils import ff_utils +from pyramid.paster import get_app +from pyramid.response import Response +from snovault.util import debug_log +from pyramid.view import view_config +from webtest.app import AppError +from .common import s3_local_file, debuglog + + +GENERIC_FIELD_MAPPING = { + 'individual': {}, + 'family': {}, + 'sample': { + 'date collected': 'specimen_collection_date', + 'location stored': 'specimen_storage_location', + 'specimen id': 'specimen_accession', + 'transport method': 'transported_by', + 'sequencing ref lab': 'sequencing_lab', + "date rec'd at ref lab": 'date_received', + 'specimen accepted by ref lab': 'specimen_accepted', + 'sample id by ref lab': 'sequence_id', + 'req type': 'requisition_type', + "date req rec'd": 'date_requisition_received', + 'physician/provider': 'ordering_physician' + }, + 'requisition': { + 'req accepted y/n': 'accepted_rejected', + 'reason rejected': 'rejection_reason', + 'corrective action taken': 'corrective_action', + 'corrective action taken by': 'action_taken_by', + 'correction notes': 'notes' + } +} + +# BGM_FIELD_MAPPING = { +# 'bcgg-id': 'patient id', +# 'bcgg-f-id': 'family id', +# "date req rec'd": 'date requisition received' +# } + + +POST_ORDER = [ + 'file_fastq', 'file_processed', 'sample', 'individual', + 'family', 'sample_processing', 'report', 'case' +] + + +LINKS = [ + 'samples', 'members', 'mother', 'father', 'proband', 'report', + 'individual', 'sample_processing', 'families' +] + + + +# This "/submit_data" endpoint is a placeholder for a submission endpoint modified from loadxl. +# +# NOTES FROM KMP (25-Jul-2020): +# +# This will be done differently soon as part of the "/submit_for_ingestion" endpoint that +# will be in ingestion_listener.py. That endpoint will need an "?ingestion type=data_bundle" +# as query parameter. That "data_bundle" ingestion type will defined in ingestion_engines.py. 
+# The new entry point here that will be needed is submit_data_bundle, and then this temporary +# "/submit_data" endpoint can presumably go away.. -kmp 25-Jul-2020 + +@view_config(route_name='submit_data', request_method='POST', permission='add') +@debug_log +def submit_data(context, request): + ''' + usage notes here later + ''' + config_uri = request.json.get('config_uri', 'production.ini') + patch_only = request.json.get('patch_only', False) + post_only = request.json.get('post_only', False) + app = get_app(config_uri, 'app') + environ = {'HTTP_ACCEPT': 'application/json', 'REMOTE_USER': 'TEST'} + virtualapp = VirtualApp(app, environ) + # expected response + request.response.status = 200 + result = { + 'status': 'success', + '@type': ['result'], + } + + raise NotImplementedError + +# This endpoint will soon be the primary entry point. Please keep it working as-is and do not remove it. +# -kmp 25-Jul-2020 +def submit_data_bundle(*, s3_client, bucket, key, project, institution, vapp): # All keyword arguments, all required. + """ + Handles processing of a submitted workbook. + + Args: + data_stream: an open stream to xls workbook data + project: a project identifier + institution: an institution identifier + vapp: a VirtualApp object + log: a logging object capable of .info, .warning, .error, or .debug messages + """ + with s3_local_file(s3_client, bucket=bucket, key=key) as file: + project_json = vapp.get(project).json + institution_json = vapp.get(institution).json + json_data = xls_to_json(file, project=project_json, institution=institution_json) + final_json, validation_log_lines = validate_all_items(vapp, json_data) + result_lines = post_and_patch_all_items(vapp, final_json) + return validation_log_lines, final_json, result_lines + + +def map_fields(row, metadata_dict, addl_fields, item_type): + for map_field in GENERIC_FIELD_MAPPING[item_type]: + if map_field in row: + metadata_dict[GENERIC_FIELD_MAPPING[item_type][map_field]] = row.get(map_field) + for field in addl_fields: + metadata_dict[field] = row.get(field.replace('_', ' ')) + return metadata_dict + + +def xls_to_json(xls_data, project, institution): + ''' + Converts excel file to json for submission. + Functional but expect future changes. + ''' + book = xlrd.open_workbook(xls_data) + sheet, = book.sheets() + row = row_generator(sheet) + top_header = next(row) + debuglog("top_header:", top_header) # Temporary instrumentation for debugging to go away soon. -kmp 25-Jul-2020 + keys = next(row) + debuglog("keys:", keys) # Temporary instrumentation for debugging to go away soon. -kmp 25-Jul-2020 + descriptions = next(row) + debuglog("descriptions:", descriptions) # Temporary instrumentation for debugging to go away soon. -kmp 25-Jul-2020 + rows = [] + counter = 0 + for values in row: + r = [val for val in values] + row_dict = {keys[i].lower().rstrip('*'): item for i, item in enumerate(r)} + rows.append(row_dict) + + items = { + 'individual': {}, 'family': {}, 'sample': {}, 'sample_processing': {}, + 'file_fastq': {}, 'file_processed': {}, 'case': {}, 'report': {}, + 'reports': [] + } + file_errors = [] + specimen_ids = {} + family_dict = create_families(rows) + a_types = get_analysis_types(rows) + for row in rows: + debuglog("row:", repr(row)) # Temporary instrumentation for debugging to go away soon. 
-kmp 25-Jul-2020 + indiv_alias = '{}:individual-{}'.format(project['name'], row['individual id']) + fam_alias = '{}:{}'.format(project['name'], family_dict[row['analysis id']]) + # sp_alias = '{}:sampleproc-{}'.format(project['name'], row['specimen id']) + # create items for Individual + items = fetch_individual_metadata(row, items, indiv_alias, institution['name']) + # create/edit items for Family + items = fetch_family_metadata(row, items, indiv_alias, fam_alias) + # create item for Sample if there is a specimen + if row.get('specimen id'): + samp_alias = '{}:sample-{}'.format(project['name'], row['specimen id']) + if row['specimen id'] in specimen_ids: + samp_alias = samp_alias + '-' + specimen_ids[row['specimen id']] + specimen_ids[row['specimen id']] += 1 + else: + specimen_ids[row['specimen id']] = 1 + analysis_alias = '{}:analysis-{}'.format(project['name'], row['analysis id']) + items = fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, + fam_alias, project['name'], a_types) + if row.get('files'): + file_items = fetch_file_metadata(row['files'].split(','), project['name']) + file_errors.extend(file_items['errors']) + items['file_fastq'].update(file_items['file_fastq']) + items['file_processed'].update(file_items['file_processed']) + else: + print('WARNING: No specimen id present for patient {},' + ' sample will not be created.'.format(row['individual id'])) + # create SampleProcessing item for trio/group if needed + # items = create_sample_processing_groups(items, sp_alias) + items = add_relations(items) + items = create_case_items(items, project['name']) + # removed unused fields, add project and institution + for val1 in items.values(): + for val2 in val1.values(): + remove_keys = [k for k, v in val2.items() if not v] + for key in remove_keys: + del val2[key] + val2['project'] = project['@id'] + val2['institution'] = institution['@id'] + items['file_errors'] = file_errors + return items + + +def create_families(rows): + proband_rows = [row for row in rows if row.get('relation to proband').lower() == 'proband'] + fams = {row.get('analysis id'): 'family-{}'.format(row.get('individual id')) for row in proband_rows} + return fams + + +def get_analysis_types(rows): + analysis_relations = {} + analysis_types = {} + for row in rows: + analysis_relations.setdefault(row.get('analysis id'), [[], []]) + analysis_relations[row.get('analysis id')][0].append(row.get('relation to proband', '').lower()) + analysis_relations[row.get('analysis id')][1].append(row.get('workup type', '').upper()) + for k, v in analysis_relations.items(): + if len(list(set(v[1]))) == 1: + if len(v[0]) == 1: + analysis_types[k] = v[1][0] + elif sorted(v[0]) == ['father', 'mother', 'proband']: + analysis_types[k] = v[1][0] + '-Trio' + else: + analysis_types[k] = v[1][0] + '-Group' + else: + analysis_types[k] = None + return analysis_types + + +def fetch_individual_metadata(row, items, indiv_alias, inst_name): + new_items = items.copy() + info = {'aliases': [indiv_alias]} + info = map_fields(row, info, ['individual_id', 'sex', 'age', 'birth_year'], 'individual') + if row.get('other individual id'): + other_id = {'id': row['other individual id'], 'id_source': inst_name} + if row.get('other individual id type'): + other_id['id_source'] = row['other individual id source'] + info['institutional_id'] = other_id + info['age'] = int(info['age']) if info.get('age') else None + info['birth_year'] = int(info['birth year']) if info.get('birth year') else None + if indiv_alias not in new_items['individual']: 
+ new_items['individual'][indiv_alias] = {k: v for k, v in info.items() if v} + else: + for key in info: + if key not in new_items['individual'][indiv_alias]: + new_items['individual'][indiv_alias][key] = info[key] + return new_items + + +def fetch_family_metadata(row, items, indiv_alias, fam_alias): + new_items = items.copy() + info = { + 'aliases': [fam_alias], + 'family_id': row['family id'], + 'members': [indiv_alias] + } + if fam_alias not in new_items['family']: + new_items['family'][fam_alias] = info + if indiv_alias not in new_items['family'][fam_alias]['members']: + new_items['family'][fam_alias]['members'].append(indiv_alias) + for relation in ['proband', 'mother', 'father', 'brother', 'sister', 'sibling']: + if row.get('relation to proband', '').lower() == relation and relation not in new_items['family'][fam_alias]: + new_items['family'][fam_alias][relation] = indiv_alias + return new_items + + +def fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, fam_alias, proj_name, analysis_type_dict): + new_items = items.copy() + info = {'aliases': [samp_alias], 'files': []} # TODO: implement creation of file db items + fields = [ + 'workup_type', 'specimen_type', 'dna_concentration', 'date_transported', + 'specimen_notes', 'research_protocol_name', 'sent_by', 'physician_id', 'indication' + ] + info = map_fields(row, info, fields, 'sample') + if info.get('specimen_accepted', '').lower() == 'y': + info['specimen_accepted'] = 'Yes' + elif info.get('specimen_accepted', '').lower() == 'n': + info['specimen_accepted'] = 'No' + if row.get('second specimen id'): + other_id = {'id': row['second specimen id'], 'id_type': proj_name} # add proj info? + if row.get('second specimen id type'): + other_id['id_type'] = row['second specimen id type'] + info['other_specimen_ids'] = [other_id] + req_info = map_fields(row, {}, ['date sent', 'date completed'], 'requisition') + if req_info.get('accepted_rejected', '').lower() in ['yes', 'no', 'y', 'n']: + if req_info['accepted_rejected'].lower().startswith('y'): + req_info['accepted_rejected'] = 'Accepted' + else: + req_info['accepted_rejected'] = "Rejected" + info['requisition_acceptance'] = {k: v for k, v in req_info.items() if v} + new_items['sample'][samp_alias] = {k: v for k, v in info.items() if v} + if indiv_alias in new_items['individual']: + new_items['individual'][indiv_alias]['samples'] = [samp_alias] + new_sp_item = { + # not trivial to add analysis_type here, turn into calculated property + 'aliases': [analysis_alias], + 'samples': [], + 'families': [] + } + if row.get('analysis id') in analysis_type_dict: + new_sp_item['analysis_type'] = analysis_type_dict[row.get('analysis id')] + new_items['sample_processing'].setdefault(analysis_alias, new_sp_item) + new_items['sample_processing'][analysis_alias]['samples'].append(samp_alias) + if row.get('report required').lower().startswith('y'): + new_items['reports'].append(samp_alias) + if fam_alias not in new_items['sample_processing'][analysis_alias]['families']: + new_items['sample_processing'][analysis_alias]['families'].append(fam_alias) + return new_items + + +# TODO: finish implementing this function +def fetch_file_metadata(filenames, proj_name): + valid_extensions = { + '.fastq.gz': ('fastq', 'reads'), + '.fq.gz': ('fastq', 'reads'), + '.cram': ('cram', 'alignments'), + '.vcf.gz': ('vcf_gz', 'raw VCF') + } + files = {'file_fastq': {}, 'file_processed': {}, 'errors': []} + for filename in filenames: + extension = [ext for ext in valid_extensions if filename.endswith(ext)] + 
if not extension: + if [ext for ext in ['.fastq', '.fq', '.vcf'] if filename.endswith(ext)]: + files['errors'].append('File must be compressed - please gzip file {}'.format(filename)) + else: + files['errors'].append('File extension on {} not supported - expecting one of: ' + '.fastq.gz, .fq.gz, .cram, .vcf.gz'.format(filename)) + continue + file_alias = '{}:{}'.format(proj_name, filename.lstrip(' ')) + fmt = valid_extensions[extension[0]][0] + file_info = { + 'aliases': [file_alias], + 'file_format': '/file-formats/{}/'.format(fmt), + 'file_type': valid_extensions[extension[0]][1], + 'filename': filename # causes problems without functional file upload + } + if fmt == 'fastq': + files['file_fastq'][file_alias] = file_info + else: + files['file_processed'][file_alias] = file_info + return files + + +def create_case_items(items, proj_name): + new_items = items.copy() + for k, v in items['sample_processing'].items(): + analysis_id = k[k.index('analysis-')+9:] + for sample in v['samples']: + case_id = '{}-{}'.format(analysis_id, items['sample'][sample]['specimen_accession']) + if len(v['samples']) == 1: + case_id += '-single' + elif len(v['samples']) > 1: + case_id += '-group' + case_alias = '{}:case-{}'.format(proj_name, case_id) + indiv = [ikey for ikey, ival in items['individual'].items() if sample in ival.get('samples', [])][0] + case_info = { + 'aliases': [case_alias], + # 'case_id': case_id, + 'sample_processing': k, + 'individual': indiv + } + if sample in items['reports']: + report_alias = case_alias.replace('case', 'report') + new_items['report'][report_alias] = { + 'aliases': [report_alias], + 'description': 'Analysis Report for Individual ID {}'.format(items['individual'][indiv]['individual_id']) + } + case_info['report'] = report_alias + new_items['case'][case_alias] = case_info + del new_items['reports'] + return new_items + + +def add_relations(items): + new_items = items.copy() + for alias, fam in items['family'].items(): + parents = False + for relation in ['mother', 'father']: + if fam.get(relation): + if fam.get('proband'): + new_items['individual'][fam['proband']][relation] = fam[relation] + parents = True + del new_items['family'][alias][relation] + for relation in ['brother', 'sister', 'sibling']: + if fam.get(relation): + if parents: + for parent in ['mother', 'father']: + if new_items['individual'][fam['proband']].get(parent): + new_items['individual'][fam[relation]][parent] = new_items['individual'][fam['proband']][parent] + del new_items['family'][alias][relation] + return new_items + + +def compare_with_db(virtualapp, alias): + try: # check if already in db + result = virtualapp.get('/' + alias + '/?frame=object') + if result.status_code == 301: + msg = json.loads(result.body).get('message', '') + result = virtualapp.get(msg[msg.index('/'):msg.index(';')]) + except Exception as e: # if not in db + if 'HTTPNotFound' in str(e): + return None + else: + return result.json + + +def validate_item(virtualapp, item, method, itemtype, aliases, atid=None): + if method == 'post': + try: + validation = virtualapp.post_json('/{}/?check_only=true'.format(itemtype), item) + except (AppError, VirtualAppError) as e: + return parse_exception(e, aliases) + else: + return + elif method == 'patch': + try: + validation = virtualapp.patch_json(atid + '?check_only=true', item, status=200) + except (AppError, VirtualAppError) as e: + return parse_exception(e, aliases) + else: + return + else: + raise ValueError("Unrecognized method -- must be 'post' or 'patch'") + + +def parse_exception(e, 
aliases): + """ff_utils functions raise an exception when the expected code is not returned. + This response is a pre-formatted text, and this function will get the resonse json + out of it. [Adapted from Submit4DN]""" + try: + # try parsing the exception + if isinstance(e, VirtualAppError): + text = e.raw_exception.args[0] + else: + text = e.args[0] + resp_text = text[text.index('{'):-1] + resp_dict = json.loads(resp_text.replace('\\', '')) + except Exception: # pragma: no cover + raise e + if resp_dict.get('description') == 'Failed validation': + keep = [] + resp_list = [error['description'] for error in resp_dict['errors']] + for error in resp_list: + # if error is caused by linkTo to item not submitted yet but in aliases list, + # remove that error + if 'not found' in error and error.split("'")[1] in aliases: + continue + else: + keep.append(error) + return keep + else: + raise e + + +def compare_fields(profile, aliases, json_item, db_item): + to_patch = {} + for field in json_item: + # if not an array, patch field gets overwritten (if different from db) + if profile['properties'][field]['type'] != 'array': + val = json_item[field] + if profile['properties'][field]['type'] == 'string' and val in aliases: + val = aliases[val] + if val != db_item.get(field): + to_patch[field] = val + else: + # if array, patch field vals get added to what's in db + if field != 'aliases' and profile['properties'][field].get('items', {}).get('linkTo'): + val = [aliases[v] if v in aliases else v for v in json_item[field]] + else: + val = [v for v in json_item[field]] + # if sorted(val) != sorted(db_item.get(field, [])): + # if len(val) == 1 and val not in db_item.get(field, []): + # continue + if all(v in db_item.get(field, []) for v in val): + continue + new_val = [item for item in db_item.get(field, [])] + new_val.extend(val) + try: + to_patch[field] = list(set(new_val)) + except TypeError: # above doesn't handle list of dictionaries + to_patch[field] = [dict(t) for t in {tuple(d.items()) for d in new_val}] + return to_patch + + +def validate_all_items(virtualapp, json_data): + ''' + Still in progress, not necessarily functional yet. NOT YET TESTED. + + Function that: + 1. looks up each item in json + 2. if item in db, will validate and patch any different metadata + 3. if item not in db, will post item + + Current status: + Still testing validation/data organization parts - patch/post part hasn't been fully + written or tested. + ''' + alias_dict = {} + errors = json_data['file_errors'] + all_aliases = [k for itype in json_data for k in json_data[itype]] + json_data_final = {'post': {}, 'patch': {}} + validation_results = {} + output = [] + for itemtype in POST_ORDER: # don't pre-validate case and report + if itemtype in json_data: + profile = virtualapp.get('/profiles/{}.json'.format(itemtype)).json + validation_results[itemtype] = {'validated': 0, 'errors': 0} + db_results = {} + # TODO: json_data[itemtype] but item_type might not be in json_data according to previous "if" statement. + # Maybe we want "for alias in json_data.get(item_type, {}):" here? + # Alternatively, maybe give "json_data.get(item_type, {})" a variable name so that it can be referred + # to more concisely in the several places below that it's needed. 
+ # -kmp 25-Jul-2020 + for alias in json_data[itemtype]: + # first collect all atids before comparing and validating items + db_result = compare_with_db(virtualapp, alias) + if db_result: + alias_dict[alias] = db_result['@id'] + # TODO: db_results is only conditionally assigned in the prevous "if". + # Perhaps the db_results = {} above should be moved up outside the "if"? + # Are we supposed to have a new dictionary on each iteration? -kmp 25-Jul-2020 + db_results[alias] = db_result + # TODO: Likewise this should probably loop over json_data.get(itemtype, {}). -kmp 25-Jul-2020 + for alias in json_data[itemtype]: + if 'filename' in json_data[itemtype][alias]: # until we have functional file upload + del json_data[itemtype][alias]['filename'] + if not db_results.get(alias): + error = validate_item(virtualapp, json_data[itemtype][alias], 'post', itemtype, all_aliases) + if error: # modify to check for presence of validation errors + # do something to report validation errors + if itemtype not in ['case', 'report']: + for e in error: + errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) + validation_results[itemtype]['errors'] += 1 + # TODO: If itemtype might not be in json_data (and conditionals above suggest that's so), + # then json_data[item_type][alias] seems suspect. It does work to do + # json_data.get(item_type, {}).get(alias, {}).get('filename') but I would put that + # quantity in a variable rather than compute it twice in a row. -kmp 25-Jul-2020 + elif json_data[itemtype][alias].get('filename') and \ + json_data[itemtype][alias]['filename'] in ''.join(json_data['file_errors']): + validation_results[itemtype]['errors'] += 1 + else: + json_data_final['post'].setdefault(itemtype, []) + json_data_final['post'][itemtype].append(json_data[itemtype][alias]) + validation_results[itemtype]['validated'] += 1 + else: + # patch if item exists in db + # alias_dict[alias] = results[alias]['@id'] + # TODO: profile is only conditionally assigned in an "if" above. -kmp 25-Jul-2020 + patch_data = compare_fields(profile, alias_dict, json_data[itemtype][alias], db_results[alias]) + error = validate_item(virtualapp, patch_data, 'patch', itemtype, + all_aliases, atid=db_results[alias]['@id']) + if error: # do something to report validation errors + if itemtype not in ['case', 'report']: + for e in error: + errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) + validation_results[itemtype]['errors'] += 1 + elif json_data[itemtype][alias].get('filename') and \ + json_data[itemtype][alias]['filename'] in ''.join(json_data['file_errors']): + validation_results[itemtype]['errors'] += 1 + else: # patch + json_data_final['patch'].setdefault(itemtype, {}) + if patch_data: + json_data_final['patch'][itemtype][db_results[alias]['@id']] = patch_data + elif itemtype not in ['case', 'report']: + output.append('{} {} - Item already in database, no changes needed'.format(itemtype, alias)) + # do something to record response + validation_results[itemtype]['validated'] += 1 + output.extend([error for error in errors]) + for itemtype in validation_results: + output.append('{} items: {} validated; {} errors'.format( + itemtype, validation_results[itemtype]['validated'], validation_results[itemtype]['errors'] + )) + if errors: + output.append('Validation errors found in items. 
Please fix spreadsheet before submitting.') + return ({}, output) + else: + json_data_final['aliases'] = alias_dict + output.append('All items validated.') + return (json_data_final, output) + + +def post_and_patch_all_items(virtualapp, json_data_final): + output = [] + if not json_data_final: + return output + item_names = {'individual': 'individual_id', 'family': 'family_id', 'sample': 'specimen_accession'} + final_status = {} + if json_data_final.get('post'): + for k, v in json_data_final['post'].items(): + final_status[k] = {'posted': 0, 'not posted': 0, 'patched': 0, 'not patched': 0} + for item in v: + patch_info = {} + # if 'filename' in item: # until we have functional file upload + # del item['filename'] + for field in LINKS: + if field in item: + patch_info[field] = item[field] + del item[field] + try: + response = virtualapp.post_json('/' + k, item, status=201) + if response.json['status'] == 'success': + final_status[k]['posted'] += 1 + atid = response.json['@graph'][0]['@id'] + json_data_final['aliases'][item['aliases'][0]] = atid + json_data_final['patch'].setdefault(k, {}) + json_data_final['patch'][k][atid] = patch_info + if k in item_names: + output.append('Success - {} {} posted'.format(k, item[item_names[k]])) + else: + final_status[k]['not posted'] += 1 + except Exception as e: + final_status[k]['not posted'] += 1 + output.append(str(e)) + for itype in final_status: + if final_status[itype]['posted'] > 0 or final_status[itype]['not posted'] > 0: + output.append('{}: {} items posted successfully; {} items not posted'.format( + itype, final_status[itype]['posted'], final_status[itype]['not posted'] + )) + for k, v in json_data_final['patch'].items(): + final_status.setdefault(k, {'patched': 0, 'not patched': 0}) + for item_id, patch_data in v.items(): + # if 'filename' in patch_data: # until we have functional file upload + # del patch_data['filename'] + try: + response = virtualapp.patch_json('/' + item_id, patch_data, status=200) + if response.json['status'] == 'success': + # if k in item_names: + # output.append('Success - {} {} patched'.format(k, patch_data[item_names[k]])) + final_status[k]['patched'] += 1 + else: + final_status[k]['not patched'] += 1 + except Exception as e: + final_status[k]['not patched'] += 1 + output.append(str(e)) + if final_status[k]['patched'] > 0 or final_status[k]['not patched'] > 0: + output.append('{}: {} items patched successfully; {} items not patched'.format( + k, final_status[k]['patched'], final_status[k]['not patched'] + )) + return output + + +def cell_value(cell, datemode): + """Get cell value from excel. 
[From Submit4DN]""" + # This should be always returning text format + ctype = cell.ctype + value = cell.value + if ctype == xlrd.XL_CELL_ERROR: # pragma: no cover + raise ValueError(repr(cell), 'cell error') + elif ctype == xlrd.XL_CELL_BOOLEAN: + return str(value).upper().strip() + elif ctype == xlrd.XL_CELL_NUMBER: + if value.is_integer(): + value = int(value) + return str(value).strip() + elif ctype == xlrd.XL_CELL_DATE: + value = xlrd.xldate_as_tuple(value, datemode) + if value[3:] == (0, 0, 0): + return datetime.date(*value[:3]).isoformat() + else: # pragma: no cover + return datetime.datetime(*value).isoformat() + elif ctype in (xlrd.XL_CELL_TEXT, xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK): + return value.strip() + raise ValueError(repr(cell), 'unknown cell type') # pragma: no cover + + +def row_generator(sheet): + '''Generator that gets rows from excel sheet [From Submit4DN]''' + datemode = sheet.book.datemode + for index in range(sheet.nrows): + yield [cell_value(cell, datemode) for cell in sheet.row(index)] diff --git a/src/encoded/types/ingestion.py b/src/encoded/types/ingestion.py new file mode 100644 index 0000000000..0dfbf71c9b --- /dev/null +++ b/src/encoded/types/ingestion.py @@ -0,0 +1,42 @@ +""" +Collection for objects related to ingestion submissions. +""" + +from snovault import collection, load_schema +from pyramid.security import Allow, Deny, Everyone +from .base import ( + Item, + # TODO: Maybe collect all these permission styles into a single file, give them symbolic names, + # and permit only the symbolic names to be used in each situation so we can curate a full inventory of modes. + # -kmp 26-Jul-2020 + ALLOW_SUBMITTER_ADD, +) +from .institution import ( + ONLY_ADMIN_VIEW, +) + + +ALLOW_SUBMITTER_VIEW = ( + # TODO: There is an issue here where we want a logged in user remotely only to view this + # but if we are proxying for them internall we want to be able to view OR edit. + # There is never reason for a user outside the system to update this status. -kmp 26-Jul-2020 + [] # Special additional permissions might go here. + + ALLOW_SUBMITTER_ADD # Is this right? See note above. + + ONLY_ADMIN_VIEW # Slightly misleading name. Allows admins to edit, too, actually. But only they can view. +) + + +@collection( + name='ingestion-submissions', + acl=ALLOW_SUBMITTER_VIEW, + unique_key='object_name', + properties={ + 'title': 'Ingestion Submissions', + 'description': 'List of Ingestion Submissions', + }) +class IngestionSubmission(Item): + """The IngestionSubmission class that holds info on requests to ingest data.""" + + item_type = 'ingestion_submission' + schema = load_schema('encoded:schemas/ingestion_submission.json') + # embedded_list = [...] + Item.embedded_list From e5a5b59a76bc2cdf1acba47565bba06765f304f9 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 30 Jul 2020 02:52:02 -0400 Subject: [PATCH 039/125] WIP: First pass at progress info working. 
--- src/encoded/ingestion/__init__.py | 0 src/encoded/{ => ingestion}/common.py | 15 +--- src/encoded/ingestion/exceptions.py | 21 +++++ .../processors.py} | 80 +++++++------------ src/encoded/ingestion_listener.py | 12 ++- src/encoded/renderers.py | 6 +- src/encoded/submit.py | 6 +- src/encoded/types/ingestion.py | 55 +++++++++++++ 8 files changed, 113 insertions(+), 82 deletions(-) create mode 100644 src/encoded/ingestion/__init__.py rename src/encoded/{ => ingestion}/common.py (90%) create mode 100644 src/encoded/ingestion/exceptions.py rename src/encoded/{ingestion_engines.py => ingestion/processors.py} (62%) diff --git a/src/encoded/ingestion/__init__.py b/src/encoded/ingestion/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/encoded/common.py b/src/encoded/ingestion/common.py similarity index 90% rename from src/encoded/common.py rename to src/encoded/ingestion/common.py index c5e06786f4..c1422fe998 100644 --- a/src/encoded/common.py +++ b/src/encoded/ingestion/common.py @@ -7,18 +7,12 @@ import io import os import tempfile - - - +from .exceptions import SubmissionFailure, MissingParameter DATA_BUNDLE_BUCKET = 'cgap-data-bundles' -class SubmissionFailure(Exception): - pass - - CONTENT_TYPE_SPECIAL_CASES = { 'application/x-www-form-urlencoded': [ # Special case to allow us to POST to metadata TSV requests via form submission @@ -56,13 +50,6 @@ def content_type_allowed(request): return False -class MissingParameter(Exception): - - def __init__(self, parameter_name): - self.parameter_name = parameter_name - super().__init__("Missing parameter: %s" % parameter_name) - - _NO_DEFAULT = object() diff --git a/src/encoded/ingestion/exceptions.py b/src/encoded/ingestion/exceptions.py new file mode 100644 index 0000000000..af653fe78a --- /dev/null +++ b/src/encoded/ingestion/exceptions.py @@ -0,0 +1,21 @@ +""" +Exception definitions for ingestion +""" + + +class SubmissionFailure(Exception): + pass + + +class UndefinedIngestionProcessorType(Exception): + + def __init__(self, processor_type): + self.ingestion_type_name = processor_type + super().__init__("No ingestion processor type %r is defined." % processor_type) + + +class MissingParameter(Exception): + + def __init__(self, parameter_name): + self.parameter_name = parameter_name + super().__init__("Missing parameter: %s" % parameter_name) diff --git a/src/encoded/ingestion_engines.py b/src/encoded/ingestion/processors.py similarity index 62% rename from src/encoded/ingestion_engines.py rename to src/encoded/ingestion/processors.py index 4a8abf6b07..4d3173da17 100644 --- a/src/encoded/ingestion_engines.py +++ b/src/encoded/ingestion/processors.py @@ -2,9 +2,10 @@ import json import traceback -from .common import DATA_BUNDLE_BUCKET, get_parameter -from .util import debuglog, s3_output_stream, create_empty_s3_file -from .submit import submit_data_bundle +from encoded.ingestion.common import DATA_BUNDLE_BUCKET, get_parameter +from encoded.util import debuglog, s3_output_stream, create_empty_s3_file +from encoded.submit import submit_data_bundle +from .exceptions import UndefinedIngestionProcessorType INGESTION_UPLOADERS = {} @@ -22,13 +23,6 @@ def ingestion_type_decorator(fn): return ingestion_type_decorator -class UndefinedIngestionProcessorType(Exception): - - def __init__(self, processor_type): - self.ingestion_type_name = processor_type - super().__init__("No ingestion processor type %r is defined." 
% processor_type) - - def get_ingestion_processor(processor_type): handler = INGESTION_UPLOADERS.get(processor_type, None) if not handler: @@ -42,15 +36,17 @@ def _show_report_lines(lines, fp, default="Nothing to report."): @ingestion_processor('data_bundle') -def handle_data_bundle(*, uuid, ingestion_type, vapp, log): +def handle_data_bundle(submission): - log.info("Processing {uuid} as {ingestion_type}.".format(uuid=uuid, ingestion_type=ingestion_type)) + submission.log.info("Processing {submission_id} as {ingestion_type}." + .format(submission_id=submission.submission_id, ingestion_type=submission.ingestion_type)) - if ingestion_type != 'data_bundle': + if submission.ingestion_type != 'data_bundle': raise RuntimeError("handle_data_bundle only works for ingestion_type data_bundle.") + submission_id = submission.submission_id s3_client = boto3.client('s3') - manifest_key = "%s/manifest.json" % uuid + manifest_key = "%s/manifest.json" % submission_id response = s3_client.get_object(Bucket=DATA_BUNDLE_BUCKET, Key=manifest_key) manifest = json.load(response['Body']) @@ -59,10 +55,10 @@ def handle_data_bundle(*, uuid, ingestion_type, vapp, log): institution = get_parameter(parameters, 'institution') project = get_parameter(parameters, 'project') - debuglog(uuid, "data_key:", data_key) - debuglog(uuid, "parameters:", parameters) + debuglog(submission_id, "data_key:", data_key) + debuglog(submission_id, "parameters:", parameters) - started_key = "%s/started.txt" % uuid + started_key = "%s/started.txt" % submission_id create_empty_s3_file(s3_client, bucket=DATA_BUNDLE_BUCKET, key=started_key) # PyCharm thinks this is unused. -kmp 26-Jul-2020 @@ -76,33 +72,24 @@ def handle_data_bundle(*, uuid, ingestion_type, vapp, log): try: + submission.set_item_detail(object_name=manifest['object_name'], parameters=manifest['parameters'], + institution=institution, project=project) + if isinstance(institution, str): - institution = vapp.get(institution).json + institution = submission.vapp.get(institution).json if isinstance(project, str): - project = vapp.get(project).json - - vapp.patch_json("/ingestion-submission", { - "object_name": manifest['object_name'], - "ingestion_type": ingestion_type, - "submission_id": uuid, - "parameters": manifest['parameters'], - "institution": institution, - "project": project, - "processing_status": { - "state": "processing", - } - }) + project = submission.vapp.get(project).json validation_log_lines, final_json, result_lines = submit_data_bundle(s3_client=s3_client, bucket=DATA_BUNDLE_BUCKET, key=data_key, project=project, institution=institution, - vapp=vapp) + vapp=submission.vapp) - resolution["validation_report_key"] = validation_report_key = "%s/validation-report.txt" % uuid - resolution["submission_key"] = submission_key = "%s/submission.json" % uuid - resolution["submission_response_key"] = submission_response_key = "%s/submission-response.txt" % uuid + resolution["validation_report_key"] = validation_report_key = "%s/validation-report.txt" % submission_id + resolution["submission_key"] = submission_key = "%s/submission.json" % submission_id + resolution["submission_response_key"] = submission_response_key = "%s/submission-response.txt" % submission_id with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key=validation_report_key) as fp: _show_report_lines(validation_log_lines, fp) @@ -113,32 +100,19 @@ def handle_data_bundle(*, uuid, ingestion_type, vapp, log): with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key=submission_response_key) as fp: 
_show_report_lines(result_lines, fp) - vapp.patch_json("ingestion-submission", { - "submission_id": uuid, - "progress": { - "state": "done", - "outcome": "failure" if validation_log_lines else "success", - "progress": "complete", - }, - }) + # TODO: Sarah will provide a way to tell success from failure. -kmp 28-Jul-2020 + submission.patch_item(processing_status={"state": "done", "outcome": "success", "progress": "complete"}) except Exception as e: - resolution["traceback_key"] = traceback_key = "%s/traceback.json" % uuid + resolution["traceback_key"] = traceback_key = "%s/traceback.txt" % submission_id with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key=traceback_key) as fp: traceback.print_exc(file=fp) resolution["error_type"] = e.__class__.__name__ resolution["error_message"] = str(e) - vapp.patch_json("ingestion-submission", { - "submission_id": uuid, - "progress": { - "state": "done", - "outcome": "error", - "progress": "incomplete", - }, - }) + submission.patch_item(processing_status={"state": "done", "outcome": "error", "progress": "incomplete"}) - with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key="%s/resolution.json" % uuid) as fp: + with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key="%s/resolution.json" % submission_id) as fp: print(json.dumps(resolution, indent=2), file=fp) diff --git a/src/encoded/ingestion_listener.py b/src/encoded/ingestion_listener.py index f69766c41f..ef652844bd 100644 --- a/src/encoded/ingestion_listener.py +++ b/src/encoded/ingestion_listener.py @@ -24,8 +24,9 @@ from snovault.util import debug_log from vcf import Reader from .commands.ingest_vcf import VCFParser -from .common import register_path_content_type, DATA_BUNDLE_BUCKET, SubmissionFailure -from .ingestion_engines import get_ingestion_processor +from .ingestion.common import register_path_content_type, DATA_BUNDLE_BUCKET, SubmissionFailure +from .ingestion.processors import get_ingestion_processor +from .types.ingestion import SubmissionFolio from .util import resolve_file_path, gunzip_content, debuglog @@ -550,12 +551,9 @@ def run(self): if ingestion_type != 'vcf': # Let's minimally disrupt things for now. 
We can refactor this later # to make all the parts work the same -kmp - self.vapp.post_json("/ingestion-submission", { - "ingestion_type": ingestion_type, - "submission_id": uuid, - }) + submission = SubmissionFolio(vapp=self.vapp, ingestion_type=ingestion_type, submission_id=uuid) handler = get_ingestion_processor(ingestion_type) - handler(uuid=uuid, ingestion_type=ingestion_type, vapp=self.vapp, log=log) + handler(submission) # TODO: If we delete messages at the end of each loop, I think we'll here need to do this, # since we're bypassing bottom of lop with the 'continue': # self.delete_messages([message]) diff --git a/src/encoded/renderers.py b/src/encoded/renderers.py index 5cb832faf0..ae16269458 100644 --- a/src/encoded/renderers.py +++ b/src/encoded/renderers.py @@ -11,22 +11,18 @@ HTTPMovedPermanently, HTTPPreconditionFailed, HTTPUnauthorized, - HTTPForbidden, HTTPUnsupportedMediaType, HTTPNotAcceptable, HTTPServerError ) from pyramid.response import Response -from pyramid.security import forget from pyramid.settings import asbool from pyramid.threadlocal import manager from pyramid.traversal import split_path_info, _join_path_tuple -from snovault.validation import CSRFTokenError -from subprocess_middleware.tween import SubprocessTween from subprocess_middleware.worker import TransformWorker from urllib.parse import urlencode from webob.cookies import Cookie -from .common import content_type_allowed +from encoded.ingestion.common import content_type_allowed log = logging.getLogger(__name__) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index 76d850e3a3..4ecd2a4487 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -64,16 +64,16 @@ # # This will be done differently soon as part of the "/submit_for_ingestion" endpoint that # will be in ingestion_listener.py. That endpoint will need an "?ingestion type=data_bundle" -# as query parameter. That "data_bundle" ingestion type will defined in ingestion_engines.py. +# as query parameter. That "data_bundle" ingestion type will defined in ingestion/processors.py. # The new entry point here that will be needed is submit_data_bundle, and then this temporary # "/submit_data" endpoint can presumably go away.. -kmp 25-Jul-2020 @view_config(route_name='submit_data', request_method='POST', permission='add') @debug_log def submit_data(context, request): - ''' + """ usage notes here later - ''' + """ config_uri = request.json.get('config_uri', 'production.ini') patch_only = request.json.get('patch_only', False) post_only = request.json.get('post_only', False) diff --git a/src/encoded/types/ingestion.py b/src/encoded/types/ingestion.py index 0dfbf71c9b..f4dc75b119 100644 --- a/src/encoded/types/ingestion.py +++ b/src/encoded/types/ingestion.py @@ -2,6 +2,9 @@ Collection for objects related to ingestion submissions. """ +import json +import logging + from snovault import collection, load_schema from pyramid.security import Allow, Deny, Everyone from .base import ( @@ -26,6 +29,58 @@ ) +class SubmissionFolio: + + INGESTION_SUBMISSION_URI = '/IngestionSubmission' + + def __init__(self, *, vapp, ingestion_type, submission_id, log=None): + self.vapp = vapp + self.ingestion_type = ingestion_type + self.log = log or logging + self.folio_id = None # This will be more properly initialized in _create_item() + self.submission_id = submission_id + self._create_item() + + @property + def folio_uri(self): + if not self.folio_id: + raise RuntimeError("%s.folio_id has not been set." 
% self) + return "/" + self.folio_id + + def _create_item(self): + res = self.vapp.post_json(self.INGESTION_SUBMISSION_URI, { + "ingestion_type": self.ingestion_type, + "submission_id": self.submission_id, + "processing_status": { + "state": "submitted" + } + }) + [item] = res.json['@graph'] + print(json.dumps(item, indent=2)) + self.folio_id = item['uuid'] + + def set_item_detail(self, object_name, parameters, institution, project): + res = self.vapp.patch_json(self.folio_uri, { + "object_name": object_name, + "ingestion_type": self.ingestion_type, + "submission_id": self.submission_id, + "parameters": parameters, + "institution": institution, + "project": project, + "processing_status": { + "state": "processing", + } + }) + [item] = res.json['@graph'] + print(json.dumps(item, indent=2)) + + + def patch_item(self, **kwargs): + res = self.vapp.patch_json(self.folio_uri, kwargs) + [item] = res.json['@graph'] + print(json.dumps(item, indent=2)) + + @collection( name='ingestion-submissions', acl=ALLOW_SUBMITTER_VIEW, From 279bb15ea5ab2faeb8b0517b4e51dfd685a070e8 Mon Sep 17 00:00:00 2001 From: Sarah Date: Fri, 31 Jul 2020 14:12:31 -0400 Subject: [PATCH 040/125] more extensive error handling in submit.py --- src/encoded/submit.py | 211 +++++++++++++++++++++++++++++++----------- 1 file changed, 155 insertions(+), 56 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index 17d2104450..24c22e3a7d 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -105,10 +105,26 @@ def submit_data_bundle(*, s3_client, bucket, key, project, institution, vapp): with s3_local_file(s3_client, bucket=bucket, key=key) as file: project_json = vapp.get(project).json institution_json = vapp.get(institution).json - json_data = xls_to_json(file, project=project_json, institution=institution_json) - final_json, validation_log_lines = validate_all_items(vapp, json_data) - result_lines = post_and_patch_all_items(vapp, final_json) - return validation_log_lines, final_json, result_lines + results = { + 'success': False, + 'validation_output': [], + 'final_json': {}, + 'post_output': [] + } + json_data, json_success = xls_to_json(file, project=project_json, institution=institution_json) + if not json_success: + results['validation_output'] = json_data['errors'] + return results + final_json, validation_log_lines, validate_success = validate_all_items(vapp, json_data) + results['final_json'] = final_json + results['validation_output'] = validation_log_lines + if not validate_success: + return results + results['success'] = validate_success + result_lines, post_success = post_and_patch_all_items(vapp, final_json) + results['post_output'] = result_lines + results['success'] = post_success + return results def map_fields(row, metadata_dict, addl_fields, item_type): @@ -128,37 +144,66 @@ def xls_to_json(xls_data, project, institution): book = xlrd.open_workbook(xls_data) sheet, = book.sheets() row = row_generator(sheet) - top_header = next(row) - debuglog("top_header:", top_header) # Temporary instrumentation for debugging to go away soon. -kmp 25-Jul-2020 - keys = next(row) - debuglog("keys:", keys) # Temporary instrumentation for debugging to go away soon. -kmp 25-Jul-2020 - descriptions = next(row) - debuglog("descriptions:", descriptions) # Temporary instrumentation for debugging to go away soon. -kmp 25-Jul-2020 - rows = [] + header = False counter = 0 + # debuglog("top_header:", top_header) # Temporary instrumentation for debugging to go away soon. 
-kmp 25-Jul-2020 + while True: + try: + keys = next(row) + keys = [key.lower().strip().rstrip('*').rstrip() for key in keys] + counter += 1 + if 'individual id' in keys: + header = True + break + except StopIteration: + break + if not header: + msg = 'Column headers not detected in spreadsheet! "Individual ID*" column must be present in header.' + return {'errors': [msg]}, False + # debuglog("keys:", keys) # Temporary instrumentation for debugging to go away soon. -kmp 25-Jul-2020 + # descriptions = next(row) + # debuglog("descriptions:", descriptions) # Temporary instrumentation for debugging to go away soon. -kmp 25-Jul-2020 + rows = [] + # keys = [key.lower().strip().rstrip('*').rstrip() for key in keys] + required = ['individual id', 'relation to proband', 'report required', 'analysis id'] + missing = [col for col in required if col not in keys] + if missing: + msg = 'Column(s) "{}" not found in spreadsheet! Spreadsheet cannot be processed.'.format('", "'.join(missing)) + return {'errors': [msg]}, False + for values in row: r = [val for val in values] - row_dict = {keys[i].lower().rstrip('*'): item for i, item in enumerate(r)} + if 'y/n' in ''.join(r).lower() or ''.join(r) == '': # skip comments/description/blank row if present + counter += 1 + continue + row_dict = {keys[i]: item for i, item in enumerate(r)} rows.append(row_dict) items = { 'individual': {}, 'family': {}, 'sample': {}, 'sample_processing': {}, 'file_fastq': {}, 'file_processed': {}, 'case': {}, 'report': {}, - 'reports': [] + 'reports': [], 'errors': [] } file_errors = [] specimen_ids = {} family_dict = create_families(rows) a_types = get_analysis_types(rows) - for row in rows: + for i, row in enumerate(rows): debuglog("row:", repr(row)) # Temporary instrumentation for debugging to go away soon. 
-kmp 25-Jul-2020 + row_num = i + counter + 1 + missing_required = [col for col in required if col not in row] + if missing_required: + items['errors'].append( + 'Spreadsheet row {} cannot be processed - missing required field(s) {}' + ''.format(row_num, ', '.join(missing_required)) + ) indiv_alias = '{}:individual-{}'.format(project['name'], row['individual id']) fam_alias = '{}:{}'.format(project['name'], family_dict[row['analysis id']]) # sp_alias = '{}:sampleproc-{}'.format(project['name'], row['specimen id']) # create items for Individual - items = fetch_individual_metadata(row, items, indiv_alias, institution['name']) + items = fetch_individual_metadata(row_num, row, items, indiv_alias, institution['name']) # create/edit items for Family - items = fetch_family_metadata(row, items, indiv_alias, fam_alias) + items = fetch_family_metadata(row_num, row, items, indiv_alias, fam_alias) # create item for Sample if there is a specimen if row.get('specimen id'): samp_alias = '{}:sample-{}'.format(project['name'], row['specimen id']) @@ -168,30 +213,31 @@ def xls_to_json(xls_data, project, institution): else: specimen_ids[row['specimen id']] = 1 analysis_alias = '{}:analysis-{}'.format(project['name'], row['analysis id']) - items = fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, + items = fetch_sample_metadata(row_num, row, items, indiv_alias, samp_alias, analysis_alias, fam_alias, project['name'], a_types) if row.get('files'): - file_items = fetch_file_metadata(row['files'].split(','), project['name']) + file_items = fetch_file_metadata(row_num, row['files'].split(','), project['name']) file_errors.extend(file_items['errors']) items['file_fastq'].update(file_items['file_fastq']) items['file_processed'].update(file_items['file_processed']) else: - print('WARNING: No specimen id present for patient {},' - ' sample will not be created.'.format(row['individual id'])) + items['errors'].append('WARNING: No specimen id present for patient {},' + ' sample will not be created.'.format(row['individual id'])) # create SampleProcessing item for trio/group if needed # items = create_sample_processing_groups(items, sp_alias) items = add_relations(items) items = create_case_items(items, project['name']) # removed unused fields, add project and institution for val1 in items.values(): - for val2 in val1.values(): - remove_keys = [k for k, v in val2.items() if not v] - for key in remove_keys: - del val2[key] - val2['project'] = project['@id'] - val2['institution'] = institution['@id'] - items['file_errors'] = file_errors - return items + if isinstance(val1, dict): + for val2 in val1.values(): + remove_keys = [k for k, v in val2.items() if not v] + for key in remove_keys: + del val2[key] + val2['project'] = project['@id'] + val2['institution'] = institution['@id'] + items['errors'].extend(file_errors) + return items, True # most errors passed to next step in order to combine with validation errors def create_families(rows): @@ -208,7 +254,8 @@ def get_analysis_types(rows): analysis_relations[row.get('analysis id')][0].append(row.get('relation to proband', '').lower()) analysis_relations[row.get('analysis id')][1].append(row.get('workup type', '').upper()) for k, v in analysis_relations.items(): - if len(list(set(v[1]))) == 1: + workup = list(set(v[1])) + if len(workup) == 1 and '' not in workup: if len(v[0]) == 1: analysis_types[k] = v[1][0] elif sorted(v[0]) == ['father', 'mother', 'proband']: @@ -220,7 +267,7 @@ def get_analysis_types(rows): return analysis_types -def 
fetch_individual_metadata(row, items, indiv_alias, inst_name): +def fetch_individual_metadata(idx, row, items, indiv_alias, inst_name): new_items = items.copy() info = {'aliases': [indiv_alias]} info = map_fields(row, info, ['individual_id', 'sex', 'age', 'birth_year'], 'individual') @@ -229,10 +276,12 @@ def fetch_individual_metadata(row, items, indiv_alias, inst_name): if row.get('other individual id type'): other_id['id_source'] = row['other individual id source'] info['institutional_id'] = other_id - info['age'] = int(info['age']) if info.get('age') else None - info['birth_year'] = int(info['birth year']) if info.get('birth year') else None + for col in ['age', 'birth_year']: + if info.get(col) and isinstance(info[col], str) and info[col].isnumeric(): + info[col] = int(info[col]) if indiv_alias not in new_items['individual']: new_items['individual'][indiv_alias] = {k: v for k, v in info.items() if v} + new_items['individual'][indiv_alias]['row'] = idx else: for key in info: if key not in new_items['individual'][indiv_alias]: @@ -240,24 +289,34 @@ def fetch_individual_metadata(row, items, indiv_alias, inst_name): return new_items -def fetch_family_metadata(row, items, indiv_alias, fam_alias): +def fetch_family_metadata(idx, row, items, indiv_alias, fam_alias): new_items = items.copy() info = { 'aliases': [fam_alias], 'family_id': row['family id'], - 'members': [indiv_alias] + 'members': [indiv_alias], + 'row': idx } if fam_alias not in new_items['family']: new_items['family'][fam_alias] = info if indiv_alias not in new_items['family'][fam_alias]['members']: new_items['family'][fam_alias]['members'].append(indiv_alias) - for relation in ['proband', 'mother', 'father', 'brother', 'sister', 'sibling']: - if row.get('relation to proband', '').lower() == relation and relation not in new_items['family'][fam_alias]: + valid_relations = ['proband', 'mother', 'father', 'brother', 'sister', 'sibling'] + relation_found = False + for relation in valid_relations: + if row.get('relation to proband', '').lower().startswith(relation) and relation not in new_items['family'][fam_alias]: new_items['family'][fam_alias][relation] = indiv_alias + relation_found = True + break + if not relation_found: + msg = 'Row {}: Invalid relation "{}" for individual {} - Relation should be one of: {}'.format( + idx, row.get('relation to proband'), row.get('individual id'), ', '.join(valid_relations) + ) + items['errors'].append(msg) return new_items -def fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, fam_alias, proj_name, analysis_type_dict): +def fetch_sample_metadata(idx, row, items, indiv_alias, samp_alias, analysis_alias, fam_alias, proj_name, analysis_type_dict): new_items = items.copy() info = {'aliases': [samp_alias], 'files': []} # TODO: implement creation of file db items fields = [ @@ -265,9 +324,10 @@ def fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, f 'specimen_notes', 'research_protocol_name', 'sent_by', 'physician_id', 'indication' ] info = map_fields(row, info, fields, 'sample') - if info.get('specimen_accepted', '').lower() == 'y': + info['row'] = idx + if info.get('specimen_accepted', '').lower() in ['y', 'yes']: info['specimen_accepted'] = 'Yes' - elif info.get('specimen_accepted', '').lower() == 'n': + elif info.get('specimen_accepted', '').lower() in ['n', 'no']: info['specimen_accepted'] = 'No' if row.get('second specimen id'): other_id = {'id': row['second specimen id'], 'id_type': proj_name} # add proj info? 
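For readers following the diff, the relation handling added to fetch_family_metadata above boils down to the short sketch below. The valid_relations list and the error wording come from the patch itself; the standalone helper and its name are illustrative only, and it omits the bookkeeping that records the matched individual's alias on the family item.

# Minimal sketch (assumed names) of the relation check introduced in fetch_family_metadata.
# `row` is one spreadsheet row as a dict, `idx` its spreadsheet row number,
# and `errors` plays the role of items['errors'] in xls_to_json.
VALID_RELATIONS = ['proband', 'mother', 'father', 'brother', 'sister', 'sibling']

def check_relation(row, idx, errors):
    relation = row.get('relation to proband', '').lower()
    for valid in VALID_RELATIONS:
        if relation.startswith(valid):  # startswith() tolerates values such as "sibling 2"
            return valid
    errors.append(
        'Row {} - Invalid relation "{}" for individual {} - Relation should be one of: {}'.format(
            idx, row.get('relation to proband'), row.get('individual id'), ', '.join(VALID_RELATIONS)
        )
    )
    return None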
@@ -302,7 +362,7 @@ def fetch_sample_metadata(row, items, indiv_alias, samp_alias, analysis_alias, f # TODO: finish implementing this function -def fetch_file_metadata(filenames, proj_name): +def fetch_file_metadata(idx, filenames, proj_name): valid_extensions = { '.fastq.gz': ('fastq', 'reads'), '.fq.gz': ('fastq', 'reads'), @@ -323,6 +383,7 @@ def fetch_file_metadata(filenames, proj_name): fmt = valid_extensions[extension[0]][0] file_info = { 'aliases': [file_alias], + 'row': idx, 'file_format': '/file-formats/{}/'.format(fmt), 'file_type': valid_extensions[extension[0]][1], 'filename': filename # causes problems without functional file upload @@ -427,24 +488,44 @@ def parse_exception(e, aliases): else: text = e.args[0] resp_text = text[text.index('{'):-1] - resp_dict = json.loads(resp_text.replace('\\', '')) + resp_dict = json.loads(resp_text.replace('\\"', "\'").replace('\\', '')) except Exception: # pragma: no cover raise e if resp_dict.get('description') == 'Failed validation': keep = [] - resp_list = [error['description'] for error in resp_dict['errors']] + resp_list = [error['name'] + ' - ' + error['description'] for error in resp_dict['errors']] for error in resp_list: # if error is caused by linkTo to item not submitted yet but in aliases list, # remove that error if 'not found' in error and error.split("'")[1] in aliases: continue else: + error = error.lstrip('Schema: ') + field_name = error[:error.index(' - ')] + field = None + if field_name in GENERIC_FIELD_MAPPING['sample'].values(): + field = [key for key, val in GENERIC_FIELD_MAPPING['sample'].items() if val == field_name][0] + elif field_name == 'requisition_acceptance.accepted_rejected': + field = 'Req Accepted Y\\N' + error = map_enum_options(field_name, error) + if not field: + field = field_name.replace('_', ' ') + + error = 'field: ' + error.replace(field_name, field) keep.append(error) return keep else: raise e +def map_enum_options(fieldname, error_message): + if fieldname == 'requisition_acceptance.accepted_rejected': + error_message = error_message.replace("['Accepted', 'Rejected']", "['Y', 'N']") + elif fieldname == 'specimen_accepted': + error_message = error_message.replace("['Yes', 'No']", "['Y', 'N']") + return error_message + + def compare_fields(profile, aliases, json_item, db_item): to_patch = {} for field in json_item: @@ -489,7 +570,7 @@ def validate_all_items(virtualapp, json_data): written or tested. ''' alias_dict = {} - errors = json_data['file_errors'] + errors = json_data['errors'] all_aliases = [k for itype in json_data for k in json_data[itype]] json_data_final = {'post': {}, 'patch': {}} validation_results = {} @@ -515,22 +596,29 @@ def validate_all_items(virtualapp, json_data): db_results[alias] = db_result # TODO: Likewise this should probably loop over json_data.get(itemtype, {}). 
-kmp 25-Jul-2020 for alias in json_data[itemtype]: - if 'filename' in json_data[itemtype][alias]: # until we have functional file upload - del json_data[itemtype][alias]['filename'] + data = json_data[itemtype][alias].copy() + row = data.get('row') + if row: + del data['row'] + if 'filename' in data: # until we have functional file upload + del data['filename'] if not db_results.get(alias): - error = validate_item(virtualapp, json_data[itemtype][alias], 'post', itemtype, all_aliases) + error = validate_item(virtualapp, data, 'post', itemtype, all_aliases) if error: # modify to check for presence of validation errors # do something to report validation errors if itemtype not in ['case', 'report']: for e in error: - errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) + if row: + errors.append('Row {} {} - Error found: {}'.format(row, itemtype, e)) + else: + errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) validation_results[itemtype]['errors'] += 1 # TODO: If itemtype might not be in json_data (and conditionals above suggest that's so), # then json_data[item_type][alias] seems suspect. It does work to do # json_data.get(item_type, {}).get(alias, {}).get('filename') but I would put that # quantity in a variable rather than compute it twice in a row. -kmp 25-Jul-2020 elif json_data[itemtype][alias].get('filename') and \ - json_data[itemtype][alias]['filename'] in ''.join(json_data['file_errors']): + json_data[itemtype][alias]['filename'] in ''.join(json_data['errors']): validation_results[itemtype]['errors'] += 1 else: json_data_final['post'].setdefault(itemtype, []) @@ -540,22 +628,25 @@ def validate_all_items(virtualapp, json_data): # patch if item exists in db # alias_dict[alias] = results[alias]['@id'] # TODO: profile is only conditionally assigned in an "if" above. -kmp 25-Jul-2020 - patch_data = compare_fields(profile, alias_dict, json_data[itemtype][alias], db_results[alias]) + patch_data = compare_fields(profile, alias_dict, data, db_results[alias]) error = validate_item(virtualapp, patch_data, 'patch', itemtype, all_aliases, atid=db_results[alias]['@id']) if error: # do something to report validation errors if itemtype not in ['case', 'report']: for e in error: - errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) + if row: + errors.append('Row {} {} - Error found: {}'.format(row, itemtype, e)) + else: + errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) validation_results[itemtype]['errors'] += 1 elif json_data[itemtype][alias].get('filename') and \ - json_data[itemtype][alias]['filename'] in ''.join(json_data['file_errors']): + json_data[itemtype][alias]['filename'] in ''.join(json_data['errors']): validation_results[itemtype]['errors'] += 1 else: # patch json_data_final['patch'].setdefault(itemtype, {}) if patch_data: json_data_final['patch'][itemtype][db_results[alias]['@id']] = patch_data - elif itemtype not in ['case', 'report']: + elif itemtype not in ['case', 'report', 'sample_processing']: output.append('{} {} - Item already in database, no changes needed'.format(itemtype, alias)) # do something to record response validation_results[itemtype]['validated'] += 1 @@ -566,24 +657,28 @@ def validate_all_items(virtualapp, json_data): )) if errors: output.append('Validation errors found in items. 
Please fix spreadsheet before submitting.') - return ({}, output) + return {}, output, False else: json_data_final['aliases'] = alias_dict output.append('All items validated.') - return (json_data_final, output) + return json_data_final, output, True def post_and_patch_all_items(virtualapp, json_data_final): output = [] if not json_data_final: - return output + return output, 'not run' item_names = {'individual': 'individual_id', 'family': 'family_id', 'sample': 'specimen_accession'} final_status = {} + no_errors = True if json_data_final.get('post'): for k, v in json_data_final['post'].items(): final_status[k] = {'posted': 0, 'not posted': 0, 'patched': 0, 'not patched': 0} for item in v: patch_info = {} + row = item['row'] + if row: + del item['row'] # if 'filename' in item: # until we have functional file upload # del item['filename'] for field in LINKS: @@ -602,9 +697,11 @@ def post_and_patch_all_items(virtualapp, json_data_final): output.append('Success - {} {} posted'.format(k, item[item_names[k]])) else: final_status[k]['not posted'] += 1 + no_errors = False except Exception as e: final_status[k]['not posted'] += 1 output.append(str(e)) + no_errors = False for itype in final_status: if final_status[itype]['posted'] > 0 or final_status[itype]['not posted'] > 0: output.append('{}: {} items posted successfully; {} items not posted'.format( @@ -623,14 +720,16 @@ def post_and_patch_all_items(virtualapp, json_data_final): final_status[k]['patched'] += 1 else: final_status[k]['not patched'] += 1 + no_errors = False except Exception as e: final_status[k]['not patched'] += 1 output.append(str(e)) + no_errors = False if final_status[k]['patched'] > 0 or final_status[k]['not patched'] > 0: output.append('{}: {} items patched successfully; {} items not patched'.format( k, final_status[k]['patched'], final_status[k]['not patched'] )) - return output + return output, no_errors def cell_value(cell, datemode): From b5fb8b20c0a652371717e4c521ed4bab36de13c0 Mon Sep 17 00:00:00 2001 From: Sarah Date: Fri, 31 Jul 2020 14:16:46 -0400 Subject: [PATCH 041/125] edits to ingestion code for data bundles --- src/encoded/ingestion_engines.py | 75 +++++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 2 deletions(-) diff --git a/src/encoded/ingestion_engines.py b/src/encoded/ingestion_engines.py index 05a5f0f82e..9f6c42d778 100644 --- a/src/encoded/ingestion_engines.py +++ b/src/encoded/ingestion_engines.py @@ -41,8 +41,8 @@ def _show_report_lines(lines, fp, default="Nothing to report."): print(line, file=fp) -@ingestion_processor('data_bundle') -def handle_data_bundle(*, uuid, ingestion_type, vapp, log): +#@ingestion_processor('data_bundle') +def handle_data_bundle_old(*, uuid, ingestion_type, vapp, log): log.info("Processing {uuid} as {ingestion_type}.".format(uuid=uuid, ingestion_type=ingestion_type)) @@ -107,3 +107,74 @@ def handle_data_bundle(*, uuid, ingestion_type, vapp, log): with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key="%s/resolution.json" % uuid) as fp: print(json.dumps(resolution, indent=2), file=fp) + + +@ingestion_processor('data_bundle') +def handle_data_bundle(*, uuid, ingestion_type, vapp, log): + + log.info("Processing {uuid} as {ingestion_type}.".format(uuid=uuid, ingestion_type=ingestion_type)) + + if ingestion_type != 'data_bundle': + raise RuntimeError("handle_data_bundle only works for ingestion_type data_bundle.") + + s3_client = boto3.client('s3') + manifest_key = "%s/manifest.json" % uuid + response = s3_client.get_object(Bucket=DATA_BUNDLE_BUCKET, 
Key=manifest_key) + manifest = json.load(response['Body']) + + data_key = manifest['object_name'] + parameters = manifest['parameters'] + institution = get_parameter(parameters, 'institution') + project = get_parameter(parameters, 'project') + + debuglog(uuid, "data_key:", data_key) + debuglog(uuid, "parameters:", parameters) + + started_key = "%s/started.txt" % uuid + create_empty_s3_file(s3_client, bucket=DATA_BUNDLE_BUCKET, key=started_key) + + # PyCharm thinks this is unused. -kmp 26-Jul-2020 + # data_stream = s3_client.get_object(Bucket=DATA_BUNDLE_BUCKET, Key="%s/manifest.json" % uuid)['Body'] + + resolution = { + "data_key": data_key, + "manifest_key": manifest_key, + "started_key": started_key, + } + + try: + + data_bundle_result = submit_data_bundle(s3_client=s3_client, + bucket=DATA_BUNDLE_BUCKET, + key=data_key, + project=project, + institution=institution, + vapp=vapp) + + resolution["validation_report_key"] = validation_report_key = "%s/validation-report.txt" % uuid + resolution["submission_key"] = submission_key = "%s/submission.json" % uuid + resolution["submission_response_key"] = submission_response_key = "%s/submission-response.txt" % uuid + + with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key=validation_report_key) as fp: + _show_report_lines(data_bundle_result['validation_output'], fp) + + # here I am only creating submission.json and submission-response.txt if there is something to write to file + if data_bundle_result['final_json']: + with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key=submission_key) as fp: + print(json.dumps(data_bundle_result['final_json'], indent=2), file=fp) + + if data_bundle_result['post_output']: + with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key=submission_response_key) as fp: + _show_report_lines(data_bundle_result['post_output'], fp) + + except Exception as e: + + resolution["traceback_key"] = traceback_key = "%s/traceback.json" % uuid + with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key=traceback_key) as fp: + traceback.print_exc(file=fp) + + resolution["error_type"] = e.__class__.__name__ + resolution["error_message"] = str(e) + + with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key="%s/resolution.json" % uuid) as fp: + print(json.dumps(resolution, indent=2), file=fp) From bfb3162868b8dfac8dfe552a7eb5b9f022aa940e Mon Sep 17 00:00:00 2001 From: Sarah Date: Fri, 31 Jul 2020 15:49:34 -0400 Subject: [PATCH 042/125] submission-test script edited for submit.py changes --- src/encoded/commands/submission_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/encoded/commands/submission_test.py b/src/encoded/commands/submission_test.py index 07eb772985..9c02ad7c77 100644 --- a/src/encoded/commands/submission_test.py +++ b/src/encoded/commands/submission_test.py @@ -10,11 +10,11 @@ def main(): virtualapp = VirtualApp(app, environ) proj = virtualapp.get('/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/').json inst = virtualapp.get('/institutions/hms-dbmi/').json - json_data = xls_to_json('src/encoded/tests/data/documents/cgap_submit_test.xlsx', proj, inst) - final_json, validation_log = validate_all_items(virtualapp, json_data) + json_data, passing = xls_to_json('/src/encoded/tests/data/documents/cgap_submit_test.xlsx', proj, inst) + final_json, validation_log, passing = validate_all_items(virtualapp, json_data) print('\n'.join(validation_log)) print(json.dumps(final_json, indent=4)) - result = post_and_patch_all_items(virtualapp, final_json) + result, 
passing = post_and_patch_all_items(virtualapp, final_json) print('\n'.join(result)) From 40a17448f48407477ffcc312f3f6164cc750be31 Mon Sep 17 00:00:00 2001 From: Sarah Date: Sun, 2 Aug 2020 14:45:29 -0400 Subject: [PATCH 043/125] in submit.py filenames now passed to s3 output file for later upload --- src/encoded/ingestion_engines.py | 5 +++++ src/encoded/submit.py | 29 ++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/encoded/ingestion_engines.py b/src/encoded/ingestion_engines.py index 9f6c42d778..e286e8035b 100644 --- a/src/encoded/ingestion_engines.py +++ b/src/encoded/ingestion_engines.py @@ -154,6 +154,7 @@ def handle_data_bundle(*, uuid, ingestion_type, vapp, log): resolution["validation_report_key"] = validation_report_key = "%s/validation-report.txt" % uuid resolution["submission_key"] = submission_key = "%s/submission.json" % uuid resolution["submission_response_key"] = submission_response_key = "%s/submission-response.txt" % uuid + resolution["info_for_file_upload_key"] = info_for_file_upload_key = "%s/info_for_file_upload.txt" % uuid with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key=validation_report_key) as fp: _show_report_lines(data_bundle_result['validation_output'], fp) @@ -167,6 +168,10 @@ def handle_data_bundle(*, uuid, ingestion_type, vapp, log): with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key=submission_response_key) as fp: _show_report_lines(data_bundle_result['post_output'], fp) + if data_bundle_result['file_info']: + with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key=info_for_file_upload_key) as fp: + _show_report_lines(data_bundle_result['file_info'], fp) + except Exception as e: resolution["traceback_key"] = traceback_key = "%s/traceback.json" % uuid diff --git a/src/encoded/submit.py b/src/encoded/submit.py index 24c22e3a7d..aa04c3f0c1 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -109,7 +109,8 @@ def submit_data_bundle(*, s3_client, bucket, key, project, institution, vapp): 'success': False, 'validation_output': [], 'final_json': {}, - 'post_output': [] + 'post_output': [], + 'file_info': [] } json_data, json_success = xls_to_json(file, project=project_json, institution=institution_json) if not json_success: @@ -121,9 +122,10 @@ def submit_data_bundle(*, s3_client, bucket, key, project, institution, vapp): if not validate_success: return results results['success'] = validate_success - result_lines, post_success = post_and_patch_all_items(vapp, final_json) + result_lines, post_success, files_to_upload = post_and_patch_all_items(vapp, final_json) results['post_output'] = result_lines results['success'] = post_success + results['file_info'] = files_to_upload return results @@ -386,7 +388,7 @@ def fetch_file_metadata(idx, filenames, proj_name): 'row': idx, 'file_format': '/file-formats/{}/'.format(fmt), 'file_type': valid_extensions[extension[0]][1], - 'filename': filename # causes problems without functional file upload + 'filename': filename.strip() # causes problems without functional file upload } if fmt == 'fastq': files['file_fastq'][file_alias] = file_info @@ -666,6 +668,7 @@ def validate_all_items(virtualapp, json_data): def post_and_patch_all_items(virtualapp, json_data_final): output = [] + files = [] if not json_data_final: return output, 'not run' item_names = {'individual': 'individual_id', 'family': 'family_id', 'sample': 'specimen_accession'} @@ -676,11 +679,14 @@ def post_and_patch_all_items(virtualapp, json_data_final): final_status[k] = 
{'posted': 0, 'not posted': 0, 'patched': 0, 'not patched': 0} for item in v: patch_info = {} - row = item['row'] + row = item.get('row') if row: del item['row'] # if 'filename' in item: # until we have functional file upload # del item['filename'] + fname = item.get('filename') + if fname: + del item['filename'] for field in LINKS: if field in item: patch_info[field] = item[field] @@ -695,6 +701,11 @@ def post_and_patch_all_items(virtualapp, json_data_final): json_data_final['patch'][k][atid] = patch_info if k in item_names: output.append('Success - {} {} posted'.format(k, item[item_names[k]])) + if fname: + files.append({ + 'uuid': response.json['@graph'][0]['uuid'], + 'filename': fname + }) else: final_status[k]['not posted'] += 1 no_errors = False @@ -712,12 +723,20 @@ def post_and_patch_all_items(virtualapp, json_data_final): for item_id, patch_data in v.items(): # if 'filename' in patch_data: # until we have functional file upload # del patch_data['filename'] + fname = patch_data.get('filename') + if fname: + del patch_data['filename'] try: response = virtualapp.patch_json('/' + item_id, patch_data, status=200) if response.json['status'] == 'success': # if k in item_names: # output.append('Success - {} {} patched'.format(k, patch_data[item_names[k]])) final_status[k]['patched'] += 1 + if fname: + files.append({ + 'uuid': response.json['@graph'][0]['uuid'], + 'filename': fname + }) else: final_status[k]['not patched'] += 1 no_errors = False @@ -729,7 +748,7 @@ def post_and_patch_all_items(virtualapp, json_data_final): output.append('{}: {} items patched successfully; {} items not patched'.format( k, final_status[k]['patched'], final_status[k]['not patched'] )) - return output, no_errors + return output, no_errors, files def cell_value(cell, datemode): From 9564b2e84cdb47276b8f2f6ab4e85f15b0d82594 Mon Sep 17 00:00:00 2001 From: Sarah Date: Sun, 2 Aug 2020 14:54:30 -0400 Subject: [PATCH 044/125] fixed info_for_file_upload.txt to be json rather than list of lines (data bundle submission) --- src/encoded/ingestion_engines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/encoded/ingestion_engines.py b/src/encoded/ingestion_engines.py index e286e8035b..174ed452ad 100644 --- a/src/encoded/ingestion_engines.py +++ b/src/encoded/ingestion_engines.py @@ -170,7 +170,7 @@ def handle_data_bundle(*, uuid, ingestion_type, vapp, log): if data_bundle_result['file_info']: with s3_output_stream(s3_client, bucket=DATA_BUNDLE_BUCKET, key=info_for_file_upload_key) as fp: - _show_report_lines(data_bundle_result['file_info'], fp) + print(json.dumps(data_bundle_result['file_info'], indent=2), file=fp) except Exception as e: From 65a36edec36571424aa5069ea0017e81fd6d6f95 Mon Sep 17 00:00:00 2001 From: Sarah Date: Tue, 4 Aug 2020 15:21:09 -0400 Subject: [PATCH 045/125] missing specimen id now stops submission properly --- src/encoded/submit.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index aa04c3f0c1..cf7f16eba6 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -167,7 +167,7 @@ def xls_to_json(xls_data, project, institution): # debuglog("descriptions:", descriptions) # Temporary instrumentation for debugging to go away soon. 
-kmp 25-Jul-2020 rows = [] # keys = [key.lower().strip().rstrip('*').rstrip() for key in keys] - required = ['individual id', 'relation to proband', 'report required', 'analysis id'] + required = ['individual id', 'relation to proband', 'report required', 'analysis id', 'specimen id'] missing = [col for col in required if col not in keys] if missing: msg = 'Column(s) "{}" not found in spreadsheet! Spreadsheet cannot be processed.'.format('", "'.join(missing)) @@ -193,7 +193,7 @@ def xls_to_json(xls_data, project, institution): for i, row in enumerate(rows): debuglog("row:", repr(row)) # Temporary instrumentation for debugging to go away soon. -kmp 25-Jul-2020 row_num = i + counter + 1 - missing_required = [col for col in required if col not in row] + missing_required = [col for col in required if col not in row or not row[col]] if missing_required: items['errors'].append( 'Spreadsheet row {} cannot be processed - missing required field(s) {}' @@ -222,9 +222,9 @@ def xls_to_json(xls_data, project, institution): file_errors.extend(file_items['errors']) items['file_fastq'].update(file_items['file_fastq']) items['file_processed'].update(file_items['file_processed']) - else: - items['errors'].append('WARNING: No specimen id present for patient {},' - ' sample will not be created.'.format(row['individual id'])) + # else: + # items['errors'].append('WARNING: No specimen id present for patient {},' + # ' sample will not be created.'.format(row['individual id'])) # create SampleProcessing item for trio/group if needed # items = create_sample_processing_groups(items, sp_alias) items = add_relations(items) @@ -670,7 +670,7 @@ def post_and_patch_all_items(virtualapp, json_data_final): output = [] files = [] if not json_data_final: - return output, 'not run' + return output, 'not run', [] item_names = {'individual': 'individual_id', 'family': 'family_id', 'sample': 'specimen_accession'} final_status = {} no_errors = True From 31f68c23a9dcb45f53c1be895c28e28ea6e8a15f Mon Sep 17 00:00:00 2001 From: Sarah Date: Tue, 4 Aug 2020 16:23:45 -0400 Subject: [PATCH 046/125] tweaks to error reporting in submit.py --- src/encoded/submit.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/src/encoded/submit.py b/src/encoded/submit.py index cf7f16eba6..4133f3a25a 100644 --- a/src/encoded/submit.py +++ b/src/encoded/submit.py @@ -196,10 +196,15 @@ def xls_to_json(xls_data, project, institution): missing_required = [col for col in required if col not in row or not row[col]] if missing_required: items['errors'].append( - 'Spreadsheet row {} cannot be processed - missing required field(s) {}' + 'Row {} - missing required field(s) {}. This row cannot be processed.' ''.format(row_num, ', '.join(missing_required)) ) indiv_alias = '{}:individual-{}'.format(project['name'], row['individual id']) + if not family_dict.get(row['analysis id']): + msg = ('Row {} - Proband for this analysis could not be found. 
' + 'This row cannot be processed.'.format(i)) + items['errors'].append(msg) + continue fam_alias = '{}:{}'.format(project['name'], family_dict[row['analysis id']]) # sp_alias = '{}:sampleproc-{}'.format(project['name'], row['specimen id']) # create items for Individual @@ -239,6 +244,7 @@ def xls_to_json(xls_data, project, institution): val2['project'] = project['@id'] val2['institution'] = institution['@id'] items['errors'].extend(file_errors) + items['errors'] = list(set(items['errors'])) return items, True # most errors passed to next step in order to combine with validation errors @@ -311,7 +317,7 @@ def fetch_family_metadata(idx, row, items, indiv_alias, fam_alias): relation_found = True break if not relation_found: - msg = 'Row {}: Invalid relation "{}" for individual {} - Relation should be one of: {}'.format( + msg = 'Row {} - Invalid relation "{}" for individual {} - Relation should be one of: {}'.format( idx, row.get('relation to proband'), row.get('individual id'), ', '.join(valid_relations) ) items['errors'].append(msg) @@ -354,6 +360,10 @@ def fetch_sample_metadata(idx, row, items, indiv_alias, samp_alias, analysis_ali } if row.get('analysis id') in analysis_type_dict: new_sp_item['analysis_type'] = analysis_type_dict[row.get('analysis id')] + if not analysis_type_dict[row.get('analysis id')]: + msg = ('Row {} - Samples with analysis ID {} contain mis-matched or invalid workup type values. ' + 'Sample cannot be processed.'.format(idx, row.get('analysis id'))) + items['errors'].append(msg) new_items['sample_processing'].setdefault(analysis_alias, new_sp_item) new_items['sample_processing'][analysis_alias]['samples'].append(samp_alias) if row.get('report required').lower().startswith('y'): @@ -611,7 +621,7 @@ def validate_all_items(virtualapp, json_data): if itemtype not in ['case', 'report']: for e in error: if row: - errors.append('Row {} {} - Error found: {}'.format(row, itemtype, e)) + errors.append('Row {} - Error found: {}'.format(row, e)) else: errors.append('{} {} - Error found: {}'.format(itemtype, alias, e)) validation_results[itemtype]['errors'] += 1 @@ -648,8 +658,15 @@ def validate_all_items(virtualapp, json_data): json_data_final['patch'].setdefault(itemtype, {}) if patch_data: json_data_final['patch'][itemtype][db_results[alias]['@id']] = patch_data - elif itemtype not in ['case', 'report', 'sample_processing']: - output.append('{} {} - Item already in database, no changes needed'.format(itemtype, alias)) + elif itemtype not in ['case', 'report', 'sample_processing', 'file_fastq']: + item_name = alias[alias.index(':')+1:] + if item_name.startswith(itemtype + '-'): + item_name = item_name[item_name.index('-') + 1:] + if itemtype == 'family': + item_name = 'family for ' + item_name + else: + item_name = itemtype + ' ' + item_name + output.append('{} - Item already in database, no changes needed'.format(item_name)) # do something to record response validation_results[itemtype]['validated'] += 1 output.extend([error for error in errors]) @@ -658,7 +675,7 @@ def validate_all_items(virtualapp, json_data): itemtype, validation_results[itemtype]['validated'], validation_results[itemtype]['errors'] )) if errors: - output.append('Validation errors found in items. Please fix spreadsheet before submitting.') + output.append('Errors found in items. 
Please fix spreadsheet before submitting.') return {}, output, False else: json_data_final['aliases'] = alias_dict From 1bd88c4768eeaceb8e40afa660b83e71129a588f Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Tue, 4 Aug 2020 16:34:07 -0400 Subject: [PATCH 047/125] WIP: Checkpointed work. Not ready for release. --- poetry.lock | 14 +- pyproject.toml | 2 +- src/encoded/ingestion/processors.py | 14 +- src/encoded/ingestion_listener.py | 130 +++++++++++++++--- src/encoded/schemas/ingestion_submission.json | 45 +++++- src/encoded/tests/test_util.py | 13 +- src/encoded/types/ingestion.py | 82 ++++++----- src/encoded/util.py | 80 ++++++++++- 8 files changed, 308 insertions(+), 72 deletions(-) diff --git a/poetry.lock b/poetry.lock index a288056228..2f3dcecda9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -311,7 +311,7 @@ description = "Utility package for interacting with the 4DN Data Portal and othe name = "dcicutils" optional = false python-versions = ">=3.4,<3.8" -version = "0.37.0" +version = "0.38.0" [package.dependencies] aws-requests-auth = ">=0.4.2,<1" @@ -1410,7 +1410,7 @@ description = "Fast, Extensible Progress Meter" name = "tqdm" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*" -version = "4.48.0" +version = "4.48.1" [package.extras] dev = ["py-make (>=0.1.0)", "twine", "argopt", "pydoc-markdown"] @@ -1655,7 +1655,7 @@ transaction = ">=1.6.0" test = ["zope.testing"] [metadata] -content-hash = "b245acd5efed1e52abfe3baf811585ca3307212bde8931f0067ec94e41fbf345" +content-hash = "26aaf14a00bb85d74fb32f02edf89778c1e361710661b2c8ab844e09b33d16c3" lock-version = "1.0" python-versions = ">=3.6,<3.7" @@ -1819,8 +1819,8 @@ dcicsnovault = [ {file = "dcicsnovault-3.1.9.tar.gz", hash = "sha256:347ab5ee3053a80273b081803f93fe115ed48ad53e6ce91c65a5d21a8f02d0e2"}, ] dcicutils = [ - {file = "dcicutils-0.37.0-py3-none-any.whl", hash = "sha256:faa5f6c84a70a9b04ad6b5abac0afba67cc7e9b0f2a342d4bfff746c750355d6"}, - {file = "dcicutils-0.37.0.tar.gz", hash = "sha256:eff330adbf34ac4b8e65ecc48c2b1fe91251f8f25691f8c614825669d40ba128"}, + {file = "dcicutils-0.38.0-py3-none-any.whl", hash = "sha256:57636fa6b802881a02375123080d3e121a8f22fbe731d7c2312e1a17fa69e575"}, + {file = "dcicutils-0.38.0.tar.gz", hash = "sha256:fcdd88e6169b0b98393e052859bf09d98a31102d5d55f79339046d3e67b5edf9"}, ] docker = [ {file = "docker-4.2.2-py2.py3-none-any.whl", hash = "sha256:03a46400c4080cb6f7aa997f881ddd84fef855499ece219d75fbdb53289c17ab"}, @@ -2381,8 +2381,8 @@ toml = [ {file = "toml-0.10.1.tar.gz", hash = "sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f"}, ] tqdm = [ - {file = "tqdm-4.48.0-py2.py3-none-any.whl", hash = "sha256:fcb7cb5b729b60a27f300b15c1ffd4744f080fb483b88f31dc8654b082cc8ea5"}, - {file = "tqdm-4.48.0.tar.gz", hash = "sha256:6baa75a88582b1db6d34ce4690da5501d2a1cb65c34664840a456b2c9f794d29"}, + {file = "tqdm-4.48.1-py2.py3-none-any.whl", hash = "sha256:44b896c38f70f91826a3f83a3195b23c0460322bfc729566ec8e4e89bb5ad713"}, + {file = "tqdm-4.48.1.tar.gz", hash = "sha256:7b7dd59cd9f03b89365ba67eb8515f5d2803fd1eb707abdbb914691a3123d9df"}, ] transaction = [ {file = "transaction-2.4.0-py2.py3-none-any.whl", hash = "sha256:b96a5e9aaa73f905759bc9ccf0021bf4864c01ac36666e0d28395e871f6d584a"}, diff --git a/pyproject.toml b/pyproject.toml index d18ac3316c..5bddccc4d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ chardet = "3.0.4" colorama = "0.3.3" dcicpyvcf = "1.0.0" dcicsnovault = ">=3.1.9,<4" # Fixes build problems in 3.1.8 -dcicutils = ">=0.37.0,<1" # has the 
LockoutManager needed for Snovault +dcicutils = ">=0.38.0,<1" # has the LockoutManager needed for Snovault docutils = "0.12" elasticsearch = "5.5.3" elasticsearch-dsl = "^5.4.0" diff --git a/src/encoded/ingestion/processors.py b/src/encoded/ingestion/processors.py index 4d3173da17..e83846518d 100644 --- a/src/encoded/ingestion/processors.py +++ b/src/encoded/ingestion/processors.py @@ -50,12 +50,12 @@ def handle_data_bundle(submission): response = s3_client.get_object(Bucket=DATA_BUNDLE_BUCKET, Key=manifest_key) manifest = json.load(response['Body']) - data_key = manifest['object_name'] + object_name = manifest['object_name'] parameters = manifest['parameters'] institution = get_parameter(parameters, 'institution') project = get_parameter(parameters, 'project') - debuglog(submission_id, "data_key:", data_key) + debuglog(submission_id, "object_name:", object_name) debuglog(submission_id, "parameters:", parameters) started_key = "%s/started.txt" % submission_id @@ -65,15 +65,17 @@ def handle_data_bundle(submission): # data_stream = s3_client.get_object(Bucket=DATA_BUNDLE_BUCKET, Key="%s/manifest.json" % uuid)['Body'] resolution = { - "data_key": data_key, + "data_key": object_name, "manifest_key": manifest_key, "started_key": started_key, } try: - submission.set_item_detail(object_name=manifest['object_name'], parameters=manifest['parameters'], - institution=institution, project=project) + submission.patch_item(submission_id=submission_id, + object_name=object_name, + parameters=parameters, + processing_status={"state": "processing"}) if isinstance(institution, str): institution = submission.vapp.get(institution).json @@ -82,7 +84,7 @@ def handle_data_bundle(submission): validation_log_lines, final_json, result_lines = submit_data_bundle(s3_client=s3_client, bucket=DATA_BUNDLE_BUCKET, - key=data_key, + key=object_name, project=project, institution=institution, vapp=submission.vapp) diff --git a/src/encoded/ingestion_listener.py b/src/encoded/ingestion_listener.py index 204181aa3e..c49807d6ad 100644 --- a/src/encoded/ingestion_listener.py +++ b/src/encoded/ingestion_listener.py @@ -8,26 +8,34 @@ import json import os import psycopg2 -import requests # XXX: C4-211 should not be needed but is +import pyramid.request +import requests # XXX: C4-211 should not be needed but is // KMP needs this, too, until subrequest posts work import signal import socket import structlog import threading import time +import urllib.parse import uuid import webtest from dcicutils.misc_utils import VirtualApp, ignored from pyramid import paster +from pyramid.request import Request from pyramid.response import Response from pyramid.view import view_config +from requests.auth import HTTPBasicAuth +from snovault import COLLECTIONS, Collection +from snovault.crud_views import collection_add as sno_collection_add +from snovault.embed import make_subrequest +from snovault.schema_utils import validate_request from snovault.util import debug_log from vcf import Reader from .commands.ingest_vcf import VCFParser from .ingestion.common import register_path_content_type, DATA_BUNDLE_BUCKET, SubmissionFailure from .ingestion.processors import get_ingestion_processor from .types.ingestion import SubmissionFolio -from .util import resolve_file_path, gunzip_content, debuglog +from .util import resolve_file_path, gunzip_content, debuglog, subrequest_item_creation log = structlog.getLogger(__name__) @@ -42,10 +50,75 @@ def includeme(config): config.add_route('ingestion_status', '/ingestion_status') 
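For orientation, once a submission reaches the queue, the reworked run() loop later in this patch hands each message to a per-type processor. Stripped of the polling and status bookkeeping, the flow is roughly the sketch below; SubmissionFolio, get_ingestion_processor, log, and delete_messages are the names used in this patch series, while process_one_message is an illustrative wrapper that relies on the module imports shown in the diff above.

# Rough sketch (simplified) of the per-message dispatch in IngestionListener.run().
def process_one_message(self, message, uuid, ingestion_type):
    # Constructing the folio also creates the IngestionSubmission item
    # with processing_status {"state": "submitted"}.
    submission = SubmissionFolio(vapp=self.vapp, ingestion_type=ingestion_type, submission_id=uuid)
    handler = get_ingestion_processor(ingestion_type)  # e.g. handle_data_bundle for 'data_bundle'
    try:
        handler(submission)
    except Exception as e:
        log.error(e)
    # Success or failure, the message is not reprocessed.
    self.delete_messages([message])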
config.add_route('prompt_for_ingestion', '/prompt_for_ingestion') config.add_route('submit_for_ingestion', '/submit_for_ingestion') + + # THESE TWO ARE FOR DEBUGGING ONLY. + config.add_route('prompt_for_subrequest', '/prompt_for_subrequest') + config.add_route('submit_subrequest', '/submit_subrequest') + config.registry[INGESTION_QUEUE] = IngestionQueueManager(config.registry) config.scan(__name__) +# Moved to util.py and modified. +# def subrequest_item_creation(request: pyramid.request.Request, item_type: str, json_body: dict = None) -> dict: +# if json_body is None: +# json_body = {} +# collection_path = '/' + item_type +# method = 'POST' +# # json_utf8 = json.dumps(json_body).encode('utf-8') # Unused, but here just in case +# subrequest = make_subrequest(request=request, path=collection_path, method=method, json_body=json_body) +# subrequest.remote_user = 'EMBED' +# subrequest.registry = request.registry +# # Maybe... +# # validated = json_body.copy() +# # subrequest.validated = validated +# collection: Collection = subrequest.registry[COLLECTIONS][item_type] +# check_true(subrequest.json_body, "subrequest.json_body is not properly initialized.") +# check_true(not subrequest.validated, "subrequest was unexpectedly validated already.") +# check_true(subrequest.remote_user == 'EMBED', "subrequest.remote_user is not 'EMBED'.") +# check_true(not subrequest.errors, "subrequest.errors already has errors before trying to validate.") +# check_true(subrequest.remote_user is None, "subrequest.remote_user should have been None before we set it.") +# check_true(request.remote_user is None, "request.remote_user should have been None before we set it.") +# request.remote_user = 'EMBED' +# validate_request(schema=collection.type_info.schema, request=subrequest, data=json_body) +# if not subrequest.validated: +# return { +# "@type": ["Exception"], +# "errors": subrequest.errors +# } +# else: +# json_result: dict = sno_collection_add(context=collection, request=subrequest, render=False) +# return json_result + + +# FOR DEBUGGING ONLY +@view_config(route_name='prompt_for_subrequest', request_method='GET') +@debug_log +def prompt_for_subrequest(context, request): + ignored(context, request) + return Response(PROMPT_FOR_SUBREQUEST) + + +# FOR DEBUGGING ONLY +register_path_content_type(path='/submit_subrequest', content_type='multipart/form-data') +@view_config(route_name='submit_subrequest', request_method='POST', accept='multipart/form-data') +@debug_log +def submit_subrequest(context, request): + # import pdb; pdb.set_trace() + institution = "/institutions/hms-dbmi/" + project = "/projects/12a92962-8265-4fc0-b2f8-cf14f05db58b/" + # institution = request.invoke_subrequest(make_subrequest(request, institution)).json + print("institution=", institution) + # project = request.invoke_subrequest(make_subrequest(request, project)).json + print("project=", project) + json_body = { + "ingestion_type": 'data_bundle', + "institution": institution, + "project": project, + } + return subrequest_item_creation(request=request, item_type='IngestionSubmission', json_body=json_body) + + @view_config(route_name='prompt_for_ingestion', request_method='GET') @debug_log def prompt_for_ingestion(context, request): @@ -65,6 +138,14 @@ def submit_for_ingestion(context, request): override_name = request.POST.get('override_name', None) parameters = dict(request.POST) parameters['datafile'] = filename + institution = parameters.get('institution', 'institution-missing') + project = parameters.get('project', 'project-missing') + + 
submission_id = SubmissionFolio.create_item(request, + ingestion_type=ingestion_type, + institution=institution, + project=project) + # ``input_file`` contains the actual file data which needs to be # stored somewhere. @@ -75,7 +156,10 @@ def submit_for_ingestion(context, request): # NOTE: Some reference information about uploading files to s3 is here: # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html - submission_id = str(uuid.uuid4()) + # submission.set_item_detail(object_name=manifest['object_name'], parameters=manifest['parameters'], + # institution=institution, project=project) + + # submission_id = str(uuid.uuid4()) _, ext = os.path.splitext(filename) object_name = "{id}/datafile{ext}".format(id=submission_id, ext=ext) manifest_name = "{id}/manifest.json".format(id=submission_id) @@ -525,6 +609,14 @@ def run(self): debuglog("Ingestion listener started.") + messages = [] # This'll get a better value below in each loop iteration. This is just a declaration of intent. + + def discard(msg): + self.delete_messages([msg]) + # Assuming we didn't get an error trying to remove it, + # it should also get removed from our to-do list. + messages.remove(msg) + while self.should_remain_online(): debuglog("About to get messages.") @@ -548,12 +640,14 @@ def run(self): # to make all the parts work the same -kmp submission = SubmissionFolio(vapp=self.vapp, ingestion_type=ingestion_type, submission_id=uuid) handler = get_ingestion_processor(ingestion_type) - handler(submission) - # TODO: If we delete messages at the end of each loop, I think we'll here need to do this, - # since we're bypassing bottom of lop with the 'continue': - # self.delete_messages([message]) - # messages.remove(message) - debuglog("HANDLED", uuid) + try: + debuglog("HANDLING:", uuid) + handler(submission) + debuglog("HANDLED:", uuid) + except Exception as e: + log.error(e) + # If we suceeded, we don't need to do it again, and if we failed we don't need to fail again. + discard(message) continue debuglog("Did NOT process", uuid, "as", ingestion_type) @@ -599,9 +693,10 @@ def run(self): log.error(msg) self.update_status(msg=msg) - # TODO: I worry waiting to delete multiple messages means that if there's an error - # we'll have things that were completed not get deleted. Should delete one per iteration? - # -kmp 26-Jul-2020 + discard(message) + + # This is just fallback cleanup in case messages weren't cleaned up within the loop. + # In normal operation, they will be. self.delete_messages(messages) @@ -716,7 +811,7 @@ def status_app(environ, start_response): # Command Application (for waitress) def main(): """ Entry point for the local deployment. """ - parser = argparse.ArgumentParser( + parser = argparse.ArgumentParser( # noqa - PyCharm wrongly thinks the formatter_class is specified wrong here. description='Listen for VCF File uuids to ingest', epilog=EPILOG, formatter_class=argparse.RawDescriptionHelpFormatter @@ -736,13 +831,13 @@ def main(): vapp = VirtualApp(app, config) return run(vapp) -PROMPT_FOR_INGESTION = """ +PROMPT_TEMPLATE = """ Submit for Ingestion