# Example of validating the mapping file (provenance) using schema

In [1]:
## CX: allows multiple lines of code to print from one code block
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import json
import jsonschema
import pathlib
import jsonref
import yaml

In [2]:
pathlib.Path.cwd()

PosixPath('/Users/jay/Desktop/biothings_explorer/jupyter notebooks/CX_WIPs/Draft_Metadata')

## loading schema

In [3]:
yaml_schema = pathlib.Path.cwd().joinpath("draft6_schema_mappingFile.yaml")
with open(yaml_schema) as file:
    schema_from_yaml = yaml.load(file, Loader=yaml.SafeLoader)
    schema_from_yaml = json.dumps(schema_from_yaml, indent=2)
    schema_from_yaml = jsonref.loads(schema_from_yaml)

In [4]:
schema_from_yaml.keys()
schema_from_yaml['description']

dict_keys(['title', '$schema', 'type', 'description', 'properties', 'definitions'])

'Contains the mapping between possible values in the response fields (as string keys) to info in the desired format. Will be used often for provenance'

In [5]:
schema_from_yaml['properties'].keys()

dict_keys(['provenanceMapping'])

In [6]:
schema_from_yaml['properties']['provenanceMapping']['patternProperties']\
['.']['items']['properties'].keys()

dict_keys(['name', 'sourceType', 'version', 'versionType', 'method', 'sourceReferences', 'descriptiveInfo', 'sourceContext'])

In [7]:
schema_from_yaml['properties']['provenanceMapping']['patternProperties']\
['.']['items']['properties']['sourceReferences']['properties']['websites']\
['properties']['value']['oneOf']

[{'type': 'array', 'minItems': 1, 'items': {'type': 'string'}},
 {'type': 'string'}]

## checking an example

In [8]:
yaml_example = pathlib.Path.cwd().joinpath("draft6_example_mappingFile_disgenetDV.yaml")
with open(yaml_example) as file:
    example_from_yaml = yaml.load(file, Loader=yaml.SafeLoader)
    example_from_yaml = json.dumps(example_from_yaml, indent=2)
    example_from_yaml = jsonref.loads(example_from_yaml)

In [9]:
example_from_yaml['provenanceMapping'].keys()

dict_keys(['BEFREE', 'UNIPROT', 'CLINVAR', 'GWASCAT', 'GWASDB'])

In [10]:
example_from_yaml['provenanceMapping']['GWASCAT'][0]

{'name': 'MyDisease.info API',
 'sourceType': 'service',
 'version': '2020-10-26',
 'versionType': 'date_last_updated',
 'method': 'ingest',
 'descriptiveInfo': 'Downloaded the DisGeNET ALL variant-disease-pmid associations file'}

In [11]:
example_from_yaml['provenanceMapping']['UNIPROT'][2]

{'name': 'UniProtKB',
 'sourceType': 'knowledgebase',
 'method': 'manual_curation',
 'sourceContext': {'taxonSpecific': {'value': 'NCBITaxon:9606'}},
 'sourceReferences': {'websites': {'value': ['https://www.uniprot.org/docs/humsavar',
    'https://www.uniprot.org/help/entry_status',
    'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/variants/README']}},
 'descriptiveInfo': 'The humsavar file from UniProt lists all missense variants annotated in human UniProtKB/Swiss-Prot entries, along with their gene/protein and whether they are implicated in a specific disease. UniProtKB/Swiss-Prot annotations are made by curators (1) using scientific literature or (2) reviewing and accepting the annotations made by computational analysis /automatic processing.'}

## validate example against schema

In [12]:
jsonschema.validate(instance=example_from_yaml, schema=schema_from_yaml)
## YAYAYAYAYAY this means it validated!!!!!!!

### error: if a required field is missing

In [13]:
## but what if it's a fluke?? 
## remove a required key from one association 
removed1 = example_from_yaml['provenanceMapping']['UNIPROT'][2].pop('name')

## then try to validate, this 
jsonschema.validate(instance=example_from_yaml, schema=schema_from_yaml)
## so it works in catching the error yayyyyyyyy. too bad it doesn't tell you which entry had the error in it...

ValidationError: 'name' is a required property

Failed validating 'required' in schema['properties']['provenanceMapping']['patternProperties']['.']['items']:
    {'additionalProperties': False,
     'properties': {'descriptiveInfo': {'description': 'Long-text '
                                                       'description of '
                                                       'what the source '
                                                       'did to create '
                                                       'associations. Try '
                                                       'to include enough '
                                                       'detail that a '
                                                       'reader would '
                                                       'understand what '
                                                       'happened',
                                        'type': 'string'},
                    'method': {'description': 'Keyword noting what the '
                                              'source did to create '
                                              'associations from data or '
                                              'knowledge. Current possible '
                                              'values include ingest, NLP, '
                                              'manual_curation, '
                                              'shared_phenotype, '
                                              'shared_disease, '
                                              'shared_variant, submission, '
                                              'orthology_mapping',
                               'type': 'string'},
                    'name': {'description': 'Name of the source (KP API, '
                                            'knowledge-source, database, '
                                            'text corpus, etc)',
                             'type': 'string'},
                    'sourceContext': {'additionalProperties': False,
                                      'description': 'Object, Python '
                                                     'dict-like. Keys '
                                                     'specify the type of '
                                                     'context. Values '
                                                     'depend on where the '
                                                     'information comes '
                                                     'from. After '
                                                     'post-processing, the '
                                                     'context should be '
                                                     'ontology terms '
                                                     '(curies) or '
                                                     'short-strings. '
                                                     'Context/relevance is '
                                                     'information that '
                                                     'restricts the use or '
                                                     'interpretation of '
                                                     'this association. '
                                                     'This lets a '
                                                     'developer know if '
                                                     'the operation or the '
                                                     'associations '
                                                     'retrieved are '
                                                     'relevant to the '
                                                     'question being asked',
                                      'minProperties': 1,
                                      'patternProperties': {'taxonSpecific|diseaseSpecific|cohortSpecific|experimentalSpecific': {'additionalProperties': False,
                                                                                                                                  'description': 'Info '
                                                                                                                                                 'is '
                                                                                                                                                 'not '
                                                                                                                                                 'dependent '
                                                                                                                                                 'on '
                                                                                                                                                 'API/JSON-response. '
                                                                                                                                                 'In '
                                                                                                                                                 'this '
                                                                                                                                                 'situation, '
                                                                                                                                                 'a '
                                                                                                                                                 'static '
                                                                                                                                                 'value '
                                                                                                                                                 'can '
                                                                                                                                                 'be '
                                                                                                                                                 'set '
                                                                                                                                                 'in '
                                                                                                                                                 'the '
                                                                                                                                                 'registry '
                                                                                                                                                 'using '
                                                                                                                                                 'the '
                                                                                                                                                 'value '
                                                                                                                                                 'field',
                                                                                                                                  'properties': {'value': {'oneOf': [{'items': {'type': 'string'},
                                                                                                                                                                      'minItems': 1,
                                                                                                                                                                      'type': 'array'},
                                                                                                                                                                     {'type': 'string'}]}},
                                                                                                                                  'required': ['value'],
                                                                                                                                  'type': 'object'}},
                                      'type': 'object'},
                    'sourceReferences': {'additionalProperties': False,
                                         'description': 'Used to provide '
                                                        'publications and '
                                                        'website URLs for '
                                                        'users to learn '
                                                        'more about an '
                                                        'association, '
                                                        'knowledge-source, '
                                                        'measure, '
                                                        'context/relevance, '
                                                        'etc',
                                         'minProperties': 1,
                                         'properties': {'publications': {'minProperties': 1,
                                                                         'patternProperties': {'.': {'additionalProperties': False,
                                                                                                     'properties': {'value': {'oneOf': [{'items': {'type': ['string',
                                                                                                                                                            'number']},
                                                                                                                                         'minItems': 1,
                                                                                                                                         'type': 'array'},
                                                                                                                                        {'type': ['string',
                                                                                                                                                  'number']}]}},
                                                                                                     'required': ['value'],
                                                                                                     'type': 'object'}},
                                                                         'type': 'object'},
                                                        'websites': {'additionalProperties': False,
                                                                     'description': 'Info '
                                                                                    'is '
                                                                                    'not '
                                                                                    'dependent '
                                                                                    'on '
                                                                                    'API/JSON-response. '
                                                                                    'In '
                                                                                    'this '
                                                                                    'situation, '
                                                                                    'a '
                                                                                    'static '
                                                                                    'value '
                                                                                    'can '
                                                                                    'be '
                                                                                    'set '
                                                                                    'in '
                                                                                    'the '
                                                                                    'registry '
                                                                                    'using '
                                                                                    'the '
                                                                                    'value '
                                                                                    'field',
                                                                     'properties': {'value': {'oneOf': [{'items': {'type': 'string'},
                                                                                                         'minItems': 1,
                                                                                                         'type': 'array'},
                                                                                                        {'type': 'string'}]}},
                                                                     'required': ['value'],
                                                                     'type': 'object'}},
                                         'type': 'object'},
                    'sourceType': {'description': 'Type of the source (KP '
                                                  'API, knowledge-source, '
                                                  'database, text corpus, '
                                                  'etc). Current possible '
                                                  'values include service, '
                                                  'knowledgebase, '
                                                  'publications, text, '
                                                  'clinical_database',
                                   'type': 'string'},
                    'version': {'description': 'Source version (KP API, '
                                               'knowledge-source, '
                                               'database, text corpus, '
                                               'etc)',
                                'type': 'string'},
                    'versionType': {'description': 'There are different '
                                                   'formats for '
                                                   'versioning. Current '
                                                   'possible values '
                                                   'include '
                                                   'date_last_updated, '
                                                   'version_number, '
                                                   'data_from_this_date, '
                                                   'latest_date_covered',
                                    'type': 'string'}},
     'required': ['name', 'sourceType'],
     'type': 'object'}

On instance['provenanceMapping']['UNIPROT'][2]:
    {'descriptiveInfo': 'The humsavar file from UniProt lists all missense '
                        'variants annotated in human UniProtKB/Swiss-Prot '
                        'entries, along with their gene/protein and '
                        'whether they are implicated in a specific '
                        'disease. UniProtKB/Swiss-Prot annotations are '
                        'made by curators (1) using scientific literature '
                        'or (2) reviewing and accepting the annotations '
                        'made by computational analysis /automatic '
                        'processing.',
     'method': 'manual_curation',
     'sourceContext': {'taxonSpecific': {'value': 'NCBITaxon:9606'}},
     'sourceReferences': {'websites': {'value': ['https://www.uniprot.org/docs/humsavar',
                                                 'https://www.uniprot.org/help/entry_status',
                                                 'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/variants/README']}},
     'sourceType': 'knowledgebase'}

In [14]:
example_from_yaml['provenanceMapping']['UNIPROT'][2]['name'] = removed1

In [15]:
jsonschema.validate(instance=example_from_yaml, schema=schema_from_yaml)
## and it's back and accepted again 

## Export JSON files for the yamls

In [16]:
json_schema_path = pathlib.Path.cwd().joinpath("draft6_schema_mappingFile.json")
with open(json_schema_path, "w") as file:
    json.dump(schema_from_yaml, file, indent=2)

In [17]:
json_example_path = pathlib.Path.cwd().joinpath("draft6_example_mappingFile_disgenetDV.json")
with open(json_example_path, "w") as file:
    json.dump(example_from_yaml, file, indent=2)