In [6]:
import os
import json
import jsonschema
import requests

# Location of the sample JSON files; script_path is empty, so the path is
# resolved relative to the current working directory.
script_path = ''
DATAPATH = os.path.join(script_path,'sample data/')

# Download the DDE-generated Bioschemas schema. A timeout keeps the notebook
# from hanging on a stalled connection, and raise_for_status() stops early on
# an HTTP error instead of trying to parse an error page as JSON.
bioschemas_raw = requests.get('https://raw.githubusercontent.com/gtsueng/DDE_bioschemas/main/bioschemas.json',
                              timeout=30)
bioschemas_raw.raise_for_status()
bioschemas = bioschemas_raw.json()

#### Function to fetch the validations from a DDE-generated schema
def get_validation_from_schema(ddeschema):
    """Collect the ``$validation`` sections of a DDE-generated schema.

    Parameters
    ----------
    ddeschema : dict
        Parsed schema document; its ``"@graph"`` entry is iterated.

    Returns
    -------
    dict
        Maps each graph entry's ``"@id"`` to that entry's ``"$validation"``
        value. Entries lacking either key (or that are not subscriptable by
        string keys) are skipped.

    Raises
    ------
    KeyError
        If ``ddeschema`` has no ``"@graph"`` key.
    """
    validation_dict = {}
    for eachentry in ddeschema["@graph"]:
        try:
            # Read "$validation" first so entries without it are skipped
            # before "@id" is ever looked up.
            validation = eachentry["$validation"]
            validation_dict[eachentry["@id"]] = validation
        except (KeyError, TypeError):
            # Only "entry has no usable validation" is ignored; any other
            # error is allowed to surface instead of being silently dropped.
            continue
    return validation_dict


In [3]:
%%time
#### script to check validation

import os
import json
import jsonschema
import requests

validation_dict = get_validation_from_schema(bioschemas)

# Sample file stem -> "@id" of the Bioschemas type it is validated against
type_dict = {'massbank':'bioschemas:MolecularEntity',
             'disprot':'bioschemas:Protein',
             'edgar':'bioschemas:Gene'}

DATAPATH = 'sample data/'

allresults = []
for eachtype, schema_id in type_dict.items():
    # Build the validator once per source instead of once per record:
    # jsonschema.validate() re-checks the schema and creates a new validator
    # on every call, which is loop-invariant work here.
    schema = validation_dict[schema_id]
    validator_class = jsonschema.validators.validator_for(schema)
    # A malformed schema raises jsonschema.SchemaError here rather than being
    # reported as a validation failure of every single record.
    validator_class.check_schema(schema)
    validator = validator_class(schema)

    with open(os.path.join(DATAPATH, eachtype+'.json'),'rb') as inputfile:
        esjson = json.load(inputfile)
    esdocs = esjson['hits']['hits']
    for eachhit in esdocs:
        sample = eachhit['_source']
        # is_valid() reports only schema violations; unrelated errors are no
        # longer swallowed and counted as 'fail'.
        outcome = 'pass' if validator.is_valid(sample) else 'fail'
        # .get() keeps a record without 'identifier' in the results instead of
        # raising KeyError while the failure is being recorded.
        allresults.append({'source_type':eachtype,
                           '_id':sample.get('identifier'),
                           'validation':outcome})

Wall time: 1min 21s


In [4]:
%%time
import pandas as pd
# One row per validated record: source_type, _id, validation
valtest = pd.DataFrame(allresults)

# Split the results by data source
edgar = valtest[valtest['source_type'].eq('edgar')]
massbank = valtest[valtest['source_type'].eq('massbank')]
disprot = valtest[valtest['source_type'].eq('disprot')]

# Print the pass/fail tally of each source, in the order edgar, massbank, disprot
for subset in (edgar, massbank, disprot):
    tally = subset.groupby('validation').size()
    print(tally.reset_index(name='counts'))

  validation  counts
0       fail    3510
  validation  counts
0       pass   10000
  validation  counts
0       pass    1971
Wall time: 348 ms


In [7]:
#### Check why edgar is failing: report the first validation error found

validation_dict = get_validation_from_schema(bioschemas)

type_dict = {'edgar':'bioschemas:Gene',
             'massbank':'bioschemas:MolecularEntity',
             'disprot':'bioschemas:Protein'}

DATAPATH = 'sample data/'

eachtype = 'edgar'
valcheck = []
with open(DATAPATH+eachtype+'.json','rb') as inputfile:
    esjson = json.load(inputfile)
esdocs = esjson['hits']['hits']
for eachhit in esdocs:
    sample = eachhit['_source']
    try:
        jsonschema.validate(sample,validation_dict[type_dict[eachtype]])
    except jsonschema.ValidationError as validation_error:
        # Show the diagnostic and stop at the first failing record, without
        # raising, so that a full "Run All" is not aborted by this cell.
        print('ValidationError: ' + str(validation_error))
        break
    # Only records that validated before the first failure are collected.
    valcheck.append({'source_type':eachtype,'_id':sample['identifier'],'validation':'pass'})


ValidationError: 'http://edgar.biocomp.unibo.it/cgi-bin/gene_disease_db/gene.py?gene=ABCB4' is valid under each of {'type': 'string', 'format': 'uri'}, {'type': 'string'}

Failed validating 'oneOf' in schema['properties']['identifier']:
    {'description': 'The identifier property represents any kind of '
                    'identifier for any kind of <a class="localLink" '
                    'href="http://schema.org/Thing">Thing</a>, such as '
                    'ISBNs, GTIN codes, UUIDs etc. Schema.org provides '
                    'dedicated properties for representing many of these, '
                    'either as textual strings or as URL (URI) links. See '
                    '<a '
                    'href="/docs/datamodel.html#identifierBg">background '
                    'notes</a> for more details.',
     'oneOf': [{'$ref': '#/definitions/propertyValue'},
               {'type': 'string'},
               {'format': 'uri', 'type': 'string'}]}

On instance['identifier']:
    'http://edgar.biocomp.unibo.it/cgi-bin/gene_disease_db/gene.py?gene=ABCB4'

In [None]:
#### script to run remotely if files could not be pulled locally
## They were pulled just fine via curl, so ignore this

import os
import json
import jsonschema
import requests

# Remote index name -> "@id" of the Bioschemas type its records should follow
type_dict = {'crawler_edgar':'bioschemas:Gene',
             'crawler_massbank':'bioschemas:MolecularEntity',
             'crawler_disprot':'bioschemas:Protein'}

for eachtype in type_dict.keys():
    # Query the _search endpoint of each index; the timeout prevents the loop
    # from hanging if the host is unreachable.
    # NOTE(review): `response` is overwritten on every iteration and never
    # stored, so only the last index's response survives the loop.
    response = requests.get('http://su07.scripps.edu:9199/'+eachtype+'/_search', timeout=60)

In [None]:
#### Snippets for figuring out data structure

In [None]:
# Load the edgar sample file and show the payload of its first hit
with open('sample data/edgar.json','rb') as inputfile:
    edgar = json.load(inputfile)
first_hit = edgar['hits']['hits'][0]
print(first_hit['_source'])

In [None]:
# Print the extracted validation section of each Bioschemas type, one after another
for schema_id in ('bioschemas:Gene', 'bioschemas:Protein', 'bioschemas:MolecularEntity'):
    print(validation_dict[schema_id])

In [None]:
#### Note, the validate function must be run against the validation section in the schema
#### To make things easier, we pull out the validation section in the schema
validation_dict = get_validation_from_schema(bioschemas)
print(validation_dict.keys())

# Load the sample records in this cell: massbank_sample is otherwise only
# created by a later cell, so this cell would fail with NameError when the
# notebook is run top to bottom on a fresh kernel.
with open(os.path.join(DATAPATH,'massbank_sample.json'),'r') as massfile:
    massbank_sample = json.load(massfile)

# Record intended to be invalid, kept for manual negative testing with the
# commented-out call below.
garbagetest = {"name": {"LOL":"This is invalid"}}
#jsonschema.validate(garbagetest,bioschemas["@graph"][18]["$validation"])
# Raises jsonschema.ValidationError if the first sample record does not conform.
jsonschema.validate(massbank_sample[0],validation_dict["bioschemas:MolecularEntity"])

In [None]:

with open(os.path.join(DATAPATH,'massbank_sample.json'),'r') as massfile:
    massbank_sample = json.load(massfile)

# Build the validator once; jsonschema.validate() would re-check the schema
# and create a new validator for every record.
molecular_entity_schema = validation_dict["bioschemas:MolecularEntity"]
validator_class = jsonschema.validators.validator_for(molecular_entity_schema)
# A malformed schema raises jsonschema.SchemaError here instead of every
# record being silently counted as a failure.
validator_class.check_schema(molecular_entity_schema)
validator = validator_class(molecular_entity_schema)

massbankresults = []
for sample in massbank_sample:
    # Only schema violations count as 'fail'; other errors are not swallowed.
    outcome = 'pass' if validator.is_valid(sample) else 'fail'
    # .get() avoids a KeyError for records that have no 'identifier'.
    massbankresults.append({'_id':sample.get('identifier'),'validation':outcome})