In [1]:
import os
import json
import jsonschema
import requests

# Base directory for the notebook's data; the empty string makes DATAPATH
# resolve relative to the current working directory.
script_path = ''
DATAPATH = os.path.join(script_path,'sample data/')

# Download the schema definition that the validation rules are extracted from.
# An explicit timeout is passed because requests otherwise waits indefinitely,
# and raise_for_status() surfaces an HTTP error here rather than as a confusing
# JSON decoding failure on an error page.
bioschemas_raw = requests.get(
    'https://raw.githubusercontent.com/gtsueng/DDE_bioschemas/main/bioschemas.json',
    timeout=30,
)
bioschemas_raw.raise_for_status()
bioschemas = bioschemas_raw.json()

#### Function to fetch the validations from a DDE-generated schema
def get_validation_from_schema(ddeschema):
    """Collect the ``$validation`` section of every entry in a schema graph.

    Parameters
    ----------
    ddeschema : dict
        Schema document holding an iterable of entries under ``"@graph"``.

    Returns
    -------
    dict
        Maps each entry's ``"@id"`` to its ``"$validation"`` value. Entries
        that lack either key, or that cannot be indexed by a string key,
        are skipped.

    Raises
    ------
    KeyError
        If ``ddeschema`` has no ``"@graph"`` key.
    """
    validation_dict = {}
    for eachentry in ddeschema["@graph"]:
        try:
            validation = eachentry["$validation"]
            validation_dict[eachentry["@id"]] = validation
        except (KeyError, TypeError):
            # Missing key, non-mapping entry or unhashable id: skip the entry.
            # Only these are caught so that KeyboardInterrupt / SystemExit
            # still propagate instead of being silently swallowed.
            continue
    return validation_dict


In [4]:
%%time
#### script to check validation

import os
import json
import jsonschema
import requests
import pandas as pd

validation_dict = get_validation_from_schema(bioschemas)

# Maps each sample file's base name to the schema id its records are validated against.
type_dict = {'massbank':'bioschemas:MolecularEntity',
             'disprot':'bioschemas:Protein',
             'edgar':'bioschemas:Gene'}

DATAPATH = 'sample data/'

allresults = []
for eachtype, schema_id in type_dict.items():
    # Look the schema up once per source. A missing schema id raises KeyError
    # here instead of silently marking every record of that source as 'fail'.
    schema = validation_dict[schema_id]
    with open(os.path.join(DATAPATH, eachtype+'.json'),'rb') as inputfile:
        esjson = json.load(inputfile)
    esdocs = esjson['hits']['hits']
    for eachhit in esdocs:
        sample = eachhit['_source']
        try:
            jsonschema.validate(sample, schema)
            outcome = 'pass'
        except jsonschema.ValidationError:
            # Only a genuine schema violation counts as a failed record;
            # any other error (e.g. a broken schema) is allowed to surface.
            outcome = 'fail'
        allresults.append({'source':eachtype,'_id':sample['identifier'],'validation':outcome})

# One row per (source, validation outcome) pair with the number of records.
validation_results = pd.DataFrame(allresults)
validation_summary = validation_results.groupby(['source','validation']).size().reset_index(name='counts')
print(validation_summary)

     source validation  counts
0   disprot       pass    1971
1     edgar       pass    3510
2  massbank       pass   10000
Wall time: 1min 16s


In [None]:
#### script to run remotely if files could not be pulled locally
## They were pulled just fine via curl, so ignore this
import os
import json
import jsonschema
import requests
import pandas as pd

validation_dict = get_validation_from_schema(bioschemas)

# Maps each remote index name to the schema id its records are validated against.
type_dict = {'crawler_edgar':'bioschemas:Gene',
             'crawler_massbank':'bioschemas:MolecularEntity',
             'crawler_disprot':'bioschemas:Protein'}

allresults = []
for eachtype, schema_id in type_dict.items():
    # Look the schema up once per index so a missing schema id raises KeyError
    # instead of being recorded as a validation failure for every record.
    schema = validation_dict[schema_id]
    # The index name in the URL must come from the loop variable `eachtype`;
    # the timeout keeps an unreachable server from hanging the cell, and
    # raise_for_status() stops on an HTTP error before the body is parsed.
    response = requests.get('http://su07.scripps.edu:9199/'+eachtype+'/_search?&size=10000&q=*:*',
                            timeout=60)
    response.raise_for_status()
    esjson = response.json()
    esdocs = esjson['hits']['hits']
    for eachhit in esdocs:
        sample = eachhit['_source']
        try:
            jsonschema.validate(sample, schema)
            outcome = 'pass'
        except jsonschema.ValidationError:
            # Only a genuine schema violation counts as a failed record.
            outcome = 'fail'
        allresults.append({'source_type':eachtype,'_id':sample['identifier'],'validation':outcome})

# One row per (source_type, validation outcome) pair, written as a tab-separated file.
validation_results = pd.DataFrame(allresults)
validation_summary = validation_results.groupby(['source_type','validation']).size().reset_index(name='counts')
validation_summary.to_csv('validation_summary.txt',sep='\t',header=True)

In [None]:
#### Snippets for figuring out data structure

In [None]:
# Read the massbank sample file into memory so its structure can be inspected.
massbank_path = DATAPATH+'massbank.json'
with open(massbank_path,'rb') as massbank_file:
    esjson = json.load(massbank_file)


In [None]:
# Peek at the first record of the edgar sample to see which fields a document carries.
with open('sample data/edgar.json','rb') as edgar_file:
    edgar = json.load(edgar_file)
first_hit = edgar['hits']['hits'][0]
print(first_hit['_source'])

In [None]:
# Show the extracted validation section for each schema id, in a fixed order.
for schema_id in ('bioschemas:Gene', 'bioschemas:Protein', 'bioschemas:MolecularEntity'):
    print(validation_dict[schema_id])

In [None]:
#### Note, the validate function must be run against the validation section in the schema
#### To make things easier, we pull out the validation section in the schema
validation_dict = get_validation_from_schema(bioschemas)
print(validation_dict.keys())

# Load the sample records in this cell so it runs on a fresh kernel; it
# previously relied on `massbank_sample` being defined by a later cell.
with open(os.path.join(DATAPATH,'massbank_sample.json'),'r') as massfile:
    massbank_sample = json.load(massfile)

# Deliberately invalid record, kept for the manual check in the commented-out line below.
garbagetest = {"name": {"LOL":"This is invalid"}}
#jsonschema.validate(garbagetest,bioschemas["@graph"][18]["$validation"])
# Raises jsonschema.ValidationError if the first sample record does not conform.
jsonschema.validate(massbank_sample[0],validation_dict["bioschemas:MolecularEntity"])

In [None]:

with open(os.path.join(DATAPATH,'massbank_sample.json'),'r') as massfile:
    massbank_sample = json.load(massfile)

# Resolve the schema once; a missing schema id raises KeyError here instead of
# being recorded as a validation failure for every record.
molecular_entity_schema = validation_dict["bioschemas:MolecularEntity"]

massbankresults = []
for sample in massbank_sample:
    try:
        jsonschema.validate(sample, molecular_entity_schema)
        outcome = 'pass'
    except jsonschema.ValidationError:
        # Only a genuine schema violation counts as a failed record.
        outcome = 'fail'
    massbankresults.append({'_id':sample['identifier'],'validation':outcome})