## NIAID SysBio Data fix

The NDE parser for the DDE currently distinguishes between NIAID SysBio, NIAID Data Ecosystem, and other Data Discovery Engine ingested records based on the `@context` object in each record. However, many batch-uploaded SysBio records do NOT have an `@context` object. This causes them to be attributed only to the Data Discovery Engine, and NOT to NIAID SysBio.

To do:
1. Use the NDE API to find all records missing the @context object (i.e., records that are being attributed only to the Data Discovery Engine)
2. Use the Biothings DDE API to get the raw json objects for these records
3. Add the required @context information to each json object for these records
4. Create a batch json file for upload to the DDE
5. Batch upload the records to update them in the DDE
6. Request a refresh of the Data ingested from the DDE by the NDE team (Jason)



In [10]:
import os
import requests
import json
import time
import pandas as pd

In [2]:
## paths
# Everything is resolved relative to the current working directory.
script_path = os.getcwd()
# Files read and written by this notebook live in a "data" subfolder.
data_path = os.path.join(script_path, "data")

In [3]:
## API Urls
# Example record: the NDE identifier is the DDE identifier with a "DDE_" prefix.
dde_id = "0565c31a11705723"
nde_id = "DDE_0565c31a11705723"

# NDE staging query for records whose catalog name is "Data Discovery Engine"
# (the search term is already URL-encoded).
query_term = "includedInDataCatalog.name:%22Data%20Discovery%20Engine%22"
query_params = "&fields=_id&size=500"
nde_base_url = (
    "https://api-staging.data.niaid.nih.gov/v1/query"
    f"?&q={query_term}&{query_params}"
)

# DDE endpoint for a single dataset record
dde_base_url = "https://discovery.biothings.io/api/dataset/" + dde_id

In [14]:
#### Fetch all DDE records from NDE

query_params = "&fields=_id&size=500"


def fetch_nde_ids(query_term, timeout=60):
    """Return the `_id` of every hit the NDE staging API returns for a query.

    Parameters
    ----------
    query_term : str
        Value for the `q` parameter, inserted into the URL exactly as given.
    timeout : int or float
        Seconds to wait for the server before `requests` gives up.

    Returns
    -------
    list of str
        The `_id` field of each entry in the response's `hits` list.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    """
    nde_base_url = f"https://api-staging.data.niaid.nih.gov/v1/query?&q={query_term}&{query_params}"
    response = requests.get(nde_base_url, timeout=timeout)
    # Fail loudly instead of trying to parse an error page as a result set.
    response.raise_for_status()
    payload = json.loads(response.text)
    id_list = [eachhit['_id'] for eachhit in payload['hits']]
    # The request is capped by `size` in query_params; warn rather than
    # silently work with a truncated id list if the catalog has grown.
    # NOTE(review): assumes the response carries a `total` count -- confirm.
    total = payload.get('total', len(id_list))
    if total > len(id_list):
        print(f"WARNING: only {len(id_list)} of {total} records returned for {query_term}")
    return id_list


## Fetch all DDE records in NDE
nde_dde_list = fetch_nde_ids("includedInDataCatalog.name:%22Data%20Discovery%20Engine%22")
print(nde_dde_list[0:2])

## Fetch all SysBio records in NDE
nde_sysbio_list = fetch_nde_ids('includedInDataCatalog.name:"Data Discovery Engine, NIAID Systems Biology"')
print(nde_sysbio_list[0:2])

## Fetch all NDE-DDE records in NDE
nde_list = fetch_nde_ids('includedInDataCatalog.name:"Data Discovery Engine, NIAID Data Ecosystem"')
print(nde_list[0:2])

['DDE_0565c31a11705723', 'DDE_080de1604f9f92e0']
['DDE_2328930bf234efc1', 'DDE_b004a7de1f900bc2']
['DDE_2328930bf234efc1', 'DDE_b004a7de1f900bc2']


In [15]:
#### Subset to obtain only the problematic records
# De-duplicate each id list so set arithmetic can be used.
nde_dde_set, nde_sysbio_set, nde_set = (
    set(id_list) for id_list in (nde_dde_list, nde_sysbio_list, nde_list)
)

# Drop the ids returned by the SysBio query, then those returned by the
# NIAID Data Ecosystem query; what remains is the set to fix.
temp_set = nde_dde_set.difference(nde_sysbio_set)
dde_set = temp_set.difference(nde_set)

for label, members in (
    ("nde_dde_set: ", nde_dde_set),
    ("nde_sysbio_set: ", nde_sysbio_set),
    ("nde_set: ", nde_set),
    ("nde_dde_set less sysbio: ", temp_set),
    ("dde_set: ", dde_set),
):
    print(label, len(members))

nde_dde_set:  373
nde_sysbio_set:  59
nde_set:  64
nde_dde_set less sysbio:  314
dde_set:  309


In [6]:
#### Fix the identifiers so that they can be used to pull records from the DDE API
dde_list = list(dde_set)
# The DDE API is queried with the identifier minus the "DDE_" text used by the NDE.
dde_id_list = []
for nde_identifier in dde_list:
    dde_id_list.append(nde_identifier.replace("DDE_", ""))
print(dde_id_list[:2])

['878bfe0874851cc7', '66c0ab81b12cff8c']


In [7]:
#### Save this list
with open(os.path.join(data_path,'ids2fix.txt'),'w') as outwrite:
    for eachid in dde_id_list:
        outwrite.write(eachid+'\n')

In [20]:
#### Fetch a good record from the DDE for an example of the '@context' object
good_record_url = "https://discovery.biothings.io/api/dataset/2328930bf234efc1"
r = requests.get(good_record_url)
tmp = json.loads(r.text)
# This '@context' is what gets copied onto the records that lack one.
context = tmp['@context']
print(context)

{'owl': 'http://www.w3.org/2002/07/owl#', 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', 'schema': 'http://schema.org/', 'niaid': 'https://discovery.biothings.io/view/niaid/', 'xsd': 'http://www.w3.org/2001/XMLSchema#'}


In [26]:
%%time
#### Fetch the problematic records, add the '@context' object
fixed_records1 = []
fixed_records2 = []
fixed_records3 = []
fixed_records4 = []
i=0
for eachid in dde_id_list:
    dde_base_url = f"https://discovery.biothings.io/api/dataset/{eachid}"
    r = requests.get(dde_base_url)
    temp = json.loads(r.text)
    temp['@context'] = context
    if i < 90:
        fixed_records1.append(temp)
    elif 90 <= i < 180:
        fixed_records2.append(temp)
    elif 180 <= i < 270:
        fixed_records3.append(temp)
    elif 270 <= i < 310:
        fixed_records4.append(temp)
    time.sleep(0.125)
    i=i+1        

print(len(fixed_records)," records fixed")

309  records fixed
CPU times: total: 18.6 s
Wall time: 1min 44s


In [27]:
#### Export the results
# Each batch is written to its own JSON file for upload to the DDE.
batch_files = (
    ('fixed_records1.json', fixed_records1),
    ('fixed_records2.json', fixed_records2),
    ('fixed_records3.json', fixed_records3),
    ('fixed_records4.json', fixed_records4),
)
for batch_filename, batch_records in batch_files:
    with open(os.path.join(data_path, batch_filename), 'w') as outfile:
        outfile.write(json.dumps(batch_records))

In [4]:
#### Load the id list
# Re-read the identifiers saved earlier (one per line) instead of re-querying the NDE.
dde_id_list = []

ids_file = os.path.join(data_path, 'ids2fix.txt')
with open(ids_file, 'r') as infile:
    dde_id_list.extend(line.strip() for line in infile)
print(dde_id_list[:1])

['878bfe0874851cc7']


In [9]:
#### Determine the owner of the bad records:
userlist = []

for eachid in dde_id_list:
    # `?meta=1` asks the DDE API to include the `_meta` block with the record.
    # (A stray trailing space after `meta=1` was removed from this URL.)
    dde_base_url = f"https://discovery.biothings.io/api/dataset/{eachid}?meta=1"
    r = requests.get(dde_base_url)
    # Stop on HTTP errors instead of failing later on a missing '_meta' key.
    r.raise_for_status()
    temp = json.loads(r.text)
    userlist.append(temp['_meta']['username'])

# Collapse to the distinct usernames that submitted these records.
allusers = set(userlist)
print(allusers)

{'flaneuse'}


In [12]:
### Confirm the consistency of the record @types
classlist = []
for eachid in dde_id_list:
    dde_base_url = f"https://discovery.biothings.io/api/dataset/{eachid}?meta=1"
    r = requests.get(dde_base_url)
    temp = json.loads(r.text)
    # Submitter and class come from the record's `_meta` block.
    record_meta = temp['_meta']
    classlist.append({
        '_id': eachid,
        'submitter': record_meta['username'],
        'classtype': record_meta['class_id'],
    })

# One row per record; saved as a tab-separated file for reference.
records2fix = pd.DataFrame(classlist)
records2fix_file = os.path.join(data_path, 'records2fix.tsv')
records2fix.to_csv(records2fix_file, sep='\t', header=True)
print(records2fix.head(2))

                _id submitter             classtype
0  878bfe0874851cc7  flaneuse  niaid::niaid:Dataset
1  66c0ab81b12cff8c  flaneuse  niaid::niaid:Dataset


In [13]:
# Count how many of the records to fix fall under each class type.
class_sizes = records2fix.groupby('classtype').size()
allclasses = class_sizes.reset_index(name='counts')
print(allclasses)

              classtype  counts
0  niaid::niaid:Dataset     309


In [21]:
## Check classes in sysbio set
# Strip the "DDE_" text so the ids can be used with the DDE API.
sysbio_ids = []
for nde_identifier in nde_sysbio_set:
    sysbio_ids.append(nde_identifier.replace("DDE_", ""))

sysbio_records = []
for eachid in sysbio_ids:
    dde_base_url = f"https://discovery.biothings.io/api/dataset/{eachid}?meta=1"
    r = requests.get(dde_base_url)
    temp = json.loads(r.text)
    # Submitter and class come from the record's `_meta` block.
    record_meta = temp['_meta']
    sysbio_records.append({
        '_id': eachid,
        'submitter': record_meta['username'],
        'classtype': record_meta['class_id'],
    })

sysbio = pd.DataFrame(sysbio_records)
# Records per class type, followed by the distinct submitters.
sysbio_class_sizes = sysbio.groupby('classtype').size()
sysbioclasses = sysbio_class_sizes.reset_index(name='counts')
print(sysbioclasses)
print(sysbio['submitter'].unique().tolist())

                   classtype  counts
0       niaid::niaid:Dataset      43
1  niaid::niaid:NiaidDataset      16
['sturkarslan', 'liaochen1988@gmail.com', 'flaneuse', 'dylanwelzel@gmail.com', 'mshukla1', 'richardahn@ucla.edu', 'fluomics', 'rshabman', 'merve-cakir', 'ahyoung-lim', 'esnitkin914', 'rachsattler', 'sapoudel@ucsd.edu', 'qinglong89', 'amisharin']


In [22]:
# `classes2fix` was not assigned in any cell, so this cell raised a NameError
# on a fresh kernel. It is rebuilt here from `sysbioclasses`.
# NOTE(review): the filter is reconstructed from the recorded output (only the
# 'niaid::niaid:NiaidDataset' row, index 1, was kept) -- confirm this matches
# the original intent.
classes2fix = sysbioclasses[sysbioclasses['classtype'] != 'niaid::niaid:Dataset']
classes2fix.to_csv(os.path.join(data_path,'classes2fix.tsv'),sep='\t',header=True)

In [23]:
# Show the classes2fix table that the previous cell wrote to classes2fix.tsv.
print(classes2fix)

                   classtype  counts
1  niaid::niaid:NiaidDataset      16
