## Inspect metadata variability for standardized property fields

Determine the number of *types* of species, infectiousAgent, and healthCondition values to see how augmentation affects them.

In [1]:
import pandas as pd
from pandas import read_csv
import requests
import json
import time
import math
import pickle
import os
import ast ## Only needed to convert string to dict where json.loads fails
from datetime import datetime

### Get the data and save it

In [2]:
def parse_id(query_type,nesteddict):
    if isinstance(nesteddict,dict):
        tmpdict = nesteddict
    elif isinstance(nesteddict,str):
        try:
            tmpdict = json.loads(nesteddict)
        except:
            tmpdict = {'failed':nesteddict}
    if 'identifier' in tmpdict.keys():
        if query_type == 'species':
            curie = 'NCBITAXON:'+str(tmpdict['identifier'])
        elif query_type == 'infectiousAgent':
            curie = 'NCBITAXON:'+str(tmpdict['identifier'])
        elif query_type == 'healthCondition':
            curie = str(tmpdict['inDefinedTermSet'])+str(tmpdict['identifier'])
    else:
        curie = -1
    return curie

In [3]:
def get_property_vals(query_type,api):
    ## Perform the initial query
    if api == 'staging':
        baseurl = 'api-staging.data.niaid.nih.gov/v1/'
    elif api == 'prod':
        baseurl = 'api.data.niaid.nih.gov/v1/'
    query_url = f'https://{baseurl}query?q=_exists_:{query_type}&fields=_id,{query_type}&fetch_all=true'
    r = requests.get(query_url)
    cleanr = json.loads(r.text)
    hits = cleanr['hits']
    #print(len(cleanr['hits']))
    df1 = pd.DataFrame(cleanr['hits'])
    scroll_id = cleanr['_scroll_id']
    total_hits = cleanr['total']
    print(datetime.now(),': ',query_type,'on',api,'has', total_hits,'total_hits')
    i = 0
    k = math.ceil(total_hits/1000)
    while i < k:
        r2 = requests.get(f'https://{baseurl}query?scroll_id={scroll_id}')
        if r2.status_code == 200:
            tmp = json.loads(r2.text)
            tmpdf = pd.DataFrame(tmp['hits'])
            df1 = pd.concat((df1,tmpdf),ignore_index=True)
            i = i+1
            try:
                scroll_id = tmp['_scroll_id']
            except:
                print('records fetched: ',len(df1))
                return df1
            time.sleep(0.10)
    return df1

In [4]:
def process_and_export(query_type,api,df1):
    querytime = datetime.strftime(datetime.now(),'%Y-%m-%d')
    processboom = df1.explode(query_type)
    processboom['CURIE'] = processboom.apply(lambda row: parse_id(query_type,row[query_type]),axis=1)
    cleandf = processboom[['_id','CURIE']].copy()
    with open(os.path.join('data',f'{querytime}_{query_type}_{api}_results.pickle'),'wb') as dumpfile:
        pickle.dump(cleandf,dumpfile)
    histogram = cleandf.groupby('CURIE').size().reset_index(name='counts')
    histogram.to_csv(os.path.join('data',f'{querytime}_{query_type}_{api}_histogram.tsv'),sep='\t',header=True)
    return cleandf

In [9]:
def dump_data():
    querytime = datetime.strftime(datetime.now(),'%Y-%m-%d')
    apilist = ['staging','prod']
    querylist = ['healthCondition','species','infectiousAgent']
    statslist = []
    for query_type in querylist:
        for api in apilist:
            df1 = get_property_vals(query_type,api)
            cleandf = process_and_export(query_type,api,df1)
            tmpdict = {'property':query_type,'API':api,
                       'total_records':len(cleandf['_id'].unique().tolist()),
                       'unique_vals': len(cleandf['CURIE'].unique().tolist())}
            statslist.append(tmpdict)
    statsdf = pd.DataFrame(statslist)
    statsdf.to_csv(os.path.join('data',f'{querytime}_property_stats.tsv'),sep='\t',header=True)

In [10]:
%%time
dump_data()


2024-09-16 12:47:38.877718 :  infectiousAgent on staging has 444196 total_hits


KeyError: 'hits'

In [None]:
## for troubleshooting

In [None]:
%%time

## Perform the initial query
querytime = datetime.strftime(datetime.now(),'%Y-%m-%d')
query_type = 'infectiousAgent'
#query_url = 'https://api.data.niaid.nih.gov/v1/query?q=_exists_:species&fields=_id,name,species&fetch_all=true'
query_url = f'https://api-staging.data.niaid.nih.gov/v1/query?q=_exists_:{query_type}&fields=_id,name,{query_type}&fetch_all=true'
r = requests.get(query_url)
cleanr = json.loads(r.text)
hits = cleanr['hits']
#print(len(cleanr['hits']))
df1 = pd.DataFrame(cleanr['hits'])
scroll_id = cleanr['_scroll_id']
total_hits = cleanr['total']
print(total_hits)

In [None]:
%%time
## Scroll to get all the results

i = 0
#k = 3 
k = math.ceil(total_hits/1000)
while i < k:
    #r2 = requests.get(f'https://api.data.niaid.nih.gov/v1/query?scroll_id={scroll_id}')
    r2 = requests.get(f'https://api-staging.data.niaid.nih.gov/v1/query?scroll_id={scroll_id}')
    tmp = json.loads(r2.text)
    scroll_id = tmp['_scroll_id']
    tmpdf = pd.DataFrame(tmp['hits'])
    df1 = pd.concat((df1,tmpdf),ignore_index=True)
    #print(len(df1))
    i = i+1
    time.sleep(0.25)

In [None]:
%%time

print(querytime)
query_type = 'species'
api = 'staging'
df1 = get_property_vals(query_type,api)

In [None]:
process_and_export(df1)