## Check Coverage of Schema properties by repository

In [1]:
import pandas as pd
import json
import requests
from datetime import datetime
import os

In [2]:
def _fetch_total(query_url):
    """Return the ``total`` field of the JSON response for *query_url*.

    Raises ``requests.HTTPError`` when the API answers with an error status,
    so a failed query is reported as such instead of as a ``KeyError``.
    """
    response = requests.get(query_url)
    response.raise_for_status()
    return json.loads(response.text)['total']


def generate_coverage_table(sourcelist,propertylist,api):
    """Count, per source, how many records have each property and save a TSV.

    For every property in ``propertylist`` the API is queried once for the
    overall number of records where the property exists (column ``All``) and
    once per entry of ``sourcelist`` (one column per source, restricted via
    ``includedInDataCatalog.name``). The table is written to
    ``coverage/<YYYY-MM-DD>_<api>_schema_coverage(<listtype>).tsv``.

    Args:
        sourcelist: source names, exactly as they should appear inside the
            query URL (they are inserted verbatim).
        propertylist: property names to check with ``_exists_``.
        api: ``'prod'`` or ``'staging'``; selects the API host.

    Raises:
        ValueError: if ``api`` is neither ``'prod'`` nor ``'staging'``.
        requests.HTTPError: if any query returns an error status.
    """
    base_urls = {
        'prod': 'api.data.niaid.nih.gov/v1/',
        'staging': 'api-staging.data.niaid.nih.gov/v1/',
    }
    # Validate up front: previously an unknown value left `baseurl` unbound.
    if api not in base_urls:
        raise ValueError(f"api must be 'prod' or 'staging', got {api!r}")
    baseurl = base_urls[api]

    ## classify the propertylist (the tag only ends up in the output file name)
    if 'name' in propertylist:
        listtype = 'MRO' if 'version' in propertylist else 'MR'
    else:
        listtype = "O"

    ## Generate base df
    print('Generating base df')
    allist = [
        {'property':eachproperty,
         'All':_fetch_total(f'https://{baseurl}query?q=_exists_:{eachproperty}')}
        for eachproperty in propertylist
    ]
    # Explicit columns keep the 'property' merge key even for an empty list.
    alldf = pd.DataFrame(allist,columns=['property','All'])

    ## Append all other coverage data
    totaldf = alldf.copy()
    print('now fetching source coverage')
    for eachsource in sourcelist:
        sourcetotallist = []
        print("now fetching for: ",eachsource)
        for eachproperty in propertylist:
            print("checking: ",eachproperty)
            api_base_call = f'https://{baseurl}query?q=_exists_:{eachproperty} AND includedInDataCatalog.name:"{eachsource}"'
            sourcetotallist.append({'property':eachproperty,f'{eachsource}':_fetch_total(api_base_call)})
        sourcedf = pd.DataFrame(sourcetotallist,columns=['property',f'{eachsource}'])
        totaldf = totaldf.merge(sourcedf,on='property',how='left')

    ## Export the dataframe
    date_info = datetime.now().strftime("%Y-%m-%d")
    # Create the output folder if needed so the export cannot fail after all
    # the (slow) queries have already been made.
    os.makedirs('coverage',exist_ok=True)
    totaldf.to_csv(os.path.join('coverage',f"{date_info}_{api}_schema_coverage({listtype}).tsv"),sep='\t',header=True)

In [3]:
# Property names to check: one per line in data/propertylist.txt.
propertylistfile = os.path.join('data','propertylist.txt')
propertylist = []
with open(propertylistfile,'r') as property_handle:
    propertylist.extend(entry.strip() for entry in property_handle)
print(propertylist)

# Source names: one per line in data/sourcelist.txt.
sourcelistfile = os.path.join('data','sourcelist.txt')
sourcelist = []
with open(sourcelistfile,'r') as source_handle:
    sourcelist.extend(entry.strip() for entry in source_handle)

# Optional property names: one per line in data/optionallist.txt.
optionalpropsfile = os.path.join('data','optionallist.txt')
optionallist = []
with open(optionalpropsfile,'r') as optional_handle:
    optionallist.extend(entry.strip() for entry in optional_handle)

['name', 'description', 'author', 'author.name', 'author.givenName', 'author.familyName', 'url', 'measurementTechnique', 'measurementTechnique.name', 'includedInDataCatalog', 'includedInDataCatalog.name', 'includedIndataCatalog.archivedAt', 'distribution', 'distribution.contentUrl', 'distribution.dateModified', 'funding', 'funding.funder.name', 'funding.identifier', 'date', 'dateCreated', 'dateModified', 'datePublished', 'citedBy', 'doi', 'infectiousAgent', 'healthCondition', 'species', 'variableMeasured', 'citation', 'citation.pmid', 'citation.doi', 'conditionsOfAccess', 'isBasedOn', 'keywords', 'license', 'sdPublisher', 'spatialCoverage', 'temporalCoverage', 'topicCategory', 'identifier', 'usageInfo', 'interactionStatistic']


In [4]:
# Sanity check: show the optional property names that were loaded and how many there are.
print(optionallist)
print(len(optionallist))

['isRelatedTo', 'isSimilarTo', 'isBasisFor', 'nctid', 'abstract', 'aggregateRating', 'creator', 'hasPart', 'inLanguage', 'isAccessibleForFree', 'isPartOf', 'version', 'alternateName', 'mainEntityOfPage', 'sameAs', 'relationship', 'sourceOrganization.name']
17


In [6]:
%%time
# Select the API to query by leaving exactly one of the two assignments uncommented.
#api = 'staging'
api = 'prod'
# Uncomment to restrict the run to a couple of sources instead of the full sourcelist.
#sourcelist = ["MalariaGEN","Omics+Discovery+Index+(OmicsDI)"]
# Build the coverage table for the optional properties; the result is written as a TSV under coverage/.
generate_coverage_table(sourcelist,optionallist,api)

Generating base df
now fetching source coverage
now fetching for:  AccessClinicalData@NIAID
checking:  isRelatedTo
checking:  isSimilarTo
checking:  isBasisFor
checking:  nctid
checking:  abstract
checking:  aggregateRating
checking:  creator
checking:  hasPart
checking:  inLanguage
checking:  isAccessibleForFree
checking:  isPartOf
checking:  version
checking:  alternateName
checking:  mainEntityOfPage
checking:  sameAs
checking:  relationship
checking:  sourceOrganization.name
now fetching for:  biotools
checking:  isRelatedTo
checking:  isSimilarTo
checking:  isBasisFor
checking:  nctid
checking:  abstract
checking:  aggregateRating
checking:  creator
checking:  hasPart
checking:  inLanguage
checking:  isAccessibleForFree
checking:  isPartOf
checking:  version
checking:  alternateName
checking:  mainEntityOfPage
checking:  sameAs
checking:  relationship
checking:  sourceOrganization.name
now fetching for:  ClinEpiDB
checking:  isRelatedTo
checking:  isSimilarTo
checking:  isBasisFor

checking:  nctid
checking:  abstract
checking:  aggregateRating
checking:  creator
checking:  hasPart
checking:  inLanguage
checking:  isAccessibleForFree
checking:  isPartOf
checking:  version
checking:  alternateName
checking:  mainEntityOfPage
checking:  sameAs
checking:  relationship
checking:  sourceOrganization.name
now fetching for:  NDEx
checking:  isRelatedTo
checking:  isSimilarTo
checking:  isBasisFor
checking:  nctid
checking:  abstract
checking:  aggregateRating
checking:  creator
checking:  hasPart
checking:  inLanguage
checking:  isAccessibleForFree
checking:  isPartOf
checking:  version
checking:  alternateName
checking:  mainEntityOfPage
checking:  sameAs
checking:  relationship
checking:  sourceOrganization.name
now fetching for:  NICHD+DASH
checking:  isRelatedTo
checking:  isSimilarTo
checking:  isBasisFor
checking:  nctid
checking:  abstract
checking:  aggregateRating
checking:  creator
checking:  hasPart
checking:  inLanguage
checking:  isAccessibleForFree
checkin

## Investigate missing metadata

1. Check for patterns of missing metadata 
    * for example, check whether the OmicsDI records that are missing a property come predominantly from one particular sdPublisher
    * "https://api.data.niaid.nih.gov/v1/query?q=-_exists_:description%20AND%20includedInDataCatalog.name:%22Omics+Discovery+Index+(OmicsDI)%22&fields=sdPublisher.name&fetch_all=true"

2. If necessary, check if the records missing metadata in Staging have values in Production

In [None]:
import math

In [None]:
# Collect, for one source, every record that is MISSING a given property,
# keeping only the requested field (the sdPublisher name) for each record.
#propertylist = ['conditionsOfAccess','description','dateCreated','datePublished']
propertylist = ['conditionsOfAccess']
#api = 'staging'
api = 'prod'
source = 'Omics+Discovery+Index+(OmicsDI)'
field = 'sdPublisher.name'

allresults = pd.DataFrame(columns=['propname','_id','_score','sdPublisher'])

if api == 'prod':
    baseurl = 'https://api.data.niaid.nih.gov/v1/query?q='
elif api == 'staging':
    baseurl = 'https://api-staging.data.niaid.nih.gov/v1/query?q='
else:
    raise ValueError(f"api must be 'prod' or 'staging', got {api!r}")
# baseurl already contains the scheme and the '?q=' prefix, so scroll
# requests must be built from the bare endpoint. The previous
# f'https://{baseurl}query?scroll_id=...' produced a malformed URL whose
# failure was hidden by a bare except, silently dropping every page after
# the first one.
scrollurl = baseurl.split('?',1)[0]

for eachproperty in propertylist:
    api_base_call = f'{baseurl}includedInDataCatalog.name:"{source}"+AND+-_exists_:{eachproperty}&fields={field}&fetch_all=true'
    r = requests.get(api_base_call)
    r.raise_for_status()
    tmpdict = json.loads(r.text)
    totals = tmpdict['total']
    tmpdf = pd.DataFrame(tmpdict['hits'])
    # Scalar assignment also works when the page has no hits (no '_score' column).
    tmpdf['propname'] = eachproperty
    allresults = pd.concat((allresults,tmpdf),ignore_index=True)
    scroll_id = tmpdict.get('_scroll_id')
    if totals>=500 and scroll_id:
        # Upper bound on follow-up requests; the loop normally ends earlier,
        # as soon as a page comes back without hits.
        maxscrolls = math.ceil(totals/500)
        for _ in range(maxscrolls):
            try:
                r2 = requests.get(f'{scrollurl}?scroll_id={scroll_id}')
                tmp = json.loads(r2.text)
            except (requests.RequestException, ValueError) as err:
                # Keep what was fetched so far, but say that the result is partial.
                print(f'stopped scrolling for {eachproperty}: {err}')
                break
            hits = tmp.get('hits')
            if not hits:
                break
            tmpdf = pd.DataFrame(hits)
            tmpdf['propname'] = eachproperty
            allresults = pd.concat((allresults,tmpdf),ignore_index=True)
            scroll_id = tmp.get('_scroll_id')
            if not scroll_id:
                break

print(allresults.head(n=2))

In [None]:
# Number of rows retrieved in total.
print(len(allresults))
# Pull the publisher name out of each sdPublisher entry
# (assumes every entry is a dict with a 'name' key — TODO confirm).
publisher_entries = allresults['sdPublisher'].tolist()
allresults['sdPubName'] = [entry['name'] for entry in publisher_entries]
# Split the rows on whether the '_ignored' column is empty.
ignored_missing = allresults['_ignored'].isna()
print(len(allresults.loc[ignored_missing]))
print(allresults.loc[~ignored_missing].head(n=2))
#cleanresults = allresults[['propname','_id','sdPubName']].copy()
#cleanresults.drop_duplicates(keep='first',inplace=True)
#print(cleanresults.head(n=2))
#print(len(cleanresults))

In [None]:
# NOTE(review): staging_histogram is only created by the commented-out line below,
# presumably meant to be run after fetching with api = 'staging'. Unless the name
# already exists in the session, the print will fail with a NameError.
#staging_histogram = cleanresults.groupby(['propname','sdPubName']).size().reset_index(name='staging_counts')
print(staging_histogram.head(n=2))

In [None]:
print(len(cleanresults))
# Number of rows for each (property, publisher name) pair.
pair_sizes = cleanresults.groupby(['propname','sdPubName']).size()
prod_histogram = pair_sizes.reset_index(name='prod_counts')
print(prod_histogram)

In [None]:
# Show only the rows whose publisher name is exactly 'bioimages'.
print(cleanresults.loc[cleanresults['sdPubName']=='bioimages'])

In [None]:
# Outer join so that (property, publisher) pairs present in only one of the
# two histograms are kept, with NaN for the missing count.
comparison = pd.merge(
    staging_histogram,
    prod_histogram,
    on=['propname','sdPubName'],
    how='outer',
)
print(comparison.head(n=2))
comparison_path = os.path.join('coverage','2025-07-10-stage_prod_missing_comparison.tsv')
comparison.to_csv(comparison_path,sep='\t',header=True)

## Test function parts

In [None]:
# Single query used to check the shape of the API response by hand.
example_call = 'https://api.data.niaid.nih.gov/v1/query?q=_exists_:isRelatedTo'
#example_call = 'https://api.data.niaid.nih.gov/v1/query?q=_exists_:isRelatedTo AND includedInDataCatalog.name:"Zenodo"'

# Only the reported hit count is of interest here.
print(json.loads(requests.get(example_call).text)['total'])

In [None]:
# One query per optional property: how many records have it at all.
allist = [
    {
        'property': optional_prop,
        'All': json.loads(
            requests.get(f'https://api.data.niaid.nih.gov/v1/query?q=_exists_:{optional_prop}').text
        )['total'],
    }
    for optional_prop in optionallist
]
print(allist)

In [None]:
## Generate the zenodo table as default

# One row per property with the number of Zenodo records that have it.
zenodolist = [
    {
        'property': prop_name,
        'Zenodo': json.loads(
            requests.get(
                f'https://api.data.niaid.nih.gov/v1/query?q=_exists_:{prop_name} AND includedInDataCatalog.name:"Zenodo"'
            ).text
        )['total'],
    }
    for prop_name in propertylist
]

zenododf = pd.DataFrame(zenodolist)

In [None]:
# Inspect the per-property counts gathered for Zenodo.
print(zenododf)

In [None]:
%%time
# Start from the Zenodo counts and add one column per source.
totaldf = zenododf.copy()
for eachsource in sourcelist:
    print(eachsource)
    # Per-property hit counts restricted to the current source.
    sourcetotallist = [
        {
            'property': prop_name,
            f'{eachsource}': json.loads(
                requests.get(
                    f'https://api.data.niaid.nih.gov/v1/query?q=_exists_:{prop_name} AND includedInDataCatalog.name:"{eachsource}"'
                ).text
            )['total'],
        }
        for prop_name in propertylist
    ]
    sourcedf = pd.DataFrame(sourcetotallist)
    # Left join keeps the row set of the table built so far.
    totaldf = pd.merge(totaldf,sourcedf,on='property',how='left')
    print(totaldf.head(n=2))

In [None]:
# Peek at the first two rows of the merged table.
print(totaldf.head(n=2))

In [None]:
# Tab-separated export into the current working directory.
# NOTE(review): the date in the file name is hard-coded, so re-running this cell overwrites the same file.
totaldf.to_csv('schema_coverage_2023.10.11.txt',sep='\t',header=True)

In [None]:
%%time
# Per-property counts of Mendeley records that have each property.
mendeleylist = []
for eachproperty in propertylist:
    api_base_call = f'https://api.data.niaid.nih.gov/v1/query?q=_exists_:{eachproperty} AND includedInDataCatalog.name:"Mendeley"'
    r = requests.get(api_base_call)
    tmpdict = json.loads(r.text)
    totals = tmpdict['total']
    # The counts come from the Mendeley query, so label the column accordingly
    # (it was previously stored under 'Zenodo', a copy-paste leftover).
    mendeleylist.append({'property':eachproperty,'Mendeley':totals})

mendeleydf = pd.DataFrame(mendeleylist)

In [None]:
# Inspect the per-property counts gathered for the Mendeley query.
print(mendeleydf)