## Check Coverage of Schema properties by repository

In [1]:
import pandas as pd
import json
import requests
from datetime import datetime
import os

In [2]:
def generate_coverage_table(sourcelist,propertylist):
    ## classify the propertylist:
    if 'name' in propertylist:
        listtype = "MR"
        if 'version' in propertylist:
            listtype = 'MRO'
    else:
        listtype = "O"
    ## Generate base df
    print('Generating base df')
    allist = []
    for eachproperty in propertylist:
        api_base_call = f'https://api.data.niaid.nih.gov/v1/query?q=_exists_:{eachproperty}'
        r = requests.get(api_base_call)
        tmpdict = json.loads(r.text)
        totals = tmpdict['total']
        allist.append({'property':eachproperty,'All':totals})
    alldf = pd.DataFrame(allist)
    ## Append all other coverage data
    totaldf = alldf.copy()
    print('now fetching source coverage')
    for eachsource in sourcelist:
        sourcetotallist = []
        for eachproperty in propertylist:
            api_base_call = f'https://api.data.niaid.nih.gov/v1/query?q=_exists_:{eachproperty} AND includedInDataCatalog.name:"{eachsource}"'
            r = requests.get(api_base_call)
            tmpdict = json.loads(r.text)
            totals = tmpdict['total']
            sourcetotallist.append({'property':eachproperty,f'{eachsource}':totals})
        sourcedf = pd.DataFrame(sourcetotallist)
        print(len(sourcedf),' properties obtained for ',eachsource)
        totaldf = totaldf.merge(sourcedf,on='property',how='left')
        print(totaldf.head(n=1))
    ## Export the dataframe
    now = datetime.now() # current date and time
    date_info = now.strftime("%Y-%m-%d")
    totaldf.to_csv(os.path.join('coverage',f"{date_info}_schema_coverage({listtype}).tsv"),sep='\t',header=True)

In [3]:
def _read_list_file(path):
    """Return the whitespace-stripped, non-blank lines of the text file at *path*."""
    with open(path, 'r') as listfile:
        stripped = (line.strip() for line in listfile)
        # Skip blank lines: an empty entry would later be formatted into a
        # query such as `_exists_:` with no property name.
        return [entry for entry in stripped if entry]


# Each list file holds one entry per line.
propertylistfile = os.path.join('data','propertylist.txt')
propertylist = _read_list_file(propertylistfile)
print(propertylist)

sourcelistfile = os.path.join('data','sourcelist.txt')
sourcelist = _read_list_file(sourcelistfile)

optionalpropsfile = os.path.join('data','optionallist.txt')
optionallist = _read_list_file(optionalpropsfile)

['name', 'description', 'author', 'author.name', 'author.givenName', 'author.familyName', 'url', 'measurementTechnique', 'measurementTechnique.name', 'includedInDataCatalog', 'includedInDataCatalog.name', 'distribution', 'distribution.contentUrl', 'distribution.dateModified', 'funding', 'funding.funder.name', 'funding.identifier', 'date', 'dateCreated', 'dateModified', 'datePublished', 'citedBy', 'doi', 'infectiousAgent', 'healthCondition', 'species', 'variableMeasured', 'citation', 'citation.pmid', 'conditionsOfAccess', 'isBasedOn', 'keywords', 'license', 'sdPublisher', 'spatialCoverage', 'temporalCoverage', 'topicCategory', 'identifier', 'usageInfo', 'interactionStatistic']


In [4]:
# Show the optional properties and, on the next line, how many there are.
print(optionallist, len(optionallist), sep='\n')

['isRelatedTo', 'isSimilarTo', 'isBasisFor', 'nctid', 'abstract', 'aggregateRating', 'creator', 'hasPart', 'inLanguage', 'isAccessibleForFree', 'isPartOf', 'version', 'alternateName', 'mainEntityOfPage', 'sameAs', 'relationship']
16


In [None]:
%%time
# Build and export the coverage table for the optional properties across
# every source in sourcelist.
generate_coverage_table(sourcelist,optionallist)

## Test the individual steps of `generate_coverage_table`

In [None]:
# Single-request check: print the `total` field of the API response for one
# `_exists_` query.
example_call = 'https://api.data.niaid.nih.gov/v1/query?q=_exists_:isRelatedTo'
# Variant restricted to a single catalog; uncomment to try it instead:
#example_call = 'https://api.data.niaid.nih.gov/v1/query?q=_exists_:isRelatedTo AND includedInDataCatalog.name:"Zenodo"'

print(json.loads(requests.get(example_call).text)['total'])

In [None]:
# One request per optional property; each record pairs the property name with
# the `total` the API reports for `_exists_:<property>`.
allist = [
    {
        'property': prop,
        'All': json.loads(
            requests.get(f'https://api.data.niaid.nih.gov/v1/query?q=_exists_:{prop}').text
        )['total'],
    }
    for prop in optionallist
]
print(allist)

In [None]:
## Generate the zenodo table as default

# One request per property, restricted to records whose
# includedInDataCatalog.name is "Zenodo".
zenodolist = [
    {
        'property': prop,
        'Zenodo': json.loads(
            requests.get(
                f'https://api.data.niaid.nih.gov/v1/query?q=_exists_:{prop} AND includedInDataCatalog.name:"Zenodo"'
            ).text
        )['total'],
    }
    for prop in propertylist
]

zenododf = pd.DataFrame(zenodolist)

In [None]:
# Inspect the per-property Zenodo totals.
print(zenododf)

In [None]:
%%time
# Start from the Zenodo totals and left-join one column of totals per source,
# printing the first two rows after every merge as a progress check.
totaldf = zenododf.copy()
for source_name in sourcelist:
    print(source_name)
    source_rows = [
        {
            'property': prop,
            f'{source_name}': json.loads(
                requests.get(
                    f'https://api.data.niaid.nih.gov/v1/query?q=_exists_:{prop} AND includedInDataCatalog.name:"{source_name}"'
                ).text
            )['total'],
        }
        for prop in propertylist
    ]
    totaldf = totaldf.merge(pd.DataFrame(source_rows), on='property', how='left')
    print(totaldf.head(n=2))

In [None]:
# Preview the first two rows of the merged table.
print(totaldf.head(n=2))

In [None]:
# Write the merged table as tab-separated text into the working directory.
# NOTE: the date in the file name is hard-coded, not derived from today's date.
totaldf.to_csv('schema_coverage_2023.10.11.txt',sep='\t',header=True)

In [None]:
%%time
# Per-property totals restricted to records whose includedInDataCatalog.name
# is "Mendeley" (one request per property).
mendeleylist = []
for eachproperty in propertylist:
    api_base_call = f'https://api.data.niaid.nih.gov/v1/query?q=_exists_:{eachproperty} AND includedInDataCatalog.name:"Mendeley"'
    r = requests.get(api_base_call)
    tmpdict = json.loads(r.text)
    totals = tmpdict['total']
    # The column is labelled 'Mendeley' to match the catalog queried; it was
    # previously stored under 'Zenodo', which mislabelled these counts.
    mendeleylist.append({'property':eachproperty,'Mendeley':totals})

mendeleydf = pd.DataFrame(mendeleylist)

In [None]:
# Inspect the per-property Mendeley totals.
print(mendeleydf)