In [17]:
import os
import json
import pandas as pd
import requests
from datetime import datetime
import openpyxl

In [18]:
# Working paths are resolved from the notebook's current working directory.
nb_path = os.getcwd()
# The parent of the working directory is the base folder that later cells
# join with 'metainfo' and 'draft crosswalks'.
parent_path = os.path.dirname(nb_path)
# Path of a 'tempfiles' folder next to the notebook folder (only the path is
# built here; nothing is created).
tempfiles = os.path.join(parent_path,'tempfiles')
print(parent_path)

C:\Users\gtsueng\Anaconda3\envs\outbreak\DDE-CrossWalks


## Fetch schema from DDE Registry API

In [3]:
# Base URL of the DDE registry API; fetch_child_class() appends a namespace
# name to it to build each request URL.
dde_registry_api_url = "https://discovery.biothings.io/api/registry/"

# Registry namespaces that fetch_child_class() queries, one request each.
namespacelist = ["bts", "bioschemasdrafts", "bioschemastypesdrafts", "bioschemas", "bioschemastypes", 
                 "biomedical", "bioschemasdeprecated", "cvisb-dataset", "cvisb-patient", "ctsa", 
                 "crosswalks", "datacite", "google", "niaid", "nde", "n3c", "outbreak", "schema"]

In [4]:
## Identify classes across namespaces that have the same parent class

def fetch_child_class(desired_parent):
    """Collect registry classes that list ``desired_parent`` among their parents.

    One request is made per entry of the module-level ``namespacelist``
    (URL = ``dde_registry_api_url`` + namespace) and every element of the
    response's ``hits`` list is inspected.

    Parameters
    ----------
    desired_parent : str
        Parent class identifier to look for, e.g. ``"schema:Dataset"``.

    Returns
    -------
    list
        The hit dictionaries whose first ``parent_classes`` entry (treated as
        a comma-separated string) contains ``desired_parent`` and whose
        ``name`` does not contain the substring ``"schema"``.
    """
    results = []
    for namespace in namespacelist:
        r_url = f"{dde_registry_api_url}{namespace}"
        r = requests.get(r_url)
        r_dict = json.loads(r.text)
        for hit in r_dict['hits']:
            try:
                parent_classes = hit['parent_classes'][0].split(',')
                clean_parents = [x.strip() for x in parent_classes]
                is_match = (desired_parent in clean_parents
                            and 'schema' not in hit['name'])
            except (KeyError, IndexError, TypeError, AttributeError):
                # Hits without usable parent/name information are skipped on
                # purpose; only these lookup failures are tolerated so that
                # interrupts and genuine programming errors still surface.
                continue
            if is_match:
                results.append(hit)
    return results

In [None]:
%%time
# Gather every registry class that has schema:Dataset as a parent and report
# how many were found.
results = fetch_child_class("schema:Dataset")
print(len(results))

In [None]:
# Print the name of each matching class.
# NOTE: the loop variable `hit` remains defined after this cell; the
# inspection cells further down read it, so it then refers to the last
# element of `results`.
for hit in results:
    print(hit['name'])

In [5]:
## format the crosswalk meta info
def split_label(schemadict):
    """Return the segment of ``schemadict['identifier']`` after the first colon.

    For an identifier such as ``"nde:Dataset"`` the result is ``"Dataset"``.
    When the identifier holds several colons only the second segment is
    returned; an identifier without a colon raises ``IndexError``.
    """
    return schemadict['identifier'].split(':')[1]

def generate_crosswalk_meta_df(schema1,schema2):
    """Build the two-column meta-information table for a draft crosswalk.

    Parameters
    ----------
    schema1, schema2 : dict
        Descriptions of the two schemas being mapped. Both must provide
        ``'name'`` and ``'namespace'``. The label used in the crosswalk
        identifier is taken from ``schema2['label']``; if that is missing it
        is derived with ``split_label`` from ``schema2['identifier']`` and, as
        a last resort, from ``schema1['identifier']``.

    Returns
    -------
    pandas.DataFrame
        One row per meta property, with columns ``property`` and ``value``.
        Several values are placeholder strings (e.g. ``'authorInfo'``,
        ``'propertyList'``, ``'fundingInfo'``) rather than real content.
    """
    # Only lookup failures trigger the fallbacks; a bare ``except`` would also
    # hide interrupts and unrelated programming errors.
    lookup_errors = (KeyError, IndexError, TypeError, AttributeError)
    try:
        label = schema2['label']
    except lookup_errors:
        try:
            label = split_label(schema2)
        except lookup_errors:
            label = split_label(schema1)

    # Publication and modification dates are both "today" for a fresh draft.
    date_stamp = datetime.now().strftime('%m/%d/%Y')
    identifier = f"{schema1['namespace']}_xref_{schema2['namespace']}_{label.lower()}_draft"

    xwalkmeta = {
        'description': f"This crosswalk maps properties between the {schema1['name']} and {schema2['name']}. The initial mappings were autogenerated using a script that: \n 1. Identified that the two classes shared a common, yet somewhat specific parent class (Eg- Dataset) \n 2. Pulled all available properties and the information surrounding them and matched them for manual review \n 3. exported the results for manual review \n Crosswalks which have not been reviewed will be saved in the `draft crosswalks` directory.",
        '@type': 'Crosswalks:MetadataCrosswalk',
        '@context': 'contextInfo',
        'identifier': identifier,
        'name': f"A crosswalk between {schema1['name']} and {schema2['name']}",
        # NOTE(review): this link ends in .xls while generate_draft_xls_file
        # builds .xlsx file names - confirm which one is intended.
        'sameAs': f"https://github.com/gtsueng/DDE-CrossWalks/draft crosswalks/{identifier}.xls",
        'author': 'authorInfo',
        'includesProperty': 'propertyList',
        'hasPart': ['schemaObjects:schemaOriginObject','schemaObjects:schemaTargetObject'],
        'isPartOf': ['schemaObjects:schemaUsageObject'],
        'isBasedOn': 'nestedProps:isBasedOn',
        'isBasisFor': 'nestedProps:isBasisFor',
        'funding': 'fundingInfo',
        'datePublished': date_stamp,
        'dateModified': date_stamp,
        'creditText': '',
        'license': 'https://creativecommons.org/licenses/by/4.0/',
    }
    # Rows keep the insertion order of the dictionary above.
    xwalkdf = pd.DataFrame(list(xwalkmeta.items()), columns=['property', 'value'])
    return xwalkdf


In [None]:
# Try the meta-info builder on the first two classes in `results`.
# NOTE: this expects `results` to be the list returned by
# fetch_child_class(); a later cell rebinds `results` to the NDE API metadata.
xwalkdf = generate_crosswalk_meta_df(results[0],results[1])
print(xwalkdf)

In [None]:
## format the properties to be mapped
# Look at the structure of a single registry hit. `hit` is whatever the
# name-listing loop left behind, i.e. the last class in `results`.
#propertylist = hit['properties']
print(hit.keys())
print('label: ',hit['label'],'| name: ',hit['name'],'| namespace: ',hit['namespace'])

In [None]:
# Show which fields the first property entry of the hit carries.
print(hit['properties'][0].keys())

In [None]:
# Show the full content of the first property entry of the hit.
print(hit['properties'][0])

In [None]:
# Show which fields the hit's validation block carries.
print(hit['validation'].keys())

In [None]:
# Pull the 'required', 'recommended' and 'optional' entries out of the hit's
# validation block.
required_list = hit['validation']['required']
recommended_list = hit['validation']['recommended']
optional_list = hit['validation']['optional']


## Inspect metadata from NDE API

In [6]:
# Download the NDE API metadata document.
nde_api = 'https://api.data.niaid.nih.gov/v1/metadata'
r = requests.get(nde_api)

# NOTE: this rebinds `results`, which earlier cells used for the
# fetch_child_class() output; from here on it is the parsed NDE metadata.
results = json.loads(r.text)
print(results.keys())

dict_keys(['biothing_type', 'build_date', 'build_version', 'src', 'stats'])


In [None]:
# Drill into the metadata: list the sources, then use the 'zenodo' entry to
# look at its sourceInfo, its schema mapping, and the mapping of 'software'.
print(results['src'].keys())
print(results['src']['zenodo']['sourceInfo'].keys())
print(results['src']['zenodo']['sourceInfo']['schema'].keys())
print(results['src']['zenodo']['sourceInfo']['schema']['software'])

In [7]:
# Flatten each source's sourceInfo.schema mapping into one long table with a
# row per (source, source property, NDE property) combination.
source_list = list(results['src'].keys())
source_dict_list = []
# NOTE: `eachsource` stays defined at module level after this loop (holding
# the last source); generate_schemaObjects() reads that module-level name.
for eachsource in source_list:
    schemadict = results['src'][eachsource]['sourceInfo']['schema']
    for k,v in schemadict.items():
        source_dict_list.append({"source":eachsource,"source_prop":k,"nde_prop":v})

source_df = pd.DataFrame(source_dict_list)
print(source_df.head(n=3))    

  source   source_prop     nde_prop
0    dde       creator       author
1    dde           _id   identifier
2    dde  date_created  dateCreated


In [10]:
# Persist the mapping table as a TSV under metainfo/nde. The DataFrame index
# is written as the first column; generate_prop_includes() reads the file
# back with index_col=0.
source_df.to_csv(os.path.join(parent_path,'metainfo','nde','nde_meta.tsv'),sep='\t',header=True)

## Create NDE crosswalk drafts

Default NDE schema meta info: 

In [11]:
#### Deprecate these if not used
def fetch_nde_schema(raw=False):
    """Download the NDE namespace from the DDE registry and return its classes.

    Parameters
    ----------
    raw : bool, optional
        When equal to ``False`` (the default) the response's ``hits`` list is
        returned as is. Otherwise the entries of ``source['@graph']`` whose
        ``@type`` is ``"rdfs:Class"`` are returned.

    Returns
    -------
    list
        The class entries selected as described above.
    """
    response = requests.get("https://discovery.biothings.io/api/registry/nde")
    payload = json.loads(response.text)
    if raw == False:
        return payload['hits']
    # Raw mode: keep only the class definitions from the schema graph.
    return [
        graph_node
        for graph_node in payload['source']['@graph']
        if graph_node["@type"] == "rdfs:Class"
    ]

def fetch_correct_nde_class(raw=False,schematype="Dataset"):
    """Return the NDE class entry for ``schematype`` from the DDE registry.

    NOTE: a later cell defines another ``fetch_correct_nde_class`` with the
    signature ``(nde_schemas, schematype)``; once that cell has run it
    replaces this function.

    Parameters
    ----------
    raw : bool, optional
        Forwarded to ``fetch_nde_schema`` to choose between the registry
        ``hits`` and the raw ``@graph`` class entries.
    schematype : str, optional
        ``"Dataset"`` selects ``nde:Dataset``; any other value selects
        ``nde:ComputationalTool``.

    Returns
    -------
    dict or None
        The first entry whose ``name`` equals the target identifier, or, for
        entries without a ``name`` key, whose ``@id`` equals it. ``None`` when
        nothing matches.
    """
    nde_schema_classes = fetch_nde_schema(raw)
    target = "nde:Dataset" if schematype == "Dataset" else "nde:ComputationalTool"
    for eachhit in nde_schema_classes:
        try:
            class_id = eachhit["name"]
        except KeyError:
            # Raw graph entries are identified by '@id' instead of 'name';
            # entries carrying neither key are skipped rather than crashing.
            class_id = eachhit.get("@id")
        if class_id == target:
            return eachhit
    return None

In [22]:
## NDE schema meta default info
# Hard-coded descriptions of the two NDE target schemas, keyed by
# 'nde_data_schema' (nde:Dataset) and 'nde_tool_schema'
# (nde:ComputationalTool). Each entry becomes the target row of the
# 'schemaObjects' sheet; dotted keys such as 'citation.name' are flat column
# names, not nested objects.
nde_schemas = {
    "nde_data_schema" : {
        "objectType": "schemaTargetObject",
        "namespace": "nde",
        "@context": "https://discovery.biothings.io/view/nde/",
        "@type": "CreativeWork",
        "identifier": "nde:Dataset",
        "name": "NIAID Data Ecosystem (NDE) Dataset Schema",
        "alternateName": "nde:Dataset",
        "url": "https://discovery.biothings.io/view/nde/",
        "version": "2022-09-21",
        "citation.@type": "WebSite",
        "citation.name": "NIAID Data Ecosystem (NDE) schema",
        "citation.url": "https://discovery.biothings.io/view/nde/"
    },
    "nde_tool_schema" : {
        "objectType": "schemaTargetObject",
        "namespace": "nde",
        "@context": "https://discovery.biothings.io/view/nde/",
        "@type": "CreativeWork",
        "identifier": "nde:ComputationalTool",
        "name": "NIAID Data Ecosystem (NDE) ComputationalTool Schema",
        "alternateName": "nde:ComputationalTool",
        "url": "https://discovery.biothings.io/view/nde/",
        "version": "2022-09-21",
        "citation.@type": "WebSite",
        "citation.name": "NIAID Data Ecosystem (NDE) schema",
        "citation.url": "https://discovery.biothings.io/view/nde/"
    }
}

def fetch_correct_nde_class(nde_schemas,schematype):
    """Pick the NDE target schema description that matches ``schematype``.

    ``"ComputationalTool"`` maps to ``nde_schemas["nde_tool_schema"]`` and
    ``"Dataset"`` to ``nde_schemas["nde_data_schema"]``. Any other value
    yields ``None``.
    """
    schema_key_by_type = (
        ("ComputationalTool", "nde_tool_schema"),
        ("Dataset", "nde_data_schema"),
    )
    for type_name, schema_key in schema_key_by_type:
        if schematype == type_name:
            return nde_schemas[schema_key]
    return None


def generate_schemaObjects(nde_schemas, srcdict, schematype="Dataset", source=None):
    """Describe a source schema and pair it with its NDE target schema.

    Parameters
    ----------
    nde_schemas : dict
        Must contain ``"nde_data_schema"`` and ``"nde_tool_schema"``.
    srcdict : dict
        One source entry of the NDE metadata; ``srcdict['version']`` and the
        ``identifier``, ``name`` and ``url`` fields of
        ``srcdict['sourceInfo']`` are read.
    schematype : str, optional
        ``'ComputationalTool'`` selects the tool schema; any other value
        selects the dataset schema.
    source : str, optional
        Source key stored as ``namespace`` and ``alternateName`` of the origin
        object. When omitted, the module-level variable ``eachsource`` is used
        (the original behaviour). Pass it explicitly when calling from inside
        a function, where a local loop variable of the same name is NOT what
        this function would otherwise see.

    Returns
    -------
    tuple
        ``(schemaOriginObject, schemaObjectsDF)``: the origin description as a
        dict, and a DataFrame holding the NDE target schema in its first row
        and the origin object in its second.
    """
    print(nde_schemas.keys())
    if source is None:
        # Backward-compatible fallback to the notebook-level loop variable.
        source = eachsource
    sourcedict = srcdict['sourceInfo']
    if schematype == 'ComputationalTool':
        schema_name = f"{sourcedict['name']} Computational Tool Schema"
        tmpschema = nde_schemas["nde_tool_schema"]
    else:
        schema_name = f"{sourcedict['name']} Dataset Schema"
        tmpschema = nde_schemas["nde_data_schema"]
    schemaOriginObject = {
        "objectType": "schemaOriginObject",
        "namespace": source,
        "@type": "CreativeWork",
        "identifier": sourcedict['identifier'],
        "name": schema_name,
        "alternateName": source,
        "url": sourcedict['url'],
        "version": srcdict['version'],
        "citation.@type": "WebSite",
        # The citation keeps the plain source name, without the schema suffix.
        "citation.name": sourcedict['name'],
        "citation.url": sourcedict['url']
    }
    schemaObjectsDF = pd.DataFrame([tmpschema, schemaOriginObject])
    return schemaOriginObject, schemaObjectsDF

In [13]:
def generate_nestedObjects():
    """Return the one-row table describing what the crosswalk is a basis for.

    The single record points at the NDE metadata API and fills the
    ``property``, ``@type``, ``name`` and ``url`` columns.
    """
    api_record = {}
    api_record["property"] = "isBasisFor"
    api_record["@type"] = "WebAPI"
    api_record["name"] = "NIAID Data Ecosystem API metadata"
    api_record["url"] = "https://api.data.niaid.nih.gov/v1/metadata"
    return pd.DataFrame([api_record])

def generate_fundingObject():
    """Return the one-row funding table written to the ``fundingInfo`` sheet.

    The record describes a single ``schema:MonetaryGrant``; dotted keys such
    as ``funder.name`` are flat column names, not nested objects.
    """
    funding = [{
        "@type": "schema:MonetaryGrant",
        "identifier": "U19 AI135995",
        "funder.@type": "schema:Organization",
        "funder.name": "National Institute of Allergy and Infectious Diseases",
        # The parent organisation's name is "National Institutes of Health".
        "funder.parentOrganization": "National Institutes of Health"
    }]
    fundingInfoDF = pd.DataFrame(funding)
    return fundingInfoDF

def generate_authorObjects(parent_path):
    """Load the default author table.

    Reads ``metainfo/nde/defaultAuthors.tsv`` below ``parent_path`` as a
    tab-separated file whose first line is the header, and returns it as a
    DataFrame.
    """
    authors_file = os.path.join(parent_path, 'metainfo', 'nde', 'defaultAuthors.tsv')
    return pd.read_csv(authors_file, delimiter='\t', header=0)

In [14]:
## To do: load the nde prop table
def load_nde_props(parent_path,schematype):
    """Load the NDE property table for one schema type.

    Parameters
    ----------
    parent_path : str
        Base folder that contains ``metainfo/nde``.
    schematype : str
        ``"Dataset"`` reads ``nde_dataset_props.tsv``; ``"ComputationalTool"``
        reads ``nde_comptools_props.tsv``.

    Returns
    -------
    pandas.DataFrame
        The tab-separated file, with its first column used as the index.

    Raises
    ------
    ValueError
        If ``schematype`` is neither of the two supported values (previously
        this surfaced as an UnboundLocalError at the return statement).
    """
    if schematype == "Dataset":
        filename = 'nde_dataset_props.tsv'
    elif schematype == "ComputationalTool":
        filename = 'nde_comptools_props.tsv'
    else:
        raise ValueError(
            f"Unsupported schematype {schematype!r}; "
            "expected 'Dataset' or 'ComputationalTool'"
        )
    props_path = os.path.join(parent_path,'metainfo','nde',filename)
    return pd.read_csv(props_path,delimiter='\t',header=0,index_col=0)


def generate_prop_includes(parent_path,schematype,source):
    """Build the property-mapping table for one source and schema type.

    The rows of ``metainfo/nde/nde_meta.tsv`` belonging to ``source`` are
    selected, their ``nde_prop`` column is renamed to
    ``sameAs.nde:<schematype>``, and the result is left-merged with the table
    from ``load_nde_props`` on whatever columns the two frames share. In the
    returned frame the ``source`` column is removed and ``source_prop`` is
    renamed to the value of ``source``.
    """
    meta_path = os.path.join(parent_path,'metainfo','nde','nde_meta.tsv')
    all_sources = pd.read_csv(meta_path,delimiter='\t',header=0,index_col=0)
    source_rows = (
        all_sources
        .loc[all_sources['source']==source]
        .rename(columns={"nde_prop":f"sameAs.nde:{schematype}"})
    )
    nde_props = load_nde_props(parent_path,schematype)
    return (
        source_rows
        .merge(nde_props,how="left")
        .drop("source",axis=1)
        .rename(columns={"source_prop":source})
    )

def generate_draft_xls_file(parent_path,src,schematype):
    """Return the path of the draft workbook for a source/schema-type pair.

    The file is named ``<src>_nde_<schematype>.xlsx`` and is located in the
    ``draft crosswalks`` folder below ``parent_path``. Only the path is
    built; nothing is written.
    """
    return os.path.join(
        parent_path,
        'draft crosswalks',
        "{}_nde_{}.xlsx".format(src, schematype),
    )

In [19]:
def generate_draft_nde_mappings(parent_path):
    """Write one draft crosswalk workbook per NDE source and schema type.

    The NDE API metadata is downloaded, and for every source listed under
    ``src`` and each of ``"Dataset"`` / ``"ComputationalTool"`` an ``.xlsx``
    file is written to the path given by ``generate_draft_xls_file``. Each
    workbook holds the sheets ``metaInfo``, ``propertyList``,
    ``schemaObjects``, ``authorInfo``, ``nestedProps`` and ``fundingInfo``.

    Parameters
    ----------
    parent_path : str
        Base folder containing ``metainfo/nde`` (inputs) and
        ``draft crosswalks`` (outputs).
    """
    schematypeslist = ["Dataset","ComputationalTool"]
    nde_api = 'https://api.data.niaid.nih.gov/v1/metadata'
    r = requests.get(nde_api)
    results = json.loads(r.text)

    # These sheets do not depend on the source or schema type, so they are
    # built once instead of once per workbook.
    fundingInfoDF = generate_fundingObject()
    nestedObjectsDF = generate_nestedObjects()
    authorDF = generate_authorObjects(parent_path)

    for schematype in schematypeslist:
        nde_target_schema = fetch_correct_nde_class(nde_schemas,schematype)
        for eachsource, srcdict in results['src'].items():
            ## Generate schemaObjects sheet
            schemaOriginObject, _ = generate_schemaObjects(nde_schemas,srcdict,schematype)
            # generate_schemaObjects takes the source key from a module-level
            # variable, not from this loop, so the source-specific fields are
            # set here and the sheet is rebuilt from the corrected object.
            schemaOriginObject["namespace"] = eachsource
            schemaOriginObject["alternateName"] = eachsource
            schemaObjectsDF = pd.DataFrame([nde_target_schema, schemaOriginObject])
            ## Generate metaInfo sheet
            xwalkdf = generate_crosswalk_meta_df(schemaOriginObject,nde_target_schema)
            ## Generate propertylist sheet
            included_props = generate_prop_includes(parent_path,schematype,eachsource)
            ## export all sheets into one xlsx workbook
            filepath = generate_draft_xls_file(parent_path,eachsource,schematype)
            # A single writer in write mode handles every sheet. `index` is an
            # option of DataFrame.to_excel, not of ExcelWriter, so it is only
            # passed to the to_excel calls.
            with pd.ExcelWriter(filepath, engine='openpyxl', mode='w') as writer:
                xwalkdf.to_excel(writer, sheet_name='metaInfo', index=False)
                included_props.to_excel(writer, sheet_name='propertyList', index=False)
                schemaObjectsDF.to_excel(writer, sheet_name='schemaObjects', index=False)
                authorDF.to_excel(writer, sheet_name='authorInfo', index=False)
                nestedObjectsDF.to_excel(writer, sheet_name='nestedProps', index=False)
                fundingInfoDF.to_excel(writer, sheet_name='fundingInfo', index=False)

In [23]:
# Write the draft workbooks for every NDE source and both schema types.
# generate_schemaObjects() prints the nde_schemas keys on each call, which is
# where the repeated dict_keys lines in the output come from.
generate_draft_nde_mappings(parent_path)

dict_keys(['nde_data_schema', 'nde_tool_schema'])
dict_keys(['nde_data_schema', 'nde_tool_schema'])
dict_keys(['nde_data_schema', 'nde_tool_schema'])
dict_keys(['nde_data_schema', 'nde_tool_schema'])
dict_keys(['nde_data_schema', 'nde_tool_schema'])
dict_keys(['nde_data_schema', 'nde_tool_schema'])
dict_keys(['nde_data_schema', 'nde_tool_schema'])
dict_keys(['nde_data_schema', 'nde_tool_schema'])
dict_keys(['nde_data_schema', 'nde_tool_schema'])
dict_keys(['nde_data_schema', 'nde_tool_schema'])
dict_keys(['nde_data_schema', 'nde_tool_schema'])
dict_keys(['nde_data_schema', 'nde_tool_schema'])
dict_keys(['nde_data_schema', 'nde_tool_schema'])
dict_keys(['nde_data_schema', 'nde_tool_schema'])
dict_keys(['nde_data_schema', 'nde_tool_schema'])
dict_keys(['nde_data_schema', 'nde_tool_schema'])
dict_keys(['nde_data_schema', 'nde_tool_schema'])
dict_keys(['nde_data_schema', 'nde_tool_schema'])
dict_keys(['nde_data_schema', 'nde_tool_schema'])
dict_keys(['nde_data_schema', 'nde_tool_schema'])


In [None]:
### Tests

In [None]:
# Dry run of the export for a single source (the second entry of
# `source_list`) and the ComputationalTool schema type.
# `results` must be the NDE API metadata here, and `eachsource` is a
# module-level name in this cell, which is what generate_schemaObjects() reads.
for eachsource in source_list[1:2]:
    ## Generate funding sheet
    fundingInfoDF = generate_fundingObject()
    ## Generate nestedObjects sheet
    nestedObjectsDF = generate_nestedObjects()
    ## Generate schemaObjects sheet
    srcdict = results['src'][eachsource]
    schematype = "ComputationalTool"
    schemaOriginObject, schemaObjectsDF = generate_schemaObjects(nde_schemas,srcdict,schematype)
    ## Generate metaInfo sheet
    nde_target_schema = fetch_correct_nde_class(nde_schemas,schematype)
    xwalkdf = generate_crosswalk_meta_df(schemaOriginObject,nde_target_schema)
    ## Generate authorInfo sheet
    authorDF = generate_authorObjects(parent_path)
    ## Generate propertylist sheet
    included_props = generate_prop_includes(parent_path,schematype,eachsource)
    ## export all sheets into one xlsx workbook
    filepath = generate_draft_xls_file(parent_path,eachsource,schematype)
    # A single writer in write mode handles every sheet. `index` is an option
    # of DataFrame.to_excel, not of ExcelWriter, so it is only passed to the
    # to_excel calls.
    with pd.ExcelWriter(filepath, engine='openpyxl', mode='w') as writer:
        xwalkdf.to_excel(writer, sheet_name='metaInfo', index=False)
        included_props.to_excel(writer, sheet_name='propertyList', index=False)
        schemaObjectsDF.to_excel(writer, sheet_name='schemaObjects', index=False)
        authorDF.to_excel(writer, sheet_name='authorInfo', index=False)
        nestedObjectsDF.to_excel(writer, sheet_name='nestedProps', index=False)
        fundingInfoDF.to_excel(writer, sheet_name='fundingInfo', index=False)

In [None]:
# Read the saved mapping table back (first column as index) to check that it
# round-trips.
source_df = pd.read_csv(os.path.join(parent_path,'metainfo','nde','nde_meta.tsv'),delimiter='\t',header=0,index_col=0)
print(source_df.head(n=2))

In [None]:
# Spot-check the property table for one source ('dryad') and the Dataset
# schema type.
schematype = "Dataset"
source = "dryad"
included_props = generate_prop_includes(parent_path,schematype,source)
print(included_props.head(n=4))

In [None]:
## test reading of xlsx file
# Reads the 'authorInfo' sheet of one generated workbook and converts it to a
# list of per-row dictionaries.
# NOTE(review): this import sits mid-notebook although pandas is already
# imported as `pd` at the top; consider moving it to the import cell.
from pandas import read_excel
data_file = os.path.join(parent_path,'draft crosswalks','zenodo_nde_ComputationalTool.xlsx')
author_info = read_excel(data_file,sheet_name='authorInfo',engine='openpyxl',header=0,index_col=None)
author_object = author_info.to_dict(orient="records")
print(author_object)