In [1]:
import os
import json
import pandas as pd
import requests
from datetime import datetime

In [5]:
# Current working directory of the kernel.
nb_path = os.getcwd()
# Parent directory of nb_path. It is derived from nb_path because `script_path`
# has no earlier definition: passing it to dirname() raised NameError on a fresh kernel.
script_path = os.path.dirname(nb_path)

In [6]:
# Base endpoint of the DDE registry API (ends with a trailing slash).
dde_registry_api_url = "https://discovery.biothings.io/api/registry/"

# Registry namespaces of interest, one per line, in their original order.
namespacelist = [
    "bts",
    "bioschemasdrafts",
    "bioschemastypesdrafts",
    "bioschemas",
    "bioschemastypes",
    "biomedical",
    "bioschemasdeprecated",
    "cvisb-dataset",
    "cvisb-patient",
    "ctsa",
    "crosswalks",
    "datacite",
    "google",
    "niaid",
    "nde",
    "n3c",
    "outbreak",
    "schema",
]

In [4]:
## Identify classes across namespaces that have the same parent class

def fetch_child_class(desired_parent, timeout=30):
    """Collect registry classes whose lineage contains ``desired_parent``.

    One GET request is issued per entry of the module-level ``namespacelist``
    against ``dde_registry_api_url``; every element of the response's ``hits``
    list is then inspected.

    Args:
        desired_parent: Identifier of the ancestor class to look for,
            e.g. ``"schema:Dataset"``.
        timeout: Seconds to wait on each request before giving up.

    Returns:
        list: The hit dicts, in namespace order, for which ``desired_parent``
        appears in the comma-separated first entry of ``parent_classes`` and
        whose ``name`` does not contain the substring ``"schema"``.

    Raises:
        requests.RequestException: On timeout, connection failure, or an
            HTTP error status.
        KeyError: If a response body has no ``hits`` key.
    """
    results = []
    # A single session lets the per-namespace requests share one connection.
    with requests.Session() as session:
        for namespace in namespacelist:
            response = session.get(f"{dde_registry_api_url}{namespace}", timeout=timeout)
            # Fail loudly on 4xx/5xx instead of tripping over a missing 'hits' key later.
            response.raise_for_status()
            for hit in response.json()['hits']:
                try:
                    # Only the FIRST parent_classes entry is examined.
                    # NOTE(review): if a class can list several lineages, the others are ignored - confirm.
                    lineage = [parent.strip() for parent in hit['parent_classes'][0].split(',')]
                    # Substring test: any name containing "schema" is excluded
                    # (that includes e.g. "bioschemas:..." names, not just "schema:...").
                    is_match = desired_parent in lineage and 'schema' not in hit['name']
                except (KeyError, IndexError, AttributeError, TypeError):
                    # Hit has no usable parent_classes/name, so it cannot match; skip it.
                    continue
                if is_match:
                    results.append(hit)
    return results

In [5]:
%%time
# Query the registry for classes that list schema:Dataset in their lineage
# and report how many were found. `results` is reused by the cells below.
results = fetch_child_class("schema:Dataset")
print(len(results))

9
Wall time: 10.1 s


In [6]:
# Print the name of every matching class.
# NOTE: `hit` remains bound to the last element of `results` once the loop finishes.
for hit in results:
    print(hit['name'])

bts:BioMedicalDataset
cvisb:Dataset
bts:CTSADataset
bts:DataCite
bts:Google
niaid:Dataset
nde:Dataset
n3c:Dataset
outbreak:Dataset


In [26]:
## format the crosswalk meta info

def generate_crosswalk_meta_df(schema1,schema2):
    """Build the two-column metadata table describing a draft crosswalk.

    Args:
        schema1: Mapping for the origin class; its ``'name'`` and
            ``'namespace'`` entries are read.
        schema2: Mapping for the target class; its ``'name'``,
            ``'namespace'`` and ``'label'`` entries are read.

    Returns:
        pandas.DataFrame: One row per metadata field, with columns
        ``property`` and ``value``, in the order the fields are listed below.
        Most values are fixed placeholder strings or lists; the description,
        identifier, name, sameAs and the two date fields are filled in here.

    Raises:
        KeyError: If one of the entries listed above is missing.
    """
    # Formatted once so datePublished and dateModified are always identical.
    # NOTE(review): this is MM/DD/YYYY, not ISO 8601 - confirm what consumers expect.
    today = datetime.now().strftime('%m/%d/%Y')
    identifier = f"{schema1['namespace']}_xref_{schema2['namespace']}_{schema2['label'].lower()}_draft"
    xwalkmeta = {
        'description': (
            f"This crosswalk maps properties between the {schema1['name']} and {schema2['name']}. "
            "The initial mappings were autogenerated using a script that: \n"
            " 1. Identified that the two classes shared a common, yet somewhat specific parent class (Eg- Dataset) \n"
            " 2. Pulled all available properties and the information surrounding them and matched them for manual review \n"
            " 3. exported the results for manual review \n"
            " Crosswalks which have not been reviewed will be saved in the `draft crosswalks` directory."
        ),
        '@type': 'Crosswalks:MetadataCrosswalk',
        '@context': 'contextInfo',
        'identifier': identifier,
        # Fixed: a space was missing after "between", gluing it to the first class name.
        'name': f"A crosswalk between {schema1['name']} and {schema2['name']}",
        # NOTE(review): this URL contains a raw space ("draft crosswalks") - confirm whether it should be encoded.
        'sameAs': f"https://github.com/gtsueng/DDE-CrossWalks/draft crosswalks/{identifier}.xls",
        'author': 'authorInfo',
        'includesProperty': 'propertyList',
        'hasPart': ['schemaObjects:schemaOriginObject','schemaObjects:schemaTargetObject'],
        'isPartOf': ['schemaObjects:schemaUsageObject'],
        'isBasedOn': 'nestedProps:isBasedOn',
        'isBasisFor': 'nestedProps:isBasisFor',
        'funding': 'fundingInfo',
        'datePublished': today,
        'dateModified': today,
        'creditText': '',
        'license': 'https://creativecommons.org/licenses/by/4.0/',
    }
    # Dict insertion order is preserved, so the rows come out in the order listed above.
    return pd.DataFrame(list(xwalkmeta.items()), columns=['property', 'value'])


In [27]:
# Build the crosswalk metadata table for the first two matching classes and print it.
xwalkdf = generate_crosswalk_meta_df(results[0],results[1])
print(xwalkdf)

            property                                              value
0        description  This crosswalk maps properties between the bts...
1              @type                       Crosswalks:MetadataCrosswalk
2           @context                                        contextInfo
3         identifier        biomedical_xref_cvisb-dataset_dataset_draft
4               name  A crosswalk betweenbts:BioMedicalDataset and c...
5             sameAs  https://github.com/gtsueng/DDE-CrossWalks/draf...
6             author                                         authorInfo
7   includesProperty                                       propertyList
8            hasPart  [schemaObjects:schemaOriginObject, schemaObjec...
9           isPartOf                  [schemaObjects:schemaUsageObject]
10         isBasedOn                              nestedProps:isBasedOn
11        isBasisFor                             nestedProps:isBasisFor
12           funding                                        fund

In [12]:
## format the properties to be mapped
# Bind `hit` explicitly rather than relying on a loop variable left over from
# an earlier cell; a loop over `results` ends on this same (last) element.
hit = results[-1]
print(hit.keys())
print('label: ',hit['label'],'| name: ',hit['name'],'| namespace: ',hit['namespace'])

dict_keys(['_id', 'ref', 'parent_classes', 'prefix', 'namespace', 'name', 'description', 'label', 'uri', 'properties', 'validation'])
label:  Dataset | name:  outbreak:Dataset | namespace:  outbreak


In [7]:
# Show which fields the first property record of this class carries.
print(hit['properties'][0].keys())

dict_keys(['curie', 'domain', 'description', 'range', 'label', 'uri'])


In [8]:
# Show the first property record of this class in full.
print(hit['properties'][0])

{'curie': 'outbreak:author', 'domain': ['outbreak:Analysis', 'outbreak:Dataset', 'outbreak:Protocol', 'outbreak:Publication', 'outbreak:ClinicalTrial', 'outbreak:ComputationalTool'], 'description': 'The author of this resource, content, or rating', 'range': ['outbreak:Person', 'outbreak:Organization'], 'label': 'author', 'uri': 'http://discovery.biothings.io/view/outbreak/author'}


In [None]:
# Show the top-level keys of this class's validation entry.
print(hit['validation'].keys())

In [None]:
# Pull the three entries of the validation block into separate names.
# NOTE(review): assumes 'required', 'recommended' and 'optional' are all present
# in hit['validation'] - this cell has no recorded run, so confirm against real data.
required_list = hit['validation']['required']
recommended_list = hit['validation']['recommended']
optional_list = hit['validation']['optional']
