## This code converts a crosswalk table to a compliant JSON-LD file

Note that a crosswalk table may need to be reformatted to include missing information. For example, some crosswalks include information on the marginality of a property while others do not. If property marginality is not provided, it should be recorded as "unspecified". Additionally, a crosswalk table may use a simple header to reference the schema against which the property is being compared. Whenever possible, details for that schema should be included in the schemaUsageObject (see template).

Some schemas are derived by mixing and matching from other schemas. If there are properties in a schema that were derived from elsewhere, details of the schema (from which they were derived) should be included in the schemaOriginObject (see template).

In [1]:
import pandas as pd
from pandas import read_csv
import json
import os

#### Expected tables and how they're used

* **schemaOriginObject**: An array of metadata about one or more source schemas. In most cases, the source schema and target schema can be treated interchangeably. However, some schemas are sets of properties pulled from other schemas. In this case, the source schema entries should include all the schemas from which properties were pulled.

* **schemaUsageObject**: An array of metadata about one or more target schemas. In most cases, the target schema and source schema can be treated interchangeably. However, some schemas are sets of properties pulled from other schemas. In this case, the target schema will be the mix-n-match schema.

* **propertyCrossWalk**: An array of properties from one schema, mapped to properties in another schema. Note that it is possible for a single property in schema A to map to multiple properties in schema B (and vice versa). In this case, include each property as a separate row.


In [2]:
# Root directory of the project; an empty string makes every path below
# relative to the current working directory.
scriptpath = ""
# Sub-directories for exported crosswalks, schema files and input templates.
exportpath, schemapath, datapath = (
    os.path.join(scriptpath, folder)
    for folder in ('crosswalks', 'schema', 'templates')
)

In [6]:
def generate_context_list(schemaOrigin):
    """Build a namespace -> context mapping from a schema metadata table.

    Reads the 'namespace' and '@context' columns of ``schemaOrigin`` row by
    row and returns a dict whose keys and values are the ``str()`` form of
    those two cells.  If a namespace occurs in several rows, the value from
    the last such row is kept.
    """
    prefix_rows = schemaOrigin[['namespace', '@context']].copy()
    return {
        str(row['namespace']): str(row['@context'])
        for _, row in prefix_rows.iterrows()
    }

def load_schemaObjects(schemaOrigin):
    """Turn a schema metadata table into a list of record dicts.

    Each returned dict carries the '@type', 'name', 'url' and 'version'
    cells of one row, a nested 'citation' dict built from the row's
    'citation.@type', 'citation.name' and 'citation.url' cells, and an
    'alternateName' entry holding the result of ``deal_with_multis`` applied
    to the row's 'alternateName' cell.
    """
    citation_columns = {
        'citation.@type': '@type',
        'citation.name': 'name',
        'citation.url': 'url',
    }
    # One {'@type', 'name', 'url'} dict per row, in row order.
    citations = (
        schemaOrigin[list(citation_columns)]
        .rename(columns=citation_columns)
        .to_dict(orient='records')
    )

    schema_frame = schemaOrigin[['@type', 'name', 'alternateName', 'url', 'version']].copy()
    schema_frame['citation'] = list(citations)

    # Swap the raw alternateName cell for its split form; the column ends up
    # last, after 'citation'.
    split_names = schema_frame.apply(
        lambda row: deal_with_multis(row['alternateName']), axis=1
    )
    schema_frame = schema_frame.drop(columns='alternateName')
    schema_frame['alternateName'] = split_names

    return schema_frame.to_dict(orient='records')

def deal_with_multis(record_entry):
    """Split a comma-separated cell value into a list of entries.

    Args:
        record_entry: The raw cell value, e.g. ``"DC, Dublin Core"``.  A
            missing value (``None`` or a float NaN) is accepted.

    Returns:
        A list of the individual entries with surrounding whitespace removed
        and empty entries dropped.  A missing value yields an empty list.
    """
    # NaN is the only float that is not equal to itself; checking it this way
    # avoids needing any import inside the helper.
    is_nan = isinstance(record_entry, float) and record_entry != record_entry
    if record_entry is None or is_nan:
        return []
    pieces = (piece.strip() for piece in str(record_entry).split(','))
    return [piece for piece in pieces if piece]

In [4]:
# Show which files are present in the templates directory (datapath).
print(os.listdir(datapath))

['propertyCrossWalk.txt', 'schemaOriginObject.txt', 'schemaUsageObject.txt']


In [7]:
# Source-schema metadata: tab-separated, first row is the header and the
# first column is used as the index.
schemaOrigin = read_csv(
    os.path.join(datapath, 'schemaOriginObject.txt'),
    sep='\t',
    header=0,
    index_col=0,
)
schemaOriginObject = load_schemaObjects(schemaOrigin)

# Namespace -> context mapping derived from the source-schema table.
contextlists = generate_context_list(schemaOrigin)
print(contextlists)

# Target-schema metadata, read and converted the same way.
schemaUsage = read_csv(
    os.path.join(datapath, 'schemaUsageObject.txt'),
    sep='\t',
    header=0,
    index_col=0,
)
schemaUsageObject = load_schemaObjects(schemaUsage)
print(schemaUsageObject[0])

{'schema': 'https://schema.org/', 'dct': 'http://purl.org/dc/terms/', 'foaf': 'https://xmlns.com/foaf/spec/#term_', 'dcat': 'http://www.w3.org/ns/dcat#', 'sosa': 'https://www.w3.org/TR/vocab-ssn/#SOSA', 'prov': 'http://www.w3.org/ns/prov#', 'datacite': 'https://support.datacite.org/docs/datacite-metadata-schema-v44-properties-overview#', 'skos': 'http://www.w3.org/2004/02/skos/core#', 'dc': 'http://purl.org/dc/elements/1.1/', 'bioschemas': 'https://discovery.biothings.io/view/bioschemas/', 'bioschemastypes': 'https://discovery.biothings.io/view/bioschemastypes/', 'bioschemasdrafts': 'https://discovery.biothings.io/view/bioschemasdrafts/', 'bioschemastypesdrafts': 'https://discovery.biothings.io/view/bioschemastypesdrafts/', 'iso19115': 'https://www.iso.org/standard/53798.html#', 'spase': 'https://spase-group.org/data/model/spase-2.4.0/spase-2_4_0_xsd.html#', 'codemeta': 'https://codemeta.github.io/terms/#', 'owl': 'http://www.w3.org/2002/07/owl#'}
{'@type': 'schema:CreativeWork', 'name

In [12]:
def lookup_schemaObject(is_partof_value, a_schema_object):
    """Resolve an ``isPartOf`` reference to an entry of ``a_schema_object``.

    ``is_partof_value`` is split on ':' and the second piece is read as an
    integer position, so a value such as ``"schema:1"`` selects
    ``a_schema_object[1]``.
    """
    position = int(is_partof_value.split(':')[1])
    return a_schema_object[position]

def format_iri_as_id(example_iri):
    """Convert a comma-separated string of IRIs into ``@id`` reference dicts.

    Args:
        example_iri: The raw cell value, e.g. ``"schema:Text, schema:URL"``.
            A missing value (``None`` or a float NaN) is accepted.

    Returns:
        A list with one ``{"@id": iri}`` dict per IRI, in input order.
        Whitespace around each IRI is removed and blank entries are skipped,
        so a missing or empty value yields an empty list.
    """
    # NaN is the only float that is not equal to itself.
    is_nan = isinstance(example_iri, float) and example_iri != example_iri
    if example_iri is None or is_nan:
        return []
    # Strip each piece: a leading space would otherwise end up inside the IRI.
    iris = (piece.strip() for piece in str(example_iri).split(','))
    return [{"@id": iri} for iri in iris if iri]
    
def add_type(propdf):
    """Set the '@type' column of ``propdf`` to "schema:Property" in every row.

    The frame is modified in place and the same object is returned.
    """
    property_type = "schema:Property"
    propdf['@type'] = property_type
    return propdf

def cleanup_domain_range(propertydf):
    """Replace the 'rangeIncludes' and 'domainIncludes' cells with ``@id`` lists.

    Each cell of the two columns is passed through ``format_iri_as_id``.
    The frame is modified in place and the same object is returned.
    """
    # Order matters only in that 'rangeIncludes' is rewritten first.
    for column in ('rangeIncludes', 'domainIncludes'):
        propertydf[column] = propertydf.apply(
            lambda row, column=column: format_iri_as_id(row[column]), axis=1
        )
    return propertydf

# Crosswalk table: tab-separated with a header row and no index column.
propdf = read_csv(
    os.path.join(datapath, 'propertyCrossWalk.txt'),
    sep='\t',
    header=0,
    index_col=None,
)
#print(propdf.head(n=2))

# Properties on the left-hand side of the crosswalk: typed as
# schema:Property, with domain/range cells converted to @id lists.
props2map = cleanup_domain_range(
    add_type(
        propdf[[
            '@id', 'name', 'domainIncludes', 'rangeIncludes', 'url',
            'isPartOf', 'owl:cardinality', 'marginality',
        ]].copy()
    )
)
# NOTE: 'isPartOf' is left as the raw reference for props2map; the lookup
# against schemaOriginObject is currently disabled:
#props2map['isPartOf'] = props2map.apply(lambda row: lookup_schemaObject(row['isPartOf'],schemaOriginObject), axis=1)

#print(props2map.head(n=2))

# Properties on the "sameAs." side; the prefix is removed so both frames
# share the same column names.
mappedprops = propdf[[
    'sameAs.@id', 'sameAs.name', 'sameAs.domainIncludes',
    'sameAs.rangeIncludes', 'sameAs.url', 'sameAs.isPartOf',
    'sameAs.owl:cardinality', 'sameAs.marginality',
]].copy()
mappedprops.columns = [label.replace("sameAs.", "") for label in mappedprops.columns]

mappedprops = cleanup_domain_range(add_type(mappedprops))
# Swap each isPartOf reference for the matching schemaUsageObject entry.
mappedprops['isPartOf'] = mappedprops.apply(
    lambda row: lookup_schemaObject(row['isPartOf'], schemaUsageObject),
    axis=1,
)
print(mappedprops.head(2))

                  @id         name             domainIncludes  \
0  google:description  description  [{'@id': 'schema:Thing'}]   
1         google:name         name  [{'@id': 'schema:Thing'}]   

              rangeIncludes                             url  \
0  [{'@id': 'schema:Text'}]  https://schema.org/description   
1  [{'@id': 'schema:Text'}]         https://schema.org/name   

                                            isPartOf owl:cardinality  \
0  {'@type': 'schema:DataDownload', 'name': 'Goog...     unspecified   
1  {'@type': 'schema:DataDownload', 'name': 'Goog...     unspecified   

           marginality            @type  
0  minimal or required  schema:Property  
1  minimal or required  schema:Property  


In [None]:
# Scratch cell: attach the nested citation dicts to the first two schema
# records to check the resulting structure.
originforjson = schemaOrigin[['@type','name','alternateName','url','version']].copy()
citationforjson = schemaOrigin[['citation.@type','citation.name','citation.url']].copy()
citationforjson.rename(columns={'citation.@type':'@type','citation.name':'name','citation.url':'url'},inplace=True)
# Copy the two-row sample before adding a column: assigning to a bare slice
# of originforjson is chained assignment and raises SettingWithCopyWarning.
testjson = originforjson[0:2].copy()
testcite = citationforjson[0:2]
citejson = testcite.to_dict(orient='records')
testjson['citation'] = citejson

print(testjson)

#print(testjson.to_dict(orient='records'))
#print(testcite.to_dict(orient='records'))


    


def check_for_multi_cite(recordlist):
    """Collapse repeated schema records so each keeps all of its citations.

    A record counts as repeated when the same combination of '@type',
    'name', 'alternateName', 'url' and 'version' occurs in more than one row
    of ``recordlist``.  Rows are then gathered by 'name': the first row of
    each distinct key combination is kept and its 'citation' cell is replaced
    by the list of every citation found under that name.

    Args:
        recordlist: DataFrame with the five key columns above plus a
            'citation' column.

    Returns:
        A new DataFrame (fresh 0..n-1 index) holding the untouched
        non-repeated rows followed by one collapsed row per repeated record.
        For collapsed rows 'citation' is a list; for the other rows it is
        left exactly as it was.
    """
    key_columns = ['@type', 'name', 'alternateName', 'url', 'version']
    freq = recordlist.groupby(key_columns).size().reset_index(name='counts')
    # A name can appear in several repeated groups; handle each name once.
    dups = freq.loc[freq['counts'] > 1, 'name'].drop_duplicates().tolist()
    nondups = recordlist.loc[~recordlist['name'].isin(dups)].copy()

    merged = [nondups]
    for eachname in dups:
        recordsubset = recordlist.loc[recordlist['name'] == eachname]
        citelist = recordsubset['citation'].tolist()
        tmprecord = recordsubset.drop_duplicates(subset=key_columns, keep='first').copy()
        # One independent copy of the citation list per surviving row, so the
        # assigned values always match the number of rows.
        tmprecord['citation'] = [list(citelist) for _ in range(len(tmprecord))]
        merged.append(tmprecord)

    # Skip empty frames so concat never has to guess dtypes from them.
    frames = [frame for frame in merged if not frame.empty]
    if not frames:
        return nondups
    return pd.concat(frames, ignore_index=True)
    
        
    