In [None]:
#if not already installed, install saxonche (https://www.saxonica.com/saxon-c/index.xml ) and xmltodict (https://github.com/martinblech/xmltodict/ ) libraries
!pip install saxonche
!pip install xmltodict

In [None]:
import saxonche as saxon
import xmltodict, json
import os

In [None]:
# XSLT stylesheet that performs the ABCD -> Bioschemas mapping.
stylesheet_file = "abcd2bioschemas-xml.xslt"

# Path of the ABCD input document without its extension; the ".xml" suffix is
# appended separately so that the bare stem stays available as its own variable.
input_file_base = "example_files/HoffmannPlantsV3"
input_file = f"{input_file_base}.xml"

In [None]:
# Start a SaxonC processor, print its version as a smoke test, and create the
# XSLT 3.0 processor that the transformation cell further down relies on.
# NOTE(review): xslt30proc is created inside the with-block but used after the block
# has exited — confirm that it stays valid once the PySaxonProcessor context is closed.
from saxonche import PySaxonProcessor
with PySaxonProcessor(license=False) as proc:
    print("Test SaxonC on Python")
    print(proc.version)
    # Kept as a notebook-level variable so later cells can run transformations with it.
    xslt30proc = proc.new_xslt30_processor()

In [None]:
# Check if the Jupyter notebook has been launched from the project's root directory
# if os.getcwd().endswith('group') and len(stylesheet_file.split('/')) == 1:
#     stylesheet_file = "transformations/abcd2bioschemas/" + stylesheet_file
#     input_file_base = "transformations/abcd2bioschemas/" + input_file_base
#     input_file = "transformations/abcd2bioschemas/" + input_file

In [None]:
# Apply the XSLT stylesheet to the ABCD XML input and keep the resulting
# Bioschemas.org XML as a string for the following cells.
#
# Alternative that writes the result to a file instead:
# xslt30proc.transform_to_file(source_file="example_files/HoffmannPlantsV3.xml", stylesheet_file="abcd2bioschemas-xml.xslt", output_file="example_files/HoffmannPlantsV3_schema.xml")
schema_xml = xslt30proc.transform_to_string(
    stylesheet_file=stylesheet_file,
    source_file=input_file,
)
schema_xml

In [None]:
# Parse the transformed XML into a nested Python dictionary.
schema_dict = xmltodict.parse(schema_xml)

# A non-empty "reverse" element is moved to the "@reverse" key
# (presumably because "@reverse" cannot be used as an XML element name — confirm against the stylesheet).
if 'reverse' in schema_dict['jsonld'] and schema_dict['jsonld']['reverse'] is not None:
    schema_dict['jsonld'].update({'@reverse': schema_dict['jsonld'].pop('reverse')})

# Display the root object for inspection.
schema_dict['jsonld']

In [None]:
# Some properties may have a list containing duplicates, e.g., because of multiple ABCD elements being mapped to the same Bioschemas property.
# Duplicates can be removed recursively from the dictionary using the following function:
def remove_duplicates(d):
    """Recursively remove duplicate entries from every list nested inside ``d``.

    Two list items count as duplicates when their JSON serialisations (with
    sorted keys) are identical, so dictionaries that differ only in key order
    are treated as equal. Items are cleaned *before* they are compared, which
    means entries that only become identical once their own nested lists have
    been de-duplicated are collapsed as well. The first occurrence of each
    item is kept and the relative order is preserved.

    Args:
        d: A JSON-serialisable structure (dict, list or scalar). Dictionary
            keys must be mutually sortable (e.g. all strings).

    Returns:
        The cleaned structure. Dictionaries are updated in place and returned
        as the same object; lists are replaced by new lists; any other value
        is returned unchanged.

    Raises:
        TypeError: If a list item cannot be serialised by ``json.dumps``.
    """
    if isinstance(d, dict):
        # Only values are reassigned, never keys, so mutating while iterating is safe.
        for key, value in d.items():
            d[key] = remove_duplicates(value)
        return d

    if isinstance(d, list):
        seen_fingerprints = set()
        unique_items = []
        for item in d:
            # Clean nested content first so the comparison sees the final form.
            cleaned = remove_duplicates(item)
            fingerprint = json.dumps(cleaned, sort_keys=True)
            if fingerprint not in seen_fingerprints:
                seen_fingerprints.add(fingerprint)
                unique_items.append(cleaned)
        return unique_items

    return d

# De-duplicate the parsed structure. remove_duplicates updates dictionaries in place,
# so schema_dict itself is modified; the return value is only shown as the cell output.
remove_duplicates(schema_dict)

In [None]:
# TODO: aggregate taxon names to higher level here
# get about list
# import pygbif.species as species
# schema_dict = xmltodict.parse(schema_xml)
# about_list = schema_dict['jsonld']['about'].copy()
# for each entry request taxon information from GBIF and add to dict as taxonInfo
# for about in about_list[:5]:
#     taxon = species.name_backbone(name=about['name'])
#     print(taxon)
#     about['taxonInfo'] = taxon

# about_list

In [None]:
# schema_dict['jsonld']['about']

In [None]:
#convert data values
#TODO: write algorithm to do that generically

# Texts that are read as boolean false; any other text is read as true.
# bool() on the raw text would turn every non-empty string — including "false" — into True.
_FALSE_TOKENS = {"", "false", "0", "no"}


def _leaf_value(node):
    """Return the payload of a parsed leaf.

    A dict is expected to hold its text under '#text' (KeyError if it does not);
    any other value (plain string, already converted number, None) is returned as is.
    """
    if isinstance(node, dict):
        return node['#text']
    return node


def _to_bool(node):
    """Convert a leaf to bool, interpreting its text instead of its truthiness."""
    payload = _leaf_value(node)
    if isinstance(payload, str):
        return payload.strip().lower() not in _FALSE_TOKENS
    # Non-string payloads (e.g. a bool from an earlier run of this cell, or None).
    return bool(payload)


def _to_int(node):
    """Convert a leaf to int."""
    return int(_leaf_value(node))


def _to_float(node):
    """Convert a leaf to float."""
    return float(_leaf_value(node))


def _convert_value(path, converter):
    """Replace the value at ``path`` below schema_dict['jsonld'] with ``converter(value)``.

    Args:
        path: Sequence of keys leading from schema_dict['jsonld'] to the value.
        converter: Callable that receives the current value and returns the new one.

    A missing key anywhere along the path (or a dict leaf without '#text') is
    ignored and the structure is left untouched. Other errors, such as a
    ValueError for text that is not a number, are raised so bad data is noticed.
    Because already converted values pass through the converters unchanged,
    the cell can be re-run safely.
    """
    try:
        parent = schema_dict['jsonld']
        for key in path[:-1]:
            parent = parent[key]
        parent[path[-1]] = converter(parent[path[-1]])
    except KeyError:
        pass


_convert_value(('isAccessibleForFree',), _to_bool)
_convert_value(('size', 'value'), _to_int)
_convert_value(('geo', 'latitude'), _to_float)
_convert_value(('geo', 'longitude'), _to_float)

In [None]:
# Export the JSON-LD root object as a JSON string and write it to <input_file_base>.json
schema_json = json.dumps(schema_dict['jsonld'])

# The with-statement closes the file even if the write fails. json.dumps escapes
# non-ASCII characters by default, so the explicit encoding does not change the
# bytes written; it only makes the result independent of the platform default.
with open(input_file_base + ".json", "w", encoding="utf-8") as f:
    f.write(schema_json)