In [1]:
!pip install openpyxl



In [2]:
import os
import yaml
import glob
import io
import pandas as pd
from shortid import ShortId
from rdflib import plugin, ConjunctiveGraph, Graph, URIRef, Literal, BNode
from rdflib.store import Store
from rdflib.namespace import RDF, RDFS, XSD
from rdflib.plugins.sparql.results.csvresults import CSVResultSerializer

# Allow up to 200 characters per column when DataFrames are displayed.
pd.set_option('display.max_colwidth', 200)

# Directory that write() saves its Excel files into. exist_ok avoids the
# check-then-create race and keeps the cell safe to re-run.
os.makedirs('outputs', exist_ok=True)

In [3]:
def _l(g, d, n, k, t):
    """Add a literal-valued triple about node ``n`` to graph ``g``.

    ``k`` is either a single key, used to look up both the predicate in
    ``vocab`` and the value in ``d``, or a list ``[predicate_key, value_key]``.
    In the list form, when ``value_key`` is not contained in ``d`` the string
    ``value_key`` itself is used as the literal value.

    No literal is added when ``handle_special_cases`` reports that it already
    dealt with the value. ``t`` is passed to ``Literal`` as the datatype.
    """
    if isinstance(k, list):
        predicate = vocab[k[0]]
        value_key = k[1]
        if value_key not in d:
            # No such entry: the second list item is the value itself.
            value = value_key
        else:
            if handle_special_cases(g, d[value_key], n, k[0]):
                return
            value = d[value_key]
    else:
        value = d[k]
        if handle_special_cases(g, value, n, k):
            return
        predicate = vocab[k]

    g.add((n, predicate, Literal(value, datatype=t)))
    
    
def _b(g, n, k, b):
    """Link node ``n`` to node ``b`` in ``g`` using the ``vocab`` entry for ``k``."""
    predicate = vocab[k]
    g.add((n, predicate, b))
    
    
def _r(g, d, n, k):
    """Add a resource-valued triple about node ``n`` to graph ``g``.

    ``k`` is either a single key (predicate and value key are the same) or a
    list ``[predicate_key, value_key]``. The value is read from ``d``.

    Values containing ``http``, ``www`` or ``@`` are used as the IRI
    directly; any other value is resolved through ``vocab``. No triple is
    added here when ``handle_special_cases`` already dealt with the value.
    """
    if isinstance(k, list):
        predicate_key, value_key = k[0], k[1]
    else:
        predicate_key = value_key = k

    value = d[value_key]
    if handle_special_cases(g, value, n, predicate_key):
        return
    predicate = vocab[predicate_key]

    used_verbatim = any(value.find(marker) > -1 for marker in ('http', 'www', '@'))
    target = URIRef(value) if used_verbatim else URIRef(vocab[value])
    g.add((n, predicate, target))
    
    
def _t(g, d, n, k):
    """Add an ``rdf:type`` triple for node ``n``.

    When ``k`` is contained in ``d`` the type is the ``vocab`` entry for
    ``d[k]``; otherwise it is the ``vocab`` entry for ``k`` itself.
    """
    type_key = d[k] if k in d else k
    g.add((n, RDF.type, vocab[type_key]))
    
    
def _c(g, d, n1, n2, k):
    """Attach the items of ``d`` to ``n1`` as members of container node ``n2``.

    ``n1`` is linked to ``n2`` through the ``vocab`` predicate for ``k``,
    ``n2`` is typed through ``_t`` with the key ``'Bag'`` and every item of
    ``d`` is added through ``_li``. Nothing of this happens when
    ``handle_special_cases`` already dealt with ``d``.
    """
    already_handled = handle_special_cases(g, d, n1, k)
    if not already_handled:
        _b(g, n1, k, n2)
        _t(g, d, n2, 'Bag')
        for member in d:
            _li(g, n2, member)
    
    
def _li(g, n, e):
    """Add the ``vocab`` entry for ``e`` as a ``li`` member of node ``n``."""
    predicate = vocab['li']
    member = URIRef(vocab[e])
    g.add((n, predicate, member))
    
    
def process_document(d):
    """Turn one survey document into a graph of its own inside ``store``.

    The graph identifier is built from ``ns``, the letter ``G`` and a freshly
    generated short id. The document's ``survey`` and ``infrastructure``
    sections are then processed into that graph, in this order.
    """
    graph_iri = URIRef('{}G{}'.format(ns, sid.generate()))
    document_graph = Graph(store, graph_iri)
    sections = (
        ('survey', process_survey),
        ('infrastructure', process_infrastructure),
    )
    for section, handler in sections:
        handler(document_graph, d[section])
    
    
def process_survey(g, d):
    """Record the survey section ``d`` on the identifier of graph ``g``.

    Adds the survey ``date`` (typed ``XSD.date``) and ``version`` (typed
    ``XSD.string``) and hands the ``creator`` entry to ``process_creator``.
    All statements use ``g.identifier`` as their subject, so no blank node is
    created here.
    """
    subject = g.identifier
    _l(g, d, subject, 'date', XSD.date)
    _l(g, d, subject, 'version', XSD.string)
    process_creator(g, d['creator'], subject)
    
    
def process_creator(g, d, n):
    """Describe the creator ``d`` on a new blank node linked from ``n``.

    The link uses the ``creator`` vocabulary entry; the creator node gets the
    ``name`` as a string literal and the ``email`` as a resource.
    """
    creator_node = BNode()
    _b(g, n, 'creator', creator_node)
    _l(g, d, creator_node, 'name', XSD.string)
    _r(g, d, creator_node, 'email')
    
    
def process_infrastructure(g, d):
    """Describe the research infrastructure ``d`` and all its repositories.

    A new blank node receives the type, acronym, label, URL and domain of the
    infrastructure. Every entry of ``d['repositories']`` is then processed
    with that node as parent and the acronym as label prefix.
    """
    infrastructure = BNode()
    _t(g, d, infrastructure, 'ResearchInfrastructure')
    _l(g, d, infrastructure, 'acronym', XSD.string)
    _l(g, d, infrastructure, ['label', 'name'], XSD.string)
    resource_fields = (
        ('riUrl', 'recognized authority URL'),
        ('hasDomain', 'domain'),
    )
    for predicate, field in resource_fields:
        _r(g, d, infrastructure, [predicate, field])
    for repository in d['repositories']:
        process_repository(g, repository, infrastructure, d['acronym'])
        
        
def process_repository(g, d, n, i):
    """Describe one repository ``d`` of the infrastructure node ``n``.

    ``i`` is the label prefix handed in by the caller. A new blank node is
    linked from ``n`` via ``hasRepository`` and filled with the repository's
    own properties, after which every sub-section of ``d`` is delegated to
    its dedicated ``process_repository_*`` function. Only a marker triple is
    added when ``handle_special_cases`` recognises ``d`` as a placeholder.
    """
    if handle_special_cases(g, d, n, 'hasRepository'):
        return

    repo = BNode()
    _b(g, n, 'hasRepository', repo)
    _t(g, d, repo, 'Repository')
    _r(g, d, repo, ['hasRepositoryUrl', 'URL'])
    _l(g, d, repo, ['label', 'name'], XSD.string)
    _l(g, d, repo, ['altLabel', '{} repository'.format(i)], XSD.string)
    _t(g, d, repo, 'kind')
    for predicate, field in (
        ('hasDataRepositoryType', 'data repository type'),
        ('hasMetadataRepositoryType', 'metadata repository type'),
        ('usesSoftware', 'software'),
    ):
        _r(g, d, repo, [predicate, field])

    process_repository_identifier(g, d['identifier'], repo, i, d['name'])

    for field, predicate in (
        ('certification methods', 'hasCertificationMethods'),
        ('policies', 'hasPolicies'),
        ('registries', 'inRegistries'),
    ):
        _c(g, d[field], repo, BNode(), predicate)
    _l(g, d, repo, ['hasPersistencyGuaranty', 'persistency-guaranty'], XSD.string)

    for field, handler in (
        ('access mechanisms', process_repository_access),
        ('data', process_repository_data),
        ('metadata', process_repository_metadata),
        ('vocabularies', process_repository_vocabularies),
    ):
        handler(g, d[field], repo, i, d['name'])

    # NOTE(review): unlike its siblings this call receives the parent node
    # ``n`` and the repository name instead of the repository node -- confirm
    # that this is intended.
    process_repository_datamanagementplans(g, d['data management plans'], n, d['name'])

    for field, handler in (
        ('data processing', process_repository_dataprocessing),
        ('fairness', process_repository_fairness),
        ('test fairness', process_repository_testfairness),
    ):
        handler(g, d[field], repo, i, d['name'])
        
        
def process_repository_identifier(g, d, n, i, r):
    """Describe every identifier entry in ``d`` for the repository node ``n``.

    Each entry gets its own blank node, linked from ``n`` via
    ``usesIdentifier``. ``i`` and ``r`` are only used to build the
    ``altLabel`` text. Only a marker triple is added when
    ``handle_special_cases`` recognises ``d`` as a placeholder.
    """
    if handle_special_cases(g, d, n, 'usesIdentifier'):
        return

    label = '{} {} identifier'.format(i, r)
    for entry in d:
        identifier = BNode()
        _b(g, n, 'usesIdentifier', identifier)
        _t(g, entry, identifier, 'Identifier')
        _l(g, entry, identifier, ['altLabel', label], XSD.string)
        _t(g, entry, identifier, 'kind')
        _r(g, entry, identifier, ['hasIdentifierIri', 'IRI'])
        _r(g, entry, identifier, ['usesIdentifierSystem', 'system'])
        _l(g, entry, identifier, ['isAssigned', 'assigned'], XSD.string)
        _r(g, entry, identifier, ['usesProvider', 'provider'])
        _c(g, entry['includes-attributes'], identifier, BNode(), 'includesAttributes')
        

def process_repository_access(g, d, n, i, r):
    """Describe the access mechanisms ``d`` of the repository node ``n``.

    A new blank node is linked from ``n`` via ``hasAccessMechanisms`` and
    receives the authentication, authorisation, licensing and access
    properties read from ``d``. Only a marker triple is added when
    ``handle_special_cases`` recognises ``d`` as a placeholder.

    Parameters:
        g: graph the triples are added to.
        d: the repository's 'access mechanisms' mapping.
        n: node the access mechanism node is linked from.
        i, r: label parts, used only in the ``altLabel`` text.
    """
    if handle_special_cases(g, d, n, 'hasAccessMechanisms'):
        return
    n1 = BNode()
    _b(g, n, 'hasAccessMechanisms', n1)
    _t(g, d, n1, 'AccessMechanism')
    _l(g, d, n1, ['altLabel', '{} {} access mechanism'.format(i, r)], XSD.string)
    _l(g, d, n1, ['hasAuthenticationMethod', 'authentication method'], XSD.string)
    _r(g, d, n1, ['hasAccessProtocolUrl', 'access protocol URL'])
    # Flags are typed xsd:boolean; "xsd:bool" is not an XML Schema datatype.
    _l(g, d, n1, ['accessWithoutCosts', 'access without costs'], XSD.boolean)
    _l(g, d, n1, ['maintainsOwnUserDatabase', 'own user database maintained'], XSD.boolean)
    _l(g, d, n1, ['usesORCIDinAAI', 'ORCID used in AAI'], XSD.boolean)
    _r(g, d, n1, ['supportsAccessTechnology', 'major access technology supported'])
    _r(g, d, n1, ['usesAuthorisationTechnique', 'authorisation technique'])
    _c(g, d['authorisation needed for'], n1, BNode(), 'usesAuthorisationFor')
    _l(g, d, n1, ['contentAccessAuthorizationRequired', 'authorization for accessing content needed'], XSD.boolean)
    _c(g, d['data licenses in use'], n1, BNode(), 'usesDataLicenses')
    _r(g, d, n1, ['dataLicenseIri', 'data license IRI'])
    _l(g, d, n1, ['openAccessMetadata', 'metadata openly available'], XSD.boolean)
    
    
def process_repository_data(g, d, n, i, r):
    """Describe every data entry in ``d`` for the repository node ``n``.

    Each entry gets its own blank node, linked from ``n`` via ``hasData``,
    carrying its type, registered schema, search flags and preferred
    formats. Only a marker triple is added when ``handle_special_cases``
    recognises ``d`` as a placeholder.

    Parameters:
        g: graph the triples are added to.
        d: iterable of data entries (mappings).
        n: node the data nodes are linked from.
        i, r: label parts, used only in the ``altLabel`` text.
    """
    if handle_special_cases(g, d, n, 'hasData'):
        return
    for e1 in d:
        n1 = BNode()
        _b(g, n, 'hasData', n1)
        _t(g, e1, n1, 'Data')
        # Look the label up against the entry, like every other property here.
        _l(g, e1, n1, ['altLabel', '{} {} data'.format(i, r)], XSD.string)
        _t(g, e1, n1, 'type name')
        _c(g, e1['registered data schema'], n1, BNode(), 'dataSchemaIsRegistered')
        # The search flags are read from this entry, so they belong on its own
        # node n1 (they were previously attached to the parent node n). They
        # are typed xsd:boolean; "xsd:bool" is not an XML Schema datatype.
        _l(g, e1, n1, ['searchOnData', 'search on data'], XSD.boolean)
        _l(g, e1, n1, ['searchEngineIndexing', 'search engine indexing'], XSD.boolean)
        for e2 in e1['preferred formats']:
            n2 = BNode()
            _b(g, n1, 'hasPreferredFormat', n2)
            _t(g, e2, n2, 'PreferredFormat')
            _r(g, e2, n2, ['hasFormatName', 'format name'])
            _c(g, e2['metadata types in data headers'], n2, BNode(), 'hasDataHeaderMetadataTypes')
            
    
def process_repository_metadata(g, d, n, i, r):
    """Describe the metadata section ``d`` of the repository node ``n``.

    A new blank node is linked from ``n`` via ``hasMetadata`` and receives
    the schema descriptions, storage/export/harvesting formats, search
    engine information and several boolean flags read from ``d``. Only a
    marker triple is added when ``handle_special_cases`` recognises ``d`` as
    a placeholder.

    Parameters:
        g: graph the triples are added to.
        d: the repository's 'metadata' mapping.
        n: node the metadata node is linked from.
        i, r: label parts, used in the ``altLabel`` text and passed on to
            ``process_repository_metadata_schema``.
    """
    if handle_special_cases(g, d, n, 'hasMetadata'):
        return
    n1 = BNode()
    _b(g, n, 'hasMetadata', n1)
    _l(g, d, n1, ['altLabel', '{} {} metadata'.format(i, r)], XSD.string)
    process_repository_metadata_schema(g, d['schema'], n1, i, r)
    # Flags are typed xsd:boolean; "xsd:bool" is not an XML Schema datatype.
    _l(g, d, n1, ['categoriesAreDefinedInRegistries', 'categories defined in registries'], XSD.boolean)
    _l(g, d, n1, ['persistentIdentifiersAreIncluded', 'PIDs included'], XSD.boolean)
    _r(g, d, n1, ['hasPrimaryStorageFormat', 'primary storage format'])
    _r(g, d, n1, ['hasMetadataLongevityPlan', 'metadata longevity plan URL'])
    _r(g, d, n1, ['hasFormat', 'format IRI'])
    _c(g, d['export formats supported'], n1, BNode(), 'supportedExportFormats')
    _c(g, d['exchange/harvesting methods'], n1, BNode(), 'hasHarvestingMethods')
    _r(g, d, n1, ['hasLocalSearchEngineUrl', 'local search engine URL'])
    _c(g, d['external search engine types supported'], n1, BNode(), 'supportsExternalSearchEngineTypes')
    _l(g, d, n1, ['includesAccessPolicyStatements', 'access policy statements included'], XSD.boolean)
    _l(g, d, n1, ['isMachineActionable', 'machine actionable'], XSD.boolean)
    
    
def process_repository_metadata_schema(g, d, n, i, r):
    """Describe every metadata schema entry in ``d`` for the node ``n``.

    Each entry gets its own blank node, linked from ``n`` via ``hasSchema``,
    with a label, the schema URL and name, and the bag of provenance fields.
    ``i`` and ``r`` are only used to build the ``altLabel`` text. Only a
    marker triple is added when ``handle_special_cases`` recognises ``d`` as
    a placeholder.
    """
    if handle_special_cases(g, d, n, 'hasSchema'):
        return

    label = '{} {} metadata schema'.format(i, r)
    for schema in d:
        schema_node = BNode()
        _b(g, n, 'hasSchema', schema_node)
        # The whole collection ``d`` (not the single entry) is what gets
        # passed for the label lookup here.
        _l(g, d, schema_node, ['altLabel', label], XSD.string)
        for predicate, field in (('hasSchemaUrl', 'URL'), ('hasSchemaName', 'name')):
            _r(g, schema, schema_node, [predicate, field])
        _c(g, schema['provenance fields included'], schema_node, BNode(), 'includesProvenanceFields')

    
def process_repository_vocabularies(g, d, n, i, r):
    """Describe every vocabulary entry in ``d`` for the repository node ``n``.

    Each entry gets its own blank node, linked from ``n`` via
    ``hasVocabularies``, with a label, IRI, type, topic, name and
    specification language. ``i`` and ``r`` are only used to build the
    ``altLabel`` text. Only a marker triple is added when
    ``handle_special_cases`` recognises ``d`` as a placeholder.
    """
    if handle_special_cases(g, d, n, 'hasVocabularies'):
        return

    label = '{} {} vocabularies'.format(i, r)
    for vocabulary in d:
        vocabulary_node = BNode()
        _b(g, n, 'hasVocabularies', vocabulary_node)
        # The whole collection ``d`` (not the single entry) is what gets
        # passed for the label lookup here.
        _l(g, d, vocabulary_node, ['altLabel', label], XSD.string)
        _r(g, vocabulary, vocabulary_node, ['hasVocabularyIri', 'IRI'])
        _t(g, vocabulary, vocabulary_node, 'type')
        _r(g, vocabulary, vocabulary_node, ['hasTopic', 'topic'])
        _l(g, vocabulary, vocabulary_node, ['hasName', 'name'], XSD.string)
        _r(g, vocabulary, vocabulary_node, ['hasSpecificationLanguage', 'specification language URL'])
        

def process_repository_datamanagementplans(g, d, n, i):
    """Describe the data management plans section ``d`` on a node under ``n``.

    A new blank node is linked from ``n`` via ``hasDataManagementPlans`` and
    receives a label, the DMP-tools flag and the applied publishing steps.
    Only a marker triple is added when ``handle_special_cases`` recognises
    ``d`` as a placeholder.

    Parameters:
        g: graph the triples are added to.
        d: the 'data management plans' mapping.
        n: node the plans node is linked from.
        i: text placed in front of 'data management plans' in the label.
    """
    if handle_special_cases(g, d, n, 'hasDataManagementPlans'):
        return
    n1 = BNode()
    _b(g, n, 'hasDataManagementPlans', n1)
    _l(g, d, n1, ['altLabel', '{} data management plans'.format(i)], XSD.string)
    # The flag is typed xsd:boolean; "xsd:bool" is not an XML Schema datatype.
    _l(g, d, n1, ['usesSpecificDataManagementPlanTools', 'specific DMP tools used'], XSD.boolean)
    _l(g, d, n1, ['appliedDataPublishingSteps', 'data publishing steps applied'], XSD.string)
    

def process_repository_dataprocessing(g, d, n, i, r):
    """Describe the data processing section ``d`` of the repository node ``n``.

    A new blank node is linked from ``n`` via ``hasDataProcessing``; it gets
    a label built from ``i`` and ``r`` plus one container per processing
    category found in ``d``. Only a marker triple is added when
    ``handle_special_cases`` recognises ``d`` as a placeholder.
    """
    if handle_special_cases(g, d, n, 'hasDataProcessing'):
        return

    processing = BNode()
    _b(g, n, 'hasDataProcessing', processing)
    _l(g, d, processing, ['altLabel', '{} {} data processing'.format(i, r)], XSD.string)

    categories = (
        ('special data processing steps applied', 'specialDataProcessingStepsApplied'),
        ('workflow frameworks applied', 'workflowFrameworksApplied'),
        ('distributed workflows tools used', 'distributedWorkflowsToolsUsed'),
        ('other analysis services offered', 'otherAnalysisServicesOffered'),
        ('data products offered', 'dataProductsOffered'),
    )
    for field, predicate in categories:
        _c(g, d[field], processing, BNode(), predicate)

    
def process_repository_fairness(g, d, n, i, r):
    """Describe the fairness section ``d`` of the repository node ``n``.

    A new blank node is linked from ``n`` via ``fairness``, labelled from
    ``i`` and ``r``, and the four sub-sections of ``d`` (findability,
    accessibility, interoperability, re-usability) are delegated to their
    own functions in that order. Only a marker triple is added when
    ``handle_special_cases`` recognises ``d`` as a placeholder.
    """
    if handle_special_cases(g, d, n, 'fairness'):
        return

    fairness_node = BNode()
    _b(g, n, 'fairness', fairness_node)
    _l(g, d, fairness_node, ['altLabel', '{} {} fairness'.format(i, r)], XSD.string)

    assessments = (
        ('data findability', process_repository_faireness_findability),
        ('data accessibility', process_repository_faireness_accessibility),
        ('data interoperability', process_repository_faireness_interoperability),
        ('data re-usability', process_repository_faireness_reusability),
    )
    for field, handler in assessments:
        handler(g, d[field], fairness_node, i, r)

    
def process_repository_faireness_findability(g, d, n, i, r):
    """Describe the findability assessment ``d`` on a node under ``n``.

    A new blank node is linked from ``n`` via ``dataFindability`` and
    receives a label, the ``dataIsFindable`` flag and the container of
    ``gaps``. Only a marker triple is added when ``handle_special_cases``
    recognises ``d`` as a placeholder.

    Parameters:
        g: graph the triples are added to.
        d: the 'data findability' mapping.
        n: node the assessment node is linked from.
        i, r: label parts, used only in the ``altLabel`` text.
    """
    if handle_special_cases(g, d, n, 'dataFindability'):
        return
    n1 = BNode()
    _b(g, n, 'dataFindability', n1)
    _l(g, d, n1, ['altLabel', '{} {} faireness findability'.format(i, r)], XSD.string)
    # The flag is typed xsd:boolean; "xsd:bool" is not an XML Schema datatype.
    _l(g, d, n1, ['dataIsFindable', 'data findable'], XSD.boolean)
    _c(g, d['gaps'], n1, BNode(), 'gaps')

    
def process_repository_faireness_accessibility(g, d, n, i, r):
    """Describe the accessibility assessment ``d`` on a node under ``n``.

    A new blank node is linked from ``n`` via ``dataAccessibility`` and
    receives a label, the ``dataIsAccessible`` flag and the container of
    ``gaps``. Only a marker triple is added when ``handle_special_cases``
    recognises ``d`` as a placeholder.

    Parameters:
        g: graph the triples are added to.
        d: the 'data accessibility' mapping.
        n: node the assessment node is linked from.
        i, r: label parts, used only in the ``altLabel`` text.
    """
    if handle_special_cases(g, d, n, 'dataAccessibility'):
        return
    n1 = BNode()
    _b(g, n, 'dataAccessibility', n1)
    _l(g, d, n1, ['altLabel', '{} {} faireness accessibility'.format(i, r)], XSD.string)
    # The flag is typed xsd:boolean; "xsd:bool" is not an XML Schema datatype.
    _l(g, d, n1, ['dataIsAccessible', 'data accessible'], XSD.boolean)
    _c(g, d['gaps'], n1, BNode(), 'gaps')

    
def process_repository_faireness_interoperability(g, d, n, i, r):
    """Describe the interoperability assessment ``d`` on a node under ``n``.

    A new blank node is linked from ``n`` via ``dataInteroperability`` and
    receives a label, the ``dataIsInteroperable`` flag and the container of
    ``gaps``. Only a marker triple is added when ``handle_special_cases``
    recognises ``d`` as a placeholder.

    Parameters:
        g: graph the triples are added to.
        d: the 'data interoperability' mapping.
        n: node the assessment node is linked from.
        i, r: label parts, used only in the ``altLabel`` text.
    """
    if handle_special_cases(g, d, n, 'dataInteroperability'):
        return
    n1 = BNode()
    _b(g, n, 'dataInteroperability', n1)
    _l(g, d, n1, ['altLabel', '{} {} faireness interoperability'.format(i, r)], XSD.string)
    # The flag is typed xsd:boolean; "xsd:bool" is not an XML Schema datatype.
    _l(g, d, n1, ['dataIsInteroperable', 'data interoperable'], XSD.boolean)
    _c(g, d['gaps'], n1, BNode(), 'gaps')

    
def process_repository_faireness_reusability(g, d, n, i, r):
    """Describe the re-usability assessment ``d`` on a node under ``n``.

    A new blank node is linked from ``n`` via ``dataReusability`` and
    receives a label, the ``dataIsReusable`` flag and the container of
    ``gaps``. Only a marker triple is added when ``handle_special_cases``
    recognises ``d`` as a placeholder.

    Parameters:
        g: graph the triples are added to.
        d: the 'data re-usability' mapping.
        n: node the assessment node is linked from.
        i, r: label parts, used only in the ``altLabel`` text.
    """
    if handle_special_cases(g, d, n, 'dataReusability'):
        return
    n1 = BNode()
    _b(g, n, 'dataReusability', n1)
    _l(g, d, n1, ['altLabel', '{} {} faireness reusability'.format(i, r)], XSD.string)
    # The flag is typed xsd:boolean; "xsd:bool" is not an XML Schema datatype.
    _l(g, d, n1, ['dataIsReusable', 'data reusable'], XSD.boolean)
    _c(g, d['gaps'], n1, BNode(), 'gaps')
    
    
def process_repository_testfairness(g, d, n, i, r):
    """Describe the test-fairness section ``d`` of the repository node ``n``.

    A new blank node is linked from ``n`` via ``testFairness`` and receives
    a label, the dataset / discovery portal / machine-readable metadata
    references, the provenance flag and the container of compliance
    validation services. Only a marker triple is added when
    ``handle_special_cases`` recognises ``d`` as a placeholder.

    Parameters:
        g: graph the triples are added to.
        d: the 'test fairness' mapping.
        n: node the test-fairness node is linked from.
        i, r: label parts, used only in the ``altLabel`` text.
    """
    if handle_special_cases(g, d, n, 'testFairness'):
        return
    n1 = BNode()
    _b(g, n, 'testFairness', n1)
    _l(g, d, n1, ['altLabel', '{} {} test fairness'.format(i, r)], XSD.string)
    _r(g, d, n1, ['hasDataset', 'URL/IRI of dataset'])
    _r(g, d, n1, ['hasDiscoveryPortal', 'URL of discovery portal'])
    _r(g, d, n1, ['hasMachineReadableDatasetMetadata', 'IRI of machine readable metadata of dataset'])
    # The flag is typed xsd:boolean; "xsd:bool" is not an XML Schema datatype.
    _l(g, d, n1, ['hasMachineReadableProvenance', 'machine readable provenance'], XSD.boolean)
    _c(g, d['compliance validation service'], n1, BNode(), 'hasComplianceValidationService')
    

def handle_special_cases(g, d, n, k):
    """Add a marker triple when ``d`` is a missing or placeholder value.

    ``None`` and the strings ``'NULL'``, ``'VOID'``, ``'none'`` and
    ``'planned'`` are not real values. For those, the triple
    ``(n, vocab[k], vocab[marker])`` is added to ``g``, where the marker is
    ``'NULL'`` for ``None`` and the string itself otherwise.

    Strings are compared by value (``==`` semantics), not by identity, so a
    ``'NULL'`` read from a file is recognised reliably.

    Returns:
        True when a marker triple was added (callers then add nothing else
        for this value), False when ``d`` is an ordinary value.
    """
    if d is None:
        marker = 'NULL'
    elif isinstance(d, str) and d in ('NULL', 'VOID', 'none', 'planned'):
        marker = d
    else:
        return False
    g.add((n, vocab[k], vocab[marker]))
    return True

In [4]:
# Lookup from survey term to IRI. Both a missing value (None) and the explicit
# 'NULL' marker resolve to the same NULL resource; vocab.yaml adds the rest.
vocab = dict()
vocab[None] = URIRef('http://envri.eu/ns/NULL')
vocab['NULL'] = URIRef('http://envri.eu/ns/NULL')

ns = 'http://envri.eu/ns/'
sid = ShortId()
store = plugin.get('IOMemory', Store)()

# Every processed document adds its own graph to this shared store.
g = ConjunctiveGraph(store)

# The YAML files are opened as UTF-8 explicitly, so that reading them does not
# depend on the platform's default encoding.
with open('vocab.yaml', 'r', encoding='utf-8') as f:
    for key, value in yaml.safe_load(f).items():
        vocab[key] = URIRef(value)

with open('fairmapping.yaml', 'r', encoding='utf-8') as f:
    for key, value in yaml.safe_load(f).items():
        g.add((vocab[key], vocab['relatesTo'], URIRef(value)))

# glob returns files in arbitrary order; sorting makes the processing order
# the same on every run.
for file in sorted(glob.glob('descriptions/*.yaml')):
    with open(file, 'r', encoding='utf-8') as f:
        # NOTE(review): FullLoader is used here while the two files above are
        # read with safe_load. If the descriptions only contain plain YAML,
        # consider yaml.safe_load_all(f) for consistency.
        for document in yaml.load_all(f, Loader=yaml.FullLoader):
            process_document(document)

g.bind('envri', ns)
g.bind('dcterms', 'http://purl.org/dc/terms/')
g.bind('foaf', 'http://xmlns.com/foaf/0.1/')
g.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
g.serialize(destination='data.trig', format='trig')

In [5]:
# Replace the global graph with a fresh one loaded from the serialized TriG
# file; query() below reads this global ``g``.
g = ConjunctiveGraph()
g.parse('data.trig', format='trig')

def query(q):
    """Run the SPARQL query ``q`` on the global graph ``g``.

    The result is serialized to CSV in memory and read back with pandas.

    Returns:
        A ``pandas.DataFrame`` with one column per selected variable.
    """
    buffer = io.BytesIO()
    CSVResultSerializer(g.query(q)).serialize(buffer)
    csv_text = buffer.getvalue().decode('utf-8')
    return pd.read_csv(io.StringIO(csv_text), encoding='utf-8')
    
def write(df, fn):
    """Save ``df`` as an Excel file named ``fn`` inside the ``outputs`` folder.

    No ``encoding`` argument is passed to ``to_excel``: pandas deprecated it
    in 1.5 and removed it in 2.0, where passing it raises ``TypeError``.
    """
    df.to_excel('outputs/{}'.format(fn))

In [6]:
# Research infrastructures (acronym, URL, domain) with the labels of their
# repositories, restricted to graphs whose date is after 2019-03-15.
q = query("""
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX envri: <http://envri.eu/ns/>
PREFIX rm: <http://www.oil-e.net/ontology/envri-rm.owl#>

SELECT ?date ?ri_acronym ?ri_url ?ri_domain ?rep_label WHERE {
    ?g dcterms:date ?date .
    GRAPH ?g { 
        ?ri a rm:ResearchInfrastructure .
        ?ri envri:acronym ?ri_acronym . 
        ?ri envri:riUrl ?ri_url .
        ?ri envri:hasDomain ?ri_domain .
        ?ri envri:hasRepository ?rep .
        ?rep a envri:Repository .
        ?rep rdfs:label ?rep_label .
    }
    FILTER (?date > "2019-03-15"^^xsd:date)
    # FILTER (?ri_acronym = "ICOS")
}
""")

display(q)
write(q, 'output-1.xlsx')

Unnamed: 0,date,ri_acronym,ri_url,ri_domain,rep_label
0,2019-03-21,EPOS,http://envri.eu/ns/NULL,http://envri.eu/ns/Earth,RESIF (France)
1,2019-04-16,EPOS,http://envri.eu/ns/NULL,http://envri.eu/ns/Earth,Terradue
2,2019-04-03,EPOS,http://envri.eu/ns/NULL,http://envri.eu/ns/Earth,European Federated Data Archive
3,2019-03-22,EPOS,http://envri.eu/ns/NULL,http://envri.eu/ns/Earth,local EU-EIDA


In [7]:
# Vocabularies (name and IRI) of the repositories that belong to research
# infrastructures in the Earth domain.
q = query("""
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX envri: <http://envri.eu/ns/>
PREFIX rm: <http://www.oil-e.net/ontology/envri-rm.owl#>

SELECT ?rep_label ?vocab_name ?vocab_iri WHERE {
    [] a rm:ResearchInfrastructure ;
       envri:hasDomain envri:Earth ;
       envri:hasRepository [
         a envri:Repository ;
         rdfs:label ?rep_label ;
         envri:hasVocabularies [
           envri:hasName ?vocab_name ;
           envri:hasVocabularyIri ?vocab_iri 
         ]
       ]
}
""")

display(q)
write(q, 'output-2.xlsx')

Unnamed: 0,rep_label,vocab_name,vocab_iri
0,European Federated Data Archive,http://envri.eu/ns/NULL,http://envri.eu/ns/NULL
1,RESIF (France),http://envri.eu/ns/NULL,http://envri.eu/ns/NULL
2,local EU-EIDA,SEED,https://ds.iris.edu/ds/nodes/dmc/data/formats/seed/
3,Terradue,http://envri.eu/ns/None,http://envri.eu/ns/NULL


In [8]:
# All properties that relate to a specific FAIR principle (here: A1.2)
q = query("""
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX envri: <http://envri.eu/ns/>
PREFIX rm: <http://www.oil-e.net/ontology/envri-rm.owl#>
PREFIX fairterms: <https://w3id.org/fair/principles/terms/>

SELECT ?p WHERE {
    ?p envri:relatesTo fairterms:A1.2 .
}
""")

display(q)
write(q, 'output-3.xlsx')

Unnamed: 0,p
0,http://envri.eu/ns/usesAuthorisationTechnique
1,http://envri.eu/ns/hasAuthenticationMethod
2,http://envri.eu/ns/contentAccessAuthorizationRequired
3,http://envri.eu/ns/usesORCIDinAAI
4,http://envri.eu/ns/maintainsOwnUserDatabase


In [9]:
# Retrieve the context (subject label, value and, for bags, their members) of
# every property relating to a specific FAIR principle (here: I1), excluding
# statements whose value is NULL.
q = query("""
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX envri: <http://envri.eu/ns/>
PREFIX rm: <http://www.oil-e.net/ontology/envri-rm.owl#>
PREFIX fairterms: <https://w3id.org/fair/principles/terms/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

SELECT ?l ?p ?o ?li WHERE {
    ?p envri:relatesTo fairterms:I1 .
    ?s ?p ?o .
    ?s skos:altLabel ?l .
    OPTIONAL { ?o a rdf:Bag . ?o rdf:li ?li }
    FILTER (?o != envri:NULL)
}
""")

display(q)
write(q, 'output-4.xlsx')

Unnamed: 0,l,p,o,li
0,EPOS local EU-EIDA metadata,http://envri.eu/ns/hasPrimaryStorageFormat,http://envri.eu/ns/SEED,
1,EPOS RESIF (France) metadata,http://envri.eu/ns/hasPrimaryStorageFormat,http://envri.eu/ns/ExtensibleMarkupLanguage,
2,EPOS European Federated Data Archive metadata,http://envri.eu/ns/hasPrimaryStorageFormat,http://envri.eu/ns/ExtensibleMarkupLanguage,
3,EPOS Terradue metadata,http://envri.eu/ns/hasPrimaryStorageFormat,http://envri.eu/ns/ExtensibleMarkupLanguage,
4,EPOS local EU-EIDA metadata,http://envri.eu/ns/isMachineActionable,true,
5,EPOS Terradue metadata,http://envri.eu/ns/isMachineActionable,true,
6,EPOS local EU-EIDA vocabularies,http://envri.eu/ns/hasTopic,http://envri.eu/ns/DomainSpecific,
7,EPOS Terradue vocabularies,http://envri.eu/ns/hasTopic,http://envri.eu/ns/None,
8,EPOS European Federated Data Archive metadata schema,http://envri.eu/ns/hasSchemaName,http://envri.eu/ns/StationXML,
9,EPOS RESIF (France) metadata schema,http://envri.eu/ns/hasSchemaName,http://envri.eu/ns/StationXML,


In [11]:
# Metadata schema names of the repositories that belong to research
# infrastructures in the Earth domain.
q = query("""
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX envri: <http://envri.eu/ns/>
PREFIX rm: <http://www.oil-e.net/ontology/envri-rm.owl#>

SELECT ?rep_label ?schema_name WHERE {
    [] a rm:ResearchInfrastructure ;
       envri:hasDomain envri:Earth ;
       envri:hasRepository [
         a envri:Repository ;
         rdfs:label ?rep_label ;
         envri:hasMetadata [
           envri:hasSchema [
             envri:hasSchemaName ?schema_name
           ]
         ]
       ]
}
""")

display(q)
write(q, 'output-5.xlsx')

Unnamed: 0,rep_label,schema_name
0,European Federated Data Archive,http://envri.eu/ns/StationXML
1,RESIF (France),http://envri.eu/ns/StationXML
2,local EU-EIDA,http://envri.eu/ns/SEED
3,Terradue,http://envri.eu/ns/ISO19119
