Originally generated with ChatGPT-4o on 2024-07-09, with modifications

In [292]:
from rdflib import Graph, Namespace, URIRef, Literal, BNode
from rdflib.namespace import RDF, RDFS, OWL, DCTERMS
import pandas as pd
import re
import urllib.parse

In [293]:
# Base URIs of the GBAD schema ontology; every sub-URI hangs off "<root>/schema"
base_gbad_uri = 'http://gbad.archives.gov.on.ca'
_schema_root = base_gbad_uri + "/schema"
base_schema_uri = URIRef(_schema_root)
base_auth_uri = URIRef(_schema_root + "/authority")
base_add_uri = URIRef(_schema_root + "/description-listings")
base_mapping_uri = URIRef(_schema_root + "/mapping")

In [294]:
# Reference table: CSV location and the labels used for its three key fields
table_path = '../../schema/authority/Authorities to RiC Data Elements Mapping.csv'
element_label, identifier_label, mnemonic_label = (
    'Authority_Data_Element',
    'Authority_Field_Identifier',
    'Authority_Mnemonic',
)

# Ontology to map: its base URI and the Turtle file that holds it
base_uri = base_auth_uri
graph_path = '../../schema/authority/general_authority_to_ric-o_model_v11_-_9_july_2024.ttl'

In [295]:
# Load the source ontology into a graph rooted at the chosen base URI
g = Graph(base=base_uri)
g.parse(graph_path, format="turtle")  # Adjust the format as needed

# Each prefix is kept as a (prefix, namespace) pair so it can be unpacked into bind()
rico_uri = 'https://www.ica.org/standards/RiC/ontology#'
rico = ('rico', Namespace(rico_uri))
#base_prefix = ('', Namespace(URIRef(f"{base_uri}#")))
rdf = ('rdf', RDF)
rdfs = ('rdfs', RDFS)
owl = ('owl', OWL)

# Register the custom prefix first, then the common ones
namespace_manager = g.namespace_manager
for prefix_binding in (rico, rdf, rdfs, owl):
    namespace_manager.bind(*prefix_binding)
#namespace_manager.bind(*base_prefix)

#print(g.serialize(format='turtle'))

In [296]:
# SPARQL that returns every triple in the graph as (subject, predicate, object)
query = """
SELECT ?subject ?predicate ?object
WHERE {
  ?subject ?predicate ?object.
}
"""
# Run the query against the parsed ontology graph
result = g.query(query)

In [297]:
# Key under which each parsed node records whether it was found in the reference table
table_match_label = 'has_table_match'
# Load the reference table from CSV
table = pd.read_csv(table_path)
#table.head()

In [298]:
# Define main processing logic
def parse_node(uri, prefix=None):
    """Break a prefixed node name into the reference-table fields.

    The text that follows ``prefix`` is URL-decoded and matched against the
    shape ``<element> (<identifier>)<mnemonic>``; for example the decoded
    name ``Heading (AUTH12)HEADING`` yields ``Heading``, ``AUTH12`` and
    ``HEADING``.

    Args:
        uri: Node name to parse; it is converted with ``str`` first.
        prefix: Leading text a node must start with to be parsed. When
            omitted it is ``f"{schema[0]}:"``, as before.

    Returns:
        A dict keyed by ``element_label``, ``identifier_label``,
        ``mnemonic_label`` and ``table_match_label``:

        * name does not start with ``prefix`` (or nothing follows it):
          the element is ``str(uri)``, identifier and mnemonic are ``None``
          and the match flag is ``False``;
        * decoded name does not fit the pattern: the element is the decoded,
          stripped name and the other fields are as above;
        * decoded name fits the pattern: the three captured parts, with the
          match flag ``True`` when ``table`` has a row whose three columns
          (the label names with underscores replaced by spaces) equal them.
    """
    text = str(uri)
    parsed_result = {
        element_label: text,
        identifier_label: None,
        mnemonic_label: None,
        table_match_label: False,
    }

    if prefix is None:
        # NOTE(review): `schema` is not assigned in any visible cell, so this
        # default relies on state left in the kernel — confirm where it is
        # defined. The recorded output shows ':'-prefixed subjects coming back
        # unparsed, which suggests this default did not match them; passing
        # prefix=':' may be what is wanted — TODO confirm.
        prefix = f"{schema[0]}:"

    if not text.startswith(prefix):
        return parsed_result

    # Slice the prefix off the front. Splitting on it would also cut the name
    # wherever the same text happened to recur inside the local part.
    node = text[len(prefix):]
    if not node:
        return parsed_result

    # Decode URL
    decoded_node = urllib.parse.unquote(node)

    # Regex to match the table structure; the group names are the three labels
    match = re.match(
        r"(?P<{}>.+)\s+\((?P<{}>.+)\)(?P<{}>.+)".format(
            element_label, identifier_label, mnemonic_label
        ),
        decoded_node,
    )
    if match is None:
        parsed_result[element_label] = decoded_node.strip()
        return parsed_result

    parsed_result = match.groupdict()
    # Flag the node when the reference table has a row with the same three values
    parsed_result[table_match_label] = not table[
        (table[element_label.replace('_', ' ')] == parsed_result[element_label]) &
        (table[identifier_label.replace('_', ' ')] == parsed_result[identifier_label]) &
        (table[mnemonic_label.replace('_', ' ')] == parsed_result[mnemonic_label])
    ].empty
    return parsed_result

# One flat record per triple, plus each subject's predicate/object pairs.
# The dict keeps subjects in first-seen order, so `subjects` comes out in the
# same order as before without rescanning `parsed_results` for every subject.
parsed_results = []
predicate_objects_by_subject = {}

# Use the ontology graph's own namespace manager: a later cell rebinds the
# global name `namespace_manager` to the mapping graph, which would make a
# re-run of this cell shorten URIs with the wrong prefixes.
ontology_namespaces = g.namespace_manager

for row in result:
    subject = ontology_namespaces.normalizeUri(row.subject)
    predicate = ontology_namespaces.normalizeUri(row.predicate)
    # Only URI objects are shortened; any other term is kept as returned.
    # Named `obj` so the builtin `object` is not shadowed for the rest of the notebook.
    obj = ontology_namespaces.normalizeUri(row.object) if isinstance(row.object, URIRef) else row.object

    parsed_result = parse_node(subject)
    parsed_result.update({
        'subject': subject,
        'predicate': predicate,
        'object': obj
    })
    parsed_results.append(parsed_result)

    predicate_objects_by_subject.setdefault(subject, []).append({
        'predicate': str(predicate),
        'object': str(obj)
    })

# One entry per distinct subject, carrying all of its predicate/object pairs
subjects = [
    {'name': name, 'predicateObjects': pairs}
    for name, pairs in predicate_objects_by_subject.items()
]

In [299]:
# (prefix, namespace) pairs for the RML-specific vocabularies
rml = ('rml', Namespace('http://semweb.mmlab.be/ns/rml#'))
rr = ('rr', Namespace('http://www.w3.org/ns/r2rml#'))
ql = ('ql', Namespace('http://semweb.mmlab.be/ns/ql#'))
csvw = ('csvw', Namespace('http://www.w3.org/ns/csvw#'))

# Empty prefix for the mapping document's own terms
maps = ('', Namespace(URIRef(f"{base_mapping_uri}#")))

# Mapping graph rooted at the mapping base URI
mapping = Graph(base=base_mapping_uri)
source_path = 'gbad/mapping/source/authority_head_6.csv'

# Register every prefix on the mapping graph
# NOTE(review): `schema` is not assigned in any visible cell; this relies on
# state left in the kernel — confirm where it is defined.
namespace_manager = mapping.namespace_manager
for prefix_binding in (rico, schema, rdf, rdfs, owl, rml, rr, ql, csvw, maps):
    namespace_manager.bind(*prefix_binding)

# Blank nodes referenced only by the commented-out template below
agent_name_map = BNode()
agent_map = BNode()

# Triples for :AgentNameAUTH13
agent_name_triples_map = maps[1].AgentNameAUTH13
mapping.add((agent_name_triples_map, RDF.type, rr[1].TriplesMap))
#mapping.add((mapping[1].AgentNameAUTH13, rml[1].logicalSource, Literal(source_path)))

#mapping.add((agent_name_map, rr.subjectMap, URIRef("{REFA}/AgentName")))
#mapping.add((agent_name_map, rr.class_, rico.AgentName))

#predicate_object_map1 = BNode()
#mapping.add((agent_name_map, rr.predicateObjectMap, predicate_object_map1))
#mapping.add((predicate_object_map1, rr.predicate, rico.normalizedValue))
#mapping.add((predicate_object_map1, rr.objectMap, rdflib.term.URIRef("{HEADING}")))

# Triples for :AgentAUTH13
#mapping.add((mapping.AgentAUTH13, RDF.type, rr.TriplesMap))
#mapping.add((mapping.AgentAUTH13, rml.logicalSource, source_node))

#mapping.add((agent_map, rr.subjectMap, URIRef("{REFA}")))
#mapping.add((agent_map, rr.class_, rico.Agent))

#predicate_object_map2 = BNode()
#mapping.add((agent_map, rr.predicateObjectMap, predicate_object_map2))
#mapping.add((predicate_object_map2, rr.predicate, RDFS.label))
#mapping.add((predicate_object_map2, rr.objectMap, URIRef("{HEADING}")))

# Serialize and print the RDF graph
mapping_turtle = mapping.serialize(format='turtle')
print(mapping_turtle)

@base <http://gbad.archives.gov.on.ca/schema/mapping> .
@prefix : <http://gbad.archives.gov.on.ca/schema/mapping#> .
@prefix rr: <http://www.w3.org/ns/r2rml#> .

<#AgentNameAUTH13> a rr:TriplesMap .




In [300]:
# One dataframe row per parsed triple
parsed_df = pd.DataFrame(parsed_results)

# Keep rdf:type triples whose object is written with the rico prefix
rico_prefix = f"{rico[0]}:"
is_type_triple = parsed_df['predicate'] == 'rdf:type'
has_rico_object = parsed_df['object'].apply(lambda value: str(value).startswith(rico_prefix))

# Display one row per distinct (element, identifier, mnemonic) combination
parsed_df[is_type_triple & has_rico_object].drop_duplicates(
    subset=[element_label, identifier_label, mnemonic_label]
)


Unnamed: 0,Authority_Data_Element,Authority_Field_Identifier,Authority_Mnemonic,has_table_match,subject,predicate,object
3,:Heading%20%28AUTH12%29HEADING,,,False,:Heading%20%28AUTH12%29HEADING,rdf:type,rico:Agent
4,:Authority%20Type%20%28AUTH03%29AUTHTP,,,False,:Authority%20Type%20%28AUTH03%29AUTHTP,rdf:type,rico:CorporateBodyType
8,:Predecessor%20Agencies%20%28AUTH15%29PRED,,,False,:Predecessor%20Agencies%20%28AUTH15%29PRED,rdf:type,rico:Agent
9,:Controlling%20Agencies%20%28AUTH08%29CONTAG,,,False,:Controlling%20Agencies%20%28AUTH08%29CONTAG,rdf:type,rico:Agent
14,:Archivist%20%28AUTH02%29ARCHAU,,,False,:Archivist%20%28AUTH02%29ARCHAU,rdf:type,rico:Person
34,:Parallel%20Names%20%28AUTH14%29PAR,,,False,:Parallel%20Names%20%28AUTH14%29PAR,rdf:type,rico:AgentName
41,:Successor%20Agencies%20%28AUTH20%29SUC,,,False,:Successor%20Agencies%20%28AUTH20%29SUC,rdf:type,rico:Agent
42,:Parallel%20Name,,,False,:Parallel%20Name,rdf:type,rico:Type
48,:Authority%20Record,,,False,:Authority%20Record,rdf:type,rico:Record
53,:Variant%20Names%20%28AUTH21%29VAR,,,False,:Variant%20Names%20%28AUTH21%29VAR,rdf:type,rico:AgentName


In [301]:
# Distinct rows whose node was found in the reference table
found_in_table = parsed_df[table_match_label] == True
parsed_df[found_in_table].drop_duplicates()

Unnamed: 0,Authority_Data_Element,Authority_Field_Identifier,Authority_Mnemonic,has_table_match,subject,predicate,object
