Originally generated with ChatGPT-4o on 2024-07-09, with modifications

In [45]:
from rdflib import Graph, Namespace, URIRef, Literal, BNode
from rdflib.namespace import RDF, RDFS, OWL, DCTERMS
import pandas as pd
import re
import urllib.parse

In [46]:
# Define GBAD schema ontology
base_gbad_uri = 'http://gbad.archives.gov.on.ca'
base_schema_uri = URIRef(f"{base_gbad_uri}/schema")
base_auth_uri = URIRef(f"{base_schema_uri}/authority")
base_add_uri = URIRef(f"{base_schema_uri}/description-listings")
base_mapping_uri = URIRef(f"{base_schema_uri}/mapping")

In [47]:
# Choose ontology to map
base_uri = base_auth_uri
graph_path = '../../schema/authority/general_authority_to_ric-o_model_v11_-_9_july_2024.ttl'

# Set reference table data
table_path = '../../schema/authority/Authorities to RiC Data Elements Mapping.csv'
element_label = str('Authority Data Element').replace(' ','_')
identifier_label = str('Authority Field Identifier').replace(' ','_')
mnemonic_label = str('Authority Mnemonic').replace(' ','_')
rico_name_label = str('RiC-O Name').replace(' ','_')

In [48]:
# Create the input RDF graph
g = Graph(base = base_uri)
g.parse(graph_path,
        format="turtle")  # Adjust the format as needed

# Define custom prefixes
rico_uri = 'https://www.ica.org/standards/RiC/ontology#'
rico = ('rico', Namespace(rico_uri))
ns = ('', Namespace(URIRef(f"{base_uri}#")))

# Define common prefixes
rdf = ('rdf', RDF)
rdfs = ('rdfs', RDFS)
owl = ('owl', OWL)

# Bind prefixes to namespaces
g_namespace_manager = g.namespace_manager
g_namespace_manager.bind(*rico)
g_namespace_manager.bind(*rdf)
g_namespace_manager.bind(*rdfs)
g_namespace_manager.bind(*owl)
g_namespace_manager.bind(*ns)

#print(g.serialize(format='turtle'))

In [49]:
# Query to get all subjects, predicates, and objects
query = f"""
SELECT ?subject ?predicate ?object
WHERE {{
  ?subject ?predicate ?object.
}}
"""
# Execute the query
result = g.query(query)

In [50]:
# Define reference table
table = pd.read_csv(table_path)
table_match_label = 'has_table_match'
uri_mask_label = str('URI Mask').replace(' ', '_')
table.head()

Unnamed: 0,Authority Field Identifier,Authority Data Element,Authority Mnemonic,RiC-CM ID,RiC-CM Name,RiC-O Name,RiC-O IRI,RiC-O Type,Notes,Requires HTML Label?,Combine Multiple Instances?,URI Mask
0,AUTH01,Agency Reference Code,REFA,RiC-A22,Identifier,rico:Identifier,https://www.ica.org/standards/RiC/ontology#Ide...,Class,Use object property rico:hasIdentifierType to ...,,,Identifier/{REFA}
1,AUTH02,Archivist,ARCHAU,RiC-E08,Person,rico:Person,https://www.ica.org/standards/RiC/ontology#Person,Class,Modelling the describing archivist using the c...,,,
2,AUTH03,Authority Type,AUTHTP,RiC-A12,Corporate Body Type,rico:CorporateBodyType,https://www.ica.org/standards/RiC/ontology#Cor...,Class,RiC does not have a generic authority type ent...,,,
3,AUTH04,Authorizing Agent,AA,RiC-E17,Mandate,rico:Mandate,https://www.ica.org/standards/RiC/ontology#Man...,Class,,,,
4,AUTH05,Biographical Sketch or Administrative History,ADM,RiC-A21,History,rico:history,https://www.ica.org/standards/RiC/ontology#his...,Datatype property,It is also possible (but more complex) to mode...,Y,Y,


In [51]:
# Define main processing logic
def parse_node(uri):
    parsed_result = {element_label: None,
                     identifier_label: None,
                     mnemonic_label: None,
                     uri_mask_label: None,
                     table_match_label: False}
    subject = g_namespace_manager.normalizeUri(uri)
    prefix = ':'
    node = str(subject)[1:] if str(subject).startswith(prefix) else None
    if node:
        # Decode URL
        decoded_node = urllib.parse.unquote(node)
        
        # Regex to match the table structure
        match = re.match(r"(?P<{}>.+)\s+\((?P<{}>.+)\)(?P<{}>.+)"
                         .format(element_label, identifier_label, mnemonic_label)
                         , decoded_node)

        if match:
            parsed_result = match.groupdict()
            table_match = table[
                (table[element_label.replace('_',' ')] == parsed_result[element_label]) &
                (table[identifier_label.replace('_',' ')] == parsed_result[identifier_label]) &
                (table[mnemonic_label.replace('_',' ')] == parsed_result[mnemonic_label])
            ]
            uri_mask = table_match[uri_mask_label.replace('_',' ')].squeeze()
            parsed_result[uri_mask_label] = str(uri_mask) if isinstance(uri_mask, str) else None
            parsed_result[table_match_label] = not table_match.empty
        else:
            parsed_result[element_label] = decoded_node.strip()

    return parsed_result

# List to hold the parsed results
parsed_results = []
subjects = []

# Process the results and create new triples
for row in result:
    subject = row.subject
    predicate = row.predicate
    object = row.object

    parsed_result = parse_node(subject)
    parsed_result.update({
        'subject': subject,
        'predicate': predicate,
        'object': object
    })
    parsed_results.append(parsed_result)

    #if subject not in subjects:
    #    subjects.append(subject)
    
    # Example: create new triples by modifying the predicate or object
    #new_predicate = URIRef("http://example.org/newPredicate")
    #new_object = Literal(f"Modified: {object}")

    # Add new triples to the graph
    #g.add((subject, new_predicate, new_object))

# Save the updated graph to a file
#g.serialize(destination="updated_data.ttl", format="turtle")
#for i, subject in enumerate(subjects):
#    subjects[i] = {
#        'name': subject,
#        'predicateObjects': [
#            {
#                'predicate': str(parsed_result['predicate']),
#                'object': str(parsed_result['object'])
#            }
#            for parsed_result in parsed_results if parsed_result['subject'] == subject
#        ]
#    }
#subjects
parsed_results

[{'Authority_Data_Element': 'Authority Record',
  'Authority_Field_Identifier': None,
  'Authority_Mnemonic': None,
  'URI_Mask': None,
  'has_table_match': False,
  'subject': rdflib.term.URIRef('http://gbad.archives.gov.on.ca/schema/authority#Authority%20Record'),
  'predicate': rdflib.term.URIRef('https://www.ica.org/standards/RiC/ontology#hasDocumentaryFormType'),
  'object': rdflib.term.URIRef('http://gbad.archives.gov.on.ca/schema/authority#Authority%20Record')},
 {'Authority_Data_Element': 'Predecessor Agencies',
  'Authority_Field_Identifier': 'AUTH15',
  'Authority_Mnemonic': 'PRED',
  'URI_Mask': '{PRED}',
  'has_table_match': True,
  'subject': rdflib.term.URIRef('http://gbad.archives.gov.on.ca/schema/authority#Predecessor%20Agencies%20%28AUTH15%29PRED'),
  'predicate': rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
  'object': rdflib.term.URIRef('https://www.ica.org/standards/RiC/ontology#Agent')},
 {'Authority_Data_Element': 'Agent Identifier',
  'A

In [52]:
# Convert the parsed results to a dataframe
parsed_df = pd.DataFrame(parsed_results)

def normalize_uri(uri):
  if isinstance(uri, URIRef):
    return g_namespace_manager.normalizeUri(uri)
  return None

subjects_df = parsed_df[
    (parsed_df['predicate'].apply(lambda x: str(normalize_uri(x))) == 'rdf:type') &
    (parsed_df['object'].apply(lambda x: str(normalize_uri(x)).startswith(f"{rico[0]}:"))) &
    (parsed_df['has_table_match']==True)
].drop_duplicates(
    subset=[element_label, identifier_label, mnemonic_label]
).assign(
    **{rico_name_label: parsed_df['object'].apply(lambda x: normalize_uri(x))}
).loc[:,[element_label, identifier_label, mnemonic_label, rico_name_label, 'subject', uri_mask_label]]

def clean_and_combine(row):
  cleaned_element = str(row[element_label]).replace(' ', '')
  cleaned_rico = str(row[rico_name_label])[5:]
  return cleaned_rico + 'From' + cleaned_element   # Combine strings

triplesmap_label = 'TriplesMap'
subjects_df[triplesmap_label] = subjects_df.apply(clean_and_combine, axis=1)
subjects_df

Unnamed: 0,Authority_Data_Element,Authority_Field_Identifier,Authority_Mnemonic,RiC-O_Name,subject,URI_Mask,TriplesMap
1,Predecessor Agencies,AUTH15,PRED,rico:Agent,http://gbad.archives.gov.on.ca/schema/authorit...,{PRED},AgentFromPredecessorAgencies
4,Successor Agencies,AUTH20,SUC,rico:Agent,http://gbad.archives.gov.on.ca/schema/authorit...,,AgentFromSuccessorAgencies
20,Authorizing Agent,AUTH04,AA,rico:Mandate,http://gbad.archives.gov.on.ca/schema/authorit...,,MandateFromAuthorizingAgent
50,Authority Type,AUTH03,AUTHTP,rico:CorporateBodyType,http://gbad.archives.gov.on.ca/schema/authorit...,,CorporateBodyTypeFromAuthorityType
51,Status,AUTH18,STATUSA,rico:RecordState,http://gbad.archives.gov.on.ca/schema/authorit...,,RecordStateFromStatus
54,Heading,AUTH12,HEADING,rico:Agent,http://gbad.archives.gov.on.ca/schema/authorit...,,AgentFromHeading
62,Agency Reference Code,AUTH01,REFA,rico:Identifier,http://gbad.archives.gov.on.ca/schema/authorit...,Identifier/{REFA},IdentifierFromAgencyReferenceCode
75,Rules,AUTH16,RULES,rico:Rule,http://gbad.archives.gov.on.ca/schema/authorit...,,RuleFromRules
78,Variant Names,AUTH21,VAR,rico:AgentName,http://gbad.archives.gov.on.ca/schema/authorit...,,AgentNameFromVariantNames
79,Archivist,AUTH02,ARCHAU,rico:Person,http://gbad.archives.gov.on.ca/schema/authorit...,,PersonFromArchivist


In [53]:
# Initialize an RDF graph
mapping = Graph(base = base_mapping_uri)
source_path = 'gbad/mapping/source/authority_head_6.csv'

# Define RML-specific prefixes
rml = ('rml', Namespace('http://semweb.mmlab.be/ns/rml#'))
rr = ('rr', Namespace('http://www.w3.org/ns/r2rml#'))
ql = ('ql', Namespace('http://semweb.mmlab.be/ns/ql#'))
csvw = ('csvw', Namespace('http://www.w3.org/ns/csvw#'))

# Define custom prefix
maps = ('', Namespace(URIRef(f"{base_mapping_uri}#")))

# Bind prefixes to namespaces
map_namespace_manager = mapping.namespace_manager
map_namespace_manager.bind(*rico)
map_namespace_manager.bind(*rdf)
map_namespace_manager.bind(*rdfs)
map_namespace_manager.bind(*owl)
map_namespace_manager.bind(*rml)
map_namespace_manager.bind(*rr)
map_namespace_manager.bind(*ql)
map_namespace_manager.bind(*csvw)
map_namespace_manager.bind(*maps)

# Define blank nodes and triples
agent_name_map = BNode()
agent_map = BNode()

# Triples for :AgentNameAUTH13
#mapping.add((maps[1].AgentNameAUTH13, RDF.type, rr[1].TriplesMap))

#subjects_df
for i, subject_row in subjects_df.iterrows():
    # Define TriplesMap
    subject = maps[1][subject_row[triplesmap_label]]
    mapping.add((subject, RDF.type, rr[1].TriplesMap))

    # Define Logical Source
    logical_source = BNode()
    mapping.add((subject, rml[1].logicalSource, logical_source))
    mapping.add((logical_source, rml[1].source, Literal(source_path)))
    mapping.add((logical_source, rml[1].referenceFormulation, ql[1].CSV))

    # Define Logical Source
    subject_map = BNode()
    mapping.add((subject, rr[1].subjectMap, subject_map))
    mapping.add((subject_map, rr[1].template, Literal(subject_row[uri_mask_label])))
    rico_class = subject_row[rico_name_label][5:]
    mapping.add((subject_map, rr[1]['class'], rico[1][rico_class]))

    for parsed_result in parsed_results:
        if parsed_result['subject'] == subject_row['subject']:
            predicate = parsed_result['predicate']
            if map_namespace_manager.normalizeUri(predicate).startswith(f"{rico[0]}:"):
                object = parsed_result['object']
                if object in set(subjects_df['subject']):
                    triplesmap = subjects_df[subjects_df['subject']==subject_row['subject']][triplesmap_label].iloc[0]
                    object = maps[1][triplesmap]
                    mapping.add((subject, predicate, object))

#mapping.add((mapping[1].AgentNameAUTH13, rml[1].logicalSource, Literal(source_path)))

#mapping.add((agent_name_map, rr.subjectMap, URIRef("{REFA}/AgentName")))
#mapping.add((agent_name_map, rr.class_, rico.AgentName))

#predicate_object_map1 = BNode()
#mapping.add((agent_name_map, rr.predicateObjectMap, predicate_object_map1))
#mapping.add((predicate_object_map1, rr.predicate, rico.normalizedValue))
#mapping.add((predicate_object_map1, rr.objectMap, rdflib.term.URIRef("{HEADING}")))

# Triples for :AgentAUTH13
#mapping.add((mapping.AgentAUTH13, RDF.type, rr.TriplesMap))
#mapping.add((mapping.AgentAUTH13, rml.logicalSource, source_node))

#mapping.add((agent_map, rr.subjectMap, URIRef("{REFA}")))
#mapping.add((agent_map, rr.class_, rico.Agent))

#predicate_object_map2 = BNode()
#mapping.add((agent_map, rr.predicateObjectMap, predicate_object_map2))
#mapping.add((predicate_object_map2, rr.predicate, RDFS.label))
#mapping.add((predicate_object_map2, rr.objectMap, URIRef("{HEADING}")))

# Serialize and print the RDF graph
print(mapping.serialize(format='turtle'))

@base <http://gbad.archives.gov.on.ca/schema/mapping> .
@prefix : <http://gbad.archives.gov.on.ca/schema/mapping#> .
@prefix ql: <http://semweb.mmlab.be/ns/ql#> .
@prefix rico: <https://www.ica.org/standards/RiC/ontology#> .
@prefix rml: <http://semweb.mmlab.be/ns/rml#> .
@prefix rr: <http://www.w3.org/ns/r2rml#> .

<#AgentFromPredecessorAgencies> a rr:TriplesMap ;
    rml:logicalSource [ rml:referenceFormulation ql:CSV ;
            rml:source "gbad/mapping/source/authority_head_6.csv" ] ;
    rr:subjectMap [ rr:class rico:Agent ;
            rr:template "{PRED}" ] .

<#AgentFromSuccessorAgencies> a rr:TriplesMap ;
    rml:logicalSource [ rml:referenceFormulation ql:CSV ;
            rml:source "gbad/mapping/source/authority_head_6.csv" ] ;
    rr:subjectMap [ rr:class rico:Agent ;
            rr:template "None" ] .

<#AgentNameFromVariantNames> a rr:TriplesMap ;
    rml:logicalSource [ rml:referenceFormulation ql:CSV ;
            rml:source "gbad/mapping/source/authority_head_6.csv"