Originally generated with ChatGPT-4o on 2024-07-09, with modifications

In [178]:
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, OWL
import pandas as pd
import re
import urllib.parse

In [179]:
# Load the authority-to-RiC-O model (Turtle serialisation) into a fresh graph
graph_path = '../../schema/authority/general_authority_to_ric-o_model_v11_-_9_july_2024.ttl'
g = Graph()
g.parse(graph_path, format="turtle")

# (prefix, namespace IRI) pairs for the two project-specific vocabularies;
# later cells read the prefix via rico[0] / schema[0]
rico = ('rico', 'https://www.ica.org/standards/RiC/ontology#')
schema = ('auth', 'http://gbad.archives.gov.on.ca/schema/authority#')

# Register every prefix on the graph's namespace manager so that URIs can be
# shortened to prefix:local form
namespace_manager = g.namespace_manager
for _prefix, _namespace in (
    (rico[0], Namespace(rico[1])),
    (schema[0], Namespace(schema[1])),
    ('rdf', RDF),
    ('rdfs', RDFS),
    ('owl', OWL),
):
    namespace_manager.bind(_prefix, _namespace)

In [180]:
# SPARQL SELECT that returns every triple in the graph, one row per triple,
# with the columns named ?subject, ?predicate and ?object
query = """
SELECT ?subject ?predicate ?object
WHERE {
  ?subject ?predicate ?object.
}
"""
# Run the query against the loaded graph; rows are consumed by a later cell
result = g.query(query)

In [181]:
# Keys used for the parsed node parts. They are written with underscores
# because they double as regex group names; replacing '_' with ' ' gives the
# matching column header in the mapping CSV.
element_label = 'Authority_Data_Element'
identifier_label = 'Authority_Field_Identifier'
mnemonic_label = 'Authority_Mnemonic'

# Load the authority-to-RiC mapping table and preview the first rows
table_path = '../../schema/authority/Authorities to RiC Data Elements Mapping.csv'
table = pd.read_csv(filepath_or_buffer=table_path)
table.head()

Unnamed: 0,Authority Field Identifier,Authority Data Element,Authority Mnemonic,RiC-CM ID,RiC-CM Name,RiC-O Name,RiC-O IRI,RiC-O Type,Notes,Requires HTML Label?,Combine Multiple Instances?
0,AUTH01,Agency Reference Code,REFA,RiC-A22,Identifier,rico:Identifier,https://www.ica.org/standards/RiC/ontology#Ide...,Class,Use object property rico:hasIdentifierType to ...,,
1,AUTH02,Archivist,ARCHAU,RiC-E08,Person,rico:Person,https://www.ica.org/standards/RiC/ontology#Person,Class,Modelling the describing archivist using the c...,,
2,AUTH03,Authority Type,AUTHTP,RiC-A12,Corporate Body Type,rico:CorporateBodyType,https://www.ica.org/standards/RiC/ontology#Cor...,Class,RiC does not have a generic authority type ent...,,
3,AUTH04,Authorizing Agent,AA,RiC-E17,Mandate,rico:Mandate,https://www.ica.org/standards/RiC/ontology#Man...,Class,,,
4,AUTH05,Biographical Sketch or Administrative History,ADM,RiC-A21,History,rico:history,https://www.ica.org/standards/RiC/ontology#his...,Datatype property,It is also possible (but more complex) to mode...,Y,Y


In [191]:
# Define main processing logic
table_match_label = 'has_table_match'


def parse_node(uri, mapping_table=None, prefix=None, labels=None):
    """Split a prefixed node name into element, identifier and mnemonic parts.

    A node such as ``auth:Heading%20%28AUTH12%29HEADING`` is URL-decoded to
    ``Heading (AUTH12)HEADING`` and split into its three parts, which are then
    looked up as one row in the mapping table.

    Parameters
    ----------
    uri
        The node to parse; it is converted with ``str()`` before use.
    mapping_table : pandas.DataFrame, optional
        Table to look the parts up in. Its column names must be the label
        keys with ``_`` replaced by a space. Defaults to the notebook-level
        ``table``.
    prefix : str, optional
        Leading text that marks a node as parseable. Defaults to
        ``f"{schema[0]}:"`` from the notebook-level ``schema``.
    labels : tuple of str, optional
        ``(element, identifier, mnemonic)`` keys for the returned dict; they
        are also used as regex group names, so they must be valid
        identifiers. Defaults to the notebook-level ``element_label``,
        ``identifier_label`` and ``mnemonic_label``.

    Returns
    -------
    dict
        The three label keys plus ``table_match_label``.

        * Node does not start with ``prefix`` (or has nothing after it): the
          element is ``str(uri)``, the other parts are ``None``, match False.
        * Decoded node does not have the ``Name (ID)MNEMONIC`` shape: the
          element is the stripped decoded node, the other parts are ``None``,
          match False.
        * Otherwise: the three parsed parts, and match is True only when a
          table row equals all three.
    """
    # Defaults are resolved at call time so the function can be defined
    # before the notebook-level names exist.
    if mapping_table is None:
        mapping_table = table
    if prefix is None:
        prefix = f"{schema[0]}:"
    if labels is None:
        labels = (element_label, identifier_label, mnemonic_label)
    element_key, identifier_key, mnemonic_key = labels

    text = str(uri)
    parsed_result = {
        element_key: text,
        identifier_key: None,
        mnemonic_key: None,
        table_match_label: False,
    }
    if not text.startswith(prefix):
        return parsed_result

    # Slice the prefix off the front only; splitting on the prefix would
    # discard part of a local name that happens to contain it again.
    node = text[len(prefix):]
    if not node:
        return parsed_result

    # Decode URL
    decoded_node = urllib.parse.unquote(node)

    # Regex to match the table structure: "<element> (<identifier>)<mnemonic>"
    match = re.match(
        r"(?P<{}>.+)\s+\((?P<{}>.+)\)(?P<{}>.+)".format(*labels),
        decoded_node,
    )
    if match is None:
        parsed_result[element_key] = decoded_node.strip()
        return parsed_result

    parsed_result = match.groupdict()
    # A match requires a single row that agrees on all three parts
    matching_rows = mapping_table[
        (mapping_table[element_key.replace('_', ' ')] == parsed_result[element_key]) &
        (mapping_table[identifier_key.replace('_', ' ')] == parsed_result[identifier_key]) &
        (mapping_table[mnemonic_key.replace('_', ' ')] == parsed_result[mnemonic_key])
    ]
    parsed_result[table_match_label] = not matching_rows.empty
    return parsed_result

# One record per triple: the parsed subject parts plus the shortened
# subject / predicate / object (this list feeds the dataframe built later)
parsed_results = []
# Shortened subject -> its predicate/object pairs. A dict keeps first-seen
# subject order and groups in a single pass over the triples.
predicate_objects_by_subject = {}

# Walk every triple returned by the query
for row in result:
    subject = namespace_manager.normalizeUri(row.subject)
    predicate = namespace_manager.normalizeUri(row.predicate)
    # Only URI objects are shortened; any other object is kept as returned.
    # Named `obj` so the builtin `object` is not rebound in the notebook namespace.
    obj = namespace_manager.normalizeUri(row.object) if isinstance(row.object, URIRef) else row.object

    parsed_result = parse_node(subject)
    parsed_result.update({
        'subject': subject,
        'predicate': predicate,
        'object': obj
    })
    parsed_results.append(parsed_result)

    predicate_objects_by_subject.setdefault(subject, []).append({
        'predicate': str(predicate),
        'object': str(obj)
    })

# Per-subject summary: each subject name with all of its predicate/object pairs
subjects = [
    {'name': name, 'predicateObjects': pairs}
    for name, pairs in predicate_objects_by_subject.items()
]

subjects

[{'name': 'auth:Heading%20%28AUTH12%29HEADING',
  'predicateObjects': [{'predicate': 'rico:hasOrHadAgentName',
    'object': 'auth:Variant%20Names%20%28AUTH21%29VAR'},
   {'predicate': 'rdf:type', 'object': 'rico:Agent'},
   {'predicate': 'rico:hasOrHadController',
    'object': 'auth:Controlling%20Agencies%20%28AUTH08%29CONTAG'},
   {'predicate': 'rico:hasDeathDate',
    'object': 'auth:Dates%20Of%20Existence%20%28AUTH10%29DATEEX'},
   {'predicate': 'rdf:type', 'object': 'owl:NamedIndividual'},
   {'predicate': 'rico:hasOrHadIdentifier',
    'object': 'auth:Agency%20Reference%20Code%20%28AUTH01%29REFA'},
   {'predicate': 'rico:hasBirthDate',
    'object': 'auth:Dates%20Of%20Existence%20%28AUTH10%29DATEEX'},
   {'predicate': 'rico:isSuccessorOf',
    'object': 'auth:Predecessor%20Agencies%20%28AUTH15%29PRED'},
   {'predicate': 'rico:isOrWasDescribedBy',
    'object': 'auth:Authority%20Record'},
   {'predicate': 'rico:generalDescription',
    'object': 'Comment\xa0(AUTH07)CMTAU'},
   {'

In [185]:
# Build a dataframe with one row per parsed triple
parsed_df = pd.DataFrame(parsed_results)

# Show the rdf:type statements whose object starts with the rico prefix,
# keeping the first row for each (element, identifier, mnemonic) combination
is_type_statement = parsed_df['predicate'].eq('rdf:type')
has_rico_object = parsed_df['object'].map(lambda value: str(value).startswith(f"{rico[0]}:"))
parsed_df.loc[is_type_statement & has_rico_object].drop_duplicates(
    subset=[element_label, identifier_label, mnemonic_label]
)


Unnamed: 0,Authority_Data_Element,Authority_Field_Identifier,Authority_Mnemonic,has_table_match,subject,predicate,object
3,Heading,AUTH12,HEADING,True,auth:Heading%20%28AUTH12%29HEADING,rdf:type,rico:Agent
4,Authority Type,AUTH03,AUTHTP,True,auth:Authority%20Type%20%28AUTH03%29AUTHTP,rdf:type,rico:CorporateBodyType
8,Predecessor Agencies,AUTH15,PRED,True,auth:Predecessor%20Agencies%20%28AUTH15%29PRED,rdf:type,rico:Agent
9,Controlling Agencies,AUTH08,CONTAG,False,auth:Controlling%20Agencies%20%28AUTH08%29CONTAG,rdf:type,rico:Agent
14,Archivist,AUTH02,ARCHAU,True,auth:Archivist%20%28AUTH02%29ARCHAU,rdf:type,rico:Person
34,Parallel Names,AUTH14,PAR,False,auth:Parallel%20Names%20%28AUTH14%29PAR,rdf:type,rico:AgentName
41,Successor Agencies,AUTH20,SUC,True,auth:Successor%20Agencies%20%28AUTH20%29SUC,rdf:type,rico:Agent
42,Parallel Name,,,False,auth:Parallel%20Name,rdf:type,rico:Type
48,Authority Record,,,False,auth:Authority%20Record,rdf:type,rico:Record
53,Variant Names,AUTH21,VAR,True,auth:Variant%20Names%20%28AUTH21%29VAR,rdf:type,rico:AgentName


In [184]:
# Rows whose parsed parts were found in the mapping table, without exact duplicates
matched_rows = parsed_df[table_match_label].eq(True)
parsed_df.loc[matched_rows].drop_duplicates()

Unnamed: 0,Authority_Data_Element,Authority_Field_Identifier,Authority_Mnemonic,has_table_match,subject,predicate,object
0,Heading,AUTH12,HEADING,True,auth:Heading%20%28AUTH12%29HEADING,rico:hasOrHadAgentName,auth:Variant%20Names%20%28AUTH21%29VAR
3,Heading,AUTH12,HEADING,True,auth:Heading%20%28AUTH12%29HEADING,rdf:type,rico:Agent
4,Authority Type,AUTH03,AUTHTP,True,auth:Authority%20Type%20%28AUTH03%29AUTHTP,rdf:type,rico:CorporateBodyType
7,Heading,AUTH12,HEADING,True,auth:Heading%20%28AUTH12%29HEADING,rico:hasOrHadController,auth:Controlling%20Agencies%20%28AUTH08%29CONTAG
8,Predecessor Agencies,AUTH15,PRED,True,auth:Predecessor%20Agencies%20%28AUTH15%29PRED,rdf:type,rico:Agent
10,Heading,AUTH12,HEADING,True,auth:Heading%20%28AUTH12%29HEADING,rico:hasDeathDate,auth:Dates%20Of%20Existence%20%28AUTH10%29DATEEX
14,Archivist,AUTH02,ARCHAU,True,auth:Archivist%20%28AUTH02%29ARCHAU,rdf:type,rico:Person
17,Authorizing Agent,AUTH04,AA,True,auth:Authorizing%20Agent%20%28AUTH04%29AA,rdf:type,owl:NamedIndividual
19,Heading,AUTH12,HEADING,True,auth:Heading%20%28AUTH12%29HEADING,rdf:type,owl:NamedIndividual
20,Agency Reference Code,AUTH01,REFA,True,auth:Agency%20Reference%20Code%20%28AUTH01%29REFA,rdf:type,owl:NamedIndividual
