### This notebook converts the selected paths into an rdf_Craft-readable format for YARRRML and RML conversion

JSON structure

In [1]:
from pydantic import BaseModel
# One node of the generated schema. build_schema_from_paths creates one Entity
# per distinct node URI it encounters while walking the selected paths.
class Entity (BaseModel):
    # Numeric identifier; Predicate.from_id / Predicate.to_id refer to it.
    id: int
    # URI of the node itself (build_schema_from_paths stores the node URI here).
    rdf_type: str # object_uri
    # IRI template for the node; build_schema_from_paths fills it as
    # "aidava-resource:<path fragments>/$(<column name>)".
    uri_pattern: str # iri{id}

# One directed edge of the generated schema, connecting two Entity nodes by id.
class Predicate(BaseModel):
    # Entity.id of the node the edge starts at.
    from_id: int
    # Entity.id of the node the edge points to.
    to_id: int
    # URI of the predicate that labels the edge.
    uri: str

# Complete graph description: all nodes plus the edges between them.
# This is the object that the notebook finally dumps as JSON.
class Schema (BaseModel):
    # Every distinct node, in order of first appearance.
    nodes: list[Entity]
    # Every edge between the nodes above (referenced through their ids).
    edges: list[Predicate]

Configuration: the response folder containing the path-ID selections, and the analysis folder containing the OAT (main type) needed to recreate the full paths

In [2]:
import os
import json
import sys
from typing import List, Tuple, Optional


# Root of the local data directory; every input location below derives from it,
# so moving the data only requires changing this one line.
DATA_ROOT = "C:/Users/elias/Documents/ANI/Bachelor_Baby/llm_assistant/Data"

# Folder with the "<name>_response.txt" files (JSON with the selected path ids).
# Other data sets that were used with this notebook:
#   f"{DATA_ROOT}/raw_data/exp_two_hop"
#   f"{DATA_ROOT}/curated_dataset/exp_two_hop"
response_folder = f"{DATA_ROOT}/raw_data/path_selection/path_selection_mini_2_hop_short_URI_3_smpl"

# Folder with the "<name>_analysis.txt" files whose first line names the main
# entity type. Alternative location: f"{DATA_ROOT}/curated_dataset/analysis"
analysis_dir = f"{DATA_ROOT}/raw_data/analysis"

# Hop count handed to PathFinder.find_paths when the candidate paths are rebuilt.
hop_count_exp = 2

Import the path-finding logic and define a helper that reads the OAT (main type) from an analysis file

In [3]:

# === SETUP ===
# Make the project-local Path_Finding_Logic package importable from this notebook.
# The current working directory is used as the notebook directory
# (this assumes the kernel was started in the notebook's folder - TODO confirm).
notebook_dir = os.getcwd()

# Append the parent directory to sys.path (only once, so re-running the cell
# does not add duplicate entries). This has to happen before the import below.
parent_dir = os.path.abspath(os.path.join(notebook_dir, ".."))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
# Presumably Path_Finding_Logic lives in that parent directory - verify.
from Path_Finding_Logic.main import PathFinder


def get_main_entity_type(analysis_file: str) -> str:
    """Read the main entity type announced on the first line of an analysis file.

    Args:
        analysis_file: Path of a UTF-8 text file.

    Returns:
        The first line with every "Main Entity Type:" marker removed and the
        surrounding whitespace stripped, or "" when the (stripped) first line
        does not start with that marker.
    """
    marker = "Main Entity Type:"
    with open(analysis_file, "r", encoding="utf-8") as handle:
        header = handle.readline().strip()

    if not header.startswith(marker):
        return ""
    return header.replace(marker, "").strip()


Get the last fragment of each class URI; the fragments are used to build the `uri_pattern` under the `aidava-resource:` prefix

In [4]:
import re

def get_last_fragment(uri: str) -> str:
    """Return the part of ``uri`` that follows its last ``/`` or ``#``.

    The whole string is returned when it contains neither delimiter, and an
    empty string is returned when it ends with one.
    """
    # re.split always yields at least one piece, so the unpacking cannot fail.
    *_, tail = re.split(r'[\/#]', uri)
    return tail

`build_schema_from_paths` function: builds the node/edge schema from the selected paths

In [15]:
def build_schema_from_paths(column_mappings, path_output: list[list[tuple[str, Optional[str]]]]) -> Schema:
    """Build the node/edge ``Schema`` for the paths selected for each column.

    Args:
        column_mappings: Indexable collection of mapping dicts. Entry ``k`` must
            describe the column that ``path_output[k]`` belongs to and must
            provide a ``"column_name"`` key.
        path_output: One path per column. A path is a list of
            ``(node_uri, predicate_uri)`` tuples; a non-``None`` predicate is
            treated as the edge from the previous tuple's node to this node.

    Returns:
        A ``Schema`` holding one ``Entity`` per distinct node URI (ids start at 1
        and follow the order of first appearance) and one ``Predicate`` per hop
        that carries a predicate. Paths whose column is named ``"row_id"`` are
        skipped entirely.

    Raises:
        IndexError: If ``path_output`` has more entries than ``column_mappings``.
        KeyError: If a mapping has no ``"column_name"`` entry.
    """
    uri_to_id: dict[str, int] = {}
    nodes: list[Entity] = []
    edges: list[Predicate] = []
    next_id = 1

    for column, path in enumerate(path_output):
        print("Column: ", column)
        print("Path: ", path)

        column_name = column_mappings[column]["column_name"]
        print("Column_name: ", column_name)
        if column_name == "row_id":
            continue

        # The pattern grows by one fragment per hop, e.g. "A", "A/B", "A/B/C".
        uri_pattern = ""
        previous_node_uri = None
        for current_node_uri, predicate_uri in path:
            frag = get_last_fragment(current_node_uri)
            uri_pattern = f"{uri_pattern}/{frag}" if uri_pattern != "" else frag

            # Register the node the first time its URI is seen. A node shared by
            # several paths keeps the pattern of the first path that reached it.
            if current_node_uri not in uri_to_id:
                uri_to_id[current_node_uri] = next_id
                nodes.append(
                    Entity(
                        id=next_id,
                        uri_pattern=f"aidava-resource:{uri_pattern}/$({column_name})",
                        rdf_type=f"{current_node_uri}",
                    )
                )
                next_id += 1

            # The previous node was registered in the previous iteration, so it
            # is always present in uri_to_id; the first tuple has no predecessor.
            if predicate_uri is not None and previous_node_uri is not None:
                edges.append(
                    Predicate(
                        from_id=uri_to_id[previous_node_uri],
                        to_id=uri_to_id[current_node_uri],
                        uri=predicate_uri,
                    )
                )

            previous_node_uri = current_node_uri

    return Schema(nodes=nodes, edges=edges)

In [None]:
# === MAIN LOOP ===
# For every "<name>_response.txt" in response_folder:
#   1. read the main entity type from "<name>_analysis.txt" in analysis_dir,
#   2. rebuild the candidate paths for that entity with PathFinder,
#   3. resolve each column's 1-indexed path_id to the actual path,
#   4. build the schema from the resolved paths and print it as JSON.
for file in os.listdir(response_folder):
    if not file.endswith("_response.txt"):
        continue

    # Strip only the trailing suffix; str.replace would also remove the same
    # text if it occurred elsewhere in the file name.
    base_name = file.removesuffix("_response.txt")
    json_path = os.path.join(response_folder, file)

    # Load analysis file and get main entity
    analysis_file = os.path.join(analysis_dir, f"{base_name}_analysis.txt")
    main_entity = get_main_entity_type(analysis_file)
    if not main_entity:
        print(f"[!] Could not determine main entity from {analysis_file}, skipping.")
        continue

    # Get paths from PathFinder
    # NOTE(review): the PathFinder is rebuilt from the same TTL file for every
    # response file; it could probably be created once before the loop if it
    # keeps no per-query state - confirm against Path_Finding_Logic.
    pathfinder = PathFinder(ttl_file="aidava-sphn.ttl")
    paths = pathfinder.find_paths(hop_count=hop_count_exp, target_class=main_entity)

    print("Reading path_id mappings from path: ", json_path)
    # Load mapping JSON
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    column_mappings = data.get("column_mappings", [])

    # Keep the mappings and their paths in two parallel lists: the schema
    # builder pairs them by position, so a skipped mapping must be dropped from
    # both or every following column would receive the wrong column name.
    selected_mappings = []
    path_output = []
    for mapping in column_mappings:
        print(mapping)
        path_id = mapping["path"]["path_id"] - 1  # Convert from 1-indexed

        # Reject ids below 1 as well: a negative index would silently select a
        # path counted from the end of the list.
        if not 0 <= path_id < len(paths):
            print(
                f"[!] Path ID {path_id+1} out of range for {base_name} with length {len(paths)}"
            )
            continue

        print(paths[path_id])
        selected_mappings.append(mapping)
        path_output.append(paths[path_id])

    schema = build_schema_from_paths(selected_mappings, path_output)
    print(json.dumps(schema.model_dump(), indent=2))

Reading path_id mappings from path:  C:/Users/elias/Documents/ANI/Bachelor_Baby/llm_assistant/Data/raw_data/path_selection/path_selection_mini_2_hop_short_URI_3_smpl\administrative_cases_response.txt
{'column_name': 'row_id', 'path': {'full_path': 'https://biomedit.ch/rdf/sphn-ontology/sphn#ProblemCondition/sphn:hasSubjectPseudoIdentifier/sphn:hasIdentifier', 'path_id': 89}, 'justification': "The 'row_id' is an internal unique identifier for each row, mapping best to a unique identifier property; path 89 leads from ProblemCondition via hasSubjectPseudoIdentifier to hasIdentifier, which semantically fits a unique string ID. No better 2-hop path offers more semantic accuracy for this internal ID.", 'transformation': 'Convert numeric row_id to string and ensure uniqueness if used as URI fragment.'}
{'column_name': 'patient_id', 'path': {'full_path': 'https://biomedit.ch/rdf/sphn-ontology/sphn#ProblemCondition/AIDAVA:hasPatient/AIDAVA:hasPatientIdentifier', 'path_id': 23}, 'justification':