In [None]:
# Clean up LinkML section creation prefixes

import rdflib

def unify_ns_prefixes(
    input_turtle: str,
    output_turtle: str,
    base_uri: str = "https://example.org/ilcd/",
    old_ns_list = (
        "ILCDex:",
        "ILCDsd:",
        "ILCDlcia:",
        "ILCDmav:",
        "ILCDadmin:",
        "ILCDpi:",
    )
):
    """
    Rewrite a Turtle file so that URIs under several legacy prefixes share one namespace.

    Every URI whose prefix (as resolved by the input graph's namespace manager)
    is listed in ``old_ns_list``, or is an auto-generated ``ns<number>`` prefix,
    is replaced by ``base_uri + local name``. Literals, blank nodes and URIs
    under any other prefix (e.g. ``xsd:``, ``rdf:``) are copied unchanged.
    The result is serialized with ``ilcd:`` bound to ``base_uri``.

    :param input_turtle: Path to the original .ttl file (with multiple prefixes)
    :param output_turtle: Path of the .ttl file to write. It may be the same
                          path as ``input_turtle``; the input is fully parsed
                          before anything is written.
    :param base_uri: Namespace URI the matched URIs are moved to. It is
                     concatenated directly with the local name, so it should
                     end with '/' (or '#').
    :param old_ns_list: Prefix names to unify, with or without the trailing ':'
                        (e.g. "ILCDex:" or "ILCDex").

    NOTE(review): two URIs from different old namespaces that share a local
    name end up as the same URI after rewriting.
    NOTE(review): rdflib presumably generates an ``ns<number>`` prefix for any
    namespace that has no binding in the input, so such URIs are rewritten as
    well — confirm this is intended.
    """
    # Prefixes returned by compute_qname never contain ':', so normalise the
    # configured names once instead of testing both spellings per URI.
    old_prefixes = {name.rstrip(":") for name in old_ns_list}

    # 1) Parse the old Turtle file into an rdflib Graph
    old_g = rdflib.Graph()
    old_g.parse(input_turtle, format="turtle")
    print(f"Loaded graph with {len(old_g)} triples from {input_turtle}")

    # 2) Create the output graph and bind only the prefixes we want to see in it
    new_g = rdflib.Graph()
    new_g.bind("ilcd", rdflib.URIRef(base_uri))
    new_g.bind("xsd", rdflib.URIRef("http://www.w3.org/2001/XMLSchema#"))

    def is_generated_prefix(prefix):
        """True for 'ns1', 'ns2', ... but not for real prefixes that merely start with 'ns'."""
        return prefix.startswith("ns") and prefix[2:].isdigit()

    def unify_uri(u):
        """
        If `u` is a URIRef under one of the old prefixes, return the
        equivalent URIRef under base_uri; otherwise return `u` unchanged.
        """
        if not isinstance(u, rdflib.URIRef):
            return u

        # compute_qname splits the URI into (prefix, namespace, local name).
        try:
            prefix, _namespace, local = old_g.compute_qname(u)
        except Exception:
            # Deliberately broad: a URI that rdflib cannot split must pass
            # through untouched, whatever error is reported for it.
            return u

        if prefix in old_prefixes or is_generated_prefix(prefix):
            return rdflib.URIRef(base_uri + local)

        # Anything else (e.g. xsd:, rdf:, skos:) keeps its URI.
        return u

    # 3) Copy every triple, rewriting subject, predicate and object
    for s, p, o in old_g:
        new_g.add((unify_uri(s), unify_uri(p), unify_uri(o)))

    # 4) Serialize the new graph to Turtle
    new_g.serialize(destination=output_turtle, format="turtle")
    print(f"Rewrote {len(new_g)} triples to {output_turtle} with a single 'ilcd:' prefix.")

# -------------------------------------------------------------------------
# USAGE:
# The datastore is cleaned in place: the same path is read and then rewritten.
unify_ns_prefixes(
    "data/rdf/epd_rdf_instance_datastore.ttl",
    "data/rdf/epd_rdf_instance_datastore.ttl",
    "https://example.org/ilcd/",
    (
        "ILCDex:",
        "ILCDsd:",
        "ILCDlcia:",
        "ILCDmav:",
        "ILCDadmin:",
        "ILCDpi:",
    ),
)


In [None]:
# SKOS Material Category

import xml.etree.ElementTree as ET
from rdflib import Graph, Namespace, Literal, RDF
from rdflib.namespace import SKOS

# Namespaces: ILCD for the existing data, OBD for everything this cell adds.
ILCD = Namespace("https://example.org/ilcd/")
OBD = Namespace("https://example.org/obd/")  # use OBD instead of EX

# Turtle datastore written with the unified "ilcd:" prefix.
rdf_file_path = "data/rdf/epd_rdf_instance_datastore.ttl"

# Start from an empty graph, register the prefixes, then read the datastore into it.
g = Graph()
g.bind("ilcd", ILCD)
g.bind("obd", OBD)
g.bind("skos", SKOS)
g.parse(rdf_file_path, format="turtle")
print(f"Loaded RDF data from {rdf_file_path}")

# English and German category XML files.
file_en = "../data/pipeline2/xml/OEKOBAU.DAT_Categories_EN_aligned_temp.xml"
file_de = "../data/pipeline2/xml/OEKOBAU.DAT_Categories.xml"

# --- PART 1: Parse the XML files and extract only the target categories ---
# Category names of interest, lowercase; each English name is followed by its
# German counterpart.
target_keys = {
    "mineral building products", "mineralische baustoffe",
    "mortar and concrete", "mörtel und beton",
    "ready mixed concrete", "beton",
}

def parse_category_system(file_path):
    """
    Parse a category XML file and return the <categories> element of the
    'OEKOBAU.DAT' category system together with the namespace map used to
    query it.

    The category system may be the document root itself or any descendant
    of it.

    :param file_path: Path to the XML file
    :return: Tuple ``(categories_elem, ns)`` where ``ns`` maps the prefix
             ``cat`` to the ILCD Categories namespace
    :raises ValueError: if the file has no CategorySystem named 'OEKOBAU.DAT',
                        or that system has no <categories> child
    """
    ns = {"cat": "http://lca.jrc.it/ILCD/Categories"}
    root = ET.parse(file_path).getroot()

    # If the root is the CategorySystem with name "OEKOBAU.DAT", use it;
    # otherwise search the whole document for it.
    if root.tag.endswith("CategorySystem") and root.get("name") == "OEKOBAU.DAT":
        cs = root
    else:
        cs = root.find(".//cat:CategorySystem[@name='OEKOBAU.DAT']", ns)
    if cs is None:
        raise ValueError(f"CategorySystem 'OEKOBAU.DAT' not found in {file_path}")

    categories_elem = cs.find("cat:categories", ns)
    if categories_elem is None:
        # Fail here with a clear message instead of handing None to callers
        # that immediately call .findall() on it.
        raise ValueError(
            f"CategorySystem 'OEKOBAU.DAT' in {file_path} has no <categories> element"
        )
    return categories_elem, ns

def extract_target_categories(categories_elem, ns):
    """
    Walk the category tree below ``categories_elem`` in document order and
    collect the categories whose name, lowercased and stripped, is in the
    module-level ``target_keys``.

    Returns a tuple ``(targets, relations)``:
      - targets: dict mapping category id to the category name as written in the XML,
      - relations: list of (parent_id, child_id) pairs, recorded only when the
        child is a target and its direct parent was collected as a target too.
    """
    targets = {}
    relations = []

    # Explicit stack of (element, id of its parent category). Children are
    # pushed in reverse so that popping visits them in document order.
    pending = [
        (node, None)
        for node in reversed(categories_elem.findall("cat:category", ns))
    ]
    while pending:
        node, parent_id = pending.pop()
        node_id = node.get("id")
        label = node.get("name")

        if label and label.lower().strip() in target_keys:
            targets[node_id] = label
            # Only record the relationship if the parent was also a target.
            if parent_id is not None and parent_id in targets:
                relations.append((parent_id, node_id))

        pending.extend(
            (child, node_id)
            for child in reversed(node.findall("cat:category", ns))
        )

    return targets, relations

# Load the category tree of each language variant.
categories_en_elem, ns_en = parse_category_system(file_en)
categories_de_elem, ns_de = parse_category_system(file_de)

# The English file supplies labels and the hierarchy; from the German file
# only the labels are used.
targets_en, relations_en = extract_target_categories(categories_en_elem, ns_en)
targets_de = extract_target_categories(categories_de_elem, ns_de)[0]

# Combine the labels per category id: {cat_id: {"en": ..., "de": ...}}.
# Ids found in only one file keep just that language.
merged_targets = {cat_id: {"en": label} for cat_id, label in targets_en.items()}
for cat_id, de_label in targets_de.items():
    merged_targets.setdefault(cat_id, {})["de"] = de_label

# --- PART 2: Create SKOS concepts from the XML data using OBD ---
# The categorization system itself becomes a SKOS ConceptScheme with the same
# label in both languages.
cat_scheme = OBD["OEKOBAU_DAT"]
g.add((cat_scheme, RDF.type, SKOS.ConceptScheme))
for lang in ("en", "de"):
    g.add((cat_scheme, SKOS.prefLabel, Literal("OEKOBAU.DAT", lang=lang)))

# One SKOS Concept per merged target category, in the OBD namespace.
for cat_id, labels in merged_targets.items():
    # Dots in the category id become underscores in the URI.
    cat_uri = OBD["Category_" + cat_id.replace(".", "_")]
    g.add((cat_uri, RDF.type, SKOS.Concept))
    # Membership in the categorization scheme.
    g.add((cat_uri, SKOS.inScheme, cat_scheme))
    # A language-tagged prefLabel for every language that has a label.
    for lang in ("en", "de"):
        if lang in labels:
            g.add((cat_uri, SKOS.prefLabel, Literal(labels[lang], lang=lang)))

# --- PART 3: Link existing ClassificationEntry resources to these concepts ---
# Build the label lookup once instead of rescanning every category for every
# entry. Keys are normalized labels (English or German). setdefault keeps the
# first category that carries a given label, which is the same precedence the
# previous per-entry scan had.
label_to_concept = {}
for cat_id, labels in merged_targets.items():
    cat_uri = OBD["Category_" + cat_id.replace(".", "_")]
    for lang in ("en", "de"):
        if lang in labels:
            label_to_concept.setdefault(labels[lang].lower().strip(), cat_uri)

# Snapshot the entries first: the loop adds triples to the graph it reads from.
for entry in list(g.subjects(RDF.type, ILCD.ClassificationEntry)):
    value = g.value(entry, ILCD.value)
    if value is None:
        continue
    norm_value = str(value).lower().strip()
    if norm_value not in target_keys:
        continue
    cat_uri = label_to_concept.get(norm_value)
    if cat_uri is not None:
        g.add((entry, OBD.hasCanonicalCategory, cat_uri))

print("Canonical SKOS relationships created using XML data.")

# --- PART 4: Add hierarchical relationships based on the XML hierarchy.
# The parent/child pairs come from the English XML only.
for parent_id, child_id in relations_en:
    # Skip pairs where either side did not make it into the merged targets.
    if parent_id not in merged_targets or child_id not in merged_targets:
        continue
    child_uri = OBD["Category_" + child_id.replace(".", "_")]
    parent_uri = OBD["Category_" + parent_id.replace(".", "_")]
    # Assert the link in both directions.
    g.add((parent_uri, SKOS.narrower, child_uri))
    g.add((child_uri, SKOS.broader, parent_uri))

# --- PART 5: Serialize the enriched graph.
output_file_path = "data/rdf/epd_rdf_instance_datastore_canonical_skos.ttl"
g.serialize(format="turtle", destination=output_file_path)
print(f"Graph saved to {output_file_path}")


In [None]:
# DIN 276

import json
import csv
from rdflib import Graph, Namespace, Literal, URIRef
from rdflib.namespace import RDF, SKOS

# Load the graph written by the SKOS material-category step.
g = Graph()
g.parse("data/rdf/epd_rdf_instance_datastore_canonical_skos.ttl", format="turtle")

# Namespaces used below; LINKML_UUID is the datatype of the UUID literals.
ILCD = Namespace("https://example.org/ilcd/")
DIN = Namespace("https://example.org/din276/")
LINKML_UUID = URIRef("https://w3id.org/linkml/UUIDType")

# Make the DIN namespace serialize with the "din:" prefix.
g.bind("din", DIN)

# Build dictionary from EPD JSON Lines file: epd_id (as string) -> epd_uuid.
epd_id_to_uuid = {}
with open("../data/pipeline2/json/edited_epds.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # JSON documents; skip them instead of crashing in json.loads.
            continue
        data = json.loads(line)
        epd_id = str(data["id"])
        epd_uuid = data["uuid"]
        epd_id_to_uuid[epd_id] = epd_uuid

# Build dictionary from DIN 276 JSON Lines file: custom_id -> [cost_group_codes].
# Each line wraps a chat-completion response whose message content is itself
# a JSON document with a "cost_group_codes" list.
din_cost_groups = {}
used_codes = set()  # collect all cost codes referenced in DIN JSON
with open("../data/pipeline2/json/openai/batch_67d5a00f7f2c8190a0e2cdc3cf04382b_output_+EPDNorge.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines are not JSON documents; skip them.
            continue
        data = json.loads(line)
        custom_id = str(data["custom_id"])
        content_str = data["response"]["body"]["choices"][0]["message"]["content"]
        parsed = json.loads(content_str)
        # Normalise to stripped strings: the codes are later compared with the
        # "Nr" column of the CSV, which is always text, so a numeric 310 in the
        # JSON must become "310" to match.
        cost_codes = [str(code).strip() for code in parsed["cost_group_codes"]]
        din_cost_groups[custom_id] = cost_codes
        used_codes.update(cost_codes)

# Build a SKOS vocabulary for DIN 276 cost groups from CSV,
# but only add rows whose cost code (Nr) is in used_codes.
# Also add the cost group number using skos:notation.
din_code_to_concept = {}
# newline="" is the mode the csv module asks for, so quoted fields containing
# line breaks are parsed correctly. "utf-8-sig" reads plain UTF-8 as before and
# additionally drops a leading byte-order mark, which would otherwise turn the
# first header into "\ufeffNr".
with open("../data/pipeline2/csv/din276_concrete_sub.csv", "r", encoding="utf-8-sig", newline="") as csvfile:
    reader = csv.DictReader(csvfile)  # Expected columns: Nr, Cost group (CG), Notes
    for row in reader:
        nr = row["Nr"].strip()  # e.g., "310"
        # Only include cost groups referenced in DIN JSON.
        if nr not in used_codes:
            continue

        # DictReader fills columns missing from a short row with None.
        label_en = (row["Cost group (CG)"] or "").strip()  # e.g., "Trenchwork/Earthworks"
        notes_en = (row["Notes"] or "").strip()            # e.g., description

        # Create a SKOS concept URI, e.g., DIN:costgroup_310.
        concept_uri = DIN[f"costgroup_{nr}"]
        g.add((concept_uri, RDF.type, SKOS.Concept))
        g.add((concept_uri, SKOS.prefLabel, Literal(label_en, lang="en")))
        if notes_en:
            # An empty note carries no information; leave it out.
            g.add((concept_uri, SKOS.note, Literal(notes_en, lang="en")))
        # Add the cost group number as a notation.
        g.add((concept_uri, SKOS.notation, Literal(nr)))
        din_code_to_concept[nr] = concept_uri

# Attach every EPD named in the DIN JSON to the SKOS concepts of its cost codes.
for custom_id, cost_codes in din_cost_groups.items():
    try:
        epd_uuid = epd_id_to_uuid[custom_id]
    except KeyError:
        print(f"No EPD found for custom_id: {custom_id}")
        continue
    uuid_literal = Literal(epd_uuid, datatype=LINKML_UUID)

    # Follow the chain backwards from the UUID literal:
    #   dataSetInformation node -> processInformation node -> top EPD node,
    # and take the first EPD node reached.
    found_epd = next(
        (
            epd_node
            for dsi_node in g.subjects(predicate=ILCD.UUID, object=uuid_literal)
            for pi_node in g.subjects(predicate=ILCD.dataSetInformation, object=dsi_node)
            for epd_node in g.subjects(predicate=ILCD.processInformation, object=pi_node)
            if epd_node
        ),
        None,
    )
    if not found_epd:
        print(f"No top EPD node found for UUID: {epd_uuid}")
        continue

    # Link the EPD to each cost code that has a concept in the vocabulary.
    for code in cost_codes:
        if code in din_code_to_concept:
            # Link using the DIN namespace property.
            g.add((found_epd, DIN.hasDIN276CostGroup, din_code_to_concept[code]))
        else:
            print(f"Cost code {code} not in CSV vocabulary; skipping.")
    print(f"Linked EPD node {found_epd} to cost codes: {cost_codes}")

# Add broader/narrower relationships among cost group concepts.
def _din276_parent_code(code):
    """
    Return the code one level above a three-digit DIN 276 code, or None.

    "311" -> "310", "310" -> "300"; top-level codes ("300") and anything that
    is not exactly three digits have no parent.
    """
    if len(code) != 3 or not code.isdigit():
        return None
    if code.endswith("00"):
        return None
    if code.endswith("0"):
        return code[0] + "00"
    return code[:2] + "0"


# One pass over the concepts: each child looks up its direct parent. This also
# links second-level groups (e.g. 310) to their top-level group (e.g. 300),
# which a rule that only accepts children not ending in "0" can never do.
for code_child, concept_child in din_code_to_concept.items():
    concept_parent = din_code_to_concept.get(_din276_parent_code(code_child))
    if concept_parent is None:
        # Parent code is not part of the vocabulary (or the code has none).
        continue
    g.add((concept_child, SKOS.broader, concept_parent))
    g.add((concept_parent, SKOS.narrower, concept_child))

# --- DIN 276 itself is modelled as a SKOS ConceptScheme in the DIN namespace.
din_scheme = DIN["DIN276"]

# Every cost group concept created from the CSV belongs to that scheme.
for concept in din_code_to_concept.values():
    g.add((concept, SKOS.inScheme, din_scheme))

# Describe the scheme: type, English label and a note naming the edition.
g.add((din_scheme, SKOS.note, Literal("DIN 276:2018-12 – Cost planning in building", lang="en")))
g.add((din_scheme, SKOS.prefLabel, Literal("DIN 276", lang="en")))
g.add((din_scheme, RDF.type, SKOS.ConceptScheme))

# Write the enriched graph to a new Turtle file.
output_file_path = "data/rdf/epd_rdf_instance_datastore_canonical_skos_din.ttl"
g.serialize(format="turtle", destination=output_file_path)
print("Done! Updated RDF with DIN 276 SKOS cost groups and hierarchical relationships.")


In [None]:
# BKI

from rdflib import Graph, Namespace, URIRef, Literal, BNode
from rdflib.namespace import RDF, RDFS, SKOS, XSD
import xml.etree.ElementTree as ET

# Start from the graph written by the DIN 276 step; it already contains the
# SKOS cost-group concepts (din:costgroup_322, etc.).
g = Graph()
g.parse("data/rdf/epd_rdf_instance_datastore_canonical_skos_din.ttl", format="turtle")

# Namespaces: BKI for the elements added below, DIN for the existing cost groups.
DIN = Namespace("https://example.org/din276/")  # existing DIN cost groups
BKI = Namespace("https://example.org/bki/")  # for your BKI elements

# Register both prefixes so the serialized Turtle uses "bki:" and "din:".
g.bind("bki", BKI)
g.bind("din", DIN)

def integrate_bki_xml(g, xml_file_path):
    """
    Add the ready-mixed-concrete building elements of a BKI XML file to the graph.

    Parses a BKI XML file (level 2 or 3). For each <element> that has at least
    one <component> whose ``processConfigName`` contains 'transportbeton'
    (case-insensitive), the following is added to ``g``:

      - the element as a ``bki:BKIElement``, with ``bki:name`` and
        ``bki:description`` when its <elementInfo> provides them;
      - a ``din:hasDIN276CostGroup`` link built from the ``din276Code``
        attribute (omitted, with a printed notice, when the attribute is
        missing or blank);
      - one ``bki:Layer`` per component of the element (all components, not
        only the concrete ones) with ``bki:processConfigName`` and, when the
        attributes are present, ``bki:lifeTime`` and ``bki:layerSize``.

    Graph sizes before and after are printed.

    :param g: rdflib Graph to extend in place
    :param xml_file_path: Path to the BKI XML file
    """
    print(f"Starting graph size: {len(g)} triples.")

    # Register namespace to avoid parser warnings
    ET.register_namespace('', 'https://www.bauteileditor.de')

    root = ET.parse(xml_file_path).getroot()

    # Prefix map for the XPath-style lookups below
    ns = {'elca': 'https://www.bauteileditor.de'}

    for element_node in root.findall('.//elca:element', ns):
        component_nodes = element_node.findall('.//elca:component', ns)

        # Keep only elements with at least one 'transportbeton' component.
        if not any(
            'transportbeton' in comp.get('processConfigName', '').lower()
            for comp in component_nodes
        ):
            continue

        # Element URI is derived from its UUID with the dashes removed.
        element_uuid = element_node.get('uuid', 'unknown').replace('-', '')
        element_uri = BKI[f'element_{element_uuid}']
        g.add((element_uri, RDF.type, BKI.BKIElement))

        # Link to the cost group only when the element actually names one;
        # a placeholder would point at a concept that does not exist.
        din_code = (element_node.get('din276Code') or '').strip()
        if din_code:
            g.add((element_uri, DIN.hasDIN276CostGroup, DIN[f'costgroup_{din_code}']))
        else:
            print(f"Element {element_uuid} has no din276Code; not linked to a cost group.")

        # Name and description come from <elementInfo>, if present.
        elem_info = element_node.find('.//elca:elementInfo', ns)
        if elem_info is not None:
            name_node = elem_info.find('elca:name', ns)
            desc_node = elem_info.find('elca:description', ns)

            if name_node is not None and name_node.text:
                g.add((element_uri, BKI.name, Literal(name_node.text.strip(), datatype=XSD.string)))
            if desc_node is not None and desc_node.text:
                g.add((element_uri, BKI.description, Literal(desc_node.text.strip(), datatype=XSD.string)))

        # One layer resource per component.
        for comp_node in component_nodes:
            process_config_uuid = comp_node.get('processConfigUuid', '').replace('-', '')
            layer_size = comp_node.get('layerSize', '')
            # Three-character size key for the URI, e.g. "0.15" -> "015".
            # NOTE(review): the key is truncated/padded, so different sizes can
            # map to the same URI (e.g. "0.15" and "0.155", or "15" and "1.5"),
            # and the same URI is reused across elements — confirm intended.
            layer_size_str = layer_size.replace('.', '').ljust(3, '0')[:3]
            layer_uri = BKI[f'layer_{process_config_uuid}_{layer_size_str}']

            process_config_name = comp_node.get('processConfigName', '')
            life_time = comp_node.get('lifeTime', '').strip()

            g.add((layer_uri, RDF.type, BKI.Layer))
            g.add((element_uri, BKI.hasLayer, layer_uri))
            g.add((layer_uri, BKI.processConfigName, Literal(process_config_name, datatype=XSD.string)))

            # lifeTime: integer when it is a plain decimal number, otherwise
            # kept as text; nothing is emitted when the attribute is absent.
            # isdecimal() only accepts characters that int() can parse.
            if life_time.isdecimal():
                g.add((layer_uri, BKI.lifeTime, Literal(int(life_time), datatype=XSD.integer)))
            elif life_time:
                g.add((layer_uri, BKI.lifeTime, Literal(life_time, datatype=XSD.string)))

            # layerSize: float when parseable, otherwise kept as text; nothing
            # is emitted when the attribute is absent.
            if layer_size.strip():
                try:
                    layer_size_float = float(layer_size)
                except ValueError:
                    g.add((layer_uri, BKI.layerSize, Literal(layer_size, datatype=XSD.string)))
                else:
                    g.add((layer_uri, BKI.layerSize, Literal(layer_size_float, datatype=XSD.float)))

    print(f"Integrated data from {xml_file_path}. Current graph size: {len(g)} triples.")

# Integrate the level-2 export first, then the level-3 export.
for bki_xml_path in (
    "../data/pipeline2/xml/BKI_Bauteilaufbauten_ 2_Ebene_DIN_276.xml",
    "../data/pipeline2/xml/BKI_Bauteilaufbauten_ 3_Ebene_DIN_276.xml",
):
    integrate_bki_xml(g, bki_xml_path)

# Write the graph, now including the BKI elements, to a new Turtle file.
g.serialize(format="turtle", destination="data/rdf/epd_rdf_instance_datastore_canonical_skos_din_bki.ttl")
print("Done! Updated RDF with BKI elements.")

In [None]:
def query_bki_by_costgroups(g, costgroup_notations):
    """
    Return the BKI elements linked to a DIN 276 cost group whose skos:notation
    is in ``costgroup_notations``, together with their layers whose
    processConfigName contains 'beton' (case-insensitive).

    Only elements that have both ``bki:name`` and ``bki:description`` are
    matched, because the query requires both. The generated SPARQL query is
    printed.

    :param g: rdflib Graph to query
    :param costgroup_notations: Iterable of notations, e.g. ["322", "330"];
                                each item is converted with ``str()``
    :return: defaultdict keyed by element URI (str). Each value is a dict with
             'name', 'description', 'notation' (taken from the first result
             row for that element) and 'layers', a list of dicts with
             'processConfigName', 'lifeTime' and 'layerSize' (the last two
             are None when unbound or empty). An empty mapping is returned
             without querying when no notations are given.
    """
    from collections import defaultdict

    def text_or_none(term):
        """Stringify a result term; unbound or empty-string terms become None."""
        if term is None:
            return None
        text = str(term)
        return text if text else None

    # Dictionary keyed by element URI
    output = defaultdict(lambda: {
        'name': '',
        'description': '',
        'notation': '',
        'layers': []
    })

    # Let rdflib render each notation as a SPARQL string literal so that
    # quotes and backslashes are escaped instead of breaking (or altering)
    # the query, e.g. "322", "330", "340".
    notation_terms = ", ".join(
        Literal(str(notation)).n3() for notation in costgroup_notations
    )
    if not notation_terms:
        # Nothing to match against.
        return output

    query_str = f'''
PREFIX bki: <{BKI}>
PREFIX din: <{DIN}>
PREFIX skos: <{SKOS}>

SELECT ?element ?name ?description ?layerName ?layerLife ?layerSize ?notation
WHERE {{
  ?element a bki:BKIElement ;
           din:hasDIN276CostGroup ?cg ;
           bki:name ?name ;
           bki:description ?description ;
           bki:hasLayer ?layer .
  
  ?layer bki:processConfigName ?layerName .
  OPTIONAL {{ ?layer bki:lifeTime ?layerLife . }}
  OPTIONAL {{ ?layer bki:layerSize ?layerSize . }}
  
  ?cg skos:notation ?notation .
  
  FILTER(?notation IN ({notation_terms}))
  FILTER regex(?layerName, "beton", "i")
}}
ORDER BY ?element
'''

    results = g.query(query_str)
    print(query_str)

    # One result row per (element, layer, notation) combination: fold the rows
    # into one record per element.
    for row in results:
        record = output[str(row.element)]

        # Element-level fields are filled from the first row that has them.
        if not record['name']:
            record['name'] = str(row.name)
        if not record['description']:
            record['description'] = str(row.description)
        if not record['notation']:
            record['notation'] = str(row.notation)

        # Compare against None rather than truthiness so that a lifeTime or
        # layerSize of 0 is reported instead of being dropped.
        record['layers'].append({
            'processConfigName': str(row.layerName),
            'lifeTime': text_or_none(row.layerLife),
            'layerSize': text_or_none(row.layerSize),
        })

    return output


# Query four cost groups and print one report section per matching element.
result = query_bki_by_costgroups(g, ["322","330","331","340"])
for elem_uri, info in result.items():
    # Element header
    print(
        f"Element: {elem_uri}",
        f"  DIN 276 group notation: {info['notation']}",
        f"  Name: {info['name']}",
        f"  Description: {info['description']}",
        "  Layers containing 'beton':",
        sep="\n",
    )
    # One three-line entry per matching layer
    for layer in info['layers']:
        print(
            f"    * processConfigName: {layer['processConfigName']}",
            f"      lifeTime: {layer['lifeTime']}",
            f"      layerSize: {layer['layerSize']}",
            sep="\n",
        )
    print("---")


# Helpers