In [None]:
import sys
from pathlib import Path

# Walk upward from the current working directory until a folder named
# "simple_rag" is reached; that folder is treated as the project root.
p = Path.cwd().resolve()
while p.name != "simple_rag" and p.parent != p:
    p = p.parent

if p.name == "simple_rag":
    PROJECT_ROOT = p                      # .../simple_rag
    if str(PROJECT_ROOT) not in sys.path: # avoid duplicates
        sys.path.insert(0, str(PROJECT_ROOT))
    # Sanity check
    print("✓ project root on sys.path →", PROJECT_ROOT)
else:
    # The walk reached the filesystem root without finding "simple_rag".
    # Putting the filesystem root on sys.path would be wrong, so leave
    # sys.path untouched and say so instead of reporting success.
    PROJECT_ROOT = None
    print("⚠ no 'simple_rag' folder found above", Path.cwd(), "→ sys.path left unchanged")

del p, Path, sys                      # keep namespace tidy
# --------------------------------------------------------------------

In [None]:
# Cap on how many EPD instances each later cell converts per source
# (read below as num_epd_files, max_items and number_epds).
epd_count=10 # use 10 for a prototype run, 1000 for a full run

In [None]:
# Add The International EPD System

import sqlite3
import pandas as pd
import json
import xml.etree.ElementTree as ET
import jsonlines
import re

# -----------------------------------------------------------------------------
# 1. PARSE THE XML TO BUILD A CATEGORY LOOKUP STRUCTURE
# -----------------------------------------------------------------------------
def build_category_dict(xml_element):
    """
    Turn one category element and all of its descendants into nested dicts.

    Returns a dict with the element's 'id' and 'name' attributes plus
    'children', a mapping of child name -> the same structure built from each
    direct child element whose local tag is 'category' (any namespace).
    """
    subtrees = (
        build_category_dict(node)
        for node in xml_element.findall('{*}category')
    )
    return {
        'id': xml_element.get('id'),
        'name': xml_element.get('name'),
        # Children are keyed by name; a repeated name keeps the last one seen.
        'children': {subtree['name']: subtree for subtree in subtrees},
    }

def parse_category_xml(xml_path):
    """
    Load the category XML at xml_path and return the Process category tree.

    The result maps each top-level category name to the nested dict produced
    by build_category_dict. If no <categories dataType="Process"> element is
    present, a warning is printed and an empty dict is returned.
    """
    document_root = ET.parse(xml_path, parser=ET.XMLParser(encoding='utf-8')).getroot()
    process_categories = document_root.find('.//{*}categories[@dataType="Process"]')

    if process_categories is None:
        print("Warning: Could not find <categories dataType='Process'> in XML.")
        return {}

    lookup = {}
    for top_element in process_categories.findall('{*}category'):
        node = build_category_dict(top_element)
        lookup[node['name']] = node
    return lookup

def get_category_path_info(category_hierarchy, categories_dict):
    """
    Resolve a list of category names against the nested category lookup.

    Returns one (name, id) tuple per level, outermost first. An empty or
    missing hierarchy yields an empty list.

    Raises:
        ValueError: if a name is not present at its level of the tree.
    """
    if not category_hierarchy:
        return []

    trail = []
    scope = categories_dict  # lookup for the level currently being resolved
    for depth, label in enumerate(category_hierarchy):
        if label not in scope:
            raise ValueError(f"Category '{label}' not found at level {depth}.")
        node = scope[label]
        trail.append((node['name'], node['id']))
        scope = node['children']
    return trail

# -----------------------------------------------------------------------------
# STEP 1.1: SELECT + CHANGE CATEGORY, KEEPING ONLY EPDs THAT HAVE BOTH A
#           "compressive" AND A "density" materialProperty (as separate entries).
#           Then rewrite materialProperties to a standard form and output to JSONLines
# -----------------------------------------------------------------------------

# Category tree used to resolve classification names to (name, id) pairs.
xml_path = "../data/pipeline2/xml/OEKOBAU.DAT_Categories_EN_API.xml"
category_dict = parse_category_xml(xml_path)

# SQLite database holding the raw EPD JSON documents (table epd_documents).
# The connection is closed after the selection loop below.
db_path = "../data/pipeline2/sql/epd_database.sqlite"
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# CSV with one row per EPD; the loop below reads its "UUID" and
# "RegEx Classification" columns.
csv_path = "../data/pipeline2/sql/regex_classified/filtered_epd_data02_classified_concrete05.csv"
df = pd.read_csv(csv_path)

num_epd_files = epd_count  # maximum number of EPDs selected by the loop below
df_limited = df  # alias only: no row limit is applied here, the loop stops via num_epd_files

# Shared accumulators: uuid -> edited document, and the uuids in selection order.
# modified_json_docs is extended again by the later cells.
modified_json_docs = {}
TARGET_CLASSIFICATION = "Mineral building products > Mortar and Concrete > Ready mixed concrete"
selected_uuids = []

# Filter df down to only those rows whose RegEx Classification matches your target
# (used for the count printed here; the loop below iterates df_limited and re-checks each row)
filtered_results = df[df["RegEx Classification"] == TARGET_CLASSIFICATION]
print(f"Found {len(filtered_results)} EPD(s) after category filtering.\n")

# Helper function to unify materialProperties
def unify_material_property(mp):
    """
    Normalise one materialProperties entry in place (nothing is returned).

    Matching is done on the lower-cased ORIGINAL name, before any stripping:
      - contains 'compressive' -> name 'compressive strength',
        unit 'MPa', unitDescription 'megapascals'
      - otherwise contains 'density' -> name 'gross density',
        unit 'kg/m^3', unitDescription 'kilograms per cubic meter'
      - otherwise only the name is cleaned: every '(...)' chunk is removed
        and surrounding whitespace stripped.

    mp["value"] is never touched.
    """
    original_name = mp["name"]
    lowered = original_name.lower()

    # Default outcome: the original name without any parenthesised chunk
    # (e.g. ' (kg/m^3)'); overwritten below when a keyword matches.
    mp["name"] = re.sub(r"\(.*?\)", "", original_name).strip()

    # (keyword, standard name, standard unit, unit description); first match wins.
    standard_forms = (
        ("compressive", "compressive strength", "MPa", "megapascals"),
        ("density", "gross density", "kg/m^3", "kilograms per cubic meter"),
    )
    for keyword, std_name, std_unit, std_unit_desc in standard_forms:
        if keyword in lowered:
            mp.update(name=std_name, unit=std_unit, unitDescription=std_unit_desc)
            break

# Walk the classified CSV rows in order and collect up to `num_epd_files` EPDs that
#   - carry TARGET_CLASSIFICATION,
#   - have a parseable JSON document in epd_documents, and
#   - declare "compressive" and "density" as two separate materialProperties.
# Each selected document gets an extra OEKOBAU.DAT classification, standardised
# material properties, and its per-module A1/A2/A3 entries removed.
# try/finally guarantees the SQLite connection is closed even if a row raises.
try:
    for index, row in df_limited.iterrows():
        if len(selected_uuids) >= num_epd_files:
            break

        uuid_val = row["UUID"]
        classification_str = row["RegEx Classification"]

        # Only process rows with the target classification
        if classification_str != TARGET_CLASSIFICATION:
            continue

        # 1) Fetch JSON from epd_documents
        cursor.execute("SELECT document FROM epd_documents WHERE uuid = ?", (uuid_val,))
        result = cursor.fetchone()
        if not result:
            continue

        json_text = result[0]
        try:
            json_data = json.loads(json_text)
        except json.JSONDecodeError:
            continue

        # 2) Check if there's a "compressive" and "density" entry in materialProperties
        found_property_of_interest = False
        compressive_found = False
        density_found = False

        exchanges = json_data.get("exchanges", {}).get("exchange", [])
        for exch in exchanges:
            mat_props = exch.get("materialProperties", [])
            for mp in mat_props:
                name_lower = mp.get("name", "").lower()
                # Check for "compressive" only in this property
                if "compressive" in name_lower and "density" not in name_lower:
                    compressive_found = True
                # Check for "density" only in this property
                if "density" in name_lower and "compressive" not in name_lower:
                    density_found = True
                # If both have been found in two separate properties, we can stop searching
                if compressive_found and density_found:
                    found_property_of_interest = True
                    break
            if found_property_of_interest:
                break

        if not found_property_of_interest:
            continue  # skip EPD if no two separate relevant property names

        # 3) Build new classification array from the CSV "RegEx Classification"
        category_names = [x.strip() for x in classification_str.split(">")]
        try:
            path_info = get_category_path_info(category_names, category_dict)
        except ValueError:
            continue

        class_array = []
        for level, (cat_name, cat_id) in enumerate(path_info):
            class_array.append({
                "value": cat_name,
                "level": level,
                "classId": cat_id
            })

        new_classification_item = {
            "class": class_array,
            "name": "OEKOBAU.DAT"
        }

        # 4) Inject new classification while preserving existing
        process_info = json_data.setdefault("processInformation", {})
        data_set_info = process_info.setdefault("dataSetInformation", {})
        classification_info = data_set_info.setdefault("classificationInformation", {})
        existing_classifications = classification_info.get("classification")
        if not isinstance(existing_classifications, list):
            existing_classifications = []
        existing_classifications.append(new_classification_item)
        classification_info["classification"] = existing_classifications

        # 5) For every exchange: standardise the relevant material properties,
        #    then drop its per-module A1/A2/A3 entries.
        for exch in exchanges:
            mat_props = exch.get("materialProperties", [])
            for mp in mat_props:
                if "compressive" in mp.get("name", "").lower() or "density" in mp.get("name", "").lower():
                    unify_material_property(mp)

            # Non-dict entries (e.g. raw strings) have no "module"; they are kept
            # as they are instead of crashing on .get().
            other = exch.get("other", {})
            anies = other.get("anies", [])
            other["anies"] = [
                item for item in anies
                if not (isinstance(item, dict) and item.get("module") in ("A1", "A2", "A3"))
            ]

        # 6) Remove unwanted LCIAResults entries
        #    For each LCIAResult, filter out any anies item with module "A1", "A2", or "A3"
        lcia_results = json_data.get("LCIAResults", {}).get("LCIAResult", [])
        for lcia in lcia_results:
            other = lcia.get("other", {})
            anies = other.get("anies", [])
            other["anies"] = [
                item for item in anies
                if not (isinstance(item, dict) and item.get("module") in ("A1", "A2", "A3"))
            ]

        # 7) Store the updated JSON
        modified_json_docs[uuid_val] = json_data
        selected_uuids.append(uuid_val)
finally:
    conn.close()

# Write out the results to JSONLines.
# mode='w' truncates the file: this cell starts edited_epds.jsonl afresh and the
# later cells append to it.
output_jsonl = "../data/pipeline2/json/edited_epds.jsonl"

with jsonlines.open(output_jsonl, mode='w') as writer:
    idx = 1  # 1-based record id
    for epd_uuid in selected_uuids:
        doc = modified_json_docs[epd_uuid]
        # Classification system name: taken from the FIRST classification entry.
        # The OEKOBAU.DAT entry added above is appended last, so a pre-existing
        # entry (if any) wins here.
        classifications = (
            doc.get("processInformation", {})
               .get("dataSetInformation", {})
               .get("classificationInformation", {})
               .get("classification", [])
        )
        existing_class_sys = classifications[0].get("name") if classifications else "Unknown"

        # product name from the first baseName entry ("" when there is none)
        base_names = (
            doc.get("processInformation", {})
               .get("dataSetInformation", {})
               .get("name", {})
               .get("baseName", [])
        )
        epd_name = base_names[0].get("value", "") if base_names else ""

        # One JSON line per EPD: summary fields plus the full edited document.
        record = {
            "id": idx,
            "uuid": epd_uuid,
            "epd_name": epd_name,
            "classificationSys": existing_class_sys,
            "document": doc
        }
        writer.write(record)
        idx += 1

print(f"\nWrote {len(selected_uuids)} EPD records to '{output_jsonl}' in JSONLines format.")


In [None]:
# Add OEKOBAU.DAT and IBUCategories

import re
import json
import sqlite3
import jsonlines

db_path = "../data/pipeline2/sql/epd_database.sqlite"  # SQLite file with epd_documents / epd_metadata
category_val = "Beton"  # classification value an EPD must carry to be selected
max_items = epd_count   # per-classification-system output cap, read by process_epds_and_limit

def fetch_well_defined_epds(conn, classification_system, category_value):
    """
    Load the EPD documents of one classification system that look usable.

    A row from epd_documents (joined to epd_metadata on uuid) is kept when:
      - m.classification_system equals classification_system (case-insensitive),
      - its document is non-empty, valid JSON,
      - some classification entry has a class level whose "value" equals
        category_value, and
      - some materialProperties name contains 'compressive' or 'density'.

    Returns: (classification_name_in_json, results_list)
      results_list is [(uuid, doc_data)]; classification_name_in_json is the
      "name" of the matching classification entry of the last kept document
      that had one, or None.
    """
    query = """
    SELECT d.uuid, d.document
    FROM epd_documents d
    JOIN epd_metadata m ON d.uuid = m.uuid
    WHERE m.classification_system COLLATE NOCASE = ?
    """
    db_cursor = conn.cursor()
    db_cursor.execute(query, (classification_system,))

    def matching_system(document):
        """Return (found, name) for the first classification containing category_value."""
        entries = (
            document.get("processInformation", {})
            .get("dataSetInformation", {})
            .get("classificationInformation", {})
            .get("classification", [])
        )
        for entry in entries:
            if any(level.get("value") == category_value for level in entry.get("class", [])):
                return True, entry.get("name")
        return False, None

    def mentions_strength_or_density(document):
        """True when any materialProperties name contains 'compressive' or 'density'."""
        for exchange in document.get("exchanges", {}).get("exchange", []):
            for prop in exchange.get("materialProperties", []):
                lowered = prop.get("name", "").lower()
                if "compressive" in lowered or "density" in lowered:
                    return True
        return False

    kept = []
    last_system_name = None

    for doc_uuid, doc_text in db_cursor.fetchall():
        if not doc_text:
            continue
        try:
            parsed = json.loads(doc_text)
        except json.JSONDecodeError:
            continue

        found, system_name = matching_system(parsed)
        if not found or not mentions_strength_or_density(parsed):
            continue

        kept.append((doc_uuid, parsed))
        if system_name:
            last_system_name = system_name

    return last_system_name, kept


def process_epds_and_limit(epds, classification_name_in_json, classification_label):
    """
    Derive compressive strength from each product name and cap the result.

    For every (uuid, doc) in epds, the first baseName value is searched for a
    "Cxx/yy" pattern (case-insensitive). When found, yy (the second number) is
    appended to the first exchange's materialProperties as
    'compressive strength' in MPa. EPDs without a product name, without the
    pattern, or without any exchange are skipped.

    Returns the first max_items entries as (uuid, doc, classification), where
    classification is classification_name_in_json if truthy, else
    classification_label. Documents are modified in place, including those
    that fall outside the returned slice.
    """
    print(f"\n{classification_label}")
    print(f"Found {len(epds)} matching EPD(s) overall. Now parsing compressive strength...")

    strength_class_re = re.compile(r"C\s*(\d+)\s*/\s*(\d+)", flags=re.IGNORECASE)
    # The stored label does not depend on the individual EPD.
    stored_label = classification_name_in_json or classification_label

    kept = []
    for epd_uuid, epd_doc in epds:
        # 1) product name = first baseName value
        name_entries = (
            epd_doc.get("processInformation", {})
            .get("dataSetInformation", {})
            .get("name", {})
            .get("baseName", [])
        )
        if not name_entries or not isinstance(name_entries, list):
            continue
        product_name = name_entries[0].get("value", "")
        if not product_name:
            continue

        # 2) look for "Cxx/yy"
        hit = strength_class_re.search(product_name)
        if hit is None:
            continue

        # 3) record yy on the first exchange
        exchange_list = epd_doc.get("exchanges", {}).get("exchange", [])
        if not exchange_list:
            continue
        exchange_list[0].setdefault("materialProperties", []).append({
            "name": "compressive strength",
            "value": hit.group(2),
            "unit": "MPa",
            "unitDescription": "megapascals",
        })

        kept.append((epd_uuid, epd_doc, stored_label))

    # Cap the output at max_items
    print(f"Total EPDs after compressive strength parse: {len(kept)}")
    limited = kept[:max_items]
    print(f"Will output {len(limited)} EPDs. (Needed at least {max_items}.)")
    return limited

conn = sqlite3.connect(db_path)

# 1) For OEKOBAU.DAT
classification_in_json_oeko, all_epds_oeko = fetch_well_defined_epds(
    conn, classification_system="OEKOBAU.DAT", category_value=category_val
)
final_epds_oeko = process_epds_and_limit(
    all_epds_oeko, classification_in_json_oeko, classification_label="OEKOBAU.DAT"
)

# 2) For IBUCategories
classification_in_json_ibu, all_epds_ibu = fetch_well_defined_epds(
    conn, classification_system="IBUCategories", category_value=category_val
)
final_epds_ibu = process_epds_and_limit(
    all_epds_ibu, classification_in_json_ibu, classification_label="IBUCategories"
)

conn.close()

# Merge them. We'll store them all in a single JSONLines file
combined_epds = final_epds_oeko + final_epds_ibu
print(
    f"\nTotal EPDs => OEKOBAU.DAT: {len(final_epds_oeko)}, IBUCategories: {len(final_epds_ibu)}, Combined: {len(combined_epds)}"
)

# Uncomment to test this cell on its own; otherwise modified_json_docs must
# already exist from the earlier "International EPD System" cell.
# modified_json_docs = {}

# Size before this cell adds anything; used for the "Added N" message below.
modified_json_docs_nr = len(modified_json_docs)

# Append to the JSONLines file started by the earlier cell (mode="a").
output_jsonl = "../data/pipeline2/json/edited_epds.jsonl"
with jsonlines.open(output_jsonl, mode="a") as writer:
    # Continue the record ids after the documents collected so far.
    idx = len(modified_json_docs) + 1
    for epd_uuid, epd_doc, classification_str in combined_epds:
        # product name from the first baseName entry ("" when there is none)
        base_names = (
            epd_doc.get("processInformation", {})
            .get("dataSetInformation", {})
            .get("name", {})
            .get("baseName", [])
        )
        product_name = base_names[0].get("value", "") if base_names else ""

        # Add to modified dict (a uuid already present is overwritten, not duplicated)
        modified_json_docs[epd_uuid] = epd_doc
        # print(f"Added {epd_uuid} to modified_json_docs")

        record = {
            "id": idx,
            "uuid": epd_uuid,
            "epd_name": product_name,
            "classificationSys": classification_str,
            "document": epd_doc,
        }
        writer.write(record)
        idx += 1

    print(f"\nAdded {len(modified_json_docs) - modified_json_docs_nr} to modified_json_docs\n")

print(
    f"Final combined JSONLines => '{output_jsonl}' with {len(combined_epds)} records."
)



In [None]:
# Add EPDNorge

import json
import jsonlines
import sqlite3
import xml.etree.ElementTree as ET
import re

# =============================================================================
# PART 1: Load initial results from JSON lines and filter by best_category
# =============================================================================

# Inputs: the context records (one per EPD, with "Product" and "UUID") and the
# batch output holding one chat-completion response per line.
context_path = "../data/pipeline2/json/context_jinaai_jina-embeddings-v3_20250503134414.json"
batch_output_path = "../data/pipeline2/json/openai/batch_output_EPDNorge_concrete_batch_6816185b6cc08190a28a2829a6c1f780.jsonl"

with open(context_path, 'r', encoding='utf-8') as f:
    context_data = json.load(f)

results = []
with jsonlines.open(batch_output_path) as reader:
    for idx, batch_record in enumerate(reader):
        # The message content is itself a JSON string carrying "best_category".
        content_str = batch_record["response"]["body"]["choices"][0]["message"]["content"]
        content_json = json.loads(content_str)
        best_category = content_json.get("best_category", "")

        # NOTE(review): pairing is purely positional — line idx of the batch output
        # is matched with context_data[idx]. This assumes the batch output keeps the
        # input order; confirm, or join on the record's request id instead.
        new_entry = {
            "id": idx,
            "Product": context_data[idx]["Product"],
            "UUID": context_data[idx]["UUID"],
            "best_category": best_category
        }
        results.append(new_entry)

# Keep only the EPDs whose best category is exactly the target classification.
TARGET_CLASSIFICATION = "Mineral building products > Mortar and Concrete > Ready mixed concrete"
filtered_results = [r for r in results if r["best_category"] == TARGET_CLASSIFICATION]

print(f"Found {len(filtered_results)} EPD(s) after category filtering.\n")

# =============================================================================
# PART 2: Connect to DB and load EPDs. Insert compressive strength & density first.
# =============================================================================

# SQLite file whose epd_documents table holds the raw EPD JSON, keyed by uuid.
db_path = "../data/pipeline2/sql/epd_database.sqlite"

# Strength class in a product name: "BXX" (optionally "B XX") or "CXX/YY",
# two digits each, case-insensitive.
cs_pattern = re.compile(r'B\s?(\d{2})|C(\d{2})/(\d{2})', re.IGNORECASE)

def insert_compressive_strength_from_name(doc_data):
    """
    Add a 'compressive strength' property derived from the product name.

    The first baseName value is searched with cs_pattern. For "BXX" the value
    is XX; for "CXX/YY" it is YY. The entry (unit MPa) is appended to the first
    exchange's materialProperties. Nothing happens when the name, the pattern
    or the exchange list is missing.
    """
    name_entries = (
        doc_data
        .get("processInformation", {})
        .get("dataSetInformation", {})
        .get("name", {})
        .get("baseName", [])
    )
    if not name_entries or not isinstance(name_entries, list):
        return

    product_name = name_entries[0].get("value", "")
    hit = cs_pattern.search(product_name) if product_name else None
    if hit is None:
        return

    # group(1) is only set by the BXX alternative; otherwise CXX/YY matched
    # and group(3) holds YY.
    strength = hit.group(1) or hit.group(3)

    exchange_list = doc_data.get("exchanges", {}).get("exchange", [])
    if not exchange_list:
        return

    exchange_list[0].setdefault("materialProperties", []).append({
        "name": "compressive strength",
        "value": strength,
        "unit": "MPa",
        "unitDescription": "megapascals"
    })

def insert_gross_density_from_volume_mass(doc_data):
    """
    Add a 'gross density' property computed from the first exchange's flowProperties.

    Requires both of:
      - a flow property whose name contains 'volum' (covers 'Volume'/'Volum')
        with meanValue == 1 and referenceUnit == 'm3'
      - a flow property whose name contains 'mass' (covers 'Mass'/'Masse')
        with a meanValue convertible to float
    When both are present, the mass value (as a string) is appended to the
    first exchange's materialProperties as 'gross density' in kg/m^3. If
    several mass properties parse, the last one is used.
    """
    exchange_list = doc_data.get("exchanges", {}).get("exchange", [])
    if not exchange_list:
        return

    head = exchange_list[0]
    properties = head.get("flowProperties", [])
    if not properties:
        return

    def any_label_contains(prop, fragment):
        """True when any name entry of prop contains fragment (case-insensitive)."""
        return any(
            fragment in label.get("value", "").lower()
            for label in prop.get("name", [])
        )

    unit_volume_seen = False
    mass = None  # stays None until a mass meanValue parses

    for prop in properties:
        mean = prop.get("meanValue")

        if any_label_contains(prop, "volum") and mean == 1 and prop.get("referenceUnit", "") == "m3":
            unit_volume_seen = True

        if any_label_contains(prop, "mass"):
            try:
                mass = float(mean)
            except (TypeError, ValueError):
                pass  # non-numeric mass: keep whatever was found earlier

    if unit_volume_seen and mass is not None:
        head.setdefault("materialProperties", []).append({
            "name": "gross density",
            "value": str(mass),
            "unit": "kg/m^3",
            "unitDescription": "kilograms per cubic meter"
        })

def get_location(doc_data):
    """
    Return processInformation.geography.locationOfOperationSupplyOrProduction.location,
    or None when any level of that path is missing.
    """
    process_info = doc_data.get("processInformation", {})
    geography = process_info.get("geography", {})
    site = geography.get("locationOfOperationSupplyOrProduction", {})
    return site.get("location")

def has_material_property(doc_data, prop_name):
    """
    Tell whether the FIRST exchange has a materialProperties entry named
    prop_name (compared case-insensitively). Other exchanges are not inspected;
    a document without exchanges yields False.
    """
    exchange_list = doc_data.get("exchanges", {}).get("exchange", [])
    if not exchange_list:
        return False
    return any(
        prop.get("name", "").lower() == prop_name.lower()
        for prop in exchange_list[0].get("materialProperties", [])
    )

# Gather EPDs that end up having location, 'compressive strength', and 'gross density' after the inserts
EPDs_with_all_data = []  # list of (uuid, edited document)
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

for entry in filtered_results:
    uuid_val = entry["UUID"]
    cursor.execute("SELECT document FROM epd_documents WHERE uuid = ?", (uuid_val,))
    row = cursor.fetchone()
    if not row:
        # uuid not present in the database
        continue

    doc_text = row[0]
    try:
        doc_data = json.loads(doc_text)
    except json.JSONDecodeError:
        # stored document is not valid JSON
        continue

    # 1) Check if we have a non-empty location
    location_val = get_location(doc_data)
    if not location_val:
        # skip if there's no location
        continue

    # 2) Insert compressive strength from product name
    insert_compressive_strength_from_name(doc_data)

    # 3) Insert gross density from volume=1, mass
    insert_gross_density_from_volume_mass(doc_data)

    # Now see if we indeed have "compressive strength" and "gross density"
    # after insertion (either inserted above or already present under that name)
    has_compressive = has_material_property(doc_data, "compressive strength")
    has_density = has_material_property(doc_data, "gross density")

    if has_compressive and has_density:
        EPDs_with_all_data.append((uuid_val, doc_data))

conn.close()

print(f"Total EPDs after insertion steps (location + compressive strength + gross density): {len(EPDs_with_all_data)}\n")


# =============================================================================
# PART 3: Summation A1-A3, classification, limit to epd_count, output
# =============================================================================

def sum_a1_a2_a3(doc_data):
    """
    Collapse the per-module A1, A2 and A3 values into a single "A1-A3" entry.

    Applied in place to the "other" -> "anies" list of every exchange and every
    LCIAResult in doc_data. Within each list:
      - entries whose "module" is A1, A2 or A3 are removed and their values summed
        (a value that cannot be converted to float counts as 0.0);
      - if the sum is non-zero, one entry is inserted at the front: a copy of the
        first removed entry with "module" set to "A1-A3" and "value" set to the
        sum as a string;
      - all other entries keep their order. Entries that are not dicts (e.g. raw
        strings) have no module and are kept unchanged instead of raising.

    Returns None.
    """
    def handle_anies(anies_list):
        """Return the rewritten list for one "anies" value (see sum_a1_a2_a3)."""
        if not anies_list:
            return anies_list
        sum_value = 0.0
        sum_item_template = None
        keepers = []

        for item in anies_list:
            # Only dict entries can carry a module; anything else is passed through.
            if not isinstance(item, dict) or item.get("module") not in ("A1", "A2", "A3"):
                keepers.append(item)
                continue

            try:
                sum_value += float(item.get("value"))
            except (ValueError, TypeError):
                pass  # unparseable value contributes 0.0
            if sum_item_template is None:
                # Copy so the original entry is not modified.
                sum_item_template = dict(item)

        # A non-zero sum implies at least one A1/A2/A3 entry was seen,
        # so the template is set here.
        if sum_value != 0.0:
            sum_item_template["module"] = "A1-A3"
            sum_item_template["value"] = str(sum_value)
            keepers.insert(0, sum_item_template)
        return keepers

    def rewrite_entries(entries):
        """Apply handle_anies to the "other" -> "anies" list of each entry."""
        for entry in entries:
            other = entry.get("other", {})
            if not isinstance(other, dict):
                # e.g. a JSON null: nothing to rewrite
                continue
            other["anies"] = handle_anies(other.get("anies", []))

    rewrite_entries(doc_data.get("exchanges", {}).get("exchange", []))
    rewrite_entries(doc_data.get("LCIAResults", {}).get("LCIAResult", []))

def parse_category_xml(xml_path):
    """
    Load the category XML at xml_path and return the Process category tree.

    The result maps each top-level category name to the nested dict produced
    by build_category_dict. If no <categories dataType="Process"> element is
    present, a warning is printed and an empty dict is returned.
    """
    document_root = ET.parse(xml_path, parser=ET.XMLParser(encoding='utf-8')).getroot()
    process_categories = document_root.find('.//{*}categories[@dataType="Process"]')

    if process_categories is None:
        print("Warning: Could not find <categories dataType='Process'> in XML.")
        return {}

    lookup = {}
    for top_element in process_categories.findall('{*}category'):
        node = build_category_dict(top_element)
        lookup[node['name']] = node
    return lookup

def build_category_dict(xml_element):
    """
    Turn one category element and all of its descendants into nested dicts.

    Returns a dict with the element's 'id' and 'name' attributes plus
    'children', a mapping of child name -> the same structure built from each
    direct child element whose local tag is 'category' (any namespace).
    """
    subtrees = (
        build_category_dict(node)
        for node in xml_element.findall('{*}category')
    )
    return {
        'id': xml_element.get('id'),
        'name': xml_element.get('name'),
        # Children are keyed by name; a repeated name keeps the last one seen.
        'children': {subtree['name']: subtree for subtree in subtrees},
    }

def get_category_path_info(category_hierarchy, categories_dict):
    """
    Resolve a list of category names against the nested category lookup.

    Returns [(name, id), (name, id), ...], one tuple per level, outermost
    first. An empty or missing hierarchy yields an empty list.

    Raises:
        ValueError: if a name is not present at its level of the tree.
    """
    if not category_hierarchy:
        return []

    trail = []
    scope = categories_dict  # lookup for the level currently being resolved
    for depth, label in enumerate(category_hierarchy):
        if label not in scope:
            raise ValueError(f"Category '{label}' not found at level {depth}.")
        node = scope[label]
        trail.append((node['name'], node['id']))
        scope = node['children']
    return trail

def insert_classification(doc_data, classification_system_in_json, target_class=TARGET_CLASSIFICATION):
    """
    Append an OEKOBAU.DAT classification for target_class to doc_data, in place.

    target_class is a " > "-separated category path that is resolved against
    the module-level category_dict. If it cannot be resolved, a warning is
    printed, doc_data is left untouched and classification_system_in_json is
    returned unchanged.

    Returns the "name" of the first classification entry that was already in
    the document and has a truthy name; when there is none, the
    classification_system_in_json argument is returned as passed in.
    """
    levels = [part.strip() for part in target_class.split(">")]
    try:
        resolved_path = get_category_path_info(levels, category_dict)
    except ValueError as e:
        print(f"Warning: Could not resolve categories. {e}")
        return classification_system_in_json

    oekobau_entry = {
        "class": [
            {"value": name, "level": depth, "classId": class_id}
            for depth, (name, class_id) in enumerate(resolved_path)
        ],
        "name": "OEKOBAU.DAT"
    }

    # Look at the classifications present BEFORE the new entry is appended,
    # so the entry added below can never be reported as the existing system.
    present = (
        doc_data
        .get("processInformation", {})
        .get("dataSetInformation", {})
        .get("classificationInformation", {})
        .get("classification", [])
    )
    first_named = next((entry.get("name") for entry in present if entry.get("name")), None)
    if first_named:
        classification_system_in_json = first_named

    # Create any missing intermediate dicts, then append while preserving
    # whatever classification list was already there.
    info = (
        doc_data
        .setdefault("processInformation", {})
        .setdefault("dataSetInformation", {})
        .setdefault("classificationInformation", {})
    )
    entries = info.get("classification")
    if not isinstance(entries, list):
        entries = []
    entries.append(oekobau_entry)
    info["classification"] = entries

    return classification_system_in_json

# Category tree used by insert_classification to resolve the target path.
xml_path = "../data/pipeline2/xml/OEKOBAU.DAT_Categories_EN_API.xml"
category_dict = parse_category_xml(xml_path)

# Keep at most epd_count EPDs
number_epds = epd_count
final_list = EPDs_with_all_data[:number_epds]

# Uncomment for testing
# modified_json_docs = {}

# uuids selected by THIS cell, in order; modified_json_docs also holds the
# documents collected by the earlier cells.
selected_uuids = []
classification_system_in_json = None

for (uuid_val, doc_data) in final_list:
    # Summation A1-A3
    sum_a1_a2_a3(doc_data)
    # Insert classification
    classification_system_in_json = insert_classification(doc_data, classification_system_in_json)
    # Store
    modified_json_docs[uuid_val] = doc_data
    selected_uuids.append(uuid_val)

# Report this cell's selection only: len(modified_json_docs) would also count
# documents from earlier cells, which were never checked for a location.
print(f"\nFinal selection: {len(selected_uuids)} EPDs that have location, compressive strength, and gross density.\n")

# =============================================================================
# PART 4: Output final
# =============================================================================

# output_file = "selected_epds_norge.json"
# output_data = {
#     "classificationSystem": classification_system_in_json,
#     "uuids": list(modified_json_docs.keys())
# }
# with open(output_file, "w", encoding="utf-8") as out_f:
#     json.dump(output_data, out_f, indent=2, ensure_ascii=False)

# print(f"Wrote {len(modified_json_docs)} final EPDs to '{output_file}'.\n")

# Append this cell's EPDs to the JSONLines file written by the earlier cells.
output_jsonl = "../data/pipeline2/json/edited_epds.jsonl"
with jsonlines.open(output_jsonl, mode='a') as writer:
    # Continue the record ids after the documents that were in modified_json_docs
    # before this cell added its own.
    # NOTE(review): this assumes none of this cell's uuids were already in
    # modified_json_docs; an overwrite would make ids overlap — confirm.
    idx = len(modified_json_docs) - len(selected_uuids) + 1
    for epd_uuid in selected_uuids:
        doc = modified_json_docs[epd_uuid]
        # Classification system name: taken from the FIRST classification entry
        # ("Unknown" when the document has none).
        classifications = (
            doc.get("processInformation", {})
               .get("dataSetInformation", {})
               .get("classificationInformation", {})
               .get("classification", [])
        )
        existing_class_sys = classifications[0].get("name") if classifications else "Unknown"

        # product name from the first baseName entry ("" when there is none)
        base_names = (
            doc.get("processInformation", {})
               .get("dataSetInformation", {})
               .get("name", {})
               .get("baseName", [])
        )
        epd_name = base_names[0].get("value", "") if base_names else ""

        # One JSON line per EPD: summary fields plus the full edited document.
        writer.write({
            "id": idx,
            "uuid": epd_uuid,
            "epd_name": epd_name,
            "classificationSys": existing_class_sys,
            "document": doc
        })
        idx += 1

print(f"Wrote full JSON docs for {len(selected_uuids)} EPDs to '{output_jsonl}'.")


In [None]:
# 2. Rename JSON keys (updated with new logic)

import json

def recursive_rename_uri(obj):
    """
    Return a copy of ``obj`` in which every dict key named 'uri' is replaced
    by 'refObjectUri', at any nesting depth.

    Dicts and lists are rebuilt (the input is not modified); any other value
    is returned unchanged.
    """
    if isinstance(obj, list):
        return [recursive_rename_uri(element) for element in obj]
    if not isinstance(obj, dict):
        # Primitive (or any non-container) value: nothing to rename.
        return obj
    return {
        ("refObjectUri" if name == "uri" else name): recursive_rename_uri(child)
        for name, child in obj.items()
    }

def remove_raw_strings_in_anies(obj):
    """
    Return a copy of ``obj`` in which every list stored under a key named
    'anies' has its plain-string elements dropped, at any nesting depth.

    Only direct string members of an 'anies' list are removed; strings in
    other lists, and 'anies' values that are not lists, are kept. The input
    is not modified.
    """
    if isinstance(obj, list):
        return [remove_raw_strings_in_anies(element) for element in obj]
    if not isinstance(obj, dict):
        return obj

    cleaned = {}
    for name, child in obj.items():
        if name == "anies" and isinstance(child, list):
            # Drop the raw strings first; the survivors are cleaned recursively below.
            child = [element for element in child if not isinstance(element, str)]
        cleaned[name] = remove_raw_strings_in_anies(child)
    return cleaned

def _rename_key(mapping, old_key, new_key):
    """Move mapping[old_key] to mapping[new_key] in place; return True if old_key was present."""
    if old_key not in mapping:
        return False
    mapping[new_key] = mapping.pop(old_key)
    return True


def _rename_other_block(container, new_key):
    """
    Rename container['other'] to ``new_key`` and, inside its 'anies' list,
    rename every dict-valued 'value' entry to 'objectValue' (all in place).
    """
    if not _rename_key(container, "other", new_key):
        return
    for item in container[new_key].get("anies", []):
        if isinstance(item, dict) and isinstance(item.get("value"), dict):
            item["objectValue"] = item.pop("value")


def _rename_dstar_reference(ref):
    """Rename the sub-keys of one 'valueDSTAR' dict (shortDescription, version, uuid) in place."""
    _rename_key(ref, "shortDescription", "shortDescriptionExtended")
    if "version" in ref:
        version = ref.pop("version")
        # Only a dict payload has an inner "version" to rename; anything else is moved as-is.
        if isinstance(version, dict):
            _rename_key(version, "version", "versionInt")
        ref["versionDict"] = version
    if "uuid" in ref:
        uuid_obj = ref.pop("uuid")
        if isinstance(uuid_obj, dict):
            _rename_key(uuid_obj, "uuid", "uuidValue")
        ref["uuidDict"] = uuid_obj


def transform_json(data):
    """
    Transform an Environmental Product Declaration (EPD) JSON instance by renaming keys
    and restructuring its contents to standardize the schema.

    Steps, in order:
      - processInformation: dataSetInformation.name -> dataSetName, other -> otherDSI,
        classification "class" -> "classEntries", time -> timeInformation
        (other -> otherTime, value -> timestampValue).
      - modellingAndValidation: other -> otherMAA / otherDSTAR / otherMAV,
        validation -> validationInfo, plus the nested DSTAR renames.
      - administrativeInformation: publicationAndOwnership.other -> otherPAO.
      - exchanges and LCIAResults (renamed to lciaResults): per-entry renames, and
        "relativeStandardDeviation95In" is dropped from every entry.
      - Top-level "otherAttributes" and "locations" are removed.
      - Globally remove raw string elements from any 'anies' list.
      - Globally rename every "uri" to "refObjectUri".

    Parameters:
      data : dict holding one EPD document. The key renames above are applied
             to this object and its nested dicts IN PLACE.

    Returns: a new dictionary built by the two global steps. Callers should use
    the return value; the input is left partially renamed and is not the same
    object as the result.
    """
    # --- processInformation transformations ---
    process_info = data.get("processInformation", {})
    data_set_info = process_info.get("dataSetInformation", {})

    _rename_key(data_set_info, "name", "dataSetName")

    if _rename_key(data_set_info, "other", "otherDSI"):
        other_dsi = data_set_info["otherDSI"]
        if "anies" in other_dsi:
            # Drop the whole "anies" list as soon as one entry carries
            # "componentsAndMaterialsAndSubstances".
            if any(
                isinstance(item, dict) and "componentsAndMaterialsAndSubstances" in item
                for item in other_dsi["anies"]
            ):
                other_dsi.pop("anies")
            # otherDSI.scenario -> dataSetInformation.objectScenario.
            # NOTE(review): this only runs when "anies" was present on entry;
            # confirm whether "scenario" should also be moved when it was not.
            if "scenario" in other_dsi:
                data_set_info["objectScenario"] = other_dsi.pop("scenario")

    # For classification entries, rename "class" -> "classEntries"
    classification_info = data_set_info.get("classificationInformation", {})
    for cls_obj in classification_info.get("classification", []):
        _rename_key(cls_obj, "class", "classEntries")

    # "time" -> "timeInformation"; inside it "other" -> "otherTime" and every
    # "value" of its anies entries -> "timestampValue" (whatever its type).
    if _rename_key(process_info, "time", "timeInformation"):
        time_info = process_info["timeInformation"]
        if _rename_key(time_info, "other", "otherTime"):
            for item in time_info["otherTime"].get("anies", []):
                if isinstance(item, dict):
                    _rename_key(item, "value", "timestampValue")

    # --- modellingAndValidation transformations ---
    mod_val = data.get("modellingAndValidation", {})

    # LCIMethodAndAllocation.other -> otherMAA
    _rename_key(mod_val.get("LCIMethodAndAllocation", {}), "other", "otherMAA")

    # dataSourcesTreatmentAndRepresentativeness.other -> otherDSTAR,
    # its anies -> aniesDSTAR, dict-valued value -> valueDSTAR (+ sub-key renames)
    dstar = mod_val.get("dataSourcesTreatmentAndRepresentativeness", {})
    if _rename_key(dstar, "other", "otherDSTAR"):
        other_dstar = dstar["otherDSTAR"]
        if _rename_key(other_dstar, "anies", "aniesDSTAR"):
            for item in other_dstar["aniesDSTAR"]:
                if isinstance(item, dict) and isinstance(item.get("value"), dict):
                    item["valueDSTAR"] = item.pop("value")
                    _rename_dstar_reference(item["valueDSTAR"])

    _rename_key(mod_val, "validation", "validationInfo")
    _rename_other_block(mod_val, "otherMAV")

    # --- administrativeInformation transformations ---
    admin_info = data.get("administrativeInformation", {})
    _rename_other_block(admin_info.get("publicationAndOwnership", {}), "otherPAO")

    # --- exchanges transformations ---
    for exchange in data.get("exchanges", {}).get("exchange", []):
        # flowProperties: rename name->nameFP, uuid->uuidFP
        for flow_prop in exchange.get("flowProperties", []):
            _rename_key(flow_prop, "name", "nameFP")
            _rename_key(flow_prop, "uuid", "uuidFP")

        _rename_key(exchange, "exchange direction", "exchangeDirection")
        _rename_other_block(exchange, "otherEx")

        # "classification" -> "classificationEx" and inside, "name" -> "nameClass"
        if _rename_key(exchange, "classification", "classificationEx"):
            _rename_key(exchange["classificationEx"], "name", "nameClass")

        exchange.pop("relativeStandardDeviation95In", None)

    # --- LCIAResults transformations ---
    _rename_key(data, "LCIAResults", "lciaResults")
    for result in data.get("lciaResults", {}).get("LCIAResult", []):
        _rename_other_block(result, "otherLCIA")
        result.pop("relativeStandardDeviation95In", None)

    # --- Removal ---
    # Only these two top-level keys are dropped; modellingAndValidation,
    # administrativeInformation, exchanges and lciaResults are kept.
    for obsolete_key in ("otherAttributes", "locations"):
        data.pop(obsolete_key, None)

    # --- Global step: remove raw string elements from any 'anies' list
    data = remove_raw_strings_in_anies(data)

    # --- Global step: rename "uri" -> "refObjectUri"
    data = recursive_rename_uri(data)

    return data

# -----------------------------------------------------------
# Example pipeline usage (in memory, no disk I/O):
# -----------------------------------------------------------
# Run transform_json over every document held in modified_json_docs and store
# the returned document back under the same UUID key. Only existing keys are
# reassigned, so the dict does not change size while it is being iterated.
# NOTE(review): this overwrites the shared dict, so re-running the cell feeds
# already-transformed documents through transform_json again.
for uuid_val, original_doc in modified_json_docs.items():
    final_doc = transform_json(original_doc)

    # Replace the stored document with its transformed version.
    modified_json_docs[uuid_val] = final_doc

    # Optional debug output (disabled):
    # print(f"\nTransformed JSON for UUID = {uuid_val}:")
    # print(json.dumps(final_doc, indent=2))
    # print("--------------------------------------------------------")


In [None]:
# 3. Add ids
import yaml
import re

# --------------------------- Utility functions ---------------------------

def load_yaml_schema(file_path):
    """Read the YAML file at ``file_path`` (UTF-8) and return the object parsed by ``yaml.safe_load``."""
    with open(file_path, mode="r", encoding="utf-8") as schema_file:
        parsed_schema = yaml.safe_load(schema_file)
    return parsed_schema

def reorder_dict_keys(d):
    """
    Move the 'id' entry of ``d`` to the front, in place, keeping the relative
    order of all other keys. Does nothing when ``d`` has no 'id' key.
    (For human readability only; returns None.)
    """
    if "id" not in d:
        return
    id_value = d["id"]
    remaining = {name: value for name, value in d.items() if name != "id"}
    # Rebuild the same dict object so existing references see the new order.
    d.clear()
    d["id"] = id_value
    d.update(remaining)

def clean_epd_name(name):
    """
    Normalise an EPD name: every run of characters other than ASCII letters
    and digits becomes a single underscore, and underscores at either end of
    the result are stripped.
    """
    # A run of non-alphanumerics (underscores included) collapses to one "_",
    # which is the same as replacing each character and then merging repeats.
    underscored = re.sub(r"[^A-Za-z0-9]+", "_", name)
    return underscored.strip("_")

# Create new ID

def generate_id_from_path(acc_path, prefix="ilcd"):
    """
    Build the full identifier for an accumulated path by joining ``prefix``
    and ``acc_path`` with a colon, i.e. "<prefix>:<acc_path>".
    """
    return "{}:{}".format(prefix, acc_path)

def get_suffix(item, index):
    """
    Return the path suffix for a list element.

    A dict element that has a 'module' field yields 'module' followed by that
    field's value with dashes removed; any other element yields its 1-based
    position, zero-padded to at least two digits.
    """
    if not isinstance(item, dict) or "module" not in item:
        return format(index + 1, "02d")
    module_label = item["module"].replace("-", "")
    return "module{}".format(module_label)

def assign_ids_by_path(obj, epd_uuid, acc_path, parent_is_list, prefix="ilcd"):
    """
    Walk ``obj`` recursively and give every dict that lacks an 'id' one derived
    from its path, placing the new 'id' first in the dict. Works in place.

    Parameters:
      obj            : current object (dict, list, or primitive)
      epd_uuid       : the top-level EPD UUID (dashes removed), used as a base
      acc_path       : the accumulated path string for ``obj``
      parent_is_list : whether ``obj`` was reached as an element of a list
      prefix         : the string prefix to use, e.g. "ilcd"

    Path rules:
      - a list element gets "<path>_<suffix>", with the suffix from get_suffix;
      - a list stored under a key gets "<path>_<key>" before the element suffix;
      - a dict stored under a key gets "<path>_<key>" only when its parent was a
        list element; otherwise its path restarts at the bare key.
    """
    if isinstance(obj, list):
        for position, element in enumerate(obj):
            element_path = f"{acc_path}_{get_suffix(element, position)}"
            assign_ids_by_path(element, epd_uuid, element_path, parent_is_list=True, prefix=prefix)
        return

    if not isinstance(obj, dict):
        # Primitives carry no ID.
        return

    if "id" not in obj:
        # ID => prefix:epd_uuid_accPath
        obj["id"] = generate_id_from_path(f"{epd_uuid}_{acc_path}", prefix)
        reorder_dict_keys(obj)

    for key, value in obj.items():
        if key == "id":
            continue
        if isinstance(value, dict):
            child_path = f"{acc_path}_{key}" if parent_is_list else key
            assign_ids_by_path(value, epd_uuid, child_path, parent_is_list=False, prefix=prefix)
        elif isinstance(value, list):
            # The list branch above appends the per-element suffix to this path.
            assign_ids_by_path(value, epd_uuid, f"{acc_path}_{key}", parent_is_list, prefix=prefix)


# MAIN LOGIC: In-memory Example 

# Load the LinkML schema; it is only used here to look up the ID prefix.
SCHEMA_PATH = "../data/linkml/yaml/linkml_ILCDmergedSchemas_schema.yaml"
schema = load_yaml_schema(SCHEMA_PATH)

# Use the schema's "default_prefix" entry when present, otherwise "ilcd".
default_prefix = schema.get("default_prefix", "ilcd")

# Assign IDs in place to every document in modified_json_docs.
for uuid_val, doc in modified_json_docs.items():
    # 1) Get the "real" UUID from the doc (with dashes); a missing key is
    #    re-raised as a KeyError that names the offending document.
    try:
        raw_uuid = doc["processInformation"]["dataSetInformation"]["UUID"]
    except KeyError as e:
        raise KeyError(f"Missing processInformation.dataSetInformation.UUID in doc {uuid_val}") from e
    
    # 2) Remove dashes to build a base
    epd_uuid = raw_uuid.replace("-", "")

    # 3) Assign top-level doc ID (prefix:epd_uuid); the prefix is lower-cased.
    #    This is set before the loop below, so doc keeps its size while iterated.
    doc["id"] = f"{default_prefix.lower()}:{epd_uuid}"

    # 4) For each top-level key (besides 'id'/'version'), set a sub-ID and recursively assign deeper IDs
    for top_key, top_obj in doc.items():
        if top_key in ["id", "version"]:
            continue
        if isinstance(top_obj, dict):
            # e.g. "processInformation" => prefix:epd_uuid_processInformation
            # This overwrites any "id" the section already had.
            top_obj["id"] = f"{default_prefix.lower()}:{epd_uuid}_{top_key}"
            reorder_dict_keys(top_obj)
            # The section's own ID is already set, so this call fills in its descendants.
            assign_ids_by_path(
                top_obj,
                epd_uuid=epd_uuid,
                acc_path=top_key,
                parent_is_list=False,
                prefix=default_prefix.lower()
            )
        elif isinstance(top_obj, list):
            # If it's a list at top level, handle each item
            for i, item in enumerate(top_obj):
                suffix = get_suffix(item, i)
                # top path => top_key + "_" + suffix
                top_path = f"{top_key}_{suffix}"
                assign_ids_by_path(
                    item,
                    epd_uuid=epd_uuid,
                    acc_path=top_path,
                    parent_is_list=True,
                    prefix=default_prefix.lower()
                )

    # 5) Optional debug output (disabled): print final JSON to confirm
    # print(f"\n=== JSON with newly assigned IDs for doc (pipeline UUID): {uuid_val} ===")
    # print(json.dumps(doc, indent=2))
    # print("-----------------------------------------------------------------")


In [None]:
# 4. Convert to RDF

import rdflib
from linkml.validator import Validator
from linkml_runtime.loaders import YAMLLoader
from linkml_runtime.dumpers import RDFLibDumper
from linkml_runtime.utils.schemaview import SchemaView

# Import your generated Python dataclass for the schema
# e.g., from data.linkml.py.linkml_processDataSet_schema import ProcessDataSet
from data.linkml.py.linkml_processDataSet_schema import ProcessDataSet


def generate_turtle_from_docs(
    docs_dict,
    schema_path,
    turtle_output_path,
    validate: bool = False
):
    """
    Load every JSON document as a ``ProcessDataSet``, convert it to RDF and
    write all resulting triples into a single Turtle file.

    Parameters:
    -----------
    docs_dict : dict
        A dict of {UUID: JSON-Dict} storing the EPD JSON objects in memory.
    schema_path : str
        Path to the LinkML YAML schema; passed to SchemaView and, when
        validating, to Validator.
    turtle_output_path : str
        Destination passed to the graph's ``serialize`` call.
    validate : bool
        If True, each document is run through the LinkML Validator first.
        Validation findings are only printed; the document is still loaded.

    Returns:
    --------
    failed_docs : dict
        {UUID: JSON-Dict} for every document whose load raised ValueError or
        TypeError. Any other exception propagates to the caller.
    """
    combined_graph = rdflib.Graph()
    schema_view = SchemaView(schema_path)
    validator = Validator(schema_path, strict=False) if validate else None
    failed_docs = {}
    rdf_dumper = RDFLibDumper()
    loaded_total = 0

    for doc_uuid, doc_payload in docs_dict.items():
        # (A) Optional validation: report findings, never skip the document.
        # NOTE(review): validation receives the document wrapped under
        # "processDataSet", while loading below uses the bare document -
        # confirm which shape the schema's ProcessDataSet class expects.
        if validator:
            report = validator.validate({"processDataSet": doc_payload}, "ProcessDataSet")
            if not report.results:
                print(f"[VALIDATION] Document {doc_uuid} is valid according to the schema.")
            else:
                print(f"[VALIDATION] Errors for UUID={doc_uuid}:")
                for finding in report.results:
                    print("  -", finding.message)

        # (B) Load as a ProcessDataSet; record the document and move on if that fails.
        try:
            instance = YAMLLoader().load(doc_payload, target_class=ProcessDataSet)
        except (ValueError, TypeError) as load_error:
            print(f"[ERROR] Failed to load doc {doc_uuid} as ProcessDataSet. Reason:\n  {load_error}")
            failed_docs[doc_uuid] = doc_payload
            continue

        # (C) Convert to an rdflib graph and merge it into the combined graph.
        combined_graph += rdf_dumper.as_rdf_graph(instance, schemaview=schema_view)
        loaded_total += 1

    # Serialize everything that loaded successfully as Turtle.
    combined_graph.serialize(destination=turtle_output_path, format="turtle")
    print(f"\nSuccessfully wrote {loaded_total} instances to the Turtle file:\n  {turtle_output_path}")
    if failed_docs:
        print(f"{len(failed_docs)} documents failed to load and were skipped.")

    return failed_docs

# Generate RDF
# Schema used for loading/validating, and the Turtle file to write.
# NOTE(review): this schema file differs from SCHEMA_PATH used in the
# "Add ids" cell - confirm that is intended.
schema_file = "../data/linkml/yaml/linkml_processDataSet_schema.yaml"
ttl_output = "../data/linkml/rdf/epd_rdf_instance_datastore.ttl"

# Convert every document in modified_json_docs, with validation switched on.
failed = generate_turtle_from_docs(
    docs_dict=modified_json_docs,
    schema_path=schema_file,
    turtle_output_path=ttl_output,
    validate=True
)

# List the UUIDs of the documents returned as failed; bad_doc itself is not printed.
if failed:
    print("\nFailed doc details:")
    for bad_uuid, bad_doc in failed.items():
        print(" -", bad_uuid, "(Doc not loaded successfully)")
