In [None]:
# 1.1 Change category (keeping original classifications if present)

import sqlite3
import pandas as pd
import json
import xml.etree.ElementTree as ET

# -----------------------------------------------------------------------------
# 1. PARSE THE XML TO BUILD A CATEGORY LOOKUP STRUCTURE
# -----------------------------------------------------------------------------
def build_category_dict(xml_element):
    """Turn a category element and all of its nested categories into a dict tree.

    The returned node holds the element's ``id`` and ``name`` attributes and a
    ``children`` mapping of child name -> child node (same shape, recursively).
    If two sibling categories share a name, the later one replaces the earlier.
    """
    # '{*}category' selects direct <category> children regardless of XML namespace.
    child_nodes = (
        build_category_dict(sub_elem)
        for sub_elem in xml_element.findall('{*}category')
    )
    return {
        'id': xml_element.get('id'),
        'name': xml_element.get('name'),
        'children': {node['name']: node for node in child_nodes},
    }

def parse_category_xml(xml_path):
    """Load the category XML file and return its top-level Process categories.

    Looks for the first ``<categories dataType="Process">`` element (in any
    namespace) and returns a mapping of top-level category name -> node as
    built by ``build_category_dict``. When no such element exists, a warning
    is printed and an empty dict is returned.
    """
    utf8_parser = ET.XMLParser(encoding='utf-8')
    root = ET.parse(xml_path, parser=utf8_parser).getroot()

    process_categories = root.find('.//{*}categories[@dataType="Process"]')
    if process_categories is None:
        print("Warning: Could not find <categories dataType='Process'> in XML.")
        return {}

    lookup = {}
    for top_elem in process_categories.findall('{*}category'):
        node = build_category_dict(top_elem)
        lookup[node['name']] = node
    return lookup

def get_category_path_info(category_hierarchy, categories_dict):
    """Resolve a chain of category names into ``(name, id)`` pairs, one per level.

    ``category_hierarchy`` is walked from the top level downwards through
    ``categories_dict`` (name -> node with 'name', 'id' and 'children').
    An empty or missing hierarchy yields an empty list.

    Raises:
        ValueError: if a name is not present at its level of the tree.
    """
    if not category_hierarchy:
        return []

    resolved = []
    level_lookup = categories_dict
    for depth, wanted in enumerate(category_hierarchy):
        if wanted in level_lookup:
            node = level_lookup[wanted]
            resolved.append((node['name'], node['id']))
            # Descend: the next name is looked up among this node's children.
            level_lookup = node['children']
            continue
        raise ValueError(f"Category '{wanted}' not found at level {depth}.")
    return resolved

# -----------------------------------------------------------------------------
# 1.1. CHANGE CATEGORY FOR EPDs FROM CSV, preserving existing classification entries
# -----------------------------------------------------------------------------

# Step 1.1 inputs: category tree (XML), EPD documents (SQLite) and the
# regex-classified CSV, which must provide "UUID" and "RegEx Classification"
# columns.
xml_path = "../data/pipeline2/xml/OEKOBAU.DAT_Categories_EN_API.xml"
category_dict = parse_category_xml(xml_path)

db_path = "../data/pipeline2/sql/epd_database.sqlite"

csv_path = "../data/pipeline2/sql/regex_classified/filtered_epd_data02_classified_concrete03.csv"
df = pd.read_csv(csv_path)

num_epd_files = 2000  # adjust as needed
df_limited = df.head(num_epd_files)

# uuid -> updated JSON document; nothing is written back to the database here.
modified_json_docs = {}

TARGET_CLASSIFICATION = "Mineral building products > Mortar and Concrete > Ready mixed concrete"

# The connection is opened only once all inputs are loaded and is closed in
# `finally`, so an exception inside the loop no longer leaks it.
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    for index, row in df_limited.iterrows():
        uuid_val = row["UUID"]
        classification_str = row["RegEx Classification"]

        # Only process rows with the target classification
        # (missing/NaN values never compare equal and are skipped too).
        if classification_str != TARGET_CLASSIFICATION:
            continue

        # Fetch JSON from epd_documents
        cursor.execute("SELECT document FROM epd_documents WHERE uuid = ?", (uuid_val,))
        result = cursor.fetchone()
        if not result:
            print(f"No document found for UUID: {uuid_val}")
            continue

        json_text = result[0]
        try:
            json_data = json.loads(json_text)
        except (json.JSONDecodeError, TypeError):
            # TypeError covers a NULL document column (json.loads(None)).
            print(f"Failed to decode JSON for UUID: {uuid_val}")
            continue

        # Build the new classification array from the CSV "RegEx Classification"
        category_names = [x.strip() for x in classification_str.split(">")]
        try:
            path_info = get_category_path_info(category_names, category_dict)
        except ValueError as e:
            print(f"Warning: Could not resolve categories for UUID: {uuid_val}. {e}")
            continue

        class_array = []
        for level, (cat_name, cat_id) in enumerate(path_info):
            class_array.append({
                "value": cat_name,
                "level": level,
                "classId": cat_id
            })

        # Prepare new classification item; set its "name" to a chosen value (here, "OEKOBAU.DAT")
        new_classification_item = {
            "class": class_array,
            "name": "OEKOBAU.DAT"
        }

        # Inject the new classification item while keeping original ones
        process_info = json_data.setdefault("processInformation", {})
        data_set_info = process_info.setdefault("dataSetInformation", {})
        classification_info = data_set_info.setdefault("classificationInformation", {})

        # Retrieve any existing classifications; anything that is not a list
        # (missing key, null, single object) is replaced by a fresh list.
        existing_classifications = classification_info.get("classification")
        if not isinstance(existing_classifications, list):
            existing_classifications = []
        # Append the new classification
        existing_classifications.append(new_classification_item)
        classification_info["classification"] = existing_classifications

        # Store the updated JSON in our in-memory dict
        modified_json_docs[uuid_val] = json_data

        print(f"Appended new category for UUID {uuid_val}: {classification_str}")
        print("Full classification now:")
        print(json.dumps(classification_info["classification"], indent=2))
        print("--------------------------------------------------")
finally:
    conn.close()

In [None]:
# Step 1.2 setup: add already well-classified EPDs whose metadata lists the
# classification system "OEKOBAU.DAT" (upper-case spelling).
classification_sys = "OEKOBAU.DAT"
category_val = "Beton"
limit_to_fetch = 500  # upper bound on the number of matching documents to fetch


import json

# `db_path` is defined by the step 1.1 cell, which must have been run first.
conn = sqlite3.connect(db_path)
# NOTE(review): this cursor is not used below; fetch_well_defined_epds creates its own.
cursor = conn.cursor()

def fetch_well_defined_epds(
    conn,
    classification_system="OEKOBAU.DAT",
    category_value="Beton",
    max_items=1
):
    """
    Fetch up to 'max_items' EPDs from epd_documents + epd_metadata
    where m.classification_system = classification_system,
    and the JSON classification array truly has {"value": category_value}.

    Parameters:
      conn                  : open sqlite3 connection to the EPD database
      classification_system : value matched against epd_metadata.classification_system
      category_value        : "value" that must occur in some classification's "class" list
      max_items             : maximum number of hits; <= 0 yields no hits,
                              None means no limit

    Returns a list of (uuid, doc_data) tuples, doc_data being the parsed JSON.
    If none match, returns [] (empty list).

    Rows with an empty document, invalid JSON, or an unexpected JSON shape
    (null / non-dict / non-list nodes on the classification path) are skipped.
    """
    results = []
    # The limit used to be checked only after appending, so max_items <= 0
    # still returned one hit.
    if max_items is not None and max_items <= 0:
        return results

    # Broad SELECT for this classification_system; the category check itself
    # is done in Python on the parsed document.
    sql = """
    SELECT d.uuid, d.document
    FROM epd_documents d
    JOIN epd_metadata m 
        ON d.uuid = m.uuid
    WHERE m.classification_system = ?
    """

    def has_category(doc_data):
        """Return True if any classification's "class" list holds category_value."""
        node = doc_data
        for key in (
            "processInformation",
            "dataSetInformation",
            "classificationInformation",
            "classification",
        ):
            if not isinstance(node, dict):
                return False
            node = node.get(key)
        if not isinstance(node, list):
            return False

        for classification_item in node:
            # classification_item["name"] is not compared with
            # classification_system; the metadata join already selects by system.
            if not isinstance(classification_item, dict):
                continue
            class_entries = classification_item.get("class")
            if not isinstance(class_entries, list):
                continue
            if any(
                isinstance(cls_obj, dict) and cls_obj.get("value") == category_value
                for cls_obj in class_entries
            ):
                return True
        return False

    cursor = conn.cursor()
    try:
        cursor.execute(sql, (classification_system,))
        # Iterate the cursor lazily so that stopping at max_items does not
        # require loading every document into memory first.
        for doc_uuid, doc_text in cursor:
            # Skip empty or invalid JSON
            if not doc_text:
                continue
            try:
                doc_data = json.loads(doc_text)
            except (ValueError, TypeError):
                continue

            if has_category(doc_data):
                results.append((doc_uuid, doc_data))
                if max_items is not None and len(results) >= max_items:
                    break
    finally:
        cursor.close()

    return results


# Step #1.2: Add well-defined EPD(s) to the pipeline.
# Relies on `conn`, `classification_sys`, `category_val` and `limit_to_fetch`
# from the setup above and on `modified_json_docs` created in step 1.1.
found_epds = fetch_well_defined_epds(
    conn,
    classification_system=classification_sys,
    category_value=category_val,
    max_items=limit_to_fetch
)

if not found_epds:
    print(f"No EPD found for classification_system='{classification_sys}' and category_value='{category_val}'.")
else:
    print(f"Found {len(found_epds)} EPD(s) with '{category_val}' in classification system '{classification_sys}':")
    for (u, doc) in found_epds:
        print("  -", u)

    # To add only a single document instead of all hits, pick one by index,
    # e.g. the first:
    # chosen_uuid, chosen_doc = found_epds[0]
    # modified_json_docs[chosen_uuid] = chosen_doc
    # print(f"\nAdded well-defined EPD with UUID={chosen_uuid} to modified_json_docs.")

    # Add every hit; an entry with the same UUID that is already present
    # (e.g. from step 1.1) is overwritten by the unmodified database document.
    for found_epd in found_epds:
        chosen_uuid, chosen_doc = found_epd
        modified_json_docs[chosen_uuid] = chosen_doc
        print(f"\nAdded well-defined EPD with UUID={chosen_uuid} to modified_json_docs.")

# Done with the database for this cell.
conn.close()

# Show final dictionary
print("\nFinal EPDs in `modified_json_docs` after Steps #1.1 and #1.2:")
for epd_uuid in modified_json_docs:
    print(" -", epd_uuid)

In [None]:
# Step 1.2 setup (second pass): same as the previous cell, but for the
# lower-case spelling "oekobau.dat" of the classification system.
# NOTE(review): presumably the metadata table contains both spellings — confirm.

classification_sys = "oekobau.dat"
category_val = "Beton"
limit_to_fetch = 500  # upper bound on the number of matching documents to fetch

import json

# `db_path` is defined by the step 1.1 cell, which must have been run first.
conn = sqlite3.connect(db_path)
# NOTE(review): this cursor is not used below; fetch_well_defined_epds creates its own.
cursor = conn.cursor()

def fetch_well_defined_epds(
    conn,
    classification_system="OEKOBAU.DAT",
    category_value="Beton",
    max_items=1
):
    """
    Fetch up to 'max_items' EPDs from epd_documents + epd_metadata
    where m.classification_system = classification_system,
    and the JSON classification array truly has {"value": category_value}.

    Parameters:
      conn                  : open sqlite3 connection to the EPD database
      classification_system : value matched against epd_metadata.classification_system
      category_value        : "value" that must occur in some classification's "class" list
      max_items             : maximum number of hits; <= 0 yields no hits,
                              None means no limit

    Returns a list of (uuid, doc_data) tuples, doc_data being the parsed JSON.
    If none match, returns [] (empty list).

    Rows with an empty document, invalid JSON, or an unexpected JSON shape
    (null / non-dict / non-list nodes on the classification path) are skipped.
    """
    results = []
    # The limit used to be checked only after appending, so max_items <= 0
    # still returned one hit.
    if max_items is not None and max_items <= 0:
        return results

    # Broad SELECT for this classification_system; the category check itself
    # is done in Python on the parsed document.
    sql = """
    SELECT d.uuid, d.document
    FROM epd_documents d
    JOIN epd_metadata m 
        ON d.uuid = m.uuid
    WHERE m.classification_system = ?
    """

    def has_category(doc_data):
        """Return True if any classification's "class" list holds category_value."""
        node = doc_data
        for key in (
            "processInformation",
            "dataSetInformation",
            "classificationInformation",
            "classification",
        ):
            if not isinstance(node, dict):
                return False
            node = node.get(key)
        if not isinstance(node, list):
            return False

        for classification_item in node:
            # classification_item["name"] is not compared with
            # classification_system; the metadata join already selects by system.
            if not isinstance(classification_item, dict):
                continue
            class_entries = classification_item.get("class")
            if not isinstance(class_entries, list):
                continue
            if any(
                isinstance(cls_obj, dict) and cls_obj.get("value") == category_value
                for cls_obj in class_entries
            ):
                return True
        return False

    cursor = conn.cursor()
    try:
        cursor.execute(sql, (classification_system,))
        # Iterate the cursor lazily so that stopping at max_items does not
        # require loading every document into memory first.
        for doc_uuid, doc_text in cursor:
            # Skip empty or invalid JSON
            if not doc_text:
                continue
            try:
                doc_data = json.loads(doc_text)
            except (ValueError, TypeError):
                continue

            if has_category(doc_data):
                results.append((doc_uuid, doc_data))
                if max_items is not None and len(results) >= max_items:
                    break
    finally:
        cursor.close()

    return results


# Step #1.2 (second pass): Add well-defined EPD(s) to the pipeline.
# Relies on `conn`, `classification_sys`, `category_val` and `limit_to_fetch`
# from the setup above and on `modified_json_docs` created in step 1.1.
found_epds = fetch_well_defined_epds(
    conn,
    classification_system=classification_sys,
    category_value=category_val,
    max_items=limit_to_fetch
)

if not found_epds:
    print(f"No EPD found for classification_system='{classification_sys}' and category_value='{category_val}'.")
else:
    print(f"Found {len(found_epds)} EPD(s) with '{category_val}' in classification system '{classification_sys}':")
    for (u, doc) in found_epds:
        print("  -", u)

    # To add only a single document instead of all hits, pick one by index,
    # e.g. the first:
    # chosen_uuid, chosen_doc = found_epds[0]
    # modified_json_docs[chosen_uuid] = chosen_doc
    # print(f"\nAdded well-defined EPD with UUID={chosen_uuid} to modified_json_docs.")

    # Add every hit; an entry with the same UUID that is already present
    # is overwritten by the unmodified database document.
    for found_epd in found_epds:
        chosen_uuid, chosen_doc = found_epd
        modified_json_docs[chosen_uuid] = chosen_doc
        print(f"\nAdded well-defined EPD with UUID={chosen_uuid} to modified_json_docs.")

# Show final dictionary
print("\nFinal EPDs in `modified_json_docs` after Steps #1.1 and #1.2:")
for epd_uuid in modified_json_docs:
    print(" -", epd_uuid)

# Done with the database for this cell.
conn.close()

In [None]:
# Add from EPDNorge
import json
import jsonlines
import sqlite3
import xml.etree.ElementTree as ET

# =============================================================================
# Part 1: Build the results list from JSON files with added ids
# =============================================================================

# File paths (adjust if needed)
context_path = "../data/pipeline2/json/EPDNorge_context.json"
batch_output_path = "../data/pipeline2/json/openai/EPDNorge_concrete_batch_output.jsonl"

# 1. Read context data: indexed by position below, each entry providing
#    "Product" and "UUID".
with open(context_path, 'r', encoding='utf-8') as f:
    context_data = json.load(f)

# 2. Prepare a list to hold the resulting records with added ids
results = []
with jsonlines.open(batch_output_path) as reader:
    # NOTE(review): batch records are paired with context entries purely by
    # position (idx). This assumes the batch output has the same order and
    # length as the context file — confirm, or match on an explicit identifier.
    for idx, batch_record in enumerate(reader):
        # Extract the "content" field, which is a JSON string
        content_str = batch_record["response"]["body"]["choices"][0]["message"]["content"]
        # Parse the content string to a JSON object
        content_json = json.loads(content_str)
        best_category = content_json["best_category"]

        # Create the new entry with an added "id" key (the record's position)
        new_entry = {
            "id": idx,
            "Product": context_data[idx]["Product"],
            "UUID": context_data[idx]["UUID"],
            "best_category": best_category
        }
        results.append(new_entry)

print("---- Results JSON ----")
print(json.dumps(results, indent=2, ensure_ascii=False))


# =============================================================================
# Part 2: Extract unique categories and filter by the target classification
# =============================================================================

# Extract unique categories (a set, so the printed order is not fixed)
unique_categories = {entry["best_category"] for entry in results}
print("\n---- Unique Categories ----")
print(json.dumps(list(unique_categories), indent=2, ensure_ascii=False))

# Keep only entries whose best_category equals the target path exactly
TARGET_CLASSIFICATION = "Mineral building products > Mortar and Concrete > Ready mixed concrete"
filtered_results = [
    entry for entry in results 
    if entry["best_category"] == TARGET_CLASSIFICATION
]
print("\n---- Filtered Results ----")
print(json.dumps(filtered_results, indent=2, ensure_ascii=False))


# =============================================================================
# Part 3: Update JSON documents from SQLite based on filtered results
# =============================================================================

# --- XML Parsing Functions ---
def build_category_dict(xml_element):
    """Turn a category element and all of its nested categories into a dict tree.

    The returned node holds the element's ``id`` and ``name`` attributes and a
    ``children`` mapping of child name -> child node (same shape, recursively).
    If two sibling categories share a name, the later one replaces the earlier.
    """
    # '{*}category' selects direct <category> children regardless of XML namespace.
    child_nodes = (
        build_category_dict(sub_elem)
        for sub_elem in xml_element.findall('{*}category')
    )
    return {
        'id': xml_element.get('id'),
        'name': xml_element.get('name'),
        'children': {node['name']: node for node in child_nodes},
    }

def parse_category_xml(xml_path):
    """Load the category XML file and return its top-level Process categories.

    Looks for the first ``<categories dataType="Process">`` element (in any
    namespace) and returns a mapping of top-level category name -> node as
    built by ``build_category_dict``. When no such element exists, a warning
    is printed and an empty dict is returned.
    """
    utf8_parser = ET.XMLParser(encoding='utf-8')
    root = ET.parse(xml_path, parser=utf8_parser).getroot()

    process_categories = root.find('.//{*}categories[@dataType="Process"]')
    if process_categories is None:
        print("Warning: Could not find <categories dataType='Process'> in XML.")
        return {}

    lookup = {}
    for top_elem in process_categories.findall('{*}category'):
        node = build_category_dict(top_elem)
        lookup[node['name']] = node
    return lookup

def get_category_path_info(category_hierarchy, categories_dict):
    """Resolve a chain of category names into ``(name, id)`` pairs, one per level.

    ``category_hierarchy`` is walked from the top level downwards through
    ``categories_dict`` (name -> node with 'name', 'id' and 'children').
    An empty or missing hierarchy yields an empty list.

    Raises:
        ValueError: if a name is not present at its level of the tree.
    """
    if not category_hierarchy:
        return []

    resolved = []
    level_lookup = categories_dict
    for depth, wanted in enumerate(category_hierarchy):
        if wanted in level_lookup:
            node = level_lookup[wanted]
            resolved.append((node['name'], node['id']))
            # Descend: the next name is looked up among this node's children.
            level_lookup = node['children']
            continue
        raise ValueError(f"Category '{wanted}' not found at level {depth}.")
    return resolved

# Parse the XML to build the category lookup structure
xml_path = "../data/pipeline2/xml/OEKOBAU.DAT_Categories_EN_API.xml"
category_dict = parse_category_xml(xml_path)

# Connect to the SQLite DB. The connection is closed in `finally`, so an
# exception inside the loop no longer leaks it.
# Relies on `filtered_results` / `TARGET_CLASSIFICATION` from Part 2 and on
# `modified_json_docs` created by the step 1.1 cell.
db_path = "../data/pipeline2/sql/epd_database.sqlite"
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    for entry in filtered_results:
        uuid_val = entry["UUID"]
        classification_str = entry["best_category"]

        # filtered_results is already restricted to the target classification;
        # this guard only protects against the list being built differently.
        if classification_str != TARGET_CLASSIFICATION:
            continue

        # Fetch JSON from epd_documents table by UUID
        cursor.execute("SELECT document FROM epd_documents WHERE uuid = ?", (uuid_val,))
        result = cursor.fetchone()
        if not result:
            print(f"No document found for UUID: {uuid_val}")
            continue

        json_text = result[0]
        try:
            json_data = json.loads(json_text)
        except (json.JSONDecodeError, TypeError):
            # TypeError covers a NULL document column (json.loads(None)).
            print(f"Failed to decode JSON for UUID: {uuid_val}")
            continue

        # Split the classification string into separate category names
        category_names = [x.strip() for x in classification_str.split(">")]
        try:
            path_info = get_category_path_info(category_names, category_dict)
        except ValueError as e:
            print(f"Warning: Could not resolve categories for UUID: {uuid_val}. {e}")
            continue

        # Build an array describing each category level
        class_array = []
        for level, (cat_name, cat_id) in enumerate(path_info):
            class_array.append({
                "value": cat_name,
                "level": level,
                "classId": cat_id
            })

        # Prepare a new classification object (with "name" = "OEKOBAU.DAT" as example)
        new_classification_item = {
            "class": class_array,
            "name": "OEKOBAU.DAT"
        }

        # Inject the new classification while preserving existing ones
        process_info = json_data.setdefault("processInformation", {})
        data_set_info = process_info.setdefault("dataSetInformation", {})
        classification_info = data_set_info.setdefault("classificationInformation", {})

        # Retrieve existing classifications; anything that is not a list
        # (missing key, null, single object) is replaced by a fresh list.
        existing_classifications = classification_info.get("classification")
        if not isinstance(existing_classifications, list):
            existing_classifications = []
        # Append the new classification
        existing_classifications.append(new_classification_item)
        classification_info["classification"] = existing_classifications

        # Save the updated JSON in our in-memory dict
        modified_json_docs[uuid_val] = json_data

        print(f"\nAppended new category for UUID {uuid_val}: {classification_str}")
        print("Full classification now:")
        print(json.dumps(classification_info["classification"], indent=2))
        print("--------------------------------------------------")
finally:
    conn.close()

print("\n---- Modified JSON Documents ----")
print(json.dumps(modified_json_docs, indent=2, ensure_ascii=False))


In [None]:
# Sanity check: number of documents collected in `modified_json_docs` so far.
len(modified_json_docs)

In [None]:
# 2. Rename JSON keys (updated with new logic)

import json

def recursive_rename_uri(obj):
    """
    Return a copy of ``obj`` in which every dict key 'uri' is replaced by
    'refObjectUri', at any nesting depth of dicts and lists.

    Dicts and lists are rebuilt (the input is left untouched); any other
    value is returned as-is.
    """
    if isinstance(obj, list):
        return [recursive_rename_uri(element) for element in obj]
    if not isinstance(obj, dict):
        return obj
    return {
        ("refObjectUri" if name == "uri" else name): recursive_rename_uri(child)
        for name, child in obj.items()
    }

def remove_raw_strings_in_anies(obj):
    """
    Return a copy of ``obj`` where every list stored under a key named 'anies'
    has its plain-string elements dropped, at any nesting depth.

    Strings in other lists are kept, and an 'anies' value that is not a list
    is processed like any other value. The input is left untouched.
    """
    if isinstance(obj, list):
        return [remove_raw_strings_in_anies(element) for element in obj]
    if not isinstance(obj, dict):
        return obj

    cleaned = {}
    for name, child in obj.items():
        if name == "anies" and isinstance(child, list):
            # Drop the raw strings first; the survivors are cleaned recursively below.
            child = [element for element in child if not isinstance(element, str)]
        cleaned[name] = remove_raw_strings_in_anies(child)
    return cleaned

def transform_json(data):
    """
    Transform an Environmental Product Declaration (EPD) JSON instance by renaming keys
    and restructuring its contents to standardize the schema.

    Steps, in order:
      - processInformation: dataSetInformation.name -> dataSetName, other -> otherDSI
        (dropping otherDSI.anies when it holds 'componentsAndMaterialsAndSubstances'),
        classification "class" -> "classEntries", time -> timeInformation
        (other -> otherTime, item "value" -> "timestampValue").
      - modellingAndValidation: LCIMethodAndAllocation.other -> otherMAA,
        dataSourcesTreatmentAndRepresentativeness.other -> otherDSTAR (with nested
        renames), validation -> validationInfo, other -> otherMAV.
      - administrativeInformation: publicationAndOwnership.other -> otherPAO.
      - exchanges.exchange and LCIAResults.LCIAResult: several renames;
        "relativeStandardDeviation95In" is removed from every entry.
      - Top-level "otherAttributes" and "locations" are removed.
      - Globally remove raw string elements from any 'anies' lists.
      - Globally rename every "uri" to "refObjectUri".

    Sections that are absent are skipped: the `.get(..., {})` / `.get(..., [])`
    fallbacks hand back throwaway containers, so the renames become no-ops.

    Side effects: the section-specific renames and removals are applied to the
    passed-in `data` in place. The two global steps are delegated to
    remove_raw_strings_in_anies and recursive_rename_uri, whose results are what
    gets returned; callers should use the return value.

    Returns: The transformed dictionary (in-memory).
    """
    # --- processInformation transformations ---
    process_info = data.get("processInformation", {})
    data_set_info = process_info.get("dataSetInformation", {})

    # Rename "dataSetInformation.name" -> "dataSetName"
    if "name" in data_set_info:
        data_set_info["dataSetName"] = data_set_info.pop("name")

    # Rename dataSetInformation.other -> otherDSI; remove 'componentsAndMaterialsAndSubstances' if found
    if "other" in data_set_info:
        data_set_info["otherDSI"] = data_set_info.pop("other")
        # If any item in otherDSI.anies contains "componentsAndMaterialsAndSubstances", remove the entire "anies" key.
        if "anies" in data_set_info["otherDSI"]:
            for item in data_set_info["otherDSI"]["anies"]:
                if (
                    isinstance(item, dict)
                    and "componentsAndMaterialsAndSubstances" in item
                ):
                    data_set_info["otherDSI"].pop("anies")
                    break
            # Move otherDSI.scenario up to dataSetInformation.objectScenario.
            # NOTE(review): this only runs when otherDSI originally had an "anies"
            # key, and it looks at otherDSI["scenario"], not inside the anies
            # items — confirm that this nesting is intended.
            if "scenario" in data_set_info["otherDSI"]:
                data_set_info["objectScenario"] = data_set_info["otherDSI"].pop("scenario")

    # For classification entries, rename "class" -> "classEntries"
    classification_info = data_set_info.get("classificationInformation", {})
    classifications = classification_info.get("classification", [])
    for cls_obj in classifications:
        if "class" in cls_obj:
            cls_obj["classEntries"] = cls_obj.pop("class")

    # Rename "time" -> "timeInformation", then rename "other" -> "otherTime", and "value" -> "timestampValue"
    if "time" in process_info:
        process_info["timeInformation"] = process_info.pop("time")
        time_info = process_info["timeInformation"]
        if "other" in time_info:
            time_info["otherTime"] = time_info.pop("other")
            for item in time_info["otherTime"].get("anies", []):
                if isinstance(item, dict) and "value" in item:
                    item["timestampValue"] = item.pop("value")

    # --- modellingAndValidation transformations ---
    mod_val = data.get("modellingAndValidation", {})

    # LCIMethodAndAllocation.other -> otherMAA
    lci_method = mod_val.get("LCIMethodAndAllocation", {})
    if "other" in lci_method:
        lci_method["otherMAA"] = lci_method.pop("other")

    # dataSourcesTreatmentAndRepresentativeness.other -> otherDSTAR
    dstar = mod_val.get("dataSourcesTreatmentAndRepresentativeness", {})
    if "other" in dstar:
        dstar["otherDSTAR"] = dstar.pop("other")
        if "anies" in dstar["otherDSTAR"]:
            # anies -> aniesDSTAR; the global raw-string cleanup at the end
            # therefore no longer sees this list under the key 'anies'.
            dstar["otherDSTAR"]["aniesDSTAR"] = dstar["otherDSTAR"].pop("anies")
            for item in dstar["otherDSTAR"]["aniesDSTAR"]:
                if isinstance(item, dict) and "value" in item and isinstance(item["value"], dict):
                    # rename "value" -> "valueDSTAR", then rename subkeys
                    item["valueDSTAR"] = item.pop("value")
                    val = item["valueDSTAR"]
                    if "shortDescription" in val:
                        val["shortDescriptionExtended"] = val.pop("shortDescription")
                    # NOTE(review): the two blocks below assume "version" and
                    # "uuid" hold dicts (inner key renamed, dict stored under
                    # versionDict / uuidDict) — confirm against the data.
                    if "version" in val:
                        version = val.pop("version")
                        if "version" in version:
                            version["versionInt"] = version.pop("version")
                        val["versionDict"] = version
                    if "uuid" in val:
                        uuid_obj = val.pop("uuid")
                        if "uuid" in uuid_obj:
                            uuid_obj["uuidValue"] = uuid_obj.pop("uuid")
                        val["uuidDict"] = uuid_obj

    # Rename "validation" -> "validationInfo"
    if "validation" in mod_val:
        mod_val["validationInfo"] = mod_val.pop("validation")

    # modellingAndValidation.other -> otherMAV; if "value" is a dict, rename it to "objectValue"
    if "other" in mod_val:
        mod_val["otherMAV"] = mod_val.pop("other")
        for item in mod_val["otherMAV"].get("anies", []):
            if isinstance(item, dict) and "value" in item and isinstance(item["value"], dict):
                item["objectValue"] = item.pop("value")

    # --- administrativeInformation transformations ---
    admin_info = data.get("administrativeInformation", {})

    # publicationAndOwnership.other -> otherPAO; rename "value" -> "objectValue" if it's a dict
    pub_own = admin_info.get("publicationAndOwnership", {})
    if "other" in pub_own:
        pub_own["otherPAO"] = pub_own.pop("other")
        for item in pub_own["otherPAO"].get("anies", []):
            if isinstance(item, dict) and "value" in item and isinstance(item["value"], dict):
                item["objectValue"] = item.pop("value")

    # --- exchanges transformations ---
    exchanges = data.get("exchanges", {}).get("exchange", [])
    for exchange in exchanges:
        # flowProperties: rename name->nameFP, uuid->uuidFP
        for fp in exchange.get("flowProperties", []):
            if "name" in fp:
                fp["nameFP"] = fp.pop("name")
            if "uuid" in fp:
                fp["uuidFP"] = fp.pop("uuid")

        # rename "exchange direction" -> "exchangeDirection"
        if "exchange direction" in exchange:
            exchange["exchangeDirection"] = exchange.pop("exchange direction")

        # rename "other" -> "otherEx", if "value" is dict -> "objectValue"
        if "other" in exchange:
            exchange["otherEx"] = exchange.pop("other")
            for item in exchange["otherEx"].get("anies", []):
                if isinstance(item, dict) and "value" in item and isinstance(item["value"], dict):
                    item["objectValue"] = item.pop("value")

        # rename "classification" -> "classificationEx" and inside, rename "name"->"nameClass"
        if "classification" in exchange:
            exchange["classificationEx"] = exchange.pop("classification")
            if "name" in exchange["classificationEx"]:
                exchange["classificationEx"]["nameClass"] = exchange["classificationEx"].pop("name")
        
        # exclude relativeStandardDeviation95In from exchanges.exchange
        if "relativeStandardDeviation95In" in exchange:
            exchange.pop("relativeStandardDeviation95In")

    # --- LCIAResults transformations ---
    # rename "LCIAResults" -> "lciaResults" if present
    if "LCIAResults" in data:
        data["lciaResults"] = data.pop("LCIAResults")

    lcia_results = data.get("lciaResults", {}).get("LCIAResult", [])
    for result in lcia_results:
        # rename "other" -> "otherLCIA", if "value" is dict -> "objectValue"
        if "other" in result:
            result["otherLCIA"] = result.pop("other")
            for item in result["otherLCIA"].get("anies", []):
                if isinstance(item, dict) and "value" in item and isinstance(item["value"], dict):
                    item["objectValue"] = item.pop("value")
        
        # exclude relativeStandardDeviation95In from each LCIA result
        if "relativeStandardDeviation95In" in result:
            result.pop("relativeStandardDeviation95In")

    # --- Removal ---
    # Remove top-level "otherAttributes" key if it exists
    if "otherAttributes" in data:
        data.pop("otherAttributes")
    
    # Currently disabled: optional removal of whole top-level sections.
    # if "modellingAndValidation" in data:
    #     data.pop("modellingAndValidation")
    
    # if "administrativeInformation" in data:
    #     data.pop("administrativeInformation")
    
    # if "exchanges" in data:
    #     data.pop("exchanges")
    
    # if "lciaResults" in data:
    #     data.pop("lciaResults")
    
    # Remove top-level "locations" key if it exists
    if "locations" in data:
        data.pop("locations")


    # --- Global step: remove raw string elements from any 'anies' list
    data = remove_raw_strings_in_anies(data)

    # --- Global step: rename "uri" -> "refObjectUri"
    data = recursive_rename_uri(data)

    return data

# -----------------------------------------------------------
# Example pipeline usage (in memory, no disk I/O):
# -----------------------------------------------------------
# Run transform_json over every document collected in `modified_json_docs`
# and store the result back under the same UUID.
for uuid_val, original_doc in modified_json_docs.items():
    final_doc = transform_json(original_doc)

    # Only existing keys are reassigned (no keys added or removed), so the
    # dict can be updated while iterating over .items().
    modified_json_docs[uuid_val] = final_doc

    # Uncomment to inspect each transformed document:
    # print(f"\nTransformed JSON for UUID = {uuid_val}:")
    # print(json.dumps(final_doc, indent=2))
    # print("--------------------------------------------------------")


In [None]:
# 3. Add ids
import yaml
import re

# --------------------------- Utility functions ---------------------------

def load_yaml_schema(file_path):
    """Read the LinkML YAML schema stored at ``file_path`` and return the parsed content."""
    with open(file_path, mode="r", encoding="utf-8") as schema_file:
        schema = yaml.safe_load(schema_file)
    return schema

def reorder_dict_keys(d):
    """
    Move the 'id' key (if any) to the front of ``d``, in place.

    The remaining keys keep their relative order; dicts without an 'id' key
    are left untouched. Returns None. (Purely for human readability.)
    """
    if "id" not in d:
        return

    id_value = d["id"]
    remaining = {key: value for key, value in d.items() if key != "id"}

    # Rebuild the same dict object so existing references observe the new order.
    d.clear()
    d["id"] = id_value
    d.update(remaining)

def clean_epd_name(name):
    """
    Normalize an EPD name: every run of characters outside A-Z, a-z and 0-9
    becomes a single underscore, and leading/trailing underscores are removed.
    """
    # '_' is itself non-alphanumeric, so one pass over whole runs both
    # replaces and collapses.
    return re.sub(r"[^A-Za-z0-9]+", "_", name).strip("_")

# ------------------ New ID Creation Logic -------------------

def generate_id_from_path(acc_path, prefix="ilcd"):
    """
    Build the full ID for an accumulated path, formatted as ``<prefix>:<acc_path>``.
    """
    return "{}:{}".format(prefix, acc_path)

def get_suffix(item, index):
    """
    Return the path suffix used for one element of a list.

    A dict element carrying a 'module' field yields "module" followed by
    that field's value with all dashes removed. Any other element yields
    its 1-based position (`index + 1`) as a string.
    """
    has_module = isinstance(item, dict) and "module" in item
    if not has_module:
        return str(index + 1)

    module_code = item["module"].replace("-", "")
    return "module{}".format(module_code)

def assign_ids_by_path(obj, epd_uuid, acc_path, parent_is_list, prefix="ilcd"):
    """
    Walk `obj` recursively and give every dict that lacks an 'id' a
    path-derived one, modifying the structure in place.

    Parameters:
      obj            : current node (dict, list, or primitive)
      epd_uuid       : base string placed in front of every generated path
      acc_path       : the path accumulated so far for this node
      parent_is_list : True when this node is an element of a list
      prefix         : ID prefix, e.g. "ilcd"

    Generated IDs have the form "<prefix>:<epd_uuid>_<acc_path>". Dicts that
    already contain an 'id' keep it. Primitives are ignored. Returns None.

    NOTE(review): a nested dict whose parent is not a list element receives
    a path made of its key alone (the accumulated path is dropped), so
    equally named nested objects in different branches end up with the same
    ID - confirm this is intended.
    """
    # Lists: each element's path is the list path plus the element suffix.
    if isinstance(obj, list):
        for position, element in enumerate(obj):
            element_path = f"{acc_path}_{get_suffix(element, position)}"
            assign_ids_by_path(
                element, epd_uuid, element_path, parent_is_list=True, prefix=prefix
            )
        return

    # Primitives carry no ID and have nothing to descend into.
    if not isinstance(obj, dict):
        return

    if "id" not in obj:
        obj["id"] = generate_id_from_path(f"{epd_uuid}_{acc_path}", prefix)
        reorder_dict_keys(obj)

    for key, value in obj.items():
        if key == "id":
            continue

        if isinstance(value, dict):
            # Below a list element the path keeps growing; otherwise the
            # child's path restarts at its own key.
            child_path = f"{acc_path}_{key}" if parent_is_list else key
            assign_ids_by_path(
                value, epd_uuid, child_path, parent_is_list=False, prefix=prefix
            )
        elif isinstance(value, list):
            # The list branch above appends the per-element suffix to
            # "<acc_path>_<key>" and marks the elements as list children.
            assign_ids_by_path(
                value, epd_uuid, f"{acc_path}_{key}", parent_is_list=True, prefix=prefix
            )


# --------------------- MAIN LOGIC: In-memory Example ---------------------

SCHEMA_PATH = "../linkml/data/yaml/linkml_ILCDmergedSchemas_schema.yaml"
schema = load_yaml_schema(SCHEMA_PATH)

# Use the schema's "default_prefix" entry when present; otherwise "ilcd".
default_prefix = schema.get("default_prefix", "ilcd")

# `modified_json_docs` must already exist in memory, shaped like:
# modified_json_docs = {
#     "uuid1": {...final JSON doc...},
#     "uuid2": {...final JSON doc...},
#     ...
# }

for uuid_val, doc in modified_json_docs.items():
    # The document's own UUID (with dashes) is the basis of all IDs.
    try:
        raw_uuid = doc["processInformation"]["dataSetInformation"]["UUID"]
    except KeyError as e:
        raise KeyError(f"Missing processInformation.dataSetInformation.UUID in doc {uuid_val}") from e

    epd_uuid = raw_uuid.replace("-", "")
    id_prefix = default_prefix.lower()

    # Document-level ID: prefix:epd_uuid
    doc["id"] = f"{id_prefix}:{epd_uuid}"

    # Every top-level member except 'id' and 'version' gets IDs assigned.
    for top_key, top_obj in doc.items():
        if top_key in ("id", "version"):
            continue

        if isinstance(top_obj, list):
            # Each element's path is "<top_key>_<suffix>".
            for i, item in enumerate(top_obj):
                assign_ids_by_path(
                    item,
                    epd_uuid=epd_uuid,
                    acc_path=f"{top_key}_{get_suffix(item, i)}",
                    parent_is_list=True,
                    prefix=id_prefix,
                )
        elif isinstance(top_obj, dict):
            # e.g. "processInformation" => prefix:epd_uuid_processInformation
            # (set unconditionally, replacing any 'id' already present).
            top_obj["id"] = f"{id_prefix}:{epd_uuid}_{top_key}"
            reorder_dict_keys(top_obj)
            assign_ids_by_path(
                top_obj,
                epd_uuid=epd_uuid,
                acc_path=top_key,
                parent_is_list=False,
                prefix=id_prefix,
            )

    # Uncomment to inspect the document with its newly assigned IDs:
    # print(f"\n=== JSON with newly assigned IDs for doc (pipeline UUID): {uuid_val} ===")
    # print(json.dumps(doc, indent=2))
    # print("-----------------------------------------------------------------")


In [None]:
# Convert to RDF

import rdflib
from linkml.validator import Validator
from linkml_runtime.loaders import YAMLLoader
from linkml_runtime.dumpers import RDFLibDumper
from linkml_runtime.utils.schemaview import SchemaView

# Import your generated Python dataclass for the schema
# e.g., from data.py.linkml_processDataSet_schema import ProcessDataSet
from data.py.linkml_processDataSet_schema import ProcessDataSet


def generate_turtle_from_docs(
    docs_dict,
    schema_path,
    turtle_output_path,
    validate: bool = False
):
    """
    Convert in-memory JSON documents to RDF and write them to one Turtle file.

    Each document is loaded as a `ProcessDataSet`, dumped to an rdflib graph
    and merged into a single combined graph, which is serialized at the end.
    Documents whose loading raises ValueError or TypeError are reported,
    collected and skipped.

    Parameters:
    -----------
    docs_dict : dict
        Mapping {UUID: JSON-Dict} of the EPD documents held in memory.
    schema_path : str
        Path to the LinkML YAML schema; used for the SchemaView and,
        when requested, for the Validator.
    turtle_output_path : str
        Destination of the combined Turtle serialization.
    validate : bool
        If True, each document is run through the LinkML Validator before
        loading. Validation findings are only printed; the document is
        still loaded and converted afterwards.

    Returns:
    --------
    failed_docs : dict
        Mapping {UUID: JSON-Dict} of the documents that could not be loaded.

    NOTE(review): validation is performed on the document wrapped as
    {"processDataSet": doc}, whereas loading uses the unwrapped document -
    confirm that both match the schema's expectations.
    """
    combined_graph = rdflib.Graph()
    sv = SchemaView(schema_path)
    validator = Validator(schema_path, strict=False) if validate else None
    dumper = RDFLibDumper()

    failed_docs = {}
    success_count = 0

    for uuid_val, json_doc in docs_dict.items():
        # (A) Optional validation - informational only, never skips a doc.
        if validator:
            report = validator.validate({"processDataSet": json_doc}, "ProcessDataSet")
            if not report.results:
                print(f"[VALIDATION] Document {uuid_val} is valid according to the schema.")
            else:
                print(f"[VALIDATION] Errors for UUID={uuid_val}:")
                for result in report.results:
                    print("  -", result.message)

        # (B) Load the document as a ProcessDataSet; record and skip failures.
        try:
            instance_obj = YAMLLoader().load(json_doc, target_class=ProcessDataSet)
        except (ValueError, TypeError) as e:
            print(f"[ERROR] Failed to load doc {uuid_val} as ProcessDataSet. Reason:\n  {e}")
            failed_docs[uuid_val] = json_doc
            continue

        # (C) Dump to an rdflib graph and merge it into the combined graph.
        combined_graph += dumper.as_rdf_graph(instance_obj, schemaview=sv)
        success_count += 1

    # Serialize everything collected so far in a single Turtle file.
    combined_graph.serialize(destination=turtle_output_path, format="turtle")
    print(f"\nSuccessfully wrote {success_count} instances to the Turtle file:\n  {turtle_output_path}")
    if failed_docs:
        print(f"{len(failed_docs)} documents failed to load and were skipped.")

    return failed_docs


# -----------------------------------------------------------------------------
# USAGE: convert the in-memory documents to a single Turtle file
# -----------------------------------------------------------------------------

# `modified_json_docs` comes from the prior steps and is shaped like:
# modified_json_docs = {
#     "uuid1": {... doc ...},
#     "uuid2": {... doc ...},
#     # ...
# }

schema_file = "../linkml/data/yaml/linkml_processDataSet_schema.yaml"
ttl_output = "../linkml/data/rdf/epd_rdf_instance_datastore.ttl"

# Validation is switched on; `failed` receives the documents that could
# not be loaded.
failed = generate_turtle_from_docs(
    docs_dict=modified_json_docs,
    schema_path=schema_file,
    turtle_output_path=ttl_output,
    validate=True,
)

if failed:
    print("\nFailed doc details:")
    # Only the UUIDs are listed; the documents themselves stay in `failed`.
    for bad_uuid in failed:
        print(" -", bad_uuid, "(Doc not loaded successfully)")


In [None]:
# Bare expression: evaluates `failed`, the result of generate_turtle_from_docs
# above (documents that could not be loaded), for inspection.
failed