In [61]:
# This script imports an ontology, copies two of its classes, and saves them in a new ontology.

from owlready2 import get_ontology, Thing
import types

onto_path = "mysandboxonto.owl"
onto_to_import = ["../data/ontologies/local_import/ppeo.owl", "../data/ontologies/local_import/sosa.owl"]

# Re-running this cell: discard the `onto` object left over from a previous
# run. On a fresh kernel `onto` does not exist yet and the NameError is
# ignored. `except Exception` (rather than a bare `except`) keeps the call
# best-effort while still letting KeyboardInterrupt / SystemExit propagate.
try:
    onto.destroy()
except Exception:
    pass

onto = get_ontology(onto_path)

# Best-effort reset of the import list before it is re-populated below.
try:
    onto.imported_ontologies.clear()
except Exception:
    pass

# Load every source ontology and register it as an import of `onto`.
onto.imported_ontologies.extend([get_ontology(o).load() for o in onto_to_import])


def create_class(cls):
    """Declare, inside ``onto``, a new class named ``cls.name`` that subclasses ``cls``."""
    parents = (cls,)
    with onto:
        # Built dynamically because the class name is only known at run time.
        types.new_class(cls.name, parents)
   

def checkexistingClass(cname):
    """Return True when ``onto`` already contains a class whose name equals ``cname``."""
    return any(existing.name == cname for existing in onto.classes())

# for c in onto.imported_ontologies[0].classes():
#     print(c.name)

# for prop in onto.imported_ontologies[0].properties():
#     print(prop)

# Example input: a (subject, predicate, object) tuple of local names.
inp = ("data_file", "hasObservation", "observation")



# Index into `inp` of the element to copy; 2 selects the object, "observation".
i = 2
# Create the class only if `onto` does not already hold one with that name.
# The source class is looked up by name in the first imported ontology.
if not checkexistingClass(inp[i]):
    create_class(onto.imported_ontologies[0][inp[i]])

# List every class now present in `onto` together with its IRI.
for c in onto.classes():
    print(c, c.iri)



mysandboxonto.observation mysandboxonto.owl#observation


In [71]:
from lxml import etree

ppeo = "../data/ontologies/local_import/ppeo.owl"

# Parser that loads the DTD, expands entity references, and keeps going
# (recover=True) when it meets malformed markup.
parser = etree.XMLParser(load_dtd=True, resolve_entities=True, ns_clean=True, recover=True)
tree = etree.parse(ppeo, parser)
root = tree.getroot()

# Print the rdf:about IRI of every owl:Class that is a direct child of the root.
for child in root:
    if child.tag != "{http://www.w3.org/2002/07/owl#}Class":
        continue
    print(child.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about"))

http://purl.org/ppeo/PPEO.owl#GPS_location
http://purl.org/ppeo/PPEO.owl#biological_material
http://purl.org/ppeo/PPEO.owl#country
http://purl.org/ppeo/PPEO.owl#data_file
http://purl.org/ppeo/PPEO.owl#environment
http://purl.org/ppeo/PPEO.owl#environment_parameter
http://purl.org/ppeo/PPEO.owl#event
http://purl.org/ppeo/PPEO.owl#factor
http://purl.org/ppeo/PPEO.owl#factor_value
http://purl.org/ppeo/PPEO.owl#growth_facility
http://purl.org/ppeo/PPEO.owl#institution
http://purl.org/ppeo/PPEO.owl#investigation
http://purl.org/ppeo/PPEO.owl#location
http://purl.org/ppeo/PPEO.owl#material_source
http://purl.org/ppeo/PPEO.owl#method
http://purl.org/ppeo/PPEO.owl#named_location
http://purl.org/ppeo/PPEO.owl#observation
http://purl.org/ppeo/PPEO.owl#observation_level
http://purl.org/ppeo/PPEO.owl#observation_level_hierarchy
http://purl.org/ppeo/PPEO.owl#observation_unit
http://purl.org/ppeo/PPEO.owl#observed_variable
http://purl.org/ppeo/PPEO.owl#person
http://purl.org/ppeo/PPEO.owl#role
http:

In [75]:
#!/usr/bin/env python3
"""
Ontology Merger
===============
Extracts selected classes and properties from source OWL ontologies
based on (subject, predicate, object) triplets, and assembles them
into a new OWL ontology while:
  - Preserving original IRIs verbatim
  - Copying all OWL axioms and restrictions in full (blank-node closures)
  - Adding stub declarations for referenced-but-not-selected entities
  - Annotating provenance (source ontology IRI + local name) on every entity

Usage:  python3 ontology_merger.py
Output: merged_ontology.owl  (RDF/XML)
"""

from lxml import etree
import copy, re, os

# ──────────────────────────────────────────────────────────────────────────────
# 1. CONFIGURATION
# ──────────────────────────────────────────────────────────────────────────────

# Source ontologies, keyed by the short name that TRIPLETS uses as source_key.
SOURCE_FILES = dict(
    ppeo="../data/ontologies/local_import/ppeo.owl",
    sosa="../data/ontologies/local_import/sosa.owl",
)

# NOTE(review): absolute, machine-specific path — consider making it configurable.
OUTPUT_FILE = "/home/gryvity/Desktop/workstation/lab/MIAPPExSOSA/labenv/merged_ontology.owl"

# IRI given to the generated ontology (also used as its xml:base).
NEW_ONTOLOGY_IRI = "http://example.org/merged-ontology/1.0"

# One entry per extraction request:
#   (subject_localname, predicate_localname, object_localname, source_key)
TRIPLETS = [
    ("data_file", "hasObservation", "observation", "ppeo"),
    ("Observation", "madeBySensor", "Sensor", "sosa"),
]

# ──────────────────────────────────────────────────────────────────────────────
# 2. NAMESPACE CONSTANTS  (Clark-notation helpers)
# ──────────────────────────────────────────────────────────────────────────────

# Prefix → namespace IRI. Insertion order is kept identical to the prefix
# order used when the merged document's namespace map is built.
NS = dict(
    rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    rdfs="http://www.w3.org/2000/01/rdf-schema#",
    owl="http://www.w3.org/2002/07/owl#",
    xsd="http://www.w3.org/2001/XMLSchema#",
    dc="http://purl.org/dc/elements/1.1/",
    skos="http://www.w3.org/2004/02/skos/core#",
    schema="http://schema.org/",
    prov="http://www.w3.org/ns/prov#",
    merge="http://example.org/merged-ontology/",
)

def ck(ns_key, local):
    """Build the Clark-notation name ``{namespace}local`` for a prefix registered in ``NS``."""
    namespace = NS[ns_key]
    return "{{{}}}{}".format(namespace, local)

# Clark-notation ({namespace}local) names of the RDF/RDFS/OWL attributes and
# elements that the rest of the script compares lxml tags and attributes against.
RDF_ABOUT    = ck("rdf", "about")
RDF_RESOURCE = ck("rdf", "resource")
RDF_TYPE     = ck("rdf", "type")
RDFS_LABEL   = ck("rdfs", "label")
RDFS_COMMENT = ck("rdfs", "comment")
RDFS_DOMAIN  = ck("rdfs", "domain")
RDFS_RANGE   = ck("rdfs", "range")
OWL_ONTOLOGY = ck("owl", "Ontology")
OWL_CLASS    = ck("owl", "Class")
OWL_OBJPROP  = ck("owl", "ObjectProperty")
OWL_DATPROP  = ck("owl", "DatatypeProperty")
OWL_ANNPROP  = ck("owl", "AnnotationProperty")
OWL_IMPORTS  = ck("owl", "imports")
RDFS_SEEALSO = ck("rdfs", "seeAlso")

# IRI prefixes treated as "built-in": is_builtin_iri() returns True for any
# IRI that starts with one of these.
SKIP_NAMESPACES = {
    NS["rdf"], NS["rdfs"], NS["owl"], NS["xsd"],
    NS["skos"], NS["schema"],
    "http://purl.org/dc/", "http://purl.org/dc/terms/",
    "http://purl.org/dc/elements/",
    "http://xmlns.com/foaf/",
    "http://purl.org/vocab/vann/",
    "http://purl.org/vocommons/",
}

def is_builtin_iri(iri: str) -> bool:
    """Return True when ``iri`` starts with any prefix listed in ``SKIP_NAMESPACES``."""
    # str.startswith accepts a tuple of prefixes and tests them all at once.
    return iri.startswith(tuple(SKIP_NAMESPACES))

def local_name(iri: str) -> str:
    """Return what follows the last ``#`` of ``iri``, cut again after its last ``/``."""
    after_hash = iri.rpartition("#")[2]
    return after_hash.rpartition("/")[2]


# ──────────────────────────────────────────────────────────────────────────────
# 3. PARSING SOURCE ONTOLOGIES
# ──────────────────────────────────────────────────────────────────────────────

def parse_owl_file(path: str):
    """
    Parse the OWL/RDF-XML file at ``path`` and return the root element.

    The parser loads the DTD and resolves entity references (e.g. &sosa;),
    cleans redundant namespace declarations, and runs in recover mode, so
    malformed markup does not abort the parse.
    """
    parser_options = {
        "load_dtd": True,
        "resolve_entities": True,
        "ns_clean": True,
        "recover": True,
    }
    document = etree.parse(path, etree.XMLParser(**parser_options))
    return document.getroot()


def get_ontology_iri(root):
    """
    Return the ontology IRI declared in the document, or None.

    Three sources are tried, in this order:
      1. a top-level <owl:Ontology> carrying rdf:about (or rdf:ID)      — ppeo style
      2. a top-level <owl:NamedIndividual> with rdf:about whose rdf:type
         resource ends with "Ontology"                                  — sosa style
      3. the xml:base attribute of the root element
    """
    named_individual_tag = ck("owl", "NamedIndividual")
    top_level = list(root)

    # 1. Standard owl:Ontology header
    for node in top_level:
        if node.tag != OWL_ONTOLOGY:
            continue
        declared = node.get(RDF_ABOUT) or node.get(ck("rdf", "ID"))
        if declared:
            return declared

    # 2. NamedIndividual typed as an ontology
    for node in top_level:
        if node.tag != named_individual_tag:
            continue
        about = node.get(RDF_ABOUT)
        if not about:
            continue
        typed_as_ontology = any(
            sub.tag == RDF_TYPE and sub.get(RDF_RESOURCE, "").endswith("Ontology")
            for sub in node
        )
        if typed_as_ontology:
            return about

    # 3. xml:base on the root; an absent or empty value yields None
    return root.get("{http://www.w3.org/XML/1998/namespace}base") or None


def index_entities(root):
    """
    Index the top-level entity declarations of a parsed ontology.

    Returns { local_name -> [(element, full_iri, element_tag), ...] } for the
    direct children of ``root`` that are owl:Class, owl:ObjectProperty,
    owl:DatatypeProperty or owl:AnnotationProperty and carry an rdf:about.
    """
    wanted_tags = (OWL_CLASS, OWL_OBJPROP, OWL_DATPROP, OWL_ANNPROP)
    by_local_name = {}
    for node in root:
        about = node.get(RDF_ABOUT) if node.tag in wanted_tags else None
        if about:
            entry = (node, about, node.tag)
            by_local_name.setdefault(local_name(about), []).append(entry)
    return by_local_name


# ──────────────────────────────────────────────────────────────────────────────
# 4. COLLECTING ALL IRIs REFERENCED INSIDE AN ELEMENT (for stub generation)
# ──────────────────────────────────────────────────────────────────────────────

def collect_referenced_iris(elem):
    """
    Return the set of rdf:resource / rdf:about values found on ``elem`` and
    all of its descendants, leaving out empty values, blank-node ids
    (``_:…``) and IRIs that ``is_builtin_iri`` recognises.
    """
    found = set()
    for node in elem.iter():
        for value in (node.get(RDF_RESOURCE), node.get(RDF_ABOUT)):
            if not value or value.startswith("_:") or is_builtin_iri(value):
                continue
            found.add(value)
    return found


# ──────────────────────────────────────────────────────────────────────────────
# 5. ADDING PROVENANCE ANNOTATION TO A COPIED ELEMENT
# ──────────────────────────────────────────────────────────────────────────────

# Clark-notation tags of the two provenance annotations in the merge: namespace.
MERGE_SOURCED_FROM = ck("merge", "sourcedFrom")
MERGE_SOURCE_ONT   = ck("merge", "sourceOntology")

def annotate_provenance(elem, source_key: str, source_ont_iri: str):
    """
    Append two provenance children to ``elem`` (modified in place):

      - merge:sourcedFrom     text = ``source_ont_iri``, rdf:datatype xsd:anyURI
      - merge:sourceOntology  text = ``source_key``
    """
    origin_iri = etree.SubElement(elem, MERGE_SOURCED_FROM)
    origin_iri.set(ck("rdf", "datatype"), NS["xsd"] + "anyURI")
    origin_iri.text = source_ont_iri

    origin_key = etree.SubElement(elem, MERGE_SOURCE_ONT)
    origin_key.text = source_key


# ──────────────────────────────────────────────────────────────────────────────
# 6. BUILDING THE MERGED ONTOLOGY ROOT
# ──────────────────────────────────────────────────────────────────────────────

def build_merged_root() -> etree._Element:
    """
    Build the <rdf:RDF> root of the merged document.

    The root declares the rdf, rdfs, owl, xsd, dc, skos, schema, prov and
    merge prefixes, has xml:base set to NEW_ONTOLOGY_IRI, and contains one
    child: the <owl:Ontology> header with an English dc:title and
    dc:description.
    """
    xml_ns = "{http://www.w3.org/XML/1998/namespace}"
    prefixes = ("rdf", "rdfs", "owl", "xsd", "dc", "skos", "schema", "prov", "merge")

    root = etree.Element(ck("rdf", "RDF"), nsmap={prefix: NS[prefix] for prefix in prefixes})
    root.set(xml_ns + "base", NEW_ONTOLOGY_IRI)

    # ── owl:Ontology header ──
    header = etree.SubElement(root, OWL_ONTOLOGY)
    header.set(RDF_ABOUT, NEW_ONTOLOGY_IRI)

    header_texts = (
        ("title", "Merged Ontology"),
        (
            "description",
            "A new ontology that selectively imports classes and properties "
            "from PPEO and SOSA, preserving original IRIs and all OWL axioms.",
        ),
    )
    for dc_local, text in header_texts:
        node = etree.SubElement(header, ck("dc", dc_local))
        node.text = text
        node.set(xml_ns + "lang", "en")

    return root


# ──────────────────────────────────────────────────────────────────────────────
# 7. STUB GENERATOR  (ensures referenced entities are at least declared)
# ──────────────────────────────────────────────────────────────────────────────

def make_stub(iri, source_index, source_ont_iri, source_key):
    """
    Build a minimal declaration element for ``iri``.

    When ``source_index`` (as produced by index_entities) has an entry whose
    full IRI equals ``iri``, the stub reuses that entry's tag and copies only
    its rdfs:label, rdfs:comment, rdfs:isDefinedBy and skos:definition
    children, then receives the provenance annotations.

    Otherwise a bare owl:Class carrying an English rdfs:label equal to the
    local name is returned, without provenance annotations.
    NOTE(review): the fallback always declares an owl:Class, whatever kind of
    entity ``iri`` actually denotes.
    """
    name = local_name(iri)
    copied_tags = (
        RDFS_LABEL,
        RDFS_COMMENT,
        ck("rdfs", "isDefinedBy"),
        ck("skos", "definition"),
    )

    # First index entry whose full IRI matches exactly (local names can collide).
    source_hit = next(
        (hit for hit in source_index.get(name, []) if hit[1] == iri),
        None,
    )

    if source_hit is not None:
        src_elem, _, src_tag = source_hit
        stub = etree.Element(src_tag)
        stub.set(RDF_ABOUT, iri)
        for child in src_elem:
            if child.tag in copied_tags:
                stub.append(copy.deepcopy(child))
        annotate_provenance(stub, source_key, source_ont_iri)
        return stub

    # Not in the index: bare class declaration labelled with the local name.
    stub = etree.Element(OWL_CLASS)
    stub.set(RDF_ABOUT, iri)
    label = etree.SubElement(stub, RDFS_LABEL)
    label.set("{http://www.w3.org/XML/1998/namespace}lang", "en")
    label.text = name
    return stub


# ──────────────────────────────────────────────────────────────────────────────
# 8. MAIN  –  assemble the merged ontology
# ──────────────────────────────────────────────────────────────────────────────

def main():
    """
    Run the merge end to end and write the result to OUTPUT_FILE.

    Steps:
      1. Parse every file in SOURCE_FILES and index its entities.
      2. Build the merged root and add one rdfs:seeAlso per source ontology.
      3. Pass 1 — for each TRIPLETS entry, deep-copy the subject, predicate
         and object elements from the named source, annotate them with
         provenance, and record the IRIs they reference.
      4. Pass 2 — create a stub for every referenced IRI that was not itself
         fully extracted.
      5. Append full entities, then stubs, and serialise as RDF/XML.

    Progress is printed to stdout. Returns None.
    """
    print("=" * 65)
    print("  Ontology Merger")
    print("=" * 65)

    # ── Parse source files ──
    sources = {}
    for key, path in SOURCE_FILES.items():
        root = parse_owl_file(path)
        ont_iri = get_ontology_iri(root) or f"(unknown – {path})"
        idx = index_entities(root)
        sources[key] = {"root": root, "iri": ont_iri, "index": idx}
        print(f"\n  [{key}]  IRI : {ont_iri}")
        print(f"          File: {path}")
        print(f"          Entities indexed: {sum(len(v) for v in idx.values())}")

    # IRIs of the source ontologies themselves; computed once instead of on
    # every iteration of the stub loop below.
    source_ont_iris = {info["iri"] for info in sources.values()}

    # ── Build merged root ──
    merged_root = build_merged_root()

    # Add rdfs:seeAlso pointers to source ontologies in the header
    ont_header = merged_root[0]  # the owl:Ontology element
    for key, info in sources.items():
        sa = etree.SubElement(ont_header, RDFS_SEEALSO)
        sa.set(RDF_RESOURCE, info["iri"])

    # ── Process each triplet — PASS 1: collect full extractions ──
    print("\n" + "-" * 65)
    print("  Processing triplets …")

    full_entities   = {}    # iri -> (elem, src_key)        — full axiom copies
    ref_iri_to_src  = {}    # iri -> (src_key, src_ont_iri) — for stub generation

    for (subj_ln, pred_ln, obj_ln, src_key) in TRIPLETS:
        info  = sources[src_key]
        idx   = info["index"]
        o_iri = info["iri"]
        print(f"\n  Triplet from [{src_key}] : ({subj_ln}, {pred_ln}, {obj_ln})")

        for role, ln, expected_tags in [
            ("Subject  (Class)",     subj_ln, {OWL_CLASS}),
            ("Predicate (Property)", pred_ln, {OWL_OBJPROP, OWL_DATPROP}),
            ("Object   (Class)",     obj_ln,  {OWL_CLASS}),
        ]:
            hits = idx.get(ln, [])
            # Prefer an entity of the expected kind; otherwise accept any
            # entity that has this local name.
            matched = [hit for hit in hits if hit[2] in expected_tags] or list(hits)
            if not matched:
                print(f"    ⚠  {role}: '{ln}' not found in {src_key}!")
                continue

            src_elem, full_iri, tag = matched[0]
            print(f"    ✓  {role}: {full_iri}")

            if full_iri not in full_entities:
                elem_copy = copy.deepcopy(src_elem)
                annotate_provenance(elem_copy, src_key, o_iri)
                full_entities[full_iri] = (elem_copy, src_key)

                # Collect referenced IRIs for stub generation. The first
                # source that references an IRI is the one remembered.
                for ref_iri in collect_referenced_iris(src_elem):
                    if ref_iri == full_iri:
                        continue
                    if ref_iri not in ref_iri_to_src:
                        ref_iri_to_src[ref_iri] = (src_key, o_iri)

    # ── PASS 2: generate stubs for referenced IRIs not fully included ──
    print("\n  Generating stubs for referenced entities …")
    stubs = {}  # iri -> elem

    for ref_iri, (src_key, src_ont_iri) in ref_iri_to_src.items():
        if ref_iri in full_entities:
            continue                         # already fully extracted
        if is_builtin_iri(ref_iri):
            continue                         # skip owl/rdf/xsd builtins
        # Skip IRIs ending in "/" and the source ontology IRIs themselves
        if ref_iri.endswith("/") or ref_iri in source_ont_iris:
            continue

        # Look for the exact IRI in the referencing source first, then in the others.
        ref_ln = local_name(ref_iri)
        found_key = None
        for try_key in [src_key] + [k for k in sources if k != src_key]:
            hits2 = sources[try_key]["index"].get(ref_ln, [])
            if any(hit_iri == ref_iri for (_, hit_iri, _) in hits2):
                found_key = try_key
                break

        if found_key is not None:
            stub = make_stub(ref_iri, sources[found_key]["index"],
                             sources[found_key]["iri"], found_key)
        else:
            # Not declared in any indexed source: bare fallback declaration.
            stub = make_stub(ref_iri, {}, "", src_key)

        stubs[ref_iri] = stub
        print(f"    ↳  stub: {ref_iri}")

    # ── Append full entities (sorted by source, then local name) ──
    for iri, (elem, src_key) in sorted(full_entities.items(),
                                        key=lambda kv: (kv[1][1], local_name(kv[0]))):
        merged_root.append(elem)

    # ── Append stubs (sorted) ──
    for iri, stub in sorted(stubs.items(), key=lambda kv: local_name(kv[0])):
        merged_root.append(stub)

    # ── Serialize ──
    # os.path.dirname() is "" for a bare file name and os.makedirs("") raises,
    # so the directory is only created when the path actually has one.
    out_dir = os.path.dirname(OUTPUT_FILE)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    tree = etree.ElementTree(merged_root)
    tree.write(
        OUTPUT_FILE,
        pretty_print=True,
        xml_declaration=True,
        encoding="UTF-8",
    )

    print("\n" + "=" * 65)
    print("  ✓  Merged ontology written to:")
    print(f"     {OUTPUT_FILE}")
    print(f"  ✓  Fully extracted entities : {len(full_entities)}")
    print(f"  ✓  Stub declarations added  : {len(stubs)}")
    print("=" * 65)


if __name__ == "__main__":
    main()

  Ontology Merger

  [ppeo]  IRI : http://purl.org/ppeo/PPEO.owl
          File: ../data/ontologies/local_import/ppeo.owl
          Entities indexed: 124

  [sosa]  IRI : http://www.w3.org/ns/sosa/
          File: ../data/ontologies/local_import/sosa.owl
          Entities indexed: 53

-----------------------------------------------------------------
  Processing triplets …

  Triplet from [ppeo] : (data_file, hasObservation, observation)
    ✓  Subject  (Class): http://purl.org/ppeo/PPEO.owl#data_file
    ✓  Predicate (Property): http://purl.org/ppeo/PPEO.owl#hasObservation
    ✓  Object   (Class): http://purl.org/ppeo/PPEO.owl#observation

  Triplet from [sosa] : (Observation, madeBySensor, Sensor)
    ✓  Subject  (Class): http://www.w3.org/ns/sosa/Observation
    ✓  Predicate (Property): http://www.w3.org/ns/sosa/madeBySensor
    ✓  Object   (Class): http://www.w3.org/ns/sosa/Sensor

  Generating stubs for referenced entities …
    ↳  stub: http://purl.org/ppeo/PPEO.owl#hasDescripti

In [83]:
# READ YAML

import yaml

# NOTE(review): absolute, machine-specific path — consider making it configurable.
config = "/home/gryvity/Desktop/workstation/lab/MIAPPExSOSA/labenv/miappexsosa_config.yaml"

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# locale's preferred encoding, which differs between machines.
with open(config, encoding="utf-8") as file:
    content = yaml.safe_load(file)

# Every entry under imports → ppeo → triplets unpacks into three items;
# only the first two are printed.
for s, p, o in content["imports"]["ppeo"]["triplets"]:
    print(s, p)


Investigation hasAssociatedPublication
Sample deriveFrom
Person hasAffiliation
ObservationUnit hasBiologicalMaterial
ObservationUnit hasEvent
ObservationUnit hasObservationLevel
Study hasBiologicalMaterial
Study hasContactInstitution
Study hasEnvironment
Study hasEvent
Study hasFactor
Study hasGrowthFacility
Study hasObservationLevelHierarchy
DataFile hasObservation
ObservedVariable hasMethod
Environment hasEnvironmentParameter
Factor hasFactorValue
Factor hasModality
FactorValue isModalityOf
BiologicalMaterial hasCountryOfOrigin


In [None]:
#!/usr/bin/env python3
"""
excel_to_jsonld.py
==================
Converts a structured Excel checklist + an OWL ontology into:
  1.  A JSON-LD document whose @graph mirrors the ontology's class/property
      structure (nested graph, hasPart / partOf resolved as object links).
  2.  A Cypher import script ready for Neo4j.

Usage
-----
    python excel_to_jsonld.py <excel_file> <owl_file>
                              [--companion <companion_ttl>]
                              [--output-jsonld <path>]
                              [--output-cypher <path>]

Design
------
Each worksheet represents one OWL class.  The first row contains property
names (column headers).  A sheet may be:
  - "record-per-row"  →  each row is one instance (Study, Person …)
  - "transposed"      →  row-1 = property names, row-2 = single values
                         (detected when column A header is "property" and
                          column A value is "value").

Special column names
  hasPart / partOf    →  ObjectProperty; value is a semicolon-separated
                         list of IDs referencing rows in another sheet.
  Person:hasXxx       →  a property from an external vocabulary (schema.org /
                         foaf); mapped by the companion ontology.

All class and property URIs are looked up in the merged OWL graph
(base ontology + optional companion).  Unknown column headers are kept as
hydata: terms so no data is silently dropped.
"""

from __future__ import annotations

import argparse
import json
import re
import sys
import uuid
from collections import defaultdict
from datetime import datetime, date
from pathlib import Path
from typing import Any

import openpyxl
from rdflib import Graph, Namespace, OWL, RDF, RDFS, URIRef, Literal
from rdflib.namespace import XSD

# ---------------------------------------------------------------------------
# Namespaces
# ---------------------------------------------------------------------------
# rdflib Namespace objects for the vocabularies used by this script.
PPEO    = Namespace("http://purl.org/ppeo/PPEO.owl#")
HYDATA  = Namespace("http://purl.org/hydata/ontology#")
SCHEMA  = Namespace("https://schema.org/")
FOAF    = Namespace("http://xmlns.com/foaf/0.1/")

# Prefix → namespace IRI; build_jsonld() copies this dict as the starting
# JSON-LD context.
BASE_PREFIXES = {
    "ppeo":    str(PPEO),
    "hydata":  str(HYDATA),
    "schema":  str(SCHEMA),
    "foaf":    str(FOAF),
    "xsd":     str(XSD),
    "rdf":     "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs":    "http://www.w3.org/2000/01/rdf-schema#",
    "owl":     "http://www.w3.org/2002/07/owl#",
}

# Local names of properties treated as object links, i.e. whose values
# reference other named nodes.
OBJECT_LINK_PROPS = {"hasPart", "partOf", "derivesFrom",
                     "hasContactInstitution", "hasAffiliation",
                     "hasLocation", "hasCountry"}

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _slug(text: str) -> str:
    """Create a simple URI-safe slug from free text."""
    return re.sub(r"[^A-Za-z0-9_\-]", "_", str(text).strip())


def _local(uri: str | URIRef) -> str:
    """Return the local name of a URI (after # or last /)."""
    s = str(uri)
    return s.split("#")[-1] if "#" in s else s.rsplit("/", 1)[-1]


def _parse_date(val: Any) -> str | None:
    """Return an ISO date string or None."""
    if val is None:
        return None
    if isinstance(val, (datetime, date)):
        return val.strftime("%Y-%m-%d")
    try:
        # Excel may store dates as integers (days since 1899-12-30)
        ref = datetime(1899, 12, 30)
        d = ref + __import__("datetime").timedelta(days=int(val))
        return d.strftime("%Y-%m-%d")
    except Exception:
        return str(val).strip() or None


def _split_ids(raw: str) -> list[str]:
    """
    Split a semicolon- (or comma-) separated list of IDs.
    Handles messy entries like '2018_SEL;, 2019_WST; 2019_NRS'
    """
    if not raw:
        return []
    parts = re.split(r"[;,]+", str(raw))
    return [p.strip() for p in parts if p.strip()]


def _make_prefixed(local_name: str, ns: str = "ppeo") -> str:
    return f"{ns}:{local_name}"


# ---------------------------------------------------------------------------
# Ontology loader & property registry
# ---------------------------------------------------------------------------

class OntologyRegistry:
    """
    Merges one or more OWL/TTL graphs and builds lookup tables for:
      - class local-names  →  full URIs
      - property local-names  →  info dict (uri, local, type, domain, range)
      - column-mapping annotations from companion ontology

    Class and property lookups are keyed by the lower-cased local name, so
    they are case-insensitive; two entities whose local names differ only in
    case (or in namespace) share one slot and the one indexed last wins.
    """

    def __init__(self, *owl_paths: str | Path):
        """Parse every path into one graph, then build the three lookup tables."""
        self.g = Graph()
        for p in owl_paths:
            # ".ttl" files are parsed as Turtle; for anything else the format
            # is left for rdflib to determine.
            fmt = "turtle" if str(p).endswith(".ttl") else None
            self.g.parse(str(p), format=fmt)

        self._classes: dict[str, URIRef] = {}       # lower-cased local name → URI
        self._props:   dict[str, dict]   = {}       # lower-cased local name → info dict
        self._col_map: dict[str, dict[str, str]] = defaultdict(dict)
        # col_map[sheet_name][col_header] = prop_local_name

        self._build_class_index()
        self._build_property_index()
        self._build_column_map()

    # ------------------------------------------------------------------ index

    def _build_class_index(self):
        """Index every owl:Class that has a URI (blank-node classes are skipped)."""
        for cls in self.g.subjects(RDF.type, OWL.Class):
            if isinstance(cls, URIRef):
                ln = _local(cls)
                self._classes[ln.lower()] = cls

    def _build_property_index(self):
        """Index datatype, object and annotation properties with their domain and range."""
        prop_types = {
            OWL.DatatypeProperty: "DataProperty",
            OWL.ObjectProperty:   "ObjectProperty",
            OWL.AnnotationProperty: "AnnotationProperty",
        }
        for ptype, label in prop_types.items():
            for prop in self.g.subjects(RDF.type, ptype):
                if not isinstance(prop, URIRef):
                    continue
                ln = _local(prop)
                domain_uri = self.g.value(prop, RDFS.domain)
                range_uri  = self.g.value(prop, RDFS.range)
                # Domain / range are recorded only when they are plain URIs.
                self._props[ln.lower()] = {
                    "uri":    prop,
                    "local":  ln,
                    "type":   label,
                    "domain": _local(domain_uri) if domain_uri and isinstance(domain_uri, URIRef) else None,
                    "range":  _local(range_uri)  if range_uri  and isinstance(range_uri,  URIRef) else None,
                }

    def _build_column_map(self):
        """
        Parse hydata:columnMappings annotations.
        Format: "Sheet^^ColumnHeader^^PropertyLocalName"
        Values that do not split into exactly three parts are ignored.
        """
        cm_pred = HYDATA.columnMappings
        for _, _, obj in self.g.triples((None, cm_pred, None)):
            parts = str(obj).split("^^")
            if len(parts) == 3:
                sheet, col, prop = parts
                self._col_map[sheet][col] = prop

    # ------------------------------------------------------------------ query

    def resolve_class(self, name: str) -> URIRef | None:
        """Return the URI of the class whose local name matches ``name`` (case-insensitive), or None."""
        return self._classes.get(name.lower())

    def resolve_property(self, name: str) -> dict | None:
        """Return the info dict of the property whose local name matches ``name`` (case-insensitive), or None."""
        return self._props.get(name.lower())

    def column_to_property(self, sheet: str, col_header: str) -> str:
        """
        Return the ontology local-name for a column header.

        Priority:
          1. explicit companion mapping for (sheet, header)
          2. property whose local name equals the header (case-insensitive)
          3. same, after removing a leading "Word:" prefix such as "Person:"
          4. same, after collapsing the header to camelCase
          5. otherwise a coined "hydata:<slug>" term
        """
        # 1. companion explicit mapping
        # (membership test first, so the defaultdict gains no empty entries)
        if sheet in self._col_map and col_header in self._col_map[sheet]:
            return self._col_map[sheet][col_header]

        # 2. exact match in property index
        if col_header.lower() in self._props:
            return self._props[col_header.lower()]["local"]

        # 3. strip "Person:" prefix
        stripped = re.sub(r"^[A-Za-z]+:", "", col_header)
        if stripped.lower() in self._props:
            return self._props[stripped.lower()]["local"]

        # 4. camelCase collapse (e.g., "Public release date" → "publicReleaseDate")
        cc = re.sub(r"\s+(.)", lambda m: m.group(1).upper(), col_header.strip()).replace(" ", "")
        cc = cc[0].lower() + cc[1:] if cc else cc
        if cc.lower() in self._props:
            return self._props[cc.lower()]["local"]

        # 5. unknown → coin a hydata: term
        safe = _slug(col_header)
        return f"hydata:{safe}"

    def property_info(self, local_name: str) -> dict:
        """
        Return property metadata dict, falling back to a default.

        The default describes a DataProperty with no domain and no range and
        has no "uri" key.
        """
        key = local_name.replace("hydata:", "").lower()
        return self._props.get(key, {"local": local_name, "type": "DataProperty",
                                      "domain": None, "range": None})

    def class_uri(self, sheet: str) -> str:
        """
        Return a CURIE for the class corresponding to a sheet.

        A class found in the ontology is written with the prefix of whichever
        BASE_PREFIXES namespace its URI starts with (ppeo, hydata, schema,
        foaf, …). A sheet with no matching class, or whose class lives in an
        undeclared namespace, gets the coined ``hydata:<sheet>`` term.
        """
        uri = self.resolve_class(sheet)
        if uri:
            full = str(uri)
            ln = _local(uri)
            # Previously only the PPEO and HYDATA namespaces were recognised,
            # so a class from any other declared vocabulary was silently
            # re-labelled as hydata:<sheet>.
            for prefix, namespace in BASE_PREFIXES.items():
                if full.startswith(namespace):
                    return f"{prefix}:{ln}"
        return f"hydata:{sheet}"


# ---------------------------------------------------------------------------
# Excel reader
# ---------------------------------------------------------------------------

class ExcelReader:
    """
    Reads an xlsx workbook and returns sheets as lists of {col: value} dicts.
    Handles the two sheet layouts:
      - transposed  (investigation-like): row1=headers, row2=values
      - record-per-row: row1=headers, row2..N=data rows
    """

    def __init__(self, path: str | Path):
        """Open the workbook at ``path`` with openpyxl."""
        self.wb = openpyxl.load_workbook(str(path))

    @property
    def sheet_names(self) -> list[str]:
        """Names of the worksheets in the workbook."""
        return self.wb.sheetnames

    @staticmethod
    def _row_to_dict(headers, values) -> dict:
        """Pair headers with values, dropping pairs where either side is None."""
        return {
            str(h).strip(): v
            for h, v in zip(headers, values)
            if h is not None and v is not None
        }

    def read_sheet(self, name: str) -> tuple[list[dict], bool]:
        """
        Returns (rows, is_transposed).

        Each row is a dict {header: value}; cells whose header or value is
        None are left out, headers are stripped of surrounding whitespace,
        and rows that end up empty are dropped. ``is_transposed`` is always a
        real ``bool``.
        """
        ws = self.wb[name]
        all_rows = list(ws.iter_rows(values_only=True))
        if not all_rows:
            return [], False

        headers = list(all_rows[0])
        data_rows = all_rows[1:]

        # Detect transposed format: first header cell is "property" and the
        # first cell of the first data row is "value" (both case-insensitive).
        # Wrapped in bool(): the bare and-chain evaluates to an empty list
        # (not False) when the "property" header row has no data row below it.
        is_transposed = bool(
            headers and headers[0] is not None
            and str(headers[0]).strip().lower() == "property"
            and data_rows
            and data_rows[0] and str(data_rows[0][0]).strip().lower() == "value"
        )

        result = []
        if is_transposed:
            # Single logical row: zip headers → values, skipping the index
            # column; data rows after the first one are not read.
            row_dict = self._row_to_dict(headers[1:], data_rows[0][1:])
            if row_dict:
                result.append(row_dict)
        else:
            for row in data_rows:
                row_dict = self._row_to_dict(headers, row)
                if row_dict:
                    result.append(row_dict)

        return result, is_transposed


# ---------------------------------------------------------------------------
# JSON-LD builder
# ---------------------------------------------------------------------------

# Lower-cased sheet name → header of the column holding that sheet's row ID.
ID_COLUMNS: dict[str, str] = dict(
    investigation="Investigation unique ID",
    study="hasID",
    person="Person:hasORCID",   # fallback: generated UUID
)

# Lower-cased property names whose values are converted to ISO date strings.
DATE_PROPS = {
    "hassubmissiondate",
    "publicreleasedate",
    "hasstartingdate",
    "hasendingdate",
    "hasstartdatetime",
    "hasenddatetime",
}


def _node_id(sheet: str, row: dict, idx: int) -> str:
    """
    Derive the node @id for one row.

    Order of preference:
      1. the value of the sheet's configured ID column (``ID_COLUMNS``),
         when the row has that column;
      2. the first truthy value whose column name contains "id"
         (case-insensitive substring match);
      3. a freshly generated ``urn:uuid:`` identifier.

    ``idx`` is accepted but not used.
    NOTE(review): option 3 uses uuid4, so it differs from run to run.
    """
    preferred = ID_COLUMNS.get(sheet.lower())
    if preferred and preferred in row:
        return str(row[preferred]).strip()

    # No configured ID column in this row: guess from the column names.
    guessed = next(
        (value for header, value in row.items() if "id" in header.lower() and value),
        None,
    )
    if guessed is not None:
        return str(guessed).strip()

    return f"urn:uuid:{uuid.uuid4()}"


def _value_for_jsonld(prop_local: str, raw_val: Any) -> Any:
    """
    Coerce a raw cell value to an appropriate JSON-LD value representation.
    Date cells come from openpyxl as datetime objects or as Excel serial ints.
    """
    if raw_val is None:
        return None

    key = prop_local.lower().replace("hydata:", "")

    if key in DATE_PROPS or isinstance(raw_val, (datetime, date)):
        ds = _parse_date(raw_val)
        if ds:
            return {"@type": "xsd:date", "@value": ds}

    if isinstance(raw_val, float) and raw_val == int(raw_val):
        # Could be a year or serial date
        if key in DATE_PROPS:
            ds = _parse_date(int(raw_val))
            if ds:
                return {"@type": "xsd:date", "@value": ds}
        return int(raw_val)

    if isinstance(raw_val, (int, float, bool)):
        return raw_val

    return str(raw_val).strip()


def build_jsonld(
    excel: ExcelReader,
    registry: OntologyRegistry,
) -> dict:
    """
    Build the JSON-LD document (``@context`` + ``@graph``) from the workbook.

    Pass 1 walks every sheet and turns each row into a node dict.  Cells of
    object-link properties are parked under temporary ``_raw_link_<prop>``
    keys as lists of ID strings.
    Pass 2 rewrites the parked IDs into ``{"@id": ...}`` references and
    assembles the graph.

    Only nodes of the Investigation, Study and Person sheets are written to
    ``@graph`` (in that order).  The three sheets are located by exact name
    first and case-insensitively otherwise.  Nodes of any other sheet are
    built but not exported; a warning is printed to stderr for each of them.

    Args:
        excel:    Workbook reader; ``sheet_names`` and ``read_sheet()`` are used.
        registry: Ontology lookup; ``resolve_class``, ``class_uri``,
                  ``column_to_property``, ``property_info`` and
                  ``resolve_property`` are used.

    Returns:
        ``{"@context": <copy of BASE_PREFIXES>, "@graph": [node, ...]}``.
    """
    context = dict(BASE_PREFIXES)
    graph: list[dict] = []

    def _prefixed(prop: str) -> str:
        """Qualify a bare property name: ppeo: if the registry resolves it, else hydata:."""
        if ":" in prop:
            return prop
        if registry.resolve_property(prop):
            return f"ppeo:{prop}"
        return f"hydata:{prop}"

    # ------------------------------------------------------------------ pass 1
    # Raw nodes:  sheet_name -> { node_id -> node_dict }
    all_nodes: dict[str, dict[str, dict]] = {}

    for sheet_name in excel.sheet_names:
        rows, _is_transposed = excel.read_sheet(sheet_name)
        if not rows:
            print(f"  [SKIP] sheet '{sheet_name}' is empty.", file=sys.stderr)
            continue

        # Validate class
        class_uri = registry.resolve_class(sheet_name)
        if class_uri is None:
            print(f"  [WARN] Class '{sheet_name}' not found in ontology; "
                  f"using hydata:{sheet_name} as fallback.", file=sys.stderr)
        class_curie = registry.class_uri(sheet_name)

        sheet_nodes: dict[str, dict] = {}
        for idx, row in enumerate(rows):
            node_id = _node_id(sheet_name, row, idx)
            node: dict[str, Any] = {
                "@id":   node_id,
                "@type": class_curie,
            }

            for col_header, raw_val in row.items():
                prop_local = registry.column_to_property(sheet_name, col_header)
                # property_info's return contract is not visible from here;
                # tolerate a None result instead of failing on .get().
                pinfo = registry.property_info(prop_local) or {}

                # skip if value is empty
                if raw_val is None or str(raw_val).strip() == "":
                    continue

                # Object-link property -> park the raw IDs; resolved in pass 2
                is_link = (
                    prop_local in OBJECT_LINK_PROPS
                    or (pinfo.get("type") == "ObjectProperty"
                        and prop_local not in {"hasExperimentalDesign"})
                )
                if is_link:
                    ids = _split_ids(str(raw_val))
                    if ids:
                        node[f"_raw_link_{prop_local}"] = ids
                    continue

                val = _value_for_jsonld(prop_local, raw_val)
                if val is None:
                    continue

                node[_prefixed(prop_local)] = val

            if node_id in sheet_nodes:
                # Behaviour is unchanged (last row wins), but no longer silent.
                print(f"  [WARN] sheet '{sheet_name}': duplicate @id '{node_id}'; "
                      f"the earlier row is overwritten.", file=sys.stderr)
            sheet_nodes[node_id] = node
        all_nodes[sheet_name] = sheet_nodes

    # ------------------------------------------------------------------ pass 2
    # Resolve object links; build the final graph

    def _resolve_links(node: dict) -> dict:
        """Copy a node, replacing each _raw_link_<prop> entry by @id reference(s)."""
        resolved = {k: v for k, v in node.items() if not k.startswith("_raw_link_")}
        for k, v in node.items():
            if not k.startswith("_raw_link_"):
                continue
            refs = [{"@id": rid} for rid in v]
            # A single target is stored as one reference, several as a list.
            resolved[_prefixed(k[len("_raw_link_"):])] = refs if len(refs) > 1 else refs[0]
        return resolved

    def _nodes_of(sheet: str) -> dict:
        """Nodes of a core sheet: exact sheet name first, then case-insensitive."""
        if sheet in all_nodes:
            return all_nodes[sheet]
        wanted = sheet.lower()
        for name, nodes in all_nodes.items():
            if name.strip().lower() == wanted:
                return nodes
        return {}

    inv_nodes = _nodes_of("Investigation")
    study_nodes = _nodes_of("Study")
    person_nodes = _nodes_of("Person")

    # Make the limitation visible: other sheets are parsed but not exported.
    for name, nodes in all_nodes.items():
        if nodes is inv_nodes or nodes is study_nodes or nodes is person_nodes:
            continue
        print(f"  [WARN] sheet '{name}': {len(nodes)} node(s) built but not exported "
              f"(only Investigation/Study/Person are written to @graph).",
              file=sys.stderr)

    # Build study node list (resolved)
    study_list = [_resolve_links(n) for n in study_nodes.values()]

    # Build person node list (may carry partOf links to studies)
    person_list = [_resolve_links(n) for n in person_nodes.values()]

    # Build investigation node(s); each one references every study via
    # ppeo:hasPart.  NOTE: this replaces any hasPart value read from the sheet.
    for inv_node in inv_nodes.values():
        inv_resolved = _resolve_links(inv_node)
        if study_list:
            inv_resolved["ppeo:hasPart"] = [{"@id": s["@id"]} for s in study_list]
        graph.append(inv_resolved)

    graph.extend(study_list)
    graph.extend(person_list)

    return {
        "@context":  context,
        "@graph":    graph,
    }


# ---------------------------------------------------------------------------
# Cypher generator
# ---------------------------------------------------------------------------

def _cypher_label(curie: str) -> str:
    """Convert 'ppeo:study' → 'study' as a Neo4j label."""
    return curie.split(":")[-1].capitalize()


def _cypher_value(val: Any) -> str:
    """Render a JSON-LD value as a Cypher literal."""
    if isinstance(val, dict):
        # {"@type": "xsd:date", "@value": "2024-07-05"}  or  {"@id": "..."}
        v = val.get("@value", val.get("@id", ""))
        return f'"{_cypher_escape(str(v))}"'
    if isinstance(val, bool):
        return "true" if val else "false"
    if isinstance(val, (int, float)):
        return str(val)
    if isinstance(val, str):
        return f'"{_cypher_escape(val)}"'
    return f'"{_cypher_escape(str(val))}"'


def _cypher_escape(s: str) -> str:
    return s.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")


def _prop_name(pkey: str) -> str:
    """'ppeo:hasName' → 'hasName'  (drop namespace prefix)."""
    return pkey.split(":")[-1]


# Local property names (namespace prefix removed) that build_cypher() treats
# as relationship links and never stores as node properties.
RELATIONSHIP_PROPS = {
    "hasPart",
    "partOf",
    "derivesFrom",
    "hasAffiliation",
    "hasLocation",
    "hasCountry",
    "hasContactInstitution",
}


def build_cypher(jsonld: dict) -> str:
    """
    Render a JSON-LD document as a Cypher import script.

    For every node in ``jsonld["@graph"]`` one ``MERGE ... SET n += {...}``
    statement is emitted.  Properties whose local name is in
    ``RELATIONSHIP_PROPS``, and any single ``{"@id": ...}`` value, are queued
    and emitted afterwards as ``MATCH ... MATCH ... MERGE (a)-[:REL]->(b)``
    statements.  Other list values are stored as a JSON-encoded string.

    Labels, property keys and relationship types that are not plain
    identifiers (for example a key containing a space) are backtick-quoted so
    the generated statement stays valid; plain names are emitted unchanged.

    Args:
        jsonld: Document with an optional "@graph" list of node dicts.

    Returns:
        The script as one string, statements separated by newlines.
    """
    def _ident(name: str) -> str:
        """Return ``name`` as a Cypher identifier, backtick-quoting it when needed."""
        if name.isidentifier():
            return name
        return "`" + name.replace("`", "``") + "`"

    lines: list[str] = [
        "// ============================================================",
        "// AUTO-GENERATED Cypher import script",
        "// Target: Neo4j  |  Source: excel_to_jsonld.py",
        "// ============================================================",
        "",
        "// Constraints (run once)",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Investigation) REQUIRE n.id IS UNIQUE;",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Study)         REQUIRE n.id IS UNIQUE;",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Person)        REQUIRE n.id IS UNIQUE;",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Institution)   REQUIRE n.name IS UNIQUE;",
        "",
        "// ---- Nodes ----",
        "",
    ]

    rel_queue: list[tuple[str, str, str, str]] = []
    # (from_id, to_id, rel_type, from_label)

    # @id -> label, filled while the nodes are written.  setdefault keeps the
    # first node seen for an @id, like a first-match scan of the graph would.
    label_by_id: dict[Any, str] = {}

    for node in jsonld.get("@graph", []):
        node_id = node.get("@id", "")
        node_type = _cypher_label(node.get("@type", "Entity"))
        label_by_id.setdefault(node.get("@id"), node_type)
        props: dict[str, Any] = {}

        for k, v in node.items():
            if k in ("@id", "@type"):
                continue
            pname = _prop_name(k)

            # Object-link properties -> queue as relationship
            if pname in RELATIONSHIP_PROPS:
                targets = v if isinstance(v, list) else [v]
                for t in targets:
                    if isinstance(t, dict) and "@id" in t:
                        rel_queue.append((node_id, t["@id"], pname.upper(), node_type))
                continue

            # Multi-value lists that are NOT object links -> serialize as string
            if isinstance(v, list):
                props[pname] = json.dumps([
                    vv.get("@value", vv.get("@id", str(vv))) if isinstance(vv, dict) else vv
                    for vv in v
                ])
                continue

            if isinstance(v, dict):
                if "@id" in v:
                    rel_queue.append((node_id, v["@id"], pname.upper(), node_type))
                    continue
                if "@value" in v:
                    props[pname] = v["@value"]
                # dicts with neither "@id" nor "@value" are dropped
                continue

            props[pname] = v

        # Primary key defaults
        props.setdefault("id", node_id)

        prop_str = ", ".join(
            f"{_ident(p)}: {_cypher_value(val)}" for p, val in props.items()
        )
        lines.append(
            f"MERGE (n:{_ident(node_type)} {{id: {_cypher_value(node_id)}}}) "
            f"SET n += {{{prop_str}}};"
        )

    lines += ["", "// ---- Relationships ----", ""]

    for from_id, to_id, rel_type, from_label in rel_queue:
        to_label = label_by_id.get(to_id, "Entity")
        # Targets that are not nodes of this graph (or are typed "Entity") are
        # assumed to be studies, the most common case for hasPart / partOf.
        # Nothing is created for them: the MATCH simply finds no node.
        if to_label == "Entity":
            to_label = "Study"

        lines.append(
            f'MATCH (a:{_ident(from_label)} {{id: "{_cypher_escape(from_id)}"}}) '
            f'MATCH (b:{_ident(to_label)}  {{id: "{_cypher_escape(to_id)}" }}) '
            f'MERGE (a)-[:{_ident(rel_type)}]->(b);'
        )

    return "\n".join(lines)


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def parse_args(argv=None) -> argparse.Namespace:
    """
    Parse the command line of the converter.

    Args:
        argv: Optional list of argument strings.  ``None`` (the default)
              makes argparse read ``sys.argv[1:]``, which is what a call
              without arguments has always done; passing a list allows the
              parser to be driven programmatically.

    Returns:
        Namespace with ``excel_file``, ``owl_file``, ``companion``,
        ``output_jsonld``, ``output_cypher`` and ``verbose``.
    """
    p = argparse.ArgumentParser(
        description="Convert an OWL-backed Excel checklist to JSON-LD + Cypher."
    )
    p.add_argument("excel_file",  help="Input .xlsx file")
    p.add_argument("owl_file",    help="Base OWL ontology (.owl / .ttl / .rdf)")
    p.add_argument(
        "--companion", "-c",
        default=None,
        help="Optional companion .ttl ontology (column mappings, extra terms)",
    )
    p.add_argument("--output-jsonld", "-j", default="output.jsonld",
                   help="Output JSON-LD file (default: output.jsonld)")
    p.add_argument("--output-cypher", "-q", default="output.cypher",
                   help="Output Cypher file  (default: output.cypher)")
    # NOTE(review): this flag is parsed, but main() does not read args.verbose.
    p.add_argument("--verbose", "-v", action="store_true")
    return p.parse_args(argv)


def main():
    """
    Command-line entry point.

    Loads the ontology file(s), opens the workbook, reports for each sheet
    whether the registry resolves a class for it, builds the JSON-LD
    document, writes the JSON-LD and Cypher output files, and finally prints
    how many graph nodes there are per ``@type``.
    """
    args = parse_args()

    # -- step 1: ontology ----------------------------------------------------
    print(f"[1/4] Loading ontology: {args.owl_file}")
    ontology_files = [args.owl_file]
    if args.companion:
        print(f"      + companion:       {args.companion}")
        ontology_files += [args.companion]

    registry = OntologyRegistry(*ontology_files)
    n_classes = len(registry._classes)
    n_props = len(registry._props)
    print(f"      Classes: {n_classes}   Properties: {n_props}")

    # -- step 2: workbook ----------------------------------------------------
    print(f"\n[2/4] Reading Excel: {args.excel_file}")
    excel = ExcelReader(args.excel_file)
    print(f"      Sheets: {excel.sheet_names}")

    # -- step 3: report which sheets map to an ontology class ----------------
    print("\n[3/4] Validating sheets against ontology ...")
    for sheet in excel.sheet_names:
        uri = registry.resolve_class(sheet)
        if not uri:
            print(f"      ⚠  '{sheet}' has no direct class in ontology "
                  "(will use hydata: fallback)")
            continue
        print(f"      ✓  '{sheet}' → <{uri}>")

    # -- step 4: build and write the outputs ---------------------------------
    print("\n[4/4] Building JSON-LD ...")
    jsonld = build_jsonld(excel, registry)
    print(f"      Graph nodes: {len(jsonld['@graph'])}")

    out_jld = Path(args.output_jsonld)
    serialized = json.dumps(jsonld, indent=2, ensure_ascii=False)
    out_jld.write_text(serialized, encoding="utf-8")
    print(f"\n✅  JSON-LD written → {out_jld}")

    out_cyp = Path(args.output_cypher)
    out_cyp.write_text(build_cypher(jsonld), encoding="utf-8")
    print(f"✅  Cypher   written → {out_cyp}")

    # -- summary: node count per @type, most frequent first ------------------
    type_counts: dict[str, int] = {}
    for node in jsonld["@graph"]:
        node_type = node.get("@type", "??")
        type_counts[node_type] = type_counts.get(node_type, 0) + 1
    print("\n--- Node-type summary ---")
    for lbl, cnt in sorted(type_counts.items(), key=lambda pair: -pair[1]):
        print(f"  {lbl:30s}  {cnt}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()