In [7]:
# Add rdflib / networkx / pandas
import rdflib
import networkx as nx
import pandas as pd

# Load the Turtle graph
file_path = "data/rdf/test.ttl"          # adjust if necessary
g = rdflib.Graph().parse(file_path, format="turtle")
print(f"Loaded {len(g):,} triples")

# Gather node and predicate sets
subjects   = {s for s, _, _ in g}
predicates = {p for _, p, _ in g}
objects    = {o for _, _, o in g}
all_nodes  = subjects | objects

# Detect literals, IRIs, and blank nodes
literals   = {o for o in objects if isinstance(o, rdflib.term.Literal)}
iri_objs   = {o for o in objects if isinstance(o, rdflib.term.URIRef)}
bnode_all  = {n for n in all_nodes if isinstance(n, rdflib.term.BNode)}

# Identify classes: every ilcd: resource whose local part starts with a capital letter
classes = {
    n
    for n in all_nodes
    if isinstance(n, rdflib.term.URIRef)
       and g.namespace_manager.qname(n).startswith("ilcd:")
       and g.namespace_manager.qname(n).split(":", 1)[1][:1].isupper()
}

# Build undirected graph for structural metrics
UG = nx.Graph()
UG.add_edges_from((s, o) for s, _, o in g)
avg_degree = sum(dict(UG.degree()).values()) / UG.number_of_nodes()
is_connected   = nx.is_connected(UG)
num_components = nx.number_connected_components(UG)
largest_comp   = UG.subgraph(max(nx.connected_components(UG), key=len))
diameter       = nx.diameter(largest_comp)

# Locate module root nodes and count their triples
module_predicates = {
    "administrativeInformation",
    "exchanges",
    "lciaResults",
    "modellingAndValidation",
    "processInformation",
}

module_roots = {
    pred.split(":")[1]: obj
    for _, pred, obj in g
    if isinstance(pred, rdflib.term.URIRef)
       and g.namespace_manager.qname(pred).startswith("ilcd:")
       and g.namespace_manager.qname(pred).split(":", 1)[1] in module_predicates
}

module_counts = {
    name: len(list(g.triples((root, None, None))))
    for name, root in module_roots.items()
}

# Create summary of key metrics
summary = pd.DataFrame(
    {
        "Metric": [
            "Triples",
            "Classes (ilcd, capitalised)",
            "Distinct subjects",
            "Distinct predicates",
            "Literals (objects)",
            "IRIs – objects",
            "Blank nodes – total",
            "Average node degree",
            "Graph connected?",
            "Connected components",
            "Diameter (largest comp)",
        ],
        "Value": [
            len(g),
            len(classes),
            len(subjects),
            len(predicates),
            len(literals),
            len(iri_objs),
            len(bnode_all),
            round(avg_degree, 3),
            is_connected,
            num_components,
            diameter,
        ],
    }
)

# Display essential metrics
display(summary)

# Display per‑module triple counts
module_df = pd.DataFrame(
    {"Module": list(module_counts.keys()), "Triples": list(module_counts.values())}
).sort_values("Triples", ascending=False)
display(module_df)


Loaded 2,833 triples


Unnamed: 0,Metric,Value
0,Triples,2833
1,"Classes (ilcd, capitalised)",40
2,Distinct subjects,706
3,Distinct predicates,115
4,Literals (objects),425
5,IRIs – objects,745
6,Blank nodes – total,0
7,Average node degree,4.833
8,Graph connected?,True
9,Connected components,1


Unnamed: 0,Module,Triples
1,//example.org/ilcd/lciaResults,20
3,//example.org/ilcd/exchanges,20
4,//example.org/ilcd/processInformation,6
2,//example.org/ilcd/modellingAndValidation,5
0,//example.org/ilcd/administrativeInformation,4
