In [1]:
import sys
import os
import configparser

from pyshacl import validate
from rdflib import Graph, RDF, SH
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Put the directory two levels above the current working directory on the
# import path; the project-local `database` package imported right after this
# is resolved from there.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))
sys.path.append(project_root)

from database.neo4j_db import Neo4jGraphDB

In [53]:
# Load configuration
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a wrong path fails with a
# clear error instead of an opaque KeyError on the "openai" section below.
config_path = '../../config.ini'
config = configparser.ConfigParser()
if not config.read(config_path):
    raise FileNotFoundError(
        f"Configuration file not found or unreadable: {os.path.abspath(config_path)}"
    )

# The API key is mandatory; the model name is optional and falls back to a default.
openai_api_key = config["openai"]["api_key"]
openai_model = config["openai"].get("model", "gpt-4.1-mini")

# Initialize OpenAI LLM
# NOTE(review): timeout is presumably expressed in seconds (3000 s = 50 min) —
# confirm that such a long timeout is intended.
llm = ChatOpenAI(
    model=openai_model,
    api_key=openai_api_key,
    temperature=0,
    max_tokens=10000,
    timeout=3000
)

# GraphDatabase
# Built without arguments; presumably Neo4jGraphDB resolves its own connection
# settings — confirm against database/neo4j_db.py.
neo4j_graph = Neo4jGraphDB()

# SHACL Validation

In [3]:
# Load the Turtle file into a fresh in-memory graph and report its size.
# Graph.parse() returns the graph it populated, so creation and loading chain.
g = Graph().parse("../../data/chicago/chicago.ttl", format="ttl")
print(f"Graph has {len(g)} statements.")

Graph has 1329504 statements.


In [54]:
# Generate the SPARQL SELECT query for the SHACL rule with an LLM

# Schema context handed to the model: prefix declarations plus a comment line
# naming the classes/properties the query is expected to use.
rdf_schema = """
@prefix gr: <http://purl.org/goodrelations/v1#> .
@prefix pco: <http://purl.org/procurement/public-contracts#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

# Assume: gr:BusinessEntity, gr:hasLicense, pco:actualEndDate, pco:bidder, pco:startDate
"""

# Natural-language description of the rule, including hints on aggregation,
# the use of ?this, and the direction of the two relationships.
message = """
- The newest license end date is the MAX of pco:actualEndDate values linked to the organization via gr:hasLicense.
- The oldest contract start date is the MIN of pco:startDate values linked via pco:bidder to the organization .
- The result should identify organizations where the newest license end date is earlier than the oldest contract start date.
- In each subquery, you must include ?this in both the WHERE and SELECT clauses representing organizations.
- Use subqueries and aggregation (MAX, MIN) to calculate the values.
- Pay attention to the relation direction considering the semantics of the relationships: 
    - pco:bidder connects the contract and the organization.
    - gr:hasLicense connects the organization and the license.
"""

# Template that combines schema and rule description and instructs the model
# to answer with a bare query starting at SELECT (no prefixes, no fences).
prompt = PromptTemplate(
    input_variables=["message", "rdf_schema"],
    template="""
You are an expert in SHACL and SPARQL.

Given the following RDF schema (Turtle format):
{rdf_schema}

And the given message:
"{message}"

Write only the SPARQL SELECT query that enforces this rule. 
Do not include any explanations, comments, SHACL syntax, sparql or RDF prefixes.
Do not include any sparql quote. 
Output only the raw SPARQL query, starting directly with SELECT.
"""
)

# Pipe prompt -> chat model -> string parser and run it once.
# NOTE(review): the returned text is not checked here; a later cell
# interpolates `sparql_query` verbatim into the SHACL shape.
chain = prompt | llm | StrOutputParser()
sparql_query = chain.invoke({"message": message, "rdf_schema": rdf_schema})
print(sparql_query)

SELECT ?this ?newestLicenseEndDate ?oldestContractStartDate WHERE {
  {
    SELECT ?this (MAX(?licenseEndDate) AS ?newestLicenseEndDate) WHERE {
      ?this a gr:BusinessEntity .
      ?this gr:hasLicense ?license .
      ?license pco:actualEndDate ?licenseEndDate .
    } GROUP BY ?this
  }
  {
    SELECT ?this (MIN(?contractStartDate) AS ?oldestContractStartDate) WHERE {
      ?contract pco:bidder ?this .
      ?contract pco:startDate ?contractStartDate .
    } GROUP BY ?this
  }
  FILTER(?newestLicenseEndDate < ?oldestContractStartDate)
}


In [5]:
# Populate the SHACL shape with the query generated by the LLM.
# The shape targets gr:BusinessEntity and carries one SPARQL-based constraint
# whose sh:select text is `sparql_query`, interpolated verbatim by the f-string.
# NOTE(review): the query text is not escaped; a triple double-quote inside
# `sparql_query` would presumably terminate the Turtle long string early —
# confirm and guard if that can happen.
shacl_shape = f'''
@prefix sh: <http://www.w3.org/ns/shacl#> .
@prefix gr: <http://purl.org/goodrelations/v1#> .
@prefix pco: <http://purl.org/procurement/public-contracts#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<#BusinessLicenseContractShape> a sh:NodeShape ;
    sh:targetClass gr:BusinessEntity ;
    sh:message "The newest license end date must not be before the oldest contract start date." ;
    sh:sparql [
        a sh:SPARQLConstraint ;
        sh:message "The newest license end date must not be before the oldest contract start date." ;
        sh:select \"\"\" {sparql_query}\"\"\" ;
    ] .
'''
print(shacl_shape)


@prefix sh: <http://www.w3.org/ns/shacl#> .
@prefix gr: <http://purl.org/goodrelations/v1#> .
@prefix pco: <http://purl.org/procurement/public-contracts#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<#BusinessLicenseContractShape> a sh:NodeShape ;
    sh:targetClass gr:BusinessEntity ;
    sh:message "The newest license end date must not be before the oldest contract start date." ;
    sh:sparql [
        a sh:SPARQLConstraint ;
        sh:message "The newest license end date must not be before the oldest contract start date." ;
        sh:select """ SELECT ?this WHERE {
  ?this a gr:BusinessEntity .

  { SELECT ?this (MAX(?endDate) AS ?newestLicenseEndDate) WHERE {
      ?this gr:hasLicense ?license .
      ?license pco:actualEndDate ?endDate .
    }
    GROUP BY ?this
  }

  { SELECT ?this (MIN(?startDate) AS ?oldestContractStartDate) WHERE {
      ?contract pco:bidder ?this .
      ?contract pco:startDate ?startDate .
    }
    GROUP BY ?this
  }

  FILTER(?newestLicenseE

In [6]:
# Parse the SHACL shape into `g`, the graph that already holds the data, so
# data triples and shape triples end up in one graph.
# NOTE(review): this mutates `g` in place; re-running the cell presumably adds
# another blank-node copy of the sh:sparql constraint — confirm, and consider a
# separate shapes graph.
g.parse(data=shacl_shape, format="turtle")

<Graph identifier=N1b0ff1308d794bbb93bdee507a5d6d31 (<class 'rdflib.graph.Graph'>)>

In [7]:
# Perform validation
# All pyshacl options are gathered in one mapping so they can be inspected or
# tweaked in a single place. No separate shapes graph is supplied
# (shacl_graph=None); presumably pyshacl then reads the shapes from `g`
# itself, where the shape was parsed earlier — confirm against pyshacl docs.
validation_options = dict(
    shacl_graph=None,
    ont_graph=None,
    inference='rdfs',
    abort_on_first=False,
    allow_infos=True,
    allow_warnings=True,
    meta_shacl=False,
    advanced=False,
    js=False,
    debug=False,
)
r = validate(g, **validation_options)

# The result is a 3-tuple: conformance flag, report graph, report text.
conforms, results_graph, results_text = r

In [8]:
# Show the textual validation report returned by validate()
print(results_text)

Validation Report
Conforms: False
Results (13):
Constraint Violation in SPARQLConstraintComponent (http://www.w3.org/ns/shacl#SPARQLConstraintComponent):
	Severity: sh:Violation
	Source Shape: <file:///Users/giuseppefutia/Desktop/code/klab/factory/chicago/#BusinessLicenseContractShape>
	Focus Node: <http://purl.org/goodrelations/v1#BusinessEntity/4:ad071729-f045-4446-b65d-b2e7706dde7b:2268190>
	Value Node: <http://purl.org/goodrelations/v1#BusinessEntity/4:ad071729-f045-4446-b65d-b2e7706dde7b:2268190>
	Source Constraint: [ rdf:type sh:SPARQLConstraint ; sh:message Literal("The newest license end date must not be before the oldest contract start date.") ; sh:select Literal(" SELECT ?this WHERE {
  ?this a gr:BusinessEntity .

  { SELECT ?this (MAX(?endDate) AS ?newestLicenseEndDate) WHERE {
      ?this gr:hasLicense ?license .
      ?license pco:actualEndDate ?endDate .
    }
    GROUP BY ?this
  }

  { SELECT ?this (MIN(?startDate) AS ?oldestContractStartDate) WHERE {
      ?contract p

# LangGraph ReAct Agent

In [55]:
# Transform the SHACL validation report graph into a JSON-serializable structure
from collections import defaultdict
import json

def parse_shacl_result(report: Graph):
    """Group the validation results of a SHACL report by their shared attributes.

    Every ``sh:ValidationResult`` found in ``report`` is reduced to its result
    path, message, severity (the part after ``#``) and source shape. Results
    that agree on all four values are merged into a single entry listing
    their focus nodes, which keeps the output compact when many nodes violate
    the same constraint.

    Args:
        report: rdflib graph holding the SHACL validation report.

    Returns:
        A list with one dict per distinct attribute combination, in
        first-seen order::

            {"common": {"message": ..., "path": ..., "severity": ...,
                        "source_shape": ...},
             "focus_nodes": [...]}

        Attributes that are absent from a result are reported as ``None``
        (JSON ``null``) rather than as the string ``"None"``.
    """
    def _text(node):
        # Graph.value() yields None for an absent property; keep it as None
        # instead of stringifying it into the misleading text "None".
        return None if node is None else str(node)

    # Tuples are hashable, so they can serve directly as grouping keys.
    grouped_violations = defaultdict(list)

    for result in report.subjects(RDF.type, SH.ValidationResult):
        # Each property is looked up exactly once per result.
        severity_iri = _text(report.value(result, SH.resultSeverity))
        severity = None if severity_iri is None else severity_iri.split("#")[-1]

        # The focus node is deliberately left out of the key: it is what
        # differs between otherwise identical results.
        key = (
            _text(report.value(result, SH.resultMessage)),
            _text(report.value(result, SH.resultPath)),
            severity,
            _text(report.value(result, SH.sourceShape)),
        )
        grouped_violations[key].append(_text(report.value(result, SH.focusNode)))

    # Keys of "common" are emitted in alphabetical order, matching the order
    # produced by the previous JSON (sort_keys=True) round trip.
    return [
        {
            "common": {
                "message": message,
                "path": path,
                "severity": severity,
                "source_shape": source_shape,
            },
            "focus_nodes": focus_nodes,
        }
        for (message, path, severity, source_shape), focus_nodes
        in grouped_violations.items()
    ]


In [None]:
# Agent tool. The docstring is presumably what langchain exposes to the LLM as
# the tool description, so treat it as runtime text when editing.
# NOTE(review): despite its name this tool performs no validation; it returns
# its argument unchanged.
@tool
def run_shacl_validation(graph_data: dict):
    """Process and report SHACL validation results from a structured representation of JSON data"""
    return graph_data

def _collect_node_ids(entry):
    """Return the node ids (last ``/``-separated segment) referenced by one input entry.

    Accepted shapes:
      * a string (full URI or bare id);
      * a dict with a ``focus_node`` string;
      * a dict with a ``focus_nodes`` list, i.e. the grouped form produced by
        ``parse_shacl_result``.

    Anything else yields an empty list so the caller simply skips it.
    """
    if isinstance(entry, str):
        uris = [entry]
    elif isinstance(entry, dict):
        uris = []
        if "focus_node" in entry:
            uris.append(entry["focus_node"])
        grouped = entry.get("focus_nodes") or []
        # Tolerate a single string where a list was expected.
        uris.extend([grouped] if isinstance(grouped, str) else grouped)
    else:
        return []
    return [uri.split("/")[-1] for uri in uris if isinstance(uri, str)]


# The docstring below is presumably what langchain exposes to the LLM as the
# tool description, so it is kept unchanged.
@tool
def get_context_from_configuration_and_query_neo4j(nodes_with_issues: list):
    """
    For each node, fetch actual related context from Neo4j using the configured Cypher queries.
    Returns real query results.
    """
    # One Cypher query per mapping; each query is parameterised by $node_id and
    # matched against elementId(o). "dst_uri" is not read by this function.
    object_property_mappings = {
        "HAS_VENDOR": {
            "dst_uri": "http://purl.org/goodrelations/v1#BusinessEntity",
            "query": """MATCH path=(c:Contract)<-[:INCLUDED_IN_CONTRACT]-(:ContractRecord)-[:HAS_VENDOR]->(contractOrg:Organization)-[:BELONGS_TO_ORG_GROUP]->(o:OrganizationGroup)
                        WHERE elementId(o) = $node_id
                        RETURN DISTINCT path"""
        },
        "HAS_LICENSE": {
            "dst_uri": "http://purl.org/goodrelations/v1#BusinessEntity",
            "query": """MATCH path=(l:LicenseRecord)<-[:ORG_HAS_LICENSE]-(licenseOrg:Organization)-[:BELONGS_TO_ORG_GROUP]->(o:OrganizationGroup)
                        WHERE elementId(o) = $node_id
                        RETURN DISTINCT path"""
        }
    }

    enriched_results = []
    # NOTE(review): this reaches into the private `_driver` attribute of
    # Neo4jGraphDB; prefer a public session accessor if the class offers one.
    with neo4j_graph._driver.session() as session:
        for entry in nodes_with_issues:
            # Unsupported entries produce no ids and are skipped (best effort).
            for node_id in _collect_node_ids(entry):
                context = {"node": node_id, "results": []}
                for mapping_name, mapping in object_property_mappings.items():
                    query = mapping["query"]
                    result = session.run(query, node_id=node_id)
                    context["results"].append({
                        "mapping": mapping_name,
                        "query": query,
                        "data": [r.data() for r in result]
                    })

                enriched_results.append(context)
    return enriched_results


# Tools handed to the agent when explain_shacl_issues() is called
tools = [run_shacl_validation, get_context_from_configuration_and_query_neo4j]

In [None]:
import json
from langgraph.prebuilt import create_react_agent

def explain_shacl_issues(llm, tools, results_graph):
    """
    Ask a LangGraph ReAct agent to summarise and explain SHACL validation results.

    The report graph is condensed with ``parse_shacl_result``, serialised to
    indented JSON and appended to a fixed list of instructions. The agent
    receives that text as a single human message.

    Parameters:
        llm: Language model that drives the agent.
        tools: Tools the agent is allowed to call.
        results_graph: SHACL validation report, passed to ``parse_shacl_result``.

    Returns:
        The content of the last message in the agent's response.
    """
    # Persona and scope, passed to the agent as its prompt
    persona = (
        "You are a helpful assistant who reports SHACL validation issues and explains them clearly. "
        "You have access to context through Cypher queries and can analyze ContractRecord and LicenseRecord details."
    )
    agent = create_react_agent(llm, tools, prompt=persona)

    # Compact the report and render it as JSON for inclusion in the request
    results_json = json.dumps(parse_shacl_result(results_graph), indent=2)

    # Numbered task list; the empty entry yields the blank line that separates
    # the instructions from the validation results.
    instructions = [
        "Please perform the following:",
        "1. Shortly summarize the SHACL validation issues.",
        "2. Explain how each issue can be fixed.",
        "3. List the id of all the affected nodes.",
        "4. For each node:",
        "   a. Execute Cypher query on the Neo4j database to gather additional details.",
        "   b. Use the results to expand on why the issue was triggered.",
        (
            "5. Prioritize `ContractRecord` and `LicenseRecord` details and the names of `contractOrg` and `licenseOrg`. "
            "Pay special attention to relevant date fields and information that helps explain the reason for the validation error."
        ),
        "",
        f"SHACL Validation Results:\n{results_json}",
    ]
    request = "\n".join(instructions)

    # Single blocking run; only the final message is of interest to the caller
    final_state = agent.invoke({"messages": [("human", request)]})
    return final_state["messages"][-1].content


In [76]:
# Run the agent on the validation report graph and print its final answer
response = explain_shacl_issues(llm=llm, tools=tools, results_graph=results_graph)
print(response)

1. Summary of SHACL Validation Issues:
The validation error states: "The newest license end date must not be before the oldest contract start date." This means that for the organizations involved, the latest license expiration date is earlier than the earliest contract start date, which is logically inconsistent.

2. How to Fix Each Issue:
- Ensure that the license periods cover or extend beyond the contract periods.
- Update license end dates to be on or after the earliest contract start date.
- Alternatively, adjust contract start dates to be on or before the latest license end date if appropriate.
- Review and correct data inconsistencies between contract and license records.

3. IDs of All Affected Nodes:
- 4:ad071729-f045-4446-b65d-b2e7706dde7b:2268190
- 4:ad071729-f045-4446-b65d-b2e7706dde7b:2270396
- 4:ad071729-f045-4446-b65d-b2e7706dde7b:2265869
- 4:ad071729-f045-4446-b65d-b2e7706dde7b:2263621
- 4:ad071729-f045-4446-b65d-b2e7706dde7b:2268088
- 4:ad071729-f045-4446-b65d-b2e7706d