In [1]:
import sys
import os
import configparser

from pyshacl import validate
from rdflib import Graph, RDF, SH
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))
sys.path.append(project_root)

from database.neo4j_db import Neo4jGraphDB

In [2]:
# Load configuration
config = configparser.ConfigParser()
config.read('../../config.ini')

openai_api_key = config["openai"]["api_key"]
openai_model = config["openai"].get("model", "gpt-4.1-mini")

# Initialize OpenAI LLM
llm = ChatOpenAI(
    model=openai_model,
    api_key=openai_api_key,
    temperature=0,
    max_tokens=10000,
    timeout=3000
)

# GraphDatabase
neo4j_graph = Neo4jGraphDB()

# SHACL Validation

In [3]:
# Parse data
g = Graph()
g.parse("../../data/chicago/chicago.ttl", format="ttl")
print(f'Graph has {len(g)} statements.')

Graph has 1117563 statements.


In [4]:
# Generate SHACL rule with LLMs
rdf_schema = """
@prefix gr: <http://purl.org/goodrelations/v1#> .
@prefix pco: <http://purl.org/procurement/public-contracts#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

# Assume: gr:BusinessEntity, gr:hasLicense, pco:actualEndDate, pco:bidder, pco:startDate
"""

message = """
- Find organizations (?this) where the end date of their latest license is earlier than the start date of their earliest contract.
- For each organization, get the maximum pco:actualEndDate from licenses linked by gr:hasLicense and the minimum pco:startDate from contracts linked by pco:bidder.
- Use subqueries to calculate these values, including ?this and the aggregated date (MAX or MIN) in both the SELECT and WHERE clauses of each subquery.
- Only return organizations where the newest license end date is before the oldest contract start date.
- Make sure to follow the correct relationship directions.
"""

prompt = PromptTemplate(
    input_variables=["message", "rdf_schema"],
    template="""
You are an expert in SHACL and SPARQL.

Given the following RDF schema (Turtle format):
{rdf_schema}

And the given message:
"{message}"

Write only the SPARQL SELECT query that enforces this rule. 
Do not include any explanations, comments, SHACL syntax, sparql or RDF prefixes.
Do not include any sparql quote. 
Output only the raw SPARQL query, starting directly with SELECT.
"""
)

chain = prompt | llm | StrOutputParser()
sparql_query = chain.invoke({"message": message, "rdf_schema": rdf_schema})
print(sparql_query)

SELECT ?this ?maxEndDate ?minStartDate WHERE {
  {
    SELECT ?this (MAX(?endDate) AS ?maxEndDate) WHERE {
      ?this a <http://purl.org/goodrelations/v1#BusinessEntity> .
      ?this <http://purl.org/goodrelations/v1#hasLicense> ?license .
      ?license <http://purl.org/procurement/public-contracts#actualEndDate> ?endDate .
    } GROUP BY ?this
  }
  {
    SELECT ?this (MIN(?startDate) AS ?minStartDate) WHERE {
      ?this a <http://purl.org/goodrelations/v1#BusinessEntity> .
      ?contract <http://purl.org/procurement/public-contracts#bidder> ?this .
      ?contract <http://purl.org/procurement/public-contracts#startDate> ?startDate .
    } GROUP BY ?this
  }
  FILTER(?maxEndDate < ?minStartDate)
}


In [5]:
# Populate shacl SHACL shape with query generated from LLM
shacl_shape = f'''
@prefix sh: <http://www.w3.org/ns/shacl#> .
@prefix gr: <http://purl.org/goodrelations/v1#> .
@prefix pco: <http://purl.org/procurement/public-contracts#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<#BusinessLicenseContractShape> a sh:NodeShape ;
    sh:targetClass gr:BusinessEntity ;
    sh:message "The newest license end date must not be before the oldest contract start date." ;
    sh:sparql [
        a sh:SPARQLConstraint ;
        sh:message "The newest license end date must not be before the oldest contract start date." ;
        sh:select \"\"\" {sparql_query}\"\"\" ;
    ] .
'''
print(shacl_shape)


@prefix sh: <http://www.w3.org/ns/shacl#> .
@prefix gr: <http://purl.org/goodrelations/v1#> .
@prefix pco: <http://purl.org/procurement/public-contracts#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<#BusinessLicenseContractShape> a sh:NodeShape ;
    sh:targetClass gr:BusinessEntity ;
    sh:message "The newest license end date must not be before the oldest contract start date." ;
    sh:sparql [
        a sh:SPARQLConstraint ;
        sh:message "The newest license end date must not be before the oldest contract start date." ;
        sh:select """ SELECT ?this ?maxEndDate ?minStartDate WHERE {
  {
    SELECT ?this (MAX(?endDate) AS ?maxEndDate) WHERE {
      ?this a <http://purl.org/goodrelations/v1#BusinessEntity> .
      ?this <http://purl.org/goodrelations/v1#hasLicense> ?license .
      ?license <http://purl.org/procurement/public-contracts#actualEndDate> ?endDate .
    } GROUP BY ?this
  }
  {
    SELECT ?this (MIN(?startDate) AS ?minStartDate) WHERE {
      ?this a 

In [6]:
# Parse shacl data
g.parse(data=shacl_shape, format="turtle")

<Graph identifier=N4f782c2d4fab4e02b97b80c18a17674f (<class 'rdflib.graph.Graph'>)>

In [None]:
# Perform validation
r = validate(g,
      shacl_graph=None,
      ont_graph=None,
      inference='rdfs',
      abort_on_first=False,
      allow_infos=True,
      allow_warnings=True,
      meta_shacl=False,
      advanced=False,
      js=False,
      debug=False)
conforms, results_graph, results_text = r

In [None]:
print(results_text)

Validation Report
Conforms: False
Results (13):
Constraint Violation in SPARQLConstraintComponent (http://www.w3.org/ns/shacl#SPARQLConstraintComponent):
	Severity: sh:Violation
	Source Shape: <file:///Users/giuseppefutia/Desktop/code/klab/factory/chicago/#BusinessLicenseContractShape>
	Focus Node: <http://purl.org/goodrelations/v1#BusinessEntity/4:5bc9e9e3-9f8e-4060-84df-00fa505e2753:2323361>
	Value Node: <http://purl.org/goodrelations/v1#BusinessEntity/4:5bc9e9e3-9f8e-4060-84df-00fa505e2753:2323361>
	Source Constraint: [ rdf:type sh:SPARQLConstraint ; sh:message Literal("The newest license end date must not be before the oldest contract start date.") ; sh:select Literal(" SELECT ?this ?newestLicenseEndDate ?oldestContractStartDate WHERE {
  {
    SELECT ?this (MAX(?licenseEndDate) AS ?newestLicenseEndDate) WHERE {
      ?this a gr:BusinessEntity .
      ?this gr:hasLicense ?license .
      ?license pco:actualEndDate ?licenseEndDate .
    } GROUP BY ?this
  }
  {
    SELECT ?this (MIN

# Langraph React Agent

In [None]:
# Trasform SHACL-validation graph into a JSON
from collections import defaultdict
import json

def parse_shacl_result(report: Graph):
    grouped_violations = defaultdict(list)

    for result in report.subjects(RDF.type, SH.ValidationResult):
        focus_node = str(report.value(result, SH.focusNode))
        path = str(report.value(result, SH.resultPath)) if report.value(result, SH.resultPath) else None
        message = str(report.value(result, SH.resultMessage))
        severity = str(report.value(result, SH.resultSeverity)).split("#")[-1]
        source_shape = str(report.value(result, SH.sourceShape)) if report.value(result, SH.sourceShape) else None

        # Create a group key excluding the focus_node
        key = json.dumps({
            "path": path,
            "message": message,
            "severity": severity,
            "source_shape": source_shape
        }, sort_keys=True)

        grouped_violations[key].append(focus_node)

    # Format the grouped output
    compressed_output = []
    for key_str, focus_nodes in grouped_violations.items():
        common = json.loads(key_str)
        compressed_output.append({
            "common": common,
            "focus_nodes": focus_nodes
        })

    return compressed_output


In [None]:
@tool
def generate_sparql_query(rdf_schema: str, message: str):
    """
    Generates a SPARQL query based on the RDF schema and a message describing the validation.
    
    Expects a dictionary with two keys:
      - 'rdf_schema': a string with RDF prefixes and assumptions
      - 'message': a string describing the rule to implement
    Returns the SPARQL query string.
    """
    
    if not rdf_schema or not message:
        return "Error: 'rdf_schema' and 'message' must be provided."

    prompt = f"""
        You are an expert in SHACL and SPARQL.

        Given the following RDF schema (Turtle format):
        {rdf_schema}

        And the following task description:
        "{message}"

        Write only the SPARQL SELECT query that retrieves the described results.
        Do not include any explanations, comments, SHACL syntax, sparql or RDF prefixes.
        Your response must start directly with SELECT and include only the query body.
        Show the generated query.
        """

    return prompt

@tool
def run_shacl_validation(unused : str):
    """Process and report SHACL validation results from a structured representation of JSON data"""
    graph_data = parse_shacl_result(results_graph)
    graph_data_str = json.dumps(graph_data, indent=2)
    return graph_data_str

@tool
def get_context_from_configuration_and_query_neo4j(nodes_with_issues: list):
    """
    For each node, fetch actual related context from Neo4j using the configured Cypher queries.
    Returns real query results.
    """
    object_property_mappings = {
        "HAS_VENDOR": {
            "dst_uri": "http://purl.org/goodrelations/v1#BusinessEntity",
            "query": """MATCH path=(c:Contract)<-[:INCLUDED_IN_CONTRACT]-(:ContractRecord)-[:HAS_VENDOR]->(contractOrg:Organization)-[:BELONGS_TO_ORG_GROUP]->(o:OrganizationGroup)
                        WHERE elementId(o) = $node_id
                        RETURN DISTINCT path"""
        },
        "HAS_LICENSE": {
            "dst_uri": "http://purl.org/goodrelations/v1#BusinessEntity",
            "query": """MATCH path=(l:LicenseRecord)<-[:ORG_HAS_LICENSE]-(licenseOrg:Organization)-[:BELONGS_TO_ORG_GROUP]->(o:OrganizationGroup)
                        WHERE elementId(o) = $node_id
                        RETURN DISTINCT path"""
        }
    }

    enriched_results = []
    with neo4j_graph._driver.session() as session:
        for full_uri in nodes_with_issues:
            # Safely extract node_id
            if isinstance(full_uri, dict) and "focus_node" in full_uri:
                node_id = full_uri["focus_node"].split("/")[-1]
            elif isinstance(full_uri, str):
                node_id = full_uri.split("/")[-1]
            else:
                continue  # or raise an error

            context = {"node": node_id, "results": []}
            for mapping_name, mapping in object_property_mappings.items():
                query = mapping["query"]
                result = session.run(query, node_id=node_id)
                context["results"].append({
                    "mapping": mapping_name,
                    "query": query,
                    "data": [r.data() for r in result]
                })

            enriched_results.append(context)
    return enriched_results


tools = [generate_sparql_query, run_shacl_validation, get_context_from_configuration_and_query_neo4j]

In [None]:
from langgraph.prebuilt import create_react_agent

def explain_shacl_issues(llm, tools):
    """
    Run a LangGraph agent to summarize and explain SHACL validation results,
    including deeper insights via Cypher queries.
    
    Parameters:
        llm: The language model instance.
        tools: List of tools including any CypherQuery tool needed.
    
    Returns:
        The explanation string from the agent's final message.
    """

    # System message (persona and scope)
    system_message = (
        "You are helpful assistant that it is able to generate SPARQL query for RDF validation."
        "You are a helpful assistant who reports SHACL validation issues and explains them clearly. "
        "You have access to context through Cypher queries and can analyze ContractRecord and LicenseRecord details."
    )

    # Build the LangGraph REAct-style agent
    langgraph_agent_executor = create_react_agent(llm, tools, prompt=system_message)
    
    # Structured, improved prompt
    base_query = (
        f""" "Please perform the following:\n"
        "0. Based on the RDF schema and a message of the validation goal, generate a SPARQL query for SHACL validation. {rdf_schema}, {message}\n"
        "1. Run the SHACL validation and summarize the SHACL validation issues.\n"
        "2. Explain how each issue can be fixed.\n"
        "3. List the id of all the affected nodes.\n"
        "4. For each node:\n"
        "   a. Execute Cypher query on the Neo4j database to gather additional details.\n"
        "   b. Use the results to expand on why the issue was triggered.\n"
        "5. Prioritize `ContractRecord` and `LicenseRecord` details and the names of `contractOrg` and `licenseOrg`. "
        "Pay special attention to relevant date fields and information that helps explain the reason for the validation error.\n\n" """
    )

    # Invoke the agent
    print("---DEBUGGING")
    for step in langgraph_agent_executor.stream({"messages": [("human", base_query)]}):
        print(step)
    
    print("---FINAL VERSION")
    response = langgraph_agent_executor.invoke({"messages": [("human", base_query)]})
    return response["messages"][-1].content


In [12]:
response = explain_shacl_issues(llm=llm, tools=tools)
print(response)

---DEBUGGING
{'agent': {'messages': [AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_Wo7EUrihwrIRQZI4fRi99Jr6', 'function': {'arguments': '{"unused":""}', 'name': 'run_shacl_validation'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 265, 'total_tokens': 282, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_79b79be41f', 'id': 'chatcmpl-BRBZ8olKXtviYbaei5wD7weSiVLe9', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-d7c8cdab-42e4-4dfc-a91c-d055cc2d272d-0', tool_calls=[{'name': 'run_shacl_validation', 'args': {'unused': ''}, 'id': 'call_Wo7EUrihwrIRQZI4fRi99Jr6', 'type': 'tool_call'}], usage_metadata={'input_tokens': 265, 'output_tokens': 17, 'total_tokens': 282, 'inpu