In [9]:
import requests
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from openai import AzureOpenAI
import json
import pandas as pd
from types import SimpleNamespace
import time
from rdflib.plugins.sparql.parser import parseQuery
from rdflib import Graph

# ------------------------- Configuration and Initialization -------------------------
def load_config(path="config.json"):
    """
    Load the notebook configuration from a JSON file.

    Every JSON object in the document is converted to a ``SimpleNamespace``,
    so nested settings are read with attribute access
    (for example ``config.chat.model``).

    Args:
        path: Location of the JSON configuration file. Defaults to
            ``config.json`` relative to the current working directory.

    Returns:
        The parsed document, with each JSON object turned into a
        ``SimpleNamespace``.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    try:
        # JSON is UTF-8 by specification; do not rely on the platform default encoding.
        with open(path, encoding="utf-8") as f:
            return json.load(f, object_hook=lambda d: SimpleNamespace(**d))
    except FileNotFoundError as exc:
        # Keep the original exception attached so the offending path stays visible.
        raise FileNotFoundError("Config file not found. Please check the path.") from exc

def initialize_azure_client(config):
    """
    Build the Azure OpenAI client, reading its API key from Azure Key Vault.

    Args:
        config: Settings object exposing ``key_vault_url``,
            ``dev_secret_name``, ``chat.api_version`` and
            ``chat.azure_endpoint``.

    Returns:
        An ``AzureOpenAI`` client configured with the secret's value as key.
    """
    vault_url = config.key_vault_url
    credential = DefaultAzureCredential()
    vault = SecretClient(vault_url=vault_url, credential=credential)

    # The secret named in the config holds the OpenAI API key.
    api_key = vault.get_secret(config.dev_secret_name).value

    return AzureOpenAI(
        api_key=api_key,
        api_version=config.chat.api_version,
        azure_endpoint=config.chat.azure_endpoint,
    )

# Initialize Azure OpenAI client
# `config` and `llm` are module-level globals: natural_language_to_sparql() and
# format_results_to_nl() read them directly, so this cell must run first.
config = load_config()
llm = initialize_azure_client(config)


In [10]:

# ------------------------- Apache Jena Fuseki Functions -------------------------
def upload_to_jena(rdf_file_path, fuseki_data_url):
    """
    Upload RDF data to Apache Jena Fuseki using the /data endpoint.

    The file content is sent unchanged as the body of an HTTP POST with a
    ``text/turtle`` content type. The outcome is reported on stdout.

    Args:
        rdf_file_path: Path of the Turtle file to upload.
        fuseki_data_url: URL the data is POSTed to.

    Returns:
        None. A rejected upload is printed (status code and response body),
        not raised.
    """
    with open(rdf_file_path, 'rb') as rdf_file:
        rdf_data = rdf_file.read()

    headers = {"Content-Type": "text/turtle"}
    response = requests.post(fuseki_data_url, data=rdf_data, headers=headers)

    # A Graph Store POST may legitimately answer 200, 201 (Created) or
    # 204 (No Content), so every 2xx status counts as success.
    if 200 <= response.status_code < 300:
        print("RDF data successfully uploaded to Apache Jena Fuseki.")
    else:
        print(f"Failed to upload RDF data. Status code: {response.status_code}")
        print(response.text)

def query_jena(sparql_query, fuseki_sparql_url="http://localhost:3030/finKG/sparql"):
    """
    Query the Apache Jena Fuseki SPARQL endpoint.

    The query is sent as the ``query`` parameter of an HTTP GET, asking for
    results in the SPARQL JSON results format.

    Args:
        sparql_query: SPARQL query text to execute.
        fuseki_sparql_url: URL of the SPARQL query endpoint. Defaults to the
            local ``finKG`` dataset.

    Returns:
        The decoded JSON response body.

    Raises:
        Exception: If the endpoint answers with any status other than 200.
            The message carries the status code and, when present, the start
            of the response body.
    """
    headers = {"Accept": "application/sparql-results+json"}
    response = requests.get(fuseki_sparql_url, params={"query": sparql_query}, headers=headers)
    if response.status_code == 200:
        return response.json()

    message = f"SPARQL query failed with status {response.status_code}"
    # Include the server's explanation; cap it so a large error page cannot flood the log.
    detail = (response.text or "").strip()
    if detail:
        message += f": {detail[:500]}"
    raise Exception(message)


In [11]:

# ------------------------- JSON-LD Processing -------------------------
def _jsonld_has_type(entry, type_name):
    """Return True if the node's ``@type`` is, or (when a list) contains, ``type_name``."""
    declared = entry.get("@type")
    if isinstance(declared, (list, tuple)):
        return type_name in declared
    return declared == type_name


def process_jsonld(file_path):
    """
    Reads a JSON-LD file, extracts prefixes, classes, and properties,
    and prepares the data for further use.

    Args:
        file_path: Path of the JSON-LD document to read.

    Returns:
        A dict with three keys:
          - "prefixes": the entries of the document's ``@context``;
          - "classes": one dict (id, label, description, subClassOf) per
            ``@graph`` node typed ``rdfs:Class``;
          - "properties": one dict (id, label, description, domain, range,
            examples) per ``@graph`` node typed ``rdf:Property``, where
            ``examples`` is a JSON-encoded string.
        Missing optional fields are replaced by placeholder text.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        jsonld_data = json.load(file)

    # "@context" is usually a single object, but JSON-LD also allows a list
    # of contexts; merge every object found and skip non-object members.
    context = jsonld_data.get("@context", {})
    prefixes = {}
    for part in (context if isinstance(context, list) else [context]):
        if isinstance(part, dict):
            prefixes.update(part)

    graph = [node for node in jsonld_data.get("@graph", []) if isinstance(node, dict)]

    formatted_classes = [
        {
            "id": cls.get("@id"),
            "label": cls.get("label", "No label available"),
            "description": cls.get("description", "No description available"),
            "subClassOf": cls.get("subClassOf", [])
        }
        for cls in graph
        if _jsonld_has_type(cls, "rdfs:Class")
    ]
    formatted_properties = [
        {
            "id": prop.get("@id"),
            "label": prop.get("label", "No label available"),
            "description": prop.get("description", "No description available"),
            "domain": prop.get("domain", "No domain specified"),
            "range": prop.get("range", "No range specified"),
            # Serialised so the examples can be embedded verbatim in a prompt.
            "examples": json.dumps(prop.get("examples", []))
        }
        for prop in graph
        if _jsonld_has_type(prop, "rdf:Property")
    ]
    return {
        "prefixes": prefixes,
        "classes": formatted_classes,
        "properties": formatted_properties
    }

def prepare_prompt_from_jsonld(jsonld_output):
    """
    Prepares a formatted prompt string based on the extracted JSON-LD data.

    The prompt lists the ontology's prefixes, classes and properties, then
    the query-mapping rules, the output rules and a set of worked example
    SPARQL queries.

    Args:
        jsonld_output: Dict with the keys "prefixes" (mapping of prefix to
            IRI), "classes" (list of dicts with id/label/description/
            subClassOf) and "properties" (list of dicts with id/label/
            description/domain/range/examples).

    Returns:
        The complete prompt text as a single string.
    """
    formatted_prefixes = "\n".join(
        f"{key}: <{value}>" for key, value in jsonld_output["prefixes"].items()
    )
    formatted_classes = "\n".join(
        f"- ID: {cls['id']}, Label: {cls['label']}, Description: {cls['description']}, SubClassOf: {cls['subClassOf']}"
        for cls in jsonld_output["classes"]
    )
    formatted_properties = "\n".join(
        f"- ID: {prop['id']}, Label: {prop['label']}, Description: {prop['description']}, Domain: {prop['domain']}, Range: {prop['range']}, Examples: {prop['examples']}"
        for prop in jsonld_output["properties"]
    )

    # The example queries are imitated by the model, so each one must be valid
    # SPARQL and must follow the output rules stated above it (plain quoted
    # literals, OPTIONAL schema:sources, lowercase 'schema:' prefix, no code
    # fences). Braces are doubled because this is an f-string.
    formatted_jsonld = f"""
    You are a RDF and SPARQL expert. Your task is to generate SPARQL queries based on the given ontology schema:
    ### Prefixes
    {formatted_prefixes}

    ### Classes
    {formatted_classes}

    ### Properties
    {formatted_properties}

    ### **Query Mapping Rules**
    - **People Queries:** Use 'schema:givenName', 'schema:familyName', 'schema:email', 'schema:gid', 'schema:jobTitle', 'fin:manages', 'schema:hasManager', 'pro:manages'.
    - **Organization Queries:** Use 'schema:name', 'schema:hasHead', 'schema:hasChildOrganization', 'schema:parentOrganization', 'schema:worksFor'.
    - **Application Queries:** Use 'app:appName' for exact application names, 'app:accessLink', 'app:managedBy', 'app:hasOwner', 'app:partofOrg', 'app:appDescription' for descriptions, 'pro:partOfOrg' .
    - **Process Queries:** Use 'pro:title', 'pro:description', 'pro:managedBy', 'pro:hasChildProcess', 'pro:hasApplication', 'pro:hasOwner'.

    ### **Output Rules**
    - ALWAYS include 'schema:givenName' and 'schema:familyName' for 'Person' data and 'schema:name' for 'Organisation' data.
    - Ensure case-insensitive matching for 'schema:name' and 'schema:description' properties.
    - Ensure case-insensitive matching for 'app:appDescription' property using FILTER regex.
    - Use 'OPTIONAL' for the 'schema:sources' property for all.
    - Output only the SPARQL query without any additional text.

    ### **Examples SPARQL Queries**

        **Example 1: List people located in specific location**

        PREFIX schema: <https://schema.org/>
        SELECT ?givenName ?familyName ?email ?gid ?jobTitle ?sources
        WHERE {{
            ?person a schema:Person ;
                    schema:location "DXB G" ;
                    schema:givenName ?givenName ;
                    schema:familyName ?familyName ;
                    schema:email ?email ;
                    schema:gid ?gid ;
                    schema:jobTitle ?jobTitle .
            OPTIONAL {{ ?person schema:sources ?sources . }}

        }}


        **Example 2: Retrieve the head of a specific organization**

        PREFIX schema: <https://schema.org/>
        SELECT ?givenName ?familyName ?email ?sources
        WHERE {{
            ?organization a schema:Organization ;
                          schema:name "Martin-Bishop" ;
                          schema:hasHead ?head .
            ?head schema:givenName ?givenName ;
                  schema:familyName ?familyName ;
                  schema:email ?email .
            OPTIONAL {{ ?head schema:sources ?sources . }}
            OPTIONAL {{ ?organization schema:sources ?sources . }}
        }}


        **Example 3: Count the number of employees in an organization**

        PREFIX schema: <https://schema.org/>
        SELECT (COUNT(?employee) AS ?employeeCount)
        WHERE {{
            ?employee a schema:Person ;
                      schema:worksFor ?organization .
            ?organization schema:name "Martin-Bishop" .
            OPTIONAL {{ ?employee schema:sources ?sources . }}
            OPTIONAL {{ ?organization schema:sources ?sources . }}
        }}


        **Example 4: Retrieve the applications managed by a specific person**
        PREFIX app: <http://application.com/>
        PREFIX schema: <https://schema.org/>

        SELECT ?appName ?accessLink ?sources
        WHERE {{
            ?manager a schema:Person ;
                    schema:givenName "Lori" ;
                    schema:familyName "Bennett" .
            ?application a app:Application ;
                        app:managedBy ?manager ;
                        app:appName ?appName ;
                        app:accessLink ?accessLink ;
                        app:isDeleted false .
            OPTIONAL {{ ?application schema:sources ?sources . }}
        }}



        **Example 5: List all applications owned by a specific employee**

        PREFIX app: <http://application.com/>
        PREFIX schema: <https://schema.org/>

        SELECT ?appName ?accessLink ?sources
        WHERE {{
            ?owner a schema:Person ;
                    schema:givenName "Lori" ;
                    schema:familyName "Bennett" .
            ?application a app:Application ;
                        app:hasOwner ?owner ;
                        app:appName ?appName ;
                        app:accessLink ?accessLink ;
                        app:isDeleted false .
            OPTIONAL {{ ?application schema:sources ?sources . }}
        }}


        **Example 6: Retrieve applications under a specific organization**


        PREFIX app: <http://application.com/>
        PREFIX schema: <https://schema.org/>
        SELECT ?appName ?accessLink ?sources
        WHERE {{
            ?application a app:Application ;
            app:appName ?appName ;
            app:accessLink ?accessLink ;
            app:worksForOrg ?organization .

            ?organization schema:name "Martin-Bishop" .
            OPTIONAL {{ ?application schema:sources ?sources . }}
            OPTIONAL {{ ?organization schema:sources ?sources . }}
            }}


        **Example 7: Retrieve applications under a parent organization**


        PREFIX app: <http://application.com/>
        PREFIX schema: <https://schema.org/>
        SELECT ?appName ?accessLink ?sources
        WHERE {{
            ?application a app:Application ;
            app:appName ?appName ;
            app:accessLink ?accessLink ;
            app:worksForOrg ?organization .
            ?organization schema:parentOrganization ?parentOrg .
            ?parentOrg schema:name "Mooney, Marshall and Parker" .
            OPTIONAL {{ ?application schema:sources ?sources . }}
            OPTIONAL {{ ?organization schema:sources ?sources . }}
            }}

        **Example 8: Retrieve applications managed by a specific person's team members**
        PREFIX app: <http://application.com/>
        PREFIX schema: <https://schema.org/>
        SELECT ?appName ?accessLink ?name ?sources
        WHERE {{
            ?employee a schema:Person ;
                    schema:givenName "Lori" ;
                    schema:familyName "Bennett" ;
                    schema:worksFor ?organization .
            ?teamMember a schema:Person ;
                        schema:name ?name ;
                        schema:worksFor ?organization .

            ?application a app:Application ;
                        app:managedBy ?teamMember ;
                        app:appName ?appName ;
                        app:accessLink ?accessLink .
            OPTIONAL {{ ?application schema:sources ?sources . }}
            OPTIONAL {{ ?teamMember schema:sources ?sources . }}
        }}


        **Example 9: Retrieve the manager of an employee**
        PREFIX schema: <https://schema.org/>

        SELECT ?lineManagerName ?sources
        WHERE {{
        ?person schema:givenName "Billy" ;
                schema:familyName "Bryan" ;
                schema:hasManager ?manager .
        ?manager schema:givenName ?managerGivenName ;
                schema:familyName ?managerFamilyName .
        BIND(CONCAT(?managerGivenName, " ", ?managerFamilyName) AS ?lineManagerName)
        OPTIONAL {{ ?manager schema:sources ?sources . }}
        }}

        **Example 10: Retrieve people reporting to a person**
        PREFIX schema: <https://schema.org/>

        SELECT ?reporteeName ?email ?sources
        WHERE {{
        ?manager schema:givenName "Billy" ;
                schema:familyName "Bryan" .
        ?reportee schema:hasManager ?manager ;
                    schema:givenName ?reporteeGivenName ;
                    schema:familyName ?reporteeFamilyName ;
                    schema:email ?email .
        OPTIONAL {{ ?reportee schema:sources ?sources . }}
        BIND(CONCAT(?reporteeGivenName, " ", ?reporteeFamilyName) AS ?reporteeName)
        }}

        PREFIX schema: <https://schema.org/>
        PREFIX fin: <http://financial.com/>

        SELECT ?reporteeName ?email ?sources
        WHERE {{
        ?manager schema:givenName "Billy" ;
                schema:familyName "Bryan" ;
                fin:manages ?reportee.
        ?reportee schema:givenName ?reporteeGivenName ;
                    schema:familyName ?reporteeFamilyName ;
                    schema:email ?email .
        OPTIONAL {{ ?reportee schema:sources ?sources . }}
        BIND(CONCAT(?reporteeGivenName, " ", ?reporteeFamilyName) AS ?reporteeName)
        }}

        **Example 11:  Which application gives an overview/description of the Actuals in the System?  **
        PREFIX app: <http://application.com/>
        PREFIX schema: <https://schema.org/>
        SELECT ?appName ?accessLink ?sources
        WHERE {{
            ?application a app:Application ;
                        app:appName ?appName ;
                        app:accessLink ?accessLink ;
                        app:appDescription ?description .
            FILTER (
            regex(?description, "Actuals in the system", 'i') &&
            regex(?description, "Actuals", 'i')
            )
            OPTIONAL {{  ?application schema:sources ?sources . }}
        }}

        **Example 12: Retrieve applications managed by a specific person**
        PREFIX app: <http://application.com/>
        PREFIX schema: <https://schema.org/>
        SELECT ?appName ?accessLink ?sources
        WHERE {{
            ?manager a schema:Person ;
                    schema:givenName "Lori" ;
                    schema:familyName "Bennett" .
            ?application a app:Application ;
                        app:managedBy ?manager ;
                        app:appName ?appName ;
                        app:accessLink ?accessLink .
            OPTIONAL {{ ?application schema:sources ?sources . }}
        }}

        **Retrieve processes where the process description contains Project reporting actuals and FC**
        PREFIX pro: <http://process.com/>
        PREFIX schema: <https://schema.org/>

        SELECT ?proName ?sources
        WHERE {{
            ?process a pro:Process ;
                        pro:title ?proName ;
                        pro:description ?description .
            FILTER (
                regex(?description, "Project", 'i') &&
                regex(?description, "Reporting", 'i') &&
                regex(?description, "Actuals", 'i') &&
                regex(?description, "FC", 'i')
            )
            OPTIONAL {{ ?process schema:sources ?sources . }}
        }}

        **Retrieve processes managed by a specific person**
        PREFIX pro: <http://process.com/>
        PREFIX schema: <https://schema.org/>
        SELECT ?proName ?sources
        WHERE {{
            ?manager a schema:Person ;
                    schema:givenName "Lori" ;
                    schema:familyName "Bennett" .
            ?process a pro:Process ;
                        pro:managedBy ?manager ;
                        pro:title ?proName .
            OPTIONAL {{ ?process schema:sources ?sources . }}
        }}
        **Retrieve description of the process 'Order Backlog'**
        PREFIX pro: <http://process.com/>
        PREFIX schema: <https://schema.org/>
        SELECT ?proDescription ?sources
        WHERE {{
            ?process a pro:Process ;
                    pro:title "Order Backlog" ;
                    pro:description ?proDescription .
            OPTIONAL {{ ?process schema:sources ?sources . }}
        }}
    """
    return formatted_jsonld

# Process JSON-LD and prepare the prompt
# This runs when the cell executes. `JsonSchema` is a module-level global that
# natural_language_to_sparql() interpolates into its system message, so this
# cell must run before any query is generated.
file_path = r'kgCreation/ontology_schema.jsonld'
jsonld_output = process_jsonld(file_path)
JsonSchema = prepare_prompt_from_jsonld(jsonld_output)


In [12]:

def validate_sparql(sparql_query):
    """
    Run cheap sanity checks on a generated SPARQL query string.

    The query is NOT parsed. Only responses that are obviously unusable are
    rejected: a missing or blank query, and a query still wrapped in Markdown
    code fences.

    Args:
        sparql_query: Query text returned by the language model.

    Returns:
        True when the query passes the checks.

    Raises:
        ValueError: If the query is not a non-blank string, or if it starts
            or ends with a triple-backtick fence.
    """
    if not isinstance(sparql_query, str) or not sparql_query.strip():
        raise ValueError("SPARQL query is empty.")

    # Ignore surrounding whitespace so a fence followed by a newline is still caught.
    candidate = sparql_query.strip()
    if candidate.startswith("```") or candidate.endswith("```"):
        raise ValueError("SPARQL query contains invalid backticks.")
    return True

# ------------------------- SPARQL Generation and Querying -------------------------
def natural_language_to_sparql(natural_language_query):
    """
    Ask the language model to translate a question into a SPARQL query.

    The system message embeds the module-level ``JsonSchema`` prompt followed
    by the task rules. The user message carries the question. The request is
    sent through the module-level ``llm`` client using ``config.chat.model``.

    Args:
        natural_language_query: The user's question in plain language.

    Returns:
        A ``(sparql_query, query_generation_time)`` tuple: the model's reply
        with surrounding whitespace removed, and the elapsed time of the
        request in seconds.

    Raises:
        ValueError: If the model's reply has no text content.
    """
    messages = [
        {
            "role": "system",
            "content": f"""
        You are a SPARQL query expert that converts natural language queries into SPARQL queries for a specific RDF knowledge graph schema based on the ontology schema.
        Your task is to generate SPARQL queries that strictly adhere to the given schema and use the correct prefixes, classes, and properties.

        The ontology schema delimited by triple backticks is:

        ** Ontology Schema:**
        ```
        {JsonSchema}
        ```

        ### Task
        - Generate SPARQL queries strictly adhering to the schema above.
        - Use only properties and classes defined in the schema, Do NOT create or infer new prefixes or namespaces

        - For applications:
        - Identify if the query refers to `app:appName` or `app:appDescription`.
        - Use `app:hasOwner` for owner retrieval, linked to `schema:Person`.
        - Match `app:appDescription` using case-insensitive `FILTER regex(..., "i")` for each keyword.
        - Use `OPTIONAL` for `?sources`.

        - For processes:
        - Use `pro:title` and `pro:description`.
        - Use `pro:hasOwner` for owners, linked to `schema:Person`.

        - For employees:
        - Retrieve `schema:givenName`, `schema:familyName`, `schema:email`, and `schema:jobTitle`.

        - For organizational structure:
        - Use `schema:parentOrganization` and `schema:hasChildOrganization` (never alternatives like `fin:parentOrganization`).
        - Use `schema:description` when the org name contains natural language .
        - Use `schema:name` for acronyms or short labels.

        - Query construction:
        - Extract keywords from the input and apply individual `FILTER regex(..., "i")` conditions combined with `&&`.
        - If querying process name, use `FILTER regex(?appName, "...", "i")`.
        - If querying description, use `FILTER regex(?appDescription, "...", "i")`.

        - Return only the SPARQL query. Do not include any explanation, comments, or backticks.

        Example input: "Which applications are used in the AR, Overdues community?"
        → Match `app:appDescription` with keywords `["AR", "Overdues", "community"]`.

        Do not include explanations or apologies in your responses.
        Do NOT wrap the query in backticks.
        Do NOT include any text except the SPARQL query generated.
        """
        },
        {
            "role": "user",
            "content": f"""
        {natural_language_query}
        """
        }
    ]

    # perf_counter is monotonic, so the measured duration cannot be skewed by clock changes.
    start_time = time.perf_counter()
    response = llm.chat.completions.create(model=config.chat.model, messages=messages)
    query_generation_time = time.perf_counter() - start_time

    content = response.choices[0].message.content
    if content is None:
        # Raised as ValueError so the validating wrapper's retry loop can retry
        # instead of the call crashing on None.strip().
        raise ValueError("The model returned no content for the SPARQL query.")
    return content.strip(), query_generation_time

def natural_language_to_sparql_with_validation(natural_language_query, max_retries=3):
    """
    Generate a SPARQL query for a question, retrying when validation fails.

    Each attempt calls ``natural_language_to_sparql`` and checks the result
    with ``validate_sparql``. On success a ``LIMIT 15`` clause is appended
    unless the query already has a LIMIT clause.

    Args:
        natural_language_query: The user's question in plain language.
        max_retries: Maximum number of generation attempts.

    Returns:
        The validated SPARQL query text.

    Raises:
        Exception: If no attempt produced a query that passes validation.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    retry_hint = " Please ensure the query syntax is correct and includes schema:sources if applicable."
    # Match an actual "LIMIT <n>" clause. A plain substring test would also
    # fire on names such as ?limit or creditLimit and skip the cap.
    limit_clause = re.compile(r"\bLIMIT\s+\d+", re.IGNORECASE)

    prompt = natural_language_query
    for _ in range(max_retries):
        try:
            sparql_query, _generation_time = natural_language_to_sparql(prompt)
            validate_sparql(sparql_query)
            # Append a LIMIT clause if not already present
            if not limit_clause.search(sparql_query):
                sparql_query += "\nLIMIT 15"
            return sparql_query
        except ValueError as e:
            print(f"Validation failed: {e}")
            print("Retrying query generation...")
            # Rebuild from the original question so the hint is added once,
            # not stacked on every retry.
            prompt = natural_language_query + retry_hint
    raise Exception(f"Failed to generate a valid SPARQL query after {max_retries} retries.")


In [13]:

def answer_query(clean_query, max_retries=3):
    """
    Execute a SPARQL query on Fuseki and render the outcome as plain text.

    The query is run through ``query_jena``. If the call raises, the same
    query is tried again, up to ``max_retries`` attempts in total.

    Args:
        clean_query: SPARQL query text to execute.
        max_retries: Maximum number of execution attempts.

    Returns:
        A string: one "name: value, ..." line per result row under a header
        with the execution time; a boolean line for ASK-style results; a
        "No results found" message for an empty result; or an error message
        when every attempt failed. Errors are reported, not raised.
    """
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            print(f"Generated SPARQL Query:\n{clean_query}\n")
            start_time = time.perf_counter()
            # Execute the generated SPARQL query on Apache Jena Fuseki
            jena_results = query_jena(clean_query)
            execution_time = time.perf_counter() - start_time

            # ASK queries answer with a top-level "boolean" member and carry no
            # "results" member, so they are handled before looking for bindings.
            if "boolean" in jena_results:
                return (
                    f"Result from knowledge graph (Execution Time: {execution_time:.4f} seconds): "
                    f"{jena_results['boolean']}"
                )

            bindings = jena_results.get("results", {}).get("bindings", [])
            if not bindings:
                return f"No results found in the knowledge graph (Execution Time: {execution_time:.4f} seconds)."

            rows = [
                ", ".join(f"{k}: {v['value']}" for k, v in row.items())
                for row in bindings
            ]
            print("Result received from Fuseki")
            header = f"Results from knowledge graph (Execution Time: {execution_time:.4f} seconds):\n"
            return header + "\n".join(rows) + "\n"
        except Exception as e:
            last_error = e
            print(f"Attempt {attempt} failed with error: {e}")
            if attempt < max_retries:
                # The query text is unchanged between attempts; only the request is repeated.
                print("Retrying SPARQL query...")
    return f"An error occurred after {max_retries} attempts: {last_error}"

def format_results_to_nl(sparql_results, query, clean_query):
    """
    Turn SPARQL results into a natural-language answer using the language model.

    A single system message is sent through the module-level ``llm`` client
    (model taken from ``config.chat.model``). It contains the instructions,
    the executed query, the results and the original question.

    Args:
        sparql_results: Text describing the query results.
        query: The user's original question.
        clean_query: The SPARQL query that produced the results.

    Returns:
        The model's reply with surrounding whitespace removed.
    """
    system_prompt = f"""
            You are an AI assistant tasked with answering a query based on the provided context about employees and organizations.
            Please provide a detailed and well-structured answer to the user's question.
     
            Task: Generate a natural language response from the results of a SPARQL query.
            - Include all details returned from the SPARQL query in the response.
            - If a result contains an URI, resolve it into a human-readable name if a 'schema:name' property is available.
            - Ensure the response is coherent and human-readable.
     
            Sparql Query:
            {clean_query}
            Information:
            {sparql_results}
            Question: {query}
            Helpful Answer:"""

    completion = llm.chat.completions.create(
        model=config.chat.model,
        messages=[{"role": "system", "content": system_prompt}],
    )
    answer = completion.choices[0].message.content
    return answer.strip()


In [None]:

# ------------------------- Pipeline -------------------------
def run_pipeline(
    natural_language_query,
    rdf_file_path=r"kgCreation/ExtendedFinKG_Pro.ttl",
    fuseki_data_url="http://localhost:3030/finKG/data",
    upload=True,
):
    """
    Answer a natural-language question end to end.

    Steps: optionally upload the RDF file to Fuseki, generate a SPARQL query
    for the question, execute it, and turn the results into a
    natural-language answer.

    Args:
        natural_language_query: The user's question in plain language.
        rdf_file_path: Turtle file to upload before querying.
        fuseki_data_url: Fuseki data endpoint the file is uploaded to.
        upload: When False, skip the upload step. Use this once the dataset
            is already loaded to avoid re-sending the file for every
            question.

    Returns:
        The natural-language answer produced from the query results.
    """
    # 1. Upload RDF data to Apache Jena Fuseki
    if upload:
        upload_to_jena(rdf_file_path, fuseki_data_url)

    # 2. Generate SPARQL query from natural language
    sparql_query = natural_language_to_sparql_with_validation(natural_language_query)

    # 3. Execute SPARQL query against Apache Jena Fuseki and get raw results
    query_results = answer_query(sparql_query)

    # 4. Generate final natural language answer from the SPARQL results
    return format_results_to_nl(query_results, natural_language_query, sparql_query)

# ------------------------- Example Usage -------------------------
# NOTE: running this cell performs live calls — run_pipeline() uploads the RDF
# file to Fuseki, queries it, and sends at least two requests to the language
# model (query generation and answer formatting).
input_query = "what is the email of Anubhuti Singh"
final_answer = run_pipeline(input_query)
print("Final Answer:")
print(final_answer)

 

