In [None]:
%pip install sentence_transformers 
# rank_bm25

In [1]:
from rdflib import Graph

input_file = r"kgCreation/ExtendedFinKG_Pro.ttl"
output_file = r"kgCreation/FinKGTripleLatestPro.txt"

graph = Graph()
graph.parse(input_file, format = 'turtle')

with open(output_file, 'w', encoding= "utf-8") as file:
    for subject, predicate, obj in graph:
        triple_line = f"{subject} {predicate} {obj}"
        file.write(triple_line + "\n")
        
print(f"Successfully converted {input_file} to {output_file}.")


Successfully converted C:\Users\z0050t3j\OneDrive - Siemens Energy\Dokumente\Thesis\anubhuti_master_thesis\kgCreation\ExtendedFinKG_Pro.ttl to C:\Users\z0050t3j\OneDrive - Siemens Energy\Dokumente\Thesis\anubhuti_master_thesis\kgCreation\FinKGTripleLatestPro.txt.


In [2]:
# Load triples from the text file and organize them into subgraphs
file_path = r"kgCreation/FinKGTripleLatestPro.txt"
with open(file_path, "r", encoding="utf=8") as file:
    triples = [line.strip() for line in file if line.strip()]

subgraphs = {}
for triple in triples:
    parts = triple.split()
    subject = parts[0]
    if subject not in subgraphs:
        subgraphs[subject] = []
    subgraphs[subject].append(triple)

# Save each entity's subgraph as a block in a single text file
subgraph_file_path = r"kgCreation/FinSubGraphLatestPro.txt"
with open(subgraph_file_path, "w", encoding="utf=8") as subgraph_file:
    for entity, triples in subgraphs.items():
        subgraph_file.write(f"Subgraph for {entity}:\n")
        subgraph_file.write("\n".join(triples) + "\n\n")
print(f"Entity subgraphs saved to {subgraph_file_path}")

Entity subgraphs saved to C:\Users\z0050t3j\OneDrive - Siemens Energy\Dokumente\Thesis\anubhuti_master_thesis\kgCreation\FinSubGraphLatestPro.txt


In [3]:

import json
import os
import re
from types import SimpleNamespace
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from openai import AzureOpenAI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer, DPRContextEncoder, DPRContextEncoderTokenizer
import torch
from collections import defaultdict
import time

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"


# ------------------------- Configuration and Initialization -------------------------
def load_config():
    try:
        with open(r"config.json") as f:
            return json.load(f, object_hook=lambda d: SimpleNamespace(**d))
    except FileNotFoundError:
        raise FileNotFoundError("Config file not found. Please check the path.")

def initialize_azure_client(config):
    client = SecretClient(vault_url=config.key_vault_url, credential=DefaultAzureCredential())
    secret = client.get_secret(config.dev_secret_name)
    return AzureOpenAI(api_key=secret.value, api_version=config.chat.api_version, azure_endpoint=config.chat.azure_endpoint)



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# ------------------------- Prepare Subgraph Documents -------------------------
def prepare_subgraph_documents(subgraph_text):
    """
    Splits the full subgraph text into individual subgraph documents.
    Each document is a subgraph for a specific entity.
    """
    subgraph_text = subgraph_text.replace("\r", "").strip()
    # Split based on the "Subgraph for" pattern (capturing the header/URI)
    subgraphs = re.split(r"\nSubgraph for (http[s]?://[^\s:]+):?", subgraph_text)
    subgraph_documents = {}
    for i in range(1, len(subgraphs), 2):  
        header = subgraphs[i].strip()
        triples = subgraphs[i + 1].strip()
        subgraph_documents[header] = triples
    return subgraph_documents

# ------------------------- Load Knowledge Graph File -------------------------
with open(r"kgCreation/FinSubGraphLatestPro.txt", "r", encoding="utf-8") as f:
    subgraph_text = f.read()
subgraph_documents = prepare_subgraph_documents(subgraph_text)
print(f"Total subgraph documents: {len(subgraph_documents)}")

 

Total subgraph documents: 3230


In [5]:

# ------------------------- Build Entity Index -------------------------
# Create an index for entities and their corresponding subgraph headers
entity_index = {}
for header, doc in subgraph_documents.items():
    lines = doc.split("\n")
    for line in lines:
        parts = line.split()
        if len(parts) >= 3:
            obj = " ".join(parts[2:])
            if obj not in entity_index:
                entity_index[obj] = []
            entity_index[obj].append(header)

In [6]:

# ------------------------- TF-IDF Ranking -------------------------
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

def tfidf_rank_documents(query_terms, documents):
    """
    Ranks the documents using TF-IDF.
    Returns a list of (doc_string, similarity_score) sorted in descending order.
    """
    doc_ids = list(documents.keys())
    docs = [f"Subgraph for {header}\n{documents[header]}" for header in doc_ids]
    tfidf_vectorizer.fit(docs)
    query_text = " ".join(query_terms)
    query_vector = tfidf_vectorizer.transform([query_text])
    doc_tfidf_matrix = tfidf_vectorizer.transform(docs)
    scores = cosine_similarity(query_vector, doc_tfidf_matrix).flatten()
    results = []
    for idx, score in enumerate(scores):
        header = doc_ids[idx]
        doc_text = f"Subgraph for {header}\n{subgraph_documents[header]}"
        results.append((doc_text, score))
    results.sort(key=lambda x: x[1], reverse=True)
    return results


In [None]:
# ------------------------- DPR Ranking -------------------------
# Initialize DPR models and tokenizers (using pre-trained models from Facebook)
dpr_question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
dpr_question_model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
dpr_context_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
dpr_context_model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

def dpr_rank_documents(query_terms, documents, max_length=512):
    """
    Ranks the documents using DPR.
    Computes DPR embeddings on the fly for both the query and each document.
    Returns a list of (doc_string, similarity_score) sorted in descending order.
    """
    query_text = " ".join(query_terms)
    query_inputs = dpr_question_tokenizer(query_text, return_tensors="pt", truncation=True, max_length=max_length)
    with torch.no_grad():
        query_embedding = dpr_question_model(**query_inputs).pooler_output  

    results = []
    for header, doc in documents.items():
        full_text = f"Subgraph for {header}\n{doc}"
        context_inputs = dpr_context_tokenizer(full_text, return_tensors="pt", truncation=True, max_length=max_length)
        with torch.no_grad():
            doc_embedding = dpr_context_model(**context_inputs).pooler_output  
        similarity = torch.cosine_similarity(query_embedding, doc_embedding).item()
        results.append((full_text, similarity))
    results.sort(key=lambda x: x[1], reverse=True)
    return results

In [8]:
# ------------------------- Reciprocal Rank Fusion (RRF) -------------------------
def rrf_fusion(tfidf_results, dpr_results, k=60):
    """
    Combines two ranked lists using Reciprocal Rank Fusion.
    """
    doc_to_rrf_score = {}
    # Process TF-IDF results
    for rank, (doc, _) in enumerate(tfidf_results, start=1):
        doc_to_rrf_score.setdefault(doc, 0)
        doc_to_rrf_score[doc] += 1.0 / (k + rank)
    # Process DPR results
    for rank, (doc, _) in enumerate(dpr_results, start=1):
        doc_to_rrf_score.setdefault(doc, 0)
        doc_to_rrf_score[doc] += 1.0 / (k + rank)
    fused_list = [(doc, score) for doc, score in doc_to_rrf_score.items()]
    fused_list.sort(key=lambda x: x[1], reverse=True)
    return fused_list

def rank_documents_rrf(query_terms, documents):
    tfidf_ranked_docs = tfidf_rank_documents(query_terms, documents)
    dpr_ranked_docs = dpr_rank_documents(query_terms, documents)
    return rrf_fusion(tfidf_ranked_docs, dpr_ranked_docs)

In [9]:
# ------------------------- Query Parsing with LLM and Caching -------------------------
query_cache = {} 
def parse_query_with_llm(query):
    if query in query_cache:
        return query_cache[query]

    config = load_config()
    llm = initialize_azure_client(config)
    entity_extraction_prompt = [
        {
                        "role": "system",
            "content": f"""
            You are given a natural language query related to employee and organization data. Extract the following details from the query:
            - Entities mentioned in the query (e.g., Person name, Organization ID )
            - Relationships or attributes being asked for (e.g., line manager, contact info, worksFor)
            - Map these relationships or attributes to the correct schema predicates as defined below:
            - Person type: Person
            - Organization type: Organization
            - Application type: Application
            - Process Type: Process
            - Works For: worksFor
            - Email address: email
            - last Name: familyName
            - First Name: givenName
            - Organisation name : name
            - Employee status: status
            - Description of the organisation : description
            - Location: location
            - Identifier (GID): gid
            - Job Title: jobTitle
            - Functional Manager: functionalManager
            - Manager: hasManager
            - Manges/reporting to: manages
            - Contact Info: telephone
            - User Type: userType
            - Parent Organization: parentOrganization
            - Has Head: hasHead
            - Has Child Organization: hasChildOrganization
            - Organisation has Process: hasProcess
            - Title of the application: appName
            - Description of the application : appDescription
            - Application access link : accessLink 
            - Application link : appLink
            - Application image: appImage
            - Application belong to the organisation : partOfOrg
            - Application managed by: managedBy
            - people managing application : manages
            - Application Owner: hasOwner
            - Application part of Process: partOfProcess
            - Process title: title
            - Process description: description
            - Process description: description
            - Process has application: hasApplication
            - Process has owner: hasOwner
            - Process has manager: managedBy
            - Process has child process: hasChildProcess
            - Employee manages process: manages
            - Process has a parent process: prentProcess
            - Process part of an organisation: partOfOrg
            - Process Id : processId
            - Process reference Urls: referenceUrls
            - Process template urls: templateUrls
            - The goal of the query

            Provide the response in a JSON format with keys: "entities", "relationships", "goal".
            Do not include additional formatting other than JSON.
            Sample Output:
            {{
            "entities": {{
                "Dominik Schlueter": "Person",
                "Anubhuti Singh": "Person"
            }},
            "relationships": {{
                "telephone": "phone number",
                "gid": "gid",
                "email": "email"
            }},
            "goal": "To retrieve the phone number, GID, and email address of Dominik Schlueter and Anubhuti Singh."
            }}
            Query: "{query}"
            """
        }
    ]

    response = llm.chat.completions.create(model=config.chat.model, messages=entity_extraction_prompt)
    response_content = response.choices[0].message.content.strip()
    parsed_data = json.loads(response_content)

    entities = parsed_data.get('entities', [])
    relationships = parsed_data.get('relationships', [])
    goal = parsed_data.get('goal', "")

    if isinstance(entities, dict):
        entities = [{"name": k, "type": v} for k, v in entities.items()]

    if isinstance(relationships, dict):
        relationships = [{"relation": k, "value": v} for k, v in relationships.items()]

    result = {
        "entities": entities,
        "relationships": relationships,
        "goal": goal
    }
    query_cache[query] = result
    return result


In [10]:

# ------------------------- Subgraph Matching and Retrieval -------------------------
def match_subgraphs(parsed_query, subgraph_documents):
    """
    Retrieves relevant subgraphs based on entities and relationships in the parsed query.
    Uses entity filtering, ranking with TF-IDF and DPR, and additional expansion.
    """
    entities = parsed_query.get("entities", [])
    relationships = parsed_query.get("relationships", [])

    # Extract entity names
    entity_names = []
    for ent in entities:
        if isinstance(ent, dict):
            if ent.get('name', '').lower() in ['person', 'organization', 'application', 'process']:
                entity_names.append(ent.get('type', ''))
            else:
                entity_names.append(ent.get('name', ''))
        elif isinstance(ent, str):
            entity_names.append(ent)

    # Extract relationship names and any related entity values
    relationship_names = []
    related_entities = []
    for rel in relationships:
        if isinstance(rel, dict):
            if 'relation' in rel:
                relationship_names.append(rel['relation'])
            if 'predicate' in rel:
                relationship_names.append(rel['predicate'])
            if 'value' in rel:
                val = rel['value']
                if isinstance(val, str):
                    related_entities.append(val)
        elif isinstance(rel, str):
            relationship_names.append(rel)

    # Combine entity and relationship terms into query terms
    query_terms = [str(term) for term in (entity_names + relationship_names)]

    # Use the entity_index to filter subgraph documents quickly
    filtered_headers = set()
    for name in entity_names:
        if name in entity_index:
            filtered_headers.update(entity_index[name])
    if filtered_headers:
        filtered_documents_by_entity = {h: subgraph_documents[h] for h in filtered_headers}
    else:
        filtered_documents_by_entity = subgraph_documents

    # Rank filtered documents based on the combined query terms
    ranked_entity_matches = rank_documents_rrf(query_terms, filtered_documents_by_entity)

    # Identify primary IDs by scanning documents for subjects matching any entity names
    subjects_found = set()
    for header, doc_str in filtered_documents_by_entity.items():
        lines = doc_str.split("\n")
        for line in lines:
            parts = line.split()
            if len(parts) >= 3:
                subj = parts[0]
                obj = " ".join(parts[2:])
                if obj in entity_names:
                    subjects_found.add(subj)
                    break

    primary_id_list = [str(pid) for pid in subjects_found] if subjects_found else []

    if primary_id_list:
        primary_filtered_headers = set()
        for pid in primary_id_list:
            if pid in entity_index:
                primary_filtered_headers.update(entity_index[pid])
        if primary_filtered_headers:
            filtered_documents_by_primary_id = {h: subgraph_documents[h] for h in primary_filtered_headers}
        else:
            filtered_documents_by_primary_id = subgraph_documents
    else:
        filtered_documents_by_primary_id = subgraph_documents

    ranked_primary_id_matches = rank_documents_rrf(primary_id_list, filtered_documents_by_primary_id) if primary_id_list else []

    top_ranked_entity_docs = ranked_entity_matches[:15]
    top_ranked_primary_id_docs = ranked_primary_id_matches[:15] if ranked_primary_id_matches else []

    # Boost the top document in each ranked list
    if top_ranked_entity_docs:
        doc, score = top_ranked_entity_docs[0]
        top_ranked_entity_docs[0] = (doc, score + 10.0)
    if top_ranked_primary_id_docs:
        doc, score = top_ranked_primary_id_docs[0]
        top_ranked_primary_id_docs[0] = (doc, score + 10.0)

    relationship_set = set(relationship_names)
    additional_docs = []
    visited_uris = set()

    def retrieve_additional_docs_for_top_doc(top_docs, relationship_set, subgraph_documents, visited_uris):
        if not top_docs:
            return []
        top_doc, top_score = top_docs[0]
        local_additional_docs = []
        lines = top_doc.split("\n")
        for line in lines[1:]:
            parts = line.split()
            if len(parts) >= 3:
                subj = parts[0]
                pred = parts[1]
                obj = " ".join(parts[2:])
                for rel_name in relationship_set:
                    if pred.endswith(rel_name) and obj in subgraph_documents:
                        if obj not in visited_uris:
                            visited_uris.add(obj)
                            doc_text = f"Subgraph for {obj}\n{subgraph_documents[obj]}"
                            local_additional_docs.append((doc_text, 0.0))
        return local_additional_docs

    additional_docs.extend(retrieve_additional_docs_for_top_doc(top_ranked_entity_docs, relationship_set, subgraph_documents, visited_uris))
    additional_docs.extend(retrieve_additional_docs_for_top_doc(top_ranked_primary_id_docs, relationship_set, subgraph_documents, visited_uris))

    combined_top_docs = top_ranked_entity_docs + top_ranked_primary_id_docs + additional_docs
    return combined_top_docs




In [11]:
# ------------------------- Generate Answer with LLM -------------------------
def format_answer(answer):
    formatted_lines = []
    for line in answer.splitlines():
        line = line.strip()
        if line and not line.startswith("-"):
            formatted_lines.append(f"- {line}")
        elif line:
            formatted_lines.append(line)
    return "\n".join(formatted_lines)

def generate_answer_with_llm(query, top_documents):
    config = load_config()
    llm = initialize_azure_client(config)
    context = "\n\n".join([doc for doc, score in top_documents])

    prompt = [
        {
            "role": "system",
            "content": f"""
            You are an AI assistant tasked with answering a query based on the provided context about employees and organizations.
            Please provide a detailed and well-structured answer to the user's question.

            - If applicable, organize the answer into bullet points.
            - Use headings where relevant to indicate different parts of the answer.
            - If multiple people or entities are mentioned, separate them clearly.
            - Include all relevant details in a concise yet informative way.

            Context:
            {context}

            Question: "{query}"

            Provide a well-structured and easy-to-read answer.
            """
        }
    ]

    response = llm.chat.completions.create(model=config.chat.model, messages=prompt)
    response_content = response.choices[0].message.content.strip()
    return format_answer(response_content)



In [None]:
import pandas as pd
 
def process_queries_and_store_subgraphs(input_excel, output_excel, subgraph_documents, k_values=[1,2, 3,5, 8, 13, 15, 21]):
    """
    Reads queries from an Excel file, ranks subgraphs for each query, generates LLM responses, 
    and writes top-K subgraphs and answers for Hits@K back to a new Excel file.

    Args:
        input_excel: Path to the input Excel file with queries.
        output_excel: Path to save the updated Excel file with responses.
        subgraph_documents: The subgraph documents for context.
        k_values: List of K values for Hits@K evaluation (e.g., [1, 3, 8, 13, 15, 21]).
    """
    # Load the Excel file
    df = pd.read_excel(input_excel)
        
        # Ensure the required columns are present
    if 'Query' not in df.columns or 'Ground Truth Answer' not in df.columns:
        raise ValueError("Excel file must contain 'Query' and 'Ground Truth Answer' columns.")

    # Initialize columns for top-K subgraphs and LLM-generated answers
    for k in k_values:
        df[f'Top-{k} Subgraphs'] = ''
        df[f'Top-{k} Answer'] = ''
        df[f"Top-{k} Response Time(s)"] = ''
        df[f"Top-{k} Retrieval Time(s)"] = ''

    # Process each query
    for index, row in df.iterrows():
        query = row['Query']
        try:
            # Parse the query
            start_time = time.time()
            parsed_query = parse_query_with_llm(query)
            df.at[index, 'Parsed Query'] = json.dumps(parsed_query)
            # Rank subgraphs based on the parsed query
            ranked_subgraphs = match_subgraphs(parsed_query, subgraph_documents)
            end_time = time.time()
            subgraph_retrieval_time = end_time - start_time
            
            for k in k_values:
                # Get the top-K subgraphs
                start_time = time.time()
                top_k_subgraphs = ranked_subgraphs[:k]
                response = generate_answer_with_llm(query, top_k_subgraphs)
                end_time = time.time()
                response_time = end_time - start_time
                top_k_docs = [doc for doc, _ in top_k_subgraphs]

                # Store the subgraphs as a string
                df.at[index, f'Top-{k} Subgraphs'] = "\n\n".join(top_k_docs)
                # Generate the answer using the top-K subgraphs
                
                df.at[index, f'Top-{k} Answer'] = response
                df.at[index, f"Top-{k} Retrieval Time(s)"] = subgraph_retrieval_time
                df.at[index, f"Top-{k} Response Time(s)"] = response_time

        except Exception as e:
            df.at[index, 'Parsed Query'] = f"Error: {e}"
            # Log any errors during processing
            for k in k_values:
                df.at[index, f'Top-{k} Subgraphs'] = f"Error: {e}"
                df.at[index, f'Top-{k} Answer'] = f"Error: {e}"
                df.at[index, f"Top-{k} Retrieval Time(s)"] = None
                df.at[index, f"Top-{k} Response Time(s)"] = None

    df.to_excel(output_excel, index=False)
    print(f"Updated Excel file saved to {output_excel}")
        
# ------------------------- Usage -------------------------

if __name__ == "__main__":

    input_excel_path = r"data/LLMEval_1.xlsx"
    output_excel_path = r"Outputs/LLM_responses_rag_subgraphs_multiHop.xlsx"

    # Call the function to process queries and store results
    process_queries_and_store_subgraphs(input_excel_path, output_excel_path, subgraph_documents)
