In [None]:
# NOTE(review): pip normally asks for confirmation on uninstall, and the imports below
# (pandas, torch) presumably need numpy — confirm this cell is still wanted.
%pip uninstall numpy

In [1]:
import json
import re
import time
import os
import torch
import pandas as pd
from collections import defaultdict
from types import SimpleNamespace

from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from openai import AzureOpenAI

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import (
    DPRContextEncoder, DPRContextEncoderTokenizer,
    DPRQuestionEncoder, DPRQuestionEncoderTokenizer
)

# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort that
# can occur when several native libraries are loaded together — confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# --- Global Query Cache ---
# NOTE(review): no visible cell reads from or writes to this dict.
query_cache = {}

# ===== Configuration & Azure Client Setup =====

def load_config(config_path=r"config.json"):
    """
    Load a JSON configuration file into nested ``SimpleNamespace`` objects.

    Every JSON object is converted to a ``SimpleNamespace`` so settings are read
    with attribute access (for example ``config.chat.model``).

    Args:
        config_path: Path of the JSON file. Defaults to ``config.json`` resolved
            against the current working directory, which is what calls without
            arguments have always used.

    Returns:
        The parsed configuration (a ``SimpleNamespace`` when the top-level JSON
        value is an object).

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(config_path, "r", encoding="utf-8") as f:
        return json.load(f, object_hook=lambda d: SimpleNamespace(**d))

def initialize_azure_client(config):
    """
    Build an ``AzureOpenAI`` client whose API key is fetched from Azure Key Vault.

    Args:
        config: Object exposing ``key_vault_url``, ``dev_secret_name`` and a
            ``chat`` attribute with ``api_version`` and ``azure_endpoint``.

    Returns:
        An ``AzureOpenAI`` client configured with the secret's value as API key.
    """
    vault_url = config.key_vault_url
    credential = DefaultAzureCredential()
    vault = SecretClient(vault_url=vault_url, credential=credential)
    api_key = vault.get_secret(config.dev_secret_name).value
    return AzureOpenAI(
        api_key=api_key,
        api_version=config.chat.api_version,
        azure_endpoint=config.chat.azure_endpoint,
    )


In [None]:

# ===== Triple Loading & Preprocessing =====

def load_triple_text(file_path: str) -> str:
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, encoding="utf-8") as source:
        contents = source.read()
    return contents

# Relative path, so it is resolved against the notebook's working directory.
triple_file_path = r"kgCreation/FinKGTripleLatestPro.txt"
# Read the whole file once; it is parsed into triple dictionaries further down.
triple_text = load_triple_text(triple_file_path)

def prepare_triple_documents(triple_text: str):
    """
    Parse raw triple text into a list of triple dictionaries.

    Each line is stripped and split on whitespace. Lines with fewer than three
    tokens (including blank lines) are ignored. The first token becomes the
    subject, the second the predicate, and the remaining tokens, re-joined with
    single spaces, the object. The stripped line is stored under ``"text"``.

    Returns:
        A list of dicts with keys ``subject``, ``predicate``, ``object``, ``text``.
    """
    documents = []
    for raw_line in triple_text.splitlines():
        stripped = raw_line.strip()
        tokens = stripped.split()
        if len(tokens) >= 3:
            subject, predicate, *object_tokens = tokens
            documents.append({
                "subject": subject,
                "predicate": predicate,
                "object": " ".join(object_tokens),
                "text": stripped,
            })
    return documents

# Parse the raw text into triple dictionaries and report how many lines qualified.
triple_docs = prepare_triple_documents(triple_text)
print(f"Loaded {len(triple_docs)} triples.")

def build_entity_index(triple_docs):
    """
    Index triple texts by the subject and object values they contain.

    Keys are the complete subject / object strings (not individual words). A
    triple whose subject equals its object is listed twice under that key.

    Returns:
        A plain dict mapping each subject/object string to a list of triple texts.
    """
    index = {}
    for doc in triple_docs:
        subject, obj, text = doc["subject"], doc["object"], doc["text"]
        index.setdefault(subject, []).append(text)
        index.setdefault(obj, []).append(text)
    return index

def build_subject_index(triple_docs):
    """
    Group triple texts by subject.

    Returns:
        A plain dict mapping each subject to the list of texts of the triples
        that have it as subject, in input order.
    """
    by_subject = {}
    for doc in triple_docs:
        subject = doc["subject"]
        if subject not in by_subject:
            by_subject[subject] = []
        by_subject[subject].append(doc["text"])
    return by_subject

def build_object_to_subject_index(triple_docs):
    """
    Map each object to the distinct subjects it occurs with.

    Subjects are de-duplicated and listed in first-seen order. An insertion-ordered
    dict is used as an ordered set so the result is identical on every run; a plain
    ``set`` would make the list order depend on string hashing.

    Args:
        triple_docs: Iterable of dicts providing ``"subject"`` and ``"object"``.

    Returns:
        A plain dict ``object -> list of unique subjects``.
    """
    subjects_by_object = defaultdict(dict)
    for triple in triple_docs:
        # The dict value is irrelevant; only key presence and order matter.
        subjects_by_object[triple["object"]][triple["subject"]] = None
    return {obj: list(subjects) for obj, subjects in subjects_by_object.items()}

def build_triple_text_mapping(triple_docs):
    """
    Derive the text corpus and two lookup tables from the triple documents.

    Returns a tuple ``(corpus, text_to_doc, text_to_index)`` where
      - corpus: list of triple texts in input order (duplicates kept);
      - text_to_doc: triple text -> triple document;
      - text_to_index: triple text -> position in ``corpus``.
    When the same text occurs more than once, both mappings point at the last
    occurrence.
    """
    docs = list(triple_docs)
    corpus = [doc["text"] for doc in docs]
    # Later pairs overwrite earlier ones, so duplicates resolve to the last entry.
    text_to_doc = dict(zip(corpus, docs))
    text_to_index = {text: position for position, text in enumerate(corpus)}
    return corpus, text_to_doc, text_to_index

# Build the lookup structures once; later cells use them as module-level globals.
corpus, text_to_doc, text_to_index = build_triple_text_mapping(triple_docs)
# NOTE(review): entity_index is not referenced by any visible cell.
entity_index = build_entity_index(triple_docs)
subject_index = build_subject_index(triple_docs)
object_to_subject = build_object_to_subject_index(triple_docs)


In [10]:

def tokenize_text(text: str):
    """Return every maximal run of regex word characters in ``text``, in order."""
    word_pattern = re.compile(r'\w+')
    return word_pattern.findall(text)

# Precompute TF–IDF Vectorizer & DPR Dense Embeddings 

# Fit a TF–IDF vectorizer on the full corpus.
# The vocabulary and IDF weights come from the triple corpus only; queries are later
# projected into this space with ``transform``.
# NOTE(review): scikit-learn presumably warns that ``token_pattern`` is ignored when a
# custom tokenizer is supplied; passing ``token_pattern=None`` should silence it — confirm.
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_text)
tfidf_vectorizer.fit(corpus)

# Initialize DPR Context Encoder and its tokenizer.
# The saved output of this cell shows transformers reporting that the checkpoint's
# tokenizer class is 'DPRQuestionEncoderTokenizer' and that two pooler weights were unused.
dpr_context_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
dpr_context_model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

def precompute_dense_embeddings(corpus, batch_size=16):
    """
    Encode the corpus with the module-level DPR context encoder, batch by batch.

    Args:
        corpus: Sequence of texts (sliced with ``corpus[i:i+batch_size]``).
        batch_size: Number of texts tokenized and encoded per forward pass.

    Returns:
        One tensor holding the ``pooler_output`` rows of all batches,
        concatenated along dimension 0 in corpus order.
    """
    chunks = []
    # Gradients are never needed here, so the whole loop runs without autograd.
    with torch.no_grad():
        for start in range(0, len(corpus), batch_size):
            encoded = dpr_context_tokenizer(
                corpus[start:start + batch_size],
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512,
            )
            chunks.append(dpr_context_model(**encoded).pooler_output)
    return torch.cat(chunks, dim=0)

# Encode every triple once up front; the DPR ranking function later only indexes into
# this tensor via text_to_index instead of re-encoding documents per query.
dense_embeddings = precompute_dense_embeddings(corpus, batch_size=16)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

In [11]:

# ===== Ranking Functions =====

def tfidf_rank_documents(query: str, candidate_texts, vectorizer):
    """
    Rank candidate documents using TF–IDF cosine similarity.

    Args:
        query: Query text.
        candidate_texts: Iterable of document texts to rank.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        A list of ``(text, similarity)`` pairs sorted by similarity, highest
        first. Ties keep their input order. An empty candidate collection
        yields an empty list.
    """
    # Materialise once so generators work and the emptiness check is reliable.
    candidate_texts = list(candidate_texts)
    if not candidate_texts:
        # Nothing to rank; the vectorizer is not expected to accept zero documents.
        return []
    query_vec = vectorizer.transform([query])
    candidate_vecs = vectorizer.transform(candidate_texts)
    sims = cosine_similarity(query_vec, candidate_vecs)[0]
    return sorted(zip(candidate_texts, sims), key=lambda pair: pair[1], reverse=True)

def dpr_rank_documents(query: str, candidate_texts, dpr_q_tokenizer, dpr_q_model, dense_embeddings, text_to_index):
    """
    Rank candidate documents using DPR question encoder and cosine similarity.

    Args:
        query: Query text, encoded with the question encoder.
        candidate_texts: Iterable of document texts; each must be a key of
            ``text_to_index``.
        dpr_q_tokenizer: Callable tokenizer returning keyword inputs for the model.
        dpr_q_model: Callable model whose output exposes ``pooler_output``.
        dense_embeddings: Tensor of precomputed document embeddings, one row per
            corpus entry.
        text_to_index: Mapping from document text to its row in ``dense_embeddings``.

    Returns:
        A list of ``(text, score)`` pairs (``score`` is a Python float) sorted by
        score, highest first. Ties keep their input order.

    Raises:
        KeyError: If a candidate text is missing from ``text_to_index``.
    """
    candidate_texts = list(candidate_texts)
    if not candidate_texts:
        # Skip the encoder forward pass entirely when there is nothing to score.
        return []

    inputs = dpr_q_tokenizer(query, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        query_emb = dpr_q_model(**inputs).pooler_output

    # Gather all candidate rows at once and score them in a single call instead of
    # issuing one tiny torch operation per candidate.
    row_ids = torch.tensor(
        [text_to_index[text] for text in candidate_texts],
        dtype=torch.long,
        device=dense_embeddings.device,
    )
    doc_embs = dense_embeddings[row_ids]
    # query_emb is (1, dim) and broadcasts against the (n, dim) candidate matrix.
    scores = torch.nn.functional.cosine_similarity(query_emb, doc_embs, dim=1).tolist()
    return sorted(zip(candidate_texts, scores), key=lambda pair: pair[1], reverse=True)

def rrf_fusion(ranked1, ranked2, k=60):
    """
    Combine two rankings with Reciprocal Rank Fusion.

    Each ranking is a sequence of ``(doc, score)`` pairs; only the position of a
    document matters. A document at zero-based position ``rank`` contributes
    ``1 / (k + rank + 1)`` and contributions are summed across both rankings.

    Returns:
        A list of ``(doc, fused_score)`` pairs, highest fused score first.
    """
    totals = {}
    for ranking in (ranked1, ranked2):
        for rank, (doc, _) in enumerate(ranking):
            totals[doc] = totals.get(doc, 0.0) + 1.0 / (k + rank + 1)
    return sorted(totals.items(), key=lambda item: item[1], reverse=True)


In [12]:

# ===== Query Parsing & Retrieval Functions =====

def parse_query_with_llm(query: str):
    """
    Use Azure OpenAI to parse the query into a JSON structure with keys "entities", "relationships", and "goal".

    The configuration is loaded and a client is created on every call. The model's
    reply is expected to be a JSON object; a surrounding Markdown code fence is
    tolerated and removed before decoding.

    Args:
        query: Natural-language question; it is interpolated into the prompt.

    Returns:
        The decoded dict. When "entities" is a JSON object it is converted to a
        list of ``{"name", "type"}`` dicts, and when "relationships" is a JSON
        object it is converted to a list of ``{"relation", "value"}`` dicts. Keys
        the model omitted are left absent.

    Raises:
        ValueError: If the reply is not valid JSON (``json.JSONDecodeError`` is a
            subclass) or does not decode to a JSON object.
    """
    config = load_config()
    llm = initialize_azure_client(config)
    prompt = [
        {
            "role": "system",
            "content": f"""
            You are given a natural language query related to employee and organization data. Extract the following details from the query:
            - Entities mentioned in the query (e.g., Person name, Organization ID )
            - Relationships or attributes being asked for (e.g., line manager, contact info, worksFor)
            - Map these relationships or attributes to the correct schema predicates as defined below:
            - Person type: Person
            - Organization type: Organization
            - Application type: Application
            - Process Type: Process
            - Works For: worksFor
            - Email address: email
            - last Name: familyName
            - First Name: givenName
            - Organisation name : name
            - Employee status: status
            - Description of the organisation : description
            - Location: location
            - Identifier (GID): gid
            - Job Title: jobTitle
            - Functional Manager: functionalManager
            - Manager: hasManager
            - Manges/reporting to: manages
            - Contact Info: telephone
            - User Type: userType
            - Parent Organization: parentOrganization
            - Has Head: hasHead
            - Has Child Organization: hasChildOrganization
            - Organisation has Process: hasProcess
            - Title of the application: appName
            - Description of the application : appDescription
            - Application access link : accessLink 
            - Application link : appLink
            - Application image: appImage
            - Application belong to the organisation : partOfOrg
            - Application managed by: managedBy
            - people managing application : manages
            - Application Owner: hasOwner
            - Application part of Process: partOfProcess
            - Process title: title
            - Process description: description
            - Process description: description
            - Process has application: hasApplication
            - Process has owner: hasOwner
            - Process has manager: managedBy
            - Process has child process: hasChildProcess
            - Employee manages process: manages
            - Process has a parent process: prentProcess
            - Process part of an organisation: partOfOrg
            - Process Id : processId
            - Process reference Urls: referenceUrls
            - Process template urls: templateUrls
            - The goal of the query

            Provide the response in a JSON format with keys: "entities", "relationships", "goal".
            Do not include additional formatting other than JSON.
            Sample Output:
            {{
            "entities": {{
                "Dominik Schlueter": "Person",
                "Anubhuti Singh": "Person"
            }},
            "relationships": {{
                "telephone": "phone number",
                "gid": "gid",
                "email": "email"
            }},
            "goal": "To retrieve the phone number, GID, and email address of Dominik Schlueter and Anubhuti Singh."
            }}
            Query: "{query}"
            """
        }
    ]
    response = llm.chat.completions.create(model=config.chat.model, messages=prompt)

    # The message content may be None; treat that as an empty reply so the failure
    # surfaces as a JSON decoding error instead of an AttributeError.
    response_content = (response.choices[0].message.content or "").strip()

    # Models sometimes wrap the JSON in a Markdown code fence despite the
    # instruction not to; unwrap it before decoding.
    fenced = re.match(r"^```(?:json)?\s*(.*?)\s*```$", response_content, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        response_content = fenced.group(1)

    parsed_data = json.loads(response_content)
    if not isinstance(parsed_data, dict):
        raise ValueError(
            f"Expected a JSON object from the query parser, got {type(parsed_data).__name__}"
        )

    # Convert only keys that are present: defaulting a missing key to {} and then
    # indexing parsed_data[key] would raise KeyError.
    entities = parsed_data.get("entities")
    if isinstance(entities, dict):
        parsed_data["entities"] = [{"name": k, "type": v} for k, v in entities.items()]
    relationships = parsed_data.get("relationships")
    if isinstance(relationships, dict):
        parsed_data["relationships"] = [{"relation": k, "value": v} for k, v in relationships.items()]
    return parsed_data


In [13]:

def retrieve_relevant_triples_v2(parsed_query, triple_docs, subject_index, object_to_subject,
                                 text_to_doc, text_to_index, tfidf_vectorizer, dense_embeddings,
                                 dpr_question_tokenizer, dpr_question_model):
    """
    Two-stage triple retrieval using TF-IDF and DPR rankings fused with RRF.

    Steps:
      1. Collect candidate triples whose object contains a query entity (all
         triples when none match).
      2. Rank these candidates against the query entities with TF-IDF and DPR and
         fuse the two rankings.
      3. Take the subject of the top fused candidate as the primary id.
      4. Collect every triple whose subject or object equals the primary id.
      5. For collected triples whose predicate contains a query relationship
         (case-insensitive) and whose object starts with "http", also add the
         triples that have that object as their subject.
      6. Re-rank the combined, de-duplicated set against the query relationships
         and return the fused ranking.

    Args:
        parsed_query: Dict with optional "entities" and "relationships" lists.
            Entities may be strings or ``{"name", "type"}`` dicts; relationships
            may be ``{"relation", "value"}`` dicts or other values (stringified).
            Missing, ``None`` or blank values are ignored.
        triple_docs: List of triple dicts ("subject", "predicate", "object", "text").
        subject_index: Mapping subject -> list of triple texts.
        object_to_subject: Accepted for interface compatibility; not used.
        text_to_doc: Mapping triple text -> triple dict.
        text_to_index: Mapping triple text -> row in ``dense_embeddings``.
        tfidf_vectorizer: Fitted TF-IDF vectorizer.
        dense_embeddings: Precomputed document embeddings.
        dpr_question_tokenizer: Tokenizer for the DPR question encoder.
        dpr_question_model: DPR question encoder.

    Returns:
        The complete fused ranking as a list of ``(triple_text, score)`` pairs,
        best first. The caller is responsible for slicing to a top-k. Returns an
        empty list when there is nothing to rank.
    """
    generic_names = {'person', 'organization', 'application', 'process'}

    # --- Entities: keep only usable, non-blank strings. ---
    query_entities = []
    for ent in parsed_query.get("entities", []) or []:
        if isinstance(ent, dict):
            # A generic placeholder name carries no information; use the type instead.
            term = ent.get("type") if ent.get("name") in generic_names else ent.get("name")
        elif isinstance(ent, str):
            term = ent
        else:
            continue
        # None would break the substring test and join below; a blank string would
        # match every object.
        if isinstance(term, str) and term.strip():
            query_entities.append(term)

    # --- Relationships: predicate names plus their natural-language values. ---
    query_relationships = []
    for rel in parsed_query.get("relationships", []) or []:
        parts = (rel.get("relation"), rel.get("value")) if isinstance(rel, dict) else (rel,)
        for part in parts:
            if not part:
                continue
            # Values are model-generated and not guaranteed to be strings.
            text = str(part)
            if text.strip():
                query_relationships.append(text)

    # Candidate selection: triples where any query entity appears in the object.
    candidate_obj = [t for t in triple_docs if any(ent in t["object"] for ent in query_entities)]
    if not candidate_obj:
        candidate_obj = list(triple_docs)  # fallback: all triples
    if not candidate_obj:
        # No triples at all: the rankers cannot operate on an empty set.
        return []
    candidate_obj_texts = [t["text"] for t in candidate_obj]

    # Rank candidates using TF-IDF and DPR (using query entities as the query).
    query_str_entities = " ".join(query_entities)
    tfidf_ranked = tfidf_rank_documents(query_str_entities, candidate_obj_texts, tfidf_vectorizer)
    dpr_ranked = dpr_rank_documents(query_str_entities, candidate_obj_texts, dpr_question_tokenizer,
                                    dpr_question_model, dense_embeddings, text_to_index)
    fused_ranking = rrf_fusion(tfidf_ranked, dpr_ranked, k=60)

    primary_id = None
    if fused_ranking:
        top_candidate = text_to_doc.get(fused_ranking[0][0])
        if top_candidate:
            primary_id = top_candidate["subject"]

    if primary_id:
        primary_candidates = [t for t in triple_docs
                              if t["subject"] == primary_id or t["object"] == primary_id]
    else:
        primary_candidates = []

    # Expansion: when a matching predicate points at a URI, follow it one hop and
    # include the triples that have that URI as their subject.
    lowered_relationships = [rel.lower() for rel in query_relationships]
    additional_candidates = []
    for t in primary_candidates:
        predicate = t["predicate"].lower()
        if not any(rel in predicate for rel in lowered_relationships):
            continue
        target = t["object"]
        if target.startswith("http") and target in subject_index:
            additional_candidates.extend(
                text_to_doc[txt] for txt in subject_index[target] if txt in text_to_doc
            )

    # De-duplicate by triple text while keeping first-seen order.
    combined = {}
    for t in primary_candidates + additional_candidates:
        combined[t["text"]] = t
    candidate_texts_final = list(combined)
    if not candidate_texts_final:
        return []

    # Final re-ranking using query relationships.
    final_query_str = " ".join(query_relationships)
    tfidf_ranked_final = tfidf_rank_documents(final_query_str, candidate_texts_final, tfidf_vectorizer)
    dpr_ranked_final = dpr_rank_documents(final_query_str, candidate_texts_final, dpr_question_tokenizer,
                                          dpr_question_model, dense_embeddings, text_to_index)
    return rrf_fusion(tfidf_ranked_final, dpr_ranked_final, k=60)


In [14]:

def generate_answer_with_llm(query: str, top_documents):
    """
    Ask Azure OpenAI to answer ``query`` from the retrieved triple texts.

    The configuration is loaded and a client is created on each call. The
    documents are joined with blank lines and embedded, together with the
    question, in a single system message.

    Args:
        query: The user's question.
        top_documents: Iterable of triple texts used as context.

    Returns:
        The model's reply with surrounding whitespace removed.
    """
    config = load_config()
    llm = initialize_azure_client(config)
    context = "\n\n".join(top_documents)
    system_message = {
        "role": "system",
        "content": f"""
            You are an AI assistant tasked with answering a query based on the provided context about employees and organizations.
            Please provide a detailed and well-structured answer to the user's question.
 
            - Organize the answer into bullet points if appropriate.
            - Use headings where relevant.
            - Include all relevant details concisely.
 
            Context:
            {context}
 
            Question: "{query}"
 
            Provide a well-structured answer.
            """
    }
    completion = llm.chat.completions.create(model=config.chat.model, messages=[system_message])
    answer_text = completion.choices[0].message.content
    return answer_text.strip()


In [None]:

# ===== Excel Processing =====

def process_queries_excel(input_excel: str, output_excel: str, k_list=(1, 2, 3, 5, 8, 13, 15, 21)):
    """
    Run the retrieval + answer pipeline for every query in an Excel sheet.

    For each row the query is parsed and triples are retrieved once; then, for
    every ``k`` in ``k_list``, an answer is generated from the top-k triples. The
    results and timings are written to new columns and the frame is saved.

    Args:
        input_excel: Path of the workbook to read; it must contain a "Query" column.
        output_excel: Path of the workbook to write. Its parent directory is
            created when missing.
        k_list: Top-k cut-offs to evaluate. Any iterable of ints; the default is
            an immutable tuple so it cannot be modified between calls.

    Notes:
        - "Retrieval Time(s)" covers both the LLM query parsing and the retrieval.
        - A failure while parsing/retrieving marks every column of that row with
          the error. A failure while generating the answer for one ``k`` only
          marks that ``k``'s answer, so answers already produced are kept.
    """
    k_list = list(k_list)
    df = pd.read_excel(input_excel)

    # Create the destination folder before the expensive loop so a missing
    # directory cannot discard the results at the very end.
    output_dir = os.path.dirname(output_excel)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Create output columns for each desired top-k value.
    for k_val in k_list:
        df[f'Top-{k_val} Triples'] = ''
        # Re-assigned on every pass; kept here so the column order of the output
        # sheet stays exactly as before.
        df['Parsed Query'] = ''
        df[f'Top-{k_val} Answer'] = ''
        df[f"Top-{k_val} Retrieval Time(s)"] = ''
        df[f"Top-{k_val} Response Time(s)"] = ''

    # Initialize DPR question encoder and tokenizer.
    dpr_question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
    dpr_question_model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

    for index, row in df.iterrows():
        query = row["Query"]

        # Stage 1: parse + retrieve once per query.
        try:
            start_time = time.time()
            parsed_query = parse_query_with_llm(query)
            top_k = retrieve_relevant_triples_v2(parsed_query, triple_docs, subject_index,
                                                 object_to_subject, text_to_doc, text_to_index,
                                                 tfidf_vectorizer, dense_embeddings,
                                                 dpr_question_tokenizer, dpr_question_model)
            retrieval_time = time.time() - start_time
            parsed_query_json = json.dumps(parsed_query)
        except Exception as e:
            # Best-effort batch run: record the error and move on to the next row.
            df.at[index, 'Parsed Query'] = f"Error: {e}"
            for k_val in k_list:
                df.at[index, f"Top-{k_val} Triples"] = f"Error: {e}"
                df.at[index, f"Top-{k_val} Answer"] = f"Error: {e}"
                df.at[index, f"Top-{k_val} Retrieval Time(s)"] = None
                df.at[index, f"Top-{k_val} Response Time(s)"] = None
            continue

        df.at[index, 'Parsed Query'] = parsed_query_json

        # Stage 2: one answer per cut-off; a failure affects only that cut-off.
        for k_val in k_list:
            top_k_triples = [t[0] for t in top_k[:k_val]]
            df.at[index, f"Top-{k_val} Triples"] = "\n\n".join(top_k_triples)
            df.at[index, f"Top-{k_val} Retrieval Time(s)"] = retrieval_time
            try:
                start_resp = time.time()
                answer = generate_answer_with_llm(query, top_k_triples)
                response_time = time.time() - start_resp
            except Exception as e:
                df.at[index, f"Top-{k_val} Answer"] = f"Error: {e}"
                df.at[index, f"Top-{k_val} Response Time(s)"] = None
                continue
            df.at[index, f"Top-{k_val} Answer"] = answer
            df.at[index, f"Top-{k_val} Response Time(s)"] = response_time

    df.to_excel(output_excel, index=False)
    print(f"Processed {len(df)} queries. Output saved to {output_excel}")

# ===== Main Execution =====

# Entry point for the batch run: read the queries workbook and write the results.
if __name__ == "__main__":

    # Relative paths, resolved against the working directory. The input sheet is
    # read for its "Query" column by process_queries_excel.
    input_excel_path = r"data/LLMEval_1.xlsx"
    output_excel_path = r"Outputs/LLM_responses_rag_triples_multiHop.xlsx"
    process_queries_excel(input_excel_path, output_excel_path)
