In [None]:
%pip install openpyxl

In [None]:

import os 
import json 
import time 
import pandas as pd 
import numpy as np 
import torch 
import faiss 
from types import SimpleNamespace 
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" 

from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from openai import AzureOpenAI 


# --------------------------------------------------- 
# 1. Load the DPR Models and Tokenizers 
# --------------------------------------------------- 
from transformers import ( 
    DPRContextEncoder, 
    DPRContextEncoderTokenizer, 
    DPRQuestionEncoder, 
    DPRQuestionEncoderTokenizer 
) 

# Each tokenizer/encoder pair is loaded from the same Hugging Face checkpoint,
# so the two checkpoint ids are named once and reused.
CTX_CHECKPOINT = "facebook/dpr-ctx_encoder-single-nq-base"
QUESTION_CHECKPOINT = "facebook/dpr-question_encoder-single-nq-base"

print("Loading DPR models...")
# Context side: used to embed the documents.
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained(CTX_CHECKPOINT)
ctx_encoder = DPRContextEncoder.from_pretrained(CTX_CHECKPOINT)
# Question side: used to embed the queries at retrieval time.
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(QUESTION_CHECKPOINT)
question_encoder = DPRQuestionEncoder.from_pretrained(QUESTION_CHECKPOINT)


In [None]:

# --------------------------------------------------- 
# 2. Load CSV Files (data sources) 
# --------------------------------------------------- 
# Entity tables.
app_file = r"data/application_master.csv"   # Application data
emp_file = r"data/Fin_Emp.csv"               # Employee data
org_file = r"data/Fin_Org.csv"               # Organisation connecting with employees
proc_file = r"data/process_master.csv"       # Process data

# Link tables (many-to-many relations between the entities above).
app_emp_file = r"data/apps_owners.csv"             # Application connecting with employees
proc_org_file = r"data/orgs_processes.csv"         # Process connecting with organisation
proc_app_file = r"data/process_applications.csv"   # Processes connecting applications
proc_emp_file = r"data/process_owners.csv"         # Process connecting with employees

print("Loading CSV files...")
# Files are read one by one, in the order listed, and bound to one frame each.
(
    app_df,
    app_emp_df,
    emp_df,
    org_df,
    proc_df,
    proc_org_df,
    proc_app_df,
    proc_emp_df,
) = map(
    pd.read_csv,
    (
        app_file,
        app_emp_file,
        emp_file,
        org_file,
        proc_file,
        proc_org_file,
        proc_app_file,
        proc_emp_file,
    ),
)


In [17]:

# --------------------------------------------------- 
# 3. Helper Function to Create a Document 
# --------------------------------------------------- 
def create_document(entity_type, row, extra_info=""):
    """
    Turn one table row into a retrievable text document.

    Parameters
    ----------
    entity_type : str
        Label for the kind of record, e.g. "application", "employee",
        "organisation" or "process". It is upper-cased in the text prefix and
        used verbatim in the doc id and the metadata.
    row : pandas.Series
        The record. Every non-null field is rendered as "column: value".
    extra_info : str, optional
        Linking information appended after the fields when non-empty.

    Returns
    -------
    dict
        {"doc_id": "<entity_type>_<id>", "text": <flattened row>,
         "metadata": {"entity_type": ..., "id": ...}}.
        When the row has no "id" field the doc id ends with an empty suffix
        and the metadata id is None.
    """
    # Render only the populated fields, keeping the row's column order.
    rendered = []
    for column in row.index:
        value = row[column]
        if pd.notnull(value):
            rendered.append(f"{column}: {value}")

    text = f"{entity_type.upper()} DATA: " + " | ".join(rendered)
    if extra_info:
        text = " | ".join((text, extra_info))

    return {
        "doc_id": f"{entity_type}_{row.get('id', '')}",
        "text": text,
        "metadata": {"entity_type": entity_type, "id": row.get("id", None)},
    }

# Corpus accumulator: the per-entity loops in section 4 append one document
# dict (as returned by create_document) per source row.
documents = [] 


In [18]:

# --------------------------------------------------- 
# 4. Create Documents for Each Entity (with linking info) 
# --------------------------------------------------- 
# Start from an empty corpus so that re-running this cell rebuilds the list
# instead of appending a second copy of every document.
documents = []


def _group_links(link_df, key_col):
    """Map each value of `key_col` to the sub-frame of `link_df` rows carrying it.

    Built in a single pass so the loops below can look up the links of a row
    directly instead of re-scanning the whole link table for every row.
    Row order inside each sub-frame follows the order in `link_df`; rows whose
    key is null are not grouped (a null key never equals a row id anyway).
    """
    return {key: rows for key, rows in link_df.groupby(key_col, sort=False)}


app_emp_by_app = _group_links(app_emp_df, 'app_id')
proc_org_by_proc = _group_links(proc_org_df, 'process_id')
proc_app_by_proc = _group_links(proc_app_df, 'process_id')
proc_emp_by_proc = _group_links(proc_emp_df, 'process_id')

# --- Application Documents ---
for _, row in app_df.iterrows():
    extra_parts = []
    linked_emp = app_emp_by_app.get(row['id'])
    if linked_emp is not None:
        emp_info = [
            f"employee_id: {lrow['employee_id']} (is_owner: {lrow['is_owners']})"
            for _, lrow in linked_emp.iterrows()
        ]
        extra_parts.append("Linked employees: " + ", ".join(emp_info))
    # Always present, even when the row has no app_org value.
    extra_parts.append("App Org: " + str(row.get("app_org", "")))
    extra = " | ".join(extra_parts)
    documents.append(create_document("application", row, extra))


# --- Employee Documents ---
for _, row in emp_df.iterrows():
    extra = f"Org ID: {row.get('org_id', '')}, Line Manager ID: {row.get('line_manager_id', '')}"
    documents.append(create_document("employee", row, extra))


# --- Organisation Documents ---
for _, row in org_df.iterrows():
    extra = f"Org Head: {row.get('org_head', '')}, Parent Org ID: {row.get('parent_org_id', '')}"
    documents.append(create_document("organisation", row, extra))


# --- Process Documents ---
for _, row in proc_df.iterrows():
    extra_parts = []
    linked_org = proc_org_by_proc.get(row['id'])
    if linked_org is not None:
        org_ids = [f"org_id: {r['org_id']}" for _, r in linked_org.iterrows()]
        extra_parts.append("Linked Organisation(s): " + ", ".join(org_ids))
    linked_app = proc_app_by_proc.get(row['id'])
    if linked_app is not None:
        app_ids = [f"application_id: {r['application_id']}" for _, r in linked_app.iterrows()]
        extra_parts.append("Linked Application(s): " + ", ".join(app_ids))
    linked_emp = proc_emp_by_proc.get(row['id'])
    if linked_emp is not None:
        emp_ids = [f"employee_id: {r['employee_id']} (is_owner: {r['is_owners']})" for _, r in linked_emp.iterrows()]
        extra_parts.append("Linked Employee(s): " + ", ".join(emp_ids))
    extra = " | ".join(extra_parts)
    documents.append(create_document("process", row, extra))

print(f"Created {len(documents)} documents.")


Created 195537 documents.


In [None]:

# --------------------------------------------------- 
# 5. Encode Documents Using the DPR Context Encoder 
# --------------------------------------------------- 
def encode_documents(doc_texts, batch_size=16):
    """
    Embed a list of texts with the DPR context encoder.

    doc_texts: list of strings to embed.
    batch_size: number of texts tokenized and encoded per forward pass.

    Texts are truncated to 512 tokens and padded within each batch. Returns
    the per-batch `pooler_output` arrays stacked vertically, in input order.
    """
    chunks = []
    for start in range(0, len(doc_texts), batch_size):
        encoded = ctx_tokenizer(
            doc_texts[start: start + batch_size],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        # Inference only: no gradients are tracked.
        with torch.no_grad():
            pooled = ctx_encoder(**encoded).pooler_output
        chunks.append(pooled.cpu().numpy())
    return np.vstack(chunks)


# Embed the text of every document in corpus order, so that row i of
# doc_embeddings corresponds to documents[i] (retrieve() relies on this).
print("Encoding documents...") 
doc_texts = [doc["text"] for doc in documents] 
doc_embeddings = encode_documents(doc_texts) 


# Normalize embeddings for inner product search 
# (the call's return value is not used: the array is normalized in place).
faiss.normalize_L2(doc_embeddings) 


In [20]:

# --------------------------------------------------- 
# 6. Build the Faiss Index 
# --------------------------------------------------- 
# Embedding dimensionality = number of columns of the embedding matrix.
d = doc_embeddings.shape[1] 
print("Building Faiss index...") 
index = faiss.IndexFlatIP(d)  # inner product search on normalized embeddings 
# Vectors are added in corpus order, so an index position maps back to documents[position].
index.add(doc_embeddings) 
print(f"Indexed {index.ntotal} documents.") 


Building Faiss index...
Indexed 195537 documents.


In [None]:

# --------------------------------------------------- 
# 7. Define Query Retrieval Function 
# --------------------------------------------------- 
def retrieve(query, top_k):
    """
    Return the documents most similar to `query`.

    query: the question text; it is truncated to 128 tokens before encoding.
    top_k: maximum number of documents to return.

    The query is embedded with the DPR question encoder, L2-normalized and
    searched against the inner-product FAISS index. Returns a list of document
    dicts in the order given by the index; the list is shorter than `top_k`
    when the index holds fewer vectors.
    """
    inputs = question_tokenizer(query, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        q_output = question_encoder(**inputs)
    q_embedding = q_output.pooler_output.cpu().numpy()
    faiss.normalize_L2(q_embedding)
    _scores, indices = index.search(q_embedding, top_k)
    # FAISS fills missing result slots with -1. Without this filter,
    # documents[-1] would silently return the last document of the corpus.
    return [documents[idx] for idx in indices[0] if idx >= 0]


In [None]:

# --------------------------------------------------- 
# 8. LLM Functions (Azure OpenAI) 
# --------------------------------------------------- 
def load_config(config_path=r"config.json"):
    """
    Load the JSON configuration file into nested attribute-style objects.

    config_path: path of the JSON file; defaults to "config.json" in the
        current working directory, so existing no-argument calls are unchanged.

    Every JSON object becomes a SimpleNamespace, so values are read as
    attributes (e.g. config.chat.model).

    Raises FileNotFoundError if the file does not exist; the original OS error
    is attached as the cause so the offending path is visible in the traceback.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the platform locale.
        with open(config_path, encoding="utf-8") as f:
            return json.load(f, object_hook=lambda d: SimpleNamespace(**d))
    except FileNotFoundError as err:
        raise FileNotFoundError("Config file not found. Please check the path.") from err

def initialize_azure_client(config):
    """
    Build an Azure OpenAI client whose API key is read from Azure Key Vault.

    config: object exposing key_vault_url, dev_secret_name,
        chat.api_version and chat.azure_endpoint.

    Returns an AzureOpenAI client.
    """
    vault = SecretClient(vault_url=config.key_vault_url, credential=DefaultAzureCredential())
    api_key = vault.get_secret(config.dev_secret_name).value
    return AzureOpenAI(
        api_key=api_key,
        api_version=config.chat.api_version,
        azure_endpoint=config.chat.azure_endpoint,
    )


# Holds the config and the Azure OpenAI client after the first call.
_LLM_STATE = {}


def _get_llm():
    """Return (config, client), reading the config and building the client only once."""
    if "client" not in _LLM_STATE:
        config = load_config()
        _LLM_STATE["client"] = initialize_azure_client(config)
        _LLM_STATE["config"] = config
    return _LLM_STATE["config"], _LLM_STATE["client"]


def generate_answer_with_llm(query: str, top_documents):
    """
    Ask Azure OpenAI to answer `query` using the retrieved documents as context.

    query: the user question.
    top_documents: iterable of document texts (strings); they are joined with
        blank lines and embedded in the prompt.

    Returns the model's answer with surrounding whitespace stripped, or an
    empty string when the completion carries no text content.

    The config file and the Key Vault secret are read on the first call only;
    later calls reuse the same client, so repeated calls (and any timing
    wrapped around them) cover only the chat completion request.
    """
    config, llm = _get_llm()
    context = "\n\n".join(top_documents)
    prompt = [
        {
            "role": "system",
            "content": f""" 
You are an AI assistant tasked with answering a query based on the provided context about employees and organizations. 
Please provide a detailed and well-structured answer to the user's question. 

- Organize the answer into bullet points if appropriate. 
- Use headings where relevant. 
- Include all relevant details concisely. 

Context: 
{context} 

Question: "{query}" 

Provide a well-structured answer. 
            """
        }
    ]
    response = llm.chat.completions.create(model=config.chat.model, messages=prompt)
    # `content` is optional in the response; guard so a completion without
    # text does not raise on .strip().
    content = response.choices[0].message.content
    return content.strip() if content else ""


In [None]:

# --------------------------------------------------- 
# 9. Process Queries from an Excel File and Save Results 
# --------------------------------------------------- 
query_excel_file = r"data/LLMEval_1.xlsx"
output_excel_file = r"Outputs/LLM_responses_dpr_multihop.xlsx"

# Create the output folder before doing any work: the workbook is written only
# after every query has been answered, and a missing folder at that point
# would discard all collected results.
os.makedirs(os.path.dirname(output_excel_file) or ".", exist_ok=True)

queries_df = pd.read_excel(query_excel_file)
# List of k values to test.
k_values = [1, 2, 3, 5, 8, 13, 15, 21]

# For each query, for each k, retrieve docs, record response time, generate final answer.
for idx, row in queries_df.iterrows():
    query = row["Query"]
    for k in k_values:
        col_docs = f"k_{k}_docs"
        col_time = f"k_{k}_retrieve_time"
        # Despite the "_answer" suffix, this column stores the LLM latency in seconds.
        col_responsetime = f"k_{k}_response_answer"
        col_answer = f"k_{k}_final_answer"

        start_time = time.time()
        retrieved_docs = retrieve(query, k)
        elapsed_time = time.time() - start_time

        # Texts of the top-k documents: passed to the LLM as a list and stored
        # in the sheet as one separator-joined string.
        top_docs_text = [doc["text"] for doc in retrieved_docs]
        docs_str = "\n\n----\n\n ".join(top_docs_text)

        # Generate final answer using Azure OpenAI.
        start_time = time.time()
        final_answer = generate_answer_with_llm(query, top_docs_text)
        response_time = time.time() - start_time

        # Save the results in the DataFrame.
        queries_df.at[idx, col_docs] = docs_str
        queries_df.at[idx, col_time] = elapsed_time
        queries_df.at[idx, col_responsetime] = response_time
        queries_df.at[idx, col_answer] = final_answer
        # The duration printed here is the retrieval time only.
        print(f"Processed query '{query[:50]}...' for k={k} in {elapsed_time:.2f} seconds.")


# Save the updated DataFrame with new columns to a new Excel file.
queries_df.to_excel(output_excel_file, index=False)
print(f"Results saved to {output_excel_file}")
