In [None]:
!pip install openai


In [None]:
# Install required packages
!pip install -q langchain-huggingface chromadb langchain-community langchain-core sentence-transformers groq langchain_groq python-dotenv


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import re
import os
import numpy as np
from time import time
from sklearn.metrics.pairwise import cosine_similarity
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.schema.runnable import RunnablePassthrough
from typing import List
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings

In [2]:
# load_dotenv() runs first so the lookups below see whatever it loaded.
# The Azure OpenAI connection settings are then read from the process environment;
# each lookup yields None when its variable is not set.
load_dotenv()
azure_openapi_api_version = "2024-12-01-preview"
azure_openapi_azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
azure_openapi_api_key = os.environ.get("AZURE_OPENAI_API_KEY")
azure_openapi_deployment_name = os.environ.get("AZURE_OPENAI_DEPLOYMENT_NAME")

In [3]:
# Embedding client shared by the semantic-chunking step and the vector store.
embedding_model = AzureOpenAIEmbeddings(
    azure_endpoint=azure_openapi_azure_endpoint,
    api_key=azure_openapi_api_key,
    openai_api_version=azure_openapi_api_version,
    model="text-embedding-3-large",
)

In [4]:

# Character-based splitter: chunks of up to 500 characters (measured with len)
# with a 100-character overlap between neighbours.
# The separators are written as regex patterns (e.g. r"\. "), so
# is_separator_regex must be enabled. Without it they are escaped and matched as
# literal backslash sequences, which never occur in normal text, leaving only
# " " and "" as effective separators.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    length_function=len,
    separators=[r"\n\n", r"\n", r"\. ", " ", ""],
    is_separator_regex=True,
    keep_separator=True
)

In [5]:
def read_docs(file_path: str) -> str:
    """Return the UTF-8 decoded contents of ``file_path``.

    Any exception raised while opening or reading the file is reported on
    stdout and swallowed; in that case an empty string is returned.
    """
    contents = ""
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            contents = handle.read()
    except Exception as err:
        print(f"Error reading file: {err}")
    return contents

In [7]:
def create_initial_chunks(file_path: str) -> List[str]:
    """Read a text file and split it into chunks with the module-level splitter.

    Returns an empty list when the file is empty or could not be read
    (read_docs returns "" in both cases). Otherwise every run of whitespace is
    collapsed to a single space before splitting, and the text of each
    resulting document is returned.
    """
    raw_text = read_docs(file_path)
    if raw_text:
        # NOTE(review): this also removes every newline, so newline-based
        # separators configured on text_splitter cannot match — confirm intended.
        flattened = re.sub(r'\s+', ' ', raw_text).strip()
        pieces = text_splitter.create_documents([flattened])
        return [piece.page_content for piece in pieces]
    return []

In [8]:
def create_semantic_chunks(paragraphs: List[str], 
                         similarity_threshold: float = 0.82) -> List[List[str]]:
    """Greedily group consecutive paragraphs by embedding similarity.

    Paragraphs are visited in order. A paragraph joins the current chunk when
    its cosine similarity to at least one paragraph already in that chunk is
    strictly greater than ``similarity_threshold``; otherwise the current chunk
    is closed and the paragraph starts a new one.

    Args:
        paragraphs: Text pieces to group, in document order.
        similarity_threshold: Minimum (exclusive) cosine similarity required to
            keep a paragraph in the current chunk.

    Returns:
        A list of chunks, each a non-empty list of consecutive paragraphs.
        Empty input yields an empty list.
    """
    if not paragraphs:
        return []

    # Batch process all embeddings at once and stack them into one matrix
    # (one row per paragraph; assumes the embedding vectors share one length).
    para_embeddings = np.asarray(embedding_model.embed_documents(paragraphs))

    semantic_chunks: List[List[str]] = []
    current_chunk = [paragraphs[0]]
    chunk_start = 0  # index of the first paragraph in current_chunk

    for i in range(1, len(paragraphs)):
        # Compare only with the paragraphs in the current chunk. Chunks are
        # contiguous, so those are exactly rows chunk_start .. i-1. Comparing
        # against every earlier paragraph would let a paragraph join a chunk
        # it has nothing in common with.
        similarities = cosine_similarity(
            para_embeddings[i:i + 1], para_embeddings[chunk_start:i]
        )

        if similarities.max() > similarity_threshold:
            current_chunk.append(paragraphs[i])
        else:
            semantic_chunks.append(current_chunk)
            current_chunk = [paragraphs[i]]
            chunk_start = i

    # The loop always leaves one open chunk behind.
    semantic_chunks.append(current_chunk)

    return semantic_chunks

In [9]:
# Chroma collection backing retrieval. It is given a persist directory and is
# configured to use cosine distance for its HNSW index.
collection_name = "vectorstore_table_optimized"
persist_directory = "vectorstore_persist_optimized"

vectorstore = Chroma(
    embedding_function=embedding_model,
    collection_name=collection_name,
    persist_directory=persist_directory,
    collection_metadata={"hnsw:space": "cosine"},
)

  vectorstore = Chroma(


In [10]:
def store_chunks_in_chroma(semantic_chunks: List[List[str]],
                           source: str = "employee_handbook_india") -> str:
    """Store semantic chunks in Chroma with optimized metadata.

    Each inner list is joined with single spaces into one document. Groups that
    are empty after stripping are skipped.

    Args:
        semantic_chunks: Groups of paragraph strings, one group per document.
        source: Value written to each document's ``source`` metadata field and
            used as the prefix of its id.

    Returns:
        A human-readable status message describing what was stored.
    """
    if not semantic_chunks:
        return "No chunks to store."

    docs = []
    doc_ids = []
    for idx, chunk_group in enumerate(semantic_chunks):
        combined_text = ' '.join(chunk_group).strip()
        if not combined_text:
            continue

        # Extract first few words as title for better metadata
        title = ' '.join(combined_text.split()[:5]) + "..."

        docs.append(
            Document(
                page_content=combined_text,
                metadata={
                    "chunk_id": idx,
                    "source": source,
                    "length": len(combined_text),
                    "num_paragraphs": len(chunk_group),
                    "title": title,
                    "type": "policy"  # Helps with filtering
                }
            )
        )
        # Deterministic id, so re-running against the persisted collection
        # replaces these chunks instead of appending duplicates.
        # NOTE(review): relies on Chroma upserting on matching ids — confirm
        # for the installed version.
        doc_ids.append(f"{source}-{idx}")

    if docs:
        # Batch add documents
        vectorstore.add_documents(docs, ids=doc_ids)
        return f"Stored {len(docs)} semantic chunks in Chroma."
    return "No valid documents to store."

In [11]:
# Process the document: split the policy file into fixed-size chunks, group
# consecutive chunks by embedding similarity, and write the groups to Chroma.
# NOTE(review): every run of this cell requests embeddings and adds documents
# to the persisted vector store.
file_path = "docs/policies.txt"

print("Creating initial chunks...")
paragraphs = create_initial_chunks(file_path)
print(f"Created {len(paragraphs)} initial chunks.")

print("Creating semantic chunks...")
semantic_chunks = create_semantic_chunks(paragraphs)
print(f"Created {len(semantic_chunks)} semantic chunks.")

print("Storing in Chroma...")
result = store_chunks_in_chroma(semantic_chunks)
print(result)

Creating initial chunks...
Created 72 initial chunks.
Creating semantic chunks...
Created 72 semantic chunks.
Storing in Chroma...
Stored 72 semantic chunks in Chroma.


In [30]:
# Chat model used for query rewriting and answer generation (temperature 0).
llm = AzureChatOpenAI(
    azure_endpoint=azure_openapi_azure_endpoint,
    api_key=azure_openapi_api_key,
    api_version=azure_openapi_api_version,
    deployment_name=azure_openapi_deployment_name,
    temperature=0,
)

In [98]:

# Base retriever: MMR search returning k=5 documents selected from fetch_k=10
# candidates, with lambda_mult=0.5.
base_retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={
        "k": 5,
        "fetch_k": 10,
        "lambda_mult": 0.5
    }
)

# Prompt asking the LLM for rephrasings of the question, one per line.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an HR policy expert. Generate 5 different versions of the user's question 
    to help retrieve relevant policy documents from a vector database. Focus on variations that 
    might appear in policy language. Provide these alternative versions separated by newlines.
    
    Original question: {question}
    
    Alternative versions:"""
)


# Multi-query retriever: runs the base retriever for each generated rephrasing
# and, because include_original=True, for the original question as well.
# parser_key is deprecated and no longer used by MultiQueryRetriever, so it is
# not passed.
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
    include_original=True
)

In [99]:

# System prompt for the answer-generation step. {context} and {input} are the
# only template variables; any other braces in this text would be treated as
# additional variables by the prompt template.
# The text gives a single fallback reply (rule 2), so the model is not handed
# two conflicting instructions for the "not in context" case.
prompt_template = """You are a precise HR assistant at Ayatacommerce who answers questions using ONLY the provided context. Consider yourself an employee of Ayatacommerce responsible for responding to all company-related queries from other employees.

Context:
{context}

Question: {input}

Always respond in a concise and professional manner, and always phrase the answer as a complete sentence rather than a bare value.
NOTE: Do not mention the source of the context, the document name, or anything else about the provided context in your answer.

Rules:
1. If the context contains relevant information, provide a concise answer based solely on that.
2. If the question asks about something NOT in the context, respond ONLY with: 'I don't have this information in my knowledge base. Please contact hr@ayatacommerce.com for assistance.'
3. Never infer or make up information not explicitly stated in the context.
4. If the question is ambiguous or unclear, ask for clarification.
5. Do not provide any personal opinions or subjective statements.
6. Always maintain a professional tone and language.
7. Avoid using filler phrases like 'I think' or 'In my opinion'.
8. If the context is too long, summarize it before answering.
9. If the question is a yes/no question, provide a clear yes or no answer based on the context.
10. If the question is a list, provide a clear and concise list based on the context.
11. If the question is a how-to question, provide a clear and concise step-by-step guide based on the context.
12. If the question is a why question, provide a clear and concise explanation based on the context.
13. If the question is a when question, provide a clear and concise answer based on the context.

Answer:"""

In [100]:
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_history_aware_retriever

In [101]:
# Conversation transcript shared by the query cells; starts empty.
chat_history = list()

In [102]:
# System instruction for the question-rewriting step.
# Adjacent string literals are concatenated verbatim, so each fragment ends
# with an explicit space to keep the sentences from running together.
retriever_prompt = (
    "Given a chat history and the latest user question which might reference context in the chat history, "
    "formulate a standalone question which can be understood without the chat history. "
    "Do NOT answer the question, just reformulate it if needed and otherwise return it as is."
)

In [103]:
# Prompt for the question-rewriting step: the system instruction, then the
# prior turns, then the latest user message.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", retriever_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

In [104]:
# Retriever built from the LLM, the multi-query retriever and the
# question-rewriting prompt.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

In [105]:
# Prompt for the answering step: the HR system prompt, then the prior turns,
# then the latest user message.
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", prompt_template),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

In [106]:
# Answering chain built from the LLM and the QA prompt.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

In [107]:
# Full pipeline: the history-aware retriever combined with the answering chain.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

In [108]:
def print_context(inputs):
    """Print the text of every document under ``inputs["context"]``.

    A header line is printed first; each document's ``page_content`` is then
    followed by an 80-dash divider. ``inputs`` is returned unchanged.
    """
    retrieved_docs = inputs["context"]
    divider = "-" * 80
    print("🔍 Retrieved context:\n")
    for retrieved in retrieved_docs:
        print(retrieved.page_content, divider, sep="\n")
    return inputs


In [116]:
def timed_query(user_input, chat_history=None) -> str:
    """Run one question through the RAG chain and print how long it took.

    Args:
        user_input: The user's question.
        chat_history: List of prior messages passed to the chain. On success
            the question and the answer are appended to it in place. When
            omitted, the query runs with an empty, throwaway history.

    Returns:
        The chain's answer, or a string starting with
        "Error processing query:" if anything raised while handling the
        request. The elapsed time is printed only on success.
    """
    # A fresh list per call avoids a shared mutable default argument.
    history = [] if chat_history is None else chat_history
    start_time = time()
    try:
        response = rag_chain.invoke({
            "input": user_input,
            "chat_history": history
        })
        answer = response["answer"]  # the generated text is stored under "answer"

        # Record the turn only after the chain has produced an answer, so a
        # failed call leaves the history untouched.
        history.extend([
            HumanMessage(content=user_input),
            AIMessage(content=answer)
        ])

        elapsed = time() - start_time
        print(f"Response time: {elapsed:.2f} seconds")
        return answer
    except Exception as e:
        return f"Error processing query: {str(e)}"

In [117]:

# First demo question; one literal feeds both the banner and the call.
query_1 = "What is the remote work policy?"
print(f"Query 1: {query_1}")
print(timed_query(query_1, chat_history))

Query 1: What is the remote work policy?
Response time: 8.78 seconds
The remote work policy promotes a remote-first culture that prioritizes productivity over hours spent at a desk. Employees are encouraged to create a designated workspace, ensure they have the necessary technology, maintain regular communication with colleagues, and foster relationships. Additionally, it emphasizes the importance of managing tasks effectively, knowing when to take breaks, and enjoying the benefits of working from home.


In [None]:

# Pass the shared chat_history, as the other query cells do.
query_2 = "How many sick leaves do employees get?"
print(f"\nQuery 2: {query_2}")
print(timed_query(query_2, chat_history))


Query 2: How many sick leaves do employees get?
Response time: 5.61 seconds
Employees are entitled to 12 days of Casual/Sick Leaves each holiday year.


In [None]:

# Pass the shared chat_history, as the other query cells do.
query_3 = "What are the core values of the company?"
print(f"\nQuery 3: {query_3}")
print(timed_query(query_3, chat_history))


Query 3: What are the core values of the company?
Response time: 4.76 seconds
The core values of the company are Empathy, Trust, and Adaptability, which guide our organizational identity and how we conduct our business.


In [119]:
# Ownership question; the turn is appended to chat_history by timed_query.
ownership_answer = timed_query("Who is own the company", chat_history)
print(ownership_answer)

Response time: 6.52 seconds
The company is owned by Shine Mathew, who is the CEO and Founder of AyataCommerce.


In [124]:
# Follow-up that only makes sense together with the earlier turns in chat_history.
followup_answer = timed_query("What is his name?", chat_history)
print(followup_answer)

Response time: 6.78 seconds
His name is Shine Mathew, the CEO and Founder of AyataCommerce.


In [None]:
# Pass the shared chat_history, as the other query cells do.
print(timed_query("Is it possible to take a earned leave and a casual leave together?", chat_history))

Response time: 5.43 seconds
I don't have this information in my knowledge base. Please contact hr@ayatacommerce.com for assistance.


In [None]:
# Free-form question typed by the user, answered with the shared chat_history
# like the other query cells.
print(timed_query(input(""), chat_history))