In [59]:
# Install required packages
!pip install -q langchain-huggingface chromadb langchain-community langchain-core sentence-transformers groq langchain_groq python-dotenv


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [122]:
import re
import os
import numpy as np
from time import time
from sklearn.metrics.pairwise import cosine_similarity
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.schema.runnable import RunnablePassthrough
from typing import List
from langchain_groq import ChatGroq
from dotenv import load_dotenv

In [61]:
load_dotenv() 

True

In [8]:
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5", 
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True} 
)

  from .autonotebook import tqdm as notebook_tqdm


In [11]:

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    length_function=len,
    separators=[r"\n\n", r"\n", r"\. ", " ", ""],
    keep_separator=True
)

In [12]:
def read_docs(file_path: str) -> str:
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        print(f"Error reading file: {e}")
        return ""

In [13]:
def create_initial_chunks(file_path: str) -> List[str]:
    
    text = read_docs(file_path)
    if not text:
        return []

    text = re.sub(r'\s+', ' ', text).strip()
    
    documents = text_splitter.create_documents([text])
    return [doc.page_content for doc in documents]

In [None]:
def create_semantic_chunks(paragraphs: List[str], 
                         similarity_threshold: float = 0.82) -> List[List[str]]:
    if not paragraphs:
        return []
    
    # Batch process all embeddings at once
    para_embeddings = embedding_model.embed_documents(paragraphs)
    para_embeddings = [np.array(e).reshape(1, -1) for e in para_embeddings]
    
    semantic_chunks = []
    current_chunk = []
    
    for i in range(len(paragraphs)):
        if not current_chunk:
            current_chunk.append(paragraphs[i])
            continue
            
        # Compare with all paragraphs in current chunk
        similarities = [cosine_similarity(para_embeddings[i], e)[0][0] 
                       for e in para_embeddings[:i]]
        max_similarity = max(similarities) if similarities else 0
        
        if max_similarity > similarity_threshold:
            current_chunk.append(paragraphs[i])
        else:
            semantic_chunks.append(current_chunk)
            current_chunk = [paragraphs[i]]
    
    if current_chunk:
        semantic_chunks.append(current_chunk)
        
    return semantic_chunks

In [None]:
# Configure vector store with optimized settings
persist_directory = "vectorstore_persist_optimized_v1"
collection_name = "vectorstore_table_optimized_v1"

vectorstore = Chroma(
    collection_name=collection_name,
    embedding_function=embedding_model,
    persist_directory=persist_directory,
    collection_metadata={"hnsw:space": "cosine"}
)

  vectorstore = Chroma(


In [16]:
def store_chunks_in_chroma(semantic_chunks: List[List[str]]) -> str:
    """Store semantic chunks in Chroma with optimized metadata."""
    if not semantic_chunks:
        return "No chunks to store."
    
    docs = []
    for idx, chunk_group in enumerate(semantic_chunks):
        combined_text = ' '.join(chunk_group).strip()
        if not combined_text:
            continue
            
        # Extract first few words as title for better metadata
        title = ' '.join(combined_text.split()[:5]) + "..."
        
        doc = Document(
            page_content=combined_text,
            metadata={
                "chunk_id": idx,
                "source": "employee_handbook_india",
                "length": len(combined_text),
                "num_paragraphs": len(chunk_group),
                "title": title,
                "type": "policy"  # Helps with filtering
            }
        )
        docs.append(doc)
    
    if docs:
        # Batch add documents
        vectorstore.add_documents(docs)
        return f"Stored {len(docs)} semantic chunks in Chroma."
    return "No valid documents to store."

In [17]:
# Process the document
file_path = "docs/policies.txt"

print("Creating initial chunks...")
paragraphs = create_initial_chunks(file_path)
print(f"Created {len(paragraphs)} initial chunks.")

print("Creating semantic chunks...")
semantic_chunks = create_semantic_chunks(paragraphs)
print(f"Created {len(semantic_chunks)} semantic chunks.")

print("Storing in Chroma...")
result = store_chunks_in_chroma(semantic_chunks)
print(result)

Creating initial chunks...
Created 72 initial chunks.
Creating semantic chunks...
Created 57 semantic chunks.
Storing in Chroma...
Stored 57 semantic chunks in Chroma.


In [63]:
llm = ChatGroq(
    temperature=0,
    model_name="meta-llama/llama-4-scout-17b-16e-instruct",
    api_key=os.getenv("GROQ_API_KEY")
)

In [None]:

base_retriever = vectorstore.as_retriever(
    search_type="mmr",  
    search_kwargs={
        "k": 5,
        "fetch_k": 10,   
        "lambda_mult": 0.5
    }
)

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an HR policy expert. Generate 5 different versions of the user's question 
    to help retrieve relevant policy documents from a vector database. Focus on variations that 
    might appear in policy language. Provide these alternative versions separated by newlines.
    
    Original question: {question}
    
    Alternative versions:"""
)


retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
    parser_key="lines",  
    include_original=True  
)

In [119]:

template = """You are a precise HR assistant at Ayatacommerce that answers questions using ONLY the provided context and consider you as an employee of ayatacommerce responsible for responding to all queries related to the company by other employees.

Context:
{context}

Question: {question}

Always respond in a concise and professional manner and also just don't answer simply if always create a scentence with the answer.
NOTE:Do not mention the source of the context or the document name in your answer or anything related to the provided context.
If no information is avialable in the context, please respond with the following data:
Contact HR at Ayatacommerce for assistance: hr@ayatacommerce.com 
Human Resourse email: hr@ayatacommerce.com

Rules:
1. If the context contains relevant information, provide a concise answer based solely on that.
2. If the question asks about something NOT in the context, respond ONLY with: 'I don't have this information in my knowledge base. Please contact hr@ayatacommerce.com for assistance.'
3. Never infer or make up information not explicitly stated in the context.
4. If the question is ambiguous or unclear, ask for clarification.
5. Do not provide any personal opinions or subjective statements.
6. Always maintain a professional tone and language.
7. Avoid using filler phrases like 'I think' or 'In my opinion'.
8. If the context is too long, summarize it before answering.
10. If the question is a yes/no question, provide a clear yes or no answer based on the context.
11. If the question is a list, provide a clear and concise list based on the context.
12. If the question is a how-to question, provide a clear and concise step-by-step guide based on the context.
13. If the question is a why question, provide a clear and concise explanation based on the context.
14. If the question is a when question, provide a clear and concise answer based on the context.

Answer:"""

prompt = ChatPromptTemplate.from_template(template)

In [120]:
def print_context(inputs):
    docs = inputs["context"]
    print("🔍 Retrieved context:\n")
    for doc in docs:
        print(doc.page_content)
        print("-" * 80)
    return inputs


In [None]:
query_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [115]:
def timed_query(question: str) -> str:

    start_time = time()
    try:
        response = query_chain.invoke(question)
        elapsed = time() - start_time
        print(f"Response time: {elapsed:.2f} seconds")
        return response
    except Exception as e:
        return f"Error processing query: {str(e)}"

In [101]:

print("Query 1: What is the remote work policy?")
print(timed_query("What is the remote work policy?"))

Query 1: What is the remote work policy?
Response time: 1.21 seconds
Our company adopts a remote-first culture, allowing employees to work flexibly. To ensure productivity, we trust employees to determine what works best for them, but here are some tips: 

- Set up a designated workspace with necessary equipment.
- Ensure a reliable and secure internet connection.
- Get dressed for work to distinguish between work and personal life.
- Write a daily to-do list with achievable tasks.
- Know when to step away from work to avoid burnout.

Additionally, we encourage employees to stay connected with colleagues through regular check-ins, video calls, and team chats. 

For more information on specific arrangements, employees can reach out to their manager or HR.


In [102]:

print("\nQuery 2: How many sick leaves do employees get?")
print(timed_query("How many sick leaves do employees get?"))


Query 2: How many sick leaves do employees get?
Response time: 1.19 seconds
Employees are entitled to 12 days of Casual/Sick Leaves each holiday year.


In [103]:

print("\nQuery 3: What are the core values of the company?")
print(timed_query("What are the core values of the company?"))


Query 3: What are the core values of the company?
Response time: 1.33 seconds
The company's core values are Empathy, Trust, and Adaptability.


In [117]:
print(timed_query("Who is own the company"))

Response time: 1.02 seconds
The ownership of the company is not explicitly stated in my knowledge base. 

Contact HR at Ayatacommerce for assistance: hr@ayatacommerce.com 
Human Resourse email: hr@ayatacommerce.com


In [106]:
print(timed_query("Is it possible to take a earned leave and a casual leave together?"))

Response time: 1.55 seconds
According to our leave policy, you are entitled to 12 days of Earned Leaves and 12 days of Casual/Sick Leaves each holiday year. While the policy doesn't explicitly state that you can take both earned leave and casual leave together, it does allow you to carry over up to 10 days of earned leaves to the following year. However, to get a definitive answer on taking both leave types together, I would recommend checking with your manager or HR. 

However, a clear answer to your question is: yes, you can take both earned and casual leaves, but you have to check with your manager about the procedures.


In [116]:
print(timed_query(input("")))

Response time: 1.08 seconds
Taking short breaks throughout the day can help you feel recharged and refreshed, and give you a different perspective on any work problems. It is recommended to take short breaks, as well as at least half-an-hour to get some food.
