In [6]:
# Install required packages
!pip install -q langchain-huggingface chromadb langchain-community langchain-core sentence-transformers groq langchain_groq


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import re
import numpy as np
from time import time
from sklearn.metrics.pairwise import cosine_similarity
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
from typing import List, Dict, Any
from groq import Groq
from langchain_groq import ChatGroq

In [8]:
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5", 
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True} 
)

  from .autonotebook import tqdm as notebook_tqdm


In [11]:

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    length_function=len,
    separators=[r"\n\n", r"\n", r"\. ", " ", ""],
    keep_separator=True
)

In [12]:
def read_docs(file_path: str) -> str:
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        print(f"Error reading file: {e}")
        return ""

In [13]:
def create_initial_chunks(file_path: str) -> List[str]:
    
    text = read_docs(file_path)
    if not text:
        return []

    text = re.sub(r'\s+', ' ', text).strip()
    
    documents = text_splitter.create_documents([text])
    return [doc.page_content for doc in documents]

In [14]:
def create_semantic_chunks(paragraphs: List[str], 
                         similarity_threshold: float = 0.82) -> List[List[str]]:
    if not paragraphs:
        return []
    
    # Batch process all embeddings at once
    para_embeddings = embedding_model.embed_documents(paragraphs)
    para_embeddings = [np.array(e).reshape(1, -1) for e in para_embeddings]
    
    semantic_chunks = []
    current_chunk = []
    
    for i in range(len(paragraphs)):
        if not current_chunk:
            current_chunk.append(paragraphs[i])
            continue
            
        # Compare with all paragraphs in current chunk
        similarities = [cosine_similarity(para_embeddings[i], e)[0][0] 
                       for e in para_embeddings[:i]]
        
        # Use max similarity rather than average for better grouping
        max_similarity = max(similarities) if similarities else 0
        
        if max_similarity > similarity_threshold:
            current_chunk.append(paragraphs[i])
        else:
            semantic_chunks.append(current_chunk)
            current_chunk = [paragraphs[i]]
    
    if current_chunk:
        semantic_chunks.append(current_chunk)
        
    return semantic_chunks

In [15]:
# Configure vector store with optimized settings
persist_directory = "vectorstore_persist_optimized_v1"
collection_name = "vectorstore_table_optimized_v1"

vectorstore = Chroma(
    collection_name=collection_name,
    embedding_function=embedding_model,
    persist_directory=persist_directory,
    collection_metadata={"hnsw:space": "cosine"}  # Optimize for cosine similarity
)

  vectorstore = Chroma(


In [16]:
def store_chunks_in_chroma(semantic_chunks: List[List[str]]) -> str:
    """Store semantic chunks in Chroma with optimized metadata."""
    if not semantic_chunks:
        return "No chunks to store."
    
    docs = []
    for idx, chunk_group in enumerate(semantic_chunks):
        combined_text = ' '.join(chunk_group).strip()
        if not combined_text:
            continue
            
        # Extract first few words as title for better metadata
        title = ' '.join(combined_text.split()[:5]) + "..."
        
        doc = Document(
            page_content=combined_text,
            metadata={
                "chunk_id": idx,
                "source": "employee_handbook_india",
                "length": len(combined_text),
                "num_paragraphs": len(chunk_group),
                "title": title,
                "type": "policy"  # Helps with filtering
            }
        )
        docs.append(doc)
    
    if docs:
        # Batch add documents
        vectorstore.add_documents(docs)
        return f"Stored {len(docs)} semantic chunks in Chroma."
    return "No valid documents to store."

In [17]:
# Process the document
file_path = "docs/policies.txt"

print("Creating initial chunks...")
paragraphs = create_initial_chunks(file_path)
print(f"Created {len(paragraphs)} initial chunks.")

print("Creating semantic chunks...")
semantic_chunks = create_semantic_chunks(paragraphs)
print(f"Created {len(semantic_chunks)} semantic chunks.")

print("Storing in Chroma...")
result = store_chunks_in_chroma(semantic_chunks)
print(result)

Creating initial chunks...
Created 72 initial chunks.
Creating semantic chunks...
Created 57 semantic chunks.
Storing in Chroma...
Stored 57 semantic chunks in Chroma.


## Optimized Retrieval Setup

In [25]:
# Define the userdata dictionary with your API key
userdata = {
    "GROQ_API_KEY": "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"
}

# Initialize the chat model
llm = ChatGroq(
    temperature=0,
    model_name="meta-llama/llama-4-scout-17b-16e-instruct",
    api_key=userdata.get("GROQ_API_KEY")
)

In [None]:

llm = ChatOllama(
    model="llama3.2:3b", 
    temperature=0.3, 
    top_k=10,
    top_p=0.9,
    streaming=False
)

In [45]:

base_retriever = vectorstore.as_retriever(
    search_type="mmr",  
    search_kwargs={
        "k": 5,
        "fetch_k": 10,   
        "lambda_mult": 0.5
    }
)

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an HR policy expert. Generate 5 different versions of the user's question 
    to help retrieve relevant policy documents from a vector database. Focus on variations that 
    might appear in policy language. Provide these alternative versions separated by newlines.
    
    Original question: {question}
    
    Alternative versions:"""
)


retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
    parser_key="lines",  
    include_original=True  
)

In [46]:

template = """You are a precise HR assistant at Ayatacommerce that answers questions using ONLY the provided context.

Context:
{context}

Question: {question}

Rules:
1. Respond concisely and professionally in 1-3 sentences.
2. If the answer isn't in the context, say: "I don't have this information in my knowledge base. Please contact hr@ayatacommerce.com for assistance."
3. Never infer or make up information.
4. For policy questions, cite the relevant policy section if possible.
5. Maintain a professional tone.

Answer:"""

prompt = ChatPromptTemplate.from_template(template)

In [48]:
def print_context(inputs):
    docs = inputs["context"]
    print("🔍 Retrieved context:\n")
    for doc in docs:
        print(doc.page_content)
        print("-" * 80)
    return inputs


In [35]:
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

query_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

## Testing the Optimized System

In [49]:
def timed_query(question: str) -> str:

    start_time = time()
    try:
        response = query_chain.invoke(question)
        elapsed = time() - start_time
        print(f"Response time: {elapsed:.2f} seconds")
        return response
    except Exception as e:
        return f"Error processing query: {str(e)}"

In [50]:

print("Query 1: What is the remote work policy?")
print(timed_query("What is the remote work policy?"))

Query 1: What is the remote work policy?
Response time: 1.14 seconds
AyataCommerce has adopted a remote-first culture, allowing employees to work flexibly. To ensure productivity, employees are encouraged to set up a designated workspace, have necessary tech, dress for work, and create a daily to-do list. For more information, refer to the relevant sections on remote work in the employee handbook.


In [38]:

print("\nQuery 2: How many sick leaves do employees get?")
print(timed_query("How many sick leaves do employees get?"))


Query 2: How many sick leaves do employees get?
Response time: 1.26 seconds
According to the provided context, employees are entitled to 12 days of Casual/Sick Leaves each holiday year, as mentioned in the "HOLIDAYS" section of the employee handbook. 

Reference: Document(metadata={'chunk_id':30, 'source': 'employee_handbook_india', 'title': 'If you are unable to...', 'length':1973, 'num_paragraphs':4, 'type': 'policy'}, page_content='If you are unable to work because of sickness or an emergency, you must ensure your manager is informed as early as possible on the first day of absence. HOLIDAYS If you work full-time on the India payroll you are entitled to12 days of Earned Leaves each holiday year,12 days of Casual/Sick Leaves.')


In [39]:

print("\nQuery 3: What are the core values of the company?")
print(timed_query("What are the core values of the company?"))


Query 3: What are the core values of the company?
Response time: 1.05 seconds
The core values of Ayatacommerce are Empathy, Trust, and Adaptability. These values govern our working practices, personal standards, and philosophies whenever we deal with Clients or each other. They are the essence of our organisational identity, acting as a guide for who we are and how we do things.


In [53]:
print(timed_query("Who is the ceo of Ayatacommerce?"))

Response time: 1.48 seconds
The CEO and Founder of AyataCommerce is Shine Mathew. This information can be found in the document titled "New website launched2019..." with chunk_id 4.
