In [4]:
# Install required packages
!pip install -q langchain-huggingface chromadb langchain-community langchain-core langchain-chroma sentence-transformers groq langchain_groq python-dotenv


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import re
import os
import json
import numpy as np
from datetime import datetime
from typing import List, Dict, Any
from sklearn.metrics.pairwise import cosine_similarity
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import AzureOpenAIEmbeddings
from dotenv import load_dotenv


In [2]:
load_dotenv()

# Configuration
class Config:
    AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
    AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
    AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
    API_VERSION = "2024-02-01"
    PERSIST_DIRECTORY = "askhr_bot_vectorstore"
    COLLECTION_NAME = "askhr_bot_vectorstore_collection"
    CHUNK_SIZE = 500
    CHUNK_OVERLAP = 100

In [3]:
# Initialize embeddings
embedding_model = AzureOpenAIEmbeddings(
    model="text-embedding-3-large",
    azure_endpoint=Config.AZURE_OPENAI_ENDPOINT,
    api_key=Config.AZURE_OPENAI_API_KEY,
    openai_api_version=Config.API_VERSION
)


In [4]:

# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=Config.CHUNK_SIZE,
    chunk_overlap=Config.CHUNK_OVERLAP,
    length_function=len,
    separators=[r"\n\n", r"\n", r"\. ", " ", ""],
    keep_separator=True
)

In [5]:
def read_docs(file_path: str) -> str:
    """Read document content with error handling"""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        print(f"Error reading file: {e}")
        return ""

In [6]:
def create_initial_chunks(file_path: str) -> List[str]:
    """Create initial text chunks"""
    text = read_docs(file_path)
    if not text:
        return []
    text = re.sub(r'\s+', ' ', text).strip()
    documents = text_splitter.create_documents([text])
    return [doc.page_content for doc in documents]

In [7]:
# Configure vector store with optimized settings
persist_directory = "askhr_bot_vectorstore"
collection_name = "askhr_bot_vectorstore_collection"

vectorstore = Chroma(
    collection_name=collection_name,
    embedding_function=embedding_model,
    persist_directory=persist_directory,
    collection_metadata={"hnsw:space": "cosine"}
)

In [8]:
def create_semantic_chunks(paragraphs: List[str], 
                         similarity_threshold: float = 0.82) -> List[List[str]]:
    if not paragraphs:
        return []
    
    # Batch process all embeddings at once
    para_embeddings = embedding_model.embed_documents(paragraphs)
    para_embeddings = [np.array(e).reshape(1, -1) for e in para_embeddings]
    
    semantic_chunks = []
    current_chunk = []
    
    for i in range(len(paragraphs)):
        if not current_chunk:
            current_chunk.append(paragraphs[i])
            continue
            
        # Compare with all paragraphs in current chunk
        similarities = [cosine_similarity(para_embeddings[i], e)[0][0] 
                       for e in para_embeddings[:i]]
        max_similarity = max(similarities) if similarities else 0
        
        if max_similarity > similarity_threshold:
            current_chunk.append(paragraphs[i])
        else:
            semantic_chunks.append(current_chunk)
            current_chunk = [paragraphs[i]]
    
    if current_chunk:
        semantic_chunks.append(current_chunk)
        
    return semantic_chunks

In [9]:
def store_chunks_in_chroma(semantic_chunks: List[List[str]]) -> str:
    """Store semantic chunks in Chroma with optimized metadata."""
    if not semantic_chunks:
        return "No chunks to store."
    
    docs = []
    for idx, chunk_group in enumerate(semantic_chunks):
        combined_text = ' '.join(chunk_group).strip()
        if not combined_text:
            continue
            
        first_sentence = combined_text.split('.')[0][:100]
        word_count = len(combined_text.split())
        para_count = len(chunk_group)
        
        doc = Document(
            page_content=combined_text,
            metadata={
                "chunk_id": f"chunk_{idx}_{word_count}words",
                "source": "employee_handbook",
                "length": len(combined_text),
                "word_count": word_count,
                "para_count": para_count,
                "title": first_sentence,
                "type": "policy",
                "timestamp": datetime.now().isoformat()
            }
        )
        docs.append(doc)
    
    if docs:
        # Batch add documents
        vectorstore.add_documents(docs)
        return f"Stored {len(docs)} semantic chunks in Chroma."
    return "No valid documents to store."

In [10]:
def process_document(file_path: str):
    """Complete document processing pipeline"""
    print("Creating initial chunks...")
    paragraphs = create_initial_chunks(file_path)
    print(f"Created {len(paragraphs)} initial chunks.")

    print("Creating semantic chunks...")
    semantic_chunks = create_semantic_chunks(paragraphs)
    print(f"Created {len(semantic_chunks)} semantic chunks.")

    print("Storing in Chroma...")
    result = store_chunks_in_chroma(semantic_chunks)
    print(result)

In [11]:
# Process the document
file_path = "../docs/policies.txt"
process_document(file_path)

Creating initial chunks...
Created 72 initial chunks.
Creating semantic chunks...
Created 72 semantic chunks.
Storing in Chroma...
Stored 72 semantic chunks in Chroma.
