In [1]:
#  pip install langchain-huggingface

In [2]:
# pip install chromadb

Ingesting TXT

In [41]:
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
import requests


In [42]:
# Embedding model used both for semantic chunking and as the vector store's embedding function.
# Using a smaller, faster embedding model that still performs well
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",  # Smaller and faster than the large version
    model_kwargs={"device": "cpu"},  # Change to "cuda" if you have a GPU
    encode_kwargs={"normalize_embeddings": True}  # Normalized vectors; better for cosine similarity
)

In [43]:
# Path of the text document to ingest. create_initial_chunks() reads this
# module-level name directly rather than taking it as an argument.
file_path = "docs/policies.txt"

In [44]:
def read_docs(file_path):
    """Return the full contents of a UTF-8 text file as one string.

    Args:
        file_path: Path of the file to read.

    Returns:
        str: Everything in the file, decoded as UTF-8.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

In [None]:
# Sanity check: print the raw text to confirm the file at `file_path` loads.
print(read_docs(file_path))

Chunking

In [19]:
def create_initial_chunks():
    """Split the document at the module-level ``file_path`` on periods.

    Every piece that was followed by a period gets that period re-attached.
    Text after the last period (if any) is kept as a final piece without an
    added period. The number of pieces is printed before returning.

    NOTE: a later cell defines another function with this same name; after a
    top-to-bottom run that later definition is the one in effect.

    Returns:
        list[str]: The period-delimited pieces, in document order.
    """
    text = read_docs(file_path)
    pieces = re.split(r'\.', text)
    # All pieces except the last were terminated by a period in the source text.
    paragraphs = [piece + '.' for piece in pieces[:-1]]
    # The trailing piece had no period after it, so it is appended unchanged.
    if text and not text.endswith('.'):
        paragraphs.append(pieces[-1])
    print(f"Total paragraphs: {len(paragraphs)}\n")
    return paragraphs

In [48]:
def create_initial_chunks(chunk_size=1000, chunk_overlap=200):
    """Split the document at the module-level ``file_path`` into overlapping chunks.

    The text is read with ``read_docs`` and split with
    ``RecursiveCharacterTextSplitter``.

    NOTE: an earlier cell defines a function with this same name; this later
    definition replaces it after a top-to-bottom run.

    Args:
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter(chunk_size=...)``.
            Defaults to 1000, the value that used to be hard-coded.
        chunk_overlap: Forwarded to
            ``RecursiveCharacterTextSplitter(chunk_overlap=...)``. Defaults to
            200, the value that used to be hard-coded.

    Returns:
        list[str]: The ``page_content`` of each chunk, in splitter order.
    """
    text = read_docs(file_path)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # The splitter yields Document objects; callers only need the raw text.
    return [doc.page_content for doc in text_splitter.create_documents([text])]

In [None]:
# Build the chunks once and reuse them: every create_initial_chunks() call
# re-reads and re-splits the file.
initial_chunks = create_initial_chunks()
print(len(initial_chunks))
print(initial_chunks)

In [52]:
def create_semantic_chunks(paragraphs, similarity_threshold=0.5):
    """Group consecutive paragraphs into chunks by embedding similarity.

    Each paragraph is embedded with the module-level ``embedding_model``.
    Walking the paragraphs in order, a paragraph joins the current chunk when
    its cosine similarity to the paragraph immediately before it is above
    ``similarity_threshold``; otherwise it starts a new chunk.

    Args:
        paragraphs: List of text strings in document order.
        similarity_threshold: A paragraph is merged into the current chunk
            only when its similarity to the previous paragraph is strictly
            greater than this value. Defaults to 0.5, the value that used to
            be hard-coded.

    Returns:
        list[list[str]]: One inner list of paragraphs per chunk. Empty when
        ``paragraphs`` is empty.
    """
    # cosine_similarity operates on 2-D arrays, so each vector is kept as a (1, dim) row.
    para_embeddings = [
        np.array(embedding_model.embed_query(paragraph)).reshape(1, -1)
        for paragraph in paragraphs
    ]

    semantic_chunks = []
    for i, paragraph in enumerate(paragraphs):
        if i == 0:
            # The first paragraph has no predecessor to compare against.
            semantic_chunks.append([paragraph])
            continue

        similarity = cosine_similarity(para_embeddings[i - 1], para_embeddings[i])[0][0]
        if similarity > similarity_threshold:
            semantic_chunks[-1].append(paragraph)
        else:
            semantic_chunks.append([paragraph])

    return semantic_chunks

Vector embedding

In [53]:
# Split the document into initial chunks, then group adjacent ones by embedding similarity.
paragraphs = create_initial_chunks()
semantic_chunks = create_semantic_chunks(paragraphs)

In [34]:
# Point at a different text file and load its raw contents.
# NOTE(review): this rebinds the global `file_path` that create_initial_chunks()
# reads, and leaves `semantic_chunks` as a single string (the return value of
# read_docs) rather than a list of paragraph groups. Confirm that downstream
# cells expect that.
file_path = "docs/chunk_rec_2.txt"
semantic_chunks = read_docs(file_path)

In [None]:
# Inspect what is about to be stored.
# NOTE(review): in top-to-bottom order `semantic_chunks` was last assigned the
# raw text returned by read_docs() (a str), so len() here counts characters,
# not chunks. Confirm that is intended.
print(len(semantic_chunks))
print(semantic_chunks)

In [37]:
# On-disk location and collection name for the Chroma vector store.
persist_directory = "vectorstore_persist_3"
collection_name = "vectorstore_table2_3"

# Chroma collection that embeds documents with `embedding_model`; it is written
# to by store_chunks_in_chroma() and later wrapped as a retriever.
vectorstore = Chroma(
    collection_name=collection_name,
    embedding_function=embedding_model,
    persist_directory=persist_directory
)

In [38]:
def store_chunks_in_chroma(semantic_chunks):
    """Store semantic chunks in the module-level Chroma ``vectorstore``.

    Each chunk becomes one ``Document`` whose ``chunk_id`` metadata is the
    chunk's position in ``semantic_chunks``.

    Args:
        semantic_chunks: Iterable of chunks. A chunk may be a list of strings
            (joined with single spaces) or a plain string (used as is). A bare
            string passed for the whole argument is treated as one chunk
            instead of being iterated character by character.

    Returns:
        None. Prints how many chunks were stored.
    """
    # Iterating a bare string would yield one "chunk" per character.
    if isinstance(semantic_chunks, str):
        semantic_chunks = [semantic_chunks]

    docs = []
    for idx, chunk_group in enumerate(semantic_chunks):
        if isinstance(chunk_group, str):
            # ' '.join on a str would insert a space between every character.
            combined_text = chunk_group.strip()
        else:
            combined_text = ' '.join(chunk_group).strip()
        docs.append(Document(page_content=combined_text, metadata={"chunk_id": idx}))

    # Nothing to write for empty input, so skip the store round-trip.
    if docs:
        vectorstore.add_documents(docs)
        # NOTE(review): recent Chroma releases persist automatically and LangChain
        # marks persist() as deprecated. Confirm against the installed versions.
        vectorstore.persist()
    print(f"Stored {len(docs)} semantic chunks in Chroma.")

In [None]:
# Write the chunks to the Chroma collection.
# NOTE(review): in top-to-bottom order the argument is the raw string assigned
# from read_docs(), not the list of paragraph groups produced by
# create_semantic_chunks(). Confirm that is the intended input.
store_chunks_in_chroma(semantic_chunks)

Retrieval

In [105]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import ChatOllama
from langchain_core. runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [106]:
# Chat model served by Ollama, used for both query rewriting and answering.
# `num_predict` is the ChatOllama option that caps the number of generated tokens.
# NOTE(review): `streaming` does not appear among ChatOllama's documented
# fields. Confirm it has any effect with the installed version.
llm = ChatOllama(model="llama3.2:3b", temperature=0.1, num_predict=512, streaming=True)

In [107]:
# Prompt that asks the LLM for five newline-separated rewrites of the user's
# question. It takes a single input variable, `question`.
QUERY_PROMPT = PromptTemplate (
input_variables=["question"],
template="""You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from
a vector database. By generating multiple perspectives on the user question, your
goal is to help the user overcome some of the limitations of the distance-based
similarity search. Provide these alternative questions separated by newlines.
Original question: {question}""")

In [108]:
# Build a MultiQueryRetriever from the vector store's default retriever, the
# chat model, and QUERY_PROMPT.
retriever=MultiQueryRetriever.from_llm(vectorstore.as_retriever(), llm=llm, prompt=QUERY_PROMPT)

In [109]:
# Answering prompt: restricts the model to the retrieved context and defines a
# fixed fallback reply. Placeholders: {context} and {question}.
template = """You are a precise HR assistant that answers questions using ONLY the provided context.
        
        Always respond in a concise and professional manner. Do not reply with a bare value; always phrase the answer as a complete sentence.
        Rules:
        1. If the context contains relevant information, provide a concise answer based solely on that.
        2. If the question asks about something NOT in the context, respond ONLY with: 'I don't have this information in my knowledge base. Please contact hr@ayatacommerce.com for assistance.'
        3. Never infer or make up information not explicitly stated in the context.
        4. If the question is ambiguous or unclear, ask for clarification.
        5. Do not provide any personal opinions or subjective statements.
        6. Always maintain a professional tone and language.
        7. Avoid using filler phrases like 'I think' or 'In my opinion'.
        8. If the context is too long, summarize it before answering.
        9. If the context is too short, ask for more information.
        10. If the question is a yes/no question, provide a clear yes or no answer based on the context.
        11. If the question is a multiple-choice question, provide the best answer based on the context.
        12. If the question is a definition, provide a clear and concise definition based on the context.
        13. If the question is a comparison, provide a clear and concise comparison based on the context.
        14. If the question is a list, provide a clear and concise list based on the context.
        15. If the question is a how-to question, provide a clear and concise step-by-step guide based on the context.
        16. If the question is a why question, provide a clear and concise explanation based on the context.
        17. If the question is a when question, provide a clear and concise answer based on the context.
        Context: {context}
            
        Question: {question}
            
        Answer:"""

prompt= ChatPromptTemplate.from_template(template)

In [None]:
from langchain_core.runnables import RunnableLambda

question = "Who is the founder of the ayatacommerce?"

# Fill the multi-query prompt with the sample question.
formatted_prompt = QUERY_PROMPT.format(question=question)

# Call the chat model directly to inspect the rewrites it proposes.
queries_output = llm.invoke(formatted_prompt)

print("Generated alternative queries:\n")
# The chat model returns a message object; print only its text, not the full repr.
print(queries_output.content)


Querying

In [111]:
def _extract_question(chain_input):
    """Return the question text from a bare string or a {"question": ...} mapping."""
    if isinstance(chain_input, dict):
        return chain_input["question"]
    return chain_input


def _format_docs(docs):
    """Join the text of the retrieved documents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# RAG chain: retrieve context for the question, fill the answering prompt, call
# the chat model, and return the reply as a plain string.
# The chain accepts either a question string or a dict with a "question" key.
# Without the extraction step, a dict input would be handed whole to the
# retriever and rendered as a dict inside the prompt.
# Plain functions are coerced to runnables when piped with `|` or placed in the
# input mapping.
query = (
    {
        "context": _extract_question | retriever | _format_docs,
        "question": _extract_question,
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Ask the chain a sample question; the chain input is a dict keyed by "question".
# query.invoke({"question": "What is the policy for remote work?"})
query.invoke({"question": "Who is the founder of the ayatacommerce?"})
