### RAG implementation with Semantic Chunking & OpenSearch as Vector Store (with HNSW index)

In [None]:
# Load the environment variables
import os
from dotenv import load_dotenv
load_dotenv()

# Copy the API keys back into the process environment after load_dotenv().
# os.environ only accepts str values, so assigning the None that os.getenv()
# returns for an unset key raises TypeError. Unset keys are therefore skipped
# and reported instead of crashing the cell.
missing_keys = []
for key_name in ("HF_TOKEN", "GOOGLE_API_KEY", "OPENAI_API_KEY"):
    key_value = os.getenv(key_name)
    if key_value is None:
        missing_keys.append(key_name)
    else:
        os.environ[key_name] = key_value

if missing_keys:
    print("Warning: environment variables not set: " + ", ".join(missing_keys))

In [None]:
## Import the OpenAI embedding model used for semantic chunking and for converting the chunks into vectors
from langchain_openai import OpenAIEmbeddings
# Single embedding model instance for the notebook; the model name is kept
# in one place so it is easy to swap.
embedding_model_name = "text-embedding-3-large"
embeddings = OpenAIEmbeddings(model=embedding_model_name)

In [None]:
# Load the PDF document and create chunks using semantic chunking
from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker

# Location of the source PDF. The default is the original hard-coded local
# path; set the RAG_PDF_PATH environment variable to point at another file
# without editing the notebook.
FILE_PATH = os.getenv(
    "RAG_PDF_PATH",
    r"C:\Users\gauravkkumar\Documents\2025\rag\data\2407_01502v1.pdf",
)
loader = PyPDFLoader(FILE_PATH)
pages = loader.load()

# Split the loaded documents with the semantic chunker, which is driven by
# the `embeddings` model created earlier.
text_splitter = SemanticChunker(embeddings)
split_docs = text_splitter.split_documents(pages)

# Preview the first chunk. Indexing [0] on an empty list would raise
# IndexError, so report the empty case explicitly instead.
if split_docs:
    print(split_docs[0].page_content)
else:
    print(f"No chunks were produced from {FILE_PATH}")

### OpenSearch 
1. Create the client
2. Create the index
3. Create the OpenSearch vector store 
Prerequisite: Make sure OpenSearch is running locally in Docker

In [None]:
## Create OpenSearch Client
from langchain_community.vectorstores import OpenSearchVectorSearch
from opensearchpy import OpenSearch

# Connection settings for the OpenSearch node on localhost:9200.
opensearch_hosts = [{'host': 'localhost', 'port': 9200}]
opensearch_auth = ('admin', '<password>')  # Replace with your credentials

# SSL is enabled, but certificate verification is switched off.
client = OpenSearch(
    hosts=opensearch_hosts,
    http_auth=opensearch_auth,
    use_ssl=True,
    verify_certs=False,
)

In [None]:
# Embed a short probe string and show the length of the resulting vector;
# the "dimension" in the index mapping is meant to match this number.
probe_vector = embeddings.embed_query("Hello")
len(probe_vector)

In [None]:
## Create the index

index_name = "langchain-demo-index"

# Method configuration for the k-NN vector field (HNSW).
knn_method = {
    "engine": "faiss",  # or "nmslib" if desired
    "space_type": "l2",  # or "cosinesimil", etc.
    "name": "hnsw",      # or "ivf", "flat", etc. (see docs)
    "parameters": {"ef_construction": 512, "m": 16},
}

# Field mappings: a full-text `text` field and a k-NN `vector_field`.
field_mappings = {
    "text": {"type": "text"},
    "vector_field": {
        "type": "knn_vector",
        "dimension": 3072,  # Set to your embedding dimension
        "method": knn_method,
    },
}

# Complete index definition: k-NN turned on in settings, plus the mappings.
index_body = {
    "settings": {"index": {"knn": True}},
    "mappings": {"properties": field_mappings},
}

# Create the index only when it is not there yet; an existing index is
# left untouched.
index_already_exists = client.indices.exists(index=index_name)
if not index_already_exists:
    client.indices.create(index=index_name, body=index_body)

In [None]:
# Connection options for the vector store (same credentials and SSL flags
# as the low-level client).
connection_options = {
    "http_auth": ("admin", "<password>"),
    "use_ssl": True,
    "verify_certs": False,
}

# Vector store bound to `index_name`, embedding with `embeddings`.
docsearch = OpenSearchVectorSearch(
    embedding_function=embeddings,
    index_name=index_name,
    opensearch_url="https://localhost:9200",
    **connection_options,
)

In [None]:
from uuid import uuid4
# One random UUID string per chunk; these are passed as the document ids
# when the chunks are added to the vector store.
uuids = [str(uuid4()) for _ in split_docs]

In [None]:
# Add the chunks to the vector store under the pre-generated ids.
# NOTE(review): the ids are freshly generated on every run, so re-running
# this cell presumably indexes the same chunks again — confirm and clear the
# index first if duplicates matter.
docsearch.add_documents(documents=split_docs, ids=uuids)

In [None]:
# Ask OpenSearch for the document count of the index and display it.
count_response = client.count(index=index_name)
count = count_response['count']
count

In [None]:
# Query the vector store directly (no retriever, no LLM) with a sample question.
results = docsearch.similarity_search("What is agentic AI?")

In [None]:
# Display the search results.
results

In [None]:
# Tunable retrieval settings, handed to the retriever as search_kwargs.
# NOTE(review): confirm that the installed OpenSearchVectorSearch supports
# the "similarity_score_threshold" search type.
retriever_search_kwargs = {"k": 3, "score_threshold": 0.3}  # hyperparameter
retriever = docsearch.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=retriever_search_kwargs,
)

# Smoke-test the retriever with a sample query.
retriever.invoke("What is LangChain")

In [None]:
from langchain_groq import ChatGroq
# Chat model used for answer generation; the trailing expression echoes the
# model object in the notebook output.
groq_model_name = "gemma2-9b-it"
model = ChatGroq(model=groq_model_name)
model

In [None]:
from langchain import hub
# Pull the prompt published as "rlm/rag-prompt" from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Join the ``page_content`` of every document into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The page contents in input order, separated by two newline
        characters; an empty string when ``docs`` is empty.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

In [None]:
# Inputs for the prompt: "context" comes from the retriever piped through
# format_docs, "question" is the incoming value passed through unchanged.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# inputs -> prompt -> chat model -> string output parser
rag_chain = chain_inputs | prompt | model | StrOutputParser()

In [None]:
# Run the full RAG chain on a sample question; the notebook displays the answer.
rag_chain.invoke("what is Agentic AI?")