### RAG implemenation with Symantic Chunking & OpenSearch as Vector Store (with hsnw index)

In [19]:
# Load the enviornment variables
import os
from dotenv import load_dotenv
load_dotenv()

os.environ["HF_TOKEN"]=os.getenv("HF_TOKEN")
os.environ["GOOGLE_API_KEY"]=os.getenv("GOOGLE_API_KEY")
os.environ["OPENAI_API_KEY"]=os.getenv("OPENAI_API_KEY")

In [None]:
## Import Open AI embedding model used for symantic chunking & while convetring the chunks in vectors
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [21]:
# Load the pdf document and create chunks using symantic chunking 
from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker

FILE_PATH=r"C:\Users\gauravkkumar\Documents\2025\rag\data\2407_01502v1.pdf"
loader=PyPDFLoader(FILE_PATH)
pages = loader.load()

text_splitter = SemanticChunker(embeddings)
split_docs = text_splitter.split_documents(pages)

print(split_docs[0].page_content)

AI Agents That Matter
Sayash Kapoor∗, Benedikt Stroebl ∗, Zachary S. Siegel, Nitya Nadgir, Arvind Narayanan
Princeton University
July 2, 2024
Abstract
AI agents are an exciting new research direction, and agent development is driven
by benchmarks. Our analysis of current agent benchmarks and evaluation practices
reveals several shortcomings that hinder their usefulness in real-world applications. First, there is a narrow focus on accuracy without attention to other metrics. As
a result, SOTA agents are needlessly complex and costly, and the community has
reached mistaken conclusions about the sources of accuracy gains. Our focus on
cost in addition to accuracy motivates the new goal of jointly optimizing the two
metrics. We design and implement one such optimization, showing its potential
to greatly reduce cost while maintaining accuracy. Second, the benchmarking
needs of model and downstream developers have been conflated, making it hard
to identify which agent would be best suited fo

### OpenSearch 
1. Create the client
2. Create the index
3. Create the OpenSearch vector store 
Pre-requiste: Make sure Opensearch is running in Docker locally

In [22]:
## Create OpenSearch Clinet 
from langchain_community.vectorstores import OpenSearchVectorSearch
from opensearchpy import OpenSearch

client = OpenSearch(
    hosts=[{'host': 'localhost', 'port': 9200}],
    http_auth=('admin', 'Ishana@0223'),  # Replace with your credentials
    use_ssl=True,
    verify_certs=False
)



In [23]:
len(embeddings.embed_query("Hello"))

3072

In [24]:
index_name = "langchain-demo-index"

index_body = {
    "settings": {
        "index": {
            "knn": True
        }
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "vector_field": {
                "type": "knn_vector",
                "dimension": 3072,  # Set to your embedding dimension
                "method": {
                    "engine": "faiss",  # or "nmslib" if desired
                    "space_type": "l2",  # or "cosinesimil", etc.
                    "name": "hnsw",      # or "ivf", "flat", etc. (see docs)
                    "parameters": {
                        "ef_construction": 512,
                        "m": 16
                    }
                }
            }
        }
    }
}

if not client.indices.exists(index=index_name):
    client.indices.create(index=index_name, body=index_body)



In [28]:
docsearch = OpenSearchVectorSearch(embedding_function=embeddings, 
                                   index_name=index_name,
                                   opensearch_url="https://localhost:9200", 
                                    http_auth=("admin", "Ishana@0223"),
                                    use_ssl = True,
                                    verify_certs = False, 
)



In [29]:
from uuid import uuid4
uuids = [str(uuid4()) for _ in range(len(split_docs))]

In [30]:
docsearch.add_documents(documents=split_docs, ids=uuids)



['a8087f5b-0e2c-416f-9d5a-fc6985c9318b',
 'd1948fa7-a34f-455c-aae1-63d1562a0403',
 'dec55ca4-ce99-4938-a967-98a3f0639780',
 'eaf9a7d5-0908-4517-8b63-78a7e51f5254',
 '087c5689-bf09-473e-b789-872a10017606',
 '4d72bf77-c3f5-4b37-ac9d-ff2aaa0ee346',
 '04e2e1d3-badb-45be-9513-6128f5d239a4',
 'cce5cb13-87b0-47ca-b6f6-cb6be744a7a8',
 'f511aa0a-b20d-4bfd-afff-d48ba73ea1b1',
 'ccef1c58-dd51-4e04-8856-286a281c125a',
 '37c4321d-a61a-4115-8aac-e8b8d11e24bd',
 'b439ba81-69d7-488d-906b-61473c3c9e4a',
 'da03928c-6ba1-4f08-9391-7fc29ab59b8e',
 'ad1eaba2-2673-426d-af93-f42dbee8628f',
 '84496bba-1b81-487c-a72f-6a55ea57cd31',
 'c417fb94-5a8d-41db-ab70-f213704a48c8',
 '47959b86-c4c0-4a4e-88c5-0860b93be32d',
 '21a7cdb0-17c3-4629-8188-6abd2669e930',
 '1ad830b8-735a-4ff1-811e-3573347c1dbd',
 '07b31100-94eb-46ed-893f-637ee98fbd55',
 '9552f107-5498-48aa-b0f4-40906007b21f',
 'bd0923c7-7a2e-4751-ad93-ab96eed0b5bc',
 '37ff4f86-89d8-48f8-aee6-75988cd8fa79',
 '850f9186-9c2f-428c-8f39-541ef1a57610',
 '59123973-bfb0-

In [31]:
count = client.count(index=index_name)['count']
count



102