<a href="https://colab.research.google.com/github/githubrohan/RAG/blob/main/Atlas_Vector_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install --quiet pymongo langchain langchain_community langchain_mongodb langchain_huggingface pypdf sentence_transformers

In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_mongodb import MongoDBAtlasVectorSearch
from pymongo import MongoClient
import warnings

import getpass
import os

# Ignore warnings
warnings.filterwarnings('ignore')

# Load the PDF
loader = PyPDFLoader("https://investors.mongodb.com/node/12236/pdf")
data = loader.load()

# Split the data into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20)
docs = text_splitter.split_documents(data)

# Load the embedding model (https://huggingface.co/nomic-ai/nomic-embed-text-v1)
model = HuggingFaceEmbeddings(model_name="nomic-ai/nomic-embed-text-v1", model_kwargs={ "trust_remote_code": True })

# Connect to your Atlas cluster.
# The connection string carries a username and password, so it must not live in
# the notebook: take it from the MONGODB_ATLAS_URI environment variable and fall
# back to a hidden prompt when that variable is unset or empty.
# SECURITY: any credentials previously committed in this cell should be rotated.
atlas_uri = os.environ.get("MONGODB_ATLAS_URI") or getpass.getpass("MongoDB Atlas connection string: ")
client = MongoClient(atlas_uri)
collection = client["rag_db"]["test"]

# Store the data as vector embeddings in Atlas
# NOTE(review): this presumably inserts the chunks on every execution, so
# re-running the cell would add duplicates to the collection — confirm.
# NOTE(review): assumes an Atlas Vector Search index named "vector_index"
# exists on this collection; nothing in this notebook creates it — confirm.
vector_store = MongoDBAtlasVectorSearch.from_documents(
    documents = docs,
    embedding = model,
    collection = collection,
    index_name = "vector_index"
)



In [6]:
# Wrap the Atlas vector store in a retriever that uses similarity search
retriever = vector_store.as_retriever(search_type="similarity")

# Try a sample query; the trailing expression displays the retrieved documents
retriever.invoke(
    "AI technology"
)

[Document(metadata={'_id': '66c43d544f61bb2f5c62b489', 'source': 'https://investors.mongodb.com/node/12236/pdf', 'page': 0}, page_content="more of our customers. We also see a tremendous opportunity to win more legacy workloads, as AI has now become a catalyst to modernize these\napplications. MongoDB's  document-based architecture is particularly well-suited for the variety and scale of data required by AI-powered applications."),
 Document(metadata={'_id': '66c43d544f61bb2f5c62b4a5', 'source': 'https://investors.mongodb.com/node/12236/pdf', 'page': 1}, page_content='artificial intelligence, in our offerings or partnerships; the growth and expansion of the market for database products and our ability to penetrate that\nmarket; our ability to integrate acquired businesses and technologies successfully or achieve the expected benefits of such acquisitions; our ability to'),
 Document(metadata={'_id': '66c43d544f61bb2f5c62b492', 'source': 'https://investors.mongodb.com/node/12236/pdf', '

In [7]:
from langchain_huggingface import HuggingFaceEndpoint
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
import os

import getpass

# Authenticate to your Hugging Face account.
# Access tokens are secrets, so none is stored in the notebook: an HF_TOKEN
# already present in the environment is used as-is, otherwise the user is
# prompted for one without echoing it.
# SECURITY: any token previously committed in this cell should be revoked.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass.getpass("Hugging Face access token: ")

# Access the LLM (https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.2")

# Create prompt and RAG workflow
prompt = PromptTemplate.from_template("""
   Answer the following question based on the given context.

   Question: {question}
   Context: {context}
""")

# The retriever fills {context} from the question, while RunnablePassthrough
# forwards the raw question string into {question}.
rag_chain = (
   { "context": retriever, "question": RunnablePassthrough()}
   | prompt
   | llm
   | StrOutputParser()
)

# Prompt the LLM
question = "In a few sentences, what are MongoDB's latest AI announcements?"
answer = rag_chain.invoke(question)
print(answer)


   Answer: MongoDB recently announced the MongoDB AI Applications Program (MAAP) to expand its AI ecosystem. The company also sees an opportunity to win more legacy workloads due to the role of AI in modernizing applications. MongoDB's document-based architecture is particularly well-suited for the data requirements of AI-powered applications. These announcements were made at MongoDB.local NYC.
