# VectorDB Pipeline 

In [1]:
import os

import faiss
import numpy as np
from dotenv import load_dotenv
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
# from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [2]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# load_dotenv() has already exported HF_TOKEN when it exists, so re-assigning it
# is a no-op, while `os.environ[...] = None` raises TypeError when it is missing.
# Warn instead of crashing.
if not os.getenv("HF_TOKEN"):
    print("Warning: HF_TOKEN is not set; Hugging Face requests will be unauthenticated.")

In [4]:
# Embedding model used by the similarity and FAISS cells below. It was commented
# out, so those cells only worked with a leftover kernel variable.
hf_embedd = HuggingFaceEmbeddings(model="all-MiniLM-L6-v2")

## Cosine and other similarities

In [None]:
# Four short sentences used to illustrate similarity scores.
documents = [
    "i live in India",
    "America has lots of indian",
    "cricket is the favourite sports in India",
    "Donald Trump is the president of America",
]

# One embedding vector per sentence, in the same order as `documents`.
doc_embeddings = hf_embedd.embed_documents(texts=documents)

In [None]:
# Query compared against the document embeddings; swap in the commented line to try another one.
# my_query = "who is the president of America?"
my_query = "which sport is most followed in India?"

In [None]:
# Embed the query with the same model that produced `doc_embeddings`.
my_embedded_query = hf_embedd.embed_query(text=my_query)

In [None]:
# Cosine similarity of the query against every document embedding.
# Computed with NumPy because `cosine_similarity` is never imported in this notebook.
# The result keeps the (1, n_documents) layout of the original call.
query_matrix = np.asarray([my_embedded_query], dtype=float)
doc_matrix = np.asarray(doc_embeddings, dtype=float)
(query_matrix @ doc_matrix.T) / (
    np.linalg.norm(query_matrix, axis=1, keepdims=True)
    * np.linalg.norm(doc_matrix, axis=1)
)

array([[0.36495206, 0.39527318, 0.74041876, 0.08454546]])

In [None]:
# Euclidean (L2) distance from the query to every document embedding.
# Computed with NumPy because `euclidean_distances` is never imported in this notebook.
# The result keeps the (1, n_documents) layout of the original call.
query_matrix = np.asarray([my_embedded_query], dtype=float)
doc_matrix = np.asarray(doc_embeddings, dtype=float)
np.linalg.norm(doc_matrix - query_matrix, axis=1)[np.newaxis, :]

array([[1.12698533, 1.09975159, 0.72052931, 1.35311094]])

## Vector Store - FAISS

In [49]:
# Build the FAISS L2 index that backs the vector store.
# NOTE(review): 384 is presumably the vector width of the embedding model — confirm it matches.
EMBEDDING_DIM = 384
index_1 = faiss.IndexFlatL2(EMBEDDING_DIM)

In [50]:
# Display the index object to confirm it was constructed.
index_1

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x0000017AAD694DE0> >

In [52]:
# Wire the empty index to an in-memory docstore. The position-to-id map starts
# empty and is populated once texts are added.
vector_store = FAISS(
    index=index_1,
    embedding_function=hf_embedd,
    index_to_docstore_id={},
    docstore=InMemoryDocstore(),
)

In [53]:
# Corpus stored in the FAISS vector store (rebinds `documents` to a seven-sentence list).
documents = [
    "i live in India",
    "America has lots of indian",
    "cricket is the favourite sports in India",
    "Donald Trump is the president of America",
    "AI is future",
    "AI is powerful",
    "Doga are cute",
]

In [56]:
# Embed and store every sentence; the returned list holds the generated document ids.
# NOTE(review): re-running this cell presumably inserts the same texts again under new ids — confirm.
vector_store.add_texts(texts=documents)

['2d2759aa-22b3-4d9f-8252-21ea0a22a3a2',
 'd4248712-12b2-43f0-a7e0-b069ad2a7625',
 '04accc36-f45e-4899-a6f2-8bbd60159c1d',
 'd4cc1900-6883-4885-88dc-43188bf10115',
 '6eea7927-05e2-4082-817c-eba5ab4b4050',
 'bdd6b914-7086-44d5-a6e5-27e026355205',
 '1b452c26-b2a9-4f68-babb-fee5f516af22']

In [57]:
# Inspect the map from index position to the docstore id of each added text.
vector_store.index_to_docstore_id

{0: '2d2759aa-22b3-4d9f-8252-21ea0a22a3a2',
 1: 'd4248712-12b2-43f0-a7e0-b069ad2a7625',
 2: '04accc36-f45e-4899-a6f2-8bbd60159c1d',
 3: 'd4cc1900-6883-4885-88dc-43188bf10115',
 4: '6eea7927-05e2-4082-817c-eba5ab4b4050',
 5: 'bdd6b914-7086-44d5-a6e5-27e026355205',
 6: '1b452c26-b2a9-4f68-babb-fee5f516af22'}

In [60]:
# Return the two stored sentences closest to the question.
vector_store.similarity_search(query="who is president of America", k=2)

[Document(id='d4cc1900-6883-4885-88dc-43188bf10115', metadata={}, page_content='Donald Trump is the president of America'),
 Document(id='d4248712-12b2-43f0-a7e0-b069ad2a7625', metadata={}, page_content='America has lots of indian')]

## RAG Pipeline

In [3]:
from langchain import hub
from langchain_groq.chat_models import ChatGroq
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

In [4]:
# load_dotenv() has already exported these keys when they exist, so re-assigning
# them is a no-op, while `os.environ[key] = None` raises TypeError for a missing
# one. Report what is missing instead of failing with an opaque error.
# NOTE(review): no cell visible here uses OPENAI_API_KEY — confirm it is still needed.
_expected_keys = (
    "OPENAI_API_KEY",
    "GROQ_API_KEY",
    "LANGCHAIN_PROJECT",
    "LANGCHAIN_API_KEY",
)
_missing_keys = [key for key in _expected_keys if not os.getenv(key)]
if _missing_keys:
    print(f"Warning: missing environment variables: {', '.join(_missing_keys)}")

# Turn tracing on only when an API key is available to authenticate it.
if os.getenv("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_TRACING_V2"] = "true"

In [5]:
# pdf_path = r"../data/Attention_Paper.pdf"
# pdf_path = r"11-5-2025\data\Attention_Paper.pdf"
# pdf_path = r"C:\Users\deepak.a.dhiman\projects\agentic_ai\11-5-2025\data\Attention_Paper.pdf"

In [6]:
# loader = PyPDFLoader(file_path=pdf_path)
# data = loader.load()

In [7]:
# print(type(data), len(data))
# data[:4]

In [8]:
# doc_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# chunks = doc_splitter.split_documents(documents=data)

In [9]:
# chunks[:3]

In [10]:
# # creating index for FAISS

# index = faiss.IndexFlatL2(384)

# vector_db = FAISS(
#     embedding_function=hf_embedd,
#     index=index,
#     docstore=InMemoryDocstore(),
#     index_to_docstore_id={}
# )

In [11]:
# vector_db.add_documents(chunks)

In [12]:
# creating retriever

# retreiver = vector_db.as_retriever(search_kwargs={'k':3})

In [13]:
# retreiver.invoke("what is attention?")

In [14]:
# Storing the vector DB locally
# vector_db.save_local("faiss_db_attention_pdf")

In [15]:
# Embedding model passed to the loader so the saved index can be queried.
hf_embedd = HuggingFaceEmbeddings(model="all-MiniLM-L6-v2")

# SECURITY: allow_dangerous_deserialization=True opts in to pickle-based loading.
# Only load an index directory you created yourself; never one from an untrusted source.
# NOTE(review): the path is relative to the kernel's working directory, and the
# cells that build and save this index are commented out above.
local_vector_db = FAISS.load_local(
    folder_path="faiss_db_attention_pdf",
    embeddings=hf_embedd,
    allow_dangerous_deserialization=True,
)

  from .autonotebook import tqdm as notebook_tqdm


In [32]:
# local_vector_db.similarity_search("what is attention mechanism?")

# The number of results must be passed through `search_kwargs`. A bare `k=3`
# keyword is not forwarded to the search, so the retriever kept its default count.
retreiver = local_vector_db.as_retriever(search_kwargs={"k": 3})

In [33]:
# Chat model that generates the final answer in the RAG chain; responses are capped at 2048 tokens.
llm = ChatGroq(max_tokens=2048, model="gemma2-9b-it")

In [34]:
# defining prompt
# Pull the "rlm/rag-prompt" template from the LangChain hub. The printed repr
# shows that it expects `context` and `question` inputs.
prompt = hub.pull("rlm/rag-prompt")
print(prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


In [35]:
# Final chain step: turns the chat model's output into a plain string.
parser = StrOutputParser()

def format_docs(docs):
    """Join the `page_content` of each retrieved document into one context string.

    Args:
        docs: Iterable of objects exposing a `page_content` attribute.

    Returns:
        The page contents in input order, separated by a blank line.
    """
    contents = [entry.page_content for entry in docs]
    separator = "\n\n"
    return separator.join(contents)

In [50]:
# Input stage: the question is sent to the retriever (whose documents are joined
# into one context string) and is also passed through unchanged as `question`.
rag_inputs = {"context": retreiver | format_docs, "question": RunnablePassthrough()}

# Full pipeline: build the prompt inputs, fill the prompt, call the model, parse to text.
chain = rag_inputs | prompt | llm | parser

In [51]:
# Other questions tried against the same chain:
# result = chain.invoke("what is the role of K,Q and V vectors?")
# result = chain.invoke("who all contributed in this research paper?")
question = "get me the email ids of contributors of this paper."
result = chain.invoke(question)

In [52]:
# print(result.model_dump()['messages'][0]["content"])
# The chain ends with the string parser, so `result` is printed directly.
print(result)

The email addresses of the contributors are: avaswani@google.com, noam@google.com, nikip@google.com, usz@google.com, lllion@google.com, and aidan@cs.toronto.edu. 
Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, and Aidan N. Gomez are the authors of the paper. 


