In [4]:
from langchain_community.vectorstores import Chroma
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

class VectorStore:
    """Small convenience wrapper around a persistent Chroma store.

    The wrapper owns a HuggingFace embedding model and a Chroma instance that
    uses it, and exposes just the two operations the notebook needs: adding
    documents and running a similarity search.
    """

    def __init__(self, path, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        """Create the embedding model and open the Chroma store.

        Args:
            path: Directory handed to Chroma as ``persist_directory``.
            model_name: Model name handed to ``HuggingFaceEmbeddings``. The
                default is the value this class previously hard-coded, so
                existing ``VectorStore(path=...)`` calls behave as before.
        """
        # Embedding function used by the Chroma store created below.
        self.embeddings = HuggingFaceEmbeddings(model_name=model_name)

        # Underlying LangChain store; kept public because the notebook builds
        # a retriever directly from it.
        # NOTE(review): the recorded notebook output shows a warning raised at
        # this Chroma(...) call — presumably a deprecation notice for the
        # langchain_community class; confirm before upgrading dependencies.
        self.vector_store = Chroma(
            persist_directory=path,
            embedding_function=self.embeddings,
        )

    def add_documents(self, documents):
        """Add documents to the underlying store.

        Args:
            documents: Iterable of documents accepted by the store's
                ``add_documents``. ``None`` or an empty iterable is a no-op.

        Returns:
            None.
        """
        if documents is None:
            return
        # Materialise once so one-shot iterators (generators) are safe to pass
        # on, and so emptiness can be checked without consuming the input.
        batch = list(documents)
        if not batch:
            # Nothing to index; skip the backend call entirely instead of
            # sending it an empty batch.
            return
        self.vector_store.add_documents(batch)

    def similarity_search(self, query, k=4):
        """Return the store's ``similarity_search`` result for ``query``.

        Args:
            query: Text to search for.
            k: Number of results requested from the store (default 4).

        Returns:
            Whatever the underlying store's ``similarity_search`` returns.
        """
        return self.vector_store.similarity_search(query, k=k)

In [5]:
# Instantiate the wrapper class; "./chroma_db" is forwarded to Chroma as its persist directory.
vector_store = VectorStore(path="./chroma_db")

  from .autonotebook import tqdm as notebook_tqdm
  self.vector_store = Chroma(


In [6]:
from langchain_community.document_loaders import PyPDFLoader
from dotenv import load_dotenv
# Load variables from a local .env file into the environment before anything else runs.
load_dotenv()

# Location of the paper, relative to the notebook's working directory.
file_path = "../data/attention_is_all_you_need_Paper.pdf"

# Build the PDF loader for that file, then read it into `docs`.
loader = PyPDFLoader(file_path=file_path)
docs = loader.load()

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configuration: chunk size 1000 with an overlap of 200 between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

# Break the loaded documents into chunks ready for indexing.
chunks = text_splitter.split_documents(documents=docs)

In [9]:
# Display the first chunk to sanity-check the result of the split.
chunks[0]

Document(metadata={'source': '../data/attention_is_all_you_need_Paper.pdf', 'page': 0}, page_content='Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser ∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks s

In [10]:
# Index every chunk in the vector store.
# NOTE(review): the store was opened with a persist directory (./chroma_db), so
# re-running this cell presumably adds the same chunks a second time — confirm,
# and clear the directory before a full re-run if duplicates show up in results.
vector_store.add_documents(chunks)

In [14]:
# Retrieve the chunks most similar to the question; k is left at the wrapper's default of 4.
response=vector_store.similarity_search("What is attention mechanism?")

In [20]:
# Display the raw search result.
response

[Document(metadata={'source': '../data/attention_is_all_you_need_Paper.pdf', 'page': 3}, page_content='we found it beneﬁcial to linearly project the queries, keys and values htimes with different, learned\nlinear projections to dk, dk and dv dimensions, respectively. On each of these projected versions of\nqueries, keys and values we then perform the attention function in parallel, yielding dv-dimensional\noutput values. These are concatenated and once again projected, resulting in the ﬁnal values, as\ndepicted in Figure 2.\nMulti-head attention allows the model to jointly attend to information from different representation\nsubspaces at different positions. With a single attention head, averaging inhibits this.\n4To illustrate why the dot products get large, assume that the components of q and k are independent random\nvariables with mean 0 and variance 1. Then their dot product, q · k = ∑dk\ni=1 qiki, has mean 0 and variance dk.\n4'),
 Document(metadata={'source': '../data/attention_

In [21]:
# Print each retrieved document's text under a 1-based heading, followed by a divider line.
for rank, hit in enumerate(response, start=1):
    print(f"Document {rank}:", hit.page_content, "-" * 50, sep="\n")

Document 1:
we found it beneﬁcial to linearly project the queries, keys and values htimes with different, learned
linear projections to dk, dk and dv dimensions, respectively. On each of these projected versions of
queries, keys and values we then perform the attention function in parallel, yielding dv-dimensional
output values. These are concatenated and once again projected, resulting in the ﬁnal values, as
depicted in Figure 2.
Multi-head attention allows the model to jointly attend to information from different representation
subspaces at different positions. With a single attention head, averaging inhibits this.
4To illustrate why the dot products get large, assume that the components of q and k are independent random
variables with mean 0 and variance 1. Then their dot product, q · k = ∑dk
i=1 qiki, has mean 0 and variance dk.
4
--------------------------------------------------
Document 2:
convolution is equal to the combination of a self-attention layer and a point-wise feed-fo

In [22]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt text for the question-answering chain. It has two placeholders,
# {question} and {context}, and ends with a trailing newline after "Answer:".
template = (
    "You are an assistant for question-answering tasks.\n"
    "Use the following pieces of retrieved context to answer the question.\n"
    "If you don't know the answer, just say that you don't know.\n"
    "Use ten sentences maximum and keep the answer concise.\n"
    "Question: {question}\n"
    "Context: {context}\n"
    "Answer:\n"
)

In [23]:
# Build a chat prompt from the template string; the bare name on the last line displays it.
prompt=ChatPromptTemplate.from_template(template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks.\nUse the following pieces of retrieved context to answer the question.\nIf you don't know the answer, just say that you don't know.\nUse ten sentences maximum and keep the answer concise.\nQuestion: {question}\nContext: {context}\nAnswer:\n"))])

In [24]:
from langchain_core.output_parsers import StrOutputParser
# Parser used as the last step of the RAG chain built later in the notebook.
output_parser=StrOutputParser()

In [26]:
# Build a retriever from the Chroma store held inside the wrapper (hence the
# double `vector_store`); k=4 matches the wrapper's similarity_search default.
retriever=vector_store.vector_store.as_retriever(search_kwargs={"k":4})

In [25]:
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from dotenv import load_dotenv
# Load variables from a local .env file into the environment.
# NOTE(review): presumably this is where the Hugging Face API token comes from — confirm.
load_dotenv()

# Hosted text-generation endpoint for the chosen model repository.
llm = HuggingFaceEndpoint(repo_id="openai/gpt-oss-20b", task="text-generation")

# Chat-style interface layered on top of the endpoint.
model = ChatHuggingFace(llm=llm)

In [27]:
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Join the page text of the retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The page contents separated by blank lines ("" when there are none).
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Retrieval-augmented chain: the incoming question is sent both to the
# retriever and, unchanged, to the prompt's {question} slot.
# The retriever result is piped through format_docs so that {context} receives
# plain text. Without that step the prompt is filled with the Python repr of
# the document list (metadata and escaped newlines included).
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | output_parser
)

In [28]:
# Run the full RAG chain on a question.
# NOTE(review): this rebinds `response`, which earlier held the similarity-search
# result; re-running the earlier printing cell after this one will not work as
# before — consider a distinct name such as `answer`.
response=rag_chain.invoke("whats title of the paper?")

In [30]:
# Check the type of the chain's result (the recorded output is `str`).
type(response)

str