In [2]:
# PDF reader: load the paper from the local file "attention.pdf".
# `docs` is whatever PyPDFLoader.load() returns; it is split into chunks later.
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("attention.pdf")
docs = loader.load()

In [3]:
# Split the loaded PDF documents into overlapping chunks for embedding.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,     # chunk size handed to the splitter
    chunk_overlap=200,   # overlap shared by neighbouring chunks
)
documents = text_splitter.split_documents(docs)

# Preview only the first five chunks.
documents[:5]

[Document(page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.comNoam Shazeer∗\nGoogle Brain\nnoam@google.comNiki Parmar∗\nGoogle Research\nnikip@google.comJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.comAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.eduŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing 

In [None]:
# Report how many chunks the splitter produced instead of echoing the whole
# list, which would flood the cell output with the text of every chunk.
len(documents)

In [4]:
# Vector embedding and vector storage.
# Embeddings are produced by the Ollama "dolphin-llama3" model.
# OpenAIEmbeddings from the same module can be passed to
# Chroma.from_documents instead of the Ollama embeddings.
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

embedding_model = OllamaEmbeddings(model="dolphin-llama3")

# Only the first 20 chunks are indexed, so anything retrieved from `db`
# comes from those chunks; later chunks are never embedded.
db = Chroma.from_documents(documents[:20], embedding_model)

In [None]:
# Chroma vector database: run a similarity search directly against `db`
# and display the text of the first chunk it returns.
query = "Who are the authors of the research paper, Attention is All You Need?"
result = db.similarity_search(query)

result[0].page_content

In [5]:
# Create the Ollama LLM wrapper used by the chains below, then display it.
from langchain_community.llms import Ollama

llm = Ollama(model="dolphin-llama3")

llm

Ollama(model='dolphin-llama3')

In [6]:
## Design ChatPrompt Template
# The template has two placeholders: {context} and {input}.
from langchain_core.prompts import ChatPromptTemplate

prompt_text = """
Answer the following question based only on the provided context. 
Think step by step before providing a detailed answer. 
I will tip you $1000 if the user finds the answer helpful. 
<context>
{context}
</context>
Question: {input}"""

prompt = ChatPromptTemplate.from_template(prompt_text)

In [7]:
## Chain Introduction
## Create Stuff Document Chain
# Build the document chain from the LLM and the prompt template; it is later
# handed to create_retrieval_chain together with the retriever.
from langchain.chains.combine_documents import create_stuff_documents_chain

document_chain = create_stuff_documents_chain(llm, prompt)

In [8]:
# Retrievers: a retriever is an interface that returns documents given an
# unstructured query. It is more general than a vector store. A retriever
# does not need to be able to store documents, only to return (or retrieve)
# them. Vector stores can be used as the backbone of a retriever, but there
# are other types of retrievers as well.
# https://python.langchain.com/docs/modules/data_connection/retrievers/

# Expose the Chroma store through the retriever interface and display it.
retriever = db.as_retriever()

retriever

VectorStoreRetriever(tags=['Chroma', 'OllamaEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7af4d2f40e80>)

In [9]:
# Retrieval chain: this chain takes in a user inquiry, which is then passed
# to the retriever to fetch relevant documents. Those documents (and the
# original inputs) are then passed to an LLM to generate a response.
# https://python.langchain.com/docs/modules/chains/
from langchain.chains import create_retrieval_chain

retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [10]:
# Run the retrieval chain; the query text is passed under the "input" key.
response = retrieval_chain.invoke(
    {"input": "Scaled Dot-Product Attention"}
)

In [11]:
# Display the generated answer stored under the "answer" key of the response.
response["answer"]

'Scaled Dot-Product Attention is a specific type of attention mechanism used in the Transformer model. It takes input queries and keys of dimension dk, and values of dimension dv. The attention is computed by taking the dot products of the query with all keys, dividing each result by √dk, and applying a softmax function to obtain the weights on the values. In practice, this attention function is applied simultaneously to a matrix of queries (Q), along with their corresponding key (K) and value (V) matrices. The resulting matrix of outputs can be computed as: Attention(Q, K, V) = softmax(QKT/√dk)V.'