In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the environment so
# the API key never has to be hard-coded in the notebook.
load_dotenv()

# os.getenv() returns None for a missing variable, and that None would be
# handed on as api_key= to the OpenAI clients created further down. Fail here
# instead, with a message that says what to fix.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to a .env file or export it "
        "in the environment before running this notebook."
    )

In [2]:
from langchain_community.document_loaders import PyPDFLoader

# Point a PDF loader at the paper and read it into a list of Documents.
loader = PyPDFLoader(file_path="attention_is_all_you_need.pdf")
pages = loader.load()

In [3]:
# Number of Document objects the loader returned.
len(pages)

15

In [4]:
# Inspect the first loaded Document (metadata plus page_content).
pages[0]

Document(metadata={'source': 'attention_is_all_you_need.pdf', 'page': 0}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.comNoam Shazeer∗\nGoogle Brain\nnoam@google.comNiki Parmar∗\nGoogle Research\nnikip@google.comJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.comAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.eduŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the

In [5]:
# Extracted text of the first Document only.
pages[0].page_content

'Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.comNoam Shazeer∗\nGoogle Brain\nnoam@google.comNiki Parmar∗\nGoogle Research\nnikip@google.comJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.comAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.eduŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with recurrence and con

In [6]:
# Metadata of the first Document: source file name and page index.
pages[0].metadata

{'source': 'attention_is_all_you_need.pdf', 'page': 0}

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter used to cut the loaded pages into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,  # number of characters shared between adjacent chunks
    chunk_size=1000,  # number of characters per chunk
)

In [8]:
# Split the loaded pages into chunks with text_splitter, then show how many
# chunks were produced.
splits = text_splitter.split_documents(pages)
len(splits)

52

In [10]:
# Spot-check a single chunk (its metadata and text).
splits[20]

Document(metadata={'source': 'attention_is_all_you_need.pdf', 'page': 5}, page_content='Table 1: Maximum path lengths, per-layer complexity and minimum number of sequential operations\nfor different layer types. nis the sequence length, dis the representation dimension, kis the kernel\nsize of convolutions and rthe size of the neighborhood in restricted self-attention.\nLayer Type Complexity per Layer Sequential Maximum Path Length\nOperations\nSelf-Attention O(n2·d) O(1) O(1)\nRecurrent O(n·d2) O(n) O(n)\nConvolutional O(k·n·d2) O(1) O(logk(n))\nSelf-Attention (restricted) O(r·n·d) O(1) O(n/r)\n3.5 Positional Encoding\nSince our model contains no recurrence and no convolution, in order for the model to make use of the\norder of the sequence, we must inject some information about the relative or absolute position of the\ntokens in the sequence. To this end, we add "positional encodings" to the input embeddings at the\nbottoms of the encoder and decoder stacks. The positional encodings 

In [11]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

embeddings_model = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

# Convert the split documents to vectors with the embedding model and store
# those vectors in a Chroma DB.
# NOTE(review): re-running this cell embeds every chunk again; confirm whether
# the same chunks then get added to the store a second time.
db = Chroma.from_documents(documents=splits, embedding=embeddings_model)

In [12]:
# Ask a question directly against the Chroma DB.
query = "What is the attention mechanism in transformers?"

# Determine which of the 52 chunks are most similar to the query.
docs = db.similarity_search(query=query)

# Show the text of the top-ranked chunk.
print(docs[0].page_content)

The Transformer uses multi-head attention in three different ways:
•In "encoder-decoder attention" layers, the queries come from the previous decoder layer,
and the memory keys and values come from the output of the encoder. This allows every
position in the decoder to attend over all positions in the input sequence. This mimics the
typical encoder-decoder attention mechanisms in sequence-to-sequence models such as
[38, 2, 9].
•The encoder contains self-attention layers. In a self-attention layer all of the keys, values
and queries come from the same place, in this case, the output of the previous layer in the
encoder. Each position in the encoder can attend to all positions in the previous layer of the
encoder.
•Similarly, self-attention layers in the decoder allow each position in the decoder to attend to
all positions in the decoder up to and including that position. We need to prevent leftward


In [13]:
# Metadata (page index and source file) of the top-ranked search result.
print(docs[0].metadata)

{'page': 4, 'source': 'attention_is_all_you_need.pdf'}


In [14]:
# The search returned the 4 documents most similar to the question.
len(docs)

4

In [15]:
# Wrapping the vector DB in a retriever lets it be used as a search tool.
retriever = db.as_retriever()

In [16]:
# Display the retriever object.
retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000021F300A6180>)

In [17]:
# Prompt
# Keeping the user's input ({input}) separate from the context ({context})
# is important!

from langchain_core.prompts import ChatPromptTemplate

# Same template text as a single multi-line literal, written one line per
# string so the <context> delimiters and placeholders are easy to see.
template = (
    "Answer the question based only on the following context:\n"
    "<context>\n"
    "{context}\n"
    "</context>\n"
    "\n"
    "Question: {input}\n"
)

prompt = ChatPromptTemplate.from_template(template)

In [18]:
# Display the prompt object to check its input variables and template.
prompt

ChatPromptTemplate(input_variables=['context', 'input'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'input'], template='Answer the question based only on the following context:\n<context>\n{context}\n</context>\n\nQuestion: {input}\n'))])

In [19]:
from langchain_openai import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

model = ChatOpenAI(
    model='gpt-3.5-turbo-0125',
    temperature=0,
    api_key=OPENAI_API_KEY,
)

# A chain is needed, so first bind the model and the prompt together.
document_chain = create_stuff_documents_chain(llm=model, prompt=prompt)

# Then bind the retriever and document_chain together.
retriever_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)

# Run the whole pipeline for one question.
response = retriever_chain.invoke(
    {"input": "what is the attention mechanism in transformers?"}
)


In [20]:
# Full chain output: the original input, the retrieved context Documents,
# and the generated answer.
response

{'input': 'what is the attention mechanism in transformers?',
 'context': [Document(metadata={'page': 4, 'source': 'attention_is_all_you_need.pdf'}, page_content='The Transformer uses multi-head attention in three different ways:\n•In "encoder-decoder attention" layers, the queries come from the previous decoder layer,\nand the memory keys and values come from the output of the encoder. This allows every\nposition in the decoder to attend over all positions in the input sequence. This mimics the\ntypical encoder-decoder attention mechanisms in sequence-to-sequence models such as\n[38, 2, 9].\n•The encoder contains self-attention layers. In a self-attention layer all of the keys, values\nand queries come from the same place, in this case, the output of the previous layer in the\nencoder. Each position in the encoder can attend to all positions in the previous layer of the\nencoder.\n•Similarly, self-attention layers in the decoder allow each position in the decoder to attend to\nall pos

In [21]:
# Only the generated answer text.
response['answer']

'The attention mechanism in transformers allows every position in the decoder to attend over all positions in the input sequence, as well as enabling each position in the encoder to attend to all positions in the previous layer of the encoder. Additionally, self-attention layers in the decoder allow each position to attend to all positions in the decoder up to and including that position.'