In [1]:
!pip install langchain langchain-openai pypdf unstructured pdfminer.six pdf2image pillow_heif unstructured_inference pikepdf chromadb lark

import os
os.environ["OPENAI_API_KEY"] = "YOUR API KEY"



In [4]:
from langchain.document_loaders import OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma


# Chunking policy: pieces of up to 2000 characters with a 200-character overlap,
# trying the separators in order (paragraph break, line break, space, then any
# character).
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=200,
    chunk_size=2000,
)

# Fetch the paper, then cut the loaded documents into chunks.
loader = OnlinePDFLoader("https://arxiv.org/pdf/1706.03762.pdf")
data = recursive_splitter.split_documents(loader.load())

# Embed every chunk and index it in Chroma; print how many entries were stored.
embedding_model = OpenAIEmbeddings()
db = Chroma.from_documents(data, embedding_model)
print(db._collection.count())

23


# Maximal Marginal Relevance

When we first searched our vector db we used traditional similarity search, but this has some drawbacks.

Sometimes a document provides useful information for our query, but the rest of its content is unusual enough that the document as a whole does not score as very similar.

For example, imagine that our arXiv paper had a section about the transformer encoder and how it was used to solve a particular problem in physics, and that the section then went off explaining the physics problem, so the majority of the text was about physics. This section wouldn't rank as high in our similarity search as the document we saw last, but it would be an interesting insight for our question about the Encoder.

In [5]:
question = "What does the Encoder portion of a transformer do?"

# MMR works in two stages: fetch_k is how many of the most similar chunks are
# pulled as candidates, and k is how many of those candidates are returned once
# diversity among them is taken into account.
docs = db.max_marginal_relevance_search(question, fetch_k=3, k=2)

# Print each returned chunk followed by a divider line.
for doc in docs:
    print(doc.page_content, "---------------------------------------------", sep="\n")

3 Model Architecture

Most competitive neural sequence transduction models have an encoder-decoder structure [5, 2, 35]. Here, the encoder maps an input sequence of symbol representations (x1, ..., xn) to a sequence of continuous representations z = (z1, ..., zn). Given z, the decoder then generates an output sequence (y1, ..., ym) of symbols one element at a time. At each step the model is auto-regressive [10], consuming the previously generated symbols as additional input when generating the next.

2

Figure 1: The Transformer - model architecture.

The Transformer follows this overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1, respectively.

3.1 Encoder and Decoder Stacks

Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- wis

In this example, MMR gave us the same first document as similarity search, but it also included the abstract, which contains the people involved and a general overview of the entire project surrounding the encoder.

This extra context could prove to be useful for answering questions about the encoder because we may need to connect it to the project at large.

## Querying Using MetaData

We can query our database using the metadata field as well.

In [6]:
# Add the URL to a few documents and query for those

# Tag the first five chunks with the paper's URL so they can be filtered on.
# The Document objects are updated in place; data_with_metadata is a new list
# holding the same chunks in their original order.
for doc in data[:5]:
    doc.metadata["source"] = "https://arxiv.org/pdf/1706.03762.pdf"

data_with_metadata = list(data)

# Drop the existing collection before re-indexing the chunks.
db.delete_collection()
db = Chroma.from_documents(data_with_metadata, OpenAIEmbeddings())
print(db._collection.count())

23


In [7]:
# Restrict the similarity search to chunks whose 'source' metadata equals the
# paper's URL, and keep the three best matches.
source_filter = {"source": "https://arxiv.org/pdf/1706.03762.pdf"}
docs = db.similarity_search(question, k=3, filter=source_filter)

for doc in docs:
    print(doc)

page_content='3 Model Architecture\n\nMost competitive neural sequence transduction models have an encoder-decoder structure [5, 2, 35]. Here, the encoder maps an input sequence of symbol representations (x1, ..., xn) to a sequence of continuous representations z = (z1, ..., zn). Given z, the decoder then generates an output sequence (y1, ..., ym) of symbols one element at a time. At each step the model is auto-regressive [10], consuming the previously generated symbols as additional input when generating the next.\n\n2\n\nFigure 1: The Transformer - model architecture.\n\nThe Transformer follows this overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1, respectively.\n\n3.1 Encoder and Decoder Stacks\n\nEncoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second 

# Using An LLM To Generate A Query

In [8]:
# Label every chunk by its position in the paper: indices 0-6 are 'beginning',
# 7-14 are 'middle', and 15 onward are 'end'. The Document objects are updated
# in place; data_with_metadata is a new list of the same chunks in order.
for i, doc in enumerate(data):
    doc.metadata["source"] = "beginning" if i < 7 else ("middle" if i < 15 else "end")

data_with_metadata = list(data)

# Drop the existing collection before re-indexing the relabelled chunks.
db.delete_collection()
db = Chroma.from_documents(data_with_metadata, OpenAIEmbeddings())
print(db._collection.count())

23


In [9]:
from langchain_openai import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Short description of what the indexed documents contain; passed to the
# retriever alongside the metadata schema below.
document_content_description = "The first transformer paper by Google Brain"

# Schema of the metadata fields the retriever is told about.
metadata_field_info = [
    AttributeInfo(
        type="string",
        name="source",
        description="Whether this section of the paper is in the beginning, middle, or end of the paper.",
    ),
]

# temperature=0 is passed to the LLM that the retriever is built from.
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm, db, document_content_description, metadata_field_info, verbose=True
)

In [10]:
question = "What do they say about the Encoder in the middle of the paper?"

# Retrievers are Runnables: invoke() returns the same list of Documents as the
# deprecated get_relevant_documents() call it replaces.
docs = retriever.invoke(question)

In [11]:
# Show each retrieved chunk: its 'source' label, a divider, the chunk text,
# and a closing rule, each on its own line.
for doc in docs:
    print(
        f"SOURCE: {doc.metadata['source']}",
        "-----------------------------------------",
        f"DOCUMENT: {doc.page_content}",
        "=========================================",
        sep="\n",
    )

SOURCE: middle
-----------------------------------------
DOCUMENT: 3.3 Position-wise Feed-Forward Networks

In addition to attention sub-layers, each of the layers in our encoder and decoder contains a fully connected feed-forward network, which is applied to each position separately and identically. This consists of two linear transformations with a ReLU activation in between.

FFN(x) = max(0, xW1 + b1)W2 + b2

(2)

While the linear transformations are the same across different positions, they use different parameters from layer to layer. Another way of describing this is as two convolutions with kernel size 1. The dimensionality of input and output is dmodel = 512, and the inner-layer has dimensionality df f = 2048.

3.4 Embeddings and Softmax

Similarly to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of dimension dmodel. We also use the usual learned linear transfor- mation and softmax function to convert the 