In [1]:
import os
from getpass import getpass

from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from PyPDF2 import PdfReader

In [2]:
# Make the OpenAI API key available through the OPENAI_API_KEY environment
# variable. A key that is already set in the environment is left untouched;
# otherwise it is requested interactively (the input is not echoed), so the
# secret never has to be typed into the notebook source.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [7]:
# Open the source PDF; the path is relative to the notebook's working directory.
PDF_PATH = "DeepLearning paper.pdf"
pdfreader = PdfReader(PDF_PATH)


In [9]:
# Extract the text of every page and combine the pages into one string.
# Pages for which extract_text() returns nothing (None or "") are skipped.
# Pages are joined with a newline so the last word of one page is not fused
# with the first word of the next; the text splitter also breaks on "\n".
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = "\n".join(text for text in page_texts if text)

In [10]:
# Chunking parameters. Lengths are measured with len(), i.e. in characters.
CHUNK_SIZE = 800
CHUNK_OVERLAP = 200

# Split the extracted text on newlines into overlapping chunks.
splitter_options = dict(
    separator="\n",
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)
text_splitter = CharacterTextSplitter(**splitter_options)
texts = text_splitter.split_text(raw_text)


In [11]:
# Create the OpenAI embeddings object that the FAISS vector store uses to
# vectorise the text chunks. No key is passed here, so it presumably picks up
# OPENAI_API_KEY from the environment - TODO confirm for the installed version.
embeddings = OpenAIEmbeddings()

In [12]:
# Build the FAISS vector store from the text chunks and the embedding model.
# Guard against an empty chunk list first (e.g. a PDF with no extractable
# text): indexing nothing is believed to fail inside FAISS.from_texts with an
# unhelpful error, so fail here with a message that names the real cause.
if not texts:
    raise ValueError(
        "No text chunks to index: the PDF produced no extractable text."
    )
document_search = FAISS.from_texts(texts, embeddings)


In [13]:
# Language model that produces the answers, selected by model name.
llm = OpenAI(model_name="gpt-3.5-turbo-instruct")

# Question-answering chain over that model, using the "stuff" chain type.
chain = load_qa_chain(llm, chain_type="stuff")

## Query 1

In [14]:
# Question to ask about the PDF. The same string drives both the similarity
# search and the QA chain in the following cells.
query = "Explain Positional Encoding"

In [15]:
# Retrieve the chunks most similar to the question from the FAISS store.
# No result count is passed, so the library default applies.
docs = document_search.similarity_search(query)

In [16]:
# Run the QA chain on the retrieved chunks and the question. As the last
# expression in the cell, the returned answer is shown as the cell output.
chain.run(input_documents=docs, question=query)

' \nPositional encoding is a technique used in natural language processing models, specifically in self-attention layers, to inject information about the relative or absolute position of tokens in a sequence. This is necessary because the models do not have any inherent understanding of the order of the sequence, since they do not use recurrent or convolutional layers. The positional encodings have the same dimension as the input embeddings and are added to them, allowing the model to learn to attend to relative positions. The encoding is calculated using sine and cosine functions with different frequencies, forming a geometric progression. This function allows the model to easily learn to attend by relative positions, as each dimension of the positional encoding corresponds to a sinusoid. The use of positional encoding helps improve the performance of the model in tasks that require understanding of sequential information. '

## Query 2

In [17]:
# Second question to ask about the same PDF.
query2 = "Who created transformers"

In [23]:
# Retrieve the chunks most similar to the second question.
# NOTE: this rebinds `docs`, replacing the Query 1 results. The QA cell that
# follows must run after this one, or it will answer query2 using the chunks
# retrieved for the first question.
docs = document_search.similarity_search(query2)

In [24]:
# Answer the second question from the chunks currently held in `docs`;
# the returned answer is shown as the cell output.
chain.run(input_documents=docs, question=query2)

' Jakob, Ashish, Illia, Noam, Niki, and Llion created the Transformer model.'