In [1]:
# Load the libraries that are needed
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import os
import random

In [2]:
# Build the corpus from PDFs; point the loader at wherever your PDFs reside.
#
# To parse a single PDF file instead of an entire folder, swap in:
#   loader = PyPDFLoader("/mnt/data/RAG/Benefit_Options.pdf")
loader = PyPDFDirectoryLoader("/mnt/data/EnterpriseQA1/")

# Split the loaded pages into overlapping chunks (chunk_size=500, chunk_overlap=100).
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
texts = loader.load_and_split(splitter)

In [3]:
# `texts` holds the chunks produced by the text splitter, so this number counts
# chunks rather than physical PDF pages.
chunk_count = len(texts)
print(f"There are {chunk_count} pages in the document")

There are 148 pages in the document


In [4]:
# Pick a sample chunk at random.
# random.choice always yields a valid element; random.randint(0, len(texts)) includes
# its upper bound, so it could index one past the end and raise IndexError.
print(random.choice(texts))

page_content='The calculations assume no shares were bought or sold during the period. Y our actual costs may\nhave been higher or lower, depending on the amount of your investment and the timing of anypurchases or redemptions.\nY ou can find more information about the fund’s expenses, including annual expense ratios, in the' metadata={'source': '/mnt/data/EnterpriseQA1/Select_Global_Value_Fund.pdf', 'page': 5}


In [6]:
# Imports needed to create embeddings of your documents and run semantic search

from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
import pickle


In [7]:
# The OpenAI key is taken from the environment instead of being written into the notebook.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Embedding client configured with that key.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [8]:
# Embed every chunk and build the FAISS index.
# from_documents keeps each chunk's metadata (source file, page number) next to its
# text; building the index from page_content alone discarded that information.
store = FAISS.from_documents(texts, embeddings)

# Store the index locally in a pickle file so it can be reloaded without re-embedding.
with open("fininfo.pkl", "wb") as f:
    pickle.dump(store, f)


In [18]:
# Reuse the in-memory index if an earlier cell built it; otherwise load it from the
# pickle file (change the location if needed).
# NOTE: pickle.load can execute arbitrary code -- only load a file you created yourself.
if 'store' not in locals() or store is None:
    with open("fininfo.pkl", "rb") as f:
        store = pickle.load(f)

# Question-answering chain: the retriever pulls relevant chunks from the index and the
# "stuff" chain type places them into a single prompt for the chat model.
llm = ChatOpenAI(model_name='gpt-4', temperature=0, openai_api_key=OPENAI_API_KEY)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=store.as_retriever(),
)

query = "During the period ended October 31, 2022, What are the amounts of investment securities purchased and sold in the period?"

# RetrievalQA looks up its own context through the retriever and only reads the "query"
# key, so a separate similarity_search call and an "input_documents" entry are not
# needed (they cost an extra embedding request whose results were never used).
qa({"query": query}, return_only_outputs=True)

{'result': 'During the period ended October 31, 2022, the fund purchased $391,568,000 of investment securities and sold $67,382,000 of investment securities.'}