In [1]:
# Load the libraries that are needed
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import os

In [4]:
# Point the PDF loader at the source file and read it into `data`
pdf_path = "/mnt/Select_Global_Value_Fund.pdf"
loader = UnstructuredPDFLoader(pdf_path)
data = loader.load()

In [5]:
# Report how many documents were loaded and how long the first one is
num_documents = len(data)
print(f'You have {num_documents} document(s) in the dataset')
num_characters = len(data[0].page_content)
print(f'There are {num_characters} characters in the document')

You have 1 document(s) in the dataset
There are 50217 characters in the document


In [6]:
# Split the loaded document into smaller chunks (chunk_size=1000, chunk_overlap=0)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

# Preview the first two chunks
texts[:2]

[Document(page_content='Vanguard Advice Select Global Value Fund\n\nContents\n\nPlease note: The opinions expressed in this report are just that—informed opinions. They should not be considered promises or advice. Also, please keep in mind that the information and opinions cover the period through the date on the front of this report. Of course, the risks of investing in your fund are spelled out in the prospectus.\n\nYour Fund’s Performance at a Glance\n\n• The 12 months ended October 31, 2022, were a volatile, challenging period for financial markets. Vanguard Advice Select Global Value Fund, which launched November 9, 2021, returned –14.01% from its inception through October 31, lagging the –12.39% return of its benchmark, the MSCI All Country World Value Index.', lookup_str='', metadata={'source': '/mnt/Select_Global_Value_Fund.pdf'}, lookup_index=0),
 Document(page_content='• The economic backdrop deteriorated as inflation soared to multidecade highs, fueled in part by higher ener

In [7]:
# Show how many chunks the splitter produced
chunk_count = len(texts)
print(f'There are now {chunk_count} documents')

There are now 66 documents


In [1]:
#Create embeddings of your documents to get ready for semantic search

from langchain.vectorstores import FAISS, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
import pinecone
import pickle


  from tqdm.autonotebook import tqdm


In [3]:
# Take the OpenAI API key from the OPENAI_API_KEY environment variable
# (None when the variable is unset) and hand it to the embeddings client.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [5]:
# Build a FAISS index from the text of every chunk, then pickle it to disk
chunk_contents = [chunk.page_content for chunk in texts]
store = FAISS.from_texts(chunk_contents, embeddings)

with open("faiss_store.pkl", "wb") as store_file:
    pickle.dump(store, store_file)


In [6]:
# Query the docs to get your answer back
from langchain.llms import OpenAI

In [None]:
# Answer a question over the indexed chunks with a VectorDBQA chain.
# VectorDBQA is used below but no import cell in this notebook brings it in,
# so import it here to keep the cell runnable on a fresh kernel.
from langchain.chains import VectorDBQA

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Load the embeddings from the pickle file; change the location if needed.
# NOTE: pickle.load can execute arbitrary code -- only load a file you created yourself.
if 'store' not in locals() or store is None:
    with open("faiss_store.pkl", "rb") as f:
        store = pickle.load(f)

# Uncomment the line below to use the default davinci model
# qa = VectorDBQA.from_chain_type(llm=OpenAI(), chain_type="stuff", vectorstore=store)
qa = VectorDBQA.from_chain_type(
    llm=OpenAI(model_name='gpt-3.5-turbo', temperature=0, openai_api_key=OPENAI_API_KEY),
    chain_type="stuff",
    vectorstore=store,
)

query = "What is a datasource?"

# Nearest chunks for the query, kept so they can be inspected alongside the answer
docs = store.similarity_search(query)

# VectorDBQA takes the question under the "query" input key; the
# {"input_documents": ..., "question": ...} shape is the input schema of
# load_qa_chain-style chains and is rejected here as a missing input key.
qa({"query": query}, return_only_outputs=True)

