In [3]:
# Load the libraries that are needed
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import os

In [4]:
# Load the document that you need to parse, change this location accordingly
loader = UnstructuredPDFLoader("/mnt/Select_Global_Value_Fund.pdf")

In [5]:
data = loader.load()

Downloading model_final.pth:   0%|          | 0.00/330M [00:00<?, ?B/s]

Downloading (…)50_FPN_3x/config.yml:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

In [6]:
# Get some stats about the document
print (f'You have {len(data)} document(s) in the dataset')
print (f'There are {len(data[0].page_content)} characters in the document')

You have 1 document(s) in the dataset
There are 50217 characters in the document


In [7]:
# Chunk your data up into smaller documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)
texts[:2]

[Document(page_content='Vanguard Advice Select Global Value Fund\n\nContents\n\nPlease note: The opinions expressed in this report are just that—informed opinions. They should not be considered promises or advice. Also, please keep in mind that the information and opinions cover the period through the date on the front of this report. Of course, the risks of investing in your fund are spelled out in the prospectus.\n\nYour Fund’s Performance at a Glance\n\n• The 12 months ended October 31, 2022, were a volatile, challenging period for financial markets. Vanguard Advice Select Global Value Fund, which launched November 9, 2021, returned –14.01% from its inception through October 31, lagging the –12.39% return of its benchmark, the MSCI All Country World Value Index.', lookup_str='', metadata={'source': '/mnt/Select_Global_Value_Fund.pdf'}, lookup_index=0),
 Document(page_content='• The economic backdrop deteriorated as inflation soared to multidecade highs, fueled in part by higher ener

In [8]:
print (f'There are now {len(texts)} documents')

There are now 66 documents


In [9]:
#Create embeddings of your documents to get ready for semantic search

from langchain.vectorstores import FAISS, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
import pickle


In [10]:
# Read your OpenAI and Pinecone keys from the environment
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') 
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_API_ENV = os.getenv('PINECONE_API_ENV')

In [11]:
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [12]:
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to api key in console
)
index_name = "vanguard-etf"

In [13]:
# Generate and store the embeddings in Pinecone
docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)

In [14]:
# Query the docs to get your answer back
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [15]:
# Set the temperature to 0 to prevent hallucination
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm, chain_type="stuff")

In [17]:
# Ask your query
query = "Who are the trustees of the fund?"
# Get the closest matches to create some context and information for the answer
docs = docsearch.similarity_search(query, include_metadata=True)

In [18]:
docs

[Document(page_content='The trustees of your mutual fund are there to see that the fund is operated and managed in your best interests since, as a shareholder, you are a part owner of the fund.Your fund’s trustees also serve on the board of directors ofThe Vanguard Group, Inc., which is owned by the Vanguard funds and provides services to them.\n\nA majority of Vanguard’s board members are independent, meaning that they have no affiliation with Vanguard or the funds they oversee, apart from the sizable personal investments they have made as private individuals.The independent board members have distinguished backgrounds in business, academia, and public service. Each of the trustees and executive officers oversees 206 Vanguard funds.', lookup_str='', metadata={}, lookup_index=0),
 Document(page_content='Information for each trustee and executive officer of the fund appears below.That information, well as the Vanguard fund count, is as of the date on the cover of this fund report.The ma

In [19]:
# Execute the query with the context and information from the embedding/semantic search
chain.run(input_documents=docs, question=query)

' Mortimer J. Buckley, Tara Bunch, André F. Perold, F. Joseph Loughrey, Mark Loughridge, and Sarah Bloom Raskin.'

In [20]:
# Compare the result with gpt-3.5-turbo, the default above is davinci
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-3.5-turbo',
    temperature=0.0
)

chain = load_qa_chain(llm, chain_type="stuff")

chain.run(input_documents=docs, question=query)

'The trustees of the fund are Mortimer J. Buckley, Tara Bunch, André F. Perold, F. Joseph Loughrey, Mark Loughridge, and Sarah Bloom Raskin.'