In [1]:
# Load langchain libraries to load pdf and split text
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# create a loader
loader = PyPDFLoader("NVidia-10-Q.pdf")

# load your data
data = loader.load()

In [3]:
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your document')

You have 49 document(s) in your data
There are 2657 characters in your document


In [4]:
# Lets split the text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=300)
texts = text_splitter.split_documents(data)

print (f'Now you have {len(texts)} documents')

Now you have 791 documents


In [5]:
# import libraries for vector db and embeddings
from langchain.vectorstores import  Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

  from tqdm.autonotebook import tqdm


In [6]:
import keyring
import openai

In [7]:
openai.organization = "org-EEUV4gnLyXDJe82NmHgNkyo4"
OPENAI_API_KEY = keyring.get_password("openai", openai.organization)

PINECONE_API_ENV = "gcp-starter"
PINECONE_API_KEY = keyring.get_password('pinecone', 'gcp-starter')

In [8]:
# create embeddings
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [9]:
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to API key in console
)

index_name = "pdf-index" # put in the name of your pinecone index here

In [10]:
# Create vector db
docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)

In [11]:
# If already have index
# docsearch = Pinecone.from_existing_index(index_name, embeddings)

In [11]:
# Query simply question and return relevent chunks
query = "What is the total assets?"
docs = docsearch.similarity_search(query)

In [12]:
docs

[Document(page_content='Marketable securities 10,241 9,907 \nAccounts receivable, net 4,080 3,827 \nInventories 4,611 5,159 \nPrepaid expenses and other current assets 872 791 \nTotal current assets 24,883 23,073 \nProperty and equipment, net 3,740 3,807 \nOperating lease assets 1,094 1,038 \nGoodwill 4,430 4,372 \nIntangible assets, net 1,541 1,676 \nDeferred income tax assets 4,568 3,396 \nOther assets 4,204 3,820 \nTotal assets $ 44,460 $ 41,182 \nLIABILITIES AND SHAREHOLDERS’ EQUITY   \nCurrent liabilities:', metadata={}),
 Document(page_content='Property and equipment, net 3,740 3,807 \nOperating lease assets 1,094 1,038 \nGoodwill 4,430 4,372 \nIntangible assets, net 1,541 1,676 \nDeferred income tax assets 4,568 3,396 \nOther assets 4,204 3,820 \nTotal assets $ 44,460 $ 41,182 \nLIABILITIES AND SHAREHOLDERS’ EQUITY   \nCurrent liabilities:   \nAccounts payable $ 1,141 $ 1,193 \nAccrued and other current liabilities 4,869 4,120 \nShort-term debt 1,250 1,250 \nTotal current liabil

In [13]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0.7, openai_api_key = OPENAI_API_KEY)

In [14]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=docsearch.as_retriever()
)

In [15]:
result = qa_chain({"query": query})
result["result"]

'The total assets are $44,460.'

In [16]:
query = "What are the biggest risks for Nvidia as a business?"
docs = docsearch.similarity_search(query)
result = qa_chain({"query": query})
result["result"]

'Some of the biggest risks for Nvidia as a business include investment risks, potential negative impacts from climate change concerns, challenges in obtaining future design wins, failure to provide value to customers and partners, and adverse economic conditions. These risks could harm their business, financial condition, results of operations, and reputation, potentially leading to a decline in their stock price.'

In [17]:
query = "Who are Nvidia competitors?"
docs = docsearch.similarity_search(query)
result = qa_chain({"query": query})
result["result"]

"NVIDIA has several competitors in different segments of its business. In the gaming and PC graphics segment, its main competitors are AMD and Intel. In the data center and AI segment, its competitors include Intel, AMD, and companies like IBM and Google. In the autonomous vehicles and robotics segment, some of its competitors are Intel, Qualcomm, and Tesla. It's important to note that the competitive landscape can change over time due to technological advancements and market dynamics."