Install the required Python packages

In [3]:
! pip install -qU google-cloud-aiplatform langchain chromadb pypdf

Restart the Runtime for Vertex AI

In [4]:
# Shut the kernel down with restart=True so the runtime comes back up fresh
# after the package installation.
import IPython

IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

Authenticate the Colab notebook

In [1]:
# Authenticate the current Colab user; must run before any Vertex AI call below.
from google.colab import auth

auth.authenticate_user()

In [2]:
import vertexai

# Google Cloud project and region used for every Vertex AI call in this notebook.
# Replace the placeholder project id with your own before running.
PROJECT_ID = "xxxxxx"
REGION = "us-central1"

vertexai.init(location=REGION, project=PROJECT_ID)

In [3]:
from langchain.document_loaders import PyPDFLoader

# Fetch Alphabet's 2023 Q1 earnings release (PDF) and load it into `documents`.
url = "https://abc.xyz/investor/static/pdf/2023Q1_alphabet_earnings_release.pdf?cache=0924ccf"
loader = PyPDFLoader(file_path=url)
documents = loader.load()

In [None]:
# Report the number of whitespace-separated words across every entry in
# `documents`. The previous version printed len(documents[0].page_content),
# i.e. the number of *characters* in the first entry only, which did not match
# the "# of words in the document" label.
# (PyPDFLoader is documented to yield one Document per PDF page, so summing
# over all entries covers the whole file.)
word_count = sum(len(page.page_content.split()) for page in documents)
print(f"# of words in the document = {word_count}")

In [5]:
# Vertex AI
from google.cloud import aiplatform
from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.schema import HumanMessage, SystemMessage

In [6]:
# Text model shared by the summarisation and question-answering chains below.
llm = VertexAI(
    model_name="text-bison@001",
    temperature=0.1,
    top_k=40,
    top_p=0.8,
    max_output_tokens=256,
    verbose=True,
)

# Embedding model used when the document chunks are stored in the vector DB.
embeddings = VertexAIEmbeddings(model_name="textembedding-gecko@001")

Summarize the 2023 Q1 file

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded pages into chunks (size 1500, overlap 50) for summarisation.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=1500)
texts = text_splitter.split_documents(documents)

# One call builds the whole "map_reduce" summarisation chain; verbose=True makes
# the chain log what it does while running. The summary is the cell's output.
chain = load_summarize_chain(llm, verbose=True, chain_type="map_reduce")
chain.run(texts)

Split the document into chunks

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk the loaded pages for indexing (size 1500, no overlap) and report how
# many chunks were produced.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=1500)
docs = text_splitter.split_documents(documents)
print(f"# of documents = {len(docs)}")

# of documents = 23


In [12]:
from langchain.vectorstores import Chroma

# Embed every chunk with `embeddings` and keep the vectors in a Chroma store.
db = Chroma.from_documents(documents=docs, embedding=embeddings)

In [13]:
# Wrap the vector store as a retriever: similarity search with k=2.
retriever = db.as_retriever(
    search_kwargs={"k": 2},
    search_type="similarity",
)

In [14]:
from langchain.chains import RetrievalQA

# Question-answering chain: `retriever` supplies the relevant chunks and the
# Vertex PaLM text model (`llm`) synthesises the answer from them.
# return_source_documents=True asks the chain to also return the chunks it used.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

In [None]:
# Ask the QA chain about consolidated revenues and print the raw response.
query = "What was Alphabet's consolidated revenues for 2023?"
result = qa(dict(query=query))
print(result)

In [None]:
# Ask the QA chain about operating income and print the raw response.
query = "What was Alphabet's operating income for 2023?"
result = qa(dict(query=query))
print(result)

In [None]:
# Ask the QA chain about workforce/office-space charges and print the raw response.
query = "What were the charges related to reductions in workforce and office space?"
result = qa(dict(query=query))
print(result)

I am facing issues when trying to process much larger files:

ResourceExhausted: 429 Quota exceeded for aiplatform.googleapis.com/online_prediction_requests_per_base_model with base model: textembedding-gecko. Please submit a quota increase request.

In [20]:
# Load Alphabet's 10-Q filing (the URL points at ..._alphabet_10Q.pdf), a much
# larger PDF than the earnings release. This rebinds `url`, `loader` and
# `documents`, so the earlier document is no longer held in `documents`.
url = "https://abc.xyz/investor/static/pdf/20230426_alphabet_10Q.pdf?cache=252acfb"
loader = PyPDFLoader(file_path=url)
documents = loader.load()

In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Re-chunk the newly loaded (larger) document with the same settings as before
# (size 1500, no overlap) and print the resulting chunk count.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=1500)
docs = text_splitter.split_documents(documents)
print(f"# of documents = {len(docs)}")

# of documents = 112


In [None]:
# Store the documents in the Chroma Vector DB as embeddings.
#
# Embedding every chunk of the larger file in one burst is what produces the
# "429 Quota exceeded ... online_prediction_requests_per_base_model" error, so
# the chunks are embedded in small batches with a pause between batches to keep
# the request rate under the per-minute quota.
import time

from langchain.vectorstores import Chroma

# Conservative defaults -- tune both values to your project's actual quota.
EMBED_BATCH_SIZE = 5      # chunks embedded per batch
EMBED_PAUSE_SECONDS = 6   # wait between consecutive batches

# The first batch creates the store; the remaining batches are appended to it.
db = Chroma.from_documents(docs[:EMBED_BATCH_SIZE], embeddings)
for start in range(EMBED_BATCH_SIZE, len(docs), EMBED_BATCH_SIZE):
    time.sleep(EMBED_PAUSE_SECONDS)  # throttle before the next embedding request(s)
    db.add_documents(docs[start:start + EMBED_BATCH_SIZE])