Install the required Python packages

In [None]:
! pip install -qU google-cloud-aiplatform langchain chromadb pypdf transformers

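Restart the runtime so that the newly installed Vertex AI packages are loaded. The code below shuts the kernel down to force a restart, which Colab may report as a crash; if the runtime does not come back on its own, restart it manually.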

In [None]:
# Shut the running kernel down with restart=True so it comes back with a clean state.
# The status dict returned by do_shutdown is the cell's displayed value.
import IPython

IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

Authenticate the Colab notebook

In [None]:
# Run Colab's user-authentication flow for this session.
# NOTE(review): presumably these credentials are what the Vertex AI calls further down
# rely on — confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Vertex AI
from google.cloud import aiplatform
from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.schema import HumanMessage, SystemMessage

In [None]:
import vertexai

# Google Cloud project and region the Vertex AI SDK is initialised with.
PROJECT_ID = "ibm-keras"
REGION = "us-central1"

vertexai.init(location=REGION, project=PROJECT_ID)

In [None]:
# Ingest GOOG's 2023 Q1 results (earnings release) PDF from its public URL.
# `documents` is the input that the summarisation and vector-store cells split into chunks.
from langchain.document_loaders import PyPDFLoader

# Source PDF: Alphabet 2023 Q1 earnings release.
url = "https://abc.xyz/investor/static/pdf/2023Q1_alphabet_earnings_release.pdf?cache=0924ccf"
loader = PyPDFLoader(url)
# NOTE(review): presumably yields one Document per PDF page — confirm.
documents = loader.load()

In [None]:
# Embedding model: turns document chunks into vectors for the vector store.
embeddings = VertexAIEmbeddings(model_name="textembedding-gecko@001")

# Text generation model used by the summarisation and question-answering chains.
# Low temperature keeps the answers close to the source text.
llm = VertexAI(
    model_name="text-bison@001",
    temperature=0.1,
    max_output_tokens=256,
    top_k=40,
    top_p=0.8,
    verbose=True,
)

Summarise the 2023 Q1 file

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Build the summarisation chain up front; it depends only on the LLM.
# A lot of complexity is hidden behind this single factory call.
chain = load_summarize_chain(llm, chain_type="map_reduce", verbose=True)

# Break the loaded documents into chunks (size 1500, overlap 50) for the chain.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=50,
)
texts = text_splitter.split_documents(documents)

# Run the chain over the chunks; its return value is the cell output.
chain.run(texts)



Prompt after formatting:
[32;1m[1;3mWrite a concise summary of the following:


"UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
________________________________________________________________________________________
FORM 10-Q  
________________________________________________________________________________________
(Mark One)
☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the quarterly period ended March 31, 2023
OR
☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the transition period from _______ to _______
Commission file number: 001-37580  
________________________________________________________________________________________
Alphabet Inc.  
(Exact name of registrant as specified in its charter)
________________________________________________________________________________________
Delaware 61-1767919
(State or other jurisdiction of incorporation or organizati

ResourceExhausted: ignored

Scenario 2: Save the document in a vector DB in chunks and implement a semantic search

In [None]:
# Split the loaded documents into chunks that will be embedded and stored.
# This rebinds `text_splitter`, now with chunk_overlap=0 (the summarisation cell used 50).
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
# Report how many chunks were produced.
print(f"# of documents = {len(docs)}")

# of documents = 23


In [None]:
# Store the document chunks in the Chroma vector DB as embeddings.
from langchain.vectorstores import Chroma

# Name the collection explicitly instead of relying on the implicit default:
# depending on the installed chromadb version, in-memory stores created in the same
# kernel can share the default collection, so chunks from different source files
# would end up mixed in one index.
db = Chroma.from_documents(
    docs,
    embeddings,
    collection_name="alphabet_2023q1_earnings_release",
)

In [None]:
# Expose the Chroma index as a retriever for the question-answering chain.
# Similarity search with k=2; presumably only the 2 closest chunks are returned per query — confirm.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 2})

In [None]:
# Build the question-answering chain on top of the retriever.
from langchain.chains import RetrievalQA

# The Vertex PaLM text model synthesises an answer from the chunks found in the
# search index; the chunks themselves are returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

In [None]:
# Name the reporting period explicitly: the source covers the quarter ended
# March 31, 2023 and lists 2022 and 2023 columns side by side, so asking
# "for 2023" is ambiguous and suggests a full-year figure.
query = "What were Alphabet's consolidated revenues for the quarter ended March 31, 2023?"
# `result` is the chain's response dict (query, answer and source documents).
result = qa({"query": query})
print(result)

{'query': "What was Alphabet's consolidated revenues for 2023?", 'result': "Alphabet's consolidated revenues for 2023 were $69.8 billion.", 'source_documents': [Document(lc_kwargs={'page_content': 'Alphabet Inc.\nCONSOLIDATED STATEMENTS OF INCOME\n(In millions, except per share amounts, unaudited)\nQuarter Ended March 31,\n2022 2023\nRevenues $ 68,011 $ 69,787 \nCosts and expenses:\nCost of revenues  29,599  30,612 \nResearch and development  9,119  11,468 \nSales and marketing  5,825  6,533 \nGeneral and administrative  3,374  3,759 \nTotal costs and expenses  47,917  52,372 \nIncome from operations  20,094  17,415 \nOther income (expense), net  (1,160)  790 \nIncome before income taxes  18,934  18,205 \nProvision for income taxes  2,498  3,154 \nNet income $ 16,436 $ 15,051 \nBasic earnings per share of Class A, Class B, and Class C stock $ 1.24 $ 1.18 \nDiluted earnings per share of Class A, Class B, and Class C stock $ 1.23 $ 1.17 \nNumber of shares used in basic earnings per share

In [None]:
# Name the reporting period and use the statement's own line item
# ("Income from operations"): the income statement lists 2022 and 2023 columns side
# by side, and the vaguer "operating income for 2023" wording got the 2022 figure back.
query = "What was Alphabet's income from operations for the quarter ended March 31, 2023?"
# `result` is the chain's response dict (query, answer and source documents).
result = qa({"query": query})
print(result)

{'query': "What was Alphabet's operating income for 2023?", 'result': "Alphabet's operating income for 2023 was $20,094 million.", 'source_documents': [Document(lc_kwargs={'page_content': 'Alphabet Inc.\nCONSOLIDATED STATEMENTS OF INCOME\n(In millions, except per share amounts, unaudited)\nQuarter Ended March 31,\n2022 2023\nRevenues $ 68,011 $ 69,787 \nCosts and expenses:\nCost of revenues  29,599  30,612 \nResearch and development  9,119  11,468 \nSales and marketing  5,825  6,533 \nGeneral and administrative  3,374  3,759 \nTotal costs and expenses  47,917  52,372 \nIncome from operations  20,094  17,415 \nOther income (expense), net  (1,160)  790 \nIncome before income taxes  18,934  18,205 \nProvision for income taxes  2,498  3,154 \nNet income $ 16,436 $ 15,051 \nBasic earnings per share of Class A, Class B, and Class C stock $ 1.24 $ 1.18 \nDiluted earnings per share of Class A, Class B, and Class C stock $ 1.23 $ 1.17 \nNumber of shares used in basic earnings per share calculat

In [None]:
# Ask the retrieval QA chain about the workforce and office-space reduction charges.
query = "What were the charges related to reductions in workforce and office space?"
# The response dict carries the keys 'query', 'result' and 'source_documents'.
result = qa({"query": query})
print(result)

{'query': 'What were the charges related to reductions in workforce and office space?', 'result': 'The charges related to reductions in workforce and office space were $2.0 billion and $564 million, respectively.', 'source_documents': [Document(lc_kwargs={'page_content': 'Additional information  relating to the quarter ended March 31, 2023  (unaudited)\nReductions in Our Workforce and Office Space\nIn January 2023, we announced a reduction of our workforce, and as a result in the first quarter of 2023 we \nrecorded employee severance and related charges of $2.0 billion, representing the majority of expected costs \nassociated with this action. In addition, we are taking actions to optimize our global office space, and as a result we \nrecorded charges related to office space reductions of $564 million in the first quarter of 2023. We may incur \nadditional charges in the future as we further evaluate our real estate needs.\nThese severance and office space charges are included within o

I am facing issues when trying to process much larger files:

ResourceExhausted: 429 Quota exceeded for aiplatform.googleapis.com/online_prediction_requests_per_base_model with base model: textembedding-gecko. Please submit a quota increase request.

In [None]:
# Load GOOG's 10-Q filing (the URL points at 20230426_alphabet_10Q.pdf), which is a much larger file.
# NOTE(review): this rebinds `url`, `loader` and `documents`, replacing the earnings-release
# data loaded earlier; re-running an earlier cell after this one makes it operate on the 10-Q.
url = "https://abc.xyz/investor/static/pdf/20230426_alphabet_10Q.pdf?cache=252acfb"
loader = PyPDFLoader(url)
documents = loader.load()

In [None]:
# Split the 10-Q documents into chunks that will be embedded and stored.
# Same splitter settings as for the earnings release; rebinds `text_splitter` and `docs`.
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
# Report how many chunks were produced.
print(f"# of documents = {len(docs)}")

# of documents = 112


In [None]:
# Store the 10-Q chunks in the Chroma vector DB as embeddings.
#
# Embedding every chunk in one burst is the likely cause of the Vertex AI quota error
# on larger files ("429 Quota exceeded ... textembedding-gecko"), so the chunks are
# added in small batches with a pause between batches.
import time

from langchain.vectorstores import Chroma

EMBED_BATCH_SIZE = 5       # chunks sent to the embedding model per add_documents call
EMBED_PAUSE_SECONDS = 2.0  # pause between batches; tune to the project's request quota

# Use a dedicated collection so these chunks are not mixed with chunks of a previously
# indexed file (in-memory stores created in one kernel may share the default collection).
db = Chroma(
    collection_name="alphabet_2023q1_10q",
    embedding_function=embeddings,
)
for start in range(0, len(docs), EMBED_BATCH_SIZE):
    db.add_documents(docs[start:start + EMBED_BATCH_SIZE])
    # No need to wait after the final batch.
    if start + EMBED_BATCH_SIZE < len(docs):
        time.sleep(EMBED_PAUSE_SECONDS)

In [None]:
# Expose the 10-Q Chroma index as a retriever; this rebinds `retriever`.
# Similarity search with k=2; presumably only the 2 closest chunks are returned per query — confirm.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 2})

In [None]:
# Rebuild the question-answering chain so it uses the current retriever.
from langchain.chains import RetrievalQA

# The Vertex PaLM text model synthesises an answer from the chunks found in the
# search index; the chunks themselves are returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

In [None]:
# Name the reporting period explicitly: the 10-Q covers the three months ended
# March 31, 2023 and lists 2022 and 2023 columns side by side, so asking
# "for 2023" is ambiguous and suggests a full-year figure.
query = "What were Alphabet's consolidated revenues for the quarter ended March 31, 2023?"
# `result` is the chain's response dict (query, answer and source documents).
result = qa({"query": query})
print(result)

{'query': "What was Alphabet's consolidated revenues for 2023?", 'result': "Alphabet's consolidated revenues for 2023 were $69,787.", 'source_documents': [Document(lc_kwargs={'page_content': 'Alphabet Inc.\nCONSOLIDATED STATEMENTS OF INCOME\n(in millions, except per share amounts; unaudited)\nThree Months Ended\nMarch 31,\n2022 2023\nRevenues $ 68,011 $ 69,787 \nCosts and expenses:\nCost of revenues  29,599  30,612 \nResearch and development  9,119  11,468 \nSales and marketing  5,825  6,533 \nGeneral and administrative  3,374  3,759 \nTotal costs and expenses  47,917  52,372 \nIncome from operations  20,094  17,415 \nOther income (expense), net  (1,160)  790 \nIncome before income taxes  18,934  18,205 \nProvision for income taxes  2,498  3,154 \nNet income $ 16,436 $ 15,051 \nBasic net income per share of Class A, Class B, and Class C stock $ 1.24 $ 1.18 \nDiluted net income per share of Class A, Class B, and Class C stock $ 1.23 $ 1.17 \nSee accompanying notes.Table of Contents Alph