In [None]:
! python --version

## Install the required packages
- `%%capture` is used to suppress the output of the installation commands.

In [None]:
%%capture
%pip install llama-index-readers-file pymupdf
%pip install llama-index-vector-stores-postgres
%pip install llama-index-embeddings-huggingface
%pip install llama-index-llms-bedrock
%pip install llama-index-embeddings-bedrock
%pip install psycopg2-binary
%pip install ipywidgets
%pip install SQLAlchemy
%pip install python-dotenv

In [1]:
import nest_asyncio

nest_asyncio.apply()

## Import the required libraries
- The `load_dotenv` function loads environment variables (such as the AWS credentials) from the `.env` file; they are needed to access the more capable generator model hosted on AWS Bedrock.
- This notebook is based on this [notebook from LlamaIndex - Sub Question querying](https://docs.llamaindex.ai/en/stable/examples/usecases/10k_sub_question/).

In [2]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine
from dotenv import load_dotenv
load_dotenv(verbose=True, dotenv_path=".env")

True

## Downloading the dataset
- If you need to download the dataset, uncomment and run the commands below. They are commented out by default because the files are large.

In [None]:
# ! mkdir -p 'data/10-K'
# ! wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/uber_2021.pdf' -O 'data/10-K/uber_2021.pdf'
# ! wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/lyft_2021.pdf' -O 'data/10-K/lyft_2021.pdf'

## Setup the Retriever and Generator models
- Pass `mode="local"` to the `setup_models` function to use our local LM Studio models.
- Pass `mode="remote"` to the `setup_models` function to use AWS Bedrock.

In [4]:
from llama_index.core import Settings
from llama_index.llms.bedrock import Bedrock
from llama_index.llms.lmstudio import LMStudio
from llama_index.embeddings.bedrock import BedrockEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import os

def setup_models(mode="local"):
    """
    Point the global LlamaIndex `Settings` at a retriever (embedding) model and
    a generator (LLM) for the chosen backend.

    Args:
        mode: "local" pairs a HuggingFace embedding model with an LLM served by
            LM Studio at http://localhost:1234/v1; "remote" uses AWS Bedrock
            for both models.

    Raises:
        ValueError: If `mode` is neither "local" nor "remote".
        KeyError: In "remote" mode, if AWS_DEFAULT_REGION, AWS_ACCESS_KEY_ID or
            AWS_SECRET_ACCESS_KEY is missing from the environment.
    """
    # Validate first so a typo in `mode` never leaves Settings half-configured.
    if mode not in ("local", "remote"):
        raise ValueError(f"Unknown mode: {mode}")

    # Chunking is identical for both backends.
    Settings.chunk_size = 1024
    Settings.chunk_overlap = 20

    if mode == "local":
        # Setup Retriever model
        embedding_model = "BAAI/bge-base-en-v1.5"
        print(f"Setting up local Retriever model (embedding: {embedding_model})...")
        Settings.embed_model = HuggingFaceEmbedding(model_name=embedding_model)

        # Setup Generator model
        llm_model = "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF"
        print(f"Setting up local Generator model (main LLM: {llm_model})...")
        Settings.llm = LMStudio(
            model_name=llm_model,
            base_url="http://localhost:1234/v1",
            temperature=0,
            request_timeout=120,
        )
    else:
        region = os.environ["AWS_DEFAULT_REGION"]

        # Setup Retriever model
        embedding_model = "cohere.embed-multilingual-v3"
        print(f"Setting up remote Retriever model (embedding: {embedding_model})...")
        Settings.embed_model = BedrockEmbedding(
            model_name=embedding_model,
            region_name=region,
        )

        # Setup Generator model
        llm_model = "anthropic.claude-3-sonnet-20240229-v1:0"
        print(f"Setting up remote Generator model (main LLM: {llm_model})...")
        Settings.llm = Bedrock(
            model=llm_model,
            aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
            aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
            # A session token only exists for temporary (STS/SSO) credentials;
            # long-lived IAM keys have none, so it must be optional.
            aws_session_token=os.environ.get("AWS_SESSION_TOKEN"),
            region_name=region,
            request_timeout=120,
        )
    
# Use the AWS Bedrock models for the rest of the notebook.
setup_models(mode="remote")

# Embed a probe sentence: a quick smoke test of the embedding model whose
# result length is kept as the vector dimension for the PGVector tables.
text_embedding = Settings.embed_model.get_text_embedding(
    "Once upon a time, there was a cat."
)
vector_size = len(text_embedding)

print(text_embedding[:5])
print(f"Emedding length: {vector_size}")


Setting up remote Retriever model (embedding: cohere.embed-multilingual-v3)...
Setting up remote Generator model (main LLM: anthropic.claude-3-sonnet-20240229-v1:0)...
[-0.043518066, -0.010955811, -0.00032567978, 0.0057792664, -0.016540527]
Emedding length: 1024


# Set up the PgVector database in PostgreSQL
- In the code below, we drop and recreate the database every time, just to ensure that we are starting from scratch. This is not recommended in production.

In [19]:
import psycopg2
import nest_asyncio

# Credentials of the local demo Postgres instance. Later cells read
# `connection_string` and `db_name`, so both stay notebook-level names.
pg_pw = "mysecretpassword"
pg_db = "vector_store"
# The URL deliberately names no database: the target database is dropped below.
connection_string = f"postgresql://postgres:{pg_pw}@localhost:5432"
db_name = pg_db

try:
    conn = psycopg2.connect(connection_string)
    try:
        # DROP/CREATE DATABASE cannot run inside a transaction block, so every
        # statement is committed on its own (no explicit commit() needed).
        conn.autocommit = True

        with conn.cursor() as c:
            # IF EXISTS: on the very first run there is nothing to drop yet;
            # without it the error would skip CREATE DATABASE entirely.
            c.execute(f"DROP DATABASE IF EXISTS {db_name} WITH (FORCE);")
            c.execute(f"CREATE DATABASE {db_name};")
    finally:
        # Release the connection even when one of the statements fails.
        conn.close()

    nest_asyncio.apply()

except Exception as e:
    # Best effort: report the problem and let the notebook continue.
    print(e)
    

In [20]:
from IPython.display import Markdown
from llama_index.core import SimpleDirectoryReader
from sqlalchemy import make_url
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.vector_stores.postgres import PGVectorStore

def simple_RAG(vector_size):
    """
    Build the baseline RAG index: one vector index over every file in ./data/10-K.

    Connection details come from the notebook-level `connection_string` and
    `db_name`; the vector store is created with `table_name="basic_rag"`.

    Args:
        vector_size: Embedding dimension, passed to the store as `embed_dim`.

    Returns:
        The `VectorStoreIndex` built from the loaded documents.
    """
    source_dir = "./data/10-K"

    pg_url = make_url(connection_string)
    print(f"Url {pg_url}")

    # Host, port and credentials are taken from the URL; the database is not,
    # because the URL carries none.
    store_params = dict(
        database=db_name,
        host=pg_url.host,
        password=pg_url.password,
        port=pg_url.port,
        user=pg_url.username,
        table_name="basic_rag",
        embed_dim=vector_size,
    )
    pg_store = PGVectorStore.from_params(**store_params)
    context = StorageContext.from_defaults(vector_store=pg_store)

    documents = ingest_directory(source_dir)
    print(f"Number of nodes: {len(documents)}")

    return VectorStoreIndex.from_documents(
        documents,
        storage_context=context,
        show_progress=True,
    )

def advanced_RAG(vector_size, input_file, table_name=None):
    """
    Build a vector index over a single document, stored in its own table.

    Indexes that point at the same table retrieve from the same rows, so
    sharing one table between documents would let the index built for one
    filing return chunks of another. The table name is therefore derived from
    the file name unless one is given explicitly.

    Connection details come from the notebook-level `connection_string` and
    `db_name`.

    Args:
        vector_size: Embedding dimension, passed to the store as `embed_dim`.
        input_file: Path of the document to load and index.
        table_name: Optional explicit table name. Defaults to
            "advanced_rag_<file stem>", lower-cased, with every run of
            non-alphanumeric characters replaced by an underscore.

    Returns:
        The `VectorStoreIndex` built from the loaded document.
    """
    import re
    from pathlib import Path

    if table_name is None:
        stem = Path(input_file).stem.lower()
        # Keep the name a plain identifier and cap its length so it stays
        # well below PostgreSQL's 63-byte identifier limit.
        slug = re.sub(r"[^0-9a-z]+", "_", stem).strip("_")[:40]
        table_name = f"advanced_rag_{slug}" if slug else "advanced_rag"

    print(f"Ingesting document: {input_file}...")
    url = make_url(connection_string)
    print(f"Url {url}")

    vector_store = PGVectorStore.from_params(
        database=db_name,
        host=url.host,
        password=url.password,
        port=url.port,
        user=url.username,
        table_name=table_name,
        embed_dim=vector_size,
    )

    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    documents = ingest_document(input_file)

    print(f"Number of nodes: {len(documents)}")

    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context, show_progress=True
    )
    return index

def ingest_document(input_file):
    """
    Load a single file with `SimpleDirectoryReader` and return what it yields.

    Nothing is written to the vector store here; callers index the result.
    """
    return SimpleDirectoryReader(input_files=[input_file]).load_data(
        show_progress=True
    )

def ingest_directory(directory):
    """
    Load every file found in `directory` with `SimpleDirectoryReader` and
    return what it yields.

    Nothing is written to the vector store here; callers index the result.
    """
    return SimpleDirectoryReader(input_dir=directory).load_data(
        show_progress=True
    )

def display_markdown(question, response):
    """
    Wrap a question and its answer in a `Markdown` object with one
    second-level heading for each.
    """
    # Same text as a triple-quoted template, spelled with explicit newlines.
    rendered = f"\n## Question:\n{question}\n\n## Answer:\n{response}\n"
    return Markdown(rendered)

## Using naive LlamaIndex RAG

In [21]:
# Baseline: a single index over every file in ./data/10-K (see simple_RAG).
index = simple_RAG(vector_size=vector_size)
# similarity_top_k=3: each query is answered from the 3 best-matching chunks.
query_engine = index.as_query_engine(similarity_top_k=3, verbose=True)

Url postgresql://postgres:***@localhost:5432


Loading files: 100%|██████████| 2/2 [00:12<00:00,  6.41s/file]

Number of nodes: 545





Parsing nodes:   0%|          | 0/545 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/757 [00:00<?, ?it/s]

In [22]:
# The question does not name the companies; the baseline engine answers from
# whatever chunks the similarity search over the combined index returns.
question = "Compare and contrast the their major assets and liabilities in 2021"
response = query_engine.query(question)

# Last expression of the cell, so the Markdown object is rendered as its output.
display_markdown(question, response.response)


## Question:
Compare and contrast the their major assets and liabilities in 2021

## Answer:
Based on the financial information provided, here is a comparison of Uber and Lyft's major assets and liabilities in 2021:

Assets:
- Uber had significant holdings of marketable equity securities worth $11.4 billion, while Lyft did not report holding any marketable equity securities.
- Uber held non-marketable debt securities worth $2.3 billion in 2020, but did not report a value for 2021. Lyft did not report holding non-marketable debt securities.
- Both companies had deferred tax assets, with Lyft reporting $206 million and Uber not providing the 2021 value.
- Lyft reported operating lease right-of-use assets of $223 million, while Uber did not disclose a comparable figure.

Liabilities:  
- Uber reported a $193 million liability related to a call option for MLU B.V., while Lyft did not have a similar liability.
- Lyft had $264 million in operating lease liabilities and $28 million in finance lease liabilities. Uber did not provide details on lease liabilities.
- Lyft reported $32 million in deferred tax liabilities, while Uber's amount was not stated.

Overall, Uber's major assets included significant marketable equity securities, while Lyft's prominent assets were deferred tax assets and operating lease right-of-use assets. On the liability side, Lyft provided more details on lease liabilities compared to Uber.


## Using Sub Question decomposition method

In [23]:
# One index per filing, so that each company can be queried on its own.
uber_index = advanced_RAG(vector_size=vector_size, input_file="./data/10-K/uber_2021.pdf")
lyft_index = advanced_RAG(vector_size=vector_size, input_file="./data/10-K/lyft_2021.pdf")

Ingesting document: ./data/10-K/uber_2021.pdf...
Url postgresql://postgres:***@localhost:5432


Loading files: 100%|██████████| 1/1 [00:07<00:00,  7.52s/file]

Number of nodes: 307





Parsing nodes:   0%|          | 0/307 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/410 [00:00<?, ?it/s]

Ingesting document: ./data/10-K/lyft_2021.pdf...
Url postgresql://postgres:***@localhost:5432


Loading files: 100%|██████████| 1/1 [00:05<00:00,  5.03s/file]

Number of nodes: 238





Parsing nodes:   0%|          | 0/238 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/344 [00:00<?, ?it/s]

In [24]:

# One query engine per filing; each answers from its 3 best-matching chunks.
lyft_engine = lyft_index.as_query_engine(similarity_top_k=3, verbose=True)
uber_engine = uber_index.as_query_engine(similarity_top_k=3, verbose=True)

# Wrap every engine as a named tool. The description is the only information
# attached to a tool here, so it states which company and year it covers.
query_engine_tools = [
    QueryEngineTool(
        query_engine=engine,
        metadata=ToolMetadata(name=tool_name, description=tool_description),
    )
    for engine, tool_name, tool_description in (
        (
            lyft_engine,
            "lyft_10k",
            "Provides information about Lyft financials for year 2021",
        ),
        (
            uber_engine,
            "uber_10k",
            "Provides information about Uber financials for year 2021",
        ),
    )
]

# Engine that splits a question into sub-questions, one tool call each
# (see the "Generated N sub questions" trace it prints when verbose).
s_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    verbose=True,
)



In [25]:
# Same question as asked of the baseline engine, now sent through the
# sub-question engine so the two answers can be compared.
question = "Compare and contrast the their major assets and liabilities in 2021"
response = s_engine.query(question)

# Last expression of the cell, so the Markdown object is rendered as its output.
display_markdown(question, response.response)

Generated 4 sub questions.
[1;3;38;2;237;90;200m[uber_10k] Q: What were Uber's major assets in 2021?
[0m[1;3;38;2;90;149;237m[uber_10k] Q: What were Uber's major liabilities in 2021?
[0m[1;3;38;2;11;159;203m[lyft_10k] Q: What were Lyft's major assets in 2021?
[0m[1;3;38;2;155;135;227m[lyft_10k] Q: What were Lyft's major liabilities in 2021?
[0m[1;3;38;2;155;135;227m[lyft_10k] A: Based on the consolidated balance sheet information provided, Lyft's major liabilities in 2021 were:

1. Insurance reserves: $1,068,628,000
2. Accrued and other current liabilities: $1,211,641,000  
3. Operating lease liabilities (current and non-current): $263,997,000
4. Long-term debt: $655,173,000

The insurance reserves and accrued liabilities made up the bulk of Lyft's current liabilities totaling $2,463,576,000 as of December 31, 2021. The operating lease liabilities and long-term debt were the other major non-current liability components on Lyft's balance sheet for 2021.
[0m[1;3;38;2;237;90;20


## Question:
Compare and contrast the their major assets and liabilities in 2021

## Answer:
In terms of major assets in 2021, both Uber and Lyft had significant cash and investment holdings, with Uber's cash and investments totaling around $16.1 billion, while Lyft's cash, cash equivalents, and investments amounted to around $3.3 billion. However, Uber had substantially higher goodwill and intangible assets of $10.8 billion, likely from its larger acquisition activity, compared to Lyft's goodwill of $180.5 million. Uber also had higher property and equipment assets of $1.9 billion versus Lyft's $298.2 million.

Regarding major liabilities, both companies had substantial accrued liabilities, with Uber's accrued and other current liabilities at $6.5 billion and Lyft's at $1.2 billion. Uber had a significantly higher long-term debt balance of $9.3 billion compared to Lyft's $655 million. Additionally, Uber had long-term insurance reserves of $2.5 billion, while Lyft's insurance reserves were $1.1 billion. Lyft's operating lease liabilities of $264 million were lower than Uber's $1.8 billion.

In summary, while both companies had considerable cash and investment assets, Uber's assets were significantly larger, driven by higher goodwill, intangible assets, and property and equipment. On the liabilities side, Uber had substantially higher debt, accrued liabilities, and insurance reserves compared to Lyft, reflecting its larger scale of operations and acquisitions.


In [26]:
# Comparison that needs a figure for two different years from each company.
question = "Compare revenue growth of Uber and Lyft from 2020 to 2021"
response = s_engine.query(question)

# Last expression of the cell, so the Markdown object is rendered as its output.
display_markdown(question, response.response)

Generated 4 sub questions.
[1;3;38;2;237;90;200m[uber_10k] Q: What was Uber's revenue in 2020?
[0m[1;3;38;2;90;149;237m[uber_10k] Q: What was Uber's revenue in 2021?
[0m[1;3;38;2;11;159;203m[lyft_10k] Q: What was Lyft's revenue in 2020?
[0m[1;3;38;2;155;135;227m[lyft_10k] Q: What was Lyft's revenue in 2021?
[0m[1;3;38;2;90;149;237m[uber_10k] A: According to the financial information provided, Uber's total revenue for the year 2021 was $17.455 billion.
[0m[1;3;38;2;237;90;200m[uber_10k] A: According to the financial information provided, Uber's revenue in 2020 was $11,139 million.
[0m[1;3;38;2;155;135;227m[lyft_10k] A: According to the financial information provided, Lyft's revenue for the year ended December 31, 2021 was $3,208,323,000.
[0m[1;3;38;2;11;159;203m[lyft_10k] A: According to the consolidated statements of operations provided in the context, Lyft's revenue in 2020 was $2,364,681,000.
[0m


## Question:
Compare revenue growth of Uber and Lyft from 2020 to 2021

## Answer:
Uber experienced a higher revenue growth rate compared to Lyft from 2020 to 2021. Uber's revenue increased from $11.139 billion in 2020 to $17.455 billion in 2021, representing a growth of approximately 56.7%. On the other hand, Lyft's revenue grew from $2.364 billion in 2020 to $3.208 billion in 2021, which translates to a growth rate of around 35.7%. Therefore, Uber's revenue growth outpaced Lyft's during the same period.


In [27]:
# Asks about 2017-2018 while the indexed files are named *_2021.pdf;
# presumably a probe of how the engine handles a question the documents
# cannot answer. TODO confirm intent.
question = "Compare revenue growth of Uber and Lyft from 2017 to 2018"
response = s_engine.query(question)

# Last expression of the cell, so the Markdown object is rendered as its output.
display_markdown(question, response.response)

Generated 2 sub questions.
[1;3;38;2;237;90;200m[uber_10k] Q: What is the revenue growth of Uber in 2021?
[0m[1;3;38;2;90;149;237m[lyft_10k] Q: What is the revenue growth of Lyft in 2021?
[0m[1;3;38;2;90;149;237m[lyft_10k] A: Based on the information provided, Lyft's revenue in 2021 was $3,208,323,000, which represents an increase of 35.6% compared to its revenue of $2,364,681,000 in 2020.
[0m[1;3;38;2;237;90;200m[uber_10k] A: According to the financial highlights provided, Uber's revenue grew by 57% in 2021 compared to 2020, increasing from $11.1 billion in 2020 to $17.5 billion in 2021.
[0m


## Question:
Compare revenue growth of Uber and Lyft from 2017 to 2018

## Answer:
Unfortunately, the provided context does not contain any information about the revenue growth of Uber and Lyft from 2017 to 2018. The context only mentions their revenue figures and growth rates for 2020 and 2021. Without data on their revenues or growth rates for 2017 and 2018, it is not possible to compare their revenue growth during that period based solely on the given information.


In [28]:
# Several metrics (revenue, profit, expenses) requested at once; the companies
# are not named in the question.
question = "What is these companies' revenue, profit, and expenses for 2021?"
response = s_engine.query(question)

# Last expression of the cell, so the Markdown object is rendered as its output.
display_markdown(question, response.response)

Generated 6 sub questions.
[1;3;38;2;237;90;200m[uber_10k] Q: What was Uber's revenue for 2021?
[0m[1;3;38;2;90;149;237m[uber_10k] Q: What was Uber's profit/loss for 2021?
[0m[1;3;38;2;11;159;203m[uber_10k] Q: What were Uber's expenses for 2021?
[0m[1;3;38;2;155;135;227m[lyft_10k] Q: What was Lyft's revenue for 2021?
[0m[1;3;38;2;237;90;200m[lyft_10k] Q: What was Lyft's profit/loss for 2021?
[0m[1;3;38;2;90;149;237m[lyft_10k] Q: What were Lyft's expenses for 2021?
[0m[1;3;38;2;237;90;200m[lyft_10k] A: According to the consolidated statements of operations provided in the context information, Lyft, Inc. reported a net loss of $1,009,359,000 for the year ended December 31, 2021.
[0m[1;3;38;2;90;149;237m[uber_10k] A: According to Uber's consolidated statement of operations for 2021, the company reported a net loss attributable to Uber Technologies, Inc. of $496 million for the year ended December 31, 2021.
[0m[1;3;38;2;155;135;227m[lyft_10k] A: According to the financial 


## Question:
What is these companies' revenue, profit, and expenses for 2021?

## Answer:
For Uber in 2021:
Revenue: $17.455 billion
Net Loss: $496 million
Major Expenses: Cost of revenue ($9.351 billion), Operations and support ($1.877 billion), Sales and marketing ($4.789 billion), Research and development ($2.054 billion), General and administrative ($2.316 billion), Depreciation and amortization ($902 million), Interest expense ($483 million)

For Lyft in 2021: 
Revenue: $3.208 billion  
Net Loss: $1.009 billion
Major Expenses: Cost of revenue ($1.650 billion), Operations and support ($402 million), Research and development ($912 million), Sales and marketing ($411 million), General and administrative ($916 million)
