In [None]:
# Report the interpreter that is actually running this kernel.
# A shell `python --version` asks whichever `python` comes first on PATH,
# which can be a different installation than the one the kernel uses.
import platform
import sys

print(f"Python {platform.python_version()} ({sys.executable})")

In [None]:
%%capture
%pip install llama-index-readers-file pymupdf
%pip install llama-index-vector-stores-postgres
%pip install llama-index-embeddings-huggingface
%pip install llama-index-llms-bedrock
%pip install llama-index-embeddings-bedrock
%pip install psycopg2-binary
%pip install ipywidgets
%pip install SQLAlchemy
%pip install python-dotenv

In [1]:
import nest_asyncio

# Patch asyncio so event loops can be nested.
# NOTE(review): presumably required because the query engines used later drive
# async code from inside the notebook's already-running event loop — confirm.
nest_asyncio.apply()

In [2]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine
from dotenv import load_dotenv
# Load variables from the local .env file into the process environment
# (setup_models reads the AWS_* variables from os.environ in remote mode).
# Left as the last expression so the cell shows whether a file was loaded.
load_dotenv(dotenv_path=".env", verbose=True)

True

Reference: LlamaIndex 10-K sub-question example — https://docs.llamaindex.ai/en/stable/examples/usecases/10k_sub_question/

In [None]:
! mkdir -p 'data/'
! wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/uber_2021.pdf' -O 'data/uber_2021.pdf'
! wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/lyft_2021.pdf' -O 'data/lyft_2021.pdf'

In [3]:
from llama_index.core import Settings
from llama_index.llms.bedrock import Bedrock
from llama_index.llms.lmstudio import LMStudio
from llama_index.embeddings.bedrock import BedrockEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler
import os

def setup_models(mode="local"):
    """
    Configure the global LlamaIndex ``Settings`` with an embedding model and an LLM.

    Args:
        mode: ``"local"`` uses a HuggingFace embedding model and an LM Studio
            server at ``http://localhost:1234/v1``; ``"remote"`` uses Amazon
            Bedrock for both the embedding model and the LLM.

    Raises:
        ValueError: if ``mode`` is neither ``"local"`` nor ``"remote"``.
        KeyError: in remote mode, if ``AWS_DEFAULT_REGION``, ``AWS_ACCESS_KEY_ID``
            or ``AWS_SECRET_ACCESS_KEY`` is not set in the environment.
    """
    # Validate first so an unknown mode never leaves Settings half-configured.
    if mode not in ("local", "remote"):
        raise ValueError(f"Unknown mode: {mode}")

    # Chunking is the same for both modes.
    Settings.chunk_size = 1024
    Settings.chunk_overlap = 20

    if mode == "local":
        # Embedding model
        embedding_model = "BAAI/bge-base-en-v1.5"
        print(f"Setting up local LLM1 (embedding model: {embedding_model})...")
        Settings.embed_model = HuggingFaceEmbedding(model_name=embedding_model)

        # Main LLM, served by a locally running LM Studio instance
        llm_model = "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF"
        print(f"Setting up local LLM2 (main LLM: {llm_model})...")
        Settings.llm = LMStudio(
            model_name=llm_model,
            base_url="http://localhost:1234/v1",
            temperature=0,
            request_timeout=120,
        )
    else:
        # Embedding model
        embedding_model = "cohere.embed-multilingual-v3"
        print(f"Setting up remote LLM1 (embedding model: {embedding_model})...")
        Settings.embed_model = BedrockEmbedding(
            model_name=embedding_model,
            region_name=os.environ["AWS_DEFAULT_REGION"],
        )

        # Main LLM
        llm_model = "anthropic.claude-3-sonnet-20240229-v1:0"
        print(f"Setting up remote LLM2 (main LLM: {llm_model})...")
        Settings.llm = Bedrock(
            model=llm_model,
            aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
            aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
            # Only temporary credentials carry a session token; long-lived
            # access keys do not, so a missing variable must not raise.
            aws_session_token=os.environ.get("AWS_SESSION_TOKEN"),
            region_name=os.environ["AWS_DEFAULT_REGION"],
            request_timeout=120,
        )
    
# Optional debug tracing (disabled). Uncomment to attach a LlamaDebugHandler.
# llama_debug = LlamaDebugHandler(print_trace_on_end=True)
# callback_manager = CallbackManager([llama_debug])
# Settings.callback_manager = callback_manager

setup_models(mode="remote")

# Embed a probe sentence: this smoke-tests the embedding model and yields the
# embedding dimensionality that the vector store is created with later on.
text_embedding = Settings.embed_model.get_text_embedding("Once upon a time, there was a cat.")
vector_size = len(text_embedding)
print(text_embedding[:5])
print(f"Emedding length: {vector_size}")


Setting up remote LLM1 (embedding model: cohere.embed-multilingual-v3)...
Setting up remote LLM2 (main LLM: anthropic.claude-3-sonnet-20240229-v1:0)...
[-0.043518066, -0.010955811, -0.00032567978, 0.0057792664, -0.016540527]
Emedding length: 1024


In [4]:
import os

import psycopg2
import nest_asyncio

# Connection settings. `connection_string` and `db_name` stay module-level
# because advanced_RAG() reads both when it builds the vector store.
# PG_PASSWORD overrides the password so a real secret need not live in the
# notebook; the fallback is the value that used to be hard-coded here.
# NOTE: the password is interpolated into a URL, so it must be URL-safe.
pg_pw = os.environ.get("PG_PASSWORD", "mysecretpassword")
pg_db = "vector_store"
connection_string = f"postgresql://postgres:{pg_pw}@localhost:5432"
db_name = pg_db

try:
    conn = psycopg2.connect(connection_string)
    try:
        # Autocommit is enabled so each statement runs outside an explicit
        # transaction (and no separate commit is needed afterwards).
        conn.autocommit = True

        with conn.cursor() as c:
            # IF EXISTS: on the first run there is no database to drop yet.
            # Without it the DROP fails and the CREATE below is never reached.
            c.execute(f"DROP DATABASE IF EXISTS {db_name} WITH (FORCE);")
            c.execute(f"CREATE DATABASE {db_name};")
    finally:
        # Always release the connection, including when a statement fails.
        conn.close()

    nest_asyncio.apply()

except Exception as e:
    # Best-effort reset: report the problem and let the notebook continue.
    print(e)
    

In [5]:

from llama_index.core import SimpleDirectoryReader
from sqlalchemy import make_url
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.vector_stores.postgres import PGVectorStore

def advanced_RAG(vector_size, input_file, table_name=None):
    """
    Load one document, embed it into a Postgres vector table and return an index.

    Reads the module-level ``connection_string`` and ``db_name`` to locate the
    database.

    Args:
        vector_size: embedding dimensionality the table is created with
            (passed to the vector store as ``embed_dim``).
        input_file: path of the document to load.
        table_name: table to store the embeddings in. When omitted, a name is
            derived from the file name (``advanced_rag_<file stem>``) so that
            every document gets its own table.

    Returns:
        The ``VectorStoreIndex`` built over the loaded document.
    """
    import re
    from pathlib import Path

    if table_name is None:
        # A single shared table would make every index created by this
        # function query the same rows, so e.g. a "Lyft" query engine would
        # also retrieve Uber chunks. Derive a per-document table instead.
        stem = re.sub(r"\W+", "_", Path(input_file).stem).strip("_").lower()
        table_name = f"advanced_rag_{stem}" if stem else "advanced_rag"

    print(f"Ingesting document: {input_file}...")
    url = make_url(connection_string)
    print(f"Url {url}")

    vector_store = PGVectorStore.from_params(
        database=db_name,
        host=url.host,
        password=url.password,
        port=url.port,
        user=url.username,
        table_name=table_name,
        embed_dim=vector_size,
    )

    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # ingest_document() returns the loader's output, which is handed to
    # from_documents() below for indexing.
    documents = ingest_document(input_file)

    print(f"Number of nodes: {len(documents)}")

    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context, show_progress=True
    )
    return index

def ingest_document(input_file):
    """
    Read a single file with SimpleDirectoryReader and return the loaded result.

    Nothing is written to the vector store here; the caller is responsible
    for indexing what this function returns.
    """
    return SimpleDirectoryReader(input_files=[input_file]).load_data(show_progress=True)

In [6]:
# Build one index per 10-K filing.
uber_index = advanced_RAG(input_file="./data/uber_2021.pdf", vector_size=vector_size)
lyft_index = advanced_RAG(input_file="./data/lyft_2021.pdf", vector_size=vector_size)

Ingesting document: ./data/uber_2021.pdf...
Url postgresql://postgres:***@localhost:5432


Loading files: 100%|██████████| 1/1 [00:10<00:00, 10.82s/file]

Number of nodes: 307





Parsing nodes:   0%|          | 0/307 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/409 [00:00<?, ?it/s]

Ingesting document: ./data/lyft_2021.pdf...
Url postgresql://postgres:***@localhost:5432


Loading files: 100%|██████████| 1/1 [00:07<00:00,  7.63s/file]

Number of nodes: 238





Parsing nodes:   0%|          | 0/238 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/341 [00:00<?, ?it/s]

## Using naive LlamaIndex RAG

## Using Sub Question decomposition method

In [7]:
from IPython.display import Markdown
from llama_index.core.callbacks import CBEventType, EventPayload

# One query engine per company, each created with similarity_top_k=3.
lyft_engine = lyft_index.as_query_engine(similarity_top_k=3)
uber_engine = uber_index.as_query_engine(similarity_top_k=3)

# Expose each engine as a named, described tool for the sub-question engine.
_tool_specs = (
    ("lyft_10k", "Provides information about Lyft financials for year 2021", lyft_engine),
    ("uber_10k", "Provides information about Uber financials for year 2021", uber_engine),
)
query_engine_tools = [
    QueryEngineTool(
        query_engine=engine,
        metadata=ToolMetadata(name=tool_name, description=tool_description),
    )
    for tool_name, tool_description, engine in _tool_specs
]

# verbose=True makes the engine print its sub-questions and their answers.
s_engine = SubQuestionQueryEngine.from_defaults(
    verbose=True,
    query_engine_tools=query_engine_tools,
)

def display_markdown(question, response):
    """
    Build a Markdown object showing the question and the answer, each under
    its own level-2 heading.
    """
    body = (
        "\n"
        "## Question:\n"
        f"{question}\n"
        "\n"
        "## Answer:\n"
        f"{response}\n"
    )
    return Markdown(body)

In [8]:
# Balance-sheet comparison across both companies.
question = "Compare and contrast the their major assets and liabilities in 2021"
response = s_engine.query(question)

# Last expression of the cell, so the rendered Markdown becomes its output.
display_markdown(question=question, response=response.response)

Generated 4 sub questions.
[1;3;38;2;237;90;200m[uber_10k] Q: What were Uber's major assets in 2021?
[0m[1;3;38;2;90;149;237m[uber_10k] Q: What were Uber's major liabilities in 2021?
[0m[1;3;38;2;11;159;203m[lyft_10k] Q: What were Lyft's major assets in 2021?
[0m[1;3;38;2;155;135;227m[lyft_10k] Q: What were Lyft's major liabilities in 2021?
[0m[1;3;38;2;155;135;227m[lyft_10k] A: Based on the consolidated balance sheet information provided, Lyft's major liabilities in 2021 were:

1. Insurance reserves: $1,068,628,000
2. Accrued and other current liabilities: $1,211,641,000  
3. Operating lease liabilities (current and non-current): $264,997,000
4. Long-term debt, net of current portion: $655,173,000

The insurance reserves and accrued liabilities made up the bulk of Lyft's current liabilities totaling $2,463,576,000 as of December 31, 2021. The operating lease liabilities and long-term debt were the major non-current liabilities on Lyft's balance sheet for that year.
[0m[1;3;


## Question:
Compare and contrast the their major assets and liabilities in 2021

## Answer:
In 2021, both Uber and Lyft had significant cash and investment holdings as major assets, reflecting their capital-intensive business models. However, Uber's cash and investments were substantially larger, totaling around $16 billion compared to Lyft's $3.3 billion.

Uber also had a much higher goodwill balance of $8.4 billion, stemming from its numerous acquisitions, while Lyft's goodwill was only $180.5 million. Uber's property and equipment assets were higher at $1.9 billion versus Lyft's $298.2 million.

On the liabilities side, both companies had substantial insurance reserves, accrued liabilities, and operating lease obligations. However, Uber's long-term debt of $9.3 billion was significantly higher than Lyft's $655 million, reflecting Uber's larger scale and financing needs.

Overall, while both companies had similar categories of major assets and liabilities, Uber's balances were substantially larger across most line items, commensurate with its larger global operations and market presence compared to Lyft.


In [9]:
# Year-over-year revenue growth, using years both filings cover.
question = "Compare revenue growth of Uber and Lyft from 2020 to 2021"
response = s_engine.query(question)

# Last expression of the cell, so the rendered Markdown becomes its output.
display_markdown(question=question, response=response.response)

Generated 4 sub questions.
[1;3;38;2;237;90;200m[uber_10k] Q: What was Uber's revenue in 2020?
[0m[1;3;38;2;90;149;237m[uber_10k] Q: What was Uber's revenue in 2021?
[0m[1;3;38;2;11;159;203m[lyft_10k] Q: What was Lyft's revenue in 2020?
[0m[1;3;38;2;155;135;227m[lyft_10k] Q: What was Lyft's revenue in 2021?
[0m[1;3;38;2;90;149;237m[uber_10k] A: According to the financial information provided, Uber's revenue in 2021 was $17.455 billion.
[0m[1;3;38;2;11;159;203m[lyft_10k] A: According to the consolidated statements of operations provided in the context information, Lyft's revenue for the year ended December 31, 2020 was $2,364,681,000.
[0m[1;3;38;2;155;135;227m[lyft_10k] A: According to the financial information provided, Lyft's revenue for the year ended December 31, 2021 was $3,208,323,000.
[0m[1;3;38;2;237;90;200m[uber_10k] A: According to the financial information provided, Uber's revenue in 2020 was $11,139 million.
[0m


## Question:
Compare revenue growth of Uber and Lyft from 2020 to 2021

## Answer:
Uber experienced a higher revenue growth rate compared to Lyft from 2020 to 2021. Uber's revenue increased by around 56.7% from $11.139 billion in 2020 to $17.455 billion in 2021. On the other hand, Lyft's revenue grew by approximately 35.7% from $2.365 billion in 2020 to $3.208 billion in 2021. Therefore, Uber's year-over-year revenue growth outpaced Lyft's during the same period.


In [10]:
# Same comparison for 2017-2018, asked of the 2021 filings.
question = "Compare revenue growth of Uber and Lyft from 2017 to 2018"
response = s_engine.query(question)

# Last expression of the cell, so the rendered Markdown becomes its output.
display_markdown(question=question, response=response.response)

Generated 4 sub questions.
[1;3;38;2;237;90;200m[uber_10k] Q: What was Uber's revenue in 2017?
[0m[1;3;38;2;90;149;237m[uber_10k] Q: What was Uber's revenue in 2018?
[0m[1;3;38;2;11;159;203m[lyft_10k] Q: What was Lyft's revenue in 2017?
[0m[1;3;38;2;155;135;227m[lyft_10k] Q: What was Lyft's revenue in 2018?
[0m[1;3;38;2;90;149;237m[uber_10k] A: Unfortunately, the provided context does not contain any information about Uber's revenue in 2018. The context only provides details on Uber's revenue for the years 2019, 2020, and 2021.
[0m[1;3;38;2;237;90;200m[uber_10k] A: Unfortunately, the provided context does not contain any information about Uber's revenue in 2017. The context only provides details on Uber's revenue for the years 2019, 2020, and 2021.
[0m[1;3;38;2;11;159;203m[lyft_10k] A: Unfortunately, the provided context does not contain any information about Lyft's revenue in 2017. The context only discusses Lyft's revenue, costs, and other financial metrics for the years


## Question:
Compare revenue growth of Uber and Lyft from 2017 to 2018

## Answer:
Unfortunately, I do not have enough information from the provided context to compare the revenue growth of Uber and Lyft from 2017 to 2018. The context does not contain any revenue figures for either company in those specific years, only covering their financials starting from 2019 onwards. Without data on their 2017 and 2018 revenues, it is impossible to calculate or analyze their revenue growth during that period based solely on the given context.


In [11]:
# Multi-metric question covering both companies.
question = "What is these companies' revenue, profit, and expenses for 2021?"
response = s_engine.query(question)

# Last expression of the cell, so the rendered Markdown becomes its output.
display_markdown(question=question, response=response.response)

Generated 6 sub questions.
[1;3;38;2;237;90;200m[uber_10k] Q: What was Uber's revenue for 2021?
[0m[1;3;38;2;90;149;237m[uber_10k] Q: What was Uber's profit/loss for 2021?
[0m[1;3;38;2;11;159;203m[uber_10k] Q: What were Uber's expenses for 2021?
[0m[1;3;38;2;155;135;227m[lyft_10k] Q: What was Lyft's revenue for 2021?
[0m[1;3;38;2;237;90;200m[lyft_10k] Q: What was Lyft's profit/loss for 2021?
[0m[1;3;38;2;90;149;237m[lyft_10k] Q: What were Lyft's expenses for 2021?
[0m[1;3;38;2;237;90;200m[uber_10k] A: According to the financial information provided, Uber's revenue for 2021 was $17.455 billion.
[0m[1;3;38;2;237;90;200m[lyft_10k] A: According to the consolidated statements of operations provided in the context information, Lyft, Inc. reported a net loss of $1,009,359,000 for the year ended December 31, 2021.
[0m[1;3;38;2;90;149;237m[uber_10k] A: According to Uber's consolidated statements of operations for 2021, the company reported a net loss attributable to Uber Techno


## Question:
What is these companies' revenue, profit, and expenses for 2021?

## Answer:
For Uber in 2021:
Revenue: $17.455 billion
Net Loss: $496 million
Major Expenses: Cost of revenue ($9.351 billion), Operations and support ($1.877 billion), Sales and marketing ($4.789 billion), Research and development ($2.054 billion), General and administrative ($2.316 billion), Depreciation and amortization ($902 million)

For Lyft in 2021:  
Revenue: $3.208 billion  
Net Loss: $1.009 billion
Major Expenses: Cost of revenue ($1.650 billion), Operations and support ($402 million), Research and development ($912 million), Sales and marketing ($411 million), General and administrative ($916 million)
