In [None]:
# from https://colab.research.google.com/drive/1uL1TdMbR4kqa0Ksrd_Of_jWSxWt1ia7o?usp=sharing#scrollTo=c48a272c-8e87-4740-9960-129d7d5943bb
# https://betterprogramming.pub/llamaindex-deep-lake-for-financial-statement-analysis-954f2b789c8e


In [48]:
from datetime import datetime
from pathlib import Path

import dotenv
from IPython.display import Markdown, display
from langchain import OpenAI
from llama_index import download_loader, ServiceContext, GPTVectorStoreIndex
from llama_index import GPTListIndex, LLMPredictor
from llama_index import StorageContext, load_index_from_storage
from llama_index.composability import ComposableGraph

dotenv.load_dotenv()


True

In [None]:
# set text wrapping
from IPython.display import HTML, display

def set_css(*_event_args, **_event_kwargs):
  """Display a <style> block that makes <pre> text wrap.

  The function is registered as an IPython event callback. IPython may call
  event callbacks with arguments (e.g. an execution-info object for
  'pre_run_cell'), so any positional or keyword arguments are accepted and
  ignored. Calling it with no arguments still works.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
# Register set_css on IPython's 'pre_run_cell' event so it is invoked before each cell runs.
get_ipython().events.register('pre_run_cell', set_css)

## Ingest data

In [2]:
# mkdir uber
# mkdir tmp
# wget "https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1" -O tmp/UBER.zip
# unzip tmp/UBER.zip -d tmp
# mv tmp/UBER/*.html ./uber

In [3]:
# Obtain the UnstructuredReader loader class through download_loader.
# refresh_cache=True is passed (presumably forcing a fresh download — confirm).
UnstructuredReader = download_loader(
    "UnstructuredReader",
    refresh_cache=True,
)


In [4]:
# Load one HTML filing per year and tag every resulting document with its year.
years = [2022, 2021, 2020, 2019]
loader = UnstructuredReader()
doc_set = {}   # year -> documents loaded for that year
all_docs = []  # documents for every year, in the order of `years`
for year in years:
    filing_path = Path(f'./uber/UBER_{year}.html')
    year_docs = loader.load_data(file=filing_path, split_documents=False)
    for doc in year_docs:
        # overwrite extra_info so it carries only the filing year
        doc.extra_info = {"year": year}
    doc_set[year] = year_docs
    all_docs += year_docs
    

[nltk_data] Downloading package punkt to /Users/drucev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/drucev/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...


In [5]:
# The service context is what talks to OpenAI (or another LLM).
# Reference: https://gpt-index.readthedocs.io/en/latest/reference/service_context.html

service_context = ServiceContext.from_defaults(
    chunk_size_limit=512,
)


In [6]:
# initialize simple vector indices + global vector index
# NOTE: don't run this cell if the indices are already loaded! 
# generates many calls to openai to compute embedding vectors
# https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/SimpleIndexDemo.html

# index_set = {}

# for year in years:    
#     print(datetime.now(), 'indexing', year)
#     index_id = "index_%d" % year
#     cur_index = GPTVectorStoreIndex.from_documents(doc_set[year],
#                                                    service_context=service_context)
#     index_set[year] = cur_index
#     cur_index.storage_context.persist(index_id)


indexing 2022


INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 232797 tokens


indexing 2021


INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 241424 tokens


indexing 2020


INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 257154 tokens


indexing 2019


INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 246480 tokens


In [40]:
# Load the per-year indices that were persisted to "index_<year>" directories.
# Requires StorageContext and load_index_from_storage to be imported from llama_index.
index_set = {}  # year -> loaded index
for year in years:
    index_id = f"index_{year}"
    print(datetime.now(), 'loading', year)
    # load index
    storage_context = StorageContext.from_defaults(persist_dir=index_id)
    cur_index = load_index_from_storage(storage_context)
    index_set[year] = cur_index


2023-05-06 16:26:00.451445 loading 2022


INFO:llama_index.indices.loading:Loading all indices.


2023-05-06 16:26:00.764079 loading 2021


INFO:llama_index.indices.loading:Loading all indices.


2023-05-06 16:26:01.083362 loading 2020


INFO:llama_index.indices.loading:Loading all indices.


2023-05-06 16:26:01.475836 loading 2019


INFO:llama_index.indices.loading:Loading all indices.


In [42]:
# NOTE: this global index is a single vector store containing all documents
# Only relevant for the section below: "Can a single vector index answer questions across years?"
# this generates many calls, and should be possible to duplicate by composing global index from individual indexes

# global_index = GPTVectorStoreIndex.from_documents(all_docs,
#                                                   service_context=service_context)
# global_index.storage_context.persist("index_global")


INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 977855 tokens


In [44]:
# Load the all-years index persisted in the "index_global" directory.
# Requires StorageContext and load_index_from_storage to be imported from llama_index.
global_storage_context = StorageContext.from_defaults(persist_dir="index_global")
global_index = load_index_from_storage(global_storage_context)


INFO:llama_index.indices.loading:Loading all indices.


In [45]:
# Build a query engine over the 2020 index only and ask a single-year question.
query_engine = index_set[2020].as_query_engine(
    similarity_top_k=3,
    retriever_mode="embedding",
    service_context=service_context,
    verbose=True,
)
query = "What were some of the biggest risk factors in 2020?"
response = query_engine.query(query)
print(response)


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 11 tokens
Token indices sequence length is longer than the specified maximum sequence length for this model (1318 > 1024). Running this sequence through the model will result in indexing errors
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1502 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens



The biggest risk factors in 2020 included:
- Changes in normal business practices necessitated by the outbreak and related governmental actions.
- Privacy, cybersecurity and fraud risks associated with increased remote working.
- Legal or regulatory challenges to our understanding of applicable legal and regulatory requirements.
- Heightened risks associated with the launch of new services, features, or health and safety requirements.
- Financial impacts of the COVID-19 pandemic, including reductions in workforce and changes to pricing models.
- Uncertainty around the ultimate impact of the pandemic on our future business operations, liquidity, financial condition, and results of operations.
- Potential fines or other enforcement measures resulting from legal or regulatory challenges.
- Adverse impacts on our business partners and third-party vendors.
- Extreme volatility in financial markets.
- Risk of Drivers being classified as employees, workers or quasi-employees.


In [46]:
# Second question for the `query_engine` built in an earlier cell (2020 index).
query = "What were some of the significant acquisitions?"
response = query_engine.query(query)
print(response)


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 11 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1371 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens



Some of the significant acquisitions include the acquisition of Careem, the purchase of a controlling interest in Cornershop, the acquisition of Postmates, the acquisition of Routematch Holdings, Inc., and the acquisition of Cornershop Global LLC.


In [47]:
# Query the single index that holds every year's documents.
query_all = global_index.as_query_engine(
    similarity_top_k=3,
    retriever_mode="embedding",
    service_context=service_context,
    verbose=True,
)
risk_query_str = "What are some of the biggest risk factors in each year?"
response = query_all.query(risk_query_str)
print(response)


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 12 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1601 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens



2019:
- Drivers being classified as employees, workers or quasi-employees instead of independent contractors.
- Highly competitive mobility, delivery, and logistics industries with well-established and low-cost alternatives.
- Security breaches exposing the company to liability under various laws and regulations across jurisdictions.

2020:
- Drivers being classified as employees, workers or quasi-employees instead of independent contractors.
- Highly competitive mobility, delivery, and logistics industries with well-established and low-cost alternatives.
- Security breaches exposing the company to liability under various laws and regulations across jurisdictions.
- Inability to anticipate and prevent security techniques and attacks.
- Potential liability not covered by insurance.

2022:
- Drivers being classified as employees, workers or quasi-employees instead of independent contractors.
- Highly competitive mobility, delivery, and logistics industries with well-established and low-

In [49]:
# One short summary string per year, keyed by year.
summaries = {
    year: f"UBER 10-k Filing for {year} fiscal year"
    for year in years
}
    

In [50]:
# Cap the number of output tokens at 512.
# NOTE: this rebinds `service_context`; cells that run after this one use the
# new context (built from the predictor, without chunk_size_limit).
completion_llm = OpenAI(temperature=0, max_tokens=512)
llm_predictor = LLMPredictor(llm=completion_llm)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)


In [51]:
# Compose the per-year indices under a GPTListIndex root; each child index is
# paired positionally with its summary string.
child_indices = [index_set[y] for y in years]
child_summaries = [summaries[y] for y in years]
graph = ComposableGraph.from_indices(
    GPTListIndex,
    child_indices,
    child_summaries,
    service_context=service_context,
)


INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens


In [54]:
# Prompt asking for per-year risk factors plus a cross-year summary.
# The three pieces are joined with newlines to form a short bulleted request.
risk_query_str = "\n".join([
    "Describe the current risk factors. If the year is provided in the information, "
    "provide that as well. If the context contains risk factors for multiple years, "
    "explicitly provide the following:",
    "- A description of the risk factors for each year",
    "- A summary of how these risk factors are changing across years",
])


In [55]:
# Per-index-type query settings.
# TODO: not wired in yet — still unclear how to pass these to the graph.
query_configs = [
    # settings for "dict" index structs
    dict(
        index_struct_type="dict",
        query_mode="default",
        query_kwargs=dict(
            similarity_top_k=1,
            # include_summary=True
        ),
    ),
    # settings for "list" index structs
    dict(
        index_struct_type="list",
        query_mode="default",
        query_kwargs=dict(
            response_mode="tree_summarize",
        ),
    ),
]


In [73]:
# Query the composed graph using its default query engine (no extra settings passed).
query_graph = graph.as_query_engine()
response_summary = query_graph.query(risk_query_str)
print(response_summary)


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 60 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1157 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1161 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> 


The current risk factors for 2020 include: the COVID-19 pandemic and the impact of actions to mitigate the pandemic, the potential for Drivers to be classified as employees, workers or quasi-employees instead of independent contractors, and the highly competitive mobility, delivery, and logistics industries with well-established and low-cost alternatives, low barriers to entry, low switching costs, and well-capitalized competitors in nearly every major geographic region. To remain competitive in certain markets, the company has in the past taken actions such as offering discounts and promotions. 

The risk factors for 2021 include the COVID-19 pandemic and the impact of actions to mitigate the pandemic, the potential classification of Drivers as employees, workers or quasi-employees instead of independent contractors, the highly competitive mobility, delivery, and logistics industries, and the need to lower fares or service fees and offer significant Driver incentives and consumer dis

In [62]:
# Print the formatted list of sources attached to the graph response.
print(response_summary.get_formatted_sources())


> Source (Doc id: 9802aa2b-4bc5-4473-8ee4-f264e0e6c717): 
The current risk factors for 2022 include: 
- Drivers being classified as employees, workers or ...

> Source (Doc id: f3f59008-1e79-4cc2-91b4-626e2cf53ee4): 
The year provided in the context is 2021. The risk factors for 2021 include the COVID-19 pandemi...

> Source (Doc id: f268b0b8-e3e9-4528-a5e3-a929a9d0132b): 
The current risk factors for 2020 include: the COVID-19 pandemic and the impact of actions to mi...

> Source (Doc id: a69c78a7-ab70-4975-929d-0fc20ee64b61): 
The risk factors for 2019 include: interest rate risk, investment risk, and foreign currency ris...

> Source (Doc id: 09c20808-d36b-4ef6-afb2-684f6af1d70d): year: 2022

and certain events we participate in or host with members of the investment community...

> Source (Doc id: 9f1f89e7-98de-47cc-92bd-342b02408ee9): year: 2022

risks and uncertainties not currently known to us or that we currently do not believe...

> Source (Doc id: 84175b78-7559-4247-a6f2-a666

In [64]:
# Send the same multi-year prompt to the 2022 index alone.
engine_2022 = index_set[2022].as_query_engine(
    similarity_top_k=3,
    retriever_mode="embedding",
    service_context=service_context,
    verbose=True,
)
response_tmp = engine_2022.query(risk_query_str)
print(response_tmp)


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 60 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1577 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens



For the year 2022, the risk factors include: 

- Drivers being classified as employees, workers or quasi-employees instead of independent contractors.
- The mobility, delivery, and logistics industries being highly competitive, with well-established and low-cost alternatives that have been available for decades, low barriers to entry, low switching costs, and well-capitalized competitors in nearly every major geographic region.
- Lowering of fares or service fees, and offering of significant Driver incentives and consumer discounts and promotions.
- Incurring significant losses since inception, including in the United States and other major markets.
- Attracting and maintaining a critical mass of Drivers, consumers, merchants, Shippers, and Carriers.
- Retaining and attracting high-quality personnel.
- Maintaining and enhancing brand and reputation.
- Addressing operational, compliance, and cultural challenges.
- Impact of economic conditions, including the resulting effect on discret

In [66]:
# Send the same multi-year prompt to the single all-years index.
global_engine = global_index.as_query_engine(
    similarity_top_k=3,
    retriever_mode="embedding",
    service_context=service_context,
    verbose=True,
)
response = global_engine.query(risk_query_str)
print(response)


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 60 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1636 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens



For year 2020:
The risk factors include: the potential for Drivers to be classified as employees, workers or quasi-employees instead of independent contractors; the highly competitive mobility, delivery, and logistics industries; the potential for lowering fares or service fees to remain competitive; and the potential for risks and uncertainties not currently known or believed to be material.

For year 2022:
The risk factors include: the potential for Drivers to be classified as employees, workers or quasi-employees instead of independent contractors; the highly competitive mobility, delivery, and logistics industries; the potential for lowering fares or service fees to remain competitive; the potential for risks and uncertainties not currently known or believed to be material; and the potential for the COVID-19 pandemic and the actions to mitigate it to adversely affect parts of the business.

Summary:
The risk factors for 2020 and 2022 are largely the same, with the addition of the 