In [1]:
# from https://colab.research.google.com/drive/1uL1TdMbR4kqa0Ksrd_Of_jWSxWt1ia7o?usp=sharing#scrollTo=c48a272c-8e87-4740-9960-129d7d5943bb
# https://betterprogramming.pub/llamaindex-deep-lake-for-financial-statement-analysis-954f2b789c8e


In [None]:
# for colab https://colab.research.google.com/github/druce/question_answering_over_docs/blob/main/10kAnalysis.ipynb

# # if using colab
# import os
# OPENAI_API_KEY="<mykey>"
# os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# !pip install llama-index pytesseract pdf2image
# !pip uninstall rich
# !pip install rich==13.0.1

# # get data
# !mkdir uber
# !mkdir tmp
# !wget https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1 -O tmp/UBER.zip
# !unzip tmp/UBER.zip -d tmp
# !mv tmp/UBER/*.html ./uber


In [24]:
import sys
from datetime import datetime
from IPython.display import Markdown, display
from ipywidgets import interact, widgets
from pathlib import Path

import logging
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index import download_loader, ServiceContext, StorageContext, load_index_from_storage, GPTVectorStoreIndex
from llama_index import GPTListIndex, LLMPredictor
from llama_index.composability import ComposableGraph

from langchain import OpenAI

# if using dotenv with .env and OPENAI_API_KEY=<mykey>
import dotenv
dotenv.load_dotenv()

# if using colab
# OPENAI_API_KEY="<mykey>"
# os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY


True

## Ingest data

In [4]:
# mkdir uber
# mkdir tmp
# wget https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1 -O tmp/UBER.zip
# unzip tmp/UBER.zip -d tmp
# mv tmp/UBER/*.html ./uber

In [5]:
# extract raw text from html
# https://unstructured.io ; https://github.com/Unstructured-IO/unstructured
UnstructuredReader = download_loader("UnstructuredReader", refresh_cache=True)

loader = UnstructuredReader()
doc_set = {}
all_docs = []
years = [2022, 2021, 2020, 2019]
for year in years:
    year_docs = loader.load_data(file=Path(f'./uber/UBER_{year}.html'), split_documents=False)
    # insert year metadata into each year
    for d in year_docs:
        d.extra_info = {"year": year}
    doc_set[year] = year_docs
    all_docs.extend(year_docs)
    

[nltk_data] Downloading package punkt to /Users/drucev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/drucev/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


INFO:unstructured:Reading document from string ...
Reading document from string ...
INFO:unstructured:Reading document ...
Reading document ...
INFO:unstructured:Reading document from string ...
Reading document from string ...
INFO:unstructured:Reading document ...
Reading document ...
INFO:unstructured:Reading document from string ...
Reading document from string ...
INFO:unstructured:Reading document ...
Reading document ...
INFO:unstructured:Reading document from string ...
Reading document from string ...
INFO:unstructured:Reading document ...
Reading document ...


In [6]:
# service_context talks to openai (or other llm)
# https://gpt-index.readthedocs.io/en/latest/reference/service_context.html

service_context = ServiceContext.from_defaults(chunk_size_limit=512)


In [7]:
# initialize simple vector indices + global vector index
# NOTE: don't run this cell if the indices are already loaded! 
# generates many calls to openai to compute embedding vectors
# https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/SimpleIndexDemo.html

# index_set = {}

# for year in years:    
#     print(datetime.now(), 'indexing', year)
#     index_id = "index_%d" % year
#     cur_index = GPTVectorStoreIndex.from_documents(doc_set[year],
#                                                    service_context=service_context)
#     index_set[year] = cur_index
#     cur_index.storage_context.persist(index_id)


In [8]:
# load previously created indexes
index_set = {}
for year in years:
    index_id = "index_%d" % year
    print(datetime.now(), 'loading', year)
    # load index
    cur_index = load_index_from_storage(StorageContext.from_defaults(persist_dir=index_id))
    index_set[year] = cur_index


2023-05-07 10:51:02.230996 loading 2022
INFO:llama_index.indices.loading:Loading all indices.
Loading all indices.
2023-05-07 10:51:02.549792 loading 2021
INFO:llama_index.indices.loading:Loading all indices.
Loading all indices.
2023-05-07 10:51:02.884436 loading 2020
INFO:llama_index.indices.loading:Loading all indices.
Loading all indices.
2023-05-07 10:51:03.345436 loading 2019
INFO:llama_index.indices.loading:Loading all indices.
Loading all indices.


In [9]:
# NOTE: this global index is a single vector store containing all documents
# Only relevant for the section below: "Can a single vector index answer questions across years?"
# this generates many calls so run once and then load from index_global directory

# global_index = GPTVectorStoreIndex.from_documents(all_docs,
#                                                   service_context=service_context)
# global_index.storage_context.persist("index_global")


In [10]:
global_index = load_index_from_storage(StorageContext.from_defaults(persist_dir="index_global"))


INFO:llama_index.indices.loading:Loading all indices.
Loading all indices.


In [11]:
query_engine = index_set[2020].as_query_engine(retriever_mode="embedding", 
                                               service_context=service_context,                                     
                                               similarity_top_k=3,
                                               verbose=True,
                                              )
query = "What were some of the biggest risk factors in 2020?"
response = query_engine.query(query)
print(response)


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 11 tokens
> [retrieve] Total embedding token usage: 11 tokens


Token indices sequence length is longer than the specified maximum sequence length for this model (1318 > 1024). Running this sequence through the model will result in indexing errors


INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1502 tokens
> [get_response] Total LLM token usage: 1502 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens

The biggest risk factors in 2020 included:
- Changes in normal business practices necessitated by the outbreak and related governmental actions.
- Privacy, cybersecurity and fraud risks associated with increased remote working.
- Legal or regulatory challenges to our understanding of applicable legal and regulatory requirements.
- Heightened risks associated with the launch of new services, features, or health and safety requirements.
- Financial impacts of the COVID-19 pandemic, including reductions in workforce and changes to pricing models.
- Uncertainty around the ultimate impact of the pandemic on our future business operations, liquidity, financial condition, and results of operat

In [12]:
query = "What were some of the signifcant acquisitions?"
response = query_engine.query(query)
print(response)


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 11 tokens
> [retrieve] Total embedding token usage: 11 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1402 tokens
> [get_response] Total LLM token usage: 1402 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens

Some of the significant acquisitions include the divestiture of our ATG business to Aurora, our Uber Elevate business to Joby, our Yandex.Taxi joint venture in Russia/CIS, our agreement to enter into a joint venture with SK Telecom Co., LTD., our acquisition of Careem, our purchase of a controlling interest in Cornershop, and our acquisition of Postmates.


In [13]:
query_all = global_index.as_query_engine(retriever_mode="embedding", 
                                         service_context=service_context,   
                                         similarity_top_k=3,
#                                          response_mode="tree_summarize",
                                         verbose=True,
                                    )
risk_query_str = "What are some of the biggest risk factors in each year?"
response = query_all.query(risk_query_str)
print(str(response))


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 12 tokens
> [retrieve] Total embedding token usage: 12 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1601 tokens
> [get_response] Total LLM token usage: 1601 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens

2019:
- Drivers being classified as employees, workers or quasi-employees instead of independent contractors.
- Highly competitive mobility, delivery, and logistics industries with well-established and low-cost alternatives.
- Security breaches exposing the company to liability under various laws and regulations across jurisdictions.

2020:
- Drivers being classified as employees, workers or quasi-employees 

In [14]:
# create summary text for each doc
summaries = {}
for year in years:
    summaries[year] = f"UBER 10-k Filing for {year} fiscal year"
    

In [15]:
# set number of output tokens
llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, max_tokens=512))
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)


In [16]:
graph = ComposableGraph.from_indices(
    GPTListIndex,
    [index_set[y] for y in years],
    [summaries[y] for y in years],
    service_context=service_context
)


INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 0 tokens


In [17]:
custom_query_engines = {
    my_index.index_id: my_index.as_query_engine(
        similarity_top_k=1,
        response_mode="tree_summarize",
    )
    for my_index in [index_set[y] for y in years]
}

In [18]:
risk_query_str = (
    "Describe the current risk factors. If the year is provided in the information, "
    "provide that as well. If the context contains risk factors for multiple years, "
    "explicitly provide the following:\n"
    "- A description of the risk factors for each year\n"
    "- A summary of how these risk factors are changing across years"
)


In [19]:
query_engine = graph.as_query_engine(custom_query_engines=custom_query_engines)
response = query_engine.query(risk_query_str)
print(response)

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 60 tokens
> [retrieve] Total embedding token usage: 60 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 634 tokens
> [get_response] Total LLM token usage: 634 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 634 tokens
> [get_response] Total LLM token usage: 634 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM

In [20]:
print(response)


The current risk factors for 2021 include the impact of the COVID-19 pandemic on parts of our business, the potential for Drivers to be classified as employees, workers or quasi-employees instead of independent contractors, the highly competitive nature of the mobility, delivery, and logistics industries, and the need to lower fares or service fees and offer Driver incentives and consumer discounts and promotions in order to remain competitive in certain markets. We have also incurred significant losses since inception, including in the United States and other major markets. 

The risk factors for 2020 are outlined in Item 1A of the information provided. These risk factors include unresolved staff comments, properties, legal proceedings, and mine safety disclosures. 

The risk factors for December 31, 2019 include interest rate risk, investment risk, and foreign currency risk. Interest rate risk relates to the 2016 Term Loan Facility and 2018 Term Loan Facility, which are floating rat

In [21]:
print(response.get_formatted_sources()[:300])


> Source (Doc id: e2fef981-7dee-45b5-b23f-062302b3aca5): 
The current risk factors for 2022 include: Drivers being classified as employees, workers or qua...

> Source (Doc id: 8da08af0-8b6a-4dc6-a4a4-235f1490f21d): 
The year provided in the context is 2021. The current risk factors include the impa


In [22]:
response_tmp = index_set[2022].as_query_engine(retriever_mode="embedding", 
                                               service_context=service_context,                                     
                                               similarity_top_k=3,
                                               verbose=True,
                                              ).query(risk_query_str)
print(str(response_tmp))


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 60 tokens
> [retrieve] Total embedding token usage: 60 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 1605 tokens
> [get_response] Total LLM token usage: 1605 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens

For the year 2022, the risk factors include: 

- Drivers being classified as employees, workers or quasi-employees instead of independent contractors.
- The mobility, delivery, and logistics industries being highly competitive, with well-established and low-cost alternatives that have been available for decades, low barriers to entry, low switching costs, and well-capitalized competitors in nearly every majo

In [23]:
response = global_index.as_query_engine(retriever_mode="embedding", 
                                        service_context=service_context,   
                                        similarity_top_k=4,
                                        verbose=True,
                                       ).query(risk_query_str)
print(str(response))


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 60 tokens
> [retrieve] Total embedding token usage: 60 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 2159 tokens
> [get_response] Total LLM token usage: 2159 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens

Year 2020:
The risk factors include: Drivers being classified as employees, workers or quasi-employees instead of independent contractors; the mobility, delivery, and logistics industries being highly competitive; the need to lower fares or service fees to remain competitive; and the potential for significant losses since inception.

Year 2021:
The risk factors include: the COVID-19 pandemic and the impact o

In [27]:
# submit questions using a text widget and dropdown for which index to query
# todo use textarea
# default question to value of risk_query_string
# Describe the current risk factors. If the year is provided in the information, provide that as well. If the context contains risk factors for multiple years, explicitly provide the following: A description of the risk factors for each year; A summary of how these risk factors are changing across years"
# add submit button

query_2019 = index_set[2019].as_query_engine(retriever_mode="embedding", 
                                               service_context=service_context,                                     
                                               similarity_top_k=3,
                                               verbose=True,
                                              )
query_2020 = index_set[2020].as_query_engine(retriever_mode="embedding", 
                                               service_context=service_context,                                     
                                               similarity_top_k=3,
                                               verbose=True,
                                              )
query_2021 = index_set[2021].as_query_engine(retriever_mode="embedding", 
                                               service_context=service_context,                                     
                                               similarity_top_k=3,
                                               verbose=True,
                                              )
query_2022 = index_set[2022].as_query_engine(retriever_mode="embedding", 
                                               service_context=service_context,                                     
                                               similarity_top_k=3,
                                               verbose=True,
                                              )
query_all = global_index.as_query_engine(retriever_mode="embedding", 
                                         service_context=service_context,   
                                         similarity_top_k=3,
#                                          response_mode="tree_summarize",
                                         verbose=True,
                                         )
query_all_graph = graph.as_query_engine(custom_query_engines=custom_query_engines)

text = widgets.Text(
    value='',
    placeholder='Enter prompt',
    description='String:',
    disabled=False
)

dd = widgets.Dropdown(
    options = [('2019', query_2019), 
                   ('2020', query_2020), 
                   ('2021', query_2021), 
                   ('2022', query_2022), 
                   ('All years', query_all),
                   ('All years using ComposableGraph', query_all_graph)],
    index=3,
    description='Index:',
)

def on_change(change):
    global dd_val
    if change['type'] == 'change' and change['name'] == 'value':
        dd_val = change['new']

dd.observe(on_change)

def callback(wdgt):
    query_engine = dd_val
    query = wdgt.value
    print("Thinking...")
    response = query_engine.query(query)
    print(response)

text.on_submit(callback)

display(dd)
display(text)


  text.on_submit(callback)


Dropdown(description='Index:', index=3, options=(('2019', <llama_index.query_engine.retriever_query_engine.Ret…

Text(value='', description='String:', placeholder='Enter prompt')