In [None]:
# from https://colab.research.google.com/drive/1uL1TdMbR4kqa0Ksrd_Of_jWSxWt1ia7o?usp=sharing#scrollTo=c48a272c-8e87-4740-9960-129d7d5943bb
# https://betterprogramming.pub/llamaindex-deep-lake-for-financial-statement-analysis-954f2b789c8e


In [None]:
# for colab https://colab.research.google.com/github/druce/question_answering_over_docs/blob/main/10kAnalysis.ipynb

# # if using colab
# import os
# OPENAI_API_KEY="<mykey>"
# os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# !pip install llama-index pytesseract pdf2image
# !pip uninstall rich
# !pip install rich==13.0.1

# # get data
# !mkdir uber
# !mkdir tmp
# !wget https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1 -O tmp/UBER.zip
# !unzip tmp/UBER.zip -d tmp
# !mv tmp/UBER/*.html ./uber


In [1]:
import sys
import os
from datetime import datetime
from IPython.display import Markdown, display
from ipywidgets import interact, widgets
from pathlib import Path
import panel as pn  # GUI

import logging
# Send INFO-level log records to the notebook's stdout.
# basicConfig already attaches one stdout handler to the root logger; the extra
# StreamHandler that used to be added here made every record print twice
# (once with the "LEVEL:name:" prefix, once bare).
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

from llama_index import download_loader, ServiceContext, StorageContext, load_index_from_storage, GPTVectorStoreIndex
from llama_index import GPTListIndex, LLMPredictor
from llama_index.composability import ComposableGraph

from langchain import OpenAI

# if using dotenv with .env and OPENAI_API_KEY=<mykey>
import dotenv
dotenv.load_dotenv()

# if using colab
# OPENAI_API_KEY="<mykey>"
# os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY


True

## Ingest data

In [2]:
# mkdir uber
# mkdir tmp
# wget https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1 -O tmp/UBER.zip
# unzip tmp/UBER.zip -d tmp
# mv tmp/UBER/*.html ./uber

In [3]:
# pull the raw text out of each year's HTML file with the Unstructured loader
# https://unstructured.io ; https://github.com/Unstructured-IO/unstructured
UnstructuredReader = download_loader("UnstructuredReader", refresh_cache=True)
loader = UnstructuredReader()

years = [2022, 2021, 2020, 2019]
doc_set = {}    # year -> documents loaded for that year
all_docs = []   # documents of every year, in the order of `years`

for year in years:
    filing_path = Path(f'./uber/UBER_{year}.html')
    year_docs = loader.load_data(file=filing_path, split_documents=False)
    # tag every document with the year it was loaded for
    for doc in year_docs:
        doc.extra_info = {"year": year}
    doc_set[year] = year_docs
    all_docs += year_docs
    

[nltk_data] Downloading package punkt to /Users/drucev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/drucev/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


INFO:unstructured:Reading document from string ...
Reading document from string ...
INFO:unstructured:Reading document ...
Reading document ...
INFO:unstructured:Reading document from string ...
Reading document from string ...
INFO:unstructured:Reading document ...
Reading document ...
INFO:unstructured:Reading document from string ...
Reading document from string ...
INFO:unstructured:Reading document ...
Reading document ...
INFO:unstructured:Reading document from string ...
Reading document from string ...
INFO:unstructured:Reading document ...
Reading document ...


In [4]:
# completion LLM handed to the service context below; the sample prompt is a quick check that the key works
sample_prompt = "What would be a good company name for a company that makes colorful socks?"
llm = OpenAI(openai_api_key=os.environ["OPENAI_API_KEY"], temperature=0.9)
print(llm.predict(sample_prompt))
llm.model_name




Cheerful Socks.


'text-davinci-003'

In [5]:
# the service context carries the LLM and the chunk size; it is passed to index construction and to the query engines
# https://gpt-index.readthedocs.io/en/latest/reference/service_context.html

service_context = ServiceContext.from_defaults(llm=llm, chunk_size=512)


In [6]:
# build one vector index per year and persist it to ./index_<year>
# building generates many calls to openai to compute embedding vectors, so a year whose
# persisted index directory already exists is loaded from disk instead of being rebuilt;
# set FORCE_REINDEX = True (or delete the directory) to rebuild from the documents
# https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/SimpleIndexDemo.html

FORCE_REINDEX = False

index_set = {}

for year in years:
    index_id = "index_%d" % year
    if Path(index_id).is_dir() and not FORCE_REINDEX:
        # reuse the index persisted by an earlier run: no embedding calls needed
        print(datetime.now(), 'loading', year)
        cur_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=index_id),
            service_context=service_context,
        )
    else:
        print(datetime.now(), 'indexing', year)
        cur_index = GPTVectorStoreIndex.from_documents(doc_set[year],
                                                       service_context=service_context)
        # persist only after the index was built completely
        cur_index.storage_context.persist(index_id)
    index_set[year] = cur_index


2023-06-25 12:57:32.552192 indexing 2022


RetryError: RetryError[<Future at 0x7fdb906830a0 state=finished raised AuthenticationError>]

In [None]:
# reload the per-year indices persisted under ./index_<year>
index_set = {}
for year in years:
    print(datetime.now(), 'loading', year)
    index_id = "index_%d" % year
    year_storage = StorageContext.from_defaults(persist_dir=index_id)
    cur_index = load_index_from_storage(year_storage)
    index_set[year] = cur_index


In [None]:
# NOTE: this global index is a single vector store containing all documents
# Only relevant for the section below: "Can a single vector index answer questions across years?"
# this generates many calls so run once and then load from index_global directory

# global_index = GPTVectorStoreIndex.from_documents(all_docs,
#                                                   service_context=service_context)
# global_index.storage_context.persist("index_global")


In [None]:
# load the all-years vector index persisted in ./index_global
global_storage = StorageContext.from_defaults(persist_dir="index_global")
global_index = load_index_from_storage(global_storage)


In [None]:
# ask the 2020 index a question, with similarity_top_k=3
query_engine = index_set[2020].as_query_engine(
    retriever_mode="embedding",
    service_context=service_context,
    similarity_top_k=3,
    verbose=True,
)
query = "What were some of the biggest risk factors in 2020?"
response = query_engine.query(query)
print(str(response))


In [None]:
# follow-up question against the query engine currently bound to `query_engine`
# (prompt typo "signifcant" corrected so the text sent to the LLM is spelled properly)
query = "What were some of the significant acquisitions?"
response = query_engine.query(query)
print(response)


In [None]:
# query engine over the single all-years index
query_all = global_index.as_query_engine(
    retriever_mode="embedding",
    service_context=service_context,
    similarity_top_k=3,
    # response_mode="tree_summarize",
    verbose=True,
)
risk_query_str = "What are some of the biggest risk factors in each year?"
response = query_all.query(risk_query_str)
print(response)


In [None]:
# one-line description per year, used as the summary of that year's index in the graph
summaries = {year: f"UBER 10-k Filing for {year} fiscal year" for year in years}
    

In [None]:
# limit completions to 512 output tokens, at temperature 0
# NOTE: this rebinds `service_context`; every later cell that passes it uses this version
completion_llm = OpenAI(temperature=0, max_tokens=512)
llm_predictor = LLMPredictor(llm=completion_llm)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)


In [None]:
# compose the per-year vector indices under a list index; each child index is paired with its summary
year_indices = [index_set[y] for y in years]
year_summaries = [summaries[y] for y in years]
graph = ComposableGraph.from_indices(
    GPTListIndex,
    year_indices,
    year_summaries,
    service_context=service_context,
)


In [None]:
# one tree_summarize query engine per year index, keyed by that index's index_id
custom_query_engines = {}
for y in years:
    year_index = index_set[y]
    custom_query_engines[year_index.index_id] = year_index.as_query_engine(
        similarity_top_k=1,
        response_mode="tree_summarize",
    )

In [None]:
# multi-line prompt: an instruction paragraph followed by two bullet requirements
risk_query_str = "\n".join([
    "Describe the current risk factors. If the year is provided in the information, "
    "provide that as well. If the context contains risk factors for multiple years, "
    "explicitly provide the following:",
    "- A description of the risk factors for each year",
    "- A summary of how these risk factors are changing across years",
])


In [None]:
# run the risk prompt through the composed graph, using the per-year engines built above
query_engine = graph.as_query_engine(
    custom_query_engines=custom_query_engines,
)
response = query_engine.query(risk_query_str)
print(str(response))

In [None]:
# print whatever `response` currently holds (presumably the graph answer from the cell above)
print(response)

In [None]:
# show only the first 300 characters of the formatted source listing
print(response.get_formatted_sources()[:300])


In [None]:
# same risk prompt against the 2022 index alone
engine_2022 = index_set[2022].as_query_engine(
    retriever_mode="embedding",
    service_context=service_context,
    similarity_top_k=3,
    verbose=True,
)
response_tmp = engine_2022.query(risk_query_str)
print(response_tmp)


In [None]:
# same risk prompt against the single all-years index, with similarity_top_k=4
global_engine = global_index.as_query_engine(
    retriever_mode="embedding",
    service_context=service_context,
    similarity_top_k=4,
    verbose=True,
)
response = global_engine.query(risk_query_str)
print(response)


In [None]:
# initialise Panel's notebook extension before building any widgets
pn.extension()

# engine the dashboard callback queries: the all-years engine built from global_index
query_engine = query_all
c = 0  # NOTE(review): not referenced anywhere else in the visible notebook — confirm before removing

def pn_callback(_):
    """Answer the question typed in the text area and render the result.

    Bound to the "Chat!" button via ``pn.bind``; the bound value passed in is ignored.
    Reads the prompt from the module-level ``inp`` widget and queries the module-level
    ``query_engine``.

    Returns:
        A ``pn.Column`` with three rows: the question, the response text, and the
        first 300 characters of the formatted sources. Response and sources are
        empty strings when no question was entered.
    """
    prompt = inp.value
    response_text = ''
    source_text = ''

    # skip the LLM call for an empty or whitespace-only prompt
    # (the former `inp.value = prompt` self-assignment was a no-op and is gone)
    if prompt.strip():
        response = query_engine.query(prompt)
        response_text = response.response
        source_text = response.get_formatted_sources()[:300]

    return pn.Column(
        pn.Row('Question:', pn.pane.Markdown(prompt, width=600)),
        pn.Row('Response:', pn.pane.Markdown(response_text, width=600, styles={'background-color': '#F6F6F6'})),
        pn.Row('Sources:', pn.pane.Markdown(source_text, width=600, styles={'background-color': '#F6F6F6'})),
    )

# widgets: a multi-line question box and the button bound to pn_callback
button_conversation = pn.widgets.Button(name="Chat!")
inp = pn.widgets.TextAreaInput(
    value='',
    placeholder='Enter question here…',
    width=600,
    height=100,
)

# bind pn_callback to the button and wrap it in a pane with a loading indicator
interactive_conversation = pn.bind(pn_callback, button_conversation)
answer_panel = pn.panel(interactive_conversation, loading_indicator=True, height=300)

dashboard = pn.Column(inp, pn.Row(button_conversation), answer_panel)

dashboard

In [None]:
# submit questions using a text widget and dropdown for which index to query
# todo use textarea
# default question to value of risk_query_string
# Describe the current risk factors. If the year is provided in the information, provide that as well. If the context contains risk factors for multiple years, explicitly provide the following: A description of the risk factors for each year; A summary of how these risk factors are changing across years"
# add submit button

def _embedding_engine(index):
    """Return a query engine over `index` with the settings shared by every engine in this cell."""
    return index.as_query_engine(
        retriever_mode="embedding",
        service_context=service_context,
        similarity_top_k=3,
        verbose=True,
    )

# one engine per year
query_2019 = _embedding_engine(index_set[2019])
query_2020 = _embedding_engine(index_set[2020])
query_2021 = _embedding_engine(index_set[2021])
query_2022 = _embedding_engine(index_set[2022])
# the single all-years index (disabled option: response_mode="tree_summarize")
query_all = _embedding_engine(global_index)
# the composed graph, routed through the per-year custom engines
query_all_graph = graph.as_query_engine(custom_query_engines=custom_query_engines)

# free-text prompt box
text = widgets.Text(value='', placeholder='Enter prompt', description='String:', disabled=False)

# dropdown of (label, query engine) pairs; index=3 preselects the '2022' entry
engine_choices = [
    ('2019', query_2019),
    ('2020', query_2020),
    ('2021', query_2021),
    ('2022', query_2022),
    ('All years', query_all),
    ('All years using ComposableGraph', query_all_graph),
]
dd = widgets.Dropdown(options=engine_choices, index=3, description='Index:')

def on_change(change):
    """Store the newly selected dropdown value in the module-level ``dd_val``.

    Only events whose type is 'change' and whose name is 'value' are recorded;
    any other event leaves ``dd_val`` untouched.
    """
    global dd_val
    is_value_change = change['type'] == 'change' and change['name'] == 'value'
    if not is_value_change:
        return
    dd_val = change['new']

dd.observe(on_change)

def callback(wdgt):
    """Run the submitted prompt against the engine currently selected in the dropdown.

    Args:
        wdgt: the widget that fired the submit event; its ``value`` is the prompt text.
    """
    # read the selection straight from the dropdown: the module-level `dd_val` is only
    # assigned after the user changes the dropdown, so relying on it raised NameError
    # when a prompt was submitted with the preselected engine
    query_engine = dd.value
    query = wdgt.value
    print("Thinking...")
    response = query_engine.query(query)
    print(response)

text.on_submit(callback)

display(dd)
display(text)


In [None]:
# show the risk prompt so it can be copied into the widgets above
print(risk_query_str)

In [None]:
import mammoth
# convert the Word file to markdown with mammoth
with open("10K.docx", "rb") as docx_file:
    result = mammoth.convert_to_markdown(docx_file)
# write as UTF-8 explicitly: the platform default encoding may be unable to encode
# non-ASCII characters in the converted text and would raise UnicodeEncodeError
with open("docx-mammoth.md", "w", encoding="utf-8") as markdown_file:
    markdown_file.write(result.value)

In [None]:
from markdownify import markdownify as md
# convert the HTML file to markdown with markdownify
with open("10K.html") as html_file:
    # read the whole file in one call instead of joining readlines()
    html_str = html_file.read()
# write as UTF-8 explicitly: the platform default encoding may be unable to encode
# non-ASCII characters in the converted text and would raise UnicodeEncodeError
with open("html-markdownify.md", "w", encoding="utf-8") as markdown_file:
    markdown_file.write(md(html_str))


In [None]:
# preview the first 999 characters of the markdownify conversion (converts html_str again)
print(md(html_str)[:999])

In [None]:
import textract
# extract the content of the Word file with textract and save it unchanged;
# the result is written in binary mode, so no text encoding is applied here
# NOTE: this rebinds `text`, which an earlier cell used for the ipywidgets prompt box
text = textract.process("10K.docx")
with open("docx-textract.md", mode="wb") as markdown_file:
    markdown_file.write(text)

In [None]:
from unstructured.partition.auto import partition
# partition the PDF into elements with unstructured and keep each element's text
elements = partition("10K.pdf")
elements_txt = [e.text for e in elements]
# write as UTF-8 explicitly: the platform default encoding may be unable to encode
# non-ASCII characters in the extracted text and would raise UnicodeEncodeError
with open("pdf-unstructured.md", "w", encoding="utf-8") as markdown_file:
    # elements are separated by "|\n|" so element boundaries stay visible in the output
    markdown_file.write("|\n|".join(elements_txt))


In [None]:
# partition the Word file into elements (uses `partition` imported in the cell above)
elements = partition("10K.docx")
elements_txt = [e.text for e in elements]
# write as UTF-8 explicitly: the platform default encoding may be unable to encode
# non-ASCII characters in the extracted text and would raise UnicodeEncodeError
with open("docx-unstructured.md", "w", encoding="utf-8") as markdown_file:
    # elements are separated by "|\n|" so element boundaries stay visible in the output
    markdown_file.write("|\n|".join(elements_txt))


In [None]:
# inspect the metadata of one element of the most recent partition() result
# NOTE(review): 2002 is a hard-coded position; this raises IndexError if fewer elements were produced
elements[2002].metadata