In [None]:
# from https://colab.research.google.com/drive/1uL1TdMbR4kqa0Ksrd_Of_jWSxWt1ia7o?usp=sharing#scrollTo=c48a272c-8e87-4740-9960-129d7d5943bb
# https://betterprogramming.pub/llamaindex-deep-lake-for-financial-statement-analysis-954f2b789c8e


In [None]:
# for colab https://colab.research.google.com/github/druce/question_answering_over_docs/blob/main/10kAnalysis.ipynb

# # if using colab
# import os
# OPENAI_API_KEY="<mykey>"
# os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# !pip install llama-index pytesseract pdf2image
# !pip uninstall rich
# !pip install rich==13.0.1

# # get data
# !mkdir uber
# !mkdir tmp
# !wget https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1 -O tmp/UBER.zip
# !unzip tmp/UBER.zip -d tmp
# !mv tmp/UBER/*.html ./uber


In [45]:
import sys
import os
from datetime import datetime
from IPython.display import Markdown, display
from ipywidgets import interact, widgets
from pathlib import Path
import panel as pn  # GUI

import logging

# Send log records to stdout so they appear in the cell output.
# basicConfig() already attaches a stdout StreamHandler to the root logger, so
# adding a second handler made every record print twice. force=True also
# removes handlers left by an earlier run of this cell, so re-running it does
# not stack up more handlers.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

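# NOTE: the llama_index cells further down use ServiceContext, StorageContext,
# load_index_from_storage, GPTVectorStoreIndex, GPTListIndex, LLMPredictor and
# ComposableGraph; uncomment the imports below before running those cells on a fresh kernel.
# from llama_index import download_loader, ServiceContext, StorageContext, load_index_from_storage, GPTVectorStoreIndex
# from llama_index import GPTListIndex, LLMPredictor
# from llama_index.composability import ComposableGraph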

import langchain
from langchain import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.prompts import PromptTemplate

# if using dotenv with .env and OPENAI_API_KEY=<mykey>
import dotenv
# Read the .env file so that os.environ["OPENAI_API_KEY"] can be looked up by the cells below.
dotenv.load_dotenv()

# if using colab
# OPENAI_API_KEY="<mykey>"
# os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY


True

## Ingest data

In [None]:
# mkdir uber
# mkdir tmp
# wget https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1 -O tmp/UBER.zip
# unzip tmp/UBER.zip -d tmp
# mv tmp/UBER/*.html ./uber

In [18]:
# Completion-style client; the API key is taken from the environment.
llm = OpenAI(
    openai_api_key=os.environ["OPENAI_API_KEY"],
)
# Quick round-trip to confirm the key works (result is shown as the cell output).
llm.predict(
    "What would be a good company name for a company that makes colorful socks?"
)


'\n\nSocked In Color.'

In [12]:
# Load ./docx-unstructured.md through langchain's unstructured loader in 'elements' mode
# https://unstructured.io ; https://github.com/Unstructured-IO/unstructured
loader = UnstructuredFileLoader(
    "./docx-unstructured.md",
    mode='elements',
)
docs = loader.load()

# peek at the first 400 characters of the second loaded document
docs[1].page_content[:400]
# UnstructuredReader = download_loader("UnstructuredReader", refresh_cache=True)

# loader = UnstructuredReader()
# all_docs = []

# docs = loader.load_data(file=Path(f'./10K.html'), split_documents=False)
#     # insert year metadata into each year
# for d in docs:
#     d.extra_info = {"year": 2022, "ticker": 'MSFT', "name": "Microsoft"}
#     all_docs.extend(docs)
    

INFO:unstructured:Reading document from string ...
Reading document from string ...
Reading document from string ...
INFO:unstructured:Reading document ...
Reading document ...
Reading document ...


'|\n|UNITED STATES|\n|SECURITIES AND EXCHANGE COMMISSION|\n|Washington, D.C. 20549|\n|FORM 10-K|\n|☒    ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934'

In [None]:
# cut the loaded documents into 500-character chunks that overlap by 100
text_splitter = CharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)
texts = text_splitter.split_documents(docs)

# embedding model applied to the chunks
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

# build the Chroma vector store that serves as the index
db = Chroma.from_documents(texts, embeddings)

In [67]:
# NOTE(review): not referenced by any other cell visible in this notebook
system_message = (
    "You are an AI assistant that answers questions "
    "about financial documents."
)


In [50]:
# Prompt that asks the model to rewrite a follow-up question as a standalone question.
# The template has two placeholders: {chat_history} and {question}.
# It is passed to the QA chain below as `condense_question_prompt`.
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template("""
Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:""")


In [62]:
# similarity retriever over the vector store, limited to one result (k=1)
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 1},
)

# conversational retrieval chain; uses CONDENSE_QUESTION_PROMPT for the
# condense-question step and asks for the source documents to be returned
# qa = ConversationalRetrievalChain.from_llm(OpenAI(model='gpt-3.5-turbo-16k'), retriever, return_source_documents=True)
qa = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(model='gpt-3.5-turbo-16k'),
    retriever=retriever,
    condense_question_prompt=CONDENSE_QUESTION_PROMPT,
    return_source_documents=True,
    verbose=False,
)

In [63]:
# new conversation: ask about risk factors with an empty history
chat_history = []
query = "what were the risk factors?"
result = qa(dict(question=query, chat_history=chat_history))
print(result["answer"])

The risk factors mentioned in the context are as follows:

1. Foreign exchange rate risk: The company is exposed to economic risk from foreign exchange rates, which may impact their consolidated financial statements. They use derivative instruments to manage this risk.

2. Interest rate risk: The securities held in the company's fixed-income portfolio are subject to different interest rate risks based on their maturities. They manage the average maturity of the portfolio to achieve desired economic returns.

3. Credit risk: The company's fixed-income portfolio consists primarily of investment-grade securities. They manage credit exposures relative to broad-based indices and to facilitate portfolio diversification.

4. Equity price risk: Securities held in the company's equity investments portfolio are subject to price risk.

These risk factors are disclosed in the context in relation to the company's exposure to market risks.


In [64]:
# new conversation: reset the history before the first question
chat_history = []
query = "what is Microsoft?"
result = qa(dict(question=query, chat_history=chat_history))
print(result["answer"])

Microsoft is a technology company that develops, manufactures, licenses, supports, and sells a wide range of software, services, and hardware products. It was founded in 1975 by Bill Gates and Paul Allen and is headquartered in Redmond, Washington. Microsoft's mission is to empower every person and every organization on the planet to achieve more. The company is known for its flagship products such as the Windows operating system, Microsoft Office suite, and the Xbox gaming console. Microsoft also offers cloud-based services through its Azure platform and provides business solutions through its Dynamics product line. Additionally, Microsoft is involved in research and development, focusing on areas such as artificial intelligence, cloud computing, and productivity tools.


In [65]:
# record the previous question/answer pair, then ask a follow-up with that history
# (uses `query` and `result` from the previous cell)
chat_history.extend([(query, result["answer"])])
query = "Where is Microsoft located?"
result = qa(dict(question=query, chat_history=chat_history))
print(result["answer"])


In [48]:
import openai

from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationChain
from langchain.memory import ConversationSummaryBufferMemory

# give the openai module the key from the environment
openai.api_key = os.environ['OPENAI_API_KEY']

chat = ChatOpenAI()

# plain conversation chain (no retrieval); its summary-buffer memory gets its
# own ChatOpenAI instance and a 2048 token limit
conversation = ConversationChain(
    llm=chat,
    memory=ConversationSummaryBufferMemory(llm=ChatOpenAI(), max_token_limit=2048),
    verbose=False,
)

answer = conversation.predict(input="what is the airspeed velocity of an unladen swallow?")
print(answer)

Based on my database, the airspeed velocity of an unladen swallow varies depending on the species. For example, the African swallow has been recorded flying at speeds up to 43 miles per hour, while the European swallow has been observed flying at speeds up to 36 miles per hour. However, it is important to note that the airspeed velocity can also be affected by factors such as wind conditions and the weight of the bird.


In [None]:
# The key is assigned to openai.api_key a second time here, although an earlier
# cell already does so: per the original author, an authentication error occurs otherwise.
import openai
openai.api_key=os.environ['OPENAI_API_KEY']


In [None]:
# The service context carries the LLM (OpenAI or another provider) and the chunk size.
# https://gpt-index.readthedocs.io/en/latest/reference/service_context.html
service_context = ServiceContext.from_defaults(
    chunk_size=512,
    llm=llm,
)
                                              


In [None]:
# Show the model name of the LLM attached to the service context
service_context.llm_predictor.llm.model_name

In [None]:
# Build the vector index for the loaded documents and persist it to a directory.
# NOTE: skip this cell when the indices are already loaded —
# it generates many calls to openai to compute embedding vectors.
# https://gpt-index.readthedocs.io/en/latest/examples/vector_stores/SimpleIndexDemo.html
year = 2022
fmt = 'html'
print(datetime.now(), 'indexing')

# the persist directory is named after the format and year
index_id = "index_%s_%d" % (fmt, year)
cur_index = GPTVectorStoreIndex.from_documents(
    docs,
    service_context=service_context,
)
cur_index.storage_context.persist(index_id)


In [None]:
# load previously created indexes
year = 2022
fmt = 'html'
index_id = "index_%s_%d" % (fmt, year)

print(datetime.now(), 'loading', fmt, year)
# This cell only reads from disk. It used to call cur_index.storage_context.persist()
# first, which fails with NameError on a fresh kernel (cur_index does not exist
# yet) and otherwise rewrites the stored index with whatever index was in memory.
cur_index = load_index_from_storage(StorageContext.from_defaults(persist_dir=index_id))


In [None]:
year = 2022
fmt = 'docx'

all_docs = []

print(datetime.now(), 'loading')
# NOTE(review): the last visible assignment of `loader` is a langchain
# UnstructuredFileLoader (used via .load()); load_data(...) matches the
# commented-out llama_index UnstructuredReader — confirm which loader is live.
docs = loader.load_data(file=Path(f'./10K.{fmt}'), split_documents=False)

# attach filing metadata to every document
for d in docs:
    d.extra_info = {"year": year, "ticker": 'MSFT', "name": "Microsoft"}

# Add the documents once, after the loop. Extending inside the loop appended
# the whole list once per document and filled all_docs with duplicates.
all_docs.extend(docs)

print(datetime.now(), 'indexing')
index_id = "index_%s_%d" % (fmt, year)
cur_index = GPTVectorStoreIndex.from_documents(docs,
                                               service_context=service_context)
cur_index.storage_context.persist(index_id)


In [None]:
year = 2022
fmt = 'pdf'

all_docs = []

print(datetime.now(), 'loading')
# NOTE(review): the last visible assignment of `loader` is a langchain
# UnstructuredFileLoader (used via .load()); load_data(...) matches the
# commented-out llama_index UnstructuredReader — confirm which loader is live.
docs = loader.load_data(file=Path(f'./10K.{fmt}'), split_documents=False)

# attach filing metadata to every document
for d in docs:
    d.extra_info = {"year": year, "ticker": 'MSFT', "name": "Microsoft"}

# Add the documents once, after the loop. Extending inside the loop appended
# the whole list once per document and filled all_docs with duplicates.
all_docs.extend(docs)

print(datetime.now(), 'indexing')
index_id = "index_%s_%d" % (fmt, year)
cur_index = GPTVectorStoreIndex.from_documents(docs,
                                               service_context=service_context)
cur_index.storage_context.persist(index_id)


In [None]:
year = 2022
fmt = 'md'

all_docs = []

print(datetime.now(), 'loading')
# NOTE(review): the last visible assignment of `loader` is a langchain
# UnstructuredFileLoader (used via .load()); load_data(...) matches the
# commented-out llama_index UnstructuredReader — confirm which loader is live.
docs = loader.load_data(file=Path(f'./docx-unstructured.{fmt}'), split_documents=False)

# attach filing metadata to every document
for d in docs:
    d.extra_info = {"year": year, "ticker": 'MSFT', "name": "Microsoft"}

# Add the documents once, after the loop. Extending inside the loop appended
# the whole list once per document and filled all_docs with duplicates.
all_docs.extend(docs)

print(datetime.now(), 'indexing')
index_id = "index_%s_%d" % (fmt, year)
cur_index = GPTVectorStoreIndex.from_documents(docs,
                                               service_context=service_context)
cur_index.storage_context.persist(index_id)


In [None]:
# NOTE: this global index is a single vector store containing all documents
# Only relevant for the section below: "Can a single vector index answer questions across years?"
# this generates many calls so run once and then load from index_global directory

# global_index = GPTVectorStoreIndex.from_documents(all_docs,
#                                                   service_context=service_context)
# global_index.storage_context.persist("index_global")


In [None]:
# load the index persisted under ./index_md_2022
myindex = load_index_from_storage(
    StorageContext.from_defaults(persist_dir="index_md_2022"),
)


In [None]:
# embedding-mode query engine over myindex, using the top 3 matches
query_engine = myindex.as_query_engine(
    retriever_mode="embedding",
    service_context=service_context,
    similarity_top_k=3,
    verbose=True,
)

query = "What were some of the biggest risk factors?"
response = query_engine.query(query)
print(response)


In [None]:
# Ask a second question through the query engine built in the previous cell
query = "What was goodwill?"
response = query_engine.query(query)
print(response)


In [None]:
# embedding-mode query engine over the global index
# NOTE(review): the only visible definition of `global_index` is commented out above
query_all = global_index.as_query_engine(
    retriever_mode="embedding",
    service_context=service_context,
    similarity_top_k=3,
    # response_mode="tree_summarize",
    verbose=True,
)

risk_query_str = "What are some of the biggest risk factors in each year?"
response = query_all.query(risk_query_str)
print(str(response))


In [None]:
# one summary sentence per filing year, keyed by year
# NOTE(review): `years` is not defined in any cell visible here
summaries = dict()
for year in years:
    summaries.update({year: f"UBER 10-k Filing for {year} fiscal year"})
    

In [None]:
# rebuild the service context around a predictor with temperature 0 and max_tokens=512
llm_predictor = LLMPredictor(
    llm=OpenAI(temperature=0, max_tokens=512),
)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)


In [None]:
# compose the per-year indices and their summaries into one graph with a GPTListIndex root
graph = ComposableGraph.from_indices(
    GPTListIndex,
    [index_set[yr] for yr in years],
    [summaries[yr] for yr in years],
    service_context=service_context,
)


In [None]:
# one tree-summarize query engine per yearly index, keyed by that index's id
custom_query_engines = {
    yearly_index.index_id: yearly_index.as_query_engine(
        similarity_top_k=1,
        response_mode="tree_summarize",
    )
    for yearly_index in [index_set[yr] for yr in years]
}

In [None]:
# multi-year risk prompt: an instruction paragraph followed by two bullet lines
risk_query_str = "\n".join([
    "Describe the current risk factors. If the year is provided in the information, "
    "provide that as well. If the context contains risk factors for multiple years, "
    "explicitly provide the following:",
    "- A description of the risk factors for each year",
    "- A summary of how these risk factors are changing across years",
])


In [None]:
# Query the composed graph, passing the per-index engines from custom_query_engines
query_engine = graph.as_query_engine(custom_query_engines=custom_query_engines)
response = query_engine.query(risk_query_str)
print(response)

In [None]:
# Print the current `response` again
print(response)

In [None]:
# Print the first 300 characters of the response's formatted sources
print(response.get_formatted_sources()[:300])


In [None]:
# run the risk prompt against the 2022 index only
response_tmp = (
    index_set[2022]
    .as_query_engine(
        retriever_mode="embedding",
        service_context=service_context,
        similarity_top_k=3,
        verbose=True,
    )
    .query(risk_query_str)
)
print(str(response_tmp))


In [None]:
# run the risk prompt against the global index, using the top 4 matches
response = (
    global_index
    .as_query_engine(
        retriever_mode="embedding",
        service_context=service_context,
        similarity_top_k=4,
        verbose=True,
    )
    .query(risk_query_str)
)
print(str(response))


In [None]:
# Initialise the Panel extension before building widgets
pn.extension()

# pn_callback below looks up `query_engine` at call time; point it at the all-years engine
query_engine = query_all
# NOTE(review): `c` is not read by any cell visible here
c = 0

def pn_callback(_):
    """Answer the question typed into ``inp`` and lay out the result.

    The prompt is read from the module-level ``inp`` widget and sent to the
    module-level ``query_engine``; both are looked up at call time.

    Args:
        _: value supplied by the binding (unused).

    Returns:
        A ``pn.Column`` with three rows: the question, the response text and
        the first 300 characters of the formatted sources. Response and
        sources are empty strings when the prompt is empty.
    """
    # The former `inp.value = prompt` wrote the value straight back and was a no-op.
    prompt = inp.value
    response_text = ''
    source_text = ''

    # Only call the query engine when something was typed
    if prompt:
        response = query_engine.query(prompt)
        response_text = response.response
        source_text = response.get_formatted_sources()[:300]

    return pn.Column(
        pn.Row('Question:', pn.pane.Markdown(prompt, width=600)),
        pn.Row('Response:', pn.pane.Markdown(response_text, width=600, styles={'background-color': '#F6F6F6'})),
        pn.Row('Sources:', pn.pane.Markdown(source_text, width=600, styles={'background-color': '#F6F6F6'})),
    )

# question box read by pn_callback
inp = pn.widgets.TextAreaInput(
    value='',
    placeholder='Enter question here…',
    width=600,
    height=100,
)
button_conversation = pn.widgets.Button(name="Chat!")

# bind pn_callback to the button
interactive_conversation = pn.bind(pn_callback, button_conversation)

# layout: input, button, then the bound result pane
dashboard = pn.Column(
    inp,
    pn.Row(button_conversation),
    pn.panel(interactive_conversation, loading_indicator=True, height=300),
)

dashboard

In [None]:
# submit questions using a text widget and dropdown for which index to query
# todo use textarea
# default question to value of risk_query_string
# Describe the current risk factors. If the year is provided in the information, provide that as well. If the context contains risk factors for multiple years, explicitly provide the following: A description of the risk factors for each year; A summary of how these risk factors are changing across years"
# add submit button

def _embedding_query_engine(index):
    """Return an embedding-mode query engine over ``index``.

    Uses the module-level ``service_context``, ``similarity_top_k=3`` and
    ``verbose=True`` — the settings previously repeated for every index.
    """
    return index.as_query_engine(retriever_mode="embedding",
                                 service_context=service_context,
                                 similarity_top_k=3,
                                 verbose=True,
                                 )


# one engine per filing year
query_2019 = _embedding_query_engine(index_set[2019])
query_2020 = _embedding_query_engine(index_set[2020])
query_2021 = _embedding_query_engine(index_set[2021])
query_2022 = _embedding_query_engine(index_set[2022])
# single vector store containing all documents
query_all = _embedding_query_engine(global_index)
# composed graph with the per-index engines from custom_query_engines
query_all_graph = graph.as_query_engine(custom_query_engines=custom_query_engines)

# single-line prompt box
text = widgets.Text(value='', placeholder='Enter prompt', description='String:', disabled=False)

# dropdown mapping a label to the query engine to use; index=3 preselects '2022'
dd = widgets.Dropdown(
    options=[
        ('2019', query_2019),
        ('2020', query_2020),
        ('2021', query_2021),
        ('2022', query_2022),
        ('All years', query_all),
        ('All years using ComposableGraph', query_all_graph),
    ],
    index=3,
    description='Index:',
)

def on_change(change):
    """Store the new dropdown selection in the global ``dd_val``.

    Args:
        change: change notification dict; only entries whose ``'type'`` is
            ``'change'`` and whose ``'name'`` is ``'value'`` are acted on,
            in which case ``change['new']`` is stored.
    """
    global dd_val
    is_value_change = change['type'] == 'change' and change['name'] == 'value'
    if not is_value_change:
        return
    dd_val = change['new']

# Register on_change as an observer of the dropdown (on_change itself filters for 'value' changes)
dd.observe(on_change)

def callback(wdgt):
    """Send the submitted text to the query engine selected in the dropdown.

    Prints "Thinking..." and then the engine's response.

    Args:
        wdgt: the widget that was submitted; its ``value`` is the query text.
    """
    # Read the selection straight from the dropdown. The `dd_val` global is
    # only assigned by on_change after the user changes the selection, so
    # submitting with the preselected entry raised NameError.
    query_engine = dd.value
    query = wdgt.value
    print("Thinking...")
    response = query_engine.query(query)
    print(response)

# Run callback when the text box is submitted
# NOTE(review): newer ipywidgets releases mark Text.on_submit as deprecated — confirm against the installed version
text.on_submit(callback)

# show the dropdown, then the text box
display(dd)
display(text)


In [None]:
# Print the current risk prompt
print(risk_query_str)

In [None]:
import mammoth
# Convert the Word filing to markdown with mammoth
with open("10K.docx", "rb") as docx_file:
    result = mammoth.convert_to_markdown(docx_file)
# Write UTF-8 explicitly: the filing text contains non-ASCII characters, and
# the platform default encoding is not UTF-8 everywhere.
with open("docx-mammoth.md", "w", encoding="utf-8") as markdown_file:
    markdown_file.write(result.value)

In [None]:
from markdownify import markdownify as md
# Read the whole HTML filing in one call (same result as joining readlines())
# NOTE(review): the input is still decoded with the platform default encoding — confirm the file's encoding
with open("10K.html") as html_file:
    html_str = html_file.read()
# Write UTF-8 explicitly: the filing text contains non-ASCII characters, and
# the platform default encoding is not UTF-8 everywhere.
with open("html-markdownify.md", "w", encoding="utf-8") as markdown_file:
    markdown_file.write(md(html_str))


In [None]:
# Preview the first 999 characters of the conversion (md() runs again on the full html_str)
print(md(html_str)[:999])

In [None]:
import textract
# Extract the .docx text with textract and write the raw bytes to disk.
# NOTE(review): this rebinds `text`, which an earlier cell uses for the ipywidgets Text box.
text = textract.process("10K.docx")
Path("docx-textract.md").write_bytes(text)

In [None]:
from unstructured.partition.auto import partition
# Partition the PDF with unstructured and keep the text of each element
elements = partition("10K.pdf")
elements_txt = [e.text for e in elements]
# Elements are joined with "|\n|". Write UTF-8 explicitly: the filing text
# contains non-ASCII characters, and the platform default encoding is not
# UTF-8 everywhere.
with open("pdf-unstructured.md", "w", encoding="utf-8") as markdown_file:
    markdown_file.write("|\n|".join(elements_txt))


In [None]:
# Partition the Word filing with unstructured (partition is imported in the previous cell)
elements = partition("10K.docx")
elements_txt = [e.text for e in elements]
# Elements are joined with "|\n|". Write UTF-8 explicitly: the filing text
# contains non-ASCII characters, and the platform default encoding is not
# UTF-8 everywhere.
with open("docx-unstructured.md", "w", encoding="utf-8") as markdown_file:
    markdown_file.write("|\n|".join(elements_txt))


In [None]:
# Inspect the metadata of one parsed element
# NOTE(review): index 2002 is hard-coded — assumes the last partition() call returned at least 2003 elements
elements[2002].metadata