In [None]:
# from https://colab.research.google.com/drive/1uL1TdMbR4kqa0Ksrd_Of_jWSxWt1ia7o?usp=sharing#scrollTo=c48a272c-8e87-4740-9960-129d7d5943bb
# https://betterprogramming.pub/llamaindex-deep-lake-for-financial-statement-analysis-954f2b789c8e


In [None]:
# for colab https://colab.research.google.com/github/druce/question_answering_over_docs/blob/main/10kAnalysis.ipynb

# # if using colab
# import os
# OPENAI_API_KEY="<mykey>"
# os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# !pip install llama-index pytesseract pdf2image

!pip uninstall rich
!pip install rich==13.0.1
!pip install langchain
!pip install unstructured
!pip install openai
!pip install chromadb
!pip install tiktoken



In [1]:
import sys
import os
from datetime import datetime
from IPython.display import Markdown, display
from ipywidgets import interact, widgets
from pathlib import Path
import panel as pn  # GUI

import logging
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# from llama_index import download_loader, ServiceContext, StorageContext, load_index_from_storage, GPTVectorStoreIndex
# from llama_index import GPTListIndex, LLMPredictor
# from llama_index.composability import ComposableGraph

import langchain
from langchain import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA, ConversationChain, ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationSummaryBufferMemory

# if using dotenv with .env and OPENAI_API_KEY=<mykey>
import dotenv
dotenv.load_dotenv()

# if using colab
# OPENAI_API_KEY="<mykey>"
# os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY


True

In [2]:
# conversational chain that remembers history

from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

from langchain.prompts import (
    ChatPromptTemplate, 
    MessagesPlaceholder, 
    SystemMessagePromptTemplate, 
    HumanMessagePromptTemplate
)
from langchain.chains import ConversationChain
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory

llm = ChatOpenAI(model_name="gpt-3.5-turbo",temperature=0.3)

prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template("""The following is a friendly conversation between a human 
and an expert ornithologist AI specializing in migratory birds. The ornithologist AI is talkative and 
provides lots of specific details from its context. If the ornithologist AI does not know the answer to a question, 
it truthfully says it does not know."""),
    MessagesPlaceholder(variable_name="history"),
    HumanMessagePromptTemplate.from_template("{input}")
])

memory = ConversationBufferMemory(return_messages=True)

conversation = ConversationChain(llm=llm, memory=memory, prompt=prompt, verbose=True)

print(conversation.predict(input="what is the airspeed velocity of an unladen swallow?"))




[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: The following is a friendly conversation between a human 
and an expert ornithologist AI specializing in migratory birds. The ornithologist AI is talkative and 
provides lots of specific details from its context. If the ornithologist AI does not know the answer to a question, 
it truthfully says it does not know.
Human: what is the airspeed velocity of an unladen swallow?[0m

[1m> Finished chain.[0m
The airspeed velocity of an unladen swallow varies depending on the species and size of the bird. For example, the airspeed velocity of a European Swallow is about 11 meters per second or 24 miles per hour. However, the airspeed velocity of a Barn Swallow is slightly slower, at around 9 meters per second or 20 miles per hour. It's important to note that these speeds are for birds in level flight and may vary depending on other factors such as wind speed and direction.


In [4]:
print(conversation.predict(input="what is the migratory bird that travels the longest distance?"))




[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: The following is a friendly conversation between a human 
and an expert ornithologist AI specializing in migratory birds. The ornithologist AI is talkative and 
provides lots of specific details from its context. If the ornithologist AI does not know the answer to a question, 
it truthfully says it does not know.
Human: what is the airspeed velocity of an unladen swallow?
AI: The airspeed velocity of an unladen swallow varies depending on the species and size of the bird. For example, the airspeed velocity of a European Swallow is about 11 meters per second or 24 miles per hour. However, the airspeed velocity of a Barn Swallow is slightly slower, at around 9 meters per second or 20 miles per hour. It's important to note that these speeds are for birds in level flight and may vary depending on other factors such as wind speed and direction.
Human: what is the migratory bird that travels the longest distance

## Ingest data

In [5]:
# extract raw text from markdown
# https://unstructured.io ; https://github.com/Unstructured-IO/unstructured

loader = UnstructuredFileLoader("./abbrev.md", mode='elements')
chunks = loader.load()
print(f"Number of chunks: {len(chunks)}")
print(f"Average length:", sum([len(d.page_content) for d in chunks])/len(chunks))
print(f"Max length:", max([len(d.page_content) for d in chunks]))
chunks[1].page_content[:400]



INFO:unstructured:Reading document from string ...
Reading document from string ...
INFO:unstructured:Reading document ...
Reading document ...
Number of chunks: 37
Average length: 4499.0
Max length: 79127


'Satya\xa0Nadella                         54               Chairman of the Board and Chief Executive Officer\nJudson Althoff                        49               Executive Vice President and Chief Commercial Officer\nChristopher\xa0C.\xa0Capossela              52               Executive Vice President, Marketing and Consumer Business, and Chief Marketing Officer\nKathleen T. Hogan                     56    '

In [21]:
chunks[10].page_content[:400]


'Net income                                      $     72,738             $     61,271             $     44,281'

In [7]:
# chunks are pretty big, let's split them
text_splitter = CharacterTextSplitter(separator="|", chunk_size=512, chunk_overlap=64)
small_chunks = text_splitter.split_documents(chunks)
print(f"Number of chunks: {len(small_chunks)}")
print(f"Average length:", sum([len(d.page_content) for d in small_chunks])/len(small_chunks))
print(f"Max length:", max([len(d.page_content) for d in small_chunks]))


Created a chunk of size 541, which is longer than the specified 512
Created a chunk of size 662, which is longer than the specified 512
Created a chunk of size 657, which is longer than the specified 512
Created a chunk of size 1241, which is longer than the specified 512
Created a chunk of size 802, which is longer than the specified 512
Created a chunk of size 683, which is longer than the specified 512
Created a chunk of size 906, which is longer than the specified 512
Created a chunk of size 597, which is longer than the specified 512
Created a chunk of size 573, which is longer than the specified 512
Created a chunk of size 521, which is longer than the specified 512
Created a chunk of size 1090, which is longer than the specified 512
Created a chunk of size 552, which is longer than the specified 512
Created a chunk of size 914, which is longer than the specified 512
Created a chunk of size 683, which is longer than the specified 512
Created a chunk of size 662, which is longer t

Created a chunk of size 608, which is longer than the specified 512
Created a chunk of size 529, which is longer than the specified 512
Created a chunk of size 764, which is longer than the specified 512
Created a chunk of size 609, which is longer than the specified 512
Created a chunk of size 676, which is longer than the specified 512
Created a chunk of size 564, which is longer than the specified 512
Created a chunk of size 640, which is longer than the specified 512
Created a chunk of size 514, which is longer than the specified 512
Created a chunk of size 645, which is longer than the specified 512
Created a chunk of size 820, which is longer than the specified 512
Created a chunk of size 1048, which is longer than the specified 512
Created a chunk of size 858, which is longer than the specified 512
Created a chunk of size 516, which is longer than the specified 512
Created a chunk of size 1352, which is longer than the specified 512
Created a chunk of size 1756, which is longer 

Created a chunk of size 1251, which is longer than the specified 512
Created a chunk of size 1321, which is longer than the specified 512
Created a chunk of size 772, which is longer than the specified 512
Created a chunk of size 768, which is longer than the specified 512
Created a chunk of size 1171, which is longer than the specified 512
Created a chunk of size 912, which is longer than the specified 512
Created a chunk of size 647, which is longer than the specified 512
Created a chunk of size 1629, which is longer than the specified 512
Created a chunk of size 1916, which is longer than the specified 512
Created a chunk of size 574, which is longer than the specified 512
Created a chunk of size 4662, which is longer than the specified 512
Number of chunks: 302
Average length: 552.0033112582781
Max length: 4662


In [6]:
# native unstructured without langchain
from unstructured.partition.auto import partition
from unstructured.staging.base import elements_to_json

input_filename = "abbrev.md"

elements = partition(filename=input_filename)
small_elements = []
for e in elements:
    small_elements.extend(e.text.split("|"))
small_elements = [e for e in small_elements if len(e.strip())]

print(f"Number of chunks: {len(small_elements)}")
print(f"Average length:", sum([len(d) for d in small_elements])/len(small_elements))
print(f"Max length:", max([len(d) for d in small_elements]))


INFO:unstructured:Reading document from string ...
Reading document from string ...
INFO:unstructured:Reading document ...
Reading document ...
Number of chunks: 425
Average length: 388.67764705882354
Max length: 4662


In [26]:
small_elements[5]

'Microsoft is innovating and expanding our entire portfolio to help people and organizations overcome today’s challenges and emerge stronger. We bring technology and products together into experiences and solutions that unlock value for our customers.'

In [28]:
# vectorize and persist in Chroma db
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002',)
# create the vectorestore to use as the index
db = Chroma.from_texts(small_elements, embeddings, persist_directory="chroma_msft")
db.persist()


INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
INFO:chromadb.db.duckdb:No existing DB found in chroma_msft, skipping load
No existing DB found in chroma_msft, skipping load
INFO:chromadb.db.duckdb:No existing DB found in chroma_msft, skipping load
No existing DB found in chroma_msft, skipping load
INFO:chromadb.db.duckdb:Persisting DB to disk, putting it in the save folder: chroma_msft
Persisting DB to disk, putting it in the save folder: chroma_msft


In [29]:
# query vector store
query = "What were the risk factors?"
docs = db.similarity_search(query,k=5)
print(len(docs))
print([d.page_content for d in docs])


5
['ITEM\xa01A. RISK FACTORS', 'GENERAL RISKS', 'OPERATIONAL RISKS', 'Measures to contain a global pandemic may intensify other risks described in these Risk Factors. Any of these measures may adversely impact our ability to:', 'STRATEGIC AND COMPETITIVE RISKS']


In [None]:
# test different loaders - markdown
loader = UnstructuredFileLoader("./docx-unstructured.md", mode='elements')
tmpdocs = loader.load()
print(len(tmpdocs))
tmpdocs[100].page_content[:400]


In [None]:
# docx
loader = UnstructuredFileLoader("./10K.docx", mode='elements')
tmpdocs = loader.load()
print(len(tmpdocs))
tmpdocs[100].page_content[:400]


In [None]:
# pdf
loader = UnstructuredFileLoader("./10K.pdf", mode='elements')
tmpdocs = loader.load()
print(len(tmpdocs))
tmpdocs[100].page_content[:400]


In [None]:
# html
loader = UnstructuredFileLoader("./10K.html", mode='elements')
tmpdocs = loader.load()
print(len(tmpdocs))
tmpdocs[100].page_content[:400]


In [None]:
# xls
loader = UnstructuredFileLoader("./10K.xls", mode='elements')
tmpdocs = loader.load()
print(len(tmpdocs))
tmpdocs[20].page_content[:400]


## Question answering

In [30]:
# expose this index in a retriever interface
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 20})


In [31]:
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template("""
Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:""")

# create a chain to answer questions 
# qa = ConversationalRetrievalChain.from_llm(OpenAI(model='gpt-3.5-turbo-16k'), retriever, return_source_documents=True)
qa = ConversationalRetrievalChain.from_llm(llm=ChatOpenAI(model='gpt-3.5-turbo'),
                                           retriever=retriever,
                                           condense_question_prompt=CONDENSE_QUESTION_PROMPT,
                                           return_source_documents=True,
                                           verbose=True)



In [37]:
chat_history = []
query = "what were the risk factors?"
result = qa({"question": query, 'chat_history': chat_history})
chat_history.append(HumanMessage(content=query))
chat_history.append(SystemMessage(content=result['answer']))
print(result['answer'])




[1m> Entering new  chain...[0m


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
ITEM 1A. RISK FACTORS

GENERAL RISKS

OPERATIONAL RISKS

STRATEGIC AND COMPETITIVE RISKS

Measures to contain a global pandemic may intensify other risks described in these Risk Factors. Any of these measures may adversely impact our ability to:

Wellness and Safety

LEGAL, REGULATORY, AND LITIGATION RISKS

INTELLECTUAL PROPERTY RISKS

RISKS RELATING TO THE EVOLUTION OF OUR BUSINESS

Data security breaches, compliance failures, or actions of partners or individual employees. 

Our global business exposes us to operational and economic risks. Our customers are located throughout the world and a significant part of our revenue comes from international sales. The global nature of our business creates o

In [38]:
query = "what is Microsoft?"
result = qa({"question": query, 'chat_history': chat_history})
print(result['answer'])
chat_history.append(HumanMessage(content=query))
chat_history.append(SystemMessage(content=result['answer']))




[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3m
Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.

Chat History:

Human: what were the risk factors?
system: The risk factors mentioned in the provided context include General Risks, Operational Risks, Strategic and Competitive Risks, Legal, Regulatory, and Litigation Risks, Intellectual Property Risks, Risks Relating to the Evolution of the Business, Data Security Breaches, Compliance Failures, or Actions of Partners or Individual Employees, Cybersecurity, Data Privacy, and Platform Abuse Risks, and Geopolitical Risks.
Follow Up Input: what is Microsoft?
Standalone question:[0m

[1m> Finished chain.[0m


[1m> Entering new  chain...[0m


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know,

In [39]:
query="Where is Microsoft located?"
result = qa({"question": query, 'chat_history': chat_history})
print(result['answer'])
chat_history.append(HumanMessage(content=query))
chat_history.append(SystemMessage(content=result['answer']))

# todo - run in colab
# try to answer questions based on tables
# try to get it to run 



[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3m
Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.

Chat History:

Human: what were the risk factors?
system: The risk factors mentioned in the provided context include General Risks, Operational Risks, Strategic and Competitive Risks, Legal, Regulatory, and Litigation Risks, Intellectual Property Risks, Risks Relating to the Evolution of the Business, Data Security Breaches, Compliance Failures, or Actions of Partners or Individual Employees, Cybersecurity, Data Privacy, and Platform Abuse Risks, and Geopolitical Risks.
Human: what is Microsoft?
system: The different types of risk factors mentioned in the provided context are:

1. Strategic and Competitive Risks
2. General Risks
3. Operational Risks
4. Legal, Regulatory, and Litigation Risks
5. Intellectual Property Risks
6. Risks Relating to the Evolution of Our Business
7. Cybersecurity, Data


[1m> Finished chain.[0m

[1m> Finished chain.[0m
Microsoft's corporate headquarters are located in Redmond, Washington, in the United States. They also have research and development facilities in other parts of the U.S. and around the world.


In [40]:
chat_history.append((query, result["answer"]))
query="What was accrued compensation as of June 30, 2022"
result = qa({"question": query, 'chat_history': chat_history})
print(result['answer'])




[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3m
Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.

Chat History:

Human: what were the risk factors?
system: The risk factors mentioned in the provided context include General Risks, Operational Risks, Strategic and Competitive Risks, Legal, Regulatory, and Litigation Risks, Intellectual Property Risks, Risks Relating to the Evolution of the Business, Data Security Breaches, Compliance Failures, or Actions of Partners or Individual Employees, Cybersecurity, Data Privacy, and Platform Abuse Risks, and Geopolitical Risks.
Human: what is Microsoft?
system: The different types of risk factors mentioned in the provided context are:

1. Strategic and Competitive Risks
2. General Risks
3. Operational Risks
4. Legal, Regulatory, and Litigation Risks
5. Intellectual Property Risks
6. Risks Relating to the Evolution of Our Business
7. Cybersecurity, Data


[1m> Finished chain.[0m

[1m> Finished chain.[0m
Accrued compensation as of June 30, 2022 was $10,661 million.


In [None]:
year = 2022
fmt = 'pdf'

all_docs = []

print(datetime.now(), 'loading')
docs = loader.load_data(file=Path(f'./10K.{fmt}'), split_documents=False)
    # insert year metadata into each year
for d in docs:
    d.extra_info = {"year": 2022, "ticker": 'MSFT', "name": "Microsoft"}
    all_docs.extend(docs)

    
print(datetime.now(), 'indexing')
index_id = "index_%s_%d" % (fmt, year)
cur_index = GPTVectorStoreIndex.from_documents(docs,
                                               service_context=service_context)
cur_index.storage_context.persist(index_id)


In [None]:
year = 2022
fmt = 'md'

all_docs = []

print(datetime.now(), 'loading')
docs = loader.load_data(file=Path(f'./docx-unstructured.{fmt}'), split_documents=False)
    # insert year metadata into each year
for d in docs:
    d.extra_info = {"year": 2022, "ticker": 'MSFT', "name": "Microsoft"}
    all_docs.extend(docs)

    
print(datetime.now(), 'indexing')
index_id = "index_%s_%d" % (fmt, year)
cur_index = GPTVectorStoreIndex.from_documents(docs,
                                               service_context=service_context)
cur_index.storage_context.persist(index_id)


In [None]:
# NOTE: this global index is a single vector store containing all documents
# Only relevant for the section below: "Can a single vector index answer questions across years?"
# this generates many calls so run once and then load from index_global directory

# global_index = GPTVectorStoreIndex.from_documents(all_docs,
#                                                   service_context=service_context)
# global_index.storage_context.persist("index_global")


In [None]:
myindex = load_index_from_storage(StorageContext.from_defaults(persist_dir="index_md_2022"))


In [None]:
query_engine = myindex.as_query_engine(retriever_mode="embedding", 
                                       service_context=service_context,
                                       similarity_top_k=3,
                                       verbose=True,
                                      )
query = "What were some of the biggest risk factors?"
response = query_engine.query(query)
print(response)


In [None]:
query = "What was goodwill?"
response = query_engine.query(query)
print(response)


In [None]:
query_all = global_index.as_query_engine(retriever_mode="embedding", 
                                         service_context=service_context,   
                                         similarity_top_k=3,
#                                          response_mode="tree_summarize",
                                         verbose=True,
                                    )
risk_query_str = "What are some of the biggest risk factors in each year?"
response = query_all.query(risk_query_str)
print(str(response))


In [None]:
# create summary text for each doc
summaries = {}
for year in years:
    summaries[year] = f"UBER 10-k Filing for {year} fiscal year"
    

In [None]:
# set number of output tokens
llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, max_tokens=512))
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)


In [None]:
graph = ComposableGraph.from_indices(
    GPTListIndex,
    [index_set[y] for y in years],
    [summaries[y] for y in years],
    service_context=service_context
)


In [None]:
custom_query_engines = {
    my_index.index_id: my_index.as_query_engine(
        similarity_top_k=1,
        response_mode="tree_summarize",
    )
    for my_index in [index_set[y] for y in years]
}

In [None]:
risk_query_str = (
    "Describe the current risk factors. If the year is provided in the information, "
    "provide that as well. If the context contains risk factors for multiple years, "
    "explicitly provide the following:\n"
    "- A description of the risk factors for each year\n"
    "- A summary of how these risk factors are changing across years"
)


In [None]:
query_engine = graph.as_query_engine(custom_query_engines=custom_query_engines)
response = query_engine.query(risk_query_str)
print(response)

In [None]:
print(response)

In [None]:
print(response.get_formatted_sources()[:300])


In [None]:
response_tmp = index_set[2022].as_query_engine(retriever_mode="embedding", 
                                               service_context=service_context,                                     
                                               similarity_top_k=3,
                                               verbose=True,
                                              ).query(risk_query_str)
print(str(response_tmp))


In [None]:
response = global_index.as_query_engine(retriever_mode="embedding", 
                                        service_context=service_context,   
                                        similarity_top_k=4,
                                        verbose=True,
                                       ).query(risk_query_str)
print(str(response))


In [None]:
pn.extension()

query_engine = query_all
c = 0

def pn_callback(_):
    prompt = inp.value
    inp.value = prompt
    response_text = ''
    source_text = ''
    
    if prompt:
        response = query_engine.query(prompt)
        response_text = response.response
        source_text = response.get_formatted_sources()[:300]

    panels = []
    panels.append(
        pn.Row('Question:', pn.pane.Markdown(prompt, width=600)))
    panels.append(
        pn.Row('Response:', pn.pane.Markdown(response_text, width=600, styles={'background-color': '#F6F6F6'})))
    panels.append(
        pn.Row('Sources:', pn.pane.Markdown(source_text, width=600, styles={'background-color': '#F6F6F6'})))
    
    return pn.Column(*panels)

inp = pn.widgets.TextAreaInput(height=100,
                               width=600,
                               value='',
                               placeholder='Enter question here…',
                              )
button_conversation = pn.widgets.Button(name="Chat!")

interactive_conversation = pn.bind(pn_callback, button_conversation)


dashboard = pn.Column(
    inp,
    pn.Row(button_conversation),
    pn.panel(interactive_conversation, loading_indicator=True, height=300),
)

dashboard

In [None]:
# submit questions using a text widget and dropdown for which index to query
# todo use textarea
# default question to value of risk_query_string
# Describe the current risk factors. If the year is provided in the information, provide that as well. If the context contains risk factors for multiple years, explicitly provide the following: A description of the risk factors for each year; A summary of how these risk factors are changing across years"
# add submit button

query_2019 = index_set[2019].as_query_engine(retriever_mode="embedding", 
                                               service_context=service_context,                                     
                                               similarity_top_k=3,
                                               verbose=True,
                                              )
query_2020 = index_set[2020].as_query_engine(retriever_mode="embedding", 
                                               service_context=service_context,                                     
                                               similarity_top_k=3,
                                               verbose=True,
                                              )
query_2021 = index_set[2021].as_query_engine(retriever_mode="embedding", 
                                               service_context=service_context,                                     
                                               similarity_top_k=3,
                                               verbose=True,
                                              )
query_2022 = index_set[2022].as_query_engine(retriever_mode="embedding", 
                                               service_context=service_context,                                     
                                               similarity_top_k=3,
                                               verbose=True,
                                              )
query_all = global_index.as_query_engine(retriever_mode="embedding", 
                                         service_context=service_context,   
                                         similarity_top_k=3,
#                                          response_mode="tree_summarize",
                                         verbose=True,
                                         )
query_all_graph = graph.as_query_engine(custom_query_engines=custom_query_engines)

text = widgets.Text(
    value='',
    placeholder='Enter prompt',
    description='String:',
    disabled=False
)

dd = widgets.Dropdown(
    options = [('2019', query_2019), 
                   ('2020', query_2020), 
                   ('2021', query_2021), 
                   ('2022', query_2022), 
                   ('All years', query_all),
                   ('All years using ComposableGraph', query_all_graph)],
    index=3,
    description='Index:',
)

def on_change(change):
    global dd_val
    if change['type'] == 'change' and change['name'] == 'value':
        dd_val = change['new']

dd.observe(on_change)

def callback(wdgt):
    query_engine = dd_val
    query = wdgt.value
    print("Thinking...")
    response = query_engine.query(query)
    print(response)

text.on_submit(callback)

display(dd)
display(text)


In [None]:
print(risk_query_str)

In [None]:
import mammoth
with open("10K.docx", "rb") as docx_file:
    result = mammoth.convert_to_markdown(docx_file)
with open("docx-mammoth.md", "w") as markdown_file:
    markdown_file.write(result.value)

In [None]:
from markdownify import markdownify as md
with open("10K.html") as html_file:
    html_str = "".join(html_file.readlines())
with open("html-markdownify.md", "w") as markdown_file:
    markdown_file.write(md(html_str))


In [None]:
print(md(html_str)[:999])

In [None]:
import textract
text = textract.process("10K.docx")
with open("docx-textract.md", "wb") as markdown_file:
    markdown_file.write(text)

In [None]:
from unstructured.partition.auto import partition
elements = partition("10K.pdf")
elements_txt = [e.text for e in elements]
with open("pdf-unstructured.md", "w") as markdown_file:
    markdown_file.write("|\n|".join(elements_txt))


In [None]:
elements = partition("10K.docx")
elements_txt = [e.text for e in elements]
with open("docx-unstructured.md", "w") as markdown_file:
    markdown_file.write("|\n|".join(elements_txt))


In [None]:
elements[2002].metadata