In [None]:
# Install the third-party packages this notebook relies on into the kernel's environment.
# NOTE(review): versions are unpinned, and langchain_hub is not imported by any visible cell - confirm it is still needed.
%pip install python-dotenv langchain-openai langchain_community chromadb unstructured[html] langchain_hub

In [2]:
# Standard library
import warnings

# Third-party
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
# The class is spelled ChatPromptTemplate; the prompt cell further down uses this exact name.
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.html import partition_html

# Silence all Python warnings so library deprecation notices do not clutter cell outputs.
warnings.filterwarnings('ignore')

# Load variables from a .env file into the process environment
# (presumably the OpenAI credentials - confirm against your .env).
# Assigning to `_` keeps the boolean return value from being echoed as cell output.
_ = load_dotenv()

In [3]:
# Location of the HTML document to ingest, relative to the notebook's working directory.
filename = "../docs/nvidia_financial_results_q1_fiscal_2025.html"

# Parse the HTML file into unstructured elements; later cells filter and chunk this collection.
html_elements = partition_html(filename=filename)

Display some tables

In [None]:
# Imports used only by this preview cell, grouped at the top of the cell.
from io import StringIO

# HTML is taken from the public IPython.display module rather than the internal IPython.core.display path.
from IPython.display import HTML
from lxml import etree

# Which of the extracted tables to preview (0-based).
TABLE_INDEX = 3

# Keep only the elements that unstructured categorised as tables.
tables = [el for el in html_elements if el.category == "Table"]

# Fail with an explicit message instead of a bare "list index out of range".
if len(tables) <= TABLE_INDEX:
    raise IndexError(
        f"Expected at least {TABLE_INDEX + 1} tables in the document, found {len(tables)}"
    )

table_html = tables[TABLE_INDEX].metadata.text_as_html

# Parse the table markup with lxml, dropping whitespace-only text nodes.
# NOTE(review): `tree` is not used by any visible cell; kept so the name stays defined - remove if unused.
parser = etree.XMLParser(remove_blank_text=True)
file_obj = StringIO(table_html)
tree = etree.parse(file_obj, parser)

# Last expression of the cell: render the table as rich HTML output.
HTML(table_html)

Process html_elements

Chunk the HTML by title

In [7]:
# Group the parsed HTML elements into chunks using unstructured's title-based chunking.
elements = chunk_by_title(html_elements)

In [8]:
# Convert every chunk into a LangChain Document carrying the chunk text and its metadata.
documents = []
for element in elements:
    metadata = element.metadata.to_dict()
    # Drop "languages" if present; pop() with a default avoids a KeyError on chunks
    # whose metadata does not carry that key.
    metadata.pop("languages", None)
    # Expose the originating file under a "source" key alongside the existing "filename".
    metadata["source"] = metadata["filename"]
    documents.append(Document(page_content=element.text, metadata=metadata))

# Filter out elements with complex metadata that are not useful for the vector store
documents = filter_complex_metadata(documents)

Add the documents to the vector store

In [9]:
# Embed the documents with OpenAI embeddings and index them in a Chroma vector store.
embeddings = OpenAIEmbeddings()

vectorstore = Chroma.from_documents(documents, embeddings)

query = "Whats the basic net income per share for the three months ended April 28, 2024?"

# The number of chunks to return is a retriever setting: configure it through
# search_kwargs rather than passing k to invoke(), where it may be silently ignored.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

result = retriever.invoke(query)

# Last expression of the cell: show the retrieved chunks as a sanity check.
result

LangChain chain & LCEL

In [11]:
# Prompt text: the model is told to answer strictly from the retrieved context.
template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Chat model that produces the final answer.
model = ChatOpenAI(model="gpt-4o", temperature=0)

# RAG pipeline, assembled in two steps:
#   1. rag_inputs maps the incoming question to the retriever (as "context")
#      and passes it through unchanged (as "question");
#   2. the mapping is piped into the prompt, the model, and a plain-string parser.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | model | StrOutputParser()

In [12]:
# Same question that was used for the retrieval check earlier in the notebook.
query = "Whats the basic net income per share for the three months ended April 28, 2024?"

# Run the full chain on the question; the chain ends in StrOutputParser, so the response prints as plain text.
response = chain.invoke(query)
print(response)

The basic net income per share for the three months ended April 28, 2024, is $6.04.
