<a href="https://colab.research.google.com/github/dhnanjay/HuggingFace/blob/main/instructor_abnb_10k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import getpass
import os

# Set your OpenAI API key.
# Prompt only when the key is not already present in the environment, so that
# re-running this cell (or Restart & Run All with a pre-configured key) neither
# asks again nor overwrites a key that is already set.
if not os.environ.get("OPENAI_API_KEY"):
    # getpass hides the typed value, so the key is not echoed into the cell output.
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [None]:
!pip install -U -q langchain openai chromadb unstructured==0.12.5 instructor tiktoken

# Download Airbnb 10-K from SEC

In [None]:
from langchain_community.document_loaders import UnstructuredURLLoader

url = "https://www.sec.gov/Archives/edgar/data/1559720/000155972024000006/abnb-20231231.htm"
# NOTE: the User-Agent below is a placeholder. SEC asks automated clients to
# identify themselves, so replace it with your own organisation and contact e-mail.
loader = UnstructuredURLLoader(urls=[url], headers={'User-Agent': 'your-org your@org.com'})
documents = loader.load()

# Fail fast if nothing was downloaded. The loader appears to log fetch errors and
# continue by default (TODO confirm for the installed version), which would leave
# `documents` empty and only surface later as an obscure error when embedding.
assert documents, (
    f"No content could be loaded from {url}; "
    "check the User-Agent header and network access."
)

# Chunk and store 10-K in vector DB

In [None]:
# Integrations are imported from langchain_community, matching the loader import
# earlier in this notebook; the langchain.vectorstores / langchain.embeddings
# paths are deprecated re-exports of the same classes.
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter

# Chunking parameters, both measured in tokens.
CHUNK_SIZE_TOKENS = 1000
CHUNK_OVERLAP_TOKENS = 100  # tokens repeated between consecutive chunks

# Naively chunk the SEC filing by tokens
token_splitter = TokenTextSplitter(
    chunk_size=CHUNK_SIZE_TOKENS,
    chunk_overlap=CHUNK_OVERLAP_TOKENS,
)
docs = token_splitter.split_documents(documents)

In [None]:
# Embed every chunk with OpenAI embeddings and index the result in a Chroma vector store
embedding_model = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents=docs, embedding=embedding_model)

# Query the vector DB for relevant context documents

In [None]:
# Question to answer from the filing; used both for retrieval and in the LLM prompt
query = (
    "What was Airbnb's revenue, net income, and cost of revenue "
    "for fiscal year 2023?"
)

In [None]:
# Retrieve the k chunks most similar to the query, then merge their text
# into a single newline-separated context string for the prompt
k = 3
top_k_docs = vectorstore.similarity_search(query, k=k)
context = "\n".join(doc.page_content for doc in top_k_docs)

# Answer and structure output with Instructor

In [None]:
import instructor
from openai import OpenAI
from pydantic import BaseModel
from pydantic import Field

# Data model describing the structured answer we want back from the LLM.
# Documented with comments rather than a class docstring on purpose: pydantic
# would copy a docstring into the model's JSON schema description, changing
# what is sent to the model.
class Financials(BaseModel):
    # Which company and reporting period the figures belong to
    ticker: str = Field(
        description="Ticker symbol of the stock",
    )
    period: str = Field(
        description="The financial period like FY2023",
    )

    # Monetary figures are kept as display strings (currency + unit suffix)
    revenue: str = Field(
        description="Revenue, including currency and unit suffix like $10.45Bn",
    )
    cost_of_revenue: str = Field(
        description="Cost of revenue, including currency and unit suffix like $10.45Bn",
    )
    net_income: str = Field(
        description="Net income, including currency and unit suffix like $10.45Bn",
    )

# Wrap the OpenAI client with instructor so that create() accepts a response_model
client = instructor.patch(OpenAI())

# Conversation sent to the model: a fixed system role, then the question
# together with the context retrieved from the vector DB
system_message = {
    "role": "system",
    "content": "You are an expert financial assistant that reads SEC filings and answers questions.",
}
user_message = {
    "role": "user",
    "content": f"Question: {query} Context: {context}",
}

# Call OpenAI; response_model asks instructor to return the answer as Financials
response = client.chat.completions.create(
    model="gpt-4-0125-preview",
    response_model=Financials,
    messages=[system_message, user_message],
)

In [None]:
# Serialize the structured answer to indented JSON and show it
financials_json = response.model_dump_json(indent=2)
print(financials_json)

{
  "ticker": "ABNB",
  "period": "FY2023",
  "revenue": "$9.9Bn",
  "cost_of_revenue": "$1.703Bn",
  "net_income": "$4.8Bn"
}
