<a href="https://colab.research.google.com/github/chriscarrollsmith/kaggle_notebooks/blob/main/gpt_xbrl_sec_filing_reader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import getpass
import os

# Set your OpenAI API key. The prompt names the secret being requested,
# because getpass() with no argument asks for a generic "Password: ".
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [None]:
!pip install -U -q langchain openai anthropic instructor

# Download XBRL data from SEC

In [None]:
import requests

def get_accession_numbers(cik: str):
    """Return the accession numbers of a company's recent 10-K filings.

    An accession number is a unique identifier for an SEC filing (like a 10-K).

    Args:
        cik: The company's SEC Central Index Key. It is left-padded with
            zeros to 10 characters before being placed in the submissions
            URL, so both "1559720" and "0001559720" are accepted.

    Returns:
        A set holding the accession number of every entry in the "recent"
        filings section whose form is exactly "10-K". The set is empty when
        the response contains no such entries.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
    """
    submissions_url = f"https://data.sec.gov/submissions/CIK{str(cik).zfill(10)}.json"
    submissions_response = requests.get(
        submissions_url,
        headers={'User-Agent': 'your-org your@org.com'},
        timeout=30,
    )
    # Surface 4xx/5xx responses here rather than trying to parse an error
    # page as JSON below.
    submissions_response.raise_for_status()

    # Named `submissions` (not `json`) so the stdlib module name is not shadowed.
    submissions: dict = submissions_response.json()
    recent_filings: dict = submissions.get("filings", {}).get("recent", {})
    forms: list = recent_filings.get("form", [])
    accession_numbers: list = recent_filings.get("accessionNumber", [])

    # `form` and `accessionNumber` are read as parallel lists. zip() pairs
    # them up and stops at the shorter one, so a length mismatch cannot raise
    # IndexError.
    return {
        accession_number
        for form, accession_number in zip(forms, accession_numbers)
        if form == "10-K"
    }

In [None]:
# Collect the accession numbers of the recent 10-K filings for CIK 0001559720
accession_numbers = get_accession_numbers("0001559720")

In [None]:
# Download all XBRL data
xbrl = requests.get(
    "https://data.sec.gov/api/xbrl/companyfacts/CIK0001559720.json",
    headers={'User-Agent': 'your-org your@org.com'},
    timeout=30,
)
# Stop here on a 4xx/5xx response instead of failing later when the body is
# parsed as JSON.
xbrl.raise_for_status()

In [None]:
# Parse the company-facts payload. Missing (or null) sections fall back to
# empty dicts, so a response without "facts" or "us-gaap" yields an empty
# `gaap_data` instead of raising AttributeError on None.
json_data = xbrl.json()
company_name = json_data.get("entityName")
facts = json_data.get("facts") or {}
gaap_data = facts.get("us-gaap") or {}

In [None]:
from collections import defaultdict

# Two-level mapping: period label -> line-item name -> list of reported values.
financials_dict = defaultdict(lambda: defaultdict(list))

# Extract the individual line items from the XBRL json and store in `financials_dict`.
# The period label is the fact's "fp" followed by its "fy" (e.g. "FY2023").
for key, concept in gaap_data.items():
    # Concepts without a "units" section, or without USD facts, are skipped.
    financials: list = (concept.get("units") or {}).get("USD")
    if not financials:
        continue
    for financial in financials:
        # Only take 10-K data
        if financial.get("form") != '10-K':
            continue

        # If the accession number is not in our expected 10-K list, skip it
        if financial.get("accn") not in accession_numbers:
            continue

        fp = financial.get("fp")
        fy = financial.get("fy")
        if fp is None or fy is None:
            # Without both parts no period label can be built.
            continue
        year = fp + str(fy)

        # The amount is read straight from the fact, so the outer loop
        # variable is never rebound inside the inner loop.
        financials_dict[year][key].append({"value": financial.get("val"), "year": year})

In [None]:
# Inspect the period labels (fiscal period + fiscal year) that we have data for
print(financials_dict.keys())
# To inspect all of the collected data, uncomment the line below
# print(financials_dict)

# Extract income statements from 10-K

In [None]:
import instructor
from openai import OpenAI
from pydantic import BaseModel
from pydantic import Field
from enum import Enum
from typing import Optional, Union, List

# Define our income statement: the fields of a single period's income
# statement. Every amount is typed Union[float, str], so either a number or a
# string is accepted for it.
class IncomeStatement(BaseModel):
  # Label of the reporting period (presumably values such as "FY2023" - confirm)
  period: Optional[str]

  # Revenue and the direct cost of earning it
  revenue: Union[float, str] = Field(description="Revenue")
  cost_of_revenue: Union[float, str] = Field(description="Cost of revenue")

  # Operating expenses and the resulting operating income or loss
  general_and_administrative_expense: Union[float, str] = Field(description="General and administrative expenses")
  research_and_development_expense: Union[float, str] = Field(description="Research and development expenses")
  selling_and_marketing_expense: Union[float, str] = Field(description="Selling and marketing expenses")
  operating_income_loss: Union[float, str] = Field(description="Operating income loss")

  # Bottom line
  net_income_loss: Union[float, str] = Field(description="Net income or loss")

# Container model: a list of IncomeStatement entries
# (presumably one per requested period - confirm)
class IncomeStatements(BaseModel):
  income_statements: List[IncomeStatement]

In [None]:
import time
import instructor
from openai import OpenAI
from pydantic import BaseModel
from typing import List
from rich.console import Console
from anthropic import Anthropic


import json

client = instructor.patch(OpenAI())

# Serialize the schema and the collected facts as real JSON. Interpolating the
# objects directly would embed Python reprs (including the defaultdict
# wrappers) in a prompt that tells the model it is reading JSON.
# model_json_schema() is the pydantic v2 spelling of the deprecated schema().
schema_json = json.dumps(IncomeStatement.model_json_schema())
context_json = json.dumps(financials_dict)

# perf_counter() is the clock intended for measuring elapsed intervals.
start = time.perf_counter()

response = client.chat.completions.create(
    model="gpt-4-0125-preview",
    response_model=instructor.Partial[IncomeStatements],
    messages=[
        {
            "role": "system",
            "content": "You are an expert at extracting financial accounting data from JSON",
        },
        {
            "role": "user",
            "content": f"Extract the following fields {schema_json} for FY2023, FY2022, and FY2021 from following context: ```{context_json}```",
        },
    ],
)

print(response.model_dump_json(indent=2))

print(f"Took {time.perf_counter() - start} seconds to complete!")

{
  "income_statements": [
    {
      "period": "FY2023",
      "revenue": 9917000000.0,
      "cost_of_revenue": 1703000000.0,
      "general_and_administrative_expense": 2025000000.0,
      "research_and_development_expense": 1722000000.0,
      "selling_and_marketing_expense": 1763000000.0,
      "operating_income_loss": 1518000000.0,
      "net_income_loss": 4792000000.0
    },
    {
      "period": "FY2022",
      "revenue": 8399000000.0,
      "cost_of_revenue": 1499000000.0,
      "general_and_administrative_expense": 950000000.0,
      "research_and_development_expense": 1502000000.0,
      "selling_and_marketing_expense": 1516000000.0,
      "operating_income_loss": 1802000000.0,
      "net_income_loss": 1893000000.0
    },
    {
      "period": "FY2021",
      "revenue": 5992000000.0,
      "cost_of_revenue": 1156000000.0,
      "general_and_administrative_expense": 836000000.0,
      "research_and_development_expense": 1425000000.0,
      "selling_and_marketing_expense": 11

# Extract balance sheets from 10-K

In [None]:
# Define Balance Sheet: the fields of a single period's balance sheet.
# Every amount is typed Union[float, str], so either a number or a string is
# accepted for it.
class BalanceSheet(BaseModel):
  # Label of the reporting period
  period: Optional[str]

  # Assets
  cash_and_cash_equivalents: Union[float, str] = Field(description="Cash and cash equivalents")
  short_term_investments: Union[float, str] = Field(description="Short-term investments")
  total_current_assets: Union[float, str] = Field(description="Total current assets")
  goodwill: Union[float, str] = Field(description="Goodwill")
  total_assets: Union[float, str] = Field(description="Assets")

  # Liabilities
  # `current_accrued_liabilities` used to be declared a second time after
  # `current_accounts_payable`; the redundant declaration was dropped and the
  # field keeps its original position.
  current_accrued_liabilities: Union[float, str] = Field(description="Current accrued liabilities")
  current_accounts_payable: Union[float, str] = Field(description="Current accounts payable")
  long_term_debt: Union[float, str] = Field(description="Long term debt")
  operating_lease_liabilities: Union[float, str] = Field(description="Operating lease liabilities")
  other_non_current_liabilities: Union[float, str] = Field(description="Other non-current liabilities")
  total_liabilities: Union[float, str] = Field(description="Liabilities")

  # Equity
  stockholders_equity: Union[float, str] = Field(description="Stockholders equity")

# Container model: a list of BalanceSheet entries
# (presumably one per requested period - confirm)
class BalanceSheets(BaseModel):
  balance_sheets: List[BalanceSheet]

In [None]:
import time
import instructor
from openai import OpenAI
from pydantic import BaseModel
from typing import List
from rich.console import Console
from anthropic import Anthropic


import json

client = instructor.patch(OpenAI())

# Serialize the schema and the collected facts as real JSON. Interpolating the
# objects directly would embed Python reprs (including the defaultdict
# wrappers) in a prompt that tells the model it is reading JSON.
# model_json_schema() is the pydantic v2 spelling of the deprecated schema().
schema_json = json.dumps(BalanceSheet.model_json_schema())
context_json = json.dumps(financials_dict)

# perf_counter() is the clock intended for measuring elapsed intervals.
start = time.perf_counter()

response = client.chat.completions.create(
    model="gpt-4-0125-preview",
    response_model=instructor.Partial[BalanceSheets],
    messages=[
        {
            "role": "system",
            "content": "You are an expert at extracting financial accounting data from JSON",
        },
        {
            "role": "user",
            "content": f"Extract the following fields {schema_json} for FY2023, FY2022, and FY2021 from following context: ```{context_json}```",
        },
    ],
)

print(response.model_dump_json(indent=2))

print(f"Took {time.perf_counter() - start} seconds to complete!")

{
  "balance_sheets": [
    {
      "period": "FY2023",
      "cash_and_cash_equivalents": "7378000000",
      "short_term_investments": "2244000000",
      "total_current_assets": "14861000000",
      "goodwill": "650000000",
      "total_assets": "16038000000",
      "current_accrued_liabilities": "2654000000",
      "current_accounts_payable": "137000000",
      "long_term_debt": "1987000000",
      "operating_lease_liabilities": "295000000",
      "other_non_current_liabilities": "218000000",
      "total_liabilities": "10478000000",
      "stockholders_equity": "5560000000"
    },
    {
      "period": "FY2022",
      "cash_and_cash_equivalents": "6067000000",
      "short_term_investments": "2255000000",
      "total_current_assets": "12386000000",
      "goodwill": "656000000",
      "total_assets": "13708000000",
      "current_accrued_liabilities": "6359000000",
      "current_accounts_payable": "118000000",
      "long_term_debt": "1983000000",
      "operating_lease_liabilit

# Extract cash flow statements from 10-K

In [None]:
# Define Cash Flow Statement: the fields of a single period's cash flow
# statement. Every amount is typed Union[float, str], so either a number or a
# string is accepted for it.
class CashFlowStatement(BaseModel):
  # Label of the reporting period
  period: Optional[str]

  # Operating activities
  net_income: Union[float, str] = Field(description="Net income")
  depreciation_and_amortization: Union[float, str] = Field(description="Depreciation and amortization")
  # The field name keeps its original spelling so existing consumers of the
  # output still work; only the description was corrected ("Shared" -> "Share-based").
  shared_based_compensation: Union[float, str] = Field(description="Share-based or stock-based compensation")
  net_cash_from_operating_activities: Union[float, str] = Field(description="Net cash provided by operating activities")

  # Investing activities
  net_cash_from_investing_activities: Union[float, str] = Field(description="Net cash provided by investing activities")
  plant_property_and_equipment: Union[float, str] = Field(description="Payments to acquire property plant and equipment")

  # Financing activities
  net_cash_from_financing_activities: Union[float, str] = Field(description="Net cash provided by financing activities")

# Container model: a list of CashFlowStatement entries
# (presumably one per requested period - confirm)
class CashFlowStatements(BaseModel):
  cash_flow_statements: List[CashFlowStatement]

In [None]:
import time
import instructor
from openai import OpenAI
from pydantic import BaseModel
from typing import List
from rich.console import Console
from anthropic import Anthropic


import json

client = instructor.patch(OpenAI())

# Serialize the schema and the collected facts as real JSON. Interpolating the
# objects directly would embed Python reprs (including the defaultdict
# wrappers) in a prompt that tells the model it is reading JSON.
# model_json_schema() is the pydantic v2 spelling of the deprecated schema().
schema_json = json.dumps(CashFlowStatement.model_json_schema())
context_json = json.dumps(financials_dict)

# perf_counter() is the clock intended for measuring elapsed intervals.
start = time.perf_counter()

response = client.chat.completions.create(
    model="gpt-4-0125-preview",
    response_model=instructor.Partial[CashFlowStatements],
    messages=[
        {
            "role": "system",
            "content": "You are an expert at extracting financial accounting data from JSON",
        },
        {
            "role": "user",
            "content": f"Extract the following fields {schema_json} for FY2023, FY2022, and FY2021 from following context: ```{context_json}```",
        },
    ],
)

print(response.model_dump_json(indent=2))

print(f"Took {time.perf_counter() - start} seconds to complete!")

{
  "cash_flow_statements": [
    {
      "period": "FY2023",
      "net_income": 4792000000.0,
      "depreciation_and_amortization": 138000000.0,
      "shared_based_compensation": 1120000000.0,
      "net_cash_from_operating_activities": 3884000000.0,
      "net_cash_from_investing_activities": -1042000000.0,
      "plant_property_and_equipment": null,
      "net_cash_from_financing_activities": -2430000000.0
    },
    {
      "period": "FY2022",
      "net_income": 1893000000.0,
      "depreciation_and_amortization": 81000000.0,
      "shared_based_compensation": 899000000.0,
      "net_cash_from_operating_activities": 3430000000.0,
      "net_cash_from_investing_activities": -28000000.0,
      "plant_property_and_equipment": 25000000.0,
      "net_cash_from_financing_activities": -689000000.0
    },
    {
      "period": "FY2021",
      "net_income": -674339000.0,
      "depreciation_and_amortization": 125876000.0,
      "shared_based_compensation": 3001948000.0,
      "net_cash_