## Get Company Financials (cashflow statement, income statement) by accessing EDGAR using a company's ticker 

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

# Root of the SEC JSON API; the submissions URLs below are built from it.
BASE = "https://data.sec.gov"
# Passed as `headers=` on every requests.get() call made by the functions below.
# NOTE(review): presumably SEC expects an identifying User-Agent with a contact
# address — confirm against SEC's fair-access guidance before changing it.
HEADERS = {"User-Agent": "kurio-agent/1.0 (potatojacket9@gmail.com)"}

## Step 1. Get CIK for the input ticker symbol
def get_cik(ticker):
    """Resolve a ticker symbol (or a numeric CIK) to a zero-padded 10-digit CIK.

    Args:
        ticker: Ticker symbol such as "PFE" (matched case-insensitively), or a
            CIK given as digits, with or without leading zeros.

    Returns:
        The CIK as a 10-character zero-padded string, or None when the input
        is empty, the numeric CIK has no submissions record, or the ticker is
        not listed in SEC's company_tickers.json.

    Raises:
        requests.HTTPError: If the ticker lookup file cannot be downloaded.
    """
    symbol = "" if ticker is None else str(ticker).strip()
    if not symbol:
        # Nothing to look up; avoid a pointless network round trip.
        return None

    # A purely numeric input is already a CIK: confirm it against the
    # submissions endpoint and hand it back instead of discarding it.
    if symbol.isdigit():
        cik = symbol.zfill(10)
        probe = requests.get(f"{BASE}/submissions/CIK{cik}.json", headers=HEADERS)
        return cik if probe.status_code == 200 else None

    lookup = requests.get("https://www.sec.gov/files/company_tickers.json", headers=HEADERS)
    # Fail loudly (e.g. on a 403) rather than with an opaque JSON decode error.
    lookup.raise_for_status()

    wanted = symbol.lower()
    for company in lookup.json().values():
        if company["ticker"].lower() == wanted:
            return str(company["cik_str"]).zfill(10)
    return None

## Step 2. Get latest 10K url for the CIK (ticker)
def get_latest_10k_url(cik):
    """Locate a company's latest 10-K filing index page.

    Args:
        cik: CIK as used in the submissions URL (the zero-padded string
            returned by get_cik()).

    Returns:
        (filing_index_url, filing_date, report_date) for the first entry whose
        form is exactly "10-K" in the company's "recent" filings, or
        (None, None, None) when no such entry exists.
        NOTE(review): "latest" relies on the feed listing newest filings first — confirm.

    Raises:
        requests.HTTPError: If the submissions request is not successful.
    """
    res = requests.get(f"{BASE}/submissions/CIK{cik}.json", headers=HEADERS)
    # Surface 403/404 responses directly instead of a JSON/KeyError further down.
    res.raise_for_status()
    recent = res.json()["filings"]["recent"]

    # "recent" holds one parallel list per column; zip walks them row by row.
    rows = zip(
        recent["form"],
        recent["accessionNumber"],
        recent["filingDate"],
        recent["reportDate"],
    )
    for form, accession, filing_date, report_date in rows:
        if form != "10-K":
            continue
        # The archive folder uses the accession number without dashes, while
        # the index page file name keeps them; the CIK loses its zero padding.
        folder = accession.replace("-", "")
        filing_url = f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{folder}/{accession}-index.html"
        return filing_url, filing_date, report_date

    return None, None, None

#######################################

## Step 3: Find actual XBRL XML file from 10K URL
def get_primary_xbrl_url(index_url):
    """Return the URL of the first non-linkbase .xml link on a filing index page.

    The page at ``index_url`` is downloaded and every anchor with an href is
    examined in document order. Links containing a calculation, label,
    presentation or definition linkbase suffix are skipped. Returns None when
    no link qualifies.
    """
    linkbase_markers = ("_cal.xml", "_lab.xml", "_pre.xml", "_def.xml")

    page = requests.get(index_url, headers=HEADERS)
    parsed = BeautifulSoup(page.text, "html.parser")

    for anchor in parsed.find_all("a", href=True):
        target = anchor["href"]
        if not target.endswith(".xml"):
            continue
        if any(marker in target for marker in linkbase_markers):
            continue
        # hrefs are joined onto the sec.gov host as-is.
        return "https://www.sec.gov" + target

    return None
#######################################

## Step 4a. Extract cash flow tags from a parsed XBRL document (input is a soup object). This does not depend on any other function.
def parse_cashflow_from_xbrl(soup):
    """Extract cash flow statement facts from a parsed XBRL document.

    Args:
        soup: Parsed document exposing ``find(tag_name)`` (e.g. a
            BeautifulSoup object). For each tag the element returned by
            ``soup.find(tag)`` is used.
            NOTE(review): that is a single match per tag; it may not be the
            current-period, consolidated fact — confirm against the filing.

    Returns:
        dict with one entry per label in ``tags``. Each value is a float, the
        stripped raw text when it is not numeric, or None when the tag is
        missing or its text is blank.
    """
    tags = {
        # Operating
        "Net profit (or loss if negative)": "us-gaap:NetIncomeLoss",
        "Depreciation (wear & tear on assets)": "us-gaap:DepreciationDepletionAndAmortization",
        "stock_comp": "us-gaap:ShareBasedCompensation",
        "change_ar": "us-gaap:IncreaseDecreaseInAccountsReceivable",
        "change_inventory": "us-gaap:IncreaseDecreaseInInventory",
        "change_ap": "us-gaap:IncreaseDecreaseInAccountsPayable",
        "Cash from day-to-day business (Operating Cashflow)": "us-gaap:NetCashProvidedByUsedInOperatingActivities",

        # Investing
        "Buying equipment/buildings (Capital Expenditure)": "us-gaap:PaymentsToAcquirePropertyPlantAndEquipment",
        "acquisitions": "us-gaap:PaymentsToAcquireBusinessesNetOfCashAcquired",
        "asset_sales": "us-gaap:ProceedsFromSaleOfPropertyPlantAndEquipment",
        "investments_purchase": "us-gaap:PaymentsToAcquireMarketableSecurities",
        "investments_maturity": "us-gaap:ProceedsFromMaturitiesOfMarketableSecurities",
        "Cash from investments (Buying/Selling assets)": "us-gaap:NetCashProvidedByUsedInInvestingActivities",

        # Financing
        "Money raised from issuing new shares": "us-gaap:ProceedsFromIssuanceOfCommonStock",
        "Money spent buying back shares of company": "us-gaap:PaymentsForRepurchaseOfCommonStock",
        "Borrowed money (New loans or bonds)": "us-gaap:ProceedsFromIssuanceOfLongTermDebt",
        "Loan repayments": "us-gaap:RepaymentsOfLongTermDebt",
        "Dividends paid to shareholders": "us-gaap:PaymentsOfDividends",
        "Cash from investors and loans (Financing activities)": "us-gaap:NetCashProvidedByUsedInFinancingActivities",

        # Summary
        "Change in cash during the period": "us-gaap:CashAndCashEquivalentsPeriodIncreaseDecrease",
        "Cash at the beginning of the period": "us-gaap:CashAndCashEquivalentsAtBeginningOfPeriod",
        "Cash remaining at the end of the period": "us-gaap:CashAndCashEquivalentsAtCarryingValue",
    }

    data = {}
    for key, tag in tags.items():
        el = soup.find(tag)
        raw = el.text.strip() if el is not None else ""
        if not raw:
            # Missing tag or blank fact: report "no value" instead of storing "".
            data[key] = None
            continue
        try:
            # Tolerate thousands separators such as "1,234.5".
            data[key] = float(raw.replace(",", ""))
        except ValueError:
            # Keep non-numeric text verbatim so nothing is silently dropped.
            data[key] = raw

    return data

## Step 4b. Extract income statement tags from a parsed XBRL document (input is a soup object). This does not depend on any other function.
def parse_income_from_xbrl(soup):
    """Extract key income statement values from a parsed XBRL document.

    Values are grouped into "revenue", "expenses", "profit" and "shares".
    Each label lists candidate tags in priority order; the first candidate
    with non-blank numeric text wins.

    Missing values are derived where possible, in this order:
      * Gross Profit = Total Revenue - Cost of Revenue
      * Operating Expenses (Total) = sum of whichever of R&D,
        Sales & Marketing and G&A are present
      * Operating Income = Gross Profit - Operating Expenses (Total)
      * Income Before Tax = Operating Income
        + Interest Income - Interest Expense + Other Income
        (absent components count as 0)

    Args:
        soup: Parsed document exposing ``find(tag_name)`` (e.g. a
            BeautifulSoup object).

    Returns:
        dict of section name -> dict of label -> float or None.
    """

    # XBRL tags grouped by category; each label maps to candidate tags in priority order.
    tags = {
        "revenue": {
            "Total Revenue": [
                "us-gaap:Revenues",
                "us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax",
                "us-gaap:SalesRevenueNet",
                "us-gaap:SalesRevenueGoodsNet",
                "us-gaap:SalesRevenueServicesNet"
            ],
            "Advertising Revenue": ["us-gaap:AdvertisingRevenue"],
            "Interest Income": ["us-gaap:InterestIncome"],
            "Other Income": ["us-gaap:OtherNonoperatingIncomeExpense"]
        },

        "expenses": {
            "Cost of Revenue": ["us-gaap:CostOfRevenue", "us-gaap:CostOfGoodsSold"],
            "Research & Development": ["us-gaap:ResearchAndDevelopmentExpense"],
            "Sales & Marketing": ["us-gaap:SellingAndMarketingExpense"],
            "General & Administrative": ["us-gaap:GeneralAndAdministrativeExpense"],
            "Operating Expenses (Total)": ["us-gaap:OperatingExpenses"],
            "Interest Expense": ["us-gaap:InterestExpense"],
            "Income Tax Expense": ["us-gaap:IncomeTaxExpenseBenefit"]
        },

        "profit": {
            "Gross Profit": ["us-gaap:GrossProfit"],
            "Operating Income": ["us-gaap:OperatingIncomeLoss"],
            "Income Before Tax": [
                "us-gaap:IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest"
            ],
            "Net Income": ["us-gaap:NetIncomeLoss"]
        },

        "shares": {
            "Earnings per Share (Basic)": ["us-gaap:EarningsPerShareBasic"],
            "Earnings per Share (Diluted)": ["us-gaap:EarningsPerShareDiluted"],
            "Weighted Average Shares Outstanding (Basic)": [
                "us-gaap:WeightedAverageNumberOfSharesOutstandingBasic"
            ],
            "Weighted Average Shares Outstanding (Diluted)": [
                "us-gaap:WeightedAverageNumberOfDilutedSharesOutstanding"
            ]
        }
    }

    def extract_value(tag_list):
        """Return the first candidate tag's numeric value, or None if none parses."""
        for tag in tag_list:
            el = soup.find(tag)
            if el is None:
                continue
            raw = el.text.strip()
            if not raw:
                continue
            try:
                return float(raw.replace(",", ""))
            except ValueError:
                # Non-numeric text: fall through to the next candidate tag.
                continue
        return None

    # Parse raw values from XBRL
    grouped_data = {
        section: {label: extract_value(tag_list) for label, tag_list in section_tags.items()}
        for section, section_tags in tags.items()
    }

    # --- Compute derived metrics ---
    # Each step reads the *current* dict entries so that a value derived by an
    # earlier step is visible to the later ones.
    revenue = grouped_data["revenue"]
    expenses = grouped_data["expenses"]
    profit = grouped_data["profit"]

    # Gross Profit = Revenue - Cost of Revenue
    rev = revenue["Total Revenue"]
    cost = expenses["Cost of Revenue"]
    if profit["Gross Profit"] is None and rev is not None and cost is not None:
        profit["Gross Profit"] = rev - cost

    # Operating Expenses (Total) = R&D + Sales & Marketing + G&A (whichever are present)
    if expenses["Operating Expenses (Total)"] is None:
        components = [
            expenses[label]
            for label in ("Research & Development", "Sales & Marketing", "General & Administrative")
            if expenses[label] is not None
        ]
        if components:
            expenses["Operating Expenses (Total)"] = sum(components)

    # Operating Income = Gross Profit - Operating Expenses
    gross = profit["Gross Profit"]
    op_exp = expenses["Operating Expenses (Total)"]
    if profit["Operating Income"] is None and gross is not None and op_exp is not None:
        profit["Operating Income"] = gross - op_exp

    # Income Before Tax = Operating Income + (Interest Income - Interest Expense) + Other Income
    # Re-read Operating Income here: it may have been derived just above, and
    # a value captured before that step would still be None.
    op_inc = profit["Operating Income"]
    if profit["Income Before Tax"] is None and op_inc is not None:
        total_other = (
            (revenue["Interest Income"] or 0)
            - (expenses["Interest Expense"] or 0)
            + (revenue["Other Income"] or 0)
        )
        profit["Income Before Tax"] = op_inc + total_other

    return grouped_data

## Step 5a. Parse a given filing (the actual XBRL or XML)
def parse_cashflow_from_10k(latest_10k_filing_XBRL_XML):
    """Download a 10-K XBRL/XML document and return its cash flow data.

    The URL is fetched with the shared HEADERS, parsed with BeautifulSoup and
    handed to parse_cashflow_from_xbrl(); that function's result is returned.
    """
    body = requests.get(latest_10k_filing_XBRL_XML, headers=HEADERS).text

    # Use the XML parser only when an XML declaration appears within the
    # first 100 characters; otherwise fall back to the HTML parser.
    if "<?xml" in body[:100]:
        chosen_parser = "xml"
    else:
        chosen_parser = "html.parser"

    document = BeautifulSoup(body, features=chosen_parser)
    return parse_cashflow_from_xbrl(document)

## Step 5b. Parse a given filing to get the income statement
def parse_income_from_10k(latest_10k_filing_XBRL_XML):
    """Download a 10-K XBRL/XML document and return its income statement data.

    The URL is fetched with the shared HEADERS, parsed with BeautifulSoup and
    handed to parse_income_from_xbrl(); that function's result is returned.
    """
    body = requests.get(latest_10k_filing_XBRL_XML, headers=HEADERS).text

    # An XML declaration within the first 100 characters selects the XML
    # parser; anything else is treated as HTML.
    looks_like_xml = "<?xml" in body[:100]
    document = BeautifulSoup(body, features="xml" if looks_like_xml else "html.parser")

    return parse_income_from_xbrl(document)


In [3]:
### Consolidate all of the above into a flow:
def get_cashflow_json(ticker):
    """End-to-end: ticker -> CIK -> latest 10-K -> XBRL -> cash flow and income data.

    Args:
        ticker: Ticker symbol passed to get_cik().

    Returns:
        dict with keys "ticker", "cik", "source" (the XBRL URL), "cashflow",
        "income_statement", "filing_date" and "report_date".

    Raises:
        ValueError: If the CIK, the 10-K filing or the XBRL file cannot be
            found, or if a statement parser extracted no values at all.
    """
    ## Step 1. Get CIK for the input ticker symbol
    cik = get_cik(ticker)
    if not cik:
        raise ValueError("CIK not found.")

    ## Step 2. Get latest 10K url for the CIK (ticker)
    filing_url, filing_date, report_date = get_latest_10k_url(cik)
    if not filing_url:
        raise ValueError("No 10-K filing found.")

    print(f"Found 10-K filing: {filing_url} (filed {filing_date}, for {report_date})")

    # Step 3: Find actual XBRL XML file from 10K URL
    xbrl_url = get_primary_xbrl_url(filing_url)
    if not xbrl_url:
        raise ValueError("No XBRL XML file found in filing.")

    print(f"Using XBRL XML file: {xbrl_url}")

    # Step 5a: Parse cashflow data from that XML (4a embedded)
    cashflow_data = parse_cashflow_from_10k(xbrl_url)
    # The parser returns one key per tag even when nothing matched, so an
    # emptiness check alone can never fire; "not found" means every value is None.
    if not cashflow_data or all(value is None for value in cashflow_data.values()):
        raise ValueError("Cash flow statement not found in filing.")

    # Step 5b: Parse income data from XML (4b embedded)
    income_data = parse_income_from_10k(xbrl_url)
    # Same reasoning, one level deeper: sections map labels to values.
    if not income_data or all(
        value is None
        for section in income_data.values()
        for value in section.values()
    ):
        raise ValueError("income statement not found in filing.")

    # Step 6: Return results
    return {
        "ticker": ticker,
        "cik": cik,
        "source": xbrl_url,
        "cashflow": cashflow_data,
        "income_statement": income_data,
        "filing_date": filing_date,
        "report_date": report_date,
    }


### Store the resulting financials (cash flow and income statement) in the Firestore database

In [19]:
## setup connection to firestore
from dotenv import load_dotenv
import os

from google.oauth2 import service_account
from google.cloud import firestore, storage


# -------------------------------
# Load .env
# -------------------------------
# Load settings from .env so the os.getenv() calls below can see them.
load_dotenv()

# Service-account key file path, GCP project id and Storage bucket name,
# all read from the environment (None when a variable is not set).
SERVICE_ACCOUNT_PATH = os.getenv("FIREBASE_SERVICE_ACCOUNT_JSON")
PROJECT_ID = os.getenv("GCP_PROJECT_ID")
BUCKET_NAME = os.getenv("GCP_STORAGE_BUCKET")

# Fail fast, naming the specific setting that is absent or empty.
if not SERVICE_ACCOUNT_PATH:
    raise ValueError("Missing FIREBASE_SERVICE_ACCOUNT_JSON in .env")
if not PROJECT_ID:
    raise ValueError("Missing GCP_PROJECT_ID in .env")
if not BUCKET_NAME:
    raise ValueError("Missing GCP_STORAGE_BUCKET in .env")


# -------------------------------
# Load Credentials (ONE source)
# -------------------------------
# A single credentials object, shared by both clients created below.
creds = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_PATH
)


# -------------------------------
# Initialize Firestore + Storage
# -------------------------------
# `db` is the module-level Firestore client; `bucket` is a handle for BUCKET_NAME.
db = firestore.Client(credentials=creds, project=PROJECT_ID)
storage_client = storage.Client(credentials=creds, project=PROJECT_ID)
bucket = storage_client.bucket(BUCKET_NAME)

# Print the objects as a quick visual confirmation that setup completed.
print("ðŸ”¥ Firestore and Storage initialized successfully!")
print(bucket)
print(storage_client)


ðŸ”¥ Firestore and Storage initialized successfully!
<Bucket: funwai-resume.appspot.com>
<google.cloud.storage.client.Client object at 0x00000220FD45C0E0>


In [9]:
# Build a Firestore client directly from the service-account JSON file.
# NOTE(review): this rebinds the module-level `db` that the setup cell already
# created with explicit credentials and PROJECT_ID; no project is passed here.
# Confirm the rebinding is intended, or keep only one of the two initializations.
db = firestore.Client.from_service_account_json(SERVICE_ACCOUNT_PATH)

print("Connected to Firestore!")

def store_financials_to_firestore(financials_dict, collection_name="company_financials"):
    """Write one company's financials to Firestore, keyed by ticker.

    Uses the module-level ``db`` Firestore client.

    Args:
        financials_dict (dict): The result from get_cashflow_json(ticker);
            must contain a non-empty "ticker" entry.
        collection_name (str): Firestore collection name.

    Raises:
        ValueError: If financials_dict has no (or an empty) "ticker".
    """
    ticker = financials_dict.get("ticker")
    if not ticker:
        raise ValueError("Ticker not found in dictionary")

    # Use ticker as document ID (so each company overwrites with latest data)
    doc_ref = db.collection(collection_name).document(ticker)
    doc_ref.set(financials_dict)

    print(f"âœ… Stored financial data for {ticker}.")

Connected to Firestore!


In [22]:
## Smoke test: run the full pipeline for ticker "PFE" and write the result to Firestore.
## Needs network access plus the `db` client created in the cells above.
data = get_cashflow_json("PFE")
store_financials_to_firestore(data)

Found 10-K filing: https://www.sec.gov/Archives/edgar/data/78003/000007800325000054/0000078003-25-000054-index.html (filed 2025-02-27, for 2024-12-31)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/78003/000007800325000054/pfe-20241231_htm.xml
âœ… Stored financial data for PFE.
