In [1]:
import json
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Root of the SEC data API; the submissions URLs below are built from it.
BASE = "https://data.sec.gov"
# Headers passed to the requests.get calls below; the User-Agent identifies
# this client and carries a contact address.
HEADERS = {"User-Agent": "kurio-agent/1.0 (potatojacket9@gmail.com)"}

## Step 2. Get the CIK for the input ticker symbol
def get_cik(ticker):
    """Resolve a ticker symbol (or a bare CIK number) to a 10-digit CIK string.

    Args:
        ticker: Ticker symbol such as "NKE" (matched case-insensitively).
            A purely numeric value is treated as a CIK and checked against
            the submissions endpoint instead of the ticker table.

    Returns:
        str | None: The CIK zero-padded to 10 characters, or None when the
        input is empty, the numeric CIK is not found, or no ticker matches.

    Raises:
        requests.HTTPError: If the ticker lookup table request returns an
            error status.
    """
    symbol = str(ticker).strip()
    if not symbol:
        return None

    # A numeric input is already a CIK: confirm that it exists and return it,
    # rather than searching the ticker table (where it can never match).
    if symbol.isdigit():
        cik = symbol.zfill(10)
        res = requests.get(f"{BASE}/submissions/CIK{cik}.json", headers=HEADERS, timeout=30)
        return cik if res.status_code == 200 else None

    # Ticker symbols are resolved through the SEC ticker table directly;
    # probing the submissions endpoint with a ticker would only produce a 404.
    lookup = requests.get("https://www.sec.gov/files/company_tickers.json", headers=HEADERS, timeout=30)
    lookup.raise_for_status()

    wanted = symbol.lower()
    for entry in lookup.json().values():
        if entry["ticker"].lower() == wanted:
            return str(entry["cik_str"]).zfill(10)
    return None

## Step 3. Get latest 10K url for the CIK (ticker)
def get_latest_10k_url(cik):
    """Find the first 10-K entry in a company's recent filings and build its index-page URL.

    Args:
        cik: Central Index Key as a string or int. It is zero-padded to 10
            digits for the submissions request and passed through ``int()``
            (dropping leading zeros) for the Archives path.

    Returns:
        tuple: ``(filing_url, filing_date, report_date)`` for the first entry
        whose form is exactly ``"10-K"``, or ``(None, None, None)`` when the
        recent filings contain no such entry.

    Raises:
        requests.HTTPError: If the submissions endpoint returns an error status.
    """
    # Pad so that 320187 and "0000320187" address the same document.
    padded_cik = str(cik).zfill(10)
    res = requests.get(f"{BASE}/submissions/CIK{padded_cik}.json", headers=HEADERS, timeout=30)
    # Surface the HTTP status instead of a JSON/KeyError on an error page.
    res.raise_for_status()
    recent = res.json()["filings"]["recent"]

    # NOTE(review): assumes the "recent" arrays are ordered newest first, so
    # the first match is the latest 10-K — confirm against the EDGAR API docs.
    for form, acc, filing_date, report_date in zip(
        recent["form"],
        recent["accessionNumber"],
        recent["filingDate"],
        recent["reportDate"],
    ):
        if form == "10-K":
            # The folder name is the accession number without dashes; the
            # index page keeps the dashed form.
            acc_num = acc.replace("-", "")
            filing_url = f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{acc_num}/{acc}-index.html"
            return filing_url, filing_date, report_date

    return None, None, None



#######################################

## Step 3a. Extract cash flow statement values from a parsed XBRL document (input is the soup). This helper does not depend on any other function.
def parse_cashflow_from_xbrl(soup):
    """Collect cash flow statement values from a parsed XBRL document.

    Each output label is looked up through ``soup.find(<us-gaap tag>)``.

    Args:
        soup: Parsed document exposing ``find(tag)``; found elements are
            expected to carry a ``text`` attribute.

    Returns:
        dict: One entry per label, in the order listed below. The value is a
        float when the element text parses as a number (commas removed), the
        stripped text when it does not, and None when no element is found.
    """
    concept_by_label = {
        # Operating
        "Net profit (or loss if negative)": "us-gaap:NetIncomeLoss",
        "Depreciation (wear & tear on assets)": "us-gaap:DepreciationDepletionAndAmortization",
        "stock_comp": "us-gaap:ShareBasedCompensation",
        "change_ar": "us-gaap:IncreaseDecreaseInAccountsReceivable",
        "change_inventory": "us-gaap:IncreaseDecreaseInInventory",
        "change_ap": "us-gaap:IncreaseDecreaseInAccountsPayable",
        "Cash from day-to-day business (Operating Cashflow)": "us-gaap:NetCashProvidedByUsedInOperatingActivities",

        # Investing
        "Buying equipment/buildings (Capital Expenditure)": "us-gaap:PaymentsToAcquirePropertyPlantAndEquipment",
        "acquisitions": "us-gaap:PaymentsToAcquireBusinessesNetOfCashAcquired",
        "asset_sales": "us-gaap:ProceedsFromSaleOfPropertyPlantAndEquipment",
        "investments_purchase": "us-gaap:PaymentsToAcquireMarketableSecurities",
        "investments_maturity": "us-gaap:ProceedsFromMaturitiesOfMarketableSecurities",
        "Cash from investments (Buying/Selling assets)": "us-gaap:NetCashProvidedByUsedInInvestingActivities",

        # Financing
        "Money raised from issuing new shares": "us-gaap:ProceedsFromIssuanceOfCommonStock",
        "Money spent buying back shares of company": "us-gaap:PaymentsForRepurchaseOfCommonStock",
        "Borrowed money (New loans or bonds)": "us-gaap:ProceedsFromIssuanceOfLongTermDebt",
        "Loan repayments": "us-gaap:RepaymentsOfLongTermDebt",
        "Dividends paid to shareholders": "us-gaap:PaymentsOfDividends",
        "Cash from investors and loans (Financing activities)": "us-gaap:NetCashProvidedByUsedInFinancingActivities",

        # Summary
        "Change in cash during the period": "us-gaap:CashAndCashEquivalentsPeriodIncreaseDecrease",
        "Cash at the beginning of the period": "us-gaap:CashAndCashEquivalentsAtBeginningOfPeriod",
        "Cash remaining at the end of the period": "us-gaap:CashAndCashEquivalentsAtCarryingValue",
    }

    def _value_of(concept):
        """Return the numeric (or raw text) content of one concept, or None."""
        node = soup.find(concept)
        if not (node and hasattr(node, "text")):
            return None
        try:
            return float(node.text.strip().replace(",", ""))
        except ValueError:
            # Keep non-numeric content as stripped text.
            return node.text.strip()

    return {label: _value_of(concept) for label, concept in concept_by_label.items()}

## Step 3b. Extract income statement values from a parsed XBRL document (input is the soup). This helper does not depend on any other function.
def parse_income_from_xbrl(soup):
    """Collect income statement values from a parsed XBRL document.

    Every label has an ordered list of candidate us-gaap tags. The first
    candidate whose element text parses as a number supplies the value.

    Args:
        soup: Parsed document exposing ``find(tag)``; found elements are
            expected to carry a ``text`` attribute.

    Returns:
        dict: One entry per label, in the order listed below. The value is a
        float, or None when no candidate tag yields a numeric value.
    """
    tags = {
        # Revenue and related income
        "Total Revenue": [
            "us-gaap:Revenues",
            "us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax",
            "us-gaap:SalesRevenueNet",
            "us-gaap:SalesRevenueGoodsNet",
            "us-gaap:SalesRevenueServicesNet",
        ],
        "Advertising Revenue": ["us-gaap:AdvertisingRevenue"],

        # Costs and expenses
        "Cost of Revenue": ["us-gaap:CostOfRevenue", "us-gaap:CostOfGoodsSold"],
        "Gross Profit": ["us-gaap:GrossProfit"],
        "Research & Development": ["us-gaap:ResearchAndDevelopmentExpense"],
        "Sales & Marketing": ["us-gaap:SellingAndMarketingExpense"],
        "General & Administrative": ["us-gaap:GeneralAndAdministrativeExpense"],
        "Operating Expenses (Total)": ["us-gaap:OperatingExpenses"],

        # Other income/expense
        "Interest Income": ["us-gaap:InterestIncome"],
        "Interest Expense": ["us-gaap:InterestExpense"],
        "Other Income (Expense)": ["us-gaap:OtherNonoperatingIncomeExpense"],
        "Income Before Tax": ["us-gaap:IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest"],
        "Income Tax Expense": ["us-gaap:IncomeTaxExpenseBenefit"],

        # Net results
        "Net Income": ["us-gaap:NetIncomeLoss"],
        "Net Income Attributable to Parent": ["us-gaap:NetIncomeLossAttributableToParent"],
        "Net Income Attributable to Noncontrolling Interest": ["us-gaap:NetIncomeLossAttributableToNoncontrollingInterest"],

        # EPS (earnings per share)
        "Earnings per Share (Basic)": ["us-gaap:EarningsPerShareBasic"],
        "Earnings per Share (Diluted)": ["us-gaap:EarningsPerShareDiluted"],
        "Weighted Average Shares Outstanding (Basic)": ["us-gaap:WeightedAverageNumberOfSharesOutstandingBasic"],
        "Weighted Average Shares Outstanding (Diluted)": ["us-gaap:WeightedAverageNumberOfDilutedSharesOutstanding"],
    }

    data = {}
    for label, tag_list in tags.items():
        val = None
        for tag in tag_list:
            el = soup.find(tag)
            if not el:
                continue
            raw = el.text.strip()
            if not raw:
                continue
            try:
                val = float(raw.replace(",", ""))
            except ValueError:
                # Only a failed numeric parse is tolerated; move on to the
                # next candidate tag. Other exceptions are allowed to surface.
                continue
            break
        data[label] = val

    return data

## Step 4: Find actual XBRL XML file from 10K URL
def get_primary_xbrl_url(index_url):
    """Find the main XBRL instance (.xml) link on a filing index page.

    Args:
        index_url: URL of the filing's ``-index.html`` page.

    Returns:
        str | None: Absolute URL of the first link ending in ``.xml`` that is
        not a calculation/label/presentation/definition linkbase, or None
        when the page has no such link.

    Raises:
        requests.HTTPError: If the index page request returns an error status.
    """
    res = requests.get(index_url, headers=HEADERS, timeout=30)
    # An error page would otherwise be scanned for links and reported as
    # "no XBRL file", hiding the real cause (e.g. throttling).
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    linkbase_markers = ("_cal.xml", "_lab.xml", "_pre.xml", "_def.xml")
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if href.endswith(".xml") and not any(marker in href for marker in linkbase_markers):
            # urljoin resolves root-relative, relative and absolute hrefs
            # against the index page instead of assuming a leading "/".
            return urljoin(index_url, href)

    return None

## Step 5. Parse a given filing (the actual XBRL or XML)
def parse_cashflow_from_10k(latest_10k_filing_XBRL_XML):
    """Download a filing document and extract its cash flow data.

    Args:
        latest_10k_filing_XBRL_XML: URL of the XBRL/XML (or HTML) document.

    Returns:
        dict: The result of ``parse_cashflow_from_xbrl`` for that document.

    Raises:
        requests.HTTPError: If the document request returns an error status.
    """
    return parse_cashflow_from_xbrl(_load_filing_soup(latest_10k_filing_XBRL_XML))

## Step 6. Parse a given filing to get the income statement
def parse_income_from_10k(latest_10k_filing_XBRL_XML):
    """Download a filing document and extract its income statement data.

    Args:
        latest_10k_filing_XBRL_XML: URL of the XBRL/XML (or HTML) document.

    Returns:
        dict: The result of ``parse_income_from_xbrl`` for that document.

    Raises:
        requests.HTTPError: If the document request returns an error status.
    """
    return parse_income_from_xbrl(_load_filing_soup(latest_10k_filing_XBRL_XML))


def _load_filing_soup(document_url):
    """Fetch a filing document and parse it into a BeautifulSoup tree."""
    response = requests.get(document_url, headers=HEADERS, timeout=60)
    # Fail on an error status rather than parsing an error page into a result
    # where every value is None.
    response.raise_for_status()

    # An XML declaration within the first 100 characters selects the XML
    # parser; anything else is handled by the HTML parser.
    parser = "xml" if "<?xml" in response.text[:100] else "html.parser"
    return BeautifulSoup(response.text, features=parser)


In [3]:
def get_cashflow_json(ticker):
    """End-to-end: from ticker → CIK → 10-K → XBRL → cashflow and income data.

    Args:
        ticker: Ticker symbol passed to ``get_cik``.

    Returns:
        dict: Keys ``ticker``, ``cik``, ``source`` (the XBRL URL),
        ``cashflow``, ``income_statement``, ``filing_date`` and
        ``report_date``.

    Raises:
        ValueError: If the CIK, the 10-K filing, the XBRL file, or either
            parsed statement cannot be obtained.
    """
    ## Step 2. Get CIK for the input ticker symbol
    cik = get_cik(ticker)
    if not cik:
        raise ValueError("CIK not found.")

    ## Step 3. Get latest 10K url for the CIK (ticker)
    filing_url, filing_date, report_date = get_latest_10k_url(cik)
    if not filing_url:
        raise ValueError("No 10-K filing found.")

    print(f"Found 10-K filing: {filing_url} (filed {filing_date}, for {report_date})")

    # Step 4: Find actual XBRL XML file from 10K URL
    xbrl_url = get_primary_xbrl_url(filing_url)
    if not xbrl_url:
        raise ValueError("No XBRL XML file found in filing.")

    print(f"Using XBRL XML file: {xbrl_url}")

    # Step 5: Parse cashflow data from that XML
    cashflow_data = parse_cashflow_from_10k(xbrl_url)
    if not cashflow_data:
        raise ValueError("Cash flow statement not found in filing.")

    # Step 6: Parse income data from XML
    income_data = parse_income_from_10k(xbrl_url)
    # Validate the income result itself; this check previously re-tested
    # cashflow_data, so an empty income result could never be reported.
    if not income_data:
        raise ValueError("income statement not found in filing.")

    # Step 7: Return results
    return {
        "ticker": ticker,
        "cik": cik,
        "source": xbrl_url,
        "cashflow": cashflow_data,
        "income_statement": income_data,
        "filing_date": filing_date,
        "report_date": report_date,
    }


In [3]:
# !pip install google-cloud-firestore

Collecting google-cloud-firestore
  Downloading google_cloud_firestore-2.21.0-py3-none-any.whl.metadata (9.9 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0,>=1.34.0 (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0,>=1.34.0->google-cloud-firestore)
  Downloading google_api_core-2.26.0-py3-none-any.whl.metadata (3.2 kB)
Collecting google-auth!=2.24.0,!=2.25.0,<3.0.0,>=2.14.1 (from google-cloud-firestore)
  Downloading google_auth-2.41.1-py2.py3-none-any.whl.metadata (6.6 kB)
Collecting google-cloud-core<3.0.0,>=1.4.1 (from google-cloud-firestore)
  Downloading google_cloud_core-2.4.3-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting proto-plus<2.0.0,>=1.22.0 (from google-cloud-firestore)
  Downloading proto_plus-1.26.1-py3-none-any.whl.metadata (2.2 kB)
Collecting googleapis-common-protos<2.0.0,>=1.56.2 (from google-api-core!=2.0.*,!=2.1

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 1.32.0 requires protobuf<5,>=3.20, but you have protobuf 6.32.1 which is incompatible.


In [7]:
from google.cloud import firestore
import json

# Initialize Firestore client (ensure your JSON path is correct)
# NOTE(review): the service-account key file is referenced by a hardcoded
# relative path, so this cell depends on the kernel's working directory
# containing that file. Consider supplying the path through configuration
# or an environment variable.
db = firestore.Client.from_service_account_json("funwai-resume-firebase-adminsdk-fbsvc-a956eb6362.json")

def store_financials_to_firestore(financials_dict, collection_name="company_financials"):
    """
    Write one company's financials dictionary to Firestore, keyed by its ticker.

    Args:
        financials_dict (dict): The result from get_cashflow_json(ticker); must
            contain a non-empty "ticker" entry.
        collection_name (str): Firestore collection that receives the document.

    Raises:
        ValueError: If the dictionary has no (or an empty) "ticker" entry.
    """
    ticker = financials_dict.get("ticker")
    if ticker:
        # The ticker doubles as the document ID, so storing the same company
        # again targets the same document.
        db.collection(collection_name).document(ticker).set(financials_dict)
        print(f"✅ Stored financial data for {ticker} in Firestore collection '{collection_name}'.")
        return

    raise ValueError("Ticker not found in dictionary")


In [11]:
## Smoke test of the full pipeline: fetch the latest 10-K data for NKE and store it in Firestore.
data = get_cashflow_json("NKE")
store_financials_to_firestore(data)

Found 10-K filing: https://www.sec.gov/Archives/edgar/data/320187/000032018725000047/0000320187-25-000047-index.html (filed 2025-07-17, for 2025-05-31)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/320187/000032018725000047/nke-20250531_htm.xml
✅ Stored financial data for NKE in Firestore collection 'company_financials'.


In [13]:
# Fetch the latest 10-K data for NFLX and store it under that ticker's document.
data = get_cashflow_json("NFLX")
store_financials_to_firestore(data)

Found 10-K filing: https://www.sec.gov/Archives/edgar/data/1065280/000106528025000044/0001065280-25-000044-index.html (filed 2025-01-27, for 2024-12-31)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/1065280/000106528025000044/nflx-20241231_htm.xml
✅ Stored financial data for NFLX in Firestore collection 'company_financials'.


In [15]:
# Fetch the latest 10-K data for GOOG and store it under that ticker's document.
data = get_cashflow_json("GOOG")
store_financials_to_firestore(data)

Found 10-K filing: https://www.sec.gov/Archives/edgar/data/1652044/000165204425000014/0001652044-25-000014-index.html (filed 2025-02-05, for 2024-12-31)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/1652044/000165204425000014/goog-20241231_htm.xml
✅ Stored financial data for GOOG in Firestore collection 'company_financials'.


In [17]:
# Fetch the latest 10-K data for MSFT and store it under that ticker's document.
data = get_cashflow_json("MSFT")
store_financials_to_firestore(data)

Found 10-K filing: https://www.sec.gov/Archives/edgar/data/789019/000095017025100235/0000950170-25-100235-index.html (filed 2025-07-30, for 2025-06-30)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/789019/000095017025100235/msft-20250630_htm.xml
✅ Stored financial data for MSFT in Firestore collection 'company_financials'.


In [31]:
# Fetch and store the latest 10-K data for META, then for ABNB.
# NOTE(review): the saved output below shows only the META run — confirm that
# the ABNB run completed.
data = get_cashflow_json("META")
store_financials_to_firestore(data)

data = get_cashflow_json("ABNB")
store_financials_to_firestore(data)

Found 10-K filing: https://www.sec.gov/Archives/edgar/data/1326801/000132680125000017/0001326801-25-000017-index.html (filed 2025-01-30, for 2024-12-31)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/1326801/000132680125000017/meta-20241231_htm.xml
✅ Stored financial data for META in Firestore collection 'company_financials'.


In [17]:
# Fetch the latest 10-K data for MA and store it under that ticker's document.
data = get_cashflow_json("MA")
store_financials_to_firestore(data)

Found 10-K filing: https://www.sec.gov/Archives/edgar/data/1141391/000114139125000011/0001141391-25-000011-index.html (filed 2025-02-12, for 2024-12-31)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/1141391/000114139125000011/ma-20241231_htm.xml
✅ Stored financial data for MA in Firestore collection 'company_financials'.


In [15]:
# Batch run: fetch and store the latest 10-K data for each ticker, in order.
# One list plus a loop replaces the repeated call pairs, so adding a company
# is a one-token change. `data` still ends up holding the last ticker's result.
BATCH_TICKERS = [
    "AMD", "RDDT", "NVDA", "WFC", "JPM", "CSCO", "CMCSA",
    "V", "MA", "JNJ", "HD", "UBER", "PFE", "COF",
]

for batch_ticker in BATCH_TICKERS:
    data = get_cashflow_json(batch_ticker)
    store_financials_to_firestore(data)

Found 10-K filing: https://www.sec.gov/Archives/edgar/data/2488/000000248825000012/0000002488-25-000012-index.html (filed 2025-02-05, for 2024-12-28)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/2488/000000248825000012/amd-20241228_htm.xml
✅ Stored financial data for AMD in Firestore collection 'company_financials'.
Found 10-K filing: https://www.sec.gov/Archives/edgar/data/1713445/000171344525000018/0001713445-25-000018-index.html (filed 2025-02-13, for 2024-12-31)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/1713445/000171344525000018/rddt-20241231_htm.xml
✅ Stored financial data for RDDT in Firestore collection 'company_financials'.
Found 10-K filing: https://www.sec.gov/Archives/edgar/data/1045810/000104581025000023/0001045810-25-000023-index.html (filed 2025-02-26, for 2025-01-26)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/1045810/000104581025000023/nvda-20250126_htm.xml
✅ Stored financial data for NVDA in Firestore collection 'com

In [11]:
pip install fastapi uvicorn

Collecting fastapi
  Downloading fastapi-0.120.0-py3-none-any.whl.metadata (28 kB)
Collecting uvicorn
  Downloading uvicorn-0.38.0-py3-none-any.whl.metadata (6.8 kB)
Collecting starlette<0.49.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.48.0-py3-none-any.whl.metadata (6.3 kB)
Collecting annotated-doc>=0.0.2 (from fastapi)
  Downloading annotated_doc-0.0.3-py3-none-any.whl.metadata (6.6 kB)
Downloading fastapi-0.120.0-py3-none-any.whl (108 kB)
   ---------------------------------------- 0.0/108.2 kB ? eta -:--:--
   ----------- ---------------------------- 30.7/108.2 kB 1.4 MB/s eta 0:00:01
   ----------- ---------------------------- 30.7/108.2 kB 1.4 MB/s eta 0:00:01
   ----------------------------------- -- 102.4/108.2 kB 737.3 kB/s eta 0:00:01
   -------------------------------------- 108.2/108.2 kB 700.0 kB/s eta 0:00:00
Downloading uvicorn-0.38.0-py3-none-any.whl (68 kB)
   ---------------------------------------- 0.0/68.1 kB ? eta -:--:--
   ------------------------ -------

In [None]:
##### New section ####