## Pull all data and store into GCP using Ticker

In [1]:
# Get CIK from TICKER
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

BASE = "https://data.sec.gov"
HEADERS = {"User-Agent": "kurio-agent/1.0 (potatojacket9@gmail.com)"}

## Step 1. Get CIK for the input ticker symbol~
def get_cik(ticker):
    """Retrieve CIK for a given ticker symbol"""
    res = requests.get(f"{BASE}/submissions/CIK{ticker}.json", headers=HEADERS)
    if res.status_code != 200:
        # fallback: try SEC ticker endpoint
        lookup = requests.get(f"https://www.sec.gov/files/company_tickers.json", headers=HEADERS).json()
        for _, c in lookup.items():
            if c["ticker"].lower() == ticker.lower():
                return str(c["cik_str"]).zfill(10)
    return None

## Step 2. Get latest 10K url for the CIK (ticker)
def get_latest_10k_url(cik):
    """Retrieve latest 10-K filing document URL and dates for a given CIK"""
    url = f"{BASE}/submissions/CIK{cik}.json"
    res = requests.get(url, headers=HEADERS)
    data = res.json()

    for form, acc, filing_date, report_date in zip(
        data["filings"]["recent"]["form"],
        data["filings"]["recent"]["accessionNumber"],
        data["filings"]["recent"]["filingDate"],
        data["filings"]["recent"]["reportDate"]
    ):
        if form == "10-K":
            acc_num = acc.replace("-", "")
            filing_url = f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{acc_num}/{acc}-index.html"
            return filing_url, filing_date, report_date

    return None, None, None

### [A] Get financial statements - income statement, cashflow statement - from 10K

#### [A1] Supporting functions for income statement, cashflow statement

In [3]:
## Step 3: Find actual XBRL XML file from 10K URL
def get_primary_xbrl_url(index_url):
    """Find the main XBRL (XML) file inside the 10-K index page"""
    res = requests.get(index_url, headers=HEADERS)
    soup = BeautifulSoup(res.text, "html.parser")

    for link in soup.find_all("a", href=True):
        href = link["href"]
        if href.endswith(".xml") and not any(
            x in href for x in ["_cal.xml", "_lab.xml", "_pre.xml", "_def.xml"]
        ):
            return "https://www.sec.gov" + href

    return None

## Step 4a. Define ability to find to extract cashflow tags from XML soup (input is soup). This is not dependent on any function
def parse_cashflow_from_xbrl(soup):
    tags = {
        # Operating
        "Net profit (or loss if negative)": "us-gaap:NetIncomeLoss",
        "Depreciation (wear & tear on assets)": "us-gaap:DepreciationDepletionAndAmortization",
        "stock_comp": "us-gaap:ShareBasedCompensation",
        "change_ar": "us-gaap:IncreaseDecreaseInAccountsReceivable",
        "change_inventory": "us-gaap:IncreaseDecreaseInInventory",
        "change_ap": "us-gaap:IncreaseDecreaseInAccountsPayable",
        "Cash from day-to-day business (Operating Cashflow)": "us-gaap:NetCashProvidedByUsedInOperatingActivities",

        # Investing
        "Buying equipment/buildings (Capital Expenditure)": "us-gaap:PaymentsToAcquirePropertyPlantAndEquipment",
        "acquisitions": "us-gaap:PaymentsToAcquireBusinessesNetOfCashAcquired",
        "asset_sales": "us-gaap:ProceedsFromSaleOfPropertyPlantAndEquipment",
        "investments_purchase": "us-gaap:PaymentsToAcquireMarketableSecurities",
        "investments_maturity": "us-gaap:ProceedsFromMaturitiesOfMarketableSecurities",
        "Cash from investments (Buying/Selling assets)": "us-gaap:NetCashProvidedByUsedInInvestingActivities",

        # Financing
        "Money raised from issuing new shares": "us-gaap:ProceedsFromIssuanceOfCommonStock",
        "Money spent buying back shares of company": "us-gaap:PaymentsForRepurchaseOfCommonStock",
        "Borrowed money (New loans or bonds)": "us-gaap:ProceedsFromIssuanceOfLongTermDebt",
        "Loan repayments": "us-gaap:RepaymentsOfLongTermDebt",
        "Dividends paid to shareholders": "us-gaap:PaymentsOfDividends",
        "Cash from investors and loans (Financing activities)": "us-gaap:NetCashProvidedByUsedInFinancingActivities",

        # Summary
        "Change in cash during the period": "us-gaap:CashAndCashEquivalentsPeriodIncreaseDecrease",
        "Cash at the beginning of the period": "us-gaap:CashAndCashEquivalentsAtBeginningOfPeriod",
        "Cash remaining at the end of the period": "us-gaap:CashAndCashEquivalentsAtCarryingValue",
    }

    data = {}
    for key, tag in tags.items():
        el = soup.find(tag)
        if el and hasattr(el, "text"):
            try:
                data[key] = float(el.text.strip().replace(",", ""))
            except ValueError:
                data[key] = el.text.strip()
        else:
            data[key] = None

    return data

## Step 4b. Define ability to find to extract cashflow tags from XML soup (input is soup). This is not dependent on any function
def parse_income_from_xbrl(soup):
    """
    Extract key income statement values from an XBRL soup object and group
    them into: revenue, expenses, profit, and shares.
    If some values are missing, compute derived ones (e.g. Gross Profit = Revenue - Cost of Revenue).
    """

    # Define XBRL tags grouped by category
    tags = {
        "revenue": {
            "Total Revenue": [
                "us-gaap:Revenues",
                "us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax",
                "us-gaap:SalesRevenueNet",
                "us-gaap:SalesRevenueGoodsNet",
                "us-gaap:SalesRevenueServicesNet"
            ],
            "Advertising Revenue": ["us-gaap:AdvertisingRevenue"],
            "Interest Income": ["us-gaap:InterestIncome"],
            "Other Income": ["us-gaap:OtherNonoperatingIncomeExpense"]
        },

        "expenses": {
            "Cost of Revenue": ["us-gaap:CostOfRevenue", "us-gaap:CostOfGoodsSold"],
            "Research & Development": ["us-gaap:ResearchAndDevelopmentExpense"],
            "Sales & Marketing": ["us-gaap:SellingAndMarketingExpense"],
            "General & Administrative": ["us-gaap:GeneralAndAdministrativeExpense"],
            "Operating Expenses (Total)": ["us-gaap:OperatingExpenses"],
            "Interest Expense": ["us-gaap:InterestExpense"],
            "Income Tax Expense": ["us-gaap:IncomeTaxExpenseBenefit"]
        },

        "profit": {
            "Gross Profit": ["us-gaap:GrossProfit"],
            "Operating Income": ["us-gaap:OperatingIncomeLoss"],
            "Income Before Tax": [
                "us-gaap:IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest"
            ],
            "Net Income": ["us-gaap:NetIncomeLoss"]
        },

        "shares": {
            "Earnings per Share (Basic)": ["us-gaap:EarningsPerShareBasic"],
            "Earnings per Share (Diluted)": ["us-gaap:EarningsPerShareDiluted"],
            "Weighted Average Shares Outstanding (Basic)": [
                "us-gaap:WeightedAverageNumberOfSharesOutstandingBasic"
            ],
            "Weighted Average Shares Outstanding (Diluted)": [
                "us-gaap:WeightedAverageNumberOfDilutedSharesOutstanding"
            ]
        }
    }

    # Helper function to extract numeric value from XBRL
    def extract_value(tag_list):
        for tag in tag_list:
            el = soup.find(tag)
            if el and el.text.strip():
                try:
                    return float(el.text.strip().replace(",", ""))
                except ValueError:
                    continue
        return None

    # Parse raw values from XBRL
    grouped_data = {}
    for section, section_tags in tags.items():
        grouped_data[section] = {}
        for label, tag_list in section_tags.items():
            grouped_data[section][label] = extract_value(tag_list)

    # --- üßÆ Compute derived metrics ---
    rev = grouped_data["revenue"].get("Total Revenue")
    cost = grouped_data["expenses"].get("Cost of Revenue")
    gross = grouped_data["profit"].get("Gross Profit")
    op_exp = grouped_data["expenses"].get("Operating Expenses (Total)")
    r_and_d = grouped_data["expenses"].get("Research & Development")
    s_and_m = grouped_data["expenses"].get("Sales & Marketing")
    g_and_a = grouped_data["expenses"].get("General & Administrative")
    op_inc = grouped_data["profit"].get("Operating Income")

    # Gross Profit = Revenue - Cost of Revenue
    if gross is None and rev is not None and cost is not None:
        grouped_data["profit"]["Gross Profit"] = rev - cost

    # Operating Expenses (Total) = R&D + Sales & Marketing + G&A
    if op_exp is None and any(v is not None for v in [r_and_d, s_and_m, g_and_a]):
        total = sum(v for v in [r_and_d, s_and_m, g_and_a] if v is not None)
        grouped_data["expenses"]["Operating Expenses (Total)"] = total

    # Operating Income = Gross Profit - Operating Expenses
    gross = grouped_data["profit"].get("Gross Profit")
    op_exp = grouped_data["expenses"].get("Operating Expenses (Total)")
    if op_inc is None and gross is not None and op_exp is not None:
        grouped_data["profit"]["Operating Income"] = gross - op_exp

    # Income Before Tax = Operating Income + (Interest Income - Interest Expense) + Other Income
    inc_before_tax = grouped_data["profit"].get("Income Before Tax")
    int_income = grouped_data["revenue"].get("Interest Income")
    int_exp = grouped_data["expenses"].get("Interest Expense")
    other_inc = grouped_data["revenue"].get("Other Income")
    if inc_before_tax is None and op_inc is not None:
        total_other = (int_income or 0) - (int_exp or 0) + (other_inc or 0)
        grouped_data["profit"]["Income Before Tax"] = op_inc + total_other

    return grouped_data

## Step 5a. Parse a given filing (the actual XBRL or XML)
def parse_cashflow_from_10k(latest_10k_filing_XBRL_XML):
    """Extract cash flow data from a 10-K XBRL or XML filing"""
    response = requests.get(latest_10k_filing_XBRL_XML, headers=HEADERS)

    # Detect whether the document is XML or HTML
    parser = "xml" if "<?xml" in response.text[:100] else "html.parser"
    soup = BeautifulSoup(response.text, features=parser)

    # Extract cashflow data using your earlier helper
    output = parse_cashflow_from_xbrl(soup)

    return output  # ‚úÖ actually return the data

## Step 5b. Parse a given filing to get the income statement
def parse_income_from_10k(latest_10k_filing_XBRL_XML):
    """Extract income statement data from a 10-K XBRL or XML filing"""
    response = requests.get(latest_10k_filing_XBRL_XML, headers=HEADERS)

    # Detect whether the document is XML or HTML
    parser = "xml" if "<?xml" in response.text[:100] else "html.parser"
    soup = BeautifulSoup(response.text, features=parser)

    # Extract income data using your helper function
    output = parse_income_from_xbrl(soup)

    return output  # ‚úÖ return the parsed income statement data

#### [A2] Full flow of all supporting functions to return cashflow and income statements

In [5]:
### Consolidate all of the above into a flow:
def get_financials_json(ticker):
    """End-to-end: from ticker ‚Üí CIK ‚Üí 10-K ‚Üí XBRL ‚Üí cashflow data"""
    ## Step 1. Get CIK for the input ticker symbol
    cik = get_cik(ticker)
    if not cik:
        raise ValueError("CIK not found.")

    ## Step 2. Get latest 10K url for the CIK (ticker)
    filing_url, filing_date, report_date = get_latest_10k_url(cik)
    if not filing_url:
        raise ValueError("No 10-K filing found.")

    print(f"Found 10-K filing: {filing_url} (filed {filing_date}, for {report_date})")

    # Step 3: Find actual XBRL XML file from 10K URL
    xbrl_url = get_primary_xbrl_url(filing_url)
    if not xbrl_url:
        raise ValueError("No XBRL XML file found in filing.")

    print(f"Using XBRL XML file: {xbrl_url}")

    # Step 5a: Parse cashflow data from that XML (4a embedded)
    cashflow_data = parse_cashflow_from_10k(xbrl_url)
    if not cashflow_data:
        raise ValueError("Cash flow statement not found in filing.")

    # Step 5b: Parse income data from XML (4b embedded)
    income_data = parse_income_from_10k(xbrl_url)
    if not income_data:
        raise ValueError("income statement not found in filing.")

    # Step 7: Return results
    return {
        "ticker": ticker,
        "cik": cik,
        "source": xbrl_url,
        "cashflow": cashflow_data,
        "income_statement": income_data,
        "filing_date": filing_date,
        "report_date": report_date,
    }

### [A3] Store statements into firebase storage and the pointer to this in firestore

In [27]:
import json
import os
from dotenv import load_dotenv
from google.cloud import firestore, storage

# Load env vars (local dev only; safe to keep for API)
load_dotenv()

SERVICE_ACCOUNT_PATH = os.getenv("FIREBASE_SERVICE_ACCOUNT_JSON")
BUCKET_NAME = os.getenv("GCP_STORAGE_BUCKET")

if not BUCKET_NAME:
    raise RuntimeError("Missing GCP_STORAGE_BUCKET in environment")

# --- Client initialization ---
# Local dev: use service account JSON
# Production (Cloud Run / Functions): use default credentials
if SERVICE_ACCOUNT_PATH:
    db = firestore.Client.from_service_account_json(SERVICE_ACCOUNT_PATH)
    storage_client = storage.Client.from_service_account_json(SERVICE_ACCOUNT_PATH)
else:
    db = firestore.Client()
    storage_client = storage.Client()


def upload_json_to_firebase_storage(
    ticker: str,
    year: int,
    statement_type: str,   # "incomeStatement" | "cashFlow"
    data: dict,
):
    """
    Uploads a financial statement dict as JSON to Firebase Storage.

    Storage path:
      gs://<bucket>/filings/{ticker}/{year}/statements/{statement_type}.json
    """
    bucket = storage_client.bucket(BUCKET_NAME)

    object_path = f"filings/{ticker}/{year}/statements/{statement_type}.json"
    blob = bucket.blob(object_path)

    blob.upload_from_string(
        json.dumps(data, ensure_ascii=False),
        content_type="application/json; charset=utf-8",
    )

    gs_path = f"gs://{BUCKET_NAME}/{object_path}"
    return gs_path, object_path


In [29]:
from google.cloud import firestore, storage
import os

def store_statements(
    ticker: str,
    year: int,
    income_statement: dict = None,
    cash_flow: dict = None,
):
    filing_doc_id = f"{year}_10K"

    filing_ref = (
        db.collection("companies")
          .document(ticker)
          .collection("filings")
          .document(filing_doc_id)
    )

    # Ensure the filing doc exists
    filing_ref.set({
        "ticker": ticker,
        "year": year,
        "type": "10-K",
        "updatedAt": firestore.SERVER_TIMESTAMP,
    }, merge=True)

    statements_col = filing_ref.collection("statements")

    # Income statement
    if income_statement is not None:
        gs_path, object_path = upload_json_to_firebase_storage(
            ticker=ticker,
            year=year,
            statement_type="incomeStatement",
            data=income_statement,
        )

        statements_col.document("incomeStatement").set({
            "statementType": "incomeStatement",
            "storageGsPath": gs_path,
            "storageObject": object_path,
            "updatedAt": firestore.SERVER_TIMESTAMP,
        }, merge=True)

    # Cash flow
    if cash_flow is not None:
        gs_path, object_path = upload_json_to_firebase_storage(
            ticker=ticker,
            year=year,
            statement_type="cashFlow",
            data=cash_flow,
        )

        statements_col.document("cashFlow").set({
            "statementType": "cashFlow",
            "storageGsPath": gs_path,
            "storageObject": object_path,
            "updatedAt": firestore.SERVER_TIMESTAMP,
        }, merge=True)

    print(f"‚úÖ Stored statements for {ticker} {year}")


In [40]:
# This has been validated to work on 21st Jan 2026

# Pull financial statements for ticker:
financials = get_financials_json("BG")

# Store, for that ticker, into firestore and firebase storage, the statements pulled from the above 'financials'
store_statements(
    ticker=financials["ticker"],
    year=int(financials["report_date"][:4]),
    income_statement=financials["income_statement"],
    cash_flow=financials["cashflow"],
)


Found 10-K filing: https://www.sec.gov/Archives/edgar/data/1996862/000199686225000008/0001996862-25-000008-index.html (filed 2025-02-20, for 2024-12-31)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/1996862/000199686225000008/bg-20241231_htm.xml
‚úÖ Stored statements for BG 2024


### [B] Get full 10K as Markdown Text

In [46]:
# 2Ô∏è‚É£ Use your HTML finder (your existing function)
def get_primary_html_url(index_url):
    """
    Find the main 10-K HTML document inside the filing's index page.
    Returns the actual HTML file URL (not the inline XBRL viewer).
    """
    res = requests.get(index_url, headers=HEADERS)
    if res.status_code != 200:
        raise RuntimeError(f"Failed to fetch index page: {res.status_code}")
    soup = BeautifulSoup(res.text, "html.parser")

    candidates = []

    for link in soup.find_all("a", href=True):
        href = link["href"].strip()
        text = link.get_text(strip=True).lower()
        href_lower = href.lower()

        # Skip XMLs and exhibits
        if any(x in href_lower for x in ["_cal.xml", "_lab.xml", "_pre.xml", "_def.xml", ".xml"]):
            continue
        if "exhibit" in href_lower or "ex" in text:
            continue

        # Only consider HTML or Inline XBRL
        if href_lower.endswith((".htm", ".html")) or "ix?doc=" in href_lower:
            score = 0
            if "10-k" in href_lower or "10k" in href_lower:
                score += 10
            if "ix?doc=" in href_lower:
                score += 20  # inline XBRL links usually indicate the primary document
            if "form" in text or "10-k" in text:
                score += 5
            candidates.append((score, href))

    if not candidates:
        return None

    # Pick the top-scoring link
    best_href = sorted(candidates, key=lambda x: x[0], reverse=True)[0][1]

    # Normalize the URL ‚Äî convert inline XBRL to raw HTML
    if "ix?doc=" in best_href:
        best_href = best_href.split("ix?doc=")[-1]
        if not best_href.startswith("https://"):
            best_href = "https://www.sec.gov" + best_href
    elif not best_href.startswith("http"):
        if best_href.startswith("/"):
            best_href = "https://www.sec.gov" + best_href
        else:
            best_href = "https://www.sec.gov/" + best_href

    return best_href

## Convert HTML content to markdown text (also used in notebook: RAG-api)
from bs4 import BeautifulSoup
import html2text
import re

def clean_10k_html(html_content):
    """Convert messy 10-K HTML into clean text."""
    # Parse the HTML
    soup = BeautifulSoup(html_content, "html.parser")

    # Remove hidden and metadata elements
    for tag in soup(["script", "style", "ix:header", "ix:hidden", "link", "meta"]):
        tag.decompose()

    # Keep only the visible content
    visible_html = str(soup)

    # Convert to readable Markdown-style text
    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    text_maker.body_width = 0  # No line wrapping
    clean_text = text_maker.handle(visible_html)

    # Normalize spaces
    clean_text = re.sub(r"\n\s*\n", "\n\n", clean_text)
    return clean_text.strip()

### [B1] Store 10K into firebase storage and the pointer to this in firestore

In [44]:
# Run above functions and get output as text

import requests
from google.cloud import storage
from google.oauth2 import service_account
import requests
import os
# Make sure you actually load the .env file
from dotenv import load_dotenv
load_dotenv()

def store_10K_text_from_url(ticker, html_url, report_date):
    """
    Fetch a 10-K HTML from a URL, convert it to cleaned text, and upload
    the text to Firebase Storage under:
    company_details/EDGAR (US)/filings/{ticker}_10K.txt
    """
    
    # 1Ô∏è‚É£ Fetch the HTML content
    headers = {"User-Agent": "kurio-agent/1.0 (potatojacket9@gmail.com)"}
    res = requests.get(html_url, headers=headers)
    if res.status_code != 200:
        raise RuntimeError(f"Failed to fetch HTML from SEC: {res.status_code}")
    html_content = res.text

    # 2Ô∏è‚É£ Convert HTML to cleaned plain text
    try:
        text_data = clean_10k_html(html_content)
    except Exception as e:
        raise RuntimeError(f"Failed to clean HTML for {ticker}: {e}")

    # 3Ô∏è‚É£ Initialize Firebase Storage client #################################################
    from dotenv import load_dotenv
    import os
    
    from google.oauth2 import service_account
    from google.cloud import firestore, storage
    
    
    # -------------------------------
    # Load .env
    # -------------------------------
    load_dotenv()
    
    SERVICE_ACCOUNT_PATH = os.getenv("FIREBASE_SERVICE_ACCOUNT_JSON")
    PROJECT_ID = os.getenv("GCP_PROJECT_ID")
    BUCKET_NAME = os.getenv("GCP_STORAGE_BUCKET")
    
    if not SERVICE_ACCOUNT_PATH:
        raise ValueError("Missing FIREBASE_SERVICE_ACCOUNT_JSON in .env")
    if not PROJECT_ID:
        raise ValueError("Missing GCP_PROJECT_ID in .env")
    if not BUCKET_NAME:
        raise ValueError("Missing GCP_STORAGE_BUCKET in .env")
    
    # -------------------------------
    # Load Credentials (ONE source)
    # -------------------------------
    creds = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_PATH
    )
    
    # -------------------------------
    # Initialize Firestore + Storage 
    # -------------------------------
    db = firestore.Client(credentials=creds, project=PROJECT_ID)
    storage_client = storage.Client(credentials=creds, project=PROJECT_ID)
    bucket = storage_client.bucket(BUCKET_NAME)
    ##################################################################################################
    
    # 4Ô∏è‚É£ Path inside the bucket (TXT version)
    year = int(report_date[:4])
    object_path = f"filings/{ticker}/{year}/10K.txt"
    blob = bucket.blob(object_path)
    
    # 5Ô∏è‚É£ Upload the cleaned text
    blob.upload_from_string(text_data, content_type="text/plain; charset=utf-8")

    gs_path = f"gs://{bucket}/{object_path}"
    
    # Store only metadata + pointer in Firestore
    filing_doc_id = f"{year}_10K"
    db.collection("companies") \
      .document(ticker) \
      .collection("filings") \
      .document(filing_doc_id) \
      .set({
          "ticker": ticker,
          "year": year,
          "type": "10-K",
          "storageGsPath": gs_path,
          "storageBucket": BUCKET_NAME,
          "storageObject": object_path,
          "updatedAt": firestore.SERVER_TIMESTAMP,
      }, merge=True)

    print(f"‚úÖ Uploaded 10-K to Storage: {gs_path}")
    print(f"‚úÖ Linked in Firestore: /companies/{ticker}/filings/{filing_doc_id}")
    
    return blob.public_url


#### Try running the 10K storage end-to-end

In [48]:
# This has been confirmed to work for an input ticker on 21st January 2026
ticker = "BG"

cik = get_cik(ticker)
filing_url, filing_date, report_date = get_latest_10k_url(cik)

html_url = get_primary_html_url(filing_url)
print(html_url)

isit = store_10K_text_from_url(ticker, html_url, report_date)


https://www.sec.gov/Archives/edgar/data/1996862/000199686225000008/bg-20241231.htm
‚úÖ Uploaded 10-K to Storage: gs://<Bucket: funwai-resume.firebasestorage.app>/filings/BG/2024/10K.txt
‚úÖ Linked in Firestore: /companies/BG/filings/2024_10K
