In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

BASE = "https://data.sec.gov"
HEADERS = {"User-Agent": "kurio-agent/1.0 (potatojacket9@gmail.com)"}

## Step 2. Get CIK for the input ticker symbol
def get_cik(ticker):
    """Look up the SEC Central Index Key (CIK) for a ticker symbol.

    The ticker is resolved through the ticker-to-CIK mapping published at
    https://www.sec.gov/files/company_tickers.json; symbols are compared
    case-insensitively.

    Args:
        ticker: Ticker symbol, e.g. "NKE" or "BRK-B".

    Returns:
        The CIK as a string zero-padded to 10 characters, or None when the
        ticker is empty or does not appear in the mapping.

    Raises:
        RuntimeError: If the mapping cannot be downloaded.
    """
    if not ticker:
        return None
    wanted = ticker.strip().lower()

    # The submissions endpoint under BASE is addressed by CIK, not by ticker,
    # so probing it with a ticker cannot succeed; go straight to the mapping.
    res = requests.get(
        "https://www.sec.gov/files/company_tickers.json",
        headers=HEADERS,
        timeout=30,
    )
    if res.status_code != 200:
        raise RuntimeError(f"Failed to fetch SEC ticker mapping: {res.status_code}")

    for entry in res.json().values():
        if entry["ticker"].lower() == wanted:
            return str(entry["cik_str"]).zfill(10)
    return None

## Step 3. Get latest 10K url for the CIK (ticker)
def get_latest_10k_url(cik):
    """Return the index-page URL and dates of a company's latest 10-K.

    Args:
        cik: CIK string as used in the submissions URL (get_cik() returns it
            zero-padded to 10 characters).

    Returns:
        A (filing_url, filing_date, report_date) tuple for the first entry of
        form "10-K" in the "recent" filings list (assumed to be ordered
        newest-first -- confirm against the EDGAR API documentation), or
        (None, None, None) when the list contains no 10-K.

    Raises:
        RuntimeError: If the submissions JSON cannot be downloaded.
    """
    res = requests.get(
        f"{BASE}/submissions/CIK{cik}.json", headers=HEADERS, timeout=30
    )
    # Fail with the HTTP status instead of an opaque JSON decoding error.
    if res.status_code != 200:
        raise RuntimeError(
            f"Failed to fetch submissions for CIK {cik}: {res.status_code}"
        )
    recent = res.json()["filings"]["recent"]

    for form, acc, filing_date, report_date in zip(
        recent["form"],
        recent["accessionNumber"],
        recent["filingDate"],
        recent["reportDate"],
    ):
        if form != "10-K":
            continue
        # The archive folder uses the accession number without dashes and the
        # un-padded CIK; the index file name keeps the dashes.
        acc_num = acc.replace("-", "")
        filing_url = (
            f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{acc_num}/{acc}-index.html"
        )
        return filing_url, filing_date, report_date

    return None, None, None



#######################################

## Step 3a. Extract cash-flow tags from a parsed XBRL document (input is a soup object). This does not depend on any other function.
def parse_cashflow_from_xbrl(soup):
    """Collect cash-flow statement values from a parsed XBRL document.

    For every label below, the first element returned by ``soup.find(tag)``
    is read. Its text is converted to ``float`` after stripping whitespace
    and removing commas; text that is not numeric is kept as the stripped
    string, and a missing element yields ``None``.

    Args:
        soup: Parsed document exposing ``find(tag_name)``.

    Returns:
        dict mapping each label to a float, a string, or None.
    """
    label_to_tag = (
        # Operating
        ("Net profit (or loss if negative)", "us-gaap:NetIncomeLoss"),
        ("Depreciation (wear & tear on assets)", "us-gaap:DepreciationDepletionAndAmortization"),
        ("stock_comp", "us-gaap:ShareBasedCompensation"),
        ("change_ar", "us-gaap:IncreaseDecreaseInAccountsReceivable"),
        ("change_inventory", "us-gaap:IncreaseDecreaseInInventory"),
        ("change_ap", "us-gaap:IncreaseDecreaseInAccountsPayable"),
        ("Cash from day-to-day business (Operating Cashflow)", "us-gaap:NetCashProvidedByUsedInOperatingActivities"),
        # Investing
        ("Buying equipment/buildings (Capital Expenditure)", "us-gaap:PaymentsToAcquirePropertyPlantAndEquipment"),
        ("acquisitions", "us-gaap:PaymentsToAcquireBusinessesNetOfCashAcquired"),
        ("asset_sales", "us-gaap:ProceedsFromSaleOfPropertyPlantAndEquipment"),
        ("investments_purchase", "us-gaap:PaymentsToAcquireMarketableSecurities"),
        ("investments_maturity", "us-gaap:ProceedsFromMaturitiesOfMarketableSecurities"),
        ("Cash from investments (Buying/Selling assets)", "us-gaap:NetCashProvidedByUsedInInvestingActivities"),
        # Financing
        ("Money raised from issuing new shares", "us-gaap:ProceedsFromIssuanceOfCommonStock"),
        ("Money spent buying back shares of company", "us-gaap:PaymentsForRepurchaseOfCommonStock"),
        ("Borrowed money (New loans or bonds)", "us-gaap:ProceedsFromIssuanceOfLongTermDebt"),
        ("Loan repayments", "us-gaap:RepaymentsOfLongTermDebt"),
        ("Dividends paid to shareholders", "us-gaap:PaymentsOfDividends"),
        ("Cash from investors and loans (Financing activities)", "us-gaap:NetCashProvidedByUsedInFinancingActivities"),
        # Summary
        ("Change in cash during the period", "us-gaap:CashAndCashEquivalentsPeriodIncreaseDecrease"),
        ("Cash at the beginning of the period", "us-gaap:CashAndCashEquivalentsAtBeginningOfPeriod"),
        ("Cash remaining at the end of the period", "us-gaap:CashAndCashEquivalentsAtCarryingValue"),
    )

    def _read(tag_name):
        # One lookup per tag: number if parseable, raw text otherwise.
        node = soup.find(tag_name)
        if not (node and hasattr(node, "text")):
            return None
        stripped = node.text.strip()
        try:
            return float(stripped.replace(",", ""))
        except ValueError:
            return stripped

    return {label: _read(tag_name) for label, tag_name in label_to_tag}

## Step 3b. Extract income-statement tags from a parsed XBRL document (input is a soup object). This does not depend on any other function.
def parse_income_from_xbrl(soup):
    """Extract key income-statement values from a parsed XBRL document.

    Values are grouped into "revenue", "expenses", "profit" and "shares".
    Each label has one or more candidate tags; the first candidate whose
    first matching element holds numeric text wins. Values that are missing
    are then derived where possible, each step building on the previous one:

    * Gross Profit = Total Revenue - Cost of Revenue
    * Operating Expenses (Total) = R&D + Sales & Marketing + G&A
      (summing whichever of the three are present)
    * Operating Income = Gross Profit - Operating Expenses (Total)
    * Income Before Tax = Operating Income + Interest Income
      - Interest Expense + Other Income (missing terms count as 0)

    Reported values are never overwritten by derived ones.

    Args:
        soup: Parsed document exposing ``find(tag_name)``.

    Returns:
        dict of dicts: ``{section: {label: float or None}}``.
    """

    # XBRL tags grouped by category; candidates are tried in list order.
    tags = {
        "revenue": {
            "Total Revenue": [
                "us-gaap:Revenues",
                "us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax",
                "us-gaap:SalesRevenueNet",
                "us-gaap:SalesRevenueGoodsNet",
                "us-gaap:SalesRevenueServicesNet"
            ],
            "Advertising Revenue": ["us-gaap:AdvertisingRevenue"],
            "Interest Income": ["us-gaap:InterestIncome"],
            "Other Income": ["us-gaap:OtherNonoperatingIncomeExpense"]
        },

        "expenses": {
            "Cost of Revenue": ["us-gaap:CostOfRevenue", "us-gaap:CostOfGoodsSold"],
            "Research & Development": ["us-gaap:ResearchAndDevelopmentExpense"],
            "Sales & Marketing": ["us-gaap:SellingAndMarketingExpense"],
            "General & Administrative": ["us-gaap:GeneralAndAdministrativeExpense"],
            "Operating Expenses (Total)": ["us-gaap:OperatingExpenses"],
            "Interest Expense": ["us-gaap:InterestExpense"],
            "Income Tax Expense": ["us-gaap:IncomeTaxExpenseBenefit"]
        },

        "profit": {
            "Gross Profit": ["us-gaap:GrossProfit"],
            "Operating Income": ["us-gaap:OperatingIncomeLoss"],
            "Income Before Tax": [
                "us-gaap:IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest"
            ],
            "Net Income": ["us-gaap:NetIncomeLoss"]
        },

        "shares": {
            "Earnings per Share (Basic)": ["us-gaap:EarningsPerShareBasic"],
            "Earnings per Share (Diluted)": ["us-gaap:EarningsPerShareDiluted"],
            "Weighted Average Shares Outstanding (Basic)": [
                "us-gaap:WeightedAverageNumberOfSharesOutstandingBasic"
            ],
            "Weighted Average Shares Outstanding (Diluted)": [
                "us-gaap:WeightedAverageNumberOfDilutedSharesOutstanding"
            ]
        }
    }

    def extract_value(tag_list):
        """Return the first numeric value found among the candidate tags."""
        for tag in tag_list:
            el = soup.find(tag)
            if el and el.text.strip():
                try:
                    return float(el.text.strip().replace(",", ""))
                except ValueError:
                    # Non-numeric text: fall through to the next candidate.
                    continue
        return None

    # Parse raw values from XBRL.
    grouped_data = {
        section: {
            label: extract_value(tag_list)
            for label, tag_list in section_tags.items()
        }
        for section, section_tags in tags.items()
    }

    # --- Compute derived metrics ---
    # Every step reads the current value from grouped_data rather than from a
    # snapshot taken up front, so a value derived in one step is visible to
    # the steps after it.
    revenue = grouped_data["revenue"]
    expenses = grouped_data["expenses"]
    profit = grouped_data["profit"]

    # Gross Profit = Revenue - Cost of Revenue
    rev = revenue["Total Revenue"]
    cost = expenses["Cost of Revenue"]
    if profit["Gross Profit"] is None and rev is not None and cost is not None:
        profit["Gross Profit"] = rev - cost

    # Operating Expenses (Total) = R&D + Sales & Marketing + G&A
    if expenses["Operating Expenses (Total)"] is None:
        components = [
            expenses[label]
            for label in (
                "Research & Development",
                "Sales & Marketing",
                "General & Administrative",
            )
            if expenses[label] is not None
        ]
        if components:
            expenses["Operating Expenses (Total)"] = sum(components)

    # Operating Income = Gross Profit - Operating Expenses
    gross = profit["Gross Profit"]
    op_exp = expenses["Operating Expenses (Total)"]
    if profit["Operating Income"] is None and gross is not None and op_exp is not None:
        profit["Operating Income"] = gross - op_exp

    # Income Before Tax = Operating Income + (Interest Income - Interest Expense) + Other Income
    # Operating Income is read again here so that a value derived just above
    # is used, not only a reported one.
    op_inc = profit["Operating Income"]
    if profit["Income Before Tax"] is None and op_inc is not None:
        total_other = (
            (revenue["Interest Income"] or 0)
            - (expenses["Interest Expense"] or 0)
            + (revenue["Other Income"] or 0)
        )
        profit["Income Before Tax"] = op_inc + total_other

    return grouped_data

## Step 4: Find actual XBRL XML file from 10K URL
def get_primary_xbrl_url(index_url):
    """Locate the main XBRL (XML) file linked from a 10-K index page.

    Returns the absolute URL of the first link whose href ends in ".xml"
    and does not contain "_cal.xml", "_lab.xml", "_pre.xml" or "_def.xml";
    returns None when the page has no such link.
    """
    skipped_markers = ("_cal.xml", "_lab.xml", "_pre.xml", "_def.xml")
    page = BeautifulSoup(requests.get(index_url, headers=HEADERS).text, "html.parser")

    for anchor in page.find_all("a", href=True):
        target = anchor["href"]
        if not target.endswith(".xml"):
            continue
        if any(marker in target for marker in skipped_markers):
            continue
        # hrefs on the index page are site-relative, so prefix the host.
        return "https://www.sec.gov" + target

    return None

## Step 5. Parse a given filing (the actual XBRL or XML)
def parse_cashflow_from_10k(latest_10k_filing_XBRL_XML):
    """Download a filing document and extract its cash-flow values.

    The document is parsed with the "xml" parser when an XML declaration
    appears within its first 100 characters and with "html.parser"
    otherwise; the parsed tree is handed to parse_cashflow_from_xbrl(),
    whose result is returned unchanged.
    """
    body = requests.get(latest_10k_filing_XBRL_XML, headers=HEADERS).text

    if "<?xml" in body[:100]:
        parser_name = "xml"
    else:
        parser_name = "html.parser"

    return parse_cashflow_from_xbrl(BeautifulSoup(body, features=parser_name))

## Step 6. Parse a given filing to get the income statement
def parse_income_from_10k(latest_10k_filing_XBRL_XML):
    """Download a filing document and extract its income-statement values.

    The document is parsed with the "xml" parser when an XML declaration
    appears within its first 100 characters and with "html.parser"
    otherwise; the parsed tree is handed to parse_income_from_xbrl(),
    whose result is returned unchanged.
    """
    body = requests.get(latest_10k_filing_XBRL_XML, headers=HEADERS).text

    if "<?xml" in body[:100]:
        parser_name = "xml"
    else:
        parser_name = "html.parser"

    return parse_income_from_xbrl(BeautifulSoup(body, features=parser_name))


In [3]:
def get_cashflow_json(ticker):
    """End-to-end: from ticker -> CIK -> 10-K -> XBRL -> financial data.

    Args:
        ticker: Ticker symbol, passed to get_cik().

    Returns:
        dict with the keys "ticker", "cik", "source" (URL of the XBRL file
        that was parsed), "cashflow", "income_statement", "filing_date" and
        "report_date".

    Raises:
        ValueError: If the CIK, the 10-K filing or the XBRL file cannot be
            found, or if not a single cash-flow / income-statement value
            could be read from the XBRL file.
    """
    ## Step 2. Get CIK for the input ticker symbol
    cik = get_cik(ticker)
    if not cik:
        raise ValueError("CIK not found.")

    ## Step 3. Get latest 10K url for the CIK (ticker)
    filing_url, filing_date, report_date = get_latest_10k_url(cik)
    if not filing_url:
        raise ValueError("No 10-K filing found.")

    print(f"Found 10-K filing: {filing_url} (filed {filing_date}, for {report_date})")

    # Step 4: Find actual XBRL XML file from 10K URL
    xbrl_url = get_primary_xbrl_url(filing_url)
    if not xbrl_url:
        raise ValueError("No XBRL XML file found in filing.")

    print(f"Using XBRL XML file: {xbrl_url}")

    # Step 5: Parse cashflow data from that XML
    # The parser returns every label, with None for tags it did not find, so
    # a missing statement shows up as all-None values, not as an empty dict.
    cashflow_data = parse_cashflow_from_10k(xbrl_url)
    if not cashflow_data or all(v is None for v in cashflow_data.values()):
        raise ValueError("Cash flow statement not found in filing.")

    # Step 6: Parse income data from XML
    # Same reasoning, one level deeper: {section: {label: value}}.
    income_data = parse_income_from_10k(xbrl_url)
    if not income_data or all(
        v is None for section in income_data.values() for v in section.values()
    ):
        raise ValueError("income statement not found in filing.")

    # Step 7: Return results
    return {
        "ticker": ticker,
        "cik": cik,
        "source": xbrl_url,
        "cashflow": cashflow_data,
        "income_statement": income_data,
        "filing_date": filing_date,
        "report_date": report_date,
    }


In [5]:
from google.cloud import firestore
import json

# Initialize the module-level Firestore client used by
# store_financials_to_firestore(). The key file name is resolved relative to
# the current working directory, so make sure the JSON path is correct.
# NOTE(review): other cells in this notebook read the key path from the
# FIREBASE_SERVICE_ACCOUNT_JSON environment variable -- consider doing the
# same here instead of hard-coding the file name.
db = firestore.Client.from_service_account_json("funwai-resume-firebase-adminsdk-fbsvc-a956eb6362.json")

def store_financials_to_firestore(financials_dict, collection_name="company_financials"):
    """
    Write a financials dictionary to Firestore, one document per ticker.

    The document ID is the dictionary's "ticker" value, so storing the same
    ticker again replaces the earlier document.

    Args:
        financials_dict (dict): The result from get_cashflow_json(ticker)
        collection_name (str): Firestore collection name

    Raises:
        ValueError: If the dictionary has no (non-empty) "ticker" entry.
    """
    symbol = financials_dict.get("ticker")
    if symbol:
        db.collection(collection_name).document(symbol).set(financials_dict)
        print(f"‚úÖ Stored financial data for {symbol} in Firestore collection '{collection_name}'.")
        return

    raise ValueError("Ticker not found in dictionary")


In [None]:
## code to loop through all SP500 tickers and store if they are not in the firestore database

In [33]:
## Smoke test of the pipeline above: fetch the latest 10-K figures for NKE
## and store them in Firestore (it works!)
data = get_cashflow_json("NKE")
store_financials_to_firestore(data)

Found 10-K filing: https://www.sec.gov/Archives/edgar/data/320187/000032018725000047/0000320187-25-000047-index.html (filed 2025-07-17, for 2025-05-31)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/320187/000032018725000047/nke-20250531_htm.xml
‚úÖ Stored financial data for NKE in Firestore collection 'company_financials'.


In [35]:
# Fetch the latest 10-K cash-flow and income data for NFLX and store it in Firestore.
data = get_cashflow_json("NFLX")
store_financials_to_firestore(data)

Found 10-K filing: https://www.sec.gov/Archives/edgar/data/1065280/000106528025000044/0001065280-25-000044-index.html (filed 2025-01-27, for 2024-12-31)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/1065280/000106528025000044/nflx-20241231_htm.xml
‚úÖ Stored financial data for NFLX in Firestore collection 'company_financials'.


In [37]:
# Fetch the latest 10-K cash-flow and income data for GOOG and store it in Firestore.
data = get_cashflow_json("GOOG")
store_financials_to_firestore(data)

Found 10-K filing: https://www.sec.gov/Archives/edgar/data/1652044/000165204425000014/0001652044-25-000014-index.html (filed 2025-02-05, for 2024-12-31)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/1652044/000165204425000014/goog-20241231_htm.xml
‚úÖ Stored financial data for GOOG in Firestore collection 'company_financials'.


In [39]:
# Fetch the latest 10-K cash-flow and income data for MSFT and store it in Firestore.
data = get_cashflow_json("MSFT")
store_financials_to_firestore(data)

Found 10-K filing: https://www.sec.gov/Archives/edgar/data/789019/000095017025100235/0000950170-25-100235-index.html (filed 2025-07-30, for 2025-06-30)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/789019/000095017025100235/msft-20250630_htm.xml
‚úÖ Stored financial data for MSFT in Firestore collection 'company_financials'.


In [41]:
# Fetch and store the latest 10-K data for META, then for ABNB.
# `data` ends up holding the ABNB result.
data = get_cashflow_json("META")
store_financials_to_firestore(data)

data = get_cashflow_json("ABNB")
store_financials_to_firestore(data)

Found 10-K filing: https://www.sec.gov/Archives/edgar/data/1326801/000132680125000017/0001326801-25-000017-index.html (filed 2025-01-30, for 2024-12-31)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/1326801/000132680125000017/meta-20241231_htm.xml
‚úÖ Stored financial data for META in Firestore collection 'company_financials'.
Found 10-K filing: https://www.sec.gov/Archives/edgar/data/1559720/000155972025000010/0001559720-25-000010-index.html (filed 2025-02-13, for 2024-12-31)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/1559720/000155972025000010/abnb-20241231_htm.xml
‚úÖ Stored financial data for ABNB in Firestore collection 'company_financials'.


In [43]:
# Fetch the latest 10-K cash-flow and income data for COIN and store it in Firestore.
data = get_cashflow_json("COIN")
store_financials_to_firestore(data)

Found 10-K filing: https://www.sec.gov/Archives/edgar/data/1679788/000167978825000022/0001679788-25-000022-index.html (filed 2025-02-13, for 2024-12-31)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/1679788/000167978825000022/coin-20241231_htm.xml
‚úÖ Stored financial data for COIN in Firestore collection 'company_financials'.


In [45]:
# Fetch and store the latest 10-K data for a batch of tickers, one after
# another in the order listed. `data` ends up holding the last result.
for _symbol in (
    "AMD",
    "RDDT",
    "NVDA",
    "WFC",
    "JPM",
    "CSCO",
    "CMCSA",
    "V",
    "MA",
    "JNJ",
    "HD",
    "UBER",
    "PFE",
    "COF",
):
    data = get_cashflow_json(_symbol)
    store_financials_to_firestore(data)

Found 10-K filing: https://www.sec.gov/Archives/edgar/data/2488/000000248825000012/0000002488-25-000012-index.html (filed 2025-02-05, for 2024-12-28)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/2488/000000248825000012/amd-20241228_htm.xml
‚úÖ Stored financial data for AMD in Firestore collection 'company_financials'.
Found 10-K filing: https://www.sec.gov/Archives/edgar/data/1713445/000171344525000018/0001713445-25-000018-index.html (filed 2025-02-13, for 2024-12-31)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/1713445/000171344525000018/rddt-20241231_htm.xml
‚úÖ Stored financial data for RDDT in Firestore collection 'company_financials'.
Found 10-K filing: https://www.sec.gov/Archives/edgar/data/1045810/000104581025000023/0001045810-25-000023-index.html (filed 2025-02-26, for 2025-01-26)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/1045810/000104581025000023/nvda-20250126_htm.xml
‚úÖ Stored financial data for NVDA in Firestore collectio

In [None]:
##### New section ####

In [29]:
# Inspect the raw dictionary returned for COIN without writing it to Firestore.
data = get_cashflow_json("COIN")
print(data)

Found 10-K filing: https://www.sec.gov/Archives/edgar/data/1679788/000167978825000022/0001679788-25-000022-index.html (filed 2025-02-13, for 2024-12-31)
Using XBRL XML file: https://www.sec.gov/Archives/edgar/data/1679788/000167978825000022/coin-20241231_htm.xml
{'ticker': 'COIN', 'cik': '0001679788', 'source': 'https://www.sec.gov/Archives/edgar/data/1679788/000167978825000022/coin-20241231_htm.xml', 'cashflow': {'Net profit (or loss if negative)': 2579066000.0, 'Depreciation (wear & tear on assets)': 127518000.0, 'stock_comp': 912838000.0, 'change_ar': None, 'change_inventory': None, 'change_ap': None, 'Cash from day-to-day business (Operating Cashflow)': 2556844000.0, 'Buying equipment/buildings (Capital Expenditure)': None, 'acquisitions': 0.0, 'asset_sales': None, 'investments_purchase': None, 'investments_maturity': None, 'Cash from investments (Buying/Selling assets)': -282385000.0, 'Money raised from issuing new shares': None, 'Money spent buying back shares of company': None, 

In [9]:
import requests
from bs4 import BeautifulSoup
import re

# 1️⃣ Get the latest 10-K index page for a given company CIK
def get_latest_10k_url(cik):
    """Retrieve the latest 10-K filing index URL and its dates for a CIK.

    Args:
        cik: CIK string as used in the submissions URL.

    Returns:
        A (filing_url, filing_date, report_date) tuple for the first entry of
        form "10-K" in the "recent" filings list (assumed to be ordered
        newest-first -- confirm against the EDGAR API documentation), or
        (None, None, None) when the list contains no 10-K.

    Raises:
        RuntimeError: If the submissions JSON cannot be downloaded.
    """
    url = f"{BASE}/submissions/CIK{cik}.json"
    res = requests.get(url, headers=HEADERS, timeout=30)
    # Report the HTTP status the same way get_primary_html_url() does,
    # instead of failing later with an opaque JSON decoding error.
    if res.status_code != 200:
        raise RuntimeError(
            f"Failed to fetch submissions for CIK {cik}: {res.status_code}"
        )
    recent = res.json()["filings"]["recent"]

    rows = zip(
        recent["form"],
        recent["accessionNumber"],
        recent["filingDate"],
        recent["reportDate"],
    )
    for form, acc, filing_date, report_date in rows:
        if form == "10-K":
            # The archive folder uses the accession number without dashes
            # and the un-padded CIK; the index file name keeps the dashes.
            acc_num = acc.replace("-", "")
            filing_url = (
                f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{acc_num}/{acc}-index.html"
            )
            return filing_url, filing_date, report_date

    return None, None, None

# 2️⃣ Use your HTML finder (your existing function)
def _looks_like_exhibit(href_lower, text):
    """Heuristically decide whether an index-page link points at an exhibit."""
    if "exhibit" in href_lower or "exhibit" in text:
        return True
    # A bare "ex" substring is not enough: it also occurs in ordinary document
    # names such as "expe-20241231.htm" or "flex-20250331.htm". Treat "ex" as
    # an exhibit marker only when a number follows it and it is not the tail
    # of a longer word (ex21.htm, ex-31.1.htm, q4ex311.htm), or when it has
    # the "...xexx211.htm" shape.
    return re.search(r"(?<![a-z])ex[-_.]?\d|xex+\d", text) is not None


def get_primary_html_url(index_url):
    """
    Find the main 10-K HTML document inside the filing's index page.
    Returns the actual HTML file URL (not the inline XBRL viewer).

    Every link on the page that is not an XML file and does not look like an
    exhibit is scored; the highest-scoring link wins, the first one on the
    page in case of a tie.

    Args:
        index_url: URL of the filing's "-index.html" page.

    Returns:
        Absolute URL of the chosen document, or None when no link qualifies.

    Raises:
        RuntimeError: If the index page cannot be downloaded.
    """
    res = requests.get(index_url, headers=HEADERS, timeout=30)
    if res.status_code != 200:
        raise RuntimeError(f"Failed to fetch index page: {res.status_code}")
    soup = BeautifulSoup(res.text, "html.parser")

    candidates = []

    for link in soup.find_all("a", href=True):
        href = link["href"].strip()
        text = link.get_text(strip=True).lower()
        href_lower = href.lower()

        # Skip XMLs and exhibits
        if ".xml" in href_lower:
            continue
        if _looks_like_exhibit(href_lower, text):
            continue

        # Only consider HTML or Inline XBRL
        if href_lower.endswith((".htm", ".html")) or "ix?doc=" in href_lower:
            score = 0
            if "10-k" in href_lower or "10k" in href_lower:
                score += 10
            if "ix?doc=" in href_lower:
                score += 20  # inline XBRL links usually indicate the primary document
            if "form" in text or "10-k" in text:
                score += 5
            candidates.append((score, href))

    if not candidates:
        return None

    # Pick the top-scoring link; max() keeps the first one among equals.
    best_href = max(candidates, key=lambda candidate: candidate[0])[1]

    # Normalize the URL: convert an inline XBRL viewer link to the raw HTML
    # document it wraps, and make relative links absolute.
    if "ix?doc=" in best_href:
        best_href = best_href.split("ix?doc=")[-1]
        if not best_href.startswith("https://"):
            best_href = "https://www.sec.gov" + best_href
    elif not best_href.startswith("http"):
        if best_href.startswith("/"):
            best_href = "https://www.sec.gov" + best_href
        else:
            best_href = "https://www.sec.gov/" + best_href

    return best_href

In [11]:
## Convert HTML content to markdown text (also used in notebook: RAG-api)
from bs4 import BeautifulSoup
import html2text
import re

def clean_10k_html(html_content):
    """Turn raw 10-K HTML into readable Markdown-style text.

    Script, style, ix:header, ix:hidden, link and meta elements are removed,
    the remaining markup is converted with html2text (links and images
    ignored, no line wrapping), and blank-line runs are collapsed to a
    single empty line. The result is returned without leading or trailing
    whitespace.
    """
    document = BeautifulSoup(html_content, "html.parser")

    # Drop elements that carry no visible content.
    unwanted = ["script", "style", "ix:header", "ix:hidden", "link", "meta"]
    for node in document.find_all(unwanted):
        node.decompose()

    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    converter.body_width = 0  # 0 disables line wrapping
    markdown = converter.handle(str(document))

    # Whitespace-only gaps between lines become exactly one empty line.
    return re.sub(r"\n\s*\n", "\n\n", markdown).strip()


In [1]:
import requests
from google.cloud import storage
from google.oauth2 import service_account
import requests
import os
# Make sure you actually load the .env file
from dotenv import load_dotenv
load_dotenv()

def store_10K_text_from_url(ticker, html_url, year):
    """
    Fetch a 10-K HTML from a URL, convert it to cleaned text, and upload
    the text to Firebase Storage under:
    company_details/EDGAR (US)/filings/{ticker}_{year}_10K.txt

    Args:
        ticker: Ticker symbol, used in the object name and in messages.
        html_url: URL of the 10-K HTML document to download.
        year: Year used in the object name.

    Returns:
        The uploaded blob's ``public_url``.

    Raises:
        RuntimeError: If the download does not return HTTP 200, or if the
            HTML cannot be cleaned (the original error is chained).
        FileNotFoundError: If FIREBASE_SERVICE_ACCOUNT_JSON is unset or does
            not point to an existing file.
    """

    # 1. Fetch the HTML content
    headers = {"User-Agent": "kurio-agent/1.0 (potatojacket9@gmail.com)"}
    res = requests.get(html_url, headers=headers, timeout=60)
    if res.status_code != 200:
        raise RuntimeError(f"Failed to fetch HTML from SEC: {res.status_code}")
    html_content = res.text

    # 2. Convert HTML to cleaned plain text
    try:
        text_data = clean_10k_html(html_content)
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise RuntimeError(f"Failed to clean HTML for {ticker}: {e}") from e

    # 3. Initialize Firebase Storage client
    service_account_path = os.getenv("FIREBASE_SERVICE_ACCOUNT_JSON")

    if not service_account_path or not os.path.exists(service_account_path):
        raise FileNotFoundError(
            f"Service account file not found at {service_account_path}"
        )

    credentials = service_account.Credentials.from_service_account_file(service_account_path)
    client = storage.Client(credentials=credentials, project=credentials.project_id)

    # Bucket name (must match your Firebase project)
    bucket_name = "funwai-resume.firebasestorage.app"
    bucket = client.bucket(bucket_name)

    # 4. Path inside the bucket (TXT version)
    blob_path = f"company_details/EDGAR (US)/filings/{ticker}_{year}_10K.txt"
    blob = bucket.blob(blob_path)

    # 5. Upload the cleaned text
    blob.upload_from_string(text_data, content_type="text/plain")

    print(f"‚úÖ Uploaded cleaned 10-K text for {ticker} to {blob_path}")

    # Optionally return a public URL (if your bucket allows)
    return blob.public_url


In [3]:
# Store the latest 10-K of one ticker as cleaned text in Firebase Storage:
# resolve the CIK, locate the filing's main HTML document, then upload it.
# NOTE(review): get_cik() / get_latest_10k_url() / get_primary_html_url() can
# return None; none of those cases is guarded here.
# NOTE(review): the last print labels `html_url` (the SEC source document) as
# "stored at"; the upload location is the value held in `data`.
ticker = "XOM" 
cik = get_cik(ticker)
filing_url, filing_date, report_date = get_latest_10k_url(cik)
html_url = get_primary_html_url(filing_url)
print(html_url)
data = store_10K_text_from_url(ticker, html_url, report_date[:4])  # first 4 chars of report_date = year
print(data)
print("HTML->text " + "for " + ticker + " stored at:", html_url)

NameError: name 'get_cik' is not defined

In [13]:
# Check whether FIREBASE_SERVICE_ACCOUNT_JSON is set in the environment
# (os.getenv returns None when it is not).
service_account_path = os.getenv("FIREBASE_SERVICE_ACCOUNT_JSON")
print(service_account_path)

None


In [55]:
# Inspect the filing_date currently held in the notebook namespace
# (assigned from get_latest_10k_url() in an earlier cell).
print(filing_date)

2025-10-31


In [80]:
# Check that a ticker containing a hyphen resolves to a CIK.
cik = get_cik("BRK-B")
print(cik)

0001067983


In [150]:
import os
from google.cloud import storage
from google.oauth2 import service_account

def download_all_10k_texts(local_output_folder, service_account_path=None):
    """
    Copy every .txt object stored under
    company_details/EDGAR (US)/filings/
    in the Firebase Storage bucket into a local folder.

    Args:
        local_output_folder: Destination directory; created if missing.
        service_account_path: Path to the service-account JSON key. When
            None, the FIREBASE_SERVICE_ACCOUNT_JSON environment variable
            is used instead.

    Raises:
        FileNotFoundError: If no key path is available or the file does
            not exist.
    """
    key_path = service_account_path
    if key_path is None:
        key_path = os.getenv("FIREBASE_SERVICE_ACCOUNT_JSON")

    if not (key_path and os.path.exists(key_path)):
        raise FileNotFoundError(
            f"‚ùå Service account file not found at: {key_path}"
        )

    # Storage client authenticated with the service-account key.
    creds = service_account.Credentials.from_service_account_file(key_path)
    gcs = storage.Client(credentials=creds, project=creds.project_id)
    bucket = gcs.bucket("funwai-resume.firebasestorage.app")

    os.makedirs(local_output_folder, exist_ok=True)

    print("üîç Checking for .txt files in bucket...")

    saved = 0
    # Walk every object below the folder that holds the cleaned filings.
    for blob in bucket.list_blobs(prefix="company_details/EDGAR (US)/filings/"):
        if not blob.name.endswith(".txt"):
            continue

        # Keep only the last path component as the local file name.
        filename = blob.name.rsplit("/", 1)[-1]
        print(f"‚¨áÔ∏è Downloading {filename} ...")
        blob.download_to_filename(os.path.join(local_output_folder, filename))
        saved += 1

    if saved:
        print(f"‚úÖ Download complete. {saved} files saved to {local_output_folder}")
    else:
        print("‚ö†Ô∏è No .txt files found in the specified folder.")


# ---- Run the download ----
# No service_account_path is passed, so the key path is taken from the
# FIREBASE_SERVICE_ACCOUNT_JSON environment variable.

download_all_10k_texts(
    local_output_folder="./clean_10k_texts"   # destination folder; created if it does not exist
)


üîç Checking for .txt files in bucket...
‚¨áÔ∏è Downloading AAPL_2025_10K.txt ...
‚¨áÔ∏è Downloading ABBV_2024_10K.txt ...
‚¨áÔ∏è Downloading ABNB_2024_10K.txt ...
‚¨áÔ∏è Downloading ABT_2024_10K.txt ...
‚¨áÔ∏è Downloading ACN_2025_10K.txt ...
‚¨áÔ∏è Downloading ADBE_2024_10K.txt ...
‚¨áÔ∏è Downloading ADI_2024_10K.txt ...
‚¨áÔ∏è Downloading AEP_2024_10K.txt ...
‚¨áÔ∏è Downloading AJG_2024_10K.txt ...
‚¨áÔ∏è Downloading AMAT_2024_10K.txt ...
‚¨áÔ∏è Downloading AMCR_2025_10K.txt ...
‚¨áÔ∏è Downloading AMD_2024_10K.txt ...
‚¨áÔ∏è Downloading AME_2024_10K.txt ...
‚¨áÔ∏è Downloading AMZN_2024_10K.txt ...
‚¨áÔ∏è Downloading AON_2024_10K.txt ...
‚¨áÔ∏è Downloading A_2024_10K.txt ...
‚¨áÔ∏è Downloading BAC_2024_10K.txt ...
‚¨áÔ∏è Downloading BALL_2024_10K.txt ...
‚¨áÔ∏è Downloading BA_2024_10K.txt ...
‚¨áÔ∏è Downloading BBY_2025_10K.txt ...
‚¨áÔ∏è Downloading BKNG_2024_10K.txt ...
‚¨áÔ∏è Downloading BKR_2024_10K.txt ...
‚¨áÔ∏è Downloading BMY_2024_10K.txt ...
‚¨áÔ∏è Downloading BRK-B_2024_10