## Get a company's 10-K as HTML from EDGAR using its ticker, and store the cleaned text in Firebase Storage

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import re

# Root of the SEC JSON API; the per-company submissions URLs below are built from it.
BASE = "https://data.sec.gov"
# Identifying User-Agent passed to the SEC requests made by the functions below.
# NOTE(review): SEC fair-access guidance reportedly expects a descriptive
# User-Agent with contact details — confirm the required format.
HEADERS = {"User-Agent": "kurio-agent/1.0 (potatojacket9@gmail.com)"}

## Step 1. Get CIK for the input ticker symbol
def get_cik(ticker):
    """Retrieve the zero-padded 10-digit CIK for a given ticker symbol.

    Args:
        ticker: Ticker symbol (case-insensitive). An all-digit value is
            treated as a CIK and returned zero-padded once the submissions
            endpoint confirms it exists.

    Returns:
        The CIK as a 10-character, zero-padded string, or None when the
        input is empty or no matching company is found.

    Raises:
        RuntimeError: If the SEC ticker lookup file cannot be fetched.
    """
    if not ticker:
        return None
    symbol = str(ticker).strip()
    if not symbol:
        return None

    # The submissions endpoint is keyed by CIK, not ticker, so probing it is
    # only meaningful for numeric input. On success return the CIK instead of
    # falling through to ``return None``.
    if symbol.isascii() and symbol.isdigit():
        cik = symbol.zfill(10)
        probe = requests.get(
            f"{BASE}/submissions/CIK{cik}.json", headers=HEADERS, timeout=30
        )
        if probe.status_code == 200:
            return cik

    # Resolve the symbol through the SEC ticker -> CIK mapping file.
    lookup = requests.get(
        "https://www.sec.gov/files/company_tickers.json",
        headers=HEADERS,
        timeout=30,
    )
    if lookup.status_code != 200:
        raise RuntimeError(f"Failed to fetch ticker lookup: {lookup.status_code}")

    wanted = symbol.lower()
    for company in lookup.json().values():
        if company["ticker"].lower() == wanted:
            return str(company["cik_str"]).zfill(10)
    return None

## Step 2. Get latest 10K url for the CIK (ticker)
def get_latest_10k_url(cik):
    """Retrieve the latest 10-K filing index URL and its dates for a given CIK.

    Args:
        cik: Zero-padded CIK string as returned by ``get_cik``.

    Returns:
        A ``(filing_index_url, filing_date, report_date)`` tuple for the first
        "10-K" entry in the company's recent filings, or
        ``(None, None, None)`` when ``cik`` is empty or no 10-K is listed.

    Raises:
        RuntimeError: If the submissions endpoint does not return HTTP 200.
    """
    # A failed ticker lookup yields None; report "not found" rather than
    # requesting a nonsensical "CIKNone.json" URL.
    if not cik:
        return None, None, None

    url = f"{BASE}/submissions/CIK{cik}.json"
    res = requests.get(url, headers=HEADERS, timeout=30)
    if res.status_code != 200:
        raise RuntimeError(
            f"Failed to fetch submissions for CIK {cik}: {res.status_code}"
        )

    recent = res.json().get("filings", {}).get("recent", {})
    filings = zip(
        recent.get("form", []),
        recent.get("accessionNumber", []),
        recent.get("filingDate", []),
        recent.get("reportDate", []),
    )

    # The first match is returned.
    # NOTE(review): assumes the API lists filings newest first — confirm.
    for form, acc, filing_date, report_date in filings:
        if form == "10-K":
            # The archive folder name is the accession number without dashes;
            # the index page file name keeps them.
            acc_num = acc.replace("-", "")
            filing_url = (
                f"https://www.sec.gov/Archives/edgar/data/"
                f"{int(cik)}/{acc_num}/{acc}-index.html"
            )
            return filing_url, filing_date, report_date

    return None, None, None

## Step 3. Find the primary 10-K HTML document on the filing index page
def get_primary_html_url(index_url):
    """
    Find the main 10-K HTML document inside the filing's index page.

    Every link on the index page is scored; XML files and exhibit documents
    are ignored. The highest-scoring link wins, with ties going to the link
    that appears first on the page.

    Args:
        index_url: URL of the filing's index page.

    Returns:
        The absolute URL of the actual HTML file (not the inline XBRL
        viewer), or None when the page has no candidate link.

    Raises:
        RuntimeError: If the index page does not return HTTP 200.
    """
    # Local import: urljoin is not among the notebook's top-level imports.
    from urllib.parse import urljoin

    res = requests.get(index_url, headers=HEADERS, timeout=30)
    if res.status_code != 200:
        raise RuntimeError(f"Failed to fetch index page: {res.status_code}")
    soup = BeautifulSoup(res.text, "html.parser")

    # Exhibit-style names such as "ex-21.htm", "exh31.htm" or "mtch-ex311.htm".
    # A plain substring test for "ex" would also reject primary documents
    # whose name merely contains those letters (e.g. "expe-20231231.htm").
    exhibit_name = re.compile(r"(?<![a-z])ex(?:h|hibit)?[-_. ]?\d")

    candidates = []

    for link in soup.find_all("a", href=True):
        href = link["href"].strip()
        text = link.get_text(strip=True).lower()
        href_lower = href.lower()

        # Skip XMLs (this also covers the _cal/_lab/_pre/_def linkbases)
        if ".xml" in href_lower:
            continue
        # Skip exhibits
        if "exhibit" in href_lower or exhibit_name.search(text):
            continue

        # Only consider HTML or Inline XBRL
        if href_lower.endswith((".htm", ".html")) or "ix?doc=" in href_lower:
            score = 0
            if "10-k" in href_lower or "10k" in href_lower:
                score += 10
            if "ix?doc=" in href_lower:
                score += 20  # inline XBRL links usually indicate the primary document
            if "form" in text or "10-k" in text:
                score += 5
            candidates.append((score, href))

    if not candidates:
        return None

    # Pick the top-scoring link; max() keeps the first one among equal scores
    best_href = max(candidates, key=lambda item: item[0])[1]

    # Normalize the URL — convert inline XBRL viewer links to the raw HTML path
    if "ix?doc=" in best_href:
        best_href = best_href.split("ix?doc=")[-1]

    # Resolve against the index page so that absolute URLs, site-rooted paths
    # and paths relative to the filing folder all become absolute URLs.
    return urljoin(index_url, best_href)

## Convert HTML content to markdown text

In [4]:
## Convert HTML content to markdown text (also used in notebook: RAG-api)
from bs4 import BeautifulSoup
import html2text
import re

def clean_10k_html(html_content):
    """Turn raw 10-K HTML into tidy Markdown-style text."""
    document = BeautifulSoup(html_content, "html.parser")

    # Strip scripts, styles, inline-XBRL header/hidden sections and metadata
    # tags so that only the visible markup is left.
    unwanted_tags = ["script", "style", "ix:header", "ix:hidden", "link", "meta"]
    for node in document.find_all(unwanted_tags):
        node.decompose()

    # Configure the HTML -> Markdown converter: no links, no images, and no
    # hard line wrapping.
    converter = html2text.HTML2Text()
    converter.body_width = 0
    converter.ignore_images = True
    converter.ignore_links = True

    markdown = converter.handle(str(document))

    # Collapse whitespace-only gaps between lines into a single blank line.
    collapsed = re.sub(r"\n\s*\n", "\n\n", markdown)
    return collapsed.strip()


## Given a ticker and the URL of its 10-K HTML, store the cleaned text in Firebase Storage at a fixed path

In [7]:
import requests
from google.cloud import storage
from google.oauth2 import service_account
import requests
import os
# Make sure you actually load the .env file
from dotenv import load_dotenv
load_dotenv()

def store_10K_text_from_url(
    ticker, html_url, year, bucket_name="funwai-resume.firebasestorage.app"
):
    """
    Fetch a 10-K HTML from a URL, convert it to cleaned text, and upload
    the text to Firebase Storage under:
    company_details/EDGAR (US)/filings/{ticker}_{year}_10K.txt

    Args:
        ticker: Ticker symbol, used in the blob name and in messages.
        html_url: URL of the 10-K HTML document to download.
        year: Year label used in the blob name.
        bucket_name: Target storage bucket (must match your Firebase
            project); defaults to the bucket this notebook was written for.

    Returns:
        The blob's ``public_url`` (only reachable if the bucket allows
        public access).

    Raises:
        ValueError: If ``html_url`` is empty.
        RuntimeError: If the download does not return HTTP 200 or the HTML
            cannot be cleaned.
        FileNotFoundError: If the path in the FIREBASE_SERVICE_ACCOUNT_JSON
            environment variable is unset or does not exist.
    """
    if not html_url:
        raise ValueError(f"No HTML URL provided for {ticker}")

    # 1️⃣ Fetch the HTML content
    headers = {"User-Agent": "kurio-agent/1.0 (potatojacket9@gmail.com)"}
    res = requests.get(html_url, headers=headers, timeout=60)
    if res.status_code != 200:
        raise RuntimeError(f"Failed to fetch HTML from SEC: {res.status_code}")
    html_content = res.text

    # 2️⃣ Convert HTML to cleaned plain text
    try:
        text_data = clean_10k_html(html_content)
    except Exception as e:
        # Chain the original error so its traceback is not lost.
        raise RuntimeError(f"Failed to clean HTML for {ticker}: {e}") from e

    # 3️⃣ Initialize Firebase Storage client
    service_account_path = os.getenv("FIREBASE_SERVICE_ACCOUNT_JSON")

    if not service_account_path or not os.path.exists(service_account_path):
        raise FileNotFoundError(
            f"Service account file not found at {service_account_path}"
        )

    credentials = service_account.Credentials.from_service_account_file(service_account_path)
    client = storage.Client(credentials=credentials, project=credentials.project_id)
    bucket = client.bucket(bucket_name)

    # 4️⃣ Path inside the bucket (TXT version)
    blob_path = f"company_details/EDGAR (US)/filings/{ticker}_{year}_10K.txt"
    blob = bucket.blob(blob_path)

    # 5️⃣ Upload the cleaned text. The string is stored UTF-8 encoded, so the
    # charset is declared to let clients decode non-ASCII characters correctly.
    blob.upload_from_string(text_data, content_type="text/plain; charset=utf-8")

    print(f"✅ Uploaded cleaned 10-K text for {ticker} to {blob_path}")

    # Optionally return a public URL (if your bucket allows)
    return blob.public_url


In [33]:
# [Test] Use the functions above to store a 10-K as Markdown-style text in Firebase Storage
ticker = "MTCH"

# Stop with a clear message as soon as a lookup step finds nothing, instead of
# passing None on to the next request.
cik = get_cik(ticker)
if cik is None:
    raise ValueError(f"No CIK found for ticker {ticker}")

filing_url, filing_date, report_date = get_latest_10k_url(cik)
if filing_url is None:
    raise ValueError(f"No 10-K filing found for {ticker} (CIK {cik})")

html_url = get_primary_html_url(filing_url)
if html_url is None:
    raise ValueError(f"No primary HTML document found at {filing_url}")
print(html_url)

# The first four characters of the date are used as the year label; fall back
# to the filing date when the report date is empty.
year = (report_date or filing_date)[:4]
data = store_10K_text_from_url(ticker, html_url, year)
print(data)
# Note: the URL printed on this last line is the SEC source document; the
# storage location is the value printed just above.
print("HTML->text " + "for " + ticker + " stored at:", html_url)

https://www.sec.gov/Archives/edgar/data/891103/000089110325000027/mtch-20241231.htm
✅ Uploaded cleaned 10-K text for MTCH to company_details/EDGAR (US)/filings/MTCH_2024_10K.txt
https://storage.googleapis.com/funwai-resume.firebasestorage.app/company_details/EDGAR%20%28US%29/filings/MTCH_2024_10K.txt
HTML->text for MTCH stored at: https://www.sec.gov/Archives/edgar/data/891103/000089110325000027/mtch-20241231.htm
