In [5]:
import requests
import time
import os
import json
import datetime

# ---------------- CONFIG ---------------- #

# HTTP headers passed to every requests.get call in this file.
# NOTE(review): the User-Agent value looks like a placeholder — presumably it
# should carry real contact details before running; confirm.
HEADERS = {
    "User-Agent": "Your Name your.email@example.com",
}

# Base URL used to build the submissions JSON request URLs.
SEC_BASE = "https://data.sec.gov"

# Root directory where everything goes (one sub-folder per ticker is created
# beneath it by download_html).
ROOT_DIR = "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original"

# Runs as soon as this cell/module executes; no error if the directory exists.
os.makedirs(ROOT_DIR, exist_ok=True)


# ---------------- HELPERS ---------------- #

def zero_pad_cik(cik):
    """Return *cik* as text, left-padded with zeros to at least 10 characters."""
    cik_text = str(cik)
    return cik_text.zfill(10)


def form_to_label(form):
    """Build a filename-friendly label from a form name.

    Hyphens are dropped first, then every slash becomes a hyphen,
    e.g. "10-K/A" -> "10K-A".
    """
    without_hyphens = "".join(form.split("-"))
    return "-".join(without_hyphens.split("/"))


def filing_date_to_year_quarter(date_str):
    """Split a YYYY-MM-DD filing date into (year, "Q<n>").

    The quarter is the calendar quarter of the month in *date_str*
    (months 1-3 -> Q1, 4-6 -> Q2, 7-9 -> Q3, 10-12 -> Q4).
    """
    pieces = date_str.split("-")
    # Exactly three integer components are required; the day is parsed but unused.
    year, month, _day = (int(piece) for piece in pieces)
    quarter_number = (month + 2) // 3
    return year, "Q" + str(quarter_number)


# ---------------- LOAD TICKER → CIK MAP ---------------- #

def load_ticker_cik_map():
    """Fetch the SEC ticker list and return a {TICKER: cik} mapping.

    Returns:
        dict mapping each upper-cased ticker symbol to the ``cik_str`` value
        of its entry in ``company_tickers.json``.

    Raises:
        requests.HTTPError: if the response status is an error
            (via ``raise_for_status``).
        requests.Timeout: if the server does not respond within the timeout.
    """
    url = "https://www.sec.gov/files/company_tickers.json"
    # Without a timeout a stalled connection would block the run forever.
    r = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    data = r.json()

    # The payload is a dict of entries; only the entry values are needed.
    return {
        entry["ticker"].upper(): entry["cik_str"]
        for entry in data.values()
    }


# ---------------- DOWNLOAD HTML FILING ---------------- #

def download_html(ticker, cik, accession, primary_doc, form, filing_date):
    """Download one filing's primary document into the ticker's folder.

    The file is saved as ``<TICKER>-<YEAR>-<Q#>-<FORM>.htm`` under
    ``ROOT_DIR/<TICKER>/``, where year and quarter come from the filing date.
    If a file with that name already exists (for example a second filing of
    the same form in the same quarter), the dash-less accession number is
    appended to the name so the earlier download is not overwritten.

    Args:
        ticker: Ticker symbol; upper-cased for the folder and file name.
        cik: Company CIK; converted with ``int()`` for the URL.
        accession: Accession number; dashes are removed for the URL.
        primary_doc: File name of the primary document within the filing.
        form: Form name such as "10-K" or "10-Q/A".
        filing_date: Filing date as "YYYY-MM-DD".

    Returns:
        The HTTP status code of the download request. The file is written
        only when the status is 200.

    Raises:
        requests.Timeout: if the server does not respond within the timeout.
    """
    cik_int = int(cik)
    acc_nodash = accession.replace("-", "")

    # SEC URL
    url = f"https://www.sec.gov/Archives/edgar/data/{cik_int}/{acc_nodash}/{primary_doc}"

    # Per-ticker folder
    ticker_dir = os.path.join(ROOT_DIR, ticker.upper())
    os.makedirs(ticker_dir, exist_ok=True)

    # Year & quarter
    year, quarter = filing_date_to_year_quarter(filing_date)

    # Normalize form name
    form_label = form_to_label(form)

    # Final filename; disambiguate with the accession number on a name clash
    # so that two same-form filings in one quarter are both kept.
    stem = f"{ticker.upper()}-{year}-{quarter}-{form_label}"
    path = os.path.join(ticker_dir, f"{stem}.htm")
    if os.path.exists(path):
        path = os.path.join(ticker_dir, f"{stem}-{acc_nodash}.htm")

    # Download content (bounded wait so one stalled request cannot hang the run)
    r = requests.get(url, headers=HEADERS, timeout=30)
    if r.status_code == 200:
        with open(path, "wb") as f:
            f.write(r.content)
        print(f"✓ Saved {path}")
    else:
        print(f"✗ Failed {url} ({r.status_code})")

    return r.status_code


# ---------------- PROCESS HISTORICAL INDEX FILE ---------------- #

# Forms that are downloaded, and the per-request timeout in seconds.
_TARGET_FORMS = ("10-K", "10-K/A", "10-Q", "10-Q/A")
_REQUEST_TIMEOUT = 30


def _iter_filings(payload):
    """Yield (form, accession, primary_doc, filing_date) for each filing.

    Accepts either a columnar dict of parallel lists (the shape read from
    ``filings.recent``) or a list of per-filing dicts.
    """
    if isinstance(payload, dict):
        yield from zip(
            payload["form"],
            payload["accessionNumber"],
            payload["primaryDocument"],
            payload["filingDate"],
        )
    else:
        for item in payload:
            yield (
                item["form"],
                item["accessionNumber"],
                item["primaryDocument"],
                item["filingDate"],
            )


def _download_matching(ticker, cik, filings):
    """Download every 10-K/10-Q (or amendment) whose primary doc ends in .htm."""
    for form, accession, primary_doc, filing_date in filings:
        if form in _TARGET_FORMS and primary_doc.endswith(".htm"):
            download_html(ticker, cik, accession, primary_doc, form, filing_date)
            time.sleep(0.2)  # SEC rate limit


def process_filing_index(ticker, cik, index_filename):
    """Fetch one additional submissions file and download its matching filings.

    Args:
        ticker: Ticker symbol, passed through to ``download_html``.
        cik: Company CIK, passed through to ``download_html``.
        index_filename: Name of the additional submissions JSON file, with or
            without a leading ``submissions/`` path component.

    Raises:
        requests.HTTPError: if the index request returns an error status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    # The additional files are served from the same /submissions/ path as the
    # main CIK JSON; add the prefix when the caller passes a bare file name.
    relative = index_filename.lstrip("/")
    if not relative.startswith("submissions/"):
        relative = f"submissions/{relative}"
    url = f"{SEC_BASE}/{relative}"

    r = requests.get(url, headers=HEADERS, timeout=_REQUEST_TIMEOUT)
    r.raise_for_status()
    payload = r.json()

    # NOTE(review): the additional files are expected to be columnar like
    # filings.recent; a payload nested under "filings" is still accepted for
    # backward compatibility — confirm against the SEC API documentation.
    if isinstance(payload, dict) and "filings" in payload:
        payload = payload["filings"]

    _download_matching(ticker, cik, _iter_filings(payload))


# ---------------- MAIN SCRAPER ---------------- #

def download_all_historical_10k_q(tickers):
    """Download all 10-K/10-Q filings (and amendments) for each ticker.

    For every ticker: skip it if its output folder already exists or if it is
    missing from the ticker→CIK map; otherwise fetch the company's submissions
    JSON, download matching filings from ``filings.recent``, then walk every
    additional file listed in ``filings.files``.

    Args:
        tickers: Iterable of ticker symbols (any case).

    Raises:
        requests.HTTPError: if a submissions or index request returns an
            error status; this aborts the remaining tickers.
        requests.Timeout: if a request exceeds the timeout.
    """
    ticker_to_cik = load_ticker_cik_map()

    for ticker in tickers:
        t = ticker.upper()

        # Skip ticker if folder already exists. A folder left behind by an
        # interrupted run is therefore also skipped; delete it to retry.
        ticker_dir = os.path.join(ROOT_DIR, t)
        if os.path.exists(ticker_dir):
            print(f"⏩ Skipping {t} — folder already exists.")
            continue

        if t not in ticker_to_cik:
            print(f"Ticker {ticker} not found.")
            continue

        cik = ticker_to_cik[t]
        print(f"\n=== {t} (CIK {cik}) ===")

        # 1. Download submissions JSON
        url = f"{SEC_BASE}/submissions/CIK{zero_pad_cik(cik)}.json"
        r = requests.get(url, headers=HEADERS, timeout=_REQUEST_TIMEOUT)
        r.raise_for_status()
        data = r.json()

        # -------- RECENT FILINGS -------- #
        _download_matching(t, cik, _iter_filings(data["filings"]["recent"]))

        # -------- HISTORICAL FILINGS -------- #
        for fileinfo in data["filings"]["files"]:
            # Entries are keyed by "name"; "filename" is still honoured so
            # any input that worked before keeps working.
            index_filename = fileinfo.get("name") or fileinfo.get("filename")
            if not index_filename:
                print(f"⚠️  Skipping malformed entry in filings.files: {fileinfo}")
                continue

            print(f"Loading historical index: {index_filename}")
            process_filing_index(t, cik, index_filename)

# ---------------- RUN ---------------- #

# Start the download for the ticker list below. Tickers whose output folder
# already exists, or that are absent from the ticker→CIK map, are reported
# and skipped by download_all_historical_10k_q.
download_all_historical_10k_q(['PAYC', 'KEYS', 'HWM', 'AAPL', 'AAP', 'ABBV', 'ABMD', 'ABT',
       'ADSK', 'TAP', 'ACN', 'ADI', 'ADM', 'AEP', 'AES', 'AFL', 'SWKS',
       'HES', 'AIG', 'AIZ', 'AKAM', 'ALB', 'HON', 'ALLE', 'ALGN', 'ALK',
       'ALL', 'PLD', 'AMGN', 'AAL', 'AMT', 'ANSS', 'APH', 'ARE', 'A',
       'ANTM', 'ADP', 'AVB', 'ATVI', 'AVY', 'AWK', 'AXP', 'AZO', 'PNW',
       'BA', 'BAX', 'BR', 'BBY', 'RHI', 'BDX', 'BF.B', 'BKR', 'BIO',
       'WRB', 'BLL', 'BWA', 'CARR', 'CAT', 'CBRE', 'CBOE', 'C', 'CCL',
       'COST', 'CDAY', 'STZ', 'CAH', 'CDW', 'CE', 'CERN', 'FIS', 'CF',
       'CFG', 'JPM', 'CHRW', 'CVX', 'CINF', 'CL', 'CLX', 'CMA', 'CMCSA',
       'CMG', 'CMS', 'CNC', 'COF', 'TPR', 'CPB', 'CPRT', 'CHD', 'CRL',
       'DXC', 'CSCO', 'CSX', 'CTAS', 'CTLT', 'CTSH', 'CTVA', 'CTXS',
       'CMI', 'DAL', 'TGT', 'DIS', 'DISH', 'DLR', 'DLTR', 'DHR', 'DPZ',
       'DG', 'DOV', 'OMC', 'MU', 'DRE', 'DHI', 'DRI', 'DTE', 'DUK', 'DVN',
       'MS', 'DXCM', 'EBAY', 'CDNS', 'ECL', 'ED', 'EFX', 'ATO', 'PKI',
       'EL', 'EMN', 'EMR', 'ENPH', 'EOG', 'EQR', 'EA', 'ESS', 'ETN',
       'ETSY', 'EW', 'EXPD', 'EXR', 'F', 'RF', 'FANG', 'FAST', 'FBHS',
       'FCX', 'FDX', 'FISV', 'FLT', 'FMC', 'USB', 'FRC', 'BEN', 'FTNT',
       'FTV', 'IT', 'GD', 'GILD', 'GIS', 'GLW', 'GM', 'GNRC', 'GPC',
       'GPN', 'GPS', 'GRMN', 'GS', 'GWW', 'HAL', 'HAS', 'HBAN', 'HBI',
       'HCA', 'PEAK', 'HD', 'HII', 'HST', 'HOLX', 'CNP', 'HPE', 'HRL',
       'HSIC', 'HSY', 'IBM', 'ICE', 'IDXX', 'IEX', 'IFF', 'ILMN', 'INTC',
       'INTU', 'IP', 'IPG', 'IPGP', 'INCY', 'TT', 'ISRG', 'ITW', 'IVZ',
       'JBHT', 'JKHY', 'JNPR', 'EVRG', 'MDLZ', 'KHC', 'KIM', 'KMB', 'KMI',
       'KMX', 'KR', 'LYB', 'LEG', 'LEN', 'LMT', 'LNC', 'BBWI', 'L', 'LUV',
       'LVS', 'LW', 'LYV', 'MAA', 'MA', 'MAS', 'AME', 'MKC', 'MCD',
       'MCHP', 'MCK', 'CVS', 'MET', 'MGM', 'SPGI', 'MAR', 'MKTX', 'MLM',
       'MMC', 'MMM', 'MO', 'MHK', 'MOS', 'MSI', 'MPC', 'MPWR', 'INFO',
       'MRNA', 'MRO', 'MSFT', 'ETR', 'MTD', 'MSCI', 'VTRS', 'BAC', 'NCLH',
       'NFLX', 'NI', 'NKE', 'NOC', 'NOW', 'NRG', 'NSC', 'XEL', 'NTAP',
       'ES', 'NUE', 'NVDA', 'NWL', 'NWSA', 'NXPI', 'ODFL', 'FE', 'OGN',
       'OKE', 'ORCL', 'ORLY', 'OTIS', 'OXY', 'COP', 'PAYX', 'PBCT',
       'BKNG', 'PXD', 'EXC', 'PENN', 'PEP', 'PFE', 'PFG', 'PG', 'PHM',
       'PKG', 'PTC', 'PNC', 'PNR', 'BRO', 'POOL', 'PPG', 'PPL', 'PGR',
       'PRU', 'RE', 'PSX', 'TROW', 'PVH', 'PWR', 'PYPL', 'QCOM', 'QRVO',
       'RL', 'RCL', 'REG', 'REGN', 'RMD', 'RJF', 'O', 'ROK', 'ROL',
       'ROST', 'RSG', 'NVR', 'LDOS', 'SBUX', 'EIX', 'SCHW', 'SRE', 'SEE',
       'SHW', 'SIVB', 'SLB', 'AOS', 'TFC', 'SNPS', 'SO', 'KEY', 'SPG',
       'STT', 'TRV', 'SYK', 'SWK', 'SYF', 'NLOK', 'SYY', 'TDY', 'TECH',
       'TER', 'TFX', 'TDG', 'GL', 'TMO', 'TRMB', 'TSCO', 'TWTR', 'CCI',
       'TXN', 'TXT', 'TYL', 'TSN', 'UAL', 'UAA', 'UDR', 'AEE', 'UHS',
       'ULTA', 'UNP', 'URI', 'WM', 'RTX', 'VTR', 'VFC', 'VIAC', 'V',
       'VLO', 'VMC', 'VNO', 'VRSK', 'VRSN', 'VRTX', 'WAB', 'WBA', 'WAT',
       'WDC', 'WHR', 'WMB', 'LNT', 'WST', 'WU', 'WY', 'WYNN', 'XLNX',
       'XOM', 'XYL', 'ZBRA', 'ZION', 'ZBH', 'ZTS'])



=== PAYC (CIK 1590955) ===
✓ Saved /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/PAYC/PAYC-2025-Q4-10Q.htm
✓ Saved /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/PAYC/PAYC-2025-Q3-10Q.htm
✓ Saved /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/PAYC/PAYC-2025-Q2-10Q.htm
✓ Saved /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/PAYC/PAYC-2025-Q1-10K.htm
✓ Saved /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/PAYC/PAYC-2024-Q4-10Q.htm
✓ Saved /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/PAYC/PAYC-2024-Q3-10Q.htm
✓ Saved /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/PAYC/PAYC-2024-Q2-10Q.htm
✓ Saved /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/PAYC/PAYC-2024-Q1-10K.htm
✓ Saved /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ