In [84]:
import os
import re
from bs4 import BeautifulSoup

###############################################################################
# PART 1 — CLEAN HTML BUT PRESERVE INLINE XBRL TEXT
###############################################################################

def clean_html_file(filepath):
    """Return the narrative text of an HTML filing.

    The file is parsed with BeautifulSoup/lxml. Tables, images/figures and
    script/style elements are dropped; tags whose name contains ":" (inline
    XBRL such as <ix:nonNumeric>) are unwrapped so their text is kept.
    Stand-alone page numbers and a few boilerplate phrases are then blanked
    and whitespace is normalised.

    Args:
        filepath: Path of the HTML file; read as UTF-8, undecodable bytes are
            ignored.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        raw = f.read()

    soup = BeautifulSoup(raw, "lxml")

    # Remove tables (NOTE: any prose laid out inside table cells is lost too),
    # then images / figures, then script/style.
    for tag_names in (
        ["table", "tr", "td", "th"],
        ["img", "figure"],
        ["script", "style"],
    ):
        for t in soup.find_all(tag_names):
            t.decompose()

    # Inline XBRL tags like <ix:nonNumeric>, <us-gaap:Something>:
    # drop the wrapper, keep the text.
    for t in soup.find_all():
        if ":" in (t.name or ""):
            t.unwrap()

    text = soup.get_text(separator="\n")

    # Remove page numbers alone on a line
    text = re.sub(r"\n\s*\d{1,6}\s*(?=\n)", "\n", text)

    # Remove boilerplate
    boiler = [
        r"United States Securities and Exchange Commission",
        r"Form\s+10-K",
        r"Form\s+10-Q",
        r"TABLE OF CONTENTS",
        # Bounded on purpose: with DOTALL an unbounded ".*?" could erase
        # everything between a stray "indicate" and a far-away "by checkmark".
        # Both "checkmark" and "check mark" spellings are accepted.
        r"Indicate.{0,200}?by\s+check\s?mark",
    ]
    for bp in boiler:
        text = re.sub(bp, " ", text, flags=re.IGNORECASE | re.DOTALL)

    # Whitespace cleanup
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n\s*\n+", "\n\n", text)

    return text.strip()


###############################################################################
# PART 2 — ITEM & MD&A HEADING REGEX
###############################################################################

# Generic "Item X" finder (used ONLY for start anchoring)
# Everything an "Item N" reference starts with: the word "item" (not preceded
# by a word character) plus any run of whitespace / . - – — : separators.
_ITEM_PREFIX = r"(?<!\w)item[\s\.\-–—:]*"

# Generic "Item X" finder (used ONLY for start anchoring): X is a number with
# an optional single trailing letter, and must not run on into a word.
ITEM_FINDER = re.compile(_ITEM_PREFIX + r"\d+[a-z]?(?!\w)", re.IGNORECASE)


def item_header_regex(num):
    """Return the regex source for a reference to item ``num``.

    The returned string is uncompiled and carries no flags; the literal
    "item" is lowercase, so callers must lowercase the text or pass
    ``re.IGNORECASE`` themselves.
    """
    return _ITEM_PREFIX + str(num) + r"(?!\w)"


# 10-K MD&A = Item 7, 10-Q MD&A = Item 2
TEN_K_MDA_ITEM, TEN_Q_MDA_ITEM = item_header_regex(7), item_header_regex(2)

# Full MD&A heading; the character class tolerates a missing possessive and
# straight, curly or backtick apostrophes.
MDA_HEADING = re.compile(
    r"management['’`s]{0,2}\s+discussion\s+and\s+analysis", re.IGNORECASE
)


###############################################################################
# PART 3 — RELIABLE END BOUNDARIES
###############################################################################
# These match ONLY true section headers, not inline references.

def _section_end_pattern(item_number):
    """Compile the end-of-MD&A pattern for the given item number.

    The pattern matches a line consisting only of an optional "item", the
    number, one optional separator (. - – — :) and trailing blanks; it is
    multiline and case-insensitive.
    """
    return re.compile(
        rf"(?mi)^[ \t]*(item\s*)?{item_number}[\.\-–—:]?[ \t]*($|\n)"
    )


# A 10-Q MD&A is closed by Item 3, a 10-K MD&A by Item 8.
END_10Q = _section_end_pattern(3)
END_10K = _section_end_pattern(8)


###############################################################################
# PART 4 — DETERMINE FILING TYPE FROM FILE NAME
###############################################################################

def filing_type_from_filename(path):
    """Infer the filing type ("10-K" or "10-Q") from a file name.

    Only the base name is inspected, case-insensitively. Besides the compact
    "10k" / "10q" markers, the separated spellings "10-k", "10_k", "10 k"
    (and the "q" equivalents) are accepted as long as the "10" is not the tail
    of a longer number, so a year such as "2010-Q4" is not mistaken for a
    10-Q marker. The 10-K check runs first.

    Args:
        path: File path or bare file name.

    Returns:
        "10-K" or "10-Q"; "10-K" when no marker is found.
    """
    name = os.path.basename(path).lower()
    if re.search(r"10k|(?<!\d)10[-_ ]k", name):
        return "10-K"
    if re.search(r"10q|(?<!\d)10[-_ ]q", name):
        return "10-Q"
    return "10-K"


###############################################################################
# PART 5 — EXTRACT MD&A
###############################################################################

def extract_mda(text, filing_type):
    """Cut the MD&A section out of cleaned filing text.

    Step 1: locate the first "Management's Discussion and Analysis" heading
    (MDA_HEADING). If the nearest preceding "Item X" reference is the MD&A
    item for this filing type (Item 7 for "10-K", Item 2 for "10-Q"), the
    section starts there; otherwise it starts at the heading.

    Step 2 (no heading found): take the first Item 7/2 reference that has both
    "discussion" and "analysis" within the following 800 characters.

    The section ends at the first END_10K match for a "10-K" and the first
    END_10Q match for anything else, searched from 30 characters past the
    start, or at the end of the text when there is no such match.

    All matching is done case-insensitively on ``text`` itself, never on a
    lowercased copy: ``str.lower()`` can change the length of some Unicode
    strings, which would misalign offsets used to slice ``text``.

    Args:
        text: Cleaned filing text.
        filing_type: "10-K" or "10-Q".

    Returns:
        The stripped MD&A text, or "" when no start could be found.
    """
    is_10k = filing_type == "10-K"
    item_pattern = TEN_K_MDA_ITEM if is_10k else TEN_Q_MDA_ITEM
    end_pattern = END_10K if is_10k else END_10Q

    def section_from(start):
        # Skip 30 characters so the end pattern cannot fire on the start
        # header itself.
        end_match = end_pattern.search(text, pos=start + 30)
        end = end_match.start() if end_match else len(text)
        return text[start:end].strip()

    # STEP 1 — MD&A heading detected
    heading = MDA_HEADING.search(text)
    if heading:
        heading_start = heading.start()
        start = heading_start

        preceding = [
            m.start() for m in ITEM_FINDER.finditer(text)
            if m.start() < heading_start
        ]
        # Anchoring is only defined for the two known filing types.
        if preceding and filing_type in ("10-K", "10-Q"):
            cand = preceding[-1]
            snippet = text[cand:cand + 60]
            if re.search(item_pattern, snippet, flags=re.IGNORECASE):
                start = cand

        return section_from(start)

    # STEP 2 — fallback: Item 7/2 followed closely by "discussion"/"analysis"
    for m in re.finditer(item_pattern, text, flags=re.IGNORECASE):
        s = m.start()
        window = text[s:s + 800].lower()
        if "discussion" in window and "analysis" in window:
            return section_from(s)

    return ""


###############################################################################
# PART 6 — PROCESS ONE FILE (MD&A ONLY)
###############################################################################

def process_one_file(input_file, output_base):
    """Extract the MD&A of one filing into ``<output_base>_cleaned.txt``.

    If the output file already exists, a "Skipping" message is printed and
    nothing else happens. Otherwise the filing is cleaned, its type is taken
    from the file name, and the MD&A (or a "not found" placeholder) is written
    below a fixed header line.

    Args:
        input_file: Path of the HTML filing.
        output_base: Output path without suffix; "_cleaned.txt" is appended.
    """
    out = output_base + "_cleaned.txt"
    if os.path.exists(out):
        print("Skipping (exists):", out)
        return

    cleaned = clean_html_file(input_file)
    filing_type = filing_type_from_filename(input_file)
    mda = extract_mda(cleaned, filing_type)

    # Write UTF-8 explicitly: the text can hold curly quotes, dashes, etc.,
    # and the platform default encoding may not be able to represent them.
    with open(out, "w", encoding="utf-8") as f:
        f.write("===== MD&A =====\n\n")
        f.write(mda if mda else "(MD&A not found)")

    print(f"✔ Saved MD&A → {out}")


###############################################################################
# PART 7 — BATCH PROCESS DIRECTORY
###############################################################################

def process_all_files(src, dst):
    """Run process_one_file on every ``.htm`` file below ``src``.

    The directory layout under ``src`` is mirrored under ``dst``; an output
    directory is created only when it will receive at least one file. The
    output base name is the input name without its last extension.
    """
    for current_dir, _subdirs, names in os.walk(src):
        # Same relative location, but rooted at dst.
        target_dir = os.path.join(dst, os.path.relpath(current_dir, src))
        for name in names:
            if not name.lower().endswith(".htm"):
                continue
            os.makedirs(target_dir, exist_ok=True)
            stem = name.rsplit(".", 1)[0]
            process_one_file(
                os.path.join(current_dir, name),
                os.path.join(target_dir, stem),
            )


###############################################################################
# PART 8 — MAIN
###############################################################################

if __name__ == "__main__":
    # Convert every filing under the source tree, mirroring the layout into
    # the destination tree.
    process_all_files(
        "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original",
        "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1",
    )


✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2024-Q1-10Q_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2013-Q1-10Q_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2020-Q3-10K_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2017-Q3-10K_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2010-Q4-10Q_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2006-Q3-10K_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2011-Q2-10Q_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2008-Q2-

In [None]:
import os
import re
from bs4 import BeautifulSoup

###############################################################################
# PART 1 — CLEAN HTML BUT PRESERVE INLINE XBRL TEXT
###############################################################################

def clean_html_file(filepath):
    """Return the narrative text of an HTML filing.

    Tables, images/figures and script/style elements are removed; tags whose
    name contains ":" (inline XBRL) are unwrapped so their text survives.
    Stand-alone page numbers and a few boilerplate phrases are blanked, then
    whitespace is collapsed.

    Args:
        filepath: Path of the HTML file; read as UTF-8, undecodable bytes are
            ignored.

    Returns:
        The cleaned, stripped text.
    """
    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        raw = f.read()

    soup = BeautifulSoup(raw, "lxml")

    # Remove tables (any prose placed inside table cells goes with them),
    # images / figures, and script/style.
    for tag_names in (
        ["table", "tr", "td", "th"],
        ["img", "figure"],
        ["script", "style"],
    ):
        for t in soup.find_all(tag_names):
            t.decompose()

    # Inline XBRL tags like <ix:nonNumeric>, <us-gaap:Whatever>:
    # unwrap the tag but KEEP the text content.
    for t in soup.find_all():
        if ":" in (t.name or ""):
            t.unwrap()

    text = soup.get_text(separator="\n")

    # Remove page numbers that appear alone on a line
    text = re.sub(r"\n\s*\d{1,6}\s*(?=\n)", "\n", text)

    # Remove some boilerplate
    boiler = [
        r"United States Securities and Exchange Commission",
        r"Form\s+10-K",
        r"Form\s+10-Q",
        r"TABLE OF CONTENTS",
        # Bounded span: an unbounded ".*?" under DOTALL could wipe out all
        # text between an unrelated "indicate" and a distant "by checkmark".
        # "check mark" (two words) is accepted as well.
        r"Indicate.{0,200}?by\s+check\s?mark",
    ]
    for bp in boiler:
        text = re.sub(bp, " ", text, flags=re.IGNORECASE | re.DOTALL)

    # Whitespace cleanup
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n\s*\n+", "\n\n", text)

    return text.strip()


###############################################################################
# PART 2 — ITEM & MD&A HEADING REGEX
###############################################################################

# Generic "Item X" finder
# Leading part shared by all "Item N" patterns: "item" not glued to a
# preceding word character, then optional whitespace / . - – — : separators.
_ITEM_LEAD = r"(?<!\w)item[\s\.\-–—:]*"

# Generic "Item X" finder (number plus optional trailing letter)
ITEM_FINDER = re.compile(_ITEM_LEAD + r"\d+[a-z]?(?!\w)", re.IGNORECASE)


def item_header_regex(num):
    """Build the (uncompiled, flag-less) regex source for item ``num``."""
    return "".join((_ITEM_LEAD, str(num), r"(?!\w)"))


# For MD&A items: Item 7 in a 10-K, Item 2 in a 10-Q
TEN_K_MDA_ITEM = item_header_regex(7)
TEN_Q_MDA_ITEM = item_header_regex(2)

# STRICT MD&A heading phrase (main anchor)
MDA_HEADING = re.compile(
    r"management['’`s]{0,2}\s+discussion\s+and\s+analysis", re.IGNORECASE
)


###############################################################################
# PART 3 — DETERMINE FILING TYPE FROM FILE NAME
###############################################################################

def filing_type_from_filename(path):
    """Infer the filing type ("10-K" or "10-Q") from a file name.

    The base name is checked case-insensitively for "10k" / "10q" and for the
    separated spellings "10-k", "10_k", "10 k" (same for "q"). For the
    separated spellings the "10" must not be preceded by a digit, so a year
    like "2010-Q4" does not count as a marker. 10-K is tested first.

    Args:
        path: File path or bare file name.

    Returns:
        "10-K" or "10-Q"; "10-K" is the fallback when naming is unusual.
    """
    name = os.path.basename(path).lower()
    if re.search(r"10k|(?<!\d)10[-_ ]k", name):
        return "10-K"
    if re.search(r"10q|(?<!\d)10[-_ ]q", name):
        return "10-Q"
    # Fallback if naming is weird
    return "10-K"


###############################################################################
# PART 4 — EXTRACT MD&A SECTION (10-K: Item 7, 10-Q: Item 2)
###############################################################################

def extract_mda(text, filing_type):
    """Cut the MD&A section out of cleaned filing text.

    1) Find the first "Management's Discussion and Analysis" heading.
    2) Start at the nearest preceding "Item X" reference when it is the MD&A
       item for the filing type (Item 7 for "10-K", Item 2 for "10-Q");
       otherwise start at the heading.
    3) End at the next ITEM_FINDER match after the heading. ITEM_FINDER matches
       any "Item X" reference, so an inline mention also ends the section.
    4) If no heading is found, fall back to the first Item 7/2 reference that
       has "discussion and analysis" within the next 600 characters, and end
       at the next ITEM_FINDER match after it.

    Matching is case-insensitive on ``text`` itself instead of on a lowercased
    copy, because ``str.lower()`` can change string length for some Unicode
    characters and would shift the offsets used to slice ``text``.

    Args:
        text: Cleaned filing text.
        filing_type: "10-K" or "10-Q".

    Returns:
        The stripped MD&A text, or "" when no reliable start is found.
    """
    # All "Item X" positions, in increasing order.
    item_positions = [m.start() for m in ITEM_FINDER.finditer(text)]

    def next_item_after(pos):
        return next((p for p in item_positions if p > pos), len(text))

    # --- Step 1: the explicit MD&A heading phrase ---
    heading_match = MDA_HEADING.search(text)
    if heading_match:
        heading_start = heading_match.start()
        start = heading_start

        anchor_pattern = {
            "10-K": TEN_K_MDA_ITEM,
            "10-Q": TEN_Q_MDA_ITEM,
        }.get(filing_type)
        prev_items = [p for p in item_positions if p < heading_start]
        if prev_items and anchor_pattern is not None:
            candidate = prev_items[-1]
            header_snippet = text[candidate:candidate + 40]
            if re.search(anchor_pattern, header_snippet, flags=re.IGNORECASE):
                start = candidate

        return text[start:next_item_after(heading_start)].strip()

    # --- Step 2: fallback, no explicit heading ---
    # Require "discussion and analysis" near the item to avoid picking up
    # cross-references.
    item_pattern = TEN_K_MDA_ITEM if filing_type == "10-K" else TEN_Q_MDA_ITEM
    for m in re.finditer(item_pattern, text, flags=re.IGNORECASE):
        s = m.start()
        window = text[s:s + 600]  # look ahead a bit
        if re.search(r"discussion\s+and\s+analysis", window, flags=re.IGNORECASE):
            return text[s:next_item_after(s)].strip()

    return ""  # No reliable MD&A found


###############################################################################
# PART 5 — PROCESS ONE FILE (MD&A ONLY)
###############################################################################

def process_one_file(input_file, output_base):
    """Extract the MD&A of one filing into ``<output_base>_cleaned.txt``.

    An existing output file is left untouched (a "Skipping" message is
    printed). Otherwise the filing is cleaned, typed by file name, and the
    MD&A text or a "not found" placeholder is written below a header line.

    Args:
        input_file: Path of the HTML filing.
        output_base: Output path without suffix; "_cleaned.txt" is appended.
    """
    out = output_base + "_cleaned.txt"
    if os.path.exists(out):
        print("Skipping (exists):", out)
        return

    cleaned = clean_html_file(input_file)
    filing_type = filing_type_from_filename(input_file)

    mda = extract_mda(cleaned, filing_type)

    # Explicit UTF-8: the extracted text may contain non-ASCII characters
    # that the platform default encoding cannot write.
    with open(out, "w", encoding="utf-8") as f:
        f.write("===== MD&A =====\n\n")
        f.write(mda if mda else "(MD&A not found)")

    print(f"✔ Saved MD&A → {out}")


###############################################################################
# PART 6 — BATCH OVER DIRECTORY
###############################################################################

def process_all_files(src, dst):
    """Walk ``src`` and hand every ``.htm`` file to process_one_file.

    Outputs go to the same relative directory under ``dst`` (created on
    demand); the output base is the file name minus its last extension.
    """
    for folder, _dirs, filenames in os.walk(src):
        mirrored = os.path.join(dst, os.path.relpath(folder, src))
        htm_names = (n for n in filenames if n.lower().endswith(".htm"))
        for fname in htm_names:
            os.makedirs(mirrored, exist_ok=True)
            source_path = os.path.join(folder, fname)
            target_base = os.path.join(mirrored, fname.rsplit(".", 1)[0])
            process_one_file(source_path, target_base)


###############################################################################
# PART 7 — RUN
###############################################################################

if __name__ == "__main__":
    # Batch run: originals in, cleaned MD&A text files out.
    process_all_files(
        "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original",
        "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1",
    )


✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2024-Q1-10Q_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2013-Q1-10Q_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2020-Q3-10K_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2017-Q3-10K_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2010-Q4-10Q_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2006-Q3-10K_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2011-Q2-10Q_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2008-Q2-

KeyboardInterrupt: 

In [None]:
# very close, but still does not always get the right thing
import os
import re
from bs4 import BeautifulSoup

###############################################################################
# PART 1 — CLEAN HTML BUT PRESERVE INLINE XBRL TEXT
###############################################################################

def clean_html_file(filepath):
    """Return the narrative text of an HTML filing.

    Tables, images/figures and script/style elements are removed; tags whose
    name contains ":" (inline XBRL) are unwrapped so their text is kept.
    Stand-alone page numbers and a few boilerplate phrases are blanked, then
    whitespace is collapsed.

    Args:
        filepath: Path of the HTML file; read as UTF-8, undecodable bytes are
            ignored.

    Returns:
        The cleaned, stripped text.
    """
    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        raw = f.read()

    soup = BeautifulSoup(raw, "lxml")

    # Remove tables (prose that sits inside a <table> is dropped as well),
    # images, and script/style.
    for tag_names in (
        ["table", "tr", "td", "th"],
        ["img", "figure"],
        ["script", "style"],
    ):
        for t in soup.find_all(tag_names):
            t.decompose()

    # Inline XBRL unwrap (keeps the text instead of deleting it)
    for t in soup.find_all():
        if ":" in (t.name or ""):  # <ix:nonNumeric>
            t.unwrap()

    text = soup.get_text(separator="\n")

    # Remove page numbers
    text = re.sub(r"\n\s*\d{1,6}\s*(?=\n)", "\n", text)

    # Remove boilerplate
    boiler = [
        r"United States Securities and Exchange Commission",
        r"Form\s+10-K",
        r"Form\s+10-Q",
        r"TABLE OF CONTENTS",
        # Bounded so that, under DOTALL, a lone "indicate" cannot be paired
        # with a "by checkmark" far away and swallow the text in between;
        # also matches the "check mark" spelling.
        r"Indicate.{0,200}?by\s+check\s?mark",
    ]
    for bp in boiler:
        text = re.sub(bp, " ", text, flags=re.IGNORECASE | re.DOTALL)

    # Clean whitespace
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n\s*\n+", "\n\n", text)

    return text.strip()


###############################################################################
# PART 2 — UNIVERSAL ITEM REGEX
###############################################################################

# "item" not glued to a preceding word character, followed by any run of
# whitespace / . - – — : separators.
_ITEM_HEAD = r"(?<!\w)item[\s\.\-–—:]*"

# Any "Item X" reference: number plus optional trailing letter.
ITEM_FINDER = re.compile(_ITEM_HEAD + r"\d+[a-z]?(?!\w)", re.IGNORECASE)


def item_header_regex(num):
    """Return the uncompiled, flag-less regex source for item ``num``."""
    return "{}{}{}".format(_ITEM_HEAD, num, r"(?!\w)")


# MD&A identifiers: Item 7 (10-K) and Item 2 (10-Q)
TEN_K_MDA_HEADER = item_header_regex(7)
TEN_Q_MDA_HEADER = item_header_regex(2)

# MD&A verification keywords (must match near header)
MDA_KEYWORDS = (
    "management discussion analysis results of operations liquidity".split(" ", 3)[:3]
    + ["results of operations", "liquidity", "capital resources"]
)


###############################################################################
# PART 3 — DETERMINE FILING TYPE **FROM FILE NAME**
###############################################################################

def filing_type_from_filename(path):
    """Infer the filing type ("10-K" or "10-Q") from a file name.

    The base name is checked case-insensitively for "10k" / "10q" and for the
    separated spellings "10-k", "10_k", "10 k" (same for "q"). For the
    separated spellings the "10" must not be preceded by a digit, so a year
    like "2010-Q4" is not read as a 10-Q marker. 10-K is tested first.

    Args:
        path: File path or bare file name.

    Returns:
        "10-K" or "10-Q"; "10-K" when no marker is present.
    """
    name = os.path.basename(path).lower()
    if re.search(r"10k|(?<!\d)10[-_ ]k", name):
        return "10-K"
    if re.search(r"10q|(?<!\d)10[-_ ]q", name):
        return "10-Q"
    # fallback but rarely needed
    return "10-K"


###############################################################################
# PART 4 — EXTRACT FULL MD&A WITH HEADER VALIDATION
###############################################################################

def extract_mda(text, filing_type):
    """Cut the MD&A section out of cleaned filing text.

    The start is the first reference to the MD&A item (Item 7 for "10-K",
    Item 2 otherwise) that has at least one of MDA_KEYWORDS within the next
    500 characters. The end is the next ITEM_FINDER match after that start,
    or the end of the text.

    Matching is case-insensitive on ``text`` itself instead of on a lowercased
    copy, because ``str.lower()`` can change string length for some Unicode
    characters and would shift the offsets used to slice ``text``.

    Args:
        text: Cleaned filing text.
        filing_type: "10-K" or "10-Q".

    Returns:
        The stripped MD&A text, or "" when no qualifying item reference
        exists (this avoids extracting e.g. Properties or cross-references).
    """
    # Choose item number based on filing type
    pattern = TEN_K_MDA_HEADER if filing_type == "10-K" else TEN_Q_MDA_HEADER

    # First Item 7 / Item 2 reference that looks like the real MD&A header
    for m in re.finditer(pattern, text, flags=re.IGNORECASE):
        start = m.start()
        # Only the window is lowercased, for the keyword test; no offsets are
        # derived from it.
        window = text[start:start + 500].lower()
        if any(k in window for k in MDA_KEYWORDS):
            break
    else:
        return ""

    # Next item boundary marks the end of MD&A
    end = next(
        (b.start() for b in ITEM_FINDER.finditer(text) if b.start() > start),
        len(text),
    )
    return text[start:end].strip()


###############################################################################
# PART 5 — PROCESS ONE FILE
###############################################################################

def process_one_file(input_file, output_base):
    """Extract the MD&A of one filing into ``<output_base>_cleaned.txt``.

    An existing output file is left untouched (a "Skipping" message is
    printed). Otherwise the filing is cleaned, typed by file name, and the
    MD&A text or a "not found" placeholder is written below a header line.

    Args:
        input_file: Path of the HTML filing.
        output_base: Output path without suffix; "_cleaned.txt" is appended.
    """
    out = output_base + "_cleaned.txt"
    if os.path.exists(out):
        print("Skipping:", out)
        return

    cleaned = clean_html_file(input_file)
    ftype = filing_type_from_filename(input_file)

    mda = extract_mda(cleaned, ftype)

    # Explicit UTF-8: the extracted text may contain non-ASCII characters
    # that the platform default encoding cannot write.
    with open(out, "w", encoding="utf-8") as f:
        f.write("===== MD&A =====\n\n")
        f.write(mda if mda else "(MD&A not found)")

    print("✔ Saved MD&A →", out)


###############################################################################
# PART 6 — PROCESS DIRECTORY
###############################################################################

def process_all_files(src, dst):
    """Process every ``.htm`` file found anywhere below ``src``.

    Each file is passed to process_one_file together with an output base
    located at the same relative path under ``dst``; output directories are
    created only for folders that actually contain ``.htm`` files.
    """
    for here, _unused, entries in os.walk(src):
        out_dir = os.path.join(dst, os.path.relpath(here, src))
        for entry in entries:
            is_htm = entry.lower().endswith(".htm")
            if is_htm:
                os.makedirs(out_dir, exist_ok=True)
                base_name = entry.rsplit(".", 1)[0]
                process_one_file(
                    os.path.join(here, entry),
                    os.path.join(out_dir, base_name),
                )


###############################################################################
# PART 7 — RUN
###############################################################################

if __name__ == "__main__":
    # Source tree of original filings -> destination tree of MD&A text files.
    process_all_files(
        "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original",
        "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1",
    )


✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2024-Q1-10Q_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2013-Q1-10Q_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2020-Q3-10K_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2017-Q3-10K_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2010-Q4-10Q_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2006-Q3-10K_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2011-Q2-10Q_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2008-Q2-

KeyboardInterrupt: 

In [66]:
import os
import re
from bs4 import BeautifulSoup

###############################################################################
# PART 1 — CLEAN HTML WHILE PRESERVING INLINE XBRL TEXT
###############################################################################

def clean_html_file(filepath):
    """Clean an HTML filing while *preserving* XBRL/inline-XBRL text.

    Tables, images/figures and script/style elements are removed; tags whose
    name contains ":" are unwrapped so their text is kept. Stand-alone page
    numbers and a few boilerplate phrases are blanked, then whitespace is
    collapsed.

    Args:
        filepath: Path of the HTML file; read as UTF-8, undecodable bytes are
            ignored.

    Returns:
        The cleaned, stripped text.
    """
    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        raw = f.read()

    soup = BeautifulSoup(raw, "lxml")

    # Remove tables (prose inside table cells is dropped with them), images,
    # and script/style.
    for tag_names in (
        ["table", "tr", "td", "th"],
        ["img", "figure"],
        ["script", "style"],
    ):
        for t in soup.find_all(tag_names):
            t.decompose()

    # Inline XBRL tags have names like ix:nonNumeric, us-gaap:Revenue.
    # Remove the tag wrapper but KEEP the text.
    for t in soup.find_all():
        if ":" in (t.name or ""):
            t.unwrap()

    # Extract text
    text = soup.get_text(separator="\n")

    # Remove page numbers
    text = re.sub(r"\n\s*\d{1,5}\s*(?=\n)", "\n", text)

    # Remove boilerplate
    boiler = [
        r"United States Securities and Exchange Commission",
        r"Form\s+10-K",
        r"Form\s+10-Q",
        r"TABLE OF CONTENTS",
        # Bounded so that, under DOTALL, a lone "indicate" cannot be paired
        # with a "by checkmark" far away and swallow the text in between;
        # also matches the "check mark" spelling.
        r"Indicate.{0,200}?by\s+check\s?mark",
    ]
    for bp in boiler:
        text = re.sub(bp, " ", text, flags=re.IGNORECASE | re.DOTALL)

    # Whitespace cleanup
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n\s*\n+", "\n\n", text)

    return text.strip()


###############################################################################
# PART 2 — FLEXIBLE ITEM HEADER REGEX
###############################################################################

def item_header_regex(num, letter=""):
    """Return regex source matching a reference to item ``num`` + ``letter``.

    Accepts forms such as ITEM 7, ITEM7, ITEM—7 or "Item 7: ...". The item
    identifier is lowercased and placed in capturing group 1. The string is
    uncompiled and flag-less, so callers handle case themselves.
    """
    ident = (str(num) + str(letter)).lower()
    return rf"(?<!\w)item[\s\.\-–—:]*({ident})(?![\w])"


# For MD&A extraction only: Item 7 (10-K) and Item 2 (10-Q)
TEN_K_MDA, TEN_Q_MDA = (item_header_regex(n) for n in (7, 2))

# Fallback keywords, tried in this order
MDA_KEYWORDS = [r"management.{0,40}discussion"]
MDA_KEYWORDS += [r"md&a", r"m d & a", r"mda"]


###############################################################################
# PART 3 — FIND ALL ITEM HEADERS
###############################################################################

# Any "Item X" reference: number plus optional trailing letter.
ITEM_FINDER = re.compile(
    r"(?<!\w)item[\s\.\-–—:]*\d+[a-z]?(?!\w)",
    flags=re.IGNORECASE
)


def extract_mda(text, filing_type):
    """Extract ONLY the MD&A section using item boundaries plus a fallback.

    The start is the first reference to the MD&A item (Item 7 for "10-K",
    Item 2 otherwise); when there is none, the first match of any of
    MDA_KEYWORDS (tried in list order). The end is the next ITEM_FINDER match
    after the start, or the end of the text.

    Matching is case-insensitive on ``text`` itself instead of on a lowercased
    copy, because ``str.lower()`` can change string length for some Unicode
    characters and would shift the offsets used to slice ``text``.

    Args:
        text: Cleaned filing text.
        filing_type: "10-K" or "10-Q".

    Returns:
        The stripped MD&A text, or "" when neither the item nor a keyword is
        found.
    """
    # Choose correct item number
    pattern = TEN_K_MDA if filing_type == "10-K" else TEN_Q_MDA

    # Main MD&A detection
    m = re.search(pattern, text, flags=re.IGNORECASE)

    # Fallback detection if missing
    if not m:
        for kw in MDA_KEYWORDS:
            m = re.search(kw, text, flags=re.IGNORECASE)
            if m:
                break

    if not m:
        return ""  # MD&A missing entirely

    start = m.start()

    # Next item boundary determines the end
    end = next(
        (b.start() for b in ITEM_FINDER.finditer(text) if b.start() > start),
        len(text),
    )

    chunk = text[start:end].strip()

    # Remove a leading PART header. NOTE(review): the chunk begins at the
    # matched item/keyword, so with the patterns visible here this looks like
    # a no-op; kept in case the patterns change.
    chunk = re.sub(r"^part\s+[ivx]+\s+", "", chunk, flags=re.IGNORECASE)

    return chunk


###############################################################################
# PART 4 — Detect filing type (10-K vs 10-Q)
###############################################################################

def detect_filing_type(text):
    """Guess whether filing text is a "10-K" or a "10-Q".

    Explicit markers are checked first: "Form 10-K" / "Annual report pursuant
    to section" versus "Form 10-Q" / "Quarterly report pursuant to section".
    When both kinds occur, the one appearing first wins, since a 10-Q
    commonly cites the company's Form 10-K further down.

    Without any marker (for instance when "Form 10-K"/"Form 10-Q" were
    stripped as boilerplate beforehand) item numbering decides: a line that
    starts with an Item 7..16 header means "10-K", since the 10-Q form has no
    such items. Only then is a reference to Item 2 taken as "10-Q". A bare
    Item 2 cannot decide on its own because a 10-K has an Item 2 as well.

    Args:
        text: Filing text (raw or cleaned).

    Returns:
        "10-K" or "10-Q"; "10-K" when nothing matches.
    """
    low = text.lower()

    annual = re.search(
        r"form\s+10-k|annual\s+report\s+pursuant\s+to\s+section", low
    )
    quarterly = re.search(
        r"form\s+10-q|quarterly\s+report\s+pursuant\s+to\s+section", low
    )
    if annual and quarterly:
        return "10-K" if annual.start() < quarterly.start() else "10-Q"
    if annual:
        return "10-K"
    if quarterly:
        return "10-Q"

    # Line-start anchoring keeps inline cross-references ("see Item 7 of our
    # annual report") from counting as 10-K section headers.
    if re.search(
        r"(?m)^[ \t]*item[\s\.\-–—:]*(?:[7-9]|1[0-6])[a-c]?(?!\w)", low
    ):
        return "10-K"

    if re.search(r"(?<!\w)item[\s\.\-–—:]*2(?!\w)", low):
        return "10-Q"

    return "10-K"


###############################################################################
# PART 5 — Process one file (MD&A only)
###############################################################################

def process_one_file(input_file, output_base):
    """Extract the MD&A of one filing into ``<output_base>_cleaned.txt``.

    An existing output file is left untouched (a "Skipping" message is
    printed). Otherwise the filing is cleaned, its type is detected from the
    cleaned text, and the MD&A text or a "not found" placeholder is written
    below a header line.

    Args:
        input_file: Path of the HTML filing.
        output_base: Output path without suffix; "_cleaned.txt" is appended.
    """
    out = output_base + "_cleaned.txt"
    if os.path.exists(out):
        print("Skipping (exists):", out)
        return

    cleaned = clean_html_file(input_file)
    ftype = detect_filing_type(cleaned)

    mda = extract_mda(cleaned, ftype)

    # Explicit UTF-8: the extracted text may contain non-ASCII characters
    # that the platform default encoding cannot write.
    with open(out, "w", encoding="utf-8") as f:
        f.write("===== MD&A =====\n\n")
        f.write(mda if mda else "(MD&A not found)")

    print("✔ Saved MD&A →", out)


###############################################################################
# PART 6 — Batch process directory
###############################################################################

def process_all_files(src, dst):
    """Batch-process every ``.htm`` file below ``src`` into ``dst``.

    The relative directory of each file is reproduced under ``dst`` (created
    when first needed) and process_one_file receives the input path plus the
    output path without extension.
    """
    for walk_dir, _, walk_files in os.walk(src):
        dest_dir = os.path.join(dst, os.path.relpath(walk_dir, src))
        for candidate in walk_files:
            if candidate.lower().endswith(".htm"):
                os.makedirs(dest_dir, exist_ok=True)
                without_ext = candidate.rsplit(".", 1)[0]
                process_one_file(
                    os.path.join(walk_dir, candidate),
                    os.path.join(dest_dir, without_ext),
                )


###############################################################################
# PART 7 — RUN
###############################################################################

if __name__ == "__main__":
    # Run the batch conversion over the whole filings directory.
    process_all_files(
        "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original",
        "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1",
    )


✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2024-Q1-10Q_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2013-Q1-10Q_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2020-Q3-10K_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2017-Q3-10K_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2010-Q4-10Q_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2006-Q3-10K_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2011-Q2-10Q_cleaned.txt
✔ Saved MD&A → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2008-Q2-

KeyboardInterrupt: 

In [57]:
import os
import re
from bs4 import BeautifulSoup

###############################################################################
# PART 1 — CLEAN HTML FILINGS
###############################################################################

def clean_html_file(filepath):
    """Strip HTML, tables, figures, boilerplate and page numbers from a filing.

    Tables, images/figures and script/style elements are removed. Tags whose
    name contains ":" (XBRL / inline XBRL) are unwrapped rather than deleted:
    deleting them would also delete the text they wrap. Stand-alone page
    numbers and a few boilerplate phrases are blanked, then whitespace is
    collapsed.

    Args:
        filepath: Path of the HTML file; read as UTF-8, undecodable bytes are
            ignored.

    Returns:
        The cleaned, stripped text.
    """
    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        raw = f.read()

    soup = BeautifulSoup(raw, "lxml")

    # Remove tables, then images
    for tag_names in (["table", "tr", "td", "th"], ["figure", "img"]):
        for tag in soup.find_all(tag_names):
            tag.decompose()

    # Drop the XBRL tag wrappers but keep their text
    for tag in soup.find_all():
        if tag.name and ":" in tag.name:
            tag.unwrap()

    # Remove script/style
    for t in soup(["script", "style"]):
        t.decompose()

    text = soup.get_text(separator="\n")

    # Remove page numbers on separate lines
    text = re.sub(r"\n\s*\d{1,4}\s*(?=\n)", "\n", text)

    # Remove boilerplate
    boilerplate_patterns = [
        r"United States Securities and Exchange Commission",
        r"Form\s+10-K",
        r"Form\s+10-Q",
        r"TABLE OF CONTENTS",
        # Bounded so that, under DOTALL, a lone "indicate" cannot be paired
        # with a "by checkmark" far away and swallow the text in between;
        # also matches the "check mark" spelling.
        r"Indicate.{0,200}?by\s+check\s?mark",
        r"Yes\s*[xX]\s*No\s*[oO]",
        r"Yes\s*[oO]\s*No\s*[xX]"
    ]
    for bp in boilerplate_patterns:
        text = re.sub(bp, " ", text, flags=re.IGNORECASE | re.DOTALL)

    # Whitespace cleanup
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n\s*\n+", "\n\n", text)

    return text.strip()


###############################################################################
# PART 2 — IMPROVED ITEM REGEX (fixes MD&A)
###############################################################################

ITEM_BOUNDARY = re.compile(
    r"(?<!\w)item[\s\u00A0\.\-–—:]*\d+[a-z]?(?!\w)",
    flags=re.IGNORECASE
)

def item_regex(n, letter=""):
    """Build the regex source for one "Item <n><letter>" heading.

    The pattern is lower-case and tolerates optional whitespace/NBSP, then
    at most one of ``. : - – —``, then optional whitespace/NBSP between
    "item" and the identifier (ITEM 7, ITEM7, ITEM–7, ITEM. 7 ...).
    Lookarounds keep it from matching inside a longer word or number.

    Args:
        n: Item number.
        letter: Optional item letter, e.g. ``"a"``; lower-cased.

    Returns:
        The pattern as a string (not compiled).
    """
    gap = r"[\s\u00A0]*"
    ident = "{}{}".format(n, letter).lower()
    pieces = (r"(?<!\w)", r"item", gap, r"[\.:\-–—]?", gap, ident, r"(?!\w)")
    return "".join(pieces)


# Section label -> heading regex for annual reports (10-K).
TEN_K_SECTIONS = {
    label: item_regex(*item_id)
    for label, item_id in (
        ("Risk_Factors", (1, "a")),
        ("MD&A", (7,)),
        ("Market_Risk", (7, "a")),
    )
}

# Section label -> heading regex for quarterly reports (10-Q).
TEN_Q_SECTIONS = {
    label: item_regex(*item_id)
    for label, item_id in (
        ("Risk_Factors", (1, "a")),
        ("MD&A", (2,)),
        ("Market_Risk", (3,)),
    )
}


###############################################################################
# PART 3 — EXTRACT SECTIONS USING ITEM BOUNDARIES
###############################################################################

def extract_sections(text, patterns):
    """Slice the requested sections out of a cleaned filing.

    For each entry of ``patterns`` the first heading match marks the start
    of the section; the section runs up to the next ``ITEM_BOUNDARY`` match
    (or the end of ``text``). If the "MD&A" heading is not found, a few
    key phrases are tried in order as a fallback start anchor.

    Args:
        text: Cleaned filing text.
        patterns: Mapping of section name -> heading regex (string).

    Returns:
        Dict of section name -> stripped section text, ``""`` when the
        section was not found.
    """
    # Search the original text case-insensitively rather than a lower-cased
    # copy: str.lower() can change the length of some characters, so offsets
    # found in the copy could point at the wrong place in ``text``.
    item_positions = sorted({m.start() for m in ITEM_BOUNDARY.finditer(text)})

    md_keywords = (
        r"management.{0,40}discussion",
        r"md&a",
        r"m d & a",
        r"mda",
    )

    sections = {}

    for name, pattern in patterns.items():
        # --- 1. First try standard ITEM match ---
        m = re.search(pattern, text, flags=re.IGNORECASE)

        # --- 2. If MD&A is missing, fall back to keyphrase search ---
        if name == "MD&A" and not m:
            for kw in md_keywords:
                m = re.search(kw, text, flags=re.IGNORECASE)
                if m:
                    break

        # If still not found, nothing is extracted
        if not m:
            sections[name] = ""
            continue

        start = m.start()

        # The section ends where the next item heading begins
        end = next((p for p in item_positions if p > start), len(text))

        sections[name] = text[start:end].strip()

    return sections



###############################################################################
# PART 4 — DETECT IF FILING IS 10-K OR 10-Q
###############################################################################

def detect_filing_type(text):
    """Guess whether a filing is a 10-K or a 10-Q from its text.

    A filing counts as annual when it contains "form 10-k", or both
    "annual report" and "10-k"; quarterly is the analogous test with
    "10-q" / "quarterly report". When both tests succeed (quarterly reports
    commonly refer back to the annual report on Form 10-K, and vice versa),
    the form whose designation is mentioned first wins, on the assumption
    that a filing names its own form type before citing others.

    If neither test succeeds, an "Item 2" heading implies 10-Q; otherwise
    the default is 10-K.

    Args:
        text: Filing text (any case).

    Returns:
        ``"10-K"`` or ``"10-Q"``.
    """
    t = text.lower()

    looks_annual = "form 10-k" in t or ("annual report" in t and "10-k" in t)
    looks_quarterly = "form 10-q" in t or ("quarterly report" in t and "10-q" in t)

    if looks_annual and looks_quarterly:
        # Both markers are present, so both find() calls return >= 0.
        return "10-K" if t.find("10-k") < t.find("10-q") else "10-Q"
    if looks_annual:
        return "10-K"
    if looks_quarterly:
        return "10-Q"

    # fallback: 10-Q almost always has MD&A = Item 2
    if re.search(item_regex(2), t):
        return "10-Q"

    return "10-K"


###############################################################################
# PART 5 — PROCESS ONE FILE
###############################################################################

def process_one_file(input_file, output_file_base):
    """Clean one filing and write its key sections to a single combined file.

    The output ``<output_file_base>_combined.txt`` holds the non-empty Risk
    Factors, MD&A and Market Risk sections, each under a banner line. It is
    written as UTF-8 explicitly, matching how the input is decoded, so the
    result does not depend on the platform's default encoding.

    Args:
        input_file: Path of the HTML filing.
        output_file_base: Output path without suffix.

    Returns:
        ``(cleaned_text, sections)``, or ``(None, None)`` when the combined
        file already exists and the filing is skipped.
    """
    combined_path = output_file_base + "_combined.txt"

    # Skip if combined exists
    if os.path.exists(combined_path):
        print(f"Skipping (exists): {combined_path}")
        return None, None

    cleaned = clean_html_file(input_file)
    filing_type = detect_filing_type(cleaned)
    patterns = TEN_K_SECTIONS if filing_type == "10-K" else TEN_Q_SECTIONS

    sections = extract_sections(cleaned, patterns)

    # Build combined output (ONLY this file)
    with open(combined_path, "w", encoding="utf-8") as combined:
        rf = sections.get("Risk_Factors", "").strip()
        if rf:
            combined.write("===== RISK FACTORS =====\n\n")
            combined.write(rf + "\n\n")

        mda = sections.get("MD&A", "").strip()
        if mda:
            combined.write("===== MD&A =====\n\n")
            combined.write(mda + "\n\n")
        else:
            print(f"⚠ MD&A NOT FOUND for {input_file}")

        mr = sections.get("Market_Risk", "").strip()
        if mr:
            combined.write("===== MARKET RISK =====\n\n")
            combined.write(mr + "\n")

    print(f"✔ Extracted & Combined → {combined_path}")

    return cleaned, sections


###############################################################################
# PART 6 — PROCESS ALL FILINGS IN A DIRECTORY
###############################################################################

def process_all_files(src_folder, dst_folder):
    """Run ``process_one_file`` on every ``.htm`` file below ``src_folder``.

    The sub-directory layout of ``src_folder`` is recreated under
    ``dst_folder``; output directories are only created for directories
    that actually contain a ``.htm`` file (extension check is
    case-insensitive).
    """
    for current_dir, _subdirs, filenames in os.walk(src_folder):
        for filename in filenames:
            if not filename.lower().endswith(".htm"):
                continue

            source_path = os.path.join(current_dir, filename)

            # Mirror the source directory below the destination root
            target_dir = os.path.join(
                dst_folder, os.path.relpath(current_dir, src_folder)
            )
            os.makedirs(target_dir, exist_ok=True)

            # File name without its final extension
            stem = filename.rpartition(".")[0]

            print(f"Processing: {source_path}")
            process_one_file(source_path, os.path.join(target_dir, stem))


###############################################################################
# PART 7 — RUN PIPELINE
###############################################################################

if __name__ == "__main__":
    # Source tree of raw filings and destination tree for extracted text.
    src_folder, dst_folder = (
        "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original",
        "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1",
    )

    process_all_files(src_folder=src_folder, dst_folder=dst_folder)


Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2024-Q1-10Q.htm
✔ Extracted & Combined → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2024-Q1-10Q_combined.txt
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2013-Q1-10Q.htm
✔ Extracted & Combined → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2013-Q1-10Q_combined.txt
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2020-Q3-10K.htm
✔ Extracted & Combined → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2020-Q3-10K_combined.txt
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2017-Q3-10K.htm
✔ Extracted & Combined → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2017-Q3-1

KeyboardInterrupt: 

In [45]:
import os
import re
from bs4 import BeautifulSoup

###############################################################################
# PART 1 — CLEAN HTML FILINGS
###############################################################################

def clean_html_file(filepath):
    """Return the narrative text of an HTML filing.

    Removes tables, figures/images, scripts, styles, stand-alone page
    numbers, checkbox markers and a few SEC boilerplate phrases, then
    normalises whitespace.

    Inline XBRL wrappers (``<ix:...>`` and other namespaced tags) are
    unwrapped rather than deleted, because they enclose ordinary narrative
    text that would otherwise vanish from the output. Only ``<ix:header>``,
    the non-displayed metadata block, is removed with its contents.

    Args:
        filepath: Path of the filing; decoded as UTF-8, undecodable bytes
            are ignored.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        raw = f.read()

    soup = BeautifulSoup(raw, "lxml")

    # Remove full tables (financial statements)
    for tag in soup.find_all(["table", "tr", "td", "th"]):
        tag.decompose()

    # Remove images/figures
    for tag in soup.find_all(["figure", "img"]):
        tag.decompose()

    # Remove script and style
    for tag in soup(["script", "style"]):
        tag.decompose()

    # Drop the hidden Inline XBRL header (contexts, units, ...)
    for tag in soup.find_all():
        if tag.name and tag.name.lower() == "ix:header":
            tag.decompose()

    # Strip remaining namespaced tags (<ix:...>, <xbrli:...>) but keep
    # their children so the wrapped text survives
    for tag in soup.find_all():
        if tag.name and ":" in tag.name:
            tag.unwrap()

    text = soup.get_text(separator="\n")

    # Remove page numbers like “12” or “33” on separate lines
    text = re.sub(r"\n\s*\d{1,4}\s*(?=\n)", "\n", text)

    # Remove SEC boilerplate
    boilerplate_patterns = [
        r"United States Securities and Exchange Commission",
        r"Form\s+10-K",
        r"Form\s+10-Q",
        r"TABLE OF CONTENTS",
        r"Indicate.*?by checkmark",
        r"Yes\s*[xX]\s*No\s*[oO]",
        r"Yes\s*[oO]\s*No\s*[xX]"
    ]
    for bp in boilerplate_patterns:
        text = re.sub(bp, " ", text, flags=re.IGNORECASE | re.DOTALL)

    # Clean whitespace
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n\s*\n+", "\n\n", text)

    return text.strip()


###############################################################################
# PART 2 — FLEXIBLE ITEM HEADER REGEX
###############################################################################

# Generic "Item <digits>[letter]" heading; used to find where a section ends.
ITEM_BOUNDARY = re.compile(
    r"(?<!\w)item[\s\u00A0\.\-–—:]*\d+[a-z]?(?!\w)", flags=re.I
)

def item_regex(n, letter=""):
    """Return the regex source matching an "Item <n><letter>" heading.

    The pattern is lower-case; any run of whitespace, NBSP, dots, dashes or
    colons may separate "item" from the identifier, and lookarounds stop it
    from matching inside a longer word or number.
    """
    item_id = (str(n) + str(letter)).lower()
    return rf"(?<!\w)item[\s\u00A0\.\-–—:]*{item_id}(?!\w)"


# 10-K: section label -> heading regex.
TEN_K_SECTIONS = {
    label: item_regex(*item_id)
    for label, item_id in (
        ("Risk_Factors", (1, "a")),
        ("MD&A", (7,)),
        ("Market_Risk", (7, "a")),
    )
}

# 10-Q: section label -> heading regex.
TEN_Q_SECTIONS = {
    label: item_regex(*item_id)
    for label, item_id in (
        ("Risk_Factors", (1, "a")),
        ("MD&A", (2,)),
        ("Market_Risk", (3,)),
    )
}


###############################################################################
# PART 3 — EXTRACT SECTIONS
###############################################################################

def extract_sections(text, patterns):
    """Slice the requested sections out of a cleaned filing.

    The first match of each heading pattern marks a section start; the
    section ends at the next ``ITEM_BOUNDARY`` match or at the end of
    ``text``.

    Args:
        text: Cleaned filing text.
        patterns: Mapping of section name -> heading regex (string).

    Returns:
        Dict of section name -> stripped section text (``""`` if missing).
    """
    # Match against the original text with IGNORECASE instead of a
    # lower-cased copy: str.lower() can change the length of some
    # characters, which would shift offsets relative to ``text``.
    item_positions = sorted({m.start() for m in ITEM_BOUNDARY.finditer(text)})
    sections = {}

    for name, pattern in patterns.items():
        m = re.search(pattern, text, flags=re.IGNORECASE)
        if not m:
            sections[name] = ""
            continue

        start = m.start()

        # First item heading strictly after the start, else end of text
        end = next((p for p in item_positions if p > start), len(text))

        sections[name] = text[start:end].strip()

    return sections


###############################################################################
# PART 4 — DETERMINE FILING TYPE
###############################################################################

def detect_filing_type(text):
    """Guess whether a filing is a 10-K or a 10-Q from its text.

    Annual: contains "form 10-k", or both "annual report" and "10-k".
    Quarterly: the same test with "10-q" / "quarterly report". When both
    hold (filings commonly cite each other's form types), the designation
    mentioned first wins, assuming a filing names its own form type before
    citing others. With neither, an "Item 2" heading implies 10-Q and the
    default is 10-K.

    Args:
        text: Filing text (any case).

    Returns:
        ``"10-K"`` or ``"10-Q"``.
    """
    t = text.lower()

    looks_annual = "form 10-k" in t or ("annual report" in t and "10-k" in t)
    looks_quarterly = "form 10-q" in t or ("quarterly report" in t and "10-q" in t)

    if looks_annual and looks_quarterly:
        # Both markers exist here, so neither find() returns -1.
        return "10-K" if t.find("10-k") < t.find("10-q") else "10-Q"
    if looks_annual:
        return "10-K"
    if looks_quarterly:
        return "10-Q"

    # fallback: 10-Q MD&A = Item 2
    if re.search(item_regex(2), t):
        return "10-Q"

    return "10-K"


###############################################################################
# PART 5 — PROCESS ONE FILE
###############################################################################

def process_one_file(input_file, output_file_base):
    """Clean one filing and write cleaned text, section files and a combined file.

    Outputs, all written as UTF-8 (matching how the input is decoded, so
    the result does not depend on the platform's default encoding):
      * ``<base>_cleaned.txt``            - full cleaned text
      * ``<base>_sections/<name>.txt``    - one file per extracted section
      * ``<base>_sections_combined.txt``  - non-empty Risk Factors, MD&A and
        Market Risk, in that order

    Args:
        input_file: Path of the HTML filing.
        output_file_base: Output path without suffix.

    Returns:
        ``(cleaned_text, sections)``, or ``(None, None)`` when the cleaned
        file already exists and the filing is skipped.
    """
    # Skip if cleaned already exists
    cleaned_path = output_file_base + "_cleaned.txt"
    if os.path.exists(cleaned_path):
        print(f"Skipping (already exists): {cleaned_path}")
        return None, None

    cleaned = clean_html_file(input_file)

    with open(cleaned_path, "w", encoding="utf-8") as f:
        f.write(cleaned)

    filing_type = detect_filing_type(cleaned)
    patterns = TEN_K_SECTIONS if filing_type == "10-K" else TEN_Q_SECTIONS

    sections = extract_sections(cleaned, patterns)

    # Save each extracted section
    section_dir = output_file_base + "_sections"
    os.makedirs(section_dir, exist_ok=True)

    for name, content in sections.items():
        sec_path = os.path.join(section_dir, f"{name}.txt")
        with open(sec_path, "w", encoding="utf-8") as f:
            f.write(content)

    # -------------------------------------------
    # BUILD COMBINED: just append the three parts
    # -------------------------------------------
    combined_path = output_file_base + "_sections_combined.txt"
    with open(combined_path, "w", encoding="utf-8") as combined:
        rf = sections.get("Risk_Factors", "").strip()
        if rf:
            combined.write(rf + "\n\n")

        mda = sections.get("MD&A", "").strip()
        if mda:
            combined.write(mda + "\n\n")

        mr = sections.get("Market_Risk", "").strip()
        if mr:
            combined.write(mr)

    return cleaned, sections


###############################################################################
# PART 6 — WALK AND PROCESS ALL FILINGS
###############################################################################

def process_all_files(src_folder, dst_folder):
    """Walk ``src_folder`` and process every ``.htm`` filing found.

    Output goes under ``dst_folder`` in the same relative sub-directory as
    the input; the extension check is case-insensitive.
    """
    for current_dir, _subdirs, filenames in os.walk(src_folder):
        htm_names = [fn for fn in filenames if fn.lower().endswith(".htm")]
        for filename in htm_names:
            source_path = os.path.join(current_dir, filename)

            relative_dir = os.path.relpath(current_dir, src_folder)
            target_dir = os.path.join(dst_folder, relative_dir)
            os.makedirs(target_dir, exist_ok=True)

            # Output base = file name without its final extension
            base = os.path.join(target_dir, filename.rpartition(".")[0])

            print(f"Processing: {source_path}")
            process_one_file(source_path, base)


###############################################################################
# PART 7 — RUN
###############################################################################

if __name__ == "__main__":
    # Raw filings in, extracted text out.
    src_folder, dst_folder = (
        "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original",
        "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1",
    )

    process_all_files(src_folder=src_folder, dst_folder=dst_folder)


Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2024-Q1-10Q.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2013-Q1-10Q.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2020-Q3-10K.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2017-Q3-10K.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2010-Q4-10Q.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2006-Q3-10K.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2011-Q2-10Q.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2008-Q2-10Q.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggreg

KeyboardInterrupt: 

In [None]:
import os
import re
from bs4 import BeautifulSoup

###############################################################################
# PART 1 — CLEAN HTML FILINGS
###############################################################################

def clean_html_file(filepath):
    """Return the narrative text of an HTML filing.

    Removes tables, figures/images, scripts, styles, stand-alone page
    numbers, checkbox markers and a few SEC boilerplate phrases, then
    normalises whitespace.

    Namespaced Inline XBRL tags are unwrapped rather than deleted: they
    wrap ordinary narrative text, and deleting them removes that text from
    the output. Only ``<ix:header>``, the non-displayed metadata block, is
    removed together with its contents.

    Args:
        filepath: Path of the filing; decoded as UTF-8, undecodable bytes
            are ignored.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        raw = f.read()

    soup = BeautifulSoup(raw, "lxml")

    # Remove full tables (financial statements)
    for tag in soup.find_all(["table", "tr", "td", "th"]):
        tag.decompose()

    # Remove images/figures
    for tag in soup.find_all(["figure", "img"]):
        tag.decompose()

    # Remove script and style
    for tag in soup(["script", "style"]):
        tag.decompose()

    # Drop the hidden Inline XBRL header (contexts, units, ...)
    for tag in soup.find_all():
        if tag.name and tag.name.lower() == "ix:header":
            tag.decompose()

    # Strip remaining namespaced tags (<ix:...>, <xbrli:...>) while keeping
    # the text they wrap
    for tag in soup.find_all():
        if tag.name and ":" in tag.name:
            tag.unwrap()

    text = soup.get_text(separator="\n")

    # Remove page numbers (digits alone on a line)
    text = re.sub(r"\n\s*\d{1,4}\s*(?=\n)", "\n", text)

    # Remove SEC boilerplate
    boilerplate_patterns = [
        r"United States Securities and Exchange Commission",
        r"Form\s+10-K",
        r"Form\s+10-Q",
        r"TABLE OF CONTENTS",
        r"Indicate.*?by checkmark",
        r"Yes\s*[xX]\s*No\s*[oO]",
        r"Yes\s*[oO]\s*No\s*[xX]"
    ]
    for bp in boilerplate_patterns:
        text = re.sub(bp, " ", text, flags=re.IGNORECASE | re.DOTALL)

    # Clean whitespace
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n\s*\n+", "\n\n", text)

    return text.strip()


###############################################################################
# PART 2 — FLEXIBLE ITEM HEADER REGEX
###############################################################################

# Matches every "Item <digits>[letter]" heading regardless of case; the
# match positions serve as section end markers.
_ITEM_BOUNDARY_SOURCE = r"(?<!\w)item[\s\u00A0\.\-–—:]*\d+[a-z]?(?!\w)"
ITEM_BOUNDARY = re.compile(_ITEM_BOUNDARY_SOURCE, re.IGNORECASE)

def item_regex(n, letter=""):
    """Return the regex source for the heading of item ``n`` + ``letter``.

    "item" and the lower-cased identifier may be separated by any run of
    whitespace, NBSP, dots, dashes or colons; lookarounds prevent matches
    inside longer words or numbers.
    """
    template = r"(?<!\w)item[\s\u00A0\.\-–—:]*%s(?!\w)"
    return template % f"{n}{letter}".lower()


# Heading regex per section for 10-K filings.
TEN_K_SECTIONS = {
    label: item_regex(*item_id)
    for label, item_id in (
        ("Risk_Factors", (1, "a")),
        ("MD&A", (7,)),
        ("Market_Risk", (7, "a")),
    )
}

# Heading regex per section for 10-Q filings.
TEN_Q_SECTIONS = {
    label: item_regex(*item_id)
    for label, item_id in (
        ("Risk_Factors", (1, "a")),
        ("MD&A", (2,)),
        ("Market_Risk", (3,)),
    )
}

###############################################################################
# PART 3 — EXTRACT SECTIONS
###############################################################################

def extract_sections(text, patterns):
    """Slice the requested sections out of a cleaned filing.

    Each section starts at the first match of its heading pattern and ends
    at the next ``ITEM_BOUNDARY`` match (or the end of ``text``).

    Args:
        text: Cleaned filing text.
        patterns: Mapping of section name -> heading regex (string).

    Returns:
        Dict of section name -> stripped section text (``""`` if missing).
    """
    # Offsets must come from ``text`` itself: a lower-cased copy can differ
    # in length (str.lower() expands some characters), which would misalign
    # the slices below. IGNORECASE gives the same matching without a copy.
    item_positions = sorted({m.start() for m in ITEM_BOUNDARY.finditer(text)})

    sections = {}

    for name, pattern in patterns.items():
        m = re.search(pattern, text, flags=re.IGNORECASE)
        if not m:
            sections[name] = ""
            continue

        start = m.start()
        end = next((p for p in item_positions if p > start), len(text))

        sections[name] = text[start:end].strip()

    return sections


###############################################################################
# PART 4 — DETERMINE FILING TYPE (10-K or 10-Q)
###############################################################################

def detect_filing_type(text):
    """Guess whether a filing is a 10-K or a 10-Q from its text.

    Annual: contains "form 10-k", or both "annual report" and "10-k".
    Quarterly: the same test with "10-q" / "quarterly report". If both
    hold (filings commonly cite each other's form types), whichever
    designation is mentioned first wins, assuming a filing names its own
    form type before citing others. If neither holds, an "Item 2" heading
    implies 10-Q; the default is 10-K.

    Args:
        text: Filing text (any case).

    Returns:
        ``"10-K"`` or ``"10-Q"``.
    """
    t = text.lower()

    looks_annual = "form 10-k" in t or ("annual report" in t and "10-k" in t)
    looks_quarterly = "form 10-q" in t or ("quarterly report" in t and "10-q" in t)

    if looks_annual and looks_quarterly:
        # Both markers exist here, so neither find() returns -1.
        return "10-K" if t.find("10-k") < t.find("10-q") else "10-Q"
    if looks_annual:
        return "10-K"
    if looks_quarterly:
        return "10-Q"

    # fallback: 10-Q MD&A = Item 2
    if re.search(item_regex(2), t):
        return "10-Q"

    return "10-K"


###############################################################################
# PART 5 — PROCESS ONE FILE
###############################################################################

def process_one_file(input_file, output_file_base):
    """Clean one filing and write cleaned text, section files and a combined file.

    Outputs, all written as UTF-8 (matching how the input is decoded, so
    the result does not depend on the platform's default encoding):
      * ``<base>_cleaned.txt``            - full cleaned text
      * ``<base>_sections/<name>.txt``    - one file per section
      * ``<base>_sections_combined.txt``  - every section, each preceded by
        a ``===== <name> =====`` banner

    Args:
        input_file: Path of the HTML filing.
        output_file_base: Output path without suffix.

    Returns:
        ``(cleaned_text, sections)``, or ``(None, None)`` when the cleaned
        file already exists and the filing is skipped.
    """
    # ---------------------------------------------------
    # Skip if output already exists
    # ---------------------------------------------------
    cleaned_path = output_file_base + "_cleaned.txt"
    if os.path.exists(cleaned_path):
        print(f"Skipping (already exists): {cleaned_path}")
        return None, None
    # ---------------------------------------------------

    cleaned = clean_html_file(input_file)

    with open(cleaned_path, "w", encoding="utf-8") as f:
        f.write(cleaned)

    filing_type = detect_filing_type(cleaned)
    patterns = TEN_K_SECTIONS if filing_type == "10-K" else TEN_Q_SECTIONS

    sections = extract_sections(cleaned, patterns)

    section_dir = output_file_base + "_sections"
    os.makedirs(section_dir, exist_ok=True)

    combined_path = output_file_base + "_sections_combined.txt"
    with open(combined_path, "w", encoding="utf-8") as combined:
        for name, content in sections.items():
            sec_path = os.path.join(section_dir, f"{name}.txt")
            with open(sec_path, "w", encoding="utf-8") as f:
                f.write(content)
            combined.write(f"\n\n===== {name} =====\n\n")
            combined.write(content)

    return cleaned, sections


###############################################################################
# PART 6 — WALK DIRECTORY AND PROCESS ALL FILINGS
###############################################################################

def process_all_files(src_folder, dst_folder):
    """Process every ``.htm`` filing under ``src_folder``.

    Each filing's outputs are placed under ``dst_folder`` at the same
    relative location; the extension is matched case-insensitively.
    """
    for current_dir, _subdirs, filenames in os.walk(src_folder):
        # Same for every file in this directory
        target_dir = os.path.join(
            dst_folder, os.path.relpath(current_dir, src_folder)
        )
        for filename in filenames:
            if not filename.lower().endswith(".htm"):
                continue

            source_path = os.path.join(current_dir, filename)

            # Created lazily so directories without filings are not mirrored
            os.makedirs(target_dir, exist_ok=True)

            stem = filename.rpartition(".")[0]

            print(f"Processing: {source_path}")
            process_one_file(source_path, os.path.join(target_dir, stem))


###############################################################################
# PART 7 — RUN
###############################################################################

if __name__ == "__main__":
    # Input tree of raw filings / output tree for the extracted text.
    src_folder, dst_folder = (
        "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original",
        "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1",
    )

    process_all_files(src_folder=src_folder, dst_folder=dst_folder)


Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2024-Q1-10Q.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2013-Q1-10Q.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2020-Q3-10K.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2017-Q3-10K.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2010-Q4-10Q.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2006-Q3-10K.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2011-Q2-10Q.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2008-Q2-10Q.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggreg

KeyboardInterrupt: 

In [39]:
import os
import re
from bs4 import BeautifulSoup

###############################################################################
# PART 1 — CLEAN HTML FILINGS
###############################################################################

def clean_html_file(filepath):
    """Return the narrative text of an HTML filing.

    Removes tables, figures/images, scripts, styles, checkbox markers and a
    few SEC boilerplate phrases, then normalises whitespace.

    Namespaced Inline XBRL tags are unwrapped rather than deleted: they
    wrap ordinary narrative text, and deleting them removes that text from
    the output. Only ``<ix:header>``, the non-displayed metadata block, is
    removed together with its contents.

    Args:
        filepath: Path of the filing; decoded as UTF-8, undecodable bytes
            are ignored.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        raw = f.read()

    soup = BeautifulSoup(raw, "lxml")

    # Remove tables + their contents
    for tag in soup.find_all(["table", "tr", "td", "th"]):
        tag.decompose()

    # Remove images/figures
    for tag in soup.find_all(["figure", "img"]):
        tag.decompose()

    # Remove script/style
    for tag in soup(["script", "style"]):
        tag.decompose()

    # Drop the hidden Inline XBRL header (contexts, units, ...)
    for tag in soup.find_all():
        if tag.name and tag.name.lower() == "ix:header":
            tag.decompose()

    # Strip remaining namespaced tags (<ix:...>, <xbrl:...>, etc.) while
    # keeping the text they wrap
    for tag in soup.find_all():
        if tag.name and ":" in tag.name:
            tag.unwrap()

    text = soup.get_text(separator="\n")

    # Drop SEC boilerplate/checkboxes
    boilerplate_patterns = [
        r"United States Securities and Exchange Commission",
        r"Form\s+10-K",
        r"Form\s+10-Q",
        r"TABLE OF CONTENTS",
        r"Indicate.*?by checkmark",
        r"Yes\s*[xX]\s*No\s*[oO]",
        r"Yes\s*[oO]\s*No\s*[xX]"
    ]
    for bp in boilerplate_patterns:
        text = re.sub(bp, " ", text, flags=re.IGNORECASE | re.DOTALL)

    # Normalize whitespace
    text = re.sub(r"\n\s*\n+", "\n\n", text)
    text = re.sub(r"[ \t]+", " ", text)

    return text.strip()


###############################################################################
# PART 2 — REGEX FOR ITEM HEADERS (COVERS ALL VARIATIONS)
###############################################################################

def item_regex(n, letter=""):
    r"""Build a lower-case regex source for the heading "Item <n><letter>".

    Matches, against lower-cased text, forms such as:
       item 7
       item 7.
       item 7a —
       item 7 – md&a
       item\xa07      (non-breaking space)

    The heading is delimited with lookarounds instead of a mandatory
    trailing separator, so it is also found at the very end of the text or
    before characters such as "," and ")", and it is not found inside a
    longer word ("lineitem 1") or number ("item 10" for n=1).

    Args:
        n: Item number.
        letter: Optional item letter, e.g. ``"a"``; lower-cased.

    Returns:
        The pattern as a string (not compiled).
    """
    item_id = f"{n}{letter}".lower()

    return (
        r"(?<!\w)"                      # not in the middle of a word
        r"item[\s\u00A0\.\-–—:]*"       # "item" + punctuation/nbsp
        + item_id +
        r"(?!\w)"                       # identifier must end here
    )


# Important sections to extract
# 10-K: section label -> heading regex.
TEN_K_SECTIONS = {
    label: item_regex(*item_id)
    for label, item_id in (
        ("Business", (1,)),
        ("Risk_Factors", (1, "a")),
        ("MD&A", (7,)),
        ("Market_Risk", (7, "a")),
        ("Financial_Statements", (8,)),
    )
}

# 10-Q: section label -> heading regex.
TEN_Q_SECTIONS = {
    label: item_regex(*item_id)
    for label, item_id in (
        ("Risk_Factors", (1, "a")),
        ("MD&A", (2,)),
        ("Market_Risk", (3,)),
        ("Controls_Procedures", (4,)),
    )
}


###############################################################################
# PART 3 — EXTRACT SECTIONS BASED ON ITEM HEADINGS
###############################################################################

def extract_sections(text, patterns):
    """Extract sections from cleaned text using flexible ITEM regexes.

    Each section starts at the first match of its heading pattern and ends
    at the next "item <number>" occurrence (or the end of ``text``).

    Args:
        text: Cleaned filing text.
        patterns: Mapping of section name -> heading regex (string).

    Returns:
        Dict of section name -> stripped section text (``""`` if missing).
    """
    sections = {}

    # Match against the original text with IGNORECASE instead of a
    # lower-cased copy: str.lower() can change the length of some
    # characters, which would shift every offset relative to ``text``.
    item_positions = sorted({
        match.start()
        for match in re.finditer(
            r"item[\s\u00A0\.\-–—:]*\d+[a-z]?", text, flags=re.IGNORECASE
        )
    })

    for name, pattern in patterns.items():
        m = re.search(pattern, text, flags=re.IGNORECASE)
        if not m:
            sections[name] = ""
            continue

        start = m.start()

        # Find next item after this one
        end = next((p for p in item_positions if p > start), len(text))

        sections[name] = text[start:end].strip()

    return sections


###############################################################################
# PART 4 — DETERMINE IF FILING IS 10-K OR 10-Q
###############################################################################

def detect_filing_type(text):
    """Return '10-K' or '10-Q' based on keywords in the text.

    Annual: contains "form 10-k", or both "annual report" and "10-k".
    Quarterly: the same test with "10-q" / "quarterly report". If both
    hold (filings commonly cite each other's form types), whichever
    designation is mentioned first wins, assuming a filing names its own
    form type before citing others. If neither holds, an "Item 2" heading
    implies 10-Q; the default is 10-K.
    """
    t = text.lower()

    looks_annual = "form 10-k" in t or ("annual report" in t and "10-k" in t)
    looks_quarterly = "form 10-q" in t or ("quarterly report" in t and "10-q" in t)

    if looks_annual and looks_quarterly:
        # Both markers exist here, so neither find() returns -1.
        return "10-K" if t.find("10-k") < t.find("10-q") else "10-Q"
    if looks_annual:
        return "10-K"
    if looks_quarterly:
        return "10-Q"

    # fallback: Q filings have Item 2 MD&A instead of Item 7
    if re.search(item_regex(2), t):
        return "10-Q"

    return "10-K"   # default


###############################################################################
# PART 5 — PROCESS A SINGLE FILE
###############################################################################

def process_one_file(input_file, output_file_base):
    """Clean one filing and write cleaned text, section files and a combined file.

    Outputs, all written as UTF-8 (matching how the input is decoded, so
    the result does not depend on the platform's default encoding):
      * ``<base>_cleaned.txt``            - full cleaned text
      * ``<base>_sections/<name>.txt``    - one file per section
      * ``<base>_sections_combined.txt``  - every section, each preceded by
        a ``===== <name> =====`` banner

    Args:
        input_file: Path of the HTML filing.
        output_file_base: Output path without suffix.

    Returns:
        ``(cleaned_text, sections)``.
    """
    cleaned = clean_html_file(input_file)

    with open(output_file_base + "_cleaned.txt", "w", encoding="utf-8") as f:
        f.write(cleaned)

    filing_type = detect_filing_type(cleaned)

    if filing_type == "10-K":
        patterns = TEN_K_SECTIONS
    else:
        patterns = TEN_Q_SECTIONS

    sections = extract_sections(cleaned, patterns)

    # Save each section
    section_dir = output_file_base + "_sections"
    os.makedirs(section_dir, exist_ok=True)
    combined_path = output_file_base + "_sections_combined.txt"

    with open(combined_path, "w", encoding="utf-8") as combined:
        for name, content in sections.items():
            sec_path = os.path.join(section_dir, f"{name}.txt")
            with open(sec_path, "w", encoding="utf-8") as f:
                f.write(content)

            combined.write(f"\n\n===== {name} =====\n\n")
            combined.write(content)

    return cleaned, sections


###############################################################################
# PART 6 — WALK ALL SUBFOLDERS AND PROCESS EVERYTHING
###############################################################################

def process_all_files(src_folder, dst_folder):
    """Process every ``.htm`` filing found anywhere below ``src_folder``.

    Outputs land under ``dst_folder`` in the same relative sub-directory
    as the input; the extension is matched case-insensitively.
    """
    for current_dir, _subdirs, filenames in os.walk(src_folder):
        for filename in filenames:
            is_filing = filename.lower().endswith(".htm")
            if not is_filing:
                continue

            source_path = os.path.join(current_dir, filename)

            # Same relative location under the destination root
            relative_dir = os.path.relpath(current_dir, src_folder)
            target_dir = os.path.join(dst_folder, relative_dir)
            os.makedirs(target_dir, exist_ok=True)

            stem, _dot, _ext = filename.rpartition(".")

            print(f"Processing: {source_path}")
            process_one_file(source_path, os.path.join(target_dir, stem))


###############################################################################
# PART 7 — RUN THE PIPELINE (YOUR PATHS)
###############################################################################

if __name__ == "__main__":
    # Where the raw filings live and where the extracted text is written.
    src_folder, dst_folder = (
        "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original",
        "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1",
    )

    process_all_files(src_folder=src_folder, dst_folder=dst_folder)


Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2024-Q1-10Q.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2013-Q1-10Q.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2020-Q3-10K.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2017-Q3-10K.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2010-Q4-10Q.htm


KeyboardInterrupt: 

In [None]:
# some sort of section action
import os
import re
from bs4 import BeautifulSoup

# ============================================================
#                HELPERS TO LOAD & CLEAN HTML
# ============================================================

def load_html(path):
    """Return the contents of ``path`` decoded as UTF-8.

    Bytes that are not valid UTF-8 are silently dropped.
    """
    with open(path, mode="r", encoding="utf-8", errors="ignore") as source:
        contents = source.read()
    return contents

def remove_sec_header(text):
    """Remove the SEC cover-page header from the start of a filing.

    Deletes everything from the first "UNITED STATES SECURITIES AND EXCHANGE
    COMMISSION" up to (not including) the first following "Table of
    Contents", "PART I" or "Item 1" marker. Matching is case-insensitive.

    Only the first occurrence is removed, so later mentions of the
    Commission in the body cannot delete body text. The "PART I" and
    "Item 1" markers must end at a word boundary, so phrases such as
    "part in", "PART II" or "Item 1A" do not count as the end marker.

    Args:
        text: Filing text.

    Returns:
        The text without the header; unchanged if no header is found.
    """
    pattern = (
        r"UNITED STATES SECURITIES AND EXCHANGE COMMISSION"
        r"(.+?)"
        r"(Table of Contents|PART I\b|Item 1\b\.?)"
    )
    # \2 keeps the end marker itself in the output.
    return re.sub(pattern, r"\2", text, count=1, flags=re.DOTALL | re.IGNORECASE)


def remove_tables(soup):
    """Delete every ``<table>`` element from ``soup`` in place and return it."""
    all_tables = soup.find_all("table")
    for node in all_tables:
        node.decompose()
    return soup


def html_to_clean_text(html):
    """Turn an HTML document into plain text.

    Scripts, styles and tables are removed; remaining text nodes are joined
    with newlines. Runs of blank lines collapse to a single blank line,
    runs of spaces/tabs to a single space, and the result is stripped.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Drop non-content elements first
    for unwanted in soup(["script", "style"]):
        unwanted.decompose()
    soup = remove_tables(soup)

    flattened = soup.get_text(separator="\n")

    # Whitespace normalisation: blank lines first, then spaces/tabs
    for pattern, replacement in ((r"\n\s*\n+", "\n\n"), (r"[ \t]+", " ")):
        flattened = re.sub(pattern, replacement, flattened)

    return flattened.strip()


# ============================================================
#                 SECTION EXTRACTION (Items)
# ============================================================

# Regex for typical 10-K and 10-Q section headings
# Section label -> heading regex. Every pattern is "item", optional
# whitespace, then the item identifier (most followed by one character that
# must not continue the identifier).
SECTION_PATTERNS = {
    label: r"item\s*" + tail
    for label, tail in (
        ("Item 1", r"1[^A-Za-z0-9]"),
        ("Item 1A Risk Factors", r"1a"),
        ("Item 2 MD&A", r"2[^0-9a-z]"),
        ("Item 3", r"3[^0-9a-z]"),
        ("Item 4", r"4[^0-9a-z]"),
        ("Item 5", r"5[^0-9a-z]"),
        ("Item 6", r"6[^0-9a-z]"),
        ("Item 7 MD&A (10-K)", r"7[^0-9a-z]"),
        ("Item 7A", r"7a"),
        ("Item 8 Financial Statements", r"8[^0-9a-z]"),
        ("Item 9", r"9[^0-9a-z]"),
    )
}

def extract_sections(text):
    """Split a filing into sections keyed by ``SECTION_PATTERNS`` labels.

    Every heading occurrence starts a chunk that runs to the next heading
    occurrence of any kind (or the end of ``text``). Chunks of 20 characters
    or fewer are discarded. When a heading occurs several times, the last
    kept chunk for that label wins.

    Args:
        text: Cleaned filing text.

    Returns:
        Dict such as ``{"Item 2 MD&A": "...", "Item 1A Risk Factors": "..."}``;
        empty when no heading is found.
    """
    # Match on the original text (case-insensitively) rather than on a
    # lower-cased copy: str.lower() can change the length of some
    # characters, which would misalign the offsets used to slice ``text``.
    positions = sorted(
        (match.start(), sec_name)
        for sec_name, pattern in SECTION_PATTERNS.items()
        for match in re.finditer(pattern, text, flags=re.IGNORECASE)
    )

    if not positions:
        return {}

    # Extract text chunks between consecutive headings
    sections = {}
    for idx, (start_pos, sec_name) in enumerate(positions):
        end_pos = positions[idx + 1][0] if idx + 1 < len(positions) else len(text)
        section_text = text[start_pos:end_pos].strip()

        # Skip chunks too short to be a real section
        if len(section_text) > 20:
            sections[sec_name] = section_text

    return sections


# ============================================================
#                    WRITE OUTPUT FILES
# ============================================================

def write_clean_text(dst_path, text):
    """Save ``text`` to ``dst_path`` as UTF-8, replacing any existing file."""
    with open(dst_path, mode="w", encoding="utf-8") as handle:
        handle.write(text)

def write_section_files(base_folder, sections):
    """
    Dump every section into ``<base_folder>/sections/<name>.txt``.

    The file name is the section label with spaces and forward slashes
    replaced by underscores. The ``sections`` directory is created if
    missing; existing files with the same name are overwritten.
    """
    section_dir = os.path.join(base_folder, "sections")
    os.makedirs(section_dir, exist_ok=True)

    # One translation table handles both replacements in a single pass.
    to_underscore = str.maketrans(" /", "__")

    for sec_name in sections:
        target = os.path.join(section_dir, f"{sec_name.translate(to_underscore)}.txt")
        with open(target, "w", encoding="utf-8") as handle:
            handle.write(sections[sec_name])


# ============================================================
#                  PIPELINE FOR A SINGLE FILE
# ============================================================

def process_file(src_file, dst_folder):
    """
    End-to-end pipeline for one HTML file:
      1. Load HTML
      2. Convert to clean text
      3. Remove SEC header
      4. Extract sections
      5. Save outputs

    Outputs (``<stem>`` is the source file name without its extension):
      * ``<dst_folder>/<stem>_cleaned.txt``      - full cleaned text
      * ``<dst_folder>/<stem>/sections/*.txt``   - one file per section

    The section files live in a per-filing sub-folder because their names
    depend only on the section label; writing them straight into
    ``dst_folder`` would let every filing in the folder overwrite the
    previous filing's sections.
    """

    html = load_html(src_file)
    text = html_to_clean_text(html)
    text = remove_sec_header(text)

    # splitext handles ".htm", ".HTM", ".html" and names that contain
    # ".htm" elsewhere, all of which a plain str.replace gets wrong.
    stem = os.path.splitext(os.path.basename(src_file))[0]
    dst_file = os.path.join(dst_folder, f"{stem}_cleaned.txt")

    # Save cleaned full text
    write_clean_text(dst_file, text)

    # Extract sections (Item 1A, MD&A, etc.) into a per-filing folder
    sections = extract_sections(text)
    write_section_files(os.path.join(dst_folder, stem), sections)


# ============================================================
#             RUN ON ALL FILINGS IN A FOLDER
# ============================================================

def process_all_files(src_folder, dst_folder):
    """
    Recursively process every ``.htm`` file (extension matched
    case-insensitively) below ``src_folder``.

    Each file is handed to ``process_file`` together with a destination
    directory that mirrors the file's location relative to ``src_folder``;
    that directory is created on demand.
    """
    for current_dir, _subdirs, names in os.walk(src_folder):
        filings = [name for name in names if name.lower().endswith(".htm")]

        for name in filings:
            src_path = os.path.join(current_dir, name)

            # Same relative location, but rooted at dst_folder
            mirrored = os.path.join(
                dst_folder, os.path.relpath(current_dir, src_folder)
            )
            os.makedirs(mirrored, exist_ok=True)

            print(f"Processing: {src_path}")
            process_file(src_path, mirrored)


# ============================================================
#                RUN PIPELINE WITH YOUR PATHS
# ============================================================

if __name__ == "__main__":
    # Machine-specific absolute paths: the raw filings are read from
    # src_folder and the cleaned output is written under dst_folder.
    src_folder = "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original"
    dst_folder = "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1"

    # Walks src_folder recursively and processes every .htm file found.
    process_all_files(src_folder, dst_folder)


Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2024-Q1-10Q.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2013-Q1-10Q.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2020-Q3-10K.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2017-Q3-10K.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2010-Q4-10Q.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2006-Q3-10K.htm
Processing: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2011-Q2-10Q.htm


KeyboardInterrupt: 

In [28]:
# strips text, but still has table of contents

import os
import re
from bs4 import BeautifulSoup

# ================================================================
# 1. Clean HTML → text (keep narrative, drop HTML/XBRL/tables)
# ================================================================
def clean_html_text(html):
    """
    Turn raw filing HTML into one line of whitespace-normalised text.

    The elements named in ``removal_groups`` are deleted together with
    everything inside them; the remaining text nodes are joined with
    single spaces and all whitespace runs are collapsed.
    """
    soup = BeautifulSoup(html, "lxml")

    removal_groups = (
        "table",                                       # 1. all tables
        ['xbrl', 'ix:nonfraction', 'ix:nonNumeric'],   # 2. XBRL tags
        ["script", "style"],                           # 3. scripts/styles
    )
    # NOTE(review): decompose() also discards the text inside a tag, so
    # any value wrapped in the XBRL tags above disappears from the output.
    # The mixed-case name 'ix:nonNumeric' may never match if the parser
    # lower-cases tag names - TODO confirm against a real filing.
    for names in removal_groups:
        for node in soup.find_all(names):
            node.decompose()

    # 4./5. Extract text, then squeeze every whitespace run to one space
    flattened = soup.get_text(separator=" ")
    return re.sub(r'\s+', ' ', flattened).strip()

# ================================================================
# 2. SAFE removal of Item 1 – Financial Statements section only
#    → Works for 10-Q and 10-K
# ================================================================
def remove_financials(text):
    """
    Remove the Financial Statements section (Item 1) safely.

    Every span that starts at an "Item 1 ... Financial Statements /
    Financial Information" heading and ends just before the next
    "Item 2 ... Management / MD&A" heading is cut out and replaced by a
    blank line. Handling every such pair (not just the first start and the
    first end marker in the document) means that

      * a table-of-contents entry no longer "uses up" the match while the
        real financial statements stay in the text, and
      * an "Item 2" mention that precedes the first "Item 1" heading no
        longer disables the removal altogether.

    A start heading with no following end heading is left untouched, and
    the text is returned unchanged when no complete pair exists.
    """

    # Patterns for start and end boundaries
    start_re = re.compile(
        r"ITEM\s*1[^A-Za-z0-9]{0,6}(FINANCIAL STATEMENTS|FINANCIAL INFORMATION)",
        flags=re.IGNORECASE,
    )
    end_re = re.compile(
        r"ITEM\s*2[^A-Za-z0-9]{0,6}(MANAGEMENT|MD&A)",
        flags=re.IGNORECASE,
    )

    kept = []
    cursor = 0
    while True:
        start = start_re.search(text, cursor)
        if start is None:
            break
        # The end marker must come after this start marker.
        end = end_re.search(text, start.end())
        if end is None:
            break
        kept.append(text[cursor:start.start()])
        cursor = end.start()  # keep the Item 2 heading itself

    if not kept:
        return text

    kept.append(text[cursor:])
    return "\n\n".join(kept)

# ================================================================
# 3. Postprocessing cleanup (footers, exhibit lists, certification signatures)
# ================================================================
def postprocess_text(text):
    """
    Strip page artefacts, trailing boilerplate sections and ALL-CAPS noise
    from already-flattened filing text, then normalise whitespace.

    Two defects of the earlier ordering/flags are corrected here:

      * The tail-section patterns (SIGNATURES, EXHIBIT n, INDEX TO
        CONSOLIDATED) now run BEFORE the ALL-CAPS scrub. Previously the
        scrub deleted words such as "SIGNATURES" and "CONSOLIDATED"
        first, so the upper-case headings could never match.
      * Those patterns delete everything up to the end of the text, so
        they are now case-sensitive. With IGNORECASE an ordinary mention
        such as "electronic signatures" or "see Exhibit 10" truncated the
        remainder of the document.
    """

    # Remove page/line artifacts (common in EDGAR dumps)
    text = re.sub(r"Page \d+ of \d+", " ", text)

    # Tail sections: cut from an upper-case heading to the end of the text
    tail_patterns = [
        r"SIGNATURES.*$",                      # Signatures section
        r"EXHIBIT\s+\d+.*$",                   # Exhibit list headings
        r"INDEX TO CONSOLIDATED.*$",           # Index pages
    ]
    for p in tail_patterns:
        text = re.sub(p, " ", text, flags=re.DOTALL)

    # Bounded boilerplate: safe to match in any letter case
    inline_patterns = [
        r"Cover Page.*?Form 10",               # Cover page junk
        r"See accompanying notes",             # Finance-only filler
    ]
    for p in inline_patterns:
        text = re.sub(p, " ", text, flags=re.IGNORECASE | re.DOTALL)

    # Remove repeated ticker headings or titles in ALL CAPS spam
    text = re.sub(r"\b[A-Z0-9&\.\-]{8,}\b", " ", text)

    # Remove consecutive duplicated words
    text = re.sub(r'\b(\w+)( \1\b)+', r'\1', text)

    # Normalize whitespace again
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# ================================================================
# 4. Main driver — read all HTM files recursively and clean them
# ================================================================
def process_all_files(src_root, dst_root):
    """
    Recursively read all .htm files in src_root.
    Clean and save them to dst_root with identical folder structure.

    The extension is matched case-insensitively. Each output file is named
    ``<stem>_cleaned.txt``, where ``<stem>`` is the input name without its
    extension. The stem is taken with ``os.path.splitext`` because a plain
    ``replace(".htm", ...)`` leaves ``.HTM`` names unchanged (so the text
    was written under the original HTML file name) and also rewrites
    ".htm" occurring elsewhere in the name.
    """

    for root, _dirs, files in os.walk(src_root):

        for file in files:
            if not file.lower().endswith(".htm"):
                continue

            src_path = os.path.join(root, file)
            print(f"Processing {src_path}")

            with open(src_path, "r", encoding="utf-8", errors="ignore") as f:
                html = f.read()

            # Run through cleaning pipeline
            text = clean_html_text(html)
            text = remove_financials(text)     # <-- SAFE removal
            text = postprocess_text(text)

            # Build destination path (mirrors the source sub-folder)
            rel_path = os.path.relpath(root, src_root)
            out_dir = os.path.join(dst_root, rel_path)
            os.makedirs(out_dir, exist_ok=True)

            stem = os.path.splitext(file)[0]
            out_file = os.path.join(out_dir, f"{stem}_cleaned.txt")

            with open(out_file, "w", encoding="utf-8") as f:
                f.write(text)

            print(f"Saved → {out_file}")


# ================================================================
# 5. Run it
# ================================================================

# Machine-specific absolute paths: raw filings in, cleaned text out.
src_folder = "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original"
dst_folder = "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1"

# Runs unconditionally when the cell executes (no __main__ guard).
process_all_files(src_folder, dst_folder)


Processing /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2024-Q1-10Q.htm
Saved → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2024-Q1-10Q_cleaned.txt
Processing /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2013-Q1-10Q.htm
Saved → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2013-Q1-10Q_cleaned.txt
Processing /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2020-Q3-10K.htm
Saved → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2020-Q3-10K_cleaned.txt
Processing /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original/CTAS/CTAS-2017-Q3-10K.htm
Saved → /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2017-Q3-10K_cleaned.txt
Processing /Users/clairetseng/Desktop/Research/xiu/llm-bias/

KeyboardInterrupt: 

In [13]:
#semiworks

import re
from pathlib import Path
from bs4 import BeautifulSoup

###############################################################################
# 1. STRIP HTML, TABLES, XBRL
###############################################################################

XBRL_PREFIXES = {
    "ix", "ixn", "xbrli", "xlink", "link", "dei",
    "us-gaap", "srt", "xbrldi"
}

def strip_html_and_xbrl(raw_html: str) -> str:
    """
    Reduce raw filing HTML to plain text, one text node per line.

    Non-content elements (scripts, styles, head metadata) and
    tables/media are deleted with their contents. Tags whose namespace
    prefix is listed in XBRL_PREFIXES are unwrapped so their text stays.
    Leftover markup-looking fragments, carriage returns, long blank-line
    runs and repeated spaces/tabs are then normalised.
    """
    soup = BeautifulSoup(raw_html, "lxml")

    discard_passes = (
        # script, style, metadata
        ["script", "style", "head", "noscript", "meta", "link"],
        # tables, images, figures, SVG
        ["table", "img", "figure", "svg", "iframe", "object"],
    )
    for names in discard_passes:
        for node in soup.find_all(names):
            node.decompose()

    # Inline XBRL (Workiva): drop the wrapper, keep the wrapped text.
    # NOTE(review): relies on bs4 populating `.prefix`; with an HTML parser
    # this may be None for "ix:..." names - TODO confirm the branch fires.
    for node in list(soup.find_all()):
        prefix = node.prefix
        if prefix and prefix.lower() in XBRL_PREFIXES:
            node.unwrap()

    text = soup.get_text("\n")

    # Clean up, in this exact order
    cleanup_steps = (
        (r"<[^>]+>", " "),
        (r"\r", "\n"),
        (r"\n{3,}", "\n\n"),
        (r"[ \t]+", " "),
    )
    for pattern, replacement in cleanup_steps:
        text = re.sub(pattern, replacement, text)

    return text.strip()


###############################################################################
# 2. DETECT FORM TYPE
###############################################################################

def detect_form_type(text: str) -> str:
    """
    Classify a filing as "10-K" or "10-Q" from its first 6000 characters.

    The marker ("FORM 10-K" / "FORM 10-Q", case-insensitive) that appears
    FIRST decides the type. Testing for "FORM 10-K" anywhere in the window
    before looking for "FORM 10-Q" misclassified quarterly reports that
    cite the company's "Annual Report on Form 10-K" after their own
    "FORM 10-Q" title.

    Returns "10-Q" when neither marker is present.
    """
    upper = text[:6000].upper()
    pos_k = upper.find("FORM 10-K")
    pos_q = upper.find("FORM 10-Q")

    if pos_k != -1 and (pos_q == -1 or pos_k < pos_q):
        return "10-K"
    return "10-Q"   # 10-Q marker came first, or default


###############################################################################
# 3. IMPROVED ITEM HEADER DETECTION (no more misalignment!)
###############################################################################

# Heading at the start of a line: "Item <1-2 digits><optional A/B/C>".
# The look-ahead stops a heading from matching as a prefix of a longer
# label: with the previous `[A]?`, "Item 1B" / "Item 9B" / "Item 1C" were
# reported as "1" / "9" / "1".
ITEM_PATTERN = re.compile(
    r"^\s*ITEM\s+([0-9]{1,2}[A-C]?)(?![0-9A-Za-z])\s*[\.:–—-]?",
    re.IGNORECASE | re.MULTILINE
)

def find_all_items(text: str):
    """
    Returns list of (item_number, index) sorted by index.
    Only triggers on NEW LINES to avoid catching "itemized", etc.

    ``item_number`` is upper-cased (e.g. "1A"). ``index`` is the offset at
    which the match starts; because the pattern allows leading whitespace
    this can be the start of a blank line preceding the heading.
    """
    # finditer already yields matches in text order
    return [(m.group(1).upper(), m.start()) for m in ITEM_PATTERN.finditer(text)]


###############################################################################
# 4. MAP SEC ITEMS TO HUMAN-LABELED QUALITATIVE SECTIONS
###############################################################################

SEC_ITEMS_ORDER = [
    "1", "1A", "1B",
    "2", "3", "4", "5", "6",
    "7", "7A",
    "8", "9", "9A", "9B"
]

SECTION_LABELS_10K = {
    "1": "Item 1 - Business",
    "1A": "Item 1A - Risk Factors",
    "3": "Item 3 - Legal Proceedings",
    "7": "Item 7 - MD&A",
    "7A": "Item 7A - Market Risk",
    "9A": "Item 9A - Controls & Procedures"
}

SECTION_LABELS_10Q = {
    "1A": "Item 1A - Risk Factors",
    "2": "Item 2 - MD&A",
    "3": "Item 3 - Market Risk",
    "4": "Item 4 - Controls & Procedures"
}

def extract_items(text: str, form_type: str):
    """
    Slice ``text`` into labelled sections for the given form type.

    Each heading reported by ``find_all_items`` starts a block that runs
    to the next heading (or the end of the text). Only item numbers listed
    in the label table for ``form_type`` ("10-K" or "10-Q") are returned,
    keyed by their human-readable label. Any other ``form_type`` yields an
    empty dict.

    When the same item number occurs more than once, the LONGEST block is
    kept instead of the last one. A 10-Q numbers its items separately in
    Part I and Part II, so "last wins" replaced Part I Item 2 (MD&A) with
    the usually much shorter Part II Item 2.
    """
    if form_type == "10-K":
        labels = SECTION_LABELS_10K
    elif form_type == "10-Q":
        labels = SECTION_LABELS_10Q
    else:
        return {}

    item_positions = find_all_items(text)
    extracted = {}

    for i, (item_id, start_pos) in enumerate(item_positions):
        if item_id not in labels:
            continue

        # Determine end of section
        end_pos = item_positions[i+1][1] if i+1 < len(item_positions) else len(text)
        block = text[start_pos:end_pos].strip()

        name = labels[item_id]
        if name not in extracted or len(block) > len(extracted[name]):
            extracted[name] = block

    return extracted


###############################################################################
# 5. FILTER OUT ACCOUNTING FOOTNOTES / BOILERPLATE
###############################################################################

ACCOUNTING_BLACKLIST = [
    r"see accompanying notes",
    r"consolidated balance sheets",
    r"consolidated statements",
    r"fair value",
    r"goodwill",
    r"impairment",
    r"deferred tax",
    # Accounting-standard acronyms must match as whole words: the combined
    # regex is case-insensitive, so a bare "ASU" / "ASC" also hit ordinary
    # words such as "measure", "treasury" or "ascertain".
    r"\b(?:FASB|ASC|ASU)\b|\bFIN\s*\d+",
    r"note \d+",
    r"pcaob",
    r"audited",
    r"unaudited",
    r"in thousands",
]

ACCOUNTING_RE = re.compile("|".join(ACCOUNTING_BLACKLIST), re.IGNORECASE)

QUAL_KEYWORDS = [
    "expect", "anticipate", "believe", "forecast", "outlook",
    "demand", "pricing", "margin", "trend", "guidance",
    "customers", "volume", "expenses", "growth", "risk", "uncertainty"
]

def keep_qualitative_paragraphs(text: str):
    """
    Keep only the lines of ``text`` that read as qualitative narrative.

    A line (non-empty after stripping) is kept when it
      * does NOT match any ACCOUNTING_BLACKLIST pattern, and
      * contains at least one QUAL_KEYWORDS entry as a case-insensitive
        substring.

    Returns the kept lines joined by newlines ("" when nothing survives).
    """
    cleaned_paragraphs = []
    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]

    for p in paragraphs:

        # Drop accounting boilerplate
        if ACCOUNTING_RE.search(p):
            continue

        # Must contain forward-looking or qualitative language
        lowered = p.lower()
        if not any(k in lowered for k in QUAL_KEYWORDS):
            continue

        cleaned_paragraphs.append(p)

    return "\n".join(cleaned_paragraphs)


###############################################################################
# 6. FULL FILING PROCESSING
###############################################################################

def process_filing(path: Path) -> str:
    """
    Run the whole pipeline on one filing and return the report text.

    The file is read as UTF-8 (undecodable bytes ignored), stripped to
    plain text, split into the sections for its detected form type, and
    each section is reduced to its qualitative paragraphs. Sections that
    end up empty are omitted; the rest are emitted as
    ``===== <section name> =====`` blocks separated by blank lines.
    """
    plain = strip_html_and_xbrl(path.read_text(encoding="utf-8", errors="ignore"))
    sections = extract_items(plain, detect_form_type(plain))

    filtered = (
        (section_name, keep_qualitative_paragraphs(content))
        for section_name, content in sections.items()
    )
    return "\n\n".join(
        f"===== {section_name} =====\n{kept}"
        for section_name, kept in filtered
        if kept
    )


###############################################################################
# 7. PROCESS COMPANY FOLDER
###############################################################################

def process_company_folder(folder: Path, out_root: Path):
    """
    Process every ``*.htm`` filing directly inside ``folder``.

    Results go to ``<out_root>/<folder name>/<stem>_qualitative.txt``
    (the directory is created if needed). A filing whose output file
    already exists and is non-empty is skipped, so an interrupted run can
    be resumed.
    """
    out_dir = out_root / folder.name
    out_dir.mkdir(parents=True, exist_ok=True)

    for html_file in folder.glob("*.htm"):
        out_path = out_dir / f"{html_file.stem}_qualitative.txt"

        already_done = out_path.exists() and out_path.stat().st_size > 0
        if already_done:
            print(f"SKIP: {out_path.name}")
        else:
            print(f"Processing {html_file.name} ...")
            out_path.write_text(process_filing(html_file), encoding="utf-8")
            print(f"   ✔ wrote {out_path}")


###############################################################################
# 8. MAIN LOOP
###############################################################################

if __name__ == "__main__":
    # Machine-specific absolute paths: raw filings in, filtered text out.
    root_input = Path("/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original")
    root_output = Path("/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1")

    # Each immediate sub-directory of root_input is processed as one
    # company folder; plain files at the top level are ignored.
    for folder in root_input.iterdir():
        if folder.is_dir():
            print(f"\n===== Processing {folder.name} =====")
            process_company_folder(folder, root_output)



===== Processing CTAS =====
Processing CTAS-2024-Q1-10Q.htm ...
   ✔ wrote /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2024-Q1-10Q_qualitative.txt
Processing CTAS-2013-Q1-10Q.htm ...
   ✔ wrote /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2013-Q1-10Q_qualitative.txt
Processing CTAS-2020-Q3-10K.htm ...
   ✔ wrote /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2020-Q3-10K_qualitative.txt
Processing CTAS-2017-Q3-10K.htm ...
   ✔ wrote /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2017-Q3-10K_qualitative.txt
Processing CTAS-2010-Q4-10Q.htm ...
   ✔ wrote /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2010-Q4-10Q_qualitative.txt
Processing CTAS-2006-Q3-10K.htm ...
   ✔ wrote /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2006-Q3-10K_qualitati

KeyboardInterrupt: 

In [12]:
# og version
import re
from pathlib import Path
from bs4 import BeautifulSoup

###############################################################################
# STRIP HTML, TABLES, FIGURES, AND ALL INLINE XBRL
###############################################################################

XBRL_TAGS = [
    "ix:header", "ix:nonNumeric", "ix:nonFraction", "ix:footnote",
    "ix:hidden", "ix:references", "ix:resources"
]

def strip_html_and_xbrl(raw_html: str) -> str:
    """
    Convert raw filing HTML to plain text with one text node per line.

    Deleted with their contents: the tags named in XBRL_TAGS,
    script/style/head metadata, and tables/images/figures/SVG. Remaining
    tags carrying a known XBRL namespace prefix are unwrapped (text
    kept). The extracted text is then cleaned of markup-like fragments,
    carriage returns, long blank-line runs and repeated spaces/tabs.
    """
    soup = BeautifulSoup(raw_html, "lxml")

    # NOTE(review): the mixed-case names in XBRL_TAGS ("ix:nonNumeric",
    # "ix:nonFraction") may never match if the parser lower-cases tag
    # names - TODO confirm; if they did match, the wrapped text would be
    # deleted here.
    for node in soup.find_all(XBRL_TAGS):
        node.decompose()

    # Scripts, styles, metadata
    for node in soup.find_all(["script", "style", "head", "noscript", "meta", "link"]):
        node.decompose()

    # Tables, images, svgs, figures
    for node in soup.find_all(["table", "img", "figure", "svg"]):
        node.decompose()

    # Unwrap any remaining XML/XBRL tags to keep text
    unwrap_prefixes = (
        "ix", "ixn", "xbrli", "xlink", "link", "us-gaap", "dei", "srt", "xbrldi"
    )
    for node in list(soup.find_all()):
        if not node.prefix:
            continue
        if node.prefix.lower() in unwrap_prefixes:
            node.unwrap()

    # Extract text, then clean artifacts step by step
    text = soup.get_text("\n")
    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"\r", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return re.sub(r"[ \t]+", " ", text).strip()


###############################################################################
# FORM TYPE DETECTION
###############################################################################

def detect_form_type(text: str) -> str:
    """
    Return "10-K" or "10-Q" based on the first 8000 characters of ``text``.

    Whichever of "FORM 10-K" / "FORM 10-Q" (case-insensitive) occurs
    earliest wins. Checking for "FORM 10-K" first, regardless of position,
    labelled a quarterly report as 10-K whenever it referred to the
    company's Annual Report on Form 10-K inside the scanned window.

    "10-Q" is assumed when neither marker is found.
    """
    upper = text[:8000].upper()

    offsets = {
        form: upper.find(marker)
        for form, marker in (("10-K", "FORM 10-K"), ("10-Q", "FORM 10-Q"))
    }
    present = {form: pos for form, pos in offsets.items() if pos != -1}

    if not present:
        # Assume 10-Q when ambiguous
        return "10-Q"
    return min(present, key=present.get)


###############################################################################
# ROBUST ITEM DETECTION
###############################################################################

# "Item <digit 1-9><optional A/B/C>", not followed by another letter or
# digit. The previous alternation listed "7" before "7A" and "9" before
# "9A"; regex alternation takes the first alternative that succeeds, so
# "Item 7A" and "Item 9A" were labelled "7" and "9" and could never be
# looked up under their own label. Two-digit items (10+) are still not
# matched, as before.
ITEM_HEADER = re.compile(
    r"\bITEM\s*[\.\s]*"
    r"([1-9][A-C]?)"
    r"(?![0-9A-Za-z])"
    , flags=re.IGNORECASE
)

def find_all_items(text: str):
    """Return sorted list of (item_label, index).

    ``item_label`` is upper-cased ("7A"); ``index`` is the offset of the
    word "Item". Matches are not anchored to line starts, so in-sentence
    references such as "see Item 7" are reported as well.
    """
    # finditer yields matches in text order, so no extra sort is needed
    return [(m.group(1).upper(), m.start()) for m in ITEM_HEADER.finditer(text)]


def extract_item_block(text: str, target_label: str):
    """
    Return the text of the first item whose label equals ``target_label``
    (compared upper-cased), from its heading up to the next detected item
    heading or the end of ``text``. Returns "" when the label is absent.
    """
    headings = find_all_items(text)
    wanted = target_label.upper()

    for idx, (label, start) in enumerate(headings):
        if label != wanted:
            continue
        following = headings[idx + 1: idx + 2]
        stop = following[0][1] if following else len(text)
        return text[start:stop].strip()

    return ""


###############################################################################
# SECTION EXTRACTION (QUALITATIVE SECTIONS ONLY)
###############################################################################

def extract_qualitative_sections(text: str):
    """
    Collect the narrative items for the detected form type.

    10-K: Items 1, 1A, 3, 7 and 7A; otherwise (10-Q): Items 1A, 2, 3 and
    4. Returns ``{section title: block text}`` in that order, leaving out
    any item whose block is empty or whitespace-only.
    """
    if detect_form_type(text) == "10-K":
        wanted = (
            ("Item 1 - Business", "1"),
            ("Item 1A - Risk Factors", "1A"),
            ("Item 3 - Legal Proceedings", "3"),
            ("Item 7 - MD&A", "7"),
            ("Item 7A - Market Risk", "7A"),
        )
    else:  # 10-Q
        wanted = (
            ("Item 1A - Risk Factors", "1A"),
            ("Item 2 - MD&A", "2"),
            ("Item 3 - Market Risk", "3"),
            ("Item 4 - Controls & Procedures", "4"),
        )

    sections = {}
    for title, label in wanted:
        block = extract_item_block(text, label)
        if block.strip():
            sections[title] = block
    return sections


###############################################################################
# QUALITATIVE FILTER — REMOVE TABLE REMNANTS AND PURELY NUMERIC LINES
###############################################################################

QUAL_KEYWORDS = [
    "demand","volume","pricing","margin","cost","expenses","growth",
    "decline","headwind","tailwind","outlook","guidance","expect",
    "anticipate","forecast","trend","liquidity","cash flow","capital",
    "backlog","pipeline","customers","inflation","foreign exchange",
    "fx","currency"
]

def is_qualitative_line(line: str):
    """
    Decide whether a single line looks like narrative prose.

    Lines shorter than 40 characters (after stripping) are rejected.
    A line containing any QUAL_KEYWORDS entry (case-insensitive substring)
    is accepted. Without a keyword, the line must contain a period and at
    least 40% of its characters must be letters.
    """
    s = line.strip()
    if len(s) < 40:
        return False

    lowered = s.lower()
    if any(keyword in lowered for keyword in QUAL_KEYWORDS):
        return True

    # No keyword: the "numeric heavy" test (digits > 30% AND letters < 40%)
    # is implied by the letter-share test below, so one check covers both.
    alpha_share = sum(map(str.isalpha, s)) / max(len(s), 1)
    return "." in s and not alpha_share < 0.40


def keep_qualitative_narrative(text: str) -> str:
    """Return the stripped lines of ``text`` accepted by
    ``is_qualitative_line``, joined by newlines."""
    kept = []
    for raw_line in text.split("\n"):
        if is_qualitative_line(raw_line):
            kept.append(raw_line.strip())
    return "\n".join(kept).strip()


###############################################################################
# PROCESS ONE FILING
###############################################################################

def process_filing(html_path: Path) -> str:
    """
    Produce the qualitative-only text for one filing.

    Reads the file as UTF-8 (ignoring undecodable bytes), strips it to
    plain text, extracts the narrative sections and filters each to its
    qualitative lines. Non-empty results are emitted as
    ``===== <section> =====`` blocks separated by blank lines.
    """
    plain = strip_html_and_xbrl(
        html_path.read_text(encoding="utf-8", errors="ignore")
    )

    output_blocks = []
    for sec_name, sec_text in extract_qualitative_sections(plain).items():
        narrative = keep_qualitative_narrative(sec_text)
        if not narrative:
            continue
        output_blocks.append(f"===== {sec_name} =====\n{narrative}")

    return "\n\n".join(output_blocks)


###############################################################################
# PROCESS A COMPANY FOLDER (SKIPS ALREADY CLEANED FILES)
###############################################################################

def process_company_folder(folder: Path, out_root: Path):
    """
    Clean every ``*.htm`` filing found directly in ``folder``.

    Output is written to ``<out_root>/<folder name>/<stem>_qualitative.txt``;
    the directory is created when missing. Filings that already have a
    non-empty output file are skipped.
    """
    out_dir = out_root / folder.name
    out_dir.mkdir(parents=True, exist_ok=True)

    for html_file in folder.glob("*.htm"):
        out_path = out_dir / f"{html_file.stem}_qualitative.txt"

        # Resume support: anything non-empty on disk counts as done
        needs_processing = not (out_path.exists() and out_path.stat().st_size > 0)
        if not needs_processing:
            print(f"SKIP (exists): {out_path.name}")
            continue

        print(f"Processing {html_file.name} ...")
        out_path.write_text(process_filing(html_file), encoding="utf-8")
        print(f"  ✔ wrote {out_path}")


###############################################################################
# MAIN EXECUTION
###############################################################################

if __name__ == "__main__":
    # Machine-specific absolute paths: raw filings in, filtered text out.
    root_input = Path("/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original")
    root_output = Path("/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1")

    # Every immediate sub-directory of root_input is handled as one
    # company folder; its name is reused for the output directory.
    for folder in root_input.iterdir():
        if folder.is_dir():
            print(f"\n===== Processing {folder.name} =====")
            process_company_folder(folder, root_output)



===== Processing CTAS =====
Processing CTAS-2024-Q1-10Q.htm ...
  ✔ wrote /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2024-Q1-10Q_qualitative.txt
Processing CTAS-2013-Q1-10Q.htm ...
  ✔ wrote /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2013-Q1-10Q_qualitative.txt
Processing CTAS-2020-Q3-10K.htm ...
  ✔ wrote /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2020-Q3-10K_qualitative.txt
Processing CTAS-2017-Q3-10K.htm ...
  ✔ wrote /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2017-Q3-10K_qualitative.txt
Processing CTAS-2010-Q4-10Q.htm ...
  ✔ wrote /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2010-Q4-10Q_qualitative.txt
Processing CTAS-2006-Q3-10K.htm ...
  ✔ wrote /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/CTAS/CTAS-2006-Q3-10K_qualitative.txt

FileNotFoundError: [Errno 2] No such file or directory: '/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/RCL/RCL-2016-Q2-10Q_qualitative.txt'

In [9]:
import re
from pathlib import Path
from bs4 import BeautifulSoup

###############################################################################
# SAFE HTML + XBRL STRIPPER
###############################################################################

XBRL_PREFIXES = {
    "ix", "ixn", "link", "xlink",
    "xbrli", "xbrldi", "us-gaap", "dei", "srt"
}

def strip_html_and_xbrl(raw_html: str) -> str:
    """
    Strip a filing's HTML down to plain text (one text node per line).

    Script/style/head metadata and all tables are deleted with their
    contents. Tags whose namespace prefix is in XBRL_PREFIXES are
    unwrapped so their text is kept. The text is then cleaned of
    markup-like fragments, carriage returns, runs of three or more
    newlines, and repeated spaces/tabs.
    """
    soup = BeautifulSoup(raw_html, "lxml")

    def _drop_all(names):
        # Delete every matching element together with its subtree
        for element in soup.find_all(names):
            element.decompose()

    _drop_all(["script", "style", "head", "noscript", "meta", "link"])
    _drop_all("table")  # numeric sections are not wanted

    # Keep the text of inline XBRL wrappers.
    # NOTE(review): depends on bs4 setting `.prefix` for "ix:..." style
    # names under the lxml HTML parser - TODO confirm this branch runs.
    for element in list(soup.find_all()):
        if element.prefix and element.prefix.lower() in XBRL_PREFIXES:
            element.unwrap()

    plain = soup.get_text("\n")

    # Remaining HTML artifacts and whitespace, in the original order
    plain = re.sub(r"<[^>]+>", " ", plain)
    plain = re.sub(r"\r", "\n", plain)
    plain = re.sub(r"\n{3,}", "\n\n", plain)
    plain = re.sub(r"[ \t]+", " ", plain)

    return plain.strip()


###############################################################################
# FORM TYPE DETECTION (10-K vs 10-Q)
###############################################################################

def detect_form_type(text: str) -> str:
    """
    Classify a filing as "10-K" or "10-Q" from its first 8000 characters.

    Markers (case-insensitive):
      * 10-K: "FORM 10-K", "ANNUAL REPORT"
      * 10-Q: "FORM 10-Q", "QUARTERLY REPORT"

    The marker that occurs EARLIEST decides the type. Testing the 10-K
    markers first, wherever they appear, classified a quarterly report as
    10-K as soon as it mentioned the company's annual report after its own
    "QUARTERLY REPORT" / "FORM 10-Q" title.

    Falls back to "10-Q" when no marker is present.
    """
    upper = text[:8000].upper()

    markers = {
        "10-K": ("FORM 10-K", "ANNUAL REPORT"),
        "10-Q": ("FORM 10-Q", "QUARTERLY REPORT"),
    }

    best_type, best_pos = "10-Q", None
    for form_type, needles in markers.items():
        for needle in needles:
            pos = upper.find(needle)
            if pos != -1 and (best_pos is None or pos < best_pos):
                best_type, best_pos = form_type, pos

    return best_type


###############################################################################
# SECTION EXTRACTION (ITEM 1, 1A, 2, 7, etc.)
###############################################################################
###############################################################################
# BETTER ITEM DETECTION (handles line breaks + avoids false matches)
###############################################################################

# Line-anchored, case-insensitive "Item <label>" heading. Lettered labels
# come first in the alternation so "7A" is not cut short to "7".
ITEM_HEADER = re.compile(
    r"(?mi)^[ \t]*ITEM\s+"
    r"(1A|7A|9A|1|2|3|4|5|6|7|8|9)"
    r"(?!\d)\s*[\.\:\-–—]?",
)

def find_all_items(text: str):
    """
    Locate item headings that start a line (leading spaces/tabs allowed).

    Returns ``[(label, start_index), ...]`` ordered by position, with the
    label upper-cased and ``start_index`` pointing at the beginning of the
    matched line.
    """
    hits = [
        (match.group(1).upper(), match.start())
        for match in ITEM_HEADER.finditer(text)
    ]
    hits.sort(key=lambda pair: pair[1])
    return hits


def extract_item_block_fixed(text: str, target_label: str):
    """
    Return the text belonging to ITEM <target_label>: from the first
    heading with that label (case-insensitive) up to the next detected
    item heading, or to the end of ``text`` if it is the last one.
    Returns "" when no such heading exists.
    """
    headings = find_all_items(text)
    wanted = target_label.upper()

    first_hit = next(
        (pos for pos, (label, _) in enumerate(headings) if label == wanted),
        None,
    )
    if first_hit is None:
        return ""  # Not found

    start_idx = headings[first_hit][1]
    if first_hit + 1 < len(headings):
        end_idx = headings[first_hit + 1][1]
    else:
        end_idx = len(text)
    return text[start_idx:end_idx].strip()


def extract_qualitative_sections(text: str):
    """
    Pull the narrative items appropriate to the detected form type.

    For a 10-K: Items 1, 1A, 3, 7, 7A. For anything else (treated as
    10-Q): Items 1A, 2, 3, 4. Returns a dict of section title -> text in
    that order; items with empty/whitespace-only text are dropped.
    """
    plan_by_form = {
        "10-K": [
            ("Item 1 - Business", "1"),
            ("Item 1A - Risk Factors", "1A"),
            ("Item 3 - Legal Proceedings", "3"),
            ("Item 7 - MD&A", "7"),
            ("Item 7A - Market Risk", "7A"),
        ],
        "10-Q": [
            ("Item 1A - Risk Factors", "1A"),
            ("Item 2 - MD&A", "2"),
            ("Item 3 - Market Risk", "3"),
            ("Item 4 - Controls and Procedures", "4"),
        ],
    }
    # Anything that is not a 10-K follows the 10-Q plan
    plan = plan_by_form["10-K" if detect_form_type(text) == "10-K" else "10-Q"]

    extracted = [(title, extract_item_block_fixed(text, label)) for title, label in plan]
    return {title: block for title, block in extracted if block.strip()}



###############################################################################
# QUALITATIVE FILTER — REMOVE NUMERIC / TABLE REMNANTS
###############################################################################

QUAL_KEYWORDS = [
    "demand","volume","pricing","margin","cost","expenses","growth",
    "decline","headwind","tailwind","outlook","guidance","expect",
    "anticipate","forecast","trend","liquidity","cash flow","capital",
    "backlog","pipeline","customers","inflation","foreign exchange",
    "fx","currency"
]

def is_qualitative_line(line: str):
    """
    Return True when ``line`` looks like narrative prose.

    Rejected outright: lines under 40 characters (after stripping) and
    upper-case lines that contain no space. Otherwise a line is accepted
    if it contains a QUAL_KEYWORDS entry (case-insensitive substring), or
    if it has a period and letters make up at least 40% of its characters.
    """
    s = line.strip()
    if len(s) < 40 or (s.isupper() and " " not in s):
        return False

    lowered = s.lower()
    has_keyword = any(k in lowered for k in QUAL_KEYWORDS)
    if has_keyword:
        return True

    # Keyword-free lines: the digit-heavy rule (digits > 30% with letters
    # < 40%) is a special case of the letter-share rule, so only the
    # period and letter-share conditions need checking.
    letters = sum(1 for c in s if c.isalpha())
    alpha_ratio = letters / max(len(s), 1)
    return "." in s and not alpha_ratio < 0.40


def keep_qualitative_narrative(text: str) -> str:
    """Filter ``text`` line by line through ``is_qualitative_line`` and
    return the surviving (stripped) lines joined by newlines."""
    survivors = (
        candidate.strip()
        for candidate in text.split("\n")
        if is_qualitative_line(candidate)
    )
    joined = "\n".join(survivors)
    # Collapse any run of three or more newlines to a single blank line
    return re.sub(r"\n{3,}", "\n\n", joined).strip()


###############################################################################
# PROCESS A SINGLE FILING
###############################################################################

def process_filing(html_path: Path) -> str:
    """
    Build the qualitative report for a single filing.

    The HTML is read as UTF-8 (undecodable bytes ignored), reduced to
    plain text and split into narrative sections; every section is then
    filtered to its qualitative lines. Sections with nothing left are
    skipped. The result is a series of ``===== <name> =====`` blocks
    separated by blank lines, stripped of surrounding whitespace.
    """
    plain = strip_html_and_xbrl(
        html_path.read_text(encoding="utf-8", errors="ignore")
    )
    sections = extract_qualitative_sections(plain)

    narratives = {name: keep_qualitative_narrative(sec) for name, sec in sections.items()}
    blocks = [
        f"===== {name} =====\n{narrative}"
        for name, narrative in narratives.items()
        if narrative
    ]
    return "\n\n".join(blocks).strip()


###############################################################################
# PROCESS ENTIRE DIRECTORY — YOUR REQUEST
###############################################################################

def process_company_folder(company_folder: str, output_root=None):
    """Clean every HTML filing in one company folder and write the results.

    Each ``*.htm`` / ``*.html`` file directly inside *company_folder* is run
    through ``process_filing`` and the returned text is written (UTF-8) to
    ``<output_root>/<folder name>/<file stem>_qualitative.txt``. Progress is
    reported with ``print``.

    Args:
        company_folder: Folder holding one company's filings. Its final path
            component is reused as the name of the output sub-folder.
        output_root: Directory under which the per-company output folder is
            created (``str`` or ``Path``). ``None`` (the default) keeps the
            original hard-coded location, so one-argument callers behave
            exactly as before.

    Returns:
        None. The output folder is created even when no filings are found.
    """
    input_dir = Path(company_folder)

    if output_root is None:
        # Historical default destination, preserved for backward compatibility.
        output_root = "/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1"
    output_dir = Path(output_root) / input_dir.name

    # Create output directory and parents if needed
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Output directory: {output_dir}\n")

    # Sorted so filings are processed (and logged) in a reproducible order
    # instead of whatever order the filesystem happens to return.
    html_files = sorted([*input_dir.glob("*.htm"), *input_dir.glob("*.html")])
    if not html_files:
        print("No HTML filings found.")
        return

    for html_file in html_files:
        print(f"Processing {html_file.name} ...")
        cleaned = process_filing(html_file)

        out_path = output_dir / (html_file.stem + "_qualitative.txt")
        out_path.write_text(cleaned, encoding="utf-8")
        print(f"  -> wrote {out_path}")

    print("\nDone!")


###############################################################################
# MAIN EXECUTION — LOOP THROUGH ALL TICKERS
###############################################################################

if __name__ == "__main__":

    # Tickers to process in this run. Only PAYC is active; the remaining
    # universe is kept commented out below so it can be re-enabled later.
    tickers = [
        'PAYC',
        # 'PAYC','KEYS','HWM','AAPL','AAP','ABBV','ABMD','ABT','ADSK','TAP','ACN',
        # 'ADI','ADM','AEP','AES','AFL','SWKS','HES','AIG','AIZ','AKAM','ALB','HON',
        # 'ALLE','ALGN','ALK','ALL','PLD','AMGN','AAL','AMT','ANSS','APH','ARE','A',
        # 'ANTM','ADP','AVB','ATVI','AVY','AWK','AXP','AZO','PNW','BA','BAX','BR',
        # 'BBY','RHI','BDX','BF.B','BKR','BIO','WRB','BLL','BWA','CARR','CAT','CBRE',
        # 'CBOE','C','CCL','COST','CDAY','STZ','CAH','CDW','CE','CERN','FIS','CF',
        # 'CFG','JPM','CHRW','CVX','CINF','CL','CLX','CMA','CMCSA','CMG','CMS','CNC',
        # 'COF','TPR','CPB','CPRT','CHD','CRL','DXC','CSCO','CSX','CTAS','CTLT',
        # 'CTSH','CTVA','CTXS','CMI','DAL','TGT','DIS','DISH','DLR','DLTR','DHR',
        # 'DPZ','DG','DOV','OMC','MU','DRE','DHI','DRI','DTE','DUK','DVN','MS','DXCM',
        # 'EBAY','CDNS','ECL','ED','EFX','ATO','PKI','EL','EMN','EMR','ENPH','EOG',
        # 'EQR','EA','ESS','ETN','ETSY','EW','EXPD','EXR','F','RF','FANG','FAST',
        # 'FBHS','FCX','FDX','FISV','FLT','FMC','USB','FRC','BEN','FTNT','FTV','IT',
        # 'GD','GILD','GIS','GLW','GM','GNRC','GPC','GPN','GPS','GRMN','GS','GWW',
        # 'HAL','HAS','HBAN','HBI','HCA','PEAK','HD','HII','HST','HOLX','CNP','HPE',
        # 'HRL','HSIC','HSY','IBM','ICE','IDXX','IEX','IFF','ILMN','INTC','INTU',
        # 'IP','IPG','IPGP','INCY','TT','ISRG','ITW','IVZ','JBHT','JKHY','JNPR','EVRG',
        # 'MDLZ','KHC','KIM','KMB','KMI','KMX','KR','LYB','LEG','LEN','LMT','LNC',
        # 'BBWI','L','LUV','LVS','LW','LYV','MAA','MA','MAS','AME','MKC','MCD',
        # 'MCHP','MCK','CVS','MET','MGM','SPGI','MAR','MKTX','MLM','MMC','MMM','MO',
        # 'MHK','MOS','MSI','MPC','MPWR','INFO','MRNA','MRO','MSFT','ETR','MTD',
        # 'MSCI','VTRS','BAC','NCLH','NFLX','NI','NKE','NOC','NOW','NRG','NSC','XEL',
        # 'NTAP','ES','NUE','NVDA','NWL','NWSA','NXPI','ODFL','FE','OGN','OKE','ORCL',
        # 'ORLY','OTIS','OXY','COP','PAYX','PBCT','BKNG','PXD','EXC','PENN','PEP',
        # 'PFE','PFG','PG','PHM','PKG','PTC','PNC','PNR','BRO','POOL','PPG','PPL',
        # 'PGR','PRU','RE','PSX','TROW','PVH','PWR','PYPL','QCOM','QRVO','RL','RCL',
        # 'REG','REGN','RMD','RJF','O','ROK','ROL','ROST','RSG','NVR','LDOS','SBUX',
        # 'EIX','SCHW','SRE','SEE','SHW','SIVB','SLB','AOS','TFC','SNPS','SO','KEY',
        # 'SPG','STT','TRV','SYK','SWK','SYF','NLOK','SYY','TDY','TECH','TER','TFX',
        # 'TDG','GL','TMO','TRMB','TSCO','TWTR','CCI','TXN','TXT','TYL','TSN','UAL',
        # 'UAA','UDR','AEE','UHS',
        # 'ULTA','UNP','URI','WM','RTX','VTR','VFC','VIAC',
        # 'V','VLO','VMC','VNO','VRSK','VRSN','VRTX','WAB','WBA','WAT','WDC','WHR',
        # 'WMB','LNT','WST','WU','WY','WYNN','XLNX','XOM','XYL','ZBRA','ZION','ZBH','ZTS'
    ]

    # Roots of the raw filings and of the cleaned output tree.
    root_input = Path("/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-original")
    root_output = Path("/Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1")

    for ticker in tickers:
        # output_dir is computed for reference only; process_company_folder
        # derives its own destination from the input folder's name.
        input_dir, output_dir = root_input / ticker, root_output / ticker

        if input_dir.exists():
            print(f"\n===== Processing {ticker} =====")
            process_company_folder(str(input_dir))
        else:
            print(f"Skipping {ticker}: folder not found.")



===== Processing PAYC =====
Output directory: /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/PAYC

Processing PAYC-2023-Q3-10Q.htm ...
  -> wrote /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/PAYC/PAYC-2023-Q3-10Q_qualitative.txt
Processing PAYC-2019-Q3-10Q.htm ...
  -> wrote /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/PAYC/PAYC-2019-Q3-10Q_qualitative.txt
Processing PAYC-2019-Q2-10Q.htm ...
  -> wrote /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/PAYC/PAYC-2019-Q2-10Q_qualitative.txt
Processing PAYC-2018-Q1-10K.htm ...
  -> wrote /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/PAYC/PAYC-2018-Q1-10K_qualitative.txt
Processing PAYC-2018-Q4-10Q.htm ...
  -> wrote /Users/clairetseng/Desktop/Research/xiu/llm-bias/ibes_aggregate/10KQ-cleaned1/PAYC/PAYC-2018-Q4-10Q_qualitative.txt
Processing PAYC-2023-Q2-10Q.htm ...
  -> wrote /Users/