In [1]:
import options_wizard as ow

    

In [168]:
from dotenv import load_dotenv
from typing import Set, Deque, Tuple, Optional
from dataclasses import dataclass
from collections import deque, defaultdict, Counter
import html as html_lib
import re
from urllib.request import Request, urlopen
from datetime import datetime, date
import os

# Pull variables from a dotenv file into the process environment (python-dotenv).
load_dotenv()
# Location of the FRD listing page; the scraper functions pass it to urllib's Request.
frd_names = os.getenv("FRD_NAMES")
# Echo the configured value so the notebook output records which source was used.
print(frd_names)

# NOTE: requires `ow` (options_wizard) to be imported/available in your runtime.
# Example:
# import options_wizard as ow

@dataclass
class FrdName:
    """One matched FRD listing entry.

    The ``start_date`` annotation is written as a string so that defining the
    class never evaluates ``ow.DateObj``. The scraper itself accepts either
    ``ow.DateObject`` or ``ow.DateObj``, so the class definition must not
    fail when only the former exists.
    """

    # Ticker key; the scraper stores the normalised (lowercase) form here.
    tick: str
    # Normalised issuer-name key used for de-duplication.
    name: str
    # Listing start date as an options_wizard date object.
    start_date: "ow.DateObj"


# ---------- Filters / Normalizers ----------

# Case-insensitive matcher for labels that look like funds / ETFs / notes rather
# than operating companies. It is applied to issuer names from the IPO CSV and to
# the raw names scraped from the FRD page; a hit causes the entry to be skipped.
FUND_LIKE_RE = re.compile(
    r"(?i)\b("
    r"etf|etn|mutual fund|fund\b|index\b|closed[- ]end|exchange[- ]traded|"
    r"i?shares|spdr|vanguard|invesco|proshares|direxion|wisdomtree|"
    r"first trust|guggenheim|nuveen|blackrock|pimco|van eck|"
    r"portfolio|portfo\b|income fund|dividend fund|term fund|"
    r"ucits|sicav|tracker|notes?\b"
    r")\b"
)

# Lowercase tokens removed from a normalised issuer name before the name key is
# built (articles, legal-form suffixes, generic corporate words).
NAME_DROP = {
    "the", "and", "of", "by", "for", "a", "an",
    "inc", "incorporated", "corp", "corporation", "co", "company", "companies",
    "ltd", "limited", "llc", "plc", "lp", "llp",
    "cos", "cos.", "co.", "inc.", "ltd.", "plc.",
    "group", "holdings", "holding", "partners", "trust", "reit",
    "real", "estate", "realty", "finance", "financial",
}
# Extra tokens; "the" is already present above, so only "de" and "us" are new.
NAME_DROP |= {"de", "us", "the"}

# Suffix codes stripped from a ticker string when they follow whitespace
# (presumably exchange/country codes such as "AAPL US" — confirm against the CSV).
TICK_IGNORE = [
    "US", "HK", "LN", "DE", "SW", "VI", "PA", "FRA", "TO", "CN", "KS",
    "MU", "OL", "SS", "BE", "AX", "MI", "NZ", "CO", "SA", "AU"
]

# Regex fragments removed from an issuer name before tokenising. They are joined
# with "|" into one case-insensitive alternation, so earlier (longer) phrases are
# tried before the shorter single-word fragments further down.
PHRASE_STRIP = [
    r"common shares(?: of beneficial interest)?",
    r"ordinary shares",
    r"american depositary shares",
    r"ads each representing.*",
    r"each representing.*",
    r"common units(?: representing.*)?",
    r"units representing.*",
    r"of beneficial interest",
    r"beneficial interest",
    r"depositary shares",
    r"parent inc.*",
    r"inc\/the|co\/the|group\/the",
    r"\bcmn\b",
    r"\bcom\b",
    r"\bcomm\b",
    r"\bcommon\b",
    r"\bordinary\b",
    r"\bshares?\b",
    r"\bholdings?\b",
    r"\binc\b|\bcorp\b|\bplc\b|\bltd\b|\bllc\b",
    r"\binc\/the\b|\bcorp\/the\b|\bco\/the\b|\bgroup\/the\b",
    r"\bcorp\/de\b|\binc\/de\b",
]

def is_primary_ipo(row) -> bool:
    """Tell whether a CSV row describes a primary, non-SPAC IPO.

    The decision is made purely on the lowercased ``"Offer Type"`` field:
    it must mention both ``ipo`` and ``primary``, must not mention ``spac``,
    and must not contain any of the excluded offering kinds.
    """
    label = (row.get("Offer Type") or "").lower()

    # Required / forbidden markers, checked in one guard.
    if "ipo" not in label or "spac" in label or "primary" not in label:
        return False

    excluded = (
        "follow-on",
        "secondary",
        "block trade",
        "accelerated",
        "atm",
        "rights",
        "convertible",
    )
    return not any(term in label for term in excluded)


def extract_pattern(tick_str: str, name_str: str) -> Tuple[str, str]:
    """Normalize a raw ticker and a raw issuer name into lowercase match keys.

    Args:
        tick_str: Raw ticker text (``None`` is treated as empty).
        name_str: Raw issuer name (``None`` is treated as empty).

    Returns:
        ``(ticker_key, name_key)``. Either element may be the empty string
        when nothing is left after normalization.
    """

    def norm_ticker(s: str) -> str:
        s = (s or "").strip()
        # Drop everything from a "class" qualifier onwards.
        s = re.sub(r"(?i)\bclass\b.*$", "", s).strip()

        # Remove TICK_IGNORE codes, but only when they follow whitespace so a
        # ticker that merely equals one of the codes is left untouched.
        tick_re = rf'(?i)(?<=\s)(?:{"|".join(map(re.escape, TICK_IGNORE))})\b'
        s = re.sub(tick_re, "", s)

        # Strip a single "-DELISTED" marker glued to the ticker.
        s = re.sub(r"(?i)(?<=\S)-DELISTED\b", "", s, count=1)
        s = re.sub(r"\s{2,}", " ", s).strip()
        return s.lower()

    def norm_name(s: str) -> str:
        s = (s or "").strip()
        s = re.sub(r"(?i)\bclass\b.*$", "", s).strip()

        if PHRASE_STRIP:
            s = re.sub(r"(?i)\b(?:%s)\b" % "|".join(PHRASE_STRIP), "", s).strip()

        s = s.replace("&", " and ")
        # Delete apostrophes so "O'Reilly" and "O\u2019Reilly" both become
        # "OReilly". The class previously held only the mis-decoded bytes of
        # U+2019 (\u00e2 \u20ac \u2122), which never matched a real curly
        # apostrophe; those characters are kept for backward compatibility.
        s = re.sub("[\u2019\u00e2\u20ac\u2122']", "", s)
        s = re.sub(r"[^A-Za-z0-9]+", " ", s)
        s = re.sub(r"\s{2,}", " ", s).strip().lower()

        tokens = [t for t in s.split() if t and t not in NAME_DROP]

        # Only the leading tokens form the key, so long legal names still match.
        CORE_N = 5
        return " ".join(tokens[:CORE_N])

    return norm_ticker(tick_str), norm_name(name_str)


# ---------- IPO Loader (primary IPO only + dedupe per issuer) ----------

def _parse_date_any(s: str):
    """Parse ``DD/MM/YYYY`` or ``YYYY-MM-DD`` text into a ``datetime.date``.

    Returns ``None`` for empty/``None`` input or when neither layout fits.
    """
    text = (s or "").strip()
    if text:
        for layout in ("%d/%m/%Y", "%Y-%m-%d"):
            try:
                parsed = datetime.strptime(text, layout)
            except ValueError:
                # Wrong layout; try the next one.
                continue
            return parsed.date()
    return None

def _best_row_date(row):
    """Return the first parseable date among the row's date columns.

    Columns are tried in order: pricing, announced, expected pricing.
    ``None`` is returned when none of them parses.
    """
    for column in ("Pricing Date", "Announced Date", "Expected Pricing Date"):
        parsed = _parse_date_any(row.get(column))
        if parsed:
            return parsed
    return None

def _issuer_key(row) -> str:
    """Build a de-duplication key from the row's ``"Issuer Name"``.

    The name is lowercased, every run of non-alphanumeric characters becomes
    a single space, and legal-form / generic corporate words are dropped.
    """
    legal_words = {
        "inc", "incorporated", "corp", "corporation", "co", "company",
        "companies", "ltd", "limited", "llc", "plc", "lp", "llp",
        "holdings", "holding", "group",
    }
    lowered = (row.get("Issuer Name") or "").lower()
    cleaned = re.sub(r"[^a-z0-9]+", " ", lowered).strip()
    # After cleaning, tokens are purely [a-z0-9], so whole-token filtering is
    # the same as removing the words at word boundaries.
    kept = [word for word in cleaned.split() if word not in legal_words]
    return " ".join(kept)

def load_raw(_dict: bool = False):
    """Load the IPO CSV, keep primary non-SPAC non-fund IPOs, and dedupe issuers.

    The CSV path is read from the ``IPO_NAMES`` environment variable. Rows are
    dropped when the offer type mentions ``SPAC``, when ``is_primary_ipo``
    rejects them, or when the issuer name matches ``FUND_LIKE_RE``. Remaining
    rows are de-duplicated by ``_issuer_key``, keeping the row with the
    earliest date (a dated row always replaces an undated one).

    Args:
        _dict: When false, return two sets of normalized keys. When true,
            return two ``defaultdict(list)`` maps from key to the IPO rows
            that produced it.

    Returns:
        ``(tick_patterns, name_patterns)`` in the form selected by ``_dict``.

    Raises:
        ValueError: If ``IPO_NAMES`` is not set.
    """
    import csv

    load_dotenv()
    names = os.getenv("IPO_NAMES")
    if not names:
        # Fail with a clear message instead of the opaque error from open(None).
        raise ValueError("IPO_NAMES env var is not set.")

    total_count = 0
    skipped_spac = 0
    skipped_non_primary = 0
    skipped_fundlike = 0

    rows = []
    with open(names, "r", encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            total_count += 1

            # DictReader fills missing trailing fields with None, so coerce
            # before the substring test.
            offer_type = row.get("Offer Type") or ""
            if "SPAC" in offer_type:
                skipped_spac += 1
                continue

            if not is_primary_ipo(row):
                skipped_non_primary += 1
                continue

            if FUND_LIKE_RE.search(row.get("Issuer Name") or ""):
                skipped_fundlike += 1
                continue

            rows.append(row)

    # Keep one row per issuer key, preferring the earliest dated row.
    best_by_issuer = {}
    dupes_removed = 0
    for row in rows:
        k = _issuer_key(row)
        d = _best_row_date(row)

        if k not in best_by_issuer:
            best_by_issuer[k] = row
            continue

        dupes_removed += 1
        de = _best_row_date(best_by_issuer[k])
        if de is None or (d is not None and d < de):
            best_by_issuer[k] = row

    rows = list(best_by_issuer.values())

    if _dict:
        tick_patterns = defaultdict(list)
        name_patterns = defaultdict(list)
    else:
        tick_patterns = set()
        name_patterns = set()

    for row in rows:
        tick_str = (row.get("Issuer Ticker") or "").strip()
        name_str = (row.get("Issuer Name") or "").strip()
        tick_pattern, name_pattern = extract_pattern(tick_str, name_str)
        # Empty keys are never recorded in either form.
        if tick_pattern:
            if _dict:
                tick_patterns[tick_pattern].append(row)
            else:
                tick_patterns.add(tick_pattern)
        if name_pattern:
            if _dict:
                name_patterns[name_pattern].append(row)
            else:
                name_patterns.add(name_pattern)

    print(
        f"Loaded {total_count} rows -> kept {len(rows)} unique primary IPO issuers; "
        f"skipped {skipped_spac} SPACs, {skipped_non_primary} non-primary IPOs, "
        f"skipped {skipped_fundlike} fund-like, removed {dupes_removed} duplicate issuer rows."
    )
    return tick_patterns, name_patterns


# ---------- Safe ticker fallback helpers ----------

def _ipo_date(row) -> Optional[date]:
    """Return the IPO date for a CSV row; a thin alias of ``_best_row_date``."""
    return _best_row_date(row)

def build_ticker_candidates():
    """Return the ticker-key -> IPO-rows map produced by ``load_raw``.

    A key may map to more than one row when several issuers normalize to the
    same ticker.
    """
    return load_raw(_dict=True)[0]

def choose_ticker_candidate(
    rows,
    frd_start: date,
    max_before_days: int = 30,
    max_after_days: int = 365,
    ambiguity_days: int = 14,
):
    """Pick the IPO row whose date lies closest to an FRD start date.

    Rows without a usable date are ignored. The nearest row is accepted only
    if ``frd_start - ipo_date`` (in days) lies within
    ``[-max_before_days, max_after_days]`` and the runner-up is more than
    ``ambiguity_days`` further away.

    Returns:
        ``(row, reason)`` on success, otherwise ``(None, reason)`` where the
        reason string names the failed check.
    """
    dated = []
    for row in rows:
        ipo_day = _ipo_date(row)
        if ipo_day is not None:
            gap = (frd_start - ipo_day).days
            dated.append((abs(gap), gap, row))

    if not dated:
        return None, "no_candidate_dates"

    # Stable sort on absolute distance only, so ties keep input order.
    ranked = sorted(dated, key=lambda item: item[0])
    nearest_abs, nearest_gap, nearest_row = ranked[0]

    inside_gate = -max_before_days <= nearest_gap <= max_after_days
    if not inside_gate:
        return None, f"fails_date_gate(delta_days={nearest_gap})"

    if len(ranked) > 1 and ranked[1][0] - nearest_abs <= ambiguity_days:
        return None, "ambiguous_close_candidates"

    return nearest_row, f"chosen(delta_days={nearest_gap})"


# ---------- FRD Scraper (NAME FIRST, SAFE TICKER FALLBACK) ----------

def scrape_names_name_first_safe_ticker() -> Tuple[
    Deque[FrdName], Deque[Tuple[str, str]], Deque[Tuple[str, str]]
]:
    """Match FRD listing lines against the IPO universe, name first.

    Each FRD line carrying a ``Start Date: YYYY-MM-DD`` marker is normalized
    with ``extract_pattern``. A line is kept when its name key is an IPO name
    key. Otherwise the ticker key is used as a fallback, but only when
    ``choose_ticker_candidate`` finds a single clearly-best IPO row whose date
    is close enough to the FRD start date. Results are de-duplicated by name
    key, keeping the earliest start date.

    Returns:
        ``(names, skipped_tick_match, skipped_name_match)``:
        ``names`` holds the de-duplicated ``FrdName`` entries;
        ``skipped_tick_match`` holds ``(ticker_key, name_key)`` pairs whose
        name key is not an IPO name key; ``skipped_name_match`` holds pairs
        whose name key is an IPO name key but whose ticker key is not an IPO
        ticker key.

    Raises:
        ValueError: If ``FRD_NAMES`` is not set.
        AttributeError: If ``options_wizard`` exposes neither ``DateObject``
            nor ``DateObj``.
    """
    if not frd_names:
        raise ValueError("FRD_NAMES env var is not set.")

    req = Request(frd_names, headers={"User-Agent": "Mozilla/5.0"})
    with urlopen(req) as response:
        page = response.read().decode("utf-8", errors="ignore")

    # Reduce the HTML to plain text, turning block-level closers into newlines
    # so that each listing ends up on its own line.
    page = re.sub(r"<script[^>]*>.*?</script>", "", page, flags=re.IGNORECASE | re.DOTALL)
    page = re.sub(r"<style[^>]*>.*?</style>", "", page, flags=re.IGNORECASE | re.DOTALL)
    page = re.sub(r"<br\s*/?>", "\n", page, flags=re.IGNORECASE)
    page = re.sub(r"</(p|div|li|tr|td|h\d|ul)>", "\n", page, flags=re.IGNORECASE)
    text = re.sub(r"<[^>]+>", "", page)
    text = html_lib.unescape(text)

    date_cls = getattr(ow, "DateObject", None) or getattr(ow, "DateObj", None)
    if date_cls is None:
        raise AttributeError("options_wizard.DateObj or DateObject not found")

    def parse_date(date_str: str):
        """Return ``(options_wizard date, datetime.date)`` for an ISO string."""
        y, m, d = map(int, date_str.split("-"))
        py = date(y, m, d)
        if hasattr(date_cls, "from_iso"):
            return date_cls.from_iso(date_str), py
        return date_cls(y, m, d), py

    # Read and filter the IPO CSV once. The dict-form maps carry exactly the
    # keys the set form would contain, so the membership sets come from them.
    ticker_candidates, name_candidates = load_raw(_dict=True)
    targets_ticks = set(ticker_candidates)
    target_names = set(name_candidates)

    date_re = re.compile(r"Start Date:\s*(\d{4}-\d{2}-\d{2})")

    by_name = {}  # dedupe by IPO name key (true "unique")
    skipped_tick_match = deque()
    skipped_name_match = deque()

    # diagnostics
    kept_by_name = 0
    kept_by_safe_ticker = 0
    removed_funds = 0
    removed_nonmatch = 0
    dropped_ticker_no_candidates = 0
    dropped_ticker_date_gate = 0
    dropped_ticker_ambiguous = 0
    total_lines = 0

    for line in text.splitlines():
        total_lines += 1
        if "Start Date" not in line:
            continue

        m = date_re.search(line)
        if not m:
            continue

        date_str = m.group(1)
        prefix = line[:m.start()].strip()
        if not prefix:
            continue

        # First whitespace-separated token is the ticker; the name is whatever
        # sits between the first "(" and the last ")".
        ticker_raw = prefix.split()[0]

        name_raw = ""
        op = prefix.find("(")
        cp = prefix.rfind(")")
        if op != -1 and cp != -1 and cp > op:
            name_raw = prefix[op + 1:cp].strip()

        # skip FRD fund-like instruments
        if name_raw and FUND_LIKE_RE.search(name_raw):
            removed_funds += 1
            continue

        ticker_key, name_key = extract_pattern(ticker_raw, name_raw)
        ow_date, py_date = parse_date(date_str)

        # Diagnostic lists returned to the caller; they do not affect matching.
        if (ticker_key not in targets_ticks) and (name_key in target_names):
            skipped_name_match.append((ticker_key, name_key))
        if name_key not in target_names:
            skipped_tick_match.append((ticker_key, name_key))

        # 1) NAME MATCH (priority)
        if name_key in target_names:
            frd = FrdName(tick=ticker_key, name=name_key, start_date=ow_date)
            if frd.name not in by_name or frd.start_date < by_name[frd.name].start_date:
                by_name[frd.name] = frd
            kept_by_name += 1
            continue

        # 2) SAFE TICKER FALLBACK
        if ticker_key in ticker_candidates:
            chosen_row, reason = choose_ticker_candidate(
                ticker_candidates[ticker_key],
                frd_start=py_date,
                max_before_days=30,
                max_after_days=365,
                ambiguity_days=14,
            )

            if chosen_row is None:
                if reason.startswith("no_candidate"):
                    dropped_ticker_no_candidates += 1
                elif reason.startswith("fails_date_gate"):
                    dropped_ticker_date_gate += 1
                else:
                    dropped_ticker_ambiguous += 1
                removed_nonmatch += 1
                continue

            # Record the match under the IPO row's own name key so that name
            # matches and ticker matches of one issuer collapse together.
            _, ipo_name_key = extract_pattern(
                chosen_row.get("Issuer Ticker") or "",
                chosen_row.get("Issuer Name") or "",
            )
            if ipo_name_key:
                frd = FrdName(tick=ticker_key, name=ipo_name_key, start_date=ow_date)
                if frd.name not in by_name or frd.start_date < by_name[frd.name].start_date:
                    by_name[frd.name] = frd
                kept_by_safe_ticker += 1
                continue

        removed_nonmatch += 1

    names = deque(by_name.values())

    print("\n===== scrape_names_name_first_safe_ticker() =====")
    print(f"FRD lines total: {total_lines}")
    print(f"kept_by_name: {kept_by_name}")
    print(f"kept_by_safe_ticker: {kept_by_safe_ticker}")
    print(f"removed_funds: {removed_funds}")
    print(f"dropped_ticker_no_candidates: {dropped_ticker_no_candidates}")
    print(f"dropped_ticker_date_gate: {dropped_ticker_date_gate}")
    print(f"dropped_ticker_ambiguous: {dropped_ticker_ambiguous}")
    print(f"removed_nonmatch: {removed_nonmatch}")
    print(f"FINAL unique matched issuers (deduped by IPO name-key): {len(names)}")

    return names, skipped_tick_match, skipped_name_match


https://firstratedata.com/cb/5/complete-us-stocks-index-etf-futures-options


In [169]:
# Run the scraper: matched entries plus the two diagnostic deques it returns.
res, stm, snm = scrape_names_name_first_safe_ticker()


Loaded 6132 rows -> kept 3013 unique primary IPO issuers; skipped 1358 SPACs, 1321 non-primary IPOs, skipped 412 fund-like, removed 28 duplicate issuer rows.
Loaded 6132 rows -> kept 3013 unique primary IPO issuers; skipped 1358 SPACs, 1321 non-primary IPOs, skipped 412 fund-like, removed 28 duplicate issuer rows.

===== scrape_names_name_first_safe_ticker() =====
FRD lines total: 21391
kept_by_name: 2013
kept_by_safe_ticker: 230
removed_funds: 5813
dropped_ticker_no_candidates: 0
dropped_ticker_date_gate: 90
dropped_ticker_ambiguous: 0
removed_nonmatch: 12789
FINAL unique matched issuers (deduped by IPO name-key): 2036


In [172]:
from collections import Counter, defaultdict
from datetime import date

def _fetch_frd_evidence_sets():
    """Download the FRD page once and index its ``Start Date`` lines.

    Returns:
        A 4-tuple ``(tickers, namekeys, fundlike_namekeys, ticker_min_start)``:
        the normalized ticker keys seen, the normalized name keys seen, the
        subset of name keys whose raw FRD label matched ``FUND_LIKE_RE``, and
        a dict from ticker key to its earliest start date (``datetime.date``).

    Raises:
        ValueError: If ``FRD_NAMES`` is not set.
    """
    if not frd_names:
        raise ValueError("FRD_NAMES env var is not set.")

    with urlopen(Request(frd_names, headers={"User-Agent": "Mozilla/5.0"})) as resp:
        html_src = resp.read().decode("utf-8", errors="ignore")

    # Strip script/style bodies, turn block-level closers into newlines, then
    # drop the remaining tags and decode entities.
    block_flags = re.IGNORECASE | re.DOTALL
    html_src = re.sub(r"<script[^>]*>.*?</script>", "", html_src, flags=block_flags)
    html_src = re.sub(r"<style[^>]*>.*?</style>", "", html_src, flags=block_flags)
    html_src = re.sub(r"<br\s*/?>", "\n", html_src, flags=re.IGNORECASE)
    html_src = re.sub(r"</(p|div|li|tr|td|h\d|ul)>", "\n", html_src, flags=re.IGNORECASE)
    plain = html_lib.unescape(re.sub(r"<[^>]+>", "", html_src))

    start_re = re.compile(r"Start Date:\s*(\d{4}-\d{2}-\d{2})")

    seen_tickers = set()
    seen_names = set()
    fundlike_names = set()
    earliest_start = {}

    for raw_line in plain.splitlines():
        hit = start_re.search(raw_line) if "Start Date" in raw_line else None
        if hit is None:
            continue

        year, month, day = (int(part) for part in hit.group(1).split("-"))
        start_day = date(year, month, day)

        label = raw_line[:hit.start()].strip()
        if not label:
            continue

        # Ticker is the first token; the name sits between "(" and the last ")".
        raw_ticker = label.split()[0]
        lparen, rparen = label.find("("), label.rfind(")")
        raw_name = label[lparen + 1:rparen].strip() if 0 <= lparen < rparen else ""

        ticker_key, name_key = extract_pattern(raw_ticker, raw_name)

        if ticker_key:
            seen_tickers.add(ticker_key)
            if ticker_key not in earliest_start or start_day < earliest_start[ticker_key]:
                earliest_start[ticker_key] = start_day

        if name_key:
            seen_names.add(name_key)
            if raw_name and FUND_LIKE_RE.search(raw_name):
                fundlike_names.add(name_key)

    return seen_tickers, seen_names, fundlike_names, earliest_start


def explain_missing_by_iterating_ipos(
    max_before_days: int = 30,
    max_after_days: int = 365,
    sample_n: int = 1000,
):
    """Classify every IPO issuer by whether and how it appears on FRD.

    This is the inverse of the scraper: it walks the filtered, de-duplicated
    IPO name keys and checks each one against the FRD evidence sets.

    Classification order:
      1) name key present on FRD (counted separately if the FRD label was
         fund-like);
      2) otherwise ticker key present on FRD, gated on
         ``frd_earliest_start - ipo_date`` lying within
         ``[-max_before_days, max_after_days]``;
      3) otherwise not on FRD.

    Args:
        max_before_days: Largest number of days FRD may start before the IPO.
        max_after_days: Largest number of days FRD may start after the IPO.
        sample_n: Cap on stored examples per bucket.

    Returns:
        ``(missing_rows, counters, examples)``: a list of
        ``(ticker_key, name_key, raw_name, reason, details)`` tuples, a
        ``Counter`` of bucket names, and a ``defaultdict(list)`` of example
        tuples per bucket.
    """
    (
        frd_tickers,
        frd_namekeys,
        frd_fundlike_namekeys,
        frd_ticker_to_min_start,
    ) = _fetch_frd_evidence_sets()

    # Only the name-keyed map is needed; each key maps to its IPO rows.
    _, name_map = load_raw(_dict=True)

    ipo_namekeys = list(name_map.keys())
    counters = Counter()
    missing_rows = []
    examples = defaultdict(list)

    def keep_example(bucket, item):
        """Store ``item`` under ``bucket`` unless the bucket is already full."""
        stored = examples[bucket]
        if len(stored) < sample_n:
            stored.append(item)

    for name_key in ipo_namekeys:
        row = name_map[name_key][0]  # representative row for this key

        t_raw = (row.get("Issuer Ticker") or "").strip()
        n_raw = (row.get("Issuer Name") or "").strip()
        t_key = extract_pattern(t_raw, n_raw)[0]
        ipo_d = _best_row_date(row)

        # 1) Name key seen on FRD.
        if name_key in frd_namekeys:
            if name_key in frd_fundlike_namekeys:
                bucket = "present_on_frd_namekey_but_fundlike_filtered"
                counters[bucket] += 1
                keep_example(bucket, (t_key, name_key, n_raw))
            else:
                counters["found_on_frd_by_namekey"] += 1
            continue

        # 3) Neither name nor ticker seen on FRD.
        if not (t_key and t_key in frd_tickers):
            bucket = "not_on_frd_by_name_or_ticker"
            counters[bucket] += 1
            missing_rows.append((t_key, name_key, n_raw, "not_on_frd", "no_ticker_or_namekey_hit"))
            keep_example(bucket, (t_key, name_key, n_raw))
            continue

        # 2) Ticker seen on FRD: apply the date gate.
        frd_start = frd_ticker_to_min_start.get(t_key)
        if ipo_d is None or frd_start is None:
            bucket = "ticker_on_frd_but_no_date_to_gate"
            counters[bucket] += 1
            keep_example(bucket, (t_key, name_key, n_raw))
            continue

        delta = (frd_start - ipo_d).days
        details = f"delta_days={delta}, frd_start={frd_start}, ipo_date={ipo_d}"
        if delta < -max_before_days or delta > max_after_days:
            bucket = "ticker_on_frd_but_fails_date_gate_likely_reused"
            reason = "fails_date_gate"
        else:
            bucket = "ticker_on_frd_in_window_but_namekey_missing"
            reason = "ticker_in_window_name_mismatch"
        counters[bucket] += 1
        missing_rows.append((t_key, name_key, n_raw, reason, details))
        keep_example(bucket, (t_key, name_key, n_raw, delta))

    # Summary: anything not matched cleanly by name counts as problematic.
    total = len(ipo_namekeys)
    found = counters["found_on_frd_by_namekey"]
    problematic = total - found

    print("\n===== Inverted check: IPO -> FRD =====")
    print(f"IPO unique issuers (namekeys): {total}")
    print(f"Found on FRD by name-key (clean): {found}")
    print(f"Not found cleanly by name-key (problematic): {problematic}\n")

    print("Breakdown:")
    for k, v in counters.most_common():
        print(f"{k}: {v}")

    # Only the "not on FRD" bucket is printed; the rest are returned.
    shown_bucket = "not_on_frd_by_name_or_ticker"
    shown_items = examples.get(shown_bucket)
    if shown_items:
        print(f"\n--- Examples: {shown_bucket} (up to {sample_n}) ---")
        for item in shown_items:
            print(item)

    return missing_rows, counters, examples

# Run the inverted IPO -> FRD check with the default date gate and sample cap.
missing_rows, counters, examples = explain_missing_by_iterating_ipos()


Loaded 6132 rows -> kept 3013 unique primary IPO issuers; skipped 1358 SPACs, 1321 non-primary IPOs, skipped 412 fund-like, removed 28 duplicate issuer rows.

===== Inverted check: IPO -> FRD =====
IPO unique issuers (namekeys): 3009
Found on FRD by name-key (clean): 1852
Not found cleanly by name-key (problematic): 1157

Breakdown:
found_on_frd_by_namekey: 1852
not_on_frd_by_name_or_ticker: 864
ticker_on_frd_in_window_but_namekey_missing: 194
ticker_on_frd_but_fails_date_gate_likely_reused: 67
present_on_frd_namekey_but_fundlike_filtered: 32

--- Examples: not_on_frd_by_name_or_ticker (up to 1000) ---
('mfbi', 'monroe federal bancorp', 'Monroe Federal Bancorp Inc')
('2507', 'cirrus aircraft', 'Cirrus Aircraft Ltd')
('tzup', 'thumzup media', 'Thumzup Media Corp')
('fadl', 'fadel', 'Fadel Partners Inc')
('2310932d', 'aesi', 'AESI Holdings Inc')
('cmhf', 'community heritage', 'Community Heritage Financial Inc')
('2082549d', 'heartsciences', 'HeartSciences Inc')
('mcvt', 'mill city ventur