# Option Viz — Full Pipeline (Data → Preprocess)

This notebook fetches an option chain (equity via **yfinance** or crypto via the **OKX** public API),
then runs preprocessing:

1. Compute robust **mid** prices and quote-quality flags (crossed/wide)
2. **Put–Call Parity** diagnostics and synthetic legs
3. **Forward** estimation from PCP and **log‑moneyness**

> Tip: If imports fail, make sure the repo's `src/` directory is on `PYTHONPATH`, or launch the notebook from the repo root and run the setup cell below (it adds `./src` to `sys.path`).

In [None]:
# --- Setup: ensure src/ is importable; show versions ---
import os
import sys
from pathlib import Path

# The project sources are expected in "src" under the current working directory.
ROOT = Path.cwd()
SRC = ROOT / "src"

# Register the source directory on the import path only once.
_src_entry = str(SRC)
if _src_entry not in sys.path:
    sys.path.append(_src_entry)

print("Using SRC:", SRC)
print("Python:", sys.version.split()[0])

# Dependency check: any failure (missing package, missing attribute) is
# reported as a message instead of aborting the cell.
try:
    import pandas as pd
    import numpy as np
    import matplotlib
    import httpx
    import yfinance

    print(
        "pandas:", pd.__version__,
        "numpy:", np.__version__,
        "matplotlib:", matplotlib.__version__,
    )
    print(
        "httpx:", httpx.__version__,
        "yfinance:", getattr(yfinance, "__version__", "unknown"),
    )
except Exception as e:
    print("Dependency check failed:", e)


## Select Backend & Underlying

- For **equity**, try: `asset_class = "equity"`, `underlying = "AAPL"` (or `"SPY"`).
- For **crypto**, try: `asset_class = "crypto"`, `underlying = "BTC"` (or `"ETH"`).

In [None]:
# Backend selector: "equity" or "crypto".
asset_class = "equity"
# Ticker to fetch, e.g. "AAPL" or "SPY" for equity, "BTC" or "ETH" for crypto.
underlying = "AAPL"
# Position in the sorted expiry list; 0 = nearest expiry. Increase it if the
# nearest expiry is too close or empty.
expiry_index = 0

# Preprocess knobs
# Quotes are marked wide when (ask-bid)/mid > 15%.
WIDE_REL_THRESHOLD = 0.15
# Year fraction denominator (days per year).
DAY_COUNT = 365.25


## Fetch Chain

We use the factory `get_fetcher(asset_class)` → yfinance (equity) or OKX (crypto).

The notebook-friendly `arun()` helper runs a coroutine whether or not an event loop is already running; when one is running (as in Jupyter), it needs `nest_asyncio` to be installed.

In [None]:
import asyncio
def arun(coro):
    """Run a coroutine to completion and return its result.

    Works both in plain scripts (no event loop running) and in notebooks
    (an event loop is already running in the current thread).

    Args:
        coro: The coroutine to execute.

    Returns:
        Whatever the coroutine returns.

    Raises:
        RuntimeError: If an event loop is already running and the
            ``nest_asyncio`` package cannot be imported.
        Exception: Any exception raised by the coroutine itself is
            propagated unchanged.
    """
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: a fresh one can be used.
        return asyncio.run(coro)

    # A loop is already running; re-entering it requires nest_asyncio.
    # Only the import is guarded, so failures of the coroutine itself are
    # not misreported as a missing dependency.
    try:
        import nest_asyncio
    except ImportError as e:
        raise RuntimeError("An asyncio loop is running and nest_asyncio is not available.\n"
                           "Install it with: pip install nest_asyncio") from e

    nest_asyncio.apply(loop)
    return loop.run_until_complete(coro)

from data.registry import get_fetcher
from data.historical_loader import chain_to_dataframe, save_chain_csv

# Resolve the backend for the selected asset class and list its expiries.
fetcher = get_fetcher(asset_class)
expiries = arun(fetcher.list_expiries(underlying))
if not expiries:
    # Explicit raise rather than `assert`, which is stripped under `python -O`.
    raise ValueError(f"No expiries returned for {underlying} ({asset_class})")

# Pick the requested expiry from the sorted list; an index past the end is
# clamped to the last available expiry.
expiries_sorted = sorted(expiries)
expiry = expiries_sorted[min(expiry_index, len(expiries_sorted)-1)]
print(f"Selected expiry: {expiry.date().isoformat()}  | total expiries: {len(expiries_sorted)}")
chain = arun(fetcher.fetch_chain(underlying, expiry))
print(f"Spot: {chain.spot} | Quotes: {len(chain.quotes)} | AsOf: {chain.asof_utc}")

try:
    df = chain_to_dataframe(chain)
except Exception as e:
    # Best-effort fallback: build the frame directly from the quote objects,
    # but report why the helper failed instead of hiding the error.
    print("chain_to_dataframe failed; using fallback conversion:", e)
    import pandas as pd
    df = pd.DataFrame([
        {
            "symbol": q.symbol,
            "underlying": q.underlying,
            "asset_class": q.asset_class,
            "expiry": q.expiry,
            "strike": q.strike,
            # Quotes may expose the option type as `opt_type` or `type`.
            "type": getattr(q, "opt_type", getattr(q, "type", None)),
            "bid": q.bid,
            "ask": q.ask,
            "last": q.last,
            "mark": q.mark,
            "volume": q.volume,
            "open_interest": q.open_interest,
            "contract_size": q.contract_size,
            "underlying_ccy": q.underlying_ccy,
            "quote_ccy": q.quote_ccy,
        }
        for q in chain.quotes
    ])
print(df.head())

# Save a CSV snapshot
Path("out").mkdir(exist_ok=True, parents=True)
csv_path = Path("out") / f"{underlying}_{expiry.date().isoformat()}_{asset_class}.csv"
try:
    save_chain_csv(csv_path, chain)
except Exception as e:
    # Fall back to dumping the DataFrame, and say why the helper was skipped.
    print("save_chain_csv failed; writing DataFrame snapshot instead:", e)
    df.to_csv(csv_path, index=False)
print("Saved:", csv_path)


## Preprocess: mids, PCP, forward, log‑moneyness

In [None]:
import numpy as np
from preprocess.midprice import add_midprice_columns
from preprocess.pcp import add_pcp_diagnostics
from preprocess.forward import forward_price, log_moneyness, estimate_forward_from_chain
from data.risk_free import RiskFreeProvider, RiskFreeConfig

# 1) Robust mids + flags
dfp = add_midprice_columns(df, bid_col="bid", ask_col="ask", out_prefix="", wide_rel_threshold=WIDE_REL_THRESHOLD)
print("Rows with both sides:", int((dfp["side_used"]=="both").sum()))
print("Crossed:", int(dfp["crossed"].sum()), "| Wide:", int(dfp["wide"].sum()))

# 2) Risk-free rate and year fraction
asof = getattr(chain, "asof_utc", None)
if asof is None:
    # Without an as-of timestamp the subtraction below would fail with an
    # opaque TypeError. Fall back to the current UTC time, matching the
    # tz-awareness of `expiry` so the two datetimes can be subtracted.
    from datetime import datetime, timezone
    asof = datetime.now(timezone.utc)
    if getattr(expiry, "tzinfo", None) is None:
        # NOTE(review): assumes a naive `expiry` is expressed in UTC — confirm.
        asof = asof.replace(tzinfo=None)
    print("Warning: chain has no as-of timestamp; using current UTC time:", asof)
# Time to expiry in years, floored at a tiny positive value so it is never zero or negative.
T = max((expiry - asof).total_seconds()/(DAY_COUNT*24*3600), 1e-8)
rf = RiskFreeProvider(RiskFreeConfig(default_rate=0.05, forward_fill=True))
# Use the flat 5% default when `asof` cannot provide a calendar date.
r = rf.get_rate(asof.date()) if hasattr(asof, "date") else 0.05
print(f"T ~ {T:.6f} years | r ~ {r:.4f}")

# 3) PCP diagnostics and forward estimate
# Ensure we have a 'type' column with 'C'/'P'
if "type" not in dfp.columns and "opt_type" in dfp.columns:
    dfp["type"] = dfp["opt_type"]
# A missing or zero spot is passed to the diagnostics as NaN.
diag = add_pcp_diagnostics(dfp, spot=chain.spot or np.nan, r=r, T=T, price_col="mid", type_col="type", strike_col="strike")
print("PCP diagnostics head:\n", diag.head())

F_est = estimate_forward_from_chain(dfp, r=r, T=T, price_col="mid", type_col="type", strike_col="strike", spot_hint=chain.spot)
print(f"Estimated forward F ≈ {F_est:.6f}")
if not np.isfinite(F_est) or F_est <= 0:
    # log(strike / F) is undefined here; surface it instead of silently
    # filling the `k` column with NaN/inf.
    print("Warning: forward estimate is not a positive finite number; log-moneyness will be invalid.")

# 4) Log‑moneyness
dfp["k"] = np.log(dfp["strike"].astype(float) / F_est)
print(dfp[["strike","type","mid","rel_spread","k"]].head())


## Quick Visuals
Simple Matplotlib plots to eyeball the data quality and PCP behavior.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Normalise the option-type labels once, then split the frame into calls and puts.
_opt_kind = dfp["type"].astype(str).str.upper()
dfc = dfp[_opt_kind.eq("C")]
dfp_ = dfp[_opt_kind.eq("P")]

# Figure 1: mid price against strike, one scatter series per option type.
fig1, ax = plt.subplots(figsize=(7, 4))
for _frame, _label in ((dfc, "Calls"), (dfp_, "Puts")):
    ax.scatter(_frame["strike"], _frame["mid"], s=10, label=_label)
ax.set(
    xlabel="Strike",
    ylabel="Mid price",
    title=f"{underlying} mids @ {expiry.date().isoformat()} ({asset_class})",
)
ax.legend()
plt.show()

# Figure 2: put-call parity residuals as unconnected markers around a zero line.
# NOTE(review): the x-values are `diag.index`; the "Strike" axis label assumes
# that index holds strikes — confirm against add_pcp_diagnostics.
fig2, ax2 = plt.subplots(figsize=(7, 3.8))
ax2.plot(diag.index, diag["residual"], linestyle="none", marker="o", markersize=3)
ax2.axhline(y=0, linestyle="--")
ax2.set(
    xlabel="Strike",
    ylabel="PCP residual",
    title="Put–Call parity residual vs strike",
)
plt.show()


## Save artifacts
CSV/PNG outputs land in `out/` for easy sharing.

In [None]:
# Output directory for shareable artifacts; created on demand.
OUT = Path("out")
OUT.mkdir(parents=True, exist_ok=True)

# Every artifact name starts with "<underlying>_<expiry date>_<asset class>".
_artifact_stem = f"{underlying}_{expiry.date().isoformat()}_{asset_class}"

# Preprocessed table as CSV, without the index column.
dfp_path = OUT / f"{_artifact_stem}_preprocess.csv"
dfp.to_csv(dfp_path, index=False)
print("Saved:", dfp_path)

# Both figures as PNG. Saving is best-effort: any failure is reported rather
# than raised, and the confirmations are printed only after both saves succeed.
fig1_path = OUT / f"{_artifact_stem}_mids.png"
fig2_path = OUT / f"{_artifact_stem}_pcp_residual.png"
try:
    for _figure, _target in ((fig1, fig1_path), (fig2, fig2_path)):
        _figure.savefig(_target, dpi=140, bbox_inches="tight")
    for _target in (fig1_path, fig2_path):
        print("Saved:", _target)
except Exception as e:
    print("Plot save failed:", e)
