In [1]:
"""
OMEGA Projection (for endogenous NAV) — “Institutional-grade” version

This is a refactor of your Untitled-1 NAV projection:
- Keeps calibration of omega on MSCI (distributed-lag), alpha/beta/sigma pooling, grade transitions, cap logic
- Computes NAV_start (initial condition) EXACTLY as before
- Instead of generating NAV_projected paths, it generates omega_t paths (valuation return shocks)
  which will be used by structural cashflows to update NAV endogenously:

    NAV_t = max((NAV_{t-1} + Draw_t - Rep_t) * (1 + omega_t), 0)

Outputs:
1) omega_projection_sota_{year}_{quarter}_{n_q}q.(csv|parquet)
   Columns: FundID, quarter_end, step_q, msci_ret_q, msci_ret_q_lag1, omega,
            Fund_Age_Quarters, Adj Strategy, Grade_prev, Grade, AgeBucket, cap_qe

2) nav_start_sota_{year}_{quarter}.(csv|parquet)
   One row per fund: FundID, Adj Strategy, Grade, Fund_Age_Quarters (all taken from the
   fund's last observation at or before the start quarter), NAV_start, NAV_start_source, cap_qe

"""

import os
import glob
import numpy as np
import pandas as pd

# -----------------------------
# Utilities
# -----------------------------
# Labels and bin edges used by make_age_bucket_q. Edges are fund ages in quarters and
# are passed to pd.cut with its default right-closed intervals, so e.g. (-1, 7] -> "0-2y"
# covers ages 0..7 quarters and anything above 79 quarters (up to 10_000) is "20y+".
AGE_LABELS = ["0-2y", "2-4y", "4-6y", "6-8y", "8-10y", "10-15y", "15-20y", "20y+"]
AGE_BINS_Q = [-1, 7, 15, 23, 31, 39, 59, 79, 10_000]  # quarters

def quarter_end_from_year_quarter(year: int, quarter: str) -> pd.Timestamp:
    """Convert a year and a quarter label into a quarterly timestamp.

    Args:
        year: Calendar year.
        quarter: One of "Q1".."Q4"; case and surrounding whitespace are ignored.

    Returns:
        The timestamp produced by converting the quarterly period with
        ``to_timestamp("Q")``.

    Raises:
        ValueError: If the label is not one of Q1, Q2, Q3, Q4.
    """
    label = quarter.upper().strip()
    if label not in ("Q1", "Q2", "Q3", "Q4"):
        raise ValueError("Quarter must be one of: Q1, Q2, Q3, Q4")
    quarter_number = int(label[1:])
    period = pd.Period(f"{year}Q{quarter_number}", freq="Q")
    return period.to_timestamp("Q")

def find_msci_projection_file(data_dir: str) -> str:
    """Return the most recently modified MSCI projection file in ``data_dir``.

    Both ``msci_projection_*.parquet`` and ``msci_projection_*.csv`` are considered.

    Raises:
        FileNotFoundError: If no file matches either pattern.
    """
    patterns = ("msci_projection_*.parquet", "msci_projection_*.csv")
    matches = [
        hit
        for pattern in patterns
        for hit in glob.glob(os.path.join(data_dir, pattern))
    ]
    if not matches:
        raise FileNotFoundError(
            f"No MSCI projection file found in {data_dir}. "
            "Expected msci_projection_*.parquet or msci_projection_*.csv"
        )
    # Newest modification time wins; on ties the first match in list order is kept.
    return max(matches, key=os.path.getmtime)

def load_msci_projection(path: str) -> pd.DataFrame:
    """Load an MSCI projection file and return its two core columns sorted by date.

    Files ending in ``.parquet`` (case-insensitive) are read as parquet; anything
    else is read as CSV.

    Returns:
        DataFrame with columns ``quarter_end`` (datetime) and ``msci_ret_q``,
        sorted by ``quarter_end`` with a fresh 0..n-1 index.

    Raises:
        ValueError: If either required column is missing.
    """
    is_parquet = path.lower().endswith(".parquet")
    raw = pd.read_parquet(path) if is_parquet else pd.read_csv(path)

    needed = {"quarter_end", "msci_ret_q"}
    if not needed.issubset(raw.columns):
        raise ValueError("MSCI projection file must contain columns: quarter_end, msci_ret_q")

    parsed = raw.assign(quarter_end=pd.to_datetime(raw["quarter_end"]))
    ordered = parsed.sort_values("quarter_end").reset_index(drop=True)
    return ordered[["quarter_end", "msci_ret_q"]]

def add_quarters(qe: pd.Timestamp, q: float) -> pd.Timestamp:
    """Shift a quarter timestamp by ``q`` quarters (rounded to the nearest integer).

    Returns ``pd.NaT`` when ``qe`` is missing. Rounding uses Python's built-in
    ``round``, so exact halves go to the nearest even integer.
    """
    if pd.isna(qe):
        return pd.NaT
    shift = int(round(q))
    shifted = pd.Period(qe, freq="Q") + shift
    return shifted.to_timestamp("Q")

def make_age_bucket_q(age_q: pd.Series) -> pd.Series:
    """Map fund ages in quarters to age-bucket labels.

    Args:
        age_q: Series of fund ages expressed in quarters.

    Returns:
        A categorical-dtype Series aligned with ``age_q`` (``pd.cut`` returns a
        Series when given one, which is what callers rely on via ``.iloc``).
        Ages outside the range covered by ``AGE_BINS_Q`` come back as NaN.
    """
    return pd.cut(age_q, bins=AGE_BINS_Q, labels=AGE_LABELS)

# -----------------------------
# Config
# -----------------------------
# Column names of the fund panel (data.parquet).
NAV_COL = "NAV Adjusted EUR"
DRAW_COL = "Adj Drawdown EUR"
REPAY_COL = "Adj Repayment EUR"
SIZE_COL = "Target Fund Size"

# |NAV| at or below NAV_EPS is treated as "no usable NAV": such rows get no omega
# during calibration and trigger NAV_start imputation.
NAV_EPS = 100.0
# Symmetric clip applied to omega, both in calibration and in projection.
OMEGA_CLIP = 0.8

# NOTE(review): the four MIN_* thresholds below are not referenced anywhere in the
# visible code; gating is done through the interactive `min_clusters_for_inference`
# and `alpha_level` inputs instead. Confirm whether they can be removed.
MIN_FUNDS_BETA = 10
MIN_OBS_BETA = 80

MIN_FUNDS_ALPHA_BUCKET = 6
MIN_OBS_ALPHA_BUCKET = 60

# Shrinkage strength for sigma: weight on the group estimate is n / (n + SIGMA_SHRINK_K).
SIGMA_SHRINK_K = 120.0

# Minimum cumulative drawdown / target size for the NAV ratio fits and imputation,
# and the minimum number of observations needed to use a bucket-level ratio fit.
DRAW_EPS = 1000.0
SIZE_EPS = 1e6
MIN_OBS_RATIO = 50

# -----------------------------
# Inputs
# -----------------------------
# Interactive run parameters.
year = int(input("Enter year (e.g. 2025): ").strip())
quarter = input("Enter quarter (Q1, Q2, Q3, Q4): ").strip().upper()

n_q = int(input("Enter number of quarters to project (0 => default 40): ").strip() or "0")
if n_q < 0:
    # A negative horizon would otherwise be passed to DataFrame.head(n_q), which
    # silently drops the LAST |n_q| rows instead of failing.
    raise ValueError("Number of quarters to project must be >= 0")
if n_q == 0:
    n_q = 40

# Validate the quarter label before it is used to build any directory name, so a typo
# is reported as a bad quarter rather than as a missing file.
start_qe = quarter_end_from_year_quarter(year, quarter)

# USERNAME is normally set on Windows only. When it is absent, fall back to the
# current user's home directory instead of passing None to os.path.join (TypeError).
_username = os.environ.get("USERNAME")
if _username:
    BASE_DIR = os.path.join("C:\\Users", _username, "Documents", "Equity")
else:
    BASE_DIR = os.path.join(os.path.expanduser("~"), "Documents", "Equity")
HOME = os.path.join(BASE_DIR, f"{year}_{quarter}")
DATA_DIR = os.path.join(HOME, "data")

data_path = os.path.join(DATA_DIR, "data.parquet")
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Missing data.parquet at: {data_path}")

msci_xlsx = os.path.join(DATA_DIR, "MSCI.xlsx")
if not os.path.exists(msci_xlsx):
    raise FileNotFoundError(f"Missing MSCI.xlsx at: {msci_xlsx}")

msci_proj_path = find_msci_projection_file(DATA_DIR)
print("Using MSCI projection file:", msci_proj_path)

print("Projection start quarter-end:", start_qe.date())

# -----------------------------
# Load fund panel
# -----------------------------
data = pd.read_parquet(data_path).copy()

# The year/quarter columns are needed to derive quarter_end, so they must be checked
# BEFORE they are used; otherwise a missing column surfaces as a bare KeyError.
period_cols = ["Year of Transaction Date", "Quarter of Transaction Date"]
missing_period = [c for c in period_cols if c not in data.columns]
if missing_period:
    raise ValueError(f"Missing required columns in data.parquet: {missing_period}")
if data[period_cols].isna().any().any():
    raise ValueError(f"Columns {period_cols} in data.parquet must not contain missing values")

# Build the quarter timestamp from "<year>Q<quarter>" period labels.
data["quarter_end"] = pd.PeriodIndex(
    data["Year of Transaction Date"].astype(int).astype(str) + "Q" +
    data["Quarter of Transaction Date"].astype(int).astype(str),
    freq="Q"
).to_timestamp("Q")
data["quarter_end"] = pd.to_datetime(data["quarter_end"])

required_cols = [
    "FundID", "quarter_end",
    NAV_COL, DRAW_COL, REPAY_COL,
    "Adj Strategy", "Grade",
    SIZE_COL, "Fund_Age_Quarters",
    "Planned end date with add. years as per legal doc",
]
missing = [c for c in required_cols if c not in data.columns]
if missing:
    raise ValueError(f"Missing required columns in data.parquet: {missing}")

# Planned end date snapped to its quarter; unparseable dates become NaT.
data["planned_end_qe"] = pd.to_datetime(
    data["Planned end date with add. years as per legal doc"],
    errors="coerce"
).dt.to_period("Q").dt.to_timestamp("Q")

# NAV keeps NaN for missing values (missing NAV is handled explicitly later);
# flows, size and age are zero-filled.
data[NAV_COL] = pd.to_numeric(data[NAV_COL], errors="coerce")
for c in [DRAW_COL, REPAY_COL, SIZE_COL, "Fund_Age_Quarters"]:
    data[c] = pd.to_numeric(data[c], errors="coerce").fillna(0.0)

data = data.sort_values(["FundID", "quarter_end"]).reset_index(drop=True)

print("Loaded data rows:", len(data))
print("Funds:", data["FundID"].nunique())

# -----------------------------
# MSCI history (quarterly) + lag
# -----------------------------
# Read the index history; only the "Date" and "SCXP Index" columns are used.
msci_raw = pd.read_excel(msci_xlsx)[["Date", "SCXP Index"]].copy()
msci_raw["Date"] = pd.to_datetime(msci_raw["Date"], errors="coerce")
msci_raw["SCXP Index"] = pd.to_numeric(msci_raw["SCXP Index"], errors="coerce")
# Rows whose date or level failed to parse are dropped.
msci_raw = msci_raw.dropna().sort_values("Date")

# Collapse to one observation per quarter: the last available level in each quarter.
msci_raw["quarter_end"] = msci_raw["Date"].dt.to_period("Q").dt.to_timestamp("Q")
msci_hist = (
    msci_raw.groupby("quarter_end", as_index=False)["SCXP Index"]
    .last()
    .rename(columns={"SCXP Index": "index_level"})
)
# Quarter-over-quarter simple return; the first quarter has no return and is dropped.
# NOTE(review): pct_change compares consecutive ROWS, so a quarter missing from the
# file would yield a multi-quarter return — confirm the history has no gaps.
msci_hist["msci_ret_q"] = msci_hist["index_level"].pct_change()
msci_hist = msci_hist.dropna(subset=["msci_ret_q"])[["quarter_end", "msci_ret_q"]]
msci_hist["quarter_end"] = pd.to_datetime(msci_hist["quarter_end"])
msci_hist = msci_hist.sort_values("quarter_end").reset_index(drop=True)
# One-quarter lag of the return (NaN for the first remaining quarter).
msci_hist["msci_ret_q_lag1"] = msci_hist["msci_ret_q"].shift(1)

# -----------------------------
# MSCI future projection (+ lag) with first-lag fix
# -----------------------------
msci_proj = load_msci_projection(msci_proj_path)
msci_proj = msci_proj.sort_values("quarter_end").reset_index(drop=True)
msci_proj["msci_ret_q_lag1"] = msci_proj["msci_ret_q"].shift(1)

# Keep at most n_q projected quarters strictly after the start quarter. Fewer rows
# than n_q are accepted silently; the count is printed below.
msci_future = msci_proj.loc[msci_proj["quarter_end"] > start_qe].head(n_q).copy()
if msci_future.empty:
    raise ValueError("MSCI projection contains no quarters after the selected start date.")

# "First-lag fix": the lag of the first projected quarter is always overwritten with
# the last HISTORICAL return at or before the start quarter (0.0 if there is none),
# even when the projection file itself contains an earlier row.
last_hist_ret = msci_hist.loc[msci_hist["quarter_end"] <= start_qe, "msci_ret_q"].tail(1)
msci_future.loc[msci_future.index[0], "msci_ret_q_lag1"] = float(last_hist_ret.iloc[0]) if len(last_hist_ret) else 0.0
msci_future["msci_ret_q_lag1"] = msci_future["msci_ret_q_lag1"].fillna(0.0)

print("Future MSCI quarters available:", len(msci_future))

# -----------------------------
# Planned end overrun by strategy (history-based)
# -----------------------------
# One row per fund: its most recent record in the panel, plus the quarter of that
# last observation as `last_qe`.
# NOTE(review): this uses the full panel, not only rows up to start_qe — confirm
# that is intended for a projection started at start_qe.
last_obs = data.groupby("FundID")["quarter_end"].max().rename("last_qe")
fund_static = data.sort_values(["FundID", "quarter_end"]).groupby("FundID").tail(1).copy()
fund_static = fund_static.merge(last_obs, on="FundID", how="left")

def quarters_diff(a: pd.Timestamp, b: pd.Timestamp) -> float:
    """Return the signed number of quarters from ``b`` to ``a``.

    The result is the difference of the quarterly period ordinals (``a`` minus
    ``b``) as a float, or NaN when either timestamp is missing.
    """
    if any(pd.isna(ts) for ts in (a, b)):
        return np.nan
    ordinal_a, ordinal_b = (pd.Period(ts, freq="Q").ordinal for ts in (a, b))
    return float(ordinal_a - ordinal_b)

# Quarters by which the last observation lies beyond the planned end (floored at 0);
# NaN when the fund has no planned end date.
fund_static["overrun_q"] = fund_static.apply(
    lambda r: max(quarters_diff(r["last_qe"], r["planned_end_qe"]), 0.0)
    if pd.notna(r["planned_end_qe"]) else np.nan,
    axis=1
)

# Per-fund flag: True when the fund is still observed after its planned end.
fund_static["ever_overran"] = fund_static["overrun_q"].fillna(0) > 0
ever_overran_map = fund_static.set_index("FundID")["ever_overran"]

# Average overrun per strategy, computed only over funds that actually overran.
overran_only = fund_static.loc[fund_static["overrun_q"].notna() & (fund_static["overrun_q"] > 0)].copy()
avg_overrun_by_strategy = overran_only.groupby("Adj Strategy")["overrun_q"].mean().clip(lower=0.0)

# -----------------------------
# Build omega from history for calibration
# -----------------------------
df = data.copy().sort_values(["FundID", "quarter_end"]).reset_index(drop=True)

# Previous NAV is the previous ROW of the same fund.
# NOTE(review): if a fund has missing quarters, nav_prev spans more than one
# quarter — confirm the panel is gap-free per fund.
df["nav_prev"] = df.groupby("FundID")[NAV_COL].shift(1)
df["flow_net"] = df[DRAW_COL] - df[REPAY_COL]

# omega = (NAV_t - NAV_{t-1} - net_flow_t) / NAV_{t-1}, only where |NAV_{t-1}| > NAV_EPS.
# NOTE(review): this corresponds to NAV_t = NAV_{t-1} * (1 + omega) + flow, whereas the
# module docstring applies omega as NAV_t = (NAV_{t-1} + flow) * (1 + omega). The two
# definitions differ whenever flow != 0 — confirm which one the downstream model uses.
m = df["nav_prev"].abs() > NAV_EPS
df["omega"] = np.nan
df.loc[m, "omega"] = ((df.loc[m, NAV_COL] - df.loc[m, "nav_prev"]) - df.loc[m, "flow_net"]) / df.loc[m, "nav_prev"]
df["omega"] = df["omega"].clip(lower=-OMEGA_CLIP, upper=OMEGA_CLIP)

# Calibration dataset: omega + MSCI + lag
# Rows without omega or without either MSCI return are dropped; the panel is not
# restricted to quarters at or before start_qe here.
cal = df.merge(msci_hist, on="quarter_end", how="left")
cal = cal.dropna(subset=["omega", "msci_ret_q", "msci_ret_q_lag1"]).copy()
cal["AgeBucket"] = make_age_bucket_q(cal["Fund_Age_Quarters"])
cal = cal[["FundID", "Adj Strategy", "Grade", "AgeBucket", "omega", "msci_ret_q", "msci_ret_q_lag1"]].copy()

print("Calibration rows:", len(cal))

# ============================================================
# Cluster-robust OLS helpers (unchanged)
# ============================================================
from scipy import stats

def ols_cluster_robust(df, y_col, x_cols, cluster_col):
    """OLS with an intercept and cluster-robust (sandwich) standard errors.

    Args:
        df: Input frame; rows with a missing value in ``y_col``, any of ``x_cols``
            or ``cluster_col`` are dropped.
        y_col: Name of the dependent variable.
        x_cols: List of regressor names (may be empty for an intercept-only fit).
        cluster_col: Name of the column identifying clusters.

    Returns:
        ``None`` when no rows remain or there is at most one cluster. Otherwise a
        dict with Series ``coef``, ``se``, ``t``, ``p`` (indexed by ``"alpha"``
        followed by ``x_cols``) and ints ``n_obs``, ``n_clusters``, ``df_dof``
        (number of clusters minus one, used for the t reference distribution).
    """
    d = df.dropna(subset=[y_col] + x_cols + [cluster_col]).copy()
    n = len(d)
    if n == 0:
        return None

    clusters = d[cluster_col].to_numpy()
    uniq = pd.unique(clusters)
    G = len(uniq)
    if G <= 1:
        # Cluster-robust inference is undefined with a single cluster.
        return None
    df_dof = G - 1

    y = d[y_col].to_numpy(float)
    X = np.column_stack([np.ones(n)] + [d[c].to_numpy(float) for c in x_cols])
    k = X.shape[1]

    # pinv keeps the fit defined even when X'X is singular (e.g. a constant regressor).
    XtX_inv = np.linalg.pinv(X.T @ X)
    beta = XtX_inv @ (X.T @ y)
    u = y - X @ beta

    # "Meat" of the sandwich: sum over clusters of the outer product of X_g' u_g.
    meat = np.zeros((k, k), dtype=float)
    for g in uniq:
        mask = (clusters == g)
        Xgu = X[mask, :].T @ u[mask]
        meat += np.outer(Xgu, Xgu)

    V = XtX_inv @ meat @ XtX_inv
    # Small-sample correction: G/(G-1) * (n-1)/(n-k).
    V *= (G / (G - 1)) * ((n - 1) / max(n - k, 1))

    se = np.sqrt(np.diag(V))
    # A zero standard error yields inf/nan t-stats exactly as before; only the
    # accompanying RuntimeWarning is suppressed.
    with np.errstate(divide="ignore", invalid="ignore"):
        tstats = beta / se
    # Use the survival function: `1 - cdf` rounds to exactly 0.0 for large |t|.
    pvals = 2.0 * stats.t.sf(np.abs(tstats), df=df_dof)

    names = ["alpha"] + x_cols
    return {
        "coef": pd.Series(beta, index=names),
        "se": pd.Series(se, index=names),
        "t": pd.Series(tstats, index=names),
        "p": pd.Series(pvals, index=names),
        "n_obs": int(n),
        "n_clusters": int(G),
        "df_dof": int(df_dof),
    }

def cluster_mean_stats(df, y_col, cluster_col):
    """Estimate the mean of ``y_col`` with cluster-robust inference.

    Runs an intercept-only ``ols_cluster_robust`` fit and flattens the intercept
    results into a plain dict.

    Returns:
        ``None`` when the underlying fit returns ``None``; otherwise a dict with keys
        ``alpha``, ``se_alpha``, ``t_alpha``, ``p_alpha``, ``n_obs``, ``n_funds``
        (number of clusters) and ``df`` (degrees of freedom used for the t-test).
    """
    fit = ols_cluster_robust(df, y_col=y_col, x_cols=[], cluster_col=cluster_col)
    if fit is None:
        return None

    summary = {"alpha": float(fit["coef"]["alpha"])}
    for field in ("se", "t", "p"):
        summary[f"{field}_alpha"] = float(fit[field]["alpha"])
    summary.update(n_obs=fit["n_obs"], n_funds=fit["n_clusters"], df=fit["df_dof"])
    return summary

# ============================================================
# Statistical decision settings
# ============================================================
alpha_level = float(input("Significance level for using group params? [0.10]: ").strip() or "0.10")
min_clusters_for_inference = int(input("Minimum #fund clusters for inference [8]: ").strip() or "8")

# ============================================================
# 1) Betas (cluster robust) with significance gating
# ============================================================
BETA_X_COLS = ["msci_ret_q", "msci_ret_q_lag1"]

def _fit_group_betas(frame, key_cols):
    """Fit the distributed-lag beta regression separately for each group.

    One row per group for which the regression could be estimated, with the group
    keys, both betas, their p-values, sample sizes and the ``use_beta`` flag
    (enough fund clusters AND at least one beta significant at ``alpha_level``).
    The returned frame always has the full set of columns, even when empty.
    """
    # A single key must be passed to groupby as a scalar: with a one-element list,
    # pandas >= 2.0 yields 1-tuples such as ("Private Equity",) as group keys, which
    # would never match the plain-string lookups performed by get_betas.
    by = key_cols[0] if len(key_cols) == 1 else key_cols
    rows = []
    for key, grp in frame.groupby(by, dropna=False):
        key_vals = key if isinstance(key, tuple) else (key,)
        res = ols_cluster_robust(grp, "omega", BETA_X_COLS, "FundID")
        if res is None:
            continue
        n_funds = res["n_clusters"]
        usable = (n_funds >= min_clusters_for_inference)

        b0 = float(res["coef"]["msci_ret_q"])
        b1 = float(res["coef"]["msci_ret_q_lag1"])
        p0 = float(res["p"]["msci_ret_q"])
        p1 = float(res["p"]["msci_ret_q_lag1"])
        use_beta = bool(usable and ((p0 < alpha_level) or (p1 < alpha_level)))

        rows.append({
            **dict(zip(key_cols, key_vals)),
            "b0": b0, "b1": b1,
            "p_b0": p0, "p_b1": p1,
            "n_obs": res["n_obs"], "n_funds": n_funds,
            "use_beta": use_beta
        })
    cols = list(key_cols) + ["b0", "b1", "p_b0", "p_b1", "n_obs", "n_funds", "use_beta"]
    return pd.DataFrame(rows, columns=cols)

# Strategy x Grade level, then Strategy level.
beta_sg = _fit_group_betas(cal, ["Adj Strategy", "Grade"])
beta_s = _fit_group_betas(cal, ["Adj Strategy"])

# Global regression: the last-resort fallback, so it must exist.
res_g = ols_cluster_robust(cal, "omega", ["msci_ret_q", "msci_ret_q_lag1"], "FundID")
if res_g is None:
    raise ValueError("Global beta regression failed.")
b0_g = float(res_g["coef"]["msci_ret_q"])
b1_g = float(res_g["coef"]["msci_ret_q_lag1"])

# Lookup tables restricted to groups that passed the significance gate.
# astype(bool) keeps the mask valid when a table is empty (object dtype column).
beta_sg_use = beta_sg.loc[beta_sg["use_beta"].astype(bool)].set_index(["Adj Strategy", "Grade"])[["b0", "b1"]].to_dict("index")
beta_s_use = beta_s.loc[beta_s["use_beta"].astype(bool)].set_index(["Adj Strategy"])[["b0", "b1"]].to_dict("index")

def get_betas(strategy, grade):
    """Return ``(b0, b1, source)`` for a fund's strategy and grade.

    Lookup order: significant Strategy x Grade estimate (``"sg_sig"``), then
    significant Strategy estimate (``"s_sig"``), then the global regression
    (``"global"``).
    """
    levels = (
        (beta_sg_use, (strategy, grade), "sg_sig"),
        (beta_s_use, strategy, "s_sig"),
    )
    for table, key, label in levels:
        if key in table:
            hit = table[key]
            return float(hit["b0"]), float(hit["b1"]), label
    return float(b0_g), float(b1_g), "global"

# ============================================================
# 2) Alpha: omega_adj mean tests (unchanged)
# ============================================================
cal2 = cal.copy()

# Remove the market component from omega using the betas each row would actually get.
b0_used = []
b1_used = []
for _strategy, _grade in zip(cal2["Adj Strategy"], cal2["Grade"]):
    b0, b1, _ = get_betas(_strategy, _grade)
    b0_used.append(b0); b1_used.append(b1)
cal2["b0_used"] = b0_used
cal2["b1_used"] = b1_used
cal2["omega_adj"] = cal2["omega"] - cal2["b0_used"]*cal2["msci_ret_q"] - cal2["b1_used"]*cal2["msci_ret_q_lag1"]

def _alpha_table(frame, key_cols):
    """Cluster-robust mean of ``omega_adj`` for each group defined by ``key_cols``.

    One row per group for which the mean test could be computed, with the group
    keys, the statistics from ``cluster_mean_stats`` and the ``use_alpha`` flag
    (enough fund clusters AND mean significant at ``alpha_level``). The returned
    frame always has the full set of columns, even when empty.
    """
    # A single key must be passed to groupby as a scalar: with a one-element list,
    # pandas >= 2.0 yields 1-tuples as group keys, which would never match the
    # plain-string strategy lookup in get_alpha.
    by = key_cols[0] if len(key_cols) == 1 else key_cols
    rows = []
    # observed=False spells out the behaviour the categorical AgeBucket key had
    # under the implicit default; empty groups are skipped below because
    # cluster_mean_stats returns None for them.
    for key, grp in frame.groupby(by, dropna=False, observed=False):
        key_vals = key if isinstance(key, tuple) else (key,)
        st = cluster_mean_stats(grp, "omega_adj", "FundID")
        if st is None:
            continue
        use_alpha = bool((st["n_funds"] >= min_clusters_for_inference) and (st["p_alpha"] < alpha_level))
        rows.append({**dict(zip(key_cols, key_vals)), **st, "use_alpha": use_alpha})
    cols = list(key_cols) + [
        "alpha", "se_alpha", "t_alpha", "p_alpha", "n_obs", "n_funds", "df", "use_alpha"
    ]
    return pd.DataFrame(rows, columns=cols)

alpha_sga = _alpha_table(cal2, ["Adj Strategy","Grade","AgeBucket"])
alpha_sg = _alpha_table(cal2, ["Adj Strategy","Grade"])
alpha_s = _alpha_table(cal2, ["Adj Strategy"])

# Global mean is the last-resort fallback (0.0 if it cannot be estimated).
st_g = cluster_mean_stats(cal2, "omega_adj", "FundID")
alpha_global = float(st_g["alpha"]) if st_g else 0.0

# Lookup tables restricted to groups that passed the significance gate.
# astype(bool) keeps the mask valid when a table is empty (object dtype column).
alpha_sga_use = alpha_sga.loc[alpha_sga["use_alpha"].astype(bool)].set_index(["Adj Strategy","Grade","AgeBucket"])["alpha"].to_dict()
alpha_sg_use  = alpha_sg.loc[alpha_sg["use_alpha"].astype(bool)].set_index(["Adj Strategy","Grade"])["alpha"].to_dict()
alpha_s_use   = alpha_s.loc[alpha_s["use_alpha"].astype(bool)].set_index(["Adj Strategy"])["alpha"].to_dict()

def get_alpha(strategy, grade, age_bucket):
    """Return ``(alpha, source)`` for a fund's strategy, grade and age bucket.

    Lookup order: significant Strategy x Grade x AgeBucket mean (``"sga_sig"``),
    then Strategy x Grade (``"sg_sig"``), then Strategy (``"s_sig"``), and finally
    the global mean (``"global"``).
    """
    levels = (
        (alpha_sga_use, (strategy, grade, age_bucket), "sga_sig"),
        (alpha_sg_use, (strategy, grade), "sg_sig"),
        (alpha_s_use, strategy, "s_sig"),
    )
    for table, key, label in levels:
        if key in table:
            return float(table[key]), label
    return float(alpha_global), "global"

# ============================================================
# 3) Sigma: pooled SG + shrink to global (unchanged)
# ============================================================
# Residual of each calibration row against the parameters it would receive in
# projection (alpha and betas after the significance-gated fallbacks).
resid = []
for _, r in cal.iterrows():
    b0, b1, _ = get_betas(r["Adj Strategy"], r["Grade"])
    a, _ = get_alpha(r["Adj Strategy"], r["Grade"], r["AgeBucket"])
    pred = a + b0*r["msci_ret_q"] + b1*r["msci_ret_q_lag1"]
    resid.append(float(r["omega"] - pred))

cal_res = cal.copy()
cal_res["resid"] = resid

# Sample standard deviation of residuals per Strategy x Grade; groups with fewer
# than three observations get a fixed 0.10.
sigma_sg = (
    cal_res.groupby(["Adj Strategy","Grade"], dropna=False)
           .agg(n_obs=("resid","size"),
                sigma=("resid", lambda x: float(np.std(x, ddof=1)) if len(x) > 2 else 0.10))
           .reset_index()
)
# Global residual volatility, floored at 0.02.
# NOTE(review): with fewer than two calibration rows np.std is NaN, and
# max(nan, 0.02) returns nan — the floor does not protect against that case.
sigma_global = float(np.std(cal_res["resid"], ddof=1))
sigma_global = max(sigma_global, 0.02)

sigma_sg_map = sigma_sg.set_index(["Adj Strategy","Grade"])[["sigma","n_obs"]].to_dict("index")

def get_sigma(strategy, grade):
    """Return ``(sigma, source)`` for a fund's strategy and grade.

    When a Strategy x Grade estimate exists it is shrunk towards the global sigma
    with weight ``n / (n + SIGMA_SHRINK_K)`` on the group estimate
    (``"sg_shrunk"``); otherwise the global sigma is returned (``"global"``).
    """
    entry = sigma_sg_map.get((strategy, grade))
    if entry is None:
        return float(sigma_global), "global"

    group_sigma = float(entry["sigma"])
    n_obs = float(entry["n_obs"])
    weight = n_obs / (n_obs + SIGMA_SHRINK_K)
    blended = weight*group_sigma + (1.0-weight)*sigma_global
    return float(blended), "sg_shrunk"

# -----------------------------
# NAV_start imputation (unchanged)
# -----------------------------
# History up to and including the start quarter; the last row per fund is its
# starting state.
hist_upto = data.loc[data["quarter_end"] <= start_qe].copy()
if hist_upto.empty:
    raise ValueError("No data at or before chosen start quarter.")

hist_upto = hist_upto.sort_values(["FundID","quarter_end"])
base_rows = hist_upto.groupby("FundID").tail(1).copy()

base_rows["ever_overran"] = base_rows["FundID"].map(ever_overran_map).fillna(False)

# Projection cap per fund:
#   - no planned end date   -> last projected MSCI quarter
#   - fund already overran  -> planned end + average overrun of its strategy
#   - otherwise             -> planned end
caps = []
for _, r in base_rows.iterrows():
    planned = r["planned_end_qe"]
    if pd.isna(planned):
        caps.append(msci_future["quarter_end"].iloc[-1])
        continue
    if bool(r["ever_overran"]):
        avg_over = float(avg_overrun_by_strategy.get(r["Adj Strategy"], 0.0))
        caps.append(add_quarters(planned, avg_over))
    else:
        caps.append(planned)
base_rows["cap_qe"] = caps
base_rows["AgeBucket"] = make_age_bucket_q(base_rows["Fund_Age_Quarters"])

# Cumulative drawdowns per fund. base_rows was taken before this column existed,
# so the guard below is always true here and the merge attaches each fund's final
# cumulative drawdown (the merge also resets base_rows' index).
hist_upto["draw_cum"] = hist_upto.groupby("FundID")[DRAW_COL].cumsum()
if "draw_cum" not in base_rows.columns:
    base_rows = base_rows.merge(
        hist_upto.groupby("FundID")["draw_cum"].last().reset_index(),
        on="FundID",
        how="left"
    )

tmp = hist_upto.copy()
tmp["AgeBucket"] = make_age_bucket_q(tmp["Fund_Age_Quarters"])

# NAV / cumulative drawdown and NAV / target size, only where both sides are material.
tmp["ratio_nav_draw"] = np.where(
    (tmp[NAV_COL].notna()) & (tmp[NAV_COL].abs() > NAV_EPS) & (tmp["draw_cum"] > DRAW_EPS),
    tmp[NAV_COL] / tmp["draw_cum"],
    np.nan
)
tmp["ratio_nav_size"] = np.where(
    (tmp[NAV_COL].notna()) & (tmp[NAV_COL].abs() > NAV_EPS) & (tmp[SIZE_COL] > SIZE_EPS),
    tmp[NAV_COL] / tmp[SIZE_COL],
    np.nan
)

# Log-ratios; non-finite results (e.g. from negative NAV, which passes the abs()
# filter above) are set to NaN so they are ignored by the fits.
tmp["log_ratio_nav_draw"] = np.log(tmp["ratio_nav_draw"])
tmp["log_ratio_nav_size"] = np.log(tmp["ratio_nav_size"])
tmp.loc[~np.isfinite(tmp["log_ratio_nav_draw"]), "log_ratio_nav_draw"] = np.nan
tmp.loc[~np.isfinite(tmp["log_ratio_nav_size"]), "log_ratio_nav_size"] = np.nan

ratio_key = ["Adj Strategy","Grade","AgeBucket"]

def fit_lognorm(df: pd.DataFrame, col: str) -> pd.Series:
    """Summarise the non-missing values of ``df[col]`` (a log-ratio column).

    Returns:
        Series with ``mu`` (mean), ``sig`` (sample standard deviation) and ``n``
        (number of non-missing values). ``mu`` and ``sig`` are NaN when fewer
        than ``MIN_OBS_RATIO`` values are available.
    """
    values = df[col].dropna()
    count = len(values)
    enough = count >= MIN_OBS_RATIO
    mu = float(values.mean()) if enough else np.nan
    sig = float(values.std(ddof=1)) if enough else np.nan
    return pd.Series({"mu": mu, "sig": sig, "n": count})

# Fit the log-ratio summary per Strategy x Grade x AgeBucket.
# - The value column is selected explicitly, so apply() no longer operates on the
#   grouping columns (deprecated in recent pandas; the notebook output shows warnings
#   pointing at these lines). fit_lognorm only reads the selected column.
# - observed=False spells out the behaviour the categorical AgeBucket key had under
#   the implicit default; unobserved combinations yield NaN fits with n == 0, which
#   are removed below and rejected by lookup_ratio.
ratio_draw = (
    tmp.groupby(ratio_key, dropna=False, observed=False)[["log_ratio_nav_draw"]]
       .apply(lambda g: fit_lognorm(g, "log_ratio_nav_draw"))
       .reset_index()
)
ratio_size = (
    tmp.groupby(ratio_key, dropna=False, observed=False)[["log_ratio_nav_size"]]
       .apply(lambda g: fit_lognorm(g, "log_ratio_nav_size"))
       .reset_index()
)

# Global fallbacks: median of the bucket-level parameters, or fixed defaults when no
# bucket could be fitted at all.
gdraw = ratio_draw.dropna(subset=["mu","sig"])
gsize = ratio_size.dropna(subset=["mu","sig"])
fallback_draw = {"mu": float(gdraw["mu"].median()) if len(gdraw) else 0.0,
                 "sig": float(gdraw["sig"].median()) if len(gdraw) else 0.75}
fallback_size = {"mu": float(gsize["mu"].median()) if len(gsize) else -2.0,
                 "sig": float(gsize["sig"].median()) if len(gsize) else 0.75}

ratio_draw_map = ratio_draw.set_index(ratio_key)[["mu","sig","n"]].to_dict("index")
ratio_size_map = ratio_size.set_index(ratio_key)[["mu","sig","n"]].to_dict("index")

def lookup_ratio(map_, strategy, grade, age_bucket, fallback):
    """Return ``(mu, sig, source)`` of the log-ratio fit for one bucket.

    The bucket entry in ``map_`` is used (``"bucket"``) only when it exists, both
    parameters are present and it is based on at least ``MIN_OBS_RATIO``
    observations; otherwise ``fallback`` is returned (``"global"``).
    """
    cell = map_.get((strategy, grade, age_bucket))
    usable = (
        cell is not None
        and pd.notna(cell["mu"])
        and pd.notna(cell["sig"])
        and cell["n"] >= MIN_OBS_RATIO
    )
    chosen, label = (cell, "bucket") if usable else (fallback, "global")
    return float(chosen["mu"]), float(chosen["sig"]), label

# Dedicated, fixed-seed generator so imputed starting NAVs are reproducible.
rng_init = np.random.default_rng(2025)
base_rows["NAV_start"] = base_rows[NAV_COL]
base_rows["NAV_start_source"] = "observed"

# Impute NAV_start only for funds whose observed NAV is missing or immaterial.
for idx, r in base_rows.iterrows():
    nav_obs = r["NAV_start"]
    if pd.notna(nav_obs) and abs(nav_obs) > NAV_EPS:
        continue

    draw_cum = r.get("draw_cum", 0.0)
    size = r.get(SIZE_COL, 0.0)
    draw_cum = 0.0 if pd.isna(draw_cum) else float(draw_cum)
    size = 0.0 if pd.isna(size) else float(size)

    strategy = r["Adj Strategy"]
    # "AssignedGrade" is used when such a column exists and is filled; no visible
    # code creates it, so this normally falls back to "Grade".
    grade = r.get("AssignedGrade", r["Grade"])
    if pd.isna(grade):
        grade = r["Grade"]
    age_bucket = r["AgeBucket"]

    # Preference order: ratio to cumulative drawdowns, then ratio to target size,
    # then zero. Ratios are drawn from the fitted log-normal and clipped.
    if draw_cum > DRAW_EPS:
        mu, sig, src = lookup_ratio(ratio_draw_map, strategy, grade, age_bucket, fallback_draw)
        ratio = float(np.exp(mu + sig * rng_init.standard_normal()))
        ratio = float(np.clip(ratio, 0.05, 5.0))
        base_rows.at[idx, "NAV_start"] = ratio * draw_cum
        base_rows.at[idx, "NAV_start_source"] = f"imputed_draw_{src}"
    elif size > SIZE_EPS:
        mu, sig, src = lookup_ratio(ratio_size_map, strategy, grade, age_bucket, fallback_size)
        ratio = float(np.exp(mu + sig * rng_init.standard_normal()))
        ratio = float(np.clip(ratio, 0.01, 2.0))
        base_rows.at[idx, "NAV_start"] = ratio * size
        base_rows.at[idx, "NAV_start_source"] = f"imputed_size_{src}"
    else:
        base_rows.at[idx, "NAV_start"] = 0.0
        base_rows.at[idx, "NAV_start_source"] = "imputed_zero_noinfo"

base_rows["NAV_start"] = pd.to_numeric(base_rows["NAV_start"], errors="coerce").fillna(0.0)

# -----------------------------
# Grade transitions (UNCHANGED, uses your saved 1Y matrices if present)
# -----------------------------
GRADE_STATES = ["A","B","C","D"]
# NOTE(review): MIN_TRANSITIONS is not referenced anywhere in the visible code.
MIN_TRANSITIONS = 30
# Probability that a grade transition is attempted at each yearly step.
DEFAULT_YEARLY_CHANGE = 0.25

# Load preferred 1Y matrices derived from 10Y root if present
# Each matrix is optional; a missing file leaves the corresponding name as None.
p1_all_path = os.path.join(DATA_DIR, "grade_transition_1y_all.csv")
p1_pe_path  = os.path.join(DATA_DIR, "grade_transition_1y_pe.csv")
p1_vc_path  = os.path.join(DATA_DIR, "grade_transition_1y_vc.csv")
P1_ALL = pd.read_csv(p1_all_path, index_col=0) if os.path.exists(p1_all_path) else None
P1_PE  = pd.read_csv(p1_pe_path, index_col=0) if os.path.exists(p1_pe_path) else None
P1_VC  = pd.read_csv(p1_vc_path, index_col=0) if os.path.exists(p1_vc_path) else None

def _row_norm_df(P):
    """Coerce a transition matrix onto ``GRADE_STATES`` and normalise its rows.

    Rows and columns are reindexed to ``GRADE_STATES``; missing entries become 0
    and negative entries are clipped to 0. Each row is then divided by its sum;
    a row that sums to 0 is divided by 1 instead and therefore stays all-zero.
    """
    aligned = P.reindex(index=GRADE_STATES, columns=GRADE_STATES)
    cleaned = aligned.fillna(0.0).clip(lower=0.0)
    row_totals = cleaned.sum(axis=1)
    safe_totals = row_totals.replace(0.0, 1.0)
    return cleaned.div(safe_totals, axis=0)

# Normalise whichever matrices were found on disk.
if P1_ALL is not None: P1_ALL = _row_norm_df(P1_ALL)
if P1_PE  is not None: P1_PE  = _row_norm_df(P1_PE)
if P1_VC  is not None: P1_VC  = _row_norm_df(P1_VC)

def get_transition_matrix(strategy):
    """Return ``(matrix, source_label)`` for the given strategy.

    "Private Equity" and "Venture Capital" use their own matrix when loaded
    (``"PE_1Y"`` / ``"VC_1Y"``); otherwise the all-strategy matrix is used
    (``"ALL_1Y"``), and if none is loaded an identity matrix over
    ``GRADE_STATES`` is returned (``"IDENTITY"``), i.e. grades never change.
    """
    by_strategy = {
        "Private Equity": (P1_PE, "PE_1Y"),
        "Venture Capital": (P1_VC, "VC_1Y"),
    }
    matrix, label = by_strategy.get(strategy, (None, None))
    if matrix is not None:
        return matrix, label
    if P1_ALL is not None:
        return P1_ALL, "ALL_1Y"
    # fallback identity
    identity = pd.DataFrame(np.eye(4), index=GRADE_STATES, columns=GRADE_STATES)
    return identity, "IDENTITY"

def get_yearly_change_prob(strategy):
    """Return the yearly probability that a grade transition is attempted.

    ``strategy`` is currently ignored: the constant ``DEFAULT_YEARLY_CHANGE`` is
    returned for every strategy.
    """
    # keep your fallback constant for now (you had richer estimation earlier; keep minimal here)
    return DEFAULT_YEARLY_CHANGE

def sample_next_grade(curr_grade, P_df, rng):
    """Draw the next grade from the row of ``P_df`` belonging to ``curr_grade``.

    Args:
        curr_grade: Current grade; a grade that is not a row of ``P_df`` is
            treated as "D".
        P_df: Transition matrix (rows = current grade, columns = next grade).
        rng: ``numpy.random.Generator`` used for the draw.

    Returns:
        The sampled grade as ``str``. When the row carries no usable probability
        mass (all zeros, negative or non-finite entries) the grade is left
        unchanged instead of letting ``rng.choice`` raise; no random number is
        consumed in that case.
    """
    if curr_grade not in P_df.index:
        curr_grade = "D"
    probs = P_df.loc[curr_grade].to_numpy(dtype=float)
    total = probs.sum()
    if not np.isfinite(total) or total <= 0.0 or (probs < 0.0).any():
        # E.g. a grade missing from the source CSV becomes an all-zero row after
        # normalisation: treat it as "no transition information".
        return str(curr_grade)
    if abs(total - 1.0) > 1e-8:
        # Only rescale when needed, so valid rows are passed through untouched and
        # existing seeded draws are unchanged.
        probs = probs / total
    # Labels come from the matrix itself so probabilities and outcomes stay aligned.
    return str(rng.choice(list(P_df.columns), p=probs))

# -----------------------------
# PROJECTION LOOP: simulate omega only (NO NAV update)
# -----------------------------
# Fixed seed: the grade transitions and omega shocks below are reproducible as long
# as the fund order and the number of draws per fund stay the same.
rng = np.random.default_rng(1234)

omega_rows = []
for _, r in base_rows.iterrows():
    fund_id = r["FundID"]
    age0 = int(r["Fund_Age_Quarters"]) if pd.notna(r["Fund_Age_Quarters"]) else 0

    strategy = r["Adj Strategy"]
    # Funds without a grade start in "D".
    grade = r["Grade"] if pd.notna(r["Grade"]) else "D"

    cap_qe = r["cap_qe"]
    if pd.isna(cap_qe):
        cap_qe = msci_future["quarter_end"].iloc[-1]

    for step, (qe, msci_r, msci_r_lag1) in enumerate(
        zip(msci_future["quarter_end"], msci_future["msci_ret_q"], msci_future["msci_ret_q_lag1"]),
        start=1
    ):
        # Stop at the fund's cap; a fund whose cap is not after the start quarter
        # produces no rows at all.
        if qe > cap_qe:
            break

        msci_r_lag1 = 0.0 if pd.isna(msci_r_lag1) else float(msci_r_lag1)

        age = age0 + step
        age_bucket = make_age_bucket_q(pd.Series([age])).iloc[0]

        # Annual grade transition
        # Attempted every 4th step with probability pchg; the new grade applies
        # from this quarter on, since parameters are looked up after the update.
        prev_grade = grade
        if step % 4 == 0:
            pchg = get_yearly_change_prob(strategy)
            if rng.random() < pchg:
                P, _ = get_transition_matrix(strategy)
                grade = sample_next_grade(grade, P, rng)

        b0, b1, _ = get_betas(strategy, grade)
        alpha, _ = get_alpha(strategy, grade, age_bucket)
        sigma, _ = get_sigma(strategy, grade)

        # omega = alpha + b0 * MSCI_t + b1 * MSCI_{t-1} + sigma * eps, eps ~ N(0, 1);
        # non-finite values are replaced by 0 and the result is clipped to +/- OMEGA_CLIP.
        eps = rng.standard_normal()
        omega = alpha + b0*float(msci_r) + b1*msci_r_lag1 + sigma*eps
        if not np.isfinite(omega):
            omega = 0.0
        omega = float(np.clip(omega, -OMEGA_CLIP, OMEGA_CLIP))

        omega_rows.append({
            "FundID": fund_id,
            "quarter_end": qe,
            "step_q": step,
            "msci_ret_q": float(msci_r),
            "msci_ret_q_lag1": float(msci_r_lag1),
            "omega": float(omega),

            "Fund_Age_Quarters": int(age),
            "Adj Strategy": strategy,
            "Grade_prev": prev_grade,
            "Grade": grade,
            "AgeBucket": age_bucket,
            "cap_qe": cap_qe,
        })

# NOTE(review): if no fund produces any row, omega_proj has no columns and the
# "FundID" lookup below raises KeyError.
omega_proj = pd.DataFrame(omega_rows)
print("\nOmega rows:", len(omega_proj))
print("Omega funds:", omega_proj["FundID"].nunique())

# -----------------------------
# Save outputs
# -----------------------------
# An empty answer counts as "yes". Existing files with the same names are overwritten.
save = input("Save omega projection + NAV_start? (y/n) [y]: ").strip().lower()
if save in {"", "y", "yes"}:
    omega_csv = os.path.join(DATA_DIR, f"omega_projection_sota_{year}_{quarter}_{n_q}q.csv")
    omega_pq  = os.path.join(DATA_DIR, f"omega_projection_sota_{year}_{quarter}_{n_q}q.parquet")

    omega_proj.to_csv(omega_csv, index=False)
    omega_proj.to_parquet(omega_pq, index=False)

    # One row per fund with its starting state (includes funds that received no
    # omega rows because their cap is not after the start quarter).
    navstart = base_rows[[
        "FundID","Adj Strategy","Grade","Fund_Age_Quarters","NAV_start","NAV_start_source","cap_qe"
    ]].copy()
    navstart_csv = os.path.join(DATA_DIR, f"nav_start_sota_{year}_{quarter}.csv")
    navstart_pq  = os.path.join(DATA_DIR, f"nav_start_sota_{year}_{quarter}.parquet")
    navstart.to_csv(navstart_csv, index=False)
    navstart.to_parquet(navstart_pq, index=False)

    # Diagnostics tables (same as before; keep useful)
    # Only these four tables are written; alpha_sg and alpha_s are not saved.
    beta_sg.to_csv(os.path.join(DATA_DIR, "cal_beta_strategy_grade.csv"), index=False)
    beta_s.to_csv(os.path.join(DATA_DIR, "cal_beta_strategy.csv"), index=False)
    alpha_sga.to_csv(os.path.join(DATA_DIR, "cal_alpha_strategy_grade_agebucket.csv"), index=False)
    sigma_sg.to_csv(os.path.join(DATA_DIR, "cal_sigma_strategy_grade.csv"), index=False)

    print("\nSaved:")
    print(omega_csv)
    print(omega_pq)
    print(navstart_csv)
    print(navstart_pq)
    print("Diagnostics saved (beta/alpha/sigma).")

print("\nOmega projection head:")
print(omega_proj.head())


Using MSCI projection file: C:\Users\MANJANID\Documents\Equity\2025_Q3\data\msci_projection_2025_Q3_neutral_40q.parquet
Projection start quarter-end: 2025-09-30
Loaded data rows: 30857
Funds: 1009
Future MSCI quarters available: 40
Calibration rows: 26354


  for (s, g, a), grp in cal2.groupby(["Adj Strategy","Grade","AgeBucket"], dropna=False):
  ratio_draw = tmp.groupby(ratio_key, dropna=False).apply(lambda g: fit_lognorm(g, "log_ratio_nav_draw")).reset_index()
  ratio_draw = tmp.groupby(ratio_key, dropna=False).apply(lambda g: fit_lognorm(g, "log_ratio_nav_draw")).reset_index()
  ratio_size = tmp.groupby(ratio_key, dropna=False).apply(lambda g: fit_lognorm(g, "log_ratio_nav_size")).reset_index()
  ratio_size = tmp.groupby(ratio_key, dropna=False).apply(lambda g: fit_lognorm(g, "log_ratio_nav_size")).reset_index()



Omega rows: 15513
Omega funds: 945

Saved:
C:\Users\MANJANID\Documents\Equity\2025_Q3\data\omega_projection_sota_2025_Q3_40q.csv
C:\Users\MANJANID\Documents\Equity\2025_Q3\data\omega_projection_sota_2025_Q3_40q.parquet
C:\Users\MANJANID\Documents\Equity\2025_Q3\data\nav_start_sota_2025_Q3.csv
C:\Users\MANJANID\Documents\Equity\2025_Q3\data\nav_start_sota_2025_Q3.parquet
Diagnostics saved (beta/alpha/sigma).

Omega projection head:
                             FundID quarter_end  step_q  msci_ret_q  \
0  0084126E5AB84A059EFCD1CB88947783  2025-12-31       1    0.147330   
1  0084126E5AB84A059EFCD1CB88947783  2026-03-31       2    0.023595   
2  0084126E5AB84A059EFCD1CB88947783  2026-06-30       3    0.032187   
3  0084126E5AB84A059EFCD1CB88947783  2026-09-30       4    0.001850   
4  0084126E5AB84A059EFCD1CB88947783  2026-12-31       5   -0.116651   

   msci_ret_q_lag1     omega  Fund_Age_Quarters     Adj Strategy Grade_prev  \
0         0.008573 -0.237889                 48  Venture C