# Distribution Fits (Anonymized Data)

This notebook fits candidate distributions for drawdown, repayment, and recallable ratios.
It handles zeroâ€‘inflation by modeling a mass at 0 and fitting the positive tail.

In [1]:
import os
import numpy as np
import pandas as pd

try:
    import scipy
    from scipy import stats
    print("scipy version:", scipy.__version__)
except Exception as e:
    raise ImportError("scipy is required for fitting: pip install scipy") from e

scipy version: 1.17.0


In [2]:
import os
from pathlib import Path
RUN_TAG = os.environ.get("RUN_TAG")
HIST_END = os.environ.get("HIST_END")
if RUN_TAG is None:
    RUN_TAG = HIST_END or "2025Q3"
BASE_OUT = Path("model_fits") / "runs" / RUN_TAG
CALIB_DIR = BASE_OUT / "calibration"
PROJ_DIR = BASE_OUT / "projection"


INPUT_PATH = "/Users/mozeramozali/Desktop/Equity-Cashflow-projection/anonymized.csv"
OUT_DIR = str(CALIB_DIR)

# Auto-detect anonymized.csv if not in current working directory
if not Path(INPUT_PATH).exists():
    candidates = list(Path.cwd().glob("**/anonymized.csv"))
    if candidates:
        INPUT_PATH = str(candidates[0])
    else:
        raise FileNotFoundError("anonymized.csv not found. Set INPUT_PATH to the full path.")

print("Using INPUT_PATH:", INPUT_PATH)
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
Path(CALIB_DIR).mkdir(parents=True, exist_ok=True)


Using INPUT_PATH: /Users/mozeramozali/Desktop/Equity-Cashflow-projection/anonymized.csv


In [3]:
# --- Column normalization ---

def _norm_key(s: str) -> str:
    return " ".join(s.strip().lower().replace("_", " ").split())

def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    col_map = {_norm_key(c): c for c in df.columns}
    def _get(name: str) -> str:
        k = _norm_key(name)
        return col_map.get(k, name)

    rename = {}
    rename[_get("Adj strategy")] = "Adj Strategy"
    rename[_get("Adj Strategy")] = "Adj Strategy"
    rename[_get("Quarter of Transaction Date")] = "Quarter"
    rename[_get("Year of Transaction Date")] = "Year"
    rename[_get("FundID")] = "FundID"
    rename[_get("First Closing Date")] = "First Closing Date"
    rename[_get("Grade")] = "Grade"
    rename[_get("Adj Drawdown EUR")] = "Adj Drawdown EUR"
    rename[_get("Adj Repayment EUR")] = "Adj Repayment EUR"
    rename[_get("NAV Adjusted EUR")] = "NAV Adjusted EUR"
    rename[_get("Capacity")] = "Capacity"
    rename[_get("Recallable")] = "Recallable"
    rename[_get("Fund_Age_Quarters")] = "Fund_Age_Quarters"
    rename[_get("Drawdown_Ratio")] = "Drawdown_Ratio"
    rename[_get("Repayment_Ratio")] = "Repayment_Ratio"

    return df.rename(columns=rename)

In [4]:
# --- Load data ---

df = pd.read_csv(INPUT_PATH, engine="python")
df = normalize_columns(df)
print("rows", len(df), "cols", len(df.columns))

rows 30857 cols 23


In [5]:
# --- Build quarter_end + ratios ---

def parse_quarter(q):
    if pd.isna(q):
        return np.nan
    if isinstance(q, (int, np.integer, float, np.floating)):
        return float(q)
    s = str(q).strip().upper()
    if s.startswith("Q"):
        s = s[1:]
    try:
        return float(s)
    except Exception:
        return np.nan


def add_quarter_end(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["Quarter"] = df["Quarter"].apply(parse_quarter)
    df["Year"] = pd.to_numeric(df["Year"], errors="coerce")
    m = df["Year"].notna() & df["Quarter"].notna()
    years = df.loc[m, "Year"].astype(int)
    quarters = df.loc[m, "Quarter"].astype(int)
    df.loc[m, "quarter_end"] = pd.PeriodIndex(year=years, quarter=quarters, freq="Q").to_timestamp("Q")
    return df


def apply_current_grade(df: pd.DataFrame, context: str = "") -> pd.DataFrame:
    df = df.copy()

    if "Grade" in df.columns and "Grade_Seed" not in df.columns:
        df["Grade_Seed"] = df["Grade"]

    if "Grade" in df.columns:
        df["Grade"] = df["Grade"].astype(str).str.strip()
        df.loc[df["Grade"].isin(["", "nan", "None", "NaN", "<NA>"]), "Grade"] = np.nan

    if "quarter_end" not in df.columns:
        df = add_quarter_end(df)

    df["QPeriod"] = df["quarter_end"].dt.to_period("Q")

    cols = [
        "FundID",
        "Adj Strategy",
        "QPeriod",
        "quarter_end",
        "Adj Drawdown EUR",
        "Adj Repayment EUR",
        "NAV Adjusted EUR",
        "First Closing Date",
        "Grade",
    ]
    cols = [c for c in cols if c in df.columns]
    cash = df[cols].copy()
    cash = cash.rename(
        columns={
            "Adj Strategy": "AdjStrategy",
            "quarter_end": "TransactionDate",
            "First Closing Date": "FirstClosingDate",
        }
    )

    cash["TransactionDate"] = pd.to_datetime(cash["TransactionDate"], errors="coerce")
    if "FirstClosingDate" in cash.columns:
        cash["FirstClosingDate"] = pd.to_datetime(cash["FirstClosingDate"], errors="coerce")
    else:
        cash["FirstClosingDate"] = pd.NaT

    for c in ["Adj Drawdown EUR", "Adj Repayment EUR", "NAV Adjusted EUR"]:
        if c in cash.columns:
            cash[c] = pd.to_numeric(cash[c], errors="coerce").fillna(0.0)
        else:
            cash[c] = 0.0

    cash = cash.dropna(subset=["FundID", "TransactionDate"])

    if cash["FirstClosingDate"].isna().any():
        first_tx = cash.groupby("FundID")["TransactionDate"].transform("min")
        cash["FirstClosingDate"] = cash["FirstClosingDate"].fillna(first_tx)

    if cash.empty:
        return df

    fund_strategy = (
        cash.groupby("FundID")["AdjStrategy"]
        .agg(lambda s: s.mode().iat[0] if len(s.mode()) else s.iloc[0])
        .reset_index()
    )

    q_snap = (
        cash.sort_values(["FundID", "TransactionDate"])
        .groupby(["FundID", "QPeriod"], as_index=False)
        .agg(
            AdjStrategy=("AdjStrategy", "last"),
            FirstClosingDate=("FirstClosingDate", "last"),
            QuarterDrawdown=("Adj Drawdown EUR", "sum"),
            QuarterRepayment=("Adj Repayment EUR", "sum"),
            QuarterEndNAV=("NAV Adjusted EUR", "last"),
            QuarterEndDate=("TransactionDate", "max"),
        )
    )

    q_snap = q_snap.sort_values(["FundID", "QPeriod"])
    q_snap["QuarterEndNAV"] = q_snap.groupby("FundID")["QuarterEndNAV"].ffill().fillna(0)

    q_snap["CumDrawdown"] = q_snap.groupby("FundID")["QuarterDrawdown"].cumsum()
    q_snap["CumRepayment"] = q_snap.groupby("FundID")["QuarterRepayment"].cumsum()
    q_snap["PaidIn"] = q_snap["CumDrawdown"].abs()
    q_snap["Distributed"] = q_snap["CumRepayment"].abs()
    q_snap["NAV"] = q_snap["QuarterEndNAV"].abs()

    q_snap["DPI"] = np.where(q_snap["PaidIn"] > 0, q_snap["Distributed"] / q_snap["PaidIn"], np.nan)
    q_snap["TVPI"] = np.where(
        q_snap["PaidIn"] > 0,
        (q_snap["Distributed"] + q_snap["NAV"]) / q_snap["PaidIn"],
        np.nan,
    )

    def xnpv(rate, cfs, dts):
        dts = np.asarray(dts, dtype="datetime64[ns]")
        cfs = np.asarray(cfs, dtype=float)
        t0 = dts[0]
        day_counts = (dts - t0) / np.timedelta64(1, "D")
        years = day_counts / 365.0
        return np.sum(cfs / ((1.0 + rate) ** years))

    def xirr_newton(cfs, dts, guess=0.1, max_iter=80, tol=1e-7):
        dts = np.asarray(dts, dtype="datetime64[ns]")
        cfs = np.asarray(cfs, dtype=float)
        rate = float(guess)
        for _ in range(max_iter):
            f = xnpv(rate, cfs, dts)
            if not np.isfinite(f):
                return np.nan
            if abs(f) < tol:
                return rate
            eps = 1e-6
            f1 = xnpv(rate + eps, cfs, dts)
            df = (f1 - f) / eps
            if df == 0 or not np.isfinite(df):
                return np.nan
            rate_new = rate - f / df
            if rate_new <= -0.999999 or not np.isfinite(rate_new):
                return np.nan
            rate = rate_new
        return np.nan

    def compute_fund_quarter_xirr(fund_cash, fund_q):
        fund_cash = fund_cash.sort_values("TransactionDate")
        cfs = (-fund_cash["Adj Drawdown EUR"].abs() + fund_cash["Adj Repayment EUR"].abs()).to_numpy(dtype=float)
        dts = fund_cash["TransactionDate"].to_numpy(dtype="datetime64[ns]")
        irr_vals = []
        irr_flags = []
        j = 0
        n = len(cfs)
        for r in fund_q.itertuples(index=False):
            q_end = np.datetime64(r.QuarterEndDate, "ns")
            while j < n and dts[j] <= q_end:
                j += 1
            cfs_slice = cfs[:j]
            dts_slice = dts[:j]
            if len(cfs_slice) == 0:
                irr_vals.append(np.nan)
                irr_flags.append("no_txns")
                continue
            terminal_nav = float(abs(r.NAV)) if np.isfinite(r.NAV) else 0.0
            cfs_full = np.append(cfs_slice, terminal_nav)
            dts_full = np.append(dts_slice, q_end).astype("datetime64[ns]")
            if not (np.any(cfs_full < 0) and np.any(cfs_full > 0)):
                irr_vals.append(np.nan)
                irr_flags.append("no_sign_change")
                continue
            tvpi = r.TVPI
            guess = 0.10 if (pd.notna(tvpi) and tvpi > 1.0) else -0.10
            irr = xirr_newton(cfs_full, dts_full, guess=guess)
            if pd.notna(tvpi) and (0.98 <= tvpi <= 1.02):
                if not np.isfinite(irr):
                    irr2 = xirr_newton(cfs_full, dts_full, guess=-guess)
                    if np.isfinite(irr2):
                        irr = irr2
                        irr_flags.append("flat_retry_success")
                    else:
                        irr = 0.0
                        irr_flags.append("flat_to_zero")
                else:
                    irr_flags.append("flat_success")
            else:
                irr_flags.append("ok" if np.isfinite(irr) else "fail")
            irr_vals.append(irr)
        return pd.DataFrame({"IRR": irr_vals, "IRR_Flag": irr_flags})

    irr_rows = []
    for fund_id, fund_q in q_snap.groupby("FundID", sort=False):
        fund_cash = cash[cash["FundID"] == fund_id]
        irr_df = compute_fund_quarter_xirr(fund_cash, fund_q)
        irr_df = irr_df.copy()
        irr_df["FundID"] = fund_id
        irr_df["QPeriod"] = fund_q["QPeriod"].values
        irr_rows.append(irr_df)

    if irr_rows:
        irr_all = pd.concat(irr_rows, ignore_index=True)
        q_snap = q_snap.merge(irr_all, on=["FundID", "QPeriod"], how="left")
    else:
        q_snap["IRR"] = np.nan

    def quartile_to_grade(s):
        r = s.rank(pct=True)
        return pd.cut(r, [0, 0.25, 0.5, 0.75, 1], labels=["D", "C", "B", "A"], include_lowest=True)

    q_snap["Grade_DPI"] = q_snap.groupby(["AdjStrategy", "QPeriod"])["DPI"].transform(quartile_to_grade)
    q_snap["Grade_TVPI"] = q_snap.groupby(["AdjStrategy", "QPeriod"])["TVPI"].transform(quartile_to_grade)
    q_snap["Grade_IRR"] = q_snap.groupby(["AdjStrategy", "QPeriod"])["IRR"].transform(quartile_to_grade)

    DEBT_STRATEGIES = {"Hybrid Debt-Equity", "Private Debt", "Other Private Debt"}
    VC_STRATEGY = "Venture Capital"

    rep = cash[cash["Adj Repayment EUR"].abs() > 0].copy()
    first_repay = rep.groupby("FundID")["TransactionDate"].min().reset_index(name="FirstRepaymentDate")
    first_close = cash.groupby("FundID")["FirstClosingDate"].min().reset_index(name="FirstCloseDate")
    fund_timing = first_close.merge(first_repay, on="FundID", how="left")
    fund_timing = fund_timing.merge(fund_strategy, on="FundID", how="left")
    fund_timing["RepayWithin5Y"] = (
        fund_timing["FirstRepaymentDate"].notna()
        & (fund_timing["FirstRepaymentDate"] <= (fund_timing["FirstCloseDate"] + pd.DateOffset(years=5)))
    )
    fund_timing["BaseYears"] = np.where(fund_timing["RepayWithin5Y"], 5, 6)
    fund_timing["InvestPeriodYears"] = fund_timing["BaseYears"] + 1

    q_snap = q_snap.merge(
        fund_timing[["FundID", "FirstCloseDate", "InvestPeriodYears", "AdjStrategy"]],
        on=["FundID", "AdjStrategy"],
        how="left",
    )
    q_snap["IsInvestmentPeriod"] = q_snap["QuarterEndDate"] <= (
        q_snap["FirstCloseDate"] + q_snap["InvestPeriodYears"].apply(lambda y: pd.DateOffset(years=int(y)))
    )

    fund_counts = (
        q_snap.groupby(["AdjStrategy", "QPeriod"])["FundID"].nunique().rename("StrategyFundCount").reset_index()
    )
    q_snap = q_snap.merge(fund_counts, on=["AdjStrategy", "QPeriod"], how="left")

    q_snap["IsDebt"] = q_snap["AdjStrategy"].isin(DEBT_STRATEGIES)
    q_snap["IsVC"] = q_snap["AdjStrategy"].eq(VC_STRATEGY)

    grade_to_idx = {"A": 0, "B": 1, "C": 2, "D": 3}
    idx_to_grade = {0: "A", 1: "B", 2: "C", 3: "D"}

    def worse_grade(g1, g2):
        if pd.isna(g1):
            return g2
        if pd.isna(g2):
            return g1
        return g1 if grade_to_idx[g1] >= grade_to_idx[g2] else g2

    def downgrade_one_notch(g):
        if pd.isna(g):
            return g
        return idx_to_grade[min(grade_to_idx[g] + 1, 3)]

    def final_grade(row):
        if row["StrategyFundCount"] < 30:
            return worse_grade(row["Grade_DPI"], row["Grade_TVPI"])
        if row["IsDebt"]:
            return row["Grade_IRR"]
        if row["IsInvestmentPeriod"]:
            return row["Grade_TVPI"] if row["IsVC"] else row["Grade_DPI"]
        base = row["Grade_IRR"]
        dpi_g = row["Grade_DPI"]
        if pd.isna(base):
            return base
        if pd.notna(dpi_g) and (grade_to_idx[dpi_g] > grade_to_idx[base]):
            return downgrade_one_notch(base)
        return base

    q_snap["CurrentGrade"] = q_snap.apply(final_grade, axis=1)

    fund_quarters = cash[["FundID", "QPeriod"]].drop_duplicates().sort_values(["FundID", "QPeriod"])
    fund_quarters["RankQ"] = fund_quarters.groupby("FundID").cumcount() + 1
    fund_quarters["Block4"] = (fund_quarters["RankQ"] - 1) // 4 + 1

    fund_first = (
        cash.dropna(subset=["Grade"]).groupby("FundID", as_index=False).first()[["FundID", "Grade"]]
        .rename(columns={"Grade": "FirstGrade"})
    )

    fund_quarters = fund_quarters.merge(fund_first, on="FundID", how="left")
    fund_quarters = fund_quarters.merge(
        q_snap[["FundID", "QPeriod", "CurrentGrade"]], on=["FundID", "QPeriod"], how="left"
    )

    fund_quarters["AssignedGrade"] = np.where(
        (fund_quarters["RankQ"] <= 20) & fund_quarters["FirstGrade"].notna(),
        fund_quarters["FirstGrade"],
        fund_quarters["CurrentGrade"],
    )

    fund_quarters["AssignedGrade"] = fund_quarters.groupby("FundID")["AssignedGrade"].ffill()

    df = df.merge(fund_quarters[["FundID", "QPeriod", "AssignedGrade"]], on=["FundID", "QPeriod"], how="left")
    df["Grade_Current"] = df["AssignedGrade"]
    if "Grade_Seed" in df.columns:
        df["Grade_Current"] = df["Grade_Current"].fillna(df["Grade_Seed"])
    df["Grade"] = df["Grade_Current"]

    if context:
        print(f"Computed Grade_Current using performance rules for {context}.")

    return df


# Build quarter_end + grade history

df = add_quarter_end(df)
df = df.dropna(subset=["quarter_end"])
df = apply_current_grade(df, context="distribution fits")

# ratios

df = df.sort_values(["FundID", "quarter_end"])
df["nav_prev"] = df.groupby("FundID")["NAV Adjusted EUR"].shift(1)

# fund-level commitment (Commitment EUR only)
comm = pd.to_numeric(df.get("Commitment EUR"), errors="coerce")
commit = comm.groupby(df["FundID"]).transform("max")
draw = pd.to_numeric(df["Adj Drawdown EUR"], errors="coerce").abs()
draw_cum = df.groupby("FundID")["Adj Drawdown EUR"].transform(lambda s: s.abs().cumsum())
rep = pd.to_numeric(df["Adj Repayment EUR"], errors="coerce")
nav_prev = pd.to_numeric(df["nav_prev"], errors="coerce")
rc = pd.to_numeric(df["Recallable"], errors="coerce").abs()
rc_cum = df.groupby("FundID")["Recallable"].transform(lambda s: pd.to_numeric(s, errors="coerce").abs().cumsum())

denom = commit + rc_cum

# computed ratios

df["draw_ratio_calc"] = np.where(denom > 0, draw_cum / denom, np.nan)
df["draw_ratio_calc"] = df["draw_ratio_calc"].clip(lower=0.0, upper=1.0)
df["rep_ratio_calc"] = np.where(nav_prev.abs() > 1.0, rep / nav_prev.abs(), np.nan)
df["rc_ratio_given_rep"] = np.where(rep > 0, rc / rep, np.nan)

AGE_BINS_Q = [-1, 3, 7, 11, 15, 19, 1000]
AGE_LABELS = ["0-3", "4-7", "8-11", "12-15", "16-19", "20+"]
if "Fund_Age_Quarters" in df.columns:
    df["AgeBucket"] = pd.cut(pd.to_numeric(df["Fund_Age_Quarters"], errors="coerce"),
                              bins=AGE_BINS_Q, labels=AGE_LABELS)
else:
    df["AgeBucket"] = "ALL"

ratio_map = {
    "draw_ratio": df["draw_ratio_calc"],
    "rep_ratio": df["rep_ratio_calc"],
    "rc_ratio_given_rep": df["rc_ratio_given_rep"],
}

for name, s in ratio_map.items():
    s = pd.to_numeric(s, errors="coerce")
    print(name, "n", s.notna().sum(), "zero_share", (s.fillna(0) <= 1e-9).mean(),
          ">1 share", (s > 1).mean())


  df.loc[m, "quarter_end"] = pd.PeriodIndex(year=years, quarter=quarters, freq="Q").to_timestamp("Q")


  q_snap["FirstCloseDate"] + q_snap["InvestPeriodYears"].apply(lambda y: pd.DateOffset(years=int(y)))


Computed Grade_Current using performance rules for distribution fits.


draw_ratio n 30857 zero_share 0.09388469391061996 >1 share 0.0
rep_ratio n 26365 zero_share 0.8258741938620087 >1 share 0.0
rc_ratio_given_rep n 5417 zero_share 0.9461062319732961 >1 share 0.0070648475224422335


In [6]:
# --- Fit distributions ---

def _fit_dist(x: np.ndarray, dist_name: str):
    dist = getattr(stats, dist_name)
    if dist_name in ("beta", "logitnorm"):
        params = dist.fit(x, floc=0, fscale=1)
    elif dist_name in ("lognorm", "gamma", "weibull_min", "fisk"):
        params = dist.fit(x, floc=0)
    else:
        params = dist.fit(x)
    ll = float(np.sum(dist.logpdf(x, *params)))
    k = len(params)
    return params, ll, k

# estimate mean by sampling (used for scaling/capping)
def _estimate_mean(dist_name: str, params, n=5000, cap=None, seed=42):
    try:
        dist = getattr(stats, dist_name)
        rng = np.random.default_rng(seed)
        samp = dist.rvs(*params, size=n, random_state=rng)
        samp = np.asarray(samp, dtype=float)
        samp = samp[np.isfinite(samp)]
        if cap is not None:
            samp = np.clip(samp, 0.0, cap)
        return float(np.mean(samp)) if len(samp) else float("nan")
    except Exception:
        return float("nan")



def _ks_pvalue(x: np.ndarray, dist_name: str, params):
    try:
        d, p = stats.kstest(x, dist_name, args=params)
        return float(p)
    except Exception:
        return float("nan")


def fit_candidates(x: np.ndarray, dist_names, eps=1e-9, cap=None):
    rows = []
    for name in dist_names:
        try:
            if name in ("beta", "logitnorm"):
                x_fit = x[(x > eps) & (x < 1.0 - eps)]
            else:
                x_fit = x
            n_fit = len(x_fit)
            if n_fit == 0:
                rows.append({"dist": name, "aic": np.nan, "bic": np.nan, "ks_p": np.nan, "n_fit": 0, "params": "no_data", "mean_raw": np.nan, "mean_cap": np.nan, "median_raw": np.nan, "median_cap": np.nan, "cap_used": cap})
                continue
            params, ll, k = _fit_dist(x_fit, name)
            mean_raw = _estimate_mean(name, params, n=5000, cap=None)
            mean_cap = _estimate_mean(name, params, n=5000, cap=cap)
            median_raw = float(np.nanmedian(stats.__getattribute__(name).rvs(*params, size=2000, random_state=np.random.default_rng(123)))) if np.isfinite(mean_raw) else float("nan")
            median_cap = float(np.nanmedian(np.clip(stats.__getattribute__(name).rvs(*params, size=2000, random_state=np.random.default_rng(124)), 0.0, cap))) if cap is not None else float("nan")
            aic = 2 * k - 2 * ll
            bic = np.log(n_fit) * k - 2 * ll
            ks_p = _ks_pvalue(x_fit, name, params)
            rows.append({"dist": name, "aic": aic, "bic": bic, "ks_p": ks_p, "n_fit": n_fit, "params": params, "mean_raw": mean_raw, "mean_cap": mean_cap, "median_raw": median_raw, "median_cap": median_cap, "cap_used": cap})
        except Exception as e:
            rows.append({"dist": name, "aic": np.nan, "bic": np.nan, "ks_p": np.nan, "n_fit": 0, "params": f"error: {e}", "mean_raw": np.nan, "mean_cap": np.nan, "median_raw": np.nan, "median_cap": np.nan, "cap_used": cap})
    return rows


def fit_ratio_series(x: pd.Series, dist_names, eps=1e-9, cap=None):
    x = pd.to_numeric(x, errors="coerce").dropna().clip(lower=0.0)
    n = len(x)
    n_zero = int((x <= eps).sum())
    x_pos = x[x > eps].to_numpy(dtype=float)
    zero_share = n_zero / float(n) if n else np.nan
    data_mean = float(np.nanmean(x_pos)) if len(x_pos) else float("nan")
    data_median = float(np.nanmedian(x_pos)) if len(x_pos) else float("nan")
    data_p90 = float(np.nanquantile(x_pos, 0.9)) if len(x_pos) else float("nan")
    cap_used = cap if cap is not None else data_p90
    fits = fit_candidates(x_pos, dist_names, eps=eps, cap=cap_used)
    return {"n": n, "n_pos": len(x_pos), "zero_share": zero_share, "fits": fits, "data_mean": data_mean, "data_median": data_median, "data_p90": data_p90, "cap_used": cap_used}


dist_names = ["beta", "logitnorm", "lognorm", "gamma", "weibull_min", "fisk"]
dist_names_map = {
    "draw_ratio": dist_names,
    "rep_ratio": dist_names,
    "rc_ratio_given_rep": dist_names,
}


In [7]:
# --- Global fits ---

global_rows = []
for name, series in ratio_map.items():
    cap = 1.0 if name in ("draw_ratio", "rc_ratio_given_rep") else None
    res = fit_ratio_series(series, dist_names_map.get(name, []), eps=1e-9, cap=cap)
    for f in res["fits"]:
        global_rows.append({
            "ratio": name,
            "n": res["n"],
            "n_pos": res["n_pos"],
            "zero_share": res["zero_share"],
            "data_mean": res.get("data_mean"),
            "data_median": res.get("data_median"),
            "data_p90": res.get("data_p90"),
            "cap_used": res.get("cap_used"),
            **f,
        })

global_df = pd.DataFrame(global_rows).sort_values(["ratio", "aic"])
print(global_df.head(10))

out_global = os.path.join(OUT_DIR, "ratio_fit_global.csv")
global_df.to_csv(out_global, index=False)
print("Wrote", out_global)


                 ratio      n  n_pos  zero_share  data_mean  data_median  \
0           draw_ratio  30857  27960    0.093885   0.627514     0.712442   
4           draw_ratio  30857  27960    0.093885   0.627514     0.712442   
3           draw_ratio  30857  27960    0.093885   0.627514     0.712442   
5           draw_ratio  30857  27960    0.093885   0.627514     0.712442   
2           draw_ratio  30857  27960    0.093885   0.627514     0.712442   
1           draw_ratio  30857  27960    0.093885   0.627514     0.712442   
12  rc_ratio_given_rep   5417   1663    0.693004   2.812229     0.720000   
17  rc_ratio_given_rep   5417   1663    0.693004   2.812229     0.720000   
16  rc_ratio_given_rep   5417   1663    0.693004   2.812229     0.720000   
14  rc_ratio_given_rep   5417   1663    0.693004   2.812229     0.720000   

    data_p90  cap_used         dist           aic           bic          ks_p  \
0   1.000000       1.0         beta  -1746.428682  -1714.052416  4.253547e-47   
4

In [8]:
# --- Grouped fits (with fallback levels) ---

MIN_OBS_AGE = 150
MIN_OBS_SG = 200
MIN_OBS_S = 300

def _fit_level(level_name: str, cols: list, min_obs: int):
    rows = []
    for gkey, g in df.groupby(cols):
        if not isinstance(gkey, tuple):
            gkey = (gkey,)
        if len(g) < min_obs:
            continue
        for name, series in [("draw_ratio", g["draw_ratio_calc"]),
                             ("rep_ratio", g["rep_ratio_calc"]),
                             ("rc_ratio_given_rep", g["rc_ratio_given_rep"])]:
            cap = 1.0 if name in ("draw_ratio", "rc_ratio_given_rep") else float(pd.to_numeric(series, errors="coerce").dropna().quantile(0.9)) if series.notna().any() else None
            res = fit_ratio_series(series, dist_names_map.get(name, []), eps=1e-9, cap=cap)
            if not res["fits"]:
                continue
            best = sorted(res["fits"], key=lambda r: (np.nan_to_num(r["aic"], nan=np.inf)))[0]
            row = {
                "ratio": name,
                "level": level_name,
                "group_n": len(g),
                "n": res["n"],
                "n_pos": res["n_pos"],
                "zero_share": res["zero_share"],
                "data_mean": res.get("data_mean"),
                "data_median": res.get("data_median"),
                "data_p90": res.get("data_p90"),
                "cap_used": res.get("cap_used"),
                **best,
            }
            for idx, col in enumerate(cols):
                row[col] = gkey[idx]
            rows.append(row)
    return rows

lvl_age = _fit_level("strategy_grade_age", ["Adj Strategy", "Grade", "AgeBucket"], MIN_OBS_AGE)
lvl_sg = _fit_level("strategy_grade", ["Adj Strategy", "Grade"], MIN_OBS_SG)
lvl_s = _fit_level("strategy", ["Adj Strategy"], MIN_OBS_S)

by_group = pd.DataFrame(lvl_age + lvl_sg + lvl_s)
print(by_group.head(10))

out_group = os.path.join(OUT_DIR, "ratio_fit_by_group.csv")
by_group.to_csv(out_group, index=False)
print("Wrote", out_group)


  for gkey, g in df.groupby(cols):
  fac = xbar * (1 - xbar) / data.var(ddof=0) - 1
  func = [s1 - n * (-psiab + sc.psi(a)),
  s2 - n * (-psiab + sc.psi(b))]


  aest = (3-s + np.sqrt((s-3)**2 + 24*s)) / (12*s)
  a = optimize.brentq(lambda a: np.log(a) - sc.digamma(a) - s,


  aest = (3-s + np.sqrt((s-3)**2 + 24*s)) / (12*s)
  a = optimize.brentq(lambda a: np.log(a) - sc.digamma(a) - s,


  fac = xbar * (1 - xbar) / data.var(ddof=0) - 1
  func = [s1 - n * (-psiab + sc.psi(a)),
  s2 - n * (-psiab + sc.psi(b))]


  aest = (3-s + np.sqrt((s-3)**2 + 24*s)) / (12*s)
  a = optimize.brentq(lambda a: np.log(a) - sc.digamma(a) - s,
  s = stats.skew(data)


  fac = xbar * (1 - xbar) / data.var(ddof=0) - 1
  func = [s1 - n * (-psiab + sc.psi(a)),
  s2 - n * (-psiab + sc.psi(b))]
  aest = (3-s + np.sqrt((s-3)**2 + 24*s)) / (12*s)
  a = optimize.brentq(lambda a: np.log(a) - sc.digamma(a) - s,


  fac = xbar * (1 - xbar) / data.var(ddof=0) - 1
  func = [s1 - n * (-psiab + sc.psi(a)),
  s2 - n * (-psiab + sc.psi(b))]
  aest = (3-s + np.sqrt((s-3)**2 + 24*s)) / (12*s)
  a = optimize.brentq(lambda a: np.log(a) - sc.digamma(a) - s,


                ratio               level  group_n    n  n_pos  zero_share  \
0          draw_ratio  strategy_grade_age      195  195    195    0.000000   
1           rep_ratio  strategy_grade_age      195  190     30    0.842105   
2  rc_ratio_given_rep  strategy_grade_age      195   30      1    0.966667   
3          draw_ratio  strategy_grade_age      214  214    214    0.000000   
4           rep_ratio  strategy_grade_age      214  202     21    0.896040   
5  rc_ratio_given_rep  strategy_grade_age      214   21      0    1.000000   
6          draw_ratio  strategy_grade_age      235  235    235    0.000000   
7           rep_ratio  strategy_grade_age      235  206      8    0.961165   
8  rc_ratio_given_rep  strategy_grade_age      235    8      0    1.000000   
9          draw_ratio  strategy_grade_age      220  220    220    0.000000   

   data_mean  data_median  data_p90  cap_used  ...      ks_p  n_fit  \
0   0.691001     0.779477  1.000000  1.000000  ...  0.000026    122   

In [9]:
# --- Selected with fallback ---

best_global = (global_df.sort_values(["ratio", "aic"])
               .groupby("ratio").head(1)
               .set_index("ratio"))

base_groups = df[["Adj Strategy", "Grade", "AgeBucket"]].dropna().drop_duplicates()
selected_rows = []

for _, r in base_groups.iterrows():
    s, g, a = r["Adj Strategy"], r["Grade"], r["AgeBucket"]
    for ratio in ["draw_ratio", "rep_ratio", "rc_ratio_given_rep"]:
        row = None
        if not by_group.empty:
            m = (by_group["ratio"] == ratio) & (by_group["level"] == "strategy_grade_age") & \
                (by_group["Adj Strategy"] == s) & (by_group["Grade"] == g) & (by_group["AgeBucket"] == a)
            if m.any():
                row = by_group.loc[m].iloc[0].to_dict()
        if row is None and not by_group.empty:
            m = (by_group["ratio"] == ratio) & (by_group["level"] == "strategy_grade") & \
                (by_group["Adj Strategy"] == s) & (by_group["Grade"] == g)
            if m.any():
                row = by_group.loc[m].iloc[0].to_dict()
        if row is None and not by_group.empty:
            m = (by_group["ratio"] == ratio) & (by_group["level"] == "strategy") & \
                (by_group["Adj Strategy"] == s)
            if m.any():
                row = by_group.loc[m].iloc[0].to_dict()
        if row is None and ratio in best_global.index:
            row = best_global.loc[ratio].to_dict()
            row["level"] = "global"
        if row is None:
            continue
        row["Adj Strategy"] = s
        row["Grade"] = g
        row["AgeBucket"] = a
        selected_rows.append(row)

selected = pd.DataFrame(selected_rows)
out_sel = os.path.join(OUT_DIR, "ratio_fit_selected.csv")
selected.to_csv(out_sel, index=False)
print("Wrote", out_sel)


Wrote model_fits/runs/2025Q3/calibration/ratio_fit_selected.csv
