# Distribution Fits (Anonymized Data)

This notebook fits candidate distributions to the drawdown, repayment, and recallable ratios.
Zero inflation is handled by estimating the share of observations at 0 separately and fitting the candidate distributions only to the strictly positive values.

In [None]:
import os
import numpy as np
import pandas as pd

try:
    import scipy
    from scipy import stats
    print("scipy version:", scipy.__version__)
except Exception as e:
    raise ImportError("scipy is required for fitting: pip install scipy") from e

In [None]:
from pathlib import Path

# Preferred location of the input file and the directory that receives the fit tables.
INPUT_PATH = "/Users/mozeramozali/Desktop/Equity-Cashflow-projection/anonymized.csv"
OUT_DIR = "model_fits/outputs"

# Fallback: when INPUT_PATH does not exist, search the current working
# directory recursively for a file named anonymized.csv.
if not Path(INPUT_PATH).exists():
    # glob yields matches in no guaranteed order; sort (shallowest path first,
    # then alphabetically) so the same file is picked on every run.
    candidates = sorted(
        Path.cwd().glob("**/anonymized.csv"),
        key=lambda p: (len(p.parts), str(p)),
    )
    if not candidates:
        raise FileNotFoundError("anonymized.csv not found. Set INPUT_PATH to the full path.")
    if len(candidates) > 1:
        # Make an ambiguous pick visible instead of silently choosing one.
        print("Multiple anonymized.csv files found; using the first of:",
              [str(c) for c in candidates])
    INPUT_PATH = str(candidates[0])

print("Using INPUT_PATH:", INPUT_PATH)
# Create the output directory (and any missing parents) up front.
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

In [None]:
# --- Column normalization ---

def _norm_key(s: str) -> str:
    return " ".join(s.strip().lower().replace("_", " ").split())

def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose known columns carry their canonical names.

    Columns are matched through ``_norm_key``, so case, underscores and extra
    whitespace in the incoming headers do not matter. Columns that are not
    listed below are left untouched, and listed names that are absent from
    *df* are simply ignored by ``DataFrame.rename``.
    """
    # (header to look for, canonical name to rename it to)
    wanted = (
        ("Adj strategy", "Adj Strategy"),
        ("Adj Strategy", "Adj Strategy"),
        ("Quarter of Transaction Date", "Quarter"),
        ("Year of Transaction Date", "Year"),
        ("FundID", "FundID"),
        ("Grade", "Grade"),
        ("Adj Drawdown EUR", "Adj Drawdown EUR"),
        ("Adj Repayment EUR", "Adj Repayment EUR"),
        ("NAV Adjusted EUR", "NAV Adjusted EUR"),
        ("Capacity", "Capacity"),
        ("Recallable", "Recallable"),
        ("Fund_Age_Quarters", "Fund_Age_Quarters"),
        ("Drawdown_Ratio", "Drawdown_Ratio"),
        ("Repayment_Ratio", "Repayment_Ratio"),
    )

    # Normalized key -> header exactly as it appears in the frame.
    present = {_norm_key(header): header for header in df.columns}

    # Fall back to the looked-up name itself when nothing in the frame matches.
    mapping = {
        present.get(_norm_key(lookup), lookup): canonical
        for lookup, canonical in wanted
    }
    return df.rename(columns=mapping)

In [None]:
# --- Load data ---

# Parse the CSV with pandas' "python" engine and map its headers onto the
# canonical column names in one step.
df = normalize_columns(pd.read_csv(INPUT_PATH, engine="python"))
print("rows", df.shape[0], "cols", df.shape[1])

In [None]:
# --- Build quarter_end + ratios ---

def parse_quarter(q):
    """Convert a quarter label to a float, or NaN when it cannot be parsed.

    Missing values give NaN and numeric inputs are returned as ``float(q)``.
    Anything else is stringified, stripped and upper-cased, one leading ``Q``
    is dropped, and the remainder is parsed as a number (``"q3"`` -> ``3.0``).
    """
    numeric_types = (int, np.integer, float, np.floating)

    if pd.isna(q):
        return np.nan
    if isinstance(q, numeric_types):
        return float(q)

    label = str(q).strip().upper()
    digits = label[1:] if label.startswith("Q") else label
    try:
        value = float(digits)
    except Exception:
        # Unparseable labels are treated as missing rather than as errors.
        value = np.nan
    return value

# quarter_end
# Coerce the raw quarter labels and years to numbers (unparseable -> NaN).
df["Quarter"] = df["Quarter"].apply(parse_quarter)
df["Year"] = pd.to_numeric(df["Year"], errors="coerce")

# Only rows with a year and a quarter in 1..4 can form a quarterly period.
# Anything else (missing, 0, 5, inf, ...) is left without a timestamp instead
# of making the period construction raise for the whole frame.
mask = df["Year"].notna() & df["Quarter"].between(1, 4)
years = df.loc[mask, "Year"].astype(int)
quarters = df.loc[mask, "Quarter"].astype(int)

# Building a PeriodIndex from year/quarter keywords is deprecated in recent
# pandas in favour of PeriodIndex.from_fields; use it when available and fall
# back to the constructor on older versions.
if hasattr(pd.PeriodIndex, "from_fields"):
    _quarter_periods = pd.PeriodIndex.from_fields(
        year=years.to_numpy(), quarter=quarters.to_numpy(), freq="Q"
    )
else:
    _quarter_periods = pd.PeriodIndex(
        year=years.to_numpy(), quarter=quarters.to_numpy(), freq="Q"
    )

# Make sure the column exists even when no row has a valid year/quarter,
# because later cells sort on it.
if "quarter_end" not in df.columns:
    df["quarter_end"] = pd.NaT
# NOTE(review): the column name suggests to_timestamp("Q") yields the last day
# of each quarter — confirm on the installed pandas version.
df.loc[mask, "quarter_end"] = _quarter_periods.to_timestamp("Q")


# current grade (forward fill if no Grade_Current)
if "Grade_Current" in df.columns:
    # The input already carries a current grade: use it as the fitting grade.
    df["Grade"] = df["Grade_Current"]
    print("Using Grade_Current for fitting.")
elif {"FundID", "Grade", "quarter_end"}.issubset(df.columns):
    # Stringify and trim the grades, then turn the textual placeholders that
    # stand for "no value" back into real missing values.
    grade_text = df["Grade"].astype(str).str.strip()
    is_blank = grade_text.isin(["", "nan", "None", "NaN", "<NA>"])
    df["Grade"] = grade_text
    df.loc[is_blank, "Grade"] = np.nan

    # Within each fund, in quarter order, carry the last known grade forward.
    df = df.sort_values(["FundID", "quarter_end"])
    df["Grade_Current"] = df.groupby("FundID")["Grade"].ffill()
    df["Grade"] = df["Grade_Current"]
    print("Computed Grade_Current (forward fill) for fitting.")

# ratios
df = df.sort_values(["FundID", "quarter_end"])
# NAV of the preceding row of the same fund, given the ordering above.
# NOTE(review): this is the previous *row*, which is the previous quarter only
# if each fund has one row per consecutive quarter — confirm against the data.
df["nav_prev"] = df.groupby("FundID")["NAV Adjusted EUR"].shift(1)

# Numeric views of the inputs; anything unparseable becomes NaN.
cap, draw, rep, nav_prev, rc = (
    pd.to_numeric(df[column], errors="coerce")
    for column in (
        "Capacity",
        "Adj Drawdown EUR",
        "Adj Repayment EUR",
        "nav_prev",
        "Recallable",
    )
)

# computed ratios
# Each ratio is defined only where its denominator is usable; elsewhere NaN.
has_capacity = cap > 0
nav_prev_abs = nav_prev.abs()
has_prior_nav = nav_prev_abs > 1.0
has_repayment = rep > 0

df["draw_ratio_calc"] = np.where(has_capacity, draw / cap, np.nan)
df["rep_ratio_calc"] = np.where(has_prior_nav, rep / nav_prev_abs, np.nan)
df["rc_ratio_given_rep"] = np.where(has_repayment, rc / rep, np.nan)


# Fund-age buckets in quarters: right-closed bin edges and one label per bin.
AGE_BINS_Q = [-1, 3, 7, 11, 15, 19, 1000]
AGE_LABELS = ["0-3", "4-7", "8-11", "12-15", "16-19", "20+"]

if "Fund_Age_Quarters" not in df.columns:
    # Without an age column every row falls into one catch-all bucket.
    df["AgeBucket"] = "ALL"
else:
    fund_age_q = pd.to_numeric(df["Fund_Age_Quarters"], errors="coerce")
    df["AgeBucket"] = pd.cut(fund_age_q, bins=AGE_BINS_Q, labels=AGE_LABELS)

# Ratio name used in the output tables -> computed series on the full frame.
ratio_map = {
    "draw_ratio": df["draw_ratio_calc"],
    "rep_ratio": df["rep_ratio_calc"],
    "rc_ratio_given_rep": df["rc_ratio_given_rep"],
}

# Quick diagnostics per ratio. The shares are taken over the non-missing
# observations only, so they agree with the printed n and with the zero share
# that fit_ratio_series computes after dropping NaN (missing values are not
# counted as zeros).
for name, s in ratio_map.items():
    s = pd.to_numeric(s, errors="coerce")
    valid = s.dropna()
    print(name, "n", s.notna().sum(), "zero_share", (valid <= 1e-9).mean(),
          ">1 share", (valid > 1).mean())

In [None]:
# --- Fit distributions ---

def _fit_dist(x: np.ndarray, dist_name: str):
    dist = getattr(stats, dist_name)
    if dist_name in ("beta", "logitnorm"):
        params = dist.fit(x, floc=0, fscale=1)
    elif dist_name in ("lognorm", "gamma", "weibull_min", "fisk"):
        params = dist.fit(x, floc=0)
    else:
        params = dist.fit(x)
    ll = float(np.sum(dist.logpdf(x, *params)))
    k = len(params)
    return params, ll, k


def _ks_pvalue(x: np.ndarray, dist_name: str, params):
    try:
        d, p = stats.kstest(x, dist_name, args=params)
        return float(p)
    except Exception:
        return float("nan")


def fit_candidates(x: np.ndarray, dist_names, eps=1e-9):
    """Fit every candidate distribution to *x* and tabulate the results.

    Parameters
    ----------
    x : np.ndarray
        Observations to fit.
    dist_names : iterable of str
        Candidate distribution names, passed on to ``_fit_dist``.
    eps : float
        Margin used to keep ``"beta"`` / ``"logitnorm"`` data strictly inside
        (0, 1).

    Returns
    -------
    list of dict
        One row per candidate with keys ``dist``, ``aic``, ``bic``, ``ks_p``,
        ``n_fit`` and ``params``. ``params`` is the fitted parameter tuple,
        ``"no_data"`` when nothing was left to fit, or ``"error: ..."`` when
        the fit raised; in the latter two cases the statistics are NaN.

    Notes
    -----
    ``"beta"`` and ``"logitnorm"`` are fitted to the subset of *x* inside
    (eps, 1 - eps) while other candidates use all of *x*. When *x* contains
    values outside that interval, the AIC/BIC of the two groups refer to
    different samples and are not strictly comparable; compare ``n_fit``.
    """
    unit_interval_only = ("beta", "logitnorm")
    rows = []
    for name in dist_names:
        # Stays 0 only if the failure happens before the sample is selected.
        n_fit = 0
        try:
            if name in unit_interval_only:
                x_fit = x[(x > eps) & (x < 1.0 - eps)]
            else:
                x_fit = x
            n_fit = len(x_fit)
            if n_fit == 0:
                rows.append({"dist": name, "aic": np.nan, "bic": np.nan, "ks_p": np.nan, "n_fit": 0, "params": "no_data"})
                continue
            params, ll, k = _fit_dist(x_fit, name)
            aic = 2 * k - 2 * ll
            bic = np.log(n_fit) * k - 2 * ll
            ks_p = _ks_pvalue(x_fit, name, params)
            rows.append({"dist": name, "aic": aic, "bic": bic, "ks_p": ks_p, "n_fit": n_fit, "params": params})
        except Exception as e:
            # Best effort: one failing candidate must not stop the others.
            # Report the real sample size so a failed fit is distinguishable
            # from a candidate that had no data at all.
            rows.append({"dist": name, "aic": np.nan, "bic": np.nan, "ks_p": np.nan, "n_fit": n_fit, "params": f"error: {e}"})
    return rows


def fit_ratio_series(x: pd.Series, dist_names, eps=1e-9):
    """Summarise a ratio series as a mass at zero plus fits to its positive part.

    The series is coerced to numeric, missing values are dropped, and negative
    values are clipped to 0. Values at or below *eps* count as zeros; the rest
    is handed to ``fit_candidates``.

    Returns a dict with ``n`` (usable observations), ``n_pos`` (observations
    above *eps*), ``zero_share`` (zeros / n, NaN when n is 0) and ``fits``
    (the rows returned by ``fit_candidates``).
    """
    values = pd.to_numeric(x, errors="coerce").dropna().clip(lower=0.0)
    total = len(values)

    is_positive = values > eps
    positive_values = values[is_positive].to_numpy(dtype=float)
    zero_count = int((~is_positive).sum())

    if total:
        zero_share = zero_count / float(total)
    else:
        zero_share = np.nan

    return {
        "n": total,
        "n_pos": len(positive_values),
        "zero_share": zero_share,
        "fits": fit_candidates(positive_values, dist_names, eps=eps),
    }


# Candidate families tried for the strictly positive part of each ratio.
# fit_candidates fits "beta" and "logitnorm" only to values strictly inside
# (0, 1); the other names are fitted to every value it is given.
# NOTE(review): "logitnorm" does not look like a scipy.stats distribution
# name — confirm _fit_dist can resolve it, otherwise that candidate can only
# ever produce an error row.
dist_names = ["beta", "logitnorm", "lognorm", "gamma", "weibull_min", "fisk"]

In [None]:
# --- Global fits ---

# One output row per (ratio, candidate distribution), fitted on all funds.
global_rows = []
for ratio_name, ratio_values in ratio_map.items():
    summary = fit_ratio_series(ratio_values, dist_names, eps=1e-9)
    # Sample-level columns shared by every candidate of this ratio.
    shared = {
        "ratio": ratio_name,
        "n": summary["n"],
        "n_pos": summary["n_pos"],
        "zero_share": summary["zero_share"],
    }
    global_rows.extend({**shared, **fit} for fit in summary["fits"])

# Within each ratio, order candidates by ascending AIC.
global_df = pd.DataFrame(global_rows)
global_df = global_df.sort_values(["ratio", "aic"])
print(global_df.head(10))

out_global = os.path.join(OUT_DIR, "ratio_fit_global.csv")
global_df.to_csv(out_global, index=False)
print("Wrote", out_global)

In [None]:
# --- Grouped fits (Adj Strategy + Grade + AgeBucket) ---

# Groups with fewer rows than this are skipped entirely.
# NOTE(review): the threshold applies to the group's row count, not to the
# number of usable observations per ratio, so a ratio inside a large group can
# still be fitted on very few points — check n / n_pos / n_fit in the output.
MIN_OBS = 150
group_cols = ["Adj Strategy", "Grade", "AgeBucket"]

# (name written to the output, column holding the computed ratio)
ratio_cols = [
    ("draw_ratio", "draw_ratio_calc"),
    ("rep_ratio", "rep_ratio_calc"),
    ("rc_ratio_given_rep", "rc_ratio_given_rep"),
]

group_rows = []
# observed=True: AgeBucket can be categorical, and only combinations that
# actually occur are of interest; this also avoids relying on the groupby
# default for categorical keys, which is deprecated in recent pandas.
for gkey, g in df.groupby(group_cols, observed=True):
    if len(g) < MIN_OBS:
        continue
    for name, column in ratio_cols:
        res = fit_ratio_series(g[column], dist_names, eps=1e-9)
        if not res["fits"]:
            continue
        # Lowest AIC wins; failed fits (NaN AIC) rank last. If every candidate
        # failed, the first failed row is kept so the group stays visible.
        best = min(res["fits"], key=lambda r: np.nan_to_num(r["aic"], nan=np.inf))
        row = {
            "ratio": name,
            "group_n": len(g),
            "n": res["n"],
            "n_pos": res["n_pos"],
            "zero_share": res["zero_share"],
            **best,
        }
        # Attach the group key values under their column names.
        row.update(zip(group_cols, gkey))
        group_rows.append(row)

by_group = pd.DataFrame(group_rows)
print(by_group.head(10))

out_group = os.path.join(OUT_DIR, "ratio_fit_by_group.csv")
by_group.to_csv(out_group, index=False)
print("Wrote", out_group)