# 🧩 Modeling: KMeans Segmentation

Goal: segment loans into groups with potentially different risk profiles, then validate by comparing **bad rates** per cluster on a holdout set.

> Note: KMeans uses Euclidean distance, so we use numeric, scaled features only.

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Default size (width, height) applied to every figure drawn in this notebook.
plt.rcParams["figure.figsize"] = (12, 6)

## Load data

This notebook expects a loan-level dataset with features + `bad`.

In [None]:
# Use df_model (wrangled) or df_fe (engineered) depending on your goal.
# Here we follow the segmentation approach from your draft: origination + purpose/property dummies.

# If not in memory, load from disk:
# df_model = pd.read_parquet(DATA_DIR + "df_model.parquet")

# NOTE(review): the comments above mention df_model / df_fe, but this line copies a
# frame named `df`, which is not defined in the visible cells -- confirm which upstream
# frame is intended before running this notebook on its own.
# Working on a copy keeps the upstream frame untouched by the column edits made below.
df_seg = df.copy()  # if you ran earlier notebooks in the same session

## Prepare variables for clustering

In [None]:
# Build the clustering features and a reproducible ~70/30 modeling/validation split.

# Ordinal encoding of the original term: 360 -> 2, 180 -> 1, any other term -> 0.
df_seg["loan_term"] = np.where(df_seg["ORIG_TERM"] == 360, 2, np.where(df_seg["ORIG_TERM"] == 180, 1, 0))

# Exclude borrowers with invalid FICO. Rows with a missing score also fail the
# comparison, so they are dropped here as well.
df_seg = df_seg[df_seg["CSCORE_B"] > 300].copy()

# Collapse rare property types
df_seg["prop"] = np.where(df_seg["PROP"].isin(["MH", "CP"]), "other", df_seg["PROP"])

# One-hot encode both categoricals in one pass. dtype=float keeps the indicators
# numeric on every pandas version (pandas >= 2.0 would otherwise emit bool columns),
# which is what the scaling and distance computations downstream operate on.
df_seg = pd.get_dummies(df_seg, columns=["PURPOSE", "prop"], dtype=float)

Xlist = [
    "ORIG_UPB", "CSCORE_B", "loan_term", "OLTV", "DTI", "NO_UNITS", "MI_PCT",
    "PURPOSE_C", "PURPOSE_P", "PURPOSE_R",
    "prop_CO", "prop_PU", "prop_SF", "prop_other",
]
Xylist = Xlist + ["bad"]

# split modeling vs validation
# A seeded generator makes the split repeatable across runs, matching the fixed
# random_state used for KMeans; with the unseeded global RNG the cluster summaries
# changed every time the notebook was executed.
rng = np.random.default_rng(0)
df_seg["random"] = rng.uniform(0, 1, len(df_seg))
df_seg["seg"] = np.where(df_seg["random"] <= 0.7, "mod", "val")

# NOTE(review): fillna(0) also zero-fills any missing `bad` values, i.e. treats them
# as "good" -- confirm that is intended.
modelsample = df_seg[df_seg["seg"] == "mod"][Xylist].fillna(0).copy()
valsample   = df_seg[df_seg["seg"] == "val"][Xylist].fillna(0).copy()

modelsample.shape, valsample.shape

## Scale features (fit on modeling sample, apply to validation)

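Your original draft standardizes the features and then min-max scales them; below is a faithful version. Both transforms use statistics from the modeling sample only, so validation values may fall slightly outside [0, 1].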

In [None]:
# Frozen snapshot of the unscaled modeling sample. It is passed as `ref` to
# standardize() so that both samples are scaled with the same statistics even
# though `modelsample` itself is overwritten column by column.
modsamp = modelsample.copy()

def standardize(col, samp, ref):
    """Z-score ``samp[col]`` using the mean and standard deviation of ``ref[col]``.

    Args:
        col: Column label present in both ``samp`` and ``ref``.
        samp: Frame whose column is transformed.
        ref: Frame that supplies the mean and standard deviation.

    Returns:
        ``(samp[col] - mean) / std`` as a new Series; ``samp`` is not modified.
        When the reference standard deviation is zero or undefined the values
        are only centred (unit scale).
    """
    mu = ref[col].mean()
    sd = ref[col].std()
    # `sd > 0` is False both for 0 (constant column) and for NaN (fewer than two
    # reference rows), so every degenerate case falls back to a unit scale instead
    # of turning the whole column into NaN.
    scale = sd if sd > 0 else 1
    return (samp[col] - mu) / scale

# Standardize every feature in both samples. The reference is always the frozen
# `modsamp` snapshot, so the order in which the frames are processed is irrelevant.
for frame in (modelsample, valsample):
    for feature in Xlist:
        frame[feature] = standardize(feature, frame, modsamp)

# Snapshot of the standardized modeling sample, taken before the next scaling step.
modsamp2 = modelsample.copy()

def minmax(col, samp, ref):
    """Rescale ``samp[col]`` with the minimum and range of ``ref[col]``.

    Values of ``samp[col]`` equal to the reference minimum map to 0 and values
    equal to the reference maximum map to 1. A zero range (constant reference
    column) is replaced by 1, so the result is just ``samp[col] - min``.
    """
    lo = ref[col].min()
    hi = ref[col].max()
    span = hi - lo
    if span == 0:
        span = 1
    return (samp[col] - lo) / span

# Min-max scale every feature in both samples against the standardized modeling
# snapshot (`modsamp2`), which is not modified inside the loop.
for frame in (modelsample, valsample):
    for feature in Xlist:
        frame[feature] = minmax(feature, frame, modsamp2)

# Quick sanity check of the scaled modeling features.
modelsample[Xlist].describe().T

## Choose K (elbow plot)

In [None]:
from sklearn.cluster import KMeans

# Fit KMeans for K = 1..14 on the modeling sample and record the inertia of each
# fit, then plot inertia against K to look for an elbow.
sse = []
Ks = range(1, 15)
# Select the features by name rather than dropping "bad": once the final-fit cell
# has added a "cluster" column to `modelsample`, re-running this cell with
# drop(columns=["bad"]) would feed the cluster labels back in as a feature.
X_mod = modelsample[Xlist]

for k in Ks:
    km = KMeans(n_clusters=k, n_init="auto", random_state=0)
    km.fit(X_mod)
    sse.append(km.inertia_)

plt.plot(list(Ks), sse)
plt.xlabel("K")
plt.ylabel("Sum of Squared Error (Inertia)")
plt.title("Elbow Plot for KMeans")
plt.show()

## Fit final KMeans + validate by bad rate per cluster

In [None]:
def _cluster_bad_summary(sample, prefix):
    """Summarise the ``bad`` column per cluster for one sample.

    Returns a frame indexed by ``cluster`` with the columns ``<prefix>_count``,
    ``<prefix>_sum``, ``<prefix>_mean`` (the bad rate) and ``<prefix>_count_pct``
    (the cluster's share of all rows in the sample).
    """
    summary = sample.groupby("cluster")["bad"].agg(
        **{
            f"{prefix}_count": "count",
            f"{prefix}_sum": "sum",
            f"{prefix}_mean": "mean",
        }
    )
    summary[f"{prefix}_count_pct"] = summary[f"{prefix}_count"] / summary[f"{prefix}_count"].sum()
    return summary


k_final = 7
km = KMeans(n_clusters=k_final, n_init="auto", random_state=0)

# Select the features by name for both fit and predict. Dropping only "bad" breaks
# on a re-run of this cell, because by then both samples carry a "cluster" column
# that the model was never fitted on.
km.fit(modelsample[Xlist])

modelsample["cluster"] = km.predict(modelsample[Xlist])
valsample["cluster"]   = km.predict(valsample[Xlist])

modelsum = _cluster_bad_summary(modelsample, "model")
valsum = _cluster_bad_summary(valsample, "val")

# Outer merge keeps clusters that appear in only one of the two samples.
mod_val_sum = pd.merge(modelsum, valsum, on="cluster", how="outer").reset_index()
mod_val_sum

### Plot: bad rate by cluster (model vs validation)

In [None]:
# Grouped bar chart of the bad rate per cluster, modeling vs validation.
# Bar heights and tick labels are both taken from one frame sorted by cluster, so a
# label can never end up under another cluster's bar if the row order of
# `mod_val_sum` changes.
plot_df = mod_val_sum.sort_values("cluster")
clusters = plot_df["cluster"].tolist()
x_axis = np.arange(len(clusters))

plt.bar(x_axis - 0.2, plot_df["model_mean"], width=0.4, label="modeling")
plt.bar(x_axis + 0.2, plot_df["val_mean"], width=0.4, label="validation")
plt.xticks(x_axis, clusters)
plt.xlabel("Cluster")
plt.ylabel("Bad Rate")
plt.title("Bad Rate by Cluster (Model vs Validation)")
# Show the y axis as percentages with one decimal.
plt.gca().yaxis.set_major_formatter(lambda x, pos: f"{x:.1%}")
plt.legend()
plt.show()

### Plot: borrower share by cluster (model vs validation)

In [None]:
# Grouped bar chart of each cluster's share of loans, modeling vs validation.
# Reuses `x_axis` and `clusters` from the bad-rate plot cell.
def _as_percent(x, pos):
    """Tick formatter: render an axis value as a percentage with one decimal."""
    return f"{x:.1%}"


share_series = (
    (-0.2, "model_count_pct", "modeling"),
    (0.2, "val_count_pct", "validation"),
)
for offset, column, series_label in share_series:
    plt.bar(x_axis + offset, mod_val_sum[column], width=0.4, label=series_label)

plt.xticks(x_axis, clusters)
plt.xlabel("Cluster")
plt.ylabel("% Borrowers")
plt.title("Borrower Share by Cluster (Model vs Validation)")
plt.gca().yaxis.set_major_formatter(_as_percent)
plt.legend()
plt.show()