Task 4 of The Forage Quantitative Research Module | JPMC

In [17]:
import numpy as np
import pandas as pd
# Read the loan records and order the rows by FICO score.
data = (
    pd.read_csv("data/loan_data.csv")
    .sort_values("fico_score")
    .reset_index(drop=True)
)

# One row per distinct FICO score:
#   n = number of non-null `default` entries at that score
#   k = sum of the `default` column at that score
per_score = data.groupby("fico_score").agg(
    n=pd.NamedAgg(column="default", aggfunc="count"),
    k=pd.NamedAgg(column="default", aggfunc="sum"),
)
grouped = per_score.reset_index().sort_values("fico_score")

# Plain arrays for the bucketing code that follows.
fico_vals = grouped["fico_score"].to_numpy()
n_vals = grouped["n"].to_numpy()
k_vals = grouped["k"].to_numpy()
M = grouped.shape[0]

# Running totals over the ordered scores.
cum_n = n_vals.cumsum()
cum_k = k_vals.cumsum()

In [None]:
def log_likelihood(k, n):
    """Log-likelihood of ``k`` events out of ``n`` evaluated at the rate ``k / n``.

    Computes ``k * log(p) + (n - k) * log(1 - p)`` with ``p = k / n``.
    When ``k`` is 0 or equal to ``n`` the result is defined as 0.0, which also
    avoids evaluating ``log(0)`` (and the division when ``n`` is 0).
    """
    is_degenerate = k == 0 or k == n
    if not is_degenerate:
        rate = k / n
        return k * np.log(rate) + (n - k) * np.log(1 - rate)
    return 0.0

# Likelihood matrix: LL[i, j] is the log-likelihood of the grouped rows i..j
# (inclusive, i <= j); entries below the diagonal stay 0.
# A leading zero on the running totals makes the i == 0 case use the same
# subtraction as every other row.
prefix_k = np.concatenate(([0], cum_k))
prefix_n = np.concatenate(([0], cum_n))

LL = np.zeros((M, M))
for i in range(M):
    for j in range(i, M):
        k = prefix_k[j + 1] - prefix_k[i]
        n = prefix_n[j + 1] - prefix_n[i]
        LL[i, j] = log_likelihood(k, n)


In [19]:
def optimal_fico_buckets(num_buckets, scores=None, log_lik=None):
    """Choose the bucket boundaries that maximise the total log-likelihood.

    Dynamic programme over the ordered scores: ``best[b, j]`` is the highest
    total obtainable by cutting scores ``0..j`` into ``b`` contiguous buckets,
    where a bucket covering scores ``i..j`` contributes ``log_lik[i, j]``.

    Parameters
    ----------
    num_buckets : int
        Number of buckets; must satisfy ``1 <= num_buckets <= len(scores)``.
    scores : sequence, optional
        Score values, in the same order as the rows/columns of ``log_lik``.
        Defaults to the notebook-level ``fico_vals``.
    log_lik : 2-D array-like, optional
        Square matrix whose ``[i, j]`` entry (``i <= j``) is the
        log-likelihood of one bucket spanning scores ``i..j``.
        Defaults to the notebook-level ``LL``.

    Returns
    -------
    list
        ``num_buckets - 1`` boundary scores in ascending order. Each boundary
        is the last score of its bucket (inclusive upper edge, given ascending
        ``scores``); the final bucket has no boundary.

    Raises
    ------
    ValueError
        If ``num_buckets`` is outside ``1..len(scores)`` or ``log_lik`` is not
        a ``len(scores) x len(scores)`` matrix.
    """
    if scores is None:
        scores = fico_vals
    if log_lik is None:
        log_lik = LL
    log_lik = np.asarray(log_lik)
    n_scores = len(scores)

    if log_lik.shape != (n_scores, n_scores):
        raise ValueError(
            f"log_lik must have shape ({n_scores}, {n_scores}), got {log_lik.shape}"
        )
    # Without this guard an impossible request would backtrack through
    # never-filled split entries and return meaningless boundaries.
    if not 1 <= num_buckets <= n_scores:
        raise ValueError(
            f"num_buckets must be between 1 and {n_scores}, got {num_buckets}"
        )

    best = np.full((num_buckets + 1, n_scores), -np.inf)
    split = np.zeros((num_buckets + 1, n_scores), dtype=int)

    # A single bucket covering scores 0..j.
    best[1, :] = log_lik[0, :]

    for b in range(2, num_buckets + 1):
        first = b - 2  # earliest index at which the first b-1 buckets can end
        for j in range(b - 1, n_scores):
            # candidates[t]: the first b-1 buckets end at index first+t and the
            # last bucket spans first+t+1 .. j.
            candidates = best[b - 1, first:j] + log_lik[first + 1:j + 1, j]
            # argmax returns the first maximum, i.e. ties go to the lowest index.
            t = int(np.argmax(candidates))
            best[b, j] = candidates[t]
            split[b, j] = first + t

    # Walk the recorded split points back from the last score.
    boundaries = []
    j = n_scores - 1
    for b in range(num_buckets, 1, -1):
        j = split[b, j]
        boundaries.append(scores[j])

    return sorted(boundaries)

In [20]:
# FICO to rating mapper
def fico_to_rating(boundaries):
    """Build a function that turns a score into a 1-based rating.

    The returned function gives the 1-based position of the first entry of
    ``boundaries`` that the score does not exceed (``score <= boundary``), or
    ``len(boundaries) + 1`` when the score is above every boundary.
    """
    def mapper(score):
        matches = (
            position
            for position, upper in enumerate(boundaries, start=1)
            if score <= upper
        )
        # Fall through to the extra top rating when no boundary matched.
        return next(matches, len(boundaries) + 1)

    return mapper

In [15]:
def create_rating_map(boundaries):
    """Return a function mapping a FICO score to a 1-based rating.

    Ratings count up from 1: the score gets the position of the first boundary
    it is less than or equal to, and ``len(boundaries) + 1`` if it exceeds all
    of them. Same contract as ``fico_to_rating``.
    """
    def rating(fico_score):
        rank = 1
        for upper in boundaries:
            if fico_score <= upper:
                break
            rank += 1  # score is above this boundary; try the next rating
        return rank

    return rating


In [25]:
# Run: fit the boundaries, label every row with its rating, then report the
# boundaries and the mean of `default` within each rating.
num_buckets = 5
boundaries = optimal_fico_buckets(num_buckets)
rating_fn = fico_to_rating(boundaries)

data["fico_rating"] = data["fico_score"].map(rating_fn)
default_rate_by_rating = data.groupby("fico_rating")["default"].mean()

print(boundaries)
print(default_rate_by_rating)

[np.int64(520), np.int64(580), np.int64(640), np.int64(696)]
fico_rating
1    0.661130
2    0.380952
3    0.204479
4    0.105099
5    0.046470
Name: default, dtype: float64
