In [330]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score

In [331]:
df = pd.read_csv('Loan_Data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               10000 non-null  int64  
 1   credit_lines_outstanding  10000 non-null  int64  
 2   loan_amt_outstanding      10000 non-null  float64
 3   total_debt_outstanding    10000 non-null  float64
 4   income                    10000 non-null  float64
 5   years_employed            10000 non-null  int64  
 6   fico_score                10000 non-null  int64  
 7   default                   10000 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 625.1 KB


## Proability of Default

In [332]:
X = df.drop(columns=['default', 'customer_id'])
y = df['default']

In [333]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)
X_train

Unnamed: 0,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score
4792,5,6872.275299,34570.154060,107248.81650,3,514
8854,3,3585.269719,10301.508670,68611.69532,6,551
6250,2,6370.938297,17663.289640,116174.59850,5,722
5936,4,3076.065127,10175.947590,47974.42736,5,603
425,0,4553.580228,942.588258,94258.82575,5,606
...,...,...,...,...,...,...
5734,0,5156.223930,5498.313957,78189.16432,3,632
5191,0,4176.678136,2875.619535,49886.56723,3,641
5390,0,4105.612488,6572.384556,81302.87422,3,576
860,3,4899.443593,17795.917300,81136.36579,2,554


In [334]:
regr = LogisticRegression()
regr.fit(X_train.values, y_train)

y_predict = regr.predict(X_test.values)
regr.predict_proba(X_test.values)

In [335]:
f1_score(y_pred=y_predict, y_true=y_test)

0.9624300559552358

In [336]:
accuracy_score(y_pred=y_predict, y_true=y_test)

0.9865714285714285

In [337]:
roc_auc_score(y_score=y_predict, y_true=y_test)

0.973883620692036

In [338]:
RECOVERY_RATE = 0.1

def expected_loss(loan_amount, credit_lines_outstanding, loan_amt_outstanding, total_debt_outstanding, income, years_employed, fico_score, model):
    # prob of default
    pd = model.predict_proba(np.array([credit_lines_outstanding, loan_amt_outstanding, total_debt_outstanding, income, years_employed, fico_score]).reshape(1, -1)).flat[1]
    # PD x (1 - RR) x EAD = Expected Loss
    return pd * (1 - RECOVERY_RATE) * loan_amount

In [339]:
la = 1e6
cred_line_outs = 4
loan_amt_outs = 5e5
total_debt_outs = 7e5
income = 1e5
years_employed = 7
fico_score = 850
expected_loss(la, cred_line_outs, loan_amt_outs, total_debt_outs, income, years_employed, fico_score, regr)

900000.0

## FICO Bucketing

In [340]:
NUM_OF_BUCKETS = 5
MAX_FICO_SCORE = 850
FICO_OFFSET = 300

In [341]:
def log_likelihood(n, k):
    p = k/n
    if (p==0 or p==1):
        return 0
    return k*np.log(p)+ (n-k)*np.log(1-p)

In [342]:
x = df['default'].to_list()
y = df['fico_score'].to_list()
n = len(df)

default = [0 for i in range(MAX_FICO_SCORE + 1)]
total = [0 for i in range(MAX_FICO_SCORE + 1)]

for i in range(n):
    default[y[i]-FICO_OFFSET] += x[i]
    total[y[i]-FICO_OFFSET] += 1
    
for i in range(MAX_FICO_SCORE - FICO_OFFSET + 1):
    default[i] += default[i-1]
    total[i] += total[i-1]
    
dp = [[[-10**18, 0] for i in range(MAX_FICO_SCORE + 1)] for j in range(NUM_OF_BUCKETS + 1)]

for i in range(NUM_OF_BUCKETS + 1):
    for j in range(MAX_FICO_SCORE - FICO_OFFSET + 1):
        if (i==0):
            dp[i][j][0] = 0
        else:
            for k in range(j):
                if (total[j]==total[k]):
                    continue
                if (i==1):
                    dp[i][j][0] = log_likelihood(total[j], default[j])
                else:
                    if (dp[i][j][0] < (dp[i-1][k][0] + log_likelihood(total[j]-total[k], default[j] - default[k]))):
                        dp[i][j][0] = log_likelihood(total[j]-total[k], default[j]-default[k]) + dp[i-1][k][0]
                        dp[i][j][1] = k

print(round(dp[NUM_OF_BUCKETS][550][0], 4))

k = 550
l = []
for i in range(NUM_OF_BUCKETS, 0, -1):
    l.append(k+300)
    k = dp[i][k][1]

print(l)


-4255.3774
[850, 696, 640, 580, 520]
