In [1]:
import pandas as pd

# Load the data
file_path = '/content/Task 3 and 4_Loan_Data (2).csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe
data.head()


Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [2]:
# Check for missing values
data.info()

# Display summary statistics
data.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               10000 non-null  int64  
 1   credit_lines_outstanding  10000 non-null  int64  
 2   loan_amt_outstanding      10000 non-null  float64
 3   total_debt_outstanding    10000 non-null  float64
 4   income                    10000 non-null  float64
 5   years_employed            10000 non-null  int64  
 6   fico_score                10000 non-null  int64  
 7   default                   10000 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 625.1 KB


Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4974577.0,1.4612,4159.677034,8718.916797,70039.901401,4.5528,637.5577,0.1851
std,2293890.0,1.743846,1421.399078,6627.164762,20072.214143,1.566862,60.657906,0.388398
min,1000324.0,0.0,46.783973,31.652732,1000.0,0.0,408.0,0.0
25%,2977661.0,0.0,3154.235371,4199.83602,56539.867903,3.0,597.0,0.0
50%,4989502.0,1.0,4052.377228,6732.407217,70085.82633,5.0,638.0,0.0
75%,6967210.0,2.0,5052.898103,11272.26374,83429.166133,6.0,679.0,0.0
max,8999789.0,5.0,10750.67781,43688.7841,148412.1805,10.0,850.0,1.0


In [3]:
from sklearn.cluster import KMeans
import numpy as np

# Set the number of buckets
num_buckets = 10

# Extract FICO scores
fico_scores = data['fico_score'].values.reshape(-1, 1)

# Apply KMeans clustering
kmeans = KMeans(n_clusters=num_buckets, random_state=42).fit(fico_scores)

# Assign bucket labels
data['MSE_Bucket'] = kmeans.labels_

# Show the distribution of FICO scores in each bucket
bucket_centers = kmeans.cluster_centers_
bucket_centers_sorted = np.sort(bucket_centers.flatten())
bucket_boundaries = [np.min(fico_scores)] + list((bucket_centers_sorted[:-1] + bucket_centers_sorted[1:]) / 2) + [np.max(fico_scores)]
bucket_boundaries




[408,
 515.8031249999999,
 557.7422849915681,
 589.3543828697336,
 616.1768291641539,
 640.2548256256111,
 663.7636782176181,
 688.6725876730616,
 716.9956139645575,
 755.7019131997945,
 850]

In [4]:
from itertools import combinations

# Function to calculate log-likelihood
def log_likelihood(data, num_buckets):
    bucket_edges = [300] + list(np.percentile(data['fico_score'], np.linspace(0, 100, num_buckets + 1)[1:-1])) + [850]
    data['Bucket'] = pd.cut(data['fico_score'], bins=bucket_edges, labels=False)
    log_likelihood_value = 0
    for bucket in range(num_buckets):
        bucket_data = data[data['Bucket'] == bucket]
        n_i = len(bucket_data)
        k_i = bucket_data['default'].sum()
        if n_i != 0 and 0 < k_i < n_i:
            p_i = k_i / n_i
            log_likelihood_value += n_i * (k_i * np.log(p_i) + (n_i - k_i) * np.log(1 - p_i))
    return log_likelihood_value, bucket_edges

# Iterate over possible bucket boundaries to maximize log-likelihood
best_log_likelihood = -np.inf
best_bucket_edges = None
for num_buckets in range(5, 11):  # Trying different number of buckets
    ll_value, bucket_edges = log_likelihood(data, num_buckets)
    if ll_value > best_log_likelihood:
        best_log_likelihood = ll_value
        best_bucket_edges = bucket_edges

# Apply the best bucket boundaries
data['LogLikelihood_Bucket'] = pd.cut(data['fico_score'], bins=best_bucket_edges, labels=False)

best_bucket_edges


[300, 560.0, 587.0, 607.0, 623.0, 638.0, 653.0, 670.0, 688.0, 714.0, 850]

In [5]:
# Function to evaluate buckets
def evaluate_buckets(data, bucket_column):
    return data.groupby(bucket_column)['default'].mean()

# Evaluate MSE approach
mse_evaluation = evaluate_buckets(data, 'MSE_Bucket')

# Evaluate log-likelihood approach
log_likelihood_evaluation = evaluate_buckets(data, 'LogLikelihood_Bucket')

mse_evaluation, log_likelihood_evaluation


(MSE_Bucket
 0    0.093682
 1    0.310287
 2    0.039548
 3    0.227242
 4    0.439706
 5    0.017316
 6    0.667969
 7    0.118598
 8    0.072106
 9    0.171465
 Name: default, dtype: float64,
 LogLikelihood_Bucket
 0    0.490659
 1    0.307841
 2    0.249240
 3    0.181911
 4    0.174395
 5    0.128385
 6    0.105161
 7    0.094845
 8    0.071642
 9    0.036437
 Name: default, dtype: float64)