In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

loan_df = pd.read_csv('data/train.csv', index_col=0)
test_df = pd.read_csv('data/test.csv', index_col=0)

In [73]:
# cols details

cols = loan_df.columns
feats = cols.drop('loan_paid_back')
target = 'loan_paid_back'

In [74]:
# sample of data

loan_sm, loan_lg = train_test_split(loan_df, stratify=loan_df[target], train_size=0.15, random_state=42)
loan_sm.shape

(53459, 12)

In [78]:
# label books for ordinal features

grades = 'ABCDEFG'
grade_levels = '12345'
subgrades = [ch+lev for ch in grades for lev in grade_levels]
grade_mapping = {
    subgrade : i+1 for i,subgrade in enumerate(subgrades)
}

edu_mapping = {
    'High School': 1,
    'Other': 2,
    "Bachelor's": 3,
    "Master's": 4,
    'PhD': 5
}

def cat_map(data):
    data = data.copy()
    data['education_level'] = data['education_level'].map(edu_mapping)
    data['grade_subgrade'] = data['grade_subgrade'].map(grade_mapping)
    return data


def engineer_features(df):
    # Create a copy to avoid SettingWithCopy warnings
    df = cat_map(df)
    
    # ---------------------------------------------------------
    # 1. CAPACITY FEATURES (Can they pay?)
    # ---------------------------------------------------------
    
    # A. Loan-to-Income Ratio (LTI)
    # How heavy is the loan compared to their yearly fuel?
    # Persona A (High Earner) might have a low LTI, but Persona B (Struggling) has high LTI.
    df['loan_to_income'] = df['loan_amount'] / df['annual_income']
    
    # B. Monthly Disposable Income Proxy
    # This is the "Real Money" they have left.
    # We estimate monthly income and subtract the portion already committed to debt (DTI).
    # Note: We assume DTI is a decimal (e.g., 0.12).
    monthly_income = df['annual_income'] / 12
    monthly_debt_payments = monthly_income * df['debt_to_income_ratio']
    df['monthly_disposable_income'] = monthly_income - monthly_debt_payments
    
    # C. "Total" Burden (New Loan + Old Debt)
    # Does the NEW loan payment break the bank?
    # We estimate the new loan's monthly payment roughly (assuming 3-year term standard)
    # Formula: (Loan * (1 + InterestRate)) / 36 months
    estimated_total_repayment = df['loan_amount'] * (1 + (df['interest_rate'] / 100))
    estimated_new_monthly_payment = estimated_total_repayment / 12
    
    # The "All-In" DTI: (Old Debt + New Loan Payment) / Monthly Income
    df['all_in_dti'] = (monthly_debt_payments + estimated_new_monthly_payment) / monthly_income

    # ---------------------------------------------------------
    # 2. RELATIVE / PEER GROUP FEATURES (Comparison)
    # ---------------------------------------------------------
    
    # A. Income relative to Grade Peers
    # Is this person richer or poorer than the typical "Grade 14" borrower?
    # If they are much poorer than their grade average, they are "lucky" to have that grade (Risk?).
    grade_income_mean = df.groupby('grade_subgrade')['annual_income'].transform('mean')
    df['income_vs_grade_mean'] = df['annual_income'] / grade_income_mean

    # B. Credit Score relative to Grade Peers
    # A high credit score but bad grade suggests the lender saw something else we missed.
    grade_credit_mean = df.groupby('grade_subgrade')['credit_score'].transform('mean')
    df['credit_score_vs_grade_mean'] = df['credit_score'] - grade_credit_mean

    # ---------------------------------------------------------
    # 3. INTERACTION FEATURES (Connecting Categories)
    # ---------------------------------------------------------
    
    # A. Employment Stability x Loan Purpose
    # "Self-employed" + "Vacation" vs "Employed" + "Home"
    df['emp_purpose_interaction'] = df['employment_status'] + "_" + df['loan_purpose']
    
    # B. Marital Status x Loan Purpose
    # "Single" + "Car" vs "Married" + "Home"
    df['marital_purpose_interaction'] = df['marital_status'] + "_" + df['loan_purpose']

    # ---------------------------------------------------------
    # 4. EXTREME FLAGS (The "Red Flags")
    # ---------------------------------------------------------
    
    # Flag for Dangerous DTI (Experts often say > 40% is risky)
    df['flag_high_dti'] = (df['debt_to_income_ratio'] > 0.40).astype(int)
    
    # Flag for "Micro Loans" (Sometimes implies desperation if high interest)
    df['flag_small_loan_high_rate'] = ((df['loan_amount'] < 5000) & (df['interest_rate'] > 15)).astype(int)

    return df

# --- APPLY IT ---
# df_engineered = engineer_features(loan_df)

# Check the new columns
# print("New Feature Check:")
# print(df_engineered[['loan_to_income', 'monthly_disposable_income', 'all_in_dti', 'emp_purpose_interaction']].head())

Columns 
- ordinal_cols
    - 'grade_subgrade'
    - 'education_level'
- nominal_cols
    - 'gender'
    - 'marital_status'
    - 'employment_status'
    - 'loan_purpose'
- numeric_cols
    - 'annual_income'
    - 'debt_to_income_ratio'
    - 'credit_score'
    - 'loan_amount'
    - 'interest_rate'

In [79]:
df_engineered = engineer_features(loan_df)

In [87]:
# cols details

cols = df_engineered.columns
feats = cols.drop('loan_paid_back')
target = 'loan_paid_back'

# ordinal_cols = ['grade_subgrade', 'education_level']
# nominal_cols = ['gender', 'marital_status', 'employment_status', 'loan_purpose', 'emp_purpose_interaction', 'marital_purpose_interaction']
# numeric_cols = ['annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate']

cat_cols = df_engineered.select_dtypes(include=['object']).columns
cat_indices = [df_engineered[feats].columns.get_loc(col) for col in cat_cols]

cat_indices

[5, 6, 8, 9, 16, 17]

In [140]:
from catboost import CatBoostClassifier
cat_clf = CatBoostClassifier(
    # monotone_constraints=monotone_constraints,
    iterations=4000,
    learning_rate=0.05,
    depth=5,
    # early_stopping_rounds=True,
    loss_function='Logloss',
    eval_metric='AUC',
    random_state=42,
    verbose=100,
    task_type="CPU"
)

loan_engi_lg, loan_engi_sm = train_test_split(df_engineered, train_size=0.99, stratify=df_engineered[target], random_state=42)

X_train, X_test, y_train, y_test = train_test_split(loan_engi_sm[feats], loan_engi_sm[target], stratify=loan_engi_sm[target], random_state=42)
# X_train, X_test, y_train, y_test = train_test_split(df_engineered[feats], df_engineered[target], stratify=df_engineered[target], random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2673, 20)
(891, 20)
(2673,)
(891,)


In [141]:
cat_clf.fit(X_train, y_train, cat_features=cat_indices)

train_y_pred = cat_clf.predict(X_train)
test_y_pred = cat_clf.predict(X_test)

y_pred_lg = cat_clf.predict(loan_engi_lg[feats])
y_lg = loan_engi_lg[target]

print(accuracy_score(y_train, train_y_pred))
print(accuracy_score(y_test, test_y_pred))
print(accuracy_score(y_lg, y_pred_lg))

0:	total: 16.6ms	remaining: 1m 6s
100:	total: 1.84s	remaining: 1m 11s
200:	total: 3.7s	remaining: 1m 9s
300:	total: 5.64s	remaining: 1m 9s
400:	total: 7.52s	remaining: 1m 7s
500:	total: 9.55s	remaining: 1m 6s
600:	total: 11.7s	remaining: 1m 6s
700:	total: 13.8s	remaining: 1m 5s
800:	total: 15.9s	remaining: 1m 3s
900:	total: 17.8s	remaining: 1m 1s
1000:	total: 19.7s	remaining: 59s
1100:	total: 21.6s	remaining: 56.8s
1200:	total: 23.5s	remaining: 54.8s
1300:	total: 25.4s	remaining: 52.7s
1400:	total: 27.3s	remaining: 50.7s
1500:	total: 29.2s	remaining: 48.7s
1600:	total: 31.3s	remaining: 46.9s
1700:	total: 33.3s	remaining: 45s
1800:	total: 36.4s	remaining: 44.4s
1900:	total: 39.7s	remaining: 43.8s
2000:	total: 43s	remaining: 42.9s
2100:	total: 46.3s	remaining: 41.8s
2200:	total: 49.5s	remaining: 40.5s
2300:	total: 52.8s	remaining: 39s
2400:	total: 56.1s	remaining: 37.3s
2500:	total: 59.3s	remaining: 35.5s
2600:	total: 1m 2s	remaining: 33.6s
2700:	total: 1m 5s	remaining: 31.6s
2800:	total

In [142]:
# submission file

X_sub = engineer_features(test_df)[feats]
y_sub_pred = cat_clf.predict(X_sub)

y_sub_pd = pd.DataFrame(y_sub_pred, columns=[target], index=X_sub.index)
y_sub_pd.to_csv("submission09.csv")
y_sub_pd.head()

Unnamed: 0_level_0,loan_paid_back
id,Unnamed: 1_level_1
404674,1.0
549728,1.0
125237,1.0
512666,1.0
101001,1.0


In [None]:
# 85% data
    # 70% train
    # 15% test
    # 15% popn

"""

    iterations=3000,
    learning_rate=0.03,
    depth=5,
0.9069154320824641
0.905235360137321
0.9064329673207505

    iterations=3000,
    learning_rate=0.03,
    depth=5,
0.9107358209874913
0.9060804119627649
0.907199910211564


    iterations=4000,
    learning_rate=0.1,
    depth=5,
0.9212682986945537
0.9057371096586783
0.9061710843824239

"""

'\n\n    iterations=3000,\n    learning_rate=0.03,\n    depth=5,\n0.9069154320824641\n0.905235360137321\n0.9064329673207505\n\n\n'

In [None]:
# 15% data
    # 12% train data
    # 3% test data
    # 85% population data
"""
    iterations=2000,
    learning_rate=0.15,
    depth=5,
0.9465269983788502
0.8995136550692107
0.9000911083529194

    iterations=3000,
    learning_rate=0.03,
    depth=2,
0.9071205886020701
0.9041526374859709
0.9033393191961339

    iterations=3000,
    learning_rate=0.03,
    depth=3,
0.9112358149395187
0.9057239057239057
0.903071935986479

    iterations=3000,
    learning_rate=0.03,
    depth=4,
0.9162738496071829
0.9048260381593715
0.9028738743496977
    
    iterations=3000,
    learning_rate=0.03,
    depth=5,
0.9227085671530116
0.9043022820800598
0.9026758127129163

    iterations=3000,
    learning_rate=0.03,
    depth=6,
0.9305649083426861
0.9040778151889263
0.9022796894393535

    iterations=3000,
    learning_rate=0.03,
    depth=7,
0.9382466641725901
0.9032547699214366
0.9016227850106954


    iterations=3000,
    learning_rate=0.1,
    depth=3,
0.9220102257139294
0.9041526374859709
0.902071724720733
    iterations=3000,
    learning_rate=0.1,
    depth=4,
0.9327596957226587
0.9024317246539468
0.9013289935828029

    iterations=5000,
    learning_rate=0.03,
    depth=3,
0.9147524629006111
0.9059483726150392
0.9029663031135289

    iterations=5000,
    learning_rate=0.03,
    depth=5,
0.9304900860456417
0.9037785260007483
0.9022268730028785

    iterations=5000,
    learning_rate=0.01,
    depth=5,
0.9160493827160494
0.9043771043771044
0.903114849341115

    iterations=5000,
    learning_rate=0.01,
    depth=3,
0.9079685746352413
0.9045267489711935
0.9031412575593525
"""

## Optuna

In [143]:
import optuna
from sklearn.model_selection import StratifiedKFold

def objective(trial):
    # Suggest hyperparameters
    params = {
        'iterations': 3000,
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        #'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.6, 1.0),
        'loss_function': 'Logloss',
        'bootstrap_type': 'Bernoulli',
        'eval_metric': 'Accuracy',
        'random_seed': 42,
        'auto_class_weights': 'Balanced',
        'early_stopping_rounds': 500,
        'verbose': 0,
        'use_best_model': True,
        "task_type": "CPU",
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = []

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model = CatBoostClassifier(**params)
        model.fit(
            X_tr, y_tr,
            cat_features=cat_indices,
            eval_set=(X_val, y_val)
        )

        y_pred = model.predict(X_val)
        acc = accuracy_score(y_val, y_pred)
        accuracies.append(acc)

    # Return mean accuracy across folds
    return np.mean(accuracies)


In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50,show_progress_bar=True)  # increase n_trials for more exhaustive search

print("Best hyperparameters:", study.best_params)
print("Best CV accuracy:", study.best_value)

# [I 2025-12-15 06:14:53,777] Trial 1 finished with value: 0.8663407049959906 and parameters: {'learning_rate': 0.015669248021476716, 'depth': 9, 'l2_leaf_reg': 1.0242728800514602, 'subsample': 0.9126656006086237}. Best is trial 0 with value: 0.8683733672902239.


[I 2025-12-15 09:36:52,720] A new study created in memory with name: no-name-2ffbdba4-cf8a-4fb4-a376-53c76d616a73
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),


[I 2025-12-15 09:39:54,385] Trial 0 finished with value: 0.8870159963596904 and parameters: {'learning_rate': 0.010311617582238102, 'depth': 9, 'l2_leaf_reg': 1.4752022498568205, 'subsample': 0.7351084673172057}. Best is trial 0 with value: 0.8870159963596904.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),


[I 2025-12-15 09:41:10,168] Trial 1 finished with value: 0.8829066470650005 and parameters: {'learning_rate': 0.04927868779475489, 'depth': 4, 'l2_leaf_reg': 1.1597418167835205, 'subsample': 0.9759759791137526}. Best is trial 0 with value: 0.8870159963596904.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),


[I 2025-12-15 09:42:51,060] Trial 2 finished with value: 0.8855248696139171 and parameters: {'learning_rate': 0.010534981576551894, 'depth': 5, 'l2_leaf_reg': 9.762273986331374, 'subsample': 0.8650202527313824}. Best is trial 0 with value: 0.8870159963596904.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),


[I 2025-12-15 09:46:56,744] Trial 3 finished with value: 0.8911379467254716 and parameters: {'learning_rate': 0.010927590779562535, 'depth': 7, 'l2_leaf_reg': 1.4904505137718584, 'subsample': 0.9888071873816984}. Best is trial 3 with value: 0.8911379467254716.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),


[I 2025-12-15 09:50:25,119] Trial 4 finished with value: 0.8866477650600301 and parameters: {'learning_rate': 0.033412063343116014, 'depth': 5, 'l2_leaf_reg': 1.6237940673505662, 'subsample': 0.9973496652115691}. Best is trial 3 with value: 0.8911379467254716.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),


[I 2025-12-15 09:53:48,065] Trial 5 finished with value: 0.8907599145927403 and parameters: {'learning_rate': 0.011484189167414942, 'depth': 4, 'l2_leaf_reg': 1.3694457598916605, 'subsample': 0.912253469396551}. Best is trial 3 with value: 0.8911379467254716.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),


[I 2025-12-15 09:58:55,461] Trial 6 finished with value: 0.8866365641079492 and parameters: {'learning_rate': 0.02219727264965533, 'depth': 10, 'l2_leaf_reg': 1.2385819298995306, 'subsample': 0.6770295972515389}. Best is trial 3 with value: 0.8911379467254716.


In [None]:
best_params = study.best_params
best_params.update({
    'iterations': 2000,
    'loss_function': 'Logloss',
    'eval_metric': 'Accuracy',
    'random_seed': 42,
    'bootstrap_type': 'Bernoulli',
   # 'auto_class_weights': 'Balanced',
    'early_stopping_rounds': 200,
    'verbose': 100,
    'use_best_model': True,
    'task_type': 'CPU'#if you use gpu
})

final_model = CatBoostClassifier(**best_params)
final_model.fit(
    X_train, y_train,
    cat_features=cat_indices,
    eval_set=(X_test, y_test)
)