In [1]:
import numpy as np 
import pandas as pd
import os
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, KFold
from catboost import CatBoostClassifier as cbc
import catboost as cb
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import gc

In [2]:
class CONFIG:
    seed = 67
    train_path = '/kaggle/input/playground-series-s5e11/train.csv'
    test_path = '/kaggle/input/playground-series-s5e11/test.csv'
    orig_path = '/kaggle/input/loan-prediction-dataset-2025/loan_dataset_20000.csv'

    columns = ['annual_income',
         'debt_to_income_ratio',
         'credit_score',
         'loan_amount',
         'interest_rate',
         'gender',
         'marital_status',
         'education_level',
         'employment_status',
         'loan_purpose',
         'grade_subgrade']

    CATS = ['gender','marital_status','education_level','employment_status','loan_purpose','grade_subgrade']
    NUMS = ['annual_income',
         'debt_to_income_ratio',
         'credit_score',
         'loan_amount',
         'interest_rate']
    TARGET = 'loan_paid_back'


    # feature engineering
    agg_funcs = ['mean', 'std', 'min', 'max', 'median']


    # training
    N_SPLITS = 5

cfg = CONFIG()


def print_with_design(statement):
    print('='* 40)
    print(statement)
    print('='* 40, '\n')

In [3]:
def import_data(sep_ids=True):
    print_with_design('1. IMPORTING DATA')
    train = pd.read_csv(cfg.train_path)
    test = pd.read_csv(cfg.test_path)
    orig = pd.read_csv(cfg.orig_path)

    output = (train, orig, test)
    
    if sep_ids:
        train = train.drop('id', axis=1)
        test_ids = test['id']
        test = test.drop('id', axis=1)
        output = (train , orig, test, test_ids)

    print(f'Imported train data {train.shape}')
    print(f'Imported test data {test.shape}', end='\n')

    return output

train, orig, test, test_ids = import_data()

1. IMPORTING DATA

Imported train data (593994, 12)
Imported test data (254569, 11)


[idea inspired by \[Click me\]](https://www.kaggle.com/code/rohanrathod02/predicting-loan-payback-eda-adv-fe-ensemble#3.-Feature-Engineering-&-Preprocessing)

In [4]:
def feature_enginering(train: pd.DataFrame, orig: pd.DataFrame, test: pd.DataFrame):
    print_with_design(f'2. Feature Engineering')

    print(f"Target variable: '{cfg.TARGET}'")
    print(f"Total base features (excluding 'id' and target): {len(cfg.columns)} columns")
    print(f"Categorical base features used for engineering: {len(cfg.CATS)} columns")
    print(f"Numerical base features: {len(cfg.NUMS)} columns")
    initial_cols = len(train.columns)

    print("\n--- Creating Orig-based Statistical Features ---\n")

    for col in cfg.columns:
        print(f'- Performing Operations on {col}')
        if col not in orig.columns:
            print(f"Warning: Column '{col}' from BASE not found in df_orig. Skipping feature creation for this column.")
            continue
    
        stats_df = orig.groupby(col)[cfg.TARGET].agg(cfg.agg_funcs).reset_index()

        new_stat_col_names = [f'orig_{func}_{col}' for func in cfg.agg_funcs]
        stats_df.columns = [col] + new_stat_col_names

        train = train.merge(stats_df, on=col, how='left')
        test = test.merge(stats_df, on=col, how='left')

        counts_df = orig[col].value_counts().reset_index(name=f'orig_count_{col}')
        counts_df.columns = [col, f'orig_count_{col}']

        train = train.merge(counts_df, on=col, how='left')
        test = test.merge(counts_df, on=col, how='left')

    print("\n--- Feature Creation Summary ---\n")
    print(f"Number of new features added: {(len(train.columns) - initial_cols)}")
    print(f"Updated shape of train: {train.shape}\n")
    print(f"Updated shape of test: {test.shape}\n")

    return train, test


def advanced_features(df):
    target = True
    if cfg.TARGET not in df.columns:
        target = False
    
    print_with_design(f'3-{"1" if target else "2"}. Advance Feature Engineering on {"train" if target else "test"}')
    print('\n - Core affordability')
    df['income_loan_ratio'] = df['annual_income'] / (df['loan_amount'] + 1)
    df['loan_to_income'] = df['loan_amount'] / (df['annual_income'] + 1)

    print('- Debt metrics')
    df['total_debt'] = df['debt_to_income_ratio'] * df['annual_income']
    df['available_income'] = df['annual_income'] * (1 - df['debt_to_income_ratio'])
    df['debt_burden'] = df['debt_to_income_ratio'] * df['loan_amount']

    print('- Payment analysis')
    df['monthly_payment'] = df['loan_amount'] * df['interest_rate'] / 1200
    df['payment_to_income'] = df['monthly_payment'] / (df['annual_income'] / 12 + 1)
    df['affordability'] = df['available_income'] / (df['loan_amount'] + 1)

    print('- Risk scoring')
    df['default_risk'] = (df['debt_to_income_ratio'] * 0.40 +
                          (850 - df['credit_score']) / 850 * 0.35 +
                          df['interest_rate'] / 100 * 0.25)

    print('- Credit analysis')
    df['credit_utilization'] = df['credit_score'] * (1 - df['debt_to_income_ratio'])
    df['credit_interest_product'] = df['credit_score'] * df['interest_rate'] / 100

    print('- Log transformations')
    for col in ['annual_income', 'loan_amount']:
        df[f'{col}_log'] = np.log1p(df[col])

    print('- Grade parsing')
    df['grade_letter'] = df['grade_subgrade'].str[0]
    df['grade_number'] = df['grade_subgrade'].str[1].astype(int)
    grade_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
    df['grade_rank'] = df['grade_letter'].map(grade_map)

    print(f'\n Completed Feature Engineering part for the {"train" if target else "test"}\n')

    return df

In [5]:
train, test = feature_enginering(train, orig, test)
train = advanced_features(train)
test = advanced_features(test)

2. Feature Engineering

Target variable: 'loan_paid_back'
Total base features (excluding 'id' and target): 11 columns
Categorical base features used for engineering: 6 columns
Numerical base features: 5 columns

--- Creating Orig-based Statistical Features ---

- Performing Operations on annual_income
- Performing Operations on debt_to_income_ratio
- Performing Operations on credit_score
- Performing Operations on loan_amount
- Performing Operations on interest_rate
- Performing Operations on gender
- Performing Operations on marital_status
- Performing Operations on education_level
- Performing Operations on employment_status
- Performing Operations on loan_purpose
- Performing Operations on grade_subgrade

--- Feature Creation Summary ---

Number of new features added: 66
Updated shape of train: (593994, 78)

Updated shape of test: (254569, 77)

3-1. Advance Feature Engineering on train


 - Core affordability
- Debt metrics
- Payment analysis
- Risk scoring
- Credit analysis
- Log t

In [6]:
def preprocessing(train, test):
    print_with_design('4. Preprocessing')

    print("\n--- Making Categorial Columns ---\n")
    cols = train.select_dtypes(['object']).columns
    for col in cols:
        print(f'- {col}')
        train[col] = train[col].astype('category')
        test[col] = test[col].astype('category')

    print('\nComplete Preprocessing\n')
    return train, test, cols

In [7]:
train, test, cat_cols = preprocessing(train, test)
cat_cols = list(cat_cols)

4. Preprocessing


--- Making Categorial Columns ---

- gender
- marital_status
- education_level
- employment_status
- loan_purpose
- grade_subgrade
- grade_letter

Complete Preprocessing



In [8]:
data = (train, test)
models = {
    'model_1': {
        'name': 'LightGBM',
        'params': {
            'objective': 'cross_entropy',
            'metric': 'auc',
            'n_estimators': 2000,
            'boosting_type': 'gbdt',
            'learning_rate': 0.01,
            'num_leaves': 50,
            'max_depth': 6,
            'min_child_samples': 20,
            'subsample': 0.8,
            'subsample_freq': 1,
            'colsample_bytree': 0.8,
            'reg_alpha': 0.05,
            'reg_lambda': 0.1,
            'min_split_gain': 0.01,
            'random_state': cfg.seed,
            'device': 'cpu',
            'verbose': -1
        }
    }
}

In [11]:
def train_model(data, model, cat_cols):
    name = model['name']
    print_with_design(f'Training Model -- {name}')

    train, test = data

    kf = KFold(n_splits=cfg.N_SPLITS, shuffle=True, random_state=cfg.seed)
    X = train.drop(cfg.TARGET, axis=1)
    y = train[cfg.TARGET]

    oof_preds = np.zeros(train.shape[0])
    test_preds = np.zeros(test.shape[0])
    train_errors = []
    val_errors = []
    for fold, (tr, va) in enumerate(kf.split(X, y), 1):
        print(f'\n-- Fold {fold} --\n')
        
        X_train = X.iloc[tr]
        y_train = y[tr]
        X_val = X.iloc[va]
        y_val = y[va]
        
        if name == 'LightGBM':
            lgb_dataset = lgb.Dataset(X_train, y_train, categorical_feature=cat_cols)
            tr_model = lgb.train(train_set=lgb_dataset, params=model['params'])
        elif name == 'CatBoost':
            tr_model = CatBoostClassifier(**model['params'])
        else:
            tr_model = XGBoostClassifer(**model['params'])

        train_preds = tr_model.predict(X_train)
        train_error = roc_auc_score(y_train, train_preds)
        
        val_error = roc_auc_score(y_val, tr_model.predict(X_val))

        train_errors.append(train_error)
        val_errors.append(val_error)
        oof_preds[tr] = train_preds

        test_preds += tr_model.predict(test)

        print(f'- Train Error {train_error:.6f}')
        print(f'- Val Error {val_error:.6f}\n')

    test_preds = test_preds/(kf.get_n_splits())

    print_with_design('MEAN OF ERRORS')
    print(f'- Train Error {np.mean(train_error):.6f}')
    print(f'- Val Error {np.mean(val_error):.6f}\n')

    del tr_model, kf, X, y, X_train, X_val, y_train, y_val, train_preds
    gc.collect()
    
    return oof_preds, test_preds

In [12]:
oof_preds, test_preds = train_model(data, models['model_1'], cat_cols)

Training Model LightGBM


-- Fold 1 --

- Train Error 0.919565
- Val Error 0.918962


-- Fold 2 --

- Train Error 0.919483
- Val Error 0.918645


-- Fold 3 --

- Train Error 0.919811
- Val Error 0.918237


-- Fold 4 --

- Train Error 0.919567
- Val Error 0.918887


-- Fold 5 --

- Train Error 0.919502
- Val Error 0.919522

MEAN OF ERRORS

- Train Error 0.919502
- Val Error 0.919522



In [16]:
def submit_test(test_ids=test_ids, test_preds=test_preds):
    output = pd.DataFrame({
        'id': test_ids,
        cfg.TARGET: test_preds
    })

    output.to_csv('submission.csv')
    return output.head()

In [18]:
submit_test()

Unnamed: 0,id,loan_paid_back
0,593994,0.877264
1,593995,0.91664
2,593996,0.512669
3,593997,0.891176
4,593998,0.890283
