In [None]:
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Set random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## **Feature Engineering**

In [68]:
# Load data
train_df = pd.read_csv("/kaggle/input/playground-series-s5e11/train.csv")
test_df = pd.read_csv("/kaggle/input/playground-series-s5e11/test.csv")
test_ids = test_df['id'].copy()

# Configuration
target = 'loan_paid_back'
categories = [
    'gender', 'marital_status', 'education_level', 'employment_status',
    'loan_purpose', 'grade_subgrade', 'annual_income_bin', 'loan_amount_bin',
    'debt_to_income_ratio', 'credit_score', 'interest_rate'
]
SMOOTHING = 100
N_SPLITS = 5
N_BINS = 250

# Quantile Binning on Full Training Data
train_df['annual_income_bin'], income_bins = pd.qcut(
    train_df['annual_income'], N_BINS, labels=False, duplicates='drop', retbins=True
)
train_df['loan_amount_bin'], loan_bins = pd.qcut(
    train_df['loan_amount'], N_BINS, labels=False, duplicates='drop', retbins=True
)

# Apply bins to test data
test_df['annual_income_bin'] = pd.cut(
    test_df['annual_income'], bins=income_bins, labels=False, include_lowest=True
)
test_df['loan_amount_bin'] = pd.cut(
    test_df['loan_amount'], bins=loan_bins, labels=False, include_lowest=True
)

# Handle out-of-range values
test_df['annual_income_bin'] = test_df['annual_income_bin'].fillna(-1).astype(int)
test_df['loan_amount_bin'] = test_df['loan_amount_bin'].fillna(-1).astype(int)

# K-Fold Setup
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
global_mean = train_df[target].mean()
pair_cols = list(combinations(categories, 2))

# Pre-create all feature column names
single_te_cols = [f'{col}_te' for col in categories]
pair_te_cols = [f'{col1}_{col2}_te' for col1, col2 in pair_cols]
pair_freq_cols = [f'{col1}_{col2}_freq' for col1, col2 in pair_cols]

# Pre-allocate numpy arrays for features (faster than DataFrames)
n_train = len(train_df)
n_test = len(test_df)

train_single_te = np.zeros((n_train, len(categories)), dtype=np.float32)
train_pair_te = np.zeros((n_train, len(pair_cols)), dtype=np.float32)
train_pair_freq = np.zeros((n_train, len(pair_cols)), dtype=np.float32)

test_single_te = np.zeros((n_test, len(categories)), dtype=np.float32)
test_pair_te = np.zeros((n_test, len(pair_cols)), dtype=np.float32)
test_pair_freq = np.zeros((n_test, len(pair_cols)), dtype=np.float32)

# K-Fold Feature Engineering for Training
for train_idx, val_idx in kf.split(train_df):
    train_fold = train_df.iloc[train_idx]
    val_fold = train_df.iloc[val_idx]
    
    # Single-column Target Encoding with Smoothing
    for i, col in enumerate(categories):
        agg = train_fold.groupby(col, observed=False)[target].agg(['sum', 'count'])
        smoothed_means = (agg['sum'] + global_mean * SMOOTHING) / (agg['count'] + SMOOTHING)
        train_single_te[val_idx, i] = val_fold[col].map(smoothed_means).fillna(global_mean).values
    
    # Pairwise Target & Frequency Encoding
    for j, (col1, col2) in enumerate(pair_cols):
        # Group by multiple columns directly
        grouped = train_fold.groupby([col1, col2], observed=False)
        
        # Target encoding with smoothing
        agg = grouped[target].agg(['sum', 'count'])
        smoothed_means = (agg['sum'] + global_mean * SMOOTHING) / (agg['count'] + SMOOTHING)
        
        # Map to validation fold using MultiIndex
        val_index = pd.MultiIndex.from_arrays([val_fold[col1].values, val_fold[col2].values])
        train_pair_te[val_idx, j] = smoothed_means.reindex(val_index, fill_value=global_mean).values
        
        # Frequency encoding
        freq_counts = grouped.size()
        train_pair_freq[val_idx, j] = freq_counts.reindex(val_index, fill_value=0).values

# Feature Engineering for Test Data (using full training data)
# Single-column Target Encoding
for i, col in enumerate(categories):
    agg = train_df.groupby(col, observed=False)[target].agg(['sum', 'count'])
    smoothed_means = (agg['sum'] + global_mean * SMOOTHING) / (agg['count'] + SMOOTHING)
    test_single_te[:, i] = test_df[col].map(smoothed_means).fillna(global_mean).values

# Pairwise Target & Frequency Encoding
for j, (col1, col2) in enumerate(pair_cols):
    # Group by multiple columns directly
    grouped = train_df.groupby([col1, col2], observed=False)
    
    # Target encoding with smoothing
    agg = grouped[target].agg(['sum', 'count'])
    smoothed_means = (agg['sum'] + global_mean * SMOOTHING) / (agg['count'] + SMOOTHING)
    
    # Map to test set using MultiIndex
    test_index = pd.MultiIndex.from_arrays([test_df[col1].values, test_df[col2].values])
    test_pair_te[:, j] = smoothed_means.reindex(test_index, fill_value=global_mean).values
    
    # Frequency encoding
    freq_counts = grouped.size()
    test_pair_freq[:, j] = freq_counts.reindex(test_index, fill_value=0).values

# Convert arrays to DataFrames
train_single_te_df = pd.DataFrame(train_single_te, columns=single_te_cols, index=train_df.index)
train_pair_te_df = pd.DataFrame(train_pair_te, columns=pair_te_cols, index=train_df.index)
train_pair_freq_df = pd.DataFrame(train_pair_freq, columns=pair_freq_cols, index=train_df.index)

test_single_te_df = pd.DataFrame(test_single_te, columns=single_te_cols, index=test_df.index)
test_pair_te_df = pd.DataFrame(test_pair_te, columns=pair_te_cols, index=test_df.index)
test_pair_freq_df = pd.DataFrame(test_pair_freq, columns=pair_freq_cols, index=test_df.index)

# Concatenate all features at once
train_df = pd.concat([train_df, train_single_te_df, train_pair_te_df, train_pair_freq_df], axis=1)
test_df = pd.concat([test_df, test_single_te_df, test_pair_te_df, test_pair_freq_df], axis=1)

# Final Data Prep
train_df.drop(columns=['id'], inplace=True)
test_df.drop(columns=['id'], inplace=True)

# Convert categorical columns to category dtype
for col in categories:
    train_df[col] = train_df[col].astype('category')
    test_df[col] = test_df[col].astype('category')

# Define numeric columns
numeric = [col for col in train_df.columns if col not in categories + [target]]

# Drop Unecessary Columns
drop_cols = ['gender_grade_subgrade_freq', 'credit_score_interest_rate_freq',
             'loan_amount_bin_te', 'gender_marital_status_freq', 'annual_income_bin_loan_amount_bin_freq',
             'debt_to_income_ratio_interest_rate_freq', 'gender_loan_purpose_freq',
             'grade_subgrade_loan_amount_bin_freq', 'loan_purpose_te', 'marital_status_te',
             'annual_income_bin_interest_rate_freq', 'education_level_grade_subgrade_freq',
             'gender_education_level_te', 'marital_status_education_level_freq',
             'loan_amount_bin_interest_rate_freq', 'loan_amount_bin_credit_score_freq', 'loan_purpose',
             'education_level_te', 'gender_te', 'education_level', 'gender', 'marital_status']
train_df.drop(columns=drop_cols, inplace=True)
test_df.drop(columns=drop_cols, inplace=True)

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Number of features created: {len(single_te_cols) + len(pair_te_cols) + len(pair_freq_cols)}")

Training data shape: (593994, 113)
Test data shape: (254569, 112)
Number of features created: 121


## **Data Preprocessing**

In [79]:
# Define Features and Target Variable
categorical_features = ['employment_status', 'grade_subgrade', 'annual_income_bin', 'loan_amount_bin',
                        'debt_to_income_ratio', 'credit_score', 'interest_rate']
target_column = 'loan_paid_back'
for col in categorical_features:
    train_df[col] = train_df[col].astype(str)
    test_df[col] = test_df[col].astype(str)

X_train = train_df.drop(columns=[target_column])
y_train = train_df[target_column]
X_test = test_df.copy()

# Label Encoding for Categorical Features (SAFE for unseen labels)
label_encoders = {}
X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()

for col in categorical_features:
    le = LabelEncoder()
    combined = pd.concat([X_train[col].astype(str), X_test[col].astype(str)], axis=0)
    le.fit(combined)
    X_train_encoded[col] = le.transform(X_train[col].astype(str))
    X_test_encoded[col]  = le.transform(X_test[col].astype(str))
    label_encoders[col] = le

# Get Categorical Feature Indices for CatBoost
cat_feature_indices = [X_train.columns.get_loc(col) for col in categorical_features]


## **Hyperparameters**

In [72]:
# Model hyperparameters
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',
    'max_depth': 6,
    'min_child_weight': 10,
    'gamma': 3.6599697090570253,
    'subsample': 0.7993292420985183,
    'colsample_bytree': 0.5780093202212182,
    'learning_rate': 0.01699897838270077,
    'reg_alpha': 0.5808361216819946,
    'reg_lambda': 8.661761457749352,
    'num_boost_round': 974,
    'random_state': RANDOM_SEED,
    'verbosity': 0
}

lgb_params = {
    'device': 'gpu',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'objective': 'binary',
    'num_boost_round': 1000,
    'num_leaves': 130,
    'max_depth': 9,
    'min_child_samples': 99,
    'min_child_weight': 7.018938436494878,
    'subsample': 0.8818736974574264,
    'colsample_bytree': 0.5077880141731121,
    'learning_rate': 0.010022149177412183,
    'reg_alpha': 3.2977545943630857,
    'reg_lambda': 0.13179214380923465,
    'random_state': RANDOM_SEED,
    'verbose': -1
}

cat_params = {
    'task_type': 'GPU',
    'eval_metric': 'AUC',
    'loss_function': 'Logloss',
    'iterations': 280,
    'depth': 10,
    'learning_rate': 0.0642472484400316,
    'l2_leaf_reg': 4.917435003771144,
    'bagging_temperature': 0.009361618288077855,
    'random_strength': 1.9235987858839563,
    'border_count': 177,
    'random_state': RANDOM_SEED,
    'verbose': False
}

hgb_params = {
    'max_iter': 426,
    'max_depth': 5,
    'min_samples_leaf': 79,
    'learning_rate': 0.03389312852746559,
    'max_bins': 164,
    'l2_regularization': 6.608557252922866,
    'random_state': RANDOM_SEED,
    'verbose': 0
}

## **Train Base Models**

In [80]:
# CatBoost
print("Training CatBoost Model")
cat_model = CatBoostClassifier(**cat_params)
cat_model.fit(X_train, y_train, cat_features=cat_feature_indices)
cat_preds = cat_model.predict_proba(X_test)[:, 1]

# XGBoost
print("Training XGBoost Model")
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train_encoded, y_train)
xgb_preds = xgb_model.predict_proba(X_test_encoded)[:, 1]

# LightGBM
print("Training LightGBM Model")
lgb_model = lgb.LGBMClassifier(**lgb_params)
lgb_model.fit(X_train_encoded, y_train)
lgb_preds = lgb_model.predict_proba(X_test_encoded)[:, 1]

# HistGradientBoosting
print("Training HGB Model")
hgb_model = HistGradientBoostingClassifier(**hgb_params)
hgb_model.fit(X_train_encoded, y_train)
hgb_preds = hgb_model.predict_proba(X_test_encoded)[:, 1]

Training CatBoost Model


Default metric period is 5 because AUC is/are not implemented for GPU


Training XGBoost Model
Training LightGBM Model
Training HGB Model


## **Weighted Stacking of Models**

In [81]:
# Weights
weights = {
    'XGBoost': 0.02453 ,
    'LightGBM': 0.13118 ,
    'CatBoost': 0.73175,
    'HGB': 0.11254
}

# Get Predictions
stacked_preds = (
    xgb_preds * weights['XGBoost'] +
    lgb_preds * weights['LightGBM'] +
    cat_preds * weights['CatBoost'] +
    hgb_preds * weights['HGB']
)

## **Save Predictions to CSV**

In [None]:
submission = pd.DataFrame({'id': test_ids, 'loan_paid_back': stacked_preds})
submission.to_csv("/kaggle/working/submission.csv", index=False)
print(submission.head())

       id  loan_paid_back
0  593994        0.940717
1  593995        0.978321
2  593996        0.429640
3  593997        0.913256
4  593998        0.960809
