In [60]:
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Set random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## **Feature Engineering**

In [61]:
# Load data
train_df = pd.read_csv("/kaggle/input/playground-series-s5e11/train.csv")

# Configuration
target = 'loan_paid_back'
categories = [
    'gender', 'marital_status', 'education_level', 'employment_status',
    'loan_purpose', 'grade_subgrade', 'annual_income_bin', 'loan_amount_bin',
    'debt_to_income_ratio', 'credit_score', 'interest_rate'
]
SMOOTHING = 100
N_SPLITS = 5
N_BINS = 250

# Quantile Binning on Full Training Data
train_df['annual_income_bin'], income_bins = pd.qcut(
    train_df['annual_income'], N_BINS, labels=False, duplicates='drop', retbins=True
)
train_df['loan_amount_bin'], loan_bins = pd.qcut(
    train_df['loan_amount'], N_BINS, labels=False, duplicates='drop', retbins=True
)

# Handle out-of-range values (for completeness, though not needed if only training)
train_df['annual_income_bin'] = train_df['annual_income_bin'].fillna(-1).astype(int)
train_df['loan_amount_bin'] = train_df['loan_amount_bin'].fillna(-1).astype(int)

# K-Fold Setup
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
global_mean = train_df[target].mean()
pair_cols = list(combinations(categories, 2))

# Pre-create all feature column names
single_te_cols = [f'{col}_te' for col in categories]
pair_te_cols = [f'{col1}_{col2}_te' for col1, col2 in pair_cols]
pair_freq_cols = [f'{col1}_{col2}_freq' for col1, col2 in pair_cols]

# Pre-allocate numpy arrays for features
n_train = len(train_df)
train_single_te = np.zeros((n_train, len(categories)), dtype=np.float32)
train_pair_te = np.zeros((n_train, len(pair_cols)), dtype=np.float32)
train_pair_freq = np.zeros((n_train, len(pair_cols)), dtype=np.float32)

# K-Fold Feature Engineering for Training
for train_idx, val_idx in kf.split(train_df, train_df[target]):
    train_fold = train_df.iloc[train_idx]
    val_fold = train_df.iloc[val_idx]
    
    # Single-column Target Encoding with Smoothing
    for i, col in enumerate(categories):
        agg = train_fold.groupby(col, observed=False)[target].agg(['sum', 'count'])
        smoothed_means = (agg['sum'] + global_mean * SMOOTHING) / (agg['count'] + SMOOTHING)
        train_single_te[val_idx, i] = val_fold[col].map(smoothed_means).fillna(global_mean).values
    
    # Pairwise Target & Frequency Encoding
    for j, (col1, col2) in enumerate(pair_cols):
        grouped = train_fold.groupby([col1, col2], observed=False)
        
        # Target encoding with smoothing
        agg = grouped[target].agg(['sum', 'count'])
        smoothed_means = (agg['sum'] + global_mean * SMOOTHING) / (agg['count'] + SMOOTHING)
        val_index = pd.MultiIndex.from_arrays([val_fold[col1].values, val_fold[col2].values])
        train_pair_te[val_idx, j] = smoothed_means.reindex(val_index, fill_value=global_mean).values
        
        # Frequency encoding
        freq_counts = grouped.size()
        train_pair_freq[val_idx, j] = freq_counts.reindex(val_index, fill_value=0).values

# Convert arrays to DataFrames
train_single_te_df = pd.DataFrame(train_single_te, columns=single_te_cols, index=train_df.index)
train_pair_te_df = pd.DataFrame(train_pair_te, columns=pair_te_cols, index=train_df.index)
train_pair_freq_df = pd.DataFrame(train_pair_freq, columns=pair_freq_cols, index=train_df.index)

# Concatenate all features at once
train_df = pd.concat([train_df, train_single_te_df, train_pair_te_df, train_pair_freq_df], axis=1)

# Final Data Prep
train_df.drop(columns=['id'], inplace=True)

# Convert categorical columns to category dtype
for col in categories:
    train_df[col] = train_df[col].astype('category')

# Define numeric columns
numeric = [col for col in train_df.columns if col not in categories + [target]]

drop_cols = ['gender_grade_subgrade_freq', 'credit_score_interest_rate_freq',
             'loan_amount_bin_te', 'gender_marital_status_freq', 'annual_income_bin_loan_amount_bin_freq',
             'debt_to_income_ratio_interest_rate_freq', 'gender_loan_purpose_freq',
             'grade_subgrade_loan_amount_bin_freq', 'loan_purpose_te', 'marital_status_te',
             'annual_income_bin_interest_rate_freq', 'education_level_grade_subgrade_freq',
             'gender_education_level_te', 'marital_status_education_level_freq',
             'loan_amount_bin_interest_rate_freq', 'loan_amount_bin_credit_score_freq', 'loan_purpose',
             'education_level_te', 'gender_te', 'education_level', 'gender', 'marital_status']
train_df = train_df.drop(columns=drop_cols)

print(f"Training data shape: {train_df.shape}")
print(f"Number of features created: {len(single_te_cols) + len(pair_te_cols) + len(pair_freq_cols)}")

Training data shape: (593994, 113)
Number of features created: 121


## **Data Preprocessing**

In [62]:
# Define Features and Target Variable
categorical_features = ['employment_status', 'grade_subgrade', 'annual_income_bin', 'loan_amount_bin',
                        'debt_to_income_ratio', 'credit_score', 'interest_rate']
target_column = 'loan_paid_back'
for col in categorical_features:
    train_df[col] = train_df[col].astype(str)

X = train_df.drop(columns=[target_column])
y = train_df[target_column]

# Label Encoding for Categorical Features (XGBoost and LightGBM)
label_encoders = {}
X_encoded = X.copy()

for col in categorical_features:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

# Get Categorical Feature Indices for CatBoost
cat_feature_indices = [X.columns.get_loc(col) for col in categorical_features]

# Arrays to store OOF predictions
n_samples = len(X)
oof_xgb = np.zeros(n_samples)
oof_lgb = np.zeros(n_samples)
oof_cat = np.zeros(n_samples)
oof_hgb = np.zeros(n_samples)

## **Cross Validation & Hyperparameters**

In [63]:
# Define cross-validation strategy
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_SEED)

# Model hyperparameters
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',
    'max_depth': 6,
    'min_child_weight': 10,
    'gamma': 3.6599697090570253,
    'subsample': 0.7993292420985183,
    'colsample_bytree': 0.5780093202212182,
    'learning_rate': 0.01699897838270077,
    'reg_alpha': 0.5808361216819946,
    'reg_lambda': 8.661761457749352,
    'num_boost_round': 974,
    'random_state': RANDOM_SEED,
    'verbosity': 0
}

lgb_params = {
    'device': 'gpu',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'objective': 'binary',
    'num_boost_round': 1000,
    'num_leaves': 130,
    'max_depth': 9,
    'min_child_samples': 99,
    'min_child_weight': 7.018938436494878,
    'subsample': 0.8818736974574264,
    'colsample_bytree': 0.5077880141731121,
    'learning_rate': 0.010022149177412183,
    'reg_alpha': 3.2977545943630857,
    'reg_lambda': 0.13179214380923465,
    'random_state': RANDOM_SEED,
    'verbose': -1
}

cat_params = {
    'task_type': 'GPU',
    'eval_metric': 'AUC',
    'loss_function': 'Logloss',
    'iterations': 280,
    'depth': 10,
    'learning_rate': 0.0642472484400316,
    'l2_leaf_reg': 4.917435003771144,
    'bagging_temperature': 0.009361618288077855,
    'random_strength': 1.9235987858839563,
    'border_count': 177,
    'random_state': RANDOM_SEED,
    'verbose': False
}

hgb_params = {
    'max_iter': 426,
    'max_depth': 5,
    'min_samples_leaf': 79,
    'learning_rate': 0.03389312852746559,
    'max_bins': 164,
    'l2_regularization': 6.608557252922866,
    'random_state': RANDOM_SEED,
    'verbose': 0
}

## **Out-Of-Fold Cross Validation**

In [64]:
# Perform OOF CV
for fold, (train_idx, val_idx) in enumerate(skf.split(X_encoded, y), 1):
    print(f"Fold {fold}/{n_folds}")

    X_train, X_val = X_encoded.iloc[train_idx], X_encoded.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # HistGradientBoosting
    print("  Training HGB...")
    hgb_model = HistGradientBoostingClassifier(**hgb_params)
    hgb_model.fit(X_train, y_train)
    oof_hgb[val_idx] = hgb_model.predict_proba(X_val)[:, 1]

    # XGBoost
    print("  Training XGBoost...")
    xgb_model = xgb.XGBClassifier(**xgb_params)
    xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    oof_xgb[val_idx] = xgb_model.predict_proba(X_val)[:, 1]

    # LightGBM
    print("  Training LightGBM...")
    lgb_model = lgb.LGBMClassifier(**lgb_params)
    lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    oof_lgb[val_idx] = lgb_model.predict_proba(X_val)[:, 1]

    # CatBoost (uses original data with categorical features)
    print("  Training CatBoost...")
    X_train_cat = X.iloc[train_idx]
    X_val_cat = X.iloc[val_idx]
    cat_model = CatBoostClassifier(**cat_params)
    cat_model.fit(X_train_cat, y_train, cat_features=cat_feature_indices,
                  eval_set=(X_val_cat, y_val))
    oof_cat[val_idx] = cat_model.predict_proba(X_val_cat)[:, 1]

Fold 1/5
  Training HGB...
  Training XGBoost...
  Training LightGBM...
  Training CatBoost...


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 2/5
  Training HGB...
  Training XGBoost...
  Training LightGBM...
  Training CatBoost...


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 3/5
  Training HGB...
  Training XGBoost...
  Training LightGBM...
  Training CatBoost...


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 4/5
  Training HGB...
  Training XGBoost...
  Training LightGBM...
  Training CatBoost...


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 5/5
  Training HGB...
  Training XGBoost...
  Training LightGBM...
  Training CatBoost...


Default metric period is 5 because AUC is/are not implemented for GPU


In [65]:
# ROC-AUC of Each Model
auc_xgb = roc_auc_score(y, oof_xgb)
auc_lgb = roc_auc_score(y, oof_lgb)
auc_cat = roc_auc_score(y, oof_cat)
auc_hgb = roc_auc_score(y, oof_hgb)

print(f"XGBoost:  {auc_xgb:.6f}")
print(f"LightGBM: {auc_lgb:.6f}")
print(f"CatBoost: {auc_cat:.6f}")
print(f"HGB:      {auc_hgb:.6f}")

XGBoost:  0.922035
LightGBM: 0.925633
CatBoost: 0.927067
HGB:      0.925299


## **Train Meta Model**

In [66]:
# Create Meta Model Dataframe including HGB
meta_features = np.column_stack([oof_xgb, oof_lgb, oof_cat, oof_hgb])
meta_df = pd.DataFrame(meta_features, columns=['XGBoost', 'LightGBM', 'CatBoost', 'HGB'])

# Train Meta Model
meta_model = LogisticRegression(random_state=RANDOM_SEED, max_iter=1000)
meta_model.fit(meta_features, y)

# Meta Model Evaluation
meta_predictions = meta_model.predict_proba(meta_features)[:, 1]
meta_auc = roc_auc_score(y, meta_predictions)

# Print Best Weights
weights_raw = meta_model.coef_[0]
weights_normalized = np.abs(weights_raw) / np.abs(weights_raw).sum()

print(f"Stacked Model AUC: {meta_auc:.6f}")
print("\nOptimal Stacking Weights:")
for name, weight in zip(['XGBoost', 'LightGBM', 'CatBoost', 'HGB'], weights_normalized):
    print(f"  {name}: {weight:.5f} ({weight*100:.3f}%)")

Stacked Model AUC: 0.927113

Optimal Stacking Weights:
  XGBoost: 0.02453 (2.453%)
  LightGBM: 0.13118 (13.118%)
  CatBoost: 0.73175 (73.175%)
  HGB: 0.11254 (11.254%)
