In [2]:
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import HistGradientBoostingClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool
import optuna

# Set random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## **Feature Engineering**

In [None]:
# Load data
train_df = pd.read_csv("/kaggle/input/playground-series-s5e11/train.csv")

# Configuration
target = 'loan_paid_back'
categories = [
    'gender', 'marital_status', 'education_level', 'employment_status',
    'loan_purpose', 'grade_subgrade', 'annual_income_bin', 'loan_amount_bin',
    'debt_to_income_ratio', 'credit_score', 'interest_rate'
]
SMOOTHING = 100
N_SPLITS = 5
N_BINS = 250

# Quantile Binning on Full Training Data
train_df['annual_income_bin'], income_bins = pd.qcut(
    train_df['annual_income'], N_BINS, labels=False, duplicates='drop', retbins=True
)
train_df['loan_amount_bin'], loan_bins = pd.qcut(
    train_df['loan_amount'], N_BINS, labels=False, duplicates='drop', retbins=True
)

# Handle out-of-range values (for completeness, though not needed if only training)
train_df['annual_income_bin'] = train_df['annual_income_bin'].fillna(-1).astype(int)
train_df['loan_amount_bin'] = train_df['loan_amount_bin'].fillna(-1).astype(int)

# K-Fold Setup
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
global_mean = train_df[target].mean()
pair_cols = list(combinations(categories, 2))

# Pre-create all feature column names
single_te_cols = [f'{col}_te' for col in categories]
pair_te_cols = [f'{col1}_{col2}_te' for col1, col2 in pair_cols]
pair_freq_cols = [f'{col1}_{col2}_freq' for col1, col2 in pair_cols]

# Pre-allocate numpy arrays for features
n_train = len(train_df)
train_single_te = np.zeros((n_train, len(categories)), dtype=np.float32)
train_pair_te = np.zeros((n_train, len(pair_cols)), dtype=np.float32)
train_pair_freq = np.zeros((n_train, len(pair_cols)), dtype=np.float32)

# K-Fold Feature Engineering for Training
for train_idx, val_idx in kf.split(train_df):
    train_fold = train_df.iloc[train_idx]
    val_fold = train_df.iloc[val_idx]
    
    # Single-column Target Encoding with Smoothing
    for i, col in enumerate(categories):
        agg = train_fold.groupby(col, observed=False)[target].agg(['sum', 'count'])
        smoothed_means = (agg['sum'] + global_mean * SMOOTHING) / (agg['count'] + SMOOTHING)
        train_single_te[val_idx, i] = val_fold[col].map(smoothed_means).fillna(global_mean).values
    
    # Pairwise Target & Frequency Encoding
    for j, (col1, col2) in enumerate(pair_cols):
        grouped = train_fold.groupby([col1, col2], observed=False)
        
        # Target encoding with smoothing
        agg = grouped[target].agg(['sum', 'count'])
        smoothed_means = (agg['sum'] + global_mean * SMOOTHING) / (agg['count'] + SMOOTHING)
        val_index = pd.MultiIndex.from_arrays([val_fold[col1].values, val_fold[col2].values])
        train_pair_te[val_idx, j] = smoothed_means.reindex(val_index, fill_value=global_mean).values
        
        # Frequency encoding
        freq_counts = grouped.size()
        train_pair_freq[val_idx, j] = freq_counts.reindex(val_index, fill_value=0).values

# Convert arrays to DataFrames
train_single_te_df = pd.DataFrame(train_single_te, columns=single_te_cols, index=train_df.index)
train_pair_te_df = pd.DataFrame(train_pair_te, columns=pair_te_cols, index=train_df.index)
train_pair_freq_df = pd.DataFrame(train_pair_freq, columns=pair_freq_cols, index=train_df.index)

# Concatenate all features at once
train_df = pd.concat([train_df, train_single_te_df, train_pair_te_df, train_pair_freq_df], axis=1)

# Final Data Prep
train_df.drop(columns=['id'], inplace=True)

# Convert categorical columns to category dtype
for col in categories:
    train_df[col] = train_df[col].astype('category')

# Define numeric columns
numeric = [col for col in train_df.columns if col not in categories + [target]]

print(f"Training data shape: {train_df.shape}")
print(f"Number of features created: {len(single_te_cols) + len(pair_te_cols) + len(pair_freq_cols)}")

drop_cols = ['gender_grade_subgrade_freq', 'credit_score_interest_rate_freq',
             'loan_amount_bin_te', 'gender_marital_status_freq', 'annual_income_bin_loan_amount_bin_freq',
             'debt_to_income_ratio_interest_rate_freq', 'gender_loan_purpose_freq',
             'grade_subgrade_loan_amount_bin_freq', 'loan_purpose_te', 'marital_status_te',
             'annual_income_bin_interest_rate_freq', 'education_level_grade_subgrade_freq',
             'gender_education_level_te', 'marital_status_education_level_freq',
             'loan_amount_bin_interest_rate_freq', 'loan_amount_bin_credit_score_freq', 'loan_purpose',
             'education_level_te', 'gender_te', 'education_level', 'gender', 'marital_status']
train_df = train_df.drop(columns=drop_cols)

Training data shape: (593994, 135)
Number of features created: 121


## **Data Preprocessing**

In [28]:
# Define Features and Target Variable
categorical_features = ['employment_status', 'grade_subgrade', 'annual_income_bin', 'loan_amount_bin',
                        'debt_to_income_ratio', 'credit_score', 'interest_rate']
target_column = 'loan_paid_back'

for col in categorical_features:
    train_df[col] = train_df[col].astype(str)

X = train_df.drop(columns=[target_column])
y = train_df[target_column]

# Label Encoding for Categorical Features (XGBoost and LightGBM)
label_encoders = {}
X_encoded = X.copy()

for col in categorical_features:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

# Split Data into Training and Validation Sets
X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y)

# Get Categorical Feature Indices for CatBoost
cat_feature_indices = [X.columns.get_loc(col) for col in categorical_features]

## **Hyperparameter Tuning for XGBoost**

In [38]:
# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Define Objective Function for Hyperparameter Tuning
def objective_xgb(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'hist',
        'device': 'cuda',
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'random_state': RANDOM_SEED
    }

    # Train Model with Early Stopping
    evals = [(dtrain, 'train'), (dval, 'val')]
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=False
    )

    # Get predictions on validation set and calculate AUC
    y_pred = model.predict(dval)
    auc = roc_auc_score(y_val, y_pred)
    
    print(f"Trial {trial.number}: AUC={auc:.6f}")
    return auc

# Find Best Hyperparameters using Optuna
study_xgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED)
)
study_xgb.optimize(objective_xgb, n_trials=30, show_progress_bar=True)

# Print Best Results
print(f"\nBest AUC: {study_xgb.best_value:.6f}")
print("\nBest hyperparameters:")
best_params_xgb = study_xgb.best_params
for key, value in best_params_xgb.items():
    print(f"  {key}: {value}")

Trial 0: AUC=0.925154
Trial 1: AUC=0.924794
Trial 2: AUC=0.925085
Trial 3: AUC=0.925107
Trial 4: AUC=0.924139
Trial 5: AUC=0.925026
Trial 6: AUC=0.923969
Trial 7: AUC=0.924858
Trial 8: AUC=0.924005
Trial 9: AUC=0.923988
Trial 10: AUC=0.924267
Trial 11: AUC=0.924976
Trial 12: AUC=0.924976
Trial 13: AUC=0.924492
Trial 14: AUC=0.925113
Trial 15: AUC=0.924847
Trial 16: AUC=0.924968
Trial 17: AUC=0.924815
Trial 18: AUC=0.925081
Trial 19: AUC=0.925042
Trial 20: AUC=0.925093
Trial 21: AUC=0.925010
Trial 22: AUC=0.925057
Trial 23: AUC=0.925033
Trial 24: AUC=0.925152
Trial 25: AUC=0.925067
Trial 26: AUC=0.925069
Trial 27: AUC=0.925133
Trial 28: AUC=0.925084
Trial 29: AUC=0.925148

Best AUC: 0.925154

Best hyperparameters:
  max_depth: 6
  min_child_weight: 10
  gamma: 3.6599697090570253
  subsample: 0.7993292420985183
  colsample_bytree: 0.5780093202212182
  learning_rate: 0.01699897838270077
  reg_alpha: 0.5808361216819946
  reg_lambda: 8.661761457749352
  num_boost_round: 974


## **Hyperparameter Tuning for LightGBM**

In [50]:
# Create LightGBM datasets
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# Define Objective Function for Hyperparameter Tuning
def objective_lgb(trial):
    params = {
        "feature_pre_filter": False,
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'subsample_freq': 1,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'random_state': RANDOM_SEED,
        'verbose': -1,
        'device': 'gpu',
    }

    # Train Model with Early Stopping
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(50)]
    )

    # Get predictions on validation set and calculate AUC
    y_pred = model.predict(X_val)
    auc = roc_auc_score(y_val, y_pred)
    
    return auc

# Find Best Hyperparameters using Optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
study_lgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED)
)
study_lgb.optimize(objective_lgb, n_trials=30, show_progress_bar=True)

# Print Best Results
print(f"\nBest AUC: {study_lgb.best_value:.6f}")
print("\nBest hyperparameters:")
best_params_lgb = study_lgb.best_params
for key, value in best_params_lgb.items():
    print(f"  {key}: {value}")

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[984]	valid_0's auc: 0.925099
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.924672
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[383]	valid_0's auc: 0.925046
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[35]	valid_0's auc: 0.923679
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.923851
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[85]	valid_0's auc: 0.924465
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[259]	valid_0's auc: 0.924938
Training until validation scores don't improve for 50 rounds
Early stopping, be

## **Hyperparameter Tuning for CatBoost**

In [40]:
# No Encoding required for CatBoost
X_train_cat = X.iloc[X_train.index]
X_val_cat = X.iloc[X_val.index]

# Define Objective Function for Hyperparameter Tuning
def objective_cat(trial):
    params = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'task_type': 'GPU',
        'devices': '0',
        'depth': trial.suggest_int('depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'random_strength': trial.suggest_float('random_strength', 0, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_state': RANDOM_SEED,
    }

    # Train Model with Early Stopping
    model = CatBoostClassifier(**params, iterations=1000, early_stopping_rounds=50)
    model.fit(
        X_train_cat, y_train,
        cat_features=cat_feature_indices,
        eval_set=(X_val_cat, y_val),
        verbose=False
    )

    # Get predictions on validation set and calculate AUC
    y_pred = model.predict_proba(X_val_cat)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    
    print(f"Trial {trial.number}: AUC={auc:.6f}")
    return auc

# Find Best Hyperparameters using Optuna
study_cat = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED)
)
study_cat.optimize(objective_cat, n_trials=30, show_progress_bar=True)

# Print Best Results
print(f"\nBest AUC: {study_cat.best_value:.6f}")
print("\nBest hyperparameters:")
best_params_cat = study_cat.best_params
for key, value in best_params_cat.items():
    print(f"  {key}: {value}")

Default metric period is 5 because AUC is/are not implemented for GPU


Trial 0: AUC=0.933385


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 1: AUC=0.924697


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 2: AUC=0.925900


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 3: AUC=0.927356


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 4: AUC=0.932034


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 5: AUC=0.924679


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 6: AUC=0.926109


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 7: AUC=0.924697


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 8: AUC=0.925078


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 9: AUC=0.927162


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 10: AUC=0.934400


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 11: AUC=0.929870


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 12: AUC=0.926810


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 13: AUC=0.926594


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 14: AUC=0.925578


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 15: AUC=0.927597


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 16: AUC=0.936346


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 17: AUC=0.928022


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 18: AUC=0.927416


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 19: AUC=0.929181


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 20: AUC=0.929153


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 21: AUC=0.925797


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 22: AUC=0.924211


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 23: AUC=0.927002


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 24: AUC=0.928139


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 25: AUC=0.924055


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 26: AUC=0.924156


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 27: AUC=0.926207


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 28: AUC=0.926391


Default metric period is 5 because AUC is/are not implemented for GPU


Trial 29: AUC=0.924505

Best AUC: 0.936346

Best hyperparameters:
  depth: 10
  learning_rate: 0.0642472484400316
  l2_leaf_reg: 4.917435003771144
  bagging_temperature: 0.009361618288077855
  random_strength: 1.9235987858839563
  border_count: 177


## **Hyperparameter Tuning for HistGradientBoosting**

In [41]:
# Define Objective Function for Hyperparameter Tuning
def objective_hgb(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 20, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_bins': trial.suggest_int('max_bins', 100, 255),
        'l2_regularization': trial.suggest_float('l2_regularization', 0, 10),
        'random_state': RANDOM_SEED,
        'validation_fraction': 0.1,
        'n_iter_no_change': 50,
        'max_iter': 1000,
    }

    # Train Model with Early Stopping
    model = HistGradientBoostingClassifier(**params)
    model.fit(X_train, y_train)

    # Get predictions on validation set and calculate AUC
    y_pred = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    
    return auc

# Find Best Hyperparameters using Optuna
study_hgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED)
)
study_hgb.optimize(objective_hgb, n_trials=30, show_progress_bar=True)

# Print Best Results
print(f"\nBest AUC: {study_hgb.best_value:.6f}")
best_params_hgb = study_hgb.best_params
for key, value in best_params_hgb.items():
    print(f"  {key}: {value}")


Best AUC: 0.925016
  max_depth: 5
  min_samples_leaf: 79
  learning_rate: 0.03389312852746559
  max_bins: 164
  l2_regularization: 6.608557252922866
